From a7f495f170864e6bddc4bb29ae7fae293a7136aa Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 10 Jun 2025 13:30:31 -0700
Subject: [PATCH 001/851] [lldb] Revive TestSimulatorPlatform.py (#142244)

This test was incorrectly disabled and bitrotted since then. This PR
fixes up the test and re-enables it.

 - Build against the system libc++ (which can target the simulator)
 - Bump the deployment target for iOS and tvOS on Apple Silicon
 - Skip backdeploying to pre-Apple Silicon OS on Apple Silicon.
---
 .../Python/lldbsuite/test/decorators.py       | 54 +++++++++++++++++--
 .../macosx/simulator/TestSimulatorPlatform.py | 12 +++--
 2 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index 868e9f7e5eca0..a391319ca9b0e 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -9,6 +9,7 @@
 import sys
 import tempfile
 import subprocess
+import json
 
 # Third-party modules
 import unittest
@@ -451,24 +452,67 @@ def apple_simulator_test(platform):
     """
     Decorate the test as a test requiring a simulator for a specific platform.
 
-    Consider that a simulator is available if you have the corresponding SDK installed.
-    The SDK identifiers for simulators are iphonesimulator, appletvsimulator, watchsimulator
+    Consider that a simulator is available if you have the corresponding SDK
+    and runtime installed.
+
+    The SDK identifiers for simulators are iphonesimulator, appletvsimulator,
+    watchsimulator
     """
 
     def should_skip_simulator_test():
         if lldbplatformutil.getHostPlatform() not in ["darwin", "macosx"]:
             return "simulator tests are run only on darwin hosts."
+
+        # Make sure we recognize the platform.
+        mapping = {
+            "iphone": "ios",
+            "appletv": "tvos",
+            "watch": "watchos",
+        }
+        if platform not in mapping:
+            return "unknown simulator platform: {}".format(platform)
+
+        # Make sure we have an SDK.
         try:
             output = subprocess.check_output(
                 ["xcodebuild", "-showsdks"], stderr=subprocess.DEVNULL
             ).decode("utf-8")
-            if re.search("%ssimulator" % platform, output):
-                return None
-            else:
+            if not re.search("%ssimulator" % platform, output):
                 return "%s simulator is not supported on this system." % platform
         except subprocess.CalledProcessError:
             return "Simulators are unsupported on this system (xcodebuild failed)"
 
+        # Make sure we a simulator runtime.
+        try:
+            sim_devices_str = subprocess.check_output(
+                ["xcrun", "simctl", "list", "-j", "devices"]
+            ).decode("utf-8")
+
+            sim_devices = json.loads(sim_devices_str)["devices"]
+            for simulator in sim_devices:
+                if isinstance(simulator, dict):
+                    runtime = simulator["name"]
+                    devices = simulator["devices"]
+                else:
+                    runtime = simulator
+                    devices = sim_devices[simulator]
+
+                if not mapping[platform] in runtime.lower():
+                    continue
+
+                for device in devices:
+                    if (
+                        "availability" in device
+                        and device["availability"] == "(available)"
+                    ):
+                        return None
+                    if "isAvailable" in device and device["isAvailable"]:
+                        return None
+
+            return "{} simulator is not supported on this system.".format(platform)
+        except (subprocess.CalledProcessError, json.decoder.JSONDecodeError):
+            return "Simulators are unsupported on this system (simctl failed)"
+
     return skipTestIfFn(should_skip_simulator_test)
 
 
diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
index faf2256b03a0d..74ba0ee6c83bb 100644
--- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
+++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
@@ -39,15 +39,15 @@ def check_debugserver(self, log, expected_platform, expected_version):
         if expected_version:
             self.assertEqual(aout_info["min_version_os_sdk"], expected_version)
 
-    @skipIf(bugnumber="rdar://76995109")
     def run_with(self, arch, os, vers, env, expected_load_command):
         env_list = [env] if env else []
         triple = "-".join([arch, "apple", os + vers] + env_list)
         sdk = lldbutil.get_xcode_sdk(os, env)
 
-        version_min = ""
         if not vers:
             vers = lldbutil.get_xcode_sdk_version(sdk)
+
+        version_min = ""
         if env == "simulator":
             version_min = "-m{}-simulator-version-min={}".format(os, vers)
         elif os == "macosx":
@@ -56,11 +56,14 @@ def run_with(self, arch, os, vers, env, expected_load_command):
         sdk_root = lldbutil.get_xcode_sdk_root(sdk)
         clang = lldbutil.get_xcode_clang(sdk)
 
+        print(triple)
+
         self.build(
             dictionary={
                 "ARCH": arch,
                 "ARCH_CFLAGS": "-target {} {}".format(triple, version_min),
                 "SDKROOT": sdk_root,
+                "USE_SYSTEM_STDLIB": 1,
             },
             compiler=clang,
         )
@@ -146,6 +149,7 @@ def test_watchos_armv7k(self):
 
     @skipUnlessDarwin
     @skipIfDarwinEmbedded
+    @skipIf(archs=["arm64", "arm64e"])
     def test_lc_version_min_macosx(self):
         """Test running a back-deploying non-simulator MacOS X binary"""
         self.run_with(
@@ -198,7 +202,7 @@ def test_ios_backdeploy_apple_silicon(self):
         self.run_with(
             arch=self.getArchitecture(),
             os="ios",
-            vers="11.0",
+            vers="14.0",
             env="simulator",
             expected_load_command="LC_BUILD_VERSION",
         )
@@ -229,7 +233,7 @@ def test_tvos_backdeploy_apple_silicon(self):
         self.run_with(
             arch=self.getArchitecture(),
             os="tvos",
-            vers="11.0",
+            vers="14.0",
             env="simulator",
             expected_load_command="LC_BUILD_VERSION",
         )

From d7282c56cd294a2eb4890e50c84e6eae6f7c6671 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 10 Jun 2025 23:34:26 +0300
Subject: [PATCH 002/851] [llvm-rc] Add support for multiplication and division
 in expressions (#143373)

This is supported by GNU windres. MS rc.exe does accept these
expressions, but doesn't evalulate them correctly, it only returns the
left hand side.

This fixes one aspect of
https://github.com/llvm/llvm-project/issues/143157.
---
 llvm/test/tools/llvm-rc/Inputs/parser-expr.rc |  5 ++
 llvm/test/tools/llvm-rc/Inputs/tokens.rc      |  1 +
 llvm/test/tools/llvm-rc/parser-expr.test      |  5 ++
 llvm/test/tools/llvm-rc/tokenizer.test        |  5 ++
 llvm/tools/llvm-rc/ResourceScriptParser.cpp   | 49 ++++++++++++++-----
 llvm/tools/llvm-rc/ResourceScriptParser.h     |  1 +
 llvm/tools/llvm-rc/ResourceScriptStmt.h       | 24 +++++++++
 llvm/tools/llvm-rc/ResourceScriptToken.cpp    | 12 ++++-
 llvm/tools/llvm-rc/ResourceScriptToken.h      |  7 ++-
 .../tools/llvm-rc/ResourceScriptTokenList.def |  2 +
 10 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc b/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc
index 8e69c1cd1fa16..2f8e4b2d344a0 100644
--- a/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc
+++ b/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc
@@ -5,6 +5,11 @@ LANGUAGE 1|1&0, 0&0|1
 LANGUAGE 3+4-5, 3-4+5
 LANGUAGE 1+2|3, 3|1+2
 LANGUAGE 6&~5, 6&-8
+LANGUAGE 7/3, 7*3
+LANGUAGE 5/2*2, 5*3/2
+LANGUAGE 1+2*3, (1+2)*3
+LANGUAGE 100/12/5*5, 1+1+1+1*4
+LANGUAGE 9/(1+3), (4+5)/4
 LANGUAGE -1, --1
 LANGUAGE ----1, -----1
 LANGUAGE ~1, ~~1
diff --git a/llvm/test/tools/llvm-rc/Inputs/tokens.rc b/llvm/test/tools/llvm-rc/Inputs/tokens.rc
index 6a781202a7e37..20f77912477d9 100644
--- a/llvm/test/tools/llvm-rc/Inputs/tokens.rc
+++ b/llvm/test/tools/llvm-rc/Inputs/tokens.rc
@@ -1,4 +1,5 @@
 ﻿1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End
+1*3/4
 He11o LLVM
 identifier-with-dashes
 
diff --git a/llvm/test/tools/llvm-rc/parser-expr.test b/llvm/test/tools/llvm-rc/parser-expr.test
index ed6796529fdfa..14a299c9e3e96 100644
--- a/llvm/test/tools/llvm-rc/parser-expr.test
+++ b/llvm/test/tools/llvm-rc/parser-expr.test
@@ -7,6 +7,11 @@
 ; CHECK-NEXT:  Language: 2, Sublanguage: 4
 ; CHECK-NEXT:  Language: 3, Sublanguage: 5
 ; CHECK-NEXT:  Language: 2, Sublanguage: 0
+; CHECK-NEXT:  Language: 2, Sublanguage: 21
+; CHECK-NEXT:  Language: 4, Sublanguage: 7
+; CHECK-NEXT:  Language: 7, Sublanguage: 9
+; CHECK-NEXT:  Language: 5, Sublanguage: 7
+; CHECK-NEXT:  Language: 2, Sublanguage: 2
 ; CHECK-NEXT:  Language: 4294967295, Sublanguage: 1
 ; CHECK-NEXT:  Language: 1, Sublanguage: 4294967295
 ; CHECK-NEXT:  Language: 4294967294, Sublanguage: 1
diff --git a/llvm/test/tools/llvm-rc/tokenizer.test b/llvm/test/tools/llvm-rc/tokenizer.test
index 8486f8bd78690..3062e2bf64629 100644
--- a/llvm/test/tools/llvm-rc/tokenizer.test
+++ b/llvm/test/tools/llvm-rc/tokenizer.test
@@ -25,6 +25,11 @@
 ; CHECK-NEXT:  BlockEnd: }
 ; CHECK-NEXT:  BlockBegin: Begin
 ; CHECK-NEXT:  BlockEnd: End
+; CHECK-NEXT:  Int: 1; int value = 1
+; CHECK-NEXT:  Asterisk: *
+; CHECK-NEXT:  Int: 3; int value = 3
+; CHECK-NEXT:  Slash: /
+; CHECK-NEXT:  Int: 4; int value = 4
 ; CHECK-NEXT:  Identifier: He11o
 ; CHECK-NEXT:  Identifier: LLVM
 ; CHECK-NEXT:  Identifier: identifier-with-dashes
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
index 69798152c1f25..e4efc83c933b4 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
@@ -132,12 +132,13 @@ void RCParser::consume() {
 //
 // The following grammar is used to parse the expressions Exp1:
 //   Exp1 ::= Exp2 || Exp1 + Exp2 || Exp1 - Exp2 || Exp1 | Exp2 || Exp1 & Exp2
-//   Exp2 ::= -Exp2 || ~Exp2 || not Expr2 || Int || (Exp1).
-// (More conveniently, Exp1 is a non-empty sequence of Exp2 expressions,
-// separated by binary operators.)
+//   Exp2 ::= Exp3 || Exp3 * Exp3 || Exp3 / Exp3
+//   Exp3 ::= -Exp3 || ~Exp3 || not Expr3 || Int || (Exp1)
+// (More conveniently, Exp1 and Exp2 are non-empty sequences of Exp3
+// expressions, separated by binary operators.)
 //
-// Expressions of type Exp1 are read by parseIntExpr1(Inner) method, while Exp2
-// is read by parseIntExpr2().
+// Expressions of type Exp1 are read by parseIntExpr1(Inner) method, Exp2
+// is read by parseIntExpr2() and Exp3 is read by parseIntExpr3().
 //
 // The original Microsoft tool handles multiple unary operators incorrectly.
 // For example, in 16-bit little-endian integers:
@@ -158,7 +159,7 @@ Expected<IntWithNotMask> RCParser::parseIntExpr1() {
   ASSIGN_OR_RETURN(FirstResult, parseIntExpr2());
   IntWithNotMask Result = *FirstResult;
 
-  while (!isEof() && look().isBinaryOp()) {
+  while (!isEof() && look().isLowPrecedenceBinaryOp()) {
     auto OpToken = read();
     ASSIGN_OR_RETURN(NextResult, parseIntExpr2());
 
@@ -180,7 +181,7 @@ Expected<IntWithNotMask> RCParser::parseIntExpr1() {
       break;
 
     default:
-      llvm_unreachable("Already processed all binary ops.");
+      llvm_unreachable("Already processed all low precedence binary ops.");
     }
   }
 
@@ -188,7 +189,33 @@ Expected<IntWithNotMask> RCParser::parseIntExpr1() {
 }
 
 Expected<IntWithNotMask> RCParser::parseIntExpr2() {
-  // Exp2 ::= -Exp2 || ~Exp2 || not Expr2 || Int || (Exp1).
+  // Exp2 ::= Exp3 || Exp3 * Exp3 || Exp3 / Exp3.
+  ASSIGN_OR_RETURN(FirstResult, parseIntExpr3());
+  IntWithNotMask Result = *FirstResult;
+
+  while (!isEof() && look().isHighPrecedenceBinaryOp()) {
+    auto OpToken = read();
+    ASSIGN_OR_RETURN(NextResult, parseIntExpr3());
+
+    switch (OpToken.kind()) {
+    case Kind::Asterisk:
+      Result *= *NextResult;
+      break;
+
+    case Kind::Slash:
+      Result /= *NextResult;
+      break;
+
+    default:
+      llvm_unreachable("Already processed all high precedence binary ops.");
+    }
+  }
+
+  return Result;
+}
+
+Expected<IntWithNotMask> RCParser::parseIntExpr3() {
+  // Exp3 ::= -Exp3 || ~Exp3 || not Expr3 || Int || (Exp1).
   static const char ErrorMsg[] = "'-', '~', integer or '('";
 
   if (isEof())
@@ -197,13 +224,13 @@ Expected<IntWithNotMask> RCParser::parseIntExpr2() {
   switch (look().kind()) {
   case Kind::Minus: {
     consume();
-    ASSIGN_OR_RETURN(Result, parseIntExpr2());
+    ASSIGN_OR_RETURN(Result, parseIntExpr3());
     return -(*Result);
   }
 
   case Kind::Tilde: {
     consume();
-    ASSIGN_OR_RETURN(Result, parseIntExpr2());
+    ASSIGN_OR_RETURN(Result, parseIntExpr3());
     return ~(*Result);
   }
 
@@ -220,7 +247,7 @@ Expected<IntWithNotMask> RCParser::parseIntExpr2() {
   case Kind::Identifier: {
     if (!read().value().equals_insensitive("not"))
       return getExpectedError(ErrorMsg, true);
-    ASSIGN_OR_RETURN(Result, parseIntExpr2());
+    ASSIGN_OR_RETURN(Result, parseIntExpr3());
     return IntWithNotMask(0, (*Result).getValue());
   }
 
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.h b/llvm/tools/llvm-rc/ResourceScriptParser.h
index aa7f847187c49..1e7618c84142e 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.h
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.h
@@ -88,6 +88,7 @@ class RCParser {
   // Helper integer expression parsing methods.
   Expected<IntWithNotMask> parseIntExpr1();
   Expected<IntWithNotMask> parseIntExpr2();
+  Expected<IntWithNotMask> parseIntExpr3();
 
   // Advance the state by one, discarding the current token.
   // If the discarded token had an incorrect type, fail.
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h
index 8f099202c0b47..a81e384fda365 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.h
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h
@@ -49,6 +49,16 @@ class RCInt {
     return *this;
   }
 
+  RCInt &operator*=(const RCInt &Rhs) {
+    std::tie(Val, Long) = std::make_pair(Val * Rhs.Val, Long | Rhs.Long);
+    return *this;
+  }
+
+  RCInt &operator/=(const RCInt &Rhs) {
+    std::tie(Val, Long) = std::make_pair(Val / Rhs.Val, Long | Rhs.Long);
+    return *this;
+  }
+
   RCInt &operator|=(const RCInt &Rhs) {
     std::tie(Val, Long) = std::make_pair(Val | Rhs.Val, Long | Rhs.Long);
     return *this;
@@ -98,6 +108,20 @@ class IntWithNotMask {
     return *this;
   }
 
+  IntWithNotMask &operator*=(const IntWithNotMask &Rhs) {
+    Value &= ~Rhs.NotMask;
+    Value *= Rhs.Value;
+    NotMask |= Rhs.NotMask;
+    return *this;
+  }
+
+  IntWithNotMask &operator/=(const IntWithNotMask &Rhs) {
+    Value &= ~Rhs.NotMask;
+    Value /= Rhs.Value;
+    NotMask |= Rhs.NotMask;
+    return *this;
+  }
+
   IntWithNotMask &operator|=(const IntWithNotMask &Rhs) {
     Value &= ~Rhs.NotMask;
     Value |= Rhs.Value;
diff --git a/llvm/tools/llvm-rc/ResourceScriptToken.cpp b/llvm/tools/llvm-rc/ResourceScriptToken.cpp
index aad1060c4a381..0070037e63e6a 100644
--- a/llvm/tools/llvm-rc/ResourceScriptToken.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptToken.cpp
@@ -64,7 +64,7 @@ StringRef RCToken::value() const { return TokenValue; }
 
 Kind RCToken::kind() const { return TokenKind; }
 
-bool RCToken::isBinaryOp() const {
+bool RCToken::isLowPrecedenceBinaryOp() const {
   switch (TokenKind) {
   case Kind::Plus:
   case Kind::Minus:
@@ -76,6 +76,16 @@ bool RCToken::isBinaryOp() const {
   }
 }
 
+bool RCToken::isHighPrecedenceBinaryOp() const {
+  switch (TokenKind) {
+  case Kind::Asterisk:
+  case Kind::Slash:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static Error getStringError(const Twine &message) {
   return make_error<StringError>("Error parsing file: " + message,
                                  inconvertibleErrorCode());
diff --git a/llvm/tools/llvm-rc/ResourceScriptToken.h b/llvm/tools/llvm-rc/ResourceScriptToken.h
index 29f7502f89efd..3dcdfafd2d576 100644
--- a/llvm/tools/llvm-rc/ResourceScriptToken.h
+++ b/llvm/tools/llvm-rc/ResourceScriptToken.h
@@ -56,8 +56,11 @@ class RCToken {
   StringRef value() const;
   Kind kind() const;
 
-  // Check if a token describes a binary operator.
-  bool isBinaryOp() const;
+  // Check if a token describes a low precedence binary operator.
+  bool isLowPrecedenceBinaryOp() const;
+
+  // Check if a token describes a high precedence binary operator.
+  bool isHighPrecedenceBinaryOp() const;
 
 private:
   Kind TokenKind;
diff --git a/llvm/tools/llvm-rc/ResourceScriptTokenList.def b/llvm/tools/llvm-rc/ResourceScriptTokenList.def
index a61a96461f0fb..6ee13b2815d35 100644
--- a/llvm/tools/llvm-rc/ResourceScriptTokenList.def
+++ b/llvm/tools/llvm-rc/ResourceScriptTokenList.def
@@ -29,6 +29,8 @@ SHORT_TOKEN(BlockEnd, '}')     // End of the block; can also be END.
 SHORT_TOKEN(Comma, ',')        // Comma - resource arguments separator.
 SHORT_TOKEN(Plus, '+')         // Addition operator.
 SHORT_TOKEN(Minus, '-')        // Subtraction operator.
+SHORT_TOKEN(Asterisk, '*')     // Multiplication operator.
+SHORT_TOKEN(Slash, '/')        // Division operator.
 SHORT_TOKEN(Pipe, '|')         // Bitwise-OR operator.
 SHORT_TOKEN(Amp, '&')          // Bitwise-AND operator.
 SHORT_TOKEN(Tilde, '~')        // Bitwise-NOT operator.

From 62b3e89afc54a118d597a27185f6915a68e408a0 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 10 Jun 2025 21:37:03 +0100
Subject: [PATCH 003/851] [LV] Remove unused LoopBypassBlocks from ILV (NFC).

After recent refactorings to move parts of skeleton creation
LoopBypassBlocks isn't used any more. Remove it.
---
 .../lib/Transforms/Vectorize/LoopVectorize.cpp | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 333e50ee98418..427c1460fcfc9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -611,9 +611,6 @@ class InnerLoopVectorizer {
   /// Middle Block between the vector and the scalar.
   BasicBlock *LoopMiddleBlock = nullptr;
 
-  /// A list of all bypass blocks. The first block is the entry of the loop.
-  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
-
   /// Trip count of the original loop.
   Value *TripCount = nullptr;
 
@@ -2445,7 +2442,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
-  LoopBypassBlocks.push_back(TCCheckBlock);
 
   assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
              TCCheckBlock &&
@@ -2461,9 +2457,6 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
   assert((!Cost->OptForSize ||
           Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
          "Cannot SCEV check stride or overflow when optimizing for size");
-  assert(!LoopBypassBlocks.empty() &&
-         "Should already be a bypass block due to iteration count check");
-  LoopBypassBlocks.push_back(SCEVCheckBlock);
   AddedSafetyChecks = true;
 
   introduceCheckBlockInVPlan(SCEVCheckBlock);
@@ -2499,7 +2492,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
     });
   }
 
-  LoopBypassBlocks.push_back(MemCheckBlock);
 
   AddedSafetyChecks = true;
 
@@ -7557,8 +7549,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                    nullptr, "vector.ph");
 
   if (ForEpilogue) {
-    LoopBypassBlocks.push_back(TCCheckBlock);
-
     // Save the trip count so we don't have to regenerate it in the
     // vec.epilog.iter.check. This is safe to do because the trip count
     // generated here dominates the vector epilog iter check.
@@ -7619,13 +7609,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
 
   DT->changeImmediateDominator(LoopScalarPreHeader,
                                EPI.EpilogueIterationCountCheck);
-  // Keep track of bypass blocks, as they feed start values to the induction and
-  // reduction phis in the scalar loop preheader.
-  if (EPI.SCEVSafetyCheck)
-    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
-  if (EPI.MemSafetyCheck)
-    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
-  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
 
   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
   // reductions which merge control-flow from the latch block and the middle
@@ -7696,7 +7679,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
     setBranchWeights(BI, Weights, /*IsExpected=*/false);
   }
   ReplaceInstWithInst(Insert->getTerminator(), &BI);
-  LoopBypassBlocks.push_back(Insert);
 
   // A new entry block has been created for the epilogue VPlan. Hook it in, as
   // otherwise we would try to modify the entry to the main vector loop.

From 830a74092adafa425db05e1c5120d3294f874777 Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Wed, 11 Jun 2025 05:42:36 +0900
Subject: [PATCH 004/851] [Clang] [Cygwin] va_list must be treated like normal
 Windows (#143115)

Handling of va_list on Cygwin environment must be matched to normal
Windows environment.

The existing test `test/CodeGen/ms_abi.c` seems relevant, but it
contains `__attribute__((sysv_abi))`, which is not supported on Cygwin.
The new test is based on the `__attribute__((ms_abi))` portion of that
test.

---------

Co-authored-by: jeremyd2019 <github@jdrake.com>
---
 clang/lib/Basic/Targets/X86.h           |  4 +++
 clang/test/CodeGen/X86/cygwin-varargs.c | 35 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 clang/test/CodeGen/X86/cygwin-varargs.c

diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 6f8a2365be256..ecb31ffa4750f 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -997,6 +997,10 @@ class LLVM_LIBRARY_VISIBILITY CygwinX86_64TargetInfo : public X86_64TargetInfo {
     if (Opts.CPlusPlus)
       Builder.defineMacro("_GNU_SOURCE");
   }
+
+  BuiltinVaListKind getBuiltinVaListKind() const override {
+    return TargetInfo::CharPtrBuiltinVaList;
+  }
 };
 
 class LLVM_LIBRARY_VISIBILITY DarwinX86_64TargetInfo
diff --git a/clang/test/CodeGen/X86/cygwin-varargs.c b/clang/test/CodeGen/X86/cygwin-varargs.c
new file mode 100644
index 0000000000000..4eea7d64bcb35
--- /dev/null
+++ b/clang/test/CodeGen/X86/cygwin-varargs.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm < %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm < %s | FileCheck %s
+
+struct foo {
+  int x;
+  float y;
+  char z;
+};
+// CHECK: %[[STRUCT_FOO:.*]] = type { i32, float, i8 }
+
+void f(int a, ...) {
+  // CHECK-LABEL: define dso_local void @f
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  // CHECK: %[[AP:.*]] = alloca ptr
+  // CHECK: call void @llvm.va_start
+  int b = __builtin_va_arg(ap, int);
+  // CHECK: %[[AP_CUR:.*]] = load ptr, ptr %[[AP]]
+  // CHECK-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR]], i64 8
+  // CHECK-NEXT: store ptr %[[AP_NEXT]], ptr %[[AP]]
+  double _Complex c = __builtin_va_arg(ap, double _Complex);
+  // CHECK: %[[AP_CUR2:.*]] = load ptr, ptr %[[AP]]
+  // CHECK-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR2]], i64 8
+  // CHECK-NEXT: store ptr %[[AP_NEXT2]], ptr %[[AP]]
+  // CHECK-NEXT: load ptr, ptr %[[AP_CUR2]]
+  struct foo d = __builtin_va_arg(ap, struct foo);
+  // CHECK: %[[AP_CUR3:.*]] = load ptr, ptr %[[AP]]
+  // CHECK-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR3]], i64 8
+  // CHECK-NEXT: store ptr %[[AP_NEXT3]], ptr %[[AP]]
+  __builtin_va_list ap2;
+  __builtin_va_copy(ap2, ap);
+  // CHECK: call void @llvm.va_copy
+  __builtin_va_end(ap);
+  // CHECK: call void @llvm.va_end
+}

From 13ccce28776d8ad27b0c6a92b5a452d62da05663 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni@amd.com>
Date: Tue, 10 Jun 2025 15:46:27 -0500
Subject: [PATCH 005/851] [SeparateConstOffsetFromGEP] Decompose constant xor
 operand if possible (#135788)

Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes
the base for memory operations. This transformation is true under the
following conditions
Check 1 -  B and C are disjoint.
Check 2 - XOR(A,C) and B are disjoint.

This transformation is beneficial particularly for GEPs because
Disjoint OR operations often map better to addressing modes than XOR.
This can enable further optimizations in the GEP offset folding pipeline
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     | 193 +++++++++++++++++
 .../AMDGPU/xor-to-or-disjoint.ll              | 204 ++++++++++++++++++
 2 files changed, 397 insertions(+)
 create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 320b79203c0b3..6fae9f1dd2404 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -174,6 +174,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -190,6 +191,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
@@ -198,6 +200,8 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
+#define DEBUG_TYPE "separate-offset-gep"
+
 static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
     "disable-separate-const-offset-from-gep", cl::init(false),
     cl::desc("Do not separate the constant offset from a GEP instruction"),
@@ -488,6 +492,42 @@ class SeparateConstOffsetFromGEP {
   DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
 };
 
+/// A helper class that aims to convert xor operations into or operations when
+/// their operands are disjoint and the result is used in a GEP's index. This
+/// can then enable further GEP optimizations by effectively turning BaseVal |
+/// Const into BaseVal + Const when they are disjoint, which
+/// SeparateConstOffsetFromGEP can then process. This is a common pattern that
+/// sets up a grid of memory accesses across a wave where each thread acesses
+/// data at various offsets.
+class XorToOrDisjointTransformer {
+public:
+  XorToOrDisjointTransformer(Function &F, DominatorTree &DT,
+                             const DataLayout &DL)
+      : F(F), DT(DT), DL(DL) {}
+
+  bool run();
+
+private:
+  Function &F;
+  DominatorTree &DT;
+  const DataLayout &DL;
+  /// Maps a common operand to all Xor instructions
+  using XorOpList = SmallVector<std::pair<BinaryOperator *, APInt>, 8>;
+  using XorBaseValInst = DenseMap<Instruction *, XorOpList>;
+  XorBaseValInst XorGroups;
+
+  /// Checks if the given value has at least one GetElementPtr user
+  static bool hasGEPUser(const Value *V);
+
+  /// Helper function to check if BaseXor dominates all XORs in the group
+  bool dominatesAllXors(BinaryOperator *BaseXor, const XorOpList &XorsInGroup);
+
+  /// Processes a group of XOR instructions that share the same non-constant
+  /// base operand. Returns true if this group's processing modified the
+  /// function.
+  bool processXorGroup(Instruction *OriginalBaseInst, XorOpList &XorsInGroup);
+};
+
 } // end anonymous namespace
 
 char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
@@ -1223,6 +1263,154 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   return true;
 }
 
+// Helper function to check if an instruction has at least one GEP user
+bool XorToOrDisjointTransformer::hasGEPUser(const Value *V) {
+  return llvm::any_of(V->users(), [](const User *U) {
+    return isa<llvm::GetElementPtrInst>(U);
+  });
+}
+
+bool XorToOrDisjointTransformer::dominatesAllXors(
+    BinaryOperator *BaseXor, const XorOpList &XorsInGroup) {
+  return llvm::all_of(XorsInGroup, [&](const auto &XorEntry) {
+    BinaryOperator *XorInst = XorEntry.first;
+    // Do not evaluate the BaseXor, otherwise we end up cloning it.
+    return XorInst == BaseXor || DT.dominates(BaseXor, XorInst);
+  });
+}
+
+bool XorToOrDisjointTransformer::processXorGroup(Instruction *OriginalBaseInst,
+                                                 XorOpList &XorsInGroup) {
+  bool Changed = false;
+  if (XorsInGroup.size() <= 1)
+    return false;
+
+  // Sort XorsInGroup by the constant offset value in increasing order.
+  llvm::sort(XorsInGroup, [](const auto &A, const auto &B) {
+    return A.second.slt(B.second);
+  });
+
+  // Dominance check
+  // The "base" XOR for dominance purposes is the one with the smallest
+  // constant.
+  BinaryOperator *XorWithSmallConst = XorsInGroup[0].first;
+
+  if (!dominatesAllXors(XorWithSmallConst, XorsInGroup)) {
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE
+                      << ": Cloning and inserting XOR with smallest constant ("
+                      << *XorWithSmallConst
+                      << ") as it does not dominate all other XORs"
+                      << " in function " << F.getName() << "\n");
+
+    BinaryOperator *ClonedXor =
+        cast<BinaryOperator>(XorWithSmallConst->clone());
+    ClonedXor->setName(XorWithSmallConst->getName() + ".dom_clone");
+    ClonedXor->insertAfter(OriginalBaseInst);
+    LLVM_DEBUG(dbgs() << "  Cloned Inst: " << *ClonedXor << "\n");
+    Changed = true;
+    XorWithSmallConst = ClonedXor;
+  }
+
+  SmallVector<Instruction *, 8> InstructionsToErase;
+  const APInt SmallestConst =
+      cast<ConstantInt>(XorWithSmallConst->getOperand(1))->getValue();
+
+  // Main transformation loop: Iterate over the original XORs in the sorted
+  // group.
+  for (const auto &XorEntry : XorsInGroup) {
+    BinaryOperator *XorInst = XorEntry.first; // Original XOR instruction
+    const APInt ConstOffsetVal = XorEntry.second;
+
+    // Do not process the one with smallest constant as it is the base.
+    if (XorInst == XorWithSmallConst)
+      continue;
+
+    // Disjointness Check 1
+    APInt NewConstVal = ConstOffsetVal - SmallestConst;
+    if ((NewConstVal & SmallestConst) != 0) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Cannot transform XOR in function "
+                        << F.getName() << ":\n"
+                        << "  New Const: " << NewConstVal
+                        << "  Smallest Const: " << SmallestConst
+                        << "  are not disjoint \n");
+      continue;
+    }
+
+    // Disjointness Check 2
+    if (MaskedValueIsZero(XorWithSmallConst, NewConstVal, SimplifyQuery(DL),
+                          0)) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE
+                        << ": Transforming XOR to OR (disjoint) in function "
+                        << F.getName() << ":\n"
+                        << "  Xor: " << *XorInst << "\n"
+                        << "  Base Val: " << *XorWithSmallConst << "\n"
+                        << "  New Const: " << NewConstVal << "\n");
+
+      auto *NewOrInst = BinaryOperator::CreateDisjointOr(
+          XorWithSmallConst,
+          ConstantInt::get(OriginalBaseInst->getType(), NewConstVal),
+          XorInst->getName() + ".or_disjoint", XorInst->getIterator());
+
+      NewOrInst->copyMetadata(*XorInst);
+      XorInst->replaceAllUsesWith(NewOrInst);
+      LLVM_DEBUG(dbgs() << "  New Inst: " << *NewOrInst << "\n");
+      InstructionsToErase.push_back(XorInst); // Mark original XOR for deletion
+
+      Changed = true;
+    } else {
+      LLVM_DEBUG(
+          dbgs() << DEBUG_TYPE
+                 << ": Cannot transform XOR (not proven disjoint) in function "
+                 << F.getName() << ":\n"
+                 << "  Xor: " << *XorInst << "\n"
+                 << "  Base Val: " << *XorWithSmallConst << "\n"
+                 << "  New Const: " << NewConstVal << "\n");
+    }
+  }
+
+  for (Instruction *I : InstructionsToErase)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+// Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes
+// the base for memory operations. This transformation is true under the
+// following conditions
+// Check 1 -  B and C are disjoint.
+// Check 2 - XOR(A,C) and B are disjoint.
+//
+// This transformation is beneficial particularly for GEPs because:
+// 1. OR operations often map better to addressing modes than XOR
+// 2. Disjoint OR operations preserve the semantics of the original XOR
+// 3. This can enable further optimizations in the GEP offset folding pipeline
+bool XorToOrDisjointTransformer::run() {
+  bool Changed = false;
+
+  // Collect all candidate XORs
+  for (Instruction &I : instructions(F)) {
+    Instruction *Op0 = nullptr;
+    ConstantInt *C1 = nullptr;
+    BinaryOperator *MatchedXorOp = nullptr;
+
+    // Attempt to match the instruction 'I' as XOR operation.
+    if (match(&I, m_CombineAnd(m_Xor(m_Instruction(Op0), m_ConstantInt(C1)),
+                               m_BinOp(MatchedXorOp))) &&
+        hasGEPUser(MatchedXorOp))
+      XorGroups[Op0].emplace_back(MatchedXorOp, C1->getValue());
+  }
+
+  if (XorGroups.empty())
+    return false;
+
+  // Process each group of XORs
+  for (auto &[OriginalBaseInst, XorsInGroup] : XorGroups)
+    if (processXorGroup(OriginalBaseInst, XorsInGroup))
+      Changed = true;
+
+  return Changed;
+}
+
 bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -1242,6 +1430,11 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
 
   DL = &F.getDataLayout();
   bool Changed = false;
+
+  // Decompose xor in to "or disjoint" if possible.
+  XorToOrDisjointTransformer XorTransformer(F, *DT, *DL);
+  Changed |= XorTransformer.run();
+
   for (BasicBlock &B : F) {
     if (!DT->isReachableFromEntry(&B))
       continue;
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
new file mode 100644
index 0000000000000..825227292fe14
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
+; RUN: -S < %s | FileCheck %s
+
+
+; Test a simple case of xor to or disjoint transformation
+define half @test_basic_transformation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_basic_transformation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 4096
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 32
+  %addr2 = xor i64 %base, 2080
+  %addr3 = xor i64 %base, 4128
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test the decreasing order of offset xor to or disjoint transformation
+define half @test_descending_offset_transformation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_descending_offset_transformation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR3_DOM_CLONE:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR1_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 4096
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 2048
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 4128
+  %addr2 = xor i64 %base, 2080
+  %addr3 = xor i64 %base, 32
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test that %addr2 is not transformed to or disjoint.
+define half @test_no_transfomation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_no_transfomation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR2:%.*]] = xor i64 [[BASE]], 64
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 32
+  %addr2 = xor i64 %base, 64  ; Should not be transformed
+  %addr3 = xor i64 %base, 2080
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test case with xor instructions in different basic blocks
+define half @test_dom_tree(ptr %ptr, i64 %input, i1 %cond) {
+; CHECK-LABEL: define half @test_dom_tree(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 16
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 32
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    br label %[[MERGE:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 96
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[VAL_FROM_BRANCH:%.*]] = phi half [ [[VAL2]], %[[THEN]] ], [ [[VAL3]], %[[ELSE]] ]
+; CHECK-NEXT:    [[ADDR4_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 224
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR4_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL4:%.*]] = load half, ptr [[GEP4]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL_FROM_BRANCH_F:%.*]] = fpext half [[VAL_FROM_BRANCH]] to float
+; CHECK-NEXT:    [[VAL4_F:%.*]] = fpext half [[VAL4]] to float
+; CHECK-NEXT:    [[SUM_INTERMEDIATE_F:%.*]] = fadd float [[VAL1_F]], [[VAL_FROM_BRANCH_F]]
+; CHECK-NEXT:    [[FINAL_SUM_F:%.*]] = fadd float [[SUM_INTERMEDIATE_F]], [[VAL4_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[FINAL_SUM_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192   ; Clear low bits
+  %addr1 = xor i64 %base,16
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %val1 = load half, ptr %gep1
+  br i1 %cond, label %then, label %else
+
+then:
+  %addr2 = xor i64 %base, 48
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %val2 = load half, ptr %gep2
+  br label %merge
+
+else:
+  %addr3 = xor i64 %base, 112
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val3 = load half, ptr %gep3
+  br label %merge
+
+merge:
+  %val_from_branch = phi half [ %val2, %then ], [ %val3, %else ]
+  %addr4 = xor i64 %base, 240
+  %gep4 = getelementptr i8, ptr %ptr, i64 %addr4
+  %val4 = load half, ptr %gep4
+  %val1.f = fpext half %val1 to float
+  %val_from_branch.f = fpext half %val_from_branch to float
+  %val4.f = fpext half %val4 to float
+  %sum_intermediate.f = fadd float %val1.f, %val_from_branch.f
+  %final_sum.f = fadd float %sum_intermediate.f, %val4.f
+  %result.h = fptrunc float %final_sum.f to half
+  ret half %result.h
+}
+

From 0c774682889ae9b1b89cb9d4d796283f205b8a63 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Tue, 10 Jun 2025 14:31:22 -0700
Subject: [PATCH 006/851] [BOLT] Expose external entry count for functions
 (#141674)

Record the number of function invocations from external code - code
outside the binary, which may include JIT code and DSOs. Accounting
external entry counts improves the fidelity of call graph flow
conservation analysis.

Test Plan: updated shrinkwrapping.test
---
 bolt/include/bolt/Core/BinaryFunction.h        | 12 ++++++++++++
 bolt/include/bolt/Profile/DataReader.h         |  3 +++
 bolt/include/bolt/Profile/ProfileYAMLMapping.h |  2 ++
 bolt/lib/Core/BinaryFunction.cpp               |  2 ++
 bolt/lib/Passes/ProfileQualityStats.cpp        |  3 +++
 bolt/lib/Profile/DataAggregator.cpp            |  1 +
 bolt/lib/Profile/DataReader.cpp                |  6 ++++++
 bolt/lib/Profile/YAMLProfileReader.cpp         |  1 +
 bolt/lib/Profile/YAMLProfileWriter.cpp         |  1 +
 bolt/test/X86/shrinkwrapping.test              |  2 ++
 10 files changed, 33 insertions(+)

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 14957cba50174..ca8b786f4ab69 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -388,6 +388,10 @@ class BinaryFunction {
   /// The profile data for the number of times the function was executed.
   uint64_t ExecutionCount{COUNT_NO_PROFILE};
 
+  /// Profile data for the number of times this function was entered from
+  /// external code (DSO, JIT, etc).
+  uint64_t ExternEntryCount{0};
+
   /// Profile match ratio.
   float ProfileMatchRatio{0.0f};
 
@@ -1877,6 +1881,10 @@ class BinaryFunction {
     return *this;
   }
 
+  /// Set the profile data for the number of times the function was entered from
+  /// external code (DSO/JIT).
+  void setExternEntryCount(uint64_t Count) { ExternEntryCount = Count; }
+
   /// Adjust execution count for the function by a given \p Count. The value
   /// \p Count will be subtracted from the current function count.
   ///
@@ -1904,6 +1912,10 @@ class BinaryFunction {
   /// Return COUNT_NO_PROFILE if there's no profile info.
   uint64_t getExecutionCount() const { return ExecutionCount; }
 
+  /// Return the profile information about the number of times the function was
+  /// entered from external code (DSO/JIT).
+  uint64_t getExternEntryCount() const { return ExternEntryCount; }
+
   /// Return the raw profile information about the number of branch
   /// executions corresponding to this function.
   uint64_t getRawSampleCount() const { return RawSampleCount; }
diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h
index 3c770fed2598f..6f527ba3931d4 100644
--- a/bolt/include/bolt/Profile/DataReader.h
+++ b/bolt/include/bolt/Profile/DataReader.h
@@ -97,6 +97,9 @@ struct FuncBranchData {
   /// Total execution count for the function.
   int64_t ExecutionCount{0};
 
+  /// Total entry count from external code for the function.
+  uint64_t ExternEntryCount{0};
+
   /// Indicate if the data was used.
   bool Used{false};
 
diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
index a8d9a15311d94..41e2bd1651efd 100644
--- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h
+++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
@@ -206,6 +206,7 @@ struct BinaryFunctionProfile {
   uint32_t Id{0};
   llvm::yaml::Hex64 Hash{0};
   uint64_t ExecCount{0};
+  uint64_t ExternEntryCount{0};
   std::vector<BinaryBasicBlockProfile> Blocks;
   std::vector<InlineTreeNode> InlineTree;
   bool Used{false};
@@ -218,6 +219,7 @@ template <> struct MappingTraits<bolt::BinaryFunctionProfile> {
     YamlIO.mapRequired("fid", BFP.Id);
     YamlIO.mapRequired("hash", BFP.Hash);
     YamlIO.mapRequired("exec", BFP.ExecCount);
+    YamlIO.mapOptional("extern", BFP.ExternEntryCount, 0);
     YamlIO.mapRequired("nblocks", BFP.NumBasicBlocks);
     YamlIO.mapOptional("blocks", BFP.Blocks,
                        std::vector<bolt::BinaryBasicBlockProfile>());
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 6d1969f5c6c30..b998d7160aae7 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -471,6 +471,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
     OS << "\n  Sample Count: " << RawSampleCount;
     OS << "\n  Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f);
   }
+  if (ExternEntryCount)
+    OS << "\n  Extern Entry Count: " << ExternEntryCount;
 
   if (opts::PrintDynoStats && !getLayout().block_empty()) {
     OS << '\n';
diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp
index dfd74d3dd5719..64cc662c3ab29 100644
--- a/bolt/lib/Passes/ProfileQualityStats.cpp
+++ b/bolt/lib/Passes/ProfileQualityStats.cpp
@@ -532,6 +532,9 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
     std::vector<uint64_t> &MaxCountMap = TotalMaxCountMaps[FunctionNum];
     std::vector<uint64_t> &MinCountMap = TotalMinCountMaps[FunctionNum];
 
+    // Record external entry count into CallGraphIncomingFlows
+    CallGraphIncomingFlows[FunctionNum] += Function->getExternEntryCount();
+
     // Update MaxCountMap, MinCountMap, and CallGraphIncomingFlows
     auto recordCall = [&](const BinaryBasicBlock *SourceBB,
                           const MCSymbol *DestSymbol, uint64_t Count,
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 4022212bcf1b6..308346e5d02ce 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -2255,6 +2255,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
       YamlBF.Id = BF->getFunctionNumber();
       YamlBF.Hash = BAT->getBFHash(FuncAddress);
       YamlBF.ExecCount = BF->getKnownExecutionCount();
+      YamlBF.ExternEntryCount = BF->getExternEntryCount();
       YamlBF.NumBasicBlocks = BAT->getNumBasicBlocks(FuncAddress);
       const BoltAddressTranslation::BBHashMapTy &BlockMap =
           BAT->getBBHashMap(FuncAddress);
diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp
index c512394f26a3b..afe24216d7f5d 100644
--- a/bolt/lib/Profile/DataReader.cpp
+++ b/bolt/lib/Profile/DataReader.cpp
@@ -85,6 +85,7 @@ void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) {
   }
   llvm::stable_sort(Data);
   ExecutionCount += FBD.ExecutionCount;
+  ExternEntryCount += FBD.ExternEntryCount;
   for (auto I = FBD.EntryData.begin(), E = FBD.EntryData.end(); I != E; ++I) {
     assert(I->To.Name == FBD.Name);
     auto NewElmt = EntryData.insert(EntryData.end(), *I);
@@ -269,6 +270,7 @@ Error DataReader::preprocessProfile(BinaryContext &BC) {
     if (FuncBranchData *FuncData = getBranchDataForNames(Function.getNames())) {
       setBranchData(Function, FuncData);
       Function.ExecutionCount = FuncData->ExecutionCount;
+      Function.ExternEntryCount = FuncData->ExternEntryCount;
       FuncData->Used = true;
     }
   }
@@ -419,6 +421,7 @@ void DataReader::matchProfileData(BinaryFunction &BF) {
       if (fetchProfileForOtherEntryPoints(BF)) {
         BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD);
         BF.ExecutionCount = FBD->ExecutionCount;
+        BF.ExternEntryCount = FBD->ExternEntryCount;
         BF.RawSampleCount = FBD->getNumExecutedBranches();
       }
       return;
@@ -449,6 +452,7 @@ void DataReader::matchProfileData(BinaryFunction &BF) {
     setBranchData(BF, NewBranchData);
     NewBranchData->Used = true;
     BF.ExecutionCount = NewBranchData->ExecutionCount;
+    BF.ExternEntryCount = NewBranchData->ExternEntryCount;
     BF.ProfileMatchRatio = 1.0f;
     break;
   }
@@ -1190,6 +1194,8 @@ std::error_code DataReader::parse() {
     if (BI.To.IsSymbol && BI.To.Offset == 0) {
       I = GetOrCreateFuncEntry(BI.To.Name);
       I->second.ExecutionCount += BI.Branches;
+      if (!BI.From.IsSymbol)
+        I->second.ExternEntryCount += BI.Branches;
     }
   }
 
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 33ce40ac2eeec..086e47b661e10 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -176,6 +176,7 @@ bool YAMLProfileReader::parseFunctionProfile(
   uint64_t FunctionExecutionCount = 0;
 
   BF.setExecutionCount(YamlBF.ExecCount);
+  BF.setExternEntryCount(YamlBF.ExternEntryCount);
 
   uint64_t FuncRawBranchCount = 0;
   for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks)
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 0ae67a4d35595..1632aa1c6bfe2 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -226,6 +226,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
   YamlBF.Hash = BF.getHash();
   YamlBF.NumBasicBlocks = BF.size();
   YamlBF.ExecCount = BF.getKnownExecutionCount();
+  YamlBF.ExternEntryCount = BF.getExternEntryCount();
   DenseMap<const MCDecodedPseudoProbeInlineTree *, uint32_t> InlineTreeNodeId;
   if (PseudoProbeDecoder && BF.getGUID()) {
     std::tie(YamlBF.InlineTree, InlineTreeNodeId) =
diff --git a/bolt/test/X86/shrinkwrapping.test b/bolt/test/X86/shrinkwrapping.test
index 8581d7e0c0f7b..521b4561b3ba6 100644
--- a/bolt/test/X86/shrinkwrapping.test
+++ b/bolt/test/X86/shrinkwrapping.test
@@ -8,6 +8,7 @@ REQUIRES: shell
 
 RUN: %clangxx %cxxflags -no-pie %S/Inputs/exc4sw.S -o %t.exe -Wl,-q
 RUN: llvm-bolt %t.exe -o %t --relocs --frame-opt=all \
+RUN:   --print-only=main --print-cfg \
 RUN:   --data=%p/Inputs/exc4sw.fdata --reorder-blocks=cache 2>&1 | \
 RUN:   FileCheck %s --check-prefix=CHECK-BOLT
 
@@ -19,6 +20,7 @@ RUN: llvm-objdump --dwarf=frames %t | grep -A20 -e \
 RUN:   `llvm-nm --numeric-sort %t | grep main | tail -n 1 | cut -f1 -d' ' | \
 RUN:    tail -c9` 2>&1 | FileCheck %s --check-prefix=CHECK-OUTPUT
 
+CHECK-BOLT: Extern Entry Count: 100
 CHECK-BOLT: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
 
 CHECK-INPUT:  DW_CFA_advance_loc: 2

From 163c67ad3d1bf7af6590930d8f18700d65ad4564 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Tue, 10 Jun 2025 14:44:19 -0700
Subject: [PATCH 007/851] [flang][runtime] Replace recursion with iterative
 work queue (#137727)

Recursion, both direct and indirect, prevents accurate stack size
calculation at link time for GPU device code. Restructure these
recursive (often mutually so) routines in the Fortran runtime with new
implementations based on an iterative work queue with
suspendable/resumable work tickets: Assign, Initialize, initializeClone,
Finalize, and Destroy.

Default derived type I/O is also recursive, but already disabled. It can
be added to this new framework later if the overall approach succeeds.

Note that derived type FINAL subroutine calls, defined assignments, and
defined I/O procedures all perform callbacks into user code, which may
well reenter the runtime library. This kind of recursion is not handled
by this change, although it may be possible to do so in the future using
thread-local work queues.

The effects of this restructuring on CPU performance are yet to be
measured.
---
 .../include/flang-rt/runtime/environment.h    |   3 +
 flang-rt/include/flang-rt/runtime/stat.h      |  10 +-
 flang-rt/include/flang-rt/runtime/type-info.h |   2 +
 .../include/flang-rt/runtime/work-queue.h     | 548 +++++++++++++++
 flang-rt/lib/runtime/CMakeLists.txt           |   2 +
 flang-rt/lib/runtime/assign.cpp               | 623 +++++++++++------
 flang-rt/lib/runtime/derived.cpp              | 517 +++++++-------
 flang-rt/lib/runtime/descriptor-io.cpp        | 651 +++++++++++++++++-
 flang-rt/lib/runtime/descriptor-io.h          | 620 +----------------
 flang-rt/lib/runtime/environment.cpp          |   4 +
 flang-rt/lib/runtime/namelist.cpp             |   1 +
 flang-rt/lib/runtime/tools.cpp                |   4 +-
 flang-rt/lib/runtime/type-info.cpp            |   6 +-
 flang-rt/lib/runtime/work-queue.cpp           | 161 +++++
 flang-rt/unittests/Runtime/ExternalIOTest.cpp |   2 +-
 flang/docs/Extensions.md                      |  10 +
 flang/include/flang/Runtime/assign.h          |   2 +-
 flang/include/flang/Semantics/tools.h         |   7 +-
 flang/lib/Semantics/runtime-type-info.cpp     |   4 +
 flang/lib/Semantics/tools.cpp                 |  32 +
 flang/module/__fortran_type_info.f90          |   3 +-
 flang/test/Lower/volatile-openmp.f90          |   8 +-
 flang/test/Semantics/typeinfo01.f90           |  30 +-
 flang/test/Semantics/typeinfo03.f90           |   2 +-
 flang/test/Semantics/typeinfo04.f90           |   8 +-
 flang/test/Semantics/typeinfo05.f90           |   4 +-
 flang/test/Semantics/typeinfo06.f90           |   4 +-
 flang/test/Semantics/typeinfo07.f90           |   8 +-
 flang/test/Semantics/typeinfo08.f90           |   2 +-
 flang/test/Semantics/typeinfo11.f90           |   2 +-
 flang/test/Semantics/typeinfo12.f90           |  67 ++
 31 files changed, 2227 insertions(+), 1120 deletions(-)
 create mode 100644 flang-rt/include/flang-rt/runtime/work-queue.h
 create mode 100644 flang-rt/lib/runtime/work-queue.cpp
 create mode 100644 flang/test/Semantics/typeinfo12.f90

diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h
index 16258b3bbba9b..e579f6012ce86 100644
--- a/flang-rt/include/flang-rt/runtime/environment.h
+++ b/flang-rt/include/flang-rt/runtime/environment.h
@@ -64,6 +64,9 @@ struct ExecutionEnvironment {
   bool defaultUTF8{false}; // DEFAULT_UTF8
   bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
 
+  enum InternalDebugging { WorkQueue = 1 };
+  int internalDebugging{0}; // FLANG_RT_DEBUG
+
   // CUDA related variables
   std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE
   bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED
diff --git a/flang-rt/include/flang-rt/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h
index 070d0bf8673fb..dc372de53506a 100644
--- a/flang-rt/include/flang-rt/runtime/stat.h
+++ b/flang-rt/include/flang-rt/runtime/stat.h
@@ -24,7 +24,7 @@ class Terminator;
 enum Stat {
   StatOk = 0, // required to be zero by Fortran
 
-  // Interoperable STAT= codes
+  // Interoperable STAT= codes (>= 11)
   StatBaseNull = CFI_ERROR_BASE_ADDR_NULL,
   StatBaseNotNull = CFI_ERROR_BASE_ADDR_NOT_NULL,
   StatInvalidElemLen = CFI_INVALID_ELEM_LEN,
@@ -36,7 +36,7 @@ enum Stat {
   StatMemAllocation = CFI_ERROR_MEM_ALLOCATION,
   StatOutOfBounds = CFI_ERROR_OUT_OF_BOUNDS,
 
-  // Standard STAT= values
+  // Standard STAT= values (>= 101)
   StatFailedImage = FORTRAN_RUNTIME_STAT_FAILED_IMAGE,
   StatLocked = FORTRAN_RUNTIME_STAT_LOCKED,
   StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE,
@@ -49,10 +49,14 @@ enum Stat {
   // Additional "processor-defined" STAT= values
   StatInvalidArgumentNumber = FORTRAN_RUNTIME_STAT_INVALID_ARG_NUMBER,
   StatMissingArgument = FORTRAN_RUNTIME_STAT_MISSING_ARG,
-  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT,
+  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, // -1
   StatMoveAllocSameAllocatable =
       FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE,
   StatBadPointerDeallocation = FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION,
+
+  // Dummy status for work queue continuation, declared here to perhaps
+  // avoid collisions
+  StatContinue = 201
 };
 
 RT_API_ATTRS const char *StatErrorString(int);
diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h
index 5e79efde164f2..9bde3adba87f5 100644
--- a/flang-rt/include/flang-rt/runtime/type-info.h
+++ b/flang-rt/include/flang-rt/runtime/type-info.h
@@ -240,6 +240,7 @@ class DerivedType {
   RT_API_ATTRS bool noFinalizationNeeded() const {
     return noFinalizationNeeded_;
   }
+  RT_API_ATTRS bool noDefinedAssignment() const { return noDefinedAssignment_; }
 
   RT_API_ATTRS std::size_t LenParameters() const {
     return lenParameterKind().Elements();
@@ -322,6 +323,7 @@ class DerivedType {
   bool noInitializationNeeded_{false};
   bool noDestructionNeeded_{false};
   bool noFinalizationNeeded_{false};
+  bool noDefinedAssignment_{false};
 };
 
 } // namespace Fortran::runtime::typeInfo
diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
new file mode 100644
index 0000000000000..878b18373e1d2
--- /dev/null
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -0,0 +1,548 @@
+//===-- include/flang-rt/runtime/work-queue.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Internal runtime utilities for work queues that replace the use of recursion
+// for better GPU device support.
+//
+// A work queue comprises a list of tickets.  Each ticket class has a Begin()
+// member function, which is called once, and a Continue() member function
+// that can be called zero or more times.  A ticket's execution terminates
+// when either of these member functions returns a status other than
+// StatContinue.  When that status is not StatOk, then the whole queue
+// is shut down.
+//
+// By returning StatContinue from its Continue() member function,
+// a ticket suspends its execution so that any nested tickets that it
+// may have created can be run to completion.  It is the reponsibility
+// of each ticket class to maintain resumption information in its state
+// and manage its own progress.  Most ticket classes inherit from
+// class ComponentsOverElements, which implements an outer loop over all
+// components of a derived type, and an inner loop over all elements
+// of a descriptor, possibly with multiple phases of execution per element.
+//
+// Tickets are created by WorkQueue::Begin...() member functions.
+// There is one of these for each "top level" recursive function in the
+// Fortran runtime support library that has been restructured into this
+// ticket framework.
+//
+// When the work queue is running tickets, it always selects the last ticket
+// on the list for execution -- "work stack" might have been a more accurate
+// name for this framework.  This ticket may, while doing its job, create
+// new tickets, and since those are pushed after the active one, the first
+// such nested ticket will be the next one executed to completion -- i.e.,
+// the order of nested WorkQueue::Begin...() calls is respected.
+// Note that a ticket's Continue() member function won't be called again
+// until all nested tickets have run to completion and it is once again
+// the last ticket on the queue.
+//
+// Example for an assignment to a derived type:
+// 1. Assign() is called, and its work queue is created.  It calls
+//    WorkQueue::BeginAssign() and then WorkQueue::Run().
+// 2. Run calls AssignTicket::Begin(), which pushes a tickets via
+//    BeginFinalize() and returns StatContinue.
+// 3. FinalizeTicket::Begin() and FinalizeTicket::Continue() are called
+//    until one of them returns StatOk, which ends the finalization ticket.
+// 4. AssignTicket::Continue() is then called; it creates a DerivedAssignTicket
+//    and then returns StatOk, which ends the ticket.
+// 5. At this point, only one ticket remains.  DerivedAssignTicket::Begin()
+//    and ::Continue() are called until they are done (not StatContinue).
+//    Along the way, it may create nested AssignTickets for components,
+//    and suspend itself so that they may each run to completion.
+
+#ifndef FLANG_RT_RUNTIME_WORK_QUEUE_H_
+#define FLANG_RT_RUNTIME_WORK_QUEUE_H_
+
+#include "flang-rt/runtime/connection.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/api-attrs.h"
+#include "flang/Runtime/freestanding-tools.h"
+#include <flang/Common/variant.h>
+
+namespace Fortran::runtime::io {
+class IoStatementState;
+struct NonTbpDefinedIoTable;
+} // namespace Fortran::runtime::io
+
+namespace Fortran::runtime {
+class Terminator;
+class WorkQueue;
+
+// Ticket worker base classes
+
+template <typename TICKET> class ImmediateTicketRunner {
+public:
+  RT_API_ATTRS explicit ImmediateTicketRunner(TICKET &ticket)
+      : ticket_{ticket} {}
+  RT_API_ATTRS int Run(WorkQueue &workQueue) {
+    int status{ticket_.Begin(workQueue)};
+    while (status == StatContinue) {
+      status = ticket_.Continue(workQueue);
+    }
+    return status;
+  }
+
+private:
+  TICKET &ticket_;
+};
+
+// Base class for ticket workers that operate elementwise over descriptors
+class Elementwise {
+protected:
+  RT_API_ATTRS Elementwise(
+      const Descriptor &instance, const Descriptor *from = nullptr)
+      : instance_{instance}, from_{from} {
+    instance_.GetLowerBounds(subscripts_);
+    if (from_) {
+      from_->GetLowerBounds(fromSubscripts_);
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return elementAt_ >= elements_; }
+  RT_API_ATTRS void Advance() {
+    ++elementAt_;
+    instance_.IncrementSubscripts(subscripts_);
+    if (from_) {
+      from_->IncrementSubscripts(fromSubscripts_);
+    }
+  }
+  RT_API_ATTRS void SkipToEnd() { elementAt_ = elements_; }
+  RT_API_ATTRS void Reset() {
+    elementAt_ = 0;
+    instance_.GetLowerBounds(subscripts_);
+    if (from_) {
+      from_->GetLowerBounds(fromSubscripts_);
+    }
+  }
+
+  const Descriptor &instance_, *from_{nullptr};
+  std::size_t elements_{instance_.Elements()};
+  std::size_t elementAt_{0};
+  SubscriptValue subscripts_[common::maxRank];
+  SubscriptValue fromSubscripts_[common::maxRank];
+};
+
+// Base class for ticket workers that operate over derived type components.
+class Componentwise {
+protected:
+  RT_API_ATTRS Componentwise(const typeInfo::DerivedType &);
+  RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; }
+  RT_API_ATTRS void Advance() {
+    ++componentAt_;
+    GetComponent();
+  }
+  RT_API_ATTRS void SkipToEnd() {
+    component_ = nullptr;
+    componentAt_ = components_;
+  }
+  RT_API_ATTRS void Reset() {
+    component_ = nullptr;
+    componentAt_ = 0;
+    GetComponent();
+  }
+  RT_API_ATTRS void GetComponent();
+
+  const typeInfo::DerivedType &derived_;
+  std::size_t components_{0}, componentAt_{0};
+  const typeInfo::Component *component_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> componentDescriptor_;
+};
+
+// Base class for ticket workers that operate over derived type components
+// in an outer loop, and elements in an inner loop.
+class ComponentsOverElements : protected Componentwise, protected Elementwise {
+protected:
+  RT_API_ATTRS ComponentsOverElements(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
+      : Componentwise{derived}, Elementwise{instance, from} {
+    if (Elementwise::IsComplete()) {
+      Componentwise::SkipToEnd();
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return Componentwise::IsComplete(); }
+  RT_API_ATTRS void Advance() {
+    SkipToNextElement();
+    if (Elementwise::IsComplete()) {
+      Elementwise::Reset();
+      Componentwise::Advance();
+    }
+  }
+  RT_API_ATTRS void SkipToNextElement() {
+    phase_ = 0;
+    Elementwise::Advance();
+  }
+  RT_API_ATTRS void SkipToNextComponent() {
+    phase_ = 0;
+    Elementwise::Reset();
+    Componentwise::Advance();
+  }
+  RT_API_ATTRS void Reset() {
+    phase_ = 0;
+    Elementwise::Reset();
+    Componentwise::Reset();
+  }
+
+  int phase_{0};
+};
+
+// Base class for ticket workers that operate over elements in an outer loop,
+// type components in an inner loop.
+class ElementsOverComponents : protected Elementwise, protected Componentwise {
+protected:
+  RT_API_ATTRS ElementsOverComponents(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
+      : Elementwise{instance, from}, Componentwise{derived} {
+    if (Componentwise::IsComplete()) {
+      Elementwise::SkipToEnd();
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return Elementwise::IsComplete(); }
+  RT_API_ATTRS void Advance() {
+    SkipToNextComponent();
+    if (Componentwise::IsComplete()) {
+      Componentwise::Reset();
+      Elementwise::Advance();
+    }
+  }
+  RT_API_ATTRS void SkipToNextComponent() {
+    phase_ = 0;
+    Componentwise::Advance();
+  }
+  RT_API_ATTRS void SkipToNextElement() {
+    phase_ = 0;
+    Componentwise::Reset();
+    Elementwise::Advance();
+  }
+
+  int phase_{0};
+};
+
+// Ticket worker classes
+
+// Implements derived type instance initialization
+class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
+                         private ComponentsOverElements {
+public:
+  RT_API_ATTRS InitializeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
+      : ImmediateTicketRunner<InitializeTicket>{*this},
+        ComponentsOverElements{instance, derived} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+};
+
+// Initializes one derived type instance from the value of another
+class InitializeCloneTicket
+    : public ImmediateTicketRunner<InitializeCloneTicket>,
+      private ComponentsOverElements {
+public:
+  RT_API_ATTRS InitializeCloneTicket(const Descriptor &clone,
+      const Descriptor &original, const typeInfo::DerivedType &derived,
+      bool hasStat, const Descriptor *errMsg)
+      : ImmediateTicketRunner<InitializeCloneTicket>{*this},
+        ComponentsOverElements{original, derived}, clone_{clone},
+        hasStat_{hasStat}, errMsg_{errMsg} {}
+  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  const Descriptor &clone_;
+  bool hasStat_{false};
+  const Descriptor *errMsg_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> cloneComponentDescriptor_;
+};
+
+// Implements derived type instance finalization
+class FinalizeTicket : public ImmediateTicketRunner<FinalizeTicket>,
+                       private ComponentsOverElements {
+public:
+  RT_API_ATTRS FinalizeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
+      : ImmediateTicketRunner<FinalizeTicket>{*this},
+        ComponentsOverElements{instance, derived} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  const typeInfo::DerivedType *finalizableParentType_{nullptr};
+};
+
+// Implements derived type instance destruction
+class DestroyTicket : public ImmediateTicketRunner<DestroyTicket>,
+                      private ComponentsOverElements {
+public:
+  RT_API_ATTRS DestroyTicket(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, bool finalize)
+      : ImmediateTicketRunner<DestroyTicket>{*this},
+        ComponentsOverElements{instance, derived}, finalize_{finalize} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  bool finalize_{false};
+};
+
+// Implements general intrinsic assignment
+class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
+public:
+  RT_API_ATTRS AssignTicket(
+      Descriptor &to, const Descriptor &from, int flags, MemmoveFct memmoveFct)
+      : ImmediateTicketRunner<AssignTicket>{*this}, to_{to}, from_{&from},
+        flags_{flags}, memmoveFct_{memmoveFct} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  RT_API_ATTRS bool IsSimpleMemmove() const {
+    return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() &&
+        from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes();
+  }
+  RT_API_ATTRS Descriptor &GetTempDescriptor();
+
+  Descriptor &to_;
+  const Descriptor *from_{nullptr};
+  int flags_{0}; // enum AssignFlags
+  MemmoveFct memmoveFct_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> tempDescriptor_;
+  const typeInfo::DerivedType *toDerived_{nullptr};
+  Descriptor *toDeallocate_{nullptr};
+  bool persist_{false};
+  bool done_{false};
+};
+
+// Implements derived type intrinsic assignment.
+template <bool IS_COMPONENTWISE>
+class DerivedAssignTicket
+    : public ImmediateTicketRunner<DerivedAssignTicket<IS_COMPONENTWISE>>,
+      private std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+          ElementsOverComponents> {
+public:
+  using Base = std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+      ElementsOverComponents>;
+  RT_API_ATTRS DerivedAssignTicket(const Descriptor &to, const Descriptor &from,
+      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
+      Descriptor *deallocateAfter)
+      : ImmediateTicketRunner<DerivedAssignTicket>{*this},
+        Base{to, derived, &from}, flags_{flags}, memmoveFct_{memmoveFct},
+        deallocateAfter_{deallocateAfter} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  static constexpr bool isComponentwise_{IS_COMPONENTWISE};
+  bool toIsContiguous_{this->instance_.IsContiguous()};
+  bool fromIsContiguous_{this->from_->IsContiguous()};
+  int flags_{0};
+  MemmoveFct memmoveFct_{nullptr};
+  Descriptor *deallocateAfter_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> fromComponentDescriptor_;
+};
+
+namespace io::descr {
+
+template <io::Direction DIR>
+class DescriptorIoTicket
+    : public ImmediateTicketRunner<DescriptorIoTicket<DIR>>,
+      private Elementwise {
+public:
+  RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io,
+      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
+      bool &anyIoTookPlace)
+      : ImmediateTicketRunner<DescriptorIoTicket>(*this),
+        Elementwise{descriptor}, io_{io}, table_{table},
+        anyIoTookPlace_{anyIoTookPlace} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+  RT_API_ATTRS bool &anyIoTookPlace() { return anyIoTookPlace_; }
+
+private:
+  io::IoStatementState &io_;
+  const io::NonTbpDefinedIoTable *table_{nullptr};
+  bool &anyIoTookPlace_;
+  common::optional<typeInfo::SpecialBinding> nonTbpSpecial_;
+  const typeInfo::DerivedType *derived_{nullptr};
+  const typeInfo::SpecialBinding *special_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> elementDescriptor_;
+};
+
+template <io::Direction DIR>
+class DerivedIoTicket : public ImmediateTicketRunner<DerivedIoTicket<DIR>>,
+                        private ElementsOverComponents {
+public:
+  RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io,
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace)
+      : ImmediateTicketRunner<DerivedIoTicket>(*this),
+        ElementsOverComponents{descriptor, derived}, io_{io}, table_{table},
+        anyIoTookPlace_{anyIoTookPlace} {}
+  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  io::IoStatementState &io_;
+  const io::NonTbpDefinedIoTable *table_{nullptr};
+  bool &anyIoTookPlace_;
+};
+
+} // namespace io::descr
+
+struct NullTicket {
+  RT_API_ATTRS int Begin(WorkQueue &) const { return StatOk; }
+  RT_API_ATTRS int Continue(WorkQueue &) const { return StatOk; }
+};
+
+struct Ticket {
+  RT_API_ATTRS int Continue(WorkQueue &);
+  bool begun{false};
+  std::variant<NullTicket, InitializeTicket, InitializeCloneTicket,
+      FinalizeTicket, DestroyTicket, AssignTicket, DerivedAssignTicket<false>,
+      DerivedAssignTicket<true>,
+      io::descr::DescriptorIoTicket<io::Direction::Output>,
+      io::descr::DescriptorIoTicket<io::Direction::Input>,
+      io::descr::DerivedIoTicket<io::Direction::Output>,
+      io::descr::DerivedIoTicket<io::Direction::Input>>
+      u;
+};
+
+class WorkQueue {
+public:
+  RT_API_ATTRS explicit WorkQueue(Terminator &terminator)
+      : terminator_{terminator} {
+    for (int j{1}; j < numStatic_; ++j) {
+      static_[j].previous = &static_[j - 1];
+      static_[j - 1].next = &static_[j];
+    }
+  }
+  RT_API_ATTRS ~WorkQueue();
+  RT_API_ATTRS Terminator &terminator() { return terminator_; };
+
+  // APIs for particular tasks.  These can return StatOk if the work is
+  // completed immediately.
+  RT_API_ATTRS int BeginInitialize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+    if (runTicketsImmediately_) {
+      return InitializeTicket{descriptor, derived}.Run(*this);
+    } else {
+      StartTicket().u.emplace<InitializeTicket>(descriptor, derived);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginInitializeClone(const Descriptor &clone,
+      const Descriptor &original, const typeInfo::DerivedType &derived,
+      bool hasStat, const Descriptor *errMsg) {
+    if (runTicketsImmediately_) {
+      return InitializeCloneTicket{clone, original, derived, hasStat, errMsg}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<InitializeCloneTicket>(
+          clone, original, derived, hasStat, errMsg);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginFinalize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+    if (runTicketsImmediately_) {
+      return FinalizeTicket{descriptor, derived}.Run(*this);
+    } else {
+      StartTicket().u.emplace<FinalizeTicket>(descriptor, derived);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginDestroy(const Descriptor &descriptor,
+      const typeInfo::DerivedType &derived, bool finalize) {
+    if (runTicketsImmediately_) {
+      return DestroyTicket{descriptor, derived, finalize}.Run(*this);
+    } else {
+      StartTicket().u.emplace<DestroyTicket>(descriptor, derived, finalize);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginAssign(Descriptor &to, const Descriptor &from,
+      int flags, MemmoveFct memmoveFct) {
+    if (runTicketsImmediately_) {
+      return AssignTicket{to, from, flags, memmoveFct}.Run(*this);
+    } else {
+      StartTicket().u.emplace<AssignTicket>(to, from, flags, memmoveFct);
+      return StatContinue;
+    }
+  }
+  template <bool IS_COMPONENTWISE>
+  RT_API_ATTRS int BeginDerivedAssign(Descriptor &to, const Descriptor &from,
+      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
+      Descriptor *deallocateAfter) {
+    if (runTicketsImmediately_) {
+      return DerivedAssignTicket<IS_COMPONENTWISE>{
+          to, from, derived, flags, memmoveFct, deallocateAfter}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<DerivedAssignTicket<IS_COMPONENTWISE>>(
+          to, from, derived, flags, memmoveFct, deallocateAfter);
+      return StatContinue;
+    }
+  }
+  template <io::Direction DIR>
+  RT_API_ATTRS int BeginDescriptorIo(io::IoStatementState &io,
+      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
+      bool &anyIoTookPlace) {
+    if (runTicketsImmediately_) {
+      return io::descr::DescriptorIoTicket<DIR>{
+          io, descriptor, table, anyIoTookPlace}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<io::descr::DescriptorIoTicket<DIR>>(
+          io, descriptor, table, anyIoTookPlace);
+      return StatContinue;
+    }
+  }
+  template <io::Direction DIR>
+  RT_API_ATTRS int BeginDerivedIo(io::IoStatementState &io,
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) {
+    if (runTicketsImmediately_) {
+      return io::descr::DerivedIoTicket<DIR>{
+          io, descriptor, derived, table, anyIoTookPlace}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<io::descr::DerivedIoTicket<DIR>>(
+          io, descriptor, derived, table, anyIoTookPlace);
+      return StatContinue;
+    }
+  }
+
+  RT_API_ATTRS int Run();
+
+private:
+#if RT_DEVICE_COMPILATION
+  // Always use the work queue on a GPU device to avoid recursion.
+  static constexpr bool runTicketsImmediately_{false};
+#else
+  // Avoid the work queue overhead on the host, unless it needs
+  // debugging, which is so much easier there.
+  static constexpr bool runTicketsImmediately_{true};
+#endif
+
+  // Most uses of the work queue won't go very deep.
+  static constexpr int numStatic_{2};
+
+  struct TicketList {
+    bool isStatic{true};
+    Ticket ticket;
+    TicketList *previous{nullptr}, *next{nullptr};
+  };
+
+  RT_API_ATTRS Ticket &StartTicket();
+  RT_API_ATTRS void Stop();
+
+  Terminator &terminator_;
+  TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr};
+  TicketList static_[numStatic_];
+  TicketList *firstFree_{static_};
+};
+
+} // namespace Fortran::runtime
+#endif // FLANG_RT_RUNTIME_WORK_QUEUE_H_
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index a3f63b4315644..332c0872e065f 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -68,6 +68,7 @@ set(supported_sources
   type-info.cpp
   unit.cpp
   utf.cpp
+  work-queue.cpp
 )
 
 # List of source not used for GPU offloading.
@@ -131,6 +132,7 @@ set(gpu_sources
   type-code.cpp
   type-info.cpp
   utf.cpp
+  work-queue.cpp
   complex-powi.cpp
   reduce.cpp
   reduction.cpp
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index bf67b5dc8b645..41b130cc8f257 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -14,6 +14,7 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -102,11 +103,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
     toDim.SetByteStride(stride);
     stride *= toDim.Extent();
   }
-  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
-  if (result == StatOk && derived && !derived->noInitializationNeeded()) {
-    result = ReturnError(terminator, Initialize(to, *derived, terminator));
-  }
-  return result;
+  return ReturnError(terminator, to.Allocate(kNoAsyncObject));
 }
 
 // least <= 0, most >= 0
@@ -231,6 +228,8 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
   }
 }
 
+RT_OFFLOAD_API_GROUP_BEGIN
+
 // Common implementation of assignments, both intrinsic assignments and
 // those cases of polymorphic user-defined ASSIGNMENT(=) TBPs that could not
 // be resolved in semantics.  Most assignment statements do not need any
@@ -244,275 +243,453 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
 // dealing with array constructors.
 RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
     Terminator &terminator, int flags, MemmoveFct memmoveFct) {
-  bool mustDeallocateLHS{(flags & DeallocateLHS) ||
-      MustDeallocateLHS(to, from, terminator, flags)};
-  DescriptorAddendum *toAddendum{to.Addendum()};
-  const typeInfo::DerivedType *toDerived{
-      toAddendum ? toAddendum->derivedType() : nullptr};
-  if (toDerived && (flags & NeedFinalization) &&
-      toDerived->noFinalizationNeeded()) {
-    flags &= ~NeedFinalization;
-  }
-  std::size_t toElementBytes{to.ElementBytes()};
-  std::size_t fromElementBytes{from.ElementBytes()};
-  // The following lambda definition violates the conding style,
-  // but cuda-11.8 nvcc hits an internal error with the brace initialization.
-  auto isSimpleMemmove = [&]() {
-    return !toDerived && to.rank() == from.rank() && to.IsContiguous() &&
-        from.IsContiguous() && toElementBytes == fromElementBytes;
-  };
-  StaticDescriptor<maxRank, true, 10 /*?*/> deferredDeallocStatDesc;
-  Descriptor *deferDeallocation{nullptr};
-  if (MayAlias(to, from)) {
+  WorkQueue workQueue{terminator};
+  if (workQueue.BeginAssign(to, from, flags, memmoveFct) == StatContinue) {
+    workQueue.Run();
+  }
+}
+
+RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) {
+  bool mustDeallocateLHS{(flags_ & DeallocateLHS) ||
+      MustDeallocateLHS(to_, *from_, workQueue.terminator(), flags_)};
+  DescriptorAddendum *toAddendum{to_.Addendum()};
+  toDerived_ = toAddendum ? toAddendum->derivedType() : nullptr;
+  if (toDerived_ && (flags_ & NeedFinalization) &&
+      toDerived_->noFinalizationNeeded()) {
+    flags_ &= ~NeedFinalization;
+  }
+  if (MayAlias(to_, *from_)) {
     if (mustDeallocateLHS) {
-      deferDeallocation = &deferredDeallocStatDesc.descriptor();
+      // Convert the LHS into a temporary, then make it look deallocated.
+      toDeallocate_ = &tempDescriptor_.descriptor();
+      persist_ = true; // tempDescriptor_ state must outlive child tickets
       std::memcpy(
-          reinterpret_cast<void *>(deferDeallocation), &to, to.SizeInBytes());
-      to.set_base_addr(nullptr);
-    } else if (!isSimpleMemmove()) {
+          reinterpret_cast<void *>(toDeallocate_), &to_, to_.SizeInBytes());
+      to_.set_base_addr(nullptr);
+      if (toDerived_ && (flags_ & NeedFinalization)) {
+        if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)};
+            status != StatOk && status != StatContinue) {
+          return status;
+        }
+        flags_ &= ~NeedFinalization;
+      }
+    } else if (!IsSimpleMemmove()) {
       // Handle LHS/RHS aliasing by copying RHS into a temp, then
       // recursively assigning from that temp.
-      auto descBytes{from.SizeInBytes()};
-      StaticDescriptor<maxRank, true, 16> staticDesc;
-      Descriptor &newFrom{staticDesc.descriptor()};
-      std::memcpy(reinterpret_cast<void *>(&newFrom), &from, descBytes);
+      auto descBytes{from_->SizeInBytes()};
+      Descriptor &newFrom{tempDescriptor_.descriptor()};
+      persist_ = true; // tempDescriptor_ state must outlive child tickets
+      std::memcpy(reinterpret_cast<void *>(&newFrom), from_, descBytes);
       // Pretend the temporary descriptor is for an ALLOCATABLE
       // entity, otherwise, the Deallocate() below will not
       // free the descriptor memory.
       newFrom.raw().attribute = CFI_attribute_allocatable;
-      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
-      if (stat == StatOk) {
-        if (HasDynamicComponent(from)) {
-          // If 'from' has allocatable/automatic component, we cannot
-          // just make a shallow copy of the descriptor member.
-          // This will still leave data overlap in 'to' and 'newFrom'.
-          // For example:
-          //   type t
-          //     character, allocatable :: c(:)
-          //   end type t
-          //   type(t) :: x(3)
-          //   x(2:3) = x(1:2)
-          // We have to make a deep copy into 'newFrom' in this case.
-          RTNAME(AssignTemporary)
-          (newFrom, from, terminator.sourceFileName(), terminator.sourceLine());
-        } else {
-          ShallowCopy(newFrom, from, true, from.IsContiguous());
+      if (int stat{ReturnError(
+              workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))};
+          stat != StatOk) {
+        return stat;
+      }
+      if (HasDynamicComponent(*from_)) {
+        // If 'from' has allocatable/automatic component, we cannot
+        // just make a shallow copy of the descriptor member.
+        // This will still leave data overlap in 'to' and 'newFrom'.
+        // For example:
+        //   type t
+        //     character, allocatable :: c(:)
+        //   end type t
+        //   type(t) :: x(3)
+        //   x(2:3) = x(1:2)
+        // We have to make a deep copy into 'newFrom' in this case.
+        if (const DescriptorAddendum *addendum{newFrom.Addendum()}) {
+          if (const auto *derived{addendum->derivedType()}) {
+            if (!derived->noInitializationNeeded()) {
+              if (int status{workQueue.BeginInitialize(newFrom, *derived)};
+                  status != StatOk && status != StatContinue) {
+                return status;
+              }
+            }
+          }
+        }
+        static constexpr int nestedFlags{MaybeReallocate | PolymorphicLHS};
+        if (int status{workQueue.BeginAssign(
+                newFrom, *from_, nestedFlags, memmoveFct_)};
+            status != StatOk && status != StatContinue) {
+          return status;
         }
-        Assign(to, newFrom, terminator,
-            flags &
-                (NeedFinalization | ComponentCanBeDefinedAssignment |
-                    ExplicitLengthCharacterLHS | CanBeDefinedAssignment));
-        newFrom.Deallocate();
+      } else {
+        ShallowCopy(newFrom, *from_, true, from_->IsContiguous());
       }
-      return;
+      from_ = &newFrom;
+      flags_ &= NeedFinalization | ComponentCanBeDefinedAssignment |
+          ExplicitLengthCharacterLHS | CanBeDefinedAssignment;
+      toDeallocate_ = &newFrom;
     }
   }
-  if (to.IsAllocatable()) {
+  if (to_.IsAllocatable()) {
     if (mustDeallocateLHS) {
-      if (deferDeallocation) {
-        if ((flags & NeedFinalization) && toDerived) {
-          Finalize(*deferDeallocation, *toDerived, &terminator);
-          flags &= ~NeedFinalization;
-        }
-      } else {
-        to.Destroy((flags & NeedFinalization) != 0, /*destroyPointers=*/false,
-            &terminator);
-        flags &= ~NeedFinalization;
+      if (!toDeallocate_ && to_.IsAllocated()) {
+        toDeallocate_ = &to_;
       }
-    } else if (to.rank() != from.rank() && !to.IsAllocated()) {
-      terminator.Crash("Assign: mismatched ranks (%d != %d) in assignment to "
-                       "unallocated allocatable",
-          to.rank(), from.rank());
+    } else if (to_.rank() != from_->rank() && !to_.IsAllocated()) {
+      workQueue.terminator().Crash("Assign: mismatched ranks (%d != %d) in "
+                                   "assignment to unallocated allocatable",
+          to_.rank(), from_->rank());
     }
-    if (!to.IsAllocated()) {
-      if (AllocateAssignmentLHS(to, from, terminator, flags) != StatOk) {
-        return;
+  } else if (!to_.IsAllocated()) {
+    workQueue.terminator().Crash(
+        "Assign: left-hand side variable is neither allocated nor allocatable");
+  }
+  if (toDerived_ && to_.IsAllocated()) {
+    // Schedule finalization or destruction of the LHS.
+    if (flags_ & NeedFinalization) {
+      if (int status{workQueue.BeginFinalize(to_, *toDerived_)};
+          status != StatOk && status != StatContinue) {
+        return status;
+      }
+    } else if (!toDerived_->noDestructionNeeded()) {
+      if (int status{
+              workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
-      flags &= ~NeedFinalization;
-      toElementBytes = to.ElementBytes(); // may have changed
-      toDerived = toAddendum ? toAddendum->derivedType() : nullptr;
     }
   }
-  if (toDerived && (flags & CanBeDefinedAssignment)) {
-    // Check for a user-defined assignment type-bound procedure;
-    // see 10.2.1.4-5.  A user-defined assignment TBP defines all of
-    // the semantics, including allocatable (re)allocation and any
-    // finalization.
-    //
-    // Note that the aliasing and LHS (re)allocation handling above
-    // needs to run even with CanBeDefinedAssignment flag, when
-    // the Assign() is invoked recursively for component-per-component
-    // assignments.
-    if (to.rank() == 0) {
-      if (const auto *special{toDerived->FindSpecialBinding(
+  return StatContinue;
+}
+
+RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) {
+  if (done_) {
+    // All child tickets are complete; can release this ticket's state.
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+    }
+    return StatOk;
+  }
+  // All necessary finalization or destruction that was initiated by Begin()
+  // has been completed.  Deallocation may be pending, and if it's for the LHS,
+  // do it now so that the LHS gets reallocated.
+  if (toDeallocate_ == &to_) {
+    toDeallocate_ = nullptr;
+    to_.Deallocate();
+  }
+  // Allocate the LHS if needed
+  if (!to_.IsAllocated()) {
+    if (int stat{
+            AllocateAssignmentLHS(to_, *from_, workQueue.terminator(), flags_)};
+        stat != StatOk) {
+      return stat;
+    }
+    const auto *addendum{to_.Addendum()};
+    toDerived_ = addendum ? addendum->derivedType() : nullptr;
+    if (toDerived_ && !toDerived_->noInitializationNeeded()) {
+      if (int status{workQueue.BeginInitialize(to_, *toDerived_)};
+          status != StatOk) {
+        return status;
+      }
+    }
+  }
+  // Check for a user-defined assignment type-bound procedure;
+  // see 10.2.1.4-5.
+  // Note that the aliasing and LHS (re)allocation handling above
+  // needs to run even with CanBeDefinedAssignment flag, since
+  // Assign() can be invoked recursively for component-wise assignments.
+  if (toDerived_ && (flags_ & CanBeDefinedAssignment)) {
+    if (to_.rank() == 0) {
+      if (const auto *special{toDerived_->FindSpecialBinding(
               typeInfo::SpecialBinding::Which::ScalarAssignment)}) {
-        return DoScalarDefinedAssignment(to, from, *special);
+        DoScalarDefinedAssignment(to_, *from_, *special);
+        done_ = true;
+        return StatContinue;
       }
     }
-    if (const auto *special{toDerived->FindSpecialBinding(
+    if (const auto *special{toDerived_->FindSpecialBinding(
             typeInfo::SpecialBinding::Which::ElementalAssignment)}) {
-      return DoElementalDefinedAssignment(to, from, *toDerived, *special);
+      DoElementalDefinedAssignment(to_, *from_, *toDerived_, *special);
+      done_ = true;
+      return StatContinue;
     }
   }
-  SubscriptValue toAt[maxRank];
-  to.GetLowerBounds(toAt);
-  // Scalar expansion of the RHS is implied by using the same empty
-  // subscript values on each (seemingly) elemental reference into
-  // "from".
-  SubscriptValue fromAt[maxRank];
-  from.GetLowerBounds(fromAt);
-  std::size_t toElements{to.Elements()};
-  if (from.rank() > 0 && toElements != from.Elements()) {
-    terminator.Crash("Assign: mismatching element counts in array assignment "
-                     "(to %zd, from %zd)",
-        toElements, from.Elements());
+  // Intrinsic assignment
+  std::size_t toElements{to_.Elements()};
+  if (from_->rank() > 0 && toElements != from_->Elements()) {
+    workQueue.terminator().Crash("Assign: mismatching element counts in array "
+                                 "assignment (to %zd, from %zd)",
+        toElements, from_->Elements());
   }
-  if (to.type() != from.type()) {
-    terminator.Crash("Assign: mismatching types (to code %d != from code %d)",
-        to.type().raw(), from.type().raw());
+  if (to_.type() != from_->type()) {
+    workQueue.terminator().Crash(
+        "Assign: mismatching types (to code %d != from code %d)",
+        to_.type().raw(), from_->type().raw());
   }
-  if (toElementBytes > fromElementBytes && !to.type().IsCharacter()) {
-    terminator.Crash("Assign: mismatching non-character element sizes (to %zd "
-                     "bytes != from %zd bytes)",
+  std::size_t toElementBytes{to_.ElementBytes()};
+  std::size_t fromElementBytes{from_->ElementBytes()};
+  if (toElementBytes > fromElementBytes && !to_.type().IsCharacter()) {
+    workQueue.terminator().Crash("Assign: mismatching non-character element "
+                                 "sizes (to %zd bytes != from %zd bytes)",
         toElementBytes, fromElementBytes);
   }
-  if (const typeInfo::DerivedType *
-      updatedToDerived{toAddendum ? toAddendum->derivedType() : nullptr}) {
-    // Derived type intrinsic assignment, which is componentwise and elementwise
-    // for all components, including parent components (10.2.1.2-3).
-    // The target is first finalized if still necessary (7.5.6.3(1))
-    if (flags & NeedFinalization) {
-      Finalize(to, *updatedToDerived, &terminator);
-    } else if (updatedToDerived && !updatedToDerived->noDestructionNeeded()) {
-      Destroy(to, /*finalize=*/false, *updatedToDerived, &terminator);
-    }
-    // Copy the data components (incl. the parent) first.
-    const Descriptor &componentDesc{updatedToDerived->component()};
-    std::size_t numComponents{componentDesc.Elements()};
-    for (std::size_t j{0}; j < toElements;
-         ++j, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-      for (std::size_t k{0}; k < numComponents; ++k) {
-        const auto &comp{
-            *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
-                k)}; // TODO: exploit contiguity here
-        // Use PolymorphicLHS for components so that the right things happen
-        // when the components are polymorphic; when they're not, they're both
-        // not, and their declared types will match.
-        int nestedFlags{MaybeReallocate | PolymorphicLHS};
-        if (flags & ComponentCanBeDefinedAssignment) {
-          nestedFlags |=
-              CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
-        }
-        switch (comp.genre()) {
-        case typeInfo::Component::Genre::Data:
-          if (comp.category() == TypeCategory::Derived) {
-            StaticDescriptor<maxRank, true, 10 /*?*/> statDesc[2];
-            Descriptor &toCompDesc{statDesc[0].descriptor()};
-            Descriptor &fromCompDesc{statDesc[1].descriptor()};
-            comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt);
-            comp.CreatePointerDescriptor(
-                fromCompDesc, from, terminator, fromAt);
-            Assign(toCompDesc, fromCompDesc, terminator, nestedFlags);
-          } else { // Component has intrinsic type; simply copy raw bytes
-            std::size_t componentByteSize{comp.SizeInBytes(to)};
-            memmoveFct(to.Element<char>(toAt) + comp.offset(),
-                from.Element<const char>(fromAt) + comp.offset(),
-                componentByteSize);
-          }
-          break;
-        case typeInfo::Component::Genre::Pointer: {
-          std::size_t componentByteSize{comp.SizeInBytes(to)};
-          memmoveFct(to.Element<char>(toAt) + comp.offset(),
-              from.Element<const char>(fromAt) + comp.offset(),
-              componentByteSize);
-        } break;
-        case typeInfo::Component::Genre::Allocatable:
-        case typeInfo::Component::Genre::Automatic: {
-          auto *toDesc{reinterpret_cast<Descriptor *>(
-              to.Element<char>(toAt) + comp.offset())};
-          const auto *fromDesc{reinterpret_cast<const Descriptor *>(
-              from.Element<char>(fromAt) + comp.offset())};
-          // Allocatable components of the LHS are unconditionally
-          // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
-          // unlike a "top-level" assignment to a variable, where
-          // deallocation is optional.
-          //
-          // Be careful not to destroy/reallocate the LHS, if there is
-          // overlap between LHS and RHS (it seems that partial overlap
-          // is not possible, though).
-          // Invoke Assign() recursively to deal with potential aliasing.
-          if (toDesc->IsAllocatable()) {
-            if (!fromDesc->IsAllocated()) {
-              // No aliasing.
-              //
-              // If to is not allocated, the Destroy() call is a no-op.
-              // This is just a shortcut, because the recursive Assign()
-              // below would initiate the destruction for to.
-              // No finalization is required.
-              toDesc->Destroy(
-                  /*finalize=*/false, /*destroyPointers=*/false, &terminator);
-              continue; // F'2018 10.2.1.3(13)(2)
-            }
-          }
-          // Force LHS deallocation with DeallocateLHS flag.
-          // The actual deallocation may be avoided, if the existing
-          // location can be reoccupied.
-          Assign(*toDesc, *fromDesc, terminator, nestedFlags | DeallocateLHS);
-        } break;
-        }
+  if (toDerived_) {
+    if (toDerived_->noDefinedAssignment()) { // componentwise
+      if (int status{workQueue.BeginDerivedAssign<true>(
+              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
-      // Copy procedure pointer components
-      const Descriptor &procPtrDesc{updatedToDerived->procPtr()};
-      std::size_t numProcPtrs{procPtrDesc.Elements()};
-      for (std::size_t k{0}; k < numProcPtrs; ++k) {
-        const auto &procPtr{
-            *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(
-                k)};
-        memmoveFct(to.Element<char>(toAt) + procPtr.offset,
-            from.Element<const char>(fromAt) + procPtr.offset,
-            sizeof(typeInfo::ProcedurePointer));
+    } else { // elementwise
+      if (int status{workQueue.BeginDerivedAssign<false>(
+              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
     }
-  } else { // intrinsic type, intrinsic assignment
-    if (isSimpleMemmove()) {
-      memmoveFct(to.raw().base_addr, from.raw().base_addr,
-          toElements * toElementBytes);
-    } else if (toElementBytes > fromElementBytes) { // blank padding
-      switch (to.type().raw()) {
+    toDeallocate_ = nullptr;
+  } else if (IsSimpleMemmove()) {
+    memmoveFct_(to_.raw().base_addr, from_->raw().base_addr,
+        toElements * toElementBytes);
+  } else {
+    // Scalar expansion of the RHS is implied by using the same empty
+    // subscript values on each (seemingly) elemental reference into
+    // "from".
+    SubscriptValue toAt[maxRank];
+    to_.GetLowerBounds(toAt);
+    SubscriptValue fromAt[maxRank];
+    from_->GetLowerBounds(fromAt);
+    if (toElementBytes > fromElementBytes) { // blank padding
+      switch (to_.type().raw()) {
       case CFI_type_signed_char:
       case CFI_type_char:
-        BlankPadCharacterAssignment<char>(to, from, toAt, fromAt, toElements,
+        BlankPadCharacterAssignment<char>(to_, *from_, toAt, fromAt, toElements,
             toElementBytes, fromElementBytes);
         break;
       case CFI_type_char16_t:
-        BlankPadCharacterAssignment<char16_t>(to, from, toAt, fromAt,
+        BlankPadCharacterAssignment<char16_t>(to_, *from_, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       case CFI_type_char32_t:
-        BlankPadCharacterAssignment<char32_t>(to, from, toAt, fromAt,
+        BlankPadCharacterAssignment<char32_t>(to_, *from_, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       default:
-        terminator.Crash("unexpected type code %d in blank padded Assign()",
-            to.type().raw());
+        workQueue.terminator().Crash(
+            "unexpected type code %d in blank padded Assign()",
+            to_.type().raw());
       }
     } else { // elemental copies, possibly with character truncation
       for (std::size_t n{toElements}; n-- > 0;
-          to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-        memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
+          to_.IncrementSubscripts(toAt), from_->IncrementSubscripts(fromAt)) {
+        memmoveFct_(to_.Element<char>(toAt), from_->Element<const char>(fromAt),
             toElementBytes);
       }
     }
   }
-  if (deferDeallocation) {
-    // deferDeallocation is used only when LHS is an allocatable.
-    // The finalization has already been run for it.
-    deferDeallocation->Destroy(
-        /*finalize=*/false, /*destroyPointers=*/false, &terminator);
+  if (persist_) {
+    done_ = true;
+    return StatContinue;
+  } else {
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+      toDeallocate_ = nullptr;
+    }
+    return StatOk;
   }
 }
 
-RT_OFFLOAD_API_GROUP_BEGIN
+template <bool IS_COMPONENTWISE>
+RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
+    WorkQueue &workQueue) {
+  if (toIsContiguous_ && fromIsContiguous_ &&
+      this->derived_.noDestructionNeeded() &&
+      this->derived_.noDefinedAssignment() &&
+      this->instance_.rank() == this->from_->rank()) {
+    if (std::size_t elementBytes{this->instance_.ElementBytes()};
+        elementBytes == this->from_->ElementBytes()) {
+      // Fastest path.  Both LHS and RHS are contiguous, RHS is not a scalar
+      // to be expanded, the types have the same size, and there are no
+      // allocatable components or defined ASSIGNMENT(=) at any level.
+      memmoveFct_(this->instance_.template OffsetElement<char>(),
+          this->from_->template OffsetElement<const char *>(),
+          this->instance_.Elements() * elementBytes);
+      return StatOk;
+    }
+  }
+  // Use PolymorphicLHS for components so that the right things happen
+  // when the components are polymorphic; when they're not, they're both
+  // not, and their declared types will match.
+  int nestedFlags{MaybeReallocate | PolymorphicLHS};
+  if (flags_ & ComponentCanBeDefinedAssignment) {
+    nestedFlags |= CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
+  }
+  flags_ = nestedFlags;
+  // Copy procedure pointer components
+  const Descriptor &procPtrDesc{this->derived_.procPtr()};
+  bool noDataComponents{this->IsComplete()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &procPtr{
+          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+      // Loop only over elements
+      if (noDataComponents) {
+        Elementwise::Reset();
+      }
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        memmoveFct_(this->instance_.template ElementComponent<char>(
+                        this->subscripts_, procPtr.offset),
+            this->from_->template ElementComponent<const char>(
+                this->fromSubscripts_, procPtr.offset),
+            sizeof(typeInfo::ProcedurePointer));
+      }
+    }
+    if (noDataComponents) {
+      return StatOk;
+    }
+    Elementwise::Reset();
+  }
+  if (noDataComponents) {
+    return StatOk;
+  }
+  return StatContinue;
+}
+template RT_API_ATTRS int DerivedAssignTicket<false>::Begin(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket<true>::Begin(WorkQueue &);
+
+template <bool IS_COMPONENTWISE>
+RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
+    WorkQueue &workQueue) {
+  while (!this->IsComplete()) {
+    // Copy the data components (incl. the parent) first.
+    switch (this->component_->genre()) {
+    case typeInfo::Component::Genre::Data:
+      if (this->component_->category() == TypeCategory::Derived) {
+        Descriptor &toCompDesc{this->componentDescriptor_.descriptor()};
+        Descriptor &fromCompDesc{this->fromComponentDescriptor_.descriptor()};
+        this->component_->CreatePointerDescriptor(toCompDesc, this->instance_,
+            workQueue.terminator(), this->subscripts_);
+        this->component_->CreatePointerDescriptor(fromCompDesc, *this->from_,
+            workQueue.terminator(), this->fromSubscripts_);
+        this->Advance();
+        if (int status{workQueue.BeginAssign(
+                toCompDesc, fromCompDesc, flags_, memmoveFct_)};
+            status != StatOk) {
+          return status;
+        }
+      } else { // Component has intrinsic type; simply copy raw bytes
+        std::size_t componentByteSize{
+            this->component_->SizeInBytes(this->instance_)};
+        if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
+          std::size_t offset{this->component_->offset()};
+          char *to{this->instance_.template OffsetElement<char>(offset)};
+          const char *from{
+              this->from_->template OffsetElement<const char>(offset)};
+          std::size_t toElementStride{this->instance_.ElementBytes()};
+          std::size_t fromElementStride{
+              this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
+          if (toElementStride == fromElementStride &&
+              toElementStride == componentByteSize) {
+            memmoveFct_(to, from, this->elements_ * componentByteSize);
+          } else {
+            for (std::size_t n{this->elements_}; n--;
+                to += toElementStride, from += fromElementStride) {
+              memmoveFct_(to, from, componentByteSize);
+            }
+          }
+          this->Componentwise::Advance();
+        } else {
+          memmoveFct_(
+              this->instance_.template Element<char>(this->subscripts_) +
+                  this->component_->offset(),
+              this->from_->template Element<const char>(this->fromSubscripts_) +
+                  this->component_->offset(),
+              componentByteSize);
+          this->Advance();
+        }
+      }
+      break;
+    case typeInfo::Component::Genre::Pointer: {
+      std::size_t componentByteSize{
+          this->component_->SizeInBytes(this->instance_)};
+      if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
+        std::size_t offset{this->component_->offset()};
+        char *to{this->instance_.template OffsetElement<char>(offset)};
+        const char *from{
+            this->from_->template OffsetElement<const char>(offset)};
+        std::size_t toElementStride{this->instance_.ElementBytes()};
+        std::size_t fromElementStride{
+            this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
+        if (toElementStride == fromElementStride &&
+            toElementStride == componentByteSize) {
+          memmoveFct_(to, from, this->elements_ * componentByteSize);
+        } else {
+          for (std::size_t n{this->elements_}; n--;
+              to += toElementStride, from += fromElementStride) {
+            memmoveFct_(to, from, componentByteSize);
+          }
+        }
+        this->Componentwise::Advance();
+      } else {
+        memmoveFct_(this->instance_.template Element<char>(this->subscripts_) +
+                this->component_->offset(),
+            this->from_->template Element<const char>(this->fromSubscripts_) +
+                this->component_->offset(),
+            componentByteSize);
+        this->Advance();
+      }
+    } break;
+    case typeInfo::Component::Genre::Allocatable:
+    case typeInfo::Component::Genre::Automatic: {
+      auto *toDesc{reinterpret_cast<Descriptor *>(
+          this->instance_.template Element<char>(this->subscripts_) +
+          this->component_->offset())};
+      const auto *fromDesc{reinterpret_cast<const Descriptor *>(
+          this->from_->template Element<char>(this->fromSubscripts_) +
+          this->component_->offset())};
+      if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) {
+        if (toDesc->IsAllocated()) {
+          if (this->phase_ == 0) {
+            this->phase_++;
+            if (const auto *componentDerived{this->component_->derivedType()};
+                componentDerived && !componentDerived->noDestructionNeeded()) {
+              if (int status{workQueue.BeginDestroy(
+                      *toDesc, *componentDerived, /*finalize=*/false)};
+                  status != StatOk) {
+                return status;
+              }
+            }
+          }
+          toDesc->Deallocate();
+        }
+        this->Advance();
+      } else {
+        // Allocatable components of the LHS are unconditionally
+        // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
+        // unlike a "top-level" assignment to a variable, where
+        // deallocation is optional.
+        this->Advance();
+        int nestedFlags{flags_};
+        if (this->derived_.noFinalizationNeeded() &&
+            this->derived_.noInitializationNeeded() &&
+            this->derived_.noDestructionNeeded()) {
+          // The actual deallocation may be avoided, if the existing
+          // location can be reoccupied.
+        } else {
+          // Force LHS deallocation with DeallocateLHS flag.
+          nestedFlags |= DeallocateLHS;
+        }
+        if (int status{workQueue.BeginAssign(
+                *toDesc, *fromDesc, nestedFlags, memmoveFct_)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    } break;
+    }
+  }
+  if (deallocateAfter_) {
+    deallocateAfter_->Deallocate();
+  }
+  return StatOk;
+}
+template RT_API_ATTRS int DerivedAssignTicket<false>::Continue(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket<true>::Continue(WorkQueue &);
 
 RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
     const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) {
@@ -582,7 +759,6 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from,
       }
     }
   }
-
   Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS);
 }
 
@@ -599,7 +775,6 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
 void RTDEF(CopyOutAssign)(
     Descriptor *var, Descriptor &temp, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
-
   // Copyout from the temporary must not cause any finalizations
   // for LHS. The variable must be properly initialized already.
   if (var) {
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 35037036f63e7..8ab737c701b01 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -12,6 +12,7 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -30,180 +31,193 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
 }
 
 RT_API_ATTRS int Initialize(const Descriptor &instance,
-    const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat,
-    const Descriptor *errMsg) {
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t elements{instance.Elements()};
-  int stat{StatOk};
-  // Initialize data components in each element; the per-element iterations
-  // constitute the inner loops, not the outer ones
-  std::size_t myComponents{componentDesc.Elements()};
-  for (std::size_t k{0}; k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    SubscriptValue at[maxRank];
-    instance.GetLowerBounds(at);
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        Descriptor &allocDesc{
-            *instance.ElementComponent<Descriptor>(at, comp.offset())};
-        comp.EstablishDescriptor(allocDesc, instance, terminator);
+    const typeInfo::DerivedType &derived, Terminator &terminator, bool,
+    const Descriptor *) {
+  WorkQueue workQueue{terminator};
+  int status{workQueue.BeginInitialize(instance, derived)};
+  return status == StatContinue ? workQueue.Run() : status;
+}
+
+RT_API_ATTRS int InitializeTicket::Begin(WorkQueue &) {
+  // Initialize procedure pointer components in each element
+  const Descriptor &procPtrDesc{derived_.procPtr()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    bool noDataComponents{IsComplete()};
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &comp{
+          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+      // Loop only over elements
+      if (noDataComponents) {
+        Elementwise::Reset();
+      }
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        auto &pptr{*instance_.ElementComponent<typeInfo::ProcedurePointer>(
+            subscripts_, comp.offset)};
+        pptr = comp.procInitialization;
+      }
+    }
+    if (noDataComponents) {
+      return StatOk;
+    }
+    Elementwise::Reset();
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
+      // Establish allocatable descriptors
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        Descriptor &allocDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        component_->EstablishDescriptor(
+            allocDesc, instance_, workQueue.terminator());
         allocDesc.raw().attribute = CFI_attribute_allocatable;
-        if (comp.genre() == typeInfo::Component::Genre::Automatic) {
-          stat = ReturnError(
-              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
-          if (stat == StatOk) {
-            if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
-              if (const auto *derived{addendum->derivedType()}) {
-                if (!derived->noInitializationNeeded()) {
-                  stat = Initialize(
-                      allocDesc, *derived, terminator, hasStat, errMsg);
-                }
-              }
-            }
-          }
-          if (stat != StatOk) {
-            break;
-          }
-        }
       }
-    } else if (const void *init{comp.initialization()}) {
+      SkipToNextComponent();
+    } else if (const void *init{component_->initialization()}) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
-      std::size_t bytes{comp.SizeInBytes(instance)};
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        char *ptr{instance.ElementComponent<char>(at, comp.offset())};
+      std::size_t bytes{component_->SizeInBytes(instance_)};
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        char *ptr{instance_.ElementComponent<char>(
+            subscripts_, component_->offset())};
         std::memcpy(ptr, init, bytes);
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Pointer) {
+      SkipToNextComponent();
+    } else if (component_->genre() == typeInfo::Component::Genre::Pointer) {
       // Data pointers without explicit initialization are established
       // so that they are valid right-hand side targets of pointer
       // assignment statements.
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        Descriptor &ptrDesc{
-            *instance.ElementComponent<Descriptor>(at, comp.offset())};
-        comp.EstablishDescriptor(ptrDesc, instance, terminator);
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        Descriptor &ptrDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        component_->EstablishDescriptor(
+            ptrDesc, instance_, workQueue.terminator());
         ptrDesc.raw().attribute = CFI_attribute_pointer;
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType() && !comp.derivedType()->noInitializationNeeded()) {
+      SkipToNextComponent();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
+        component_->derivedType() &&
+        !component_->derivedType()->noInitializationNeeded()) {
       // Default initialization of non-pointer non-allocatable/automatic
-      // data component.  Handles parent component's elements.  Recursive.
+      // data component.  Handles parent component's elements.
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, instance);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            instance.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        stat = Initialize(compDesc, compType, terminator, hasStat, errMsg);
-        if (stat != StatOk) {
-          break;
-        }
+      GetComponentExtents(extents, *component_, instance_);
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      const typeInfo::DerivedType &compType{*component_->derivedType()};
+      compDesc.Establish(compType,
+          instance_.ElementComponent<char>(subscripts_, component_->offset()),
+          component_->rank(), extents);
+      Advance();
+      if (int status{workQueue.BeginInitialize(compDesc, compType)};
+          status != StatOk) {
+        return status;
       }
+    } else {
+      SkipToNextComponent();
     }
   }
-  // Initialize procedure pointer components in each element
-  const Descriptor &procPtrDesc{derived.procPtr()};
-  std::size_t myProcPtrs{procPtrDesc.Elements()};
-  for (std::size_t k{0}; k < myProcPtrs; ++k) {
-    const auto &comp{
-        *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
-    SubscriptValue at[maxRank];
-    instance.GetLowerBounds(at);
-    for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-      auto &pptr{*instance.ElementComponent<typeInfo::ProcedurePointer>(
-          at, comp.offset)};
-      pptr = comp.procInitialization;
-    }
-  }
-  return stat;
+  return StatOk;
 }
 
 RT_API_ATTRS int InitializeClone(const Descriptor &clone,
-    const Descriptor &orig, const typeInfo::DerivedType &derived,
+    const Descriptor &original, const typeInfo::DerivedType &derived,
     Terminator &terminator, bool hasStat, const Descriptor *errMsg) {
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t elements{orig.Elements()};
-  int stat{StatOk};
-
-  // Skip pointers and unallocated variables.
-  if (orig.IsPointer() || !orig.IsAllocated()) {
-    return stat;
+  if (original.IsPointer() || !original.IsAllocated()) {
+    return StatOk; // nothing to do
+  } else {
+    WorkQueue workQueue{terminator};
+    int status{workQueue.BeginInitializeClone(
+        clone, original, derived, hasStat, errMsg)};
+    return status == StatContinue ? workQueue.Run() : status;
   }
-  // Initialize each data component.
-  std::size_t components{componentDesc.Elements()};
-  for (std::size_t i{0}; i < components; ++i) {
-    const typeInfo::Component &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(i)};
-    SubscriptValue at[maxRank];
-    orig.GetLowerBounds(at);
-    // Allocate allocatable components that are also allocated in the original
-    // object.
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable) {
-      // Initialize each element.
-      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
-        Descriptor &origDesc{
-            *orig.ElementComponent<Descriptor>(at, comp.offset())};
-        Descriptor &cloneDesc{
-            *clone.ElementComponent<Descriptor>(at, comp.offset())};
-        if (origDesc.IsAllocated()) {
+}
+
+RT_API_ATTRS int InitializeCloneTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
+      Descriptor &origDesc{*instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      if (origDesc.IsAllocated()) {
+        Descriptor &cloneDesc{*clone_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        if (phase_ == 0) {
+          ++phase_;
           cloneDesc.ApplyMold(origDesc, origDesc.rank());
-          stat = ReturnError(
-              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
-          if (stat == StatOk) {
-            if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
-              if (const typeInfo::DerivedType *
-                  derived{addendum->derivedType()}) {
-                if (!derived->noInitializationNeeded()) {
-                  // Perform default initialization for the allocated element.
-                  stat = Initialize(
-                      cloneDesc, *derived, terminator, hasStat, errMsg);
-                }
-                // Initialize derived type's allocatables.
-                if (stat == StatOk) {
-                  stat = InitializeClone(cloneDesc, origDesc, *derived,
-                      terminator, hasStat, errMsg);
+          if (int stat{ReturnError(workQueue.terminator(),
+                  cloneDesc.Allocate(kNoAsyncObject), errMsg_, hasStat_)};
+              stat != StatOk) {
+            return stat;
+          }
+          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
+              if (!derived->noInitializationNeeded()) {
+                // Perform default initialization for the allocated element.
+                if (int status{workQueue.BeginInitialize(cloneDesc, *derived)};
+                    status != StatOk) {
+                  return status;
                 }
               }
             }
           }
         }
-        if (stat != StatOk) {
-          break;
+        if (phase_ == 1) {
+          ++phase_;
+          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
+              // Initialize derived type's allocatables.
+              if (int status{workQueue.BeginInitializeClone(
+                      cloneDesc, origDesc, *derived, hasStat_, errMsg_)};
+                  status != StatOk) {
+                return status;
+              }
+            }
+          }
         }
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType()) {
-      // Handle nested derived types.
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, orig);
-      // Data components don't have descriptors, allocate them.
-      StaticDescriptor<maxRank, true, 0> origStaticDesc;
-      StaticDescriptor<maxRank, true, 0> cloneStaticDesc;
-      Descriptor &origDesc{origStaticDesc.descriptor()};
-      Descriptor &cloneDesc{cloneStaticDesc.descriptor()};
-      // Initialize each element.
-      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
+      Advance();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
+      if (component_->derivedType()) {
+        // Handle nested derived types.
+        const typeInfo::DerivedType &compType{*component_->derivedType()};
+        SubscriptValue extents[maxRank];
+        GetComponentExtents(extents, *component_, instance_);
+        Descriptor &origDesc{componentDescriptor_.descriptor()};
+        Descriptor &cloneDesc{cloneComponentDescriptor_.descriptor()};
         origDesc.Establish(compType,
-            orig.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
+            instance_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
         cloneDesc.Establish(compType,
-            clone.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        stat = InitializeClone(
-            cloneDesc, origDesc, compType, terminator, hasStat, errMsg);
-        if (stat != StatOk) {
-          break;
+            clone_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
+        Advance();
+        if (int status{workQueue.BeginInitializeClone(
+                cloneDesc, origDesc, compType, hasStat_, errMsg_)};
+            status != StatOk) {
+          return status;
         }
+      } else {
+        SkipToNextComponent();
       }
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatOk;
+}
+
+// Fortran 2018 subclause 7.5.6.2
+RT_API_ATTRS void Finalize(const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived, Terminator *terminator) {
+  if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) {
+    Terminator stubTerminator{"Finalize() in Fortran runtime", 0};
+    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
+    if (workQueue.BeginFinalize(descriptor, derived) == StatContinue) {
+      workQueue.Run();
     }
   }
-  return stat;
 }
 
 static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
@@ -221,7 +235,7 @@ static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
 }
 
 static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
+    const typeInfo::DerivedType &derived, Terminator &terminator) {
   if (const auto *special{FindFinal(derived, descriptor.rank())}) {
     if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) {
       std::size_t elements{descriptor.Elements()};
@@ -258,9 +272,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
         copy = descriptor;
         copy.set_base_addr(nullptr);
         copy.raw().attribute = CFI_attribute_allocatable;
-        Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
-        RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
-            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
+        RUNTIME_CHECK(terminator, copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
         ShallowCopyDiscontiguousToContiguous(copy, descriptor);
         argDescriptor = &copy;
       }
@@ -284,87 +296,94 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
   }
 }
 
-// Fortran 2018 subclause 7.5.6.2
-RT_API_ATTRS void Finalize(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) {
-    return;
-  }
-  CallFinalSubroutine(descriptor, derived, terminator);
-  const auto *parentType{derived.GetParentType()};
-  bool recurse{parentType && !parentType->noFinalizationNeeded()};
+RT_API_ATTRS int FinalizeTicket::Begin(WorkQueue &workQueue) {
+  CallFinalSubroutine(instance_, derived_, workQueue.terminator());
   // If there's a finalizable parent component, handle it last, as required
   // by the Fortran standard (7.5.6.2), and do so recursively with the same
   // descriptor so that the rank is preserved.
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t myComponents{componentDesc.Elements()};
-  std::size_t elements{descriptor.Elements()};
-  for (auto k{recurse ? std::size_t{1}
-                      /* skip first component, it's the parent */
-                      : 0};
-       k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    SubscriptValue at[maxRank];
-    descriptor.GetLowerBounds(at);
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable &&
-        comp.category() == TypeCategory::Derived) {
+  finalizableParentType_ = derived_.GetParentType();
+  if (finalizableParentType_) {
+    if (finalizableParentType_->noFinalizationNeeded()) {
+      finalizableParentType_ = nullptr;
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable &&
+        component_->category() == TypeCategory::Derived) {
       // Component may be polymorphic or unlimited polymorphic. Need to use the
       // dynamic type to check whether finalization is needed.
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        const Descriptor &compDesc{
-            *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-        if (compDesc.IsAllocated()) {
-          if (const DescriptorAddendum * addendum{compDesc.Addendum()}) {
-            if (const typeInfo::DerivedType *
-                compDynamicType{addendum->derivedType()}) {
-              if (!compDynamicType->noFinalizationNeeded()) {
-                Finalize(compDesc, *compDynamicType, terminator);
+      const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      Advance();
+      if (compDesc.IsAllocated()) {
+        if (const DescriptorAddendum *addendum{compDesc.Addendum()}) {
+          if (const typeInfo::DerivedType *compDynamicType{
+                  addendum->derivedType()}) {
+            if (!compDynamicType->noFinalizationNeeded()) {
+              if (int status{
+                      workQueue.BeginFinalize(compDesc, *compDynamicType)};
+                  status != StatOk) {
+                return status;
               }
             }
           }
         }
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      if (const typeInfo::DerivedType * compType{comp.derivedType()}) {
-        if (!compType->noFinalizationNeeded()) {
-          for (std::size_t j{0}; j++ < elements;
-               descriptor.IncrementSubscripts(at)) {
-            const Descriptor &compDesc{
-                *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-            if (compDesc.IsAllocated()) {
-              Finalize(compDesc, *compType, terminator);
-            }
+    } else if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
+        component_->genre() == typeInfo::Component::Genre::Automatic) {
+      if (const typeInfo::DerivedType *compType{component_->derivedType()};
+          compType && !compType->noFinalizationNeeded()) {
+        const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        Advance();
+        if (compDesc.IsAllocated()) {
+          if (int status{workQueue.BeginFinalize(compDesc, *compType)};
+              status != StatOk) {
+            return status;
           }
         }
+      } else {
+        SkipToNextComponent();
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType() && !comp.derivedType()->noFinalizationNeeded()) {
+    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
+        component_->derivedType() &&
+        !component_->derivedType()->noFinalizationNeeded()) {
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, descriptor);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        Finalize(compDesc, compType, terminator);
+      GetComponentExtents(extents, *component_, instance_);
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      const typeInfo::DerivedType &compType{*component_->derivedType()};
+      compDesc.Establish(compType,
+          instance_.ElementComponent<char>(subscripts_, component_->offset()),
+          component_->rank(), extents);
+      Advance();
+      if (int status{workQueue.BeginFinalize(compDesc, compType)};
+          status != StatOk) {
+        return status;
       }
+    } else {
+      SkipToNextComponent();
     }
   }
-  if (recurse) {
-    StaticDescriptor<maxRank, true, 8 /*?*/> statDesc;
-    Descriptor &tmpDesc{statDesc.descriptor()};
-    tmpDesc = descriptor;
+  // Last, do the parent component, if any and finalizable.
+  if (finalizableParentType_) {
+    Descriptor &tmpDesc{componentDescriptor_.descriptor()};
+    tmpDesc = instance_;
     tmpDesc.raw().attribute = CFI_attribute_pointer;
-    tmpDesc.Addendum()->set_derivedType(parentType);
-    tmpDesc.raw().elem_len = parentType->sizeInBytes();
-    Finalize(tmpDesc, *parentType, terminator);
+    tmpDesc.Addendum()->set_derivedType(finalizableParentType_);
+    tmpDesc.raw().elem_len = finalizableParentType_->sizeInBytes();
+    const auto &parentType{*finalizableParentType_};
+    finalizableParentType_ = nullptr;
+    // Don't return StatOk here if the nested FInalize is still running;
+    // it needs this->componentDescriptor_.
+    return workQueue.BeginFinalize(tmpDesc, parentType);
   }
+  return StatOk;
 }
 
 // The order of finalization follows Fortran 2018 7.5.6.2, with
@@ -373,51 +392,71 @@ RT_API_ATTRS void Finalize(const Descriptor &descriptor,
 // preceding any deallocation.
 RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize,
     const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) {
-    return;
+  if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) {
+    Terminator stubTerminator{"Destroy() in Fortran runtime", 0};
+    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
+    if (workQueue.BeginDestroy(descriptor, derived, finalize) == StatContinue) {
+      workQueue.Run();
+    }
   }
-  if (finalize && !derived.noFinalizationNeeded()) {
-    Finalize(descriptor, derived, terminator);
+}
+
+RT_API_ATTRS int DestroyTicket::Begin(WorkQueue &workQueue) {
+  if (finalize_ && !derived_.noFinalizationNeeded()) {
+    if (int status{workQueue.BeginFinalize(instance_, derived_)};
+        status != StatOk && status != StatContinue) {
+      return status;
+    }
   }
+  return StatContinue;
+}
+
+RT_API_ATTRS int DestroyTicket::Continue(WorkQueue &workQueue) {
   // Deallocate all direct and indirect allocatable and automatic components.
   // Contrary to finalization, the order of deallocation does not matter.
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t myComponents{componentDesc.Elements()};
-  std::size_t elements{descriptor.Elements()};
-  SubscriptValue at[maxRank];
-  descriptor.GetLowerBounds(at);
-  for (std::size_t k{0}; k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    const bool destroyComp{
-        comp.derivedType() && !comp.derivedType()->noDestructionNeeded()};
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      for (std::size_t j{0}; j < elements; ++j) {
-        Descriptor *d{
-            descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-        if (destroyComp) {
-          Destroy(*d, /*finalize=*/false, *comp.derivedType(), terminator);
+  while (!IsComplete()) {
+    const auto *componentDerived{component_->derivedType()};
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
+        component_->genre() == typeInfo::Component::Genre::Automatic) {
+      Descriptor *d{instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      if (d->IsAllocated()) {
+        if (phase_ == 0) {
+          ++phase_;
+          if (componentDerived && !componentDerived->noDestructionNeeded()) {
+            if (int status{workQueue.BeginDestroy(
+                    *d, *componentDerived, /*finalize=*/false)};
+                status != StatOk) {
+              return status;
+            }
+          }
         }
         d->Deallocate();
-        descriptor.IncrementSubscripts(at);
       }
-    } else if (destroyComp &&
-        comp.genre() == typeInfo::Component::Genre::Data) {
-      SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, descriptor);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
+      Advance();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
+      if (!componentDerived || componentDerived->noDestructionNeeded()) {
+        SkipToNextComponent();
+      } else {
+        SubscriptValue extents[maxRank];
+        GetComponentExtents(extents, *component_, instance_);
+        Descriptor &compDesc{componentDescriptor_.descriptor()};
+        const typeInfo::DerivedType &compType{*componentDerived};
         compDesc.Establish(compType,
-            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        Destroy(compDesc, /*finalize=*/false, *comp.derivedType(), terminator);
+            instance_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
+        Advance();
+        if (int status{workQueue.BeginDestroy(
+                compDesc, *componentDerived, /*finalize=*/false)};
+            status != StatOk) {
+          return status;
+        }
       }
+    } else {
+      SkipToNextComponent();
     }
   }
+  return StatOk;
 }
 
 RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) {
diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp
index 3db1455af52fe..364724b89ba0d 100644
--- a/flang-rt/lib/runtime/descriptor-io.cpp
+++ b/flang-rt/lib/runtime/descriptor-io.cpp
@@ -7,15 +7,44 @@
 //===----------------------------------------------------------------------===//
 
 #include "descriptor-io.h"
+#include "edit-input.h"
+#include "edit-output.h"
+#include "unit.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/namelist.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
+#include "flang/Common/optional.h"
 #include "flang/Common/restorer.h"
+#include "flang/Common/uint128.h"
+#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/freestanding-tools.h"
 
+// Implementation of I/O data list item transfers based on descriptors.
+// (All I/O items come through here so that the code is exercised for test;
+// some scalar I/O data transfer APIs could be changed to bypass their use
+// of descriptors in the future for better efficiency.)
+
 namespace Fortran::runtime::io::descr {
 RT_OFFLOAD_API_GROUP_BEGIN
 
+template <typename A>
+inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
+    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
+  A *p{descriptor.Element<A>(subscripts)};
+  if (!p) {
+    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
+                                 "address or subscripts out of range");
+  }
+  return *p;
+}
+
 // Defined formatted I/O (maybe)
-Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+static RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
+    IoStatementState &io, const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special,
     const SubscriptValue subscripts[]) {
   Fortran::common::optional<DataEdit> peek{
@@ -104,8 +133,8 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
 }
 
 // Defined unformatted I/O
-bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived,
+static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io,
+    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special) {
   // Unformatted I/O must have an external unit (or child thereof).
   IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -152,5 +181,619 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
   return handler.GetIoStat() == IostatOk;
 }
 
+// Per-category descriptor-based I/O templates
+
+// TODO (perhaps as a nontrivial but small starter project): implement
+// automatic repetition counts, like "10*3.14159", for list-directed and
+// NAMELIST array output.
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
+    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditIntegerInput(
+                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedIntegerIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedRealIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedRealIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedComplexIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  bool isListOutput{
+      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
+    if (isListOutput) {
+      DataEdit rEdit, iEdit;
+      rEdit.descriptor = DataEdit::ListDirectedRealPart;
+      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
+      rEdit.modes = iEdit.modes = io.mutableModes();
+      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
+          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
+        return false;
+      }
+    } else {
+      for (int k{0}; k < 2; ++k, ++x) {
+        auto edit{io.GetNextDataEdit()};
+        if (!edit) {
+          return false;
+        } else if constexpr (DIR == Direction::Output) {
+          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
+            return false;
+          }
+        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
+          break;
+        } else if (EditRealInput<KIND>(
+                       io, *edit, reinterpret_cast<void *>(x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedComplexIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <typename A, Direction DIR>
+inline RT_API_ATTRS bool FormattedCharacterIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditCharacterOutput(io, *edit, x, length)) {
+          return false;
+        }
+      } else { // input
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          if (EditCharacterInput(io, *edit, x, length)) {
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedCharacterIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedLogicalIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditLogicalOutput(io, *edit, x != 0)) {
+          return false;
+        }
+      } else {
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          bool truth{};
+          if (EditLogicalInput(io, *edit, truth)) {
+            x = truth;
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedLogicalIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <Direction DIR>
+RT_API_ATTRS int DerivedIoTicket<DIR>::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Data) {
+      // Create a descriptor for the component
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      component_->CreatePointerDescriptor(
+          compDesc, instance_, io_.GetIoErrorHandler(), subscripts_);
+      Advance();
+      if (int status{workQueue.BeginDescriptorIo<DIR>(
+              io_, compDesc, table_, anyIoTookPlace_)};
+          status != StatOk) {
+        return status;
+      }
+    } else {
+      // Component is itself a descriptor
+      char *pointer{
+          instance_.Element<char>(subscripts_) + component_->offset()};
+      const Descriptor &compDesc{
+          *reinterpret_cast<const Descriptor *>(pointer)};
+      Advance();
+      if (compDesc.IsAllocated()) {
+        if (int status{workQueue.BeginDescriptorIo<DIR>(
+                io_, compDesc, table_, anyIoTookPlace_)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    }
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DerivedIoTicket<Direction::Output>::Continue(
+    WorkQueue &);
+template RT_API_ATTRS int DerivedIoTicket<Direction::Input>::Continue(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS int DescriptorIoTicket<DIR>::Begin(WorkQueue &workQueue) {
+  IoErrorHandler &handler{io_.GetIoErrorHandler()};
+  if (handler.InError()) {
+    return handler.GetIoStat();
+  }
+  if (!io_.get_if<IoDirectionState<DIR>>()) {
+    handler.Crash("DescriptorIO() called for wrong I/O direction");
+    return handler.GetIoStat();
+  }
+  if constexpr (DIR == Direction::Input) {
+    if (!io_.BeginReadingRecord()) {
+      return StatOk;
+    }
+  }
+  if (!io_.get_if<FormattedIoStatementState<DIR>>()) {
+    // Unformatted I/O
+    IoErrorHandler &handler{io_.GetIoErrorHandler()};
+    const DescriptorAddendum *addendum{instance_.Addendum()};
+    if (const typeInfo::DerivedType *type{
+            addendum ? addendum->derivedType() : nullptr}) {
+      // derived type unformatted I/O
+      if (table_) {
+        if (const auto *definedIo{table_->Find(*type,
+                DIR == Direction::Input
+                    ? common::DefinedIo::ReadUnformatted
+                    : common::DefinedIo::WriteUnformatted)}) {
+          if (definedIo->subroutine) {
+            typeInfo::SpecialBinding special{DIR == Direction::Input
+                    ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                    : typeInfo::SpecialBinding::Which::WriteUnformatted,
+                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+                false};
+            if (DefinedUnformattedIo(io_, instance_, *type, special)) {
+              anyIoTookPlace_ = true;
+              return StatOk;
+            }
+          } else {
+            int status{workQueue.BeginDerivedIo<DIR>(
+                io_, instance_, *type, table_, anyIoTookPlace_)};
+            return status == StatContinue ? StatOk : status; // done here
+          }
+        }
+      }
+      if (const typeInfo::SpecialBinding *special{
+              type->FindSpecialBinding(DIR == Direction::Input
+                      ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                      : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
+        if (!table_ || !table_->ignoreNonTbpEntries || special->isTypeBound()) {
+          // defined derived type unformatted I/O
+          if (DefinedUnformattedIo(io_, instance_, *type, *special)) {
+            anyIoTookPlace_ = true;
+            return StatOk;
+          } else {
+            return IostatEnd;
+          }
+        }
+      }
+      // Default derived type unformatted I/O
+      // TODO: If no component at any level has defined READ or WRITE
+      // (as appropriate), the elements are contiguous, and no byte swapping
+      // is active, do a block transfer via the code below.
+      int status{workQueue.BeginDerivedIo<DIR>(
+          io_, instance_, *type, table_, anyIoTookPlace_)};
+      return status == StatContinue ? StatOk : status; // done here
+    } else {
+      // intrinsic type unformatted I/O
+      auto *externalUnf{io_.get_if<ExternalUnformattedIoStatementState<DIR>>()};
+      ChildUnformattedIoStatementState<DIR> *childUnf{nullptr};
+      InquireIOLengthState *inq{nullptr};
+      bool swapEndianness{false};
+      if (externalUnf) {
+        swapEndianness = externalUnf->unit().swapEndianness();
+      } else {
+        childUnf = io_.get_if<ChildUnformattedIoStatementState<DIR>>();
+        if (!childUnf) {
+          inq = DIR == Direction::Output ? io_.get_if<InquireIOLengthState>()
+                                         : nullptr;
+          RUNTIME_CHECK(handler, inq != nullptr);
+        }
+      }
+      std::size_t elementBytes{instance_.ElementBytes()};
+      std::size_t swappingBytes{elementBytes};
+      if (auto maybeCatAndKind{instance_.type().GetCategoryAndKind()}) {
+        // Byte swapping units can be smaller than elements, namely
+        // for COMPLEX and CHARACTER.
+        if (maybeCatAndKind->first == TypeCategory::Character) {
+          // swap each character position independently
+          swappingBytes = maybeCatAndKind->second; // kind
+        } else if (maybeCatAndKind->first == TypeCategory::Complex) {
+          // swap real and imaginary components independently
+          swappingBytes /= 2;
+        }
+      }
+      using CharType =
+          std::conditional_t<DIR == Direction::Output, const char, char>;
+      auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
+        if constexpr (DIR == Direction::Output) {
+          return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
+              : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
+                             : inq->Emit(&x, totalBytes, swappingBytes);
+        } else {
+          return externalUnf
+              ? externalUnf->Receive(&x, totalBytes, swappingBytes)
+              : childUnf->Receive(&x, totalBytes, swappingBytes);
+        }
+      }};
+      if (!swapEndianness &&
+          instance_.IsContiguous()) { // contiguous unformatted I/O
+        char &x{ExtractElement<char>(io_, instance_, subscripts_)};
+        if (Transfer(x, elements_ * elementBytes)) {
+          anyIoTookPlace_ = true;
+        } else {
+          return IostatEnd;
+        }
+      } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
+        for (; !IsComplete(); Advance()) {
+          char &x{ExtractElement<char>(io_, instance_, subscripts_)};
+          if (Transfer(x, elementBytes)) {
+            anyIoTookPlace_ = true;
+          } else {
+            return IostatEnd;
+          }
+        }
+      }
+    }
+    // Unformatted I/O never needs to call Continue().
+    return StatOk;
+  }
+  // Formatted I/O
+  if (auto catAndKind{instance_.type().GetCategoryAndKind()}) {
+    TypeCategory cat{catAndKind->first};
+    int kind{catAndKind->second};
+    bool any{false};
+    switch (cat) {
+    case TypeCategory::Integer:
+      switch (kind) {
+      case 1:
+        any = FormattedIntegerIO<1, DIR>(io_, instance_, true);
+        break;
+      case 2:
+        any = FormattedIntegerIO<2, DIR>(io_, instance_, true);
+        break;
+      case 4:
+        any = FormattedIntegerIO<4, DIR>(io_, instance_, true);
+        break;
+      case 8:
+        any = FormattedIntegerIO<8, DIR>(io_, instance_, true);
+        break;
+      case 16:
+        any = FormattedIntegerIO<16, DIR>(io_, instance_, true);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Unsigned:
+      switch (kind) {
+      case 1:
+        any = FormattedIntegerIO<1, DIR>(io_, instance_, false);
+        break;
+      case 2:
+        any = FormattedIntegerIO<2, DIR>(io_, instance_, false);
+        break;
+      case 4:
+        any = FormattedIntegerIO<4, DIR>(io_, instance_, false);
+        break;
+      case 8:
+        any = FormattedIntegerIO<8, DIR>(io_, instance_, false);
+        break;
+      case 16:
+        any = FormattedIntegerIO<16, DIR>(io_, instance_, false);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Real:
+      switch (kind) {
+      case 2:
+        any = FormattedRealIO<2, DIR>(io_, instance_);
+        break;
+      case 3:
+        any = FormattedRealIO<3, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedRealIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedRealIO<8, DIR>(io_, instance_);
+        break;
+      case 10:
+        any = FormattedRealIO<10, DIR>(io_, instance_);
+        break;
+      // TODO: case double/double
+      case 16:
+        any = FormattedRealIO<16, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Complex:
+      switch (kind) {
+      case 2:
+        any = FormattedComplexIO<2, DIR>(io_, instance_);
+        break;
+      case 3:
+        any = FormattedComplexIO<3, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedComplexIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedComplexIO<8, DIR>(io_, instance_);
+        break;
+      case 10:
+        any = FormattedComplexIO<10, DIR>(io_, instance_);
+        break;
+      // TODO: case double/double
+      case 16:
+        any = FormattedComplexIO<16, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Character:
+      switch (kind) {
+      case 1:
+        any = FormattedCharacterIO<char, DIR>(io_, instance_);
+        break;
+      case 2:
+        any = FormattedCharacterIO<char16_t, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedCharacterIO<char32_t, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Logical:
+      switch (kind) {
+      case 1:
+        any = FormattedLogicalIO<1, DIR>(io_, instance_);
+        break;
+      case 2:
+        any = FormattedLogicalIO<2, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedLogicalIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedLogicalIO<8, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Derived: {
+      // Derived type information must be present for formatted I/O.
+      IoErrorHandler &handler{io_.GetIoErrorHandler()};
+      const DescriptorAddendum *addendum{instance_.Addendum()};
+      RUNTIME_CHECK(handler, addendum != nullptr);
+      derived_ = addendum->derivedType();
+      RUNTIME_CHECK(handler, derived_ != nullptr);
+      if (table_) {
+        if (const auto *definedIo{table_->Find(*derived_,
+                DIR == Direction::Input ? common::DefinedIo::ReadFormatted
+                                        : common::DefinedIo::WriteFormatted)}) {
+          if (definedIo->subroutine) {
+            nonTbpSpecial_.emplace(DIR == Direction::Input
+                    ? typeInfo::SpecialBinding::Which::ReadFormatted
+                    : typeInfo::SpecialBinding::Which::WriteFormatted,
+                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+                false);
+            special_ = &*nonTbpSpecial_;
+          }
+        }
+      }
+      if (!special_) {
+        if (const typeInfo::SpecialBinding *binding{
+                derived_->FindSpecialBinding(DIR == Direction::Input
+                        ? typeInfo::SpecialBinding::Which::ReadFormatted
+                        : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
+          if (!table_ || !table_->ignoreNonTbpEntries ||
+              binding->isTypeBound()) {
+            special_ = binding;
+          }
+        }
+      }
+      return StatContinue;
+    }
+    }
+    if (any) {
+      anyIoTookPlace_ = true;
+    } else {
+      return IostatEnd;
+    }
+  } else {
+    handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
+        static_cast<int>(instance_.type().raw()));
+    return handler.GetIoStat();
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Begin(
+    WorkQueue &);
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Begin(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS int DescriptorIoTicket<DIR>::Continue(WorkQueue &workQueue) {
+  // Only derived type formatted I/O gets here.
+  while (!IsComplete()) {
+    if (special_) {
+      if (auto defined{DefinedFormattedIo(
+              io_, instance_, *derived_, *special_, subscripts_)}) {
+        anyIoTookPlace_ |= *defined;
+        Advance();
+        continue;
+      }
+    }
+    Descriptor &elementDesc{elementDescriptor_.descriptor()};
+    elementDesc.Establish(
+        *derived_, nullptr, 0, nullptr, CFI_attribute_pointer);
+    elementDesc.set_base_addr(instance_.Element<char>(subscripts_));
+    Advance();
+    if (int status{workQueue.BeginDerivedIo<DIR>(
+            io_, elementDesc, *derived_, table_, anyIoTookPlace_)};
+        status != StatOk) {
+      return status;
+    }
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Continue(
+    WorkQueue &);
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Continue(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
+    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
+  bool anyIoTookPlace{false};
+  WorkQueue workQueue{io.GetIoErrorHandler()};
+  if (workQueue.BeginDescriptorIo<DIR>(io, descriptor, table, anyIoTookPlace) ==
+      StatContinue) {
+    workQueue.Run();
+  }
+  return anyIoTookPlace;
+}
+
+template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+
 RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io::descr
diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h
index eb60f106c9203..88ad59bd24b53 100644
--- a/flang-rt/lib/runtime/descriptor-io.h
+++ b/flang-rt/lib/runtime/descriptor-io.h
@@ -9,619 +9,27 @@
 #ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 #define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 
-// Implementation of I/O data list item transfers based on descriptors.
-// (All I/O items come through here so that the code is exercised for test;
-// some scalar I/O data transfer APIs could be changed to bypass their use
-// of descriptors in the future for better efficiency.)
+#include "flang-rt/runtime/connection.h"
 
-#include "edit-input.h"
-#include "edit-output.h"
-#include "unit.h"
-#include "flang-rt/runtime/descriptor.h"
-#include "flang-rt/runtime/io-stmt.h"
-#include "flang-rt/runtime/namelist.h"
-#include "flang-rt/runtime/terminator.h"
-#include "flang-rt/runtime/type-info.h"
-#include "flang/Common/optional.h"
-#include "flang/Common/uint128.h"
-#include "flang/Runtime/cpp-type.h"
+namespace Fortran::runtime {
+class Descriptor;
+} // namespace Fortran::runtime
 
-namespace Fortran::runtime::io::descr {
-template <typename A>
-inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
-    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
-  A *p{descriptor.Element<A>(subscripts)};
-  if (!p) {
-    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
-                                 "address or subscripts out of range");
-  }
-  return *p;
-}
-
-// Per-category descriptor-based I/O templates
-
-// TODO (perhaps as a nontrivial but small starter project): implement
-// automatic repetition counts, like "10*3.14159", for list-directed and
-// NAMELIST array output.
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
-    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditIntegerInput(
-                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedIntegerIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedRealIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedRealIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
+namespace Fortran::runtime::io {
+class IoStatementState;
+struct NonTbpDefinedIoTable;
+} // namespace Fortran::runtime::io
 
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedComplexIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  bool isListOutput{
-      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
-    if (isListOutput) {
-      DataEdit rEdit, iEdit;
-      rEdit.descriptor = DataEdit::ListDirectedRealPart;
-      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
-      rEdit.modes = iEdit.modes = io.mutableModes();
-      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
-          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
-        return false;
-      }
-    } else {
-      for (int k{0}; k < 2; ++k, ++x) {
-        auto edit{io.GetNextDataEdit()};
-        if (!edit) {
-          return false;
-        } else if constexpr (DIR == Direction::Output) {
-          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
-            return false;
-          }
-        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
-          break;
-        } else if (EditRealInput<KIND>(
-                       io, *edit, reinterpret_cast<void *>(x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedComplexIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <typename A, Direction DIR>
-inline RT_API_ATTRS bool FormattedCharacterIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditCharacterOutput(io, *edit, x, length)) {
-          return false;
-        }
-      } else { // input
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          if (EditCharacterInput(io, *edit, x, length)) {
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedCharacterIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedLogicalIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditLogicalOutput(io, *edit, x != 0)) {
-          return false;
-        }
-      } else {
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          bool truth{};
-          if (EditLogicalInput(io, *edit, truth)) {
-            x = truth;
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedLogicalIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
+namespace Fortran::runtime::io::descr {
 
 template <Direction DIR>
-static RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
+RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
     const NonTbpDefinedIoTable * = nullptr);
 
-// For intrinsic (not defined) derived type I/O, formatted & unformatted
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io,
-    const typeInfo::Component &component, const Descriptor &origDescriptor,
-    const SubscriptValue origSubscripts[], Terminator &terminator,
-    const NonTbpDefinedIoTable *table) {
-#if !defined(RT_DEVICE_AVOID_RECURSION)
-  if (component.genre() == typeInfo::Component::Genre::Data) {
-    // Create a descriptor for the component
-    StaticDescriptor<maxRank, true, 16 /*?*/> statDesc;
-    Descriptor &desc{statDesc.descriptor()};
-    component.CreatePointerDescriptor(
-        desc, origDescriptor, terminator, origSubscripts);
-    return DescriptorIO<DIR>(io, desc, table);
-  } else {
-    // Component is itself a descriptor
-    char *pointer{
-        origDescriptor.Element<char>(origSubscripts) + component.offset()};
-    const Descriptor &compDesc{*reinterpret_cast<const Descriptor *>(pointer)};
-    return compDesc.IsAllocated() && DescriptorIO<DIR>(io, compDesc, table);
-  }
-#else
-  terminator.Crash("not yet implemented: component IO");
-#endif
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &type,
-    const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const Descriptor &compArray{type.component()};
-  RUNTIME_CHECK(handler, compArray.rank() == 1);
-  std::size_t numComponents{compArray.Elements()};
-  SubscriptValue at[maxRank];
-  compArray.GetLowerBounds(at);
-  for (std::size_t k{0}; k < numComponents;
-       ++k, compArray.IncrementSubscripts(at)) {
-    const typeInfo::Component &component{
-        *compArray.Element<typeInfo::Component>(at)};
-    if (!DefaultComponentIO<DIR>(
-            io, component, descriptor, subscripts, handler, table)) {
-      // Return true for NAMELIST input if any component appeared.
-      auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()};
-      return DIR == Direction::Input && k > 0 && listInput &&
-          listInput->inNamelistSequence();
-    }
-  }
-  return true;
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &type,
-    const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const Descriptor &compArray{type.component()};
-  RUNTIME_CHECK(handler, compArray.rank() == 1);
-  std::size_t numComponents{compArray.Elements()};
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  for (std::size_t j{0}; j < numElements;
-       ++j, descriptor.IncrementSubscripts(subscripts)) {
-    SubscriptValue at[maxRank];
-    compArray.GetLowerBounds(at);
-    for (std::size_t k{0}; k < numComponents;
-         ++k, compArray.IncrementSubscripts(at)) {
-      const typeInfo::Component &component{
-          *compArray.Element<typeInfo::Component>(at)};
-      if (!DefaultComponentIO<DIR>(
-              io, component, descriptor, subscripts, handler, table)) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
-    IoStatementState &, const Descriptor &, const typeInfo::DerivedType &,
-    const typeInfo::SpecialBinding &, const SubscriptValue[]);
-
-template <Direction DIR>
-static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  // Derived type information must be present for formatted I/O.
-  const DescriptorAddendum *addendum{descriptor.Addendum()};
-  RUNTIME_CHECK(handler, addendum != nullptr);
-  const typeInfo::DerivedType *type{addendum->derivedType()};
-  RUNTIME_CHECK(handler, type != nullptr);
-  Fortran::common::optional<typeInfo::SpecialBinding> nonTbpSpecial;
-  const typeInfo::SpecialBinding *special{nullptr};
-  if (table) {
-    if (const auto *definedIo{table->Find(*type,
-            DIR == Direction::Input ? common::DefinedIo::ReadFormatted
-                                    : common::DefinedIo::WriteFormatted)}) {
-      if (definedIo->subroutine) {
-        nonTbpSpecial.emplace(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadFormatted
-                : typeInfo::SpecialBinding::Which::WriteFormatted,
-            definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-            false);
-        special = &*nonTbpSpecial;
-      }
-    }
-  }
-  if (!special) {
-    if (const typeInfo::SpecialBinding *
-        binding{type->FindSpecialBinding(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadFormatted
-                : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
-      if (!table || !table->ignoreNonTbpEntries || binding->isTypeBound()) {
-        special = binding;
-      }
-    }
-  }
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t numElements{descriptor.Elements()};
-  for (std::size_t j{0}; j < numElements;
-       ++j, descriptor.IncrementSubscripts(subscripts)) {
-    Fortran::common::optional<bool> result;
-    if (special) {
-      result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts);
-    }
-    if (!result) {
-      result = DefaultComponentwiseFormattedIO<DIR>(
-          io, descriptor, *type, table, subscripts);
-    }
-    if (!result.value()) {
-      // Return true for NAMELIST input if we got anything.
-      auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()};
-      return DIR == Direction::Input && j > 0 && listInput &&
-          listInput->inNamelistSequence();
-    }
-  }
-  return true;
-}
-
-RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &,
-    const typeInfo::DerivedType &, const typeInfo::SpecialBinding &);
+extern template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+extern template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
 
-// Unformatted I/O
-template <Direction DIR>
-static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const DescriptorAddendum *addendum{descriptor.Addendum()};
-  if (const typeInfo::DerivedType *
-      type{addendum ? addendum->derivedType() : nullptr}) {
-    // derived type unformatted I/O
-    if (table) {
-      if (const auto *definedIo{table->Find(*type,
-              DIR == Direction::Input ? common::DefinedIo::ReadUnformatted
-                                      : common::DefinedIo::WriteUnformatted)}) {
-        if (definedIo->subroutine) {
-          typeInfo::SpecialBinding special{DIR == Direction::Input
-                  ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                  : typeInfo::SpecialBinding::Which::WriteUnformatted,
-              definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-              false};
-          if (Fortran::common::optional<bool> wasDefined{
-                  DefinedUnformattedIo(io, descriptor, *type, special)}) {
-            return *wasDefined;
-          }
-        } else {
-          return DefaultComponentwiseUnformattedIO<DIR>(
-              io, descriptor, *type, table);
-        }
-      }
-    }
-    if (const typeInfo::SpecialBinding *
-        special{type->FindSpecialBinding(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
-      if (!table || !table->ignoreNonTbpEntries || special->isTypeBound()) {
-        // defined derived type unformatted I/O
-        return DefinedUnformattedIo(io, descriptor, *type, *special);
-      }
-    }
-    // Default derived type unformatted I/O
-    // TODO: If no component at any level has defined READ or WRITE
-    // (as appropriate), the elements are contiguous, and no byte swapping
-    // is active, do a block transfer via the code below.
-    return DefaultComponentwiseUnformattedIO<DIR>(io, descriptor, *type, table);
-  } else {
-    // intrinsic type unformatted I/O
-    auto *externalUnf{io.get_if<ExternalUnformattedIoStatementState<DIR>>()};
-    auto *childUnf{io.get_if<ChildUnformattedIoStatementState<DIR>>()};
-    auto *inq{
-        DIR == Direction::Output ? io.get_if<InquireIOLengthState>() : nullptr};
-    RUNTIME_CHECK(handler, externalUnf || childUnf || inq);
-    std::size_t elementBytes{descriptor.ElementBytes()};
-    std::size_t numElements{descriptor.Elements()};
-    std::size_t swappingBytes{elementBytes};
-    if (auto maybeCatAndKind{descriptor.type().GetCategoryAndKind()}) {
-      // Byte swapping units can be smaller than elements, namely
-      // for COMPLEX and CHARACTER.
-      if (maybeCatAndKind->first == TypeCategory::Character) {
-        // swap each character position independently
-        swappingBytes = maybeCatAndKind->second; // kind
-      } else if (maybeCatAndKind->first == TypeCategory::Complex) {
-        // swap real and imaginary components independently
-        swappingBytes /= 2;
-      }
-    }
-    SubscriptValue subscripts[maxRank];
-    descriptor.GetLowerBounds(subscripts);
-    using CharType =
-        std::conditional_t<DIR == Direction::Output, const char, char>;
-    auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
-      if constexpr (DIR == Direction::Output) {
-        return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
-            : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
-                           : inq->Emit(&x, totalBytes, swappingBytes);
-      } else {
-        return externalUnf ? externalUnf->Receive(&x, totalBytes, swappingBytes)
-                           : childUnf->Receive(&x, totalBytes, swappingBytes);
-      }
-    }};
-    bool swapEndianness{externalUnf && externalUnf->unit().swapEndianness()};
-    if (!swapEndianness &&
-        descriptor.IsContiguous()) { // contiguous unformatted I/O
-      char &x{ExtractElement<char>(io, descriptor, subscripts)};
-      return Transfer(x, numElements * elementBytes);
-    } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
-      for (std::size_t j{0}; j < numElements; ++j) {
-        char &x{ExtractElement<char>(io, descriptor, subscripts)};
-        if (!Transfer(x, elementBytes)) {
-          return false;
-        }
-        if (!descriptor.IncrementSubscripts(subscripts) &&
-            j + 1 < numElements) {
-          handler.Crash("DescriptorIO: subscripts out of bounds");
-        }
-      }
-      return true;
-    }
-  }
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  if (handler.InError()) {
-    return false;
-  }
-  if (!io.get_if<IoDirectionState<DIR>>()) {
-    handler.Crash("DescriptorIO() called for wrong I/O direction");
-    return false;
-  }
-  if constexpr (DIR == Direction::Input) {
-    if (!io.BeginReadingRecord()) {
-      return false;
-    }
-  }
-  if (!io.get_if<FormattedIoStatementState<DIR>>()) {
-    return UnformattedDescriptorIO<DIR>(io, descriptor, table);
-  }
-  if (auto catAndKind{descriptor.type().GetCategoryAndKind()}) {
-    TypeCategory cat{catAndKind->first};
-    int kind{catAndKind->second};
-    switch (cat) {
-    case TypeCategory::Integer:
-      switch (kind) {
-      case 1:
-        return FormattedIntegerIO<1, DIR>(io, descriptor, true);
-      case 2:
-        return FormattedIntegerIO<2, DIR>(io, descriptor, true);
-      case 4:
-        return FormattedIntegerIO<4, DIR>(io, descriptor, true);
-      case 8:
-        return FormattedIntegerIO<8, DIR>(io, descriptor, true);
-      case 16:
-        return FormattedIntegerIO<16, DIR>(io, descriptor, true);
-      default:
-        handler.Crash(
-            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Unsigned:
-      switch (kind) {
-      case 1:
-        return FormattedIntegerIO<1, DIR>(io, descriptor, false);
-      case 2:
-        return FormattedIntegerIO<2, DIR>(io, descriptor, false);
-      case 4:
-        return FormattedIntegerIO<4, DIR>(io, descriptor, false);
-      case 8:
-        return FormattedIntegerIO<8, DIR>(io, descriptor, false);
-      case 16:
-        return FormattedIntegerIO<16, DIR>(io, descriptor, false);
-      default:
-        handler.Crash(
-            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Real:
-      switch (kind) {
-      case 2:
-        return FormattedRealIO<2, DIR>(io, descriptor);
-      case 3:
-        return FormattedRealIO<3, DIR>(io, descriptor);
-      case 4:
-        return FormattedRealIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedRealIO<8, DIR>(io, descriptor);
-      case 10:
-        return FormattedRealIO<10, DIR>(io, descriptor);
-      // TODO: case double/double
-      case 16:
-        return FormattedRealIO<16, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Complex:
-      switch (kind) {
-      case 2:
-        return FormattedComplexIO<2, DIR>(io, descriptor);
-      case 3:
-        return FormattedComplexIO<3, DIR>(io, descriptor);
-      case 4:
-        return FormattedComplexIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedComplexIO<8, DIR>(io, descriptor);
-      case 10:
-        return FormattedComplexIO<10, DIR>(io, descriptor);
-      // TODO: case double/double
-      case 16:
-        return FormattedComplexIO<16, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Character:
-      switch (kind) {
-      case 1:
-        return FormattedCharacterIO<char, DIR>(io, descriptor);
-      case 2:
-        return FormattedCharacterIO<char16_t, DIR>(io, descriptor);
-      case 4:
-        return FormattedCharacterIO<char32_t, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Logical:
-      switch (kind) {
-      case 1:
-        return FormattedLogicalIO<1, DIR>(io, descriptor);
-      case 2:
-        return FormattedLogicalIO<2, DIR>(io, descriptor);
-      case 4:
-        return FormattedLogicalIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedLogicalIO<8, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Derived:
-      return FormattedDerivedTypeIO<DIR>(io, descriptor, table);
-    }
-  }
-  handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
-      static_cast<int>(descriptor.type().raw()));
-  return false;
-}
 } // namespace Fortran::runtime::io::descr
 #endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index 1d5304254ed0e..0f0564403c0e2 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -143,6 +143,10 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
+  if (auto *x{std::getenv("FLANG_RT_DEBUG")}) {
+    internalDebugging = std::strtol(x, nullptr, 10);
+  }
+
   if (auto *x{std::getenv("ACC_OFFLOAD_STACK_SIZE")}) {
     char *end;
     auto n{std::strtoul(x, &end, 10)};
diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp
index b0cf2180fc6d4..1bef387a9771f 100644
--- a/flang-rt/lib/runtime/namelist.cpp
+++ b/flang-rt/lib/runtime/namelist.cpp
@@ -10,6 +10,7 @@
 #include "descriptor-io.h"
 #include "flang-rt/runtime/emit-encoded.h"
 #include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Runtime/io-api.h"
 #include <algorithm>
 #include <cstring>
diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp
index b08195cd31e05..24d05f369fcbe 100644
--- a/flang-rt/lib/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@@ -205,7 +205,7 @@ RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
 // Doing the recursion upwards instead of downwards puts the more common
 // cases earlier in the if-chain and has a tangible impact on performance.
 template <typename P, int RANK> struct ShallowCopyRankSpecialize {
-  static bool execute(const Descriptor &to, const Descriptor &from,
+  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     if (to.rank() == RANK && from.rank() == RANK) {
       ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
@@ -217,7 +217,7 @@ template <typename P, int RANK> struct ShallowCopyRankSpecialize {
 };
 
 template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
-  static bool execute(const Descriptor &to, const Descriptor &from,
+  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     return false;
   }
diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp
index 82182696d70c6..451213202acef 100644
--- a/flang-rt/lib/runtime/type-info.cpp
+++ b/flang-rt/lib/runtime/type-info.cpp
@@ -140,11 +140,11 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor,
     const SubscriptValue *subscripts) const {
   RUNTIME_CHECK(terminator, genre_ == Genre::Data);
   EstablishDescriptor(descriptor, container, terminator);
+  std::size_t offset{offset_};
   if (subscripts) {
-    descriptor.set_base_addr(container.Element<char>(subscripts) + offset_);
-  } else {
-    descriptor.set_base_addr(container.OffsetElement<char>() + offset_);
+    offset += container.SubscriptsToByteOffset(subscripts);
   }
+  descriptor.set_base_addr(container.OffsetElement<char>() + offset);
   descriptor.raw().attribute = CFI_attribute_pointer;
 }
 
diff --git a/flang-rt/lib/runtime/work-queue.cpp b/flang-rt/lib/runtime/work-queue.cpp
new file mode 100644
index 0000000000000..a508ecb637102
--- /dev/null
+++ b/flang-rt/lib/runtime/work-queue.cpp
@@ -0,0 +1,161 @@
+//===-- lib/runtime/work-queue.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang-rt/runtime/work-queue.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/visit.h"
+
+namespace Fortran::runtime {
+
+#if !defined(RT_DEVICE_COMPILATION)
+// FLANG_RT_DEBUG code is disabled when false.
+static constexpr bool enableDebugOutput{false};
+#endif
+
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS Componentwise::Componentwise(const typeInfo::DerivedType &derived)
+    : derived_{derived}, components_{derived_.component().Elements()} {
+  GetComponent();
+}
+
+RT_API_ATTRS void Componentwise::GetComponent() {
+  if (IsComplete()) {
+    component_ = nullptr;
+  } else {
+    const Descriptor &componentDesc{derived_.component()};
+    component_ = componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
+        componentAt_);
+  }
+}
+
+RT_API_ATTRS int Ticket::Continue(WorkQueue &workQueue) {
+  if (!begun) {
+    begun = true;
+    return common::visit(
+        [&workQueue](
+            auto &specificTicket) { return specificTicket.Begin(workQueue); },
+        u);
+  } else {
+    return common::visit(
+        [&workQueue](auto &specificTicket) {
+          return specificTicket.Continue(workQueue);
+        },
+        u);
+  }
+}
+
+RT_API_ATTRS WorkQueue::~WorkQueue() {
+  if (last_) {
+    if ((last_->next = firstFree_)) {
+      last_->next->previous = last_;
+    }
+    firstFree_ = first_;
+    first_ = last_ = nullptr;
+  }
+  while (firstFree_) {
+    TicketList *next{firstFree_->next};
+    if (!firstFree_->isStatic) {
+      FreeMemory(firstFree_);
+    }
+    firstFree_ = next;
+  }
+}
+
+RT_API_ATTRS Ticket &WorkQueue::StartTicket() {
+  if (!firstFree_) {
+    void *p{AllocateMemoryOrCrash(terminator_, sizeof(TicketList))};
+    firstFree_ = new (p) TicketList;
+    firstFree_->isStatic = false;
+  }
+  TicketList *newTicket{firstFree_};
+  if ((firstFree_ = newTicket->next)) {
+    firstFree_->previous = nullptr;
+  }
+  TicketList *after{insertAfter_ ? insertAfter_->next : nullptr};
+  if ((newTicket->previous = insertAfter_ ? insertAfter_ : last_)) {
+    newTicket->previous->next = newTicket;
+  } else {
+    first_ = newTicket;
+  }
+  if ((newTicket->next = after)) {
+    after->previous = newTicket;
+  } else {
+    last_ = newTicket;
+  }
+  newTicket->ticket.begun = false;
+#if !defined(RT_DEVICE_COMPILATION)
+  if (enableDebugOutput &&
+      (executionEnvironment.internalDebugging &
+          ExecutionEnvironment::WorkQueue)) {
+    std::fprintf(stderr, "WQ: new ticket\n");
+  }
+#endif
+  return newTicket->ticket;
+}
+
+RT_API_ATTRS int WorkQueue::Run() {
+  while (last_) {
+    TicketList *at{last_};
+    insertAfter_ = last_;
+#if !defined(RT_DEVICE_COMPILATION)
+    if (enableDebugOutput &&
+        (executionEnvironment.internalDebugging &
+            ExecutionEnvironment::WorkQueue)) {
+      std::fprintf(stderr, "WQ: %zd %s\n", at->ticket.u.index(),
+          at->ticket.begun ? "Continue" : "Begin");
+    }
+#endif
+    int stat{at->ticket.Continue(*this)};
+#if !defined(RT_DEVICE_COMPILATION)
+    if (enableDebugOutput &&
+        (executionEnvironment.internalDebugging &
+            ExecutionEnvironment::WorkQueue)) {
+      std::fprintf(stderr, "WQ: ... stat %d\n", stat);
+    }
+#endif
+    insertAfter_ = nullptr;
+    if (stat == StatOk) {
+      if (at->previous) {
+        at->previous->next = at->next;
+      } else {
+        first_ = at->next;
+      }
+      if (at->next) {
+        at->next->previous = at->previous;
+      } else {
+        last_ = at->previous;
+      }
+      if ((at->next = firstFree_)) {
+        at->next->previous = at;
+      }
+      at->previous = nullptr;
+      firstFree_ = at;
+    } else if (stat != StatContinue) {
+      Stop();
+      return stat;
+    }
+  }
+  return StatOk;
+}
+
+RT_API_ATTRS void WorkQueue::Stop() {
+  if (last_) {
+    if ((last_->next = firstFree_)) {
+      last_->next->previous = last_;
+    }
+    firstFree_ = first_;
+    first_ = last_ = nullptr;
+  }
+}
+
+RT_OFFLOAD_API_GROUP_END
+
+} // namespace Fortran::runtime
diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
index 3833e48be3dd6..6c148b1de6f82 100644
--- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp
+++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
@@ -184,7 +184,7 @@ TEST(ExternalIOTests, TestSequentialFixedUnformatted) {
   io = IONAME(BeginInquireIoLength)(__FILE__, __LINE__);
   for (int j{1}; j <= 3; ++j) {
     ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc))
-        << "OutputDescriptor() for InquireIoLength";
+        << "OutputDescriptor() for InquireIoLength " << j;
   }
   ASSERT_EQ(IONAME(GetIoLength)(io), 3 * recl) << "GetIoLength";
   ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 78d871c593e1d..871749934810c 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -858,6 +858,16 @@ print *, [(j,j=1,10)]
   warning since such values may have become defined by the time the nested
   expression's value is required.
 
+* Intrinsic assignment of arrays is defined elementally, and intrinsic
+  assignment of derived type components is defined componentwise.
+  However, when intrinsic assignment takes place for an array of derived
+  type, the order of the loop nesting is not defined.
+  Some compilers will loop over the elements, assigning all of the components
+  of each element before proceeding to the next element.
+  This compiler loops over all of the components, and assigns all of
+  the elements for each component before proceeding to the next component.
+  A program using defined assignment might be able to detect the difference.
+
 ## De Facto Standard Features
 
 * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the
diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h
index bc80997a1bec2..eb1f63184a177 100644
--- a/flang/include/flang/Runtime/assign.h
+++ b/flang/include/flang/Runtime/assign.h
@@ -38,7 +38,7 @@ enum AssignFlags {
   ComponentCanBeDefinedAssignment = 1 << 3,
   ExplicitLengthCharacterLHS = 1 << 4,
   PolymorphicLHS = 1 << 5,
-  DeallocateLHS = 1 << 6
+  DeallocateLHS = 1 << 6,
 };
 
 #ifdef RT_DEVICE_COMPILATION
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 3839bc1d2a215..79f7032aac312 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -182,9 +182,12 @@ const Symbol *HasImpureFinal(
     const Symbol &, std::optional<int> rank = std::nullopt);
 // Is this type finalizable or does it contain any polymorphic allocatable
 // ultimate components?
-bool MayRequireFinalization(const DerivedTypeSpec &derived);
+bool MayRequireFinalization(const DerivedTypeSpec &);
 // Does this type have an allocatable direct component?
-bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived);
+bool HasAllocatableDirectComponent(const DerivedTypeSpec &);
+// Does this type have any defined assignment at any level (or any polymorphic
+// allocatable)?
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &);
 
 bool IsInBlankCommon(const Symbol &);
 bool IsAssumedLengthCharacter(const Symbol &);
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index ccc5e37c840a9..2a862e0e2858b 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -661,6 +661,10 @@ const Symbol *RuntimeTableBuilder::DescribeType(
     AddValue(dtValues, derivedTypeSchema_, "nofinalizationneeded"s,
         IntExpr<1>(
             derivedTypeSpec && !MayRequireFinalization(*derivedTypeSpec)));
+    // Similarly, a flag to enable optimized runtime assignment.
+    AddValue(dtValues, derivedTypeSchema_, "nodefinedassignment"s,
+        IntExpr<1>(
+            derivedTypeSpec && !MayHaveDefinedAssignment(*derivedTypeSpec)));
   }
   dtObject.get<ObjectEntityDetails>().set_init(MaybeExpr{
       StructureExpr(Structure(derivedTypeSchema_, std::move(dtValues)))});
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 1d1e3ac044166..3247addc905ba 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -813,6 +813,38 @@ bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) {
   return std::any_of(directs.begin(), directs.end(), IsAllocatable);
 }
 
+static bool MayHaveDefinedAssignment(
+    const DerivedTypeSpec &derived, std::set<const Scope *> &checked) {
+  if (const Scope *scope{derived.GetScope()};
+      scope && checked.find(scope) == checked.end()) {
+    checked.insert(scope);
+    for (const auto &[_, symbolRef] : *scope) {
+      if (const auto *generic{symbolRef->detailsIf<GenericDetails>()}) {
+        if (generic->kind().IsAssignment()) {
+          return true;
+        }
+      } else if (symbolRef->has<ObjectEntityDetails>() &&
+          !IsPointer(*symbolRef)) {
+        if (const DeclTypeSpec *type{symbolRef->GetType()}) {
+          if (type->IsPolymorphic()) {
+            return true;
+          } else if (const DerivedTypeSpec *derived{type->AsDerived()}) {
+            if (MayHaveDefinedAssignment(*derived, checked)) {
+              return true;
+            }
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &derived) {
+  std::set<const Scope *> checked;
+  return MayHaveDefinedAssignment(derived, checked);
+}
+
 bool IsAssumedLengthCharacter(const Symbol &symbol) {
   if (const DeclTypeSpec * type{symbol.GetType()}) {
     return type->category() == DeclTypeSpec::Character &&
diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
index b30a6bf697563..7226b06504d28 100644
--- a/flang/module/__fortran_type_info.f90
+++ b/flang/module/__fortran_type_info.f90
@@ -52,7 +52,8 @@
     integer(1) :: noInitializationNeeded ! 1 if no component w/ init
     integer(1) :: noDestructionNeeded ! 1 if no component w/ dealloc/final
     integer(1) :: noFinalizationNeeded ! 1 if nothing finalizeable
-    integer(1) :: __padding0(4)
+    integer(1) :: noDefinedAssignment ! 1 if no defined ASSIGNMENT(=)
+    integer(1) :: __padding0(3)
   end type
 
   type :: Binding
diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90
index 28f0bf78f33c9..2e05b652822b5 100644
--- a/flang/test/Lower/volatile-openmp.f90
+++ b/flang/test/Lower/volatile-openmp.f90
@@ -23,11 +23,11 @@
 ! CHECK:           %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>
 ! CHECK:           %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>
 ! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFEcontainer"} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>, !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>)
-! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
+! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
 ! CHECK:           %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
-! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>
-! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>)
+! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
+! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>
+! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>)
 ! CHECK:           %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"}   {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90
index d228cd2a84ca4..7dc92504aeebf 100644
--- a/flang/test/Semantics/typeinfo01.f90
+++ b/flang/test/Semantics/typeinfo01.f90
@@ -8,7 +8,7 @@ module m01
   end type
 !CHECK: Module scope: m01
 !CHECK: .c.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.n,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .n.n, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"n"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
 !CHECK: DerivedType scope: t1
@@ -23,8 +23,8 @@ module m02
   end type
 !CHECK: .c.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:1_8 init:[component::component(name=.n.parent,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.parent,lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.cn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=4_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .c.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.pn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end module
 
 module m03
@@ -35,7 +35,7 @@ module m03
   type(kpdt(4)) :: x
 !CHECK: .c.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.a,genre=1_1,category=2_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .dt.kpdt, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.kpdt,uninstantiated=NULL(),kindparameter=.kp.kpdt,lenparameterkind=NULL())
-!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .kp.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(8) shape: 0_8:0_8 init:[INTEGER(8)::4_8]
 end module
 
@@ -49,7 +49,7 @@ module m04
   subroutine s1(x)
     class(tbps), intent(in) :: x
   end subroutine
-!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .v.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=s1,name=.n.b1),binding(proc=s1,name=.n.b2)]
 end module
 
@@ -61,7 +61,7 @@ module m05
   subroutine s1(x)
     class(t), intent(in) :: x
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .p.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(procptrcomponent) shape: 0_8:0_8 init:[procptrcomponent::procptrcomponent(name=.n.p1,offset=0_8,initialization=s1)]
 end module
 
@@ -85,8 +85,8 @@ subroutine s2(x, y)
     class(t), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -113,8 +113,8 @@ subroutine s2(x, y)
     class(t2), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -132,7 +132,7 @@ impure elemental subroutine s1(x, y)
     class(t), intent(out) :: x
     class(t), intent(in) :: y
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
 end module
@@ -155,7 +155,7 @@ impure elemental subroutine s3(x)
   subroutine s4(x)
     type(t), contiguous :: x(:,:,:)
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)]
 end module
 
@@ -197,7 +197,7 @@ subroutine wu(x,u,iostat,iomsg)
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)]
 end module
@@ -246,7 +246,7 @@ subroutine wu(x,u,iostat,iomsg)
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)]
 end module
 
@@ -263,7 +263,7 @@ module m11
 !CHECK: .c.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:3_8 init:[component::component(name=.n.allocatable,genre=3_1,category=2_1,kind=4_1,rank=1_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.pointer,genre=2_1,category=2_1,kind=4_1,rank=0_1,offset=48_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=.di.t.pointer),component(name=.n.chauto,genre=4_1,category=4_1,kind=1_1,rank=0_1,offset=72_8,characterlen=value(genre=3_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.automatic,genre=4_1,category=2_1,kind=4_1,rank=1_1,offset=96_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=.b.t.automatic,initialization=NULL())]
 !CHECK: .di.t.pointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(.dp.t.pointer) init:.dp.t.pointer(pointer=target)
 !CHECK: .dp.t.pointer (CompilerCreated): DerivedType components: pointer
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .lpk.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::8_1]
 !CHECK: DerivedType scope: .dp.t.pointer size=24 alignment=8 instantiation of .dp.t.pointer
 !CHECK: pointer, POINTER size=24 offset=0: ObjectEntity type: REAL(4)
diff --git a/flang/test/Semantics/typeinfo03.f90 b/flang/test/Semantics/typeinfo03.f90
index f0c0a817da4a4..e2552d0a21d6f 100644
--- a/flang/test/Semantics/typeinfo03.f90
+++ b/flang/test/Semantics/typeinfo03.f90
@@ -6,4 +6,4 @@ module m
     class(*), pointer :: sp, ap(:)
   end type
 end module
-!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90
index de8464321a409..94dd2199db35a 100644
--- a/flang/test/Semantics/typeinfo04.f90
+++ b/flang/test/Semantics/typeinfo04.f90
@@ -7,18 +7,18 @@ module m
    contains
     final :: final
   end type
-!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
   type, abstract :: t1
   end type
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type, abstract :: t2
     real, allocatable :: a(:)
   end type
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type, abstract :: t3
     type(finalizable) :: x
   end type
-!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
  contains
   impure elemental subroutine final(x)
     type(finalizable), intent(in out) :: x
diff --git a/flang/test/Semantics/typeinfo05.f90 b/flang/test/Semantics/typeinfo05.f90
index 2a7f12a153eb8..df1aecf3821de 100644
--- a/flang/test/Semantics/typeinfo05.f90
+++ b/flang/test/Semantics/typeinfo05.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), pointer :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo06.f90 b/flang/test/Semantics/typeinfo06.f90
index 2385709a8eb44..22f37b1a4369d 100644
--- a/flang/test/Semantics/typeinfo06.f90
+++ b/flang/test/Semantics/typeinfo06.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), allocatable :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo07.f90 b/flang/test/Semantics/typeinfo07.f90
index e8766d9811db8..ab20d6f601106 100644
--- a/flang/test/Semantics/typeinfo07.f90
+++ b/flang/test/Semantics/typeinfo07.f90
@@ -16,7 +16,7 @@
     type(t_container_extension) :: wrapper
   end type
 end
-! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
-! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
diff --git a/flang/test/Semantics/typeinfo08.f90 b/flang/test/Semantics/typeinfo08.f90
index 689cf469dee3b..391a66f3d6664 100644
--- a/flang/test/Semantics/typeinfo08.f90
+++ b/flang/test/Semantics/typeinfo08.f90
@@ -13,7 +13,7 @@ module m
 
 !CHECK: Module scope: m size=0 alignment=1 sourceRange=113 bytes
 !CHECK: .c.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t1,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .lpk.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::4_1]
 !CHECK: .n.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"s"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90
index 92efc8f9ea54b..08e0b95abb763 100644
--- a/flang/test/Semantics/typeinfo11.f90
+++ b/flang/test/Semantics/typeinfo11.f90
@@ -14,4 +14,4 @@
 type(t2) x
 end
 
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90
new file mode 100644
index 0000000000000..6b23b63d28b1d
--- /dev/null
+++ b/flang/test/Semantics/typeinfo12.f90
@@ -0,0 +1,67 @@
+!RUN: bbc --dump-symbols %s | FileCheck %s
+!Check "nodefinedassignment" settings.
+
+module m01
+
+  type hasAsst1
+   contains
+    procedure asst1
+    generic :: assignment(=) => asst1
+  end type
+!CHECK: .dt.hasasst1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.hasasst1,name=.n.hasasst1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.hasasst1,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type hasAsst2 ! no defined assignment relevant to the runtime
+  end type
+  interface assignment(=)
+    procedure asst2
+  end interface
+!CHECK: .dt.hasasst2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.hasasst2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test1
+    type(hasAsst1) c
+  end type
+!CHECK: .dt.test1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type test2
+    type(hasAsst2) c
+  end type
+!CHECK: .dt.test2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test3
+    type(hasAsst1), pointer :: p
+  end type
+!CHECK: .dt.test3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test3,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test3,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test4
+    type(hasAsst2), pointer :: p
+  end type
+!CHECK: .dt.test4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test4,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type, extends(hasAsst1) :: test5
+  end type
+!CHECK: .dt.test5, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.test5,name=.n.test5,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test5,procptr=NULL(),special=.s.test5,specialbitset=4_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type, extends(hasAsst2) :: test6
+  end type
+!CHECK: .dt.test6, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test6,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test6,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test7
+    type(test7), allocatable :: c
+  end type
+!CHECK: .dt.test7, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test7,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test7,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test8
+    class(test8), allocatable :: c
+  end type
+!CHECK: .dt.test8, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test8,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test8,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+
+ contains
+  impure elemental subroutine asst1(left, right)
+    class(hasAsst1), intent(out) :: left
+    class(hasAsst1), intent(in) :: right
+  end
+  impure elemental subroutine asst2(left, right)
+    class(hasAsst2), intent(out) :: left
+    class(hasAsst2), intent(in) :: right
+  end
+end

From b994a4c04f38d8cfb13f3dbf3d99146cb778443e Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Tue, 10 Jun 2025 14:44:41 -0700
Subject: [PATCH 008/851] [flang][NFC] Clean up code in two new functions
 (#142037)

Two recently-added functions in Semantics/tools.h need some cleaning up
to conform to the coding style of the project. One of them should
actually be in Parser/tools.{h,cpp}, the other doesn't need to be
defined in the header.
---
 flang/include/flang/Parser/tools.h          |  3 +++
 flang/include/flang/Semantics/tools.h       | 26 ++-------------------
 flang/lib/Lower/OpenACC.cpp                 |  4 ++--
 flang/lib/Lower/OpenMP/OpenMP.cpp           |  4 ++--
 flang/lib/Parser/tools.cpp                  |  5 ++++
 flang/lib/Semantics/check-omp-structure.cpp |  8 +++----
 flang/lib/Semantics/tools.cpp               | 14 +++++++++++
 7 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/flang/include/flang/Parser/tools.h b/flang/include/flang/Parser/tools.h
index f1ead11734fa0..447bccd5d35a6 100644
--- a/flang/include/flang/Parser/tools.h
+++ b/flang/include/flang/Parser/tools.h
@@ -250,5 +250,8 @@ template <typename A> std::optional<CharBlock> GetLastSource(A &x) {
   return GetSourceHelper<false>::GetSource(const_cast<const A &>(x));
 }
 
+// Checks whether the assignment statement has a single variable on the RHS.
+bool CheckForSingleVariableOnRHS(const AssignmentStmt &);
+
 } // namespace Fortran::parser
 #endif // FORTRAN_PARSER_TOOLS_H_
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 79f7032aac312..51df7c40f5b8b 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -756,29 +756,7 @@ std::string GetCommonBlockObjectName(const Symbol &, bool underscoring);
 // Check for ambiguous USE associations
 bool HadUseError(SemanticsContext &, SourceName at, const Symbol *);
 
-/// Checks if the assignment statement has a single variable on the RHS.
-inline bool checkForSingleVariableOnRHS(
-    const Fortran::parser::AssignmentStmt &assignmentStmt) {
-  const Fortran::parser::Expr &expr{
-      std::get<Fortran::parser::Expr>(assignmentStmt.t)};
-  const Fortran::common::Indirection<Fortran::parser::Designator> *designator =
-      std::get_if<Fortran::common::Indirection<Fortran::parser::Designator>>(
-          &expr.u);
-  return designator != nullptr;
-}
-
-/// Checks if the symbol on the LHS is present in the RHS expression.
-inline bool checkForSymbolMatch(const Fortran::semantics::SomeExpr *lhs,
-    const Fortran::semantics::SomeExpr *rhs) {
-  auto lhsSyms{Fortran::evaluate::GetSymbolVector(*lhs)};
-  const Fortran::semantics::Symbol &lhsSymbol{*lhsSyms.front()};
-  for (const Fortran::semantics::Symbol &symbol :
-      Fortran::evaluate::GetSymbolVector(*rhs)) {
-    if (lhsSymbol == symbol) {
-      return true;
-    }
-  }
-  return false;
-}
+// Checks whether the symbol on the LHS is present in the RHS expression.
+bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs);
 } // namespace Fortran::semantics
 #endif // FORTRAN_SEMANTICS_TOOLS_H_
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 02dba22c29c7f..c10e1777614cd 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -653,8 +653,8 @@ void genAtomicCapture(Fortran::lower::AbstractConverter &converter,
   firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0)));
   mlir::Block &block = atomicCaptureOp->getRegion(0).back();
   firOpBuilder.setInsertionPointToStart(&block);
-  if (Fortran::semantics::checkForSingleVariableOnRHS(stmt1)) {
-    if (Fortran::semantics::checkForSymbolMatch(
+  if (Fortran::parser::CheckForSingleVariableOnRHS(stmt1)) {
+    if (Fortran::semantics::CheckForSymbolMatch(
             Fortran::semantics::GetExpr(stmt2Var),
             Fortran::semantics::GetExpr(stmt2Expr))) {
       // Atomic capture construct is of the form [capture-stmt, update-stmt]
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 6892e571e62a3..784749bba5a0c 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -3200,8 +3200,8 @@ static void genAtomicCapture(lower::AbstractConverter &converter,
   firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0)));
   mlir::Block &block = atomicCaptureOp->getRegion(0).back();
   firOpBuilder.setInsertionPointToStart(&block);
-  if (semantics::checkForSingleVariableOnRHS(stmt1)) {
-    if (semantics::checkForSymbolMatch(semantics::GetExpr(stmt2Var),
+  if (parser::CheckForSingleVariableOnRHS(stmt1)) {
+    if (semantics::CheckForSymbolMatch(semantics::GetExpr(stmt2Var),
                                        semantics::GetExpr(stmt2Expr))) {
       // Atomic capture construct is of the form [capture-stmt, update-stmt]
       const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt1Expr);
diff --git a/flang/lib/Parser/tools.cpp b/flang/lib/Parser/tools.cpp
index 6e5f1ed2fc66f..264ca520f38b8 100644
--- a/flang/lib/Parser/tools.cpp
+++ b/flang/lib/Parser/tools.cpp
@@ -174,4 +174,9 @@ const CoindexedNamedObject *GetCoindexedNamedObject(
       },
       allocateObject.u);
 }
+
+bool CheckForSingleVariableOnRHS(const AssignmentStmt &assignmentStmt) {
+  return Unwrap<Designator>(std::get<Expr>(assignmentStmt.t)) != nullptr;
+}
+
 } // namespace Fortran::parser
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index bdd078c33da92..31fcbb9683202 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2933,9 +2933,9 @@ void OmpStructureChecker::CheckAtomicCaptureConstruct(
   const auto *e2 = GetExpr(context_, stmt2Expr);
 
   if (e1 && v1 && e2 && v2) {
-    if (semantics::checkForSingleVariableOnRHS(stmt1)) {
+    if (parser::CheckForSingleVariableOnRHS(stmt1)) {
       CheckAtomicCaptureStmt(stmt1);
-      if (semantics::checkForSymbolMatch(v2, e2)) {
+      if (CheckForSymbolMatch(v2, e2)) {
         // ATOMIC CAPTURE construct is of the form [capture-stmt, update-stmt]
         CheckAtomicUpdateStmt(stmt2);
       } else {
@@ -2947,8 +2947,8 @@ void OmpStructureChecker::CheckAtomicCaptureConstruct(
             "Captured variable/array element/derived-type component %s expected to be assigned in the second statement of ATOMIC CAPTURE construct"_err_en_US,
             stmt1Expr.source);
       }
-    } else if (semantics::checkForSymbolMatch(v1, e1) &&
-        semantics::checkForSingleVariableOnRHS(stmt2)) {
+    } else if (CheckForSymbolMatch(v1, e1) &&
+        parser::CheckForSingleVariableOnRHS(stmt2)) {
       // ATOMIC CAPTURE construct is of the form [update-stmt, capture-stmt]
       CheckAtomicUpdateStmt(stmt1);
       CheckAtomicCaptureStmt(stmt2);
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 3247addc905ba..ea5ab2d455b54 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -1788,4 +1788,18 @@ bool HadUseError(
   }
 }
 
+bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs) {
+  if (lhs && rhs) {
+    if (SymbolVector lhsSymbols{evaluate::GetSymbolVector(*lhs)};
+        !lhsSymbols.empty()) {
+      const Symbol &first{*lhsSymbols.front()};
+      for (const Symbol &symbol : evaluate::GetSymbolVector(*rhs)) {
+        if (first == symbol) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
 } // namespace Fortran::semantics

From 54e72d15bc09e9e6464792711b8c475f92a759e2 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Tue, 10 Jun 2025 14:44:59 -0700
Subject: [PATCH 009/851] [flang] Ensure overrides of special procedures
 (#142465)

When a derived type declares a generic procedure binding of interest to
the runtime library, such as for ASSIGNMENT(=), it overrides any binding
that might have been present for the parent type.

Fixes https://github.com/llvm/llvm-project/issues/142414.
---
 flang/lib/Semantics/runtime-type-info.cpp |  4 ++--
 flang/test/Semantics/typeinfo13.f90       | 26 +++++++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Semantics/typeinfo13.f90

diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 2a862e0e2858b..4c186f4874152 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -1067,7 +1067,7 @@ RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope,
     specials =
         DescribeSpecialGenerics(*parentScope, thisScope, derivedTypeSpec);
   }
-  for (auto pair : dtScope) {
+  for (const auto &pair : dtScope) {
     const Symbol &symbol{*pair.second};
     if (const auto *generic{symbol.detailsIf<GenericDetails>()}) {
       DescribeSpecialGeneric(*generic, specials, thisScope, derivedTypeSpec);
@@ -1245,7 +1245,7 @@ void RuntimeTableBuilder::DescribeSpecialProc(
     AddValue(values, specialSchema_, procCompName,
         SomeExpr{evaluate::ProcedureDesignator{specific}});
     // index might already be present in the case of an override
-    specials.emplace(*index,
+    specials.insert_or_assign(*index,
         evaluate::StructureConstructor{
             DEREF(specialSchema_.AsDerived()), std::move(values)});
   }
diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90
new file mode 100644
index 0000000000000..cf4abf9e38181
--- /dev/null
+++ b/flang/test/Semantics/typeinfo13.f90
@@ -0,0 +1,26 @@
+!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s
+!Ensure ASSIGNMENT(=) overrides are applied to the special procedures table.
+module m
+  type base
+   contains
+    procedure :: baseAssign
+    generic :: assignment(=) => baseAssign
+  end type
+  type, extends(base) :: child
+   contains
+    procedure :: override
+    generic :: assignment(=) => override
+  end type
+ contains
+  impure elemental subroutine baseAssign(to, from)
+    class(base), intent(out) :: to
+    type(base), intent(in) :: from
+  end
+  impure elemental subroutine override(to, from)
+    class(child), intent(out) :: to
+    type(child), intent(in) :: from
+  end
+end
+
+!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=override)]
+!CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)]

From 2f9dfdfb35bdb10334b09476a47dc1d93beea96c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 10 Jun 2025 15:11:44 -0700
Subject: [PATCH 010/851] [IR] Simplify scalable vector handling in
 ShuffleVectorInst::getShuffleMask. NFC (#143596)

Combine the scalable vector UndefValue check with the earlier
ConstantAggregateZero handling for fixed and scalable vectors.

Assert that the rest of the code is only reached for fixed vectors.

Use append instead of resize since we know the size is increasing.
---
 llvm/lib/IR/Instructions.cpp | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index b29969657e7fc..2d89ec1b0a8d3 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -1854,23 +1854,18 @@ void ShuffleVectorInst::getShuffleMask(const Constant *Mask,
                                        SmallVectorImpl<int> &Result) {
   ElementCount EC = cast<VectorType>(Mask->getType())->getElementCount();
 
-  if (isa<ConstantAggregateZero>(Mask)) {
-    Result.resize(EC.getKnownMinValue(), 0);
+  if (isa<ConstantAggregateZero>(Mask) || isa<UndefValue>(Mask)) {
+    int MaskVal = isa<UndefValue>(Mask) ? -1 : 0;
+    Result.append(EC.getKnownMinValue(), MaskVal);
     return;
   }
 
-  Result.reserve(EC.getKnownMinValue());
+  assert(!EC.isScalable() &&
+         "Scalable vector shuffle mask must be undef or zeroinitializer");
 
-  if (EC.isScalable()) {
-    assert((isa<ConstantAggregateZero>(Mask) || isa<UndefValue>(Mask)) &&
-           "Scalable vector shuffle mask must be undef or zeroinitializer");
-    int MaskVal = isa<UndefValue>(Mask) ? -1 : 0;
-    for (unsigned I = 0; I < EC.getKnownMinValue(); ++I)
-      Result.emplace_back(MaskVal);
-    return;
-  }
+  unsigned NumElts = EC.getFixedValue();
 
-  unsigned NumElts = EC.getKnownMinValue();
+  Result.reserve(NumElts);
 
   if (auto *CDS = dyn_cast<ConstantDataSequential>(Mask)) {
     for (unsigned i = 0; i != NumElts; ++i)

From 32649e017eaa609fa556b6d6d74bb73abf37214d Mon Sep 17 00:00:00 2001
From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com>
Date: Tue, 10 Jun 2025 15:12:16 -0700
Subject: [PATCH 011/851] [IR2Vec] Exposing Embedding as an data type wrapped
 around std::vector<double> (#143197)

Currently `Embedding` is `std::vector<double>`. This PR makes it a data type wrapped around `std::vector<double>` to overload basic arithmetic operators and expose comparison operations. It _simplifies_ the usage here and in the passes where operations on `Embedding` would be performed.

(Tracking issue - #141817)
---
 llvm/include/llvm/Analysis/IR2Vec.h    |  69 ++++++--
 llvm/lib/Analysis/IR2Vec.cpp           |  69 +++++---
 llvm/unittests/Analysis/IR2VecTest.cpp | 208 ++++++++++++++++++++-----
 3 files changed, 274 insertions(+), 72 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 9fd1b0ae8e248..8bf21b0e75d67 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -53,7 +53,63 @@ class raw_ostream;
 enum class IR2VecKind { Symbolic };
 
 namespace ir2vec {
-using Embedding = std::vector<double>;
+/// Embedding is a datatype that wraps std::vector<double>. It provides
+/// additional functionality for arithmetic and comparison operations.
+/// It is meant to be used *like* std::vector<double> but is more restrictive
+/// in the sense that it does not allow the user to change the size of the
+/// embedding vector. The dimension of the embedding is fixed at the time of
+/// construction of Embedding object. But the elements can be modified in-place.
+struct Embedding {
+private:
+  std::vector<double> Data;
+
+public:
+  Embedding() = default;
+  Embedding(const std::vector<double> &V) : Data(V) {}
+  Embedding(std::vector<double> &&V) : Data(std::move(V)) {}
+  Embedding(std::initializer_list<double> IL) : Data(IL) {}
+
+  explicit Embedding(size_t Size) : Data(Size) {}
+  Embedding(size_t Size, double InitialValue) : Data(Size, InitialValue) {}
+
+  size_t size() const { return Data.size(); }
+  bool empty() const { return Data.empty(); }
+
+  double &operator[](size_t Itr) {
+    assert(Itr < Data.size() && "Index out of bounds");
+    return Data[Itr];
+  }
+
+  const double &operator[](size_t Itr) const {
+    assert(Itr < Data.size() && "Index out of bounds");
+    return Data[Itr];
+  }
+
+  using iterator = typename std::vector<double>::iterator;
+  using const_iterator = typename std::vector<double>::const_iterator;
+
+  iterator begin() { return Data.begin(); }
+  iterator end() { return Data.end(); }
+  const_iterator begin() const { return Data.begin(); }
+  const_iterator end() const { return Data.end(); }
+  const_iterator cbegin() const { return Data.cbegin(); }
+  const_iterator cend() const { return Data.cend(); }
+
+  const std::vector<double> &getData() const { return Data; }
+
+  /// Arithmetic operators
+  Embedding &operator+=(const Embedding &RHS);
+  Embedding &operator-=(const Embedding &RHS);
+
+  /// Adds Src Embedding scaled by Factor with the called Embedding.
+  /// Called_Embedding += Src * Factor
+  Embedding &scaleAndAdd(const Embedding &Src, float Factor);
+
+  /// Returns true if the embedding is approximately equal to the RHS embedding
+  /// within the specified tolerance.
+  bool approximatelyEquals(const Embedding &RHS, double Tolerance = 1e-6) const;
+};
+
 using InstEmbeddingsMap = DenseMap<const Instruction *, Embedding>;
 using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
 // FIXME: Current the keys are strings. This can be changed to
@@ -61,8 +117,8 @@ using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
 using Vocab = std::map<std::string, Embedding>;
 
 /// Embedder provides the interface to generate embeddings (vector
-/// representations) for instructions, basic blocks, and functions. The vector
-/// representations are generated using IR2Vec algorithms.
+/// representations) for instructions, basic blocks, and functions. The
+/// vector representations are generated using IR2Vec algorithms.
 ///
 /// The Embedder class is an abstract class and it is intended to be
 /// subclassed for different IR2Vec algorithms like Symbolic and Flow-aware.
@@ -99,13 +155,6 @@ class Embedder {
   /// zero vector.
   Embedding lookupVocab(const std::string &Key) const;
 
-  /// Adds two vectors: Dst += Src
-  static void addVectors(Embedding &Dst, const Embedding &Src);
-
-  /// Adds Src vector scaled by Factor to Dst vector: Dst += Src * Factor
-  static void addScaledVector(Embedding &Dst, const Embedding &Src,
-                              float Factor);
-
 public:
   virtual ~Embedder() = default;
 
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 490db5fdcdf99..25ce35d4ace37 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -55,6 +55,51 @@ static cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional,
 
 AnalysisKey IR2VecVocabAnalysis::Key;
 
+namespace llvm::json {
+inline bool fromJSON(const llvm::json::Value &E, Embedding &Out,
+                     llvm::json::Path P) {
+  std::vector<double> TempOut;
+  if (!llvm::json::fromJSON(E, TempOut, P))
+    return false;
+  Out = Embedding(std::move(TempOut));
+  return true;
+}
+} // namespace llvm::json
+
+// ==----------------------------------------------------------------------===//
+// Embedding
+//===----------------------------------------------------------------------===//
+
+Embedding &Embedding::operator+=(const Embedding &RHS) {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
+                 std::plus<double>());
+  return *this;
+}
+
+Embedding &Embedding::operator-=(const Embedding &RHS) {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
+                 std::minus<double>());
+  return *this;
+}
+
+Embedding &Embedding::scaleAndAdd(const Embedding &Src, float Factor) {
+  assert(this->size() == Src.size() && "Vectors must have the same dimension");
+  for (size_t Itr = 0; Itr < this->size(); ++Itr)
+    (*this)[Itr] += Src[Itr] * Factor;
+  return *this;
+}
+
+bool Embedding::approximatelyEquals(const Embedding &RHS,
+                                    double Tolerance) const {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  for (size_t Itr = 0; Itr < this->size(); ++Itr)
+    if (std::abs((*this)[Itr] - RHS[Itr]) > Tolerance)
+      return false;
+  return true;
+}
+
 // ==----------------------------------------------------------------------===//
 // Embedder and its subclasses
 //===----------------------------------------------------------------------===//
@@ -73,20 +118,6 @@ Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
   return make_error<StringError>("Unknown IR2VecKind", errc::invalid_argument);
 }
 
-void Embedder::addVectors(Embedding &Dst, const Embedding &Src) {
-  assert(Dst.size() == Src.size() && "Vectors must have the same dimension");
-  std::transform(Dst.begin(), Dst.end(), Src.begin(), Dst.begin(),
-                 std::plus<double>());
-}
-
-void Embedder::addScaledVector(Embedding &Dst, const Embedding &Src,
-                               float Factor) {
-  assert(Dst.size() == Src.size() && "Vectors must have the same dimension");
-  for (size_t i = 0; i < Dst.size(); ++i) {
-    Dst[i] += Src[i] * Factor;
-  }
-}
-
 // FIXME: Currently lookups are string based. Use numeric Keys
 // for efficiency
 Embedding Embedder::lookupVocab(const std::string &Key) const {
@@ -164,20 +195,20 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
     Embedding InstVector(Dimension, 0);
 
     const auto OpcVec = lookupVocab(I.getOpcodeName());
-    addScaledVector(InstVector, OpcVec, OpcWeight);
+    InstVector.scaleAndAdd(OpcVec, OpcWeight);
 
     // FIXME: Currently lookups are string based. Use numeric Keys
     // for efficiency.
     const auto Type = I.getType();
     const auto TypeVec = getTypeEmbedding(Type);
-    addScaledVector(InstVector, TypeVec, TypeWeight);
+    InstVector.scaleAndAdd(TypeVec, TypeWeight);
 
     for (const auto &Op : I.operands()) {
       const auto OperandVec = getOperandEmbedding(Op.get());
-      addScaledVector(InstVector, OperandVec, ArgWeight);
+      InstVector.scaleAndAdd(OperandVec, ArgWeight);
     }
     InstVecMap[&I] = InstVector;
-    addVectors(BBVector, InstVector);
+    BBVector += InstVector;
   }
   BBVecMap[&BB] = BBVector;
 }
@@ -187,7 +218,7 @@ void SymbolicEmbedder::computeEmbeddings() const {
     return;
   for (const auto &BB : F) {
     computeEmbeddings(BB);
-    addVectors(FuncVector, BBVecMap[&BB]);
+    FuncVector += BBVecMap[&BB];
   }
 }
 
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 9e47b2cd8bedd..053b9f75e7a66 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -14,6 +14,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/JSON.h"
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -32,89 +33,209 @@ class TestableEmbedder : public Embedder {
   void computeEmbeddings() const override {}
   void computeEmbeddings(const BasicBlock &BB) const override {}
   using Embedder::lookupVocab;
-  static void addVectors(Embedding &Dst, const Embedding &Src) {
-    Embedder::addVectors(Dst, Src);
+};
+
+TEST(EmbeddingTest, ConstructorsAndAccessors) {
+  // Default constructor
+  {
+    Embedding E;
+    EXPECT_TRUE(E.empty());
+    EXPECT_EQ(E.size(), 0u);
   }
-  static void addScaledVector(Embedding &Dst, const Embedding &Src,
-                              float Factor) {
-    Embedder::addScaledVector(Dst, Src, Factor);
+
+  // Constructor with const std::vector<double>&
+  {
+    std::vector<double> Data = {1.0, 2.0, 3.0};
+    Embedding E(Data);
+    EXPECT_FALSE(E.empty());
+    ASSERT_THAT(E, SizeIs(3u));
+    EXPECT_THAT(E.getData(), ElementsAre(1.0, 2.0, 3.0));
+    EXPECT_EQ(E[0], 1.0);
+    EXPECT_EQ(E[1], 2.0);
+    EXPECT_EQ(E[2], 3.0);
   }
-};
 
-TEST(IR2VecTest, CreateSymbolicEmbedder) {
-  Vocab V = {{"foo", {1.0, 2.0}}};
+  // Constructor with std::vector<double>&&
+  {
+    Embedding E(std::vector<double>({4.0, 5.0}));
+    ASSERT_THAT(E, SizeIs(2u));
+    EXPECT_THAT(E.getData(), ElementsAre(4.0, 5.0));
+  }
 
-  LLVMContext Ctx;
-  Module M("M", Ctx);
-  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
-  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
+  // Constructor with std::initializer_list<double>
+  {
+    Embedding E({6.0, 7.0, 8.0, 9.0});
+    ASSERT_THAT(E, SizeIs(4u));
+    EXPECT_THAT(E.getData(), ElementsAre(6.0, 7.0, 8.0, 9.0));
+    EXPECT_EQ(E[0], 6.0);
+    E[0] = 6.5;
+    EXPECT_EQ(E[0], 6.5);
+  }
 
-  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
-  EXPECT_TRUE(static_cast<bool>(Result));
+  // Constructor with size_t
+  {
+    Embedding E(5);
+    ASSERT_THAT(E, SizeIs(5u));
+    EXPECT_THAT(E.getData(), ElementsAre(0.0, 0.0, 0.0, 0.0, 0.0));
+  }
 
-  auto *Emb = Result->get();
-  EXPECT_NE(Emb, nullptr);
-}
+  // Constructor with size_t and double
+  {
+    Embedding E(5, 1.5);
+    ASSERT_THAT(E, SizeIs(5u));
+    EXPECT_THAT(E.getData(), ElementsAre(1.5, 1.5, 1.5, 1.5, 1.5));
+  }
 
-TEST(IR2VecTest, CreateInvalidMode) {
-  Vocab V = {{"foo", {1.0, 2.0}}};
+  // Test iterators
+  {
+    Embedding E({6.5, 7.0, 8.0, 9.0});
+    std::vector<double> VecE;
+    for (double Val : E) {
+      VecE.push_back(Val);
+    }
+    EXPECT_THAT(VecE, ElementsAre(6.5, 7.0, 8.0, 9.0));
+
+    const Embedding CE = E;
+    std::vector<double> VecCE;
+    for (const double &Val : CE) {
+      VecCE.push_back(Val);
+    }
+    EXPECT_THAT(VecCE, ElementsAre(6.5, 7.0, 8.0, 9.0));
+
+    EXPECT_EQ(*E.begin(), 6.5);
+    EXPECT_EQ(*(E.end() - 1), 9.0);
+    EXPECT_EQ(*CE.cbegin(), 6.5);
+    EXPECT_EQ(*(CE.cend() - 1), 9.0);
+  }
+}
 
-  LLVMContext Ctx;
-  Module M("M", Ctx);
-  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
-  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
+TEST(EmbeddingTest, AddVectors) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
 
-  // static_cast an invalid int to IR2VecKind
-  auto Result = Embedder::create(static_cast<IR2VecKind>(-1), *F, V);
-  EXPECT_FALSE(static_cast<bool>(Result));
+  E1 += E2;
+  EXPECT_THAT(E1, ElementsAre(1.5, 3.5, 2.0));
 
-  std::string ErrMsg;
-  llvm::handleAllErrors(
-      Result.takeError(),
-      [&](const llvm::ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
-  EXPECT_NE(ErrMsg.find("Unknown IR2VecKind"), std::string::npos);
+  // Check that E2 is unchanged
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
 }
 
-TEST(IR2VecTest, AddVectors) {
+TEST(EmbeddingTest, SubtractVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
 
-  TestableEmbedder::addVectors(E1, E2);
-  EXPECT_THAT(E1, ElementsAre(1.5, 3.5, 2.0));
+  E1 -= E2;
+  EXPECT_THAT(E1, ElementsAre(0.5, 0.5, 4.0));
 
   // Check that E2 is unchanged
   EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
 }
 
-TEST(IR2VecTest, AddScaledVector) {
+TEST(EmbeddingTest, AddScaledVector) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {2.0, 0.5, -1.0};
 
-  TestableEmbedder::addScaledVector(E1, E2, 0.5f);
+  E1.scaleAndAdd(E2, 0.5f);
   EXPECT_THAT(E1, ElementsAre(2.0, 2.25, 2.5));
 
   // Check that E2 is unchanged
   EXPECT_THAT(E2, ElementsAre(2.0, 0.5, -1.0));
 }
 
+TEST(EmbeddingTest, ApproximatelyEqual) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {1.0000001, 2.0000001, 3.0000001};
+  EXPECT_TRUE(E1.approximatelyEquals(E2)); // Diff = 1e-7
+
+  Embedding E3 = {1.00002, 2.00002, 3.00002}; // Diff = 2e-5
+  EXPECT_FALSE(E1.approximatelyEquals(E3));
+  EXPECT_TRUE(E1.approximatelyEquals(E3, 3e-5));
+
+  Embedding E_clearly_within = {1.0000005, 2.0000005, 3.0000005}; // Diff = 5e-7
+  EXPECT_TRUE(E1.approximatelyEquals(E_clearly_within));
+
+  Embedding E_clearly_outside = {1.00001, 2.00001, 3.00001}; // Diff = 1e-5
+  EXPECT_FALSE(E1.approximatelyEquals(E_clearly_outside));
+
+  Embedding E4 = {1.0, 2.0, 3.5}; // Large diff
+  EXPECT_FALSE(E1.approximatelyEquals(E4, 0.01));
+
+  Embedding E5 = {1.0, 2.0, 3.0};
+  EXPECT_TRUE(E1.approximatelyEquals(E5, 0.0));
+  EXPECT_TRUE(E1.approximatelyEquals(E5));
+}
+
 #if GTEST_HAS_DEATH_TEST
 #ifndef NDEBUG
-TEST(IR2VecTest, MismatchedDimensionsAddVectors) {
+TEST(EmbeddingTest, AccessOutOfBounds) {
+  Embedding E = {1.0, 2.0, 3.0};
+  EXPECT_DEATH(E[3], "Index out of bounds");
+  EXPECT_DEATH(E[-1], "Index out of bounds");
+  EXPECT_DEATH(E[4] = 4.0, "Index out of bounds");
+}
+
+TEST(EmbeddingTest, MismatchedDimensionsAddVectors) {
   Embedding E1 = {1.0, 2.0};
   Embedding E2 = {1.0};
-  EXPECT_DEATH(TestableEmbedder::addVectors(E1, E2),
-               "Vectors must have the same dimension");
+  EXPECT_DEATH(E1 += E2, "Vectors must have the same dimension");
+}
+
+TEST(EmbeddingTest, MismatchedDimensionsSubtractVectors) {
+  Embedding E1 = {1.0, 2.0};
+  Embedding E2 = {1.0};
+  EXPECT_DEATH(E1 -= E2, "Vectors must have the same dimension");
 }
 
-TEST(IR2VecTest, MismatchedDimensionsAddScaledVector) {
+TEST(EmbeddingTest, MismatchedDimensionsAddScaledVector) {
   Embedding E1 = {1.0, 2.0};
   Embedding E2 = {1.0};
-  EXPECT_DEATH(TestableEmbedder::addScaledVector(E1, E2, 1.0f),
+  EXPECT_DEATH(E1.scaleAndAdd(E2, 1.0f),
+               "Vectors must have the same dimension");
+}
+
+TEST(EmbeddingTest, MismatchedDimensionsApproximatelyEqual) {
+  Embedding E1 = {1.0, 2.0};
+  Embedding E2 = {1.010};
+  EXPECT_DEATH(E1.approximatelyEquals(E2),
                "Vectors must have the same dimension");
 }
 #endif // NDEBUG
 #endif // GTEST_HAS_DEATH_TEST
 
+TEST(IR2VecTest, CreateSymbolicEmbedder) {
+  Vocab V = {{"foo", {1.0, 2.0}}};
+
+  LLVMContext Ctx;
+  Module M("M", Ctx);
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
+
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  EXPECT_TRUE(static_cast<bool>(Result));
+
+  auto *Emb = Result->get();
+  EXPECT_NE(Emb, nullptr);
+}
+
+TEST(IR2VecTest, CreateInvalidMode) {
+  Vocab V = {{"foo", {1.0, 2.0}}};
+
+  LLVMContext Ctx;
+  Module M("M", Ctx);
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
+
+  // static_cast an invalid int to IR2VecKind
+  auto Result = Embedder::create(static_cast<IR2VecKind>(-1), *F, V);
+  EXPECT_FALSE(static_cast<bool>(Result));
+
+  std::string ErrMsg;
+  llvm::handleAllErrors(
+      Result.takeError(),
+      [&](const llvm::ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
+  EXPECT_NE(ErrMsg.find("Unknown IR2VecKind"), std::string::npos);
+}
+
 TEST(IR2VecTest, LookupVocab) {
   Vocab V = {{"foo", {1.0, 2.0}}, {"bar", {3.0, 4.0}}};
   LLVMContext Ctx;
@@ -136,8 +257,9 @@ TEST(IR2VecTest, ZeroDimensionEmbedding) {
   Embedding E1;
   Embedding E2;
   // Should be no-op, but not crash
-  TestableEmbedder::addVectors(E1, E2);
-  TestableEmbedder::addScaledVector(E1, E2, 1.0f);
+  E1 += E2;
+  E1 -= E2;
+  E1.scaleAndAdd(E2, 1.0f);
   EXPECT_TRUE(E1.empty());
 }
 

From 3a2bcd96e22721312c9d340c9122a3988dc1e222 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Tue, 10 Jun 2025 15:26:54 -0700
Subject: [PATCH 012/851] [RISCV][TTI] Allow partial reduce with mismatched
 extends (#143608)

This depends on the recently add partial_reduce_sumla node for lowering
but at this point, we have all the parts.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   3 +-
 .../RISCV/partial-reduce-dot-product.ll       | 439 ++++++++++++------
 2 files changed, 296 insertions(+), 146 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ff822dec232a9..d5ea0c5d52293 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -305,8 +305,7 @@ InstructionCost RISCVTTIImpl::getPartialReductionCost(
   if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
       Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
       InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
-      OpAExtend != OpBExtend || !AccumType->isIntegerTy(32) ||
-      !VF.isKnownMultipleOf(4))
+      !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
     return InstructionCost::getInvalid();
 
   Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 847c4ba0bebfc..8c29da02b813c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -351,79 +351,153 @@ for.exit:                        ; preds = %for.body
 
 
 define i32 @vqdotsu(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @vqdotsu(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
+; V-LABEL: define i32 @vqdotsu(
+; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; V-NEXT:  entry:
+; V-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; V-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; V:       vector.ph:
+; V-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; V-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; V-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT:    br label [[VECTOR_BODY:%.*]]
+; V:       vector.body:
+; V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; V-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; V-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT:    [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; V-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; V-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; V-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; V-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; V-NEXT:    [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
+; V-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; V-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; V-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; V:       middle.block:
+; V-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; V-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; V-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; V:       scalar.ph:
+;
+; ZVQDOTQ-LABEL: define i32 @vqdotsu(
+; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ZVQDOTQ-NEXT:  entry:
+; ZVQDOTQ-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; ZVQDOTQ-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; ZVQDOTQ:       vector.ph:
+; ZVQDOTQ-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; ZVQDOTQ-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; ZVQDOTQ-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ZVQDOTQ:       vector.body:
+; ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; ZVQDOTQ-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT:    [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; ZVQDOTQ-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
+; ZVQDOTQ-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVQDOTQ-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVQDOTQ-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; ZVQDOTQ:       middle.block:
+; ZVQDOTQ-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> [[PARTIAL_REDUCE]])
+; ZVQDOTQ-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; ZVQDOTQ-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; ZVQDOTQ:       scalar.ph:
 ;
-; FIXED-LABEL: define i32 @vqdotsu(
-; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; FIXED:       vector.body:
-; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
-; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
-; FIXED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
-; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
-; FIXED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-NEXT:    [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-NEXT:    [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-NEXT:    [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
-; FIXED-NEXT:    [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
-; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; FIXED:       middle.block:
-; FIXED-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
-; FIXED-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; FIXED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; FIXED:       scalar.ph:
+; FIXED-V-LABEL: define i32 @vqdotsu(
+; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; FIXED-V-NEXT:  entry:
+; FIXED-V-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-V:       vector.ph:
+; FIXED-V-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-V:       vector.body:
+; FIXED-V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; FIXED-V-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; FIXED-V-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; FIXED-V-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-V-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; FIXED-V-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; FIXED-V-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; FIXED-V-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
+; FIXED-V-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-V-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
+; FIXED-V-NEXT:    [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-V-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
+; FIXED-V-NEXT:    [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
+; FIXED-V-NEXT:    [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
+; FIXED-V-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-V-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-V-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FIXED-V:       middle.block:
+; FIXED-V-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
+; FIXED-V-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
+; FIXED-V-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-V:       scalar.ph:
+;
+; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu(
+; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; FIXED-ZVQDOTQ-NEXT:  entry:
+; FIXED-ZVQDOTQ-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-ZVQDOTQ:       vector.ph:
+; FIXED-ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-ZVQDOTQ:       vector.body:
+; FIXED-ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; FIXED-ZVQDOTQ-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; FIXED-ZVQDOTQ-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
+; FIXED-ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-ZVQDOTQ-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-ZVQDOTQ-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FIXED-ZVQDOTQ:       middle.block:
+; FIXED-ZVQDOTQ-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; FIXED-ZVQDOTQ-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-ZVQDOTQ:       scalar.ph:
 ;
 entry:
   br label %for.body
@@ -448,79 +522,153 @@ for.exit:                        ; preds = %for.body
 }
 
 define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @vqdotsu2(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
+; V-LABEL: define i32 @vqdotsu2(
+; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; V-NEXT:  entry:
+; V-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; V-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; V:       vector.ph:
+; V-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; V-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; V-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT:    br label [[VECTOR_BODY:%.*]]
+; V:       vector.body:
+; V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; V-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; V-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT:    [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; V-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; V-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; V-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT:    [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; V-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; V-NEXT:    [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
+; V-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; V-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; V-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; V:       middle.block:
+; V-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; V-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; V-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; V:       scalar.ph:
+;
+; ZVQDOTQ-LABEL: define i32 @vqdotsu2(
+; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ZVQDOTQ-NEXT:  entry:
+; ZVQDOTQ-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; ZVQDOTQ-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; ZVQDOTQ:       vector.ph:
+; ZVQDOTQ-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; ZVQDOTQ-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; ZVQDOTQ-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ZVQDOTQ:       vector.body:
+; ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; ZVQDOTQ-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT:    [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; ZVQDOTQ-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT:    [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
+; ZVQDOTQ-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVQDOTQ-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVQDOTQ-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; ZVQDOTQ:       middle.block:
+; ZVQDOTQ-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> [[PARTIAL_REDUCE]])
+; ZVQDOTQ-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; ZVQDOTQ-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; ZVQDOTQ:       scalar.ph:
 ;
-; FIXED-LABEL: define i32 @vqdotsu2(
-; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; FIXED:       vector.body:
-; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
-; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
-; FIXED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-NEXT:    [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
-; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
-; FIXED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-NEXT:    [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
-; FIXED-NEXT:    [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
-; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; FIXED:       middle.block:
-; FIXED-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
-; FIXED-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; FIXED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; FIXED:       scalar.ph:
+; FIXED-V-LABEL: define i32 @vqdotsu2(
+; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; FIXED-V-NEXT:  entry:
+; FIXED-V-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-V:       vector.ph:
+; FIXED-V-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-V:       vector.body:
+; FIXED-V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; FIXED-V-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; FIXED-V-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; FIXED-V-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-V-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; FIXED-V-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; FIXED-V-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; FIXED-V-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
+; FIXED-V-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-V-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
+; FIXED-V-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-V-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
+; FIXED-V-NEXT:    [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
+; FIXED-V-NEXT:    [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
+; FIXED-V-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-V-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-V-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; FIXED-V:       middle.block:
+; FIXED-V-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
+; FIXED-V-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
+; FIXED-V-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-V:       scalar.ph:
+;
+; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu2(
+; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; FIXED-ZVQDOTQ-NEXT:  entry:
+; FIXED-ZVQDOTQ-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-ZVQDOTQ:       vector.ph:
+; FIXED-ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-ZVQDOTQ:       vector.body:
+; FIXED-ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; FIXED-ZVQDOTQ-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; FIXED-ZVQDOTQ-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
+; FIXED-ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-ZVQDOTQ-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-ZVQDOTQ-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; FIXED-ZVQDOTQ:       middle.block:
+; FIXED-ZVQDOTQ-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; FIXED-ZVQDOTQ-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-ZVQDOTQ:       scalar.ph:
 ;
 entry:
   br label %for.body
@@ -543,3 +691,6 @@ for.body:                                         ; preds = %for.body, %entry
 for.exit:                        ; preds = %for.body
   ret i32 %add
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; FIXED: {{.*}}

From c7063380205d8776e281f7a6603119aa8ea28c12 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 10 Jun 2025 15:34:54 -0700
Subject: [PATCH 013/851] [lldb] Fix `target stop-hook add` help output

The help output for `target stop-hook add` references non-existing
option `--one-line-command`. The correct option is `--one-liner`:

```
-o <one-line-command> ( --one-liner <one-line-command> )
     Add a command for the stop hook.  Can be specified more than once,
     and commands will be run in the order they appear.
```

This commit fixes the help text.

rdar://152730660
---
 lldb/source/Commands/CommandObjectTarget.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index 21b21954bbc90..a4ced37649ea0 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -4885,9 +4885,9 @@ class CommandObjectTargetStopHookAdd : public CommandObjectParsed,
 Command Based stop-hooks:
 -------------------------
   Stop hooks can run a list of lldb commands by providing one or more
-  --one-line-command options.  The commands will get run in the order they are
-  added.  Or you can provide no commands, in which case you will enter a
-  command editor where you can enter the commands to be run.
+  --one-liner options.  The commands will get run in the order they are added.
+  Or you can provide no commands, in which case you will enter a command editor
+  where you can enter the commands to be run.
 
 Python Based Stop Hooks:
 ------------------------

From 32d2b6ba4797584743d4764b25af0ae6f6c3d063 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Tue, 10 Jun 2025 15:58:53 -0700
Subject: [PATCH 014/851] [HWASAN] Disable LSan test on Android (#143625)

Android HWASan does not support LSan.
---
 compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp b/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp
index b6e486b291f3a..91acd28a1a5ff 100644
--- a/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp
+++ b/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp
@@ -1,6 +1,9 @@
 // Make sure dlerror is not classified as a leak even if we use dynamic TLS.
 // This is currently not implemented, so this test is XFAIL.
 
+// Android HWAsan does not support LSan.
+// UNSUPPORTED: android
+
 // RUN: %clangxx_hwasan -O0 %s -o %t && HWASAN_OPTIONS=detect_leaks=1 %run %t
 
 #include <assert.h>

From 48122a797710a05b5b8620f6051e9716a8e5a6c3 Mon Sep 17 00:00:00 2001
From: Zhen Wang <37195552+wangzpgi@users.noreply.github.com>
Date: Tue, 10 Jun 2025 16:15:12 -0700
Subject: [PATCH 015/851] [flang][cuda] Fix CUDA generic resolution for VALUE
 arguments in device procedures (#140952)

For actual arguments that have VALUE attribute inside device routines, treat them as if they have device attribute.
---
 flang/lib/Semantics/check-call.cpp |  7 +++++++
 flang/test/Semantics/cuf21.cuf     | 11 +++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index dfc2ddbacf071..6f2503285013d 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -1033,6 +1033,13 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
               *actualDataAttr == common::CUDADataAttr::Managed)) {
         actualDataAttr = common::CUDADataAttr::Device;
       }
+      // For device procedures, treat actual arguments with VALUE attribute as
+      // device data
+      if (!actualDataAttr && actualLastSymbol && IsValue(*actualLastSymbol) &&
+          (*procedure.cudaSubprogramAttrs ==
+              common::CUDASubprogramAttrs::Device)) {
+        actualDataAttr = common::CUDADataAttr::Device;
+      }
     }
     if (dummyDataAttr == common::CUDADataAttr::Device &&
         (dummyIsAssumedShape || dummyIsAssumedRank) &&
diff --git a/flang/test/Semantics/cuf21.cuf b/flang/test/Semantics/cuf21.cuf
index b8b99a8d1d9be..077657c8a52d5 100644
--- a/flang/test/Semantics/cuf21.cuf
+++ b/flang/test/Semantics/cuf21.cuf
@@ -9,19 +9,22 @@ module mlocModule
   end interface maxlocUpdate
 contains
 
-  attributes(global) subroutine maxlocPartialMaskR_32F1D()
+  attributes(global) subroutine maxlocPartialMaskR_32F1D(back)
     implicit none
+    logical, intent(in), value :: back
     real(4) :: mval
 
-    call maxlocUpdate(mval)
+    call maxlocUpdate(mval, back)
 
   end subroutine maxlocPartialMaskR_32F1D
 
-  attributes(device) subroutine maxlocUpdateR_32F(mval)
+  attributes(device) subroutine maxlocUpdateR_32F(mval, back)
     real(4) :: mval
+    logical :: back
   end subroutine maxlocUpdateR_32F
 
-  attributes(device) subroutine maxlocUpdateR_64F(mval)
+  attributes(device) subroutine maxlocUpdateR_64F(mval, back)
     real(8) :: mval
+    logical :: back
   end subroutine maxlocUpdateR_64F
 end module

From 1bf4702d2bbaad522886dfbab913a8dd6efe3b85 Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang@google.com>
Date: Tue, 10 Jun 2025 16:18:53 -0700
Subject: [PATCH 016/851] Disable prctl test when building for arm or riscv.
 (#143627)

I'm setting up a buildbot for arm32 using qemu and qemu doesn't support
PR_GET_THP_DISABLE.
Disable the test for now while we figure out what to do about that.

Also disable for riscv because we may do the same for riscv buildbots.
---
 libc/test/src/sys/prctl/linux/CMakeLists.txt | 6 ++++++
 libc/test/src/sys/prctl/linux/prctl_test.cpp | 1 +
 2 files changed, 7 insertions(+)

diff --git a/libc/test/src/sys/prctl/linux/CMakeLists.txt b/libc/test/src/sys/prctl/linux/CMakeLists.txt
index b06e1c8087008..d02900e1857a0 100644
--- a/libc/test/src/sys/prctl/linux/CMakeLists.txt
+++ b/libc/test/src/sys/prctl/linux/CMakeLists.txt
@@ -1,5 +1,10 @@
 add_custom_target(libc_sys_prctl_unittests)
 
+# Temporarily disable this test while setting up arm and riscv buildbots
+# using qemu, since PR_GET_THP_DISABLE is not supported on qemu.
+if (NOT (LIBC_TARGET_ARCHITECTURE_IS_ARM OR
+	 LIBC_TARGET_ARCHITECTURE_IS_RISCV32 OR
+	 LIBC_TARGET_ARCHITECTURE_IS_RISCV64))
 add_libc_unittest(
   prctl_test
   SUITE
@@ -13,3 +18,4 @@ add_libc_unittest(
     libc.test.UnitTest.ErrnoCheckingTest
     libc.test.UnitTest.ErrnoSetterMatcher
 )
+endif()
diff --git a/libc/test/src/sys/prctl/linux/prctl_test.cpp b/libc/test/src/sys/prctl/linux/prctl_test.cpp
index 374c905e0ef8a..76b829c82d1be 100644
--- a/libc/test/src/sys/prctl/linux/prctl_test.cpp
+++ b/libc/test/src/sys/prctl/linux/prctl_test.cpp
@@ -34,6 +34,7 @@ TEST_F(LlvmLibcSysPrctlTest, GetSetName) {
 TEST_F(LlvmLibcSysPrctlTest, GetTHPDisable) {
   // Manually check errno since the return value logic here is not
   // covered in ErrnoSetterMatcher.
+  // Note that PR_GET_THP_DISABLE is not supported by QEMU.
   int ret = LIBC_NAMESPACE::prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0);
   ASSERT_ERRNO_SUCCESS();
   // PR_GET_THP_DISABLE return (as the function result) the current

From ad479ddb343c2756e6eed0f2999bbdb88a65c7c5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 11 Jun 2025 08:49:13 +0900
Subject: [PATCH 017/851] Revert "[SeparateConstOffsetFromGEP] Decompose
 constant xor operand if possible (#135788)"

This reverts commit 13ccce28776d8ad27b0c6a92b5a452d62da05663.

The tests are on non-canonical IR, and adds an extra unrelated
pre-processing step to the pass. I'm assuming this is a workaround
for the known-bits recursion depth limit in instcombine.
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     | 193 -----------------
 .../AMDGPU/xor-to-or-disjoint.ll              | 204 ------------------
 2 files changed, 397 deletions(-)
 delete mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6fae9f1dd2404..320b79203c0b3 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -174,7 +174,6 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -191,7 +190,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
@@ -200,8 +198,6 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-#define DEBUG_TYPE "separate-offset-gep"
-
 static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
     "disable-separate-const-offset-from-gep", cl::init(false),
     cl::desc("Do not separate the constant offset from a GEP instruction"),
@@ -492,42 +488,6 @@ class SeparateConstOffsetFromGEP {
   DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
 };
 
-/// A helper class that aims to convert xor operations into or operations when
-/// their operands are disjoint and the result is used in a GEP's index. This
-/// can then enable further GEP optimizations by effectively turning BaseVal |
-/// Const into BaseVal + Const when they are disjoint, which
-/// SeparateConstOffsetFromGEP can then process. This is a common pattern that
-/// sets up a grid of memory accesses across a wave where each thread acesses
-/// data at various offsets.
-class XorToOrDisjointTransformer {
-public:
-  XorToOrDisjointTransformer(Function &F, DominatorTree &DT,
-                             const DataLayout &DL)
-      : F(F), DT(DT), DL(DL) {}
-
-  bool run();
-
-private:
-  Function &F;
-  DominatorTree &DT;
-  const DataLayout &DL;
-  /// Maps a common operand to all Xor instructions
-  using XorOpList = SmallVector<std::pair<BinaryOperator *, APInt>, 8>;
-  using XorBaseValInst = DenseMap<Instruction *, XorOpList>;
-  XorBaseValInst XorGroups;
-
-  /// Checks if the given value has at least one GetElementPtr user
-  static bool hasGEPUser(const Value *V);
-
-  /// Helper function to check if BaseXor dominates all XORs in the group
-  bool dominatesAllXors(BinaryOperator *BaseXor, const XorOpList &XorsInGroup);
-
-  /// Processes a group of XOR instructions that share the same non-constant
-  /// base operand. Returns true if this group's processing modified the
-  /// function.
-  bool processXorGroup(Instruction *OriginalBaseInst, XorOpList &XorsInGroup);
-};
-
 } // end anonymous namespace
 
 char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
@@ -1263,154 +1223,6 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   return true;
 }
 
-// Helper function to check if an instruction has at least one GEP user
-bool XorToOrDisjointTransformer::hasGEPUser(const Value *V) {
-  return llvm::any_of(V->users(), [](const User *U) {
-    return isa<llvm::GetElementPtrInst>(U);
-  });
-}
-
-bool XorToOrDisjointTransformer::dominatesAllXors(
-    BinaryOperator *BaseXor, const XorOpList &XorsInGroup) {
-  return llvm::all_of(XorsInGroup, [&](const auto &XorEntry) {
-    BinaryOperator *XorInst = XorEntry.first;
-    // Do not evaluate the BaseXor, otherwise we end up cloning it.
-    return XorInst == BaseXor || DT.dominates(BaseXor, XorInst);
-  });
-}
-
-bool XorToOrDisjointTransformer::processXorGroup(Instruction *OriginalBaseInst,
-                                                 XorOpList &XorsInGroup) {
-  bool Changed = false;
-  if (XorsInGroup.size() <= 1)
-    return false;
-
-  // Sort XorsInGroup by the constant offset value in increasing order.
-  llvm::sort(XorsInGroup, [](const auto &A, const auto &B) {
-    return A.second.slt(B.second);
-  });
-
-  // Dominance check
-  // The "base" XOR for dominance purposes is the one with the smallest
-  // constant.
-  BinaryOperator *XorWithSmallConst = XorsInGroup[0].first;
-
-  if (!dominatesAllXors(XorWithSmallConst, XorsInGroup)) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE
-                      << ": Cloning and inserting XOR with smallest constant ("
-                      << *XorWithSmallConst
-                      << ") as it does not dominate all other XORs"
-                      << " in function " << F.getName() << "\n");
-
-    BinaryOperator *ClonedXor =
-        cast<BinaryOperator>(XorWithSmallConst->clone());
-    ClonedXor->setName(XorWithSmallConst->getName() + ".dom_clone");
-    ClonedXor->insertAfter(OriginalBaseInst);
-    LLVM_DEBUG(dbgs() << "  Cloned Inst: " << *ClonedXor << "\n");
-    Changed = true;
-    XorWithSmallConst = ClonedXor;
-  }
-
-  SmallVector<Instruction *, 8> InstructionsToErase;
-  const APInt SmallestConst =
-      cast<ConstantInt>(XorWithSmallConst->getOperand(1))->getValue();
-
-  // Main transformation loop: Iterate over the original XORs in the sorted
-  // group.
-  for (const auto &XorEntry : XorsInGroup) {
-    BinaryOperator *XorInst = XorEntry.first; // Original XOR instruction
-    const APInt ConstOffsetVal = XorEntry.second;
-
-    // Do not process the one with smallest constant as it is the base.
-    if (XorInst == XorWithSmallConst)
-      continue;
-
-    // Disjointness Check 1
-    APInt NewConstVal = ConstOffsetVal - SmallestConst;
-    if ((NewConstVal & SmallestConst) != 0) {
-      LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Cannot transform XOR in function "
-                        << F.getName() << ":\n"
-                        << "  New Const: " << NewConstVal
-                        << "  Smallest Const: " << SmallestConst
-                        << "  are not disjoint \n");
-      continue;
-    }
-
-    // Disjointness Check 2
-    if (MaskedValueIsZero(XorWithSmallConst, NewConstVal, SimplifyQuery(DL),
-                          0)) {
-      LLVM_DEBUG(dbgs() << DEBUG_TYPE
-                        << ": Transforming XOR to OR (disjoint) in function "
-                        << F.getName() << ":\n"
-                        << "  Xor: " << *XorInst << "\n"
-                        << "  Base Val: " << *XorWithSmallConst << "\n"
-                        << "  New Const: " << NewConstVal << "\n");
-
-      auto *NewOrInst = BinaryOperator::CreateDisjointOr(
-          XorWithSmallConst,
-          ConstantInt::get(OriginalBaseInst->getType(), NewConstVal),
-          XorInst->getName() + ".or_disjoint", XorInst->getIterator());
-
-      NewOrInst->copyMetadata(*XorInst);
-      XorInst->replaceAllUsesWith(NewOrInst);
-      LLVM_DEBUG(dbgs() << "  New Inst: " << *NewOrInst << "\n");
-      InstructionsToErase.push_back(XorInst); // Mark original XOR for deletion
-
-      Changed = true;
-    } else {
-      LLVM_DEBUG(
-          dbgs() << DEBUG_TYPE
-                 << ": Cannot transform XOR (not proven disjoint) in function "
-                 << F.getName() << ":\n"
-                 << "  Xor: " << *XorInst << "\n"
-                 << "  Base Val: " << *XorWithSmallConst << "\n"
-                 << "  New Const: " << NewConstVal << "\n");
-    }
-  }
-
-  for (Instruction *I : InstructionsToErase)
-    I->eraseFromParent();
-
-  return Changed;
-}
-
-// Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes
-// the base for memory operations. This transformation is true under the
-// following conditions
-// Check 1 -  B and C are disjoint.
-// Check 2 - XOR(A,C) and B are disjoint.
-//
-// This transformation is beneficial particularly for GEPs because:
-// 1. OR operations often map better to addressing modes than XOR
-// 2. Disjoint OR operations preserve the semantics of the original XOR
-// 3. This can enable further optimizations in the GEP offset folding pipeline
-bool XorToOrDisjointTransformer::run() {
-  bool Changed = false;
-
-  // Collect all candidate XORs
-  for (Instruction &I : instructions(F)) {
-    Instruction *Op0 = nullptr;
-    ConstantInt *C1 = nullptr;
-    BinaryOperator *MatchedXorOp = nullptr;
-
-    // Attempt to match the instruction 'I' as XOR operation.
-    if (match(&I, m_CombineAnd(m_Xor(m_Instruction(Op0), m_ConstantInt(C1)),
-                               m_BinOp(MatchedXorOp))) &&
-        hasGEPUser(MatchedXorOp))
-      XorGroups[Op0].emplace_back(MatchedXorOp, C1->getValue());
-  }
-
-  if (XorGroups.empty())
-    return false;
-
-  // Process each group of XORs
-  for (auto &[OriginalBaseInst, XorsInGroup] : XorGroups)
-    if (processXorGroup(OriginalBaseInst, XorsInGroup))
-      Changed = true;
-
-  return Changed;
-}
-
 bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -1430,11 +1242,6 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
 
   DL = &F.getDataLayout();
   bool Changed = false;
-
-  // Decompose xor in to "or disjoint" if possible.
-  XorToOrDisjointTransformer XorTransformer(F, *DT, *DL);
-  Changed |= XorTransformer.run();
-
   for (BasicBlock &B : F) {
     if (!DT->isReachableFromEntry(&B))
       continue;
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
deleted file mode 100644
index 825227292fe14..0000000000000
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
+++ /dev/null
@@ -1,204 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
-; RUN: -S < %s | FileCheck %s
-
-
-; Test a simple case of xor to or disjoint transformation
-define half @test_basic_transformation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_basic_transformation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 4096
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 32
-  %addr2 = xor i64 %base, 2080
-  %addr3 = xor i64 %base, 4128
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test the decreasing order of offset xor to or disjoint transformation
-define half @test_descending_offset_transformation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_descending_offset_transformation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR3_DOM_CLONE:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR1_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 4096
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 2048
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 0
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 4128
-  %addr2 = xor i64 %base, 2080
-  %addr3 = xor i64 %base, 32
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test that %addr2 is not transformed to or disjoint.
-define half @test_no_transfomation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_no_transfomation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR2:%.*]] = xor i64 [[BASE]], 64
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 32
-  %addr2 = xor i64 %base, 64  ; Should not be transformed
-  %addr3 = xor i64 %base, 2080
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test case with xor instructions in different basic blocks
-define half @test_dom_tree(ptr %ptr, i64 %input, i1 %cond) {
-; CHECK-LABEL: define half @test_dom_tree(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]], i1 [[COND:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 16
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 32
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    br label %[[MERGE:.*]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 96
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    br label %[[MERGE]]
-; CHECK:       [[MERGE]]:
-; CHECK-NEXT:    [[VAL_FROM_BRANCH:%.*]] = phi half [ [[VAL2]], %[[THEN]] ], [ [[VAL3]], %[[ELSE]] ]
-; CHECK-NEXT:    [[ADDR4_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 224
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR4_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL4:%.*]] = load half, ptr [[GEP4]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL_FROM_BRANCH_F:%.*]] = fpext half [[VAL_FROM_BRANCH]] to float
-; CHECK-NEXT:    [[VAL4_F:%.*]] = fpext half [[VAL4]] to float
-; CHECK-NEXT:    [[SUM_INTERMEDIATE_F:%.*]] = fadd float [[VAL1_F]], [[VAL_FROM_BRANCH_F]]
-; CHECK-NEXT:    [[FINAL_SUM_F:%.*]] = fadd float [[SUM_INTERMEDIATE_F]], [[VAL4_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[FINAL_SUM_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192   ; Clear low bits
-  %addr1 = xor i64 %base,16
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %val1 = load half, ptr %gep1
-  br i1 %cond, label %then, label %else
-
-then:
-  %addr2 = xor i64 %base, 48
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %val2 = load half, ptr %gep2
-  br label %merge
-
-else:
-  %addr3 = xor i64 %base, 112
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val3 = load half, ptr %gep3
-  br label %merge
-
-merge:
-  %val_from_branch = phi half [ %val2, %then ], [ %val3, %else ]
-  %addr4 = xor i64 %base, 240
-  %gep4 = getelementptr i8, ptr %ptr, i64 %addr4
-  %val4 = load half, ptr %gep4
-  %val1.f = fpext half %val1 to float
-  %val_from_branch.f = fpext half %val_from_branch to float
-  %val4.f = fpext half %val4 to float
-  %sum_intermediate.f = fadd float %val1.f, %val_from_branch.f
-  %final_sum.f = fadd float %sum_intermediate.f, %val4.f
-  %result.h = fptrunc float %final_sum.f to half
-  ret half %result.h
-}
-

From b9329fe88e47741d9c20ab92f892ac52457e6195 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Tue, 10 Jun 2025 16:50:29 -0700
Subject: [PATCH 018/851] [CIR] Upstream support for calling constructors
 (#143579)

This change adds support for calling C++ constructors. The support for
actually defining a constructor is still missing and will be added in a
later change.
---
 clang/include/clang/CIR/MissingFeatures.h     |  3 +
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          | 99 +++++++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenClass.cpp         | 74 ++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          | 51 ++++++++++
 clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp |  6 ++
 clang/lib/CIR/CodeGen/CIRGenFunction.h        | 13 +++
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 54 +++++++++-
 clang/lib/CIR/CodeGen/CIRGenModule.h          | 19 ++++
 clang/lib/CIR/CodeGen/CIRGenTypes.h           |  6 ++
 clang/test/CIR/CodeGen/ctor.cpp               | 19 ++++
 10 files changed, 336 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/ctor.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 72d882beb2244..f89d386378e51 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -222,6 +222,9 @@ struct MissingFeatures {
   static bool instrumentation() { return false; }
   static bool cleanupAfterErrorDiags() { return false; }
   static bool cxxRecordStaticMembers() { return false; }
+  static bool isMemcpyEquivalentSpecialMember() { return false; }
+  static bool isTrivialCtorOrDtor() { return false; }
+  static bool implicitConstructorArgs() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index b194a8670bfb9..9d25eea9e413d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -60,6 +60,13 @@ CIRGenCallee CIRGenCallee::prepareConcreteCallee(CIRGenFunction &cgf) const {
   return *this;
 }
 
+/// Returns the canonical formal type of the given C++ method.
+static CanQual<FunctionProtoType> getFormalType(const CXXMethodDecl *md) {
+  return md->getType()
+      ->getCanonicalTypeUnqualified()
+      .getAs<FunctionProtoType>();
+}
+
 /// Adds the formal parameters in FPT to the given prefix. If any parameter in
 /// FPT has pass_object_size_attrs, then we'll add parameters for those, too.
 /// TODO(cir): this should be shared with LLVM codegen
@@ -76,6 +83,48 @@ static void appendParameterTypes(const CIRGenTypes &cgt,
   cgt.getCGModule().errorNYI("appendParameterTypes: hasExtParameterInfos");
 }
 
+const CIRGenFunctionInfo &
+CIRGenTypes::arrangeCXXStructorDeclaration(GlobalDecl gd) {
+  auto *md = cast<CXXMethodDecl>(gd.getDecl());
+
+  llvm::SmallVector<CanQualType, 16> argTypes;
+  argTypes.push_back(deriveThisType(md->getParent(), md));
+
+  bool passParams = true;
+
+  if (auto *cd = dyn_cast<CXXConstructorDecl>(md)) {
+    // A base class inheriting constructor doesn't get forwarded arguments
+    // needed to construct a virtual base (or base class thereof)
+    if (cd->getInheritedConstructor())
+      cgm.errorNYI(cd->getSourceRange(),
+                   "arrangeCXXStructorDeclaration: inheriting constructor");
+  }
+
+  CanQual<FunctionProtoType> fpt = getFormalType(md);
+
+  if (passParams)
+    appendParameterTypes(*this, argTypes, fpt);
+
+  assert(!cir::MissingFeatures::implicitConstructorArgs());
+
+  RequiredArgs required =
+      (passParams && md->isVariadic() ? RequiredArgs(argTypes.size())
+                                      : RequiredArgs::All);
+
+  CanQualType resultType = theCXXABI.hasThisReturn(gd) ? argTypes.front()
+                           : theCXXABI.hasMostDerivedReturn(gd)
+                               ? astContext.VoidPtrTy
+                               : astContext.VoidTy;
+
+  assert(!theCXXABI.hasThisReturn(gd) &&
+         "Please send PR with a test and remove this");
+
+  assert(!cir::MissingFeatures::opCallCIRGenFuncInfoExtParamInfo());
+  assert(!cir::MissingFeatures::opCallFnInfoOpts());
+
+  return arrangeCIRFunctionInfo(resultType, argTypes, required);
+}
+
 /// Derives the 'this' type for CIRGen purposes, i.e. ignoring method CVR
 /// qualification. Either or both of `rd` and `md` may be null. A null `rd`
 /// indicates that there is no meaningful 'this' type, and a null `md` can occur
@@ -103,13 +152,13 @@ CanQualType CIRGenTypes::deriveThisType(const CXXRecordDecl *rd,
 /// top of any implicit parameters already stored.
 static const CIRGenFunctionInfo &
 arrangeCIRFunctionInfo(CIRGenTypes &cgt, SmallVectorImpl<CanQualType> &prefix,
-                       CanQual<FunctionProtoType> ftp) {
+                       CanQual<FunctionProtoType> fpt) {
   assert(!cir::MissingFeatures::opCallFnInfoOpts());
   RequiredArgs required =
-      RequiredArgs::getFromProtoWithExtraSlots(ftp, prefix.size());
+      RequiredArgs::getFromProtoWithExtraSlots(fpt, prefix.size());
   assert(!cir::MissingFeatures::opCallExtParameterInfo());
-  appendParameterTypes(cgt, prefix, ftp);
-  CanQualType resultType = ftp->getReturnType().getUnqualifiedType();
+  appendParameterTypes(cgt, prefix, fpt);
+  CanQualType resultType = fpt->getReturnType().getUnqualifiedType();
   return cgt.arrangeCIRFunctionInfo(resultType, prefix, required);
 }
 
@@ -141,6 +190,44 @@ arrangeFreeFunctionLikeCall(CIRGenTypes &cgt, CIRGenModule &cgm,
   return cgt.arrangeCIRFunctionInfo(retType, argTypes, required);
 }
 
+/// Arrange a call to a C++ method, passing the given arguments.
+///
+/// passProtoArgs indicates whether `args` has args for the parameters in the
+/// given CXXConstructorDecl.
+const CIRGenFunctionInfo &CIRGenTypes::arrangeCXXConstructorCall(
+    const CallArgList &args, const CXXConstructorDecl *d, CXXCtorType ctorKind,
+    bool passProtoArgs) {
+
+  // FIXME: Kill copy.
+  llvm::SmallVector<CanQualType, 16> argTypes;
+  for (const auto &arg : args)
+    argTypes.push_back(astContext.getCanonicalParamType(arg.ty));
+
+  assert(!cir::MissingFeatures::implicitConstructorArgs());
+  // +1 for implicit this, which should always be args[0]
+  unsigned totalPrefixArgs = 1;
+
+  CanQual<FunctionProtoType> fpt = getFormalType(d);
+  RequiredArgs required =
+      passProtoArgs
+          ? RequiredArgs::getFromProtoWithExtraSlots(fpt, totalPrefixArgs)
+          : RequiredArgs::All;
+
+  GlobalDecl gd(d, ctorKind);
+  if (theCXXABI.hasThisReturn(gd))
+    cgm.errorNYI(d->getSourceRange(),
+                 "arrangeCXXConstructorCall: hasThisReturn");
+  if (theCXXABI.hasMostDerivedReturn(gd))
+    cgm.errorNYI(d->getSourceRange(),
+                 "arrangeCXXConstructorCall: hasMostDerivedReturn");
+  CanQualType resultType = astContext.VoidTy;
+
+  assert(!cir::MissingFeatures::opCallFnInfoOpts());
+  assert(!cir::MissingFeatures::opCallCIRGenFuncInfoExtParamInfo());
+
+  return arrangeCIRFunctionInfo(resultType, argTypes, required);
+}
+
 /// Arrange a call to a C++ method, passing the given arguments.
 ///
 /// numPrefixArgs is the number of the ABI-specific prefix arguments we have. It
@@ -198,7 +285,7 @@ CIRGenTypes::arrangeCXXMethodDeclaration(const CXXMethodDecl *md) {
 /// constructor or destructor.
 const CIRGenFunctionInfo &
 CIRGenTypes::arrangeCXXMethodType(const CXXRecordDecl *rd,
-                                  const FunctionProtoType *ftp,
+                                  const FunctionProtoType *fpt,
                                   const CXXMethodDecl *md) {
   llvm::SmallVector<CanQualType, 16> argTypes;
 
@@ -208,7 +295,7 @@ CIRGenTypes::arrangeCXXMethodType(const CXXRecordDecl *rd,
   assert(!cir::MissingFeatures::opCallFnInfoOpts());
   return ::arrangeCIRFunctionInfo(
       *this, argTypes,
-      ftp->getCanonicalTypeUnqualified().getAs<FunctionProtoType>());
+      fpt->getCanonicalTypeUnqualified().getAs<FunctionProtoType>());
 }
 
 /// Arrange the argument and result information for the declaration or
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index 4cdaa480121dd..8491a66ea6cb4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -10,9 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CIRGenCXXABI.h"
 #include "CIRGenFunction.h"
 
+#include "clang/AST/ExprCXX.h"
 #include "clang/AST/RecordLayout.h"
+#include "clang/AST/Type.h"
 #include "clang/CIR/MissingFeatures.h"
 
 using namespace clang;
@@ -63,3 +66,74 @@ Address CIRGenFunction::getAddressOfBaseClass(
 
   return value;
 }
+
+void CIRGenFunction::emitCXXConstructorCall(const clang::CXXConstructorDecl *d,
+                                            clang::CXXCtorType type,
+                                            bool forVirtualBase,
+                                            bool delegating,
+                                            AggValueSlot thisAVS,
+                                            const clang::CXXConstructExpr *e) {
+  CallArgList args;
+  Address thisAddr = thisAVS.getAddress();
+  QualType thisType = d->getThisType();
+  mlir::Value thisPtr = thisAddr.getPointer();
+
+  assert(!cir::MissingFeatures::addressSpace());
+
+  args.add(RValue::get(thisPtr), thisType);
+
+  // In LLVM Codegen: If this is a trivial constructor, just emit what's needed.
+  // If this is a union copy constructor, we must emit a memcpy, because the AST
+  // does not model that copy.
+  assert(!cir::MissingFeatures::isMemcpyEquivalentSpecialMember());
+
+  const FunctionProtoType *fpt = d->getType()->castAs<FunctionProtoType>();
+
+  assert(!cir::MissingFeatures::opCallArgEvaluationOrder());
+
+  emitCallArgs(args, fpt, e->arguments(), e->getConstructor(),
+               /*ParamsToSkip=*/0);
+
+  assert(!cir::MissingFeatures::sanitizers());
+  emitCXXConstructorCall(d, type, forVirtualBase, delegating, thisAddr, args,
+                         e->getExprLoc());
+}
+
+void CIRGenFunction::emitCXXConstructorCall(
+    const CXXConstructorDecl *d, CXXCtorType type, bool forVirtualBase,
+    bool delegating, Address thisAddr, CallArgList &args, SourceLocation loc) {
+
+  const CXXRecordDecl *crd = d->getParent();
+
+  // If this is a call to a trivial default constructor:
+  // In LLVM: do nothing.
+  // In CIR: emit as a regular call, other later passes should lower the
+  // ctor call into trivial initialization.
+  assert(!cir::MissingFeatures::isTrivialCtorOrDtor());
+
+  assert(!cir::MissingFeatures::isMemcpyEquivalentSpecialMember());
+
+  bool passPrototypeArgs = true;
+
+  // Check whether we can actually emit the constructor before trying to do so.
+  if (d->getInheritedConstructor()) {
+    cgm.errorNYI(d->getSourceRange(),
+                 "emitCXXConstructorCall: inherited constructor");
+    return;
+  }
+
+  // Insert any ABI-specific implicit constructor arguments.
+  assert(!cir::MissingFeatures::implicitConstructorArgs());
+
+  // Emit the call.
+  auto calleePtr = cgm.getAddrOfCXXStructor(GlobalDecl(d, type));
+  const CIRGenFunctionInfo &info = cgm.getTypes().arrangeCXXConstructorCall(
+      args, d, type, passPrototypeArgs);
+  CIRGenCallee callee = CIRGenCallee::forDirect(calleePtr, GlobalDecl(d, type));
+  cir::CIRCallOpInterface c;
+  emitCall(info, callee, ReturnValueSlot(), args, &c, getLoc(loc));
+
+  if (cgm.getCodeGenOpts().OptimizationLevel != 0 && !crd->isDynamicClass() &&
+      type != Ctor_Base && cgm.getCodeGenOpts().StrictVTablePointers)
+    cgm.errorNYI(d->getSourceRange(), "vtable assumption loads");
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 8129fe0ad7db7..f2c2de7a4f59d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1393,6 +1393,57 @@ RValue CIRGenFunction::emitCXXMemberCallExpr(const CXXMemberCallExpr *ce,
       ce, md, returnValue, hasQualifier, qualifier, isArrow, base);
 }
 
+void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e,
+                                          AggValueSlot dest) {
+  assert(!dest.isIgnored() && "Must have a destination!");
+  const CXXConstructorDecl *cd = e->getConstructor();
+
+  // If we require zero initialization before (or instead of) calling the
+  // constructor, as can be the case with a non-user-provided default
+  // constructor, emit the zero initialization now, unless destination is
+  // already zeroed.
+  if (e->requiresZeroInitialization() && !dest.isZeroed()) {
+    cgm.errorNYI(e->getSourceRange(),
+                 "emitCXXConstructExpr: requires initialization");
+    return;
+  }
+
+  // If this is a call to a trivial default constructor:
+  // In LLVM: do nothing.
+  // In CIR: emit as a regular call, other later passes should lower the
+  // ctor call into trivial initialization.
+
+  // Elide the constructor if we're constructing from a temporary
+  if (getLangOpts().ElideConstructors && e->isElidable()) {
+    cgm.errorNYI(e->getSourceRange(),
+                 "emitCXXConstructExpr: elidable constructor");
+    return;
+  }
+
+  if (getContext().getAsArrayType(e->getType())) {
+    cgm.errorNYI(e->getSourceRange(), "emitCXXConstructExpr: array type");
+    return;
+  }
+
+  clang::CXXCtorType type = Ctor_Complete;
+  bool forVirtualBase = false;
+  bool delegating = false;
+
+  switch (e->getConstructionKind()) {
+  case CXXConstructionKind::Complete:
+    type = Ctor_Complete;
+    break;
+  case CXXConstructionKind::Delegating:
+  case CXXConstructionKind::VirtualBase:
+  case CXXConstructionKind::NonVirtualBase:
+    cgm.errorNYI(e->getSourceRange(),
+                 "emitCXXConstructExpr: other construction kind");
+    return;
+  }
+
+  emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e);
+}
+
 RValue CIRGenFunction::emitReferenceBindingToExpr(const Expr *e) {
   // Emit the expression as an lvalue.
   LValue lv = emitLValue(e);
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index 56d7ea3884ba7..f1df1b79fc48e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -51,6 +51,7 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
   void Visit(Expr *e) { StmtVisitor<AggExprEmitter>::Visit(e); }
 
   void VisitInitListExpr(InitListExpr *e);
+  void VisitCXXConstructExpr(const CXXConstructExpr *e);
 
   void visitCXXParenListOrInitListExpr(Expr *e, ArrayRef<Expr *> args,
                                        FieldDecl *initializedFieldInUnion,
@@ -213,6 +214,11 @@ void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) {
   }
 }
 
+void AggExprEmitter::VisitCXXConstructExpr(const CXXConstructExpr *e) {
+  AggValueSlot slot = ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType());
+  cgf.emitCXXConstructExpr(e, slot);
+}
+
 void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc,
                                                     LValue lv) {
   const QualType type = lv.getType();
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index d6002c3e4d4d9..7db7f6928fd8f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -744,6 +744,19 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   mlir::LogicalResult emitContinueStmt(const clang::ContinueStmt &s);
 
+  void emitCXXConstructExpr(const clang::CXXConstructExpr *e,
+                            AggValueSlot dest);
+
+  void emitCXXConstructorCall(const clang::CXXConstructorDecl *d,
+                              clang::CXXCtorType type, bool forVirtualBase,
+                              bool delegating, AggValueSlot thisAVS,
+                              const clang::CXXConstructExpr *e);
+
+  void emitCXXConstructorCall(const clang::CXXConstructorDecl *d,
+                              clang::CXXCtorType type, bool forVirtualBase,
+                              bool delegating, Address thisAddr,
+                              CallArgList &args, clang::SourceLocation loc);
+
   mlir::LogicalResult emitCXXForRangeStmt(const CXXForRangeStmt &s,
                                           llvm::ArrayRef<const Attr *> attrs);
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 3d46c44b4f1ec..8407f8fad06ba 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -103,6 +103,25 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
 
 CIRGenModule::~CIRGenModule() = default;
 
+/// FIXME: this could likely be a common helper and not necessarily related
+/// with codegen.
+/// Return the best known alignment for an unknown pointer to a
+/// particular class.
+CharUnits CIRGenModule::getClassPointerAlignment(const CXXRecordDecl *rd) {
+  if (!rd->hasDefinition())
+    return CharUnits::One(); // Hopefully won't be used anywhere.
+
+  auto &layout = astContext.getASTRecordLayout(rd);
+
+  // If the class is final, then we know that the pointer points to an
+  // object of that type and can use the full alignment.
+  if (rd->isEffectivelyFinal())
+    return layout.getAlignment();
+
+  // Otherwise, we have to assume it could be a subclass.
+  return layout.getNonVirtualAlignment();
+}
+
 CharUnits CIRGenModule::getNaturalTypeAlignment(QualType t,
                                                 LValueBaseInfo *baseInfo) {
   assert(!cir::MissingFeatures::opTBAA());
@@ -1174,6 +1193,34 @@ void CIRGenModule::setInitializer(cir::GlobalOp &op, mlir::Attribute value) {
   assert(!cir::MissingFeatures::opGlobalVisibility());
 }
 
+std::pair<cir::FuncType, cir::FuncOp> CIRGenModule::getAddrAndTypeOfCXXStructor(
+    GlobalDecl gd, const CIRGenFunctionInfo *fnInfo, cir::FuncType fnType,
+    bool dontDefer, ForDefinition_t isForDefinition) {
+  auto *md = cast<CXXMethodDecl>(gd.getDecl());
+
+  if (isa<CXXDestructorDecl>(md)) {
+    // Always alias equivalent complete destructors to base destructors in the
+    // MS ABI.
+    if (getTarget().getCXXABI().isMicrosoft() &&
+        gd.getDtorType() == Dtor_Complete &&
+        md->getParent()->getNumVBases() == 0)
+      errorNYI(md->getSourceRange(),
+               "getAddrAndTypeOfCXXStructor: MS ABI complete destructor");
+  }
+
+  if (!fnType) {
+    if (!fnInfo)
+      fnInfo = &getTypes().arrangeCXXStructorDeclaration(gd);
+    fnType = getTypes().getFunctionType(*fnInfo);
+  }
+
+  auto fn = getOrCreateCIRFunction(getMangledName(gd), fnType, gd,
+                                   /*ForVtable=*/false, dontDefer,
+                                   /*IsThunk=*/false, isForDefinition);
+
+  return {fnType, fn};
+}
+
 cir::FuncOp CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd,
                                             mlir::Type funcType, bool forVTable,
                                             bool dontDefer,
@@ -1248,8 +1295,11 @@ StringRef CIRGenModule::getMangledName(GlobalDecl gd) {
   // Some ABIs don't have constructor variants. Make sure that base and complete
   // constructors get mangled the same.
   if (const auto *cd = dyn_cast<CXXConstructorDecl>(canonicalGd.getDecl())) {
-    errorNYI(cd->getSourceRange(), "getMangledName: C++ constructor");
-    return cast<NamedDecl>(gd.getDecl())->getIdentifier()->getName();
+    if (!getTarget().getCXXABI().hasConstructorVariants()) {
+      errorNYI(cd->getSourceRange(),
+               "getMangledName: C++ constructor without variants");
+      return cast<NamedDecl>(gd.getDecl())->getIdentifier()->getName();
+    }
   }
 
   // Keep the first result in the case of a mangling collision.
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 24ec9ca6403bc..9748c0b3ed43a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -166,11 +166,30 @@ class CIRGenModule : public CIRGenTypeCache {
   mlir::Location getLoc(clang::SourceLocation cLoc);
   mlir::Location getLoc(clang::SourceRange cRange);
 
+  /// Return the best known alignment for an unknown pointer to a
+  /// particular class.
+  clang::CharUnits getClassPointerAlignment(const clang::CXXRecordDecl *rd);
+
   /// FIXME: this could likely be a common helper and not necessarily related
   /// with codegen.
   clang::CharUnits getNaturalTypeAlignment(clang::QualType t,
                                            LValueBaseInfo *baseInfo);
 
+  cir::FuncOp
+  getAddrOfCXXStructor(clang::GlobalDecl gd,
+                       const CIRGenFunctionInfo *fnInfo = nullptr,
+                       cir::FuncType fnType = nullptr, bool dontDefer = false,
+                       ForDefinition_t isForDefinition = NotForDefinition) {
+    return getAddrAndTypeOfCXXStructor(gd, fnInfo, fnType, dontDefer,
+                                       isForDefinition)
+        .second;
+  }
+
+  std::pair<cir::FuncType, cir::FuncOp> getAddrAndTypeOfCXXStructor(
+      clang::GlobalDecl gd, const CIRGenFunctionInfo *fnInfo = nullptr,
+      cir::FuncType fnType = nullptr, bool dontDefer = false,
+      ForDefinition_t isForDefinition = NotForDefinition);
+
   /// This contains all the decls which have definitions but which are deferred
   /// for emission and therefore should only be output if they are actually
   /// used. If a decl is in this, then it is known to have not been referenced
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.h b/clang/lib/CIR/CodeGen/CIRGenTypes.h
index 48d474beeddec..c2813d79bf63b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.h
@@ -19,6 +19,7 @@
 
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/Type.h"
+#include "clang/Basic/ABI.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 
 #include "llvm/ADT/SmallPtrSet.h"
@@ -165,6 +166,10 @@ class CIRGenTypes {
   bool isZeroInitializable(clang::QualType ty);
   bool isZeroInitializable(const RecordDecl *rd);
 
+  const CIRGenFunctionInfo &arrangeCXXConstructorCall(
+      const CallArgList &args, const clang::CXXConstructorDecl *d,
+      clang::CXXCtorType ctorKind, bool passProtoArgs = true);
+
   const CIRGenFunctionInfo &
   arrangeCXXMethodCall(const CallArgList &args,
                        const clang::FunctionProtoType *type,
@@ -173,6 +178,7 @@ class CIRGenTypes {
   /// C++ methods have some special rules and also have implicit parameters.
   const CIRGenFunctionInfo &
   arrangeCXXMethodDeclaration(const clang::CXXMethodDecl *md);
+  const CIRGenFunctionInfo &arrangeCXXStructorDeclaration(clang::GlobalDecl gd);
 
   const CIRGenFunctionInfo &
   arrangeCXXMethodType(const clang::CXXRecordDecl *rd,
diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp
new file mode 100644
index 0000000000000..3a1e82e338c1c
--- /dev/null
+++ b/clang/test/CIR/CodeGen/ctor.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+struct Struk {
+  int a;
+  Struk();
+};
+
+void baz() {
+  Struk s;
+}
+
+// CHECK: !rec_Struk = !cir.record<struct "Struk" {!s32i}>
+
+// CHECK:   cir.func @_ZN5StrukC1Ev(!cir.ptr<!rec_Struk>)
+// CHECK:   cir.func @_Z3bazv()
+// CHECK-NEXT:     %[[S_ADDR:.*]] = cir.alloca !rec_Struk, !cir.ptr<!rec_Struk>, ["s", init] {alignment = 4 : i64}
+// CHECK-NEXT:     cir.call @_ZN5StrukC1Ev(%[[S_ADDR]]) : (!cir.ptr<!rec_Struk>) -> ()
+// CHECK-NEXT:     cir.return

From 6f62979a5a5bcf70d65f23e0991a274e6df5955b Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv@gmail.com>
Date: Tue, 10 Jun 2025 16:57:16 -0700
Subject: [PATCH 019/851] Revert "[CI] Migrate to runtimes build" (#143612)

Reverts llvm/llvm-project#142696

See https://github.com/llvm/llvm-project/issues/143610 for details; I
believe this PR causes CI builders to build LLVM in a way that's been
broken for a while. To keep CI green, if this is the correct culprit,
those tests should be fixed or skipped
---
 .ci/compute_projects.py         | 115 ++++++++++++--------------------
 .ci/compute_projects_test.py    |  55 ++-------------
 .ci/monolithic-linux.sh         |  13 +---
 .github/workflows/premerge.yaml |   3 +-
 4 files changed, 49 insertions(+), 137 deletions(-)

diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index e61b8dc5021f3..40dd0507a9eaf 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -49,7 +49,8 @@
     },
     "lld": {"bolt", "cross-project-tests"},
     # TODO(issues/132795): LLDB should be enabled on clang changes.
-    "clang": {"clang-tools-extra", "cross-project-tests"},
+    "clang": {"clang-tools-extra", "compiler-rt", "cross-project-tests"},
+    "clang-tools-extra": {"libc"},
     "mlir": {"flang"},
     # Test everything if ci scripts are changed.
     # FIXME: Figure out what is missing and add here.
@@ -63,15 +64,7 @@
 
 # This mapping describes runtimes that should be tested when the key project is
 # touched.
-DEPENDENT_RUNTIMES_TO_TEST = {
-    "clang": {"compiler-rt"},
-    "clang-tools-extra": {"libc"},
-}
-DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG = {
-    "llvm": {"libcxx", "libcxxabi", "libunwind"},
-    "clang": {"libcxx", "libcxxabi", "libunwind"},
-    ".ci": {"libcxx", "libcxxabi", "libunwind"},
-}
+DEPENDENT_RUNTIMES_TO_TEST = {"clang": {"libcxx", "libcxxabi", "libunwind"}}
 
 EXCLUDE_LINUX = {
     "cross-project-tests",  # TODO(issues/132796): Tests are failing.
@@ -100,6 +93,9 @@
     "cross-project-tests",
     "flang",
     "libc",
+    "libcxx",
+    "libcxxabi",
+    "libunwind",
     "lldb",
     "openmp",
     "polly",
@@ -126,10 +122,10 @@
     "polly": "check-polly",
 }
 
-RUNTIMES = {"libcxx", "libcxxabi", "libunwind", "compiler-rt", "libc"}
+RUNTIMES = {"libcxx", "libcxxabi", "libunwind"}
 
 
-def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]:
+def _add_dependencies(projects: Set[str]) -> Set[str]:
     projects_with_dependents = set(projects)
     current_projects_count = 0
     while current_projects_count != len(projects_with_dependents):
@@ -138,25 +134,9 @@ def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]:
             if project not in PROJECT_DEPENDENCIES:
                 continue
             projects_with_dependents.update(PROJECT_DEPENDENCIES[project])
-    for runtime in runtimes:
-        if runtime not in PROJECT_DEPENDENCIES:
-            continue
-        projects_with_dependents.update(PROJECT_DEPENDENCIES[runtime])
     return projects_with_dependents
 
 
-def _exclude_projects(current_projects: Set[str], platform: str) -> Set[str]:
-    if platform == "Linux":
-        to_exclude = EXCLUDE_LINUX
-    elif platform == "Windows":
-        to_exclude = EXCLUDE_WINDOWS
-    elif platform == "Darwin":
-        to_exclude = EXCLUDE_MAC
-    else:
-        raise ValueError(f"Unexpected platform: {platform}")
-    return current_projects.difference(to_exclude)
-
-
 def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set[str]:
     projects_to_test = set()
     for modified_project in modified_projects:
@@ -174,14 +154,25 @@ def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set
             ):
                 continue
             projects_to_test.add(dependent_project)
-    projects_to_test = _exclude_projects(projects_to_test, platform)
+    if platform == "Linux":
+        for to_exclude in EXCLUDE_LINUX:
+            if to_exclude in projects_to_test:
+                projects_to_test.remove(to_exclude)
+    elif platform == "Windows":
+        for to_exclude in EXCLUDE_WINDOWS:
+            if to_exclude in projects_to_test:
+                projects_to_test.remove(to_exclude)
+    elif platform == "Darwin":
+        for to_exclude in EXCLUDE_MAC:
+            if to_exclude in projects_to_test:
+                projects_to_test.remove(to_exclude)
+    else:
+        raise ValueError("Unexpected platform.")
     return projects_to_test
 
 
-def _compute_projects_to_build(
-    projects_to_test: Set[str], runtimes: Set[str]
-) -> Set[str]:
-    return _add_dependencies(projects_to_test, runtimes)
+def _compute_projects_to_build(projects_to_test: Set[str]) -> Set[str]:
+    return _add_dependencies(projects_to_test)
 
 
 def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]:
@@ -193,36 +184,24 @@ def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]:
     return check_targets
 
 
-def _compute_runtimes_to_test(modified_projects: Set[str], platform: str) -> Set[str]:
+def _compute_runtimes_to_test(projects_to_test: Set[str]) -> Set[str]:
     runtimes_to_test = set()
-    for modified_project in modified_projects:
-        if modified_project not in DEPENDENT_RUNTIMES_TO_TEST:
-            continue
-        runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[modified_project])
-    return _exclude_projects(runtimes_to_test, platform)
+    for project_to_test in projects_to_test:
+        if project_to_test in DEPENDENT_RUNTIMES_TO_TEST:
+            runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[project_to_test])
+        if project_to_test in DEPENDENT_RUNTIMES_TO_BUILD:
+            runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_BUILD[project_to_test])
+    return runtimes_to_test
 
 
-def _compute_runtimes_to_test_needs_reconfig(
-    modified_projects: Set[str], platform: str
-) -> Set[str]:
-    runtimes_to_test = set()
-    for modified_project in modified_projects:
-        if modified_project not in DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG:
+def _compute_runtime_check_targets(projects_to_test: Set[str]) -> Set[str]:
+    check_targets = set()
+    for project_to_test in projects_to_test:
+        if project_to_test not in DEPENDENT_RUNTIMES_TO_TEST:
             continue
-        runtimes_to_test.update(
-            DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG[modified_project]
-        )
-    return _exclude_projects(runtimes_to_test, platform)
-
-
-def _compute_runtimes_to_build(
-    runtimes_to_test: Set[str], modified_projects: Set[str], platform: str
-) -> Set[str]:
-    runtimes_to_build = set(runtimes_to_test)
-    for modified_project in modified_projects:
-        if modified_project in DEPENDENT_RUNTIMES_TO_BUILD:
-            runtimes_to_build.update(DEPENDENT_RUNTIMES_TO_BUILD[modified_project])
-    return _exclude_projects(runtimes_to_build, platform)
+        for runtime_to_test in DEPENDENT_RUNTIMES_TO_TEST[project_to_test]:
+            check_targets.add(PROJECT_CHECK_TARGETS[runtime_to_test])
+    return check_targets
 
 
 def _get_modified_projects(modified_files: list[str]) -> Set[str]:
@@ -246,19 +225,10 @@ def _get_modified_projects(modified_files: list[str]) -> Set[str]:
 def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
     modified_projects = _get_modified_projects(modified_files)
     projects_to_test = _compute_projects_to_test(modified_projects, platform)
-    runtimes_to_test = _compute_runtimes_to_test(modified_projects, platform)
-    runtimes_to_test_needs_reconfig = _compute_runtimes_to_test_needs_reconfig(
-        modified_projects, platform
-    )
-    runtimes_to_build = _compute_runtimes_to_build(
-        runtimes_to_test | runtimes_to_test_needs_reconfig, modified_projects, platform
-    )
-    projects_to_build = _compute_projects_to_build(projects_to_test, runtimes_to_build)
+    projects_to_build = _compute_projects_to_build(projects_to_test)
     projects_check_targets = _compute_project_check_targets(projects_to_test)
-    runtimes_check_targets = _compute_project_check_targets(runtimes_to_test)
-    runtimes_check_targets_needs_reconfig = _compute_project_check_targets(
-        runtimes_to_test_needs_reconfig
-    )
+    runtimes_to_build = _compute_runtimes_to_test(projects_to_test)
+    runtimes_check_targets = _compute_runtime_check_targets(projects_to_test)
     # We use a semicolon to separate the projects/runtimes as they get passed
     # to the CMake invocation and thus we need to use the CMake list separator
     # (;). We use spaces to separate the check targets as they end up getting
@@ -268,9 +238,6 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
         "project_check_targets": " ".join(sorted(projects_check_targets)),
         "runtimes_to_build": ";".join(sorted(runtimes_to_build)),
         "runtimes_check_targets": " ".join(sorted(runtimes_check_targets)),
-        "runtimes_check_targets_needs_reconfig": " ".join(
-            sorted(runtimes_check_targets_needs_reconfig)
-        ),
     }
 
 
diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index 6bc2e34a1cbe1..ae376ea6a43cd 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -26,10 +26,6 @@ def test_llvm(self):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -50,10 +46,6 @@ def test_llvm_windows(self):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -74,10 +66,6 @@ def test_llvm_mac(self):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -87,21 +75,17 @@ def test_clang(self):
         )
         self.assertEqual(
             env_variables["projects_to_build"],
-            "clang;clang-tools-extra;lld;llvm",
+            "clang;clang-tools-extra;compiler-rt;lld;llvm",
         )
         self.assertEqual(
             env_variables["project_check_targets"],
-            "check-clang check-clang-tools",
+            "check-clang check-clang-tools check-compiler-rt",
         )
         self.assertEqual(
-            env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind"
+            env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "check-compiler-rt",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -120,10 +104,6 @@ def test_clang_windows(self):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -135,7 +115,6 @@ def test_bolt(self):
         self.assertEqual(env_variables["project_check_targets"], "check-bolt")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_lldb(self):
         env_variables = compute_projects.get_env_variables(
@@ -145,7 +124,6 @@ def test_lldb(self):
         self.assertEqual(env_variables["project_check_targets"], "check-lldb")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_mlir(self):
         env_variables = compute_projects.get_env_variables(
@@ -157,7 +135,6 @@ def test_mlir(self):
         )
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_flang(self):
         env_variables = compute_projects.get_env_variables(
@@ -167,7 +144,6 @@ def test_flang(self):
         self.assertEqual(env_variables["project_check_targets"], "check-flang")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_invalid_subproject(self):
         env_variables = compute_projects.get_env_variables(
@@ -177,7 +153,6 @@ def test_invalid_subproject(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_top_level_file(self):
         env_variables = compute_projects.get_env_variables(["README.md"], "Linux")
@@ -185,7 +160,6 @@ def test_top_level_file(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_runtiems_in_projects(self):
         env_variables = compute_projects.get_env_variables(
@@ -195,7 +169,6 @@ def test_exclude_runtiems_in_projects(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_docs(self):
         env_variables = compute_projects.get_env_variables(
@@ -205,7 +178,6 @@ def test_exclude_docs(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_gn(self):
         env_variables = compute_projects.get_env_variables(
@@ -215,7 +187,6 @@ def test_exclude_gn(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_ci(self):
         env_variables = compute_projects.get_env_variables(
@@ -227,15 +198,10 @@ def test_ci(self):
             "check-clang check-lld check-lldb check-llvm",
         )
         self.assertEqual(
-            env_variables["runtimes_to_build"],
-            "libcxx;libcxxabi;libunwind",
+            env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -249,19 +215,6 @@ def test_lldb(self):
             env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
         )
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
-
-    def test_clang_tools_extra(self):
-        env_variables = compute_projects.get_env_variables(
-            ["clang-tools-extra/CMakeLists.txt"], "Linux"
-        )
-        self.assertEqual(
-            env_variables["projects_to_build"], "clang;clang-tools-extra;lld;llvm"
-        )
-        self.assertEqual(env_variables["project_check_targets"], "check-clang-tools")
-        self.assertEqual(env_variables["runtimes_to_build"], "libc")
-        self.assertEqual(env_variables["runtimes_check_targets"], "check-libc")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
 
 if __name__ == "__main__":
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index c350a58679140..7503ea4e6a992 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -57,7 +57,6 @@ projects="${1}"
 targets="${2}"
 runtimes="${3}"
 runtime_targets="${4}"
-runtime_targets_needs_reconfig="${5}"
 
 lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests"
 
@@ -94,15 +93,9 @@ echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
 ninja -C "${BUILD_DIR}" -k 0 ${targets}
 
-if [[ "${runtime_targets}" != "" ]]; then
-  echo "--- ninja runtimes"
-
-  ninja -C "${BUILD_DIR}" ${runtime_targets}
-fi
-
 # Compiling runtimes with just-built Clang and running their tests
 # as an additional testing for Clang.
-if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
+if [[ "${runtimes_targets}" != "" ]]; then
   echo "--- cmake runtimes C++26"
 
   cmake \
@@ -112,7 +105,7 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
 
   echo "--- ninja runtimes C++26"
 
-  ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig}
+  ninja -C "${BUILD_DIR}" ${runtime_targets}
 
   echo "--- cmake runtimes clang modules"
 
@@ -123,5 +116,5 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
 
   echo "--- ninja runtimes clang modules"
 
-  ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig}
+  ninja -C "${BUILD_DIR}" ${runtime_targets}
 fi
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 4435a3e905768..709b6d03d94c3 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -56,12 +56,11 @@ jobs:
           echo "Running project checks targets: ${project_check_targets}"
           echo "Building runtimes: ${runtimes_to_build}"
           echo "Running runtimes checks targets: ${runtimes_check_targets}"
-          echo "Running runtimes checks requiring reconfiguring targets: ${runtimes_check_targets_needs_reconfig}"
 
           export CC=/opt/llvm/bin/clang
           export CXX=/opt/llvm/bin/clang++
 
-          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}"
+          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}"
       - name: Upload Artifacts
         uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
         with:

From 3cef099ceddccefca8e11268624397cde9e04af6 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Wed, 11 Jun 2025 01:06:13 +0000
Subject: [PATCH 020/851] [TySan][CMake] Depend on tysan for check-tysan in
 runtimes build (#143597)

The runtimes build expects libclang_rt.tysan.a to be available, but the
check-tysan target does not actually depend on it when built using a
runtimes build with LLVM_ENABLE_RUNTIMES pointing at ./llvm. This means
we get test failures when running check-compiler-rt due to the missing
static archive.

This patch also makes check-tysan depend on tysan when we are using the
runtimes build.

This is causing premerge failures currently since we recently migrated
to the runtimes build.
---
 compiler-rt/test/tysan/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/compiler-rt/test/tysan/CMakeLists.txt b/compiler-rt/test/tysan/CMakeLists.txt
index 76f57501e854e..ce0afa8769f03 100644
--- a/compiler-rt/test/tysan/CMakeLists.txt
+++ b/compiler-rt/test/tysan/CMakeLists.txt
@@ -21,9 +21,7 @@ foreach(arch ${TYSAN_TEST_ARCH})
 endforeach()
 
 set(TYSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND TYSAN_TEST_DEPS tysan)
-endif()
+list(APPEND TYSAN_TEST_DEPS tysan)
 
 add_lit_testsuite(check-tysan "Running the TypeSanitizer tests"
   ${TYSAN_TESTSUITES}

From 67ff66e67734c0b283ec676899e5b89b67fdafcb Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough@gmail.com>
Date: Tue, 10 Jun 2025 20:19:38 -0500
Subject: [PATCH 021/851] [PGO][Offload] Fix offload coverage mapping 
 (#143490)

This pull request fixes coverage mapping on GPU targets.

- It adds an address space cast to the coverage mapping generation pass.
- It reads the profiled function names from the ELF directly. Reading it
from public globals was causing issues in cases where multiple
device-code object files are linked together.
---
 clang/lib/CodeGen/CoverageMappingGen.cpp      |  5 +--
 .../Instrumentation/InstrProfiling.cpp        |  6 ----
 .../common/include/GlobalHandler.h            |  4 +--
 .../common/src/GlobalHandler.cpp              | 31 +++++++++----------
 .../common/src/PluginInterface.cpp            |  7 ++---
 5 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index 1788bb4f28697..4aafac349e3e9 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -2622,8 +2622,9 @@ void CoverageMappingModuleGen::emit() {
   CGM.addUsedGlobal(CovData);
   // Create the deferred function records array
   if (!FunctionNames.empty()) {
-    auto NamesArrTy = llvm::ArrayType::get(llvm::PointerType::getUnqual(Ctx),
-                                           FunctionNames.size());
+    auto AddrSpace = FunctionNames.front()->getType()->getPointerAddressSpace();
+    auto NamesArrTy = llvm::ArrayType::get(
+        llvm::PointerType::get(Ctx, AddrSpace), FunctionNames.size());
     auto NamesArrVal = llvm::ConstantArray::get(NamesArrTy, FunctionNames);
     // This variable will *NOT* be emitted to the object file. It is used
     // to pass the list of names referenced to codegen.
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index fe3b0da33a009..5e7548b0a2fd1 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1955,12 +1955,6 @@ void InstrLowerer::emitNameData() {
                                 GlobalValue::PrivateLinkage, NamesVal,
                                 getInstrProfNamesVarName());
 
-  // Make names variable public if current target is a GPU
-  if (isGPUProfTarget(M)) {
-    NamesVar->setLinkage(GlobalValue::ExternalLinkage);
-    NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility);
-  }
-
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
   NamesVar->setSection(
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index 6def53430a7c0..5d6109df49da5 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -80,6 +80,7 @@ struct GPUProfGlobals {
 
   void dump() const;
   Error write() const;
+  bool empty() const;
 };
 
 /// Subclass of GlobalTy that holds the memory for a global of \p Ty.
@@ -192,9 +193,6 @@ class GenericGlobalHandlerTy {
                                           /*D2H=*/false);
   }
 
-  /// Checks whether a given image contains profiling globals.
-  bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image);
-
   /// Reads profiling data from a GPU image to supplied profdata struct.
   /// Iterates through the image symbol table and stores global values
   /// with profiling prefixes.
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index 27d7e8ee2fdf3..5464c197dba78 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -173,16 +173,6 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
   return Plugin::success();
 }
 
-bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device,
-                                                 DeviceImageTy &Image) {
-  GlobalTy global(getInstrProfNamesVarName().str(), 0);
-  if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) {
-    consumeError(std::move(Err));
-    return false;
-  }
-  return true;
-}
-
 Expected<GPUProfGlobals>
 GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
                                              DeviceImageTy &Image) {
@@ -204,12 +194,17 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
     // Check if given current global is a profiling global based
     // on name
     if (*NameOrErr == getInstrProfNamesVarName()) {
-      // Read in profiled function names
-      DeviceProfileData.NamesData = SmallVector<uint8_t>(Sym.getSize(), 0);
-      GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(),
-                           DeviceProfileData.NamesData.data());
-      if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal))
-        return Err;
+      // Read in profiled function names from ELF
+      auto SectionOrErr = Sym.getSection();
+      if (!SectionOrErr)
+        return SectionOrErr.takeError();
+
+      auto ContentsOrErr = (*SectionOrErr)->getContents();
+      if (!ContentsOrErr)
+        return ContentsOrErr.takeError();
+
+      SmallVector<uint8_t> NameBytes(ContentsOrErr->bytes());
+      DeviceProfileData.NamesData = NameBytes;
     } else if (NameOrErr->starts_with(getInstrProfCountersVarPrefix())) {
       // Read global variable profiling counts
       SmallVector<int64_t> Counts(Sym.getSize() / sizeof(int64_t), 0);
@@ -322,3 +317,7 @@ Error GPUProfGlobals::write() const {
 
   return Plugin::success();
 }
+
+bool GPUProfGlobals::empty() const {
+  return Counts.empty() && Data.empty() && NamesData.empty();
+}
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f9e316adad8f4..f9a6b3c1f4324 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -858,14 +858,13 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
 
   for (auto *Image : LoadedImages) {
     GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (!Handler.hasProfilingGlobals(*this, *Image))
-      continue;
-
-    GPUProfGlobals profdata;
     auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
     if (!ProfOrErr)
       return ProfOrErr.takeError();
 
+    if (ProfOrErr->empty())
+      continue;
+
     // Dump out profdata
     if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
         uint32_t(DeviceDebugKind::PGODump))

From 841a7f0897272f6412bc2e42a7dd695bf1e8a8cf Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Tue, 10 Jun 2025 18:30:07 -0700
Subject: [PATCH 022/851] [RISCV][NFC] Factor out VLEN in the SiFive7
 scheduling model (#143629)

In preparation of reusing SiFive7Model for sifive-x390, which has a VLEN
of 1024, it's better (and less chaotic) to factor out the VLEN parameter
from various of places first: the plan is to do a major overhaul on this
file in which all the `WriteRes` are going to be encapsulated in a big
`multiclass`, where VLEN is one of its template arguments. Such that we
can instantiate different scheduling models with different VLEN.

Before that happens, a placeholder defvar `SiFive7VLEN` is used instead
in this patch.

NFC.

Co-authored-by: Michael Maitland <michaeltmaitland@gmail.com>
---
 llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 64 ++++++++++------------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index af64a871a9292..c1d7cd4a716e7 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -88,9 +88,8 @@ class SiFive7GetCyclesSegmentedSeg2<string mx> {
 
 // Cycles for segmented loads and stores are calculated using the
 // formula vl * ceil((SEW * nf) / DLEN), where SEW * nf is the segment size.
-class SiFive7GetCyclesSegmented<string mx, int sew, int nf> {
-  defvar VLEN = 512;
-  defvar DLEN = 256;
+class SiFive7GetCyclesSegmented<string mx, int sew, int nf, int VLEN> {
+  defvar DLEN = !div(VLEN, 2);
   // (VLEN * LMUL) / SEW
   defvar VLUpperBound  = !cond(
     !eq(mx, "M1") : !div(VLEN, sew),
@@ -107,23 +106,20 @@ class SiFive7GetCyclesSegmented<string mx, int sew, int nf> {
   int c = !mul(VLUpperBound, !div(!sub(!add(a, b), 1), b));
 }
 
-class SiFive7GetCyclesOnePerElement<string mx, int sew> {
-  // FIXME: On SiFive7, VLEN is 512. Although a user can request the compiler
-  // to use a different VLEN, this model will not make scheduling decisions
-  // based on the user specified VLEN.
+class SiFive7GetCyclesOnePerElement<string mx, int sew, int VLEN> {
   // c = ceil(VLEN / SEW) * LMUL
   // Note: c >= 1 since the smallest VLEN is 512 / 8 = 8, and the
   // largest division performed on VLEN is in MF8 case with division
   // by 8. Therefore, there is no need to ceil the result.
-  int VLEN = !div(512, sew);
+  int numElements = !div(VLEN, sew);
   int c = !cond(
-    !eq(mx, "M1")  : VLEN,
-    !eq(mx, "M2")  : !mul(VLEN, 2),
-    !eq(mx, "M4")  : !mul(VLEN, 4),
-    !eq(mx, "M8")  : !mul(VLEN, 8),
-    !eq(mx, "MF2") : !div(VLEN, 2),
-    !eq(mx, "MF4") : !div(VLEN, 4),
-    !eq(mx, "MF8") : !div(VLEN, 8)
+    !eq(mx, "M1")  : numElements,
+    !eq(mx, "M2")  : !mul(numElements, 2),
+    !eq(mx, "M4")  : !mul(numElements, 4),
+    !eq(mx, "M8")  : !mul(numElements, 8),
+    !eq(mx, "MF2") : !div(numElements, 2),
+    !eq(mx, "MF4") : !div(numElements, 4),
+    !eq(mx, "MF8") : !div(numElements, 8)
   );
 }
 
@@ -139,10 +135,9 @@ class SiFive7GetDivOrSqrtFactor<int sew> {
 
 /// Cycles for reductions take approximately VL*SEW/DLEN + 5(4 + log(DLEN/SEW))
 /// cycles.
-class SiFive7GetReductionCycles<string mx, int sew> {
+class SiFive7GetReductionCycles<string mx, int sew, int VLEN> {
   // VLUpperBound*SEW/DLEN is equivalent to 2*LMUL since
   // VLUpperBound=(VLEN*LMUL)/SEW.
-  defvar VLEN = 512;
   defvar DLEN = !div(VLEN, 2);
   defvar TwoTimesLMUL = !cond(
     !eq(mx, "M1") : 2,
@@ -160,8 +155,7 @@ class SiFive7GetReductionCycles<string mx, int sew> {
 }
 
 /// Cycles for ordered reductions take approximately 6*VL cycles
-class SiFive7GetOrderedReductionCycles<string mx, int sew> {
-  defvar VLEN = 512;
+class SiFive7GetOrderedReductionCycles<string mx, int sew, int VLEN> {
   // (VLEN * LMUL) / SEW
   defvar VLUpperBound  = !cond(
     !eq(mx, "M1") : !div(VLEN, sew),
@@ -234,6 +228,8 @@ def SiFive7VCQ         : ProcResource<1>; // Vector Command Queue
 
 def SiFive7PipeAB : ProcResGroup<[SiFive7PipeA, SiFive7PipeB]>;
 
+defvar SiFive7VLEN = 512;
+
 // Branching
 let Latency = 3 in {
 def : WriteRes<WriteJmp, [SiFive7PipeB]>;
@@ -481,7 +477,7 @@ foreach mx = SchedMxList in {
 
 foreach mx = SchedMxList in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8>.c;
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8, SiFive7VLEN>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
   defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS8",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
                                        4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
@@ -501,7 +497,7 @@ foreach mx = SchedMxList in {
 // since LMUL >= 16/64.
 foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16>.c;
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16, SiFive7VLEN>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
   defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS16",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
                                        4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
@@ -518,7 +514,7 @@ foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in {
 }
 foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32>.c;
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32, SiFive7VLEN>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
   defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS32",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
                                        4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
@@ -535,7 +531,7 @@ foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in {
 }
 foreach mx = ["M1", "M2", "M4", "M8"] in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64>.c;
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64, SiFive7VLEN>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
   defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS64",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
                                        4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
@@ -588,7 +584,7 @@ foreach mx = SchedMxList in {
     let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
     defm "" : LMULWriteResMX<"WriteVSSEG2e" # eew,   [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
     foreach nf=3-8 in {
-      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf>.c;
+      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf, SiFive7VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
       // Does not chain so set latency high
       let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
@@ -603,7 +599,7 @@ foreach mx = SchedMxList in {
 foreach mx = SchedMxList in {
   foreach nf=2-8 in {
     foreach eew = [8, 16, 32, 64] in {
-      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf>.c;
+      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf, SiFive7VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
       // Does not chain so set latency high
       let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
@@ -669,7 +665,7 @@ foreach mx = SchedMxList in {
 foreach mx = SchedMxList in {
   foreach sew = SchedSEWSet<mx>.val in {
     defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor<sew>.c,
-                         !div(SiFive7GetCyclesOnePerElement<mx, sew>.c, 4));
+                         !div(SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c, 4));
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
     let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
@@ -774,7 +770,7 @@ foreach mx = SchedMxList in {
 foreach mx = SchedMxListF in {
   foreach sew = SchedSEWSet<mx, isF=1>.val in {
     defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor<sew>.c,
-                         !div(SiFive7GetCyclesOnePerElement<mx, sew>.c, 4));
+                         !div(SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c, 4));
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
     let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
@@ -834,7 +830,7 @@ foreach mx = SchedMxListFW in {
 // 14. Vector Reduction Operations
 foreach mx = SchedMxList in {
   foreach sew = SchedSEWSet<mx>.val in {
-    defvar Cycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar Cycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
     let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SiFive7VCQ, SiFive7VA],
@@ -847,7 +843,7 @@ foreach mx = SchedMxList in {
 
 foreach mx = SchedMxListWRed in {
   foreach sew = SchedSEWSet<mx, 0, 1>.val in {
-    defvar Cycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar Cycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
     let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
     defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SiFive7VCQ, SiFive7VA],
@@ -857,7 +853,7 @@ foreach mx = SchedMxListWRed in {
 
 foreach mx = SchedMxListF in {
   foreach sew = SchedSEWSet<mx, 1>.val in {
-    defvar RedCycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar RedCycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
     let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SiFive7VCQ, SiFive7VA],
@@ -865,7 +861,7 @@ foreach mx = SchedMxListF in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SiFive7VCQ, SiFive7VA],
                                      mx, sew, IsWorstCase>;
     }
-    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew>.c;
+    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, SiFive7VLEN>.c;
     let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
     defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SiFive7VCQ, SiFive7VA],
                                    mx, sew, IsWorstCase>;
@@ -874,12 +870,12 @@ foreach mx = SchedMxListF in {
 
 foreach mx = SchedMxListFWRed in {
   foreach sew = SchedSEWSet<mx, 1, 1>.val in {
-    defvar RedCycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar RedCycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
     let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in
     defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SiFive7VCQ, SiFive7VA],
                                    mx, sew, IsWorstCase>;
-    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew>.c;
+    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, SiFive7VLEN>.c;
     let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
     defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SiFive7VCQ, SiFive7VA],
                                    mx, sew, IsWorstCase>;
@@ -924,7 +920,7 @@ foreach mx = SchedMxList in {
 
 foreach mx = SchedMxList in {
   foreach sew = SchedSEWSet<mx>.val in {
-    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew>.c;
+    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
     let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;

From 8c890eaa3f4cedb494dc2a8180d9c9219bf76900 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Wed, 11 Jun 2025 10:19:12 +0800
Subject: [PATCH 023/851] Revert "[SelectionDAG] Make `(a & x) | (~a & y) -> (a
 & (x ^ y)) ^ y` available for all targets" (#143648)

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  57 --
 .../Target/SystemZ/SystemZISelLowering.cpp    |  14 -
 llvm/lib/Target/SystemZ/SystemZISelLowering.h |   1 -
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  58 ++
 llvm/test/CodeGen/AMDGPU/bfi_int.ll           |  30 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |  42 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 161 ++---
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  42 +-
 ...unfold-masked-merge-scalar-variablemask.ll |  42 +-
 ...unfold-masked-merge-vector-variablemask.ll | 167 ++---
 llvm/test/CodeGen/RISCV/fold-masked-merge.ll  | 302 ---------
 ...unfold-masked-merge-scalar-variablemask.ll |  62 +-
 .../test/CodeGen/SystemZ/fold-masked-merge.ll | 277 --------
 llvm/test/CodeGen/WebAssembly/simd-arith.ll   | 600 +++++++++++-------
 llvm/test/CodeGen/X86/bitselect.ll            |  50 +-
 llvm/test/CodeGen/X86/fold-masked-merge.ll    |  30 +-
 ...unfold-masked-merge-scalar-variablemask.ll |  26 +-
 ...unfold-masked-merge-vector-variablemask.ll | 598 ++++++++---------
 18 files changed, 1059 insertions(+), 1500 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/fold-masked-merge.ll
 delete mode 100644 llvm/test/CodeGen/SystemZ/fold-masked-merge.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b0da536a3b157..b65e8e06eae62 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8128,59 +8128,6 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
-static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1,
-                                   SDValue AndR1, const SDLoc &DL,
-                                   SelectionDAG &DAG) {
-  if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse())
-    return SDValue();
-  SDValue NotOp = AndL0->getOperand(0);
-  if (NotOp == AndR1)
-    std::swap(AndR1, AndL1);
-  if (NotOp != AndL1)
-    return SDValue();
-
-  EVT VT = AndL1->getValueType(0);
-  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0);
-  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
-  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0);
-  return Xor1;
-}
-
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for targets without a fused
-/// "and-not" operation.
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
-                               const TargetLowering &TLI, const SDLoc &DL) {
-  // Note that masked-merge variants using XOR or ADD expressions are
-  // normalized to OR by InstCombine so we only check for OR.
-  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-  SDValue N0 = Node->getOperand(0);
-  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
-    return SDValue();
-  SDValue N1 = Node->getOperand(1);
-  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
-    return SDValue();
-
-  // If the target supports and-not, don't fold this.
-  if (TLI.hasAndNot(SDValue(Node, 0)))
-    return SDValue();
-
-  SDValue N00 = N0->getOperand(0);
-  SDValue N01 = N0->getOperand(1);
-  SDValue N10 = N1->getOperand(0);
-  SDValue N11 = N1->getOperand(1);
-  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
-    return Result;
-  return SDValue();
-}
-
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8359,10 +8306,6 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
       return R;
 
-  if (VT.isScalarInteger() && VT != MVT::i1)
-    if (SDValue R = foldMaskedMerge(N, DAG, TLI, DL))
-      return R;
-
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 1c59b1e63b7bc..f06246706aaa9 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1283,20 +1283,6 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
   return true;
 }
 
-bool SystemZTargetLowering::hasAndNot(SDValue Y) const {
-  EVT VT = Y.getValueType();
-
-  // We can use NC(G)RK for types in GPRs ...
-  if (VT == MVT::i32 || VT == MVT::i64)
-    return Subtarget.hasMiscellaneousExtensions3();
-
-  // ... or VNC for types in VRs.
-  if (VT.isVector() || VT == MVT::i128)
-    return Subtarget.hasVector();
-
-  return false;
-}
-
 // Information about the addressing mode for a memory access.
 struct AddressingMode {
   // True if a long displacement is supported.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index f2f0bf6d8b410..f3536a840fda8 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -671,7 +671,6 @@ class SystemZTargetLowering : public TargetLowering {
   }
 
   unsigned getStackProbeSize(const MachineFunction &MF) const;
-  bool hasAndNot(SDValue Y) const override;
 
 private:
   const SystemZSubtarget &Subtarget;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 68da901c2f123..96be91256915d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52350,6 +52350,59 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
 }
 
+static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
+                                   SDValue And1_L, SDValue And1_R,
+                                   const SDLoc &DL, SelectionDAG &DAG) {
+  if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
+    return SDValue();
+  SDValue NotOp = And0_L->getOperand(0);
+  if (NotOp == And1_R)
+    std::swap(And1_R, And1_L);
+  if (NotOp != And1_L)
+    return SDValue();
+
+  // (~(NotOp) & And0_R) | (NotOp & And1_R)
+  // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
+  EVT VT = And1_L->getValueType(0);
+  SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
+  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
+  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
+  return Xor1;
+}
+
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y)` pattern.
+/// This is typically a better representation for  targets without a fused
+/// "and-not" operation. This function is intended to be called from a
+/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+  SDValue N0 = Node->getOperand(0);
+  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
+    return SDValue();
+  SDValue N1 = Node->getOperand(1);
+  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
+    return SDValue();
+
+  SDLoc DL(Node);
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  SDValue N10 = N1->getOperand(0);
+  SDValue N11 = N1->getOperand(1);
+  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
+    return Result;
+  return SDValue();
+}
+
 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
 /// with CMP+{ADC, SBB}.
@@ -52753,6 +52806,11 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // We should fold "masked merge" patterns when `andn` is not available.
+  if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
+    if (SDValue R = foldMaskedMerge(N, DAG))
+      return R;
+
   if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
     return R;
 
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index b372dec383344..201b97d479c68 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -16,9 +16,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_xor_b32 s1, s1, s2
+; GFX7-NEXT:    s_andn2_b32 s2, s2, s0
 ; GFX7-NEXT:    s_and_b32 s0, s1, s0
-; GFX7-NEXT:    s_xor_b32 s0, s0, s2
+; GFX7-NEXT:    s_or_b32 s0, s2, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
@@ -28,9 +28,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_xor_b32 s1, s1, s2
+; GFX8-NEXT:    s_andn2_b32 s2, s2, s0
 ; GFX8-NEXT:    s_and_b32 s0, s1, s0
-; GFX8-NEXT:    s_xor_b32 s0, s0, s2
+; GFX8-NEXT:    s_or_b32 s0, s2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -44,9 +44,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_xor_b32 s1, s1, s2
+; GFX10-NEXT:    s_andn2_b32 s2, s2, s0
 ; GFX10-NEXT:    s_and_b32 s0, s1, s0
-; GFX10-NEXT:    s_xor_b32 s0, s0, s2
+; GFX10-NEXT:    s_or_b32 s0, s2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -1407,9 +1407,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX7-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX7-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
+; GFX7-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX7-NEXT:    s_add_u32 s0, s0, 10
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -1422,9 +1422,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX8-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_add_u32 s0, s0, 10
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1438,9 +1438,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX10-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX10-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_add_u32 s0, s0, 10
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index e1b4cad370f96..6925a98f643b9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -289,16 +289,16 @@ entry:
 define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
 ; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 4
 ; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -317,10 +317,10 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 4
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x3c003c00
 ; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_and_b32 s3, s4, s3
-; GCN-NEXT:    s_xor_b32 s2, s3, s2
+; GCN-NEXT:    s_andn2_b32 s2, s2, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 0x3c003c00
+; GCN-NEXT:    s_or_b32 s2, s3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -399,10 +399,10 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 4
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x10001
 ; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_and_b32 s3, s4, s3
-; GCN-NEXT:    s_xor_b32 s2, s3, s2
+; GCN-NEXT:    s_andn2_b32 s2, s2, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 0x10001
+; GCN-NEXT:    s_or_b32 s2, s3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -417,16 +417,16 @@ entry:
 define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_mov_b32 s4, 0x10001
 ; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 4
 ; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -442,15 +442,15 @@ entry:
 define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s5, s3, 0x1010101
-; GCN-NEXT:    s_lshl_b32 s6, s6, 3
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x1010101
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_lshl_b32 s4, s6, 3
+; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
+; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
+; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 44bd4090436ef..be16fac4c53f7 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1511,13 +1511,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b32 s1, s3, 4
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
-; SI-NEXT:    s_lshl_b32 s1, 0xffff, s1
-; SI-NEXT:    s_and_b32 s0, s0, s1
-; SI-NEXT:    s_xor_b32 s0, s0, s2
+; SI-NEXT:    s_lshl_b32 s0, s3, 4
+; SI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_andn2_b32 s1, s2, s0
+; SI-NEXT:    s_and_b32 s0, s0, 0x50005
+; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1528,13 +1528,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b32 s1, s3, 4
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_xor_b32 s0, s2, 0x50005
-; VI-NEXT:    s_lshl_b32 s1, 0xffff, s1
-; VI-NEXT:    s_and_b32 s0, s0, s1
-; VI-NEXT:    s_xor_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s0, s3, 4
+; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_andn2_b32 s1, s2, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0x50005
+; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -1552,13 +1552,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s8, s8, 4
+; SI-NEXT:    s_lshl_b32 s0, s8, 4
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_xor_b32 s1, s3, 0x50005
-; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
-; SI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; SI-NEXT:    s_and_b32 s9, s1, 0x50005
+; SI-NEXT:    s_and_b32 s8, s0, 0x50005
+; SI-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
+; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1573,14 +1573,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s0, 0x50005
+; VI-NEXT:    s_lshl_b32 s0, s8, 4
+; VI-NEXT:    s_mov_b32 s8, 0x50005
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_mov_b32 s1, s0
-; VI-NEXT:    s_lshl_b32 s8, s8, 4
-; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; VI-NEXT:    s_mov_b32 s9, s8
+; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
 ; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1594,34 +1594,35 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
-; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s6, s4, 0x505
-; SI-NEXT:    s_lshl_b32 s5, s5, 3
-; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; SI-NEXT:    s_and_b32 s5, s6, s5
-; SI-NEXT:    s_xor_b32 s4, s5, s4
+; SI-NEXT:    s_lshl_b32 s4, s4, 3
+; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; SI-NEXT:    s_andn2_b32 s5, s5, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x505
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v2i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s6, s4, 0x505
-; VI-NEXT:    s_lshl_b32 s5, s5, 3
-; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; VI-NEXT:    s_and_b32 s5, s6, s5
-; VI-NEXT:    s_xor_b32 s4, s5, s4
+; VI-NEXT:    s_lshl_b32 s4, s4, 3
+; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; VI-NEXT:    s_and_b32 s6, s4, 0x505
+; VI-NEXT:    s_xor_b32 s4, s4, 0xffff
+; VI-NEXT:    s_and_b32 s4, s4, s5
+; VI-NEXT:    s_or_b32 s4, s6, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1635,17 +1636,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
-; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
-; SI-NEXT:    s_lshl_b32 s5, s5, 3
-; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; SI-NEXT:    s_and_b32 s5, s6, s5
-; SI-NEXT:    s_xor_b32 s4, s5, s4
+; SI-NEXT:    s_lshl_b32 s4, s4, 3
+; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; SI-NEXT:    s_andn2_b32 s5, s5, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_lshr_b32 s5, s4, 16
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -1655,17 +1656,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 ;
 ; VI-LABEL: dynamic_insertelement_v3i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
-; VI-NEXT:    s_lshl_b32 s5, s5, 3
-; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; VI-NEXT:    s_and_b32 s5, s6, s5
-; VI-NEXT:    s_xor_b32 s4, s5, s4
+; VI-NEXT:    s_lshl_b32 s4, s4, 3
+; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; VI-NEXT:    s_andn2_b32 s5, s5, s4
+; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
+; VI-NEXT:    s_or_b32 s4, s4, s5
 ; VI-NEXT:    s_lshr_b32 s5, s4, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -1680,34 +1681,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
-; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
-; SI-NEXT:    s_lshl_b32 s5, s5, 3
-; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; SI-NEXT:    s_and_b32 s5, s6, s5
-; SI-NEXT:    s_xor_b32 s4, s5, s4
+; SI-NEXT:    s_lshl_b32 s4, s4, 3
+; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; SI-NEXT:    s_andn2_b32 s5, s5, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v4i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
-; VI-NEXT:    s_lshl_b32 s5, s5, 3
-; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; VI-NEXT:    s_and_b32 s5, s6, s5
-; VI-NEXT:    s_xor_b32 s4, s5, s4
+; VI-NEXT:    s_lshl_b32 s4, s4, 3
+; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; VI-NEXT:    s_andn2_b32 s5, s5, s4
+; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
+; VI-NEXT:    s_or_b32 s4, s4, s5
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1720,20 +1721,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; SI-LABEL: s_dynamic_insertelement_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_lshl_b32 s0, s8, 3
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b32 s8, s8, 3
+; SI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
+; SI-NEXT:    s_and_b32 s9, s1, 0x5050505
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s1, s3, 0x5050505
-; SI-NEXT:    s_xor_b32 s0, s2, 0x5050505
-; SI-NEXT:    s_lshl_b64 s[8:9], 0xff, s8
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT:    s_and_b32 s8, s0, 0x5050505
+; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1742,20 +1743,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; VI-LABEL: s_dynamic_insertelement_v8i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_lshl_b32 s0, s8, 3
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b32 s8, s8, 3
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
+; VI-NEXT:    s_and_b32 s9, s1, 0x5050505
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s1, s3, 0x5050505
-; VI-NEXT:    s_xor_b32 s0, s2, 0x5050505
-; VI-NEXT:    s_lshl_b64 s[8:9], 0xff, s8
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; VI-NEXT:    s_and_b32 s8, s0, 0x5050505
+; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index a0ad6328b0c01..e0dacb7a59a42 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1534,11 +1534,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s7, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, s6, 4
-; GFX9-NEXT:    s_xor_b32 s2, s7, 0x3e703e7
-; GFX9-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s2, s2, s7
+; GFX9-NEXT:    s_lshl_b32 s2, s6, 4
+; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
+; GFX9-NEXT:    s_andn2_b32 s3, s7, s2
+; GFX9-NEXT:    s_and_b32 s2, s2, 0x3e703e7
+; GFX9-NEXT:    s_or_b32 s2, s2, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -1553,14 +1553,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s1, s4, 4
-; VI-NEXT:    s_xor_b32 s0, s2, 0x3e703e7
-; VI-NEXT:    s_lshl_b32 s1, 0xffff, s1
-; VI-NEXT:    s_and_b32 s0, s0, s1
-; VI-NEXT:    s_xor_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s0, s4, 4
+; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; VI-NEXT:    s_andn2_b32 s1, s2, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
+; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -1575,14 +1575,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshl_b32 s1, s4, 4
-; CI-NEXT:    s_xor_b32 s0, s2, 0x3e703e7
-; CI-NEXT:    s_lshl_b32 s1, 0xffff, s1
-; CI-NEXT:    s_and_b32 s0, s0, s1
-; CI-NEXT:    s_xor_b32 s0, s0, s2
+; CI-NEXT:    s_lshl_b32 s0, s4, 4
+; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; CI-NEXT:    s_andn2_b32 s1, s2, s0
+; CI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
+; CI-NEXT:    s_or_b32 s0, s0, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshl_b32 s3, s4, 4
-; GFX11-NEXT:    s_xor_b32 s4, s2, 0x3e703e7
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_and_not1_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s3, 0x3e703e7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s3, s4, s3
-; GFX11-NEXT:    s_xor_b32 s2, s3, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index 321b64510c35f..69724aa75af4f 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -5,11 +5,10 @@ define i32 @s_out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
 ; GCN-LABEL: s_out32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mx = and i32 %x, %mask
@@ -23,11 +22,10 @@ define i64 @s_out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
 ; GCN-LABEL: s_out64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[16:17]
-; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mx = and i64 %x, %mask
@@ -429,11 +427,10 @@ define i32 @s_out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
 ; GCN-LABEL: s_out_constant_varx_42:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s0, s0, 42
+; GCN-NEXT:    s_and_b32 s0, s2, s0
+; GCN-NEXT:    s_and_not1_b32 s1, 42, s2
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, 42
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -465,11 +462,10 @@ define i32 @s_out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
 ; GCN-LABEL: s_out_constant_varx_42_invmask:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s1, s0, 42
+; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s1, s2, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s1, s1, s2
-; GCN-NEXT:    s_xor_b32 s0, s1, s0
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -564,11 +560,10 @@ define i32 @s_out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
 ; GCN-LABEL: s_out_constant_42_vary:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s0, s1, 42
+; GCN-NEXT:    s_and_b32 s0, s2, 42
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -600,11 +595,10 @@ define i32 @s_out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
 ; GCN-LABEL: s_out_constant_42_vary_invmask:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s0, s1, 42
+; GCN-NEXT:    s_and_not1_b32 s0, 42, s2
+; GCN-NEXT:    s_and_b32 s1, s2, s1
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, 42
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index bac8bbbf0b4de..8e4c77e76029c 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -8,16 +8,17 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b16 %rs<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [out_v1i8_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_1];
-; CHECK-NEXT:    ld.param.b8 %rs3, [out_v1i8_param_2];
-; CHECK-NEXT:    xor.b16 %rs4, %rs1, %rs2;
-; CHECK-NEXT:    and.b16 %rs5, %rs4, %rs3;
-; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
-; CHECK-NEXT:    st.param.b8 [func_retval0], %rs6;
+; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_2];
+; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    ld.param.b8 %rs4, [out_v1i8_param_1];
+; CHECK-NEXT:    not.b16 %rs5, %rs2;
+; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
+; CHECK-NEXT:    st.param.b8 [func_retval0], %rs7;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i8> %x, %mask
   %notmask = xor <1 x i8> %mask, <i8 -1>
@@ -33,16 +34,17 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b16 %rs<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b16 %rs1, [out_v1i16_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [out_v1i16_param_2];
-; CHECK-NEXT:    xor.b16 %rs4, %rs1, %rs2;
-; CHECK-NEXT:    and.b16 %rs5, %rs4, %rs3;
-; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_2];
+; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    ld.param.b16 %rs4, [out_v1i16_param_1];
+; CHECK-NEXT:    not.b16 %rs5, %rs2;
+; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i16> %x, %mask
   %notmask = xor <1 x i16> %mask, <i16 -1>
@@ -124,16 +126,17 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
 define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v1i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [out_v1i32_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_1];
-; CHECK-NEXT:    ld.param.b32 %r3, [out_v1i32_param_2];
-; CHECK-NEXT:    xor.b32 %r4, %r1, %r2;
-; CHECK-NEXT:    and.b32 %r5, %r4, %r3;
-; CHECK-NEXT:    xor.b32 %r6, %r5, %r2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_2];
+; CHECK-NEXT:    and.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    ld.param.b32 %r4, [out_v1i32_param_1];
+; CHECK-NEXT:    not.b32 %r5, %r2;
+; CHECK-NEXT:    and.b32 %r6, %r4, %r5;
+; CHECK-NEXT:    or.b32 %r7, %r3, %r6;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i32> %x, %mask
   %notmask = xor <1 x i32> %mask, <i32 -1>
@@ -227,19 +230,21 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v2i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<13>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_1];
-; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [out_v2i32_param_2];
-; CHECK-NEXT:    xor.b32 %r7, %r2, %r4;
-; CHECK-NEXT:    and.b32 %r8, %r7, %r6;
-; CHECK-NEXT:    xor.b32 %r9, %r8, %r4;
-; CHECK-NEXT:    xor.b32 %r10, %r1, %r3;
-; CHECK-NEXT:    and.b32 %r11, %r10, %r5;
-; CHECK-NEXT:    xor.b32 %r12, %r11, %r3;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r12, %r9};
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2];
+; CHECK-NEXT:    and.b32 %r5, %r1, %r3;
+; CHECK-NEXT:    and.b32 %r6, %r2, %r4;
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1];
+; CHECK-NEXT:    not.b32 %r9, %r4;
+; CHECK-NEXT:    not.b32 %r10, %r3;
+; CHECK-NEXT:    and.b32 %r11, %r7, %r10;
+; CHECK-NEXT:    and.b32 %r12, %r8, %r9;
+; CHECK-NEXT:    or.b32 %r13, %r6, %r12;
+; CHECK-NEXT:    or.b32 %r14, %r5, %r11;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r13};
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i32> %x, %mask
   %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
@@ -251,16 +256,17 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin
 define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v1i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [out_v1i64_param_0];
-; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_1];
-; CHECK-NEXT:    ld.param.b64 %rd3, [out_v1i64_param_2];
-; CHECK-NEXT:    xor.b64 %rd4, %rd1, %rd2;
-; CHECK-NEXT:    and.b64 %rd5, %rd4, %rd3;
-; CHECK-NEXT:    xor.b64 %rd6, %rd5, %rd2;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_2];
+; CHECK-NEXT:    and.b64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [out_v1i64_param_1];
+; CHECK-NEXT:    not.b64 %rd5, %rd2;
+; CHECK-NEXT:    and.b64 %rd6, %rd4, %rd5;
+; CHECK-NEXT:    or.b64 %rd7, %rd3, %rd6;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i64> %x, %mask
   %notmask = xor <1 x i64> %mask, <i64 -1>
@@ -344,25 +350,29 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<25>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_1];
-; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [out_v4i32_param_2];
-; CHECK-NEXT:    xor.b32 %r13, %r4, %r8;
-; CHECK-NEXT:    and.b32 %r14, %r13, %r12;
-; CHECK-NEXT:    xor.b32 %r15, %r14, %r8;
-; CHECK-NEXT:    xor.b32 %r16, %r3, %r7;
-; CHECK-NEXT:    and.b32 %r17, %r16, %r11;
-; CHECK-NEXT:    xor.b32 %r18, %r17, %r7;
-; CHECK-NEXT:    xor.b32 %r19, %r2, %r6;
-; CHECK-NEXT:    and.b32 %r20, %r19, %r10;
-; CHECK-NEXT:    xor.b32 %r21, %r20, %r6;
-; CHECK-NEXT:    xor.b32 %r22, %r1, %r5;
-; CHECK-NEXT:    and.b32 %r23, %r22, %r9;
-; CHECK-NEXT:    xor.b32 %r24, %r23, %r5;
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r24, %r21, %r18, %r15};
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2];
+; CHECK-NEXT:    and.b32 %r9, %r1, %r5;
+; CHECK-NEXT:    and.b32 %r10, %r2, %r6;
+; CHECK-NEXT:    and.b32 %r11, %r3, %r7;
+; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1];
+; CHECK-NEXT:    not.b32 %r17, %r8;
+; CHECK-NEXT:    not.b32 %r18, %r7;
+; CHECK-NEXT:    not.b32 %r19, %r6;
+; CHECK-NEXT:    not.b32 %r20, %r5;
+; CHECK-NEXT:    and.b32 %r21, %r13, %r20;
+; CHECK-NEXT:    and.b32 %r22, %r14, %r19;
+; CHECK-NEXT:    and.b32 %r23, %r15, %r18;
+; CHECK-NEXT:    and.b32 %r24, %r16, %r17;
+; CHECK-NEXT:    or.b32 %r25, %r12, %r24;
+; CHECK-NEXT:    or.b32 %r26, %r11, %r23;
+; CHECK-NEXT:    or.b32 %r27, %r10, %r22;
+; CHECK-NEXT:    or.b32 %r28, %r9, %r21;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -374,23 +384,26 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin
 define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32_undef(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<23>;
+; CHECK-NEXT:    .reg .b32 %r<26>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0];
 ; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r3, %r7;
-; CHECK-NEXT:    ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [out_v4i32_undef_param_1];
-; CHECK-NEXT:    xor.b32 %r14, %r4, %r13;
-; CHECK-NEXT:    and.b32 %r15, %r14, %r8;
-; CHECK-NEXT:    xor.b32 %r16, %r15, %r13;
-; CHECK-NEXT:    xor.b32 %r17, %r2, %r11;
-; CHECK-NEXT:    and.b32 %r18, %r17, %r6;
-; CHECK-NEXT:    xor.b32 %r19, %r18, %r11;
-; CHECK-NEXT:    xor.b32 %r20, %r1, %r10;
-; CHECK-NEXT:    and.b32 %r21, %r20, %r5;
-; CHECK-NEXT:    xor.b32 %r22, %r21, %r10;
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r22, %r19, %r9, %r16};
+; CHECK-NEXT:    and.b32 %r10, %r1, %r5;
+; CHECK-NEXT:    and.b32 %r11, %r2, %r6;
+; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1];
+; CHECK-NEXT:    not.b32 %r17, %r8;
+; CHECK-NEXT:    not.b32 %r18, %r6;
+; CHECK-NEXT:    not.b32 %r19, %r5;
+; CHECK-NEXT:    and.b32 %r20, %r13, %r19;
+; CHECK-NEXT:    and.b32 %r21, %r14, %r18;
+; CHECK-NEXT:    and.b32 %r22, %r16, %r17;
+; CHECK-NEXT:    or.b32 %r23, %r12, %r22;
+; CHECK-NEXT:    or.b32 %r24, %r11, %r21;
+; CHECK-NEXT:    or.b32 %r25, %r10, %r20;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r25, %r24, %r9, %r23};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
@@ -402,19 +415,21 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n
 define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v2i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<13>;
+; CHECK-NEXT:    .reg .b64 %rd<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [out_v2i64_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_1];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [out_v2i64_param_2];
-; CHECK-NEXT:    xor.b64 %rd7, %rd2, %rd4;
-; CHECK-NEXT:    and.b64 %rd8, %rd7, %rd6;
-; CHECK-NEXT:    xor.b64 %rd9, %rd8, %rd4;
-; CHECK-NEXT:    xor.b64 %rd10, %rd1, %rd3;
-; CHECK-NEXT:    and.b64 %rd11, %rd10, %rd5;
-; CHECK-NEXT:    xor.b64 %rd12, %rd11, %rd3;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd12, %rd9};
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2];
+; CHECK-NEXT:    and.b64 %rd5, %rd1, %rd3;
+; CHECK-NEXT:    and.b64 %rd6, %rd2, %rd4;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1];
+; CHECK-NEXT:    not.b64 %rd9, %rd4;
+; CHECK-NEXT:    not.b64 %rd10, %rd3;
+; CHECK-NEXT:    and.b64 %rd11, %rd7, %rd10;
+; CHECK-NEXT:    and.b64 %rd12, %rd8, %rd9;
+; CHECK-NEXT:    or.b64 %rd13, %rd6, %rd12;
+; CHECK-NEXT:    or.b64 %rd14, %rd5, %rd11;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd14, %rd13};
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i64> %x, %mask
   %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
diff --git a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
deleted file mode 100644
index 631b7109281e5..0000000000000
--- a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
+++ /dev/null
@@ -1,302 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV32,RV32I
-; RUN: llc -mtriple=riscv64 < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV64,RV64I
-; RUN: llc -mtriple=riscv32 -mattr=+zbb < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV32,RV32ZBB
-; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
-;
-; test that masked-merge code is generated as "xor;and;xor" sequence or
-; "andn ; and; or" if and-not is available.
-
-define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
-; CHECK-I-LABEL: masked_merge0:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a1, a1, a2
-; CHECK-I-NEXT:    and a0, a1, a0
-; CHECK-I-NEXT:    xor a0, a0, a2
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge0:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
-; CHECK-I-LABEL: masked_merge1:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a1, a1, a2
-; CHECK-I-NEXT:    and a0, a1, a0
-; CHECK-I-NEXT:    xor a0, a0, a2
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge1:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i16 %a0, %a1
-  %not = xor i16 %a0, -1
-  %and1 = and i16 %a2, %not
-  %or = or i16 %and0, %and1
-  ret i16 %or
-}
-
-define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; CHECK-I-LABEL: masked_merge2:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    mv a0, a1
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge2:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    andn a2, a1, a0
-; CHECK-ZBB-NEXT:    and a0, a1, a0
-; CHECK-ZBB-NEXT:    or a0, a2, a0
-; CHECK-ZBB-NEXT:    ret
-  %not = xor i8 %a0, -1
-  %and0 = and i8 %not, %a1
-  %and1 = and i8 %a1, %a0
-  %or = or i8 %and0, %and1
-  ret i8 %or
-}
-
-define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
-; RV32I-LABEL: masked_merge3:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    xor a3, a3, a5
-; RV32I-NEXT:    xor a2, a2, a4
-; RV32I-NEXT:    not a2, a2
-; RV32I-NEXT:    not a3, a3
-; RV32I-NEXT:    and a0, a2, a0
-; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    xor a0, a0, a4
-; RV32I-NEXT:    xor a1, a1, a5
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: masked_merge3:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a2, a2
-; RV64I-NEXT:    xor a1, a1, a2
-; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    ret
-;
-; RV32ZBB-LABEL: masked_merge3:
-; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    not a6, a0
-; RV32ZBB-NEXT:    not a7, a1
-; RV32ZBB-NEXT:    andn a1, a1, a3
-; RV32ZBB-NEXT:    andn a0, a0, a2
-; RV32ZBB-NEXT:    andn a2, a7, a5
-; RV32ZBB-NEXT:    andn a3, a6, a4
-; RV32ZBB-NEXT:    or a0, a3, a0
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    ret
-;
-; RV64ZBB-LABEL: masked_merge3:
-; RV64ZBB:       # %bb.0:
-; RV64ZBB-NEXT:    not a3, a0
-; RV64ZBB-NEXT:    andn a2, a3, a2
-; RV64ZBB-NEXT:    andn a0, a0, a1
-; RV64ZBB-NEXT:    or a0, a2, a0
-; RV64ZBB-NEXT:    ret
-  %v0 = xor i64 %a1, -1
-  %v1 = xor i64 %a2, -1
-  %not = xor i64 %a0, -1
-  %and0 = and i64 %not, %v1
-  %and1 = and i64 %v0, %a0
-  %or = or i64 %and0, %and1
-  ret i64 %or
-}
-
-define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
-; RV32-LABEL: not_a_masked_merge0:
-; RV32:       # %bb.0:
-; RV32-NEXT:    and a1, a0, a1
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: not_a_masked_merge0:
-; RV64:       # %bb.0:
-; RV64-NEXT:    and a1, a0, a1
-; RV64-NEXT:    negw a0, a0
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    or a0, a1, a0
-; RV64-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not_a_not = sub i32 0, %a0
-  %and1 = and i32 %not_a_not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
-; CHECK-I-LABEL: not_a_masked_merge1:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a0, a0, a1
-; CHECK-I-NEXT:    not a1, a3
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: not_a_masked_merge1:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a0, a0, a1
-; CHECK-ZBB-NEXT:    andn a1, a2, a3
-; CHECK-ZBB-NEXT:    or a0, a0, a1
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a3, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
-; CHECK-I-LABEL: not_a_masked_merge2:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    or a1, a0, a1
-; CHECK-I-NEXT:    not a0, a0
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    or a0, a1, a0
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: not_a_masked_merge2:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    or a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    ret
-  %not_an_and0 = or i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %not_an_and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
-; CHECK-I-LABEL: not_a_masked_merge3:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a1, a0, a1
-; CHECK-I-NEXT:    xor a0, a0, a2
-; CHECK-I-NEXT:    not a0, a0
-; CHECK-I-NEXT:    or a0, a1, a0
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: not_a_masked_merge3:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    xor a0, a0, a2
-; CHECK-ZBB-NEXT:    orn a0, a1, a0
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %not_an_and1 = xor i32 %not, %a2
-  %or = or i32 %and0, %not_an_and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
-; CHECK-LABEL: not_a_masked_merge4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a2, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; CHECK-I-LABEL: masked_merge_no_transform0:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a1, a0, a1
-; CHECK-I-NEXT:    not a0, a0
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    or a0, a1, a0
-; CHECK-I-NEXT:    sw a1, 0(a3)
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge_no_transform0:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    sw a1, 0(a3)
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %and0, ptr %p1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; CHECK-I-LABEL: masked_merge_no_transform1:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a1, a0, a1
-; CHECK-I-NEXT:    not a4, a0
-; CHECK-I-NEXT:    and a0, a4, a2
-; CHECK-I-NEXT:    or a0, a1, a0
-; CHECK-I-NEXT:    sw a4, 0(a3)
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge_no_transform1:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    not a4, a0
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    sw a4, 0(a3)
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %not, ptr %p1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; CHECK-I-LABEL: masked_merge_no_transform2:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a1, a0, a1
-; CHECK-I-NEXT:    not a0, a0
-; CHECK-I-NEXT:    and a2, a0, a2
-; CHECK-I-NEXT:    or a0, a1, a2
-; CHECK-I-NEXT:    sw a2, 0(a3)
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge_no_transform2:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a2, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a2
-; CHECK-ZBB-NEXT:    sw a2, 0(a3)
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %and1, ptr %p1
-  ret i32 %or
-}
diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
index efc8243df71e0..1517e524a7f78 100644
--- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
@@ -8,13 +8,16 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
 
+; TODO: Should we convert these to X ^ ((X ^ Y) & M) form when Zbb isn't
+; present?
 
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-I-LABEL: out8:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xor a0, a0, a1
+; CHECK-I-NEXT:    not a2, a2
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out8:
@@ -33,9 +36,10 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-I-LABEL: out16:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xor a0, a0, a1
+; CHECK-I-NEXT:    not a2, a2
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out16:
@@ -54,9 +58,10 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out32:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xor a0, a0, a1
+; CHECK-I-NEXT:    not a2, a2
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out32:
@@ -75,19 +80,22 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 ; RV32I-LABEL: out64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    and a0, a0, a4
 ; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    xor a1, a1, a3
+; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    not a5, a5
+; RV32I-NEXT:    and a3, a3, a5
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: out64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    xor a0, a0, a1
+; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: out64:
@@ -652,9 +660,10 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xori a0, a0, 42
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xori a0, a0, 42
+; CHECK-I-NEXT:    not a1, a2
+; CHECK-I-NEXT:    and a0, a2, a0
+; CHECK-I-NEXT:    andi a1, a1, 42
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42:
@@ -695,9 +704,10 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xori a1, a0, 42
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    xor a0, a1, a0
+; CHECK-I-NEXT:    not a1, a2
+; CHECK-I-NEXT:    and a0, a1, a0
+; CHECK-I-NEXT:    andi a1, a2, 42
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42_invmask:
@@ -802,9 +812,10 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xori a0, a1, 42
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xor a0, a0, a1
+; CHECK-I-NEXT:    not a0, a2
+; CHECK-I-NEXT:    andi a2, a2, 42
+; CHECK-I-NEXT:    and a0, a0, a1
+; CHECK-I-NEXT:    or a0, a2, a0
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary:
@@ -844,9 +855,10 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xori a0, a1, 42
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xori a0, a0, 42
+; CHECK-I-NEXT:    not a0, a2
+; CHECK-I-NEXT:    andi a0, a0, 42
+; CHECK-I-NEXT:    and a1, a2, a1
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary_invmask:
diff --git a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
deleted file mode 100644
index c014345507f69..0000000000000
--- a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
+++ /dev/null
@@ -1,277 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=NO-MISC3
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s --check-prefix=MISC3
-
-; test that masked-merge code is generated as "xor;and;xor" sequence or
-; "andn ; and; or" if and-not is available.
-
-define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: masked_merge0:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    xr %r3, %r4
-; NO-MISC3-NEXT:    nr %r2, %r3
-; NO-MISC3-NEXT:    xr %r2, %r4
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge0:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    ncrk %r2, %r4, %r2
-; MISC3-NEXT:    or %r2, %r3
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
-; NO-MISC3-LABEL: masked_merge1:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    xr %r3, %r4
-; NO-MISC3-NEXT:    nr %r2, %r3
-; NO-MISC3-NEXT:    xr %r2, %r4
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge1:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    ncrk %r0, %r4, %r2
-; MISC3-NEXT:    nr %r2, %r3
-; MISC3-NEXT:    or %r2, %r0
-; MISC3-NEXT:    br %r14
-  %and0 = and i16 %a0, %a1
-  %not = xor i16 %a0, -1
-  %and1 = and i16 %a2, %not
-  %or = or i16 %and0, %and1
-  ret i16 %or
-}
-
-define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; NO-MISC3-LABEL: masked_merge2:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    lr %r2, %r3
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge2:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    lr %r2, %r3
-; MISC3-NEXT:    br %r14
-  %not = xor i8 %a0, -1
-  %and0 = and i8 %not, %a1
-  %and1 = and i8 %a1, %a0
-  %or = or i8 %and0, %and1
-  ret i8 %or
-}
-
-define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
-; NO-MISC3-LABEL: masked_merge3:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    lcgr %r0, %r4
-; NO-MISC3-NEXT:    aghi %r0, -1
-; NO-MISC3-NEXT:    xgr %r3, %r0
-; NO-MISC3-NEXT:    ngr %r3, %r2
-; NO-MISC3-NEXT:    xgr %r3, %r2
-; NO-MISC3-NEXT:    xgrk %r2, %r3, %r0
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge3:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    lcgr %r0, %r2
-; MISC3-NEXT:    aghi %r0, -1
-; MISC3-NEXT:    ncgrk %r0, %r0, %r4
-; MISC3-NEXT:    ncgrk %r2, %r2, %r3
-; MISC3-NEXT:    ogr %r2, %r0
-; MISC3-NEXT:    br %r14
-  %v0 = xor i64 %a1, -1
-  %v1 = xor i64 %a2, -1
-  %not = xor i64 %a0, -1
-  %and0 = and i64 %not, %v1
-  %and1 = and i64 %v0, %a0
-  %or = or i64 %and0, %and1
-  ret i64 %or
-}
-
-define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: not_a_masked_merge0:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    lcr %r0, %r2
-; NO-MISC3-NEXT:    nr %r3, %r2
-; NO-MISC3-NEXT:    nr %r0, %r4
-; NO-MISC3-NEXT:    ork %r2, %r3, %r0
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge0:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    lcr %r0, %r2
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    nr %r0, %r4
-; MISC3-NEXT:    ork %r2, %r3, %r0
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not_a_not = sub i32 0, %a0
-  %and1 = and i32 %not_a_not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
-; NO-MISC3-LABEL: not_a_masked_merge1:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    xilf %r5, 4294967295
-; NO-MISC3-NEXT:    nr %r2, %r3
-; NO-MISC3-NEXT:    nr %r4, %r5
-; NO-MISC3-NEXT:    or %r2, %r4
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge1:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r2, %r3
-; MISC3-NEXT:    ncrk %r0, %r4, %r5
-; MISC3-NEXT:    or %r2, %r0
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a3, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: not_a_masked_merge2:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    or %r3, %r2
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    nr %r2, %r4
-; NO-MISC3-NEXT:    or %r2, %r3
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge2:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    or %r3, %r2
-; MISC3-NEXT:    ncrk %r2, %r4, %r2
-; MISC3-NEXT:    or %r2, %r3
-; MISC3-NEXT:    br %r14
-  %not_an_and0 = or i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %not_an_and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: not_a_masked_merge3:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nr %r3, %r2
-; NO-MISC3-NEXT:    xr %r2, %r4
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    or %r2, %r3
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge3:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    xr %r2, %r4
-; MISC3-NEXT:    ocrk %r2, %r3, %r2
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %not_an_and1 = xor i32 %not, %a2
-  %or = or i32 %and0, %not_an_and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: not_a_masked_merge4:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nr %r2, %r3
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge4:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r2, %r3
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a2, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; NO-MISC3-LABEL: masked_merge_no_transform0:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nr %r3, %r2
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    nr %r2, %r4
-; NO-MISC3-NEXT:    or %r2, %r3
-; NO-MISC3-NEXT:    st %r3, 0(%r5)
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge_no_transform0:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    ncrk %r2, %r4, %r2
-; MISC3-NEXT:    or %r2, %r3
-; MISC3-NEXT:    st %r3, 0(%r5)
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %and0, ptr %p1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; NO-MISC3-LABEL: masked_merge_no_transform1:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nrk %r0, %r2, %r3
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    nr %r4, %r2
-; NO-MISC3-NEXT:    or %r0, %r4
-; NO-MISC3-NEXT:    st %r2, 0(%r5)
-; NO-MISC3-NEXT:    lr %r2, %r0
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge_no_transform1:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nrk %r0, %r2, %r3
-; MISC3-NEXT:    ncrk %r1, %r4, %r2
-; MISC3-NEXT:    xilf %r2, 4294967295
-; MISC3-NEXT:    or %r0, %r1
-; MISC3-NEXT:    st %r2, 0(%r5)
-; MISC3-NEXT:    lr %r2, %r0
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %not, ptr %p1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; NO-MISC3-LABEL: masked_merge_no_transform2:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nr %r3, %r2
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    nr %r4, %r2
-; NO-MISC3-NEXT:    ork %r2, %r3, %r4
-; NO-MISC3-NEXT:    st %r4, 0(%r5)
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge_no_transform2:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    ncrk %r0, %r4, %r2
-; MISC3-NEXT:    ork %r2, %r3, %r0
-; MISC3-NEXT:    st %r0, 0(%r5)
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %and1, ptr %p1
-  ret i32 %or
-}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index e3607e12bf530..185c46aa5681e 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -4465,139 +4465,203 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v16i8:
 ; NO-SIMD128:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.xor $push0=, $32, $48
-; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $16
-; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $48
-; NO-SIMD128-NEXT:    i32.store8 15($0), $pop2
-; NO-SIMD128-NEXT:    i32.xor $push3=, $31, $47
-; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $15
-; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $47
-; NO-SIMD128-NEXT:    i32.store8 14($0), $pop5
-; NO-SIMD128-NEXT:    i32.xor $push6=, $30, $46
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $14
-; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $46
-; NO-SIMD128-NEXT:    i32.store8 13($0), $pop8
-; NO-SIMD128-NEXT:    i32.xor $push9=, $29, $45
-; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $13
-; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $45
-; NO-SIMD128-NEXT:    i32.store8 12($0), $pop11
-; NO-SIMD128-NEXT:    i32.xor $push12=, $28, $44
-; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $12
-; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $44
-; NO-SIMD128-NEXT:    i32.store8 11($0), $pop14
-; NO-SIMD128-NEXT:    i32.xor $push15=, $27, $43
-; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $11
-; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $43
-; NO-SIMD128-NEXT:    i32.store8 10($0), $pop17
-; NO-SIMD128-NEXT:    i32.xor $push18=, $26, $42
-; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $10
-; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $42
-; NO-SIMD128-NEXT:    i32.store8 9($0), $pop20
-; NO-SIMD128-NEXT:    i32.xor $push21=, $25, $41
-; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $9
-; NO-SIMD128-NEXT:    i32.xor $push23=, $pop22, $41
-; NO-SIMD128-NEXT:    i32.store8 8($0), $pop23
-; NO-SIMD128-NEXT:    i32.xor $push24=, $24, $40
-; NO-SIMD128-NEXT:    i32.and $push25=, $pop24, $8
-; NO-SIMD128-NEXT:    i32.xor $push26=, $pop25, $40
-; NO-SIMD128-NEXT:    i32.store8 7($0), $pop26
-; NO-SIMD128-NEXT:    i32.xor $push27=, $23, $39
-; NO-SIMD128-NEXT:    i32.and $push28=, $pop27, $7
-; NO-SIMD128-NEXT:    i32.xor $push29=, $pop28, $39
-; NO-SIMD128-NEXT:    i32.store8 6($0), $pop29
-; NO-SIMD128-NEXT:    i32.xor $push30=, $22, $38
-; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $6
-; NO-SIMD128-NEXT:    i32.xor $push32=, $pop31, $38
-; NO-SIMD128-NEXT:    i32.store8 5($0), $pop32
-; NO-SIMD128-NEXT:    i32.xor $push33=, $21, $37
-; NO-SIMD128-NEXT:    i32.and $push34=, $pop33, $5
-; NO-SIMD128-NEXT:    i32.xor $push35=, $pop34, $37
-; NO-SIMD128-NEXT:    i32.store8 4($0), $pop35
-; NO-SIMD128-NEXT:    i32.xor $push36=, $20, $36
-; NO-SIMD128-NEXT:    i32.and $push37=, $pop36, $4
-; NO-SIMD128-NEXT:    i32.xor $push38=, $pop37, $36
-; NO-SIMD128-NEXT:    i32.store8 3($0), $pop38
-; NO-SIMD128-NEXT:    i32.xor $push39=, $19, $35
-; NO-SIMD128-NEXT:    i32.and $push40=, $pop39, $3
-; NO-SIMD128-NEXT:    i32.xor $push41=, $pop40, $35
-; NO-SIMD128-NEXT:    i32.store8 2($0), $pop41
-; NO-SIMD128-NEXT:    i32.xor $push42=, $18, $34
-; NO-SIMD128-NEXT:    i32.and $push43=, $pop42, $2
-; NO-SIMD128-NEXT:    i32.xor $push44=, $pop43, $34
-; NO-SIMD128-NEXT:    i32.store8 1($0), $pop44
-; NO-SIMD128-NEXT:    i32.xor $push45=, $17, $33
-; NO-SIMD128-NEXT:    i32.and $push46=, $pop45, $1
-; NO-SIMD128-NEXT:    i32.xor $push47=, $pop46, $33
-; NO-SIMD128-NEXT:    i32.store8 0($0), $pop47
+; NO-SIMD128-NEXT:    i32.and $push0=, $16, $32
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $16, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $48
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-NEXT:    i32.store8 15($0), $pop4
+; NO-SIMD128-NEXT:    i32.and $push5=, $15, $31
+; NO-SIMD128-NEXT:    i32.const $push79=, -1
+; NO-SIMD128-NEXT:    i32.xor $push6=, $15, $pop79
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $47
+; NO-SIMD128-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT:    i32.store8 14($0), $pop8
+; NO-SIMD128-NEXT:    i32.and $push9=, $14, $30
+; NO-SIMD128-NEXT:    i32.const $push78=, -1
+; NO-SIMD128-NEXT:    i32.xor $push10=, $14, $pop78
+; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $46
+; NO-SIMD128-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT:    i32.store8 13($0), $pop12
+; NO-SIMD128-NEXT:    i32.and $push13=, $13, $29
+; NO-SIMD128-NEXT:    i32.const $push77=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $13, $pop77
+; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $45
+; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-NEXT:    i32.store8 12($0), $pop16
+; NO-SIMD128-NEXT:    i32.and $push17=, $12, $28
+; NO-SIMD128-NEXT:    i32.const $push76=, -1
+; NO-SIMD128-NEXT:    i32.xor $push18=, $12, $pop76
+; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $44
+; NO-SIMD128-NEXT:    i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT:    i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT:    i32.and $push21=, $11, $27
+; NO-SIMD128-NEXT:    i32.const $push75=, -1
+; NO-SIMD128-NEXT:    i32.xor $push22=, $11, $pop75
+; NO-SIMD128-NEXT:    i32.and $push23=, $pop22, $43
+; NO-SIMD128-NEXT:    i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT:    i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT:    i32.and $push25=, $10, $26
+; NO-SIMD128-NEXT:    i32.const $push74=, -1
+; NO-SIMD128-NEXT:    i32.xor $push26=, $10, $pop74
+; NO-SIMD128-NEXT:    i32.and $push27=, $pop26, $42
+; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-NEXT:    i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT:    i32.and $push29=, $9, $25
+; NO-SIMD128-NEXT:    i32.const $push73=, -1
+; NO-SIMD128-NEXT:    i32.xor $push30=, $9, $pop73
+; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $41
+; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT:    i32.and $push33=, $8, $24
+; NO-SIMD128-NEXT:    i32.const $push72=, -1
+; NO-SIMD128-NEXT:    i32.xor $push34=, $8, $pop72
+; NO-SIMD128-NEXT:    i32.and $push35=, $pop34, $40
+; NO-SIMD128-NEXT:    i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-NEXT:    i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT:    i32.and $push37=, $7, $23
+; NO-SIMD128-NEXT:    i32.const $push71=, -1
+; NO-SIMD128-NEXT:    i32.xor $push38=, $7, $pop71
+; NO-SIMD128-NEXT:    i32.and $push39=, $pop38, $39
+; NO-SIMD128-NEXT:    i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-NEXT:    i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT:    i32.and $push41=, $6, $22
+; NO-SIMD128-NEXT:    i32.const $push70=, -1
+; NO-SIMD128-NEXT:    i32.xor $push42=, $6, $pop70
+; NO-SIMD128-NEXT:    i32.and $push43=, $pop42, $38
+; NO-SIMD128-NEXT:    i32.or $push44=, $pop41, $pop43
+; NO-SIMD128-NEXT:    i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT:    i32.and $push45=, $5, $21
+; NO-SIMD128-NEXT:    i32.const $push69=, -1
+; NO-SIMD128-NEXT:    i32.xor $push46=, $5, $pop69
+; NO-SIMD128-NEXT:    i32.and $push47=, $pop46, $37
+; NO-SIMD128-NEXT:    i32.or $push48=, $pop45, $pop47
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT:    i32.and $push49=, $4, $20
+; NO-SIMD128-NEXT:    i32.const $push68=, -1
+; NO-SIMD128-NEXT:    i32.xor $push50=, $4, $pop68
+; NO-SIMD128-NEXT:    i32.and $push51=, $pop50, $36
+; NO-SIMD128-NEXT:    i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-NEXT:    i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT:    i32.and $push53=, $3, $19
+; NO-SIMD128-NEXT:    i32.const $push67=, -1
+; NO-SIMD128-NEXT:    i32.xor $push54=, $3, $pop67
+; NO-SIMD128-NEXT:    i32.and $push55=, $pop54, $35
+; NO-SIMD128-NEXT:    i32.or $push56=, $pop53, $pop55
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT:    i32.and $push57=, $2, $18
+; NO-SIMD128-NEXT:    i32.const $push66=, -1
+; NO-SIMD128-NEXT:    i32.xor $push58=, $2, $pop66
+; NO-SIMD128-NEXT:    i32.and $push59=, $pop58, $34
+; NO-SIMD128-NEXT:    i32.or $push60=, $pop57, $pop59
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT:    i32.and $push61=, $1, $17
+; NO-SIMD128-NEXT:    i32.const $push65=, -1
+; NO-SIMD128-NEXT:    i32.xor $push62=, $1, $pop65
+; NO-SIMD128-NEXT:    i32.and $push63=, $pop62, $33
+; NO-SIMD128-NEXT:    i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop64
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
 ; NO-SIMD128-FAST:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $17, $33
-; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $33
-; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $18, $34
-; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $34
-; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $19, $35
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
-; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $35
-; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $20, $36
-; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
-; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $36
-; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop11
-; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $21, $37
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $pop13, $37
-; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop14
-; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $22, $38
-; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $pop15, $6
-; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $pop16, $38
-; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop17
-; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $23, $39
-; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $7
-; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $pop19, $39
-; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop20
-; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $24, $40
-; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $pop22, $40
-; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop23
-; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $25, $41
-; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $9
-; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $pop25, $41
-; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT:    i32.xor $push27=, $26, $42
-; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $10
-; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $pop28, $42
-; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop29
-; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $27, $43
-; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $11
-; NO-SIMD128-FAST-NEXT:    i32.xor $push32=, $pop31, $43
-; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop32
-; NO-SIMD128-FAST-NEXT:    i32.xor $push33=, $28, $44
-; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $pop33, $12
-; NO-SIMD128-FAST-NEXT:    i32.xor $push35=, $pop34, $44
-; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop35
-; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $29, $45
-; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $pop36, $13
-; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $pop37, $45
-; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop38
-; NO-SIMD128-FAST-NEXT:    i32.xor $push39=, $30, $46
-; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $pop39, $14
-; NO-SIMD128-FAST-NEXT:    i32.xor $push41=, $pop40, $46
-; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop41
-; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $31, $47
-; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $15
-; NO-SIMD128-FAST-NEXT:    i32.xor $push44=, $pop43, $47
-; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop44
-; NO-SIMD128-FAST-NEXT:    i32.xor $push45=, $32, $48
-; NO-SIMD128-FAST-NEXT:    i32.and $push46=, $pop45, $16
-; NO-SIMD128-FAST-NEXT:    i32.xor $push47=, $pop46, $48
-; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop47
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $33
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.const $push79=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $34
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.const $push78=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop78
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $35
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.const $push77=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $36
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.const $push76=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $5, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $37
+; NO-SIMD128-FAST-NEXT:    i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.const $push75=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $6, $pop75
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $38
+; NO-SIMD128-FAST-NEXT:    i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.const $push74=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $7, $pop74
+; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $pop26, $39
+; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.const $push73=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $8, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $40
+; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.const $push72=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push34=, $9, $pop72
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $pop34, $41
+; NO-SIMD128-FAST-NEXT:    i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.const $push71=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $10, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.and $push39=, $pop38, $42
+; NO-SIMD128-FAST-NEXT:    i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.const $push70=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $11, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $43
+; NO-SIMD128-FAST-NEXT:    i32.or $push44=, $pop41, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.and $push45=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push46=, $12, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $pop46, $44
+; NO-SIMD128-FAST-NEXT:    i32.or $push48=, $pop45, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT:    i32.and $push49=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push50=, $13, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $pop50, $45
+; NO-SIMD128-FAST-NEXT:    i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push54=, $14, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.and $push55=, $pop54, $46
+; NO-SIMD128-FAST-NEXT:    i32.or $push56=, $pop53, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push58=, $15, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.and $push59=, $pop58, $47
+; NO-SIMD128-FAST-NEXT:    i32.or $push60=, $pop57, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT:    i32.and $push61=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push62=, $16, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $pop62, $48
+; NO-SIMD128-FAST-NEXT:    i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop64
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <16 x i8> %c, %v1
   %inv_mask = xor <16 x i8> %c,
@@ -7482,75 +7546,107 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v8i16:
 ; NO-SIMD128:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.xor $push0=, $16, $24
-; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $8
-; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $24
-; NO-SIMD128-NEXT:    i32.store16 14($0), $pop2
-; NO-SIMD128-NEXT:    i32.xor $push3=, $15, $23
-; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $7
-; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $23
-; NO-SIMD128-NEXT:    i32.store16 12($0), $pop5
-; NO-SIMD128-NEXT:    i32.xor $push6=, $14, $22
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $6
-; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $22
-; NO-SIMD128-NEXT:    i32.store16 10($0), $pop8
-; NO-SIMD128-NEXT:    i32.xor $push9=, $13, $21
-; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $5
-; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $21
-; NO-SIMD128-NEXT:    i32.store16 8($0), $pop11
-; NO-SIMD128-NEXT:    i32.xor $push12=, $12, $20
-; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $4
-; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $20
-; NO-SIMD128-NEXT:    i32.store16 6($0), $pop14
-; NO-SIMD128-NEXT:    i32.xor $push15=, $11, $19
-; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $3
-; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $19
-; NO-SIMD128-NEXT:    i32.store16 4($0), $pop17
-; NO-SIMD128-NEXT:    i32.xor $push18=, $10, $18
-; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $2
-; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $18
-; NO-SIMD128-NEXT:    i32.store16 2($0), $pop20
-; NO-SIMD128-NEXT:    i32.xor $push21=, $9, $17
-; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $1
-; NO-SIMD128-NEXT:    i32.xor $push23=, $pop22, $17
-; NO-SIMD128-NEXT:    i32.store16 0($0), $pop23
+; NO-SIMD128-NEXT:    i32.and $push0=, $16, $8
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $8, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $24, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-NEXT:    i32.store16 14($0), $pop4
+; NO-SIMD128-NEXT:    i32.and $push5=, $15, $7
+; NO-SIMD128-NEXT:    i32.const $push39=, -1
+; NO-SIMD128-NEXT:    i32.xor $push6=, $7, $pop39
+; NO-SIMD128-NEXT:    i32.and $push7=, $23, $pop6
+; NO-SIMD128-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT:    i32.store16 12($0), $pop8
+; NO-SIMD128-NEXT:    i32.and $push9=, $14, $6
+; NO-SIMD128-NEXT:    i32.const $push38=, -1
+; NO-SIMD128-NEXT:    i32.xor $push10=, $6, $pop38
+; NO-SIMD128-NEXT:    i32.and $push11=, $22, $pop10
+; NO-SIMD128-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT:    i32.store16 10($0), $pop12
+; NO-SIMD128-NEXT:    i32.and $push13=, $13, $5
+; NO-SIMD128-NEXT:    i32.const $push37=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $5, $pop37
+; NO-SIMD128-NEXT:    i32.and $push15=, $21, $pop14
+; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT:    i32.and $push17=, $12, $4
+; NO-SIMD128-NEXT:    i32.const $push36=, -1
+; NO-SIMD128-NEXT:    i32.xor $push18=, $4, $pop36
+; NO-SIMD128-NEXT:    i32.and $push19=, $20, $pop18
+; NO-SIMD128-NEXT:    i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT:    i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT:    i32.and $push21=, $11, $3
+; NO-SIMD128-NEXT:    i32.const $push35=, -1
+; NO-SIMD128-NEXT:    i32.xor $push22=, $3, $pop35
+; NO-SIMD128-NEXT:    i32.and $push23=, $19, $pop22
+; NO-SIMD128-NEXT:    i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT:    i32.and $push25=, $10, $2
+; NO-SIMD128-NEXT:    i32.const $push34=, -1
+; NO-SIMD128-NEXT:    i32.xor $push26=, $2, $pop34
+; NO-SIMD128-NEXT:    i32.and $push27=, $18, $pop26
+; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT:    i32.and $push29=, $9, $1
+; NO-SIMD128-NEXT:    i32.const $push33=, -1
+; NO-SIMD128-NEXT:    i32.xor $push30=, $1, $pop33
+; NO-SIMD128-NEXT:    i32.and $push31=, $17, $pop30
+; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop32
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
 ; NO-SIMD128-FAST:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $9, $17
-; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $17
-; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $10, $18
-; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $18
-; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $11, $19
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
-; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $19
-; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $12, $20
-; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
-; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $20
-; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop11
-; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $13, $21
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $pop13, $21
-; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop14
-; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $14, $22
-; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $pop15, $6
-; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $pop16, $22
-; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop17
-; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $15, $23
-; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $7
-; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $pop19, $23
-; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop20
-; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $16, $24
-; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $pop22, $24
-; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $9, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $17, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $10, $2
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $18, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $11, $3
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $19, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $12, $4
+; NO-SIMD128-FAST-NEXT:    i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $20, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $13, $5
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $5, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $21, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $14, $6
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $6, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $22, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $15, $7
+; NO-SIMD128-FAST-NEXT:    i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $7, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $23, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $16, $8
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $8, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $24, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop32
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <8 x i16> %v1, %c
   %inv_mask = xor <8 x i16>
@@ -9357,43 +9453,59 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v4i32:
 ; NO-SIMD128:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.xor $push0=, $8, $12
-; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $4
-; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $12
-; NO-SIMD128-NEXT:    i32.store 12($0), $pop2
-; NO-SIMD128-NEXT:    i32.xor $push3=, $7, $11
-; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $3
-; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $11
-; NO-SIMD128-NEXT:    i32.store 8($0), $pop5
-; NO-SIMD128-NEXT:    i32.xor $push6=, $6, $10
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $2
-; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $10
-; NO-SIMD128-NEXT:    i32.store 4($0), $pop8
-; NO-SIMD128-NEXT:    i32.xor $push9=, $5, $9
-; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $1
-; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $9
-; NO-SIMD128-NEXT:    i32.store 0($0), $pop11
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $4, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $12
+; NO-SIMD128-NEXT:    i32.and $push0=, $4, $8
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop3, $pop0
+; NO-SIMD128-NEXT:    i32.store 12($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-NEXT:    i32.xor $push6=, $3, $pop19
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $11
+; NO-SIMD128-NEXT:    i32.and $push5=, $3, $7
+; NO-SIMD128-NEXT:    i32.or $push8=, $pop7, $pop5
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push18=, -1
+; NO-SIMD128-NEXT:    i32.xor $push10=, $2, $pop18
+; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $10
+; NO-SIMD128-NEXT:    i32.and $push9=, $2, $6
+; NO-SIMD128-NEXT:    i32.or $push12=, $pop11, $pop9
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop12
+; NO-SIMD128-NEXT:    i32.const $push17=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $1, $pop17
+; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $9
+; NO-SIMD128-NEXT:    i32.and $push13=, $1, $5
+; NO-SIMD128-NEXT:    i32.or $push16=, $pop15, $pop13
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop16
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
 ; NO-SIMD128-FAST:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $5, $9
-; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $9
-; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $6, $10
-; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $10
-; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $7, $11
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
-; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $11
-; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $8, $12
-; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
-; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $12
-; NO-SIMD128-FAST-NEXT:    i32.store 12($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $9
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop3, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $10
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop7, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $11
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop11, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $12
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop15, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.store 12($0), $pop16
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <4 x i32> %c, %v1
   %inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
@@ -10862,27 +10974,35 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v2i64:
 ; NO-SIMD128:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i64.xor $push0=, $4, $6
-; NO-SIMD128-NEXT:    i64.and $push1=, $pop0, $2
-; NO-SIMD128-NEXT:    i64.xor $push2=, $pop1, $6
-; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
-; NO-SIMD128-NEXT:    i64.xor $push3=, $3, $5
-; NO-SIMD128-NEXT:    i64.and $push4=, $pop3, $1
-; NO-SIMD128-NEXT:    i64.xor $push5=, $pop4, $5
-; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
+; NO-SIMD128-NEXT:    i64.const $push1=, -1
+; NO-SIMD128-NEXT:    i64.xor $push2=, $2, $pop1
+; NO-SIMD128-NEXT:    i64.and $push3=, $6, $pop2
+; NO-SIMD128-NEXT:    i64.and $push0=, $4, $2
+; NO-SIMD128-NEXT:    i64.or $push4=, $pop3, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop4
+; NO-SIMD128-NEXT:    i64.const $push9=, -1
+; NO-SIMD128-NEXT:    i64.xor $push6=, $1, $pop9
+; NO-SIMD128-NEXT:    i64.and $push7=, $5, $pop6
+; NO-SIMD128-NEXT:    i64.and $push5=, $3, $1
+; NO-SIMD128-NEXT:    i64.or $push8=, $pop7, $pop5
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop8
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v2i64:
 ; NO-SIMD128-FAST:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i64.xor $push0=, $3, $5
-; NO-SIMD128-FAST-NEXT:    i64.and $push1=, $pop0, $1
-; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $pop1, $5
-; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT:    i64.xor $push3=, $4, $6
-; NO-SIMD128-FAST-NEXT:    i64.and $push4=, $pop3, $2
-; NO-SIMD128-FAST-NEXT:    i64.xor $push5=, $pop4, $6
-; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i64.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i64.and $push3=, $5, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.and $push0=, $3, $1
+; NO-SIMD128-FAST-NEXT:    i64.or $push4=, $pop3, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i64.const $push9=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push6=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    i64.and $push7=, $6, $pop6
+; NO-SIMD128-FAST-NEXT:    i64.and $push5=, $4, $2
+; NO-SIMD128-FAST-NEXT:    i64.or $push8=, $pop7, $pop5
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop8
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <2 x i64> %v1, %c
   %inv_mask = xor <2 x i64> <i64 -1, i64 -1>, %c
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 4fc0827ac4dd6..2922113b14ea9 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
 
 ; PR46472
 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m))
@@ -17,22 +17,14 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 ; X86-NEXT:    xorb %cl, %al
 ; X86-NEXT:    retl
 ;
-; X64-NOBMI-LABEL: bitselect_i8:
-; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl %esi, %eax
-; X64-NOBMI-NEXT:    xorl %edi, %eax
-; X64-NOBMI-NEXT:    andl %edx, %eax
-; X64-NOBMI-NEXT:    xorl %edi, %eax
-; X64-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NOBMI-NEXT:    retq
-;
-; X64-BMI-LABEL: bitselect_i8:
-; X64-BMI:       # %bb.0:
-; X64-BMI-NEXT:    andnl %edi, %edx, %eax
-; X64-BMI-NEXT:    andl %edx, %esi
-; X64-BMI-NEXT:    orl %esi, %eax
-; X64-BMI-NEXT:    # kill: def $al killed $al killed $eax
-; X64-BMI-NEXT:    retq
+; X64-LABEL: bitselect_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    andl %edx, %esi
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    notb %al
+; X64-NEXT:    andb %dil, %al
+; X64-NEXT:    orb %sil, %al
+; X64-NEXT:    retq
   %not = xor i8 %m, -1
   %ma = and i8 %a, %not
   %mb = and i8 %b, %m
@@ -43,20 +35,21 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind {
 ; X86-LABEL: bitselect_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorw %cx, %ax
-; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorw %ax, %cx
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bitselect_i16:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl %esi, %eax
-; X64-NOBMI-NEXT:    xorl %edi, %eax
-; X64-NOBMI-NEXT:    andl %edx, %eax
-; X64-NOBMI-NEXT:    xorl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edx, %eax
+; X64-NOBMI-NEXT:    andl %edx, %esi
+; X64-NOBMI-NEXT:    notl %eax
+; X64-NOBMI-NEXT:    andl %edi, %eax
+; X64-NOBMI-NEXT:    orl %esi, %eax
 ; X64-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NOBMI-NEXT:    retq
 ;
@@ -193,12 +186,13 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
 ;
 ; X64-BMI-LABEL: bitselect_i128:
 ; X64-BMI:       # %bb.0:
+; X64-BMI-NEXT:    andnq %rsi, %r9, %rsi
 ; X64-BMI-NEXT:    andnq %rdi, %r8, %rax
+; X64-BMI-NEXT:    andq %r9, %rcx
+; X64-BMI-NEXT:    orq %rcx, %rsi
 ; X64-BMI-NEXT:    andq %r8, %rdx
 ; X64-BMI-NEXT:    orq %rdx, %rax
-; X64-BMI-NEXT:    andnq %rsi, %r9, %rdx
-; X64-BMI-NEXT:    andq %r9, %rcx
-; X64-BMI-NEXT:    orq %rcx, %rdx
+; X64-BMI-NEXT:    movq %rsi, %rdx
 ; X64-BMI-NEXT:    retq
   %not = xor i128 %m, -1
   %ma = and i128 %a, %not
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index 4a4eecbdfb3f3..b2614c5fe0493 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -30,17 +30,18 @@ define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
 define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 ; NOBMI-LABEL: masked_merge1:
 ; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movl %esi, %eax
-; NOBMI-NEXT:    xorl %edx, %eax
-; NOBMI-NEXT:    andl %edi, %eax
-; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    movl %edi, %eax
+; NOBMI-NEXT:    andl %edi, %esi
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    andl %edx, %eax
+; NOBMI-NEXT:    orl %esi, %eax
 ; NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: masked_merge1:
 ; BMI:       # %bb.0:
-; BMI-NEXT:    andnl %edx, %edi, %eax
 ; BMI-NEXT:    andl %edi, %esi
+; BMI-NEXT:    andnl %edx, %edi, %eax
 ; BMI-NEXT:    orl %esi, %eax
 ; BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; BMI-NEXT:    retq
@@ -52,11 +53,20 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 }
 
 define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; CHECK-LABEL: masked_merge2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; NOBMI-LABEL: masked_merge2:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    # kill: def $al killed $al killed $eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: masked_merge2:
+; BMI:       # %bb.0:
+; BMI-NEXT:    movl %edi, %eax
+; BMI-NEXT:    notb %al
+; BMI-NEXT:    andb %sil, %al
+; BMI-NEXT:    andb %dil, %sil
+; BMI-NEXT:    orb %sil, %al
+; BMI-NEXT:    retq
   %not = xor i8 %a0, -1
   %and0 = and i8 %not, %a1
   %and1 = and i8 %a1, %a0
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 6a55d740fe421..9c9d06921096c 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -6,18 +6,21 @@
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: out8:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    notb %al
+; CHECK-NOBMI-NEXT:    andb %sil, %al
+; CHECK-NOBMI-NEXT:    orb %dil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
+; CHECK-BMI-NEXT:    movl %edx, %eax
 ; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    orl %edi, %eax
+; CHECK-BMI-NEXT:    notb %al
+; CHECK-BMI-NEXT:    andb %sil, %al
+; CHECK-BMI-NEXT:    orb %dil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, %mask
@@ -30,17 +33,18 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-NOBMI-LABEL: out16:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    notl %eax
+; CHECK-NOBMI-NEXT:    andl %esi, %eax
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
 ; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out16:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
 ; CHECK-BMI-NEXT:    andl %edx, %edi
+; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
 ; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index 809c15881cc9b..b1194bedc4e1c 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -16,10 +16,11 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    andl %edx, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notb %al
+; CHECK-NEXT:    andb %sil, %al
+; CHECK-NEXT:    orb %dil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i8> %x, %mask
@@ -36,28 +37,32 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %edi, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    notb %r9b
+; CHECK-BASELINE-NEXT:    andb %cl, %r9b
+; CHECK-BASELINE-NEXT:    andb %dl, %al
+; CHECK-BASELINE-NEXT:    orb %dil, %al
+; CHECK-BASELINE-NEXT:    orb %sil, %r9b
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-BASELINE-NEXT:    movl %esi, %edx
+; CHECK-BASELINE-NEXT:    movl %r9d, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i8:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %edi, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    notb %r9b
+; CHECK-SSE1-NEXT:    andb %cl, %r9b
+; CHECK-SSE1-NEXT:    andb %dl, %al
+; CHECK-SSE1-NEXT:    orb %dil, %al
+; CHECK-SSE1-NEXT:    orb %sil, %r9b
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-SSE1-NEXT:    movl %esi, %edx
+; CHECK-SSE1-NEXT:    movl %r9d, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i8:
@@ -81,10 +86,11 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    andl %edx, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    orl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i16> %x, %mask
@@ -229,28 +235,32 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i16:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %edi, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    notl %r9d
+; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
+; CHECK-BASELINE-NEXT:    orl %esi, %r9d
+; CHECK-BASELINE-NEXT:    andl %edx, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %eax
 ; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-BASELINE-NEXT:    movl %esi, %edx
+; CHECK-BASELINE-NEXT:    movl %r9d, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i16:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %edi, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    notl %eax
+; CHECK-SSE1-NEXT:    notl %r9d
+; CHECK-SSE1-NEXT:    andl %ecx, %r9d
+; CHECK-SSE1-NEXT:    orl %esi, %r9d
+; CHECK-SSE1-NEXT:    andl %edx, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %eax
 ; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-SSE1-NEXT:    movl %esi, %edx
+; CHECK-SSE1-NEXT:    movl %r9d, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i16:
@@ -429,12 +439,9 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-BASELINE-LABEL: out_v4i16:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
@@ -444,21 +451,21 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
-; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-SSE1-NEXT:    xorl %r11d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %edx
@@ -468,10 +475,13 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
-; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16:
@@ -496,43 +506,43 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 ; CHECK-BASELINE-LABEL: out_v4i16_undef:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
-; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16_undef:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
-; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16_undef:
@@ -873,14 +883,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
@@ -896,16 +906,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    xorw %r11w, %bx
+; CHECK-BASELINE-NEXT:    movl %r11d, %ebx
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
+; CHECK-BASELINE-NEXT:    movl %r10d, %r11d
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    xorw %di, %r10w
+; CHECK-BASELINE-NEXT:    movl %edi, %r10d
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r10d
 ; CHECK-BASELINE-NEXT:    movw %r10w, 14(%rax)
@@ -931,14 +941,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-SSE1-NEXT:    xorl %r12d, %esi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-SSE1-NEXT:    xorl %r12d, %esi
@@ -954,16 +964,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
 ; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    xorw %r11w, %bx
+; CHECK-SSE1-NEXT:    movl %r11d, %ebx
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
+; CHECK-SSE1-NEXT:    movl %r10d, %r11d
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-SSE1-NEXT:    xorl %r10d, %r11d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    xorw %di, %r10w
+; CHECK-SSE1-NEXT:    movl %edi, %r10d
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r10d
 ; CHECK-SSE1-NEXT:    movw %r10w, 14(%rax)
@@ -1749,117 +1759,113 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movq %rcx, %r10
-; CHECK-BASELINE-NEXT:    movq %rdx, %r8
-; CHECK-BASELINE-NEXT:    movq %rsi, %r9
-; CHECK-BASELINE-NEXT:    movq %rdi, %r11
-; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %ebp
-; CHECK-BASELINE-NEXT:    movl 16(%rdx), %r15d
-; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %r13d
-; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
-; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r14d
-; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebx
-; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl (%rdx), %ecx
-; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edx
-; CHECK-BASELINE-NEXT:    movzwl 2(%r8), %esi
-; CHECK-BASELINE-NEXT:    movzwl (%r9), %edi
-; CHECK-BASELINE-NEXT:    xorw %cx, %di
-; CHECK-BASELINE-NEXT:    andw (%r10), %di
-; CHECK-BASELINE-NEXT:    xorl %ecx, %edi
-; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 2(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorw %si, %cx
-; CHECK-BASELINE-NEXT:    andw 2(%r10), %cx
-; CHECK-BASELINE-NEXT:    xorl %esi, %ecx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 4(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorw %dx, %cx
-; CHECK-BASELINE-NEXT:    andw 4(%r10), %cx
-; CHECK-BASELINE-NEXT:    xorl %edx, %ecx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 6(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorw %ax, %cx
-; CHECK-BASELINE-NEXT:    andw 6(%r10), %cx
-; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 8(%r9), %eax
+; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movzwl 16(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movzwl 12(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movzwl 8(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movzwl 4(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movzwl (%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %r12d
+; CHECK-BASELINE-NEXT:    movzwl (%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
+; CHECK-BASELINE-NEXT:    andw (%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
+; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
+; CHECK-BASELINE-NEXT:    andw 2(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
+; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
+; CHECK-BASELINE-NEXT:    andw 4(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
+; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r10w, %ax
+; CHECK-BASELINE-NEXT:    andw 6(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r10d
+; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r11w, %ax
+; CHECK-BASELINE-NEXT:    andw 8(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r11d
+; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
+; CHECK-BASELINE-NEXT:    andw 10(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
+; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorw %bx, %ax
-; CHECK-BASELINE-NEXT:    andw 8(%r10), %ax
-; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 10(%r9), %ebx
-; CHECK-BASELINE-NEXT:    xorw %r14w, %bx
-; CHECK-BASELINE-NEXT:    andw 10(%r10), %bx
-; CHECK-BASELINE-NEXT:    xorl %r14d, %ebx
-; CHECK-BASELINE-NEXT:    movzwl 12(%r9), %r14d
-; CHECK-BASELINE-NEXT:    xorw %r12w, %r14w
-; CHECK-BASELINE-NEXT:    andw 12(%r10), %r14w
-; CHECK-BASELINE-NEXT:    xorl %r12d, %r14d
-; CHECK-BASELINE-NEXT:    movzwl 14(%r9), %r12d
-; CHECK-BASELINE-NEXT:    xorw %r13w, %r12w
-; CHECK-BASELINE-NEXT:    andw 14(%r10), %r12w
-; CHECK-BASELINE-NEXT:    xorl %r13d, %r12d
-; CHECK-BASELINE-NEXT:    movzwl 16(%r9), %r13d
-; CHECK-BASELINE-NEXT:    xorw %r15w, %r13w
-; CHECK-BASELINE-NEXT:    andw 16(%r10), %r13w
-; CHECK-BASELINE-NEXT:    xorl %r15d, %r13d
-; CHECK-BASELINE-NEXT:    movzwl 18(%r9), %r15d
-; CHECK-BASELINE-NEXT:    xorw %bp, %r15w
-; CHECK-BASELINE-NEXT:    andw 18(%r10), %r15w
-; CHECK-BASELINE-NEXT:    xorl %ebp, %r15d
-; CHECK-BASELINE-NEXT:    movl 20(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzwl 20(%r9), %ebp
-; CHECK-BASELINE-NEXT:    xorw %ax, %bp
-; CHECK-BASELINE-NEXT:    andw 20(%r10), %bp
+; CHECK-BASELINE-NEXT:    andw 12(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
+; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %bp, %ax
+; CHECK-BASELINE-NEXT:    andw 14(%rcx), %ax
 ; CHECK-BASELINE-NEXT:    xorl %eax, %ebp
-; CHECK-BASELINE-NEXT:    movzwl 22(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzwl 22(%r9), %esi
-; CHECK-BASELINE-NEXT:    xorw %ax, %si
-; CHECK-BASELINE-NEXT:    andw 22(%r10), %si
-; CHECK-BASELINE-NEXT:    xorl %eax, %esi
-; CHECK-BASELINE-NEXT:    movl 24(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzwl 24(%r9), %edx
-; CHECK-BASELINE-NEXT:    xorw %ax, %dx
-; CHECK-BASELINE-NEXT:    andw 24(%r10), %dx
-; CHECK-BASELINE-NEXT:    xorl %eax, %edx
-; CHECK-BASELINE-NEXT:    movzwl 26(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzwl 26(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorw %ax, %cx
-; CHECK-BASELINE-NEXT:    andw 26(%r10), %cx
-; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
-; CHECK-BASELINE-NEXT:    movl 28(%r8), %edi
-; CHECK-BASELINE-NEXT:    movzwl 28(%r9), %eax
-; CHECK-BASELINE-NEXT:    xorw %di, %ax
-; CHECK-BASELINE-NEXT:    andw 28(%r10), %ax
-; CHECK-BASELINE-NEXT:    xorl %edi, %eax
-; CHECK-BASELINE-NEXT:    movzwl 30(%r8), %edi
-; CHECK-BASELINE-NEXT:    movzwl 30(%r9), %r8d
-; CHECK-BASELINE-NEXT:    xorw %di, %r8w
-; CHECK-BASELINE-NEXT:    andw 30(%r10), %r8w
-; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
-; CHECK-BASELINE-NEXT:    movw %r8w, 30(%r11)
-; CHECK-BASELINE-NEXT:    movw %ax, 28(%r11)
-; CHECK-BASELINE-NEXT:    movw %cx, 26(%r11)
-; CHECK-BASELINE-NEXT:    movw %dx, 24(%r11)
-; CHECK-BASELINE-NEXT:    movw %si, 22(%r11)
-; CHECK-BASELINE-NEXT:    movw %bp, 20(%r11)
-; CHECK-BASELINE-NEXT:    movw %r15w, 18(%r11)
-; CHECK-BASELINE-NEXT:    movw %r13w, 16(%r11)
-; CHECK-BASELINE-NEXT:    movw %r12w, 14(%r11)
-; CHECK-BASELINE-NEXT:    movw %r14w, 12(%r11)
-; CHECK-BASELINE-NEXT:    movw %bx, 10(%r11)
+; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
+; CHECK-BASELINE-NEXT:    andw 16(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r14d
+; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
+; CHECK-BASELINE-NEXT:    andw 18(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r15d
+; CHECK-BASELINE-NEXT:    movzwl 20(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
+; CHECK-BASELINE-NEXT:    andw 20(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
+; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
+; CHECK-BASELINE-NEXT:    andw 22(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
+; CHECK-BASELINE-NEXT:    movzwl 24(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
+; CHECK-BASELINE-NEXT:    andw 24(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
+; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r10d
+; CHECK-BASELINE-NEXT:    xorw %ax, %r10w
+; CHECK-BASELINE-NEXT:    andw 26(%rcx), %r10w
+; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
+; CHECK-BASELINE-NEXT:    movzwl 28(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
+; CHECK-BASELINE-NEXT:    andw 28(%rcx), %r11w
+; CHECK-BASELINE-NEXT:    xorl %r11d, %r10d
+; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edx
+; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorw %dx, %si
+; CHECK-BASELINE-NEXT:    andw 30(%rcx), %si
+; CHECK-BASELINE-NEXT:    xorl %esi, %edx
+; CHECK-BASELINE-NEXT:    movw %dx, 30(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r10w, 28(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 26(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r8w, 24(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r9w, 22(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r13w, 20(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r15w, 18(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r14w, 16(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bp, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bx, 12(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 8(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 10(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 6(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 8(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 4(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 2(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r12w, 2(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, (%r11)
-; CHECK-BASELINE-NEXT:    movq %r11, %rax
+; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -1876,117 +1882,113 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movq %rcx, %r10
-; CHECK-SSE1-NEXT:    movq %rdx, %r8
-; CHECK-SSE1-NEXT:    movq %rsi, %r9
-; CHECK-SSE1-NEXT:    movq %rdi, %r11
-; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %ebp
-; CHECK-SSE1-NEXT:    movl 16(%rdx), %r15d
-; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %r13d
-; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
-; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r14d
-; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebx
-; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %eax
-; CHECK-SSE1-NEXT:    movl (%rdx), %ecx
-; CHECK-SSE1-NEXT:    movl 4(%rdx), %edx
-; CHECK-SSE1-NEXT:    movzwl 2(%r8), %esi
-; CHECK-SSE1-NEXT:    movzwl (%r9), %edi
-; CHECK-SSE1-NEXT:    xorw %cx, %di
-; CHECK-SSE1-NEXT:    andw (%r10), %di
-; CHECK-SSE1-NEXT:    xorl %ecx, %edi
-; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 2(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorw %si, %cx
-; CHECK-SSE1-NEXT:    andw 2(%r10), %cx
-; CHECK-SSE1-NEXT:    xorl %esi, %ecx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 4(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorw %dx, %cx
-; CHECK-SSE1-NEXT:    andw 4(%r10), %cx
-; CHECK-SSE1-NEXT:    xorl %edx, %ecx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 6(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorw %ax, %cx
-; CHECK-SSE1-NEXT:    andw 6(%r10), %cx
-; CHECK-SSE1-NEXT:    xorl %eax, %ecx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 8(%r9), %eax
+; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movzwl 16(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movzwl 12(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movzwl 8(%rdx), %r11d
+; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movzwl 4(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movzwl (%rdx), %r8d
+; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %r12d
+; CHECK-SSE1-NEXT:    movzwl (%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r8w, %ax
+; CHECK-SSE1-NEXT:    andw (%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r8d
+; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r12w, %ax
+; CHECK-SSE1-NEXT:    andw 2(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r12d
+; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r9w, %ax
+; CHECK-SSE1-NEXT:    andw 4(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r9d
+; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r10w, %ax
+; CHECK-SSE1-NEXT:    andw 6(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r10d
+; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r11w, %ax
+; CHECK-SSE1-NEXT:    andw 8(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r11d
+; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r13w, %ax
+; CHECK-SSE1-NEXT:    andw 10(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r13d
+; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorw %bx, %ax
-; CHECK-SSE1-NEXT:    andw 8(%r10), %ax
-; CHECK-SSE1-NEXT:    xorl %ebx, %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 10(%r9), %ebx
-; CHECK-SSE1-NEXT:    xorw %r14w, %bx
-; CHECK-SSE1-NEXT:    andw 10(%r10), %bx
-; CHECK-SSE1-NEXT:    xorl %r14d, %ebx
-; CHECK-SSE1-NEXT:    movzwl 12(%r9), %r14d
-; CHECK-SSE1-NEXT:    xorw %r12w, %r14w
-; CHECK-SSE1-NEXT:    andw 12(%r10), %r14w
-; CHECK-SSE1-NEXT:    xorl %r12d, %r14d
-; CHECK-SSE1-NEXT:    movzwl 14(%r9), %r12d
-; CHECK-SSE1-NEXT:    xorw %r13w, %r12w
-; CHECK-SSE1-NEXT:    andw 14(%r10), %r12w
-; CHECK-SSE1-NEXT:    xorl %r13d, %r12d
-; CHECK-SSE1-NEXT:    movzwl 16(%r9), %r13d
-; CHECK-SSE1-NEXT:    xorw %r15w, %r13w
-; CHECK-SSE1-NEXT:    andw 16(%r10), %r13w
-; CHECK-SSE1-NEXT:    xorl %r15d, %r13d
-; CHECK-SSE1-NEXT:    movzwl 18(%r9), %r15d
-; CHECK-SSE1-NEXT:    xorw %bp, %r15w
-; CHECK-SSE1-NEXT:    andw 18(%r10), %r15w
-; CHECK-SSE1-NEXT:    xorl %ebp, %r15d
-; CHECK-SSE1-NEXT:    movl 20(%r8), %eax
-; CHECK-SSE1-NEXT:    movzwl 20(%r9), %ebp
-; CHECK-SSE1-NEXT:    xorw %ax, %bp
-; CHECK-SSE1-NEXT:    andw 20(%r10), %bp
+; CHECK-SSE1-NEXT:    andw 12(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %ebx
+; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %bp, %ax
+; CHECK-SSE1-NEXT:    andw 14(%rcx), %ax
 ; CHECK-SSE1-NEXT:    xorl %eax, %ebp
-; CHECK-SSE1-NEXT:    movzwl 22(%r8), %eax
-; CHECK-SSE1-NEXT:    movzwl 22(%r9), %esi
-; CHECK-SSE1-NEXT:    xorw %ax, %si
-; CHECK-SSE1-NEXT:    andw 22(%r10), %si
-; CHECK-SSE1-NEXT:    xorl %eax, %esi
-; CHECK-SSE1-NEXT:    movl 24(%r8), %eax
-; CHECK-SSE1-NEXT:    movzwl 24(%r9), %edx
-; CHECK-SSE1-NEXT:    xorw %ax, %dx
-; CHECK-SSE1-NEXT:    andw 24(%r10), %dx
-; CHECK-SSE1-NEXT:    xorl %eax, %edx
-; CHECK-SSE1-NEXT:    movzwl 26(%r8), %eax
-; CHECK-SSE1-NEXT:    movzwl 26(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorw %ax, %cx
-; CHECK-SSE1-NEXT:    andw 26(%r10), %cx
-; CHECK-SSE1-NEXT:    xorl %eax, %ecx
-; CHECK-SSE1-NEXT:    movl 28(%r8), %edi
-; CHECK-SSE1-NEXT:    movzwl 28(%r9), %eax
-; CHECK-SSE1-NEXT:    xorw %di, %ax
-; CHECK-SSE1-NEXT:    andw 28(%r10), %ax
-; CHECK-SSE1-NEXT:    xorl %edi, %eax
-; CHECK-SSE1-NEXT:    movzwl 30(%r8), %edi
-; CHECK-SSE1-NEXT:    movzwl 30(%r9), %r8d
-; CHECK-SSE1-NEXT:    xorw %di, %r8w
-; CHECK-SSE1-NEXT:    andw 30(%r10), %r8w
-; CHECK-SSE1-NEXT:    xorl %edi, %r8d
-; CHECK-SSE1-NEXT:    movw %r8w, 30(%r11)
-; CHECK-SSE1-NEXT:    movw %ax, 28(%r11)
-; CHECK-SSE1-NEXT:    movw %cx, 26(%r11)
-; CHECK-SSE1-NEXT:    movw %dx, 24(%r11)
-; CHECK-SSE1-NEXT:    movw %si, 22(%r11)
-; CHECK-SSE1-NEXT:    movw %bp, 20(%r11)
-; CHECK-SSE1-NEXT:    movw %r15w, 18(%r11)
-; CHECK-SSE1-NEXT:    movw %r13w, 16(%r11)
-; CHECK-SSE1-NEXT:    movw %r12w, 14(%r11)
-; CHECK-SSE1-NEXT:    movw %r14w, 12(%r11)
-; CHECK-SSE1-NEXT:    movw %bx, 10(%r11)
+; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r14w, %ax
+; CHECK-SSE1-NEXT:    andw 16(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r14d
+; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r15w, %ax
+; CHECK-SSE1-NEXT:    andw 18(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r15d
+; CHECK-SSE1-NEXT:    movzwl 20(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r13w, %ax
+; CHECK-SSE1-NEXT:    andw 20(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r13d
+; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r9w, %ax
+; CHECK-SSE1-NEXT:    andw 22(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r9d
+; CHECK-SSE1-NEXT:    movzwl 24(%rdx), %r8d
+; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r8w, %ax
+; CHECK-SSE1-NEXT:    andw 24(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r8d
+; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
+; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r10d
+; CHECK-SSE1-NEXT:    xorw %ax, %r10w
+; CHECK-SSE1-NEXT:    andw 26(%rcx), %r10w
+; CHECK-SSE1-NEXT:    xorl %r10d, %eax
+; CHECK-SSE1-NEXT:    movzwl 28(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
+; CHECK-SSE1-NEXT:    andw 28(%rcx), %r11w
+; CHECK-SSE1-NEXT:    xorl %r11d, %r10d
+; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edx
+; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-SSE1-NEXT:    xorw %dx, %si
+; CHECK-SSE1-NEXT:    andw 30(%rcx), %si
+; CHECK-SSE1-NEXT:    xorl %esi, %edx
+; CHECK-SSE1-NEXT:    movw %dx, 30(%rdi)
+; CHECK-SSE1-NEXT:    movw %r10w, 28(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 26(%rdi)
+; CHECK-SSE1-NEXT:    movw %r8w, 24(%rdi)
+; CHECK-SSE1-NEXT:    movw %r9w, 22(%rdi)
+; CHECK-SSE1-NEXT:    movw %r13w, 20(%rdi)
+; CHECK-SSE1-NEXT:    movw %r15w, 18(%rdi)
+; CHECK-SSE1-NEXT:    movw %r14w, 16(%rdi)
+; CHECK-SSE1-NEXT:    movw %bp, 14(%rdi)
+; CHECK-SSE1-NEXT:    movw %bx, 12(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 8(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 10(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 6(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 8(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 4(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 2(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 4(%rdi)
+; CHECK-SSE1-NEXT:    movw %r12w, 2(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, (%r11)
-; CHECK-SSE1-NEXT:    movq %r11, %rax
+; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13

From 0a6463039da89914c7a0f99622fb7a008abde2fd Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Tue, 10 Jun 2025 19:48:09 -0700
Subject: [PATCH 024/851] [NFC] get rid of `undef` in avx512vl-intrinsics.ll
 test (#143641)

---
 llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 72 ++++++++++----------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 0973824fbb0ef..b408aac218108 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -46,7 +46,7 @@ define <2 x double> @test_compress_pd_128(<2 x double> %data) {
 ; CHECK-LABEL: test_compress_pd_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> undef, <2 x i1> <i1 true, i1 true>)
+  %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> poison, <2 x i1> <i1 true, i1 true>)
   ret <2 x double> %1
 }
 
@@ -94,7 +94,7 @@ define <4 x float> @test_compress_ps_128(<4 x float> %data) {
 ; CHECK-LABEL: test_compress_ps_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x float> %1
 }
 
@@ -142,7 +142,7 @@ define <2 x i64> @test_compress_q_128(<2 x i64> %data) {
 ; CHECK-LABEL: test_compress_q_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> undef, <2 x i1> <i1 true, i1 true>)
+  %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> poison, <2 x i1> <i1 true, i1 true>)
   ret <2 x i64> %1
 }
 
@@ -190,7 +190,7 @@ define <4 x i32> @test_compress_d_128(<4 x i32> %data) {
 ; CHECK-LABEL: test_compress_d_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x i32> %1
 }
 
@@ -198,7 +198,7 @@ define <2 x double> @test_expand_pd_128(<2 x double> %data) {
 ; CHECK-LABEL: test_expand_pd_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> undef, <2 x i1> <i1 true, i1 true>)
+  %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> poison, <2 x i1> <i1 true, i1 true>)
   ret <2 x double> %1
 }
 
@@ -246,7 +246,7 @@ define <4 x float> @test_expand_ps_128(<4 x float> %data) {
 ; CHECK-LABEL: test_expand_ps_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x float> %1
 }
 
@@ -294,7 +294,7 @@ define <2 x i64> @test_expand_q_128(<2 x i64> %data) {
 ; CHECK-LABEL: test_expand_q_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> undef, <2 x i1> <i1 true, i1 true>)
+  %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> poison, <2 x i1> <i1 true, i1 true>)
   ret <2 x i64> %1
 }
 
@@ -342,7 +342,7 @@ define <4 x i32> @test_expand_d_128(<4 x i32> %data) {
 ; CHECK-LABEL: test_expand_d_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x i32> %1
 }
 
@@ -430,7 +430,7 @@ define <4 x double> @test_compress_pd_256(<4 x double> %data) {
 ; CHECK-LABEL: test_compress_pd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x double> %1
 }
 
@@ -476,7 +476,7 @@ define <8 x float> @test_compress_ps_256(<8 x float> %data) {
 ; CHECK-LABEL: test_compress_ps_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> poison, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x float> %1
 }
 
@@ -524,7 +524,7 @@ define <4 x i64> @test_compress_q_256(<4 x i64> %data) {
 ; CHECK-LABEL: test_compress_q_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x i64> %1
 }
 
@@ -570,7 +570,7 @@ define <8 x i32> @test_compress_d_256(<8 x i32> %data) {
 ; CHECK-LABEL: test_compress_d_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> poison, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x i32> %1
 }
 
@@ -578,7 +578,7 @@ define <4 x double> @test_expand_pd_256(<4 x double> %data) {
 ; CHECK-LABEL: test_expand_pd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x double> %1
 }
 
@@ -626,7 +626,7 @@ define <8 x float> @test_expand_ps_256(<8 x float> %data) {
 ; CHECK-LABEL: test_expand_ps_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> poison, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x float> %1
 }
 
@@ -672,7 +672,7 @@ define <4 x i64> @test_expand_q_256(<4 x i64> %data) {
 ; CHECK-LABEL: test_expand_q_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x i64> %1
 }
 
@@ -720,7 +720,7 @@ define <8 x i32> @test_expand_d_256(<8 x i32> %data) {
 ; CHECK-LABEL: test_expand_d_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> poison, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x i32> %1
 }
 
@@ -884,7 +884,7 @@ define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
   %2 = bitcast i8 %mask to <8 x i1>
-  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
   ret <4 x float> %3
 }
@@ -906,7 +906,7 @@ define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
   %2 = bitcast i8 %mask to <8 x i1>
-  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
   ret <4 x float> %3
 }
@@ -986,7 +986,7 @@ define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
   %2 = bitcast i8 %mask to <8 x i1>
-  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
   ret <4 x float> %3
 }
@@ -1008,7 +1008,7 @@ define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
   %2 = bitcast i8 %mask to <8 x i1>
-  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
   ret <4 x float> %3
 }
@@ -5223,7 +5223,7 @@ define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
   ret <2 x i64> %res2
 }
@@ -5242,7 +5242,7 @@ define <2 x i64> @test_x86_avx512_maskz_psra_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
   ret <2 x i64> %res2
 }
@@ -5274,7 +5274,7 @@ define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
   ret <4 x i64> %res2
 }
@@ -5293,7 +5293,7 @@ define <4 x i64> @test_x86_avx512_maskz_psra_q_256(<4 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
   ret <4 x i64> %res2
 }
@@ -5325,7 +5325,7 @@ define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %pas
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
   ret <2 x i64> %res2
 }
@@ -5344,7 +5344,7 @@ define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(<2 x i64> %a0, i8 %mask) {
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
   ret <2 x i64> %res2
 }
@@ -5376,7 +5376,7 @@ define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %pas
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
   ret <4 x i64> %res2
 }
@@ -5395,7 +5395,7 @@ define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(<4 x i64> %a0, i8 %mask) {
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
   ret <4 x i64> %res2
 }
@@ -5427,7 +5427,7 @@ define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %a2
   ret <2 x i64> %res2
 }
@@ -5447,7 +5447,7 @@ define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
   ret <2 x i64> %res2
 }
@@ -5480,7 +5480,7 @@ define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %a2
   ret <4 x i64> %res2
 }
@@ -5500,7 +5500,7 @@ define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
   ret <4 x i64> %res2
 }
@@ -6861,7 +6861,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
 ; X64-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X64-NEXT:    retq # encoding: [0xc3]
   %q = load float, ptr %ptr_a2
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
@@ -6889,7 +6889,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1
 ; X64-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X64-NEXT:    retq # encoding: [0xc3]
   %q = load float, ptr %ptr_a2, align 4
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
@@ -6914,7 +6914,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1
 ; X64-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
 ; X64-NEXT:    retq # encoding: [0xc3]
   %q = load float, ptr %ptr_a2
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
@@ -6936,7 +6936,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
 ; X64-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
 ; X64-NEXT:    retq # encoding: [0xc3]
   %q = load float, ptr %ptr_a2, align 4
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3

From 28a4ed945dc101c9a7dbdc93d9461da67225f7dc Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2@amd.com>
Date: Tue, 10 Jun 2025 22:49:09 -0400
Subject: [PATCH 025/851] [AMDGPU][True16] remove AsmVOP3OpSel (#143465)

This is NFC. Clean up the AsmVOP3OpSel field, and use Vop3Base instead.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td      | 44 +---------------------
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 +----
 llvm/lib/Target/AMDGPU/VOPInstructions.td  |  6 +--
 3 files changed, 5 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 2c20475726a48..e74ccbee975ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2242,41 +2242,6 @@ class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasNeg,
   string ret = dst#src0#src1#src2#opsel#mods#clamp;
 }
 
-// FIXME-TRUE16 AsmVOP3OpSel will be deprecated after all
-// VOP3 16 bit instructions are replaced to true16 format
-class getAsmVOP3OpSel <int NumSrcArgs,
-                       bit HasClamp,
-                       bit HasOMod,
-                       bit Src0HasMods,
-                       bit Src1HasMods,
-                       bit Src2HasMods,
-                       bit HasByteSel = 0,
-                       bit HasBitOp3 = 0> {
-  string dst = "$vdst";
-
-  string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
-  string isrc1 = !if(!eq(NumSrcArgs, 1), "",
-                     !if(!eq(NumSrcArgs, 2), " $src1",
-                                             " $src1,"));
-  string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
-
-  string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
-  string fsrc1 = !if(!eq(NumSrcArgs, 1), "",
-                     !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
-                                             " $src1_modifiers,"));
-  string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
-
-  string src0 = !if(Src0HasMods, fsrc0, isrc0);
-  string src1 = !if(Src1HasMods, fsrc1, isrc1);
-  string src2 = !if(Src2HasMods, fsrc2, isrc2);
-
-  string bytesel = !if(HasByteSel, "$byte_sel", "");
-  string clamp = !if(HasClamp, "$clamp", "");
-  string omod = !if(HasOMod, "$omod", "");
-  string bitop3 = !if(HasBitOp3, "$bitop3", "");
-  string ret = dst#", "#src0#src1#src2#bitop3#"$op_sel"#bytesel#clamp#omod;
-}
-
 class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
   string dst = !if(HasDst,
                    !if(!eq(DstVT.Size, 1),
@@ -2687,14 +2652,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
    HasSrc2Mods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
   field string Asm64 = AsmVOP3Base;
   field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasNeg, HasClamp, HasOpSel>.ret;
-  field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
-                                              HasClamp,
-                                              HasOMod,
-                                              HasSrc0FloatMods,
-                                              HasSrc1FloatMods,
-                                              HasSrc2FloatMods,
-                                              HasFP8ByteSel,
-                                              HasBitOp3>.ret;
+  field string AsmVOP3OpSel = AsmVOP3Base;
   field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3Base>.ret;
   field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3Base>.ret;
   field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3Base>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 9f66951372d19..a005e0245b8ff 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -626,10 +626,6 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
   let HasOpSel = 1;
   let HasFP8DstByteSel = 1;
   let HasFP8ByteSel = 0; // It works as a dst-bytesel, but does not have byte_sel operand.
-  let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
-                            getAsmVOP3OpSel<3, HasClamp, HasOMod,
-                                            HasSrc0FloatMods, HasSrc1FloatMods,
-                                            HasSrc2FloatMods>.ret);
   let AsmVOP3Base = !subst(", $src2_modifiers", "",
                     getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
                     HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, 0/*Src1Mods*/,
@@ -1066,12 +1062,10 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Profile<
   let HasSrc2 = 0;
   let HasSrc2Mods = 1;
   let HasOpSel = 1;
-  let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
-                            getAsmVOP3OpSel<3, HasClamp, HasOMod,
-                                            HasSrc0FloatMods, HasSrc1FloatMods,
-                                            HasSrc2FloatMods>.ret);
+  let Asm64 = !subst(", $src2_modifiers", "", AsmVOP3Base);
   let HasExtVOP3DPP = 0;
   let HasFP8DstByteSel = 1;
+  let HasFP8ByteSel = 0;
 }
 
 class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 4cd845aaa5497..6045f59d1f040 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -112,9 +112,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
   bit HasFP8DstByteSel = P.HasFP8DstByteSel;
   bit HasFP4DstByteSel = P.HasFP4DstByteSel;
 
-  let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
-                        P.AsmVOP3OpSel,
-                        !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
+  let AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64);
 
   let Size = 8;
   let mayLoad = 0;
@@ -1484,7 +1482,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
 
   let HasModifiers =
       !if (Features.IsMAI, 0,
-           !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
+           !or(Features.IsPacked, P.HasModifiers));
 }
 
 class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> {

From d75e28477af0baa063a4d4cc7b3cf657cfadd758 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Tue, 10 Jun 2025 20:36:52 -0700
Subject: [PATCH 026/851] [flang][runtime] Fix build bot flang-runtime-cuda-gcc
 errors (#143650)

Adjust default parent class accessibility to attemp to work around what
appear to be old GCC's interpretation.
---
 flang-rt/include/flang-rt/runtime/work-queue.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
index 878b18373e1d2..f7f4777839836 100644
--- a/flang-rt/include/flang-rt/runtime/work-queue.h
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -319,7 +319,7 @@ class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
 template <bool IS_COMPONENTWISE>
 class DerivedAssignTicket
     : public ImmediateTicketRunner<DerivedAssignTicket<IS_COMPONENTWISE>>,
-      private std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+      protected std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
           ElementsOverComponents> {
 public:
   using Base = std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
@@ -348,7 +348,7 @@ namespace io::descr {
 template <io::Direction DIR>
 class DescriptorIoTicket
     : public ImmediateTicketRunner<DescriptorIoTicket<DIR>>,
-      private Elementwise {
+      protected Elementwise {
 public:
   RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io,
       const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
@@ -372,7 +372,7 @@ class DescriptorIoTicket
 
 template <io::Direction DIR>
 class DerivedIoTicket : public ImmediateTicketRunner<DerivedIoTicket<DIR>>,
-                        private ElementsOverComponents {
+                        protected ElementsOverComponents {
 public:
   RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io,
       const Descriptor &descriptor, const typeInfo::DerivedType &derived,

From 3ece9b06a2d299d5a108efa856e662587543b2f3 Mon Sep 17 00:00:00 2001
From: quic_hchandel <quic_hchandel@quicinc.com>
Date: Wed, 11 Jun 2025 09:56:12 +0530
Subject: [PATCH 027/851] [RISCV][NFC] Improve test coverage for xtheadcondmov
 and xmipscmov (#143567)

Co-authored-by: Harsh Chandel <hchandel@qti.qualcomm.com>
---
 llvm/test/CodeGen/RISCV/select-cond.ll | 1018 ++++++++++++++++++++++++
 1 file changed, 1018 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/select-cond.ll

diff --git a/llvm/test/CodeGen/RISCV/select-cond.ll b/llvm/test/CodeGen/RISCV/select-cond.ll
new file mode 100644
index 0000000000000..a5f4677f73f13
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/select-cond.ll
@@ -0,0 +1,1018 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv32 -mattr=+xtheadcondmov -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV32-THEAD
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV64
+; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV64-MIPS
+
+define signext i32 @select_i32_trunc(i32 signext %cond, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_trunc:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a3, a0, 1
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    bnez a3, .LBB0_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:  .LBB0_2:
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_trunc:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    andi a0, a0, 1
+; RV32-THEAD-NEXT:    th.mveqz a1, a2, a0
+; RV32-THEAD-NEXT:    mv a0, a1
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_trunc:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a3, a0, 1
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    bnez a3, .LBB0_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB0_2:
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_trunc:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    andi a0, a0, 1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a1, a2
+; RV64-MIPS-NEXT:    ret
+  %cond_trunc = trunc i32 %cond to i1
+  %res = select i1 %cond_trunc, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_param(i1 signext %cond, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_param:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a3, a0, 1
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    bnez a3, .LBB1_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:  .LBB1_2:
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_param:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    andi a0, a0, 1
+; RV32-THEAD-NEXT:    th.mveqz a1, a2, a0
+; RV32-THEAD-NEXT:    mv a0, a1
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_param:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a3, a0, 1
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    bnez a3, .LBB1_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB1_2:
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_param:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    andi a0, a0, 1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a1, a2
+; RV64-MIPS-NEXT:    ret
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_eq(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_eq:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a0, a1, .LBB2_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB2_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_eq:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor a0, a0, a1
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_eq:
+; RV64:       # %bb.0:
+; RV64-NEXT:    beq a0, a1, .LBB2_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB2_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_eq:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    xor a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp eq i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_ne(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_ne:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bne a0, a1, .LBB3_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB3_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_ne:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor a0, a0, a1
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_ne:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bne a0, a1, .LBB3_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB3_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_ne:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    xor a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ne i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_ugt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_ugt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bltu a1, a0, .LBB4_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB4_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_ugt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    sltu a0, a1, a0
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_ugt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bltu a1, a0, .LBB4_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB4_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_ugt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ugt i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_uge(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_uge:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bgeu a0, a1, .LBB5_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB5_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_uge:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    sltu a0, a0, a1
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_uge:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bgeu a0, a1, .LBB5_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB5_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_uge:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp uge i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_ult(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_ult:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bltu a0, a1, .LBB6_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB6_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_ult:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    sltu a0, a0, a1
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_ult:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bltu a0, a1, .LBB6_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB6_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_ult:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ult i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_ule(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_ule:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bgeu a1, a0, .LBB7_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB7_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_ule:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    sltu a0, a1, a0
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_ule:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bgeu a1, a0, .LBB7_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB7_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_ule:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ule i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_sgt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_sgt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    blt a1, a0, .LBB8_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB8_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_sgt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    slt a0, a1, a0
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_sgt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    blt a1, a0, .LBB8_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB8_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_sgt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sgt i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_sge(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_sge:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bge a0, a1, .LBB9_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB9_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_sge:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    slt a0, a0, a1
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_sge:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bge a0, a1, .LBB9_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB9_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_sge:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sge i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_slt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_slt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    blt a0, a1, .LBB10_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB10_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_slt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    slt a0, a0, a1
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_slt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    blt a0, a1, .LBB10_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB10_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_slt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp slt i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_sle(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_sle:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bge a1, a0, .LBB11_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB11_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_sle:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    slt a0, a1, a0
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_sle:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bge a1, a0, .LBB11_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB11_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_sle:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sle i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define i64 @select_i64_trunc(i64 %cond, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_trunc:
+; RV32:       # %bb.0:
+; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a3, a0, 1
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    bnez a3, .LBB12_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:  .LBB12_2:
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_trunc:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    mv a1, a3
+; RV32-THEAD-NEXT:    andi a0, a0, 1
+; RV32-THEAD-NEXT:    th.mveqz a2, a4, a0
+; RV32-THEAD-NEXT:    th.mveqz a1, a5, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_trunc:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a3, a0, 1
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    bnez a3, .LBB12_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB12_2:
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_trunc:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    andi a0, a0, 1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a1, a2
+; RV64-MIPS-NEXT:    ret
+  %cond_trunc = trunc i64 %cond to i1
+  %res = select i1 %cond_trunc, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_param(i1 %cond, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_param:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a5, a0, 1
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    bnez a5, .LBB13_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:  .LBB13_2:
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_param:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    andi a0, a0, 1
+; RV32-THEAD-NEXT:    th.mveqz a1, a3, a0
+; RV32-THEAD-NEXT:    th.mveqz a2, a4, a0
+; RV32-THEAD-NEXT:    mv a0, a1
+; RV32-THEAD-NEXT:    mv a1, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_param:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a3, a0, 1
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    bnez a3, .LBB13_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB13_2:
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_param:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    andi a0, a0, 1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a1, a2
+; RV64-MIPS-NEXT:    ret
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_eq(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_eq:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    or a1, a0, a1
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    beqz a1, .LBB14_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB14_2:
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_eq:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor a1, a1, a3
+; RV32-THEAD-NEXT:    xor a0, a0, a2
+; RV32-THEAD-NEXT:    or a0, a0, a1
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_eq:
+; RV64:       # %bb.0:
+; RV64-NEXT:    beq a0, a1, .LBB14_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB14_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_eq:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    xor a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp eq i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_ne(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_ne:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    or a1, a0, a1
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    bnez a1, .LBB15_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB15_2:
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_ne:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor a1, a1, a3
+; RV32-THEAD-NEXT:    xor a0, a0, a2
+; RV32-THEAD-NEXT:    or a0, a0, a1
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_ne:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bne a0, a1, .LBB15_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB15_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_ne:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    xor a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ne i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_ugt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_ugt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB16_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    beqz a0, .LBB16_3
+; RV32-NEXT:    j .LBB16_4
+; RV32-NEXT:  .LBB16_2:
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    bnez a0, .LBB16_4
+; RV32-NEXT:  .LBB16_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB16_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_ugt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    sltu a1, a3, a1
+; RV32-THEAD-NEXT:    sltu a0, a2, a0
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_ugt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bltu a1, a0, .LBB16_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB16_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_ugt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ugt i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_uge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_uge:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB17_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a0, a1, a3
+; RV32-NEXT:    bnez a0, .LBB17_3
+; RV32-NEXT:    j .LBB17_4
+; RV32-NEXT:  .LBB17_2:
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    beqz a0, .LBB17_4
+; RV32-NEXT:  .LBB17_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB17_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_uge:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    sltu a1, a1, a3
+; RV32-THEAD-NEXT:    sltu a0, a0, a2
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_uge:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bgeu a0, a1, .LBB17_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB17_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_uge:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp uge i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_ult(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_ult:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB18_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a0, a1, a3
+; RV32-NEXT:    beqz a0, .LBB18_3
+; RV32-NEXT:    j .LBB18_4
+; RV32-NEXT:  .LBB18_2:
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    bnez a0, .LBB18_4
+; RV32-NEXT:  .LBB18_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB18_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_ult:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    sltu a1, a1, a3
+; RV32-THEAD-NEXT:    sltu a0, a0, a2
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_ult:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bltu a0, a1, .LBB18_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB18_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_ult:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ult i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_ule(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_ule:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB19_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    bnez a0, .LBB19_3
+; RV32-NEXT:    j .LBB19_4
+; RV32-NEXT:  .LBB19_2:
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    beqz a0, .LBB19_4
+; RV32-NEXT:  .LBB19_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB19_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_ule:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    sltu a1, a3, a1
+; RV32-THEAD-NEXT:    sltu a0, a2, a0
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_ule:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bgeu a1, a0, .LBB19_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB19_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_ule:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ule i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_sgt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_sgt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB20_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    slt a0, a3, a1
+; RV32-NEXT:    beqz a0, .LBB20_3
+; RV32-NEXT:    j .LBB20_4
+; RV32-NEXT:  .LBB20_2:
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    bnez a0, .LBB20_4
+; RV32-NEXT:  .LBB20_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB20_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_sgt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    slt a1, a3, a1
+; RV32-THEAD-NEXT:    sltu a0, a2, a0
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_sgt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    blt a1, a0, .LBB20_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB20_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_sgt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sgt i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_sge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_sge:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB21_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    slt a0, a1, a3
+; RV32-NEXT:    bnez a0, .LBB21_3
+; RV32-NEXT:    j .LBB21_4
+; RV32-NEXT:  .LBB21_2:
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    beqz a0, .LBB21_4
+; RV32-NEXT:  .LBB21_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB21_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_sge:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    slt a1, a1, a3
+; RV32-THEAD-NEXT:    sltu a0, a0, a2
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_sge:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bge a0, a1, .LBB21_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB21_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_sge:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sge i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_slt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_slt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB22_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    slt a0, a1, a3
+; RV32-NEXT:    beqz a0, .LBB22_3
+; RV32-NEXT:    j .LBB22_4
+; RV32-NEXT:  .LBB22_2:
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    bnez a0, .LBB22_4
+; RV32-NEXT:  .LBB22_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB22_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_slt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    slt a1, a1, a3
+; RV32-THEAD-NEXT:    sltu a0, a0, a2
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_slt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    blt a0, a1, .LBB22_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB22_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_slt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp slt i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_sle(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_sle:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB23_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    slt a0, a3, a1
+; RV32-NEXT:    bnez a0, .LBB23_3
+; RV32-NEXT:    j .LBB23_4
+; RV32-NEXT:  .LBB23_2:
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    beqz a0, .LBB23_4
+; RV32-NEXT:  .LBB23_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB23_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_sle:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    slt a1, a3, a1
+; RV32-THEAD-NEXT:    sltu a0, a2, a0
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_sle:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bge a1, a0, .LBB23_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB23_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_sle:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sle i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+

From a3201ce9e114aa2ecd66e525607093e4dff2f574 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Tue, 10 Jun 2025 22:10:26 -0700
Subject: [PATCH 028/851] [flang][cuda] Add option to disable warp function in
 semantic (#143640)

These functions are not available in some lower compute capabilities.
Add option in the language feature to enforce the semantic check on
these.
---
 .../include/flang/Support/Fortran-features.h  |   2 +-
 flang/lib/Semantics/check-cuda.cpp            | 125 ++++++++++++------
 flang/test/Semantics/cuf22.cuf                |   8 ++
 flang/tools/bbc/bbc.cpp                       |  10 ++
 4 files changed, 101 insertions(+), 44 deletions(-)
 create mode 100644 flang/test/Semantics/cuf22.cuf

diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h
index 3f6d825e2b66c..ea0845b7d605f 100644
--- a/flang/include/flang/Support/Fortran-features.h
+++ b/flang/include/flang/Support/Fortran-features.h
@@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
     SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
     IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor,
     ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy,
-    InaccessibleDeferredOverride)
+    InaccessibleDeferredOverride, CudaWarpMatchFunction)
 
 // Portability and suspicious usage warnings
 ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index c024640af1220..8decfb0149829 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -17,6 +17,7 @@
 #include "flang/Semantics/expression.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/tools.h"
+#include "llvm/ADT/StringSet.h"
 
 // Once labeled DO constructs have been canonicalized and their parse subtrees
 // transformed into parser::DoConstructs, scan the parser::Blocks of the program
@@ -61,6 +62,11 @@ bool CanonicalizeCUDA(parser::Program &program) {
 
 using MaybeMsg = std::optional<parser::MessageFormattedText>;
 
+static const llvm::StringSet<> warpFunctions_ = {"match_all_syncjj",
+    "match_all_syncjx", "match_all_syncjf", "match_all_syncjd",
+    "match_any_syncjj", "match_any_syncjx", "match_any_syncjf",
+    "match_any_syncjd"};
+
 // Traverses an evaluate::Expr<> in search of unsupported operations
 // on the device.
 
@@ -68,7 +74,7 @@ struct DeviceExprChecker
     : public evaluate::AnyTraverse<DeviceExprChecker, MaybeMsg> {
   using Result = MaybeMsg;
   using Base = evaluate::AnyTraverse<DeviceExprChecker, Result>;
-  DeviceExprChecker() : Base(*this) {}
+  explicit DeviceExprChecker(SemanticsContext &c) : Base(*this), context_{c} {}
   using Base::operator();
   Result operator()(const evaluate::ProcedureDesignator &x) const {
     if (const Symbol * sym{x.GetInterfaceSymbol()}) {
@@ -78,10 +84,17 @@ struct DeviceExprChecker
         if (auto attrs{subp->cudaSubprogramAttrs()}) {
           if (*attrs == common::CUDASubprogramAttrs::HostDevice ||
               *attrs == common::CUDASubprogramAttrs::Device) {
+            if (warpFunctions_.contains(sym->name().ToString()) &&
+                !context_.languageFeatures().IsEnabled(
+                    Fortran::common::LanguageFeature::CudaWarpMatchFunction)) {
+              return parser::MessageFormattedText(
+                  "warp match function disabled"_err_en_US);
+            }
             return {};
           }
         }
       }
+
       const Symbol &ultimate{sym->GetUltimate()};
       const Scope &scope{ultimate.owner()};
       const Symbol *mod{scope.IsModule() ? scope.symbol() : nullptr};
@@ -94,9 +107,12 @@ struct DeviceExprChecker
       // TODO(CUDA): Check for unsupported intrinsics here
       return {};
     }
+
     return parser::MessageFormattedText(
         "'%s' may not be called in device code"_err_en_US, x.GetName());
   }
+
+  SemanticsContext &context_;
 };
 
 struct FindHostArray
@@ -133,9 +149,10 @@ struct FindHostArray
   }
 };
 
-template <typename A> static MaybeMsg CheckUnwrappedExpr(const A &x) {
+template <typename A>
+static MaybeMsg CheckUnwrappedExpr(SemanticsContext &context, const A &x) {
   if (const auto *expr{parser::Unwrap<parser::Expr>(x)}) {
-    return DeviceExprChecker{}(expr->typedExpr);
+    return DeviceExprChecker{context}(expr->typedExpr);
   }
   return {};
 }
@@ -144,104 +161,124 @@ template <typename A>
 static void CheckUnwrappedExpr(
     SemanticsContext &context, SourceName at, const A &x) {
   if (const auto *expr{parser::Unwrap<parser::Expr>(x)}) {
-    if (auto msg{DeviceExprChecker{}(expr->typedExpr)}) {
+    if (auto msg{DeviceExprChecker{context}(expr->typedExpr)}) {
       context.Say(at, std::move(*msg));
     }
   }
 }
 
 template <bool CUF_KERNEL> struct ActionStmtChecker {
-  template <typename A> static MaybeMsg WhyNotOk(const A &x) {
+  template <typename A>
+  static MaybeMsg WhyNotOk(SemanticsContext &context, const A &x) {
     if constexpr (ConstraintTrait<A>) {
-      return WhyNotOk(x.thing);
+      return WhyNotOk(context, x.thing);
     } else if constexpr (WrapperTrait<A>) {
-      return WhyNotOk(x.v);
+      return WhyNotOk(context, x.v);
     } else if constexpr (UnionTrait<A>) {
-      return WhyNotOk(x.u);
+      return WhyNotOk(context, x.u);
     } else if constexpr (TupleTrait<A>) {
-      return WhyNotOk(x.t);
+      return WhyNotOk(context, x.t);
     } else {
       return parser::MessageFormattedText{
           "Statement may not appear in device code"_err_en_US};
     }
   }
   template <typename A>
-  static MaybeMsg WhyNotOk(const common::Indirection<A> &x) {
-    return WhyNotOk(x.value());
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const common::Indirection<A> &x) {
+    return WhyNotOk(context, x.value());
   }
   template <typename... As>
-  static MaybeMsg WhyNotOk(const std::variant<As...> &x) {
-    return common::visit([](const auto &x) { return WhyNotOk(x); }, x);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const std::variant<As...> &x) {
+    return common::visit(
+        [&context](const auto &x) { return WhyNotOk(context, x); }, x);
   }
   template <std::size_t J = 0, typename... As>
-  static MaybeMsg WhyNotOk(const std::tuple<As...> &x) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const std::tuple<As...> &x) {
     if constexpr (J == sizeof...(As)) {
       return {};
-    } else if (auto msg{WhyNotOk(std::get<J>(x))}) {
+    } else if (auto msg{WhyNotOk(context, std::get<J>(x))}) {
       return msg;
     } else {
-      return WhyNotOk<(J + 1)>(x);
+      return WhyNotOk<(J + 1)>(context, x);
     }
   }
-  template <typename A> static MaybeMsg WhyNotOk(const std::list<A> &x) {
+  template <typename A>
+  static MaybeMsg WhyNotOk(SemanticsContext &context, const std::list<A> &x) {
     for (const auto &y : x) {
-      if (MaybeMsg result{WhyNotOk(y)}) {
+      if (MaybeMsg result{WhyNotOk(context, y)}) {
         return result;
       }
     }
     return {};
   }
-  template <typename A> static MaybeMsg WhyNotOk(const std::optional<A> &x) {
+  template <typename A>
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const std::optional<A> &x) {
     if (x) {
-      return WhyNotOk(*x);
+      return WhyNotOk(context, *x);
     } else {
       return {};
     }
   }
   template <typename A>
-  static MaybeMsg WhyNotOk(const parser::UnlabeledStatement<A> &x) {
-    return WhyNotOk(x.statement);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::UnlabeledStatement<A> &x) {
+    return WhyNotOk(context, x.statement);
   }
   template <typename A>
-  static MaybeMsg WhyNotOk(const parser::Statement<A> &x) {
-    return WhyNotOk(x.statement);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::Statement<A> &x) {
+    return WhyNotOk(context, x.statement);
   }
-  static MaybeMsg WhyNotOk(const parser::AllocateStmt &) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::AllocateStmt &) {
     return {}; // AllocateObjects are checked elsewhere
   }
-  static MaybeMsg WhyNotOk(const parser::AllocateCoarraySpec &) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::AllocateCoarraySpec &) {
     return parser::MessageFormattedText(
         "A coarray may not be allocated on the device"_err_en_US);
   }
-  static MaybeMsg WhyNotOk(const parser::DeallocateStmt &) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::DeallocateStmt &) {
     return {}; // AllocateObjects are checked elsewhere
   }
-  static MaybeMsg WhyNotOk(const parser::AssignmentStmt &x) {
-    return DeviceExprChecker{}(x.typedAssignment);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::AssignmentStmt &x) {
+    return DeviceExprChecker{context}(x.typedAssignment);
   }
-  static MaybeMsg WhyNotOk(const parser::CallStmt &x) {
-    return DeviceExprChecker{}(x.typedCall);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::CallStmt &x) {
+    return DeviceExprChecker{context}(x.typedCall);
+  }
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::ContinueStmt &) {
+    return {};
   }
-  static MaybeMsg WhyNotOk(const parser::ContinueStmt &) { return {}; }
-  static MaybeMsg WhyNotOk(const parser::IfStmt &x) {
-    if (auto result{
-            CheckUnwrappedExpr(std::get<parser::ScalarLogicalExpr>(x.t))}) {
+  static MaybeMsg WhyNotOk(SemanticsContext &context, const parser::IfStmt &x) {
+    if (auto result{CheckUnwrappedExpr(
+            context, std::get<parser::ScalarLogicalExpr>(x.t))}) {
       return result;
     }
-    return WhyNotOk(
+    return WhyNotOk(context,
         std::get<parser::UnlabeledStatement<parser::ActionStmt>>(x.t)
             .statement);
   }
-  static MaybeMsg WhyNotOk(const parser::NullifyStmt &x) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::NullifyStmt &x) {
     for (const auto &y : x.v) {
-      if (MaybeMsg result{DeviceExprChecker{}(y.typedExpr)}) {
+      if (MaybeMsg result{DeviceExprChecker{context}(y.typedExpr)}) {
         return result;
       }
     }
     return {};
   }
-  static MaybeMsg WhyNotOk(const parser::PointerAssignmentStmt &x) {
-    return DeviceExprChecker{}(x.typedAssignment);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::PointerAssignmentStmt &x) {
+    return DeviceExprChecker{context}(x.typedAssignment);
   }
 };
 
@@ -435,12 +472,14 @@ template <bool IsCUFKernelDo> class DeviceContextChecker {
                 ErrorIfHostSymbol(assign->lhs, source);
                 ErrorIfHostSymbol(assign->rhs, source);
               }
-              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(
+                      context_, x)}) {
                 context_.Say(source, std::move(*msg));
               }
             },
             [&](const auto &x) {
-              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(
+                      context_, x)}) {
                 context_.Say(source, std::move(*msg));
               }
             },
@@ -504,7 +543,7 @@ template <bool IsCUFKernelDo> class DeviceContextChecker {
     Check(DEREF(parser::Unwrap<parser::Expr>(x)));
   }
   void Check(const parser::Expr &expr) {
-    if (MaybeMsg msg{DeviceExprChecker{}(expr.typedExpr)}) {
+    if (MaybeMsg msg{DeviceExprChecker{context_}(expr.typedExpr)}) {
       context_.Say(expr.source, std::move(*msg));
     }
   }
diff --git a/flang/test/Semantics/cuf22.cuf b/flang/test/Semantics/cuf22.cuf
new file mode 100644
index 0000000000000..36e0f0b2502df
--- /dev/null
+++ b/flang/test/Semantics/cuf22.cuf
@@ -0,0 +1,8 @@
+! RUN: not bbc -fcuda -fcuda-disable-warp-function %s -o - 2>&1 | FileCheck %s
+
+attributes(device) subroutine testMatch()
+  integer :: a, ipred, mask, v32
+  a = match_all_sync(mask, v32, ipred)
+end subroutine
+
+! CHECK:  warp match function disabled
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index c544008a24d56..c80872108ac8f 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -223,6 +223,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
                                       llvm::cl::desc("enable CUDA Fortran"),
                                       llvm::cl::init(false));
 
+static llvm::cl::opt<bool>
+    disableCUDAWarpFunction("fcuda-disable-warp-function",
+                            llvm::cl::desc("Disable CUDA Warp Function"),
+                            llvm::cl::init(false));
+
 static llvm::cl::opt<std::string>
     enableGPUMode("gpu", llvm::cl::desc("Enable GPU Mode managed|unified"),
                   llvm::cl::init(""));
@@ -600,6 +605,11 @@ int main(int argc, char **argv) {
     options.features.Enable(Fortran::common::LanguageFeature::CUDA);
   }
 
+  if (disableCUDAWarpFunction) {
+    options.features.Enable(
+        Fortran::common::LanguageFeature::CudaWarpMatchFunction, false);
+  }
+
   if (enableGPUMode == "managed") {
     options.features.Enable(Fortran::common::LanguageFeature::CudaManaged);
   } else if (enableGPUMode == "unified") {

From 842377882a3f52e345668751fa6d46ba4f7268d2 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Wed, 11 Jun 2025 13:32:49 +0800
Subject: [PATCH 029/851] [RISCV] Select signed bitfield insert for XAndesPerf
 (#143356)

This patch is similar to #142737

The XAndesPerf extension includes signed bitfield extraction
instruction `NDS.BFOS, which can extract the bits from 0 to Len - 1,
place them starting at bit Lsb, zero-filled the bits from 0 to Lsb -1,
and sign-extend the result.

When Lsb == Msb, it is a special case where the Lsb will be set to 0
instead of being equal to the Msb.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 56 +++++++++++++++++++++
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h   |  1 +
 llvm/test/CodeGen/RISCV/rv32xandesperf.ll   | 26 ++++++++++
 llvm/test/CodeGen/RISCV/rv64xandesperf.ll   | 46 +++++++++++++++++
 4 files changed, 129 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index d298965595b47..4539efd591c8b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -683,6 +683,59 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) {
   return false;
 }
 
+bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
+  // Only supported with XAndesPerf at the moment.
+  if (!Subtarget->hasVendorXAndesPerf())
+    return false;
+
+  auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+  if (!N1C)
+    return false;
+
+  SDValue N0 = Node->getOperand(0);
+  if (!N0.hasOneUse())
+    return false;
+
+  auto BitfieldInsert = [&](SDValue N0, unsigned Msb, unsigned Lsb,
+                            const SDLoc &DL, MVT VT) {
+    unsigned Opc = RISCV::NDS_BFOS;
+    // If the Lsb is equal to the Msb, then the Lsb should be 0.
+    if (Lsb == Msb)
+      Lsb = 0;
+    return CurDAG->getMachineNode(Opc, DL, VT, N0.getOperand(0),
+                                  CurDAG->getTargetConstant(Lsb, DL, VT),
+                                  CurDAG->getTargetConstant(Msb, DL, VT));
+  };
+
+  SDLoc DL(Node);
+  MVT VT = Node->getSimpleValueType(0);
+  const unsigned RightShAmt = N1C->getZExtValue();
+
+  // Transform (sra (shl X, C1) C2) with C1 > C2
+  //        -> (NDS.BFOS X, lsb, msb)
+  if (N0.getOpcode() == ISD::SHL) {
+    auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!N01C)
+      return false;
+
+    const unsigned LeftShAmt = N01C->getZExtValue();
+    // Make sure that this is a bitfield insertion (i.e., the shift-right
+    // amount should be less than the left-shift).
+    if (LeftShAmt <= RightShAmt)
+      return false;
+
+    const unsigned MsbPlusOne = VT.getSizeInBits() - RightShAmt;
+    const unsigned Msb = MsbPlusOne - 1;
+    const unsigned Lsb = LeftShAmt - RightShAmt;
+
+    SDNode *Sbi = BitfieldInsert(N0, Msb, Lsb, DL, VT);
+    ReplaceNode(Node, Sbi);
+    return true;
+  }
+
+  return false;
+}
+
 bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node,
                                                    const SDLoc &DL, MVT VT,
                                                    SDValue X, unsigned Msb,
@@ -1214,6 +1267,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     if (trySignedBitfieldExtract(Node))
       return;
 
+    if (trySignedBitfieldInsertInSign(Node))
+      return;
+
     // Optimize (sra (sext_inreg X, i16), C) ->
     //          (srai (slli X, (XLen-16), (XLen-16) + C)
     // And      (sra (sext_inreg X, i8), C) ->
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index abc0372d15c4f..cb63c21fd8fc9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -77,6 +77,7 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
 
   bool tryShrinkShlLogicImm(SDNode *Node);
   bool trySignedBitfieldExtract(SDNode *Node);
+  bool trySignedBitfieldInsertInSign(SDNode *Node);
   bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT,
                                   SDValue X, unsigned Msb, unsigned Lsb);
   bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT,
diff --git a/llvm/test/CodeGen/RISCV/rv32xandesperf.ll b/llvm/test/CodeGen/RISCV/rv32xandesperf.ll
index 3996420d477b2..3e7f09f3d6c22 100644
--- a/llvm/test/CodeGen/RISCV/rv32xandesperf.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xandesperf.ll
@@ -154,6 +154,32 @@ define i32 @bfos_from_ashr_sexti16_i32(i16 %x) {
   ret i32 %ashr
 }
 
+; MSB = 0
+
+define i32 @bfos_from_ashr_shl_with_msb_zero_insert_i32(i32 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 0, 14
+; CHECK-NEXT:    ret
+  %shl = shl i32 %x, 31
+  %lshr = ashr i32 %shl, 17
+  ret i32 %lshr
+}
+
+; MSB < LSB
+
+define i32 @bfos_from_ashr_shl_insert_i32(i32 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_insert_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 18, 20
+; CHECK-NEXT:    ret
+  %shl = shl i32 %x, 29
+  %lshr = ashr i32 %shl, 11
+  ret i32 %lshr
+}
+
+; sext
+
 define i32 @sexti1_i32(i32 %a) {
 ; CHECK-LABEL: sexti1_i32:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rv64xandesperf.ll b/llvm/test/CodeGen/RISCV/rv64xandesperf.ll
index af7c300a92d1f..98cda42665169 100644
--- a/llvm/test/CodeGen/RISCV/rv64xandesperf.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xandesperf.ll
@@ -212,6 +212,52 @@ define i64 @bfos_from_ashr_sexti16_i64(i16 %x) {
   ret i64 %ashr
 }
 
+; MSB = 0
+
+define i32 @bfos_from_ashr_shl_with_msb_zero_insert_i32(i32 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 0, 14
+; CHECK-NEXT:    ret
+  %shl = shl i32 %x, 31
+  %lshr = ashr i32 %shl, 17
+  ret i32 %lshr
+}
+
+define i64 @bfos_from_ashr_shl_with_msb_zero_insert_i64(i64 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 0, 46
+; CHECK-NEXT:    ret
+  %shl = shl i64 %x, 63
+  %lshr = ashr i64 %shl, 17
+  ret i64 %lshr
+}
+
+; MSB < LSB
+
+define i32 @bfos_from_ashr_shl_insert_i32(i32 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_insert_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 18, 20
+; CHECK-NEXT:    ret
+  %shl = shl i32 %x, 29
+  %lshr = ashr i32 %shl, 11
+  ret i32 %lshr
+}
+
+define i64 @bfos_from_ashr_shl_insert_i64(i64 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_insert_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 18, 52
+; CHECK-NEXT:    ret
+  %shl = shl i64 %x, 29
+  %lshr = ashr i64 %shl, 11
+  ret i64 %lshr
+}
+
+; sext
+
 define signext i32 @sexti1_i32(i32 signext %a) {
 ; CHECK-LABEL: sexti1_i32:
 ; CHECK:       # %bb.0:

From c2cb571c6cbcec75ab401974348f9f0d9b2190db Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Tue, 10 Jun 2025 23:41:41 -0700
Subject: [PATCH 030/851] [Clang][NFC] Move UntypedParameters instead of copy
 (#143646)

Static analysis flagged that UntypedParameters could be moved instead of
copied. This would avoid copying a large object.
---
 clang/lib/Sema/SemaExprCXX.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 2546ab5c0a342..c106ea749170f 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -2888,7 +2888,7 @@ static bool resolveAllocationOverload(
     // type-identity-less argument list.
     IAP.PassTypeIdentity = TypeAwareAllocationMode::No;
     IAP.PassAlignment = InitialAlignmentMode;
-    Args = UntypedParameters;
+    Args = std::move(UntypedParameters);
   }
   assert(!S.isStdTypeIdentity(Args[0]->getType(), nullptr));
   return resolveAllocationOverloadInterior(

From a17e97e6778b2cd4114052faf6ee25db330ef405 Mon Sep 17 00:00:00 2001
From: maflcko <6399679+maflcko@users.noreply.github.com>
Date: Wed, 11 Jun 2025 08:43:23 +0200
Subject: [PATCH 031/851] [libc++] Add missing C++20 [time.point.arithmetic]
 (#143165)

This was part of https://wg21.link/p0355r7, but apparently never
implemented.

---------

Co-authored-by: MarcoFalke <*~=`'#}+{/-|&$^_@721217.xyz>
Co-authored-by: Hristo Hristov <zingam@outlook.com>
---
 libcxx/include/__chrono/time_point.h          | 13 +++++++
 libcxx/include/chrono                         |  5 +++
 .../time.point.arithmetic/op_++.pass.cpp      | 35 +++++++++++++++++++
 .../time.point.arithmetic/op_++int.pass.cpp   | 35 +++++++++++++++++++
 .../time.point.arithmetic/op_--.pass.cpp      | 35 +++++++++++++++++++
 .../time.point.arithmetic/op_--int.pass.cpp   | 35 +++++++++++++++++++
 6 files changed, 158 insertions(+)
 create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp
 create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp
 create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp
 create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp

diff --git a/libcxx/include/__chrono/time_point.h b/libcxx/include/__chrono/time_point.h
index 6b866b882f89a..fc4408d23dbf1 100644
--- a/libcxx/include/__chrono/time_point.h
+++ b/libcxx/include/__chrono/time_point.h
@@ -58,6 +58,19 @@ class time_point {
 
   // arithmetic
 
+#if _LIBCPP_STD_VER >= 20
+  _LIBCPP_HIDE_FROM_ABI constexpr time_point& operator++() {
+    ++__d_;
+    return *this;
+  }
+  _LIBCPP_HIDE_FROM_ABI constexpr time_point operator++(int) { return time_point{__d_++}; }
+  _LIBCPP_HIDE_FROM_ABI constexpr time_point& operator--() {
+    --__d_;
+    return *this;
+  }
+  _LIBCPP_HIDE_FROM_ABI constexpr time_point operator--(int) { return time_point{__d_--}; }
+#endif // _LIBCPP_STD_VER >= 20
+
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 time_point& operator+=(const duration& __d) {
     __d_ += __d;
     return *this;
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index cd9b98872083e..82e99a31bcc9f 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -132,6 +132,11 @@ public:
 
     // arithmetic
 
+    constexpr time_point& operator++();    // C++20
+    constexpr time_point  operator++(int); // C++20
+    constexpr time_point& operator--();    // C++20
+    constexpr time_point  operator--(int); // C++20
+
     time_point& operator+=(const duration& d); // constexpr in C++17
     time_point& operator-=(const duration& d); // constexpr in C++17
 
diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp
new file mode 100644
index 0000000000000..e035d7ef4fa0e
--- /dev/null
+++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+
+// time_point
+
+// constexpr time_point& operator++();
+
+#include <cassert>
+#include <chrono>
+
+#include "test_macros.h"
+
+constexpr bool test() {
+  using Clock    = std::chrono::system_clock;
+  using Duration = std::chrono::milliseconds;
+  std::chrono::time_point<Clock, Duration> t{Duration{5}};
+  std::chrono::time_point<Clock, Duration>& tref{++t};
+  assert(&tref == &t);
+  assert(tref.time_since_epoch() == Duration{6});
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp
new file mode 100644
index 0000000000000..5304d37d5c361
--- /dev/null
+++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+
+// time_point
+
+// constexpr time_point operator++(int);
+
+#include <cassert>
+#include <chrono>
+
+#include "test_macros.h"
+
+constexpr bool test() {
+  using Clock    = std::chrono::system_clock;
+  using Duration = std::chrono::milliseconds;
+  std::chrono::time_point<Clock, Duration> t1{Duration{3}};
+  std::chrono::time_point<Clock, Duration> t2{t1++};
+  assert(t1.time_since_epoch() == Duration{4});
+  assert(t2.time_since_epoch() == Duration{3});
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp
new file mode 100644
index 0000000000000..915156fcc6b8c
--- /dev/null
+++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+
+// time_point
+
+// constexpr time_point& operator--();
+
+#include <cassert>
+#include <chrono>
+
+#include "test_macros.h"
+
+constexpr bool test() {
+  using Clock    = std::chrono::system_clock;
+  using Duration = std::chrono::milliseconds;
+  std::chrono::time_point<Clock, Duration> t{Duration{5}};
+  std::chrono::time_point<Clock, Duration>& tref{--t};
+  assert(&tref == &t);
+  assert(tref.time_since_epoch() == Duration{4});
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp
new file mode 100644
index 0000000000000..cc5f462106bbf
--- /dev/null
+++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+
+// time_point
+
+// constexpr time_point operator--(int);
+
+#include <cassert>
+#include <chrono>
+
+#include "test_macros.h"
+
+constexpr bool test() {
+  using Clock    = std::chrono::system_clock;
+  using Duration = std::chrono::milliseconds;
+  std::chrono::time_point<Clock, Duration> t1{Duration{3}};
+  std::chrono::time_point<Clock, Duration> t2{t1--};
+  assert(t1.time_since_epoch() == Duration{2});
+  assert(t2.time_since_epoch() == Duration{3});
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}

From 0f3c54a3b3289b6375a1d32684e831cb407af003 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 10 Jun 2025 21:33:53 +0100
Subject: [PATCH 032/851] [X86] Add test coverage showing failure to merge
 "zero input passthrough" behaviour for BSR instructions on x86_64 targets

---
 llvm/test/CodeGen/X86/bsr.ll | 492 +++++++++++++++++++++++++++++++++++
 1 file changed, 492 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/bsr.ll

diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
new file mode 100644
index 0000000000000..1247b3ec59324
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -0,0 +1,492 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
+
+define i8 @cmov_bsr8(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsr8:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    je .LBB0_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    xorl $7, %eax
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  .LBB0_5: # %cond.end
+; X86-NEXT:    xorb $7, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    movb $8, %al
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    jne .LBB0_5
+; X86-NEXT:  .LBB0_4:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr8:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    movl $15, %eax
+; X64-NEXT:    bsrl %ecx, %eax
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.ctlz.i8(i8 %x, i1 false)
+  %2 = xor i8 %1, 7
+  %3 = icmp eq i8 %x, 0
+  %4 = select i1 %3, i8 %y, i8 %2
+  ret i8 %4
+}
+
+define i8 @cmov_bsr8_undef(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsr8_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    jne .LBB1_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB1_1:
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr8_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    bsrl %ecx, %eax
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.ctlz.i8(i8 %x, i1 true)
+  %2 = xor i8 %1, 7
+  %3 = icmp ne i8 %x, 0
+  %4 = select i1 %3, i8 %2, i8 %y
+  ret i8 %4
+}
+
+define i16 @cmov_bsr16(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsr16:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB2_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    bsrw %ax, %cx
+; X86-NEXT:    xorl $15, %ecx
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    jne .LBB2_4
+; X86-NEXT:  .LBB2_5: # %cond.end
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB2_1:
+; X86-NEXT:    movw $16, %cx
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB2_5
+; X86-NEXT:  .LBB2_4:
+; X86-NEXT:    movzwl %cx, %eax
+; X86-NEXT:    xorl $15, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr16:
+; X64:       # %bb.0:
+; X64-NEXT:    movw $31, %ax
+; X64-NEXT:    bsrw %di, %ax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
+  %2 = xor i16 %1, 15
+  %3 = icmp ne i16 %x, 0
+  %4 = select i1 %3, i16 %2, i16 %y
+  ret i16 %4
+}
+
+define i16 @cmov_bsr16_undef(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsr16_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB3_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    bsrw %ax, %ax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB3_1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr16_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrw %di, %ax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+  %2 = xor i16 %1, 15
+  %3 = icmp eq i16 %x, 0
+  %4 = select i1 %3, i16 %y, i16 %2
+  ret i16 %4
+}
+
+define i32 @cmov_bsr32(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsr32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB4_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB4_4
+; X86-NEXT:  .LBB4_5: # %cond.end
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB4_5
+; X86-NEXT:  .LBB4_4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $63, %eax
+; X64-NEXT:    bsrl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %2 = xor i32 %1, 31
+  %3 = icmp eq i32 %x, 0
+  %4 = select i1 %3, i32 %y, i32 %2
+  ret i32 %4
+}
+
+define i32 @cmov_bsr32_undef(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsr32_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB5_1:
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr32_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %2 = xor i32 %1, 31
+  %3 = icmp ne i32 %x, 0
+  %4 = select i1 %3, i32 %2, i32 %y
+  ret i32 %4
+}
+
+define i64 @cmov_bsr64(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsr64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    je .LBB6_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB6_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB6_7
+; X86-NEXT:    jmp .LBB6_6
+; X86-NEXT:  .LBB6_1:
+; X86-NEXT:    movl $64, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    jne .LBB6_6
+; X86-NEXT:  .LBB6_7: # %cond.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB6_3:
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB6_7
+; X86-NEXT:  .LBB6_6:
+; X86-NEXT:    xorl $63, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr64:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $127, %eax
+; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
+  %2 = xor i64 %1, 63
+  %3 = icmp ne i64 %x, 0
+  %4 = select i1 %3, i64 %2, i64 %y
+  ret i64 %4
+}
+
+define i64 @cmov_bsr64_undef(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsr64_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB7_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    jne .LBB7_5
+; X86-NEXT:  .LBB7_4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_1:
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    je .LBB7_4
+; X86-NEXT:  .LBB7_5:
+; X86-NEXT:    xorl $63, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr64_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %2 = xor i64 %1, 63
+  %3 = icmp eq i64 %x, 0
+  %4 = select i1 %3, i64 %y, i64 %2
+  ret i64 %4
+}
+
+define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsr128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    orl %ebp, %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    je .LBB8_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    jne .LBB8_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    je .LBB8_7
+; X86-NEXT:  .LBB8_6:
+; X86-NEXT:    bsrl %edi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    jmp .LBB8_8
+; X86-NEXT:  .LBB8_1:
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl $128, %edx
+; X86-NEXT:    jmp .LBB8_11
+; X86-NEXT:  .LBB8_3:
+; X86-NEXT:    bsrl %ebp, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB8_6
+; X86-NEXT:  .LBB8_7: # %cond.false
+; X86-NEXT:    bsrl %ecx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:  .LBB8_8: # %cond.false
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    orl %ebp, %ebx
+; X86-NEXT:    jne .LBB8_10
+; X86-NEXT:  # %bb.9: # %cond.false
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB8_10: # %cond.false
+; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:  .LBB8_11: # %cond.end
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    je .LBB8_12
+; X86-NEXT:  # %bb.13: # %cond.end
+; X86-NEXT:    xorl $127, %edx
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    jmp .LBB8_14
+; X86-NEXT:  .LBB8_12:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:  .LBB8_14: # %cond.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: cmov_bsr128:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrq %rsi, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    movl $127, %eax
+; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    xorq $63, %rax
+; X64-NEXT:    addq $64, %rax
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    cmovneq %r8, %rax
+; X64-NEXT:    xorq $127, %rax
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    cmoveq %rdx, %rax
+; X64-NEXT:    cmoveq %rcx, %r8
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    retq
+  %1 = tail call i128 @llvm.ctlz.i128(i128 %x, i1 false)
+  %2 = xor i128 %1, 127
+  %3 = icmp eq i128 %x, 0
+  %4 = select i1 %3, i128 %y, i128 %2
+  ret i128 %4
+}
+
+define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsr128_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB9_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    jmp .LBB9_3
+; X86-NEXT:  .LBB9_1:
+; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:  .LBB9_3:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB9_4
+; X86-NEXT:  # %bb.5:
+; X86-NEXT:    bsrl %ebx, %ebp
+; X86-NEXT:    xorl $31, %ebp
+; X86-NEXT:    orl $32, %ebp
+; X86-NEXT:    jmp .LBB9_6
+; X86-NEXT:  .LBB9_4:
+; X86-NEXT:    bsrl %edx, %ebp
+; X86-NEXT:    xorl $31, %ebp
+; X86-NEXT:  .LBB9_6:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    jne .LBB9_8
+; X86-NEXT:  # %bb.7:
+; X86-NEXT:    orl $64, %ebp
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:  .LBB9_8:
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    jne .LBB9_9
+; X86-NEXT:  # %bb.10:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    jmp .LBB9_11
+; X86-NEXT:  .LBB9_9:
+; X86-NEXT:    xorl $127, %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:  .LBB9_11:
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: cmov_bsr128_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrq %rsi, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    xorq $63, %rax
+; X64-NEXT:    orq $64, %rax
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    cmovneq %r8, %rax
+; X64-NEXT:    xorq $127, %rax
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    cmoveq %rdx, %rax
+; X64-NEXT:    cmoveq %rcx, %r8
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    retq
+  %1 = tail call i128 @llvm.ctlz.i128(i128 %x, i1 true)
+  %2 = xor i128 %1, 127
+  %3 = icmp ne i128 %x, 0
+  %4 = select i1 %3, i128 %2, i128 %y
+  ret i128 %4
+}
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i128 @llvm.ctlz.i128(i128, i1)

From a6ace2801e8900a6fe8c3b8295938f3b3c1e4466 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 11 Jun 2025 08:07:30 +0100
Subject: [PATCH 033/851] [X86] combineConcatVectorOps - ensure we're only
 concatenating v2f64 generic shuffles into vXf64 vshufpd

Identified while triaging #143606 - we can't concat v4f64 lhs/rhs subvecs and then expect the v2f64 operands to be in the correct place for VSHUFPD

Test coverage will follow
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 96be91256915d..8bcd8670879a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59383,7 +59383,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
 
   // We can always convert per-lane vXf64 shuffles into VSHUFPD.
   if (!IsSplat &&
-      (VT == MVT::v4f64 || (VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
+      ((NumOps == 2 && VT == MVT::v4f64) ||
+       (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
       all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
     // Collect the individual per-lane v2f64/v4f64 shuffles.
     MVT OpVT = Ops[0].getSimpleValueType();

From 32ac7dc2d21843091116b636777c174830cd2dd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= <bjorn.a.pettersson@ericsson.com>
Date: Wed, 11 Jun 2025 09:24:03 +0200
Subject: [PATCH 034/851] [test][AArch64] Adjust vector insertion lit tests
 (#143101)

The test cases test_insert_v16i8_insert_2_undef_base and
test_insert_v16i8_insert_2_undef_base_different_valeus in
CodeGen/AArch64/arm64-vector-insertion.ll was leaving element 8 in the
vector as "undef" without any real explanation. It kind of looked like a
typo as the input IR looked like this
   %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
   %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
leaving %v.8 as unused.

This patch is cleaning up the tests a bit by adding separate test cases
to validate what is happening when skipping insert at index 8, while
amending the original tests cases to use %v.8 instead of %v.7 when
creating %v.10.
---
 .../CodeGen/AArch64/arm64-vector-insertion.ll | 69 ++++++++++++++++++-
 1 file changed, 67 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
index 94074d1689f6a..ff28c7817d143 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -66,6 +66,35 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) {
   %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
   %v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7
   %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
+  %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10
+  %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
+  %v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12
+  %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13
+  %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14
+  %v.15 = insertelement <16 x i8> %v.14, i8 %a, i32 15
+  ret <16 x i8> %v.15
+}
+
+; Similar to above, but we leave element 8 as undef. One interesting part with
+; this test case is that %a may be poison, so simply inserting %a also at
+; index 8 would make the result vector more poisonous.
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_skip8(i32 %a0) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_skip8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w8, w0, #5
+; CHECK-NEXT:    dup.16b v0, w8
+; CHECK-NEXT:    mov.b v0[5], wzr
+; CHECK-NEXT:    mov.b v0[9], wzr
+; CHECK-NEXT:    ret
+  %a1 = lshr exact i32 %a0, 5
+  %a = trunc i32 %a1 to i8
+  %v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>  , i8 %a, i32 0
+  %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
+  %v.2 = insertelement <16 x i8> %v.1, i8 %a, i32 2
+  %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3
+  %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4
+  %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
+  %v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7
   %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
   %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
   %v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12
@@ -75,8 +104,8 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) {
   ret <16 x i8> %v.15
 }
 
-define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, i8 %b) {
-; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_valeus:
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values(i8 %a, i8 %b) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_values:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup.16b v0, w0
 ; CHECK-NEXT:    mov.b v0[2], w1
@@ -94,6 +123,42 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a,
   %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
   %v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7
   %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
+  %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10
+  %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
+  %v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12
+  %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13
+  %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14
+  %v.15 = insertelement <16 x i8> %v.14, i8 %b, i32 15
+  ret <16 x i8> %v.15
+}
+
+; Similar to above, but we leave element 8 as undef. One interesting part with
+; this test case is that %a and %b may be poison, so simply inserting %a or %b
+; at index 8 would make the result vector more poisonous.
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values_skip8(i32 %a0, i32 %b0) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_values_skip8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w8, w0, #5
+; CHECK-NEXT:    dup.16b v0, w8
+; CHECK-NEXT:    lsr w8, w1, #5
+; CHECK-NEXT:    mov.b v0[2], w8
+; CHECK-NEXT:    mov.b v0[5], wzr
+; CHECK-NEXT:    mov.b v0[7], w8
+; CHECK-NEXT:    mov.b v0[9], wzr
+; CHECK-NEXT:    mov.b v0[12], w8
+; CHECK-NEXT:    mov.b v0[15], w8
+; CHECK-NEXT:    ret
+  %a1 = lshr exact i32 %a0, 5
+  %a = trunc i32 %a1 to i8
+  %b1 = lshr exact i32 %b0, 5
+  %b = trunc i32 %b1 to i8
+  %v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>  , i8 %a, i32 0
+  %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
+  %v.2 = insertelement <16 x i8> %v.1, i8 %b, i32 2
+  %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3
+  %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4
+  %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
+  %v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7
   %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
   %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
   %v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12

From 686ec6cfe86367c43dccd83d7e6e2bac7e6a73a0 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mpeis@arm.com>
Date: Wed, 11 Jun 2025 08:24:10 +0100
Subject: [PATCH 035/851] [BOLT][AArch64] Fix adr-relaxation.s test (#143151)

On some AArch64 machines the splitting was inconsistent.
This causes cold `foo` to have a `mov` instruction before adrp.

```
<foo.cold.0>:
  mov     x0, #0x0                // =0
  adrp    x1, 0x600000 <_start>
  add     x1, x1, #0x14
  ret
```

This patch removes the `mov` instruction right above .L2, making
splitting deterministic.
---
 bolt/test/AArch64/adr-relaxation.s | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bolt/test/AArch64/adr-relaxation.s b/bolt/test/AArch64/adr-relaxation.s
index a643a62339ba3..864650c3287d8 100644
--- a/bolt/test/AArch64/adr-relaxation.s
+++ b/bolt/test/AArch64/adr-relaxation.s
@@ -34,7 +34,6 @@ foo:
   .cfi_startproc
   cmp  x1, x11
   b.hi  .L2
-  mov  x0, #0x0
 .L2:
 # CHECK-FOO: <foo.cold.0>:
 # CHECK-FOO-NEXT: adrp

From 521e6ce5c8fdfc72cccc1accd78a59f1a5e2805a Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002@gmail.com>
Date: Wed, 11 Jun 2025 10:25:29 +0300
Subject: [PATCH 036/851] [CI] Add mention of LLVM Developer Policy in
 email-check message (NFC) (#143300)

As for now, It may be hard for people to get truth from long Discourse
discussion, so a link to official document may be enough to convince
changing email from private to public.
---
 .github/workflows/email-check.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
index f4481d5cf5583..904ad718f97dd 100644
--- a/.github/workflows/email-check.yaml
+++ b/.github/workflows/email-check.yaml
@@ -32,7 +32,8 @@ jobs:
           COMMENT: >-
             ⚠️ We detected that you are using a GitHub private e-mail address to contribute to the repo.<br/>
             Please turn off [Keep my email addresses private](https://github.com/settings/emails) setting in your account.<br/>
-            See [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information.
+            See [LLVM Developer Policy](https://llvm.org/docs/DeveloperPolicy.html#email-addresses) and
+            [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information.
         run: |
           cat << EOF > comments
           [{"body" : "$COMMENT"}]

From 17f1dac805d388596be5e8c316c0f14b3222da4e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 11 Jun 2025 08:12:42 +0100
Subject: [PATCH 037/851] [X86] Add test coverage showing failure to merge
 "zero input passthrough" behaviour for BSF instructions on x86_64 targets

---
 llvm/test/CodeGen/X86/bsf.ll | 452 +++++++++++++++++++++++++++++++++++
 1 file changed, 452 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/bsf.ll

diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
new file mode 100644
index 0000000000000..58929115baf54
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -0,0 +1,452 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
+
+define i8 @cmov_bsf8(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsf8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    je .LBB0_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    orl $256, %eax # imm = 0x100
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    orl $256, %eax # imm = 0x100
+; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    testb %dil, %dil
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 false)
+  %2 = icmp eq i8 %x, 0
+  %3 = select i1 %2, i8 %y, i8 %1
+  ret i8 %3
+}
+
+define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsf8_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    je .LBB1_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB1_1:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf8_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    testb %dil, %dil
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 true)
+  %2 = icmp eq i8 %x, 0
+  %3 = select i1 %2, i8 %y, i8 %1
+  ret i8 %3
+}
+
+define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsf16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB2_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    orl $65536, %eax # imm = 0x10000
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB2_1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    orl $65536, %eax # imm = 0x10000
+; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    testw %di, %di
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
+  %2 = icmp eq i16 %x, 0
+  %3 = select i1 %2, i16 %y, i16 %1
+  ret i16 %3
+}
+
+define i16 @cmov_bsf16_undef(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsf16_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB3_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB3_1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf16_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    testw %di, %di
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %2 = icmp eq i16 %x, 0
+  %3 = select i1 %2, i16 %y, i16 %1
+  ret i16 %3
+}
+
+define i32 @cmov_bsf32(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsf32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB4_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB4_5
+; X86-NEXT:  .LBB4_4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB4_5: # %cond.end
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB4_4
+; X86-NEXT:    jmp .LBB4_5
+;
+; X64-LABEL: cmov_bsf32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    bsfl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %2 = icmp eq i32 %x, 0
+  %3 = select i1 %2, i32 %y, i32 %1
+  ret i32 %3
+}
+
+define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsf32_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    je .LBB5_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB5_1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf32_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsfl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %2 = icmp eq i32 %x, 0
+  %3 = select i1 %2, i32 %y, i32 %1
+  ret i32 %3
+}
+
+define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsf64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    je .LBB6_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    jne .LBB6_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB6_6
+; X86-NEXT:    jmp .LBB6_7
+; X86-NEXT:  .LBB6_1:
+; X86-NEXT:    movl $64, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    jne .LBB6_7
+; X86-NEXT:  .LBB6_6:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB6_7: # %cond.end
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB6_3:
+; X86-NEXT:    rep bsfl %esi, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB6_6
+; X86-NEXT:    jmp .LBB6_7
+;
+; X64-LABEL: cmov_bsf64:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $64, %eax
+; X64-NEXT:    bsfq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
+  %2 = icmp eq i64 %x, 0
+  %3 = select i1 %2, i64 %y, i64 %1
+  ret i64 %3
+}
+
+define i64 @cmov_bsf64_undef(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsf64_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    je .LBB7_5
+; X86-NEXT:  # %bb.1: # %select.false.sink
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB7_2
+; X86-NEXT:  # %bb.3: # %select.false.sink
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_5: # %select.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_2:
+; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf64_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsfq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %2 = icmp eq i64 %x, 0
+  %3 = select i1 %2, i64 %y, i64 %1
+  ret i64 %3
+}
+
+define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsf128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    orl %ebp, %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    je .LBB8_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB8_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    rep bsfl %edi, %esi
+; X86-NEXT:    addl $32, %esi
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    je .LBB8_7
+; X86-NEXT:  .LBB8_6:
+; X86-NEXT:    rep bsfl %eax, %edx
+; X86-NEXT:    jmp .LBB8_8
+; X86-NEXT:  .LBB8_1:
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    movl $128, %esi
+; X86-NEXT:    jmp .LBB8_11
+; X86-NEXT:  .LBB8_3:
+; X86-NEXT:    rep bsfl %ecx, %esi
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    jne .LBB8_6
+; X86-NEXT:  .LBB8_7: # %cond.false
+; X86-NEXT:    rep bsfl %ebp, %edx
+; X86-NEXT:    addl $32, %edx
+; X86-NEXT:  .LBB8_8: # %cond.false
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    jne .LBB8_10
+; X86-NEXT:  # %bb.9: # %cond.false
+; X86-NEXT:    addl $64, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:  .LBB8_10: # %cond.false
+; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:  .LBB8_11: # %cond.end
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    jne .LBB8_13
+; X86-NEXT:  # %bb.12:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:  .LBB8_13: # %cond.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl %ebp, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: cmov_bsf128:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    je .LBB8_2
+; X64-NEXT:  # %bb.1: # %select.false.sink
+; X64-NEXT:    rep bsfq %rdi, %rcx
+; X64-NEXT:    movl $64, %eax
+; X64-NEXT:    rep bsfq %rsi, %rax
+; X64-NEXT:    addq $64, %rax
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    cmovneq %rcx, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB8_2: # %select.end
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    retq
+  %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 false)
+  %2 = icmp eq i128 %x, 0
+  %3 = select i1 %2, i128 %y, i128 %1
+  ret i128 %3
+}
+
+define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsf128_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    orl %edi, %ebp
+; X86-NEXT:    je .LBB9_11
+; X86-NEXT:  # %bb.1: # %select.false.sink
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB9_2
+; X86-NEXT:  # %bb.3: # %select.false.sink
+; X86-NEXT:    rep bsfl %ecx, %edi
+; X86-NEXT:    addl $32, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    je .LBB9_6
+; X86-NEXT:  .LBB9_5:
+; X86-NEXT:    rep bsfl %ebx, %esi
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    je .LBB9_8
+; X86-NEXT:    jmp .LBB9_9
+; X86-NEXT:  .LBB9_11: # %select.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    jmp .LBB9_10
+; X86-NEXT:  .LBB9_2:
+; X86-NEXT:    rep bsfl %edx, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB9_5
+; X86-NEXT:  .LBB9_6: # %select.false.sink
+; X86-NEXT:    rep bsfl %esi, %esi
+; X86-NEXT:    addl $32, %esi
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    jne .LBB9_9
+; X86-NEXT:  .LBB9_8: # %select.false.sink
+; X86-NEXT:    addl $64, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:  .LBB9_9: # %select.false.sink
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl $0, 12(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:  .LBB9_10: # %select.false.sink
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: cmov_bsf128_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    je .LBB9_2
+; X64-NEXT:  # %bb.1: # %select.false.sink
+; X64-NEXT:    rep bsfq %rdi, %rcx
+; X64-NEXT:    rep bsfq %rsi, %rax
+; X64-NEXT:    addq $64, %rax
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    cmovneq %rcx, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB9_2: # %select.end
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    retq
+  %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true)
+  %2 = icmp eq i128 %x, 0
+  %3 = select i1 %2, i128 %y, i128 %1
+  ret i128 %3
+}
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i128 @llvm.cttz.i128(i128, i1)

From a72bcda1434c72f9db6687565a361479e0dde572 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 11 Jun 2025 08:24:10 +0100
Subject: [PATCH 038/851] [X86] add test coverage for #143606

---
 .../X86/vector-shuffle-combining-avx512vl.ll  | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
index 15c82f169c86e..d5aa7588925d8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
@@ -137,3 +137,31 @@ define void @PR142995(ptr %p0, ptr %p1, ptr %p2) nounwind #0 {
 }
 declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr captures(none), i32 immarg, <5 x i1>, <5 x i32>)
 declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr captures(none), i32 immarg, <64 x i1>, <64 x i32>)
+
+define <8 x double> @PR143606(ptr %px, ptr %py) {
+; X86-LABEL: PR143606:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovapd (%ecx), %ymm0
+; X86-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3]
+; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3]
+; X86-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: PR143606:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovapd (%rdi), %ymm0
+; X64-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3]
+; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3]
+; X64-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %x = load <4 x double>, ptr %px, align 32
+  %y.lo = load <4 x double>, ptr %py, align 32
+  %py.hi = getelementptr inbounds nuw i8, ptr %py, i64 32
+  %y.hi = load <4 x double>, ptr %py.hi, align 32
+  %lo = shufflevector <4 x double> %x, <4 x double> %y.lo, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+  %hi = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x, <4 x i64> <i64 1, i64 4, i64 2, i64 7>, <4 x double> %y.hi)
+  %res = shufflevector <4 x double> %lo, <4 x double> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %res
+}

From e9bd1aee6537508970614fd79a4f076ba4ed93d0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 11 Jun 2025 08:30:09 +0100
Subject: [PATCH 039/851] [X86] bmi-select-distrib.ll - remove unused check
 prefixes and pull out PR comments above tests. NFC

---
 llvm/test/CodeGen/X86/bmi-select-distrib.ll | 31 +++++++++------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
index 49beda516d508..e5696ded4fbf1 100644
--- a/llvm/test/CodeGen/X86/bmi-select-distrib.ll
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi | FileCheck %s --check-prefixes=X86,X86-BMI
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86,X86-BMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64,X64-BMI2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64
 
-define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
 ; PR131587
+define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsil %eax, %ecx
@@ -25,8 +25,8 @@ define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
 ; PR131587
+define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsil %eax, %ecx
@@ -46,8 +46,8 @@ define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
 ; PR131587
+define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsil %eax, %ecx
@@ -67,8 +67,8 @@ define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
 ; PR131587
+define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
@@ -283,8 +283,8 @@ define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) no
   ret i32 %ret
 }
 
-define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
 ; PR133848
+define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl %eax, %ecx
@@ -304,8 +304,8 @@ define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
 ; PR133848
+define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl %eax, %ecx
@@ -325,8 +325,8 @@ define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
 ; PR133848
+define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl %eax, %ecx
@@ -346,8 +346,8 @@ define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
 ; PR133848
+define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl %eax, %ecx
@@ -392,8 +392,8 @@ define i32 @and_sub_1_select_orig(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
 ; PR133848
+define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
@@ -863,8 +863,3 @@ define i32 @xor_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2)
   %ret = xor i32 %a1, %bls
   ret i32 %ret
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-BMI: {{.*}}
-; X64-BMI2: {{.*}}
-; X86-BMI: {{.*}}
-; X86-BMI2: {{.*}}

From 13115276d0d12b0d9bf952abdc19f04866db16a8 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 11 Jun 2025 08:32:55 +0100
Subject: [PATCH 040/851] Revert "[AArch64][GlobalISel] Expand 64bit extracts
 to 128bit to allow more patterns (#142904)"

This reverts commit 61cdba602abe67761ab2bbf12bf85710dfa963f4 due to verifier
issues.
---
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |  32 +--
 .../GlobalISel/regbank-extract-vector-elt.mir |   4 +-
 llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll  |   3 -
 llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll  |   3 -
 llvm/test/CodeGen/AArch64/abs.ll              |   1 -
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll  |  13 +-
 .../AArch64/arm64-neon-simd-ldst-one.ll       |  45 ++--
 llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll |  55 +++--
 llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll     |   1 -
 llvm/test/CodeGen/AArch64/bswap.ll            |   1 -
 llvm/test/CodeGen/AArch64/concat-vector.ll    |   7 +-
 llvm/test/CodeGen/AArch64/double_reduct.ll    |  18 +-
 llvm/test/CodeGen/AArch64/f16-instructions.ll |  12 +-
 llvm/test/CodeGen/AArch64/faddsub.ll          |   4 +-
 llvm/test/CodeGen/AArch64/fcopysign.ll        |   4 +-
 llvm/test/CodeGen/AArch64/fcvt.ll             |  14 +-
 llvm/test/CodeGen/AArch64/fdiv.ll             |   2 +-
 llvm/test/CodeGen/AArch64/fminimummaximum.ll  |   4 +-
 llvm/test/CodeGen/AArch64/fminmax.ll          |   4 +-
 llvm/test/CodeGen/AArch64/fmla.ll             |   6 +-
 llvm/test/CodeGen/AArch64/fmul.ll             |   2 +-
 .../test/CodeGen/AArch64/fptosi-sat-vector.ll |   1 -
 .../test/CodeGen/AArch64/fptoui-sat-vector.ll |   1 -
 llvm/test/CodeGen/AArch64/fptrunc.ll          |   4 +-
 llvm/test/CodeGen/AArch64/fsqrt.ll            |   2 +-
 llvm/test/CodeGen/AArch64/insertextract.ll    |  45 ++--
 llvm/test/CodeGen/AArch64/itofp.ll            |  20 +-
 llvm/test/CodeGen/AArch64/llvm.exp10.ll       |  33 ++-
 llvm/test/CodeGen/AArch64/popcount.ll         |   8 +-
 llvm/test/CodeGen/AArch64/ptradd.ll           |   1 -
 llvm/test/CodeGen/AArch64/shift.ll            |   6 -
 llvm/test/CodeGen/AArch64/store.ll            |  15 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll | 228 +++++++++++++-----
 .../CodeGen/AArch64/vecreduce-fadd-strict.ll  |   7 +-
 .../vecreduce-fmax-legalization-nan.ll        |  26 +-
 .../AArch64/vecreduce-fmax-legalization.ll    |  26 +-
 .../CodeGen/AArch64/vecreduce-fmaximum.ll     |  26 +-
 .../AArch64/vecreduce-fmin-legalization.ll    |  26 +-
 .../CodeGen/AArch64/vecreduce-fminimum.ll     |  26 +-
 .../CodeGen/AArch64/vecreduce-fmul-strict.ll  |  29 ++-
 llvm/test/CodeGen/AArch64/vecreduce-fmul.ll   | 121 ++++++----
 .../AArch64/vecreduce-umax-legalization.ll    |  15 +-
 llvm/test/CodeGen/AArch64/vector-lrint.ll     |  25 +-
 43 files changed, 592 insertions(+), 334 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 53c7a00a7f9f0..31954e7954c03 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -399,26 +399,6 @@ void AArch64RegisterBankInfo::applyMappingImpl(
     MI.getOperand(1).setReg(ConstReg);
     return applyDefaultMapping(OpdMapper);
   }
-  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
-    // SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the
-    // number of duplicate lane-extract patterns needed. Do the same here so
-    // that selection will operate on the larger vectors.
-    Register Src = MI.getOperand(1).getReg();
-    LLT SrcTy = MRI.getType(Src);
-    assert(SrcTy.getSizeInBits() == 64 && "Expected 64-bit source vector");
-    LLT DstTy = SrcTy.multiplyElements(2);
-    Builder.setInsertPt(*MI.getParent(), MI.getIterator());
-    auto Undef = Builder.buildUndef(SrcTy);
-    auto Concat = Builder.buildConcatVectors(DstTy, {Src, Undef.getReg(0)});
-    MRI.setRegBank(Undef.getReg(0), getRegBank(AArch64::FPRRegBankID));
-    MRI.setRegBank(Concat.getReg(0), getRegBank(AArch64::FPRRegBankID));
-    for (MachineInstr &Ext :
-         make_early_inc_range(MRI.use_nodbg_instructions(Src))) {
-      if (Ext.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT)
-        Ext.getOperand(1).setReg(Concat.getReg(0));
-    }
-    return applyDefaultMapping(OpdMapper);
-  }
   default:
     llvm_unreachable("Don't know how to handle that operation");
   }
@@ -1034,20 +1014,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     }
     break;
   }
-  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     // Destination and source need to be FPRs.
     OpRegBankIdx[0] = PMI_FirstFPR;
     OpRegBankIdx[1] = PMI_FirstFPR;
-    // Index needs to be a GPR constant.
+
+    // Index needs to be a GPR.
     OpRegBankIdx[2] = PMI_FirstGPR;
-    // SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the
-    // number of duplicate lane-extract patterns needed. Do the same here so
-    // that selection will operate on the larger vectors.
-    LLT Ty = MRI.getType(MI.getOperand(1).getReg());
-    if (!Ty.isScalable() && Ty.getSizeInBits() == 64)
-      MappingID = CustomMappingID;
     break;
-  }
   case TargetOpcode::G_INSERT_VECTOR_ELT:
     OpRegBankIdx[0] = PMI_FirstFPR;
     OpRegBankIdx[1] = PMI_FirstFPR;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
index 4e569e0bc7e5f..35bc36d472b1a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
@@ -94,9 +94,7 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(<4 x s16>) = COPY $d0
     ; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr(<4 x s16>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:fpr(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[DEF]](<4 x s16>)
-    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS]](<8 x s16>), [[C]](s64)
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[C]](s64)
     ; CHECK-NEXT: $h0 = COPY [[EVEC]](s16)
     ; CHECK-NEXT: RET_ReallyLR implicit $h0
     %0:_(<4 x s16>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index 287344bdbd29f..7f922c0047553 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -70,9 +70,6 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ;
 ; CHECK-GI-LABEL: test_bitf_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 73fcee56506f9..b8eb8269d605c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -70,9 +70,6 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ;
 ; CHECK-GI-LABEL: test_bit_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s0
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 470d68a805718..0f56d25a47b2a 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -243,7 +243,6 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
 ;
 ; CHECK-GI-LABEL: abs_v1i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s0
 ; CHECK-GI-NEXT:    cmp w8, #0
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 60af49d867be7..367105f783817 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1215,7 +1215,6 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: testDUP.v1i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    dup v0.8b, w8
 ; CHECK-GI-NEXT:    ret
@@ -1711,7 +1710,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI127_0
-; CHECK-GI-NEXT:    mov b1, v0.b[0]
+; CHECK-GI-NEXT:    mov v1.b[0], v0.b[0]
 ; CHECK-GI-NEXT:    mov v1.b[1], v0.b[1]
 ; CHECK-GI-NEXT:    mov v1.b[2], v0.b[2]
 ; CHECK-GI-NEXT:    mov v1.b[3], v0.b[3]
@@ -1818,7 +1817,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
 ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov b2, v0.b[0]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    mov v2.b[1], v0.b[1]
 ; CHECK-GI-NEXT:    mov v2.b[2], v0.b[2]
@@ -1904,7 +1903,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI131_0
-; CHECK-GI-NEXT:    mov h1, v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
@@ -1975,7 +1974,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov h2, v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
 ; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
@@ -2037,7 +2036,7 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI135_0
-; CHECK-GI-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI135_0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v0.16b
@@ -2243,7 +2242,6 @@ define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: concat_vector_v8i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    dup v0.8b, w8
 ; CHECK-GI-NEXT:    ret
@@ -2270,7 +2268,6 @@ define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: concat_vector_v16i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    dup v0.16b, w8
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
index ac6f041ccd70d..f47c06e1ba4cb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -614,11 +614,16 @@ entry:
 }
 
 define void @test_vst1_lane0_s16(ptr %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vst1_lane0_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str h0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-GI-LABEL: test_vst1_lane0_s16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_s16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str h0, [x0]
+; CHECK-SD-NEXT:    ret
 entry:
   %0 = extractelement <4 x i16> %b, i32 0
   store i16 %0, ptr %a, align 2
@@ -638,11 +643,16 @@ entry:
 }
 
 define void @test_vst1_lane0_s32(ptr %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vst1_lane0_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-GI-LABEL: test_vst1_lane0_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str s0, [x0]
+; CHECK-GI-NEXT:    ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
 entry:
   %0 = extractelement <2 x i32> %b, i32 0
   store i32 %0, ptr %a, align 4
@@ -673,11 +683,16 @@ entry:
 }
 
 define void @test_vst1_lane0_f32(ptr %a, <2 x float> %b) {
-; CHECK-LABEL: test_vst1_lane0_f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-GI-LABEL: test_vst1_lane0_f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str s0, [x0]
+; CHECK-GI-NEXT:    ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
 entry:
   %0 = extractelement <2 x float> %b, i32 0
   store float %0, ptr %a, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
index 1f8ac792d75f5..cb14adc00df00 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -663,14 +663,24 @@ entry:
 }
 
 define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
-; CHECK-LABEL: test_vqrdmlahs_lane_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    fmov s2, w1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqrdmlah s1, s2, v0.s[1]
-; CHECK-NEXT:    fmov w0, s1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vqrdmlahs_lane_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s1, w0
+; CHECK-SD-NEXT:    fmov s2, w1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    sqrdmlah s1, s2, v0.s[1]
+; CHECK-SD-NEXT:    fmov w0, s1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vqrdmlahs_lane_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    fmov s2, w1
+; CHECK-GI-NEXT:    mov s0, v0.s[1]
+; CHECK-GI-NEXT:    sqrdmlah s1, s2, s0
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    ret
 entry:
   %vget_lane = extractelement <2 x i32> %c, i64 1
   %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4
@@ -803,14 +813,24 @@ entry:
 }
 
 define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
-; CHECK-LABEL: test_vqrdmlshs_lane_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    fmov s2, w1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqrdmlsh s1, s2, v0.s[1]
-; CHECK-NEXT:    fmov w0, s1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vqrdmlshs_lane_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s1, w0
+; CHECK-SD-NEXT:    fmov s2, w1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    sqrdmlsh s1, s2, v0.s[1]
+; CHECK-SD-NEXT:    fmov w0, s1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vqrdmlshs_lane_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    fmov s2, w1
+; CHECK-GI-NEXT:    mov s0, v0.s[1]
+; CHECK-GI-NEXT:    sqrdmlsh s1, s2, s0
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    ret
 entry:
   %vget_lane = extractelement <2 x i32> %c, i64 1
   %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4
@@ -847,6 +867,3 @@ entry:
   %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
   ret i32 %vqrdmlshs_s32.i
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index eccf918f74312..d4cc154ac6afc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -271,7 +271,6 @@ define half @test_vcvt_f16_f32(<1 x float> %x) {
 ;
 ; GISEL-LABEL: test_vcvt_f16_f32:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; GISEL-NEXT:    fcvt h0, s0
 ; GISEL-NEXT:    ret
   %tmp = fptrunc <1 x float> %x to <1 x half>
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9ae4782b52bd9..898958fb4993f 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -207,7 +207,6 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
 ;
 ; CHECK-GI-LABEL: bswap_v1i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    rev w8, w8
 ; CHECK-GI-NEXT:    fmov s0, w8
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index 1e8dd0c78043a..acf15f1bd1178 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -13,10 +13,11 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov w8, v0.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    mov w8, v1.s[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v0.h[3], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
    %v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index 2d146bf9aae89..f30895db2c098 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -65,8 +65,10 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v2.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -90,8 +92,10 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -918,8 +922,10 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
 ; CHECK-GI-NEXT:    mov d5, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v4.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v5.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s4, v0.s[1]
+; CHECK-GI-NEXT:    mov s5, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s4
+; CHECK-GI-NEXT:    fmul s1, s1, s5
 ; CHECK-GI-NEXT:    fmul s0, s0, s2
 ; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index aa120f2643950..adc536da26f26 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -1496,7 +1496,7 @@ define half @test_copysign(half %a, half %b) #0 {
 ; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-GI-NEXT:    // kill: def $h1 killed $h1 def $d1
 ; CHECK-CVT-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-CVT-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: test_copysign:
@@ -1505,7 +1505,7 @@ define half @test_copysign(half %a, half %b) #0 {
 ; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-FP16-GI-NEXT:    // kill: def $h1 killed $h1 def $d1
 ; CHECK-FP16-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-FP16-GI-NEXT:    ret
   %r = call half @llvm.copysign.f16(half %a, half %b)
   ret half %r
@@ -1536,7 +1536,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
 ; CHECK-CVT-GI-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-CVT-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: test_copysign_f32:
@@ -1545,7 +1545,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
 ; CHECK-FP16-GI-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-FP16-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-FP16-GI-NEXT:    ret
   %tb = fptrunc float %b to half
   %r = call half @llvm.copysign.f16(half %a, half %tb)
@@ -1577,7 +1577,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
 ; CHECK-CVT-GI-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-CVT-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: test_copysign_f64:
@@ -1586,7 +1586,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
 ; CHECK-FP16-GI-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-FP16-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-FP16-GI-NEXT:    ret
   %tb = fptrunc double %b to half
   %r = call half @llvm.copysign.f16(half %a, half %tb)
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index 943073e2a603e..b15579199a059 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -196,7 +196,7 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
@@ -537,7 +537,7 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 7ac1f37af2e0b..3a5f7e2cd6b29 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -33,7 +33,7 @@ define float @copysign_f32(float %a, float %b) {
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $d0
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $d1
 ; CHECK-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call float @llvm.copysign.f32(float %a, float %b)
@@ -56,7 +56,7 @@ define half @copysign_f16(half %a, half %b) {
 ; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-GI-NEXT:    // kill: def $h1 killed $h1 def $d1
 ; CHECK-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call half @llvm.copysign.f16(half %a, half %b)
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 2c512de413aeb..b408e9c1bd4e6 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -169,7 +169,7 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintp v2.4s, v2.4s
@@ -468,7 +468,7 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintm v2.4s, v2.4s
@@ -767,7 +767,7 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frinti v2.4s, v2.4s
@@ -1066,7 +1066,7 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintn v2.4s, v2.4s
@@ -1365,7 +1365,7 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintx v2.4s, v2.4s
@@ -1664,7 +1664,7 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frinta v2.4s, v2.4s
@@ -1963,7 +1963,7 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintz v2.4s, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index d232ca4d9c131..5bdccccc62b99 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -199,7 +199,7 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index 1c7c55d12a864..fb12f8acf1745 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
@@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index da9b57223cff7..64f0da8b4cd0f 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
@@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index ef59209b69921..a37aabb0b5384 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -268,7 +268,7 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v5.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
@@ -873,7 +873,7 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov h0, v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
@@ -1358,7 +1358,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov h0, v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index 51eba5666f681..bd3d1353e643e 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -196,7 +196,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index bcebbf4982eaa..9c21d2bf083a2 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -31,7 +31,6 @@ define <1 x i32> @test_signed_v1f32_v1i32(<1 x float> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v1f32_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fcvtzs w8, s0
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 38895eb7bd761..44847a41287d6 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -31,7 +31,6 @@ define <1 x i32> @test_unsigned_v1f32_v1i32(<1 x float> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v1f32_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fcvtzu w8, s0
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index a428c95c90387..1f84c944d7c16 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -263,7 +263,7 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
 ; CHECK-GI-NEXT:    fcvt s2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT:    mov s0, v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
@@ -354,7 +354,7 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
 ; CHECK-GI-LABEL: fptrunc_v2f32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 1e888a4c0e193..6c5fd8e52b017 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -203,7 +203,7 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    fsqrt v2.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 1af36ccaefa30..5c89316e5f570 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -1478,11 +1478,16 @@ entry:
 }
 
 define float @extract_v2f32_0(<2 x float> %a, i32 %c) {
-; CHECK-LABEL: extract_v2f32_0:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extract_v2f32_0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2f32_0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-GI-NEXT:    ret
 entry:
   %d = extractelement <2 x float> %a, i32 0
   ret float %d
@@ -1681,11 +1686,16 @@ entry:
 }
 
 define half @extract_v4f16_0(<4 x half> %a, i32 %c) {
-; CHECK-LABEL: extract_v4f16_0:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extract_v4f16_0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4f16_0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
+; CHECK-GI-NEXT:    ret
 entry:
   %d = extractelement <4 x half> %a, i32 0
   ret half %d
@@ -2149,11 +2159,16 @@ entry:
 }
 
 define i32 @extract_v2i32_0(<2 x i32> %a, i32 %c) {
-; CHECK-LABEL: extract_v2i32_0:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extract_v2i32_0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i32_0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %d = extractelement <2 x i32> %a, i32 0
   ret i32 %d
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 5ec30b6e8a667..e8194b9bd9b27 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4378,7 +4378,7 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtn v2.2s, v2.2d
 ; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT:    mov s0, v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
@@ -4415,7 +4415,7 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtn v2.2s, v2.2d
 ; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT:    mov s0, v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
@@ -6393,7 +6393,7 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -6439,7 +6439,7 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -7375,7 +7375,7 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-LABEL: stofp_v2i32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    ret
@@ -7395,7 +7395,7 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-LABEL: utofp_v2i32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    ret
@@ -7602,7 +7602,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    shl v0.2s, v0.2s, #16
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.2s, v0.2s, #16
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -7637,7 +7637,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    movi d1, #0x00ffff0000ffff
 ; CHECK-GI-NOFP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -8124,7 +8124,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -8175,7 +8175,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    movi d1, #0x0000ff000000ff
 ; CHECK-GI-NOFP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 9d165556f1c73..c1ea891bc86e7 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -301,17 +301,28 @@ define float @exp10_f32(float %x) {
 }
 
 define <1 x float> @exp10_v1f32(<1 x float> %x) {
-; CHECK-LABEL: exp10_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    bl exp10f
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $d0
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; SDAG-LABEL: exp10_v1f32:
+; SDAG:       // %bb.0:
+; SDAG-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; SDAG-NEXT:    .cfi_def_cfa_offset 16
+; SDAG-NEXT:    .cfi_offset w30, -16
+; SDAG-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SDAG-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; SDAG-NEXT:    bl exp10f
+; SDAG-NEXT:    // kill: def $s0 killed $s0 def $d0
+; SDAG-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: exp10_v1f32:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -16
+; GISEL-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; GISEL-NEXT:    bl exp10f
+; GISEL-NEXT:    // kill: def $s0 killed $s0 def $d0
+; GISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISEL-NEXT:    ret
   %r = call <1 x float> @llvm.exp10.v1f32(<1 x float> %x)
   ret <1 x float> %r
 }
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index eded13a6b3669..c158d8ad93b05 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -655,9 +655,7 @@ define i32 @ctpop_into_extract(ptr %p) {
 ; CHECKO0-NEXT:    // implicit-def: $d2
 ; CHECKO0-NEXT:    fmov s2, w8
 ; CHECKO0-NEXT:    ldr d0, [x0]
-; CHECKO0-NEXT:    // implicit-def: $q1
-; CHECKO0-NEXT:    fmov d1, d0
-; CHECKO0-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; CHECKO0-NEXT:    fmov s1, s0
 ; CHECKO0-NEXT:    fmov w8, s1
 ; CHECKO0-NEXT:    fmov s1, w8
 ; CHECKO0-NEXT:    // kill: def $d1 killed $s1
@@ -727,9 +725,7 @@ define i32 @ctpop_into_extract(ptr %p) {
 ; GISELO0-NEXT:    // implicit-def: $d2
 ; GISELO0-NEXT:    fmov s2, w8
 ; GISELO0-NEXT:    ldr d0, [x0]
-; GISELO0-NEXT:    // implicit-def: $q1
-; GISELO0-NEXT:    fmov d1, d0
-; GISELO0-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; GISELO0-NEXT:    fmov s1, s0
 ; GISELO0-NEXT:    fmov w8, s1
 ; GISELO0-NEXT:    fmov s1, w8
 ; GISELO0-NEXT:    // kill: def $d1 killed $s1
diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll
index 4a1c50b67ed7b..28a8f4303765b 100644
--- a/llvm/test/CodeGen/AArch64/ptradd.ll
+++ b/llvm/test/CodeGen/AArch64/ptradd.ll
@@ -51,7 +51,6 @@ define <1 x ptr> @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off) {
 ;
 ; CHECK-GI-LABEL: vector_gep_v1i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    fmov x9, d0
 ; CHECK-GI-NEXT:    add x8, x9, w8, sxtw
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 1652eb70b0625..9827cb3526f99 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -595,8 +595,6 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
 ;
 ; CHECK-GI-LABEL: shl_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsl w8, w8, w9
@@ -773,8 +771,6 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ;
 ; CHECK-GI-LABEL: ashr_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    asr w8, w8, w9
@@ -947,8 +943,6 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ;
 ; CHECK-GI-LABEL: lshr_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsr w8, w8, w9
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 0fe1ef5039929..3a9f12b838702 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -167,11 +167,16 @@ define void @store_v16i16(<16 x i16> %a, ptr %ptr){
 }
 
 define void @store_v1i32(<1 x i32> %a, ptr %ptr){
-; CHECK-LABEL: store_v1i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: store_v1i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v1i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    str s0, [x0]
+; CHECK-GI-NEXT:    ret
     store <1 x i32> %a, ptr %ptr
     ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 63e26a25f4e27..77483ebb2235c 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -930,85 +930,195 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
 ; CHECK-GI-LABEL: vector_to_vector_cast:
 ; CHECK-GI:       ; %bb.0:
 ; CHECK-GI-NEXT:    sub sp, sp, #16
-; CHECK-GI-NEXT:    umov.b w10, v0[1]
-; CHECK-GI-NEXT:    umov.b w9, v0[1]
+; CHECK-GI-NEXT:    umov.b w8, v0[1]
 ; CHECK-GI-NEXT:    mov d1, v0[1]
-; CHECK-GI-NEXT:    umov.b w8, v0[0]
-; CHECK-GI-NEXT:    umov.b w11, v0[0]
-; CHECK-GI-NEXT:    umov.b w12, v0[2]
-; CHECK-GI-NEXT:    umov.b w13, v0[2]
+; CHECK-GI-NEXT:    umov.b w10, v0[1]
+; CHECK-GI-NEXT:    umov.b w9, v0[0]
+; CHECK-GI-NEXT:    umov.b w13, v0[0]
+; CHECK-GI-NEXT:    umov.b w14, v0[2]
 ; CHECK-GI-NEXT:    umov.b w15, v0[3]
+; CHECK-GI-NEXT:    umov.b w11, v0[2]
 ; CHECK-GI-NEXT:    umov.b w16, v0[4]
-; CHECK-GI-NEXT:    umov.b w14, v0[3]
+; CHECK-GI-NEXT:    umov.b w17, v0[5]
+; CHECK-GI-NEXT:    umov.b w12, v0[3]
+; CHECK-GI-NEXT:    and w8, w8, #0x1
 ; CHECK-GI-NEXT:    and w10, w10, #0x1
-; CHECK-GI-NEXT:    and w9, w9, #0x1
-; CHECK-GI-NEXT:    bfi w8, w10, #1, #31
-; CHECK-GI-NEXT:    umov.b w10, v1[1]
-; CHECK-GI-NEXT:    and w12, w12, #0x1
-; CHECK-GI-NEXT:    bfi w11, w9, #1, #31
-; CHECK-GI-NEXT:    umov.b w9, v1[0]
-; CHECK-GI-NEXT:    and w13, w13, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w12, lsl #2
-; CHECK-GI-NEXT:    umov.b w12, v1[2]
+; CHECK-GI-NEXT:    umov.b w0, v1[1]
+; CHECK-GI-NEXT:    bfi w9, w8, #1, #31
+; CHECK-GI-NEXT:    bfi w13, w10, #1, #31
+; CHECK-GI-NEXT:    and w14, w14, #0x1
+; CHECK-GI-NEXT:    umov.b w8, v1[0]
+; CHECK-GI-NEXT:    umov.b w10, v1[2]
 ; CHECK-GI-NEXT:    and w15, w15, #0x1
-; CHECK-GI-NEXT:    orr w11, w11, w13, lsl #2
-; CHECK-GI-NEXT:    umov.b w13, v0[5]
+; CHECK-GI-NEXT:    orr w13, w13, w14, lsl #2
+; CHECK-GI-NEXT:    umov.b w14, v1[3]
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    and w0, w0, #0x1
 ; CHECK-GI-NEXT:    and w16, w16, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w15, lsl #3
-; CHECK-GI-NEXT:    umov.b w15, v1[3]
+; CHECK-GI-NEXT:    orr w9, w9, w11, lsl #2
+; CHECK-GI-NEXT:    orr w13, w13, w15, lsl #3
+; CHECK-GI-NEXT:    umov.b w15, v1[4]
+; CHECK-GI-NEXT:    umov.b w11, v0[6]
+; CHECK-GI-NEXT:    bfi w8, w0, #1, #31
 ; CHECK-GI-NEXT:    and w10, w10, #0x1
-; CHECK-GI-NEXT:    bfi w9, w10, #1, #31
-; CHECK-GI-NEXT:    umov.b w10, v0[6]
+; CHECK-GI-NEXT:    and w17, w17, #0x1
+; CHECK-GI-NEXT:    orr w13, w13, w16, lsl #4
 ; CHECK-GI-NEXT:    and w14, w14, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w16, lsl #4
-; CHECK-GI-NEXT:    umov.b w16, v1[4]
+; CHECK-GI-NEXT:    umov.b w0, v0[7]
+; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #2
+; CHECK-GI-NEXT:    umov.b w10, v1[5]
+; CHECK-GI-NEXT:    umov.b w16, v1[6]
+; CHECK-GI-NEXT:    orr w13, w13, w17, lsl #5
+; CHECK-GI-NEXT:    umov.b w17, v0[4]
+; CHECK-GI-NEXT:    and w15, w15, #0x1
+; CHECK-GI-NEXT:    orr w8, w8, w14, lsl #3
 ; CHECK-GI-NEXT:    and w12, w12, #0x1
-; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #2
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v1[7]
+; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #3
+; CHECK-GI-NEXT:    orr w11, w13, w11, lsl #6
+; CHECK-GI-NEXT:    orr w8, w8, w15, lsl #4
+; CHECK-GI-NEXT:    umov.b w15, v0[5]
+; CHECK-GI-NEXT:    and w10, w10, #0x1
+; CHECK-GI-NEXT:    and w0, w0, #0x1
+; CHECK-GI-NEXT:    and w12, w17, #0x1
+; CHECK-GI-NEXT:    umov.b w13, v0[1]
+; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #5
+; CHECK-GI-NEXT:    and w16, w16, #0x1
+; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #4
+; CHECK-GI-NEXT:    umov.b w10, v0[0]
+; CHECK-GI-NEXT:    orr w11, w11, w0, lsl #7
+; CHECK-GI-NEXT:    and w14, w14, #0x1
+; CHECK-GI-NEXT:    and w12, w15, #0x1
+; CHECK-GI-NEXT:    umov.b w15, v0[2]
+; CHECK-GI-NEXT:    orr w8, w8, w16, lsl #6
+; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #5
+; CHECK-GI-NEXT:    umov.b w12, v0[6]
+; CHECK-GI-NEXT:    strb w11, [sp, #8]
+; CHECK-GI-NEXT:    and w11, w13, #0x1
+; CHECK-GI-NEXT:    umov.b w13, v0[3]
+; CHECK-GI-NEXT:    orr w8, w8, w14, lsl #7
+; CHECK-GI-NEXT:    umov.b w14, v0[7]
+; CHECK-GI-NEXT:    ldr b0, [sp, #8]
+; CHECK-GI-NEXT:    bfi w10, w11, #1, #31
+; CHECK-GI-NEXT:    and w11, w15, #0x1
+; CHECK-GI-NEXT:    strb w8, [sp, #9]
+; CHECK-GI-NEXT:    umov.b w15, v0[4]
+; CHECK-GI-NEXT:    and w8, w12, #0x1
+; CHECK-GI-NEXT:    orr w10, w10, w11, lsl #2
+; CHECK-GI-NEXT:    orr w8, w9, w8, lsl #6
+; CHECK-GI-NEXT:    and w9, w13, #0x1
+; CHECK-GI-NEXT:    umov.b w11, v0[1]
+; CHECK-GI-NEXT:    orr w9, w10, w9, lsl #3
+; CHECK-GI-NEXT:    umov.b w10, v0[5]
+; CHECK-GI-NEXT:    umov.b w12, v0[0]
+; CHECK-GI-NEXT:    and w13, w14, #0x1
+; CHECK-GI-NEXT:    umov.b w16, v0[2]
+; CHECK-GI-NEXT:    umov.b w17, v0[3]
+; CHECK-GI-NEXT:    and w14, w15, #0x1
+; CHECK-GI-NEXT:    umov.b w15, v0[2]
+; CHECK-GI-NEXT:    orr w8, w8, w13, lsl #7
+; CHECK-GI-NEXT:    orr w9, w9, w14, lsl #4
+; CHECK-GI-NEXT:    umov.b w13, v0[6]
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v0[3]
+; CHECK-GI-NEXT:    strb w8, [sp, #10]
+; CHECK-GI-NEXT:    and w8, w10, #0x1
+; CHECK-GI-NEXT:    bfi w12, w11, #1, #31
+; CHECK-GI-NEXT:    orr w8, w9, w8, lsl #5
+; CHECK-GI-NEXT:    umov.b w10, v0[4]
+; CHECK-GI-NEXT:    and w9, w15, #0x1
+; CHECK-GI-NEXT:    umov.b w11, v0[7]
+; CHECK-GI-NEXT:    umov.b w15, v0[1]
+; CHECK-GI-NEXT:    orr w9, w12, w9, lsl #2
+; CHECK-GI-NEXT:    umov.b w12, v0[5]
 ; CHECK-GI-NEXT:    and w13, w13, #0x1
-; CHECK-GI-NEXT:    umov.b w12, v0[4]
-; CHECK-GI-NEXT:    orr w8, w8, w13, lsl #5
-; CHECK-GI-NEXT:    umov.b w13, v1[5]
+; CHECK-GI-NEXT:    and w14, w14, #0x1
+; CHECK-GI-NEXT:    orr w8, w8, w13, lsl #6
+; CHECK-GI-NEXT:    umov.b w13, v0[0]
+; CHECK-GI-NEXT:    orr w9, w9, w14, lsl #3
+; CHECK-GI-NEXT:    and w10, w10, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v0[6]
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    and w15, w15, #0x1
+; CHECK-GI-NEXT:    umov.b w0, v0[3]
+; CHECK-GI-NEXT:    orr w9, w9, w10, lsl #4
+; CHECK-GI-NEXT:    and w10, w12, #0x1
+; CHECK-GI-NEXT:    umov.b w12, v0[7]
+; CHECK-GI-NEXT:    orr w8, w8, w11, lsl #7
+; CHECK-GI-NEXT:    bfi w13, w15, #1, #31
+; CHECK-GI-NEXT:    and w11, w16, #0x1
+; CHECK-GI-NEXT:    orr w9, w9, w10, lsl #5
+; CHECK-GI-NEXT:    and w10, w14, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v0[4]
+; CHECK-GI-NEXT:    strb w8, [sp, #11]
+; CHECK-GI-NEXT:    umov.b w15, v0[1]
+; CHECK-GI-NEXT:    umov.b w16, v0[3]
+; CHECK-GI-NEXT:    orr w8, w9, w10, lsl #6
+; CHECK-GI-NEXT:    orr w9, w13, w11, lsl #2
+; CHECK-GI-NEXT:    and w10, w12, #0x1
+; CHECK-GI-NEXT:    and w11, w17, #0x1
+; CHECK-GI-NEXT:    umov.b w12, v0[5]
+; CHECK-GI-NEXT:    umov.b w17, v0[0]
+; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #7
+; CHECK-GI-NEXT:    orr w9, w9, w11, lsl #3
+; CHECK-GI-NEXT:    umov.b w10, v0[1]
+; CHECK-GI-NEXT:    and w11, w14, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v0[0]
 ; CHECK-GI-NEXT:    and w15, w15, #0x1
-; CHECK-GI-NEXT:    orr w9, w9, w15, lsl #3
+; CHECK-GI-NEXT:    orr w9, w9, w11, lsl #4
+; CHECK-GI-NEXT:    umov.b w11, v0[2]
+; CHECK-GI-NEXT:    umov.b w13, v0[6]
+; CHECK-GI-NEXT:    and w12, w12, #0x1
+; CHECK-GI-NEXT:    bfi w17, w15, #1, #31
+; CHECK-GI-NEXT:    umov.b w15, v0[5]
+; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #5
 ; CHECK-GI-NEXT:    and w10, w10, #0x1
-; CHECK-GI-NEXT:    umov.b w15, v0[7]
-; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #6
-; CHECK-GI-NEXT:    umov.b w10, v1[6]
-; CHECK-GI-NEXT:    and w16, w16, #0x1
-; CHECK-GI-NEXT:    orr w9, w9, w16, lsl #4
-; CHECK-GI-NEXT:    umov.b w16, v0[5]
-; CHECK-GI-NEXT:    orr w11, w11, w14, lsl #3
+; CHECK-GI-NEXT:    umov.b w12, v0[2]
+; CHECK-GI-NEXT:    bfi w14, w10, #1, #31
+; CHECK-GI-NEXT:    umov.b w10, v0[4]
+; CHECK-GI-NEXT:    ldr b1, [sp, #9]
+; CHECK-GI-NEXT:    and w11, w11, #0x1
 ; CHECK-GI-NEXT:    and w13, w13, #0x1
-; CHECK-GI-NEXT:    umov.b w14, v1[7]
+; CHECK-GI-NEXT:    strb w8, [sp, #12]
+; CHECK-GI-NEXT:    orr w11, w14, w11, lsl #2
+; CHECK-GI-NEXT:    and w14, w16, #0x1
+; CHECK-GI-NEXT:    umov.b w16, v0[4]
 ; CHECK-GI-NEXT:    and w12, w12, #0x1
-; CHECK-GI-NEXT:    orr w9, w9, w13, lsl #5
-; CHECK-GI-NEXT:    umov.b w13, v0[6]
-; CHECK-GI-NEXT:    orr w11, w11, w12, lsl #4
+; CHECK-GI-NEXT:    and w15, w15, #0x1
+; CHECK-GI-NEXT:    orr w9, w9, w13, lsl #6
+; CHECK-GI-NEXT:    orr w11, w11, w14, lsl #3
+; CHECK-GI-NEXT:    orr w12, w17, w12, lsl #2
 ; CHECK-GI-NEXT:    and w10, w10, #0x1
-; CHECK-GI-NEXT:    and w12, w15, #0x1
+; CHECK-GI-NEXT:    and w17, w0, #0x1
+; CHECK-GI-NEXT:    umov.b w0, v0[5]
+; CHECK-GI-NEXT:    umov.b w14, v0[6]
+; CHECK-GI-NEXT:    orr w10, w11, w10, lsl #4
+; CHECK-GI-NEXT:    orr w12, w12, w17, lsl #3
+; CHECK-GI-NEXT:    umov.b w11, v0[7]
+; CHECK-GI-NEXT:    and w16, w16, #0x1
+; CHECK-GI-NEXT:    umov.b w17, v0[6]
+; CHECK-GI-NEXT:    orr w10, w10, w15, lsl #5
 ; CHECK-GI-NEXT:    umov.b w15, v0[7]
-; CHECK-GI-NEXT:    orr w9, w9, w10, lsl #6
-; CHECK-GI-NEXT:    and w10, w16, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w12, lsl #7
-; CHECK-GI-NEXT:    orr w10, w11, w10, lsl #5
-; CHECK-GI-NEXT:    and w11, w14, #0x1
+; CHECK-GI-NEXT:    orr w12, w12, w16, lsl #4
+; CHECK-GI-NEXT:    and w16, w0, #0x1
+; CHECK-GI-NEXT:    umov.b w0, v0[7]
+; CHECK-GI-NEXT:    and w14, w14, #0x1
+; CHECK-GI-NEXT:    orr w12, w12, w16, lsl #5
+; CHECK-GI-NEXT:    orr w10, w10, w14, lsl #6
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    and w13, w17, #0x1
 ; CHECK-GI-NEXT:    orr w9, w9, w11, lsl #7
-; CHECK-GI-NEXT:    and w11, w13, #0x1
-; CHECK-GI-NEXT:    strb w8, [sp, #8]
-; CHECK-GI-NEXT:    orr w8, w10, w11, lsl #6
-; CHECK-GI-NEXT:    ldr b0, [sp, #8]
-; CHECK-GI-NEXT:    strb w9, [sp, #9]
-; CHECK-GI-NEXT:    and w9, w15, #0x1
-; CHECK-GI-NEXT:    ldr b1, [sp, #9]
-; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #7
 ; CHECK-GI-NEXT:    mov.s v0[1], v1[0]
-; CHECK-GI-NEXT:    strb w8, [sp, #10]
-; CHECK-GI-NEXT:    strb w8, [sp, #11]
+; CHECK-GI-NEXT:    orr w11, w12, w13, lsl #6
+; CHECK-GI-NEXT:    and w12, w15, #0x1
 ; CHECK-GI-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    strb w8, [sp, #12]
-; CHECK-GI-NEXT:    strb w8, [sp, #13]
+; CHECK-GI-NEXT:    orr w8, w10, w12, lsl #7
+; CHECK-GI-NEXT:    and w10, w0, #0x1
+; CHECK-GI-NEXT:    strb w9, [sp, #13]
+; CHECK-GI-NEXT:    orr w9, w11, w10, lsl #7
 ; CHECK-GI-NEXT:    strb w8, [sp, #14]
-; CHECK-GI-NEXT:    strb w8, [sp, #15]
+; CHECK-GI-NEXT:    strb w9, [sp, #15]
 ; CHECK-GI-NEXT:    add sp, sp, #16
 ; CHECK-GI-NEXT:    ret
   %bc = bitcast <16 x i1> %arg to <2 x i8>
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index bd68b213ec988..1164e02a16c9e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -79,10 +79,11 @@ define half @add_HalfH(<4 x half> %bin.rdx)  {
 ; CHECK-GI-FP16-LABEL: add_HalfH:
 ; CHECK-GI-FP16:       // %bb.0:
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    faddp h2, v0.2h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    fadd h1, h0, h1
 ; CHECK-GI-FP16-NEXT:    mov h0, v0.h[3]
-; CHECK-GI-FP16-NEXT:    fadd h1, h2, h1
+; CHECK-GI-FP16-NEXT:    fadd h1, h1, h2
 ; CHECK-GI-FP16-NEXT:    fadd h0, h1, h0
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
index 1906ca9defa40..1d295a30a994b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index 152eb66ebcdfe..ee2af110c84cd 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call nnan float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
index a1b7118d8080d..be61f9b521795 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
@@ -40,11 +40,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call float @llvm.vector.reduce.fmaximum.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index d5f999add22c2..300081dc3ec40 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
index 719cac8f33028..e735f670ced0c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
@@ -40,11 +40,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call float @llvm.vector.reduce.fminimum.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index e22a5a4af4fae..e1b21705c95f3 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -5,11 +5,18 @@
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
 define float @mul_HalfS(<2 x float> %bin.rdx)  {
-; CHECK-LABEL: mul_HalfS:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mul_HalfS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mul_HalfS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
+; CHECK-GI-NEXT:    ret
   %r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
   ret float %r
 }
@@ -72,9 +79,12 @@ define half @mul_HalfH(<4 x half> %bin.rdx)  {
 ; CHECK-GI-FP16-LABEL: mul_HalfH:
 ; CHECK-GI-FP16:       // %bb.0:
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    fmul h1, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h1, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h1, v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    fmul h1, h0, h1
+; CHECK-GI-FP16-NEXT:    mov h0, v0.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h1, h1, h2
+; CHECK-GI-FP16-NEXT:    fmul h0, h1, h0
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
   ret half %r
@@ -465,6 +475,3 @@ declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
 declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
 declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
 declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
index 5fd705b07ca3b..2429cf4b4597a 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
@@ -5,11 +5,18 @@
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
 define float @mul_HalfS(<2 x float> %bin.rdx)  {
-; CHECK-LABEL: mul_HalfS:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mul_HalfS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mul_HalfS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
+; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
   ret float %r
 }
@@ -44,17 +51,20 @@ define half @mul_HalfH(<4 x half> %bin.rdx)  {
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: mul_HalfH:
 ; CHECK-GI-FP16:       // %bb.0:
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT:    fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
   ret half %r
@@ -105,7 +115,8 @@ define half @mul_H(<8 x half> %bin.rdx)  {
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -113,10 +124,12 @@ define half @mul_H(<8 x half> %bin.rdx)  {
 ; CHECK-GI-FP16:       // %bb.0:
 ; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT:    fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx)
   ret half %r
@@ -134,7 +147,8 @@ define float @mul_S(<4 x float> %bin.rdx)  {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
   ret float %r
@@ -206,7 +220,8 @@ define half @mul_2H(<16 x half> %bin.rdx)  {
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -215,10 +230,12 @@ define half @mul_2H(<16 x half> %bin.rdx)  {
 ; CHECK-GI-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT:    fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx)
   ret half %r
@@ -238,7 +255,8 @@ define float @mul_2S(<8 x float> %bin.rdx)  {
 ; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
   ret float %r
@@ -271,8 +289,9 @@ define float @mul_S_init_42(<4 x float> %bin.rdx)  {
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    mov w8, #1109917696 // =0x42280000
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
@@ -338,8 +357,10 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NOFP16-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NOFP16-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
@@ -354,14 +375,18 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-GI-FP16-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v2.4h
 ; CHECK-GI-FP16-NEXT:    fmul v1.4h, v1.4h, v3.4h
-; CHECK-GI-FP16-NEXT:    mov h2, v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov h3, v1.h[3]
-; CHECK-GI-FP16-NEXT:    fmul h4, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h0, h2, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h2, h1, v1.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h1, h3, v1.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h4, h0
-; CHECK-GI-FP16-NEXT:    fmul h1, h2, h1
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h3, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov h4, v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov h5, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov h6, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov h7, v1.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
+; CHECK-GI-FP16-NEXT:    fmul h2, h3, h4
+; CHECK-GI-FP16-NEXT:    fmul h1, h1, h5
+; CHECK-GI-FP16-NEXT:    fmul h3, h6, h7
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
+; CHECK-GI-FP16-NEXT:    fmul h1, h1, h3
 ; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
 ; CHECK-GI-FP16-NEXT:    ret
   %r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a)
@@ -389,8 +414,10 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -414,8 +441,10 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -442,10 +471,12 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
 ; CHECK-GI-NEXT:    mov d3, v2.d[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s4, v1.s[1]
 ; CHECK-GI-NEXT:    fmul v2.2s, v2.2s, v3.2s
+; CHECK-GI-NEXT:    fmul s1, s1, s4
+; CHECK-GI-NEXT:    mov s3, v2.s[1]
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
-; CHECK-GI-NEXT:    fmul s1, s2, v2.s[1]
+; CHECK-GI-NEXT:    fmul s1, s2, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a)
@@ -471,8 +502,10 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -523,8 +556,10 @@ define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b)
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s1, s0, s1
 ; CHECK-GI-NEXT:    fmul s0, s1, s0
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index d5c040e09945b..0806f7da5c89c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -57,11 +57,16 @@ define i24 @test_v1i24(<1 x i24> %a) nounwind {
 }
 
 define i32 @test_v1i32(<1 x i32> %a) nounwind {
-; CHECK-LABEL: test_v1i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_v1i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v1i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %b = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
   ret i32 %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vector-lrint.ll b/llvm/test/CodeGen/AArch64/vector-lrint.ll
index 53456c4c81ccc..602643264e7be 100644
--- a/llvm/test/CodeGen/AArch64/vector-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/vector-lrint.ll
@@ -755,13 +755,20 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
 ; CHECK-i32-NEXT:    fcvtzs v0.2s, v0.2s
 ; CHECK-i32-NEXT:    ret
 ;
-; CHECK-i64-LABEL: lrint_v1f32:
-; CHECK-i64:       // %bb.0:
-; CHECK-i64-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-i64-NEXT:    frintx s0, s0
-; CHECK-i64-NEXT:    fcvtzs x8, s0
-; CHECK-i64-NEXT:    fmov d0, x8
-; CHECK-i64-NEXT:    ret
+; CHECK-i64-SD-LABEL: lrint_v1f32:
+; CHECK-i64-SD:       // %bb.0:
+; CHECK-i64-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-SD-NEXT:    frintx s0, s0
+; CHECK-i64-SD-NEXT:    fcvtzs x8, s0
+; CHECK-i64-SD-NEXT:    fmov d0, x8
+; CHECK-i64-SD-NEXT:    ret
+;
+; CHECK-i64-GI-LABEL: lrint_v1f32:
+; CHECK-i64-GI:       // %bb.0:
+; CHECK-i64-GI-NEXT:    frintx s0, s0
+; CHECK-i64-GI-NEXT:    fcvtzs x8, s0
+; CHECK-i64-GI-NEXT:    fmov d0, x8
+; CHECK-i64-GI-NEXT:    ret
   %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
   ret <1 x iXLen> %a
 }
@@ -1328,7 +1335,3 @@ define <32 x iXLen> @lrint_v32f64(<32 x double> %x) {
   ret <32 x iXLen> %a
 }
 declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f64(<32 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-i32-GI: {{.*}}
-; CHECK-i64-GI: {{.*}}
-; CHECK-i64-SD: {{.*}}

From 14c11e4bcb262496981a2948af11a3f9e9de23ef Mon Sep 17 00:00:00 2001
From: Adrian Vogelsgesang <avogelsgesang@salesforce.com>
Date: Wed, 11 Jun 2025 09:39:31 +0200
Subject: [PATCH 041/851] [coro][NFC] Move switch basic block to beginning of
 coroutine (#143626)

This makes the code flow when reading the LLVM IR of a split coroutine a
bit more natural. It does not change anything from an end-user
perspective but makes debugging the CoroSplit pass slightly easier.
---
 llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index f9a6c70fedc2d..cebe44581b061 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -703,6 +703,7 @@ void coro::BaseCloner::replaceEntryBlock() {
     auto *SwitchBB =
         cast<BasicBlock>(VMap[Shape.SwitchLowering.ResumeEntryBlock]);
     Builder.CreateBr(SwitchBB);
+    SwitchBB->moveAfter(Entry);
     break;
   }
   case coro::ABI::Async:

From 24d730b3808a562507f3f1f5fc125acf4b6e03aa Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Wed, 11 Jun 2025 15:56:37 +0800
Subject: [PATCH 042/851] Reland "[SelectionDAG] Make `(a & x) | (~a & y) -> (a
 & (x ^ y)) ^ y` available for all targets" (#143651)

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  57 ++
 .../Target/SystemZ/SystemZISelLowering.cpp    |  14 +
 llvm/lib/Target/SystemZ/SystemZISelLowering.h |   1 +
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  58 --
 llvm/test/CodeGen/AMDGPU/bfi_int.ll           |  30 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |  42 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 161 +++--
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  42 +-
 ...unfold-masked-merge-scalar-variablemask.ll |  42 +-
 ...unfold-masked-merge-vector-variablemask.ll | 167 +++--
 llvm/test/CodeGen/RISCV/fold-masked-merge.ll  | 302 +++++++++
 ...unfold-masked-merge-scalar-variablemask.ll |  62 +-
 .../test/CodeGen/SystemZ/fold-masked-merge.ll | 277 ++++++++
 llvm/test/CodeGen/WebAssembly/simd-arith.ll   | 600 +++++++-----------
 llvm/test/CodeGen/X86/bitselect.ll            |  50 +-
 llvm/test/CodeGen/X86/fold-masked-merge.ll    |  54 +-
 ...unfold-masked-merge-scalar-variablemask.ll |  26 +-
 ...unfold-masked-merge-vector-variablemask.ll | 598 +++++++++--------
 18 files changed, 1524 insertions(+), 1059 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/fold-masked-merge.ll
 create mode 100644 llvm/test/CodeGen/SystemZ/fold-masked-merge.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b65e8e06eae62..e79a17e86bc87 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8128,6 +8128,59 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
+static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1,
+                                   SDValue AndR1, const SDLoc &DL,
+                                   SelectionDAG &DAG) {
+  if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse())
+    return SDValue();
+  SDValue NotOp = AndL0->getOperand(0);
+  if (NotOp == AndR1)
+    std::swap(AndR1, AndL1);
+  if (NotOp != AndL1)
+    return SDValue();
+
+  EVT VT = AndL1.getValueType();
+  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
+  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0);
+  return Xor1;
+}
+
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y)` pattern.
+/// This is typically a better representation for targets without a fused
+/// "and-not" operation.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
+                               const TargetLowering &TLI, const SDLoc &DL) {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+  SDValue N0 = Node->getOperand(0);
+  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
+    return SDValue();
+  SDValue N1 = Node->getOperand(1);
+  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
+    return SDValue();
+
+  // If the target supports and-not, don't fold this.
+  if (TLI.hasAndNot(SDValue(Node, 0)))
+    return SDValue();
+
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  SDValue N10 = N1->getOperand(0);
+  SDValue N11 = N1->getOperand(1);
+  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
+    return Result;
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8306,6 +8359,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
       return R;
 
+  if (VT.isScalarInteger() && VT != MVT::i1)
+    if (SDValue R = foldMaskedMerge(N, DAG, TLI, DL))
+      return R;
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index f06246706aaa9..1c59b1e63b7bc 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1283,6 +1283,20 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
   return true;
 }
 
+bool SystemZTargetLowering::hasAndNot(SDValue Y) const {
+  EVT VT = Y.getValueType();
+
+  // We can use NC(G)RK for types in GPRs ...
+  if (VT == MVT::i32 || VT == MVT::i64)
+    return Subtarget.hasMiscellaneousExtensions3();
+
+  // ... or VNC for types in VRs.
+  if (VT.isVector() || VT == MVT::i128)
+    return Subtarget.hasVector();
+
+  return false;
+}
+
 // Information about the addressing mode for a memory access.
 struct AddressingMode {
   // True if a long displacement is supported.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index f3536a840fda8..f2f0bf6d8b410 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -671,6 +671,7 @@ class SystemZTargetLowering : public TargetLowering {
   }
 
   unsigned getStackProbeSize(const MachineFunction &MF) const;
+  bool hasAndNot(SDValue Y) const override;
 
 private:
   const SystemZSubtarget &Subtarget;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8bcd8670879a9..96714adf78e43 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52350,59 +52350,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
 }
 
-static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
-                                   SDValue And1_L, SDValue And1_R,
-                                   const SDLoc &DL, SelectionDAG &DAG) {
-  if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
-    return SDValue();
-  SDValue NotOp = And0_L->getOperand(0);
-  if (NotOp == And1_R)
-    std::swap(And1_R, And1_L);
-  if (NotOp != And1_L)
-    return SDValue();
-
-  // (~(NotOp) & And0_R) | (NotOp & And1_R)
-  // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
-  EVT VT = And1_L->getValueType(0);
-  SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
-  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
-  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
-  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
-  return Xor1;
-}
-
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for  targets without a fused
-/// "and-not" operation. This function is intended to be called from a
-/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
-  // Note that masked-merge variants using XOR or ADD expressions are
-  // normalized to OR by InstCombine so we only check for OR.
-  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-  SDValue N0 = Node->getOperand(0);
-  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
-    return SDValue();
-  SDValue N1 = Node->getOperand(1);
-  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
-    return SDValue();
-
-  SDLoc DL(Node);
-  SDValue N00 = N0->getOperand(0);
-  SDValue N01 = N0->getOperand(1);
-  SDValue N10 = N1->getOperand(0);
-  SDValue N11 = N1->getOperand(1);
-  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
-    return Result;
-  return SDValue();
-}
-
 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
 /// with CMP+{ADC, SBB}.
@@ -52806,11 +52753,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // We should fold "masked merge" patterns when `andn` is not available.
-  if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
-    if (SDValue R = foldMaskedMerge(N, DAG))
-      return R;
-
   if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
     return R;
 
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 201b97d479c68..b372dec383344 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -16,9 +16,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX7-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX7-NEXT:    s_and_b32 s0, s1, s0
-; GFX7-NEXT:    s_or_b32 s0, s2, s0
+; GFX7-NEXT:    s_xor_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
@@ -28,9 +28,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    s_and_b32 s0, s1, s0
-; GFX8-NEXT:    s_or_b32 s0, s2, s0
+; GFX8-NEXT:    s_xor_b32 s0, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -44,9 +44,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX10-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX10-NEXT:    s_and_b32 s0, s1, s0
-; GFX10-NEXT:    s_or_b32 s0, s2, s0
+; GFX10-NEXT:    s_xor_b32 s0, s0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -1407,9 +1407,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX7-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX7-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX7-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX7-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX7-NEXT:    s_add_u32 s0, s0, 10
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -1422,9 +1422,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX8-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    s_add_u32 s0, s0, 10
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1438,9 +1438,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX10-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-NEXT:    s_add_u32 s0, s0, 10
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 6925a98f643b9..e1b4cad370f96 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -289,16 +289,16 @@ entry:
 define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
 ; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 4
 ; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -317,10 +317,10 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 4
+; GCN-NEXT:    s_xor_b32 s4, s2, 0x3c003c00
 ; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_andn2_b32 s2, s2, s3
-; GCN-NEXT:    s_and_b32 s3, s3, 0x3c003c00
-; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s3, s4, s3
+; GCN-NEXT:    s_xor_b32 s2, s3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -399,10 +399,10 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 4
+; GCN-NEXT:    s_xor_b32 s4, s2, 0x10001
 ; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_andn2_b32 s2, s2, s3
-; GCN-NEXT:    s_and_b32 s3, s3, 0x10001
-; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s3, s4, s3
+; GCN-NEXT:    s_xor_b32 s2, s3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -417,16 +417,16 @@ entry:
 define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b32 s4, 0x10001
 ; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 4
 ; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -442,15 +442,15 @@ entry:
 define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s4, s6, 3
-; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
-; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
-; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; GCN-NEXT:    s_xor_b32 s5, s3, 0x1010101
+; GCN-NEXT:    s_lshl_b32 s6, s6, 3
+; GCN-NEXT:    s_xor_b32 s4, s2, 0x1010101
+; GCN-NEXT:    s_lshl_b64 s[6:7], 0xff, s6
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index be16fac4c53f7..44bd4090436ef 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1511,13 +1511,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s0, s3, 4
-; SI-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_andn2_b32 s1, s2, s0
-; SI-NEXT:    s_and_b32 s0, s0, 0x50005
-; SI-NEXT:    s_or_b32 s0, s0, s1
+; SI-NEXT:    s_lshl_b32 s1, s3, 4
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
+; SI-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; SI-NEXT:    s_and_b32 s0, s0, s1
+; SI-NEXT:    s_xor_b32 s0, s0, s2
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1528,13 +1528,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_lshl_b32 s0, s3, 4
-; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_andn2_b32 s1, s2, s0
-; VI-NEXT:    s_and_b32 s0, s0, 0x50005
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s3, 4
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_xor_b32 s0, s2, 0x50005
+; VI-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; VI-NEXT:    s_and_b32 s0, s0, s1
+; VI-NEXT:    s_xor_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -1552,13 +1552,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s0, s8, 4
+; SI-NEXT:    s_lshl_b32 s8, s8, 4
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
-; SI-NEXT:    s_and_b32 s9, s1, 0x50005
-; SI-NEXT:    s_and_b32 s8, s0, 0x50005
-; SI-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
-; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; SI-NEXT:    s_xor_b32 s1, s3, 0x50005
+; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
+; SI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1573,14 +1573,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_lshl_b32 s0, s8, 4
-; VI-NEXT:    s_mov_b32 s8, 0x50005
+; VI-NEXT:    s_mov_b32 s0, 0x50005
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
-; VI-NEXT:    s_mov_b32 s9, s8
-; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; VI-NEXT:    s_mov_b32 s1, s0
+; VI-NEXT:    s_lshl_b32 s8, s8, 4
+; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
 ; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; VI-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1594,35 +1594,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
+; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; SI-NEXT:    s_andn2_b32 s5, s5, s4
-; SI-NEXT:    s_and_b32 s4, s4, 0x505
-; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_xor_b32 s6, s4, 0x505
+; SI-NEXT:    s_lshl_b32 s5, s5, 3
+; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; SI-NEXT:    s_and_b32 s5, s6, s5
+; SI-NEXT:    s_xor_b32 s4, s5, s4
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v2i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; VI-NEXT:    s_and_b32 s6, s4, 0x505
-; VI-NEXT:    s_xor_b32 s4, s4, 0xffff
-; VI-NEXT:    s_and_b32 s4, s4, s5
-; VI-NEXT:    s_or_b32 s4, s6, s4
+; VI-NEXT:    s_xor_b32 s6, s4, 0x505
+; VI-NEXT:    s_lshl_b32 s5, s5, 3
+; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; VI-NEXT:    s_and_b32 s5, s6, s5
+; VI-NEXT:    s_xor_b32 s4, s5, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1636,17 +1635,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
+; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; SI-NEXT:    s_andn2_b32 s5, s5, s4
-; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
-; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
+; SI-NEXT:    s_lshl_b32 s5, s5, 3
+; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; SI-NEXT:    s_and_b32 s5, s6, s5
+; SI-NEXT:    s_xor_b32 s4, s5, s4
 ; SI-NEXT:    s_lshr_b32 s5, s4, 16
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -1656,17 +1655,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 ;
 ; VI-LABEL: dynamic_insertelement_v3i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; VI-NEXT:    s_andn2_b32 s5, s5, s4
-; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
-; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
+; VI-NEXT:    s_lshl_b32 s5, s5, 3
+; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; VI-NEXT:    s_and_b32 s5, s6, s5
+; VI-NEXT:    s_xor_b32 s4, s5, s4
 ; VI-NEXT:    s_lshr_b32 s5, s4, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -1681,34 +1680,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
+; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; SI-NEXT:    s_andn2_b32 s5, s5, s4
-; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
-; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
+; SI-NEXT:    s_lshl_b32 s5, s5, 3
+; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; SI-NEXT:    s_and_b32 s5, s6, s5
+; SI-NEXT:    s_xor_b32 s4, s5, s4
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v4i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; VI-NEXT:    s_andn2_b32 s5, s5, s4
-; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
-; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
+; VI-NEXT:    s_lshl_b32 s5, s5, 3
+; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; VI-NEXT:    s_and_b32 s5, s6, s5
+; VI-NEXT:    s_xor_b32 s4, s5, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1721,20 +1720,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; SI-LABEL: s_dynamic_insertelement_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s0, s8, 3
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
-; SI-NEXT:    s_and_b32 s9, s1, 0x5050505
+; SI-NEXT:    s_lshl_b32 s8, s8, 3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; SI-NEXT:    s_and_b32 s8, s0, 0x5050505
-; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
+; SI-NEXT:    s_xor_b32 s1, s3, 0x5050505
+; SI-NEXT:    s_xor_b32 s0, s2, 0x5050505
+; SI-NEXT:    s_lshl_b64 s[8:9], 0xff, s8
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1743,20 +1742,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; VI-LABEL: s_dynamic_insertelement_v8i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_lshl_b32 s0, s8, 3
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
-; VI-NEXT:    s_and_b32 s9, s1, 0x5050505
+; VI-NEXT:    s_lshl_b32 s8, s8, 3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; VI-NEXT:    s_and_b32 s8, s0, 0x5050505
-; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
+; VI-NEXT:    s_xor_b32 s1, s3, 0x5050505
+; VI-NEXT:    s_xor_b32 s0, s2, 0x5050505
+; VI-NEXT:    s_lshl_b64 s[8:9], 0xff, s8
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
+; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index e0dacb7a59a42..a0ad6328b0c01 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1534,11 +1534,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s7, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s2, s6, 4
-; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
-; GFX9-NEXT:    s_andn2_b32 s3, s7, s2
-; GFX9-NEXT:    s_and_b32 s2, s2, 0x3e703e7
-; GFX9-NEXT:    s_or_b32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s3, s6, 4
+; GFX9-NEXT:    s_xor_b32 s2, s7, 0x3e703e7
+; GFX9-NEXT:    s_lshl_b32 s3, 0xffff, s3
+; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -1553,14 +1553,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s0, s4, 4
-; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
-; VI-NEXT:    s_andn2_b32 s1, s2, s0
-; VI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s4, 4
+; VI-NEXT:    s_xor_b32 s0, s2, 0x3e703e7
+; VI-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; VI-NEXT:    s_and_b32 s0, s0, s1
+; VI-NEXT:    s_xor_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -1575,14 +1575,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshl_b32 s0, s4, 4
-; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
-; CI-NEXT:    s_andn2_b32 s1, s2, s0
-; CI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
-; CI-NEXT:    s_or_b32 s0, s0, s1
+; CI-NEXT:    s_lshl_b32 s1, s4, 4
+; CI-NEXT:    s_xor_b32 s0, s2, 0x3e703e7
+; CI-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; CI-NEXT:    s_and_b32 s0, s0, s1
+; CI-NEXT:    s_xor_b32 s0, s0, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshl_b32 s3, s4, 4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_xor_b32 s4, s2, 0x3e703e7
 ; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GFX11-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX11-NEXT:    s_and_b32 s3, s3, 0x3e703e7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_and_b32 s3, s4, s3
+; GFX11-NEXT:    s_xor_b32 s2, s3, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index 69724aa75af4f..321b64510c35f 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -5,10 +5,11 @@ define i32 @s_out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
 ; GCN-LABEL: s_out32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mx = and i32 %x, %mask
@@ -22,10 +23,11 @@ define i64 @s_out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
 ; GCN-LABEL: s_out64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[16:17]
-; GCN-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[16:17]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mx = and i64 %x, %mask
@@ -427,10 +429,11 @@ define i32 @s_out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
 ; GCN-LABEL: s_out_constant_varx_42:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s0, s2, s0
-; GCN-NEXT:    s_and_not1_b32 s1, 42, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, 42
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -462,10 +465,11 @@ define i32 @s_out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
 ; GCN-LABEL: s_out_constant_varx_42_invmask:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
-; GCN-NEXT:    s_and_b32 s1, s2, 42
+; GCN-NEXT:    s_xor_b32 s1, s0, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s0, s1, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -560,10 +564,11 @@ define i32 @s_out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
 ; GCN-LABEL: s_out_constant_42_vary:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s0, s2, 42
-; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s0, s1, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -595,10 +600,11 @@ define i32 @s_out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
 ; GCN-LABEL: s_out_constant_42_vary_invmask:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_not1_b32 s0, 42, s2
-; GCN-NEXT:    s_and_b32 s1, s2, s1
+; GCN-NEXT:    s_xor_b32 s0, s1, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, 42
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 8e4c77e76029c..bac8bbbf0b4de 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -8,17 +8,16 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NEXT:    .reg .b16 %rs<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [out_v1i8_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_2];
-; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.b8 %rs4, [out_v1i8_param_1];
-; CHECK-NEXT:    not.b16 %rs5, %rs2;
-; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
-; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
-; CHECK-NEXT:    st.param.b8 [func_retval0], %rs7;
+; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs3, [out_v1i8_param_2];
+; CHECK-NEXT:    xor.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT:    and.b16 %rs5, %rs4, %rs3;
+; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
+; CHECK-NEXT:    st.param.b8 [func_retval0], %rs6;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i8> %x, %mask
   %notmask = xor <1 x i8> %mask, <i8 -1>
@@ -34,17 +33,16 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NEXT:    .reg .b16 %rs<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b16 %rs1, [out_v1i16_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_2];
-; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.b16 %rs4, [out_v1i16_param_1];
-; CHECK-NEXT:    not.b16 %rs5, %rs2;
-; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
-; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [out_v1i16_param_2];
+; CHECK-NEXT:    xor.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT:    and.b16 %rs5, %rs4, %rs3;
+; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs6;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i16> %x, %mask
   %notmask = xor <1 x i16> %mask, <i16 -1>
@@ -126,17 +124,16 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
 define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v1i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [out_v1i32_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_2];
-; CHECK-NEXT:    and.b32 %r3, %r1, %r2;
-; CHECK-NEXT:    ld.param.b32 %r4, [out_v1i32_param_1];
-; CHECK-NEXT:    not.b32 %r5, %r2;
-; CHECK-NEXT:    and.b32 %r6, %r4, %r5;
-; CHECK-NEXT:    or.b32 %r7, %r3, %r6;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [out_v1i32_param_2];
+; CHECK-NEXT:    xor.b32 %r4, %r1, %r2;
+; CHECK-NEXT:    and.b32 %r5, %r4, %r3;
+; CHECK-NEXT:    xor.b32 %r6, %r5, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i32> %x, %mask
   %notmask = xor <1 x i32> %mask, <i32 -1>
@@ -230,21 +227,19 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v2i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<15>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2];
-; CHECK-NEXT:    and.b32 %r5, %r1, %r3;
-; CHECK-NEXT:    and.b32 %r6, %r2, %r4;
-; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1];
-; CHECK-NEXT:    not.b32 %r9, %r4;
-; CHECK-NEXT:    not.b32 %r10, %r3;
-; CHECK-NEXT:    and.b32 %r11, %r7, %r10;
-; CHECK-NEXT:    and.b32 %r12, %r8, %r9;
-; CHECK-NEXT:    or.b32 %r13, %r6, %r12;
-; CHECK-NEXT:    or.b32 %r14, %r5, %r11;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r13};
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [out_v2i32_param_2];
+; CHECK-NEXT:    xor.b32 %r7, %r2, %r4;
+; CHECK-NEXT:    and.b32 %r8, %r7, %r6;
+; CHECK-NEXT:    xor.b32 %r9, %r8, %r4;
+; CHECK-NEXT:    xor.b32 %r10, %r1, %r3;
+; CHECK-NEXT:    and.b32 %r11, %r10, %r5;
+; CHECK-NEXT:    xor.b32 %r12, %r11, %r3;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r12, %r9};
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i32> %x, %mask
   %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
@@ -256,17 +251,16 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin
 define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v1i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [out_v1i64_param_0];
-; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_2];
-; CHECK-NEXT:    and.b64 %rd3, %rd1, %rd2;
-; CHECK-NEXT:    ld.param.b64 %rd4, [out_v1i64_param_1];
-; CHECK-NEXT:    not.b64 %rd5, %rd2;
-; CHECK-NEXT:    and.b64 %rd6, %rd4, %rd5;
-; CHECK-NEXT:    or.b64 %rd7, %rd3, %rd6;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
+; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [out_v1i64_param_2];
+; CHECK-NEXT:    xor.b64 %rd4, %rd1, %rd2;
+; CHECK-NEXT:    and.b64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    xor.b64 %rd6, %rd5, %rd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i64> %x, %mask
   %notmask = xor <1 x i64> %mask, <i64 -1>
@@ -350,29 +344,25 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .b32 %r<25>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2];
-; CHECK-NEXT:    and.b32 %r9, %r1, %r5;
-; CHECK-NEXT:    and.b32 %r10, %r2, %r6;
-; CHECK-NEXT:    and.b32 %r11, %r3, %r7;
-; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
-; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1];
-; CHECK-NEXT:    not.b32 %r17, %r8;
-; CHECK-NEXT:    not.b32 %r18, %r7;
-; CHECK-NEXT:    not.b32 %r19, %r6;
-; CHECK-NEXT:    not.b32 %r20, %r5;
-; CHECK-NEXT:    and.b32 %r21, %r13, %r20;
-; CHECK-NEXT:    and.b32 %r22, %r14, %r19;
-; CHECK-NEXT:    and.b32 %r23, %r15, %r18;
-; CHECK-NEXT:    and.b32 %r24, %r16, %r17;
-; CHECK-NEXT:    or.b32 %r25, %r12, %r24;
-; CHECK-NEXT:    or.b32 %r26, %r11, %r23;
-; CHECK-NEXT:    or.b32 %r27, %r10, %r22;
-; CHECK-NEXT:    or.b32 %r28, %r9, %r21;
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [out_v4i32_param_2];
+; CHECK-NEXT:    xor.b32 %r13, %r4, %r8;
+; CHECK-NEXT:    and.b32 %r14, %r13, %r12;
+; CHECK-NEXT:    xor.b32 %r15, %r14, %r8;
+; CHECK-NEXT:    xor.b32 %r16, %r3, %r7;
+; CHECK-NEXT:    and.b32 %r17, %r16, %r11;
+; CHECK-NEXT:    xor.b32 %r18, %r17, %r7;
+; CHECK-NEXT:    xor.b32 %r19, %r2, %r6;
+; CHECK-NEXT:    and.b32 %r20, %r19, %r10;
+; CHECK-NEXT:    xor.b32 %r21, %r20, %r6;
+; CHECK-NEXT:    xor.b32 %r22, %r1, %r5;
+; CHECK-NEXT:    and.b32 %r23, %r22, %r9;
+; CHECK-NEXT:    xor.b32 %r24, %r23, %r5;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r24, %r21, %r18, %r15};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -384,26 +374,23 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin
 define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32_undef(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<26>;
+; CHECK-NEXT:    .reg .b32 %r<23>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0];
 ; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r3, %r7;
-; CHECK-NEXT:    and.b32 %r10, %r1, %r5;
-; CHECK-NEXT:    and.b32 %r11, %r2, %r6;
-; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
-; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1];
-; CHECK-NEXT:    not.b32 %r17, %r8;
-; CHECK-NEXT:    not.b32 %r18, %r6;
-; CHECK-NEXT:    not.b32 %r19, %r5;
-; CHECK-NEXT:    and.b32 %r20, %r13, %r19;
-; CHECK-NEXT:    and.b32 %r21, %r14, %r18;
-; CHECK-NEXT:    and.b32 %r22, %r16, %r17;
-; CHECK-NEXT:    or.b32 %r23, %r12, %r22;
-; CHECK-NEXT:    or.b32 %r24, %r11, %r21;
-; CHECK-NEXT:    or.b32 %r25, %r10, %r20;
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r25, %r24, %r9, %r23};
+; CHECK-NEXT:    ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [out_v4i32_undef_param_1];
+; CHECK-NEXT:    xor.b32 %r14, %r4, %r13;
+; CHECK-NEXT:    and.b32 %r15, %r14, %r8;
+; CHECK-NEXT:    xor.b32 %r16, %r15, %r13;
+; CHECK-NEXT:    xor.b32 %r17, %r2, %r11;
+; CHECK-NEXT:    and.b32 %r18, %r17, %r6;
+; CHECK-NEXT:    xor.b32 %r19, %r18, %r11;
+; CHECK-NEXT:    xor.b32 %r20, %r1, %r10;
+; CHECK-NEXT:    and.b32 %r21, %r20, %r5;
+; CHECK-NEXT:    xor.b32 %r22, %r21, %r10;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r22, %r19, %r9, %r16};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
@@ -415,21 +402,19 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n
 define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v2i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<15>;
+; CHECK-NEXT:    .reg .b64 %rd<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [out_v2i64_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2];
-; CHECK-NEXT:    and.b64 %rd5, %rd1, %rd3;
-; CHECK-NEXT:    and.b64 %rd6, %rd2, %rd4;
-; CHECK-NEXT:    ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1];
-; CHECK-NEXT:    not.b64 %rd9, %rd4;
-; CHECK-NEXT:    not.b64 %rd10, %rd3;
-; CHECK-NEXT:    and.b64 %rd11, %rd7, %rd10;
-; CHECK-NEXT:    and.b64 %rd12, %rd8, %rd9;
-; CHECK-NEXT:    or.b64 %rd13, %rd6, %rd12;
-; CHECK-NEXT:    or.b64 %rd14, %rd5, %rd11;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd14, %rd13};
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [out_v2i64_param_2];
+; CHECK-NEXT:    xor.b64 %rd7, %rd2, %rd4;
+; CHECK-NEXT:    and.b64 %rd8, %rd7, %rd6;
+; CHECK-NEXT:    xor.b64 %rd9, %rd8, %rd4;
+; CHECK-NEXT:    xor.b64 %rd10, %rd1, %rd3;
+; CHECK-NEXT:    and.b64 %rd11, %rd10, %rd5;
+; CHECK-NEXT:    xor.b64 %rd12, %rd11, %rd3;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd12, %rd9};
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i64> %x, %mask
   %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
diff --git a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
new file mode 100644
index 0000000000000..631b7109281e5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV32,RV32I
+; RUN: llc -mtriple=riscv64 < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV32,RV32ZBB
+; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
+;
+; test that masked-merge code is generated as "xor;and;xor" sequence or
+; "andn ; and; or" if and-not is available.
+
+define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: masked_merge0:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a1, a1, a2
+; CHECK-I-NEXT:    and a0, a1, a0
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge0:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
+; CHECK-I-LABEL: masked_merge1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a1, a1, a2
+; CHECK-I-NEXT:    and a0, a1, a0
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i16 %a0, %a1
+  %not = xor i16 %a0, -1
+  %and1 = and i16 %a2, %not
+  %or = or i16 %and0, %and1
+  ret i16 %or
+}
+
+define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
+; CHECK-I-LABEL: masked_merge2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    mv a0, a1
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    andn a2, a1, a0
+; CHECK-ZBB-NEXT:    and a0, a1, a0
+; CHECK-ZBB-NEXT:    or a0, a2, a0
+; CHECK-ZBB-NEXT:    ret
+  %not = xor i8 %a0, -1
+  %and0 = and i8 %not, %a1
+  %and1 = and i8 %a1, %a0
+  %or = or i8 %and0, %and1
+  ret i8 %or
+}
+
+define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
+; RV32I-LABEL: masked_merge3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a5, a5
+; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    xor a3, a3, a5
+; RV32I-NEXT:    xor a2, a2, a4
+; RV32I-NEXT:    not a2, a2
+; RV32I-NEXT:    not a3, a3
+; RV32I-NEXT:    and a0, a2, a0
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    xor a0, a0, a4
+; RV32I-NEXT:    xor a1, a1, a5
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: masked_merge3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    xor a1, a1, a2
+; RV64I-NEXT:    not a1, a1
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32ZBB-LABEL: masked_merge3:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a6, a0
+; RV32ZBB-NEXT:    not a7, a1
+; RV32ZBB-NEXT:    andn a1, a1, a3
+; RV32ZBB-NEXT:    andn a0, a0, a2
+; RV32ZBB-NEXT:    andn a2, a7, a5
+; RV32ZBB-NEXT:    andn a3, a6, a4
+; RV32ZBB-NEXT:    or a0, a3, a0
+; RV32ZBB-NEXT:    or a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64ZBB-LABEL: masked_merge3:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    not a3, a0
+; RV64ZBB-NEXT:    andn a2, a3, a2
+; RV64ZBB-NEXT:    andn a0, a0, a1
+; RV64ZBB-NEXT:    or a0, a2, a0
+; RV64ZBB-NEXT:    ret
+  %v0 = xor i64 %a1, -1
+  %v1 = xor i64 %a2, -1
+  %not = xor i64 %a0, -1
+  %and0 = and i64 %not, %v1
+  %and1 = and i64 %v0, %a0
+  %or = or i64 %and0, %and1
+  ret i64 %or
+}
+
+define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; RV32-LABEL: not_a_masked_merge0:
+; RV32:       # %bb.0:
+; RV32-NEXT:    and a1, a0, a1
+; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: not_a_masked_merge0:
+; RV64:       # %bb.0:
+; RV64-NEXT:    and a1, a0, a1
+; RV64-NEXT:    negw a0, a0
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not_a_not = sub i32 0, %a0
+  %and1 = and i32 %not_a_not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-I-LABEL: not_a_masked_merge1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a0, a0, a1
+; CHECK-I-NEXT:    not a1, a3
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a0, a0, a1
+; CHECK-ZBB-NEXT:    andn a1, a2, a3
+; CHECK-ZBB-NEXT:    or a0, a0, a1
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a3, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: not_a_masked_merge2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    or a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    or a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %not_an_and0 = or i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %not_an_and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: not_a_masked_merge3:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge3:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    xor a0, a0, a2
+; CHECK-ZBB-NEXT:    orn a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %not_an_and1 = xor i32 %not, %a2
+  %or = or i32 %and0, %not_an_and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-LABEL: not_a_masked_merge4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a2, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform0:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    sw a1, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform0:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    sw a1, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and0, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a4, a0
+; CHECK-I-NEXT:    and a0, a4, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    sw a4, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    not a4, a0
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    sw a4, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %not, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a2, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a2
+; CHECK-I-NEXT:    sw a2, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a2, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a2
+; CHECK-ZBB-NEXT:    sw a2, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and1, ptr %p1
+  ret i32 %or
+}
diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
index 1517e524a7f78..efc8243df71e0 100644
--- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
@@ -8,16 +8,13 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
 
-; TODO: Should we convert these to X ^ ((X ^ Y) & M) form when Zbb isn't
-; present?
 
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-I-LABEL: out8:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out8:
@@ -36,10 +33,9 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-I-LABEL: out16:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out16:
@@ -58,10 +54,9 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out32:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out32:
@@ -80,22 +75,19 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 ; RV32I-LABEL: out64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    xor a0, a0, a2
+; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    and a3, a3, a5
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    or a0, a0, a2
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    xor a0, a0, a2
+; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: out64:
 ; RV64I:       # %bb.0:
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    not a2, a2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: out64:
@@ -660,10 +652,9 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a1, a2
-; CHECK-I-NEXT:    and a0, a2, a0
-; CHECK-I-NEXT:    andi a1, a1, 42
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a0, a0, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xori a0, a0, 42
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42:
@@ -704,10 +695,9 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a1, a2
-; CHECK-I-NEXT:    and a0, a1, a0
-; CHECK-I-NEXT:    andi a1, a2, 42
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a1, a0, 42
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    xor a0, a1, a0
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42_invmask:
@@ -812,10 +802,9 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a0, a2
-; CHECK-I-NEXT:    andi a2, a2, 42
-; CHECK-I-NEXT:    and a0, a0, a1
-; CHECK-I-NEXT:    or a0, a2, a0
+; CHECK-I-NEXT:    xori a0, a1, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary:
@@ -855,10 +844,9 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a0, a2
-; CHECK-I-NEXT:    andi a0, a0, 42
-; CHECK-I-NEXT:    and a1, a2, a1
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a0, a1, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xori a0, a0, 42
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary_invmask:
diff --git a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
new file mode 100644
index 0000000000000..c014345507f69
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
@@ -0,0 +1,277 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=NO-MISC3
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s --check-prefix=MISC3
+
+; test that masked-merge code is generated as "xor;and;xor" sequence or
+; "andn ; and; or" if and-not is available.
+
+define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: masked_merge0:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    xr %r3, %r4
+; NO-MISC3-NEXT:    nr %r2, %r3
+; NO-MISC3-NEXT:    xr %r2, %r4
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge0:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    ncrk %r2, %r4, %r2
+; MISC3-NEXT:    or %r2, %r3
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
+; NO-MISC3-LABEL: masked_merge1:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    xr %r3, %r4
+; NO-MISC3-NEXT:    nr %r2, %r3
+; NO-MISC3-NEXT:    xr %r2, %r4
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge1:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    ncrk %r0, %r4, %r2
+; MISC3-NEXT:    nr %r2, %r3
+; MISC3-NEXT:    or %r2, %r0
+; MISC3-NEXT:    br %r14
+  %and0 = and i16 %a0, %a1
+  %not = xor i16 %a0, -1
+  %and1 = and i16 %a2, %not
+  %or = or i16 %and0, %and1
+  ret i16 %or
+}
+
+define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
+; NO-MISC3-LABEL: masked_merge2:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    lr %r2, %r3
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge2:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    lr %r2, %r3
+; MISC3-NEXT:    br %r14
+  %not = xor i8 %a0, -1
+  %and0 = and i8 %not, %a1
+  %and1 = and i8 %a1, %a0
+  %or = or i8 %and0, %and1
+  ret i8 %or
+}
+
+define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
+; NO-MISC3-LABEL: masked_merge3:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    lcgr %r0, %r4
+; NO-MISC3-NEXT:    aghi %r0, -1
+; NO-MISC3-NEXT:    xgr %r3, %r0
+; NO-MISC3-NEXT:    ngr %r3, %r2
+; NO-MISC3-NEXT:    xgr %r3, %r2
+; NO-MISC3-NEXT:    xgrk %r2, %r3, %r0
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge3:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    lcgr %r0, %r2
+; MISC3-NEXT:    aghi %r0, -1
+; MISC3-NEXT:    ncgrk %r0, %r0, %r4
+; MISC3-NEXT:    ncgrk %r2, %r2, %r3
+; MISC3-NEXT:    ogr %r2, %r0
+; MISC3-NEXT:    br %r14
+  %v0 = xor i64 %a1, -1
+  %v1 = xor i64 %a2, -1
+  %not = xor i64 %a0, -1
+  %and0 = and i64 %not, %v1
+  %and1 = and i64 %v0, %a0
+  %or = or i64 %and0, %and1
+  ret i64 %or
+}
+
+define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge0:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    lcr %r0, %r2
+; NO-MISC3-NEXT:    nr %r3, %r2
+; NO-MISC3-NEXT:    nr %r0, %r4
+; NO-MISC3-NEXT:    ork %r2, %r3, %r0
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge0:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    lcr %r0, %r2
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    nr %r0, %r4
+; MISC3-NEXT:    ork %r2, %r3, %r0
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not_a_not = sub i32 0, %a0
+  %and1 = and i32 %not_a_not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; NO-MISC3-LABEL: not_a_masked_merge1:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    xilf %r5, 4294967295
+; NO-MISC3-NEXT:    nr %r2, %r3
+; NO-MISC3-NEXT:    nr %r4, %r5
+; NO-MISC3-NEXT:    or %r2, %r4
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge1:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r2, %r3
+; MISC3-NEXT:    ncrk %r0, %r4, %r5
+; MISC3-NEXT:    or %r2, %r0
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a3, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge2:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    or %r3, %r2
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    nr %r2, %r4
+; NO-MISC3-NEXT:    or %r2, %r3
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge2:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    or %r3, %r2
+; MISC3-NEXT:    ncrk %r2, %r4, %r2
+; MISC3-NEXT:    or %r2, %r3
+; MISC3-NEXT:    br %r14
+  %not_an_and0 = or i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %not_an_and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge3:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nr %r3, %r2
+; NO-MISC3-NEXT:    xr %r2, %r4
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    or %r2, %r3
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge3:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    xr %r2, %r4
+; MISC3-NEXT:    ocrk %r2, %r3, %r2
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %not_an_and1 = xor i32 %not, %a2
+  %or = or i32 %and0, %not_an_and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge4:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nr %r2, %r3
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge4:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r2, %r3
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a2, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform0:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nr %r3, %r2
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    nr %r2, %r4
+; NO-MISC3-NEXT:    or %r2, %r3
+; NO-MISC3-NEXT:    st %r3, 0(%r5)
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform0:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    ncrk %r2, %r4, %r2
+; MISC3-NEXT:    or %r2, %r3
+; MISC3-NEXT:    st %r3, 0(%r5)
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and0, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform1:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nrk %r0, %r2, %r3
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    nr %r4, %r2
+; NO-MISC3-NEXT:    or %r0, %r4
+; NO-MISC3-NEXT:    st %r2, 0(%r5)
+; NO-MISC3-NEXT:    lr %r2, %r0
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform1:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nrk %r0, %r2, %r3
+; MISC3-NEXT:    ncrk %r1, %r4, %r2
+; MISC3-NEXT:    xilf %r2, 4294967295
+; MISC3-NEXT:    or %r0, %r1
+; MISC3-NEXT:    st %r2, 0(%r5)
+; MISC3-NEXT:    lr %r2, %r0
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %not, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform2:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nr %r3, %r2
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    nr %r4, %r2
+; NO-MISC3-NEXT:    ork %r2, %r3, %r4
+; NO-MISC3-NEXT:    st %r4, 0(%r5)
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform2:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    ncrk %r0, %r4, %r2
+; MISC3-NEXT:    ork %r2, %r3, %r0
+; MISC3-NEXT:    st %r0, 0(%r5)
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and1, ptr %p1
+  ret i32 %or
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 185c46aa5681e..e3607e12bf530 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -4465,203 +4465,139 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v16i8:
 ; NO-SIMD128:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.and $push0=, $16, $32
-; NO-SIMD128-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-NEXT:    i32.xor $push2=, $16, $pop1
-; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $48
-; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT:    i32.store8 15($0), $pop4
-; NO-SIMD128-NEXT:    i32.and $push5=, $15, $31
-; NO-SIMD128-NEXT:    i32.const $push79=, -1
-; NO-SIMD128-NEXT:    i32.xor $push6=, $15, $pop79
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $47
-; NO-SIMD128-NEXT:    i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-NEXT:    i32.store8 14($0), $pop8
-; NO-SIMD128-NEXT:    i32.and $push9=, $14, $30
-; NO-SIMD128-NEXT:    i32.const $push78=, -1
-; NO-SIMD128-NEXT:    i32.xor $push10=, $14, $pop78
-; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $46
-; NO-SIMD128-NEXT:    i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-NEXT:    i32.store8 13($0), $pop12
-; NO-SIMD128-NEXT:    i32.and $push13=, $13, $29
-; NO-SIMD128-NEXT:    i32.const $push77=, -1
-; NO-SIMD128-NEXT:    i32.xor $push14=, $13, $pop77
-; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $45
-; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT:    i32.store8 12($0), $pop16
-; NO-SIMD128-NEXT:    i32.and $push17=, $12, $28
-; NO-SIMD128-NEXT:    i32.const $push76=, -1
-; NO-SIMD128-NEXT:    i32.xor $push18=, $12, $pop76
-; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $44
-; NO-SIMD128-NEXT:    i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-NEXT:    i32.store8 11($0), $pop20
-; NO-SIMD128-NEXT:    i32.and $push21=, $11, $27
-; NO-SIMD128-NEXT:    i32.const $push75=, -1
-; NO-SIMD128-NEXT:    i32.xor $push22=, $11, $pop75
-; NO-SIMD128-NEXT:    i32.and $push23=, $pop22, $43
-; NO-SIMD128-NEXT:    i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-NEXT:    i32.store8 10($0), $pop24
-; NO-SIMD128-NEXT:    i32.and $push25=, $10, $26
-; NO-SIMD128-NEXT:    i32.const $push74=, -1
-; NO-SIMD128-NEXT:    i32.xor $push26=, $10, $pop74
-; NO-SIMD128-NEXT:    i32.and $push27=, $pop26, $42
-; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT:    i32.store8 9($0), $pop28
-; NO-SIMD128-NEXT:    i32.and $push29=, $9, $25
-; NO-SIMD128-NEXT:    i32.const $push73=, -1
-; NO-SIMD128-NEXT:    i32.xor $push30=, $9, $pop73
-; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $41
-; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT:    i32.store8 8($0), $pop32
-; NO-SIMD128-NEXT:    i32.and $push33=, $8, $24
-; NO-SIMD128-NEXT:    i32.const $push72=, -1
-; NO-SIMD128-NEXT:    i32.xor $push34=, $8, $pop72
-; NO-SIMD128-NEXT:    i32.and $push35=, $pop34, $40
-; NO-SIMD128-NEXT:    i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-NEXT:    i32.store8 7($0), $pop36
-; NO-SIMD128-NEXT:    i32.and $push37=, $7, $23
-; NO-SIMD128-NEXT:    i32.const $push71=, -1
-; NO-SIMD128-NEXT:    i32.xor $push38=, $7, $pop71
-; NO-SIMD128-NEXT:    i32.and $push39=, $pop38, $39
-; NO-SIMD128-NEXT:    i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT:    i32.store8 6($0), $pop40
-; NO-SIMD128-NEXT:    i32.and $push41=, $6, $22
-; NO-SIMD128-NEXT:    i32.const $push70=, -1
-; NO-SIMD128-NEXT:    i32.xor $push42=, $6, $pop70
-; NO-SIMD128-NEXT:    i32.and $push43=, $pop42, $38
-; NO-SIMD128-NEXT:    i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-NEXT:    i32.store8 5($0), $pop44
-; NO-SIMD128-NEXT:    i32.and $push45=, $5, $21
-; NO-SIMD128-NEXT:    i32.const $push69=, -1
-; NO-SIMD128-NEXT:    i32.xor $push46=, $5, $pop69
-; NO-SIMD128-NEXT:    i32.and $push47=, $pop46, $37
-; NO-SIMD128-NEXT:    i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-NEXT:    i32.store8 4($0), $pop48
-; NO-SIMD128-NEXT:    i32.and $push49=, $4, $20
-; NO-SIMD128-NEXT:    i32.const $push68=, -1
-; NO-SIMD128-NEXT:    i32.xor $push50=, $4, $pop68
-; NO-SIMD128-NEXT:    i32.and $push51=, $pop50, $36
-; NO-SIMD128-NEXT:    i32.or $push52=, $pop49, $pop51
-; NO-SIMD128-NEXT:    i32.store8 3($0), $pop52
-; NO-SIMD128-NEXT:    i32.and $push53=, $3, $19
-; NO-SIMD128-NEXT:    i32.const $push67=, -1
-; NO-SIMD128-NEXT:    i32.xor $push54=, $3, $pop67
-; NO-SIMD128-NEXT:    i32.and $push55=, $pop54, $35
-; NO-SIMD128-NEXT:    i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-NEXT:    i32.store8 2($0), $pop56
-; NO-SIMD128-NEXT:    i32.and $push57=, $2, $18
-; NO-SIMD128-NEXT:    i32.const $push66=, -1
-; NO-SIMD128-NEXT:    i32.xor $push58=, $2, $pop66
-; NO-SIMD128-NEXT:    i32.and $push59=, $pop58, $34
-; NO-SIMD128-NEXT:    i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-NEXT:    i32.store8 1($0), $pop60
-; NO-SIMD128-NEXT:    i32.and $push61=, $1, $17
-; NO-SIMD128-NEXT:    i32.const $push65=, -1
-; NO-SIMD128-NEXT:    i32.xor $push62=, $1, $pop65
-; NO-SIMD128-NEXT:    i32.and $push63=, $pop62, $33
-; NO-SIMD128-NEXT:    i32.or $push64=, $pop61, $pop63
-; NO-SIMD128-NEXT:    i32.store8 0($0), $pop64
+; NO-SIMD128-NEXT:    i32.xor $push0=, $32, $48
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $16
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $48
+; NO-SIMD128-NEXT:    i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push3=, $31, $47
+; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $15
+; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $47
+; NO-SIMD128-NEXT:    i32.store8 14($0), $pop5
+; NO-SIMD128-NEXT:    i32.xor $push6=, $30, $46
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $14
+; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $46
+; NO-SIMD128-NEXT:    i32.store8 13($0), $pop8
+; NO-SIMD128-NEXT:    i32.xor $push9=, $29, $45
+; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $13
+; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $45
+; NO-SIMD128-NEXT:    i32.store8 12($0), $pop11
+; NO-SIMD128-NEXT:    i32.xor $push12=, $28, $44
+; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $12
+; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $44
+; NO-SIMD128-NEXT:    i32.store8 11($0), $pop14
+; NO-SIMD128-NEXT:    i32.xor $push15=, $27, $43
+; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $11
+; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $43
+; NO-SIMD128-NEXT:    i32.store8 10($0), $pop17
+; NO-SIMD128-NEXT:    i32.xor $push18=, $26, $42
+; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $10
+; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $42
+; NO-SIMD128-NEXT:    i32.store8 9($0), $pop20
+; NO-SIMD128-NEXT:    i32.xor $push21=, $25, $41
+; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $9
+; NO-SIMD128-NEXT:    i32.xor $push23=, $pop22, $41
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop23
+; NO-SIMD128-NEXT:    i32.xor $push24=, $24, $40
+; NO-SIMD128-NEXT:    i32.and $push25=, $pop24, $8
+; NO-SIMD128-NEXT:    i32.xor $push26=, $pop25, $40
+; NO-SIMD128-NEXT:    i32.store8 7($0), $pop26
+; NO-SIMD128-NEXT:    i32.xor $push27=, $23, $39
+; NO-SIMD128-NEXT:    i32.and $push28=, $pop27, $7
+; NO-SIMD128-NEXT:    i32.xor $push29=, $pop28, $39
+; NO-SIMD128-NEXT:    i32.store8 6($0), $pop29
+; NO-SIMD128-NEXT:    i32.xor $push30=, $22, $38
+; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $6
+; NO-SIMD128-NEXT:    i32.xor $push32=, $pop31, $38
+; NO-SIMD128-NEXT:    i32.store8 5($0), $pop32
+; NO-SIMD128-NEXT:    i32.xor $push33=, $21, $37
+; NO-SIMD128-NEXT:    i32.and $push34=, $pop33, $5
+; NO-SIMD128-NEXT:    i32.xor $push35=, $pop34, $37
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop35
+; NO-SIMD128-NEXT:    i32.xor $push36=, $20, $36
+; NO-SIMD128-NEXT:    i32.and $push37=, $pop36, $4
+; NO-SIMD128-NEXT:    i32.xor $push38=, $pop37, $36
+; NO-SIMD128-NEXT:    i32.store8 3($0), $pop38
+; NO-SIMD128-NEXT:    i32.xor $push39=, $19, $35
+; NO-SIMD128-NEXT:    i32.and $push40=, $pop39, $3
+; NO-SIMD128-NEXT:    i32.xor $push41=, $pop40, $35
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop41
+; NO-SIMD128-NEXT:    i32.xor $push42=, $18, $34
+; NO-SIMD128-NEXT:    i32.and $push43=, $pop42, $2
+; NO-SIMD128-NEXT:    i32.xor $push44=, $pop43, $34
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop44
+; NO-SIMD128-NEXT:    i32.xor $push45=, $17, $33
+; NO-SIMD128-NEXT:    i32.and $push46=, $pop45, $1
+; NO-SIMD128-NEXT:    i32.xor $push47=, $pop46, $33
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop47
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
 ; NO-SIMD128-FAST:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $17
-; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $33
-; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $18
-; NO-SIMD128-FAST-NEXT:    i32.const $push79=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop79
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $34
-; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $19
-; NO-SIMD128-FAST-NEXT:    i32.const $push78=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop78
-; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $35
-; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $20
-; NO-SIMD128-FAST-NEXT:    i32.const $push77=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop77
-; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $36
-; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop16
-; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $5, $21
-; NO-SIMD128-FAST-NEXT:    i32.const $push76=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $5, $pop76
-; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $37
-; NO-SIMD128-FAST-NEXT:    i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop20
-; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $6, $22
-; NO-SIMD128-FAST-NEXT:    i32.const $push75=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $6, $pop75
-; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $38
-; NO-SIMD128-FAST-NEXT:    i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop24
-; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $7, $23
-; NO-SIMD128-FAST-NEXT:    i32.const $push74=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $7, $pop74
-; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $pop26, $39
-; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop28
-; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $8, $24
-; NO-SIMD128-FAST-NEXT:    i32.const $push73=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $8, $pop73
-; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $40
-; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop32
-; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $9, $25
-; NO-SIMD128-FAST-NEXT:    i32.const $push72=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push34=, $9, $pop72
-; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $pop34, $41
-; NO-SIMD128-FAST-NEXT:    i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop36
-; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $10, $26
-; NO-SIMD128-FAST-NEXT:    i32.const $push71=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $10, $pop71
-; NO-SIMD128-FAST-NEXT:    i32.and $push39=, $pop38, $42
-; NO-SIMD128-FAST-NEXT:    i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop40
-; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $11, $27
-; NO-SIMD128-FAST-NEXT:    i32.const $push70=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $11, $pop70
-; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $43
-; NO-SIMD128-FAST-NEXT:    i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop44
-; NO-SIMD128-FAST-NEXT:    i32.and $push45=, $12, $28
-; NO-SIMD128-FAST-NEXT:    i32.const $push69=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push46=, $12, $pop69
-; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $pop46, $44
-; NO-SIMD128-FAST-NEXT:    i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop48
-; NO-SIMD128-FAST-NEXT:    i32.and $push49=, $13, $29
-; NO-SIMD128-FAST-NEXT:    i32.const $push68=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push50=, $13, $pop68
-; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $pop50, $45
-; NO-SIMD128-FAST-NEXT:    i32.or $push52=, $pop49, $pop51
-; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop52
-; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $14, $30
-; NO-SIMD128-FAST-NEXT:    i32.const $push67=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push54=, $14, $pop67
-; NO-SIMD128-FAST-NEXT:    i32.and $push55=, $pop54, $46
-; NO-SIMD128-FAST-NEXT:    i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop56
-; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $15, $31
-; NO-SIMD128-FAST-NEXT:    i32.const $push66=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push58=, $15, $pop66
-; NO-SIMD128-FAST-NEXT:    i32.and $push59=, $pop58, $47
-; NO-SIMD128-FAST-NEXT:    i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop60
-; NO-SIMD128-FAST-NEXT:    i32.and $push61=, $16, $32
-; NO-SIMD128-FAST-NEXT:    i32.const $push65=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push62=, $16, $pop65
-; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $pop62, $48
-; NO-SIMD128-FAST-NEXT:    i32.or $push64=, $pop61, $pop63
-; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop64
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $17, $33
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $33
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $18, $34
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $34
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $19, $35
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $35
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $20, $36
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $36
+; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $21, $37
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $pop13, $37
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop14
+; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $22, $38
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $pop16, $38
+; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $23, $39
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $pop19, $39
+; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $24, $40
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $pop22, $40
+; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $25, $41
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $pop25, $41
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.xor $push27=, $26, $42
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $10
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $pop28, $42
+; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $27, $43
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $11
+; NO-SIMD128-FAST-NEXT:    i32.xor $push32=, $pop31, $43
+; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.xor $push33=, $28, $44
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $pop33, $12
+; NO-SIMD128-FAST-NEXT:    i32.xor $push35=, $pop34, $44
+; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $29, $45
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $pop36, $13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $pop37, $45
+; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.xor $push39=, $30, $46
+; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $pop39, $14
+; NO-SIMD128-FAST-NEXT:    i32.xor $push41=, $pop40, $46
+; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop41
+; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $31, $47
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $15
+; NO-SIMD128-FAST-NEXT:    i32.xor $push44=, $pop43, $47
+; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.xor $push45=, $32, $48
+; NO-SIMD128-FAST-NEXT:    i32.and $push46=, $pop45, $16
+; NO-SIMD128-FAST-NEXT:    i32.xor $push47=, $pop46, $48
+; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop47
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <16 x i8> %c, %v1
   %inv_mask = xor <16 x i8> %c,
@@ -7546,107 +7482,75 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v8i16:
 ; NO-SIMD128:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.and $push0=, $16, $8
-; NO-SIMD128-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-NEXT:    i32.xor $push2=, $8, $pop1
-; NO-SIMD128-NEXT:    i32.and $push3=, $24, $pop2
-; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT:    i32.store16 14($0), $pop4
-; NO-SIMD128-NEXT:    i32.and $push5=, $15, $7
-; NO-SIMD128-NEXT:    i32.const $push39=, -1
-; NO-SIMD128-NEXT:    i32.xor $push6=, $7, $pop39
-; NO-SIMD128-NEXT:    i32.and $push7=, $23, $pop6
-; NO-SIMD128-NEXT:    i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-NEXT:    i32.store16 12($0), $pop8
-; NO-SIMD128-NEXT:    i32.and $push9=, $14, $6
-; NO-SIMD128-NEXT:    i32.const $push38=, -1
-; NO-SIMD128-NEXT:    i32.xor $push10=, $6, $pop38
-; NO-SIMD128-NEXT:    i32.and $push11=, $22, $pop10
-; NO-SIMD128-NEXT:    i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-NEXT:    i32.store16 10($0), $pop12
-; NO-SIMD128-NEXT:    i32.and $push13=, $13, $5
-; NO-SIMD128-NEXT:    i32.const $push37=, -1
-; NO-SIMD128-NEXT:    i32.xor $push14=, $5, $pop37
-; NO-SIMD128-NEXT:    i32.and $push15=, $21, $pop14
-; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT:    i32.store16 8($0), $pop16
-; NO-SIMD128-NEXT:    i32.and $push17=, $12, $4
-; NO-SIMD128-NEXT:    i32.const $push36=, -1
-; NO-SIMD128-NEXT:    i32.xor $push18=, $4, $pop36
-; NO-SIMD128-NEXT:    i32.and $push19=, $20, $pop18
-; NO-SIMD128-NEXT:    i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-NEXT:    i32.store16 6($0), $pop20
-; NO-SIMD128-NEXT:    i32.and $push21=, $11, $3
-; NO-SIMD128-NEXT:    i32.const $push35=, -1
-; NO-SIMD128-NEXT:    i32.xor $push22=, $3, $pop35
-; NO-SIMD128-NEXT:    i32.and $push23=, $19, $pop22
-; NO-SIMD128-NEXT:    i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-NEXT:    i32.store16 4($0), $pop24
-; NO-SIMD128-NEXT:    i32.and $push25=, $10, $2
-; NO-SIMD128-NEXT:    i32.const $push34=, -1
-; NO-SIMD128-NEXT:    i32.xor $push26=, $2, $pop34
-; NO-SIMD128-NEXT:    i32.and $push27=, $18, $pop26
-; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT:    i32.store16 2($0), $pop28
-; NO-SIMD128-NEXT:    i32.and $push29=, $9, $1
-; NO-SIMD128-NEXT:    i32.const $push33=, -1
-; NO-SIMD128-NEXT:    i32.xor $push30=, $1, $pop33
-; NO-SIMD128-NEXT:    i32.and $push31=, $17, $pop30
-; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT:    i32.store16 0($0), $pop32
+; NO-SIMD128-NEXT:    i32.xor $push0=, $16, $24
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $8
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $24
+; NO-SIMD128-NEXT:    i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push3=, $15, $23
+; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $7
+; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $23
+; NO-SIMD128-NEXT:    i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT:    i32.xor $push6=, $14, $22
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $6
+; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $22
+; NO-SIMD128-NEXT:    i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT:    i32.xor $push9=, $13, $21
+; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $5
+; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $21
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT:    i32.xor $push12=, $12, $20
+; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $4
+; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $20
+; NO-SIMD128-NEXT:    i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT:    i32.xor $push15=, $11, $19
+; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $3
+; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $19
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT:    i32.xor $push18=, $10, $18
+; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $2
+; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $18
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT:    i32.xor $push21=, $9, $17
+; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $1
+; NO-SIMD128-NEXT:    i32.xor $push23=, $pop22, $17
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop23
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
 ; NO-SIMD128-FAST:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $9, $1
-; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $17, $pop2
-; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $10, $2
-; NO-SIMD128-FAST-NEXT:    i32.const $push39=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop39
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $18, $pop6
-; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $11, $3
-; NO-SIMD128-FAST-NEXT:    i32.const $push38=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop38
-; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $19, $pop10
-; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $12, $4
-; NO-SIMD128-FAST-NEXT:    i32.const $push37=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop37
-; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $20, $pop14
-; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop16
-; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $13, $5
-; NO-SIMD128-FAST-NEXT:    i32.const $push36=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $5, $pop36
-; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $21, $pop18
-; NO-SIMD128-FAST-NEXT:    i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop20
-; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $14, $6
-; NO-SIMD128-FAST-NEXT:    i32.const $push35=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $6, $pop35
-; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $22, $pop22
-; NO-SIMD128-FAST-NEXT:    i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop24
-; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $15, $7
-; NO-SIMD128-FAST-NEXT:    i32.const $push34=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $7, $pop34
-; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $23, $pop26
-; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop28
-; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $16, $8
-; NO-SIMD128-FAST-NEXT:    i32.const $push33=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $8, $pop33
-; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $24, $pop30
-; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $9, $17
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $10, $18
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $18
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $11, $19
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $19
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $12, $20
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $20
+; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $13, $21
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $pop13, $21
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $14, $22
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $pop16, $22
+; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $15, $23
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $pop19, $23
+; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $16, $24
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $pop22, $24
+; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop23
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <8 x i16> %v1, %c
   %inv_mask = xor <8 x i16>
@@ -9453,59 +9357,43 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v4i32:
 ; NO-SIMD128:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-NEXT:    i32.xor $push2=, $4, $pop1
-; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $12
-; NO-SIMD128-NEXT:    i32.and $push0=, $4, $8
-; NO-SIMD128-NEXT:    i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT:    i32.store 12($0), $pop4
-; NO-SIMD128-NEXT:    i32.const $push19=, -1
-; NO-SIMD128-NEXT:    i32.xor $push6=, $3, $pop19
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $11
-; NO-SIMD128-NEXT:    i32.and $push5=, $3, $7
-; NO-SIMD128-NEXT:    i32.or $push8=, $pop7, $pop5
-; NO-SIMD128-NEXT:    i32.store 8($0), $pop8
-; NO-SIMD128-NEXT:    i32.const $push18=, -1
-; NO-SIMD128-NEXT:    i32.xor $push10=, $2, $pop18
-; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $10
-; NO-SIMD128-NEXT:    i32.and $push9=, $2, $6
-; NO-SIMD128-NEXT:    i32.or $push12=, $pop11, $pop9
-; NO-SIMD128-NEXT:    i32.store 4($0), $pop12
-; NO-SIMD128-NEXT:    i32.const $push17=, -1
-; NO-SIMD128-NEXT:    i32.xor $push14=, $1, $pop17
-; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $9
-; NO-SIMD128-NEXT:    i32.and $push13=, $1, $5
-; NO-SIMD128-NEXT:    i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-NEXT:    i32.store 0($0), $pop16
+; NO-SIMD128-NEXT:    i32.xor $push0=, $8, $12
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $4
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $12
+; NO-SIMD128-NEXT:    i32.store 12($0), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push3=, $7, $11
+; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $3
+; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $11
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop5
+; NO-SIMD128-NEXT:    i32.xor $push6=, $6, $10
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $2
+; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $10
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop8
+; NO-SIMD128-NEXT:    i32.xor $push9=, $5, $9
+; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $1
+; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $9
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop11
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
 ; NO-SIMD128-FAST:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $9
-; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $5
-; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT:    i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop19
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $10
-; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $6
-; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop7, $pop5
-; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.const $push18=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop18
-; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $11
-; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $7
-; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop11, $pop9
-; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT:    i32.const $push17=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop17
-; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $12
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $8
-; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-FAST-NEXT:    i32.store 12($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $5, $9
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $6, $10
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $10
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $7, $11
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $11
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $8, $12
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $12
+; NO-SIMD128-FAST-NEXT:    i32.store 12($0), $pop11
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <4 x i32> %c, %v1
   %inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
@@ -10974,35 +10862,27 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v2i64:
 ; NO-SIMD128:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i64.const $push1=, -1
-; NO-SIMD128-NEXT:    i64.xor $push2=, $2, $pop1
-; NO-SIMD128-NEXT:    i64.and $push3=, $6, $pop2
-; NO-SIMD128-NEXT:    i64.and $push0=, $4, $2
-; NO-SIMD128-NEXT:    i64.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT:    i64.store 8($0), $pop4
-; NO-SIMD128-NEXT:    i64.const $push9=, -1
-; NO-SIMD128-NEXT:    i64.xor $push6=, $1, $pop9
-; NO-SIMD128-NEXT:    i64.and $push7=, $5, $pop6
-; NO-SIMD128-NEXT:    i64.and $push5=, $3, $1
-; NO-SIMD128-NEXT:    i64.or $push8=, $pop7, $pop5
-; NO-SIMD128-NEXT:    i64.store 0($0), $pop8
+; NO-SIMD128-NEXT:    i64.xor $push0=, $4, $6
+; NO-SIMD128-NEXT:    i64.and $push1=, $pop0, $2
+; NO-SIMD128-NEXT:    i64.xor $push2=, $pop1, $6
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.xor $push3=, $3, $5
+; NO-SIMD128-NEXT:    i64.and $push4=, $pop3, $1
+; NO-SIMD128-NEXT:    i64.xor $push5=, $pop4, $5
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v2i64:
 ; NO-SIMD128-FAST:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i64.const $push1=, -1
-; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT:    i64.and $push3=, $5, $pop2
-; NO-SIMD128-FAST-NEXT:    i64.and $push0=, $3, $1
-; NO-SIMD128-FAST-NEXT:    i64.or $push4=, $pop3, $pop0
-; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT:    i64.const $push9=, -1
-; NO-SIMD128-FAST-NEXT:    i64.xor $push6=, $2, $pop9
-; NO-SIMD128-FAST-NEXT:    i64.and $push7=, $6, $pop6
-; NO-SIMD128-FAST-NEXT:    i64.and $push5=, $4, $2
-; NO-SIMD128-FAST-NEXT:    i64.or $push8=, $pop7, $pop5
-; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i64.xor $push0=, $3, $5
+; NO-SIMD128-FAST-NEXT:    i64.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $pop1, $5
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.xor $push3=, $4, $6
+; NO-SIMD128-FAST-NEXT:    i64.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i64.xor $push5=, $pop4, $6
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <2 x i64> %v1, %c
   %inv_mask = xor <2 x i64> <i64 -1, i64 -1>, %c
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 2922113b14ea9..4fc0827ac4dd6 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI
 
 ; PR46472
 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m))
@@ -17,14 +17,22 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 ; X86-NEXT:    xorb %cl, %al
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: bitselect_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    movl %edx, %eax
-; X64-NEXT:    notb %al
-; X64-NEXT:    andb %dil, %al
-; X64-NEXT:    orb %sil, %al
-; X64-NEXT:    retq
+; X64-NOBMI-LABEL: bitselect_i8:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movl %esi, %eax
+; X64-NOBMI-NEXT:    xorl %edi, %eax
+; X64-NOBMI-NEXT:    andl %edx, %eax
+; X64-NOBMI-NEXT:    xorl %edi, %eax
+; X64-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI-LABEL: bitselect_i8:
+; X64-BMI:       # %bb.0:
+; X64-BMI-NEXT:    andnl %edi, %edx, %eax
+; X64-BMI-NEXT:    andl %edx, %esi
+; X64-BMI-NEXT:    orl %esi, %eax
+; X64-BMI-NEXT:    # kill: def $al killed $al killed $eax
+; X64-BMI-NEXT:    retq
   %not = xor i8 %m, -1
   %ma = and i8 %a, %not
   %mb = and i8 %b, %m
@@ -35,21 +43,20 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind {
 ; X86-LABEL: bitselect_i16:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorw %ax, %cx
-; X86-NEXT:    andw {{[0-9]+}}(%esp), %cx
+; X86-NEXT:    xorw %cx, %ax
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bitselect_i16:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl %edx, %eax
-; X64-NOBMI-NEXT:    andl %edx, %esi
-; X64-NOBMI-NEXT:    notl %eax
-; X64-NOBMI-NEXT:    andl %edi, %eax
-; X64-NOBMI-NEXT:    orl %esi, %eax
+; X64-NOBMI-NEXT:    movl %esi, %eax
+; X64-NOBMI-NEXT:    xorl %edi, %eax
+; X64-NOBMI-NEXT:    andl %edx, %eax
+; X64-NOBMI-NEXT:    xorl %edi, %eax
 ; X64-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NOBMI-NEXT:    retq
 ;
@@ -186,13 +193,12 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
 ;
 ; X64-BMI-LABEL: bitselect_i128:
 ; X64-BMI:       # %bb.0:
-; X64-BMI-NEXT:    andnq %rsi, %r9, %rsi
 ; X64-BMI-NEXT:    andnq %rdi, %r8, %rax
-; X64-BMI-NEXT:    andq %r9, %rcx
-; X64-BMI-NEXT:    orq %rcx, %rsi
 ; X64-BMI-NEXT:    andq %r8, %rdx
 ; X64-BMI-NEXT:    orq %rdx, %rax
-; X64-BMI-NEXT:    movq %rsi, %rdx
+; X64-BMI-NEXT:    andnq %rsi, %r9, %rdx
+; X64-BMI-NEXT:    andq %r9, %rcx
+; X64-BMI-NEXT:    orq %rcx, %rdx
 ; X64-BMI-NEXT:    retq
   %not = xor i128 %m, -1
   %ma = and i128 %a, %not
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index b2614c5fe0493..05e7b2a2de372 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -30,18 +30,17 @@ define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
 define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 ; NOBMI-LABEL: masked_merge1:
 ; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movl %edi, %eax
-; NOBMI-NEXT:    andl %edi, %esi
-; NOBMI-NEXT:    notl %eax
-; NOBMI-NEXT:    andl %edx, %eax
-; NOBMI-NEXT:    orl %esi, %eax
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    andl %edi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
 ; NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: masked_merge1:
 ; BMI:       # %bb.0:
-; BMI-NEXT:    andl %edi, %esi
 ; BMI-NEXT:    andnl %edx, %edi, %eax
+; BMI-NEXT:    andl %edi, %esi
 ; BMI-NEXT:    orl %esi, %eax
 ; BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; BMI-NEXT:    retq
@@ -53,20 +52,11 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 }
 
 define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; NOBMI-LABEL: masked_merge2:
-; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movl %esi, %eax
-; NOBMI-NEXT:    # kill: def $al killed $al killed $eax
-; NOBMI-NEXT:    retq
-;
-; BMI-LABEL: masked_merge2:
-; BMI:       # %bb.0:
-; BMI-NEXT:    movl %edi, %eax
-; BMI-NEXT:    notb %al
-; BMI-NEXT:    andb %sil, %al
-; BMI-NEXT:    andb %dil, %sil
-; BMI-NEXT:    orb %sil, %al
-; BMI-NEXT:    retq
+; CHECK-LABEL: masked_merge2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %not = xor i8 %a0, -1
   %and0 = and i8 %not, %a1
   %and1 = and i8 %a1, %a0
@@ -279,3 +269,27 @@ define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
   store i32 %and1, ptr %p1
   ret i32 %or
 }
+
+define i32 @pr137641_crash({ i8, i32 } %0) {
+; NOBMI-LABEL: pr137641_crash:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    andl $201, %eax
+; NOBMI-NEXT:    xorl $1, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: pr137641_crash:
+; BMI:       # %bb.0:
+; BMI-NEXT:    movl %esi, %eax
+; BMI-NEXT:    notl %eax
+; BMI-NEXT:    andl $1, %eax
+; BMI-NEXT:    andl $200, %esi
+; BMI-NEXT:    orl %esi, %eax
+; BMI-NEXT:    retq
+  %asmresult1.i = extractvalue { i8, i32 } %0, 1
+  %not = xor i32 %asmresult1.i, 1
+  %and = and i32 1, %not
+  %and1 = and i32 %asmresult1.i, 200
+  %2 = or i32 %and, %and1
+  ret i32 %2
+}
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 9c9d06921096c..6a55d740fe421 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -6,21 +6,18 @@
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: out8:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notb %al
-; CHECK-NOBMI-NEXT:    andb %sil, %al
-; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %edx, %eax
+; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
 ; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    notb %al
-; CHECK-BMI-NEXT:    andb %sil, %al
-; CHECK-BMI-NEXT:    orb %dil, %al
+; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, %mask
@@ -33,18 +30,17 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-NOBMI-LABEL: out16:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %eax
-; CHECK-NOBMI-NEXT:    andl %esi, %eax
-; CHECK-NOBMI-NEXT:    orl %edi, %eax
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out16:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %edi
 ; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
+; CHECK-BMI-NEXT:    andl %edx, %edi
 ; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index b1194bedc4e1c..809c15881cc9b 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -16,11 +16,10 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notb %al
-; CHECK-NEXT:    andb %sil, %al
-; CHECK-NEXT:    orb %dil, %al
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i8> %x, %mask
@@ -37,32 +36,28 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    notb %r9b
-; CHECK-BASELINE-NEXT:    andb %cl, %r9b
-; CHECK-BASELINE-NEXT:    andb %dl, %al
-; CHECK-BASELINE-NEXT:    orb %dil, %al
-; CHECK-BASELINE-NEXT:    orb %sil, %r9b
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i8:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    notb %r9b
-; CHECK-SSE1-NEXT:    andb %cl, %r9b
-; CHECK-SSE1-NEXT:    andb %dl, %al
-; CHECK-SSE1-NEXT:    orb %dil, %al
-; CHECK-SSE1-NEXT:    orb %sil, %r9b
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i8:
@@ -86,11 +81,10 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notl %eax
-; CHECK-NEXT:    andl %esi, %eax
-; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i16> %x, %mask
@@ -235,32 +229,28 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i16:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notl %eax
-; CHECK-BASELINE-NEXT:    notl %r9d
-; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
-; CHECK-BASELINE-NEXT:    orl %esi, %r9d
-; CHECK-BASELINE-NEXT:    andl %edx, %eax
-; CHECK-BASELINE-NEXT:    orl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i16:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notl %eax
-; CHECK-SSE1-NEXT:    notl %r9d
-; CHECK-SSE1-NEXT:    andl %ecx, %r9d
-; CHECK-SSE1-NEXT:    orl %esi, %r9d
-; CHECK-SSE1-NEXT:    andl %edx, %eax
-; CHECK-SSE1-NEXT:    orl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i16:
@@ -439,9 +429,12 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-BASELINE-LABEL: out_v4i16:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
@@ -451,21 +444,21 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %r11d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %edx
@@ -475,13 +468,10 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16:
@@ -506,43 +496,43 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 ; CHECK-BASELINE-LABEL: out_v4i16_undef:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
-; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16_undef:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
-; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16_undef:
@@ -883,14 +873,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
@@ -906,16 +896,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
-; CHECK-BASELINE-NEXT:    movl %r11d, %ebx
-; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    xorw %r11w, %bx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
-; CHECK-BASELINE-NEXT:    movl %r10d, %r11d
-; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
-; CHECK-BASELINE-NEXT:    movl %edi, %r10d
-; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    xorw %di, %r10w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r10d
 ; CHECK-BASELINE-NEXT:    movw %r10w, 14(%rax)
@@ -941,14 +931,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-SSE1-NEXT:    xorl %r12d, %esi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-SSE1-NEXT:    xorl %r12d, %esi
@@ -964,16 +954,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
 ; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
-; CHECK-SSE1-NEXT:    movl %r11d, %ebx
-; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    xorw %r11w, %bx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
-; CHECK-SSE1-NEXT:    movl %r10d, %r11d
-; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-SSE1-NEXT:    xorl %r10d, %r11d
-; CHECK-SSE1-NEXT:    movl %edi, %r10d
-; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    xorw %di, %r10w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r10d
 ; CHECK-SSE1-NEXT:    movw %r10w, 14(%rax)
@@ -1759,113 +1749,117 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r15d
-; CHECK-BASELINE-NEXT:    movzwl 16(%rdx), %r14d
-; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %ebp
-; CHECK-BASELINE-NEXT:    movzwl 12(%rdx), %ebx
-; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r13d
-; CHECK-BASELINE-NEXT:    movzwl 8(%rdx), %r11d
-; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    movzwl 4(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    movzwl (%rdx), %r8d
-; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %r12d
-; CHECK-BASELINE-NEXT:    movzwl (%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
-; CHECK-BASELINE-NEXT:    andw (%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
-; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
-; CHECK-BASELINE-NEXT:    andw 2(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
-; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
-; CHECK-BASELINE-NEXT:    andw 4(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
-; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r10w, %ax
-; CHECK-BASELINE-NEXT:    andw 6(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r10d
-; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r11w, %ax
-; CHECK-BASELINE-NEXT:    andw 8(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r11d
-; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
-; CHECK-BASELINE-NEXT:    andw 10(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
-; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %eax
+; CHECK-BASELINE-NEXT:    movq %rcx, %r10
+; CHECK-BASELINE-NEXT:    movq %rdx, %r8
+; CHECK-BASELINE-NEXT:    movq %rsi, %r9
+; CHECK-BASELINE-NEXT:    movq %rdi, %r11
+; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movl 16(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
+; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl (%rdx), %ecx
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edx
+; CHECK-BASELINE-NEXT:    movzwl 2(%r8), %esi
+; CHECK-BASELINE-NEXT:    movzwl (%r9), %edi
+; CHECK-BASELINE-NEXT:    xorw %cx, %di
+; CHECK-BASELINE-NEXT:    andw (%r10), %di
+; CHECK-BASELINE-NEXT:    xorl %ecx, %edi
+; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%r9), %ecx
+; CHECK-BASELINE-NEXT:    xorw %si, %cx
+; CHECK-BASELINE-NEXT:    andw 2(%r10), %cx
+; CHECK-BASELINE-NEXT:    xorl %esi, %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 4(%r9), %ecx
+; CHECK-BASELINE-NEXT:    xorw %dx, %cx
+; CHECK-BASELINE-NEXT:    andw 4(%r10), %cx
+; CHECK-BASELINE-NEXT:    xorl %edx, %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 6(%r9), %ecx
+; CHECK-BASELINE-NEXT:    xorw %ax, %cx
+; CHECK-BASELINE-NEXT:    andw 6(%r10), %cx
+; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 8(%r9), %eax
 ; CHECK-BASELINE-NEXT:    xorw %bx, %ax
-; CHECK-BASELINE-NEXT:    andw 12(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
-; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %bp, %ax
-; CHECK-BASELINE-NEXT:    andw 14(%rcx), %ax
+; CHECK-BASELINE-NEXT:    andw 8(%r10), %ax
+; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 10(%r9), %ebx
+; CHECK-BASELINE-NEXT:    xorw %r14w, %bx
+; CHECK-BASELINE-NEXT:    andw 10(%r10), %bx
+; CHECK-BASELINE-NEXT:    xorl %r14d, %ebx
+; CHECK-BASELINE-NEXT:    movzwl 12(%r9), %r14d
+; CHECK-BASELINE-NEXT:    xorw %r12w, %r14w
+; CHECK-BASELINE-NEXT:    andw 12(%r10), %r14w
+; CHECK-BASELINE-NEXT:    xorl %r12d, %r14d
+; CHECK-BASELINE-NEXT:    movzwl 14(%r9), %r12d
+; CHECK-BASELINE-NEXT:    xorw %r13w, %r12w
+; CHECK-BASELINE-NEXT:    andw 14(%r10), %r12w
+; CHECK-BASELINE-NEXT:    xorl %r13d, %r12d
+; CHECK-BASELINE-NEXT:    movzwl 16(%r9), %r13d
+; CHECK-BASELINE-NEXT:    xorw %r15w, %r13w
+; CHECK-BASELINE-NEXT:    andw 16(%r10), %r13w
+; CHECK-BASELINE-NEXT:    xorl %r15d, %r13d
+; CHECK-BASELINE-NEXT:    movzwl 18(%r9), %r15d
+; CHECK-BASELINE-NEXT:    xorw %bp, %r15w
+; CHECK-BASELINE-NEXT:    andw 18(%r10), %r15w
+; CHECK-BASELINE-NEXT:    xorl %ebp, %r15d
+; CHECK-BASELINE-NEXT:    movl 20(%r8), %eax
+; CHECK-BASELINE-NEXT:    movzwl 20(%r9), %ebp
+; CHECK-BASELINE-NEXT:    xorw %ax, %bp
+; CHECK-BASELINE-NEXT:    andw 20(%r10), %bp
 ; CHECK-BASELINE-NEXT:    xorl %eax, %ebp
-; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
-; CHECK-BASELINE-NEXT:    andw 16(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r14d
-; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
-; CHECK-BASELINE-NEXT:    andw 18(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r15d
-; CHECK-BASELINE-NEXT:    movzwl 20(%rdx), %r13d
-; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
-; CHECK-BASELINE-NEXT:    andw 20(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
-; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
-; CHECK-BASELINE-NEXT:    andw 22(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
-; CHECK-BASELINE-NEXT:    movzwl 24(%rdx), %r8d
-; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
-; CHECK-BASELINE-NEXT:    andw 24(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
-; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r10d
-; CHECK-BASELINE-NEXT:    xorw %ax, %r10w
-; CHECK-BASELINE-NEXT:    andw 26(%rcx), %r10w
-; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
-; CHECK-BASELINE-NEXT:    movzwl 28(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %r11d
-; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
-; CHECK-BASELINE-NEXT:    andw 28(%rcx), %r11w
-; CHECK-BASELINE-NEXT:    xorl %r11d, %r10d
-; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edx
-; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
-; CHECK-BASELINE-NEXT:    xorw %dx, %si
-; CHECK-BASELINE-NEXT:    andw 30(%rcx), %si
-; CHECK-BASELINE-NEXT:    xorl %esi, %edx
-; CHECK-BASELINE-NEXT:    movw %dx, 30(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r10w, 28(%rdi)
-; CHECK-BASELINE-NEXT:    movw %ax, 26(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r8w, 24(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r9w, 22(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r13w, 20(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r15w, 18(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r14w, 16(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bp, 14(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bx, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movzwl 22(%r8), %eax
+; CHECK-BASELINE-NEXT:    movzwl 22(%r9), %esi
+; CHECK-BASELINE-NEXT:    xorw %ax, %si
+; CHECK-BASELINE-NEXT:    andw 22(%r10), %si
+; CHECK-BASELINE-NEXT:    xorl %eax, %esi
+; CHECK-BASELINE-NEXT:    movl 24(%r8), %eax
+; CHECK-BASELINE-NEXT:    movzwl 24(%r9), %edx
+; CHECK-BASELINE-NEXT:    xorw %ax, %dx
+; CHECK-BASELINE-NEXT:    andw 24(%r10), %dx
+; CHECK-BASELINE-NEXT:    xorl %eax, %edx
+; CHECK-BASELINE-NEXT:    movzwl 26(%r8), %eax
+; CHECK-BASELINE-NEXT:    movzwl 26(%r9), %ecx
+; CHECK-BASELINE-NEXT:    xorw %ax, %cx
+; CHECK-BASELINE-NEXT:    andw 26(%r10), %cx
+; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
+; CHECK-BASELINE-NEXT:    movl 28(%r8), %edi
+; CHECK-BASELINE-NEXT:    movzwl 28(%r9), %eax
+; CHECK-BASELINE-NEXT:    xorw %di, %ax
+; CHECK-BASELINE-NEXT:    andw 28(%r10), %ax
+; CHECK-BASELINE-NEXT:    xorl %edi, %eax
+; CHECK-BASELINE-NEXT:    movzwl 30(%r8), %edi
+; CHECK-BASELINE-NEXT:    movzwl 30(%r9), %r8d
+; CHECK-BASELINE-NEXT:    xorw %di, %r8w
+; CHECK-BASELINE-NEXT:    andw 30(%r10), %r8w
+; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
+; CHECK-BASELINE-NEXT:    movw %r8w, 30(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 28(%r11)
+; CHECK-BASELINE-NEXT:    movw %cx, 26(%r11)
+; CHECK-BASELINE-NEXT:    movw %dx, 24(%r11)
+; CHECK-BASELINE-NEXT:    movw %si, 22(%r11)
+; CHECK-BASELINE-NEXT:    movw %bp, 20(%r11)
+; CHECK-BASELINE-NEXT:    movw %r15w, 18(%r11)
+; CHECK-BASELINE-NEXT:    movw %r13w, 16(%r11)
+; CHECK-BASELINE-NEXT:    movw %r12w, 14(%r11)
+; CHECK-BASELINE-NEXT:    movw %r14w, 12(%r11)
+; CHECK-BASELINE-NEXT:    movw %bx, 10(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 8(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 6(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 4(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r12w, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 2(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movw %ax, (%r11)
+; CHECK-BASELINE-NEXT:    movq %r11, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -1882,113 +1876,117 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r15d
-; CHECK-SSE1-NEXT:    movzwl 16(%rdx), %r14d
-; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %ebp
-; CHECK-SSE1-NEXT:    movzwl 12(%rdx), %ebx
-; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r13d
-; CHECK-SSE1-NEXT:    movzwl 8(%rdx), %r11d
-; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %r10d
-; CHECK-SSE1-NEXT:    movzwl 4(%rdx), %r9d
-; CHECK-SSE1-NEXT:    movzwl (%rdx), %r8d
-; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %r12d
-; CHECK-SSE1-NEXT:    movzwl (%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r8w, %ax
-; CHECK-SSE1-NEXT:    andw (%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r8d
-; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r12w, %ax
-; CHECK-SSE1-NEXT:    andw 2(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r12d
-; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r9w, %ax
-; CHECK-SSE1-NEXT:    andw 4(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r9d
-; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r10w, %ax
-; CHECK-SSE1-NEXT:    andw 6(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r10d
-; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r11w, %ax
-; CHECK-SSE1-NEXT:    andw 8(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r11d
-; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r13w, %ax
-; CHECK-SSE1-NEXT:    andw 10(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r13d
-; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %eax
+; CHECK-SSE1-NEXT:    movq %rcx, %r10
+; CHECK-SSE1-NEXT:    movq %rdx, %r8
+; CHECK-SSE1-NEXT:    movq %rsi, %r9
+; CHECK-SSE1-NEXT:    movq %rdi, %r11
+; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movl 16(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
+; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl (%rdx), %ecx
+; CHECK-SSE1-NEXT:    movl 4(%rdx), %edx
+; CHECK-SSE1-NEXT:    movzwl 2(%r8), %esi
+; CHECK-SSE1-NEXT:    movzwl (%r9), %edi
+; CHECK-SSE1-NEXT:    xorw %cx, %di
+; CHECK-SSE1-NEXT:    andw (%r10), %di
+; CHECK-SSE1-NEXT:    xorl %ecx, %edi
+; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%r9), %ecx
+; CHECK-SSE1-NEXT:    xorw %si, %cx
+; CHECK-SSE1-NEXT:    andw 2(%r10), %cx
+; CHECK-SSE1-NEXT:    xorl %esi, %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 4(%r9), %ecx
+; CHECK-SSE1-NEXT:    xorw %dx, %cx
+; CHECK-SSE1-NEXT:    andw 4(%r10), %cx
+; CHECK-SSE1-NEXT:    xorl %edx, %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 6(%r9), %ecx
+; CHECK-SSE1-NEXT:    xorw %ax, %cx
+; CHECK-SSE1-NEXT:    andw 6(%r10), %cx
+; CHECK-SSE1-NEXT:    xorl %eax, %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 8(%r9), %eax
 ; CHECK-SSE1-NEXT:    xorw %bx, %ax
-; CHECK-SSE1-NEXT:    andw 12(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %ebx
-; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %bp, %ax
-; CHECK-SSE1-NEXT:    andw 14(%rcx), %ax
+; CHECK-SSE1-NEXT:    andw 8(%r10), %ax
+; CHECK-SSE1-NEXT:    xorl %ebx, %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 10(%r9), %ebx
+; CHECK-SSE1-NEXT:    xorw %r14w, %bx
+; CHECK-SSE1-NEXT:    andw 10(%r10), %bx
+; CHECK-SSE1-NEXT:    xorl %r14d, %ebx
+; CHECK-SSE1-NEXT:    movzwl 12(%r9), %r14d
+; CHECK-SSE1-NEXT:    xorw %r12w, %r14w
+; CHECK-SSE1-NEXT:    andw 12(%r10), %r14w
+; CHECK-SSE1-NEXT:    xorl %r12d, %r14d
+; CHECK-SSE1-NEXT:    movzwl 14(%r9), %r12d
+; CHECK-SSE1-NEXT:    xorw %r13w, %r12w
+; CHECK-SSE1-NEXT:    andw 14(%r10), %r12w
+; CHECK-SSE1-NEXT:    xorl %r13d, %r12d
+; CHECK-SSE1-NEXT:    movzwl 16(%r9), %r13d
+; CHECK-SSE1-NEXT:    xorw %r15w, %r13w
+; CHECK-SSE1-NEXT:    andw 16(%r10), %r13w
+; CHECK-SSE1-NEXT:    xorl %r15d, %r13d
+; CHECK-SSE1-NEXT:    movzwl 18(%r9), %r15d
+; CHECK-SSE1-NEXT:    xorw %bp, %r15w
+; CHECK-SSE1-NEXT:    andw 18(%r10), %r15w
+; CHECK-SSE1-NEXT:    xorl %ebp, %r15d
+; CHECK-SSE1-NEXT:    movl 20(%r8), %eax
+; CHECK-SSE1-NEXT:    movzwl 20(%r9), %ebp
+; CHECK-SSE1-NEXT:    xorw %ax, %bp
+; CHECK-SSE1-NEXT:    andw 20(%r10), %bp
 ; CHECK-SSE1-NEXT:    xorl %eax, %ebp
-; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r14w, %ax
-; CHECK-SSE1-NEXT:    andw 16(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r14d
-; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r15w, %ax
-; CHECK-SSE1-NEXT:    andw 18(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r15d
-; CHECK-SSE1-NEXT:    movzwl 20(%rdx), %r13d
-; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r13w, %ax
-; CHECK-SSE1-NEXT:    andw 20(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r13d
-; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %r9d
-; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r9w, %ax
-; CHECK-SSE1-NEXT:    andw 22(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r9d
-; CHECK-SSE1-NEXT:    movzwl 24(%rdx), %r8d
-; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r8w, %ax
-; CHECK-SSE1-NEXT:    andw 24(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r8d
-; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
-; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r10d
-; CHECK-SSE1-NEXT:    xorw %ax, %r10w
-; CHECK-SSE1-NEXT:    andw 26(%rcx), %r10w
-; CHECK-SSE1-NEXT:    xorl %r10d, %eax
-; CHECK-SSE1-NEXT:    movzwl 28(%rdx), %r10d
-; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %r11d
-; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
-; CHECK-SSE1-NEXT:    andw 28(%rcx), %r11w
-; CHECK-SSE1-NEXT:    xorl %r11d, %r10d
-; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edx
-; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
-; CHECK-SSE1-NEXT:    xorw %dx, %si
-; CHECK-SSE1-NEXT:    andw 30(%rcx), %si
-; CHECK-SSE1-NEXT:    xorl %esi, %edx
-; CHECK-SSE1-NEXT:    movw %dx, 30(%rdi)
-; CHECK-SSE1-NEXT:    movw %r10w, 28(%rdi)
-; CHECK-SSE1-NEXT:    movw %ax, 26(%rdi)
-; CHECK-SSE1-NEXT:    movw %r8w, 24(%rdi)
-; CHECK-SSE1-NEXT:    movw %r9w, 22(%rdi)
-; CHECK-SSE1-NEXT:    movw %r13w, 20(%rdi)
-; CHECK-SSE1-NEXT:    movw %r15w, 18(%rdi)
-; CHECK-SSE1-NEXT:    movw %r14w, 16(%rdi)
-; CHECK-SSE1-NEXT:    movw %bp, 14(%rdi)
-; CHECK-SSE1-NEXT:    movw %bx, 12(%rdi)
+; CHECK-SSE1-NEXT:    movzwl 22(%r8), %eax
+; CHECK-SSE1-NEXT:    movzwl 22(%r9), %esi
+; CHECK-SSE1-NEXT:    xorw %ax, %si
+; CHECK-SSE1-NEXT:    andw 22(%r10), %si
+; CHECK-SSE1-NEXT:    xorl %eax, %esi
+; CHECK-SSE1-NEXT:    movl 24(%r8), %eax
+; CHECK-SSE1-NEXT:    movzwl 24(%r9), %edx
+; CHECK-SSE1-NEXT:    xorw %ax, %dx
+; CHECK-SSE1-NEXT:    andw 24(%r10), %dx
+; CHECK-SSE1-NEXT:    xorl %eax, %edx
+; CHECK-SSE1-NEXT:    movzwl 26(%r8), %eax
+; CHECK-SSE1-NEXT:    movzwl 26(%r9), %ecx
+; CHECK-SSE1-NEXT:    xorw %ax, %cx
+; CHECK-SSE1-NEXT:    andw 26(%r10), %cx
+; CHECK-SSE1-NEXT:    xorl %eax, %ecx
+; CHECK-SSE1-NEXT:    movl 28(%r8), %edi
+; CHECK-SSE1-NEXT:    movzwl 28(%r9), %eax
+; CHECK-SSE1-NEXT:    xorw %di, %ax
+; CHECK-SSE1-NEXT:    andw 28(%r10), %ax
+; CHECK-SSE1-NEXT:    xorl %edi, %eax
+; CHECK-SSE1-NEXT:    movzwl 30(%r8), %edi
+; CHECK-SSE1-NEXT:    movzwl 30(%r9), %r8d
+; CHECK-SSE1-NEXT:    xorw %di, %r8w
+; CHECK-SSE1-NEXT:    andw 30(%r10), %r8w
+; CHECK-SSE1-NEXT:    xorl %edi, %r8d
+; CHECK-SSE1-NEXT:    movw %r8w, 30(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 28(%r11)
+; CHECK-SSE1-NEXT:    movw %cx, 26(%r11)
+; CHECK-SSE1-NEXT:    movw %dx, 24(%r11)
+; CHECK-SSE1-NEXT:    movw %si, 22(%r11)
+; CHECK-SSE1-NEXT:    movw %bp, 20(%r11)
+; CHECK-SSE1-NEXT:    movw %r15w, 18(%r11)
+; CHECK-SSE1-NEXT:    movw %r13w, 16(%r11)
+; CHECK-SSE1-NEXT:    movw %r12w, 14(%r11)
+; CHECK-SSE1-NEXT:    movw %r14w, 12(%r11)
+; CHECK-SSE1-NEXT:    movw %bx, 10(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 10(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 8(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 8(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 6(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 4(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 4(%rdi)
-; CHECK-SSE1-NEXT:    movw %r12w, 2(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 2(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movw %ax, (%r11)
+; CHECK-SSE1-NEXT:    movq %r11, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13

From 937be177528de156922c1b5f6cab08ba3009dbf2 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Wed, 11 Jun 2025 10:10:22 +0200
Subject: [PATCH 043/851] [flang] Enable delayed localization by default for
 `do concurrent` (#142567)

This PR aims to make it easier and more self-contained to revert the
switch/flag if we discover any problems with enabling it by default.
---
 flang/lib/Lower/Bridge.cpp                            | 6 +-----
 flang/test/Lower/do_concurrent_delayed_locality.f90   | 2 +-
 flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +-
 flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +-
 flang/test/Lower/loops.f90                            | 2 +-
 flang/test/Lower/loops3.f90                           | 2 +-
 6 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 64b16b3abe991..5ff8101dba097 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2033,11 +2033,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     fir::LocalitySpecifierOperands privateClauseOps;
     auto doConcurrentLoopOp =
         mlir::dyn_cast_if_present<fir::DoConcurrentLoopOp>(info.loopOp);
-    // TODO Promote to using `enableDelayedPrivatization` (which is enabled by
-    // default unlike the staging flag) once the implementation of this is more
-    // complete.
-    bool useDelayedPriv =
-        enableDelayedPrivatizationStaging && doConcurrentLoopOp;
+    bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp;
     llvm::SetVector<const Fortran::semantics::Symbol *> allPrivatizedSymbols;
     llvm::SmallSet<const Fortran::semantics::Symbol *, 16> mightHaveReadHostSym;
 
diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90
index 6cae0eb46db13..039b17808d19e 100644
--- a/flang/test/Lower/do_concurrent_delayed_locality.f90
+++ b/flang/test/Lower/do_concurrent_delayed_locality.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
 
 subroutine do_concurrent_with_locality_specs
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
index a3d0c34ed8569..67f080eb2c1c5 100644
--- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90
+++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
 
 subroutine local_assoc
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index d643213854744..798cbb335c8c0 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of DO CONCURRENT LOCAL() entities.
-! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
 
 subroutine test_ptr(p)
   interface
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 60df27a591dc3..64f14ff972272 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 84db1972cca16..34d7bcfb7d7ad 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -1,5 +1,5 @@
 ! Test do concurrent reduction
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test

From afbcf9529a1edb88d067e6fca8d9534901310d5e Mon Sep 17 00:00:00 2001
From: CHANDRA GHALE <chandra.nitdgp@gmail.com>
Date: Wed, 11 Jun 2025 14:01:31 +0530
Subject: [PATCH 044/851] [OpenMP 6.0 ]Codegen for Reduction over private
 variables with reduction clause (#134709)

Codegen support for reduction over private variable with reduction
clause. Section 7.6.10 in in OpenMP 6.0 spec.
- An internal shared copy is initialized with an initializer value.
- The shared copy is updated by combining its value with the values from
the private copies created by the clause.
- Once an encountering thread verifies that all updates are complete,
its original list item is updated by merging its value with that of the
shared copy and then broadcast to all threads.

Sample Test Case from OpenMP 6.0 Example
```
#include <assert.h>
#include <omp.h>
#define N 10

void do_red(int n, int *v, int &sum_v)
{
    sum_v = 0; // sum_v is private
    #pragma omp for reduction(original(private),+: sum_v)
    for (int i = 0; i < n; i++)
    {
        sum_v += v[i];
    }
}

int main(void)
{
    int v[N];
    for (int i = 0; i < N; i++)
        v[i] = i;
    #pragma omp parallel num_threads(4)
    {
        int s_v; // s_v is private
        do_red(N, v, s_v);
        assert(s_v == 45);
    }
    return 0;
}
```
Expected Codegen:
```
 // A shared global/static variable is introduced for the reduction result.
 // This variable is initialized (e.g., using memset or a UDR initializer)
 // e.g., .omp.reduction.internal_private_var

 // Barrier before any thread performs combination
  call void @__kmpc_barrier(...)

 // Initialization block (executed by thread 0)
 // e.g., call void @llvm.memset.p0.i64(...) or call @udr_initializer(...)

  call void @__kmpc_critical(...)
    // Inside critical section:
    // Load the current value from the shared variable
    // Load the thread-local private variable's value
    // Perform the reduction operation
    // Store the result back to the shared variable

  call void @__kmpc_end_critical(...)
  // Barrier after all threads complete their combinations

  call void @__kmpc_barrier(...)
 // Broadcast phase:
 // Load the final result from the shared variable)
 // Store the final result to the original private variable in each thread
 // Final barrier after broadcast

  call void @__kmpc_barrier(...)
```

---------

Co-authored-by: Chandra Ghale <ghale@pe31.hpc.amslabs.hpecorp.net>
---
 clang/docs/OpenMPSupport.rst                  |   3 +-
 clang/docs/ReleaseNotes.rst                   |   1 +
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 292 ++++++-
 clang/lib/CodeGen/CGOpenMPRuntime.h           |  12 +
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |  11 +-
 clang/lib/Sema/SemaOpenMP.cpp                 |  41 +-
 .../OpenMP/distribute_simd_misc_messages.c    |   3 +-
 .../OpenMP/for_private_reduction_codegen.cpp  | 710 ++++++++++++++++++
 clang/test/OpenMP/for_reduction_messages.cpp  |   2 +
 .../OpenMP/for_simd_reduction_messages.cpp    |   2 +-
 .../OpenMP/sections_reduction_messages.cpp    |   2 +-
 .../for/omp_for_private_reduction.cpp         | 194 +++++
 12 files changed, 1235 insertions(+), 38 deletions(-)
 create mode 100644 clang/test/OpenMP/for_private_reduction_codegen.cpp
 create mode 100644 openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp

diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index d6507071d4693..986aaabe1eed4 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -406,7 +406,8 @@ implementation.
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 | Extensions to atomic construct                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ 
-| Private reductions                                          | :part:`partial`           | :none:`unclaimed`         | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938              |
+| Private reductions                                          | :good:`mostly`            | :none:`unclaimed`         | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938              |
+|                                                             |                           |                           | Codegen: https://github.com/llvm/llvm-project/pull/134709                |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 | Self maps                                                   | :part:`partial`           | :none:`unclaimed`         | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888      |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f36c82bff2ef8..5645edc73431b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1100,6 +1100,7 @@ OpenMP Support
   open parenthesis. (#GH139665)
 - An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have
   an argument larger than what can fit within a 64-bit integer.
+- Added support for private variable reduction.
 
 Improvements
 ^^^^^^^^^^^^
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 09e3ccc380ae3..4173355491fd4 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4907,11 +4907,255 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
   }
 }
 
+static std::string generateUniqueName(CodeGenModule &CGM,
+                                      llvm::StringRef Prefix, const Expr *Ref);
+
+void CGOpenMPRuntime::emitPrivateReduction(
+    CodeGenFunction &CGF, SourceLocation Loc, const Expr *Privates,
+    const Expr *LHSExprs, const Expr *RHSExprs, const Expr *ReductionOps) {
+
+  //  Create a shared global variable (__shared_reduction_var) to accumulate the
+  //  final result.
+  //
+  //  Call __kmpc_barrier to synchronize threads before initialization.
+  //
+  //  The master thread (thread_id == 0) initializes __shared_reduction_var
+  //    with the identity value or initializer.
+  //
+  //  Call __kmpc_barrier to synchronize before combining.
+  //  For each i:
+  //    - Thread enters critical section.
+  //    - Reads its private value from LHSExprs[i].
+  //    - Updates __shared_reduction_var[i] = RedOp_i(__shared_reduction_var[i],
+  //    Privates[i]).
+  //    - Exits critical section.
+  //
+  //  Call __kmpc_barrier after combining.
+  //
+  //  Each thread copies __shared_reduction_var[i] back to RHSExprs[i].
+  //
+  //  Final __kmpc_barrier to synchronize after broadcasting
+  QualType PrivateType = Privates->getType();
+  llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
+
+  const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps);
+  std::string ReductionVarNameStr;
+  if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates->IgnoreParenCasts()))
+    ReductionVarNameStr =
+        generateUniqueName(CGM, DRE->getDecl()->getNameAsString(), Privates);
+  else
+    ReductionVarNameStr = "unnamed_priv_var";
+
+  // Create an internal shared variable
+  std::string SharedName =
+      CGM.getOpenMPRuntime().getName({"internal_pivate_", ReductionVarNameStr});
+  llvm::GlobalVariable *SharedVar = OMPBuilder.getOrCreateInternalVariable(
+      LLVMType, ".omp.reduction." + SharedName);
+
+  SharedVar->setAlignment(
+      llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8));
+
+  Address SharedResult =
+      CGF.MakeNaturalAlignRawAddrLValue(SharedVar, PrivateType).getAddress();
+
+  llvm::Value *ThreadId = getThreadID(CGF, Loc);
+  llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
+  llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
+
+  llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
+  llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
+
+  llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ(
+      ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0));
+  CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
+
+  CGF.EmitBlock(InitBB);
+
+  auto EmitSharedInit = [&]() {
+    if (UDR) { // Check if it's a User-Defined Reduction
+      if (const Expr *UDRInitExpr = UDR->getInitializer()) {
+        std::pair<llvm::Function *, llvm::Function *> FnPair =
+            getUserDefinedReduction(UDR);
+        llvm::Function *InitializerFn = FnPair.second;
+        if (InitializerFn) {
+          if (const auto *CE =
+                  dyn_cast<CallExpr>(UDRInitExpr->IgnoreParenImpCasts())) {
+            const auto *OutDRE = cast<DeclRefExpr>(
+                cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
+                    ->getSubExpr());
+            const VarDecl *OutVD = cast<VarDecl>(OutDRE->getDecl());
+
+            CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+            LocalScope.addPrivate(OutVD, SharedResult);
+
+            (void)LocalScope.Privatize();
+            if (const auto *OVE = dyn_cast<OpaqueValueExpr>(
+                    CE->getCallee()->IgnoreParenImpCasts())) {
+              CodeGenFunction::OpaqueValueMapping OpaqueMap(
+                  CGF, OVE, RValue::get(InitializerFn));
+              CGF.EmitIgnoredExpr(CE);
+            } else {
+              CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+                                   PrivateType.getQualifiers(),
+                                   /*IsInitializer=*/true);
+            }
+          } else {
+            CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+                                 PrivateType.getQualifiers(),
+                                 /*IsInitializer=*/true);
+          }
+        } else {
+          CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+                               PrivateType.getQualifiers(),
+                               /*IsInitializer=*/true);
+        }
+      } else {
+        // EmitNullInitialization handles default construction for C++ classes
+        // and zeroing for scalars, which is a reasonable default.
+        CGF.EmitNullInitialization(SharedResult, PrivateType);
+      }
+      return; // UDR initialization handled
+    }
+    if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates)) {
+      if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+        if (const Expr *InitExpr = VD->getInit()) {
+          CGF.EmitAnyExprToMem(InitExpr, SharedResult,
+                               PrivateType.getQualifiers(), true);
+          return;
+        }
+      }
+    }
+    CGF.EmitNullInitialization(SharedResult, PrivateType);
+  };
+  EmitSharedInit();
+  CGF.Builder.CreateBr(InitEndBB);
+  CGF.EmitBlock(InitEndBB);
+
+  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                          CGM.getModule(), OMPRTL___kmpc_barrier),
+                      BarrierArgs);
+
+  const Expr *ReductionOp = ReductionOps;
+  const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp);
+  LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+  LValue LHSLV = CGF.EmitLValue(Privates);
+
+  auto EmitCriticalReduction = [&](auto ReductionGen) {
+    std::string CriticalName = getName({"reduction_critical"});
+    emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
+  };
+
+  if (CurrentUDR) {
+    // Handle user-defined reduction.
+    auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+      Action.Enter(CGF);
+      std::pair<llvm::Function *, llvm::Function *> FnPair =
+          getUserDefinedReduction(CurrentUDR);
+      if (FnPair.first) {
+        if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
+          const auto *OutDRE = cast<DeclRefExpr>(
+              cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
+                  ->getSubExpr());
+          const auto *InDRE = cast<DeclRefExpr>(
+              cast<UnaryOperator>(CE->getArg(1)->IgnoreParenImpCasts())
+                  ->getSubExpr());
+          CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+          LocalScope.addPrivate(cast<VarDecl>(OutDRE->getDecl()),
+                                SharedLV.getAddress());
+          LocalScope.addPrivate(cast<VarDecl>(InDRE->getDecl()),
+                                LHSLV.getAddress());
+          (void)LocalScope.Privatize();
+          emitReductionCombiner(CGF, ReductionOp);
+        }
+      }
+    };
+    EmitCriticalReduction(ReductionGen);
+  } else {
+    // Handle built-in reduction operations.
+#ifndef NDEBUG
+    const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
+    if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
+      ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
+
+    const Expr *AssignRHS = nullptr;
+    if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
+      if (BinOp->getOpcode() == BO_Assign)
+        AssignRHS = BinOp->getRHS();
+    } else if (const auto *OpCall =
+                   dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
+      if (OpCall->getOperator() == OO_Equal)
+        AssignRHS = OpCall->getArg(1);
+    }
+
+    assert(AssignRHS &&
+           "Private Variable Reduction : Invalid ReductionOp expression");
+#endif
+
+    auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+      Action.Enter(CGF);
+      const auto *OmpOutDRE =
+          dyn_cast<DeclRefExpr>(LHSExprs->IgnoreParenImpCasts());
+      const auto *OmpInDRE =
+          dyn_cast<DeclRefExpr>(RHSExprs->IgnoreParenImpCasts());
+      assert(
+          OmpOutDRE && OmpInDRE &&
+          "Private Variable Reduction : LHSExpr/RHSExpr must be DeclRefExprs");
+      const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl());
+      const VarDecl *OmpInVD = cast<VarDecl>(OmpInDRE->getDecl());
+      CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+      LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress());
+      LocalScope.addPrivate(OmpInVD, LHSLV.getAddress());
+      (void)LocalScope.Privatize();
+      // Emit the actual reduction operation
+      CGF.EmitIgnoredExpr(ReductionOp);
+    };
+    EmitCriticalReduction(ReductionGen);
+  }
+
+  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                          CGM.getModule(), OMPRTL___kmpc_barrier),
+                      BarrierArgs);
+
+  // Broadcast final result
+  bool IsAggregate = PrivateType->isAggregateType();
+  LValue SharedLV1 = CGF.MakeAddrLValue(SharedResult, PrivateType);
+  llvm::Value *FinalResultVal = nullptr;
+  Address FinalResultAddr = Address::invalid();
+
+  if (IsAggregate)
+    FinalResultAddr = SharedResult;
+  else
+    FinalResultVal = CGF.EmitLoadOfScalar(SharedLV1, Loc);
+
+  LValue TargetLHSLV = CGF.EmitLValue(RHSExprs);
+  if (IsAggregate) {
+    CGF.EmitAggregateCopy(TargetLHSLV,
+                          CGF.MakeAddrLValue(FinalResultAddr, PrivateType),
+                          PrivateType, AggValueSlot::DoesNotOverlap, false);
+  } else {
+    CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV);
+  }
+  // Final synchronization barrier
+  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                          CGM.getModule(), OMPRTL___kmpc_barrier),
+                      BarrierArgs);
+
+  // Combiner with original list item
+  auto OriginalListCombiner = [&](CodeGenFunction &CGF,
+                                  PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    emitSingleReductionCombiner(CGF, ReductionOps, Privates,
+                                cast<DeclRefExpr>(LHSExprs),
+                                cast<DeclRefExpr>(RHSExprs));
+  };
+  EmitCriticalReduction(OriginalListCombiner);
+}
+
 void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
-                                    ArrayRef<const Expr *> Privates,
-                                    ArrayRef<const Expr *> LHSExprs,
-                                    ArrayRef<const Expr *> RHSExprs,
-                                    ArrayRef<const Expr *> ReductionOps,
+                                    ArrayRef<const Expr *> OrgPrivates,
+                                    ArrayRef<const Expr *> OrgLHSExprs,
+                                    ArrayRef<const Expr *> OrgRHSExprs,
+                                    ArrayRef<const Expr *> OrgReductionOps,
                                     ReductionOptionsTy Options) {
   if (!CGF.HaveInsertPoint())
     return;
@@ -4958,10 +5202,10 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
 
   if (SimpleReduction) {
     CodeGenFunction::RunCleanupsScope Scope(CGF);
-    const auto *IPriv = Privates.begin();
-    const auto *ILHS = LHSExprs.begin();
-    const auto *IRHS = RHSExprs.begin();
-    for (const Expr *E : ReductionOps) {
+    const auto *IPriv = OrgPrivates.begin();
+    const auto *ILHS = OrgLHSExprs.begin();
+    const auto *IRHS = OrgRHSExprs.begin();
+    for (const Expr *E : OrgReductionOps) {
       emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                   cast<DeclRefExpr>(*IRHS));
       ++IPriv;
@@ -4971,6 +5215,26 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
     return;
   }
 
+  // Filter out shared  reduction variables based on IsPrivateVarReduction flag.
+  // Only keep entries where the corresponding variable is not private.
+  SmallVector<const Expr *> FilteredPrivates, FilteredLHSExprs,
+      FilteredRHSExprs, FilteredReductionOps;
+  for (unsigned I : llvm::seq<unsigned>(
+           std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) {
+    if (!Options.IsPrivateVarReduction[I]) {
+      FilteredPrivates.emplace_back(OrgPrivates[I]);
+      FilteredLHSExprs.emplace_back(OrgLHSExprs[I]);
+      FilteredRHSExprs.emplace_back(OrgRHSExprs[I]);
+      FilteredReductionOps.emplace_back(OrgReductionOps[I]);
+    }
+  }
+  // Wrap filtered vectors in ArrayRef for downstream shared reduction
+  // processing.
+  ArrayRef<const Expr *> Privates = FilteredPrivates;
+  ArrayRef<const Expr *> LHSExprs = FilteredLHSExprs;
+  ArrayRef<const Expr *> RHSExprs = FilteredRHSExprs;
+  ArrayRef<const Expr *> ReductionOps = FilteredReductionOps;
+
   // 1. Build a list of reduction variables.
   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
   auto Size = RHSExprs.size();
@@ -5162,7 +5426,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
       } else {
         // Emit as a critical region.
         auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *,
-                                           const Expr *, const Expr *) {
+                                     const Expr *, const Expr *) {
           CGOpenMPRuntime &RT = CGF.CGM.getOpenMPRuntime();
           std::string Name = RT.getName({"atomic_reduction"});
           RT.emitCriticalRegion(
@@ -5209,6 +5473,16 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
 
   CGF.EmitBranch(DefaultBB);
   CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+  assert(OrgLHSExprs.size() == OrgPrivates.size() &&
+         "PrivateVarReduction: Privates size mismatch");
+  assert(OrgLHSExprs.size() == OrgReductionOps.size() &&
+         "PrivateVarReduction: ReductionOps size mismatch");
+  for (unsigned I : llvm::seq<unsigned>(
+           std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) {
+    if (Options.IsPrivateVarReduction[I])
+      emitPrivateReduction(CGF, Loc, OrgPrivates[I], OrgLHSExprs[I],
+                           OrgRHSExprs[I], OrgReductionOps[I]);
+  }
 }
 
 /// Generates unique name for artificial threadprivate variables.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 4321712e1521d..5be48b439f4fd 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1201,8 +1201,20 @@ class CGOpenMPRuntime {
   struct ReductionOptionsTy {
     bool WithNowait;
     bool SimpleReduction;
+    llvm::SmallVector<bool, 8> IsPrivateVarReduction;
     OpenMPDirectiveKind ReductionKind;
   };
+
+  /// Emits code for private variable reduction
+  /// \param Privates List of private copies for original reduction arguments.
+  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc,
+                            const Expr *Privates, const Expr *LHSExprs,
+                            const Expr *RHSExprs, const Expr *ReductionOps);
+
   /// Emit a code for reduction clause. Next code should be emitted for
   /// reduction:
   /// \code
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 7fa6bfa75c350..d9195d749e056 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1472,6 +1472,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
   llvm::SmallVector<const Expr *, 8> LHSExprs;
   llvm::SmallVector<const Expr *, 8> RHSExprs;
   llvm::SmallVector<const Expr *, 8> ReductionOps;
+  llvm::SmallVector<bool, 8> IsPrivateVarReduction;
   bool HasAtLeastOneReduction = false;
   bool IsReductionWithTaskMod = false;
   for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
@@ -1482,6 +1483,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
     Privates.append(C->privates().begin(), C->privates().end());
     LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
     RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+    IsPrivateVarReduction.append(C->private_var_reduction_flags().begin(),
+                                 C->private_var_reduction_flags().end());
     ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
     IsReductionWithTaskMod =
         IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task;
@@ -1503,7 +1506,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
     // parallel directive (it always has implicit barrier).
     CGM.getOpenMPRuntime().emitReduction(
         *this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps,
-        {WithNowait, SimpleReduction, ReductionKind});
+        {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind});
   }
 }
 
@@ -3944,7 +3947,8 @@ static void emitScanBasedDirective(
       PrivScope.Privatize();
       CGF.CGM.getOpenMPRuntime().emitReduction(
           CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
-          {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
+          {/*WithNowait=*/true, /*SimpleReduction=*/true,
+           /*IsPrivateVarReduction*/ {}, OMPD_unknown});
     }
     llvm::Value *NextIVal =
         CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
@@ -5749,7 +5753,8 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) {
       }
       CGM.getOpenMPRuntime().emitReduction(
           *this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
-          {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd});
+          {/*WithNowait=*/true, /*SimpleReduction=*/true,
+           /*IsPrivateVarReduction*/ {}, OMPD_simd});
       for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) {
         const Expr *PrivateExpr = Privates[I];
         LValue DestLVal;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 4ac3a60ae455f..a3395ac157d96 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -19047,34 +19047,14 @@ static bool actOnOMPReductionKindClause(
         reportOriginalDsa(S, Stack, D, DVar);
         continue;
       }
-      // OpenMP 6.0 [ 7.6.10 ]
-      // Support Reduction over private variables with reduction clause.
-      // A list item in a reduction clause can now be private in the enclosing
-      // context. For orphaned constructs it is assumed to be shared unless the
-      // original(private) modifier appears in the clause.
-      DVar = Stack->getImplicitDSA(D, true);
-      bool IsOrphaned = false;
-      OpenMPDirectiveKind CurrDir = Stack->getCurrentDirective();
-      OpenMPDirectiveKind ParentDir = Stack->getParentDirective();
-      // Check if the construct is orphaned (has no enclosing OpenMP context)
-      IsOrphaned = ParentDir == OMPD_unknown;
-      // OpenMP 6.0: Private DSA check
-      IsPrivate =
-          (S.getLangOpts().OpenMP > 52) &&
-          ((isOpenMPPrivate(DVar.CKind) && DVar.CKind != OMPC_reduction &&
-            isOpenMPWorksharingDirective(CurrDir) &&
-            !isOpenMPParallelDirective(CurrDir) &&
-            !isOpenMPTeamsDirective(CurrDir) &&
-            !isOpenMPSimdDirective(ParentDir)) ||
-           (IsOrphaned && DVar.CKind == OMPC_unknown) ||
-           RD.OrigSharingModifier != OMPC_ORIGINAL_SHARING_shared);
 
       // OpenMP [2.14.3.6, Restrictions, p.1]
       //  A list item that appears in a reduction clause of a worksharing
       //  construct must be shared in the parallel regions to which any of the
       //  worksharing regions arising from the worksharing construct bind.
 
-      if (!IsPrivate && isOpenMPWorksharingDirective(CurrDir) &&
+      if (S.getLangOpts().OpenMP <= 52 &&
+          isOpenMPWorksharingDirective(CurrDir) &&
           !isOpenMPParallelDirective(CurrDir) &&
           !isOpenMPTeamsDirective(CurrDir)) {
         DVar = Stack->getImplicitDSA(D, true);
@@ -19085,6 +19065,23 @@ static bool actOnOMPReductionKindClause(
           reportOriginalDsa(S, Stack, D, DVar);
           continue;
         }
+      } else if (isOpenMPWorksharingDirective(CurrDir) &&
+                 !isOpenMPParallelDirective(CurrDir) &&
+                 !isOpenMPTeamsDirective(CurrDir)) {
+        // OpenMP 6.0 [ 7.6.10 ]
+        // Support Reduction over private variables with reduction clause.
+        // A list item in a reduction clause can now be private in the enclosing
+        // context. For orphaned constructs it is assumed to be shared unless
+        // the original(private) modifier appears in the clause.
+        DVar = Stack->getImplicitDSA(D, true);
+        // Determine if the variable should be considered private
+        IsPrivate = DVar.CKind != OMPC_shared;
+        bool IsOrphaned = false;
+        OpenMPDirectiveKind ParentDir = Stack->getParentDirective();
+        IsOrphaned = ParentDir == OMPD_unknown;
+        if ((IsOrphaned &&
+             RD.OrigSharingModifier == OMPC_ORIGINAL_SHARING_private))
+          IsPrivate = true;
       }
     } else {
       // Threadprivates cannot be shared between threads, so dignose if the base
diff --git a/clang/test/OpenMP/distribute_simd_misc_messages.c b/clang/test/OpenMP/distribute_simd_misc_messages.c
index 8cbf96cd7a014..270e17dcb89bb 100644
--- a/clang/test/OpenMP/distribute_simd_misc_messages.c
+++ b/clang/test/OpenMP/distribute_simd_misc_messages.c
@@ -508,6 +508,7 @@ void test_collapse(void) {
 #pragma omp distribute simd collapse(5 - 5)
   for (i = 0; i < 16; ++i)
     ;
+#if defined(_OPENMP) && (_OPENMP <= 202111)
 // expected-note@+3 2 {{defined as reduction}}
 #pragma omp target
 #pragma omp teams
@@ -520,7 +521,7 @@ void test_collapse(void) {
 #pragma omp for reduction(+ : i, j)
       for (int k = 0; k < 16; ++k)
         i += j;
-
+#endif
 #pragma omp target
 #pragma omp teams
   for (i = 0; i < 16; ++i)
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
new file mode 100644
index 0000000000000..c8a6863299fb3
--- /dev/null
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -0,0 +1,710 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex ".omp.reduction..internal[a-zA-Z_0-9.]+"
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17  -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+#define N 10
+class Sum {
+  int val;
+
+public:
+  Sum(int v = 0) : val(v) {}
+  Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); }
+  Sum &operator+=(const Sum &rhs) {
+    val += rhs.val;
+    return *this;
+  }
+};
+#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in)           \
+    initializer(omp_priv = Sum(0))
+
+void func_red() {
+  Sum result(0);
+  Sum array[N];
+
+  for (int i = 0; i < N; i++) {
+    array[i] = Sum(i);
+  }
+
+#pragma omp parallel private(result) num_threads(4)
+  {
+#pragma omp for reduction(sum_reduction : result)
+    for (int i = 0; i < N; i++) {
+      result = result + array[i];
+    }
+  }
+}
+
+void do_red(int n, int *v, int &sum_v) {
+  sum_v = 0;
+#pragma omp for reduction(original(private), + : sum_v)
+  for (int i = 0; i < n; i++) {
+    sum_v += v[i];
+  }
+}
+void do_red_extended(int n, int *v, int &sum_v, int &prod_v) {
+  sum_v = 0;
+  prod_v = 1;
+
+#pragma omp for reduction(original(private), + : sum_v)                        \
+    reduction(original(private), * : prod_v)
+  for (int i = 0; i < n; i++) {
+    sum_v += v[i];
+    prod_v *= v[i];
+  }
+}
+int main(void) {
+  int v[N];
+  for (int i = 0; i < N; i++)
+    v[i] = i;
+#pragma omp parallel num_threads(4)
+  {
+    int s_v;
+    do_red(N, v, s_v);
+  }
+
+  int sum_v_ext = 0, prod_v_ext = 1;
+#pragma omp parallel num_threads(4)
+  {
+    do_red_extended(N, v, sum_v_ext, prod_v_ext);
+  }
+  return 0;
+}
+
+//.
+// CHECK: @.omp.reduction..internal_pivate_.result.result_996 = common global %class.Sum zeroinitializer, align 4
+// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1188 = common global i32 0, align 4
+// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1392 = common global i32 0, align 4
+// CHECK: @.omp.reduction..internal_pivate_.prod_v.prod_v_1461 = common global i32 0, align 4
+//.
+// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT:    [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
+// CHECK-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10
+// CHECK-NEXT:    br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK:       arrayctor.loop:
+// CHECK-NEXT:    [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0)
+// CHECK-NEXT:    [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK-NEXT:    [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK-NEXT:    br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK:       arrayctor.cont:
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK:       for.end:
+// CHECK-NEXT:    call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC1Ei
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ARRAY_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
+// CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT:    store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT:    call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK:       cond.true:
+// CHECK-NEXT:    br label [[COND_END:%.*]]
+// CHECK:       cond.false:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    br label [[COND_END]]
+// CHECK:       cond.end:
+// CHECK-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK:       omp.inner.for.cond:
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK:       omp.inner.for.body:
+// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]])
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK-NEXT:    store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK:       omp.body.continue:
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK:       omp.inner.for.inc:
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK:       omp.inner.for.end:
+// CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp.loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    switch i32 [[TMP11]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK-NEXT:    ]
+// CHECK:       .omp.reduction.case1:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.case2:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.default:
+// CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP2]], 0
+// CHECK-NEXT:    br i1 [[TMP12]], label [[INIT:%.*]], label [[INIT_END:%.*]]
+// CHECK:       init:
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result.result_996, i32 noundef 0)
+// CHECK-NEXT:    br label [[INIT_END]]
+// CHECK:       init.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result.result_996, ptr noundef [[RESULT1]])
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result.result_996, align 4
+// CHECK-NEXT:    store [[CLASS_SUM]] [[TMP13]], ptr [[RESULT1]], align 4
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]])
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_combiner.
+// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_ZN3SumpLERKS_
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[RHS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VAL]], align 4
+// CHECK-NEXT:    [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[VAL2]], align 4
+// CHECK-NEXT:    ret ptr [[THIS1]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_initializer.
+// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], i32 noundef 0)
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_ZNK3SumplERKS_
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[RHS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT:    [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]]
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]])
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined.omp.reduction.reduction_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC2Ei
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[VAL]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi
+// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SUM_V4:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP5:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[I6:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT:    [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT:    store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK:       omp.precond.then:
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[SUM_V4]], align 4
+// CHECK-NEXT:    store ptr [[SUM_V4]], ptr [[_TMP5]], align 8
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK-NEXT:    br i1 [[CMP7]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK:       cond.true:
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT:    br label [[COND_END:%.*]]
+// CHECK:       cond.false:
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    br label [[COND_END]]
+// CHECK:       cond.end:
+// CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK:       omp.inner.for.cond:
+// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK-NEXT:    br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK:       omp.inner.for.body:
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[I6]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I6]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]]
+// CHECK-NEXT:    store i32 [[ADD9]], ptr [[TMP19]], align 4
+// CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK:       omp.body.continue:
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK:       omp.inner.for.inc:
+// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK:       omp.inner.for.end:
+// CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp.loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP22:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    switch i32 [[TMP22]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK-NEXT:    ]
+// CHECK:       .omp.reduction.case1:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.case2:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.default:
+// CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT:    br i1 [[TMP23]], label [[INIT:%.*]], label [[INIT_END:%.*]]
+// CHECK:       init:
+// CHECK-NEXT:    store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
+// CHECK-NEXT:    br label [[INIT_END]]
+// CHECK:       init.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
+// CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-NEXT:    store i32 [[ADD11]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
+// CHECK-NEXT:    store i32 [[TMP26]], ptr [[SUM_V4]], align 4
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// CHECK-NEXT:    store i32 [[ADD12]], ptr [[TMP7]], align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK:       omp.precond.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi.omp.reduction.reduction_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_
+// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[PROD_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SUM_V5:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP6:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[PROD_V7:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP8:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[I9:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[PROD_V]], ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT:    store i32 1, ptr [[TMP2]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP3]], ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT:    [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK:       omp.precond.then:
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT:    store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[SUM_V5]], align 4
+// CHECK-NEXT:    store ptr [[SUM_V5]], ptr [[_TMP6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[_TMP1]], align 8
+// CHECK-NEXT:    store i32 1, ptr [[PROD_V7]], align 4
+// CHECK-NEXT:    store ptr [[PROD_V7]], ptr [[_TMP8]], align 8
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+// CHECK-NEXT:    br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK:       cond.true:
+// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT:    br label [[COND_END:%.*]]
+// CHECK:       cond.false:
+// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    br label [[COND_END]]
+// CHECK:       cond.end:
+// CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK:       omp.inner.for.cond:
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[CMP11:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK-NEXT:    br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK:       omp.inner.for.body:
+// CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[I9]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP20]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[_TMP6]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
+// CHECK-NEXT:    store i32 [[ADD12]], ptr [[TMP22]], align 4
+// CHECK-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK-NEXT:    [[IDXPROM13:%.*]] = sext i32 [[TMP25]] to i64
+// CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[IDXPROM13]]
+// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4
+// CHECK-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[_TMP8]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK-NEXT:    [[MUL15:%.*]] = mul nsw i32 [[TMP28]], [[TMP26]]
+// CHECK-NEXT:    store i32 [[MUL15]], ptr [[TMP27]], align 4
+// CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK:       omp.body.continue:
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK:       omp.inner.for.inc:
+// CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP29]], 1
+// CHECK-NEXT:    store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK:       omp.inner.for.end:
+// CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp.loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    switch i32 [[TMP30]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK-NEXT:    ]
+// CHECK:       .omp.reduction.case1:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.case2:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.default:
+// CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT:    br i1 [[TMP31]], label [[INIT:%.*]], label [[INIT_END:%.*]]
+// CHECK:       init:
+// CHECK-NEXT:    store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT:    br label [[INIT_END]]
+// CHECK:       init.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[SUM_V5]], align 4
+// CHECK-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK-NEXT:    store i32 [[ADD17]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP34:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT:    store i32 [[TMP34]], ptr [[SUM_V5]], align 4
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP35:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[SUM_V5]], align 4
+// CHECK-NEXT:    [[ADD18:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK-NEXT:    store i32 [[ADD18]], ptr [[TMP9]], align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT:    br i1 [[TMP37]], label [[INIT19:%.*]], label [[INIT_END20:%.*]]
+// CHECK:       init19:
+// CHECK-NEXT:    store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT:    br label [[INIT_END20]]
+// CHECK:       init.end20:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP38:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT:    [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT:    [[MUL21:%.*]] = mul nsw i32 [[TMP38]], [[TMP39]]
+// CHECK-NEXT:    store i32 [[MUL21]], ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP40:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT:    store i32 [[TMP40]], ptr [[PROD_V7]], align 4
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP41:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK-NEXT:    [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT:    [[MUL22:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK-NEXT:    store i32 [[MUL22]], ptr [[TMP10]], align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK:       omp.precond.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca [10 x i32], align 16
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SUM_V_EXT:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[PROD_V_EXT:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK:       for.end:
+// CHECK-NEXT:    call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]])
+// CHECK-NEXT:    store i32 0, ptr [[SUM_V_EXT]], align 4
+// CHECK-NEXT:    store i32 1, ptr [[PROD_V_EXT]], align 4
+// CHECK-NEXT:    call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.1, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]])
+// CHECK-NEXT:    ret i32 0
+
diff --git a/clang/test/OpenMP/for_reduction_messages.cpp b/clang/test/OpenMP/for_reduction_messages.cpp
index de28ba2c3be02..2fdac3048c9cd 100644
--- a/clang/test/OpenMP/for_reduction_messages.cpp
+++ b/clang/test/OpenMP/for_reduction_messages.cpp
@@ -417,10 +417,12 @@ int main(int argc, char **argv) {
 #pragma omp for reduction(+ : qa[1], qa[0])
   for (int i = 0; i < 10; ++i)
     foo();
+#if defined(_OPENMP) && (_OPENMP <= 202111)
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp for reduction(+ : fl)      // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
+#endif
   static int m=0;
 #pragma omp for reduction(+:m)
   for (int i = 0; i < 10; ++i)
diff --git a/clang/test/OpenMP/for_simd_reduction_messages.cpp b/clang/test/OpenMP/for_simd_reduction_messages.cpp
index 96b3805b10a86..a9ef6c39cb5d2 100644
--- a/clang/test/OpenMP/for_simd_reduction_messages.cpp
+++ b/clang/test/OpenMP/for_simd_reduction_messages.cpp
@@ -396,11 +396,11 @@ int main(int argc, char **argv) {
 #pragma omp for simd reduction(+ : fl) // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
-#endif
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp for simd reduction(+ : fl)      // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
+#endif
   static int m;
 #pragma omp for simd reduction(+ : m)
   for (int i = 0; i < 10; ++i)
diff --git a/clang/test/OpenMP/sections_reduction_messages.cpp b/clang/test/OpenMP/sections_reduction_messages.cpp
index 42ec3ed6d58e8..8cde6489f325f 100644
--- a/clang/test/OpenMP/sections_reduction_messages.cpp
+++ b/clang/test/OpenMP/sections_reduction_messages.cpp
@@ -461,12 +461,12 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#endif
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp sections reduction(+ : fl) // expected-error {{reduction variable must be shared}}
   {
     foo();
   }
+#endif
   static int m;
 #pragma omp sections reduction(+ : m) // OK
   {
diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
new file mode 100644
index 0000000000000..9bf3be1e9e45d
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
@@ -0,0 +1,194 @@
+// RUN: %libomp-cxx-compile -fopenmp-version=60  && %libomp-run
+#include <stdio.h>
+#include <omp.h>
+#include <limits.h>
+#include <complex.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+#define N 10
+class Sum {
+  int val;
+
+public:
+  Sum(int v = 0) : val(v) {}
+  Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); }
+  Sum &operator+=(const Sum &rhs) {
+    val += rhs.val;
+    return *this;
+  }
+  int getValue() const { return val; }
+};
+
+// Declare OpenMP reduction
+#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in)           \
+    initializer(omp_priv = Sum(0))
+
+#pragma omp declare reduction(sum_pctor_reduction:Sum : omp_out += omp_in)     \
+    initializer(omp_priv = Sum(1)) // non-default ctor
+
+int checkUserDefinedReduction() {
+  Sum final_result_udr(0);
+  Sum final_result_udr_pctor(1);
+  Sum array_sum[N];
+  int error_flag = 0;
+  int expected_value = 0;
+  int expected_value_pctor = 0;
+  for (int i = 0; i < N; ++i) {
+    array_sum[i] = Sum(i);
+    expected_value += i; // Calculate expected sum: 0 + 1 + ... + (N-1)
+    expected_value_pctor += i;
+  }
+  int num_threads_for_pctor_calc = 4; //  num_threads(4)
+  int priv_initializer_val_pctor = 1; //  initializer(omp_priv = Sum(1))
+  expected_value_pctor +=
+      num_threads_for_pctor_calc + priv_initializer_val_pctor;
+#pragma omp parallel num_threads(4) private(final_result_udr) private(         \
+        final_result_udr_pctor)
+  {
+#pragma omp for reduction(sum_reduction : final_result_udr)                    \
+    reduction(sum_pctor_reduction : final_result_udr_pctor)
+    for (int i = 0; i < N; ++i) {
+      final_result_udr += array_sum[i];
+      final_result_udr_pctor += array_sum[i];
+    }
+
+    if (final_result_udr.getValue() != expected_value ||
+        final_result_udr_pctor.getValue() != expected_value_pctor)
+      error_flag += 1;
+  }
+  return error_flag;
+}
+void performMinMaxRed(int &min_val, int &max_val) {
+  int input_data[] = {7, 3, 12, 5, 8};
+  int n_size = sizeof(input_data) / sizeof(input_data[0]);
+  min_val = INT_MAX;
+  max_val = INT_MIN;
+#pragma omp for reduction(original(private), min : min_val)                    \
+    reduction(original(private), max : max_val)
+  for (int i = 0; i < n_size; ++i) {
+    if (input_data[i] < min_val)
+      min_val = input_data[i];
+    if (input_data[i] > max_val)
+      max_val = input_data[i];
+  }
+}
+int performComplexReduction() {
+  double _Complex arr[N];
+  double _Complex expected = 0.0 + 0.0 * I;
+  double _Complex result = 0.0 + 0.0 * I;
+  int error = 0;
+
+  // Initialize the array and compute serial sum
+  for (int i = 0; i < N; ++i) {
+    arr[i] = i - i * I;
+    expected += arr[i];
+  }
+  double real_sum = 0.0, imag_sum = 0.0;
+#pragma omp parallel private(real_sum) private(imag_sum)
+  {
+#pragma omp for reduction(+ : real_sum, imag_sum)
+    for (int i = 0; i < N; ++i) {
+      real_sum += creal(arr[i]);
+      imag_sum += cimag(arr[i]);
+    }
+
+    result = real_sum + imag_sum * I;
+    if (cabs(result - expected) > 1e-6) {
+      error++;
+    }
+  }
+  return error;
+}
+
+std::complex<double> doComplexReduction(std::complex<double> *arr) {
+  std::complex<double> result(1, 0);
+
+#pragma omp declare reduction(* : std::complex<double> : omp_out *= omp_in)    \
+    initializer(omp_priv = std::complex<double>(1, 0))
+
+#pragma omp for reduction(original(private), * : result)
+  for (int i = 0; i < N; ++i)
+    result *= arr[i];
+
+  return result;
+}
+
+void performReductions(int n_elements, const int *input_values,
+                       int &sum_val_out, int &prod_val_out,
+                       float &float_sum_val_out) {
+  // private variables for this thread's reduction.
+  sum_val_out = 0;
+  prod_val_out = 1;
+  float_sum_val_out = 0.0f;
+
+  const float kPiValue = 3.14f;
+#pragma omp for reduction(original(private), + : sum_val_out)                  \
+    reduction(original(private), * : prod_val_out)                             \
+    reduction(original(private), + : float_sum_val_out)
+  for (int i = 0; i < n_elements; ++i) {
+    sum_val_out += input_values[i];
+    prod_val_out *= (i + 1);
+    float_sum_val_out += kPiValue;
+  }
+}
+int main(void) {
+  int input_array[N];
+  int total_errors = 0;
+  const float kPiVal = 3.14f;
+  const int kExpectedSum = 45; // Sum of 0..9
+  const int kExpectedProd = 3628800; // 10!
+  const float kExpectedFsum = kPiVal * N; // 3.14f * 10
+  const int kExpectedMin = 3;
+  const int kExpectedMax = 12;
+  std::complex<double> arr[N];
+  std::complex<double> kExpectedComplex(1, 0);
+  // Initialize the array
+  for (int i = 1; i <= N; ++i) {
+    arr[i - 1] = std::complex<double>(
+        1.0 + 0.1 * i, 0.5 * i); // Avoid zero to prevent multiplication by zero
+    kExpectedComplex *= arr[i - 1];
+  }
+
+  for (int i = 0; i < N; i++)
+    input_array[i] = i;
+#pragma omp parallel num_threads(4)
+  {
+
+    int t_sum_v;
+    int t_prod_v;
+    float t_fsum_v;
+    performReductions(N, input_array, t_sum_v, t_prod_v, t_fsum_v);
+    if (t_sum_v != kExpectedSum)
+      total_errors++;
+    if (t_prod_v != kExpectedProd)
+      total_errors++;
+    if (t_fsum_v != kExpectedFsum)
+      total_errors++;
+  }
+#pragma omp parallel num_threads(4)
+  {
+    int t_min_v;
+    int t_max_v;
+    performMinMaxRed(t_min_v, t_max_v);
+    if (t_min_v != kExpectedMin)
+      total_errors++;
+    if (t_max_v != kExpectedMax)
+      total_errors++;
+  }
+  total_errors += checkUserDefinedReduction();
+  total_errors += performComplexReduction();
+#pragma omp parallel num_threads(4)
+  {
+    std::complex<double> result(1, 0);
+    result = doComplexReduction(arr);
+    if (std::abs(result.real() - kExpectedComplex.real()) > 1e-6 ||
+        std::abs(result.imag() - kExpectedComplex.imag()) > 1e-6) {
+      total_errors++;
+    }
+  }
+  if (total_errors != 0)
+    fprintf(stderr, "ERROR: reduction on private variable  %d\n", total_errors);
+
+  return total_errors;
+}

From e44a65ed98ad896d0c0c3b1e10937a19f786b9ef Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Wed, 11 Jun 2025 10:36:12 +0200
Subject: [PATCH 045/851] [flang][OpenMP] Map basic `local` specifiers to
 `private` clauses (#142735)

Starts the effort to map `do concurrent` locality specifiers to OpenMP
clauses. This PR adds support for basic specifiers (no `init` or `copy`
regions yet).
---
 .../OpenMP/DoConcurrentConversion.cpp         | 55 ++++++++++++++++++-
 .../locality_specifiers_simple.mlir           | 48 ++++++++++++++++
 2 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 0fdb302fe10ca..283c3052c166c 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -7,9 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/OpenMP/Utils.h"
+#include "flang/Support/OpenMP-utils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/IRMapping.h"
@@ -308,10 +310,47 @@ class DoConcurrentConversion
               fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
               const mlir::omp::LoopNestOperands &clauseOps,
               bool isComposite) const {
+    mlir::omp::WsloopOperands wsloopClauseOps;
+
+    // For `local` (and `local_init`) opernads, emit corresponding `private`
+    // clauses and attach these clauses to the workshare loop.
+    if (!loop.getLocalOperands().empty())
+      for (auto [op, sym, arg] : llvm::zip_equal(
+               loop.getLocalOperands(),
+               loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
+               loop.getRegionLocalArgs())) {
+        auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom<
+            fir::LocalitySpecifierOp>(loop, sym);
+        if (localizer.getLocalitySpecifierType() ==
+            fir::LocalitySpecifierType::LocalInit)
+          TODO(localizer.getLoc(),
+               "local_init conversion is not supported yet");
+
+        if (!localizer.getInitRegion().empty())
+          TODO(localizer.getLoc(),
+               "non-empty `init` regions are not supported yet");
+
+        auto oldIP = rewriter.saveInsertionPoint();
+        rewriter.setInsertionPointAfter(localizer);
+        auto privatizer = rewriter.create<mlir::omp::PrivateClauseOp>(
+            localizer.getLoc(), sym.getLeafReference().str() + ".omp",
+            localizer.getTypeAttr().getValue(),
+            mlir::omp::DataSharingClauseType::Private);
+        rewriter.restoreInsertionPoint(oldIP);
+
+        wsloopClauseOps.privateVars.push_back(op);
+        wsloopClauseOps.privateSyms.push_back(
+            mlir::SymbolRefAttr::get(privatizer));
+      }
 
-    auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(loop.getLoc());
+    auto wsloopOp =
+        rewriter.create<mlir::omp::WsloopOp>(loop.getLoc(), wsloopClauseOps);
     wsloopOp.setComposite(isComposite);
-    rewriter.createBlock(&wsloopOp.getRegion());
+
+    Fortran::common::openmp::EntryBlockArgs wsloopArgs;
+    wsloopArgs.priv.vars = wsloopClauseOps.privateVars;
+    Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs,
+                                           wsloopOp.getRegion());
 
     auto loopNestOp =
         rewriter.create<mlir::omp::LoopNestOp>(loop.getLoc(), clauseOps);
@@ -324,6 +363,18 @@ class DoConcurrentConversion
     rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
     rewriter.create<mlir::omp::YieldOp>(loop->getLoc());
 
+    // `local` region arguments are transferred/cloned from the `do concurrent`
+    // loop to the loopnest op when the region is cloned above. Instead, these
+    // region arguments should be on the workshare loop's region.
+    for (auto [wsloopArg, loopNestArg] :
+         llvm::zip_equal(wsloopOp.getRegion().getArguments(),
+                         loopNestOp.getRegion().getArguments().drop_front(
+                             clauseOps.loopLowerBounds.size())))
+      rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+
+    for (unsigned i = 0; i < loop.getLocalVars().size(); ++i)
+      loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());
+
     return loopNestOp;
   }
 
diff --git a/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir
new file mode 100644
index 0000000000000..160c1df040680
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir
@@ -0,0 +1,48 @@
+// Tests mapping `local` locality specifier to `private` clauses for a simple
+// case (not `init` or `copy` regions).
+
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s
+
+fir.local {type = local} @_QFlocal_spec_translationElocal_var_private_f32 : f32
+
+func.func @_QPlocal_spec_translation() {
+  %3 = fir.alloca f32 {bindc_name = "local_var", uniq_name = "_QFlocal_spec_translationElocal_var"}
+  %4:2 = hlfir.declare %3 {uniq_name = "_QFlocal_spec_translationElocal_var"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+  %c4_i32 = arith.constant 4 : index
+  %c11_i32 = arith.constant 11 : index
+  %c1 = arith.constant 1 : index
+
+  fir.do_concurrent {
+    %7 = fir.alloca i32 {bindc_name = "i"}
+    %8:2 = hlfir.declare %7 {uniq_name = "_QFlocal_spec_translationEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+    fir.do_concurrent.loop (%arg0) = (%c4_i32) to (%c11_i32) step (%c1)
+      local(@_QFlocal_spec_translationElocal_var_private_f32 %4#0 -> %arg1 : !fir.ref<f32>) {
+      %9 = fir.convert %arg0 : (index) -> i32
+      fir.store %9 to %8#0 : !fir.ref<i32>
+
+      %10:2 = hlfir.declare %arg1 {uniq_name = "_QFlocal_spec_translationElocal_var"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+      %cst = arith.constant 4.200000e+01 : f32
+      hlfir.assign %cst to %10#0 : f32, !fir.ref<f32>
+    }
+  }
+  return
+}
+
+// CHECK: omp.private {type = private} @[[PRIVATIZER:.*local_spec_translationElocal_var.*.omp]] : f32
+
+// CHECK: func.func @_QPlocal_spec_translation
+// CHECK:   %[[LOCAL_VAR:.*]] = fir.alloca f32 {bindc_name = "local_var", {{.*}}}
+// CHECK:   %[[LOCAL_VAR_DECL:.*]]:2 = hlfir.declare %[[LOCAL_VAR]]
+// CHECK:   omp.parallel {
+// CHECK:     omp.wsloop private(@[[PRIVATIZER]] %[[LOCAL_VAR_DECL]]#0 -> %[[LOCAL_ARG:.*]] : !fir.ref<f32>) {
+// CHECK:       omp.loop_nest {{.*}} {
+// CHECK:       %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[LOCAL_ARG]]
+// CHECK:       %[[C42:.*]] = arith.constant
+// CHECK:       hlfir.assign %[[C42]] to %[[PRIV_DECL]]#0
+// CHECK:       omp.yield
+// CHECK:     }
+// CHECK:   }
+// CHECK:   omp.terminator
+// CHECK: }

From 7460c700ae3026d927952f911d0e667de6e0c18b Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 11 Jun 2025 04:42:05 -0400
Subject: [PATCH 046/851] [MemCpyOpt] handle memcpy from memset in more cases
 (#140954)

This aims to reduce the divergence between the initial checks in this
function and processMemCpyMemCpyDependence (in particular, adding
handling of offsets), with the goal to eventually reduce duplication
there and improve this pass in other ways.
---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 74 ++++++++++++-------
 .../MemCpyOpt/memset-memcpy-oversized.ll      | 47 ++++++++++++
 .../MemCpyOpt/memset-memcpy-to-2x-memset.ll   |  3 +-
 llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll |  2 +-
 .../MemCpyOpt/variable-sized-memset-memcpy.ll |  2 +-
 5 files changed, 97 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index a78e3770384ae..960001bf880c6 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1364,8 +1364,9 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
   return true;
 }
 
-/// Determine whether the instruction has undefined content for the given Size,
-/// either because it was freshly alloca'd or started its lifetime.
+/// Determine whether the pointer V had only undefined content (due to Def) up
+/// to the given Size, either because it was freshly alloca'd or started its
+/// lifetime.
 static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
                              MemoryDef *Def, Value *Size) {
   if (MSSA->isLiveOnEntryDef(Def))
@@ -1400,6 +1401,24 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
   return false;
 }
 
+// If the memcpy is larger than the previous, but the memory was undef prior to
+// that, we can just ignore the tail. Technically we're only interested in the
+// bytes from 0..MemSrcOffset and MemSrcLength+MemSrcOffset..CopySize here, but
+// as we can't easily represent this location (hasUndefContents uses mustAlias
+// which cannot deal with offsets), we use the full 0..CopySize range.
+static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
+                                  MemIntrinsic *MemSrc, BatchAAResults &BAA) {
+  Value *CopySize = MemCpy->getLength();
+  MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
+  MemoryUseOrDef *MemSrcAccess = MSSA->getMemoryAccess(MemSrc);
+  MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+      MemSrcAccess->getDefiningAccess(), MemCpyLoc, BAA);
+  if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+    if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
+      return true;
+  return false;
+}
+
 /// Transform memcpy to memset when its source was just memset.
 /// In other words, turn:
 /// \code
@@ -1415,19 +1434,25 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
 bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
                                                MemSetInst *MemSet,
                                                BatchAAResults &BAA) {
-  // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and
-  // memcpying from the same address. Otherwise it is hard to reason about.
-  if (!BAA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
-    return false;
-
   Value *MemSetSize = MemSet->getLength();
   Value *CopySize = MemCpy->getLength();
 
-  if (MemSetSize != CopySize) {
-    // Make sure the memcpy doesn't read any more than what the memset wrote.
-    // Don't worry about sizes larger than i64.
+  int64_t MOffset = 0;
+  const DataLayout &DL = MemCpy->getModule()->getDataLayout();
+  // We can only transforms memcpy's where the dest of one is the source of the
+  // other, or the memory transfer has a known offset from the memset.
+  if (MemCpy->getSource() != MemSet->getDest()) {
+    std::optional<int64_t> Offset =
+        MemCpy->getSource()->getPointerOffsetFrom(MemSet->getDest(), DL);
+    if (!Offset || *Offset < 0)
+      return false;
+    MOffset = *Offset;
+  }
 
-    // A known memset size is required.
+  if (MOffset != 0 || MemSetSize != CopySize) {
+    // Make sure the memcpy doesn't read any more than what the memset wrote,
+    // other than undef. Don't worry about sizes larger than i64. A known memset
+    // size is required.
     auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
     if (!CMemSetSize)
       return false;
@@ -1436,23 +1461,18 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
     auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
     if (!CCopySize)
       return false;
-    if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
-      // If the memcpy is larger than the memset, but the memory was undef prior
-      // to the memset, we can just ignore the tail. Technically we're only
-      // interested in the bytes from MemSetSize..CopySize here, but as we can't
-      // easily represent this location, we use the full 0..CopySize range.
-      MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
-      bool CanReduceSize = false;
-      MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
-      MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
-          MemSetAccess->getDefiningAccess(), MemCpyLoc, BAA);
-      if (auto *MD = dyn_cast<MemoryDef>(Clobber))
-        if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
-          CanReduceSize = true;
-
-      if (!CanReduceSize)
+    if (CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) {
+      if (!overreadUndefContents(MSSA, MemCpy, MemSet, BAA))
         return false;
-      CopySize = MemSetSize;
+      // Clip the memcpy to the bounds of the memset
+      if (MOffset == 0)
+        CopySize = MemSetSize;
+      else
+        CopySize =
+            ConstantInt::get(CopySize->getType(),
+                             CMemSetSize->getZExtValue() <= (uint64_t)MOffset
+                                 ? 0
+                                 : CMemSetSize->getZExtValue() - MOffset);
     }
   }
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
index 1c3896407e950..0c16f34590fc7 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
@@ -187,6 +187,53 @@ define void @test_write_before_memset_in_both_regions(ptr %result) {
   ret void
 }
 
+define void @test_negative_offset_memset(ptr %result) {
+; CHECK-LABEL: @test_negative_offset_memset(
+; CHECK-NEXT:    [[A1:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT:    [[A:%.*]] = getelementptr i8, ptr [[A1]], i32 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A1]], i64 12, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [ 16 x i8 ], align 8
+  %b = getelementptr i8, ptr %a, i32 4
+  call void @llvm.memset.p0.i64(ptr align 8 %b, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 12, i1 false)
+  ret void
+}
+
+define void @test_offset_memsetcpy(ptr %result) {
+; CHECK-LABEL: @test_offset_memsetcpy(
+; CHECK-NEXT:    [[A1:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT:    [[A:%.*]] = getelementptr i8, ptr [[A1]], i32 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[A1]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[RESULT:%.*]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [ 16 x i8 ], align 8
+  %b = getelementptr i8, ptr %a, i32 4
+  call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %b, i64 12, i1 false)
+  ret void
+}
+
+define void @test_two_memset(ptr %result) {
+; CHECK-LABEL: @test_two_memset(
+; CHECK-NEXT:    [[A:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i8, ptr [[A]], i32 12
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[B]], i8 1, i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [ 16 x i8 ], align 8
+  %b = getelementptr i8, ptr %a, i32 12
+  call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 8 %b, i8 1, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false)
+  ret void
+}
+
 declare ptr @malloc(i64)
 declare void @free(ptr)
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
index 47474e8dac051..18488f03a2d88 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
@@ -73,11 +73,10 @@ define void @test_different_source_gep(ptr %dst1, ptr %dst2, i8 %c) {
 ; CHECK-LABEL: @test_different_source_gep(
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr i8, ptr [[DST1]], i64 64
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DST2:%.*]], ptr [[P]], i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[DST2:%.*]], i8 [[C]], i64 64, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.memset.p0.i64(ptr %dst1, i8 %c, i64 128, i1 false)
-  ; FIXME: We could optimize this as well.
   %p = getelementptr i8, ptr %dst1, i64 64
   call void @llvm.memcpy.p0.p0.i64(ptr %dst2, ptr %p, i64 64, i1 false)
   ret void
diff --git a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
index 5e13432746bf7..0e312bc42d463 100644
--- a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
@@ -19,7 +19,7 @@ define i32 @foo(i1 %z) {
 ; CHECK:       for.body3.lr.ph:
 ; CHECK-NEXT:    br label [[FOR_INC7_1]]
 ; CHECK:       for.inc7.1:
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 [[SCEVGEP]], i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[A]], i8 0, i64 4, i1 false)
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
index a834d2465dfa5..d5b1ab9b2f299 100644
--- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
@@ -18,7 +18,7 @@ define void @test(ptr %src, i8 %c, i64 %size) {
   ret void
 }
 
-; Differing sizes, so left as it is.
+; Differing sizes, but would be UB if size1 < size2 since the memcpy would reference outside of the first alloca
 define void @negative_test(ptr %src, i8 %c, i64 %size1, i64 %size2) {
 ; CHECK-LABEL: @negative_test(
 ; CHECK-NEXT:    [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1

From ddb771ecfd12cab8d323a4e64e64b965883585de Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Wed, 11 Jun 2025 09:50:26 +0100
Subject: [PATCH 047/851] [AArch64][Clang] Update new Neon vector element
 types. (#142760)

This updates the element types used in the new __Int8x8_t types added in
#126945, mostly to allow C++ name mangling in ItaniumMangling
mangleAArch64VectorBase to work correctly. Char is replaced by
SignedCharTy or UnsignedCharTy as required and Float16Ty is better using
HalfTy to match the vector types. Same for Long types.
---
 .../include/clang/Basic/AArch64ACLETypes.def  |  22 +-
 clang/test/AST/ast-dump-aarch64-neon-types.c  |  22 +-
 clang/test/CodeGen/AArch64/mixed-neon-types.c | 559 ++++++++++++++++--
 3 files changed, 538 insertions(+), 65 deletions(-)

diff --git a/clang/include/clang/Basic/AArch64ACLETypes.def b/clang/include/clang/Basic/AArch64ACLETypes.def
index 9acfd693288cf..bbe0c85f9ffbe 100644
--- a/clang/include/clang/Basic/AArch64ACLETypes.def
+++ b/clang/include/clang/Basic/AArch64ACLETypes.def
@@ -123,31 +123,31 @@
 
 //===- Neon Vector point types --------------------------------------------===//
 
-NEON_VECTOR_TYPE(__Int8x8_t, CharTy, 8, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int8x8_t, SignedCharTy, 8, 8, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Int16x4_t, ShortTy, 16, 4, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Int32x2_t, IntTy, 32, 2, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Uint8x8_t, CharTy, 8, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint8x8_t, UnsignedCharTy, 8, 8, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Uint16x4_t, UnsignedShortTy, 16, 4, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Uint32x2_t, UnsignedIntTy, 32, 2, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Float16x4_t, Float16Ty, 16, 4, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Float16x4_t, HalfTy, 16, 4, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Float32x2_t, FloatTy, 32, 2, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Poly8x8_t, CharTy, 8, 8, VectorKind::NeonPoly)
+NEON_VECTOR_TYPE(__Poly8x8_t, UnsignedCharTy, 8, 8, VectorKind::NeonPoly)
 NEON_VECTOR_TYPE(__Poly16x4_t, UnsignedShortTy, 16, 4, VectorKind::NeonPoly)
 NEON_VECTOR_TYPE(__Bfloat16x4_t, BFloat16Ty, 16, 4, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Int8x16_t, CharTy, 8, 16, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int8x16_t, SignedCharTy, 8, 16, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Int16x8_t, ShortTy, 16, 8, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Int32x4_t, IntTy, 32, 4, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Int64x2_t, LongLongTy, 64, 2, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Uint8x16_t, CharTy, 8, 16, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Int64x2_t, LongTy, 64, 2, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint8x16_t, UnsignedCharTy, 8, 16, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Uint16x8_t, UnsignedShortTy, 16, 8, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Uint32x4_t, UnsignedIntTy, 32, 4, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Uint64x2_t, UnsignedLongLongTy, 64, 2, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Float16x8_t, Float16Ty, 16, 8, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Uint64x2_t, UnsignedLongTy, 64, 2, VectorKind::Neon)
+NEON_VECTOR_TYPE(__Float16x8_t, HalfTy, 16, 8, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Float32x4_t, FloatTy, 32, 4, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Float64x2_t, DoubleTy, 64, 2, VectorKind::Neon)
-NEON_VECTOR_TYPE(__Poly8x16_t, CharTy, 8, 16, VectorKind::NeonPoly)
+NEON_VECTOR_TYPE(__Poly8x16_t, UnsignedCharTy, 8, 16, VectorKind::NeonPoly)
 NEON_VECTOR_TYPE(__Poly16x8_t, UnsignedShortTy, 16, 8, VectorKind::NeonPoly)
-NEON_VECTOR_TYPE(__Poly64x2_t, UnsignedLongLongTy, 64, 2, VectorKind::NeonPoly)
+NEON_VECTOR_TYPE(__Poly64x2_t, UnsignedLongTy, 64, 2, VectorKind::NeonPoly)
 NEON_VECTOR_TYPE(__Bfloat16x8_t, BFloat16Ty, 16, 8, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Mfloat8x8_t, MFloat8Ty, 8, 8, VectorKind::Neon)
 NEON_VECTOR_TYPE(__Mfloat8x16_t, MFloat8Ty, 8, 16, VectorKind::Neon)
diff --git a/clang/test/AST/ast-dump-aarch64-neon-types.c b/clang/test/AST/ast-dump-aarch64-neon-types.c
index 16255cd51c9d8..f509bd880c14b 100644
--- a/clang/test/AST/ast-dump-aarch64-neon-types.c
+++ b/clang/test/AST/ast-dump-aarch64-neon-types.c
@@ -9,7 +9,7 @@
 // RUN: %clang_cc1 -verify -verify-ignore-unexpected=note -triple arm-linux-gnu %s -x c++
 
 __Int8x8_t Int8x8;
-// CHECK: Int8x8 '__Int8x8_t':'__attribute__((neon_vector_type(8))) char'
+// CHECK: Int8x8 '__Int8x8_t':'__attribute__((neon_vector_type(8))) signed char'
 // expected-error@-2{{unknown type name '__Int8x8_t'}}
 
 __Int16x4_t Int16x4;
@@ -21,7 +21,7 @@ __Int32x2_t Int32x2;
 // expected-error@-2{{unknown type name '__Int32x2_t'}}
 
 __Uint8x8_t Uint8x8;
-// CHECK: Uint8x8 '__Uint8x8_t':'__attribute__((neon_vector_type(8))) char'
+// CHECK: Uint8x8 '__Uint8x8_t':'__attribute__((neon_vector_type(8))) unsigned char'
 // expected-error@-2{{unknown type name '__Uint8x8_t'}}
 
 __Uint16x4_t Uint16x4;
@@ -33,7 +33,7 @@ __Uint32x2_t Uint32x2;
 // expected-error@-2{{unknown type name '__Uint32x2_t'}}
 
 __Float16x4_t Float16x4;
-// CHECK: Float16x4 '__Float16x4_t':'__attribute__((neon_vector_type(4))) _Float16'
+// CHECK: Float16x4 '__Float16x4_t':'__attribute__((neon_vector_type(4))) __fp16'
 // expected-error@-2{{unknown type name '__Float16x4_t'}}
 
 __Float32x2_t Float32x2;
@@ -41,7 +41,7 @@ __Float32x2_t Float32x2;
 // expected-error@-2{{unknown type name '__Float32x2_t'}}
 
 __Poly8x8_t Poly8x8;
-// CHECK: Poly8x8 '__Poly8x8_t':'__attribute__((neon_polyvector_type(8))) char'
+// CHECK: Poly8x8 '__Poly8x8_t':'__attribute__((neon_polyvector_type(8))) unsigned char'
 // expected-error@-2{{unknown type name '__Poly8x8_t'}}
 
 __Poly16x4_t Poly16x4;
@@ -53,7 +53,7 @@ __Bfloat16x4_t Bfloat16x4;
 // expected-error@-2{{unknown type name '__Bfloat16x4_t'}}
 
 __Int8x16_t Int8x16;
-// CHECK: Int8x16 '__Int8x16_t':'__attribute__((neon_vector_type(16))) char'
+// CHECK: Int8x16 '__Int8x16_t':'__attribute__((neon_vector_type(16))) signed char'
 // expected-error@-2{{unknown type name '__Int8x16_t'}}
 
 __Int16x8_t Int16x8;
@@ -65,11 +65,11 @@ __Int32x4_t Int32x4;
 // expected-error@-2{{unknown type name '__Int32x4_t'}}
 
 __Int64x2_t Int64x2;
-// CHECK: Int64x2 '__Int64x2_t':'__attribute__((neon_vector_type(2))) long long'
+// CHECK: Int64x2 '__Int64x2_t':'__attribute__((neon_vector_type(2))) long'
 // expected-error@-2{{unknown type name '__Int64x2_t'}}
 
 __Uint8x16_t Uint8x16;
-// CHECK: Uint8x16 '__Uint8x16_t':'__attribute__((neon_vector_type(16))) char'
+// CHECK: Uint8x16 '__Uint8x16_t':'__attribute__((neon_vector_type(16))) unsigned char'
 // expected-error@-2{{unknown type name '__Uint8x16_t'}}
 
 __Uint16x8_t Uint16x8;
@@ -81,11 +81,11 @@ __Uint32x4_t Uint32x4;
 // expected-error@-2{{unknown type name '__Uint32x4_t'}}
 
 __Uint64x2_t Uint64x2;
-// CHECK: Uint64x2 '__Uint64x2_t':'__attribute__((neon_vector_type(2))) unsigned long long'
+// CHECK: Uint64x2 '__Uint64x2_t':'__attribute__((neon_vector_type(2))) unsigned long'
 // expected-error@-2{{unknown type name '__Uint64x2_t'}}
 
 __Float16x8_t Float16x8;
-// CHECK: Float16x8 '__Float16x8_t':'__attribute__((neon_vector_type(8))) _Float16'
+// CHECK: Float16x8 '__Float16x8_t':'__attribute__((neon_vector_type(8))) __fp16'
 // expected-error@-2{{unknown type name '__Float16x8_t'}}
 
 __Float32x4_t Float32x4;
@@ -97,7 +97,7 @@ __Float64x2_t Float64x2;
 // expected-error@-2{{unknown type name '__Float64x2_t'}}
 
 __Poly8x16_t Poly8x16;
-// CHECK: Poly8x16 '__Poly8x16_t':'__attribute__((neon_polyvector_type(16))) char'
+// CHECK: Poly8x16 '__Poly8x16_t':'__attribute__((neon_polyvector_type(16))) unsigned char'
 // expected-error@-2{{unknown type name '__Poly8x16_t'}}
 
 __Poly16x8_t Poly16x8;
@@ -105,7 +105,7 @@ __Poly16x8_t Poly16x8;
 // expected-error@-2{{unknown type name '__Poly16x8_t'}}
 
 __Poly64x2_t Poly64x2;
-// CHECK: Poly64x2 '__Poly64x2_t':'__attribute__((neon_polyvector_type(2))) unsigned long long'
+// CHECK: Poly64x2 '__Poly64x2_t':'__attribute__((neon_polyvector_type(2))) unsigned long'
 // expected-error@-2{{unknown type name '__Poly64x2_t'}}
 
 __Bfloat16x8_t Bfloat16x8;
diff --git a/clang/test/CodeGen/AArch64/mixed-neon-types.c b/clang/test/CodeGen/AArch64/mixed-neon-types.c
index 47681a507d715..34fbe499f4052 100644
--- a/clang/test/CodeGen/AArch64/mixed-neon-types.c
+++ b/clang/test/CodeGen/AArch64/mixed-neon-types.c
@@ -3,23 +3,23 @@
 // RUN: %clang_cc1  -triple aarch64-linux-gnu -target-feature +neon -x c++ %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-CPP
 // REQUIRES: aarch64-registered-target
 
-typedef __Uint32x4_t X;
+typedef __Uint8x16_t X;
 
-// CHECK-C-LABEL: define dso_local <4 x i32> @test(
-// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-C-LABEL: define dso_local <16 x i8> @test(
+// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-C-NEXT:  [[ENTRY:.*:]]
-// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
-// CHECK-C-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
-// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
-// CHECK-C-NEXT:    ret <4 x i32> [[TMP0]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <16 x i8> [[TMP0]]
 //
-// CHECK-CPP-LABEL: define dso_local noundef <4 x i32> @_Z4test12__Uint32x4_t(
-// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z4test12__Uint8x16_t(
+// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-CPP-NEXT:  [[ENTRY:.*:]]
-// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
-// CHECK-CPP-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
-// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
-// CHECK-CPP-NEXT:    ret <4 x i32> [[TMP0]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <16 x i8> [[TMP0]]
 //
 X test(X x) {
   return x;
@@ -28,47 +28,520 @@ X test(X x) {
 #include <arm_neon.h>
 
 // CHECK-C-LABEL: define dso_local <16 x i8> @testboth(
-// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
 // CHECK-C-NEXT:  [[ENTRY:.*:]]
 // CHECK-C-NEXT:    [[__P0_ADDR_I:%.*]] = alloca <16 x i8>, align 16
 // CHECK-C-NEXT:    [[__P1_ADDR_I:%.*]] = alloca <16 x i8>, align 16
 // CHECK-C-NEXT:    [[__RET_I:%.*]] = alloca <16 x i8>, align 16
-// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
-// CHECK-C-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
-// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
-// CHECK-C-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-C-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
-// CHECK-C-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-C-NEXT:    store <16 x i8> [[TMP1]], ptr [[__P0_ADDR_I]], align 16
-// CHECK-C-NEXT:    store <16 x i8> [[TMP3]], ptr [[__P1_ADDR_I]], align 16
-// CHECK-C-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
-// CHECK-C-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16
-// CHECK-C-NEXT:    [[ADD_I:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    store <16 x i8> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-C-NEXT:    store <16 x i8> [[TMP1]], ptr [[__P1_ADDR_I]], align 16
+// CHECK-C-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-C-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16
+// CHECK-C-NEXT:    [[ADD_I:%.*]] = add <16 x i8> [[TMP2]], [[TMP3]]
 // CHECK-C-NEXT:    store <16 x i8> [[ADD_I]], ptr [[__RET_I]], align 16
-// CHECK-C-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16
-// CHECK-C-NEXT:    ret <16 x i8> [[TMP6]]
+// CHECK-C-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16
+// CHECK-C-NEXT:    ret <16 x i8> [[TMP4]]
 //
-// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z8testboth12__Uint32x4_t(
-// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z8testboth12__Uint8x16_t(
+// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
 // CHECK-CPP-NEXT:  [[ENTRY:.*:]]
 // CHECK-CPP-NEXT:    [[__P0_ADDR_I:%.*]] = alloca <16 x i8>, align 16
 // CHECK-CPP-NEXT:    [[__P1_ADDR_I:%.*]] = alloca <16 x i8>, align 16
 // CHECK-CPP-NEXT:    [[__RET_I:%.*]] = alloca <16 x i8>, align 16
-// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
-// CHECK-CPP-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
-// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
-// CHECK-CPP-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK-CPP-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
-// CHECK-CPP-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
-// CHECK-CPP-NEXT:    store <16 x i8> [[TMP1]], ptr [[__P0_ADDR_I]], align 16
-// CHECK-CPP-NEXT:    store <16 x i8> [[TMP3]], ptr [[__P1_ADDR_I]], align 16
-// CHECK-CPP-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
-// CHECK-CPP-NEXT:    [[TMP5:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16
-// CHECK-CPP-NEXT:    [[ADD_I:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[TMP1]], ptr [[__P1_ADDR_I]], align 16
+// CHECK-CPP-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-CPP-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16
+// CHECK-CPP-NEXT:    [[ADD_I:%.*]] = add <16 x i8> [[TMP2]], [[TMP3]]
 // CHECK-CPP-NEXT:    store <16 x i8> [[ADD_I]], ptr [[__RET_I]], align 16
-// CHECK-CPP-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16
-// CHECK-CPP-NEXT:    ret <16 x i8> [[TMP6]]
+// CHECK-CPP-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16
+// CHECK-CPP-NEXT:    ret <16 x i8> [[TMP4]]
 //
-int8x16_t testboth(X x) {
+uint8x16_t testboth(X x) {
    return vaddq_u8(x, x);
 }
+
+// CHECK-C-LABEL: define dso_local <8 x i8> @test__Int8x8_t(
+// CHECK-C-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-C-NEXT:    store <8 x i8> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <8 x i8> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <8 x i8> @_Z14test__Int8x8_t10__Int8x8_t(
+// CHECK-CPP-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-CPP-NEXT:    store <8 x i8> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <8 x i8> [[TMP0]]
+//
+int8x8_t test__Int8x8_t(__Int8x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <4 x i16> @test__Int16x4_t(
+// CHECK-C-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-C-NEXT:    store <4 x i16> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <4 x i16> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x i16> @_Z15test__Int16x4_t11__Int16x4_t(
+// CHECK-CPP-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-CPP-NEXT:    store <4 x i16> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <4 x i16> [[TMP0]]
+//
+int16x4_t test__Int16x4_t(__Int16x4_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <2 x i32> @test__Int32x2_t(
+// CHECK-C-SAME: <2 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-C-NEXT:    store <2 x i32> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <2 x i32> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <2 x i32> @_Z15test__Int32x2_t11__Int32x2_t(
+// CHECK-CPP-SAME: <2 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-CPP-NEXT:    store <2 x i32> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <2 x i32> [[TMP0]]
+//
+int32x2_t test__Int32x2_t(__Int32x2_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <8 x i8> @test__Uint8x8_t(
+// CHECK-C-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-C-NEXT:    store <8 x i8> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <8 x i8> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <8 x i8> @_Z15test__Uint8x8_t11__Uint8x8_t(
+// CHECK-CPP-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-CPP-NEXT:    store <8 x i8> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <8 x i8> [[TMP0]]
+//
+uint8x8_t test__Uint8x8_t(__Uint8x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <4 x i16> @test__Uint16x4_t(
+// CHECK-C-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-C-NEXT:    store <4 x i16> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <4 x i16> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x i16> @_Z16test__Uint16x4_t12__Uint16x4_t(
+// CHECK-CPP-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-CPP-NEXT:    store <4 x i16> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <4 x i16> [[TMP0]]
+//
+uint16x4_t test__Uint16x4_t(__Uint16x4_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <2 x i32> @test__Uint32x2_t(
+// CHECK-C-SAME: <2 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-C-NEXT:    store <2 x i32> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <2 x i32> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <2 x i32> @_Z16test__Uint32x2_t12__Uint32x2_t(
+// CHECK-CPP-SAME: <2 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i32>, align 8
+// CHECK-CPP-NEXT:    store <2 x i32> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <2 x i32> [[TMP0]]
+//
+uint32x2_t test__Uint32x2_t(__Uint32x2_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <4 x half> @test__Float16x4_t(
+// CHECK-C-SAME: <4 x half> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x half>, align 8
+// CHECK-C-NEXT:    store <4 x half> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x half>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <4 x half> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x half> @_Z17test__Float16x4_t13__Float16x4_t(
+// CHECK-CPP-SAME: <4 x half> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x half>, align 8
+// CHECK-CPP-NEXT:    store <4 x half> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x half>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <4 x half> [[TMP0]]
+//
+float16x4_t test__Float16x4_t(__Float16x4_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <2 x float> @test__Float32x2_t(
+// CHECK-C-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <2 x float>, align 8
+// CHECK-C-NEXT:    store <2 x float> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <2 x float> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <2 x float> @_Z17test__Float32x2_t13__Float32x2_t(
+// CHECK-CPP-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <2 x float>, align 8
+// CHECK-CPP-NEXT:    store <2 x float> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <2 x float> [[TMP0]]
+//
+float32x2_t test__Float32x2_t(__Float32x2_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <8 x i8> @test__Poly8x8_t(
+// CHECK-C-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-C-NEXT:    store <8 x i8> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <8 x i8> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <8 x i8> @_Z15test__Poly8x8_t11__Poly8x8_t(
+// CHECK-CPP-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-CPP-NEXT:    store <8 x i8> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <8 x i8> [[TMP0]]
+//
+poly8x8_t test__Poly8x8_t(__Poly8x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <4 x i16> @test__Poly16x4_t(
+// CHECK-C-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-C-NEXT:    store <4 x i16> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <4 x i16> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x i16> @_Z16test__Poly16x4_t12__Poly16x4_t(
+// CHECK-CPP-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i16>, align 8
+// CHECK-CPP-NEXT:    store <4 x i16> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <4 x i16> [[TMP0]]
+//
+poly16x4_t test__Poly16x4_t(__Poly16x4_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <4 x bfloat> @test__Bfloat16x4_t(
+// CHECK-C-SAME: <4 x bfloat> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-C-NEXT:    store <4 x bfloat> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x bfloat>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <4 x bfloat> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x bfloat> @_Z18test__Bfloat16x4_t14__Bfloat16x4_t(
+// CHECK-CPP-SAME: <4 x bfloat> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x bfloat>, align 8
+// CHECK-CPP-NEXT:    store <4 x bfloat> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x bfloat>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <4 x bfloat> [[TMP0]]
+//
+bfloat16x4_t test__Bfloat16x4_t(__Bfloat16x4_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <16 x i8> @test__Int8x16_t(
+// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <16 x i8> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z15test__Int8x16_t11__Int8x16_t(
+// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <16 x i8> [[TMP0]]
+//
+int8x16_t test__Int8x16_t(__Int8x16_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <8 x i16> @test__Int16x8_t(
+// CHECK-C-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-C-NEXT:    store <8 x i16> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <8 x i16> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <8 x i16> @_Z15test__Int16x8_t11__Int16x8_t(
+// CHECK-CPP-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-CPP-NEXT:    store <8 x i16> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test__Int16x8_t(__Int16x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <4 x i32> @test__Int32x4_t(
+// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-C-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <4 x i32> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x i32> @_Z15test__Int32x4_t11__Int32x4_t(
+// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-CPP-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test__Int32x4_t(__Int32x4_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <2 x i64> @test__Int64x2_t(
+// CHECK-C-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-C-NEXT:    store <2 x i64> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <2 x i64> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <2 x i64> @_Z15test__Int64x2_t11__Int64x2_t(
+// CHECK-CPP-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-CPP-NEXT:    store <2 x i64> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <2 x i64> [[TMP0]]
+//
+int64x2_t test__Int64x2_t(__Int64x2_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <16 x i8> @test__Uint8x16_t(
+// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <16 x i8> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z16test__Uint8x16_t12__Uint8x16_t(
+// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <16 x i8> [[TMP0]]
+//
+uint8x16_t test__Uint8x16_t(__Uint8x16_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <8 x i16> @test__Uint16x8_t(
+// CHECK-C-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-C-NEXT:    store <8 x i16> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <8 x i16> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <8 x i16> @_Z16test__Uint16x8_t12__Uint16x8_t(
+// CHECK-CPP-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-CPP-NEXT:    store <8 x i16> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test__Uint16x8_t(__Uint16x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <4 x i32> @test__Uint32x4_t(
+// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-C-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <4 x i32> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x i32> @_Z16test__Uint32x4_t12__Uint32x4_t(
+// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i32>, align 16
+// CHECK-CPP-NEXT:    store <4 x i32> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test__Uint32x4_t(__Uint32x4_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <2 x i64> @test__Uint64x2_t(
+// CHECK-C-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-C-NEXT:    store <2 x i64> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <2 x i64> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <2 x i64> @_Z16test__Uint64x2_t12__Uint64x2_t(
+// CHECK-CPP-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-CPP-NEXT:    store <2 x i64> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <2 x i64> [[TMP0]]
+//
+uint64x2_t test__Uint64x2_t(__Uint64x2_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <8 x half> @test__Float16x8_t(
+// CHECK-C-SAME: <8 x half> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-C-NEXT:    store <8 x half> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <8 x half> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <8 x half> @_Z17test__Float16x8_t13__Float16x8_t(
+// CHECK-CPP-SAME: <8 x half> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-CPP-NEXT:    store <8 x half> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <8 x half> [[TMP0]]
+//
+float16x8_t test__Float16x8_t(__Float16x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <4 x float> @test__Float32x4_t(
+// CHECK-C-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-C-NEXT:    store <4 x float> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <4 x float> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <4 x float> @_Z17test__Float32x4_t13__Float32x4_t(
+// CHECK-CPP-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-CPP-NEXT:    store <4 x float> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <4 x float> [[TMP0]]
+//
+float32x4_t test__Float32x4_t(__Float32x4_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <2 x double> @test__Float64x2_t(
+// CHECK-C-SAME: <2 x double> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-C-NEXT:    store <2 x double> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <2 x double> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <2 x double> @_Z17test__Float64x2_t13__Float64x2_t(
+// CHECK-CPP-SAME: <2 x double> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-CPP-NEXT:    store <2 x double> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <2 x double> [[TMP0]]
+//
+float64x2_t test__Float64x2_t(__Float64x2_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <16 x i8> @test__Poly8x16_t(
+// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <16 x i8> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z16test__Poly8x16_t12__Poly8x16_t(
+// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <16 x i8> [[TMP0]]
+//
+poly8x16_t test__Poly8x16_t(__Poly8x16_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <8 x i16> @test__Poly16x8_t(
+// CHECK-C-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-C-NEXT:    store <8 x i16> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <8 x i16> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <8 x i16> @_Z16test__Poly16x8_t12__Poly16x8_t(
+// CHECK-CPP-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i16>, align 16
+// CHECK-CPP-NEXT:    store <8 x i16> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <8 x i16> [[TMP0]]
+//
+poly16x8_t test__Poly16x8_t(__Poly16x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <2 x i64> @test__Poly64x2_t(
+// CHECK-C-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-C-NEXT:    store <2 x i64> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <2 x i64> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <2 x i64> @_Z16test__Poly64x2_t12__Poly64x2_t(
+// CHECK-CPP-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-CPP-NEXT:    store <2 x i64> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <2 x i64> [[TMP0]]
+//
+poly64x2_t test__Poly64x2_t(__Poly64x2_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <8 x bfloat> @test__Bfloat16x8_t(
+// CHECK-C-SAME: <8 x bfloat> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-C-NEXT:    store <8 x bfloat> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <8 x bfloat> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local noundef <8 x bfloat> @_Z18test__Bfloat16x8_t14__Bfloat16x8_t(
+// CHECK-CPP-SAME: <8 x bfloat> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x bfloat>, align 16
+// CHECK-CPP-NEXT:    store <8 x bfloat> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x bfloat>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <8 x bfloat> [[TMP0]]
+//
+bfloat16x8_t test__Bfloat16x8_t(__Bfloat16x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <8 x i8> @test__Mfloat8x8_t(
+// CHECK-C-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-C-NEXT:    store <8 x i8> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8
+// CHECK-C-NEXT:    ret <8 x i8> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local <8 x i8> @_Z17test__Mfloat8x8_t13__Mfloat8x8_t(
+// CHECK-CPP-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-CPP-NEXT:    store <8 x i8> [[X]], ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8
+// CHECK-CPP-NEXT:    ret <8 x i8> [[TMP0]]
+//
+mfloat8x8_t test__Mfloat8x8_t(__Mfloat8x8_t x) { return x; }
+// CHECK-C-LABEL: define dso_local <16 x i8> @test__Mfloat8x16_t(
+// CHECK-C-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-C-NEXT:  [[ENTRY:.*:]]
+// CHECK-C-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-C-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-C-NEXT:    ret <16 x i8> [[TMP0]]
+//
+// CHECK-CPP-LABEL: define dso_local <16 x i8> @_Z18test__Mfloat8x16_t14__Mfloat8x16_t(
+// CHECK-CPP-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CPP-NEXT:  [[ENTRY:.*:]]
+// CHECK-CPP-NEXT:    [[X_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-CPP-NEXT:    store <16 x i8> [[X]], ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16
+// CHECK-CPP-NEXT:    ret <16 x i8> [[TMP0]]
+//
+mfloat8x16_t test__Mfloat8x16_t(__Mfloat8x16_t x) { return x; }

From 6e0c2bc668107547365d79a6e5f57317a6302c29 Mon Sep 17 00:00:00 2001
From: Javed Absar <javed.absar@gmail.com>
Date: Wed, 11 Jun 2025 10:05:34 +0100
Subject: [PATCH 048/851] [mlir][async][nfc] Fix typo in async op description
 (#143621)

---
 mlir/include/mlir/Dialect/Async/IR/AsyncOps.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td b/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td
index 3d29d5bc7dbb6..6dbcdefbc9332 100644
--- a/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td
+++ b/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td
@@ -376,7 +376,7 @@ def Async_CreateGroupOp : Async_Op<"create_group", [Pure]> {
 }
 
 def Async_AddToGroupOp : Async_Op<"add_to_group", []> {
-  let summary = "adds and async token or value to the group";
+  let summary = "adds an async token or value to the group";
   let description = [{
     The `async.add_to_group` adds an async token or value to the async group.
     Returns the rank of the added element in the group. This rank is fixed
@@ -655,7 +655,7 @@ def Async_RuntimeLoadOp : Async_Op<"runtime.load",
 }
 
 def Async_RuntimeAddToGroupOp : Async_Op<"runtime.add_to_group", []> {
-  let summary = "adds and async token or value to the group";
+  let summary = "adds an async token or value to the group";
   let description = [{
     The `async.runtime.add_to_group` adds an async token or value to the async
     group. Returns the rank of the added element in the group.

From 7ffdf4240d62724dca7f42b37bd8671fefe17e17 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Wed, 11 Jun 2025 10:21:07 +0100
Subject: [PATCH 049/851] [flang][Driver] Enable support for
 -mmacos-version-min= (#143508)

So far as I can tell this option is driver-only so we can just re-use
what already exists for clang. I've added a unit test based on clang's
unit test to demonstrate that the option is handled.

Still TODO is to ensure that flang-rt is built with the same macos
minimum version as compiler-rt. At the moment, setting the flang minimum
version to older than the macos version on which flang was built will
lead to link warnings because flangrt is built for version of macos on
which flang was built rather than the oldest supported version (as
compiler-rt is).
---
 clang/include/clang/Driver/Options.td |   2 +
 flang/test/Driver/darwin-version.f90  | 107 ++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 flang/test/Driver/darwin-version.f90

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 3582efd7721b0..152df89118a6a 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4927,8 +4927,10 @@ def ffuchsia_api_level_EQ : Joined<["-"], "ffuchsia-api-level=">,
   HelpText<"Set Fuchsia API level">,
   MarshallingInfoInt<LangOpts<"FuchsiaAPILevel">>;
 def mmacos_version_min_EQ : Joined<["-"], "mmacos-version-min=">,
+  Visibility<[ClangOption, CC1Option, FlangOption]>,
   Group<m_Group>, HelpText<"Set macOS deployment target">;
 def : Joined<["-"], "mmacosx-version-min=">,
+  Visibility<[ClangOption, CC1Option, FC1Option, FlangOption]>,
   Group<m_Group>, Alias<mmacos_version_min_EQ>;
 def mms_bitfields : Flag<["-"], "mms-bitfields">, Group<m_Group>,
   Visibility<[ClangOption, CC1Option]>,
diff --git a/flang/test/Driver/darwin-version.f90 b/flang/test/Driver/darwin-version.f90
new file mode 100644
index 0000000000000..99d19ee44be9b
--- /dev/null
+++ b/flang/test/Driver/darwin-version.f90
@@ -0,0 +1,107 @@
+! Based on clang's darwin-version.c test with tests for ios watchos and tvos
+! removed
+
+! RUN: %flang -target i686-apple-darwin8 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX4 %s
+! RUN: %flang -target i686-apple-darwin9 -mmacos-version-min=10.4 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX4 %s
+! CHECK-VERSION-OSX4: "i686-apple-macosx10.4.0"
+! RUN: %flang -target i686-apple-darwin9 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX5 %s
+! RUN: %flang -target i686-apple-darwin9 -mmacos-version-min=10.5 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX5 %s
+! CHECK-VERSION-OSX5: "i686-apple-macosx10.5.0"
+! RUN: %flang -target i686-apple-darwin10 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX6 %s
+! RUN: %flang -target i686-apple-darwin9 -mmacos-version-min=10.6 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX6 %s
+! CHECK-VERSION-OSX6: "i686-apple-macosx10.6.0"
+! RUN: %flang -target x86_64-apple-darwin14 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX10 %s
+! RUN: %flang -target x86_64-apple-darwin -mmacos-version-min=10.10 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX10 %s
+! RUN: %flang -target x86_64-apple-darwin -mmacos-version-min=10.10 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX10 %s
+! CHECK-VERSION-OSX10: "x86_64-apple-macosx10.10.0"
+! RUN: not %flang -target x86_64-apple-darwin -mmacos-version-min= -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-MISSING %s
+! RUN: not %flang -target x86_64-apple-darwin -mmacos-version-min= -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-MISSING %s
+! CHECK-VERSION-MISSING: missing version number
+
+! RUN: %flang -target x86_64-apple-driverkit19.0 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-DRIVERKIT190 %s
+! CHECK-VERSION-DRIVERKIT190: "x86_64-apple-driverkit19.0.0"
+
+! Check environment variable gets interpreted correctly
+! RUN: env MACOSX_DEPLOYMENT_TARGET=10.5 IPHONEOS_DEPLOYMENT_TARGET=2.0 \
+! RUN:   %flang -target i686-apple-darwin9 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX5 %s
+
+! RUN: env MACOSX_DEPLOYMENT_TARGET=10.4.10 \
+! RUN:   %flang -target i386-apple-darwin9 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-OSX49 %s
+! CHECK-VERSION-OSX49: "i386-apple-macosx10.4.10"
+! RUN: env IPHONEOS_DEPLOYMENT_TARGET=2.3.1 \
+
+! Target can specify the OS version:
+
+! RUN: %flang -target x86_64-apple-macos10.11.2 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-TMAC2 %s
+! CHECK-VERSION-TMAC2: "x86_64-apple-macosx10.11.2"
+
+! Warn about -m<os>-version-min when it's used with target:
+
+! RUN: %flang -target x86_64-apple-macos10.11.2 -mmacos-version-min=10.6 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-TNO-OSV1 %s
+! CHECK-VERSION-TNO-OSV1: overriding '-mmacos-version-min=10.6' option with '-target x86_64-apple-macos10.11.2'
+
+! RUN: %flang -target x86_64-apple-macos10.6 -mmacos-version-min=10.6 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-TNO-SAME %s
+! CHECK-VERSION-TNO-SAME-NOT: overriding
+! CHECK-VERSION-TNO-SAME-NOT: argument unused during compilation
+
+! Target with OS version is not overridden by -m<os>-version-min variables:
+
+! RUN: %flang -target x86_64-apple-macos10.11.2 -mmacos-version-min=10.6 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-TIGNORE-OSV1 %s
+! CHECK-VERSION-TIGNORE-OSV1: "x86_64-apple-macosx10.11.2"
+
+! Target without OS version includes the OS given by -m<os>-version-min arguments:
+
+! RUN: %flang -target x86_64-apple-macos -mmacos-version-min=10.11 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-USE-OS-ARG1 %s
+! CHECK-VERSION-USE-OS-ARG1: "x86_64-apple-macosx10.11.0"
+
+! Target with OS version is not overridden by environment variables:
+
+! RUN: env MACOSX_DEPLOYMENT_TARGET=10.1 \
+! RUN:   %flang -target i386-apple-macos10.5 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-TMACOS-CMD %s
+! CHECK-VERSION-TMACOS-CMD: "i386-apple-macosx10.5.0"
+
+! Target with OS version is not overridden by arch:
+
+! RUN: %flang -target uknown-apple-macos10.11.2 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-VERSION-TIGNORE-ARCH1 %s
+! CHECK-VERSION-TIGNORE-ARCH1: "unknown-apple-macosx10.11.2"
+
+! Target can be used to specify the environment:
+
+! RUN: %flang -target x86_64-apple-macos11 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-MACOS11 %s
+! RUN: %flang -target x86_64-apple-darwin20 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-MACOS11 %s
+! RUN: %flang -target x86_64-apple-darwin -mmacos-version-min=11 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-MACOS11 %s
+! CHECK-MACOS11: "x86_64-apple-macosx11.0.0"
+
+! RUN: %flang -target arm64-apple-macosx10.16 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-IMPLICIT-MACOS11 %s
+! CHECK-IMPLICIT-MACOS11: warning: overriding deployment version
+! CHECK-IMPLICIT-MACOS11: "arm64-apple-macosx11.0.0"
+
+! RUN: %flang -target arm64-apple-macos999 -c %s -### 2>&1 | \
+! RUN:   FileCheck --check-prefix=CHECK-MACOS999 %s
+
+! CHECK-MACOS999: "arm64-apple-macosx999.0.0"

From 9797b5fcfbb9b9c96a219985f3623849bbd3956e Mon Sep 17 00:00:00 2001
From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com>
Date: Wed, 11 Jun 2025 10:35:06 +0100
Subject: [PATCH 050/851] [C++20][Modules] Fix false compilation error with
 constexpr (#143168)

Use declaresSameEntity when evaluating constexpr to avoid resetting
computed union value due to using different instances of the merged
field decl.
---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/lib/AST/ExprConstant.cpp                |  3 +-
 .../constexpr-initialization-failure.cpp      | 44 +++++++++++++++++++
 3 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Modules/constexpr-initialization-failure.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 5645edc73431b..b5e6cf088a4b1 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -693,6 +693,7 @@ Bug Fixes in This Version
 - Fixed type mismatch error when 'builtin-elementwise-math' arguments have different qualifiers, this should be well-formed. (#GH141397)
 - Constant evaluation now correctly runs the destructor of a variable declared in
   the second clause of a C-style ``for`` loop. (#GH139818)
+- Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index fa4e10e84de05..27ea55e981446 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -6844,7 +6844,8 @@ static bool HandleConstructorCall(const Expr *E, const LValue &This,
         // FIXME: In this case, the values of the other subobjects are
         // specified, since zero-initialization sets all padding bits to zero.
         if (!Value->hasValue() ||
-            (Value->isUnion() && Value->getUnionField() != FD)) {
+            (Value->isUnion() &&
+             !declaresSameEntity(Value->getUnionField(), FD))) {
           if (CD->isUnion())
             *Value = APValue(FD);
           else
diff --git a/clang/test/Modules/constexpr-initialization-failure.cpp b/clang/test/Modules/constexpr-initialization-failure.cpp
new file mode 100644
index 0000000000000..8ff20f2fc8ac6
--- /dev/null
+++ b/clang/test/Modules/constexpr-initialization-failure.cpp
@@ -0,0 +1,44 @@
+// RUN: rm -fR %t
+// RUN: split-file %s %t
+// RUN: cd %t
+// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-name=h1.h -emit-header-unit -xc++-user-header h1.h -o h1.pcm
+// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-map-file=module.modulemap -fmodule-file=h1.h=h1.pcm main.cpp -o main.o
+
+//--- module.modulemap
+module "h1.h" {
+  header "h1.h"
+  export *
+}
+
+//--- h0.h
+// expected-no-diagnostics
+#pragma once
+
+template <typename T> struct A {
+  union {
+    struct {
+      T x, y, z;
+    };
+  };
+  constexpr A(T, T, T) : x(), y(), z() {}
+};
+typedef A<float> packed_vec3;
+
+//--- h1.h
+// expected-no-diagnostics
+#pragma once
+
+#include "h0.h"
+
+constexpr packed_vec3 kMessThingsUp = packed_vec3(5.0f, 5.0f, 5.0f);
+
+//--- main.cpp
+// expected-no-diagnostics
+#include "h0.h"
+
+static_assert(sizeof(packed_vec3) == sizeof(float) * 3);
+static_assert(alignof(packed_vec3) == sizeof(float));
+
+import "h1.h";
+
+constexpr packed_vec3 kDefaultHalfExtents = packed_vec3(5.0f, 5.0f, 5.0f);

From c59cc2b690b9e528a82ba214f74a8f7c8abb3cde Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 11:43:34 +0200
Subject: [PATCH 051/851] [libunwind] Remove checks for -nostdlib++ (#143162)

libunwind uses a C linker, so it's never even trying to link against any
C++ libraries. This removes the code which tries to drop C++ libraries,
which makes the CMake configuration simpler and allows for upgrading
GCC.
---
 libunwind/cmake/config-ix.cmake | 56 ---------------------------------
 libunwind/src/CMakeLists.txt    | 12 -------
 2 files changed, 68 deletions(-)

diff --git a/libunwind/cmake/config-ix.cmake b/libunwind/cmake/config-ix.cmake
index 126c872f0d489..d42ceffb1f631 100644
--- a/libunwind/cmake/config-ix.cmake
+++ b/libunwind/cmake/config-ix.cmake
@@ -26,62 +26,6 @@ if (NOT LIBUNWIND_USE_COMPILER_RT)
   endif ()
 endif()
 
-# libunwind is using -nostdlib++ at the link step when available,
-# otherwise -nodefaultlibs is used. We want all our checks to also
-# use one of these options, otherwise we may end up with an inconsistency between
-# the flags we think we require during configuration (if the checks are
-# performed without one of those options) and the flags that are actually
-# required during compilation (which has the -nostdlib++ or -nodefaultlibs). libc is
-# required for the link to go through. We remove sanitizers from the
-# configuration checks to avoid spurious link errors.
-
-llvm_check_compiler_linker_flag(CXX "-nostdlib++" CXX_SUPPORTS_NOSTDLIBXX_FLAG)
-if (CXX_SUPPORTS_NOSTDLIBXX_FLAG)
-  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nostdlib++")
-else()
-  llvm_check_compiler_linker_flag(C "-nodefaultlibs" C_SUPPORTS_NODEFAULTLIBS_FLAG)
-  if (C_SUPPORTS_NODEFAULTLIBS_FLAG)
-    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nodefaultlibs")
-  endif()
-endif()
-
-# Only link against compiler-rt manually if we use -nodefaultlibs, since
-# otherwise the compiler will do the right thing on its own.
-if (NOT CXX_SUPPORTS_NOSTDLIBXX_FLAG AND C_SUPPORTS_NODEFAULTLIBS_FLAG)
-  if (LIBUNWIND_HAS_C_LIB)
-    list(APPEND CMAKE_REQUIRED_LIBRARIES c)
-  endif ()
-  if (LIBUNWIND_HAS_ROOT_LIB)
-    list(APPEND CMAKE_REQUIRED_LIBRARIES root)
-  endif ()
-  if (LIBUNWIND_USE_COMPILER_RT)
-    include(HandleCompilerRT)
-    find_compiler_rt_library(builtins LIBUNWIND_BUILTINS_LIBRARY
-                             FLAGS ${LIBUNWIND_COMPILE_FLAGS})
-    list(APPEND CMAKE_REQUIRED_LIBRARIES "${LIBUNWIND_BUILTINS_LIBRARY}")
-  else ()
-    if (LIBUNWIND_HAS_GCC_S_LIB)
-      list(APPEND CMAKE_REQUIRED_LIBRARIES gcc_s)
-    endif ()
-    if (LIBUNWIND_HAS_GCC_LIB)
-      list(APPEND CMAKE_REQUIRED_LIBRARIES gcc)
-    endif ()
-  endif ()
-  if (MINGW)
-    # Mingw64 requires quite a few "C" runtime libraries in order for basic
-    # programs to link successfully with -nodefaultlibs.
-    if (LIBUNWIND_USE_COMPILER_RT)
-      set(MINGW_RUNTIME ${LIBUNWIND_BUILTINS_LIBRARY})
-    else ()
-      set(MINGW_RUNTIME gcc_s gcc)
-    endif()
-    set(MINGW_LIBRARIES mingw32 ${MINGW_RUNTIME} moldname mingwex msvcrt advapi32
-                        shell32 user32 kernel32 mingw32 ${MINGW_RUNTIME}
-                        moldname mingwex msvcrt)
-    list(APPEND CMAKE_REQUIRED_LIBRARIES ${MINGW_LIBRARIES})
-  endif()
-endif()
-
 if (CXX_SUPPORTS_NOSTDLIBXX_FLAG OR C_SUPPORTS_NODEFAULTLIBS_FLAG)
   if (CMAKE_C_FLAGS MATCHES -fsanitize OR CMAKE_CXX_FLAGS MATCHES -fsanitize)
     set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -fno-sanitize=all")
diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt
index 70bd3a017cda7..03818b1bb2512 100644
--- a/libunwind/src/CMakeLists.txt
+++ b/libunwind/src/CMakeLists.txt
@@ -71,18 +71,6 @@ set(LIBUNWIND_SOURCES
     ${LIBUNWIND_ASM_SOURCES})
 
 # Generate library list.
-if (CXX_SUPPORTS_NOSTDLIBXX_FLAG)
-  add_link_flags_if_supported(-nostdlib++)
-else()
-  if (LIBUNWIND_USE_COMPILER_RT)
-    add_library_flags("${LIBUNWIND_BUILTINS_LIBRARY}")
-  else()
-    add_library_flags_if(LIBUNWIND_HAS_GCC_S_LIB gcc_s)
-    add_library_flags_if(LIBUNWIND_HAS_GCC_LIB gcc)
-  endif()
-  add_library_flags_if(LIBUNWIND_HAS_C_LIB c)
-endif()
-
 if (NOT APPLE)
   add_library_flags_if(LIBUNWIND_HAS_DL_LIB dl)
 endif()

From ea9046699eae04ac5159a1666f19b5b32e5d41c1 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Wed, 11 Jun 2025 11:02:32 +0100
Subject: [PATCH 052/851] [LLVM][SROA] Teach SROA how to "bitcast" between
 fixed and scalable vectors. (#130973)

For function whose vscale_range is limited to a single value we can size
scalable vectors. This aids SROA by allowing scalable vector load and
store operations to be considered for replacement whereby bitcasts
through memory can be replaced by vector insert or extract operations.
---
 .../CodeGen/attr-arm-sve-vector-bits-cast.c   |  23 +-
 llvm/include/llvm/IR/Function.h               |   4 +
 llvm/lib/IR/Function.cpp                      |  12 +
 llvm/lib/Transforms/Scalar/SROA.cpp           | 165 +++++++--
 .../scalable-vectors-with-known-vscale.ll     | 349 ++++++++++++++++++
 llvm/test/Transforms/SROA/scalable-vectors.ll | 223 ++++++++++-
 6 files changed, 721 insertions(+), 55 deletions(-)
 create mode 100644 llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll

diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
index e1e2220f94d6d..fcd4314249ff8 100644
--- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
+++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c
@@ -62,10 +62,7 @@ fixed_bool_t from_svbool_t(svbool_t type) {
 
 // CHECK-LABEL: @lax_cast(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SAVED_VALUE:%.*]] = alloca <16 x i32>, align 64
-// CHECK-NEXT:    [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <16 x i32> [[TYPE]], ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6:![0-9]+]]
-// CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 2 x i64>, ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <vscale x 4 x i32> [[TYPE_COERCE:%.*]] to <vscale x 2 x i64>
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
 svint64_t lax_cast(fixed_int32_t type) {
@@ -74,9 +71,9 @@ svint64_t lax_cast(fixed_int32_t type) {
 
 // CHECK-LABEL: @to_svint32_t__from_gnu_int32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) {
   return type;
@@ -84,8 +81,8 @@ svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) {
 
 // CHECK-LABEL: @from_svint32_t__to_gnu_int32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CASTFIXEDSVE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE:%.*]], i64 0)
-// CHECK-NEXT:    store <16 x i32> [[CASTFIXEDSVE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT:    [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE:%.*]], i64 0)
+// CHECK-NEXT:    store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) {
@@ -94,9 +91,9 @@ gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) {
 
 // CHECK-LABEL: @to_fixed_int32_t__from_gnu_int32_t(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]]
-// CHECK-NEXT:    [[CASTSCALABLESVE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0)
-// CHECK-NEXT:    ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
+// CHECK-NEXT:    [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2]]
+// CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TYPE]], i64 0)
+// CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
 //
 fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) {
   return type;
@@ -105,7 +102,7 @@ fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) {
 // CHECK-LABEL: @from_fixed_int32_t__to_gnu_int32_t(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE_COERCE:%.*]], i64 0)
-// CHECK-NEXT:    store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]]
+// CHECK-NEXT:    store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]]
 // CHECK-NEXT:    ret void
 //
 gnu_int32_t from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) {
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index c2510ea75544a..f24d03635731e 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -1053,6 +1053,10 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
   /// defined.
   void setAlignment(MaybeAlign Align) { GlobalObject::setAlignment(Align); }
 
+  /// Return the value for vscale based on the vscale_range attribute or 0 when
+  /// unknown.
+  unsigned getVScaleValue() const;
+
 private:
   void allocHungoffUselist();
   template<int Idx> void setHungoffOperand(Constant *C);
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 63665d837c398..493dec72d45af 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1165,6 +1165,18 @@ bool Function::nullPointerIsDefined() const {
   return hasFnAttribute(Attribute::NullPointerIsValid);
 }
 
+unsigned Function::getVScaleValue() const {
+  Attribute Attr = getFnAttribute(Attribute::VScaleRange);
+  if (!Attr.isValid())
+    return 0;
+
+  unsigned VScale = Attr.getVScaleRangeMin();
+  if (VScale && VScale == Attr.getVScaleRangeMax())
+    return VScale;
+
+  return 0;
+}
+
 bool llvm::NullPointerIsDefined(const Function *F, unsigned AS) {
   if (F && F->nullPointerIsDefined())
     return true;
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index a4e373d395b90..42d1d9a437bb2 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1120,8 +1120,13 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
       return PI.setEscapedReadOnly(&LI);
 
     TypeSize Size = DL.getTypeStoreSize(LI.getType());
-    if (Size.isScalable())
-      return PI.setAborted(&LI);
+    if (Size.isScalable()) {
+      unsigned VScale = LI.getFunction()->getVScaleValue();
+      if (!VScale)
+        return PI.setAborted(&LI);
+
+      Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
+    }
 
     return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
                              LI.isVolatile());
@@ -1135,8 +1140,13 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
       return PI.setAborted(&SI);
 
     TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
-    if (StoreSize.isScalable())
-      return PI.setAborted(&SI);
+    if (StoreSize.isScalable()) {
+      unsigned VScale = SI.getFunction()->getVScaleValue();
+      if (!VScale)
+        return PI.setAborted(&SI);
+
+      StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
+    }
 
     uint64_t Size = StoreSize.getFixedValue();
 
@@ -1927,7 +1937,8 @@ static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
 /// ensure that we only try to convert viable values. The strategy is that we
 /// will peel off single element struct and array wrappings to get to an
 /// underlying value, and convert that value.
-static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
+                            unsigned VScale = 0) {
   if (OldTy == NewTy)
     return true;
 
@@ -1941,8 +1952,35 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
     return false;
   }
 
-  if (DL.getTypeSizeInBits(NewTy).getFixedValue() !=
-      DL.getTypeSizeInBits(OldTy).getFixedValue())
+  TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
+  TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
+
+  if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
+      (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
+    // Conversion is only possible when the size of scalable vectors is known.
+    if (!VScale)
+      return false;
+
+    // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
+    // a single domain (either fixed or scalable). Any additional conversion
+    // between fixed and scalable types is handled through integer types.
+    auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
+    auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
+
+    if (isa<ScalableVectorType>(NewTy)) {
+      if (!VectorType::getWithSizeAndScalar(cast<VectorType>(NewVTy), OldVTy))
+        return false;
+
+      NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
+    } else {
+      if (!VectorType::getWithSizeAndScalar(cast<VectorType>(OldVTy), NewVTy))
+        return false;
+
+      OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
+    }
+  }
+
+  if (NewSize != OldSize)
     return false;
   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
     return false;
@@ -1992,7 +2030,14 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                            Type *NewTy) {
   Type *OldTy = V->getType();
-  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+#ifndef NDEBUG
+  BasicBlock *BB = IRB.GetInsertBlock();
+  assert(BB && BB->getParent() && "VScale unknown!");
+  unsigned VScale = BB->getParent()->getVScaleValue();
+  assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
+         "Value not convertable to type");
+#endif
 
   if (OldTy == NewTy)
     return V;
@@ -2000,13 +2045,41 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
   assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
          "Integer types must be the exact same to convert.");
 
+  // A variant of bitcast that supports a mixture of fixed and scalable types
+  // that are know to have the same size.
+  auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
+    Type *InTy = In->getType();
+    if (InTy == Ty)
+      return In;
+
+    if (isa<FixedVectorType>(InTy) && isa<ScalableVectorType>(Ty)) {
+      // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
+      //   <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
+      auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(Ty), InTy);
+      return IRB.CreateBitCast(IRB.CreateInsertVector(VTy,
+                                                      PoisonValue::get(VTy), In,
+                                                      IRB.getInt64(0)),
+                               Ty);
+    }
+
+    if (isa<ScalableVectorType>(InTy) && isa<FixedVectorType>(Ty)) {
+      // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
+      //   <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
+      auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(InTy), Ty);
+      return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy),
+                                     IRB.getInt64(0));
+    }
+
+    return IRB.CreateBitCast(In, Ty);
+  };
+
   // See if we need inttoptr for this type pair. May require additional bitcast.
   if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
     // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
     // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
     // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
     // Directly handle i64 to i8*
-    return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+    return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
                               NewTy);
   }
 
@@ -2016,7 +2089,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
     // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
     // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
     // Expand i8* to i64 --> i8* to i64 to i64
-    return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+    return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
                              NewTy);
   }
 
@@ -2031,12 +2104,14 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
     // size.
     if (OldAS != NewAS) {
       assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
-      return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
-                                NewTy);
+      return IRB.CreateIntToPtr(
+          CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                            DL.getIntPtrType(NewTy)),
+          NewTy);
     }
   }
 
-  return IRB.CreateBitCast(V, NewTy);
+  return CreateBitCastLike(V, NewTy);
 }
 
 /// Test whether the given slice use can be promoted to a vector.
@@ -2046,7 +2121,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
                                             VectorType *Ty,
                                             uint64_t ElementSize,
-                                            const DataLayout &DL) {
+                                            const DataLayout &DL,
+                                            unsigned VScale) {
   // First validate the slice offsets.
   uint64_t BeginOffset =
       std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
@@ -2090,7 +2166,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
       assert(LTy->isIntegerTy());
       LTy = SplitIntTy;
     }
-    if (!canConvertValue(DL, SliceTy, LTy))
+    if (!canConvertValue(DL, SliceTy, LTy, VScale))
       return false;
   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
     if (SI->isVolatile())
@@ -2103,7 +2179,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
       assert(STy->isIntegerTy());
       STy = SplitIntTy;
     }
-    if (!canConvertValue(DL, STy, SliceTy))
+    if (!canConvertValue(DL, STy, SliceTy, VScale))
       return false;
   } else {
     return false;
@@ -2118,7 +2194,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
 /// (and thus isVectorPromotionViable) over all slices of the alloca for the
 /// given VectorType.
 static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
-                                        const DataLayout &DL) {
+                                        const DataLayout &DL, unsigned VScale) {
   uint64_t ElementSize =
       DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
 
@@ -2131,11 +2207,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
   ElementSize /= 8;
 
   for (const Slice &S : P)
-    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+    if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
       return false;
 
   for (const Slice *S : P.splitSliceTails())
-    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+    if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
       return false;
 
   return true;
@@ -2150,7 +2226,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
                              SmallVectorImpl<VectorType *> &CandidateTys,
                              bool HaveCommonEltTy, Type *CommonEltTy,
                              bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
-                             VectorType *CommonVecPtrTy) {
+                             VectorType *CommonVecPtrTy, unsigned VScale) {
   // If we didn't find a vector type, nothing to do here.
   if (CandidateTys.empty())
     return nullptr;
@@ -2226,7 +2302,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
   });
 
   for (VectorType *VTy : CandidateTys)
-    if (checkVectorTypeForPromotion(P, VTy, DL))
+    if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
       return VTy;
 
   return nullptr;
@@ -2237,7 +2313,7 @@ static VectorType *createAndCheckVectorTypesForPromotion(
     function_ref<void(Type *)> CheckCandidateType, Partition &P,
     const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
     bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
-    bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) {
+    bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
   [[maybe_unused]] VectorType *OriginalElt =
       CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
   // Consider additional vector types where the element type size is a
@@ -2262,9 +2338,9 @@ static VectorType *createAndCheckVectorTypesForPromotion(
     }
   }
 
-  return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy,
-                                      CommonEltTy, HaveVecPtrTy,
-                                      HaveCommonVecPtrTy, CommonVecPtrTy);
+  return checkVectorTypesForPromotion(
+      P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
+      HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
 }
 
 /// Test whether the given alloca partitioning and range of slices can be
@@ -2276,7 +2352,8 @@ static VectorType *createAndCheckVectorTypesForPromotion(
 /// SSA value. We only can ensure this for a limited set of operations, and we
 /// don't want to do the rewrites unless we are confident that the result will
 /// be promotable, so we have an early test here.
-static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
+                                           unsigned VScale) {
   // Collect the candidate types for vector-based promotion. Also track whether
   // we have different element types.
   SmallVector<VectorType *, 4> CandidateTys;
@@ -2288,7 +2365,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
   bool HaveCommonEltTy = true;
   bool HaveCommonVecPtrTy = true;
   auto CheckCandidateType = [&](Type *Ty) {
-    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
       // Return if bitcast to vectors is different for total size in bits.
       if (!CandidateTys.empty()) {
         VectorType *V = CandidateTys[0];
@@ -2343,14 +2420,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
   if (auto *VTy = createAndCheckVectorTypesForPromotion(
           LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
           CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
-          HaveCommonVecPtrTy, CommonVecPtrTy))
+          HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
     return VTy;
 
   CandidateTys.clear();
   return createAndCheckVectorTypesForPromotion(
       DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
       HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
-      CommonVecPtrTy);
+      CommonVecPtrTy, VScale);
 }
 
 /// Test whether a slice of an alloca is valid for integer widening.
@@ -2387,7 +2464,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
     if (LI->isVolatile())
       return false;
     // We can't handle loads that extend past the allocated memory.
-    if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size)
+    TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
+    if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
       return false;
     // So far, AllocaSliceRewriter does not support widening split slice tails
     // in rewriteIntegerLoad.
@@ -2412,7 +2490,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
     if (SI->isVolatile())
       return false;
     // We can't handle stores that extend past the allocated memory.
-    if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size)
+    TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
+    if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
       return false;
     // So far, AllocaSliceRewriter does not support widening split slice tails
     // in rewriteIntegerStore.
@@ -2885,8 +2964,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
 
     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                              : LI.getType();
-    const bool IsLoadPastEnd =
-        DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize;
     bool IsPtrAdjusted = false;
     Value *V;
     if (VecTy) {
@@ -2896,8 +2973,9 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     } else if (NewBeginOffset == NewAllocaBeginOffset &&
                NewEndOffset == NewAllocaEndOffset &&
                (canConvertValue(DL, NewAllocaTy, TargetTy) ||
-                (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
-                 TargetTy->isIntegerTy() && !LI.isVolatile()))) {
+                (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
+                 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
+                 !LI.isVolatile()))) {
       Value *NewPtr =
           getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
       LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
@@ -3070,7 +3148,8 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
       if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
         Pass.PostPromotionWorklist.insert(AI);
 
-    if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) {
+    TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
+    if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
       assert(!SI.isVolatile());
       assert(V->getType()->isIntegerTy() &&
              "Only integer type loads and stores are split");
@@ -4846,14 +4925,18 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   Type *SliceTy = nullptr;
   VectorType *SliceVecTy = nullptr;
   const DataLayout &DL = AI.getDataLayout();
+  unsigned VScale = AI.getFunction()->getVScaleValue();
+
   std::pair<Type *, IntegerType *> CommonUseTy =
       findCommonType(P.begin(), P.end(), P.endOffset());
   // Do all uses operate on the same type?
-  if (CommonUseTy.first)
-    if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) {
+  if (CommonUseTy.first) {
+    TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
+    if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
       SliceTy = CommonUseTy.first;
       SliceVecTy = dyn_cast<VectorType>(SliceTy);
     }
+  }
   // If not, can we find an appropriate subtype in the original allocated type?
   if (!SliceTy)
     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
@@ -4874,12 +4957,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
 
   // If the common use types are not viable for promotion then attempt to find
   // another type that is viable.
-  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL))
+  if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                  P.beginOffset(), P.size())) {
       VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
       if (TypePartitionVecTy &&
-          checkVectorTypeForPromotion(P, TypePartitionVecTy, DL))
+          checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
         SliceTy = TypePartitionTy;
     }
 
@@ -4890,7 +4973,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
   bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
 
   VectorType *VecTy =
-      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
   if (VecTy)
     SliceTy = VecTy;
 
diff --git a/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll
new file mode 100644
index 0000000000000..85715e406e065
--- /dev/null
+++ b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll
@@ -0,0 +1,349 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
+; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+; This test checks that SROA runs mem2reg on scalable vectors.
+
+define <vscale x 16 x i1> @alloca_nxv16i1(<vscale x 16 x i1> %pg) vscale_range(1) {
+; CHECK-LABEL: @alloca_nxv16i1(
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[PG:%.*]]
+;
+  %pg.addr = alloca <vscale x 16 x i1>
+  store <vscale x 16 x i1> %pg, ptr %pg.addr
+  %1 = load <vscale x 16 x i1>, ptr %pg.addr
+  ret <vscale x 16 x i1> %1
+}
+
+define <vscale x 16 x i8> @alloca_nxv16i8(<vscale x 16 x i8> %vec) vscale_range(1) {
+; CHECK-LABEL: @alloca_nxv16i8(
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[VEC:%.*]]
+;
+  %vec.addr = alloca <vscale x 16 x i8>
+  store <vscale x 16 x i8> %vec, ptr %vec.addr
+  %1 = load <vscale x 16 x i8>, ptr %vec.addr
+  ret <vscale x 16 x i8> %1
+}
+
+; Test scalable alloca that can't be promoted. Mem2Reg only considers
+; non-volatile loads and stores for promotion.
+define <vscale x 16 x i8> @unpromotable_alloca(<vscale x 16 x i8> %vec) vscale_range(1) {
+; CHECK-LABEL: @unpromotable_alloca(
+; CHECK-NEXT:    [[VEC_ADDR:%.*]] = alloca <vscale x 16 x i8>, align 16
+; CHECK-NEXT:    store volatile <vscale x 16 x i8> [[VEC:%.*]], ptr [[VEC_ADDR]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load volatile <vscale x 16 x i8>, ptr [[VEC_ADDR]], align 16
+; CHECK-NEXT:    ret <vscale x 16 x i8> [[TMP1]]
+;
+  %vec.addr = alloca <vscale x 16 x i8>
+  store volatile <vscale x 16 x i8> %vec, ptr %vec.addr
+  %1 = load volatile <vscale x 16 x i8>, ptr %vec.addr
+  ret <vscale x 16 x i8> %1
+}
+
+; Test we bail out when using an alloca of a fixed-length vector (VLS) that was
+; bitcasted to a scalable vector.
+define <vscale x 4 x i32> @cast_alloca_to_svint32_t(<vscale x 4 x i32> %type.coerce) vscale_range(1) {
+; CHECK-LABEL: @cast_alloca_to_svint32_t(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TYPE_COERCE:%.*]], i64 0)
+; CHECK-NEXT:    [[TYPE_0_VEC_EXPAND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TYPE_0_VECBLEND:%.*]] = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x i32> [[TYPE_0_VEC_EXPAND]], <16 x i32> undef
+; CHECK-NEXT:    [[TYPE_ADDR_0_VEC_EXTRACT:%.*]] = shufflevector <16 x i32> [[TYPE_0_VECBLEND]], <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> [[TYPE_ADDR_0_VEC_EXTRACT]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP2]]
+;
+  %type = alloca <16 x i32>
+  %type.addr = alloca <16 x i32>
+  store <vscale x 4 x i32> %type.coerce, ptr %type
+  %type1 = load <16 x i32>, ptr %type
+  store <16 x i32> %type1, ptr %type.addr
+  %1 = load <16 x i32>, ptr %type.addr
+  %2 = load <vscale x 4 x i32>, ptr %type.addr
+  ret <vscale x 4 x i32> %2
+}
+
+; When casting from VLA to VLS via memory check we bail out when producing a
+; GEP where the element type is a scalable vector.
+define <vscale x 4 x i32> @cast_alloca_from_svint32_t() vscale_range(1) {
+; CHECK-LABEL: @cast_alloca_from_svint32_t(
+; CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr [[RETVAL_COERCE]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[RETVAL_COERCE]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %retval = alloca <16 x i32>
+  store <16 x i32> zeroinitializer, ptr %retval
+  %retval.coerce = alloca <vscale x 4 x i32>
+  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false)
+  %1 = load <vscale x 4 x i32>, ptr %retval.coerce
+  ret <vscale x 4 x i32> %1
+}
+
+; Test we bail out when using an alloca of a fixed-length vector (VLS) that was
+; bitcasted to a scalable vector.
+define void @select_load_alloca_to_svdouble_t() vscale_range(1) {
+; CHECK-LABEL: @select_load_alloca_to_svdouble_t(
+; CHECK-NEXT:    [[Z:%.*]] = alloca <16 x half>, align 32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 0, 0
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null
+; CHECK-NEXT:    [[VAL:%.*]] = load <vscale x 2 x double>, ptr [[COND]], align 16
+; CHECK-NEXT:    ret void
+;
+  %z = alloca <16 x half>
+  %cmp = icmp eq i32 0, 0
+  %cond = select i1 %cmp, ptr %z, ptr null
+  %val = load <vscale x 2 x double>, ptr %cond, align 16
+  ret void
+}
+
+define void @select_store_alloca_to_svdouble_t(<vscale x 2 x double> %val) vscale_range(1) {
+; CHECK-LABEL: @select_store_alloca_to_svdouble_t(
+; CHECK-NEXT:    [[Z:%.*]] = alloca <16 x half>, align 32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 0, 0
+; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null
+; CHECK-NEXT:    store <vscale x 2 x double> [[VAL:%.*]], ptr [[COND]], align 16
+; CHECK-NEXT:    ret void
+;
+  %z = alloca <16 x half>
+  %cmp = icmp eq i32 0, 0
+  %cond = select i1 %cmp, ptr %z, ptr null
+  store <vscale x 2 x double> %val, ptr %cond, align 16
+  ret void
+}
+
+define <4 x i32> @fixed_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[A:%.*]], i64 0)
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %tmp = alloca <4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast(<vscale x 16 x i1> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 16 x i1> [[A:%.*]] to <vscale x 2 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <2 x i8> [[TMP2]]
+;
+  %tmp = alloca <2 x i8>
+  store <vscale x 16 x i1> %a, ptr %tmp
+  %cast = load <2 x i8>, ptr %tmp
+  ret <2 x i8> %cast
+}
+
+define <2 x ptr> @fixed_alloca_fixed_from_scalable_inttoptr(<vscale x 4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_inttoptr(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <vscale x 4 x i32> [[A:%.*]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP3]] to <2 x ptr>
+; CHECK-NEXT:    ret <2 x ptr> [[TMP2]]
+;
+  %tmp = alloca <4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <2 x ptr>, ptr %tmp
+  ret <2 x ptr> %cast
+}
+
+define <4 x i32> @fixed_alloca_fixed_from_scalable_ptrtoint(<vscale x 2 x ptr> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoint(
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <vscale x 2 x ptr> [[A:%.*]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i64> [[TMP1]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    ret <4 x i32> [[TMP_0_CAST]]
+;
+  %tmp = alloca <4 x i32>
+  store <vscale x 2 x ptr> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr(<vscale x 2 x ptr> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr(
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.nxv2p0(<vscale x 2 x ptr> [[A:%.*]], i64 0)
+; CHECK-NEXT:    ret <2 x ptr> [[TMP_0_CAST]]
+;
+  %tmp = alloca <2 x ptr>
+  store <vscale x 2 x ptr> %a, ptr %tmp
+  %cast = load <2 x ptr>, ptr %tmp
+  ret <2 x ptr> %cast
+}
+
+define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr_different_addrspace(<vscale x 2 x ptr addrspace(1)> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr_different_addrspace(
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <vscale x 2 x ptr addrspace(1)> [[A:%.*]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64(<vscale x 2 x i64> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x ptr>
+; CHECK-NEXT:    ret <2 x ptr> [[TMP3]]
+;
+  %tmp = alloca <2 x ptr>
+  store <vscale x 2 x ptr addrspace(1)> %a, ptr %tmp
+  %cast = load <2 x ptr>, ptr %tmp
+  ret <2 x ptr> %cast
+}
+
+define <vscale x 4 x i32> @fixed_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> [[A:%.*]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %tmp = alloca <4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define <vscale x 16 x i1> @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> poison, <2 x i8> [[A:%.*]], i64 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[TMP1]] to <vscale x 16 x i1>
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+;
+  %tmp = alloca <2 x i8>
+  store <2 x i8> %a, ptr %tmp
+  %cast = load <vscale x 16 x i1>, ptr %tmp
+  ret <vscale x 16 x i1> %cast
+}
+
+define <vscale x 2 x ptr> @fixed_alloca_scalable_from_fixed_inttoptr(<4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_inttoptr(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> [[A:%.*]], i64 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <vscale x 4 x i32> [[TMP1]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = inttoptr <vscale x 2 x i64> [[TMP2]] to <vscale x 2 x ptr>
+; CHECK-NEXT:    ret <vscale x 2 x ptr> [[TMP_0_CAST]]
+;
+  %tmp = alloca <4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 2 x ptr>, ptr %tmp
+  ret <vscale x 2 x ptr> %cast
+}
+
+define <vscale x 4 x i32> @fixed_alloca_scalable_from_fixed_ptrtoint(<2 x ptr> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoint(
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <2 x ptr> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = bitcast <vscale x 2 x i64> [[TMP2]] to <vscale x 4 x i32>
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP_0_CAST]]
+;
+  %tmp = alloca <4 x i32>
+  store <2 x ptr> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define <vscale x 2 x ptr> @fixed_alloca_scalable_from_fixed_ptrtoptr(<2 x ptr> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr(
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = call <vscale x 2 x ptr> @llvm.vector.insert.nxv2p0.v2p0(<vscale x 2 x ptr> poison, <2 x ptr> [[A:%.*]], i64 0)
+; CHECK-NEXT:    ret <vscale x 2 x ptr> [[TMP_0_CAST]]
+;
+  %tmp = alloca <2 x ptr>
+  store <2 x ptr> %a, ptr %tmp
+  %cast = load <vscale x 2 x ptr>, ptr %tmp
+  ret <vscale x 2 x ptr> %cast
+}
+
+define <vscale x 2 x ptr addrspace(1)> @fixed_alloca_scalable_from_fixed_ptrtoptr_different_addrspace(<2 x ptr> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr_different_addrspace(
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint <2 x ptr> [[A:%.*]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = inttoptr <vscale x 2 x i64> [[TMP2]] to <vscale x 2 x ptr addrspace(1)>
+; CHECK-NEXT:    ret <vscale x 2 x ptr addrspace(1)> [[TMP3]]
+;
+  %tmp = alloca <2 x ptr>
+  store <2 x ptr> %a, ptr %tmp
+  %cast = load <vscale x 2 x ptr addrspace(1)>, ptr %tmp
+  ret <vscale x 2 x ptr addrspace(1)> %cast
+}
+
+define <4 x i32> @scalable_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @scalable_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <4 x i32> [[CAST]]
+;
+  %tmp = alloca <vscale x 4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <vscale x 4 x i32> @scalable_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @scalable_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <vscale x 4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST]]
+;
+  %tmp = alloca <vscale x 4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define i16 @scalar_alloca_scalar_from_scalable(<vscale x 16 x i1> %a) vscale_range(1) {
+; CHECK-LABEL: @scalar_alloca_scalar_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    store <vscale x 16 x i1> [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret i16 [[TMP_0_CAST]]
+;
+  %tmp = alloca i16
+  store <vscale x 16 x i1> %a, ptr %tmp
+  %cast = load i16, ptr %tmp
+  ret i16 %cast
+}
+
+define <vscale x 16 x i1> @scalar_alloca_scalable_from_scalar(i16 %a) vscale_range(1) {
+; CHECK-LABEL: @scalar_alloca_scalable_from_scalar(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    store i16 [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 16 x i1>, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP_0_CAST]]
+;
+  %tmp = alloca i16
+  store i16 %a, ptr %tmp
+  %cast = load <vscale x 16 x i1>, ptr %tmp
+  ret <vscale x 16 x i1> %cast
+}
+
+define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 8
+; CHECK-NEXT:    [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[TMP]], align 8
+; CHECK-NEXT:    [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0
+; CHECK-NEXT:    [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 8
+; CHECK-NEXT:    [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8
+; CHECK-NEXT:    [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1
+; CHECK-NEXT:    ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]]
+;
+  %tmp = alloca { <2 x i32>, <2 x i32> }
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp
+  ret { <2 x i32>, <2 x i32> } %cast
+}
+
+define <vscale x 4 x i64> @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) vscale_range(1) {
+; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16
+; CHECK-NEXT:    [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0
+; CHECK-NEXT:    store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1
+; CHECK-NEXT:    [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 16
+; CHECK-NEXT:    store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 4 x i64>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP_0_CAST]]
+;
+  %tmp = alloca { <2 x ptr>, <2 x ptr> }
+  store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp
+  %cast = load <vscale x 4 x i64>, ptr %tmp
+  ret <vscale x 4 x i64> %cast
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-MODIFY-CFG: {{.*}}
+; CHECK-PRESERVE-CFG: {{.*}}
diff --git a/llvm/test/Transforms/SROA/scalable-vectors.ll b/llvm/test/Transforms/SROA/scalable-vectors.ll
index d892883ce9dc3..346814d9f630e 100644
--- a/llvm/test/Transforms/SROA/scalable-vectors.ll
+++ b/llvm/test/Transforms/SROA/scalable-vectors.ll
@@ -2,6 +2,8 @@
 ; RUN: opt < %s -passes='sroa<preserve-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG
 ; RUN: opt < %s -passes='sroa<modify-cfg>' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG
 
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
 ; This test checks that SROA runs mem2reg on scalable vectors.
 
 define <vscale x 16 x i1> @alloca_nxv16i1(<vscale x 16 x i1> %pg) {
@@ -67,11 +69,12 @@ define <vscale x 4 x i32> @cast_alloca_to_svint32_t(<vscale x 4 x i32> %type.coe
 define <vscale x 4 x i32> @cast_alloca_from_svint32_t() {
 ; CHECK-LABEL: @cast_alloca_from_svint32_t(
 ; CHECK-NEXT:    [[RETVAL_COERCE:%.*]] = alloca <vscale x 4 x i32>, align 16
-; CHECK-NEXT:    store <16 x i32> undef, ptr [[RETVAL_COERCE]], align 16
+; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr [[RETVAL_COERCE]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[RETVAL_COERCE]], align 16
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
 ;
   %retval = alloca <16 x i32>
+  store <16 x i32> zeroinitializer, ptr %retval
   %retval.coerce = alloca <vscale x 4 x i32>
   call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false)
   %1 = load <vscale x 4 x i32>, ptr %retval.coerce
@@ -110,6 +113,224 @@ define void @select_store_alloca_to_svdouble_t(<vscale x 2 x double> %val) {
   ret void
 }
 
+define <4 x i32> @fixed_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %tmp = alloca <4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast(<vscale x 16 x i1> %a) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i8>, align 2
+; CHECK-NEXT:    store <vscale x 16 x i1> [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret <2 x i8> [[TMP2]]
+;
+  %tmp = alloca <2 x i8>
+  store <vscale x 16 x i1> %a, ptr %tmp
+  %cast = load <2 x i8>, ptr %tmp
+  ret <2 x i8> %cast
+}
+
+define <2 x ptr> @fixed_alloca_fixed_from_scalable_inttoptr(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_inttoptr(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x ptr>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <2 x ptr> [[TMP2]]
+;
+  %tmp = alloca <4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <2 x ptr>, ptr %tmp
+  ret <2 x ptr> %cast
+}
+
+define <4 x i32> @fixed_alloca_fixed_from_scalable_ptrtoint(<vscale x 2 x ptr> %a) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoint(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    store <vscale x 2 x ptr> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <4 x i32> [[TMP_0_CAST]]
+;
+  %tmp = alloca <4 x i32>
+  store <vscale x 2 x ptr> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr(<vscale x 2 x ptr> %a) {
+; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x ptr>, align 16
+; CHECK-NEXT:    store <vscale x 2 x ptr> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <2 x ptr>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <2 x ptr> [[CAST]]
+;
+  %tmp = alloca <2 x ptr>
+  store <vscale x 2 x ptr> %a, ptr %tmp
+  %cast = load <2 x ptr>, ptr %tmp
+  ret <2 x ptr> %cast
+}
+
+define <vscale x 4 x i32> @fixed_alloca_scalable_from_fixed(<4 x i32> %a) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %tmp = alloca <4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define <vscale x 16 x i1> @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x i8>, align 2
+; CHECK-NEXT:    store <2 x i8> [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+;
+  %tmp = alloca <2 x i8>
+  store <2 x i8> %a, ptr %tmp
+  %cast = load <vscale x 16 x i1>, ptr %tmp
+  ret <vscale x 16 x i1> %cast
+}
+
+define <vscale x 2 x ptr> @fixed_alloca_scalable_from_fixed_inttoptr(<4 x i32> %a) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_inttoptr(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 2 x ptr> [[TMP_0_CAST]]
+;
+  %tmp = alloca <4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 2 x ptr>, ptr %tmp
+  ret <vscale x 2 x ptr> %cast
+}
+
+define <vscale x 4 x i32> @fixed_alloca_scalable_from_fixed_ptrtoint(<2 x ptr> %a) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoint(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT:    store <2 x ptr> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP_0_CAST]]
+;
+  %tmp = alloca <4 x i32>
+  store <2 x ptr> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define <vscale x 2 x ptr> @fixed_alloca_scalable_from_fixed_ptrtoptr(<2 x ptr> %a) {
+; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <2 x ptr>, align 16
+; CHECK-NEXT:    store <2 x ptr> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <vscale x 2 x ptr>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 2 x ptr> [[CAST]]
+;
+  %tmp = alloca <2 x ptr>
+  store <2 x ptr> %a, ptr %tmp
+  %cast = load <vscale x 2 x ptr>, ptr %tmp
+  ret <vscale x 2 x ptr> %cast
+}
+
+define <4 x i32> @scalable_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @scalable_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <4 x i32> [[CAST]]
+;
+  %tmp = alloca <vscale x 4 x i32>
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load <4 x i32>, ptr %tmp
+  ret <4 x i32> %cast
+}
+
+define <vscale x 4 x i32> @scalable_alloca_scalable_from_fixed(<4 x i32> %a) {
+; CHECK-LABEL: @scalable_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST:%.*]] = load <vscale x 4 x i32>, ptr [[TMP]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST]]
+;
+  %tmp = alloca <vscale x 4 x i32>
+  store <4 x i32> %a, ptr %tmp
+  %cast = load <vscale x 4 x i32>, ptr %tmp
+  ret <vscale x 4 x i32> %cast
+}
+
+define i16 @scalar_alloca_scalar_from_scalable(<vscale x 16 x i1> %a) {
+; CHECK-LABEL: @scalar_alloca_scalar_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    store <vscale x 16 x i1> [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret i16 [[TMP_0_CAST]]
+;
+  %tmp = alloca i16
+  store <vscale x 16 x i1> %a, ptr %tmp
+  %cast = load i16, ptr %tmp
+  ret i16 %cast
+}
+
+define <vscale x 16 x i1> @scalar_alloca_scalable_from_scalar(i16 %a) {
+; CHECK-LABEL: @scalar_alloca_scalable_from_scalar(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca i16, align 2
+; CHECK-NEXT:    store i16 [[A:%.*]], ptr [[TMP]], align 2
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 16 x i1>, ptr [[TMP]], align 2
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP_0_CAST]]
+;
+  %tmp = alloca i16
+  store i16 %a, ptr %tmp
+  %cast = load <vscale x 16 x i1>, ptr %tmp
+  ret <vscale x 16 x i1> %cast
+}
+
+define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8
+; CHECK-NEXT:    store <vscale x 4 x i32> [[A:%.*]], ptr [[TMP]], align 16
+; CHECK-NEXT:    [[CAST_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 0
+; CHECK-NEXT:    [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[CAST_FCA_0_GEP]], align 8
+; CHECK-NEXT:    [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0
+; CHECK-NEXT:    [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8
+; CHECK-NEXT:    [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1
+; CHECK-NEXT:    ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]]
+;
+  %tmp = alloca { <2 x i32>, <2 x i32> }
+  store <vscale x 4 x i32> %a, ptr %tmp
+  %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp
+  ret { <2 x i32>, <2 x i32> } %cast
+}
+
+define <vscale x 4 x i64> @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) {
+; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed(
+; CHECK-NEXT:    [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16
+; CHECK-NEXT:    [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0
+; CHECK-NEXT:    [[A_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 0
+; CHECK-NEXT:    store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[A_FCA_0_GEP]], align 16
+; CHECK-NEXT:    [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1
+; CHECK-NEXT:    [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 1
+; CHECK-NEXT:    store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16
+; CHECK-NEXT:    [[TMP_0_CAST:%.*]] = load <vscale x 4 x i64>, ptr [[TMP]], align 32
+; CHECK-NEXT:    ret <vscale x 4 x i64> [[TMP_0_CAST]]
+;
+  %tmp = alloca { <2 x ptr>, <2 x ptr> }
+  store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp
+  %cast = load <vscale x 4 x i64>, ptr %tmp
+  ret <vscale x 4 x i64> %cast
+}
+
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-MODIFY-CFG: {{.*}}

From ddef9ce8dad611c2fef172f3b08c5c98235a3b41 Mon Sep 17 00:00:00 2001
From: CHANDRA GHALE <chandra.nitdgp@gmail.com>
Date: Wed, 11 Jun 2025 15:39:16 +0530
Subject: [PATCH 053/851] LLVM Buildbot failure on openmp runtime test
 (#143674)

Error looks to be missing includes for complex number support in some
system. Removing test for now.
Relevant PR :
[PR-134709](https://github.com/llvm/llvm-project/pull/134709)
```
 .---command stderr------------
# | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:78:42: error: use of undeclared identifier 'I'
# |    78 |   double _Complex expected = 0.0 + 0.0 * I;
# |       |                                          ^
# | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:79:40: error: use of undeclared identifier 'I'
# |    79 |   double _Complex result = 0.0 + 0.0 * I;
# |       |                                        ^
# | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:84:22: error: use of undeclared identifier 'I'
# |    84 |     arr[i] = i - i * I;
# |       |                      ^
# | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:92:19: error: use of undeclared identifier 'creal'
# |    92 |       real_sum += creal(arr[i]);
# |       |                   ^~~~~
# | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:93:19: error: use of undeclared identifier 'cimag'
# |    93 |       imag_sum += cimag(arr[i]);
# |       |                   ^~~~~
# | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:96:36: error: use of undeclared identifier 'I'
# |    96 |     result = real_sum + imag_sum * I;
# |       |                                    ^
# | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:97:9: error: use of undeclared identifier 'cabs'
# |    97 |     if (cabs(result - expected) > 1e-6) {
# |       |         ^~~~
# | 7 errors generated.
```

Co-authored-by: Chandra Ghale <ghale@pe31.hpc.amslabs.hpecorp.net>
---
 .../for/omp_for_private_reduction.cpp         | 34 ++-----------------
 1 file changed, 3 insertions(+), 31 deletions(-)

diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
index 9bf3be1e9e45d..4520755a8a305 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
+++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
@@ -73,34 +73,6 @@ void performMinMaxRed(int &min_val, int &max_val) {
       max_val = input_data[i];
   }
 }
-int performComplexReduction() {
-  double _Complex arr[N];
-  double _Complex expected = 0.0 + 0.0 * I;
-  double _Complex result = 0.0 + 0.0 * I;
-  int error = 0;
-
-  // Initialize the array and compute serial sum
-  for (int i = 0; i < N; ++i) {
-    arr[i] = i - i * I;
-    expected += arr[i];
-  }
-  double real_sum = 0.0, imag_sum = 0.0;
-#pragma omp parallel private(real_sum) private(imag_sum)
-  {
-#pragma omp for reduction(+ : real_sum, imag_sum)
-    for (int i = 0; i < N; ++i) {
-      real_sum += creal(arr[i]);
-      imag_sum += cimag(arr[i]);
-    }
-
-    result = real_sum + imag_sum * I;
-    if (cabs(result - expected) > 1e-6) {
-      error++;
-    }
-  }
-  return error;
-}
-
 std::complex<double> doComplexReduction(std::complex<double> *arr) {
   std::complex<double> result(1, 0);
 
@@ -138,7 +110,8 @@ int main(void) {
   const float kPiVal = 3.14f;
   const int kExpectedSum = 45; // Sum of 0..9
   const int kExpectedProd = 3628800; // 10!
-  const float kExpectedFsum = kPiVal * N; // 3.14f * 10
+  const float kExpectedFsum = 31.400000f; // 3.14f * 10
+  const float kTolerance = 1e-4f;
   const int kExpectedMin = 3;
   const int kExpectedMax = 12;
   std::complex<double> arr[N];
@@ -163,7 +136,7 @@ int main(void) {
       total_errors++;
     if (t_prod_v != kExpectedProd)
       total_errors++;
-    if (t_fsum_v != kExpectedFsum)
+    if (std::abs(t_fsum_v - kExpectedFsum) > kTolerance)
       total_errors++;
   }
 #pragma omp parallel num_threads(4)
@@ -177,7 +150,6 @@ int main(void) {
       total_errors++;
   }
   total_errors += checkUserDefinedReduction();
-  total_errors += performComplexReduction();
 #pragma omp parallel num_threads(4)
   {
     std::complex<double> result(1, 0);

From 354cfba5209eed5ea6bafb6a3e69e65148c4e25d Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Wed, 11 Jun 2025 11:23:24 +0100
Subject: [PATCH 054/851] [DebugInfo][RemoveDIs] Remove
 scoped-dbg-format-setter (#143450)

This was a utility for flipping between intrinsic and debug record mode
-- we don't need it any more. The "IsNewDbgInfoFormat" should be true
everywhere.
---
 .../include/llvm/IR/DebugProgramInstruction.h | 19 -------------------
 llvm/include/llvm/IR/PassManagerImpl.h        |  4 ----
 llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp |  8 ++------
 llvm/lib/CodeGen/MIRPrinter.cpp               |  6 ------
 llvm/lib/IR/IRPrintingPasses.cpp              |  3 ---
 llvm/lib/IR/LegacyPassManager.cpp             |  5 -----
 llvm/lib/IRPrinter/IRPrintingPasses.cpp       |  3 ---
 llvm/lib/Linker/IRMover.cpp                   |  3 ---
 .../Transforms/IPO/ThinLTOBitcodeWriter.cpp   |  4 +---
 mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp    |  4 ----
 10 files changed, 3 insertions(+), 56 deletions(-)

diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 1436677e5a085..e0292c2b8d2d2 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -692,25 +692,6 @@ getDbgRecordRange(DbgMarker *DebugMarker) {
 
 DEFINE_ISA_CONVERSION_FUNCTIONS(DbgRecord, LLVMDbgRecordRef)
 
-/// Used to temporarily set the debug info format of a function, module, or
-/// basic block for the duration of this object's lifetime, after which the
-/// prior state will be restored.
-template <typename T> class ScopedDbgInfoFormatSetter {
-  T &Obj;
-  bool OldState;
-
-public:
-  ScopedDbgInfoFormatSetter(T &Obj, bool NewState)
-      : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) {
-    Obj.setIsNewDbgInfoFormat(NewState);
-  }
-  ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); }
-};
-
-template <typename T>
-ScopedDbgInfoFormatSetter(T &Obj,
-                          bool NewState) -> ScopedDbgInfoFormatSetter<T>;
-
 } // namespace llvm
 
 #endif // LLVM_IR_DEBUGPROGRAMINSTRUCTION_H
diff --git a/llvm/include/llvm/IR/PassManagerImpl.h b/llvm/include/llvm/IR/PassManagerImpl.h
index fe7b35fbce2c5..ade13f10c54e4 100644
--- a/llvm/include/llvm/IR/PassManagerImpl.h
+++ b/llvm/include/llvm/IR/PassManagerImpl.h
@@ -63,10 +63,6 @@ PreservedAnalyses PassManager<IRUnitT, AnalysisManagerT, ExtraArgTs...>::run(
       detail::getAnalysisResult<PassInstrumentationAnalysis>(
           AM, IR, std::tuple<ExtraArgTs...>(ExtraArgs...));
 
-  // RemoveDIs: if requested, convert debug-info to DbgRecord representation
-  // for duration of these passes.
-  ScopedDbgInfoFormatSetter FormatSetter(IR, true);
-
   StackTraceEntry Entry(PI, IR);
   for (auto &Pass : Passes) {
     Entry.setPass(&*Pass);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index fb393d33df3b2..e48f735ded831 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -19,9 +19,7 @@
 using namespace llvm;
 
 PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
-  ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat);
-  if (M.IsNewDbgInfoFormat)
-    M.removeDebugIntrinsicDeclarations();
+  M.removeDebugIntrinsicDeclarations();
 
   const ModuleSummaryIndex *Index =
       EmitSummaryIndex ? &(AM.getResult<ModuleSummaryIndexAnalysis>(M))
@@ -51,9 +49,7 @@ namespace {
     StringRef getPassName() const override { return "Bitcode Writer"; }
 
     bool runOnModule(Module &M) override {
-      ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat);
-      if (M.IsNewDbgInfoFormat)
-        M.removeDebugIntrinsicDeclarations();
+      M.removeDebugIntrinsicDeclarations();
 
       WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, /*Index=*/nullptr,
                          /*EmitModuleHash=*/false);
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 34ac0794f901f..7710b503facc3 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -965,17 +965,11 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V,
 }
 
 void llvm::printMIR(raw_ostream &OS, const Module &M) {
-  ScopedDbgInfoFormatSetter FormatSetter(const_cast<Module &>(M), true);
-
   yaml::Output Out(OS);
   Out << const_cast<Module &>(M);
 }
 
 void llvm::printMIR(raw_ostream &OS, const MachineModuleInfo &MMI,
                     const MachineFunction &MF) {
-  // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
-  // in dbg.value format.
-  ScopedDbgInfoFormatSetter FormatSetter(
-      const_cast<Function &>(MF.getFunction()), true);
   printMF(OS, MMI, MF);
 }
diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp
index eb35377d0fb23..5c062800198fc 100644
--- a/llvm/lib/IR/IRPrintingPasses.cpp
+++ b/llvm/lib/IR/IRPrintingPasses.cpp
@@ -40,7 +40,6 @@ class PrintModulePassWrapper : public ModulePass {
         ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {}
 
   bool runOnModule(Module &M) override {
-    ScopedDbgInfoFormatSetter FormatSetter(M, true);
     // Remove intrinsic declarations when printing in the new format.
     // TODO: consider removing this as debug-intrinsics are gone.
     M.removeDebugIntrinsicDeclarations();
@@ -84,8 +83,6 @@ class PrintFunctionPassWrapper : public FunctionPass {
 
   // This pass just prints a banner followed by the function as it's processed.
   bool runOnFunction(Function &F) override {
-    ScopedDbgInfoFormatSetter FormatSetter(F, true);
-
     if (isFunctionInPrintList(F.getName())) {
       if (forcePrintModuleIR())
         OS << Banner << " (function: " << F.getName() << ")\n"
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index fd69e309cdf10..c8f1606ea06cb 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -526,11 +526,6 @@ bool PassManagerImpl::run(Module &M) {
   dumpArguments();
   dumpPasses();
 
-  // RemoveDIs: if a command line flag is given, convert to the
-  // DbgVariableRecord representation of debug-info for the duration of these
-  // passes.
-  ScopedDbgInfoFormatSetter FormatSetter(M, true);
-
   for (ImmutablePass *ImPass : getImmutablePasses())
     Changed |= ImPass->doInitialization(M);
 
diff --git a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp
index 5fd6a094fa57b..81ad284ea1642 100644
--- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp
+++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp
@@ -32,7 +32,6 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner,
       EmitSummaryIndex(EmitSummaryIndex) {}
 
 PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) {
-  ScopedDbgInfoFormatSetter FormatSetter(M, true);
   // Remove intrinsic declarations when printing in the new format.
   // TODO: consider removing this now that debug intrinsics are gone.
   M.removeDebugIntrinsicDeclarations();
@@ -72,8 +71,6 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner)
 
 PreservedAnalyses PrintFunctionPass::run(Function &F,
                                          FunctionAnalysisManager &) {
-  ScopedDbgInfoFormatSetter FormatSetter(F, true);
-
   if (isFunctionInPrintList(F.getName())) {
     if (forcePrintModuleIR())
       OS << Banner << " (function: " << F.getName() << ")\n" << *F.getParent();
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 4dd5ae81c89c1..a449185b2b9ba 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -1444,9 +1444,6 @@ Error IRLinker::run() {
     if (Error Err = SrcM->getMaterializer()->materializeMetadata())
       return Err;
 
-  // Convert source module to match dest for the duration of the link.
-  ScopedDbgInfoFormatSetter FormatSetter(*SrcM, DstM.IsNewDbgInfoFormat);
-
   // Inherit the target data from the source module if the destination
   // module doesn't have one already.
   if (DstM.getDataLayout().isDefault())
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 06f5d78d77e01..e276376f21583 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -584,9 +584,7 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
   FunctionAnalysisManager &FAM =
       AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
 
-  ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat);
-  if (M.IsNewDbgInfoFormat)
-    M.removeDebugIntrinsicDeclarations();
+  M.removeDebugIntrinsicDeclarations();
 
   bool Changed = writeThinLTOBitcode(
       OS, ThinLinkOS,
diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
index 73e8626db3a09..75170bffcdf21 100644
--- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp
@@ -32,10 +32,6 @@ void registerToLLVMIRTranslation() {
         if (!llvmModule)
           return failure();
 
-        // When printing LLVM IR, we should convert the module to the debug info
-        // format that LLVM expects us to print.
-        // See https://llvm.org/docs/RemoveDIsDebugInfo.html
-        llvm::ScopedDbgInfoFormatSetter formatSetter(*llvmModule, true);
         llvmModule->removeDebugIntrinsicDeclarations();
         llvmModule->print(output, nullptr);
         return success();

From 79a72c47d09c2e2cee645430f9d290c20d2618f1 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234@gmail.com>
Date: Wed, 11 Jun 2025 06:29:37 -0400
Subject: [PATCH 055/851] [AArch64] Consider negated powers of 2 when
 calculating throughput cost (#143013)

Negated powers of 2 have similar or (exact in the case of remainder)
codegen with lowering sdiv. In the case of sdiv, it just negates the
result in the end anyway, so nothing dissimilar at all.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 24 ++++---
 llvm/test/Analysis/CostModel/AArch64/div.ll   | 36 +++++-----
 llvm/test/Analysis/CostModel/AArch64/rem.ll   | 36 +++++-----
 .../Analysis/CostModel/AArch64/sve-div.ll     | 72 +++++++++----------
 .../Analysis/CostModel/AArch64/sve-rem.ll     | 72 +++++++++----------
 5 files changed, 124 insertions(+), 116 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 6e9a35c462fc9..acd37a5ae0720 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4005,7 +4005,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
       // have similar cost.
       auto VT = TLI->getValueType(DL, Ty);
       if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
-        if (Op2Info.isPowerOf2()) {
+        if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
+          // Neg can be folded into the asr instruction.
           return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
                                   : (3 * AsrCost + AddCost);
         } else {
@@ -4013,17 +4014,24 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
         }
       } else if (VT.isVector()) {
         InstructionCost UsraCost = 2 * AsrCost;
-        if (Op2Info.isPowerOf2()) {
+        if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
           // Division with scalable types corresponds to native 'asrd'
           // instruction when SVE is available.
           // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
+
+          // One more for the negation in SDIV
+          InstructionCost Cost =
+              (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
           if (Ty->isScalableTy() && ST->hasSVE())
-            return 2 * AsrCost;
-          return UsraCost +
-                 (ISD == ISD::SDIV
-                      ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) *
-                            AsrCost
-                      : 2 * AddCost);
+            Cost += 2 * AsrCost;
+          else {
+            Cost +=
+                UsraCost +
+                (ISD == ISD::SDIV
+                     ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
+                     : 2 * AddCost);
+          }
+          return Cost;
         } else if (LT.second == MVT::v2i64) {
           return VT.getVectorNumElements() *
                  getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll
index 5367344ce573f..3a2358dba51b2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/div.ll
@@ -870,27 +870,27 @@ define void @sdiv_uniformconstnegpow2() {
 ; CHECK-LABEL: 'sdiv_uniformconstnegpow2'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %I128 = sdiv i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %I64 = sdiv i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %I32 = sdiv i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %I16 = sdiv i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %I8 = sdiv i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %I128 = sdiv i128 undef, -16
diff --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll
index d684e3af00b83..2fa62f1705911 100644
--- a/llvm/test/Analysis/CostModel/AArch64/rem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll
@@ -870,27 +870,27 @@ define void @srem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'srem_uniformconstnegpow2'
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %I128 = srem i128 undef, -16
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %I64 = srem i64 undef, -16
-; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %I32 = srem i32 undef, -16
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %I16 = srem i16 undef, -16
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %I8 = srem i8 undef, -16
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %I128 = srem i128 undef, -16
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
index 480c3146a210d..c055d3218f65b 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
@@ -479,42 +479,42 @@ define void @udiv_uniformconstpow2() {
 
 define void @sdiv_uniformconstnegpow2() {
 ; CHECK-LABEL: 'sdiv_uniformconstnegpow2'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = sdiv <vscale x 2 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = sdiv <vscale x 4 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = sdiv <vscale x 8 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = sdiv <vscale x 2 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = sdiv <vscale x 4 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = sdiv <vscale x 8 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = sdiv <vscale x 16 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = sdiv <vscale x 2 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = sdiv <vscale x 4 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = sdiv <vscale x 8 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = sdiv <vscale x 16 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = sdiv <vscale x 32 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = sdiv <vscale x 2 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = sdiv <vscale x 4 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = sdiv <vscale x 8 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = sdiv <vscale x 16 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = sdiv <vscale x 32 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = sdiv <vscale x 64 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = sdiv <vscale x 2 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = sdiv <vscale x 4 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = sdiv <vscale x 8 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = sdiv <vscale x 2 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = sdiv <vscale x 4 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = sdiv <vscale x 8 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = sdiv <vscale x 16 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = sdiv <vscale x 2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = sdiv <vscale x 4 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = sdiv <vscale x 8 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = sdiv <vscale x 16 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = sdiv <vscale x 32 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = sdiv <vscale x 2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = sdiv <vscale x 4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = sdiv <vscale x 8 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = sdiv <vscale x 16 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = sdiv <vscale x 32 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = sdiv <vscale x 64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V2i64 = sdiv <2 x i64> undef, splat (i64 -16)
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll b/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll
index e2488735de4b5..eac8b66bcd216 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll
@@ -491,43 +491,43 @@ define void @urem_uniformconstpow2() {
 
 define void @srem_uniformconstnegpow2() {
 ; CHECK-LABEL: 'srem_uniformconstnegpow2'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = srem <vscale x 2 x i128> undef, splat (i128 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = srem <vscale x 2 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = srem <vscale x 4 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = srem <vscale x 8 x i64> undef, splat (i64 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = srem <vscale x 2 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = srem <vscale x 4 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = srem <vscale x 8 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = srem <vscale x 16 x i32> undef, splat (i32 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = srem <vscale x 2 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = srem <vscale x 4 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = srem <vscale x 8 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = srem <vscale x 16 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = srem <vscale x 32 x i16> undef, splat (i16 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = srem <vscale x 2 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = srem <vscale x 4 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = srem <vscale x 8 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = srem <vscale x 16 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = srem <vscale x 32 x i8> undef, splat (i8 -16)
-; CHECK-NEXT:  Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = srem <vscale x 64 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = srem <vscale x 2 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %NV4i64 = srem <vscale x 4 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = srem <vscale x 8 x i64> undef, splat (i64 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = srem <vscale x 2 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = srem <vscale x 4 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %NV8i32 = srem <vscale x 8 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = srem <vscale x 16 x i32> undef, splat (i32 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = srem <vscale x 2 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = srem <vscale x 4 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = srem <vscale x 8 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %NV16i16 = srem <vscale x 16 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = srem <vscale x 32 x i16> undef, splat (i16 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = srem <vscale x 2 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = srem <vscale x 4 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = srem <vscale x 8 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = srem <vscale x 16 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %NV32i8 = srem <vscale x 32 x i8> undef, splat (i8 -16)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = srem <vscale x 64 x i8> undef, splat (i8 -16)
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %V2i64 = srem <2 x i64> undef, splat (i64 -16)

From 40cc7b4578fd2d65aaef8356fbe7caf2d84a8f3e Mon Sep 17 00:00:00 2001
From: Tomas Matheson <Tomas.Matheson@arm.com>
Date: Wed, 11 Jun 2025 11:45:23 +0100
Subject: [PATCH 056/851] [clang][AArch64] test -cc1 -print-enabled-extensions
 (#143570)

This adds tests that document how -cc1 and -print-enabled-extensions
interact. The current behaviour looks wrong, and is caused by the fact
that --print-enabled-extensions uses the MC subtarget feature API to
determine the list of extensions to print, whereas the frontend uses the
TargetParser API. The latter does no dependency expansion for the
-target-feature flags but the MC API does.

This doesn't fix anything but at least it documents the current
behaviour, and will serve as a pre-commit test for any future fixes.
---
 .../aarch64-print-enabled-extensions-cc1.c    | 139 ++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c

diff --git a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c
new file mode 100644
index 0000000000000..5d65fdafaa251
--- /dev/null
+++ b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c
@@ -0,0 +1,139 @@
+// Test how -cc1 -target-feature interacts with -print-enabled-extensions.
+// The current behaviour does not look correct, since dependent features are
+// removed from the printed list when one of their dependencies are disabled,
+// but they are actually still enabled during compilation, and then actually
+// disabled for parsing assembly.
+
+// REQUIRES: aarch64-registered-target
+
+// Behaviour with two positive features.
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature +neon -target-feature +sve \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=POS_ONLY
+
+// Negative -target-feature disables the extension but keeps any dependencies of it (FEAT_FP16).
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature +neon -target-feature +sve -target-feature -sve \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=POS_NEG
+
+// Disabling then re-enabling a feature is the same as never disabling it.
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature +neon -target-feature -sve -target-feature +sve \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=POS_ONLY
+
+// Disabling then re-enabling a feature is the same as never disabling it.
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature +neon -target-feature +sve -target-feature -sve -target-feature +sve \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=POS_ONLY
+
+// Only disabling it is the same as never having enabled it.
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature +neon \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=NEG_ONLY
+
+// Only disabling it is the same as never having enabled it.
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature +neon -target-feature -sve \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=NEG_ONLY
+
+// Disabling a dependency (after enabling the dependent) appears to disable the dependent feature.
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature +sve2 -target-feature -sve \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_DEP
+
+// Disabling a dependency before enabling the dependent appears to have no effect.
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature -sve -target-feature +sve2 \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_DEP2
+
+// Disabling a dependency before enabling the dependent appears to have no effect.
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \
+// RUN:     -target-feature +sve2 \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_DEP2
+
+// Driver --print-enabled-extensions indicates that negative -target-features disable dependent features.
+// RUN: %clang --target=aarch64 -march=armv8-a+sve2 --print-enabled-extensions \
+// RUN:     -Xclang -target-feature -Xclang -sve \
+// RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_VIA_XCLANG
+
+// However, sve2 is actually enabled in clang but disabled for MC.
+// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s \
+// RUN:     -Xclang -target-feature -Xclang -sve \
+// RUN:     -Xclang -verify -Xclang -verify-ignore-unexpected=note
+
+
+// POS_ONLY: Extensions enabled for the given AArch64 target
+// POS_ONLY-EMPTY:
+// POS_ONLY-NEXT:     Architecture Feature(s)                                Description
+// POS_ONLY-NEXT:     FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
+// POS_ONLY-NEXT:     FEAT_ETE                                               Enable Embedded Trace Extension
+// POS_ONLY-NEXT:     FEAT_FP                                                Enable Armv8.0-A Floating Point Extensions
+// POS_ONLY-NEXT:     FEAT_FP16                                              Enable half-precision floating-point data processing
+// POS_ONLY-NEXT:     FEAT_SVE                                               Enable Scalable Vector Extension (SVE) instructions
+// POS_ONLY-NEXT:     FEAT_TRBE                                              Enable Trace Buffer Extension
+
+// POS_NEG: Extensions enabled for the given AArch64 target
+// POS_NEG-EMPTY:
+// POS_NEG-NEXT:     Architecture Feature(s)                                Description
+// POS_NEG-NEXT:     FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
+// POS_NEG-NEXT:     FEAT_ETE                                               Enable Embedded Trace Extension
+// POS_NEG-NEXT:     FEAT_FP                                                Enable Armv8.0-A Floating Point Extensions
+// POS_NEG-NEXT:     FEAT_FP16                                              Enable half-precision floating-point data processing
+// POS_NEG-NEXT:     FEAT_TRBE                                              Enable Trace Buffer Extension
+
+// NEG_POS: Extensions enabled for the given AArch64 target
+// NEG_POS-EMPTY:
+// NEG_POS-NEXT:     Architecture Feature(s)                                Description
+// NEG_POS-NEXT:     FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
+// NEG_POS-NEXT:     FEAT_ETE                                               Enable Embedded Trace Extension
+// NEG_POS-NEXT:     FEAT_FP                                                Enable Armv8.0-A Floating Point Extensions
+// NEG_POS-NEXT:     FEAT_FP16                                              Enable half-precision floating-point data processing
+// NEG_POS-NEXT:     FEAT_SVE                                               Enable Scalable Vector Extension (SVE) instructions
+// NEG_POS-NEXT:     FEAT_TRBE                                              Enable Trace Buffer Extension
+
+// NEG_ONLY: Extensions enabled for the given AArch64 target
+// NEG_ONLY-EMPTY:
+// NEG_ONLY-NEXT:     Architecture Feature(s)                                Description
+// NEG_ONLY-NEXT:     FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
+// NEG_ONLY-NEXT:     FEAT_ETE                                               Enable Embedded Trace Extension
+// NEG_ONLY-NEXT:     FEAT_FP                                                Enable Armv8.0-A Floating Point Extensions
+// NEG_ONLY-NEXT:     FEAT_TRBE                                              Enable Trace Buffer Extension
+
+// DISABLE_DEP: Extensions enabled for the given AArch64 target
+// DISABLE_DEP-EMPTY: 
+// DISABLE_DEP-NEXT:     Architecture Feature(s)                                Description
+// DISABLE_DEP-NEXT:     FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
+// DISABLE_DEP-NEXT:     FEAT_ETE                                               Enable Embedded Trace Extension
+// DISABLE_DEP-NEXT:     FEAT_FP                                                Enable Armv8.0-A Floating Point Extensions
+// DISABLE_DEP-NEXT:     FEAT_FP16                                              Enable half-precision floating-point data processing
+// DISABLE_DEP-NEXT:     FEAT_TRBE                                              Enable Trace Buffer Extension
+
+// DISABLE_DEP2: Extensions enabled for the given AArch64 target
+// DISABLE_DEP2-EMPTY: 
+// DISABLE_DEP2-NEXT:     Architecture Feature(s)                                Description
+// DISABLE_DEP2-NEXT:     FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
+// DISABLE_DEP2-NEXT:     FEAT_ETE                                               Enable Embedded Trace Extension
+// DISABLE_DEP2-NEXT:     FEAT_FP                                                Enable Armv8.0-A Floating Point Extensions
+// DISABLE_DEP2-NEXT:     FEAT_FP16                                              Enable half-precision floating-point data processing
+// DISABLE_DEP2-NEXT:     FEAT_SVE                                               Enable Scalable Vector Extension (SVE) instructions
+// DISABLE_DEP2-NEXT:     FEAT_SVE2                                              Enable Scalable Vector Extension 2 (SVE2) instructions
+// DISABLE_DEP2-NEXT:     FEAT_TRBE                                              Enable Trace Buffer Extension
+
+// DISABLE_VIA_XCLANG: Extensions enabled for the given AArch64 target
+// DISABLE_VIA_XCLANG-EMPTY: 
+// DISABLE_VIA_XCLANG-NEXT:     Architecture Feature(s)                                Description
+// DISABLE_VIA_XCLANG-NEXT:     FEAT_AdvSIMD                                           Enable Advanced SIMD instructions
+// DISABLE_VIA_XCLANG-NEXT:     FEAT_ETE                                               Enable Embedded Trace Extension
+// DISABLE_VIA_XCLANG-NEXT:     FEAT_FP                                                Enable Armv8.0-A Floating Point Extensions
+// DISABLE_VIA_XCLANG-NEXT:     FEAT_FP16                                              Enable half-precision floating-point data processing
+// DISABLE_VIA_XCLANG-NEXT:     FEAT_TRBE                                              Enable Trace Buffer Extension
+
+#if __ARM_FEATURE_SVE2
+#warning "SVE2 is enabled"
+// expected-warning@-1 {{SVE2 is enabled}}
+#endif
+
+void fn_that_requires_sve2() {
+    __asm__("ldnt1sh z0.s, p0/z, [z1.s]");
+    // expected-error@-1 {{instruction requires: sve2}}
+}

From 19b0e1227ca6653405e4a34627d04a14f2287f26 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Wed, 11 Jun 2025 13:27:14 +0200
Subject: [PATCH 057/851] [ConstantFolding] Fold sqrt poison -> poison
 (#141821)

I noticed this when a sqrt produced by VectorCombine with a poison
operand wasn't getting folded away to poison.

Most intrinsics in general could probably be folded to poison if one of
their arguments are poison too. Are there any exceptions to this we need
to be aware of?
---
 llvm/lib/Analysis/ConstantFolding.cpp         |  7 ++-
 .../InstSimplify/fp-undef-poison.ll           | 50 +++++++++++++++++++
 2 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 23ea6966fbf6c..1ef0badd23757 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2223,8 +2223,13 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
 
   if (isa<PoisonValue>(Operands[0])) {
     // TODO: All of these operations should probably propagate poison.
-    if (IntrinsicID == Intrinsic::canonicalize)
+    switch (IntrinsicID) {
+    case Intrinsic::canonicalize:
+    case Intrinsic::sqrt:
       return PoisonValue::get(Ty);
+    default:
+      break;
+    }
   }
 
   if (isa<UndefValue>(Operands[0])) {
diff --git a/llvm/test/Transforms/InstSimplify/fp-undef-poison.ll b/llvm/test/Transforms/InstSimplify/fp-undef-poison.ll
index cb2026df962c8..ffab9c94ddf42 100644
--- a/llvm/test/Transforms/InstSimplify/fp-undef-poison.ll
+++ b/llvm/test/Transforms/InstSimplify/fp-undef-poison.ll
@@ -293,3 +293,53 @@ define double @fmul_nnan_inf_op1(double %x) {
   %r = fmul nnan double %x, 0xfff0000000000000
   ret double %r
 }
+
+define float @sqrt_poison() {
+; CHECK-LABEL: @sqrt_poison(
+; CHECK-NEXT:    ret float poison
+;
+  %sqrt = call float @llvm.sqrt(float poison)
+  ret float %sqrt
+}
+
+define <2 x float> @sqrt_poison_fixed_vec() {
+; CHECK-LABEL: @sqrt_poison_fixed_vec(
+; CHECK-NEXT:    ret <2 x float> poison
+;
+  %sqrt = call <2 x float> @llvm.sqrt(<2 x float> poison)
+  ret <2 x float> %sqrt
+}
+
+define <2 x float> @sqrt_poison_elt_fixed_vec() {
+; CHECK-LABEL: @sqrt_poison_elt_fixed_vec(
+; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float poison>
+;
+  %sqrt = call <2 x float> @llvm.sqrt(<2 x float> <float 1.0, float poison>)
+  ret <2 x float> %sqrt
+}
+
+define <vscale x 2 x float> @sqrt_poison_scalable_vec() {
+; CHECK-LABEL: @sqrt_poison_scalable_vec(
+; CHECK-NEXT:    ret <vscale x 2 x float> poison
+;
+  %sqrt = call <vscale x 2 x float> @llvm.sqrt(<vscale x 2 x float> poison)
+  ret <vscale x 2 x float> %sqrt
+}
+
+define float @sqrt_nnan_nan() {
+; CHECK-LABEL: @sqrt_nnan_nan(
+; CHECK-NEXT:    [[SQRT:%.*]] = call nnan float @llvm.sqrt.f32(float 0x7FF8000000000000)
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = call nnan float @llvm.sqrt(float 0x7ff8000000000000)
+  ret float %sqrt
+}
+
+define float @sqrt_ninf_inf() {
+; CHECK-LABEL: @sqrt_ninf_inf(
+; CHECK-NEXT:    [[SQRT:%.*]] = call ninf float @llvm.sqrt.f32(float 0xFFF0000000000000)
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = call ninf float @llvm.sqrt(float 0xfff0000000000000)
+  ret float %sqrt
+}

From 44a7ecd1d7485be94d3a92021c650175f100d2f7 Mon Sep 17 00:00:00 2001
From: Alexander Ziaee <concussious@runbox.com>
Date: Wed, 11 Jun 2025 07:27:23 -0400
Subject: [PATCH 058/851] [doc] Use ISO nomenclature for 1024 byte units
 (#133148)

Increase specificity by using the correct unit sizes. KBytes is an
abbreviation for kB, 1000 bytes, and the hardware industry as well as
several operating systems have now switched to using 1000 byte kBs.

If this change is acceptable, sometimes GitHub mangles merges to use the
original email of the account. $dayjob asks contributions have my work
email. Thanks!
---
 lld/ELF/Relocations.cpp                                       | 2 +-
 .../Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp | 2 +-
 lldb/tools/debugserver/source/RNBRemote.cpp                   | 2 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp                | 4 ++--
 openmp/tools/archer/ompt-tsan.cpp                             | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 8413d8bb2437c..1af01e7247dce 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -161,7 +161,7 @@ static RelType getMipsPairType(RelType type, bool isLocal) {
     // symbol, the R_MIPS_GOT16 relocation creates a GOT entry to hold
     // the high 16 bits of the symbol's value. A paired R_MIPS_LO16
     // relocations handle low 16 bits of the address. That allows
-    // to allocate only one GOT entry for every 64 KBytes of local data.
+    // to allocate only one GOT entry for every 64 KiB of local data.
     return isLocal ? R_MIPS_LO16 : R_MIPS_NONE;
   case R_MICROMIPS_GOT16:
     return isLocal ? R_MICROMIPS_LO16 : R_MIPS_NONE;
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp
index 67ba42f33d1dd..4a1117222f34c 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp
@@ -1377,7 +1377,7 @@ GDBRemoteCommunicationServerCommon::GetModuleInfo(llvm::StringRef module_path,
 
 std::vector<std::string> GDBRemoteCommunicationServerCommon::HandleFeatures(
     const llvm::ArrayRef<llvm::StringRef> client_features) {
-  // 128KBytes is a reasonable max packet size--debugger can always use less.
+  // 128 KiB is a reasonable max packet size--debugger can always use less.
   constexpr uint32_t max_packet_size = 128 * 1024;
 
   // Features common to platform server and llgs.
diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp
index af3c66c71c77e..391d1c50168ea 100644
--- a/lldb/tools/debugserver/source/RNBRemote.cpp
+++ b/lldb/tools/debugserver/source/RNBRemote.cpp
@@ -3476,7 +3476,7 @@ static bool GetProcessNameFrom_vAttach(const char *&p,
 }
 
 rnb_err_t RNBRemote::HandlePacket_qSupported(const char *p) {
-  uint32_t max_packet_size = 128 * 1024; // 128KBytes is a reasonable max packet
+  uint32_t max_packet_size = 128 * 1024; // 128 KiB is a reasonable max packet
                                          // size--debugger can always use less
   std::stringstream reply;
   reply << "qXfer:features:read+;PacketSize=" << std::hex << max_packet_size
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 33c9edd24646b..a1a177528eb23 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -123,7 +123,7 @@ std::optional<unsigned> X86TTIImpl::getCacheSize(
     //   - Broadwell
     //   - Skylake
     //   - Kabylake
-    return 32 * 1024;  //  32 KByte
+    return 32 * 1024;  //  32 KiB
   case TargetTransformInfo::CacheLevel::L2D:
     //   - Penryn
     //   - Nehalem
@@ -134,7 +134,7 @@ std::optional<unsigned> X86TTIImpl::getCacheSize(
     //   - Broadwell
     //   - Skylake
     //   - Kabylake
-    return 256 * 1024; // 256 KByte
+    return 256 * 1024; // 256 KiB
   }
 
   llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp
index bb60fc6b603f4..c315999af4328 100644
--- a/openmp/tools/archer/ompt-tsan.cpp
+++ b/openmp/tools/archer/ompt-tsan.cpp
@@ -1224,7 +1224,7 @@ static void ompt_tsan_finalize(ompt_data_t *tool_data) {
   if (archer_flags->print_max_rss) {
     struct rusage end;
     getrusage(RUSAGE_SELF, &end);
-    printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss);
+    printf("MAX RSS[KiB] during execution: %ld\n", end.ru_maxrss);
   }
 
   if (archer_flags)

From abbbe4a6cd1b83b89a834163335053863f5ffbfa Mon Sep 17 00:00:00 2001
From: Simone Pellegrini <simone.pellegrini@arm.com>
Date: Wed, 11 Jun 2025 13:37:34 +0200
Subject: [PATCH 059/851] [mlir][vector] Fix attaching write effects on
 transfer_write's base (#142940)

This fixes an issue with `TransferWriteOp`'s implementation of the
`MemoryEffectOpInterface` where the write effect was attached to the
stored value rather than the base.

This had the effect that when asking for the memory effects for the
input memref buffer using `getEffectsOnValue(...)`, the function would
return no-effects (as the effect would have been attached to the stored
value rather than the input buffer).
---
 flang/test/HLFIR/assign-side-effects.fir      |  9 +-
 flang/test/HLFIR/memory-effects.fir           | 93 ++++++++++---------
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp      |  2 +-
 .../Dialect/Bufferization/side-effects.mlir   |  6 +-
 mlir/test/Dialect/Vector/side-effects.mlir    | 15 +++
 mlir/test/IR/test-side-effects.mlir           |  8 +-
 mlir/test/lib/IR/TestSideEffects.cpp          | 12 +--
 7 files changed, 81 insertions(+), 64 deletions(-)
 create mode 100644 mlir/test/Dialect/Vector/side-effects.mlir

diff --git a/flang/test/HLFIR/assign-side-effects.fir b/flang/test/HLFIR/assign-side-effects.fir
index dfd1c5886e4fa..cac9530e2277c 100644
--- a/flang/test/HLFIR/assign-side-effects.fir
+++ b/flang/test/HLFIR/assign-side-effects.fir
@@ -2,14 +2,14 @@
 // RUN: fir-opt %s --test-side-effects --verify-diagnostics
 
 func.func @test1(%x: !fir.ref<i32>, %i: i32) {
-  // expected-remark @below {{found an instance of 'write' on a op operand, on resource '<Default>'}}
+  // expected-remark @below {{found an instance of 'write' on op operand 1, on resource '<Default>'}}
   hlfir.assign %i to %x : i32, !fir.ref<i32>
   return
 }
 
 func.func @test2(%x: !fir.ref<i32>, %y: !fir.ref<i32>) {
-  // expected-remark @below {{found an instance of 'write' on a op operand, on resource '<Default>'}}
-  // expected-remark @below {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+  // expected-remark @below {{found an instance of 'write' on op operand 1, on resource '<Default>'}}
+  // expected-remark @below {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   hlfir.assign %y to %x : !fir.ref<i32>, !fir.ref<i32>
   return
 }
@@ -22,7 +22,8 @@ func.func @test3(%x: !fir.ref<!fir.type<t>>, %y: !fir.ref<!fir.type<t>>) {
 }
 
 func.func @test4(%x: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, %y: !fir.box<!fir.array<?xi32>>) {
-  // expected-remark @below {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+  // expected-remark @below {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+  // expected-remark @below {{found an instance of 'read' on op operand 1, on resource '<Default>'}}
   // expected-remark @below {{found an instance of 'write' on resource '<Default>'}}
   // expected-remark @below {{found an instance of 'free' on resource '<Default>'}}
   // expected-remark @below {{found an instance of 'allocate' on resource '<Default>'}}
diff --git a/flang/test/HLFIR/memory-effects.fir b/flang/test/HLFIR/memory-effects.fir
index cac887ebe67de..6c791f1260be7 100644
--- a/flang/test/HLFIR/memory-effects.fir
+++ b/flang/test/HLFIR/memory-effects.fir
@@ -3,8 +3,9 @@
 func.func @concat(%arg0: !fir.ref<!fir.char<1,10>>, %arg1: !fir.ref<!fir.char<1, 20>>) {
 // expected-remark@+1 {{operation has no memory effects}}
   %c30 = arith.constant 30 : index
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+3 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource '<Default>'}}
   %0 = hlfir.concat %arg0, %arg1 len %c30 : (!fir.ref<!fir.char<1,10>>, !fir.ref<!fir.char<1,20>>, index) -> (!hlfir.expr<!fir.char<1,30>>)
   return
 }
@@ -16,8 +17,8 @@ func.func @all_no_effects(%arg0: !hlfir.expr<2x!fir.logical<4>>) {
 }
 
 func.func @all_effects(%arg0: !fir.ref<!fir.array<2x10x!fir.logical<4>>>, %arg1: i32) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %all = hlfir.all %arg0 dim %arg1 : (!fir.ref<!fir.array<2x10x!fir.logical<4>>>, i32) -> !hlfir.expr<?x!fir.logical<4>>
   return
 }
@@ -29,8 +30,8 @@ func.func @any_no_effects(%arg0: !hlfir.expr<2x!fir.logical<4>>) {
 }
 
 func.func @any_effects(%arg0: !fir.ref<!fir.array<2x10x!fir.logical<4>>>, %arg1: i32) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %all = hlfir.any %arg0 dim %arg1 : (!fir.ref<!fir.array<2x10x!fir.logical<4>>>, i32) -> !hlfir.expr<?x!fir.logical<4>>
   return
 }
@@ -42,7 +43,7 @@ func.func @count_no_effects(%arg0: !hlfir.expr<2x!fir.logical<4>>) {
 }
 
 func.func @count_effects(%arg0: !fir.ref<!fir.array<2x10x!fir.logical<4>>>, %arg1: i32) {
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %all = hlfir.count %arg0 dim %arg1 : (!fir.ref<!fir.array<2x10x!fir.logical<4>>>, i32) -> i32
   return
 }
@@ -54,15 +55,15 @@ func.func @product_no_effects(%arg0: !hlfir.expr<?xf32>) {
 }
 
 func.func @product_effects(%arg0: !fir.ref<!fir.array<2x2xf32>>, %arg1: i32) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %product = hlfir.product %arg0 dim %arg1 : (!fir.ref<!fir.array<2x2xf32>>, i32) -> !hlfir.expr<2xf32>
   return
 }
 
 func.func @set_length_read(%arg0: !fir.ref<!fir.char<1,10>>, %arg1: index) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %0 = hlfir.set_length %arg0 len %arg1 : (!fir.ref<!fir.char<1,10>>, index) -> !hlfir.expr<!fir.char<1,?>>
   return
 }
@@ -74,8 +75,8 @@ func.func @sum_no_effects(%arg0: !hlfir.expr<?xf32>) {
 }
 
 func.func @sum_effects(%arg0: !fir.ref<!fir.array<2x2xf32>>, %arg1: i32) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %sum = hlfir.sum %arg0 dim %arg1 : (!fir.ref<!fir.array<2x2xf32>>, i32) -> !hlfir.expr<2xf32>
   return
 }
@@ -87,8 +88,8 @@ func.func @maxval_no_effects(%arg0: !hlfir.expr<?xf32>) {
 }
 
 func.func @maxval_effects(%arg0: !fir.ref<!fir.array<2x2xf32>>, %arg1: i32) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %maxval = hlfir.maxval %arg0 dim %arg1 : (!fir.ref<!fir.array<2x2xf32>>, i32) -> !hlfir.expr<2xf32>
   return
 }
@@ -100,34 +101,34 @@ func.func @minval_no_effects(%arg0: !hlfir.expr<?xf32>) {
 }
 
 func.func @minval_effects(%arg0: !fir.ref<!fir.array<2x2xf32>>, %arg1: i32) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %minval = hlfir.minval %arg0 dim %arg1 : (!fir.ref<!fir.array<2x2xf32>>, i32) -> !hlfir.expr<2xf32>
   return
 }
 
 func.func @minloc_effects_simple(%arg0: !hlfir.expr<?xf32>) {
-// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
   %minloc = hlfir.minloc %arg0 : (!hlfir.expr<?xf32>) -> !hlfir.expr<?xi32>
   return
 }
 
 func.func @minloc_effects(%arg0: !fir.ref<!fir.array<2x2xf32>>, %arg1: i32) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %minloc = hlfir.minloc %arg0 dim %arg1 : (!fir.ref<!fir.array<2x2xf32>>, i32) -> !hlfir.expr<2xi32>
   return
 }
 
 func.func @maxloc_effects_simple(%arg0: !hlfir.expr<?xf32>) {
-// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
   %maxloc = hlfir.maxloc %arg0 : (!hlfir.expr<?xf32>) -> !hlfir.expr<?xi32>
   return
 }
 
 func.func @maxloc_effects(%arg0: !fir.ref<!fir.array<2x2xf32>>, %arg1: i32) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %maxloc = hlfir.maxloc %arg0 dim %arg1 : (!fir.ref<!fir.array<2x2xf32>>, i32) -> !hlfir.expr<2xi32>
   return
 }
@@ -139,49 +140,49 @@ func.func @dot_product_no_effects(%arg0: !hlfir.expr<?xf32>, %arg1: !hlfir.expr<
 }
 
 func.func @dot_product_effects(%arg0: !fir.ref<!fir.array<10xf32>>, %arg1: !fir.ref<!fir.array<10xf32>>) {
-// there are read effects on both arguments - the diagnostic verification just doesn't register duplicate identical diagnostics
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource '<Default>'}}
   %0 = hlfir.dot_product %arg0 %arg1 : (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>) -> f32
   return
 }
 
 func.func @matmul_no_reads(%arg0: !hlfir.expr<?x?xf32>, %arg1: !hlfir.expr<?x?xf32>) {
-// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
   %0 = hlfir.matmul %arg0 %arg1 : (!hlfir.expr<?x?xf32>, !hlfir.expr<?x?xf32>) -> !hlfir.expr<?x?xf32>
   return
 }
 
 func.func @matmul_reads(%arg0: !fir.ref<!fir.array<10x5xf32>>, %arg1: !fir.ref<!fir.array<5x10xf32>>) {
-// expected-remark@+3 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// there are read effects on both arguments - the diagnostic verification just doesn't register duplicate identical diagnostics
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+3 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource '<Default>'}}
   %0 = hlfir.matmul %arg0 %arg1 : (!fir.ref<!fir.array<10x5xf32>>, !fir.ref<!fir.array<5x10xf32>>) -> !hlfir.expr<10x10xf32>
   return
 }
 
 func.func @transpose_no_reads(%arg0: !hlfir.expr<?x?xf32>) {
-// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
   %0 = hlfir.transpose %arg0 : (!hlfir.expr<?x?xf32>) -> !hlfir.expr<?x?xf32>
   return
 }
 
 func.func @transpose_read(%arg0: !fir.ref<!fir.array<10x5xf32>>) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %0 = hlfir.transpose %arg0 : (!fir.ref<!fir.array<10x5xf32>>) -> !hlfir.expr<5x10xf32>
   return
 }
 
 func.func @matmul_transpose_no_reads(%arg0: !hlfir.expr<?x?xf32>, %arg1: !hlfir.expr<?x?xf32>) {
-// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
   %0 = hlfir.matmul_transpose %arg0 %arg1 : (!hlfir.expr<?x?xf32>, !hlfir.expr<?x?xf32>) -> !hlfir.expr<?x?xf32>
   return
 }
 
 func.func @matmul_transpose_reads(%arg0: !fir.ref<!fir.array<5x10xf32>>, %arg1: !fir.ref<!fir.array<5x10xf32>>) {
-// expected-remark@+3 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// there are read effects on both arguments - the diagnostic verification just doesn't register duplicate identical diagnostics
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+3 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource '<Default>'}}
   %0 = hlfir.matmul_transpose %arg0 %arg1 : (!fir.ref<!fir.array<5x10xf32>>, !fir.ref<!fir.array<5x10xf32>>) -> !hlfir.expr<10x10xf32>
   return
 }
@@ -195,8 +196,8 @@ func.func @associate(%arg0: i32) {
 }
 
 func.func @as_expr_read(%arg0: !fir.ref<!fir.array<2xi32>>) {
-// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   %0 = hlfir.as_expr %arg0 : (!fir.ref<!fir.array<2xi32>>) -> !hlfir.expr<?xi32>
 // expected-remark@+1 {{found an instance of 'free' on resource '<Default>'}}
   hlfir.destroy %0 : !hlfir.expr<?xi32>
@@ -204,28 +205,28 @@ func.func @as_expr_read(%arg0: !fir.ref<!fir.array<2xi32>>) {
 }
 
 func.func @char_extremum(%arg0: !fir.ref<!fir.char<1,10>>, %arg1: !fir.ref<!fir.char<1,20>>) {
-// expected-remark@+3 {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
-// there are read effects on both arguments - the diagnostic verification just doesn't register duplicate identical diagnostics
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+3 {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource '<Default>'}}
   %0 = hlfir.char_extremum min, %arg0, %arg1 : (!fir.ref<!fir.char<1, 10>>, !fir.ref<!fir.char<1,20>>) -> !hlfir.expr<!fir.char<1,10>>
   return
 }
 
 func.func @copy_in(%box: !fir.box<!fir.array<?xf64>>, %temp: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, %is_present: i1) {
 // expected-remark@+3 {{found an instance of 'allocate' on resource '<Default>'}}
-// expected-remark@+2 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'write' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'write' on op operand 1, on resource '<Default>'}}
   %0:2 = hlfir.copy_in %box to %temp : (!fir.box<!fir.array<?xf64>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>) -> (!fir.box<!fir.array<?xf64>>, i1)
   return
 }
 
 func.func @copy_out(%box: !fir.box<!fir.array<?xf64>>, %temp: !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, %was_copied: i1) {
 // expected-remark@+2 {{found an instance of 'free' on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
   hlfir.copy_out %temp, %was_copied : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1) -> ()
 // expected-remark@+3 {{found an instance of 'free' on resource '<Default>'}}
-// expected-remark@+2 {{found an instance of 'read' on a op operand, on resource '<Default>'}}
-// expected-remark@+1 {{found an instance of 'write' on a op operand, on resource '<Default>'}}
+// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+// expected-remark@+1 {{found an instance of 'write' on op operand 2, on resource '<Default>'}}
   hlfir.copy_out %temp, %was_copied to %box : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, i1, !fir.box<!fir.array<?xf64>>) -> ()
   return
 }
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 3179b4f975404..a295bf1eb4d95 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -5038,7 +5038,7 @@ void TransferWriteOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
   if (llvm::isa<MemRefType>(getShapedType()))
-    effects.emplace_back(MemoryEffects::Write::get(), &getValueToStoreMutable(),
+    effects.emplace_back(MemoryEffects::Write::get(), &getBaseMutable(),
                          SideEffects::DefaultResource::get());
 }
 
diff --git a/mlir/test/Dialect/Bufferization/side-effects.mlir b/mlir/test/Dialect/Bufferization/side-effects.mlir
index 841490e9f3234..129fc8b32c270 100644
--- a/mlir/test/Dialect/Bufferization/side-effects.mlir
+++ b/mlir/test/Dialect/Bufferization/side-effects.mlir
@@ -1,9 +1,9 @@
 // RUN: mlir-opt %s --test-side-effects --verify-diagnostics
 
 func.func @test_side_effects(%arg0: memref<2xi32>) -> memref<2xi32> {
-  // expected-remark @below {{found an instance of 'read' on a op operand, on resource '<Default>'}}
-  // expected-remark @below {{found an instance of 'write' on a op result, on resource '<Default>'}}
-  // expected-remark @below {{found an instance of 'allocate' on a op result, on resource '<Default>'}}
+  // expected-remark @below {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+  // expected-remark @below {{found an instance of 'write' on op result 0, on resource '<Default>'}}
+  // expected-remark @below {{found an instance of 'allocate' on op result 0, on resource '<Default>'}}
   %0 = bufferization.clone %arg0 : memref<2xi32> to memref<2xi32>
   return %0 : memref<2xi32>
 }
diff --git a/mlir/test/Dialect/Vector/side-effects.mlir b/mlir/test/Dialect/Vector/side-effects.mlir
new file mode 100644
index 0000000000000..54c274a1a2a02
--- /dev/null
+++ b/mlir/test/Dialect/Vector/side-effects.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-opt %s --test-side-effects --verify-diagnostics
+
+func.func @test_side_effects(%arg0: memref<8xf32>) {
+  // expected-remark @below {{operation has no memory effects}}
+  %c0 = arith.constant 0 : index
+  // expected-remark @below {{operation has no memory effects}}
+  %c4 = arith.constant 4 : index
+  // expected-remark @below {{operation has no memory effects}}
+  %cst = arith.constant 0.0 : f32
+  // expected-remark @below {{found an instance of 'read' on op operand 0, on resource '<Default>'}}
+  %0 = vector.transfer_read %arg0[%c0], %cst : memref<8xf32>, vector<4xf32>
+  // expected-remark @below {{found an instance of 'write' on op operand 1, on resource '<Default>'}}
+  vector.transfer_write %0, %arg0[%c4] : vector<4xf32>, memref<8xf32>
+  return
+}
diff --git a/mlir/test/IR/test-side-effects.mlir b/mlir/test/IR/test-side-effects.mlir
index efce4856041a1..b652ecb7dad1d 100644
--- a/mlir/test/IR/test-side-effects.mlir
+++ b/mlir/test/IR/test-side-effects.mlir
@@ -15,7 +15,7 @@ func.func @side_effect(%arg : index) {
     {effect="write", test_resource}
   ]} : () -> i32
   
-  // expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource '<Test>'}}
+  // expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource '<Test>'}}
   %3 = "test.side_effect_op"() {effects = [
     {effect="allocate", on_result, test_resource}
   ]} : () -> i32
@@ -38,19 +38,19 @@ func.func @side_effect(%arg : index) {
     effect_parameter = affine_map<(i, j) -> (j, i)>
   } : () -> i32
 
-  // expected-remark@+1 {{found an instance of 'allocate' on a op operand, on resource '<Test>'}}
+  // expected-remark@+1 {{found an instance of 'allocate' on op operand 0, on resource '<Test>'}}
   %6 = test.side_effect_with_region_op (%arg) {
   ^bb0(%arg0 : index):
     test.region_yield %arg0 : index 
   } {effects = [ {effect="allocate", on_operand, test_resource} ]} : index -> index
 
-  // expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource '<Test>'}}
+  // expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource '<Test>'}}
   %7 = test.side_effect_with_region_op (%arg) {
   ^bb0(%arg0 : index):
     test.region_yield %arg0 : index 
   } {effects = [ {effect="allocate", on_result, test_resource} ]} : index -> index
 
-  // expected-remark@+1 {{found an instance of 'allocate' on a block argument, on resource '<Test>'}}
+  // expected-remark@+1 {{found an instance of 'allocate' on block argument 0, on resource '<Test>'}}
   %8 = test.side_effect_with_region_op (%arg) {
   ^bb0(%arg0 : index):
     test.region_yield %arg0 : index 
diff --git a/mlir/test/lib/IR/TestSideEffects.cpp b/mlir/test/lib/IR/TestSideEffects.cpp
index 7e01509d55685..000e7c204fd5f 100644
--- a/mlir/test/lib/IR/TestSideEffects.cpp
+++ b/mlir/test/lib/IR/TestSideEffects.cpp
@@ -52,12 +52,12 @@ struct SideEffectsPass
           diag << "'write'";
 
         if (instance.getValue()) {
-          if (instance.getEffectValue<OpOperand *>())
-            diag << " on a op operand,";
-          else if (instance.getEffectValue<OpResult>())
-            diag << " on a op result,";
-          else if (instance.getEffectValue<BlockArgument>())
-            diag << " on a block argument,";
+          if (auto *opOpd = instance.getEffectValue<OpOperand *>())
+            diag << " on op operand " << opOpd->getOperandNumber() << ",";
+          else if (auto opRes = instance.getEffectValue<OpResult>())
+            diag << " on op result " << opRes.getResultNumber() << ",";
+          else if (auto opBlk = instance.getEffectValue<BlockArgument>())
+            diag << " on block argument " << opBlk.getArgNumber() << ",";
         } else if (SymbolRefAttr symbolRef = instance.getSymbolRef())
           diag << " on a symbol '" << symbolRef << "',";
 

From 2dd88c405d77b34dc028af09f3d55fa10dbed50e Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Wed, 11 Jun 2025 13:44:01 +0200
Subject: [PATCH 060/851] [flang][OpenMP] Extend locality spec to OMP claues
 (`init` and `dealloc` regions) (#142795)

Extends support for locality specifier to OpenMP translation by adding
supprot for transling localizers that have `init` and `dealloc` regions.
---
 .../OpenMP/DoConcurrentConversion.cpp         | 29 +++++++++--
 .../locality_specifiers_init_dealloc.mlir     | 51 +++++++++++++++++++
 2 files changed, 76 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 283c3052c166c..28f6c8bf02813 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -326,16 +326,37 @@ class DoConcurrentConversion
           TODO(localizer.getLoc(),
                "local_init conversion is not supported yet");
 
-        if (!localizer.getInitRegion().empty())
-          TODO(localizer.getLoc(),
-               "non-empty `init` regions are not supported yet");
-
         auto oldIP = rewriter.saveInsertionPoint();
         rewriter.setInsertionPointAfter(localizer);
         auto privatizer = rewriter.create<mlir::omp::PrivateClauseOp>(
             localizer.getLoc(), sym.getLeafReference().str() + ".omp",
             localizer.getTypeAttr().getValue(),
             mlir::omp::DataSharingClauseType::Private);
+
+        if (!localizer.getInitRegion().empty()) {
+          rewriter.cloneRegionBefore(localizer.getInitRegion(),
+                                     privatizer.getInitRegion(),
+                                     privatizer.getInitRegion().begin());
+          auto firYield = mlir::cast<fir::YieldOp>(
+              privatizer.getInitRegion().back().getTerminator());
+          rewriter.setInsertionPoint(firYield);
+          rewriter.create<mlir::omp::YieldOp>(firYield.getLoc(),
+                                              firYield.getOperands());
+          rewriter.eraseOp(firYield);
+        }
+
+        if (!localizer.getDeallocRegion().empty()) {
+          rewriter.cloneRegionBefore(localizer.getDeallocRegion(),
+                                     privatizer.getDeallocRegion(),
+                                     privatizer.getDeallocRegion().begin());
+          auto firYield = mlir::cast<fir::YieldOp>(
+              privatizer.getDeallocRegion().back().getTerminator());
+          rewriter.setInsertionPoint(firYield);
+          rewriter.create<mlir::omp::YieldOp>(firYield.getLoc(),
+                                              firYield.getOperands());
+          rewriter.eraseOp(firYield);
+        }
+
         rewriter.restoreInsertionPoint(oldIP);
 
         wsloopClauseOps.privateVars.push_back(op);
diff --git a/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir
new file mode 100644
index 0000000000000..1659c7bdf6d3e
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir
@@ -0,0 +1,51 @@
+// Tests mapping `local` locality specifier to `private` clauses for non-empty
+// `init` and `dealloc` regions.
+
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s
+
+func.func @my_allocator(%arg0: !fir.ref<!fir.box<!fir.array<10xf32>>>, %arg1: !fir.ref<!fir.box<!fir.array<10xf32>>>) {
+  return
+}
+
+func.func @my_deallocator(%arg0: !fir.ref<!fir.box<!fir.array<10xf32>>>) {
+  return
+}
+
+fir.local {type = local} @_QFlocal_assocEaa_private_box_10xf32 : !fir.box<!fir.array<10xf32>> init {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<10xf32>>>, %arg1: !fir.ref<!fir.box<!fir.array<10xf32>>>):
+  fir.call @my_allocator(%arg0, %arg1) : (!fir.ref<!fir.box<!fir.array<10xf32>>>, !fir.ref<!fir.box<!fir.array<10xf32>>>) -> ()
+  fir.yield(%arg1 : !fir.ref<!fir.box<!fir.array<10xf32>>>)
+} dealloc {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<10xf32>>>):
+  fir.call @my_deallocator(%arg0) : (!fir.ref<!fir.box<!fir.array<10xf32>>>) -> ()
+  fir.yield
+}
+
+func.func @_QPlocal_assoc() {
+  %0 = fir.alloca !fir.box<!fir.array<10xf32>>
+  %c1 = arith.constant 1 : index
+
+  fir.do_concurrent {
+    %9 = fir.alloca i32 {bindc_name = "i"}
+    %10:2 = hlfir.declare %9 {uniq_name = "_QFlocal_assocEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) local(@_QFlocal_assocEaa_private_box_10xf32 %0 -> %arg1 : !fir.ref<!fir.box<!fir.array<10xf32>>>) {
+      %11 = fir.convert %arg0 : (index) -> i32
+      fir.store %11 to %10#0 : !fir.ref<i32>
+    }
+  }
+
+  return
+}
+
+// CHECK:      omp.private {type = private} @[[PRIVATIZER:.*]] : !fir.box<!fir.array<10xf32>> init {
+// CHECK-NEXT: ^bb0(%[[ORIG_ARG:.*]]: !{{.*}}, %[[PRIV_ARG:.*]]: !{{.*}}):
+// CHECK-NEXT:   fir.call @my_allocator(%[[ORIG_ARG]], %[[PRIV_ARG]]) : ({{.*}}) -> ()
+// CHECK-NEXT:   omp.yield(%[[PRIV_ARG]] : {{.*}})
+// CHECK-NEXT: } dealloc {
+// CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !{{.*}}):
+// CHECK-NEXT:   fir.call @my_deallocator(%[[PRIV_ARG]]) : ({{.*}}) -> ()
+// CHECK-NEXT:   omp.yield
+// CHECK-NEXT: }
+
+// CHECK: %[[LOCAL_ALLOC:.*]] = fir.alloca !fir.box<!fir.array<10xf32>>
+// CHECK: omp.wsloop private(@[[PRIVATIZER]] %[[LOCAL_ALLOC]] -> %{{.*}} : !{{.*}})

From 756e7cfd86c7f2bf20aaa1a3f87b5aa72ec128b4 Mon Sep 17 00:00:00 2001
From: Adrian Vogelsgesang <vogelsgesang@gmail.com>
Date: Wed, 11 Jun 2025 13:50:32 +0200
Subject: [PATCH 061/851] [debuginfo][coro] Fix linkage name for clones of coro
 functions (#141889)

So far, the `DW_AT_linkage_name` of the coroutine `resume`, `destroy`,
`cleanup` and `noalloc` function clones were incorrectly set to the
original function name instead of the updated function names.

With this commit, we now update the `DW_AT_linkage_name` to the correct
name. This has multiple benefits:

1. it's easier for me (and other toolchain developers) to understand the
   output of `llvm-dwarf-dump` when coroutines are involved.
2. When hitting a breakpoint, both LLDB and GDB now tell you which clone
   of the function you are in. E.g., GDB now prints "Breakpoint 1.2,
   coro_func(int) [clone .resume] (v=43) at ..." instead of "Breakpoint
   1.2, coro_func(int) (v=43) at ...".
3. GDB's `info line coro_func` command now allows you to distinguish the
   multiple different clones of the function.

In Swift, the linkage names of the clones were already updated. The
comment right above the relevant code in `CoroSplit.cpp` already hinted
that the linkage name should probably also be updated in C++. This
comment was added in commit 6ce76ff7eb7640, and back then the
corresponding `DW_AT_specification` (i.e., `SP->getDeclaration()`) was
not updated, yet, which led to problems for C++. In the meantime, commit
ca1a5b37c7236d added code to also update `SP->getDeclaration`, as such
there is no reason anymore to not update the linkage name for C++.

Note that most test cases used inconsistent function names for the LLVM
function vs. the DISubprogram linkage name. clang would never emit such
LLVM IR. This confused me initially, and hence I fixed it while updating
the test case.

Drive-by fix: The change in `CGVTables.cpp` is purely stylistic, NFC.
When looking for other usages of `replaceWithDistinct`, I got initially
confused because `CGVTables.cpp` was calling a static function via an
object instance.
---
 clang/lib/CodeGen/CGVTables.cpp               |  2 +-
 .../coroutine_handle/TestCoroutineHandle.py   |  5 ++-
 llvm/lib/Transforms/Coroutines/CoroSplit.cpp  | 31 +++++--------------
 ...coro-debug-dbg.values-not_used_in_frame.ll |  6 ++--
 .../Coroutines/coro-debug-dbg.values.ll       | 10 +++---
 .../Coroutines/coro-debug-frame-variable.ll   | 10 +++---
 llvm/test/Transforms/Coroutines/coro-debug.ll | 18 +++++------
 7 files changed, 35 insertions(+), 47 deletions(-)

diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp
index c7447273a42fa..2897ccdf88660 100644
--- a/clang/lib/CodeGen/CGVTables.cpp
+++ b/clang/lib/CodeGen/CGVTables.cpp
@@ -124,7 +124,7 @@ static void resolveTopLevelMetadata(llvm::Function *Fn,
   auto *DIS = Fn->getSubprogram();
   if (!DIS)
     return;
-  auto *NewDIS = DIS->replaceWithDistinct(DIS->clone());
+  auto *NewDIS = llvm::MDNode::replaceWithDistinct(DIS->clone());
   VMap.MD()[DIS].reset(NewDIS);
 
   // Find all llvm.dbg.declare intrinsics and resolve the DILocalVariable nodes
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py
index ae1a0c86b45d8..f471ea728f953 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py
@@ -18,8 +18,11 @@ def do_test(self, stdlib_type):
         self.build(dictionary={stdlib_type: "1"})
         is_clang = self.expectedCompiler(["clang"])
 
+        # Clang <= 20 used to also name the resume/destroy functions
+        # as `my_generator_func`.
+        # Never versions of clang name the clones as `.resume`/`.destroy`.
         test_generator_func_ptr_re = re.compile(
-            r"^\(a.out`my_generator_func\(\) at main.cpp:[0-9]*\)$"
+            r"^\(a.out`my_generator_func\(\)( \(\..*\))? at main.cpp:[0-9]*\)$"
         )
 
         # Run until the initial suspension point
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index cebe44581b061..8813f91e9060c 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -913,29 +913,14 @@ void coro::BaseCloner::create() {
     assert(SP != OrigF.getSubprogram() && SP->isDistinct());
     updateScopeLine(ActiveSuspend, *SP);
 
-    // Update the linkage name to reflect the modified symbol name. It
-    // is necessary to update the linkage name in Swift, since the
-    // mangling changes for resume functions. It might also be the
-    // right thing to do in C++, but due to a limitation in LLVM's
-    // AsmPrinter we can only do this if the function doesn't have an
-    // abstract specification, since the DWARF backend expects the
-    // abstract specification to contain the linkage name and asserts
-    // that they are identical.
-    if (SP->getUnit() &&
-        SP->getUnit()->getSourceLanguage() == dwarf::DW_LANG_Swift) {
-      SP->replaceLinkageName(MDString::get(Context, NewF->getName()));
-      if (auto *Decl = SP->getDeclaration()) {
-        auto *NewDecl = DISubprogram::get(
-            Decl->getContext(), Decl->getScope(), Decl->getName(),
-            NewF->getName(), Decl->getFile(), Decl->getLine(), Decl->getType(),
-            Decl->getScopeLine(), Decl->getContainingType(),
-            Decl->getVirtualIndex(), Decl->getThisAdjustment(),
-            Decl->getFlags(), Decl->getSPFlags(), Decl->getUnit(),
-            Decl->getTemplateParams(), nullptr, Decl->getRetainedNodes(),
-            Decl->getThrownTypes(), Decl->getAnnotations(),
-            Decl->getTargetFuncName());
-        SP->replaceDeclaration(NewDecl);
-      }
+    // Update the linkage name and the function name to reflect the modified
+    // name.
+    MDString *NewLinkageName = MDString::get(Context, NewF->getName());
+    SP->replaceLinkageName(NewLinkageName);
+    if (DISubprogram *Decl = SP->getDeclaration()) {
+      TempDISubprogram NewDecl = Decl->clone();
+      NewDecl->replaceLinkageName(NewLinkageName);
+      SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl)));
     }
   }
 
diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll
index 4da07c91eb486..deaec7b8d7f89 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll
@@ -2,18 +2,18 @@
 ; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split,coro-split)' -S | FileCheck %s
 ;
 ; This file is based on coro-debug-frame-variable.ll.
-; CHECK:  define internal fastcc void @f.resume(ptr noundef nonnull align 16 dereferenceable(80) %begin) !dbg ![[RESUME_FN_DBG_NUM:[0-9]+]]
+; CHECK:  define internal fastcc void @_Z3foov.resume(ptr noundef nonnull align 16 dereferenceable(80) %begin) !dbg ![[RESUME_FN_DBG_NUM:[0-9]+]]
 ; CHECK:       await.ready:
 ; CHECK:         #dbg_value(i32 poison, ![[IVAR_RESUME:[0-9]+]], !DIExpression(
 ; CHECK:         #dbg_value(i32 poison, ![[JVAR_RESUME:[0-9]+]], !DIExpression(
 ;
-; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov"
+; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov.resume"
 ; CHECK: ![[IVAR_RESUME]] = !DILocalVariable(name: "i"
 ; CHECK: ![[JVAR_RESUME]] = !DILocalVariable(name: "j"
 
 source_filename = "../llvm/test/Transforms/Coroutines/coro-debug-dbg.values-O2.ll"
 
-define void @f(i32 %i, i32 %j) presplitcoroutine !dbg !8 {
+define void @_Z3foov(i32 %i, i32 %j) presplitcoroutine !dbg !8 {
 entry:
   %__promise = alloca i8, align 8
   %x = alloca [10 x i32], align 16
diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
index 28592cc671062..5f7701c357ec3 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll
@@ -2,7 +2,7 @@
 ; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split,coro-split)' -S | FileCheck %s
 ;
 ; This file is based on coro-debug-frame-variable.ll.
-; CHECK-LABEL: define void @f(
+; CHECK-LABEL: define void @_Z3foov(
 ; CHECK:       %[[frame:.*]] = call {{.*}} @llvm.coro.begin
 ; CHECK:       #dbg_value(ptr %[[frame]]
 ; CHECK-SAME:    !DIExpression(DW_OP_plus_uconst, [[OffsetX:[0-9]*]]),
@@ -20,7 +20,7 @@
 ; CHECK:       #dbg_value(ptr %[[frame]]
 ; CHECK-SAME:    !DIExpression(DW_OP_plus_uconst, [[OffsetJ:[0-9]*]], DW_OP_deref),
 
-; CHECK-LABEL: void @f.resume(
+; CHECK-LABEL: void @_Z3foov.resume(
 ; CHECK-SAME:                 ptr {{.*}} %[[frame:.*]])
 ; CHECK-SAME:  !dbg ![[RESUME_FN_DBG_NUM:[0-9]+]]
 ; CHECK:         %[[frame_alloca:.*]] = alloca ptr
@@ -37,7 +37,7 @@
 ; CHECK:         #dbg_value(ptr %[[frame_alloca]], ![[JVAR_RESUME:[0-9]+]],
 ; CHECK-SAME:        !DIExpression(DW_OP_deref, DW_OP_plus_uconst, [[OffsetJ]], DW_OP_deref)
 ;
-; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov"
+; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov.resume"
 ; CHECK: ![[FRAME_DI_NUM]] = !DILocalVariable(name: "__coro_frame"
 ; CHECK: ![[IVAR_RESUME]] = !DILocalVariable(name: "i"
 ; CHECK: ![[XVAR_RESUME]] = !DILocalVariable(name: "x"
@@ -46,7 +46,7 @@
 
 declare void @consume(i32)
 
-define void @f(i32 %i, i32 %j) presplitcoroutine !dbg !8 {
+define void @_Z3foov(i32 %i, i32 %j) presplitcoroutine !dbg !8 {
 entry:
   %__promise = alloca i8, align 8
   %x = alloca [10 x i32], align 16
@@ -257,4 +257,4 @@ attributes #4 = { argmemonly nofree nosync nounwind willreturn writeonly }
 !21 = !DILocation(line: 43, column: 3, scope: !7)
 !22 = !DILocation(line: 43, column: 8, scope: !7)
 !23 = !DILocalVariable(name: "produced", scope: !7, file: !1, line:24, type: !10)
-!30 = distinct !DIAssignID()
\ No newline at end of file
+!30 = distinct !DIAssignID()
diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll
index a3c62b2dd12e1..125ec752c8345 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll
@@ -23,11 +23,11 @@
 ;
 ; The CHECKs verify that dbg.declare intrinsics are created for the coroutine
 ; funclet 'f.resume', and that they reference the address of the variables on
-; the coroutine frame. The debug locations for the original function 'f' are
+; the coroutine frame. The debug locations for the original function 'foo' are
 ; static (!11 and !13), whereas the coroutine funclet will have its own new
 ; ones with identical line and column numbers.
 ;
-; CHECK-LABEL: define void @f() {{.*}} {
+; CHECK-LABEL: define void @_Z3foov() {{.*}} {
 ; CHECK:       entry:
 ; CHECK:         %j = alloca i32, align 4
 ; CHECK:         #dbg_declare(ptr %j, ![[JVAR:[0-9]+]], !DIExpression(), ![[JDBGLOC:[0-9]+]]
@@ -36,7 +36,7 @@
 ; CHECK:         #dbg_declare(ptr %[[MEMORY]], ![[IVAR:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 20), ![[IDBGLOC]]
 ; CHECK:       await.ready:
 ;
-; CHECK-LABEL: define internal fastcc void @f.resume({{.*}}) {{.*}} {
+; CHECK-LABEL: define internal fastcc void @_Z3foov.resume({{.*}}) {{.*}} {
 ; CHECK:       entry.resume:
 ; CHECK-NEXT:    %[[DBG_PTR:.*]] = alloca ptr
 ; CHECK-NEXT:    #dbg_declare(ptr %[[DBG_PTR]], ![[XVAR_RESUME:[0-9]+]],   !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 32),
@@ -58,13 +58,13 @@
 ; CHECK-DAG: ![[JDBGLOC]] = !DILocation(line: 32, column: 7, scope: ![[BLK_SCOPE]])
 
 ; CHECK-DAG: ![[XVAR_RESUME]] = !DILocalVariable(name: "x"
-; CHECK-DAG: ![[RESUME_PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov"
+; CHECK-DAG: ![[RESUME_PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov.resume"
 ; CHECK-DAG: ![[IDBGLOC_RESUME]] = !DILocation(line: 24, column: 7, scope: ![[RESUME_BLK_SCOPE:[0-9]+]])
 ; CHECK-DAG: ![[RESUME_BLK_SCOPE]] = distinct !DILexicalBlock(scope: ![[RESUME_PROG_SCOPE]], file: !1, line: 23, column: 12)
 ; CHECK-DAG: ![[IVAR_RESUME]] = !DILocalVariable(name: "i"
 ; CHECK-DAG: ![[JVAR_RESUME]] = !DILocalVariable(name: "j"
 ; CHECK-DAG: ![[JDBGLOC_RESUME]] = !DILocation(line: 32, column: 7, scope: ![[RESUME_BLK_SCOPE]])
-define void @f() presplitcoroutine !dbg !8 {
+define void @_Z3foov() presplitcoroutine !dbg !8 {
 entry:
   %__promise = alloca i8, align 8
   %i = alloca i32, align 4
diff --git a/llvm/test/Transforms/Coroutines/coro-debug.ll b/llvm/test/Transforms/Coroutines/coro-debug.ll
index 17a0b80c5b5e5..a220073248ba3 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug.ll
@@ -6,12 +6,12 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: noinline nounwind
-define ptr @f(i32 %x) #0 personality i32 0 !dbg !6 {
+define ptr @flink(i32 %x) #0 personality i32 0 !dbg !6 {
 entry:
   %x.addr = alloca i32, align 4
   %coro_hdl = alloca ptr, align 8
   store i32 %x, ptr %x.addr, align 4
-  %0 = call token @llvm.coro.id(i32 0, ptr null, ptr @f, ptr null), !dbg !16
+  %0 = call token @llvm.coro.id(i32 0, ptr null, ptr @flink, ptr null), !dbg !16
   %1 = call i64 @llvm.coro.size.i64(), !dbg !16
   %call = call ptr @malloc(i64 %1), !dbg !16
   %2 = call ptr @llvm.coro.begin(token %0, ptr %call) #7, !dbg !16
@@ -170,8 +170,8 @@ attributes #7 = { noduplicate }
 !31 = !DILocalVariable(name: "allocated", scope: !6, file: !7, line: 55, type: !11)
 !32 = !DILocalVariable(name: "inline_asm", scope: !6, file: !7, line: 55, type: !11)
 
-; CHECK: define ptr @f(i32 %x) #0 personality i32 0 !dbg ![[ORIG:[0-9]+]]
-; CHECK: define internal fastcc void @f.resume(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[RESUME:[0-9]+]]
+; CHECK: define ptr @flink(i32 %x) #0 personality i32 0 !dbg ![[ORIG:[0-9]+]]
+; CHECK: define internal fastcc void @flink.resume(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[RESUME:[0-9]+]]
 ; CHECK: entry.resume:
 ; CHECK: %[[DBG_PTR:.*]] = alloca ptr
 ; CHECK: #dbg_declare(ptr %[[DBG_PTR]], ![[RESUME_COROHDL:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst,
@@ -194,18 +194,18 @@ attributes #7 = { noduplicate }
 ; CHECK: [[DEFAULT_DEST]]:
 ; CHECK-NOT: {{.*}}:
 ; CHECK: #dbg_value(i32 %[[CALLBR_RES]]
-; CHECK: define internal fastcc void @f.destroy(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[DESTROY:[0-9]+]]
-; CHECK: define internal fastcc void @f.cleanup(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[CLEANUP:[0-9]+]]
+; CHECK: define internal fastcc void @flink.destroy(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[DESTROY:[0-9]+]]
+; CHECK: define internal fastcc void @flink.cleanup(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[CLEANUP:[0-9]+]]
 
 ; CHECK: ![[ORIG]] = distinct !DISubprogram(name: "f", linkageName: "flink"
 
-; CHECK: ![[RESUME]] = distinct !DISubprogram(name: "f", linkageName: "flink"
+; CHECK: ![[RESUME]] = distinct !DISubprogram(name: "f", linkageName: "flink.resume"
 ; CHECK: ![[RESUME_COROHDL]] = !DILocalVariable(name: "coro_hdl", scope: ![[RESUME]]
 ; CHECK: ![[RESUME_X]] = !DILocalVariable(name: "x", arg: 1, scope: ![[RESUME]]
 ; CHECK: ![[RESUME_CONST]] = !DILocalVariable(name: "direct_const", scope: ![[RESUME]]
 ; CHECK: ![[RESUME_DIRECT]] = !DILocalVariable(name: "direct_mem", scope: ![[RESUME]]
 ; CHECK: ![[RESUME_DIRECT_VALUE]] = !DILocalVariable(name: "direct_value", scope: ![[RESUME]]
 
-; CHECK: ![[DESTROY]] = distinct !DISubprogram(name: "f", linkageName: "flink"
+; CHECK: ![[DESTROY]] = distinct !DISubprogram(name: "f", linkageName: "flink.destroy"
 
-; CHECK: ![[CLEANUP]] = distinct !DISubprogram(name: "f", linkageName: "flink"
+; CHECK: ![[CLEANUP]] = distinct !DISubprogram(name: "f", linkageName: "flink.cleanup"

From f44f411afa914107d0a2395d2d8db826f88205e5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 11 Jun 2025 21:05:42 +0900
Subject: [PATCH 062/851] MSP430: Add tests for fcmp (#142706)

The existing coverage is thin. libcalls.ll seems to be the main fcmp
test, and it doesn't cover all the condition types, and runs with -O0.

Test all conditions for f32 and f64
---
 llvm/test/CodeGen/MSP430/fcmp.ll | 761 +++++++++++++++++++++++++++++++
 1 file changed, 761 insertions(+)
 create mode 100644 llvm/test/CodeGen/MSP430/fcmp.ll

diff --git a/llvm/test/CodeGen/MSP430/fcmp.ll b/llvm/test/CodeGen/MSP430/fcmp.ll
new file mode 100644
index 0000000000000..df1edc61b3370
--- /dev/null
+++ b/llvm/test/CodeGen/MSP430/fcmp.ll
@@ -0,0 +1,761 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=msp430-unknown-unknown < %s | FileCheck %s
+
+define i1 @fcmp_false_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_false_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    clr.b r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp false double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_oeq_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_oeq_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    rra r12
+; CHECK-NEXT:    and #1, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp oeq double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ogt_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_ogt_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    cmp #1, r13
+; CHECK-NEXT:    jge .LBB2_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp ogt double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_oge_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_oge_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    tst r13
+; CHECK-NEXT:    jge .LBB3_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB3_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp oge double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_olt_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_olt_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    tst r13
+; CHECK-NEXT:    jl .LBB4_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB4_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp olt double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ole_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_ole_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    cmp #1, r13
+; CHECK-NEXT:    jl .LBB5_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB5_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp ole double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_one_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_one_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r4
+; CHECK-NEXT:    push r5
+; CHECK-NEXT:    push r6
+; CHECK-NEXT:    push r7
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    sub #8, r1
+; CHECK-NEXT:    mov r15, r7
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 24(r1), r12
+; CHECK-NEXT:    mov 26(r1), r5
+; CHECK-NEXT:    mov 28(r1), r4
+; CHECK-NEXT:    mov 30(r1), r6
+; CHECK-NEXT:    mov r7, r11
+; CHECK-NEXT:    mov r5, r13
+; CHECK-NEXT:    mov r4, r14
+; CHECK-NEXT:    mov r6, r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r6, 6(r1)
+; CHECK-NEXT:    mov r4, 4(r1)
+; CHECK-NEXT:    mov r5, 2(r1)
+; CHECK-NEXT:    mov 24(r1), r13
+; CHECK-NEXT:    mov r13, 0(r1)
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r6
+; CHECK-NEXT:    mov r8, r12
+; CHECK-NEXT:    mov r9, r13
+; CHECK-NEXT:    mov r10, r14
+; CHECK-NEXT:    mov r7, r15
+; CHECK-NEXT:    call #__unorddf2
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    rra r6
+; CHECK-NEXT:    rra r12
+; CHECK-NEXT:    bic r6, r12
+; CHECK-NEXT:    and #1, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    add #8, r1
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    pop r7
+; CHECK-NEXT:    pop r6
+; CHECK-NEXT:    pop r5
+; CHECK-NEXT:    pop r4
+; CHECK-NEXT:    ret
+  %cmp = fcmp one double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ord_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_ord_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub #8, r1
+; CHECK-NEXT:    mov 16(r1), 6(r1)
+; CHECK-NEXT:    mov 14(r1), 4(r1)
+; CHECK-NEXT:    mov 12(r1), 2(r1)
+; CHECK-NEXT:    mov 10(r1), 0(r1)
+; CHECK-NEXT:    call #__unorddf2
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    rra r12
+; CHECK-NEXT:    and #1, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    add #8, r1
+; CHECK-NEXT:    ret
+  %cmp = fcmp ord double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_uno_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_uno_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    sub #8, r1
+; CHECK-NEXT:    mov 16(r1), 6(r1)
+; CHECK-NEXT:    mov 14(r1), 4(r1)
+; CHECK-NEXT:    mov 12(r1), 2(r1)
+; CHECK-NEXT:    mov 10(r1), 0(r1)
+; CHECK-NEXT:    call #__unorddf2
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r13
+; CHECK-NEXT:    rra r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    bic r13, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    add #8, r1
+; CHECK-NEXT:    ret
+  %cmp = fcmp uno double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ueq_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_ueq_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r4
+; CHECK-NEXT:    push r5
+; CHECK-NEXT:    push r6
+; CHECK-NEXT:    push r7
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    sub #8, r1
+; CHECK-NEXT:    mov r15, r7
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 24(r1), r12
+; CHECK-NEXT:    mov 26(r1), r5
+; CHECK-NEXT:    mov 28(r1), r4
+; CHECK-NEXT:    mov 30(r1), r6
+; CHECK-NEXT:    mov r7, r11
+; CHECK-NEXT:    mov r5, r13
+; CHECK-NEXT:    mov r4, r14
+; CHECK-NEXT:    mov r6, r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r6, 6(r1)
+; CHECK-NEXT:    mov r4, 4(r1)
+; CHECK-NEXT:    mov r5, 2(r1)
+; CHECK-NEXT:    mov 24(r1), r13
+; CHECK-NEXT:    mov r13, 0(r1)
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r6
+; CHECK-NEXT:    rra r6
+; CHECK-NEXT:    and #1, r6
+; CHECK-NEXT:    mov r8, r12
+; CHECK-NEXT:    mov r9, r13
+; CHECK-NEXT:    mov r10, r14
+; CHECK-NEXT:    mov r7, r15
+; CHECK-NEXT:    call #__unorddf2
+; CHECK-NEXT:    bis r6, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    add #8, r1
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    pop r7
+; CHECK-NEXT:    pop r6
+; CHECK-NEXT:    pop r5
+; CHECK-NEXT:    pop r4
+; CHECK-NEXT:    ret
+  %cmp = fcmp ueq double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ugt_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_ugt_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    cmp #1, r13
+; CHECK-NEXT:    jge .LBB10_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB10_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp ugt double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_uge_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_uge_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    tst r13
+; CHECK-NEXT:    jge .LBB11_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB11_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp uge double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ult_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_ult_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    tst r13
+; CHECK-NEXT:    jl .LBB12_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB12_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp ult double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ule_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_ule_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    cmp #1, r13
+; CHECK-NEXT:    jl .LBB13_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB13_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp ule double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_une_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_une_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r11
+; CHECK-NEXT:    mov r14, r10
+; CHECK-NEXT:    mov r13, r9
+; CHECK-NEXT:    mov r12, r8
+; CHECK-NEXT:    mov 8(r1), r12
+; CHECK-NEXT:    mov 10(r1), r13
+; CHECK-NEXT:    mov 12(r1), r14
+; CHECK-NEXT:    mov 14(r1), r15
+; CHECK-NEXT:    call #__mspabi_cmpd
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r13
+; CHECK-NEXT:    rra r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    bic r13, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    ret
+  %cmp = fcmp une double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_true_f64(double %a, double %b) #0 {
+; CHECK-LABEL: fcmp_true_f64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov.b #1, r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp true double %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_false_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_false_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    clr.b r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp false float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_oeq_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_oeq_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    rra r12
+; CHECK-NEXT:    and #1, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp oeq float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ogt_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_ogt_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    cmp #1, r13
+; CHECK-NEXT:    jge .LBB18_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB18_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp ogt float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_oge_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_oge_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    tst r13
+; CHECK-NEXT:    jge .LBB19_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB19_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp oge float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_olt_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_olt_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    tst r13
+; CHECK-NEXT:    jl .LBB20_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB20_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp olt float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ole_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_ole_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    cmp #1, r13
+; CHECK-NEXT:    jl .LBB21_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB21_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp ole float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_one_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_one_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r6
+; CHECK-NEXT:    push r7
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r10
+; CHECK-NEXT:    mov r14, r9
+; CHECK-NEXT:    mov r13, r8
+; CHECK-NEXT:    mov r12, r7
+; CHECK-NEXT:    call #__unordsf2
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r6
+; CHECK-NEXT:    mov r7, r12
+; CHECK-NEXT:    mov r8, r13
+; CHECK-NEXT:    mov r9, r14
+; CHECK-NEXT:    mov r10, r15
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    rra r12
+; CHECK-NEXT:    rra r6
+; CHECK-NEXT:    bic r12, r6
+; CHECK-NEXT:    and #1, r6
+; CHECK-NEXT:    mov.b r6, r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    pop r7
+; CHECK-NEXT:    pop r6
+; CHECK-NEXT:    ret
+  %cmp = fcmp one float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ord_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_ord_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__unordsf2
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    rra r12
+; CHECK-NEXT:    and #1, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp ord float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_uno_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_uno_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__unordsf2
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r13
+; CHECK-NEXT:    rra r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    bic r13, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp uno float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ueq_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_ueq_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    push r6
+; CHECK-NEXT:    push r7
+; CHECK-NEXT:    push r8
+; CHECK-NEXT:    push r9
+; CHECK-NEXT:    push r10
+; CHECK-NEXT:    mov r15, r10
+; CHECK-NEXT:    mov r14, r9
+; CHECK-NEXT:    mov r13, r8
+; CHECK-NEXT:    mov r12, r7
+; CHECK-NEXT:    call #__unordsf2
+; CHECK-NEXT:    mov r12, r6
+; CHECK-NEXT:    mov r7, r12
+; CHECK-NEXT:    mov r8, r13
+; CHECK-NEXT:    mov r9, r14
+; CHECK-NEXT:    mov r10, r15
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r12
+; CHECK-NEXT:    rra r12
+; CHECK-NEXT:    and #1, r12
+; CHECK-NEXT:    bis r6, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    pop r10
+; CHECK-NEXT:    pop r9
+; CHECK-NEXT:    pop r8
+; CHECK-NEXT:    pop r7
+; CHECK-NEXT:    pop r6
+; CHECK-NEXT:    ret
+  %cmp = fcmp ueq float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ugt_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_ugt_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    cmp #1, r13
+; CHECK-NEXT:    jge .LBB26_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB26_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp ugt float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_uge_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_uge_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    tst r13
+; CHECK-NEXT:    jge .LBB27_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB27_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp uge float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ult_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_ult_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    tst r13
+; CHECK-NEXT:    jl .LBB28_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB28_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp ult float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_ule_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_ule_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    mov r12, r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    cmp #1, r13
+; CHECK-NEXT:    jl .LBB29_2
+; CHECK-NEXT:  ; %bb.1:
+; CHECK-NEXT:    clr r12
+; CHECK-NEXT:  .LBB29_2:
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp ule float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_une_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_une_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    call #__mspabi_cmpf
+; CHECK-NEXT:    tst r12
+; CHECK-NEXT:    mov r2, r13
+; CHECK-NEXT:    rra r13
+; CHECK-NEXT:    mov #1, r12
+; CHECK-NEXT:    bic r13, r12
+; CHECK-NEXT:    ; kill: def $r12b killed $r12b killed $r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp une float %a, %b
+  ret i1 %cmp
+}
+
+define i1 @fcmp_true_f32(float %a, float %b) #0 {
+; CHECK-LABEL: fcmp_true_f32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    mov.b #1, r12
+; CHECK-NEXT:    ret
+  %cmp = fcmp true float %a, %b
+  ret i1 %cmp
+}
+
+attributes #0 = { nounwind }

From 953a778fabc48025569fe0d5b3b363b981263f21 Mon Sep 17 00:00:00 2001
From: Serge Pavlov <sepavloff@gmail.com>
Date: Wed, 11 Jun 2025 19:08:23 +0700
Subject: [PATCH 063/851] [RISCV][FPEnv] Lowering of fpenv intrinsics (#141498)

The change implements custom lowering of `get_fpenv`, `set_fpenv` and
`reset_fpenv` for RISCV target.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp  | 44 ++++++++++++++++++++
 llvm/lib/Target/RISCV/RISCVISelLowering.h    |  3 ++
 llvm/lib/Target/RISCV/RISCVInstrInfo.td      |  7 ++++
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td   |  1 +
 llvm/test/CodeGen/RISCV/fpenv-xlen.ll        | 37 ++++++++++++++++
 llvm/test/CodeGen/RISCV/frm-write-in-loop.ll | 31 ++++++++++++++
 6 files changed, 123 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/fpenv-xlen.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 498adee35550c..a157c94849f37 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -649,6 +649,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
 
     setOperationAction(ISD::GET_ROUNDING, XLenVT, Custom);
     setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
+    setOperationAction(ISD::GET_FPENV, XLenVT, Custom);
+    setOperationAction(ISD::SET_FPENV, XLenVT, Custom);
+    setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom);
   }
 
   setOperationAction({ISD::GlobalAddress, ISD::BlockAddress, ISD::ConstantPool,
@@ -8159,6 +8162,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return lowerGET_ROUNDING(Op, DAG);
   case ISD::SET_ROUNDING:
     return lowerSET_ROUNDING(Op, DAG);
+  case ISD::GET_FPENV:
+    return lowerGET_FPENV(Op, DAG);
+  case ISD::SET_FPENV:
+    return lowerSET_FPENV(Op, DAG);
+  case ISD::RESET_FPENV:
+    return lowerRESET_FPENV(Op, DAG);
   case ISD::EH_DWARF_CFA:
     return lowerEH_DWARF_CFA(Op, DAG);
   case ISD::VP_MERGE:
@@ -13799,6 +13808,41 @@ SDValue RISCVTargetLowering::lowerSET_ROUNDING(SDValue Op,
                      RMValue);
 }
 
+SDValue RISCVTargetLowering::lowerGET_FPENV(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  const MVT XLenVT = Subtarget.getXLenVT();
+  SDLoc DL(Op);
+  SDValue Chain = Op->getOperand(0);
+  SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT);
+  SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other);
+  return DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo);
+}
+
+SDValue RISCVTargetLowering::lowerSET_FPENV(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  const MVT XLenVT = Subtarget.getXLenVT();
+  SDLoc DL(Op);
+  SDValue Chain = Op->getOperand(0);
+  SDValue EnvValue = Op->getOperand(1);
+  SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT);
+
+  EnvValue = DAG.getNode(ISD::ZERO_EXTEND, DL, XLenVT, EnvValue);
+  return DAG.getNode(RISCVISD::WRITE_CSR, DL, MVT::Other, Chain, SysRegNo,
+                     EnvValue);
+}
+
+SDValue RISCVTargetLowering::lowerRESET_FPENV(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  const MVT XLenVT = Subtarget.getXLenVT();
+  SDLoc DL(Op);
+  SDValue Chain = Op->getOperand(0);
+  SDValue EnvValue = DAG.getRegister(RISCV::X0, XLenVT);
+  SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT);
+
+  return DAG.getNode(RISCVISD::WRITE_CSR, DL, MVT::Other, Chain, SysRegNo,
+                     EnvValue);
+}
+
 SDValue RISCVTargetLowering::lowerEH_DWARF_CFA(SDValue Op,
                                                SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 2ea2bf656ffd7..417d684a62382 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -544,6 +544,9 @@ class RISCVTargetLowering : public TargetLowering {
                                             unsigned ExtendOpc) const;
   SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 0d1ff09f4da3a..70fad925cf070 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -2037,6 +2037,13 @@ let hasSideEffects = true in {
 def ReadFFLAGS : ReadSysReg<SysRegFFLAGS, [FFLAGS]>;
 def WriteFFLAGS : WriteSysReg<SysRegFFLAGS, [FFLAGS]>;
 }
+
+let hasPostISelHook = 1 in {
+def ReadFCSR : ReadSysReg<SysRegFCSR, [FRM, FFLAGS]>;
+def WriteFCSR : WriteSysReg<SysRegFCSR, [FRM, FFLAGS]>;
+def WriteFCSRImm : WriteSysRegImm<SysRegFCSR, [FRM, FFLAGS]>;
+}
+
 /// Other pseudo-instructions
 
 // Pessimistically assume the stack pointer will be clobbered
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 80213e1503b0a..e87f4523a84f9 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -846,6 +846,7 @@ foreach m = LMULList in {
 
 def FFLAGS : RISCVReg<0, "fflags">;
 def FRM    : RISCVReg<0, "frm">;
+def FCSR   : RISCVReg<0, "fcsr">;
 
 // Shadow Stack register
 def SSP    : RISCVReg<0, "ssp">;
diff --git a/llvm/test/CodeGen/RISCV/fpenv-xlen.ll b/llvm/test/CodeGen/RISCV/fpenv-xlen.ll
new file mode 100644
index 0000000000000..148186b21c125
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/fpenv-xlen.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+f -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+f -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zfinx -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zfinx -verify-machineinstrs | FileCheck %s
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+f -verify-machineinstrs -O0 | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+f -verify-machineinstrs -O0 | FileCheck %s
+
+define iXLen @func_get_fpenv() {
+; CHECK-LABEL: func_get_fpenv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    frcsr a0
+; CHECK-NEXT:    ret
+entry:
+  %fpenv = call iXLen @llvm.get.fpenv.iXLen()
+  ret iXLen %fpenv
+}
+
+define void @func_set_fpenv(iXLen %fpenv) {
+; CHECK-LABEL: func_set_fpenv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fscsr a0
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.set.fpenv.iXLen(iXLen %fpenv)
+  ret void
+}
+
+define void @func_reset_fpenv() {
+; CHECK-LABEL: func_reset_fpenv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fscsr zero
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.reset.fpenv()
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/frm-write-in-loop.ll b/llvm/test/CodeGen/RISCV/frm-write-in-loop.ll
index 4f435067343b7..72c5951178276 100644
--- a/llvm/test/CodeGen/RISCV/frm-write-in-loop.ll
+++ b/llvm/test/CodeGen/RISCV/frm-write-in-loop.ll
@@ -90,3 +90,34 @@ loop:
 exit:
     ret double %f2
 }
+
+define double @foo2(double %0, double %1, i64 %n, i64 %fcsr) strictfp {
+; CHECK-LABEL: foo2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    fmv.d.x fa5, zero
+; CHECK-NEXT:  .LBB2_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    csrwi fcsr, 0
+; CHECK-NEXT:    fadd.d fa5, fa5, fa0
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    fscsr a1
+; CHECK-NEXT:    fadd.d fa5, fa5, fa1
+; CHECK-NEXT:    beqz a0, .LBB2_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    fmv.d fa0, fa5
+; CHECK-NEXT:    ret
+entry:
+    br label %loop
+loop:
+    %cnt = phi i64 [0, %entry], [%cnt_inc, %loop]
+    %acc = phi double [0.0, %entry], [%f2, %loop]
+    call void @llvm.set.fpenv(i64 0) strictfp
+    %f1 = call double @llvm.experimental.constrained.fadd.f64(double %acc, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") strictfp
+    call void @llvm.set.fpenv(i64 %fcsr) strictfp
+    %f2 = call double @llvm.experimental.constrained.fadd.f64(double %f1, double %1, metadata !"round.dynamic", metadata !"fpexcept.ignore") strictfp
+    %cnt_inc = add i64 %cnt, 1
+    %cond = icmp eq i64 %cnt_inc, %n
+    br i1 %cond, label %loop, label %exit
+exit:
+    ret double %f2
+}

From 4a46ead8fb5b57e69bcd1c72ebd7dd8eaf09fa9c Mon Sep 17 00:00:00 2001
From: Adrian Vogelsgesang <avogelsgesang@salesforce.com>
Date: Wed, 11 Jun 2025 14:09:54 +0200
Subject: [PATCH 064/851] [lldb] Show coro_frame in `std::coroutine_handle`
 pretty printer (#141516)

This commit adjusts the pretty printer for `std::coroutine_handle` based
on recent personal experiences with debugging C++20 coroutines:

1. It adds the `coro_frame` member. This member exposes the complete
   coroutine frame contents, including the suspension point id and all
   internal variables which the compiler decided to persist into the
   coroutine frame. While this data is highly compiler-specific, inspecting
   it can help identify the internal state of suspended coroutines.
2. It includes the `promise` and `coro_frame` members, even if
   devirtualization failed and we could not infer the promise type / the
   coro_frame type. Having them available as `void*` pointers can still be
   useful to identify, e.g., which two coroutine handles have the same
   frame / promise pointers.
---
 .../lldb/DataFormatters/TypeSynthetic.h       |   2 +-
 lldb/source/DataFormatters/TypeSynthetic.cpp  |   6 +-
 .../Plugins/Language/CPlusPlus/Coroutines.cpp | 145 ++++++++----------
 .../Plugins/Language/CPlusPlus/Coroutines.h   |   4 +-
 .../coroutine_handle/TestCoroutineHandle.py   |  46 ++++--
 5 files changed, 101 insertions(+), 102 deletions(-)

diff --git a/lldb/include/lldb/DataFormatters/TypeSynthetic.h b/lldb/include/lldb/DataFormatters/TypeSynthetic.h
index 37f02fb8f7ce5..11a4ca2cd8330 100644
--- a/lldb/include/lldb/DataFormatters/TypeSynthetic.h
+++ b/lldb/include/lldb/DataFormatters/TypeSynthetic.h
@@ -92,7 +92,7 @@ class SyntheticChildrenFrontEnd {
   lldb::ValueObjectSP
   CreateValueObjectFromAddress(llvm::StringRef name, uint64_t address,
                                const ExecutionContext &exe_ctx,
-                               CompilerType type);
+                               CompilerType type, bool do_deref = true);
 
   lldb::ValueObjectSP CreateValueObjectFromData(llvm::StringRef name,
                                                 const DataExtractor &data,
diff --git a/lldb/source/DataFormatters/TypeSynthetic.cpp b/lldb/source/DataFormatters/TypeSynthetic.cpp
index 57009b07dc553..33af0ad63077f 100644
--- a/lldb/source/DataFormatters/TypeSynthetic.cpp
+++ b/lldb/source/DataFormatters/TypeSynthetic.cpp
@@ -138,9 +138,9 @@ lldb::ValueObjectSP SyntheticChildrenFrontEnd::CreateValueObjectFromExpression(
 
 lldb::ValueObjectSP SyntheticChildrenFrontEnd::CreateValueObjectFromAddress(
     llvm::StringRef name, uint64_t address, const ExecutionContext &exe_ctx,
-    CompilerType type) {
-  ValueObjectSP valobj_sp(
-      ValueObject::CreateValueObjectFromAddress(name, address, exe_ctx, type));
+    CompilerType type, bool do_deref) {
+  ValueObjectSP valobj_sp(ValueObject::CreateValueObjectFromAddress(
+      name, address, exe_ctx, type, do_deref));
   if (valobj_sp)
     valobj_sp->SetSyntheticChildrenGenerated(true);
   return valobj_sp;
diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp
index 9d84af4a85384..e8c2db1886333 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp
@@ -11,8 +11,6 @@
 #include "Plugins/TypeSystem/Clang/TypeSystemClang.h"
 #include "lldb/Symbol/Function.h"
 #include "lldb/Symbol/VariableList.h"
-#include "lldb/Utility/LLDBLog.h"
-#include "lldb/Utility/Log.h"
 
 using namespace lldb;
 using namespace lldb_private;
@@ -61,19 +59,23 @@ static Function *ExtractDestroyFunction(lldb::TargetSP target_sp,
   return destroy_func_address.CalculateSymbolContextFunction();
 }
 
-static CompilerType InferPromiseType(Function &destroy_func) {
-  Block &block = destroy_func.GetBlock(true);
+// clang generates aritifical `__promise` and `__coro_frame` variables inside
+// the destroy function. Look for those variables and extract their type.
+static CompilerType InferArtificialCoroType(Function *destroy_func,
+                                            ConstString var_name) {
+  if (!destroy_func)
+    return {};
+
+  Block &block = destroy_func->GetBlock(true);
   auto variable_list = block.GetBlockVariableList(true);
 
-  // clang generates an artificial `__promise` variable inside the
-  // `destroy` function. Look for it.
-  auto promise_var = variable_list->FindVariable(ConstString("__promise"));
-  if (!promise_var)
+  auto var = variable_list->FindVariable(var_name);
+  if (!var)
     return {};
-  if (!promise_var->IsArtificial())
+  if (!var->IsArtificial())
     return {};
 
-  Type *promise_type = promise_var->GetType();
+  Type *promise_type = var->GetType();
   if (!promise_type)
     return {};
   return promise_type->GetForwardCompilerType();
@@ -107,30 +109,17 @@ lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::
 
 llvm::Expected<uint32_t> lldb_private::formatters::
     StdlibCoroutineHandleSyntheticFrontEnd::CalculateNumChildren() {
-  if (!m_resume_ptr_sp || !m_destroy_ptr_sp)
-    return 0;
-
-  return m_promise_ptr_sp ? 3 : 2;
+  return m_children.size();
 }
 
 lldb::ValueObjectSP lldb_private::formatters::
     StdlibCoroutineHandleSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) {
-  switch (idx) {
-  case 0:
-    return m_resume_ptr_sp;
-  case 1:
-    return m_destroy_ptr_sp;
-  case 2:
-    return m_promise_ptr_sp;
-  }
-  return lldb::ValueObjectSP();
+  return idx < m_children.size() ? m_children[idx] : lldb::ValueObjectSP();
 }
 
 lldb::ChildCacheState
 lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() {
-  m_resume_ptr_sp.reset();
-  m_destroy_ptr_sp.reset();
-  m_promise_ptr_sp.reset();
+  m_children.clear();
 
   ValueObjectSP valobj_sp = m_backend.GetNonSyntheticValue();
   if (!valobj_sp)
@@ -140,60 +129,66 @@ lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() {
   if (frame_ptr_addr == 0 || frame_ptr_addr == LLDB_INVALID_ADDRESS)
     return lldb::ChildCacheState::eRefetch;
 
-  auto ast_ctx = valobj_sp->GetCompilerType().GetTypeSystem<TypeSystemClang>();
-  if (!ast_ctx)
-    return lldb::ChildCacheState::eRefetch;
-
-  // Create the `resume` and `destroy` children.
   lldb::TargetSP target_sp = m_backend.GetTargetSP();
   auto &exe_ctx = m_backend.GetExecutionContextRef();
   lldb::ProcessSP process_sp = target_sp->GetProcessSP();
   auto ptr_size = process_sp->GetAddressByteSize();
-  CompilerType void_type = ast_ctx->GetBasicType(lldb::eBasicTypeVoid);
-  std::array<CompilerType, 1> args{void_type};
-  CompilerType coro_func_type = ast_ctx->CreateFunctionType(
-      /*result_type=*/void_type, args,
-      /*is_variadic=*/false, /*qualifiers=*/0);
-  CompilerType coro_func_ptr_type = coro_func_type.GetPointerType();
-  m_resume_ptr_sp = CreateValueObjectFromAddress(
-      "resume", frame_ptr_addr + 0 * ptr_size, exe_ctx, coro_func_ptr_type);
-  lldbassert(m_resume_ptr_sp);
-  m_destroy_ptr_sp = CreateValueObjectFromAddress(
-      "destroy", frame_ptr_addr + 1 * ptr_size, exe_ctx, coro_func_ptr_type);
-  lldbassert(m_destroy_ptr_sp);
-
-  // Get the `promise_type` from the template argument
-  CompilerType promise_type(
-      valobj_sp->GetCompilerType().GetTypeTemplateArgument(0));
-  if (!promise_type)
+  auto ast_ctx = valobj_sp->GetCompilerType().GetTypeSystem<TypeSystemClang>();
+  if (!ast_ctx)
     return lldb::ChildCacheState::eRefetch;
 
-  // Try to infer the promise_type if it was type-erased
+  // Determine the coroutine frame type and the promise type. Fall back
+  // to `void`, since even the pointer itself might be useful, even if the
+  // type inference failed.
+  Function *destroy_func = ExtractDestroyFunction(target_sp, frame_ptr_addr);
+  CompilerType void_type = ast_ctx->GetBasicType(lldb::eBasicTypeVoid);
+  CompilerType promise_type;
+  if (CompilerType template_arg =
+          valobj_sp->GetCompilerType().GetTypeTemplateArgument(0))
+    promise_type = std::move(template_arg);
   if (promise_type.IsVoidType()) {
-    if (Function *destroy_func =
-            ExtractDestroyFunction(target_sp, frame_ptr_addr)) {
-      if (CompilerType inferred_type = InferPromiseType(*destroy_func)) {
+    // Try to infer the promise_type if it was type-erased
+    if (destroy_func) {
+      if (CompilerType inferred_type =
+              InferArtificialCoroType(destroy_func, ConstString("__promise"))) {
         promise_type = inferred_type;
       }
     }
   }
+  CompilerType coro_frame_type =
+      InferArtificialCoroType(destroy_func, ConstString("__coro_frame"));
+  if (!coro_frame_type)
+    coro_frame_type = void_type;
 
-  // If we don't know the promise type, we don't display the `promise` member.
-  // `CreateValueObjectFromAddress` below would fail for `void` types.
-  if (promise_type.IsVoidType()) {
-    return lldb::ChildCacheState::eRefetch;
-  }
-
-  // Add the `promise` member. We intentionally add `promise` as a pointer type
-  // instead of a value type, and don't automatically dereference this pointer.
-  // We do so to avoid potential very deep recursion in case there is a cycle
-  // formed between `std::coroutine_handle`s and their promises.
-  lldb::ValueObjectSP promise = CreateValueObjectFromAddress(
-      "promise", frame_ptr_addr + 2 * ptr_size, exe_ctx, promise_type);
-  Status error;
-  lldb::ValueObjectSP promisePtr = promise->AddressOf(error);
-  if (error.Success())
-    m_promise_ptr_sp = promisePtr->Clone(ConstString("promise"));
+  // Create the `resume` and `destroy` children.
+  std::array<CompilerType, 1> args{coro_frame_type};
+  CompilerType coro_func_type = ast_ctx->CreateFunctionType(
+      /*result_type=*/void_type, args,
+      /*is_variadic=*/false, /*qualifiers=*/0);
+  CompilerType coro_func_ptr_type = coro_func_type.GetPointerType();
+  ValueObjectSP resume_ptr_sp = CreateValueObjectFromAddress(
+      "resume", frame_ptr_addr + 0 * ptr_size, exe_ctx, coro_func_ptr_type);
+  assert(resume_ptr_sp);
+  m_children.push_back(std::move(resume_ptr_sp));
+  ValueObjectSP destroy_ptr_sp = CreateValueObjectFromAddress(
+      "destroy", frame_ptr_addr + 1 * ptr_size, exe_ctx, coro_func_ptr_type);
+  assert(destroy_ptr_sp);
+  m_children.push_back(std::move(destroy_ptr_sp));
+
+  // Add promise and coro_frame
+  // Add the `promise` and `coro_frame` member. We intentionally add them as
+  // pointer types instead of a value type, and don't automatically dereference
+  // those pointers. We do so to avoid potential very deep recursion in case
+  // there is a cycle formed between `std::coroutine_handle`s and their
+  // promises.
+  ValueObjectSP promise_ptr_sp = CreateValueObjectFromAddress(
+      "promise", frame_ptr_addr + 2 * ptr_size, exe_ctx,
+      promise_type.GetPointerType(), /*do_deref=*/false);
+  m_children.push_back(std::move(promise_ptr_sp));
+  ValueObjectSP coroframe_ptr_sp = CreateValueObjectFromAddress(
+      "coro_frame", frame_ptr_addr, exe_ctx, coro_frame_type.GetPointerType(),
+      /*do_deref=*/false);
+  m_children.push_back(std::move(coroframe_ptr_sp));
 
   return lldb::ChildCacheState::eRefetch;
 }
@@ -201,16 +196,10 @@ lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() {
 llvm::Expected<size_t>
 StdlibCoroutineHandleSyntheticFrontEnd::GetIndexOfChildWithName(
     ConstString name) {
-  if (!m_resume_ptr_sp || !m_destroy_ptr_sp)
-    return llvm::createStringError("Type has no child named '%s'",
-                                   name.AsCString());
-
-  if (name == ConstString("resume"))
-    return 0;
-  if (name == ConstString("destroy"))
-    return 1;
-  if (name == ConstString("promise_ptr") && m_promise_ptr_sp)
-    return 2;
+  for (const auto &[idx, child_sp] : llvm::enumerate(m_children)) {
+    if (child_sp->GetName() == name)
+      return idx;
+  }
 
   return llvm::createStringError("Type has no child named '%s'",
                                  name.AsCString());
diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h
index fd9445d46e6a0..520d8e0b3c79d 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h
@@ -43,9 +43,7 @@ class StdlibCoroutineHandleSyntheticFrontEnd
   llvm::Expected<size_t> GetIndexOfChildWithName(ConstString name) override;
 
 private:
-  lldb::ValueObjectSP m_resume_ptr_sp;
-  lldb::ValueObjectSP m_destroy_ptr_sp;
-  lldb::ValueObjectSP m_promise_ptr_sp;
+  std::vector<lldb::ValueObjectSP> m_children;
 };
 
 SyntheticChildrenFrontEnd *
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py
index f471ea728f953..54bb661057cd6 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py
@@ -46,11 +46,17 @@ def do_test(self, stdlib_type):
                         ValueCheck(name="current_value", value="-1"),
                     ],
                 ),
+                # We don not check any members inside the `coro_frame`,
+                # as its contents are highly compiler-specific.
+                ValueCheck(name="coro_frame"),
             ],
         )
+
+        # For a type-erased `coroutine_handle<>`, we can still devirtualize
+        # the promise call and display the correctly typed promise. This
+        # currently only works in clang, because gcc is not adding the
+        # artificial `__promise` variable to the destroy function.
         if is_clang:
-            # For a type-erased `coroutine_handle<>`, we can still devirtualize
-            # the promise call and display the correctly typed promise.
             self.expect_expr(
                 "type_erased_hdl",
                 result_summary=re.compile("^coro frame = 0x[0-9a-f]*$"),
@@ -63,23 +69,26 @@ def do_test(self, stdlib_type):
                             ValueCheck(name="current_value", value="-1"),
                         ],
                     ),
+                    ValueCheck(name="coro_frame"),
                 ],
             )
-            # For an incorrectly typed `coroutine_handle`, we use the user-supplied
-            # incorrect type instead of inferring the correct type. Strictly speaking,
-            # incorrectly typed coroutine handles are undefined behavior. However,
-            # it provides probably a better debugging experience if we display the
-            # promise as seen by the program instead of fixing this bug based on
-            # the available debug info.
-            self.expect_expr(
-                "incorrectly_typed_hdl",
-                result_summary=re.compile("^coro frame = 0x[0-9a-f]*$"),
-                result_children=[
-                    ValueCheck(name="resume", summary=test_generator_func_ptr_re),
-                    ValueCheck(name="destroy", summary=test_generator_func_ptr_re),
-                    ValueCheck(name="promise", dereference=ValueCheck(value="-1")),
-                ],
-            )
+
+        # For an incorrectly typed `coroutine_handle`, we use the user-supplied
+        # incorrect type instead of inferring the correct type. Strictly speaking,
+        # incorrectly typed coroutine handles are undefined behavior. However,
+        # it provides probably a better debugging experience if we display the
+        # promise as seen by the program instead of fixing this bug based on
+        # the available debug info.
+        self.expect_expr(
+            "incorrectly_typed_hdl",
+            result_summary=re.compile("^coro frame = 0x[0-9a-f]*$"),
+            result_children=[
+                ValueCheck(name="resume", summary=test_generator_func_ptr_re),
+                ValueCheck(name="destroy", summary=test_generator_func_ptr_re),
+                ValueCheck(name="promise", dereference=ValueCheck(value="-1")),
+                ValueCheck(name="coro_frame"),
+            ],
+        )
 
         process = self.process()
 
@@ -110,6 +119,7 @@ def do_test(self, stdlib_type):
                         ValueCheck(name="current_value", value="42"),
                     ],
                 ),
+                ValueCheck(name="coro_frame"),
             ],
         )
 
@@ -133,6 +143,7 @@ def do_test(self, stdlib_type):
                         ValueCheck(name="current_value", value="42"),
                     ],
                 ),
+                ValueCheck(name="coro_frame"),
             ],
         )
         if is_clang:
@@ -150,6 +161,7 @@ def do_test(self, stdlib_type):
                             ValueCheck(name="current_value", value="42"),
                         ],
                     ),
+                    ValueCheck(name="coro_frame"),
                 ],
             )
 

From 3ef7d035e21d8f75eb85b521d7ff0203e60cb6f2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 11 Jun 2025 21:14:59 +0900
Subject: [PATCH 065/851] MSP430: Stop using setCmpLibcallCC (#142708)

This appears to only be useful for the eq/ne cases, and only for
ARM libcalls. This is setting it to the default values, and there's
no change in the new fcmp test output.
---
 llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 183 +++++++++---------
 1 file changed, 89 insertions(+), 94 deletions(-)

diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 5589cea6e675d..8c55f77d062b7 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -151,104 +151,99 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
   // EABI Libcalls - EABI Section 6.2
   const struct {
     const RTLIB::Libcall Op;
-    const char * const Name;
-    const ISD::CondCode Cond;
+    const char *const Name;
   } LibraryCalls[] = {
-    // Floating point conversions - EABI Table 6
-    { RTLIB::FPROUND_F64_F32,   "__mspabi_cvtdf",   ISD::SETCC_INVALID },
-    { RTLIB::FPEXT_F32_F64,     "__mspabi_cvtfd",   ISD::SETCC_INVALID },
-    // The following is NOT implemented in libgcc
-    //{ RTLIB::FPTOSINT_F64_I16,  "__mspabi_fixdi", ISD::SETCC_INVALID },
-    { RTLIB::FPTOSINT_F64_I32,  "__mspabi_fixdli",  ISD::SETCC_INVALID },
-    { RTLIB::FPTOSINT_F64_I64,  "__mspabi_fixdlli", ISD::SETCC_INVALID },
-    // The following is NOT implemented in libgcc
-    //{ RTLIB::FPTOUINT_F64_I16,  "__mspabi_fixdu", ISD::SETCC_INVALID },
-    { RTLIB::FPTOUINT_F64_I32,  "__mspabi_fixdul",  ISD::SETCC_INVALID },
-    { RTLIB::FPTOUINT_F64_I64,  "__mspabi_fixdull", ISD::SETCC_INVALID },
-    // The following is NOT implemented in libgcc
-    //{ RTLIB::FPTOSINT_F32_I16,  "__mspabi_fixfi", ISD::SETCC_INVALID },
-    { RTLIB::FPTOSINT_F32_I32,  "__mspabi_fixfli",  ISD::SETCC_INVALID },
-    { RTLIB::FPTOSINT_F32_I64,  "__mspabi_fixflli", ISD::SETCC_INVALID },
-    // The following is NOT implemented in libgcc
-    //{ RTLIB::FPTOUINT_F32_I16,  "__mspabi_fixfu", ISD::SETCC_INVALID },
-    { RTLIB::FPTOUINT_F32_I32,  "__mspabi_fixful",  ISD::SETCC_INVALID },
-    { RTLIB::FPTOUINT_F32_I64,  "__mspabi_fixfull", ISD::SETCC_INVALID },
-    // TODO The following IS implemented in libgcc
-    //{ RTLIB::SINTTOFP_I16_F64,  "__mspabi_fltid", ISD::SETCC_INVALID },
-    { RTLIB::SINTTOFP_I32_F64,  "__mspabi_fltlid",  ISD::SETCC_INVALID },
-    // TODO The following IS implemented in libgcc but is not in the EABI
-    { RTLIB::SINTTOFP_I64_F64,  "__mspabi_fltllid", ISD::SETCC_INVALID },
-    // TODO The following IS implemented in libgcc
-    //{ RTLIB::UINTTOFP_I16_F64,  "__mspabi_fltud", ISD::SETCC_INVALID },
-    { RTLIB::UINTTOFP_I32_F64,  "__mspabi_fltuld",  ISD::SETCC_INVALID },
-    // The following IS implemented in libgcc but is not in the EABI
-    { RTLIB::UINTTOFP_I64_F64,  "__mspabi_fltulld", ISD::SETCC_INVALID },
-    // TODO The following IS implemented in libgcc
-    //{ RTLIB::SINTTOFP_I16_F32,  "__mspabi_fltif", ISD::SETCC_INVALID },
-    { RTLIB::SINTTOFP_I32_F32,  "__mspabi_fltlif",  ISD::SETCC_INVALID },
-    // TODO The following IS implemented in libgcc but is not in the EABI
-    { RTLIB::SINTTOFP_I64_F32,  "__mspabi_fltllif", ISD::SETCC_INVALID },
-    // TODO The following IS implemented in libgcc
-    //{ RTLIB::UINTTOFP_I16_F32,  "__mspabi_fltuf", ISD::SETCC_INVALID },
-    { RTLIB::UINTTOFP_I32_F32,  "__mspabi_fltulf",  ISD::SETCC_INVALID },
-    // The following IS implemented in libgcc but is not in the EABI
-    { RTLIB::UINTTOFP_I64_F32,  "__mspabi_fltullf", ISD::SETCC_INVALID },
-
-    // Floating point comparisons - EABI Table 7
-    { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ },
-    { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE },
-    { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE },
-    { RTLIB::OLT_F64, "__mspabi_cmpd", ISD::SETLT },
-    { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE },
-    { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT },
-    { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ },
-    { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE },
-    { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE },
-    { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT },
-    { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE },
-    { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT },
-
-    // Floating point arithmetic - EABI Table 8
-    { RTLIB::ADD_F64,  "__mspabi_addd", ISD::SETCC_INVALID },
-    { RTLIB::ADD_F32,  "__mspabi_addf", ISD::SETCC_INVALID },
-    { RTLIB::DIV_F64,  "__mspabi_divd", ISD::SETCC_INVALID },
-    { RTLIB::DIV_F32,  "__mspabi_divf", ISD::SETCC_INVALID },
-    { RTLIB::MUL_F64,  "__mspabi_mpyd", ISD::SETCC_INVALID },
-    { RTLIB::MUL_F32,  "__mspabi_mpyf", ISD::SETCC_INVALID },
-    { RTLIB::SUB_F64,  "__mspabi_subd", ISD::SETCC_INVALID },
-    { RTLIB::SUB_F32,  "__mspabi_subf", ISD::SETCC_INVALID },
-    // The following are NOT implemented in libgcc
-    // { RTLIB::NEG_F64,  "__mspabi_negd", ISD::SETCC_INVALID },
-    // { RTLIB::NEG_F32,  "__mspabi_negf", ISD::SETCC_INVALID },
-
-    // Universal Integer Operations - EABI Table 9
-    { RTLIB::SDIV_I16,   "__mspabi_divi", ISD::SETCC_INVALID },
-    { RTLIB::SDIV_I32,   "__mspabi_divli", ISD::SETCC_INVALID },
-    { RTLIB::SDIV_I64,   "__mspabi_divlli", ISD::SETCC_INVALID },
-    { RTLIB::UDIV_I16,   "__mspabi_divu", ISD::SETCC_INVALID },
-    { RTLIB::UDIV_I32,   "__mspabi_divul", ISD::SETCC_INVALID },
-    { RTLIB::UDIV_I64,   "__mspabi_divull", ISD::SETCC_INVALID },
-    { RTLIB::SREM_I16,   "__mspabi_remi", ISD::SETCC_INVALID },
-    { RTLIB::SREM_I32,   "__mspabi_remli", ISD::SETCC_INVALID },
-    { RTLIB::SREM_I64,   "__mspabi_remlli", ISD::SETCC_INVALID },
-    { RTLIB::UREM_I16,   "__mspabi_remu", ISD::SETCC_INVALID },
-    { RTLIB::UREM_I32,   "__mspabi_remul", ISD::SETCC_INVALID },
-    { RTLIB::UREM_I64,   "__mspabi_remull", ISD::SETCC_INVALID },
-
-    // Bitwise Operations - EABI Table 10
-    // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc
-    { RTLIB::SRL_I32,    "__mspabi_srll", ISD::SETCC_INVALID },
-    { RTLIB::SRA_I32,    "__mspabi_sral", ISD::SETCC_INVALID },
-    { RTLIB::SHL_I32,    "__mspabi_slll", ISD::SETCC_INVALID },
-    // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc
-
+      // Floating point conversions - EABI Table 6
+      {RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf"},
+      {RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd"},
+      // The following is NOT implemented in libgcc
+      //{ RTLIB::FPTOSINT_F64_I16,  "__mspabi_fixdi" },
+      {RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli"},
+      {RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli"},
+      // The following is NOT implemented in libgcc
+      //{ RTLIB::FPTOUINT_F64_I16,  "__mspabi_fixdu" },
+      {RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul"},
+      {RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull"},
+      // The following is NOT implemented in libgcc
+      //{ RTLIB::FPTOSINT_F32_I16,  "__mspabi_fixfi" },
+      {RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli"},
+      {RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli"},
+      // The following is NOT implemented in libgcc
+      //{ RTLIB::FPTOUINT_F32_I16,  "__mspabi_fixfu" },
+      {RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful"},
+      {RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull"},
+      // TODO The following IS implemented in libgcc
+      //{ RTLIB::SINTTOFP_I16_F64,  "__mspabi_fltid" },
+      {RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid"},
+      // TODO The following IS implemented in libgcc but is not in the EABI
+      {RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid"},
+      // TODO The following IS implemented in libgcc
+      //{ RTLIB::UINTTOFP_I16_F64,  "__mspabi_fltud" },
+      {RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld"},
+      // The following IS implemented in libgcc but is not in the EABI
+      {RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld"},
+      // TODO The following IS implemented in libgcc
+      //{ RTLIB::SINTTOFP_I16_F32,  "__mspabi_fltif" },
+      {RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif"},
+      // TODO The following IS implemented in libgcc but is not in the EABI
+      {RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif"},
+      // TODO The following IS implemented in libgcc
+      //{ RTLIB::UINTTOFP_I16_F32,  "__mspabi_fltuf" },
+      {RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf"},
+      // The following IS implemented in libgcc but is not in the EABI
+      {RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf"},
+
+      // Floating point comparisons - EABI Table 7
+      {RTLIB::OEQ_F64, "__mspabi_cmpd"},
+      {RTLIB::UNE_F64, "__mspabi_cmpd"},
+      {RTLIB::OGE_F64, "__mspabi_cmpd"},
+      {RTLIB::OLT_F64, "__mspabi_cmpd"},
+      {RTLIB::OLE_F64, "__mspabi_cmpd"},
+      {RTLIB::OGT_F64, "__mspabi_cmpd"},
+      {RTLIB::OEQ_F32, "__mspabi_cmpf"},
+      {RTLIB::UNE_F32, "__mspabi_cmpf"},
+      {RTLIB::OGE_F32, "__mspabi_cmpf"},
+      {RTLIB::OLT_F32, "__mspabi_cmpf"},
+      {RTLIB::OLE_F32, "__mspabi_cmpf"},
+      {RTLIB::OGT_F32, "__mspabi_cmpf"},
+
+      // Floating point arithmetic - EABI Table 8
+      {RTLIB::ADD_F64, "__mspabi_addd"},
+      {RTLIB::ADD_F32, "__mspabi_addf"},
+      {RTLIB::DIV_F64, "__mspabi_divd"},
+      {RTLIB::DIV_F32, "__mspabi_divf"},
+      {RTLIB::MUL_F64, "__mspabi_mpyd"},
+      {RTLIB::MUL_F32, "__mspabi_mpyf"},
+      {RTLIB::SUB_F64, "__mspabi_subd"},
+      {RTLIB::SUB_F32, "__mspabi_subf"},
+      // The following are NOT implemented in libgcc
+      // { RTLIB::NEG_F64,  "__mspabi_negd" },
+      // { RTLIB::NEG_F32,  "__mspabi_negf" },
+
+      // Universal Integer Operations - EABI Table 9
+      {RTLIB::SDIV_I16, "__mspabi_divi"},
+      {RTLIB::SDIV_I32, "__mspabi_divli"},
+      {RTLIB::SDIV_I64, "__mspabi_divlli"},
+      {RTLIB::UDIV_I16, "__mspabi_divu"},
+      {RTLIB::UDIV_I32, "__mspabi_divul"},
+      {RTLIB::UDIV_I64, "__mspabi_divull"},
+      {RTLIB::SREM_I16, "__mspabi_remi"},
+      {RTLIB::SREM_I32, "__mspabi_remli"},
+      {RTLIB::SREM_I64, "__mspabi_remlli"},
+      {RTLIB::UREM_I16, "__mspabi_remu"},
+      {RTLIB::UREM_I32, "__mspabi_remul"},
+      {RTLIB::UREM_I64, "__mspabi_remull"},
+
+      // Bitwise Operations - EABI Table 10
+      // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc
+      {RTLIB::SRL_I32, "__mspabi_srll"},
+      {RTLIB::SRA_I32, "__mspabi_sral"},
+      {RTLIB::SHL_I32, "__mspabi_slll"},
+      // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc
   };
 
-  for (const auto &LC : LibraryCalls) {
+  for (const auto &LC : LibraryCalls)
     setLibcallName(LC.Op, LC.Name);
-    if (LC.Cond != ISD::SETCC_INVALID)
-      setCmpLibcallCC(LC.Op, LC.Cond);
-  }
 
   if (STI.hasHWMult16()) {
     const struct {

From ac7fa4099e83d6490d2f9ea185b236db2f26e652 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 11 Jun 2025 21:17:58 +0900
Subject: [PATCH 066/851] MSP430: Partially move runtime libcall config out of
 TargetLowering (#142709)

RuntimeLibcalls needs to be correct outside of codegen contexts.
---
 llvm/lib/IR/RuntimeLibcalls.cpp               | 120 ++++++++++++++++++
 llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 114 -----------------
 2 files changed, 120 insertions(+), 114 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 541379e7ade48..31013310a746d 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -81,6 +81,123 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
   }
 }
 
+static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) {
+  // EABI Libcalls - EABI Section 6.2
+  const struct {
+    const RTLIB::Libcall Op;
+    const char *const Name;
+  } LibraryCalls[] = {
+      // Floating point conversions - EABI Table 6
+      {RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf"},
+      {RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd"},
+      // The following is NOT implemented in libgcc
+      //{ RTLIB::FPTOSINT_F64_I16,  "__mspabi_fixdi" },
+      {RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli"},
+      {RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli"},
+      // The following is NOT implemented in libgcc
+      //{ RTLIB::FPTOUINT_F64_I16,  "__mspabi_fixdu" },
+      {RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul"},
+      {RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull"},
+      // The following is NOT implemented in libgcc
+      //{ RTLIB::FPTOSINT_F32_I16,  "__mspabi_fixfi" },
+      {RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli"},
+      {RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli"},
+      // The following is NOT implemented in libgcc
+      //{ RTLIB::FPTOUINT_F32_I16,  "__mspabi_fixfu" },
+      {RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful"},
+      {RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull"},
+      // TODO The following IS implemented in libgcc
+      //{ RTLIB::SINTTOFP_I16_F64,  "__mspabi_fltid" },
+      {RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid"},
+      // TODO The following IS implemented in libgcc but is not in the EABI
+      {RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid"},
+      // TODO The following IS implemented in libgcc
+      //{ RTLIB::UINTTOFP_I16_F64,  "__mspabi_fltud" },
+      {RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld"},
+      // The following IS implemented in libgcc but is not in the EABI
+      {RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld"},
+      // TODO The following IS implemented in libgcc
+      //{ RTLIB::SINTTOFP_I16_F32,  "__mspabi_fltif" },
+      {RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif"},
+      // TODO The following IS implemented in libgcc but is not in the EABI
+      {RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif"},
+      // TODO The following IS implemented in libgcc
+      //{ RTLIB::UINTTOFP_I16_F32,  "__mspabi_fltuf" },
+      {RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf"},
+      // The following IS implemented in libgcc but is not in the EABI
+      {RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf"},
+
+      // Floating point comparisons - EABI Table 7
+      {RTLIB::OEQ_F64, "__mspabi_cmpd"},
+      {RTLIB::UNE_F64, "__mspabi_cmpd"},
+      {RTLIB::OGE_F64, "__mspabi_cmpd"},
+      {RTLIB::OLT_F64, "__mspabi_cmpd"},
+      {RTLIB::OLE_F64, "__mspabi_cmpd"},
+      {RTLIB::OGT_F64, "__mspabi_cmpd"},
+      {RTLIB::OEQ_F32, "__mspabi_cmpf"},
+      {RTLIB::UNE_F32, "__mspabi_cmpf"},
+      {RTLIB::OGE_F32, "__mspabi_cmpf"},
+      {RTLIB::OLT_F32, "__mspabi_cmpf"},
+      {RTLIB::OLE_F32, "__mspabi_cmpf"},
+      {RTLIB::OGT_F32, "__mspabi_cmpf"},
+
+      // Floating point arithmetic - EABI Table 8
+      {RTLIB::ADD_F64, "__mspabi_addd"},
+      {RTLIB::ADD_F32, "__mspabi_addf"},
+      {RTLIB::DIV_F64, "__mspabi_divd"},
+      {RTLIB::DIV_F32, "__mspabi_divf"},
+      {RTLIB::MUL_F64, "__mspabi_mpyd"},
+      {RTLIB::MUL_F32, "__mspabi_mpyf"},
+      {RTLIB::SUB_F64, "__mspabi_subd"},
+      {RTLIB::SUB_F32, "__mspabi_subf"},
+      // The following are NOT implemented in libgcc
+      // { RTLIB::NEG_F64,  "__mspabi_negd" },
+      // { RTLIB::NEG_F32,  "__mspabi_negf" },
+
+      // Universal Integer Operations - EABI Table 9
+      {RTLIB::SDIV_I16, "__mspabi_divi"},
+      {RTLIB::SDIV_I32, "__mspabi_divli"},
+      {RTLIB::SDIV_I64, "__mspabi_divlli"},
+      {RTLIB::UDIV_I16, "__mspabi_divu"},
+      {RTLIB::UDIV_I32, "__mspabi_divul"},
+      {RTLIB::UDIV_I64, "__mspabi_divull"},
+      {RTLIB::SREM_I16, "__mspabi_remi"},
+      {RTLIB::SREM_I32, "__mspabi_remli"},
+      {RTLIB::SREM_I64, "__mspabi_remlli"},
+      {RTLIB::UREM_I16, "__mspabi_remu"},
+      {RTLIB::UREM_I32, "__mspabi_remul"},
+      {RTLIB::UREM_I64, "__mspabi_remull"},
+
+      // Bitwise Operations - EABI Table 10
+      // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc
+      {RTLIB::SRL_I32, "__mspabi_srll"},
+      {RTLIB::SRA_I32, "__mspabi_sral"},
+      {RTLIB::SHL_I32, "__mspabi_slll"},
+      // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc
+  };
+
+  for (const auto &LC : LibraryCalls)
+    Info.setLibcallName(LC.Op, LC.Name);
+
+  // Several of the runtime library functions use a special calling conv
+  Info.setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN);
+  Info.setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
+
+  // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
+}
+
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
@@ -448,4 +565,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     else
       setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf");
   }
+
+  if (TT.getArch() == Triple::ArchType::msp430)
+    setMSP430Libcalls(*this, TT);
 }
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 8c55f77d062b7..c2946de838d77 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -148,103 +148,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::VACOPY,           MVT::Other, Expand);
   setOperationAction(ISD::JumpTable,        MVT::i16,   Custom);
 
-  // EABI Libcalls - EABI Section 6.2
-  const struct {
-    const RTLIB::Libcall Op;
-    const char *const Name;
-  } LibraryCalls[] = {
-      // Floating point conversions - EABI Table 6
-      {RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf"},
-      {RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd"},
-      // The following is NOT implemented in libgcc
-      //{ RTLIB::FPTOSINT_F64_I16,  "__mspabi_fixdi" },
-      {RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli"},
-      {RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli"},
-      // The following is NOT implemented in libgcc
-      //{ RTLIB::FPTOUINT_F64_I16,  "__mspabi_fixdu" },
-      {RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul"},
-      {RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull"},
-      // The following is NOT implemented in libgcc
-      //{ RTLIB::FPTOSINT_F32_I16,  "__mspabi_fixfi" },
-      {RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli"},
-      {RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli"},
-      // The following is NOT implemented in libgcc
-      //{ RTLIB::FPTOUINT_F32_I16,  "__mspabi_fixfu" },
-      {RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful"},
-      {RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull"},
-      // TODO The following IS implemented in libgcc
-      //{ RTLIB::SINTTOFP_I16_F64,  "__mspabi_fltid" },
-      {RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid"},
-      // TODO The following IS implemented in libgcc but is not in the EABI
-      {RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid"},
-      // TODO The following IS implemented in libgcc
-      //{ RTLIB::UINTTOFP_I16_F64,  "__mspabi_fltud" },
-      {RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld"},
-      // The following IS implemented in libgcc but is not in the EABI
-      {RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld"},
-      // TODO The following IS implemented in libgcc
-      //{ RTLIB::SINTTOFP_I16_F32,  "__mspabi_fltif" },
-      {RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif"},
-      // TODO The following IS implemented in libgcc but is not in the EABI
-      {RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif"},
-      // TODO The following IS implemented in libgcc
-      //{ RTLIB::UINTTOFP_I16_F32,  "__mspabi_fltuf" },
-      {RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf"},
-      // The following IS implemented in libgcc but is not in the EABI
-      {RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf"},
-
-      // Floating point comparisons - EABI Table 7
-      {RTLIB::OEQ_F64, "__mspabi_cmpd"},
-      {RTLIB::UNE_F64, "__mspabi_cmpd"},
-      {RTLIB::OGE_F64, "__mspabi_cmpd"},
-      {RTLIB::OLT_F64, "__mspabi_cmpd"},
-      {RTLIB::OLE_F64, "__mspabi_cmpd"},
-      {RTLIB::OGT_F64, "__mspabi_cmpd"},
-      {RTLIB::OEQ_F32, "__mspabi_cmpf"},
-      {RTLIB::UNE_F32, "__mspabi_cmpf"},
-      {RTLIB::OGE_F32, "__mspabi_cmpf"},
-      {RTLIB::OLT_F32, "__mspabi_cmpf"},
-      {RTLIB::OLE_F32, "__mspabi_cmpf"},
-      {RTLIB::OGT_F32, "__mspabi_cmpf"},
-
-      // Floating point arithmetic - EABI Table 8
-      {RTLIB::ADD_F64, "__mspabi_addd"},
-      {RTLIB::ADD_F32, "__mspabi_addf"},
-      {RTLIB::DIV_F64, "__mspabi_divd"},
-      {RTLIB::DIV_F32, "__mspabi_divf"},
-      {RTLIB::MUL_F64, "__mspabi_mpyd"},
-      {RTLIB::MUL_F32, "__mspabi_mpyf"},
-      {RTLIB::SUB_F64, "__mspabi_subd"},
-      {RTLIB::SUB_F32, "__mspabi_subf"},
-      // The following are NOT implemented in libgcc
-      // { RTLIB::NEG_F64,  "__mspabi_negd" },
-      // { RTLIB::NEG_F32,  "__mspabi_negf" },
-
-      // Universal Integer Operations - EABI Table 9
-      {RTLIB::SDIV_I16, "__mspabi_divi"},
-      {RTLIB::SDIV_I32, "__mspabi_divli"},
-      {RTLIB::SDIV_I64, "__mspabi_divlli"},
-      {RTLIB::UDIV_I16, "__mspabi_divu"},
-      {RTLIB::UDIV_I32, "__mspabi_divul"},
-      {RTLIB::UDIV_I64, "__mspabi_divull"},
-      {RTLIB::SREM_I16, "__mspabi_remi"},
-      {RTLIB::SREM_I32, "__mspabi_remli"},
-      {RTLIB::SREM_I64, "__mspabi_remlli"},
-      {RTLIB::UREM_I16, "__mspabi_remu"},
-      {RTLIB::UREM_I32, "__mspabi_remul"},
-      {RTLIB::UREM_I64, "__mspabi_remull"},
-
-      // Bitwise Operations - EABI Table 10
-      // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc
-      {RTLIB::SRL_I32, "__mspabi_srll"},
-      {RTLIB::SRA_I32, "__mspabi_sral"},
-      {RTLIB::SHL_I32, "__mspabi_slll"},
-      // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc
-  };
-
-  for (const auto &LC : LibraryCalls)
-    setLibcallName(LC.Op, LC.Name);
-
   if (STI.hasHWMult16()) {
     const struct {
       const RTLIB::Libcall Op;
@@ -308,23 +211,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN);
   }
 
-  // Several of the runtime library functions use a special calling conv
-  setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN);
-  setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
-  // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
-
   setMinFunctionAlignment(Align(2));
   setPrefFunctionAlignment(Align(2));
   setMaxAtomicSizeInBitsSupported(0);

From 33fee564998598a52e802292db25c0ee52f7e1a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Wed, 11 Jun 2025 14:22:45 +0200
Subject: [PATCH 067/851] [HLSL][SPIR-V] Change SPV AS map for groupshared
 (#143519)

The previous mapping we setting the hlsl_groupshared AS to 0, which
translated to either Generic or Function.
Changing this to 3, which translated to Workgroup.

Related to #142804
---
 clang/lib/Basic/Targets/SPIR.h           | 4 ++--
 clang/test/CodeGenHLSL/group_shared.hlsl | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h
index 0eaf82eee756b..b416a01f0f374 100644
--- a/clang/lib/Basic/Targets/SPIR.h
+++ b/clang/lib/Basic/Targets/SPIR.h
@@ -46,7 +46,7 @@ static const unsigned SPIRDefIsPrivMap[] = {
     0,  // ptr32_sptr
     0,  // ptr32_uptr
     0,  // ptr64
-    0,  // hlsl_groupshared
+    3,  // hlsl_groupshared
     12, // hlsl_constant
     10, // hlsl_private
     11, // hlsl_device
@@ -82,7 +82,7 @@ static const unsigned SPIRDefIsGenMap[] = {
     0,  // ptr32_sptr
     0,  // ptr32_uptr
     0,  // ptr64
-    0,  // hlsl_groupshared
+    3,  // hlsl_groupshared
     0,  // hlsl_constant
     10, // hlsl_private
     11, // hlsl_device
diff --git a/clang/test/CodeGenHLSL/group_shared.hlsl b/clang/test/CodeGenHLSL/group_shared.hlsl
index 4b2e2beba4f12..a562e75b34881 100644
--- a/clang/test/CodeGenHLSL/group_shared.hlsl
+++ b/clang/test/CodeGenHLSL/group_shared.hlsl
@@ -3,6 +3,10 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
+// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan1.3-compute %s \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
 // Make sure groupshared translated into address space 3.
 // CHECK:@a = addrspace(3) global [10 x float]
 

From 50f534e21cfb47aaf44e1613f71b56cca55ba395 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Wed, 11 Jun 2025 14:22:54 +0200
Subject: [PATCH 068/851] [HLSL][SPIR-V] Handle SV_Position builtin in PS
 (#141759)

This commit is using the same mechanism as vk::ext_builtin_input to
implement the SV_Position semantic input.
The HLSL signature is not yet ready for DXIL, hence this commit only
implements the SPIR-V side.

This is incomplete as it doesn't allow the semantic on hull/domain and
other shaders, but it's a first step to validate the overall
input/output
semantic logic.

Fixes https://github.com/llvm/llvm-project/issues/136969
---
 clang/include/clang/Basic/Attr.td             |  7 ++++
 clang/include/clang/Basic/AttrDocs.td         | 14 +++++++
 clang/include/clang/Sema/SemaHLSL.h           |  2 +
 clang/lib/CodeGen/CGHLSLRuntime.cpp           | 42 ++++++++++++++-----
 clang/lib/Parse/ParseHLSL.cpp                 |  1 +
 clang/lib/Sema/SemaDeclAttr.cpp               |  3 ++
 clang/lib/Sema/SemaHLSL.cpp                   | 27 ++++++++++++
 .../CodeGenHLSL/semantics/SV_Position.ps.hlsl | 10 +++++
 .../test/SemaHLSL/Semantics/position.ps.hlsl  |  7 ++++
 .../SemaHLSL/Semantics/position.ps.size.hlsl  | 10 +++++
 .../test/SemaHLSL/Semantics/position.vs.hlsl  |  6 +++
 11 files changed, 119 insertions(+), 10 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl
 create mode 100644 clang/test/SemaHLSL/Semantics/position.ps.hlsl
 create mode 100644 clang/test/SemaHLSL/Semantics/position.ps.size.hlsl
 create mode 100644 clang/test/SemaHLSL/Semantics/position.vs.hlsl

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index b8e5806d3c5e9..9e84462eaa660 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4901,6 +4901,13 @@ def HLSLResourceBinding: InheritableAttr {
   }];
 }
 
+def HLSLSV_Position : HLSLAnnotationAttr {
+  let Spellings = [HLSLAnnotation<"sv_position">];
+  let Subjects = SubjectList<[ParmVar, Field]>;
+  let LangOpts = [HLSL];
+  let Documentation = [HLSLSV_PositionDocs];
+}
+
 def HLSLPackOffset: HLSLAnnotationAttr {
   let Spellings = [HLSLAnnotation<"packoffset">];
   let LangOpts = [HLSL];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index ea3c43f38d9fe..047f51ffa59ed 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -8529,6 +8529,20 @@ The full documentation is available here: https://docs.microsoft.com/en-us/windo
   }];
 }
 
+def HLSLSV_PositionDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``SV_Position`` semantic, when applied to an input parameter in a pixel
+shader, contains the location of the pixel center (x, y) in screen space.
+This semantic can be applied to the parameter, or a field in a struct used
+as an input parameter.
+This attribute is supported as an input in pixel, hull, domain and mesh shaders.
+This attribute is supported as an output in vertex, geometry and domain shaders.
+
+The full documentation is available here: https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-semantics
+  }];
+}
+
 def HLSLGroupSharedAddressSpaceDocs : Documentation {
   let Category = DocCatVariable;
   let Content = [{
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 66d09f49680be..ba5f06f93dc30 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -125,6 +125,7 @@ class SemaHLSL : public SemaBase {
   void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_GroupThreadIDAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_GroupIDAttr(Decl *D, const ParsedAttr &AL);
+  void handleSV_PositionAttr(Decl *D, const ParsedAttr &AL);
   void handlePackOffsetAttr(Decl *D, const ParsedAttr &AL);
   void handleShaderAttr(Decl *D, const ParsedAttr &AL);
   void handleResourceBindingAttr(Decl *D, const ParsedAttr &AL);
@@ -146,6 +147,7 @@ class SemaHLSL : public SemaBase {
 
   // Diagnose whether the input ID is uint/unit2/uint3 type.
   bool diagnoseInputIDType(QualType T, const ParsedAttr &AL);
+  bool diagnosePositionType(QualType T, const ParsedAttr &AL);
 
   bool CanPerformScalarCast(QualType SrcTy, QualType DestTy);
   bool ContainsBitField(QualType BaseTy);
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 6d267e6164845..720dac8383c05 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -384,6 +384,30 @@ static Value *buildVectorInput(IRBuilder<> &B, Function *F, llvm::Type *Ty) {
   return B.CreateCall(F, {B.getInt32(0)});
 }
 
+static void addSPIRVBuiltinDecoration(llvm::GlobalVariable *GV,
+                                      unsigned BuiltIn) {
+  LLVMContext &Ctx = GV->getContext();
+  IRBuilder<> B(GV->getContext());
+  MDNode *Operands = MDNode::get(
+      Ctx,
+      {ConstantAsMetadata::get(B.getInt32(/* Spirv::Decoration::BuiltIn */ 11)),
+       ConstantAsMetadata::get(B.getInt32(BuiltIn))});
+  MDNode *Decoration = MDNode::get(Ctx, {Operands});
+  GV->addMetadata("spirv.Decorations", *Decoration);
+}
+
+static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M,
+                                           llvm::Type *Ty, const Twine &Name,
+                                           unsigned BuiltInID) {
+  auto *GV = new llvm::GlobalVariable(
+      M, Ty, /* isConstant= */ true, llvm::GlobalValue::ExternalLinkage,
+      /* Initializer= */ nullptr, Name, /* insertBefore= */ nullptr,
+      llvm::GlobalVariable::GeneralDynamicTLSModel,
+      /* AddressSpace */ 7, /* isExternallyInitialized= */ true);
+  addSPIRVBuiltinDecoration(GV, BuiltInID);
+  return B.CreateLoad(Ty, GV);
+}
+
 llvm::Value *CGHLSLRuntime::emitInputSemantic(IRBuilder<> &B,
                                               const ParmVarDecl &D,
                                               llvm::Type *Ty) {
@@ -407,6 +431,12 @@ llvm::Value *CGHLSLRuntime::emitInputSemantic(IRBuilder<> &B,
     llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(getGroupIdIntrinsic());
     return buildVectorInput(B, GroupIDIntrinsic, Ty);
   }
+  if (D.hasAttr<HLSLSV_PositionAttr>()) {
+    if (getArch() == llvm::Triple::spirv)
+      return createSPIRVBuiltinLoad(B, CGM.getModule(), Ty, "sv_position",
+                                    /* BuiltIn::Position */ 0);
+    llvm_unreachable("SV_Position semantic not implemented for this target.");
+  }
   assert(false && "Unhandled parameter attribute");
   return nullptr;
 }
@@ -626,16 +656,8 @@ void CGHLSLRuntime::initializeBufferFromBinding(const HLSLBufferDecl *BufDecl,
 
 void CGHLSLRuntime::handleGlobalVarDefinition(const VarDecl *VD,
                                               llvm::GlobalVariable *GV) {
-  if (auto Attr = VD->getAttr<HLSLVkExtBuiltinInputAttr>()) {
-    LLVMContext &Ctx = GV->getContext();
-    IRBuilder<> B(GV->getContext());
-    MDNode *Operands = MDNode::get(
-        Ctx, {ConstantAsMetadata::get(
-                  B.getInt32(/* Spirv::Decoration::BuiltIn */ 11)),
-              ConstantAsMetadata::get(B.getInt32(Attr->getBuiltIn()))});
-    MDNode *Decoration = MDNode::get(Ctx, {Operands});
-    GV->addMetadata("spirv.Decorations", *Decoration);
-  }
+  if (auto Attr = VD->getAttr<HLSLVkExtBuiltinInputAttr>())
+    addSPIRVBuiltinDecoration(GV, Attr->getBuiltIn());
 }
 
 llvm::Instruction *CGHLSLRuntime::getConvergenceToken(BasicBlock &BB) {
diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp
index 5569605c287b1..53d46465e3362 100644
--- a/clang/lib/Parse/ParseHLSL.cpp
+++ b/clang/lib/Parse/ParseHLSL.cpp
@@ -289,6 +289,7 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs,
   case ParsedAttr::AT_HLSLSV_GroupID:
   case ParsedAttr::AT_HLSLSV_GroupIndex:
   case ParsedAttr::AT_HLSLSV_DispatchThreadID:
+  case ParsedAttr::AT_HLSLSV_Position:
     break;
   default:
     llvm_unreachable("invalid HLSL Annotation");
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 6360827f415b8..1aeae41042a1c 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7588,6 +7588,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_HLSLWaveSize:
     S.HLSL().handleWaveSizeAttr(D, AL);
     break;
+  case ParsedAttr::AT_HLSLSV_Position:
+    S.HLSL().handleSV_PositionAttr(D, AL);
+    break;
   case ParsedAttr::AT_HLSLVkExtBuiltinInput:
     S.HLSL().handleVkExtBuiltinInputAttr(D, AL);
     break;
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 9065cc5a1d4a5..ba491b6134293 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -764,6 +764,13 @@ void SemaHLSL::CheckSemanticAnnotation(
       return;
     DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Compute});
     break;
+  case attr::HLSLSV_Position:
+    // TODO(#143523): allow use on other shader types & output once the overall
+    // semantic logic is implemented.
+    if (ST == llvm::Triple::Pixel)
+      return;
+    DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Pixel});
+    break;
   default:
     llvm_unreachable("Unknown HLSLAnnotationAttr");
   }
@@ -1147,6 +1154,26 @@ void SemaHLSL::handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL) {
                  HLSLSV_DispatchThreadIDAttr(getASTContext(), AL));
 }
 
+bool SemaHLSL::diagnosePositionType(QualType T, const ParsedAttr &AL) {
+  const auto *VT = T->getAs<VectorType>();
+
+  if (!T->hasFloatingRepresentation() || (VT && VT->getNumElements() > 4)) {
+    Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_type)
+        << AL << "float/float1/float2/float3/float4";
+    return false;
+  }
+
+  return true;
+}
+
+void SemaHLSL::handleSV_PositionAttr(Decl *D, const ParsedAttr &AL) {
+  auto *VD = cast<ValueDecl>(D);
+  if (!diagnosePositionType(VD->getType(), AL))
+    return;
+
+  D->addAttr(::new (getASTContext()) HLSLSV_PositionAttr(getASTContext(), AL));
+}
+
 void SemaHLSL::handleSV_GroupThreadIDAttr(Decl *D, const ParsedAttr &AL) {
   auto *VD = cast<ValueDecl>(D);
   if (!diagnoseInputIDType(VD->getType(), AL))
diff --git a/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl
new file mode 100644
index 0000000000000..58b91fc9264dd
--- /dev/null
+++ b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s
+
+// CHECK: @sv_position = external thread_local addrspace(7) externally_initialized constant <4 x float>, !spirv.Decorations !0
+
+// CHECK: define void @main() {{.*}} {
+float4 main(float4 p : SV_Position) {
+  // CHECK: %[[#P:]] = load <4 x float>, ptr addrspace(7) @sv_position, align 16
+  // CHECK: %[[#R:]] = call spir_func <4 x float> @_Z4mainDv4_f(<4 x float> %[[#P]])
+  return p;
+}
diff --git a/clang/test/SemaHLSL/Semantics/position.ps.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.hlsl
new file mode 100644
index 0000000000000..32bc5f55b2abd
--- /dev/null
+++ b/clang/test/SemaHLSL/Semantics/position.ps.hlsl
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -finclude-default-header -o - %s -ast-dump | FileCheck %s
+
+float4 main(float4 a : SV_Position) {
+// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:8 main 'float4 (float4)'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:20 a 'float4':'vector<float, 4>'
+// CHECK-NEXT: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}>
+}
diff --git a/clang/test/SemaHLSL/Semantics/position.ps.size.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.size.hlsl
new file mode 100644
index 0000000000000..124d401a9990c
--- /dev/null
+++ b/clang/test/SemaHLSL/Semantics/position.ps.size.hlsl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -finclude-default-header -o - %s -verify -verify-ignore-unexpected
+// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-library  -x hlsl -finclude-default-header -o - %s -verify -verify-ignore-unexpected
+
+// expected-error@+1 {{attribute 'SV_Position' only applies to a field or parameter of type 'float/float1/float2/float3/float4'}}
+void main(vector<float, 5> a : SV_Position) {
+}
+
+// expected-error@+1 {{attribute 'SV_Position' only applies to a field or parameter of type 'float/float1/float2/float3/float4'}}
+void main(int2 a : SV_Position) {
+}
diff --git a/clang/test/SemaHLSL/Semantics/position.vs.hlsl b/clang/test/SemaHLSL/Semantics/position.vs.hlsl
new file mode 100644
index 0000000000000..19f781fa3757c
--- /dev/null
+++ b/clang/test/SemaHLSL/Semantics/position.vs.hlsl
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-vertex -x hlsl -finclude-default-header -o - %s -verify
+
+// expected-error@+1 {{attribute 'SV_Position' is unsupported in 'vertex' shaders, requires pixel}}
+float4 main(float4 a : SV_Position) {
+  return a;
+}

From b49c7896c0a31ca618098b52a28eb87dff625b8f Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 14:27:48 +0200
Subject: [PATCH 069/851] [libc++] Fix constraints in `__countr_zero` and
 `__popcount`

Currently these two functions are constrained on `is_unsigned`, which is
more permissive than what is required by the standard for their public
counterparts. This fixes the constraints to match the public functions
by using `__libcpp_is_unsigned_integer` instead.
---
 libcxx/include/__bit/countr.h   | 4 ++--
 libcxx/include/__bit/popcount.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libcxx/include/__bit/countr.h b/libcxx/include/__bit/countr.h
index 7b311b83853c5..1589f57c47385 100644
--- a/libcxx/include/__bit/countr.h
+++ b/libcxx/include/__bit/countr.h
@@ -11,7 +11,7 @@
 
 #include <__concepts/arithmetic.h>
 #include <__config>
-#include <__type_traits/is_unsigned.h>
+#include <__type_traits/is_unsigned_integer.h>
 #include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -25,7 +25,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
 [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __countr_zero(_Tp __t) _NOEXCEPT {
-  static_assert(is_unsigned<_Tp>::value, "__countr_zero only works with unsigned types");
+  static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero only works with unsigned types");
   return __builtin_ctzg(__t, numeric_limits<_Tp>::digits);
 }
 
diff --git a/libcxx/include/__bit/popcount.h b/libcxx/include/__bit/popcount.h
index 9ae572d466ba7..4be0e418e7aa6 100644
--- a/libcxx/include/__bit/popcount.h
+++ b/libcxx/include/__bit/popcount.h
@@ -11,7 +11,7 @@
 
 #include <__concepts/arithmetic.h>
 #include <__config>
-#include <__type_traits/is_unsigned.h>
+#include <__type_traits/is_unsigned_integer.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
 [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __popcount(_Tp __t) _NOEXCEPT {
-  static_assert(is_unsigned<_Tp>::value, "__popcount only works with unsigned types");
+  static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__popcount only works with unsigned types");
   return __builtin_popcountg(__t);
 }
 

From 3c56437eafee95f368feb20d28b74c29504b833d Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 14:31:13 +0200
Subject: [PATCH 070/851] [libc++] Refactor signed/unsigned integer traits
 (#142750)

This patch does a few things:
- `__libcpp_is_signed_integer` and `__libcpp_is_unsigned_integer` are
refactored to be variable templates instead of class templates.
- the two traits are merged into a single header
`<__type_traits/integer_traits.h>`.
- `__libcpp_signed_integer`, `__libcpp_unsigned_integer` and
`__libcpp_integer` are moved into the same header.
- The above mentioned concepts are renamed to `__signed_integer`,
`__unsigned_integer` and `__signed_or_unsigned_integer` respectively.
---
 libcxx/include/CMakeLists.txt                 |  3 +-
 libcxx/include/__bit/bit_ceil.h               |  4 +-
 libcxx/include/__bit/bit_floor.h              |  4 +-
 libcxx/include/__bit/bit_log2.h               |  4 +-
 libcxx/include/__bit/bit_width.h              |  4 +-
 libcxx/include/__bit/countl.h                 |  9 +--
 libcxx/include/__bit/countr.h                 |  9 +--
 libcxx/include/__bit/has_single_bit.h         |  4 +-
 libcxx/include/__bit/popcount.h               |  7 +-
 libcxx/include/__bit/rotate.h                 | 11 ++-
 libcxx/include/__concepts/arithmetic.h        | 13 ----
 libcxx/include/__format/format_arg_store.h    |  6 +-
 libcxx/include/__mdspan/extents.h             |  6 +-
 .../include/__numeric/saturation_arithmetic.h | 30 ++++----
 libcxx/include/__type_traits/integer_traits.h | 73 +++++++++++++++++++
 .../include/__type_traits/is_signed_integer.h | 35 ---------
 .../__type_traits/is_unsigned_integer.h       | 35 ---------
 libcxx/include/__utility/cmp.h                | 16 ++--
 libcxx/include/module.modulemap.in            |  9 +--
 .../__libcpp_integer.compile.pass.cpp         | 62 ++++++++--------
 .../__libcpp_signed_integer.compile.pass.cpp  | 62 ++++++++--------
 ...__libcpp_unsigned_integer.compile.pass.cpp | 62 ++++++++--------
 22 files changed, 223 insertions(+), 245 deletions(-)
 create mode 100644 libcxx/include/__type_traits/integer_traits.h
 delete mode 100644 libcxx/include/__type_traits/is_signed_integer.h
 delete mode 100644 libcxx/include/__type_traits/is_unsigned_integer.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 52611e43968bc..8931a1b35f6d3 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -800,6 +800,7 @@ set(files
   __type_traits/extent.h
   __type_traits/has_unique_object_representation.h
   __type_traits/has_virtual_destructor.h
+  __type_traits/integer_traits.h
   __type_traits/integral_constant.h
   __type_traits/invoke.h
   __type_traits/is_abstract.h
@@ -850,7 +851,6 @@ set(files
   __type_traits/is_same.h
   __type_traits/is_scalar.h
   __type_traits/is_signed.h
-  __type_traits/is_signed_integer.h
   __type_traits/is_specialization.h
   __type_traits/is_standard_layout.h
   __type_traits/is_swappable.h
@@ -864,7 +864,6 @@ set(files
   __type_traits/is_unbounded_array.h
   __type_traits/is_union.h
   __type_traits/is_unsigned.h
-  __type_traits/is_unsigned_integer.h
   __type_traits/is_valid_expansion.h
   __type_traits/is_void.h
   __type_traits/is_volatile.h
diff --git a/libcxx/include/__bit/bit_ceil.h b/libcxx/include/__bit/bit_ceil.h
index cfd792dc2e2ad..99881a8538290 100644
--- a/libcxx/include/__bit/bit_ceil.h
+++ b/libcxx/include/__bit/bit_ceil.h
@@ -11,8 +11,8 @@
 
 #include <__assert>
 #include <__bit/countl.h>
-#include <__concepts/arithmetic.h>
 #include <__config>
+#include <__type_traits/integer_traits.h>
 #include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -41,7 +41,7 @@ template <class _Tp>
 
 #  if _LIBCPP_STD_VER >= 20
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept {
   return std::__bit_ceil(__t);
 }
diff --git a/libcxx/include/__bit/bit_floor.h b/libcxx/include/__bit/bit_floor.h
index 6bcbc53fb4972..799a064130b4b 100644
--- a/libcxx/include/__bit/bit_floor.h
+++ b/libcxx/include/__bit/bit_floor.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___BIT_BIT_FLOOR_H
 
 #include <__bit/bit_log2.h>
-#include <__concepts/arithmetic.h>
 #include <__config>
+#include <__type_traits/integer_traits.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 20
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept {
   return __t == 0 ? 0 : _Tp{1} << std::__bit_log2(__t);
 }
diff --git a/libcxx/include/__bit/bit_log2.h b/libcxx/include/__bit/bit_log2.h
index b22e1ce1f84e6..8077cd91d6fd7 100644
--- a/libcxx/include/__bit/bit_log2.h
+++ b/libcxx/include/__bit/bit_log2.h
@@ -11,7 +11,7 @@
 
 #include <__bit/countl.h>
 #include <__config>
-#include <__type_traits/is_unsigned_integer.h>
+#include <__type_traits/integer_traits.h>
 #include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __bit_log2(_Tp __t) _NOEXCEPT {
-  static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires an unsigned integer type");
+  static_assert(__is_unsigned_integer_v<_Tp>, "__bit_log2 requires an unsigned integer type");
   return numeric_limits<_Tp>::digits - 1 - std::__countl_zero(__t);
 }
 
diff --git a/libcxx/include/__bit/bit_width.h b/libcxx/include/__bit/bit_width.h
index 853e481776f7d..75050acabbe88 100644
--- a/libcxx/include/__bit/bit_width.h
+++ b/libcxx/include/__bit/bit_width.h
@@ -10,8 +10,8 @@
 #define _LIBCPP___BIT_BIT_WIDTH_H
 
 #include <__bit/bit_log2.h>
-#include <__concepts/arithmetic.h>
 #include <__config>
+#include <__type_traits/integer_traits.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -21,7 +21,7 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept {
   return __t == 0 ? 0 : std::__bit_log2(__t) + 1;
 }
diff --git a/libcxx/include/__bit/countl.h b/libcxx/include/__bit/countl.h
index 9499bf9b458ee..075914020879a 100644
--- a/libcxx/include/__bit/countl.h
+++ b/libcxx/include/__bit/countl.h
@@ -9,9 +9,8 @@
 #ifndef _LIBCPP___BIT_COUNTL_H
 #define _LIBCPP___BIT_COUNTL_H
 
-#include <__concepts/arithmetic.h>
 #include <__config>
-#include <__type_traits/is_unsigned_integer.h>
+#include <__type_traits/integer_traits.h>
 #include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -25,18 +24,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int __countl_zero(_Tp __t) _NOEXCEPT {
-  static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires an unsigned integer type");
+  static_assert(__is_unsigned_integer_v<_Tp>, "__countl_zero requires an unsigned integer type");
   return __builtin_clzg(__t, numeric_limits<_Tp>::digits);
 }
 
 #if _LIBCPP_STD_VER >= 20
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int countl_zero(_Tp __t) noexcept {
   return std::__countl_zero(__t);
 }
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int countl_one(_Tp __t) noexcept {
   return __t != numeric_limits<_Tp>::max() ? std::countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits;
 }
diff --git a/libcxx/include/__bit/countr.h b/libcxx/include/__bit/countr.h
index 1589f57c47385..f6c98695d3d06 100644
--- a/libcxx/include/__bit/countr.h
+++ b/libcxx/include/__bit/countr.h
@@ -9,9 +9,8 @@
 #ifndef _LIBCPP___BIT_COUNTR_H
 #define _LIBCPP___BIT_COUNTR_H
 
-#include <__concepts/arithmetic.h>
 #include <__config>
-#include <__type_traits/is_unsigned_integer.h>
+#include <__type_traits/integer_traits.h>
 #include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -25,18 +24,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
 [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __countr_zero(_Tp __t) _NOEXCEPT {
-  static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero only works with unsigned types");
+  static_assert(__is_unsigned_integer_v<_Tp>, "__countr_zero only works with unsigned types");
   return __builtin_ctzg(__t, numeric_limits<_Tp>::digits);
 }
 
 #if _LIBCPP_STD_VER >= 20
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int countr_zero(_Tp __t) noexcept {
   return std::__countr_zero(__t);
 }
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int countr_one(_Tp __t) noexcept {
   return __t != numeric_limits<_Tp>::max() ? std::countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits;
 }
diff --git a/libcxx/include/__bit/has_single_bit.h b/libcxx/include/__bit/has_single_bit.h
index 52f5853a1bc8a..b43e69323e77b 100644
--- a/libcxx/include/__bit/has_single_bit.h
+++ b/libcxx/include/__bit/has_single_bit.h
@@ -9,8 +9,8 @@
 #ifndef _LIBCPP___BIT_HAS_SINGLE_BIT_H
 #define _LIBCPP___BIT_HAS_SINGLE_BIT_H
 
-#include <__concepts/arithmetic.h>
 #include <__config>
+#include <__type_traits/integer_traits.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -23,7 +23,7 @@ _LIBCPP_PUSH_MACROS
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool has_single_bit(_Tp __t) noexcept {
   return __t != 0 && (((__t & (__t - 1)) == 0));
 }
diff --git a/libcxx/include/__bit/popcount.h b/libcxx/include/__bit/popcount.h
index 4be0e418e7aa6..8d9ba09938482 100644
--- a/libcxx/include/__bit/popcount.h
+++ b/libcxx/include/__bit/popcount.h
@@ -9,9 +9,8 @@
 #ifndef _LIBCPP___BIT_POPCOUNT_H
 #define _LIBCPP___BIT_POPCOUNT_H
 
-#include <__concepts/arithmetic.h>
 #include <__config>
-#include <__type_traits/is_unsigned_integer.h>
+#include <__type_traits/integer_traits.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -24,13 +23,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class _Tp>
 [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __popcount(_Tp __t) _NOEXCEPT {
-  static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__popcount only works with unsigned types");
+  static_assert(__is_unsigned_integer_v<_Tp>, "__popcount only works with unsigned types");
   return __builtin_popcountg(__t);
 }
 
 #if _LIBCPP_STD_VER >= 20
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int popcount(_Tp __t) noexcept {
   return std::__popcount(__t);
 }
diff --git a/libcxx/include/__bit/rotate.h b/libcxx/include/__bit/rotate.h
index d79d98de296aa..c6f34bdaf6e63 100644
--- a/libcxx/include/__bit/rotate.h
+++ b/libcxx/include/__bit/rotate.h
@@ -9,9 +9,8 @@
 #ifndef _LIBCPP___BIT_ROTATE_H
 #define _LIBCPP___BIT_ROTATE_H
 
-#include <__concepts/arithmetic.h>
 #include <__config>
-#include <__type_traits/is_unsigned_integer.h>
+#include <__type_traits/integer_traits.h>
 #include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -25,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // the rotr function becomes the ROR instruction.
 template <class _Tp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotl(_Tp __x, int __s) _NOEXCEPT {
-  static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires an unsigned integer type");
+  static_assert(__is_unsigned_integer_v<_Tp>, "__rotl requires an unsigned integer type");
   const int __n = numeric_limits<_Tp>::digits;
   int __r       = __s % __n;
 
@@ -40,7 +39,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotl(_Tp __x, int __s)
 
 template <class _Tp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotr(_Tp __x, int __s) _NOEXCEPT {
-  static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires an unsigned integer type");
+  static_assert(__is_unsigned_integer_v<_Tp>, "__rotr requires an unsigned integer type");
   const int __n = numeric_limits<_Tp>::digits;
   int __r       = __s % __n;
 
@@ -55,12 +54,12 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotr(_Tp __x, int __s)
 
 #if _LIBCPP_STD_VER >= 20
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp rotl(_Tp __t, int __cnt) noexcept {
   return std::__rotl(__t, __cnt);
 }
 
-template <__libcpp_unsigned_integer _Tp>
+template <__unsigned_integer _Tp>
 [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp rotr(_Tp __t, int __cnt) noexcept {
   return std::__rotr(__t, __cnt);
 }
diff --git a/libcxx/include/__concepts/arithmetic.h b/libcxx/include/__concepts/arithmetic.h
index 0c44f117805f3..64c0200783df7 100644
--- a/libcxx/include/__concepts/arithmetic.h
+++ b/libcxx/include/__concepts/arithmetic.h
@@ -13,8 +13,6 @@
 #include <__type_traits/is_floating_point.h>
 #include <__type_traits/is_integral.h>
 #include <__type_traits/is_signed.h>
-#include <__type_traits/is_signed_integer.h>
-#include <__type_traits/is_unsigned_integer.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -38,17 +36,6 @@ concept unsigned_integral = integral<_Tp> && !signed_integral<_Tp>;
 template <class _Tp>
 concept floating_point = is_floating_point_v<_Tp>;
 
-// Concept helpers for the internal type traits for the fundamental types.
-
-template <class _Tp>
-concept __libcpp_unsigned_integer = __libcpp_is_unsigned_integer<_Tp>::value;
-
-template <class _Tp>
-concept __libcpp_signed_integer = __libcpp_is_signed_integer<_Tp>::value;
-
-template <class _Tp>
-concept __libcpp_integer = __libcpp_unsigned_integer<_Tp> || __libcpp_signed_integer<_Tp>;
-
 #endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h
index 87557aa4da7bb..fbb4cad21b232 100644
--- a/libcxx/include/__format/format_arg_store.h
+++ b/libcxx/include/__format/format_arg_store.h
@@ -14,7 +14,6 @@
 #  pragma GCC system_header
 #endif
 
-#include <__concepts/arithmetic.h>
 #include <__concepts/same_as.h>
 #include <__config>
 #include <__cstddef/size_t.h>
@@ -22,6 +21,7 @@
 #include <__format/format_arg.h>
 #include <__type_traits/conditional.h>
 #include <__type_traits/extent.h>
+#include <__type_traits/integer_traits.h>
 #include <__type_traits/remove_const.h>
 #include <cstdint>
 #include <string>
@@ -65,7 +65,7 @@ consteval __arg_t __determine_arg_t() {
 #  endif
 
 // Signed integers
-template <class, __libcpp_signed_integer _Tp>
+template <class, __signed_integer _Tp>
 consteval __arg_t __determine_arg_t() {
   if constexpr (sizeof(_Tp) <= sizeof(int))
     return __arg_t::__int;
@@ -80,7 +80,7 @@ consteval __arg_t __determine_arg_t() {
 }
 
 // Unsigned integers
-template <class, __libcpp_unsigned_integer _Tp>
+template <class, __unsigned_integer _Tp>
 consteval __arg_t __determine_arg_t() {
   if constexpr (sizeof(_Tp) <= sizeof(unsigned))
     return __arg_t::__unsigned;
diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h
index 00454004851d5..99b54badf893c 100644
--- a/libcxx/include/__mdspan/extents.h
+++ b/libcxx/include/__mdspan/extents.h
@@ -21,11 +21,10 @@
 #include <__config>
 
 #include <__concepts/arithmetic.h>
-#include <__cstddef/byte.h>
 #include <__type_traits/common_type.h>
+#include <__type_traits/integer_traits.h>
 #include <__type_traits/is_convertible.h>
 #include <__type_traits/is_nothrow_constructible.h>
-#include <__type_traits/is_same.h>
 #include <__type_traits/make_unsigned.h>
 #include <__utility/integer_sequence.h>
 #include <__utility/unreachable.h>
@@ -283,7 +282,8 @@ class extents {
   using size_type  = make_unsigned_t<index_type>;
   using rank_type  = size_t;
 
-  static_assert(__libcpp_integer<index_type>, "extents::index_type must be a signed or unsigned integer type");
+  static_assert(__signed_or_unsigned_integer<index_type>,
+                "extents::index_type must be a signed or unsigned integer type");
   static_assert(((__mdspan_detail::__is_representable_as<index_type>(_Extents) || (_Extents == dynamic_extent)) && ...),
                 "extents ctor: arguments must be representable as index_type and nonnegative");
 
diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h
index 4110a8cb142a5..9bd3af12c9572 100644
--- a/libcxx/include/__numeric/saturation_arithmetic.h
+++ b/libcxx/include/__numeric/saturation_arithmetic.h
@@ -11,9 +11,9 @@
 #define _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H
 
 #include <__assert>
-#include <__concepts/arithmetic.h>
 #include <__config>
 #include <__memory/addressof.h>
+#include <__type_traits/integer_traits.h>
 #include <__utility/cmp.h>
 #include <limits>
 
@@ -28,12 +28,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 20
 
-template <__libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept {
   if (_Tp __sum; !__builtin_add_overflow(__x, __y, std::addressof(__sum)))
     return __sum;
   // Handle overflow
-  if constexpr (__libcpp_unsigned_integer<_Tp>) {
+  if constexpr (__unsigned_integer<_Tp>) {
     return std::numeric_limits<_Tp>::max();
   } else {
     // Signed addition overflow
@@ -46,12 +46,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept {
   }
 }
 
-template <__libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept {
   if (_Tp __sub; !__builtin_sub_overflow(__x, __y, std::addressof(__sub)))
     return __sub;
   // Handle overflow
-  if constexpr (__libcpp_unsigned_integer<_Tp>) {
+  if constexpr (__unsigned_integer<_Tp>) {
     // Overflows if (x < y)
     return std::numeric_limits<_Tp>::min();
   } else {
@@ -65,12 +65,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept {
   }
 }
 
-template <__libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Tp __mul_sat(_Tp __x, _Tp __y) noexcept {
   if (_Tp __mul; !__builtin_mul_overflow(__x, __y, std::addressof(__mul)))
     return __mul;
   // Handle overflow
-  if constexpr (__libcpp_unsigned_integer<_Tp>) {
+  if constexpr (__unsigned_integer<_Tp>) {
     return std::numeric_limits<_Tp>::max();
   } else {
     // Signed multiplication overflow
@@ -81,10 +81,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __mul_sat(_Tp __x, _Tp __y) noexcept {
   }
 }
 
-template <__libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Tp __div_sat(_Tp __x, _Tp __y) noexcept {
   _LIBCPP_ASSERT_UNCATEGORIZED(__y != 0, "Division by 0 is undefined");
-  if constexpr (__libcpp_unsigned_integer<_Tp>) {
+  if constexpr (__unsigned_integer<_Tp>) {
     return __x / __y;
   } else {
     // Handle signed division overflow
@@ -94,7 +94,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __div_sat(_Tp __x, _Tp __y) noexcept {
   }
 }
 
-template <__libcpp_integer _Rp, __libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Rp, __signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Rp __saturate_cast(_Tp __x) noexcept {
   // Saturation is impossible edge case when ((min _Rp) < (min _Tp) && (max _Rp) > (max _Tp)) and it is expected to be
   // optimized out by the compiler.
@@ -112,27 +112,27 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Rp __saturate_cast(_Tp __x) noexcept {
 
 #if _LIBCPP_STD_VER >= 26
 
-template <__libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Tp add_sat(_Tp __x, _Tp __y) noexcept {
   return std::__add_sat(__x, __y);
 }
 
-template <__libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Tp sub_sat(_Tp __x, _Tp __y) noexcept {
   return std::__sub_sat(__x, __y);
 }
 
-template <__libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Tp mul_sat(_Tp __x, _Tp __y) noexcept {
   return std::__mul_sat(__x, __y);
 }
 
-template <__libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Tp div_sat(_Tp __x, _Tp __y) noexcept {
   return std::__div_sat(__x, __y);
 }
 
-template <__libcpp_integer _Rp, __libcpp_integer _Tp>
+template <__signed_or_unsigned_integer _Rp, __signed_or_unsigned_integer _Tp>
 _LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept {
   return std::__saturate_cast<_Rp>(__x);
 }
diff --git a/libcxx/include/__type_traits/integer_traits.h b/libcxx/include/__type_traits/integer_traits.h
new file mode 100644
index 0000000000000..fad502c44e301
--- /dev/null
+++ b/libcxx/include/__type_traits/integer_traits.h
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___TYPE_TRAITS_INTEGER_TRAITS_H
+#define _LIBCPP___TYPE_TRAITS_INTEGER_TRAITS_H
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// This trait is to determine whether a type is a /signed integer type/
+// See [basic.fundamental]/p1
+template <class _Tp>
+inline const bool __is_signed_integer_v = false;
+template <>
+inline const bool __is_signed_integer_v<signed char> = true;
+template <>
+inline const bool __is_signed_integer_v<signed short> = true;
+template <>
+inline const bool __is_signed_integer_v<signed int> = true;
+template <>
+inline const bool __is_signed_integer_v<signed long> = true;
+template <>
+inline const bool __is_signed_integer_v<signed long long> = true;
+#if _LIBCPP_HAS_INT128
+template <>
+inline const bool __is_signed_integer_v<__int128_t> = true;
+#endif
+
+// This trait is to determine whether a type is an /unsigned integer type/
+// See [basic.fundamental]/p2
+template <class _Tp>
+inline const bool __is_unsigned_integer_v = false;
+template <>
+inline const bool __is_unsigned_integer_v<unsigned char> = true;
+template <>
+inline const bool __is_unsigned_integer_v<unsigned short> = true;
+template <>
+inline const bool __is_unsigned_integer_v<unsigned int> = true;
+template <>
+inline const bool __is_unsigned_integer_v<unsigned long> = true;
+template <>
+inline const bool __is_unsigned_integer_v<unsigned long long> = true;
+#if _LIBCPP_HAS_INT128
+template <>
+inline const bool __is_unsigned_integer_v<__uint128_t> = true;
+#endif
+
+#if _LIBCPP_STD_VER >= 20
+template <class _Tp>
+concept __signed_integer = __is_signed_integer_v<_Tp>;
+
+template <class _Tp>
+concept __unsigned_integer = __is_unsigned_integer_v<_Tp>;
+
+// This isn't called __integer, because an integer type according to [basic.fundamental]/p11 is the same as an integral
+// type. An integral type is _not_ the same set of types as signed and unsigned integer types combined.
+template <class _Tp>
+concept __signed_or_unsigned_integer = __signed_integer<_Tp> || __unsigned_integer<_Tp>;
+#endif
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___TYPE_TRAITS_INTEGER_TRAITS_H
diff --git a/libcxx/include/__type_traits/is_signed_integer.h b/libcxx/include/__type_traits/is_signed_integer.h
deleted file mode 100644
index 62943902a1834..0000000000000
--- a/libcxx/include/__type_traits/is_signed_integer.h
+++ /dev/null
@@ -1,35 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_SIGNED_INTEGER_H
-#define _LIBCPP___TYPE_TRAITS_IS_SIGNED_INTEGER_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-// clang-format off
-template <class _Tp> struct __libcpp_is_signed_integer                   : false_type {};
-template <>          struct __libcpp_is_signed_integer<signed char>      : true_type {};
-template <>          struct __libcpp_is_signed_integer<signed short>     : true_type {};
-template <>          struct __libcpp_is_signed_integer<signed int>       : true_type {};
-template <>          struct __libcpp_is_signed_integer<signed long>      : true_type {};
-template <>          struct __libcpp_is_signed_integer<signed long long> : true_type {};
-#if _LIBCPP_HAS_INT128
-template <>          struct __libcpp_is_signed_integer<__int128_t>       : true_type {};
-#endif
-// clang-format on
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_SIGNED_INTEGER_H
diff --git a/libcxx/include/__type_traits/is_unsigned_integer.h b/libcxx/include/__type_traits/is_unsigned_integer.h
deleted file mode 100644
index 74414a831e79a..0000000000000
--- a/libcxx/include/__type_traits/is_unsigned_integer.h
+++ /dev/null
@@ -1,35 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H
-#define _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H
-
-#include <__config>
-#include <__type_traits/integral_constant.h>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-#  pragma GCC system_header
-#endif
-
-_LIBCPP_BEGIN_NAMESPACE_STD
-
-// clang-format off
-template <class _Tp> struct __libcpp_is_unsigned_integer                     : false_type {};
-template <>          struct __libcpp_is_unsigned_integer<unsigned char>      : true_type {};
-template <>          struct __libcpp_is_unsigned_integer<unsigned short>     : true_type {};
-template <>          struct __libcpp_is_unsigned_integer<unsigned int>       : true_type {};
-template <>          struct __libcpp_is_unsigned_integer<unsigned long>      : true_type {};
-template <>          struct __libcpp_is_unsigned_integer<unsigned long long> : true_type {};
-#if _LIBCPP_HAS_INT128
-template <>          struct __libcpp_is_unsigned_integer<__uint128_t>        : true_type {};
-#endif
-// clang-format on
-
-_LIBCPP_END_NAMESPACE_STD
-
-#endif // _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H
diff --git a/libcxx/include/__utility/cmp.h b/libcxx/include/__utility/cmp.h
index b7c1ed614dfcb..14dc0c154c040 100644
--- a/libcxx/include/__utility/cmp.h
+++ b/libcxx/include/__utility/cmp.h
@@ -9,8 +9,8 @@
 #ifndef _LIBCPP___UTILITY_CMP_H
 #define _LIBCPP___UTILITY_CMP_H
 
-#include <__concepts/arithmetic.h>
 #include <__config>
+#include <__type_traits/integer_traits.h>
 #include <__type_traits/is_signed.h>
 #include <__type_traits/make_unsigned.h>
 #include <limits>
@@ -26,7 +26,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if _LIBCPP_STD_VER >= 20
 
-template <__libcpp_integer _Tp, __libcpp_integer _Up>
+template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_equal(_Tp __t, _Up __u) noexcept {
   if constexpr (is_signed_v<_Tp> == is_signed_v<_Up>)
     return __t == __u;
@@ -36,12 +36,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_equal(_Tp __t, _Up __u) noexcept {
     return __u < 0 ? false : __t == make_unsigned_t<_Up>(__u);
 }
 
-template <__libcpp_integer _Tp, __libcpp_integer _Up>
+template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_not_equal(_Tp __t, _Up __u) noexcept {
   return !std::cmp_equal(__t, __u);
 }
 
-template <__libcpp_integer _Tp, __libcpp_integer _Up>
+template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_less(_Tp __t, _Up __u) noexcept {
   if constexpr (is_signed_v<_Tp> == is_signed_v<_Up>)
     return __t < __u;
@@ -51,22 +51,22 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_less(_Tp __t, _Up __u) noexcept {
     return __u < 0 ? false : __t < make_unsigned_t<_Up>(__u);
 }
 
-template <__libcpp_integer _Tp, __libcpp_integer _Up>
+template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_greater(_Tp __t, _Up __u) noexcept {
   return std::cmp_less(__u, __t);
 }
 
-template <__libcpp_integer _Tp, __libcpp_integer _Up>
+template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_less_equal(_Tp __t, _Up __u) noexcept {
   return !std::cmp_greater(__t, __u);
 }
 
-template <__libcpp_integer _Tp, __libcpp_integer _Up>
+template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_greater_equal(_Tp __t, _Up __u) noexcept {
   return !std::cmp_less(__t, __u);
 }
 
-template <__libcpp_integer _Tp, __libcpp_integer _Up>
+template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up>
 _LIBCPP_HIDE_FROM_ABI constexpr bool in_range(_Up __u) noexcept {
   return std::cmp_less_equal(__u, numeric_limits<_Tp>::max()) &&
          std::cmp_greater_equal(__u, numeric_limits<_Tp>::min());
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 7f625cefed1c2..f5fd970934e9b 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -94,6 +94,7 @@ module std_core [system] {
     module extent                                     { header "__type_traits/extent.h" }
     module has_unique_object_representation           { header "__type_traits/has_unique_object_representation.h" }
     module has_virtual_destructor                     { header "__type_traits/has_virtual_destructor.h" }
+    module integer_traits                             { header "__type_traits/integer_traits.h" }
     module integral_constant                          { header "__type_traits/integral_constant.h" }
     module invoke                                     { header "__type_traits/invoke.h" }
     module is_abstract {
@@ -284,10 +285,6 @@ module std_core [system] {
       header "__type_traits/is_scalar.h"
       export std_core.type_traits.integral_constant
     }
-    module is_signed_integer {
-      header "__type_traits/is_signed_integer.h"
-      export std_core.type_traits.integral_constant
-    }
     module is_signed {
       header "__type_traits/is_signed.h"
       export std_core.type_traits.integral_constant
@@ -340,10 +337,6 @@ module std_core [system] {
       header "__type_traits/is_union.h"
       export std_core.type_traits.integral_constant
     }
-    module is_unsigned_integer {
-      header "__type_traits/is_unsigned_integer.h"
-      export std_core.type_traits.integral_constant
-    }
     module is_unsigned {
       header "__type_traits/is_unsigned.h"
       export std_core.type_traits.integral_constant
diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
index 563580b687955..4958a258137a1 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp
@@ -11,9 +11,9 @@
 // Concept helpers for the internal type traits for the fundamental types.
 
 // template <class _Tp>
-// concept __libcpp_integer;
+// concept __signed_or_unsigned_integer;
 
-#include <concepts>
+#include <__type_traits/integer_traits.h>
 
 #include "test_macros.h"
 
@@ -24,40 +24,40 @@ enum SomeEnum {};
 enum class SomeScopedEnum {};
 
 // Unsigned
-static_assert(std::__libcpp_integer<unsigned char>);
-static_assert(std::__libcpp_integer<unsigned short int>);
-static_assert(std::__libcpp_integer<unsigned int>);
-static_assert(std::__libcpp_integer<unsigned long int>);
-static_assert(std::__libcpp_integer<unsigned long long int>);
-static_assert(std::__libcpp_integer<unsigned short int>);
+static_assert(std::__signed_or_unsigned_integer<unsigned char>);
+static_assert(std::__signed_or_unsigned_integer<unsigned short int>);
+static_assert(std::__signed_or_unsigned_integer<unsigned int>);
+static_assert(std::__signed_or_unsigned_integer<unsigned long int>);
+static_assert(std::__signed_or_unsigned_integer<unsigned long long int>);
+static_assert(std::__signed_or_unsigned_integer<unsigned short int>);
 #if _LIBCPP_HAS_INT128
-static_assert(std::__libcpp_integer<__uint128_t>);
+static_assert(std::__signed_or_unsigned_integer<__uint128_t>);
 #endif
 // Signed
-static_assert(std::__libcpp_integer<signed char>);
-static_assert(std::__libcpp_integer<short int>);
-static_assert(std::__libcpp_integer<int>);
-static_assert(std::__libcpp_integer<long int>);
-static_assert(std::__libcpp_integer<long long int>);
-static_assert(std::__libcpp_integer<short int>);
+static_assert(std::__signed_or_unsigned_integer<signed char>);
+static_assert(std::__signed_or_unsigned_integer<short int>);
+static_assert(std::__signed_or_unsigned_integer<int>);
+static_assert(std::__signed_or_unsigned_integer<long int>);
+static_assert(std::__signed_or_unsigned_integer<long long int>);
+static_assert(std::__signed_or_unsigned_integer<short int>);
 #if _LIBCPP_HAS_INT128
-static_assert(std::__libcpp_integer<__int128_t>);
+static_assert(std::__signed_or_unsigned_integer<__int128_t>);
 #endif
 // Non-integer
-static_assert(!std::__libcpp_integer<bool>);
-static_assert(!std::__libcpp_integer<char>);
+static_assert(!std::__signed_or_unsigned_integer<bool>);
+static_assert(!std::__signed_or_unsigned_integer<char>);
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
-static_assert(!std::__libcpp_integer<wchar_t>);
+static_assert(!std::__signed_or_unsigned_integer<wchar_t>);
 #endif
-static_assert(!std::__libcpp_integer<char8_t>);
-static_assert(!std::__libcpp_integer<char16_t>);
-static_assert(!std::__libcpp_integer<char32_t>);
-static_assert(!std::__libcpp_integer<float>);
-static_assert(!std::__libcpp_integer<double>);
-static_assert(!std::__libcpp_integer<long double>);
-static_assert(!std::__libcpp_integer<void>);
-static_assert(!std::__libcpp_integer<int*>);
-static_assert(!std::__libcpp_integer<unsigned int*>);
-static_assert(!std::__libcpp_integer<SomeObject>);
-static_assert(!std::__libcpp_integer<SomeEnum>);
-static_assert(!std::__libcpp_integer<SomeScopedEnum>);
+static_assert(!std::__signed_or_unsigned_integer<char8_t>);
+static_assert(!std::__signed_or_unsigned_integer<char16_t>);
+static_assert(!std::__signed_or_unsigned_integer<char32_t>);
+static_assert(!std::__signed_or_unsigned_integer<float>);
+static_assert(!std::__signed_or_unsigned_integer<double>);
+static_assert(!std::__signed_or_unsigned_integer<long double>);
+static_assert(!std::__signed_or_unsigned_integer<void>);
+static_assert(!std::__signed_or_unsigned_integer<int*>);
+static_assert(!std::__signed_or_unsigned_integer<unsigned int*>);
+static_assert(!std::__signed_or_unsigned_integer<SomeObject>);
+static_assert(!std::__signed_or_unsigned_integer<SomeEnum>);
+static_assert(!std::__signed_or_unsigned_integer<SomeScopedEnum>);
diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
index d1e21ee96b073..3fa342685770c 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp
@@ -11,9 +11,9 @@
 // Concept helpers for the internal type traits for the fundamental types.
 
 // template <class _Tp>
-// concept __libcpp_signed_integer;
+// concept __signed_integer;
 
-#include <concepts>
+#include <__type_traits/integer_traits.h>
 
 #include "test_macros.h"
 
@@ -24,40 +24,40 @@ enum SomeEnum {};
 enum class SomeScopedEnum {};
 
 // Unsigned
-static_assert(!std::__libcpp_signed_integer<unsigned char>);
-static_assert(!std::__libcpp_signed_integer<unsigned short int>);
-static_assert(!std::__libcpp_signed_integer<unsigned int>);
-static_assert(!std::__libcpp_signed_integer<unsigned long int>);
-static_assert(!std::__libcpp_signed_integer<unsigned long long int>);
-static_assert(!std::__libcpp_signed_integer<unsigned short int>);
+static_assert(!std::__signed_integer<unsigned char>);
+static_assert(!std::__signed_integer<unsigned short int>);
+static_assert(!std::__signed_integer<unsigned int>);
+static_assert(!std::__signed_integer<unsigned long int>);
+static_assert(!std::__signed_integer<unsigned long long int>);
+static_assert(!std::__signed_integer<unsigned short int>);
 #if _LIBCPP_HAS_INT128
-static_assert(!std::__libcpp_signed_integer<__uint128_t>);
+static_assert(!std::__signed_integer<__uint128_t>);
 #endif
 // Signed
-static_assert(std::__libcpp_signed_integer<signed char>);
-static_assert(std::__libcpp_signed_integer<short int>);
-static_assert(std::__libcpp_signed_integer<int>);
-static_assert(std::__libcpp_signed_integer<long int>);
-static_assert(std::__libcpp_signed_integer<long long int>);
-static_assert(std::__libcpp_signed_integer<short int>);
+static_assert(std::__signed_integer<signed char>);
+static_assert(std::__signed_integer<short int>);
+static_assert(std::__signed_integer<int>);
+static_assert(std::__signed_integer<long int>);
+static_assert(std::__signed_integer<long long int>);
+static_assert(std::__signed_integer<short int>);
 #if _LIBCPP_HAS_INT128
-static_assert(std::__libcpp_signed_integer<__int128_t>);
+static_assert(std::__signed_integer<__int128_t>);
 #endif
 // Non-integer
-static_assert(!std::__libcpp_signed_integer<bool>);
-static_assert(!std::__libcpp_signed_integer<char>);
+static_assert(!std::__signed_integer<bool>);
+static_assert(!std::__signed_integer<char>);
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
-static_assert(!std::__libcpp_signed_integer<wchar_t>);
+static_assert(!std::__signed_integer<wchar_t>);
 #endif
-static_assert(!std::__libcpp_signed_integer<char8_t>);
-static_assert(!std::__libcpp_signed_integer<char16_t>);
-static_assert(!std::__libcpp_signed_integer<char32_t>);
-static_assert(!std::__libcpp_signed_integer<float>);
-static_assert(!std::__libcpp_signed_integer<double>);
-static_assert(!std::__libcpp_signed_integer<long double>);
-static_assert(!std::__libcpp_signed_integer<void>);
-static_assert(!std::__libcpp_signed_integer<int*>);
-static_assert(!std::__libcpp_signed_integer<unsigned int*>);
-static_assert(!std::__libcpp_signed_integer<SomeObject>);
-static_assert(!std::__libcpp_signed_integer<SomeEnum>);
-static_assert(!std::__libcpp_signed_integer<SomeScopedEnum>);
+static_assert(!std::__signed_integer<char8_t>);
+static_assert(!std::__signed_integer<char16_t>);
+static_assert(!std::__signed_integer<char32_t>);
+static_assert(!std::__signed_integer<float>);
+static_assert(!std::__signed_integer<double>);
+static_assert(!std::__signed_integer<long double>);
+static_assert(!std::__signed_integer<void>);
+static_assert(!std::__signed_integer<int*>);
+static_assert(!std::__signed_integer<unsigned int*>);
+static_assert(!std::__signed_integer<SomeObject>);
+static_assert(!std::__signed_integer<SomeEnum>);
+static_assert(!std::__signed_integer<SomeScopedEnum>);
diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
index c671f03cbfce4..ff60f32319171 100644
--- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
+++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp
@@ -11,9 +11,9 @@
 // Concept helpers for the internal type traits for the fundamental types.
 
 // template <class _Tp>
-// concept __libcpp_unsigned_integer;
+// concept __unsigned_integer;
 
-#include <concepts>
+#include <__type_traits/integer_traits.h>
 
 #include "test_macros.h"
 
@@ -24,40 +24,40 @@ enum SomeEnum {};
 enum class SomeScopedEnum {};
 
 // Unsigned
-static_assert(std::__libcpp_unsigned_integer<unsigned char>);
-static_assert(std::__libcpp_unsigned_integer<unsigned short int>);
-static_assert(std::__libcpp_unsigned_integer<unsigned int>);
-static_assert(std::__libcpp_unsigned_integer<unsigned long int>);
-static_assert(std::__libcpp_unsigned_integer<unsigned long long int>);
-static_assert(std::__libcpp_unsigned_integer<unsigned short int>);
+static_assert(std::__unsigned_integer<unsigned char>);
+static_assert(std::__unsigned_integer<unsigned short int>);
+static_assert(std::__unsigned_integer<unsigned int>);
+static_assert(std::__unsigned_integer<unsigned long int>);
+static_assert(std::__unsigned_integer<unsigned long long int>);
+static_assert(std::__unsigned_integer<unsigned short int>);
 #if _LIBCPP_HAS_INT128
-static_assert(std::__libcpp_unsigned_integer<__uint128_t>);
+static_assert(std::__unsigned_integer<__uint128_t>);
 #endif
 // Signed
-static_assert(!std::__libcpp_unsigned_integer<signed char>);
-static_assert(!std::__libcpp_unsigned_integer<short int>);
-static_assert(!std::__libcpp_unsigned_integer<int>);
-static_assert(!std::__libcpp_unsigned_integer<long int>);
-static_assert(!std::__libcpp_unsigned_integer<long long int>);
-static_assert(!std::__libcpp_unsigned_integer<short int>);
+static_assert(!std::__unsigned_integer<signed char>);
+static_assert(!std::__unsigned_integer<short int>);
+static_assert(!std::__unsigned_integer<int>);
+static_assert(!std::__unsigned_integer<long int>);
+static_assert(!std::__unsigned_integer<long long int>);
+static_assert(!std::__unsigned_integer<short int>);
 #if _LIBCPP_HAS_INT128
-static_assert(!std::__libcpp_unsigned_integer<__int128_t>);
+static_assert(!std::__unsigned_integer<__int128_t>);
 #endif
 // Non-integer
-static_assert(!std::__libcpp_unsigned_integer<bool>);
-static_assert(!std::__libcpp_unsigned_integer<char>);
+static_assert(!std::__unsigned_integer<bool>);
+static_assert(!std::__unsigned_integer<char>);
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
-static_assert(!std::__libcpp_unsigned_integer<wchar_t>);
+static_assert(!std::__unsigned_integer<wchar_t>);
 #endif
-static_assert(!std::__libcpp_unsigned_integer<char8_t>);
-static_assert(!std::__libcpp_unsigned_integer<char16_t>);
-static_assert(!std::__libcpp_unsigned_integer<char32_t>);
-static_assert(!std::__libcpp_unsigned_integer<float>);
-static_assert(!std::__libcpp_unsigned_integer<double>);
-static_assert(!std::__libcpp_unsigned_integer<long double>);
-static_assert(!std::__libcpp_unsigned_integer<void>);
-static_assert(!std::__libcpp_unsigned_integer<int*>);
-static_assert(!std::__libcpp_unsigned_integer<unsigned int*>);
-static_assert(!std::__libcpp_unsigned_integer<SomeObject>);
-static_assert(!std::__libcpp_unsigned_integer<SomeEnum>);
-static_assert(!std::__libcpp_unsigned_integer<SomeScopedEnum>);
+static_assert(!std::__unsigned_integer<char8_t>);
+static_assert(!std::__unsigned_integer<char16_t>);
+static_assert(!std::__unsigned_integer<char32_t>);
+static_assert(!std::__unsigned_integer<float>);
+static_assert(!std::__unsigned_integer<double>);
+static_assert(!std::__unsigned_integer<long double>);
+static_assert(!std::__unsigned_integer<void>);
+static_assert(!std::__unsigned_integer<int*>);
+static_assert(!std::__unsigned_integer<unsigned int*>);
+static_assert(!std::__unsigned_integer<SomeObject>);
+static_assert(!std::__unsigned_integer<SomeEnum>);
+static_assert(!std::__unsigned_integer<SomeScopedEnum>);

From b10d711362b8634cefcb288d9f1b577f63adb9f7 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 14:33:41 +0200
Subject: [PATCH 071/851] [libc++][NFC] Move __libcpp_is_integral into the 
 else branch (#142556)

This makes it clear that `__libcpp_is_integral` is an implementation
detail of `is_integral` if we don't have `__is_integral` and not its own
utility.
---
 libcxx/include/__type_traits/is_integral.h | 24 +++++++++++-----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/libcxx/include/__type_traits/is_integral.h b/libcxx/include/__type_traits/is_integral.h
index 7f7ac26beb770..5a340965f0384 100644
--- a/libcxx/include/__type_traits/is_integral.h
+++ b/libcxx/include/__type_traits/is_integral.h
@@ -19,6 +19,18 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+#if __has_builtin(__is_integral)
+
+template <class _Tp>
+struct _LIBCPP_NO_SPECIALIZATIONS is_integral : _BoolConstant<__is_integral(_Tp)> {};
+
+#  if _LIBCPP_STD_VER >= 17
+template <class _Tp>
+_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_integral_v = __is_integral(_Tp);
+#  endif
+
+#else
+
 // clang-format off
 template <class _Tp> struct __libcpp_is_integral                     { enum { value = 0 }; };
 template <>          struct __libcpp_is_integral<bool>               { enum { value = 1 }; };
@@ -47,18 +59,6 @@ template <>          struct __libcpp_is_integral<__uint128_t>        { enum { va
 #endif
 // clang-format on
 
-#if __has_builtin(__is_integral)
-
-template <class _Tp>
-struct _LIBCPP_NO_SPECIALIZATIONS is_integral : _BoolConstant<__is_integral(_Tp)> {};
-
-#  if _LIBCPP_STD_VER >= 17
-template <class _Tp>
-_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_integral_v = __is_integral(_Tp);
-#  endif
-
-#else
-
 template <class _Tp>
 struct is_integral : public _BoolConstant<__libcpp_is_integral<__remove_cv_t<_Tp> >::value> {};
 

From 2692c3aa6760f1e4ea015f906926f63ec7dce044 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Wed, 11 Jun 2025 12:39:09 +0000
Subject: [PATCH 072/851] [gn build] Port 3c56437eafee

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 803247bd7881e..41516d677c45a 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1457,6 +1457,7 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/extent.h",
       "__type_traits/has_unique_object_representation.h",
       "__type_traits/has_virtual_destructor.h",
+      "__type_traits/integer_traits.h",
       "__type_traits/integral_constant.h",
       "__type_traits/invoke.h",
       "__type_traits/is_abstract.h",
@@ -1507,7 +1508,6 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_same.h",
       "__type_traits/is_scalar.h",
       "__type_traits/is_signed.h",
-      "__type_traits/is_signed_integer.h",
       "__type_traits/is_specialization.h",
       "__type_traits/is_standard_layout.h",
       "__type_traits/is_swappable.h",
@@ -1521,7 +1521,6 @@ if (current_toolchain == default_toolchain) {
       "__type_traits/is_unbounded_array.h",
       "__type_traits/is_union.h",
       "__type_traits/is_unsigned.h",
-      "__type_traits/is_unsigned_integer.h",
       "__type_traits/is_valid_expansion.h",
       "__type_traits/is_void.h",
       "__type_traits/is_volatile.h",

From 3d7aa961ac96f83d2e28f107c6dfa5a6a279b364 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Wed, 11 Jun 2025 13:56:30 +0100
Subject: [PATCH 073/851] [DebugInfo][RemoveDIs] Use autoupgrader to convert
 old debug-info (#143452)

By chance, two things have prevented the autoupgrade path being
exercised much so far:
 * LLParser setting the debug-info mode to "old" on seeing intrinsics,
* The test in AutoUpgrade.cpp wanting to upgrade into a "new" debug-info
block.

In practice, this appears to mean this code path hasn't seen the various
invalid inputs that can come its way. This commit does a number of
things:
* Tolerates the various illegal inputs that can be written with
debug-intrinsics, and that must be tolerated until the Verifier runs,
 * Printing illegal/null DbgRecord fields must succeed,
* Verifier errors need to localise the function/block where the error
is,
 * Tests that now see debug records will print debug-record errors,

Plus a few new tests for other intrinsic-to-debug-record failures modes
I found. There are also two edge cases:
* Some of the unit tests switch back and forth between intrinsic and
record modes at will; I've deleted coverage and some assertions to
tolerate this as intrinsic support is now Gone (TM),
* In sroa-extract-bits.ll, the order of debug records flips. This is
because the autoupgrader upgrades in the opposite order to the basic
block conversion routines... which doesn't change the record order, but
_does_ change the use list order in Metadata! This should (TM) have no
consequence to the correctness of LLVM, but will change the order of
various records and the order of DWARF record output too.

I tried to reduce this patch to a smaller collection of changes, but
they're all intertwined, sorry.
---
 llvm/lib/AsmParser/LLParser.cpp               |  2 -
 llvm/lib/IR/AsmWriter.cpp                     | 39 +++++++---
 llvm/lib/IR/AutoUpgrade.cpp                   | 77 +++++++++++++------
 llvm/lib/IR/BasicBlock.cpp                    |  4 -
 llvm/lib/IR/Verifier.cpp                      | 29 +++----
 .../drop-debug-info-nonzero-alloca.ll         |  6 +-
 .../parse-and-verify/verify.ll                | 18 ++---
 .../DebugInfo/Generic/sroa-extract-bits.ll    | 28 +++----
 .../IROutliner/outlining-debug-statements.ll  |  3 +-
 llvm/test/Transforms/ObjCARC/code-motion.ll   | 13 ++--
 .../RemoveDI/invalid-dbg-declare-operands.ll  | 46 +++++++++++
 .../Verifier/dbg-declare-invalid-debug-loc.ll | 42 ++++++++++
 .../diexpression-entry-value-llvm-ir.ll       |  6 +-
 .../test/Verifier/llvm.dbg.declare-address.ll |  4 +-
 .../Verifier/llvm.dbg.declare-expression.ll   |  5 +-
 .../Verifier/llvm.dbg.declare-variable.ll     | 11 ++-
 .../llvm.dbg.intrinsic-dbg-attachment.ll      | 16 ++--
 .../Verifier/llvm.dbg.value-expression.ll     |  5 +-
 llvm/test/Verifier/llvm.dbg.value-value.ll    |  4 +-
 llvm/test/Verifier/llvm.dbg.value-variable.ll |  5 +-
 llvm/unittests/IR/DebugInfoTest.cpp           | 13 ----
 21 files changed, 249 insertions(+), 127 deletions(-)
 create mode 100644 llvm/test/Verifier/RemoveDI/invalid-dbg-declare-operands.ll
 create mode 100644 llvm/test/Verifier/dbg-declare-invalid-debug-loc.ll

diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index b933d240c4d27..5c007dcf00224 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -8336,8 +8336,6 @@ bool LLParser::parseCall(Instruction *&Inst, PerFunctionState &PFS,
       return error(CallLoc, "llvm.dbg intrinsic should not appear in a module "
                             "using non-intrinsic debug info");
     }
-    if (!SeenOldDbgInfoFormat)
-      M->setNewDbgInfoFormatFlag(false);
     SeenOldDbgInfoFormat = true;
   }
   CI->setAttributes(PAL);
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 7223dd845d18d..7828ba45ec27f 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -1204,17 +1204,23 @@ void SlotTracker::processFunctionMetadata(const Function &F) {
 }
 
 void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) {
+  // Tolerate null metadata pointers: it's a completely illegal debug record,
+  // but we can have faulty metadata from debug-intrinsic days being
+  // autoupgraded into debug records. This gets caught by the verifier, which
+  // then will print the faulty IR, hitting this code path.
   if (const DbgVariableRecord *DVR = dyn_cast<const DbgVariableRecord>(&DR)) {
     // Process metadata used by DbgRecords; we only specifically care about the
     // DILocalVariable, DILocation, and DIAssignID fields, as the Value and
     // Expression fields should only be printed inline and so do not use a slot.
     // Note: The above doesn't apply for empty-metadata operands.
-    if (auto *Empty = dyn_cast<MDNode>(DVR->getRawLocation()))
+    if (auto *Empty = dyn_cast_if_present<MDNode>(DVR->getRawLocation()))
       CreateMetadataSlot(Empty);
-    CreateMetadataSlot(DVR->getRawVariable());
+    if (DVR->getRawVariable())
+      CreateMetadataSlot(DVR->getRawVariable());
     if (DVR->isDbgAssign()) {
-      CreateMetadataSlot(cast<MDNode>(DVR->getRawAssignID()));
-      if (auto *Empty = dyn_cast<MDNode>(DVR->getRawAddress()))
+      if (auto *AssignID = DVR->getRawAssignID())
+        CreateMetadataSlot(cast<MDNode>(AssignID));
+      if (auto *Empty = dyn_cast_if_present<MDNode>(DVR->getRawAddress()))
         CreateMetadataSlot(Empty);
     }
   } else if (const DbgLabelRecord *DLR = dyn_cast<const DbgLabelRecord>(&DR)) {
@@ -1222,7 +1228,8 @@ void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) {
   } else {
     llvm_unreachable("unsupported DbgRecord kind");
   }
-  CreateMetadataSlot(DR.getDebugLoc().getAsMDNode());
+  if (DR.getDebugLoc())
+    CreateMetadataSlot(DR.getDebugLoc().getAsMDNode());
 }
 
 void SlotTracker::processInstructionMetadata(const Instruction &I) {
@@ -4867,22 +4874,30 @@ void AssemblyWriter::printDbgVariableRecord(const DbgVariableRecord &DVR) {
     llvm_unreachable(
         "Tried to print a DbgVariableRecord with an invalid LocationType!");
   }
+
+  auto PrintOrNull = [&](Metadata *M) {
+    if (!M)
+      Out << "(null)";
+    else
+      WriteAsOperandInternal(Out, M, WriterCtx, true);
+  };
+
   Out << "(";
-  WriteAsOperandInternal(Out, DVR.getRawLocation(), WriterCtx, true);
+  PrintOrNull(DVR.getRawLocation());
   Out << ", ";
-  WriteAsOperandInternal(Out, DVR.getRawVariable(), WriterCtx, true);
+  PrintOrNull(DVR.getRawVariable());
   Out << ", ";
-  WriteAsOperandInternal(Out, DVR.getRawExpression(), WriterCtx, true);
+  PrintOrNull(DVR.getRawExpression());
   Out << ", ";
   if (DVR.isDbgAssign()) {
-    WriteAsOperandInternal(Out, DVR.getRawAssignID(), WriterCtx, true);
+    PrintOrNull(DVR.getRawAssignID());
     Out << ", ";
-    WriteAsOperandInternal(Out, DVR.getRawAddress(), WriterCtx, true);
+    PrintOrNull(DVR.getRawAddress());
     Out << ", ";
-    WriteAsOperandInternal(Out, DVR.getRawAddressExpression(), WriterCtx, true);
+    PrintOrNull(DVR.getRawAddressExpression());
     Out << ", ";
   }
-  WriteAsOperandInternal(Out, DVR.getDebugLoc().getAsMDNode(), WriterCtx, true);
+  PrintOrNull(DVR.getDebugLoc().getAsMDNode());
   Out << ")";
 }
 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 814c00c669cb3..cb90af36f3d9f 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1155,8 +1155,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
   case 'd':
     if (Name.consume_front("dbg.")) {
       // Mark debug intrinsics for upgrade to new debug format.
-      if (CanUpgradeDebugIntrinsicsToRecords &&
-          F->getParent()->IsNewDbgInfoFormat) {
+      if (CanUpgradeDebugIntrinsicsToRecords) {
         if (Name == "addr" || Name == "value" || Name == "assign" ||
             Name == "declare" || Name == "label") {
           // There's no function to replace these with.
@@ -4395,39 +4394,66 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
   return Builder.CreateBitCast(RMW, RetTy);
 }
 
-/// Helper to unwrap intrinsic call MetadataAsValue operands.
-template <typename MDType>
-static MDType *unwrapMAVOp(CallBase *CI, unsigned Op) {
-  if (MetadataAsValue *MAV = dyn_cast<MetadataAsValue>(CI->getArgOperand(Op)))
-    return dyn_cast<MDType>(MAV->getMetadata());
+/// Helper to unwrap intrinsic call MetadataAsValue operands. Return as a
+/// plain MDNode, as it's the verifier's job to check these are the correct
+/// types later.
+static MDNode *unwrapMAVOp(CallBase *CI, unsigned Op) {
+  if (Op < CI->arg_size()) {
+    if (MetadataAsValue *MAV =
+            dyn_cast<MetadataAsValue>(CI->getArgOperand(Op))) {
+      Metadata *MD = MAV->getMetadata();
+      return dyn_cast_if_present<MDNode>(MD);
+    }
+  }
+  return nullptr;
+}
+
+/// Helper to unwrap Metadata MetadataAsValue operands, such as the Value field.
+static Metadata *unwrapMAVMetadataOp(CallBase *CI, unsigned Op) {
+  if (Op < CI->arg_size())
+    if (MetadataAsValue *MAV = dyn_cast<MetadataAsValue>(CI->getArgOperand(Op)))
+      return MAV->getMetadata();
   return nullptr;
 }
 
+static MDNode *getDebugLocSafe(const Instruction *I) {
+  // The MDNode attached to this instruction might not be the correct type,
+  // as the verifier has not yet be run. Fetch it as a bare MDNode.
+  return I->getDebugLoc().getAsMDNode();
+}
+
 /// Convert debug intrinsic calls to non-instruction debug records.
 /// \p Name - Final part of the intrinsic name, e.g. 'value' in llvm.dbg.value.
 /// \p CI - The debug intrinsic call.
 static void upgradeDbgIntrinsicToDbgRecord(StringRef Name, CallBase *CI) {
   DbgRecord *DR = nullptr;
   if (Name == "label") {
-    DR = new DbgLabelRecord(unwrapMAVOp<DILabel>(CI, 0), CI->getDebugLoc());
+    DR = DbgLabelRecord::createUnresolvedDbgLabelRecord(unwrapMAVOp(CI, 0),
+                                                        CI->getDebugLoc());
   } else if (Name == "assign") {
-    DR = new DbgVariableRecord(
-        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, 1),
-        unwrapMAVOp<DIExpression>(CI, 2), unwrapMAVOp<DIAssignID>(CI, 3),
-        unwrapMAVOp<Metadata>(CI, 4), unwrapMAVOp<DIExpression>(CI, 5),
-        CI->getDebugLoc());
+    DR = DbgVariableRecord::createUnresolvedDbgVariableRecord(
+        DbgVariableRecord::LocationType::Assign, unwrapMAVMetadataOp(CI, 0),
+        unwrapMAVOp(CI, 1), unwrapMAVOp(CI, 2), unwrapMAVOp(CI, 3),
+        unwrapMAVMetadataOp(CI, 4),
+        /*The address is a Value ref, it will be stored as a Metadata */
+        unwrapMAVOp(CI, 5), getDebugLocSafe(CI));
   } else if (Name == "declare") {
-    DR = new DbgVariableRecord(
-        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, 1),
-        unwrapMAVOp<DIExpression>(CI, 2), CI->getDebugLoc(),
-        DbgVariableRecord::LocationType::Declare);
+    DR = DbgVariableRecord::createUnresolvedDbgVariableRecord(
+        DbgVariableRecord::LocationType::Declare, unwrapMAVMetadataOp(CI, 0),
+        unwrapMAVOp(CI, 1), unwrapMAVOp(CI, 2), nullptr, nullptr, nullptr,
+        getDebugLocSafe(CI));
   } else if (Name == "addr") {
     // Upgrade dbg.addr to dbg.value with DW_OP_deref.
-    DIExpression *Expr = unwrapMAVOp<DIExpression>(CI, 2);
-    Expr = DIExpression::append(Expr, dwarf::DW_OP_deref);
-    DR = new DbgVariableRecord(unwrapMAVOp<Metadata>(CI, 0),
-                               unwrapMAVOp<DILocalVariable>(CI, 1), Expr,
-                               CI->getDebugLoc());
+    MDNode *ExprNode = unwrapMAVOp(CI, 2);
+    // Don't try to add something to the expression if it's not an expression.
+    // Instead, allow the verifier to fail later.
+    if (DIExpression *Expr = dyn_cast<DIExpression>(ExprNode)) {
+      ExprNode = DIExpression::append(Expr, dwarf::DW_OP_deref);
+    }
+    DR = DbgVariableRecord::createUnresolvedDbgVariableRecord(
+        DbgVariableRecord::LocationType::Value, unwrapMAVMetadataOp(CI, 0),
+        unwrapMAVOp(CI, 1), ExprNode, nullptr, nullptr, nullptr,
+        getDebugLocSafe(CI));
   } else if (Name == "value") {
     // An old version of dbg.value had an extra offset argument.
     unsigned VarOp = 1;
@@ -4440,9 +4466,10 @@ static void upgradeDbgIntrinsicToDbgRecord(StringRef Name, CallBase *CI) {
       VarOp = 2;
       ExprOp = 3;
     }
-    DR = new DbgVariableRecord(
-        unwrapMAVOp<Metadata>(CI, 0), unwrapMAVOp<DILocalVariable>(CI, VarOp),
-        unwrapMAVOp<DIExpression>(CI, ExprOp), CI->getDebugLoc());
+    DR = DbgVariableRecord::createUnresolvedDbgVariableRecord(
+        DbgVariableRecord::LocationType::Value, unwrapMAVMetadataOp(CI, 0),
+        unwrapMAVOp(CI, VarOp), unwrapMAVOp(CI, ExprOp), nullptr, nullptr,
+        nullptr, getDebugLocSafe(CI));
   }
   assert(DR && "Unhandled intrinsic kind in upgrade to DbgRecord");
   CI->getParent()->insertDbgRecordBefore(DR, CI->getIterator());
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index ed11ea06398f1..f716e9970b841 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -32,8 +32,6 @@ using namespace llvm;
 STATISTIC(NumInstrRenumberings, "Number of renumberings across all blocks");
 
 DbgMarker *BasicBlock::createMarker(Instruction *I) {
-  assert(IsNewDbgInfoFormat &&
-         "Tried to create a marker in a non new debug-info block!");
   if (I->DebugMarker)
     return I->DebugMarker;
   DbgMarker *Marker = new DbgMarker();
@@ -43,8 +41,6 @@ DbgMarker *BasicBlock::createMarker(Instruction *I) {
 }
 
 DbgMarker *BasicBlock::createMarker(InstListType::iterator It) {
-  assert(IsNewDbgInfoFormat &&
-         "Tried to create a marker in a non new debug-info block!");
   if (It != end())
     return createMarker(&*It);
   DbgMarker *DM = getTrailingDbgRecords();
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 592bb6aa90613..9ec94a8b80959 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6714,7 +6714,7 @@ void Verifier::visit(DbgVariableRecord &DVR) {
   CheckDI(DVR.getType() == DbgVariableRecord::LocationType::Value ||
               DVR.getType() == DbgVariableRecord::LocationType::Declare ||
               DVR.getType() == DbgVariableRecord::LocationType::Assign,
-          "invalid #dbg record type", &DVR, DVR.getType());
+          "invalid #dbg record type", &DVR, DVR.getType(), BB, F);
 
   // The location for a DbgVariableRecord must be either a ValueAsMetadata,
   // DIArgList, or an empty MDNode (which is a legacy representation for an
@@ -6722,30 +6722,33 @@ void Verifier::visit(DbgVariableRecord &DVR) {
   auto *MD = DVR.getRawLocation();
   CheckDI(MD && (isa<ValueAsMetadata>(MD) || isa<DIArgList>(MD) ||
                  (isa<MDNode>(MD) && !cast<MDNode>(MD)->getNumOperands())),
-          "invalid #dbg record address/value", &DVR, MD);
+          "invalid #dbg record address/value", &DVR, MD, BB, F);
   if (auto *VAM = dyn_cast<ValueAsMetadata>(MD)) {
     visitValueAsMetadata(*VAM, F);
     if (DVR.isDbgDeclare()) {
       // Allow integers here to support inttoptr salvage.
       Type *Ty = VAM->getValue()->getType();
       CheckDI(Ty->isPointerTy() || Ty->isIntegerTy(),
-              "location of #dbg_declare must be a pointer or int", &DVR, MD);
+              "location of #dbg_declare must be a pointer or int", &DVR, MD, BB,
+              F);
     }
   } else if (auto *AL = dyn_cast<DIArgList>(MD)) {
     visitDIArgList(*AL, F);
   }
 
   CheckDI(isa_and_nonnull<DILocalVariable>(DVR.getRawVariable()),
-          "invalid #dbg record variable", &DVR, DVR.getRawVariable());
+          "invalid #dbg record variable", &DVR, DVR.getRawVariable(), BB, F);
   visitMDNode(*DVR.getRawVariable(), AreDebugLocsAllowed::No);
 
   CheckDI(isa_and_nonnull<DIExpression>(DVR.getRawExpression()),
-          "invalid #dbg record expression", &DVR, DVR.getRawExpression());
+          "invalid #dbg record expression", &DVR, DVR.getRawExpression(), BB,
+          F);
   visitMDNode(*DVR.getExpression(), AreDebugLocsAllowed::No);
 
   if (DVR.isDbgAssign()) {
     CheckDI(isa_and_nonnull<DIAssignID>(DVR.getRawAssignID()),
-            "invalid #dbg_assign DIAssignID", &DVR, DVR.getRawAssignID());
+            "invalid #dbg_assign DIAssignID", &DVR, DVR.getRawAssignID(), BB,
+            F);
     visitMDNode(*cast<DIAssignID>(DVR.getRawAssignID()),
                 AreDebugLocsAllowed::No);
 
@@ -6756,29 +6759,29 @@ void Verifier::visit(DbgVariableRecord &DVR) {
     CheckDI(
         isa<ValueAsMetadata>(RawAddr) ||
             (isa<MDNode>(RawAddr) && !cast<MDNode>(RawAddr)->getNumOperands()),
-        "invalid #dbg_assign address", &DVR, DVR.getRawAddress());
+        "invalid #dbg_assign address", &DVR, DVR.getRawAddress(), BB, F);
     if (auto *VAM = dyn_cast<ValueAsMetadata>(RawAddr))
       visitValueAsMetadata(*VAM, F);
 
     CheckDI(isa_and_nonnull<DIExpression>(DVR.getRawAddressExpression()),
             "invalid #dbg_assign address expression", &DVR,
-            DVR.getRawAddressExpression());
+            DVR.getRawAddressExpression(), BB, F);
     visitMDNode(*DVR.getAddressExpression(), AreDebugLocsAllowed::No);
 
     // All of the linked instructions should be in the same function as DVR.
     for (Instruction *I : at::getAssignmentInsts(&DVR))
       CheckDI(DVR.getFunction() == I->getFunction(),
-              "inst not in same function as #dbg_assign", I, &DVR);
+              "inst not in same function as #dbg_assign", I, &DVR, BB, F);
   }
 
   // This check is redundant with one in visitLocalVariable().
   DILocalVariable *Var = DVR.getVariable();
-  CheckDI(isType(Var->getRawType()), "invalid type ref", Var,
-          Var->getRawType());
+  CheckDI(isType(Var->getRawType()), "invalid type ref", Var, Var->getRawType(),
+          BB, F);
 
   auto *DLNode = DVR.getDebugLoc().getAsMDNode();
   CheckDI(isa_and_nonnull<DILocation>(DLNode), "invalid #dbg record DILocation",
-          &DVR, DLNode);
+          &DVR, DLNode, BB, F);
   DILocation *Loc = DVR.getDebugLoc();
 
   // The scopes for variables and !dbg attachments must agree.
@@ -6790,7 +6793,7 @@ void Verifier::visit(DbgVariableRecord &DVR) {
   CheckDI(VarSP == LocSP,
           "mismatched subprogram between #dbg record variable and DILocation",
           &DVR, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
-          Loc->getScope()->getSubprogram());
+          Loc->getScope()->getSubprogram(), BB, F);
 
   verifyFnArgs(DVR);
 }
diff --git a/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll b/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll
index 2b089d2639375..c8b235757afba 100644
--- a/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll
+++ b/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll
@@ -12,8 +12,12 @@ entry:
       metadata ptr undef,
       metadata !DILocalVariable(scope: !1),
       metadata !DIExpression())
-; AS: llvm.dbg.value intrinsic requires a !dbg attachment
+; AS: invalid #dbg record DILocation
+; AS: #dbg_value(ptr undef, !{{[0-9]+}}, !DIExpression(), (null))
+; AS: label %entry
+; AS: ptr @foo
 ; AS: warning: ignoring invalid debug info in <stdin>
+
 ret void
 }
 
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll
index 0a4b7c255dc71..d1f1e1ce768dc 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll
@@ -8,7 +8,7 @@
 
 define dso_local void @fun2() !dbg !15 {
   ;; DIAssignID copied here from @fun() where it is used by intrinsics.
-  ; CHECK: dbg.assign not in same function as inst
+  ; CHECK: DVRAssign not in same function as inst
   %x = alloca i32, align 4, !DIAssignID !14
   ret void
 }
@@ -17,24 +17,24 @@ define dso_local void @fun() !dbg !7 {
 entry:
   %a = alloca i32, align 4, !DIAssignID !14
   ;; Here something other than a dbg.assign intrinsic is using a DIAssignID.
-  ; CHECK: !DIAssignID should only be used by llvm.dbg.assign intrinsics
+  ; CHECK: !DIAssignID should only be used by Assign DVRs
   call void @llvm.dbg.value(metadata !14, metadata !10, metadata !DIExpression()), !dbg !13
 
   ;; Each following dbg.assign has an argument of the incorrect type.
-  ; CHECK: invalid llvm.dbg.assign intrinsic address/value
+  ; CHECK: invalid #dbg record address/value
   call void @llvm.dbg.assign(metadata !3, metadata !10, metadata !DIExpression(), metadata !14, metadata ptr undef, metadata !DIExpression()), !dbg !13
-  ; CHECK: invalid llvm.dbg.assign intrinsic variable
+  ; CHECK: invalid #dbg record variable
   call void @llvm.dbg.assign(metadata i32 0, metadata !2, metadata !DIExpression(), metadata !14, metadata ptr undef, metadata !DIExpression()), !dbg !13
-  ; CHECK: invalid llvm.dbg.assign intrinsic expression
+  ; CHECK: invalid #dbg record expression
   call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !2, metadata !14, metadata ptr undef, metadata !DIExpression()), !dbg !13
-  ; CHECK: invalid llvm.dbg.assign intrinsic DIAssignID
+  ; CHECK: invalid #dbg_assign DIAssignID
   call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !DIExpression(), metadata !2, metadata ptr undef, metadata !DIExpression()), !dbg !13
-  ; CHECK: invalid llvm.dbg.assign intrinsic address
+  ; CHECK: invalid #dbg_assign address
   call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !DIExpression(), metadata !14, metadata !3, metadata !DIExpression()), !dbg !13
   ;; Empty metadata debug operands are allowed.
-  ; CHECK-NOT: invalid llvm.dbg.assign
+  ; CHECK-NOT: invalid #dbg record
   call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !DIExpression(), metadata !14, metadata !2, metadata !DIExpression()), !dbg !13
-  ; CHECK: invalid llvm.dbg.assign intrinsic address expression
+  ; CHECK: invalid #dbg_assign address expression
   call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !DIExpression(), metadata !14, metadata ptr undef, metadata !2), !dbg !13
   ret void
 }
diff --git a/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll b/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll
index f47e495db6617..6db453605cb57 100644
--- a/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll
+++ b/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll
@@ -13,8 +13,8 @@ define i8 @test1(i32 %arg) {
 ; CHECK-NEXT:      #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7:![0-9]+]])
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 8
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i24
-; CHECK-NEXT:      #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 16), [[META7]])
-; CHECK-NEXT:      #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 16), [[META7]])
 ; CHECK-NEXT:    ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]]
 ;
 entry:
@@ -36,11 +36,11 @@ define i8 @test2(i32 %arg1, i8 %arg2) {
 ; CHECK-NEXT:      #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]])
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 8
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16
-; CHECK-NEXT:      #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 16), [[META7]])
+; CHECK-NEXT:      #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 16), [[META7]])
 ; CHECK-NEXT:    [[PTR_SROA_21_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 24
 ; CHECK-NEXT:    [[PTR_SROA_21_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_21_0_EXTRACT_SHIFT]] to i8
-; CHECK-NEXT:      #dbg_value(i8 [[PTR_SROA_21_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]])
-; CHECK-NEXT:      #dbg_value(i8 [[ARG2]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i8 [[PTR_SROA_21_0_EXTRACT_TRUNC]], [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i8 [[ARG2]], [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]])
 ; CHECK-NEXT:    ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]]
 ;
 entry:
@@ -84,7 +84,7 @@ define i16 @test4(i32 %arg) {
 ; CHECK-NEXT:      #dbg_value(i16 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]])
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 16
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16
-; CHECK-NEXT:      #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]])
 ; CHECK-NEXT:    ret i16 [[PTR_SROA_0_0_EXTRACT_TRUNC]]
 ;
 entry:
@@ -107,8 +107,8 @@ define i8 @test5(i32 %arg) {
 ; CHECK-NEXT:      #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META11:![0-9]+]], !DIExpression(), [[META7]])
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 8
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i24
-; CHECK-NEXT:      #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]])
-; CHECK-NEXT:      #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]])
 ; CHECK-NEXT:    ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]]
 ;
 entry:
@@ -130,11 +130,11 @@ define i8 @test6(i32 %arg1, i8 %arg2) {
 ; CHECK-NEXT:      #dbg_value(i8 poison, [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 8
 ; CHECK-NEXT:    [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16
-; CHECK-NEXT:      #dbg_value(i16 poison, [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), [[META7]])
+; CHECK-NEXT:      #dbg_value(i16 poison, [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), [[META7]])
 ; CHECK-NEXT:    [[PTR_SROA_21_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 24
 ; CHECK-NEXT:    [[PTR_SROA_21_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_21_0_EXTRACT_SHIFT]] to i8
-; CHECK-NEXT:      #dbg_value(i8 poison, [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
-; CHECK-NEXT:      #dbg_value(i8 poison, [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i8 poison, [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
+; CHECK-NEXT:      #dbg_value(i8 poison, [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]])
 ; CHECK-NEXT:    ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]]
 ;
 entry:
@@ -197,9 +197,9 @@ entry:
 ; CHECK: [[META5]] = !DIFile(filename: "dbg-bit-piece.cpp", directory: "")
 ; CHECK: [[META6]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
 ; CHECK: [[META7]] = !DILocation(line: 0, scope: [[META3]])
-; CHECK: [[META8]] = !DILocalVariable(name: "z", scope: [[META3]], type: [[META6]])
-; CHECK: [[META9]] = !DILocalVariable(name: "y", scope: [[META3]], type: [[META10:![0-9]+]])
-; CHECK: [[META10]] = !DIBasicType(name: "signed int", size: 32, encoding: DW_ATE_signed)
+; CHECK: [[META8]] = !DILocalVariable(name: "y", scope: [[META3]], type: [[META9:![0-9]+]])
+; CHECK: [[META9]] = !DIBasicType(name: "signed int", size: 32, encoding: DW_ATE_signed)
+; CHECK: [[META10]] = !DILocalVariable(name: "z", scope: [[META3]], type: [[META6]])
 ; CHECK: [[META11]] = !DILocalVariable(name: "x", scope: [[META3]], type: [[META12:![0-9]+]])
 ; CHECK: [[META12]] = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
 ;.
diff --git a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll
index bf846c310a525..c1140988fa916 100644
--- a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll
+++ b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll
@@ -19,7 +19,7 @@ entry:
   %c = alloca i32, align 4
   store i32 2, ptr %a, align 4
   store i32 3, ptr %b, align 4
-  call void @llvm.dbg.value(metadata i64 0, metadata !14, metadata !DIExpression()), !dbg !14
+  call void @llvm.dbg.value(metadata i64 0, metadata !14, metadata !DIExpression()), !dbg !15
   store i32 4, ptr %c, align 4
   %al = load i32, ptr %a
   %bl = load i32, ptr %b
@@ -62,3 +62,4 @@ entry:
 !12 = !DISubroutineType(types: !13)
 !13 = !{}
 !14 = !DILocalVariable(name: "p_6", arg: 1, scope: !11, line: 117, type: !1)
+!15 = !DILocation(line: 1, scope: !11)
diff --git a/llvm/test/Transforms/ObjCARC/code-motion.ll b/llvm/test/Transforms/ObjCARC/code-motion.ll
index 499ee77bc6541..9009b98b4b1e3 100644
--- a/llvm/test/Transforms/ObjCARC/code-motion.ll
+++ b/llvm/test/Transforms/ObjCARC/code-motion.ll
@@ -1,4 +1,4 @@
-; RUN: opt -passes=objc-arc -S < %s | FileCheck %s
+; RUN: opt -passes=objc-arc -S < %s 2>&1 | FileCheck %s '--implicit-check-not=ignoring invalid debug'
 
 declare void @alterRefCount()
 declare void @use(ptr)
@@ -17,7 +17,7 @@ define i32 @test(ptr %x, ptr %y, i8 %z, i32 %i) {
   store i32 %i, ptr %i.addr, align 4
   %v1 = tail call ptr @llvm.objc.retain(ptr %x)
   store i8 %z, ptr %x
-  call void @llvm.dbg.declare(metadata ptr %i.addr, metadata !9, metadata !DIExpression()), !dbg !10
+  call void @llvm.dbg.declare(metadata ptr %i.addr, metadata !11, metadata !DIExpression()), !dbg !10
   call void @alterRefCount()
   tail call void @llvm.objc.release(ptr %x)
   ret i32 %i
@@ -64,7 +64,7 @@ define void @test3(ptr %obj, i1 %cond) {
 ; CHECK-NEXT:    call void @use(ptr [[OBJ]])
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
-; CHECK-NEXT:    call void @llvm.objc.release(ptr [[OBJ]]) {{.*}}, !clang.imprecise_release !2
+; CHECK-NEXT:    call void @llvm.objc.release(ptr [[OBJ]]) {{.*}}, !clang.imprecise_release ![[EMPTYMETA:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   %v0 = call ptr @llvm.objc.retain(ptr %obj)
@@ -102,8 +102,8 @@ define void @test4(ptr %obj0, ptr %obj1, i1 %cond) {
 ; CHECK-NEXT:    call void @use(ptr [[OBJ1]])
 ; CHECK-NEXT:    br label [[JOIN]]
 ; CHECK:       join:
-; CHECK-NEXT:    call void @llvm.objc.release(ptr [[OBJ0]]) {{.*}}, !clang.imprecise_release !2
-; CHECK-NEXT:    call void @llvm.objc.release(ptr [[OBJ1]]) {{.*}}, !clang.imprecise_release !2
+; CHECK-NEXT:    call void @llvm.objc.release(ptr [[OBJ0]]) {{.*}}, !clang.imprecise_release ![[EMPTYMETA]]
+; CHECK-NEXT:    call void @llvm.objc.release(ptr [[OBJ1]]) {{.*}}, !clang.imprecise_release ![[EMPTYMETA]]
 ; CHECK-NEXT:    ret void
 ;
   %v0 = call ptr @llvm.objc.retain(ptr %obj0)
@@ -190,6 +190,8 @@ attributes #0 = { readonly }
 
 !llvm.module.flags = !{!0, !1}
 
+; CHECK: ![[EMPTYMETA]] = !{}
+
 !0 = !{i32 2, !"Dwarf Version", i32 4}
 !1 = !{i32 2, !"Debug Info Version", i32 3}
 !2 = !DILocalVariable(name: "i", arg: 1, scope: !3, file: !4, line: 1, type: !7)
@@ -201,3 +203,4 @@ attributes #0 = { readonly }
 !8 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !4, isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !9, nameTableKind: None)
 !9 = !{}
 !10 = !DILocation(line: 1, column: 14, scope: !3)
+!11 = !DILocalVariable(name: "foo", scope: !3, type: !7)
diff --git a/llvm/test/Verifier/RemoveDI/invalid-dbg-declare-operands.ll b/llvm/test/Verifier/RemoveDI/invalid-dbg-declare-operands.ll
new file mode 100644
index 0000000000000..cdc9d8df82aa7
--- /dev/null
+++ b/llvm/test/Verifier/RemoveDI/invalid-dbg-declare-operands.ll
@@ -0,0 +1,46 @@
+; RUN: llvm-as %s -o - 2>&1 | FileCheck %s
+; CHECK: invalid #dbg record expression
+;
+; Fossilised debug-info with only two arguments to dbg.declare have been
+; spotted in LLVMs test suite (debug-info-always-inline.ll), test that this
+; does not cause a crash. LLVM needs to be able to autoupgrade invalid
+; dbg.declares to invalid #dbg_declares because this occurs before the
+; Verifier runs.
+
+; ModuleID = 'out.ll'
+source_filename = "llvm/test/DebugInfo/Generic/debug-info-always-inline.ll"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+; Function Attrs: alwaysinline nounwind sspstrong
+define i32 @_Z3foov() !dbg !7 {
+entry:
+  %sum = alloca i32, align 4, !dbg !11
+  call void @llvm.dbg.declare(metadata ptr %sum,  metadata !26), !dbg !11
+  ret i32 0, !dbg !15
+}
+
+declare void @_Z3barv()
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+!llvm.dbg.cu = !{!3}
+!llvm.debugify = !{!5, !6}
+
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!2 = !{!"clang version 3.6.0 (217844)"}
+!3 = distinct !DICompileUnit(language: DW_LANG_C, file: !4, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!4 = !DIFile(filename: "/fast/fs/llvm-main/llvm/test/DebugInfo/Generic/debug-info-always-inline.ll", directory: "/")
+!5 = !{i32 14}
+!6 = !{i32 7}
+!7 = distinct !DISubprogram(name: "_Z3foov", linkageName: "_Z3foov", scope: null, file: !4, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, retainedNodes: !9)
+!8 = !DISubroutineType(types: !9)
+!9 = !{}
+!11 = !DILocation(line: 2, column: 1, scope: !7)
+!15 = !DILocation(line: 6, column: 1, scope: !7)
+!25 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!26 = !DILocalVariable(name: "b", scope: !7, file: !4, line: 1234, type: !25)
+
diff --git a/llvm/test/Verifier/dbg-declare-invalid-debug-loc.ll b/llvm/test/Verifier/dbg-declare-invalid-debug-loc.ll
new file mode 100644
index 0000000000000..c521a9b8eb11b
--- /dev/null
+++ b/llvm/test/Verifier/dbg-declare-invalid-debug-loc.ll
@@ -0,0 +1,42 @@
+; RUN: opt %s -o /dev/null -S 2>&1 | FileCheck %s
+;
+; The last dbg.declare intrinsic in this file has an illegal DILocation -- this
+; needs to pass through the autoupgrade to #dbg_declare process and then get
+; caught by the verifier.
+;
+; CHECK:      invalid #dbg record DILocation
+; CHECK-NEXT: #dbg_declare(ptr %1, ![[VAR:[0-9]+]], !DIExpression(), ![[PROG:[0-9]+]])
+; CHECK-NEXT: ![[PROG]] = distinct !DISubprogram(name: "IgnoreIntrinsicTest",
+; CHECK-NEXT: label %0
+; CHECK-NEXT: ptr @IgnoreIntrinsicTest
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+define i32 @IgnoreIntrinsicTest() !dbg !10 {
+  %1 = alloca i32, align 4
+  call void @llvm.dbg.declare(metadata ptr %1, metadata !14, metadata !DIExpression()), !dbg !10
+  store volatile i32 1, ptr %1, align 4
+  %2 = load volatile i32, ptr %1, align 4
+  %3 = mul nsw i32 %2, 42
+  ret i32 %3
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.4 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !5, globals: !5, imports: !5)
+!1 = !DIFile(filename: "<unknown>", directory: "/Users/matt/ryan_bug")
+!2 = !{!3}
+!3 = !DICompositeType(tag: DW_TAG_enumeration_type, scope: !4, file: !1, line: 20, size: 32, align: 32, elements: !6)
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "C", file: !1, line: 19, size: 8, align: 8, elements: !5)
+!5 = !{}
+!6 = !{!7}
+!7 = !DIEnumerator(name: "max_frame_size", value: 0)
+!8 = !{i32 2, !"Dwarf Version", i32 2}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = distinct !DISubprogram(name: "IgnoreIntrinsicTest", linkageName: "IgnoreIntrinsicTest", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !5)
+!11 = !DISubroutineType(types: !12)
+!12 = !{!13}
+!13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!14 = !DILocalVariable(name: "x", scope: !10, file: !1, line: 2, type: !13)
+!15 = !DILocation(line: 2, column: 16, scope: !10)
diff --git a/llvm/test/Verifier/diexpression-entry-value-llvm-ir.ll b/llvm/test/Verifier/diexpression-entry-value-llvm-ir.ll
index 652e6667bfc5c..1a28f0ec519f7 100644
--- a/llvm/test/Verifier/diexpression-entry-value-llvm-ir.ll
+++ b/llvm/test/Verifier/diexpression-entry-value-llvm-ir.ll
@@ -1,9 +1,9 @@
 ; RUN: llvm-as -disable-output <%s 2>&1| FileCheck %s
 
-; CHECK-NOT: llvm.dbg.value
+; CHECK-NOT: #dbg_value
 ; CHECK: Entry values are only allowed in MIR unless they target a swiftasync Argument
-; CHECK: call void @llvm.dbg.value(metadata i32 %param, metadata !{{.*}}, metadata !DIExpression(DW_OP_LLVM_entry_value, 1))
-; CHECK-NOT: llvm.dbg.value
+; CHECK: #dbg_value(i32 %param, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1),
+; CHECK-NOT: #dbg_value
 ; CHECK-NOT: Entry values are only allowed
 ; CHECK: warning: ignoring invalid debug info
 
diff --git a/llvm/test/Verifier/llvm.dbg.declare-address.ll b/llvm/test/Verifier/llvm.dbg.declare-address.ll
index 219f9ca0a6679..251526b4c321b 100644
--- a/llvm/test/Verifier/llvm.dbg.declare-address.ll
+++ b/llvm/test/Verifier/llvm.dbg.declare-address.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s
-; CHECK: invalid llvm.dbg.declare intrinsic address/value
-; CHECK-NEXT: call void @llvm.dbg.declare({{.*}})
+; CHECK: invalid #dbg record address/value
+; CHECK-NEXT: #dbg_declare({{.*}})
 ; CHECK-NEXT: !""
 ; CHECK: warning: ignoring invalid debug info
 
diff --git a/llvm/test/Verifier/llvm.dbg.declare-expression.ll b/llvm/test/Verifier/llvm.dbg.declare-expression.ll
index 671ec21780088..de65bb570677e 100644
--- a/llvm/test/Verifier/llvm.dbg.declare-expression.ll
+++ b/llvm/test/Verifier/llvm.dbg.declare-expression.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s
-; CHECK: invalid llvm.dbg.declare intrinsic expression
-; CHECK-NEXT: call void @llvm.dbg.declare({{.*}})
-; CHECK-NEXT: !""
+; CHECK: invalid #dbg record expression
+; CHECK-NEXT: #dbg_declare({{.*}})
 ; CHECK: warning: ignoring invalid debug info
 
 define void @foo(i32 %a) {
diff --git a/llvm/test/Verifier/llvm.dbg.declare-variable.ll b/llvm/test/Verifier/llvm.dbg.declare-variable.ll
index 4f0ae4daa822f..601fab190d36b 100644
--- a/llvm/test/Verifier/llvm.dbg.declare-variable.ll
+++ b/llvm/test/Verifier/llvm.dbg.declare-variable.ll
@@ -1,13 +1,16 @@
 ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s
-; CHECK: invalid llvm.dbg.declare intrinsic variable
-; CHECK-NEXT: call void @llvm.dbg.declare({{.*}})
-; CHECK-NEXT: !""
+; CHECK: invalid #dbg record variable
+; CHECK-NEXT: #dbg_declare({{.*}})
+; CHECK-NEXT: DISubprogram
 ; CHECK: warning: ignoring invalid debug info
 
+;; This test ensures we report an illegal variable as illegal, but also that
+;; the illegal MDNode is printed out (DISubprogram) to help localise.
+
 define void @foo(i32 %a) {
 entry:
   %s = alloca i32
-  call void @llvm.dbg.declare(metadata ptr %s, metadata !"", metadata !DIExpression()), !dbg !DILocation(scope: !1)
+  call void @llvm.dbg.declare(metadata ptr %s, metadata !1, metadata !DIExpression()), !dbg !DILocation(scope: !1)
   ret void
 }
 
diff --git a/llvm/test/Verifier/llvm.dbg.intrinsic-dbg-attachment.ll b/llvm/test/Verifier/llvm.dbg.intrinsic-dbg-attachment.ll
index 5d82f490e055d..b1e22b20d0864 100644
--- a/llvm/test/Verifier/llvm.dbg.intrinsic-dbg-attachment.ll
+++ b/llvm/test/Verifier/llvm.dbg.intrinsic-dbg-attachment.ll
@@ -5,8 +5,8 @@ entry:
       metadata ptr undef,
       metadata !DILocalVariable(scope: !1),
       metadata !DIExpression())
-; CHECK-LABEL: llvm.dbg.value intrinsic requires a !dbg attachment
-; CHECK-NEXT: call void @llvm.dbg.value({{.*}})
+; CHECK-LABEL: invalid #dbg record DILocation
+; CHECK-NEXT: #dbg_value({{.*}})
 ; CHECK-NEXT: label %entry
 ; CHECK-NEXT: ptr @foo
 
@@ -14,8 +14,8 @@ entry:
       metadata ptr undef,
       metadata !DILocalVariable(scope: !1),
       metadata !DIExpression())
-; CHECK-LABEL: llvm.dbg.declare intrinsic requires a !dbg attachment
-; CHECK-NEXT: call void @llvm.dbg.declare({{.*}})
+; CHECK-LABEL: invalid #dbg record DILocation
+; CHECK-NEXT: #dbg_declare({{.*}})
 ; CHECK-NEXT: label %entry
 ; CHECK-NEXT: ptr @foo
 
@@ -24,8 +24,8 @@ entry:
       metadata !DILocalVariable(scope: !1),
       metadata !DIExpression()),
     !dbg !DILocation(scope: !2)
-; CHECK-LABEL: mismatched subprogram between llvm.dbg.value variable and !dbg attachment
-; CHECK-NEXT: call void @llvm.dbg.value({{[^,]+}}, metadata ![[VAR:[0-9]+]], {{[^,]+}}), !dbg ![[LOC:[0-9]+]]
+; CHECK-LABEL: mismatched subprogram between #dbg record variable and DILocation
+; CHECK-NEXT: #dbg_value({{[^,]+}}, ![[VAR:[0-9]+]], {{[^,]+}}, ![[LOC:[0-9]+]])
 ; CHECK-NEXT: label %entry
 ; CHECK-NEXT: ptr @foo
 ; CHECK-NEXT: ![[VAR]] = !DILocalVariable({{.*}}scope: ![[VARSP:[0-9]+]]
@@ -38,8 +38,8 @@ entry:
       metadata !DILocalVariable(scope: !1),
       metadata !DIExpression()),
     !dbg !DILocation(scope: !2)
-; CHECK-LABEL: mismatched subprogram between llvm.dbg.declare variable and !dbg attachment
-; CHECK-NEXT: call void @llvm.dbg.declare({{[^,]+}}, metadata ![[VAR:[0-9]+]], {{.*[^,]+}}), !dbg ![[LOC:[0-9]+]]
+; CHECK-LABEL: mismatched subprogram between #dbg record variable and DILocation
+; CHECK-NEXT: #dbg_declare({{[^,]+}}, ![[VAR:[0-9]+]], {{.*[^,]+}}, ![[LOC:[0-9]+]])
 ; CHECK-NEXT: label %entry
 ; CHECK-NEXT: ptr @foo
 ; CHECK-NEXT: ![[VAR]] = !DILocalVariable({{.*}}scope: ![[VARSP:[0-9]+]]
diff --git a/llvm/test/Verifier/llvm.dbg.value-expression.ll b/llvm/test/Verifier/llvm.dbg.value-expression.ll
index cc45af2e8e7cb..92fd2add700ed 100644
--- a/llvm/test/Verifier/llvm.dbg.value-expression.ll
+++ b/llvm/test/Verifier/llvm.dbg.value-expression.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s
-; CHECK: invalid llvm.dbg.value intrinsic expression
-; CHECK-NEXT: call void @llvm.dbg.value({{.*}})
-; CHECK-NEXT: !""
+; CHECK: invalid #dbg record expression
+; CHECK-NEXT: #dbg_value({{.*}})
 ; CHECK: warning: ignoring invalid debug info
 
 define void @foo(i32 %a) {
diff --git a/llvm/test/Verifier/llvm.dbg.value-value.ll b/llvm/test/Verifier/llvm.dbg.value-value.ll
index 8b0ec1fed05c3..c390e530653cd 100644
--- a/llvm/test/Verifier/llvm.dbg.value-value.ll
+++ b/llvm/test/Verifier/llvm.dbg.value-value.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s
-; CHECK: invalid llvm.dbg.value intrinsic address/value
-; CHECK-NEXT: call void @llvm.dbg.value({{.*}})
+; CHECK: invalid #dbg record address/value
+; CHECK-NEXT: #dbg_value({{.*}})
 ; CHECK-NEXT: !""
 ; CHECK: warning: ignoring invalid debug info
 
diff --git a/llvm/test/Verifier/llvm.dbg.value-variable.ll b/llvm/test/Verifier/llvm.dbg.value-variable.ll
index 4388e20797ce7..603a4b5c47e7d 100644
--- a/llvm/test/Verifier/llvm.dbg.value-variable.ll
+++ b/llvm/test/Verifier/llvm.dbg.value-variable.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s
-; CHECK: invalid llvm.dbg.value intrinsic variable
-; CHECK-NEXT: call void @llvm.dbg.value({{.*}})
-; CHECK-NEXT: !""
+; CHECK: invalid #dbg record variable
+; CHECK-NEXT: #dbg_value({{.*}})
 ; CHECK: warning: ignoring invalid debug info
 
 define void @foo(i32 %a) {
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index a888fd6c6cdc3..d7aa584bb8cb4 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -991,7 +991,6 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
   Instruction *RetInst = &*std::next(FirstInst->getIterator());
 
   // Set-up DbgMarkers in this block.
-  ExitBlock->IsNewDbgInfoFormat = true;
   ExitBlock->createMarker(FirstInst);
   ExitBlock->createMarker(RetInst);
 
@@ -1127,7 +1126,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   BasicBlock *BB1 = &F->getEntryBlock();
   // First instruction should be a dbg.value.
   EXPECT_TRUE(isa<DbgValueInst>(BB1->front()));
-  EXPECT_FALSE(BB1->IsNewDbgInfoFormat);
   // Validating the block for DbgVariableRecords / DbgMarkers shouldn't fail --
   // there's no data stored right now.
   bool BrokenDebugInfo = false;
@@ -1135,15 +1133,8 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   EXPECT_FALSE(Error);
   EXPECT_FALSE(BrokenDebugInfo);
 
-  // Function and module should be marked as not having the new format too.
-  EXPECT_FALSE(F->IsNewDbgInfoFormat);
-  EXPECT_FALSE(M->IsNewDbgInfoFormat);
-
   // Now convert.
   M->convertToNewDbgValues();
-  EXPECT_TRUE(M->IsNewDbgInfoFormat);
-  EXPECT_TRUE(F->IsNewDbgInfoFormat);
-  EXPECT_TRUE(BB1->IsNewDbgInfoFormat);
 
   // There should now be no dbg.value instructions!
   // Ensure the first instruction exists, the test all of them.
@@ -1180,7 +1171,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   // There should be no DbgVariableRecords / DbgMarkers in the second block, but
   // it should be marked as being in the new format.
   BasicBlock *BB2 = BB1->getNextNode();
-  EXPECT_TRUE(BB2->IsNewDbgInfoFormat);
   for (auto &Inst : *BB2)
     // Either there should be no marker, or it should be empty.
     EXPECT_TRUE(!Inst.DebugMarker ||
@@ -1207,9 +1197,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
 
   // Convert everything back to the "old" format and ensure it's right.
   M->convertFromNewDbgValues();
-  EXPECT_FALSE(M->IsNewDbgInfoFormat);
-  EXPECT_FALSE(F->IsNewDbgInfoFormat);
-  EXPECT_FALSE(BB1->IsNewDbgInfoFormat);
 
   EXPECT_EQ(BB1->size(), 4u);
   ASSERT_TRUE(isa<DbgValueInst>(BB1->front()));

From e15d50d5ff295368edaf7bff67f405617310722c Mon Sep 17 00:00:00 2001
From: Darren Wihandi <65404740+fairywreath@users.noreply.github.com>
Date: Wed, 11 Jun 2025 09:20:40 -0400
Subject: [PATCH 074/851] [mlir][spirv] Add lowering of multiple math trig/hypb
 functions (#143604)

Add Math to SPIRV lowering for tan, asin, acos, sinh, cosh, asinh, acosh
and atanh. This completes the lowering of all trigonometric and
hyperbolic functions from math to SPIRV.
---
 .../Conversion/MathToSPIRV/MathToSPIRV.cpp    | 20 ++++++++++--
 .../MathToSPIRV/math-to-gl-spirv.mlir         | 32 +++++++++++++++++++
 .../MathToSPIRV/math-to-opencl-spirv.mlir     | 32 +++++++++++++++++++
 3 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
index 1b83794b5f450..501bfa223fb18 100644
--- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
+++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp
@@ -509,7 +509,15 @@ void populateMathToSPIRVPatterns(const SPIRVTypeConverter &typeConverter,
            CheckedElementwiseOpPattern<math::RsqrtOp, spirv::GLInverseSqrtOp>,
            CheckedElementwiseOpPattern<math::SinOp, spirv::GLSinOp>,
            CheckedElementwiseOpPattern<math::SqrtOp, spirv::GLSqrtOp>,
-           CheckedElementwiseOpPattern<math::TanhOp, spirv::GLTanhOp>>(
+           CheckedElementwiseOpPattern<math::TanhOp, spirv::GLTanhOp>,
+           CheckedElementwiseOpPattern<math::TanOp, spirv::GLTanOp>,
+           CheckedElementwiseOpPattern<math::AsinOp, spirv::GLAsinOp>,
+           CheckedElementwiseOpPattern<math::AcosOp, spirv::GLAcosOp>,
+           CheckedElementwiseOpPattern<math::SinhOp, spirv::GLSinhOp>,
+           CheckedElementwiseOpPattern<math::CoshOp, spirv::GLCoshOp>,
+           CheckedElementwiseOpPattern<math::AsinhOp, spirv::GLAsinhOp>,
+           CheckedElementwiseOpPattern<math::AcoshOp, spirv::GLAcoshOp>,
+           CheckedElementwiseOpPattern<math::AtanhOp, spirv::GLAtanhOp>>(
           typeConverter, patterns.getContext());
 
   // OpenCL patterns
@@ -533,7 +541,15 @@ void populateMathToSPIRVPatterns(const SPIRVTypeConverter &typeConverter,
                CheckedElementwiseOpPattern<math::RsqrtOp, spirv::CLRsqrtOp>,
                CheckedElementwiseOpPattern<math::SinOp, spirv::CLSinOp>,
                CheckedElementwiseOpPattern<math::SqrtOp, spirv::CLSqrtOp>,
-               CheckedElementwiseOpPattern<math::TanhOp, spirv::CLTanhOp>>(
+               CheckedElementwiseOpPattern<math::TanhOp, spirv::CLTanhOp>,
+               CheckedElementwiseOpPattern<math::TanOp, spirv::CLTanOp>,
+               CheckedElementwiseOpPattern<math::AsinOp, spirv::CLAsinOp>,
+               CheckedElementwiseOpPattern<math::AcosOp, spirv::CLAcosOp>,
+               CheckedElementwiseOpPattern<math::SinhOp, spirv::CLSinhOp>,
+               CheckedElementwiseOpPattern<math::CoshOp, spirv::CLCoshOp>,
+               CheckedElementwiseOpPattern<math::AsinhOp, spirv::CLAsinhOp>,
+               CheckedElementwiseOpPattern<math::AcoshOp, spirv::CLAcoshOp>,
+               CheckedElementwiseOpPattern<math::AtanhOp, spirv::CLAtanhOp>>(
       typeConverter, patterns.getContext());
 }
 
diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
index 5c6561c104389..b8e001c9f6950 100644
--- a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
+++ b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir
@@ -46,6 +46,22 @@ func.func @float32_unary_scalar(%arg0: f32) {
   %14 = math.ceil %arg0 : f32
   // CHECK: spirv.GL.Floor %{{.*}}: f32
   %15 = math.floor %arg0 : f32
+  // CHECK: spirv.GL.Tan %{{.*}}: f32
+  %16 = math.tan %arg0 : f32
+  // CHECK: spirv.GL.Asin %{{.*}}: f32
+  %17 = math.asin %arg0 : f32
+  // CHECK: spirv.GL.Acos %{{.*}}: f32
+  %18 = math.acos %arg0 : f32
+  // CHECK: spirv.GL.Sinh %{{.*}}: f32
+  %19 = math.sinh %arg0 : f32
+  // CHECK: spirv.GL.Cosh %{{.*}}: f32
+  %20 = math.cosh %arg0 : f32
+  // CHECK: spirv.GL.Asinh %{{.*}}: f32
+  %21 = math.asinh %arg0 : f32
+  // CHECK: spirv.GL.Acosh %{{.*}}: f32
+  %22 = math.acosh %arg0 : f32
+  // CHECK: spirv.GL.Atanh %{{.*}}: f32
+  %23 = math.atanh %arg0 : f32
   return
 }
 
@@ -85,6 +101,22 @@ func.func @float32_unary_vector(%arg0: vector<3xf32>) {
   %11 = math.tanh %arg0 : vector<3xf32>
   // CHECK: spirv.GL.Sin %{{.*}}: vector<3xf32>
   %12 = math.sin %arg0 : vector<3xf32>
+  // CHECK: spirv.GL.Tan %{{.*}}: vector<3xf32>
+  %13 = math.tan %arg0 : vector<3xf32>
+  // CHECK: spirv.GL.Asin %{{.*}}: vector<3xf32>
+  %14 = math.asin %arg0 : vector<3xf32>
+  // CHECK: spirv.GL.Acos %{{.*}}: vector<3xf32>
+  %15 = math.acos %arg0 : vector<3xf32>
+  // CHECK: spirv.GL.Sinh %{{.*}}: vector<3xf32>
+  %16 = math.sinh %arg0 : vector<3xf32>
+  // CHECK: spirv.GL.Cosh %{{.*}}: vector<3xf32>
+  %17 = math.cosh %arg0 : vector<3xf32>
+  // CHECK: spirv.GL.Asinh %{{.*}}: vector<3xf32>
+  %18 = math.asinh %arg0 : vector<3xf32>
+  // CHECK: spirv.GL.Acosh %{{.*}}: vector<3xf32>
+  %19 = math.acosh %arg0 : vector<3xf32>
+  // CHECK: spirv.GL.Atanh %{{.*}}: vector<3xf32>
+  %20 = math.atanh %arg0 : vector<3xf32>
   return
 }
 
diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir
index 393a910c1fb1d..56a0d4dafec8c 100644
--- a/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir
+++ b/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir
@@ -48,6 +48,22 @@ func.func @float32_unary_scalar(%arg0: f32) {
   %16 = math.erf %arg0 : f32
   // CHECK: spirv.CL.round %{{.*}}: f32
   %17 = math.round %arg0 : f32
+  // CHECK: spirv.CL.tan %{{.*}}: f32
+  %18 = math.tan %arg0 : f32
+  // CHECK: spirv.CL.asin %{{.*}}: f32
+  %19 = math.asin %arg0 : f32
+  // CHECK: spirv.CL.acos %{{.*}}: f32
+  %20 = math.acos %arg0 : f32
+  // CHECK: spirv.CL.sinh %{{.*}}: f32
+  %21 = math.sinh %arg0 : f32
+  // CHECK: spirv.CL.cosh %{{.*}}: f32
+  %22 = math.cosh %arg0 : f32
+  // CHECK: spirv.CL.asinh %{{.*}}: f32
+  %23 = math.asinh %arg0 : f32
+  // CHECK: spirv.CL.acosh %{{.*}}: f32
+  %24 = math.acosh %arg0 : f32
+  // CHECK: spirv.CL.atanh %{{.*}}: f32
+  %25 = math.atanh %arg0 : f32
   return
 }
 
@@ -87,6 +103,22 @@ func.func @float32_unary_vector(%arg0: vector<3xf32>) {
   %11 = math.tanh %arg0 : vector<3xf32>
   // CHECK: spirv.CL.sin %{{.*}}: vector<3xf32>
   %12 = math.sin %arg0 : vector<3xf32>
+  // CHECK: spirv.CL.tan %{{.*}}: vector<3xf32>
+  %13 = math.tan %arg0 : vector<3xf32>
+  // CHECK: spirv.CL.asin %{{.*}}: vector<3xf32>
+  %14 = math.asin %arg0 : vector<3xf32>
+  // CHECK: spirv.CL.acos %{{.*}}: vector<3xf32>
+  %15 = math.acos %arg0 : vector<3xf32>
+  // CHECK: spirv.CL.sinh %{{.*}}: vector<3xf32>
+  %16 = math.sinh %arg0 : vector<3xf32>
+  // CHECK: spirv.CL.cosh %{{.*}}: vector<3xf32>
+  %17 = math.cosh %arg0 : vector<3xf32>
+  // CHECK: spirv.CL.asinh %{{.*}}: vector<3xf32>
+  %18 = math.asinh %arg0 : vector<3xf32>
+  // CHECK: spirv.CL.acosh %{{.*}}: vector<3xf32>
+  %19 = math.acosh %arg0 : vector<3xf32>
+  // CHECK: spirv.CL.atanh %{{.*}}: vector<3xf32>
+  %20 = math.atanh %arg0 : vector<3xf32>
   return
 }
 

From cc9f67416d048bf464425b5a9243219efcb08c34 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski@arm.com>
Date: Wed, 11 Jun 2025 14:30:02 +0100
Subject: [PATCH 075/851] [flang][OpenMP] Consider previous DSA for static
 duration variables (#143601)

Symbols that have a pre-existing DSA set in the enclosing context should
not be made shared based on them being static duration variables.

Suggested-by: Leandro Lupori <leandro.lupori@linaro.org>

---------

Signed-off-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
---
 flang/lib/Semantics/resolve-directives.cpp   |  4 +++-
 flang/test/Semantics/OpenMP/implicit-dsa.f90 | 22 ++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 65823adcef19d..93bf510fbc3c7 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -2382,7 +2382,9 @@ void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) {
       dsa = prevDSA;
     } else if (taskGenDir) {
       // TODO 5) dummy arg in orphaned taskgen construct -> firstprivate
-      if (prevDSA.test(Symbol::Flag::OmpShared) || isStaticStorageDuration) {
+      if (prevDSA.test(Symbol::Flag::OmpShared) ||
+          (isStaticStorageDuration &&
+              (prevDSA & dataSharingAttributeFlags).none())) {
         // 6) shared in enclosing context -> shared
         dsa = {Symbol::Flag::OmpShared};
         makeSymbol(dsa);
diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90
index 3e9348575597b..4a07e256e2bb6 100644
--- a/flang/test/Semantics/OpenMP/implicit-dsa.f90
+++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90
@@ -244,3 +244,25 @@ subroutine implicit_dsa_test_12
 !REF: /implicit_dsa_test_12/tm3a
 print *,tm3a
 end subroutine
+
+! Test static duration variables with DSA set in the enclosing scope do not default to shared DSA
+!DEF: /implicit_dsa_test_13_mod Module
+module implicit_dsa_test_13_mod
+  !DEF: /implicit_dsa_test_13_mod/a PUBLIC ObjectEntity INTEGER(4)
+  integer::a=5
+contains
+  !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13 PUBLIC (Subroutine) Subprogram
+  subroutine implicit_dsa_test_13
+    !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13/i ObjectEntity INTEGER(4)
+    integer i
+    !$omp do private(a)
+      !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4)
+      do i=0,10
+        !$omp task
+        !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13/OtherConstruct1/OtherConstruct1/a (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+        !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13/OtherConstruct1/OtherConstruct1/i (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+        a=a+i
+        !$omp end task
+      end do
+  end subroutine implicit_dsa_test_13
+end module implicit_dsa_test_13_mod

From b512077c373a4416c506002383c69867cfee0741 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Wed, 11 Jun 2025 06:34:46 -0700
Subject: [PATCH 076/851] [flang][runtime] Another try to fix build failure
 (#143702)

Tweak accessibility to try to get code past whatever gcc is being used
by the flang-runtime-cuda-gcc build bot.
---
 .../include/flang-rt/runtime/work-queue.h     | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
index f7f4777839836..f8cc820c06ca1 100644
--- a/flang-rt/include/flang-rt/runtime/work-queue.h
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -94,7 +94,7 @@ template <typename TICKET> class ImmediateTicketRunner {
 
 // Base class for ticket workers that operate elementwise over descriptors
 class Elementwise {
-protected:
+public:
   RT_API_ATTRS Elementwise(
       const Descriptor &instance, const Descriptor *from = nullptr)
       : instance_{instance}, from_{from} {
@@ -120,6 +120,7 @@ class Elementwise {
     }
   }
 
+protected:
   const Descriptor &instance_, *from_{nullptr};
   std::size_t elements_{instance_.Elements()};
   std::size_t elementAt_{0};
@@ -129,7 +130,7 @@ class Elementwise {
 
 // Base class for ticket workers that operate over derived type components.
 class Componentwise {
-protected:
+public:
   RT_API_ATTRS Componentwise(const typeInfo::DerivedType &);
   RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; }
   RT_API_ATTRS void Advance() {
@@ -147,6 +148,7 @@ class Componentwise {
   }
   RT_API_ATTRS void GetComponent();
 
+protected:
   const typeInfo::DerivedType &derived_;
   std::size_t components_{0}, componentAt_{0};
   const typeInfo::Component *component_{nullptr};
@@ -155,8 +157,8 @@ class Componentwise {
 
 // Base class for ticket workers that operate over derived type components
 // in an outer loop, and elements in an inner loop.
-class ComponentsOverElements : protected Componentwise, protected Elementwise {
-protected:
+class ComponentsOverElements : public Componentwise, public Elementwise {
+public:
   RT_API_ATTRS ComponentsOverElements(const Descriptor &instance,
       const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
       : Componentwise{derived}, Elementwise{instance, from} {
@@ -187,13 +189,14 @@ class ComponentsOverElements : protected Componentwise, protected Elementwise {
     Componentwise::Reset();
   }
 
+protected:
   int phase_{0};
 };
 
 // Base class for ticket workers that operate over elements in an outer loop,
 // type components in an inner loop.
-class ElementsOverComponents : protected Elementwise, protected Componentwise {
-protected:
+class ElementsOverComponents : public Elementwise, public Componentwise {
+public:
   RT_API_ATTRS ElementsOverComponents(const Descriptor &instance,
       const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
       : Elementwise{instance, from}, Componentwise{derived} {
@@ -219,6 +222,7 @@ class ElementsOverComponents : protected Elementwise, protected Componentwise {
     Elementwise::Advance();
   }
 
+protected:
   int phase_{0};
 };
 
@@ -319,7 +323,7 @@ class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
 template <bool IS_COMPONENTWISE>
 class DerivedAssignTicket
     : public ImmediateTicketRunner<DerivedAssignTicket<IS_COMPONENTWISE>>,
-      protected std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+      private std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
           ElementsOverComponents> {
 public:
   using Base = std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
@@ -348,7 +352,7 @@ namespace io::descr {
 template <io::Direction DIR>
 class DescriptorIoTicket
     : public ImmediateTicketRunner<DescriptorIoTicket<DIR>>,
-      protected Elementwise {
+      private Elementwise {
 public:
   RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io,
       const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
@@ -372,7 +376,7 @@ class DescriptorIoTicket
 
 template <io::Direction DIR>
 class DerivedIoTicket : public ImmediateTicketRunner<DerivedIoTicket<DIR>>,
-                        protected ElementsOverComponents {
+                        private ElementsOverComponents {
 public:
   RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io,
       const Descriptor &descriptor, const typeInfo::DerivedType &derived,

From b09206db154bab8fa09b6708e642a6bba3d125be Mon Sep 17 00:00:00 2001
From: Igor Wodiany <igor.wodiany@imgtec.com>
Date: Wed, 11 Jun 2025 14:37:28 +0100
Subject: [PATCH 077/851] [mlir][spirv] Include `SPIRV_AnyImage` in
 `SPIRV_Type` (#143676)

This change is trigger by encountering the following error:

```
<unknown>:0: error: 'spirv.Load' op result #0 must be void
or bool or 8/16/32/64-bit integer or 16/32/64-bit float or
vector of bool or 8/16/32/64-bit integer or 16/32/64-bit
float values of length 2/3/4/8/16 or any SPIR-V pointer type
or any SPIR-V array type or any SPIR-V run time array type
or any SPIR-V struct type or any SPIR-V cooperative matrix
type or any SPIR-V matrix type or any SPIR-V sampled image
type, but got '!spirv.image<f32, Dim2D, NoDepth, NonArrayed,
SingleSampled, NoSampler, Rgba8>'<unknown>:0: note: see current
operation:
%126 = "spirv.Load"(%125) {relaxed_precision} : (!spirv.ptr<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, UniformConstant>) -> !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>
```
---
 mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td |  3 ++-
 mlir/test/Dialect/SPIRV/IR/memory-ops.mlir      | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
index 8fd533db83d9a..b143cf9a5f509 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
@@ -4196,7 +4196,8 @@ def SPIRV_Composite :
 def SPIRV_Type : AnyTypeOf<[
     SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_Float, SPIRV_Vector,
     SPIRV_AnyPtr, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct,
-    SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix, SPIRV_AnySampledImage
+    SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix, SPIRV_AnySampledImage,
+    SPIRV_AnyImage
   ]>;
 
 def SPIRV_SignedInt : SignedIntOfWidths<[8, 16, 32, 64]>;
diff --git a/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir b/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir
index 57ff94762ff68..a3b96c698a344 100644
--- a/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir
@@ -356,6 +356,16 @@ spirv.module Logical GLSL450 {
 
 // -----
 
+// CHECK-LABEL: @image_load
+func.func @image_load() -> () {
+  %0 = spirv.Variable : !spirv.ptr<!spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, Function>
+  // CHECK: spirv.Load "Function" %{{.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>
+  %1 = spirv.Load "Function" %0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>
+  return
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.StoreOp
 //===----------------------------------------------------------------------===//

From 6b0cb762af97579ca8ff5eea9be896169a1752b7 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot@gmail.com>
Date: Wed, 11 Jun 2025 15:39:41 +0200
Subject: [PATCH 078/851] [Clang] _default-movable_ should be based on the
 first declaration (#143661)

When the definition of a special member function was defaulted we would
not consider it user-provided, even when the first declaration was not
defaulted.

Fixes #143599
---
 clang/lib/Sema/SemaTypeTraits.cpp             | 16 ++++++++-----
 .../SemaCXX/cxx2c-trivially-relocatable.cpp   | 21 +++++++++++++++++
 .../SemaCXX/type-traits-unsatisfied-diags.cpp | 23 +++++++++++++++++++
 3 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index d663e5581093e..1738ab4466001 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -105,7 +105,7 @@ static CXXMethodDecl *LookupSpecialMemberFromXValue(Sema &SemaRef,
   switch (OCS.BestViableFunction(SemaRef, LookupLoc, Best)) {
   case OR_Success:
   case OR_Deleted:
-    return cast<CXXMethodDecl>(Best->Function);
+    return cast<CXXMethodDecl>(Best->Function)->getCanonicalDecl();
   default:
     return nullptr;
   }
@@ -164,6 +164,8 @@ static bool IsDefaultMovable(Sema &SemaRef, const CXXRecordDecl *D) {
   if (!Dtr)
     return true;
 
+  Dtr = Dtr->getCanonicalDecl();
+
   if (Dtr->isUserProvided() && (!Dtr->isDefaulted() || Dtr->isDeleted()))
     return false;
 
@@ -2044,11 +2046,13 @@ static void DiagnoseNonDefaultMovable(Sema &SemaRef, SourceLocation Loc,
           << diag::TraitNotSatisfiedReason::UserProvidedAssign
           << Decl->isMoveAssignmentOperator() << Decl->getSourceRange();
   }
-  CXXDestructorDecl *Dtr = D->getDestructor();
-  if (Dtr && Dtr->isUserProvided() && !Dtr->isDefaulted())
-    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
-        << diag::TraitNotSatisfiedReason::DeletedDtr << /*User Provided*/ 1
-        << Dtr->getSourceRange();
+  if (CXXDestructorDecl *Dtr = D->getDestructor()) {
+    Dtr = Dtr->getCanonicalDecl();
+    if (Dtr->isUserProvided() && !Dtr->isDefaulted())
+      SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+          << diag::TraitNotSatisfiedReason::DeletedDtr << /*User Provided*/ 1
+          << Dtr->getSourceRange();
+  }
 }
 
 static void DiagnoseNonTriviallyRelocatableReason(Sema &SemaRef,
diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
index aff172e0bc70a..9d43994ee7661 100644
--- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
+++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
@@ -388,3 +388,24 @@ void do_test__builtin_trivially_relocate() {
     // expected-note@-1 {{'test__builtin_trivially_relocate<S *, S *, int>' requested here}}
     // expected-error@#reloc1 {{first argument to '__builtin_trivially_relocate' must be relocatable}}
 }
+
+
+namespace GH143599 {
+struct A { ~A (); };
+A::~A () = default;
+
+static_assert (!__builtin_is_cpp_trivially_relocatable(A));
+static_assert (!__builtin_is_replaceable(A));
+
+struct B { B(const B&); };
+B::B (const B&) = default;
+
+static_assert (!__builtin_is_cpp_trivially_relocatable(B));
+static_assert (!__builtin_is_replaceable(B));
+
+struct C { C& operator=(const C&); };
+C& C::operator=(const C&) = default;
+
+static_assert (!__builtin_is_cpp_trivially_relocatable(C));
+static_assert (!__builtin_is_replaceable(C));
+}
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
index 9e053034acda4..a8c78f6304ca9 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
@@ -320,6 +320,29 @@ static_assert(__builtin_is_cpp_trivially_relocatable(UnionOfPolymorphic));
 
 }
 
+struct GH143599 {  // expected-note 2 {{'GH143599' defined here}}
+    ~GH143599 ();
+     GH143599(const GH143599&);
+     GH143599& operator=(const GH143599&);
+};
+GH143599::~GH143599 () = default;
+GH143599::GH143599 (const GH143599&) = default;
+GH143599& GH143599::operator=(const GH143599&) = default;
+
+static_assert (__builtin_is_cpp_trivially_relocatable(GH143599));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_cpp_trivially_relocatable(GH143599)'}} \
+// expected-note@-1 {{'GH143599' is not trivially relocatable}} \
+// expected-note@-1 {{because it has a user provided copy constructor}} \
+// expected-note@-1 {{because it has a user provided copy assignment operator}} \
+// expected-note@-1 {{because it has a user-provided destructor}}
+
+static_assert (__builtin_is_replaceable(GH143599));
+// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(GH143599)'}} \
+// expected-note@-1 {{'GH143599' is not replaceable}} \
+// expected-note@-1 {{because it has a user provided copy constructor}} \
+// expected-note@-1 {{because it has a user provided copy assignment operator}} \
+// expected-note@-1 {{because it has a user-provided destructor}}
+
 namespace trivially_copyable {
 struct B {
  virtual ~B();

From c71a2e688828ab3ede4fb54168a674ff68396f61 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Wed, 11 Jun 2025 14:43:15 +0100
Subject: [PATCH 079/851] [DebugInfo][RemoveDIs] Remove some debug
 intrinsic-only codepaths (#143451)

These are opportunistic deletions as more places that make use of the
IsNewDbgInfoFormat flag are removed. It should (TM)(R) all be dead code
now that `IsNewDbgInfoFormat` should be true everywhere.

FastISel: we don't need to do debug-aware instruction counting any more,
because there are no debug instructions,
Autoupgrade: you can no-longer avoid autoupgrading of intrinsics to
records
DIBuilder: Delete the code for creating debug intrinsics (!)
LoopUtils: No need to handle debug instructions, they don't exist
---
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp |  3 -
 llvm/lib/IR/AutoUpgrade.cpp                | 25 ++----
 llvm/lib/IR/DIBuilder.cpp                  | 97 +++++-----------------
 llvm/lib/IR/DebugInfo.cpp                  | 19 +----
 llvm/lib/Transforms/Utils/LoopUtils.cpp    | 36 +++-----
 llvm/unittests/IR/IRBuilderTest.cpp        | 10 ---
 6 files changed, 40 insertions(+), 150 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 59cd0dc8dd348..e8a3df3366b2b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1671,9 +1671,6 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc,
                               const DebugLoc &DbgLoc) {
   const BasicBlock *BB = FuncInfo.MBB->getBasicBlock();
   bool BlockHasMultipleInstrs = &BB->front() != &BB->back();
-  // Handle legacy case of debug intrinsics
-  if (BlockHasMultipleInstrs && !BB->getModule()->IsNewDbgInfoFormat)
-    BlockHasMultipleInstrs = BB->sizeWithoutDebug() > 1;
   if (BlockHasMultipleInstrs && FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
     // For more accurate line information if this is the only non-debug
     // instruction in the block then emit it, otherwise we have the
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index cb90af36f3d9f..a0886776ff93f 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -4490,7 +4490,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
   Builder.SetInsertPoint(CI->getParent(), CI->getIterator());
 
   if (!NewFn) {
-    bool FallthroughToDefaultUpgrade = false;
     // Get the Function's name.
     StringRef Name = F->getName();
 
@@ -4518,29 +4517,15 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     } else if (IsAMDGCN) {
       Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
     } else if (IsDbg) {
-      // We might have decided we don't want the new format after all between
-      // first requesting the upgrade and now; skip the conversion if that is
-      // the case, and check here to see if the intrinsic needs to be upgraded
-      // normally.
-      if (!CI->getModule()->IsNewDbgInfoFormat) {
-        bool NeedsUpgrade =
-            upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false);
-        if (!NeedsUpgrade)
-          return;
-        FallthroughToDefaultUpgrade = true;
-      } else {
-        upgradeDbgIntrinsicToDbgRecord(Name, CI);
-      }
+      upgradeDbgIntrinsicToDbgRecord(Name, CI);
     } else {
       llvm_unreachable("Unknown function for CallBase upgrade.");
     }
 
-    if (!FallthroughToDefaultUpgrade) {
-      if (Rep)
-        CI->replaceAllUsesWith(Rep);
-      CI->eraseFromParent();
-      return;
-    }
+    if (Rep)
+      CI->replaceAllUsesWith(Rep);
+    CI->eraseFromParent();
+    return;
   }
 
   const auto &DefaultCase = [&]() -> void {
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 5e5ff22132e99..1484c549dd580 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -1047,36 +1047,13 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
       LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID));
   assert(Link && "Linked instruction must have DIAssign metadata attached");
 
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
-        Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
-    // Insert after LinkedInstr.
-    BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
-    NextIt.setHeadBit(true);
-    insertDbgVariableRecord(DVR, NextIt);
-    return DVR;
-  }
-
-  LLVMContext &Ctx = LinkedInstr->getContext();
-  Module *M = LinkedInstr->getModule();
-  if (!AssignFn)
-    AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign);
-
-  std::array<Value *, 6> Args = {
-      MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)),
-      MetadataAsValue::get(Ctx, SrcVar),
-      MetadataAsValue::get(Ctx, ValExpr),
-      MetadataAsValue::get(Ctx, Link),
-      MetadataAsValue::get(Ctx, ValueAsMetadata::get(Addr)),
-      MetadataAsValue::get(Ctx, AddrExpr),
-  };
-
-  IRBuilder<> B(Ctx);
-  B.SetCurrentDebugLocation(DL);
-
-  auto *DVI = cast<DbgAssignIntrinsic>(B.CreateCall(AssignFn, Args));
-  DVI->insertAfter(LinkedInstr->getIterator());
-  return DVI;
+  DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
+      Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
+  // Insert after LinkedInstr.
+  BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
+  NextIt.setHeadBit(true);
+  insertDbgVariableRecord(DVR, NextIt);
+  return DVR;
 }
 
 /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics.
@@ -1101,18 +1078,10 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val,
                                               DIExpression *Expr,
                                               const DILocation *DL,
                                               InsertPosition InsertPt) {
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR =
-        DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
-    insertDbgVariableRecord(DVR, InsertPt);
-    return DVR;
-  }
-
-  if (!ValueFn)
-    ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value);
-  auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt);
-  cast<CallInst>(DVI)->setTailCall();
-  return DVI;
+  DbgVariableRecord *DVR =
+      DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
+  insertDbgVariableRecord(DVR, InsertPt);
+  return DVR;
 }
 
 DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
@@ -1124,25 +1093,10 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
              VarInfo->getScope()->getSubprogram() &&
          "Expected matching subprograms");
 
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR =
-        DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
-    insertDbgVariableRecord(DVR, InsertPt);
-    return DVR;
-  }
-
-  if (!DeclareFn)
-    DeclareFn = getDeclareIntrin(M);
-
-  trackIfUnresolved(VarInfo);
-  trackIfUnresolved(Expr);
-  Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage),
-                   MetadataAsValue::get(VMContext, VarInfo),
-                   MetadataAsValue::get(VMContext, Expr)};
-
-  IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertPt);
-  return B.CreateCall(DeclareFn, Args);
+  DbgVariableRecord *DVR =
+      DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
+  insertDbgVariableRecord(DVR, InsertPt);
+  return DVR;
 }
 
 void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR,
@@ -1191,23 +1145,12 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
          "Expected matching subprograms");
 
   trackIfUnresolved(LabelInfo);
-  if (M.IsNewDbgInfoFormat) {
-    DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
-    if (InsertPt.isValid()) {
-      auto *BB = InsertPt.getBasicBlock();
-      BB->insertDbgRecordBefore(DLR, InsertPt);
-    }
-    return DLR;
+  DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
+  if (InsertPt.isValid()) {
+    auto *BB = InsertPt.getBasicBlock();
+    BB->insertDbgRecordBefore(DLR, InsertPt);
   }
-
-  if (!LabelFn)
-    LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label);
-
-  Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
-
-  IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertPt);
-  return B.CreateCall(LabelFn, Args);
+  return DLR;
 }
 
 void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 7db9891fdbd75..2a84e7bae0f10 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -2123,22 +2123,11 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
     Expr = *R;
   }
   DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), {});
-  if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) {
-    auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
-        &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
-    (void)Assign;
-    LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
-    return;
-  }
-  auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest,
-                                    AddrExpr, VarRec.DL);
+  auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
+      &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
   (void)Assign;
-  LLVM_DEBUG(if (!Assign.isNull()) {
-    if (const auto *Record = dyn_cast<DbgRecord *>(Assign))
-      errs() << " > INSERT: " << *Record << "\n";
-    else
-      errs() << " > INSERT: " << *cast<Instruction *>(Assign) << "\n";
-  });
+  LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
+  return;
 }
 
 #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h).
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 0681ebc111cb2..ff69fa9f70c4e 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -606,7 +606,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
 
   // Use a map to unique and a vector to guarantee deterministic ordering.
   llvm::SmallDenseSet<DebugVariable, 4> DeadDebugSet;
-  llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
   llvm::SmallVector<DbgVariableRecord *, 4> DeadDbgVariableRecords;
 
   if (ExitBlock) {
@@ -633,29 +632,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
           U.set(Poison);
         }
 
-        // RemoveDIs: do the same as below for DbgVariableRecords.
-        if (Block->IsNewDbgInfoFormat) {
-          for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
-                   filterDbgVars(I.getDbgRecordRange()))) {
-            DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
-                              DVR.getDebugLoc().get());
-            if (!DeadDebugSet.insert(Key).second)
-              continue;
-            // Unlinks the DVR from it's container, for later insertion.
-            DVR.removeFromParent();
-            DeadDbgVariableRecords.push_back(&DVR);
-          }
-        }
-
-        // For one of each variable encountered, preserve a debug intrinsic (set
+        // For one of each variable encountered, preserve a debug record (set
         // to Poison) and transfer it to the loop exit. This terminates any
         // variable locations that were set during the loop.
-        auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I);
-        if (!DVI)
-          continue;
-        if (!DeadDebugSet.insert(DebugVariable(DVI)).second)
-          continue;
-        DeadDebugInst.push_back(DVI);
+        for (DbgVariableRecord &DVR :
+             llvm::make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) {
+          DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
+                            DVR.getDebugLoc().get());
+          if (!DeadDebugSet.insert(Key).second)
+            continue;
+          // Unlinks the DVR from it's container, for later insertion.
+          DVR.removeFromParent();
+          DeadDbgVariableRecords.push_back(&DVR);
+        }
       }
 
     // After the loop has been deleted all the values defined and modified
@@ -671,9 +660,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
            "There should be a non-PHI instruction in exit block, else these "
            "instructions will have no parent.");
 
-    for (auto *DVI : DeadDebugInst)
-      DVI->moveBefore(*ExitBlock, InsertDbgValueBefore);
-
     // Due to the "head" bit in BasicBlock::iterator, we're going to insert
     // each DbgVariableRecord right at the start of the block, wheras dbg.values
     // would be repeatedly inserted before the first instruction. To replicate
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 3a7ba924792ef..aadae5287c380 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -1003,18 +1003,8 @@ TEST_F(IRBuilderTest, DIBuilder) {
     EXPECT_TRUE(verifyModule(*M));
   };
 
-  // Test in new-debug mode.
-  EXPECT_TRUE(M->IsNewDbgInfoFormat);
   RunTest();
-
-  // Test in old-debug mode.
-  // Reset the test then call convertFromNewDbgValues to flip the flag
-  // on the test's Module, Function and BasicBlock.
   TearDown();
-  SetUp();
-  M->convertFromNewDbgValues();
-  EXPECT_FALSE(M->IsNewDbgInfoFormat);
-  RunTest();
 }
 
 TEST_F(IRBuilderTest, createArtificialSubprogram) {

From 46d9abbba2ad63c0280d4248cc2349de78439294 Mon Sep 17 00:00:00 2001
From: David Truby <david.truby@arm.com>
Date: Wed, 11 Jun 2025 14:50:39 +0100
Subject: [PATCH 080/851] [flang] Add David Truby as maintainer for Flang on
 Windows (#142619)

---
 flang/Maintainers.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/flang/Maintainers.md b/flang/Maintainers.md
index f4a7635389138..b994c300e4e2c 100644
--- a/flang/Maintainers.md
+++ b/flang/Maintainers.md
@@ -79,6 +79,13 @@ clementval@gmail.com (email), clementval (GitHub), clementval (Discourse)
 Abid Qadeer \
 haqadeer@amd.com (email), abidh (GitHub), abidh (Discourse)
 
+### Platform maintainers
+These maintainers are responsible for particular platforms that Flang supports
+
+#### Windows
+David Truby \
+david.truby@arm.com (email), davidtruby (GitHub), davidtruby (Discourse), truby (Discord)
+
 ## Inactive Maintainers
 ### Lead Maintainers
 #### Backend : (Lowering, FIR, Codegen)

From 76197ea6f91f802467f2614e1217e99eb4037200 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Wed, 11 Jun 2025 14:51:13 +0100
Subject: [PATCH 081/851] Revert "[DebugInfo][RemoveDIs] Remove some debug
 intrinsic-only codepaths (#143451)"

This reverts commit c71a2e688828ab3ede4fb54168a674ff68396f61.

/me squints -- this is hitting an assertion I thought had been deleted,
will revert and investigate for a bit.
---
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp |  3 +
 llvm/lib/IR/AutoUpgrade.cpp                | 25 ++++--
 llvm/lib/IR/DIBuilder.cpp                  | 97 +++++++++++++++++-----
 llvm/lib/IR/DebugInfo.cpp                  | 19 ++++-
 llvm/lib/Transforms/Utils/LoopUtils.cpp    | 36 +++++---
 llvm/unittests/IR/IRBuilderTest.cpp        | 10 +++
 6 files changed, 150 insertions(+), 40 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index e8a3df3366b2b..59cd0dc8dd348 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1671,6 +1671,9 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc,
                               const DebugLoc &DbgLoc) {
   const BasicBlock *BB = FuncInfo.MBB->getBasicBlock();
   bool BlockHasMultipleInstrs = &BB->front() != &BB->back();
+  // Handle legacy case of debug intrinsics
+  if (BlockHasMultipleInstrs && !BB->getModule()->IsNewDbgInfoFormat)
+    BlockHasMultipleInstrs = BB->sizeWithoutDebug() > 1;
   if (BlockHasMultipleInstrs && FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
     // For more accurate line information if this is the only non-debug
     // instruction in the block then emit it, otherwise we have the
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a0886776ff93f..cb90af36f3d9f 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -4490,6 +4490,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
   Builder.SetInsertPoint(CI->getParent(), CI->getIterator());
 
   if (!NewFn) {
+    bool FallthroughToDefaultUpgrade = false;
     // Get the Function's name.
     StringRef Name = F->getName();
 
@@ -4517,15 +4518,29 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     } else if (IsAMDGCN) {
       Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
     } else if (IsDbg) {
-      upgradeDbgIntrinsicToDbgRecord(Name, CI);
+      // We might have decided we don't want the new format after all between
+      // first requesting the upgrade and now; skip the conversion if that is
+      // the case, and check here to see if the intrinsic needs to be upgraded
+      // normally.
+      if (!CI->getModule()->IsNewDbgInfoFormat) {
+        bool NeedsUpgrade =
+            upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false);
+        if (!NeedsUpgrade)
+          return;
+        FallthroughToDefaultUpgrade = true;
+      } else {
+        upgradeDbgIntrinsicToDbgRecord(Name, CI);
+      }
     } else {
       llvm_unreachable("Unknown function for CallBase upgrade.");
     }
 
-    if (Rep)
-      CI->replaceAllUsesWith(Rep);
-    CI->eraseFromParent();
-    return;
+    if (!FallthroughToDefaultUpgrade) {
+      if (Rep)
+        CI->replaceAllUsesWith(Rep);
+      CI->eraseFromParent();
+      return;
+    }
   }
 
   const auto &DefaultCase = [&]() -> void {
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 1484c549dd580..5e5ff22132e99 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -1047,13 +1047,36 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
       LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID));
   assert(Link && "Linked instruction must have DIAssign metadata attached");
 
-  DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
-      Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
-  // Insert after LinkedInstr.
-  BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
-  NextIt.setHeadBit(true);
-  insertDbgVariableRecord(DVR, NextIt);
-  return DVR;
+  if (M.IsNewDbgInfoFormat) {
+    DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
+        Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
+    // Insert after LinkedInstr.
+    BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
+    NextIt.setHeadBit(true);
+    insertDbgVariableRecord(DVR, NextIt);
+    return DVR;
+  }
+
+  LLVMContext &Ctx = LinkedInstr->getContext();
+  Module *M = LinkedInstr->getModule();
+  if (!AssignFn)
+    AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign);
+
+  std::array<Value *, 6> Args = {
+      MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)),
+      MetadataAsValue::get(Ctx, SrcVar),
+      MetadataAsValue::get(Ctx, ValExpr),
+      MetadataAsValue::get(Ctx, Link),
+      MetadataAsValue::get(Ctx, ValueAsMetadata::get(Addr)),
+      MetadataAsValue::get(Ctx, AddrExpr),
+  };
+
+  IRBuilder<> B(Ctx);
+  B.SetCurrentDebugLocation(DL);
+
+  auto *DVI = cast<DbgAssignIntrinsic>(B.CreateCall(AssignFn, Args));
+  DVI->insertAfter(LinkedInstr->getIterator());
+  return DVI;
 }
 
 /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics.
@@ -1078,10 +1101,18 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val,
                                               DIExpression *Expr,
                                               const DILocation *DL,
                                               InsertPosition InsertPt) {
-  DbgVariableRecord *DVR =
-      DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
-  insertDbgVariableRecord(DVR, InsertPt);
-  return DVR;
+  if (M.IsNewDbgInfoFormat) {
+    DbgVariableRecord *DVR =
+        DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
+    insertDbgVariableRecord(DVR, InsertPt);
+    return DVR;
+  }
+
+  if (!ValueFn)
+    ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value);
+  auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt);
+  cast<CallInst>(DVI)->setTailCall();
+  return DVI;
 }
 
 DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
@@ -1093,10 +1124,25 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
              VarInfo->getScope()->getSubprogram() &&
          "Expected matching subprograms");
 
-  DbgVariableRecord *DVR =
-      DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
-  insertDbgVariableRecord(DVR, InsertPt);
-  return DVR;
+  if (M.IsNewDbgInfoFormat) {
+    DbgVariableRecord *DVR =
+        DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
+    insertDbgVariableRecord(DVR, InsertPt);
+    return DVR;
+  }
+
+  if (!DeclareFn)
+    DeclareFn = getDeclareIntrin(M);
+
+  trackIfUnresolved(VarInfo);
+  trackIfUnresolved(Expr);
+  Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage),
+                   MetadataAsValue::get(VMContext, VarInfo),
+                   MetadataAsValue::get(VMContext, Expr)};
+
+  IRBuilder<> B(DL->getContext());
+  initIRBuilder(B, DL, InsertPt);
+  return B.CreateCall(DeclareFn, Args);
 }
 
 void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR,
@@ -1145,12 +1191,23 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
          "Expected matching subprograms");
 
   trackIfUnresolved(LabelInfo);
-  DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
-  if (InsertPt.isValid()) {
-    auto *BB = InsertPt.getBasicBlock();
-    BB->insertDbgRecordBefore(DLR, InsertPt);
+  if (M.IsNewDbgInfoFormat) {
+    DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
+    if (InsertPt.isValid()) {
+      auto *BB = InsertPt.getBasicBlock();
+      BB->insertDbgRecordBefore(DLR, InsertPt);
+    }
+    return DLR;
   }
-  return DLR;
+
+  if (!LabelFn)
+    LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label);
+
+  Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
+
+  IRBuilder<> B(DL->getContext());
+  initIRBuilder(B, DL, InsertPt);
+  return B.CreateCall(LabelFn, Args);
 }
 
 void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 2a84e7bae0f10..7db9891fdbd75 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -2123,11 +2123,22 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
     Expr = *R;
   }
   DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), {});
-  auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
-      &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
+  if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) {
+    auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
+        &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
+    (void)Assign;
+    LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
+    return;
+  }
+  auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest,
+                                    AddrExpr, VarRec.DL);
   (void)Assign;
-  LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
-  return;
+  LLVM_DEBUG(if (!Assign.isNull()) {
+    if (const auto *Record = dyn_cast<DbgRecord *>(Assign))
+      errs() << " > INSERT: " << *Record << "\n";
+    else
+      errs() << " > INSERT: " << *cast<Instruction *>(Assign) << "\n";
+  });
 }
 
 #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h).
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index ff69fa9f70c4e..0681ebc111cb2 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -606,6 +606,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
 
   // Use a map to unique and a vector to guarantee deterministic ordering.
   llvm::SmallDenseSet<DebugVariable, 4> DeadDebugSet;
+  llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
   llvm::SmallVector<DbgVariableRecord *, 4> DeadDbgVariableRecords;
 
   if (ExitBlock) {
@@ -632,19 +633,29 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
           U.set(Poison);
         }
 
-        // For one of each variable encountered, preserve a debug record (set
+        // RemoveDIs: do the same as below for DbgVariableRecords.
+        if (Block->IsNewDbgInfoFormat) {
+          for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
+                   filterDbgVars(I.getDbgRecordRange()))) {
+            DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
+                              DVR.getDebugLoc().get());
+            if (!DeadDebugSet.insert(Key).second)
+              continue;
+            // Unlinks the DVR from it's container, for later insertion.
+            DVR.removeFromParent();
+            DeadDbgVariableRecords.push_back(&DVR);
+          }
+        }
+
+        // For one of each variable encountered, preserve a debug intrinsic (set
         // to Poison) and transfer it to the loop exit. This terminates any
         // variable locations that were set during the loop.
-        for (DbgVariableRecord &DVR :
-             llvm::make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) {
-          DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
-                            DVR.getDebugLoc().get());
-          if (!DeadDebugSet.insert(Key).second)
-            continue;
-          // Unlinks the DVR from it's container, for later insertion.
-          DVR.removeFromParent();
-          DeadDbgVariableRecords.push_back(&DVR);
-        }
+        auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I);
+        if (!DVI)
+          continue;
+        if (!DeadDebugSet.insert(DebugVariable(DVI)).second)
+          continue;
+        DeadDebugInst.push_back(DVI);
       }
 
     // After the loop has been deleted all the values defined and modified
@@ -660,6 +671,9 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
            "There should be a non-PHI instruction in exit block, else these "
            "instructions will have no parent.");
 
+    for (auto *DVI : DeadDebugInst)
+      DVI->moveBefore(*ExitBlock, InsertDbgValueBefore);
+
     // Due to the "head" bit in BasicBlock::iterator, we're going to insert
     // each DbgVariableRecord right at the start of the block, wheras dbg.values
     // would be repeatedly inserted before the first instruction. To replicate
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index aadae5287c380..3a7ba924792ef 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -1003,8 +1003,18 @@ TEST_F(IRBuilderTest, DIBuilder) {
     EXPECT_TRUE(verifyModule(*M));
   };
 
+  // Test in new-debug mode.
+  EXPECT_TRUE(M->IsNewDbgInfoFormat);
   RunTest();
+
+  // Test in old-debug mode.
+  // Reset the test then call convertFromNewDbgValues to flip the flag
+  // on the test's Module, Function and BasicBlock.
   TearDown();
+  SetUp();
+  M->convertFromNewDbgValues();
+  EXPECT_FALSE(M->IsNewDbgInfoFormat);
+  RunTest();
 }
 
 TEST_F(IRBuilderTest, createArtificialSubprogram) {

From 6fb2a80189016bd4222b174ae4d72e47d0aa58ff Mon Sep 17 00:00:00 2001
From: Davide Grohmann <6573166+davidegrohmann@users.noreply.github.com>
Date: Wed, 11 Jun 2025 15:56:38 +0200
Subject: [PATCH 082/851] [mlir][spirv] Truncate Literal String size at max
 number words (#142916)

If not truncated the SPIRV serialization would not fail but instead
produce an invalid SPIR-V module.

---------

Signed-off-by: Davide Grohmann <davide.grohmann@arm.com>
---
 .../mlir/Target/SPIRV/SPIRVBinaryUtils.h      |  7 +++++++
 mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp    | 20 ++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Target/SPIRV/SPIRVBinaryUtils.h b/mlir/include/mlir/Target/SPIRV/SPIRVBinaryUtils.h
index e46a576f1d48e..4a4116312981a 100644
--- a/mlir/include/mlir/Target/SPIRV/SPIRVBinaryUtils.h
+++ b/mlir/include/mlir/Target/SPIRV/SPIRVBinaryUtils.h
@@ -30,6 +30,13 @@ constexpr uint32_t kMagicNumber = 0x07230203;
 /// The serializer tool ID registered to the Khronos Group
 constexpr uint32_t kGeneratorNumber = 22;
 
+/// Max number of words
+/// https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_universal_limits
+constexpr uint32_t kMaxWordCount = 65535;
+
+/// Max number of words for literal
+constexpr uint32_t kMaxLiteralWordCount = kMaxWordCount - 3;
+
 /// Appends a SPRI-V module header to `header` with the given `version` and
 /// `idBound`.
 void appendModuleHeader(SmallVectorImpl<uint32_t> &header,
diff --git a/mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp b/mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp
index 31205d8f408f1..0ec468d4c1665 100644
--- a/mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp
+++ b/mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp
@@ -13,6 +13,9 @@
 #include "mlir/Target/SPIRV/SPIRVBinaryUtils.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_MAJOR
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "spirv-binary-utils"
 
 using namespace mlir;
 
@@ -67,8 +70,19 @@ uint32_t spirv::getPrefixedOpcode(uint32_t wordCount, spirv::Opcode opcode) {
 void spirv::encodeStringLiteralInto(SmallVectorImpl<uint32_t> &binary,
                                     StringRef literal) {
   // We need to encode the literal and the null termination.
-  auto encodingSize = literal.size() / 4 + 1;
-  auto bufferStartSize = binary.size();
+  size_t encodingSize = literal.size() / 4 + 1;
+  size_t sizeOfDataToCopy = literal.size();
+  if (encodingSize >= kMaxLiteralWordCount) {
+    // Reserve one word for the null termination.
+    encodingSize = kMaxLiteralWordCount - 1;
+    // Do not override the last word (null termination) when copying.
+    sizeOfDataToCopy = (encodingSize - 1) * 4;
+    LLVM_DEBUG(llvm::dbgs()
+               << "Truncating string literal to max size ("
+               << (kMaxLiteralWordCount - 1) << "): " << literal << "\n");
+  }
+  size_t bufferStartSize = binary.size();
   binary.resize(bufferStartSize + encodingSize, 0);
-  std::memcpy(binary.data() + bufferStartSize, literal.data(), literal.size());
+  std::memcpy(binary.data() + bufferStartSize, literal.data(),
+              sizeOfDataToCopy);
 }

From 76e14deb4a6967388a9bf84db2feeac17a30c786 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.wang@intel.com>
Date: Wed, 11 Jun 2025 22:08:20 +0800
Subject: [PATCH 083/851] [X86][BreakFalseDeps] Using reverse order for undef
 register selection (#137569)

BreakFalseDeps picks the best register for undef operands if
instructions have false dependency. The problem is if the instruction is
close to the beginning of the function, ReachingDefAnalysis is over
optimism to the unused registers, which results in collision with
registers just defined in the caller.

This patch changes the selection of undef register in an reverse order,
which reduces the probability of register collisions between caller and
callee. It brings improvement in some of our internal benchmarks with
negligible effect on other benchmarks.
---
 llvm/include/llvm/CodeGen/RegisterClassInfo.h |   8 +-
 .../include/llvm/CodeGen/TargetRegisterInfo.h |   7 +-
 llvm/include/llvm/Target/Target.td            |   2 +-
 llvm/lib/CodeGen/BreakFalseDeps.cpp           |   2 +-
 llvm/lib/CodeGen/RegisterClassInfo.cpp        |  13 +-
 llvm/lib/Target/X86/X86RegisterInfo.td        |  28 +-
 llvm/test/CodeGen/X86/avx-cvt.ll              |  16 +-
 llvm/test/CodeGen/X86/avx512-cvt.ll           | 220 +++++-----
 .../test/CodeGen/X86/avx512-regcall-NoMask.ll |  28 +-
 llvm/test/CodeGen/X86/avx512fp16-cvt.ll       |  36 +-
 llvm/test/CodeGen/X86/avx512fp16-novl.ll      |  40 +-
 llvm/test/CodeGen/X86/break-false-dep.ll      |  26 +-
 llvm/test/CodeGen/X86/coalescer-commute1.ll   |   2 +-
 .../CodeGen/X86/fast-isel-fptrunc-fpext.ll    |   4 +-
 .../fast-isel-int-float-conversion-x86-64.ll  |  12 +-
 .../X86/fast-isel-int-float-conversion.ll     |  24 +-
 .../fast-isel-uint-float-conversion-x86-64.ll |  12 +-
 .../X86/fast-isel-uint-float-conversion.ll    |  24 +-
 llvm/test/CodeGen/X86/fcmp-logic.ll           |  12 +-
 .../X86/fold-int-pow2-with-fmul-or-fdiv.ll    |  52 +--
 llvm/test/CodeGen/X86/fold-load-unops.ll      |  24 +-
 llvm/test/CodeGen/X86/fp-intrinsics.ll        |  40 +-
 .../X86/fp-strict-scalar-inttofp-fp16.ll      |  60 +--
 .../CodeGen/X86/fp-strict-scalar-inttofp.ll   |  76 ++--
 .../X86/fp-strict-scalar-round-fp16.ll        |  12 +-
 llvm/test/CodeGen/X86/ftrunc.ll               |   6 +-
 llvm/test/CodeGen/X86/half.ll                 |   8 +-
 llvm/test/CodeGen/X86/isel-int-to-fp.ll       |  48 +--
 llvm/test/CodeGen/X86/pr34080.ll              |   4 +-
 llvm/test/CodeGen/X86/pr37879.ll              |   2 +-
 llvm/test/CodeGen/X86/pr38803.ll              |   2 +-
 llvm/test/CodeGen/X86/rounding-ops.ll         |  16 +-
 llvm/test/CodeGen/X86/scalar-int-to-fp.ll     |  30 +-
 .../CodeGen/X86/select-narrow-int-to-fp.ll    |  32 +-
 .../CodeGen/X86/split-extend-vector-inreg.ll  |  44 +-
 llvm/test/CodeGen/X86/sse-cvttp2si.ll         |  16 +-
 .../X86/sse2-intrinsics-x86-upgrade.ll        |   8 +-
 .../test/CodeGen/X86/stack-folding-fp-avx1.ll |  34 +-
 .../CodeGen/X86/vec-strict-inttofp-128.ll     |  40 +-
 .../CodeGen/X86/vec-strict-inttofp-256.ll     | 144 +++----
 .../CodeGen/X86/vec-strict-inttofp-512.ll     |  64 +--
 llvm/test/CodeGen/X86/vec_int_to_fp.ll        | 402 +++++++++---------
 .../X86/vector-constrained-fp-intrinsics.ll   | 210 ++++-----
 llvm/utils/TableGen/RegisterInfoEmitter.cpp   |  10 +-
 44 files changed, 973 insertions(+), 927 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/RegisterClassInfo.h b/llvm/include/llvm/CodeGen/RegisterClassInfo.h
index 3096f8851516e..078ae80915fed 100644
--- a/llvm/include/llvm/CodeGen/RegisterClassInfo.h
+++ b/llvm/include/llvm/CodeGen/RegisterClassInfo.h
@@ -50,6 +50,8 @@ class RegisterClassInfo {
   // entry is valid when its tag matches.
   unsigned Tag = 0;
 
+  bool Reverse = false;
+
   const MachineFunction *MF = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
 
@@ -86,9 +88,11 @@ class RegisterClassInfo {
 public:
   LLVM_ABI RegisterClassInfo();
 
-  /// runOnFunction - Prepare to answer questions about MF. This must be called
+  /// runOnFunction - Prepare to answer questions about MF. Rev indicates to
+  /// use reversed raw order when compute register order. This must be called
   /// before any other methods are used.
-  LLVM_ABI void runOnMachineFunction(const MachineFunction &MF);
+  LLVM_ABI void runOnMachineFunction(const MachineFunction &MF,
+                                     bool Rev = false);
 
   /// getNumAllocatableRegs - Returns the number of actually allocatable
   /// registers in RC in the current function.
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index de5a6ecb548a4..8b9ed78a8e970 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -68,7 +68,7 @@ class TargetRegisterClass {
   const bool CoveredBySubRegs;
   const unsigned *SuperClasses;
   const uint16_t SuperClassesSize;
-  ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction&);
+  ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction &, bool Rev);
 
   /// Return the register class ID number.
   unsigned getID() const { return MC->getID(); }
@@ -199,8 +199,9 @@ class TargetRegisterClass {
   /// other criteria.
   ///
   /// By default, this method returns all registers in the class.
-  ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF) const {
-    return OrderFunc ? OrderFunc(MF) : getRegisters();
+  ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF,
+                                            bool Rev = false) const {
+    return OrderFunc ? OrderFunc(MF, Rev) : getRegisters();
   }
 
   /// Returns the combination of all lane masks of register in this class.
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
index e8b460aaf803b..ce9a2b2751968 100644
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -314,7 +314,7 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
   // to use in a given machine function. The code will be inserted in a
   // function like this:
   //
-  //   static inline unsigned f(const MachineFunction &MF) { ... }
+  //   static inline unsigned f(const MachineFunction &MF, bool Rev) { ... }
   //
   // The function should return 0 to select the default order defined by
   // MemberList, 1 to select the first AltOrders entry and so on.
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index 05eed969a18af..7eef4a9d12b16 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -285,7 +285,7 @@ bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) {
   TRI = MF->getSubtarget().getRegisterInfo();
   RDA = &getAnalysis<ReachingDefAnalysis>();
 
-  RegClassInfo.runOnMachineFunction(mf);
+  RegClassInfo.runOnMachineFunction(mf, /*Rev=*/true);
 
   LLVM_DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n");
 
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
index 40fc35a16335f..8ead83302c337 100644
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -39,14 +39,16 @@ StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
 
 RegisterClassInfo::RegisterClassInfo() = default;
 
-void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
+void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf,
+                                             bool Rev) {
   bool Update = false;
   MF = &mf;
 
   auto &STI = MF->getSubtarget();
 
   // Allocate new array the first time we see a new target.
-  if (STI.getRegisterInfo() != TRI) {
+  if (STI.getRegisterInfo() != TRI || Reverse != Rev) {
+    Reverse = Rev;
     TRI = STI.getRegisterInfo();
     RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
     Update = true;
@@ -142,7 +144,12 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
 
   // FIXME: Once targets reserve registers instead of removing them from the
   // allocation order, we can simply use begin/end here.
-  ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF);
+  ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF, Reverse);
+  std::vector<MCPhysReg> ReverseOrder;
+  if (Reverse) {
+    llvm::append_range(ReverseOrder, reverse(RawOrder));
+    RawOrder = ArrayRef<MCPhysReg>(ReverseOrder);
+  }
   for (unsigned PhysReg : RawOrder) {
     // Remove reserved registers from the allocation order.
     if (Reserved.test(PhysReg))
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 3f9af5639a686..e9ca25d808a56 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -806,17 +806,37 @@ def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i
                                512, (sequence "ZMM%u", 0, 15)>;
 
 // Scalar AVX-512 floating point registers.
-def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)> {
+  let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
+  let AltOrderSelect = [{
+    return Rev;
+  }];
+}
 
-def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)> {
+  let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
+  let AltOrderSelect = [{
+    return Rev;
+  }];
+}
 
 def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;}
 
 // Extended VR128 and VR256 for AVX-512 instructions
 def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v8bf16, v16i8, v8i16, v4i32, v2i64, f128],
-                           128, (add FR32X)>;
+                           128, (add FR32X)> {
+  let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
+  let AltOrderSelect = [{
+    return Rev;
+  }];
+}
 def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v16bf16, v32i8, v16i16, v8i32, v4i64],
-                           256, (sequence "YMM%u", 0, 31)>;
+                           256, (sequence "YMM%u", 0, 31)> {
+  let AltOrders = [(add (sequence "YMM%u", 16, 31), (sequence "YMM%u", 0, 15))];
+  let AltOrderSelect = [{
+    return Rev;
+  }];
+}
 
 // Mask registers
 def VK1     : RegisterClass<"X86", [v1i1],  16,  (sequence "K%u", 0, 7)> {let Size = 16;}
diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll
index 1bd25273ecd48..fb30044512fa5 100644
--- a/llvm/test/CodeGen/X86/avx-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx-cvt.ll
@@ -108,7 +108,7 @@ define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind {
 define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp {
 ; CHECK-LABEL: funcA:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm0
 ; CHECK-NEXT:    retq
   %tmp1 = load i64, ptr %e, align 8
   %conv = sitofp i64 %tmp1 to double
@@ -118,7 +118,7 @@ define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp {
 define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp {
 ; CHECK-LABEL: funcB:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vcvtsi2sdl (%rdi), %xmm15, %xmm0
 ; CHECK-NEXT:    retq
   %tmp1 = load i32, ptr %e, align 4
   %conv = sitofp i32 %tmp1 to double
@@ -128,7 +128,7 @@ define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp {
 define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp {
 ; CHECK-LABEL: funcC:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vcvtsi2ssl (%rdi), %xmm15, %xmm0
 ; CHECK-NEXT:    retq
   %tmp1 = load i32, ptr %e, align 4
   %conv = sitofp i32 %tmp1 to float
@@ -138,7 +138,7 @@ define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp {
 define float @funcD(ptr nocapture %e) nounwind uwtable readonly ssp {
 ; CHECK-LABEL: funcD:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm0
 ; CHECK-NEXT:    retq
   %tmp1 = load i64, ptr %e, align 8
   %conv = sitofp i64 %tmp1 to float
@@ -183,7 +183,7 @@ declare float @llvm.floor.f32(float %p)
 define float @floor_f32_load(ptr %aptr) optsize {
 ; CHECK-LABEL: floor_f32_load:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vroundss $9, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vroundss $9, (%rdi), %xmm15, %xmm0
 ; CHECK-NEXT:    retq
   %a = load float, ptr %aptr
   %res = call float @llvm.floor.f32(float %a)
@@ -193,7 +193,7 @@ define float @floor_f32_load(ptr %aptr) optsize {
 define float @floor_f32_load_pgso(ptr %aptr) !prof !14 {
 ; CHECK-LABEL: floor_f32_load_pgso:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vroundss $9, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vroundss $9, (%rdi), %xmm15, %xmm0
 ; CHECK-NEXT:    retq
   %a = load float, ptr %aptr
   %res = call float @llvm.floor.f32(float %a)
@@ -203,7 +203,7 @@ define float @floor_f32_load_pgso(ptr %aptr) !prof !14 {
 define double @nearbyint_f64_load(ptr %aptr) optsize {
 ; CHECK-LABEL: nearbyint_f64_load:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vroundsd $12, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vroundsd $12, (%rdi), %xmm15, %xmm0
 ; CHECK-NEXT:    retq
   %a = load double, ptr %aptr
   %res = call double @llvm.nearbyint.f64(double %a)
@@ -213,7 +213,7 @@ define double @nearbyint_f64_load(ptr %aptr) optsize {
 define double @nearbyint_f64_load_pgso(ptr %aptr) !prof !14 {
 ; CHECK-LABEL: nearbyint_f64_load_pgso:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vroundsd $12, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    vroundsd $12, (%rdi), %xmm15, %xmm0
 ; CHECK-NEXT:    retq
   %a = load double, ptr %aptr
   %res = call double @llvm.nearbyint.f64(double %a)
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index a78d97782e6a3..3dd7b571b9215 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -22,27 +22,27 @@ define <8 x double> @sltof864(<8 x i64> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -66,14 +66,14 @@ define <4 x double> @slto4f64(<4 x i64> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; NODQ-NEXT:    retq
@@ -97,9 +97,9 @@ define <2 x double> @slto2f64(<2 x i64> %a) {
 ; NODQ-LABEL: slto2f64:
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; NODQ-NEXT:    retq
 ;
@@ -123,9 +123,9 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
 ; NODQ-LABEL: sltof2f32:
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; NODQ-NEXT:    retq
 ;
@@ -148,12 +148,12 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
 define <4 x float> @slto4f32_mem(ptr %a) {
 ; NODQ-LABEL: slto4f32_mem:
 ; NODQ:       # %bb.0:
-; NODQ-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; NODQ-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
+; NODQ-NEXT:    vcvtsi2ssq 8(%rdi), %xmm15, %xmm0
+; NODQ-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; NODQ-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
+; NODQ-NEXT:    vcvtsi2ssq 16(%rdi), %xmm15, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; NODQ-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; NODQ-NEXT:    vcvtsi2ssq 24(%rdi), %xmm15, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; NODQ-NEXT:    retq
 ;
@@ -246,16 +246,16 @@ define <4 x float> @slto4f32(<4 x i64> %a) {
 ; NODQ-LABEL: slto4f32:
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; NODQ-NEXT:    vzeroupper
 ; NODQ-NEXT:    retq
@@ -281,16 +281,16 @@ define <4 x float> @ulto4f32(<4 x i64> %a) {
 ; NODQ-LABEL: ulto4f32:
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; NODQ-NEXT:    vzeroupper
 ; NODQ-NEXT:    retq
@@ -316,16 +316,16 @@ define <4 x float> @ulto4f32_nneg(<4 x i64> %a) {
 ; NODQ-LABEL: ulto4f32_nneg:
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; NODQ-NEXT:    vzeroupper
 ; NODQ-NEXT:    retq
@@ -864,7 +864,7 @@ define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind
 define double @sltof64_load(ptr nocapture %e) {
 ; ALL-LABEL: sltof64_load:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %tmp1 = load i64, ptr %e, align 8
@@ -875,7 +875,7 @@ entry:
 define double @sitof64_load(ptr %e) {
 ; ALL-LABEL: sitof64_load:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vcvtsi2sdl (%rdi), %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %tmp1 = load i32, ptr %e, align 4
@@ -886,7 +886,7 @@ entry:
 define float @sitof32_load(ptr %e) {
 ; ALL-LABEL: sitof32_load:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vcvtsi2ssl (%rdi), %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %tmp1 = load i32, ptr %e, align 4
@@ -897,7 +897,7 @@ entry:
 define float @sltof32_load(ptr %e) {
 ; ALL-LABEL: sltof32_load:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %tmp1 = load i64, ptr %e, align 8
@@ -990,28 +990,28 @@ define <8 x float> @slto8f32(<8 x i64> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; NODQ-NEXT:    retq
@@ -1034,54 +1034,54 @@ define <16 x float> @slto16f32(<16 x i64> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
 ; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
 ; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; NODQ-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
 ; NODQ-NEXT:    vextractf32x4 $3, %zmm0, %xmm3
 ; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
+; NODQ-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -1109,27 +1109,27 @@ define <8 x double> @slto8f64(<8 x i64> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -1153,53 +1153,53 @@ define <16 x double> @slto16f64(<16 x i64> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm4
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
 ; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm0
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
 ; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm2
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
 ; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
 ; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm1
+; NODQ-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
@@ -1225,28 +1225,28 @@ define <8 x float> @ulto8f32(<8 x i64> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; NODQ-NEXT:    retq
@@ -1269,54 +1269,54 @@ define <16 x float> @ulto16f32(<16 x i64> %a) {
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
 ; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
 ; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm4
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm1
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; NODQ-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
 ; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm2
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
 ; NODQ-NEXT:    vextractf32x4 $3, %zmm0, %xmm3
 ; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm4
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm0
+; NODQ-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
 ; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -1498,7 +1498,7 @@ define i32 @fptoui(float %a) nounwind {
 define float @uitof32(i32 %a) nounwind {
 ; ALL-LABEL: uitof32:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; ALL-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; ALL-NEXT:    retq
   %b = uitofp i32 %a to float
   ret float %b
@@ -1507,7 +1507,7 @@ define float @uitof32(i32 %a) nounwind {
 define double @uitof64(i32 %a) nounwind {
 ; ALL-LABEL: uitof64:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; ALL-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; ALL-NEXT:    retq
   %b = uitofp i32 %a to double
   ret double %b
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 88c99a06326ab..a664cc7f17a5c 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -1221,17 +1221,17 @@ define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signex
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X32-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; X32-NEXT:    vcvtsi2sd %eax, %xmm2, %xmm1
+; X32-NEXT:    vcvtsi2sd %eax, %xmm3, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vcvtsi2sd %ecx, %xmm2, %xmm1
+; X32-NEXT:    vcvtsi2sd %ecx, %xmm3, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovd %edx, %xmm1
 ; X32-NEXT:    vpinsrd $1, %edi, %xmm1, %xmm1
 ; X32-NEXT:    vcvtqq2pd %ymm1, %ymm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vcvtsi2sd %esi, %xmm2, %xmm1
+; X32-NEXT:    vcvtsi2sd %esi, %xmm3, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vcvtsi2sdl (%ebx), %xmm2, %xmm1
+; X32-NEXT:    vcvtsi2sdl (%ebx), %xmm3, %xmm1
 ; X32-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vcvttsd2si %xmm0, %eax
 ; X32-NEXT:    popl %ebx
@@ -1242,15 +1242,15 @@ define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signex
 ; WIN64:       # %bb.0:
 ; WIN64-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; WIN64-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; WIN64-NEXT:    vcvtsi2sd %eax, %xmm2, %xmm1
+; WIN64-NEXT:    vcvtsi2sd %eax, %xmm7, %xmm1
 ; WIN64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; WIN64-NEXT:    vcvtsi2sd %ecx, %xmm2, %xmm1
+; WIN64-NEXT:    vcvtsi2sd %ecx, %xmm7, %xmm1
 ; WIN64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; WIN64-NEXT:    vcvtsi2sd %rdx, %xmm2, %xmm1
+; WIN64-NEXT:    vcvtsi2sd %rdx, %xmm7, %xmm1
 ; WIN64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; WIN64-NEXT:    vcvtsi2sd %edi, %xmm2, %xmm1
+; WIN64-NEXT:    vcvtsi2sd %edi, %xmm7, %xmm1
 ; WIN64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; WIN64-NEXT:    vcvtsi2sdl (%rsi), %xmm2, %xmm1
+; WIN64-NEXT:    vcvtsi2sdl (%rsi), %xmm7, %xmm1
 ; WIN64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; WIN64-NEXT:    vcvttsd2si %xmm0, %eax
 ; WIN64-NEXT:    retq
@@ -1259,15 +1259,15 @@ define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signex
 ; LINUXOSX64:       # %bb.0:
 ; LINUXOSX64-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; LINUXOSX64-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
-; LINUXOSX64-NEXT:    vcvtsi2sd %eax, %xmm2, %xmm1
+; LINUXOSX64-NEXT:    vcvtsi2sd %eax, %xmm7, %xmm1
 ; LINUXOSX64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; LINUXOSX64-NEXT:    vcvtsi2sd %ecx, %xmm2, %xmm1
+; LINUXOSX64-NEXT:    vcvtsi2sd %ecx, %xmm7, %xmm1
 ; LINUXOSX64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; LINUXOSX64-NEXT:    vcvtsi2sd %rdx, %xmm2, %xmm1
+; LINUXOSX64-NEXT:    vcvtsi2sd %rdx, %xmm7, %xmm1
 ; LINUXOSX64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; LINUXOSX64-NEXT:    vcvtsi2sd %edi, %xmm2, %xmm1
+; LINUXOSX64-NEXT:    vcvtsi2sd %edi, %xmm7, %xmm1
 ; LINUXOSX64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
-; LINUXOSX64-NEXT:    vcvtsi2sdl (%rsi), %xmm2, %xmm1
+; LINUXOSX64-NEXT:    vcvtsi2sdl (%rsi), %xmm7, %xmm1
 ; LINUXOSX64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; LINUXOSX64-NEXT:    vcvttsd2si %xmm0, %eax
 ; LINUXOSX64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
index 26abf51c76b23..3f6ddc6ecfd70 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -412,7 +412,7 @@ define double @extload_f16_f64(ptr %x) {
 define float @extload_f16_f32_optsize(ptr %x) optsize {
 ; X64-LABEL: extload_f16_f32_optsize:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtsh2ss (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vcvtsh2ss (%rdi), %xmm15, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: extload_f16_f32_optsize:
@@ -420,7 +420,7 @@ define float @extload_f16_f32_optsize(ptr %x) optsize {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsh2ss (%eax), %xmm0, %xmm0
+; X86-NEXT:    vcvtsh2ss (%eax), %xmm7, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
 ; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
@@ -434,7 +434,7 @@ define float @extload_f16_f32_optsize(ptr %x) optsize {
 define double @extload_f16_f64_optsize(ptr %x) optsize {
 ; X64-LABEL: extload_f16_f64_optsize:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtsh2sd (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vcvtsh2sd (%rdi), %xmm15, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: extload_f16_f64_optsize:
@@ -447,7 +447,7 @@ define double @extload_f16_f64_optsize(ptr %x) optsize {
 ; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    vcvtsh2sd (%eax), %xmm0, %xmm0
+; X86-NEXT:    vcvtsh2sd (%eax), %xmm7, %xmm0
 ; X86-NEXT:    vmovsd %xmm0, (%esp)
 ; X86-NEXT:    fldl (%esp)
 ; X86-NEXT:    movl %ebp, %esp
@@ -559,13 +559,13 @@ define half @s8_to_half(i8 %x) {
 ; X64-LABEL: s8_to_half:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movsbl %dil, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: s8_to_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
   %a = sitofp i8 %x to half
   ret half %a
@@ -575,13 +575,13 @@ define half @s16_to_half(i16 %x) {
 ; X64-LABEL: s16_to_half:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movswl %di, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: s16_to_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
   %a = sitofp i16 %x to half
   ret half %a
@@ -590,12 +590,12 @@ define half @s16_to_half(i16 %x) {
 define half @s32_to_half(i32 %x) {
 ; X64-LABEL: s32_to_half:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtsi2sh %edi, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %edi, %xmm31, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: s32_to_half:
 ; X86:       # %bb.0:
-; X86-NEXT:    vcvtsi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2shl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
   %a = sitofp i32 %x to half
   ret half %a
@@ -604,7 +604,7 @@ define half @s32_to_half(i32 %x) {
 define half @s64_to_half(i64 %x) {
 ; X64-LABEL: s64_to_half:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtsi2sh %rdi, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %rdi, %xmm31, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: s64_to_half:
@@ -644,13 +644,13 @@ define half @u8_to_half(i8 %x) {
 ; X64-LABEL: u8_to_half:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: u8_to_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
   %a = uitofp i8 %x to half
   ret half %a
@@ -660,13 +660,13 @@ define half @u16_to_half(i16 %x) {
 ; X64-LABEL: u16_to_half:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: u16_to_half:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
   %a = uitofp i16 %x to half
   ret half %a
@@ -675,12 +675,12 @@ define half @u16_to_half(i16 %x) {
 define half @u32_to_half(i32 %x) {
 ; X64-LABEL: u32_to_half:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtusi2sh %edi, %xmm0, %xmm0
+; X64-NEXT:    vcvtusi2sh %edi, %xmm31, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: u32_to_half:
 ; X86:       # %bb.0:
-; X86-NEXT:    vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vcvtusi2shl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
   %a = uitofp i32 %x to half
   ret half %a
@@ -689,7 +689,7 @@ define half @u32_to_half(i32 %x) {
 define half @u64_to_half(i64 %x) {
 ; X64-LABEL: u64_to_half:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtusi2sh %rdi, %xmm0, %xmm0
+; X64-NEXT:    vcvtusi2sh %rdi, %xmm31, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: u64_to_half:
diff --git a/llvm/test/CodeGen/X86/avx512fp16-novl.ll b/llvm/test/CodeGen/X86/avx512fp16-novl.ll
index 1c4b7316c283c..d17cacc0e1ad7 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-novl.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-novl.ll
@@ -16,14 +16,14 @@ define <4 x half> @vector_sint32ToHalf(<4 x i32> %int32) {
 ; CHECK-LABEL: vector_sint32ToHalf:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractps $3, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm1, %xmm1
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm1
 ; CHECK-NEXT:    vextractps $2, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm2, %xmm2
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm2
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; CHECK-NEXT:    vextractps $1, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm3, %xmm2
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm2
 ; CHECK-NEXT:    vmovd %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm3, %xmm0
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; CHECK-NEXT:    retq
@@ -36,32 +36,32 @@ define <8 x half> @vector_sint16ToHalf(<8 x i16> %int16) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpextrw $7, %xmm0, %eax
 ; CHECK-NEXT:    cwtl
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm1, %xmm1
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm1
 ; CHECK-NEXT:    vpextrw $6, %xmm0, %eax
 ; CHECK-NEXT:    cwtl
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm2, %xmm2
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm2
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; CHECK-NEXT:    vpextrw $5, %xmm0, %eax
 ; CHECK-NEXT:    cwtl
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm3, %xmm2
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm2
 ; CHECK-NEXT:    vpextrw $4, %xmm0, %eax
 ; CHECK-NEXT:    cwtl
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm3, %xmm3
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm3
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
 ; CHECK-NEXT:    cwtl
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm4, %xmm2
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm2
 ; CHECK-NEXT:    vpextrw $2, %xmm0, %eax
 ; CHECK-NEXT:    cwtl
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm4, %xmm3
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm3
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
 ; CHECK-NEXT:    cwtl
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm4, %xmm3
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm3
 ; CHECK-NEXT:    vmovw %xmm0, %eax
 ; CHECK-NEXT:    cwtl
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm4, %xmm0
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -97,25 +97,25 @@ define <8 x half> @vector_uint16ToHalf(<8 x i16> %int16) {
 ; CHECK-LABEL: vector_uint16ToHalf:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpextrw $7, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm1, %xmm1
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm1
 ; CHECK-NEXT:    vpextrw $6, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm2, %xmm2
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm2
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; CHECK-NEXT:    vpextrw $5, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm3, %xmm2
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm2
 ; CHECK-NEXT:    vpextrw $4, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm3, %xmm3
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm3
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm4, %xmm2
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm2
 ; CHECK-NEXT:    vpextrw $2, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm4, %xmm3
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm3
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
 ; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm4, %xmm3
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm3
 ; CHECK-NEXT:    vpextrw $0, %xmm0, %eax
-; CHECK-NEXT:    vcvtsi2sh %eax, %xmm4, %xmm0
+; CHECK-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; CHECK-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll
index 5acbccf41c5d3..6943622fac7f2 100644
--- a/llvm/test/CodeGen/X86/break-false-dep.ll
+++ b/llvm/test/CodeGen/X86/break-false-dep.ll
@@ -36,7 +36,7 @@ define dso_local float @t2(ptr nocapture %x) nounwind readonly ssp optsize {
 ;
 ; AVX-LABEL: t2:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsd2ss (%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsd2ss (%rcx), %xmm5, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load double, ptr %x, align 8
@@ -93,7 +93,7 @@ define dso_local float @squirtf_size(ptr %x) nounwind optsize {
 ;
 ; AVX-LABEL: squirtf_size:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vsqrtss (%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vsqrtss (%rcx), %xmm5, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %z = load float, ptr %x
@@ -114,7 +114,7 @@ define dso_local double @squirt_size(ptr %x) nounwind optsize {
 ;
 ; AVX-LABEL: squirt_size:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vsqrtsd (%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vsqrtsd (%rcx), %xmm5, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %z = load double, ptr %x
@@ -199,8 +199,8 @@ define dso_local float @loopdep1(i32 %m) nounwind uwtable readnone ssp {
 ; AVX1-NEXT:    .p2align 4
 ; AVX1-NEXT:  .LBB6_3: # %for.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vcvtsi2ss %eax, %xmm4, %xmm2
-; AVX1-NEXT:    vcvtsi2ss %ecx, %xmm4, %xmm3
+; AVX1-NEXT:    vcvtsi2ss %eax, %xmm5, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %ecx, %xmm5, %xmm3
 ; AVX1-NEXT:    vaddss %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vaddss %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    incl %eax
@@ -226,9 +226,9 @@ define dso_local float @loopdep1(i32 %m) nounwind uwtable readnone ssp {
 ; AVX512VL-NEXT:    .p2align 4
 ; AVX512VL-NEXT:  .LBB6_3: # %for.body
 ; AVX512VL-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX512VL-NEXT:    vcvtsi2ss %eax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtsi2ss %eax, %xmm5, %xmm2
 ; AVX512VL-NEXT:    vaddss %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2ss %ecx, %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtsi2ss %ecx, %xmm5, %xmm2
 ; AVX512VL-NEXT:    vaddss %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    incl %eax
 ; AVX512VL-NEXT:    decl %ecx
@@ -358,8 +358,8 @@ define i64 @loopdep2(ptr nocapture %x, ptr nocapture %y) nounwind {
 ; AVX-NEXT:    .p2align 4
 ; AVX-NEXT:  .LBB7_1: # %loop
 ; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vcvtsi2sd %rcx, %xmm1, %xmm0
+; AVX-NEXT:    vxorps %xmm5, %xmm5, %xmm5
+; AVX-NEXT:    vcvtsi2sd %rcx, %xmm5, %xmm0
 ; AVX-NEXT:    vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX-NEXT:    #APP
 ; AVX-NEXT:    #NO_APP
@@ -566,8 +566,8 @@ define dso_local void @loopdep3() {
 ; AVX-NEXT:  .LBB8_2: # %for.body3
 ; AVX-NEXT:    # Parent Loop BB8_1 Depth=1
 ; AVX-NEXT:    # => This Inner Loop Header: Depth=2
-; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vcvtsi2sdl (%r11), %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm5, %xmm5, %xmm5
+; AVX-NEXT:    vcvtsi2sdl (%r11), %xmm5, %xmm0
 ; AVX-NEXT:    vmulsd (%rsi,%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmulsd (%rsi,%r8), %xmm0, %xmm0
 ; AVX-NEXT:    vmulsd (%rsi,%r9), %xmm0, %xmm0
@@ -761,8 +761,8 @@ define dso_local double @inlineasmdep(i64 %arg) {
 ; AVX-NEXT:    #NO_APP
 ; AVX-NEXT:    #APP
 ; AVX-NEXT:    #NO_APP
-; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vcvtsi2sd %rcx, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vcvtsi2sd %rcx, %xmm3, %xmm0
 ; AVX-NEXT:    vmovaps (%rsp), %xmm6 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/coalescer-commute1.ll b/llvm/test/CodeGen/X86/coalescer-commute1.ll
index 28502782cf642..f4decb7e2e0c5 100644
--- a/llvm/test/CodeGen/X86/coalescer-commute1.ll
+++ b/llvm/test/CodeGen/X86/coalescer-commute1.ll
@@ -16,7 +16,7 @@ define void @runcont(ptr %source) nounwind  {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_1: ## %bb
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vcvtsi2ssl (%eax,%edx,4), %xmm2, %xmm1
+; CHECK-NEXT:    vcvtsi2ssl (%eax,%edx,4), %xmm7, %xmm1
 ; CHECK-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    incl %edx
 ; CHECK-NEXT:    cmpl %edx, %ecx
diff --git a/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll b/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
index cfca56d35998e..00aa9cd8a27f3 100644
--- a/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
@@ -78,7 +78,7 @@ define double @single_to_double_rm_optsize(ptr %x) optsize {
 ;
 ; AVX-LABEL: single_to_double_rm_optsize:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtss2sd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtss2sd (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load float, ptr %x, align 4
@@ -112,7 +112,7 @@ define float @double_to_single_rm_optsize(ptr %x) optsize {
 ;
 ; AVX-LABEL: double_to_single_rm_optsize:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsd2ss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsd2ss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load double, ptr %x, align 8
diff --git a/llvm/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll b/llvm/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll
index 42d65f7cd64b6..5bf08f1c523d2 100644
--- a/llvm/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll
@@ -12,7 +12,7 @@ define double @long_to_double_rr(i64 %a) {
 ;
 ; AVX-LABEL: long_to_double_rr:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = sitofp i64 %a to double
@@ -27,7 +27,7 @@ define double @long_to_double_rm(ptr %a) {
 ;
 ; AVX-LABEL: long_to_double_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load i64, ptr %a
@@ -43,7 +43,7 @@ define double @long_to_double_rm_optsize(ptr %a) optsize {
 ;
 ; AVX-LABEL: long_to_double_rm_optsize:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load i64, ptr %a
@@ -59,7 +59,7 @@ define float @long_to_float_rr(i64 %a) {
 ;
 ; AVX-LABEL: long_to_float_rr:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = sitofp i64 %a to float
@@ -74,7 +74,7 @@ define float @long_to_float_rm(ptr %a) {
 ;
 ; AVX-LABEL: long_to_float_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load i64, ptr %a
@@ -90,7 +90,7 @@ define float @long_to_float_rm_optsize(ptr %a) optsize {
 ;
 ; AVX-LABEL: long_to_float_rm_optsize:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %0 = load i64, ptr %a
diff --git a/llvm/test/CodeGen/X86/fast-isel-int-float-conversion.ll b/llvm/test/CodeGen/X86/fast-isel-int-float-conversion.ll
index 36daba63f08bc..b39d9a7a3a6d0 100644
--- a/llvm/test/CodeGen/X86/fast-isel-int-float-conversion.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-int-float-conversion.ll
@@ -15,7 +15,7 @@ define double @int_to_double_rr(i32 %a) {
 ;
 ; AVX-LABEL: int_to_double_rr:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; SSE2_X86-LABEL: int_to_double_rr:
@@ -44,7 +44,7 @@ define double @int_to_double_rr(i32 %a) {
 ; AVX_X86-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX_X86-NEXT:    andl $-8, %esp
 ; AVX_X86-NEXT:    subl $8, %esp
-; AVX_X86-NEXT:    vcvtsi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtsi2sdl 8(%ebp), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX_X86-NEXT:    fldl (%esp)
 ; AVX_X86-NEXT:    movl %ebp, %esp
@@ -64,7 +64,7 @@ define double @int_to_double_rm(ptr %a) {
 ;
 ; AVX-LABEL: int_to_double_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sdl (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; SSE2_X86-LABEL: int_to_double_rm:
@@ -95,7 +95,7 @@ define double @int_to_double_rm(ptr %a) {
 ; AVX_X86-NEXT:    andl $-8, %esp
 ; AVX_X86-NEXT:    subl $8, %esp
 ; AVX_X86-NEXT:    movl 8(%ebp), %eax
-; AVX_X86-NEXT:    vcvtsi2sdl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtsi2sdl (%eax), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX_X86-NEXT:    fldl (%esp)
 ; AVX_X86-NEXT:    movl %ebp, %esp
@@ -116,7 +116,7 @@ define double @int_to_double_rm_optsize(ptr %a) optsize {
 ;
 ; AVX-LABEL: int_to_double_rm_optsize:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sdl (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; SSE2_X86-LABEL: int_to_double_rm_optsize:
@@ -147,7 +147,7 @@ define double @int_to_double_rm_optsize(ptr %a) optsize {
 ; AVX_X86-NEXT:    andl $-8, %esp
 ; AVX_X86-NEXT:    subl $8, %esp
 ; AVX_X86-NEXT:    movl 8(%ebp), %eax
-; AVX_X86-NEXT:    vcvtsi2sdl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtsi2sdl (%eax), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX_X86-NEXT:    fldl (%esp)
 ; AVX_X86-NEXT:    movl %ebp, %esp
@@ -168,7 +168,7 @@ define float @int_to_float_rr(i32 %a) {
 ;
 ; AVX-LABEL: int_to_float_rr:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; SSE2_X86-LABEL: int_to_float_rr:
@@ -186,7 +186,7 @@ define float @int_to_float_rr(i32 %a) {
 ; AVX_X86:       # %bb.0: # %entry
 ; AVX_X86-NEXT:    pushl %eax
 ; AVX_X86-NEXT:    .cfi_def_cfa_offset 8
-; AVX_X86-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX_X86-NEXT:    flds (%esp)
 ; AVX_X86-NEXT:    popl %eax
@@ -205,7 +205,7 @@ define float @int_to_float_rm(ptr %a) {
 ;
 ; AVX-LABEL: int_to_float_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ssl (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; SSE2_X86-LABEL: int_to_float_rm:
@@ -225,7 +225,7 @@ define float @int_to_float_rm(ptr %a) {
 ; AVX_X86-NEXT:    pushl %eax
 ; AVX_X86-NEXT:    .cfi_def_cfa_offset 8
 ; AVX_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX_X86-NEXT:    vcvtsi2ssl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtsi2ssl (%eax), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX_X86-NEXT:    flds (%esp)
 ; AVX_X86-NEXT:    popl %eax
@@ -245,7 +245,7 @@ define float @int_to_float_rm_optsize(ptr %a) optsize {
 ;
 ; AVX-LABEL: int_to_float_rm_optsize:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ssl (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; SSE2_X86-LABEL: int_to_float_rm_optsize:
@@ -265,7 +265,7 @@ define float @int_to_float_rm_optsize(ptr %a) optsize {
 ; AVX_X86-NEXT:    pushl %eax
 ; AVX_X86-NEXT:    .cfi_def_cfa_offset 8
 ; AVX_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX_X86-NEXT:    vcvtsi2ssl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtsi2ssl (%eax), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX_X86-NEXT:    flds (%esp)
 ; AVX_X86-NEXT:    popl %eax
diff --git a/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion-x86-64.ll b/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion-x86-64.ll
index d05bcfe3fd1e7..77ef9ee5ad2b7 100644
--- a/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion-x86-64.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion-x86-64.ll
@@ -5,7 +5,7 @@
 define double @long_to_double_rr(i64 %a) {
 ; ALL-LABEL: long_to_double_rr:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
+; ALL-NEXT:    vcvtusi2sd %rdi, %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %0 = uitofp i64 %a to double
@@ -15,7 +15,7 @@ entry:
 define double @long_to_double_rm(ptr %a) {
 ; ALL-LABEL: long_to_double_rm:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtusi2sdq (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vcvtusi2sdq (%rdi), %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %0 = load i64, ptr %a
@@ -26,7 +26,7 @@ entry:
 define double @long_to_double_rm_optsize(ptr %a) optsize {
 ; ALL-LABEL: long_to_double_rm_optsize:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtusi2sdq (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vcvtusi2sdq (%rdi), %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %0 = load i64, ptr %a
@@ -37,7 +37,7 @@ entry:
 define float @long_to_float_rr(i64 %a) {
 ; ALL-LABEL: long_to_float_rr:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
+; ALL-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %0 = uitofp i64 %a to float
@@ -47,7 +47,7 @@ entry:
 define float @long_to_float_rm(ptr %a) {
 ; ALL-LABEL: long_to_float_rm:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtusi2ssq (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vcvtusi2ssq (%rdi), %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %0 = load i64, ptr %a
@@ -58,7 +58,7 @@ entry:
 define float @long_to_float_rm_optsize(ptr %a) optsize {
 ; ALL-LABEL: long_to_float_rm_optsize:
 ; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtusi2ssq (%rdi), %xmm0, %xmm0
+; ALL-NEXT:    vcvtusi2ssq (%rdi), %xmm15, %xmm0
 ; ALL-NEXT:    retq
 entry:
   %0 = load i64, ptr %a
diff --git a/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion.ll b/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion.ll
index b7f9af6165a9c..de5765baeb9d5 100644
--- a/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion.ll
@@ -6,7 +6,7 @@
 define double @int_to_double_rr(i32 %a) {
 ; AVX-LABEL: int_to_double_rr:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX_X86-LABEL: int_to_double_rr:
@@ -18,7 +18,7 @@ define double @int_to_double_rr(i32 %a) {
 ; AVX_X86-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX_X86-NEXT:    andl $-8, %esp
 ; AVX_X86-NEXT:    subl $8, %esp
-; AVX_X86-NEXT:    vcvtusi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtusi2sdl 8(%ebp), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX_X86-NEXT:    fldl (%esp)
 ; AVX_X86-NEXT:    movl %ebp, %esp
@@ -33,7 +33,7 @@ entry:
 define double @int_to_double_rm(ptr %a) {
 ; AVX-LABEL: int_to_double_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtusi2sdl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtusi2sdl (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX_X86-LABEL: int_to_double_rm:
@@ -46,7 +46,7 @@ define double @int_to_double_rm(ptr %a) {
 ; AVX_X86-NEXT:    andl $-8, %esp
 ; AVX_X86-NEXT:    subl $8, %esp
 ; AVX_X86-NEXT:    movl 8(%ebp), %eax
-; AVX_X86-NEXT:    vcvtusi2sdl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtusi2sdl (%eax), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX_X86-NEXT:    fldl (%esp)
 ; AVX_X86-NEXT:    movl %ebp, %esp
@@ -62,7 +62,7 @@ entry:
 define double @int_to_double_rm_optsize(ptr %a) optsize {
 ; AVX-LABEL: int_to_double_rm_optsize:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtusi2sdl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtusi2sdl (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX_X86-LABEL: int_to_double_rm_optsize:
@@ -75,7 +75,7 @@ define double @int_to_double_rm_optsize(ptr %a) optsize {
 ; AVX_X86-NEXT:    andl $-8, %esp
 ; AVX_X86-NEXT:    subl $8, %esp
 ; AVX_X86-NEXT:    movl 8(%ebp), %eax
-; AVX_X86-NEXT:    vcvtusi2sdl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtusi2sdl (%eax), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX_X86-NEXT:    fldl (%esp)
 ; AVX_X86-NEXT:    movl %ebp, %esp
@@ -91,14 +91,14 @@ entry:
 define float @int_to_float_rr(i32 %a) {
 ; AVX-LABEL: int_to_float_rr:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX_X86-LABEL: int_to_float_rr:
 ; AVX_X86:       # %bb.0: # %entry
 ; AVX_X86-NEXT:    pushl %eax
 ; AVX_X86-NEXT:    .cfi_def_cfa_offset 8
-; AVX_X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX_X86-NEXT:    flds (%esp)
 ; AVX_X86-NEXT:    popl %eax
@@ -112,7 +112,7 @@ entry:
 define float @int_to_float_rm(ptr %a) {
 ; AVX-LABEL: int_to_float_rm:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtusi2ssl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtusi2ssl (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX_X86-LABEL: int_to_float_rm:
@@ -120,7 +120,7 @@ define float @int_to_float_rm(ptr %a) {
 ; AVX_X86-NEXT:    pushl %eax
 ; AVX_X86-NEXT:    .cfi_def_cfa_offset 8
 ; AVX_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX_X86-NEXT:    vcvtusi2ssl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtusi2ssl (%eax), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX_X86-NEXT:    flds (%esp)
 ; AVX_X86-NEXT:    popl %eax
@@ -135,7 +135,7 @@ entry:
 define float @int_to_float_rm_optsize(ptr %a) optsize {
 ; AVX-LABEL: int_to_float_rm_optsize:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtusi2ssl (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vcvtusi2ssl (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
 ;
 ; AVX_X86-LABEL: int_to_float_rm_optsize:
@@ -143,7 +143,7 @@ define float @int_to_float_rm_optsize(ptr %a) optsize {
 ; AVX_X86-NEXT:    pushl %eax
 ; AVX_X86-NEXT:    .cfi_def_cfa_offset 8
 ; AVX_X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX_X86-NEXT:    vcvtusi2ssl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT:    vcvtusi2ssl (%eax), %xmm7, %xmm0
 ; AVX_X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX_X86-NEXT:    flds (%esp)
 ; AVX_X86-NEXT:    popl %eax
diff --git a/llvm/test/CodeGen/X86/fcmp-logic.ll b/llvm/test/CodeGen/X86/fcmp-logic.ll
index 7b806bca43c2e..98fa725b2ea3a 100644
--- a/llvm/test/CodeGen/X86/fcmp-logic.ll
+++ b/llvm/test/CodeGen/X86/fcmp-logic.ll
@@ -399,11 +399,11 @@ define i1 @PR140534(i32 %a0, i32 %a1, i32 %a2) {
 ; AVX1-LABEL: PR140534:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    movl %edi, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    movl %esi, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    movl %edx, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX1-NEXT:    vcmpltsd %xmm2, %xmm1, %xmm2
 ; AVX1-NEXT:    vcmpltsd %xmm0, %xmm1, %xmm0
@@ -414,9 +414,9 @@ define i1 @PR140534(i32 %a0, i32 %a1, i32 %a2) {
 ;
 ; AVX512-LABEL: PR140534:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
-; AVX512-NEXT:    vcvtusi2sd %esi, %xmm1, %xmm1
-; AVX512-NEXT:    vcvtusi2sd %edx, %xmm2, %xmm2
+; AVX512-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %esi, %xmm15, %xmm1
+; AVX512-NEXT:    vcvtusi2sd %edx, %xmm15, %xmm2
 ; AVX512-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
 ; AVX512-NEXT:    vcmpltsd %xmm2, %xmm1, %k0
 ; AVX512-NEXT:    vcmpltsd %xmm0, %xmm1, %k1
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 5519d9b787b7f..d59b12c6d1231 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -887,14 +887,14 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    movq %rsi, %rcx
 ; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NO-FASTFMA-NEXT:    shlq %cl, %rdi
-; CHECK-NO-FASTFMA-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2sd %rdi, %xmm15, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    retq
 ;
 ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
 ; CHECK-FMA:       # %bb.0:
 ; CHECK-FMA-NEXT:    shlxq %rsi, %rdi, %rax
-; CHECK-FMA-NEXT:    vcvtusi2sd %rax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; CHECK-FMA-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    retq
   %shl = shl nuw i64 %v, %cnt
@@ -927,9 +927,9 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [2,2]
 ; CHECK-AVX2-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm1
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; CHECK-AVX2-NEXT:    vmovq %xmm0, %rax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; CHECK-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
 ; CHECK-AVX2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
@@ -940,9 +940,9 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou
 ; CHECK-NO-FASTFMA-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [2,2]
 ; CHECK-NO-FASTFMA-NEXT:    vpsllvq %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vpextrq $1, %xmm0, %rax
-; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm1
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; CHECK-NO-FASTFMA-NEXT:    vmovq %xmm0, %rax
-; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; CHECK-NO-FASTFMA-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1]
 ; CHECK-NO-FASTFMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
@@ -1108,13 +1108,13 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; CHECK-AVX2-NEXT:    vpsllvd %ymm0, %ymm1, %ymm0
 ; CHECK-AVX2-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; CHECK-AVX2-NEXT:    vpextrw $2, %xmm0, %eax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    vzeroupper
 ; CHECK-AVX2-NEXT:    callq __truncsfhf2@PLT
 ; CHECK-AVX2-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; CHECK-AVX2-NEXT:    vpextrw $0, %xmm0, %eax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    vzeroupper
 ; CHECK-AVX2-NEXT:    callq __truncsfhf2@PLT
 ; CHECK-AVX2-NEXT:    callq __extendhfsf2@PLT
@@ -1201,7 +1201,7 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    movl $1, %eax
 ; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NO-FASTFMA-NEXT:    shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT:    vcvtusi2sd %rax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    retq
 ;
@@ -1209,7 +1209,7 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind {
 ; CHECK-FMA:       # %bb.0:
 ; CHECK-FMA-NEXT:    movl $1, %eax
 ; CHECK-FMA-NEXT:    shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT:    vcvtusi2sd %rax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; CHECK-FMA-NEXT:    vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    retq
   %shl = shl nuw i64 1, %cnt
@@ -1317,11 +1317,11 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-AVX2-NEXT:    testq %rax, %rax
 ; CHECK-AVX2-NEXT:    js .LBB23_1
 ; CHECK-AVX2-NEXT:  # %bb.2:
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    jmp .LBB23_3
 ; CHECK-AVX2-NEXT:  .LBB23_1:
 ; CHECK-AVX2-NEXT:    shrq %rax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT:  .LBB23_3:
 ; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -1334,7 +1334,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    movl $8, %eax
 ; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NO-FASTFMA-NEXT:    shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %rax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    retq
@@ -1343,7 +1343,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-FMA:       # %bb.0:
 ; CHECK-FMA-NEXT:    movl $8, %eax
 ; CHECK-FMA-NEXT:    shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT:    vcvtusi2ss %rax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT:    retq
@@ -1371,7 +1371,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; CHECK-AVX2-NEXT:    movl $8, %eax
 ; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-AVX2-NEXT:    shlq %cl, %rax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX2-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1382,7 +1382,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    movl $8, %eax
 ; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; CHECK-NO-FASTFMA-NEXT:    shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    retq
@@ -1391,7 +1391,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; CHECK-FMA:       # %bb.0:
 ; CHECK-FMA-NEXT:    movl $8, %eax
 ; CHECK-FMA-NEXT:    shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT:    retq
@@ -1451,7 +1451,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-AVX2-NEXT:    movl $1, %eax
 ; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-AVX2-NEXT:    shll %cl, %eax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    callq __truncsfhf2@PLT
 ; CHECK-AVX2-NEXT:    callq __extendhfsf2@PLT
 ; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
@@ -1466,7 +1466,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    movl $1, %eax
 ; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
@@ -1478,7 +1478,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-FMA:       # %bb.0:
 ; CHECK-FMA-NEXT:    movl $1, %eax
 ; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm0
 ; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
@@ -1562,7 +1562,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-AVX2-NEXT:    shll %cl, %eax
 ; CHECK-AVX2-NEXT:    movzwl %ax, %eax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    callq __truncsfhf2@PLT
 ; CHECK-AVX2-NEXT:    callq __extendhfsf2@PLT
 ; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -1578,7 +1578,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
 ; CHECK-NO-FASTFMA-NEXT:    movzwl %ax, %eax
-; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -1591,7 +1591,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-FMA-NEXT:    movl $1, %eax
 ; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
 ; CHECK-FMA-NEXT:    movzwl %ax, %eax
-; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-FMA-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
@@ -1648,7 +1648,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; CHECK-AVX2-NEXT:    movl $1, %eax
 ; CHECK-AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-AVX2-NEXT:    shll %cl, %eax
-; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; CHECK-AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; CHECK-AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX2-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT:    retq
@@ -1659,7 +1659,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT:    movl $1, %eax
 ; CHECK-NO-FASTFMA-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NO-FASTFMA-NEXT:    shll %cl, %eax
-; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
+; CHECK-NO-FASTFMA-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT:    retq
@@ -1668,7 +1668,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; CHECK-FMA:       # %bb.0:
 ; CHECK-FMA-NEXT:    movl $1, %eax
 ; CHECK-FMA-NEXT:    shlxl %edi, %eax, %eax
-; CHECK-FMA-NEXT:    vcvtusi2ss %eax, %xmm0, %xmm0
+; CHECK-FMA-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm0
 ; CHECK-FMA-NEXT:    vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll
index d3e34f8d8ffd3..35e14e5cf8980 100644
--- a/llvm/test/CodeGen/X86/fold-load-unops.ll
+++ b/llvm/test/CodeGen/X86/fold-load-unops.ll
@@ -89,7 +89,7 @@ define float @rcpss_size(ptr %a) optsize {
 ;
 ; AVX-LABEL: rcpss_size:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vrcpss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load float, ptr %a
     %ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -106,7 +106,7 @@ define <4 x float> @rcpss_full_size(ptr %a) optsize {
 ;
 ; AVX-LABEL: rcpss_full_size:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vrcpss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load <4 x float>, ptr %a
     %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
@@ -121,7 +121,7 @@ define float @rcpss_pgso(ptr %a) !prof !14 {
 ;
 ; AVX-LABEL: rcpss_pgso:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vrcpss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load float, ptr %a
     %ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -138,7 +138,7 @@ define <4 x float> @rcpss_full_pgso(ptr %a) !prof !14 {
 ;
 ; AVX-LABEL: rcpss_full_pgso:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vrcpss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load <4 x float>, ptr %a
     %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
@@ -153,7 +153,7 @@ define float @rsqrtss_size(ptr %a) optsize {
 ;
 ; AVX-LABEL: rsqrtss_size:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vrsqrtss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load float, ptr %a
     %ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -170,7 +170,7 @@ define <4 x float> @rsqrtss_full_size(ptr %a) optsize {
 ;
 ; AVX-LABEL: rsqrtss_full_size:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vrsqrtss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load <4 x float>, ptr %a
     %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
@@ -185,7 +185,7 @@ define float @rsqrtss_pgso(ptr %a) !prof !14 {
 ;
 ; AVX-LABEL: rsqrtss_pgso:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vrsqrtss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load float, ptr %a
     %ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -202,7 +202,7 @@ define <4 x float> @rsqrtss_full_pgso(ptr %a) !prof !14 {
 ;
 ; AVX-LABEL: rsqrtss_full_pgso:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vrsqrtss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load <4 x float>, ptr %a
     %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
@@ -217,7 +217,7 @@ define float @sqrtss_size(ptr %a) optsize{
 ;
 ; AVX-LABEL: sqrtss_size:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vsqrtss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load float, ptr %a
     %ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -268,7 +268,7 @@ define float @sqrtss_pgso(ptr %a) !prof !14 {
 ;
 ; AVX-LABEL: sqrtss_pgso:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vsqrtss (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load float, ptr %a
     %ins = insertelement <4 x float> undef, float %ld, i32 0
@@ -319,7 +319,7 @@ define double @sqrtsd_size(ptr %a) optsize {
 ;
 ; AVX-LABEL: sqrtsd_size:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vsqrtsd (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load double, ptr %a
     %ins = insertelement <2 x double> undef, double %ld, i32 0
@@ -370,7 +370,7 @@ define double @sqrtsd_pgso(ptr %a) !prof !14 {
 ;
 ; AVX-LABEL: sqrtsd_pgso:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vsqrtsd (%rdi), %xmm15, %xmm0
 ; AVX-NEXT:    retq
     %ld = load double, ptr %a
     %ins = insertelement <2 x double> undef, double %ld, i32 0
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 3577f252f50da..5d69a217fb402 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -2018,7 +2018,7 @@ define double @sifdb(i8 %x) #0 {
 ; AVX-LABEL: sifdb:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movsbl %dil, %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call double @llvm.experimental.constrained.sitofp.f64.i8(i8 %x,
@@ -2062,7 +2062,7 @@ define double @sifdw(i16 %x) #0 {
 ; AVX-LABEL: sifdw:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movswl %di, %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call double @llvm.experimental.constrained.sitofp.f64.i16(i16 %x,
@@ -2103,7 +2103,7 @@ define double @sifdi(i32 %x) #0 {
 ;
 ; AVX-LABEL: sifdi:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %x,
@@ -2147,7 +2147,7 @@ define float @siffb(i8 %x) #0 {
 ; AVX-LABEL: siffb:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movsbl %dil, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call float @llvm.experimental.constrained.sitofp.f32.i8(i8 %x,
@@ -2191,7 +2191,7 @@ define float @siffw(i16 %x) #0 {
 ; AVX-LABEL: siffw:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movswl %di, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 %x,
@@ -2232,7 +2232,7 @@ define float @siffi(i32 %x) #0 {
 ;
 ; AVX-LABEL: siffi:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %x,
@@ -2267,7 +2267,7 @@ define double @sifdl(i64 %x) #0 {
 ;
 ; AVX-LABEL: sifdl:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %x,
@@ -2302,7 +2302,7 @@ define float @siffl(i64 %x) #0 {
 ;
 ; AVX-LABEL: siffl:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %x,
@@ -2349,7 +2349,7 @@ define double @uifdb(i8 %x) #0 {
 ; AVX-LABEL: uifdb:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movzbl %dil, %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call double @llvm.experimental.constrained.uitofp.f64.i8(i8 %x,
@@ -2393,7 +2393,7 @@ define double @uifdw(i16 %x) #0 {
 ; AVX-LABEL: uifdw:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call double @llvm.experimental.constrained.uitofp.f64.i16(i16 %x,
@@ -2440,12 +2440,12 @@ define double @uifdi(i32 %x) #0 {
 ; AVX1-LABEL: uifdi:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    movl %edi, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: uifdi:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %x,
@@ -2518,7 +2518,7 @@ define double @uifdl(i64 %x) #0 {
 ; AVX1-NEXT:    orq %rax, %rcx
 ; AVX1-NEXT:    testq %rdi, %rdi
 ; AVX1-NEXT:    cmovnsq %rdi, %rcx
-; AVX1-NEXT:    vcvtsi2sd %rcx, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rcx, %xmm15, %xmm0
 ; AVX1-NEXT:    jns .LBB48_2
 ; AVX1-NEXT:  # %bb.1:
 ; AVX1-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
@@ -2527,7 +2527,7 @@ define double @uifdl(i64 %x) #0 {
 ;
 ; AVX512-LABEL: uifdl:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %x,
@@ -2571,7 +2571,7 @@ define float @uiffb(i8 %x) #0 {
 ; AVX-LABEL: uiffb:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movzbl %dil, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call float @llvm.experimental.constrained.uitofp.f32.i8(i8 %x,
@@ -2615,7 +2615,7 @@ define float @uiffw(i16 %x) #0 {
 ; AVX-LABEL: uiffw:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call float @llvm.experimental.constrained.uitofp.f32.i16(i16 %x,
@@ -2662,12 +2662,12 @@ define float @uiffi(i32 %x) #0 {
 ; AVX1-LABEL: uiffi:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    movl %edi, %eax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: uiffi:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x,
@@ -2740,7 +2740,7 @@ define float @uiffl(i64 %x) #0 {
 ; AVX1-NEXT:    orq %rax, %rcx
 ; AVX1-NEXT:    testq %rdi, %rdi
 ; AVX1-NEXT:    cmovnsq %rdi, %rcx
-; AVX1-NEXT:    vcvtsi2ss %rcx, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rcx, %xmm15, %xmm0
 ; AVX1-NEXT:    jns .LBB52_2
 ; AVX1-NEXT:  # %bb.1:
 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
@@ -2749,7 +2749,7 @@ define float @uiffl(i64 %x) #0 {
 ;
 ; AVX512-LABEL: uiffl:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %x,
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
index 7c0386f0e784e..c31bee5ff1030 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
@@ -33,7 +33,7 @@ define half @sitofp_i1tof16(i1 %x) #0 {
 ; AVX-NEXT:    andb $1, %dil
 ; AVX-NEXT:    negb %dil
 ; AVX-NEXT:    movsbl %dil, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -45,7 +45,7 @@ define half @sitofp_i1tof16(i1 %x) #0 {
 ; X86-NEXT:    andb $1, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sitofp_i1tof16:
@@ -53,7 +53,7 @@ define half @sitofp_i1tof16(i1 %x) #0 {
 ; X64-NEXT:    andb $1, %dil
 ; X64-NEXT:    negb %dil
 ; X64-NEXT:    movsbl %dil, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.sitofp.f16.i1(i1 %x,
                                                metadata !"round.dynamic",
@@ -74,7 +74,7 @@ define half @sitofp_i8tof16(i8 %x) #0 {
 ; AVX-LABEL: sitofp_i8tof16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movsbl %dil, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -83,13 +83,13 @@ define half @sitofp_i8tof16(i8 %x) #0 {
 ; X86-LABEL: sitofp_i8tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sitofp_i8tof16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movsbl %dil, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.sitofp.f16.i8(i8 %x,
                                                metadata !"round.dynamic",
@@ -110,7 +110,7 @@ define half @sitofp_i16tof16(i16 %x) #0 {
 ; AVX-LABEL: sitofp_i16tof16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movswl %di, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -119,13 +119,13 @@ define half @sitofp_i16tof16(i16 %x) #0 {
 ; X86-LABEL: sitofp_i16tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sitofp_i16tof16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movswl %di, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %x,
                                                metadata !"round.dynamic",
@@ -144,7 +144,7 @@ define half @sitofp_i32tof16(i32 %x) #0 {
 ;
 ; AVX-LABEL: sitofp_i32tof16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -152,12 +152,12 @@ define half @sitofp_i32tof16(i32 %x) #0 {
 ;
 ; X86-LABEL: sitofp_i32tof16:
 ; X86:       # %bb.0:
-; X86-NEXT:    vcvtsi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2shl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sitofp_i32tof16:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtsi2sh %edi, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %edi, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x,
                                                metadata !"round.dynamic",
@@ -176,7 +176,7 @@ define half @sitofp_i64tof16(i64 %x) #0 {
 ;
 ; AVX-LABEL: sitofp_i64tof16:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -190,7 +190,7 @@ define half @sitofp_i64tof16(i64 %x) #0 {
 ;
 ; X64-LABEL: sitofp_i64tof16:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtsi2sh %rdi, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %rdi, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %x,
                                                metadata !"round.dynamic",
@@ -211,7 +211,7 @@ define half @uitofp_i1tof16(i1 %x) #0 {
 ; AVX-LABEL: uitofp_i1tof16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    andl $1, %edi
-; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -222,13 +222,13 @@ define half @uitofp_i1tof16(i1 %x) #0 {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andb $1, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: uitofp_i1tof16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    andl $1, %edi
-; X64-NEXT:    vcvtsi2sh %edi, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %edi, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.uitofp.f16.i1(i1 %x,
                                                metadata !"round.dynamic",
@@ -249,7 +249,7 @@ define half @uitofp_i8tof16(i8 %x) #0 {
 ; AVX-LABEL: uitofp_i8tof16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movzbl %dil, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -258,13 +258,13 @@ define half @uitofp_i8tof16(i8 %x) #0 {
 ; X86-LABEL: uitofp_i8tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: uitofp_i8tof16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.uitofp.f16.i8(i8 %x,
                                                metadata !"round.dynamic",
@@ -285,7 +285,7 @@ define half @uitofp_i16tof16(i16 %x) #0 {
 ; AVX-LABEL: uitofp_i16tof16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    movzwl %di, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -294,13 +294,13 @@ define half @uitofp_i16tof16(i16 %x) #0 {
 ; X86-LABEL: uitofp_i16tof16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X86-NEXT:    vcvtsi2sh %eax, %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: uitofp_i16tof16:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; X64-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.uitofp.f16.i16(i16 %x,
                                                metadata !"round.dynamic",
@@ -321,7 +321,7 @@ define half @uitofp_i32tof16(i32 %x) #0 {
 ; F16C-LABEL: uitofp_i32tof16:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    movl %edi, %eax
-; F16C-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; F16C-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -329,7 +329,7 @@ define half @uitofp_i32tof16(i32 %x) #0 {
 ;
 ; AVX512-LABEL: uitofp_i32tof16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -337,12 +337,12 @@ define half @uitofp_i32tof16(i32 %x) #0 {
 ;
 ; X86-LABEL: uitofp_i32tof16:
 ; X86:       # %bb.0:
-; X86-NEXT:    vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vcvtusi2shl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: uitofp_i32tof16:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtusi2sh %edi, %xmm0, %xmm0
+; X64-NEXT:    vcvtusi2sh %edi, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.uitofp.f16.i32(i32 %x,
                                                metadata !"round.dynamic",
@@ -381,7 +381,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
 ; F16C-NEXT:    orq %rax, %rcx
 ; F16C-NEXT:    testq %rdi, %rdi
 ; F16C-NEXT:    cmovnsq %rdi, %rcx
-; F16C-NEXT:    vcvtsi2ss %rcx, %xmm0, %xmm0
+; F16C-NEXT:    vcvtsi2ss %rcx, %xmm15, %xmm0
 ; F16C-NEXT:    jns .LBB9_2
 ; F16C-NEXT:  # %bb.1:
 ; F16C-NEXT:    vaddss %xmm0, %xmm0, %xmm0
@@ -393,7 +393,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
 ;
 ; AVX512-LABEL: uitofp_i64tof16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -407,7 +407,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
 ;
 ; X64-LABEL: uitofp_i64tof16:
 ; X64:       # %bb.0:
-; X64-NEXT:    vcvtusi2sh %rdi, %xmm0, %xmm0
+; X64-NEXT:    vcvtusi2sh %rdi, %xmm31, %xmm0
 ; X64-NEXT:    retq
   %result = call half @llvm.experimental.constrained.uitofp.f16.i64(i64 %x,
                                                metadata !"round.dynamic",
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
index 4933a870ddd87..f0aa3827ce937 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll
@@ -62,7 +62,7 @@ define float @sitofp_i1tof32(i1 %x) #0 {
 ; AVX-X86-NEXT:    andb $1, %al
 ; AVX-X86-NEXT:    negb %al
 ; AVX-X86-NEXT:    movsbl %al, %eax
-; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
 ; AVX-X86-NEXT:    wait
@@ -75,7 +75,7 @@ define float @sitofp_i1tof32(i1 %x) #0 {
 ; AVX-X64-NEXT:    andb $1, %dil
 ; AVX-X64-NEXT:    negb %dil
 ; AVX-X64-NEXT:    movsbl %dil, %eax
-; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i1tof32:
@@ -123,7 +123,7 @@ define float @sitofp_i8tof32(i8 %x) #0 {
 ; AVX-X86-NEXT:    pushl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 8
 ; AVX-X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
-; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
 ; AVX-X86-NEXT:    wait
@@ -134,7 +134,7 @@ define float @sitofp_i8tof32(i8 %x) #0 {
 ; AVX-X64-LABEL: sitofp_i8tof32:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    movsbl %dil, %eax
-; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i8tof32:
@@ -179,7 +179,7 @@ define float @sitofp_i16tof32(i16 %x) #0 {
 ; AVX-X86-NEXT:    pushl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 8
 ; AVX-X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
 ; AVX-X86-NEXT:    wait
@@ -190,7 +190,7 @@ define float @sitofp_i16tof32(i16 %x) #0 {
 ; AVX-X64-LABEL: sitofp_i16tof32:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    movswl %di, %eax
-; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i16tof32:
@@ -232,7 +232,7 @@ define float @sitofp_i32tof32(i32 %x) #0 {
 ; AVX-X86:       # %bb.0:
 ; AVX-X86-NEXT:    pushl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 8
-; AVX-X86-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
 ; AVX-X86-NEXT:    wait
@@ -242,7 +242,7 @@ define float @sitofp_i32tof32(i32 %x) #0 {
 ;
 ; AVX-X64-LABEL: sitofp_i32tof32:
 ; AVX-X64:       # %bb.0:
-; AVX-X64-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i32tof32:
@@ -294,7 +294,7 @@ define float @sitofp_i64tof32(i64 %x) #0 {
 ;
 ; AVX-X64-LABEL: sitofp_i64tof32:
 ; AVX-X64:       # %bb.0:
-; AVX-X64-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i64tof32:
@@ -337,7 +337,7 @@ define float @uitofp_i1tof32(i1 %x) #0 {
 ; AVX-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; AVX-X86-NEXT:    andb $1, %al
 ; AVX-X86-NEXT:    movzbl %al, %eax
-; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
 ; AVX-X86-NEXT:    wait
@@ -348,7 +348,7 @@ define float @uitofp_i1tof32(i1 %x) #0 {
 ; AVX-X64-LABEL: uitofp_i1tof32:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    andl $1, %edi
-; AVX-X64-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i1tof32:
@@ -395,7 +395,7 @@ define float @uitofp_i8tof32(i8 %x) #0 {
 ; AVX-X86-NEXT:    pushl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 8
 ; AVX-X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
 ; AVX-X86-NEXT:    wait
@@ -406,7 +406,7 @@ define float @uitofp_i8tof32(i8 %x) #0 {
 ; AVX-X64-LABEL: uitofp_i8tof32:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    movzbl %dil, %eax
-; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i8tof32:
@@ -451,7 +451,7 @@ define float @uitofp_i16tof32(i16 %x) #0 {
 ; AVX-X86-NEXT:    pushl %eax
 ; AVX-X86-NEXT:    .cfi_def_cfa_offset 8
 ; AVX-X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2ss %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX-X86-NEXT:    flds (%esp)
 ; AVX-X86-NEXT:    wait
@@ -462,7 +462,7 @@ define float @uitofp_i16tof32(i16 %x) #0 {
 ; AVX-X64-LABEL: uitofp_i16tof32:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    movzwl %di, %eax
-; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i16tof32:
@@ -534,14 +534,14 @@ define float @uitofp_i32tof32(i32 %x) #0 {
 ; AVX1-X64-LABEL: uitofp_i32tof32:
 ; AVX1-X64:       # %bb.0:
 ; AVX1-X64-NEXT:    movl %edi, %eax
-; AVX1-X64-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; AVX1-X64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX1-X64-NEXT:    retq
 ;
 ; AVX512-X86-LABEL: uitofp_i32tof32:
 ; AVX512-X86:       # %bb.0:
 ; AVX512-X86-NEXT:    pushl %eax
 ; AVX512-X86-NEXT:    .cfi_def_cfa_offset 8
-; AVX512-X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX512-X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; AVX512-X86-NEXT:    vmovss %xmm0, (%esp)
 ; AVX512-X86-NEXT:    flds (%esp)
 ; AVX512-X86-NEXT:    wait
@@ -551,7 +551,7 @@ define float @uitofp_i32tof32(i32 %x) #0 {
 ;
 ; AVX512-X64-LABEL: uitofp_i32tof32:
 ; AVX512-X64:       # %bb.0:
-; AVX512-X64-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX512-X64-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX512-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i32tof32:
@@ -656,7 +656,7 @@ define float @uitofp_i64tof32(i64 %x) #0 {
 ; AVX1-X64-NEXT:    orq %rax, %rcx
 ; AVX1-X64-NEXT:    testq %rdi, %rdi
 ; AVX1-X64-NEXT:    cmovnsq %rdi, %rcx
-; AVX1-X64-NEXT:    vcvtsi2ss %rcx, %xmm0, %xmm0
+; AVX1-X64-NEXT:    vcvtsi2ss %rcx, %xmm15, %xmm0
 ; AVX1-X64-NEXT:    jns .LBB9_2
 ; AVX1-X64-NEXT:  # %bb.1:
 ; AVX1-X64-NEXT:    vaddss %xmm0, %xmm0, %xmm0
@@ -665,7 +665,7 @@ define float @uitofp_i64tof32(i64 %x) #0 {
 ;
 ; AVX512-X64-LABEL: uitofp_i64tof32:
 ; AVX512-X64:       # %bb.0:
-; AVX512-X64-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
+; AVX512-X64-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; AVX512-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i64tof32:
@@ -733,7 +733,7 @@ define double @sitofp_i8tof64(i8 %x) #0 {
 ; AVX-X86-NEXT:    andl $-8, %esp
 ; AVX-X86-NEXT:    subl $8, %esp
 ; AVX-X86-NEXT:    movsbl 8(%ebp), %eax
-; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
 ; AVX-X86-NEXT:    wait
@@ -745,7 +745,7 @@ define double @sitofp_i8tof64(i8 %x) #0 {
 ; AVX-X64-LABEL: sitofp_i8tof64:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    movsbl %dil, %eax
-; AVX-X64-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i8tof64:
@@ -801,7 +801,7 @@ define double @sitofp_i16tof64(i16 %x) #0 {
 ; AVX-X86-NEXT:    andl $-8, %esp
 ; AVX-X86-NEXT:    subl $8, %esp
 ; AVX-X86-NEXT:    movswl 8(%ebp), %eax
-; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
 ; AVX-X86-NEXT:    wait
@@ -813,7 +813,7 @@ define double @sitofp_i16tof64(i16 %x) #0 {
 ; AVX-X64-LABEL: sitofp_i16tof64:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    movswl %di, %eax
-; AVX-X64-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i16tof64:
@@ -866,7 +866,7 @@ define double @sitofp_i32tof64(i32 %x) #0 {
 ; AVX-X86-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX-X86-NEXT:    andl $-8, %esp
 ; AVX-X86-NEXT:    subl $8, %esp
-; AVX-X86-NEXT:    vcvtsi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2sdl 8(%ebp), %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
 ; AVX-X86-NEXT:    wait
@@ -877,7 +877,7 @@ define double @sitofp_i32tof64(i32 %x) #0 {
 ;
 ; AVX-X64-LABEL: sitofp_i32tof64:
 ; AVX-X64:       # %bb.0:
-; AVX-X64-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i32tof64:
@@ -941,7 +941,7 @@ define double @sitofp_i64tof64(i64 %x) #0 {
 ;
 ; AVX-X64-LABEL: sitofp_i64tof64:
 ; AVX-X64:       # %bb.0:
-; AVX-X64-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2sd %rdi, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: sitofp_i64tof64:
@@ -995,7 +995,7 @@ define double @uitofp_i1tof64(i1 %x) #0 {
 ; AVX-X86-NEXT:    movzbl 8(%ebp), %eax
 ; AVX-X86-NEXT:    andb $1, %al
 ; AVX-X86-NEXT:    movzbl %al, %eax
-; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
 ; AVX-X86-NEXT:    wait
@@ -1007,7 +1007,7 @@ define double @uitofp_i1tof64(i1 %x) #0 {
 ; AVX-X64-LABEL: uitofp_i1tof64:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    andl $1, %edi
-; AVX-X64-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i1tof64:
@@ -1065,7 +1065,7 @@ define double @uitofp_i8tof64(i8 %x) #0 {
 ; AVX-X86-NEXT:    andl $-8, %esp
 ; AVX-X86-NEXT:    subl $8, %esp
 ; AVX-X86-NEXT:    movzbl 8(%ebp), %eax
-; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
 ; AVX-X86-NEXT:    wait
@@ -1077,7 +1077,7 @@ define double @uitofp_i8tof64(i8 %x) #0 {
 ; AVX-X64-LABEL: uitofp_i8tof64:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    movzbl %dil, %eax
-; AVX-X64-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i8tof64:
@@ -1133,7 +1133,7 @@ define double @uitofp_i16tof64(i16 %x) #0 {
 ; AVX-X86-NEXT:    andl $-8, %esp
 ; AVX-X86-NEXT:    subl $8, %esp
 ; AVX-X86-NEXT:    movzwl 8(%ebp), %eax
-; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X86-NEXT:    vcvtsi2sd %eax, %xmm7, %xmm0
 ; AVX-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX-X86-NEXT:    fldl (%esp)
 ; AVX-X86-NEXT:    wait
@@ -1145,7 +1145,7 @@ define double @uitofp_i16tof64(i16 %x) #0 {
 ; AVX-X64-LABEL: uitofp_i16tof64:
 ; AVX-X64:       # %bb.0:
 ; AVX-X64-NEXT:    movzwl %di, %eax
-; AVX-X64-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-X64-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i16tof64:
@@ -1217,7 +1217,7 @@ define double @uitofp_i32tof64(i32 %x) #0 {
 ; AVX1-X64-LABEL: uitofp_i32tof64:
 ; AVX1-X64:       # %bb.0:
 ; AVX1-X64-NEXT:    movl %edi, %eax
-; AVX1-X64-NEXT:    vcvtsi2sd %rax, %xmm0, %xmm0
+; AVX1-X64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-X64-NEXT:    retq
 ;
 ; AVX512-X86-LABEL: uitofp_i32tof64:
@@ -1229,7 +1229,7 @@ define double @uitofp_i32tof64(i32 %x) #0 {
 ; AVX512-X86-NEXT:    .cfi_def_cfa_register %ebp
 ; AVX512-X86-NEXT:    andl $-8, %esp
 ; AVX512-X86-NEXT:    subl $8, %esp
-; AVX512-X86-NEXT:    vcvtusi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX512-X86-NEXT:    vcvtusi2sdl 8(%ebp), %xmm7, %xmm0
 ; AVX512-X86-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX512-X86-NEXT:    fldl (%esp)
 ; AVX512-X86-NEXT:    wait
@@ -1240,7 +1240,7 @@ define double @uitofp_i32tof64(i32 %x) #0 {
 ;
 ; AVX512-X64-LABEL: uitofp_i32tof64:
 ; AVX512-X64:       # %bb.0:
-; AVX512-X64-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; AVX512-X64-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; AVX512-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i32tof64:
@@ -1345,7 +1345,7 @@ define double @uitofp_i64tof64(i64 %x) #0 {
 ; AVX1-X64-NEXT:    orq %rax, %rcx
 ; AVX1-X64-NEXT:    testq %rdi, %rdi
 ; AVX1-X64-NEXT:    cmovnsq %rdi, %rcx
-; AVX1-X64-NEXT:    vcvtsi2sd %rcx, %xmm0, %xmm0
+; AVX1-X64-NEXT:    vcvtsi2sd %rcx, %xmm15, %xmm0
 ; AVX1-X64-NEXT:    jns .LBB18_2
 ; AVX1-X64-NEXT:  # %bb.1:
 ; AVX1-X64-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
@@ -1354,7 +1354,7 @@ define double @uitofp_i64tof64(i64 %x) #0 {
 ;
 ; AVX512-X64-LABEL: uitofp_i64tof64:
 ; AVX512-X64:       # %bb.0:
-; AVX512-X64-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
+; AVX512-X64-NEXT:    vcvtusi2sd %rdi, %xmm15, %xmm0
 ; AVX512-X64-NEXT:    retq
 ;
 ; X87-LABEL: uitofp_i64tof64:
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
index 1ab97dafb8514..c834ddbf46f7b 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
@@ -37,7 +37,7 @@ define half @fceil32(half %f) #0 {
 ;
 ; X86-LABEL: fceil32:
 ; X86:       # %bb.0:
-; X86-NEXT:    vrndscalesh $10, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vrndscalesh $10, {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fceil32:
@@ -73,7 +73,7 @@ define half @ffloor32(half %f) #0 {
 ;
 ; X86-LABEL: ffloor32:
 ; X86:       # %bb.0:
-; X86-NEXT:    vrndscalesh $9, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vrndscalesh $9, {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ffloor32:
@@ -109,7 +109,7 @@ define half @ftrunc32(half %f) #0 {
 ;
 ; X86-LABEL: ftrunc32:
 ; X86:       # %bb.0:
-; X86-NEXT:    vrndscalesh $11, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vrndscalesh $11, {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: ftrunc32:
@@ -145,7 +145,7 @@ define half @frint32(half %f) #0 {
 ;
 ; X86-LABEL: frint32:
 ; X86:       # %bb.0:
-; X86-NEXT:    vrndscalesh $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vrndscalesh $4, {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: frint32:
@@ -182,7 +182,7 @@ define half @fnearbyint32(half %f) #0 {
 ;
 ; X86-LABEL: fnearbyint32:
 ; X86:       # %bb.0:
-; X86-NEXT:    vrndscalesh $12, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vrndscalesh $12, {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fnearbyint32:
@@ -219,7 +219,7 @@ define half @froundeven16(half %f) #0 {
 ;
 ; X86-LABEL: froundeven16:
 ; X86:       # %bb.0:
-; X86-NEXT:    vrndscalesh $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    vrndscalesh $8, {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: froundeven16:
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index da44b5ec1371e..3ed98589767fb 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -514,7 +514,7 @@ define double @trunc_signed_f64_no_fast_math(double %x) {
 ; X64-AVX1-LABEL: trunc_signed_f64_no_fast_math:
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    vcvttsd2si %xmm0, %rax
-; X64-AVX1-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; X64-AVX1-NEXT:    retq
 ;
 ; X86-AVX1-LABEL: trunc_signed_f64_no_fast_math:
@@ -695,7 +695,7 @@ define float @trunc_unsigned_f32_disable_via_intrinsic(float %x) #0 {
 ; X64-AVX1-NEXT:    vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-AVX1-NEXT:    movl $-1, %eax
 ; X64-AVX1-NEXT:    cmovbel %ecx, %eax
-; X64-AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; X64-AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; X64-AVX1-NEXT:    retq
 ;
 ; X86-AVX1-LABEL: trunc_unsigned_f32_disable_via_intrinsic:
@@ -752,7 +752,7 @@ define double @trunc_signed_f64_disable_via_intrinsic(double %x) #0 {
 ; X64-AVX1-NEXT:    xorl %eax, %eax
 ; X64-AVX1-NEXT:    vucomisd %xmm0, %xmm0
 ; X64-AVX1-NEXT:    cmovnpq %rcx, %rax
-; X64-AVX1-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; X64-AVX1-NEXT:    retq
 ;
 ; X86-AVX1-LABEL: trunc_signed_f64_disable_via_intrinsic:
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index d8686b8b2950f..b6a4a12eb0fac 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -262,7 +262,7 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 {
 ;
 ; BWON-F16C-LABEL: test_sitofp_i64:
 ; BWON-F16C:       # %bb.0:
-; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    vpextrw $0, %xmm0, (%rsi)
 ; BWON-F16C-NEXT:    retq
@@ -385,14 +385,14 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 {
 ; BWON-F16C-NEXT:    testq %rdi, %rdi
 ; BWON-F16C-NEXT:    js .LBB10_1
 ; BWON-F16C-NEXT:  # %bb.2:
-; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; BWON-F16C-NEXT:    jmp .LBB10_3
 ; BWON-F16C-NEXT:  .LBB10_1:
 ; BWON-F16C-NEXT:    movq %rdi, %rax
 ; BWON-F16C-NEXT:    shrq %rax
 ; BWON-F16C-NEXT:    andl $1, %edi
 ; BWON-F16C-NEXT:    orq %rax, %rdi
-; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; BWON-F16C-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; BWON-F16C-NEXT:  .LBB10_3:
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -843,7 +843,7 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 {
 ; BWON-F16C-LABEL: test_sitofp_fadd_i32:
 ; BWON-F16C:       # %bb.0:
 ; BWON-F16C-NEXT:    vpinsrw $0, (%rsi), %xmm0, %xmm0
-; BWON-F16C-NEXT:    vcvtsi2ss %edi, %xmm1, %xmm1
+; BWON-F16C-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm1
 ; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/isel-int-to-fp.ll b/llvm/test/CodeGen/X86/isel-int-to-fp.ll
index fc99ff95788f3..5884944e41986 100644
--- a/llvm/test/CodeGen/X86/isel-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/isel-int-to-fp.ll
@@ -33,7 +33,7 @@ define double @test_ui64_to_double(i64 %x) {
 ;
 ; AVX512-LABEL: test_ui64_to_double:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = uitofp i64 %x to double
@@ -49,7 +49,7 @@ define double @test_ui32_to_double(i32 %x) {
 ;
 ; AVX512-LABEL: test_ui32_to_double:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = uitofp i32 %x to double
@@ -64,12 +64,12 @@ define double @test_ui16_to_double(i16 zeroext %x) {
 ;
 ; SDAG-AVX512-LABEL: test_ui16_to_double:
 ; SDAG-AVX512:       # %bb.0: # %entry
-; SDAG-AVX512-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; SDAG-AVX512-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; SDAG-AVX512-NEXT:    retq
 ;
 ; GISEL-AVX512-LABEL: test_ui16_to_double:
 ; GISEL-AVX512:       # %bb.0: # %entry
-; GISEL-AVX512-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; GISEL-AVX512-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; GISEL-AVX512-NEXT:    retq
 entry:
   %conv = uitofp i16 %x to double
@@ -84,12 +84,12 @@ define double @test_ui8_to_double(i8 zeroext %x) {
 ;
 ; SDAG-AVX512-LABEL: test_ui8_to_double:
 ; SDAG-AVX512:       # %bb.0: # %entry
-; SDAG-AVX512-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; SDAG-AVX512-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; SDAG-AVX512-NEXT:    retq
 ;
 ; GISEL-AVX512-LABEL: test_ui8_to_double:
 ; GISEL-AVX512:       # %bb.0: # %entry
-; GISEL-AVX512-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; GISEL-AVX512-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; GISEL-AVX512-NEXT:    retq
 entry:
   %conv = uitofp i8 %x to double
@@ -135,7 +135,7 @@ define float @test_ui64_to_float(i64 %x) {
 ;
 ; AVX512-LABEL: test_ui64_to_float:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = uitofp i64 %x to float
@@ -151,7 +151,7 @@ define float @test_ui32_to_float(i32 %x) {
 ;
 ; AVX512-LABEL: test_ui32_to_float:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = uitofp i32 %x to float
@@ -166,12 +166,12 @@ define float @test_ui16_to_float(i16 zeroext %x) {
 ;
 ; SDAG-AVX512-LABEL: test_ui16_to_float:
 ; SDAG-AVX512:       # %bb.0: # %entry
-; SDAG-AVX512-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; SDAG-AVX512-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; SDAG-AVX512-NEXT:    retq
 ;
 ; GISEL-AVX512-LABEL: test_ui16_to_float:
 ; GISEL-AVX512:       # %bb.0: # %entry
-; GISEL-AVX512-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; GISEL-AVX512-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; GISEL-AVX512-NEXT:    retq
 entry:
   %conv = uitofp i16 %x to float
@@ -186,12 +186,12 @@ define float @test_ui8_to_float(i8 zeroext %x) {
 ;
 ; SDAG-AVX512-LABEL: test_ui8_to_float:
 ; SDAG-AVX512:       # %bb.0: # %entry
-; SDAG-AVX512-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; SDAG-AVX512-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; SDAG-AVX512-NEXT:    retq
 ;
 ; GISEL-AVX512-LABEL: test_ui8_to_float:
 ; GISEL-AVX512:       # %bb.0: # %entry
-; GISEL-AVX512-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; GISEL-AVX512-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; GISEL-AVX512-NEXT:    retq
 entry:
   %conv = uitofp i8 %x to float
@@ -206,7 +206,7 @@ define double @test_si64_to_double(i64 %x) {
 ;
 ; AVX512-LABEL: test_si64_to_double:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2sd %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i64 %x to double
@@ -221,7 +221,7 @@ define double @test_si32_to_double(i32 %x) {
 ;
 ; AVX512-LABEL: test_si32_to_double:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i32 %x to double
@@ -236,7 +236,7 @@ define double @test_si16_to_double(i16 signext %x) {
 ;
 ; AVX512-LABEL: test_si16_to_double:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i16 %x to double
@@ -251,7 +251,7 @@ define double @test_si8_to_double(i8 signext %x) {
 ;
 ; AVX512-LABEL: test_si8_to_double:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i8 %x to double
@@ -270,7 +270,7 @@ define double @test_si31_to_double(i31 %x) {
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    addl %edi, %edi
 ; AVX512-NEXT:    sarl %edi
-; AVX512-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i31 %x to double
@@ -289,7 +289,7 @@ define double @test_si33_to_double(i33 %x) {
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    shlq $31, %rdi
 ; AVX512-NEXT:    sarq $31, %rdi
-; AVX512-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2sd %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i33 %x to double
@@ -304,7 +304,7 @@ define float @test_si64_to_float(i64 %x) {
 ;
 ; AVX512-LABEL: test_si64_to_float:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i64 %x to float
@@ -319,7 +319,7 @@ define float @test_si32_to_float(i32 %x) {
 ;
 ; AVX512-LABEL: test_si32_to_float:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i32 %x to float
@@ -334,7 +334,7 @@ define float @test_si16_to_float(i16 signext %x) {
 ;
 ; AVX512-LABEL: test_si16_to_float:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i16 %x to float
@@ -349,7 +349,7 @@ define float @test_si8_to_float(i8 signext %x) {
 ;
 ; AVX512-LABEL: test_si8_to_float:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i8 %x to float
@@ -368,7 +368,7 @@ define float @test_si31_to_float(i31 %x) {
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    addl %edi, %edi
 ; AVX512-NEXT:    sarl %edi
-; AVX512-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i31 %x to float
@@ -387,7 +387,7 @@ define float @test_si33_to_float(i33 %x) {
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    shlq $31, %rdi
 ; AVX512-NEXT:    sarq $31, %rdi
-; AVX512-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %conv = sitofp i33 %x to float
diff --git a/llvm/test/CodeGen/X86/pr34080.ll b/llvm/test/CodeGen/X86/pr34080.ll
index 436b54db333b3..d07d1aaf6fc0a 100644
--- a/llvm/test/CodeGen/X86/pr34080.ll
+++ b/llvm/test/CodeGen/X86/pr34080.ll
@@ -124,7 +124,7 @@ define void @_Z1fe(x86_fp80 %z) local_unnamed_addr #0 {
 ; AVX-NEXT:    fldt 16(%rbp)
 ; AVX-NEXT:    fld %st(0)
 ; AVX-NEXT:    fisttpl -4(%rbp)
-; AVX-NEXT:    vcvtsi2sdl -4(%rbp), %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sdl -4(%rbp), %xmm15, %xmm0
 ; AVX-NEXT:    vmovsd %xmm0, -48(%rbp)
 ; AVX-NEXT:    vmovsd %xmm0, -24(%rbp)
 ; AVX-NEXT:    fsubl -24(%rbp)
@@ -132,7 +132,7 @@ define void @_Z1fe(x86_fp80 %z) local_unnamed_addr #0 {
 ; AVX-NEXT:    fmul %st, %st(1)
 ; AVX-NEXT:    fld %st(1)
 ; AVX-NEXT:    fisttpl -8(%rbp)
-; AVX-NEXT:    vcvtsi2sdl -8(%rbp), %xmm1, %xmm0
+; AVX-NEXT:    vcvtsi2sdl -8(%rbp), %xmm15, %xmm0
 ; AVX-NEXT:    vmovsd %xmm0, -40(%rbp)
 ; AVX-NEXT:    vmovsd %xmm0, -16(%rbp)
 ; AVX-NEXT:    fxch %st(1)
diff --git a/llvm/test/CodeGen/X86/pr37879.ll b/llvm/test/CodeGen/X86/pr37879.ll
index 60ca7c5b6d22b..34cbccca2867b 100644
--- a/llvm/test/CodeGen/X86/pr37879.ll
+++ b/llvm/test/CodeGen/X86/pr37879.ll
@@ -5,7 +5,7 @@ define double @foo(ptr nocapture readonly) #0 {
 ; CHECK-LABEL: foo:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movq (%rax), %rax
-; CHECK-NEXT:    vcvtsi2sd %rax, %xmm0, %xmm1
+; CHECK-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
diff --git a/llvm/test/CodeGen/X86/pr38803.ll b/llvm/test/CodeGen/X86/pr38803.ll
index ebac8121df913..3efe9f8dfa55d 100644
--- a/llvm/test/CodeGen/X86/pr38803.ll
+++ b/llvm/test/CodeGen/X86/pr38803.ll
@@ -17,7 +17,7 @@ define dso_local float @_Z3fn2v() {
 ; CHECK-NEXT:    cmpl $0, c(%rip)
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
-; CHECK-NEXT:    vcvtsi2ssl b(%rip), %xmm1, %xmm1
+; CHECK-NEXT:    vcvtsi2ssl b(%rip), %xmm15, %xmm1
 ; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vmovss %xmm2, %xmm1, %xmm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll
index 04d6d1fdb1418..948449c68b3f0 100644
--- a/llvm/test/CodeGen/X86/rounding-ops.ll
+++ b/llvm/test/CodeGen/X86/rounding-ops.ll
@@ -221,12 +221,12 @@ define float @test11(ptr %xptr) nounwind optsize {
 ;
 ; CHECK-AVX-LABEL: test11:
 ; CHECK-AVX:       ## %bb.0:
-; CHECK-AVX-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX-NEXT:    vroundss $11, (%rdi), %xmm15, %xmm0
 ; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test11:
 ; CHECK-AVX512:       ## %bb.0:
-; CHECK-AVX512-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    vroundss $11, (%rdi), %xmm15, %xmm0
 ; CHECK-AVX512-NEXT:    retq
   %x = load float, ptr %xptr
   %call = tail call float @truncf(float %x) nounwind readnone
@@ -241,12 +241,12 @@ define double @test12(ptr %xptr) nounwind optsize {
 ;
 ; CHECK-AVX-LABEL: test12:
 ; CHECK-AVX:       ## %bb.0:
-; CHECK-AVX-NEXT:    vroundsd $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX-NEXT:    vroundsd $11, (%rdi), %xmm15, %xmm0
 ; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test12:
 ; CHECK-AVX512:       ## %bb.0:
-; CHECK-AVX512-NEXT:    vroundsd $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    vroundsd $11, (%rdi), %xmm15, %xmm0
 ; CHECK-AVX512-NEXT:    retq
   %x = load double, ptr %xptr
   %call = tail call double @trunc(double %x) nounwind readnone
@@ -261,12 +261,12 @@ define float @test11_pgso(ptr %xptr) nounwind !prof !14 {
 ;
 ; CHECK-AVX-LABEL: test11_pgso:
 ; CHECK-AVX:       ## %bb.0:
-; CHECK-AVX-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX-NEXT:    vroundss $11, (%rdi), %xmm15, %xmm0
 ; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test11_pgso:
 ; CHECK-AVX512:       ## %bb.0:
-; CHECK-AVX512-NEXT:    vroundss $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    vroundss $11, (%rdi), %xmm15, %xmm0
 ; CHECK-AVX512-NEXT:    retq
   %x = load float, ptr %xptr
   %call = tail call float @truncf(float %x) nounwind readnone
@@ -281,12 +281,12 @@ define double @test12_pgso(ptr %xptr) nounwind !prof !14 {
 ;
 ; CHECK-AVX-LABEL: test12_pgso:
 ; CHECK-AVX:       ## %bb.0:
-; CHECK-AVX-NEXT:    vroundsd $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX-NEXT:    vroundsd $11, (%rdi), %xmm15, %xmm0
 ; CHECK-AVX-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test12_pgso:
 ; CHECK-AVX512:       ## %bb.0:
-; CHECK-AVX512-NEXT:    vroundsd $11, (%rdi), %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    vroundsd $11, (%rdi), %xmm15, %xmm0
 ; CHECK-AVX512-NEXT:    retq
   %x = load double, ptr %xptr
   %call = tail call double @trunc(double %x) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
index 0757d30296e24..43c1a84f7cd6c 100644
--- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -18,7 +18,7 @@ define float @u32_to_f(i32 %a) nounwind {
 ; AVX512_32-LABEL: u32_to_f:
 ; AVX512_32:       # %bb.0:
 ; AVX512_32-NEXT:    pushl %eax
-; AVX512_32-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX512_32-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; AVX512_32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX512_32-NEXT:    flds (%esp)
 ; AVX512_32-NEXT:    popl %eax
@@ -26,7 +26,7 @@ define float @u32_to_f(i32 %a) nounwind {
 ;
 ; AVX512_64-LABEL: u32_to_f:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; SSE2_32-LABEL: u32_to_f:
@@ -84,7 +84,7 @@ define float @s32_to_f(i32 %a) nounwind {
 ; AVX512_32-LABEL: s32_to_f:
 ; AVX512_32:       # %bb.0:
 ; AVX512_32-NEXT:    pushl %eax
-; AVX512_32-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX512_32-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0
 ; AVX512_32-NEXT:    vmovss %xmm0, (%esp)
 ; AVX512_32-NEXT:    flds (%esp)
 ; AVX512_32-NEXT:    popl %eax
@@ -92,7 +92,7 @@ define float @s32_to_f(i32 %a) nounwind {
 ;
 ; AVX512_64-LABEL: s32_to_f:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; SSE_32-LABEL: s32_to_f:
@@ -128,7 +128,7 @@ define double @u32_to_d(i32 %a) nounwind {
 ; AVX512_32-NEXT:    movl %esp, %ebp
 ; AVX512_32-NEXT:    andl $-8, %esp
 ; AVX512_32-NEXT:    subl $8, %esp
-; AVX512_32-NEXT:    vcvtusi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX512_32-NEXT:    vcvtusi2sdl 8(%ebp), %xmm7, %xmm0
 ; AVX512_32-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX512_32-NEXT:    fldl (%esp)
 ; AVX512_32-NEXT:    movl %ebp, %esp
@@ -137,7 +137,7 @@ define double @u32_to_d(i32 %a) nounwind {
 ;
 ; AVX512_64-LABEL: u32_to_d:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; SSE2_32-LABEL: u32_to_d:
@@ -199,7 +199,7 @@ define double @s32_to_d(i32 %a) nounwind {
 ; AVX512_32-NEXT:    movl %esp, %ebp
 ; AVX512_32-NEXT:    andl $-8, %esp
 ; AVX512_32-NEXT:    subl $8, %esp
-; AVX512_32-NEXT:    vcvtsi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX512_32-NEXT:    vcvtsi2sdl 8(%ebp), %xmm7, %xmm0
 ; AVX512_32-NEXT:    vmovsd %xmm0, (%esp)
 ; AVX512_32-NEXT:    fldl (%esp)
 ; AVX512_32-NEXT:    movl %ebp, %esp
@@ -208,7 +208,7 @@ define double @s32_to_d(i32 %a) nounwind {
 ;
 ; AVX512_64-LABEL: s32_to_d:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; SSE2_32-LABEL: s32_to_d:
@@ -308,7 +308,7 @@ define float @u64_to_f(i64 %a) nounwind {
 ;
 ; AVX512_64-LABEL: u64_to_f:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; AVX512DQ_32-LABEL: u64_to_f:
@@ -437,7 +437,7 @@ define float @s64_to_f(i64 %a) nounwind {
 ;
 ; AVX512_64-LABEL: s64_to_f:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; AVX512DQ_32-LABEL: s64_to_f:
@@ -502,7 +502,7 @@ define float @s64_to_f_2(i64 %a) nounwind {
 ; AVX512_64-LABEL: s64_to_f_2:
 ; AVX512_64:       # %bb.0:
 ; AVX512_64-NEXT:    addq $5, %rdi
-; AVX512_64-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; AVX512DQ_32-LABEL: s64_to_f_2:
@@ -626,7 +626,7 @@ define double @u64_to_d(i64 %a) nounwind {
 ;
 ; AVX512_64-LABEL: u64_to_d:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtusi2sd %rdi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; AVX512DQ_32-LABEL: u64_to_d:
@@ -748,7 +748,7 @@ define double @u64_to_d_optsize(i64 %a) nounwind optsize {
 ;
 ; AVX512_64-LABEL: u64_to_d_optsize:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtusi2sd %rdi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; AVX512DQ_32-LABEL: u64_to_d_optsize:
@@ -869,7 +869,7 @@ define double @s64_to_d(i64 %a) nounwind {
 ;
 ; AVX512_64-LABEL: s64_to_d:
 ; AVX512_64:       # %bb.0:
-; AVX512_64-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtsi2sd %rdi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; AVX512DQ_32-LABEL: s64_to_d:
@@ -955,7 +955,7 @@ define double @s64_to_d_2(i64 %a) nounwind {
 ; AVX512_64-LABEL: s64_to_d_2:
 ; AVX512_64:       # %bb.0:
 ; AVX512_64-NEXT:    addq $5, %rdi
-; AVX512_64-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    vcvtsi2sd %rdi, %xmm15, %xmm0
 ; AVX512_64-NEXT:    retq
 ;
 ; AVX512DQ_32-LABEL: s64_to_d_2:
diff --git a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
index 28b405799dfd0..b64bfb38c7a5a 100644
--- a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll
@@ -17,13 +17,13 @@ define float @uint8ToFloat(i8 %int8) {
 ; CHECK-NO_FP16-LABEL: uint8ToFloat:
 ; CHECK-NO_FP16:       # %bb.0:
 ; CHECK-NO_FP16-NEXT:    movzbl %dil, %eax
-; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO_FP16-NEXT:    retq
 ;
 ; CHECK-WITH_FP16-LABEL: uint8ToFloat:
 ; CHECK-WITH_FP16:       # %bb.0:
 ; CHECK-WITH_FP16-NEXT:    movzbl %dil, %eax
-; CHECK-WITH_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-WITH_FP16-NEXT:    retq
     %fp32 = uitofp i8 %int8 to float
     ret float %fp32
@@ -62,14 +62,14 @@ define half @uint8ToHalf(i8 %int8) {
 ; CHECK-NO_FP16-LABEL: uint8ToHalf:
 ; CHECK-NO_FP16:       # %bb.0:
 ; CHECK-NO_FP16-NEXT:    movzbl %dil, %eax
-; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO_FP16-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO_FP16-NEXT:    retq
 ;
 ; CHECK-WITH_FP16-LABEL: uint8ToHalf:
 ; CHECK-WITH_FP16:       # %bb.0:
 ; CHECK-WITH_FP16-NEXT:    movzbl %dil, %eax
-; CHECK-WITH_FP16-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; CHECK-WITH_FP16-NEXT:    retq
     %fp32 = uitofp i8 %int8 to half
     ret half %fp32
@@ -111,13 +111,13 @@ define float @sint8ToFloat(i8 %int8) {
 ; CHECK-NO_FP16-LABEL: sint8ToFloat:
 ; CHECK-NO_FP16:       # %bb.0:
 ; CHECK-NO_FP16-NEXT:    movsbl %dil, %eax
-; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO_FP16-NEXT:    retq
 ;
 ; CHECK-WITH_FP16-LABEL: sint8ToFloat:
 ; CHECK-WITH_FP16:       # %bb.0:
 ; CHECK-WITH_FP16-NEXT:    movsbl %dil, %eax
-; CHECK-WITH_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-WITH_FP16-NEXT:    retq
     %fp32 = sitofp i8 %int8 to float
     ret float %fp32
@@ -143,14 +143,14 @@ define half @sint8ToHalf(i8 %int8) {
 ; CHECK-NO_FP16-LABEL: sint8ToHalf:
 ; CHECK-NO_FP16:       # %bb.0:
 ; CHECK-NO_FP16-NEXT:    movsbl %dil, %eax
-; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO_FP16-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO_FP16-NEXT:    retq
 ;
 ; CHECK-WITH_FP16-LABEL: sint8ToHalf:
 ; CHECK-WITH_FP16:       # %bb.0:
 ; CHECK-WITH_FP16-NEXT:    movsbl %dil, %eax
-; CHECK-WITH_FP16-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; CHECK-WITH_FP16-NEXT:    retq
     %fp32 = sitofp i8 %int8 to half
     ret half %fp32
@@ -184,13 +184,13 @@ define float @uint16ToFloat(i16 %int16) {
 ; CHECK-NO_FP16-LABEL: uint16ToFloat:
 ; CHECK-NO_FP16:       # %bb.0:
 ; CHECK-NO_FP16-NEXT:    movzwl %di, %eax
-; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO_FP16-NEXT:    retq
 ;
 ; CHECK-WITH_FP16-LABEL: uint16ToFloat:
 ; CHECK-WITH_FP16:       # %bb.0:
 ; CHECK-WITH_FP16-NEXT:    movzwl %di, %eax
-; CHECK-WITH_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-WITH_FP16-NEXT:    retq
     %fp32 = uitofp i16 %int16 to float
     ret float %fp32
@@ -216,14 +216,14 @@ define half @uint16ToHalf(i16 %int16) {
 ; CHECK-NO_FP16-LABEL: uint16ToHalf:
 ; CHECK-NO_FP16:       # %bb.0:
 ; CHECK-NO_FP16-NEXT:    movzwl %di, %eax
-; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO_FP16-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO_FP16-NEXT:    retq
 ;
 ; CHECK-WITH_FP16-LABEL: uint16ToHalf:
 ; CHECK-WITH_FP16:       # %bb.0:
 ; CHECK-WITH_FP16-NEXT:    movzwl %di, %eax
-; CHECK-WITH_FP16-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; CHECK-WITH_FP16-NEXT:    retq
     %fp32 = uitofp i16 %int16 to half
     ret half %fp32
@@ -249,13 +249,13 @@ define float @sint16ToFloat(i16 %int16) {
 ; CHECK-NO_FP16-LABEL: sint16ToFloat:
 ; CHECK-NO_FP16:       # %bb.0:
 ; CHECK-NO_FP16-NEXT:    movswl %di, %eax
-; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO_FP16-NEXT:    retq
 ;
 ; CHECK-WITH_FP16-LABEL: sint16ToFloat:
 ; CHECK-WITH_FP16:       # %bb.0:
 ; CHECK-WITH_FP16-NEXT:    movswl %di, %eax
-; CHECK-WITH_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-WITH_FP16-NEXT:    retq
     %fp32 = sitofp i16 %int16 to float
     ret float %fp32
@@ -281,14 +281,14 @@ define half @sint16ToHalf(i16 %int16) {
 ; CHECK-NO_FP16-LABEL: sint16ToHalf:
 ; CHECK-NO_FP16:       # %bb.0:
 ; CHECK-NO_FP16-NEXT:    movswl %di, %eax
-; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; CHECK-NO_FP16-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; CHECK-NO_FP16-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO_FP16-NEXT:    retq
 ;
 ; CHECK-WITH_FP16-LABEL: sint16ToHalf:
 ; CHECK-WITH_FP16:       # %bb.0:
 ; CHECK-WITH_FP16-NEXT:    movswl %di, %eax
-; CHECK-WITH_FP16-NEXT:    vcvtsi2sh %eax, %xmm0, %xmm0
+; CHECK-WITH_FP16-NEXT:    vcvtsi2sh %eax, %xmm31, %xmm0
 ; CHECK-WITH_FP16-NEXT:    retq
     %fp32 = sitofp i16 %int16 to half
     ret half %fp32
diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
index 8a6c2f851a6d6..c8e31f7088a45 100644
--- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
+++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll
@@ -1,21 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
 
 define <4 x i64> @autogen_SD88863() {
-; CHECK-LABEL: autogen_SD88863:
-; CHECK:       # %bb.0: # %BB
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
-; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_1: # %CF
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    jne .LBB0_1
-; CHECK-NEXT:  # %bb.2: # %CF240
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: autogen_SD88863:
+; X86:       # %bb.0: # %BB
+; X86-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm7[0,1]
+; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    .p2align 4
+; X86-NEXT:  .LBB0_1: # %CF
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    jne .LBB0_1
+; X86-NEXT:  # %bb.2: # %CF240
+; X86-NEXT:    retl
+;
+; X64-LABEL: autogen_SD88863:
+; X64:       # %bb.0: # %BB
+; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm15[0,1]
+; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3]
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    .p2align 4
+; X64-NEXT:  .LBB0_1: # %CF
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    jne .LBB0_1
+; X64-NEXT:  # %bb.2: # %CF240
+; X64-NEXT:    retq
 BB:
   %I26 = insertelement <4 x i64> undef, i64 undef, i32 2
   br label %CF
diff --git a/llvm/test/CodeGen/X86/sse-cvttp2si.ll b/llvm/test/CodeGen/X86/sse-cvttp2si.ll
index d08cf120bb4b6..09b1d0f8b87db 100644
--- a/llvm/test/CodeGen/X86/sse-cvttp2si.ll
+++ b/llvm/test/CodeGen/X86/sse-cvttp2si.ll
@@ -23,7 +23,7 @@ define float @float_to_int_to_float_mem_f32_i32(ptr %p) #0 {
 ; AVX-LABEL: float_to_int_to_float_mem_f32_i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttss2si (%rdi), %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
   %x = load <4 x float>, ptr %p, align 16
   %fptosi = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %x)
@@ -42,7 +42,7 @@ define float @float_to_int_to_float_reg_f32_i32(<4 x float> %x) #0 {
 ; AVX-LABEL: float_to_int_to_float_reg_f32_i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttss2si %xmm0, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm1, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
   %fptosi = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %x)
   %sitofp = sitofp i32 %fptosi to float
@@ -59,7 +59,7 @@ define float @float_to_int_to_float_mem_f32_i64(ptr %p) #0 {
 ; AVX-LABEL: float_to_int_to_float_mem_f32_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttss2si (%rdi), %rax
-; AVX-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
   %x = load <4 x float>, ptr %p, align 16
   %fptosi = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %x)
@@ -78,7 +78,7 @@ define float @float_to_int_to_float_reg_f32_i64(<4 x float> %x) #0 {
 ; AVX-LABEL: float_to_int_to_float_reg_f32_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttss2si %xmm0, %rax
-; AVX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm0
+; AVX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
   %fptosi = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %x)
   %sitofp = sitofp i64 %fptosi to float
@@ -95,7 +95,7 @@ define double @float_to_int_to_float_mem_f64_i32(ptr %p) #0 {
 ; AVX-LABEL: float_to_int_to_float_mem_f64_i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttsd2si (%rdi), %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
   %x = load <2 x double>, ptr %p, align 16
   %fptosi = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %x)
@@ -114,7 +114,7 @@ define double @float_to_int_to_float_reg_f64_i32(<2 x double> %x) #0 {
 ; AVX-LABEL: float_to_int_to_float_reg_f64_i32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttsd2si %xmm0, %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm1, %xmm0
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
   %fptosi = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %x)
   %sitofp = sitofp i32 %fptosi to double
@@ -131,7 +131,7 @@ define double @float_to_int_to_float_mem_f64_i64(ptr %p) #0 {
 ; AVX-LABEL: float_to_int_to_float_mem_f64_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttsd2si (%rdi), %rax
-; AVX-NEXT:    vcvtsi2sd %rax, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
   %x = load <2 x double>, ptr %p, align 16
   %fptosi = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %x)
@@ -150,7 +150,7 @@ define double @float_to_int_to_float_reg_f64_i64(<2 x double> %x) #0 {
 ; AVX-LABEL: float_to_int_to_float_reg_f64_i64:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
+; AVX-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX-NEXT:    retq
   %fptosi = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %x)
   %sitofp = sitofp i64 %fptosi to double
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index f6b0df153c260..6dd75c8c09ce5 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -782,7 +782,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load_optsize(<2 x double> %a0, ptr %
 ; X86-AVX1-LABEL: test_x86_sse2_cvtss2sd_load_optsize:
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT:    vcvtss2sd (%eax), %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0x08]
+; X86-AVX1-NEXT:    vcvtss2sd (%eax), %xmm7, %xmm1 ## encoding: [0xc5,0xc2,0x5a,0x08]
 ; X86-AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
 ; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
@@ -790,7 +790,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load_optsize(<2 x double> %a0, ptr %
 ; X86-AVX512-LABEL: test_x86_sse2_cvtss2sd_load_optsize:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT:    vcvtss2sd (%eax), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0x08]
+; X86-AVX512-NEXT:    vcvtss2sd (%eax), %xmm7, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xc2,0x5a,0x08]
 ; X86-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
 ; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
@@ -804,14 +804,14 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load_optsize(<2 x double> %a0, ptr %
 ;
 ; X64-AVX1-LABEL: test_x86_sse2_cvtss2sd_load_optsize:
 ; X64-AVX1:       ## %bb.0:
-; X64-AVX1-NEXT:    vcvtss2sd (%rdi), %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0x0f]
+; X64-AVX1-NEXT:    vcvtss2sd (%rdi), %xmm15, %xmm1 ## encoding: [0xc5,0x82,0x5a,0x0f]
 ; X64-AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
 ; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: test_x86_sse2_cvtss2sd_load_optsize:
 ; X64-AVX512:       ## %bb.0:
-; X64-AVX512-NEXT:    vcvtss2sd (%rdi), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0x0f]
+; X64-AVX512-NEXT:    vcvtss2sd (%rdi), %xmm15, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0x82,0x5a,0x0f]
 ; X64-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1]
 ; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
index 6625cc4f07a27..d7404c9e7c7da 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -990,7 +990,7 @@ define double @stack_fold_cvtsi2sd(i32 %a0) {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
@@ -1034,7 +1034,7 @@ define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
@@ -1080,7 +1080,7 @@ define double @stack_fold_cvtsi642sd(i64 %a0) {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT:    vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
@@ -1124,7 +1124,7 @@ define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT:    vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
@@ -1170,7 +1170,7 @@ define float @stack_fold_cvtsi2ss(i32 %a0) {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
@@ -1214,7 +1214,7 @@ define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    popq %rbx
@@ -1261,7 +1261,7 @@ define float @stack_fold_cvtsi642ss(i64 %a0) {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT:    vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
@@ -1305,7 +1305,7 @@ define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT:    vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    popq %rbx
@@ -2861,8 +2861,8 @@ define double @stack_fold_roundsd(double %a0) optsize {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT:    vxorps %xmm15, %xmm15, %xmm15
+; CHECK-NEXT:    vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call double @llvm.floor.f64(double %a0)
@@ -2876,7 +2876,7 @@ define double @stack_fold_roundsd_minsize(double %a0) minsize {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT:    vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call double @llvm.floor.f64(double %a0)
@@ -2908,8 +2908,8 @@ define float @stack_fold_roundss(float %a0) optsize {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vroundss $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    vxorps %xmm15, %xmm15, %xmm15
+; CHECK-NEXT:    vroundss $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call float @llvm.floor.f32(float %a0)
@@ -3106,8 +3106,8 @@ define double @stack_fold_sqrtsd(double %a0) optsize {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vsqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload
+; CHECK-NEXT:    vxorps %xmm15, %xmm15, %xmm15
+; CHECK-NEXT:    vsqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call double @llvm.sqrt.f64(double %a0)
@@ -3124,8 +3124,8 @@ define float @stack_fold_sqrtss(float %a0) optsize {
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vsqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    vxorps %xmm15, %xmm15, %xmm15
+; CHECK-NEXT:    vsqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call float @llvm.sqrt.f32(float %a0)
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
index 7c788d291a5c7..cd4ceca6716b1 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll
@@ -227,9 +227,9 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; AVX-64-LABEL: sitofp_v2i64_v2f32:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX-64-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX-64-NEXT:    vmovq %xmm0, %rax
-; AVX-64-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX-64-NEXT:    retq
 ;
@@ -246,9 +246,9 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; AVX512DQ-64-LABEL: sitofp_v2i64_v2f32:
 ; AVX512DQ-64:       # %bb.0:
 ; AVX512DQ-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512DQ-64-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512DQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512DQ-64-NEXT:    vmovq %xmm0, %rax
-; AVX512DQ-64-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX512DQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512DQ-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512DQ-64-NEXT:    retq
 ;
@@ -439,9 +439,9 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; AVX1-64-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX1-64-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
 ; AVX1-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-64-NEXT:    vmovq %xmm1, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
 ; AVX1-64-NEXT:    vaddps %xmm1, %xmm1, %xmm2
 ; AVX1-64-NEXT:    vpxor %xmm3, %xmm3, %xmm3
@@ -453,18 +453,18 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; AVX512F-64-LABEL: uitofp_v2i64_v2f32:
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512F-64-NEXT:    vmovq %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512F-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512F-64-NEXT:    retq
 ;
 ; AVX512VL-64-LABEL: uitofp_v2i64_v2f32:
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-64-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512VL-64-NEXT:    retq
 ;
@@ -481,9 +481,9 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 {
 ; AVX512DQ-64-LABEL: uitofp_v2i64_v2f32:
 ; AVX512DQ-64:       # %bb.0:
 ; AVX512DQ-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512DQ-64-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512DQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512DQ-64-NEXT:    vmovq %xmm0, %rax
-; AVX512DQ-64-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512DQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512DQ-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512DQ-64-NEXT:    retq
 ;
@@ -1237,9 +1237,9 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 {
 ; AVX-64-LABEL: sitofp_v2i64_v2f64:
 ; AVX-64:       # %bb.0:
 ; AVX-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX-64-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX-64-NEXT:    vmovq %xmm0, %rax
-; AVX-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-64-NEXT:    retq
 ;
@@ -1439,7 +1439,7 @@ define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 {
 ; AVX1-64-NEXT:    orq %rcx, %rdx
 ; AVX1-64-NEXT:    testq %rax, %rax
 ; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2sd %rdx, %xmm1, %xmm1
+; AVX1-64-NEXT:    vcvtsi2sd %rdx, %xmm15, %xmm1
 ; AVX1-64-NEXT:    jns .LBB21_2
 ; AVX1-64-NEXT:  # %bb.1:
 ; AVX1-64-NEXT:    vaddsd %xmm1, %xmm1, %xmm1
@@ -1452,7 +1452,7 @@ define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 {
 ; AVX1-64-NEXT:    orq %rcx, %rdx
 ; AVX1-64-NEXT:    testq %rax, %rax
 ; AVX1-64-NEXT:    cmovnsq %rax, %rdx
-; AVX1-64-NEXT:    vcvtsi2sd %rdx, %xmm2, %xmm0
+; AVX1-64-NEXT:    vcvtsi2sd %rdx, %xmm15, %xmm0
 ; AVX1-64-NEXT:    jns .LBB21_4
 ; AVX1-64-NEXT:  # %bb.3:
 ; AVX1-64-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
@@ -1463,18 +1463,18 @@ define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 {
 ; AVX512F-64-LABEL: uitofp_v2i64_v2f64:
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm1, %xmm1
+; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; AVX512F-64-NEXT:    vmovq %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm2, %xmm0
+; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; AVX512F-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-64-NEXT:    retq
 ;
 ; AVX512VL-64-LABEL: uitofp_v2i64_v2f64:
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm1, %xmm1
+; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; AVX512VL-64-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm2, %xmm0
+; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; AVX512VL-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index a336d0a01fa7b..f790377f3331a 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -670,14 +670,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX1-64:       # %bb.0:
 ; AVX1-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-64-NEXT:    vmovq %xmm1, %rax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-64-NEXT:    vmovq %xmm0, %rax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX1-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-64-NEXT:    retq
@@ -686,14 +686,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX2-64:       # %bb.0:
 ; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX2-64-NEXT:    vmovq %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX2-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX2-64-NEXT:    vmovq %xmm0, %rax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX2-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-64-NEXT:    retq
@@ -702,14 +702,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512F-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512F-64-NEXT:    vmovq %xmm1, %rax
-; AVX512F-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512F-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512F-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512F-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512F-64-NEXT:    vmovq %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512F-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512F-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512F-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-64-NEXT:    retq
@@ -718,14 +718,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512VL-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512VL-64-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512VL-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512VL-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512VL-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512VL-64-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512VL-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512VL-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-64-NEXT:    retq
@@ -802,26 +802,26 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX1-64:       # %bb.0:
 ; AVX1-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-64-NEXT:    vpextrd $2, %xmm1, %eax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-64-NEXT:    vmovd %xmm1, %eax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX1-64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX1-64-NEXT:    vextractps $2, %xmm0, %eax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX1-64-NEXT:    vmovq %xmm0, %rax
 ; AVX1-64-NEXT:    movl %eax, %eax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm4
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; AVX1-64-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0]
 ; AVX1-64-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-64-NEXT:    vpextrd $3, %xmm1, %eax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX1-64-NEXT:    vpextrd $1, %xmm1, %eax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm1
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; AVX1-64-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX1-64-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm0
+; AVX1-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; AVX1-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-64-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -832,28 +832,28 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX2-64:       # %bb.0:
 ; AVX2-64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX2-64-NEXT:    vextractps $3, %xmm1, %eax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX2-64-NEXT:    vextractps $1, %xmm1, %eax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX2-64-NEXT:    vextractps $3, %xmm0, %eax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX2-64-NEXT:    vextractps $1, %xmm0, %eax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm4
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0]
 ; AVX2-64-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX2-64-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9]
 ; AVX2-64-NEXT:    vmulpd %ymm3, %ymm2, %ymm2
 ; AVX2-64-NEXT:    vextractps $2, %xmm1, %eax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX2-64-NEXT:    vmovd %xmm1, %eax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm1
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; AVX2-64-NEXT:    vextractps $2, %xmm0, %eax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX2-64-NEXT:    vmovq %xmm0, %rax
 ; AVX2-64-NEXT:    movl %eax, %eax
-; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm0
+; AVX2-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX2-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; AVX2-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-64-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
@@ -863,14 +863,14 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm2, %xmm2
+; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; AVX512F-64-NEXT:    vmovq %xmm1, %rax
-; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm1
+; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; AVX512F-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm2
+; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; AVX512F-64-NEXT:    vmovq %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm0
+; AVX512F-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; AVX512F-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512F-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-64-NEXT:    retq
@@ -879,14 +879,14 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm2, %xmm2
+; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; AVX512VL-64-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm1
+; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; AVX512VL-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm2
+; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; AVX512VL-64-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm0
+; AVX512VL-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; AVX512VL-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512VL-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-64-NEXT:    retq
@@ -947,16 +947,16 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX1-64-LABEL: sitofp_v4i64_v4f32:
 ; AVX1-64:       # %bb.0:
 ; AVX1-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-64-NEXT:    vmovq %xmm0, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX1-64-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-64-NEXT:    vmovq %xmm0, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX1-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX1-64-NEXT:    vzeroupper
 ; AVX1-64-NEXT:    retq
@@ -964,16 +964,16 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX2-64-LABEL: sitofp_v4i64_v4f32:
 ; AVX2-64:       # %bb.0:
 ; AVX2-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX2-64-NEXT:    vmovq %xmm0, %rax
-; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-64-NEXT:    vmovq %xmm0, %rax
-; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX2-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX2-64-NEXT:    vzeroupper
 ; AVX2-64-NEXT:    retq
@@ -981,16 +981,16 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX512F-64-LABEL: sitofp_v4i64_v4f32:
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512F-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512F-64-NEXT:    vmovq %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512F-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512F-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512F-64-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-64-NEXT:    vmovq %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512F-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512F-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512F-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512F-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512F-64-NEXT:    vzeroupper
 ; AVX512F-64-NEXT:    retq
@@ -998,16 +998,16 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX512VL-64-LABEL: sitofp_v4i64_v4f32:
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512VL-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-64-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512VL-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512VL-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512VL-64-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512VL-64-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512VL-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512VL-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512VL-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512VL-64-NEXT:    vzeroupper
 ; AVX512VL-64-NEXT:    retq
@@ -1092,16 +1092,16 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX1-64-NEXT:    vorpd %ymm3, %ymm1, %ymm1
 ; AVX1-64-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
 ; AVX1-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX1-64-NEXT:    vmovq %xmm1, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; AVX1-64-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-64-NEXT:    vmovq %xmm1, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; AVX1-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
+; AVX1-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
 ; AVX1-64-NEXT:    vaddps %xmm1, %xmm1, %xmm3
 ; AVX1-64-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
@@ -1117,16 +1117,16 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX2-64-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX2-64-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
 ; AVX2-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-64-NEXT:    vmovq %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; AVX2-64-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-64-NEXT:    vmovq %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; AVX2-64-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm1
+; AVX2-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX2-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
 ; AVX2-64-NEXT:    vaddps %xmm1, %xmm1, %xmm2
 ; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm3
@@ -1138,16 +1138,16 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX512F-64-LABEL: uitofp_v4i64_v4f32:
 ; AVX512F-64:       # %bb.0:
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512F-64-NEXT:    vmovq %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512F-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512F-64-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-64-NEXT:    vmovq %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512F-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512F-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512F-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512F-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512F-64-NEXT:    vzeroupper
 ; AVX512F-64-NEXT:    retq
@@ -1155,16 +1155,16 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
 ; AVX512VL-64-LABEL: uitofp_v4i64_v4f32:
 ; AVX512VL-64:       # %bb.0:
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-64-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512VL-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512VL-64-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512VL-64-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512VL-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512VL-64-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512VL-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512VL-64-NEXT:    vzeroupper
 ; AVX512VL-64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
index 0cf945202a2d4..59294dd17fbca 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
@@ -323,27 +323,27 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 {
 ; NODQ-64:       # %bb.0:
 ; NODQ-64-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; NODQ-64-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vmovq %xmm1, %rax
-; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; NODQ-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; NODQ-64-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
 ; NODQ-64-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
+; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vmovq %xmm2, %rax
-; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
+; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-64-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; NODQ-64-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; NODQ-64-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vmovq %xmm2, %rax
-; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
+; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-64-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vmovq %xmm0, %rax
-; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm0
+; NODQ-64-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; NODQ-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; NODQ-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; NODQ-64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -452,27 +452,27 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
 ; NODQ-64:       # %bb.0:
 ; NODQ-64-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
 ; NODQ-64-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm2, %xmm2
+; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vmovq %xmm1, %rax
-; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm1
+; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; NODQ-64-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; NODQ-64-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
 ; NODQ-64-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm3
+; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vmovq %xmm2, %rax
-; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm4, %xmm2
+; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-64-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; NODQ-64-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; NODQ-64-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm4, %xmm3
+; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vmovq %xmm2, %rax
-; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm4, %xmm2
+; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
 ; NODQ-64-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm4, %xmm3
+; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vmovq %xmm0, %rax
-; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm4, %xmm0
+; NODQ-64-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; NODQ-64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; NODQ-64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; NODQ-64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -548,28 +548,28 @@ define <8 x float> @sitofp_v8i64_v8f32(<8 x i64> %x) #0 {
 ; NODQ-64:       # %bb.0:
 ; NODQ-64-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
 ; NODQ-64-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vmovq %xmm1, %rax
-; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; NODQ-64-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NODQ-64-NEXT:    vmovq %xmm2, %rax
-; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
+; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
 ; NODQ-64-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
+; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; NODQ-64-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
+; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vmovq %xmm0, %rax
-; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; NODQ-64-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-64-NEXT:    vmovq %xmm0, %rax
-; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; NODQ-64-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm0
+; NODQ-64-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
 ; NODQ-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; NODQ-64-NEXT:    retq
@@ -675,28 +675,28 @@ define <8 x float> @uitofp_v8i64_v8f32(<8 x i64> %x) #0 {
 ; NODQ-64:       # %bb.0:
 ; NODQ-64-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
 ; NODQ-64-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vmovq %xmm1, %rax
-; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
+; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; NODQ-64-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NODQ-64-NEXT:    vmovq %xmm2, %rax
-; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm3
+; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
 ; NODQ-64-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm2
+; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; NODQ-64-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm2
+; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; NODQ-64-NEXT:    vmovq %xmm0, %rax
-; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm3
+; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; NODQ-64-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NODQ-64-NEXT:    vmovq %xmm0, %rax
-; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm3
+; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm3
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; NODQ-64-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm0
+; NODQ-64-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; NODQ-64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
 ; NODQ-64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; NODQ-64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index af841cf38b24a..62ab5d82bfbb6 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -126,27 +126,27 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
 ; VEX-LABEL: sitofp_2i64_to_2f64:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_2i64_to_2f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_2i64_to_2f64:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    retq
 ;
@@ -352,14 +352,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -368,14 +368,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX2-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX2-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX2-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX2-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
@@ -384,14 +384,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -400,14 +400,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1247,27 +1247,27 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
 ; VEX-LABEL: sitofp_2i64_to_4f32:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_2i64_to_4f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_2i64_to_4f32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX512VL-NEXT:    retq
 ;
@@ -1316,27 +1316,27 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
 ; VEX-LABEL: sitofp_2i64_to_4f32_zero:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_2i64_to_4f32_zero:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVX512VL-NEXT:    retq
 ;
@@ -1383,27 +1383,27 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; VEX-LABEL: sitofp_4i64_to_4f32_undef:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX512VL-NEXT:    retq
 ;
@@ -1581,16 +1581,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX1-LABEL: sitofp_4i64_to_4f32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1598,16 +1598,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX2-LABEL: sitofp_4i64_to_4f32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -1615,16 +1615,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512F-LABEL: sitofp_4i64_to_4f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1632,16 +1632,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512VL-LABEL: sitofp_4i64_to_4f32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -1892,9 +1892,9 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 ; VEX-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; VEX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
 ; VEX-NEXT:    vpextrq $1, %xmm1, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; VEX-NEXT:    vmovq %xmm1, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
 ; VEX-NEXT:    vaddps %xmm1, %xmm1, %xmm2
 ; VEX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
@@ -1906,18 +1906,18 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 ; AVX512F-LABEL: uitofp_2i64_to_4f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_2i64_to_4f32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX512VL-NEXT:    retq
 ;
@@ -2007,9 +2007,9 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
 ; VEX-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; VEX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
 ; VEX-NEXT:    vpextrq $1, %xmm1, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; VEX-NEXT:    vmovq %xmm1, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
 ; VEX-NEXT:    vaddps %xmm1, %xmm1, %xmm2
 ; VEX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
@@ -2022,18 +2022,18 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
 ; AVX512F-LABEL: uitofp_2i64_to_2f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_2i64_to_2f32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
 ; AVX512VL-NEXT:    retq
 ;
@@ -2125,9 +2125,9 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX1-NEXT:    vmovaps %xmm0, %xmm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@@ -2148,16 +2148,16 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm1
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
 ; AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
 ; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
@@ -2168,18 +2168,18 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; AVX512VL-NEXT:    retq
 ;
@@ -2494,16 +2494,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX1-NEXT:    vorpd %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
 ; AVX1-NEXT:    vaddps %xmm1, %xmm1, %xmm3
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
@@ -2519,16 +2519,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm1
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
 ; AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
@@ -2540,16 +2540,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512F-LABEL: uitofp_4i64_to_4f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2557,16 +2557,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512VL-LABEL: uitofp_4i64_to_4f32:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -2836,22 +2836,22 @@ define <2 x double> @sitofp_load_2i64_to_2f64(ptr%a) {
 ;
 ; VEX-LABEL: sitofp_load_2i64_to_2f64:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; VEX-NEXT:    vcvtsi2sdq (%rdi), %xmm1, %xmm1
+; VEX-NEXT:    vcvtsi2sdq 8(%rdi), %xmm15, %xmm0
+; VEX-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm1
 ; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtsi2sdq (%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2sdq 8(%rdi), %xmm15, %xmm0
+; AVX512F-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtsi2sdq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2sdq (%rdi), %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2sdq 8(%rdi), %xmm15, %xmm0
+; AVX512VL-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX512VL-NEXT:    retq
 ;
@@ -3011,33 +3011,33 @@ define <4 x double> @sitofp_load_4i64_to_4f64(ptr%a) {
 ;
 ; VEX-LABEL: sitofp_load_4i64_to_4f64:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; VEX-NEXT:    vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
+; VEX-NEXT:    vcvtsi2sdq 24(%rdi), %xmm15, %xmm0
+; VEX-NEXT:    vcvtsi2sdq 16(%rdi), %xmm15, %xmm1
 ; VEX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT:    vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; VEX-NEXT:    vcvtsi2sdq (%rdi), %xmm2, %xmm2
+; VEX-NEXT:    vcvtsi2sdq 8(%rdi), %xmm15, %xmm1
+; VEX-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm2
 ; VEX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; VEX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2sdq 24(%rdi), %xmm15, %xmm0
+; AVX512F-NEXT:    vcvtsi2sdq 16(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vcvtsi2sdq (%rdi), %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtsi2sdq 8(%rdi), %xmm15, %xmm1
+; AVX512F-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm2
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtsi2sdq 24(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2sdq 16(%rdi), %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2sdq 24(%rdi), %xmm15, %xmm0
+; AVX512VL-NEXT:    vcvtsi2sdq 16(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vcvtsi2sdq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vcvtsi2sdq (%rdi), %xmm2, %xmm2
+; AVX512VL-NEXT:    vcvtsi2sdq 8(%rdi), %xmm15, %xmm1
+; AVX512VL-NEXT:    vcvtsi2sdq (%rdi), %xmm15, %xmm2
 ; AVX512VL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -3776,34 +3776,34 @@ define <4 x float> @sitofp_load_4i64_to_4f32(ptr%a) {
 ;
 ; VEX-LABEL: sitofp_load_4i64_to_4f32:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; VEX-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
+; VEX-NEXT:    vcvtsi2ssq 8(%rdi), %xmm15, %xmm0
+; VEX-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; VEX-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
+; VEX-NEXT:    vcvtsi2ssq 16(%rdi), %xmm15, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; VEX-NEXT:    vcvtsi2ssq 24(%rdi), %xmm15, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2ssq 8(%rdi), %xmm15, %xmm0
+; AVX512F-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vcvtsi2ssq 16(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vcvtsi2ssq 24(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtsi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2ssq (%rdi), %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ssq 8(%rdi), %xmm15, %xmm0
+; AVX512VL-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT:    vcvtsi2ssq 16(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ssq 16(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vcvtsi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ssq 24(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512VL-NEXT:    retq
 ;
@@ -3938,57 +3938,57 @@ define <8 x float> @sitofp_load_8i64_to_8f32(ptr%a) {
 ;
 ; VEX-LABEL: sitofp_load_8i64_to_8f32:
 ; VEX:       # %bb.0:
-; VEX-NEXT:    vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; VEX-NEXT:    vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
+; VEX-NEXT:    vcvtsi2ssq 40(%rdi), %xmm15, %xmm0
+; VEX-NEXT:    vcvtsi2ssq 32(%rdi), %xmm15, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; VEX-NEXT:    vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
+; VEX-NEXT:    vcvtsi2ssq 48(%rdi), %xmm15, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT:    vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; VEX-NEXT:    vcvtsi2ssq 56(%rdi), %xmm15, %xmm1
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; VEX-NEXT:    vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; VEX-NEXT:    vcvtsi2ssq (%rdi), %xmm2, %xmm2
+; VEX-NEXT:    vcvtsi2ssq 8(%rdi), %xmm15, %xmm1
+; VEX-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm2
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VEX-NEXT:    vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
+; VEX-NEXT:    vcvtsi2ssq 16(%rdi), %xmm15, %xmm2
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VEX-NEXT:    vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
+; VEX-NEXT:    vcvtsi2ssq 24(%rdi), %xmm15, %xmm2
 ; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; VEX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2ssq 40(%rdi), %xmm15, %xmm0
+; AVX512F-NEXT:    vcvtsi2ssq 32(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT:    vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vcvtsi2ssq 48(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vcvtsi2ssq 56(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT:    vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vcvtsi2ssq (%rdi), %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtsi2ssq 8(%rdi), %xmm15, %xmm1
+; AVX512F-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT:    vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtsi2ssq 16(%rdi), %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT:    vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtsi2ssq 24(%rdi), %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtsi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtsi2ssq 32(%rdi), %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ssq 40(%rdi), %xmm15, %xmm0
+; AVX512VL-NEXT:    vcvtsi2ssq 32(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT:    vcvtsi2ssq 48(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ssq 48(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vcvtsi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ssq 56(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT:    vcvtsi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vcvtsi2ssq (%rdi), %xmm2, %xmm2
+; AVX512VL-NEXT:    vcvtsi2ssq 8(%rdi), %xmm15, %xmm1
+; AVX512VL-NEXT:    vcvtsi2ssq (%rdi), %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT:    vcvtsi2ssq 16(%rdi), %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtsi2ssq 16(%rdi), %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT:    vcvtsi2ssq 24(%rdi), %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtsi2ssq 24(%rdi), %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -4235,16 +4235,16 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; AVX1-NEXT:    vorpd %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
 ; AVX1-NEXT:    vaddps %xmm1, %xmm1, %xmm3
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
@@ -4261,16 +4261,16 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
 ; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
 ; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm1
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
 ; AVX2-NEXT:    vaddps %xmm1, %xmm1, %xmm2
 ; AVX2-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
@@ -4280,23 +4280,23 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) {
 ;
 ; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtusi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtusi2ssq (%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtusi2ssq 8(%rdi), %xmm15, %xmm0
+; AVX512F-NEXT:    vcvtusi2ssq (%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT:    vcvtusi2ssq 16(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vcvtusi2ssq 16(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vcvtusi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vcvtusi2ssq 24(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtusi2ssq 8(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtusi2ssq (%rdi), %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ssq 8(%rdi), %xmm15, %xmm0
+; AVX512VL-NEXT:    vcvtusi2ssq (%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT:    vcvtusi2ssq 16(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ssq 16(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vcvtusi2ssq 24(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ssq 24(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX512VL-NEXT:    retq
 ;
@@ -4664,16 +4664,16 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; AVX1-NEXT:    vorps %ymm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vblendvpd %ymm1, %ymm3, %ymm1, %ymm3
 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm4
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm6
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm6
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
 ; AVX1-NEXT:    vmovq %xmm3, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm6
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm6
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
 ; AVX1-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm3
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0]
 ; AVX1-NEXT:    vaddps %xmm3, %xmm3, %xmm4
 ; AVX1-NEXT:    vpackssdw %xmm5, %xmm1, %xmm1
@@ -4686,16 +4686,16 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; AVX1-NEXT:    vorps %ymm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm0, %ymm2, %ymm0, %ymm2
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm3
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm5
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm5
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
 ; AVX1-NEXT:    vmovq %xmm2, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm5
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm5
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3]
 ; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0]
 ; AVX1-NEXT:    vaddps %xmm2, %xmm2, %xmm3
 ; AVX1-NEXT:    vpackssdw %xmm4, %xmm0, %xmm0
@@ -4713,16 +4713,16 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; AVX2-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm1, %ymm3, %ymm1, %ymm3
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX2-NEXT:    vmovq %xmm3, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm5
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm5
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
 ; AVX2-NEXT:    vmovq %xmm3, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm5
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm5
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0],xmm4[3]
 ; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm3
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0]
 ; AVX2-NEXT:    vaddps %xmm3, %xmm3, %xmm4
 ; AVX2-NEXT:    vpackssdw 48(%rdi), %xmm1, %xmm1
@@ -4732,16 +4732,16 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm0, %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm3
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX2-NEXT:    vmovq %xmm2, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm4
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
 ; AVX2-NEXT:    vmovq %xmm2, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm4
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm2
+; AVX2-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0]
 ; AVX2-NEXT:    vaddps %xmm2, %xmm2, %xmm3
 ; AVX2-NEXT:    vpackssdw 16(%rdi), %xmm0, %xmm0
@@ -4751,38 +4751,38 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) {
 ;
 ; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvtusi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512F-NEXT:    vcvtusi2ssq 32(%rdi), %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtusi2ssq 40(%rdi), %xmm15, %xmm0
+; AVX512F-NEXT:    vcvtusi2ssq 32(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512F-NEXT:    vcvtusi2ssq 48(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vcvtusi2ssq 48(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT:    vcvtusi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512F-NEXT:    vcvtusi2ssq 56(%rdi), %xmm15, %xmm1
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT:    vcvtusi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512F-NEXT:    vcvtusi2ssq (%rdi), %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtusi2ssq 8(%rdi), %xmm15, %xmm1
+; AVX512F-NEXT:    vcvtusi2ssq (%rdi), %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT:    vcvtusi2ssq 16(%rdi), %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtusi2ssq 16(%rdi), %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT:    vcvtusi2ssq 24(%rdi), %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtusi2ssq 24(%rdi), %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtusi2ssq 40(%rdi), %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvtusi2ssq 32(%rdi), %xmm1, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ssq 40(%rdi), %xmm15, %xmm0
+; AVX512VL-NEXT:    vcvtusi2ssq 32(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX512VL-NEXT:    vcvtusi2ssq 48(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ssq 48(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT:    vcvtusi2ssq 56(%rdi), %xmm2, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ssq 56(%rdi), %xmm15, %xmm1
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT:    vcvtusi2ssq 8(%rdi), %xmm2, %xmm1
-; AVX512VL-NEXT:    vcvtusi2ssq (%rdi), %xmm2, %xmm2
+; AVX512VL-NEXT:    vcvtusi2ssq 8(%rdi), %xmm15, %xmm1
+; AVX512VL-NEXT:    vcvtusi2ssq (%rdi), %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT:    vcvtusi2ssq 16(%rdi), %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtusi2ssq 16(%rdi), %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT:    vcvtusi2ssq 24(%rdi), %xmm3, %xmm2
+; AVX512VL-NEXT:    vcvtusi2ssq 24(%rdi), %xmm15, %xmm2
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
 ; AVX512VL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -5148,7 +5148,7 @@ define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX-NEXT:    incl %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm1, %xmm1
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm1
 ; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
@@ -5207,7 +5207,7 @@ define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
 ; VEX-LABEL: extract0_uitofp_v4i32_f32:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vmovd %xmm0, %eax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm0
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: extract0_uitofp_v4i32_f32:
@@ -5251,7 +5251,7 @@ define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
 ; VEX-LABEL: extract0_uitofp_v4i32_f64:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vmovd %xmm0, %eax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: extract0_uitofp_v4i32_f64:
@@ -5348,7 +5348,7 @@ define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
 ; VEX-LABEL: extract3_uitofp_v4i32_f32:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vextractps $3, %xmm0, %eax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm0
+; VEX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: extract3_uitofp_v4i32_f32:
@@ -5402,7 +5402,7 @@ define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
 ; VEX-LABEL: extract3_uitofp_v4i32_f64:
 ; VEX:       # %bb.0:
 ; VEX-NEXT:    vextractps $3, %xmm0, %eax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
+; VEX-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; VEX-NEXT:    retq
 ;
 ; AVX512F-LABEL: extract3_uitofp_v4i32_f64:
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 49062eaef3188..4a5b4277c3cca 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s
 ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
@@ -6504,7 +6504,7 @@ define <1 x double> @constrained_vector_sitofp_v1f64_v1i32(<1 x i32> %x) #0 {
 ;
 ; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i32:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %edi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call <1 x double>
@@ -6522,7 +6522,7 @@ define <1 x float> @constrained_vector_sitofp_v1f32_v1i32(<1 x i32> %x) #0 {
 ;
 ; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i32:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call <1 x float>
@@ -6540,7 +6540,7 @@ define <1 x double> @constrained_vector_sitofp_v1f64_v1i64(<1 x i64> %x) #0 {
 ;
 ; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i64:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2sd %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call <1 x double>
@@ -6558,7 +6558,7 @@ define <1 x float> @constrained_vector_sitofp_v1f32_v1i64(<1 x i64> %x) #0 {
 ;
 ; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i64:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
+; AVX-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    retq
 entry:
   %result = call <1 x float>
@@ -6622,18 +6622,18 @@ define <2 x double> @constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 {
 ; AVX1-LABEL: constrained_vector_sitofp_v2f64_v2i64:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT:    retq
 ;
 ; AVX512F-LABEL: constrained_vector_sitofp_v2f64_v2i64:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
@@ -6668,9 +6668,9 @@ define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 {
 ; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i64:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
+; AVX-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX-NEXT:    retq
 entry:
@@ -6703,12 +6703,12 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 {
 ; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vextractps $1, %xmm0, %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm1, %xmm1
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm1
 ; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm2, %xmm2
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm2
 ; AVX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX-NEXT:    vcvtsi2sd %eax, %xmm3, %xmm0
+; AVX-NEXT:    vcvtsi2sd %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
 entry:
@@ -6740,12 +6740,12 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32:
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vextractps $1, %xmm0, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm1, %xmm1
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm1
 ; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm2, %xmm2
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm2
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm3, %xmm0
+; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
 ; AVX-NEXT:    retq
 entry:
@@ -6770,26 +6770,26 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
 entry:
@@ -6814,13 +6814,13 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -6828,13 +6828,13 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -6910,14 +6910,14 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 {
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -6926,14 +6926,14 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 {
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -6977,16 +6977,16 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; AVX1-LABEL: constrained_vector_sitofp_v4f32_v4i64:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -6994,16 +6994,16 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; AVX512F-LABEL: constrained_vector_sitofp_v4f32_v4i64:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -7033,12 +7033,12 @@ define <1 x double> @constrained_vector_uitofp_v1f64_v1i32(<1 x i32> %x) #0 {
 ; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    movl %edi, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i32:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call <1 x double>
@@ -7058,12 +7058,12 @@ define <1 x float> @constrained_vector_uitofp_v1f32_v1i32(<1 x i32> %x) #0 {
 ; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    movl %edi, %eax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i32:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call <1 x float>
@@ -7099,7 +7099,7 @@ define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rax, %rcx
 ; AVX1-NEXT:    testq %rdi, %rdi
 ; AVX1-NEXT:    cmovnsq %rdi, %rcx
-; AVX1-NEXT:    vcvtsi2sd %rcx, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rcx, %xmm15, %xmm0
 ; AVX1-NEXT:    jns .LBB175_2
 ; AVX1-NEXT:  # %bb.1:
 ; AVX1-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
@@ -7108,7 +7108,7 @@ define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 {
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2sd %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call <1 x double>
@@ -7144,7 +7144,7 @@ define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rax, %rcx
 ; AVX1-NEXT:    testq %rdi, %rdi
 ; AVX1-NEXT:    cmovnsq %rdi, %rcx
-; AVX1-NEXT:    vcvtsi2ss %rcx, %xmm0, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rcx, %xmm15, %xmm0
 ; AVX1-NEXT:    jns .LBB176_2
 ; AVX1-NEXT:  # %bb.1:
 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
@@ -7153,7 +7153,7 @@ define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 {
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i64:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm0, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    retq
 entry:
   %result = call <1 x float>
@@ -7279,7 +7279,7 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm15, %xmm1
 ; AVX1-NEXT:    jns .LBB179_2
 ; AVX1-NEXT:  # %bb.1:
 ; AVX1-NEXT:    vaddsd %xmm1, %xmm1, %xmm1
@@ -7292,7 +7292,7 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm2, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm15, %xmm0
 ; AVX1-NEXT:    jns .LBB179_4
 ; AVX1-NEXT:  # %bb.3:
 ; AVX1-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
@@ -7303,9 +7303,9 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 {
 ; AVX512F-LABEL: constrained_vector_uitofp_v2f64_v2i64:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm2, %xmm0
+; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    retq
 ;
@@ -7367,9 +7367,9 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 {
 ; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero
 ; AVX1-NEXT:    vaddps %xmm1, %xmm1, %xmm2
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
@@ -7381,9 +7381,9 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 {
 ; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i64:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX512-NEXT:    retq
 entry:
@@ -7416,24 +7416,24 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 {
 ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractps $1, %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vextractps $1, %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2sd %eax, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtusi2sd %eax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2sd %eax, %xmm2, %xmm2
+; AVX512-NEXT:    vcvtusi2sd %eax, %xmm15, %xmm2
 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2sd %eax, %xmm3, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %eax, %xmm15, %xmm0
 ; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
 entry:
@@ -7465,24 +7465,24 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
 ; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractps $1, %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vextractps $1, %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2ss %eax, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2ss %eax, %xmm2, %xmm2
+; AVX512-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm2
 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX512-NEXT:    vcvtusi2ss %eax, %xmm3, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %eax, %xmm15, %xmm0
 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
 ; AVX512-NEXT:    retq
 entry:
@@ -7547,7 +7547,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm15, %xmm1
 ; AVX1-NEXT:    jns .LBB183_2
 ; AVX1-NEXT:  # %bb.1:
 ; AVX1-NEXT:    vaddsd %xmm1, %xmm1, %xmm1
@@ -7560,7 +7560,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm15, %xmm2
 ; AVX1-NEXT:    jns .LBB183_4
 ; AVX1-NEXT:  # %bb.3:
 ; AVX1-NEXT:    vaddsd %xmm2, %xmm2, %xmm2
@@ -7575,7 +7575,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rdx, %xmm15, %xmm0
 ; AVX1-NEXT:    jns .LBB183_6
 ; AVX1-NEXT:  # %bb.5:
 ; AVX1-NEXT:    vaddsd %xmm0, %xmm0, %xmm0
@@ -7586,13 +7586,13 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
 ; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    vcvtusi2sd %rax, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtusi2sd %rax, %xmm2, %xmm2
+; AVX512-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; AVX512-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm0
+; AVX512-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
 entry:
@@ -7657,7 +7657,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2ss %rdx, %xmm1, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rdx, %xmm15, %xmm1
 ; AVX1-NEXT:    jns .LBB184_2
 ; AVX1-NEXT:  # %bb.1:
 ; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
@@ -7670,7 +7670,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2ss %rdx, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2ss %rdx, %xmm15, %xmm2
 ; AVX1-NEXT:    jns .LBB184_4
 ; AVX1-NEXT:  # %bb.3:
 ; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
@@ -7685,7 +7685,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    testq %rax, %rax
 ; AVX1-NEXT:    cmovnsq %rax, %rdx
-; AVX1-NEXT:    vcvtsi2ss %rdx, %xmm3, %xmm0
+; AVX1-NEXT:    vcvtsi2ss %rdx, %xmm15, %xmm0
 ; AVX1-NEXT:    jns .LBB184_6
 ; AVX1-NEXT:  # %bb.5:
 ; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
@@ -7697,13 +7697,13 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
 ; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64:
 ; AVX512:       # %bb.0: # %entry
 ; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
@@ -7860,26 +7860,26 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 {
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpextrd $2, %xmm1, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm2
 ; AVX1-NEXT:    vmovd %xmm1, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX1-NEXT:    vextractps $2, %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vmovq %xmm0, %rax
 ; AVX1-NEXT:    movl %eax, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm4
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm4
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vpextrd $1, %xmm1, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm1
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
 ; AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm0
+; AVX1-NEXT:    vcvtsi2sd %rax, %xmm15, %xmm0
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -7890,14 +7890,14 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 {
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm1
+; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vcvtusi2sd %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -7991,16 +7991,16 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; AVX1-NEXT:    vorpd %ymm3, %ymm1, %ymm1
 ; AVX1-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm3
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm4
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
+; AVX1-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm1
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
 ; AVX1-NEXT:    vaddps %xmm1, %xmm1, %xmm3
 ; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
@@ -8011,16 +8011,16 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 {
 ; AVX512F-LABEL: constrained_vector_uitofp_v4f32_v4i64:
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm1
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm2
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
+; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm15, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index 6392514bf4157..bc1650a4acf0b 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -1403,10 +1403,10 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS) {
     for (const auto &RC : RegisterClasses) {
       if (!RC.AltOrderSelect.empty()) {
         OS << "\nstatic inline unsigned " << RC.getName()
-           << "AltOrderSelect(const MachineFunction &MF) {" << RC.AltOrderSelect
-           << "}\n\n"
+           << "AltOrderSelect(const MachineFunction &MF, bool Rev) {"
+           << RC.AltOrderSelect << "}\n\n"
            << "static ArrayRef<MCPhysReg> " << RC.getName()
-           << "GetRawAllocationOrder(const MachineFunction &MF) {\n";
+           << "GetRawAllocationOrder(const MachineFunction &MF, bool Rev) {\n";
         for (unsigned oi = 1, oe = RC.getNumOrders(); oi != oe; ++oi) {
           ArrayRef<const Record *> Elems = RC.getOrder(oi);
           if (!Elems.empty()) {
@@ -1426,8 +1426,8 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS) {
           else
             OS << "),\n    ArrayRef(AltOrder" << oi;
         OS << ")\n  };\n  const unsigned Select = " << RC.getName()
-           << "AltOrderSelect(MF);\n  assert(Select < " << RC.getNumOrders()
-           << ");\n  return Order[Select];\n}\n";
+           << "AltOrderSelect(MF, Rev);\n  assert(Select < "
+           << RC.getNumOrders() << ");\n  return Order[Select];\n}\n";
       }
     }
 

From fa9e1a1515549124dd76ddc55a8a532795d51fae Mon Sep 17 00:00:00 2001
From: RonDahan101 <166982786+RonDahan101@users.noreply.github.com>
Date: Wed, 11 Jun 2025 17:15:24 +0300
Subject: [PATCH 084/851] [AArch64] Expand llvm.histogram intrinsic to support
 umax, umin, and uadd.sat operations (#138447)

This patch extends the llvm.histogram intrinsic to support additional
update operations beyond the existing add. Specifically, the new
supported operations are:

* umax: unsigned maximum

* umin: unsigned minimum

* uadd.sat: unsigned saturated addition

Based on the discussion from:


https://discourse.llvm.org/t/rfc-expanding-the-experimental-histogram-intrinsic/84673
---
 llvm/docs/LangRef.rst                         |   3 +
 llvm/include/llvm/IR/Intrinsics.td            |  18 +
 .../Scalar/ScalarizeMaskedMemIntrin.cpp       |  36 +-
 .../AArch64/neon-scalarize-histogram.ll       | 354 ++++++++++++++++++
 4 files changed, 407 insertions(+), 4 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 78604d0df6bc6..cc72a37f68599 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20545,6 +20545,9 @@ More update operation types may be added in the future.
 
     declare void @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
     declare void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %ptrs, i64 %inc, <vscale x 2 x i1> %mask)
+    declare void @llvm.experimental.vector.histogram.uadd.sat.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
+    declare void @llvm.experimental.vector.histogram.umax.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask)
+    declare void @llvm.experimental.vector.histogram.umin.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask)
 
 Arguments:
 """"""""""
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index e68243c2e406b..7add4a27ce9e9 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1968,6 +1968,24 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
                              [ IntrArgMemOnly ]>;
 
+def int_experimental_vector_histogram_uadd_sat : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Increment
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             [ IntrArgMemOnly ]>;
+
+def int_experimental_vector_histogram_umin : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Update value
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             [ IntrArgMemOnly ]>;
+
+def int_experimental_vector_histogram_umax : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Update value
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask
+                             [ IntrArgMemOnly ]>;
+
 // Experimental match
 def int_experimental_vector_match : DefaultAttrsIntrinsic<
                              [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ],
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index e24088c294987..42d6680c3cb7d 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -968,6 +968,29 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
 
   // FIXME: Do we need to add an alignment parameter to the intrinsic?
   unsigned VectorWidth = AddrType->getNumElements();
+  auto CreateHistogramUpdateValue = [&](IntrinsicInst *CI, Value *Load,
+                                        Value *Inc) -> Value * {
+    Value *UpdateOp;
+    switch (CI->getIntrinsicID()) {
+    case Intrinsic::experimental_vector_histogram_add:
+      UpdateOp = Builder.CreateAdd(Load, Inc);
+      break;
+    case Intrinsic::experimental_vector_histogram_uadd_sat:
+      UpdateOp =
+          Builder.CreateIntrinsic(Intrinsic::uadd_sat, {EltTy}, {Load, Inc});
+      break;
+    case Intrinsic::experimental_vector_histogram_umin:
+      UpdateOp = Builder.CreateIntrinsic(Intrinsic::umin, {EltTy}, {Load, Inc});
+      break;
+    case Intrinsic::experimental_vector_histogram_umax:
+      UpdateOp = Builder.CreateIntrinsic(Intrinsic::umax, {EltTy}, {Load, Inc});
+      break;
+
+    default:
+      llvm_unreachable("Unexpected histogram intrinsic");
+    }
+    return UpdateOp;
+  };
 
   // Shorten the way if the mask is a vector of constants.
   if (isConstantIntVector(Mask)) {
@@ -976,8 +999,9 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
         continue;
       Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
       LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
-      Value *Add = Builder.CreateAdd(Load, Inc);
-      Builder.CreateStore(Add, Ptr);
+      Value *Update =
+          CreateHistogramUpdateValue(cast<IntrinsicInst>(CI), Load, Inc);
+      Builder.CreateStore(Update, Ptr);
     }
     CI->eraseFromParent();
     return;
@@ -997,8 +1021,9 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
     Builder.SetInsertPoint(CondBlock->getTerminator());
     Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
     LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
-    Value *Add = Builder.CreateAdd(Load, Inc);
-    Builder.CreateStore(Add, Ptr);
+    Value *UpdateOp =
+        CreateHistogramUpdateValue(cast<IntrinsicInst>(CI), Load, Inc);
+    Builder.CreateStore(UpdateOp, Ptr);
 
     // Create "else" block, fill it in the next iteration
     BasicBlock *NewIfBlock = ThenTerm->getSuccessor(0);
@@ -1089,6 +1114,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
     default:
       break;
     case Intrinsic::experimental_vector_histogram_add:
+    case Intrinsic::experimental_vector_histogram_uadd_sat:
+    case Intrinsic::experimental_vector_histogram_umin:
+    case Intrinsic::experimental_vector_histogram_umax:
       if (TTI.isLegalMaskedVectorHistogram(CI->getArgOperand(0)->getType(),
                                            CI->getArgOperand(1)->getType()))
         return false;
diff --git a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
index e59d9098a30d6..ca74b4e95b0ae 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
@@ -112,3 +112,357 @@ define void @histogram_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
   call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret void
 }
+
+define void @histogram_uadd_sat_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) {
+; CHECK-LABEL: histogram_uadd_sat_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    tbnz w8, #0, .LBB3_3
+; CHECK-NEXT:  // %bb.1: // %else
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    tbnz w8, #0, .LBB3_4
+; CHECK-NEXT:  .LBB3_2: // %else2
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_3: // %cond.histogram.update
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    adds x9, x9, x0
+; CHECK-NEXT:    csinv x9, x9, xzr, lo
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    tbz w8, #0, .LBB3_2
+; CHECK-NEXT:  .LBB3_4: // %cond.histogram.update1
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    adds x9, x9, x0
+; CHECK-NEXT:    csinv x9, x9, xzr, lo
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.vector.histogram.uadd.sat.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask)
+  ret void
+}
+
+define void @histogram_uadd_sat_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) {
+; CHECK-LABEL: histogram_uadd_sat_i32_literal:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v2.2d, x0
+; CHECK-NEXT:    sshll v3.2d, v0.2s, #2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    add v3.2d, v2.2d, v3.2d
+; CHECK-NEXT:    tbz w8, #0, .LBB4_2
+; CHECK-NEXT:  // %bb.1: // %cond.histogram.update
+; CHECK-NEXT:    fmov x8, d3
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    adds w9, w9, #1
+; CHECK-NEXT:    csinv w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:  .LBB4_2: // %else
+; CHECK-NEXT:    umov w8, v1.h[1]
+; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT:    tbz w8, #0, .LBB4_4
+; CHECK-NEXT:  // %bb.3: // %cond.histogram.update1
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    adds w9, w9, #1
+; CHECK-NEXT:    csinv w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:  .LBB4_4: // %else2
+; CHECK-NEXT:    umov w8, v1.h[2]
+; CHECK-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    tbnz w8, #0, .LBB4_7
+; CHECK-NEXT:  // %bb.5: // %else4
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    tbnz w8, #0, .LBB4_8
+; CHECK-NEXT:  .LBB4_6: // %else6
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB4_7: // %cond.histogram.update3
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    adds w9, w9, #1
+; CHECK-NEXT:    csinv w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    tbz w8, #0, .LBB4_6
+; CHECK-NEXT:  .LBB4_8: // %cond.histogram.update5
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    adds w9, w9, #1
+; CHECK-NEXT:    csinv w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ret
+  %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_uadd_sat_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
+; CHECK-LABEL: histogram_uadd_sat_i32_literal_alltruemask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    sshll v2.2d, v0.2s, #2
+; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT:    add v2.2d, v1.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov x10, v2.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    adds w9, w9, #1
+; CHECK-NEXT:    csinv w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr w8, [x10]
+; CHECK-NEXT:    adds w8, w8, #1
+; CHECK-NEXT:    csinv w8, w8, wzr, lo
+; CHECK-NEXT:    str w8, [x10]
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    adds w9, w9, #1
+; CHECK-NEXT:    csinv w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr w8, [x10]
+; CHECK-NEXT:    adds w8, w8, #1
+; CHECK-NEXT:    csinv w8, w8, wzr, lo
+; CHECK-NEXT:    str w8, [x10]
+; CHECK-NEXT:    ret
+  %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+define void @histogram_umax_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) {
+; CHECK-LABEL: histogram_umax_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    tbnz w8, #0, .LBB6_3
+; CHECK-NEXT:  // %bb.1: // %else
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    tbnz w8, #0, .LBB6_4
+; CHECK-NEXT:  .LBB6_2: // %else2
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB6_3: // %cond.histogram.update
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    cmp x9, x0
+; CHECK-NEXT:    csel x9, x9, x0, hi
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    tbz w8, #0, .LBB6_2
+; CHECK-NEXT:  .LBB6_4: // %cond.histogram.update1
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    cmp x9, x0
+; CHECK-NEXT:    csel x9, x9, x0, hi
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.vector.histogram.umax.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask)
+  ret void
+}
+
+define void @histogram_umax_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) {
+; CHECK-LABEL: histogram_umax_i32_literal:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v2.2d, x0
+; CHECK-NEXT:    sshll v3.2d, v0.2s, #2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    add v3.2d, v2.2d, v3.2d
+; CHECK-NEXT:    tbz w8, #0, .LBB7_2
+; CHECK-NEXT:  // %bb.1: // %cond.histogram.update
+; CHECK-NEXT:    fmov x8, d3
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, hi
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:  .LBB7_2: // %else
+; CHECK-NEXT:    umov w8, v1.h[1]
+; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT:    tbz w8, #0, .LBB7_4
+; CHECK-NEXT:  // %bb.3: // %cond.histogram.update1
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, hi
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:  .LBB7_4: // %else2
+; CHECK-NEXT:    umov w8, v1.h[2]
+; CHECK-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    tbnz w8, #0, .LBB7_7
+; CHECK-NEXT:  // %bb.5: // %else4
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    tbnz w8, #0, .LBB7_8
+; CHECK-NEXT:  .LBB7_6: // %else6
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB7_7: // %cond.histogram.update3
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, hi
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    tbz w8, #0, .LBB7_6
+; CHECK-NEXT:  .LBB7_8: // %cond.histogram.update5
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, hi
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ret
+  %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.umax.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_umax_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
+; CHECK-LABEL: histogram_umax_i32_literal_alltruemask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    sshll v2.2d, v0.2s, #2
+; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT:    add v2.2d, v1.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov x10, v2.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, hi
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr w8, [x10]
+; CHECK-NEXT:    cmp w8, #1
+; CHECK-NEXT:    csinc w8, w8, wzr, hi
+; CHECK-NEXT:    str w8, [x10]
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, hi
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr w8, [x10]
+; CHECK-NEXT:    cmp w8, #1
+; CHECK-NEXT:    csinc w8, w8, wzr, hi
+; CHECK-NEXT:    str w8, [x10]
+; CHECK-NEXT:    ret
+  %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.umax.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+define void @histogram_umin_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) {
+; CHECK-LABEL: histogram_umin_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    tbnz w8, #0, .LBB9_3
+; CHECK-NEXT:  // %bb.1: // %else
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    tbnz w8, #0, .LBB9_4
+; CHECK-NEXT:  .LBB9_2: // %else2
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB9_3: // %cond.histogram.update
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    cmp x9, x0
+; CHECK-NEXT:    csel x9, x9, x0, lo
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    tbz w8, #0, .LBB9_2
+; CHECK-NEXT:  .LBB9_4: // %cond.histogram.update1
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    cmp x9, x0
+; CHECK-NEXT:    csel x9, x9, x0, lo
+; CHECK-NEXT:    str x9, [x8]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.vector.histogram.umin.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask)
+  ret void
+}
+
+define void @histogram_umin_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) {
+; CHECK-LABEL: histogram_umin_i32_literal:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v2.2d, x0
+; CHECK-NEXT:    sshll v3.2d, v0.2s, #2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    umov w8, v1.h[0]
+; CHECK-NEXT:    add v3.2d, v2.2d, v3.2d
+; CHECK-NEXT:    tbz w8, #0, .LBB10_2
+; CHECK-NEXT:  // %bb.1: // %cond.histogram.update
+; CHECK-NEXT:    fmov x8, d3
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:  .LBB10_2: // %else
+; CHECK-NEXT:    umov w8, v1.h[1]
+; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT:    tbz w8, #0, .LBB10_4
+; CHECK-NEXT:  // %bb.3: // %cond.histogram.update1
+; CHECK-NEXT:    mov x8, v3.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:  .LBB10_4: // %else2
+; CHECK-NEXT:    umov w8, v1.h[2]
+; CHECK-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    tbnz w8, #0, .LBB10_7
+; CHECK-NEXT:  // %bb.5: // %else4
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    tbnz w8, #0, .LBB10_8
+; CHECK-NEXT:  .LBB10_6: // %else6
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB10_7: // %cond.histogram.update3
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    tbz w8, #0, .LBB10_6
+; CHECK-NEXT:  .LBB10_8: // %cond.histogram.update5
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ret
+  %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.umin.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_umin_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) {
+; CHECK-LABEL: histogram_umin_i32_literal_alltruemask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    dup v1.2d, x0
+; CHECK-NEXT:    sshll v2.2d, v0.2s, #2
+; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #2
+; CHECK-NEXT:    add v2.2d, v1.2d, v2.2d
+; CHECK-NEXT:    add v0.2d, v1.2d, v0.2d
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    mov x10, v2.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr w8, [x10]
+; CHECK-NEXT:    cmp w8, #1
+; CHECK-NEXT:    csinc w8, w8, wzr, lo
+; CHECK-NEXT:    str w8, [x10]
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    mov x10, v0.d[1]
+; CHECK-NEXT:    ldr w9, [x8]
+; CHECK-NEXT:    cmp w9, #1
+; CHECK-NEXT:    csinc w9, w9, wzr, lo
+; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    ldr w8, [x10]
+; CHECK-NEXT:    cmp w8, #1
+; CHECK-NEXT:    csinc w8, w8, wzr, lo
+; CHECK-NEXT:    str w8, [x10]
+; CHECK-NEXT:    ret
+  %buckets = getelementptr i32, ptr %base, <4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.umin.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}

From 775ad3e49c83407b79dd5ad533204884cb8b23ce Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <razvan.lupusoru@gmail.com>
Date: Wed, 11 Jun 2025 07:16:58 -0700
Subject: [PATCH 085/851] [flang][acc] Ensure all acc.loop get a default
 parallelism determination mode (#143623)

This PR updates the flang lowering to explicitly implement the OpenACC
rules:
- As per OpenACC 3.3 standard section 2.9.6 independent clause: A loop
construct with no auto or seq clause is treated as if it has the
independent clause when it is an orphaned loop construct or its parent
compute construct is a parallel construct.
- As per OpenACC 3.3 standard section 2.9.7 auto clause: When the parent
compute construct is a kernels construct, a loop construct with no
independent or seq clause is treated as if it has the auto clause.
- Loops in serial regions are `seq` if they have no other parallelism
marking such as gang, worker, vector.

For now the `acc.loop` verifier has not yet been updated to enforce
this.
---
 flang/lib/Lower/OpenACC.cpp                   | 67 +++++++++++++++++++
 flang/test/Lower/OpenACC/acc-kernels-loop.f90 | 28 ++++----
 flang/test/Lower/OpenACC/acc-loop.f90         | 52 +++++++-------
 .../test/Lower/OpenACC/acc-parallel-loop.f90  | 28 ++++----
 flang/test/Lower/OpenACC/acc-serial-loop.f90  | 28 ++++----
 mlir/include/mlir/Dialect/OpenACC/OpenACC.h   |  6 ++
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       | 11 +++
 7 files changed, 152 insertions(+), 68 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index c10e1777614cd..69e9c53baa740 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -2150,6 +2150,70 @@ privatizeIv(Fortran::lower::AbstractConverter &converter,
   ivPrivate.push_back(privateValue);
 }
 
+static void determineDefaultLoopParMode(
+    Fortran::lower::AbstractConverter &converter, mlir::acc::LoopOp &loopOp,
+    llvm::SmallVector<mlir::Attribute> &seqDeviceTypes,
+    llvm::SmallVector<mlir::Attribute> &independentDeviceTypes,
+    llvm::SmallVector<mlir::Attribute> &autoDeviceTypes) {
+  auto hasDeviceNone = [](mlir::Attribute attr) -> bool {
+    return mlir::dyn_cast<mlir::acc::DeviceTypeAttr>(attr).getValue() ==
+           mlir::acc::DeviceType::None;
+  };
+  bool hasDefaultSeq = llvm::any_of(seqDeviceTypes, hasDeviceNone);
+  bool hasDefaultIndependent =
+      llvm::any_of(independentDeviceTypes, hasDeviceNone);
+  bool hasDefaultAuto = llvm::any_of(autoDeviceTypes, hasDeviceNone);
+  if (hasDefaultSeq || hasDefaultIndependent || hasDefaultAuto)
+    return; // Default loop par mode is already specified.
+
+  mlir::Region *currentRegion =
+      converter.getFirOpBuilder().getBlock()->getParent();
+  mlir::Operation *parentOp = mlir::acc::getEnclosingComputeOp(*currentRegion);
+  const bool isOrphanedLoop = !parentOp;
+  if (isOrphanedLoop ||
+      mlir::isa_and_present<mlir::acc::ParallelOp>(parentOp)) {
+    // As per OpenACC 3.3 standard section 2.9.6 independent clause:
+    // A loop construct with no auto or seq clause is treated as if it has the
+    // independent clause when it is an orphaned loop construct or its parent
+    // compute construct is a parallel construct.
+    independentDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+        converter.getFirOpBuilder().getContext(), mlir::acc::DeviceType::None));
+  } else if (mlir::isa_and_present<mlir::acc::SerialOp>(parentOp)) {
+    // Serial construct implies `seq` clause on loop. However, this
+    // conflicts with parallelism assignment if already set. Therefore check
+    // that first.
+    bool hasDefaultGangWorkerOrVector =
+        loopOp.hasVector() || loopOp.getVectorValue() || loopOp.hasWorker() ||
+        loopOp.getWorkerValue() || loopOp.hasGang() ||
+        loopOp.getGangValue(mlir::acc::GangArgType::Num) ||
+        loopOp.getGangValue(mlir::acc::GangArgType::Dim) ||
+        loopOp.getGangValue(mlir::acc::GangArgType::Static);
+    if (!hasDefaultGangWorkerOrVector)
+      seqDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+          converter.getFirOpBuilder().getContext(),
+          mlir::acc::DeviceType::None));
+    // Since the loop has some parallelism assigned - we cannot assign `seq`.
+    // However, the `acc.loop` verifier will check that one of seq, independent,
+    // or auto is marked. Seems reasonable to mark as auto since the OpenACC
+    // spec does say "If not, or if it is unable to make a determination, it
+    // must treat the auto clause as if it is a seq clause, and it must
+    // ignore any gang, worker, or vector clauses on the loop construct"
+    else
+      autoDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+          converter.getFirOpBuilder().getContext(),
+          mlir::acc::DeviceType::None));
+  } else {
+    // As per OpenACC 3.3 standard section 2.9.7 auto clause:
+    // When the parent compute construct is a kernels construct, a loop
+    // construct with no independent or seq clause is treated as if it has the
+    // auto clause.
+    assert(mlir::isa_and_present<mlir::acc::KernelsOp>(parentOp) &&
+           "Expected kernels construct");
+    autoDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+        converter.getFirOpBuilder().getContext(), mlir::acc::DeviceType::None));
+  }
+}
+
 static mlir::acc::LoopOp createLoopOp(
     Fortran::lower::AbstractConverter &converter,
     mlir::Location currentLocation,
@@ -2482,6 +2546,9 @@ static mlir::acc::LoopOp createLoopOp(
     loopOp.setTileOperandsSegmentsAttr(
         builder.getDenseI32ArrayAttr(tileOperandsSegments));
 
+  // Determine the loop's default par mode - either seq, independent, or auto.
+  determineDefaultLoopParMode(converter, loopOp, seqDeviceTypes,
+                              independentDeviceTypes, autoDeviceTypes);
   if (!seqDeviceTypes.empty())
     loopOp.setSeqAttr(builder.getArrayAttr(seqDeviceTypes));
   if (!independentDeviceTypes.empty())
diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90
index 8608b0ad98ce6..4e968144399a8 100644
--- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90
@@ -47,7 +47,7 @@ subroutine acc_kernels_loop
 ! CHECK:      acc.kernels {
 ! CHECK:        acc.loop private{{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>]{{.*}}}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -59,7 +59,7 @@ subroutine acc_kernels_loop
 ! CHECK:      acc.kernels combined(loop) {
 ! CHECK:        acc.loop combined(kernels) private{{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>]{{.*}}}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -490,7 +490,7 @@ subroutine acc_kernels_loop
 ! CHECK:      acc.kernels {{.*}} {
 ! CHECK:        acc.loop {{.*}} gang {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -503,7 +503,7 @@ subroutine acc_kernels_loop
 ! CHECK:        [[GANGNUM1:%.*]] = arith.constant 8 : i32
 ! CHECK:        acc.loop {{.*}} gang({num=[[GANGNUM1]] : i32}) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -516,7 +516,7 @@ subroutine acc_kernels_loop
 ! CHECK:        [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK:        acc.loop {{.*}} gang({num=[[GANGNUM2]] : i32}) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -528,7 +528,7 @@ subroutine acc_kernels_loop
 ! CHECK:      acc.kernels {{.*}} {
 ! CHECK:        acc.loop {{.*}} gang({num=%{{.*}} : i32, static=%{{.*}} : i32})
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -540,7 +540,7 @@ subroutine acc_kernels_loop
 ! CHECK:      acc.kernels {{.*}} {
 ! CHECK:        acc.loop {{.*}} vector {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -553,7 +553,7 @@ subroutine acc_kernels_loop
 ! CHECK:        [[CONSTANT128:%.*]] = arith.constant 128 : i32
 ! CHECK:        acc.loop {{.*}} vector([[CONSTANT128]] : i32) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -566,7 +566,7 @@ subroutine acc_kernels_loop
 ! CHECK:        [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK:        acc.loop {{.*}} vector([[VECTORLENGTH]] : i32) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -578,7 +578,7 @@ subroutine acc_kernels_loop
 ! CHECK:      acc.kernels {{.*}} {
 ! CHECK:        acc.loop {{.*}} worker {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -591,7 +591,7 @@ subroutine acc_kernels_loop
 ! CHECK:        [[WORKER128:%.*]] = arith.constant 128 : i32
 ! CHECK:        acc.loop {{.*}} worker([[WORKER128]] : i32) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -605,7 +605,7 @@ subroutine acc_kernels_loop
 ! CHECK:      acc.kernels {{.*}} {
 ! CHECK:        acc.loop {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {collapse = [2], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true>}
+! CHECK-NEXT:   } attributes {{{.*}}collapse = [2], collapseDeviceType = [#acc.device_type<none>]{{.*}}}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
@@ -621,9 +621,9 @@ subroutine acc_kernels_loop
 ! CHECK:        acc.loop {{.*}} {
 ! CHECK:            acc.loop {{.*}} {
 ! CHECK:              acc.yield
-! CHECK-NEXT:     }{{$}}
+! CHECK-NEXT:     } attributes {auto_ = [#acc.device_type<none>]{{.*}}}
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>]{{.*}}}
 ! CHECK:        acc.terminator
 ! CHECK-NEXT: }{{$}}
 
diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90
index 0246f60705898..5baa485534b2a 100644
--- a/flang/test/Lower/OpenACC/acc-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-loop.f90
@@ -29,7 +29,7 @@ program acc_loop
 
 ! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}{{$}}
 
  !$acc loop seq
   DO i = 1, n
@@ -65,7 +65,7 @@ program acc_loop
 
 ! CHECK:      acc.loop gang private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop gang(num: 8)
   DO i = 1, n
@@ -75,7 +75,7 @@ program acc_loop
 ! CHECK:      [[GANGNUM1:%.*]] = arith.constant 8 : i32
 ! CHECK:      acc.loop gang({num=[[GANGNUM1]] : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop gang(num: gangNum)
   DO i = 1, n
@@ -85,7 +85,7 @@ program acc_loop
 ! CHECK:      [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK:      acc.loop gang({num=[[GANGNUM2]] : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
  !$acc loop gang(num: gangNum, static: gangStatic)
   DO i = 1, n
@@ -94,7 +94,7 @@ program acc_loop
 
 ! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop vector
   DO i = 1, n
@@ -103,7 +103,7 @@ program acc_loop
 
 ! CHECK:      acc.loop vector private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop vector(128)
   DO i = 1, n
@@ -113,7 +113,7 @@ program acc_loop
 ! CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32
 ! CHECK:      acc.loop vector([[CONSTANT128]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: }{{$}}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop vector(vectorLength)
   DO i = 1, n
@@ -123,7 +123,7 @@ program acc_loop
 ! CHECK:      [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK:      acc.loop vector([[VECTORLENGTH]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
 !$acc loop worker
   DO i = 1, n
@@ -132,7 +132,7 @@ program acc_loop
 
 ! CHECK:      acc.loop worker private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop worker(128)
   DO i = 1, n
@@ -142,7 +142,7 @@ program acc_loop
 ! CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32
 ! CHECK:      acc.loop worker([[WORKER128]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop private(c)
   DO i = 1, n
@@ -151,7 +151,7 @@ program acc_loop
 
 ! CHECK:      acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref<!fir.array<10x10xf32>>, @privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   ! When the induction variable is explicitly private - only a single private entry should be created.
   !$acc loop private(i)
@@ -161,7 +161,7 @@ program acc_loop
 
 ! CHECK:      acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop private(c, d)
   DO i = 1, n
@@ -170,7 +170,7 @@ program acc_loop
 
 ! CHECK:      acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref<!fir.array<10x10xf32>>, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref<!fir.array<10x10xf32>>, @privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop private(c) private(d)
   DO i = 1, n
@@ -179,7 +179,7 @@ program acc_loop
 
 ! CHECK:      acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref<!fir.array<10x10xf32>>, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref<!fir.array<10x10xf32>>, @privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop tile(2)
   DO i = 1, n
@@ -189,7 +189,7 @@ program acc_loop
 ! CHECK:      [[TILESIZE:%.*]] = arith.constant 2 : i32
 ! CHECK:      acc.loop {{.*}} tile({[[TILESIZE]] : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
  !$acc loop tile(*)
   DO i = 1, n
@@ -198,7 +198,7 @@ program acc_loop
 ! CHECK:      [[TILESIZEM1:%.*]] = arith.constant -1 : i32
 ! CHECK:      acc.loop {{.*}} tile({[[TILESIZEM1]] : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop tile(2, 2)
   DO i = 1, n
@@ -211,7 +211,7 @@ program acc_loop
 ! CHECK:      [[TILESIZE2:%.*]] = arith.constant 2 : i32
 ! CHECK:      acc.loop {{.*}} tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop tile(tileSize)
   DO i = 1, n
@@ -220,7 +220,7 @@ program acc_loop
 
 ! CHECK:      acc.loop {{.*}} tile({%{{.*}} : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop tile(tileSize, tileSize)
   DO i = 1, n
@@ -231,7 +231,7 @@ program acc_loop
 
 ! CHECK:      acc.loop {{.*}} tile({%{{.*}} : i32, %{{.*}} : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop collapse(2)
   DO i = 1, n
@@ -244,7 +244,7 @@ program acc_loop
 ! CHECK:        fir.store %arg0 to %{{.*}} : !fir.ref<i32>
 ! CHECK:        fir.store %arg1 to %{{.*}} : !fir.ref<i32>
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true>}
+! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type<none>]{{.*}}}
 
   !$acc loop
   DO i = 1, n
@@ -257,9 +257,9 @@ program acc_loop
 ! CHECK:      acc.loop {{.*}} control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:          acc.loop {{.*}} control(%arg1 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:            acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop reduction(+:reduction_r) reduction(*:reduction_i)
   do i = 1, n
@@ -269,7 +269,7 @@ program acc_loop
 
 ! CHECK:      acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref<i32>) reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref<f32>, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref<i32>) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
  !$acc loop gang(dim: gangDim, static: gangStatic)
   DO i = 1, n
@@ -278,7 +278,7 @@ program acc_loop
 
 ! CHECK: acc.loop gang({dim=%{{.*}}, static=%{{.*}} : i32}) {{.*}} control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop gang(dim: 1)
   DO i = 1, n
@@ -287,7 +287,7 @@ program acc_loop
 
 ! CHECK:      acc.loop gang({dim={{.*}} : i32}) {{.*}} control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) {
 ! CHECK:        acc.yield
-! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>}
+! CHECK-NEXT: } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
   !$acc loop
   DO i = 1, n
@@ -335,4 +335,4 @@ subroutine sub1(i, j, k)
 ! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "j"}
 ! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref<i32>) -> !fir.ref<i32> {implicit = true, name = "k"}
 ! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[P_I]] : !fir.ref<i32>, @privatization_ref_i32 -> %[[P_J]] : !fir.ref<i32>, @privatization_ref_i32 -> %[[P_K]] : !fir.ref<i32>) control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%c10{{.*}}, %c100{{.*}}, %c200{{.*}} : i32, i32, i32)  step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32)
-! CHECK: } attributes {inclusiveUpperbound = array<i1: true, true, true>}
+! CHECK: } attributes {inclusiveUpperbound = array<i1: true, true, true>, independent = [#acc.device_type<none>]}
diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90
index 4cf268d2517f5..32060179acdf1 100644
--- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90
@@ -49,7 +49,7 @@ subroutine acc_parallel_loop
 ! CHECK:      acc.parallel {
 ! CHECK:        acc.loop private{{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {{{.*}}independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -61,7 +61,7 @@ subroutine acc_parallel_loop
 ! CHECK:      acc.parallel combined(loop) {
 ! CHECK:        acc.loop combined(parallel) private{{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {{{.*}}independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -505,7 +505,7 @@ subroutine acc_parallel_loop
 ! CHECK:      acc.parallel {{.*}} {
 ! CHECK:        acc.loop {{.*}} gang
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -518,7 +518,7 @@ subroutine acc_parallel_loop
 ! CHECK:        [[GANGNUM1:%.*]] = arith.constant 8 : i32
 ! CHECK:        acc.loop {{.*}} gang({num=[[GANGNUM1]] : i32})
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -531,7 +531,7 @@ subroutine acc_parallel_loop
 ! CHECK:        [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK:        acc.loop {{.*}} gang({num=[[GANGNUM2]] : i32})
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -543,7 +543,7 @@ subroutine acc_parallel_loop
 ! CHECK:      acc.parallel {{.*}} {
 ! CHECK:        acc.loop {{.*}} gang({num=%{{.*}} : i32, static=%{{.*}} : i32})
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -555,7 +555,7 @@ subroutine acc_parallel_loop
 ! CHECK:      acc.parallel {{.*}} {
 ! CHECK:        acc.loop {{.*}} vector
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -568,7 +568,7 @@ subroutine acc_parallel_loop
 ! CHECK:        [[CONSTANT128:%.*]] = arith.constant 128 : i32
 ! CHECK:        acc.loop {{.*}} vector([[CONSTANT128]] : i32) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -582,7 +582,7 @@ subroutine acc_parallel_loop
 ! CHECK:        acc.loop {{.*}} vector([[VECTORLENGTH]] : i32) {{.*}} {
 ! CHECK-NOT:      fir.do_loop
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -595,7 +595,7 @@ subroutine acc_parallel_loop
 ! CHECK:        acc.loop {{.*}} worker {{.*}} {
 ! CHECK-NOT:      fir.do_loop
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -609,7 +609,7 @@ subroutine acc_parallel_loop
 ! CHECK:        acc.loop {{.*}} worker([[WORKER128]] : i32) {{.*}} {
 ! CHECK-NOT:      fir.do_loop
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -623,7 +623,7 @@ subroutine acc_parallel_loop
 ! CHECK:      acc.parallel {{.*}} {
 ! CHECK:        acc.loop {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {collapse = [2], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true>}
+! CHECK-NEXT:   } attributes {{{.*}}collapse = [2], collapseDeviceType = [#acc.device_type<none>]{{.*}}}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -639,9 +639,9 @@ subroutine acc_parallel_loop
 ! CHECK:        acc.loop {{.*}} {
 ! CHECK:            acc.loop {{.*}} {
 ! CHECK:              acc.yield
-! CHECK-NEXT:     }{{$}}
+! CHECK-NEXT:     } attributes {{{.*}}independent = [#acc.device_type<none>]}
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {{{.*}}independent = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90
index 34391f78ae707..af7bb0fac158c 100644
--- a/flang/test/Lower/OpenACC/acc-serial-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90
@@ -68,7 +68,7 @@ subroutine acc_serial_loop
 ! CHECK:      acc.serial {
 ! CHECK:        acc.loop private{{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {{{.*}}seq = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -80,7 +80,7 @@ subroutine acc_serial_loop
 ! CHECK:      acc.serial combined(loop) {
 ! CHECK:        acc.loop combined(serial) private{{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {{{.*}}seq = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -446,7 +446,7 @@ subroutine acc_serial_loop
 ! CHECK:      acc.serial {{.*}} {
 ! CHECK:        acc.loop {{.*}} gang {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -459,7 +459,7 @@ subroutine acc_serial_loop
 ! CHECK:        [[GANGNUM1:%.*]] = arith.constant 8 : i32
 ! CHECK:        acc.loop {{.*}} gang({num=[[GANGNUM1]] : i32}) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -472,7 +472,7 @@ subroutine acc_serial_loop
 ! CHECK:        [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK:        acc.loop {{.*}} gang({num=[[GANGNUM2]] : i32}) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -484,7 +484,7 @@ subroutine acc_serial_loop
 ! CHECK:      acc.serial {{.*}} {
 ! CHECK:        acc.loop {{.*}} gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -496,7 +496,7 @@ subroutine acc_serial_loop
 ! CHECK:      acc.serial {{.*}} {
 ! CHECK:        acc.loop {{.*}} vector {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -509,7 +509,7 @@ subroutine acc_serial_loop
 ! CHECK:        [[CONSTANT128:%.*]] = arith.constant 128 : i32
 ! CHECK:        acc.loop {{.*}} vector([[CONSTANT128]] : i32) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -522,7 +522,7 @@ subroutine acc_serial_loop
 ! CHECK:        [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref<i32>
 ! CHECK:        acc.loop {{.*}} vector([[VECTORLENGTH]] : i32) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -534,7 +534,7 @@ subroutine acc_serial_loop
 ! CHECK:      acc.serial {{.*}} {
 ! CHECK:        acc.loop {{.*}} worker {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {inclusiveUpperbound = array<i1: true>}{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -547,7 +547,7 @@ subroutine acc_serial_loop
 ! CHECK:        [[WORKER128:%.*]] = arith.constant 128 : i32
 ! CHECK:        acc.loop {{.*}} worker([[WORKER128]] : i32) {{.*}} {
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {auto_ = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -562,7 +562,7 @@ subroutine acc_serial_loop
 ! CHECK:        acc.loop {{.*}} {
 ! CHECK-NOT:            fir.do_loop
 ! CHECK:          acc.yield
-! CHECK-NEXT:   } attributes {collapse = [2], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true>}
+! CHECK-NEXT:   } attributes {{{.*}}collapse = [2], collapseDeviceType = [#acc.device_type<none>]{{.*}}}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
@@ -578,9 +578,9 @@ subroutine acc_serial_loop
 ! CHECK:        acc.loop {{.*}} {
 ! CHECK:            acc.loop {{.*}} {
 ! CHECK:              acc.yield
-! CHECK-NEXT:     }{{$}}
+! CHECK-NEXT:     } attributes {{{.*}}seq = [#acc.device_type<none>]}
 ! CHECK:          acc.yield
-! CHECK-NEXT:   }{{$}}
+! CHECK-NEXT:   } attributes {{{.*}}seq = [#acc.device_type<none>]}
 ! CHECK:        acc.yield
 ! CHECK-NEXT: }{{$}}
 
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index ff5845343313c..4eb666239d4e4 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -151,6 +151,12 @@ mlir::ValueRange getDataOperands(mlir::Operation *accOp);
 /// Used to get a mutable range iterating over the data operands.
 mlir::MutableOperandRange getMutableDataOperands(mlir::Operation *accOp);
 
+/// Used to obtain the enclosing compute construct operation that contains
+/// the provided `region`. Returns nullptr if no compute construct operation
+/// is found. The returns operation is one of types defined by
+///`ACC_COMPUTE_CONSTRUCT_OPS`.
+mlir::Operation *getEnclosingComputeOp(mlir::Region &region);
+
 /// Used to check whether the provided `type` implements the `PointerLikeType`
 /// interface.
 inline bool isPointerLikeType(mlir::Type type) {
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 658ad28477ace..c72ec47be9f04 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -3820,3 +3820,14 @@ mlir::acc::getMutableDataOperands(mlir::Operation *accOp) {
           .Default([&](mlir::Operation *) { return nullptr; })};
   return dataOperands;
 }
+
+mlir::Operation *mlir::acc::getEnclosingComputeOp(mlir::Region &region) {
+  mlir::Operation *parentOp = region.getParentOp();
+  while (parentOp) {
+    if (mlir::isa<ACC_COMPUTE_CONSTRUCT_OPS>(parentOp)) {
+      return parentOp;
+    }
+    parentOp = parentOp->getParentOp();
+  }
+  return nullptr;
+}

From b3db0c6a1d063ec9ee15253bde3d428c0ad5968b Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Wed, 11 Jun 2025 10:30:38 -0400
Subject: [PATCH 086/851] [HLSL][Driver] Make vk1.3 the default. (#143384)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HLSL driver currently defaults the triple to an unversioned os and
subarch when targeting SPIR-V. This means the SPIR-V backend decides the
default value. That is not a great option because a change the backend
could cause a change in Clang.

Now that we want to choose the default we need to consider the best
option. DXC currently defaults to Vulkan1.0. We are planning on not
supporting Vulkan1.0 in the Clang HLSL compiler because it is newer
versions of Vulkan are commonly supported on nearly all hardware, so
users do not use it.

Since we have to change from DXC anyway, we are using VK1.3. It has been
out long enough to be commonly available, and the initial implementation
of SPIR-V features for HLSL are assuming Vulkan 1.3.

---------

Co-authored-by: Nathan Gauër <github@keenuts.net>
---
 clang/lib/Driver/Driver.cpp      | 27 +++++++++++++--------------
 clang/test/Driver/dxc_spirv.hlsl |  2 +-
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 80728daca03c9..eb60d907d2218 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -1596,28 +1596,27 @@ Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
       A->claim();
 
       if (Args.hasArg(options::OPT_spirv)) {
+        const llvm::StringMap<llvm::Triple::SubArchType> ValidTargets = {
+            {"vulkan1.2", llvm::Triple::SPIRVSubArch_v15},
+            {"vulkan1.3", llvm::Triple::SPIRVSubArch_v16}};
         llvm::Triple T(TargetTriple);
-        T.setArch(llvm::Triple::spirv);
-        T.setOS(llvm::Triple::Vulkan);
 
-        // Set specific Vulkan version if applicable.
+        // Set specific Vulkan version. Default to vulkan1.3.
+        auto TargetInfo = ValidTargets.find("vulkan1.3");
+        assert(TargetInfo != ValidTargets.end());
         if (const Arg *A = Args.getLastArg(options::OPT_fspv_target_env_EQ)) {
-          const llvm::StringMap<llvm::Triple::SubArchType> ValidTargets = {
-              {"vulkan1.2", llvm::Triple::SPIRVSubArch_v15},
-              {"vulkan1.3", llvm::Triple::SPIRVSubArch_v16}};
-
-          auto TargetInfo = ValidTargets.find(A->getValue());
-          if (TargetInfo != ValidTargets.end()) {
-            T.setOSName(TargetInfo->getKey());
-            T.setArch(llvm::Triple::spirv, TargetInfo->getValue());
-          } else {
+          TargetInfo = ValidTargets.find(A->getValue());
+          if (TargetInfo == ValidTargets.end()) {
             Diag(diag::err_drv_invalid_value)
                 << A->getAsString(Args) << A->getValue();
           }
           A->claim();
         }
-
-        TargetTriple = T.str();
+        if (TargetInfo != ValidTargets.end()) {
+          T.setOSName(TargetInfo->getKey());
+          T.setArch(llvm::Triple::spirv, TargetInfo->getValue());
+          TargetTriple = T.str();
+        }
       }
     } else {
       Diag(diag::err_drv_dxc_missing_target_profile);
diff --git a/clang/test/Driver/dxc_spirv.hlsl b/clang/test/Driver/dxc_spirv.hlsl
index e6624e5f1b3f6..65c9018dc54c5 100644
--- a/clang/test/Driver/dxc_spirv.hlsl
+++ b/clang/test/Driver/dxc_spirv.hlsl
@@ -3,7 +3,7 @@
 // RUN: %clang_dxc -T cs_6_0 -spirv -fspv-target-env=vulkan1.3 -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-VULKAN13
 // RUN: not %clang_dxc -T cs_6_0 -spirv -fspv-target-env=vulkan1.0 -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
 
-// CHECK: "-triple" "spirv-unknown-vulkan-compute"
+// CHECK: "-triple" "spirv1.6-unknown-vulkan1.3-compute"
 // CHECK-SAME: "-x" "hlsl"
 
 // CHECK-VULKAN12: "-triple" "spirv1.5-unknown-vulkan1.2-compute"

From 4e441665cc0d1585c8c6e44cf3c71a055f597d2e Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 11 Jun 2025 14:47:47 +0200
Subject: [PATCH 087/851] [BasicAA][ValueTracking] Use MaxLookupSearchDepth
 constant (NFC)

Use MaxLookupSearchDepth in all places limiting an underlying
object walk, instead of hardcoding 6 in various places.
---
 llvm/include/llvm/Analysis/ValueTracking.h | 13 +++++++++----
 llvm/lib/Analysis/BasicAliasAnalysis.cpp   |  4 ----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index b05b8f349b8d5..32ab9733d13c9 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -45,6 +45,10 @@ template <typename T> class ArrayRef;
 
 constexpr unsigned MaxAnalysisRecursionDepth = 6;
 
+/// The max limit of the search depth in DecomposeGEPExpression() and
+/// getUnderlyingObject().
+constexpr unsigned MaxLookupSearchDepth = 6;
+
 /// Determine which bits of V are known to be either zero or one and return
 /// them in the KnownZero/KnownOne bit sets.
 ///
@@ -432,9 +436,10 @@ LLVM_ABI bool isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
 /// original object being addressed. Note that the returned value has pointer
 /// type if the specified value does. If the \p MaxLookup value is non-zero, it
 /// limits the number of instructions to be stripped off.
-LLVM_ABI const Value *getUnderlyingObject(const Value *V,
-                                          unsigned MaxLookup = 6);
-inline Value *getUnderlyingObject(Value *V, unsigned MaxLookup = 6) {
+LLVM_ABI const Value *
+getUnderlyingObject(const Value *V, unsigned MaxLookup = MaxLookupSearchDepth);
+inline Value *getUnderlyingObject(Value *V,
+                                  unsigned MaxLookup = MaxLookupSearchDepth) {
   // Force const to avoid infinite recursion.
   const Value *VConst = V;
   return const_cast<Value *>(getUnderlyingObject(VConst, MaxLookup));
@@ -475,7 +480,7 @@ LLVM_ABI const Value *getUnderlyingObjectAggressive(const Value *V);
 LLVM_ABI void getUnderlyingObjects(const Value *V,
                                    SmallVectorImpl<const Value *> &Objects,
                                    const LoopInfo *LI = nullptr,
-                                   unsigned MaxLookup = 6);
+                                   unsigned MaxLookup = MaxLookupSearchDepth);
 
 /// This is a wrapper around getUnderlyingObjects and adds support for basic
 /// ptrtoint+arithmetic+inttoptr sequences.
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index e6675256fd5a0..f862d6930f545 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -79,10 +79,6 @@ STATISTIC(SearchLimitReached, "Number of times the limit to "
                               "decompose GEPs is reached");
 STATISTIC(SearchTimes, "Number of times a GEP is decomposed");
 
-// The max limit of the search depth in DecomposeGEPExpression() and
-// getUnderlyingObject().
-static const unsigned MaxLookupSearchDepth = 6;
-
 bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA,
                                FunctionAnalysisManager::Invalidator &Inv) {
   // We don't care if this analysis itself is preserved, it has no state. But

From 10f512f7bbda076ca2a0f9e3fcb2e7be0cb07199 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Wed, 11 Jun 2025 07:55:06 -0700
Subject: [PATCH 088/851] Revert runtime work queue patch, it breaks some tests
 that need investigation (#143713)

Revert "[flang][runtime] Another try to fix build failure"

This reverts commit 13869cac2b5051e453aa96ad71220d9d33404620.

Revert "[flang][runtime] Fix build bot flang-runtime-cuda-gcc errors
(#143650)"

This reverts commit d75e28477af0baa063a4d4cc7b3cf657cfadd758.

Revert "[flang][runtime] Replace recursion with iterative work queue
(#137727)"

This reverts commit 163c67ad3d1bf7af6590930d8f18700d65ad4564.
---
 .../include/flang-rt/runtime/environment.h    |   3 -
 flang-rt/include/flang-rt/runtime/stat.h      |  10 +-
 flang-rt/include/flang-rt/runtime/type-info.h |   2 -
 .../include/flang-rt/runtime/work-queue.h     | 552 ---------------
 flang-rt/lib/runtime/CMakeLists.txt           |   2 -
 flang-rt/lib/runtime/assign.cpp               | 623 ++++++-----------
 flang-rt/lib/runtime/derived.cpp              | 517 +++++++-------
 flang-rt/lib/runtime/descriptor-io.cpp        | 651 +-----------------
 flang-rt/lib/runtime/descriptor-io.h          | 620 ++++++++++++++++-
 flang-rt/lib/runtime/environment.cpp          |   4 -
 flang-rt/lib/runtime/namelist.cpp             |   1 -
 flang-rt/lib/runtime/tools.cpp                |   4 +-
 flang-rt/lib/runtime/type-info.cpp            |   6 +-
 flang-rt/lib/runtime/work-queue.cpp           | 161 -----
 flang-rt/unittests/Runtime/ExternalIOTest.cpp |   2 +-
 flang/docs/Extensions.md                      |  10 -
 flang/include/flang/Runtime/assign.h          |   2 +-
 flang/include/flang/Semantics/tools.h         |   7 +-
 flang/lib/Semantics/runtime-type-info.cpp     |   4 -
 flang/lib/Semantics/tools.cpp                 |  32 -
 flang/module/__fortran_type_info.f90          |   3 +-
 flang/test/Lower/volatile-openmp.f90          |   8 +-
 flang/test/Semantics/typeinfo01.f90           |  30 +-
 flang/test/Semantics/typeinfo03.f90           |   2 +-
 flang/test/Semantics/typeinfo04.f90           |   8 +-
 flang/test/Semantics/typeinfo05.f90           |   4 +-
 flang/test/Semantics/typeinfo06.f90           |   4 +-
 flang/test/Semantics/typeinfo07.f90           |   8 +-
 flang/test/Semantics/typeinfo08.f90           |   2 +-
 flang/test/Semantics/typeinfo11.f90           |   2 +-
 flang/test/Semantics/typeinfo12.f90           |  67 --
 31 files changed, 1120 insertions(+), 2231 deletions(-)
 delete mode 100644 flang-rt/include/flang-rt/runtime/work-queue.h
 delete mode 100644 flang-rt/lib/runtime/work-queue.cpp
 delete mode 100644 flang/test/Semantics/typeinfo12.f90

diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h
index e579f6012ce86..16258b3bbba9b 100644
--- a/flang-rt/include/flang-rt/runtime/environment.h
+++ b/flang-rt/include/flang-rt/runtime/environment.h
@@ -64,9 +64,6 @@ struct ExecutionEnvironment {
   bool defaultUTF8{false}; // DEFAULT_UTF8
   bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
 
-  enum InternalDebugging { WorkQueue = 1 };
-  int internalDebugging{0}; // FLANG_RT_DEBUG
-
   // CUDA related variables
   std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE
   bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED
diff --git a/flang-rt/include/flang-rt/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h
index dc372de53506a..070d0bf8673fb 100644
--- a/flang-rt/include/flang-rt/runtime/stat.h
+++ b/flang-rt/include/flang-rt/runtime/stat.h
@@ -24,7 +24,7 @@ class Terminator;
 enum Stat {
   StatOk = 0, // required to be zero by Fortran
 
-  // Interoperable STAT= codes (>= 11)
+  // Interoperable STAT= codes
   StatBaseNull = CFI_ERROR_BASE_ADDR_NULL,
   StatBaseNotNull = CFI_ERROR_BASE_ADDR_NOT_NULL,
   StatInvalidElemLen = CFI_INVALID_ELEM_LEN,
@@ -36,7 +36,7 @@ enum Stat {
   StatMemAllocation = CFI_ERROR_MEM_ALLOCATION,
   StatOutOfBounds = CFI_ERROR_OUT_OF_BOUNDS,
 
-  // Standard STAT= values (>= 101)
+  // Standard STAT= values
   StatFailedImage = FORTRAN_RUNTIME_STAT_FAILED_IMAGE,
   StatLocked = FORTRAN_RUNTIME_STAT_LOCKED,
   StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE,
@@ -49,14 +49,10 @@ enum Stat {
   // Additional "processor-defined" STAT= values
   StatInvalidArgumentNumber = FORTRAN_RUNTIME_STAT_INVALID_ARG_NUMBER,
   StatMissingArgument = FORTRAN_RUNTIME_STAT_MISSING_ARG,
-  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, // -1
+  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT,
   StatMoveAllocSameAllocatable =
       FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE,
   StatBadPointerDeallocation = FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION,
-
-  // Dummy status for work queue continuation, declared here to perhaps
-  // avoid collisions
-  StatContinue = 201
 };
 
 RT_API_ATTRS const char *StatErrorString(int);
diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h
index 9bde3adba87f5..5e79efde164f2 100644
--- a/flang-rt/include/flang-rt/runtime/type-info.h
+++ b/flang-rt/include/flang-rt/runtime/type-info.h
@@ -240,7 +240,6 @@ class DerivedType {
   RT_API_ATTRS bool noFinalizationNeeded() const {
     return noFinalizationNeeded_;
   }
-  RT_API_ATTRS bool noDefinedAssignment() const { return noDefinedAssignment_; }
 
   RT_API_ATTRS std::size_t LenParameters() const {
     return lenParameterKind().Elements();
@@ -323,7 +322,6 @@ class DerivedType {
   bool noInitializationNeeded_{false};
   bool noDestructionNeeded_{false};
   bool noFinalizationNeeded_{false};
-  bool noDefinedAssignment_{false};
 };
 
 } // namespace Fortran::runtime::typeInfo
diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
deleted file mode 100644
index f8cc820c06ca1..0000000000000
--- a/flang-rt/include/flang-rt/runtime/work-queue.h
+++ /dev/null
@@ -1,552 +0,0 @@
-//===-- include/flang-rt/runtime/work-queue.h -------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// Internal runtime utilities for work queues that replace the use of recursion
-// for better GPU device support.
-//
-// A work queue comprises a list of tickets.  Each ticket class has a Begin()
-// member function, which is called once, and a Continue() member function
-// that can be called zero or more times.  A ticket's execution terminates
-// when either of these member functions returns a status other than
-// StatContinue.  When that status is not StatOk, then the whole queue
-// is shut down.
-//
-// By returning StatContinue from its Continue() member function,
-// a ticket suspends its execution so that any nested tickets that it
-// may have created can be run to completion.  It is the reponsibility
-// of each ticket class to maintain resumption information in its state
-// and manage its own progress.  Most ticket classes inherit from
-// class ComponentsOverElements, which implements an outer loop over all
-// components of a derived type, and an inner loop over all elements
-// of a descriptor, possibly with multiple phases of execution per element.
-//
-// Tickets are created by WorkQueue::Begin...() member functions.
-// There is one of these for each "top level" recursive function in the
-// Fortran runtime support library that has been restructured into this
-// ticket framework.
-//
-// When the work queue is running tickets, it always selects the last ticket
-// on the list for execution -- "work stack" might have been a more accurate
-// name for this framework.  This ticket may, while doing its job, create
-// new tickets, and since those are pushed after the active one, the first
-// such nested ticket will be the next one executed to completion -- i.e.,
-// the order of nested WorkQueue::Begin...() calls is respected.
-// Note that a ticket's Continue() member function won't be called again
-// until all nested tickets have run to completion and it is once again
-// the last ticket on the queue.
-//
-// Example for an assignment to a derived type:
-// 1. Assign() is called, and its work queue is created.  It calls
-//    WorkQueue::BeginAssign() and then WorkQueue::Run().
-// 2. Run calls AssignTicket::Begin(), which pushes a tickets via
-//    BeginFinalize() and returns StatContinue.
-// 3. FinalizeTicket::Begin() and FinalizeTicket::Continue() are called
-//    until one of them returns StatOk, which ends the finalization ticket.
-// 4. AssignTicket::Continue() is then called; it creates a DerivedAssignTicket
-//    and then returns StatOk, which ends the ticket.
-// 5. At this point, only one ticket remains.  DerivedAssignTicket::Begin()
-//    and ::Continue() are called until they are done (not StatContinue).
-//    Along the way, it may create nested AssignTickets for components,
-//    and suspend itself so that they may each run to completion.
-
-#ifndef FLANG_RT_RUNTIME_WORK_QUEUE_H_
-#define FLANG_RT_RUNTIME_WORK_QUEUE_H_
-
-#include "flang-rt/runtime/connection.h"
-#include "flang-rt/runtime/descriptor.h"
-#include "flang-rt/runtime/stat.h"
-#include "flang-rt/runtime/type-info.h"
-#include "flang/Common/api-attrs.h"
-#include "flang/Runtime/freestanding-tools.h"
-#include <flang/Common/variant.h>
-
-namespace Fortran::runtime::io {
-class IoStatementState;
-struct NonTbpDefinedIoTable;
-} // namespace Fortran::runtime::io
-
-namespace Fortran::runtime {
-class Terminator;
-class WorkQueue;
-
-// Ticket worker base classes
-
-template <typename TICKET> class ImmediateTicketRunner {
-public:
-  RT_API_ATTRS explicit ImmediateTicketRunner(TICKET &ticket)
-      : ticket_{ticket} {}
-  RT_API_ATTRS int Run(WorkQueue &workQueue) {
-    int status{ticket_.Begin(workQueue)};
-    while (status == StatContinue) {
-      status = ticket_.Continue(workQueue);
-    }
-    return status;
-  }
-
-private:
-  TICKET &ticket_;
-};
-
-// Base class for ticket workers that operate elementwise over descriptors
-class Elementwise {
-public:
-  RT_API_ATTRS Elementwise(
-      const Descriptor &instance, const Descriptor *from = nullptr)
-      : instance_{instance}, from_{from} {
-    instance_.GetLowerBounds(subscripts_);
-    if (from_) {
-      from_->GetLowerBounds(fromSubscripts_);
-    }
-  }
-  RT_API_ATTRS bool IsComplete() const { return elementAt_ >= elements_; }
-  RT_API_ATTRS void Advance() {
-    ++elementAt_;
-    instance_.IncrementSubscripts(subscripts_);
-    if (from_) {
-      from_->IncrementSubscripts(fromSubscripts_);
-    }
-  }
-  RT_API_ATTRS void SkipToEnd() { elementAt_ = elements_; }
-  RT_API_ATTRS void Reset() {
-    elementAt_ = 0;
-    instance_.GetLowerBounds(subscripts_);
-    if (from_) {
-      from_->GetLowerBounds(fromSubscripts_);
-    }
-  }
-
-protected:
-  const Descriptor &instance_, *from_{nullptr};
-  std::size_t elements_{instance_.Elements()};
-  std::size_t elementAt_{0};
-  SubscriptValue subscripts_[common::maxRank];
-  SubscriptValue fromSubscripts_[common::maxRank];
-};
-
-// Base class for ticket workers that operate over derived type components.
-class Componentwise {
-public:
-  RT_API_ATTRS Componentwise(const typeInfo::DerivedType &);
-  RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; }
-  RT_API_ATTRS void Advance() {
-    ++componentAt_;
-    GetComponent();
-  }
-  RT_API_ATTRS void SkipToEnd() {
-    component_ = nullptr;
-    componentAt_ = components_;
-  }
-  RT_API_ATTRS void Reset() {
-    component_ = nullptr;
-    componentAt_ = 0;
-    GetComponent();
-  }
-  RT_API_ATTRS void GetComponent();
-
-protected:
-  const typeInfo::DerivedType &derived_;
-  std::size_t components_{0}, componentAt_{0};
-  const typeInfo::Component *component_{nullptr};
-  StaticDescriptor<common::maxRank, true, 0> componentDescriptor_;
-};
-
-// Base class for ticket workers that operate over derived type components
-// in an outer loop, and elements in an inner loop.
-class ComponentsOverElements : public Componentwise, public Elementwise {
-public:
-  RT_API_ATTRS ComponentsOverElements(const Descriptor &instance,
-      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
-      : Componentwise{derived}, Elementwise{instance, from} {
-    if (Elementwise::IsComplete()) {
-      Componentwise::SkipToEnd();
-    }
-  }
-  RT_API_ATTRS bool IsComplete() const { return Componentwise::IsComplete(); }
-  RT_API_ATTRS void Advance() {
-    SkipToNextElement();
-    if (Elementwise::IsComplete()) {
-      Elementwise::Reset();
-      Componentwise::Advance();
-    }
-  }
-  RT_API_ATTRS void SkipToNextElement() {
-    phase_ = 0;
-    Elementwise::Advance();
-  }
-  RT_API_ATTRS void SkipToNextComponent() {
-    phase_ = 0;
-    Elementwise::Reset();
-    Componentwise::Advance();
-  }
-  RT_API_ATTRS void Reset() {
-    phase_ = 0;
-    Elementwise::Reset();
-    Componentwise::Reset();
-  }
-
-protected:
-  int phase_{0};
-};
-
-// Base class for ticket workers that operate over elements in an outer loop,
-// type components in an inner loop.
-class ElementsOverComponents : public Elementwise, public Componentwise {
-public:
-  RT_API_ATTRS ElementsOverComponents(const Descriptor &instance,
-      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
-      : Elementwise{instance, from}, Componentwise{derived} {
-    if (Componentwise::IsComplete()) {
-      Elementwise::SkipToEnd();
-    }
-  }
-  RT_API_ATTRS bool IsComplete() const { return Elementwise::IsComplete(); }
-  RT_API_ATTRS void Advance() {
-    SkipToNextComponent();
-    if (Componentwise::IsComplete()) {
-      Componentwise::Reset();
-      Elementwise::Advance();
-    }
-  }
-  RT_API_ATTRS void SkipToNextComponent() {
-    phase_ = 0;
-    Componentwise::Advance();
-  }
-  RT_API_ATTRS void SkipToNextElement() {
-    phase_ = 0;
-    Componentwise::Reset();
-    Elementwise::Advance();
-  }
-
-protected:
-  int phase_{0};
-};
-
-// Ticket worker classes
-
-// Implements derived type instance initialization
-class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
-                         private ComponentsOverElements {
-public:
-  RT_API_ATTRS InitializeTicket(
-      const Descriptor &instance, const typeInfo::DerivedType &derived)
-      : ImmediateTicketRunner<InitializeTicket>{*this},
-        ComponentsOverElements{instance, derived} {}
-  RT_API_ATTRS int Begin(WorkQueue &);
-  RT_API_ATTRS int Continue(WorkQueue &);
-};
-
-// Initializes one derived type instance from the value of another
-class InitializeCloneTicket
-    : public ImmediateTicketRunner<InitializeCloneTicket>,
-      private ComponentsOverElements {
-public:
-  RT_API_ATTRS InitializeCloneTicket(const Descriptor &clone,
-      const Descriptor &original, const typeInfo::DerivedType &derived,
-      bool hasStat, const Descriptor *errMsg)
-      : ImmediateTicketRunner<InitializeCloneTicket>{*this},
-        ComponentsOverElements{original, derived}, clone_{clone},
-        hasStat_{hasStat}, errMsg_{errMsg} {}
-  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
-  RT_API_ATTRS int Continue(WorkQueue &);
-
-private:
-  const Descriptor &clone_;
-  bool hasStat_{false};
-  const Descriptor *errMsg_{nullptr};
-  StaticDescriptor<common::maxRank, true, 0> cloneComponentDescriptor_;
-};
-
-// Implements derived type instance finalization
-class FinalizeTicket : public ImmediateTicketRunner<FinalizeTicket>,
-                       private ComponentsOverElements {
-public:
-  RT_API_ATTRS FinalizeTicket(
-      const Descriptor &instance, const typeInfo::DerivedType &derived)
-      : ImmediateTicketRunner<FinalizeTicket>{*this},
-        ComponentsOverElements{instance, derived} {}
-  RT_API_ATTRS int Begin(WorkQueue &);
-  RT_API_ATTRS int Continue(WorkQueue &);
-
-private:
-  const typeInfo::DerivedType *finalizableParentType_{nullptr};
-};
-
-// Implements derived type instance destruction
-class DestroyTicket : public ImmediateTicketRunner<DestroyTicket>,
-                      private ComponentsOverElements {
-public:
-  RT_API_ATTRS DestroyTicket(const Descriptor &instance,
-      const typeInfo::DerivedType &derived, bool finalize)
-      : ImmediateTicketRunner<DestroyTicket>{*this},
-        ComponentsOverElements{instance, derived}, finalize_{finalize} {}
-  RT_API_ATTRS int Begin(WorkQueue &);
-  RT_API_ATTRS int Continue(WorkQueue &);
-
-private:
-  bool finalize_{false};
-};
-
-// Implements general intrinsic assignment
-class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
-public:
-  RT_API_ATTRS AssignTicket(
-      Descriptor &to, const Descriptor &from, int flags, MemmoveFct memmoveFct)
-      : ImmediateTicketRunner<AssignTicket>{*this}, to_{to}, from_{&from},
-        flags_{flags}, memmoveFct_{memmoveFct} {}
-  RT_API_ATTRS int Begin(WorkQueue &);
-  RT_API_ATTRS int Continue(WorkQueue &);
-
-private:
-  RT_API_ATTRS bool IsSimpleMemmove() const {
-    return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() &&
-        from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes();
-  }
-  RT_API_ATTRS Descriptor &GetTempDescriptor();
-
-  Descriptor &to_;
-  const Descriptor *from_{nullptr};
-  int flags_{0}; // enum AssignFlags
-  MemmoveFct memmoveFct_{nullptr};
-  StaticDescriptor<common::maxRank, true, 0> tempDescriptor_;
-  const typeInfo::DerivedType *toDerived_{nullptr};
-  Descriptor *toDeallocate_{nullptr};
-  bool persist_{false};
-  bool done_{false};
-};
-
-// Implements derived type intrinsic assignment.
-template <bool IS_COMPONENTWISE>
-class DerivedAssignTicket
-    : public ImmediateTicketRunner<DerivedAssignTicket<IS_COMPONENTWISE>>,
-      private std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
-          ElementsOverComponents> {
-public:
-  using Base = std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
-      ElementsOverComponents>;
-  RT_API_ATTRS DerivedAssignTicket(const Descriptor &to, const Descriptor &from,
-      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
-      Descriptor *deallocateAfter)
-      : ImmediateTicketRunner<DerivedAssignTicket>{*this},
-        Base{to, derived, &from}, flags_{flags}, memmoveFct_{memmoveFct},
-        deallocateAfter_{deallocateAfter} {}
-  RT_API_ATTRS int Begin(WorkQueue &);
-  RT_API_ATTRS int Continue(WorkQueue &);
-
-private:
-  static constexpr bool isComponentwise_{IS_COMPONENTWISE};
-  bool toIsContiguous_{this->instance_.IsContiguous()};
-  bool fromIsContiguous_{this->from_->IsContiguous()};
-  int flags_{0};
-  MemmoveFct memmoveFct_{nullptr};
-  Descriptor *deallocateAfter_{nullptr};
-  StaticDescriptor<common::maxRank, true, 0> fromComponentDescriptor_;
-};
-
-namespace io::descr {
-
-template <io::Direction DIR>
-class DescriptorIoTicket
-    : public ImmediateTicketRunner<DescriptorIoTicket<DIR>>,
-      private Elementwise {
-public:
-  RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io,
-      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
-      bool &anyIoTookPlace)
-      : ImmediateTicketRunner<DescriptorIoTicket>(*this),
-        Elementwise{descriptor}, io_{io}, table_{table},
-        anyIoTookPlace_{anyIoTookPlace} {}
-  RT_API_ATTRS int Begin(WorkQueue &);
-  RT_API_ATTRS int Continue(WorkQueue &);
-  RT_API_ATTRS bool &anyIoTookPlace() { return anyIoTookPlace_; }
-
-private:
-  io::IoStatementState &io_;
-  const io::NonTbpDefinedIoTable *table_{nullptr};
-  bool &anyIoTookPlace_;
-  common::optional<typeInfo::SpecialBinding> nonTbpSpecial_;
-  const typeInfo::DerivedType *derived_{nullptr};
-  const typeInfo::SpecialBinding *special_{nullptr};
-  StaticDescriptor<common::maxRank, true, 0> elementDescriptor_;
-};
-
-template <io::Direction DIR>
-class DerivedIoTicket : public ImmediateTicketRunner<DerivedIoTicket<DIR>>,
-                        private ElementsOverComponents {
-public:
-  RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io,
-      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
-      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace)
-      : ImmediateTicketRunner<DerivedIoTicket>(*this),
-        ElementsOverComponents{descriptor, derived}, io_{io}, table_{table},
-        anyIoTookPlace_{anyIoTookPlace} {}
-  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
-  RT_API_ATTRS int Continue(WorkQueue &);
-
-private:
-  io::IoStatementState &io_;
-  const io::NonTbpDefinedIoTable *table_{nullptr};
-  bool &anyIoTookPlace_;
-};
-
-} // namespace io::descr
-
-struct NullTicket {
-  RT_API_ATTRS int Begin(WorkQueue &) const { return StatOk; }
-  RT_API_ATTRS int Continue(WorkQueue &) const { return StatOk; }
-};
-
-struct Ticket {
-  RT_API_ATTRS int Continue(WorkQueue &);
-  bool begun{false};
-  std::variant<NullTicket, InitializeTicket, InitializeCloneTicket,
-      FinalizeTicket, DestroyTicket, AssignTicket, DerivedAssignTicket<false>,
-      DerivedAssignTicket<true>,
-      io::descr::DescriptorIoTicket<io::Direction::Output>,
-      io::descr::DescriptorIoTicket<io::Direction::Input>,
-      io::descr::DerivedIoTicket<io::Direction::Output>,
-      io::descr::DerivedIoTicket<io::Direction::Input>>
-      u;
-};
-
-class WorkQueue {
-public:
-  RT_API_ATTRS explicit WorkQueue(Terminator &terminator)
-      : terminator_{terminator} {
-    for (int j{1}; j < numStatic_; ++j) {
-      static_[j].previous = &static_[j - 1];
-      static_[j - 1].next = &static_[j];
-    }
-  }
-  RT_API_ATTRS ~WorkQueue();
-  RT_API_ATTRS Terminator &terminator() { return terminator_; };
-
-  // APIs for particular tasks.  These can return StatOk if the work is
-  // completed immediately.
-  RT_API_ATTRS int BeginInitialize(
-      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
-    if (runTicketsImmediately_) {
-      return InitializeTicket{descriptor, derived}.Run(*this);
-    } else {
-      StartTicket().u.emplace<InitializeTicket>(descriptor, derived);
-      return StatContinue;
-    }
-  }
-  RT_API_ATTRS int BeginInitializeClone(const Descriptor &clone,
-      const Descriptor &original, const typeInfo::DerivedType &derived,
-      bool hasStat, const Descriptor *errMsg) {
-    if (runTicketsImmediately_) {
-      return InitializeCloneTicket{clone, original, derived, hasStat, errMsg}
-          .Run(*this);
-    } else {
-      StartTicket().u.emplace<InitializeCloneTicket>(
-          clone, original, derived, hasStat, errMsg);
-      return StatContinue;
-    }
-  }
-  RT_API_ATTRS int BeginFinalize(
-      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
-    if (runTicketsImmediately_) {
-      return FinalizeTicket{descriptor, derived}.Run(*this);
-    } else {
-      StartTicket().u.emplace<FinalizeTicket>(descriptor, derived);
-      return StatContinue;
-    }
-  }
-  RT_API_ATTRS int BeginDestroy(const Descriptor &descriptor,
-      const typeInfo::DerivedType &derived, bool finalize) {
-    if (runTicketsImmediately_) {
-      return DestroyTicket{descriptor, derived, finalize}.Run(*this);
-    } else {
-      StartTicket().u.emplace<DestroyTicket>(descriptor, derived, finalize);
-      return StatContinue;
-    }
-  }
-  RT_API_ATTRS int BeginAssign(Descriptor &to, const Descriptor &from,
-      int flags, MemmoveFct memmoveFct) {
-    if (runTicketsImmediately_) {
-      return AssignTicket{to, from, flags, memmoveFct}.Run(*this);
-    } else {
-      StartTicket().u.emplace<AssignTicket>(to, from, flags, memmoveFct);
-      return StatContinue;
-    }
-  }
-  template <bool IS_COMPONENTWISE>
-  RT_API_ATTRS int BeginDerivedAssign(Descriptor &to, const Descriptor &from,
-      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
-      Descriptor *deallocateAfter) {
-    if (runTicketsImmediately_) {
-      return DerivedAssignTicket<IS_COMPONENTWISE>{
-          to, from, derived, flags, memmoveFct, deallocateAfter}
-          .Run(*this);
-    } else {
-      StartTicket().u.emplace<DerivedAssignTicket<IS_COMPONENTWISE>>(
-          to, from, derived, flags, memmoveFct, deallocateAfter);
-      return StatContinue;
-    }
-  }
-  template <io::Direction DIR>
-  RT_API_ATTRS int BeginDescriptorIo(io::IoStatementState &io,
-      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
-      bool &anyIoTookPlace) {
-    if (runTicketsImmediately_) {
-      return io::descr::DescriptorIoTicket<DIR>{
-          io, descriptor, table, anyIoTookPlace}
-          .Run(*this);
-    } else {
-      StartTicket().u.emplace<io::descr::DescriptorIoTicket<DIR>>(
-          io, descriptor, table, anyIoTookPlace);
-      return StatContinue;
-    }
-  }
-  template <io::Direction DIR>
-  RT_API_ATTRS int BeginDerivedIo(io::IoStatementState &io,
-      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
-      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) {
-    if (runTicketsImmediately_) {
-      return io::descr::DerivedIoTicket<DIR>{
-          io, descriptor, derived, table, anyIoTookPlace}
-          .Run(*this);
-    } else {
-      StartTicket().u.emplace<io::descr::DerivedIoTicket<DIR>>(
-          io, descriptor, derived, table, anyIoTookPlace);
-      return StatContinue;
-    }
-  }
-
-  RT_API_ATTRS int Run();
-
-private:
-#if RT_DEVICE_COMPILATION
-  // Always use the work queue on a GPU device to avoid recursion.
-  static constexpr bool runTicketsImmediately_{false};
-#else
-  // Avoid the work queue overhead on the host, unless it needs
-  // debugging, which is so much easier there.
-  static constexpr bool runTicketsImmediately_{true};
-#endif
-
-  // Most uses of the work queue won't go very deep.
-  static constexpr int numStatic_{2};
-
-  struct TicketList {
-    bool isStatic{true};
-    Ticket ticket;
-    TicketList *previous{nullptr}, *next{nullptr};
-  };
-
-  RT_API_ATTRS Ticket &StartTicket();
-  RT_API_ATTRS void Stop();
-
-  Terminator &terminator_;
-  TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr};
-  TicketList static_[numStatic_];
-  TicketList *firstFree_{static_};
-};
-
-} // namespace Fortran::runtime
-#endif // FLANG_RT_RUNTIME_WORK_QUEUE_H_
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 332c0872e065f..a3f63b4315644 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -68,7 +68,6 @@ set(supported_sources
   type-info.cpp
   unit.cpp
   utf.cpp
-  work-queue.cpp
 )
 
 # List of source not used for GPU offloading.
@@ -132,7 +131,6 @@ set(gpu_sources
   type-code.cpp
   type-info.cpp
   utf.cpp
-  work-queue.cpp
   complex-powi.cpp
   reduce.cpp
   reduction.cpp
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index 41b130cc8f257..bf67b5dc8b645 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -14,7 +14,6 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
-#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -103,7 +102,11 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
     toDim.SetByteStride(stride);
     stride *= toDim.Extent();
   }
-  return ReturnError(terminator, to.Allocate(kNoAsyncObject));
+  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
+  if (result == StatOk && derived && !derived->noInitializationNeeded()) {
+    result = ReturnError(terminator, Initialize(to, *derived, terminator));
+  }
+  return result;
 }
 
 // least <= 0, most >= 0
@@ -228,8 +231,6 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
   }
 }
 
-RT_OFFLOAD_API_GROUP_BEGIN
-
 // Common implementation of assignments, both intrinsic assignments and
 // those cases of polymorphic user-defined ASSIGNMENT(=) TBPs that could not
 // be resolved in semantics.  Most assignment statements do not need any
@@ -243,453 +244,275 @@ RT_OFFLOAD_API_GROUP_BEGIN
 // dealing with array constructors.
 RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
     Terminator &terminator, int flags, MemmoveFct memmoveFct) {
-  WorkQueue workQueue{terminator};
-  if (workQueue.BeginAssign(to, from, flags, memmoveFct) == StatContinue) {
-    workQueue.Run();
-  }
-}
-
-RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) {
-  bool mustDeallocateLHS{(flags_ & DeallocateLHS) ||
-      MustDeallocateLHS(to_, *from_, workQueue.terminator(), flags_)};
-  DescriptorAddendum *toAddendum{to_.Addendum()};
-  toDerived_ = toAddendum ? toAddendum->derivedType() : nullptr;
-  if (toDerived_ && (flags_ & NeedFinalization) &&
-      toDerived_->noFinalizationNeeded()) {
-    flags_ &= ~NeedFinalization;
-  }
-  if (MayAlias(to_, *from_)) {
+  bool mustDeallocateLHS{(flags & DeallocateLHS) ||
+      MustDeallocateLHS(to, from, terminator, flags)};
+  DescriptorAddendum *toAddendum{to.Addendum()};
+  const typeInfo::DerivedType *toDerived{
+      toAddendum ? toAddendum->derivedType() : nullptr};
+  if (toDerived && (flags & NeedFinalization) &&
+      toDerived->noFinalizationNeeded()) {
+    flags &= ~NeedFinalization;
+  }
+  std::size_t toElementBytes{to.ElementBytes()};
+  std::size_t fromElementBytes{from.ElementBytes()};
+  // The following lambda definition violates the conding style,
+  // but cuda-11.8 nvcc hits an internal error with the brace initialization.
+  auto isSimpleMemmove = [&]() {
+    return !toDerived && to.rank() == from.rank() && to.IsContiguous() &&
+        from.IsContiguous() && toElementBytes == fromElementBytes;
+  };
+  StaticDescriptor<maxRank, true, 10 /*?*/> deferredDeallocStatDesc;
+  Descriptor *deferDeallocation{nullptr};
+  if (MayAlias(to, from)) {
     if (mustDeallocateLHS) {
-      // Convert the LHS into a temporary, then make it look deallocated.
-      toDeallocate_ = &tempDescriptor_.descriptor();
-      persist_ = true; // tempDescriptor_ state must outlive child tickets
+      deferDeallocation = &deferredDeallocStatDesc.descriptor();
       std::memcpy(
-          reinterpret_cast<void *>(toDeallocate_), &to_, to_.SizeInBytes());
-      to_.set_base_addr(nullptr);
-      if (toDerived_ && (flags_ & NeedFinalization)) {
-        if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)};
-            status != StatOk && status != StatContinue) {
-          return status;
-        }
-        flags_ &= ~NeedFinalization;
-      }
-    } else if (!IsSimpleMemmove()) {
+          reinterpret_cast<void *>(deferDeallocation), &to, to.SizeInBytes());
+      to.set_base_addr(nullptr);
+    } else if (!isSimpleMemmove()) {
       // Handle LHS/RHS aliasing by copying RHS into a temp, then
       // recursively assigning from that temp.
-      auto descBytes{from_->SizeInBytes()};
-      Descriptor &newFrom{tempDescriptor_.descriptor()};
-      persist_ = true; // tempDescriptor_ state must outlive child tickets
-      std::memcpy(reinterpret_cast<void *>(&newFrom), from_, descBytes);
+      auto descBytes{from.SizeInBytes()};
+      StaticDescriptor<maxRank, true, 16> staticDesc;
+      Descriptor &newFrom{staticDesc.descriptor()};
+      std::memcpy(reinterpret_cast<void *>(&newFrom), &from, descBytes);
       // Pretend the temporary descriptor is for an ALLOCATABLE
       // entity, otherwise, the Deallocate() below will not
       // free the descriptor memory.
       newFrom.raw().attribute = CFI_attribute_allocatable;
-      if (int stat{ReturnError(
-              workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))};
-          stat != StatOk) {
-        return stat;
-      }
-      if (HasDynamicComponent(*from_)) {
-        // If 'from' has allocatable/automatic component, we cannot
-        // just make a shallow copy of the descriptor member.
-        // This will still leave data overlap in 'to' and 'newFrom'.
-        // For example:
-        //   type t
-        //     character, allocatable :: c(:)
-        //   end type t
-        //   type(t) :: x(3)
-        //   x(2:3) = x(1:2)
-        // We have to make a deep copy into 'newFrom' in this case.
-        if (const DescriptorAddendum *addendum{newFrom.Addendum()}) {
-          if (const auto *derived{addendum->derivedType()}) {
-            if (!derived->noInitializationNeeded()) {
-              if (int status{workQueue.BeginInitialize(newFrom, *derived)};
-                  status != StatOk && status != StatContinue) {
-                return status;
-              }
-            }
-          }
-        }
-        static constexpr int nestedFlags{MaybeReallocate | PolymorphicLHS};
-        if (int status{workQueue.BeginAssign(
-                newFrom, *from_, nestedFlags, memmoveFct_)};
-            status != StatOk && status != StatContinue) {
-          return status;
+      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
+      if (stat == StatOk) {
+        if (HasDynamicComponent(from)) {
+          // If 'from' has allocatable/automatic component, we cannot
+          // just make a shallow copy of the descriptor member.
+          // This will still leave data overlap in 'to' and 'newFrom'.
+          // For example:
+          //   type t
+          //     character, allocatable :: c(:)
+          //   end type t
+          //   type(t) :: x(3)
+          //   x(2:3) = x(1:2)
+          // We have to make a deep copy into 'newFrom' in this case.
+          RTNAME(AssignTemporary)
+          (newFrom, from, terminator.sourceFileName(), terminator.sourceLine());
+        } else {
+          ShallowCopy(newFrom, from, true, from.IsContiguous());
         }
-      } else {
-        ShallowCopy(newFrom, *from_, true, from_->IsContiguous());
+        Assign(to, newFrom, terminator,
+            flags &
+                (NeedFinalization | ComponentCanBeDefinedAssignment |
+                    ExplicitLengthCharacterLHS | CanBeDefinedAssignment));
+        newFrom.Deallocate();
       }
-      from_ = &newFrom;
-      flags_ &= NeedFinalization | ComponentCanBeDefinedAssignment |
-          ExplicitLengthCharacterLHS | CanBeDefinedAssignment;
-      toDeallocate_ = &newFrom;
+      return;
     }
   }
-  if (to_.IsAllocatable()) {
+  if (to.IsAllocatable()) {
     if (mustDeallocateLHS) {
-      if (!toDeallocate_ && to_.IsAllocated()) {
-        toDeallocate_ = &to_;
+      if (deferDeallocation) {
+        if ((flags & NeedFinalization) && toDerived) {
+          Finalize(*deferDeallocation, *toDerived, &terminator);
+          flags &= ~NeedFinalization;
+        }
+      } else {
+        to.Destroy((flags & NeedFinalization) != 0, /*destroyPointers=*/false,
+            &terminator);
+        flags &= ~NeedFinalization;
       }
-    } else if (to_.rank() != from_->rank() && !to_.IsAllocated()) {
-      workQueue.terminator().Crash("Assign: mismatched ranks (%d != %d) in "
-                                   "assignment to unallocated allocatable",
-          to_.rank(), from_->rank());
+    } else if (to.rank() != from.rank() && !to.IsAllocated()) {
+      terminator.Crash("Assign: mismatched ranks (%d != %d) in assignment to "
+                       "unallocated allocatable",
+          to.rank(), from.rank());
     }
-  } else if (!to_.IsAllocated()) {
-    workQueue.terminator().Crash(
-        "Assign: left-hand side variable is neither allocated nor allocatable");
-  }
-  if (toDerived_ && to_.IsAllocated()) {
-    // Schedule finalization or destruction of the LHS.
-    if (flags_ & NeedFinalization) {
-      if (int status{workQueue.BeginFinalize(to_, *toDerived_)};
-          status != StatOk && status != StatContinue) {
-        return status;
-      }
-    } else if (!toDerived_->noDestructionNeeded()) {
-      if (int status{
-              workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)};
-          status != StatOk && status != StatContinue) {
-        return status;
+    if (!to.IsAllocated()) {
+      if (AllocateAssignmentLHS(to, from, terminator, flags) != StatOk) {
+        return;
       }
+      flags &= ~NeedFinalization;
+      toElementBytes = to.ElementBytes(); // may have changed
+      toDerived = toAddendum ? toAddendum->derivedType() : nullptr;
     }
   }
-  return StatContinue;
-}
-
-RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) {
-  if (done_) {
-    // All child tickets are complete; can release this ticket's state.
-    if (toDeallocate_) {
-      toDeallocate_->Deallocate();
-    }
-    return StatOk;
-  }
-  // All necessary finalization or destruction that was initiated by Begin()
-  // has been completed.  Deallocation may be pending, and if it's for the LHS,
-  // do it now so that the LHS gets reallocated.
-  if (toDeallocate_ == &to_) {
-    toDeallocate_ = nullptr;
-    to_.Deallocate();
-  }
-  // Allocate the LHS if needed
-  if (!to_.IsAllocated()) {
-    if (int stat{
-            AllocateAssignmentLHS(to_, *from_, workQueue.terminator(), flags_)};
-        stat != StatOk) {
-      return stat;
-    }
-    const auto *addendum{to_.Addendum()};
-    toDerived_ = addendum ? addendum->derivedType() : nullptr;
-    if (toDerived_ && !toDerived_->noInitializationNeeded()) {
-      if (int status{workQueue.BeginInitialize(to_, *toDerived_)};
-          status != StatOk) {
-        return status;
-      }
-    }
-  }
-  // Check for a user-defined assignment type-bound procedure;
-  // see 10.2.1.4-5.
-  // Note that the aliasing and LHS (re)allocation handling above
-  // needs to run even with CanBeDefinedAssignment flag, since
-  // Assign() can be invoked recursively for component-wise assignments.
-  if (toDerived_ && (flags_ & CanBeDefinedAssignment)) {
-    if (to_.rank() == 0) {
-      if (const auto *special{toDerived_->FindSpecialBinding(
+  if (toDerived && (flags & CanBeDefinedAssignment)) {
+    // Check for a user-defined assignment type-bound procedure;
+    // see 10.2.1.4-5.  A user-defined assignment TBP defines all of
+    // the semantics, including allocatable (re)allocation and any
+    // finalization.
+    //
+    // Note that the aliasing and LHS (re)allocation handling above
+    // needs to run even with CanBeDefinedAssignment flag, when
+    // the Assign() is invoked recursively for component-per-component
+    // assignments.
+    if (to.rank() == 0) {
+      if (const auto *special{toDerived->FindSpecialBinding(
               typeInfo::SpecialBinding::Which::ScalarAssignment)}) {
-        DoScalarDefinedAssignment(to_, *from_, *special);
-        done_ = true;
-        return StatContinue;
+        return DoScalarDefinedAssignment(to, from, *special);
       }
     }
-    if (const auto *special{toDerived_->FindSpecialBinding(
+    if (const auto *special{toDerived->FindSpecialBinding(
             typeInfo::SpecialBinding::Which::ElementalAssignment)}) {
-      DoElementalDefinedAssignment(to_, *from_, *toDerived_, *special);
-      done_ = true;
-      return StatContinue;
+      return DoElementalDefinedAssignment(to, from, *toDerived, *special);
     }
   }
-  // Intrinsic assignment
-  std::size_t toElements{to_.Elements()};
-  if (from_->rank() > 0 && toElements != from_->Elements()) {
-    workQueue.terminator().Crash("Assign: mismatching element counts in array "
-                                 "assignment (to %zd, from %zd)",
-        toElements, from_->Elements());
+  SubscriptValue toAt[maxRank];
+  to.GetLowerBounds(toAt);
+  // Scalar expansion of the RHS is implied by using the same empty
+  // subscript values on each (seemingly) elemental reference into
+  // "from".
+  SubscriptValue fromAt[maxRank];
+  from.GetLowerBounds(fromAt);
+  std::size_t toElements{to.Elements()};
+  if (from.rank() > 0 && toElements != from.Elements()) {
+    terminator.Crash("Assign: mismatching element counts in array assignment "
+                     "(to %zd, from %zd)",
+        toElements, from.Elements());
   }
-  if (to_.type() != from_->type()) {
-    workQueue.terminator().Crash(
-        "Assign: mismatching types (to code %d != from code %d)",
-        to_.type().raw(), from_->type().raw());
+  if (to.type() != from.type()) {
+    terminator.Crash("Assign: mismatching types (to code %d != from code %d)",
+        to.type().raw(), from.type().raw());
   }
-  std::size_t toElementBytes{to_.ElementBytes()};
-  std::size_t fromElementBytes{from_->ElementBytes()};
-  if (toElementBytes > fromElementBytes && !to_.type().IsCharacter()) {
-    workQueue.terminator().Crash("Assign: mismatching non-character element "
-                                 "sizes (to %zd bytes != from %zd bytes)",
+  if (toElementBytes > fromElementBytes && !to.type().IsCharacter()) {
+    terminator.Crash("Assign: mismatching non-character element sizes (to %zd "
+                     "bytes != from %zd bytes)",
         toElementBytes, fromElementBytes);
   }
-  if (toDerived_) {
-    if (toDerived_->noDefinedAssignment()) { // componentwise
-      if (int status{workQueue.BeginDerivedAssign<true>(
-              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
-          status != StatOk && status != StatContinue) {
-        return status;
+  if (const typeInfo::DerivedType *
+      updatedToDerived{toAddendum ? toAddendum->derivedType() : nullptr}) {
+    // Derived type intrinsic assignment, which is componentwise and elementwise
+    // for all components, including parent components (10.2.1.2-3).
+    // The target is first finalized if still necessary (7.5.6.3(1))
+    if (flags & NeedFinalization) {
+      Finalize(to, *updatedToDerived, &terminator);
+    } else if (updatedToDerived && !updatedToDerived->noDestructionNeeded()) {
+      Destroy(to, /*finalize=*/false, *updatedToDerived, &terminator);
+    }
+    // Copy the data components (incl. the parent) first.
+    const Descriptor &componentDesc{updatedToDerived->component()};
+    std::size_t numComponents{componentDesc.Elements()};
+    for (std::size_t j{0}; j < toElements;
+         ++j, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
+      for (std::size_t k{0}; k < numComponents; ++k) {
+        const auto &comp{
+            *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
+                k)}; // TODO: exploit contiguity here
+        // Use PolymorphicLHS for components so that the right things happen
+        // when the components are polymorphic; when they're not, they're both
+        // not, and their declared types will match.
+        int nestedFlags{MaybeReallocate | PolymorphicLHS};
+        if (flags & ComponentCanBeDefinedAssignment) {
+          nestedFlags |=
+              CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
+        }
+        switch (comp.genre()) {
+        case typeInfo::Component::Genre::Data:
+          if (comp.category() == TypeCategory::Derived) {
+            StaticDescriptor<maxRank, true, 10 /*?*/> statDesc[2];
+            Descriptor &toCompDesc{statDesc[0].descriptor()};
+            Descriptor &fromCompDesc{statDesc[1].descriptor()};
+            comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt);
+            comp.CreatePointerDescriptor(
+                fromCompDesc, from, terminator, fromAt);
+            Assign(toCompDesc, fromCompDesc, terminator, nestedFlags);
+          } else { // Component has intrinsic type; simply copy raw bytes
+            std::size_t componentByteSize{comp.SizeInBytes(to)};
+            memmoveFct(to.Element<char>(toAt) + comp.offset(),
+                from.Element<const char>(fromAt) + comp.offset(),
+                componentByteSize);
+          }
+          break;
+        case typeInfo::Component::Genre::Pointer: {
+          std::size_t componentByteSize{comp.SizeInBytes(to)};
+          memmoveFct(to.Element<char>(toAt) + comp.offset(),
+              from.Element<const char>(fromAt) + comp.offset(),
+              componentByteSize);
+        } break;
+        case typeInfo::Component::Genre::Allocatable:
+        case typeInfo::Component::Genre::Automatic: {
+          auto *toDesc{reinterpret_cast<Descriptor *>(
+              to.Element<char>(toAt) + comp.offset())};
+          const auto *fromDesc{reinterpret_cast<const Descriptor *>(
+              from.Element<char>(fromAt) + comp.offset())};
+          // Allocatable components of the LHS are unconditionally
+          // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
+          // unlike a "top-level" assignment to a variable, where
+          // deallocation is optional.
+          //
+          // Be careful not to destroy/reallocate the LHS, if there is
+          // overlap between LHS and RHS (it seems that partial overlap
+          // is not possible, though).
+          // Invoke Assign() recursively to deal with potential aliasing.
+          if (toDesc->IsAllocatable()) {
+            if (!fromDesc->IsAllocated()) {
+              // No aliasing.
+              //
+              // If to is not allocated, the Destroy() call is a no-op.
+              // This is just a shortcut, because the recursive Assign()
+              // below would initiate the destruction for to.
+              // No finalization is required.
+              toDesc->Destroy(
+                  /*finalize=*/false, /*destroyPointers=*/false, &terminator);
+              continue; // F'2018 10.2.1.3(13)(2)
+            }
+          }
+          // Force LHS deallocation with DeallocateLHS flag.
+          // The actual deallocation may be avoided, if the existing
+          // location can be reoccupied.
+          Assign(*toDesc, *fromDesc, terminator, nestedFlags | DeallocateLHS);
+        } break;
+        }
       }
-    } else { // elementwise
-      if (int status{workQueue.BeginDerivedAssign<false>(
-              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
-          status != StatOk && status != StatContinue) {
-        return status;
+      // Copy procedure pointer components
+      const Descriptor &procPtrDesc{updatedToDerived->procPtr()};
+      std::size_t numProcPtrs{procPtrDesc.Elements()};
+      for (std::size_t k{0}; k < numProcPtrs; ++k) {
+        const auto &procPtr{
+            *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(
+                k)};
+        memmoveFct(to.Element<char>(toAt) + procPtr.offset,
+            from.Element<const char>(fromAt) + procPtr.offset,
+            sizeof(typeInfo::ProcedurePointer));
       }
     }
-    toDeallocate_ = nullptr;
-  } else if (IsSimpleMemmove()) {
-    memmoveFct_(to_.raw().base_addr, from_->raw().base_addr,
-        toElements * toElementBytes);
-  } else {
-    // Scalar expansion of the RHS is implied by using the same empty
-    // subscript values on each (seemingly) elemental reference into
-    // "from".
-    SubscriptValue toAt[maxRank];
-    to_.GetLowerBounds(toAt);
-    SubscriptValue fromAt[maxRank];
-    from_->GetLowerBounds(fromAt);
-    if (toElementBytes > fromElementBytes) { // blank padding
-      switch (to_.type().raw()) {
+  } else { // intrinsic type, intrinsic assignment
+    if (isSimpleMemmove()) {
+      memmoveFct(to.raw().base_addr, from.raw().base_addr,
+          toElements * toElementBytes);
+    } else if (toElementBytes > fromElementBytes) { // blank padding
+      switch (to.type().raw()) {
       case CFI_type_signed_char:
       case CFI_type_char:
-        BlankPadCharacterAssignment<char>(to_, *from_, toAt, fromAt, toElements,
+        BlankPadCharacterAssignment<char>(to, from, toAt, fromAt, toElements,
             toElementBytes, fromElementBytes);
         break;
       case CFI_type_char16_t:
-        BlankPadCharacterAssignment<char16_t>(to_, *from_, toAt, fromAt,
+        BlankPadCharacterAssignment<char16_t>(to, from, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       case CFI_type_char32_t:
-        BlankPadCharacterAssignment<char32_t>(to_, *from_, toAt, fromAt,
+        BlankPadCharacterAssignment<char32_t>(to, from, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       default:
-        workQueue.terminator().Crash(
-            "unexpected type code %d in blank padded Assign()",
-            to_.type().raw());
+        terminator.Crash("unexpected type code %d in blank padded Assign()",
+            to.type().raw());
       }
     } else { // elemental copies, possibly with character truncation
       for (std::size_t n{toElements}; n-- > 0;
-          to_.IncrementSubscripts(toAt), from_->IncrementSubscripts(fromAt)) {
-        memmoveFct_(to_.Element<char>(toAt), from_->Element<const char>(fromAt),
+          to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
+        memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
             toElementBytes);
       }
     }
   }
-  if (persist_) {
-    done_ = true;
-    return StatContinue;
-  } else {
-    if (toDeallocate_) {
-      toDeallocate_->Deallocate();
-      toDeallocate_ = nullptr;
-    }
-    return StatOk;
+  if (deferDeallocation) {
+    // deferDeallocation is used only when LHS is an allocatable.
+    // The finalization has already been run for it.
+    deferDeallocation->Destroy(
+        /*finalize=*/false, /*destroyPointers=*/false, &terminator);
   }
 }
 
-template <bool IS_COMPONENTWISE>
-RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
-    WorkQueue &workQueue) {
-  if (toIsContiguous_ && fromIsContiguous_ &&
-      this->derived_.noDestructionNeeded() &&
-      this->derived_.noDefinedAssignment() &&
-      this->instance_.rank() == this->from_->rank()) {
-    if (std::size_t elementBytes{this->instance_.ElementBytes()};
-        elementBytes == this->from_->ElementBytes()) {
-      // Fastest path.  Both LHS and RHS are contiguous, RHS is not a scalar
-      // to be expanded, the types have the same size, and there are no
-      // allocatable components or defined ASSIGNMENT(=) at any level.
-      memmoveFct_(this->instance_.template OffsetElement<char>(),
-          this->from_->template OffsetElement<const char *>(),
-          this->instance_.Elements() * elementBytes);
-      return StatOk;
-    }
-  }
-  // Use PolymorphicLHS for components so that the right things happen
-  // when the components are polymorphic; when they're not, they're both
-  // not, and their declared types will match.
-  int nestedFlags{MaybeReallocate | PolymorphicLHS};
-  if (flags_ & ComponentCanBeDefinedAssignment) {
-    nestedFlags |= CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
-  }
-  flags_ = nestedFlags;
-  // Copy procedure pointer components
-  const Descriptor &procPtrDesc{this->derived_.procPtr()};
-  bool noDataComponents{this->IsComplete()};
-  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
-    for (std::size_t k{0}; k < numProcPtrs; ++k) {
-      const auto &procPtr{
-          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
-      // Loop only over elements
-      if (noDataComponents) {
-        Elementwise::Reset();
-      }
-      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
-        memmoveFct_(this->instance_.template ElementComponent<char>(
-                        this->subscripts_, procPtr.offset),
-            this->from_->template ElementComponent<const char>(
-                this->fromSubscripts_, procPtr.offset),
-            sizeof(typeInfo::ProcedurePointer));
-      }
-    }
-    if (noDataComponents) {
-      return StatOk;
-    }
-    Elementwise::Reset();
-  }
-  if (noDataComponents) {
-    return StatOk;
-  }
-  return StatContinue;
-}
-template RT_API_ATTRS int DerivedAssignTicket<false>::Begin(WorkQueue &);
-template RT_API_ATTRS int DerivedAssignTicket<true>::Begin(WorkQueue &);
-
-template <bool IS_COMPONENTWISE>
-RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
-    WorkQueue &workQueue) {
-  while (!this->IsComplete()) {
-    // Copy the data components (incl. the parent) first.
-    switch (this->component_->genre()) {
-    case typeInfo::Component::Genre::Data:
-      if (this->component_->category() == TypeCategory::Derived) {
-        Descriptor &toCompDesc{this->componentDescriptor_.descriptor()};
-        Descriptor &fromCompDesc{this->fromComponentDescriptor_.descriptor()};
-        this->component_->CreatePointerDescriptor(toCompDesc, this->instance_,
-            workQueue.terminator(), this->subscripts_);
-        this->component_->CreatePointerDescriptor(fromCompDesc, *this->from_,
-            workQueue.terminator(), this->fromSubscripts_);
-        this->Advance();
-        if (int status{workQueue.BeginAssign(
-                toCompDesc, fromCompDesc, flags_, memmoveFct_)};
-            status != StatOk) {
-          return status;
-        }
-      } else { // Component has intrinsic type; simply copy raw bytes
-        std::size_t componentByteSize{
-            this->component_->SizeInBytes(this->instance_)};
-        if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
-          std::size_t offset{this->component_->offset()};
-          char *to{this->instance_.template OffsetElement<char>(offset)};
-          const char *from{
-              this->from_->template OffsetElement<const char>(offset)};
-          std::size_t toElementStride{this->instance_.ElementBytes()};
-          std::size_t fromElementStride{
-              this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
-          if (toElementStride == fromElementStride &&
-              toElementStride == componentByteSize) {
-            memmoveFct_(to, from, this->elements_ * componentByteSize);
-          } else {
-            for (std::size_t n{this->elements_}; n--;
-                to += toElementStride, from += fromElementStride) {
-              memmoveFct_(to, from, componentByteSize);
-            }
-          }
-          this->Componentwise::Advance();
-        } else {
-          memmoveFct_(
-              this->instance_.template Element<char>(this->subscripts_) +
-                  this->component_->offset(),
-              this->from_->template Element<const char>(this->fromSubscripts_) +
-                  this->component_->offset(),
-              componentByteSize);
-          this->Advance();
-        }
-      }
-      break;
-    case typeInfo::Component::Genre::Pointer: {
-      std::size_t componentByteSize{
-          this->component_->SizeInBytes(this->instance_)};
-      if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
-        std::size_t offset{this->component_->offset()};
-        char *to{this->instance_.template OffsetElement<char>(offset)};
-        const char *from{
-            this->from_->template OffsetElement<const char>(offset)};
-        std::size_t toElementStride{this->instance_.ElementBytes()};
-        std::size_t fromElementStride{
-            this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
-        if (toElementStride == fromElementStride &&
-            toElementStride == componentByteSize) {
-          memmoveFct_(to, from, this->elements_ * componentByteSize);
-        } else {
-          for (std::size_t n{this->elements_}; n--;
-              to += toElementStride, from += fromElementStride) {
-            memmoveFct_(to, from, componentByteSize);
-          }
-        }
-        this->Componentwise::Advance();
-      } else {
-        memmoveFct_(this->instance_.template Element<char>(this->subscripts_) +
-                this->component_->offset(),
-            this->from_->template Element<const char>(this->fromSubscripts_) +
-                this->component_->offset(),
-            componentByteSize);
-        this->Advance();
-      }
-    } break;
-    case typeInfo::Component::Genre::Allocatable:
-    case typeInfo::Component::Genre::Automatic: {
-      auto *toDesc{reinterpret_cast<Descriptor *>(
-          this->instance_.template Element<char>(this->subscripts_) +
-          this->component_->offset())};
-      const auto *fromDesc{reinterpret_cast<const Descriptor *>(
-          this->from_->template Element<char>(this->fromSubscripts_) +
-          this->component_->offset())};
-      if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) {
-        if (toDesc->IsAllocated()) {
-          if (this->phase_ == 0) {
-            this->phase_++;
-            if (const auto *componentDerived{this->component_->derivedType()};
-                componentDerived && !componentDerived->noDestructionNeeded()) {
-              if (int status{workQueue.BeginDestroy(
-                      *toDesc, *componentDerived, /*finalize=*/false)};
-                  status != StatOk) {
-                return status;
-              }
-            }
-          }
-          toDesc->Deallocate();
-        }
-        this->Advance();
-      } else {
-        // Allocatable components of the LHS are unconditionally
-        // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
-        // unlike a "top-level" assignment to a variable, where
-        // deallocation is optional.
-        this->Advance();
-        int nestedFlags{flags_};
-        if (this->derived_.noFinalizationNeeded() &&
-            this->derived_.noInitializationNeeded() &&
-            this->derived_.noDestructionNeeded()) {
-          // The actual deallocation may be avoided, if the existing
-          // location can be reoccupied.
-        } else {
-          // Force LHS deallocation with DeallocateLHS flag.
-          nestedFlags |= DeallocateLHS;
-        }
-        if (int status{workQueue.BeginAssign(
-                *toDesc, *fromDesc, nestedFlags, memmoveFct_)};
-            status != StatOk) {
-          return status;
-        }
-      }
-    } break;
-    }
-  }
-  if (deallocateAfter_) {
-    deallocateAfter_->Deallocate();
-  }
-  return StatOk;
-}
-template RT_API_ATTRS int DerivedAssignTicket<false>::Continue(WorkQueue &);
-template RT_API_ATTRS int DerivedAssignTicket<true>::Continue(WorkQueue &);
+RT_OFFLOAD_API_GROUP_BEGIN
 
 RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
     const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) {
@@ -759,6 +582,7 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from,
       }
     }
   }
+
   Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS);
 }
 
@@ -775,6 +599,7 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
 void RTDEF(CopyOutAssign)(
     Descriptor *var, Descriptor &temp, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
+
   // Copyout from the temporary must not cause any finalizations
   // for LHS. The variable must be properly initialized already.
   if (var) {
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 8ab737c701b01..35037036f63e7 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -12,7 +12,6 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
-#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -31,193 +30,180 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
 }
 
 RT_API_ATTRS int Initialize(const Descriptor &instance,
-    const typeInfo::DerivedType &derived, Terminator &terminator, bool,
-    const Descriptor *) {
-  WorkQueue workQueue{terminator};
-  int status{workQueue.BeginInitialize(instance, derived)};
-  return status == StatContinue ? workQueue.Run() : status;
-}
-
-RT_API_ATTRS int InitializeTicket::Begin(WorkQueue &) {
-  // Initialize procedure pointer components in each element
-  const Descriptor &procPtrDesc{derived_.procPtr()};
-  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
-    bool noDataComponents{IsComplete()};
-    for (std::size_t k{0}; k < numProcPtrs; ++k) {
-      const auto &comp{
-          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
-      // Loop only over elements
-      if (noDataComponents) {
-        Elementwise::Reset();
-      }
-      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
-        auto &pptr{*instance_.ElementComponent<typeInfo::ProcedurePointer>(
-            subscripts_, comp.offset)};
-        pptr = comp.procInitialization;
-      }
-    }
-    if (noDataComponents) {
-      return StatOk;
-    }
-    Elementwise::Reset();
-  }
-  return StatContinue;
-}
-
-RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
-  while (!IsComplete()) {
-    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
-      // Establish allocatable descriptors
-      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
-        Descriptor &allocDesc{*instance_.ElementComponent<Descriptor>(
-            subscripts_, component_->offset())};
-        component_->EstablishDescriptor(
-            allocDesc, instance_, workQueue.terminator());
+    const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat,
+    const Descriptor *errMsg) {
+  const Descriptor &componentDesc{derived.component()};
+  std::size_t elements{instance.Elements()};
+  int stat{StatOk};
+  // Initialize data components in each element; the per-element iterations
+  // constitute the inner loops, not the outer ones
+  std::size_t myComponents{componentDesc.Elements()};
+  for (std::size_t k{0}; k < myComponents; ++k) {
+    const auto &comp{
+        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
+    SubscriptValue at[maxRank];
+    instance.GetLowerBounds(at);
+    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
+        comp.genre() == typeInfo::Component::Genre::Automatic) {
+      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
+        Descriptor &allocDesc{
+            *instance.ElementComponent<Descriptor>(at, comp.offset())};
+        comp.EstablishDescriptor(allocDesc, instance, terminator);
         allocDesc.raw().attribute = CFI_attribute_allocatable;
+        if (comp.genre() == typeInfo::Component::Genre::Automatic) {
+          stat = ReturnError(
+              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
+          if (stat == StatOk) {
+            if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
+              if (const auto *derived{addendum->derivedType()}) {
+                if (!derived->noInitializationNeeded()) {
+                  stat = Initialize(
+                      allocDesc, *derived, terminator, hasStat, errMsg);
+                }
+              }
+            }
+          }
+          if (stat != StatOk) {
+            break;
+          }
+        }
       }
-      SkipToNextComponent();
-    } else if (const void *init{component_->initialization()}) {
+    } else if (const void *init{comp.initialization()}) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
-      std::size_t bytes{component_->SizeInBytes(instance_)};
-      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
-        char *ptr{instance_.ElementComponent<char>(
-            subscripts_, component_->offset())};
+      std::size_t bytes{comp.SizeInBytes(instance)};
+      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
+        char *ptr{instance.ElementComponent<char>(at, comp.offset())};
         std::memcpy(ptr, init, bytes);
       }
-      SkipToNextComponent();
-    } else if (component_->genre() == typeInfo::Component::Genre::Pointer) {
+    } else if (comp.genre() == typeInfo::Component::Genre::Pointer) {
       // Data pointers without explicit initialization are established
       // so that they are valid right-hand side targets of pointer
       // assignment statements.
-      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
-        Descriptor &ptrDesc{*instance_.ElementComponent<Descriptor>(
-            subscripts_, component_->offset())};
-        component_->EstablishDescriptor(
-            ptrDesc, instance_, workQueue.terminator());
+      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
+        Descriptor &ptrDesc{
+            *instance.ElementComponent<Descriptor>(at, comp.offset())};
+        comp.EstablishDescriptor(ptrDesc, instance, terminator);
         ptrDesc.raw().attribute = CFI_attribute_pointer;
       }
-      SkipToNextComponent();
-    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
-        component_->derivedType() &&
-        !component_->derivedType()->noInitializationNeeded()) {
+    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
+        comp.derivedType() && !comp.derivedType()->noInitializationNeeded()) {
       // Default initialization of non-pointer non-allocatable/automatic
-      // data component.  Handles parent component's elements.
+      // data component.  Handles parent component's elements.  Recursive.
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, *component_, instance_);
-      Descriptor &compDesc{componentDescriptor_.descriptor()};
-      const typeInfo::DerivedType &compType{*component_->derivedType()};
-      compDesc.Establish(compType,
-          instance_.ElementComponent<char>(subscripts_, component_->offset()),
-          component_->rank(), extents);
-      Advance();
-      if (int status{workQueue.BeginInitialize(compDesc, compType)};
-          status != StatOk) {
-        return status;
+      GetComponentExtents(extents, comp, instance);
+      StaticDescriptor<maxRank, true, 0> staticDescriptor;
+      Descriptor &compDesc{staticDescriptor.descriptor()};
+      const typeInfo::DerivedType &compType{*comp.derivedType()};
+      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
+        compDesc.Establish(compType,
+            instance.ElementComponent<char>(at, comp.offset()), comp.rank(),
+            extents);
+        stat = Initialize(compDesc, compType, terminator, hasStat, errMsg);
+        if (stat != StatOk) {
+          break;
+        }
       }
-    } else {
-      SkipToNextComponent();
     }
   }
-  return StatOk;
+  // Initialize procedure pointer components in each element
+  const Descriptor &procPtrDesc{derived.procPtr()};
+  std::size_t myProcPtrs{procPtrDesc.Elements()};
+  for (std::size_t k{0}; k < myProcPtrs; ++k) {
+    const auto &comp{
+        *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+    SubscriptValue at[maxRank];
+    instance.GetLowerBounds(at);
+    for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
+      auto &pptr{*instance.ElementComponent<typeInfo::ProcedurePointer>(
+          at, comp.offset)};
+      pptr = comp.procInitialization;
+    }
+  }
+  return stat;
 }
 
 RT_API_ATTRS int InitializeClone(const Descriptor &clone,
-    const Descriptor &original, const typeInfo::DerivedType &derived,
+    const Descriptor &orig, const typeInfo::DerivedType &derived,
     Terminator &terminator, bool hasStat, const Descriptor *errMsg) {
-  if (original.IsPointer() || !original.IsAllocated()) {
-    return StatOk; // nothing to do
-  } else {
-    WorkQueue workQueue{terminator};
-    int status{workQueue.BeginInitializeClone(
-        clone, original, derived, hasStat, errMsg)};
-    return status == StatContinue ? workQueue.Run() : status;
-  }
-}
+  const Descriptor &componentDesc{derived.component()};
+  std::size_t elements{orig.Elements()};
+  int stat{StatOk};
 
-RT_API_ATTRS int InitializeCloneTicket::Continue(WorkQueue &workQueue) {
-  while (!IsComplete()) {
-    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
-      Descriptor &origDesc{*instance_.ElementComponent<Descriptor>(
-          subscripts_, component_->offset())};
-      if (origDesc.IsAllocated()) {
-        Descriptor &cloneDesc{*clone_.ElementComponent<Descriptor>(
-            subscripts_, component_->offset())};
-        if (phase_ == 0) {
-          ++phase_;
+  // Skip pointers and unallocated variables.
+  if (orig.IsPointer() || !orig.IsAllocated()) {
+    return stat;
+  }
+  // Initialize each data component.
+  std::size_t components{componentDesc.Elements()};
+  for (std::size_t i{0}; i < components; ++i) {
+    const typeInfo::Component &comp{
+        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(i)};
+    SubscriptValue at[maxRank];
+    orig.GetLowerBounds(at);
+    // Allocate allocatable components that are also allocated in the original
+    // object.
+    if (comp.genre() == typeInfo::Component::Genre::Allocatable) {
+      // Initialize each element.
+      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
+        Descriptor &origDesc{
+            *orig.ElementComponent<Descriptor>(at, comp.offset())};
+        Descriptor &cloneDesc{
+            *clone.ElementComponent<Descriptor>(at, comp.offset())};
+        if (origDesc.IsAllocated()) {
           cloneDesc.ApplyMold(origDesc, origDesc.rank());
-          if (int stat{ReturnError(workQueue.terminator(),
-                  cloneDesc.Allocate(kNoAsyncObject), errMsg_, hasStat_)};
-              stat != StatOk) {
-            return stat;
-          }
-          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
-            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
-              if (!derived->noInitializationNeeded()) {
-                // Perform default initialization for the allocated element.
-                if (int status{workQueue.BeginInitialize(cloneDesc, *derived)};
-                    status != StatOk) {
-                  return status;
+          stat = ReturnError(
+              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
+          if (stat == StatOk) {
+            if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
+              if (const typeInfo::DerivedType *
+                  derived{addendum->derivedType()}) {
+                if (!derived->noInitializationNeeded()) {
+                  // Perform default initialization for the allocated element.
+                  stat = Initialize(
+                      cloneDesc, *derived, terminator, hasStat, errMsg);
+                }
+                // Initialize derived type's allocatables.
+                if (stat == StatOk) {
+                  stat = InitializeClone(cloneDesc, origDesc, *derived,
+                      terminator, hasStat, errMsg);
                 }
               }
             }
           }
         }
-        if (phase_ == 1) {
-          ++phase_;
-          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
-            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
-              // Initialize derived type's allocatables.
-              if (int status{workQueue.BeginInitializeClone(
-                      cloneDesc, origDesc, *derived, hasStat_, errMsg_)};
-                  status != StatOk) {
-                return status;
-              }
-            }
-          }
+        if (stat != StatOk) {
+          break;
         }
       }
-      Advance();
-    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
-      if (component_->derivedType()) {
-        // Handle nested derived types.
-        const typeInfo::DerivedType &compType{*component_->derivedType()};
-        SubscriptValue extents[maxRank];
-        GetComponentExtents(extents, *component_, instance_);
-        Descriptor &origDesc{componentDescriptor_.descriptor()};
-        Descriptor &cloneDesc{cloneComponentDescriptor_.descriptor()};
+    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
+        comp.derivedType()) {
+      // Handle nested derived types.
+      const typeInfo::DerivedType &compType{*comp.derivedType()};
+      SubscriptValue extents[maxRank];
+      GetComponentExtents(extents, comp, orig);
+      // Data components don't have descriptors, allocate them.
+      StaticDescriptor<maxRank, true, 0> origStaticDesc;
+      StaticDescriptor<maxRank, true, 0> cloneStaticDesc;
+      Descriptor &origDesc{origStaticDesc.descriptor()};
+      Descriptor &cloneDesc{cloneStaticDesc.descriptor()};
+      // Initialize each element.
+      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
         origDesc.Establish(compType,
-            instance_.ElementComponent<char>(subscripts_, component_->offset()),
-            component_->rank(), extents);
+            orig.ElementComponent<char>(at, comp.offset()), comp.rank(),
+            extents);
         cloneDesc.Establish(compType,
-            clone_.ElementComponent<char>(subscripts_, component_->offset()),
-            component_->rank(), extents);
-        Advance();
-        if (int status{workQueue.BeginInitializeClone(
-                cloneDesc, origDesc, compType, hasStat_, errMsg_)};
-            status != StatOk) {
-          return status;
+            clone.ElementComponent<char>(at, comp.offset()), comp.rank(),
+            extents);
+        stat = InitializeClone(
+            cloneDesc, origDesc, compType, terminator, hasStat, errMsg);
+        if (stat != StatOk) {
+          break;
         }
-      } else {
-        SkipToNextComponent();
       }
-    } else {
-      SkipToNextComponent();
-    }
-  }
-  return StatOk;
-}
-
-// Fortran 2018 subclause 7.5.6.2
-RT_API_ATTRS void Finalize(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) {
-    Terminator stubTerminator{"Finalize() in Fortran runtime", 0};
-    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
-    if (workQueue.BeginFinalize(descriptor, derived) == StatContinue) {
-      workQueue.Run();
     }
   }
+  return stat;
 }
 
 static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
@@ -235,7 +221,7 @@ static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
 }
 
 static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator &terminator) {
+    const typeInfo::DerivedType &derived, Terminator *terminator) {
   if (const auto *special{FindFinal(derived, descriptor.rank())}) {
     if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) {
       std::size_t elements{descriptor.Elements()};
@@ -272,7 +258,9 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
         copy = descriptor;
         copy.set_base_addr(nullptr);
         copy.raw().attribute = CFI_attribute_allocatable;
-        RUNTIME_CHECK(terminator, copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
+        Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
+        RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
+            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
         ShallowCopyDiscontiguousToContiguous(copy, descriptor);
         argDescriptor = &copy;
       }
@@ -296,94 +284,87 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
   }
 }
 
-RT_API_ATTRS int FinalizeTicket::Begin(WorkQueue &workQueue) {
-  CallFinalSubroutine(instance_, derived_, workQueue.terminator());
+// Fortran 2018 subclause 7.5.6.2
+RT_API_ATTRS void Finalize(const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived, Terminator *terminator) {
+  if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) {
+    return;
+  }
+  CallFinalSubroutine(descriptor, derived, terminator);
+  const auto *parentType{derived.GetParentType()};
+  bool recurse{parentType && !parentType->noFinalizationNeeded()};
   // If there's a finalizable parent component, handle it last, as required
   // by the Fortran standard (7.5.6.2), and do so recursively with the same
   // descriptor so that the rank is preserved.
-  finalizableParentType_ = derived_.GetParentType();
-  if (finalizableParentType_) {
-    if (finalizableParentType_->noFinalizationNeeded()) {
-      finalizableParentType_ = nullptr;
-    } else {
-      SkipToNextComponent();
-    }
-  }
-  return StatContinue;
-}
-
-RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) {
-  while (!IsComplete()) {
-    if (component_->genre() == typeInfo::Component::Genre::Allocatable &&
-        component_->category() == TypeCategory::Derived) {
+  const Descriptor &componentDesc{derived.component()};
+  std::size_t myComponents{componentDesc.Elements()};
+  std::size_t elements{descriptor.Elements()};
+  for (auto k{recurse ? std::size_t{1}
+                      /* skip first component, it's the parent */
+                      : 0};
+       k < myComponents; ++k) {
+    const auto &comp{
+        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
+    SubscriptValue at[maxRank];
+    descriptor.GetLowerBounds(at);
+    if (comp.genre() == typeInfo::Component::Genre::Allocatable &&
+        comp.category() == TypeCategory::Derived) {
       // Component may be polymorphic or unlimited polymorphic. Need to use the
       // dynamic type to check whether finalization is needed.
-      const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
-          subscripts_, component_->offset())};
-      Advance();
-      if (compDesc.IsAllocated()) {
-        if (const DescriptorAddendum *addendum{compDesc.Addendum()}) {
-          if (const typeInfo::DerivedType *compDynamicType{
-                  addendum->derivedType()}) {
-            if (!compDynamicType->noFinalizationNeeded()) {
-              if (int status{
-                      workQueue.BeginFinalize(compDesc, *compDynamicType)};
-                  status != StatOk) {
-                return status;
+      for (std::size_t j{0}; j++ < elements;
+           descriptor.IncrementSubscripts(at)) {
+        const Descriptor &compDesc{
+            *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
+        if (compDesc.IsAllocated()) {
+          if (const DescriptorAddendum * addendum{compDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *
+                compDynamicType{addendum->derivedType()}) {
+              if (!compDynamicType->noFinalizationNeeded()) {
+                Finalize(compDesc, *compDynamicType, terminator);
               }
             }
           }
         }
       }
-    } else if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
-        component_->genre() == typeInfo::Component::Genre::Automatic) {
-      if (const typeInfo::DerivedType *compType{component_->derivedType()};
-          compType && !compType->noFinalizationNeeded()) {
-        const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
-            subscripts_, component_->offset())};
-        Advance();
-        if (compDesc.IsAllocated()) {
-          if (int status{workQueue.BeginFinalize(compDesc, *compType)};
-              status != StatOk) {
-            return status;
+    } else if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
+        comp.genre() == typeInfo::Component::Genre::Automatic) {
+      if (const typeInfo::DerivedType * compType{comp.derivedType()}) {
+        if (!compType->noFinalizationNeeded()) {
+          for (std::size_t j{0}; j++ < elements;
+               descriptor.IncrementSubscripts(at)) {
+            const Descriptor &compDesc{
+                *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
+            if (compDesc.IsAllocated()) {
+              Finalize(compDesc, *compType, terminator);
+            }
           }
         }
-      } else {
-        SkipToNextComponent();
       }
-    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
-        component_->derivedType() &&
-        !component_->derivedType()->noFinalizationNeeded()) {
+    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
+        comp.derivedType() && !comp.derivedType()->noFinalizationNeeded()) {
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, *component_, instance_);
-      Descriptor &compDesc{componentDescriptor_.descriptor()};
-      const typeInfo::DerivedType &compType{*component_->derivedType()};
-      compDesc.Establish(compType,
-          instance_.ElementComponent<char>(subscripts_, component_->offset()),
-          component_->rank(), extents);
-      Advance();
-      if (int status{workQueue.BeginFinalize(compDesc, compType)};
-          status != StatOk) {
-        return status;
+      GetComponentExtents(extents, comp, descriptor);
+      StaticDescriptor<maxRank, true, 0> staticDescriptor;
+      Descriptor &compDesc{staticDescriptor.descriptor()};
+      const typeInfo::DerivedType &compType{*comp.derivedType()};
+      for (std::size_t j{0}; j++ < elements;
+           descriptor.IncrementSubscripts(at)) {
+        compDesc.Establish(compType,
+            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
+            extents);
+        Finalize(compDesc, compType, terminator);
       }
-    } else {
-      SkipToNextComponent();
     }
   }
-  // Last, do the parent component, if any and finalizable.
-  if (finalizableParentType_) {
-    Descriptor &tmpDesc{componentDescriptor_.descriptor()};
-    tmpDesc = instance_;
+  if (recurse) {
+    StaticDescriptor<maxRank, true, 8 /*?*/> statDesc;
+    Descriptor &tmpDesc{statDesc.descriptor()};
+    tmpDesc = descriptor;
     tmpDesc.raw().attribute = CFI_attribute_pointer;
-    tmpDesc.Addendum()->set_derivedType(finalizableParentType_);
-    tmpDesc.raw().elem_len = finalizableParentType_->sizeInBytes();
-    const auto &parentType{*finalizableParentType_};
-    finalizableParentType_ = nullptr;
-    // Don't return StatOk here if the nested FInalize is still running;
-    // it needs this->componentDescriptor_.
-    return workQueue.BeginFinalize(tmpDesc, parentType);
+    tmpDesc.Addendum()->set_derivedType(parentType);
+    tmpDesc.raw().elem_len = parentType->sizeInBytes();
+    Finalize(tmpDesc, *parentType, terminator);
   }
-  return StatOk;
 }
 
 // The order of finalization follows Fortran 2018 7.5.6.2, with
@@ -392,71 +373,51 @@ RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) {
 // preceding any deallocation.
 RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize,
     const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) {
-    Terminator stubTerminator{"Destroy() in Fortran runtime", 0};
-    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
-    if (workQueue.BeginDestroy(descriptor, derived, finalize) == StatContinue) {
-      workQueue.Run();
-    }
+  if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) {
+    return;
   }
-}
-
-RT_API_ATTRS int DestroyTicket::Begin(WorkQueue &workQueue) {
-  if (finalize_ && !derived_.noFinalizationNeeded()) {
-    if (int status{workQueue.BeginFinalize(instance_, derived_)};
-        status != StatOk && status != StatContinue) {
-      return status;
-    }
+  if (finalize && !derived.noFinalizationNeeded()) {
+    Finalize(descriptor, derived, terminator);
   }
-  return StatContinue;
-}
-
-RT_API_ATTRS int DestroyTicket::Continue(WorkQueue &workQueue) {
   // Deallocate all direct and indirect allocatable and automatic components.
   // Contrary to finalization, the order of deallocation does not matter.
-  while (!IsComplete()) {
-    const auto *componentDerived{component_->derivedType()};
-    if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
-        component_->genre() == typeInfo::Component::Genre::Automatic) {
-      Descriptor *d{instance_.ElementComponent<Descriptor>(
-          subscripts_, component_->offset())};
-      if (d->IsAllocated()) {
-        if (phase_ == 0) {
-          ++phase_;
-          if (componentDerived && !componentDerived->noDestructionNeeded()) {
-            if (int status{workQueue.BeginDestroy(
-                    *d, *componentDerived, /*finalize=*/false)};
-                status != StatOk) {
-              return status;
-            }
-          }
+  const Descriptor &componentDesc{derived.component()};
+  std::size_t myComponents{componentDesc.Elements()};
+  std::size_t elements{descriptor.Elements()};
+  SubscriptValue at[maxRank];
+  descriptor.GetLowerBounds(at);
+  for (std::size_t k{0}; k < myComponents; ++k) {
+    const auto &comp{
+        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
+    const bool destroyComp{
+        comp.derivedType() && !comp.derivedType()->noDestructionNeeded()};
+    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
+        comp.genre() == typeInfo::Component::Genre::Automatic) {
+      for (std::size_t j{0}; j < elements; ++j) {
+        Descriptor *d{
+            descriptor.ElementComponent<Descriptor>(at, comp.offset())};
+        if (destroyComp) {
+          Destroy(*d, /*finalize=*/false, *comp.derivedType(), terminator);
         }
         d->Deallocate();
+        descriptor.IncrementSubscripts(at);
       }
-      Advance();
-    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
-      if (!componentDerived || componentDerived->noDestructionNeeded()) {
-        SkipToNextComponent();
-      } else {
-        SubscriptValue extents[maxRank];
-        GetComponentExtents(extents, *component_, instance_);
-        Descriptor &compDesc{componentDescriptor_.descriptor()};
-        const typeInfo::DerivedType &compType{*componentDerived};
+    } else if (destroyComp &&
+        comp.genre() == typeInfo::Component::Genre::Data) {
+      SubscriptValue extents[maxRank];
+      GetComponentExtents(extents, comp, descriptor);
+      StaticDescriptor<maxRank, true, 0> staticDescriptor;
+      Descriptor &compDesc{staticDescriptor.descriptor()};
+      const typeInfo::DerivedType &compType{*comp.derivedType()};
+      for (std::size_t j{0}; j++ < elements;
+           descriptor.IncrementSubscripts(at)) {
         compDesc.Establish(compType,
-            instance_.ElementComponent<char>(subscripts_, component_->offset()),
-            component_->rank(), extents);
-        Advance();
-        if (int status{workQueue.BeginDestroy(
-                compDesc, *componentDerived, /*finalize=*/false)};
-            status != StatOk) {
-          return status;
-        }
+            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
+            extents);
+        Destroy(compDesc, /*finalize=*/false, *comp.derivedType(), terminator);
       }
-    } else {
-      SkipToNextComponent();
     }
   }
-  return StatOk;
 }
 
 RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) {
diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp
index 364724b89ba0d..3db1455af52fe 100644
--- a/flang-rt/lib/runtime/descriptor-io.cpp
+++ b/flang-rt/lib/runtime/descriptor-io.cpp
@@ -7,44 +7,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "descriptor-io.h"
-#include "edit-input.h"
-#include "edit-output.h"
-#include "unit.h"
-#include "flang-rt/runtime/descriptor.h"
-#include "flang-rt/runtime/io-stmt.h"
-#include "flang-rt/runtime/namelist.h"
-#include "flang-rt/runtime/terminator.h"
-#include "flang-rt/runtime/type-info.h"
-#include "flang-rt/runtime/work-queue.h"
-#include "flang/Common/optional.h"
 #include "flang/Common/restorer.h"
-#include "flang/Common/uint128.h"
-#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/freestanding-tools.h"
 
-// Implementation of I/O data list item transfers based on descriptors.
-// (All I/O items come through here so that the code is exercised for test;
-// some scalar I/O data transfer APIs could be changed to bypass their use
-// of descriptors in the future for better efficiency.)
-
 namespace Fortran::runtime::io::descr {
 RT_OFFLOAD_API_GROUP_BEGIN
 
-template <typename A>
-inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
-    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
-  A *p{descriptor.Element<A>(subscripts)};
-  if (!p) {
-    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
-                                 "address or subscripts out of range");
-  }
-  return *p;
-}
-
 // Defined formatted I/O (maybe)
-static RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
-    IoStatementState &io, const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived,
+Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
+    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special,
     const SubscriptValue subscripts[]) {
   Fortran::common::optional<DataEdit> peek{
@@ -133,8 +104,8 @@ static RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
 }
 
 // Defined unformatted I/O
-static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special) {
   // Unformatted I/O must have an external unit (or child thereof).
   IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -181,619 +152,5 @@ static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io,
   return handler.GetIoStat() == IostatOk;
 }
 
-// Per-category descriptor-based I/O templates
-
-// TODO (perhaps as a nontrivial but small starter project): implement
-// automatic repetition counts, like "10*3.14159", for list-directed and
-// NAMELIST array output.
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
-    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditIntegerInput(
-                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedIntegerIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedRealIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedRealIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedComplexIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  bool isListOutput{
-      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
-    if (isListOutput) {
-      DataEdit rEdit, iEdit;
-      rEdit.descriptor = DataEdit::ListDirectedRealPart;
-      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
-      rEdit.modes = iEdit.modes = io.mutableModes();
-      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
-          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
-        return false;
-      }
-    } else {
-      for (int k{0}; k < 2; ++k, ++x) {
-        auto edit{io.GetNextDataEdit()};
-        if (!edit) {
-          return false;
-        } else if constexpr (DIR == Direction::Output) {
-          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
-            return false;
-          }
-        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
-          break;
-        } else if (EditRealInput<KIND>(
-                       io, *edit, reinterpret_cast<void *>(x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedComplexIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <typename A, Direction DIR>
-inline RT_API_ATTRS bool FormattedCharacterIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditCharacterOutput(io, *edit, x, length)) {
-          return false;
-        }
-      } else { // input
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          if (EditCharacterInput(io, *edit, x, length)) {
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedCharacterIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedLogicalIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditLogicalOutput(io, *edit, x != 0)) {
-          return false;
-        }
-      } else {
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          bool truth{};
-          if (EditLogicalInput(io, *edit, truth)) {
-            x = truth;
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedLogicalIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <Direction DIR>
-RT_API_ATTRS int DerivedIoTicket<DIR>::Continue(WorkQueue &workQueue) {
-  while (!IsComplete()) {
-    if (component_->genre() == typeInfo::Component::Genre::Data) {
-      // Create a descriptor for the component
-      Descriptor &compDesc{componentDescriptor_.descriptor()};
-      component_->CreatePointerDescriptor(
-          compDesc, instance_, io_.GetIoErrorHandler(), subscripts_);
-      Advance();
-      if (int status{workQueue.BeginDescriptorIo<DIR>(
-              io_, compDesc, table_, anyIoTookPlace_)};
-          status != StatOk) {
-        return status;
-      }
-    } else {
-      // Component is itself a descriptor
-      char *pointer{
-          instance_.Element<char>(subscripts_) + component_->offset()};
-      const Descriptor &compDesc{
-          *reinterpret_cast<const Descriptor *>(pointer)};
-      Advance();
-      if (compDesc.IsAllocated()) {
-        if (int status{workQueue.BeginDescriptorIo<DIR>(
-                io_, compDesc, table_, anyIoTookPlace_)};
-            status != StatOk) {
-          return status;
-        }
-      }
-    }
-  }
-  return StatOk;
-}
-
-template RT_API_ATTRS int DerivedIoTicket<Direction::Output>::Continue(
-    WorkQueue &);
-template RT_API_ATTRS int DerivedIoTicket<Direction::Input>::Continue(
-    WorkQueue &);
-
-template <Direction DIR>
-RT_API_ATTRS int DescriptorIoTicket<DIR>::Begin(WorkQueue &workQueue) {
-  IoErrorHandler &handler{io_.GetIoErrorHandler()};
-  if (handler.InError()) {
-    return handler.GetIoStat();
-  }
-  if (!io_.get_if<IoDirectionState<DIR>>()) {
-    handler.Crash("DescriptorIO() called for wrong I/O direction");
-    return handler.GetIoStat();
-  }
-  if constexpr (DIR == Direction::Input) {
-    if (!io_.BeginReadingRecord()) {
-      return StatOk;
-    }
-  }
-  if (!io_.get_if<FormattedIoStatementState<DIR>>()) {
-    // Unformatted I/O
-    IoErrorHandler &handler{io_.GetIoErrorHandler()};
-    const DescriptorAddendum *addendum{instance_.Addendum()};
-    if (const typeInfo::DerivedType *type{
-            addendum ? addendum->derivedType() : nullptr}) {
-      // derived type unformatted I/O
-      if (table_) {
-        if (const auto *definedIo{table_->Find(*type,
-                DIR == Direction::Input
-                    ? common::DefinedIo::ReadUnformatted
-                    : common::DefinedIo::WriteUnformatted)}) {
-          if (definedIo->subroutine) {
-            typeInfo::SpecialBinding special{DIR == Direction::Input
-                    ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                    : typeInfo::SpecialBinding::Which::WriteUnformatted,
-                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-                false};
-            if (DefinedUnformattedIo(io_, instance_, *type, special)) {
-              anyIoTookPlace_ = true;
-              return StatOk;
-            }
-          } else {
-            int status{workQueue.BeginDerivedIo<DIR>(
-                io_, instance_, *type, table_, anyIoTookPlace_)};
-            return status == StatContinue ? StatOk : status; // done here
-          }
-        }
-      }
-      if (const typeInfo::SpecialBinding *special{
-              type->FindSpecialBinding(DIR == Direction::Input
-                      ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                      : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
-        if (!table_ || !table_->ignoreNonTbpEntries || special->isTypeBound()) {
-          // defined derived type unformatted I/O
-          if (DefinedUnformattedIo(io_, instance_, *type, *special)) {
-            anyIoTookPlace_ = true;
-            return StatOk;
-          } else {
-            return IostatEnd;
-          }
-        }
-      }
-      // Default derived type unformatted I/O
-      // TODO: If no component at any level has defined READ or WRITE
-      // (as appropriate), the elements are contiguous, and no byte swapping
-      // is active, do a block transfer via the code below.
-      int status{workQueue.BeginDerivedIo<DIR>(
-          io_, instance_, *type, table_, anyIoTookPlace_)};
-      return status == StatContinue ? StatOk : status; // done here
-    } else {
-      // intrinsic type unformatted I/O
-      auto *externalUnf{io_.get_if<ExternalUnformattedIoStatementState<DIR>>()};
-      ChildUnformattedIoStatementState<DIR> *childUnf{nullptr};
-      InquireIOLengthState *inq{nullptr};
-      bool swapEndianness{false};
-      if (externalUnf) {
-        swapEndianness = externalUnf->unit().swapEndianness();
-      } else {
-        childUnf = io_.get_if<ChildUnformattedIoStatementState<DIR>>();
-        if (!childUnf) {
-          inq = DIR == Direction::Output ? io_.get_if<InquireIOLengthState>()
-                                         : nullptr;
-          RUNTIME_CHECK(handler, inq != nullptr);
-        }
-      }
-      std::size_t elementBytes{instance_.ElementBytes()};
-      std::size_t swappingBytes{elementBytes};
-      if (auto maybeCatAndKind{instance_.type().GetCategoryAndKind()}) {
-        // Byte swapping units can be smaller than elements, namely
-        // for COMPLEX and CHARACTER.
-        if (maybeCatAndKind->first == TypeCategory::Character) {
-          // swap each character position independently
-          swappingBytes = maybeCatAndKind->second; // kind
-        } else if (maybeCatAndKind->first == TypeCategory::Complex) {
-          // swap real and imaginary components independently
-          swappingBytes /= 2;
-        }
-      }
-      using CharType =
-          std::conditional_t<DIR == Direction::Output, const char, char>;
-      auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
-        if constexpr (DIR == Direction::Output) {
-          return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
-              : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
-                             : inq->Emit(&x, totalBytes, swappingBytes);
-        } else {
-          return externalUnf
-              ? externalUnf->Receive(&x, totalBytes, swappingBytes)
-              : childUnf->Receive(&x, totalBytes, swappingBytes);
-        }
-      }};
-      if (!swapEndianness &&
-          instance_.IsContiguous()) { // contiguous unformatted I/O
-        char &x{ExtractElement<char>(io_, instance_, subscripts_)};
-        if (Transfer(x, elements_ * elementBytes)) {
-          anyIoTookPlace_ = true;
-        } else {
-          return IostatEnd;
-        }
-      } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
-        for (; !IsComplete(); Advance()) {
-          char &x{ExtractElement<char>(io_, instance_, subscripts_)};
-          if (Transfer(x, elementBytes)) {
-            anyIoTookPlace_ = true;
-          } else {
-            return IostatEnd;
-          }
-        }
-      }
-    }
-    // Unformatted I/O never needs to call Continue().
-    return StatOk;
-  }
-  // Formatted I/O
-  if (auto catAndKind{instance_.type().GetCategoryAndKind()}) {
-    TypeCategory cat{catAndKind->first};
-    int kind{catAndKind->second};
-    bool any{false};
-    switch (cat) {
-    case TypeCategory::Integer:
-      switch (kind) {
-      case 1:
-        any = FormattedIntegerIO<1, DIR>(io_, instance_, true);
-        break;
-      case 2:
-        any = FormattedIntegerIO<2, DIR>(io_, instance_, true);
-        break;
-      case 4:
-        any = FormattedIntegerIO<4, DIR>(io_, instance_, true);
-        break;
-      case 8:
-        any = FormattedIntegerIO<8, DIR>(io_, instance_, true);
-        break;
-      case 16:
-        any = FormattedIntegerIO<16, DIR>(io_, instance_, true);
-        break;
-      default:
-        handler.Crash(
-            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
-        return IostatEnd;
-      }
-      break;
-    case TypeCategory::Unsigned:
-      switch (kind) {
-      case 1:
-        any = FormattedIntegerIO<1, DIR>(io_, instance_, false);
-        break;
-      case 2:
-        any = FormattedIntegerIO<2, DIR>(io_, instance_, false);
-        break;
-      case 4:
-        any = FormattedIntegerIO<4, DIR>(io_, instance_, false);
-        break;
-      case 8:
-        any = FormattedIntegerIO<8, DIR>(io_, instance_, false);
-        break;
-      case 16:
-        any = FormattedIntegerIO<16, DIR>(io_, instance_, false);
-        break;
-      default:
-        handler.Crash(
-            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
-        return IostatEnd;
-      }
-      break;
-    case TypeCategory::Real:
-      switch (kind) {
-      case 2:
-        any = FormattedRealIO<2, DIR>(io_, instance_);
-        break;
-      case 3:
-        any = FormattedRealIO<3, DIR>(io_, instance_);
-        break;
-      case 4:
-        any = FormattedRealIO<4, DIR>(io_, instance_);
-        break;
-      case 8:
-        any = FormattedRealIO<8, DIR>(io_, instance_);
-        break;
-      case 10:
-        any = FormattedRealIO<10, DIR>(io_, instance_);
-        break;
-      // TODO: case double/double
-      case 16:
-        any = FormattedRealIO<16, DIR>(io_, instance_);
-        break;
-      default:
-        handler.Crash(
-            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
-        return IostatEnd;
-      }
-      break;
-    case TypeCategory::Complex:
-      switch (kind) {
-      case 2:
-        any = FormattedComplexIO<2, DIR>(io_, instance_);
-        break;
-      case 3:
-        any = FormattedComplexIO<3, DIR>(io_, instance_);
-        break;
-      case 4:
-        any = FormattedComplexIO<4, DIR>(io_, instance_);
-        break;
-      case 8:
-        any = FormattedComplexIO<8, DIR>(io_, instance_);
-        break;
-      case 10:
-        any = FormattedComplexIO<10, DIR>(io_, instance_);
-        break;
-      // TODO: case double/double
-      case 16:
-        any = FormattedComplexIO<16, DIR>(io_, instance_);
-        break;
-      default:
-        handler.Crash(
-            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
-        return IostatEnd;
-      }
-      break;
-    case TypeCategory::Character:
-      switch (kind) {
-      case 1:
-        any = FormattedCharacterIO<char, DIR>(io_, instance_);
-        break;
-      case 2:
-        any = FormattedCharacterIO<char16_t, DIR>(io_, instance_);
-        break;
-      case 4:
-        any = FormattedCharacterIO<char32_t, DIR>(io_, instance_);
-        break;
-      default:
-        handler.Crash(
-            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
-        return IostatEnd;
-      }
-      break;
-    case TypeCategory::Logical:
-      switch (kind) {
-      case 1:
-        any = FormattedLogicalIO<1, DIR>(io_, instance_);
-        break;
-      case 2:
-        any = FormattedLogicalIO<2, DIR>(io_, instance_);
-        break;
-      case 4:
-        any = FormattedLogicalIO<4, DIR>(io_, instance_);
-        break;
-      case 8:
-        any = FormattedLogicalIO<8, DIR>(io_, instance_);
-        break;
-      default:
-        handler.Crash(
-            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
-        return IostatEnd;
-      }
-      break;
-    case TypeCategory::Derived: {
-      // Derived type information must be present for formatted I/O.
-      IoErrorHandler &handler{io_.GetIoErrorHandler()};
-      const DescriptorAddendum *addendum{instance_.Addendum()};
-      RUNTIME_CHECK(handler, addendum != nullptr);
-      derived_ = addendum->derivedType();
-      RUNTIME_CHECK(handler, derived_ != nullptr);
-      if (table_) {
-        if (const auto *definedIo{table_->Find(*derived_,
-                DIR == Direction::Input ? common::DefinedIo::ReadFormatted
-                                        : common::DefinedIo::WriteFormatted)}) {
-          if (definedIo->subroutine) {
-            nonTbpSpecial_.emplace(DIR == Direction::Input
-                    ? typeInfo::SpecialBinding::Which::ReadFormatted
-                    : typeInfo::SpecialBinding::Which::WriteFormatted,
-                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-                false);
-            special_ = &*nonTbpSpecial_;
-          }
-        }
-      }
-      if (!special_) {
-        if (const typeInfo::SpecialBinding *binding{
-                derived_->FindSpecialBinding(DIR == Direction::Input
-                        ? typeInfo::SpecialBinding::Which::ReadFormatted
-                        : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
-          if (!table_ || !table_->ignoreNonTbpEntries ||
-              binding->isTypeBound()) {
-            special_ = binding;
-          }
-        }
-      }
-      return StatContinue;
-    }
-    }
-    if (any) {
-      anyIoTookPlace_ = true;
-    } else {
-      return IostatEnd;
-    }
-  } else {
-    handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
-        static_cast<int>(instance_.type().raw()));
-    return handler.GetIoStat();
-  }
-  return StatOk;
-}
-
-template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Begin(
-    WorkQueue &);
-template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Begin(
-    WorkQueue &);
-
-template <Direction DIR>
-RT_API_ATTRS int DescriptorIoTicket<DIR>::Continue(WorkQueue &workQueue) {
-  // Only derived type formatted I/O gets here.
-  while (!IsComplete()) {
-    if (special_) {
-      if (auto defined{DefinedFormattedIo(
-              io_, instance_, *derived_, *special_, subscripts_)}) {
-        anyIoTookPlace_ |= *defined;
-        Advance();
-        continue;
-      }
-    }
-    Descriptor &elementDesc{elementDescriptor_.descriptor()};
-    elementDesc.Establish(
-        *derived_, nullptr, 0, nullptr, CFI_attribute_pointer);
-    elementDesc.set_base_addr(instance_.Element<char>(subscripts_));
-    Advance();
-    if (int status{workQueue.BeginDerivedIo<DIR>(
-            io_, elementDesc, *derived_, table_, anyIoTookPlace_)};
-        status != StatOk) {
-      return status;
-    }
-  }
-  return StatOk;
-}
-
-template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Continue(
-    WorkQueue &);
-template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Continue(
-    WorkQueue &);
-
-template <Direction DIR>
-RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  bool anyIoTookPlace{false};
-  WorkQueue workQueue{io.GetIoErrorHandler()};
-  if (workQueue.BeginDescriptorIo<DIR>(io, descriptor, table, anyIoTookPlace) ==
-      StatContinue) {
-    workQueue.Run();
-  }
-  return anyIoTookPlace;
-}
-
-template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
-    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
-template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
-    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
-
 RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io::descr
diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h
index 88ad59bd24b53..eb60f106c9203 100644
--- a/flang-rt/lib/runtime/descriptor-io.h
+++ b/flang-rt/lib/runtime/descriptor-io.h
@@ -9,27 +9,619 @@
 #ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 #define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 
-#include "flang-rt/runtime/connection.h"
+// Implementation of I/O data list item transfers based on descriptors.
+// (All I/O items come through here so that the code is exercised for test;
+// some scalar I/O data transfer APIs could be changed to bypass their use
+// of descriptors in the future for better efficiency.)
 
-namespace Fortran::runtime {
-class Descriptor;
-} // namespace Fortran::runtime
-
-namespace Fortran::runtime::io {
-class IoStatementState;
-struct NonTbpDefinedIoTable;
-} // namespace Fortran::runtime::io
+#include "edit-input.h"
+#include "edit-output.h"
+#include "unit.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/namelist.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/optional.h"
+#include "flang/Common/uint128.h"
+#include "flang/Runtime/cpp-type.h"
 
 namespace Fortran::runtime::io::descr {
+template <typename A>
+inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
+    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
+  A *p{descriptor.Element<A>(subscripts)};
+  if (!p) {
+    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
+                                 "address or subscripts out of range");
+  }
+  return *p;
+}
+
+// Per-category descriptor-based I/O templates
+
+// TODO (perhaps as a nontrivial but small starter project): implement
+// automatic repetition counts, like "10*3.14159", for list-directed and
+// NAMELIST array output.
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
+    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditIntegerInput(
+                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedIntegerIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedRealIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedRealIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedComplexIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  bool isListOutput{
+      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
+    if (isListOutput) {
+      DataEdit rEdit, iEdit;
+      rEdit.descriptor = DataEdit::ListDirectedRealPart;
+      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
+      rEdit.modes = iEdit.modes = io.mutableModes();
+      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
+          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
+        return false;
+      }
+    } else {
+      for (int k{0}; k < 2; ++k, ++x) {
+        auto edit{io.GetNextDataEdit()};
+        if (!edit) {
+          return false;
+        } else if constexpr (DIR == Direction::Output) {
+          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
+            return false;
+          }
+        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
+          break;
+        } else if (EditRealInput<KIND>(
+                       io, *edit, reinterpret_cast<void *>(x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedComplexIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <typename A, Direction DIR>
+inline RT_API_ATTRS bool FormattedCharacterIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditCharacterOutput(io, *edit, x, length)) {
+          return false;
+        }
+      } else { // input
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          if (EditCharacterInput(io, *edit, x, length)) {
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedCharacterIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedLogicalIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditLogicalOutput(io, *edit, x != 0)) {
+          return false;
+        }
+      } else {
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          bool truth{};
+          if (EditLogicalInput(io, *edit, truth)) {
+            x = truth;
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedLogicalIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
 
 template <Direction DIR>
-RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
+static RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
     const NonTbpDefinedIoTable * = nullptr);
 
-extern template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
-    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
-extern template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
-    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+// For intrinsic (not defined) derived type I/O, formatted & unformatted
+template <Direction DIR>
+static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io,
+    const typeInfo::Component &component, const Descriptor &origDescriptor,
+    const SubscriptValue origSubscripts[], Terminator &terminator,
+    const NonTbpDefinedIoTable *table) {
+#if !defined(RT_DEVICE_AVOID_RECURSION)
+  if (component.genre() == typeInfo::Component::Genre::Data) {
+    // Create a descriptor for the component
+    StaticDescriptor<maxRank, true, 16 /*?*/> statDesc;
+    Descriptor &desc{statDesc.descriptor()};
+    component.CreatePointerDescriptor(
+        desc, origDescriptor, terminator, origSubscripts);
+    return DescriptorIO<DIR>(io, desc, table);
+  } else {
+    // Component is itself a descriptor
+    char *pointer{
+        origDescriptor.Element<char>(origSubscripts) + component.offset()};
+    const Descriptor &compDesc{*reinterpret_cast<const Descriptor *>(pointer)};
+    return compDesc.IsAllocated() && DescriptorIO<DIR>(io, compDesc, table);
+  }
+#else
+  terminator.Crash("not yet implemented: component IO");
+#endif
+}
+
+template <Direction DIR>
+static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io,
+    const Descriptor &descriptor, const typeInfo::DerivedType &type,
+    const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) {
+  IoErrorHandler &handler{io.GetIoErrorHandler()};
+  const Descriptor &compArray{type.component()};
+  RUNTIME_CHECK(handler, compArray.rank() == 1);
+  std::size_t numComponents{compArray.Elements()};
+  SubscriptValue at[maxRank];
+  compArray.GetLowerBounds(at);
+  for (std::size_t k{0}; k < numComponents;
+       ++k, compArray.IncrementSubscripts(at)) {
+    const typeInfo::Component &component{
+        *compArray.Element<typeInfo::Component>(at)};
+    if (!DefaultComponentIO<DIR>(
+            io, component, descriptor, subscripts, handler, table)) {
+      // Return true for NAMELIST input if any component appeared.
+      auto *listInput{
+          io.get_if<ListDirectedStatementState<Direction::Input>>()};
+      return DIR == Direction::Input && k > 0 && listInput &&
+          listInput->inNamelistSequence();
+    }
+  }
+  return true;
+}
+
+template <Direction DIR>
+static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
+    const Descriptor &descriptor, const typeInfo::DerivedType &type,
+    const NonTbpDefinedIoTable *table) {
+  IoErrorHandler &handler{io.GetIoErrorHandler()};
+  const Descriptor &compArray{type.component()};
+  RUNTIME_CHECK(handler, compArray.rank() == 1);
+  std::size_t numComponents{compArray.Elements()};
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  for (std::size_t j{0}; j < numElements;
+       ++j, descriptor.IncrementSubscripts(subscripts)) {
+    SubscriptValue at[maxRank];
+    compArray.GetLowerBounds(at);
+    for (std::size_t k{0}; k < numComponents;
+         ++k, compArray.IncrementSubscripts(at)) {
+      const typeInfo::Component &component{
+          *compArray.Element<typeInfo::Component>(at)};
+      if (!DefaultComponentIO<DIR>(
+              io, component, descriptor, subscripts, handler, table)) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
+    IoStatementState &, const Descriptor &, const typeInfo::DerivedType &,
+    const typeInfo::SpecialBinding &, const SubscriptValue[]);
+
+template <Direction DIR>
+static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io,
+    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
+  IoErrorHandler &handler{io.GetIoErrorHandler()};
+  // Derived type information must be present for formatted I/O.
+  const DescriptorAddendum *addendum{descriptor.Addendum()};
+  RUNTIME_CHECK(handler, addendum != nullptr);
+  const typeInfo::DerivedType *type{addendum->derivedType()};
+  RUNTIME_CHECK(handler, type != nullptr);
+  Fortran::common::optional<typeInfo::SpecialBinding> nonTbpSpecial;
+  const typeInfo::SpecialBinding *special{nullptr};
+  if (table) {
+    if (const auto *definedIo{table->Find(*type,
+            DIR == Direction::Input ? common::DefinedIo::ReadFormatted
+                                    : common::DefinedIo::WriteFormatted)}) {
+      if (definedIo->subroutine) {
+        nonTbpSpecial.emplace(DIR == Direction::Input
+                ? typeInfo::SpecialBinding::Which::ReadFormatted
+                : typeInfo::SpecialBinding::Which::WriteFormatted,
+            definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+            false);
+        special = &*nonTbpSpecial;
+      }
+    }
+  }
+  if (!special) {
+    if (const typeInfo::SpecialBinding *
+        binding{type->FindSpecialBinding(DIR == Direction::Input
+                ? typeInfo::SpecialBinding::Which::ReadFormatted
+                : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
+      if (!table || !table->ignoreNonTbpEntries || binding->isTypeBound()) {
+        special = binding;
+      }
+    }
+  }
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  std::size_t numElements{descriptor.Elements()};
+  for (std::size_t j{0}; j < numElements;
+       ++j, descriptor.IncrementSubscripts(subscripts)) {
+    Fortran::common::optional<bool> result;
+    if (special) {
+      result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts);
+    }
+    if (!result) {
+      result = DefaultComponentwiseFormattedIO<DIR>(
+          io, descriptor, *type, table, subscripts);
+    }
+    if (!result.value()) {
+      // Return true for NAMELIST input if we got anything.
+      auto *listInput{
+          io.get_if<ListDirectedStatementState<Direction::Input>>()};
+      return DIR == Direction::Input && j > 0 && listInput &&
+          listInput->inNamelistSequence();
+    }
+  }
+  return true;
+}
+
+RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &,
+    const typeInfo::DerivedType &, const typeInfo::SpecialBinding &);
 
+// Unformatted I/O
+template <Direction DIR>
+static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io,
+    const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) {
+  IoErrorHandler &handler{io.GetIoErrorHandler()};
+  const DescriptorAddendum *addendum{descriptor.Addendum()};
+  if (const typeInfo::DerivedType *
+      type{addendum ? addendum->derivedType() : nullptr}) {
+    // derived type unformatted I/O
+    if (table) {
+      if (const auto *definedIo{table->Find(*type,
+              DIR == Direction::Input ? common::DefinedIo::ReadUnformatted
+                                      : common::DefinedIo::WriteUnformatted)}) {
+        if (definedIo->subroutine) {
+          typeInfo::SpecialBinding special{DIR == Direction::Input
+                  ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                  : typeInfo::SpecialBinding::Which::WriteUnformatted,
+              definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+              false};
+          if (Fortran::common::optional<bool> wasDefined{
+                  DefinedUnformattedIo(io, descriptor, *type, special)}) {
+            return *wasDefined;
+          }
+        } else {
+          return DefaultComponentwiseUnformattedIO<DIR>(
+              io, descriptor, *type, table);
+        }
+      }
+    }
+    if (const typeInfo::SpecialBinding *
+        special{type->FindSpecialBinding(DIR == Direction::Input
+                ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
+      if (!table || !table->ignoreNonTbpEntries || special->isTypeBound()) {
+        // defined derived type unformatted I/O
+        return DefinedUnformattedIo(io, descriptor, *type, *special);
+      }
+    }
+    // Default derived type unformatted I/O
+    // TODO: If no component at any level has defined READ or WRITE
+    // (as appropriate), the elements are contiguous, and no byte swapping
+    // is active, do a block transfer via the code below.
+    return DefaultComponentwiseUnformattedIO<DIR>(io, descriptor, *type, table);
+  } else {
+    // intrinsic type unformatted I/O
+    auto *externalUnf{io.get_if<ExternalUnformattedIoStatementState<DIR>>()};
+    auto *childUnf{io.get_if<ChildUnformattedIoStatementState<DIR>>()};
+    auto *inq{
+        DIR == Direction::Output ? io.get_if<InquireIOLengthState>() : nullptr};
+    RUNTIME_CHECK(handler, externalUnf || childUnf || inq);
+    std::size_t elementBytes{descriptor.ElementBytes()};
+    std::size_t numElements{descriptor.Elements()};
+    std::size_t swappingBytes{elementBytes};
+    if (auto maybeCatAndKind{descriptor.type().GetCategoryAndKind()}) {
+      // Byte swapping units can be smaller than elements, namely
+      // for COMPLEX and CHARACTER.
+      if (maybeCatAndKind->first == TypeCategory::Character) {
+        // swap each character position independently
+        swappingBytes = maybeCatAndKind->second; // kind
+      } else if (maybeCatAndKind->first == TypeCategory::Complex) {
+        // swap real and imaginary components independently
+        swappingBytes /= 2;
+      }
+    }
+    SubscriptValue subscripts[maxRank];
+    descriptor.GetLowerBounds(subscripts);
+    using CharType =
+        std::conditional_t<DIR == Direction::Output, const char, char>;
+    auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
+      if constexpr (DIR == Direction::Output) {
+        return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
+            : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
+                           : inq->Emit(&x, totalBytes, swappingBytes);
+      } else {
+        return externalUnf ? externalUnf->Receive(&x, totalBytes, swappingBytes)
+                           : childUnf->Receive(&x, totalBytes, swappingBytes);
+      }
+    }};
+    bool swapEndianness{externalUnf && externalUnf->unit().swapEndianness()};
+    if (!swapEndianness &&
+        descriptor.IsContiguous()) { // contiguous unformatted I/O
+      char &x{ExtractElement<char>(io, descriptor, subscripts)};
+      return Transfer(x, numElements * elementBytes);
+    } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
+      for (std::size_t j{0}; j < numElements; ++j) {
+        char &x{ExtractElement<char>(io, descriptor, subscripts)};
+        if (!Transfer(x, elementBytes)) {
+          return false;
+        }
+        if (!descriptor.IncrementSubscripts(subscripts) &&
+            j + 1 < numElements) {
+          handler.Crash("DescriptorIO: subscripts out of bounds");
+        }
+      }
+      return true;
+    }
+  }
+}
+
+template <Direction DIR>
+static RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
+    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
+  IoErrorHandler &handler{io.GetIoErrorHandler()};
+  if (handler.InError()) {
+    return false;
+  }
+  if (!io.get_if<IoDirectionState<DIR>>()) {
+    handler.Crash("DescriptorIO() called for wrong I/O direction");
+    return false;
+  }
+  if constexpr (DIR == Direction::Input) {
+    if (!io.BeginReadingRecord()) {
+      return false;
+    }
+  }
+  if (!io.get_if<FormattedIoStatementState<DIR>>()) {
+    return UnformattedDescriptorIO<DIR>(io, descriptor, table);
+  }
+  if (auto catAndKind{descriptor.type().GetCategoryAndKind()}) {
+    TypeCategory cat{catAndKind->first};
+    int kind{catAndKind->second};
+    switch (cat) {
+    case TypeCategory::Integer:
+      switch (kind) {
+      case 1:
+        return FormattedIntegerIO<1, DIR>(io, descriptor, true);
+      case 2:
+        return FormattedIntegerIO<2, DIR>(io, descriptor, true);
+      case 4:
+        return FormattedIntegerIO<4, DIR>(io, descriptor, true);
+      case 8:
+        return FormattedIntegerIO<8, DIR>(io, descriptor, true);
+      case 16:
+        return FormattedIntegerIO<16, DIR>(io, descriptor, true);
+      default:
+        handler.Crash(
+            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
+        return false;
+      }
+    case TypeCategory::Unsigned:
+      switch (kind) {
+      case 1:
+        return FormattedIntegerIO<1, DIR>(io, descriptor, false);
+      case 2:
+        return FormattedIntegerIO<2, DIR>(io, descriptor, false);
+      case 4:
+        return FormattedIntegerIO<4, DIR>(io, descriptor, false);
+      case 8:
+        return FormattedIntegerIO<8, DIR>(io, descriptor, false);
+      case 16:
+        return FormattedIntegerIO<16, DIR>(io, descriptor, false);
+      default:
+        handler.Crash(
+            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
+        return false;
+      }
+    case TypeCategory::Real:
+      switch (kind) {
+      case 2:
+        return FormattedRealIO<2, DIR>(io, descriptor);
+      case 3:
+        return FormattedRealIO<3, DIR>(io, descriptor);
+      case 4:
+        return FormattedRealIO<4, DIR>(io, descriptor);
+      case 8:
+        return FormattedRealIO<8, DIR>(io, descriptor);
+      case 10:
+        return FormattedRealIO<10, DIR>(io, descriptor);
+      // TODO: case double/double
+      case 16:
+        return FormattedRealIO<16, DIR>(io, descriptor);
+      default:
+        handler.Crash(
+            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
+        return false;
+      }
+    case TypeCategory::Complex:
+      switch (kind) {
+      case 2:
+        return FormattedComplexIO<2, DIR>(io, descriptor);
+      case 3:
+        return FormattedComplexIO<3, DIR>(io, descriptor);
+      case 4:
+        return FormattedComplexIO<4, DIR>(io, descriptor);
+      case 8:
+        return FormattedComplexIO<8, DIR>(io, descriptor);
+      case 10:
+        return FormattedComplexIO<10, DIR>(io, descriptor);
+      // TODO: case double/double
+      case 16:
+        return FormattedComplexIO<16, DIR>(io, descriptor);
+      default:
+        handler.Crash(
+            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
+        return false;
+      }
+    case TypeCategory::Character:
+      switch (kind) {
+      case 1:
+        return FormattedCharacterIO<char, DIR>(io, descriptor);
+      case 2:
+        return FormattedCharacterIO<char16_t, DIR>(io, descriptor);
+      case 4:
+        return FormattedCharacterIO<char32_t, DIR>(io, descriptor);
+      default:
+        handler.Crash(
+            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
+        return false;
+      }
+    case TypeCategory::Logical:
+      switch (kind) {
+      case 1:
+        return FormattedLogicalIO<1, DIR>(io, descriptor);
+      case 2:
+        return FormattedLogicalIO<2, DIR>(io, descriptor);
+      case 4:
+        return FormattedLogicalIO<4, DIR>(io, descriptor);
+      case 8:
+        return FormattedLogicalIO<8, DIR>(io, descriptor);
+      default:
+        handler.Crash(
+            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
+        return false;
+      }
+    case TypeCategory::Derived:
+      return FormattedDerivedTypeIO<DIR>(io, descriptor, table);
+    }
+  }
+  handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
+      static_cast<int>(descriptor.type().raw()));
+  return false;
+}
 } // namespace Fortran::runtime::io::descr
 #endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index 0f0564403c0e2..1d5304254ed0e 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -143,10 +143,6 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
-  if (auto *x{std::getenv("FLANG_RT_DEBUG")}) {
-    internalDebugging = std::strtol(x, nullptr, 10);
-  }
-
   if (auto *x{std::getenv("ACC_OFFLOAD_STACK_SIZE")}) {
     char *end;
     auto n{std::strtoul(x, &end, 10)};
diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp
index 1bef387a9771f..b0cf2180fc6d4 100644
--- a/flang-rt/lib/runtime/namelist.cpp
+++ b/flang-rt/lib/runtime/namelist.cpp
@@ -10,7 +10,6 @@
 #include "descriptor-io.h"
 #include "flang-rt/runtime/emit-encoded.h"
 #include "flang-rt/runtime/io-stmt.h"
-#include "flang-rt/runtime/type-info.h"
 #include "flang/Runtime/io-api.h"
 #include <algorithm>
 #include <cstring>
diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp
index 24d05f369fcbe..b08195cd31e05 100644
--- a/flang-rt/lib/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@@ -205,7 +205,7 @@ RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
 // Doing the recursion upwards instead of downwards puts the more common
 // cases earlier in the if-chain and has a tangible impact on performance.
 template <typename P, int RANK> struct ShallowCopyRankSpecialize {
-  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
+  static bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     if (to.rank() == RANK && from.rank() == RANK) {
       ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
@@ -217,7 +217,7 @@ template <typename P, int RANK> struct ShallowCopyRankSpecialize {
 };
 
 template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
-  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
+  static bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     return false;
   }
diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp
index 451213202acef..82182696d70c6 100644
--- a/flang-rt/lib/runtime/type-info.cpp
+++ b/flang-rt/lib/runtime/type-info.cpp
@@ -140,11 +140,11 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor,
     const SubscriptValue *subscripts) const {
   RUNTIME_CHECK(terminator, genre_ == Genre::Data);
   EstablishDescriptor(descriptor, container, terminator);
-  std::size_t offset{offset_};
   if (subscripts) {
-    offset += container.SubscriptsToByteOffset(subscripts);
+    descriptor.set_base_addr(container.Element<char>(subscripts) + offset_);
+  } else {
+    descriptor.set_base_addr(container.OffsetElement<char>() + offset_);
   }
-  descriptor.set_base_addr(container.OffsetElement<char>() + offset);
   descriptor.raw().attribute = CFI_attribute_pointer;
 }
 
diff --git a/flang-rt/lib/runtime/work-queue.cpp b/flang-rt/lib/runtime/work-queue.cpp
deleted file mode 100644
index a508ecb637102..0000000000000
--- a/flang-rt/lib/runtime/work-queue.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//===-- lib/runtime/work-queue.cpp ------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "flang-rt/runtime/work-queue.h"
-#include "flang-rt/runtime/environment.h"
-#include "flang-rt/runtime/memory.h"
-#include "flang-rt/runtime/type-info.h"
-#include "flang/Common/visit.h"
-
-namespace Fortran::runtime {
-
-#if !defined(RT_DEVICE_COMPILATION)
-// FLANG_RT_DEBUG code is disabled when false.
-static constexpr bool enableDebugOutput{false};
-#endif
-
-RT_OFFLOAD_API_GROUP_BEGIN
-
-RT_API_ATTRS Componentwise::Componentwise(const typeInfo::DerivedType &derived)
-    : derived_{derived}, components_{derived_.component().Elements()} {
-  GetComponent();
-}
-
-RT_API_ATTRS void Componentwise::GetComponent() {
-  if (IsComplete()) {
-    component_ = nullptr;
-  } else {
-    const Descriptor &componentDesc{derived_.component()};
-    component_ = componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
-        componentAt_);
-  }
-}
-
-RT_API_ATTRS int Ticket::Continue(WorkQueue &workQueue) {
-  if (!begun) {
-    begun = true;
-    return common::visit(
-        [&workQueue](
-            auto &specificTicket) { return specificTicket.Begin(workQueue); },
-        u);
-  } else {
-    return common::visit(
-        [&workQueue](auto &specificTicket) {
-          return specificTicket.Continue(workQueue);
-        },
-        u);
-  }
-}
-
-RT_API_ATTRS WorkQueue::~WorkQueue() {
-  if (last_) {
-    if ((last_->next = firstFree_)) {
-      last_->next->previous = last_;
-    }
-    firstFree_ = first_;
-    first_ = last_ = nullptr;
-  }
-  while (firstFree_) {
-    TicketList *next{firstFree_->next};
-    if (!firstFree_->isStatic) {
-      FreeMemory(firstFree_);
-    }
-    firstFree_ = next;
-  }
-}
-
-RT_API_ATTRS Ticket &WorkQueue::StartTicket() {
-  if (!firstFree_) {
-    void *p{AllocateMemoryOrCrash(terminator_, sizeof(TicketList))};
-    firstFree_ = new (p) TicketList;
-    firstFree_->isStatic = false;
-  }
-  TicketList *newTicket{firstFree_};
-  if ((firstFree_ = newTicket->next)) {
-    firstFree_->previous = nullptr;
-  }
-  TicketList *after{insertAfter_ ? insertAfter_->next : nullptr};
-  if ((newTicket->previous = insertAfter_ ? insertAfter_ : last_)) {
-    newTicket->previous->next = newTicket;
-  } else {
-    first_ = newTicket;
-  }
-  if ((newTicket->next = after)) {
-    after->previous = newTicket;
-  } else {
-    last_ = newTicket;
-  }
-  newTicket->ticket.begun = false;
-#if !defined(RT_DEVICE_COMPILATION)
-  if (enableDebugOutput &&
-      (executionEnvironment.internalDebugging &
-          ExecutionEnvironment::WorkQueue)) {
-    std::fprintf(stderr, "WQ: new ticket\n");
-  }
-#endif
-  return newTicket->ticket;
-}
-
-RT_API_ATTRS int WorkQueue::Run() {
-  while (last_) {
-    TicketList *at{last_};
-    insertAfter_ = last_;
-#if !defined(RT_DEVICE_COMPILATION)
-    if (enableDebugOutput &&
-        (executionEnvironment.internalDebugging &
-            ExecutionEnvironment::WorkQueue)) {
-      std::fprintf(stderr, "WQ: %zd %s\n", at->ticket.u.index(),
-          at->ticket.begun ? "Continue" : "Begin");
-    }
-#endif
-    int stat{at->ticket.Continue(*this)};
-#if !defined(RT_DEVICE_COMPILATION)
-    if (enableDebugOutput &&
-        (executionEnvironment.internalDebugging &
-            ExecutionEnvironment::WorkQueue)) {
-      std::fprintf(stderr, "WQ: ... stat %d\n", stat);
-    }
-#endif
-    insertAfter_ = nullptr;
-    if (stat == StatOk) {
-      if (at->previous) {
-        at->previous->next = at->next;
-      } else {
-        first_ = at->next;
-      }
-      if (at->next) {
-        at->next->previous = at->previous;
-      } else {
-        last_ = at->previous;
-      }
-      if ((at->next = firstFree_)) {
-        at->next->previous = at;
-      }
-      at->previous = nullptr;
-      firstFree_ = at;
-    } else if (stat != StatContinue) {
-      Stop();
-      return stat;
-    }
-  }
-  return StatOk;
-}
-
-RT_API_ATTRS void WorkQueue::Stop() {
-  if (last_) {
-    if ((last_->next = firstFree_)) {
-      last_->next->previous = last_;
-    }
-    firstFree_ = first_;
-    first_ = last_ = nullptr;
-  }
-}
-
-RT_OFFLOAD_API_GROUP_END
-
-} // namespace Fortran::runtime
diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
index 6c148b1de6f82..3833e48be3dd6 100644
--- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp
+++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
@@ -184,7 +184,7 @@ TEST(ExternalIOTests, TestSequentialFixedUnformatted) {
   io = IONAME(BeginInquireIoLength)(__FILE__, __LINE__);
   for (int j{1}; j <= 3; ++j) {
     ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc))
-        << "OutputDescriptor() for InquireIoLength " << j;
+        << "OutputDescriptor() for InquireIoLength";
   }
   ASSERT_EQ(IONAME(GetIoLength)(io), 3 * recl) << "GetIoLength";
   ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 871749934810c..78d871c593e1d 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -858,16 +858,6 @@ print *, [(j,j=1,10)]
   warning since such values may have become defined by the time the nested
   expression's value is required.
 
-* Intrinsic assignment of arrays is defined elementally, and intrinsic
-  assignment of derived type components is defined componentwise.
-  However, when intrinsic assignment takes place for an array of derived
-  type, the order of the loop nesting is not defined.
-  Some compilers will loop over the elements, assigning all of the components
-  of each element before proceeding to the next element.
-  This compiler loops over all of the components, and assigns all of
-  the elements for each component before proceeding to the next component.
-  A program using defined assignment might be able to detect the difference.
-
 ## De Facto Standard Features
 
 * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the
diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h
index eb1f63184a177..bc80997a1bec2 100644
--- a/flang/include/flang/Runtime/assign.h
+++ b/flang/include/flang/Runtime/assign.h
@@ -38,7 +38,7 @@ enum AssignFlags {
   ComponentCanBeDefinedAssignment = 1 << 3,
   ExplicitLengthCharacterLHS = 1 << 4,
   PolymorphicLHS = 1 << 5,
-  DeallocateLHS = 1 << 6,
+  DeallocateLHS = 1 << 6
 };
 
 #ifdef RT_DEVICE_COMPILATION
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 51df7c40f5b8b..4b2bb4fa167f8 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -182,12 +182,9 @@ const Symbol *HasImpureFinal(
     const Symbol &, std::optional<int> rank = std::nullopt);
 // Is this type finalizable or does it contain any polymorphic allocatable
 // ultimate components?
-bool MayRequireFinalization(const DerivedTypeSpec &);
+bool MayRequireFinalization(const DerivedTypeSpec &derived);
 // Does this type have an allocatable direct component?
-bool HasAllocatableDirectComponent(const DerivedTypeSpec &);
-// Does this type have any defined assignment at any level (or any polymorphic
-// allocatable)?
-bool MayHaveDefinedAssignment(const DerivedTypeSpec &);
+bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived);
 
 bool IsInBlankCommon(const Symbol &);
 bool IsAssumedLengthCharacter(const Symbol &);
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 4c186f4874152..26ae81f97895a 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -661,10 +661,6 @@ const Symbol *RuntimeTableBuilder::DescribeType(
     AddValue(dtValues, derivedTypeSchema_, "nofinalizationneeded"s,
         IntExpr<1>(
             derivedTypeSpec && !MayRequireFinalization(*derivedTypeSpec)));
-    // Similarly, a flag to enable optimized runtime assignment.
-    AddValue(dtValues, derivedTypeSchema_, "nodefinedassignment"s,
-        IntExpr<1>(
-            derivedTypeSpec && !MayHaveDefinedAssignment(*derivedTypeSpec)));
   }
   dtObject.get<ObjectEntityDetails>().set_init(MaybeExpr{
       StructureExpr(Structure(derivedTypeSchema_, std::move(dtValues)))});
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index ea5ab2d455b54..ac69e6ff5cb79 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -813,38 +813,6 @@ bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) {
   return std::any_of(directs.begin(), directs.end(), IsAllocatable);
 }
 
-static bool MayHaveDefinedAssignment(
-    const DerivedTypeSpec &derived, std::set<const Scope *> &checked) {
-  if (const Scope *scope{derived.GetScope()};
-      scope && checked.find(scope) == checked.end()) {
-    checked.insert(scope);
-    for (const auto &[_, symbolRef] : *scope) {
-      if (const auto *generic{symbolRef->detailsIf<GenericDetails>()}) {
-        if (generic->kind().IsAssignment()) {
-          return true;
-        }
-      } else if (symbolRef->has<ObjectEntityDetails>() &&
-          !IsPointer(*symbolRef)) {
-        if (const DeclTypeSpec *type{symbolRef->GetType()}) {
-          if (type->IsPolymorphic()) {
-            return true;
-          } else if (const DerivedTypeSpec *derived{type->AsDerived()}) {
-            if (MayHaveDefinedAssignment(*derived, checked)) {
-              return true;
-            }
-          }
-        }
-      }
-    }
-  }
-  return false;
-}
-
-bool MayHaveDefinedAssignment(const DerivedTypeSpec &derived) {
-  std::set<const Scope *> checked;
-  return MayHaveDefinedAssignment(derived, checked);
-}
-
 bool IsAssumedLengthCharacter(const Symbol &symbol) {
   if (const DeclTypeSpec * type{symbol.GetType()}) {
     return type->category() == DeclTypeSpec::Character &&
diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
index 7226b06504d28..b30a6bf697563 100644
--- a/flang/module/__fortran_type_info.f90
+++ b/flang/module/__fortran_type_info.f90
@@ -52,8 +52,7 @@
     integer(1) :: noInitializationNeeded ! 1 if no component w/ init
     integer(1) :: noDestructionNeeded ! 1 if no component w/ dealloc/final
     integer(1) :: noFinalizationNeeded ! 1 if nothing finalizeable
-    integer(1) :: noDefinedAssignment ! 1 if no defined ASSIGNMENT(=)
-    integer(1) :: __padding0(3)
+    integer(1) :: __padding0(4)
   end type
 
   type :: Binding
diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90
index 2e05b652822b5..28f0bf78f33c9 100644
--- a/flang/test/Lower/volatile-openmp.f90
+++ b/flang/test/Lower/volatile-openmp.f90
@@ -23,11 +23,11 @@
 ! CHECK:           %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>
 ! CHECK:           %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>
 ! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFEcontainer"} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>, !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>)
-! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
+! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
 ! CHECK:           %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
-! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>
-! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>)
+! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
+! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>
+! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>)
 ! CHECK:           %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"}   {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90
index 7dc92504aeebf..d228cd2a84ca4 100644
--- a/flang/test/Semantics/typeinfo01.f90
+++ b/flang/test/Semantics/typeinfo01.f90
@@ -8,7 +8,7 @@ module m01
   end type
 !CHECK: Module scope: m01
 !CHECK: .c.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.n,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .n.n, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"n"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
 !CHECK: DerivedType scope: t1
@@ -23,8 +23,8 @@ module m02
   end type
 !CHECK: .c.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:1_8 init:[component::component(name=.n.parent,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.parent,lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.cn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=4_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .c.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.pn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
-!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 end module
 
 module m03
@@ -35,7 +35,7 @@ module m03
   type(kpdt(4)) :: x
 !CHECK: .c.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.a,genre=1_1,category=2_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .dt.kpdt, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.kpdt,uninstantiated=NULL(),kindparameter=.kp.kpdt,lenparameterkind=NULL())
-!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .kp.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(8) shape: 0_8:0_8 init:[INTEGER(8)::4_8]
 end module
 
@@ -49,7 +49,7 @@ module m04
   subroutine s1(x)
     class(tbps), intent(in) :: x
   end subroutine
-!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .v.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=s1,name=.n.b1),binding(proc=s1,name=.n.b2)]
 end module
 
@@ -61,7 +61,7 @@ module m05
   subroutine s1(x)
     class(t), intent(in) :: x
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .p.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(procptrcomponent) shape: 0_8:0_8 init:[procptrcomponent::procptrcomponent(name=.n.p1,offset=0_8,initialization=s1)]
 end module
 
@@ -85,8 +85,8 @@ subroutine s2(x, y)
     class(t), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -113,8 +113,8 @@ subroutine s2(x, y)
     class(t2), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -132,7 +132,7 @@ impure elemental subroutine s1(x, y)
     class(t), intent(out) :: x
     class(t), intent(in) :: y
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
 end module
@@ -155,7 +155,7 @@ impure elemental subroutine s3(x)
   subroutine s4(x)
     type(t), contiguous :: x(:,:,:)
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)]
 end module
 
@@ -197,7 +197,7 @@ subroutine wu(x,u,iostat,iomsg)
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)]
 end module
@@ -246,7 +246,7 @@ subroutine wu(x,u,iostat,iomsg)
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)]
 end module
 
@@ -263,7 +263,7 @@ module m11
 !CHECK: .c.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:3_8 init:[component::component(name=.n.allocatable,genre=3_1,category=2_1,kind=4_1,rank=1_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.pointer,genre=2_1,category=2_1,kind=4_1,rank=0_1,offset=48_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=.di.t.pointer),component(name=.n.chauto,genre=4_1,category=4_1,kind=1_1,rank=0_1,offset=72_8,characterlen=value(genre=3_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.automatic,genre=4_1,category=2_1,kind=4_1,rank=1_1,offset=96_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=.b.t.automatic,initialization=NULL())]
 !CHECK: .di.t.pointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(.dp.t.pointer) init:.dp.t.pointer(pointer=target)
 !CHECK: .dp.t.pointer (CompilerCreated): DerivedType components: pointer
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
 !CHECK: .lpk.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::8_1]
 !CHECK: DerivedType scope: .dp.t.pointer size=24 alignment=8 instantiation of .dp.t.pointer
 !CHECK: pointer, POINTER size=24 offset=0: ObjectEntity type: REAL(4)
diff --git a/flang/test/Semantics/typeinfo03.f90 b/flang/test/Semantics/typeinfo03.f90
index e2552d0a21d6f..f0c0a817da4a4 100644
--- a/flang/test/Semantics/typeinfo03.f90
+++ b/flang/test/Semantics/typeinfo03.f90
@@ -6,4 +6,4 @@ module m
     class(*), pointer :: sp, ap(:)
   end type
 end module
-!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90
index 94dd2199db35a..de8464321a409 100644
--- a/flang/test/Semantics/typeinfo04.f90
+++ b/flang/test/Semantics/typeinfo04.f90
@@ -7,18 +7,18 @@ module m
    contains
     final :: final
   end type
-!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
+!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
   type, abstract :: t1
   end type
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
   type, abstract :: t2
     real, allocatable :: a(:)
   end type
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
   type, abstract :: t3
     type(finalizable) :: x
   end type
-!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
+!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
  contains
   impure elemental subroutine final(x)
     type(finalizable), intent(in out) :: x
diff --git a/flang/test/Semantics/typeinfo05.f90 b/flang/test/Semantics/typeinfo05.f90
index df1aecf3821de..2a7f12a153eb8 100644
--- a/flang/test/Semantics/typeinfo05.f90
+++ b/flang/test/Semantics/typeinfo05.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), pointer :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo06.f90 b/flang/test/Semantics/typeinfo06.f90
index 22f37b1a4369d..2385709a8eb44 100644
--- a/flang/test/Semantics/typeinfo06.f90
+++ b/flang/test/Semantics/typeinfo06.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), allocatable :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo07.f90 b/flang/test/Semantics/typeinfo07.f90
index ab20d6f601106..e8766d9811db8 100644
--- a/flang/test/Semantics/typeinfo07.f90
+++ b/flang/test/Semantics/typeinfo07.f90
@@ -16,7 +16,7 @@
     type(t_container_extension) :: wrapper
   end type
 end
-! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
-! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
-! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
-! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
diff --git a/flang/test/Semantics/typeinfo08.f90 b/flang/test/Semantics/typeinfo08.f90
index 391a66f3d6664..689cf469dee3b 100644
--- a/flang/test/Semantics/typeinfo08.f90
+++ b/flang/test/Semantics/typeinfo08.f90
@@ -13,7 +13,7 @@ module m
 
 !CHECK: Module scope: m size=0 alignment=1 sourceRange=113 bytes
 !CHECK: .c.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t1,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
 !CHECK: .lpk.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::4_1]
 !CHECK: .n.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"s"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90
index 08e0b95abb763..92efc8f9ea54b 100644
--- a/flang/test/Semantics/typeinfo11.f90
+++ b/flang/test/Semantics/typeinfo11.f90
@@ -14,4 +14,4 @@
 type(t2) x
 end
 
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90
deleted file mode 100644
index 6b23b63d28b1d..0000000000000
--- a/flang/test/Semantics/typeinfo12.f90
+++ /dev/null
@@ -1,67 +0,0 @@
-!RUN: bbc --dump-symbols %s | FileCheck %s
-!Check "nodefinedassignment" settings.
-
-module m01
-
-  type hasAsst1
-   contains
-    procedure asst1
-    generic :: assignment(=) => asst1
-  end type
-!CHECK: .dt.hasasst1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.hasasst1,name=.n.hasasst1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.hasasst1,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
-
-  type hasAsst2 ! no defined assignment relevant to the runtime
-  end type
-  interface assignment(=)
-    procedure asst2
-  end interface
-!CHECK: .dt.hasasst2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.hasasst2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
-
-  type test1
-    type(hasAsst1) c
-  end type
-!CHECK: .dt.test1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
-
-  type test2
-    type(hasAsst2) c
-  end type
-!CHECK: .dt.test2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
-
-  type test3
-    type(hasAsst1), pointer :: p
-  end type
-!CHECK: .dt.test3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test3,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test3,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
-
-  type test4
-    type(hasAsst2), pointer :: p
-  end type
-!CHECK: .dt.test4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test4,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
-
-  type, extends(hasAsst1) :: test5
-  end type
-!CHECK: .dt.test5, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.test5,name=.n.test5,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test5,procptr=NULL(),special=.s.test5,specialbitset=4_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
-
-  type, extends(hasAsst2) :: test6
-  end type
-!CHECK: .dt.test6, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test6,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test6,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
-
-  type test7
-    type(test7), allocatable :: c
-  end type
-!CHECK: .dt.test7, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test7,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test7,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
-
-  type test8
-    class(test8), allocatable :: c
-  end type
-!CHECK: .dt.test8, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test8,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test8,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
-
- contains
-  impure elemental subroutine asst1(left, right)
-    class(hasAsst1), intent(out) :: left
-    class(hasAsst1), intent(in) :: right
-  end
-  impure elemental subroutine asst2(left, right)
-    class(hasAsst2), intent(out) :: left
-    class(hasAsst2), intent(in) :: right
-  end
-end

From 9150a8249f69930a9ed1e7e523555af9815876ec Mon Sep 17 00:00:00 2001
From: Igor Wodiany <igor.wodiany@imgtec.com>
Date: Wed, 11 Jun 2025 15:59:47 +0100
Subject: [PATCH 089/851] [mlir][spirv] Add definition for GL Exp2 (#143678)

---
 .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td       | 28 +++++++++++++++++++
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 26 +++++++++++++++++
 mlir/test/Target/SPIRV/gl-ops.mlir            |  2 ++
 3 files changed, 56 insertions(+)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
index 4c7186077fae0..f3f75240e5214 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
@@ -838,6 +838,34 @@ def SPIRV_GLAtanhOp : SPIRV_GLUnaryArithmeticOp<"Atanh", 24, SPIRV_Float16or32>
 
 // -----
 
+def SPIRV_GLExp2Op : SPIRV_GLUnaryArithmeticOp<"Exp2", 29, SPIRV_Float16or32> {
+  let summary = "Result is 2 raised to the x power";
+
+  let description = [{
+    Result is 2 raised to the x power; 2**x.
+
+    ```
+    exp2(Inf) = Inf.
+    exp2(-Inf) = +0.
+    ```
+
+    The operand x must be a scalar or vector whose component type is 16-bit or
+    32-bit floating-point.
+
+    Result Type and the type of x must be the same type. Results are computed
+    per component.
+
+    #### Example:
+
+    ```mlir
+    %2 = spirv.GL.Exp2 %0 : f32
+    %3 = spirv.GL.Exp2 %1 : vector<3xf16>
+    ```
+  }];
+}
+
+// -----
+
 def SPIRV_GLLog2Op : SPIRV_GLUnaryArithmeticOp<"Log2", 30, SPIRV_Float16or32> {
   let summary = "Result is the base-2 logarithm of x";
 
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index 50cf1b26d42ab..29beee5aea93c 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -789,3 +789,29 @@ func.func @tanh_invalid_type(%arg0 : i32) -> () {
   %0 = spirv.GL.Tanh %arg0 : i32
   return
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Exp2
+//===----------------------------------------------------------------------===//
+
+func.func @exp2(%arg0 : f32) -> () {
+  // CHECK: spirv.GL.Exp2 {{%.*}} : f32
+  %0 = spirv.GL.Exp2 %arg0 : f32
+  return
+}
+
+func.func @exp2vec(%arg0 : vector<3xf16>) -> () {
+  // CHECK: spirv.GL.Exp2 {{%.*}} : vector<3xf16>
+  %0 = spirv.GL.Exp2 %arg0 : vector<3xf16>
+  return
+}
+
+// -----
+
+func.func @exp2_invalid_type(%arg0 : i32) -> () {
+  // expected-error @+1 {{op operand #0 must be 16/32-bit float or vector of 16/32-bit float values}}
+  %0 = spirv.GL.Exp2 %arg0 : i32
+  return
+}
diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir
index 368f60e102dc1..3dee03345e9a1 100644
--- a/mlir/test/Target/SPIRV/gl-ops.mlir
+++ b/mlir/test/Target/SPIRV/gl-ops.mlir
@@ -44,6 +44,8 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %20 = spirv.GL.Log2 %arg0 : f32
     // CHECK: {{%.*}} = spirv.GL.Tanh {{%.*}} : f32
     %21 = spirv.GL.Tanh %arg0 : f32
+    // CHECK: {{%.*}} = spirv.GL.Exp2 {{%.*}} : f32
+    %22 = spirv.GL.Exp2 %arg0 : f32
     spirv.Return
   }
 

From 3ca6ea0f3aabcfba318ce9b14e4567f05de3b556 Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Wed, 11 Jun 2025 08:02:44 -0700
Subject: [PATCH 090/851] [Clang][ByteCode][NFC] Move APInt into pushInteger
 since it is being passed by value (#143578)

Static analysis flagged that we could move APInt instead of copy, indeed
it has a move constructor and so we should move into values for APInt.
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index b678f229d50bb..5fc5034569597 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -1321,7 +1321,7 @@ static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC,
     if (Mask[I])
       Result.setBitVal(I, Val[P++]);
   }
-  pushInteger(S, Result, Call->getType());
+  pushInteger(S, std::move(Result), Call->getType());
   return true;
 }
 
@@ -1344,7 +1344,7 @@ static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC,
     if (Mask[I])
       Result.setBitVal(P++, Val[I]);
   }
-  pushInteger(S, Result, Call->getType());
+  pushInteger(S, std::move(Result), Call->getType());
   return true;
 }
 

From 141d390dcb6cd174b07ca663e58f37ab24eee08a Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Wed, 11 Jun 2025 10:05:34 -0500
Subject: [PATCH 091/851] [flang][OpenMP] Overhaul implementation of ATOMIC
 construct (#137852)

The parser will accept a wide variety of illegal attempts at forming an
ATOMIC construct, leaving it to the semantic analysis to diagnose any
issues. This consolidates the analysis into one place and allows us to
produce more informative diagnostics.

The parser's outcome will be parser::OpenMPAtomicConstruct object
holding the directive, parser::Body, and an optional end-directive. The
prior variety of OmpAtomicXyz classes, as well as OmpAtomicClause have
been removed. READ, WRITE, etc. are now proper clauses.

The semantic analysis consistently operates on "evaluation"
representations, mainly evaluate::Expr (as SomeExpr) and
evaluate::Assignment. The results of the semantic analysis are stored in
a mutable member of the OpenMPAtomicConstruct node. This follows a
precedent of having `typedExpr` member in parser::Expr, for example.
This allows the lowering code to avoid duplicated handling of AST nodes.

Using a BLOCK construct containing multiple statements for an ATOMIC
construct that requires multiple statements is now allowed. In fact, any
nesting of such BLOCK constructs is allowed.

This implementation will parse, and perform semantic checks for both
conditional-update and conditional-update-capture, although no MLIR will
be generated for those. Instead, a TODO error will be issues prior to
lowering.

The allowed forms of the ATOMIC construct were based on the OpenMP 6.0
spec.
---
 flang/docs/OpenMPSupport.md                   |   13 +
 flang/examples/FeatureList/FeatureList.cpp    |   10 -
 .../FlangOmpReport/FlangOmpReportVisitor.cpp  |   27 +-
 flang/include/flang/Parser/dump-parse-tree.h  |   12 -
 flang/include/flang/Parser/parse-tree.h       |  111 +-
 flang/include/flang/Semantics/tools.h         |  147 ++
 .../lib/Lower/OpenMP/DataSharingProcessor.cpp |   40 +-
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 1011 ++++-----
 flang/lib/Parser/openmp-parsers.cpp           |  233 +-
 flang/lib/Parser/parse-tree.cpp               |   28 +
 flang/lib/Parser/unparse.cpp                  |  102 +-
 flang/lib/Semantics/check-omp-structure.cpp   | 1893 +++++++++++++----
 flang/lib/Semantics/check-omp-structure.h     |   59 +-
 flang/lib/Semantics/resolve-names.cpp         |    7 +-
 flang/lib/Semantics/rewrite-directives.cpp    |  126 +-
 flang/lib/Semantics/tools.cpp                 |  317 ++-
 flang/test/Examples/omp-atomic.f90            |   16 +-
 .../Lower/OpenMP/Todo/atomic-compare-fail.f90 |    2 +-
 .../test/Lower/OpenMP/Todo/atomic-compare.f90 |    2 +-
 flang/test/Lower/OpenMP/atomic-capture.f90    |    4 +-
 .../Lower/OpenMP/atomic-implicit-cast.f90     |   10 +-
 flang/test/Lower/OpenMP/atomic-privatize.f90  |    2 +-
 flang/test/Lower/OpenMP/atomic-write.f90      |    2 +-
 .../Lower/OpenMP/dump-atomic-analysis.f90     |   82 +
 flang/test/Parser/OpenMP/atomic-compare.f90   |  306 ++-
 flang/test/Parser/OpenMP/atomic-end.f90       |   63 +
 .../test/Semantics/OpenMP/atomic-compare.f90  |   29 +-
 .../Semantics/OpenMP/atomic-hint-clause.f90   |   23 +-
 flang/test/Semantics/OpenMP/atomic-read.f90   |  118 +
 .../OpenMP/atomic-update-capture.f90          |   77 +
 .../Semantics/OpenMP/atomic-update-only.f90   |   83 +
 .../OpenMP/atomic-update-overloaded-ops.f90   |    4 +-
 flang/test/Semantics/OpenMP/atomic-write.f90  |   81 +
 flang/test/Semantics/OpenMP/atomic.f90        |   31 +-
 flang/test/Semantics/OpenMP/atomic01.f90      |  221 +-
 flang/test/Semantics/OpenMP/atomic02.f90      |   47 +-
 flang/test/Semantics/OpenMP/atomic03.f90      |   51 +-
 flang/test/Semantics/OpenMP/atomic04.f90      |   99 +-
 flang/test/Semantics/OpenMP/atomic05.f90      |   12 +-
 .../Semantics/OpenMP/critical-hint-clause.f90 |   20 +-
 .../OpenMP/omp-atomic-assignment-stmt.f90     |   58 +-
 .../Semantics/OpenMP/requires-atomic01.f90    |   86 +-
 .../Semantics/OpenMP/requires-atomic02.f90    |   86 +-
 43 files changed, 3753 insertions(+), 1998 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/dump-atomic-analysis.f90
 create mode 100644 flang/test/Parser/OpenMP/atomic-end.f90
 create mode 100644 flang/test/Semantics/OpenMP/atomic-read.f90
 create mode 100644 flang/test/Semantics/OpenMP/atomic-update-capture.f90
 create mode 100644 flang/test/Semantics/OpenMP/atomic-update-only.f90
 create mode 100644 flang/test/Semantics/OpenMP/atomic-write.f90

diff --git a/flang/docs/OpenMPSupport.md b/flang/docs/OpenMPSupport.md
index 7a4f95693a89c..c9f19c37fd7fa 100644
--- a/flang/docs/OpenMPSupport.md
+++ b/flang/docs/OpenMPSupport.md
@@ -60,3 +60,16 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
 | target teams distribute parallel loop construct            | P      | device, reduction and dist_schedule clauses are not supported |
 | teams distribute parallel loop simd construct              | P      | reduction, dist_schedule, and linear clauses are not supported |
 | target teams distribute parallel loop simd construct       | P      | device, reduction, dist_schedule and linear clauses are not supported |
+
+## Extensions
+### ATOMIC construct
+The implementation of the ATOMIC construct follows OpenMP 6.0 with the following extensions:
+- `x = x` is an allowed form of ATOMIC UPDATE.
+This is motivated by the fact that the equivalent forms `x = x+0` or `x = x*1` are allowed.
+- Explicit type conversions are allowed in ATOMIC READ, WRITE or UPDATE constructs, and in the capture statement in ATOMIC UPDATE CAPTURE.
+The OpenMP spec requires intrinsic- or pointer-assignments, which include (as per the Fortran standard) implicit type conversions.  Since such conversions need to be handled, allowing explicit conversions comes at no extra cost.
+- A literal `.true.` or `.false.` is an allowed condition in ATOMIC UPDATE COMPARE. [1]
+- A logical variable is an allowed form of the condition even if its value is not computed within the ATOMIC UPDATE COMPARE construct [1].
+- `expr equalop x` is an allowed condition in ATOMIC UPDATE COMPARE. [1]
+
+[1] Code generation for ATOMIC UPDATE COMPARE is not implemented yet.
diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp
index d1407cf0ef239..a36b8719e365d 100644
--- a/flang/examples/FeatureList/FeatureList.cpp
+++ b/flang/examples/FeatureList/FeatureList.cpp
@@ -445,13 +445,6 @@ struct NodeVisitor {
   READ_FEATURE(ObjectDecl)
   READ_FEATURE(OldParameterStmt)
   READ_FEATURE(OmpAlignedClause)
-  READ_FEATURE(OmpAtomic)
-  READ_FEATURE(OmpAtomicCapture)
-  READ_FEATURE(OmpAtomicCapture::Stmt1)
-  READ_FEATURE(OmpAtomicCapture::Stmt2)
-  READ_FEATURE(OmpAtomicRead)
-  READ_FEATURE(OmpAtomicUpdate)
-  READ_FEATURE(OmpAtomicWrite)
   READ_FEATURE(OmpBeginBlockDirective)
   READ_FEATURE(OmpBeginLoopDirective)
   READ_FEATURE(OmpBeginSectionsDirective)
@@ -480,7 +473,6 @@ struct NodeVisitor {
   READ_FEATURE(OmpIterationOffset)
   READ_FEATURE(OmpIterationVector)
   READ_FEATURE(OmpEndAllocators)
-  READ_FEATURE(OmpEndAtomic)
   READ_FEATURE(OmpEndBlockDirective)
   READ_FEATURE(OmpEndCriticalDirective)
   READ_FEATURE(OmpEndLoopDirective)
@@ -566,8 +558,6 @@ struct NodeVisitor {
   READ_FEATURE(OpenMPDeclareTargetConstruct)
   READ_FEATURE(OmpMemoryOrderType)
   READ_FEATURE(OmpMemoryOrderClause)
-  READ_FEATURE(OmpAtomicClause)
-  READ_FEATURE(OmpAtomicClauseList)
   READ_FEATURE(OmpAtomicDefaultMemOrderClause)
   READ_FEATURE(OpenMPFlushConstruct)
   READ_FEATURE(OpenMPLoopConstruct)
diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
index bf66151d59950..feb7b4eced9e9 100644
--- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
+++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp
@@ -74,25 +74,19 @@ SourcePosition OpenMPCounterVisitor::getLocation(const OpenMPConstruct &c) {
           // the directive field.
           [&](const auto &c) -> SourcePosition {
             const CharBlock &source{std::get<0>(c.t).source};
-            return (parsing->allCooked().GetSourcePositionRange(source))->first;
+            return parsing->allCooked().GetSourcePositionRange(source)->first;
           },
           [&](const OpenMPAtomicConstruct &c) -> SourcePosition {
-            return std::visit(
-                [&](const auto &o) -> SourcePosition {
-                  const CharBlock &source{std::get<Verbatim>(o.t).source};
-                  return parsing->allCooked()
-                      .GetSourcePositionRange(source)
-                      ->first;
-                },
-                c.u);
+            const CharBlock &source{c.source};
+            return parsing->allCooked().GetSourcePositionRange(source)->first;
           },
           [&](const OpenMPSectionConstruct &c) -> SourcePosition {
             const CharBlock &source{c.source};
-            return (parsing->allCooked().GetSourcePositionRange(source))->first;
+            return parsing->allCooked().GetSourcePositionRange(source)->first;
           },
           [&](const OpenMPUtilityConstruct &c) -> SourcePosition {
             const CharBlock &source{c.source};
-            return (parsing->allCooked().GetSourcePositionRange(source))->first;
+            return parsing->allCooked().GetSourcePositionRange(source)->first;
           },
       },
       c.u);
@@ -157,14 +151,9 @@ std::string OpenMPCounterVisitor::getName(const OpenMPConstruct &c) {
             return normalize_construct_name(source.ToString());
           },
           [&](const OpenMPAtomicConstruct &c) -> std::string {
-            return std::visit(
-                [&](const auto &c) {
-                  // Get source from the verbatim fields
-                  const CharBlock &source{std::get<Verbatim>(c.t).source};
-                  return "atomic-" +
-                      normalize_construct_name(source.ToString());
-                },
-                c.u);
+            auto &dirSpec = std::get<OmpDirectiveSpecification>(c.t);
+            auto &dirName = std::get<OmpDirectiveName>(dirSpec.t);
+            return normalize_construct_name(dirName.source.ToString());
           },
           [&](const OpenMPUtilityConstruct &c) -> std::string {
             const CharBlock &source{c.source};
diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index df9278697346f..c6a5150a85a4c 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -532,15 +532,6 @@ class ParseTreeDumper {
   NODE(parser, OmpAtClause)
   NODE_ENUM(OmpAtClause, ActionTime)
   NODE_ENUM(OmpSeverityClause, Severity)
-  NODE(parser, OmpAtomic)
-  NODE(parser, OmpAtomicCapture)
-  NODE(OmpAtomicCapture, Stmt1)
-  NODE(OmpAtomicCapture, Stmt2)
-  NODE(parser, OmpAtomicCompare)
-  NODE(parser, OmpAtomicCompareIfStmt)
-  NODE(parser, OmpAtomicRead)
-  NODE(parser, OmpAtomicUpdate)
-  NODE(parser, OmpAtomicWrite)
   NODE(parser, OmpBeginBlockDirective)
   NODE(parser, OmpBeginLoopDirective)
   NODE(parser, OmpBeginSectionsDirective)
@@ -587,7 +578,6 @@ class ParseTreeDumper {
   NODE(parser, OmpDoacrossClause)
   NODE(parser, OmpDestroyClause)
   NODE(parser, OmpEndAllocators)
-  NODE(parser, OmpEndAtomic)
   NODE(parser, OmpEndBlockDirective)
   NODE(parser, OmpEndCriticalDirective)
   NODE(parser, OmpEndLoopDirective)
@@ -716,8 +706,6 @@ class ParseTreeDumper {
   NODE(parser, OpenMPDeclareMapperConstruct)
   NODE_ENUM(common, OmpMemoryOrderType)
   NODE(parser, OmpMemoryOrderClause)
-  NODE(parser, OmpAtomicClause)
-  NODE(parser, OmpAtomicClauseList)
   NODE(parser, OmpAtomicDefaultMemOrderClause)
   NODE(parser, OpenMPDepobjConstruct)
   NODE(parser, OpenMPUtilityConstruct)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index c99006f0c1c22..67405f88e09f2 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4857,94 +4857,37 @@ struct OmpMemoryOrderClause {
   CharBlock source;
 };
 
-// 2.17.7 Atomic construct
-//        atomic-clause -> memory-order-clause | HINT(hint-expression) |
-//        FAIL(memory-order)
-struct OmpAtomicClause {
-  UNION_CLASS_BOILERPLATE(OmpAtomicClause);
-  CharBlock source;
-  std::variant<OmpMemoryOrderClause, OmpFailClause, OmpHintClause> u;
-};
-
-// atomic-clause-list -> [atomic-clause, [atomic-clause], ...]
-struct OmpAtomicClauseList {
-  WRAPPER_CLASS_BOILERPLATE(OmpAtomicClauseList, std::list<OmpAtomicClause>);
-  CharBlock source;
-};
-
-// END ATOMIC
-EMPTY_CLASS(OmpEndAtomic);
-
-// ATOMIC READ
-struct OmpAtomicRead {
-  TUPLE_CLASS_BOILERPLATE(OmpAtomicRead);
-  CharBlock source;
-  std::tuple<OmpAtomicClauseList, Verbatim, OmpAtomicClauseList,
-      Statement<AssignmentStmt>, std::optional<OmpEndAtomic>>
-      t;
-};
-
-// ATOMIC WRITE
-struct OmpAtomicWrite {
-  TUPLE_CLASS_BOILERPLATE(OmpAtomicWrite);
-  CharBlock source;
-  std::tuple<OmpAtomicClauseList, Verbatim, OmpAtomicClauseList,
-      Statement<AssignmentStmt>, std::optional<OmpEndAtomic>>
-      t;
-};
-
-// ATOMIC UPDATE
-struct OmpAtomicUpdate {
-  TUPLE_CLASS_BOILERPLATE(OmpAtomicUpdate);
-  CharBlock source;
-  std::tuple<OmpAtomicClauseList, Verbatim, OmpAtomicClauseList,
-      Statement<AssignmentStmt>, std::optional<OmpEndAtomic>>
-      t;
-};
-
-// ATOMIC CAPTURE
-struct OmpAtomicCapture {
-  TUPLE_CLASS_BOILERPLATE(OmpAtomicCapture);
-  CharBlock source;
-  WRAPPER_CLASS(Stmt1, Statement<AssignmentStmt>);
-  WRAPPER_CLASS(Stmt2, Statement<AssignmentStmt>);
-  std::tuple<OmpAtomicClauseList, Verbatim, OmpAtomicClauseList, Stmt1, Stmt2,
-      OmpEndAtomic>
-      t;
-};
-
-struct OmpAtomicCompareIfStmt {
-  UNION_CLASS_BOILERPLATE(OmpAtomicCompareIfStmt);
-  std::variant<common::Indirection<IfStmt>, common::Indirection<IfConstruct>> u;
-};
-
-// ATOMIC COMPARE (OpenMP 5.1, OPenMP 5.2 spec: 15.8.4)
-struct OmpAtomicCompare {
-  TUPLE_CLASS_BOILERPLATE(OmpAtomicCompare);
+struct OpenMPAtomicConstruct {
+  llvm::omp::Clause GetKind() const;
+  bool IsCapture() const;
+  bool IsCompare() const;
+  TUPLE_CLASS_BOILERPLATE(OpenMPAtomicConstruct);
   CharBlock source;
-  std::tuple<OmpAtomicClauseList, Verbatim, OmpAtomicClauseList,
-      OmpAtomicCompareIfStmt, std::optional<OmpEndAtomic>>
+  std::tuple<OmpDirectiveSpecification, Block,
+      std::optional<OmpDirectiveSpecification>>
       t;
-};
 
-// ATOMIC
-struct OmpAtomic {
-  TUPLE_CLASS_BOILERPLATE(OmpAtomic);
-  CharBlock source;
-  std::tuple<Verbatim, OmpAtomicClauseList, Statement<AssignmentStmt>,
-      std::optional<OmpEndAtomic>>
-      t;
-};
+  // Information filled out during semantic checks to avoid duplication
+  // of analyses.
+  struct Analysis {
+    static constexpr int None = 0;
+    static constexpr int Read = 1;
+    static constexpr int Write = 2;
+    static constexpr int Update = Read | Write;
+    static constexpr int Action = 3; // Bitmask for None, Read, Write, Update
+    static constexpr int IfTrue = 4;
+    static constexpr int IfFalse = 8;
+    static constexpr int Condition = 12; // Bitmask for IfTrue, IfFalse
+
+    struct Op {
+      int what;
+      AssignmentStmt::TypedAssignment assign;
+    };
+    TypedExpr atom, cond;
+    Op op0, op1;
+  };
 
-// 2.17.7 atomic ->
-//        ATOMIC [atomic-clause-list] atomic-construct [atomic-clause-list] |
-//        ATOMIC [atomic-clause-list]
-//        atomic-construct -> READ | WRITE | UPDATE | CAPTURE | COMPARE
-struct OpenMPAtomicConstruct {
-  UNION_CLASS_BOILERPLATE(OpenMPAtomicConstruct);
-  std::variant<OmpAtomicRead, OmpAtomicWrite, OmpAtomicCapture, OmpAtomicUpdate,
-      OmpAtomicCompare, OmpAtomic>
-      u;
+  mutable Analysis analysis;
 };
 
 // OpenMP directives that associate with loop(s)
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 4b2bb4fa167f8..b13370512e5cc 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -755,5 +755,152 @@ bool HadUseError(SemanticsContext &, SourceName at, const Symbol *);
 
 // Checks whether the symbol on the LHS is present in the RHS expression.
 bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs);
+
+namespace operation {
+
+enum class Operator {
+  Unknown,
+  Add,
+  And,
+  Associated,
+  Call,
+  Constant,
+  Convert,
+  Div,
+  Eq,
+  Eqv,
+  False,
+  Ge,
+  Gt,
+  Identity,
+  Intrinsic,
+  Le,
+  Lt,
+  Max,
+  Min,
+  Mul,
+  Ne,
+  Neqv,
+  Not,
+  Or,
+  Pow,
+  Resize, // Convert within the same TypeCategory
+  Sub,
+  True,
+};
+
+std::string ToString(Operator op);
+
+template <typename... Ts, int Kind>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::LogicalOperation<Kind>, Ts...> &op) {
+  switch (op.derived().logicalOperator) {
+  case common::LogicalOperator::And:
+    return Operator::And;
+  case common::LogicalOperator::Or:
+    return Operator::Or;
+  case common::LogicalOperator::Eqv:
+    return Operator::Eqv;
+  case common::LogicalOperator::Neqv:
+    return Operator::Neqv;
+  case common::LogicalOperator::Not:
+    return Operator::Not;
+  }
+  return Operator::Unknown;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Relational<T>, Ts...> &op) {
+  switch (op.derived().opr) {
+  case common::RelationalOperator::LT:
+    return Operator::Lt;
+  case common::RelationalOperator::LE:
+    return Operator::Le;
+  case common::RelationalOperator::EQ:
+    return Operator::Eq;
+  case common::RelationalOperator::NE:
+    return Operator::Ne;
+  case common::RelationalOperator::GE:
+    return Operator::Ge;
+  case common::RelationalOperator::GT:
+    return Operator::Gt;
+  }
+  return Operator::Unknown;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(const evaluate::Operation<evaluate::Add<T>, Ts...> &op) {
+  return Operator::Add;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Subtract<T>, Ts...> &op) {
+  return Operator::Sub;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Multiply<T>, Ts...> &op) {
+  return Operator::Mul;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Divide<T>, Ts...> &op) {
+  return Operator::Div;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Power<T>, Ts...> &op) {
+  return Operator::Pow;
+}
+
+template <typename T, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::RealToIntPower<T>, Ts...> &op) {
+  return Operator::Pow;
+}
+
+template <typename T, common::TypeCategory C, typename... Ts>
+Operator OperationCode(
+    const evaluate::Operation<evaluate::Convert<T, C>, Ts...> &op) {
+  if constexpr (C == T::category) {
+    return Operator::Resize;
+  } else {
+    return Operator::Convert;
+  }
+}
+
+template <typename T> //
+Operator OperationCode(const evaluate::Constant<T> &x) {
+  return Operator::Constant;
+}
+
+template <typename T> //
+Operator OperationCode(const T &) {
+  return Operator::Unknown;
+}
+
+Operator OperationCode(const evaluate::ProcedureDesignator &proc);
+
+} // namespace operation
+
+/// Return information about the top-level operation (ignoring parentheses):
+/// the operation code and the list of arguments.
+std::pair<operation::Operator, std::vector<SomeExpr>> GetTopLevelOperation(
+    const SomeExpr &expr);
+
+/// Check if expr is same as x, or a sequence of Convert operations on x.
+bool IsSameOrConvertOf(const SomeExpr &expr, const SomeExpr &x);
+
+/// Strip away any top-level Convert operations (if any exist) and return
+/// the input value. A ComplexConstructor(x, 0) is also considered as a
+/// convert operation.
+/// If the input is not Operation, Designator, FunctionRef or Constant,
+/// it returns std::nullopt.
+MaybeExpr GetConvertInput(const SomeExpr &x);
 } // namespace Fortran::semantics
 #endif // FORTRAN_SEMANTICS_TOOLS_H_
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index f8c68bfc3056a..1b8670b379f82 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -356,26 +356,26 @@ getSource(const semantics::SemanticsContext &semaCtx,
   const parser::CharBlock *source = nullptr;
 
   auto ompConsVisit = [&](const parser::OpenMPConstruct &x) {
-    std::visit(common::visitors{
-                   [&](const parser::OpenMPSectionsConstruct &x) {
-                     source = &std::get<0>(x.t).source;
-                   },
-                   [&](const parser::OpenMPLoopConstruct &x) {
-                     source = &std::get<0>(x.t).source;
-                   },
-                   [&](const parser::OpenMPBlockConstruct &x) {
-                     source = &std::get<0>(x.t).source;
-                   },
-                   [&](const parser::OpenMPCriticalConstruct &x) {
-                     source = &std::get<0>(x.t).source;
-                   },
-                   [&](const parser::OpenMPAtomicConstruct &x) {
-                     std::visit([&](const auto &x) { source = &x.source; },
-                                x.u);
-                   },
-                   [&](const auto &x) { source = &x.source; },
-               },
-               x.u);
+    std::visit(
+        common::visitors{
+            [&](const parser::OpenMPSectionsConstruct &x) {
+              source = &std::get<0>(x.t).source;
+            },
+            [&](const parser::OpenMPLoopConstruct &x) {
+              source = &std::get<0>(x.t).source;
+            },
+            [&](const parser::OpenMPBlockConstruct &x) {
+              source = &std::get<0>(x.t).source;
+            },
+            [&](const parser::OpenMPCriticalConstruct &x) {
+              source = &std::get<0>(x.t).source;
+            },
+            [&](const parser::OpenMPAtomicConstruct &x) {
+              source = &std::get<parser::OmpDirectiveSpecification>(x.t).source;
+            },
+            [&](const auto &x) { source = &x.source; },
+        },
+        x.u);
   };
 
   eval.visit(common::visitors{
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 784749bba5a0c..3f3b85696db31 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -41,10 +41,13 @@
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Support/CommandLine.h"
 
 using namespace Fortran::lower::omp;
 using namespace Fortran::common::openmp;
 
+static llvm::cl::opt<bool> DumpAtomicAnalysis("fdebug-dump-atomic-analysis");
+
 //===----------------------------------------------------------------------===//
 // Code generation helper functions
 //===----------------------------------------------------------------------===//
@@ -1122,6 +1125,16 @@ markDeclareTarget(mlir::Operation *op, lower::AbstractConverter &converter,
   declareTargetOp.setDeclareTarget(deviceType, captureClause);
 }
 
+static bool isPointerAssignment(const evaluate::Assignment &assign) {
+  return common::visit(
+      common::visitors{
+          [](const evaluate::Assignment::BoundsSpec &) { return true; },
+          [](const evaluate::Assignment::BoundsRemapping &) { return true; },
+          [](const auto &) { return false; },
+      },
+      assign.u);
+}
+
 //===----------------------------------------------------------------------===//
 // Op body generation helper structures and functions
 //===----------------------------------------------------------------------===//
@@ -2676,645 +2689,215 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
 //===----------------------------------------------------------------------===//
 // Code generation for atomic operations
 //===----------------------------------------------------------------------===//
+static fir::FirOpBuilder::InsertPoint
+getInsertionPointBefore(mlir::Operation *op) {
+  return fir::FirOpBuilder::InsertPoint(op->getBlock(),
+                                        mlir::Block::iterator(op));
+}
 
-/// Populates \p hint and \p memoryOrder with appropriate clause information
-/// if present on atomic construct.
-static void genOmpAtomicHintAndMemoryOrderClauses(
-    lower::AbstractConverter &converter,
-    const parser::OmpAtomicClauseList &clauseList, mlir::IntegerAttr &hint,
-    mlir::omp::ClauseMemoryOrderKindAttr &memoryOrder) {
-  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  for (const parser::OmpAtomicClause &clause : clauseList.v) {
-    common::visit(
-        common::visitors{
-            [&](const parser::OmpMemoryOrderClause &s) {
-              auto kind = common::visit(
-                  common::visitors{
-                      [&](const parser::OmpClause::AcqRel &) {
-                        return mlir::omp::ClauseMemoryOrderKind::Acq_rel;
-                      },
-                      [&](const parser::OmpClause::Acquire &) {
-                        return mlir::omp::ClauseMemoryOrderKind::Acquire;
-                      },
-                      [&](const parser::OmpClause::Relaxed &) {
-                        return mlir::omp::ClauseMemoryOrderKind::Relaxed;
-                      },
-                      [&](const parser::OmpClause::Release &) {
-                        return mlir::omp::ClauseMemoryOrderKind::Release;
-                      },
-                      [&](const parser::OmpClause::SeqCst &) {
-                        return mlir::omp::ClauseMemoryOrderKind::Seq_cst;
-                      },
-                      [&](auto &&) -> mlir::omp::ClauseMemoryOrderKind {
-                        llvm_unreachable("Unexpected clause");
-                      },
-                  },
-                  s.v.u);
-              memoryOrder = mlir::omp::ClauseMemoryOrderKindAttr::get(
-                  firOpBuilder.getContext(), kind);
-            },
-            [&](const parser::OmpHintClause &s) {
-              const auto *expr = semantics::GetExpr(s.v);
-              uint64_t hintExprValue = *evaluate::ToInt64(*expr);
-              hint = firOpBuilder.getI64IntegerAttr(hintExprValue);
-            },
-            [&](const parser::OmpFailClause &) {},
-        },
-        clause.u);
+static fir::FirOpBuilder::InsertPoint
+getInsertionPointAfter(mlir::Operation *op) {
+  return fir::FirOpBuilder::InsertPoint(op->getBlock(),
+                                        ++mlir::Block::iterator(op));
+}
+
+static mlir::IntegerAttr getAtomicHint(lower::AbstractConverter &converter,
+                                       const List<Clause> &clauses) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  for (const Clause &clause : clauses) {
+    if (clause.id != llvm::omp::Clause::OMPC_hint)
+      continue;
+    auto &hint = std::get<clause::Hint>(clause.u);
+    auto maybeVal = evaluate::ToInt64(hint.v);
+    CHECK(maybeVal);
+    return builder.getI64IntegerAttr(*maybeVal);
   }
+  return nullptr;
 }
 
-static void processOmpAtomicTODO(mlir::Type elementType, mlir::Location loc) {
-  if (!elementType)
-    return;
-  assert(fir::isa_trivial(fir::unwrapRefType(elementType)) &&
-         "is supported type for omp atomic");
-}
-
-/// Used to generate atomic.read operation which is created in existing
-/// location set by builder.
-static void genAtomicCaptureStatement(
-    lower::AbstractConverter &converter, mlir::Value fromAddress,
-    mlir::Value toAddress,
-    const parser::OmpAtomicClauseList *leftHandClauseList,
-    const parser::OmpAtomicClauseList *rightHandClauseList,
-    mlir::Type elementType, mlir::Location loc) {
-  // Generate `atomic.read` operation for atomic assigment statements
-  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+static mlir::omp::ClauseMemoryOrderKindAttr
+getAtomicMemoryOrder(lower::AbstractConverter &converter,
+                     semantics::SemanticsContext &semaCtx,
+                     const List<Clause> &clauses) {
+  std::optional<mlir::omp::ClauseMemoryOrderKind> kind;
+  unsigned version = semaCtx.langOptions().OpenMPVersion;
 
-  processOmpAtomicTODO(elementType, loc);
-
-  // If no hint clause is specified, the effect is as if
-  // hint(omp_sync_hint_none) had been specified.
-  mlir::IntegerAttr hint = nullptr;
-
-  mlir::omp::ClauseMemoryOrderKindAttr memoryOrder = nullptr;
-  if (leftHandClauseList)
-    genOmpAtomicHintAndMemoryOrderClauses(converter, *leftHandClauseList, hint,
-                                          memoryOrder);
-  if (rightHandClauseList)
-    genOmpAtomicHintAndMemoryOrderClauses(converter, *rightHandClauseList, hint,
-                                          memoryOrder);
-  firOpBuilder.create<mlir::omp::AtomicReadOp>(loc, fromAddress, toAddress,
-                                               mlir::TypeAttr::get(elementType),
-                                               hint, memoryOrder);
-}
-
-/// Used to generate atomic.write operation which is created in existing
-/// location set by builder.
-static void genAtomicWriteStatement(
-    lower::AbstractConverter &converter, mlir::Value lhsAddr,
-    mlir::Value rhsExpr, const parser::OmpAtomicClauseList *leftHandClauseList,
-    const parser::OmpAtomicClauseList *rightHandClauseList, mlir::Location loc,
-    mlir::Value *evaluatedExprValue = nullptr) {
-  // Generate `atomic.write` operation for atomic assignment statements
-  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+  for (const Clause &clause : clauses) {
+    switch (clause.id) {
+    case llvm::omp::Clause::OMPC_acq_rel:
+      kind = mlir::omp::ClauseMemoryOrderKind::Acq_rel;
+      break;
+    case llvm::omp::Clause::OMPC_acquire:
+      kind = mlir::omp::ClauseMemoryOrderKind::Acquire;
+      break;
+    case llvm::omp::Clause::OMPC_relaxed:
+      kind = mlir::omp::ClauseMemoryOrderKind::Relaxed;
+      break;
+    case llvm::omp::Clause::OMPC_release:
+      kind = mlir::omp::ClauseMemoryOrderKind::Release;
+      break;
+    case llvm::omp::Clause::OMPC_seq_cst:
+      kind = mlir::omp::ClauseMemoryOrderKind::Seq_cst;
+      break;
+    default:
+      break;
+    }
+  }
 
-  mlir::Type varType = fir::unwrapRefType(lhsAddr.getType());
-  // Create a conversion outside the capture block.
-  auto insertionPoint = firOpBuilder.saveInsertionPoint();
-  firOpBuilder.setInsertionPointAfter(rhsExpr.getDefiningOp());
-  rhsExpr = firOpBuilder.createConvert(loc, varType, rhsExpr);
-  firOpBuilder.restoreInsertionPoint(insertionPoint);
-
-  processOmpAtomicTODO(varType, loc);
-
-  // If no hint clause is specified, the effect is as if
-  // hint(omp_sync_hint_none) had been specified.
-  mlir::IntegerAttr hint = nullptr;
-  mlir::omp::ClauseMemoryOrderKindAttr memoryOrder = nullptr;
-  if (leftHandClauseList)
-    genOmpAtomicHintAndMemoryOrderClauses(converter, *leftHandClauseList, hint,
-                                          memoryOrder);
-  if (rightHandClauseList)
-    genOmpAtomicHintAndMemoryOrderClauses(converter, *rightHandClauseList, hint,
-                                          memoryOrder);
-  firOpBuilder.create<mlir::omp::AtomicWriteOp>(loc, lhsAddr, rhsExpr, hint,
-                                                memoryOrder);
-}
-
-/// Used to generate atomic.update operation which is created in existing
-/// location set by builder.
-static void genAtomicUpdateStatement(
-    lower::AbstractConverter &converter, mlir::Value lhsAddr,
-    mlir::Type varType, const parser::Variable &assignmentStmtVariable,
-    const parser::Expr &assignmentStmtExpr,
-    const parser::OmpAtomicClauseList *leftHandClauseList,
-    const parser::OmpAtomicClauseList *rightHandClauseList, mlir::Location loc,
-    mlir::Operation *atomicCaptureOp = nullptr,
-    lower::StatementContext *atomicCaptureStmtCtx = nullptr) {
-  // Generate `atomic.update` operation for atomic assignment statements
-  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
-  mlir::Location currentLocation = converter.getCurrentLocation();
+  // Starting with 5.1, if no memory-order clause is present, the effect
+  // is as if "relaxed" was present.
+  if (!kind) {
+    if (version <= 50)
+      return nullptr;
+    kind = mlir::omp::ClauseMemoryOrderKind::Relaxed;
+  }
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  return mlir::omp::ClauseMemoryOrderKindAttr::get(builder.getContext(), *kind);
+}
+
+static mlir::Operation * //
+genAtomicRead(lower::AbstractConverter &converter, mlir::Location loc,
+              lower::StatementContext &stmtCtx, mlir::Value atomAddr,
+              const semantics::SomeExpr &atom,
+              const evaluate::Assignment &assign, mlir::IntegerAttr hint,
+              mlir::omp::ClauseMemoryOrderKindAttr memOrder,
+              fir::FirOpBuilder::InsertPoint preAt,
+              fir::FirOpBuilder::InsertPoint atomicAt,
+              fir::FirOpBuilder::InsertPoint postAt) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  builder.restoreInsertionPoint(preAt);
+
+  mlir::Value storeAddr =
+      fir::getBase(converter.genExprAddr(assign.lhs, stmtCtx, &loc));
+  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
+  mlir::Type storeType = fir::unwrapRefType(storeAddr.getType());
+
+  mlir::Value toAddr = [&]() {
+    if (atomType == storeType)
+      return storeAddr;
+    return builder.createTemporary(loc, atomType, ".tmp.atomval");
+  }();
 
-  //  Create the omp.atomic.update or acc.atomic.update operation
-  //
-  //  func.func @_QPsb() {
-  //    %0 = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFsbEa"}
-  //    %1 = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFsbEb"}
-  //    %2 = fir.load %1 : !fir.ref<i32>
-  //    omp.atomic.update   %0 : !fir.ref<i32> {
-  //    ^bb0(%arg0: i32):
-  //      %3 = arith.addi %arg0, %2 : i32
-  //      omp.yield(%3 : i32)
-  //    }
-  //    return
-  //  }
-
-  auto getArgExpression =
-      [](std::list<parser::ActualArgSpec>::const_iterator it) {
-        const auto &arg{std::get<parser::ActualArg>((*it).t)};
-        const auto *parserExpr{
-            std::get_if<common::Indirection<parser::Expr>>(&arg.u)};
-        return parserExpr;
-      };
+  builder.restoreInsertionPoint(atomicAt);
+  mlir::Operation *op = builder.create<mlir::omp::AtomicReadOp>(
+      loc, atomAddr, toAddr, mlir::TypeAttr::get(atomType), hint, memOrder);
+
+  if (atomType != storeType) {
+    lower::ExprToValueMap overrides;
+    // The READ operation could be a part of UPDATE CAPTURE, so make sure
+    // we don't emit extra code into the body of the atomic op.
+    builder.restoreInsertionPoint(postAt);
+    mlir::Value load = builder.create<fir::LoadOp>(loc, toAddr);
+    overrides.try_emplace(&atom, load);
+
+    converter.overrideExprValues(&overrides);
+    mlir::Value value =
+        fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
+    converter.resetExprOverrides();
 
-  // Lower any non atomic sub-expression before the atomic operation, and
-  // map its lowered value to the semantic representation.
-  lower::ExprToValueMap exprValueOverrides;
-  // Max and min intrinsics can have a list of Args. Hence we need a list
-  // of nonAtomicSubExprs to hoist. Currently, only the load is hoisted.
-  llvm::SmallVector<const lower::SomeExpr *> nonAtomicSubExprs;
-  common::visit(
-      common::visitors{
-          [&](const common::Indirection<parser::FunctionReference> &funcRef)
-              -> void {
-            const auto &args{std::get<std::list<parser::ActualArgSpec>>(
-                funcRef.value().v.t)};
-            std::list<parser::ActualArgSpec>::const_iterator beginIt =
-                args.begin();
-            std::list<parser::ActualArgSpec>::const_iterator endIt = args.end();
-            const auto *exprFirst{getArgExpression(beginIt)};
-            if (exprFirst && exprFirst->value().source ==
-                                 assignmentStmtVariable.GetSource()) {
-              // Add everything except the first
-              beginIt++;
-            } else {
-              // Add everything except the last
-              endIt--;
-            }
-            std::list<parser::ActualArgSpec>::const_iterator it;
-            for (it = beginIt; it != endIt; it++) {
-              const common::Indirection<parser::Expr> *expr =
-                  getArgExpression(it);
-              if (expr)
-                nonAtomicSubExprs.push_back(semantics::GetExpr(*expr));
-            }
-          },
-          [&](const auto &op) -> void {
-            using T = std::decay_t<decltype(op)>;
-            if constexpr (std::is_base_of<parser::Expr::IntrinsicBinary,
-                                          T>::value) {
-              const auto &exprLeft{std::get<0>(op.t)};
-              const auto &exprRight{std::get<1>(op.t)};
-              if (exprLeft.value().source == assignmentStmtVariable.GetSource())
-                nonAtomicSubExprs.push_back(semantics::GetExpr(exprRight));
-              else
-                nonAtomicSubExprs.push_back(semantics::GetExpr(exprLeft));
-            }
-          },
-      },
-      assignmentStmtExpr.u);
-  lower::StatementContext nonAtomicStmtCtx;
-  lower::StatementContext *stmtCtxPtr = &nonAtomicStmtCtx;
-  if (!nonAtomicSubExprs.empty()) {
-    // Generate non atomic part before all the atomic operations.
-    auto insertionPoint = firOpBuilder.saveInsertionPoint();
-    if (atomicCaptureOp) {
-      assert(atomicCaptureStmtCtx && "must specify statement context");
-      firOpBuilder.setInsertionPoint(atomicCaptureOp);
-      // Any clean-ups associated with the expression lowering
-      // must also be generated outside of the atomic update operation
-      // and after the atomic capture operation.
-      // The atomicCaptureStmtCtx will be finalized at the end
-      // of the atomic capture operation generation.
-      stmtCtxPtr = atomicCaptureStmtCtx;
-    }
-    mlir::Value nonAtomicVal;
-    for (auto *nonAtomicSubExpr : nonAtomicSubExprs) {
-      nonAtomicVal = fir::getBase(converter.genExprValue(
-          currentLocation, *nonAtomicSubExpr, *stmtCtxPtr));
-      exprValueOverrides.try_emplace(nonAtomicSubExpr, nonAtomicVal);
-    }
-    if (atomicCaptureOp)
-      firOpBuilder.restoreInsertionPoint(insertionPoint);
+    builder.create<fir::StoreOp>(loc, value, storeAddr);
   }
+  return op;
+}
 
-  mlir::Operation *atomicUpdateOp = nullptr;
-  // If no hint clause is specified, the effect is as if
-  // hint(omp_sync_hint_none) had been specified.
-  mlir::IntegerAttr hint = nullptr;
-  mlir::omp::ClauseMemoryOrderKindAttr memoryOrder = nullptr;
-  if (leftHandClauseList)
-    genOmpAtomicHintAndMemoryOrderClauses(converter, *leftHandClauseList, hint,
-                                          memoryOrder);
-  if (rightHandClauseList)
-    genOmpAtomicHintAndMemoryOrderClauses(converter, *rightHandClauseList, hint,
-                                          memoryOrder);
-  atomicUpdateOp = firOpBuilder.create<mlir::omp::AtomicUpdateOp>(
-      currentLocation, lhsAddr, hint, memoryOrder);
-
-  processOmpAtomicTODO(varType, loc);
-
-  llvm::SmallVector<mlir::Type> varTys = {varType};
-  llvm::SmallVector<mlir::Location> locs = {currentLocation};
-  firOpBuilder.createBlock(&atomicUpdateOp->getRegion(0), {}, varTys, locs);
-  mlir::Value val =
-      fir::getBase(atomicUpdateOp->getRegion(0).front().getArgument(0));
-
-  exprValueOverrides.try_emplace(semantics::GetExpr(assignmentStmtVariable),
-                                 val);
-  {
-    // statement context inside the atomic block.
-    converter.overrideExprValues(&exprValueOverrides);
-    lower::StatementContext atomicStmtCtx;
-    mlir::Value rhsExpr = fir::getBase(converter.genExprValue(
-        *semantics::GetExpr(assignmentStmtExpr), atomicStmtCtx));
-    mlir::Type exprType = fir::unwrapRefType(rhsExpr.getType());
-    if (fir::isa_complex(exprType) && !fir::isa_complex(varType)) {
-      // Emit an additional `ExtractValueOp` if the expression is of complex
-      // type
-      auto extract = firOpBuilder.create<fir::ExtractValueOp>(
-          currentLocation,
-          mlir::cast<mlir::ComplexType>(exprType).getElementType(), rhsExpr,
-          firOpBuilder.getArrayAttr(
-              firOpBuilder.getIntegerAttr(firOpBuilder.getIndexType(), 0)));
-      mlir::Value convertResult = firOpBuilder.create<fir::ConvertOp>(
-          currentLocation, varType, extract);
-      firOpBuilder.create<mlir::omp::YieldOp>(currentLocation, convertResult);
-    } else {
-      mlir::Value convertResult =
-          firOpBuilder.createConvert(currentLocation, varType, rhsExpr);
-      firOpBuilder.create<mlir::omp::YieldOp>(currentLocation, convertResult);
+static mlir::Operation * //
+genAtomicWrite(lower::AbstractConverter &converter, mlir::Location loc,
+               lower::StatementContext &stmtCtx, mlir::Value atomAddr,
+               const semantics::SomeExpr &atom,
+               const evaluate::Assignment &assign, mlir::IntegerAttr hint,
+               mlir::omp::ClauseMemoryOrderKindAttr memOrder,
+               fir::FirOpBuilder::InsertPoint preAt,
+               fir::FirOpBuilder::InsertPoint atomicAt,
+               fir::FirOpBuilder::InsertPoint postAt) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  builder.restoreInsertionPoint(preAt);
+
+  mlir::Value value =
+      fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
+  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
+  mlir::Value converted = builder.createConvert(loc, atomType, value);
+
+  builder.restoreInsertionPoint(atomicAt);
+  mlir::Operation *op = builder.create<mlir::omp::AtomicWriteOp>(
+      loc, atomAddr, converted, hint, memOrder);
+  return op;
+}
+
+static mlir::Operation *
+genAtomicUpdate(lower::AbstractConverter &converter, mlir::Location loc,
+                lower::StatementContext &stmtCtx, mlir::Value atomAddr,
+                const semantics::SomeExpr &atom,
+                const evaluate::Assignment &assign, mlir::IntegerAttr hint,
+                mlir::omp::ClauseMemoryOrderKindAttr memOrder,
+                fir::FirOpBuilder::InsertPoint preAt,
+                fir::FirOpBuilder::InsertPoint atomicAt,
+                fir::FirOpBuilder::InsertPoint postAt) {
+  lower::ExprToValueMap overrides;
+  lower::StatementContext naCtx;
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  builder.restoreInsertionPoint(preAt);
+
+  mlir::Type atomType = fir::unwrapRefType(atomAddr.getType());
+
+  // This must exist by now.
+  SomeExpr input = *semantics::GetConvertInput(assign.rhs);
+  std::vector<SomeExpr> args{semantics::GetTopLevelOperation(input).second};
+  assert(!args.empty() && "Update operation without arguments");
+  for (auto &arg : args) {
+    if (!semantics::IsSameOrConvertOf(arg, atom)) {
+      mlir::Value val = fir::getBase(converter.genExprValue(arg, naCtx, &loc));
+      overrides.try_emplace(&arg, val);
     }
-    converter.resetExprOverrides();
   }
-  firOpBuilder.setInsertionPointAfter(atomicUpdateOp);
-}
-
-/// Processes an atomic construct with write clause.
-static void genAtomicWrite(lower::AbstractConverter &converter,
-                           const parser::OmpAtomicWrite &atomicWrite,
-                           mlir::Location loc) {
-  const parser::OmpAtomicClauseList *rightHandClauseList = nullptr;
-  const parser::OmpAtomicClauseList *leftHandClauseList = nullptr;
-  // Get the address of atomic read operands.
-  rightHandClauseList = &std::get<2>(atomicWrite.t);
-  leftHandClauseList = &std::get<0>(atomicWrite.t);
-
-  const parser::AssignmentStmt &stmt =
-      std::get<parser::Statement<parser::AssignmentStmt>>(atomicWrite.t)
-          .statement;
-  const evaluate::Assignment &assign = *stmt.typedAssignment->v;
-  lower::StatementContext stmtCtx;
-  // Get the value and address of atomic write operands.
-  mlir::Value rhsExpr =
-      fir::getBase(converter.genExprValue(assign.rhs, stmtCtx));
-  mlir::Value lhsAddr =
-      fir::getBase(converter.genExprAddr(assign.lhs, stmtCtx));
-  genAtomicWriteStatement(converter, lhsAddr, rhsExpr, leftHandClauseList,
-                          rightHandClauseList, loc);
-}
-
-/*
-    Emit an implicit cast. Different yet compatible types on
-    omp.atomic.read constitute valid Fortran. The OMPIRBuilder will
-    emit atomic instructions (on primitive types) and `__atomic_load`
-    libcall (on complex type) without explicitly converting
-    between such compatible types. The OMPIRBuilder relies on the
-    frontend to resolve such inconsistencies between `omp.atomic.read `
-    operand types. Similar inconsistencies between operand types in
-    `omp.atomic.write` are resolved through implicit casting by use of typed
-    assignment (i.e. `evaluate::Assignment`). However, use of typed
-    assignment in `omp.atomic.read` (of form `v = x`) leads to an unsafe,
-    non-atomic load of `x` into a temporary `alloca`, followed by an atomic
-    read of form `v = alloca`. Hence, it is needed to perform a custom
-    implicit cast.
-
-    An atomic read of form `v = x` would (without implicit casting)
-    lower to `omp.atomic.read %v = %x : !fir.ref<type1>, !fir.ref<type2>,
-    type2`. This implicit casting will rather generate the following FIR:
-
-         %alloca = fir.alloca type2
-         omp.atomic.read %alloca = %x : !fir.ref<type2>, !fir.ref<type2>, type2
-         %load = fir.load %alloca : !fir.ref<type2>
-         %cvt = fir.convert %load : (type2) -> type1
-         fir.store %cvt to %v : !fir.ref<type1>
-
-    These sequence of operations is thread-safe since each thread allocates
-    the `alloca` in its stack, and performs `%alloca = %x` atomically. Once
-    safely read, each thread performs the implicit cast on the local
-    `alloca`, and writes the final result to `%v`.
-
-/// \param builder              : FirOpBuilder
-/// \param loc                  : Location for FIR generation
-/// \param toAddress            : Address of %v
-/// \param toType               : Type of %v
-/// \param fromType             : Type of %x
-/// \param alloca               : Thread scoped `alloca`
-//				  It is the responsibility of the callee
-//				  to position the `alloca` at `AllocaIP`
-//				  through `builder.getAllocaBlock()`
-*/
-
-static void emitAtomicReadImplicitCast(fir::FirOpBuilder &builder,
-                                       mlir::Location loc,
-                                       mlir::Value toAddress, mlir::Type toType,
-                                       mlir::Type fromType,
-                                       mlir::Value alloca) {
-  auto load = builder.create<fir::LoadOp>(loc, alloca);
-  if (fir::isa_complex(fromType) && !fir::isa_complex(toType)) {
-    // Emit an additional `ExtractValueOp` if `fromAddress` is of complex
-    // type, but `toAddress` is not.
-    auto extract = builder.create<fir::ExtractValueOp>(
-        loc, mlir::cast<mlir::ComplexType>(fromType).getElementType(), load,
-        builder.getArrayAttr(
-            builder.getIntegerAttr(builder.getIndexType(), 0)));
-    auto cvt = builder.create<fir::ConvertOp>(loc, toType, extract);
-    builder.create<fir::StoreOp>(loc, cvt, toAddress);
-  } else if (!fir::isa_complex(fromType) && fir::isa_complex(toType)) {
-    // Emit an additional `InsertValueOp` if `toAddress` is of complex
-    // type, but `fromAddress` is not.
-    mlir::Value undef = builder.create<fir::UndefOp>(loc, toType);
-    mlir::Type complexEleTy =
-        mlir::cast<mlir::ComplexType>(toType).getElementType();
-    mlir::Value cvt = builder.create<fir::ConvertOp>(loc, complexEleTy, load);
-    mlir::Value zero = builder.createRealZeroConstant(loc, complexEleTy);
-    mlir::Value idx0 = builder.create<fir::InsertValueOp>(
-        loc, toType, undef, cvt,
-        builder.getArrayAttr(
-            builder.getIntegerAttr(builder.getIndexType(), 0)));
-    mlir::Value idx1 = builder.create<fir::InsertValueOp>(
-        loc, toType, idx0, zero,
-        builder.getArrayAttr(
-            builder.getIntegerAttr(builder.getIndexType(), 1)));
-    builder.create<fir::StoreOp>(loc, idx1, toAddress);
-  } else {
-    auto cvt = builder.create<fir::ConvertOp>(loc, toType, load);
-    builder.create<fir::StoreOp>(loc, cvt, toAddress);
-  }
-}
 
-/// Processes an atomic construct with read clause.
-static void genAtomicRead(lower::AbstractConverter &converter,
-                          const parser::OmpAtomicRead &atomicRead,
-                          mlir::Location loc) {
-  const parser::OmpAtomicClauseList *rightHandClauseList = nullptr;
-  const parser::OmpAtomicClauseList *leftHandClauseList = nullptr;
-  // Get the address of atomic read operands.
-  rightHandClauseList = &std::get<2>(atomicRead.t);
-  leftHandClauseList = &std::get<0>(atomicRead.t);
+  builder.restoreInsertionPoint(atomicAt);
+  auto updateOp =
+      builder.create<mlir::omp::AtomicUpdateOp>(loc, atomAddr, hint, memOrder);
 
-  const auto &assignmentStmtExpr = std::get<parser::Expr>(
-      std::get<parser::Statement<parser::AssignmentStmt>>(atomicRead.t)
-          .statement.t);
-  const auto &assignmentStmtVariable = std::get<parser::Variable>(
-      std::get<parser::Statement<parser::AssignmentStmt>>(atomicRead.t)
-          .statement.t);
+  mlir::Region &region = updateOp->getRegion(0);
+  mlir::Block *block = builder.createBlock(&region, {}, {atomType}, {loc});
+  mlir::Value localAtom = fir::getBase(block->getArgument(0));
+  overrides.try_emplace(&atom, localAtom);
 
-  lower::StatementContext stmtCtx;
-  const semantics::SomeExpr &fromExpr = *semantics::GetExpr(assignmentStmtExpr);
-  mlir::Type elementType = converter.genType(fromExpr);
-  mlir::Value fromAddress =
-      fir::getBase(converter.genExprAddr(fromExpr, stmtCtx));
-  mlir::Value toAddress = fir::getBase(converter.genExprAddr(
-      *semantics::GetExpr(assignmentStmtVariable), stmtCtx));
-
-  if (fromAddress.getType() != toAddress.getType()) {
-
-    mlir::Type toType = fir::unwrapRefType(toAddress.getType());
-    mlir::Type fromType = fir::unwrapRefType(fromAddress.getType());
-    fir::FirOpBuilder &builder = converter.getFirOpBuilder();
-    auto oldIP = builder.saveInsertionPoint();
-    builder.setInsertionPointToStart(builder.getAllocaBlock());
-    mlir::Value alloca = builder.create<fir::AllocaOp>(
-        loc, fromType); // Thread scope `alloca` to atomically read `%x`.
-    builder.restoreInsertionPoint(oldIP);
-    genAtomicCaptureStatement(converter, fromAddress, alloca,
-                              leftHandClauseList, rightHandClauseList,
-                              elementType, loc);
-    emitAtomicReadImplicitCast(builder, loc, toAddress, toType, fromType,
-                               alloca);
-  } else
-    genAtomicCaptureStatement(converter, fromAddress, toAddress,
-                              leftHandClauseList, rightHandClauseList,
-                              elementType, loc);
-}
-
-/// Processes an atomic construct with update clause.
-static void genAtomicUpdate(lower::AbstractConverter &converter,
-                            const parser::OmpAtomicUpdate &atomicUpdate,
-                            mlir::Location loc) {
-  const parser::OmpAtomicClauseList *rightHandClauseList = nullptr;
-  const parser::OmpAtomicClauseList *leftHandClauseList = nullptr;
-  // Get the address of atomic read operands.
-  rightHandClauseList = &std::get<2>(atomicUpdate.t);
-  leftHandClauseList = &std::get<0>(atomicUpdate.t);
-
-  const auto &assignmentStmtExpr = std::get<parser::Expr>(
-      std::get<parser::Statement<parser::AssignmentStmt>>(atomicUpdate.t)
-          .statement.t);
-  const auto &assignmentStmtVariable = std::get<parser::Variable>(
-      std::get<parser::Statement<parser::AssignmentStmt>>(atomicUpdate.t)
-          .statement.t);
+  converter.overrideExprValues(&overrides);
+  mlir::Value updated =
+      fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc));
+  mlir::Value converted = builder.createConvert(loc, atomType, updated);
+  builder.create<mlir::omp::YieldOp>(loc, converted);
+  converter.resetExprOverrides();
 
-  lower::StatementContext stmtCtx;
-  mlir::Value lhsAddr = fir::getBase(converter.genExprAddr(
-      *semantics::GetExpr(assignmentStmtVariable), stmtCtx));
-  mlir::Type varType = fir::unwrapRefType(lhsAddr.getType());
-  genAtomicUpdateStatement(converter, lhsAddr, varType, assignmentStmtVariable,
-                           assignmentStmtExpr, leftHandClauseList,
-                           rightHandClauseList, loc);
-}
-
-/// Processes an atomic construct with no clause - which implies update clause.
-static void genOmpAtomic(lower::AbstractConverter &converter,
-                         const parser::OmpAtomic &atomicConstruct,
-                         mlir::Location loc) {
-  const parser::OmpAtomicClauseList &atomicClauseList =
-      std::get<parser::OmpAtomicClauseList>(atomicConstruct.t);
-  const auto &assignmentStmtExpr = std::get<parser::Expr>(
-      std::get<parser::Statement<parser::AssignmentStmt>>(atomicConstruct.t)
-          .statement.t);
-  const auto &assignmentStmtVariable = std::get<parser::Variable>(
-      std::get<parser::Statement<parser::AssignmentStmt>>(atomicConstruct.t)
-          .statement.t);
-  lower::StatementContext stmtCtx;
-  mlir::Value lhsAddr = fir::getBase(converter.genExprAddr(
-      *semantics::GetExpr(assignmentStmtVariable), stmtCtx));
-  mlir::Type varType = fir::unwrapRefType(lhsAddr.getType());
-  // If atomic-clause is not present on the construct, the behaviour is as if
-  // the update clause is specified (for both OpenMP and OpenACC).
-  genAtomicUpdateStatement(converter, lhsAddr, varType, assignmentStmtVariable,
-                           assignmentStmtExpr, &atomicClauseList, nullptr, loc);
-}
-
-/// Processes an atomic construct with capture clause.
-static void genAtomicCapture(lower::AbstractConverter &converter,
-                             const parser::OmpAtomicCapture &atomicCapture,
-                             mlir::Location loc) {
-  fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+  builder.restoreInsertionPoint(postAt); // For naCtx cleanups
+  return updateOp;
+}
 
-  const parser::AssignmentStmt &stmt1 =
-      std::get<parser::OmpAtomicCapture::Stmt1>(atomicCapture.t).v.statement;
-  const evaluate::Assignment &assign1 = *stmt1.typedAssignment->v;
-  const auto &stmt1Var{std::get<parser::Variable>(stmt1.t)};
-  const auto &stmt1Expr{std::get<parser::Expr>(stmt1.t)};
-  const parser::AssignmentStmt &stmt2 =
-      std::get<parser::OmpAtomicCapture::Stmt2>(atomicCapture.t).v.statement;
-  const evaluate::Assignment &assign2 = *stmt2.typedAssignment->v;
-  const auto &stmt2Var{std::get<parser::Variable>(stmt2.t)};
-  const auto &stmt2Expr{std::get<parser::Expr>(stmt2.t)};
-
-  // Pre-evaluate expressions to be used in the various operations inside
-  // `atomic.capture` since it is not desirable to have anything other than
-  // a `atomic.read`, `atomic.write`, or `atomic.update` operation
-  // inside `atomic.capture`
-  lower::StatementContext stmtCtx;
-  // LHS evaluations are common to all combinations of `atomic.capture`
-  mlir::Value stmt1LHSArg =
-      fir::getBase(converter.genExprAddr(assign1.lhs, stmtCtx));
-  mlir::Value stmt2LHSArg =
-      fir::getBase(converter.genExprAddr(assign2.lhs, stmtCtx));
-
-  // Type information used in generation of `atomic.update` operation
-  mlir::Type stmt1VarType =
-      fir::getBase(converter.genExprValue(assign1.lhs, stmtCtx)).getType();
-  mlir::Type stmt2VarType =
-      fir::getBase(converter.genExprValue(assign2.lhs, stmtCtx)).getType();
-
-  mlir::Operation *atomicCaptureOp = nullptr;
-  mlir::IntegerAttr hint = nullptr;
-  mlir::omp::ClauseMemoryOrderKindAttr memoryOrder = nullptr;
-  const parser::OmpAtomicClauseList &rightHandClauseList =
-      std::get<2>(atomicCapture.t);
-  const parser::OmpAtomicClauseList &leftHandClauseList =
-      std::get<0>(atomicCapture.t);
-  genOmpAtomicHintAndMemoryOrderClauses(converter, leftHandClauseList, hint,
-                                        memoryOrder);
-  genOmpAtomicHintAndMemoryOrderClauses(converter, rightHandClauseList, hint,
-                                        memoryOrder);
-  atomicCaptureOp =
-      firOpBuilder.create<mlir::omp::AtomicCaptureOp>(loc, hint, memoryOrder);
-
-  firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0)));
-  mlir::Block &block = atomicCaptureOp->getRegion(0).back();
-  firOpBuilder.setInsertionPointToStart(&block);
-  if (parser::CheckForSingleVariableOnRHS(stmt1)) {
-    if (semantics::CheckForSymbolMatch(semantics::GetExpr(stmt2Var),
-                                       semantics::GetExpr(stmt2Expr))) {
-      // Atomic capture construct is of the form [capture-stmt, update-stmt]
-      const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt1Expr);
-      mlir::Type elementType = converter.genType(fromExpr);
-      if (stmt1VarType != stmt2VarType) {
-        mlir::Value alloca;
-        mlir::Type toType = fir::unwrapRefType(stmt1LHSArg.getType());
-        mlir::Type fromType = fir::unwrapRefType(stmt2LHSArg.getType());
-        {
-          mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
-          firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock());
-          alloca = firOpBuilder.create<fir::AllocaOp>(loc, fromType);
-        }
-        genAtomicCaptureStatement(converter, stmt2LHSArg, alloca,
-                                  /*leftHandClauseList=*/nullptr,
-                                  /*rightHandClauseList=*/nullptr, elementType,
-                                  loc);
-        {
-          mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
-          firOpBuilder.setInsertionPointAfter(atomicCaptureOp);
-          emitAtomicReadImplicitCast(firOpBuilder, loc, stmt1LHSArg, toType,
-                                     fromType, alloca);
-        }
-      } else {
-        genAtomicCaptureStatement(converter, stmt2LHSArg, stmt1LHSArg,
-                                  /*leftHandClauseList=*/nullptr,
-                                  /*rightHandClauseList=*/nullptr, elementType,
-                                  loc);
-      }
-      genAtomicUpdateStatement(
-          converter, stmt2LHSArg, stmt2VarType, stmt2Var, stmt2Expr,
-          /*leftHandClauseList=*/nullptr,
-          /*rightHandClauseList=*/nullptr, loc, atomicCaptureOp, &stmtCtx);
-    } else {
-      // Atomic capture construct is of the form [capture-stmt, write-stmt]
-      firOpBuilder.setInsertionPoint(atomicCaptureOp);
-      mlir::Value stmt2RHSArg =
-          fir::getBase(converter.genExprValue(assign2.rhs, stmtCtx));
-      firOpBuilder.setInsertionPointToStart(&block);
-      const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt1Expr);
-      mlir::Type elementType = converter.genType(fromExpr);
-
-      if (stmt1VarType != stmt2VarType) {
-        mlir::Value alloca;
-        mlir::Type toType = fir::unwrapRefType(stmt1LHSArg.getType());
-        mlir::Type fromType = fir::unwrapRefType(stmt2LHSArg.getType());
-        {
-          mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
-          firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock());
-          alloca = firOpBuilder.create<fir::AllocaOp>(loc, fromType);
-        }
-        genAtomicCaptureStatement(converter, stmt2LHSArg, alloca,
-                                  /*leftHandClauseList=*/nullptr,
-                                  /*rightHandClauseList=*/nullptr, elementType,
-                                  loc);
-        {
-          mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
-          firOpBuilder.setInsertionPointAfter(atomicCaptureOp);
-          emitAtomicReadImplicitCast(firOpBuilder, loc, stmt1LHSArg, toType,
-                                     fromType, alloca);
-        }
-      } else {
-        genAtomicCaptureStatement(converter, stmt2LHSArg, stmt1LHSArg,
-                                  /*leftHandClauseList=*/nullptr,
-                                  /*rightHandClauseList=*/nullptr, elementType,
-                                  loc);
-      }
-      genAtomicWriteStatement(converter, stmt2LHSArg, stmt2RHSArg,
-                              /*leftHandClauseList=*/nullptr,
-                              /*rightHandClauseList=*/nullptr, loc);
-    }
-  } else {
-    // Atomic capture construct is of the form [update-stmt, capture-stmt]
-    const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt2Expr);
-    mlir::Type elementType = converter.genType(fromExpr);
-    genAtomicUpdateStatement(
-        converter, stmt1LHSArg, stmt1VarType, stmt1Var, stmt1Expr,
-        /*leftHandClauseList=*/nullptr,
-        /*rightHandClauseList=*/nullptr, loc, atomicCaptureOp, &stmtCtx);
-
-    if (stmt1VarType != stmt2VarType) {
-      mlir::Value alloca;
-      mlir::Type toType = fir::unwrapRefType(stmt2LHSArg.getType());
-      mlir::Type fromType = fir::unwrapRefType(stmt1LHSArg.getType());
-
-      {
-        mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
-        firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock());
-        alloca = firOpBuilder.create<fir::AllocaOp>(loc, fromType);
-      }
+static mlir::Operation *
+genAtomicOperation(lower::AbstractConverter &converter, mlir::Location loc,
+                   lower::StatementContext &stmtCtx, int action,
+                   mlir::Value atomAddr, const semantics::SomeExpr &atom,
+                   const evaluate::Assignment &assign, mlir::IntegerAttr hint,
+                   mlir::omp::ClauseMemoryOrderKindAttr memOrder,
+                   fir::FirOpBuilder::InsertPoint preAt,
+                   fir::FirOpBuilder::InsertPoint atomicAt,
+                   fir::FirOpBuilder::InsertPoint postAt) {
+  if (isPointerAssignment(assign)) {
+    TODO(loc, "Code generation for pointer assignment is not implemented yet");
+  }
 
-      genAtomicCaptureStatement(converter, stmt1LHSArg, alloca,
-                                /*leftHandClauseList=*/nullptr,
-                                /*rightHandClauseList=*/nullptr, elementType,
-                                loc);
-      {
-        mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
-        firOpBuilder.setInsertionPointAfter(atomicCaptureOp);
-        emitAtomicReadImplicitCast(firOpBuilder, loc, stmt2LHSArg, toType,
-                                   fromType, alloca);
-      }
-    } else {
-      genAtomicCaptureStatement(converter, stmt1LHSArg, stmt2LHSArg,
-                                /*leftHandClauseList=*/nullptr,
-                                /*rightHandClauseList=*/nullptr, elementType,
-                                loc);
-    }
+  // This function and the functions called here do not preserve the
+  // builder's insertion point, or set it to anything specific.
+  switch (action) {
+  case parser::OpenMPAtomicConstruct::Analysis::Read:
+    return genAtomicRead(converter, loc, stmtCtx, atomAddr, atom, assign, hint,
+                         memOrder, preAt, atomicAt, postAt);
+  case parser::OpenMPAtomicConstruct::Analysis::Write:
+    return genAtomicWrite(converter, loc, stmtCtx, atomAddr, atom, assign, hint,
+                          memOrder, preAt, atomicAt, postAt);
+  case parser::OpenMPAtomicConstruct::Analysis::Update:
+    return genAtomicUpdate(converter, loc, stmtCtx, atomAddr, atom, assign,
+                           hint, memOrder, preAt, atomicAt, postAt);
+  default:
+    return nullptr;
   }
-  firOpBuilder.setInsertionPointToEnd(&block);
-  firOpBuilder.create<mlir::omp::TerminatorOp>(loc);
-  // The clean-ups associated with the statements inside the capture
-  // construct must be generated after the AtomicCaptureOp.
-  firOpBuilder.setInsertionPointAfter(atomicCaptureOp);
 }
 
 //===----------------------------------------------------------------------===//
@@ -4212,10 +3795,6 @@ genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
       standaloneConstruct.u);
 }
 
-//===----------------------------------------------------------------------===//
-// OpenMPConstruct visitors
-//===----------------------------------------------------------------------===//
-
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
@@ -4223,38 +3802,164 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
   TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct");
 }
 
+//===----------------------------------------------------------------------===//
+// OpenMPConstruct visitors
+//===----------------------------------------------------------------------===//
+
+[[maybe_unused]] static void
+dumpAtomicAnalysis(const parser::OpenMPAtomicConstruct::Analysis &analysis) {
+  auto whatStr = [](int k) {
+    std::string txt = "?";
+    switch (k & parser::OpenMPAtomicConstruct::Analysis::Action) {
+    case parser::OpenMPAtomicConstruct::Analysis::None:
+      txt = "None";
+      break;
+    case parser::OpenMPAtomicConstruct::Analysis::Read:
+      txt = "Read";
+      break;
+    case parser::OpenMPAtomicConstruct::Analysis::Write:
+      txt = "Write";
+      break;
+    case parser::OpenMPAtomicConstruct::Analysis::Update:
+      txt = "Update";
+      break;
+    }
+    switch (k & parser::OpenMPAtomicConstruct::Analysis::Condition) {
+    case parser::OpenMPAtomicConstruct::Analysis::IfTrue:
+      txt += " | IfTrue";
+      break;
+    case parser::OpenMPAtomicConstruct::Analysis::IfFalse:
+      txt += " | IfFalse";
+      break;
+    }
+    return txt;
+  };
+
+  auto exprStr = [&](const parser::TypedExpr &expr) {
+    if (auto *maybe = expr.get()) {
+      if (maybe->v)
+        return maybe->v->AsFortran();
+    }
+    return "<null>"s;
+  };
+  auto assignStr = [&](const parser::AssignmentStmt::TypedAssignment &assign) {
+    if (auto *maybe = assign.get(); maybe && maybe->v) {
+      std::string str;
+      llvm::raw_string_ostream os(str);
+      maybe->v->AsFortran(os);
+      return str;
+    }
+    return "<null>"s;
+  };
+
+  const SomeExpr &atom = *analysis.atom.get()->v;
+
+  llvm::errs() << "Analysis {\n";
+  llvm::errs() << "  atom: " << atom.AsFortran() << "\n";
+  llvm::errs() << "  cond: " << exprStr(analysis.cond) << "\n";
+  llvm::errs() << "  op0 {\n";
+  llvm::errs() << "    what: " << whatStr(analysis.op0.what) << "\n";
+  llvm::errs() << "    assign: " << assignStr(analysis.op0.assign) << "\n";
+  llvm::errs() << "  }\n";
+  llvm::errs() << "  op1 {\n";
+  llvm::errs() << "    what: " << whatStr(analysis.op1.what) << "\n";
+  llvm::errs() << "    assign: " << assignStr(analysis.op1.assign) << "\n";
+  llvm::errs() << "  }\n";
+  llvm::errs() << "}\n";
+}
+
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
-                   const parser::OpenMPAtomicConstruct &atomicConstruct) {
-  Fortran::common::visit(
-      common::visitors{
-          [&](const parser::OmpAtomicRead &atomicRead) {
-            mlir::Location loc = converter.genLocation(atomicRead.source);
-            genAtomicRead(converter, atomicRead, loc);
-          },
-          [&](const parser::OmpAtomicWrite &atomicWrite) {
-            mlir::Location loc = converter.genLocation(atomicWrite.source);
-            genAtomicWrite(converter, atomicWrite, loc);
-          },
-          [&](const parser::OmpAtomic &atomicConstruct) {
-            mlir::Location loc = converter.genLocation(atomicConstruct.source);
-            genOmpAtomic(converter, atomicConstruct, loc);
-          },
-          [&](const parser::OmpAtomicUpdate &atomicUpdate) {
-            mlir::Location loc = converter.genLocation(atomicUpdate.source);
-            genAtomicUpdate(converter, atomicUpdate, loc);
-          },
-          [&](const parser::OmpAtomicCapture &atomicCapture) {
-            mlir::Location loc = converter.genLocation(atomicCapture.source);
-            genAtomicCapture(converter, atomicCapture, loc);
-          },
-          [&](const parser::OmpAtomicCompare &atomicCompare) {
-            mlir::Location loc = converter.genLocation(atomicCompare.source);
-            TODO(loc, "OpenMP atomic compare");
-          },
-      },
-      atomicConstruct.u);
+                   const parser::OpenMPAtomicConstruct &construct) {
+  auto get = [](auto &&typedWrapper) -> decltype(&*typedWrapper.get()->v) {
+    if (auto *maybe = typedWrapper.get(); maybe && maybe->v) {
+      return &*maybe->v;
+    } else {
+      return nullptr;
+    }
+  };
+
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  auto &dirSpec = std::get<parser::OmpDirectiveSpecification>(construct.t);
+  List<Clause> clauses = makeClauses(dirSpec.Clauses(), semaCtx);
+  lower::StatementContext stmtCtx;
+
+  const parser::OpenMPAtomicConstruct::Analysis &analysis = construct.analysis;
+  if (DumpAtomicAnalysis)
+    dumpAtomicAnalysis(analysis);
+
+  const semantics::SomeExpr &atom = *get(analysis.atom);
+  mlir::Location loc = converter.genLocation(construct.source);
+  mlir::Value atomAddr =
+      fir::getBase(converter.genExprAddr(atom, stmtCtx, &loc));
+  mlir::IntegerAttr hint = getAtomicHint(converter, clauses);
+  mlir::omp::ClauseMemoryOrderKindAttr memOrder =
+      getAtomicMemoryOrder(converter, semaCtx, clauses);
+
+  if (auto *cond = get(analysis.cond)) {
+    (void)cond;
+    TODO(loc, "OpenMP ATOMIC COMPARE");
+  } else {
+    int action0 = analysis.op0.what & analysis.Action;
+    int action1 = analysis.op1.what & analysis.Action;
+    mlir::Operation *captureOp = nullptr;
+    fir::FirOpBuilder::InsertPoint preAt = builder.saveInsertionPoint();
+    fir::FirOpBuilder::InsertPoint atomicAt, postAt;
+
+    if (construct.IsCapture()) {
+      // Capturing operation.
+      assert(action0 != analysis.None && action1 != analysis.None &&
+             "Expexcing two actions");
+      captureOp =
+          builder.create<mlir::omp::AtomicCaptureOp>(loc, hint, memOrder);
+      // Set the non-atomic insertion point to before the atomic.capture.
+      preAt = getInsertionPointBefore(captureOp);
+
+      mlir::Block *block = builder.createBlock(&captureOp->getRegion(0));
+      builder.setInsertionPointToEnd(block);
+      // Set the atomic insertion point to before the terminator inside
+      // atomic.capture.
+      mlir::Operation *term = builder.create<mlir::omp::TerminatorOp>(loc);
+      atomicAt = getInsertionPointBefore(term);
+      postAt = getInsertionPointAfter(captureOp);
+      hint = nullptr;
+      memOrder = nullptr;
+    } else {
+      // Non-capturing operation.
+      assert(action0 != analysis.None && action1 == analysis.None &&
+             "Expexcing single action");
+      assert(!(analysis.op0.what & analysis.Condition));
+      postAt = atomicAt = preAt;
+    }
+
+    // The builder's insertion point needs to be specifically set before
+    // each call to `genAtomicOperation`.
+    mlir::Operation *firstOp = genAtomicOperation(
+        converter, loc, stmtCtx, analysis.op0.what, atomAddr, atom,
+        *get(analysis.op0.assign), hint, memOrder, preAt, atomicAt, postAt);
+    assert(firstOp && "Should have created an atomic operation");
+    atomicAt = getInsertionPointAfter(firstOp);
+
+    mlir::Operation *secondOp = nullptr;
+    if (analysis.op1.what != analysis.None) {
+      secondOp = genAtomicOperation(converter, loc, stmtCtx, analysis.op1.what,
+                                    atomAddr, atom, *get(analysis.op1.assign),
+                                    hint, memOrder, preAt, atomicAt, postAt);
+    }
+
+    if (construct.IsCapture()) {
+      // If this is a capture operation, the first/second ops will be inside
+      // of it. Set the insertion point to past the capture op itself.
+      builder.restoreInsertionPoint(postAt);
+    } else {
+      if (secondOp) {
+        builder.setInsertionPointAfter(secondOp);
+      } else {
+        builder.setInsertionPointAfter(firstOp);
+      }
+    }
+  }
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 08326fad8c143..9b112a2133918 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -24,6 +24,12 @@
 // OpenMP Directives and Clauses
 namespace Fortran::parser {
 
+// Helper function to print the buffer contents starting at the current point.
+[[maybe_unused]] static std::string ahead(const ParseState &state) {
+  return std::string(
+      state.GetLocation(), std::min<size_t>(64, state.BytesRemaining()));
+}
+
 constexpr auto startOmpLine = skipStuffBeforeStatement >> "!$OMP "_sptok;
 constexpr auto endOmpLine = space >> endOfLine;
 
@@ -941,8 +947,10 @@ TYPE_PARSER( //
             parenthesized(Parser<OmpAtomicDefaultMemOrderClause>{}))) ||
     "BIND" >> construct<OmpClause>(construct<OmpClause::Bind>(
                   parenthesized(Parser<OmpBindClause>{}))) ||
+    "CAPTURE" >> construct<OmpClause>(construct<OmpClause::Capture>()) ||
     "COLLAPSE" >> construct<OmpClause>(construct<OmpClause::Collapse>(
                       parenthesized(scalarIntConstantExpr))) ||
+    "COMPARE" >> construct<OmpClause>(construct<OmpClause::Compare>()) ||
     "CONTAINS" >> construct<OmpClause>(construct<OmpClause::Contains>(
                       parenthesized(Parser<OmpContainsClause>{}))) ||
     "COPYIN" >> construct<OmpClause>(construct<OmpClause::Copyin>(
@@ -1062,6 +1070,7 @@ TYPE_PARSER( //
     "TASK_REDUCTION" >>
         construct<OmpClause>(construct<OmpClause::TaskReduction>(
             parenthesized(Parser<OmpTaskReductionClause>{}))) ||
+    "READ" >> construct<OmpClause>(construct<OmpClause::Read>()) ||
     "RELAXED" >> construct<OmpClause>(construct<OmpClause::Relaxed>()) ||
     "RELEASE" >> construct<OmpClause>(construct<OmpClause::Release>()) ||
     "REVERSE_OFFLOAD" >>
@@ -1105,6 +1114,7 @@ TYPE_PARSER( //
                     maybe(Parser<OmpUpdateClause>{}))) ||
     "WHEN" >> construct<OmpClause>(construct<OmpClause::When>(
                   parenthesized(Parser<OmpWhenClause>{}))) ||
+    "WRITE" >> construct<OmpClause>(construct<OmpClause::Write>()) ||
     // Cancellable constructs
     construct<OmpClause>(construct<OmpClause::CancellationConstructType>(
         Parser<OmpCancellationConstructTypeClause>{})))
@@ -1223,6 +1233,155 @@ TYPE_PARSER(sourced(construct<OmpLoopDirective>(first(
 TYPE_PARSER(sourced(construct<OmpBeginLoopDirective>(
     sourced(Parser<OmpLoopDirective>{}), Parser<OmpClauseList>{})))
 
+struct OmpEndDirectiveParser {
+  using resultType = OmpDirectiveSpecification;
+
+  constexpr OmpEndDirectiveParser(llvm::omp::Directive dir) : dir_(dir) {}
+
+  std::optional<resultType> Parse(ParseState &state) const {
+    if ((startOmpLine >> "END"_sptok).Parse(state)) {
+      auto &&dirSpec{Parser<OmpDirectiveSpecification>{}.Parse(state)};
+      if (dirSpec && dirSpec->DirId() == dir_) {
+        return std::move(dirSpec);
+      }
+    }
+    return std::nullopt;
+  }
+
+private:
+  llvm::omp::Directive dir_;
+};
+
+// Parser for an arbitrary OpenMP ATOMIC construct.
+//
+// Depending on circumstances, an ATOMIC construct applies to one or more
+// following statements. In certain cases when a single statement is
+// expected, the end-directive is optional. The specifics depend on both
+// the clauses used, and the form of the executable statement. To emit
+// more meaningful messages in case of errors, the exact analysis of the
+// structure of the construct will be delayed until semantic checks.
+//
+// The parser will first try the case when the end-directive is present,
+// and will parse at most "BodyLimit" (and potentially zero) constructs
+// while looking for the end-directive before it gives up.
+// Then it will assume that no end-directive is present, and will try to
+// parse a single executable construct as the body of the construct.
+//
+// The limit on the number of constructs is there to reduce the amount of
+// unnecessary parsing when the end-directive is absent. It's higher than
+// the maximum number of statements in any valid construct to accept cases
+// when extra statements are present by mistake.
+// A problem can occur when atomic constructs without end-directive follow
+// each other closely, e.g.
+//   !$omp atomic write
+//     x = v
+//   !$omp atomic update
+//     x = x + 1
+//   ...
+// The speculative parsing will become "recursive", and has the potential
+// to take a (practically) infinite amount of time given a sufficiently
+// large number of such constructs in a row. Since atomic constructs cannot
+// contain other OpenMP constructs, guarding against recursive calls to the
+// atomic construct parser solves the problem.
+struct OmpAtomicConstructParser {
+  using resultType = OpenMPAtomicConstruct;
+
+  static constexpr size_t BodyLimit{5};
+
+  std::optional<resultType> Parse(ParseState &state) const {
+    if (recursing_) {
+      return std::nullopt;
+    }
+    recursing_ = true;
+
+    auto dirSpec{Parser<OmpDirectiveSpecification>{}.Parse(state)};
+    if (!dirSpec || dirSpec->DirId() != llvm::omp::Directive::OMPD_atomic) {
+      recursing_ = false;
+      return std::nullopt;
+    }
+
+    auto exec{Parser<ExecutionPartConstruct>{}};
+    auto end{OmpEndDirectiveParser{llvm::omp::Directive::OMPD_atomic}};
+    TailType tail;
+
+    if (ParseOne(exec, end, tail, state)) {
+      if (!tail.first.empty()) {
+        if (auto &&rest{attempt(LimitedTailParser(BodyLimit)).Parse(state)}) {
+          for (auto &&s : rest->first) {
+            tail.first.emplace_back(std::move(s));
+          }
+          assert(!tail.second);
+          tail.second = std::move(rest->second);
+        }
+      }
+      recursing_ = false;
+      return OpenMPAtomicConstruct{
+          std::move(*dirSpec), std::move(tail.first), std::move(tail.second)};
+    }
+
+    recursing_ = false;
+    return std::nullopt;
+  }
+
+private:
+  // Begin-directive + TailType = entire construct.
+  using TailType = std::pair<Block, std::optional<OmpDirectiveSpecification>>;
+
+  // Parse either an ExecutionPartConstruct, or atomic end-directive. When
+  // successful, record the result in the "tail" provided, otherwise fail.
+  static std::optional<Success> ParseOne( //
+      Parser<ExecutionPartConstruct> &exec, OmpEndDirectiveParser &end,
+      TailType &tail, ParseState &state) {
+    auto isRecovery{[](const ExecutionPartConstruct &e) {
+      return std::holds_alternative<ErrorRecovery>(e.u);
+    }};
+    if (auto &&stmt{attempt(exec).Parse(state)}; stmt && !isRecovery(*stmt)) {
+      tail.first.emplace_back(std::move(*stmt));
+    } else if (auto &&dir{attempt(end).Parse(state)}) {
+      tail.second = std::move(*dir);
+    } else {
+      return std::nullopt;
+    }
+    return Success{};
+  }
+
+  struct LimitedTailParser {
+    using resultType = TailType;
+
+    constexpr LimitedTailParser(size_t count) : count_(count) {}
+
+    std::optional<resultType> Parse(ParseState &state) const {
+      auto exec{Parser<ExecutionPartConstruct>{}};
+      auto end{OmpEndDirectiveParser{llvm::omp::Directive::OMPD_atomic}};
+      TailType tail;
+
+      for (size_t i{0}; i != count_; ++i) {
+        if (ParseOne(exec, end, tail, state)) {
+          if (tail.second) {
+            // Return when the end-directive was parsed.
+            return std::move(tail);
+          }
+        } else {
+          break;
+        }
+      }
+      return std::nullopt;
+    }
+
+  private:
+    const size_t count_;
+  };
+
+  // The recursion guard should become thread_local if parsing is ever
+  // parallelized.
+  static bool recursing_;
+};
+
+bool OmpAtomicConstructParser::recursing_{false};
+
+TYPE_PARSER(sourced( //
+    construct<OpenMPAtomicConstruct>(OmpAtomicConstructParser{})))
+
 // 2.17.7 Atomic construct/2.17.8 Flush construct [OpenMP 5.0]
 //        memory-order-clause ->
 //                               acq_rel
@@ -1237,19 +1396,6 @@ TYPE_PARSER(sourced(construct<OmpMemoryOrderClause>(
         "RELEASE" >> construct<OmpClause>(construct<OmpClause::Release>()) ||
         "SEQ_CST" >> construct<OmpClause>(construct<OmpClause::SeqCst>())))))
 
-// 2.17.7 Atomic construct
-//        atomic-clause -> memory-order-clause | HINT(hint-expression)
-TYPE_PARSER(sourced(construct<OmpAtomicClause>(
-    construct<OmpAtomicClause>(Parser<OmpMemoryOrderClause>{}) ||
-    construct<OmpAtomicClause>(
-        "FAIL" >> parenthesized(Parser<OmpFailClause>{})) ||
-    construct<OmpAtomicClause>(
-        "HINT" >> parenthesized(Parser<OmpHintClause>{})))))
-
-// atomic-clause-list -> [atomic-clause, [atomic-clause], ...]
-TYPE_PARSER(sourced(construct<OmpAtomicClauseList>(
-    many(maybe(","_tok) >> sourced(Parser<OmpAtomicClause>{})))))
-
 static bool IsSimpleStandalone(const OmpDirectiveName &name) {
   switch (name.v) {
   case llvm::omp::Directive::OMPD_barrier:
@@ -1421,67 +1567,6 @@ TYPE_PARSER(sourced(
 TYPE_PARSER(construct<OmpReductionCombiner>(Parser<AssignmentStmt>{}) ||
     construct<OmpReductionCombiner>(Parser<FunctionReference>{}))
 
-// 2.17.7 atomic -> ATOMIC [clause [,]] atomic-clause [[,] clause] |
-//                  ATOMIC [clause]
-//       clause -> memory-order-clause | HINT(hint-expression)
-//       memory-order-clause -> SEQ_CST | ACQ_REL | RELEASE | ACQUIRE | RELAXED
-//       atomic-clause -> READ | WRITE | UPDATE | CAPTURE
-
-// OMP END ATOMIC
-TYPE_PARSER(construct<OmpEndAtomic>(startOmpLine >> "END ATOMIC"_tok))
-
-// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] READ [MEMORY-ORDER-CLAUSE-LIST]
-TYPE_PARSER("ATOMIC" >>
-    sourced(construct<OmpAtomicRead>(
-        Parser<OmpAtomicClauseList>{} / maybe(","_tok), verbatim("READ"_tok),
-        Parser<OmpAtomicClauseList>{} / endOmpLine, statement(assignmentStmt),
-        maybe(Parser<OmpEndAtomic>{} / endOmpLine))))
-
-// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] CAPTURE [MEMORY-ORDER-CLAUSE-LIST]
-TYPE_PARSER("ATOMIC" >>
-    sourced(construct<OmpAtomicCapture>(
-        Parser<OmpAtomicClauseList>{} / maybe(","_tok), verbatim("CAPTURE"_tok),
-        Parser<OmpAtomicClauseList>{} / endOmpLine, statement(assignmentStmt),
-        statement(assignmentStmt), Parser<OmpEndAtomic>{} / endOmpLine)))
-
-TYPE_PARSER(construct<OmpAtomicCompareIfStmt>(indirect(Parser<IfStmt>{})) ||
-    construct<OmpAtomicCompareIfStmt>(indirect(Parser<IfConstruct>{})))
-
-// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] COMPARE [MEMORY-ORDER-CLAUSE-LIST]
-TYPE_PARSER("ATOMIC" >>
-    sourced(construct<OmpAtomicCompare>(
-        Parser<OmpAtomicClauseList>{} / maybe(","_tok), verbatim("COMPARE"_tok),
-        Parser<OmpAtomicClauseList>{} / endOmpLine,
-        Parser<OmpAtomicCompareIfStmt>{},
-        maybe(Parser<OmpEndAtomic>{} / endOmpLine))))
-
-// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] UPDATE [MEMORY-ORDER-CLAUSE-LIST]
-TYPE_PARSER("ATOMIC" >>
-    sourced(construct<OmpAtomicUpdate>(
-        Parser<OmpAtomicClauseList>{} / maybe(","_tok), verbatim("UPDATE"_tok),
-        Parser<OmpAtomicClauseList>{} / endOmpLine, statement(assignmentStmt),
-        maybe(Parser<OmpEndAtomic>{} / endOmpLine))))
-
-// OMP ATOMIC [atomic-clause-list]
-TYPE_PARSER(sourced(construct<OmpAtomic>(verbatim("ATOMIC"_tok),
-    Parser<OmpAtomicClauseList>{} / endOmpLine, statement(assignmentStmt),
-    maybe(Parser<OmpEndAtomic>{} / endOmpLine))))
-
-// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] WRITE [MEMORY-ORDER-CLAUSE-LIST]
-TYPE_PARSER("ATOMIC" >>
-    sourced(construct<OmpAtomicWrite>(
-        Parser<OmpAtomicClauseList>{} / maybe(","_tok), verbatim("WRITE"_tok),
-        Parser<OmpAtomicClauseList>{} / endOmpLine, statement(assignmentStmt),
-        maybe(Parser<OmpEndAtomic>{} / endOmpLine))))
-
-// Atomic Construct
-TYPE_PARSER(construct<OpenMPAtomicConstruct>(Parser<OmpAtomicRead>{}) ||
-    construct<OpenMPAtomicConstruct>(Parser<OmpAtomicCapture>{}) ||
-    construct<OpenMPAtomicConstruct>(Parser<OmpAtomicCompare>{}) ||
-    construct<OpenMPAtomicConstruct>(Parser<OmpAtomicWrite>{}) ||
-    construct<OpenMPAtomicConstruct>(Parser<OmpAtomicUpdate>{}) ||
-    construct<OpenMPAtomicConstruct>(Parser<OmpAtomic>{}))
-
 // 2.13.2 OMP CRITICAL
 TYPE_PARSER(startOmpLine >>
     sourced(construct<OmpEndCriticalDirective>(
diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp
index 3dd87ad9a3650..824612e49293f 100644
--- a/flang/lib/Parser/parse-tree.cpp
+++ b/flang/lib/Parser/parse-tree.cpp
@@ -321,6 +321,34 @@ std::string OmpTraitSetSelectorName::ToString() const {
   return std::string(EnumToString(v));
 }
 
+llvm::omp::Clause OpenMPAtomicConstruct::GetKind() const {
+  auto &dirSpec{std::get<OmpDirectiveSpecification>(t)};
+  for (auto &clause : dirSpec.Clauses().v) {
+    switch (clause.Id()) {
+    case llvm::omp::Clause::OMPC_read:
+    case llvm::omp::Clause::OMPC_write:
+    case llvm::omp::Clause::OMPC_update:
+      return clause.Id();
+    default:
+      break;
+    }
+  }
+  return llvm::omp::Clause::OMPC_update;
+}
+
+bool OpenMPAtomicConstruct::IsCapture() const {
+  auto &dirSpec{std::get<OmpDirectiveSpecification>(t)};
+  return llvm::any_of(dirSpec.Clauses().v, [](auto &clause) {
+    return clause.Id() == llvm::omp::Clause::OMPC_capture;
+  });
+}
+
+bool OpenMPAtomicConstruct::IsCompare() const {
+  auto &dirSpec{std::get<OmpDirectiveSpecification>(t)};
+  return llvm::any_of(dirSpec.Clauses().v, [](auto &clause) {
+    return clause.Id() == llvm::omp::Clause::OMPC_compare;
+  });
+}
 } // namespace Fortran::parser
 
 template <typename C> static llvm::omp::Clause getClauseIdForClass(C &&) {
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index e0abe95d07c86..ed0f227fd5b98 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2571,83 +2571,22 @@ class UnparseVisitor {
     Word(ToUpperCaseLetters(common::EnumToString(x)));
   }
 
-  void Unparse(const OmpAtomicClauseList &x) { Walk(" ", x.v, " "); }
-
-  void Unparse(const OmpAtomic &x) {
-    BeginOpenMP();
-    Word("!$OMP ATOMIC");
-    Walk(std::get<OmpAtomicClauseList>(x.t));
-    Put("\n");
-    EndOpenMP();
-    Walk(std::get<Statement<AssignmentStmt>>(x.t));
-    BeginOpenMP();
-    Walk(std::get<std::optional<OmpEndAtomic>>(x.t), "!$OMP END ATOMIC\n");
-    EndOpenMP();
-  }
-  void Unparse(const OmpAtomicCapture &x) {
-    BeginOpenMP();
-    Word("!$OMP ATOMIC");
-    Walk(std::get<0>(x.t));
-    Word(" CAPTURE");
-    Walk(std::get<2>(x.t));
-    Put("\n");
-    EndOpenMP();
-    Walk(std::get<OmpAtomicCapture::Stmt1>(x.t));
-    Put("\n");
-    Walk(std::get<OmpAtomicCapture::Stmt2>(x.t));
-    BeginOpenMP();
-    Word("!$OMP END ATOMIC\n");
-    EndOpenMP();
-  }
-  void Unparse(const OmpAtomicCompare &x) {
-    BeginOpenMP();
-    Word("!$OMP ATOMIC");
-    Walk(std::get<0>(x.t));
-    Word(" COMPARE");
-    Walk(std::get<2>(x.t));
-    Put("\n");
-    EndOpenMP();
-    Walk(std::get<OmpAtomicCompareIfStmt>(x.t));
-  }
-  void Unparse(const OmpAtomicRead &x) {
-    BeginOpenMP();
-    Word("!$OMP ATOMIC");
-    Walk(std::get<0>(x.t));
-    Word(" READ");
-    Walk(std::get<2>(x.t));
-    Put("\n");
-    EndOpenMP();
-    Walk(std::get<Statement<AssignmentStmt>>(x.t));
-    BeginOpenMP();
-    Walk(std::get<std::optional<OmpEndAtomic>>(x.t), "!$OMP END ATOMIC\n");
-    EndOpenMP();
-  }
-  void Unparse(const OmpAtomicUpdate &x) {
+  void Unparse(const OpenMPAtomicConstruct &x) {
     BeginOpenMP();
-    Word("!$OMP ATOMIC");
-    Walk(std::get<0>(x.t));
-    Word(" UPDATE");
-    Walk(std::get<2>(x.t));
-    Put("\n");
-    EndOpenMP();
-    Walk(std::get<Statement<AssignmentStmt>>(x.t));
-    BeginOpenMP();
-    Walk(std::get<std::optional<OmpEndAtomic>>(x.t), "!$OMP END ATOMIC\n");
-    EndOpenMP();
-  }
-  void Unparse(const OmpAtomicWrite &x) {
-    BeginOpenMP();
-    Word("!$OMP ATOMIC");
-    Walk(std::get<0>(x.t));
-    Word(" WRITE");
-    Walk(std::get<2>(x.t));
+    Word("!$OMP ");
+    Walk(std::get<OmpDirectiveSpecification>(x.t));
     Put("\n");
     EndOpenMP();
-    Walk(std::get<Statement<AssignmentStmt>>(x.t));
-    BeginOpenMP();
-    Walk(std::get<std::optional<OmpEndAtomic>>(x.t), "!$OMP END ATOMIC\n");
-    EndOpenMP();
+    Walk(std::get<Block>(x.t), "");
+    if (auto &end{std::get<std::optional<OmpDirectiveSpecification>>(x.t)}) {
+      BeginOpenMP();
+      Word("!$OMP END ");
+      Walk(*end);
+      Put("\n");
+      EndOpenMP();
+    }
   }
+
   void Unparse(const OpenMPExecutableAllocate &x) {
     const auto &fields =
         std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>(
@@ -2920,23 +2859,8 @@ class UnparseVisitor {
     Put("\n");
     EndOpenMP();
   }
+  void Unparse(const OmpFailClause &x) { Walk(x.v); }
   void Unparse(const OmpMemoryOrderClause &x) { Walk(x.v); }
-  void Unparse(const OmpAtomicClause &x) {
-    common::visit(common::visitors{
-                      [&](const OmpMemoryOrderClause &y) { Walk(y); },
-                      [&](const OmpFailClause &y) {
-                        Word("FAIL(");
-                        Walk(y.v);
-                        Put(")");
-                      },
-                      [&](const OmpHintClause &y) {
-                        Word("HINT(");
-                        Walk(y.v);
-                        Put(")");
-                      },
-                  },
-        x.u);
-  }
   void Unparse(const OmpMetadirectiveDirective &x) {
     BeginOpenMP();
     Word("!$OMP METADIRECTIVE ");
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 31fcbb9683202..4dccb0e88e324 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -17,10 +17,16 @@
 #include "flang/Semantics/openmp-modifiers.h"
 #include "flang/Semantics/tools.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
 #include <variant>
 
 namespace Fortran::semantics {
 
+template <typename T, typename U>
+static bool operator!=(const evaluate::Expr<T> &e, const evaluate::Expr<U> &f) {
+  return !(e == f);
+}
+
 // Use when clause falls under 'struct OmpClause' in 'parse-tree.h'.
 #define CHECK_SIMPLE_CLAUSE(X, Y) \
   void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \
@@ -79,6 +85,32 @@ static const parser::ArrayElement *GetArrayElementFromObj(
   return nullptr;
 }
 
+static bool IsVarOrFunctionRef(const MaybeExpr &expr) {
+  if (expr) {
+    return evaluate::UnwrapProcedureRef(*expr) != nullptr ||
+        evaluate::IsVariable(*expr);
+  } else {
+    return false;
+  }
+}
+
+static std::optional<SomeExpr> GetEvaluateExpr(const parser::Expr &parserExpr) {
+  const parser::TypedExpr &typedExpr{parserExpr.typedExpr};
+  // ForwardOwningPointer           typedExpr
+  // `- GenericExprWrapper          ^.get()
+  //    `- std::optional<Expr>      ^->v
+  return typedExpr.get()->v;
+}
+
+static std::optional<evaluate::DynamicType> GetDynamicType(
+    const parser::Expr &parserExpr) {
+  if (auto maybeExpr{GetEvaluateExpr(parserExpr)}) {
+    return maybeExpr->GetType();
+  } else {
+    return std::nullopt;
+  }
+}
+
 // 'OmpWorkshareBlockChecker' is used to check the validity of the assignment
 // statements and the expressions enclosed in an OpenMP Workshare construct
 class OmpWorkshareBlockChecker {
@@ -595,51 +627,26 @@ void OmpStructureChecker::CheckPredefinedAllocatorRestriction(
   }
 }
 
-template <class D>
-void OmpStructureChecker::CheckHintClause(
-    D *leftOmpClauseList, D *rightOmpClauseList, std::string_view dirName) {
-  bool foundHint{false};
+void OmpStructureChecker::Enter(const parser::OmpClause::Hint &x) {
+  CheckAllowedClause(llvm::omp::Clause::OMPC_hint);
+  auto &dirCtx{GetContext()};
 
-  auto checkForValidHintClause = [&](const D *clauseList) {
-    for (const auto &clause : clauseList->v) {
-      const parser::OmpHintClause *ompHintClause = nullptr;
-      if constexpr (std::is_same_v<D, const parser::OmpAtomicClauseList>) {
-        ompHintClause = std::get_if<parser::OmpHintClause>(&clause.u);
-      } else if constexpr (std::is_same_v<D, const parser::OmpClauseList>) {
-        if (auto *hint{std::get_if<parser::OmpClause::Hint>(&clause.u)}) {
-          ompHintClause = &hint->v;
-        }
-      }
-      if (!ompHintClause)
-        continue;
-      if (foundHint) {
-        context_.Say(clause.source,
-            "At most one HINT clause can appear on the %s directive"_err_en_US,
-            parser::ToUpperCaseLetters(dirName));
-      }
-      foundHint = true;
-      std::optional<std::int64_t> hintValue = GetIntValue(ompHintClause->v);
-      if (hintValue && *hintValue >= 0) {
-        /*`omp_sync_hint_nonspeculative` and `omp_lock_hint_speculative`*/
-        if ((*hintValue & 0xC) == 0xC
-            /*`omp_sync_hint_uncontended` and omp_sync_hint_contended*/
-            || (*hintValue & 0x3) == 0x3)
-          context_.Say(clause.source,
-              "Hint clause value "
-              "is not a valid OpenMP synchronization value"_err_en_US);
-      } else {
-        context_.Say(clause.source,
-            "Hint clause must have non-negative constant "
-            "integer expression"_err_en_US);
+  if (std::optional<int64_t> maybeVal{GetIntValue(x.v.v)}) {
+    int64_t val{*maybeVal};
+    if (val >= 0) {
+      // Check contradictory values.
+      if ((val & 0xC) == 0xC || // omp_sync_hint_speculative and nonspeculative
+          (val & 0x3) == 0x3) { // omp_sync_hint_contended and uncontended
+        context_.Say(dirCtx.clauseSource,
+            "The synchronization hint is not valid"_err_en_US);
       }
+    } else {
+      context_.Say(dirCtx.clauseSource,
+          "Synchronization hint must be non-negative"_err_en_US);
     }
-  };
-
-  if (leftOmpClauseList) {
-    checkForValidHintClause(leftOmpClauseList);
-  }
-  if (rightOmpClauseList) {
-    checkForValidHintClause(rightOmpClauseList);
+  } else {
+    context_.Say(dirCtx.clauseSource,
+        "Synchronization hint must be a constant integer value"_err_en_US);
   }
 }
 
@@ -2396,8 +2403,9 @@ void OmpStructureChecker::Leave(const parser::OpenMPCancelConstruct &) {
 
 void OmpStructureChecker::Enter(const parser::OpenMPCriticalConstruct &x) {
   const auto &dir{std::get<parser::OmpCriticalDirective>(x.t)};
+  const auto &dirSource{std::get<parser::Verbatim>(dir.t).source};
   const auto &endDir{std::get<parser::OmpEndCriticalDirective>(x.t)};
-  PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_critical);
+  PushContextAndClauseSets(dirSource, llvm::omp::Directive::OMPD_critical);
   const auto &block{std::get<parser::Block>(x.t)};
   CheckNoBranching(block, llvm::omp::Directive::OMPD_critical, dir.source);
   const auto &dirName{std::get<std::optional<parser::Name>>(dir.t)};
@@ -2430,7 +2438,6 @@ void OmpStructureChecker::Enter(const parser::OpenMPCriticalConstruct &x) {
             "Hint clause other than omp_sync_hint_none cannot be specified for "
             "an unnamed CRITICAL directive"_err_en_US});
   }
-  CheckHintClause<const parser::OmpClauseList>(&ompClause, nullptr, "CRITICAL");
 }
 
 void OmpStructureChecker::Leave(const parser::OpenMPCriticalConstruct &) {
@@ -2667,422 +2674,1418 @@ void OmpStructureChecker::Leave(const parser::OmpEndBlockDirective &x) {
   }
 }
 
-inline void OmpStructureChecker::ErrIfAllocatableVariable(
-    const parser::Variable &var) {
-  // Err out if the given symbol has
-  // ALLOCATABLE attribute
-  if (const auto *e{GetExpr(context_, var)})
-    for (const Symbol &symbol : evaluate::CollectSymbols(*e))
-      if (IsAllocatable(symbol)) {
-        const auto &designator =
-            std::get<common::Indirection<parser::Designator>>(var.u);
-        const auto *dataRef =
-            std::get_if<parser::DataRef>(&designator.value().u);
-        const parser::Name *name =
-            dataRef ? std::get_if<parser::Name>(&dataRef->u) : nullptr;
-        if (name)
-          context_.Say(name->source,
-              "%s must not have ALLOCATABLE "
-              "attribute"_err_en_US,
-              name->ToString());
+/// parser::Block is a list of executable constructs, parser::BlockConstruct
+/// is Fortran's BLOCK/ENDBLOCK construct.
+/// Strip the outermost BlockConstructs, return the reference to the Block
+/// in the executable part of the innermost of the stripped constructs.
+/// Specifically, if the given `block` has a single entry (it's a list), and
+/// the entry is a BlockConstruct, get the Block contained within. Repeat
+/// this step as many times as possible.
+static const parser::Block &GetInnermostExecPart(const parser::Block &block) {
+  const parser::Block *iter{&block};
+  while (iter->size() == 1) {
+    const parser::ExecutionPartConstruct &ep{iter->front()};
+    if (auto *exec{std::get_if<parser::ExecutableConstruct>(&ep.u)}) {
+      using BlockConstruct = common::Indirection<parser::BlockConstruct>;
+      if (auto *bc{std::get_if<BlockConstruct>(&exec->u)}) {
+        iter = &std::get<parser::Block>(bc->value().t);
+        continue;
       }
+    }
+    break;
+  }
+  return *iter;
 }
 
-inline void OmpStructureChecker::ErrIfLHSAndRHSSymbolsMatch(
-    const parser::Variable &var, const parser::Expr &expr) {
-  // Err out if the symbol on the LHS is also used on the RHS of the assignment
-  // statement
-  const auto *e{GetExpr(context_, expr)};
-  const auto *v{GetExpr(context_, var)};
-  if (e && v) {
-    auto vSyms{evaluate::GetSymbolVector(*v)};
-    const Symbol &varSymbol = vSyms.front();
-    for (const Symbol &symbol : evaluate::GetSymbolVector(*e)) {
-      if (varSymbol == symbol) {
-        const common::Indirection<parser::Designator> *designator =
-            std::get_if<common::Indirection<parser::Designator>>(&expr.u);
-        if (designator) {
-          auto *z{var.typedExpr.get()};
-          auto *c{expr.typedExpr.get()};
-          if (z->v == c->v) {
-            context_.Say(expr.source,
-                "RHS expression on atomic assignment statement cannot access '%s'"_err_en_US,
-                var.GetSource());
-          }
+// There is no consistent way to get the source of a given ActionStmt, so
+// extract the source information from Statement<ActionStmt> when we can,
+// and keep it around for error reporting in further analyses.
+struct SourcedActionStmt {
+  const parser::ActionStmt *stmt{nullptr};
+  parser::CharBlock source;
+
+  operator bool() const { return stmt != nullptr; }
+};
+
+struct AnalyzedCondStmt {
+  SomeExpr cond{evaluate::NullPointer{}}; // Default ctor is deleted
+  parser::CharBlock source;
+  SourcedActionStmt ift, iff;
+};
+
+static SourcedActionStmt GetActionStmt(
+    const parser::ExecutionPartConstruct *x) {
+  if (x == nullptr) {
+    return SourcedActionStmt{};
+  }
+  if (auto *exec{std::get_if<parser::ExecutableConstruct>(&x->u)}) {
+    using ActionStmt = parser::Statement<parser::ActionStmt>;
+    if (auto *stmt{std::get_if<ActionStmt>(&exec->u)}) {
+      return SourcedActionStmt{&stmt->statement, stmt->source};
+    }
+  }
+  return SourcedActionStmt{};
+}
+
+static SourcedActionStmt GetActionStmt(const parser::Block &block) {
+  if (block.size() == 1) {
+    return GetActionStmt(&block.front());
+  }
+  return SourcedActionStmt{};
+}
+
+// Compute the `evaluate::Assignment` from parser::ActionStmt. The assumption
+// is that the ActionStmt will be either an assignment or a pointer-assignment,
+// otherwise return std::nullopt.
+// Note: This function can return std::nullopt on [Pointer]AssignmentStmt where
+// the "typedAssignment" is unset. This can happen if there are semantic errors
+// in the purported assignment.
+static std::optional<evaluate::Assignment> GetEvaluateAssignment(
+    const parser::ActionStmt *x) {
+  if (x == nullptr) {
+    return std::nullopt;
+  }
+
+  using AssignmentStmt = common::Indirection<parser::AssignmentStmt>;
+  using PointerAssignmentStmt =
+      common::Indirection<parser::PointerAssignmentStmt>;
+  using TypedAssignment = parser::AssignmentStmt::TypedAssignment;
+
+  return common::visit(
+      [](auto &&s) -> std::optional<evaluate::Assignment> {
+        using BareS = llvm::remove_cvref_t<decltype(s)>;
+        if constexpr (std::is_same_v<BareS, AssignmentStmt> ||
+            std::is_same_v<BareS, PointerAssignmentStmt>) {
+          const TypedAssignment &typed{s.value().typedAssignment};
+          // ForwardOwningPointer                 typedAssignment
+          // `- GenericAssignmentWrapper          ^.get()
+          //    `- std::optional<Assignment>      ^->v
+          return typed.get()->v;
         } else {
-          context_.Say(expr.source,
-              "RHS expression on atomic assignment statement cannot access '%s'"_err_en_US,
-              var.GetSource());
+          return std::nullopt;
+        }
+      },
+      x->u);
+}
+
+// Check if the ActionStmt is actually a [Pointer]AssignmentStmt. This is
+// to separate cases where the source has something that looks like an
+// assignment, but is semantically wrong (diagnosed by general semantic
+// checks), and where the source has some other statement (which we want
+// to report as "should be an assignment").
+static bool IsAssignment(const parser::ActionStmt *x) {
+  if (x == nullptr) {
+    return false;
+  }
+
+  using AssignmentStmt = common::Indirection<parser::AssignmentStmt>;
+  using PointerAssignmentStmt =
+      common::Indirection<parser::PointerAssignmentStmt>;
+
+  return common::visit(
+      [](auto &&s) -> bool {
+        using BareS = llvm::remove_cvref_t<decltype(s)>;
+        return std::is_same_v<BareS, AssignmentStmt> ||
+            std::is_same_v<BareS, PointerAssignmentStmt>;
+      },
+      x->u);
+}
+
+static std::optional<AnalyzedCondStmt> AnalyzeConditionalStmt(
+    const parser::ExecutionPartConstruct *x) {
+  if (x == nullptr) {
+    return std::nullopt;
+  }
+
+  // Extract the evaluate::Expr from ScalarLogicalExpr.
+  auto getFromLogical{[](const parser::ScalarLogicalExpr &logical) {
+    // ScalarLogicalExpr is Scalar<Logical<common::Indirection<Expr>>>
+    const parser::Expr &expr{logical.thing.thing.value()};
+    return GetEvaluateExpr(expr);
+  }};
+
+  // Recognize either
+  // ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> IfStmt, or
+  // ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct.
+
+  if (auto &&action{GetActionStmt(x)}) {
+    if (auto *ifs{std::get_if<common::Indirection<parser::IfStmt>>(
+            &action.stmt->u)}) {
+      const parser::IfStmt &s{ifs->value()};
+      auto &&maybeCond{
+          getFromLogical(std::get<parser::ScalarLogicalExpr>(s.t))};
+      auto &thenStmt{
+          std::get<parser::UnlabeledStatement<parser::ActionStmt>>(s.t)};
+      if (maybeCond) {
+        return AnalyzedCondStmt{std::move(*maybeCond), action.source,
+            SourcedActionStmt{&thenStmt.statement, thenStmt.source},
+            SourcedActionStmt{}};
+      }
+    }
+    return std::nullopt;
+  }
+
+  if (auto *exec{std::get_if<parser::ExecutableConstruct>(&x->u)}) {
+    if (auto *ifc{
+            std::get_if<common::Indirection<parser::IfConstruct>>(&exec->u)}) {
+      using ElseBlock = parser::IfConstruct::ElseBlock;
+      using ElseIfBlock = parser::IfConstruct::ElseIfBlock;
+      const parser::IfConstruct &s{ifc->value()};
+
+      if (!std::get<std::list<ElseIfBlock>>(s.t).empty()) {
+        // Not expecting any else-if statements.
+        return std::nullopt;
+      }
+      auto &stmt{std::get<parser::Statement<parser::IfThenStmt>>(s.t)};
+      auto &&maybeCond{getFromLogical(
+          std::get<parser::ScalarLogicalExpr>(stmt.statement.t))};
+      if (!maybeCond) {
+        return std::nullopt;
+      }
+
+      if (auto &maybeElse{std::get<std::optional<ElseBlock>>(s.t)}) {
+        AnalyzedCondStmt result{std::move(*maybeCond), stmt.source,
+            GetActionStmt(std::get<parser::Block>(s.t)),
+            GetActionStmt(std::get<parser::Block>(maybeElse->t))};
+        if (result.ift.stmt && result.iff.stmt) {
+          return result;
+        }
+      } else {
+        AnalyzedCondStmt result{std::move(*maybeCond), stmt.source,
+            GetActionStmt(std::get<parser::Block>(s.t)), SourcedActionStmt{}};
+        if (result.ift.stmt) {
+          return result;
         }
       }
     }
+    return std::nullopt;
   }
+
+  return std::nullopt;
 }
 
-inline void OmpStructureChecker::ErrIfNonScalarAssignmentStmt(
-    const parser::Variable &var, const parser::Expr &expr) {
-  // Err out if either the variable on the LHS or the expression on the RHS of
-  // the assignment statement are non-scalar (i.e. have rank > 0 or is of
-  // CHARACTER type)
-  const auto *e{GetExpr(context_, expr)};
-  const auto *v{GetExpr(context_, var)};
-  if (e && v) {
-    if (e->Rank() != 0 ||
-        (e->GetType().has_value() &&
-            e->GetType().value().category() == common::TypeCategory::Character))
-      context_.Say(expr.source,
-          "Expected scalar expression "
-          "on the RHS of atomic assignment "
-          "statement"_err_en_US);
-    if (v->Rank() != 0 ||
-        (v->GetType().has_value() &&
-            v->GetType()->category() == common::TypeCategory::Character))
-      context_.Say(var.GetSource(),
-          "Expected scalar variable "
-          "on the LHS of atomic assignment "
-          "statement"_err_en_US);
-  }
-}
-
-template <typename T, typename D>
-bool OmpStructureChecker::IsOperatorValid(const T &node, const D &variable) {
-  using AllowedBinaryOperators =
-      std::variant<parser::Expr::Add, parser::Expr::Multiply,
-          parser::Expr::Subtract, parser::Expr::Divide, parser::Expr::AND,
-          parser::Expr::OR, parser::Expr::EQV, parser::Expr::NEQV>;
-  using BinaryOperators = std::variant<parser::Expr::Add,
-      parser::Expr::Multiply, parser::Expr::Subtract, parser::Expr::Divide,
-      parser::Expr::AND, parser::Expr::OR, parser::Expr::EQV,
-      parser::Expr::NEQV, parser::Expr::Power, parser::Expr::Concat,
-      parser::Expr::LT, parser::Expr::LE, parser::Expr::EQ, parser::Expr::NE,
-      parser::Expr::GE, parser::Expr::GT>;
-
-  if constexpr (common::HasMember<T, BinaryOperators>) {
-    const auto &variableName{variable.GetSource().ToString()};
-    const auto &exprLeft{std::get<0>(node.t)};
-    const auto &exprRight{std::get<1>(node.t)};
-    if ((exprLeft.value().source.ToString() != variableName) &&
-        (exprRight.value().source.ToString() != variableName)) {
-      context_.Say(variable.GetSource(),
-          "Atomic update statement should be of form "
-          "`%s = %s operator expr` OR `%s = expr operator %s`"_err_en_US,
-          variableName, variableName, variableName, variableName);
-    }
-    return common::HasMember<T, AllowedBinaryOperators>;
+static std::pair<parser::CharBlock, parser::CharBlock> SplitAssignmentSource(
+    parser::CharBlock source) {
+  // Find => in the range, if not found, find = that is not a part of
+  // <=, >=, ==, or /=.
+  auto trim{[](std::string_view v) {
+    const char *begin{v.data()};
+    const char *end{begin + v.size()};
+    while (*begin == ' ' && begin != end) {
+      ++begin;
+    }
+    while (begin != end && end[-1] == ' ') {
+      --end;
+    }
+    assert(begin != end && "Source should not be empty");
+    return parser::CharBlock(begin, end - begin);
+  }};
+
+  std::string_view sv(source.begin(), source.size());
+
+  if (auto where{sv.find("=>")}; where != sv.npos) {
+    std::string_view lhs(sv.data(), where);
+    std::string_view rhs(sv.data() + where + 2, sv.size() - where - 2);
+    return std::make_pair(trim(lhs), trim(rhs));
   }
-  return false;
+
+  // Go backwards, since all the exclusions above end with a '='.
+  for (size_t next{source.size()}; next > 1; --next) {
+    if (sv[next - 1] == '=' && !llvm::is_contained("<>=/", sv[next - 2])) {
+      std::string_view lhs(sv.data(), next - 1);
+      std::string_view rhs(sv.data() + next, sv.size() - next);
+      return std::make_pair(trim(lhs), trim(rhs));
+    }
+  }
+  llvm_unreachable("Could not find assignment operator");
 }
 
-void OmpStructureChecker::CheckAtomicCaptureStmt(
-    const parser::AssignmentStmt &assignmentStmt) {
-  const auto &var{std::get<parser::Variable>(assignmentStmt.t)};
-  const auto &expr{std::get<parser::Expr>(assignmentStmt.t)};
-  common::visit(
-      common::visitors{
-          [&](const common::Indirection<parser::Designator> &designator) {
-            const auto *dataRef =
-                std::get_if<parser::DataRef>(&designator.value().u);
-            const auto *name =
-                dataRef ? std::get_if<parser::Name>(&dataRef->u) : nullptr;
-            if (name && IsAllocatable(*name->symbol))
-              context_.Say(name->source,
-                  "%s must not have ALLOCATABLE "
-                  "attribute"_err_en_US,
-                  name->ToString());
-          },
-          [&](const auto &) {
-            // Anything other than a `parser::Designator` is not allowed
-            context_.Say(expr.source,
-                "Expected scalar variable "
-                "of intrinsic type on RHS of atomic "
-                "assignment statement"_err_en_US);
-          }},
-      expr.u);
-  ErrIfLHSAndRHSSymbolsMatch(var, expr);
-  ErrIfNonScalarAssignmentStmt(var, expr);
-}
-
-void OmpStructureChecker::CheckAtomicWriteStmt(
-    const parser::AssignmentStmt &assignmentStmt) {
-  const auto &var{std::get<parser::Variable>(assignmentStmt.t)};
-  const auto &expr{std::get<parser::Expr>(assignmentStmt.t)};
-  ErrIfAllocatableVariable(var);
-  ErrIfLHSAndRHSSymbolsMatch(var, expr);
-  ErrIfNonScalarAssignmentStmt(var, expr);
-}
-
-void OmpStructureChecker::CheckAtomicUpdateStmt(
-    const parser::AssignmentStmt &assignment) {
-  const auto &expr{std::get<parser::Expr>(assignment.t)};
-  const auto &var{std::get<parser::Variable>(assignment.t)};
-  bool isIntrinsicProcedure{false};
-  bool isValidOperator{false};
-  common::visit(
-      common::visitors{
-          [&](const common::Indirection<parser::FunctionReference> &x) {
-            isIntrinsicProcedure = true;
-            const auto &procedureDesignator{
-                std::get<parser::ProcedureDesignator>(x.value().v.t)};
-            const parser::Name *name{
-                std::get_if<parser::Name>(&procedureDesignator.u)};
-            if (name &&
-                !(name->source == "max" || name->source == "min" ||
-                    name->source == "iand" || name->source == "ior" ||
-                    name->source == "ieor")) {
-              context_.Say(expr.source,
-                  "Invalid intrinsic procedure name in "
-                  "OpenMP ATOMIC (UPDATE) statement"_err_en_US);
-            }
-          },
-          [&](const auto &x) {
-            if (!IsOperatorValid(x, var)) {
-              context_.Say(expr.source,
-                  "Invalid or missing operator in atomic update "
-                  "statement"_err_en_US);
-            } else
-              isValidOperator = true;
-          },
-      },
-      expr.u);
-  if (const auto *e{GetExpr(context_, expr)}) {
-    const auto *v{GetExpr(context_, var)};
-    if (e->Rank() != 0 ||
-        (e->GetType().has_value() &&
-            e->GetType().value().category() == common::TypeCategory::Character))
-      context_.Say(expr.source,
-          "Expected scalar expression "
-          "on the RHS of atomic update assignment "
-          "statement"_err_en_US);
-    if (v->Rank() != 0 ||
-        (v->GetType().has_value() &&
-            v->GetType()->category() == common::TypeCategory::Character))
-      context_.Say(var.GetSource(),
-          "Expected scalar variable "
-          "on the LHS of atomic update assignment "
-          "statement"_err_en_US);
-    auto vSyms{evaluate::GetSymbolVector(*v)};
-    const Symbol &varSymbol = vSyms.front();
-    int numOfSymbolMatches{0};
-    SymbolVector exprSymbols{evaluate::GetSymbolVector(*e)};
-    for (const Symbol &symbol : exprSymbols) {
-      if (varSymbol == symbol) {
-        numOfSymbolMatches++;
+namespace atomic {
+
+struct DesignatorCollector : public evaluate::Traverse<DesignatorCollector,
+                                 std::vector<SomeExpr>, false> {
+  using Result = std::vector<SomeExpr>;
+  using Base = evaluate::Traverse<DesignatorCollector, Result, false>;
+  DesignatorCollector() : Base(*this) {}
+
+  Result Default() const { return {}; }
+
+  using Base::operator();
+
+  template <typename T> //
+  Result operator()(const evaluate::Designator<T> &x) const {
+    // Once in a designator, don't traverse it any further (i.e. only
+    // collect top-level designators).
+    auto copy{x};
+    return Result{AsGenericExpr(std::move(copy))};
+  }
+
+  template <typename... Rs> //
+  Result Combine(Result &&result, Rs &&...results) const {
+    Result v(std::move(result));
+    auto moveAppend{[](auto &accum, auto &&other) {
+      for (auto &&s : other) {
+        accum.push_back(std::move(s));
       }
+    }};
+    (moveAppend(v, std::move(results)), ...);
+    return v;
+  }
+};
+
+struct VariableFinder : public evaluate::AnyTraverse<VariableFinder> {
+  using Base = evaluate::AnyTraverse<VariableFinder>;
+  VariableFinder(const SomeExpr &v) : Base(*this), var(v) {}
+
+  using Base::operator();
+
+  template <typename T>
+  bool operator()(const evaluate::Designator<T> &x) const {
+    auto copy{x};
+    return evaluate::AsGenericExpr(std::move(copy)) == var;
+  }
+
+  template <typename T>
+  bool operator()(const evaluate::FunctionRef<T> &x) const {
+    auto copy{x};
+    return evaluate::AsGenericExpr(std::move(copy)) == var;
+  }
+
+private:
+  const SomeExpr &var;
+};
+} // namespace atomic
+
+static bool IsPointerAssignment(const evaluate::Assignment &x) {
+  return std::holds_alternative<evaluate::Assignment::BoundsSpec>(x.u) ||
+      std::holds_alternative<evaluate::Assignment::BoundsRemapping>(x.u);
+}
+
+static bool IsCheckForAssociated(const SomeExpr &cond) {
+  return GetTopLevelOperation(cond).first == operation::Operator::Associated;
+}
+
+static bool HasCommonDesignatorSymbols(
+    const evaluate::SymbolVector &baseSyms, const SomeExpr &other) {
+  // Compare the designators used in "other" with the designators whose
+  // symbols are given in baseSyms.
+  // This is a part of the check if these two expressions can access the same
+  // storage: if the designators used in them are different enough, then they
+  // will be assumed not to access the same memory.
+  //
+  // Consider an (array element) expression x%y(w%z), the corresponding symbol
+  // vector will be {x, y, w, z} (i.e. the symbols for these names).
+  // Check whether this exact sequence appears anywhere in any the symbol
+  // vector for "other". This will be true for x(y) and x(y+1), so this is
+  // not a sufficient condition, but can be used to eliminate candidates
+  // before doing more exhaustive checks.
+  //
+  // If any of the symbols in this sequence are function names, assume that
+  // there is no storage overlap, mostly because it would be impossible in
+  // general to determine what storage the function will access.
+  // Note: if f is pure, then two calls to f will access the same storage
+  // when called with the same arguments. This check is not done yet.
+
+  if (llvm::any_of(
+          baseSyms, [](const SymbolRef &s) { return s->IsSubprogram(); })) {
+    // If there is a function symbol in the chain then we can't infer much
+    // about the accessed storage.
+    return false;
+  }
+
+  auto isSubsequence{// Is u a subsequence of v.
+      [](const evaluate::SymbolVector &u, const evaluate::SymbolVector &v) {
+        size_t us{u.size()}, vs{v.size()};
+        if (us > vs) {
+          return false;
+        }
+        for (size_t off{0}; off != vs - us + 1; ++off) {
+          bool same{true};
+          for (size_t i{0}; i != us; ++i) {
+            if (u[i] != v[off + i]) {
+              same = false;
+              break;
+            }
+          }
+          if (same) {
+            return true;
+          }
+        }
+        return false;
+      }};
+
+  evaluate::SymbolVector otherSyms{evaluate::GetSymbolVector(other)};
+  return isSubsequence(baseSyms, otherSyms);
+}
+
+static bool HasCommonTopLevelDesignators(
+    const std::vector<SomeExpr> &baseDsgs, const SomeExpr &other) {
+  // Compare designators directly as expressions. This will ensure
+  // that x(y) and x(y+1) are not flagged as overlapping, whereas
+  // the symbol vectors for both of these would be identical.
+  std::vector<SomeExpr> otherDsgs{atomic::DesignatorCollector{}(other)};
+
+  for (auto &s : baseDsgs) {
+    if (llvm::any_of(otherDsgs, [&](auto &&t) { return s == t; })) {
+      return true;
     }
-    if (isIntrinsicProcedure) {
-      std::string varName = var.GetSource().ToString();
-      if (numOfSymbolMatches != 1)
-        context_.Say(expr.source,
-            "Intrinsic procedure"
-            " arguments in atomic update statement"
-            " must have exactly one occurence of '%s'"_err_en_US,
-            varName);
-      else if (varSymbol != exprSymbols.front() &&
-          varSymbol != exprSymbols.back())
-        context_.Say(expr.source,
-            "Atomic update statement "
-            "should be of the form `%s = intrinsic_procedure(%s, expr_list)` "
-            "OR `%s = intrinsic_procedure(expr_list, %s)`"_err_en_US,
-            varName, varName, varName, varName);
-    } else if (isValidOperator) {
-      if (numOfSymbolMatches != 1)
-        context_.Say(expr.source,
-            "Exactly one occurence of '%s' "
-            "expected on the RHS of atomic update assignment statement"_err_en_US,
-            var.GetSource().ToString());
+  }
+  return false;
+}
+
+static const SomeExpr *HasStorageOverlap(
+    const SomeExpr &base, llvm::ArrayRef<SomeExpr> exprs) {
+  evaluate::SymbolVector baseSyms{evaluate::GetSymbolVector(base)};
+  std::vector<SomeExpr> baseDsgs{atomic::DesignatorCollector{}(base)};
+
+  for (const SomeExpr &expr : exprs) {
+    if (!HasCommonDesignatorSymbols(baseSyms, expr)) {
+      continue;
+    }
+    if (HasCommonTopLevelDesignators(baseDsgs, expr)) {
+      return &expr;
     }
   }
+  return nullptr;
+}
 
-  ErrIfAllocatableVariable(var);
+static bool IsMaybeAtomicWrite(const evaluate::Assignment &assign) {
+  // This ignores function calls, so it will accept "f(x) = f(x) + 1"
+  // for example.
+  return HasStorageOverlap(assign.lhs, assign.rhs) == nullptr;
 }
 
-void OmpStructureChecker::CheckAtomicCompareConstruct(
-    const parser::OmpAtomicCompare &atomicCompareConstruct) {
+static bool IsSubexpressionOf(const SomeExpr &sub, const SomeExpr &super) {
+  return atomic::VariableFinder{sub}(super);
+}
 
-  // TODO: Check that the if-stmt is `if (var == expr) var = new`
-  //       [with or without then/end-do]
+static void SetExpr(parser::TypedExpr &expr, MaybeExpr value) {
+  if (value) {
+    expr.Reset(new evaluate::GenericExprWrapper(std::move(value)),
+        evaluate::GenericExprWrapper::Deleter);
+  }
+}
 
-  unsigned version{context_.langOptions().OpenMPVersion};
-  if (version < 51) {
-    context_.Say(atomicCompareConstruct.source,
-        "%s construct not allowed in %s, %s"_err_en_US,
-        atomicCompareConstruct.source, ThisVersion(version), TryVersion(51));
-  }
-
-  // TODO: More work needed here. Some of the Update restrictions need to
-  // be added, but Update isn't the same either.
-}
-
-// TODO: Allow cond-update-stmt once compare clause is supported.
-void OmpStructureChecker::CheckAtomicCaptureConstruct(
-    const parser::OmpAtomicCapture &atomicCaptureConstruct) {
-  const parser::AssignmentStmt &stmt1 =
-      std::get<parser::OmpAtomicCapture::Stmt1>(atomicCaptureConstruct.t)
-          .v.statement;
-  const auto &stmt1Var{std::get<parser::Variable>(stmt1.t)};
-  const auto &stmt1Expr{std::get<parser::Expr>(stmt1.t)};
-  const auto *v1 = GetExpr(context_, stmt1Var);
-  const auto *e1 = GetExpr(context_, stmt1Expr);
-
-  const parser::AssignmentStmt &stmt2 =
-      std::get<parser::OmpAtomicCapture::Stmt2>(atomicCaptureConstruct.t)
-          .v.statement;
-  const auto &stmt2Var{std::get<parser::Variable>(stmt2.t)};
-  const auto &stmt2Expr{std::get<parser::Expr>(stmt2.t)};
-  const auto *v2 = GetExpr(context_, stmt2Var);
-  const auto *e2 = GetExpr(context_, stmt2Expr);
-
-  if (e1 && v1 && e2 && v2) {
-    if (parser::CheckForSingleVariableOnRHS(stmt1)) {
-      CheckAtomicCaptureStmt(stmt1);
-      if (CheckForSymbolMatch(v2, e2)) {
-        // ATOMIC CAPTURE construct is of the form [capture-stmt, update-stmt]
-        CheckAtomicUpdateStmt(stmt2);
+static void SetAssignment(parser::AssignmentStmt::TypedAssignment &assign,
+    std::optional<evaluate::Assignment> value) {
+  if (value) {
+    assign.Reset(new evaluate::GenericAssignmentWrapper(std::move(value)),
+        evaluate::GenericAssignmentWrapper::Deleter);
+  }
+}
+
+static parser::OpenMPAtomicConstruct::Analysis::Op MakeAtomicAnalysisOp(
+    int what,
+    const std::optional<evaluate::Assignment> &maybeAssign = std::nullopt) {
+  parser::OpenMPAtomicConstruct::Analysis::Op operation;
+  operation.what = what;
+  SetAssignment(operation.assign, maybeAssign);
+  return operation;
+}
+
+static parser::OpenMPAtomicConstruct::Analysis MakeAtomicAnalysis(
+    const SomeExpr &atom, const MaybeExpr &cond,
+    parser::OpenMPAtomicConstruct::Analysis::Op &&op0,
+    parser::OpenMPAtomicConstruct::Analysis::Op &&op1) {
+  // Defined in flang/include/flang/Parser/parse-tree.h
+  //
+  // struct Analysis {
+  //   struct Kind {
+  //     static constexpr int None = 0;
+  //     static constexpr int Read = 1;
+  //     static constexpr int Write = 2;
+  //     static constexpr int Update = Read | Write;
+  //     static constexpr int Action = 3; // Bits containing N, R, W, U
+  //     static constexpr int IfTrue = 4;
+  //     static constexpr int IfFalse = 8;
+  //     static constexpr int Condition = 12; // Bits containing IfTrue, IfFalse
+  //   };
+  //   struct Op {
+  //     int what;
+  //     TypedAssignment assign;
+  //   };
+  //   TypedExpr atom, cond;
+  //   Op op0, op1;
+  // };
+
+  parser::OpenMPAtomicConstruct::Analysis an;
+  SetExpr(an.atom, atom);
+  SetExpr(an.cond, cond);
+  an.op0 = std::move(op0);
+  an.op1 = std::move(op1);
+  return an;
+}
+
+void OmpStructureChecker::CheckStorageOverlap(const SomeExpr &base,
+    llvm::ArrayRef<evaluate::Expr<evaluate::SomeType>> exprs,
+    parser::CharBlock source) {
+  if (auto *expr{HasStorageOverlap(base, exprs)}) {
+    context_.Say(source,
+        "Within atomic operation %s and %s access the same storage"_warn_en_US,
+        base.AsFortran(), expr->AsFortran());
+  }
+}
+
+void OmpStructureChecker::ErrorShouldBeVariable(
+    const MaybeExpr &expr, parser::CharBlock source) {
+  if (expr) {
+    context_.Say(source, "Atomic expression %s should be a variable"_err_en_US,
+        expr->AsFortran());
+  } else {
+    context_.Say(source, "Atomic expression should be a variable"_err_en_US);
+  }
+}
+
+/// Check if `expr` satisfies the following conditions for x and v:
+///
+/// [6.0:189:10-12]
+/// - x and v (as applicable) are either scalar variables or
+///   function references with scalar data pointer result of non-character
+///   intrinsic type or variables that are non-polymorphic scalar pointers
+///   and any length type parameter must be constant.
+void OmpStructureChecker::CheckAtomicType(
+    SymbolRef sym, parser::CharBlock source, std::string_view name) {
+  const DeclTypeSpec *typeSpec{sym->GetType()};
+  if (!typeSpec) {
+    return;
+  }
+
+  if (!IsPointer(sym)) {
+    using Category = DeclTypeSpec::Category;
+    Category cat{typeSpec->category()};
+    if (cat == Category::Character) {
+      context_.Say(source,
+          "Atomic variable %s cannot have CHARACTER type"_err_en_US, name);
+    } else if (cat != Category::Numeric && cat != Category::Logical) {
+      context_.Say(source,
+          "Atomic variable %s should have an intrinsic type"_err_en_US, name);
+    }
+    return;
+  }
+
+  // Variable is a pointer.
+  if (typeSpec->IsPolymorphic()) {
+    context_.Say(source,
+        "Atomic variable %s cannot be a pointer to a polymorphic type"_err_en_US,
+        name);
+    return;
+  }
+
+  // Go over all length parameters, if any, and check if they are
+  // explicit.
+  if (const DerivedTypeSpec *derived{typeSpec->AsDerived()}) {
+    if (llvm::any_of(derived->parameters(), [](auto &&entry) {
+          // "entry" is a map entry
+          return entry.second.isLen() && !entry.second.isExplicit();
+        })) {
+      context_.Say(source,
+          "Atomic variable %s is a pointer to a type with non-constant length parameter"_err_en_US,
+          name);
+    }
+  }
+}
+
+void OmpStructureChecker::CheckAtomicVariable(
+    const SomeExpr &atom, parser::CharBlock source) {
+  if (atom.Rank() != 0) {
+    context_.Say(source, "Atomic variable %s should be a scalar"_err_en_US,
+        atom.AsFortran());
+  }
+
+  std::vector<SomeExpr> dsgs{atomic::DesignatorCollector{}(atom)};
+  assert(dsgs.size() == 1 && "Should have a single top-level designator");
+  evaluate::SymbolVector syms{evaluate::GetSymbolVector(dsgs.front())};
+
+  CheckAtomicType(syms.back(), source, atom.AsFortran());
+
+  if (IsAllocatable(syms.back()) && !IsArrayElement(atom)) {
+    context_.Say(source, "Atomic variable %s cannot be ALLOCATABLE"_err_en_US,
+        atom.AsFortran());
+  }
+}
+
+std::pair<const parser::ExecutionPartConstruct *,
+    const parser::ExecutionPartConstruct *>
+OmpStructureChecker::CheckUpdateCapture(
+    const parser::ExecutionPartConstruct *ec1,
+    const parser::ExecutionPartConstruct *ec2, parser::CharBlock source) {
+  // Decide which statement is the atomic update and which is the capture.
+  //
+  // The two allowed cases are:
+  //   x = ...      atomic-var = ...
+  //   ... = x      capture-var = atomic-var (with optional converts)
+  // or
+  //   ... = x      capture-var = atomic-var (with optional converts)
+  //   x = ...      atomic-var = ...
+  //
+  // The case of 'a = b; b = a' is ambiguous, so pick the first one as capture
+  // (which makes more sense, as it captures the original value of the atomic
+  // variable).
+  //
+  // If the two statements don't fit these criteria, return a pair of default-
+  // constructed values.
+  using ReturnTy = std::pair<const parser::ExecutionPartConstruct *,
+      const parser::ExecutionPartConstruct *>;
+
+  SourcedActionStmt act1{GetActionStmt(ec1)};
+  SourcedActionStmt act2{GetActionStmt(ec2)};
+  auto maybeAssign1{GetEvaluateAssignment(act1.stmt)};
+  auto maybeAssign2{GetEvaluateAssignment(act2.stmt)};
+  if (!maybeAssign1 || !maybeAssign2) {
+    if (!IsAssignment(act1.stmt) || !IsAssignment(act2.stmt)) {
+      context_.Say(source,
+          "ATOMIC UPDATE operation with CAPTURE should contain two assignments"_err_en_US);
+    }
+    return std::make_pair(nullptr, nullptr);
+  }
+
+  auto as1{*maybeAssign1}, as2{*maybeAssign2};
+
+  auto isUpdateCapture{
+      [](const evaluate::Assignment &u, const evaluate::Assignment &c) {
+        return IsSameOrConvertOf(c.rhs, u.lhs);
+      }};
+
+  // Do some checks that narrow down the possible choices for the update
+  // and the capture statements. This will help to emit better diagnostics.
+  // 1. An assignment could be an update (cbu) if the left-hand side is a
+  //    subexpression of the right-hand side.
+  // 2. An assignment could be a capture (cbc) if the right-hand side is
+  //    a variable (or a function ref), with potential type conversions.
+  bool cbu1{IsSubexpressionOf(as1.lhs, as1.rhs)}; // Can as1 be an update?
+  bool cbu2{IsSubexpressionOf(as2.lhs, as2.rhs)}; // Can as2 be an update?
+  bool cbc1{IsVarOrFunctionRef(GetConvertInput(as1.rhs))}; // Can 1 be capture?
+  bool cbc2{IsVarOrFunctionRef(GetConvertInput(as2.rhs))}; // Can 2 be capture?
+
+  // We want to diagnose cases where both assignments cannot be an update,
+  // or both cannot be a capture, as well as cases where either assignment
+  // cannot be any of these two.
+  //
+  // If we organize these boolean values into a matrix
+  //   |cbu1 cbu2|
+  //   |cbc1 cbc2|
+  // then we want to diagnose cases where the matrix has a zero (i.e. "false")
+  // row or column, including the case where everything is zero. All these
+  // cases correspond to the determinant of the matrix being 0, which suggests
+  // that checking the det may be a convenient diagnostic check. There is only
+  // one additional case where the det is 0, which is when the matrix is all 1
+  // ("true"). The "all true" case represents the situation where both
+  // assignments could be an update as well as a capture. On the other hand,
+  // whenever det != 0, the roles of the update and the capture can be
+  // unambiguously assigned to as1 and as2 [1].
+  //
+  // [1] This can be easily verified by hand: there are 10 2x2 matrices with
+  // det = 0, leaving 6 cases where det != 0:
+  //   0 1   0 1   1 0   1 0   1 1   1 1
+  //   1 0   1 1   0 1   1 1   0 1   1 0
+  // In each case the classification is unambiguous.
+
+  //     |cbu1 cbu2|
+  // det |cbc1 cbc2| = cbu1*cbc2 - cbu2*cbc1
+  int det{int(cbu1) * int(cbc2) - int(cbu2) * int(cbc1)};
+
+  auto errorCaptureShouldRead{[&](const parser::CharBlock &source,
+                                  const std::string &expr) {
+    context_.Say(source,
+        "In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read %s"_err_en_US,
+        expr);
+  }};
+
+  auto errorNeitherWorks{[&]() {
+    context_.Say(source,
+        "In ATOMIC UPDATE operation with CAPTURE neither statement could be the update or the capture"_err_en_US);
+  }};
+
+  auto makeSelectionFromDet{[&](int det) -> ReturnTy {
+    // If det != 0, then the checks unambiguously suggest a specific
+    // categorization.
+    // If det == 0, then this function should be called only if the
+    // checks haven't ruled out any possibility, i.e. when both assigments
+    // could still be either updates or captures.
+    if (det > 0) {
+      // as1 is update, as2 is capture
+      if (isUpdateCapture(as1, as2)) {
+        return std::make_pair(/*Update=*/ec1, /*Capture=*/ec2);
       } else {
-        // ATOMIC CAPTURE construct is of the form [capture-stmt, write-stmt]
-        CheckAtomicWriteStmt(stmt2);
+        errorCaptureShouldRead(act2.source, as1.lhs.AsFortran());
+        return std::make_pair(nullptr, nullptr);
       }
-      if (!(*e1 == *v2)) {
-        context_.Say(stmt1Expr.source,
-            "Captured variable/array element/derived-type component %s expected to be assigned in the second statement of ATOMIC CAPTURE construct"_err_en_US,
-            stmt1Expr.source);
+    } else if (det < 0) {
+      // as2 is update, as1 is capture
+      if (isUpdateCapture(as2, as1)) {
+        return std::make_pair(/*Update=*/ec2, /*Capture=*/ec1);
+      } else {
+        errorCaptureShouldRead(act1.source, as2.lhs.AsFortran());
+        return std::make_pair(nullptr, nullptr);
+      }
+    } else {
+      bool updateFirst{isUpdateCapture(as1, as2)};
+      bool captureFirst{isUpdateCapture(as2, as1)};
+      if (updateFirst && captureFirst) {
+        // If both assignment could be the update and both could be the
+        // capture, emit a warning about the ambiguity.
+        context_.Say(act1.source,
+            "In ATOMIC UPDATE operation with CAPTURE either statement could be the update and the capture, assuming the first one is the capture statement"_warn_en_US);
+        return std::make_pair(/*Update=*/ec2, /*Capture=*/ec1);
       }
-    } else if (CheckForSymbolMatch(v1, e1) &&
-        parser::CheckForSingleVariableOnRHS(stmt2)) {
-      // ATOMIC CAPTURE construct is of the form [update-stmt, capture-stmt]
-      CheckAtomicUpdateStmt(stmt1);
-      CheckAtomicCaptureStmt(stmt2);
-      // Variable updated in stmt1 should be captured in stmt2
-      if (!(*v1 == *e2)) {
-        context_.Say(stmt1Var.GetSource(),
-            "Updated variable/array element/derived-type component %s expected to be captured in the second statement of ATOMIC CAPTURE construct"_err_en_US,
-            stmt1Var.GetSource());
+      if (updateFirst != captureFirst) {
+        const parser::ExecutionPartConstruct *upd{updateFirst ? ec1 : ec2};
+        const parser::ExecutionPartConstruct *cap{captureFirst ? ec1 : ec2};
+        return std::make_pair(upd, cap);
       }
+      assert(!updateFirst && !captureFirst);
+      errorNeitherWorks();
+      return std::make_pair(nullptr, nullptr);
+    }
+  }};
+
+  if (det != 0 || (cbu1 && cbu2 && cbc1 && cbc2)) {
+    return makeSelectionFromDet(det);
+  }
+  assert(det == 0 && "Prior checks should have covered det != 0");
+
+  // If neither of the statements is an RMW update, it could still be a
+  // "write" update. Pretty much any assignment can be a write update, so
+  // recompute det with cbu1 = cbu2 = true.
+  if (int writeDet{int(cbc2) - int(cbc1)}; writeDet || (cbc1 && cbc2)) {
+    return makeSelectionFromDet(writeDet);
+  }
+
+  // It's only errors from here on.
+
+  if (!cbu1 && !cbu2 && !cbc1 && !cbc2) {
+    errorNeitherWorks();
+    return std::make_pair(nullptr, nullptr);
+  }
+
+  // The remaining cases are that
+  // - no candidate for update, or for capture,
+  // - one of the assigments cannot be anything.
+
+  if (!cbu1 && !cbu2) {
+    context_.Say(source,
+        "In ATOMIC UPDATE operation with CAPTURE neither statement could be the update"_err_en_US);
+    return std::make_pair(nullptr, nullptr);
+  } else if (!cbc1 && !cbc2) {
+    context_.Say(source,
+        "In ATOMIC UPDATE operation with CAPTURE neither statement could be the capture"_err_en_US);
+    return std::make_pair(nullptr, nullptr);
+  }
+
+  if ((!cbu1 && !cbc1) || (!cbu2 && !cbc2)) {
+    auto &src = (!cbu1 && !cbc1) ? act1.source : act2.source;
+    context_.Say(src,
+        "In ATOMIC UPDATE operation with CAPTURE the statement could be neither the update nor the capture"_err_en_US);
+    return std::make_pair(nullptr, nullptr);
+  }
+
+  // All cases should have been covered.
+  llvm_unreachable("Unchecked condition");
+}
+
+void OmpStructureChecker::CheckAtomicCaptureAssignment(
+    const evaluate::Assignment &capture, const SomeExpr &atom,
+    parser::CharBlock source) {
+  auto [lsrc, rsrc]{SplitAssignmentSource(source)};
+  const SomeExpr &cap{capture.lhs};
+
+  if (!IsVarOrFunctionRef(atom)) {
+    ErrorShouldBeVariable(atom, rsrc);
+  } else {
+    CheckAtomicVariable(atom, rsrc);
+    // This part should have been checked prior to calling this function.
+    assert(*GetConvertInput(capture.rhs) == atom &&
+        "This cannot be a capture assignment");
+    CheckStorageOverlap(atom, {cap}, source);
+  }
+}
+
+void OmpStructureChecker::CheckAtomicReadAssignment(
+    const evaluate::Assignment &read, parser::CharBlock source) {
+  auto [lsrc, rsrc]{SplitAssignmentSource(source)};
+
+  if (auto maybe{GetConvertInput(read.rhs)}) {
+    const SomeExpr &atom{*maybe};
+
+    if (!IsVarOrFunctionRef(atom)) {
+      ErrorShouldBeVariable(atom, rsrc);
     } else {
-      context_.Say(stmt1Expr.source,
-          "Invalid ATOMIC CAPTURE construct statements. Expected one of [update-stmt, capture-stmt], [capture-stmt, update-stmt], or [capture-stmt, write-stmt]"_err_en_US);
+      CheckAtomicVariable(atom, rsrc);
+      CheckStorageOverlap(atom, {read.lhs}, source);
     }
+  } else {
+    ErrorShouldBeVariable(read.rhs, rsrc);
   }
 }
 
-void OmpStructureChecker::CheckAtomicMemoryOrderClause(
-    const parser::OmpAtomicClauseList *leftHandClauseList,
-    const parser::OmpAtomicClauseList *rightHandClauseList) {
-  int numMemoryOrderClause{0};
-  int numFailClause{0};
-  auto checkForValidMemoryOrderClause = [&](const parser::OmpAtomicClauseList
-                                                *clauseList) {
-    for (const auto &clause : clauseList->v) {
-      if (std::get_if<parser::OmpFailClause>(&clause.u)) {
-        numFailClause++;
-        if (numFailClause > 1) {
-          context_.Say(clause.source,
-              "More than one FAIL clause not allowed on OpenMP ATOMIC construct"_err_en_US);
-          return;
+void OmpStructureChecker::CheckAtomicWriteAssignment(
+    const evaluate::Assignment &write, parser::CharBlock source) {
+  // [6.0:190:13-15]
+  // A write structured block is write-statement, a write statement that has
+  // one of the following forms:
+  //   x = expr
+  //   x => expr
+  auto [lsrc, rsrc]{SplitAssignmentSource(source)};
+  const SomeExpr &atom{write.lhs};
+
+  if (!IsVarOrFunctionRef(atom)) {
+    ErrorShouldBeVariable(atom, rsrc);
+  } else {
+    CheckAtomicVariable(atom, lsrc);
+    CheckStorageOverlap(atom, {write.rhs}, source);
+  }
+}
+
+void OmpStructureChecker::CheckAtomicUpdateAssignment(
+    const evaluate::Assignment &update, parser::CharBlock source) {
+  // [6.0:191:1-7]
+  // An update structured block is update-statement, an update statement
+  // that has one of the following forms:
+  //   x = x operator expr
+  //   x = expr operator x
+  //   x = intrinsic-procedure-name (x)
+  //   x = intrinsic-procedure-name (x, expr-list)
+  //   x = intrinsic-procedure-name (expr-list, x)
+  auto [lsrc, rsrc]{SplitAssignmentSource(source)};
+  const SomeExpr &atom{update.lhs};
+
+  if (!IsVarOrFunctionRef(atom)) {
+    ErrorShouldBeVariable(atom, rsrc);
+    // Skip other checks.
+    return;
+  }
+
+  CheckAtomicVariable(atom, lsrc);
+
+  std::pair<operation::Operator, std::vector<SomeExpr>> top{
+      operation::Operator::Unknown, {}};
+  if (auto &&maybeInput{GetConvertInput(update.rhs)}) {
+    top = GetTopLevelOperation(*maybeInput);
+  }
+  switch (top.first) {
+  case operation::Operator::Add:
+  case operation::Operator::Sub:
+  case operation::Operator::Mul:
+  case operation::Operator::Div:
+  case operation::Operator::And:
+  case operation::Operator::Or:
+  case operation::Operator::Eqv:
+  case operation::Operator::Neqv:
+  case operation::Operator::Min:
+  case operation::Operator::Max:
+  case operation::Operator::Identity:
+    break;
+  case operation::Operator::Call:
+    context_.Say(source,
+        "A call to this function is not a valid ATOMIC UPDATE operation"_err_en_US);
+    return;
+  case operation::Operator::Convert:
+    context_.Say(source,
+        "An implicit or explicit type conversion is not a valid ATOMIC UPDATE operation"_err_en_US);
+    return;
+  case operation::Operator::Intrinsic:
+    context_.Say(source,
+        "This intrinsic function is not a valid ATOMIC UPDATE operation"_err_en_US);
+    return;
+  case operation::Operator::Constant:
+  case operation::Operator::Unknown:
+    context_.Say(
+        source, "This is not a valid ATOMIC UPDATE operation"_err_en_US);
+    return;
+  default:
+    assert(
+        top.first != operation::Operator::Identity && "Handle this separately");
+    context_.Say(source,
+        "The %s operator is not a valid ATOMIC UPDATE operation"_err_en_US,
+        operation::ToString(top.first));
+    return;
+  }
+  // Check if `atom` occurs exactly once in the argument list.
+  std::vector<SomeExpr> nonAtom;
+  auto unique{[&]() { // -> iterator
+    auto found{top.second.end()};
+    for (auto i{top.second.begin()}, e{top.second.end()}; i != e; ++i) {
+      if (IsSameOrConvertOf(*i, atom)) {
+        if (found != top.second.end()) {
+          return top.second.end();
         }
+        found = i;
       } else {
-        if (std::get_if<parser::OmpMemoryOrderClause>(&clause.u)) {
-          numMemoryOrderClause++;
-          if (numMemoryOrderClause > 1) {
-            context_.Say(clause.source,
-                "More than one memory order clause not allowed on OpenMP ATOMIC construct"_err_en_US);
-            return;
-          }
+        nonAtom.push_back(*i);
+      }
+    }
+    return found;
+  }()};
+
+  if (unique == top.second.end()) {
+    if (top.first == operation::Operator::Identity) {
+      // This is "x = y".
+      context_.Say(rsrc,
+          "The atomic variable %s should appear as an argument in the update operation"_err_en_US,
+          atom.AsFortran());
+    } else {
+      assert(top.first != operation::Operator::Identity &&
+          "Handle this separately");
+      context_.Say(rsrc,
+          "The atomic variable %s should occur exactly once among the arguments of the top-level %s operator"_err_en_US,
+          atom.AsFortran(), operation::ToString(top.first));
+    }
+  } else {
+    CheckStorageOverlap(atom, nonAtom, source);
+  }
+}
+
+void OmpStructureChecker::CheckAtomicConditionalUpdateAssignment(
+    const SomeExpr &cond, parser::CharBlock condSource,
+    const evaluate::Assignment &assign, parser::CharBlock assignSource) {
+  auto [alsrc, arsrc]{SplitAssignmentSource(assignSource)};
+  const SomeExpr &atom{assign.lhs};
+
+  if (!IsVarOrFunctionRef(atom)) {
+    ErrorShouldBeVariable(atom, arsrc);
+    // Skip other checks.
+    return;
+  }
+
+  CheckAtomicVariable(atom, alsrc);
+
+  auto top{GetTopLevelOperation(cond)};
+  // Missing arguments to operations would have been diagnosed by now.
+
+  switch (top.first) {
+  case operation::Operator::Associated:
+    if (atom != top.second.front()) {
+      context_.Say(assignSource,
+          "The pointer argument to ASSOCIATED must be same as the target of the assignment"_err_en_US);
+    }
+    break;
+  // x equalop e | e equalop x  (allowing "e equalop x" is an extension)
+  case operation::Operator::Eq:
+  case operation::Operator::Eqv:
+  // x ordop expr | expr ordop x
+  case operation::Operator::Lt:
+  case operation::Operator::Gt: {
+    const SomeExpr &arg0{top.second[0]};
+    const SomeExpr &arg1{top.second[1]};
+    if (IsSameOrConvertOf(arg0, atom)) {
+      CheckStorageOverlap(atom, {arg1}, condSource);
+    } else if (IsSameOrConvertOf(arg1, atom)) {
+      CheckStorageOverlap(atom, {arg0}, condSource);
+    } else {
+      assert(top.first != operation::Operator::Identity &&
+          "Handle this separately");
+      context_.Say(assignSource,
+          "An argument of the %s operator should be the target of the assignment"_err_en_US,
+          operation::ToString(top.first));
+    }
+    break;
+  }
+  case operation::Operator::Identity:
+  case operation::Operator::True:
+  case operation::Operator::False:
+    break;
+  default:
+    assert(
+        top.first != operation::Operator::Identity && "Handle this separately");
+    context_.Say(condSource,
+        "The %s operator is not a valid condition for ATOMIC operation"_err_en_US,
+        operation::ToString(top.first));
+    break;
+  }
+}
+
+void OmpStructureChecker::CheckAtomicConditionalUpdateStmt(
+    const AnalyzedCondStmt &update, parser::CharBlock source) {
+  // The condition/statements must be:
+  // - cond: x equalop e      ift: x =  d     iff: -
+  // - cond: x ordop expr     ift: x =  expr  iff: -  (+ commute ordop)
+  // - cond: associated(x)    ift: x => expr  iff: -
+  // - cond: associated(x, e) ift: x => expr  iff: -
+
+  // The if-true statement must be present, and must be an assignment.
+  auto maybeAssign{GetEvaluateAssignment(update.ift.stmt)};
+  if (!maybeAssign) {
+    if (update.ift.stmt && !IsAssignment(update.ift.stmt)) {
+      context_.Say(update.ift.source,
+          "In ATOMIC UPDATE COMPARE the update statement should be an assignment"_err_en_US);
+    } else {
+      context_.Say(
+          source, "Invalid body of ATOMIC UPDATE COMPARE operation"_err_en_US);
+    }
+    return;
+  }
+  const evaluate::Assignment assign{*maybeAssign};
+  const SomeExpr &atom{assign.lhs};
+
+  CheckAtomicConditionalUpdateAssignment(
+      update.cond, update.source, assign, update.ift.source);
+
+  CheckStorageOverlap(atom, {assign.rhs}, update.ift.source);
+
+  if (update.iff) {
+    context_.Say(update.iff.source,
+        "In ATOMIC UPDATE COMPARE the update statement should not have an ELSE branch"_err_en_US);
+  }
+}
+
+void OmpStructureChecker::CheckAtomicUpdateOnly(
+    const parser::OpenMPAtomicConstruct &x, const parser::Block &body,
+    parser::CharBlock source) {
+  if (body.size() == 1) {
+    SourcedActionStmt action{GetActionStmt(&body.front())};
+    if (auto maybeUpdate{GetEvaluateAssignment(action.stmt)}) {
+      const SomeExpr &atom{maybeUpdate->lhs};
+      CheckAtomicUpdateAssignment(*maybeUpdate, action.source);
+
+      using Analysis = parser::OpenMPAtomicConstruct::Analysis;
+      x.analysis = MakeAtomicAnalysis(atom, std::nullopt,
+          MakeAtomicAnalysisOp(Analysis::Update, maybeUpdate),
+          MakeAtomicAnalysisOp(Analysis::None));
+    } else if (!IsAssignment(action.stmt)) {
+      context_.Say(
+          source, "ATOMIC UPDATE operation should be an assignment"_err_en_US);
+    }
+  } else {
+    context_.Say(x.source,
+        "ATOMIC UPDATE operation should have a single statement"_err_en_US);
+  }
+}
+
+void OmpStructureChecker::CheckAtomicConditionalUpdate(
+    const parser::OpenMPAtomicConstruct &x, const parser::Block &body,
+    parser::CharBlock source) {
+  // Allowable forms are (single-statement):
+  // - if ...
+  // - x = (... ? ... : x)
+  // and two-statement:
+  // - r = cond ; if (r) ...
+
+  const parser::ExecutionPartConstruct *ust{nullptr}; // update
+  const parser::ExecutionPartConstruct *cst{nullptr}; // condition
+
+  if (body.size() == 1) {
+    ust = &body.front();
+  } else if (body.size() == 2) {
+    cst = &body.front();
+    ust = &body.back();
+  } else {
+    context_.Say(source,
+        "ATOMIC UPDATE COMPARE operation should contain one or two statements"_err_en_US);
+    return;
+  }
+
+  // Flang doesn't support conditional-expr yet, so all update statements
+  // are if-statements.
+
+  // IfStmt:        if (...) ...
+  // IfConstruct:   if (...) then ... endif
+  auto maybeUpdate{AnalyzeConditionalStmt(ust)};
+  if (!maybeUpdate) {
+    context_.Say(source,
+        "In ATOMIC UPDATE COMPARE the update statement should be a conditional statement"_err_en_US);
+    return;
+  }
+
+  AnalyzedCondStmt &update{*maybeUpdate};
+
+  if (SourcedActionStmt action{GetActionStmt(cst)}) {
+    // The "condition" statement must be `r = cond`.
+    if (auto maybeCond{GetEvaluateAssignment(action.stmt)}) {
+      if (maybeCond->lhs != update.cond) {
+        context_.Say(update.source,
+            "In ATOMIC UPDATE COMPARE the conditional statement must use %s as the condition"_err_en_US,
+            maybeCond->lhs.AsFortran());
+      } else {
+        // If it's "r = ...; if (r) ..." then put the original condition
+        // in `update`.
+        update.cond = maybeCond->rhs;
+      }
+    } else {
+      context_.Say(action.source,
+          "In ATOMIC UPDATE COMPARE with two statements the first statement should compute the condition"_err_en_US);
+    }
+  }
+
+  evaluate::Assignment assign{*GetEvaluateAssignment(update.ift.stmt)};
+
+  CheckAtomicConditionalUpdateStmt(update, source);
+  if (IsCheckForAssociated(update.cond)) {
+    if (!IsPointerAssignment(assign)) {
+      context_.Say(source,
+          "The assignment should be a pointer-assignment when the condition is ASSOCIATED"_err_en_US);
+    }
+  } else {
+    if (IsPointerAssignment(assign)) {
+      context_.Say(source,
+          "The assignment cannot be a pointer-assignment except when the condition is ASSOCIATED"_err_en_US);
+    }
+  }
+
+  using Analysis = parser::OpenMPAtomicConstruct::Analysis;
+  x.analysis = MakeAtomicAnalysis(assign.lhs, update.cond,
+      MakeAtomicAnalysisOp(Analysis::Update | Analysis::IfTrue, assign),
+      MakeAtomicAnalysisOp(Analysis::None));
+}
+
+void OmpStructureChecker::CheckAtomicUpdateCapture(
+    const parser::OpenMPAtomicConstruct &x, const parser::Block &body,
+    parser::CharBlock source) {
+  if (body.size() != 2) {
+    context_.Say(source,
+        "ATOMIC UPDATE operation with CAPTURE should contain two statements"_err_en_US);
+    return;
+  }
+
+  auto [uec, cec]{CheckUpdateCapture(&body.front(), &body.back(), source)};
+  if (!uec || !cec) {
+    // Diagnostics already emitted.
+    return;
+  }
+  SourcedActionStmt uact{GetActionStmt(uec)};
+  SourcedActionStmt cact{GetActionStmt(cec)};
+  // The "dereferences" of std::optional are guaranteed to be valid after
+  // CheckUpdateCapture.
+  evaluate::Assignment update{*GetEvaluateAssignment(uact.stmt)};
+  evaluate::Assignment capture{*GetEvaluateAssignment(cact.stmt)};
+
+  const SomeExpr &atom{update.lhs};
+
+  using Analysis = parser::OpenMPAtomicConstruct::Analysis;
+  int action;
+
+  if (IsMaybeAtomicWrite(update)) {
+    action = Analysis::Write;
+    CheckAtomicWriteAssignment(update, uact.source);
+  } else {
+    action = Analysis::Update;
+    CheckAtomicUpdateAssignment(update, uact.source);
+  }
+  CheckAtomicCaptureAssignment(capture, atom, cact.source);
+
+  if (IsPointerAssignment(update) != IsPointerAssignment(capture)) {
+    context_.Say(cact.source,
+        "The update and capture assignments should both be pointer-assignments or both be non-pointer-assignments"_err_en_US);
+    return;
+  }
+
+  if (GetActionStmt(&body.front()).stmt == uact.stmt) {
+    x.analysis = MakeAtomicAnalysis(atom, std::nullopt,
+        MakeAtomicAnalysisOp(action, update),
+        MakeAtomicAnalysisOp(Analysis::Read, capture));
+  } else {
+    x.analysis = MakeAtomicAnalysis(atom, std::nullopt,
+        MakeAtomicAnalysisOp(Analysis::Read, capture),
+        MakeAtomicAnalysisOp(action, update));
+  }
+}
+
+void OmpStructureChecker::CheckAtomicConditionalUpdateCapture(
+    const parser::OpenMPAtomicConstruct &x, const parser::Block &body,
+    parser::CharBlock source) {
+  // There are two different variants of this:
+  // (1) conditional-update and capture separately:
+  //     This form only allows single-statement updates, i.e. the update
+  //     form "r = cond; if (r) ..." is not allowed.
+  // (2) conditional-update combined with capture in a single statement:
+  //     This form does allow the condition to be calculated separately,
+  //     i.e. "r = cond; if (r) ...".
+  // Regardless of what form it is, the actual update assignment is a
+  // proper write, i.e. "x = d", where d does not depend on x.
+
+  AnalyzedCondStmt update;
+  SourcedActionStmt capture;
+  bool captureAlways{true}, captureFirst{true};
+
+  auto extractCapture{[&]() {
+    capture = update.iff;
+    captureAlways = false;
+    update.iff = SourcedActionStmt{};
+  }};
+
+  auto classifyNonUpdate{[&](const SourcedActionStmt &action) {
+    // The non-update statement is either "r = cond" or the capture.
+    if (auto maybeAssign{GetEvaluateAssignment(action.stmt)}) {
+      if (update.cond == maybeAssign->lhs) {
+        // If this is "r = cond; if (r) ...", then update the condition.
+        update.cond = maybeAssign->rhs;
+        update.source = action.source;
+        // In this form, the update and the capture are combined into
+        // an IF-THEN-ELSE statement.
+        extractCapture();
+      } else {
+        // Assume this is the capture-statement.
+        capture = action;
+      }
+    }
+  }};
+
+  if (body.size() == 2) {
+    // This could be
+    // - capture; conditional-update (in any order), or
+    // - r = cond; if (r) capture-update
+    const parser::ExecutionPartConstruct *st1{&body.front()};
+    const parser::ExecutionPartConstruct *st2{&body.back()};
+    // In either case, the conditional statement can be analyzed by
+    // AnalyzeConditionalStmt, whereas the other statement cannot.
+    if (auto maybeUpdate1{AnalyzeConditionalStmt(st1)}) {
+      update = *maybeUpdate1;
+      classifyNonUpdate(GetActionStmt(st2));
+      captureFirst = false;
+    } else if (auto maybeUpdate2{AnalyzeConditionalStmt(st2)}) {
+      update = *maybeUpdate2;
+      classifyNonUpdate(GetActionStmt(st1));
+    } else {
+      // None of the statements are conditional, this rules out the
+      // "r = cond; if (r) ..." and the "capture + conditional-update"
+      // variants. This could still be capture + write (which is classified
+      // as conditional-update-capture in the spec).
+      auto [uec, cec]{CheckUpdateCapture(st1, st2, source)};
+      if (!uec || !cec) {
+        // Diagnostics already emitted.
+        return;
+      }
+      SourcedActionStmt uact{GetActionStmt(uec)};
+      SourcedActionStmt cact{GetActionStmt(cec)};
+      update.ift = uact;
+      capture = cact;
+      if (uec == st1) {
+        captureFirst = false;
+      }
+    }
+  } else if (body.size() == 1) {
+    if (auto maybeUpdate{AnalyzeConditionalStmt(&body.front())}) {
+      update = *maybeUpdate;
+      // This is the form with update and capture combined into an IF-THEN-ELSE
+      // statement. The capture-statement is always the ELSE branch.
+      extractCapture();
+    } else {
+      goto invalid;
+    }
+  } else {
+    context_.Say(source,
+        "ATOMIC UPDATE COMPARE CAPTURE operation should contain one or two statements"_err_en_US);
+    return;
+  invalid:
+    context_.Say(source,
+        "Invalid body of ATOMIC UPDATE COMPARE CAPTURE operation"_err_en_US);
+    return;
+  }
+
+  // The update must have a form `x = d` or `x => d`.
+  if (auto maybeWrite{GetEvaluateAssignment(update.ift.stmt)}) {
+    const SomeExpr &atom{maybeWrite->lhs};
+    CheckAtomicWriteAssignment(*maybeWrite, update.ift.source);
+    if (auto maybeCapture{GetEvaluateAssignment(capture.stmt)}) {
+      CheckAtomicCaptureAssignment(*maybeCapture, atom, capture.source);
+
+      if (IsPointerAssignment(*maybeWrite) !=
+          IsPointerAssignment(*maybeCapture)) {
+        context_.Say(capture.source,
+            "The update and capture assignments should both be pointer-assignments or both be non-pointer-assignments"_err_en_US);
+        return;
+      }
+    } else {
+      if (!IsAssignment(capture.stmt)) {
+        context_.Say(capture.source,
+            "In ATOMIC UPDATE COMPARE CAPTURE the capture statement should be an assignment"_err_en_US);
+      }
+      return;
+    }
+  } else {
+    if (!IsAssignment(update.ift.stmt)) {
+      context_.Say(update.ift.source,
+          "In ATOMIC UPDATE COMPARE CAPTURE the update statement should be an assignment"_err_en_US);
+    }
+    return;
+  }
+
+  // update.iff should be empty here, the capture statement should be
+  // stored in "capture".
+
+  // Fill out the analysis in the AST node.
+  using Analysis = parser::OpenMPAtomicConstruct::Analysis;
+  bool condUnused{std::visit(
+      [](auto &&s) {
+        using BareS = llvm::remove_cvref_t<decltype(s)>;
+        if constexpr (std::is_same_v<BareS, evaluate::NullPointer>) {
+          return true;
+        } else {
+          return false;
         }
+      },
+      update.cond.u)};
+
+  int updateWhen{!condUnused ? Analysis::IfTrue : 0};
+  int captureWhen{!captureAlways ? Analysis::IfFalse : 0};
+
+  evaluate::Assignment updAssign{*GetEvaluateAssignment(update.ift.stmt)};
+  evaluate::Assignment capAssign{*GetEvaluateAssignment(capture.stmt)};
+
+  if (captureFirst) {
+    x.analysis = MakeAtomicAnalysis(updAssign.lhs, update.cond,
+        MakeAtomicAnalysisOp(Analysis::Read | captureWhen, capAssign),
+        MakeAtomicAnalysisOp(Analysis::Write | updateWhen, updAssign));
+  } else {
+    x.analysis = MakeAtomicAnalysis(updAssign.lhs, update.cond,
+        MakeAtomicAnalysisOp(Analysis::Write | updateWhen, updAssign),
+        MakeAtomicAnalysisOp(Analysis::Read | captureWhen, capAssign));
+  }
+}
+
+void OmpStructureChecker::CheckAtomicRead(
+    const parser::OpenMPAtomicConstruct &x) {
+  // [6.0:190:5-7]
+  // A read structured block is read-statement, a read statement that has one
+  // of the following forms:
+  //   v = x
+  //   v => x
+  auto &dirSpec{std::get<parser::OmpDirectiveSpecification>(x.t)};
+  auto &block{std::get<parser::Block>(x.t)};
+
+  // Read cannot be conditional or have a capture statement.
+  if (x.IsCompare() || x.IsCapture()) {
+    context_.Say(dirSpec.source,
+        "ATOMIC READ cannot have COMPARE or CAPTURE clauses"_err_en_US);
+    return;
+  }
+
+  const parser::Block &body{GetInnermostExecPart(block)};
+
+  if (body.size() == 1) {
+    SourcedActionStmt action{GetActionStmt(&body.front())};
+    if (auto maybeRead{GetEvaluateAssignment(action.stmt)}) {
+      CheckAtomicReadAssignment(*maybeRead, action.source);
+
+      if (auto maybe{GetConvertInput(maybeRead->rhs)}) {
+        const SomeExpr &atom{*maybe};
+        using Analysis = parser::OpenMPAtomicConstruct::Analysis;
+        x.analysis = MakeAtomicAnalysis(atom, std::nullopt,
+            MakeAtomicAnalysisOp(Analysis::Read, maybeRead),
+            MakeAtomicAnalysisOp(Analysis::None));
       }
+    } else if (!IsAssignment(action.stmt)) {
+      context_.Say(
+          x.source, "ATOMIC READ operation should be an assignment"_err_en_US);
     }
-  };
-  if (leftHandClauseList) {
-    checkForValidMemoryOrderClause(leftHandClauseList);
+  } else {
+    context_.Say(x.source,
+        "ATOMIC READ operation should have a single statement"_err_en_US);
+  }
+}
+
+void OmpStructureChecker::CheckAtomicWrite(
+    const parser::OpenMPAtomicConstruct &x) {
+  auto &dirSpec{std::get<parser::OmpDirectiveSpecification>(x.t)};
+  auto &block{std::get<parser::Block>(x.t)};
+
+  // Write cannot be conditional or have a capture statement.
+  if (x.IsCompare() || x.IsCapture()) {
+    context_.Say(dirSpec.source,
+        "ATOMIC WRITE cannot have COMPARE or CAPTURE clauses"_err_en_US);
+    return;
   }
-  if (rightHandClauseList) {
-    checkForValidMemoryOrderClause(rightHandClauseList);
+
+  const parser::Block &body{GetInnermostExecPart(block)};
+
+  if (body.size() == 1) {
+    SourcedActionStmt action{GetActionStmt(&body.front())};
+    if (auto maybeWrite{GetEvaluateAssignment(action.stmt)}) {
+      const SomeExpr &atom{maybeWrite->lhs};
+      CheckAtomicWriteAssignment(*maybeWrite, action.source);
+
+      using Analysis = parser::OpenMPAtomicConstruct::Analysis;
+      x.analysis = MakeAtomicAnalysis(atom, std::nullopt,
+          MakeAtomicAnalysisOp(Analysis::Write, maybeWrite),
+          MakeAtomicAnalysisOp(Analysis::None));
+    } else if (!IsAssignment(action.stmt)) {
+      context_.Say(
+          x.source, "ATOMIC WRITE operation should be an assignment"_err_en_US);
+    }
+  } else {
+    context_.Say(x.source,
+        "ATOMIC WRITE operation should have a single statement"_err_en_US);
+  }
+}
+
+void OmpStructureChecker::CheckAtomicUpdate(
+    const parser::OpenMPAtomicConstruct &x) {
+  auto &block{std::get<parser::Block>(x.t)};
+
+  bool isConditional{x.IsCompare()};
+  bool isCapture{x.IsCapture()};
+  const parser::Block &body{GetInnermostExecPart(block)};
+
+  if (isConditional && isCapture) {
+    CheckAtomicConditionalUpdateCapture(x, body, x.source);
+  } else if (isConditional) {
+    CheckAtomicConditionalUpdate(x, body, x.source);
+  } else if (isCapture) {
+    CheckAtomicUpdateCapture(x, body, x.source);
+  } else { // update-only
+    CheckAtomicUpdateOnly(x, body, x.source);
   }
 }
 
 void OmpStructureChecker::Enter(const parser::OpenMPAtomicConstruct &x) {
-  common::visit(
-      common::visitors{
-          [&](const parser::OmpAtomic &atomicConstruct) {
-            const auto &dir{std::get<parser::Verbatim>(atomicConstruct.t)};
-            PushContextAndClauseSets(
-                dir.source, llvm::omp::Directive::OMPD_atomic);
-            CheckAtomicUpdateStmt(
-                std::get<parser::Statement<parser::AssignmentStmt>>(
-                    atomicConstruct.t)
-                    .statement);
-            CheckAtomicMemoryOrderClause(
-                &std::get<parser::OmpAtomicClauseList>(atomicConstruct.t),
-                nullptr);
-            CheckHintClause<const parser::OmpAtomicClauseList>(
-                &std::get<parser::OmpAtomicClauseList>(atomicConstruct.t),
-                nullptr, "ATOMIC");
-          },
-          [&](const parser::OmpAtomicUpdate &atomicUpdate) {
-            const auto &dir{std::get<parser::Verbatim>(atomicUpdate.t)};
-            PushContextAndClauseSets(
-                dir.source, llvm::omp::Directive::OMPD_atomic);
-            CheckAtomicUpdateStmt(
-                std::get<parser::Statement<parser::AssignmentStmt>>(
-                    atomicUpdate.t)
-                    .statement);
-            CheckAtomicMemoryOrderClause(
-                &std::get<0>(atomicUpdate.t), &std::get<2>(atomicUpdate.t));
-            CheckHintClause<const parser::OmpAtomicClauseList>(
-                &std::get<0>(atomicUpdate.t), &std::get<2>(atomicUpdate.t),
-                "UPDATE");
-          },
-          [&](const parser::OmpAtomicRead &atomicRead) {
-            const auto &dir{std::get<parser::Verbatim>(atomicRead.t)};
-            PushContextAndClauseSets(
-                dir.source, llvm::omp::Directive::OMPD_atomic);
-            CheckAtomicMemoryOrderClause(
-                &std::get<0>(atomicRead.t), &std::get<2>(atomicRead.t));
-            CheckHintClause<const parser::OmpAtomicClauseList>(
-                &std::get<0>(atomicRead.t), &std::get<2>(atomicRead.t), "READ");
-            CheckAtomicCaptureStmt(
-                std::get<parser::Statement<parser::AssignmentStmt>>(
-                    atomicRead.t)
-                    .statement);
-          },
-          [&](const parser::OmpAtomicWrite &atomicWrite) {
-            const auto &dir{std::get<parser::Verbatim>(atomicWrite.t)};
-            PushContextAndClauseSets(
-                dir.source, llvm::omp::Directive::OMPD_atomic);
-            CheckAtomicMemoryOrderClause(
-                &std::get<0>(atomicWrite.t), &std::get<2>(atomicWrite.t));
-            CheckHintClause<const parser::OmpAtomicClauseList>(
-                &std::get<0>(atomicWrite.t), &std::get<2>(atomicWrite.t),
-                "WRITE");
-            CheckAtomicWriteStmt(
-                std::get<parser::Statement<parser::AssignmentStmt>>(
-                    atomicWrite.t)
-                    .statement);
-          },
-          [&](const parser::OmpAtomicCapture &atomicCapture) {
-            const auto &dir{std::get<parser::Verbatim>(atomicCapture.t)};
-            PushContextAndClauseSets(
-                dir.source, llvm::omp::Directive::OMPD_atomic);
-            CheckAtomicMemoryOrderClause(
-                &std::get<0>(atomicCapture.t), &std::get<2>(atomicCapture.t));
-            CheckHintClause<const parser::OmpAtomicClauseList>(
-                &std::get<0>(atomicCapture.t), &std::get<2>(atomicCapture.t),
-                "CAPTURE");
-            CheckAtomicCaptureConstruct(atomicCapture);
-          },
-          [&](const parser::OmpAtomicCompare &atomicCompare) {
-            const auto &dir{std::get<parser::Verbatim>(atomicCompare.t)};
-            PushContextAndClauseSets(
-                dir.source, llvm::omp::Directive::OMPD_atomic);
-            CheckAtomicMemoryOrderClause(
-                &std::get<0>(atomicCompare.t), &std::get<2>(atomicCompare.t));
-            CheckHintClause<const parser::OmpAtomicClauseList>(
-                &std::get<0>(atomicCompare.t), &std::get<2>(atomicCompare.t),
-                "CAPTURE");
-            CheckAtomicCompareConstruct(atomicCompare);
-          },
-      },
-      x.u);
+  // All of the following groups have the "exclusive" property, i.e. at
+  // most one clause from each group is allowed.
+  // The exclusivity-checking code should eventually be unified for all
+  // clauses, with clause groups defined in OMP.td.
+  std::array atomic{llvm::omp::Clause::OMPC_read,
+      llvm::omp::Clause::OMPC_update, llvm::omp::Clause::OMPC_write};
+  std::array memoryOrder{llvm::omp::Clause::OMPC_acq_rel,
+      llvm::omp::Clause::OMPC_acquire, llvm::omp::Clause::OMPC_relaxed,
+      llvm::omp::Clause::OMPC_release, llvm::omp::Clause::OMPC_seq_cst};
+
+  auto checkExclusive{[&](llvm::ArrayRef<llvm::omp::Clause> group,
+                          std::string_view name,
+                          const parser::OmpClauseList &clauses) {
+    const parser::OmpClause *present{nullptr};
+    for (const parser::OmpClause &clause : clauses.v) {
+      llvm::omp::Clause id{clause.Id()};
+      if (!llvm::is_contained(group, id)) {
+        continue;
+      }
+      if (present == nullptr) {
+        present = &clause;
+        continue;
+      } else if (id == present->Id()) {
+        // Ignore repetitions of the same clause, those will be diagnosed
+        // separately.
+        continue;
+      }
+      parser::MessageFormattedText txt(
+          "At most one clause from the '%s' group is allowed on ATOMIC construct"_err_en_US,
+          name.data());
+      parser::Message message(clause.source, txt);
+      message.Attach(present->source,
+          "Previous clause from this group provided here"_en_US);
+      context_.Say(std::move(message));
+      return;
+    }
+  }};
+
+  auto &dirSpec{std::get<parser::OmpDirectiveSpecification>(x.t)};
+  auto &dir{std::get<parser::OmpDirectiveName>(dirSpec.t)};
+  PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_atomic);
+  llvm::omp::Clause kind{x.GetKind()};
+
+  checkExclusive(atomic, "atomic", dirSpec.Clauses());
+  checkExclusive(memoryOrder, "memory-order", dirSpec.Clauses());
+
+  switch (kind) {
+  case llvm::omp::Clause::OMPC_read:
+    CheckAtomicRead(x);
+    break;
+  case llvm::omp::Clause::OMPC_write:
+    CheckAtomicWrite(x);
+    break;
+  case llvm::omp::Clause::OMPC_update:
+    CheckAtomicUpdate(x);
+    break;
+  default:
+    break;
+  }
 }
 
 void OmpStructureChecker::Leave(const parser::OpenMPAtomicConstruct &) {
@@ -3332,7 +4335,6 @@ CHECK_SIMPLE_CLAUSE(Final, OMPC_final)
 CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush)
 CHECK_SIMPLE_CLAUSE(Full, OMPC_full)
 CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize)
-CHECK_SIMPLE_CLAUSE(Hint, OMPC_hint)
 CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds)
 CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive)
 CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer)
@@ -4014,40 +5016,6 @@ void OmpStructureChecker::CheckIsLoopIvPartOfClause(
     }
   }
 }
-// Following clauses have a separate node in parse-tree.h.
-// Atomic-clause
-CHECK_SIMPLE_PARSER_CLAUSE(OmpAtomicRead, OMPC_read)
-CHECK_SIMPLE_PARSER_CLAUSE(OmpAtomicWrite, OMPC_write)
-CHECK_SIMPLE_PARSER_CLAUSE(OmpAtomicUpdate, OMPC_update)
-CHECK_SIMPLE_PARSER_CLAUSE(OmpAtomicCapture, OMPC_capture)
-
-void OmpStructureChecker::Leave(const parser::OmpAtomicRead &) {
-  CheckNotAllowedIfClause(llvm::omp::Clause::OMPC_read,
-      {llvm::omp::Clause::OMPC_release, llvm::omp::Clause::OMPC_acq_rel});
-}
-
-void OmpStructureChecker::Leave(const parser::OmpAtomicWrite &) {
-  CheckNotAllowedIfClause(llvm::omp::Clause::OMPC_write,
-      {llvm::omp::Clause::OMPC_acquire, llvm::omp::Clause::OMPC_acq_rel});
-}
-
-void OmpStructureChecker::Leave(const parser::OmpAtomicUpdate &) {
-  CheckNotAllowedIfClause(llvm::omp::Clause::OMPC_update,
-      {llvm::omp::Clause::OMPC_acquire, llvm::omp::Clause::OMPC_acq_rel});
-}
-
-// OmpAtomic node represents atomic directive without atomic-clause.
-// atomic-clause - READ,WRITE,UPDATE,CAPTURE.
-void OmpStructureChecker::Leave(const parser::OmpAtomic &) {
-  if (const auto *clause{FindClause(llvm::omp::Clause::OMPC_acquire)}) {
-    context_.Say(clause->source,
-        "Clause ACQUIRE is not allowed on the ATOMIC directive"_err_en_US);
-  }
-  if (const auto *clause{FindClause(llvm::omp::Clause::OMPC_acq_rel)}) {
-    context_.Say(clause->source,
-        "Clause ACQ_REL is not allowed on the ATOMIC directive"_err_en_US);
-  }
-}
 
 // Restrictions specific to each clause are implemented apart from the
 // generalized restrictions.
@@ -5026,21 +5994,6 @@ void OmpStructureChecker::Leave(const parser::OmpContextSelector &) {
   ExitDirectiveNest(ContextSelectorNest);
 }
 
-std::optional<evaluate::DynamicType> OmpStructureChecker::GetDynamicType(
-    const common::Indirection<parser::Expr> &parserExpr) {
-  // Indirection<parser::Expr>      parserExpr
-  //  `- parser::Expr               ^.value()
-  const parser::TypedExpr &typedExpr{parserExpr.value().typedExpr};
-  // ForwardOwningPointer           typedExpr
-  // `- GenericExprWrapper          ^.get()
-  //    `- std::optional<Expr>      ^->v
-  if (auto maybeExpr{typedExpr.get()->v}) {
-    return maybeExpr->GetType();
-  } else {
-    return std::nullopt;
-  }
-}
-
 const std::list<parser::OmpTraitProperty> &
 OmpStructureChecker::GetTraitPropertyList(
     const parser::OmpTraitSelector &trait) {
@@ -5430,7 +6383,7 @@ void OmpStructureChecker::CheckTraitCondition(
     const parser::OmpTraitProperty &property{properties.front()};
     auto &scalarExpr{std::get<parser::ScalarExpr>(property.u)};
 
-    auto maybeType{GetDynamicType(scalarExpr.thing)};
+    auto maybeType{GetDynamicType(scalarExpr.thing.value())};
     if (!maybeType || maybeType->category() != TypeCategory::Logical) {
       context_.Say(property.source,
           "%s trait requires a single LOGICAL expression"_err_en_US,
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 1a8059d8548ed..2074ec611dc2a 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -48,6 +48,7 @@ static const OmpDirectiveSet noWaitClauseNotAllowedSet{
 } // namespace llvm
 
 namespace Fortran::semantics {
+struct AnalyzedCondStmt;
 
 // Mapping from 'Symbol' to 'Source' to keep track of the variables
 // used in multiple clauses
@@ -144,15 +145,6 @@ class OmpStructureChecker
   void Leave(const parser::OmpClauseList &);
   void Enter(const parser::OmpClause &);
 
-  void Enter(const parser::OmpAtomicRead &);
-  void Leave(const parser::OmpAtomicRead &);
-  void Enter(const parser::OmpAtomicWrite &);
-  void Leave(const parser::OmpAtomicWrite &);
-  void Enter(const parser::OmpAtomicUpdate &);
-  void Leave(const parser::OmpAtomicUpdate &);
-  void Enter(const parser::OmpAtomicCapture &);
-  void Leave(const parser::OmpAtomic &);
-
   void Enter(const parser::DoConstruct &);
   void Leave(const parser::DoConstruct &);
 
@@ -192,8 +184,6 @@ class OmpStructureChecker
   void CheckAllowedMapTypes(const parser::OmpMapType::Value &,
       const std::list<parser::OmpMapType::Value> &);
 
-  std::optional<evaluate::DynamicType> GetDynamicType(
-      const common::Indirection<parser::Expr> &);
   const std::list<parser::OmpTraitProperty> &GetTraitPropertyList(
       const parser::OmpTraitSelector &);
   std::optional<llvm::omp::Clause> GetClauseFromProperty(
@@ -265,14 +255,44 @@ class OmpStructureChecker
   void CheckDoWhile(const parser::OpenMPLoopConstruct &x);
   void CheckAssociatedLoopConstraints(const parser::OpenMPLoopConstruct &x);
   template <typename T, typename D> bool IsOperatorValid(const T &, const D &);
-  void CheckAtomicMemoryOrderClause(
-      const parser::OmpAtomicClauseList *, const parser::OmpAtomicClauseList *);
-  void CheckAtomicUpdateStmt(const parser::AssignmentStmt &);
-  void CheckAtomicCaptureStmt(const parser::AssignmentStmt &);
-  void CheckAtomicWriteStmt(const parser::AssignmentStmt &);
-  void CheckAtomicCaptureConstruct(const parser::OmpAtomicCapture &);
-  void CheckAtomicCompareConstruct(const parser::OmpAtomicCompare &);
-  void CheckAtomicConstructStructure(const parser::OpenMPAtomicConstruct &);
+
+  void CheckStorageOverlap(const evaluate::Expr<evaluate::SomeType> &,
+      llvm::ArrayRef<evaluate::Expr<evaluate::SomeType>>, parser::CharBlock);
+  void ErrorShouldBeVariable(const MaybeExpr &expr, parser::CharBlock source);
+  void CheckAtomicType(
+      SymbolRef sym, parser::CharBlock source, std::string_view name);
+  void CheckAtomicVariable(
+      const evaluate::Expr<evaluate::SomeType> &, parser::CharBlock);
+  std::pair<const parser::ExecutionPartConstruct *,
+      const parser::ExecutionPartConstruct *>
+  CheckUpdateCapture(const parser::ExecutionPartConstruct *ec1,
+      const parser::ExecutionPartConstruct *ec2, parser::CharBlock source);
+  void CheckAtomicCaptureAssignment(const evaluate::Assignment &capture,
+      const SomeExpr &atom, parser::CharBlock source);
+  void CheckAtomicReadAssignment(
+      const evaluate::Assignment &read, parser::CharBlock source);
+  void CheckAtomicWriteAssignment(
+      const evaluate::Assignment &write, parser::CharBlock source);
+  void CheckAtomicUpdateAssignment(
+      const evaluate::Assignment &update, parser::CharBlock source);
+  void CheckAtomicConditionalUpdateAssignment(const SomeExpr &cond,
+      parser::CharBlock condSource, const evaluate::Assignment &assign,
+      parser::CharBlock assignSource);
+  void CheckAtomicConditionalUpdateStmt(
+      const AnalyzedCondStmt &update, parser::CharBlock source);
+  void CheckAtomicUpdateOnly(const parser::OpenMPAtomicConstruct &x,
+      const parser::Block &body, parser::CharBlock source);
+  void CheckAtomicConditionalUpdate(const parser::OpenMPAtomicConstruct &x,
+      const parser::Block &body, parser::CharBlock source);
+  void CheckAtomicUpdateCapture(const parser::OpenMPAtomicConstruct &x,
+      const parser::Block &body, parser::CharBlock source);
+  void CheckAtomicConditionalUpdateCapture(
+      const parser::OpenMPAtomicConstruct &x, const parser::Block &body,
+      parser::CharBlock source);
+  void CheckAtomicRead(const parser::OpenMPAtomicConstruct &x);
+  void CheckAtomicWrite(const parser::OpenMPAtomicConstruct &x);
+  void CheckAtomicUpdate(const parser::OpenMPAtomicConstruct &x);
+
   void CheckDistLinear(const parser::OpenMPLoopConstruct &x);
   void CheckSIMDNest(const parser::OpenMPConstruct &x);
   void CheckTargetNest(const parser::OpenMPConstruct &x);
@@ -324,7 +344,6 @@ class OmpStructureChecker
   void EnterDirectiveNest(const int index) { directiveNest_[index]++; }
   void ExitDirectiveNest(const int index) { directiveNest_[index]--; }
   int GetDirectiveNest(const int index) { return directiveNest_[index]; }
-  template <typename D> void CheckHintClause(D *, D *, std::string_view);
   inline void ErrIfAllocatableVariable(const parser::Variable &);
   inline void ErrIfLHSAndRHSSymbolsMatch(
       const parser::Variable &, const parser::Expr &);
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 3e133b156a9f3..7db447aee0026 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1684,11 +1684,8 @@ class OmpVisitor : public virtual DeclarationVisitor {
     messageHandler().set_currStmtSource(std::nullopt);
   }
   bool Pre(const parser::OpenMPAtomicConstruct &x) {
-    return common::visit(common::visitors{[&](const auto &u) -> bool {
-      AddOmpSourceRange(u.source);
-      return true;
-    }},
-        x.u);
+    AddOmpSourceRange(x.source);
+    return true;
   }
   void Post(const parser::OpenMPAtomicConstruct &) {
     messageHandler().set_currStmtSource(std::nullopt);
diff --git a/flang/lib/Semantics/rewrite-directives.cpp b/flang/lib/Semantics/rewrite-directives.cpp
index 104a77885d276..b4fef2c881b67 100644
--- a/flang/lib/Semantics/rewrite-directives.cpp
+++ b/flang/lib/Semantics/rewrite-directives.cpp
@@ -51,23 +51,21 @@ class OmpRewriteMutator : public DirectiveRewriteMutator {
 
 bool OmpRewriteMutator::Pre(parser::OpenMPAtomicConstruct &x) {
   // Find top-level parent of the operation.
-  Symbol *topLevelParent{common::visit(
-      [&](auto &atomic) {
-        Symbol *symbol{nullptr};
-        Scope *scope{
-            &context_.FindScope(std::get<parser::Verbatim>(atomic.t).source)};
-        do {
-          if (Symbol * parent{scope->symbol()}) {
-            symbol = parent;
-          }
-          scope = &scope->parent();
-        } while (!scope->IsGlobal());
-
-        assert(symbol &&
-            "Atomic construct must be within a scope associated with a symbol");
-        return symbol;
-      },
-      x.u)};
+  Symbol *topLevelParent{[&]() {
+    Symbol *symbol{nullptr};
+    Scope *scope{&context_.FindScope(
+        std::get<parser::OmpDirectiveSpecification>(x.t).source)};
+    do {
+      if (Symbol * parent{scope->symbol()}) {
+        symbol = parent;
+      }
+      scope = &scope->parent();
+    } while (!scope->IsGlobal());
+
+    assert(symbol &&
+        "Atomic construct must be within a scope associated with a symbol");
+    return symbol;
+  }()};
 
   // Get the `atomic_default_mem_order` clause from the top-level parent.
   std::optional<common::OmpMemoryOrderType> defaultMemOrder;
@@ -86,66 +84,48 @@ bool OmpRewriteMutator::Pre(parser::OpenMPAtomicConstruct &x) {
     return false;
   }
 
-  auto findMemOrderClause =
-      [](const std::list<parser::OmpAtomicClause> &clauses) {
-        return llvm::any_of(clauses, [](const auto &clause) {
-          return std::get_if<parser::OmpMemoryOrderClause>(&clause.u);
+  auto findMemOrderClause{[](const parser::OmpClauseList &clauses) {
+    return llvm::any_of(
+        clauses.v, [](auto &clause) -> const parser::OmpClause * {
+          switch (clause.Id()) {
+          case llvm::omp::Clause::OMPC_acq_rel:
+          case llvm::omp::Clause::OMPC_acquire:
+          case llvm::omp::Clause::OMPC_relaxed:
+          case llvm::omp::Clause::OMPC_release:
+          case llvm::omp::Clause::OMPC_seq_cst:
+            return &clause;
+          default:
+            return nullptr;
+          }
         });
-      };
-
-  // Get the clause list to which the new memory order clause must be added,
-  // only if there are no other memory order clauses present for this atomic
-  // directive.
-  std::list<parser::OmpAtomicClause> *clauseList = common::visit(
-      common::visitors{[&](parser::OmpAtomic &atomicConstruct) {
-                         // OmpAtomic only has a single list of clauses.
-                         auto &clauses{std::get<parser::OmpAtomicClauseList>(
-                             atomicConstruct.t)};
-                         return !findMemOrderClause(clauses.v) ? &clauses.v
-                                                               : nullptr;
-                       },
-          [&](auto &atomicConstruct) {
-            // All other atomic constructs have two lists of clauses.
-            auto &clausesLhs{std::get<0>(atomicConstruct.t)};
-            auto &clausesRhs{std::get<2>(atomicConstruct.t)};
-            return !findMemOrderClause(clausesLhs.v) &&
-                    !findMemOrderClause(clausesRhs.v)
-                ? &clausesRhs.v
-                : nullptr;
-          }},
-      x.u);
+  }};
 
-  // Add a memory order clause to the atomic directive.
+  auto &dirSpec{std::get<parser::OmpDirectiveSpecification>(x.t)};
+  auto &clauseList{std::get<std::optional<parser::OmpClauseList>>(dirSpec.t)};
   if (clauseList) {
-    atomicDirectiveDefaultOrderFound_ = true;
-    switch (*defaultMemOrder) {
-    case common::OmpMemoryOrderType::Acq_Rel:
-      clauseList->emplace_back<parser::OmpMemoryOrderClause>(common::visit(
-          common::visitors{[](parser::OmpAtomicRead &) -> parser::OmpClause {
-                             return parser::OmpClause::Acquire{};
-                           },
-              [](parser::OmpAtomicCapture &) -> parser::OmpClause {
-                return parser::OmpClause::AcqRel{};
-              },
-              [](auto &) -> parser::OmpClause {
-                // parser::{OmpAtomic, OmpAtomicUpdate, OmpAtomicWrite}
-                return parser::OmpClause::Release{};
-              }},
-          x.u));
-      break;
-    case common::OmpMemoryOrderType::Relaxed:
-      clauseList->emplace_back<parser::OmpMemoryOrderClause>(
-          parser::OmpClause{parser::OmpClause::Relaxed{}});
-      break;
-    case common::OmpMemoryOrderType::Seq_Cst:
-      clauseList->emplace_back<parser::OmpMemoryOrderClause>(
-          parser::OmpClause{parser::OmpClause::SeqCst{}});
-      break;
-    default:
-      // FIXME: Don't process other values at the moment since their validity
-      // depends on the OpenMP version (which is unavailable here).
-      break;
+    if (findMemOrderClause(*clauseList)) {
+      return false;
     }
+  } else {
+    clauseList = parser::OmpClauseList(decltype(parser::OmpClauseList::v){});
+  }
+
+  // Add a memory order clause to the atomic directive.
+  atomicDirectiveDefaultOrderFound_ = true;
+  switch (*defaultMemOrder) {
+  case common::OmpMemoryOrderType::Acq_Rel:
+    clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::AcqRel{}});
+    break;
+  case common::OmpMemoryOrderType::Relaxed:
+    clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::Relaxed{}});
+    break;
+  case common::OmpMemoryOrderType::Seq_Cst:
+    clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::SeqCst{}});
+    break;
+  default:
+    // FIXME: Don't process other values at the moment since their validity
+    // depends on the OpenMP version (which is unavailable here).
+    break;
   }
 
   return false;
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index ac69e6ff5cb79..a1445187b1e98 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -17,6 +17,7 @@
 #include "flang/Semantics/tools.h"
 #include "flang/Semantics/type.h"
 #include "flang/Support/Fortran.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <set>
@@ -1770,4 +1771,318 @@ bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs) {
   }
   return false;
 }
-} // namespace Fortran::semantics
+
+namespace operation {
+template <typename T> //
+SomeExpr asSomeExpr(const T &x) {
+  auto copy{x};
+  return AsGenericExpr(std::move(copy));
+}
+
+template <bool IgnoreResizingConverts> //
+struct ArgumentExtractor
+    : public evaluate::Traverse<ArgumentExtractor<IgnoreResizingConverts>,
+          std::pair<operation::Operator, std::vector<SomeExpr>>, false> {
+  using Arguments = std::vector<SomeExpr>;
+  using Result = std::pair<operation::Operator, Arguments>;
+  using Base = evaluate::Traverse<ArgumentExtractor<IgnoreResizingConverts>,
+      Result, false>;
+  static constexpr auto IgnoreResizes = IgnoreResizingConverts;
+  static constexpr auto Logical = common::TypeCategory::Logical;
+  ArgumentExtractor() : Base(*this) {}
+
+  Result Default() const { return {}; }
+
+  using Base::operator();
+
+  template <int Kind> //
+  Result operator()(
+      const evaluate::Constant<evaluate::Type<Logical, Kind>> &x) const {
+    if (const auto &val{x.GetScalarValue()}) {
+      return val->IsTrue()
+          ? std::make_pair(operation::Operator::True, Arguments{})
+          : std::make_pair(operation::Operator::False, Arguments{});
+    }
+    return Default();
+  }
+
+  template <typename R> //
+  Result operator()(const evaluate::FunctionRef<R> &x) const {
+    Result result{operation::OperationCode(x.proc()), {}};
+    for (size_t i{0}, e{x.arguments().size()}; i != e; ++i) {
+      if (auto *e{x.UnwrapArgExpr(i)}) {
+        result.second.push_back(*e);
+      }
+    }
+    return result;
+  }
+
+  template <typename D, typename R, typename... Os>
+  Result operator()(const evaluate::Operation<D, R, Os...> &x) const {
+    if constexpr (std::is_same_v<D, evaluate::Parentheses<R>>) {
+      // Ignore top-level parentheses.
+      return (*this)(x.template operand<0>());
+    }
+    if constexpr (IgnoreResizes &&
+        std::is_same_v<D, evaluate::Convert<R, R::category>>) {
+      // Ignore conversions within the same category.
+      // Atomic operations on int(kind=1) may be implicitly widened
+      // to int(kind=4) for example.
+      return (*this)(x.template operand<0>());
+    } else {
+      return std::make_pair(operation::OperationCode(x),
+          OperationArgs(x, std::index_sequence_for<Os...>{}));
+    }
+  }
+
+  template <typename T> //
+  Result operator()(const evaluate::Designator<T> &x) const {
+    return {operation::Operator::Identity, {asSomeExpr(x)}};
+  }
+
+  template <typename T> //
+  Result operator()(const evaluate::Constant<T> &x) const {
+    return {operation::Operator::Identity, {asSomeExpr(x)}};
+  }
+
+  template <typename... Rs> //
+  Result Combine(Result &&result, Rs &&...results) const {
+    // There shouldn't be any combining needed, since we're stopping the
+    // traversal at the top-level operation, but implement one that picks
+    // the first non-empty result.
+    if constexpr (sizeof...(Rs) == 0) {
+      return std::move(result);
+    } else {
+      if (!result.second.empty()) {
+        return std::move(result);
+      } else {
+        return Combine(std::move(results)...);
+      }
+    }
+  }
+
+private:
+  template <typename D, typename R, typename... Os, size_t... Is>
+  Arguments OperationArgs(const evaluate::Operation<D, R, Os...> &x,
+      std::index_sequence<Is...>) const {
+    return Arguments{SomeExpr(x.template operand<Is>())...};
+  }
+};
+} // namespace operation
+
+std::string operation::ToString(operation::Operator op) {
+  switch (op) {
+  case Operator::Unknown:
+    return "??";
+  case Operator::Add:
+    return "+";
+  case Operator::And:
+    return "AND";
+  case Operator::Associated:
+    return "ASSOCIATED";
+  case Operator::Call:
+    return "function-call";
+  case Operator::Constant:
+    return "constant";
+  case Operator::Convert:
+    return "type-conversion";
+  case Operator::Div:
+    return "/";
+  case Operator::Eq:
+    return "==";
+  case Operator::Eqv:
+    return "EQV";
+  case Operator::False:
+    return ".FALSE.";
+  case Operator::Ge:
+    return ">=";
+  case Operator::Gt:
+    return ">";
+  case Operator::Identity:
+    return "identity";
+  case Operator::Intrinsic:
+    return "intrinsic";
+  case Operator::Le:
+    return "<=";
+  case Operator::Lt:
+    return "<";
+  case Operator::Max:
+    return "MAX";
+  case Operator::Min:
+    return "MIN";
+  case Operator::Mul:
+    return "*";
+  case Operator::Ne:
+    return "/=";
+  case Operator::Neqv:
+    return "NEQV/EOR";
+  case Operator::Not:
+    return "NOT";
+  case Operator::Or:
+    return "OR";
+  case Operator::Pow:
+    return "**";
+  case Operator::Resize:
+    return "resize";
+  case Operator::Sub:
+    return "-";
+  case Operator::True:
+    return ".TRUE.";
+  }
+  llvm_unreachable("Unhandler operator");
+}
+
+operation::Operator operation::OperationCode(
+    const evaluate::ProcedureDesignator &proc) {
+  Operator code = llvm::StringSwitch<Operator>(proc.GetName())
+                      .Case("associated", Operator::Associated)
+                      .Case("min", Operator::Min)
+                      .Case("max", Operator::Max)
+                      .Case("iand", Operator::And)
+                      .Case("ior", Operator::Or)
+                      .Case("ieor", Operator::Neqv)
+                      .Default(Operator::Call);
+  if (code == Operator::Call && proc.GetSpecificIntrinsic()) {
+    return Operator::Intrinsic;
+  }
+  return code;
+}
+
+std::pair<operation::Operator, std::vector<SomeExpr>> GetTopLevelOperation(
+    const SomeExpr &expr) {
+  return operation::ArgumentExtractor<true>{}(expr);
+}
+
+namespace operation {
+struct ConvertCollector
+    : public evaluate::Traverse<ConvertCollector,
+          std::pair<MaybeExpr, std::vector<evaluate::DynamicType>>, false> {
+  using Result = std::pair<MaybeExpr, std::vector<evaluate::DynamicType>>;
+  using Base = evaluate::Traverse<ConvertCollector, Result, false>;
+  ConvertCollector() : Base(*this) {}
+
+  Result Default() const { return {}; }
+
+  using Base::operator();
+
+  template <typename T> //
+  Result operator()(const evaluate::Designator<T> &x) const {
+    return {asSomeExpr(x), {}};
+  }
+
+  template <typename T> //
+  Result operator()(const evaluate::FunctionRef<T> &x) const {
+    return {asSomeExpr(x), {}};
+  }
+
+  template <typename T> //
+  Result operator()(const evaluate::Constant<T> &x) const {
+    return {asSomeExpr(x), {}};
+  }
+
+  template <typename D, typename R, typename... Os>
+  Result operator()(const evaluate::Operation<D, R, Os...> &x) const {
+    if constexpr (std::is_same_v<D, evaluate::Parentheses<R>>) {
+      // Ignore parentheses.
+      return (*this)(x.template operand<0>());
+    } else if constexpr (is_convert_v<D>) {
+      // Convert should always have a typed result, so it should be safe to
+      // dereference x.GetType().
+      return Combine(
+          {std::nullopt, {*x.GetType()}}, (*this)(x.template operand<0>()));
+    } else if constexpr (is_complex_constructor_v<D>) {
+      // This is a conversion iff the imaginary operand is 0.
+      if (IsZero(x.template operand<1>())) {
+        return Combine(
+            {std::nullopt, {*x.GetType()}}, (*this)(x.template operand<0>()));
+      } else {
+        return {asSomeExpr(x.derived()), {}};
+      }
+    } else {
+      return {asSomeExpr(x.derived()), {}};
+    }
+  }
+
+  template <typename... Rs> //
+  Result Combine(Result &&result, Rs &&...results) const {
+    Result v(std::move(result));
+    auto setValue{[](MaybeExpr &x, MaybeExpr &&y) {
+      assert((!x.has_value() || !y.has_value()) && "Multiple designators");
+      if (!x.has_value()) {
+        x = std::move(y);
+      }
+    }};
+    auto moveAppend{[](auto &accum, auto &&other) {
+      for (auto &&s : other) {
+        accum.push_back(std::move(s));
+      }
+    }};
+    (setValue(v.first, std::move(results).first), ...);
+    (moveAppend(v.second, std::move(results).second), ...);
+    return v;
+  }
+
+private:
+  template <typename T> //
+  static bool IsZero(const T &x) {
+    return false;
+  }
+  template <typename T> //
+  static bool IsZero(const evaluate::Expr<T> &x) {
+    return common::visit([](auto &&s) { return IsZero(s); }, x.u);
+  }
+  template <typename T> //
+  static bool IsZero(const evaluate::Constant<T> &x) {
+    if (auto &&maybeScalar{x.GetScalarValue()}) {
+      return maybeScalar->IsZero();
+    } else {
+      return false;
+    }
+  }
+
+  template <typename T> //
+  struct is_convert {
+    static constexpr bool value{false};
+  };
+  template <typename T, common::TypeCategory C> //
+  struct is_convert<evaluate::Convert<T, C>> {
+    static constexpr bool value{true};
+  };
+  template <int K> //
+  struct is_convert<evaluate::ComplexComponent<K>> {
+    // Conversion from complex to real.
+    static constexpr bool value{true};
+  };
+  template <typename T> //
+  static constexpr bool is_convert_v = is_convert<T>::value;
+
+  template <typename T> //
+  struct is_complex_constructor {
+    static constexpr bool value{false};
+  };
+  template <int K> //
+  struct is_complex_constructor<evaluate::ComplexConstructor<K>> {
+    static constexpr bool value{true};
+  };
+  template <typename T> //
+  static constexpr bool is_complex_constructor_v =
+      is_complex_constructor<T>::value;
+};
+} // namespace operation
+
+MaybeExpr GetConvertInput(const SomeExpr &x) {
+  // This returns SomeExpr(x) when x is a designator/functionref/constant.
+  return operation::ConvertCollector{}(x).first;
+}
+
+bool IsSameOrConvertOf(const SomeExpr &expr, const SomeExpr &x) {
+  // Check if expr is same as x, or a sequence of Convert operations on x.
+  if (expr == x) {
+    return true;
+  } else if (auto maybe{GetConvertInput(expr)}) {
+    return *maybe == x;
+  } else {
+    return false;
+  }
+}
+} // namespace Fortran::semantics
\ No newline at end of file
diff --git a/flang/test/Examples/omp-atomic.f90 b/flang/test/Examples/omp-atomic.f90
index dcca34b633a3e..934f84f132484 100644
--- a/flang/test/Examples/omp-atomic.f90
+++ b/flang/test/Examples/omp-atomic.f90
@@ -26,25 +26,31 @@
 ! CHECK:---
 ! CHECK-NEXT:- file:            '{{[^"]*}}omp-atomic.f90'
 ! CHECK-NEXT:  line:            9
-! CHECK-NEXT:  construct:       atomic-read
+! CHECK-NEXT:  construct:       atomic
 ! CHECK-NEXT:  clauses:
-! CHECK-NEXT:    - clause:      seq_cst
+! CHECK-NEXT:    - clause:      read
 ! CHECK-NEXT:      details:     ''
+! CHECK-NEXT:    - clause:      seq_cst
+! CHECK-NEXT:      details:     'name_modifier=atomic;'
 ! CHECK-NEXT:- file:            '{{[^"]*}}omp-atomic.f90'
 ! CHECK-NEXT:  line:            12
-! CHECK-NEXT:  construct:       atomic-write
+! CHECK-NEXT:  construct:       atomic
 ! CHECK-NEXT:  clauses:
 ! CHECK-NEXT:    - clause:      seq_cst
+! CHECK-NEXT:      details:     'name_modifier=atomic;'
+! CHECK-NEXT:    - clause:      write
 ! CHECK-NEXT:      details:     ''
 ! CHECK-NEXT:- file:            '{{[^"]*}}omp-atomic.f90'
 ! CHECK-NEXT:  line:            16
-! CHECK-NEXT:  construct:       atomic-capture
+! CHECK-NEXT:  construct:       atomic
 ! CHECK-NEXT:  clauses:
+! CHECK-NEXT:    - clause:      capture
+! CHECK-NEXT:      details:     'name_modifier=atomic;name_modifier=atomic;'
 ! CHECK-NEXT:    - clause:      seq_cst
 ! CHECK-NEXT:      details:     ''
 ! CHECK-NEXT:- file:            '{{[^"]*}}omp-atomic.f90'
 ! CHECK-NEXT:  line:            21
-! CHECK-NEXT:  construct:       atomic-atomic
+! CHECK-NEXT:  construct:       atomic
 ! CHECK-NEXT:  clauses:         []
 ! CHECK-NEXT:- file:            '{{[^"]*}}omp-atomic.f90'
 ! CHECK-NEXT:  line:            8
diff --git a/flang/test/Lower/OpenMP/Todo/atomic-compare-fail.f90 b/flang/test/Lower/OpenMP/Todo/atomic-compare-fail.f90
index b82bd13622764..6f58e0939a787 100644
--- a/flang/test/Lower/OpenMP/Todo/atomic-compare-fail.f90
+++ b/flang/test/Lower/OpenMP/Todo/atomic-compare-fail.f90
@@ -1,6 +1,6 @@
 ! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
 
-! CHECK: not yet implemented: OpenMP atomic compare
+! CHECK: not yet implemented: OpenMP ATOMIC COMPARE
 program p
   integer :: x
   logical :: r
diff --git a/flang/test/Lower/OpenMP/Todo/atomic-compare.f90 b/flang/test/Lower/OpenMP/Todo/atomic-compare.f90
index 88ec6fe910b9e..6729be6e5cf8b 100644
--- a/flang/test/Lower/OpenMP/Todo/atomic-compare.f90
+++ b/flang/test/Lower/OpenMP/Todo/atomic-compare.f90
@@ -1,6 +1,6 @@
 ! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
 
-! CHECK: not yet implemented: OpenMP atomic compare
+! CHECK: not yet implemented: OpenMP ATOMIC COMPARE
 program p
   integer :: x
   logical :: r
diff --git a/flang/test/Lower/OpenMP/atomic-capture.f90 b/flang/test/Lower/OpenMP/atomic-capture.f90
index 2f800d534dc36..14fd0c942a9b4 100644
--- a/flang/test/Lower/OpenMP/atomic-capture.f90
+++ b/flang/test/Lower/OpenMP/atomic-capture.f90
@@ -79,16 +79,16 @@ subroutine pointers_in_atomic_capture()
 !CHECK: %[[VAL_A_BOX_ADDR:.*]] = fir.box_addr %[[VAL_A_LOADED]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
 !CHECK: %[[VAL_B_LOADED:.*]] = fir.load %[[VAL_B_DECLARE]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 !CHECK: %[[VAL_B_BOX_ADDR:.*]] = fir.box_addr %[[VAL_B_LOADED]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
+!CHECK: %[[VAL_B:.*]] = fir.load %[[VAL_B_BOX_ADDR]] : !fir.ptr<i32>
 !CHECK: %[[VAL_B_LOADED_2:.*]] = fir.load %[[VAL_B_DECLARE]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 !CHECK: %[[VAL_B_BOX_ADDR_2:.*]] = fir.box_addr %[[VAL_B_LOADED_2]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
-!CHECK: %[[VAL_B:.*]] = fir.load %[[VAL_B_BOX_ADDR_2]] : !fir.ptr<i32>
 !CHECK: omp.atomic.capture {
 !CHECK: omp.atomic.update %[[VAL_A_BOX_ADDR]] : !fir.ptr<i32> {
 !CHECK: ^bb0(%[[ARG:.*]]: i32):
 !CHECK: %[[TEMP:.*]] = arith.addi %[[ARG]], %[[VAL_B]] : i32
 !CHECK: omp.yield(%[[TEMP]] : i32)
 !CHECK: }
-!CHECK: omp.atomic.read %[[VAL_B_BOX_ADDR]] = %[[VAL_A_BOX_ADDR]] : !fir.ptr<i32>, !fir.ptr<i32>, i32
+!CHECK: omp.atomic.read %[[VAL_B_BOX_ADDR_2]] = %[[VAL_A_BOX_ADDR]] : !fir.ptr<i32>, !fir.ptr<i32>, i32
 !CHECK: }
 !CHECK: return
 !CHECK: }
diff --git a/flang/test/Lower/OpenMP/atomic-implicit-cast.f90 b/flang/test/Lower/OpenMP/atomic-implicit-cast.f90
index 4c1be1ca91ac0..5e00235b85e74 100644
--- a/flang/test/Lower/OpenMP/atomic-implicit-cast.f90
+++ b/flang/test/Lower/OpenMP/atomic-implicit-cast.f90
@@ -1,5 +1,3 @@
-! REQUIRES : openmp_runtime
-
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
 
 ! CHECK: func.func @_QPatomic_implicit_cast_read() {
@@ -97,9 +95,9 @@ subroutine atomic_implicit_cast_read
 ! CHECK: }
 ! CHECK: omp.atomic.read %[[ALLOCA6]] = %[[X_DECL]]#0 : !fir.ref<i32>, !fir.ref<i32>, i32
 ! CHECK: %[[LOAD:.*]] = fir.load %[[ALLOCA6]] : !fir.ref<i32>
-! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
 ! CHECK: %[[CVT:.*]] = fir.convert %[[LOAD]] : (i32) -> f32
 ! CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f32>
 ! CHECK: %[[IDX1:.*]] = fir.insert_value %[[UNDEF]], %[[CVT]], [0 : index] : (complex<f32>, f32) -> complex<f32>
 ! CHECK: %[[IDX2:.*]] = fir.insert_value %[[IDX1]], %[[CST]], [1 : index] : (complex<f32>, f32) -> complex<f32>
 ! CHECK: fir.store %[[IDX2]] to %[[W_DECL]]#0 : !fir.ref<complex<f32>>
@@ -109,14 +107,14 @@ subroutine atomic_implicit_cast_read
      !$omp end atomic
 
 
-! CHECK: omp.atomic.capture {
-! CHECK: omp.atomic.update %[[M_DECL]]#0 : !fir.ref<complex<f64>> {
-! CHECK: ^bb0(%[[ARG:.*]]: complex<f64>):
 ! CHECK: %[[CST1:.*]] = arith.constant 1.000000e+00 : f64
 ! CHECK: %[[CST2:.*]] = arith.constant 0.000000e+00 : f64
 ! CHECK: %[[UNDEF:.*]] = fir.undefined complex<f64>
 ! CHECK: %[[IDX1:.*]] = fir.insert_value %[[UNDEF]], %[[CST1]], [0 : index] : (complex<f64>, f64) -> complex<f64>
 ! CHECK: %[[IDX2:.*]] = fir.insert_value %[[IDX1]], %[[CST2]], [1 : index] : (complex<f64>, f64) -> complex<f64>
+! CHECK: omp.atomic.capture {
+! CHECK: omp.atomic.update %[[M_DECL]]#0 : !fir.ref<complex<f64>> {
+! CHECK: ^bb0(%[[ARG:.*]]: complex<f64>):
 ! CHECK: %[[RESULT:.*]] = fir.addc %[[ARG]], %[[IDX2]] {fastmath = #arith.fastmath<contract>} : complex<f64>
 ! CHECK: omp.yield(%[[RESULT]] : complex<f64>)
 ! CHECK: }
diff --git a/flang/test/Lower/OpenMP/atomic-privatize.f90 b/flang/test/Lower/OpenMP/atomic-privatize.f90
index f922095264fca..c876266cf018c 100644
--- a/flang/test/Lower/OpenMP/atomic-privatize.f90
+++ b/flang/test/Lower/OpenMP/atomic-privatize.f90
@@ -8,7 +8,7 @@
 
 !CHECK: omp.task private(@_QFfredEprv_firstprivate_i32 %{{[0-9]+}}#0 -> %arg0
 !CHECK: %[[DECL:[0-9]+]]:2 = hlfir.declare %arg0 {uniq_name = "_QFfredEprv"}
-!CHECK: omp.atomic.update %[[DECL]]#0
+!CHECK: omp.atomic.update memory_order(relaxed) %[[DECL]]#0
 
 integer function fred
   integer :: prv
diff --git a/flang/test/Lower/OpenMP/atomic-write.f90 b/flang/test/Lower/OpenMP/atomic-write.f90
index 13392ad76471f..6eded49b0b15d 100644
--- a/flang/test/Lower/OpenMP/atomic-write.f90
+++ b/flang/test/Lower/OpenMP/atomic-write.f90
@@ -44,9 +44,9 @@ end program OmpAtomicWrite
 !CHECK-LABEL: func.func @_QPatomic_write_pointer() {
 !CHECK:    %[[X_REF:.*]] = fir.alloca !fir.box<!fir.ptr<i32>> {bindc_name = "x", uniq_name = "_QFatomic_write_pointerEx"}
 !CHECK:    %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFatomic_write_pointerEx"} : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> (!fir.ref<!fir.box<!fir.ptr<i32>>>, !fir.ref<!fir.box<!fir.ptr<i32>>>)
-!CHECK:    %[[C1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[X_ADDR_BOX:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 !CHECK:    %[[X_POINTEE_ADDR:.*]] = fir.box_addr %[[X_ADDR_BOX]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
+!CHECK:    %[[C1:.*]] = arith.constant 1 : i32
 !CHECK:    omp.atomic.write %[[X_POINTEE_ADDR]] = %[[C1]]   : !fir.ptr<i32>, i32
 !CHECK:    %[[C2:.*]] = arith.constant 2 : i32
 !CHECK:    %[[X_ADDR_BOX:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
diff --git a/flang/test/Lower/OpenMP/dump-atomic-analysis.f90 b/flang/test/Lower/OpenMP/dump-atomic-analysis.f90
new file mode 100644
index 0000000000000..cbaf7bc9f2d8a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/dump-atomic-analysis.f90
@@ -0,0 +1,82 @@
+!RUN: %flang_fc1 -fopenmp -fopenmp-version=60 -emit-hlfir -mmlir -fdebug-dump-atomic-analysis %s -o /dev/null 2>&1 | FileCheck %s
+
+subroutine f00(x)
+  integer :: x, v
+  !$omp atomic read
+    v = x
+end
+
+!CHECK: Analysis {
+!CHECK-NEXT:   atom: x
+!CHECK-NEXT:   cond: <null>
+!CHECK-NEXT:   op0 {
+!CHECK-NEXT:     what: Read
+!CHECK-NEXT:     assign: v=x
+!CHECK-NEXT:   }
+!CHECK-NEXT:   op1 {
+!CHECK-NEXT:     what: None
+!CHECK-NEXT:     assign: <null>
+!CHECK-NEXT:   }
+!CHECK-NEXT: }
+
+
+subroutine f01(v)
+  integer :: x, v
+  !$omp atomic write
+    x = v
+end
+
+!CHECK: Analysis {
+!CHECK-NEXT:   atom: x
+!CHECK-NEXT:   cond: <null>
+!CHECK-NEXT:   op0 {
+!CHECK-NEXT:     what: Write
+!CHECK-NEXT:     assign: x=v
+!CHECK-NEXT:   }
+!CHECK-NEXT:   op1 {
+!CHECK-NEXT:     what: None
+!CHECK-NEXT:     assign: <null>
+!CHECK-NEXT:   }
+!CHECK-NEXT: }
+
+
+subroutine f02(x, v)
+  integer :: x, v
+  !$omp atomic update
+    x = x + v
+end
+
+!CHECK: Analysis {
+!CHECK-NEXT:   atom: x
+!CHECK-NEXT:   cond: <null>
+!CHECK-NEXT:   op0 {
+!CHECK-NEXT:     what: Update
+!CHECK-NEXT:     assign: x=x+v
+!CHECK-NEXT:   }
+!CHECK-NEXT:   op1 {
+!CHECK-NEXT:     what: None
+!CHECK-NEXT:     assign: <null>
+!CHECK-NEXT:   }
+!CHECK-NEXT: }
+
+
+subroutine f03(x, v)
+  integer :: x, v, t
+  !$omp atomic update capture
+    t = x
+    x = x + v
+  !$omp end atomic
+end
+
+!CHECK: Analysis {
+!CHECK-NEXT:   atom: x
+!CHECK-NEXT:   cond: <null>
+!CHECK-NEXT:   op0 {
+!CHECK-NEXT:     what: Read
+!CHECK-NEXT:     assign: t=x
+!CHECK-NEXT:   }
+!CHECK-NEXT:   op1 {
+!CHECK-NEXT:     what: Update
+!CHECK-NEXT:     assign: x=x+v
+!CHECK-NEXT:   }
+!CHECK-NEXT: }
diff --git a/flang/test/Parser/OpenMP/atomic-compare.f90 b/flang/test/Parser/OpenMP/atomic-compare.f90
index 5cd02698ff482..e09da4a359fcc 100644
--- a/flang/test/Parser/OpenMP/atomic-compare.f90
+++ b/flang/test/Parser/OpenMP/atomic-compare.f90
@@ -1,16 +1,290 @@
-! RUN: not %flang_fc1  -fopenmp-version=51 -fopenmp %s 2>&1 | FileCheck %s
-! OpenMP version for documentation purposes only - it isn't used until Sema.
-! This is testing for Parser errors that bail out before Sema. 
-program main
-   implicit none
-   integer :: i, j = 10
-   logical :: r
-
-  !CHECK: error: expected OpenMP construct
-  !$omp atomic compare write
-  r =  i .eq. j + 1
-
-  !CHECK: error: expected end of line
-  !$omp atomic compare num_threads(4)
-  r = i .eq. j
-end program main
+!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s
+!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine f00(a, b)
+  integer :: a, b
+  integer :: x
+  !$omp atomic update compare
+  if (x < a) x = b
+end
+
+!UNPARSE: SUBROUTINE f00 (a, b)
+!UNPARSE:  INTEGER a, b
+!UNPARSE:  INTEGER x
+!UNPARSE: !$OMP ATOMIC UPDATE COMPARE
+!UNPARSE:  IF (x<a)  x=b
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update ->
+!PARSE-TREE: | | OmpClause -> Compare
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> IfStmt
+!PARSE-TREE: | | | Scalar -> Logical -> Expr = 'x<a'
+!PARSE-TREE: | | | | LT
+!PARSE-TREE: | | | | | Expr = 'x'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | Expr = 'a'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'a'
+!PARSE-TREE: | | | ActionStmt -> AssignmentStmt = 'x=b'
+!PARSE-TREE: | | | | Variable = 'x'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | Expr = 'b'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'b'
+
+subroutine f01(a, b)
+  integer :: a, b
+  integer :: x
+  !$omp atomic update compare
+  if (x < a) then
+    x = b
+  endif
+end
+
+!UNPARSE: SUBROUTINE f01 (a, b)
+!UNPARSE:  INTEGER a, b
+!UNPARSE:  INTEGER x
+!UNPARSE: !$OMP ATOMIC UPDATE COMPARE
+!UNPARSE:  IF (x<a) THEN
+!UNPARSE:    x=b
+!UNPARSE:  END IF
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update ->
+!PARSE-TREE: | | OmpClause -> Compare
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct
+!PARSE-TREE: | | | IfThenStmt
+!PARSE-TREE: | | | | Scalar -> Logical -> Expr = 'x<a'
+!PARSE-TREE: | | | | | LT
+!PARSE-TREE: | | | | | | Expr = 'x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | | Expr = 'a'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'a'
+!PARSE-TREE: | | | Block
+!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=b'
+!PARSE-TREE: | | | | | Variable = 'x'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | Expr = 'b'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'b'
+!PARSE-TREE: | | | EndIfStmt ->
+
+subroutine f02(a, b)
+  integer :: a, b
+  integer :: x
+  logical :: c
+  c = x < a
+  !$omp atomic update compare
+  if (c) then
+    x = b
+  endif
+end
+
+!UNPARSE: SUBROUTINE f02 (a, b)
+!UNPARSE:  INTEGER a, b
+!UNPARSE:  INTEGER x
+!UNPARSE:  LOGICAL c
+!UNPARSE:   c=x<a
+!UNPARSE: !$OMP ATOMIC UPDATE COMPARE
+!UNPARSE:  IF (c) THEN
+!UNPARSE:    x=b
+!UNPARSE:  END IF
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'c=x<a'
+!PARSE-TREE: | Variable = 'c'
+!PARSE-TREE: | | Designator -> DataRef -> Name = 'c'
+!PARSE-TREE: | Expr = 'x<a'
+!PARSE-TREE: | | LT
+!PARSE-TREE: | | | Expr = 'x'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | Expr = 'a'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'a'
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update ->
+!PARSE-TREE: | | OmpClause -> Compare
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct
+!PARSE-TREE: | | | IfThenStmt
+!PARSE-TREE: | | | | Scalar -> Logical -> Expr = 'c'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'c'
+!PARSE-TREE: | | | Block
+!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=b'
+!PARSE-TREE: | | | | | Variable = 'x'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | Expr = 'b'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'b'
+!PARSE-TREE: | | | EndIfStmt ->
+
+subroutine g00(a, b)
+  integer :: a, b
+  integer :: x, v
+  !$omp atomic update capture compare
+  v = x
+  if (x < a) x = b
+  !$omp end atomic
+end
+
+!UNPARSE: SUBROUTINE g00 (a, b)
+!UNPARSE:  INTEGER a, b
+!UNPARSE:  INTEGER x, v
+!UNPARSE: !$OMP ATOMIC UPDATE CAPTURE COMPARE
+!UNPARSE:   v=x
+!UNPARSE:  IF (x<a)  x=b
+!UNPARSE: !$OMP END ATOMIC
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update ->
+!PARSE-TREE: | | OmpClause -> Capture
+!PARSE-TREE: | | OmpClause -> Compare
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x'
+!PARSE-TREE: | | | Variable = 'v'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'v'
+!PARSE-TREE: | | | Expr = 'x'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> IfStmt
+!PARSE-TREE: | | | Scalar -> Logical -> Expr = 'x<a'
+!PARSE-TREE: | | | | LT
+!PARSE-TREE: | | | | | Expr = 'x'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | Expr = 'a'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'a'
+!PARSE-TREE: | | | ActionStmt -> AssignmentStmt = 'x=b'
+!PARSE-TREE: | | | | Variable = 'x'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | Expr = 'b'
+!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'b'
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList ->
+!PARSE-TREE: | | Flags = None
+
+subroutine g01(a, b)
+  integer :: a, b
+  integer :: x, v
+  !$omp atomic update capture compare
+  v = x
+  if (x < a) then
+    x = b
+  endif
+  !$omp end atomic
+end
+
+!UNPARSE: SUBROUTINE g01 (a, b)
+!UNPARSE:  INTEGER a, b
+!UNPARSE:  INTEGER x, v
+!UNPARSE: !$OMP ATOMIC UPDATE CAPTURE COMPARE
+!UNPARSE:   v=x
+!UNPARSE:  IF (x<a) THEN
+!UNPARSE:    x=b
+!UNPARSE:  END IF
+!UNPARSE: !$OMP END ATOMIC
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update ->
+!PARSE-TREE: | | OmpClause -> Capture
+!PARSE-TREE: | | OmpClause -> Compare
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x'
+!PARSE-TREE: | | | Variable = 'v'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'v'
+!PARSE-TREE: | | | Expr = 'x'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct
+!PARSE-TREE: | | | IfThenStmt
+!PARSE-TREE: | | | | Scalar -> Logical -> Expr = 'x<a'
+!PARSE-TREE: | | | | | LT
+!PARSE-TREE: | | | | | | Expr = 'x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | | Expr = 'a'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'a'
+!PARSE-TREE: | | | Block
+!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=b'
+!PARSE-TREE: | | | | | Variable = 'x'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | Expr = 'b'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'b'
+!PARSE-TREE: | | | EndIfStmt ->
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList ->
+!PARSE-TREE: | | Flags = None
+
+subroutine g02(a, b)
+  integer :: a, b
+  integer :: x, v
+  !$omp atomic update capture compare
+  if (x < a) then
+    x = b
+  else
+    v = x
+  endif
+  !$omp end atomic
+end
+
+!UNPARSE: SUBROUTINE g02 (a, b)
+!UNPARSE:  INTEGER a, b
+!UNPARSE:  INTEGER x, v
+!UNPARSE: !$OMP ATOMIC UPDATE CAPTURE COMPARE
+!UNPARSE:  IF (x<a) THEN
+!UNPARSE:    x=b
+!UNPARSE:  ELSE
+!UNPARSE:    v=x
+!UNPARSE:  END IF
+!UNPARSE: !$OMP END ATOMIC
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update ->
+!PARSE-TREE: | | OmpClause -> Capture
+!PARSE-TREE: | | OmpClause -> Compare
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct
+!PARSE-TREE: | | | IfThenStmt
+!PARSE-TREE: | | | | Scalar -> Logical -> Expr = 'x<a'
+!PARSE-TREE: | | | | | LT
+!PARSE-TREE: | | | | | | Expr = 'x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | | Expr = 'a'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'a'
+!PARSE-TREE: | | | Block
+!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=b'
+!PARSE-TREE: | | | | | Variable = 'x'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | | | Expr = 'b'
+!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'b'
+!PARSE-TREE: | | | ElseBlock
+!PARSE-TREE: | | | | ElseStmt ->
+!PARSE-TREE: | | | | Block
+!PARSE-TREE: | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x'
+!PARSE-TREE: | | | | | | Variable = 'v'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'v'
+!PARSE-TREE: | | | | | | Expr = 'x'
+!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | | | EndIfStmt ->
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList ->
+!PARSE-TREE: | | Flags = None
diff --git a/flang/test/Parser/OpenMP/atomic-end.f90 b/flang/test/Parser/OpenMP/atomic-end.f90
new file mode 100644
index 0000000000000..e5eac87517b1e
--- /dev/null
+++ b/flang/test/Parser/OpenMP/atomic-end.f90
@@ -0,0 +1,63 @@
+!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s
+!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine f00
+  integer :: x, v
+  !$omp atomic read
+  v = x
+  !$omp end atomic
+end
+
+!UNPARSE: SUBROUTINE f00
+!UNPARSE:  INTEGER x, v
+!UNPARSE: !$OMP ATOMIC READ
+!UNPARSE:   v=x
+!UNPARSE: !$OMP END ATOMIC
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> Read
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x'
+!PARSE-TREE: | | | Variable = 'v'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'v'
+!PARSE-TREE: | | | Expr = 'x'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList ->
+!PARSE-TREE: | | Flags = None
+
+
+subroutine f01
+  integer :: x, v
+  !$omp atomic read
+  v = x
+  !$omp endatomic
+end
+
+!UNPARSE: SUBROUTINE f01
+!UNPARSE:  INTEGER x, v
+!UNPARSE: !$OMP ATOMIC READ
+!UNPARSE:   v=x
+!UNPARSE: !$OMP END ATOMIC
+!UNPARSE: END SUBROUTINE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList -> OmpClause -> Read
+!PARSE-TREE: | | Flags = None
+!PARSE-TREE: | Block
+!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x'
+!PARSE-TREE: | | | Variable = 'v'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'v'
+!PARSE-TREE: | | | Expr = 'x'
+!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x'
+!PARSE-TREE: | OmpDirectiveSpecification
+!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic
+!PARSE-TREE: | | OmpClauseList ->
+!PARSE-TREE: | | Flags = None
diff --git a/flang/test/Semantics/OpenMP/atomic-compare.f90 b/flang/test/Semantics/OpenMP/atomic-compare.f90
index 54492bf6a22a6..11e23e062bce7 100644
--- a/flang/test/Semantics/OpenMP/atomic-compare.f90
+++ b/flang/test/Semantics/OpenMP/atomic-compare.f90
@@ -44,46 +44,37 @@
   !$omp end atomic
 
   ! Check for error conditions:
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the COMPARE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst seq_cst compare
   if (b .eq. c) b = a
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the COMPARE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic compare seq_cst seq_cst
   if (b .eq. c) b = a
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the COMPARE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst compare seq_cst
   if (b .eq. c) b = a
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the COMPARE directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic acquire acquire compare
   if (b .eq. c) b = a
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the COMPARE directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic compare acquire acquire
   if (b .eq. c) b = a
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the COMPARE directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic acquire compare acquire
   if (b .eq. c) b = a
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the COMPARE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed relaxed compare
   if (b .eq. c) b = a
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the COMPARE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic compare relaxed relaxed
   if (b .eq. c) b = a
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the COMPARE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed compare relaxed
   if (b .eq. c) b = a
 
-  !ERROR: More than one FAIL clause not allowed on OpenMP ATOMIC construct
+  !ERROR: At most one FAIL clause can appear on the ATOMIC directive
   !$omp atomic fail(release) compare fail(release)
   if (c .eq. a) a = b
   !$omp end atomic
diff --git a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90 b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
index c13a11a8dd5dc..8adb0f1a67409 100644
--- a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
+++ b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
@@ -16,20 +16,21 @@ program sample
     !$omp atomic read hint(2)
         y = x    
      
-    !ERROR: Hint clause value is not a valid OpenMP synchronization value
+    !ERROR: The synchronization hint is not valid
     !$omp atomic hint(3)
         y = y + 10
     
     !$omp atomic update hint(5)
         y = x + y
     
-    !ERROR: Hint clause value is not a valid OpenMP synchronization value
+    !ERROR: The synchronization hint is not valid
     !$omp atomic hint(7) capture
+    !WARNING: In ATOMIC UPDATE operation with CAPTURE either statement could be the update and the capture, assuming the first one is the capture statement
         y = x
         x = y
     !$omp end atomic
    
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must be a constant value
     !$omp atomic update hint(x)
         y = y * 1
@@ -46,7 +47,7 @@ program sample
     !$omp atomic hint(omp_lock_hint_speculative)
         x = y + x
     
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must be a constant value
     !$omp atomic hint(omp_sync_hint_uncontended + omp_sync_hint) read
         y = x 
@@ -69,36 +70,36 @@ program sample
     !$omp atomic hint(omp_lock_hint_contended + omp_sync_hint_nonspeculative)
         x = y + x
 
-    !ERROR: Hint clause value is not a valid OpenMP synchronization value
+    !ERROR: The synchronization hint is not valid
     !$omp atomic hint(omp_sync_hint_uncontended + omp_sync_hint_contended) read
         y = x 
 
-    !ERROR: Hint clause value is not a valid OpenMP synchronization value
+    !ERROR: The synchronization hint is not valid
     !$omp atomic hint(omp_sync_hint_nonspeculative + omp_lock_hint_speculative)
         y = y * 9
 
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must have INTEGER type, but is REAL(4)
     !$omp atomic hint(1.0) read
         y = x
 
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Operands of + must be numeric; have LOGICAL(4) and INTEGER(4)
     !$omp atomic hint(z + omp_sync_hint_nonspeculative) read
         y = x
 
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must be a constant value
     !$omp atomic hint(k + omp_sync_hint_speculative) read
         y = x
 
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must be a constant value
     !$omp atomic hint(p(1) + omp_sync_hint_uncontended) write
         x = 10 * y
 
     !$omp atomic write hint(a)
-    !ERROR: RHS expression on atomic assignment statement cannot access 'x'
+    !ERROR: Within atomic operation x and y+x access the same storage
         x = y + x
 
     !$omp atomic hint(abs(-1)) write
diff --git a/flang/test/Semantics/OpenMP/atomic-read.f90 b/flang/test/Semantics/OpenMP/atomic-read.f90
new file mode 100644
index 0000000000000..06c301cb78b77
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/atomic-read.f90
@@ -0,0 +1,118 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60
+
+subroutine f00
+  integer :: x, v
+  ! The end-directive is optional in ATOMIC READ. Expect no diagnostics.
+  !$omp atomic read
+  v = x
+
+  !$omp atomic read
+  v = x
+  !$omp end atomic
+end
+
+subroutine f01
+  integer, pointer :: x, v
+  ! Intrinsic assignment and pointer assignment are both ok. Expect no
+  ! diagnostics.
+  !$omp atomic read
+  v = x
+
+  !$omp atomic read
+  v => x
+end
+
+subroutine f02(i)
+  integer :: i, v
+  interface
+    function p(i)
+      integer, pointer :: p
+      integer :: i
+    end
+  end interface
+
+  ! Atomic variable can be a function reference. Expect no diagostics.
+  !$omp atomic read
+  v = p(i)
+end
+
+subroutine f03
+  integer :: x(3), y(5), v(3)
+
+  !$omp atomic read
+  !ERROR: Atomic variable x should be a scalar
+  v = x
+
+  !$omp atomic read
+  !ERROR: Atomic variable y(2_8:4_8:1_8) should be a scalar
+  v = y(2:4)
+end
+
+subroutine f04
+  integer :: x, y(3), v
+
+  !$omp atomic read
+  !ERROR: Within atomic operation x and x access the same storage
+  x = x
+
+  ! Accessing same array, but not the same storage. Expect no diagnostics.
+  !$omp atomic read
+  y(1) = y(2)
+end
+
+subroutine f05
+  integer :: x, v
+
+  !$omp atomic read
+  !ERROR: Atomic expression x+1_4 should be a variable
+  v = x + 1
+end
+
+subroutine f06
+  character :: x, v
+
+  !$omp atomic read
+  !ERROR: Atomic variable x cannot have CHARACTER type
+  v = x
+end
+
+subroutine f07
+  integer, allocatable :: x
+  integer :: v
+
+  allocate(x)
+
+  !$omp atomic read
+  !ERROR: Atomic variable x cannot be ALLOCATABLE
+  v = x
+end
+
+subroutine f08
+  type :: struct
+    integer :: m
+  end type
+  type(struct) :: x, v
+
+  !$omp atomic read
+  !ERROR: Atomic variable x should have an intrinsic type
+  v = x
+end
+
+subroutine f09(x, v)
+  class(*), pointer :: x, v
+
+  !$omp atomic read
+  !ERROR: Atomic variable x cannot be a pointer to a polymorphic type
+  v => x
+end
+
+subroutine f10(x, v)
+  type struct(length)
+    integer, len :: length
+  end type
+  type(struct(*)), pointer :: x, v
+
+  !$omp atomic read
+  !ERROR: Atomic variable x is a pointer to a type with non-constant length parameter
+  v => x
+end
diff --git a/flang/test/Semantics/OpenMP/atomic-update-capture.f90 b/flang/test/Semantics/OpenMP/atomic-update-capture.f90
new file mode 100644
index 0000000000000..f808ed916fb7e
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/atomic-update-capture.f90
@@ -0,0 +1,77 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60
+
+subroutine f00
+  integer :: x, y, v
+
+  !ERROR: ATOMIC UPDATE operation with CAPTURE should contain two statements
+  !$omp atomic update capture
+  x = v
+  x = x + 1
+  y = x
+  !$omp end atomic
+end
+
+subroutine f01
+  integer :: x, y, v
+
+  !ERROR: ATOMIC UPDATE operation with CAPTURE should contain two assignments
+  !$omp atomic update capture
+  x = v
+  block
+    x = x + 1
+    y = x
+  end block
+  !$omp end atomic
+end
+
+subroutine f02
+  integer :: x, y
+
+  ! The update and capture statements can be inside of a single BLOCK.
+  ! The end-directive is then optional. Expect no diagnostics.
+  !$omp atomic update capture
+  block
+    x = x + 1
+    y = x
+  end block
+end
+
+subroutine f03
+  integer :: x
+
+  !ERROR: In ATOMIC UPDATE operation with CAPTURE neither statement could be the capture
+  !$omp atomic update capture
+  x = x + 1
+  x = x + 2
+  !$omp end atomic
+end
+
+subroutine f04
+  integer :: x, v
+
+  !$omp atomic update capture
+  !WARNING: In ATOMIC UPDATE operation with CAPTURE either statement could be the update and the capture, assuming the first one is the capture statement
+  v = x
+  x = v
+  !$omp end atomic
+end
+
+subroutine f05
+  integer :: x, v, z
+
+  !$omp atomic update capture
+  !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read z
+  v = x
+  z = x + 1
+  !$omp end atomic
+end
+
+subroutine f06
+  integer :: x, v, z
+
+  !$omp atomic update capture
+  z = x + 1
+  !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read z
+  v = x
+  !$omp end atomic
+end
diff --git a/flang/test/Semantics/OpenMP/atomic-update-only.f90 b/flang/test/Semantics/OpenMP/atomic-update-only.f90
new file mode 100644
index 0000000000000..28d0e264359cb
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/atomic-update-only.f90
@@ -0,0 +1,83 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60
+
+subroutine f00
+  integer :: x, y
+
+  ! The x is a direct argument of the + operator. Expect no diagnostics.
+  !$omp atomic update
+  x = x + (y - 1)
+end
+
+subroutine f01
+  integer :: x
+
+  ! x + 0 is unusual, but legal. Expect no diagnostics.
+  !$omp atomic update
+  x = x + 0
+end
+
+subroutine f02
+  integer :: x
+
+  ! This is formally not allowed by the syntax restrictions of the spec,
+  ! but it's equivalent to either x+0 or x*1, both of which are legal.
+  ! Allow this case. Expect no diagnostics.
+  !$omp atomic update
+  x = x
+end
+
+subroutine f03
+  integer :: x, y
+
+  !$omp atomic update
+  !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
+  x = (x + y) + 1
+end
+
+subroutine f04
+  integer :: x
+  real :: y
+
+  !$omp atomic update
+  !ERROR: This intrinsic function is not a valid ATOMIC UPDATE operation
+  x = floor(x + y)
+end
+
+subroutine f05
+  integer :: x
+  real :: y
+
+  ! An explicit conversion is accepted as an extension.
+  !$omp atomic update
+  x = int(x + y)
+end
+
+subroutine f06
+  integer :: x, y
+  interface
+    function f(i, j)
+      integer :: f, i, j
+    end
+  end interface
+
+  !$omp atomic update
+  !ERROR: A call to this function is not a valid ATOMIC UPDATE operation
+  x = f(x, y)
+end
+
+subroutine f07
+  real :: x
+  integer :: y
+
+  !$omp atomic update
+  !ERROR: The ** operator is not a valid ATOMIC UPDATE operation
+  x = x ** y
+end
+
+subroutine f08
+  integer :: x, y
+
+  !$omp atomic update
+  !ERROR: The atomic variable x should appear as an argument in the update operation
+  x = y
+end
diff --git a/flang/test/Semantics/OpenMP/atomic-update-overloaded-ops.f90 b/flang/test/Semantics/OpenMP/atomic-update-overloaded-ops.f90
index 21a9b87d26345..3084376b4275d 100644
--- a/flang/test/Semantics/OpenMP/atomic-update-overloaded-ops.f90
+++ b/flang/test/Semantics/OpenMP/atomic-update-overloaded-ops.f90
@@ -22,10 +22,10 @@ program sample
         x = x / y
      
     !$omp atomic update
-    !ERROR: Invalid or missing operator in atomic update statement
+    !ERROR: A call to this function is not a valid ATOMIC UPDATE operation
         x = x .MYOPERATOR. y
 
     !$omp atomic
-    !ERROR: Invalid or missing operator in atomic update statement
+    !ERROR: A call to this function is not a valid ATOMIC UPDATE operation
         x = x .MYOPERATOR. y
 end program
diff --git a/flang/test/Semantics/OpenMP/atomic-write.f90 b/flang/test/Semantics/OpenMP/atomic-write.f90
new file mode 100644
index 0000000000000..7965ad2dc7dbf
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/atomic-write.f90
@@ -0,0 +1,81 @@
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60
+
+subroutine f00
+  integer :: x, v
+  ! The end-directive is optional in ATOMIC WRITE. Expect no diagnostics.
+  !$omp atomic write
+  x = v + 1
+
+  !$omp atomic write
+  x = v + 3
+  !$omp end atomic
+end
+
+subroutine f01
+  integer, pointer :: x, v
+  ! Intrinsic assignment and pointer assignment are both ok. Expect no
+  ! diagnostics.
+  !$omp atomic write
+  x = 2 * v + 3
+
+  !$omp atomic write
+  x => v
+end
+
+subroutine f02(i)
+  integer :: i, v
+  interface
+    function p(i)
+      integer, pointer :: p
+      integer :: i
+    end
+  end interface
+
+  ! Atomic variable can be a function reference. Expect no diagostics.
+  !$omp atomic write
+  p(i) = v
+end
+
+subroutine f03
+  integer :: x(3), y(5), v(3)
+
+  !$omp atomic write
+  !ERROR: Atomic variable x should be a scalar
+  x = v
+
+  !$omp atomic write
+  !ERROR: Atomic variable y(2_8:4_8:1_8) should be a scalar
+  y(2:4) = v
+end
+
+subroutine f04
+  integer :: x, y(3), v
+
+  !$omp atomic write
+  !ERROR: Within atomic operation x and x+1_4 access the same storage
+  x = x + 1
+
+  ! Accessing same array, but not the same storage. Expect no diagnostics.
+  !$omp atomic write
+  y(1) = y(2)
+end
+
+subroutine f06
+  character :: x, v
+
+  !$omp atomic write
+  !ERROR: Atomic variable x cannot have CHARACTER type
+  x = v
+end
+
+subroutine f07
+  integer, allocatable :: x
+  integer :: v
+
+  allocate(x)
+
+  !$omp atomic write
+  !ERROR: Atomic variable x cannot be ALLOCATABLE
+  x = v
+end
+
diff --git a/flang/test/Semantics/OpenMP/atomic.f90 b/flang/test/Semantics/OpenMP/atomic.f90
index 0e100871ea9b4..10b33a3ade22d 100644
--- a/flang/test/Semantics/OpenMP/atomic.f90
+++ b/flang/test/Semantics/OpenMP/atomic.f90
@@ -1,4 +1,6 @@
-! RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! REQUIRES: openmp_runtime
+
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp %openmp_flags
 use omp_lib
 ! Check OpenMP 2.13.6 atomic Construct
 
@@ -11,9 +13,13 @@
   a = b
   !$omp end atomic
 
+  !ERROR: ACQUIRE clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50
+  !ERROR: HINT clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50
   !$omp atomic read acquire hint(OMP_LOCK_HINT_CONTENDED)
   a = b
 
+  !ERROR: RELEASE clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50
+  !ERROR: HINT clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50
   !$omp atomic release hint(OMP_LOCK_HINT_UNCONTENDED) write
   a = b
 
@@ -22,39 +28,32 @@
   a = a + 1
   !$omp end atomic
 
+  !ERROR: HINT clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50
+  !ERROR: ACQ_REL clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50
   !$omp atomic hint(1) acq_rel capture
   b = a
   a = a + 1
   !$omp end atomic
 
-  !ERROR: expected end of line
+  !ERROR: At most one clause from the 'atomic' group is allowed on ATOMIC construct
   !$omp atomic read write
+  !ERROR: Atomic expression a+1._4 should be a variable
   a = a + 1
 
   !$omp atomic
   a = a + 1
-  !ERROR: expected 'UPDATE'
-  !ERROR: expected 'WRITE'
-  !ERROR: expected 'COMPARE'
-  !ERROR: expected 'CAPTURE'
-  !ERROR: expected 'READ'
+  !ERROR: NUM_THREADS clause is not allowed on the ATOMIC directive
   !$omp atomic num_threads(4)
   a = a + 1
 
-  !ERROR: expected end of line
+  !ERROR: ATOMIC UPDATE operation with CAPTURE should contain two statements
+  !ERROR: NUM_THREADS clause is not allowed on the ATOMIC directive
   !$omp atomic capture num_threads(4)
   a = a + 1
 
+  !ERROR: RELAXED clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50
   !$omp atomic relaxed
   a = a + 1
 
-  !ERROR: expected 'UPDATE'
-  !ERROR: expected 'WRITE'
-  !ERROR: expected 'COMPARE'
-  !ERROR: expected 'CAPTURE'
-  !ERROR: expected 'READ'
-  !$omp atomic num_threads write
-  a = a + 1
-
   !$omp end parallel
 end
diff --git a/flang/test/Semantics/OpenMP/atomic01.f90 b/flang/test/Semantics/OpenMP/atomic01.f90
index 173effe86b69c..f700c381cadd0 100644
--- a/flang/test/Semantics/OpenMP/atomic01.f90
+++ b/flang/test/Semantics/OpenMP/atomic01.f90
@@ -14,322 +14,277 @@
 ! At most one memory-order-clause may appear on the construct.
 
 !READ
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the READ directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst seq_cst read
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the READ directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic read seq_cst seq_cst
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the READ directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst read seq_cst
     i = j
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the READ directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic acquire acquire read
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the READ directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic read acquire acquire
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the READ directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic acquire read acquire
     i = j
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the READ directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed relaxed read
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the READ directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic read relaxed relaxed
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the READ directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed read relaxed
     i = j
 
 !UPDATE
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst seq_cst update
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic update seq_cst seq_cst
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst update seq_cst
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the UPDATE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic release release update
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the UPDATE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic update release release
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the UPDATE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic release update release
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the UPDATE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed relaxed update
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the UPDATE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic update relaxed relaxed
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the UPDATE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed update relaxed
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
 !CAPTURE
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the CAPTURE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst seq_cst capture
     i = j
     j = k
   !$omp end atomic
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the CAPTURE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic capture seq_cst seq_cst
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the CAPTURE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst capture seq_cst
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the CAPTURE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic release release capture
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the CAPTURE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic capture release release
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the CAPTURE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic release capture release
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the CAPTURE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed relaxed capture
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the CAPTURE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic capture relaxed relaxed
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the CAPTURE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed capture relaxed
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQ_REL clause can appear on the CAPTURE directive
+  !ERROR: At most one ACQ_REL clause can appear on the ATOMIC directive
   !$omp atomic acq_rel acq_rel capture
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQ_REL clause can appear on the CAPTURE directive
+  !ERROR: At most one ACQ_REL clause can appear on the ATOMIC directive
   !$omp atomic capture acq_rel acq_rel
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQ_REL clause can appear on the CAPTURE directive
+  !ERROR: At most one ACQ_REL clause can appear on the ATOMIC directive
   !$omp atomic acq_rel capture acq_rel
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the CAPTURE directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic acquire acquire capture
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the CAPTURE directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic capture acquire acquire
     i = j
     j = k
   !$omp end atomic
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one ACQUIRE clause can appear on the CAPTURE directive
+  !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive
   !$omp atomic acquire capture acquire
     i = j
     j = k
   !$omp end atomic
 
 !WRITE
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the WRITE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst seq_cst write
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the WRITE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic write seq_cst seq_cst
     i = j
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one SEQ_CST clause can appear on the WRITE directive
+  !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst write seq_cst
     i = j
 
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the WRITE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic release release write
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the WRITE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic write release release
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELEASE clause can appear on the WRITE directive
+  !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic release write release
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the WRITE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed relaxed write
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the WRITE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic write relaxed relaxed
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
-  !ERROR: At most one RELAXED clause can appear on the WRITE directive
+  !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed write relaxed
     i = j
 
 !No atomic-clause
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
   !ERROR: At most one RELAXED clause can appear on the ATOMIC directive
   !$omp atomic relaxed relaxed
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
   !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive
   !$omp atomic seq_cst seq_cst
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
   !ERROR: At most one RELEASE clause can appear on the ATOMIC directive
   !$omp atomic release release
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
 ! 2.17.7.3
 ! At most one hint clause may appear on the construct.
 
-  !ERROR: At most one HINT clause can appear on the READ directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_speculative) hint(omp_sync_hint_speculative) read
     i = j
-  !ERROR: At most one HINT clause can appear on the READ directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_nonspeculative) read hint(omp_sync_hint_nonspeculative)
     i = j
-  !ERROR: At most one HINT clause can appear on the READ directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic read hint(omp_sync_hint_uncontended) hint (omp_sync_hint_uncontended)
     i = j
-  !ERROR: At most one HINT clause can appear on the WRITE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) write
     i = j
-  !ERROR: At most one HINT clause can appear on the WRITE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_nonspeculative) write hint(omp_sync_hint_nonspeculative)
     i = j
-  !ERROR: At most one HINT clause can appear on the WRITE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic write hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended)
     i = j
-  !ERROR: At most one HINT clause can appear on the WRITE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) write
     i = j
-  !ERROR: At most one HINT clause can appear on the WRITE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_nonspeculative) write hint(omp_sync_hint_nonspeculative)
     i = j
-  !ERROR: At most one HINT clause can appear on the WRITE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic write hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended)
     i = j
-  !ERROR: At most one HINT clause can appear on the UPDATE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) update
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: At most one HINT clause can appear on the UPDATE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_nonspeculative) update hint(omp_sync_hint_nonspeculative)
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: At most one HINT clause can appear on the UPDATE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic update hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended)
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
   !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative)
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
   !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_none) hint(omp_sync_hint_nonspeculative)
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
   !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended)
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
-  !ERROR: At most one HINT clause can appear on the CAPTURE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) capture
     i = j
     j = k
   !$omp end atomic
-  !ERROR: At most one HINT clause can appear on the CAPTURE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic hint(omp_sync_hint_nonspeculative) capture hint(omp_sync_hint_nonspeculative)
     i = j
     j = k
   !$omp end atomic
-  !ERROR: At most one HINT clause can appear on the CAPTURE directive
+  !ERROR: At most one HINT clause can appear on the ATOMIC directive
   !$omp atomic capture hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended)
     i = j
     j = k
@@ -337,34 +292,26 @@
 ! 2.17.7.4
 ! If atomic-clause is read then memory-order-clause must not be acq_rel or release.
 
-  !ERROR: Clause ACQ_REL is not allowed if clause READ appears on the ATOMIC directive
   !$omp atomic acq_rel read
     i = j
-  !ERROR: Clause ACQ_REL is not allowed if clause READ appears on the ATOMIC directive
   !$omp atomic read acq_rel
     i = j
 
-  !ERROR: Clause RELEASE is not allowed if clause READ appears on the ATOMIC directive
   !$omp atomic release read
     i = j
-  !ERROR: Clause RELEASE is not allowed if clause READ appears on the ATOMIC directive
   !$omp atomic read release
     i = j
 
 ! 2.17.7.5
 ! If atomic-clause is write then memory-order-clause must not be acq_rel or acquire.
 
-  !ERROR: Clause ACQ_REL is not allowed if clause WRITE appears on the ATOMIC directive
   !$omp atomic acq_rel write
     i = j
-  !ERROR: Clause ACQ_REL is not allowed if clause WRITE appears on the ATOMIC directive
   !$omp atomic write acq_rel
     i = j
 
-  !ERROR: Clause ACQUIRE is not allowed if clause WRITE appears on the ATOMIC directive
   !$omp atomic acquire write
     i = j
-  !ERROR: Clause ACQUIRE is not allowed if clause WRITE appears on the ATOMIC directive
   !$omp atomic write acquire
     i = j
 
@@ -372,33 +319,27 @@
 ! 2.17.7.6
 ! If atomic-clause is update or not present then memory-order-clause must not be acq_rel or acquire.
 
-  !ERROR: Clause ACQ_REL is not allowed if clause UPDATE appears on the ATOMIC directive
   !$omp atomic acq_rel update
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
-  !ERROR: Clause ACQ_REL is not allowed if clause UPDATE appears on the ATOMIC directive
   !$omp atomic update acq_rel
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
-  !ERROR: Clause ACQUIRE is not allowed if clause UPDATE appears on the ATOMIC directive
   !$omp atomic acquire update
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
-  !ERROR: Clause ACQUIRE is not allowed if clause UPDATE appears on the ATOMIC directive
   !$omp atomic update acquire
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
-  !ERROR: Clause ACQ_REL is not allowed on the ATOMIC directive
   !$omp atomic acq_rel
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 
-  !ERROR: Clause ACQUIRE is not allowed on the ATOMIC directive
   !$omp atomic acquire
-  !ERROR: Invalid or missing operator in atomic update statement
+  !ERROR: The atomic variable i should appear as an argument in the update operation
     i = j
 end program
 
diff --git a/flang/test/Semantics/OpenMP/atomic02.f90 b/flang/test/Semantics/OpenMP/atomic02.f90
index c66085d00f157..45e41f2552965 100644
--- a/flang/test/Semantics/OpenMP/atomic02.f90
+++ b/flang/test/Semantics/OpenMP/atomic02.f90
@@ -28,36 +28,29 @@ program OmpAtomic
    !$omp atomic
    a = a/(b + 1)
    !$omp atomic
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The ** operator is not a valid ATOMIC UPDATE operation
    a = a**4
    !$omp atomic 
-   !ERROR: Expected scalar variable on the LHS of atomic update assignment statement
-   !ERROR: Invalid or missing operator in atomic update statement
-   !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
+   !ERROR: Atomic variable c cannot have CHARACTER type
+   !ERROR: The atomic variable c should appear as an argument in the update operation
    c = d 
    !$omp atomic
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The < operator is not a valid ATOMIC UPDATE operation
    l = a .LT. b
    !$omp atomic
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The <= operator is not a valid ATOMIC UPDATE operation
    l = a .LE. b
    !$omp atomic
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The == operator is not a valid ATOMIC UPDATE operation
    l = a .EQ. b
    !$omp atomic
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The /= operator is not a valid ATOMIC UPDATE operation
    l = a .NE. b
    !$omp atomic
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The >= operator is not a valid ATOMIC UPDATE operation
    l = a .GE. b
    !$omp atomic
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The > operator is not a valid ATOMIC UPDATE operation
    l = a .GT. b
    !$omp atomic
    m = m .AND. n
@@ -76,32 +69,26 @@ program OmpAtomic
    !$omp atomic update
    a = a/(b + 1)
    !$omp atomic update
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The ** operator is not a valid ATOMIC UPDATE operation
    a = a**4
    !$omp atomic update
-   !ERROR: Expected scalar variable on the LHS of atomic update assignment statement
-   !ERROR: Invalid or missing operator in atomic update statement
-   !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
+   !ERROR: Atomic variable c cannot have CHARACTER type
+   !ERROR: This is not a valid ATOMIC UPDATE operation
    c = c//d
    !$omp atomic update
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The < operator is not a valid ATOMIC UPDATE operation
    l = a .LT. b
    !$omp atomic update
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The <= operator is not a valid ATOMIC UPDATE operation
    l = a .LE. b
    !$omp atomic update
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The == operator is not a valid ATOMIC UPDATE operation
    l = a .EQ. b
    !$omp atomic update
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The >= operator is not a valid ATOMIC UPDATE operation
    l = a .GE. b
    !$omp atomic update
-   !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l`
-   !ERROR: Invalid or missing operator in atomic update statement
+   !ERROR: The > operator is not a valid ATOMIC UPDATE operation
    l = a .GT. b
    !$omp atomic update
    m = m .AND. n
diff --git a/flang/test/Semantics/OpenMP/atomic03.f90 b/flang/test/Semantics/OpenMP/atomic03.f90
index 76367495b9861..b3a3c0d5e7a14 100644
--- a/flang/test/Semantics/OpenMP/atomic03.f90
+++ b/flang/test/Semantics/OpenMP/atomic03.f90
@@ -25,28 +25,26 @@ program OmpAtomic
    y = MIN(y, 8)
 
 !$omp atomic
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator
    z = IAND(y, 4)
 !$omp atomic
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level OR operator
    z = IOR(y, 5)
 !$omp atomic
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level NEQV/EOR operator
    z = IEOR(y, 6)
 !$omp atomic
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MAX operator
    z = MAX(y, 7, b, c)
 !$omp atomic
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MIN operator
    z = MIN(y, 8, a, d)
 
 !$omp atomic
-   !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y'
+   !ERROR: This intrinsic function is not a valid ATOMIC UPDATE operation
    y = FRACTION(x)
 !$omp atomic
-   !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y'
+   !ERROR: The atomic variable y should appear as an argument in the update operation
    y = REAL(x)
 !$omp atomic update
    y = IAND(y, 4)
@@ -60,26 +58,26 @@ program OmpAtomic
    y = MIN(y, 8)
 
 !$omp atomic update
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator
    z = IAND(y, 4)
 !$omp atomic update 
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level OR operator
    z = IOR(y, 5)
 !$omp atomic update
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level NEQV/EOR operator
    z = IEOR(y, 6)
 !$omp atomic update
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MAX operator
    z = MAX(y, 7)
 !$omp atomic update
-   !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+   !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MIN operator
    z = MIN(y, 8)
 
 !$omp atomic update
-   !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement
+  !ERROR: This intrinsic function is not a valid ATOMIC UPDATE operation
    y = MOD(y, 9)
 !$omp atomic update
-   !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement
+  !ERROR: This intrinsic function is not a valid ATOMIC UPDATE operation
    x = ABS(x)
 end program OmpAtomic
 
@@ -92,7 +90,7 @@ subroutine conflicting_types()
     type(simple) ::s
     z = 1
     !$omp atomic
-    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z'
+    !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator
     z = IAND(s%z, 4)
 end subroutine
 
@@ -105,40 +103,37 @@ subroutine more_invalid_atomic_update_stmts()
     type(some_type) :: s
  
     !$omp atomic update
-    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a'
+    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MIN operator
         a = min(a, a, b)
      
     !$omp atomic
-    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a'
+    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MAX operator
         a = max(b, a, b, a)
 
     !$omp atomic
-    !ERROR: Atomic update statement should be of the form `a = intrinsic_procedure(a, expr_list)` OR `a = intrinsic_procedure(expr_list, a)`
         a = min(b, a, b)
 
     !$omp atomic
-    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a'
+    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MAX operator
         a = max(b, a, b, a, b)
     
     !$omp atomic update
-    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y'
+    !ERROR: The atomic variable y should occur exactly once among the arguments of the top-level MIN operator
         y = min(z, x)
      
     !$omp atomic
         z = max(z, y)
 
     !$omp atomic update
-    !ERROR: Expected scalar variable on the LHS of atomic update assignment statement
-    !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'k'
+    !ERROR: Atomic variable k should be a scalar
+    !ERROR: The atomic variable k should occur exactly once among the arguments of the top-level MAX operator
         k = max(x, y)
-    
+
     !$omp atomic
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
-    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
         x = min(x, k)
 
     !$omp atomic
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
-    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
-        z =z + s%m
+        z = z + s%m
 end subroutine
diff --git a/flang/test/Semantics/OpenMP/atomic04.f90 b/flang/test/Semantics/OpenMP/atomic04.f90
index a9644ad95aa30..0f69befed1414 100644
--- a/flang/test/Semantics/OpenMP/atomic04.f90
+++ b/flang/test/Semantics/OpenMP/atomic04.f90
@@ -1,5 +1,3 @@
-! REQUIRES: openmp_runtime
-
 ! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
 
 ! OpenMP Atomic construct
@@ -7,7 +5,6 @@
 ! Update assignment must be 'var = var op expr' or 'var = expr op var'
 
 program OmpAtomic
-   use omp_lib
    real x
    integer y
    logical m, n, l
@@ -20,12 +17,10 @@ program OmpAtomic
 !$omp atomic
    x = 1 + x
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
    x = y + 1
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
    x = 1 + y
 
 !$omp atomic
@@ -33,12 +28,10 @@ program OmpAtomic
 !$omp atomic
    x = 1 - x
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator
    x = y - 1
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator
    x = 1 - y
 
 !$omp atomic
@@ -46,12 +39,10 @@ program OmpAtomic
 !$omp atomic
    x = 1*x
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should appear as an argument in the update operation
    x = y*1
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should appear as an argument in the update operation
    x = 1*y
 
 !$omp atomic
@@ -59,12 +50,10 @@ program OmpAtomic
 !$omp atomic
    x = 1/x
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
    x = y/1
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
    x = 1/y
 
 !$omp atomic
@@ -72,8 +61,7 @@ program OmpAtomic
 !$omp atomic
    m = n .AND. m
 !$omp atomic 
-   !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
-   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level AND operator
    m = n .AND. l
 
 !$omp atomic
@@ -81,8 +69,7 @@ program OmpAtomic
 !$omp atomic
    m = n .OR. m
 !$omp atomic 
-   !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
-   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level OR operator
    m = n .OR. l
 
 !$omp atomic
@@ -90,8 +77,7 @@ program OmpAtomic
 !$omp atomic
    m = n .EQV. m
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
-   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level EQV operator
    m = n .EQV. l
 
 !$omp atomic
@@ -99,8 +85,7 @@ program OmpAtomic
 !$omp atomic
    m = n .NEQV. m
 !$omp atomic
-   !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
-   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level NEQV/EOR operator
    m = n .NEQV. l
 
 !$omp atomic update
@@ -108,12 +93,10 @@ program OmpAtomic
 !$omp atomic update
    x = 1 + x
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
    x = y + 1
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator
    x = 1 + y
 
 !$omp atomic update
@@ -121,12 +104,10 @@ program OmpAtomic
 !$omp atomic update
    x = 1 - x
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator
    x = y - 1
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator
    x = 1 - y
 
 !$omp atomic update
@@ -134,12 +115,10 @@ program OmpAtomic
 !$omp atomic update
    x = 1*x
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should appear as an argument in the update operation
    x = y*1
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should appear as an argument in the update operation
    x = 1*y
 
 !$omp atomic update
@@ -147,12 +126,10 @@ program OmpAtomic
 !$omp atomic update
    x = 1/x
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
    x = y/1
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-   !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
    x = 1/y
 
 !$omp atomic update
@@ -160,8 +137,7 @@ program OmpAtomic
 !$omp atomic update
    m = n .AND. m
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
-   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level AND operator
    m = n .AND. l
 
 !$omp atomic update
@@ -169,8 +145,7 @@ program OmpAtomic
 !$omp atomic update
    m = n .OR. m
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
-   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level OR operator
    m = n .OR. l
 
 !$omp atomic update
@@ -178,8 +153,7 @@ program OmpAtomic
 !$omp atomic update
    m = n .EQV. m
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
-   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level EQV operator
    m = n .EQV. l
 
 !$omp atomic update
@@ -187,8 +161,7 @@ program OmpAtomic
 !$omp atomic update
    m = n .NEQV. m
 !$omp atomic update
-   !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m`
-   !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement
+   !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level NEQV/EOR operator
    m = n .NEQV. l
 
 end program OmpAtomic
@@ -204,35 +177,34 @@ subroutine more_invalid_atomic_update_stmts()
     type(some_type) p
     
     !$omp atomic
-    !ERROR: Invalid or missing operator in atomic update statement
         x = x
 
     !$omp atomic update
-    !ERROR: Invalid or missing operator in atomic update statement
+    !ERROR: The atomic variable x should appear as an argument in the update operation
         x = 1    
 
     !$omp atomic update
-    !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement
+    !ERROR: Within atomic operation a and a*b access the same storage
         a = a * b + a
 
     !$omp atomic
-    !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a`
+    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level * operator
         a = b * (a + 9)
 
     !$omp atomic update
-    !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement
+    !ERROR: Within atomic operation a and (a+b) access the same storage
         a = a * (a + b)
 
     !$omp atomic
-    !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement
+    !ERROR: Within atomic operation a and (b+a) access the same storage
         a = (b + a) * a
 
     !$omp atomic
-    !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a`
+    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level + operator
         a = a * b + c
 
     !$omp atomic update
-    !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a`
+    !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level + operator
         a = a + b + c
 
     !$omp atomic
@@ -243,23 +215,18 @@ subroutine more_invalid_atomic_update_stmts()
 
     !$omp atomic
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
-    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
         a = a + d
 
     !$omp atomic update
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
-    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
-    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
+    !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator
         x = x * y / z
 
     !$omp atomic
-    !ERROR: Atomic update statement should be of form `p%m = p%m operator expr` OR `p%m = expr operator p%m`
-    !ERROR: Exactly one occurence of 'p%m' expected on the RHS of atomic update assignment statement
+    !ERROR: The atomic variable p%m should occur exactly once among the arguments of the top-level + operator
         p%m = x + y
 
     !$omp atomic update
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4)
-    !ERROR: Expected scalar expression on the RHS of atomic update assignment statement
-    !ERROR: Exactly one occurence of 'p%m' expected on the RHS of atomic update assignment statement
         p%m = p%m + p%n
 end subroutine
diff --git a/flang/test/Semantics/OpenMP/atomic05.f90 b/flang/test/Semantics/OpenMP/atomic05.f90
index 266268a212440..77ffc6e57f1a3 100644
--- a/flang/test/Semantics/OpenMP/atomic05.f90
+++ b/flang/test/Semantics/OpenMP/atomic05.f90
@@ -8,20 +8,20 @@ program OmpAtomic
     use omp_lib
     integer :: g, x
 
-    !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
+    !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct
     !$omp atomic relaxed, seq_cst
         x = x + 1
-    !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
+    !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct
     !$omp atomic read seq_cst, relaxed
         x = g
-    !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
+    !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct
     !$omp atomic write relaxed, release
         x = 2 * 4
-    !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
+    !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct
     !$omp atomic update release, seq_cst
-    !ERROR: Invalid or missing operator in atomic update statement
+    !ERROR: The atomic variable x should appear as an argument in the update operation
         x = 10
-    !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct
+    !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct
     !$omp atomic capture release, seq_cst
         x = g
         g = x * 10
diff --git a/flang/test/Semantics/OpenMP/critical-hint-clause.f90 b/flang/test/Semantics/OpenMP/critical-hint-clause.f90
index 7ca8c858239f7..e9cfa49bf934e 100644
--- a/flang/test/Semantics/OpenMP/critical-hint-clause.f90
+++ b/flang/test/Semantics/OpenMP/critical-hint-clause.f90
@@ -18,7 +18,7 @@ program sample
         y = 2
     !$omp end critical (name)
      
-    !ERROR: Hint clause value is not a valid OpenMP synchronization value
+    !ERROR: The synchronization hint is not valid
     !$omp critical (name) hint(3)
         y = 2
     !$omp end critical (name)
@@ -27,12 +27,12 @@ program sample
         y = 2
     !$omp end critical (name)
     
-    !ERROR: Hint clause value is not a valid OpenMP synchronization value
+    !ERROR: The synchronization hint is not valid
     !$omp critical (name) hint(7)
         y = 2
     !$omp end critical (name)
    
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must be a constant value
     !$omp critical (name) hint(x)
         y = 2
@@ -54,7 +54,7 @@ program sample
         y = 2
     !$omp end critical (name)
     
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must be a constant value
     !$omp critical (name) hint(omp_sync_hint_uncontended + omp_sync_hint) 
         y = 2
@@ -84,35 +84,35 @@ program sample
         y = 2
     !$omp end critical (name)
 
-    !ERROR: Hint clause value is not a valid OpenMP synchronization value
+    !ERROR: The synchronization hint is not valid
      !$omp critical (name) hint(omp_sync_hint_uncontended + omp_sync_hint_contended)
         y = 2
     !$omp end critical (name)
 
-    !ERROR: Hint clause value is not a valid OpenMP synchronization value
+    !ERROR: The synchronization hint is not valid
     !$omp critical (name) hint(omp_sync_hint_nonspeculative + omp_lock_hint_speculative)
         y = 2
     !$omp end critical (name)
 
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must have INTEGER type, but is REAL(4)
     !$omp critical (name) hint(1.0) 
         y = 2
     !$omp end critical (name)
 
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Operands of + must be numeric; have LOGICAL(4) and INTEGER(4)
     !$omp critical (name) hint(z + omp_sync_hint_nonspeculative)
         y = 2
     !$omp end critical (name)
 
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must be a constant value
     !$omp critical (name) hint(k + omp_sync_hint_speculative)
         y = 2
     !$omp end critical (name)
 
-    !ERROR: Hint clause must have non-negative constant integer expression
+    !ERROR: Synchronization hint must be a constant integer value
     !ERROR: Must be a constant value
     !$omp critical (name) hint(p(1) + omp_sync_hint_uncontended)
         y = 2
diff --git a/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90 b/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90
index 505cbc48fef90..8fdd2aed3ec1f 100644
--- a/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90
+++ b/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90
@@ -20,70 +20,64 @@ program sample
 
     !$omp atomic read
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
-    !ERROR: Expected scalar expression on the RHS of atomic assignment statement
+    !ERROR: Atomic variable y(1_8:3_8:1_8) should be a scalar
         v = y(1:3)
 
     !$omp atomic read
-    !ERROR: Expected scalar variable of intrinsic type on RHS of atomic assignment statement
+    !ERROR: Atomic expression x*(10_4+x) should be a variable
         v = x * (10 + x)
 
     !$omp atomic read
-    !ERROR: Expected scalar variable of intrinsic type on RHS of atomic assignment statement
+    !ERROR: Atomic expression 4_4 should be a variable
         v = 4
 
     !$omp atomic read
-    !ERROR: k must not have ALLOCATABLE attribute
+    !ERROR: Atomic variable k cannot be ALLOCATABLE
         v = k
 
     !$omp atomic write
-    !ERROR: k must not have ALLOCATABLE attribute
+    !ERROR: Atomic variable k cannot be ALLOCATABLE
         k = x
 
     !$omp atomic update
-    !ERROR: k must not have ALLOCATABLE attribute
+    !ERROR: Atomic variable k cannot be ALLOCATABLE
         k = k + x * (v * x)
 
     !$omp atomic
-    !ERROR: k must not have ALLOCATABLE attribute
+    !ERROR: Atomic variable k cannot be ALLOCATABLE
         k = v * k  
          
     !$omp atomic write
-    !ERROR: RHS expression on atomic assignment statement cannot access 'z%y'
+    !ERROR: Within atomic operation z%y and x+z%y access the same storage
        z%y = x + z%y
 
     !$omp atomic write
-    !ERROR: RHS expression on atomic assignment statement cannot access 'x'
+    !ERROR: Within atomic operation x and x access the same storage
         x = x
 
     !$omp atomic write
-    !ERROR: RHS expression on atomic assignment statement cannot access 'm'
+    !ERROR: Within atomic operation m and min(m,x,z%m)+k access the same storage
         m = min(m, x, z%m) + k
  
     !$omp atomic read
-    !ERROR: RHS expression on atomic assignment statement cannot access 'x'
+    !ERROR: Within atomic operation x and x access the same storage
         x = x
 
     !$omp atomic read
-    !ERROR: Expected scalar variable of intrinsic type on RHS of atomic assignment statement
-    !ERROR: RHS expression on atomic assignment statement cannot access 'm'
+    !ERROR: Atomic expression min(m,x,z%m)+k should be a variable
         m = min(m, x, z%m) + k
 
     !$omp atomic read
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
-    !ERROR: Expected scalar expression on the RHS of atomic assignment statement
+    !ERROR: Atomic variable a should be a scalar
         x = a
 
-    !$omp atomic read
-    !ERROR: Expected scalar variable on the LHS of atomic assignment statement
-        a = x
-
     !$omp atomic write
     !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4)
-    !ERROR: Expected scalar expression on the RHS of atomic assignment statement
         x = a
 
     !$omp atomic write
-    !ERROR: Expected scalar variable on the LHS of atomic assignment statement
+    !ERROR: Atomic variable a should be a scalar
         a = x
 
     !$omp atomic capture
@@ -93,7 +87,7 @@ program sample
 
     !$omp atomic release capture
         v = x
-    !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x`
+    ! This ends up being "x = b + x".
         x = b + (x*1)
     !$omp end atomic
 
@@ -103,60 +97,58 @@ program sample
     !$omp end atomic
 
     !$omp atomic capture
-    !ERROR: Captured variable/array element/derived-type component x expected to be assigned in the second statement of ATOMIC CAPTURE construct
+    !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read b
         v = x
         b = b + 1
     !$omp end atomic
 
     !$omp atomic capture
-    !ERROR: Captured variable/array element/derived-type component x expected to be assigned in the second statement of ATOMIC CAPTURE construct
+    !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read b
         v = x
         b = 10
     !$omp end atomic
 
     !$omp atomic capture
-    !ERROR: Updated variable/array element/derived-type component x expected to be captured in the second statement of ATOMIC CAPTURE construct
         x = x + 10
+    !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read x
         v = b
     !$omp end atomic
 
+    !ERROR: In ATOMIC UPDATE operation with CAPTURE neither statement could be the update or the capture
     !$omp atomic capture
-    !ERROR: Invalid ATOMIC CAPTURE construct statements. Expected one of [update-stmt, capture-stmt], [capture-stmt, update-stmt], or [capture-stmt, write-stmt]
         v = 1
         x = 4
     !$omp end atomic
 
     !$omp atomic capture
-    !ERROR: Captured variable/array element/derived-type component z%y expected to be assigned in the second statement of ATOMIC CAPTURE construct
+    !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read z%m
         x = z%y
         z%m = z%m + 1.0
     !$omp end atomic
 
     !$omp atomic capture
-    !ERROR: Updated variable/array element/derived-type component z%m expected to be captured in the second statement of ATOMIC CAPTURE construct
         z%m = z%m + 1.0
+    !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read z%m
         x = z%y
     !$omp end atomic
 
     !$omp atomic capture
-    !ERROR: Captured variable/array element/derived-type component y(2) expected to be assigned in the second statement of ATOMIC CAPTURE construct
+    !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read y(1_8)
         x = y(2)
         y(1) = y(1) + 1
     !$omp end atomic
 
     !$omp atomic capture
-    !ERROR: Updated variable/array element/derived-type component y(1) expected to be captured in the second statement of ATOMIC CAPTURE construct
         y(1) = y(1) + 1
+    !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read y(1_8)
         x = y(2)
     !$omp end atomic
 
     !$omp atomic read
-    !ERROR: Expected scalar variable on the LHS of atomic assignment statement
-    !ERROR: Expected scalar expression on the RHS of atomic assignment statement
+    !ERROR: Atomic variable r cannot have CHARACTER type
         l = r
 
     !$omp atomic write
-    !ERROR: Expected scalar variable on the LHS of atomic assignment statement
-    !ERROR: Expected scalar expression on the RHS of atomic assignment statement
+    !ERROR: Atomic variable l cannot have CHARACTER type
         l = r
 end program
diff --git a/flang/test/Semantics/OpenMP/requires-atomic01.f90 b/flang/test/Semantics/OpenMP/requires-atomic01.f90
index ae9fd086015dd..e8817c3f5ef61 100644
--- a/flang/test/Semantics/OpenMP/requires-atomic01.f90
+++ b/flang/test/Semantics/OpenMP/requires-atomic01.f90
@@ -10,20 +10,23 @@ program requires
   ! READ
   ! ----------------------------------------------------------------------------
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Read
+  ! CHECK: OmpClause -> SeqCst
   !$omp atomic read
   i = j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
+  ! CHECK: OmpClause -> Read
   !$omp atomic relaxed read
   i = j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Read
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic read relaxed
   i = j
   
@@ -31,20 +34,23 @@ program requires
   ! WRITE
   ! ----------------------------------------------------------------------------
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Write
+  ! CHECK: OmpClause -> SeqCst
   !$omp atomic write
   i = j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
+  ! CHECK: OmpClause -> Write
   !$omp atomic relaxed write
   i = j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Write
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic write relaxed
   i = j
 
@@ -52,31 +58,34 @@ program requires
   ! UPDATE
   ! ----------------------------------------------------------------------------
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Update
+  ! CHECK: OmpClause -> SeqCst
   !$omp atomic update
   i = i + j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
+  ! CHECK: OmpClause -> Update
   !$omp atomic relaxed update
   i = i + j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Update
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic update relaxed
   i = i + j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomic
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> SeqCst
   !$omp atomic
   i = i + j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomic
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic relaxed
   i = i + j
 
@@ -84,24 +93,27 @@ program requires
   ! CAPTURE
   ! ----------------------------------------------------------------------------
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Capture
+  ! CHECK: OmpClause -> SeqCst
   !$omp atomic capture
   i = j
   j = j + 1
   !$omp end atomic
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
+  ! CHECK: OmpClause -> Capture
   !$omp atomic relaxed capture
   i = j
   j = j + 1
   !$omp end atomic
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Capture
+  ! CHECK-NOT: OmpClause -> SeqCst
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic capture relaxed
   i = j
   j = j + 1
diff --git a/flang/test/Semantics/OpenMP/requires-atomic02.f90 b/flang/test/Semantics/OpenMP/requires-atomic02.f90
index 4976a9667eb78..a3724a83456fd 100644
--- a/flang/test/Semantics/OpenMP/requires-atomic02.f90
+++ b/flang/test/Semantics/OpenMP/requires-atomic02.f90
@@ -10,20 +10,23 @@ program requires
   ! READ
   ! ----------------------------------------------------------------------------
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Acquire
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Read
+  ! CHECK: OmpClause -> AcqRel
   !$omp atomic read
   i = j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Acquire
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Relaxed
+  ! CHECK: OmpClause -> Read
   !$omp atomic relaxed read
   i = j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Acquire
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Read
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic read relaxed
   i = j
   
@@ -31,20 +34,23 @@ program requires
   ! WRITE
   ! ----------------------------------------------------------------------------
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Release
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Write
+  ! CHECK: OmpClause -> AcqRel
   !$omp atomic write
   i = j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Relaxed
+  ! CHECK: OmpClause -> Write
   !$omp atomic relaxed write
   i = j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Write
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic write relaxed
   i = j
 
@@ -52,31 +58,34 @@ program requires
   ! UPDATE
   ! ----------------------------------------------------------------------------
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Release
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Update
+  ! CHECK: OmpClause -> AcqRel
   !$omp atomic update
   i = i + j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Relaxed
+  ! CHECK: OmpClause -> Update
   !$omp atomic relaxed update
   i = i + j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Update
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic update relaxed
   i = i + j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomic
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Release
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> AcqRel
   !$omp atomic
   i = i + j
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomic
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic relaxed
   i = i + j
 
@@ -84,24 +93,27 @@ program requires
   ! CAPTURE
   ! ----------------------------------------------------------------------------
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> AcqRel
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK: OmpClause -> Capture
+  ! CHECK: OmpClause -> AcqRel
   !$omp atomic capture
   i = j
   j = j + 1
   !$omp end atomic
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> AcqRel
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Relaxed
+  ! CHECK: OmpClause -> Capture
   !$omp atomic relaxed capture
   i = j
   j = j + 1
   !$omp end atomic
 
-  ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture
-  ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> AcqRel
-  ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed
+  ! CHECK-LABEL: OpenMPAtomicConstruct
+  ! CHECK-NOT: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Capture
+  ! CHECK: OmpClause -> Relaxed
   !$omp atomic capture relaxed
   i = j
   j = j + 1

From e64f8e043cdfc394fd31e157c8c5fb25ca85bd2f Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Wed, 11 Jun 2025 10:17:54 -0500
Subject: [PATCH 092/851] [flang][Driver] Guard check for pic/pie settings
 without driver flags (#143530)

The default relocation model for clang depends on the cmake flag
CLANG_DEFAULT_PIE_ON_LINUX. By default it is set to ON, but when it's
OFF, the default relocation model will be "static".
The outcome of the test running clang without any PIC/PIE flags will
depend on the cmake flag, so make sure it only runs when the flag is ON.
---
 flang/test/Driver/pic-flags.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Driver/pic-flags.f90 b/flang/test/Driver/pic-flags.f90
index cb62d353cc18c..5a06163c485cd 100644
--- a/flang/test/Driver/pic-flags.f90
+++ b/flang/test/Driver/pic-flags.f90
@@ -1,6 +1,6 @@
 ! RUN: %if aarch64-registered-target %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu -fno-pie 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-STATIC,CHECK-STATIC-IR %}
 
-! RUN: %if aarch64-registered-target %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PIE-LEVEL2,CHECK-PIE-LEVEL2-IR %}
+! RUN: %if aarch64-registered-target && clang_default_pie_on_linux %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PIE-LEVEL2,CHECK-PIE-LEVEL2-IR %}
 ! RUN: %if aarch64-registered-target %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu -fpie 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PIE-LEVEL1,CHECK-PIE-LEVEL1-IR %}
 ! RUN: %if aarch64-registered-target %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu -fPIE 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PIE-LEVEL2,CHECK-PIE-LEVEL2-IR %}
 

From adfea33f0c412b8475b755a8d82c9961b785eb02 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Wed, 11 Jun 2025 11:28:48 -0400
Subject: [PATCH 093/851] [PowerPC][AIX] xfail atan-intrinsic to unblock bot
 (#143723)

Testcase from https://github.com/llvm/llvm-project/pull/143416 is
causing the AIX bot to be red. XFAIL for now till issue can be resolved.
---
 llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll
index d824d6d35643d..c5c17d65524c2 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=instsimplify < %s | FileCheck %s
+; XFAIL: target={{.*}}-aix{{.*}}
 
 define double @test_atan_0() {
 ; CHECK-LABEL: define double @test_atan_0() {

From bc9f4edf47d2cbed3b1ba7a61d1497dded91ed22 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Wed, 11 Jun 2025 16:44:09 +0100
Subject: [PATCH 094/851] [LTO] Fix used before intialised warning (#143705)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For whatever reason I can't reproduce this locally but I can on Compiler
Explorer (https://godbolt.org/z/nfv4b83q6) and on our flang gcc bot
(https://lab.llvm.org/buildbot/#/builders/130/builds/13683/steps/5/logs/stdio).

In file included from ../llvm-project/llvm/include/llvm/LTO/LTO.h:33,
from
../llvm-project/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:29:
../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h: In
constructor ‘llvm::FunctionImporter::ImportListsTy::ImportListsTy()’:
../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h:275:33:
warning: member ‘llvm::FunctionImporter::ImportListsTy::ImportIDs’ is
used uninitialized [-Wuninitialized]
  275 |     ImportListsTy() : EmptyList(ImportIDs) {}
      |                                 ^~~~~~~~~
../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h: In
constructor
‘llvm::FunctionImporter::ImportListsTy::ImportListsTy(size_t)’:

../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h:276:44:
warning: member ‘llvm::FunctionImporter::ImportListsTy::ImportIDs’ is
used uninitialized [-Wuninitialized]
276 | ImportListsTy(size_t Size) : EmptyList(ImportIDs), ListsImpl(Size)
{}
      |                                            ^~~~~~~~~

ImportIDs was being used during construction of EmptyList, before
ImportIDs itself had been constructed.
---
 llvm/include/llvm/Transforms/IPO/FunctionImport.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 65228bb65ba8b..e6ae9ee831d50 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -272,8 +272,9 @@ class FunctionImporter {
   // A map from destination modules to lists of imports.
   class ImportListsTy {
   public:
-    ImportListsTy() : EmptyList(ImportIDs) {}
-    ImportListsTy(size_t Size) : EmptyList(ImportIDs), ListsImpl(Size) {}
+    ImportListsTy() : ImportIDs(), EmptyList(ImportIDs) {}
+    ImportListsTy(size_t Size)
+        : ImportIDs(), EmptyList(ImportIDs), ListsImpl(Size) {}
 
     ImportMapTy &operator[](StringRef DestMod) {
       return ListsImpl.try_emplace(DestMod, ImportIDs).first->second;
@@ -293,9 +294,9 @@ class FunctionImporter {
     const_iterator end() const { return ListsImpl.end(); }
 
   private:
+    ImportIDTable ImportIDs;
     ImportMapTy EmptyList;
     DenseMap<StringRef, ImportMapTy> ListsImpl;
-    ImportIDTable ImportIDs;
   };
 
   /// The set contains an entry for every global value that the module exports.

From 91be47dccfa3480c152916838404d49107fde45c Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 08:53:54 -0700
Subject: [PATCH 095/851] [flang] Fix warnings

This patch fixes:

  flang/lib/Lower/OpenMP/OpenMP.cpp:3904:9: error: unused variable
  'action0' [-Werror,-Wunused-variable]

  flang/lib/Lower/OpenMP/OpenMP.cpp:3905:9: error: unused variable
  'action1' [-Werror,-Wunused-variable]
---
 flang/lib/Lower/OpenMP/OpenMP.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 3f3b85696db31..c13fa471978db 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -3911,6 +3911,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
       // Capturing operation.
       assert(action0 != analysis.None && action1 != analysis.None &&
              "Expexcing two actions");
+      (void)action0;
+      (void)action1;
       captureOp =
           builder.create<mlir::omp::AtomicCaptureOp>(loc, hint, memOrder);
       // Set the non-atomic insertion point to before the atomic.capture.

From 2ab83e9f68f0c7b1a7199455d7ce05430d93fa44 Mon Sep 17 00:00:00 2001
From: Tony Varghese <tonypalampalliyil@gmail.com>
Date: Wed, 11 Jun 2025 21:28:26 +0530
Subject: [PATCH 096/851] [NFC][PowerPC] Rename xxevalPattern to adhere to
 naming convention. (#143675)

Rename class `xxevalPattern` to adhere to naming convention listed in
the coding guideline and used for all other classes in the td file.
---
 llvm/lib/Target/PowerPC/PPCInstrP10.td | 62 +++++++++++++-------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td
index a7f758745efe2..d295f35fb1dd0 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrP10.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td
@@ -2159,7 +2159,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in {
                                (COPY_TO_REGCLASS $VRB, VSRC), 2)))>;
 }
 
-class xxevalPattern <dag pattern, bits<8> imm> :
+class XXEvalPattern <dag pattern, bits<8> imm> :
   Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {}
 
 let Predicates = [PrefixInstrs, HasP10Vector] in {
@@ -2192,83 +2192,83 @@ let Predicates = [PrefixInstrs, HasP10Vector] in {
     // Anonymous patterns for XXEVAL
     // AND
     // and(A, B, C)
-    def : xxevalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>;
+    def : XXEvalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>;
     // and(A, xor(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>;
+    def : XXEvalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>;
     // and(A, or(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>;
+    def : XXEvalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>;
     // and(A, nor(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>;
+    def : XXEvalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>;
     // and(A, eqv(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>;
+    def : XXEvalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>;
     // and(A, nand(B, C))
-    def : xxevalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>;
+    def : XXEvalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>;
 
     // NAND
     // nand(A, B, C)
-    def : xxevalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
+    def : XXEvalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))),
                          !sub(255, 1)>;
     // nand(A, xor(B, C))
-    def : xxevalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
+    def : XXEvalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))),
                          !sub(255, 6)>;
     // nand(A, or(B, C))
-    def : xxevalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
+    def : XXEvalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))),
                          !sub(255, 7)>;
     // nand(A, nor(B, C))
-    def : xxevalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
+    def : XXEvalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)),
                          !sub(255, 8)>;
     // nand(A, eqv(B, C))
-    def : xxevalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
+    def : XXEvalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)),
                          !sub(255, 9)>;
     // nand(A, nand(B, C))
-    def : xxevalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
+    def : XXEvalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)),
                          !sub(255, 14)>;
 
     // EQV
     // (eqv A, B, C)
-    def : xxevalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)),
+    def : XXEvalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)),
                             (vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)))),
                          150>;
     // (eqv A, (and B, C))
-    def : xxevalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>;
+    def : XXEvalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>;
     // (eqv A, (or B, C))
-    def : xxevalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>;
+    def : XXEvalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>;
 
     // NOR
     // (nor A, B, C)
-    def : xxevalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>;
+    def : XXEvalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>;
     // (nor A, (and B, C))
-    def : xxevalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>;
+    def : XXEvalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>;
     // (nor A, (eqv B, C))
-    def : xxevalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>;
+    def : XXEvalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>;
     // (nor A, (nand B, C))
-    def : xxevalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>;
+    def : XXEvalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>;
     // (nor A, (nor B, C))
-    def : xxevalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>;
+    def : XXEvalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>;
     // (nor A, (xor B, C))
-    def : xxevalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>;
+    def : XXEvalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>;
 
     // OR
     // (or A, B, C)
-    def : xxevalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>;
+    def : XXEvalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>;
     // (or A, (and B, C))
-    def : xxevalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>;
+    def : XXEvalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>;
     // (or A, (eqv B, C))
-    def : xxevalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>;
+    def : XXEvalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>;
     // (or A, (nand B, C))
-    def : xxevalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>;
+    def : XXEvalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>;
     // (or A, (nor B, C))
-    def : xxevalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>;
+    def : XXEvalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>;
     // (or A, (xor B, C))
-    def : xxevalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>;
+    def : XXEvalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>;
 
     // XOR
     // (xor A, B, C)
-    def : xxevalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>;
+    def : XXEvalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>;
     // (xor A, (and B, C))
-    def : xxevalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>;
+    def : XXEvalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>;
     // (xor A, (or B, C))
-    def : xxevalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>;
+    def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>;
 
     // Anonymous patterns to select prefixed VSX loads and stores.
     // Load / Store f128

From 38fb0117ab10c4541e58697a4b56de2a646cf3f4 Mon Sep 17 00:00:00 2001
From: Peng Liu <winner245@hotmail.com>
Date: Wed, 11 Jun 2025 12:13:36 -0400
Subject: [PATCH 097/851] [libc++] Make forward_list constexpr as part of
 P3372R3 (#129435)

Fixes #128658
---
 libcxx/docs/FeatureTestMacroTable.rst         |   2 +
 libcxx/include/__memory/allocation_guard.h    |  20 +-
 libcxx/include/__memory/pointer_traits.h      |  16 +-
 libcxx/include/forward_list                   | 469 ++++++++++--------
 libcxx/include/version                        |   2 +
 .../forwardlist/compare.three_way.pass.cpp    |   7 +-
 .../sequences/forwardlist/empty.pass.cpp      |  13 +-
 .../forwardlist.access/front.pass.cpp         |  16 +-
 .../forwardlist.cons/alloc.compile.fail.cpp   |  13 +-
 .../forwardlist.cons/alloc.pass.cpp           |  13 +-
 .../forwardlist.cons/assign_copy.pass.cpp     |  13 +-
 .../forwardlist.cons/assign_init.pass.cpp     |  13 +-
 .../forwardlist.cons/assign_move.pass.cpp     |  13 +-
 .../forwardlist.cons/assign_op_init.pass.cpp  |  13 +-
 .../forwardlist.cons/assign_range.pass.cpp    |  13 +-
 .../assign_size_value.pass.cpp                |  13 +-
 .../forwardlist.cons/copy.pass.cpp            |  13 +-
 .../forwardlist.cons/copy_alloc.pass.cpp      |  13 +-
 .../forwardlist.cons/default.pass.cpp         |  13 +-
 .../forwardlist.cons/from_range.pass.cpp      |  19 +-
 .../forwardlist.cons/init.pass.cpp            |  13 +-
 .../forwardlist.cons/init_alloc.pass.cpp      |  13 +-
 .../forwardlist.cons/move.pass.cpp            |  13 +-
 .../forwardlist.cons/move_alloc.pass.cpp      |  13 +-
 .../forwardlist.cons/range.pass.cpp           |  13 +-
 .../forwardlist.cons/range_alloc.pass.cpp     |  13 +-
 .../forwardlist.cons/size.pass.cpp            |   4 +-
 .../forwardlist.cons/size_value.pass.cpp      |  13 +-
 .../size_value_alloc.pass.cpp                 |  13 +-
 .../forwardlist.erasure/erase.pass.cpp        |  18 +-
 .../forwardlist.erasure/erase_if.pass.cpp     |  18 +-
 .../forwardlist.iter/before_begin.pass.cpp    |  17 +-
 .../forwardlist.iter/iterators.pass.cpp       |  27 +-
 .../assign_range.pass.cpp                     |  19 +-
 .../forwardlist.modifiers/clear.pass.cpp      |  13 +-
 .../emplace_after.pass.cpp                    |  13 +-
 .../emplace_front.pass.cpp                    |  13 +-
 .../erase_after_many.pass.cpp                 |  13 +-
 .../erase_after_one.pass.cpp                  |  13 +-
 .../insert_after_const.pass.cpp               |  13 +-
 .../insert_after_init.pass.cpp                |  13 +-
 .../insert_after_range.pass.cpp               |  13 +-
 .../insert_after_rv.pass.cpp                  |  13 +-
 .../insert_after_size_value.pass.cpp          |  13 +-
 .../insert_range_after.pass.cpp               |  23 +-
 .../forwardlist.modifiers/pop_front.pass.cpp  |  13 +-
 .../prepend_range.pass.cpp                    |  19 +-
 .../push_front_const.pass.cpp                 |  13 +-
 .../push_front_exception_safety.pass.cpp      |   2 +-
 .../push_front_rv.pass.cpp                    |  13 +-
 .../resize_size.pass.cpp                      |  17 +-
 .../resize_size_value.pass.cpp                |  15 +-
 .../forwardlist.ops/merge_lvalue.pass.cpp     |  17 +-
 .../merge_lvalue_pred.pass.cpp                |  17 +-
 .../forwardlist.ops/merge_rvalue.pass.cpp     |  17 +-
 .../merge_rvalue_pred.pass.cpp                |  17 +-
 .../forwardlist.ops/remove.pass.cpp           |  27 +-
 .../forwardlist.ops/remove_if.pass.cpp        |  25 +-
 .../forwardlist.ops/reverse.pass.cpp          |  19 +-
 .../splice_after_flist.pass.cpp               |  23 +-
 .../forwardlist.ops/splice_after_one.pass.cpp |  25 +-
 .../splice_after_range.pass.cpp               |  27 +-
 .../forwardlist.ops/unique.pass.cpp           |  15 +-
 .../forwardlist.ops/unique_pred.pass.cpp      |  25 +-
 .../forwardlist.spec/equal.pass.cpp           |  17 +-
 .../forwardlist.spec/member_swap.pass.cpp     |  13 +-
 .../forwardlist.spec/non_member_swap.pass.cpp |  13 +-
 .../forwardlist.spec/relational.pass.cpp      |  21 +-
 .../swap_noexcept.compile.pass.cpp            |   4 +-
 .../forwardlist/get_allocator.pass.cpp        |  13 +-
 .../sequences/forwardlist/incomplete.pass.cpp |  17 +-
 .../sequences/forwardlist/max_size.pass.cpp   |  13 +-
 .../forward_list.version.compile.pass.cpp     |  27 +
 .../version.version.compile.pass.cpp          |  27 +
 libcxx/test/support/counting_predicates.h     |  62 +--
 .../generate_feature_test_macro_components.py |   5 +
 76 files changed, 1186 insertions(+), 459 deletions(-)
 mode change 100755 => 100644 libcxx/utils/generate_feature_test_macro_components.py

diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst
index a89d4038785cd..3e6fd643f620c 100644
--- a/libcxx/docs/FeatureTestMacroTable.rst
+++ b/libcxx/docs/FeatureTestMacroTable.rst
@@ -420,6 +420,8 @@ Status
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_algorithms``                         ``202306L``
     ---------------------------------------------------------- -----------------
+    ``__cpp_lib_constexpr_forward_list``                       ``202502L``
+    ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_new``                                ``202406L``
     ---------------------------------------------------------- -----------------
     ``__cpp_lib_constexpr_queue``                              ``202502L``
diff --git a/libcxx/include/__memory/allocation_guard.h b/libcxx/include/__memory/allocation_guard.h
index 66edcd92ed618..016e1a3a429b8 100644
--- a/libcxx/include/__memory/allocation_guard.h
+++ b/libcxx/include/__memory/allocation_guard.h
@@ -49,24 +49,26 @@ struct __allocation_guard {
   using _Size _LIBCPP_NODEBUG    = typename allocator_traits<_Alloc>::size_type;
 
   template <class _AllocT> // we perform the allocator conversion inside the constructor
-  _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n)
       : __alloc_(std::move(__alloc)),
         __n_(__n),
         __ptr_(allocator_traits<_Alloc>::allocate(__alloc_, __n_)) // initialization order is important
   {}
 
-  _LIBCPP_HIDE_FROM_ABI ~__allocation_guard() _NOEXCEPT { __destroy(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__allocation_guard() _NOEXCEPT { __destroy(); }
 
-  _LIBCPP_HIDE_FROM_ABI __allocation_guard(const __allocation_guard&) = delete;
-  _LIBCPP_HIDE_FROM_ABI __allocation_guard(__allocation_guard&& __other) _NOEXCEPT
+  __allocation_guard(const __allocation_guard&)                    = delete;
+  __allocation_guard& operator=(const __allocation_guard& __other) = delete;
+
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __allocation_guard(__allocation_guard&& __other) _NOEXCEPT
       : __alloc_(std::move(__other.__alloc_)),
         __n_(__other.__n_),
         __ptr_(__other.__ptr_) {
     __other.__ptr_ = nullptr;
   }
 
-  _LIBCPP_HIDE_FROM_ABI __allocation_guard& operator=(const __allocation_guard& __other) = delete;
-  _LIBCPP_HIDE_FROM_ABI __allocation_guard& operator=(__allocation_guard&& __other) _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __allocation_guard&
+  operator=(__allocation_guard&& __other) _NOEXCEPT {
     if (std::addressof(__other) != this) {
       __destroy();
 
@@ -79,17 +81,17 @@ struct __allocation_guard {
     return *this;
   }
 
-  _LIBCPP_HIDE_FROM_ABI _Pointer
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Pointer
   __release_ptr() _NOEXCEPT { // not called __release() because it's a keyword in objective-c++
     _Pointer __tmp = __ptr_;
     __ptr_         = nullptr;
     return __tmp;
   }
 
-  _LIBCPP_HIDE_FROM_ABI _Pointer __get() const _NOEXCEPT { return __ptr_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Pointer __get() const _NOEXCEPT { return __ptr_; }
 
 private:
-  _LIBCPP_HIDE_FROM_ABI void __destroy() _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __destroy() _NOEXCEPT {
     if (__ptr_ != nullptr) {
       allocator_traits<_Alloc>::deallocate(__alloc_, __ptr_, __n_);
     }
diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h
index 4ba50898fb37d..879b387b9ad1f 100644
--- a/libcxx/include/__memory/pointer_traits.h
+++ b/libcxx/include/__memory/pointer_traits.h
@@ -245,8 +245,8 @@ inline _LIBCPP_HIDE_FROM_ABI constexpr auto to_address(_Tp* __p) noexcept {
 }
 
 template <class _Pointer>
-inline _LIBCPP_HIDE_FROM_ABI constexpr auto
-to_address(const _Pointer& __p) noexcept -> decltype(std::__to_address(__p)) {
+inline _LIBCPP_HIDE_FROM_ABI constexpr auto to_address(const _Pointer& __p) noexcept
+    -> decltype(std::__to_address(__p)) {
   return std::__to_address(__p);
 }
 #endif
@@ -302,6 +302,18 @@ concept __resettable_smart_pointer_with_args = requires(_Smart __s, _Pointer __p
 
 #endif
 
+// This function ensures safe conversions between fancy pointers at compile-time, where we avoid casts from/to
+// `__void_pointer` by obtaining the underlying raw pointer from the fancy pointer using `std::to_address`,
+// then dereferencing it to retrieve the pointed-to object, and finally constructing the target fancy pointer
+// to that object using the `std::pointer_traits<>::pinter_to` function.
+template <class _PtrTo, class _PtrFrom>
+_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _PtrTo __static_fancy_pointer_cast(const _PtrFrom& __p) {
+  using __ptr_traits   = pointer_traits<_PtrTo>;
+  using __element_type = typename __ptr_traits::element_type;
+  return __p ? __ptr_traits::pointer_to(*static_cast<__element_type*>(std::addressof(*__p)))
+             : static_cast<_PtrTo>(nullptr);
+}
+
 _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 5046de27a9da1..e9b2c860b89c4 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -295,8 +295,8 @@ struct __forward_node_traits {
                 "the _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic.");
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) {
-    return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__p));
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) {
+    return std::__static_fancy_pointer_cast<__begin_node_pointer>(__p);
   }
 };
 
@@ -307,11 +307,11 @@ struct __forward_begin_node {
 
   pointer __next_;
 
-  _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : __next_(nullptr) {}
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : __next_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {}
 
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const {
-    return static_cast<__begin_node_pointer>(__next_);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const {
+    return std::__static_fancy_pointer_cast<__begin_node_pointer>(__next_);
   }
 };
 
@@ -335,7 +335,7 @@ private:
   };
 
 public:
-  _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; }
 #  else
 
 private:
@@ -345,8 +345,8 @@ public:
   _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return *std::__launder(reinterpret_cast<_Tp*>(&__buffer_)); }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_node(_NodePtr __next) : _Base(__next) {}
-  _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_node(_NodePtr __next) : _Base(__next) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {}
 };
 
 template <class _Tp, class _Alloc = allocator<_Tp> >
@@ -357,24 +357,26 @@ class __forward_list_const_iterator;
 template <class _NodePtr>
 class __forward_list_iterator {
   typedef __forward_node_traits<_NodePtr> __traits;
+  typedef typename __traits::__node_type __node_type;
+  typedef typename __traits::__begin_node __begin_node_type;
   typedef typename __traits::__node_pointer __node_pointer;
   typedef typename __traits::__begin_node_pointer __begin_node_pointer;
   typedef typename __traits::__void_pointer __void_pointer;
 
   __begin_node_pointer __ptr_;
 
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const {
-    return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__ptr_));
-  }
-  _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
-    return static_cast<__node_pointer>(static_cast<__void_pointer>(__ptr_));
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
+    return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT
+      : __ptr_(nullptr) {}
 
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
   _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {}
 
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT
       : __ptr_(__traits::__as_iter_node(__p)) {}
 
   template <class, class>
@@ -389,27 +391,31 @@ public:
   typedef typename pointer_traits<__node_pointer>::difference_type difference_type;
   typedef __rebind_pointer_t<__node_pointer, value_type> pointer;
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_unsafe_node_pointer()->__get_value(); }
-  _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    return __get_unsafe_node_pointer()->__get_value();
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
     return pointer_traits<pointer>::pointer_to(__get_unsafe_node_pointer()->__get_value());
   }
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() {
     __ptr_ = __traits::__as_iter_node(__ptr_->__next_);
     return *this;
   }
-  _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) {
     __forward_list_iterator __t(*this);
     ++(*this);
     return __t;
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI bool operator==(const __forward_list_iterator& __x, const __forward_list_iterator& __y) {
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+  operator==(const __forward_list_iterator& __x, const __forward_list_iterator& __y) {
     return __x.__ptr_ == __y.__ptr_;
   }
-  friend _LIBCPP_HIDE_FROM_ABI bool operator!=(const __forward_list_iterator& __x, const __forward_list_iterator& __y) {
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+  operator!=(const __forward_list_iterator& __x, const __forward_list_iterator& __y) {
     return !(__x == __y);
   }
 };
@@ -421,23 +427,25 @@ class __forward_list_const_iterator {
 
   typedef __forward_node_traits<_NodePtr> __traits;
   typedef typename __traits::__node_type __node_type;
+  typedef typename __traits::__begin_node __begin_node_type;
   typedef typename __traits::__node_pointer __node_pointer;
   typedef typename __traits::__begin_node_pointer __begin_node_pointer;
   typedef typename __traits::__void_pointer __void_pointer;
 
   __begin_node_pointer __ptr_;
 
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const {
-    return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__ptr_));
-  }
-  _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
-    return static_cast<__node_pointer>(static_cast<__void_pointer>(__ptr_));
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const {
+    return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_);
   }
 
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(nullptr_t) _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(nullptr_t) _NOEXCEPT
+      : __ptr_(nullptr) {}
 
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
   _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {}
 
+  _LIBCPP_CONSTEXPR_SINCE_CXX26
   _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__node_pointer __p) _NOEXCEPT
       : __ptr_(__traits::__as_iter_node(__p)) {}
 
@@ -451,30 +459,32 @@ public:
   typedef typename pointer_traits<__node_pointer>::difference_type difference_type;
   typedef __rebind_pointer_t<__node_pointer, const value_type> pointer;
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {}
-  _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT
-      : __ptr_(__p.__ptr_) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT : __ptr_(__p.__ptr_) {}
 
-  _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_unsafe_node_pointer()->__get_value(); }
-  _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const {
+    return __get_unsafe_node_pointer()->__get_value();
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const {
     return pointer_traits<pointer>::pointer_to(__get_unsafe_node_pointer()->__get_value());
   }
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() {
     __ptr_ = __traits::__as_iter_node(__ptr_->__next_);
     return *this;
   }
-  _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) {
     __forward_list_const_iterator __t(*this);
     ++(*this);
     return __t;
   }
 
-  friend _LIBCPP_HIDE_FROM_ABI bool
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
   operator==(const __forward_list_const_iterator& __x, const __forward_list_const_iterator& __y) {
     return __x.__ptr_ == __y.__ptr_;
   }
-  friend _LIBCPP_HIDE_FROM_ABI bool
+  friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
   operator!=(const __forward_list_const_iterator& __x, const __forward_list_const_iterator& __y) {
     return !(__x == __y);
   }
@@ -498,48 +508,53 @@ protected:
 
   _LIBCPP_COMPRESSED_PAIR(__begin_node, __before_begin_, __node_allocator, __alloc_);
 
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() _NOEXCEPT {
     return pointer_traits<__begin_node_pointer>::pointer_to(__before_begin_);
   }
-  _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() const _NOEXCEPT {
-    return pointer_traits<__begin_node_pointer>::pointer_to(const_cast<__begin_node&>(__before_begin_));
+
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() const _NOEXCEPT {
+    return pointer_traits<__begin_node_pointer>::pointer_to(
+        *const_cast<__begin_node*>(std::addressof(__before_begin_)));
   }
 
   typedef __forward_list_iterator<__node_pointer> iterator;
   typedef __forward_list_const_iterator<__node_pointer> const_iterator;
 
-  _LIBCPP_HIDE_FROM_ABI __forward_list_base() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_base()
+      _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value)
       : __before_begin_(__begin_node()) {}
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a)
       : __before_begin_(__begin_node()), __alloc_(__node_allocator(__a)) {}
-  _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const __node_allocator& __a)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const __node_allocator& __a)
       : __before_begin_(__begin_node()), __alloc_(__a) {}
 
 public:
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
   __forward_list_base(__forward_list_base&& __x) noexcept(is_nothrow_move_constructible<__node_allocator>::value);
-  _LIBCPP_HIDE_FROM_ABI __forward_list_base(__forward_list_base&& __x, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  __forward_list_base(__forward_list_base&& __x, const allocator_type& __a);
 #  endif // _LIBCPP_CXX03_LANG
 
   __forward_list_base(const __forward_list_base&)            = delete;
   __forward_list_base& operator=(const __forward_list_base&) = delete;
 
-  _LIBCPP_HIDE_FROM_ABI ~__forward_list_base();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__forward_list_base();
 
 protected:
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x) {
     __copy_assign_alloc(__x, integral_constant<bool, __node_traits::propagate_on_container_copy_assignment::value>());
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x)
       _NOEXCEPT_(!__node_traits::propagate_on_container_move_assignment::value ||
                  is_nothrow_move_assignable<__node_allocator>::value) {
     __move_assign_alloc(__x, integral_constant<bool, __node_traits::propagate_on_container_move_assignment::value>());
   }
 
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI __node_pointer __create_node(__node_pointer __next, _Args&&... __args) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer
+  __create_node(__node_pointer __next, _Args&&... __args) {
     __allocation_guard<__node_allocator> __guard(__alloc_, 1);
     // Begin the lifetime of the node itself. Note that this doesn't begin the lifetime of the value
     // held inside the node, since we need to use the allocator's construct() method for that.
@@ -554,7 +569,7 @@ protected:
     return __guard.__release_ptr();
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
     // For the same reason as above, we use the allocator's destroy() method for the value_type,
     // but not for the node itself.
     __node_traits::destroy(__alloc_, std::addressof(__node->__get_value()));
@@ -563,7 +578,7 @@ protected:
   }
 
 public:
-  _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x)
 #  if _LIBCPP_STD_VER >= 14
       _NOEXCEPT;
 #  else
@@ -571,18 +586,21 @@ public:
 #  endif
 
 protected:
-  _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT;
 
 private:
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base&, false_type) {}
-  _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x, true_type) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base&, false_type) {
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  __copy_assign_alloc(const __forward_list_base& __x, true_type) {
     if (__alloc_ != __x.__alloc_)
       clear();
     __alloc_ = __x.__alloc_;
   }
 
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {}
-  _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {}
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type)
       _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value) {
     __alloc_ = std::move(__x.__alloc_);
   }
@@ -591,14 +609,15 @@ private:
 #  ifndef _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base&& __x) noexcept(
-    is_nothrow_move_constructible<__node_allocator>::value)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(
+    __forward_list_base&& __x) noexcept(is_nothrow_move_constructible<__node_allocator>::value)
     : __before_begin_(std::move(__x.__before_begin_)), __alloc_(std::move(__x.__alloc_)) {
   __x.__before_begin()->__next_ = nullptr;
 }
 
 template <class _Tp, class _Alloc>
-inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base&& __x, const allocator_type& __a)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(
+    __forward_list_base&& __x, const allocator_type& __a)
     : __before_begin_(__begin_node()), __alloc_(__node_allocator(__a)) {
   if (__alloc_ == __x.__alloc_) {
     __before_begin()->__next_     = __x.__before_begin()->__next_;
@@ -609,12 +628,12 @@ inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base
 #  endif // _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-__forward_list_base<_Tp, _Alloc>::~__forward_list_base() {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 __forward_list_base<_Tp, _Alloc>::~__forward_list_base() {
   clear();
 }
 
 template <class _Tp, class _Alloc>
-inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x)
 #  if _LIBCPP_STD_VER >= 14
     _NOEXCEPT
 #  else
@@ -627,7 +646,7 @@ inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x)
 }
 
 template <class _Tp, class _Alloc>
-void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT {
   for (__node_pointer __p = __before_begin()->__next_; __p != nullptr;) {
     __node_pointer __next = __p->__next_;
     __delete_node(__p);
@@ -672,105 +691,123 @@ public:
   typedef void __remove_return_type;
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI forward_list() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {
-  } // = default;
-  _LIBCPP_HIDE_FROM_ABI explicit forward_list(const allocator_type& __a);
-  _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list()
+      _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {} // = default;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n);
 #  if _LIBCPP_STD_VER >= 14
-  _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n, const allocator_type& __a);
 #  endif
-  _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v);
 
   template <__enable_if_t<__is_allocator<_Alloc>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v, const allocator_type& __a) : __base(__a) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(size_type __n, const value_type& __v, const allocator_type& __a)
+      : __base(__a) {
     insert_after(cbefore_begin(), __n, __v);
   }
 
   template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l);
 
   template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type())
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type())
       : __base(__a) {
     prepend_range(std::forward<_Range>(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x);
-  _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x, const __type_identity_t<allocator_type>& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(const forward_list& __x, const __type_identity_t<allocator_type>& __a);
 
-  _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x);
 
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value)
       : __base(std::move(__x)) {}
-  _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x, const __type_identity_t<allocator_type>& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(forward_list&& __x, const __type_identity_t<allocator_type>& __a);
 
-  _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list<value_type> __il);
-  _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list<value_type> __il, const allocator_type& __a);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list<value_type> __il);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI
+  forward_list(initializer_list<value_type> __il, const allocator_type& __a);
 
-  _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept(
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept(
       (__node_traits::propagate_on_container_move_assignment::value &&
        is_nothrow_move_assignable<allocator_type>::value) ||
       allocator_traits<allocator_type>::is_always_equal::value);
 
-  _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list<value_type> __il);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list<value_type> __il);
 
-  _LIBCPP_HIDE_FROM_ABI void assign(initializer_list<value_type> __il);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(initializer_list<value_type> __il);
 #  endif // _LIBCPP_CXX03_LANG
 
   // ~forward_list() = default;
 
   template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> = 0>
-  void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __f, _InputIterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __f, _InputIterator __l);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) {
     __assign_with_sentinel(ranges::begin(__range), ranges::end(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v);
 
-  _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(this->__alloc_); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT {
+    return allocator_type(this->__alloc_);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__base::__before_begin()->__next_); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT {
+    return iterator(__base::__before_begin()->__next_);
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT {
     return const_iterator(__base::__before_begin()->__next_);
   }
-  _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(nullptr); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT {
+    return const_iterator(nullptr);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT {
     return const_iterator(__base::__before_begin()->__next_);
   }
-  _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return const_iterator(nullptr); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT {
+    return const_iterator(nullptr);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { return iterator(__base::__before_begin()); }
-  _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT {
+    return iterator(__base::__before_begin());
+  }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT {
     return const_iterator(__base::__before_begin());
   }
-  _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT {
     return const_iterator(__base::__before_begin());
   }
 
-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT {
+  [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT {
     return __base::__before_begin()->__next_ == nullptr;
   }
-  _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT {
     return std::min<size_type>(__node_traits::max_size(this->__alloc_), numeric_limits<difference_type>::max());
   }
 
-  _LIBCPP_HIDE_FROM_ABI reference front() {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() {
     _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list");
     return __base::__before_begin()->__next_->__get_value();
   }
-  _LIBCPP_HIDE_FROM_ABI const_reference front() const {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const {
     _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list");
     return __base::__before_begin()->__next_->__get_value();
   }
@@ -778,54 +815,59 @@ public:
 #  ifndef _LIBCPP_CXX03_LANG
 #    if _LIBCPP_STD_VER >= 17
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args);
 #    else
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args);
 #    endif
-  _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v);
 #  endif // _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) {
     insert_range_after(cbefore_begin(), std::forward<_Range>(__range));
   }
 #  endif
 
-  _LIBCPP_HIDE_FROM_ABI void pop_front();
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void pop_front();
 
 #  ifndef _LIBCPP_CXX03_LANG
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI iterator emplace_after(const_iterator __p, _Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator emplace_after(const_iterator __p, _Args&&... __args);
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, value_type&& __v);
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, initializer_list<value_type> __il) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, value_type&& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_after(const_iterator __p, initializer_list<value_type> __il) {
     return insert_after(__p, __il.begin(), __il.end());
   }
 #  endif // _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, const value_type& __v);
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, size_type __n, const value_type& __v) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_after(const_iterator __p, size_type __n, const value_type& __v) {
     return __insert_after(__p, __n, __v);
   }
   template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l);
 
 #  if _LIBCPP_STD_VER >= 23
   template <_ContainerCompatibleRange<_Tp> _Range>
-  _LIBCPP_HIDE_FROM_ABI iterator insert_range_after(const_iterator __position, _Range&& __range) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  insert_range_after(const_iterator __position, _Range&& __range) {
     return __insert_after_with_sentinel(__position, ranges::begin(__range), ranges::end(__range));
   }
 #  endif
 
   template <class _InputIterator, class _Sentinel>
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  __insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l);
 
-  _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __p);
-  _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __p);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l);
 
-  _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x)
 #  if _LIBCPP_STD_VER >= 14
       _NOEXCEPT
 #  else
@@ -835,58 +877,63 @@ public:
     __base::swap(__x);
   }
 
-  _LIBCPP_HIDE_FROM_ABI void resize(size_type __n);
-  _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v);
-  _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); }
 
-  _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x);
-  _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x, const_iterator __i);
-  _LIBCPP_HIDE_FROM_ABI void
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  splice_after(const_iterator __p, forward_list&& __x, const_iterator __i);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
   splice_after(const_iterator __p, forward_list&& __x, const_iterator __f, const_iterator __l);
-  _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x);
-  _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x, const_iterator __i);
-  _LIBCPP_HIDE_FROM_ABI void
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+  splice_after(const_iterator __p, forward_list& __x, const_iterator __i);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
   splice_after(const_iterator __p, forward_list& __x, const_iterator __f, const_iterator __l);
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __v);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __v);
   template <class _Predicate>
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Predicate __pred);
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Predicate __pred);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); }
   template <class _BinaryPredicate>
-  _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPredicate __binary_pred);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPredicate __binary_pred);
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x) { merge(__x, __less<>()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x) { merge(__x, __less<>()); }
   template <class _Compare>
-  _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x, _Compare __comp) {
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x, _Compare __comp) {
     merge(__x, std::move(__comp));
   }
 #  endif // _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x) { merge(__x, __less<>()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x) { merge(__x, __less<>()); }
   template <class _Compare>
-  _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x, _Compare __comp);
-  _LIBCPP_HIDE_FROM_ABI void sort() { sort(__less<>()); }
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x, _Compare __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort() { sort(__less<>()); }
   template <class _Compare>
-  _LIBCPP_HIDE_FROM_ABI void sort(_Compare __comp);
-  _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT;
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort(_Compare __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT;
 
 private:
 #  ifndef _LIBCPP_CXX03_LANG
-  _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, true_type)
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, true_type)
       _NOEXCEPT_(is_nothrow_move_assignable<allocator_type>::value);
-  _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, false_type);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, false_type);
 #  endif // _LIBCPP_CXX03_LANG
 
   template <class _Iter, class _Sent>
-  _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iter __f, _Sent __l);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iter __f, _Sent __l);
 
   template <class... _Args>
-  _LIBCPP_HIDE_FROM_ABI iterator __insert_after(const_iterator __p, size_type __n, _Args&&... __args);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator
+  __insert_after(const_iterator __p, size_type __n, _Args&&... __args);
 
   template <class _Compare>
-  static _LIBCPP_HIDE_FROM_ABI __node_pointer __merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 static _LIBCPP_HIDE_FROM_ABI __node_pointer
+  __merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp);
 
   // TODO: Make this _LIBCPP_HIDE_FROM_ABI
   template <class _Compare>
-  static _LIBCPP_HIDDEN __node_pointer __sort(__node_pointer __f, difference_type __sz, _Compare& __comp);
+  _LIBCPP_CONSTEXPR_SINCE_CXX26 static _LIBCPP_HIDDEN __node_pointer
+  __sort(__node_pointer __f, difference_type __sz, _Compare& __comp);
 };
 
 #  if _LIBCPP_STD_VER >= 17
@@ -911,10 +958,10 @@ forward_list(from_range_t, _Range&&, _Alloc = _Alloc()) -> forward_list<ranges::
 #  endif
 
 template <class _Tp, class _Alloc>
-inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {}
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {}
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n) {
   if (__n > 0) {
     for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) {
       __p->__next_ = this->__create_node(/* next = */ nullptr);
@@ -924,7 +971,8 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n) {
 
 #  if _LIBCPP_STD_VER >= 14
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc) : __base(__base_alloc) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc)
+    : __base(__base_alloc) {
   if (__n > 0) {
     for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) {
       __p->__next_ = this->__create_node(/* next = */ nullptr);
@@ -934,37 +982,39 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __b
 #  endif
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(size_type __n, const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const value_type& __v) {
   insert_after(cbefore_begin(), __n, __v);
 }
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> >
-forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) {
   insert_after(cbefore_begin(), __f, __l);
 }
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> >
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a)
     : __base(__a) {
   insert_after(cbefore_begin(), __f, __l);
 }
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x)
     : __base(__node_traits::select_on_container_copy_construction(__x.__alloc_)) {
   insert_after(cbefore_begin(), __x.begin(), __x.end());
 }
 
 template <class _Tp, class _Alloc>
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x, const __type_identity_t<allocator_type>& __a)
     : __base(__a) {
   insert_after(cbefore_begin(), __x.begin(), __x.end());
 }
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) {
   if (this != std::addressof(__x)) {
     __base::__copy_assign_alloc(__x);
     assign(__x.begin(), __x.end());
@@ -974,6 +1024,7 @@ forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_li
 
 #  ifndef _LIBCPP_CXX03_LANG
 template <class _Tp, class _Alloc>
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identity_t<allocator_type>& __a)
     : __base(std::move(__x), __a) {
   if (this->__alloc_ != __x.__alloc_) {
@@ -983,17 +1034,19 @@ forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identit
 }
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(initializer_list<value_type> __il) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(initializer_list<value_type> __il) {
   insert_after(cbefore_begin(), __il.begin(), __il.end());
 }
 
 template <class _Tp, class _Alloc>
-forward_list<_Tp, _Alloc>::forward_list(initializer_list<value_type> __il, const allocator_type& __a) : __base(__a) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26
+forward_list<_Tp, _Alloc>::forward_list(initializer_list<value_type> __il, const allocator_type& __a)
+    : __base(__a) {
   insert_after(cbefore_begin(), __il.begin(), __il.end());
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type)
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type)
     _NOEXCEPT_(is_nothrow_move_assignable<allocator_type>::value) {
   clear();
   __base::__move_assign_alloc(__x);
@@ -1002,7 +1055,7 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type)
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) {
   if (this->__alloc_ == __x.__alloc_)
     __move_assign(__x, true_type());
   else {
@@ -1012,7 +1065,8 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) {
 }
 
 template <class _Tp, class _Alloc>
-inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept(
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>&
+forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept(
     (__node_traits::propagate_on_container_move_assignment::value &&
      is_nothrow_move_assignable<allocator_type>::value) ||
     allocator_traits<allocator_type>::is_always_equal::value) {
@@ -1021,7 +1075,8 @@ inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_l
 }
 
 template <class _Tp, class _Alloc>
-inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(initializer_list<value_type> __il) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>&
+forward_list<_Tp, _Alloc>::operator=(initializer_list<value_type> __il) {
   assign(__il.begin(), __il.end());
   return *this;
 }
@@ -1030,13 +1085,14 @@ inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(initializ
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> >
-void forward_list<_Tp, _Alloc>::assign(_InputIterator __f, _InputIterator __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::assign(_InputIterator __f, _InputIterator __l) {
   __assign_with_sentinel(__f, __l);
 }
 
 template <class _Tp, class _Alloc>
 template <class _Iter, class _Sent>
-_LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::__assign_with_sentinel(_Iter __f, _Sent __l) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void
+forward_list<_Tp, _Alloc>::__assign_with_sentinel(_Iter __f, _Sent __l) {
   iterator __i = before_begin();
   iterator __j = std::next(__i);
   iterator __e = end();
@@ -1049,7 +1105,7 @@ _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::__assign_with_sentinel(_It
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) {
   iterator __i = before_begin();
   iterator __j = std::next(__i);
   iterator __e = end();
@@ -1064,18 +1120,19 @@ void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) {
 #  ifndef _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-inline void forward_list<_Tp, _Alloc>::assign(initializer_list<value_type> __il) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void forward_list<_Tp, _Alloc>::assign(initializer_list<value_type> __il) {
   assign(__il.begin(), __il.end());
 }
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
+_LIBCPP_CONSTEXPR_SINCE_CXX26
 #    if _LIBCPP_STD_VER >= 17
-typename forward_list<_Tp, _Alloc>::reference
+    typename forward_list<_Tp, _Alloc>::reference
 #    else
-void
+    void
 #    endif
-forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
+    forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
   __base::__before_begin()->__next_ =
       this->__create_node(/* next = */ __base::__before_begin()->__next_, std::forward<_Args>(__args)...);
 #    if _LIBCPP_STD_VER >= 17
@@ -1084,7 +1141,7 @@ forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) {
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) {
   __base::__before_begin()->__next_ =
       this->__create_node(/* next = */ __base::__before_begin()->__next_, std::move(__v));
 }
@@ -1092,12 +1149,12 @@ void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) {
 #  endif // _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) {
   __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, __v);
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::pop_front() {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::pop_front() {
   _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::pop_front called on an empty list");
   __node_pointer __p                = __base::__before_begin()->__next_;
   __base::__before_begin()->__next_ = __p->__next_;
@@ -1108,7 +1165,7 @@ void forward_list<_Tp, _Alloc>::pop_front() {
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... __args) {
   __begin_node_pointer const __r = __p.__get_begin();
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, std::forward<_Args>(__args)...);
@@ -1116,7 +1173,7 @@ forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... __args)
 }
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) {
   __begin_node_pointer const __r = __p.__get_begin();
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, std::move(__v));
@@ -1126,7 +1183,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) {
 #  endif // _LIBCPP_CXX03_LANG
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __v) {
   __begin_node_pointer const __r = __p.__get_begin();
   __r->__next_                   = this->__create_node(/* next = */ __r->__next_, __v);
@@ -1135,7 +1192,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Args&&... __args) {
   __begin_node_pointer __r = __p.__get_begin();
   if (__n > 0) {
@@ -1159,21 +1216,21 @@ forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Ar
 #  endif // _LIBCPP_HAS_EXCEPTIONS
     __last->__next_ = __r->__next_;
     __r->__next_    = __first;
-    __r             = static_cast<__begin_node_pointer>(__last);
+    __r             = __forward_node_traits<__node_pointer>::__as_iter_node(__last);
   }
   return iterator(__r);
 }
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, __enable_if_t<__has_input_iterator_category<_InputIterator>::value, int> >
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l) {
   return __insert_after_with_sentinel(__p, std::move(__f), std::move(__l));
 }
 
 template <class _Tp, class _Alloc>
 template <class _InputIterator, class _Sentinel>
-_LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l) {
   __begin_node_pointer __r = __p.__get_begin();
 
@@ -1200,14 +1257,15 @@ forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _Inp
 
     __last->__next_ = __r->__next_;
     __r->__next_    = __first;
-    __r             = static_cast<__begin_node_pointer>(__last);
+    __r             = __forward_node_traits<__node_pointer>::__as_iter_node(__last);
   }
 
   return iterator(__r);
 }
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
+forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) {
   __begin_node_pointer __p = __f.__get_begin();
   __node_pointer __n       = __p->__next_;
   __p->__next_             = __n->__next_;
@@ -1216,7 +1274,7 @@ typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::erase_af
 }
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::iterator
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator
 forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) {
   __node_pointer __e = __l.__get_unsafe_node_pointer();
   if (__f != __l) {
@@ -1236,7 +1294,7 @@ forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) {
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::resize(size_type __n) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type __n) {
   size_type __sz = 0;
   iterator __p   = before_begin();
   iterator __i   = begin();
@@ -1250,7 +1308,7 @@ void forward_list<_Tp, _Alloc>::resize(size_type __n) {
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) {
   size_type __sz = 0;
   iterator __p   = before_begin();
   iterator __i   = begin();
@@ -1264,7 +1322,7 @@ void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) {
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) {
   if (!__x.empty()) {
     if (__p.__get_begin()->__next_ != nullptr) {
       const_iterator __lm1 = __x.before_begin();
@@ -1278,7 +1336,8 @@ void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& _
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void
+forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) {
   const_iterator __lm1 = std::next(__i);
   if (__p != __i && __p != __lm1) {
     __i.__get_begin()->__next_   = __lm1.__get_begin()->__next_;
@@ -1288,7 +1347,7 @@ void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::splice_after(
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::splice_after(
     const_iterator __p, forward_list& /*__other*/, const_iterator __f, const_iterator __l) {
   if (__f != __l && __p != __f) {
     const_iterator __lm1 = __f;
@@ -1303,24 +1362,26 @@ void forward_list<_Tp, _Alloc>::splice_after(
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void
+forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x) {
   splice_after(__p, __x);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void
 forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x, const_iterator __i) {
   splice_after(__p, __x, __i);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(
     const_iterator __p, forward_list&& __x, const_iterator __f, const_iterator __l) {
   splice_after(__p, __x, __f, __l);
 }
 
 template <class _Tp, class _Alloc>
-typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::remove(const value_type& __v) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type
+forward_list<_Tp, _Alloc>::remove(const value_type& __v) {
   forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0;
   const iterator __e                                            = end();
@@ -1343,7 +1404,8 @@ typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Allo
 
 template <class _Tp, class _Alloc>
 template <class _Predicate>
-typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type
+forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) {
   forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0;
   const iterator __e                                            = end();
@@ -1366,7 +1428,7 @@ typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Allo
 
 template <class _Tp, class _Alloc>
 template <class _BinaryPredicate>
-typename forward_list<_Tp, _Alloc>::__remove_return_type
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type
 forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) {
   forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing
   typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0;
@@ -1384,7 +1446,7 @@ forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) {
 
 template <class _Tp, class _Alloc>
 template <class _Compare>
-void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) {
   if (this != std::addressof(__x)) {
     __base::__before_begin()->__next_ =
         __merge(__base::__before_begin()->__next_, __x.__before_begin()->__next_, __comp);
@@ -1394,7 +1456,7 @@ void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) {
 
 template <class _Tp, class _Alloc>
 template <class _Compare>
-typename forward_list<_Tp, _Alloc>::__node_pointer
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__node_pointer
 forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp) {
   if (__f1 == nullptr)
     return __f2;
@@ -1431,13 +1493,13 @@ forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Co
 
 template <class _Tp, class _Alloc>
 template <class _Compare>
-inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) {
   __base::__before_begin()->__next_ = __sort(__base::__before_begin()->__next_, std::distance(begin(), end()), __comp);
 }
 
 template <class _Tp, class _Alloc>
 template <class _Compare>
-typename forward_list<_Tp, _Alloc>::__node_pointer
+_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__node_pointer
 forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Compare& __comp) {
   switch (__sz) {
   case 0:
@@ -1461,7 +1523,7 @@ forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Co
 }
 
 template <class _Tp, class _Alloc>
-void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT {
   __node_pointer __p = __base::__before_begin()->__next_;
   if (__p != nullptr) {
     __node_pointer __f = __p->__next_;
@@ -1477,7 +1539,8 @@ void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT {
 }
 
 template <class _Tp, class _Alloc>
-_LIBCPP_HIDE_FROM_ABI bool operator==(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool
+operator==(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   typedef forward_list<_Tp, _Alloc> _Cp;
   typedef typename _Cp::const_iterator _Ip;
   _Ip __ix = __x.begin();
@@ -1493,31 +1556,31 @@ _LIBCPP_HIDE_FROM_ABI bool operator==(const forward_list<_Tp, _Alloc>& __x, cons
 #  if _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator!=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return !(__x == __y);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator<(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return std::lexicographical_compare(__x.begin(), __x.end(), __y.begin(), __y.end());
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator>(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return __y < __x;
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator>=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return !(__x < __y);
 }
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI bool
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool
 operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) {
   return !(__y < __x);
 }
@@ -1525,7 +1588,7 @@ operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>
 #  else // #if _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Allocator>
-_LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp>
+_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp>
 operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _Allocator>& __y) {
   return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way);
 }
@@ -1533,20 +1596,20 @@ operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _A
 #  endif // #if _LIBCPP_STD_VER <= 17
 
 template <class _Tp, class _Alloc>
-inline _LIBCPP_HIDE_FROM_ABI void swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y)
-    _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) {
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void
+swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) {
   __x.swap(__y);
 }
 
 #  if _LIBCPP_STD_VER >= 20
 template <class _Tp, class _Allocator, class _Predicate>
-inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type
 erase_if(forward_list<_Tp, _Allocator>& __c, _Predicate __pred) {
   return __c.remove_if(__pred);
 }
 
 template <class _Tp, class _Allocator, class _Up>
-inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type
+_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type
 erase(forward_list<_Tp, _Allocator>& __c, const _Up& __v) {
   return std::erase_if(__c, [&](const auto& __elem) -> bool { return __elem == __v; });
 }
diff --git a/libcxx/include/version b/libcxx/include/version
index 65fae111dc8ed..87c4ede9a7e59 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -68,6 +68,7 @@ __cpp_lib_constexpr_charconv                            202207L <charconv>
 __cpp_lib_constexpr_cmath                               202202L <cmath> <cstdlib>
 __cpp_lib_constexpr_complex                             201711L <complex>
 __cpp_lib_constexpr_dynamic_alloc                       201907L <memory>
+__cpp_lib_constexpr_forward_list                        202502L <forward_list>
 __cpp_lib_constexpr_functional                          201907L <functional>
 __cpp_lib_constexpr_iterator                            201811L <iterator>
 __cpp_lib_constexpr_memory                              202202L <memory>
@@ -543,6 +544,7 @@ __cpp_lib_void_t                                        201411L <type_traits>
 # define __cpp_lib_bitset                               202306L
 # undef  __cpp_lib_constexpr_algorithms
 # define __cpp_lib_constexpr_algorithms                 202306L
+# define __cpp_lib_constexpr_forward_list               202502L
 # if !defined(_LIBCPP_ABI_VCRUNTIME)
 #   define __cpp_lib_constexpr_new                      202406L
 # endif
diff --git a/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp
index 52adfc4d85985..a9ef855e9a73e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp
@@ -11,7 +11,7 @@
 
 // template<class T, class Allocator>
 //   synth-three-way-result<T> operator<=>(const forward_list<T, Allocator>& x,
-//                                         const forward_list<T, Allocator>& y);
+//                                         const forward_list<T, Allocator>& y); // constexpr since C++26
 
 #include <cassert>
 #include <forward_list>
@@ -20,6 +20,9 @@
 
 int main(int, char**) {
   assert(test_sequence_container_spaceship<std::forward_list>());
-  // `std::forward_list` is not constexpr, so no `static_assert` test here.
+#if TEST_STD_VER >= 26
+  static_assert(test_sequence_container_spaceship<std::forward_list>());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp
index dbc0631d11930..4482d26f308a6 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp
@@ -10,7 +10,7 @@
 
 // class forward_list
 
-// bool empty() const noexcept;
+// bool empty() const noexcept; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef std::forward_list<int> C;
     C c;
@@ -42,5 +42,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp
index 757db7d957f5f..50b549f17d561 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp
@@ -8,17 +8,18 @@
 
 // <forward_list>
 
-// reference       front();
-// const_reference front() const;
+// reference       front();       // constexpr since C++26
+// const_reference front() const; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
 #include <iterator>
 
+#include "test_allocator.h"
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -58,5 +59,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp
index 31893a1b95994..4645560048cf6 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// explicit forward_list(const allocator_type& a);
+// explicit forward_list(const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_allocator.h"
 #include "../../../NotConstructible.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef test_allocator<NotConstructible> A;
     typedef A::value_type T;
@@ -26,5 +26,14 @@ int main(int, char**) {
     assert(c.empty());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp
index bfb330fdaf9fc..ffc6d37f28160 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// explicit forward_list(const allocator_type& a);
+// explicit forward_list(const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "../../../NotConstructible.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef test_allocator<NotConstructible> A;
     typedef A::value_type T;
@@ -46,5 +46,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp
index 27d450c63dcae..b99af4ccb79ec 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list& operator=(const forward_list& x);
+// forward_list& operator=(const forward_list& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<int> A;
@@ -143,5 +143,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp
index 1cdcca82d3352..ea2802b323a91 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// void assign(initializer_list<value_type> il);
+// void assign(initializer_list<value_type> il); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -65,5 +65,14 @@ int main(int, char**) {
     assert(n == 4);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp
index 998a7e11ef343..9c88db6166ba7 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list& operator=(forward_list&& x);
+// forward_list& operator=(forward_list&& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -21,7 +21,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef test_allocator<T> A;
@@ -194,5 +194,14 @@ int main(int, char**) {
     assert(c0.empty());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp
index a22d6c4985bc5..d21898dc4663a 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list& operator=(initializer_list<value_type> il);
+// forward_list& operator=(initializer_list<value_type> il); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -65,5 +65,14 @@ int main(int, char**) {
     assert(n == 4);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp
index 9a35328740790..1601b4b47acd1 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class InputIterator>
-//     void assign(InputIterator first, InputIterator last);
+//     void assign(InputIterator first, InputIterator last); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_iterators.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -75,5 +75,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp
index b0fbfa3249e5e..75626b47c5273 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void assign(size_type n, const value_type& v);
+// void assign(size_type n, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -65,5 +65,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp
index 22d5054b9ae18..12d701bff4b68 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list(const forward_list& x);
+// forward_list(const forward_list& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<int> A;
@@ -64,5 +64,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp
index a61233e4b5d22..fc3ff485b0667 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list(const forward_list& x, const allocator_type& a);
+// forward_list(const forward_list& x, const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<int> A;
@@ -64,5 +64,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp
index b493a89b78003..e0ea8bf66cb3b 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list();
+// forward_list(); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -38,5 +38,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp
index 312f6dbad3550..d1e1734e86f9f 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp
@@ -9,14 +9,14 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   forward_list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23
+//   forward_list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23; constexpr since C++26
 
 #include <forward_list>
 
 #include "../../from_range_sequence_containers.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for_all_iterators_and_allocators<int>([]<class Iter, class Sent, class Alloc>() {
     test_sequence_container<std::forward_list, int, Iter, Sent, Alloc>([](const auto&) {
       // No additional validation to do.
@@ -26,8 +26,19 @@ int main(int, char**) {
 
   static_assert(test_constraints<std::forward_list, int, double>());
 
-  test_exception_safety_throwing_copy<std::forward_list>();
-  test_exception_safety_throwing_allocator<std::forward_list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_exception_safety_throwing_copy<std::forward_list>();
+    test_exception_safety_throwing_allocator<std::forward_list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp
index b42242b0a83d4..b7acf60aa70cc 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list(initializer_list<value_type> il);
+// forward_list(initializer_list<value_type> il); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -38,5 +38,14 @@ int main(int, char**) {
     assert(n == 10);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp
index 0b29cbfa9254d..33d569c921a94 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list(initializer_list<value_type> il, const allocator_type& a);
+// forward_list(initializer_list<value_type> il, const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<T> A;
@@ -43,5 +43,14 @@ int main(int, char**) {
     assert(c.get_allocator() == A());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp
index 762e252ca76fe..20575479f7357 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list(forward_list&& x);
+// forward_list(forward_list&& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -21,7 +21,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef test_allocator<T> A;
@@ -68,5 +68,14 @@ int main(int, char**) {
     assert(c.get_allocator() == A());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp
index a9bc2cb12f288..219505bf4fd17 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// forward_list(forward_list&& x, const allocator_type& a);
+// forward_list(forward_list&& x, const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -21,7 +21,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef test_allocator<T> A;
@@ -68,5 +68,14 @@ int main(int, char**) {
     assert(c.get_allocator() == A());
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp
index ebd0e6a5bd1e0..61393eb28938e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class InputIterator>
-//     forward_list(InputIterator first, InputIterator last);
+//     forward_list(InputIterator first, InputIterator last); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_iterators.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -45,5 +45,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp
index 4a28041ad2cbc..c0637420e328a 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp
@@ -10,7 +10,7 @@
 
 // template <class InputIterator>
 //     forward_list(InputIterator first, InputIterator last,
-//                  const allocator_type& a);
+//                  const allocator_type& a);                 // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -21,7 +21,7 @@
 #include "test_iterators.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<T> A;
@@ -51,5 +51,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp
index 81b128d2149e3..206854560c19f 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp
@@ -8,8 +8,8 @@
 
 // <forward_list>
 
-// explicit forward_list(size_type n);
-// explicit forward_list(size_type n, const Alloc& a);
+// explicit forward_list(size_type n);                 // constexpr since C++26
+// explicit forward_list(size_type n, const Alloc& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp
index 663422d1c3c30..85d11e3f40a2f 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list(size_type n, const value_type& v);
+// forward_list(size_type n, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -42,5 +42,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp
index af7f7471d4c98..abcdf62452b89 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// forward_list(size_type n, const value_type& v, const allocator_type& a);
+// forward_list(size_type n, const value_type& v, const allocator_type& a); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef test_allocator<int> A;
     typedef A::value_type T;
@@ -47,5 +47,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp
index 1044d779220ee..86d7769fe16ee 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp
@@ -11,7 +11,7 @@
 
 // template <class T, class Allocator, class U>
 //   typename forward_list<T, Allocator>::size_type
-//   erase(forward_list<T, Allocator>& c, const U& value);
+//   erase(forward_list<T, Allocator>& c, const U& value); // constexpr since C++26
 
 #include <forward_list>
 #include <optional>
@@ -21,14 +21,14 @@
 #include "min_allocator.h"
 
 template <class S, class U>
-void test0(S s, U val, S expected, std::size_t expected_erased_count) {
+TEST_CONSTEXPR_CXX26 void test0(S s, U val, S expected, std::size_t expected_erased_count) {
   ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase(s, val)));
   assert(expected_erased_count == std::erase(s, val));
   assert(s == expected);
 }
 
 template <class S>
-void test() {
+TEST_CONSTEXPR_CXX26 void test() {
   test0(S(), 1, S(), 0);
 
   test0(S({1}), 1, S(), 1);
@@ -62,13 +62,21 @@ void test() {
   test0(S({1, 2, 1}), opt(3), S({1, 2, 1}), 0);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   test<std::forward_list<int>>();
   test<std::forward_list<int, min_allocator<int>>>();
   test<std::forward_list<int, test_allocator<int>>>();
-
   test<std::forward_list<long>>();
   test<std::forward_list<double>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp
index c4f45a1069a2b..c665f9cccbf0a 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp
@@ -11,7 +11,7 @@
 
 // template <class T, class Allocator, class Predicate>
 //   typename forward_list<T, Allocator>::size_type
-//   erase_if(forward_list<T, Allocator>& c, Predicate pred);
+//   erase_if(forward_list<T, Allocator>& c, Predicate pred); // constexpr since C++26
 
 #include <forward_list>
 
@@ -20,14 +20,14 @@
 #include "min_allocator.h"
 
 template <class S, class Pred>
-void test0(S s, Pred p, S expected, std::size_t expected_erased_count) {
+TEST_CONSTEXPR_CXX26 void test0(S s, Pred p, S expected, std::size_t expected_erased_count) {
   ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase_if(s, p)));
   assert(expected_erased_count == std::erase_if(s, p));
   assert(s == expected);
 }
 
 template <typename S>
-void test() {
+TEST_CONSTEXPR_CXX26 void test() {
   auto is1   = [](auto v) { return v == 1; };
   auto is2   = [](auto v) { return v == 2; };
   auto is3   = [](auto v) { return v == 3; };
@@ -64,13 +64,21 @@ void test() {
   test0(S({1, 2, 3}), False, S({1, 2, 3}), 0);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   test<std::forward_list<int>>();
   test<std::forward_list<int, min_allocator<int>>>();
   test<std::forward_list<int, test_allocator<int>>>();
-
   test<std::forward_list<long>>();
   test<std::forward_list<double>>();
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp
index d66d2cd879515..52b5d87860aab 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp
@@ -8,9 +8,9 @@
 
 // <forward_list>
 
-// iterator       before_begin();
-// const_iterator before_begin() const;
-// const_iterator cbefore_begin() const;
+// iterator       before_begin();        // constexpr since C++26
+// const_iterator before_begin() const;  // constexpr since C++26
+// const_iterator cbefore_begin() const; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -101,5 +101,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp
index 135689b2321c3..560c47b17958f 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp
@@ -8,12 +8,12 @@
 
 // <forward_list>
 
-// iterator       begin();
-// iterator       end();
-// const_iterator begin()  const;
-// const_iterator end()    const;
-// const_iterator cbegin() const;
-// const_iterator cend()   const;
+// iterator       begin();        // constexpr since C++26
+// iterator       end();          // constexpr since C++26
+// const_iterator begin()  const; // constexpr since C++26
+// const_iterator end()    const; // constexpr since C++26
+// const_iterator cbegin() const; // constexpr since C++26
+// const_iterator cend()   const; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -22,7 +22,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -69,6 +69,8 @@ int main(int, char**) {
     typedef std::forward_list<T> C;
     C::iterator i;
     C::const_iterator j;
+    (void)i;
+    (void)j;
   }
 #if TEST_STD_VER >= 11
   {
@@ -117,6 +119,8 @@ int main(int, char**) {
     typedef std::forward_list<T, min_allocator<T>> C;
     C::iterator i;
     C::const_iterator j;
+    (void)i;
+    (void)j;
   }
 #endif
 #if TEST_STD_VER > 11
@@ -142,5 +146,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp
index a27cc757025b5..9a3adec1d9756 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   constexpr void prepend_range(R&& rg); // C++23
+//   constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26
 
 #include <forward_list>
 
@@ -21,7 +21,7 @@
 //   {empty/one-element/full} container);
 // - prepending move-only elements;
 // - an exception is thrown when copying the elements or when allocating new elements.
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_assign_range<std::forward_list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -31,8 +31,19 @@ int main(int, char**) {
   });
   test_sequence_prepend_range_move_only<std::forward_list>();
 
-  test_prepend_range_exception_safety_throwing_copy<std::forward_list>();
-  test_prepend_range_exception_safety_throwing_allocator<std::forward_list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_prepend_range_exception_safety_throwing_copy<std::forward_list>();
+    test_prepend_range_exception_safety_throwing_allocator<std::forward_list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp
index 9f6d34b701df7..2e1768cf8bad9 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void clear() noexcept;
+// void clear() noexcept; // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "../../../NotConstructible.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef NotConstructible T;
     typedef std::forward_list<T> C;
@@ -64,5 +64,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp
index f77d47ee7c74f..6433607af9b39 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp
@@ -11,7 +11,7 @@
 // <forward_list>
 
 // template <class... Args>
-//     iterator emplace_after(const_iterator p, Args&&... args);
+//     iterator emplace_after(const_iterator p, Args&&... args); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -20,7 +20,7 @@
 #include "../../../Emplaceable.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef Emplaceable T;
     typedef std::forward_list<T> C;
@@ -84,5 +84,14 @@ int main(int, char**) {
     assert(std::distance(c.begin(), c.end()) == 4);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
index cd3bb20c52ae5..46ae27b43622e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// template <class... Args> reference emplace_front(Args&&... args);
+// template <class... Args> reference emplace_front(Args&&... args); // constexpr since C++26
 // return type is 'reference' in C++17; 'void' before
 
 #include <forward_list>
@@ -21,7 +21,7 @@
 #include "../../../Emplaceable.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef Emplaceable T;
     typedef std::forward_list<T> C;
@@ -67,5 +67,14 @@ int main(int, char**) {
     assert(std::distance(c.begin(), c.end()) == 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp
index e85951798526d..73cb03c2cb7d2 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// iterator erase_after(const_iterator first, const_iterator last);
+// iterator erase_after(const_iterator first, const_iterator last); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -153,5 +153,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp
index 892228e76def7..12997f1dad3b9 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// iterator erase_after(const_iterator p);
+// iterator erase_after(const_iterator p); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -95,5 +95,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp
index 8443158413e7f..d93789dd6bb5c 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// iterator insert_after(const_iterator p, const value_type& v);
+// iterator insert_after(const_iterator p, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -84,5 +84,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp
index de924a10c18f0..54be47f4264ff 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// iterator insert_after(const_iterator p, initializer_list<value_type> il);
+// iterator insert_after(const_iterator p, initializer_list<value_type> il); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -70,5 +70,14 @@ int main(int, char**) {
     assert(*std::next(c.begin(), 4) == 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp
index af810d0f6961c..f89fbd7619da2 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp
@@ -10,7 +10,7 @@
 
 // template <class InputIterator>
 //     iterator insert_after(const_iterator p,
-//                           InputIterator first, InputIterator last);
+//                           InputIterator first, InputIterator last); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_iterators.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -77,5 +77,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp
index acd4bc73f724e..01b76f5cd64f1 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// iterator insert_after(const_iterator p, value_type&& v);
+// iterator insert_after(const_iterator p, value_type&& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef std::forward_list<T> C;
@@ -85,5 +85,14 @@ int main(int, char**) {
     assert(std::distance(c.begin(), c.end()) == 4);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp
index 2506f04311e0e..f4f0521ad2371 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// iterator insert_after(const_iterator p, size_type n, const value_type& v);
+// iterator insert_after(const_iterator p, size_type n, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -70,5 +70,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp
index 25f4c43f38486..71a291430b435 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp
@@ -8,8 +8,10 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
+
 // template<container-compatible-range<T> R>
-//   constexpr iterator insert_range_after(const_iterator position, R&& rg); // C++23
+//   constexpr iterator insert_range_after(const_iterator position, R&& rg); // C++23; constexpr since C++26
 
 #include <forward_list>
 
@@ -321,7 +323,7 @@ constexpr void test_sequence_insert_range_after() {
   }
 }
 
-void test_sequence_insert_range_after_move_only() {
+TEST_CONSTEXPR_CXX26 void test_sequence_insert_range_after_move_only() {
   MoveOnly input[5];
   std::ranges::subrange in(std::move_iterator{input}, std::move_iterator{input + 5});
 
@@ -366,7 +368,7 @@ void test_insert_range_after_exception_safety_throwing_allocator() {
 #endif
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_insert_range_after<std::forward_list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -374,8 +376,19 @@ int main(int, char**) {
   });
   test_sequence_insert_range_after_move_only();
 
-  test_insert_range_after_exception_safety_throwing_copy();
-  test_insert_range_after_exception_safety_throwing_allocator<int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_insert_range_after_exception_safety_throwing_copy();
+    test_insert_range_after_exception_safety_throwing_allocator<int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp
index 98c7a26341179..9fcade7ff6bba 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void pop_front();
+// void pop_front(); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -17,7 +17,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -71,5 +71,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp
index 418aa72052ba9..c4b9cd9bdfc41 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // template<container-compatible-range<T> R>
-//   constexpr void prepend_range(R&& rg); // C++23
+//   constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26
 
 #include <forward_list>
 
@@ -21,7 +21,7 @@
 //   {empty/one-element/full} container);
 // - prepending move-only elements;
 // - an exception is thrown when copying the elements or when allocating new elements.
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   static_assert(test_constraints_prepend_range<std::forward_list, int, double>());
 
   for_all_iterators_and_allocators<int, const int*>([]<class Iter, class Sent, class Alloc>() {
@@ -31,8 +31,19 @@ int main(int, char**) {
   });
   test_sequence_prepend_range_move_only<std::forward_list>();
 
-  test_prepend_range_exception_safety_throwing_copy<std::forward_list>();
-  test_prepend_range_exception_safety_throwing_allocator<std::forward_list, int>();
+  if (!TEST_IS_CONSTANT_EVALUATED) {
+    test_prepend_range_exception_safety_throwing_copy<std::forward_list>();
+    test_prepend_range_exception_safety_throwing_allocator<std::forward_list, int>();
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp
index f99c40fa0c1a0..61c5dcac0545e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void push_front(const value_type& v);
+// void push_front(const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -16,7 +16,7 @@
 #include "test_macros.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -44,5 +44,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp
index 467037465eedd..cd24d6ff6af06 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: no-exceptions
 // <forward_list>
 
-// void push_front(const value_type& x);
+// void push_front(const value_type& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp
index d3156c5fdd38a..b30ff7a0189e2 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// void push_front(value_type&& v);
+// void push_front(value_type&& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "MoveOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef MoveOnly T;
     typedef std::forward_list<T> C;
@@ -45,5 +45,14 @@ int main(int, char**) {
     assert(std::distance(c.begin(), c.end()) == 2);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp
index 2dacf458d7d9d..f80886113bf25 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void resize(size_type n);
+// void resize(size_type n); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,8 +18,8 @@
 #include "DefaultOnly.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
-  {
+TEST_CONSTEXPR_CXX26 bool test() {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     typedef DefaultOnly T;
     typedef std::forward_list<T> C;
     C c;
@@ -65,7 +65,7 @@ int main(int, char**) {
     assert(*std::next(c.begin(), 5) == 0);
   }
 #if TEST_STD_VER >= 11
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     typedef DefaultOnly T;
     typedef std::forward_list<T, min_allocator<T>> C;
     C c;
@@ -112,5 +112,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp
index a6af763e6937f..4ec859b36336d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void resize(size_type n, const value_type& v);
+// void resize(size_type n, const value_type& v); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -22,7 +22,7 @@
 #  include "container_test_types.h"
 #endif
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -84,7 +84,7 @@ int main(int, char**) {
     assert(*std::next(c.begin(), 4) == 10);
     assert(*std::next(c.begin(), 5) == 10);
   }
-  {
+  if (!TEST_IS_CONSTANT_EVALUATED) {
     // Test that the allocator's construct method is being used to
     // construct the new elements and that it's called exactly N times.
     typedef std::forward_list<int, ContainerTestAllocator<int, int>> Container;
@@ -99,5 +99,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp
index 9a162789569d3..d8e80c56bf392 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void merge(forward_list& x);
+// void merge(forward_list& x); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -30,11 +30,11 @@ struct value {
   int a;
   int b;
 
-  friend bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; }
-  friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
+  friend TEST_CONSTEXPR bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; }
+  friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   { // Basic merge operation.
     typedef int T;
     typedef std::forward_list<T> C;
@@ -116,5 +116,14 @@ int main(int, char**) {
     assert(c == std::forward_list<int>(std::begin(a), std::end(a)));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp
index 4e1814044808c..0adadb2dd092f 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// template <class Compare> void merge(forward_list& x, Compare comp);
+// template <class Compare> void merge(forward_list& x, Compare comp); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -30,11 +30,11 @@ struct value {
   int a;
   int b;
 
-  friend bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; }
-  friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
+  friend TEST_CONSTEXPR bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; }
+  friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   { // Basic merge operation.
     typedef int T;
     typedef std::forward_list<T> C;
@@ -117,5 +117,14 @@ int main(int, char**) {
     assert(c == std::forward_list<int>(std::begin(a), std::end(a)));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp
index acfa014fe2546..906748ec2702b 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// void merge(forward_list&& x);
+// void merge(forward_list&& x); // constexpr since C++26
 
 #include <forward_list>
 #include <functional>
@@ -29,11 +29,11 @@ struct value {
   int a;
   int b;
 
-  friend bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; }
-  friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
+  friend TEST_CONSTEXPR bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; }
+  friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   { // Basic merge operation.
     typedef int T;
     typedef std::forward_list<T> C;
@@ -109,5 +109,14 @@ int main(int, char**) {
     assert(c == std::forward_list<int>(std::begin(a), std::end(a)));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp
index 41b56ce7a2884..2ced0b1596e4d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp
@@ -10,7 +10,7 @@
 
 // <forward_list>
 
-// template <class Compare> void merge(forward_list&& x, Compare comp);
+// template <class Compare> void merge(forward_list&& x, Compare comp); // constexpr since C++26
 
 #include <forward_list>
 #include <functional>
@@ -29,11 +29,11 @@ struct value {
   int a;
   int b;
 
-  friend bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; }
-  friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
+  friend TEST_CONSTEXPR bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; }
+  friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; }
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   { // Basic merge operation.
     typedef int T;
     typedef std::forward_list<T> C;
@@ -110,5 +110,14 @@ int main(int, char**) {
     assert(c == std::forward_list<int>(std::begin(a), std::end(a)));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp
index ec3bf845dcc5a..b17708ba60ee6 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // void remove(const value_type& v);      // C++17 and before
-// size_type remove(const value_type& v); // C++20 and after
+// size_type remove(const value_type& v); // C++20 and after; // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -19,7 +19,7 @@
 #include "min_allocator.h"
 
 template <class L>
-void do_remove(L& l, const typename L::value_type& value, typename L::size_type expected) {
+TEST_CONSTEXPR_CXX26 void do_remove(L& l, const typename L::value_type& value, typename L::size_type expected) {
   typename L::size_type old_size = std::distance(l.begin(), l.end());
 #if TEST_STD_VER > 17
   ASSERT_SAME_TYPE(decltype(l.remove(value)), typename L::size_type);
@@ -32,22 +32,22 @@ void do_remove(L& l, const typename L::value_type& value, typename L::size_type
 }
 
 struct S {
-  S(int i) : i_(new int(i)) {}
-  S(const S& rhs) : i_(new int(*rhs.i_)) {}
-  S& operator=(const S& rhs) {
+  TEST_CONSTEXPR_CXX20 S(int i) : i_(new int(i)) {}
+  TEST_CONSTEXPR_CXX20 S(const S& rhs) : i_(new int(*rhs.i_)) {}
+  TEST_CONSTEXPR_CXX20 S& operator=(const S& rhs) {
     *i_ = *rhs.i_;
     return *this;
   }
-  ~S() {
+  TEST_CONSTEXPR_CXX20 ~S() {
     delete i_;
     i_ = NULL;
   }
-  bool operator==(const S& rhs) const { return *i_ == *rhs.i_; }
-  int get() const { return *i_; }
+  TEST_CONSTEXPR bool operator==(const S& rhs) const { return *i_ == *rhs.i_; }
+  TEST_CONSTEXPR int get() const { return *i_; }
   int* i_;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -171,5 +171,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp
index c6325baea2590..f26205d03f645 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class Predicate> void      remove_if(Predicate pred); // C++17 and before
-// template <class Predicate> size_type remove_if(Predicate pred); // C++20 and after
+// template <class Predicate> size_type remove_if(Predicate pred); // C++20 and after; constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -22,7 +22,7 @@
 #include "counting_predicates.h"
 
 template <class L, class Predicate>
-void do_remove_if(L& l, Predicate pred, typename L::size_type expected) {
+TEST_CONSTEXPR_CXX26 void do_remove_if(L& l, Predicate pred, typename L::size_type expected) {
   typename L::size_type old_size = std::distance(l.begin(), l.end());
 #if TEST_STD_VER > 17
   ASSERT_SAME_TYPE(decltype(l.remove_if(pred)), typename L::size_type);
@@ -34,18 +34,18 @@ void do_remove_if(L& l, Predicate pred, typename L::size_type expected) {
   assert(old_size - std::distance(l.begin(), l.end()) == expected);
 }
 
-bool g(int i) { return i < 3; }
+TEST_CONSTEXPR bool g(int i) { return i < 3; }
 
 struct PredLWG526 {
-  PredLWG526(int i) : i_(i) {}
-  ~PredLWG526() { i_ = -32767; }
-  bool operator()(const PredLWG526& p) const { return p.i_ == i_; }
+  TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {}
+  TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; }
+  TEST_CONSTEXPR bool operator()(const PredLWG526& p) const { return p.i_ == i_; }
 
-  bool operator==(int i) const { return i == i_; }
+  TEST_CONSTEXPR bool operator==(int i) const { return i == i_; }
   int i_;
 };
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef unary_counting_predicate<bool (*)(T), T> Predicate;
@@ -187,5 +187,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp
index 0d0656897f34e..38f0e74f66323 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void reverse();
+// void reverse(); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -19,7 +19,7 @@
 #include "min_allocator.h"
 
 template <class C>
-void test(int N) {
+TEST_CONSTEXPR_CXX26 void test1(int N) {
   C c;
   for (int i = 0; i < N; ++i)
     c.push_front(i);
@@ -30,12 +30,21 @@ void test(int N) {
     assert(*j == i);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for (int i = 0; i < 10; ++i)
-    test<std::forward_list<int> >(i);
+    test1<std::forward_list<int> >(i);
 #if TEST_STD_VER >= 11
   for (int i = 0; i < 10; ++i)
-    test<std::forward_list<int, min_allocator<int>> >(i);
+    test1<std::forward_list<int, min_allocator<int>> >(i);
+#endif
+
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
 #endif
 
   return 0;
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp
index 4c91d7397adf0..f8787d70784d1 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void splice_after(const_iterator p, forward_list&& x);
+// void splice_after(const_iterator p, forward_list&& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,13 +19,13 @@
 #include "min_allocator.h"
 
 typedef int T;
-const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
-const T t2[]                 = {10, 11, 12, 13, 14, 15};
-const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
-const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
+TEST_CONSTEXPR const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
+TEST_CONSTEXPR const T t2[]                 = {10, 11, 12, 13, 14, 15};
+TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
+TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
 
 template <class C>
-void testd(const C& c, int p, int l) {
+TEST_CONSTEXPR_CXX26 void testd(const C& c, int p, int l) {
   typename C::const_iterator i = c.begin();
   int n1                       = 0;
   for (; n1 < p; ++n1, ++i)
@@ -37,7 +37,7 @@ void testd(const C& c, int p, int l) {
   assert(std::distance(c.begin(), c.end()) == size_t1 + l);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     // splicing different containers
     typedef std::forward_list<T> C;
@@ -67,5 +67,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp
index bb8bdea632547..7202b0e153627 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void splice_after(const_iterator p, forward_list&& x, const_iterator i);
+// void splice_after(const_iterator p, forward_list&& x, const_iterator i); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,13 +19,13 @@
 #include "min_allocator.h"
 
 typedef int T;
-const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
-const T t2[]                 = {10, 11, 12};
-const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
-const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
+TEST_CONSTEXPR const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
+TEST_CONSTEXPR const T t2[]                 = {10, 11, 12};
+TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
+TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
 
 template <class C>
-void testd(const C& c, int p, int f) {
+TEST_CONSTEXPR_CXX26 void testd(const C& c, int p, int f) {
   typename C::const_iterator i = c.begin();
   int n1                       = 0;
   for (; n1 < p; ++n1, ++i)
@@ -38,7 +38,7 @@ void testd(const C& c, int p, int f) {
 }
 
 template <class C>
-void tests(const C& c, int p, int f) {
+TEST_CONSTEXPR_CXX26 void tests(const C& c, int p, int f) {
   typename C::const_iterator i = c.begin();
   int n                        = 0;
   if (p == f || p == f + 1) {
@@ -67,7 +67,7 @@ void tests(const C& c, int p, int f) {
   assert(std::distance(c.begin(), c.end()) == size_t1);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     // splicing different containers
     typedef std::forward_list<T> C;
@@ -117,5 +117,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp
index 99b3ed1c7836b..18da6f12b28da 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp
@@ -8,8 +8,10 @@
 
 // <forward_list>
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=3000000
+
 // void splice_after(const_iterator p, forward_list&& x,
-//                   const_iterator first, const_iterator last);
+//                   const_iterator first, const_iterator last); // constexpr since C++26
 
 #include <stddef.h>
 #include <forward_list>
@@ -20,13 +22,13 @@
 #include "min_allocator.h"
 
 typedef std::ptrdiff_t T;
-const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
-const T t2[]                 = {10, 11, 12, 13, 14, 15};
-const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
-const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
+TEST_CONSTEXPR const T t1[]                 = {0, 1, 2, 3, 4, 5, 6, 7};
+TEST_CONSTEXPR const T t2[]                 = {10, 11, 12, 13, 14, 15};
+TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1);
+TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2);
 
 template <class C>
-void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
+TEST_CONSTEXPR_CXX26 void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
   typename C::const_iterator i = c.begin();
   std::ptrdiff_t n1            = 0;
   for (; n1 < p; ++n1, ++i)
@@ -39,7 +41,7 @@ void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
 }
 
 template <class C>
-void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
+TEST_CONSTEXPR_CXX26 void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
   typename C::const_iterator i = c.begin();
   std::ptrdiff_t n             = 0;
   std::ptrdiff_t d             = l > f + 1 ? l - 1 - f : 0;
@@ -69,7 +71,7 @@ void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) {
   assert(std::distance(c.begin(), c.end()) == size_t1);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     // splicing different containers
     typedef std::forward_list<T> C;
@@ -157,5 +159,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp
index ebd1a79cdb4bc..28efff3849e68 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // void unique();      // C++17 and before
-// size_type unique(); // C++20 and after
+// size_type unique(); // C++20 and after; constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -19,7 +19,7 @@
 #include "min_allocator.h"
 
 template <class L>
-void do_unique(L& l, typename L::size_type expected) {
+TEST_CONSTEXPR_CXX26 void do_unique(L& l, typename L::size_type expected) {
   typename L::size_type old_size = std::distance(l.begin(), l.end());
 #if TEST_STD_VER > 17
   ASSERT_SAME_TYPE(decltype(l.unique()), typename L::size_type);
@@ -31,7 +31,7 @@ void do_unique(L& l, typename L::size_type expected) {
   assert(old_size - std::distance(l.begin(), l.end()) == expected);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -131,5 +131,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp
index 408cbf6ae9c20..f07142dffe9d9 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class BinaryPredicate> void unique(BinaryPredicate binary_pred);      // C++17 and before
-// template <class BinaryPredicate> size_type unique(BinaryPredicate binary_pred); // C++20 and after
+// template <class BinaryPredicate> size_type unique(BinaryPredicate binary_pred); // C++20 and after; constexpr since C++26
 
 #include <cassert>
 #include <forward_list>
@@ -20,7 +20,7 @@
 #include "min_allocator.h"
 
 template <class L, class Predicate>
-void do_unique(L& l, Predicate pred, typename L::size_type expected) {
+TEST_CONSTEXPR_CXX26 void do_unique(L& l, Predicate pred, typename L::size_type expected) {
   typename L::size_type old_size = std::distance(l.begin(), l.end());
 #if TEST_STD_VER > 17
   ASSERT_SAME_TYPE(decltype(l.unique(pred)), typename L::size_type);
@@ -33,17 +33,17 @@ void do_unique(L& l, Predicate pred, typename L::size_type expected) {
 }
 
 struct PredLWG526 {
-  PredLWG526(int i) : i_(i) {}
-  ~PredLWG526() { i_ = -32767; }
-  bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; }
+  TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {}
+  TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; }
+  TEST_CONSTEXPR bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; }
 
-  bool operator==(int i) const { return i == i_; }
+  TEST_CONSTEXPR bool operator==(int i) const { return i == i_; }
   int i_;
 };
 
-bool g(int x, int y) { return x == y; }
+TEST_CONSTEXPR bool g(int x, int y) { return x == y; }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef std::forward_list<T> C;
@@ -157,5 +157,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
index ef6b72ee360a9..cb57b094a077d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp
@@ -10,11 +10,11 @@
 
 // template <class T, class Allocator>
 //     bool operator==(const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 //
 // template <class T, class Allocator>
 //     bool operator!=(const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -25,7 +25,7 @@
 #include "min_allocator.h"
 
 template <class C>
-void test(int N, int M) {
+TEST_CONSTEXPR_CXX26 void test(int N, int M) {
   C c1;
   for (int i = 0; i < N; ++i)
     c1.push_front(i);
@@ -44,7 +44,7 @@ void test(int N, int M) {
   }
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for (int i = 0; i < 10; ++i)
     for (int j = 0; j < 10; ++j)
       test<std::forward_list<int> >(i, j);
@@ -54,5 +54,14 @@ int main(int, char**) {
       test<std::forward_list<int, min_allocator<int>> >(i, j);
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
index e50f9e6e9e473..f4f7c6d1f7e53 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// void swap(forward_list& x);
+// void swap(forward_list& x); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<T> A;
@@ -257,5 +257,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
index cae6950436dee..ce25479781547 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp
@@ -9,7 +9,7 @@
 // <forward_list>
 
 // template <class T, class Allocator>
-//     void swap(forward_list<T, Allocator>& x, forward_list<T, Allocator>& y);
+//     void swap(forward_list<T, Allocator>& x, forward_list<T, Allocator>& y); // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -19,7 +19,7 @@
 #include "test_allocator.h"
 #include "min_allocator.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef int T;
     typedef test_allocator<T> A;
@@ -258,5 +258,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
index d16acadaeb893..7bf80ca026e8e 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp
@@ -10,19 +10,19 @@
 
 // template <class T, class Allocator>
 //     bool operator< (const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 //
 // template <class T, class Allocator>
 //     bool operator> (const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 //
 // template <class T, class Allocator>
 //     bool operator>=(const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 //
 // template <class T, class Allocator>
 //     bool operator<=(const forward_list<T, Allocator>& x,
-//                     const forward_list<T, Allocator>& y);
+//                     const forward_list<T, Allocator>& y); // constexpr since C++26
 
 #include <forward_list>
 #include <iterator>
@@ -33,7 +33,7 @@
 #include "min_allocator.h"
 
 template <class C>
-void test(int N, int M) {
+TEST_CONSTEXPR_CXX26 void test(int N, int M) {
   C c1;
   for (int i = 0; i < N; ++i)
     c1.push_front(i);
@@ -50,7 +50,7 @@ void test(int N, int M) {
     assert(c1 > c2);
 }
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   for (int i = 0; i < 10; ++i)
     for (int j = 0; j < 10; ++j)
       test<std::forward_list<int> >(i, j);
@@ -60,5 +60,14 @@ int main(int, char**) {
       test<std::forward_list<int, min_allocator<int>> >(i, j);
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
index b50e67589471d..02b7b471a1ae8 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
@@ -12,10 +12,10 @@
 
 // void swap(forward_list& c)
 //     noexcept(!allocator_type::propagate_on_container_swap::value ||
-//              __is_nothrow_swappable<allocator_type>::value);
+//              __is_nothrow_swappable<allocator_type>::value);          // constexpr since C++26
 //
 //  In C++17, the standard says that swap shall have:
-//     noexcept(is_always_equal<allocator_type>::value);
+//     noexcept(is_always_equal<allocator_type>::value);                 // constexpr since C++26
 
 // This tests a conforming extension
 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp
index f37f5c2f513bd..624eeb17799c0 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp
@@ -10,7 +10,7 @@
 
 // class forward_list
 
-// allocator_type get_allocator() const
+// allocator_type get_allocator() const // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     std::allocator<int> alloc;
     const std::forward_list<int> fl(alloc);
@@ -30,5 +30,14 @@ int main(int, char**) {
     assert(fl.get_allocator() == alloc);
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp
index b7be03f1062dc..16c6f0b90f96d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp
@@ -8,9 +8,9 @@
 
 // <forward_list>
 
-// forward_list()
-// forward_list::iterator()
-// forward_list::const_iterator()
+// forward_list()                 // constexpr since C++26
+// forward_list::iterator()       // constexpr since C++26
+// forward_list::const_iterator() // constexpr since C++26
 
 #include <forward_list>
 #include <cassert>
@@ -33,7 +33,7 @@ struct B {
 };
 #endif
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     A a;
     assert(a.d.empty());
@@ -49,5 +49,14 @@ int main(int, char**) {
   }
 #endif
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp
index 5ba0d61f104e0..aab53351f00e2 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp
@@ -8,7 +8,7 @@
 
 // <forward_list>
 
-// size_type max_size() const;
+// size_type max_size() const; // constexpr since C++26
 
 #include <cassert>
 #include <forward_list>
@@ -18,7 +18,7 @@
 #include "test_allocator.h"
 #include "test_macros.h"
 
-int main(int, char**) {
+TEST_CONSTEXPR_CXX26 bool test() {
   {
     typedef limited_allocator<int, 10> A;
     typedef std::forward_list<int, A> C;
@@ -42,5 +42,14 @@ int main(int, char**) {
     assert(c.max_size() <= alloc_max_size(c.get_allocator()));
   }
 
+  return true;
+}
+
+int main(int, char**) {
+  assert(test());
+#if TEST_STD_VER >= 26
+  static_assert(test());
+#endif
+
   return 0;
 }
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
index 31b3e900aabcd..05f903dccafe7 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp
@@ -24,6 +24,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -54,6 +58,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -87,6 +95,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -126,6 +138,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should not be defined before c++23"
 #  endif
@@ -171,6 +187,10 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++23"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should be defined in c++23"
 #  endif
@@ -219,6 +239,13 @@
 #    error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++26"
 #  endif
 
+#  ifndef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_forward_list != 202502L
+#    error "__cpp_lib_constexpr_forward_list should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_containers_ranges
 #    error "__cpp_lib_containers_ranges should be defined in c++26"
 #  endif
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index b1cc4afd30696..a13edacd1e46a 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -196,6 +196,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should not be defined before c++20"
 #  endif
@@ -1084,6 +1088,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should not be defined before c++20"
 #  endif
@@ -2074,6 +2082,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifdef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should not be defined before c++20"
 #  endif
@@ -3304,6 +3316,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++20"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should be defined in c++20"
 #  endif
@@ -4756,6 +4772,10 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++23"
 #  endif
 
+#  ifdef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should not be defined before c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should be defined in c++23"
 #  endif
@@ -6427,6 +6447,13 @@
 #    error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++26"
 #  endif
 
+#  ifndef __cpp_lib_constexpr_forward_list
+#    error "__cpp_lib_constexpr_forward_list should be defined in c++26"
+#  endif
+#  if __cpp_lib_constexpr_forward_list != 202502L
+#    error "__cpp_lib_constexpr_forward_list should have the value 202502L in c++26"
+#  endif
+
 #  ifndef __cpp_lib_constexpr_functional
 #    error "__cpp_lib_constexpr_functional should be defined in c++26"
 #  endif
diff --git a/libcxx/test/support/counting_predicates.h b/libcxx/test/support/counting_predicates.h
index 6f34ce76302a8..8fb2db1af70d3 100644
--- a/libcxx/test/support/counting_predicates.h
+++ b/libcxx/test/support/counting_predicates.h
@@ -16,42 +16,44 @@
 template <typename Predicate, typename Arg>
 struct unary_counting_predicate {
 public:
-    typedef Arg argument_type;
-    typedef bool result_type;
+  typedef Arg argument_type;
+  typedef bool result_type;
 
-    unary_counting_predicate(Predicate p) : p_(p), count_(0) {}
-    unary_counting_predicate(const unary_counting_predicate&) = default;
-    unary_counting_predicate& operator=(const unary_counting_predicate&) = default;
-    ~unary_counting_predicate() {}
+  TEST_CONSTEXPR_CXX20 unary_counting_predicate(Predicate p) : p_(p), count_(0) {}
+  unary_counting_predicate(const unary_counting_predicate&)            = default;
+  unary_counting_predicate& operator=(const unary_counting_predicate&) = default;
+  TEST_CONSTEXPR_CXX20 ~unary_counting_predicate() {}
 
-    bool operator () (const Arg &a) const { ++count_; return p_(a); }
-    std::size_t count() const { return count_; }
-    void reset() { count_ = 0; }
+  TEST_CONSTEXPR_CXX14 bool operator()(const Arg& a) const {
+    ++count_;
+    return p_(a);
+  }
+  TEST_CONSTEXPR std::size_t count() const { return count_; }
+  TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; }
 
 private:
-    Predicate p_;
-    mutable std::size_t count_;
+  Predicate p_;
+  mutable std::size_t count_;
 };
 
-
-template <typename Predicate, typename Arg1, typename Arg2=Arg1>
+template <typename Predicate, typename Arg1, typename Arg2 = Arg1>
 struct binary_counting_predicate {
 public:
-    typedef Arg1 first_argument_type;
-    typedef Arg2 second_argument_type;
-    typedef bool result_type;
-
-    TEST_CONSTEXPR binary_counting_predicate(Predicate p) : p_(p), count_(0) {}
-    TEST_CONSTEXPR_CXX14 bool operator()(const Arg1& a1, const Arg2& a2) const {
-      ++count_;
-      return p_(a1, a2);
-    }
-    TEST_CONSTEXPR std::size_t count() const { return count_; }
-    TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; }
-
-  private:
-    Predicate p_;
-    mutable std::size_t count_;
+  typedef Arg1 first_argument_type;
+  typedef Arg2 second_argument_type;
+  typedef bool result_type;
+
+  TEST_CONSTEXPR binary_counting_predicate(Predicate p) : p_(p), count_(0) {}
+  TEST_CONSTEXPR_CXX14 bool operator()(const Arg1& a1, const Arg2& a2) const {
+    ++count_;
+    return p_(a1, a2);
+  }
+  TEST_CONSTEXPR std::size_t count() const { return count_; }
+  TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; }
+
+private:
+  Predicate p_;
+  mutable std::size_t count_;
 };
 
 #if TEST_STD_VER > 14
@@ -66,13 +68,13 @@ class counting_predicate {
   constexpr counting_predicate(Predicate pred, int& count) : pred_(std::move(pred)), count_(&count) {}
 
   template <class... Args>
-  constexpr decltype(auto) operator()(Args&& ...args) {
+  constexpr decltype(auto) operator()(Args&&... args) {
     ++(*count_);
     return pred_(std::forward<Args>(args)...);
   }
 
   template <class... Args>
-  constexpr decltype(auto) operator()(Args&& ...args) const {
+  constexpr decltype(auto) operator()(Args&&... args) const {
     ++(*count_);
     return pred_(std::forward<Args>(args)...);
   }
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
old mode 100755
new mode 100644
index 82f0d09db5c36..b59c7fdaf0a3d
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -357,6 +357,11 @@ def add_version_header(tc):
             "values": {"c++20": 201907},
             "headers": ["memory"],
         },
+        {
+            "name": "__cpp_lib_constexpr_forward_list",
+            "values": {"c++26": 202502},
+            "headers": ["forward_list"],
+        },
         {
             "name": "__cpp_lib_constexpr_functional",
             "values": {"c++20": 201907},

From 5188bea9afac859fa6523e07d98748527c295aaf Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Wed, 11 Jun 2025 09:18:55 -0700
Subject: [PATCH 098/851] [llvm] annotate interfaces in llvm/TargetParser for
 DLL export (#143616)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/TargetParser`
library. These annotations currently have no meaningful impact on the
LLVM build; however, they are a prerequisite to support an LLVM Windows
DLL (shared library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

Most of these changes were generated automatically using the [Interface
Definition Scanner (IDS)](https://github.com/compnerd/ids) tool,
followed formatting with `git clang-format`.

Additionally, I manually removed the redundant declaration of
`getCanonicalArchName` from
llvm/include/llvm/TargetParser/ARMTargetParser.h because IDS only
auto-annotates the first declaration it encounters, and the second
un-annotated declaration results in an MSVC warning.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 .../llvm/TargetParser/AArch64TargetParser.h   |  60 +++++----
 .../llvm/TargetParser/ARMTargetParser.h       |  75 +++++------
 .../llvm/TargetParser/ARMTargetParserCommon.h |  13 +-
 .../llvm/TargetParser/CSKYTargetParser.h      |  30 +++--
 llvm/include/llvm/TargetParser/Host.h         |  25 ++--
 .../llvm/TargetParser/LoongArchTargetParser.h |  13 +-
 .../llvm/TargetParser/PPCTargetParser.h       |  15 ++-
 llvm/include/llvm/TargetParser/RISCVISAInfo.h |  42 +++---
 .../llvm/TargetParser/RISCVTargetParser.h     |  42 +++---
 .../llvm/TargetParser/SubtargetFeature.h      |  17 +--
 llvm/include/llvm/TargetParser/TargetParser.h |  29 +++--
 llvm/include/llvm/TargetParser/Triple.h       | 121 +++++++++---------
 .../llvm/TargetParser/X86TargetParser.h       |  35 ++---
 13 files changed, 274 insertions(+), 243 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 0338770593bc4..59e8117ccb730 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
@@ -79,7 +80,7 @@ struct FMVInfo {
       : Name(Name), FeatureBit(FeatureBit), PriorityBit(PriorityBit), ID(ID) {};
 };
 
-const std::vector<FMVInfo> &getFMVInfo();
+LLVM_ABI const std::vector<FMVInfo> &getFMVInfo();
 
 // Represents a dependency between two architecture extensions. Later is the
 // feature which was added to the architecture after Earlier, and expands the
@@ -146,7 +147,7 @@ struct ArchInfo {
   StringRef getSubArch() const { return ArchFeature.substr(1); }
 
   // Search for ArchInfo by SubArch name
-  static std::optional<ArchInfo> findBySubArch(StringRef SubArch);
+  LLVM_ABI static std::optional<ArchInfo> findBySubArch(StringRef SubArch);
 };
 
 #define EMIT_ARCHITECTURES
@@ -182,34 +183,36 @@ struct ExtensionSet {
   // Enable the given architecture extension, and any other extensions it
   // depends on. Does not change the base architecture, or follow dependencies
   // between features which are only related by required arcitecture versions.
-  void enable(ArchExtKind E);
+  LLVM_ABI void enable(ArchExtKind E);
 
   // Disable the given architecture extension, and any other extensions which
   // depend on it. Does not change the base architecture, or follow
   // dependencies between features which are only related by required
   // arcitecture versions.
-  void disable(ArchExtKind E);
+  LLVM_ABI void disable(ArchExtKind E);
 
   // Add default extensions for the given CPU. Records the base architecture,
   // to later resolve dependencies which depend on it.
-  void addCPUDefaults(const CpuInfo &CPU);
+  LLVM_ABI void addCPUDefaults(const CpuInfo &CPU);
 
   // Add default extensions for the given architecture version. Records the
   // base architecture, to later resolve dependencies which depend on it.
-  void addArchDefaults(const ArchInfo &Arch);
+  LLVM_ABI void addArchDefaults(const ArchInfo &Arch);
 
   // Add or remove a feature based on a modifier string. The string must be of
   // the form "<name>" to enable a feature or "no<name>" to disable it. This
   // will also enable or disable any features as required by the dependencies
   // between them.
-  bool parseModifier(StringRef Modifier, const bool AllowNoDashForm = false);
+  LLVM_ABI bool parseModifier(StringRef Modifier,
+                              const bool AllowNoDashForm = false);
 
   // Constructs a new ExtensionSet by toggling the corresponding bits for every
   // feature in the \p Features list without expanding their dependencies. Used
   // for reconstructing an ExtensionSet from the output of toLLVMFeatures().
   // Features that are not recognized are pushed back to \p NonExtensions.
-  void reconstructFromParsedFeatures(const std::vector<std::string> &Features,
-                                     std::vector<std::string> &NonExtensions);
+  LLVM_ABI void
+  reconstructFromParsedFeatures(const std::vector<std::string> &Features,
+                                std::vector<std::string> &NonExtensions);
 
   // Convert the set of enabled extension to an LLVM feature list, appending
   // them to Features.
@@ -227,7 +230,7 @@ struct ExtensionSet {
     }
   }
 
-  void dump() const;
+  LLVM_ABI void dump() const;
 };
 
 // Name alias.
@@ -239,52 +242,53 @@ struct Alias {
 #define EMIT_CPU_ALIAS
 #include "llvm/TargetParser/AArch64TargetParserDef.inc"
 
-const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID));
+LLVM_ABI const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID));
 
-bool getExtensionFeatures(
-    const AArch64::ExtensionBitset &Extensions,
-    std::vector<StringRef> &Features);
+LLVM_ABI bool getExtensionFeatures(const AArch64::ExtensionBitset &Extensions,
+                                   std::vector<StringRef> &Features);
 
-StringRef getArchExtFeature(StringRef ArchExt);
-StringRef resolveCPUAlias(StringRef CPU);
+LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt);
+LLVM_ABI StringRef resolveCPUAlias(StringRef CPU);
 
 // Information by Name
-const ArchInfo *getArchForCpu(StringRef CPU);
+LLVM_ABI const ArchInfo *getArchForCpu(StringRef CPU);
 
 // Parser
-const ArchInfo *parseArch(StringRef Arch);
+LLVM_ABI const ArchInfo *parseArch(StringRef Arch);
 
 // Return the extension which has the given -target-feature name.
-std::optional<ExtensionInfo> targetFeatureToExtension(StringRef TargetFeature);
+LLVM_ABI std::optional<ExtensionInfo>
+targetFeatureToExtension(StringRef TargetFeature);
 
 // Parse a name as defined by the Extension class in tablegen.
-std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
+LLVM_ABI std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
 
 // Parse a name as defined by the FMVInfo class in tablegen.
-std::optional<FMVInfo> parseFMVExtension(StringRef Extension);
+LLVM_ABI std::optional<FMVInfo> parseFMVExtension(StringRef Extension);
 
 // Given the name of a CPU or alias, return the correponding CpuInfo.
-std::optional<CpuInfo> parseCpu(StringRef Name);
+LLVM_ABI std::optional<CpuInfo> parseCpu(StringRef Name);
 // Used by target parser tests
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
 
-bool isX18ReservedByDefault(const Triple &TT);
+LLVM_ABI bool isX18ReservedByDefault(const Triple &TT);
 
 // For a given set of feature names, which can be either target-features, or
 // fmv-features metadata, expand their dependencies and then return a bitmask
 // corresponding to the entries of AArch64::FeatPriorities.
-uint64_t getFMVPriority(ArrayRef<StringRef> Features);
+LLVM_ABI uint64_t getFMVPriority(ArrayRef<StringRef> Features);
 
 // For a given set of FMV feature names, expand their dependencies and then
 // return a bitmask corresponding to the entries of AArch64::CPUFeatures.
 // The values in CPUFeatures are not bitmasks themselves, they are sequential
 // (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether
 // a certain FMV feature is available on the host.
-uint64_t getCpuSupportsMask(ArrayRef<StringRef> Features);
+LLVM_ABI uint64_t getCpuSupportsMask(ArrayRef<StringRef> Features);
 
-void PrintSupportedExtensions();
+LLVM_ABI void PrintSupportedExtensions();
 
-void printEnabledExtensions(const std::set<StringRef> &EnabledFeatureNames);
+LLVM_ABI void
+printEnabledExtensions(const std::set<StringRef> &EnabledFeatureNames);
 
 } // namespace AArch64
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.h b/llvm/include/llvm/TargetParser/ARMTargetParser.h
index b2403f42f1b79..798c578ced938 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParser.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParser.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/ARMTargetParserCommon.h"
 #include <vector>
 
@@ -223,53 +224,55 @@ inline ArchKind &operator--(ArchKind &Kind) {
 }
 
 // Information by ID
-StringRef getFPUName(FPUKind FPUKind);
-FPUVersion getFPUVersion(FPUKind FPUKind);
-NeonSupportLevel getFPUNeonSupportLevel(FPUKind FPUKind);
-FPURestriction getFPURestriction(FPUKind FPUKind);
-
-bool getFPUFeatures(FPUKind FPUKind, std::vector<StringRef> &Features);
-bool getHWDivFeatures(uint64_t HWDivKind, std::vector<StringRef> &Features);
-bool getExtensionFeatures(uint64_t Extensions,
-                          std::vector<StringRef> &Features);
-
-StringRef getArchName(ArchKind AK);
-unsigned getArchAttr(ArchKind AK);
-StringRef getCPUAttr(ArchKind AK);
-StringRef getSubArch(ArchKind AK);
-StringRef getArchExtName(uint64_t ArchExtKind);
-StringRef getArchExtFeature(StringRef ArchExt);
-bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, StringRef ArchExt,
-                           std::vector<StringRef> &Features,
-                           FPUKind &ArgFPUKind);
-ArchKind convertV9toV8(ArchKind AK);
+LLVM_ABI StringRef getFPUName(FPUKind FPUKind);
+LLVM_ABI FPUVersion getFPUVersion(FPUKind FPUKind);
+LLVM_ABI NeonSupportLevel getFPUNeonSupportLevel(FPUKind FPUKind);
+LLVM_ABI FPURestriction getFPURestriction(FPUKind FPUKind);
+
+LLVM_ABI bool getFPUFeatures(FPUKind FPUKind, std::vector<StringRef> &Features);
+LLVM_ABI bool getHWDivFeatures(uint64_t HWDivKind,
+                               std::vector<StringRef> &Features);
+LLVM_ABI bool getExtensionFeatures(uint64_t Extensions,
+                                   std::vector<StringRef> &Features);
+
+LLVM_ABI StringRef getArchName(ArchKind AK);
+LLVM_ABI unsigned getArchAttr(ArchKind AK);
+LLVM_ABI StringRef getCPUAttr(ArchKind AK);
+LLVM_ABI StringRef getSubArch(ArchKind AK);
+LLVM_ABI StringRef getArchExtName(uint64_t ArchExtKind);
+LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt);
+LLVM_ABI bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK,
+                                    StringRef ArchExt,
+                                    std::vector<StringRef> &Features,
+                                    FPUKind &ArgFPUKind);
+LLVM_ABI ArchKind convertV9toV8(ArchKind AK);
 
 // Information by Name
-FPUKind getDefaultFPU(StringRef CPU, ArchKind AK);
-uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK);
-StringRef getDefaultCPU(StringRef Arch);
-StringRef getCanonicalArchName(StringRef Arch);
-StringRef getFPUSynonym(StringRef FPU);
+LLVM_ABI FPUKind getDefaultFPU(StringRef CPU, ArchKind AK);
+LLVM_ABI uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK);
+LLVM_ABI StringRef getDefaultCPU(StringRef Arch);
+LLVM_ABI StringRef getFPUSynonym(StringRef FPU);
 
 // Parser
-uint64_t parseHWDiv(StringRef HWDiv);
-FPUKind parseFPU(StringRef FPU);
-ArchKind parseArch(StringRef Arch);
-uint64_t parseArchExt(StringRef ArchExt);
-ArchKind parseCPUArch(StringRef CPU);
-ProfileKind parseArchProfile(StringRef Arch);
-unsigned parseArchVersion(StringRef Arch);
+LLVM_ABI uint64_t parseHWDiv(StringRef HWDiv);
+LLVM_ABI FPUKind parseFPU(StringRef FPU);
+LLVM_ABI ArchKind parseArch(StringRef Arch);
+LLVM_ABI uint64_t parseArchExt(StringRef ArchExt);
+LLVM_ABI ArchKind parseCPUArch(StringRef CPU);
+LLVM_ABI ProfileKind parseArchProfile(StringRef Arch);
+LLVM_ABI unsigned parseArchVersion(StringRef Arch);
 
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
-StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU);
 
 /// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting.
 ///
 /// \param Arch the architecture name (e.g., "armv7s"). If it is an empty
 /// string then the triple's arch name is used.
-StringRef getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch = {});
+LLVM_ABI StringRef getARMCPUForArch(const llvm::Triple &Triple,
+                                    StringRef MArch = {});
 
-void PrintSupportedExtensions(StringMap<StringRef> DescMap);
+LLVM_ABI void PrintSupportedExtensions(StringMap<StringRef> DescMap);
 
 } // namespace ARM
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
index f6115718e9f5f..7c8030dd5576a 100644
--- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
+++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h
@@ -14,6 +14,7 @@
 #define LLVM_TARGETPARSER_ARMTARGETPARSERCOMMON_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
 namespace ARM {
@@ -23,19 +24,19 @@ enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 };
 enum class EndianKind { INVALID = 0, LITTLE, BIG };
 
 /// Converts e.g. "armv8" -> "armv8-a"
-StringRef getArchSynonym(StringRef Arch);
+LLVM_ABI StringRef getArchSynonym(StringRef Arch);
 
 /// MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but
 /// (iwmmxt|xscale)(eb)? is also permitted. If the former, return
 /// "v.+", if the latter, return unmodified string, minus 'eb'.
 /// If invalid, return empty string.
-StringRef getCanonicalArchName(StringRef Arch);
+LLVM_ABI StringRef getCanonicalArchName(StringRef Arch);
 
 // ARM, Thumb, AArch64
-ISAKind parseArchISA(StringRef Arch);
+LLVM_ABI ISAKind parseArchISA(StringRef Arch);
 
 // Little/Big endian
-EndianKind parseArchEndian(StringRef Arch);
+LLVM_ABI EndianKind parseArchEndian(StringRef Arch);
 
 struct ParsedBranchProtection {
   StringRef Scope;
@@ -45,8 +46,8 @@ struct ParsedBranchProtection {
   bool GuardedControlStack;
 };
 
-bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
-                           StringRef &Err, bool EnablePAuthLR = false);
+LLVM_ABI bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP,
+                                    StringRef &Err, bool EnablePAuthLR = false);
 
 } // namespace ARM
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/CSKYTargetParser.h b/llvm/include/llvm/TargetParser/CSKYTargetParser.h
index 4c4ec06f758a8..8eab03ca01490 100644
--- a/llvm/include/llvm/TargetParser/CSKYTargetParser.h
+++ b/llvm/include/llvm/TargetParser/CSKYTargetParser.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_TARGETPARSER_CSKYTARGETPARSER_H
 #define LLVM_TARGETPARSER_CSKYTARGETPARSER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 #include <vector>
 
@@ -176,25 +177,26 @@ const ArchNames<CSKY::ArchKind> ARCHNames[] = {
 #include "llvm/TargetParser/CSKYTargetParser.def"
 };
 
-StringRef getArchName(ArchKind AK);
-StringRef getDefaultCPU(StringRef Arch);
-StringRef getArchExtName(uint64_t ArchExtKind);
-StringRef getArchExtFeature(StringRef ArchExt);
-uint64_t getDefaultExtensions(StringRef CPU);
-bool getExtensionFeatures(uint64_t Extensions,
-                          std::vector<StringRef> &Features);
+LLVM_ABI StringRef getArchName(ArchKind AK);
+LLVM_ABI StringRef getDefaultCPU(StringRef Arch);
+LLVM_ABI StringRef getArchExtName(uint64_t ArchExtKind);
+LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt);
+LLVM_ABI uint64_t getDefaultExtensions(StringRef CPU);
+LLVM_ABI bool getExtensionFeatures(uint64_t Extensions,
+                                   std::vector<StringRef> &Features);
 
 // Information by ID
-StringRef getFPUName(unsigned FPUKind);
-FPUVersion getFPUVersion(unsigned FPUKind);
+LLVM_ABI StringRef getFPUName(unsigned FPUKind);
+LLVM_ABI FPUVersion getFPUVersion(unsigned FPUKind);
 
-bool getFPUFeatures(CSKYFPUKind Kind, std::vector<StringRef> &Features);
+LLVM_ABI bool getFPUFeatures(CSKYFPUKind Kind,
+                             std::vector<StringRef> &Features);
 
 // Parser
-ArchKind parseArch(StringRef Arch);
-ArchKind parseCPUArch(StringRef CPU);
-uint64_t parseArchExt(StringRef ArchExt);
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI ArchKind parseArch(StringRef Arch);
+LLVM_ABI ArchKind parseCPUArch(StringRef CPU);
+LLVM_ABI uint64_t parseArchExt(StringRef ArchExt);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values);
 
 } // namespace CSKY
 
diff --git a/llvm/include/llvm/TargetParser/Host.h b/llvm/include/llvm/TargetParser/Host.h
index 443f4f583b559..be3d41e022ad9 100644
--- a/llvm/include/llvm/TargetParser/Host.h
+++ b/llvm/include/llvm/TargetParser/Host.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TARGETPARSER_HOST_H
 #define LLVM_TARGETPARSER_HOST_H
 
+#include "llvm/Support/Compiler.h"
 #include <string>
 
 namespace llvm {
@@ -30,18 +31,18 @@ namespace sys {
 ///   CPU_TYPE-VENDOR-OPERATING_SYSTEM
 /// or
 ///   CPU_TYPE-VENDOR-KERNEL-OPERATING_SYSTEM
-std::string getDefaultTargetTriple();
+LLVM_ABI std::string getDefaultTargetTriple();
 
 /// getProcessTriple() - Return an appropriate target triple for generating
 /// code to be loaded into the current process, e.g. when using the JIT.
-std::string getProcessTriple();
+LLVM_ABI std::string getProcessTriple();
 
 /// getHostCPUName - Get the LLVM name for the host CPU. The particular format
 /// of the name is target dependent, and suitable for passing as -mcpu to the
 /// target which matches the host.
 ///
 /// \return - The host CPU name, or empty if the CPU could not be determined.
-StringRef getHostCPUName();
+LLVM_ABI StringRef getHostCPUName();
 
 /// getHostCPUFeatures - Get the LLVM names for the host CPU features.
 /// The particular format of the names are target dependent, and suitable for
@@ -52,20 +53,20 @@ StringRef getHostCPUName();
 /// which features may appear in this map, except that they are all valid LLVM
 /// feature names. The map can be empty, for example if feature detection
 /// fails.
-const StringMap<bool, MallocAllocator> getHostCPUFeatures();
+LLVM_ABI const StringMap<bool, MallocAllocator> getHostCPUFeatures();
 
 /// This is a function compatible with cl::AddExtraVersionPrinter, which adds
 /// info about the current target triple and detected CPU.
-void printDefaultTargetAndDetectedCPU(raw_ostream &OS);
+LLVM_ABI void printDefaultTargetAndDetectedCPU(raw_ostream &OS);
 
 namespace detail {
 /// Helper functions to extract HostCPUName from /proc/cpuinfo on linux.
-StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent);
-StringRef getHostCPUNameForBPF();
+LLVM_ABI StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent);
+LLVM_ABI StringRef getHostCPUNameForBPF();
 
 /// Helper functions to extract CPU details from CPUID on x86.
 namespace x86 {
@@ -78,7 +79,7 @@ enum class VendorSignatures {
 /// Returns the host CPU's vendor.
 /// MaxLeaf: if a non-nullptr pointer is specified, the EAX value will be
 /// assigned to its pointee.
-VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr);
+LLVM_ABI VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr);
 } // namespace x86
 } // namespace detail
 } // namespace sys
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
index a28e4e9eff811..1357d74744592 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H
 #define LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 #include <vector>
 
@@ -84,12 +85,12 @@ struct ArchInfo {
   uint32_t Features;
 };
 
-bool isValidArchName(StringRef Arch);
-bool isValidFeatureName(StringRef Feature);
-bool getArchFeatures(StringRef Arch, std::vector<StringRef> &Features);
-bool isValidCPUName(StringRef TuneCPU);
-void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
-StringRef getDefaultArch(bool Is64Bit);
+LLVM_ABI bool isValidArchName(StringRef Arch);
+LLVM_ABI bool isValidFeatureName(StringRef Feature);
+LLVM_ABI bool getArchFeatures(StringRef Arch, std::vector<StringRef> &Features);
+LLVM_ABI bool isValidCPUName(StringRef TuneCPU);
+LLVM_ABI void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI StringRef getDefaultArch(bool Is64Bit);
 
 } // namespace LoongArch
 
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
index 5f9fe543aff0b..59d9f867005a4 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.h
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -15,25 +15,28 @@
 #define LLVM_TARGETPARSER_PPCTARGETPARSER_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 
 namespace llvm {
 namespace PPC {
-bool isValidCPU(StringRef CPU);
-void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
-void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI bool isValidCPU(StringRef CPU);
+LLVM_ABI void fillValidCPUList(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values);
 
 // Get target CPU name.
 // If CPUName is empty or generic, return the default CPU name.
 // If CPUName is not empty or generic, return the normalized CPU name.
-StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName = "");
+LLVM_ABI StringRef getNormalizedPPCTargetCPU(const Triple &T,
+                                             StringRef CPUName = "");
 
 // Get the tune CPU name.
-StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName = "");
+LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T,
+                                           StringRef CPUName = "");
 
 // For PPC, there are some cpu names for same CPU, like pwr10 and power10,
 // normalize them.
-StringRef normalizeCPUName(StringRef CPUName);
+LLVM_ABI StringRef normalizeCPUName(StringRef CPUName);
 } // namespace PPC
 } // namespace llvm
 
diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
index 5b2b6f29fd3db..0c308cadba790 100644
--- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h
+++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/RISCVISAUtils.h"
 
@@ -31,27 +32,27 @@ class RISCVISAInfo {
   /// extensions with unrecognised versions will be silently dropped, except
   /// for the special case of the base 'i' and 'e' extensions, where the
   /// default version will be used (as ignoring the base is not possible).
-  static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+  LLVM_ABI static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
   parseArchString(StringRef Arch, bool EnableExperimentalExtension,
                   bool ExperimentalExtensionVersionCheck = true);
 
   /// Parse RISC-V ISA info from an arch string that is already in normalized
   /// form (as defined in the psABI). Unlike parseArchString, this function
   /// will not error for unrecognized extension names or extension versions.
-  static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+  LLVM_ABI static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
   parseNormalizedArchString(StringRef Arch);
 
   /// Parse RISC-V ISA info from feature vector.
-  static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+  LLVM_ABI static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
   parseFeatures(unsigned XLen, const std::vector<std::string> &Features);
 
-  static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
+  LLVM_ABI static llvm::Expected<std::unique_ptr<RISCVISAInfo>>
   createFromExtMap(unsigned XLen,
                    const RISCVISAUtils::OrderedExtensionMap &Exts);
 
   /// Convert RISC-V ISA info to a feature vector.
-  std::vector<std::string> toFeatures(bool AddAllExtensions = false,
-                                      bool IgnoreUnknown = true) const;
+  LLVM_ABI std::vector<std::string> toFeatures(bool AddAllExtensions = false,
+                                               bool IgnoreUnknown = true) const;
 
   const RISCVISAUtils::OrderedExtensionMap &getExtensions() const {
     return Exts;
@@ -64,25 +65,26 @@ class RISCVISAInfo {
   unsigned getMaxELen() const { return MaxELen; }
   unsigned getMaxELenFp() const { return MaxELenFp; }
 
-  bool hasExtension(StringRef Ext) const;
-  std::string toString() const;
-  StringRef computeDefaultABI() const;
+  LLVM_ABI bool hasExtension(StringRef Ext) const;
+  LLVM_ABI std::string toString() const;
+  LLVM_ABI StringRef computeDefaultABI() const;
 
-  static bool isSupportedExtensionFeature(StringRef Ext);
-  static bool isSupportedExtension(StringRef Ext);
-  static bool isSupportedExtensionWithVersion(StringRef Ext);
-  static bool isSupportedExtension(StringRef Ext, unsigned MajorVersion,
-                                   unsigned MinorVersion);
-  static std::string getTargetFeatureForExtension(StringRef Ext);
+  LLVM_ABI static bool isSupportedExtensionFeature(StringRef Ext);
+  LLVM_ABI static bool isSupportedExtension(StringRef Ext);
+  LLVM_ABI static bool isSupportedExtensionWithVersion(StringRef Ext);
+  LLVM_ABI static bool isSupportedExtension(StringRef Ext,
+                                            unsigned MajorVersion,
+                                            unsigned MinorVersion);
+  LLVM_ABI static std::string getTargetFeatureForExtension(StringRef Ext);
 
-  static void printSupportedExtensions(StringMap<StringRef> &DescMap);
-  static void printEnabledExtensions(bool IsRV64,
-                                     std::set<StringRef> &EnabledFeatureNames,
-                                     StringMap<StringRef> &DescMap);
+  LLVM_ABI static void printSupportedExtensions(StringMap<StringRef> &DescMap);
+  LLVM_ABI static void
+  printEnabledExtensions(bool IsRV64, std::set<StringRef> &EnabledFeatureNames,
+                         StringMap<StringRef> &DescMap);
 
   /// Return the group id and bit position of __riscv_feature_bits.  Returns
   /// <-1, -1> if not supported.
-  static std::pair<int, int> getRISCVFeaturesBitsInfo(StringRef Ext);
+  LLVM_ABI static std::pair<int, int> getRISCVFeaturesBitsInfo(StringRef Ext);
 
   // The maximum value of the group ID obtained from getRISCVFeaturesBitsInfo.
   static constexpr unsigned FeatureBitSize = 2;
diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index a529479b546d9..41fdab6012aa0 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -15,6 +15,7 @@
 #define LLVM_TARGETPARSER_RISCVTARGETPARSER_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -43,18 +44,20 @@ struct CPUInfo {
 static constexpr unsigned RVVBitsPerBlock = 64;
 static constexpr unsigned RVVBytesPerBlock = RVVBitsPerBlock / 8;
 
-void getFeaturesForCPU(StringRef CPU,
-                       SmallVectorImpl<std::string> &EnabledFeatures,
-                       bool NeedPlus = false);
-bool parseCPU(StringRef CPU, bool IsRV64);
-bool parseTuneCPU(StringRef CPU, bool IsRV64);
-StringRef getMArchFromMcpu(StringRef CPU);
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values, bool IsRV64);
-void fillValidTuneCPUArchList(SmallVectorImpl<StringRef> &Values, bool IsRV64);
-bool hasFastScalarUnalignedAccess(StringRef CPU);
-bool hasFastVectorUnalignedAccess(StringRef CPU);
-bool hasValidCPUModel(StringRef CPU);
-CPUModel getCPUModel(StringRef CPU);
+LLVM_ABI void getFeaturesForCPU(StringRef CPU,
+                                SmallVectorImpl<std::string> &EnabledFeatures,
+                                bool NeedPlus = false);
+LLVM_ABI bool parseCPU(StringRef CPU, bool IsRV64);
+LLVM_ABI bool parseTuneCPU(StringRef CPU, bool IsRV64);
+LLVM_ABI StringRef getMArchFromMcpu(StringRef CPU);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values,
+                                   bool IsRV64);
+LLVM_ABI void fillValidTuneCPUArchList(SmallVectorImpl<StringRef> &Values,
+                                       bool IsRV64);
+LLVM_ABI bool hasFastScalarUnalignedAccess(StringRef CPU);
+LLVM_ABI bool hasFastVectorUnalignedAccess(StringRef CPU);
+LLVM_ABI bool hasValidCPUModel(StringRef CPU);
+LLVM_ABI CPUModel getCPUModel(StringRef CPU);
 
 } // namespace RISCV
 
@@ -86,10 +89,10 @@ inline static bool isValidLMUL(unsigned LMUL, bool Fractional) {
   return isPowerOf2_32(LMUL) && LMUL <= 8 && (!Fractional || LMUL != 1);
 }
 
-unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic,
-                     bool MaskAgnostic);
+LLVM_ABI unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic,
+                              bool MaskAgnostic);
 
-unsigned encodeXSfmmVType(unsigned SEW, unsigned Widen, bool AltFmt);
+LLVM_ABI unsigned encodeXSfmmVType(unsigned SEW, unsigned Widen, bool AltFmt);
 
 inline static VLMUL getVLMUL(unsigned VType) {
   unsigned VLMul = VType & 0x7;
@@ -97,7 +100,7 @@ inline static VLMUL getVLMUL(unsigned VType) {
 }
 
 // Decode VLMUL into 1,2,4,8 and fractional indicator.
-std::pair<unsigned, bool> decodeVLMUL(VLMUL VLMul);
+LLVM_ABI std::pair<unsigned, bool> decodeVLMUL(VLMUL VLMul);
 
 inline static VLMUL encodeLMUL(unsigned LMUL, bool Fractional) {
   assert(isValidLMUL(LMUL, Fractional) && "Unsupported LMUL");
@@ -148,11 +151,12 @@ inline static bool isMaskAgnostic(unsigned VType) { return VType & 0x80; }
 
 inline static bool isAltFmt(unsigned VType) { return VType & 0x100; }
 
-void printVType(unsigned VType, raw_ostream &OS);
+LLVM_ABI void printVType(unsigned VType, raw_ostream &OS);
 
-unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul);
+LLVM_ABI unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul);
 
-std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMUL, unsigned EEW);
+LLVM_ABI std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMUL,
+                                               unsigned EEW);
 } // namespace RISCVVType
 
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/SubtargetFeature.h b/llvm/include/llvm/TargetParser/SubtargetFeature.h
index 2e1f00dad2df3..6f1723dec5d04 100644
--- a/llvm/include/llvm/TargetParser/SubtargetFeature.h
+++ b/llvm/include/llvm/TargetParser/SubtargetFeature.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include <array>
 #include <initializer_list>
@@ -175,27 +176,27 @@ class SubtargetFeatures {
   std::vector<std::string> Features;    ///< Subtarget features as a vector
 
 public:
-  explicit SubtargetFeatures(StringRef Initial = "");
+  LLVM_ABI explicit SubtargetFeatures(StringRef Initial = "");
 
   /// Returns features as a string.
-  std::string getString() const;
+  LLVM_ABI std::string getString() const;
 
   /// Adds Features.
-  void AddFeature(StringRef String, bool Enable = true);
+  LLVM_ABI void AddFeature(StringRef String, bool Enable = true);
 
-  void addFeaturesVector(const ArrayRef<std::string> OtherFeatures);
+  LLVM_ABI void addFeaturesVector(const ArrayRef<std::string> OtherFeatures);
 
   /// Returns the vector of individual subtarget features.
   const std::vector<std::string> &getFeatures() const { return Features; }
 
   /// Prints feature string.
-  void print(raw_ostream &OS) const;
+  LLVM_ABI void print(raw_ostream &OS) const;
 
   // Dumps feature info.
-  void dump() const;
+  LLVM_ABI void dump() const;
 
   /// Adds the default features for the specified target triple.
-  void getDefaultSubtargetFeatures(const Triple& Triple);
+  LLVM_ABI void getDefaultSubtargetFeatures(const Triple &Triple);
 
   /// Determine if a feature has a flag; '+' or '-'
   static bool hasFlag(StringRef Feature) {
@@ -221,7 +222,7 @@ class SubtargetFeatures {
   }
 
   /// Splits a string of comma separated items in to a vector of strings.
-  static void Split(std::vector<std::string> &V, StringRef S);
+  LLVM_ABI static void Split(std::vector<std::string> &V, StringRef S);
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index f776b41f3d7ca..176205e17ae00 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
 
@@ -164,27 +165,27 @@ enum FeatureError : uint32_t {
   UNSUPPORTED_TARGET_FEATURE
 };
 
-StringRef getArchFamilyNameAMDGCN(GPUKind AK);
+LLVM_ABI StringRef getArchFamilyNameAMDGCN(GPUKind AK);
 
-StringRef getArchNameAMDGCN(GPUKind AK);
-StringRef getArchNameR600(GPUKind AK);
-StringRef getCanonicalArchName(const Triple &T, StringRef Arch);
-GPUKind parseArchAMDGCN(StringRef CPU);
-GPUKind parseArchR600(StringRef CPU);
-unsigned getArchAttrAMDGCN(GPUKind AK);
-unsigned getArchAttrR600(GPUKind AK);
+LLVM_ABI StringRef getArchNameAMDGCN(GPUKind AK);
+LLVM_ABI StringRef getArchNameR600(GPUKind AK);
+LLVM_ABI StringRef getCanonicalArchName(const Triple &T, StringRef Arch);
+LLVM_ABI GPUKind parseArchAMDGCN(StringRef CPU);
+LLVM_ABI GPUKind parseArchR600(StringRef CPU);
+LLVM_ABI unsigned getArchAttrAMDGCN(GPUKind AK);
+LLVM_ABI unsigned getArchAttrR600(GPUKind AK);
 
-void fillValidArchListAMDGCN(SmallVectorImpl<StringRef> &Values);
-void fillValidArchListR600(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI void fillValidArchListAMDGCN(SmallVectorImpl<StringRef> &Values);
+LLVM_ABI void fillValidArchListR600(SmallVectorImpl<StringRef> &Values);
 
-IsaVersion getIsaVersion(StringRef GPU);
+LLVM_ABI IsaVersion getIsaVersion(StringRef GPU);
 
 /// Fills Features map with default values for given target GPU
-void fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
-                          StringMap<bool> &Features);
+LLVM_ABI void fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
+                                   StringMap<bool> &Features);
 
 /// Inserts wave size feature for given GPU into features map
-std::pair<FeatureError, StringRef>
+LLVM_ABI std::pair<FeatureError, StringRef>
 insertWaveSizeFeature(StringRef GPU, const Triple &T,
                       StringMap<bool> &Features);
 
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index b56e6e18805e0..b6f15ef13191f 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -10,6 +10,7 @@
 #define LLVM_TARGETPARSER_TRIPLE_H
 
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/VersionTuple.h"
 
 // Some system headers or GCC predefined macros conflict with identifiers in
@@ -348,10 +349,11 @@ class Triple {
   /// triple fields unknown.
   Triple() = default;
 
-  explicit Triple(const Twine &Str);
-  Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr);
-  Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr,
-         const Twine &EnvironmentStr);
+  LLVM_ABI explicit Triple(const Twine &Str);
+  LLVM_ABI Triple(const Twine &ArchStr, const Twine &VendorStr,
+                  const Twine &OSStr);
+  LLVM_ABI Triple(const Twine &ArchStr, const Twine &VendorStr,
+                  const Twine &OSStr, const Twine &EnvironmentStr);
 
   bool operator==(const Triple &Other) const {
     return Arch == Other.Arch && SubArch == Other.SubArch &&
@@ -381,8 +383,8 @@ class Triple {
   /// reasonably be done).  In particular, it handles the common case in which
   /// otherwise valid components are in the wrong order. \p Form is used to
   /// specify the output canonical form.
-  static std::string normalize(StringRef Str,
-                               CanonicalForm Form = CanonicalForm::ANY);
+  LLVM_ABI static std::string
+  normalize(StringRef Str, CanonicalForm Form = CanonicalForm::ANY);
 
   /// Return the normalized form of this triple's string.
   std::string normalize(CanonicalForm Form = CanonicalForm::ANY) const {
@@ -417,7 +419,7 @@ class Triple {
   /// triple, if present.
   ///
   /// For example, "fooos1.2.3" would return (1, 2, 3).
-  VersionTuple getEnvironmentVersion() const;
+  LLVM_ABI VersionTuple getEnvironmentVersion() const;
 
   /// Get the object format for this triple.
   ObjectFormatType getObjectFormat() const { return ObjectFormat; }
@@ -426,7 +428,7 @@ class Triple {
   /// present.
   ///
   /// For example, "fooos1.2.3" would return (1, 2, 3).
-  VersionTuple getOSVersion() const;
+  LLVM_ABI VersionTuple getOSVersion() const;
 
   /// Return just the major version number, this is specialized because it is a
   /// common query.
@@ -436,26 +438,26 @@ class Triple {
   /// "darwin" versions to the corresponding OS X versions.  This may also be
   /// called with IOS triples but the OS X version number is just set to a
   /// constant 10.4.0 in that case.  Returns true if successful.
-  bool getMacOSXVersion(VersionTuple &Version) const;
+  LLVM_ABI bool getMacOSXVersion(VersionTuple &Version) const;
 
   /// Parse the version number as with getOSVersion.  This should only be called
   /// with IOS or generic triples.
-  VersionTuple getiOSVersion() const;
+  LLVM_ABI VersionTuple getiOSVersion() const;
 
   /// Parse the version number as with getOSVersion.  This should only be called
   /// with WatchOS or generic triples.
-  VersionTuple getWatchOSVersion() const;
+  LLVM_ABI VersionTuple getWatchOSVersion() const;
 
   /// Parse the version number as with getOSVersion.
-  VersionTuple getDriverKitVersion() const;
+  LLVM_ABI VersionTuple getDriverKitVersion() const;
 
   /// Parse the Vulkan version number from the OSVersion and SPIR-V version
   /// (SubArch).  This should only be called with Vulkan SPIR-V triples.
-  VersionTuple getVulkanVersion() const;
+  LLVM_ABI VersionTuple getVulkanVersion() const;
 
   /// Parse the DXIL version number from the OSVersion and DXIL version
   /// (SubArch).  This should only be called with DXIL triples.
-  VersionTuple getDXILVersion() const;
+  LLVM_ABI VersionTuple getDXILVersion() const;
 
   /// @}
   /// @name Direct Component Access
@@ -469,34 +471,34 @@ class Triple {
   bool empty() const { return Data.empty(); }
 
   /// Get the architecture (first) component of the triple.
-  StringRef getArchName() const;
+  LLVM_ABI StringRef getArchName() const;
 
   /// Get the vendor (second) component of the triple.
-  StringRef getVendorName() const;
+  LLVM_ABI StringRef getVendorName() const;
 
   /// Get the operating system (third) component of the triple.
-  StringRef getOSName() const;
+  LLVM_ABI StringRef getOSName() const;
 
   /// Get the optional environment (fourth) component of the triple, or "" if
   /// empty.
-  StringRef getEnvironmentName() const;
+  LLVM_ABI StringRef getEnvironmentName() const;
 
   /// Get the operating system and optional environment components as a single
   /// string (separated by a '-' if the environment component is present).
-  StringRef getOSAndEnvironmentName() const;
+  LLVM_ABI StringRef getOSAndEnvironmentName() const;
 
   /// Get the version component of the environment component as a single
   /// string (the version after the environment).
   ///
   /// For example, "fooos1.2.3" would return "1.2.3".
-  StringRef getEnvironmentVersionString() const;
+  LLVM_ABI StringRef getEnvironmentVersionString() const;
 
   /// @}
   /// @name Convenience Predicates
   /// @{
 
   /// Returns the pointer width of this architecture.
-  static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch);
+  LLVM_ABI static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch);
 
   /// Returns the pointer width of this architecture.
   unsigned getArchPointerBitWidth() const {
@@ -504,7 +506,7 @@ class Triple {
   }
 
   /// Returns the trampoline size in bytes for this configuration.
-  unsigned getTrampolineSize() const;
+  LLVM_ABI unsigned getTrampolineSize() const;
 
   /// Test whether the architecture is 64-bit
   ///
@@ -513,17 +515,17 @@ class Triple {
   /// 16-bit. The inner details of pointer width for particular architectures
   /// is not summed up in the triple, and so only a coarse grained predicate
   /// system is provided.
-  bool isArch64Bit() const;
+  LLVM_ABI bool isArch64Bit() const;
 
   /// Test whether the architecture is 32-bit
   ///
   /// Note that this tests for 32-bit pointer width, and nothing else.
-  bool isArch32Bit() const;
+  LLVM_ABI bool isArch32Bit() const;
 
   /// Test whether the architecture is 16-bit
   ///
   /// Note that this tests for 16-bit pointer width, and nothing else.
-  bool isArch16Bit() const;
+  LLVM_ABI bool isArch16Bit() const;
 
   /// Helper function for doing comparisons against version numbers included in
   /// the target triple.
@@ -544,8 +546,8 @@ class Triple {
 
   /// Comparison function for checking OS X version compatibility, which handles
   /// supporting skewed version numbering schemes used by the "darwin" triples.
-  bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0,
-                         unsigned Micro = 0) const;
+  LLVM_ABI bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0,
+                                  unsigned Micro = 0) const;
 
   /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin"
   /// and "osx" as OS X triples.
@@ -1171,38 +1173,38 @@ class Triple {
   /// @{
 
   /// Set the architecture (first) component of the triple to a known type.
-  void setArch(ArchType Kind, SubArchType SubArch = NoSubArch);
+  LLVM_ABI void setArch(ArchType Kind, SubArchType SubArch = NoSubArch);
 
   /// Set the vendor (second) component of the triple to a known type.
-  void setVendor(VendorType Kind);
+  LLVM_ABI void setVendor(VendorType Kind);
 
   /// Set the operating system (third) component of the triple to a known type.
-  void setOS(OSType Kind);
+  LLVM_ABI void setOS(OSType Kind);
 
   /// Set the environment (fourth) component of the triple to a known type.
-  void setEnvironment(EnvironmentType Kind);
+  LLVM_ABI void setEnvironment(EnvironmentType Kind);
 
   /// Set the object file format.
-  void setObjectFormat(ObjectFormatType Kind);
+  LLVM_ABI void setObjectFormat(ObjectFormatType Kind);
 
   /// Set all components to the new triple \p Str.
-  void setTriple(const Twine &Str);
+  LLVM_ABI void setTriple(const Twine &Str);
 
   /// Set the architecture (first) component of the triple by name.
-  void setArchName(StringRef Str);
+  LLVM_ABI void setArchName(StringRef Str);
 
   /// Set the vendor (second) component of the triple by name.
-  void setVendorName(StringRef Str);
+  LLVM_ABI void setVendorName(StringRef Str);
 
   /// Set the operating system (third) component of the triple by name.
-  void setOSName(StringRef Str);
+  LLVM_ABI void setOSName(StringRef Str);
 
   /// Set the optional environment (fourth) component of the triple by name.
-  void setEnvironmentName(StringRef Str);
+  LLVM_ABI void setEnvironmentName(StringRef Str);
 
   /// Set the operating system and optional environment components with a single
   /// string.
-  void setOSAndEnvironmentName(StringRef Str);
+  LLVM_ABI void setOSAndEnvironmentName(StringRef Str);
 
   /// @}
   /// @name Helpers to build variants of a particular triple.
@@ -1214,7 +1216,7 @@ class Triple {
   ///
   /// \returns A new triple with a 32-bit architecture or an unknown
   ///          architecture if no such variant can be found.
-  llvm::Triple get32BitArchVariant() const;
+  LLVM_ABI llvm::Triple get32BitArchVariant() const;
 
   /// Form a triple with a 64-bit variant of the current architecture.
   ///
@@ -1222,7 +1224,7 @@ class Triple {
   ///
   /// \returns A new triple with a 64-bit architecture or an unknown
   ///          architecture if no such variant can be found.
-  llvm::Triple get64BitArchVariant() const;
+  LLVM_ABI llvm::Triple get64BitArchVariant() const;
 
   /// Form a triple with a big endian variant of the current architecture.
   ///
@@ -1230,7 +1232,7 @@ class Triple {
   ///
   /// \returns A new triple with a big endian architecture or an unknown
   ///          architecture if no such variant can be found.
-  llvm::Triple getBigEndianArchVariant() const;
+  LLVM_ABI llvm::Triple getBigEndianArchVariant() const;
 
   /// Form a triple with a little endian variant of the current architecture.
   ///
@@ -1238,73 +1240,76 @@ class Triple {
   ///
   /// \returns A new triple with a little endian architecture or an unknown
   ///          architecture if no such variant can be found.
-  llvm::Triple getLittleEndianArchVariant() const;
+  LLVM_ABI llvm::Triple getLittleEndianArchVariant() const;
 
   /// Tests whether the target triple is little endian.
   ///
   /// \returns true if the triple is little endian, false otherwise.
-  bool isLittleEndian() const;
+  LLVM_ABI bool isLittleEndian() const;
 
   /// Test whether target triples are compatible.
-  bool isCompatibleWith(const Triple &Other) const;
+  LLVM_ABI bool isCompatibleWith(const Triple &Other) const;
 
   /// Test whether the target triple is for a GPU.
   bool isGPU() const { return isSPIRV() || isNVPTX() || isAMDGPU(); }
 
   /// Merge target triples.
-  std::string merge(const Triple &Other) const;
+  LLVM_ABI std::string merge(const Triple &Other) const;
 
   /// Some platforms have different minimum supported OS versions that
   /// varies by the architecture specified in the triple. This function
   /// returns the minimum supported OS version for this triple if one an exists,
   /// or an invalid version tuple if this triple doesn't have one.
-  VersionTuple getMinimumSupportedOSVersion() const;
+  LLVM_ABI VersionTuple getMinimumSupportedOSVersion() const;
 
   /// @}
   /// @name Static helpers for IDs.
   /// @{
 
   /// Get the canonical name for the \p Kind architecture.
-  static StringRef getArchTypeName(ArchType Kind);
+  LLVM_ABI static StringRef getArchTypeName(ArchType Kind);
 
   /// Get the architecture name based on \p Kind and \p SubArch.
-  static StringRef getArchName(ArchType Kind, SubArchType SubArch = NoSubArch);
+  LLVM_ABI static StringRef getArchName(ArchType Kind,
+                                        SubArchType SubArch = NoSubArch);
 
   /// Get the "prefix" canonical name for the \p Kind architecture. This is the
   /// prefix used by the architecture specific builtins, and is suitable for
   /// passing to \see Intrinsic::getIntrinsicForClangBuiltin().
   ///
   /// \return - The architecture prefix, or 0 if none is defined.
-  static StringRef getArchTypePrefix(ArchType Kind);
+  LLVM_ABI static StringRef getArchTypePrefix(ArchType Kind);
 
   /// Get the canonical name for the \p Kind vendor.
-  static StringRef getVendorTypeName(VendorType Kind);
+  LLVM_ABI static StringRef getVendorTypeName(VendorType Kind);
 
   /// Get the canonical name for the \p Kind operating system.
-  static StringRef getOSTypeName(OSType Kind);
+  LLVM_ABI static StringRef getOSTypeName(OSType Kind);
 
   /// Get the canonical name for the \p Kind environment.
-  static StringRef getEnvironmentTypeName(EnvironmentType Kind);
+  LLVM_ABI static StringRef getEnvironmentTypeName(EnvironmentType Kind);
 
   /// Get the name for the \p Object format.
-  static StringRef getObjectFormatTypeName(ObjectFormatType ObjectFormat);
+  LLVM_ABI static StringRef
+  getObjectFormatTypeName(ObjectFormatType ObjectFormat);
 
   /// @}
   /// @name Static helpers for converting alternate architecture names.
   /// @{
 
   /// The canonical type for the given LLVM architecture name (e.g., "x86").
-  static ArchType getArchTypeForLLVMName(StringRef Str);
+  LLVM_ABI static ArchType getArchTypeForLLVMName(StringRef Str);
 
   /// @}
 
   /// Returns a canonicalized OS version number for the specified OS.
-  static VersionTuple getCanonicalVersionForOS(OSType OSKind,
-                                               const VersionTuple &Version,
-                                               bool IsInValidRange);
+  LLVM_ABI static VersionTuple
+  getCanonicalVersionForOS(OSType OSKind, const VersionTuple &Version,
+                           bool IsInValidRange);
 
   /// Returns whether an OS version is invalid and would not map to an Apple OS.
-  static bool isValidVersionForOS(OSType OSKind, const VersionTuple &Version);
+  LLVM_ABI static bool isValidVersionForOS(OSType OSKind,
+                                           const VersionTuple &Version);
 };
 
 } // End llvm namespace
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.h b/llvm/include/llvm/TargetParser/X86TargetParser.h
index 8447aca7bb92a..f6aeaada346e7 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.h
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.h
@@ -15,6 +15,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Compiler.h"
 #include <array>
 
 namespace llvm {
@@ -153,34 +154,36 @@ enum CPUKind {
 
 /// Parse \p CPU string into a CPUKind. Will only accept 64-bit capable CPUs if
 /// \p Only64Bit is true.
-CPUKind parseArchX86(StringRef CPU, bool Only64Bit = false);
-CPUKind parseTuneCPU(StringRef CPU, bool Only64Bit = false);
+LLVM_ABI CPUKind parseArchX86(StringRef CPU, bool Only64Bit = false);
+LLVM_ABI CPUKind parseTuneCPU(StringRef CPU, bool Only64Bit = false);
 
 /// Provide a list of valid CPU names. If \p Only64Bit is true, the list will
 /// only contain 64-bit capable CPUs.
-void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values,
-                          bool Only64Bit = false);
+LLVM_ABI void fillValidCPUArchList(SmallVectorImpl<StringRef> &Values,
+                                   bool Only64Bit = false);
 /// Provide a list of valid -mtune names.
-void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values,
-                          bool Only64Bit = false);
+LLVM_ABI void fillValidTuneCPUList(SmallVectorImpl<StringRef> &Values,
+                                   bool Only64Bit = false);
 
 /// Get the key feature prioritizing target multiversioning.
-ProcessorFeatures getKeyFeature(CPUKind Kind);
+LLVM_ABI ProcessorFeatures getKeyFeature(CPUKind Kind);
 
 /// Fill in the features that \p CPU supports into \p Features.
 /// "+" will be append in front of each feature if NeedPlus is true.
-void getFeaturesForCPU(StringRef CPU, SmallVectorImpl<StringRef> &Features,
-                       bool NeedPlus = false);
+LLVM_ABI void getFeaturesForCPU(StringRef CPU,
+                                SmallVectorImpl<StringRef> &Features,
+                                bool NeedPlus = false);
 
 /// Set or clear entries in \p Features that are implied to be enabled/disabled
 /// by the provided \p Feature.
-void updateImpliedFeatures(StringRef Feature, bool Enabled,
-                           StringMap<bool> &Features);
-
-char getCPUDispatchMangling(StringRef Name);
-bool validateCPUSpecificCPUDispatch(StringRef Name);
-std::array<uint32_t, 4> getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs);
-unsigned getFeaturePriority(ProcessorFeatures Feat);
+LLVM_ABI void updateImpliedFeatures(StringRef Feature, bool Enabled,
+                                    StringMap<bool> &Features);
+
+LLVM_ABI char getCPUDispatchMangling(StringRef Name);
+LLVM_ABI bool validateCPUSpecificCPUDispatch(StringRef Name);
+LLVM_ABI std::array<uint32_t, 4>
+getCpuSupportsMask(ArrayRef<StringRef> FeatureStrs);
+LLVM_ABI unsigned getFeaturePriority(ProcessorFeatures Feat);
 
 } // namespace X86
 } // namespace llvm

From 8f8ed23c6247e9c1dd2df4494930813b353c52c4 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Wed, 11 Jun 2025 09:19:13 -0700
Subject: [PATCH 099/851] [llvm] annotate interfaces in llvm/SandboxIR for DLL
 export (#142863)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/SandboxIR` library.
These annotations currently have no meaningful impact on the LLVM build;
however, they are a prerequisite to support an LLVM Windows DLL (shared
library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

The bulk of these changes were generated automatically using the
[Interface Definition Scanner (IDS)](https://github.com/compnerd/ids)
tool, followed formatting with `git clang-format`.

The following manual adjustments were also applied after running IDS on
Linux:
- Remove explicit `GlobalWithNodeAPI::LLVMGVToGV::operator()` template
function instantiations that were previously added for the dylib build.
Instead, directly annotate the `LLVMGVToGV::operator()` method with
`LLVM_ABI`. This is done so the DLL build works with both MSVC and
clang-cl.
- Explicitly `#include "llvm/SandboxIR/Value.h"` in `Tracker.h` so that
the symbol is available for exported templates in this file. These
templates get fully instantiated on DLL export, so they require the full
definition of `Value`.
- Add extern template instantiation declarations for `GlobalWithNodeAPI`
template types in `Constants.h` and annotate them with
`LLVM_TEMPLATE_ABI`.
- Add `LLVM_EXPORT_TEMPLATE` to `GlobalWithNodeAPI` template
instantiations in `Constants.cpp`.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/SandboxIR/BasicBlock.h  |  21 +-
 llvm/include/llvm/SandboxIR/Constant.h    | 207 ++++----
 llvm/include/llvm/SandboxIR/Context.h     | 135 +++---
 llvm/include/llvm/SandboxIR/Function.h    |   5 +-
 llvm/include/llvm/SandboxIR/Instruction.h | 547 +++++++++++-----------
 llvm/include/llvm/SandboxIR/Module.h      |  10 +-
 llvm/include/llvm/SandboxIR/PassManager.h |   6 +-
 llvm/include/llvm/SandboxIR/Region.h      |  19 +-
 llvm/include/llvm/SandboxIR/Tracker.h     |  34 +-
 llvm/include/llvm/SandboxIR/Type.h        |  53 ++-
 llvm/include/llvm/SandboxIR/Use.h         |   9 +-
 llvm/include/llvm/SandboxIR/User.h        |  13 +-
 llvm/include/llvm/SandboxIR/Value.h       |  20 +-
 llvm/lib/SandboxIR/Constant.cpp           |  37 +-
 14 files changed, 565 insertions(+), 551 deletions(-)

diff --git a/llvm/include/llvm/SandboxIR/BasicBlock.h b/llvm/include/llvm/SandboxIR/BasicBlock.h
index 93e79e2a421f9..25bbb6c058faa 100644
--- a/llvm/include/llvm/SandboxIR/BasicBlock.h
+++ b/llvm/include/llvm/SandboxIR/BasicBlock.h
@@ -11,6 +11,7 @@
 
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/SandboxIR/Value.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -32,20 +33,20 @@ class BBIterator {
   llvm::BasicBlock *BB;
   llvm::BasicBlock::iterator It;
   Context *Ctx;
-  pointer getInstr(llvm::BasicBlock::iterator It) const;
+  LLVM_ABI pointer getInstr(llvm::BasicBlock::iterator It) const;
 
 public:
   BBIterator() : BB(nullptr), Ctx(nullptr) {}
   BBIterator(llvm::BasicBlock *BB, llvm::BasicBlock::iterator It, Context *Ctx)
       : BB(BB), It(It), Ctx(Ctx) {}
   reference operator*() const { return *getInstr(It); }
-  BBIterator &operator++();
+  LLVM_ABI BBIterator &operator++();
   BBIterator operator++(int) {
     auto Copy = *this;
     ++*this;
     return Copy;
   }
-  BBIterator &operator--();
+  LLVM_ABI BBIterator &operator--();
   BBIterator operator--(int) {
     auto Copy = *this;
     --*this;
@@ -60,14 +61,14 @@ class BBIterator {
   /// the instruction is not found in the IR-to-SandboxIR tables.
   pointer get() const { return getInstr(It); }
   /// \Returns the parent BB.
-  BasicBlock *getNodeParent() const;
+  LLVM_ABI BasicBlock *getNodeParent() const;
 };
 
 /// Contains a list of sandboxir::Instruction's.
 class BasicBlock : public Value {
   /// Builds a graph that contains all values in \p BB in their original form
   /// i.e., no vectorization is taking place here.
-  void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB);
+  LLVM_ABI void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB);
   friend class Context;     // For `buildBasicBlockFromIR`
   friend class Instruction; // For LLVM Val.
 
@@ -82,9 +83,9 @@ class BasicBlock : public Value {
   static bool classof(const Value *From) {
     return From->getSubclassID() == Value::ClassID::Block;
   }
-  Function *getParent() const;
+  LLVM_ABI Function *getParent() const;
   using iterator = BBIterator;
-  iterator begin() const;
+  LLVM_ABI iterator begin() const;
   iterator end() const {
     auto *BB = cast<llvm::BasicBlock>(Val);
     return iterator(BB, BB->end(), &Ctx);
@@ -96,10 +97,10 @@ class BasicBlock : public Value {
     return std::make_reverse_iterator(begin());
   }
   Context &getContext() const { return Ctx; }
-  Instruction *getTerminator() const;
+  LLVM_ABI Instruction *getTerminator() const;
   bool empty() const { return begin() == end(); }
-  Instruction &front() const;
-  Instruction &back() const;
+  LLVM_ABI Instruction &front() const;
+  LLVM_ABI Instruction &back() const;
 
 #ifndef NDEBUG
   void verify() const final;
diff --git a/llvm/include/llvm/SandboxIR/Constant.h b/llvm/include/llvm/SandboxIR/Constant.h
index e7b18a442d330..6f682a7059d10 100644
--- a/llvm/include/llvm/SandboxIR/Constant.h
+++ b/llvm/include/llvm/SandboxIR/Constant.h
@@ -76,16 +76,16 @@ class ConstantInt : public Constant {
   }
 
 public:
-  static ConstantInt *getTrue(Context &Ctx);
-  static ConstantInt *getFalse(Context &Ctx);
-  static ConstantInt *getBool(Context &Ctx, bool V);
-  static Constant *getTrue(Type *Ty);
-  static Constant *getFalse(Type *Ty);
-  static Constant *getBool(Type *Ty, bool V);
+  LLVM_ABI static ConstantInt *getTrue(Context &Ctx);
+  LLVM_ABI static ConstantInt *getFalse(Context &Ctx);
+  LLVM_ABI static ConstantInt *getBool(Context &Ctx, bool V);
+  LLVM_ABI static Constant *getTrue(Type *Ty);
+  LLVM_ABI static Constant *getFalse(Type *Ty);
+  LLVM_ABI static Constant *getBool(Type *Ty, bool V);
 
   /// If Ty is a vector type, return a Constant with a splat of the given
   /// value. Otherwise return a ConstantInt for the given value.
-  static ConstantInt *get(Type *Ty, uint64_t V, bool IsSigned = false);
+  LLVM_ABI static ConstantInt *get(Type *Ty, uint64_t V, bool IsSigned = false);
 
   /// Return a ConstantInt with the specified integer value for the specified
   /// type. If the type is wider than 64 bits, the value will be zero-extended
@@ -93,27 +93,29 @@ class ConstantInt : public Constant {
   /// be interpreted as a 64-bit signed integer and sign-extended to fit
   /// the type.
   /// Get a ConstantInt for a specific value.
-  static ConstantInt *get(IntegerType *Ty, uint64_t V, bool IsSigned = false);
+  LLVM_ABI static ConstantInt *get(IntegerType *Ty, uint64_t V,
+                                   bool IsSigned = false);
 
   /// Return a ConstantInt with the specified value for the specified type. The
   /// value V will be canonicalized to a an unsigned APInt. Accessing it with
   /// either getSExtValue() or getZExtValue() will yield a correctly sized and
   /// signed value for the type Ty.
   /// Get a ConstantInt for a specific signed value.
-  static ConstantInt *getSigned(IntegerType *Ty, int64_t V);
-  static Constant *getSigned(Type *Ty, int64_t V);
+  LLVM_ABI static ConstantInt *getSigned(IntegerType *Ty, int64_t V);
+  LLVM_ABI static Constant *getSigned(Type *Ty, int64_t V);
 
   /// Return a ConstantInt with the specified value and an implied Type. The
   /// type is the integer type that corresponds to the bit width of the value.
-  static ConstantInt *get(Context &Ctx, const APInt &V);
+  LLVM_ABI static ConstantInt *get(Context &Ctx, const APInt &V);
 
   /// Return a ConstantInt constructed from the string strStart with the given
   /// radix.
-  static ConstantInt *get(IntegerType *Ty, StringRef Str, uint8_t Radix);
+  LLVM_ABI static ConstantInt *get(IntegerType *Ty, StringRef Str,
+                                   uint8_t Radix);
 
   /// If Ty is a vector type, return a Constant with a splat of the given
   /// value. Otherwise return a ConstantInt for the given value.
-  static Constant *get(Type *Ty, const APInt &V);
+  LLVM_ABI static Constant *get(Type *Ty, const APInt &V);
 
   /// Return the constant as an APInt value reference. This allows clients to
   /// obtain a full-precision copy of the value.
@@ -166,7 +168,7 @@ class ConstantInt : public Constant {
 
   /// Variant of the getType() method to always return an IntegerType, which
   /// reduces the amount of casting needed in parts of the compiler.
-  IntegerType *getIntegerType() const;
+  LLVM_ABI IntegerType *getIntegerType() const;
 
   /// This static method returns true if the type Ty is big enough to
   /// represent the value V. This can be used to avoid having the get method
@@ -177,8 +179,8 @@ class ConstantInt : public Constant {
   /// to the appropriate unsigned type before calling the method.
   /// @returns true if V is a valid value for type Ty
   /// Determine if the value is in range for the given type.
-  static bool isValueValidForType(Type *Ty, uint64_t V);
-  static bool isValueValidForType(Type *Ty, int64_t V);
+  LLVM_ABI static bool isValueValidForType(Type *Ty, uint64_t V);
+  LLVM_ABI static bool isValueValidForType(Type *Ty, int64_t V);
 
   bool isNegative() const { return cast<llvm::ConstantInt>(Val)->isNegative(); }
 
@@ -264,29 +266,29 @@ class ConstantFP final : public Constant {
   /// for the specified value in the specified type. This should only be used
   /// for simple constant values like 2.0/1.0 etc, that are known-valid both as
   /// host double and as the target format.
-  static Constant *get(Type *Ty, double V);
+  LLVM_ABI static Constant *get(Type *Ty, double V);
 
   /// If Ty is a vector type, return a Constant with a splat of the given
   /// value. Otherwise return a ConstantFP for the given value.
-  static Constant *get(Type *Ty, const APFloat &V);
+  LLVM_ABI static Constant *get(Type *Ty, const APFloat &V);
 
-  static Constant *get(Type *Ty, StringRef Str);
+  LLVM_ABI static Constant *get(Type *Ty, StringRef Str);
 
-  static ConstantFP *get(const APFloat &V, Context &Ctx);
+  LLVM_ABI static ConstantFP *get(const APFloat &V, Context &Ctx);
 
-  static Constant *getNaN(Type *Ty, bool Negative = false,
-                          uint64_t Payload = 0);
-  static Constant *getQNaN(Type *Ty, bool Negative = false,
-                           APInt *Payload = nullptr);
-  static Constant *getSNaN(Type *Ty, bool Negative = false,
-                           APInt *Payload = nullptr);
-  static Constant *getZero(Type *Ty, bool Negative = false);
+  LLVM_ABI static Constant *getNaN(Type *Ty, bool Negative = false,
+                                   uint64_t Payload = 0);
+  LLVM_ABI static Constant *getQNaN(Type *Ty, bool Negative = false,
+                                    APInt *Payload = nullptr);
+  LLVM_ABI static Constant *getSNaN(Type *Ty, bool Negative = false,
+                                    APInt *Payload = nullptr);
+  LLVM_ABI static Constant *getZero(Type *Ty, bool Negative = false);
 
-  static Constant *getNegativeZero(Type *Ty);
-  static Constant *getInfinity(Type *Ty, bool Negative = false);
+  LLVM_ABI static Constant *getNegativeZero(Type *Ty);
+  LLVM_ABI static Constant *getInfinity(Type *Ty, bool Negative = false);
 
   /// Return true if Ty is big enough to represent V.
-  static bool isValueValidForType(Type *Ty, const APFloat &V);
+  LLVM_ABI static bool isValueValidForType(Type *Ty, const APFloat &V);
 
   inline const APFloat &getValueAPF() const {
     return cast<llvm::ConstantFP>(Val)->getValueAPF();
@@ -362,8 +364,8 @@ class ConstantArray final : public ConstantAggregate {
   friend class Context; // For constructor.
 
 public:
-  static Constant *get(ArrayType *T, ArrayRef<Constant *> V);
-  ArrayType *getType() const;
+  LLVM_ABI static Constant *get(ArrayType *T, ArrayRef<Constant *> V);
+  LLVM_ABI ArrayType *getType() const;
 
   // TODO: Missing functions: getType(), getTypeForElements(), getAnon(), get().
 
@@ -379,7 +381,7 @@ class ConstantStruct final : public ConstantAggregate {
   friend class Context; // For constructor.
 
 public:
-  static Constant *get(StructType *T, ArrayRef<Constant *> V);
+  LLVM_ABI static Constant *get(StructType *T, ArrayRef<Constant *> V);
 
   template <typename... Csts>
   static std::enable_if_t<are_base_of<Constant, Csts...>::value, Constant *>
@@ -396,8 +398,8 @@ class ConstantStruct final : public ConstantAggregate {
     return get(getTypeForElements(Ctx, V, Packed), V);
   }
   /// This version of the method allows an empty list.
-  static StructType *getTypeForElements(Context &Ctx, ArrayRef<Constant *> V,
-                                        bool Packed = false);
+  LLVM_ABI static StructType *
+  getTypeForElements(Context &Ctx, ArrayRef<Constant *> V, bool Packed = false);
   /// Return an anonymous struct type to use for a constant with the specified
   /// set of elements. The list must not be empty.
   static StructType *getTypeForElements(ArrayRef<Constant *> V,
@@ -424,10 +426,10 @@ class ConstantVector final : public ConstantAggregate {
   friend class Context; // For constructor.
 
 public:
-  static Constant *get(ArrayRef<Constant *> V);
+  LLVM_ABI static Constant *get(ArrayRef<Constant *> V);
   /// Return a ConstantVector with the specified constant in each element.
   /// Note that this might not return an instance of ConstantVector
-  static Constant *getSplat(ElementCount EC, Constant *Elt);
+  LLVM_ABI static Constant *getSplat(ElementCount EC, Constant *Elt);
   /// Specialize the getType() method to always return a FixedVectorType,
   /// which reduces the amount of casting needed in parts of the compiler.
   inline FixedVectorType *getType() const {
@@ -436,7 +438,7 @@ class ConstantVector final : public ConstantAggregate {
   /// If all elements of the vector constant have the same value, return that
   /// value. Otherwise, return nullptr. Ignore poison elements by setting
   /// AllowPoison to true.
-  Constant *getSplatValue(bool AllowPoison = false) const;
+  LLVM_ABI Constant *getSplatValue(bool AllowPoison = false) const;
 
   /// For isa/dyn_cast.
   static bool classof(const Value *From) {
@@ -451,18 +453,18 @@ class ConstantAggregateZero final : public Constant {
   friend class Context; // For constructor.
 
 public:
-  static ConstantAggregateZero *get(Type *Ty);
+  LLVM_ABI static ConstantAggregateZero *get(Type *Ty);
   /// If this CAZ has array or vector type, return a zero with the right element
   /// type.
-  Constant *getSequentialElement() const;
+  LLVM_ABI Constant *getSequentialElement() const;
   /// If this CAZ has struct type, return a zero with the right element type for
   /// the specified element.
-  Constant *getStructElement(unsigned Elt) const;
+  LLVM_ABI Constant *getStructElement(unsigned Elt) const;
   /// Return a zero of the right value for the specified GEP index if we can,
   /// otherwise return null (e.g. if C is a ConstantExpr).
-  Constant *getElementValue(Constant *C) const;
+  LLVM_ABI Constant *getElementValue(Constant *C) const;
   /// Return a zero of the right value for the specified GEP index.
-  Constant *getElementValue(unsigned Idx) const;
+  LLVM_ABI Constant *getElementValue(unsigned Idx) const;
   /// Return the number of elements in the array, vector, or struct.
   ElementCount getElementCount() const {
     return cast<llvm::ConstantAggregateZero>(Val)->getElementCount();
@@ -769,9 +771,9 @@ class ConstantPointerNull final : public Constant {
   friend class Context; // For constructor.
 
 public:
-  static ConstantPointerNull *get(PointerType *Ty);
+  LLVM_ABI static ConstantPointerNull *get(PointerType *Ty);
 
-  PointerType *getType() const;
+  LLVM_ABI PointerType *getType() const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -802,22 +804,22 @@ class UndefValue : public Constant {
 
 public:
   /// Static factory methods - Return an 'undef' object of the specified type.
-  static UndefValue *get(Type *T);
+  LLVM_ABI static UndefValue *get(Type *T);
 
   /// If this Undef has array or vector type, return a undef with the right
   /// element type.
-  UndefValue *getSequentialElement() const;
+  LLVM_ABI UndefValue *getSequentialElement() const;
 
   /// If this undef has struct type, return a undef with the right element type
   /// for the specified element.
-  UndefValue *getStructElement(unsigned Elt) const;
+  LLVM_ABI UndefValue *getStructElement(unsigned Elt) const;
 
   /// Return an undef of the right value for the specified GEP index if we can,
   /// otherwise return null (e.g. if C is a ConstantExpr).
-  UndefValue *getElementValue(Constant *C) const;
+  LLVM_ABI UndefValue *getElementValue(Constant *C) const;
 
   /// Return an undef of the right value for the specified GEP index.
-  UndefValue *getElementValue(unsigned Idx) const;
+  LLVM_ABI UndefValue *getElementValue(unsigned Idx) const;
 
   /// Return the number of elements in the array, vector, or struct.
   unsigned getNumElements() const {
@@ -850,22 +852,22 @@ class PoisonValue final : public UndefValue {
 
 public:
   /// Static factory methods - Return an 'poison' object of the specified type.
-  static PoisonValue *get(Type *T);
+  LLVM_ABI static PoisonValue *get(Type *T);
 
   /// If this poison has array or vector type, return a poison with the right
   /// element type.
-  PoisonValue *getSequentialElement() const;
+  LLVM_ABI PoisonValue *getSequentialElement() const;
 
   /// If this poison has struct type, return a poison with the right element
   /// type for the specified element.
-  PoisonValue *getStructElement(unsigned Elt) const;
+  LLVM_ABI PoisonValue *getStructElement(unsigned Elt) const;
 
   /// Return an poison of the right value for the specified GEP index if we can,
   /// otherwise return null (e.g. if C is a ConstantExpr).
-  PoisonValue *getElementValue(Constant *C) const;
+  LLVM_ABI PoisonValue *getElementValue(Constant *C) const;
 
   /// Return an poison of the right value for the specified GEP index.
-  PoisonValue *getElementValue(unsigned Idx) const;
+  LLVM_ABI PoisonValue *getElementValue(unsigned Idx) const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -924,7 +926,7 @@ class GlobalValue : public Constant {
   UnnamedAddr getUnnamedAddr() const {
     return cast<llvm::GlobalValue>(Val)->getUnnamedAddr();
   }
-  void setUnnamedAddr(UnnamedAddr V);
+  LLVM_ABI void setUnnamedAddr(UnnamedAddr V);
 
   static UnnamedAddr getMinUnnamedAddr(UnnamedAddr A, UnnamedAddr B) {
     return llvm::GlobalValue::getMinUnnamedAddr(A, B);
@@ -946,7 +948,7 @@ class GlobalValue : public Constant {
   bool hasProtectedVisibility() const {
     return cast<llvm::GlobalValue>(Val)->hasProtectedVisibility();
   }
-  void setVisibility(VisibilityTypes V);
+  LLVM_ABI void setVisibility(VisibilityTypes V);
 
   // TODO: Add missing functions.
 };
@@ -996,7 +998,7 @@ class GlobalObject : public GlobalValue {
   ///
   /// Setting the section to the empty string tells LLVM to choose an
   /// appropriate default object file section.
-  void setSection(StringRef S);
+  LLVM_ABI void setSection(StringRef S);
 
   bool hasComdat() const { return cast<llvm::GlobalObject>(Val)->hasComdat(); }
 
@@ -1031,7 +1033,7 @@ class GlobalWithNodeAPI : public ParentT {
   struct LLVMGVToGV {
     Context &Ctx;
     LLVMGVToGV(Context &Ctx) : Ctx(Ctx) {}
-    GlobalT &operator()(LLVMGlobalT &LLVMGV) const;
+    LLVM_ABI GlobalT &operator()(LLVMGlobalT &LLVMGV) const;
   };
 
 public:
@@ -1060,24 +1062,15 @@ class GlobalWithNodeAPI : public ParentT {
   }
 };
 
-// These are needed for SandboxIRTest when building with LLVM_BUILD_LLVM_DYLIB
-extern template LLVM_TEMPLATE_ABI GlobalIFunc &
-GlobalWithNodeAPI<GlobalIFunc, llvm::GlobalIFunc, GlobalObject,
-                  llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalIFunc
-                                                                  &LLVMGV)
-    const;
-extern template LLVM_TEMPLATE_ABI Function &
-GlobalWithNodeAPI<Function, llvm::Function, GlobalObject, llvm::GlobalObject>::
-    LLVMGVToGV::operator()(llvm::Function &LLVMGV) const;
-
-extern template LLVM_TEMPLATE_ABI GlobalVariable &GlobalWithNodeAPI<
-    GlobalVariable, llvm::GlobalVariable, GlobalObject,
-    llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalVariable &LLVMGV)
-    const;
-extern template LLVM_TEMPLATE_ABI GlobalAlias &
-GlobalWithNodeAPI<GlobalAlias, llvm::GlobalAlias, GlobalValue,
-                  llvm::GlobalValue>::LLVMGVToGV::operator()(llvm::GlobalAlias
-                                                                 &LLVMGV) const;
+// Explicit instantiations.
+extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI<
+    GlobalIFunc, llvm::GlobalIFunc, GlobalObject, llvm::GlobalObject>;
+extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI<
+    Function, llvm::Function, GlobalObject, llvm::GlobalObject>;
+extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI<
+    GlobalVariable, llvm::GlobalVariable, GlobalObject, llvm::GlobalObject>;
+extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI<
+    GlobalAlias, llvm::GlobalAlias, GlobalValue, llvm::GlobalValue>;
 
 class GlobalIFunc final
     : public GlobalWithNodeAPI<GlobalIFunc, llvm::GlobalIFunc, GlobalObject,
@@ -1097,13 +1090,13 @@ class GlobalIFunc final
   // TODO: Missing functions: copyAttributesFrom(), removeFromParent(),
   // eraseFromParent()
 
-  void setResolver(Constant *Resolver);
+  LLVM_ABI void setResolver(Constant *Resolver);
 
-  Constant *getResolver() const;
+  LLVM_ABI Constant *getResolver() const;
 
   // Return the resolver function after peeling off potential ConstantExpr
   // indirection.
-  Function *getResolverFunction();
+  LLVM_ABI Function *getResolverFunction();
   const Function *getResolverFunction() const {
     return const_cast<GlobalIFunc *>(this)->getResolverFunction();
   }
@@ -1136,7 +1129,7 @@ class GlobalVariable final
   struct LLVMGVToGV {
     Context &Ctx;
     LLVMGVToGV(Context &Ctx) : Ctx(Ctx) {}
-    GlobalVariable &operator()(llvm::GlobalVariable &LLVMGV) const;
+    LLVM_ABI GlobalVariable &operator()(llvm::GlobalVariable &LLVMGV) const;
   };
 
 public:
@@ -1181,11 +1174,11 @@ class GlobalVariable final
   /// illegal to call this method if the global is external, because we cannot
   /// tell what the value is initialized to!
   ///
-  Constant *getInitializer() const;
+  LLVM_ABI Constant *getInitializer() const;
   /// setInitializer - Sets the initializer for this global variable, removing
   /// any existing initializer if InitVal==NULL. The initializer must have the
   /// type getValueType().
-  void setInitializer(Constant *InitVal);
+  LLVM_ABI void setInitializer(Constant *InitVal);
 
   // TODO: Add missing replaceInitializer(). Requires special tracker
 
@@ -1196,12 +1189,12 @@ class GlobalVariable final
   bool isConstant() const {
     return cast<llvm::GlobalVariable>(Val)->isConstant();
   }
-  void setConstant(bool V);
+  LLVM_ABI void setConstant(bool V);
 
   bool isExternallyInitialized() const {
     return cast<llvm::GlobalVariable>(Val)->isExternallyInitialized();
   }
-  void setExternallyInitialized(bool Val);
+  LLVM_ABI void setExternallyInitialized(bool Val);
 
   // TODO: Missing copyAttributesFrom()
 
@@ -1278,7 +1271,7 @@ class GlobalVariable final
   /// Sets the alignment attribute of the GlobalVariable.
   /// This method will be deprecated as the alignment property should always be
   /// defined.
-  void setAlignment(MaybeAlign Align);
+  LLVM_ABI void setAlignment(MaybeAlign Align);
 
   // TODO: Missing setCodeModel(). Requires custom tracker.
 
@@ -1311,10 +1304,10 @@ class GlobalAlias final
   // TODO: Missing copyAttributresFrom().
   // TODO: Missing removeFromParent(), eraseFromParent().
 
-  void setAliasee(Constant *Aliasee);
-  Constant *getAliasee() const;
+  LLVM_ABI void setAliasee(Constant *Aliasee);
+  LLVM_ABI Constant *getAliasee() const;
 
-  const GlobalObject *getAliaseeObject() const;
+  LLVM_ABI const GlobalObject *getAliaseeObject() const;
   GlobalObject *getAliaseeObject() {
     return const_cast<GlobalObject *>(
         static_cast<const GlobalAlias *>(this)->getAliaseeObject());
@@ -1336,12 +1329,12 @@ class NoCFIValue final : public Constant {
 
 public:
   /// Return a NoCFIValue for the specified function.
-  static NoCFIValue *get(GlobalValue *GV);
+  LLVM_ABI static NoCFIValue *get(GlobalValue *GV);
 
-  GlobalValue *getGlobalValue() const;
+  LLVM_ABI GlobalValue *getGlobalValue() const;
 
   /// NoCFIValue is always a pointer.
-  PointerType *getType() const;
+  LLVM_ABI PointerType *getType() const;
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
     return From->getSubclassID() == ClassID::NoCFIValue;
@@ -1369,21 +1362,21 @@ class ConstantPtrAuth final : public Constant {
 
 public:
   /// Return a pointer signed with the specified parameters.
-  static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key,
-                              ConstantInt *Disc, Constant *AddrDisc);
+  LLVM_ABI static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key,
+                                       ConstantInt *Disc, Constant *AddrDisc);
   /// The pointer that is signed in this ptrauth signed pointer.
-  Constant *getPointer() const;
+  LLVM_ABI Constant *getPointer() const;
 
   /// The Key ID, an i32 constant.
-  ConstantInt *getKey() const;
+  LLVM_ABI ConstantInt *getKey() const;
 
   /// The integer discriminator, an i64 constant, or 0.
-  ConstantInt *getDiscriminator() const;
+  LLVM_ABI ConstantInt *getDiscriminator() const;
 
   /// The address discriminator if any, or the null constant.
   /// If present, this must be a value equivalent to the storage location of
   /// the only global-initializer user of the ptrauth signed pointer.
-  Constant *getAddrDiscriminator() const;
+  LLVM_ABI Constant *getAddrDiscriminator() const;
 
   /// Whether there is any non-null address discriminator.
   bool hasAddressDiscriminator() const {
@@ -1410,7 +1403,7 @@ class ConstantPtrAuth final : public Constant {
 
   /// Produce a new ptrauth expression signing the given value using
   /// the same schema as is stored in one.
-  ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const;
+  LLVM_ABI ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -1438,19 +1431,19 @@ class BlockAddress final : public Constant {
 
 public:
   /// Return a BlockAddress for the specified function and basic block.
-  static BlockAddress *get(Function *F, BasicBlock *BB);
+  LLVM_ABI static BlockAddress *get(Function *F, BasicBlock *BB);
 
   /// Return a BlockAddress for the specified basic block.  The basic
   /// block must be embedded into a function.
-  static BlockAddress *get(BasicBlock *BB);
+  LLVM_ABI static BlockAddress *get(BasicBlock *BB);
 
   /// Lookup an existing \c BlockAddress constant for the given BasicBlock.
   ///
   /// \returns 0 if \c !BB->hasAddressTaken(), otherwise the \c BlockAddress.
-  static BlockAddress *lookup(const BasicBlock *BB);
+  LLVM_ABI static BlockAddress *lookup(const BasicBlock *BB);
 
-  Function *getFunction() const;
-  BasicBlock *getBasicBlock() const;
+  LLVM_ABI Function *getFunction() const;
+  LLVM_ABI BasicBlock *getBasicBlock() const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -1465,9 +1458,9 @@ class DSOLocalEquivalent final : public Constant {
 
 public:
   /// Return a DSOLocalEquivalent for the specified global value.
-  static DSOLocalEquivalent *get(GlobalValue *GV);
+  LLVM_ABI static DSOLocalEquivalent *get(GlobalValue *GV);
 
-  GlobalValue *getGlobalValue() const;
+  LLVM_ABI GlobalValue *getGlobalValue() const;
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
@@ -1498,7 +1491,7 @@ class ConstantTokenNone final : public Constant {
 
 public:
   /// Return the ConstantTokenNone.
-  static ConstantTokenNone *get(Context &Ctx);
+  LLVM_ABI static ConstantTokenNone *get(Context &Ctx);
 
   /// For isa/dyn_cast.
   static bool classof(const sandboxir::Value *From) {
diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h
index a8a21b0db855e..7d8b2c86e94a7 100644
--- a/llvm/include/llvm/SandboxIR/Context.h
+++ b/llvm/include/llvm/SandboxIR/Context.h
@@ -15,6 +15,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/SandboxIR/Tracker.h"
 #include "llvm/SandboxIR/Type.h"
+#include "llvm/Support/Compiler.h"
 
 #include <cstdint>
 
@@ -112,32 +113,33 @@ class Context {
   CallbackID::ValTy NextCallbackID = 1;
 
   /// Remove \p V from the maps and returns the unique_ptr.
-  std::unique_ptr<Value> detachLLVMValue(llvm::Value *V);
+  LLVM_ABI std::unique_ptr<Value> detachLLVMValue(llvm::Value *V);
   /// Remove \p SBV from all SandboxIR maps and stop owning it. This effectively
   /// detaches \p V from the underlying IR.
-  std::unique_ptr<Value> detach(Value *V);
+  LLVM_ABI std::unique_ptr<Value> detach(Value *V);
   friend class Instruction; // For detach().
   /// Take ownership of VPtr and store it in `LLVMValueToValueMap`.
-  Value *registerValue(std::unique_ptr<Value> &&VPtr);
+  LLVM_ABI Value *registerValue(std::unique_ptr<Value> &&VPtr);
   friend class EraseFromParent; // For registerValue().
   /// This is the actual function that creates sandboxir values for \p V,
   /// and among others handles all instruction types.
-  Value *getOrCreateValueInternal(llvm::Value *V, llvm::User *U = nullptr);
+  LLVM_ABI Value *getOrCreateValueInternal(llvm::Value *V,
+                                           llvm::User *U = nullptr);
   /// Get or create a sandboxir::Argument for an existing LLVM IR \p LLVMArg.
-  Argument *getOrCreateArgument(llvm::Argument *LLVMArg);
+  LLVM_ABI Argument *getOrCreateArgument(llvm::Argument *LLVMArg);
   /// Get or create a sandboxir::Value for an existing LLVM IR \p LLVMV.
   Value *getOrCreateValue(llvm::Value *LLVMV) {
     return getOrCreateValueInternal(LLVMV, 0);
   }
   /// Get or create a sandboxir::Constant from an existing LLVM IR \p LLVMC.
-  Constant *getOrCreateConstant(llvm::Constant *LLVMC);
+  LLVM_ABI Constant *getOrCreateConstant(llvm::Constant *LLVMC);
   friend class ConstantDataSequential; // For getOrCreateConstant().
   friend class Utils; // For getMemoryBase
 
-  void runEraseInstrCallbacks(Instruction *I);
-  void runCreateInstrCallbacks(Instruction *I);
-  void runMoveInstrCallbacks(Instruction *I, const BBIterator &Where);
-  void runSetUseCallbacks(const Use &U, Value *NewSrc);
+  LLVM_ABI void runEraseInstrCallbacks(Instruction *I);
+  LLVM_ABI void runCreateInstrCallbacks(Instruction *I);
+  LLVM_ABI void runMoveInstrCallbacks(Instruction *I, const BBIterator &Where);
+  LLVM_ABI void runSetUseCallbacks(const Use &U, Value *NewSrc);
 
   friend class User;  // For runSetUseCallbacks().
   friend class Value; // For runSetUseCallbacks().
@@ -148,90 +150,97 @@ class Context {
 
   /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. This will
   /// also create all contents of the block.
-  BasicBlock *createBasicBlock(llvm::BasicBlock *BB);
+  LLVM_ABI BasicBlock *createBasicBlock(llvm::BasicBlock *BB);
   friend class BasicBlock; // For getOrCreateValue().
 
   IRBuilder<ConstantFolder> LLVMIRBuilder;
   auto &getLLVMIRBuilder() { return LLVMIRBuilder; }
 
-  VAArgInst *createVAArgInst(llvm::VAArgInst *SI);
+  LLVM_ABI VAArgInst *createVAArgInst(llvm::VAArgInst *SI);
   friend VAArgInst; // For createVAArgInst()
-  FreezeInst *createFreezeInst(llvm::FreezeInst *SI);
+  LLVM_ABI FreezeInst *createFreezeInst(llvm::FreezeInst *SI);
   friend FreezeInst; // For createFreezeInst()
-  FenceInst *createFenceInst(llvm::FenceInst *SI);
+  LLVM_ABI FenceInst *createFenceInst(llvm::FenceInst *SI);
   friend FenceInst; // For createFenceInst()
-  SelectInst *createSelectInst(llvm::SelectInst *SI);
+  LLVM_ABI SelectInst *createSelectInst(llvm::SelectInst *SI);
   friend SelectInst; // For createSelectInst()
-  InsertElementInst *createInsertElementInst(llvm::InsertElementInst *IEI);
+  LLVM_ABI InsertElementInst *
+  createInsertElementInst(llvm::InsertElementInst *IEI);
   friend InsertElementInst; // For createInsertElementInst()
-  ExtractElementInst *createExtractElementInst(llvm::ExtractElementInst *EEI);
+  LLVM_ABI ExtractElementInst *
+  createExtractElementInst(llvm::ExtractElementInst *EEI);
   friend ExtractElementInst; // For createExtractElementInst()
-  ShuffleVectorInst *createShuffleVectorInst(llvm::ShuffleVectorInst *SVI);
+  LLVM_ABI ShuffleVectorInst *
+  createShuffleVectorInst(llvm::ShuffleVectorInst *SVI);
   friend ShuffleVectorInst; // For createShuffleVectorInst()
-  ExtractValueInst *createExtractValueInst(llvm::ExtractValueInst *IVI);
+  LLVM_ABI ExtractValueInst *
+  createExtractValueInst(llvm::ExtractValueInst *IVI);
   friend ExtractValueInst; // For createExtractValueInst()
-  InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI);
+  LLVM_ABI InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI);
   friend InsertValueInst; // For createInsertValueInst()
-  BranchInst *createBranchInst(llvm::BranchInst *I);
+  LLVM_ABI BranchInst *createBranchInst(llvm::BranchInst *I);
   friend BranchInst; // For createBranchInst()
-  LoadInst *createLoadInst(llvm::LoadInst *LI);
+  LLVM_ABI LoadInst *createLoadInst(llvm::LoadInst *LI);
   friend LoadInst; // For createLoadInst()
-  StoreInst *createStoreInst(llvm::StoreInst *SI);
+  LLVM_ABI StoreInst *createStoreInst(llvm::StoreInst *SI);
   friend StoreInst; // For createStoreInst()
-  ReturnInst *createReturnInst(llvm::ReturnInst *I);
+  LLVM_ABI ReturnInst *createReturnInst(llvm::ReturnInst *I);
   friend ReturnInst; // For createReturnInst()
-  CallInst *createCallInst(llvm::CallInst *I);
+  LLVM_ABI CallInst *createCallInst(llvm::CallInst *I);
   friend CallInst; // For createCallInst()
-  InvokeInst *createInvokeInst(llvm::InvokeInst *I);
+  LLVM_ABI InvokeInst *createInvokeInst(llvm::InvokeInst *I);
   friend InvokeInst; // For createInvokeInst()
-  CallBrInst *createCallBrInst(llvm::CallBrInst *I);
+  LLVM_ABI CallBrInst *createCallBrInst(llvm::CallBrInst *I);
   friend CallBrInst; // For createCallBrInst()
-  LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I);
+  LLVM_ABI LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I);
   friend LandingPadInst; // For createLandingPadInst()
-  CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I);
+  LLVM_ABI CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I);
   friend CatchPadInst; // For createCatchPadInst()
-  CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I);
+  LLVM_ABI CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I);
   friend CleanupPadInst; // For createCleanupPadInst()
-  CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I);
+  LLVM_ABI CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I);
   friend CatchReturnInst; // For createCatchReturnInst()
-  CleanupReturnInst *createCleanupReturnInst(llvm::CleanupReturnInst *I);
+  LLVM_ABI CleanupReturnInst *
+  createCleanupReturnInst(llvm::CleanupReturnInst *I);
   friend CleanupReturnInst; // For createCleanupReturnInst()
-  GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I);
+  LLVM_ABI GetElementPtrInst *
+  createGetElementPtrInst(llvm::GetElementPtrInst *I);
   friend GetElementPtrInst; // For createGetElementPtrInst()
-  CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I);
+  LLVM_ABI CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I);
   friend CatchSwitchInst; // For createCatchSwitchInst()
-  ResumeInst *createResumeInst(llvm::ResumeInst *I);
+  LLVM_ABI ResumeInst *createResumeInst(llvm::ResumeInst *I);
   friend ResumeInst; // For createResumeInst()
-  SwitchInst *createSwitchInst(llvm::SwitchInst *I);
+  LLVM_ABI SwitchInst *createSwitchInst(llvm::SwitchInst *I);
   friend SwitchInst; // For createSwitchInst()
-  UnaryOperator *createUnaryOperator(llvm::UnaryOperator *I);
+  LLVM_ABI UnaryOperator *createUnaryOperator(llvm::UnaryOperator *I);
   friend UnaryOperator; // For createUnaryOperator()
-  BinaryOperator *createBinaryOperator(llvm::BinaryOperator *I);
+  LLVM_ABI BinaryOperator *createBinaryOperator(llvm::BinaryOperator *I);
   friend BinaryOperator; // For createBinaryOperator()
-  AtomicRMWInst *createAtomicRMWInst(llvm::AtomicRMWInst *I);
+  LLVM_ABI AtomicRMWInst *createAtomicRMWInst(llvm::AtomicRMWInst *I);
   friend AtomicRMWInst; // For createAtomicRMWInst()
-  AtomicCmpXchgInst *createAtomicCmpXchgInst(llvm::AtomicCmpXchgInst *I);
+  LLVM_ABI AtomicCmpXchgInst *
+  createAtomicCmpXchgInst(llvm::AtomicCmpXchgInst *I);
   friend AtomicCmpXchgInst; // For createAtomicCmpXchgInst()
-  AllocaInst *createAllocaInst(llvm::AllocaInst *I);
+  LLVM_ABI AllocaInst *createAllocaInst(llvm::AllocaInst *I);
   friend AllocaInst; // For createAllocaInst()
-  CastInst *createCastInst(llvm::CastInst *I);
+  LLVM_ABI CastInst *createCastInst(llvm::CastInst *I);
   friend CastInst; // For createCastInst()
-  PHINode *createPHINode(llvm::PHINode *I);
+  LLVM_ABI PHINode *createPHINode(llvm::PHINode *I);
   friend PHINode; // For createPHINode()
-  UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI);
+  LLVM_ABI UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI);
   friend UnreachableInst; // For createUnreachableInst()
-  CmpInst *createCmpInst(llvm::CmpInst *I);
+  LLVM_ABI CmpInst *createCmpInst(llvm::CmpInst *I);
   friend CmpInst; // For createCmpInst()
-  ICmpInst *createICmpInst(llvm::ICmpInst *I);
+  LLVM_ABI ICmpInst *createICmpInst(llvm::ICmpInst *I);
   friend ICmpInst; // For createICmpInst()
-  FCmpInst *createFCmpInst(llvm::FCmpInst *I);
+  LLVM_ABI FCmpInst *createFCmpInst(llvm::FCmpInst *I);
   friend FCmpInst; // For createFCmpInst()
 
 public:
-  Context(LLVMContext &LLVMCtx);
-  ~Context();
+  LLVM_ABI Context(LLVMContext &LLVMCtx);
+  LLVM_ABI ~Context();
   /// Clears function-level state.
-  void clear();
+  LLVM_ABI void clear();
 
   Tracker &getTracker() { return IRTracker; }
   /// Convenience function for `getTracker().save()`
@@ -241,14 +250,14 @@ class Context {
   /// Convenience function for `getTracker().accept()`
   void accept() { IRTracker.accept(); }
 
-  sandboxir::Value *getValue(llvm::Value *V) const;
+  LLVM_ABI sandboxir::Value *getValue(llvm::Value *V) const;
   const sandboxir::Value *getValue(const llvm::Value *V) const {
     return getValue(const_cast<llvm::Value *>(V));
   }
 
-  Module *getModule(llvm::Module *LLVMM) const;
+  LLVM_ABI Module *getModule(llvm::Module *LLVMM) const;
 
-  Module *getOrCreateModule(llvm::Module *LLVMM);
+  LLVM_ABI Module *getOrCreateModule(llvm::Module *LLVMM);
 
   Type *getType(llvm::Type *LLVMTy) {
     if (LLVMTy == nullptr)
@@ -265,10 +274,10 @@ class Context {
   /// This is the main API function for creating Sandbox IR.
   /// Note: this will not fully populate its parent module. The only globals
   /// that will be available are those used within the function.
-  Function *createFunction(llvm::Function *F);
+  LLVM_ABI Function *createFunction(llvm::Function *F);
 
   /// Create a sandboxir::Module corresponding to \p LLVMM.
-  Module *createModule(llvm::Module *LLVMM);
+  LLVM_ABI Module *createModule(llvm::Module *LLVMM);
 
   /// \Returns the number of values registered with Context.
   size_t getNumValues() const { return LLVMValueToValueMap.size(); }
@@ -277,26 +286,26 @@ class Context {
   /// to be removed from its parent. Note that this will also be called when
   /// reverting the creation of an instruction.
   /// \Returns a callback ID for later deregistration.
-  CallbackID registerEraseInstrCallback(EraseInstrCallback CB);
-  void unregisterEraseInstrCallback(CallbackID ID);
+  LLVM_ABI CallbackID registerEraseInstrCallback(EraseInstrCallback CB);
+  LLVM_ABI void unregisterEraseInstrCallback(CallbackID ID);
 
   /// Register a callback that gets called right after a SandboxIR instruction
   /// is created. Note that this will also be called when reverting the removal
   /// of an instruction.
   /// \Returns a callback ID for later deregistration.
-  CallbackID registerCreateInstrCallback(CreateInstrCallback CB);
-  void unregisterCreateInstrCallback(CallbackID ID);
+  LLVM_ABI CallbackID registerCreateInstrCallback(CreateInstrCallback CB);
+  LLVM_ABI void unregisterCreateInstrCallback(CallbackID ID);
 
   /// Register a callback that gets called when a SandboxIR instruction is about
   /// to be moved. Note that this will also be called when reverting a move.
   /// \Returns a callback ID for later deregistration.
-  CallbackID registerMoveInstrCallback(MoveInstrCallback CB);
-  void unregisterMoveInstrCallback(CallbackID ID);
+  LLVM_ABI CallbackID registerMoveInstrCallback(MoveInstrCallback CB);
+  LLVM_ABI void unregisterMoveInstrCallback(CallbackID ID);
 
   /// Register a callback that gets called when a Use gets set.
   /// \Returns a callback ID for later deregistration.
-  CallbackID registerSetUseCallback(SetUseCallback CB);
-  void unregisterSetUseCallback(CallbackID ID);
+  LLVM_ABI CallbackID registerSetUseCallback(SetUseCallback CB);
+  LLVM_ABI void unregisterSetUseCallback(CallbackID ID);
 };
 
 } // namespace sandboxir
diff --git a/llvm/include/llvm/SandboxIR/Function.h b/llvm/include/llvm/SandboxIR/Function.h
index 2c4b53ef6c1e6..28c69112b2b7e 100644
--- a/llvm/include/llvm/SandboxIR/Function.h
+++ b/llvm/include/llvm/SandboxIR/Function.h
@@ -11,6 +11,7 @@
 
 #include "llvm/IR/Function.h"
 #include "llvm/SandboxIR/Constant.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -56,7 +57,7 @@ class Function : public GlobalWithNodeAPI<Function, llvm::Function,
     LLVMBBToBB BBGetter(Ctx);
     return iterator(cast<llvm::Function>(Val)->end(), BBGetter);
   }
-  FunctionType *getFunctionType() const;
+  LLVM_ABI FunctionType *getFunctionType() const;
 
   /// Returns the alignment of the given function.
   MaybeAlign getAlign() const { return cast<llvm::Function>(Val)->getAlign(); }
@@ -66,7 +67,7 @@ class Function : public GlobalWithNodeAPI<Function, llvm::Function,
   /// Sets the alignment attribute of the Function.
   /// This method will be deprecated as the alignment property should always be
   /// defined.
-  void setAlignment(MaybeAlign Align);
+  LLVM_ABI void setAlignment(MaybeAlign Align);
 
 #ifndef NDEBUG
   void verify() const final {
diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h
index ce5a2cbec85bd..4e3ff19d47787 100644
--- a/llvm/include/llvm/SandboxIR/Instruction.h
+++ b/llvm/include/llvm/SandboxIR/Instruction.h
@@ -16,6 +16,7 @@
 #include "llvm/SandboxIR/BasicBlock.h"
 #include "llvm/SandboxIR/Constant.h"
 #include "llvm/SandboxIR/User.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -57,7 +58,7 @@ class Instruction : public User {
 
   /// A SandboxIR Instruction may map to multiple LLVM IR Instruction. This
   /// returns its topmost LLVM IR instruction.
-  llvm::Instruction *getTopmostLLVMInstruction() const;
+  LLVM_ABI llvm::Instruction *getTopmostLLVMInstruction() const;
   friend class VAArgInst;          // For getTopmostLLVMInstruction().
   friend class FreezeInst;         // For getTopmostLLVMInstruction().
   friend class FenceInst;          // For getTopmostLLVMInstruction().
@@ -113,17 +114,17 @@ class Instruction : public User {
   }
 
 public:
-  static const char *getOpcodeName(Opcode Opc);
+  LLVM_ABI static const char *getOpcodeName(Opcode Opc);
   /// This is used by BasicBlock::iterator.
   virtual unsigned getNumOfIRInstrs() const = 0;
   /// \Returns a BasicBlock::iterator for this Instruction.
-  BBIterator getIterator() const;
+  LLVM_ABI BBIterator getIterator() const;
   /// \Returns the next sandboxir::Instruction in the block, or nullptr if at
   /// the end of the block.
-  Instruction *getNextNode() const;
+  LLVM_ABI Instruction *getNextNode() const;
   /// \Returns the previous sandboxir::Instruction in the block, or nullptr if
   /// at the beginning of the block.
-  Instruction *getPrevNode() const;
+  LLVM_ABI Instruction *getPrevNode() const;
   /// \Returns this Instruction's opcode. Note that SandboxIR has its own opcode
   /// state to allow for new SandboxIR-specific instructions.
   Opcode getOpcode() const { return Opc; }
@@ -188,17 +189,17 @@ class Instruction : public User {
   // TODO: More missing functions
 
   /// Detach this from its parent BasicBlock without deleting it.
-  void removeFromParent();
+  LLVM_ABI void removeFromParent();
   /// Detach this Value from its parent and delete it.
-  void eraseFromParent();
+  LLVM_ABI void eraseFromParent();
   /// Insert this detached instruction before \p BeforeI.
-  void insertBefore(Instruction *BeforeI);
+  LLVM_ABI void insertBefore(Instruction *BeforeI);
   /// Insert this detached instruction after \p AfterI.
-  void insertAfter(Instruction *AfterI);
+  LLVM_ABI void insertAfter(Instruction *AfterI);
   /// Insert this detached instruction into \p BB at \p WhereIt.
-  void insertInto(BasicBlock *BB, const BBIterator &WhereIt);
+  LLVM_ABI void insertInto(BasicBlock *BB, const BBIterator &WhereIt);
   /// Move this instruction to \p WhereIt.
-  void moveBefore(BasicBlock &BB, const BBIterator &WhereIt);
+  LLVM_ABI void moveBefore(BasicBlock &BB, const BBIterator &WhereIt);
   /// Move this instruction before \p Before.
   void moveBefore(Instruction *Before) {
     moveBefore(*Before->getParent(), Before->getIterator());
@@ -217,9 +218,9 @@ class Instruction : public User {
   }
   /// \Returns the BasicBlock containing this Instruction, or null if it is
   /// detached.
-  BasicBlock *getParent() const;
+  LLVM_ABI BasicBlock *getParent() const;
   /// For isa/dyn_cast.
-  static bool classof(const sandboxir::Value *From);
+  LLVM_ABI static bool classof(const sandboxir::Value *From);
 
   /// Determine whether the no signed wrap flag is set.
   bool hasNoUnsignedWrap() const {
@@ -227,20 +228,20 @@ class Instruction : public User {
   }
   /// Set or clear the nuw flag on this instruction, which must be an operator
   /// which supports this flag. See LangRef.html for the meaning of this flag.
-  void setHasNoUnsignedWrap(bool B = true);
+  LLVM_ABI void setHasNoUnsignedWrap(bool B = true);
   /// Determine whether the no signed wrap flag is set.
   bool hasNoSignedWrap() const {
     return cast<llvm::Instruction>(Val)->hasNoSignedWrap();
   }
   /// Set or clear the nsw flag on this instruction, which must be an operator
   /// which supports this flag. See LangRef.html for the meaning of this flag.
-  void setHasNoSignedWrap(bool B = true);
+  LLVM_ABI void setHasNoSignedWrap(bool B = true);
   /// Determine whether all fast-math-flags are set.
   bool isFast() const { return cast<llvm::Instruction>(Val)->isFast(); }
   /// Set or clear all fast-math-flags on this instruction, which must be an
   /// operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setFast(bool B);
+  LLVM_ABI void setFast(bool B);
   /// Determine whether the allow-reassociation flag is set.
   bool hasAllowReassoc() const {
     return cast<llvm::Instruction>(Val)->hasAllowReassoc();
@@ -248,24 +249,24 @@ class Instruction : public User {
   /// Set or clear the reassociation flag on this instruction, which must be
   /// an operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasAllowReassoc(bool B);
+  LLVM_ABI void setHasAllowReassoc(bool B);
   /// Determine whether the exact flag is set.
   bool isExact() const { return cast<llvm::Instruction>(Val)->isExact(); }
   /// Set or clear the exact flag on this instruction, which must be an operator
   /// which supports this flag. See LangRef.html for the meaning of this flag.
-  void setIsExact(bool B = true);
+  LLVM_ABI void setIsExact(bool B = true);
   /// Determine whether the no-NaNs flag is set.
   bool hasNoNaNs() const { return cast<llvm::Instruction>(Val)->hasNoNaNs(); }
   /// Set or clear the no-nans flag on this instruction, which must be an
   /// operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasNoNaNs(bool B);
+  LLVM_ABI void setHasNoNaNs(bool B);
   /// Determine whether the no-infs flag is set.
   bool hasNoInfs() const { return cast<llvm::Instruction>(Val)->hasNoInfs(); }
   /// Set or clear the no-infs flag on this instruction, which must be an
   /// operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasNoInfs(bool B);
+  LLVM_ABI void setHasNoInfs(bool B);
   /// Determine whether the no-signed-zeros flag is set.
   bool hasNoSignedZeros() const {
     return cast<llvm::Instruction>(Val)->hasNoSignedZeros();
@@ -273,7 +274,7 @@ class Instruction : public User {
   /// Set or clear the no-signed-zeros flag on this instruction, which must be
   /// an operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasNoSignedZeros(bool B);
+  LLVM_ABI void setHasNoSignedZeros(bool B);
   /// Determine whether the allow-reciprocal flag is set.
   bool hasAllowReciprocal() const {
     return cast<llvm::Instruction>(Val)->hasAllowReciprocal();
@@ -281,7 +282,7 @@ class Instruction : public User {
   /// Set or clear the allow-reciprocal flag on this instruction, which must be
   /// an operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasAllowReciprocal(bool B);
+  LLVM_ABI void setHasAllowReciprocal(bool B);
   /// Determine whether the allow-contract flag is set.
   bool hasAllowContract() const {
     return cast<llvm::Instruction>(Val)->hasAllowContract();
@@ -289,7 +290,7 @@ class Instruction : public User {
   /// Set or clear the allow-contract flag on this instruction, which must be
   /// an operator which supports this flag. See LangRef.html for the meaning of
   /// this flag.
-  void setHasAllowContract(bool B);
+  LLVM_ABI void setHasAllowContract(bool B);
   /// Determine whether the approximate-math-functions flag is set.
   bool hasApproxFunc() const {
     return cast<llvm::Instruction>(Val)->hasApproxFunc();
@@ -297,7 +298,7 @@ class Instruction : public User {
   /// Set or clear the approximate-math-functions flag on this instruction,
   /// which must be an operator which supports this flag. See LangRef.html for
   /// the meaning of this flag.
-  void setHasApproxFunc(bool B);
+  LLVM_ABI void setHasApproxFunc(bool B);
   /// Convenience function for getting all the fast-math flags, which must be an
   /// operator which supports these flags. See LangRef.html for the meaning of
   /// these flags.
@@ -307,11 +308,11 @@ class Instruction : public User {
   /// Convenience function for setting multiple fast-math flags on this
   /// instruction, which must be an operator which supports these flags. See
   /// LangRef.html for the meaning of these flags.
-  void setFastMathFlags(FastMathFlags FMF);
+  LLVM_ABI void setFastMathFlags(FastMathFlags FMF);
   /// Convenience function for transferring all fast-math flag values to this
   /// instruction, which must be an operator which supports these flags. See
   /// LangRef.html for the meaning of these flags.
-  void copyFastMathFlags(FastMathFlags FMF);
+  LLVM_ABI void copyFastMathFlags(FastMathFlags FMF);
 
   bool isAssociative() const {
     return cast<llvm::Instruction>(Val)->isAssociative();
@@ -352,7 +353,7 @@ class Instruction : public User {
 
   bool isVolatile() const { return cast<llvm::Instruction>(Val)->isVolatile(); }
 
-  Type *getAccessType() const;
+  LLVM_ABI Type *getAccessType() const;
 
   bool mayThrow(bool IncludePhaseOneUnwind = false) const {
     return cast<llvm::Instruction>(Val)->mayThrow(IncludePhaseOneUnwind);
@@ -414,22 +415,22 @@ class FenceInst : public SingleLLVMInstructionImpl<llvm::FenceInst> {
   friend Context; // For constructor;
 
 public:
-  static FenceInst *create(AtomicOrdering Ordering, InsertPosition Pos,
-                           Context &Ctx,
-                           SyncScope::ID SSID = SyncScope::System);
+  LLVM_ABI static FenceInst *create(AtomicOrdering Ordering, InsertPosition Pos,
+                                    Context &Ctx,
+                                    SyncScope::ID SSID = SyncScope::System);
   /// Returns the ordering constraint of this fence instruction.
   AtomicOrdering getOrdering() const {
     return cast<llvm::FenceInst>(Val)->getOrdering();
   }
   /// Sets the ordering constraint of this fence instruction.  May only be
   /// Acquire, Release, AcquireRelease, or SequentiallyConsistent.
-  void setOrdering(AtomicOrdering Ordering);
+  LLVM_ABI void setOrdering(AtomicOrdering Ordering);
   /// Returns the synchronization scope ID of this fence instruction.
   SyncScope::ID getSyncScopeID() const {
     return cast<llvm::FenceInst>(Val)->getSyncScopeID();
   }
   /// Sets the synchronization scope ID of this fence instruction.
-  void setSyncScopeID(SyncScope::ID SSID);
+  LLVM_ABI void setSyncScopeID(SyncScope::ID SSID);
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Fence;
   }
@@ -443,9 +444,9 @@ class SelectInst : public SingleLLVMInstructionImpl<llvm::SelectInst> {
   friend Context; // for SelectInst()
 
 public:
-  static Value *create(Value *Cond, Value *True, Value *False,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Cond, Value *True, Value *False,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
 
   const Value *getCondition() const { return getOperand(0); }
   const Value *getTrueValue() const { return getOperand(1); }
@@ -457,7 +458,7 @@ class SelectInst : public SingleLLVMInstructionImpl<llvm::SelectInst> {
   void setCondition(Value *New) { setOperand(0, New); }
   void setTrueValue(Value *New) { setOperand(1, New); }
   void setFalseValue(Value *New) { setOperand(2, New); }
-  void swapValues();
+  LLVM_ABI void swapValues();
 
   /// Return a string if the specified operands are invalid for a select
   /// operation, otherwise return null.
@@ -468,7 +469,7 @@ class SelectInst : public SingleLLVMInstructionImpl<llvm::SelectInst> {
   }
 
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
+  LLVM_ABI static bool classof(const Value *From);
 };
 
 class InsertElementInst final
@@ -480,9 +481,9 @@ class InsertElementInst final
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static Value *create(Value *Vec, Value *NewElt, Value *Idx,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Vec, Value *NewElt, Value *Idx,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::InsertElement;
   }
@@ -503,8 +504,8 @@ class ExtractElementInst final
                         // create*()
 
 public:
-  static Value *create(Value *Vec, Value *Idx, InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Vec, Value *Idx, InsertPosition Pos,
+                                Context &Ctx, const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::ExtractElement;
   }
@@ -516,7 +517,7 @@ class ExtractElementInst final
   Value *getIndexOperand() { return getOperand(1); }
   const Value *getVectorOperand() const { return getOperand(0); }
   const Value *getIndexOperand() const { return getOperand(1); }
-  VectorType *getVectorOperandType() const;
+  LLVM_ABI VectorType *getVectorOperandType() const;
 };
 
 class ShuffleVectorInst final
@@ -528,18 +529,19 @@ class ShuffleVectorInst final
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static Value *create(Value *V1, Value *V2, Value *Mask, InsertPosition Pos,
-                       Context &Ctx, const Twine &Name = "");
-  static Value *create(Value *V1, Value *V2, ArrayRef<int> Mask,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *V1, Value *V2, Value *Mask,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *V1, Value *V2, ArrayRef<int> Mask,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::ShuffleVector;
   }
 
   /// Swap the operands and adjust the mask to preserve the semantics of the
   /// instruction.
-  void commute();
+  LLVM_ABI void commute();
 
   /// Return true if a shufflevector instruction can be formed with the
   /// specified operands.
@@ -554,7 +556,7 @@ class ShuffleVectorInst final
   }
 
   /// Overload to return most specific vector type.
-  VectorType *getType() const;
+  LLVM_ABI VectorType *getType() const;
 
   /// Return the shuffle mask value of this instruction for the given element
   /// index. Return PoisonMaskElem if the element is undef.
@@ -577,12 +579,12 @@ class ShuffleVectorInst final
   }
 
   /// Return the mask for this instruction, for use in bitcode.
-  Constant *getShuffleMaskForBitcode() const;
+  LLVM_ABI Constant *getShuffleMaskForBitcode() const;
 
-  static Constant *convertShuffleMaskForBitcode(ArrayRef<int> Mask,
-                                                Type *ResultTy);
+  LLVM_ABI static Constant *convertShuffleMaskForBitcode(ArrayRef<int> Mask,
+                                                         Type *ResultTy);
 
-  void setShuffleMask(ArrayRef<int> Mask);
+  LLVM_ABI void setShuffleMask(ArrayRef<int> Mask);
 
   ArrayRef<int> getShuffleMask() const {
     return cast<llvm::ShuffleVectorInst>(Val)->getShuffleMask();
@@ -965,9 +967,9 @@ class InsertValueInst
   friend Context; // for InsertValueInst()
 
 public:
-  static Value *create(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::InsertValue;
@@ -1024,36 +1026,37 @@ class BranchInst : public SingleLLVMInstructionImpl<llvm::BranchInst> {
   friend Context; // for BranchInst()
 
 public:
-  static BranchInst *create(BasicBlock *IfTrue, InsertPosition Pos,
-                            Context &Ctx);
-  static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse,
-                            Value *Cond, InsertPosition Pos, Context &Ctx);
+  LLVM_ABI static BranchInst *create(BasicBlock *IfTrue, InsertPosition Pos,
+                                     Context &Ctx);
+  LLVM_ABI static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse,
+                                     Value *Cond, InsertPosition Pos,
+                                     Context &Ctx);
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
+  LLVM_ABI static bool classof(const Value *From);
   bool isUnconditional() const {
     return cast<llvm::BranchInst>(Val)->isUnconditional();
   }
   bool isConditional() const {
     return cast<llvm::BranchInst>(Val)->isConditional();
   }
-  Value *getCondition() const;
+  LLVM_ABI Value *getCondition() const;
   void setCondition(Value *V) { setOperand(0, V); }
   unsigned getNumSuccessors() const { return 1 + isConditional(); }
-  BasicBlock *getSuccessor(unsigned SuccIdx) const;
-  void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
+  LLVM_ABI BasicBlock *getSuccessor(unsigned SuccIdx) const;
+  LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
   void swapSuccessors() { swapOperandsInternal(1, 2); }
 
 private:
   struct LLVMBBToSBBB {
     Context &Ctx;
     LLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {}
-    BasicBlock *operator()(llvm::BasicBlock *BB) const;
+    LLVM_ABI BasicBlock *operator()(llvm::BasicBlock *BB) const;
   };
 
   struct ConstLLVMBBToSBBB {
     Context &Ctx;
     ConstLLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {}
-    const BasicBlock *operator()(const llvm::BasicBlock *BB) const;
+    LLVM_ABI const BasicBlock *operator()(const llvm::BasicBlock *BB) const;
   };
 
 public:
@@ -1109,8 +1112,9 @@ class ExtractValueInst : public UnaryInstruction {
   friend Context; // for ExtractValueInst()
 
 public:
-  static Value *create(Value *Agg, ArrayRef<unsigned> Idxs, InsertPosition Pos,
-                       Context &Ctx, const Twine &Name = "");
+  LLVM_ABI static Value *create(Value *Agg, ArrayRef<unsigned> Idxs,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::ExtractValue;
@@ -1120,7 +1124,7 @@ class ExtractValueInst : public UnaryInstruction {
   /// with an extractvalue instruction with the specified parameters.
   ///
   /// Null is returned if the indices are invalid for the specified type.
-  static Type *getIndexedType(Type *Agg, ArrayRef<unsigned> Idxs);
+  LLVM_ABI static Type *getIndexedType(Type *Agg, ArrayRef<unsigned> Idxs);
 
   using idx_iterator = llvm::ExtractValueInst::idx_iterator;
 
@@ -1163,9 +1167,9 @@ class VAArgInst : public UnaryInstruction {
   friend Context; // For constructor;
 
 public:
-  static VAArgInst *create(Value *List, Type *Ty, InsertPosition Pos,
-                           Context &Ctx, const Twine &Name = "");
-  Value *getPointerOperand();
+  LLVM_ABI static VAArgInst *create(Value *List, Type *Ty, InsertPosition Pos,
+                                    Context &Ctx, const Twine &Name = "");
+  LLVM_ABI Value *getPointerOperand();
   const Value *getPointerOperand() const {
     return const_cast<VAArgInst *>(this)->getPointerOperand();
   }
@@ -1183,8 +1187,8 @@ class FreezeInst : public UnaryInstruction {
   friend Context; // For constructor;
 
 public:
-  static FreezeInst *create(Value *V, InsertPosition Pos, Context &Ctx,
-                            const Twine &Name = "");
+  LLVM_ABI static FreezeInst *create(Value *V, InsertPosition Pos, Context &Ctx,
+                                     const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Freeze;
   }
@@ -1200,11 +1204,11 @@ class LoadInst final : public UnaryInstruction {
   /// Return true if this is a load from a volatile memory location.
   bool isVolatile() const { return cast<llvm::LoadInst>(Val)->isVolatile(); }
   /// Specify whether this is a volatile load or not.
-  void setVolatile(bool V);
+  LLVM_ABI void setVolatile(bool V);
 
-  static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
-                          InsertPosition Pos, bool IsVolatile, Context &Ctx,
-                          const Twine &Name = "");
+  LLVM_ABI static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
+                                   InsertPosition Pos, bool IsVolatile,
+                                   Context &Ctx, const Twine &Name = "");
   static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align,
                           InsertPosition Pos, Context &Ctx,
                           const Twine &Name = "") {
@@ -1212,8 +1216,8 @@ class LoadInst final : public UnaryInstruction {
   }
 
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
-  Value *getPointerOperand() const;
+  LLVM_ABI static bool classof(const Value *From);
+  LLVM_ABI Value *getPointerOperand() const;
   Align getAlign() const { return cast<llvm::LoadInst>(Val)->getAlign(); }
   bool isUnordered() const { return cast<llvm::LoadInst>(Val)->isUnordered(); }
   bool isSimple() const { return cast<llvm::LoadInst>(Val)->isSimple(); }
@@ -1229,19 +1233,20 @@ class StoreInst final : public SingleLLVMInstructionImpl<llvm::StoreInst> {
   /// Return true if this is a store from a volatile memory location.
   bool isVolatile() const { return cast<llvm::StoreInst>(Val)->isVolatile(); }
   /// Specify whether this is a volatile store or not.
-  void setVolatile(bool V);
+  LLVM_ABI void setVolatile(bool V);
 
-  static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
-                           InsertPosition Pos, bool IsVolatile, Context &Ctx);
+  LLVM_ABI static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
+                                    InsertPosition Pos, bool IsVolatile,
+                                    Context &Ctx);
   static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align,
                            InsertPosition Pos, Context &Ctx) {
     return create(V, Ptr, Align, Pos, /*IsVolatile=*/false, Ctx);
   }
 
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
-  Value *getValueOperand() const;
-  Value *getPointerOperand() const;
+  LLVM_ABI static bool classof(const Value *From);
+  LLVM_ABI Value *getValueOperand() const;
+  LLVM_ABI Value *getPointerOperand() const;
   Align getAlign() const { return cast<llvm::StoreInst>(Val)->getAlign(); }
   bool isSimple() const { return cast<llvm::StoreInst>(Val)->isSimple(); }
   bool isUnordered() const { return cast<llvm::StoreInst>(Val)->isUnordered(); }
@@ -1260,8 +1265,8 @@ class UnreachableInst final : public Instruction {
   }
 
 public:
-  static UnreachableInst *create(InsertPosition Pos, Context &Ctx);
-  static bool classof(const Value *From);
+  LLVM_ABI static UnreachableInst *create(InsertPosition Pos, Context &Ctx);
+  LLVM_ABI static bool classof(const Value *From);
   unsigned getNumSuccessors() const { return 0; }
   unsigned getUseOperandNo(const Use &Use) const final {
     llvm_unreachable("UnreachableInst has no operands!");
@@ -1280,12 +1285,13 @@ class ReturnInst final : public SingleLLVMInstructionImpl<llvm::ReturnInst> {
                                   Context &Ctx);
 
 public:
-  static ReturnInst *create(Value *RetVal, InsertPosition Pos, Context &Ctx);
+  LLVM_ABI static ReturnInst *create(Value *RetVal, InsertPosition Pos,
+                                     Context &Ctx);
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Ret;
   }
   /// \Returns null if there is no return value.
-  Value *getReturnValue() const;
+  LLVM_ABI Value *getReturnValue() const;
 };
 
 class CallBase : public SingleLLVMInstructionImpl<llvm::CallBase> {
@@ -1303,7 +1309,7 @@ class CallBase : public SingleLLVMInstructionImpl<llvm::CallBase> {
            Opc == Instruction::ClassID::CallBr;
   }
 
-  FunctionType *getFunctionType() const;
+  LLVM_ABI FunctionType *getFunctionType() const;
 
   op_iterator data_operands_begin() { return op_begin(); }
   const_op_iterator data_operands_begin() const {
@@ -1390,17 +1396,17 @@ class CallBase : public SingleLLVMInstructionImpl<llvm::CallBase> {
   }
   bool hasArgument(const Value *V) const { return is_contained(args(), V); }
 
-  Value *getCalledOperand() const;
-  Use getCalledOperandUse() const;
+  LLVM_ABI Value *getCalledOperand() const;
+  LLVM_ABI Use getCalledOperandUse() const;
 
-  Function *getCalledFunction() const;
+  LLVM_ABI Function *getCalledFunction() const;
   bool isIndirectCall() const {
     return cast<llvm::CallBase>(Val)->isIndirectCall();
   }
   bool isCallee(Use U) const {
     return cast<llvm::CallBase>(Val)->isCallee(U.LLVMUse);
   }
-  Function *getCaller();
+  LLVM_ABI Function *getCaller();
   const Function *getCaller() const {
     return const_cast<CallBase *>(this)->getCaller();
   }
@@ -1412,7 +1418,7 @@ class CallBase : public SingleLLVMInstructionImpl<llvm::CallBase> {
     return cast<llvm::CallBase>(Val)->getIntrinsicID();
   }
   void setCalledOperand(Value *V) { getCalledOperandUse().set(V); }
-  void setCalledFunction(Function *F);
+  LLVM_ABI void setCalledFunction(Function *F);
   CallingConv::ID getCallingConv() const {
     return cast<llvm::CallBase>(Val)->getCallingConv();
   }
@@ -1428,9 +1434,9 @@ class CallInst : public CallBase {
   friend class IntrinsicInst; // For constructor
 
 public:
-  static CallInst *create(FunctionType *FTy, Value *Func,
-                          ArrayRef<Value *> Args, InsertPosition Pos,
-                          Context &Ctx, const Twine &NameStr = "");
+  LLVM_ABI static CallInst *create(FunctionType *FTy, Value *Func,
+                                   ArrayRef<Value *> Args, InsertPosition Pos,
+                                   Context &Ctx, const Twine &NameStr = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Call;
@@ -1446,20 +1452,21 @@ class InvokeInst final : public CallBase {
                         // create*()
 
 public:
-  static InvokeInst *create(FunctionType *FTy, Value *Func,
-                            BasicBlock *IfNormal, BasicBlock *IfException,
-                            ArrayRef<Value *> Args, InsertPosition Pos,
-                            Context &Ctx, const Twine &NameStr = "");
+  LLVM_ABI static InvokeInst *create(FunctionType *FTy, Value *Func,
+                                     BasicBlock *IfNormal,
+                                     BasicBlock *IfException,
+                                     ArrayRef<Value *> Args, InsertPosition Pos,
+                                     Context &Ctx, const Twine &NameStr = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Invoke;
   }
-  BasicBlock *getNormalDest() const;
-  BasicBlock *getUnwindDest() const;
-  void setNormalDest(BasicBlock *BB);
-  void setUnwindDest(BasicBlock *BB);
-  LandingPadInst *getLandingPadInst() const;
-  BasicBlock *getSuccessor(unsigned SuccIdx) const;
+  LLVM_ABI BasicBlock *getNormalDest() const;
+  LLVM_ABI BasicBlock *getUnwindDest() const;
+  LLVM_ABI void setNormalDest(BasicBlock *BB);
+  LLVM_ABI void setUnwindDest(BasicBlock *BB);
+  LLVM_ABI LandingPadInst *getLandingPadInst() const;
+  LLVM_ABI BasicBlock *getSuccessor(unsigned SuccIdx) const;
   void setSuccessor(unsigned SuccIdx, BasicBlock *NewSucc) {
     assert(SuccIdx < 2 && "Successor # out of range for invoke!");
     if (SuccIdx == 0)
@@ -1481,25 +1488,25 @@ class CallBrInst final : public CallBase {
                         // create*()
 
 public:
-  static CallBrInst *create(FunctionType *FTy, Value *Func,
-                            BasicBlock *DefaultDest,
-                            ArrayRef<BasicBlock *> IndirectDests,
-                            ArrayRef<Value *> Args, InsertPosition Pos,
-                            Context &Ctx, const Twine &NameStr = "");
+  LLVM_ABI static CallBrInst *create(FunctionType *FTy, Value *Func,
+                                     BasicBlock *DefaultDest,
+                                     ArrayRef<BasicBlock *> IndirectDests,
+                                     ArrayRef<Value *> Args, InsertPosition Pos,
+                                     Context &Ctx, const Twine &NameStr = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CallBr;
   }
   unsigned getNumIndirectDests() const {
     return cast<llvm::CallBrInst>(Val)->getNumIndirectDests();
   }
-  Value *getIndirectDestLabel(unsigned Idx) const;
-  Value *getIndirectDestLabelUse(unsigned Idx) const;
-  BasicBlock *getDefaultDest() const;
-  BasicBlock *getIndirectDest(unsigned Idx) const;
-  SmallVector<BasicBlock *, 16> getIndirectDests() const;
-  void setDefaultDest(BasicBlock *BB);
-  void setIndirectDest(unsigned Idx, BasicBlock *BB);
-  BasicBlock *getSuccessor(unsigned Idx) const;
+  LLVM_ABI Value *getIndirectDestLabel(unsigned Idx) const;
+  LLVM_ABI Value *getIndirectDestLabelUse(unsigned Idx) const;
+  LLVM_ABI BasicBlock *getDefaultDest() const;
+  LLVM_ABI BasicBlock *getIndirectDest(unsigned Idx) const;
+  LLVM_ABI SmallVector<BasicBlock *, 16> getIndirectDests() const;
+  LLVM_ABI void setDefaultDest(BasicBlock *BB);
+  LLVM_ABI void setIndirectDest(unsigned Idx, BasicBlock *BB);
+  LLVM_ABI BasicBlock *getSuccessor(unsigned Idx) const;
   unsigned getNumSuccessors() const {
     return cast<llvm::CallBrInst>(Val)->getNumSuccessors();
   }
@@ -1512,9 +1519,10 @@ class LandingPadInst : public SingleLLVMInstructionImpl<llvm::LandingPadInst> {
   friend class Context; // For constructor.
 
 public:
-  static LandingPadInst *create(Type *RetTy, unsigned NumReservedClauses,
-                                InsertPosition Pos, Context &Ctx,
-                                const Twine &Name = "");
+  LLVM_ABI static LandingPadInst *create(Type *RetTy,
+                                         unsigned NumReservedClauses,
+                                         InsertPosition Pos, Context &Ctx,
+                                         const Twine &Name = "");
   /// Return 'true' if this landingpad instruction is a
   /// cleanup. I.e., it should be run when unwinding even if its landing pad
   /// doesn't catch the exception.
@@ -1522,14 +1530,14 @@ class LandingPadInst : public SingleLLVMInstructionImpl<llvm::LandingPadInst> {
     return cast<llvm::LandingPadInst>(Val)->isCleanup();
   }
   /// Indicate that this landingpad instruction is a cleanup.
-  void setCleanup(bool V);
+  LLVM_ABI void setCleanup(bool V);
 
   // TODO: We are not implementing addClause() because we have no way to revert
   // it for now.
 
   /// Get the value of the clause at index Idx. Use isCatch/isFilter to
   /// determine what type of clause this is.
-  Constant *getClause(unsigned Idx) const;
+  LLVM_ABI Constant *getClause(unsigned Idx) const;
 
   /// Return 'true' if the clause and index Idx is a catch clause.
   bool isCatch(unsigned Idx) const {
@@ -1565,12 +1573,12 @@ class FuncletPadInst : public SingleLLVMInstructionImpl<llvm::FuncletPadInst> {
   ///
   /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst
   /// is a CatchPadInst.
-  Value *getParentPad() const;
-  void setParentPad(Value *ParentPad);
+  LLVM_ABI Value *getParentPad() const;
+  LLVM_ABI void setParentPad(Value *ParentPad);
   /// Return the Idx-th funcletpad argument.
-  Value *getArgOperand(unsigned Idx) const;
+  LLVM_ABI Value *getArgOperand(unsigned Idx) const;
   /// Set the Idx-th funcletpad argument.
-  void setArgOperand(unsigned Idx, Value *V);
+  LLVM_ABI void setArgOperand(unsigned Idx, Value *V);
 
   // TODO: Implement missing functions: arg_operands().
   static bool classof(const Value *From) {
@@ -1585,13 +1593,13 @@ class CatchPadInst : public FuncletPadInst {
   friend class Context; // For constructor.
 
 public:
-  CatchSwitchInst *getCatchSwitch() const;
+  LLVM_ABI CatchSwitchInst *getCatchSwitch() const;
   // TODO: We have not implemented setCatchSwitch() because we can't revert it
   // for now, as there is no CatchPadInst member function that can undo it.
 
-  static CatchPadInst *create(Value *ParentPad, ArrayRef<Value *> Args,
-                              InsertPosition Pos, Context &Ctx,
-                              const Twine &Name = "");
+  LLVM_ABI static CatchPadInst *create(Value *ParentPad, ArrayRef<Value *> Args,
+                                       InsertPosition Pos, Context &Ctx,
+                                       const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CatchPad;
   }
@@ -1603,9 +1611,10 @@ class CleanupPadInst : public FuncletPadInst {
   friend class Context; // For constructor.
 
 public:
-  static CleanupPadInst *create(Value *ParentPad, ArrayRef<Value *> Args,
-                                InsertPosition Pos, Context &Ctx,
-                                const Twine &Name = "");
+  LLVM_ABI static CleanupPadInst *create(Value *ParentPad,
+                                         ArrayRef<Value *> Args,
+                                         InsertPosition Pos, Context &Ctx,
+                                         const Twine &Name = "");
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CleanupPad;
   }
@@ -1619,16 +1628,17 @@ class CatchReturnInst
   friend class Context; // For constructor.
 
 public:
-  static CatchReturnInst *create(CatchPadInst *CatchPad, BasicBlock *BB,
-                                 InsertPosition Pos, Context &Ctx);
-  CatchPadInst *getCatchPad() const;
-  void setCatchPad(CatchPadInst *CatchPad);
-  BasicBlock *getSuccessor() const;
-  void setSuccessor(BasicBlock *NewSucc);
+  LLVM_ABI static CatchReturnInst *create(CatchPadInst *CatchPad,
+                                          BasicBlock *BB, InsertPosition Pos,
+                                          Context &Ctx);
+  LLVM_ABI CatchPadInst *getCatchPad() const;
+  LLVM_ABI void setCatchPad(CatchPadInst *CatchPad);
+  LLVM_ABI BasicBlock *getSuccessor() const;
+  LLVM_ABI void setSuccessor(BasicBlock *NewSucc);
   unsigned getNumSuccessors() {
     return cast<llvm::CatchReturnInst>(Val)->getNumSuccessors();
   }
-  Value *getCatchSwitchParentPad() const;
+  LLVM_ABI Value *getCatchSwitchParentPad() const;
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CatchRet;
   }
@@ -1642,22 +1652,22 @@ class CleanupReturnInst
   friend class Context; // For constructor.
 
 public:
-  static CleanupReturnInst *create(CleanupPadInst *CleanupPad,
-                                   BasicBlock *UnwindBB, InsertPosition Pos,
-                                   Context &Ctx);
+  LLVM_ABI static CleanupReturnInst *create(CleanupPadInst *CleanupPad,
+                                            BasicBlock *UnwindBB,
+                                            InsertPosition Pos, Context &Ctx);
   bool hasUnwindDest() const {
     return cast<llvm::CleanupReturnInst>(Val)->hasUnwindDest();
   }
   bool unwindsToCaller() const {
     return cast<llvm::CleanupReturnInst>(Val)->unwindsToCaller();
   }
-  CleanupPadInst *getCleanupPad() const;
-  void setCleanupPad(CleanupPadInst *CleanupPad);
+  LLVM_ABI CleanupPadInst *getCleanupPad() const;
+  LLVM_ABI void setCleanupPad(CleanupPadInst *CleanupPad);
   unsigned getNumSuccessors() const {
     return cast<llvm::CleanupReturnInst>(Val)->getNumSuccessors();
   }
-  BasicBlock *getUnwindDest() const;
-  void setUnwindDest(BasicBlock *NewDest);
+  LLVM_ABI BasicBlock *getUnwindDest() const;
+  LLVM_ABI void setUnwindDest(BasicBlock *NewDest);
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::CleanupRet;
@@ -1677,16 +1687,16 @@ class GetElementPtrInst final
                         // create*()
 
 public:
-  static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &NameStr = "");
+  LLVM_ABI static Value *create(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &NameStr = "");
 
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::GetElementPtr;
   }
 
-  Type *getSourceElementType() const;
-  Type *getResultElementType() const;
+  LLVM_ABI Type *getSourceElementType() const;
+  LLVM_ABI Type *getResultElementType() const;
   unsigned getAddressSpace() const {
     return cast<llvm::GetElementPtrInst>(Val)->getAddressSpace();
   }
@@ -1706,11 +1716,11 @@ class GetElementPtrInst final
     return const_cast<GetElementPtrInst *>(this)->indices();
   }
 
-  Value *getPointerOperand() const;
+  LLVM_ABI Value *getPointerOperand() const;
   static unsigned getPointerOperandIndex() {
     return llvm::GetElementPtrInst::getPointerOperandIndex();
   }
-  Type *getPointerOperandType() const;
+  LLVM_ABI Type *getPointerOperandType() const;
   unsigned getPointerAddressSpace() const {
     return cast<llvm::GetElementPtrInst>(Val)->getPointerAddressSpace();
   }
@@ -1750,12 +1760,12 @@ class CatchSwitchInst
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static CatchSwitchInst *create(Value *ParentPad, BasicBlock *UnwindBB,
-                                 unsigned NumHandlers, InsertPosition Pos,
-                                 Context &Ctx, const Twine &Name = "");
+  LLVM_ABI static CatchSwitchInst *
+  create(Value *ParentPad, BasicBlock *UnwindBB, unsigned NumHandlers,
+         InsertPosition Pos, Context &Ctx, const Twine &Name = "");
 
-  Value *getParentPad() const;
-  void setParentPad(Value *ParentPad);
+  LLVM_ABI Value *getParentPad() const;
+  LLVM_ABI void setParentPad(Value *ParentPad);
 
   bool hasUnwindDest() const {
     return cast<llvm::CatchSwitchInst>(Val)->hasUnwindDest();
@@ -1763,8 +1773,8 @@ class CatchSwitchInst
   bool unwindsToCaller() const {
     return cast<llvm::CatchSwitchInst>(Val)->unwindsToCaller();
   }
-  BasicBlock *getUnwindDest() const;
-  void setUnwindDest(BasicBlock *UnwindDest);
+  LLVM_ABI BasicBlock *getUnwindDest() const;
+  LLVM_ABI void setUnwindDest(BasicBlock *UnwindDest);
 
   unsigned getNumHandlers() const {
     return cast<llvm::CatchSwitchInst>(Val)->getNumHandlers();
@@ -1810,7 +1820,7 @@ class CatchSwitchInst
     return make_range(handler_begin(), handler_end());
   }
 
-  void addHandler(BasicBlock *Dest);
+  LLVM_ABI void addHandler(BasicBlock *Dest);
 
   // TODO: removeHandler() cannot be reverted because there is no equivalent
   // addHandler() with a handler_iterator to specify the position. So we can't
@@ -1839,8 +1849,9 @@ class ResumeInst : public SingleLLVMInstructionImpl<llvm::ResumeInst> {
   friend class Context; // For accessing the constructor in create*()
 
 public:
-  static ResumeInst *create(Value *Exn, InsertPosition Pos, Context &Ctx);
-  Value *getValue() const;
+  LLVM_ABI static ResumeInst *create(Value *Exn, InsertPosition Pos,
+                                     Context &Ctx);
+  LLVM_ABI Value *getValue() const;
   unsigned getNumSuccessors() const {
     return cast<llvm::ResumeInst>(Val)->getNumSuccessors();
   }
@@ -1858,17 +1869,17 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> {
   static constexpr const unsigned DefaultPseudoIndex =
       llvm::SwitchInst::DefaultPseudoIndex;
 
-  static SwitchInst *create(Value *V, BasicBlock *Dest, unsigned NumCases,
-                            InsertPosition Pos, Context &Ctx,
-                            const Twine &Name = "");
+  LLVM_ABI static SwitchInst *create(Value *V, BasicBlock *Dest,
+                                     unsigned NumCases, InsertPosition Pos,
+                                     Context &Ctx, const Twine &Name = "");
 
-  Value *getCondition() const;
-  void setCondition(Value *V);
-  BasicBlock *getDefaultDest() const;
+  LLVM_ABI Value *getCondition() const;
+  LLVM_ABI void setCondition(Value *V);
+  LLVM_ABI BasicBlock *getDefaultDest() const;
   bool defaultDestUnreachable() const {
     return cast<llvm::SwitchInst>(Val)->defaultDestUnreachable();
   }
-  void setDefaultDest(BasicBlock *DefaultCase);
+  LLVM_ABI void setDefaultDest(BasicBlock *DefaultCase);
   unsigned getNumCases() const {
     return cast<llvm::SwitchInst>(Val)->getNumCases();
   }
@@ -1913,9 +1924,9 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> {
       return I;
     return case_default();
   }
-  ConstantInt *findCaseDest(BasicBlock *BB);
+  LLVM_ABI ConstantInt *findCaseDest(BasicBlock *BB);
 
-  void addCase(ConstantInt *OnVal, BasicBlock *Dest);
+  LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest);
   /// This method removes the specified case and its successor from the switch
   /// instruction. Note that this operation may reorder the remaining cases at
   /// index idx and above.
@@ -1923,13 +1934,13 @@ class SwitchInst : public SingleLLVMInstructionImpl<llvm::SwitchInst> {
   /// This action invalidates iterators for all cases following the one removed,
   /// including the case_end() iterator. It returns an iterator for the next
   /// case.
-  CaseIt removeCase(CaseIt It);
+  LLVM_ABI CaseIt removeCase(CaseIt It);
 
   unsigned getNumSuccessors() const {
     return cast<llvm::SwitchInst>(Val)->getNumSuccessors();
   }
-  BasicBlock *getSuccessor(unsigned Idx) const;
-  void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
+  LLVM_ABI BasicBlock *getSuccessor(unsigned Idx) const;
+  LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *NewSucc);
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::Switch;
   }
@@ -1950,11 +1961,13 @@ class UnaryOperator : public UnaryInstruction {
                          Ctx) {}
   friend Context; // for constructor.
 public:
-  static Value *create(Instruction::Opcode Op, Value *OpV, InsertPosition Pos,
-                       Context &Ctx, const Twine &Name = "");
-  static Value *createWithCopiedFlags(Instruction::Opcode Op, Value *OpV,
-                                      Value *CopyFrom, InsertPosition Pos,
-                                      Context &Ctx, const Twine &Name = "");
+  LLVM_ABI static Value *create(Instruction::Opcode Op, Value *OpV,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
+  LLVM_ABI static Value *createWithCopiedFlags(Instruction::Opcode Op,
+                                               Value *OpV, Value *CopyFrom,
+                                               InsertPosition Pos, Context &Ctx,
+                                               const Twine &Name = "");
   /// For isa/dyn_cast.
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::UnOp;
@@ -2013,14 +2026,15 @@ class BinaryOperator : public SingleLLVMInstructionImpl<llvm::BinaryOperator> {
   friend class Context; // For constructor.
 
 public:
-  static Value *create(Instruction::Opcode Op, Value *LHS, Value *RHS,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
-
-  static Value *createWithCopiedFlags(Instruction::Opcode Op, Value *LHS,
-                                      Value *RHS, Value *CopyFrom,
-                                      InsertPosition Pos, Context &Ctx,
-                                      const Twine &Name = "");
+  LLVM_ABI static Value *create(Instruction::Opcode Op, Value *LHS, Value *RHS,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
+
+  LLVM_ABI static Value *createWithCopiedFlags(Instruction::Opcode Op,
+                                               Value *LHS, Value *RHS,
+                                               Value *CopyFrom,
+                                               InsertPosition Pos, Context &Ctx,
+                                               const Twine &Name = "");
   /// For isa/dyn_cast.
   static bool classof(const Value *From) {
     return From->getSubclassID() == ClassID::BinaryOperator;
@@ -2033,7 +2047,7 @@ class BinaryOperator : public SingleLLVMInstructionImpl<llvm::BinaryOperator> {
 /// can also be treated as an add.
 class PossiblyDisjointInst : public BinaryOperator {
 public:
-  void setIsDisjoint(bool B);
+  LLVM_ABI void setIsDisjoint(bool B);
   bool isDisjoint() const {
     return cast<llvm::PossiblyDisjointInst>(Val)->isDisjoint();
   }
@@ -2066,24 +2080,24 @@ class AtomicRMWInst : public SingleLLVMInstructionImpl<llvm::AtomicRMWInst> {
     cast<llvm::AtomicRMWInst>(Val)->setOperation(Op);
   }
   Align getAlign() const { return cast<llvm::AtomicRMWInst>(Val)->getAlign(); }
-  void setAlignment(Align Align);
+  LLVM_ABI void setAlignment(Align Align);
   bool isVolatile() const {
     return cast<llvm::AtomicRMWInst>(Val)->isVolatile();
   }
-  void setVolatile(bool V);
+  LLVM_ABI void setVolatile(bool V);
   AtomicOrdering getOrdering() const {
     return cast<llvm::AtomicRMWInst>(Val)->getOrdering();
   }
-  void setOrdering(AtomicOrdering Ordering);
+  LLVM_ABI void setOrdering(AtomicOrdering Ordering);
   SyncScope::ID getSyncScopeID() const {
     return cast<llvm::AtomicRMWInst>(Val)->getSyncScopeID();
   }
-  void setSyncScopeID(SyncScope::ID SSID);
-  Value *getPointerOperand();
+  LLVM_ABI void setSyncScopeID(SyncScope::ID SSID);
+  LLVM_ABI Value *getPointerOperand();
   const Value *getPointerOperand() const {
     return const_cast<AtomicRMWInst *>(this)->getPointerOperand();
   }
-  Value *getValOperand();
+  LLVM_ABI Value *getValOperand();
   const Value *getValOperand() const {
     return const_cast<AtomicRMWInst *>(this)->getValOperand();
   }
@@ -2097,11 +2111,10 @@ class AtomicRMWInst : public SingleLLVMInstructionImpl<llvm::AtomicRMWInst> {
     return From->getSubclassID() == ClassID::AtomicRMW;
   }
 
-  static AtomicRMWInst *create(BinOp Op, Value *Ptr, Value *Val,
-                               MaybeAlign Align, AtomicOrdering Ordering,
-                               InsertPosition Pos, Context &Ctx,
-                               SyncScope::ID SSID = SyncScope::System,
-                               const Twine &Name = "");
+  LLVM_ABI static AtomicRMWInst *
+  create(BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align,
+         AtomicOrdering Ordering, InsertPosition Pos, Context &Ctx,
+         SyncScope::ID SSID = SyncScope::System, const Twine &Name = "");
 };
 
 class AtomicCmpXchgInst
@@ -2119,17 +2132,17 @@ class AtomicCmpXchgInst
     return cast<llvm::AtomicCmpXchgInst>(Val)->getAlign();
   }
 
-  void setAlignment(Align Align);
+  LLVM_ABI void setAlignment(Align Align);
   /// Return true if this is a cmpxchg from a volatile memory
   /// location.
   bool isVolatile() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->isVolatile();
   }
   /// Specify whether this is a volatile cmpxchg.
-  void setVolatile(bool V);
+  LLVM_ABI void setVolatile(bool V);
   /// Return true if this cmpxchg may spuriously fail.
   bool isWeak() const { return cast<llvm::AtomicCmpXchgInst>(Val)->isWeak(); }
-  void setWeak(bool IsWeak);
+  LLVM_ABI void setWeak(bool IsWeak);
   static bool isValidSuccessOrdering(AtomicOrdering Ordering) {
     return llvm::AtomicCmpXchgInst::isValidSuccessOrdering(Ordering);
   }
@@ -2139,30 +2152,30 @@ class AtomicCmpXchgInst
   AtomicOrdering getSuccessOrdering() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->getSuccessOrdering();
   }
-  void setSuccessOrdering(AtomicOrdering Ordering);
+  LLVM_ABI void setSuccessOrdering(AtomicOrdering Ordering);
 
   AtomicOrdering getFailureOrdering() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->getFailureOrdering();
   }
-  void setFailureOrdering(AtomicOrdering Ordering);
+  LLVM_ABI void setFailureOrdering(AtomicOrdering Ordering);
   AtomicOrdering getMergedOrdering() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->getMergedOrdering();
   }
   SyncScope::ID getSyncScopeID() const {
     return cast<llvm::AtomicCmpXchgInst>(Val)->getSyncScopeID();
   }
-  void setSyncScopeID(SyncScope::ID SSID);
-  Value *getPointerOperand();
+  LLVM_ABI void setSyncScopeID(SyncScope::ID SSID);
+  LLVM_ABI Value *getPointerOperand();
   const Value *getPointerOperand() const {
     return const_cast<AtomicCmpXchgInst *>(this)->getPointerOperand();
   }
 
-  Value *getCompareOperand();
+  LLVM_ABI Value *getCompareOperand();
   const Value *getCompareOperand() const {
     return const_cast<AtomicCmpXchgInst *>(this)->getCompareOperand();
   }
 
-  Value *getNewValOperand();
+  LLVM_ABI Value *getNewValOperand();
   const Value *getNewValOperand() const {
     return const_cast<AtomicCmpXchgInst *>(this)->getNewValOperand();
   }
@@ -2172,7 +2185,7 @@ class AtomicCmpXchgInst
     return cast<llvm::AtomicCmpXchgInst>(Val)->getPointerAddressSpace();
   }
 
-  static AtomicCmpXchgInst *
+  LLVM_ABI static AtomicCmpXchgInst *
   create(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align,
          AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering,
          InsertPosition Pos, Context &Ctx,
@@ -2190,9 +2203,10 @@ class AllocaInst final : public UnaryInstruction {
   friend class Context; // For constructor.
 
 public:
-  static AllocaInst *create(Type *Ty, unsigned AddrSpace, InsertPosition Pos,
-                            Context &Ctx, Value *ArraySize = nullptr,
-                            const Twine &Name = "");
+  LLVM_ABI static AllocaInst *create(Type *Ty, unsigned AddrSpace,
+                                     InsertPosition Pos, Context &Ctx,
+                                     Value *ArraySize = nullptr,
+                                     const Twine &Name = "");
 
   /// Return true if there is an allocation size parameter to the allocation
   /// instruction that is not 1.
@@ -2201,12 +2215,12 @@ class AllocaInst final : public UnaryInstruction {
   }
   /// Get the number of elements allocated. For a simple allocation of a single
   /// element, this will return a constant 1 value.
-  Value *getArraySize();
+  LLVM_ABI Value *getArraySize();
   const Value *getArraySize() const {
     return const_cast<AllocaInst *>(this)->getArraySize();
   }
   /// Overload to return most specific pointer type.
-  PointerType *getType() const;
+  LLVM_ABI PointerType *getType() const;
   /// Return the address space for the allocation.
   unsigned getAddressSpace() const {
     return cast<llvm::AllocaInst>(Val)->getAddressSpace();
@@ -2222,14 +2236,14 @@ class AllocaInst final : public UnaryInstruction {
     return cast<llvm::AllocaInst>(Val)->getAllocationSizeInBits(DL);
   }
   /// Return the type that is being allocated by the instruction.
-  Type *getAllocatedType() const;
+  LLVM_ABI Type *getAllocatedType() const;
   /// for use only in special circumstances that need to generically
   /// transform a whole instruction (eg: IR linking and vectorization).
-  void setAllocatedType(Type *Ty);
+  LLVM_ABI void setAllocatedType(Type *Ty);
   /// Return the alignment of the memory that is being allocated by the
   /// instruction.
   Align getAlign() const { return cast<llvm::AllocaInst>(Val)->getAlign(); }
-  void setAlignment(Align Align);
+  LLVM_ABI void setAlignment(Align Align);
   /// Return true if this alloca is in the entry block of the function and is a
   /// constant size. If so, the code generator will fold it into the
   /// prolog/epilog code, so it is basically free.
@@ -2242,7 +2256,7 @@ class AllocaInst final : public UnaryInstruction {
     return cast<llvm::AllocaInst>(Val)->isUsedWithInAlloca();
   }
   /// Specify whether this alloca is used to represent the arguments to a call.
-  void setUsedWithInAlloca(bool V);
+  LLVM_ABI void setUsedWithInAlloca(bool V);
 
   static bool classof(const Value *From) {
     if (auto *I = dyn_cast<Instruction>(From))
@@ -2293,13 +2307,13 @@ class CastInst : public UnaryInstruction {
   friend Context; // for SBCastInstruction()
 
 public:
-  static Value *create(Type *DestTy, Opcode Op, Value *Operand,
-                       InsertPosition Pos, Context &Ctx,
-                       const Twine &Name = "");
+  LLVM_ABI static Value *create(Type *DestTy, Opcode Op, Value *Operand,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
-  Type *getSrcTy() const;
-  Type *getDestTy() const;
+  LLVM_ABI static bool classof(const Value *From);
+  LLVM_ABI Type *getSrcTy() const;
+  LLVM_ABI Type *getDestTy() const;
 };
 
 /// Instruction that can have a nneg flag (zext/uitofp).
@@ -2308,7 +2322,7 @@ class PossiblyNonNegInst : public CastInst {
   bool hasNonNeg() const {
     return cast<llvm::PossiblyNonNegInst>(Val)->hasNonNeg();
   }
-  void setNonNeg(bool B);
+  LLVM_ABI void setNonNeg(bool B);
   /// For isa/dyn_cast.
   static bool classof(const Value *From) {
     if (auto *I = dyn_cast<Instruction>(From)) {
@@ -2383,15 +2397,15 @@ class PHINode final : public SingleLLVMInstructionImpl<llvm::PHINode> {
   struct LLVMBBToBB {
     Context &Ctx;
     LLVMBBToBB(Context &Ctx) : Ctx(Ctx) {}
-    BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const;
+    LLVM_ABI BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const;
   };
 
 public:
-  static PHINode *create(Type *Ty, unsigned NumReservedValues,
-                         InsertPosition Pos, Context &Ctx,
-                         const Twine &Name = "");
+  LLVM_ABI static PHINode *create(Type *Ty, unsigned NumReservedValues,
+                                  InsertPosition Pos, Context &Ctx,
+                                  const Twine &Name = "");
   /// For isa/dyn_cast.
-  static bool classof(const Value *From);
+  LLVM_ABI static bool classof(const Value *From);
 
   using const_block_iterator =
       mapped_iterator<llvm::PHINode::const_block_iterator, LLVMBBToBB>;
@@ -2417,35 +2431,36 @@ class PHINode final : public SingleLLVMInstructionImpl<llvm::PHINode> {
   unsigned getNumIncomingValues() const {
     return cast<llvm::PHINode>(Val)->getNumIncomingValues();
   }
-  Value *getIncomingValue(unsigned Idx) const;
-  void setIncomingValue(unsigned Idx, Value *V);
+  LLVM_ABI Value *getIncomingValue(unsigned Idx) const;
+  LLVM_ABI void setIncomingValue(unsigned Idx, Value *V);
   static unsigned getOperandNumForIncomingValue(unsigned Idx) {
     return llvm::PHINode::getOperandNumForIncomingValue(Idx);
   }
   static unsigned getIncomingValueNumForOperand(unsigned Idx) {
     return llvm::PHINode::getIncomingValueNumForOperand(Idx);
   }
-  BasicBlock *getIncomingBlock(unsigned Idx) const;
-  BasicBlock *getIncomingBlock(const Use &U) const;
+  LLVM_ABI BasicBlock *getIncomingBlock(unsigned Idx) const;
+  LLVM_ABI BasicBlock *getIncomingBlock(const Use &U) const;
 
-  void setIncomingBlock(unsigned Idx, BasicBlock *BB);
+  LLVM_ABI void setIncomingBlock(unsigned Idx, BasicBlock *BB);
 
-  void addIncoming(Value *V, BasicBlock *BB);
+  LLVM_ABI void addIncoming(Value *V, BasicBlock *BB);
 
-  Value *removeIncomingValue(unsigned Idx);
-  Value *removeIncomingValue(BasicBlock *BB);
+  LLVM_ABI Value *removeIncomingValue(unsigned Idx);
+  LLVM_ABI Value *removeIncomingValue(BasicBlock *BB);
 
-  int getBasicBlockIndex(const BasicBlock *BB) const;
-  Value *getIncomingValueForBlock(const BasicBlock *BB) const;
+  LLVM_ABI int getBasicBlockIndex(const BasicBlock *BB) const;
+  LLVM_ABI Value *getIncomingValueForBlock(const BasicBlock *BB) const;
 
-  Value *hasConstantValue() const;
+  LLVM_ABI Value *hasConstantValue() const;
 
   bool hasConstantOrUndefValue() const {
     return cast<llvm::PHINode>(Val)->hasConstantOrUndefValue();
   }
   bool isComplete() const { return cast<llvm::PHINode>(Val)->isComplete(); }
-  void replaceIncomingBlockWith(const BasicBlock *Old, BasicBlock *New);
-  void removeIncomingValueIf(function_ref<bool(unsigned)> Predicate);
+  LLVM_ABI void replaceIncomingBlockWith(const BasicBlock *Old,
+                                         BasicBlock *New);
+  LLVM_ABI void removeIncomingValueIf(function_ref<bool(unsigned)> Predicate);
   // TODO: Implement
   // void copyIncomingBlocks(iterator_range<const_block_iterator> BBRange,
   //                         uint32_t ToIdx = 0)
@@ -2471,21 +2486,23 @@ class CmpInst : public SingleLLVMInstructionImpl<llvm::CmpInst> {
   CmpInst(llvm::CmpInst *CI, Context &Ctx, ClassID Id, Opcode Opc)
       : SingleLLVMInstructionImpl(Id, Opc, CI, Ctx) {}
   friend Context; // for CmpInst()
-  static Value *createCommon(Value *Cond, Value *True, Value *False,
-                             const Twine &Name, IRBuilder<> &Builder,
-                             Context &Ctx);
+  LLVM_ABI static Value *createCommon(Value *Cond, Value *True, Value *False,
+                                      const Twine &Name, IRBuilder<> &Builder,
+                                      Context &Ctx);
 
 public:
   using Predicate = llvm::CmpInst::Predicate;
 
-  static Value *create(Predicate Pred, Value *S1, Value *S2, InsertPosition Pos,
-                       Context &Ctx, const Twine &Name = "");
-  static Value *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2,
-                                      const Instruction *FlagsSource,
-                                      InsertPosition Pos, Context &Ctx,
-                                      const Twine &Name = "");
-  void setPredicate(Predicate P);
-  void swapOperands();
+  LLVM_ABI static Value *create(Predicate Pred, Value *S1, Value *S2,
+                                InsertPosition Pos, Context &Ctx,
+                                const Twine &Name = "");
+  LLVM_ABI static Value *createWithCopiedFlags(Predicate Pred, Value *S1,
+                                               Value *S2,
+                                               const Instruction *FlagsSource,
+                                               InsertPosition Pos, Context &Ctx,
+                                               const Twine &Name = "");
+  LLVM_ABI void setPredicate(Predicate P);
+  LLVM_ABI void swapOperands();
 
   WRAP_MEMBER(getPredicate);
   WRAP_BOTH(isFPPredicate);
@@ -2517,7 +2534,7 @@ class CmpInst : public SingleLLVMInstructionImpl<llvm::CmpInst> {
   }
 
   /// Create a result type for fcmp/icmp
-  static Type *makeCmpResultType(Type *OpndType);
+  LLVM_ABI static Type *makeCmpResultType(Type *OpndType);
 
 #ifndef NDEBUG
   void dumpOS(raw_ostream &OS) const override;
@@ -2533,7 +2550,7 @@ class ICmpInst : public CmpInst {
   using LLVMValType = llvm::ICmpInst;
 
 public:
-  void swapOperands();
+  LLVM_ABI void swapOperands();
 
   WRAP_BOTH(getSignedPredicate);
   WRAP_BOTH(getUnsignedPredicate);
@@ -2570,7 +2587,7 @@ class FCmpInst : public CmpInst {
   using LLVMValType = llvm::FCmpInst;
 
 public:
-  void swapOperands();
+  LLVM_ABI void swapOperands();
 
   WRAP_BOTH(isEquality);
   WRAP_MEMBER(isCommutative);
diff --git a/llvm/include/llvm/SandboxIR/Module.h b/llvm/include/llvm/SandboxIR/Module.h
index 429bb04539bcb..275960392211d 100644
--- a/llvm/include/llvm/SandboxIR/Module.h
+++ b/llvm/include/llvm/SandboxIR/Module.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/Compiler.h"
 #include <string>
 
 namespace llvm {
@@ -38,7 +39,7 @@ class Module {
 public:
   Context &getContext() const { return Ctx; }
 
-  Function *getFunction(StringRef Name) const;
+  LLVM_ABI Function *getFunction(StringRef Name) const;
 
   const DataLayout &getDataLayout() const { return LLVMM.getDataLayout(); }
 
@@ -50,7 +51,8 @@ class Module {
   /// does not exist, return null. If AllowInternal is set to true, this
   /// function will return types that have InternalLinkage. By default, these
   /// types are not returned.
-  GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal) const;
+  LLVM_ABI GlobalVariable *getGlobalVariable(StringRef Name,
+                                             bool AllowInternal) const;
   GlobalVariable *getGlobalVariable(StringRef Name) const {
     return getGlobalVariable(Name, /*AllowInternal=*/false);
   }
@@ -66,12 +68,12 @@ class Module {
   /// Return the global alias in the module with the specified name, of
   /// arbitrary type. This method returns null if a global with the specified
   /// name is not found.
-  GlobalAlias *getNamedAlias(StringRef Name) const;
+  LLVM_ABI GlobalAlias *getNamedAlias(StringRef Name) const;
 
   /// Return the global ifunc in the module with the specified name, of
   /// arbitrary type. This method returns null if a global with the specified
   /// name is not found.
-  GlobalIFunc *getNamedIFunc(StringRef Name) const;
+  LLVM_ABI GlobalIFunc *getNamedIFunc(StringRef Name) const;
 
   // TODO: Missing removeGlobalVariable() eraseGlobalVariable(),
   // insertGlobalVariable()
diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h
index 55a0301f4756b..6fccaf04b270a 100644
--- a/llvm/include/llvm/SandboxIR/PassManager.h
+++ b/llvm/include/llvm/SandboxIR/PassManager.h
@@ -18,6 +18,7 @@
 #ifndef LLVM_SANDBOXIR_PASSMANAGER_H
 #define LLVM_SANDBOXIR_PASSMANAGER_H
 
+#include "llvm/Support/Compiler.h"
 #include <memory>
 
 #include "llvm/ADT/DenseMap.h"
@@ -201,7 +202,7 @@ class PassManager : public ParentPass {
   }
 };
 
-class FunctionPassManager final
+class LLVM_ABI FunctionPassManager final
     : public PassManager<FunctionPass, FunctionPass> {
 public:
   FunctionPassManager(StringRef Name) : PassManager(Name) {}
@@ -211,7 +212,8 @@ class FunctionPassManager final
   bool runOnFunction(Function &F, const Analyses &A) final;
 };
 
-class RegionPassManager final : public PassManager<RegionPass, RegionPass> {
+class LLVM_ABI RegionPassManager final
+    : public PassManager<RegionPass, RegionPass> {
 public:
   RegionPassManager(StringRef Name) : PassManager(Name) {}
   RegionPassManager(StringRef Name, StringRef Pipeline,
diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h
index f86199ab6c228..d70f21277fb1b 100644
--- a/llvm/include/llvm/SandboxIR/Region.h
+++ b/llvm/include/llvm/SandboxIR/Region.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_SANDBOXIR_REGION_H
 #define LLVM_SANDBOXIR_REGION_H
 
+#include "llvm/Support/Compiler.h"
 #include <memory>
 
 #include "llvm/ADT/SetVector.h"
@@ -30,7 +31,7 @@ class ScoreBoard {
   /// The cost of all instructions that got removed and replaced by new ones.
   InstructionCost BeforeCost = 0;
   /// Helper for both add() and remove(). \Returns the TTI cost of \p I.
-  InstructionCost getCost(Instruction *I) const;
+  LLVM_ABI InstructionCost getCost(Instruction *I) const;
   /// No need to allow copies.
   ScoreBoard(const ScoreBoard &) = delete;
   const ScoreBoard &operator=(const ScoreBoard &) = delete;
@@ -40,7 +41,7 @@ class ScoreBoard {
   /// Mark \p I as a newly added instruction to the region.
   void add(Instruction *I) { AfterCost += getCost(I); }
   /// Mark \p I as a deleted instruction from the region.
-  void remove(Instruction *I);
+  LLVM_ABI void remove(Instruction *I);
   /// \Returns the cost of the newly added instructions.
   InstructionCost getAfterCost() const { return AfterCost; }
   /// \Returns the cost of the Removed instructions.
@@ -122,12 +123,12 @@ class Region {
   /// add an instruction to the auxiliary vector it does get tagged as being a
   /// member of the region (for ownership reasons), but its cost does not get
   /// counted because the instruction hasn't been added in the "normal" way.
-  void addImpl(Instruction *I, bool IgnoreCost);
+  LLVM_ABI void addImpl(Instruction *I, bool IgnoreCost);
   /// Adds I to the set. This is the main API for adding an instruction to the
   /// region.
   void add(Instruction *I) { addImpl(I, /*IgnoreCost=*/false); }
   /// Removes I from the set.
-  void remove(Instruction *I);
+  LLVM_ABI void remove(Instruction *I);
   friend class Context; // The callbacks need to call add() and remove().
   friend class RegionInternalsAttorney; // For unit tests.
   friend class RegionsFromBBs;          // For add().
@@ -141,8 +142,8 @@ class Region {
   void removeFromAux(Instruction *I);
 
 public:
-  Region(Context &Ctx, TargetTransformInfo &TTI);
-  ~Region();
+  LLVM_ABI Region(Context &Ctx, TargetTransformInfo &TTI);
+  LLVM_ABI ~Region();
 
   Context &getContext() const { return Ctx; }
   /// Returns true if I is in the Region.
@@ -150,18 +151,18 @@ class Region {
   /// Returns true if the Region has no instructions.
   bool empty() const { return Insts.empty(); }
   /// Set the auxiliary vector.
-  void setAux(ArrayRef<Instruction *> Aux);
+  LLVM_ABI void setAux(ArrayRef<Instruction *> Aux);
   /// \Returns the auxiliary vector.
   const SmallVector<Instruction *> &getAux() const { return Aux; }
   /// Clears all auxiliary data.
-  void clearAux();
+  LLVM_ABI void clearAux();
 
   using iterator = decltype(Insts.begin());
   iterator begin() { return Insts.begin(); }
   iterator end() { return Insts.end(); }
   iterator_range<iterator> insts() { return make_range(begin(), end()); }
 
-  static SmallVector<std::unique_ptr<Region>>
+  LLVM_ABI static SmallVector<std::unique_ptr<Region>>
   createRegionsFromMD(Function &F, TargetTransformInfo &TTI);
   /// \Returns the ScoreBoard data structure that keeps track of instr costs.
   const ScoreBoard &getScoreboard() const { return Scoreboard; }
diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h
index f7b469965eae8..9a2c9dd516489 100644
--- a/llvm/include/llvm/SandboxIR/Tracker.h
+++ b/llvm/include/llvm/SandboxIR/Tracker.h
@@ -46,6 +46,8 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/SandboxIR/Use.h"
+#include "llvm/SandboxIR/Value.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include <memory>
 
@@ -149,7 +151,7 @@ class UseSet : public IRChangeBase {
 #endif
 };
 
-class PHIRemoveIncoming : public IRChangeBase {
+class LLVM_ABI PHIRemoveIncoming : public IRChangeBase {
   PHINode *PHI;
   unsigned RemovedIdx;
   Value *RemovedV;
@@ -165,7 +167,7 @@ class PHIRemoveIncoming : public IRChangeBase {
 #endif
 };
 
-class PHIAddIncoming : public IRChangeBase {
+class LLVM_ABI PHIAddIncoming : public IRChangeBase {
   PHINode *PHI;
   unsigned Idx;
 
@@ -179,7 +181,7 @@ class PHIAddIncoming : public IRChangeBase {
 #endif
 };
 
-class CmpSwapOperands : public IRChangeBase {
+class LLVM_ABI CmpSwapOperands : public IRChangeBase {
   CmpInst *Cmp;
 
 public:
@@ -210,7 +212,7 @@ class UseSwap : public IRChangeBase {
 #endif
 };
 
-class EraseFromParent : public IRChangeBase {
+class LLVM_ABI EraseFromParent : public IRChangeBase {
   /// Contains all the data we need to restore an "erased" (i.e., detached)
   /// instruction: the instruction itself and its operands in order.
   struct InstrAndOperands {
@@ -242,7 +244,7 @@ class EraseFromParent : public IRChangeBase {
 #endif
 };
 
-class RemoveFromParent : public IRChangeBase {
+class LLVM_ABI RemoveFromParent : public IRChangeBase {
   /// The instruction that is about to get removed.
   Instruction *RemovedI = nullptr;
   /// This is either the next instr, or the parent BB if at the end of the BB.
@@ -327,7 +329,7 @@ class GenericSetterWithIdx final : public IRChangeBase {
 #endif
 };
 
-class CatchSwitchAddHandler : public IRChangeBase {
+class LLVM_ABI CatchSwitchAddHandler : public IRChangeBase {
   CatchSwitchInst *CSI;
   unsigned HandlerIdx;
 
@@ -344,7 +346,7 @@ class CatchSwitchAddHandler : public IRChangeBase {
 #endif // NDEBUG
 };
 
-class SwitchAddCase : public IRChangeBase {
+class LLVM_ABI SwitchAddCase : public IRChangeBase {
   SwitchInst *Switch;
   ConstantInt *Val;
 
@@ -359,7 +361,7 @@ class SwitchAddCase : public IRChangeBase {
 #endif // NDEBUG
 };
 
-class SwitchRemoveCase : public IRChangeBase {
+class LLVM_ABI SwitchRemoveCase : public IRChangeBase {
   SwitchInst *Switch;
   struct Case {
     ConstantInt *Val;
@@ -378,7 +380,7 @@ class SwitchRemoveCase : public IRChangeBase {
 #endif // NDEBUG
 };
 
-class MoveInstr : public IRChangeBase {
+class LLVM_ABI MoveInstr : public IRChangeBase {
   /// The instruction that moved.
   Instruction *MovedI;
   /// This is either the next instruction in the block, or the parent BB if at
@@ -395,7 +397,7 @@ class MoveInstr : public IRChangeBase {
 #endif // NDEBUG
 };
 
-class InsertIntoBB final : public IRChangeBase {
+class LLVM_ABI InsertIntoBB final : public IRChangeBase {
   Instruction *InsertedI = nullptr;
 
 public:
@@ -408,7 +410,7 @@ class InsertIntoBB final : public IRChangeBase {
 #endif // NDEBUG
 };
 
-class CreateAndInsertInst final : public IRChangeBase {
+class LLVM_ABI CreateAndInsertInst final : public IRChangeBase {
   Instruction *NewI = nullptr;
 
 public:
@@ -421,7 +423,7 @@ class CreateAndInsertInst final : public IRChangeBase {
 #endif
 };
 
-class ShuffleVectorSetMask final : public IRChangeBase {
+class LLVM_ABI ShuffleVectorSetMask final : public IRChangeBase {
   ShuffleVectorInst *SVI;
   SmallVector<int, 8> PrevMask;
 
@@ -472,7 +474,7 @@ class Tracker {
   {
   }
 
-  ~Tracker();
+  LLVM_ABI ~Tracker();
   Context &getContext() const { return Ctx; }
   /// \Returns true if there are no changes tracked.
   bool empty() const { return Changes.empty(); }
@@ -506,11 +508,11 @@ class Tracker {
   /// \Returns the current state of the tracker.
   TrackerState getState() const { return State; }
   /// Turns on IR tracking.
-  void save();
+  LLVM_ABI void save();
   /// Stops tracking and accept changes.
-  void accept();
+  LLVM_ABI void accept();
   /// Stops tracking and reverts to saved state.
-  void revert();
+  LLVM_ABI void revert();
 
 #ifndef NDEBUG
   void dump(raw_ostream &OS) const;
diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h
index f90ae096443b5..d9c5e6c098dad 100644
--- a/llvm/include/llvm/SandboxIR/Type.h
+++ b/llvm/include/llvm/SandboxIR/Type.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -264,18 +265,18 @@ class Type {
 
   /// If this is a vector type, return the element type, otherwise return
   /// 'this'.
-  Type *getScalarType() const;
+  LLVM_ABI Type *getScalarType() const;
 
   // TODO: ADD MISSING
 
-  static Type *getInt64Ty(Context &Ctx);
-  static Type *getInt32Ty(Context &Ctx);
-  static Type *getInt16Ty(Context &Ctx);
-  static Type *getInt8Ty(Context &Ctx);
-  static Type *getInt1Ty(Context &Ctx);
-  static Type *getDoubleTy(Context &Ctx);
-  static Type *getFloatTy(Context &Ctx);
-  static Type *getHalfTy(Context &Ctx);
+  LLVM_ABI static Type *getInt64Ty(Context &Ctx);
+  LLVM_ABI static Type *getInt32Ty(Context &Ctx);
+  LLVM_ABI static Type *getInt16Ty(Context &Ctx);
+  LLVM_ABI static Type *getInt8Ty(Context &Ctx);
+  LLVM_ABI static Type *getInt1Ty(Context &Ctx);
+  LLVM_ABI static Type *getDoubleTy(Context &Ctx);
+  LLVM_ABI static Type *getFloatTy(Context &Ctx);
+  LLVM_ABI static Type *getHalfTy(Context &Ctx);
   // TODO: missing get*
 
   /// Get the address space of this pointer or pointer vector type.
@@ -293,7 +294,7 @@ class PointerType : public Type {
 public:
   // TODO: add missing functions
 
-  static PointerType *get(Context &Ctx, unsigned AddressSpace);
+  LLVM_ABI static PointerType *get(Context &Ctx, unsigned AddressSpace);
 
   static bool classof(const Type *From) {
     return isa<llvm::PointerType>(From->LLVMTy);
@@ -302,7 +303,7 @@ class PointerType : public Type {
 
 class ArrayType : public Type {
 public:
-  static ArrayType *get(Type *ElementType, uint64_t NumElements);
+  LLVM_ABI static ArrayType *get(Type *ElementType, uint64_t NumElements);
   // TODO: add missing functions
   static bool classof(const Type *From) {
     return isa<llvm::ArrayType>(From->LLVMTy);
@@ -312,8 +313,8 @@ class ArrayType : public Type {
 class StructType : public Type {
 public:
   /// This static method is the primary way to create a literal StructType.
-  static StructType *get(Context &Ctx, ArrayRef<Type *> Elements,
-                         bool IsPacked = false);
+  LLVM_ABI static StructType *get(Context &Ctx, ArrayRef<Type *> Elements,
+                                  bool IsPacked = false);
 
   bool isPacked() const { return cast<llvm::StructType>(LLVMTy)->isPacked(); }
 
@@ -325,13 +326,13 @@ class StructType : public Type {
 
 class VectorType : public Type {
 public:
-  static VectorType *get(Type *ElementType, ElementCount EC);
+  LLVM_ABI static VectorType *get(Type *ElementType, ElementCount EC);
   static VectorType *get(Type *ElementType, unsigned NumElements,
                          bool Scalable) {
     return VectorType::get(ElementType,
                            ElementCount::get(NumElements, Scalable));
   }
-  Type *getElementType() const;
+  LLVM_ABI Type *getElementType() const;
 
   static VectorType *get(Type *ElementType, const VectorType *Other) {
     return VectorType::get(ElementType, Other->getElementCount());
@@ -340,13 +341,14 @@ class VectorType : public Type {
   inline ElementCount getElementCount() const {
     return cast<llvm::VectorType>(LLVMTy)->getElementCount();
   }
-  static VectorType *getInteger(VectorType *VTy);
-  static VectorType *getExtendedElementVectorType(VectorType *VTy);
-  static VectorType *getTruncatedElementVectorType(VectorType *VTy);
-  static VectorType *getSubdividedVectorType(VectorType *VTy, int NumSubdivs);
-  static VectorType *getHalfElementsVectorType(VectorType *VTy);
-  static VectorType *getDoubleElementsVectorType(VectorType *VTy);
-  static bool isValidElementType(Type *ElemTy);
+  LLVM_ABI static VectorType *getInteger(VectorType *VTy);
+  LLVM_ABI static VectorType *getExtendedElementVectorType(VectorType *VTy);
+  LLVM_ABI static VectorType *getTruncatedElementVectorType(VectorType *VTy);
+  LLVM_ABI static VectorType *getSubdividedVectorType(VectorType *VTy,
+                                                      int NumSubdivs);
+  LLVM_ABI static VectorType *getHalfElementsVectorType(VectorType *VTy);
+  LLVM_ABI static VectorType *getDoubleElementsVectorType(VectorType *VTy);
+  LLVM_ABI static bool isValidElementType(Type *ElemTy);
 
   static bool classof(const Type *From) {
     return isa<llvm::VectorType>(From->LLVMTy);
@@ -355,7 +357,7 @@ class VectorType : public Type {
 
 class FixedVectorType : public VectorType {
 public:
-  static FixedVectorType *get(Type *ElementType, unsigned NumElts);
+  LLVM_ABI static FixedVectorType *get(Type *ElementType, unsigned NumElts);
 
   static FixedVectorType *get(Type *ElementType, const FixedVectorType *FVTy) {
     return get(ElementType, FVTy->getNumElements());
@@ -399,7 +401,8 @@ class FixedVectorType : public VectorType {
 
 class ScalableVectorType : public VectorType {
 public:
-  static ScalableVectorType *get(Type *ElementType, unsigned MinNumElts);
+  LLVM_ABI static ScalableVectorType *get(Type *ElementType,
+                                          unsigned MinNumElts);
 
   static ScalableVectorType *get(Type *ElementType,
                                  const ScalableVectorType *SVTy) {
@@ -462,7 +465,7 @@ class FunctionType : public Type {
 /// Integer representation type
 class IntegerType : public Type {
 public:
-  static IntegerType *get(Context &C, unsigned NumBits);
+  LLVM_ABI static IntegerType *get(Context &C, unsigned NumBits);
   // TODO: add missing functions
   static bool classof(const Type *From) {
     return isa<llvm::IntegerType>(From->LLVMTy);
diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h
index c4a774aa3a89e..5c02c4f2b3495 100644
--- a/llvm/include/llvm/SandboxIR/Use.h
+++ b/llvm/include/llvm/SandboxIR/Use.h
@@ -14,6 +14,7 @@
 #define LLVM_SANDBOXIR_USE_H
 
 #include "llvm/IR/Use.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace llvm::sandboxir {
@@ -49,11 +50,11 @@ class Use {
 
 public:
   operator Value *() const { return get(); }
-  Value *get() const;
-  void set(Value *V);
+  LLVM_ABI Value *get() const;
+  LLVM_ABI void set(Value *V);
   class User *getUser() const { return Usr; }
-  unsigned getOperandNo() const;
-  void swap(Use &OtherUse);
+  LLVM_ABI unsigned getOperandNo() const;
+  LLVM_ABI void swap(Use &OtherUse);
   Context *getContext() const { return Ctx; }
   bool operator==(const Use &Other) const {
     assert(Ctx == Other.Ctx && "Contexts differ!");
diff --git a/llvm/include/llvm/SandboxIR/User.h b/llvm/include/llvm/SandboxIR/User.h
index 80e672de34905..c552e2e3378be 100644
--- a/llvm/include/llvm/SandboxIR/User.h
+++ b/llvm/include/llvm/SandboxIR/User.h
@@ -13,6 +13,7 @@
 #include "llvm/IR/Value.h"
 #include "llvm/SandboxIR/Use.h"
 #include "llvm/SandboxIR/Value.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -36,8 +37,8 @@ class OperandUseIterator {
   using iterator_category = std::input_iterator_tag;
 
   OperandUseIterator() = default;
-  value_type operator*() const;
-  OperandUseIterator &operator++();
+  LLVM_ABI value_type operator*() const;
+  LLVM_ABI OperandUseIterator &operator++();
   OperandUseIterator operator++(int) {
     auto Copy = *this;
     this->operator++();
@@ -49,13 +50,13 @@ class OperandUseIterator {
   bool operator!=(const OperandUseIterator &Other) const {
     return !(*this == Other);
   }
-  OperandUseIterator operator+(unsigned Num) const;
-  OperandUseIterator operator-(unsigned Num) const;
-  int operator-(const OperandUseIterator &Other) const;
+  LLVM_ABI OperandUseIterator operator+(unsigned Num) const;
+  LLVM_ABI OperandUseIterator operator-(unsigned Num) const;
+  LLVM_ABI int operator-(const OperandUseIterator &Other) const;
 };
 
 /// A sandboxir::User has operands.
-class User : public Value {
+class LLVM_ABI User : public Value {
 protected:
   User(ClassID ID, llvm::Value *V, Context &Ctx) : Value(ID, V, Ctx) {}
 
diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h
index dbd0208b4f3f3..dd0bc76db3e37 100644
--- a/llvm/include/llvm/SandboxIR/Value.h
+++ b/llvm/include/llvm/SandboxIR/Value.h
@@ -12,6 +12,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Value.h"
 #include "llvm/SandboxIR/Use.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm::sandboxir {
 
@@ -50,7 +51,7 @@ class UserUseIterator {
 
   UserUseIterator() = default;
   value_type operator*() const { return Use; }
-  UserUseIterator &operator++();
+  LLVM_ABI UserUseIterator &operator++();
   bool operator==(const UserUseIterator &Other) const {
     return Use == Other.Use;
   }
@@ -179,7 +180,7 @@ class Value {
   void clearValue() { Val = nullptr; }
   template <typename ItTy, typename SBTy> friend class LLVMOpUserItToSBTy;
 
-  Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx);
+  LLVM_ABI Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx);
   /// Disable copies.
   Value(const Value &) = delete;
   Value &operator=(const Value &) = delete;
@@ -191,7 +192,7 @@ class Value {
   using use_iterator = UserUseIterator;
   using const_use_iterator = UserUseIterator;
 
-  use_iterator use_begin();
+  LLVM_ABI use_iterator use_begin();
   const_use_iterator use_begin() const {
     return const_cast<Value *>(this)->use_begin();
   }
@@ -215,7 +216,7 @@ class Value {
   using user_iterator = mapped_iterator<sandboxir::UserUseIterator, UseToUser>;
   using const_user_iterator = user_iterator;
 
-  user_iterator user_begin();
+  LLVM_ABI user_iterator user_begin();
   user_iterator user_end() {
     return user_iterator(Use(nullptr, nullptr, Ctx), UseToUser());
   }
@@ -234,7 +235,7 @@ class Value {
   }
   /// \Returns the number of user edges (not necessarily to unique users).
   /// WARNING: This is a linear-time operation.
-  unsigned getNumUses() const;
+  LLVM_ABI unsigned getNumUses() const;
   /// Return true if this value has N uses or more.
   /// This is logically equivalent to getNumUses() >= N.
   /// WARNING: This can be expensive, as it is linear to the number of users.
@@ -256,13 +257,14 @@ class Value {
     return Cnt == Num;
   }
 
-  Type *getType() const;
+  LLVM_ABI Type *getType() const;
 
   Context &getContext() const { return Ctx; }
 
-  void replaceUsesWithIf(Value *OtherV,
-                         llvm::function_ref<bool(const Use &)> ShouldReplace);
-  void replaceAllUsesWith(Value *Other);
+  LLVM_ABI void
+  replaceUsesWithIf(Value *OtherV,
+                    llvm::function_ref<bool(const Use &)> ShouldReplace);
+  LLVM_ABI void replaceAllUsesWith(Value *Other);
 
   /// \Returns the LLVM IR name of the bottom-most LLVM value.
   StringRef getName() const { return Val->getName(); }
diff --git a/llvm/lib/SandboxIR/Constant.cpp b/llvm/lib/SandboxIR/Constant.cpp
index 82cf0876d5800..9de88ef2cf0a0 100644
--- a/llvm/lib/SandboxIR/Constant.cpp
+++ b/llvm/lib/SandboxIR/Constant.cpp
@@ -305,35 +305,14 @@ GlobalT &GlobalWithNodeAPI<GlobalT, LLVMGlobalT, ParentT, LLVMParentT>::
 }
 
 // Explicit instantiations.
-template class GlobalWithNodeAPI<GlobalIFunc, llvm::GlobalIFunc, GlobalObject,
-                                 llvm::GlobalObject>;
-template class GlobalWithNodeAPI<Function, llvm::Function, GlobalObject,
-                                 llvm::GlobalObject>;
-template class GlobalWithNodeAPI<GlobalVariable, llvm::GlobalVariable,
-                                 GlobalObject, llvm::GlobalObject>;
-template class GlobalWithNodeAPI<GlobalAlias, llvm::GlobalAlias, GlobalValue,
-                                 llvm::GlobalValue>;
-
-#if defined(_MSC_VER) && !defined(__clang__)
-// These are needed for SandboxIRTest when building with LLVM_BUILD_LLVM_DYLIB
-template LLVM_EXPORT_TEMPLATE GlobalIFunc &
-GlobalWithNodeAPI<GlobalIFunc, llvm::GlobalIFunc, GlobalObject,
-                  llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalIFunc
-                                                                  &LLVMGV)
-    const;
-template LLVM_EXPORT_TEMPLATE Function &
-GlobalWithNodeAPI<Function, llvm::Function, GlobalObject, llvm::GlobalObject>::
-    LLVMGVToGV::operator()(llvm::Function &LLVMGV) const;
-
-template LLVM_EXPORT_TEMPLATE GlobalVariable &GlobalWithNodeAPI<
-    GlobalVariable, llvm::GlobalVariable, GlobalObject,
-    llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalVariable &LLVMGV)
-    const;
-template LLVM_EXPORT_TEMPLATE GlobalAlias &
-GlobalWithNodeAPI<GlobalAlias, llvm::GlobalAlias, GlobalValue,
-                  llvm::GlobalValue>::LLVMGVToGV::operator()(llvm::GlobalAlias
-                                                                 &LLVMGV) const;
-#endif
+template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI<
+    GlobalIFunc, llvm::GlobalIFunc, GlobalObject, llvm::GlobalObject>;
+template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI<
+    Function, llvm::Function, GlobalObject, llvm::GlobalObject>;
+template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI<
+    GlobalVariable, llvm::GlobalVariable, GlobalObject, llvm::GlobalObject>;
+template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI<
+    GlobalAlias, llvm::GlobalAlias, GlobalValue, llvm::GlobalValue>;
 
 void GlobalIFunc::setResolver(Constant *Resolver) {
   Ctx.getTracker()

From 2652d1b2fd65950a66f37ed6d5ed9c4ffabacbee Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Wed, 11 Jun 2025 09:19:47 -0700
Subject: [PATCH 100/851] [llvm] annotate interfaces in llvm/TextAPI for DLL
 export (#143447)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/TextAPI` library.
These annotations currently have no meaningful impact on the LLVM build;
however, they are a prerequisite to support an LLVM Windows DLL (shared
library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

These changes were generated automatically using the [Interface
Definition Scanner (IDS)](https://github.com/compnerd/ids) tool,
followed formatting with `git clang-format`.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/TextAPI/Architecture.h    | 17 ++++---
 llvm/include/llvm/TextAPI/ArchitectureSet.h | 13 +++---
 llvm/include/llvm/TextAPI/DylibReader.h     | 10 ++--
 llvm/include/llvm/TextAPI/InterfaceFile.h   | 34 +++++++-------
 llvm/include/llvm/TextAPI/PackedVersion.h   |  9 ++--
 llvm/include/llvm/TextAPI/Platform.h        | 17 +++----
 llvm/include/llvm/TextAPI/Record.h          | 18 +++----
 llvm/include/llvm/TextAPI/RecordVisitor.h   |  5 +-
 llvm/include/llvm/TextAPI/RecordsSlice.h    | 52 ++++++++++++---------
 llvm/include/llvm/TextAPI/Symbol.h          |  8 ++--
 llvm/include/llvm/TextAPI/SymbolSet.h       | 11 +++--
 llvm/include/llvm/TextAPI/Target.h          | 15 +++---
 llvm/include/llvm/TextAPI/TextAPIError.h    |  3 +-
 llvm/include/llvm/TextAPI/TextAPIReader.h   |  5 +-
 llvm/include/llvm/TextAPI/TextAPIWriter.h   |  8 ++--
 llvm/include/llvm/TextAPI/Utils.h           | 21 +++++----
 16 files changed, 138 insertions(+), 108 deletions(-)

diff --git a/llvm/include/llvm/TextAPI/Architecture.h b/llvm/include/llvm/TextAPI/Architecture.h
index 978359995074b..7a7f5416fe7c7 100644
--- a/llvm/include/llvm/TextAPI/Architecture.h
+++ b/llvm/include/llvm/TextAPI/Architecture.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TEXTAPI_ARCHITECTURE_H
 #define LLVM_TEXTAPI_ARCHITECTURE_H
 
+#include "llvm/Support/Compiler.h"
 #include <cstdint>
 #include <utility>
 
@@ -32,24 +33,26 @@ enum Architecture : uint8_t {
 };
 
 /// Convert a CPU Type and Subtype pair to an architecture slice.
-Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType);
+LLVM_ABI Architecture getArchitectureFromCpuType(uint32_t CPUType,
+                                                 uint32_t CPUSubType);
 
 /// Convert a name to an architecture slice.
-Architecture getArchitectureFromName(StringRef Name);
+LLVM_ABI Architecture getArchitectureFromName(StringRef Name);
 
 /// Convert an architecture slice to a string.
-StringRef getArchitectureName(Architecture Arch);
+LLVM_ABI StringRef getArchitectureName(Architecture Arch);
 
 /// Convert an architecture slice to a CPU Type and Subtype pair.
-std::pair<uint32_t, uint32_t> getCPUTypeFromArchitecture(Architecture Arch);
+LLVM_ABI std::pair<uint32_t, uint32_t>
+getCPUTypeFromArchitecture(Architecture Arch);
 
 /// Convert a target to an architecture slice.
-Architecture mapToArchitecture(const llvm::Triple &Target);
+LLVM_ABI Architecture mapToArchitecture(const llvm::Triple &Target);
 
 /// Check if architecture is 64 bit.
-bool is64Bit(Architecture);
+LLVM_ABI bool is64Bit(Architecture);
 
-raw_ostream &operator<<(raw_ostream &OS, Architecture Arch);
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, Architecture Arch);
 
 } // end namespace MachO.
 } // end namespace llvm.
diff --git a/llvm/include/llvm/TextAPI/ArchitectureSet.h b/llvm/include/llvm/TextAPI/ArchitectureSet.h
index 2cce9dbf0d80c..a7d3394c99821 100644
--- a/llvm/include/llvm/TextAPI/ArchitectureSet.h
+++ b/llvm/include/llvm/TextAPI/ArchitectureSet.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TEXTAPI_ARCHITECTURESET_H
 #define LLVM_TEXTAPI_ARCHITECTURESET_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/Architecture.h"
 #include <cstddef>
 #include <iterator>
@@ -38,7 +39,7 @@ class ArchitectureSet {
   constexpr ArchitectureSet() = default;
   constexpr ArchitectureSet(ArchSetType Raw) : ArchSet(Raw) {}
   ArchitectureSet(Architecture Arch) : ArchitectureSet() { set(Arch); }
-  ArchitectureSet(const std::vector<Architecture> &Archs);
+  LLVM_ABI ArchitectureSet(const std::vector<Architecture> &Archs);
 
   static ArchitectureSet All() { return ArchitectureSet(EndIndexVal); }
 
@@ -61,7 +62,7 @@ class ArchitectureSet {
     return (ArchSet & Archs.ArchSet) == Archs.ArchSet;
   }
 
-  size_t count() const;
+  LLVM_ABI size_t count() const;
 
   bool empty() const { return ArchSet == 0; }
 
@@ -158,9 +159,9 @@ class ArchitectureSet {
   const_iterator begin() const { return {&ArchSet}; }
   const_iterator end() const { return {&ArchSet, EndIndexVal}; }
 
-  operator std::string() const;
-  operator std::vector<Architecture>() const;
-  void print(raw_ostream &OS) const;
+  LLVM_ABI operator std::string() const;
+  LLVM_ABI operator std::vector<Architecture>() const;
+  LLVM_ABI void print(raw_ostream &OS) const;
 };
 
 inline ArchitectureSet operator|(const Architecture &lhs,
@@ -168,7 +169,7 @@ inline ArchitectureSet operator|(const Architecture &lhs,
   return ArchitectureSet(lhs) | ArchitectureSet(rhs);
 }
 
-raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set);
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set);
 
 } // end namespace MachO.
 } // end namespace llvm.
diff --git a/llvm/include/llvm/TextAPI/DylibReader.h b/llvm/include/llvm/TextAPI/DylibReader.h
index 6861d3cb1591b..f3a806d78df78 100644
--- a/llvm/include/llvm/TextAPI/DylibReader.h
+++ b/llvm/include/llvm/TextAPI/DylibReader.h
@@ -14,6 +14,7 @@
 #define LLVM_TEXTAPI_DYLIBREADER_H
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
@@ -37,20 +38,21 @@ struct ParseOption {
 /// \param Buffer Data that points to dylib.
 /// \param Options Determines which attributes to extract.
 /// \return List of record slices.
-Expected<Records> readFile(MemoryBufferRef Buffer, const ParseOption &Opt);
+LLVM_ABI Expected<Records> readFile(MemoryBufferRef Buffer,
+                                    const ParseOption &Opt);
 
 /// Get TAPI file representation of binary dylib.
 ///
 /// \param Buffer Data that points to dylib.
-Expected<std::unique_ptr<InterfaceFile>> get(MemoryBufferRef Buffer);
+LLVM_ABI Expected<std::unique_ptr<InterfaceFile>> get(MemoryBufferRef Buffer);
 
 using SymbolToSourceLocMap = llvm::StringMap<RecordLoc>;
 /// Get the source location for each symbol from dylib.
 ///
 /// \param DSYM Path to DSYM file.
 /// \param T Requested target slice for dylib.
-SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM,
-                                                 const Target &T);
+LLVM_ABI SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM,
+                                                          const Target &T);
 
 } // namespace llvm::MachO::DylibReader
 
diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h
index 23c27cb0f4745..747c8d0a208c5 100644
--- a/llvm/include/llvm/TextAPI/InterfaceFile.h
+++ b/llvm/include/llvm/TextAPI/InterfaceFile.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/FileTypes.h"
 #include "llvm/TextAPI/PackedVersion.h"
@@ -60,7 +61,7 @@ class InterfaceFileRef {
 
   StringRef getInstallName() const { return InstallName; };
 
-  void addTarget(const Target &Target);
+  LLVM_ABI void addTarget(const Target &Target);
   template <typename RangeT> void addTargets(RangeT &&Targets) {
     for (const auto &Target : Targets)
       addTarget(Target(Target));
@@ -146,7 +147,7 @@ class InterfaceFile {
   /// Set and add target.
   ///
   /// \param Target the target to add into.
-  void addTarget(const Target &Target);
+  LLVM_ABI void addTarget(const Target &Target);
 
   /// Determine if target triple slice exists in file.
   ///
@@ -174,7 +175,7 @@ class InterfaceFile {
                             std::function<bool(const Target &)>>;
   using const_filtered_target_range =
       llvm::iterator_range<const_filtered_target_iterator>;
-  const_filtered_target_range targets(ArchitectureSet Archs) const;
+  LLVM_ABI const_filtered_target_range targets(ArchitectureSet Archs) const;
 
   /// Set the install name of the library.
   void setInstallName(StringRef InstallName_) {
@@ -241,7 +242,7 @@ class InterfaceFile {
   /// Set the parent umbrella frameworks.
   /// \param Target_ The target applicable to Parent
   /// \param Parent  The name of Parent
-  void addParentUmbrella(const Target &Target_, StringRef Parent);
+  LLVM_ABI void addParentUmbrella(const Target &Target_, StringRef Parent);
 
   /// Get the list of Parent Umbrella frameworks.
   ///
@@ -261,7 +262,7 @@ class InterfaceFile {
   /// \param InstallName The name of the client that is allowed to link this
   /// library.
   /// \param Target The target triple for which this applies.
-  void addAllowableClient(StringRef InstallName, const Target &Target);
+  LLVM_ABI void addAllowableClient(StringRef InstallName, const Target &Target);
 
   /// Get the list of allowable clients.
   ///
@@ -274,7 +275,8 @@ class InterfaceFile {
   ///
   /// \param InstallName The name of the library to re-export.
   /// \param Target The target triple for which this applies.
-  void addReexportedLibrary(StringRef InstallName, const Target &Target);
+  LLVM_ABI void addReexportedLibrary(StringRef InstallName,
+                                     const Target &Target);
 
   /// Get the list of re-exported libraries.
   ///
@@ -286,7 +288,7 @@ class InterfaceFile {
   /// Add a library for inlining to top level library.
   ///
   ///\param Document The library to inline with top level library.
-  void addDocument(std::shared_ptr<InterfaceFile> &&Document);
+  LLVM_ABI void addDocument(std::shared_ptr<InterfaceFile> &&Document);
 
   /// Returns the pointer to parent document if exists or nullptr otherwise.
   InterfaceFile *getParent() const { return Parent; }
@@ -301,7 +303,7 @@ class InterfaceFile {
   /// Set the runpath search paths.
   /// \param RPath The name of runpath.
   /// \param InputTarget The target applicable to runpath search path.
-  void addRPath(StringRef RPath, const Target &InputTarget);
+  LLVM_ABI void addRPath(StringRef RPath, const Target &InputTarget);
 
   /// Get the list of runpath search paths.
   ///
@@ -373,14 +375,14 @@ class InterfaceFile {
   ///
   /// \param Arch architecture to extract from.
   /// \return New InterfaceFile with extracted architecture slice.
-  llvm::Expected<std::unique_ptr<InterfaceFile>>
+  LLVM_ABI llvm::Expected<std::unique_ptr<InterfaceFile>>
   extract(Architecture Arch) const;
 
   /// Remove architecture slice from Interface.
   ///
   /// \param Arch architecture to remove.
   /// \return New Interface File with removed architecture slice.
-  llvm::Expected<std::unique_ptr<InterfaceFile>>
+  LLVM_ABI llvm::Expected<std::unique_ptr<InterfaceFile>>
   remove(Architecture Arch) const;
 
   /// Merge Interfaces for the same library. The following library attributes
@@ -390,29 +392,29 @@ class InterfaceFile {
   ///
   /// \param O The Interface to merge.
   /// \return New Interface File that was merged.
-  llvm::Expected<std::unique_ptr<InterfaceFile>>
+  LLVM_ABI llvm::Expected<std::unique_ptr<InterfaceFile>>
   merge(const InterfaceFile *O) const;
 
   /// Inline reexported library into Interface.
   ///
   /// \param Library Interface of reexported library.
   /// \param Overwrite Whether to overwrite preexisting inlined library.
-  void inlineLibrary(std::shared_ptr<InterfaceFile> Library,
-                     bool Overwrite = false);
+  LLVM_ABI void inlineLibrary(std::shared_ptr<InterfaceFile> Library,
+                              bool Overwrite = false);
 
   /// Set InterfaceFile properties from pre-gathered binary attributes,
   /// if they are not set already.
   ///
   /// \param BA Attributes typically represented in load commands.
   /// \param Targ MachO Target slice to add attributes to.
-  void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA,
-                          const Target &Targ);
+  LLVM_ABI void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA,
+                                   const Target &Targ);
 
   /// The equality is determined by attributes that impact linking
   /// compatibilities. Path, & FileKind are irrelevant since these by
   /// itself should not impact linking.
   /// This is an expensive operation.
-  bool operator==(const InterfaceFile &O) const;
+  LLVM_ABI bool operator==(const InterfaceFile &O) const;
 
   bool operator!=(const InterfaceFile &O) const { return !(*this == O); }
 
diff --git a/llvm/include/llvm/TextAPI/PackedVersion.h b/llvm/include/llvm/TextAPI/PackedVersion.h
index e680d40c71044..cabe365e6d97a 100644
--- a/llvm/include/llvm/TextAPI/PackedVersion.h
+++ b/llvm/include/llvm/TextAPI/PackedVersion.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TEXTAPI_PACKEDVERSION_H
 #define LLVM_TEXTAPI_PACKEDVERSION_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/VersionTuple.h"
 #include <cstdint>
 #include <string>
@@ -53,8 +54,8 @@ class PackedVersion {
   /// Retrieve the subminor version number, if provided.
   unsigned getSubminor() const { return Version & 0xff; }
 
-  bool parse32(StringRef Str);
-  std::pair<bool, bool> parse64(StringRef Str);
+  LLVM_ABI bool parse32(StringRef Str);
+  LLVM_ABI std::pair<bool, bool> parse64(StringRef Str);
 
   bool operator<(const PackedVersion &O) const { return Version < O.Version; }
 
@@ -64,9 +65,9 @@ class PackedVersion {
 
   uint32_t rawValue() const { return Version; }
 
-  operator std::string() const;
+  LLVM_ABI operator std::string() const;
 
-  void print(raw_ostream &OS) const;
+  LLVM_ABI void print(raw_ostream &OS) const;
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, const PackedVersion &Version) {
diff --git a/llvm/include/llvm/TextAPI/Platform.h b/llvm/include/llvm/TextAPI/Platform.h
index d828d9ac49f65..8ea187acc02f9 100644
--- a/llvm/include/llvm/TextAPI/Platform.h
+++ b/llvm/include/llvm/TextAPI/Platform.h
@@ -14,6 +14,7 @@
 
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/VersionTuple.h"
 
 namespace llvm {
@@ -22,14 +23,14 @@ namespace MachO {
 using PlatformSet = SmallSet<PlatformType, 3>;
 using PlatformVersionSet = SmallSet<std::pair<PlatformType, VersionTuple>, 3>;
 
-PlatformType mapToPlatformType(PlatformType Platform, bool WantSim);
-PlatformType mapToPlatformType(const Triple &Target);
-PlatformSet mapToPlatformSet(ArrayRef<Triple> Targets);
-StringRef getPlatformName(PlatformType Platform);
-PlatformType getPlatformFromName(StringRef Name);
-std::string getOSAndEnvironmentName(PlatformType Platform,
-                                    std::string Version = "");
-VersionTuple mapToSupportedOSVersion(const Triple &Triple);
+LLVM_ABI PlatformType mapToPlatformType(PlatformType Platform, bool WantSim);
+LLVM_ABI PlatformType mapToPlatformType(const Triple &Target);
+LLVM_ABI PlatformSet mapToPlatformSet(ArrayRef<Triple> Targets);
+LLVM_ABI StringRef getPlatformName(PlatformType Platform);
+LLVM_ABI PlatformType getPlatformFromName(StringRef Name);
+LLVM_ABI std::string getOSAndEnvironmentName(PlatformType Platform,
+                                             std::string Version = "");
+LLVM_ABI VersionTuple mapToSupportedOSVersion(const Triple &Triple);
 
 } // end namespace MachO.
 } // end namespace llvm.
diff --git a/llvm/include/llvm/TextAPI/Record.h b/llvm/include/llvm/TextAPI/Record.h
index 7d721988ec3da..6e470d97325fd 100644
--- a/llvm/include/llvm/TextAPI/Record.h
+++ b/llvm/include/llvm/TextAPI/Record.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/Symbol.h"
 #include <string>
 
@@ -104,7 +105,7 @@ class Record {
   SymbolFlags getFlags() const { return Flags; }
 
 private:
-  SymbolFlags mergeFlags(SymbolFlags Flags, RecordLinkage Linkage);
+  LLVM_ABI SymbolFlags mergeFlags(SymbolFlags Flags, RecordLinkage Linkage);
 
 protected:
   StringRef Name;
@@ -164,9 +165,9 @@ class ObjCContainerRecord : public Record {
   ObjCContainerRecord(StringRef Name, RecordLinkage Linkage)
       : Record({Name, Linkage, SymbolFlags::Data}) {}
 
-  ObjCIVarRecord *addObjCIVar(StringRef IVar, RecordLinkage Linkage);
-  ObjCIVarRecord *findObjCIVar(StringRef IVar) const;
-  std::vector<ObjCIVarRecord *> getObjCIVars() const;
+  LLVM_ABI ObjCIVarRecord *addObjCIVar(StringRef IVar, RecordLinkage Linkage);
+  LLVM_ABI ObjCIVarRecord *findObjCIVar(StringRef IVar) const;
+  LLVM_ABI std::vector<ObjCIVarRecord *> getObjCIVars() const;
   RecordLinkage getLinkage() const { return Linkage; }
 
 private:
@@ -207,11 +208,12 @@ class ObjCInterfaceRecord : public ObjCContainerRecord {
     return getLinkageForSymbol(CurrType) >= RecordLinkage::Rexported;
   }
 
-  RecordLinkage getLinkageForSymbol(ObjCIFSymbolKind CurrType) const;
-  void updateLinkageForSymbols(ObjCIFSymbolKind SymType, RecordLinkage Link);
+  LLVM_ABI RecordLinkage getLinkageForSymbol(ObjCIFSymbolKind CurrType) const;
+  LLVM_ABI void updateLinkageForSymbols(ObjCIFSymbolKind SymType,
+                                        RecordLinkage Link);
 
-  bool addObjCCategory(ObjCCategoryRecord *Record);
-  std::vector<ObjCCategoryRecord *> getObjCCategories() const;
+  LLVM_ABI bool addObjCCategory(ObjCCategoryRecord *Record);
+  LLVM_ABI std::vector<ObjCCategoryRecord *> getObjCCategories() const;
 
 private:
   /// Linkage level for each symbol represented in ObjCInterfaceRecord.
diff --git a/llvm/include/llvm/TextAPI/RecordVisitor.h b/llvm/include/llvm/TextAPI/RecordVisitor.h
index 34e43f5b0027f..65bc96df244d7 100644
--- a/llvm/include/llvm/TextAPI/RecordVisitor.h
+++ b/llvm/include/llvm/TextAPI/RecordVisitor.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_TEXTAPI_RECORDVISITOR_H
 #define LLVM_TEXTAPI_RECORDVISITOR_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/Record.h"
 #include "llvm/TextAPI/SymbolSet.h"
 
@@ -20,7 +21,7 @@ namespace llvm {
 namespace MachO {
 
 /// Base class for any usage of traversing over collected Records.
-class RecordVisitor {
+class LLVM_ABI RecordVisitor {
 public:
   virtual ~RecordVisitor();
 
@@ -32,7 +33,7 @@ class RecordVisitor {
 /// Specialized RecordVisitor for collecting exported symbols
 /// and undefined symbols if RecordSlice being visited represents a
 /// flat-namespaced library.
-class SymbolConverter : public RecordVisitor {
+class LLVM_ABI SymbolConverter : public RecordVisitor {
 public:
   SymbolConverter(SymbolSet *Symbols, const Target &T,
                   const bool RecordUndefs = false)
diff --git a/llvm/include/llvm/TextAPI/RecordsSlice.h b/llvm/include/llvm/TextAPI/RecordsSlice.h
index f934cf7607f1f..6ecb79a115aea 100644
--- a/llvm/include/llvm/TextAPI/RecordsSlice.h
+++ b/llvm/include/llvm/TextAPI/RecordsSlice.h
@@ -15,6 +15,7 @@
 #define LLVM_TEXTAPI_RECORDSLICE_H
 
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/FileTypes.h"
 #include "llvm/TextAPI/PackedVersion.h"
 #include "llvm/TextAPI/Record.h"
@@ -43,9 +44,10 @@ class RecordsSlice {
   /// symbol.
   /// \param Linkage The linkage of symbol.
   /// \return The non-owning pointer to added record in slice.
-  Record *addRecord(StringRef Name, SymbolFlags Flags,
-                    GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown,
-                    RecordLinkage Linkage = RecordLinkage::Unknown);
+  LLVM_ABI Record *
+  addRecord(StringRef Name, SymbolFlags Flags,
+            GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown,
+            RecordLinkage Linkage = RecordLinkage::Unknown);
 
   /// Add non-ObjC global record.
   ///
@@ -56,10 +58,10 @@ class RecordsSlice {
   /// \param Inlined Whether declaration is inlined, only applicable to
   /// functions.
   /// \return The non-owning pointer to added record in slice.
-  GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage,
-                          GlobalRecord::Kind GV,
-                          SymbolFlags Flags = SymbolFlags::None,
-                          bool Inlined = false);
+  LLVM_ABI GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage,
+                                   GlobalRecord::Kind GV,
+                                   SymbolFlags Flags = SymbolFlags::None,
+                                   bool Inlined = false);
 
   /// Add ObjC Class record.
   ///
@@ -67,8 +69,9 @@ class RecordsSlice {
   /// \param Linkage The linkage of symbol.
   /// \param SymType The symbols this class represents.
   /// \return The non-owning pointer to added record in slice.
-  ObjCInterfaceRecord *addObjCInterface(StringRef Name, RecordLinkage Linkage,
-                                        ObjCIFSymbolKind SymType);
+  LLVM_ABI ObjCInterfaceRecord *addObjCInterface(StringRef Name,
+                                                 RecordLinkage Linkage,
+                                                 ObjCIFSymbolKind SymType);
 
   /// Add ObjC IVar record.
   ///
@@ -76,8 +79,8 @@ class RecordsSlice {
   /// \param Name The name of ivar, not symbol.
   /// \param Linkage The linkage of symbol.
   /// \return The non-owning pointer to added record in slice.
-  ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container, StringRef Name,
-                              RecordLinkage Linkage);
+  LLVM_ABI ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container,
+                                       StringRef Name, RecordLinkage Linkage);
 
   /// Add ObjC Category record.
   ///
@@ -85,22 +88,22 @@ class RecordsSlice {
   /// category, not symbol.
   /// \param Category The name of category.
   /// \return The non-owning pointer to added record in slice.
-  ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend,
-                                      StringRef Category);
+  LLVM_ABI ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend,
+                                               StringRef Category);
 
   /// Find ObjC Class.
   ///
   /// \param Name name of class, not full symbol name.
   /// \return The non-owning pointer to record in slice.
-  ObjCInterfaceRecord *findObjCInterface(StringRef Name) const;
+  LLVM_ABI ObjCInterfaceRecord *findObjCInterface(StringRef Name) const;
 
   /// Find ObjC Category.
   ///
   /// \param ClassToExtend The name of class, not full symbol name.
   /// \param Category The name of category.
   /// \return The non-owning pointer to record in slice.
-  ObjCCategoryRecord *findObjCCategory(StringRef ClassToExtend,
-                                       StringRef Category) const;
+  LLVM_ABI ObjCCategoryRecord *findObjCCategory(StringRef ClassToExtend,
+                                                StringRef Category) const;
 
   /// Find ObjC Container. This is commonly used for assigning for looking up
   /// instance variables that are assigned to either a category or class.
@@ -110,21 +113,23 @@ class RecordsSlice {
   /// \param Name Either the name of ivar or name of container.
   /// \return The non-owning pointer to record in
   /// slice.
-  ObjCContainerRecord *findContainer(bool IsIVar, StringRef Name) const;
+  LLVM_ABI ObjCContainerRecord *findContainer(bool IsIVar,
+                                              StringRef Name) const;
 
   /// Find ObjC instance variable.
   ///
   /// \param IsScopedName This is used to determine how to parse the name.
   /// \param Name Either the full name of the symbol or just the ivar.
   /// \return The non-owning pointer to record in slice.
-  ObjCIVarRecord *findObjCIVar(bool IsScopedName, StringRef Name) const;
+  LLVM_ABI ObjCIVarRecord *findObjCIVar(bool IsScopedName,
+                                        StringRef Name) const;
 
   /// Find non-objc global.
   ///
   /// \param Name The name of symbol.
   /// \param GV The Kind of global to find.
   /// \return The non-owning pointer to record in slice.
-  GlobalRecord *
+  LLVM_ABI GlobalRecord *
   findGlobal(StringRef Name,
              GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown) const;
 
@@ -138,7 +143,7 @@ class RecordsSlice {
   }
 
   // Visit all records known to RecordsSlice.
-  void visit(RecordVisitor &V) const;
+  LLVM_ABI void visit(RecordVisitor &V) const;
 
   struct BinaryAttrs {
     std::vector<StringRef> AllowableClients;
@@ -158,11 +163,11 @@ class RecordsSlice {
   };
 
   /// Return reference to BinaryAttrs.
-  BinaryAttrs &getBinaryAttrs();
+  LLVM_ABI BinaryAttrs &getBinaryAttrs();
 
   /// Store any strings owned by RecordSlice into allocator and return back
   /// reference to that.
-  StringRef copyString(StringRef String);
+  LLVM_ABI StringRef copyString(StringRef String);
 
 private:
   const llvm::Triple TargetTriple;
@@ -196,7 +201,8 @@ class RecordsSlice {
 
 using Records = llvm::SmallVector<std::shared_ptr<RecordsSlice>, 4>;
 class InterfaceFile;
-std::unique_ptr<InterfaceFile> convertToInterfaceFile(const Records &Slices);
+LLVM_ABI std::unique_ptr<InterfaceFile>
+convertToInterfaceFile(const Records &Slices);
 
 } // namespace MachO
 } // namespace llvm
diff --git a/llvm/include/llvm/TextAPI/Symbol.h b/llvm/include/llvm/TextAPI/Symbol.h
index 5a5eb0eb48325..92ff0746f7995 100644
--- a/llvm/include/llvm/TextAPI/Symbol.h
+++ b/llvm/include/llvm/TextAPI/Symbol.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/Target.h"
@@ -152,14 +153,15 @@ class Symbol {
                             std::function<bool(const Target &)>>;
   using const_filtered_target_range =
       llvm::iterator_range<const_filtered_target_iterator>;
-  const_filtered_target_range targets(ArchitectureSet architectures) const;
+  LLVM_ABI const_filtered_target_range
+  targets(ArchitectureSet architectures) const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void dump(raw_ostream &OS) const;
   void dump() const { dump(llvm::errs()); }
 #endif
 
-  bool operator==(const Symbol &O) const;
+  LLVM_ABI bool operator==(const Symbol &O) const;
 
   bool operator!=(const Symbol &O) const { return !(*this == O); }
 
@@ -189,7 +191,7 @@ struct SimpleSymbol {
 /// Get symbol classification by parsing the name of a symbol.
 ///
 /// \param SymName The name of symbol.
-SimpleSymbol parseSymbol(StringRef SymName);
+LLVM_ABI SimpleSymbol parseSymbol(StringRef SymName);
 
 } // end namespace MachO.
 } // end namespace llvm.
diff --git a/llvm/include/llvm/TextAPI/SymbolSet.h b/llvm/include/llvm/TextAPI/SymbolSet.h
index 6ccabb9077208..cd3066317f3ae 100644
--- a/llvm/include/llvm/TextAPI/SymbolSet.h
+++ b/llvm/include/llvm/TextAPI/SymbolSet.h
@@ -15,6 +15,7 @@
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/Architecture.h"
 #include "llvm/TextAPI/ArchitectureSet.h"
 #include "llvm/TextAPI/Symbol.h"
@@ -87,12 +88,12 @@ class SymbolSet {
   using SymbolsMapType = llvm::DenseMap<SymbolsMapKey, Symbol *>;
   SymbolsMapType Symbols;
 
-  Symbol *addGlobalImpl(EncodeKind, StringRef Name, SymbolFlags Flags);
+  LLVM_ABI Symbol *addGlobalImpl(EncodeKind, StringRef Name, SymbolFlags Flags);
 
 public:
   SymbolSet() = default;
-  Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags,
-                    const Target &Targ);
+  LLVM_ABI Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags,
+                             const Target &Targ);
   size_t size() const { return Symbols.size(); }
 
   template <typename RangeT, typename ElT = std::remove_reference_t<
@@ -107,7 +108,7 @@ class SymbolSet {
     return Global;
   }
 
-  const Symbol *
+  LLVM_ABI const Symbol *
   findSymbol(EncodeKind Kind, StringRef Name,
              ObjCIFSymbolKind ObjCIF = ObjCIFSymbolKind::None) const;
 
@@ -169,7 +170,7 @@ class SymbolSet {
         fn);
   }
 
-  bool operator==(const SymbolSet &O) const;
+  LLVM_ABI bool operator==(const SymbolSet &O) const;
 
   bool operator!=(const SymbolSet &O) const { return !(Symbols == O.Symbols); }
 
diff --git a/llvm/include/llvm/TextAPI/Target.h b/llvm/include/llvm/TextAPI/Target.h
index edcc0708d1478..5cc507fe21f78 100644
--- a/llvm/include/llvm/TextAPI/Target.h
+++ b/llvm/include/llvm/TextAPI/Target.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_TEXTAPI_TARGET_H
 #define LLVM_TEXTAPI_TARGET_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/TargetParser/Triple.h"
@@ -35,9 +36,9 @@ class Target {
       : Arch(mapToArchitecture(Triple)), Platform(mapToPlatformType(Triple)),
         MinDeployment(mapToSupportedOSVersion(Triple)) {}
 
-  static llvm::Expected<Target> create(StringRef Target);
+  LLVM_ABI static llvm::Expected<Target> create(StringRef Target);
 
-  operator std::string() const;
+  LLVM_ABI operator std::string() const;
 
   Architecture Arch;
   PlatformType Platform;
@@ -66,13 +67,13 @@ inline bool operator!=(const Target &LHS, const Architecture &RHS) {
   return LHS.Arch != RHS;
 }
 
-PlatformVersionSet mapToPlatformVersionSet(ArrayRef<Target> Targets);
-PlatformSet mapToPlatformSet(ArrayRef<Target> Targets);
-ArchitectureSet mapToArchitectureSet(ArrayRef<Target> Targets);
+LLVM_ABI PlatformVersionSet mapToPlatformVersionSet(ArrayRef<Target> Targets);
+LLVM_ABI PlatformSet mapToPlatformSet(ArrayRef<Target> Targets);
+LLVM_ABI ArchitectureSet mapToArchitectureSet(ArrayRef<Target> Targets);
 
-std::string getTargetTripleName(const Target &Targ);
+LLVM_ABI std::string getTargetTripleName(const Target &Targ);
 
-raw_ostream &operator<<(raw_ostream &OS, const Target &Target);
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const Target &Target);
 
 } // namespace MachO
 } // namespace llvm
diff --git a/llvm/include/llvm/TextAPI/TextAPIError.h b/llvm/include/llvm/TextAPI/TextAPIError.h
index f0578654697b8..7b2182edd6210 100644
--- a/llvm/include/llvm/TextAPI/TextAPIError.h
+++ b/llvm/include/llvm/TextAPI/TextAPIError.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TEXTAPI_TEXTAPIERROR_H
 #define LLVM_TEXTAPI_TEXTAPIERROR_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm::MachO {
@@ -25,7 +26,7 @@ enum class TextAPIErrorCode {
   UnsupportedTarget
 };
 
-class TextAPIError : public llvm::ErrorInfo<TextAPIError> {
+class LLVM_ABI TextAPIError : public llvm::ErrorInfo<TextAPIError> {
 public:
   static char ID;
   TextAPIErrorCode EC;
diff --git a/llvm/include/llvm/TextAPI/TextAPIReader.h b/llvm/include/llvm/TextAPI/TextAPIReader.h
index 32af0e3601f18..603b24b47283d 100644
--- a/llvm/include/llvm/TextAPI/TextAPIReader.h
+++ b/llvm/include/llvm/TextAPI/TextAPIReader.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_TEXTAPI_TEXTAPIREADER_H
 #define LLVM_TEXTAPI_TEXTAPIREADER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -29,13 +30,13 @@ class TextAPIReader {
   ///
   /// \param InputBuffer Buffer holding contents of TAPI text file.
   /// \return The file format version of TAPI text file.
-  static Expected<FileType> canRead(MemoryBufferRef InputBuffer);
+  LLVM_ABI static Expected<FileType> canRead(MemoryBufferRef InputBuffer);
 
   /// Parse and get an InterfaceFile that represents the full
   /// library.
   ///
   /// \param InputBuffer Buffer holding contents of TAPI text file.
-  static Expected<std::unique_ptr<InterfaceFile>>
+  LLVM_ABI static Expected<std::unique_ptr<InterfaceFile>>
   get(MemoryBufferRef InputBuffer);
 
   TextAPIReader() = delete;
diff --git a/llvm/include/llvm/TextAPI/TextAPIWriter.h b/llvm/include/llvm/TextAPI/TextAPIWriter.h
index 7fd32c6fe2a9e..5f06c372fe852 100644
--- a/llvm/include/llvm/TextAPI/TextAPIWriter.h
+++ b/llvm/include/llvm/TextAPI/TextAPIWriter.h
@@ -10,6 +10,7 @@
 #define LLVM_TEXTAPI_TEXTAPIWRITER_H
 
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TextAPI/InterfaceFile.h"
 
 namespace llvm {
@@ -30,9 +31,10 @@ class TextAPIWriter {
   /// \param FileKind File format to write text file as. If not specified, it
   /// will read from File.
   /// \param Compact Whether to limit whitespace in text file.
-  static Error writeToStream(raw_ostream &OS, const InterfaceFile &File,
-                             const FileType FileKind = FileType::Invalid,
-                             bool Compact = false);
+  LLVM_ABI static Error
+  writeToStream(raw_ostream &OS, const InterfaceFile &File,
+                const FileType FileKind = FileType::Invalid,
+                bool Compact = false);
 
   /// Get TAPI FileType from the input string.
   ///
diff --git a/llvm/include/llvm/TextAPI/Utils.h b/llvm/include/llvm/TextAPI/Utils.h
index 00dfd63e14f91..27db717f5a63b 100644
--- a/llvm/include/llvm/TextAPI/Utils.h
+++ b/llvm/include/llvm/TextAPI/Utils.h
@@ -14,6 +14,7 @@
 #define LLVM_TEXTAPI_UTILS_H
 
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -51,34 +52,35 @@ struct SymLink {
 ///
 /// \param Path Location of file.
 /// \param Extension File extension to update with.
-void replace_extension(SmallVectorImpl<char> &Path, const Twine &Extension);
+LLVM_ABI void replace_extension(SmallVectorImpl<char> &Path,
+                                const Twine &Extension);
 
 /// Determine whether to skip over symlink due to either too many symlink levels
 /// or is cyclic.
 ///
 /// \param Path Location to symlink.
 /// \param Result Holds whether to skip over Path.
-std::error_code shouldSkipSymLink(const Twine &Path, bool &Result);
+LLVM_ABI std::error_code shouldSkipSymLink(const Twine &Path, bool &Result);
 
 /// Turn absolute symlink into relative.
 ///
 /// \param From The symlink.
 /// \param To What the symlink points to.
 /// \param RelativePath Path location to update what the symlink points to.
-std::error_code make_relative(StringRef From, StringRef To,
-                              SmallVectorImpl<char> &RelativePath);
+LLVM_ABI std::error_code make_relative(StringRef From, StringRef To,
+                                       SmallVectorImpl<char> &RelativePath);
 
 /// Determine if library is private by parsing file path.
 /// It does not touch the file system.
 ///
 /// \param Path File path for library.
 /// \param IsSymLink Whether path points to a symlink.
-bool isPrivateLibrary(StringRef Path, bool IsSymLink = false);
+LLVM_ABI bool isPrivateLibrary(StringRef Path, bool IsSymLink = false);
 
 /// Create a regex rule from provided glob string.
 /// \param Glob String that represents glob input.
 /// \return The equivalent regex rule.
-llvm::Expected<llvm::Regex> createRegexFromGlob(llvm::StringRef Glob);
+LLVM_ABI llvm::Expected<llvm::Regex> createRegexFromGlob(llvm::StringRef Glob);
 
 using AliasEntry = std::pair<std::string, EncodeKind>;
 using AliasMap = std::map<AliasEntry, AliasEntry>;
@@ -87,14 +89,15 @@ using AliasMap = std::map<AliasEntry, AliasEntry>;
 ///
 /// \param Buffer Data contents of file for the alias list.
 /// \return Lookup table of alias to their base symbol.
-Expected<AliasMap> parseAliasList(std::unique_ptr<llvm::MemoryBuffer> &Buffer);
+LLVM_ABI Expected<AliasMap>
+parseAliasList(std::unique_ptr<llvm::MemoryBuffer> &Buffer);
 
 /// Pickup active paths for a given platform.
 ///
 /// \param Paths File or search paths to pick up.
 /// \param Platform Platform to collect paths for.
-PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths,
-                            PlatformType Platform);
+LLVM_ABI PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths,
+                                     PlatformType Platform);
 
 } // namespace llvm::MachO
 #endif // LLVM_TEXTAPI_UTILS_H

From 78765bb856bd6cdc3b1db48e80f74b8de5181f3f Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Wed, 11 Jun 2025 17:23:04 +0100
Subject: [PATCH 101/851] [TableGen] Simplify computeUberWeights. NFC.
 (#143716)

Using RegUnitIterator made the code more complicated than having two
nested loops over each register and each register's regunits.
---
 .../TableGen/Common/CodeGenRegisters.cpp      | 29 ++++++++-----------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index 5ec9b35379fa4..4d24eb3de1ed9 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -1849,26 +1849,21 @@ static void computeUberWeights(MutableArrayRef<UberRegSet> UberSets,
   // Skip the first unallocatable set.
   for (UberRegSet &S : UberSets.drop_front()) {
     // Initialize all unit weights in this set, and remember the max units/reg.
-    const CodeGenRegister *Reg = nullptr;
-    unsigned MaxWeight = 0, Weight = 0;
-    for (RegUnitIterator UnitI(S.Regs); UnitI.isValid(); ++UnitI) {
-      if (Reg != UnitI.getReg()) {
-        if (Weight > MaxWeight)
-          MaxWeight = Weight;
-        Reg = UnitI.getReg();
-        Weight = 0;
-      }
-      if (!RegBank.getRegUnit(*UnitI).Artificial) {
-        unsigned UWeight = RegBank.getRegUnit(*UnitI).Weight;
-        if (!UWeight) {
-          UWeight = 1;
-          RegBank.increaseRegUnitWeight(*UnitI, UWeight);
+    unsigned MaxWeight = 0;
+    for (const CodeGenRegister *R : S.Regs) {
+      unsigned Weight = 0;
+      for (unsigned U : R->getRegUnits()) {
+        if (!RegBank.getRegUnit(U).Artificial) {
+          unsigned UWeight = RegBank.getRegUnit(U).Weight;
+          if (!UWeight) {
+            UWeight = 1;
+            RegBank.increaseRegUnitWeight(U, UWeight);
+          }
+          Weight += UWeight;
         }
-        Weight += UWeight;
       }
+      MaxWeight = std::max(MaxWeight, Weight);
     }
-    if (Weight > MaxWeight)
-      MaxWeight = Weight;
     if (S.Weight != MaxWeight) {
       LLVM_DEBUG({
         dbgs() << "UberSet " << &S - UberSets.begin() << " Weight "

From 8e4f0d8614dcd48cfe2d885a021e2927c1bc8616 Mon Sep 17 00:00:00 2001
From: Morris Hafner <mmha@users.noreply.github.com>
Date: Wed, 11 Jun 2025 18:24:46 +0200
Subject: [PATCH 102/851] [CIR] Upstream minimal builtin function call support
 (#142981)

This patch adds all bits required to implement builtin function calls to
ClangIR. It doesn't actually implement any of the builtins except those
that fold to a constant ahead of CodeGen
(`__builtin_is_constant_evaluated()` being one example).
---
 clang/include/clang/CIR/MissingFeatures.h |  3 +-
 clang/lib/CIR/CodeGen/CIRGenBuilder.cpp   | 28 ++++++++
 clang/lib/CIR/CodeGen/CIRGenBuilder.h     | 11 ++++
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp   | 55 ++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenCall.h        | 30 ++++++++-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp      | 53 +++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenFunction.h    |  5 ++
 clang/lib/CIR/CodeGen/CMakeLists.txt      |  1 +
 clang/test/CIR/CodeGen/builtin_call.cpp   | 78 +++++++++++++++++++++++
 9 files changed, 255 insertions(+), 9 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
 create mode 100644 clang/test/CIR/CodeGen/builtin_call.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index f89d386378e51..87908e2ec08ac 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -83,7 +83,6 @@ struct MissingFeatures {
   static bool opFuncSetComdat() { return false; }
 
   // CallOp handling
-  static bool opCallBuiltinFunc() { return false; }
   static bool opCallPseudoDtor() { return false; }
   static bool opCallAggregateArgs() { return false; }
   static bool opCallPaddingArgs() { return false; }
@@ -225,6 +224,8 @@ struct MissingFeatures {
   static bool isMemcpyEquivalentSpecialMember() { return false; }
   static bool isTrivialCtorOrDtor() { return false; }
   static bool implicitConstructorArgs() { return false; }
+  static bool intrinsics() { return false; }
+  static bool attributeNoBuiltin() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
index 4c8c6ed289c3b..9cec17bcb2fd0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp
@@ -39,6 +39,34 @@ mlir::Value CIRGenBuilderTy::getArrayElement(mlir::Location arrayLocBegin,
   return create<cir::PtrStrideOp>(arrayLocEnd, flatPtrTy, basePtr, idx);
 }
 
+cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc,
+                                             llvm::APSInt intVal) {
+  bool isSigned = intVal.isSigned();
+  unsigned width = intVal.getBitWidth();
+  cir::IntType t = isSigned ? getSIntNTy(width) : getUIntNTy(width);
+  return getConstInt(loc, t,
+                     isSigned ? intVal.getSExtValue() : intVal.getZExtValue());
+}
+
+cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc,
+                                             llvm::APInt intVal) {
+  return getConstInt(loc, llvm::APSInt(intVal));
+}
+
+cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, mlir::Type t,
+                                             uint64_t c) {
+  assert(mlir::isa<cir::IntType>(t) && "expected cir::IntType");
+  return create<cir::ConstantOp>(loc, cir::IntAttr::get(t, c));
+}
+
+cir::ConstantOp
+clang::CIRGen::CIRGenBuilderTy::getConstFP(mlir::Location loc, mlir::Type t,
+                                           llvm::APFloat fpVal) {
+  assert(mlir::isa<cir::CIRFPTypeInterface>(t) &&
+         "expected floating point type");
+  return create<cir::ConstantOp>(loc, getAttr<cir::FPAttr>(t, fpVal));
+}
+
 // This can't be defined in Address.h because that file is included by
 // CIRGenBuilder.h
 Address Address::withElementType(CIRGenBuilderTy &builder,
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 03077ee062a65..fb1a290c18fa2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -11,10 +11,12 @@
 
 #include "Address.h"
 #include "CIRGenTypeCache.h"
+#include "clang/CIR/Interfaces/CIRFPTypeInterface.h"
 #include "clang/CIR/MissingFeatures.h"
 
 #include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h"
 #include "clang/CIR/MissingFeatures.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/STLExtras.h"
 
 namespace clang::CIRGen {
@@ -229,6 +231,15 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
   cir::IntType getUInt32Ty() { return typeCache.UInt32Ty; }
   cir::IntType getUInt64Ty() { return typeCache.UInt64Ty; }
 
+  cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal);
+
+  cir::ConstantOp getConstInt(mlir::Location loc, llvm::APInt intVal);
+
+  cir::ConstantOp getConstInt(mlir::Location loc, mlir::Type t, uint64_t c);
+
+  cir::ConstantOp getConstFP(mlir::Location loc, mlir::Type t,
+                             llvm::APFloat fpVal);
+
   bool isInt8Ty(mlir::Type i) {
     return i == typeCache.UInt8Ty || i == typeCache.SInt8Ty;
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
new file mode 100644
index 0000000000000..c59ac78210f81
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -0,0 +1,55 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This contains code to emit Builtin calls as CIR or a function call to be
+// later resolved.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenCall.h"
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+#include "CIRGenValue.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Support/LLVM.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/GlobalDecl.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
+                                       const CallExpr *e,
+                                       ReturnValueSlot returnValue) {
+  // See if we can constant fold this builtin.  If so, don't emit it at all.
+  // TODO: Extend this handling to all builtin calls that we can constant-fold.
+  Expr::EvalResult result;
+  if (e->isPRValue() && e->EvaluateAsRValue(result, cgm.getASTContext()) &&
+      !result.hasSideEffects()) {
+    if (result.Val.isInt()) {
+      return RValue::get(builder.getConstInt(getLoc(e->getSourceRange()),
+                                             result.Val.getInt()));
+    }
+    if (result.Val.isFloat()) {
+      // Note: we are using result type of CallExpr to determine the type of
+      // the constant. Classic codegen uses the result value to determine the
+      // type. We feel it should be Ok to use expression type because it is
+      // hard to imagine a builtin function evaluates to a value that
+      // over/underflows its own defined type.
+      mlir::Type type = convertType(e->getType());
+      return RValue::get(builder.getConstFP(getLoc(e->getExprLoc()), type,
+                                            result.Val.getFloat()));
+    }
+  }
+
+  mlir::Location loc = getLoc(e->getExprLoc());
+  cgm.errorNYI(loc, "non constant foldable builtin calls");
+  return getUndefRValue(e->getType());
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.h b/clang/lib/CIR/CodeGen/CIRGenCall.h
index 605625705a75c..15c9080448c8b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.h
@@ -44,16 +44,25 @@ class CIRGenCalleeInfo {
 class CIRGenCallee {
   enum class SpecialKind : uintptr_t {
     Invalid,
+    Builtin,
 
-    Last = Invalid,
+    Last = Builtin,
+  };
+
+  struct BuiltinInfoStorage {
+    const clang::FunctionDecl *decl;
+    unsigned id;
   };
 
   SpecialKind kindOrFunctionPtr;
 
   union {
     CIRGenCalleeInfo abstractInfo;
+    BuiltinInfoStorage builtinInfo;
   };
 
+  explicit CIRGenCallee(SpecialKind kind) : kindOrFunctionPtr(kind) {}
+
 public:
   CIRGenCallee() : kindOrFunctionPtr(SpecialKind::Invalid) {}
 
@@ -69,6 +78,25 @@ class CIRGenCallee {
     return CIRGenCallee(abstractInfo, funcPtr);
   }
 
+  bool isBuiltin() const { return kindOrFunctionPtr == SpecialKind::Builtin; }
+
+  const clang::FunctionDecl *getBuiltinDecl() const {
+    assert(isBuiltin());
+    return builtinInfo.decl;
+  }
+  unsigned getBuiltinID() const {
+    assert(isBuiltin());
+    return builtinInfo.id;
+  }
+
+  static CIRGenCallee forBuiltin(unsigned builtinID,
+                                 const clang::FunctionDecl *builtinDecl) {
+    CIRGenCallee result(SpecialKind::Builtin);
+    result.builtinInfo.decl = builtinDecl;
+    result.builtinInfo.id = builtinID;
+    return result;
+  }
+
   bool isOrdinary() const {
     return uintptr_t(kindOrFunctionPtr) > uintptr_t(SpecialKind::Last);
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index f2c2de7a4f59d..f1f86509c9a9b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1029,8 +1029,48 @@ static cir::FuncOp emitFunctionDeclPointer(CIRGenModule &cgm, GlobalDecl gd) {
   return cgm.getAddrOfFunction(gd);
 }
 
-static CIRGenCallee emitDirectCallee(CIRGenModule &cgm, GlobalDecl gd) {
-  assert(!cir::MissingFeatures::opCallBuiltinFunc());
+// Detect the unusual situation where an inline version is shadowed by a
+// non-inline version. In that case we should pick the external one
+// everywhere. That's GCC behavior too.
+static bool onlyHasInlineBuiltinDeclaration(const FunctionDecl *fd) {
+  for (const FunctionDecl *pd = fd; pd; pd = pd->getPreviousDecl())
+    if (!pd->isInlineBuiltinDeclaration())
+      return false;
+  return true;
+}
+
+CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) {
+  const auto *fd = cast<FunctionDecl>(gd.getDecl());
+
+  if (unsigned builtinID = fd->getBuiltinID()) {
+    if (fd->getAttr<AsmLabelAttr>()) {
+      cgm.errorNYI("AsmLabelAttr");
+    }
+
+    StringRef ident = fd->getName();
+    std::string fdInlineName = (ident + ".inline").str();
+
+    bool isPredefinedLibFunction =
+        cgm.getASTContext().BuiltinInfo.isPredefinedLibFunction(builtinID);
+    bool hasAttributeNoBuiltin = false;
+    assert(!cir::MissingFeatures::attributeNoBuiltin());
+
+    // When directing calling an inline builtin, call it through it's mangled
+    // name to make it clear it's not the actual builtin.
+    auto fn = cast<cir::FuncOp>(curFn);
+    if (fn.getName() != fdInlineName && onlyHasInlineBuiltinDeclaration(fd)) {
+      cgm.errorNYI("Inline only builtin function calls");
+    }
+
+    // Replaceable builtins provide their own implementation of a builtin. If we
+    // are in an inline builtin implementation, avoid trivial infinite
+    // recursion. Honor __attribute__((no_builtin("foo"))) or
+    // __attribute__((no_builtin)) on the current function unless foo is
+    // not a predefined library function which means we must generate the
+    // builtin no matter what.
+    else if (!isPredefinedLibFunction || !hasAttributeNoBuiltin)
+      return CIRGenCallee::forBuiltin(builtinID, fd);
+  }
 
   cir::FuncOp callee = emitFunctionDeclPointer(cgm, gd);
 
@@ -1106,7 +1146,7 @@ CIRGenCallee CIRGenFunction::emitCallee(const clang::Expr *e) {
   } else if (const auto *declRef = dyn_cast<DeclRefExpr>(e)) {
     // Resolve direct calls.
     const auto *funcDecl = cast<FunctionDecl>(declRef->getDecl());
-    return emitDirectCallee(cgm, funcDecl);
+    return emitDirectCallee(funcDecl);
   } else if (isa<MemberExpr>(e)) {
     cgm.errorNYI(e->getSourceRange(),
                  "emitCallee: call to member function is NYI");
@@ -1162,10 +1202,9 @@ RValue CIRGenFunction::emitCallExpr(const clang::CallExpr *e,
 
   CIRGenCallee callee = emitCallee(e->getCallee());
 
-  if (e->getBuiltinCallee()) {
-    cgm.errorNYI(e->getSourceRange(), "call to builtin functions");
-  }
-  assert(!cir::MissingFeatures::opCallBuiltinFunc());
+  if (callee.isBuiltin())
+    return emitBuiltinExpr(callee.getBuiltinDecl(), callee.getBuiltinID(), e,
+                           returnValue);
 
   if (isa<CXXPseudoDestructorExpr>(e->getCallee())) {
     cgm.errorNYI(e->getSourceRange(), "call to pseudo destructor");
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 7db7f6928fd8f..b08dd540e6289 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -665,6 +665,8 @@ class CIRGenFunction : public CIRGenTypeCache {
   void emitAndUpdateRetAlloca(clang::QualType type, mlir::Location loc,
                               clang::CharUnits alignment);
 
+  CIRGenCallee emitDirectCallee(const GlobalDecl &gd);
+
 public:
   Address emitAddrOfFieldStorage(Address base, const FieldDecl *field,
                                  llvm::StringRef fieldName,
@@ -711,6 +713,9 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   mlir::LogicalResult emitBreakStmt(const clang::BreakStmt &s);
 
+  RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID,
+                         const clang::CallExpr *e, ReturnValueSlot returnValue);
+
   RValue emitCall(const CIRGenFunctionInfo &funcInfo,
                   const CIRGenCallee &callee, ReturnValueSlot returnValue,
                   const CallArgList &args, cir::CIRCallOpInterface *callOp,
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 8bfcd2773d07a..beaa9afb31f93 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -13,6 +13,7 @@ add_clang_library(clangCIR
   CIRGenClass.cpp
   CIRGenCXXABI.cpp
   CIRGenCXXExpr.cpp
+  CIRGenBuiltin.cpp
   CIRGenDecl.cpp
   CIRGenDeclOpenACC.cpp
   CIRGenExpr.cpp
diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp
new file mode 100644
index 0000000000000..2706ea7f8f857
--- /dev/null
+++ b/clang/test/CIR/CodeGen/builtin_call.cpp
@@ -0,0 +1,78 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+constexpr extern int cx_var = __builtin_is_constant_evaluated();
+
+// CIR: cir.global {{.*}} @cx_var = #cir.int<1> : !s32i
+// LLVM: @cx_var = {{.*}} i32 1
+// OGCG: @cx_var = {{.*}} i32 1
+
+constexpr extern float cx_var_single = __builtin_huge_valf();
+
+// CIR: cir.global {{.*}} @cx_var_single = #cir.fp<0x7F800000> : !cir.float
+// LLVM: @cx_var_single = {{.*}} float 0x7FF0000000000000
+// OGCG: @cx_var_single = {{.*}} float 0x7FF0000000000000
+
+constexpr extern long double cx_var_ld = __builtin_huge_vall();
+
+// CIR: cir.global {{.*}} @cx_var_ld = #cir.fp<0x7FFF8000000000000000> : !cir.long_double<!cir.f80>
+// LLVM: @cx_var_ld = {{.*}} x86_fp80 0xK7FFF8000000000000000
+// OGCG: @cx_var_ld = {{.*}} x86_fp80 0xK7FFF8000000000000000
+
+int is_constant_evaluated() {
+  return __builtin_is_constant_evaluated();
+}
+
+// CIR: cir.func @_Z21is_constant_evaluatedv() -> !s32i
+// CIR: %[[ZERO:.+]] = cir.const #cir.int<0>
+
+// LLVM: define {{.*}}i32 @_Z21is_constant_evaluatedv()
+// LLVM: %[[MEM:.+]] = alloca i32
+// LLVM: store i32 0, ptr %[[MEM]]
+// LLVM: %[[RETVAL:.+]] = load i32, ptr %[[MEM]]
+// LLVM: ret i32 %[[RETVAL]]
+// LLVM: }
+
+// OGCG: define {{.*}}i32 @_Z21is_constant_evaluatedv()
+// OGCG: ret i32 0
+// OGCG: }
+
+long double constant_fp_builtin_ld() {
+  return __builtin_fabsl(-0.1L);
+}
+
+// CIR: cir.func @_Z22constant_fp_builtin_ldv() -> !cir.long_double<!cir.f80>
+// CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.long_double<!cir.f80>
+
+// LLVM: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv()
+// LLVM: %[[MEM:.+]] = alloca x86_fp80
+// LLVM: store x86_fp80 0xK3FFBCCCCCCCCCCCCCCCD, ptr %[[MEM]]
+// LLVM: %[[RETVAL:.+]] = load x86_fp80, ptr %[[MEM]]
+// LLVM: ret x86_fp80 %[[RETVAL]]
+// LLVM: }
+
+// OGCG: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv()
+// OGCG: ret x86_fp80 0xK3FFBCCCCCCCCCCCCCCCD
+// OGCG: }
+
+float constant_fp_builtin_single() {
+  return __builtin_fabsf(-0.1f);
+}
+
+// CIR: cir.func @_Z26constant_fp_builtin_singlev() -> !cir.float
+// CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.float
+
+// LLVM: define {{.*}}float @_Z26constant_fp_builtin_singlev()
+// LLVM: %[[MEM:.+]] = alloca float
+// LLVM: store float 0x3FB99999A0000000, ptr %[[MEM]]
+// LLVM: %[[RETVAL:.+]] = load float, ptr %[[MEM]]
+// LLVM: ret float %[[RETVAL]]
+// LLVM: }
+
+// OGCG: define {{.*}}float @_Z26constant_fp_builtin_singlev()
+// OGCG: ret float 0x3FB99999A0000000
+// OGCG: }

From ec8d68b59f82423e5a6bf452e33ee8c5f64b0edc Mon Sep 17 00:00:00 2001
From: vabridgers <58314289+vabridgers@users.noreply.github.com>
Date: Wed, 11 Jun 2025 11:25:24 -0500
Subject: [PATCH 103/851] [clang][analyzer] Correct SMT Layer for _BitInt cases
 refutations (#143310)

Since _BitInt was added later, ASTContext did not comprehend getting a
type by bitwidth that's not a power of 2, and the SMT layer also did not
comprehend this. This led to unexpected crashes using Z3 refutation
during randomized testing. The assertion and redacted and summarized
crash stack is shown here.

clang:
../../clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h:103:
static llvm::SMTExprRef
clang::ento::SMTConv::fromBinOp(llvm::SMTSolverRef &,
const llvm::SMTExprRef &, const BinaryOperator::Opcode, const
llvm::SMTExprRef &, bool):
Assertion `*Solver->getSort(LHS) == *Solver->getSort(RHS) && "AST's must
have the same sort!"' failed.
 ...
<address>
clang::ento::SMTConv::fromBinOp(std::shared_ptr<llvm::SMTSolver>&,
llvm::SMTExpr const* const&, clang::BinaryOperatorKind, llvm::SMTExpr
const* const&,
     bool) SMTConstraintManager.cpp
     clang::ASTContext&, llvm::SMTExpr const* const&, clang::QualType,
clang::BinaryOperatorKind, llvm::SMTExpr const* const&, clang::QualType,
     clang::QualType*) SMTConstraintManager.cpp
clang::ASTContext&, clang::ento::SymExpr const*, llvm::APSInt const&,
     llvm::APSInt const&, bool) SMTConstraintManager.cpp
clang::ento::ExplodedNode const*, clang::ento::PathSensitiveBugReport&)

---------

Co-authored-by: Vince Bridgers <vince.a.bridgers@ericsson.com>
---
 .../Core/PathSensitive/SMTConv.h              | 28 ++++++++++++++-----
 clang/test/Analysis/bitint-z3.c               | 22 +++++++++++++++
 2 files changed, 43 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/Analysis/bitint-z3.c

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
index 580b49a38dc72..70a7953918ace 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h
@@ -18,6 +18,8 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h"
 #include "llvm/Support/SMTAPI.h"
 
+#include <algorithm>
+
 namespace clang {
 namespace ento {
 
@@ -570,23 +572,35 @@ class SMTConv {
   // TODO: Refactor to put elsewhere
   static inline QualType getAPSIntType(ASTContext &Ctx,
                                        const llvm::APSInt &Int) {
-    return Ctx.getIntTypeForBitwidth(Int.getBitWidth(), Int.isSigned());
+    const QualType Ty =
+        Ctx.getIntTypeForBitwidth(Int.getBitWidth(), Int.isSigned());
+    if (!Ty.isNull())
+      return Ty;
+    // If Ty is Null, could be because the original type was a _BitInt.
+    // Get the size of the _BitInt type (expressed in bits) and round it up to
+    // the next power of 2 that is at least the bit size of 'char' (usually 8).
+    unsigned CharTypeSize = Ctx.getTypeSize(Ctx.CharTy);
+    unsigned Pow2DestWidth =
+        std::max(llvm::bit_ceil(Int.getBitWidth()), CharTypeSize);
+    return Ctx.getIntTypeForBitwidth(Pow2DestWidth, Int.isSigned());
   }
 
   // Get the QualTy for the input APSInt, and fix it if it has a bitwidth of 1.
   static inline std::pair<llvm::APSInt, QualType>
   fixAPSInt(ASTContext &Ctx, const llvm::APSInt &Int) {
     llvm::APSInt NewInt;
+    unsigned APSIntBitwidth = Int.getBitWidth();
+    QualType Ty = getAPSIntType(Ctx, Int);
 
     // FIXME: This should be a cast from a 1-bit integer type to a boolean type,
     // but the former is not available in Clang. Instead, extend the APSInt
     // directly.
-    if (Int.getBitWidth() == 1 && getAPSIntType(Ctx, Int).isNull()) {
-      NewInt = Int.extend(Ctx.getTypeSize(Ctx.BoolTy));
-    } else
-      NewInt = Int;
-
-    return std::make_pair(NewInt, getAPSIntType(Ctx, NewInt));
+    if (APSIntBitwidth == 1 && Ty.isNull())
+      return {Int.extend(Ctx.getTypeSize(Ctx.BoolTy)),
+              getAPSIntType(Ctx, NewInt)};
+    if (llvm::isPowerOf2_32(APSIntBitwidth) || Ty.isNull())
+      return {Int, Ty};
+    return {Int.extend(Ctx.getTypeSize(Ty)), Ty};
   }
 
   // Perform implicit type conversion on binary symbolic expressions.
diff --git a/clang/test/Analysis/bitint-z3.c b/clang/test/Analysis/bitint-z3.c
new file mode 100644
index 0000000000000..4cb97f9de8299
--- /dev/null
+++ b/clang/test/Analysis/bitint-z3.c
@@ -0,0 +1,22 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -w \
+// RUN:   -analyzer-config crosscheck-with-z3=true -verify %s
+// REQUIRES: z3
+
+// Previously these tests were crashing because the SMTConv layer did not
+// comprehend the _BitInt types.
+
+void clang_analyzer_warnIfReached();
+
+void c(int b, _BitInt(35) a) {
+  int d = 0;
+  if (a)
+    b = d;
+  clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void f(int *d, _BitInt(3) e) {
+  int g;
+  d = &g;
+  e ?: 0;
+  clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}

From fe7bf4b90b1a835418bddd2b2aa63b4977a9f6d2 Mon Sep 17 00:00:00 2001
From: Rolf Morel <854835+rolfmorel@users.noreply.github.com>
Date: Wed, 11 Jun 2025 17:33:55 +0100
Subject: [PATCH 104/851] [MLIR][Transform] apply_registered_pass op's options
 as a dict (#143159)

Improve ApplyRegisteredPassOp's support for taking options by taking
them as a dict (vs a list of string-valued key-value pairs).

Values of options are provided as either static attributes or as params
(which pass in attributes at interpreter runtime). In either case, the
keys and value attributes are converted to strings and a single
options-string, in the format used on the commandline, is constructed to
pass to the `addToPipeline`-pass API.
---
 .../mlir/Dialect/Transform/IR/CMakeLists.txt  |   4 +
 .../Dialect/Transform/IR/TransformAttrs.h     |   3 +
 .../Dialect/Transform/IR/TransformAttrs.td    |  19 ++
 .../Dialect/Transform/IR/TransformDialect.td  |   1 +
 .../mlir/Dialect/Transform/IR/TransformOps.td |  23 +-
 .../Dialect/Transform/IR/TransformDialect.cpp |   9 +
 .../lib/Dialect/Transform/IR/TransformOps.cpp | 223 +++++++++++-------
 .../mlir/dialects/transform/__init__.py       |  82 ++++++-
 .../Transform/test-pass-application.mlir      | 169 +++++++++++--
 mlir/test/python/dialects/transform.py        |  52 ++++
 10 files changed, 469 insertions(+), 116 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
index df5af7ae710da..9acab9228f100 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
@@ -20,6 +20,10 @@ mlir_tablegen(TransformDialectEnums.h.inc -gen-enum-decls)
 mlir_tablegen(TransformDialectEnums.cpp.inc -gen-enum-defs)
 add_public_tablegen_target(MLIRTransformDialectEnumIncGen)
 add_dependencies(mlir-headers MLIRTransformDialectEnumIncGen)
+mlir_tablegen(TransformAttrs.h.inc -gen-attrdef-decls)
+mlir_tablegen(TransformAttrs.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(MLIRTransformDialectAttributesIncGen)
+add_dependencies(mlir-headers MLIRTransformDialectAttributesIncGen)
 
 add_mlir_dialect(TransformOps transform)
 add_mlir_doc(TransformOps TransformOps Dialects/ -gen-op-doc -dialect=transform)
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h
index 3cb935003b4c4..379af932ca484 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h
@@ -17,4 +17,7 @@
 
 #include "mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc"
 
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/Transform/IR/TransformAttrs.h.inc"
+
 #endif // MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS_H
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td
index ebad2994880e7..e67a9444c24a8 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td
@@ -10,6 +10,14 @@
 #define MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS
 
 include "mlir/IR/EnumAttr.td"
+include "mlir/Dialect/Transform/IR/TransformDialect.td"
+
+class Transform_Attr<string name, string attrMnemonic,
+                     list<Trait> traits = [],
+                     string baseCppClass = "::mlir::Attribute">
+    : AttrDef<Transform_Dialect, name, traits, baseCppClass> {
+  let mnemonic = attrMnemonic;
+}
 
 def PropagateFailuresCase : I32EnumAttrCase<"Propagate", 1, "propagate">;
 def SuppressFailuresCase : I32EnumAttrCase<"Suppress", 2, "suppress">;
@@ -33,4 +41,15 @@ def MatchCmpIPredicateAttr : I32EnumAttr<
   let cppNamespace = "::mlir::transform";
 }
 
+def ParamOperandAttr : Transform_Attr<"ParamOperand", "param_operand"> {
+  let description = [{
+    Used to refer to a specific param-operand (via its index) from within an
+    attribute on a transform operation.
+  }];
+  let parameters = (ins
+    "IntegerAttr":$index
+  );
+  let assemblyFormat = "`<` `index` `=` $index `>`";
+}
+
 #endif  // MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
index d03049e186f94..c7ea5ade72ace 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
@@ -19,6 +19,7 @@ def Transform_Dialect : Dialect {
   let cppNamespace = "::mlir::transform";
 
   let hasOperationAttrVerify = 1;
+  let useDefaultAttributePrinterParser = 1;
   let extraClassDeclaration = [{
     /// Symbol name for the default entry point "named sequence".
     constexpr const static ::llvm::StringLiteral
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index e864a65f8ceac..f75ba27e58e76 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -405,10 +405,23 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass",
   let description = [{
     This transform applies the specified pass or pass pipeline to the targeted
     ops. The name of the pass/pipeline is specified as a string attribute, as
-    set during pass/pipeline registration. Optionally, pass options may be
-    specified as (space-separated) string attributes with the option to pass
-    these attributes via params. The pass options syntax is identical to the one
-    used with "mlir-opt".
+    set during pass/pipeline registration.
+
+    Optionally, pass options may be specified via a DictionaryAttr. This
+    dictionary is converted to a string -- formatted `key=value ...` -- which
+    is expected to be in the exact format used by the pass on the commandline.
+    Values are either attributes or (SSA-values of) Transform Dialect params.
+    For example:
+
+    ```mlir
+    transform.apply_registered_pass "canonicalize"
+        with options = { "top-down" = false,
+                         "max-iterations" = %max_iter,
+                         "test-convergence" = true,
+                         "max-num-rewrites" =  %max_rewrites }
+        to %module
+    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    ```
 
     This op first looks for a pass pipeline with the specified name. If no such
     pipeline exists, it looks for a pass with the specified name. If no such
@@ -422,7 +435,7 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass",
   }];
 
   let arguments = (ins StrAttr:$pass_name,
-                       DefaultValuedAttr<ArrayAttr, "{}">:$options,
+                       DefaultValuedAttr<DictionaryAttr, "{}">:$options,
                        Variadic<TransformParamTypeInterface>:$dynamic_options,
                        TransformHandleTypeInterface:$target);
   let results = (outs TransformHandleTypeInterface:$result);
diff --git a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
index 497ceb19f1a21..4a95fe7459e8c 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
@@ -8,17 +8,22 @@
 
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Analysis/CallGraph.h"
+#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
 #include "mlir/Dialect/Transform/IR/TransformOps.h"
 #include "mlir/Dialect/Transform/IR/TransformTypes.h"
 #include "mlir/Dialect/Transform/IR/Utils.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 using namespace mlir;
 
 #include "mlir/Dialect/Transform/IR/TransformDialect.cpp.inc"
 
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc"
+
 #ifndef NDEBUG
 void transform::detail::checkImplementsTransformOpInterface(
     StringRef name, MLIRContext *context) {
@@ -66,6 +71,10 @@ void transform::TransformDialect::initialize() {
 #include "mlir/Dialect/Transform/IR/TransformOps.cpp.inc"
       >();
   initializeTypes();
+  addAttributes<
+#define GET_ATTRDEF_LIST
+#include "mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc"
+      >();
   initializeLibraryModule();
 }
 
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index a0f9518e3d12f..582d082153bef 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -54,10 +54,11 @@
 using namespace mlir;
 
 static ParseResult parseApplyRegisteredPassOptions(
-    OpAsmParser &parser, ArrayAttr &options,
+    OpAsmParser &parser, DictionaryAttr &options,
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dynamicOptions);
 static void printApplyRegisteredPassOptions(OpAsmPrinter &printer,
-                                            Operation *op, ArrayAttr options,
+                                            Operation *op,
+                                            DictionaryAttr options,
                                             ValueRange dynamicOptions);
 static ParseResult parseSequenceOpOperands(
     OpAsmParser &parser, std::optional<OpAsmParser::UnresolvedOperand> &root,
@@ -784,41 +785,50 @@ DiagnosedSilenceableFailure
 transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter,
                                         transform::TransformResults &results,
                                         transform::TransformState &state) {
-  // Obtain a single options-string from options passed statically as
-  // string attributes as well as "dynamically" through params.
+  // Obtain a single options-string to pass to the pass(-pipeline) from options
+  // passed in as a dictionary of keys mapping to values which are either
+  // attributes or param-operands pointing to attributes.
+
   std::string options;
+  llvm::raw_string_ostream optionsStream(options); // For "printing" attrs.
+
   OperandRange dynamicOptions = getDynamicOptions();
-  size_t dynamicOptionsIdx = 0;
-  for (auto [idx, optionAttr] : llvm::enumerate(getOptions())) {
+  for (auto [idx, namedAttribute] : llvm::enumerate(getOptions())) {
     if (idx > 0)
-      options += " "; // Interleave options seperator.
-
-    if (auto strAttr = dyn_cast<StringAttr>(optionAttr)) {
-      options += strAttr.getValue();
-    } else if (isa<UnitAttr>(optionAttr)) {
-      assert(dynamicOptionsIdx < dynamicOptions.size() &&
+      optionsStream << " "; // Interleave options separator.
+    optionsStream << namedAttribute.getName().str(); // Append the key.
+    optionsStream << "="; // And the key-value separator.
+
+    Attribute valueAttrToAppend;
+    if (auto paramOperandIndex =
+            dyn_cast<transform::ParamOperandAttr>(namedAttribute.getValue())) {
+      // The corresponding value attribute is passed in via a param.
+      // Obtain the param-operand via its specified index.
+      size_t dynamicOptionIdx = paramOperandIndex.getIndex().getInt();
+      assert(dynamicOptionIdx < dynamicOptions.size() &&
              "number of dynamic option markers (UnitAttr) in options ArrayAttr "
              "should be the same as the number of options passed as params");
       ArrayRef<Attribute> dynamicOption =
-          state.getParams(dynamicOptions[dynamicOptionsIdx++]);
+          state.getParams(dynamicOptions[dynamicOptionIdx]);
       if (dynamicOption.size() != 1)
-        return emitSilenceableError() << "options passed as a param must have "
-                                         "a single value associated, param "
-                                      << dynamicOptionsIdx - 1 << " associates "
-                                      << dynamicOption.size();
-
-      if (auto dynamicOptionStr = dyn_cast<StringAttr>(dynamicOption[0])) {
-        options += dynamicOptionStr.getValue();
-      } else {
         return emitSilenceableError()
-               << "options passed as a param must be a string, got "
-               << dynamicOption[0];
-      }
+               << "options passed as a param must have "
+                  "a single value associated, param "
+               << dynamicOptionIdx << " associates " << dynamicOption.size();
+      valueAttrToAppend = dynamicOption[0];
+    } else {
+      // Value is a static attribute.
+      valueAttrToAppend = namedAttribute.getValue();
+    }
+
+    // Append string representation of value attribute.
+    if (auto strAttr = dyn_cast<StringAttr>(valueAttrToAppend)) {
+      optionsStream << strAttr.getValue().str();
     } else {
-      llvm_unreachable(
-          "expected options element to be either StringAttr or UnitAttr");
+      valueAttrToAppend.print(optionsStream, /*elideType=*/true);
     }
   }
+  optionsStream.flush();
 
   // Get pass or pass pipeline from registry.
   const PassRegistryEntry *info = PassPipelineInfo::lookup(getPassName());
@@ -864,84 +874,121 @@ transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter,
 }
 
 static ParseResult parseApplyRegisteredPassOptions(
-    OpAsmParser &parser, ArrayAttr &options,
+    OpAsmParser &parser, DictionaryAttr &options,
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dynamicOptions) {
-  auto dynamicOptionMarker = UnitAttr::get(parser.getContext());
-  SmallVector<Attribute> optionsArray;
-
-  auto parseOperandOrString = [&]() -> OptionalParseResult {
-    OpAsmParser::UnresolvedOperand operand;
-    OptionalParseResult parsedOperand = parser.parseOptionalOperand(operand);
-    if (parsedOperand.has_value()) {
-      if (failed(parsedOperand.value()))
-        return failure();
-
-      dynamicOptions.push_back(operand);
-      optionsArray.push_back(
-          dynamicOptionMarker); // Placeholder for knowing where to
-                                // inject the dynamic option-as-param.
-      return success();
-    }
+  // Construct the options DictionaryAttr per a `{ key = value, ... }` syntax.
+  SmallVector<NamedAttribute> keyValuePairs;
 
-    StringAttr stringAttr;
-    OptionalParseResult parsedStringAttr =
-        parser.parseOptionalAttribute(stringAttr);
-    if (parsedStringAttr.has_value()) {
-      if (failed(parsedStringAttr.value()))
-        return failure();
-      optionsArray.push_back(stringAttr);
-      return success();
-    }
+  size_t dynamicOptionsIdx = 0;
+  auto parseKeyValuePair = [&]() -> ParseResult {
+    // Parse items of the form `key = value` where `key` is a bare identifier or
+    // a string and `value` is either an attribute or an operand.
+
+    std::string key;
+    Attribute valueAttr;
+    if (parser.parseOptionalKeywordOrString(&key))
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected key to either be an identifier or a string";
+    if (key.empty())
+      return failure();
 
-    return std::nullopt;
+    if (parser.parseEqual())
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected '=' after key in key-value pair";
+
+    // Parse the value, which can be either an attribute or an operand.
+    OptionalParseResult parsedValueAttr =
+        parser.parseOptionalAttribute(valueAttr);
+    if (!parsedValueAttr.has_value()) {
+      OpAsmParser::UnresolvedOperand operand;
+      ParseResult parsedOperand = parser.parseOperand(operand);
+      if (failed(parsedOperand))
+        return parser.emitError(parser.getCurrentLocation())
+               << "expected a valid attribute or operand as value associated "
+               << "to key '" << key << "'";
+      // To make use of the operand, we need to store it in the options dict.
+      // As SSA-values cannot occur in attributes, what we do instead is store
+      // an attribute in its place that contains the index of the param-operand,
+      // so that an attr-value associated to the param can be resolved later on.
+      dynamicOptions.push_back(operand);
+      auto wrappedIndex = IntegerAttr::get(
+          IntegerType::get(parser.getContext(), 64), dynamicOptionsIdx++);
+      valueAttr =
+          transform::ParamOperandAttr::get(parser.getContext(), wrappedIndex);
+    } else if (failed(parsedValueAttr.value())) {
+      return failure(); // NB: Attempted parse should have output error message.
+    } else if (isa<transform::ParamOperandAttr>(valueAttr)) {
+      return parser.emitError(parser.getCurrentLocation())
+             << "the param_operand attribute is a marker reserved for "
+             << "indicating a value will be passed via params and is only used "
+             << "in the generic print format";
+    }
+
+    keyValuePairs.push_back(NamedAttribute(key, valueAttr));
+    return success();
   };
 
-  OptionalParseResult parsedOptionsElement = parseOperandOrString();
-  while (parsedOptionsElement.has_value()) {
-    if (failed(parsedOptionsElement.value()))
-      return failure();
-    parsedOptionsElement = parseOperandOrString();
-  }
+  if (parser.parseCommaSeparatedList(AsmParser::Delimiter::Braces,
+                                     parseKeyValuePair,
+                                     " in options dictionary"))
+    return failure(); // NB: Attempted parse should have output error message.
 
-  if (optionsArray.empty()) {
+  if (DictionaryAttr::findDuplicate(
+          keyValuePairs, /*isSorted=*/false) // Also sorts the keyValuePairs.
+          .has_value())
     return parser.emitError(parser.getCurrentLocation())
-           << "expected at least one option (either a string or a param)";
-  }
-  options = parser.getBuilder().getArrayAttr(optionsArray);
+           << "duplicate keys found in options dictionary";
+
+  options = DictionaryAttr::getWithSorted(parser.getContext(), keyValuePairs);
+
   return success();
 }
 
 static void printApplyRegisteredPassOptions(OpAsmPrinter &printer,
-                                            Operation *op, ArrayAttr options,
+                                            Operation *op,
+                                            DictionaryAttr options,
                                             ValueRange dynamicOptions) {
-  size_t currentDynamicOptionIdx = 0;
-  for (auto [idx, optionAttr] : llvm::enumerate(options)) {
-    if (idx > 0)
-      printer << " "; // Interleave options separator.
+  if (options.empty())
+    return;
 
-    if (isa<UnitAttr>(optionAttr))
-      printer.printOperand(dynamicOptions[currentDynamicOptionIdx++]);
-    else if (auto strAttr = dyn_cast<StringAttr>(optionAttr))
-      printer.printAttribute(strAttr);
-    else
-      llvm_unreachable("each option should be either a StringAttr or UnitAttr");
-  }
+  printer << "{";
+  llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) {
+    printer << namedAttribute.getName() << " = ";
+    Attribute value = namedAttribute.getValue();
+    if (auto indexAttr = dyn_cast<transform::ParamOperandAttr>(value)) {
+      // Resolve index of param-operand to its actual SSA-value and print that.
+      printer.printOperand(dynamicOptions[indexAttr.getIndex().getInt()]);
+    } else {
+      printer.printAttribute(value);
+    }
+  });
+  printer << "}";
 }
 
 LogicalResult transform::ApplyRegisteredPassOp::verify() {
-  size_t numUnitsInOptions = 0;
-  for (Attribute optionsElement : getOptions()) {
-    if (isa<UnitAttr>(optionsElement))
-      numUnitsInOptions++;
-    else if (!isa<StringAttr>(optionsElement))
-      return emitOpError() << "expected each option to be either a StringAttr "
-                           << "or a UnitAttr, got " << optionsElement;
-  }
-
-  if (getDynamicOptions().size() != numUnitsInOptions)
-    return emitOpError()
-           << "expected the same number of options passed as params as "
-           << "UnitAttr elements in options ArrayAttr";
+  // Check that there is a one-to-one correspondence between param operands
+  // and references to dynamic options in the options dictionary.
+
+  auto dynamicOptions = SmallVector<Value>(getDynamicOptions());
+  for (NamedAttribute namedAttr : getOptions())
+    if (auto paramOperand =
+            dyn_cast<transform::ParamOperandAttr>(namedAttr.getValue())) {
+      size_t dynamicOptionIdx = paramOperand.getIndex().getInt();
+      if (dynamicOptionIdx < 0 || dynamicOptionIdx >= dynamicOptions.size())
+        return emitOpError()
+               << "dynamic option index " << dynamicOptionIdx
+               << " is out of bounds for the number of dynamic options: "
+               << dynamicOptions.size();
+      if (dynamicOptions[dynamicOptionIdx] == nullptr)
+        return emitOpError() << "dynamic option index " << dynamicOptionIdx
+                             << " is already used in options";
+      dynamicOptions[dynamicOptionIdx] = nullptr; // Mark this option as used.
+    }
+
+  for (Value dynamicOption : dynamicOptions)
+    if (dynamicOption)
+      return emitOpError() << "a param operand does not have a corresponding "
+                           << "param_operand attr in the options dict";
 
   return success();
 }
diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py
index 5b158ec6b65fd..10a04b0cc14e0 100644
--- a/mlir/python/mlir/dialects/transform/__init__.py
+++ b/mlir/python/mlir/dialects/transform/__init__.py
@@ -18,7 +18,12 @@
 except ImportError as e:
     raise RuntimeError("Error loading imports from extension module") from e
 
-from typing import Optional, Sequence, Union, NewType
+from typing import Dict, Optional, Sequence, Union, NewType
+
+
+@register_attribute_builder("ParamOperandAttr")
+def _paramOperandAttr(x: int, context) -> Attribute:
+    return Attribute.parse(f"#transform.param_operand<index={x}>", context=context)
 
 
 @_ods_cext.register_operation(_Dialect, replace=True)
@@ -214,6 +219,81 @@ def __init__(
         super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip)
 
 
+@_ods_cext.register_operation(_Dialect, replace=True)
+class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
+    def __init__(
+        self,
+        result: Type,
+        pass_name: Union[str, StringAttr],
+        target: Union[Operation, Value, OpView],
+        *,
+        options: Optional[
+            Dict[
+                Union[str, StringAttr],
+                Union[Attribute, Value, Operation, OpView],
+            ]
+        ] = None,
+        loc=None,
+        ip=None,
+    ):
+        options_dict = {}
+        dynamic_options = []
+
+        ParamOperandAttr = AttrBuilder.get("ParamOperandAttr")
+        context = (loc and loc.context) or Context.current
+
+        cur_param_operand_idx = 0
+        for key, value in options.items() if options is not None else {}:
+            if isinstance(key, StringAttr):
+                key = key.value
+
+            if isinstance(value, (Value, Operation, OpView)):
+                dynamic_options.append(_get_op_result_or_value(value))
+                options_dict[key] = ParamOperandAttr(cur_param_operand_idx, context)
+                cur_param_operand_idx += 1
+            elif isinstance(value, Attribute):
+                options_dict[key] = value
+            elif isinstance(value, str):
+                options_dict[key] = StringAttr.get(value)
+            else:
+                raise TypeError(f"Unsupported option type: {type(value)}")
+        if len(options_dict) > 0:
+            print(options_dict, cur_param_operand_idx)
+        super().__init__(
+            result,
+            pass_name,
+            dynamic_options,
+            target=_get_op_result_or_value(target),
+            options=DictAttr.get(options_dict),
+            loc=loc,
+            ip=ip,
+        )
+
+
+def apply_registered_pass(
+    result: Type,
+    pass_name: Union[str, StringAttr],
+    target: Union[Operation, Value, OpView],
+    *,
+    options: Optional[
+        Dict[
+            Union[str, StringAttr],
+            Union[Attribute, Value, Operation, OpView],
+        ]
+    ] = None,
+    loc=None,
+    ip=None,
+) -> Value:
+    return ApplyRegisteredPassOp(
+        result=result,
+        pass_name=pass_name,
+        target=target,
+        options=options,
+        loc=loc,
+        ip=ip,
+    ).result
+
+
 AnyOpTypeT = NewType("AnyOpType", AnyOpType)
 
 
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 463fd98afa65c..6e6d4eb7e249f 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -80,7 +80,7 @@ module attributes {transform.with_named_sequence} {
     // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}}
     // expected-error @below {{<Pass-Options-Parser>: no such option invalid-option}}
     transform.apply_registered_pass "canonicalize"
-        with options = "invalid-option=1" to %1
+        with options = { "invalid-option" = 1 } to %1
         : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -97,7 +97,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     transform.apply_registered_pass "canonicalize"
-        with options = "top-down=false" to %1
+        with options = { "top-down" = false } to %1
         : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -115,7 +115,7 @@ module attributes {transform.with_named_sequence} {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     //transform.apply_registered_pass "canonicalize" with options = "top-down=false,max-iterations=10" to %1 : (!transform.any_op) -> !transform.any_op
     transform.apply_registered_pass "canonicalize"
-        with options = "top-down=false test-convergence=true" to %1
+        with options = { "top-down" = false, "test-convergence" =true } to %1
         : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -132,7 +132,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     transform.apply_registered_pass "canonicalize"
-        with options = "top-down=false" "max-iterations=0" to %1
+        with options = { "top-down" = false, "max-iterations" = 0 } to %1
         : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -148,10 +148,15 @@ func.func @valid_dynamic_pass_options() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param
-    %max_rewrites = transform.param.constant "max-num-rewrites=1" -> !transform.any_param
-    %2 = transform.apply_registered_pass "canonicalize"
-        with options = "top-down=false" %max_iter "test-convergence=true" %max_rewrites to %1
+    %max_iter = transform.param.constant 10 -> !transform.any_param
+    %max_rewrites = transform.param.constant 1 -> !transform.any_param
+    %2 = transform.apply_registered_pass
+        "canonicalize"
+        with options = { "top-down" = false,
+                         "max-iterations" = %max_iter,
+                         "test-convergence" = true,
+                         "max-num-rewrites" =  %max_rewrites }
+        to %1
         : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -159,7 +164,7 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @invalid_dynamic_options_as_array() {
+func.func @invalid_options_as_str() {
   return
 }
 
@@ -167,34 +172,80 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param
-    // expected-error @+2 {{expected at least one option (either a string or a param)}}
+    // expected-error @+2 {{expected '{' in options dictionary}}
     %2 = transform.apply_registered_pass "canonicalize"
-        with options = ["top-down=false" %max_iter] to %1
-        : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+        with options = "top-down=false" to %1 : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
 
 // -----
 
-func.func @invalid_options_as_pairs() {
+func.func @invalid_options_as_pairs_without_braces() {
   return
 }
 
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    // expected-error @+2 {{expected 'to'}}
+    // expected-error @+2 {{expected '{' in options dictionary}}
     %2 = transform.apply_registered_pass "canonicalize"
-        with options = "top-down=" false to %1
-        : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+        with options = "top-down"=false to %1 : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
 
 // -----
 
-func.func @invalid_pass_option_param() {
+func.func @invalid_options_due_to_reserved_attr() {
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @+2 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}}
+    %2 = transform.apply_registered_pass "canonicalize"
+        with options = { "top-down" = #transform.param_operand<index=0> } to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @invalid_options_due_duplicated_key() {
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @+2 {{duplicate keys found in options dictionary}}
+    %2 = transform.apply_registered_pass "canonicalize"
+        with options = {"top-down"=false,"top-down"=true} to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @invalid_options_due_invalid_key() {
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @+2 {{expected key to either be an identifier or a string}}
+    %2 = transform.apply_registered_pass "canonicalize"
+        with options = { @label = 0 } to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @invalid_pass_option_bare_param() {
   return
 }
 
@@ -202,7 +253,7 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %pass_options = transform.param.constant 42 -> !transform.any_param
-    // expected-error @below {{options passed as a param must be a string, got 42}}
+    // expected-error @+2 {{expected '{' in options dictionary}}
     transform.apply_registered_pass "canonicalize"
         with options = %pass_options to %1
         : (!transform.any_param, !transform.any_op) -> !transform.any_op
@@ -219,12 +270,12 @@ func.func @too_many_pass_option_params() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %x = transform.param.constant "x" -> !transform.any_param
-    %y = transform.param.constant "y" -> !transform.any_param
-    %pass_options = transform.merge_handles %x, %y : !transform.any_param
+    %x = transform.param.constant true -> !transform.any_param
+    %y = transform.param.constant false -> !transform.any_param
+    %topdown_options = transform.merge_handles %x, %y : !transform.any_param
     // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}}
     transform.apply_registered_pass "canonicalize"
-        with options = %pass_options to %1
+        with options = { "top-down" = %topdown_options } to %1
         : (!transform.any_param, !transform.any_op) -> !transform.any_op
     transform.yield
   }
@@ -248,3 +299,77 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+/////////////////////////////////////////////////////////////////////
+// Check that the following cases are caugh in the generic format. //
+/////////////////////////////////////////////////////////////////////
+
+// Invalid due to param_operand occurences in options dict not being
+// one-to-one with the dynamic options provided as params:
+//   param_operand_index out of bounds w.r.t. the number of options provided via params.
+
+"builtin.module"() ({
+  "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
+  ^bb0(%arg0: !transform.any_op):
+    %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op
+    %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
+    // expected-error @below {{dynamic option index 1 is out of bounds for the number of dynamic options: 1}}
+    %2 = "transform.apply_registered_pass"(%1, %0) <{
+      options = {"max-iterations" = #transform.param_operand<index=1 : i64>,
+                 "test-convergence" = true,
+                 "top-down" = false},
+      pass_name = "canonicalize"}>
+    : (!transform.any_param, !transform.any_op) -> !transform.any_op
+    "transform.yield"() : () -> ()
+  }) : () -> ()
+}) {transform.with_named_sequence} : () -> ()
+
+// -----
+
+// Invalid due to param_operand occurences in options dict not being
+// one-to-one with the dynamic options provided as params:
+//   the first option-param is referred to twice and the second one not at all.
+// (In the pretty-printed format, if you want to refer to a param SSA-value twice, it counts as two param arguments.)
+
+"builtin.module"() ({
+  "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
+  ^bb0(%arg0: !transform.any_op):
+    %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op
+    %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
+    %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param
+    // expected-error @below {{dynamic option index 0 is already used in options}}
+    %3 = "transform.apply_registered_pass"(%1, %2, %0) <{
+      options = {"max-iterations" = #transform.param_operand<index=0 : i64>,
+                 "max-num-rewrites" = #transform.param_operand<index=0 : i64>,
+                 "test-convergence" = true,
+                 "top-down" = false},
+      pass_name = "canonicalize"}>
+    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    "transform.yield"() : () -> ()
+  }) : () -> ()
+}) {transform.with_named_sequence} : () -> ()
+
+// -----
+
+// Invalid due to param_operand occurences in options dict not being
+// one-to-one with the dynamic options provided as params:
+//   two option-params are provide though only the first one is referred to from the options-dict.
+
+"builtin.module"() ({
+  "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
+  ^bb0(%arg0: !transform.any_op):
+    %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op
+    %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
+    %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param
+    // expected-error @below {{a param operand does not have a corresponding param_operand attr in the options dict}}
+    %3 = "transform.apply_registered_pass"(%1, %2, %0) <{
+      options = {"max-iterations" = #transform.param_operand<index=0 : i64>,
+                 "test-convergence" = true,
+                 "top-down" = false},
+      pass_name = "canonicalize"}>
+    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    "transform.yield"() : () -> ()
+  }) : () -> ()
+}) {transform.with_named_sequence} : () -> ()
diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py
index 6ed4818fc9d2f..48bc9bad37a1e 100644
--- a/mlir/test/python/dialects/transform.py
+++ b/mlir/test/python/dialects/transform.py
@@ -254,3 +254,55 @@ def testReplicateOp(module: Module):
     # CHECK: %[[FIRST:.+]] = pdl_match
     # CHECK: %[[SECOND:.+]] = pdl_match
     # CHECK: %{{.*}} = replicate num(%[[FIRST]]) %[[SECOND]]
+
+
+@run
+def testApplyRegisteredPassOp(module: Module):
+    sequence = transform.SequenceOp(
+        transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get()
+    )
+    with InsertionPoint(sequence.body):
+        mod = transform.ApplyRegisteredPassOp(
+            transform.AnyOpType.get(), "canonicalize", sequence.bodyTarget
+        )
+        mod = transform.ApplyRegisteredPassOp(
+            transform.AnyOpType.get(),
+            "canonicalize",
+            mod.result,
+            options={"top-down": BoolAttr.get(False)},
+        )
+        max_iter = transform.param_constant(
+            transform.AnyParamType.get(),
+            IntegerAttr.get(IntegerType.get_signless(64), 10),
+        )
+        max_rewrites = transform.param_constant(
+            transform.AnyParamType.get(),
+            IntegerAttr.get(IntegerType.get_signless(64), 1),
+        )
+        transform.apply_registered_pass(
+            transform.AnyOpType.get(),
+            "canonicalize",
+            mod,
+            options={
+                "top-down": BoolAttr.get(False),
+                "max-iterations": max_iter,
+                "test-convergence": BoolAttr.get(True),
+                "max-rewrites": max_rewrites,
+            },
+        )
+        transform.YieldOp()
+    # CHECK-LABEL: TEST: testApplyRegisteredPassOp
+    # CHECK: transform.sequence
+    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op
+    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
+    # CHECK-SAME:    with options = {"top-down" = false}
+    # CHECK-SAME:    to {{.*}} : (!transform.any_op) -> !transform.any_op
+    # CHECK:   %[[MAX_ITER:.+]] = transform.param.constant
+    # CHECK:   %[[MAX_REWRITE:.+]] = transform.param.constant
+    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
+    # NB: MLIR has sorted the dict lexicographically by key:
+    # CHECK-SAME:    with options = {"max-iterations" = %[[MAX_ITER]],
+    # CHECK-SAME:                    "max-rewrites" =  %[[MAX_REWRITE]],
+    # CHECK-SAME:                    "test-convergence" = true,
+    # CHECK-SAME:                    "top-down" = false}
+    # CHECK-SAME:    to %{{.*}} : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op

From 459475020aeff15d0f886ab99c59d66b744d3e17 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Wed, 11 Jun 2025 16:35:55 +0100
Subject: [PATCH 105/851] Reapply 76197ea6f91f after removing an assertion

Specifically this is the assertion in BasicBlock.cpp. Now that we're not
examining or setting that flag consistently (because it'll be deleted in
about an hour) there's no need to keep this assertion.

Original commit title:

[DebugInfo][RemoveDIs] Remove some debug intrinsic-only codepaths (#143451)
---
 llvm/lib/CodeGen/SelectionDAG/FastISel.cpp |  3 -
 llvm/lib/IR/AutoUpgrade.cpp                | 25 ++----
 llvm/lib/IR/BasicBlock.cpp                 |  1 -
 llvm/lib/IR/DIBuilder.cpp                  | 97 +++++-----------------
 llvm/lib/IR/DebugInfo.cpp                  | 19 +----
 llvm/lib/Transforms/Utils/LoopUtils.cpp    | 36 +++-----
 llvm/unittests/IR/IRBuilderTest.cpp        | 10 ---
 7 files changed, 40 insertions(+), 151 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 59cd0dc8dd348..e8a3df3366b2b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1671,9 +1671,6 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc,
                               const DebugLoc &DbgLoc) {
   const BasicBlock *BB = FuncInfo.MBB->getBasicBlock();
   bool BlockHasMultipleInstrs = &BB->front() != &BB->back();
-  // Handle legacy case of debug intrinsics
-  if (BlockHasMultipleInstrs && !BB->getModule()->IsNewDbgInfoFormat)
-    BlockHasMultipleInstrs = BB->sizeWithoutDebug() > 1;
   if (BlockHasMultipleInstrs && FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
     // For more accurate line information if this is the only non-debug
     // instruction in the block then emit it, otherwise we have the
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index cb90af36f3d9f..a0886776ff93f 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -4490,7 +4490,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
   Builder.SetInsertPoint(CI->getParent(), CI->getIterator());
 
   if (!NewFn) {
-    bool FallthroughToDefaultUpgrade = false;
     // Get the Function's name.
     StringRef Name = F->getName();
 
@@ -4518,29 +4517,15 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     } else if (IsAMDGCN) {
       Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
     } else if (IsDbg) {
-      // We might have decided we don't want the new format after all between
-      // first requesting the upgrade and now; skip the conversion if that is
-      // the case, and check here to see if the intrinsic needs to be upgraded
-      // normally.
-      if (!CI->getModule()->IsNewDbgInfoFormat) {
-        bool NeedsUpgrade =
-            upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false);
-        if (!NeedsUpgrade)
-          return;
-        FallthroughToDefaultUpgrade = true;
-      } else {
-        upgradeDbgIntrinsicToDbgRecord(Name, CI);
-      }
+      upgradeDbgIntrinsicToDbgRecord(Name, CI);
     } else {
       llvm_unreachable("Unknown function for CallBase upgrade.");
     }
 
-    if (!FallthroughToDefaultUpgrade) {
-      if (Rep)
-        CI->replaceAllUsesWith(Rep);
-      CI->eraseFromParent();
-      return;
-    }
+    if (Rep)
+      CI->replaceAllUsesWith(Rep);
+    CI->eraseFromParent();
+    return;
   }
 
   const auto &DefaultCase = [&]() -> void {
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index f716e9970b841..62a75313bb171 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -60,7 +60,6 @@ void BasicBlock::convertToNewDbgValues() {
   // instruction.
   SmallVector<DbgRecord *, 4> DbgVarRecs;
   for (Instruction &I : make_early_inc_range(InstList)) {
-    assert(!I.DebugMarker && "DebugMarker already set on old-format instrs?");
     if (DbgVariableIntrinsic *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) {
       // Convert this dbg.value to a DbgVariableRecord.
       DbgVariableRecord *Value = new DbgVariableRecord(DVI);
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 5e5ff22132e99..1484c549dd580 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -1047,36 +1047,13 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val,
       LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID));
   assert(Link && "Linked instruction must have DIAssign metadata attached");
 
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
-        Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
-    // Insert after LinkedInstr.
-    BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
-    NextIt.setHeadBit(true);
-    insertDbgVariableRecord(DVR, NextIt);
-    return DVR;
-  }
-
-  LLVMContext &Ctx = LinkedInstr->getContext();
-  Module *M = LinkedInstr->getModule();
-  if (!AssignFn)
-    AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign);
-
-  std::array<Value *, 6> Args = {
-      MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)),
-      MetadataAsValue::get(Ctx, SrcVar),
-      MetadataAsValue::get(Ctx, ValExpr),
-      MetadataAsValue::get(Ctx, Link),
-      MetadataAsValue::get(Ctx, ValueAsMetadata::get(Addr)),
-      MetadataAsValue::get(Ctx, AddrExpr),
-  };
-
-  IRBuilder<> B(Ctx);
-  B.SetCurrentDebugLocation(DL);
-
-  auto *DVI = cast<DbgAssignIntrinsic>(B.CreateCall(AssignFn, Args));
-  DVI->insertAfter(LinkedInstr->getIterator());
-  return DVI;
+  DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign(
+      Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL);
+  // Insert after LinkedInstr.
+  BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator());
+  NextIt.setHeadBit(true);
+  insertDbgVariableRecord(DVR, NextIt);
+  return DVR;
 }
 
 /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics.
@@ -1101,18 +1078,10 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val,
                                               DIExpression *Expr,
                                               const DILocation *DL,
                                               InsertPosition InsertPt) {
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR =
-        DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
-    insertDbgVariableRecord(DVR, InsertPt);
-    return DVR;
-  }
-
-  if (!ValueFn)
-    ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value);
-  auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt);
-  cast<CallInst>(DVI)->setTailCall();
-  return DVI;
+  DbgVariableRecord *DVR =
+      DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL);
+  insertDbgVariableRecord(DVR, InsertPt);
+  return DVR;
 }
 
 DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
@@ -1124,25 +1093,10 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
              VarInfo->getScope()->getSubprogram() &&
          "Expected matching subprograms");
 
-  if (M.IsNewDbgInfoFormat) {
-    DbgVariableRecord *DVR =
-        DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
-    insertDbgVariableRecord(DVR, InsertPt);
-    return DVR;
-  }
-
-  if (!DeclareFn)
-    DeclareFn = getDeclareIntrin(M);
-
-  trackIfUnresolved(VarInfo);
-  trackIfUnresolved(Expr);
-  Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage),
-                   MetadataAsValue::get(VMContext, VarInfo),
-                   MetadataAsValue::get(VMContext, Expr)};
-
-  IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertPt);
-  return B.CreateCall(DeclareFn, Args);
+  DbgVariableRecord *DVR =
+      DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL);
+  insertDbgVariableRecord(DVR, InsertPt);
+  return DVR;
 }
 
 void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR,
@@ -1191,23 +1145,12 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
          "Expected matching subprograms");
 
   trackIfUnresolved(LabelInfo);
-  if (M.IsNewDbgInfoFormat) {
-    DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
-    if (InsertPt.isValid()) {
-      auto *BB = InsertPt.getBasicBlock();
-      BB->insertDbgRecordBefore(DLR, InsertPt);
-    }
-    return DLR;
+  DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL);
+  if (InsertPt.isValid()) {
+    auto *BB = InsertPt.getBasicBlock();
+    BB->insertDbgRecordBefore(DLR, InsertPt);
   }
-
-  if (!LabelFn)
-    LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label);
-
-  Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
-
-  IRBuilder<> B(DL->getContext());
-  initIRBuilder(B, DL, InsertPt);
-  return B.CreateCall(LabelFn, Args);
+  return DLR;
 }
 
 void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) {
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 7db9891fdbd75..2a84e7bae0f10 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -2123,22 +2123,11 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
     Expr = *R;
   }
   DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), {});
-  if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) {
-    auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
-        &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
-    (void)Assign;
-    LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
-    return;
-  }
-  auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest,
-                                    AddrExpr, VarRec.DL);
+  auto *Assign = DbgVariableRecord::createLinkedDVRAssign(
+      &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
   (void)Assign;
-  LLVM_DEBUG(if (!Assign.isNull()) {
-    if (const auto *Record = dyn_cast<DbgRecord *>(Assign))
-      errs() << " > INSERT: " << *Record << "\n";
-    else
-      errs() << " > INSERT: " << *cast<Instruction *>(Assign) << "\n";
-  });
+  LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
+  return;
 }
 
 #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h).
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 0681ebc111cb2..ff69fa9f70c4e 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -606,7 +606,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
 
   // Use a map to unique and a vector to guarantee deterministic ordering.
   llvm::SmallDenseSet<DebugVariable, 4> DeadDebugSet;
-  llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
   llvm::SmallVector<DbgVariableRecord *, 4> DeadDbgVariableRecords;
 
   if (ExitBlock) {
@@ -633,29 +632,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
           U.set(Poison);
         }
 
-        // RemoveDIs: do the same as below for DbgVariableRecords.
-        if (Block->IsNewDbgInfoFormat) {
-          for (DbgVariableRecord &DVR : llvm::make_early_inc_range(
-                   filterDbgVars(I.getDbgRecordRange()))) {
-            DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
-                              DVR.getDebugLoc().get());
-            if (!DeadDebugSet.insert(Key).second)
-              continue;
-            // Unlinks the DVR from it's container, for later insertion.
-            DVR.removeFromParent();
-            DeadDbgVariableRecords.push_back(&DVR);
-          }
-        }
-
-        // For one of each variable encountered, preserve a debug intrinsic (set
+        // For one of each variable encountered, preserve a debug record (set
         // to Poison) and transfer it to the loop exit. This terminates any
         // variable locations that were set during the loop.
-        auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I);
-        if (!DVI)
-          continue;
-        if (!DeadDebugSet.insert(DebugVariable(DVI)).second)
-          continue;
-        DeadDebugInst.push_back(DVI);
+        for (DbgVariableRecord &DVR :
+             llvm::make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) {
+          DebugVariable Key(DVR.getVariable(), DVR.getExpression(),
+                            DVR.getDebugLoc().get());
+          if (!DeadDebugSet.insert(Key).second)
+            continue;
+          // Unlinks the DVR from it's container, for later insertion.
+          DVR.removeFromParent();
+          DeadDbgVariableRecords.push_back(&DVR);
+        }
       }
 
     // After the loop has been deleted all the values defined and modified
@@ -671,9 +660,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
            "There should be a non-PHI instruction in exit block, else these "
            "instructions will have no parent.");
 
-    for (auto *DVI : DeadDebugInst)
-      DVI->moveBefore(*ExitBlock, InsertDbgValueBefore);
-
     // Due to the "head" bit in BasicBlock::iterator, we're going to insert
     // each DbgVariableRecord right at the start of the block, wheras dbg.values
     // would be repeatedly inserted before the first instruction. To replicate
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 3a7ba924792ef..aadae5287c380 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -1003,18 +1003,8 @@ TEST_F(IRBuilderTest, DIBuilder) {
     EXPECT_TRUE(verifyModule(*M));
   };
 
-  // Test in new-debug mode.
-  EXPECT_TRUE(M->IsNewDbgInfoFormat);
   RunTest();
-
-  // Test in old-debug mode.
-  // Reset the test then call convertFromNewDbgValues to flip the flag
-  // on the test's Module, Function and BasicBlock.
   TearDown();
-  SetUp();
-  M->convertFromNewDbgValues();
-  EXPECT_FALSE(M->IsNewDbgInfoFormat);
-  RunTest();
 }
 
 TEST_F(IRBuilderTest, createArtificialSubprogram) {

From f1575de4c5de9268f92eea1641af755a477e4ee4 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 11 Jun 2025 11:37:12 -0500
Subject: [PATCH 106/851] [libc][NFC] Remove template from GPU allocator
 reference counter

Summary:
We don't need this to be generic, precommit for
https://github.com/llvm/llvm-project/pull/143607
---
 libc/src/__support/GPU/allocator.cpp | 32 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index 135ced3df704c..ecc0de1cb6ec3 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -283,7 +283,7 @@ struct Slab {
 
 /// A wait-free guard around a pointer resource to be created dynamically if
 /// space is available and freed once there are no more users.
-template <typename T> struct GuardPtr {
+struct GuardPtr {
 private:
   struct RefCounter {
     // Indicates that the object is in its deallocation phase and thus invalid.
@@ -339,22 +339,22 @@ template <typename T> struct GuardPtr {
     cpp::Atomic<uint64_t> counter{0};
   };
 
-  cpp::Atomic<T *> ptr{nullptr};
+  cpp::Atomic<Slab *> ptr{nullptr};
   RefCounter ref{};
 
   // Should be called be a single lane for each different pointer.
   template <typename... Args>
-  T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
-    T *expected = ptr.load(cpp::MemoryOrder::RELAXED);
+  Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) {
+    Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED);
     if (!expected &&
-        ptr.compare_exchange_strong(expected, reinterpret_cast<T *>(SENTINEL),
-                                    cpp::MemoryOrder::RELAXED,
-                                    cpp::MemoryOrder::RELAXED)) {
+        ptr.compare_exchange_strong(
+            expected, reinterpret_cast<Slab *>(SENTINEL),
+            cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) {
       count = cpp::numeric_limits<uint64_t>::max();
-      void *raw = impl::rpc_allocate(sizeof(T));
+      void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
-      T *mem = new (raw) T(cpp::forward<Args>(args)...);
+      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);
 
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(mem, cpp::MemoryOrder::RELAXED);
@@ -364,7 +364,7 @@ template <typename T> struct GuardPtr {
       return mem;
     }
 
-    if (!expected || expected == reinterpret_cast<T *>(SENTINEL))
+    if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
       return nullptr;
 
     if (!ref.acquire(n, count))
@@ -379,10 +379,10 @@ template <typename T> struct GuardPtr {
   // The uniform mask represents which lanes share the same pointer. For each
   // uniform value we elect a leader to handle it on behalf of the other lanes.
   template <typename... Args>
-  T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
-              Args &&...args) {
+  Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count,
+                 Args &&...args) {
     count = 0;
-    T *result = nullptr;
+    Slab *result = nullptr;
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
       result = try_lock_impl(cpp::popcount(uniform), count,
                              cpp::forward<Args>(args)...);
@@ -403,8 +403,8 @@ template <typename T> struct GuardPtr {
     cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) &&
         ref.release(cpp::popcount(mask))) {
-      T *p = ptr.load(cpp::MemoryOrder::RELAXED);
-      p->~T();
+      Slab *p = ptr.load(cpp::MemoryOrder::RELAXED);
+      p->~Slab();
       impl::rpc_free(p);
       cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
       ptr.store(nullptr, cpp::MemoryOrder::RELAXED);
@@ -417,7 +417,7 @@ template <typename T> struct GuardPtr {
 };
 
 // The global array used to search for a valid slab to allocate from.
-static GuardPtr<Slab> slots[ARRAY_SIZE] = {};
+static GuardPtr slots[ARRAY_SIZE] = {};
 
 // Tries to find a slab in the table that can support the given chunk size.
 static Slab *find_slab(uint32_t chunk_size) {

From aa8a1fa6f515f45db55365b9c1f8453ded24ed32 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Wed, 11 Jun 2025 18:42:10 +0200
Subject: [PATCH 107/851] [DLCov][NFC] Annotate intentionally-blank DebugLocs
 in existing code (#136192)

Following the work in PR #107279, this patch applies the annotative
DebugLocs, which indicate that a particular instruction is intentionally
missing a location for a given reason, to existing sites in the compiler
where their conditions apply. This is NFC in ordinary LLVM builds (each
function `DebugLoc::getFoo()` is inlined as `DebugLoc()`), but marks the
instruction in coverage-tracking builds so that it will be ignored by
Debugify, allowing only real errors to be reported. From a developer
standpoint, it also communicates the intentionality and reason for a
missing DebugLoc.

Some notes for reviewers:

- The difference between `I->dropLocation()` and
`I->setDebugLoc(DebugLoc::getDropped())` is that the former _may_ decide
to keep some debug info alive, while the latter will always be empty; in
this patch, I always used the latter (even if the former could
technically be correct), because the former could result in some
(barely) different output, and I'd prefer to keep this patch purely NFC.
- I've generally documented the uses of `DebugLoc::getUnknown()`, with
the exception of the vectorizers - in summary, they are a huge cause of
dropped source locations, and I don't have the time or the domain
knowledge currently to solve that, so I've plastered it all over them as
a form of "fixme".
---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp         | 10 ++++--
 llvm/lib/Transforms/IPO/IROutliner.cpp        |  4 +--
 .../Transforms/InstCombine/InstCombinePHI.cpp |  9 ++++-
 .../Scalar/CorrelatedValuePropagation.cpp     |  3 +-
 llvm/lib/Transforms/Scalar/IndVarSimplify.cpp |  3 ++
 llvm/lib/Transforms/Scalar/JumpThreading.cpp  |  4 ++-
 llvm/lib/Transforms/Scalar/LICM.cpp           |  4 ++-
 .../Transforms/Scalar/LoopLoadElimination.cpp |  3 +-
 .../Transforms/Scalar/SimpleLoopUnswitch.cpp  |  3 ++
 .../Scalar/TailRecursionElimination.cpp       |  4 ++-
 llvm/lib/Transforms/Utils/InlineFunction.cpp  |  9 +++++
 llvm/lib/Transforms/Utils/Local.cpp           |  3 +-
 llvm/lib/Transforms/Utils/SCCPSolver.cpp      |  4 ++-
 llvm/lib/Transforms/Utils/SSAUpdater.cpp      |  5 +++
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 10 +++---
 .../Vectorize/LoopVectorizationPlanner.h      | 34 ++++++++++++-------
 .../Transforms/Vectorize/LoopVectorize.cpp    |  8 +++--
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 12 +++++--
 llvm/lib/Transforms/Vectorize/VPlan.h         |  6 ++--
 19 files changed, 101 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index b3fe0ab8b5cb4..7db0586386506 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1494,8 +1494,14 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
     // FIXME: Pass Global's alignment when globals have alignment
     AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(),
                                         nullptr, GV->getName(), FirstI);
-    if (!isa<UndefValue>(GV->getInitializer()))
-      new StoreInst(GV->getInitializer(), Alloca, FirstI);
+    Alloca->setDebugLoc(DebugLoc::getCompilerGenerated());
+    if (!isa<UndefValue>(GV->getInitializer())) {
+      auto *SI = new StoreInst(GV->getInitializer(), Alloca, FirstI);
+      // FIXME: We're localizing a global and creating a store instruction for
+      // the initial value of that global. Could we logically use the global
+      // variable's (if one exists) line for this?
+      SI->setDebugLoc(DebugLoc::getCompilerGenerated());
+    }
 
     GV->replaceAllUsesWith(Alloca);
     GV->eraseFromParent();
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index ff66a518be752..cb18b55ae2183 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -730,7 +730,7 @@ static void moveFunctionData(Function &Old, Function &New,
       // other outlined instructions.
       if (!isa<CallInst>(&Val)) {
         // Remove the debug information for outlined functions.
-        Val.setDebugLoc(DebugLoc());
+        Val.setDebugLoc(DebugLoc::getDropped());
 
         // Loop info metadata may contain line locations. Update them to have no
         // value in the new subprogram since the outlined code could be from
@@ -1864,7 +1864,7 @@ replaceArgumentUses(OutlinableRegion &Region,
       Value *ValueOperand = SI->getValueOperand();
 
       StoreInst *NewI = cast<StoreInst>(I->clone());
-      NewI->setDebugLoc(DebugLoc());
+      NewI->setDebugLoc(DebugLoc::getDropped());
       BasicBlock *OutputBB = VBBIt->second;
       NewI->insertInto(OutputBB, OutputBB->end());
       LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index a842a5edcb8a3..6477141ab095f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -870,7 +870,14 @@ Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) {
     NewPhi->addIncoming(NewIncoming[I], Phi.getIncomingBlock(I));
 
   InsertNewInstBefore(NewPhi, Phi.getIterator());
-  return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
+  auto *CI = CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
+
+  // We use a dropped location here because the new ZExt is necessarily a merge
+  // of ZExtInsts and at least one constant from incoming branches; the presence
+  // of the constant means we have no viable DebugLoc from that branch, and
+  // therefore we must use a dropped location.
+  CI->setDebugLoc(DebugLoc::getDropped());
+  return CI;
 }
 
 /// If all operands to a PHI node are the same "unary" operator and they all are
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index b95a851c99b49..4627f537dc16b 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -432,7 +432,8 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
       BasicBlock *NewUnreachableBB =
           BasicBlock::Create(BB->getContext(), "default.unreachable",
                              BB->getParent(), DefaultDest);
-      new UnreachableInst(BB->getContext(), NewUnreachableBB);
+      auto *UI = new UnreachableInst(BB->getContext(), NewUnreachableBB);
+      UI->setDebugLoc(DebugLoc::getTemporary());
 
       DefaultDest->removePredecessor(BB);
       SI->setDefaultDest(NewUnreachableBB);
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 95d52b9b4e189..334c911191cb8 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1506,6 +1506,9 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) {
       auto *NewRHS = CastInst::Create(
           Instruction::Trunc, RHS, LHSOp->getType(), "",
           L->getLoopPreheader()->getTerminator()->getIterator());
+      // NewRHS is an operation that has been hoisted out of the loop, and
+      // therefore should have a dropped location.
+      NewRHS->setDebugLoc(DebugLoc::getDropped());
       ICmp->setOperand(Swapped ? 1 : 0, LHSOp);
       ICmp->setOperand(Swapped ? 0 : 1, NewRHS);
       // Samesign flag cannot be preserved after narrowing the compare.
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 9449b4cb35b93..37b85bf9de811 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -3001,8 +3001,10 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) {
       continue;
     // Expand the select.
     Value *Cond = SI->getCondition();
-    if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI))
+    if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) {
       Cond = new FreezeInst(Cond, "cond.fr", SI->getIterator());
+      cast<FreezeInst>(Cond)->setDebugLoc(DebugLoc::getTemporary());
+    }
     MDNode *BranchWeights = getBranchWeightMDNode(*SI);
     Instruction *Term =
         SplitBlockAndInsertIfThen(Cond, SI, false, BranchWeights);
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9773ef778b690..3024ccb330b1a 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2248,7 +2248,7 @@ bool llvm::promoteLoopAccessesToScalars(
     if (SawUnorderedAtomic)
       PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
     PreheaderLoad->setAlignment(Alignment);
-    PreheaderLoad->setDebugLoc(DebugLoc());
+    PreheaderLoad->setDebugLoc(DebugLoc::getDropped());
     if (AATags && LoadIsGuaranteedToExecute)
       PreheaderLoad->setAAMetadata(AATags);
 
@@ -2808,6 +2808,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
     auto *NewBO =
         BinaryOperator::Create(Ins->getOpcode(), LHS, RHS,
                                Ins->getName() + ".reass", Ins->getIterator());
+    NewBO->setDebugLoc(DebugLoc::getDropped());
     NewBO->copyIRFlags(Ins);
     if (VariantOp == Ins)
       VariantOp = NewBO;
@@ -2864,6 +2865,7 @@ static bool hoistBOAssociation(Instruction &I, Loop &L,
 
   auto *NewBO = BinaryOperator::Create(
       Opcode, LV, Inv, BO->getName() + ".reass", BO->getIterator());
+  NewBO->setDebugLoc(DebugLoc::getDropped());
 
   if (Opcode == Instruction::FAdd || Opcode == Instruction::FMul) {
     // Intersect FMF flags for FADD and FMUL.
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 39e8d702a692e..6bdf76f789a49 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -442,7 +442,7 @@ class LoadEliminationForLoop {
     assert(PH && "Preheader should exist!");
     Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
                                           PH->getTerminator());
-    Value *Initial =
+    Instruction *Initial =
         new LoadInst(Cand.Load->getType(), InitialPtr, "load_initial",
                      /* isVolatile */ false, Cand.Load->getAlign(),
                      PH->getTerminator()->getIterator());
@@ -450,6 +450,7 @@ class LoadEliminationForLoop {
     // into the loop's preheader. A debug location inside the loop will cause
     // a misleading stepping when debugging. The test update-debugloc-store
     // -forwarded.ll checks this.
+    Initial->setDebugLoc(DebugLoc::getDropped());
 
     PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded");
     PHI->insertBefore(L->getHeader()->begin());
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 0bf90036b8b82..9b40fc03da6bb 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -274,6 +274,7 @@ static void buildPartialUnswitchConditionalBranch(
     BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze,
     const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) {
   IRBuilder<> IRB(&BB);
+  IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
 
   SmallVector<Value *> FrozenInvariants;
   for (Value *Inv : Invariants) {
@@ -330,6 +331,7 @@ static void buildPartialInvariantUnswitchConditionalBranch(
   }
 
   IRBuilder<> IRB(&BB);
+  IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated());
   Value *Cond = VMap[ToDuplicate[0]];
   IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
                    Direction ? &NormalSucc : &UnswitchedSucc);
@@ -2369,6 +2371,7 @@ static void unswitchNontrivialInvariants(
         // BI (`dyn_cast<BranchInst>(TI)`) is an in-loop instruction hoisted
         // out of the loop.
         Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI->getIterator());
+        cast<Instruction>(Cond)->setDebugLoc(DebugLoc::getDropped());
       }
       BI->setCondition(Cond);
       DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 7dd6c60370ed9..c71c5a70a12fd 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -515,7 +515,8 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
   BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
   NewEntry->takeName(HeaderBB);
   HeaderBB->setName("tailrecurse");
-  BranchInst::Create(HeaderBB, NewEntry);
+  auto *BI = BranchInst::Create(HeaderBB, NewEntry);
+  BI->setDebugLoc(DebugLoc::getCompilerGenerated());
   // If the new branch preserves the debug location of CI, it could result in
   // misleading stepping, if CI is located in a conditional branch.
   // So, here we don't give any debug location to the new branch.
@@ -801,6 +802,7 @@ void TailRecursionEliminator::cleanupAndFinalize() {
         SelectInst *SI =
             SelectInst::Create(RetKnownPN, RetPN, RI->getOperand(0),
                                "current.ret.tr", RI->getIterator());
+        SI->setDebugLoc(DebugLoc::getCompilerGenerated());
         RetSelects.push_back(SI);
         RI->setOperand(0, SI);
       }
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 7a9605bf5f8d4..f47c467d15140 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1775,6 +1775,7 @@ static Value *HandleByValArgument(Type *ByValType, Value *Arg,
   AllocaInst *NewAlloca =
       new AllocaInst(ByValType, Arg->getType()->getPointerAddressSpace(),
                      nullptr, Alignment, Arg->getName());
+  NewAlloca->setDebugLoc(DebugLoc::getCompilerGenerated());
   NewAlloca->insertBefore(Caller->begin()->begin());
   IFI.StaticAllocas.push_back(NewAlloca);
 
@@ -3258,6 +3259,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
 
     // Add an unconditional branch to make this look like the CallInst case...
     CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), CB.getIterator());
+    // We intend to replace this DebugLoc with another later.
+    CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getTemporary());
 
     // Split the basic block.  This guarantees that no PHI nodes will have to be
     // updated due to new incoming edges, and make the invoke case more
@@ -3359,6 +3362,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
     Returns[0]->eraseFromParent();
     ReturnBB->eraseFromParent();
   } else if (!CB.use_empty()) {
+    // In this case there are no returns to use, so there is no clear source
+    // location for the "return".
+    // FIXME: It may be correct to use the scope end line of the function here,
+    // since this likely means we are falling out of the function.
+    if (CreatedBranchToNormalDest)
+      CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getUnknown());
     // No returns, but something is using the return value of the call.  Just
     // nuke the result.
     CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 2630a1a7a6af4..a3252a69874d3 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3127,7 +3127,8 @@ static bool markAliveBlocks(Function &F,
           BasicBlock *UnreachableNormalDest = BasicBlock::Create(
               Ctx, OrigNormalDest->getName() + ".unreachable",
               II->getFunction(), OrigNormalDest);
-          new UnreachableInst(Ctx, UnreachableNormalDest);
+          auto *UI = new UnreachableInst(Ctx, UnreachableNormalDest);
+          UI->setDebugLoc(DebugLoc::getTemporary());
           II->setNormalDest(UnreachableNormalDest);
           if (DTU)
             DTU->applyUpdates(
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 1a2e422356270..f4b378b82daec 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -348,7 +348,9 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU,
         NewUnreachableBB =
             BasicBlock::Create(DefaultDest->getContext(), "default.unreachable",
                                DefaultDest->getParent(), DefaultDest);
-        new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB);
+        auto *UI =
+            new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB);
+        UI->setDebugLoc(DebugLoc::getTemporary());
       }
 
       DefaultDest->removePredecessor(BB);
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 48d9528f0c3df..5db7fc956c497 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -318,6 +318,11 @@ class SSAUpdaterTraits<SSAUpdater> {
                                SSAUpdater *Updater) {
     PHINode *PHI =
         PHINode::Create(Updater->ProtoType, NumPreds, Updater->ProtoName);
+    // FIXME: Ordinarily we don't care about or try to assign DebugLocs to PHI
+    // nodes, but loop optimizations may try to use a PHI node as a DebugLoc
+    // source (e.g. if this is an induction variable), and it's not clear what
+    // location we could attach here, so mark this unknown for now.
+    PHI->setDebugLoc(DebugLoc::getUnknown());
     PHI->insertBefore(BB->begin());
     return PHI;
   }
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index e221022bb8361..975ce3bef5176 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1137,7 +1137,7 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
         // branch, drop it. When we fold the bonus instructions we want to make
         // sure we reset their debug locations in order to avoid stepping on
         // dead code caused by folding dead branches.
-        NewBonusInst->setDebugLoc(DebugLoc());
+        NewBonusInst->setDebugLoc(DebugLoc::getDropped());
       } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) {
         mapAtomInstance(DL, VMap);
       }
@@ -2821,7 +2821,8 @@ static void mergeCompatibleInvokesImpl(ArrayRef<InvokeInst *> Invokes,
       // so just form a new block with unreachable terminator.
       BasicBlock *MergedNormalDest = BasicBlock::Create(
           Ctx, II0BB->getName() + ".cont", Func, InsertBeforeBlock);
-      new UnreachableInst(Ctx, MergedNormalDest);
+      auto *UI = new UnreachableInst(Ctx, MergedNormalDest);
+      UI->setDebugLoc(DebugLoc::getTemporary());
       MergedInvoke->setNormalDest(MergedNormalDest);
     }
 
@@ -3389,7 +3390,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
     if (!SpeculatedStoreValue || &I != SpeculatedStore) {
       // Don't update the DILocation of dbg.assign intrinsics.
       if (!isa<DbgAssignIntrinsic>(&I))
-        I.setDebugLoc(DebugLoc());
+        I.setDebugLoc(DebugLoc::getDropped());
     }
     I.dropUBImplyingAttrsAndMetadata();
 
@@ -5707,7 +5708,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch,
   BasicBlock *NewDefaultBlock = BasicBlock::Create(
       BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
       OrigDefaultBlock);
-  new UnreachableInst(Switch->getContext(), NewDefaultBlock);
+  auto *UI = new UnreachableInst(Switch->getContext(), NewDefaultBlock);
+  UI->setDebugLoc(DebugLoc::getTemporary());
   Switch->setDefaultDest(&*NewDefaultBlock);
   if (DTU) {
     SmallVector<DominatorTree::UpdateType, 2> Updates;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index b81d582f07e88..70f541d64b305 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -153,7 +153,7 @@ class VPBuilder {
   VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
                               Instruction *Inst = nullptr,
                               const Twine &Name = "") {
-    DebugLoc DL;
+    DebugLoc DL = DebugLoc::getUnknown();
     if (Inst)
       DL = Inst->getDebugLoc();
     VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name);
@@ -165,7 +165,8 @@ class VPBuilder {
     return createInstruction(Opcode, Operands, DL, Name);
   }
   VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                              const VPIRFlags &Flags, DebugLoc DL = {},
+                              const VPIRFlags &Flags,
+                              DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, Flags, DL, Name));
@@ -174,7 +175,8 @@ class VPBuilder {
   VPInstruction *createNaryOp(unsigned Opcode,
                               std::initializer_list<VPValue *> Operands,
                               Type *ResultTy, const VPIRFlags &Flags = {},
-                              DebugLoc DL = {}, const Twine &Name = "") {
+                              DebugLoc DL = DebugLoc::getUnknown(),
+                              const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name));
   }
@@ -182,22 +184,25 @@ class VPBuilder {
   VPInstruction *createOverflowingOp(unsigned Opcode,
                                      std::initializer_list<VPValue *> Operands,
                                      VPRecipeWithIRFlags::WrapFlagsTy WrapFlags,
-                                     DebugLoc DL = {}, const Twine &Name = "") {
+                                     DebugLoc DL = DebugLoc::getUnknown(),
+                                     const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
   }
 
-  VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
+  VPValue *createNot(VPValue *Operand, DebugLoc DL = DebugLoc::getUnknown(),
                      const Twine &Name = "") {
     return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
   }
 
-  VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
+  VPValue *createAnd(VPValue *LHS, VPValue *RHS,
+                     DebugLoc DL = DebugLoc::getUnknown(),
                      const Twine &Name = "") {
     return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name);
   }
 
-  VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
+  VPValue *createOr(VPValue *LHS, VPValue *RHS,
+                    DebugLoc DL = DebugLoc::getUnknown(),
                     const Twine &Name = "") {
 
     return tryInsertInstruction(new VPInstruction(
@@ -205,14 +210,16 @@ class VPBuilder {
         VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name));
   }
 
-  VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {},
+  VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS,
+                            DebugLoc DL = DebugLoc::getUnknown(),
                             const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name));
   }
 
   VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal,
-                        DebugLoc DL = {}, const Twine &Name = "",
+                        DebugLoc DL = DebugLoc::getUnknown(),
+                        const Twine &Name = "",
                         std::optional<FastMathFlags> FMFs = std::nullopt) {
     auto *Select =
         FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal},
@@ -226,20 +233,23 @@ class VPBuilder {
   /// and \p B.
   /// TODO: add createFCmp when needed.
   VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
-                      DebugLoc DL = {}, const Twine &Name = "") {
+                      DebugLoc DL = DebugLoc::getUnknown(),
+                      const Twine &Name = "") {
     assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
            Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
     return tryInsertInstruction(
         new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
   }
 
-  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
+  VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
+                              DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
                           GEPNoWrapFlags::none(), DL, Name));
   }
-  VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
+  VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset,
+                                DebugLoc DL = DebugLoc::getUnknown(),
                                 const Twine &Name = "") {
     return tryInsertInstruction(
         new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset},
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 427c1460fcfc9..2a237f42e4042 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -772,7 +772,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
 /// Look for a meaningful debug location on the instruction or its operands.
 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
   if (!I)
-    return DebugLoc();
+    return DebugLoc::getUnknown();
 
   DebugLoc Empty;
   if (I->getDebugLoc() != Empty)
@@ -1881,13 +1881,15 @@ class GeneratedRTChecks {
     if (SCEVCheckBlock) {
       SCEVCheckBlock->getTerminator()->moveBefore(
           Preheader->getTerminator()->getIterator());
-      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
+      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
+      UI->setDebugLoc(DebugLoc::getTemporary());
       Preheader->getTerminator()->eraseFromParent();
     }
     if (MemCheckBlock) {
       MemCheckBlock->getTerminator()->moveBefore(
           Preheader->getTerminator()->getIterator());
-      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
+      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
+      UI->setDebugLoc(DebugLoc::getTemporary());
       Preheader->getTerminator()->eraseFromParent();
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ec40124c57a6a..c3ca22dce0cc4 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17434,6 +17434,12 @@ static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return llvm::propagateMetadata(Inst, Insts);
 }
 
+static DebugLoc getDebugLocFromPHI(PHINode &PN) {
+  if (DebugLoc DL = PN.getDebugLoc())
+    return DL;
+  return DebugLoc::getUnknown();
+}
+
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilderBase::InsertPointGuard Guard(Builder);
 
@@ -17599,14 +17605,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       auto *PH = cast<PHINode>(VL0);
       Builder.SetInsertPoint(PH->getParent(),
                              PH->getParent()->getFirstNonPHIIt());
-      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+      Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
       Value *V = NewPhi;
 
       // Adjust insertion point once all PHI's have been generated.
       Builder.SetInsertPoint(PH->getParent(),
                              PH->getParent()->getFirstInsertionPt());
-      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+      Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
 
       V = FinalShuffle(V, E);
 
@@ -17638,7 +17644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         }
 
         Builder.SetInsertPoint(IBB->getTerminator());
-        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+        Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH));
         Value *Vec = vectorizeOperand(E, I);
         if (VecTy != Vec->getType()) {
           assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index bbcbfee4e471b..acc861b991975 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1816,9 +1816,9 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
 class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors {
 protected:
   VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr,
-                    VPValue *Start, DebugLoc DL = {})
-      : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>({Start}), UnderlyingInstr, DL) {
-  }
+                    VPValue *Start, DebugLoc DL = DebugLoc::getUnknown())
+      : VPSingleDefRecipe(VPDefID, ArrayRef<VPValue *>({Start}),
+                          UnderlyingInstr, DL) {}
 
   const VPRecipeBase *getAsRecipe() const override { return this; }
 

From 117e78fe5012087c1ee535b91936bf4d8e3c7785 Mon Sep 17 00:00:00 2001
From: William <113542065+saturn691@users.noreply.github.com>
Date: Wed, 11 Jun 2025 17:51:34 +0100
Subject: [PATCH 108/851] [libc] Add NULL macro definitions to header files
 (#142764)

By the C standard, <locale.h>, <stddef.h> <stdio.h>, <stdlib.h>,
<string.h>, <time.h>, and <wchar.h> require NULL to be defined.
---
 libc/include/CMakeLists.txt | 5 +++++
 libc/include/locale.yaml    | 3 +++
 libc/include/stdio.yaml     | 2 ++
 libc/include/stdlib.yaml    | 4 +++-
 libc/include/string.h.def   | 2 --
 libc/include/string.yaml    | 4 +++-
 libc/include/time.yaml      | 4 +++-
 libc/include/wchar.yaml     | 4 +++-
 8 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index 7209e10c68b8f..55268d19529c7 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -255,6 +255,7 @@ add_header_macro(
   time.h
   DEPENDS
     .llvm_libc_common_h
+    .llvm-libc-macros.null_macro
     .llvm-libc-macros.time_macros
     .llvm-libc-types.clock_t
     .llvm-libc-types.time_t
@@ -329,6 +330,7 @@ add_header_macro(
   stdio.h
   DEPENDS
     .llvm-libc-macros.file_seek_macros
+    .llvm-libc-macros.null_macro
     .llvm-libc-macros.stdio_macros
     .llvm-libc-types.FILE
     .llvm-libc-types.cookie_io_functions_t
@@ -343,6 +345,7 @@ add_header_macro(
   ../libc/include/stdlib.yaml
   stdlib.h
   DEPENDS
+    .llvm-libc-macros.null_macro
     .llvm-libc-macros.stdlib_macros
     .llvm-libc-types.__atexithandler_t
     .llvm-libc-types.__qsortcompare_t
@@ -709,6 +712,7 @@ add_header_macro(
   wchar.h
   DEPENDS
     .llvm_libc_common_h
+    .llvm-libc-macros.null_macro
     .llvm-libc-macros.wchar_macros
     .llvm-libc-types.mbstate_t
     .llvm-libc-types.size_t
@@ -723,6 +727,7 @@ add_header_macro(
   DEPENDS
     .llvm_libc_common_h
     .llvm-libc-macros.locale_macros
+    .llvm-libc-macros.null_macro
     .llvm-libc-types.locale_t
     .llvm-libc-types.struct_lconv
 )
diff --git a/libc/include/locale.yaml b/libc/include/locale.yaml
index 6c71b70e59f0b..4566984ad83af 100644
--- a/libc/include/locale.yaml
+++ b/libc/include/locale.yaml
@@ -1,5 +1,8 @@
 header: locale.h
 header_template: locale.h.def
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: locale_t
   - type_name: struct_lconv
diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml
index 2619984cca264..3d5164fa10ffb 100644
--- a/libc/include/stdio.yaml
+++ b/libc/include/stdio.yaml
@@ -1,6 +1,8 @@
 header: stdio.h
 header_template: stdio.h.def
 macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
   - macro_name: stdout
     macro_value: stdout
   - macro_name: stdin
diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml
index f7155ba27a162..3b2ff13c684b1 100644
--- a/libc/include/stdlib.yaml
+++ b/libc/include/stdlib.yaml
@@ -4,7 +4,9 @@ standards:
   - stdc
 merge_yaml_files:
   - stdlib-malloc.yaml
-macros: []
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: __atexithandler_t
   - type_name: __qsortcompare_t
diff --git a/libc/include/string.h.def b/libc/include/string.h.def
index 1bd2687db2bea..339d005e43a4f 100644
--- a/libc/include/string.h.def
+++ b/libc/include/string.h.def
@@ -11,8 +11,6 @@
 
 #include "__llvm-libc-common.h"
 
-#include "llvm-libc-macros/null-macro.h"
-
 %%public_api()
 
 #endif // LLVM_LIBC_STRING_H
diff --git a/libc/include/string.yaml b/libc/include/string.yaml
index 9f72b8db6c1eb..736deceb453de 100644
--- a/libc/include/string.yaml
+++ b/libc/include/string.yaml
@@ -1,6 +1,8 @@
 header: string.h
 header_template: string.h.def
-macros: []
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: locale_t
   - type_name: size_t
diff --git a/libc/include/time.yaml b/libc/include/time.yaml
index 7bb25dbe85ac4..3b9d77c0aaae2 100644
--- a/libc/include/time.yaml
+++ b/libc/include/time.yaml
@@ -1,6 +1,8 @@
 header: time.h
 header_template: time.h.def
-macros: []
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: struct_timeval
   - type_name: clockid_t
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 877be48b6a10f..57f4f6660827e 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -1,6 +1,8 @@
 header: wchar.h
 header_template: wchar.h.def
-macros: []
+macros:
+  - macro_name: NULL
+    macro_header: null-macro.h
 types:
   - type_name: size_t
   - type_name: wint_t

From 469922f7c40a1733fba98e29fa2bd09a9565ddd6 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Wed, 11 Jun 2025 16:57:23 +0000
Subject: [PATCH 109/851] [X86] Don't emit ENDBR for asm goto branch targets
 (#143439)

Similarly to #141562, which disabled BTI generation for ARM asm goto
branch targets, drop unnecessary ENDBRs from IsInlineAsmBrIndirectTarget
machine basic blocks.
---
 .../Target/X86/X86IndirectBranchTracking.cpp  |  2 +-
 llvm/test/CodeGen/X86/callbr-asm-endbr.ll     | 34 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/X86/callbr-asm-endbr.ll

diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 7740a174af4f3..52be14228e555 100644
--- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -147,7 +147,7 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
   for (auto &MBB : MF) {
     // Find all basic blocks that their address was taken (for example
     // in the case of indirect jump) and add ENDBR instruction.
-    if (MBB.hasAddressTaken())
+    if (MBB.isMachineBlockAddressTaken() || MBB.isIRBlockAddressTaken())
       Changed |= addENDBR(MBB, MBB.begin());
 
     for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
diff --git a/llvm/test/CodeGen/X86/callbr-asm-endbr.ll b/llvm/test/CodeGen/X86/callbr-asm-endbr.ll
new file mode 100644
index 0000000000000..133de89d5f3a1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/callbr-asm-endbr.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define i32 @test1(i32 %a) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    endbr64
+; CHECK-NEXT:    addl $4, %edi
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    jmp .LBB0_2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:  # %bb.1: # %normal
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_2: # Inline asm indirect target
+; CHECK-NEXT:    # %fail
+; CHECK-NEXT:    # Label of block must be emitted
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    retq
+entry:
+  %0 = add i32 %a, 4
+  callbr void asm "xorl $0, $0; jmp ${1:l}", "r,!i,~{dirflag},~{fpsr},~{flags}"(i32 %0) to label %normal [label %fail]
+
+normal:
+  ret i32 0
+
+fail:
+  ret i32 1
+}
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 8, !"cf-protection-branch", i32 1}

From 145b1b0f103e61cfc8a47ed37080e955630a1390 Mon Sep 17 00:00:00 2001
From: Felipe de Azevedo Piovezan <fpiovezan@apple.com>
Date: Wed, 11 Jun 2025 09:57:42 -0700
Subject: [PATCH 110/851] [lldb][nfc] Factor out code checking if Variable is
 in scope (#143572)

This is useful for checking whether a variable is in scope inside a
specific block.
---
 lldb/include/lldb/Symbol/Variable.h |  3 ++
 lldb/source/Symbol/Variable.cpp     | 46 +++++++++++++++--------------
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/lldb/include/lldb/Symbol/Variable.h b/lldb/include/lldb/Symbol/Variable.h
index c437624d1ea6d..5b9c709c8b867 100644
--- a/lldb/include/lldb/Symbol/Variable.h
+++ b/lldb/include/lldb/Symbol/Variable.h
@@ -89,6 +89,9 @@ class Variable : public UserID, public std::enable_shared_from_this<Variable> {
 
   bool IsInScope(StackFrame *frame);
 
+  /// Returns true if this variable is in scope at `addr` inside `block`.
+  bool IsInScope(const Block &block, const Address &addr);
+
   bool LocationIsValidForFrame(StackFrame *frame);
 
   bool LocationIsValidForAddress(const Address &address);
diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp
index 8244725aba545..af32e0e958e51 100644
--- a/lldb/source/Symbol/Variable.cpp
+++ b/lldb/source/Symbol/Variable.cpp
@@ -290,28 +290,9 @@ bool Variable::IsInScope(StackFrame *frame) {
       // this variable was defined in is currently
       Block *deepest_frame_block =
           frame->GetSymbolContext(eSymbolContextBlock).block;
-      if (deepest_frame_block) {
-        SymbolContext variable_sc;
-        CalculateSymbolContext(&variable_sc);
-
-        // Check for static or global variable defined at the compile unit
-        // level that wasn't defined in a block
-        if (variable_sc.block == nullptr)
-          return true;
-
-        // Check if the variable is valid in the current block
-        if (variable_sc.block != deepest_frame_block &&
-            !variable_sc.block->Contains(deepest_frame_block))
-          return false;
-
-        // If no scope range is specified then it means that the scope is the
-        // same as the scope of the enclosing lexical block.
-        if (m_scope_range.IsEmpty())
-          return true;
-
-        addr_t file_address = frame->GetFrameCodeAddress().GetFileAddress();
-        return m_scope_range.FindEntryThatContains(file_address) != nullptr;
-      }
+      Address frame_addr = frame->GetFrameCodeAddress();
+      if (deepest_frame_block)
+        return IsInScope(*deepest_frame_block, frame_addr);
     }
     break;
 
@@ -321,6 +302,27 @@ bool Variable::IsInScope(StackFrame *frame) {
   return false;
 }
 
+bool Variable::IsInScope(const Block &block, const Address &addr) {
+  SymbolContext variable_sc;
+  CalculateSymbolContext(&variable_sc);
+
+  // Check for static or global variable defined at the compile unit
+  // level that wasn't defined in a block
+  if (variable_sc.block == nullptr)
+    return true;
+
+  // Check if the variable is valid in the current block
+  if (variable_sc.block != &block && !variable_sc.block->Contains(&block))
+    return false;
+
+  // If no scope range is specified then it means that the scope is the
+  // same as the scope of the enclosing lexical block.
+  if (m_scope_range.IsEmpty())
+    return true;
+
+  return m_scope_range.FindEntryThatContains(addr.GetFileAddress()) != nullptr;
+}
+
 Status Variable::GetValuesForVariableExpressionPath(
     llvm::StringRef variable_expr_path, ExecutionContextScope *scope,
     GetVariableCallback callback, void *baton, VariableList &variable_list,

From 370e54d03a5bb11f3f283ad5ab479501c74069c7 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Wed, 11 Jun 2025 19:02:36 +0200
Subject: [PATCH 111/851] [CIR] Upstream splat op for VectorType (#139827)

This change adds support for splat op for VectorType

Issue https://github.com/llvm/llvm-project/issues/136487
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  | 32 ++++++++++
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp    |  8 +++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 51 +++++++++++++++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   | 10 +++
 clang/test/CIR/CodeGen/vector-ext.cpp         | 64 +++++++++++++++++++
 clang/test/CIR/CodeGen/vector.cpp             | 63 ++++++++++++++++++
 clang/test/CIR/IR/vector.cir                  | 33 ++++++++++
 7 files changed, 261 insertions(+)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 565c0676773e6..634f0dd554c77 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2277,6 +2277,38 @@ def VecTernaryOp : CIR_Op<"vec.ternary",
   let hasFolder = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// VecSplatOp
+//===----------------------------------------------------------------------===//
+
+def VecSplatOp : CIR_Op<"vec.splat", [Pure,
+  TypesMatchWith<"type of 'value' matches element type of 'result'", "result",
+                 "value", "cast<VectorType>($_self).getElementType()">]> {
+
+  let summary = "Convert a scalar into a vector";
+  let description = [{
+    The `cir.vec.splat` operation creates a vector value from a scalar value.
+    All elements of the vector have the same value, that of the given scalar.
+
+    It's a separate operation from `cir.vec.create` because more
+    efficient LLVM IR can be generated for it, and because some optimization and
+    analysis passes can benefit from knowing that all elements of the vector
+    have the same value.
+
+    ```mlir
+    %value = cir.const #cir.int<3> : !s32i
+    %value_vec = cir.vec.splat %value : !s32i, !cir.vector<4 x !s32i>
+    ```
+  }];
+
+  let arguments = (ins CIR_VectorElementType:$value);
+  let results = (outs CIR_VectorType:$result);
+
+  let assemblyFormat = [{
+    $value `:` type($value) `,` qualified(type($result)) attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // BaseClassAddrOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 481eb492d1875..30d231e2c61de 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -1780,6 +1780,14 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) {
                               cgf.convertType(destTy));
   }
 
+  case CK_VectorSplat: {
+    // Create a vector object and fill all elements with the same scalar value.
+    assert(destTy->isVectorType() && "CK_VectorSplat to non-vector type");
+    return builder.create<cir::VecSplatOp>(
+        cgf.getLoc(subExpr->getSourceRange()), cgf.convertType(destTy),
+        Visit(subExpr));
+  }
+
   default:
     cgf.getCIRGenModule().errorNYI(subExpr->getSourceRange(),
                                    "CastExpr: ", ce->getCastKindName());
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 4fdf8f9ec2695..1642d10d427b5 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1803,6 +1803,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                CIRToLLVMVecExtractOpLowering,
                CIRToLLVMVecInsertOpLowering,
                CIRToLLVMVecCmpOpLowering,
+               CIRToLLVMVecSplatOpLowering,
                CIRToLLVMVecShuffleOpLowering,
                CIRToLLVMVecShuffleDynamicOpLowering,
                CIRToLLVMVecTernaryOpLowering
@@ -1956,6 +1957,56 @@ mlir::LogicalResult CIRToLLVMVecCmpOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMVecSplatOpLowering::matchAndRewrite(
+    cir::VecSplatOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  // Vector splat can be implemented with an `insertelement` and a
+  // `shufflevector`, which is better than an `insertelement` for each
+  // element in the vector. Start with an undef vector. Insert the value into
+  // the first element. Then use a `shufflevector` with a mask of all 0 to
+  // fill out the entire vector with that value.
+  cir::VectorType vecTy = op.getType();
+  mlir::Type llvmTy = typeConverter->convertType(vecTy);
+  mlir::Location loc = op.getLoc();
+  mlir::Value poison = rewriter.create<mlir::LLVM::PoisonOp>(loc, llvmTy);
+
+  mlir::Value elementValue = adaptor.getValue();
+  if (mlir::isa<mlir::LLVM::PoisonOp>(elementValue.getDefiningOp())) {
+    // If the splat value is poison, then we can just use poison value
+    // for the entire vector.
+    rewriter.replaceOp(op, poison);
+    return mlir::success();
+  }
+
+  if (auto constValue =
+          dyn_cast<mlir::LLVM::ConstantOp>(elementValue.getDefiningOp())) {
+    if (auto intAttr = dyn_cast<mlir::IntegerAttr>(constValue.getValue())) {
+      mlir::DenseIntElementsAttr denseVec = mlir::DenseIntElementsAttr::get(
+          mlir::cast<mlir::ShapedType>(llvmTy), intAttr.getValue());
+      rewriter.replaceOpWithNewOp<mlir::LLVM::ConstantOp>(
+          op, denseVec.getType(), denseVec);
+      return mlir::success();
+    }
+
+    if (auto fpAttr = dyn_cast<mlir::FloatAttr>(constValue.getValue())) {
+      mlir::DenseFPElementsAttr denseVec = mlir::DenseFPElementsAttr::get(
+          mlir::cast<mlir::ShapedType>(llvmTy), fpAttr.getValue());
+      rewriter.replaceOpWithNewOp<mlir::LLVM::ConstantOp>(
+          op, denseVec.getType(), denseVec);
+      return mlir::success();
+    }
+  }
+
+  mlir::Value indexValue =
+      rewriter.create<mlir::LLVM::ConstantOp>(loc, rewriter.getI64Type(), 0);
+  mlir::Value oneElement = rewriter.create<mlir::LLVM::InsertElementOp>(
+      loc, poison, elementValue, indexValue);
+  SmallVector<int32_t> zeroValues(vecTy.getSize(), 0);
+  rewriter.replaceOpWithNewOp<mlir::LLVM::ShuffleVectorOp>(op, oneElement,
+                                                           poison, zeroValues);
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMVecShuffleOpLowering::matchAndRewrite(
     cir::VecShuffleOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index 22d8a1e7c22e0..2eda568c84bdb 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -367,6 +367,16 @@ class CIRToLLVMVecCmpOpLowering
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMVecSplatOpLowering
+    : public mlir::OpConversionPattern<cir::VecSplatOp> {
+public:
+  using mlir::OpConversionPattern<cir::VecSplatOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::VecSplatOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 class CIRToLLVMVecShuffleOpLowering
     : public mlir::OpConversionPattern<cir::VecShuffleOp> {
 public:
diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp
index e1814f216f6b9..965c44c9461a8 100644
--- a/clang/test/CIR/CodeGen/vector-ext.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext.cpp
@@ -990,6 +990,7 @@ void foo14() {
 // OGCG: %[[TMP_B:.*]] = load <4 x float>, ptr %[[VEC_B]], align 16
 // OGCG: %[[GE:.*]] = fcmp oge <4 x float> %[[TMP_A]], %[[TMP_B]]
 // OGCG: %[[RES:.*]] = sext <4 x i1> %[[GE]] to <4 x i32>
+// OGCG: store <4 x i32> %[[RES]], ptr {{.*}}, align 16
 
 void foo15() {
   vi4 a;
@@ -1092,6 +1093,69 @@ void foo17() {
 // OGCG: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16
 // OGCG: %[[RES:.*]]= fptoui <2 x double> %[[TMP]] to <2 x i16>
 
+void foo18() {
+  vi4 a = {1, 2, 3, 4};
+  vi4 shl = a << 3;
+
+  uvi4 b = {1u, 2u, 3u, 4u};
+  uvi4 shr = b >> 3u;
+}
+
+// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
+// CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
+// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
+// CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
+// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
+// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
+// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i>
+// CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i
+// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i
+// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i
+// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i
+// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
+// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
+// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i
+// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !u32i, !cir.vector<4 x !u32i>
+// CIR: %[[SHR:.*]] = cir.shift(right, %[[TMP_B]] : !cir.vector<4 x !u32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i>
+// CIR: cir.store{{.*}} %[[SHR]], %[[SHR_RES]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
+
+// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[SHL_RES:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[SHR_RES:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_A]], align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// LLVM: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3)
+// LLVM: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16
+// LLVM: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_B]], align 16
+// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// LLVM: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3)
+// LLVM: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16
+
+// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[SHL_RES:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[SHR_RES:.*]] = alloca <4 x i32>, align 16
+// OGCG: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_A]], align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// OGCG: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3)
+// OGCG: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16
+// OGCG: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_B]], align 16
+// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// OGCG: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3)
+// OGCG: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16
+
 void foo19() {
   vi4 a;
   vi4 b;
diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp
index 4f116faa7a1ac..23e91724dc0f3 100644
--- a/clang/test/CIR/CodeGen/vector.cpp
+++ b/clang/test/CIR/CodeGen/vector.cpp
@@ -1071,6 +1071,69 @@ void foo17() {
 // OGCG: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16
 // OGCG: %[[RES:.*]]= fptoui <2 x double> %[[TMP]] to <2 x i16>
 
+void foo18() {
+  vi4 a = {1, 2, 3, 4};
+  vi4 shl = a << 3;
+
+  uvi4 b = {1u, 2u, 3u, 4u};
+  uvi4 shr = b >> 3u;
+}
+
+// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
+// CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
+// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
+// CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
+// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
+// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
+// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i
+// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i>
+// CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+// CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i
+// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i
+// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i
+// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i
+// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
+// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
+// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i
+// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !u32i, !cir.vector<4 x !u32i>
+// CIR: %[[SHR:.*]] = cir.shift(right, %[[TMP_B]] : !cir.vector<4 x !u32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i>
+// CIR: cir.store{{.*}} %[[SHR]], %[[SHR_RES]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
+
+// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[SHL_RES:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: %[[SHR_RES:.*]] = alloca <4 x i32>, i64 1, align 16
+// LLVM: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_A]], align 16
+// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// LLVM: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3)
+// LLVM: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16
+// LLVM: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_B]], align 16
+// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// LLVM: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3)
+// LLVM: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16
+
+// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[SHL_RES:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16
+// OGCG: %[[SHR_RES:.*]] = alloca <4 x i32>, align 16
+// OGCG: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_A]], align 16
+// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16
+// OGCG: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3)
+// OGCG: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16
+// OGCG: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr %[[VEC_B]], align 16
+// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16
+// OGCG: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3)
+// OGCG: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16
+
 void foo19() {
   vi4 a;
   vi4 b;
diff --git a/clang/test/CIR/IR/vector.cir b/clang/test/CIR/IR/vector.cir
index a455acf92ab6f..f23f5de9692de 100644
--- a/clang/test/CIR/IR/vector.cir
+++ b/clang/test/CIR/IR/vector.cir
@@ -187,4 +187,37 @@ cir.func @vector_shuffle_dynamic_test() {
 // CHECK:    cir.return
 // CHECK: }
 
+cir.func @vector_splat_test() {
+    %0 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
+    %1 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
+    %2 = cir.const #cir.int<1> : !s32i
+    %3 = cir.const #cir.int<2> : !s32i
+    %4 = cir.const #cir.int<3> : !s32i
+    %5 = cir.const #cir.int<4> : !s32i
+    %6 = cir.vec.create(%2, %3, %4, %5 : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+    cir.store %6, %0 : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+    %7 = cir.load %0 : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+    %8 = cir.const #cir.int<3> : !s32i
+    %9 = cir.vec.splat %8 : !s32i, !cir.vector<4 x !s32i>
+    %10 = cir.shift(left, %7 : !cir.vector<4 x !s32i>, %9 : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+    cir.store %10, %1 : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+    cir.return
+}
+
+// CHECK: cir.func @vector_splat_test() {
+// CHECK-NEXT: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
+// CHECK-NEXT: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
+// CHECK-NEXT: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
+// CHECK-NEXT: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
+// CHECK-NEXT: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
+// CHECK-NEXT: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
+// CHECK-NEXT: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CHECK-NEXT: cir.store %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CHECK-NEXT: %[[TMP:.*]] = cir.load %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
+// CHECK-NEXT: %[[SPLAT_VAL:.*]] = cir.const #cir.int<3> : !s32i
+// CHECK-NEXT: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SPLAT_VAL]] : !s32i, !cir.vector<4 x !s32i>
+// CHECK-NEXT: %[[SHL:.*]] = cir.shift(left, %[[TMP]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
+// CHECK-NEXT: cir.store %[[SHL]], %[[SHL_RES:.*]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CHECK-NEXT: cir.return
+
 }

From 621a7d0f66f3da27e687dd7dd832450334ee81da Mon Sep 17 00:00:00 2001
From: jeanPerier <jperier@nvidia.com>
Date: Wed, 11 Jun 2025 19:02:47 +0200
Subject: [PATCH 112/851] [flang] silence bogus error with BIND(C) variable in
 hermetic module (#143737)

The global name semantic check was firing in a bogus way when BIND(C)
variables are in hermetic module.

Do not raise the error if one of the symbol with the conflicting global
name is an "hermetic variant" of the other.
---
 flang/lib/Semantics/check-declarations.cpp | 10 +++++++++
 flang/test/Semantics/modfile76.F90         | 24 ++++++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 flang/test/Semantics/modfile76.F90

diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index 46a5b970fdf0c..f9d64485f1407 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -2958,6 +2958,14 @@ static std::optional<std::string> DefinesGlobalName(const Symbol &symbol) {
   return std::nullopt;
 }
 
+static bool IsSameSymbolFromHermeticModule(
+    const Symbol &symbol, const Symbol &other) {
+  return symbol.name() == other.name() && symbol.owner().IsModule() &&
+      other.owner().IsModule() && symbol.owner() != other.owner() &&
+      symbol.owner().GetName() &&
+      symbol.owner().GetName() == other.owner().GetName();
+}
+
 // 19.2 p2
 void CheckHelper::CheckGlobalName(const Symbol &symbol) {
   if (auto global{DefinesGlobalName(symbol)}) {
@@ -2975,6 +2983,8 @@ void CheckHelper::CheckGlobalName(const Symbol &symbol) {
           (!IsExternalProcedureDefinition(symbol) ||
               !IsExternalProcedureDefinition(other))) {
         // both are procedures/BLOCK DATA, not both definitions
+      } else if (IsSameSymbolFromHermeticModule(symbol, other)) {
+        // Both symbols are the same thing.
       } else if (symbol.has<ModuleDetails>()) {
         Warn(common::LanguageFeature::BenignNameClash, symbol.name(),
             "Module '%s' conflicts with a global name"_port_en_US,
diff --git a/flang/test/Semantics/modfile76.F90 b/flang/test/Semantics/modfile76.F90
new file mode 100644
index 0000000000000..50ee9a088e119
--- /dev/null
+++ b/flang/test/Semantics/modfile76.F90
@@ -0,0 +1,24 @@
+!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s
+!RUN: %flang_fc1 -fsyntax-only %s
+
+! Tests that a BIND(C) variable in a module A captured in a hermetic module
+! file USE'd in a module B is not creating bogus complaints about BIND(C) name
+! conflict when both module A and B are later accessed.
+
+#if STEP == 1
+module modfile75a
+  integer, bind(c) :: x
+end
+
+module modfile75b
+  use modfile75a ! capture hermetically
+end
+
+#else
+subroutine test
+  use modfile75a
+  use modfile75b
+  implicit none
+  print *, x
+end subroutine
+#endif

From 7414d88b5f8af1bdf8da6bf2493b485ba5d079f2 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Wed, 11 Jun 2025 18:13:56 +0100
Subject: [PATCH 113/851] Squelch an unused-function warning

After removing some debug-intrinsic creation code, this function is now
unused (and un-necessary)
---
 llvm/lib/IR/DIBuilder.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 1484c549dd580..c56dd7a1d3820 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -1069,10 +1069,6 @@ static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) {
   return MetadataAsValue::get(VMContext, ValueAsMetadata::get(V));
 }
 
-static Function *getDeclareIntrin(Module &M) {
-  return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare);
-}
-
 DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val,
                                               DILocalVariable *VarInfo,
                                               DIExpression *Expr,

From 3e24dadee0d7ecc5f95fe0760afb7abdeb9a2dc5 Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Wed, 11 Jun 2025 10:24:19 -0700
Subject: [PATCH 114/851] [Clang][Tooling][NFC] Use move to avoid copies of
 large objects (#143603)

Static analysis flagged these cases in which can use std::move and avoid
copies of large objects.
---
 clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
index 44a270d5f7b35..b1495163ccc24 100644
--- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
+++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp
@@ -657,7 +657,7 @@ void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc,
     P1689ModuleInfo RequiredModule;
     RequiredModule.ModuleName = Path[0].getIdentifierInfo()->getName().str();
     RequiredModule.Type = P1689ModuleInfo::ModuleType::NamedCXXModule;
-    MDC.RequiredStdCXXModules.push_back(RequiredModule);
+    MDC.RequiredStdCXXModules.push_back(std::move(RequiredModule));
     return;
   }
 
@@ -920,7 +920,7 @@ void ModuleDepCollectorPP::addAllSubmoduleDeps(
 
 void ModuleDepCollectorPP::addOneModuleDep(const Module *M, const ModuleID ID,
                                            ModuleDeps &MD) {
-  MD.ClangModuleDeps.push_back(ID);
+  MD.ClangModuleDeps.push_back(std::move(ID));
   if (MD.IsInStableDirectories)
     MD.IsInStableDirectories = MDC.ModularDeps[M]->IsInStableDirectories;
 }

From 66f533e7e34d6f6d0e293a67dd54be9e4c240ddd Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 10:39:02 -0700
Subject: [PATCH 115/851] [IR] Fix warnings (#143752)

This patch fixes:

  llvm/lib/IR/DIBuilder.cpp:1072:18: error: unused function
  'getDeclareIntrin' [-Werror,-Wunused-function]

  llvm/include/llvm/IR/DIBuilder.h:51:15: error: private field
  'DeclareFn' is not used [-Werror,-Wunused-private-field]

  llvm/include/llvm/IR/DIBuilder.h:52:15: error: private field
  'ValueFn' is not used [-Werror,-Wunused-private-field]

  llvm/include/llvm/IR/DIBuilder.h:53:15: error: private field
  'LabelFn' is not used [-Werror,-Wunused-private-field]

  llvm/include/llvm/IR/DIBuilder.h:54:15: error: private field
  'AssignFn' is not used [-Werror,-Wunused-private-field]
---
 llvm/include/llvm/IR/DIBuilder.h | 6 +-----
 llvm/lib/IR/DIBuilder.cpp        | 3 +--
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h
index ebfe41dd59afb..43fca571ee6d5 100644
--- a/llvm/include/llvm/IR/DIBuilder.h
+++ b/llvm/include/llvm/IR/DIBuilder.h
@@ -47,11 +47,7 @@ namespace llvm {
     Module &M;
     LLVMContext &VMContext;
 
-    DICompileUnit *CUNode;   ///< The one compile unit created by this DIBuiler.
-    Function *DeclareFn;     ///< llvm.dbg.declare
-    Function *ValueFn;       ///< llvm.dbg.value
-    Function *LabelFn;       ///< llvm.dbg.label
-    Function *AssignFn;      ///< llvm.dbg.assign
+    DICompileUnit *CUNode; ///< The one compile unit created by this DIBuiler.
 
     SmallVector<TrackingMDNodeRef, 4> AllEnumTypes;
     /// Track the RetainTypes, since they can be updated later on.
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index c56dd7a1d3820..fd8c2d7bb5cc3 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -25,8 +25,7 @@ using namespace llvm;
 using namespace llvm::dwarf;
 
 DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU)
-    : M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr),
-      ValueFn(nullptr), LabelFn(nullptr), AssignFn(nullptr),
+    : M(m), VMContext(M.getContext()), CUNode(CU),
       AllowUnresolvedNodes(AllowUnresolvedNodes) {
   if (CUNode) {
     if (const auto &ETs = CUNode->getEnumTypes())

From c2f0af514beb7618660cf8d145fa9e49fb78869c Mon Sep 17 00:00:00 2001
From: Alexander Richardson <alexrichardson@google.com>
Date: Wed, 11 Jun 2025 10:47:17 -0700
Subject: [PATCH 116/851] [GISelValueTracking] Add test case for G_PTRTOINT

While we can only reason about the index/address, the G_PTRTOINT
operations returns all representation bits, so we can't assume the
remaining ones are all zeroes. This behaviour was clarified as part of
the discussion in https://discourse.llvm.org/t/clarifiying-the-semantics-of-ptrtoint/83987/54.
The LangRef semantics of ptrtoint being a full representation bitcast
were documented in https://github.com/llvm/llvm-project/pull/139349.

Prior to 77c8d214131e951e3d3a07b45a7436f54988d6f3 we were incorrectly
assuming known zeroes beyond the index size even if the input was
completely unknown. This commit adds a test case for G_PTRTOINT which
was omitted from that change.

See https://github.com/llvm/llvm-project/issues/139598

Reviewed By: arsenm

Pull Request: https://github.com/llvm/llvm-project/pull/139608
---
 .../AMDGPU/GlobalISel/knownbits-ptrtoint.mir  | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir
new file mode 100644
index 0000000000000..4073568fd4210
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir
@@ -0,0 +1,110 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -passes="print<gisel-value-tracking>" %s -filetype=null 2>&1 | FileCheck %s
+## Check that we don't incorrectly assume known zeroes for and extend of a truncated ptrtoint
+## Test case for https://github.com/llvm/llvm-project/issues/139598
+---
+## We should see 128 unknown bits.
+name:            PtrToInt
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToInt
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s128) = G_PTRTOINT %4(p8)
+...
+---
+## We should see 128 high zeroes followed by 128 unknown bits for extending ptrtoint.
+name:            PtrToIntExt
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToIntExt
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:128
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s256) = G_PTRTOINT %4(p8)
+...
+---
+## We should see 48 unknown bits for truncating ptrtoint.
+name:            PtrToIntTrunc
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToIntTrunc
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????? SignBits:1
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s48) = G_PTRTOINT %4(p8)
+...
+---
+## This is the test for issue 139598: Truncating and then extending the
+## G_PTRTOINT result was filling all bits above the index bitwidth with known
+## zeroes even though the incoming value is completely unknown and G_PTRTOINT.
+## is lowered to a bitwise copy.
+## We should see all zero high bits with 48 unknown bits.
+name:            PtrToIntTruncExplicitExt
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToIntTruncExplicitExt
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %6:_ KnownBits:???????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %7:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????? SignBits:208
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s128) = G_PTRTOINT %4(p8)
+    %6:_(s48) = G_TRUNC %5(s128)
+    %7:_(s256) = G_ZEXT %6(s48)
+...
+---
+## Same test again but this time have the G_PTRTOINT do the truncation.
+## We should see all zero high bits with 48 unknown bits.
+name:            PtrToIntTruncImplicitExt
+body:             |
+  bb.0:
+  ; CHECK-LABEL: name: @PtrToIntTruncImplicitExt
+  ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????? SignBits:1
+  ; CHECK-NEXT: %6:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????? SignBits:208
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32)
+    %5:_(s48) = G_PTRTOINT %4(p8)
+    %6:_(s256) = G_ZEXT %5(s48)
+...

From bbe59e19b60b0efa8cc200fb3260fe572e188b26 Mon Sep 17 00:00:00 2001
From: Kewen12 <Kewen.Meng@amd.com>
Date: Wed, 11 Jun 2025 11:12:54 -0700
Subject: [PATCH 117/851] [OpenMP][Offload] Update the Logic for Configuring
 Auto Zero-Copy (#143638)

Summary:

Currently the Auto Zero-Copy is enabled by checking every initialized
device to ensure that no dGPU is attached to an APU. However, an APU is
designed to comprise a homogeneous set of GPUs, therefore, it should be
sufficient to check any device for configuring Auto Zero-Copy. In this
PR, it checks the first initialized device in the list.

The changes in this PR are to clearly reflect the design and logic of
enabling the feature for further improving the readibility.
---
 offload/libomptarget/PluginManager.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp
index 93589960a426d..c4d99dfa9f10c 100644
--- a/offload/libomptarget/PluginManager.cpp
+++ b/offload/libomptarget/PluginManager.cpp
@@ -286,16 +286,16 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
   }
   PM->RTLsMtx.unlock();
 
-  bool UseAutoZeroCopy = Plugins.size() > 0;
+  bool UseAutoZeroCopy = false;
 
   auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor();
-  for (const auto &Device : *ExclusiveDevicesAccessor)
-    UseAutoZeroCopy &= Device->useAutoZeroCopy();
+  // APUs are homogeneous set of GPUs. Check the first device for
+  // configuring Auto Zero-Copy.
+  if (ExclusiveDevicesAccessor->size() > 0) {
+    auto &Device = *(*ExclusiveDevicesAccessor)[0];
+    UseAutoZeroCopy = Device.useAutoZeroCopy();
+  }
 
-  // Auto Zero-Copy can only be currently triggered when the system is an
-  // homogeneous APU architecture without attached discrete GPUs.
-  // If all devices suggest to use it, change requirement flags to trigger
-  // zero-copy behavior when mapping memory.
   if (UseAutoZeroCopy)
     addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
 

From fad1972d74aead159a5e91b068cbf736e83836b5 Mon Sep 17 00:00:00 2001
From: VISHAKH PRAKASH <vishakh.prakash@multicorewareinc.com>
Date: Wed, 11 Jun 2025 23:43:01 +0530
Subject: [PATCH 118/851] [SPIRV] FIX print the symbolic operand for opcode for
 the operation OpSpecConstantOp (#135756)

Current implementation outputs opcode is an immediate but spirv-tools
requires that the name of the operation without "Op" is needed for the
instruction OpSpecConstantOp
that is if the opcode is OpBitcast the instruction must be
`%1 = OpSpecConstantOp %6 Bitcast %17`
instead of
`%1 = OpBitcast %6 124 %17`

[refer this commit for more
info](https://github.com/KhronosGroup/SPIRV-Tools/commit/0f166be68d4b6624a10d6bf312679505d391ec22)

---------

Co-authored-by: Dmitry Sidorov <dmitry.sidorov@intel.com>
Co-authored-by: Ebin-McW <ebin.jose@multicorewareinc.com>
---
 .../SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp      |  3 +-
 .../Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h |  5 ++
 llvm/lib/Target/SPIRV/SPIRVInstrInfo.td       |  2 +-
 .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 90 +++++++++++++++++++
 llvm/test/CodeGen/SPIRV/const-nested-vecs.ll  |  4 +-
 .../fun-ptr-addrcast.ll                       |  2 +-
 .../opencl/basic/progvar_prog_scope_init.ll   |  2 +-
 .../CodeGen/SPIRV/opt-gepoperator-of-gvar.ll  |  2 +-
 .../pointers/PtrCast-in-OpSpecConstantOp.ll   | 12 +--
 .../CodeGen/SPIRV/pointers/global-ptrtoint.ll |  4 +-
 .../pointers/irtrans-added-int-const-32-64.ll |  2 +-
 11 files changed, 112 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
index 342456757409a..0ed97f5b41c51 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
@@ -68,7 +68,8 @@ getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category,
       Category != SPIRV::OperandCategory::FunctionControlOperand &&
       Category != SPIRV::OperandCategory::MemorySemanticsOperand &&
       Category != SPIRV::OperandCategory::MemoryOperandOperand &&
-      Category != SPIRV::OperandCategory::KernelProfilingInfoOperand)
+      Category != SPIRV::OperandCategory::KernelProfilingInfoOperand &&
+      Category != SPIRV::OperandCategory::SpecConstantOpOperandsOperand)
     return "UNKNOWN";
   // Value that encodes many enum values (one bit per enum value).
   std::string Name;
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
index 083c7f8460bf2..b8c467fef8e8e 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
@@ -222,6 +222,11 @@ namespace CooperativeMatrixOperands {
 #include "SPIRVGenTables.inc"
 } // namespace CooperativeMatrixOperands
 
+namespace SpecConstantOpOperands {
+#define GET_SpecConstantOpOperands_DECL
+#include "SPIRVGenTables.inc"
+} // namespace SpecConstantOpOperands
+
 struct ExtendedBuiltin {
   StringRef Name;
   InstructionSet::InstructionSet Set;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 338f6809a3e46..049ba0275f223 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -245,7 +245,7 @@ def OpSpecConstantComposite: Op<51, (outs ID:$res), (ins TYPE:$type, variable_op
                   "$res = OpSpecConstantComposite $type">;
 def OpSpecConstantCompositeContinuedINTEL: Op<6092, (outs), (ins variable_ops),
                   "OpSpecConstantCompositeContinuedINTEL">;
-def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, i32imm:$c, ID:$o, variable_ops),
+def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, SpecConstantOpOperands:$c, ID:$o, variable_ops),
                   "$res = OpSpecConstantOp $t $c $o">;
 
 // 3.42.8 Memory Instructions
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index ca8a9a9997a8b..f1aae42ea2be0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -172,6 +172,7 @@ def KernelProfilingInfoOperand : OperandCategory;
 def OpcodeOperand : OperandCategory;
 def CooperativeMatrixLayoutOperand : OperandCategory;
 def CooperativeMatrixOperandsOperand : OperandCategory;
+def SpecConstantOpOperandsOperand : OperandCategory;
 def MatrixMultiplyAccumulateOperandsOperand : OperandCategory;
 
 //===----------------------------------------------------------------------===//
@@ -1755,6 +1756,95 @@ defm MatrixAAndBBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x40,
 defm MatrixCBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x80, [SPV_INTEL_joint_matrix], [CooperativeMatrixBFloat16ComponentTypeINTEL]>;
 defm MatrixResultBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x100, [SPV_INTEL_joint_matrix], [CooperativeMatrixBFloat16ComponentTypeINTEL]>;
 
+//===----------------------------------------------------------------------===//
+// Multiclass used to define SpecConstant Operands enum values and at the
+// same time SymbolicOperand.
+//===----------------------------------------------------------------------===//
+
+def SpecConstantOpOperands : GenericEnum, Operand<i32> {
+  let FilterClass = "SpecConstantOpOperands";
+  let NameField = "Name";
+  let ValueField = "Value";
+  let PrintMethod = !strconcat("printSymbolicOperand<OperandCategory::", FilterClass, "Operand>");
+}
+
+class SpecConstantOpOperands<string name, bits<32> value> {
+  string Name = name;
+  bits<32> Value = value;
+}
+
+multiclass SpecConstantOpOperandsOperand<bits<32> value, list<Extension> reqExtensions, list<Capability> reqCapabilities> {
+  def : SpecConstantOpOperands<NAME, value>;
+  defm : SymbolicOperandWithRequirements<SpecConstantOpOperandsOperand, value, NAME, 0, 0, reqExtensions, reqCapabilities>;
+}
+
+// Conversion
+defm SConvert :  SpecConstantOpOperandsOperand<114, [], []>;
+defm FConvert :  SpecConstantOpOperandsOperand<115, [], []>;
+defm ConvertFToS :  SpecConstantOpOperandsOperand<110, [], [Kernel]>;
+defm ConvertSToF :  SpecConstantOpOperandsOperand<111, [], [Kernel]>;
+defm ConvertFToU :  SpecConstantOpOperandsOperand<109, [], [Kernel]>;
+defm ConvertUToF :  SpecConstantOpOperandsOperand<112, [], [Kernel]>;
+defm UConvert :  SpecConstantOpOperandsOperand<113, [], [Kernel]>;
+defm ConvertPtrToU :  SpecConstantOpOperandsOperand<117, [], [Kernel]>;
+defm ConvertUToPtr :  SpecConstantOpOperandsOperand<120, [], [Kernel]>;
+defm GenericCastToPtr :  SpecConstantOpOperandsOperand<122, [], [Kernel]>;
+defm PtrCastToGeneric :  SpecConstantOpOperandsOperand<121, [], [Kernel]>;
+defm Bitcast :  SpecConstantOpOperandsOperand<124, [], []>;
+defm QuantizeToF16 :  SpecConstantOpOperandsOperand<116, [], [Shader]>;
+// Arithmetic 
+defm SNegate :  SpecConstantOpOperandsOperand<126, [], []>;
+defm Not :  SpecConstantOpOperandsOperand<200, [], []>;
+defm IAdd :  SpecConstantOpOperandsOperand<128, [], []>;
+defm ISub :  SpecConstantOpOperandsOperand<130, [], []>;
+defm IMul :  SpecConstantOpOperandsOperand<132, [], []>;
+defm UDiv :  SpecConstantOpOperandsOperand<134, [], []>;
+defm SDiv :  SpecConstantOpOperandsOperand<135, [], []>;
+defm UMod :  SpecConstantOpOperandsOperand<137, [], []>;
+defm SRem :  SpecConstantOpOperandsOperand<138, [], []>;
+defm SMod :  SpecConstantOpOperandsOperand<139, [], []>;
+defm ShiftRightLogical :  SpecConstantOpOperandsOperand<194, [], []>;
+defm ShiftRightArithmetic :  SpecConstantOpOperandsOperand<195, [], []>;
+defm ShiftLeftLogical :  SpecConstantOpOperandsOperand<196, [], []>;
+defm BitwiseOr :  SpecConstantOpOperandsOperand<197, [], []>;
+defm BitwiseAnd :  SpecConstantOpOperandsOperand<199, [], []>;
+defm BitwiseXor :  SpecConstantOpOperandsOperand<198, [], []>;
+defm FNegate :  SpecConstantOpOperandsOperand<127, [], [Kernel]>;
+defm FAdd :  SpecConstantOpOperandsOperand<129, [], [Kernel]>;
+defm FSub :  SpecConstantOpOperandsOperand<131, [], [Kernel]>;
+defm FMul :  SpecConstantOpOperandsOperand<133, [], [Kernel]>;
+defm FDiv :  SpecConstantOpOperandsOperand<136, [], [Kernel]>;
+defm FRem :  SpecConstantOpOperandsOperand<140, [], [Kernel]>;
+defm FMod :  SpecConstantOpOperandsOperand<141, [], [Kernel]>;
+// Composite;
+defm VectorShuffle :  SpecConstantOpOperandsOperand<79, [], []>;
+defm CompositeExtract :  SpecConstantOpOperandsOperand<81, [], []>;
+defm CompositeInsert :  SpecConstantOpOperandsOperand<82, [], []>;
+// Logical;
+defm LogicalOr :  SpecConstantOpOperandsOperand<166, [], []>;
+defm LogicalAnd :  SpecConstantOpOperandsOperand<167, [], []>;
+defm LogicalNot :  SpecConstantOpOperandsOperand<168, [], []>;
+defm LogicalEqual :  SpecConstantOpOperandsOperand<164, [], []>;
+defm LogicalNotEqual :  SpecConstantOpOperandsOperand<165, [], []>;
+defm Select :  SpecConstantOpOperandsOperand<169, [], []>;
+// Comparison;
+defm IEqual :  SpecConstantOpOperandsOperand<170, [], []>;
+defm INotEqual :  SpecConstantOpOperandsOperand<171, [], []>;
+defm ULessThan :  SpecConstantOpOperandsOperand<176, [], []>;
+defm SLessThan :  SpecConstantOpOperandsOperand<177, [], []>;
+defm UGreaterThan :  SpecConstantOpOperandsOperand<172, [], []>;
+defm SGreaterThan :  SpecConstantOpOperandsOperand<173, [], []>;
+defm ULessThanEqual :  SpecConstantOpOperandsOperand<178, [], []>;
+defm SLessThanEqual :  SpecConstantOpOperandsOperand<179, [], []>;
+defm UGreaterThanEqual :  SpecConstantOpOperandsOperand<174, [], []>;
+defm SGreaterThanEqual :  SpecConstantOpOperandsOperand<175, [], []>;
+// Memory
+defm AccessChain :  SpecConstantOpOperandsOperand<65, [], [Kernel]>;
+defm InBoundsAccessChain :  SpecConstantOpOperandsOperand<66, [], [Kernel]>;
+defm PtrAccessChain :  SpecConstantOpOperandsOperand<67, [], [Kernel]>;
+defm InBoundsPtrAccessChain :  SpecConstantOpOperandsOperand<70, [], [Kernel]>;
+defm CooperativeMatrixLengthKHR : SpecConstantOpOperandsOperand<4460, [], []>;
+
 //===----------------------------------------------------------------------===//
 // Multiclass used to define Matrix Multiply Accumulate Operands enum values and at the same time
 // SymbolicOperand entries with string mnemonics and capabilities.
diff --git a/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll b/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll
index 9234106e5fcd1..266b46e65f319 100644
--- a/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll
+++ b/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll
@@ -25,8 +25,8 @@
 ; CHECK-SPIRV-DAG: %[[#IntZero:]] = OpConstantNull %[[#IntTy]]
 ; CHECK-SPIRV-DAG: %[[#LongZero:]] = OpConstantNull %[[#LongTy]]
 ; CHECK-SPIRV64-DAG: %[[#ConstLong2:]] = OpConstant %[[#LongTy]] 2
-; CHECK-SPIRV64-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] 70 %[[#VarV2Char:]] %[[#IntZero]] %[[#ConstLong2]]
-; CHECK-SPIRV32-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] 70 %[[#VarV2Char:]] %[[#IntZero]] %[[#Const2]]
+; CHECK-SPIRV64-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] InBoundsPtrAccessChain %[[#VarV2Char:]] %[[#IntZero]] %[[#ConstLong2]]
+; CHECK-SPIRV32-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] InBoundsPtrAccessChain %[[#VarV2Char:]] %[[#IntZero]] %[[#Const2]]
 ; CHECK-SPIRV-DAG: %[[#PtrPtrCharTy:]] = OpTypePointer CrossWorkgroup %[[#PtrCharTy]]
 ; CHECK-SPIRV-DAG: %[[#AVar]] = OpVariable %[[#PtrArr2V2CharTy]] CrossWorkgroup %[[#Arr2V2Char]]
 ; CHECK-SPIRV-DAG: %[[#PVar]] = OpVariable %[[#PtrPtrCharTy]] CrossWorkgroup %[[#PvarInit]]
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
index 8edecc1329d07..e5736b88b63a3 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s
 ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] 121 %[[#]]
+; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] PtrCastToGeneric %[[#]]
 ; CHECK-COUNT-3: OpPtrCastToGeneric
 
 @G1 = addrspace(1) constant { [3 x ptr addrspace(4)] } { [3 x ptr addrspace(4)] [ptr addrspace(4) null, ptr addrspace(4) addrspacecast (ptr @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr @bar to ptr addrspace(4))] }
diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll
index 9d759a1cf47d0..fbc83c7a1e045 100644
--- a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll
+++ b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll
@@ -10,7 +10,7 @@
 ; CHECK-DAG: %[[#pt2:]] = OpTypePointer CrossWorkgroup %[[#arr2]]
 ; CHECK-DAG: %[[#pt3:]] = OpTypePointer CrossWorkgroup %[[#pt1]]
 ; CHECK-DAG: %[[#a_var]] = OpVariable %[[#pt2]] CrossWorkgroup
-; CHECK-DAG: %[[#const:]] = OpSpecConstantOp %[[#pt1]] 70 %[[#a_var]]
+; CHECK-DAG: %[[#const:]] = OpSpecConstantOp %[[#pt1]] InBoundsPtrAccessChain %[[#a_var]]
 ; CHECK-DAG: %[[#p_var]] = OpVariable %[[#pt3]] CrossWorkgroup %[[#const]]
 @var = addrspace(1) global i8 0, align 1
 @g_var = addrspace(1) global i8 1, align 1
diff --git a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll
index 5f9229f5a5bd6..447dfa701b659 100644
--- a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll
+++ b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll
@@ -14,7 +14,7 @@
 ; CHECK-DAG: %[[#PtrStruct:]] = OpTypePointer CrossWorkgroup %[[#Struct]]
 ; CHECK-DAG: %[[#Var:]] = OpVariable %[[#PtrStruct]] CrossWorkgroup %[[#VarInit]]
 ; CHECK-DAG: %[[#Bytes:]] = OpVariable %[[#PtrChar]] CrossWorkgroup %[[#]]
-; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] 70 %[[#Bytes]] %[[#C648]]
+; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] InBoundsPtrAccessChain %[[#Bytes]] %[[#C648]]
 
 ; CHECK: OpFunction
 ; CHECK: %[[#]] = OpFunctionParameter %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll
index 55d638f80cc55..ca7ca06fbdc8c 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll
@@ -23,20 +23,20 @@
 ; CHECK-DAG: %[[WPtr:.*]] = OpTypePointer Workgroup %[[Int]]
 
 ; CHECK-DAG: %[[F]] = OpVariable %[[CWPtr]] CrossWorkgroup %[[#]]
-; CHECK-DAG: %[[GenF:.*]] = OpSpecConstantOp %[[GenPtrChar]] 121 %[[F]]
+; CHECK-DAG: %[[GenF:.*]] = OpSpecConstantOp %[[GenPtrChar]] PtrCastToGeneric %[[F]]
 ; CHECK-DAG: %[[B]] = OpVariable %[[CWPtr]] CrossWorkgroup %[[#]]
-; CHECK-DAG: %[[GenB:.*]] = OpSpecConstantOp %[[GenPtrChar]] 121 %[[B]]
+; CHECK-DAG: %[[GenB:.*]] = OpSpecConstantOp %[[GenPtrChar]] PtrCastToGeneric %[[B]]
 ; CHECK-DAG: %[[GenFB:.*]] = OpConstantComposite %[[Arr2]] %[[GenF]] %[[GenB]]
 ; CHECK-DAG: %[[GenBF:.*]] = OpConstantComposite %[[Arr2]] %[[GenB]] %[[GenF]]
 ; CHECK-DAG: %[[CG1:.*]] = OpConstantComposite %[[Struct2]] %[[GenFB]]
 ; CHECK-DAG: %[[CG2:.*]] = OpConstantComposite %[[Struct2]] %[[GenBF]]
 
 ; CHECK-DAG: %[[X]] = OpVariable %[[WPtr]] Workgroup %[[#]]
-; CHECK-DAG: %[[GenX:.*]] = OpSpecConstantOp %[[GenPtr]] 121 %[[X]]
-; CHECK-DAG: %[[CWX:.*]] = OpSpecConstantOp %[[CWPtrChar]] 122 %[[GenX]]
+; CHECK-DAG: %[[GenX:.*]] = OpSpecConstantOp %[[GenPtr]] PtrCastToGeneric %[[X]]
+; CHECK-DAG: %[[CWX:.*]] = OpSpecConstantOp %[[CWPtrChar]] GenericCastToPtr %[[GenX]]
 ; CHECK-DAG: %[[Y]] = OpVariable %[[WPtr]] Workgroup %[[#]]
-; CHECK-DAG: %[[GenY:.*]] = OpSpecConstantOp %[[GenPtr]] 121 %[[Y]]
-; CHECK-DAG: %[[CWY:.*]] = OpSpecConstantOp %[[CWPtrChar]] 122 %[[GenY]]
+; CHECK-DAG: %[[GenY:.*]] = OpSpecConstantOp %[[GenPtr]] PtrCastToGeneric %[[Y]]
+; CHECK-DAG: %[[CWY:.*]] = OpSpecConstantOp %[[CWPtrChar]] GenericCastToPtr %[[GenY]]
 ; CHECK-DAG: %[[CWXY:.*]] = OpConstantComposite %[[Arr1]] %[[CWX]] %[[CWY]]
 ; CHECK-DAG: %[[CWYX:.*]] = OpConstantComposite %[[Arr1]] %[[CWY]] %[[CWX]]
 ; CHECK-DAG: %[[CG3:.*]] = OpConstantComposite %[[Struct1]] %[[CWXY]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll
index 16c20f9067e6e..0fd2f622dc840 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll
@@ -11,9 +11,9 @@
 ; CHECK-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyI64]] %[[TyI64]]
 ; CHECK-DAG: %[[Const128:.*]] = OpConstant %[[TyI64]] 128
 ; CHECK-DAG: %[[GlobalValue]] = OpVariable
-; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] 117 %[[GlobalValue]]
+; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] ConvertPtrToU %[[GlobalValue]]
 ; TODO: The following bitcast line looks unneeded and we may expect it to be removed in future
-; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] 124 %[[PtrToInt]]
+; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] Bitcast %[[PtrToInt]]
 ; CHECK-DAG: %[[ConstComposite:.*]] = OpConstantComposite %[[TyStruct]] %[[Const128]] %[[UseGlobalValue]]
 ; CHECK-DAG: %[[TyPtrStruct:.*]] = OpTypePointer CrossWorkgroup %[[TyStruct]]
 ; CHECK: OpVariable %[[TyPtrStruct]] CrossWorkgroup %[[ConstComposite]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll b/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll
index c2738229aa4d7..f5abcd38d0405 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll
@@ -12,7 +12,7 @@
 ; CHECK-SPIRV64-DAG: %[[#IntTy:]] = OpTypeInt 64 0
 ; CHECK-SPIRV32-DAG: %[[#IntTy:]] = OpTypeInt 32 0
 ; CHECK-SPIRV-DAG: %[[#Const2:]] = OpConstant %[[#IntTy]] 2
-; CHECK-SPIRV-DAG: %[[#]] = OpSpecConstantOp %[[#]] 70 %[[#]] %[[#]] %[[#Const2]]
+; CHECK-SPIRV-DAG: %[[#]] = OpSpecConstantOp %[[#]] InBoundsPtrAccessChain %[[#]] %[[#]] %[[#Const2]]
 ; CHECK-SPIRV: OpFunction
 
 @a_var = addrspace(1) global [2 x i8] [i8 1, i8 1]

From 42c82fcc29c1c8e19b2265495a5d8f59fb5ea764 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 20:19:26 +0200
Subject: [PATCH 119/851] [libc++] Upgrade to GCC 15 (#138293)

---
 .github/workflows/libcxx-build-and-test.yaml      |  8 ++++----
 libcxx/docs/index.rst                             |  2 +-
 libcxx/src/experimental/time_zone.cpp             |  9 +++++++++
 .../alg.contains/ranges.contains.pass.cpp         |  4 ++--
 .../equality_comparable.compile.pass.cpp          |  6 ++++++
 .../equality_comparable_with.compile.pass.cpp     | 15 +++++++++++++++
 .../totally_ordered.compile.pass.cpp              |  3 +++
 .../totally_ordered_with.compile.pass.cpp         | 10 ++++++++++
 .../new.delete.array/new.size.except.pass.cpp     |  3 +++
 .../new.delete/new.delete.array/new.size.pass.cpp |  3 +++
 .../new.size_align.except.pass.cpp                |  3 +++
 .../new.delete.array/new.size_align.pass.cpp      |  3 +++
 .../new.delete.single/new.size.except.pass.cpp    |  3 +++
 .../new.delete.single/new.size.pass.cpp           |  3 +++
 .../new.size_align.except.pass.cpp                |  3 +++
 .../new.delete.single/new.size_align.pass.cpp     |  3 +++
 .../rand.dist.samp.discrete/ctor_func.pass.cpp    |  3 +++
 .../param_ctor_func.pass.cpp                      |  3 +++
 .../range.lazy.split/general.pass.cpp             | 12 ++++++++++++
 .../expected.expected/monadic/transform.pass.cpp  |  4 ++--
 .../monadic/transform_error.pass.cpp              |  4 ++--
 .../monadic/transform_error.pass.cpp              |  4 ++--
 .../formatter.char_array.pass.cpp                 |  2 +-
 .../meta/meta.rel/is_virtual_base_of.pass.cpp     |  7 +++++++
 ...le.pass.cpp => dependent_return_type.pass.cpp} |  4 ++++
 .../meta.unary.prop/is_implicit_lifetime.pass.cpp |  2 +-
 .../make_optional_explicit.pass.cpp               |  3 +++
 ...ke_optional_explicit_initializer_list.pass.cpp |  3 +++
 .../tuple.tuple/tuple.cnstr/PR31384.pass.cpp      |  2 +-
 .../catch_member_function_pointer_02.pass.cpp     |  2 +-
 30 files changed, 119 insertions(+), 17 deletions(-)
 rename libcxx/test/std/utilities/meta/meta.unary/{dependent_return_type.compile.pass.cpp => dependent_return_type.pass.cpp} (94%)

diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 80f2432b78dea..f0bdf6c0b5899 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -52,8 +52,8 @@ jobs:
         cxx: [ 'clang++-21' ]
         include:
           - config: 'generic-gcc'
-            cc: 'gcc-14'
-            cxx: 'g++-14'
+            cc: 'gcc-15'
+            cxx: 'g++-15'
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
       - name: ${{ matrix.config }}.${{ matrix.cxx }}
@@ -92,8 +92,8 @@ jobs:
         cxx: [ 'clang++-21' ]
         include:
           - config: 'generic-gcc-cxx11'
-            cc: 'gcc-14'
-            cxx: 'g++-14'
+            cc: 'gcc-15'
+            cxx: 'g++-15'
           - config: 'generic-cxx26'
             cc: 'clang-20'
             cxx: 'clang++-20'
diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst
index 9c957e9d20cb7..ae9cc87c797f8 100644
--- a/libcxx/docs/index.rst
+++ b/libcxx/docs/index.rst
@@ -135,7 +135,7 @@ Compiler     Versions            Restrictions               Support policy
 Clang        19, 20, 21-git                                 latest two stable releases per `LLVM's release page <https://releases.llvm.org>`_ and the development version
 AppleClang   15                                             latest stable release per `Xcode's release page <https://developer.apple.com/documentation/xcode-release-notes>`_
 Open XL      17.1.3 (AIX)                                   latest stable release per `Open XL's documentation page <https://www.ibm.com/docs/en/openxl-c-and-cpp-aix>`_
-GCC          14                  In C++11 or later only     latest stable release per `GCC's release page <https://gcc.gnu.org/releases.html>`_
+GCC          15                  In C++11 or later only     latest stable release per `GCC's release page <https://gcc.gnu.org/releases.html>`_
 ============ =================== ========================== =====================
 
 Libc++ also supports common platforms and architectures:
diff --git a/libcxx/src/experimental/time_zone.cpp b/libcxx/src/experimental/time_zone.cpp
index 289164ab12036..a735800b60317 100644
--- a/libcxx/src/experimental/time_zone.cpp
+++ b/libcxx/src/experimental/time_zone.cpp
@@ -29,6 +29,15 @@
 // These quirks often use a 12h interval; this is the scan interval of zdump,
 // which implies there are no sys_info objects with a duration of less than 12h.
 
+// Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120502
+
+#include <__config>
+
+// TODO(LLVM 23): When upgrading to GCC 16 this can be removed
+#ifdef _LIBCPP_COMPILER_GCC
+#  pragma GCC optimize("-O0")
+#endif
+
 #include <algorithm>
 #include <cctype>
 #include <chrono>
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp
index 08d8e119a4d24..1e89cd272e643 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp
@@ -195,7 +195,7 @@ constexpr bool test() {
       std::string a[] = {str1, str1, str, str1, str1};
       auto whole =
           std::ranges::subrange(forward_iterator(std::move_iterator(a)), forward_iterator(std::move_iterator(a + 5)));
-      bool ret = std::ranges::contains(whole.begin(), whole.end(), "hello world", [&](const std::string i) {
+      bool ret = std::ranges::contains(whole.begin(), whole.end(), +"hello world", [&](const std::string i) {
         ++projection_count;
         return i;
       });
@@ -207,7 +207,7 @@ constexpr bool test() {
       std::string a[] = {str1, str1, str, str1, str1};
       auto whole =
           std::ranges::subrange(forward_iterator(std::move_iterator(a)), forward_iterator(std::move_iterator(a + 5)));
-      bool ret = std::ranges::contains(whole, "hello world", [&](const std::string i) {
+      bool ret = std::ranges::contains(whole, +"hello world", [&](const std::string i) {
         ++projection_count;
         return i;
       });
diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp
index ca0f40eb77d49..0531c0e096a13 100644
--- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp
@@ -26,6 +26,7 @@
 #include <vector>
 
 #include "compare_types.h"
+#include "test_macros.h"
 
 namespace fundamentals {
 static_assert(std::equality_comparable<int>);
@@ -43,7 +44,12 @@ static_assert(std::equality_comparable<unsigned char&&>);
 static_assert(std::equality_comparable<unsigned short const&&>);
 static_assert(std::equality_comparable<unsigned int volatile&&>);
 static_assert(std::equality_comparable<unsigned long const volatile&&>);
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(std::equality_comparable<int[5]>);
+#else
+static_assert(!std::equality_comparable<int[5]>);
+#endif
 static_assert(std::equality_comparable<int (*)(int)>);
 static_assert(std::equality_comparable<int (&)(int)>);
 static_assert(std::equality_comparable<int (*)(int) noexcept>);
diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp
index 0afbe582ba896..2f8d7862c0f4d 100644
--- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp
@@ -107,7 +107,12 @@ static_assert(!check_equality_comparable_with < int,
               int (S::*)() const volatile&& noexcept > ());
 
 static_assert(check_equality_comparable_with<int*, int*>());
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_equality_comparable_with<int*, int[5]>());
+#else
+static_assert(!check_equality_comparable_with<int*, int[5]>());
+#endif
 static_assert(!check_equality_comparable_with<int*, int (*)()>());
 static_assert(!check_equality_comparable_with<int*, int (&)()>());
 static_assert(!check_equality_comparable_with<int*, int (S::*)()>());
@@ -148,7 +153,12 @@ static_assert(
 static_assert(!check_equality_comparable_with < int*,
               int (S::*)() const volatile&& noexcept > ());
 
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_equality_comparable_with<int[5], int[5]>());
+#else
+static_assert(!check_equality_comparable_with<int[5], int[5]>());
+#endif
 static_assert(!check_equality_comparable_with<int[5], int (*)()>());
 static_assert(!check_equality_comparable_with<int[5], int (&)()>());
 static_assert(!check_equality_comparable_with<int[5], int (S::*)()>());
@@ -942,7 +952,12 @@ static_assert(
 
 static_assert(!check_equality_comparable_with<std::nullptr_t, int>());
 static_assert(check_equality_comparable_with<std::nullptr_t, int*>());
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_equality_comparable_with<std::nullptr_t, int[5]>());
+#else
+static_assert(!check_equality_comparable_with<std::nullptr_t, int[5]>());
+#endif
 static_assert(check_equality_comparable_with<std::nullptr_t, int (*)()>());
 static_assert(check_equality_comparable_with<std::nullptr_t, int (&)()>());
 static_assert(check_equality_comparable_with<std::nullptr_t, int (S::*)()>());
diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp
index 6f8324eaf7647..5959f70cf3963 100644
--- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp
@@ -55,7 +55,10 @@ static_assert(models_totally_ordered<unsigned char&&>());
 static_assert(models_totally_ordered<unsigned short const&&>());
 static_assert(models_totally_ordered<unsigned int volatile&&>());
 static_assert(models_totally_ordered<unsigned long const volatile&&>());
+// Array comparisons are ill-formed in C++26
+#if TEST_STD_VER <= 23
 static_assert(models_totally_ordered<int[5]>());
+#endif
 static_assert(models_totally_ordered<int (*)(int)>());
 static_assert(models_totally_ordered<int (&)(int)>());
 static_assert(models_totally_ordered<int (*)(int) noexcept>());
diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp
index dffc33265aebf..398ef445baf9d 100644
--- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp
+++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp
@@ -89,7 +89,12 @@ static_assert(!check_totally_ordered_with<int, int (S::*)() const volatile&&>())
 static_assert(!check_totally_ordered_with < int, int (S::*)() const volatile&& noexcept > ());
 
 static_assert(check_totally_ordered_with<int*, int*>());
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_totally_ordered_with<int*, int[5]>());
+#else
+static_assert(!check_totally_ordered_with<int*, int[5]>());
+#endif
 static_assert(!check_totally_ordered_with<int*, int (*)()>());
 static_assert(!check_totally_ordered_with<int*, int (&)()>());
 static_assert(!check_totally_ordered_with<int*, int (S::*)()>());
@@ -117,7 +122,12 @@ static_assert(!check_totally_ordered_with < int*, int (S::*)() volatile&& noexce
 static_assert(!check_totally_ordered_with<int*, int (S::*)() const volatile&&>());
 static_assert(!check_totally_ordered_with < int*, int (S::*)() const volatile&& noexcept > ());
 
+// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet.
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG)
 static_assert(check_totally_ordered_with<int[5], int[5]>());
+#else
+static_assert(!check_totally_ordered_with<int[5], int[5]>());
+#endif
 static_assert(!check_totally_ordered_with<int[5], int (*)()>());
 static_assert(!check_totally_ordered_with<int[5], int (&)()>());
 static_assert(!check_totally_ordered_with<int[5], int (S::*)()>());
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp
index 6a2b098c1b573..9ee32b8417832 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp
@@ -9,6 +9,9 @@
 // UNSUPPORTED: no-exceptions
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <new>
 #include <cassert>
 #include <limits>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp
index 437d064307735..4fdcc3b535a8d 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp
@@ -11,6 +11,9 @@
 // asan and msan will not call the new handler.
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <new>
 #include <cstddef>
 #include <cassert>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
index 4e34ebcb46c7d..4dfaf7a30d7a2 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp
@@ -9,6 +9,9 @@
 // UNSUPPORTED: no-exceptions
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 // Libc++ when built for z/OS doesn't contain the aligned allocation functions,
 // nor does the dynamic library shipped with z/OS.
 // XFAIL: target={{.+}}-zos{{.*}}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp
index c9b59ecaff396..a1b8466340a2a 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp
@@ -13,6 +13,9 @@
 // asan and msan will not call the new handler.
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 // Libc++ when built for z/OS doesn't contain the aligned allocation functions,
 // nor does the dynamic library shipped with z/OS.
 // XFAIL: target={{.+}}-zos{{.*}}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp
index 6a515555e6dbd..346e881d016be 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp
@@ -9,6 +9,9 @@
 // UNSUPPORTED: no-exceptions
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <new>
 #include <cassert>
 #include <limits>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp
index 729ef3ec46b0c..0013dd3d0cbc3 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp
@@ -11,6 +11,9 @@
 // asan and msan will not call the new handler.
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <new>
 #include <cstddef>
 #include <cassert>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
index 7694314c87bf3..fbeb880c83d8d 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp
@@ -9,6 +9,9 @@
 // UNSUPPORTED: no-exceptions
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 // Libc++ when built for z/OS doesn't contain the aligned allocation functions,
 // nor does the dynamic library shipped with z/OS.
 // XFAIL: target={{.+}}-zos{{.*}}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp
index 5d321f08282b2..59ecbe205513a 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp
@@ -13,6 +13,9 @@
 // asan and msan will not call the new handler.
 // UNSUPPORTED: sanitizer-new-delete
 
+// GCC warns about allocating numeric_limits<size_t>::max() being too large (which we test here)
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 // Libc++ when built for z/OS doesn't contain the aligned allocation functions,
 // nor does the dynamic library shipped with z/OS.
 // XFAIL: target={{.+}}-zos{{.*}}
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp
index c3a88af92d360..c05a9434175a8 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp
@@ -15,6 +15,9 @@
 //     discrete_distribution(size_t nw, double xmin, double xmax,
 //                           UnaryOperation fw);
 
+// There is a bogus diagnostic about a too large allocation
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <random>
 
 #include <cassert>
diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp
index 7ef936b7fc355..206bf5a0eb8a2 100644
--- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp
+++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp
@@ -15,6 +15,9 @@
 //     param_type(size_t nw, double xmin, double xmax,
 //                           UnaryOperation fw);
 
+// There is a bogus diagnostic about a too large allocation
+// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than
+
 #include <random>
 
 #include <cassert>
diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp
index f4e87bb47399e..521c0b1610bce 100644
--- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp
+++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp
@@ -312,7 +312,10 @@ constexpr bool main_test() {
   // Leading separator.
   {
     std::array expected = {""sv, "abc"sv, "def"sv};
+// FIXME: Why does GCC complain here?
+#ifndef TEST_COMPILER_GCC
     test_one(" abc def"sv, short_sep, expected);
+#endif
     test_one("12abc12def"sv, long_sep, expected);
   }
 
@@ -326,7 +329,10 @@ constexpr bool main_test() {
   // Input consisting of a single separator.
   {
     std::array expected = {""sv, ""sv};
+// FIXME: Why does GCC complain here?
+#ifndef TEST_COMPILER_GCC
     test_one(" "sv, short_sep, expected);
+#endif
     test_one("12"sv, long_sep, expected);
   }
 
@@ -354,7 +360,10 @@ constexpr bool main_test() {
   // Separators after every character.
   {
     std::array expected = {""sv, "a"sv, "b"sv, "c"sv, ""sv};
+// FIXME: Why does GCC complain here?
+#ifndef TEST_COMPILER_GCC
     test_one(" a b c "sv, short_sep, expected);
+#endif
     test_one("12a12b12c12"sv, long_sep, expected);
   }
 
@@ -383,7 +392,10 @@ constexpr bool main_test() {
   // Terminating null as a separator.
   {
     std::array expected = {"abc"sv, "def"sv};
+// FIXME: Why does GCC complain here?
+#ifndef TEST_COMPILER_GCC
     test_one("abc\0def"sv, '\0', expected);
+#endif
     test_one("abc\0\0def"sv, "\0\0"sv, expected);
   }
 
diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
index cbd54d623c0f4..97c1e4a40f355 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`,
-// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333
-// XFAIL: gcc-14
+// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995
+// XFAIL: gcc-14, gcc-15
 
 // <expected>
 
diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
index a19e17b01f6a9..9570b2faac692 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`,
-// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333.
-// XFAIL: gcc-14
+// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995.
+// XFAIL: gcc-14, gcc-15
 
 // <expected>
 
diff --git a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp
index f6d3011d1ea96..2ec15b51d11ea 100644
--- a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`,
-// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333
-// XFAIL: gcc-14
+// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995
+// XFAIL: gcc-14, gcc-15
 
 // <expected>
 
diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
index bc056db9e254e..8c4f3000ec1e8 100644
--- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp
@@ -8,7 +8,7 @@
 
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // TODO FMT __builtin_memcpy isn't constexpr in GCC
-// UNSUPPORTED: gcc-14
+// UNSUPPORTED: gcc-14, gcc-15
 
 // <format>
 
diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
index f443d2030961d..47c95c64a0855 100644
--- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp
@@ -18,6 +18,8 @@
 #include <type_traits>
 #include <cassert>
 
+#include "test_macros.h"
+
 template <bool expected, class Base, class Derived>
 void test() {
   // Test the type of the variables
@@ -98,8 +100,13 @@ int main(int, char**) {
 
   // Test with virtual inheritance
   {
+#ifdef TEST_COMPILER_GCC // FIXME: Is this a GCC or Clang bug? Or is the standards wording ambiguous?
+    test<true, Base, Derived3Virtual>();
+    test<true, Derived, Derived3Virtual>();
+#else
     test<false, Base, Derived3Virtual>();
     test<false, Derived, Derived3Virtual>();
+#endif
     test<true, Derived2b, Derived3Virtual>();
     test<true, Derived2a, Derived3Virtual>();
     test<true, Base, DerivedPrivate>();
diff --git a/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp
similarity index 94%
rename from libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp
rename to libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp
index 935a6e3db0017..37d66831c7ce5 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp
@@ -168,3 +168,7 @@ void instantiate() {
   void_t<int>();
 #endif
 }
+
+// This is not a .compile.pass.cpp because we want to ensure that GCC doesn't complain about incorrect builtins usage,
+// which only happens during CodeGen.
+int main(int, char**) { return 0; }
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
index 681ad13a07dfd..afd76e65060e3 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // These compilers don't support __builtin_is_implicit_lifetime yet.
-// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17
+// UNSUPPORTED: clang-18, clang-19, gcc-14, gcc-15, apple-clang-15, apple-clang-16, apple-clang-17
 
 // <type_traits>
 
diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
index e7931e07e31d1..23f131d2fc499 100644
--- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp
@@ -12,6 +12,9 @@
 // template <class T, class... Args>
 //   constexpr optional<T> make_optional(Args&&... args);
 
+// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577
+// XFAIL: gcc-15
+
 #include <optional>
 #include <string>
 #include <memory>
diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp
index 80371d6333712..5ddb229ad9268 100644
--- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp
+++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp
@@ -12,6 +12,9 @@
 // template <class T, class U, class... Args>
 //   constexpr optional<T> make_optional(initializer_list<U> il, Args&&... args);
 
+// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577
+// XFAIL: gcc-15
+
 #include <cassert>
 #include <memory>
 #include <optional>
diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
index e6812e9a3a30a..ae5984c155300 100644
--- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
+++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp
@@ -9,7 +9,7 @@
 // UNSUPPORTED: c++03
 
 // FIXME: Why does this start to fail with GCC 14?
-// XFAIL: !(c++11 || c++14) && gcc-14
+// XFAIL: !(c++11 || c++14) && (gcc-14 || gcc-15)
 
 // See https://llvm.org/PR31384.
 
diff --git a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
index 5d702031ce352..ec400713620c1 100644
--- a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
+++ b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp
@@ -12,7 +12,7 @@
 
 // GCC supports noexcept function types but this test still fails.
 // This is likely a bug in their implementation. Investigation needed.
-// XFAIL: gcc-14
+// XFAIL: gcc-14, gcc-15
 
 #include <cassert>
 

From 806333063ff9a09ca001dcd77d4d5d6f0b9ecd74 Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang@sifive.com>
Date: Thu, 12 Jun 2025 02:24:10 +0800
Subject: [PATCH 120/851] [RISCV] Guard the alternative static chain register
 use on ILP32E/LP64E (#142715)

Asserts the use of t3(x28) as the static chain register when branch control flow protection is enabled with ILP32E/LP64E, because such register is not present within the ABI.
---
 llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 24 ++++++++++++++--------
 llvm/test/CodeGen/RISCV/nest-register.ll   |  3 +++
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
index e0d1fb2facc87..cb6117eb0917b 100644
--- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp
@@ -333,15 +333,23 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT,
   unsigned XLen = Subtarget.getXLen();
   MVT XLenVT = Subtarget.getXLenVT();
 
-  // Static chain parameter must not be passed in normal argument registers,
-  // so we assign t2/t3 for it as done in GCC's __builtin_call_with_static_chain
-  bool HasCFBranch =
-      Subtarget.hasStdExtZicfilp() &&
-      MF.getFunction().getParent()->getModuleFlag("cf-protection-branch");
-  // Normal: t2, Branch control flow protection: t3
-  const auto StaticChainReg = HasCFBranch ? RISCV::X28 : RISCV::X7;
-
   if (ArgFlags.isNest()) {
+    // Static chain parameter must not be passed in normal argument registers,
+    // so we assign t2/t3 for it as done in GCC's
+    // __builtin_call_with_static_chain
+    bool HasCFBranch =
+        Subtarget.hasStdExtZicfilp() &&
+        MF.getFunction().getParent()->getModuleFlag("cf-protection-branch");
+
+    // Normal: t2, Branch control flow protection: t3
+    const auto StaticChainReg = HasCFBranch ? RISCV::X28 : RISCV::X7;
+
+    RISCVABI::ABI ABI = Subtarget.getTargetABI();
+    if (HasCFBranch &&
+        (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E))
+      reportFatalUsageError(
+          "Nested functions with control flow protection are not "
+          "usable with ILP32E or LP64E ABI.");
     if (MCRegister Reg = State.AllocateReg(StaticChainReg)) {
       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
       return false;
diff --git a/llvm/test/CodeGen/RISCV/nest-register.ll b/llvm/test/CodeGen/RISCV/nest-register.ll
index 9f8e4e1a2d8d3..6e892e05c4297 100644
--- a/llvm/test/CodeGen/RISCV/nest-register.ll
+++ b/llvm/test/CodeGen/RISCV/nest-register.ll
@@ -5,6 +5,8 @@
 ; RUN:   | FileCheck -check-prefix=RV64I %s
 ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp -verify-machineinstrs < %s \
 ; RUN:   | FileCheck -check-prefix=RV64I-ZICFILP %s
+; RUN: not llc -mtriple=riscv64 -target-abi=lp64e -mattr=+experimental-zicfilp \
+; RUN:   -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=LP64E-ZICFILP %s
 
 ; Tests that the 'nest' parameter attribute causes the relevant parameter to be
 ; passed in the right register.
@@ -63,6 +65,7 @@ define ptr @nest_caller(ptr %arg) nounwind {
   ret ptr %result
 }
 
+; LP64E-ZICFILP: LLVM ERROR: Nested functions with control flow protection are not usable with ILP32E or LP64E ABI.
 !llvm.module.flags = !{!0}
 
 !0 = !{i32 8, !"cf-protection-branch", i32 1}

From 7a0c9f607a26b77a7e584fd6734f03b7ee40ca95 Mon Sep 17 00:00:00 2001
From: Tony Varghese <tonypalampalliyil@gmail.com>
Date: Wed, 11 Jun 2025 23:56:15 +0530
Subject: [PATCH 121/851] [NFC][PowerPC] Pre-commit test case for exploitation
 of xxeval for the pattern ternary(A,X,or(B,C)) (#143693)

Pre-commit test case for exploitation of `xxeval` for ternary operations
of the pattern `ternary(A,X,or(B,C))`.
Exploitation of `xxeval` to be added later.

Co-authored-by: Tony Varghese <tony.varghese@ibm.com>
---
 .../CodeGen/PowerPC/xxeval-vselect-x-or.ll    | 268 ++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll

diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll
new file mode 100644
index 0000000000000..1ad7e95e3682e
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll
@@ -0,0 +1,268 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Test file to verify the emission of Vector selection instructions when ternary operators are used.
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s
+
+; Function to test ternary(A, and(B, C), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_and_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_and_BC_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxland vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %and = and <4 x i32> %B, %C
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %and, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, and(B, C), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_and_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_and_BC_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxland vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %and = and <2 x i64> %B, %C
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %and, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+; Function to test ternary(A, B, or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_B_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_B_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlor vs0, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs0, v3, v2
+; CHECK-NEXT:    blr
+entry:
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %B, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, B, or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_B_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_B_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlor vs0, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs0, v3, v2
+; CHECK-NEXT:    blr
+entry:
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %B, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+
+; Function to test ternary(A, C, or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_C_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_C_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlor vs0, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs0, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %C, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, C, or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_C_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_C_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlor vs0, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs0, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %C, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+
+; Function to test ternary(A, eqv(B,C), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_eqv_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_eqv_BC_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxleqv vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %xor = xor <4 x i32> %B, %C
+  %eqv = xor <4 x i32> %xor, <i32 -1, i32 -1, i32 -1, i32 -1>  ; Vector eqv operation
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %eqv, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, eqv(B,C), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_eqv_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_eqv_BC_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxleqv vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %xor = xor <2 x i64> %B, %C
+  %eqv = xor <2 x i64> %xor, <i64 -1, i64 -1>  ; Vector eqv operation
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %eqv, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+; Function to test ternary(A, not(C), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_not_C_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_not_C_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlnor vs0, v4, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %not = xor <4 x i32> %C, <i32 -1, i32 -1, i32 -1, i32 -1>  ; Vector not operation
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, not(C), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_not_C_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_not_C_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlnor vs0, v4, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %not = xor <2 x i64> %C, <i64 -1, i64 -1>  ; Vector not operation
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+; Function to test ternary(A, not(B), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_not_B_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_not_B_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlnor vs0, v3, v3
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %not = xor <4 x i32> %B, <i32 -1, i32 -1, i32 -1, i32 -1>  ; Vector not operation
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, not(B), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_not_B_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_not_B_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlnor vs0, v3, v3
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %not = xor <2 x i64> %B, <i64 -1, i64 -1>  ; Vector not operation
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %or
+  ret <2 x i64> %res
+}
+
+; Function to test ternary(A, nand(B,C), or(B, C)) for <4 x i32>
+define <4 x i32> @ternary_A_nand_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: ternary_A_nand_BC_or_BC_4x32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxleqv v5, v5, v5
+; CHECK-NEXT:    xxlnand vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    vslw v2, v2, v5
+; CHECK-NEXT:    vsraw v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %and = and <4 x i32> %B, %C
+  %nand = xor <4 x i32> %and, <i32 -1, i32 -1, i32 -1, i32 -1>  ; Vector nand operation
+  %or = or <4 x i32> %B, %C
+  %res = select <4 x i1> %A, <4 x i32> %nand, <4 x i32> %or
+  ret <4 x i32> %res
+}
+
+; Function to test ternary(A, nand(B,C), or(B, C)) for <2 x i64>
+define <2 x i64> @ternary_A_nand_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: ternary_A_nand_BC_or_BC_2x64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    xxlxor v5, v5, v5
+; CHECK-NEXT:    xxlnand vs0, v3, v4
+; CHECK-NEXT:    xxlor vs1, v3, v4
+; CHECK-NEXT:    xxsplti32dx v5, 1, 63
+; CHECK-NEXT:    vsld v2, v2, v5
+; CHECK-NEXT:    vsrad v2, v2, v5
+; CHECK-NEXT:    xxsel v2, vs1, vs0, v2
+; CHECK-NEXT:    blr
+entry:
+  %and = and <2 x i64> %B, %C
+  %nand = xor <2 x i64> %and, <i64 -1, i64 -1>  ; Vector nand operation
+  %or = or <2 x i64> %B, %C
+  %res = select <2 x i1> %A, <2 x i64> %nand, <2 x i64> %or
+  ret <2 x i64> %res
+}

From 8d7da9a2a40302af25ee70841a4b549f4ed5ee8a Mon Sep 17 00:00:00 2001
From: Yifei Xu <yifei.xu@utexas.edu>
Date: Wed, 11 Jun 2025 13:33:23 -0500
Subject: [PATCH 122/851] Update BUILD.bazel

Add missing dependency after https://github.com/llvm/llvm-project/pull/142916.
---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index c1d63de04b8f0..f6a7cd7dea85b 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6882,6 +6882,7 @@ cc_library(
         ":SPIRVDialect",
         ":Support",
         "//llvm:config",
+        "//llvm:Support",
     ],
 )
 

From 773d357b9882fe0e30ffddee5ac1fbe2254fac05 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 20:39:45 +0200
Subject: [PATCH 123/851] [libc++] Simplify the implementation of __next_prime
 a bit (#143512)

---
 libcxx/src/hash.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp
index 41c4eb480a5fc..50d8cf9f9f539 100644
--- a/libcxx/src/hash.cpp
+++ b/libcxx/src/hash.cpp
@@ -9,7 +9,6 @@
 #include <__hash_table>
 #include <algorithm>
 #include <stdexcept>
-#include <type_traits>
 
 _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wtautological-constant-out-of-range-compare")
 
@@ -52,16 +51,15 @@ const unsigned indices[] = {
 // are fewer potential primes to search, and fewer potential primes to divide
 // against.
 
-template <size_t _Sz = sizeof(size_t)>
-inline _LIBCPP_HIDE_FROM_ABI typename enable_if<_Sz == 4, void>::type __check_for_overflow(size_t N) {
-  if (N > 0xFFFFFFFB)
-    std::__throw_overflow_error("__next_prime overflow");
-}
-
-template <size_t _Sz = sizeof(size_t)>
-inline _LIBCPP_HIDE_FROM_ABI typename enable_if<_Sz == 8, void>::type __check_for_overflow(size_t N) {
-  if (N > 0xFFFFFFFFFFFFFFC5ull)
-    std::__throw_overflow_error("__next_prime overflow");
+inline void __check_for_overflow(size_t N) {
+  if constexpr (sizeof(size_t) == 4) {
+    if (N > 0xFFFFFFFB)
+      std::__throw_overflow_error("__next_prime overflow");
+  } else {
+    static_assert(sizeof(size_t) == 8);
+    if (N > 0xFFFFFFFFFFFFFFC5ull)
+      std::__throw_overflow_error("__next_prime overflow");
+  }
 }
 
 size_t __next_prime(size_t n) {

From 8dc63ca59003a4b72217221c1c801237614c9d7d Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 11 Jun 2025 11:47:09 -0700
Subject: [PATCH 124/851] Make
 clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c write output file
 to temp dir

---
 clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c
index 5d65fdafaa251..d761e12e8392e 100644
--- a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c
+++ b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c
@@ -57,7 +57,7 @@
 // RUN:     | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_VIA_XCLANG
 
 // However, sve2 is actually enabled in clang but disabled for MC.
-// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s \
+// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s -o %t \
 // RUN:     -Xclang -target-feature -Xclang -sve \
 // RUN:     -Xclang -verify -Xclang -verify-ignore-unexpected=note
 

From 0c62571d9f02f7d5c1a649b5b20fdf5b0f6bb41c Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Wed, 11 Jun 2025 20:57:07 +0200
Subject: [PATCH 125/851] [libc++] Remove static_assert from hash.cpp that
 fires unconditionall

---
 libcxx/src/hash.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp
index 50d8cf9f9f539..e1e6d2b4c2bdb 100644
--- a/libcxx/src/hash.cpp
+++ b/libcxx/src/hash.cpp
@@ -56,7 +56,6 @@ inline void __check_for_overflow(size_t N) {
     if (N > 0xFFFFFFFB)
       std::__throw_overflow_error("__next_prime overflow");
   } else {
-    static_assert(sizeof(size_t) == 8);
     if (N > 0xFFFFFFFFFFFFFFC5ull)
       std::__throw_overflow_error("__next_prime overflow");
   }

From 02b6849cf1feb425885bf6f5ee505d5cd4a824d7 Mon Sep 17 00:00:00 2001
From: Abhinav Gaba <abhinav.gaba@intel.com>
Date: Wed, 11 Jun 2025 12:03:55 -0700
Subject: [PATCH 126/851] [Clang][OpenMP] Fix mapping of arrays of structs with
 members with mappers (#142511)

This builds upon #101101 from @jyu2-git, which used compiler-generated
mappers when mapping an array-section of structs with members that have
user-defined default mappers.

Now we do the same when mapping arrays of structs.
---
 clang/docs/ReleaseNotes.rst                   |   3 +
 clang/lib/Sema/SemaOpenMP.cpp                 |  38 ++-
 ...of_structs_with_nested_mapper_ast_dump.cpp |  34 ++
 ..._of_structs_with_nested_mapper_codegen.cpp | 323 ++++++++++++++++++
 ...f_structs_with_nested_mapper_ast_dump.cpp} |   0
 ...of_structs_with_nested_mapper_codegen.cpp} |   0
 ...re_mapper_nested_default_mappers_array.cpp |   6 +-
 7 files changed, 388 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp
 create mode 100644 clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp
 rename clang/test/OpenMP/{target_map_nest_defalut_mapper_ast_dump.cpp => target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp} (100%)
 rename clang/test/OpenMP/{target_map_nest_defalut_mapper_codegen.cpp => target_map_array_section_of_structs_with_nested_mapper_codegen.cpp} (100%)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b5e6cf088a4b1..8043ab48f0b4f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1102,6 +1102,9 @@ OpenMP Support
 - An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have
   an argument larger than what can fit within a 64-bit integer.
 - Added support for private variable reduction.
+- Fixed mapping of arrays of structs containing nested structs with user defined
+  mappers, by using compiler-generated default mappers for the outer structs for
+  such maps.
 
 Improvements
 ^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index a3395ac157d96..2cbe79c5c07ca 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -22057,20 +22057,34 @@ static void checkMappableExpressionList(
         Type.getCanonicalType(), UnresolvedMapper);
     if (ER.isInvalid())
       continue;
-    if (!ER.get() && isa<ArraySectionExpr>(VE)) {
-      // Create implicit mapper as needed.
-      QualType BaseType = VE->getType().getCanonicalType();
-      if (BaseType->isSpecificBuiltinType(BuiltinType::ArraySection)) {
-        const auto *OASE = cast<ArraySectionExpr>(VE->IgnoreParenImpCasts());
-        QualType BType = ArraySectionExpr::getBaseOriginalType(OASE->getBase());
-        QualType ElemType;
-        if (const auto *ATy = BType->getAsArrayTypeUnsafe())
-          ElemType = ATy->getElementType();
-        else
-          ElemType = BType->getPointeeType();
+
+    // If no user-defined mapper is found, we need to create an implicit one for
+    // arrays/array-sections on structs that have members that have
+    // user-defined mappers. This is needed to ensure that the mapper for the
+    // member is invoked when mapping each element of the array/array-section.
+    if (!ER.get()) {
+      QualType BaseType;
+
+      if (isa<ArraySectionExpr>(VE)) {
+        BaseType = VE->getType().getCanonicalType();
+        if (BaseType->isSpecificBuiltinType(BuiltinType::ArraySection)) {
+          const auto *OASE = cast<ArraySectionExpr>(VE->IgnoreParenImpCasts());
+          QualType BType =
+              ArraySectionExpr::getBaseOriginalType(OASE->getBase());
+          QualType ElemType;
+          if (const auto *ATy = BType->getAsArrayTypeUnsafe())
+            ElemType = ATy->getElementType();
+          else
+            ElemType = BType->getPointeeType();
+          BaseType = ElemType.getCanonicalType();
+        }
+      } else if (VE->getType()->isArrayType()) {
+        const ArrayType *AT = VE->getType()->getAsArrayTypeUnsafe();
+        const QualType ElemType = AT->getElementType();
         BaseType = ElemType.getCanonicalType();
       }
-      if (BaseType->getAsRecordDecl() &&
+
+      if (!BaseType.isNull() && BaseType->getAsRecordDecl() &&
           isImplicitMapperNeeded(SemaRef, DSAS, BaseType, VE)) {
         ER = buildImplicitMapper(SemaRef, BaseType, DSAS);
       }
diff --git a/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp
new file mode 100644
index 0000000000000..a5847709d3e76
--- /dev/null
+++ b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp
@@ -0,0 +1,34 @@
+//RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -ast-dump  %s | FileCheck %s --check-prefix=DUM
+
+typedef struct {
+  int a;
+} C;
+#pragma omp declare mapper(C s) map(to : s.a)
+
+typedef struct {
+  int e;
+  C f;
+  int h;
+} D;
+
+void foo() {
+  D sa[10];
+  sa[1].e = 111;
+  sa[1].f.a = 222;
+
+#pragma omp target map(tofrom : sa)
+  {
+    sa[0].e = 333;
+    sa[1].f.a = 444;
+  }
+}
+
+// DUM: -OMPDeclareMapperDecl{{.*}}<<invalid sloc>> <invalid sloc>
+// DUM-NEXT:  |-OMPMapClause {{.*}}<<invalid sloc>> <implicit>
+// DUM-NEXT:  | |-MemberExpr {{.*}}<line:9:3> 'int' lvalue .e
+// DUM-NEXT:  | | `-DeclRefExpr {{.*}}<<invalid sloc>> 'D' lvalue Var {{.*}} '_s' 'D'
+// DUM-NEXT:  | |-MemberExpr {{.*}}<line:10:3> 'C' lvalue .f {{.*}}
+// DUM-NEXT:  | | `-DeclRefExpr {{.*}}<<invalid sloc>> 'D' lvalue Var {{.*}} '_s' 'D'
+// DUM-NEXT:  | `-MemberExpr {{.*}}<line:11:3> 'int' lvalue .h {{.*}}
+// DUM-NEXT:  |   `-DeclRefExpr {{.*}}<<invalid sloc>> 'D' lvalue Var {{.*}} '_s' 'D'
+// DUM-NEXT:  `-VarDecl {{.*}} <line:12:1> col:1 implicit used _s 'D'
diff --git a/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp
new file mode 100644
index 0000000000000..5df1e958ad55a
--- /dev/null
+++ b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp
@@ -0,0 +1,323 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex "\.offload_.*"
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+typedef struct {
+  int a;
+} C;
+#pragma omp declare mapper(C s) map(to : s.a)
+
+typedef struct {
+  int e;
+  C f;
+  int h;
+} D;
+
+void foo() {
+  D sa[10];
+  sa[1].e = 111;
+  sa[1].f.a = 222;
+
+#pragma omp target map(tofrom : sa)
+  {
+    sa[1].e = 333;
+    sa[1].f.a = 444;
+  }
+}
+#endif
+//.
+// CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 120]
+// CHECK: @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35]
+//.
+// CHECK-LABEL: define {{[^@]+}}@_Z3foov
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SA:%.*]] = alloca [10 x %struct.D], align 4
+// CHECK-NEXT:    [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT:    [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT:    [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 1
+// CHECK-NEXT:    [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D:%.*]], ptr [[ARRAYIDX]], i32 0, i32 0
+// CHECK-NEXT:    store i32 111, ptr [[E]], align 4
+// CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 1
+// CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1
+// CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0
+// CHECK-NEXT:    store i32 222, ptr [[A]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT:    store ptr [[SA]], ptr [[TMP0]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT:    store ptr [[SA]], ptr [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0
+// CHECK-NEXT:    store ptr @.omp_mapper._ZTS1D.default, ptr [[TMP2]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// CHECK-NEXT:    store i32 3, ptr [[TMP5]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// CHECK-NEXT:    store i32 1, ptr [[TMP6]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// CHECK-NEXT:    store ptr [[TMP3]], ptr [[TMP7]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// CHECK-NEXT:    store ptr [[TMP4]], ptr [[TMP8]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4
+// CHECK-NEXT:    store ptr @.offload_sizes, ptr [[TMP9]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5
+// CHECK-NEXT:    store ptr @.offload_maptypes, ptr [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6
+// CHECK-NEXT:    store ptr null, ptr [[TMP11]], align 8
+// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7
+// CHECK-NEXT:    store ptr [[DOTOFFLOAD_MAPPERS]], ptr [[TMP12]], align 8
+// CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8
+// CHECK-NEXT:    store i64 0, ptr [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9
+// CHECK-NEXT:    store i64 0, ptr [[TMP14]], align 8
+// CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10
+// CHECK-NEXT:    store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11
+// CHECK-NEXT:    store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12
+// CHECK-NEXT:    store i32 0, ptr [[TMP17]], align 4
+// CHECK-NEXT:    [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26.region_id, ptr [[KERNEL_ARGS]])
+// CHECK-NEXT:    [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0
+// CHECK-NEXT:    br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
+// CHECK:       omp_offload.failed:
+// CHECK-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26(ptr [[SA]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    br label [[OMP_OFFLOAD_CONT]]
+// CHECK:       omp_offload.cont:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(120) [[SA:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SA_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[SA]], ptr [[SA_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[SA_ADDR]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]]
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[TMP0]], i64 0, i64 1
+// CHECK-NEXT:    [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D:%.*]], ptr [[ARRAYIDX]], i32 0, i32 0
+// CHECK-NEXT:    store i32 333, ptr [[E]], align 4
+// CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[TMP0]], i64 0, i64 1
+// CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1
+// CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0
+// CHECK-NEXT:    store i32 444, ptr [[A]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1D.default
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP6:%.*]] = udiv exact i64 [[TMP3]], 12
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr [[STRUCT_D:%.*]], ptr [[TMP2]], i64 [[TMP6]]
+// CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = and i64 [[TMP4]], 8
+// CHECK-NEXT:    [[TMP9:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP10:%.*]] = and i64 [[TMP4]], 16
+// CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
+// CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP9]], [[TMP11]]
+// CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP12]]
+// CHECK-NEXT:    [[DOTOMP_ARRAY__INIT__DELETE:%.*]] = icmp eq i64 [[TMP8]], 0
+// CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[TMP13]], [[DOTOMP_ARRAY__INIT__DELETE]]
+// CHECK-NEXT:    br i1 [[TMP14]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]]
+// CHECK:       .omp.array..init:
+// CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP6]], 12
+// CHECK-NEXT:    [[TMP16:%.*]] = and i64 [[TMP4]], -4
+// CHECK-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], 512
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP15]], i64 [[TMP17]], ptr [[TMP5]])
+// CHECK-NEXT:    br label [[OMP_ARRAYMAP_HEAD]]
+// CHECK:       omp.arraymap.head:
+// CHECK-NEXT:    [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]]
+// CHECK:       omp.arraymap.body:
+// CHECK-NEXT:    [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END20:%.*]] ]
+// CHECK-NEXT:    [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0
+// CHECK-NEXT:    [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 1
+// CHECK-NEXT:    [[H:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 2
+// CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[H]], i32 1
+// CHECK-NEXT:    [[TMP19:%.*]] = ptrtoint ptr [[TMP18]] to i64
+// CHECK-NEXT:    [[TMP20:%.*]] = ptrtoint ptr [[E]] to i64
+// CHECK-NEXT:    [[TMP21:%.*]] = sub i64 [[TMP19]], [[TMP20]]
+// CHECK-NEXT:    [[TMP22:%.*]] = sdiv exact i64 [[TMP21]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// CHECK-NEXT:    [[TMP23:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]])
+// CHECK-NEXT:    [[TMP24:%.*]] = shl i64 [[TMP23]], 48
+// CHECK-NEXT:    [[TMP25:%.*]] = add nuw i64 0, [[TMP24]]
+// CHECK-NEXT:    [[TMP26:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0
+// CHECK-NEXT:    br i1 [[TMP27]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]]
+// CHECK:       omp.type.alloc:
+// CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP25]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END:%.*]]
+// CHECK:       omp.type.alloc.else:
+// CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[TMP26]], 1
+// CHECK-NEXT:    br i1 [[TMP29]], label [[OMP_TYPE_TO:%.*]], label [[OMP_TYPE_TO_ELSE:%.*]]
+// CHECK:       omp.type.to:
+// CHECK-NEXT:    [[TMP30:%.*]] = and i64 [[TMP25]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.to.else:
+// CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[TMP26]], 2
+// CHECK-NEXT:    br i1 [[TMP31]], label [[OMP_TYPE_FROM:%.*]], label [[OMP_TYPE_END]]
+// CHECK:       omp.type.from:
+// CHECK-NEXT:    [[TMP32:%.*]] = and i64 [[TMP25]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.end:
+// CHECK-NEXT:    [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP28]], [[OMP_TYPE_ALLOC]] ], [ [[TMP30]], [[OMP_TYPE_TO]] ], [ [[TMP32]], [[OMP_TYPE_FROM]] ], [ [[TMP25]], [[OMP_TYPE_TO_ELSE]] ]
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 [[TMP22]], i64 [[OMP_MAPTYPE]], ptr null)
+// CHECK-NEXT:    [[TMP33:%.*]] = add nuw i64 281474976711171, [[TMP24]]
+// CHECK-NEXT:    [[TMP34:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP34]], 0
+// CHECK-NEXT:    br i1 [[TMP35]], label [[OMP_TYPE_ALLOC1:%.*]], label [[OMP_TYPE_ALLOC_ELSE2:%.*]]
+// CHECK:       omp.type.alloc1:
+// CHECK-NEXT:    [[TMP36:%.*]] = and i64 [[TMP33]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END6:%.*]]
+// CHECK:       omp.type.alloc.else2:
+// CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i64 [[TMP34]], 1
+// CHECK-NEXT:    br i1 [[TMP37]], label [[OMP_TYPE_TO3:%.*]], label [[OMP_TYPE_TO_ELSE4:%.*]]
+// CHECK:       omp.type.to3:
+// CHECK-NEXT:    [[TMP38:%.*]] = and i64 [[TMP33]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END6]]
+// CHECK:       omp.type.to.else4:
+// CHECK-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[TMP34]], 2
+// CHECK-NEXT:    br i1 [[TMP39]], label [[OMP_TYPE_FROM5:%.*]], label [[OMP_TYPE_END6]]
+// CHECK:       omp.type.from5:
+// CHECK-NEXT:    [[TMP40:%.*]] = and i64 [[TMP33]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END6]]
+// CHECK:       omp.type.end6:
+// CHECK-NEXT:    [[OMP_MAPTYPE7:%.*]] = phi i64 [ [[TMP36]], [[OMP_TYPE_ALLOC1]] ], [ [[TMP38]], [[OMP_TYPE_TO3]] ], [ [[TMP40]], [[OMP_TYPE_FROM5]] ], [ [[TMP33]], [[OMP_TYPE_TO_ELSE4]] ]
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 4, i64 [[OMP_MAPTYPE7]], ptr null)
+// CHECK-NEXT:    [[TMP41:%.*]] = add nuw i64 281474976711171, [[TMP24]]
+// CHECK-NEXT:    [[TMP42:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP43:%.*]] = icmp eq i64 [[TMP42]], 0
+// CHECK-NEXT:    br i1 [[TMP43]], label [[OMP_TYPE_ALLOC8:%.*]], label [[OMP_TYPE_ALLOC_ELSE9:%.*]]
+// CHECK:       omp.type.alloc8:
+// CHECK-NEXT:    [[TMP44:%.*]] = and i64 [[TMP41]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END13:%.*]]
+// CHECK:       omp.type.alloc.else9:
+// CHECK-NEXT:    [[TMP45:%.*]] = icmp eq i64 [[TMP42]], 1
+// CHECK-NEXT:    br i1 [[TMP45]], label [[OMP_TYPE_TO10:%.*]], label [[OMP_TYPE_TO_ELSE11:%.*]]
+// CHECK:       omp.type.to10:
+// CHECK-NEXT:    [[TMP46:%.*]] = and i64 [[TMP41]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END13]]
+// CHECK:       omp.type.to.else11:
+// CHECK-NEXT:    [[TMP47:%.*]] = icmp eq i64 [[TMP42]], 2
+// CHECK-NEXT:    br i1 [[TMP47]], label [[OMP_TYPE_FROM12:%.*]], label [[OMP_TYPE_END13]]
+// CHECK:       omp.type.from12:
+// CHECK-NEXT:    [[TMP48:%.*]] = and i64 [[TMP41]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END13]]
+// CHECK:       omp.type.end13:
+// CHECK-NEXT:    [[OMP_MAPTYPE14:%.*]] = phi i64 [ [[TMP44]], [[OMP_TYPE_ALLOC8]] ], [ [[TMP46]], [[OMP_TYPE_TO10]] ], [ [[TMP48]], [[OMP_TYPE_FROM12]] ], [ [[TMP41]], [[OMP_TYPE_TO_ELSE11]] ]
+// CHECK-NEXT:    call void @.omp_mapper._ZTS1C.default(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[F]], i64 4, i64 [[OMP_MAPTYPE14]], ptr null) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP49:%.*]] = add nuw i64 281474976711171, [[TMP24]]
+// CHECK-NEXT:    [[TMP50:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[TMP50]], 0
+// CHECK-NEXT:    br i1 [[TMP51]], label [[OMP_TYPE_ALLOC15:%.*]], label [[OMP_TYPE_ALLOC_ELSE16:%.*]]
+// CHECK:       omp.type.alloc15:
+// CHECK-NEXT:    [[TMP52:%.*]] = and i64 [[TMP49]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END20]]
+// CHECK:       omp.type.alloc.else16:
+// CHECK-NEXT:    [[TMP53:%.*]] = icmp eq i64 [[TMP50]], 1
+// CHECK-NEXT:    br i1 [[TMP53]], label [[OMP_TYPE_TO17:%.*]], label [[OMP_TYPE_TO_ELSE18:%.*]]
+// CHECK:       omp.type.to17:
+// CHECK-NEXT:    [[TMP54:%.*]] = and i64 [[TMP49]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END20]]
+// CHECK:       omp.type.to.else18:
+// CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[TMP50]], 2
+// CHECK-NEXT:    br i1 [[TMP55]], label [[OMP_TYPE_FROM19:%.*]], label [[OMP_TYPE_END20]]
+// CHECK:       omp.type.from19:
+// CHECK-NEXT:    [[TMP56:%.*]] = and i64 [[TMP49]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END20]]
+// CHECK:       omp.type.end20:
+// CHECK-NEXT:    [[OMP_MAPTYPE21:%.*]] = phi i64 [ [[TMP52]], [[OMP_TYPE_ALLOC15]] ], [ [[TMP54]], [[OMP_TYPE_TO17]] ], [ [[TMP56]], [[OMP_TYPE_FROM19]] ], [ [[TMP49]], [[OMP_TYPE_TO_ELSE18]] ]
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[H]], i64 4, i64 [[OMP_MAPTYPE21]], ptr null)
+// CHECK-NEXT:    [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1
+// CHECK-NEXT:    [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]]
+// CHECK:       omp.arraymap.exit:
+// CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY22:%.*]] = icmp sgt i64 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP57:%.*]] = and i64 [[TMP4]], 8
+// CHECK-NEXT:    [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP57]], 0
+// CHECK-NEXT:    [[TMP58:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY22]], [[DOTOMP_ARRAY__DEL__DELETE]]
+// CHECK-NEXT:    br i1 [[TMP58]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]]
+// CHECK:       .omp.array..del:
+// CHECK-NEXT:    [[TMP59:%.*]] = mul nuw i64 [[TMP6]], 12
+// CHECK-NEXT:    [[TMP60:%.*]] = and i64 [[TMP4]], -4
+// CHECK-NEXT:    [[TMP61:%.*]] = or i64 [[TMP60]], 512
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP59]], i64 [[TMP61]], ptr [[TMP5]])
+// CHECK-NEXT:    br label [[OMP_DONE]]
+// CHECK:       omp.done:
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1C.default
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP6:%.*]] = udiv exact i64 [[TMP3]], 4
+// CHECK-NEXT:    [[TMP7:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[TMP2]], i64 [[TMP6]]
+// CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP8:%.*]] = and i64 [[TMP4]], 8
+// CHECK-NEXT:    [[TMP9:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]]
+// CHECK-NEXT:    [[TMP10:%.*]] = and i64 [[TMP4]], 16
+// CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
+// CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP9]], [[TMP11]]
+// CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP12]]
+// CHECK-NEXT:    [[DOTOMP_ARRAY__INIT__DELETE:%.*]] = icmp eq i64 [[TMP8]], 0
+// CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[TMP13]], [[DOTOMP_ARRAY__INIT__DELETE]]
+// CHECK-NEXT:    br i1 [[TMP14]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]]
+// CHECK:       .omp.array..init:
+// CHECK-NEXT:    [[TMP15:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:    [[TMP16:%.*]] = and i64 [[TMP4]], -4
+// CHECK-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], 512
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP15]], i64 [[TMP17]], ptr [[TMP5]])
+// CHECK-NEXT:    br label [[OMP_ARRAYMAP_HEAD]]
+// CHECK:       omp.arraymap.head:
+// CHECK-NEXT:    [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]]
+// CHECK:       omp.arraymap.body:
+// CHECK-NEXT:    [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END:%.*]] ]
+// CHECK-NEXT:    [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP18:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]])
+// CHECK-NEXT:    [[TMP19:%.*]] = shl i64 [[TMP18]], 48
+// CHECK-NEXT:    [[TMP20:%.*]] = add nuw i64 1, [[TMP19]]
+// CHECK-NEXT:    [[TMP21:%.*]] = and i64 [[TMP4]], 3
+// CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 0
+// CHECK-NEXT:    br i1 [[TMP22]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]]
+// CHECK:       omp.type.alloc:
+// CHECK-NEXT:    [[TMP23:%.*]] = and i64 [[TMP20]], -4
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.alloc.else:
+// CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[TMP21]], 1
+// CHECK-NEXT:    br i1 [[TMP24]], label [[OMP_TYPE_TO:%.*]], label [[OMP_TYPE_TO_ELSE:%.*]]
+// CHECK:       omp.type.to:
+// CHECK-NEXT:    [[TMP25:%.*]] = and i64 [[TMP20]], -3
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.to.else:
+// CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[TMP21]], 2
+// CHECK-NEXT:    br i1 [[TMP26]], label [[OMP_TYPE_FROM:%.*]], label [[OMP_TYPE_END]]
+// CHECK:       omp.type.from:
+// CHECK-NEXT:    [[TMP27:%.*]] = and i64 [[TMP20]], -2
+// CHECK-NEXT:    br label [[OMP_TYPE_END]]
+// CHECK:       omp.type.end:
+// CHECK-NEXT:    [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP23]], [[OMP_TYPE_ALLOC]] ], [ [[TMP25]], [[OMP_TYPE_TO]] ], [ [[TMP27]], [[OMP_TYPE_FROM]] ], [ [[TMP20]], [[OMP_TYPE_TO_ELSE]] ]
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[A]], i64 4, i64 [[OMP_MAPTYPE]], ptr null)
+// CHECK-NEXT:    [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1
+// CHECK-NEXT:    [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]]
+// CHECK:       omp.arraymap.exit:
+// CHECK-NEXT:    [[OMP_ARRAYINIT_ISARRAY1:%.*]] = icmp sgt i64 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP28:%.*]] = and i64 [[TMP4]], 8
+// CHECK-NEXT:    [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP28]], 0
+// CHECK-NEXT:    [[TMP29:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY1]], [[DOTOMP_ARRAY__DEL__DELETE]]
+// CHECK-NEXT:    br i1 [[TMP29]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]]
+// CHECK:       .omp.array..del:
+// CHECK-NEXT:    [[TMP30:%.*]] = mul nuw i64 [[TMP6]], 4
+// CHECK-NEXT:    [[TMP31:%.*]] = and i64 [[TMP4]], -4
+// CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP31]], 512
+// CHECK-NEXT:    call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP30]], i64 [[TMP32]], ptr [[TMP5]])
+// CHECK-NEXT:    br label [[OMP_DONE]]
+// CHECK:       omp.done:
+// CHECK-NEXT:    ret void
+//
diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_ast_dump.cpp b/clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp
similarity index 100%
rename from clang/test/OpenMP/target_map_nest_defalut_mapper_ast_dump.cpp
rename to clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp
diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp b/clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp
similarity index 100%
rename from clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp
rename to clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp
diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp
index d545e98ef6c3e..93695d1b388ff 100644
--- a/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp
+++ b/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp
@@ -4,8 +4,6 @@
 // RUN: %libomptarget-compilexx-run-and-check-x86_64-unknown-linux-gnu
 // RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda
 
-// UNSUPPORTED: clang
-
 #include <cstdio>
 #include <cstdlib>
 
@@ -50,7 +48,7 @@ int main() {
   sa[1].h = N;
 
   printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1],
-         sa[1].f.b == &x[0] ? 1 : 0);
+         sa[1].f.b == &y[0] ? 1 : 0);
   // CHECK: 111 222 777 20.00000 1
 
   __intptr_t p = reinterpret_cast<__intptr_t>(&y[0]);
@@ -65,6 +63,6 @@ int main() {
     sa[1].f.b[1] = 40;
   }
   printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1],
-         sa[1].f.b == &x[0] ? 1 : 0);
+         sa[1].f.b == &y[0] ? 1 : 0);
   // CHECK: 333 222 777 40.00000 1
 }

From 574f77a1ee34461bc1f4a0823da6c960ff1c9655 Mon Sep 17 00:00:00 2001
From: Erich Keane <ekeane@nvidia.com>
Date: Wed, 11 Jun 2025 12:04:26 -0700
Subject: [PATCH 127/851] [OpenACC][CIR] Add parallelism determ. to all
 acc.loops (#143751)

PR #143720 adds a requirement to the ACC dialect that every acc.loop
must have a seq, independent, or auto attribute for the 'default'
device_type. The standard has rules for how this can be intuited:

orphan/parallel/parallel loop: independent
kernels/kernels loop: auto
serial/serial loop: seq, unless there is a gang/worker/vector, at which
point it should be 'auto'.

This patch implements all of this rule as a 'cleanup' step on the IR
generation for combined/loop operations. Note that the test impact is
much less since I inadvertently have my 'operation' terminating curley
matching the end curley from 'attribute' instead of the front of the
line, so I've added sufficient tests to ensure I captured the above.
---
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  12 +++
 clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp   |   2 +
 .../lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp |  33 ++++++
 clang/test/CIR/CodeGenOpenACC/combined.cpp    |  69 ++++++++++--
 clang/test/CIR/CodeGenOpenACC/loop.cpp        | 101 ++++++++++++++++--
 .../mlir/Dialect/OpenACC/OpenACCOps.td        |   8 ++
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       |  24 +++++
 7 files changed, 232 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index b08dd540e6289..682d59d63faa8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -34,6 +34,12 @@ namespace {
 class ScalarExprEmitter;
 } // namespace
 
+namespace mlir {
+namespace acc {
+class LoopOp;
+} // namespace acc
+} // namespace mlir
+
 namespace clang::CIRGen {
 
 class CIRGenFunction : public CIRGenTypeCache {
@@ -1082,6 +1088,12 @@ class CIRGenFunction : public CIRGenTypeCache {
                           OpenACCDirectiveKind dirKind, SourceLocation dirLoc,
                           ArrayRef<const OpenACCClause *> clauses);
 
+  // The OpenACC LoopOp requires that we have auto, seq, or independent on all
+  // LoopOp operations for the 'none' device type case. This function checks if
+  // the LoopOp has one, else it updates it to have one.
+  void updateLoopOpParallelism(mlir::acc::LoopOp &op, bool isOrphan,
+                               OpenACCDirectiveKind dk);
+
 public:
   mlir::LogicalResult
   emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s);
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index 2aab9cecf93d8..1feefa55eb270 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -102,6 +102,8 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct(
 
     emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses);
 
+    updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind);
+
     builder.create<TermOp>(end);
   }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp
index 24cd1d399de65..71f3ccb8e040e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp
@@ -22,6 +22,36 @@ using namespace clang::CIRGen;
 using namespace cir;
 using namespace mlir::acc;
 
+void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op,
+                                             bool isOrphan,
+                                             OpenACCDirectiveKind dk) {
+  // Check that at least one of auto, independent, or seq is present
+  // for the device-independent default clauses.
+  if (op.hasParallelismFlag(mlir::acc::DeviceType::None))
+    return;
+
+  switch (dk) {
+  default:
+    llvm_unreachable("Invalid parent directive kind");
+  case OpenACCDirectiveKind::Invalid:
+  case OpenACCDirectiveKind::Parallel:
+  case OpenACCDirectiveKind::ParallelLoop:
+    op.addIndependent(builder.getContext(), {});
+    return;
+  case OpenACCDirectiveKind::Kernels:
+  case OpenACCDirectiveKind::KernelsLoop:
+    op.addAuto(builder.getContext(), {});
+    return;
+  case OpenACCDirectiveKind::Serial:
+  case OpenACCDirectiveKind::SerialLoop:
+    if (op.hasDefaultGangWorkerVector())
+      op.addAuto(builder.getContext(), {});
+    else
+      op.addSeq(builder.getContext(), {});
+    return;
+  };
+}
+
 mlir::LogicalResult
 CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
   mlir::Location start = getLoc(s.getSourceRange().getBegin());
@@ -90,6 +120,9 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
   emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(),
                      s.clauses());
 
+  updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(),
+                          s.getParentComputeConstructKind());
+
   mlir::LogicalResult stmtRes = mlir::success();
   // Emit body.
   {
diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp
index 1f3c9f1a8d3fa..5b83a9cb91898 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp
@@ -74,7 +74,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.serial combined(loop) {
   // CHECK: acc.loop combined(serial) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 #pragma acc kernels loop seq device_type(nvidia, radeon)
@@ -99,7 +99,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.serial combined(loop) {
   // CHECK: acc.loop combined(serial) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 #pragma acc kernels loop auto device_type(nvidia, radeon)
@@ -124,7 +124,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.serial combined(loop) {
   // CHECK: acc.loop combined(serial) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 #pragma acc kernels loop independent device_type(nvidia, radeon)
@@ -143,7 +143,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.parallel combined(loop) {
   // CHECK: acc.loop combined(parallel) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
+  // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 
@@ -154,7 +154,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.serial combined(loop) {
   // CHECK: acc.loop combined(serial) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]}
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 
@@ -165,7 +165,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.kernels combined(loop) {
   // CHECK: acc.loop combined(kernels) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>], collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
   // CHECK: acc.terminator
   // CHECK-NEXT: } loc
   #pragma acc parallel loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
@@ -175,7 +175,7 @@ extern "C" void acc_combined(int N, int cond) {
   // CHECK: acc.parallel combined(loop) {
   // CHECK: acc.loop combined(parallel) {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 
@@ -1184,4 +1184,59 @@ extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) {
   // CHECK-NEXT: } loc
   // CHECK-NEXT: acc.detach accPtr(%[[ATTACH2]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg2"}
   // CHECK-NEXT: acc.detach accPtr(%[[ATTACH1]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg1"}
+
+  // Checking the automatic-addition of parallelism clauses.
+#pragma acc parallel loop
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.parallel combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(parallel) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels loop
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.kernels combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(kernels) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc serial loop
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.serial combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(serial) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial loop worker
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.serial combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(serial) worker {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial loop vector
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.serial combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(serial) vector {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial loop gang
+    for(unsigned I = 0; I < 5; ++I);
+  // CHECK-NEXT: acc.serial combined(loop) {
+  // CHECK-NEXT:  acc.loop combined(serial) gang {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp
index db94e2819b301..c0bf11e353951 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp
@@ -41,12 +41,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
 #pragma acc loop device_type(radeon) seq
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {seq = [#acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<radeon>]} loc
 #pragma acc loop seq device_type(nvidia, radeon)
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
@@ -67,12 +67,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
 #pragma acc loop device_type(radeon) independent
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>, #acc.device_type<none>]} loc
 #pragma acc loop independent device_type(nvidia, radeon)
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
@@ -93,12 +93,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
 #pragma acc loop device_type(radeon) auto
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>]} loc
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
 #pragma acc loop auto device_type(nvidia, radeon)
   for(unsigned I = 0; I < N; ++I);
   // CHECK: acc.loop {
@@ -116,7 +116,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
       for(unsigned K = 0; K < N; ++K);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
+  // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
 
   #pragma acc loop collapse(1) device_type(radeon) collapse (2)
   for(unsigned I = 0; I < N; ++I)
@@ -124,7 +124,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
       for(unsigned K = 0; K < N; ++K);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]}
 
   #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse (2)
   for(unsigned I = 0; I < N; ++I)
@@ -132,14 +132,14 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
       for(unsigned K = 0; K < N; ++K);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>], independent = [#acc.device_type<none>]}
   #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
   for(unsigned I = 0; I < N; ++I)
     for(unsigned J = 0; J < N; ++J)
       for(unsigned K = 0; K < N; ++K);
   // CHECK: acc.loop {
   // CHECK: acc.yield
-  // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
+  // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}
 
   #pragma acc loop tile(1, 2, 3)
   for(unsigned I = 0; I < N; ++I)
@@ -392,4 +392,85 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
   }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+  // Checking the automatic-addition of parallelism clauses.
+#pragma acc loop
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
+
+#pragma acc parallel
+  {
+    // CHECK-NEXT: acc.parallel {
+#pragma acc loop
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels
+  {
+    // CHECK-NEXT: acc.kernels {
+#pragma acc loop
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc serial
+  {
+    // CHECK-NEXT: acc.serial {
+#pragma acc loop
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial
+  {
+    // CHECK-NEXT: acc.serial {
+#pragma acc loop worker
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop worker {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial
+  {
+    // CHECK-NEXT: acc.serial {
+#pragma acc loop vector
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop vector {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial
+  {
+    // CHECK-NEXT: acc.serial {
+#pragma acc loop gang
+    for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT:  acc.loop gang {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
+  }
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
 }
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 083a18d80704e..34312655115a1 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2246,6 +2246,14 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     // device_types. This is for the case where there is no expression specified
     // in a 'gang'.
     void addEmptyGang(MLIRContext *, llvm::ArrayRef<DeviceType>);
+
+    // Return whether this LoopOp has an auto, seq, or independent for the
+    // specified device-type.
+    bool hasParallelismFlag(DeviceType);
+
+    // Return whether this LoopOp has a gang, worker, or vector applying to the
+    // 'default'/None device-type.
+    bool hasDefaultGangWorkerVector();
   }];
 
   let hasCustomAssemblyFormat = 1;
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index c72ec47be9f04..21e6b9d85f1a1 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2839,6 +2839,30 @@ void acc::LoopOp::addEmptyGang(
                                                  effectiveDeviceTypes));
 }
 
+bool acc::LoopOp::hasParallelismFlag(DeviceType dt) {
+  auto hasDevice = [=](DeviceTypeAttr attr) -> bool {
+    return attr.getValue() == dt;
+  };
+  auto testFromArr = [=](ArrayAttr arr) -> bool {
+    return llvm::any_of(arr.getAsRange<DeviceTypeAttr>(), hasDevice);
+  };
+
+  if (ArrayAttr arr = getSeqAttr(); arr && testFromArr(arr))
+    return true;
+  if (ArrayAttr arr = getIndependentAttr(); arr && testFromArr(arr))
+    return true;
+  if (ArrayAttr arr = getAuto_Attr(); arr && testFromArr(arr))
+    return true;
+
+  return false;
+}
+
+bool acc::LoopOp::hasDefaultGangWorkerVector() {
+  return hasVector() || getVectorValue() || hasWorker() || getWorkerValue() ||
+         hasGang() || getGangValue(GangArgType::Num) ||
+         getGangValue(GangArgType::Dim) || getGangValue(GangArgType::Static);
+}
+
 void acc::LoopOp::addGangOperands(
     MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes,
     llvm::ArrayRef<GangArgType> argTypes, mlir::ValueRange values) {

From d5f68cb145059fc6d2944e1d17ef561e183ade83 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 11 Jun 2025 12:09:44 -0700
Subject: [PATCH 128/851] [bazel] Port fe7bf4b90b1a835418bddd2b2aa63b4977a9f6d2

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index f6a7cd7dea85b..7bcb1d4ca883c 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6881,8 +6881,8 @@ cc_library(
     deps = [
         ":SPIRVDialect",
         ":Support",
-        "//llvm:config",
         "//llvm:Support",
+        "//llvm:config",
     ],
 )
 
@@ -11249,7 +11249,7 @@ td_library(
 )
 
 gentbl_cc_library(
-    name = "TransformDialectEnumsIncGen",
+    name = "TransformAttrsIncGen",
     tbl_outs = {
         "include/mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc": [
             "-gen-enum-decls",
@@ -11257,6 +11257,12 @@ gentbl_cc_library(
         "include/mlir/Dialect/Transform/IR/TransformDialectEnums.cpp.inc": [
             "-gen-enum-defs",
         ],
+        "include/mlir/Dialect/Transform/IR/TransformAttrs.h.inc": [
+            "-gen-attrdef-decls",
+        ],
+        "include/mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc": [
+            "-gen-attrdef-defs",
+        ],
     },
     tblgen = ":mlir-tblgen",
     td_file = "include/mlir/Dialect/Transform/IR/TransformAttrs.td",
@@ -11382,7 +11388,7 @@ cc_library(
         ":Rewrite",
         ":SideEffectInterfaces",
         ":Support",
-        ":TransformDialectEnumsIncGen",
+        ":TransformAttrsIncGen",
         ":TransformDialectIncGen",
         ":TransformDialectInterfaces",
         ":TransformDialectUtils",

From 5dafe9dca867b90f20dcd71c620ad823aee4262b Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 11 Jun 2025 12:23:17 -0700
Subject: [PATCH 129/851] [libc] Reduce direct use of errno in src/stdlib and
 src/__support tests. (#143767)

* Get rid of libc_errno assignments in str_to_* __support tests, since
those API have been migrated to return error in a struct instead.
* Migrate tests for atof and to strto* functions from <stdlib.h> and for
strdup from <string.h> to use ErrnoCheckingTest harness.
---
 libc/test/src/__support/CMakeLists.txt        |  2 -
 .../test/src/__support/str_to_double_test.cpp |  1 -
 libc/test/src/__support/str_to_float_test.cpp |  1 -
 libc/test/src/__support/str_to_fp_test.h      |  2 -
 .../src/__support/str_to_integer_test.cpp     |  1 -
 libc/test/src/stdlib/CMakeLists.txt           |  5 ++
 libc/test/src/stdlib/StrtolTest.h             | 60 +------------------
 libc/test/src/stdlib/atof_test.cpp            |  9 ++-
 libc/test/src/stdlib/strtod_test.cpp          |  5 +-
 libc/test/src/stdlib/strtof_test.cpp          |  5 +-
 libc/test/src/stdlib/strtold_test.cpp         |  5 +-
 libc/test/src/string/CMakeLists.txt           |  1 +
 libc/test/src/string/strdup_test.cpp          | 13 ++--
 13 files changed, 24 insertions(+), 86 deletions(-)

diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index c1736c8fe59e2..4fb0dae86e5ca 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -127,7 +127,6 @@ add_libc_test(
     libc.src.__support.integer_literals
     libc.src.__support.str_to_float
     libc.src.__support.uint128
-    libc.src.errno.errno
 )
 
 
@@ -140,7 +139,6 @@ add_libc_test(
   DEPENDS
     libc.src.__support.integer_literals
     libc.src.__support.str_to_integer
-    libc.src.errno.errno
 )
 
 add_libc_test(
diff --git a/libc/test/src/__support/str_to_double_test.cpp b/libc/test/src/__support/str_to_double_test.cpp
index ccfa44f12d8ef..dc503aa16f08c 100644
--- a/libc/test/src/__support/str_to_double_test.cpp
+++ b/libc/test/src/__support/str_to_double_test.cpp
@@ -99,7 +99,6 @@ TEST(LlvmLibcStrToDblTest, SimpleDecimalConversionExtraTypes) {
   uint64_t double_output_mantissa = 0;
   uint32_t output_exp2 = 0;
 
-  LIBC_NAMESPACE::libc_errno = 0;
   auto double_result =
       internal::simple_decimal_conversion<double>("123456789012345678900");
 
diff --git a/libc/test/src/__support/str_to_float_test.cpp b/libc/test/src/__support/str_to_float_test.cpp
index 66f7db742eb45..03ae80fc2ee38 100644
--- a/libc/test/src/__support/str_to_float_test.cpp
+++ b/libc/test/src/__support/str_to_float_test.cpp
@@ -55,7 +55,6 @@ TEST(LlvmLibcStrToFltTest, SimpleDecimalConversionExtraTypes) {
   uint32_t float_output_mantissa = 0;
   uint32_t output_exp2 = 0;
 
-  LIBC_NAMESPACE::libc_errno = 0;
   auto float_result =
       internal::simple_decimal_conversion<float>("123456789012345678900");
   float_output_mantissa = float_result.num.mantissa;
diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h
index c7bc57b845fe0..d349192f107c0 100644
--- a/libc/test/src/__support/str_to_fp_test.h
+++ b/libc/test/src/__support/str_to_fp_test.h
@@ -10,7 +10,6 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
 #include "src/__support/uint128.h"
-#include "src/errno/libc_errno.h"
 
 #include "test/UnitTest/Test.h"
 
@@ -67,7 +66,6 @@ template <typename T> struct LlvmLibcStrToFloatTest : public testing::Test {
                                       const int expectedErrno = 0) {
     StorageType actual_output_mantissa = 0;
     uint32_t actual_output_exp2 = 0;
-    LIBC_NAMESPACE::libc_errno = 0;
 
     auto result = internal::simple_decimal_conversion<T>(numStart);
 
diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp
index 34b645b4b38c8..1ec882b212b8a 100644
--- a/libc/test/src/__support/str_to_integer_test.cpp
+++ b/libc/test/src/__support/str_to_integer_test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 302971a078c17..45fd49b6d3526 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -9,6 +9,7 @@ add_libc_test(
   DEPENDS
     libc.src.errno.errno
     libc.src.stdlib.atof
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_header_library(
@@ -64,6 +65,7 @@ add_fp_unittest(
     libc.src.errno.errno
     libc.src.stdlib.strtod
     libc.src.__support.FPUtil.fenv_impl
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_fp_unittest(
@@ -76,6 +78,7 @@ add_fp_unittest(
     libc.src.errno.errno
     libc.src.stdlib.strtof
     libc.src.__support.FPUtil.fenv_impl
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_header_library(
@@ -86,6 +89,7 @@ add_header_library(
     libc.src.__support.CPP.limits
     libc.src.__support.CPP.type_traits
     libc.src.errno.errno
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -133,6 +137,7 @@ add_libc_test(
     libc.src.errno.errno
     libc.src.__support.uint128
     libc.src.stdlib.strtold
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index ed302f14d03ef..03f0a6539c785 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -10,7 +10,7 @@
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
 #include "src/__support/macros/properties/architectures.h"
-#include "src/errno/libc_errno.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include <stddef.h>
@@ -18,7 +18,7 @@
 using LIBC_NAMESPACE::cpp::is_signed_v;
 
 template <typename ReturnT>
-struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
+struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
   using FunctionT = ReturnT (*)(const char *, char **, int);
 
   static constexpr ReturnT T_MAX =
@@ -28,7 +28,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
 
   void InvalidBase(FunctionT func) {
     const char *ten = "10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(ten, nullptr, -1), ReturnT(0));
     ASSERT_ERRNO_EQ(EINVAL);
   }
@@ -38,23 +37,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
 
     // TODO: Look into collapsing these repeated segments.
     const char *ten = "10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(ten, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - ten, ptrdiff_t(2));
 
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(ten, nullptr, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
 
     const char *hundred = "100";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(hundred, &str_end, 10), ReturnT(100));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - hundred, ptrdiff_t(3));
 
     const char *big_number = "1234567890";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(big_number, &str_end, 10), ReturnT(1234567890));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - big_number, ptrdiff_t(10));
@@ -62,7 +57,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // This number is larger than 2^32, meaning that if long is only 32 bits
     // wide, strtol will return LONG_MAX.
     const char *bigger_number = "12345678900";
-    LIBC_NAMESPACE::libc_errno = 0;
     if constexpr (sizeof(ReturnT) < 8) {
       ASSERT_EQ(func(bigger_number, &str_end, 10), T_MAX);
       ASSERT_ERRNO_EQ(ERANGE);
@@ -73,14 +67,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     EXPECT_EQ(str_end - bigger_number, ptrdiff_t(11));
 
     const char *too_big_number = "123456789012345678901";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(too_big_number, &str_end, 10), T_MAX);
     ASSERT_ERRNO_EQ(ERANGE);
     EXPECT_EQ(str_end - too_big_number, ptrdiff_t(21));
 
     const char *long_number_range_test =
         "10000000000000000000000000000000000000000000000000";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(long_number_range_test, &str_end, 10), T_MAX);
     ASSERT_ERRNO_EQ(ERANGE);
     EXPECT_EQ(str_end - long_number_range_test, ptrdiff_t(50));
@@ -88,19 +80,16 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // For most negative numbers, the unsigned functions treat it the same as
     // casting a negative variable to an unsigned type.
     const char *negative = "-100";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(negative, &str_end, 10), ReturnT(-100));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - negative, ptrdiff_t(4));
 
     const char *big_negative_number = "-1234567890";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(big_negative_number, &str_end, 10), ReturnT(-1234567890));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - big_negative_number, ptrdiff_t(11));
 
     const char *too_big_negative_number = "-123456789012345678901";
-    LIBC_NAMESPACE::libc_errno = 0;
     // If the number is signed, it should return the smallest negative number
     // for the current type, but if it's unsigned it should max out and return
     // the largest positive number for the current type. From the standard:
@@ -118,73 +107,61 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     char *str_end = nullptr;
 
     const char *spaces_before = "     10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(spaces_before, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - spaces_before, ptrdiff_t(7));
 
     const char *spaces_after = "10      ";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(spaces_after, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - spaces_after, ptrdiff_t(2));
 
     const char *word_before = "word10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(word_before, &str_end, 10), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - word_before, ptrdiff_t(0));
 
     const char *word_after = "10word";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(word_after, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - word_after, ptrdiff_t(2));
 
     const char *two_numbers = "10 999";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(two_numbers, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - two_numbers, ptrdiff_t(2));
 
     const char *two_signs = "--10 999";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(two_signs, &str_end, 10), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - two_signs, ptrdiff_t(0));
 
     const char *sign_before = "+2=4";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(sign_before, &str_end, 10), ReturnT(2));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - sign_before, ptrdiff_t(2));
 
     const char *sign_after = "2+2=4";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(sign_after, &str_end, 10), ReturnT(2));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - sign_after, ptrdiff_t(1));
 
     const char *tab_before = "\t10";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(tab_before, &str_end, 10), ReturnT(10));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - tab_before, ptrdiff_t(3));
 
     const char *all_together = "\t  -12345and+67890";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(all_together, &str_end, 10), ReturnT(-12345));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - all_together, ptrdiff_t(9));
 
     const char *just_spaces = "  ";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_spaces, &str_end, 10), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_spaces, ptrdiff_t(0));
 
     const char *just_space_and_sign = " +";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_space_and_sign, &str_end, 10), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_space_and_sign, ptrdiff_t(0));
@@ -203,12 +180,10 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
         small_string[0] = static_cast<char>(
             LIBC_NAMESPACE::internal::int_to_b36_char(first_digit));
         if (first_digit < base) {
-          LIBC_NAMESPACE::libc_errno = 0;
           ASSERT_EQ(func(small_string, nullptr, base),
                     static_cast<ReturnT>(first_digit));
           ASSERT_ERRNO_SUCCESS();
         } else {
-          LIBC_NAMESPACE::libc_errno = 0;
           ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0));
           ASSERT_ERRNO_SUCCESS();
         }
@@ -223,18 +198,15 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
           small_string[1] = static_cast<char>(
               LIBC_NAMESPACE::internal::int_to_b36_char(second_digit));
           if (first_digit < base && second_digit < base) {
-            LIBC_NAMESPACE::libc_errno = 0;
             ASSERT_EQ(
                 func(small_string, nullptr, base),
                 static_cast<ReturnT>(second_digit + (first_digit * base)));
             ASSERT_ERRNO_SUCCESS();
           } else if (first_digit < base) {
-            LIBC_NAMESPACE::libc_errno = 0;
             ASSERT_EQ(func(small_string, nullptr, base),
                       static_cast<ReturnT>(first_digit));
             ASSERT_ERRNO_SUCCESS();
           } else {
-            LIBC_NAMESPACE::libc_errno = 0;
             ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0));
             ASSERT_ERRNO_SUCCESS();
           }
@@ -255,14 +227,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
 
             if (first_digit < base && second_digit < base &&
                 third_digit < base) {
-              LIBC_NAMESPACE::libc_errno = 0;
               ASSERT_EQ(func(small_string, nullptr, base),
                         static_cast<ReturnT>(third_digit +
                                              (second_digit * base) +
                                              (first_digit * base * base)));
               ASSERT_ERRNO_SUCCESS();
             } else if (first_digit < base && second_digit < base) {
-              LIBC_NAMESPACE::libc_errno = 0;
               ASSERT_EQ(
                   func(small_string, nullptr, base),
                   static_cast<ReturnT>(second_digit + (first_digit * base)));
@@ -272,23 +242,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
               // The number is treated as a one digit hexadecimal.
               if (base == 16 && first_digit == 0 && second_digit == 33) {
                 if (third_digit < base) {
-                  LIBC_NAMESPACE::libc_errno = 0;
                   ASSERT_EQ(func(small_string, nullptr, base),
                             static_cast<ReturnT>(third_digit));
                   ASSERT_ERRNO_SUCCESS();
                 } else {
-                  LIBC_NAMESPACE::libc_errno = 0;
                   ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0));
                   ASSERT_ERRNO_SUCCESS();
                 }
               } else {
-                LIBC_NAMESPACE::libc_errno = 0;
                 ASSERT_EQ(func(small_string, nullptr, base),
                           static_cast<ReturnT>(first_digit));
                 ASSERT_ERRNO_SUCCESS();
               }
             } else {
-              LIBC_NAMESPACE::libc_errno = 0;
               ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0));
               ASSERT_ERRNO_SUCCESS();
             }
@@ -302,19 +268,16 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     char *str_end = nullptr;
 
     const char *no_prefix = "123abc";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(no_prefix, &str_end, 16), ReturnT(0x123abc));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - no_prefix, ptrdiff_t(6));
 
     const char *yes_prefix = "0x456def";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(yes_prefix, &str_end, 16), ReturnT(0x456def));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - yes_prefix, ptrdiff_t(8));
 
     const char *letter_after_prefix = "0xabc123";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(letter_after_prefix, &str_end, 16), ReturnT(0xabc123));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - letter_after_prefix, ptrdiff_t(8));
@@ -325,7 +288,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // Max size for unsigned 32 bit numbers
 
     const char *max_32_bit_value = "0xFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(max_32_bit_value, &str_end, 0),
               ((is_signed_v<ReturnT> && sizeof(ReturnT) == 4)
                    ? T_MAX
@@ -334,7 +296,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     EXPECT_EQ(str_end - max_32_bit_value, ptrdiff_t(10));
 
     const char *negative_max_32_bit_value = "-0xFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(negative_max_32_bit_value, &str_end, 0),
               ((is_signed_v<ReturnT> && sizeof(ReturnT) == 4)
                    ? T_MIN
@@ -345,13 +306,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // Max size for signed 32 bit numbers
 
     const char *max_31_bit_value = "0x7FFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(max_31_bit_value, &str_end, 0), ReturnT(0x7FFFFFFF));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - max_31_bit_value, ptrdiff_t(10));
 
     const char *negative_max_31_bit_value = "-0x7FFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(negative_max_31_bit_value, &str_end, 0),
               -ReturnT(0x7FFFFFFF));
     ASSERT_ERRNO_SUCCESS();
@@ -360,7 +319,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // Max size for unsigned 64 bit numbers
 
     const char *max_64_bit_value = "0xFFFFFFFFFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(max_64_bit_value, &str_end, 0),
               (is_signed_v<ReturnT> || sizeof(ReturnT) < 8
                    ? T_MAX
@@ -371,7 +329,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // See the end of CleanBase10Decode for an explanation of how this large
     // negative number can end up as T_MAX.
     const char *negative_max_64_bit_value = "-0xFFFFFFFFFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(
         func(negative_max_64_bit_value, &str_end, 0),
         (is_signed_v<ReturnT>
@@ -383,14 +340,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     // Max size for signed 64 bit numbers
 
     const char *max_63_bit_value = "0x7FFFFFFFFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(max_63_bit_value, &str_end, 0),
               (sizeof(ReturnT) < 8 ? T_MAX : ReturnT(0x7FFFFFFFFFFFFFFF)));
     ASSERT_ERRNO_EQ(sizeof(ReturnT) < 8 ? ERANGE : 0);
     EXPECT_EQ(str_end - max_63_bit_value, ptrdiff_t(18));
 
     const char *negative_max_63_bit_value = "-0x7FFFFFFFFFFFFFFF";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(negative_max_63_bit_value, &str_end, 0),
               (sizeof(ReturnT) >= 8 ? -ReturnT(0x7FFFFFFFFFFFFFFF)
                                     : (is_signed_v<ReturnT> ? T_MIN : T_MAX)));
@@ -402,23 +357,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     char *str_end = nullptr;
 
     const char *just_prefix = "0x";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_prefix, &str_end, 16), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_prefix, ptrdiff_t(1));
 
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_prefix, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_prefix, ptrdiff_t(1));
 
     const char *prefix_with_x_after = "0xx";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(prefix_with_x_after, &str_end, 16), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - prefix_with_x_after, ptrdiff_t(1));
 
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(prefix_with_x_after, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - prefix_with_x_after, ptrdiff_t(1));
@@ -428,43 +379,36 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test {
     char *str_end = nullptr;
 
     const char *base_ten = "12345";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(base_ten, &str_end, 0), ReturnT(12345));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - base_ten, ptrdiff_t(5));
 
     const char *base_sixteen_no_prefix = "123abc";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(base_sixteen_no_prefix, &str_end, 0), ReturnT(123));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - base_sixteen_no_prefix, ptrdiff_t(3));
 
     const char *base_sixteen_with_prefix = "0x456def";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(base_sixteen_with_prefix, &str_end, 0), ReturnT(0x456def));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - base_sixteen_with_prefix, ptrdiff_t(8));
 
     const char *base_eight_with_prefix = "012345";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(base_eight_with_prefix, &str_end, 0), ReturnT(012345));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - base_eight_with_prefix, ptrdiff_t(6));
 
     const char *just_zero = "0";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_zero, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_zero, ptrdiff_t(1));
 
     const char *just_zero_x = "0x";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_zero_x, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_zero_x, ptrdiff_t(1));
 
     const char *just_zero_eight = "08";
-    LIBC_NAMESPACE::libc_errno = 0;
     ASSERT_EQ(func(just_zero_eight, &str_end, 0), ReturnT(0));
     ASSERT_ERRNO_SUCCESS();
     EXPECT_EQ(str_end - just_zero_eight, ptrdiff_t(1));
diff --git a/libc/test/src/stdlib/atof_test.cpp b/libc/test/src/stdlib/atof_test.cpp
index 1e4259b792d7e..92b904ecad94e 100644
--- a/libc/test/src/stdlib/atof_test.cpp
+++ b/libc/test/src/stdlib/atof_test.cpp
@@ -7,29 +7,28 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
 #include "src/stdlib/atof.h"
 
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <stddef.h>
 
+using LlvmLibcAToFTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
 // This is just a simple test to make sure that this function works at all. It's
 // functionally identical to strtod so the bulk of the testing is there.
-TEST(LlvmLibcAToFTest, SimpleTest) {
+TEST_F(LlvmLibcAToFTest, SimpleTest) {
   LIBC_NAMESPACE::fputil::FPBits<double> expected_fp =
       LIBC_NAMESPACE::fputil::FPBits<double>(uint64_t(0x405ec00000000000));
 
-  LIBC_NAMESPACE::libc_errno = 0;
   EXPECT_THAT(LIBC_NAMESPACE::atof("123"),
               Succeeds<double>(expected_fp.get_val()));
 }
 
-TEST(LlvmLibcAToFTest, FailedParsingTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+TEST_F(LlvmLibcAToFTest, FailedParsingTest) {
   // atof does not flag errors.
   EXPECT_THAT(LIBC_NAMESPACE::atof("???"), Succeeds<double>(0.0));
 }
diff --git a/libc/test/src/stdlib/strtod_test.cpp b/libc/test/src/stdlib/strtod_test.cpp
index 92d14640e6533..db3c1d73bd22e 100644
--- a/libc/test/src/stdlib/strtod_test.cpp
+++ b/libc/test/src/stdlib/strtod_test.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
 #include "src/stdlib/strtod.h"
 
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/RoundingModeUtils.h"
 #include "test/UnitTest/Test.h"
@@ -22,7 +22,7 @@ using LIBC_NAMESPACE::fputil::testing::RoundingMode;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
-class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::Test,
+class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest,
                            ForceRoundingModeTest<RoundingMode::Nearest> {
 public:
   void run_test(const char *inputString, const ptrdiff_t expectedStrLen,
@@ -46,7 +46,6 @@ class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::Test,
     LIBC_NAMESPACE::fputil::FPBits<double> expected_fp =
         LIBC_NAMESPACE::fputil::FPBits<double>(expectedRawData);
 
-    LIBC_NAMESPACE::libc_errno = 0;
     double result = LIBC_NAMESPACE::strtod(inputString, &str_end);
     if (expectedErrno == 0)
       EXPECT_THAT(result, Succeeds<double>(expected_fp.get_val()));
diff --git a/libc/test/src/stdlib/strtof_test.cpp b/libc/test/src/stdlib/strtof_test.cpp
index 6a716c956291c..6df1ddda93bfa 100644
--- a/libc/test/src/stdlib/strtof_test.cpp
+++ b/libc/test/src/stdlib/strtof_test.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
 #include "src/stdlib/strtof.h"
 
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/RoundingModeUtils.h"
 #include "test/UnitTest/Test.h"
@@ -19,7 +19,7 @@
 using LIBC_NAMESPACE::fputil::testing::ForceRoundingModeTest;
 using LIBC_NAMESPACE::fputil::testing::RoundingMode;
 
-class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::Test,
+class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest,
                            ForceRoundingModeTest<RoundingMode::Nearest> {
 public:
   void run_test(const char *inputString, const ptrdiff_t expectedStrLen,
@@ -43,7 +43,6 @@ class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::Test,
     LIBC_NAMESPACE::fputil::FPBits<float> expected_fp =
         LIBC_NAMESPACE::fputil::FPBits<float>(expectedRawData);
 
-    LIBC_NAMESPACE::libc_errno = 0;
     float result = LIBC_NAMESPACE::strtof(inputString, &str_end);
 
     EXPECT_EQ(str_end - inputString, expectedStrLen);
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index b209c85b88e36..eb4056dc7ba64 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -8,9 +8,9 @@
 
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/uint128.h"
-#include "src/errno/libc_errno.h"
 #include "src/stdlib/strtold.h"
 
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include <stddef.h>
@@ -25,7 +25,7 @@
 #error "Unknown long double type"
 #endif
 
-class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::Test {
+class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
 #if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
   void run_test(const char *inputString, const ptrdiff_t expectedStrLen,
@@ -80,7 +80,6 @@ class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::Test {
         FPBits(static_cast<FPBits::StorageType>(expectedRawData));
     const int expected_errno = expectedErrno;
 
-    LIBC_NAMESPACE::libc_errno = 0;
     long double result = LIBC_NAMESPACE::strtold(inputString, &str_end);
 
     LIBC_NAMESPACE::fputil::FPBits<long double> actual_fp =
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
index a675373938e99..ced60750a45c7 100644
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -168,6 +168,7 @@ add_libc_test(
   DEPENDS
     libc.src.string.strdup
     libc.src.errno.errno
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 # FIXME: This is failing on the bot for some reason, disable for now.
diff --git a/libc/test/src/string/strdup_test.cpp b/libc/test/src/string/strdup_test.cpp
index 20b85c37637dd..4b18fc7f1bdee 100644
--- a/libc/test/src/string/strdup_test.cpp
+++ b/libc/test/src/string/strdup_test.cpp
@@ -6,14 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
 #include "src/string/strdup.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcStrDupTest, EmptyString) {
+using LlvmLibcStrDupTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcStrDupTest, EmptyString) {
   const char *empty = "";
 
-  LIBC_NAMESPACE::libc_errno = 0;
   char *result = LIBC_NAMESPACE::strdup(empty);
   ASSERT_ERRNO_SUCCESS();
 
@@ -23,10 +24,9 @@ TEST(LlvmLibcStrDupTest, EmptyString) {
   ::free(result);
 }
 
-TEST(LlvmLibcStrDupTest, AnyString) {
+TEST_F(LlvmLibcStrDupTest, AnyString) {
   const char *abc = "abc";
 
-  LIBC_NAMESPACE::libc_errno = 0;
   char *result = LIBC_NAMESPACE::strdup(abc);
   ASSERT_ERRNO_SUCCESS();
 
@@ -36,8 +36,7 @@ TEST(LlvmLibcStrDupTest, AnyString) {
   ::free(result);
 }
 
-TEST(LlvmLibcStrDupTest, NullPtr) {
-  LIBC_NAMESPACE::libc_errno = 0;
+TEST_F(LlvmLibcStrDupTest, NullPtr) {
   char *result = LIBC_NAMESPACE::strdup(nullptr);
   ASSERT_ERRNO_SUCCESS();
 

From 22fd11fe66a0d64f5ef359e21ae67a7d40936eaf Mon Sep 17 00:00:00 2001
From: Abhina Sree <Abhina.Sreeskantharajan@ibm.com>
Date: Wed, 11 Jun 2025 15:26:49 -0400
Subject: [PATCH 130/851] [SystemZ][z/OS] Refactor AutoConvert.h to remove
 large MVS guard (#143174)

This AutoConvert.h header frequently gets mislabeled as an unused
include because it is guarded by MVS internally and every usage is also
guarded. This refactors the change to remove this guard and instead make
these functions a noop on other non-z/OS platforms.
---
 llvm/include/llvm/Support/AutoConvert.h | 46 +++++++++++++++++++++++--
 llvm/lib/Support/AutoConvert.cpp        | 21 -----------
 llvm/lib/Support/InitLLVM.cpp           | 30 ++++++++++------
 llvm/lib/Support/MemoryBuffer.cpp       | 10 +++---
 llvm/lib/Support/raw_ostream.cpp        | 19 +++++-----
 5 files changed, 78 insertions(+), 48 deletions(-)

diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index 352493e9be25f..56ad91425bcc3 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -16,6 +16,7 @@
 
 #ifdef __MVS__
 #include <_Ccsid.h>
+#endif
 #ifdef __cplusplus
 #include "llvm/Support/ErrorOr.h"
 #include <system_error>
@@ -28,9 +29,11 @@
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
+
 int enablezOSAutoConversion(int FD);
 int disablezOSAutoConversion(int FD);
 int restorezOSStdHandleAutoConversion(int FD);
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
@@ -38,6 +41,46 @@ int restorezOSStdHandleAutoConversion(int FD);
 #ifdef __cplusplus
 namespace llvm {
 
+inline std::error_code disableAutoConversion(int FD) {
+#ifdef __MVS__
+  if (::disablezOSAutoConversion(FD) == -1)
+    return errnoAsErrorCode();
+#endif
+  return std::error_code();
+}
+
+inline std::error_code enableAutoConversion(int FD) {
+#ifdef __MVS__
+  if (::enablezOSAutoConversion(FD) == -1)
+    return errnoAsErrorCode();
+#endif
+  return std::error_code();
+}
+
+inline std::error_code restoreStdHandleAutoConversion(int FD) {
+#ifdef __MVS__
+  if (::restorezOSStdHandleAutoConversion(FD) == -1)
+    return errnoAsErrorCode();
+#endif
+  return std::error_code();
+}
+
+inline std::error_code setFileTag(int FD, int CCSID, bool Text) {
+#ifdef __MVS__
+  return setzOSFileTag(FD, CCSID, Text);
+#endif
+  return std::error_code();
+}
+
+inline ErrorOr<bool> needConversion(const char *FileName, const int FD = -1) {
+#ifdef __MVS__
+  return needzOSConversion(FileName, FD);
+#endif
+  return false;
+}
+
+#ifdef __MVS__
+
 /** \brief Disable the z/OS enhanced ASCII auto-conversion for the file
  * descriptor.
  */
@@ -63,9 +106,8 @@ ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
  */
 ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
 
+#endif /* __MVS__*/
 } /* namespace llvm */
 #endif /* __cplusplus */
 
-#endif /* __MVS__ */
-
 #endif /* LLVM_SUPPORT_AUTOCONVERT_H */
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index f7918548df1d0..c69e9a8f97c0e 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -83,27 +83,6 @@ int enablezOSAutoConversion(int FD) {
   return fcntl(FD, F_CONTROL_CVT, &Query);
 }
 
-std::error_code llvm::disablezOSAutoConversion(int FD) {
-  if (::disablezOSAutoConversion(FD) == -1)
-    return errnoAsErrorCode();
-
-  return std::error_code();
-}
-
-std::error_code llvm::enablezOSAutoConversion(int FD) {
-  if (::enablezOSAutoConversion(FD) == -1)
-    return errnoAsErrorCode();
-
-  return std::error_code();
-}
-
-std::error_code llvm::restorezOSStdHandleAutoConversion(int FD) {
-  if (::restorezOSStdHandleAutoConversion(FD) == -1)
-    return errnoAsErrorCode();
-
-  return std::error_code();
-}
-
 std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) {
   assert((!Text || (CCSID != FT_UNTAGGED && CCSID != FT_BINARY)) &&
          "FT_UNTAGGED and FT_BINARY are not allowed for text files");
diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp
index 50f7a43cc34a7..b8fbfd21c4f28 100644
--- a/llvm/lib/Support/InitLLVM.cpp
+++ b/llvm/lib/Support/InitLLVM.cpp
@@ -18,18 +18,28 @@
 #include "llvm/Support/Windows/WindowsSupport.h"
 #endif
 
-#ifdef __MVS__
+#if defined(HAVE_UNISTD_H)
 #include <unistd.h>
+#else
+#ifndef STDIN_FILENO
+#define STDIN_FILENO 0
+#endif
+#ifndef STDOUT_FILENO
+#define STDOUT_FILENO 1
+#endif
+#ifndef STDERR_FILENO
+#define STDERR_FILENO 2
+#endif
+#endif
 
 void CleanupStdHandles(void *Cookie) {
   llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs();
   Outs->flush();
   Errs->flush();
-  llvm::restorezOSStdHandleAutoConversion(STDIN_FILENO);
-  llvm::restorezOSStdHandleAutoConversion(STDOUT_FILENO);
-  llvm::restorezOSStdHandleAutoConversion(STDERR_FILENO);
+  llvm::restoreStdHandleAutoConversion(STDIN_FILENO);
+  llvm::restoreStdHandleAutoConversion(STDOUT_FILENO);
+  llvm::restoreStdHandleAutoConversion(STDERR_FILENO);
 }
-#endif
 
 using namespace llvm;
 using namespace llvm::sys;
@@ -41,10 +51,10 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv,
   assert(!Initialized && "InitLLVM was already initialized!");
   Initialized = true;
 #endif
-#ifdef __MVS__
+
   // Bring stdin/stdout/stderr into a known state.
   sys::AddSignalHandler(CleanupStdHandles, nullptr);
-#endif
+
   if (InstallPipeSignalExitHandler)
     // The pipe signal handler must be installed before any other handlers are
     // registered. This is because the Unix \ref RegisterHandlers function does
@@ -68,8 +78,8 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv,
 
   // If turning on conversion for stderr fails then the error message
   // may be garbled. There is no solution to this problem.
-  ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDERR_FILENO)));
-  ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDOUT_FILENO)));
+  ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDERR_FILENO)));
+  ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDOUT_FILENO)));
 #endif
 
 #ifdef _WIN32
@@ -97,8 +107,6 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv,
 }
 
 InitLLVM::~InitLLVM() {
-#ifdef __MVS__
   CleanupStdHandles(nullptr);
-#endif
   llvm_shutdown();
 }
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index e2044bcc4e4f0..601f11f6d23c8 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/Alignment.h"
+#include "llvm/Support/AutoConvert.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -34,9 +35,6 @@
 #include <io.h>
 #endif
 
-#ifdef __MVS__
-#include "llvm/Support/AutoConvert.h"
-#endif
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -508,15 +506,15 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
   }
 
 #ifdef __MVS__
-  ErrorOr<bool> NeedConversion = needzOSConversion(Filename.str().c_str(), FD);
-  if (std::error_code EC = NeedConversion.getError())
+  ErrorOr<bool> NeedsConversion = needConversion(Filename.str().c_str(), FD);
+  if (std::error_code EC = NeedsConversion.getError())
     return EC;
   // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we
   // cannot trust the file size and we create the memory buffer by copying
   // off the stream.
   // Note: This only works with the assumption of reading a full file (i.e,
   // Offset == 0 and MapSize == FileSize). Reading a file slice does not work.
-  if (Offset == 0 && MapSize == FileSize && *NeedConversion)
+  if (*NeedsConversion && Offset == 0 && MapSize == FileSize)
     return getMemoryBufferForStream(FD, Filename);
 #endif
 
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index 16631a63d1921..07b99896543bd 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -894,21 +894,24 @@ void raw_fd_ostream::anchor() {}
 raw_fd_ostream &llvm::outs() {
   // Set buffer settings to model stdout behavior.
   std::error_code EC;
-#ifdef __MVS__
-  EC = enablezOSAutoConversion(STDOUT_FILENO);
-  assert(!EC);
-#endif
+
+  // On z/OS we need to enable auto conversion
+  static std::error_code EC1 = enableAutoConversion(STDOUT_FILENO);
+  assert(!EC1);
+  (void)EC1;
+
   static raw_fd_ostream S("-", EC, sys::fs::OF_None);
   assert(!EC);
   return S;
 }
 
 raw_fd_ostream &llvm::errs() {
-  // Set standard error to be unbuffered.
-#ifdef __MVS__
-  std::error_code EC = enablezOSAutoConversion(STDERR_FILENO);
+  // On z/OS we need to enable auto conversion
+  static std::error_code EC = enableAutoConversion(STDERR_FILENO);
   assert(!EC);
-#endif
+  (void)EC;
+
+  // Set standard error to be unbuffered.
   static raw_fd_ostream S(STDERR_FILENO, false, true);
   return S;
 }

From 34a1b8ce2518d7868c080519a05892cd3b197192 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <razvan.lupusoru@gmail.com>
Date: Wed, 11 Jun 2025 12:37:08 -0700
Subject: [PATCH 131/851] [acc] acc.loop verifier now requires parallelism
 determination flag (#143720)

The OpenACC specification for `acc loop` describe that a loop's
parallelism determination mode is either auto, independent, or seq. The
rules are as follows.
- As per OpenACC 3.3 standard section 2.9.6 independent clause: A loop
construct with no auto or seq clause is treated as if it has the
independent clause when it is an orphaned loop construct or its parent
compute construct is a parallel construct.
- As per OpenACC 3.3 standard section 2.9.7 auto clause: When the parent
compute construct is a kernels construct, a loop construct with no
independent or seq clause is treated as if it has the auto clause.
- Additionally, loops marked with gang, worker, or vector are not
guaranteed to be parallel. Specifically noted in 2.9.7 auto clause: If
not, or if it is unable to make a determination, it must treat the auto
clause as if it is a seq clause, and it must ignore any gang, worker, or
vector clauses on the loop construct.

The verifier for `acc.loop` was updated to enforce this marking because
the context in which a loop appears is not trivially determined once IR
transformations begin. For example, orphaned loops are implicitly
`independent`, but after inlining into an `acc.kernels` region they
would be implicitly considered `auto`. Thus now the verifier requires
that a frontend specifically generates acc dialect with this marking
since it knows the context.
---
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp      | 35 +++++++++--
 mlir/test/Dialect/OpenACC/canonicalize.mlir  |  4 +-
 mlir/test/Dialect/OpenACC/invalid.mlir       | 28 ++++-----
 mlir/test/Dialect/OpenACC/legalize-data.mlir | 16 ++---
 mlir/test/Dialect/OpenACC/ops.mlir           | 66 ++++++++++----------
 5 files changed, 86 insertions(+), 63 deletions(-)

diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 21e6b9d85f1a1..0dfead98b7e73 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2461,10 +2461,34 @@ LogicalResult acc::LoopOp::verify() {
   if (hasDuplicateDeviceTypes(getAuto_(), deviceTypes) ||
       hasDuplicateDeviceTypes(getIndependent(), deviceTypes) ||
       hasDuplicateDeviceTypes(getSeq(), deviceTypes)) {
-    return emitError() << "only one of \"" << acc::LoopOp::getAutoAttrStrName()
-                       << "\", " << getIndependentAttrName() << ", "
-                       << getSeqAttrName()
-                       << " can be present at the same time";
+    return emitError() << "only one of auto, independent, seq can be present "
+                          "at the same time";
+  }
+
+  // Check that at least one of auto, independent, or seq is present
+  // for the device-independent default clauses.
+  auto hasDeviceNone = [](mlir::acc::DeviceTypeAttr attr) -> bool {
+    return attr.getValue() == mlir::acc::DeviceType::None;
+  };
+  bool hasDefaultSeq =
+      getSeqAttr()
+          ? llvm::any_of(getSeqAttr().getAsRange<mlir::acc::DeviceTypeAttr>(),
+                         hasDeviceNone)
+          : false;
+  bool hasDefaultIndependent =
+      getIndependentAttr()
+          ? llvm::any_of(
+                getIndependentAttr().getAsRange<mlir::acc::DeviceTypeAttr>(),
+                hasDeviceNone)
+          : false;
+  bool hasDefaultAuto =
+      getAuto_Attr()
+          ? llvm::any_of(getAuto_Attr().getAsRange<mlir::acc::DeviceTypeAttr>(),
+                         hasDeviceNone)
+          : false;
+  if (!hasDefaultSeq && !hasDefaultIndependent && !hasDefaultAuto) {
+    return emitError()
+           << "at least one of auto, independent, seq must be present";
   }
 
   // Gang, worker and vector are incompatible with seq.
@@ -2482,8 +2506,7 @@ LogicalResult acc::LoopOp::verify() {
                        deviceTypeAttr.getValue()) ||
           getGangValue(mlir::acc::GangArgType::Static,
                        deviceTypeAttr.getValue()))
-        return emitError()
-               << "gang, worker or vector cannot appear with the seq attr";
+        return emitError() << "gang, worker or vector cannot appear with seq";
     }
   }
 
diff --git a/mlir/test/Dialect/OpenACC/canonicalize.mlir b/mlir/test/Dialect/OpenACC/canonicalize.mlir
index e43a27f6b9e89..fdc8e6b5cae6e 100644
--- a/mlir/test/Dialect/OpenACC/canonicalize.mlir
+++ b/mlir/test/Dialect/OpenACC/canonicalize.mlir
@@ -116,10 +116,10 @@ func.func @testhostdataop(%a: memref<f32>, %ifCond: i1) -> () {
   acc.host_data dataOperands(%0 : memref<f32>) if(%false) {
     acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
       acc.yield
-    } attributes { inclusiveUpperbound = array<i1: true> }
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
     acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
       acc.yield
-    } attributes { inclusiveUpperbound = array<i1: true> }
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
     acc.terminator
   }
   return
diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir
index aadf189273212..8f6e961a06163 100644
--- a/mlir/test/Dialect/OpenACC/invalid.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid.mlir
@@ -2,7 +2,7 @@
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -12,7 +12,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -22,7 +22,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -32,7 +32,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -42,7 +42,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -52,7 +52,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -62,7 +62,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}}
+// expected-error@+1 {{gang, worker or vector cannot appear with seq}}
 acc.loop {
   "test.openacc_dummy_op"() : () -> ()
   acc.yield
@@ -72,7 +72,7 @@ acc.loop {
 
 // expected-error@+1 {{expected non-empty body.}}
 acc.loop {
-}
+} attributes {independent = [#acc.device_type<none>]}
 
 // -----
 
@@ -99,7 +99,7 @@ acc.loop {
 
 %1 = arith.constant 1 : i32
 %2 = arith.constant 10 : i32
-// expected-error@+1 {{only one of "auto", "independent", "seq" can be present at the same time}}
+// expected-error@+1 {{only one of auto, independent, seq can be present at the same time}}
 acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
   acc.yield
 } attributes {auto_ = [#acc.device_type<none>], seq = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true>}
@@ -168,7 +168,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32){
 // expected-error@+1 {{'acc.init' op cannot be nested in a compute operation}}
   acc.init
   acc.yield
-} attributes {inclusiveUpperbound = array<i1: true>}
+} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
 // -----
 
@@ -186,7 +186,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
 // expected-error@+1 {{'acc.shutdown' op cannot be nested in a compute operation}}
   acc.shutdown
   acc.yield
-} attributes {inclusiveUpperbound = array<i1: true>}
+} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
 // -----
 
@@ -198,7 +198,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) {
     acc.shutdown
   }) : () -> ()
   acc.yield
-} attributes {inclusiveUpperbound = array<i1: true>}
+} attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
 // -----
 
@@ -797,7 +797,7 @@ func.func @acc_loop_container() {
         scf.yield
     }
     acc.yield
-  } attributes { collapse = [2], collapseDeviceType = [#acc.device_type<none>] }
+  } attributes { collapse = [2], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
   return
 }
 
@@ -816,6 +816,6 @@ func.func @acc_loop_container() {
       scf.yield
     }
     acc.yield
-  } attributes { collapse = [3], collapseDeviceType = [#acc.device_type<none>] }
+  } attributes { collapse = [3], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
   return
 }
diff --git a/mlir/test/Dialect/OpenACC/legalize-data.mlir b/mlir/test/Dialect/OpenACC/legalize-data.mlir
index 28ef6761a6ef4..40604dcc736de 100644
--- a/mlir/test/Dialect/OpenACC/legalize-data.mlir
+++ b/mlir/test/Dialect/OpenACC/legalize-data.mlir
@@ -96,7 +96,7 @@ func.func @test(%a: memref<10xf32>) {
     acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
       %ci = memref.load %a[%i] : memref<10xf32>
       acc.yield
-    }
+    } attributes {independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -109,7 +109,7 @@ func.func @test(%a: memref<10xf32>) {
 // CHECK:   acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index) {
 // DEVICE:    %{{.*}} = memref.load %[[CREATE:.*]][%[[I]]] : memref<10xf32>
 // CHECK:     acc.yield
-// CHECK:   }
+// CHECK:   } attributes {independent = [#acc.device_type<none>]}
 // CHECK:   acc.yield
 // CHECK: }
 
@@ -134,7 +134,7 @@ func.func @test(%a: memref<10xf32>) {
     acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
       %ci = memref.load %a[%i] : memref<10xf32>
       acc.yield
-    }
+    } attributes {independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -147,7 +147,7 @@ func.func @test(%a: memref<10xf32>) {
 // CHECK:   acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index) {
 // DEVICE:    %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32>
 // CHECK:     acc.yield
-// CHECK:   }
+// CHECK:   } attributes {independent = [#acc.device_type<none>]}
 // CHECK:   acc.yield
 // CHECK: }
 
@@ -172,7 +172,7 @@ func.func @test(%a: memref<10xf32>) {
     acc.loop private(@privatization_memref_10_f32 -> %p1 : memref<10xf32>) control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
       %ci = memref.load %a[%i] : memref<10xf32>
       acc.yield
-    }
+    } attributes {independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -185,7 +185,7 @@ func.func @test(%a: memref<10xf32>) {
 // CHECK:   acc.loop private(@privatization_memref_10_f32 -> %[[PRIVATE]] : memref<10xf32>) control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index) {
 // DEVICE:    %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32>
 // CHECK:     acc.yield
-// CHECK:   }
+// CHECK:   } attributes {independent = [#acc.device_type<none>]}
 // CHECK:   acc.yield
 // CHECK: }
 
@@ -210,7 +210,7 @@ func.func @test(%a: memref<10xf32>) {
     acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
       %ci = memref.load %a[%i] : memref<10xf32>
       acc.yield
-    }
+    } attributes {seq = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -223,7 +223,7 @@ func.func @test(%a: memref<10xf32>) {
 // CHECK:   acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index) {
 // DEVICE:    %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32>
 // CHECK:     acc.yield
-// CHECK:   }
+// CHECK:   } attributes {seq = [#acc.device_type<none>]}
 // CHECK:   acc.yield
 // CHECK: }
 
diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index 550f295f074a2..97278f869534b 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -19,7 +19,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x
       %co = arith.addf %cij, %p : f32
       memref.store %co, %C[%arg3, %arg4] : memref<10x10xf32>
       acc.yield
-    } attributes { collapse = [3], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
+    } attributes { collapse = [3], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>, independent = [#acc.device_type<none>]}
     acc.yield
   }
 
@@ -40,7 +40,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x
 //  CHECK-NEXT:       %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
 //  CHECK-NEXT:       memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32>
 //  CHECK-NEXT:       acc.yield
-//  CHECK-NEXT:     } attributes {collapse = [3], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>}
+//  CHECK-NEXT:     } attributes {collapse = [3], collapseDeviceType = [#acc.device_type<none>], inclusiveUpperbound = array<i1: true, true, true>, independent = [#acc.device_type<none>]}
 //  CHECK-NEXT:     acc.yield
 //  CHECK-NEXT:   }
 //  CHECK-NEXT:   return %{{.*}} : memref<10x10xf32>
@@ -129,7 +129,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x
           %tmp = arith.addf %axy, %bxy : f32
           memref.store %tmp, %c[%y] : memref<10xf32>
           acc.yield
-        } attributes {inclusiveUpperbound = array<i1: true>}
+        } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 
         acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) {
           // for i = 0 to 10 step 1
@@ -139,9 +139,9 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x
           %z = arith.addf %ci, %dx : f32
           memref.store %z, %d[%x] : memref<10xf32>
           acc.yield
-        } attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<nvidia>]}
+        } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>]}
         acc.yield
-      } attributes {inclusiveUpperbound = array<i1: true>}
+      } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
       acc.yield
     }
     acc.terminator
@@ -166,16 +166,16 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x
 // CHECK-NEXT:           %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
 // CHECK-NEXT:           memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
 // CHECK-NEXT:           acc.yield
-// CHECK-NEXT:         } attributes {inclusiveUpperbound = array<i1: true>}
+// CHECK-NEXT:         } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 // CHECK-NEXT:         acc.loop control(%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) {
 // CHECK-NEXT:           %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32>
 // CHECK-NEXT:           %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32>
 // CHECK-NEXT:           %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32
 // CHECK-NEXT:           memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
 // CHECK-NEXT:           acc.yield
-// CHECK-NEXT:         } attributes {inclusiveUpperbound = array<i1: true>, seq = [#acc.device_type<nvidia>]}
+// CHECK-NEXT:         } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>]}
 // CHECK-NEXT:         acc.yield
-// CHECK-NEXT:       } attributes {inclusiveUpperbound = array<i1: true>}
+// CHECK-NEXT:       } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 // CHECK-NEXT:       acc.yield
 // CHECK-NEXT:     }
 // CHECK-NEXT:     acc.terminator
@@ -196,72 +196,72 @@ func.func @testloopop(%a : memref<10xf32>) -> () {
   acc.loop gang vector worker control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({num=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({static=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop worker(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop worker(%i32Value: i32) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop worker(%idxValue: index) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop vector(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop vector(%i32Value: i32) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop vector(%idxValue: index) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({num=%i64Value: i64}) worker vector control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({num=%i64Value: i64, static=%i64Value: i64}) worker(%i64Value: i64) vector(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({num=%i32Value: i32, static=%idxValue: index}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop tile({%i64Value : i64, %i64Value : i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop tile({%i32Value : i32, %i32Value : i32}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({static=%i64Value: i64, num=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   acc.loop gang({dim=%i64Value : i64, static=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   %b = acc.cache varPtr(%a : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32>
   acc.loop cache(%b : memref<10xf32>) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
     "test.openacc_dummy_op"() : () -> ()
     acc.yield
-  } attributes {inclusiveUpperbound = array<i1: true>}
+  } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
   return
 }
 
@@ -271,7 +271,7 @@ func.func @testloopop(%a : memref<10xf32>) -> () {
 // CHECK:      acc.loop
 // CHECK-NEXT:   "test.openacc_dummy_op"() : () -> ()
 // CHECK-NEXT:   acc.yield
-// CHECK-NEXT: attributes {inclusiveUpperbound = array<i1: true>}
+// CHECK-NEXT: attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
 // CHECK:      acc.loop gang({num=[[I64VALUE]] : i64})
 // CHECK-NEXT:   "test.openacc_dummy_op"() : () -> ()
 // CHECK-NEXT:   acc.yield
@@ -343,7 +343,7 @@ func.func @acc_loop_multiple_block() {
       cf.br ^bb1(%22 : index)
     ^bb3:
       acc.yield
-    } attributes {inclusiveUpperbound = array<i1: true>}
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -1477,7 +1477,7 @@ func.func @acc_reduc_test(%a : i64) -> () {
   acc.parallel reduction(@reduction_add_i64 -> %a : i64) {
     acc.loop reduction(@reduction_add_i64 -> %a : i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
       acc.yield
-    } attributes { inclusiveUpperbound = array<i1: true> }
+    } attributes {inclusiveUpperbound = array<i1: true>, independent = [#acc.device_type<none>]}
     acc.yield
   }
   return
@@ -1869,21 +1869,21 @@ func.func @acc_combined() {
   acc.parallel combined(loop) {
     acc.loop combined(parallel) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
       acc.yield
-    }
+    } attributes {independent = [#acc.device_type<none>]}
     acc.terminator
   }
 
   acc.kernels combined(loop) {
     acc.loop combined(kernels) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
       acc.yield
-    }
+    } attributes {auto_ = [#acc.device_type<none>]}
     acc.terminator
   }
 
   acc.serial combined(loop) {
     acc.loop combined(serial) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) {
       acc.yield
-    }
+    } attributes {seq = [#acc.device_type<none>]}
     acc.terminator
   }
 
@@ -1949,7 +1949,7 @@ func.func @acc_loop_container() {
       scf.yield
     }
     acc.yield
-  }
+  } attributes {independent = [#acc.device_type<none>]}
   return
 }
 
@@ -1971,7 +1971,7 @@ func.func @acc_loop_container() {
       scf.yield
     }
     acc.yield
-  } attributes { collapse = [2], collapseDeviceType = [#acc.device_type<none>] }
+  } attributes { collapse = [2], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
   return
 }
 

From 02161c635fd70e0214bd8b8320a80992c50ec325 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Wed, 11 Jun 2025 12:44:51 -0700
Subject: [PATCH 132/851] [NVPTX] Misc table-gen cleanup (NFC) (#142877)

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  196 +-
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      | 2504 ++++-------------
 .../Target/NVPTX/NVPTXReplaceImageHandles.cpp |  840 +++---
 3 files changed, 1065 insertions(+), 2475 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b646d39194c7e..9ca4e8d20650a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -160,7 +160,6 @@ def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
 def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
 
 def True : Predicate<"true">;
-def False : Predicate<"false">;
 
 class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;
 class hasSM<int version>: Predicate<"Subtarget->getSmVersion() >= " # version>;
@@ -257,6 +256,11 @@ def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>;
 //                         "prmt.b32${mode}">;
 //         ---> "prmt.b32${mode} \t$d, $a, $b, $c;"
 //
+//   * BasicFlagsNVPTXInst<(outs Int64Regs:$state),
+//                         (ins ADDR:$addr),
+//                         "mbarrier.arrive.b64">;
+//         ---> "mbarrier.arrive.b64 \t$state, [$addr];"
+//
 class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmstr,
                           list<dag> pattern = []>
   : NVPTXInst<
@@ -274,7 +278,11 @@ class BasicFlagsNVPTXInst<dag outs_dag, dag ins_dag, dag flags_dag, string asmst
             !if(!or(!empty(ins_dag), !empty(outs_dag)), "", ", "),
             !interleave(
               !foreach(i, !range(!size(ins_dag)),
-                "$" # !getdagname(ins_dag, i)),
+                 !if(!eq(!cast<string>(!getdagarg<DAGOperand>(ins_dag, i)), "ADDR"),
+                    "[$" # !getdagname(ins_dag, i) # "]",
+                    "$" # !getdagname(ins_dag, i)
+                 )
+                ),
               ", "))),
         ";"),
       pattern>;
@@ -956,31 +964,17 @@ def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
 def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
 
 // Matchers for signed, unsigned mul.wide ISD nodes.
-def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)),
-          (MULWIDES32 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)),
-          (MULWIDES32Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)),
-          (MULWIDEU32 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)),
-          (MULWIDEU32Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
+let Predicates = [doMulWide] in {
+  def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>;
+  def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>;
+  def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>;
+  def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), (MULWIDEU32Imm $a, imm:$b)>;
 
-def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)),
-          (MULWIDES64 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)),
-          (MULWIDES64Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)),
-          (MULWIDEU64 $a, $b)>,
-      Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)),
-          (MULWIDEU64Imm $a, imm:$b)>,
-      Requires<[doMulWide]>;
+  def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), (MULWIDES64 $a, $b)>;
+  def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), (MULWIDES64Imm $a, imm:$b)>;
+  def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), (MULWIDEU64 $a, $b)>;
+  def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>;
+}
 
 // Predicates used for converting some patterns to mul.wide.
 def SInt32Const : PatLeaf<(imm), [{
@@ -1106,18 +1100,12 @@ defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>;
 defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>;
 }
 
-def INEG16 :
-  BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
-            "neg.s16",
-            [(set i16:$dst, (ineg i16:$src))]>;
-def INEG32 :
-  BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
-            "neg.s32",
-            [(set i32:$dst, (ineg i32:$src))]>;
-def INEG64 :
-  BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
-            "neg.s64",
-            [(set i64:$dst, (ineg i64:$src))]>;
+foreach t = [I16RT, I32RT, I64RT] in {
+  def NEG_S # t.Size :
+    BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src),
+              "neg.s" # t.Size,
+              [(set t.Ty:$dst, (ineg t.Ty:$src))]>;
+}
 
 //-----------------------------------
 // Floating Point Arithmetic
@@ -1538,7 +1526,7 @@ def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>;
 
 def SDTPRMT :
   SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
-                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>;
+                       SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
 def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>;
 
 multiclass BFE<string Instr, ValueType T, RegisterClass RC> {
@@ -1961,7 +1949,7 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   // f16 -> pred
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
             (SETP_f16rr $a, $b, ModeFTZ)>,
-        Requires<[useFP16Math,doF32FTZ]>;
+        Requires<[useFP16Math, doF32FTZ]>;
   def : Pat<(i1 (OpNode f16:$a, f16:$b)),
             (SETP_f16rr $a, $b, Mode)>,
         Requires<[useFP16Math]>;
@@ -1969,7 +1957,7 @@ multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
   // bf16 -> pred
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
             (SETP_bf16rr $a, $b, ModeFTZ)>,
-        Requires<[hasBF16Math,doF32FTZ]>;
+        Requires<[hasBF16Math, doF32FTZ]>;
   def : Pat<(i1 (OpNode bf16:$a, bf16:$b)),
             (SETP_bf16rr $a, $b, Mode)>,
         Requires<[hasBF16Math]>;
@@ -2497,24 +2485,20 @@ def : Pat<(f16 (uint_to_fp i32:$a)), (CVT_f16_u32 $a, CvtRN)>;
 def : Pat<(f16 (uint_to_fp i64:$a)), (CVT_f16_u64 $a, CvtRN)>;
 
 // sint -> bf16
-def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>, 
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+  def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>;
+  def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>;
+}
 
 // uint -> bf16
-def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
-def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>,
-      Requires<[hasPTX<78>, hasSM<90>]>;
+let Predicates = [hasPTX<78>, hasSM<90>] in {
+  def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>;
+  def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>;
+}
 
 // sint -> f32
 def : Pat<(f32 (sint_to_fp  i1:$a)), (CVT_f32_s32 (SELP_b32ii -1, 0, $a), CvtRN)>;
@@ -2565,27 +2549,25 @@ def : Pat<(i16 (fp_to_uint bf16:$a)), (CVT_u16_bf16 $a, CvtRZI)>;
 def : Pat<(i32 (fp_to_uint bf16:$a)), (CVT_u32_bf16 $a, CvtRZI)>;
 def : Pat<(i64 (fp_to_uint bf16:$a)), (CVT_u64_bf16 $a, CvtRZI)>;
 // f32 -> sint
-def : Pat<(i1 (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
+let Predicates = [doF32FTZ] in {
+  def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>;
+}
+def : Pat<(i1  (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
 def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI)>;
 
 // f32 -> uint
+let Predicates = [doF32FTZ] in {
+  def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>;
+  def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>;
+}
 def : Pat<(i1  (fp_to_uint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>, 
-      Requires<[doF32FTZ]>;
 def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>,
-      Requires<[doF32FTZ]>;
 def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI)>;
 
 // f64 -> sint
@@ -2707,28 +2689,24 @@ let hasSideEffects = false in {
 
   // PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the
   // unused high/low part.
-  def I32toI16H_Sink  : NVPTXInst<(outs Int16Regs:$high),
-                             (ins Int32Regs:$s),
-                             "mov.b32 \t{{_, $high}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I32toI16L_Sink  : NVPTXInst<(outs Int16Regs:$low),
-                             (ins Int32Regs:$s),
-                             "mov.b32 \t{{$low, _}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I64toI32H_Sink  : NVPTXInst<(outs Int32Regs:$high),
-                             (ins Int64Regs:$s),
-                             "mov.b64 \t{{_, $high}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
-  def I64toI32L_Sink  : NVPTXInst<(outs Int32Regs:$low),
-                             (ins Int64Regs:$s),
-                             "mov.b64 \t{{$low, _}}, $s;",
-                             []>, Requires<[hasPTX<71>]>;
+  let Predicates = [hasPTX<71>] in {
+    def I32toI16H_Sink  : NVPTXInst<(outs Int16Regs:$high), (ins Int32Regs:$s),
+                              "mov.b32 \t{{_, $high}}, $s;", []>;
+    def I32toI16L_Sink  : NVPTXInst<(outs Int16Regs:$low), (ins Int32Regs:$s),
+                              "mov.b32 \t{{$low, _}}, $s;", []>;
+    def I64toI32H_Sink  : NVPTXInst<(outs Int32Regs:$high), (ins Int64Regs:$s),
+                              "mov.b64 \t{{_, $high}}, $s;", []>;
+    def I64toI32L_Sink  : NVPTXInst<(outs Int32Regs:$low), (ins Int64Regs:$s),
+                              "mov.b64 \t{{$low, _}}, $s;", []>;
+  }
 }
 
-def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
-def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>;
+let Predicates = [hasPTX<71>] in {
+  def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>;
+  def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>;
+  def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>;
+  def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>;
+}
 
 // Fall back to the old way if we don't have PTX 7.1.
 def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H $s)>;
@@ -3061,29 +3039,19 @@ def stacksave :
   SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf,
          [SDNPHasChain, SDNPSideEffect]>;
 
-def STACKRESTORE_32 :
-  BasicNVPTXInst<(outs), (ins Int32Regs:$ptr),
-            "stackrestore.u32",
-            [(stackrestore i32:$ptr)]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def STACKSAVE_32 :
-  BasicNVPTXInst<(outs Int32Regs:$dst), (ins),
-            "stacksave.u32",
-            [(set i32:$dst, (i32 stacksave))]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def STACKRESTORE_64 :
-  BasicNVPTXInst<(outs), (ins Int64Regs:$ptr),
-            "stackrestore.u64",
-            [(stackrestore i64:$ptr)]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
-
-def STACKSAVE_64 :
-  BasicNVPTXInst<(outs Int64Regs:$dst), (ins),
-            "stacksave.u64",
-            [(set i64:$dst, (i64 stacksave))]>,
-            Requires<[hasPTX<73>, hasSM<52>]>;
+let Predicates = [hasPTX<73>, hasSM<52>] in {
+  foreach t = [I32RT, I64RT] in {
+    def STACKRESTORE_ # t.Size :
+      BasicNVPTXInst<(outs), (ins t.RC:$ptr),
+                "stackrestore.u" # t.Size,
+              [(stackrestore t.Ty:$ptr)]>;
+
+    def STACKSAVE_ # t.Size :
+      BasicNVPTXInst<(outs t.RC:$dst), (ins),
+                "stacksave.u" # t.Size,
+              [(set t.Ty:$dst, (t.Ty stacksave))]>;
+  }
+}
 
 include "NVPTXIntrinsics.td"
 
@@ -3124,7 +3092,7 @@ def : Pat <
 ////////////////////////////////////////////////////////////////////////////////
 
 class NVPTXFenceInst<string scope, string sem, Predicate ptx>:
-    NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>,
+    BasicNVPTXInst<(outs), (ins), "fence."#sem#"."#scope>,
     Requires<[ptx, hasSM<70>]>;
 
 foreach scope = ["sys", "gpu", "cluster", "cta"] in {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f918160001ba5..83d7defe6d9a9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -52,7 +52,7 @@ class PTX {
 def ptx : PTX;
 
 // Generates list of n sequential register names.
-// E.g. RegNames<3,"r">.ret -> ["r0", "r1", "r2" ]
+// E.g. RegNames<3, "r">.ret -> ["r0", "r1", "r2" ]
 class RegSeq<int n, string prefix> {
   list<string> ret = !if(n, !listconcat(RegSeq<!sub(n, 1), prefix>.ret,
                                         [prefix # !sub(n, 1)]),
@@ -137,7 +137,7 @@ defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive
 
 class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
                           list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
-        NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
+        BasicNVPTXInst<(outs), (ins), "barrier.cluster."# variant, [(Intr)]>,
         Requires<Preds>;
 
 def barrier_cluster_arrive:
@@ -400,13 +400,9 @@ def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS :
 //-----------------------------------
 
 multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
-  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
-            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
-            [(Intrin i32:$addr)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
-            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
-            [(Intrin i64:$addr)]>,
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+            "cp.async.mbarrier.arrive" # NoInc # AddrSpace # ".b64",
+            [(Intrin addr:$addr)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
 
@@ -420,30 +416,19 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
   CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
 
 multiclass CP_ASYNC_SHARED_GLOBAL_I<string cc, string cpsize, Intrinsic Intrin, Intrinsic IntrinS> {
-  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
-            !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
-            [(Intrin i32:$dst, i32:$src)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
-            !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"),
-            [(Intrin i64:$dst, i64:$src)]>,
+  def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src),
+            "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ";",
+            [(Intrin addr:$dst, addr:$src)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
+
   // Variant with src_size parameter
-  def _32s : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i32:$dst, i32:$src, i32:$src_size)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _32si: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i32:$dst, i32:$src, imm:$src_size)]>,
-    Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64s : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i64:$dst, i64:$src, i32:$src_size)]>,
+  def _s : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$src_size),
+             "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;",
+             [(IntrinS addr:$dst, addr:$src, i32:$src_size)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
-  def _64si: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size),
-             !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
-             [(IntrinS i64:$dst, i64:$src, imm:$src_size)]>,
+  def _si: NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, i32imm:$src_size),
+             "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;",
+             [(IntrinS addr:$dst, addr:$src, imm:$src_size)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
 
@@ -513,14 +498,14 @@ class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
 }
 
 multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
-  def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
+  def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
       !if(has_ch,
           CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;",
           CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"),
       [(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
       Requires<[hasPTX<80>, hasSM<90>]>;
 
-  def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask),
+  def _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask),
       !if(has_ch,
           CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;",
           CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"),
@@ -533,7 +518,7 @@ defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<has_ch = 1>;
 multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
   defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster;
 
-  def NAME : NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
       (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
            Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
       !if(has_ch,
@@ -542,7 +527,7 @@ multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
       [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>,
       Requires<[hasPTX<80>, hasSM<90>]>;
 
-  def NAME # _MC : NVPTXInst<(outs),
+  def _MC : NVPTXInst<(outs),
       (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
            Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
       !if(has_ch,
@@ -561,7 +546,7 @@ def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
   Requires<[hasPTX<80>, hasSM<90>]>;
 
 multiclass CP_ASYNC_BULK_PREFETCH_INTR<bit has_ch> {
-  def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
+  def "" : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
       !if(has_ch,
           "cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;",
           "cp.async.bulk.prefetch.L2.global" # " [$src], $size;"),
@@ -609,19 +594,19 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode>
   defvar asm_str = !if(!eq(mode, "im2col"),
     !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
 
-  def NAME: NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
             !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag),
             !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _MC: NVPTXInst<(outs),
+  def _MC : NVPTXInst<(outs),
                   !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)),
                   !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _CH: NVPTXInst<(outs),
+  def _CH : NVPTXInst<(outs),
                   !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
                   !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _MC_CH: NVPTXInst<(outs),
+  def _MC_CH : NVPTXInst<(outs),
                      !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)),
                      !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;"), []>,
                      Requires<[hasPTX<80>, hasSM<90>]>;
@@ -661,11 +646,11 @@ multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, bit shared32, string mode> {
   defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
   defvar rc = !if(shared32, Int32Regs, Int64Regs);
 
-  def NAME: NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
             !con((ins rc:$src, Int64Regs:$tmap), dims_dag),
             !strconcat(S2G_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _CH: NVPTXInst<(outs),
+  def _CH : NVPTXInst<(outs),
                   !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch)),
                   !strconcat(S2G_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
@@ -685,11 +670,11 @@ multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, bit shared32, string mode>
   defvar prefix = "cp.reduce.async.bulk.tensor" # "." # dim # "d" # ".global.shared::cta";
   defvar suffix = "." # mode # ".bulk_group";
 
-  def NAME: NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
             !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)),
             !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _CH: NVPTXInst<(outs),
+  def _CH : NVPTXInst<(outs),
                   !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch, TMAReductionFlags:$red_op)),
                   !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
@@ -735,11 +720,11 @@ multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
   defvar asm_str = !if(!eq(mode, "im2col"),
     !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
 
-  def NAME: NVPTXInst<(outs),
+  def "" : NVPTXInst<(outs),
             !con((ins Int64Regs:$tmap), dims_dag, im2col_dag),
             !strconcat(PREFETCH_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
-  def NAME # _CH: NVPTXInst<(outs),
+  def _CH : NVPTXInst<(outs),
                   !con((ins Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
                   !strconcat(PREFETCH_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
@@ -755,10 +740,10 @@ foreach dim = [1, 2, 3, 4, 5] in {
 //Prefetch and Prefetchu 
 
 class PREFETCH_INTRS<string InstName> :
-          NVPTXInst<(outs), (ins Int64Regs:$addr),
-          InstName # " [$addr];",
+          BasicNVPTXInst<(outs), (ins ADDR:$addr),
+          InstName,
           [(!cast<Intrinsic>(!strconcat("int_nvvm_",
-          !subst(".", "_", InstName))) i64:$addr)]>,
+          !subst(".", "_", InstName))) addr:$addr)]>,
           Requires<[hasPTX<80>, hasSM<90>]>;
    
 
@@ -769,36 +754,39 @@ def PREFETCH_LOCAL_L1  : PREFETCH_INTRS<"prefetch.local.L1">;
 def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">;
 def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">;
 
-def PREFETCH_GLOBAL_L2_EVICT_NORMAL : NVPTXInst<(outs), (ins Int64Regs:$addr),
-                                      "prefetch.global.L2::evict_normal" # " [$addr];",
-                                      [(!cast<Intrinsic>("int_nvvm_prefetch_global_L2_evict_normal") i64:$addr)]>,
+def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+                                      "prefetch.global.L2::evict_normal",
+                                      [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>,
                                       Requires<[hasPTX<80>, hasSM<90>]>;
 
-def PREFETCH_GLOBAL_L2_EVICT_LAST   : NVPTXInst<(outs), (ins Int64Regs:$addr),
-                                      "prefetch.global.L2::evict_last" # " [$addr];",
-                                      [(!cast<Intrinsic>("int_nvvm_prefetch_global_L2_evict_last") i64:$addr)]>,
+def PREFETCH_GLOBAL_L2_EVICT_LAST   : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+                                      "prefetch.global.L2::evict_last",
+                                      [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>,
                                       Requires<[hasPTX<80>, hasSM<90>]>;
 
 
 def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
 
 //Applypriority intrinsics
-class APPLYPRIORITY_L2_INTRS<string addr> :
-          NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
-          StrJoin<".", ["applypriority", addr , "L2::evict_normal"]>.ret # " [$addr], $size;",
-          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_applypriority", addr , "L2_evict_normal"]>.ret)
-          i64:$addr, i64:$size)]>,
+class APPLYPRIORITY_L2_INTRS<string addrspace> :
+          BasicNVPTXInst<(outs), (ins ADDR:$addr, Int64Regs:$size),
+          StrJoin<".", ["applypriority", addrspace , "L2::evict_normal"]>.ret,
+          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_applypriority", addrspace , "L2_evict_normal"]>.ret)
+          addr:$addr, i64:$size)]>,
           Requires<[hasPTX<74>, hasSM<80>]>;
 
 def APPLYPRIORITY_L2_EVICT_NORMAL        : APPLYPRIORITY_L2_INTRS<"">;
 def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">;
 
 //Discard Intrinsics
-class DISCARD_L2_INTRS<string Addr> :
-          NVPTXInst<(outs), (ins Int64Regs:$addr),
-          StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], 128;",
-          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret)
-          i64:$addr, (i64 128))]>,
+
+def discard_size_imm : TImmLeaf<i64, [{ return Imm == 128; }]>;
+
+class DISCARD_L2_INTRS<string addrspace> :
+          BasicNVPTXInst<(outs), (ins ADDR:$addr, i64imm:$size),
+          StrJoin<".", ["discard", addrspace , "L2"]>.ret,
+          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", addrspace , "L2"]>.ret)
+          addr:$addr, discard_size_imm:$size)]>,
           Requires<[hasPTX<74>, hasSM<80>]>;
 
 def DISCARD_L2        : DISCARD_L2_INTRS<"">;
@@ -809,8 +797,8 @@ def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">;
 //-----------------------------------
 
 multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count),
-           "mbarrier.init" # AddrSpace # ".b64 [$addr], $count;",
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count),
+           "mbarrier.init" # AddrSpace # ".b64",
     [(Intrin addr:$addr, i32:$count)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -820,8 +808,8 @@ defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
                                           int_nvvm_mbarrier_init_shared>;
 
 multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs), (ins ADDR:$addr),
-           "mbarrier.inval" # AddrSpace # ".b64 [$addr];",
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
+           "mbarrier.inval" # AddrSpace # ".b64",
     [(Intrin addr:$addr)]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -831,8 +819,8 @@ defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
                                             int_nvvm_mbarrier_inval_shared>;
 
 multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
-           "mbarrier.arrive" # AddrSpace # ".b64 $state, [$addr];",
+  def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
+           "mbarrier.arrive" # AddrSpace # ".b64",
     [(set i64:$state, (Intrin addr:$addr))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -842,9 +830,9 @@ defm MBARRIER_ARRIVE_SHARED :
   MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
 
 multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int64Regs:$state),
+  def "" : BasicNVPTXInst<(outs Int64Regs:$state),
            (ins ADDR:$addr, Int32Regs:$count),
-           "mbarrier.arrive.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;",
+           "mbarrier.arrive.noComplete" # AddrSpace # ".b64",
     [(set i64:$state, (Intrin addr:$addr, i32:$count))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -855,8 +843,8 @@ defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
   MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
 
 multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
-           "mbarrier.arrive_drop" # AddrSpace # ".b64 $state, [$addr];",
+  def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr),
+           "mbarrier.arrive_drop" # AddrSpace # ".b64",
            [(set i64:$state, (Intrin addr:$addr))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -867,9 +855,9 @@ defm MBARRIER_ARRIVE_DROP_SHARED :
   MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
 
 multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int64Regs:$state),
+  def "" : BasicNVPTXInst<(outs Int64Regs:$state),
            (ins ADDR:$addr, Int32Regs:$count),
-           "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;",
+           "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64",
            [(set i64:$state, (Intrin addr:$addr, i32:$count))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -881,8 +869,8 @@ defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
                        int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
 
 multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
-  def "" : NVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state),
-           "mbarrier.test_wait" # AddrSpace # ".b64 $res, [$addr], $state;",
+  def "" : BasicNVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state),
+           "mbarrier.test_wait" # AddrSpace # ".b64",
            [(set i1:$res, (Intrin addr:$addr, i64:$state))]>,
     Requires<[hasPTX<70>, hasSM<80>]>;
 }
@@ -1790,93 +1778,74 @@ def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b),
 def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b),
           (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>;
 
-def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn Int32Regs:$a),
+def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a),
           (CVT_e4m3x2_f16x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu Int32Regs:$a),
+def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a),
           (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn Int32Regs:$a),
+def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a),
           (CVT_e5m2x2_f16x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu Int32Regs:$a),
+def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a),
           (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>;
 
-def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn Int16Regs:$a),
+def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a),
           (CVT_f16x2_e4m3x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu Int16Regs:$a),
+def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a),
           (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>;
-def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a),
+def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a),
           (CVT_f16x2_e5m2x2 $a, CvtRN)>;
-def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a),
+def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a),
           (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
 
-def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b),
-          (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b),
-          (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b),
-          (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b),
-          (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-
-def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a),
-          (CVT_f16x2_e2m3x2 $a, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a),
-          (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a),
-          (CVT_f16x2_e3m2x2 $a, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a),
-          (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-      
-def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b),
-          (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b),
-          (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-      
-def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn Int16Regs:$a),
-          (CVT_f16x2_e2m1x2 $a, CvtRN)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu Int16Regs:$a),
-          (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-
-def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b),
-          (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b),
-          (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b),
-          (CVT_ue8m0x2_f32 $a, $b, CvtRP)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b),
-          (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-          
-def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz Int32Regs:$a),
-          (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite Int32Regs:$a),
-          (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp Int32Regs:$a),
-          (CVT_ue8m0x2_bf16x2 $a, CvtRP)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite Int32Regs:$a),
-          (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
-          
-def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a),
-          (CVT_bf16x2_ue8m0x2 $a)>,
-      Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
+let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in {
+  def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b),
+            (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>;
+  def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b),
+            (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>;
+  def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b),
+            (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>;
+  def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b),
+            (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>;
+
+  def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a),
+            (CVT_f16x2_e2m3x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a),
+            (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>;
+  def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a),
+            (CVT_f16x2_e3m2x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a),
+            (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>;
+
+  def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b),
+            (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>;
+  def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b),
+            (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>;
+
+  def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn i16:$a),
+            (CVT_f16x2_e2m1x2 $a, CvtRN)>;
+  def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu i16:$a),
+            (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>;
+
+  def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b),
+            (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>;
+  def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b),
+            (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>;
+  def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b),
+            (CVT_ue8m0x2_f32 $a, $b, CvtRP)>;
+  def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b),
+            (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>;
+
+  def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz v2bf16:$a),
+            (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>;
+  def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite v2bf16:$a),
+            (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>;
+  def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp v2bf16:$a),
+            (CVT_ue8m0x2_bf16x2 $a, CvtRP)>;
+  def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite v2bf16:$a),
+            (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>;
+
+  def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a),
+            (CVT_bf16x2_ue8m0x2 $a)>;
+}
 
 //
 // FNS
@@ -1920,14 +1889,14 @@ class ATOMIC_GENERIC_CHK <dag frag>
 
 multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
                       SDPatternOperator op, list<Predicate> preds> {
-  defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;";
+  defvar asm_str = "atom" # sem_str # as_str # "." # op_str;
   let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
-    def r : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b),
+    def r : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b))]>,
     Requires<preds>;
     if t.SupportsImm then
-      def i : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b),
+      def i : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b),
         asm_str,
         [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b)))]>,
       Requires<preds>;
@@ -1937,27 +1906,27 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
 // has 3 operands
 multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str,
                       SDPatternOperator op, list<Predicate> preds> {
-  defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;";
+  defvar asm_str = "atom" # sem_str # as_str # "." # op_str;
   let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
-    def rr : NVPTXInst<(outs t.RC:$dst),
+    def rr : BasicNVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.RC:$b, t.RC:$c),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>,
     Requires<preds>;
 
-    def ir : NVPTXInst<(outs t.RC:$dst),
+    def ir : BasicNVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.Imm:$b, t.RC:$c),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>,
     Requires<preds>;
 
-    def ri : NVPTXInst<(outs t.RC:$dst),
+    def ri : BasicNVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.RC:$b, t.Imm:$c),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>,
     Requires<preds>;
 
-    def ii : NVPTXInst<(outs t.RC:$dst),
+    def ii : BasicNVPTXInst<(outs t.RC:$dst),
       (ins ADDR:$addr, t.Imm:$b, t.Imm:$c),
       asm_str,
       [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>,
@@ -2100,7 +2069,7 @@ multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr,
     // For now we only need variants for generic space pointers.
     foreach space = ["gen"] in {
       defm _#scope#space : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, scope, space,
-                         t, !listconcat(Preds,[hasAtomScope])>;
+                         t, !listconcat(Preds, [hasAtomScope])>;
     }
   }
 }
@@ -4454,1956 +4423,616 @@ defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", Int32Regs>;
 //-----------------------------------
 
 let IsSurfTexQuery = true in {
-def TXQ_CHANNEL_ORDER_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.channel_order.b32 \t$d, [$a];",
-              []>;
-def TXQ_CHANNEL_ORDER_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.channel_order.b32 \t$d, [$a];",
-              []>;
-def TXQ_CHANNEL_DATA_TYPE_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.channel_data_type.b32 \t$d, [$a];",
-              []>;
-def TXQ_CHANNEL_DATA_TYPE_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.channel_data_type.b32 \t$d, [$a];",
-              []>;
-def TXQ_WIDTH_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.width.b32 \t$d, [$a];",
-              []>;
-def TXQ_WIDTH_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.width.b32 \t$d, [$a];",
-              []>;
-def TXQ_HEIGHT_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.height.b32 \t$d, [$a];",
-              []>;
-def TXQ_HEIGHT_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.height.b32 \t$d, [$a];",
-              []>;
-def TXQ_DEPTH_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.depth.b32 \t$d, [$a];",
-              []>;
-def TXQ_DEPTH_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.depth.b32 \t$d, [$a];",
-              []>;
-def TXQ_ARRAY_SIZE_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.array_size.b32 \t$d, [$a];",
-              []>;
-def TXQ_ARRAY_SIZE_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.array_size.b32 \t$d, [$a];",
-              []>;
-def TXQ_NUM_SAMPLES_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.num_samples.b32 \t$d, [$a];",
-              []>;
-def TXQ_NUM_SAMPLES_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.num_samples.b32 \t$d, [$a];",
-              []>;
-def TXQ_NUM_MIPMAP_LEVELS_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "txq.num_mipmap_levels.b32 \t$d, [$a];",
-              []>;
-def TXQ_NUM_MIPMAP_LEVELS_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "txq.num_mipmap_levels.b32 \t$d, [$a];",
-              []>;
-}
-
-def : Pat<(int_nvvm_txq_channel_order i64:$a),
-          (TXQ_CHANNEL_ORDER_R $a)>;
-def : Pat<(int_nvvm_txq_channel_data_type i64:$a),
-          (TXQ_CHANNEL_DATA_TYPE_R $a)>;
-def : Pat<(int_nvvm_txq_width i64:$a),
-          (TXQ_WIDTH_R $a)>;
-def : Pat<(int_nvvm_txq_height i64:$a),
-          (TXQ_HEIGHT_R $a)>;
-def : Pat<(int_nvvm_txq_depth i64:$a),
-          (TXQ_DEPTH_R $a)>;
-def : Pat<(int_nvvm_txq_array_size i64:$a),
-          (TXQ_ARRAY_SIZE_R $a)>;
-def : Pat<(int_nvvm_txq_num_samples i64:$a),
-          (TXQ_NUM_SAMPLES_R $a)>;
-def : Pat<(int_nvvm_txq_num_mipmap_levels i64:$a),
-          (TXQ_NUM_MIPMAP_LEVELS_R $a)>;
-
+  foreach query = ["channel_order", "channel_data_type", "width", "height", 
+                   "depth", "array_size", "num_samples", "num_mipmap_levels"] in {
+    def TXQ_ # !toupper(query) # _R
+      : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                  "txq." # query # ".b32 \t$d, [$a];",
+                  [(set i32:$d, (!cast<Intrinsic>("int_nvvm_txq_" # query) i64:$a))]>;
+    def TXQ_ # !toupper(query) # _I
+      : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+                  "txq." # query # ".b32 \t$d, [$a];",
+                  []>;
+  }
+}
 
 //-----------------------------------
 // Surface Query Intrinsics
 //-----------------------------------
 
 let IsSurfTexQuery = true in {
-def SUQ_CHANNEL_ORDER_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.channel_order.b32 \t$d, [$a];",
-              []>;
-def SUQ_CHANNEL_ORDER_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.channel_order.b32 \t$d, [$a];",
-              []>;
-def SUQ_CHANNEL_DATA_TYPE_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.channel_data_type.b32 \t$d, [$a];",
-              []>;
-def SUQ_CHANNEL_DATA_TYPE_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.channel_data_type.b32 \t$d, [$a];",
-              []>;
-def SUQ_WIDTH_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.width.b32 \t$d, [$a];",
-              []>;
-def SUQ_WIDTH_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.width.b32 \t$d, [$a];",
-              []>;
-def SUQ_HEIGHT_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.height.b32 \t$d, [$a];",
-              []>;
-def SUQ_HEIGHT_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.height.b32 \t$d, [$a];",
-              []>;
-def SUQ_DEPTH_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.depth.b32 \t$d, [$a];",
-              []>;
-def SUQ_DEPTH_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.depth.b32 \t$d, [$a];",
-              []>;
-def SUQ_ARRAY_SIZE_R
-  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-              "suq.array_size.b32 \t$d, [$a];",
-              []>;
-def SUQ_ARRAY_SIZE_I
-  : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
-              "suq.array_size.b32 \t$d, [$a];",
-              []>;
-}
-
-def : Pat<(int_nvvm_suq_channel_order i64:$a),
-          (SUQ_CHANNEL_ORDER_R $a)>;
-def : Pat<(int_nvvm_suq_channel_data_type i64:$a),
-          (SUQ_CHANNEL_DATA_TYPE_R $a)>;
-def : Pat<(int_nvvm_suq_width i64:$a),
-          (SUQ_WIDTH_R $a)>;
-def : Pat<(int_nvvm_suq_height i64:$a),
-          (SUQ_HEIGHT_R $a)>;
-def : Pat<(int_nvvm_suq_depth i64:$a),
-          (SUQ_DEPTH_R $a)>;
-def : Pat<(int_nvvm_suq_array_size i64:$a),
-          (SUQ_ARRAY_SIZE_R $a)>;
-
+  foreach query = ["channel_order", "channel_data_type", "width", "height", "depth", "array_size"] in {
+    def SUQ_ # !toupper(query) # _R
+      : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                  "suq." # query # ".b32 \t$d, [$a];",
+                  [(set i32:$d, (!cast<Intrinsic>("int_nvvm_suq_" # query) i64:$a))]>;
+    def SUQ_ # !toupper(query) # _I
+      : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a),
+                  "suq." # query # ".b32 \t$d, [$a];",
+                  []>;
+  }
+}
 
 //===- Handle Query -------------------------------------------------------===//
 
 // TODO: These intrinsics are not yet finalized, pending PTX ISA design work
 def ISTYPEP_SAMPLER
-  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
-              "istypep.samplerref \t$d, $a;",
+  : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.samplerref",
               [(set i1:$d, (int_nvvm_istypep_sampler i64:$a))]>;
 def ISTYPEP_SURFACE
-  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
-              "istypep.surfref \t$d, $a;",
+  : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.surfref",
               [(set i1:$d, (int_nvvm_istypep_surface i64:$a))]>;
 def ISTYPEP_TEXTURE
-  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
-              "istypep.texref \t$d, $a;",
+  : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.texref",
               [(set i1:$d, (int_nvvm_istypep_texture i64:$a))]>;
 
 //===- Surface Stores -----------------------------------------------------===//
 
 let IsSust = true in {
 
-class SUST_1D_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, intype:$r)),
-                inst # " \t[$s, \\{$x\\}], \\{$r\\};",
-                []>;
+                inst # " \t[$s, \\{$x\\}], \\{$r\\};", pat>;
 multiclass SUST_1D<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+
+  def _R : SUST_1D_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, intype:$r)]>;
+  def _I : SUST_1D_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_B8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>;
-defm SUST_B_1D_B16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>;
-defm SUST_B_1D_B32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>;
-defm SUST_B_1D_B64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>;
+defm SUST_B_1D_I8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>;
+defm SUST_B_1D_I16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>;
+defm SUST_B_1D_I32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>;
+defm SUST_B_1D_I64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>;
 
-defm SUST_B_1D_B8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>;
-defm SUST_B_1D_B16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>;
-defm SUST_B_1D_B32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>;
-defm SUST_B_1D_B64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>;
+defm SUST_B_1D_I8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>;
+defm SUST_B_1D_I16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>;
+defm SUST_B_1D_I32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>;
+defm SUST_B_1D_I64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>;
 
-defm SUST_B_1D_B8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>;
-defm SUST_B_1D_B16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>;
-defm SUST_B_1D_B32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>;
-defm SUST_B_1D_B64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>;
+defm SUST_B_1D_I8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>;
+defm SUST_B_1D_I16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>;
+defm SUST_B_1D_I32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>;
+defm SUST_B_1D_I64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>;
 
-defm SUST_P_1D_B8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>;
-defm SUST_P_1D_B16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>;
-defm SUST_P_1D_B32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>;
+defm SUST_P_1D_I8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>;
+defm SUST_P_1D_I16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>;
+defm SUST_P_1D_I32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>;
 
-class SUST_1D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$x\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, intype:$r, intype:$g)]>;
+  def _I : SUST_1D_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_V2B8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_1D_V2B16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_1D_V2B32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_1D_V2B64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>;
+// int_nvvm_sust_b_1d_v2i8_clamp
 
-defm SUST_B_1D_V2B8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>;
-defm SUST_B_1D_V2B16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>;
-defm SUST_B_1D_V2B32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>;
-defm SUST_B_1D_V2B64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>;
+defm SUST_B_1D_V2I8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_1D_V2I16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_1D_V2I32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_1D_V2I64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_1D_V2B8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>;
-defm SUST_B_1D_V2B16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>;
-defm SUST_B_1D_V2B32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>;
-defm SUST_B_1D_V2B64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>;
+defm SUST_B_1D_V2I8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>;
+defm SUST_B_1D_V2I16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>;
+defm SUST_B_1D_V2I32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>;
+defm SUST_B_1D_V2I64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>;
 
-defm SUST_P_1D_V2B8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>;
-defm SUST_P_1D_V2B16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>;
-defm SUST_P_1D_V2B32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>;
+defm SUST_B_1D_V2I8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>;
+defm SUST_B_1D_V2I16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>;
+defm SUST_B_1D_V2I32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>;
+defm SUST_B_1D_V2I64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>;
 
-class SUST_1D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+defm SUST_P_1D_V2I8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>;
+defm SUST_P_1D_V2I16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>;
+defm SUST_P_1D_V2I32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>;
+
+class SUST_1D_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g,
                                 intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, intype:$r, intype:$g,
+                intype:$b, intype:$a)]>;
+  def _I : SUST_1D_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_V4B8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_1D_V4B16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_1D_V4B32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>;
+defm SUST_B_1D_V4I8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_1D_V4I16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_1D_V4I32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_1D_V4B8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>;
-defm SUST_B_1D_V4B16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>;
-defm SUST_B_1D_V4B32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>;
+defm SUST_B_1D_V4I8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>;
+defm SUST_B_1D_V4I16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>;
+defm SUST_B_1D_V4I32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_1D_V4B8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>;
-defm SUST_B_1D_V4B16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>;
-defm SUST_B_1D_V4B32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>;
+defm SUST_B_1D_V4I8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>;
+defm SUST_B_1D_V4I16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>;
+defm SUST_B_1D_V4I32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_1D_V4B8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>;
-defm SUST_P_1D_V4B16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>;
-defm SUST_P_1D_V4B32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>;
+defm SUST_P_1D_V4I8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>;
+defm SUST_P_1D_V4I16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>;
+defm SUST_P_1D_V4I32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>;
 
-class SUST_1D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r)),
                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_ARRAY<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_ARRAY_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s)>;
+    defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_ARRAY_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, intype:$r)]>;
+  def _I : SUST_1D_ARRAY_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_ARRAY_B8_CLAMP
+defm SUST_B_1D_ARRAY_I8_CLAMP
   : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_B16_CLAMP
+defm SUST_B_1D_ARRAY_I16_CLAMP
   : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_B32_CLAMP
+defm SUST_B_1D_ARRAY_I32_CLAMP
   : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", Int32Regs>;
-defm SUST_B_1D_ARRAY_B64_CLAMP
+defm SUST_B_1D_ARRAY_I64_CLAMP
   : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", Int64Regs>;
 
-defm SUST_B_1D_ARRAY_B8_TRAP
+defm SUST_B_1D_ARRAY_I8_TRAP
   : SUST_1D_ARRAY<"sust.b.a1d.b8.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_B16_TRAP
+defm SUST_B_1D_ARRAY_I16_TRAP
   : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_B32_TRAP
+defm SUST_B_1D_ARRAY_I32_TRAP
   : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", Int32Regs>;
-defm SUST_B_1D_ARRAY_B64_TRAP
+defm SUST_B_1D_ARRAY_I64_TRAP
   : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", Int64Regs>;
 
-defm SUST_B_1D_ARRAY_B8_ZERO
+defm SUST_B_1D_ARRAY_I8_ZERO
   : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_B16_ZERO
+defm SUST_B_1D_ARRAY_I16_ZERO
   : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_B32_ZERO
+defm SUST_B_1D_ARRAY_I32_ZERO
   : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", Int32Regs>;
-defm SUST_B_1D_ARRAY_B64_ZERO
+defm SUST_B_1D_ARRAY_I64_ZERO
   : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", Int64Regs>;
 
-defm SUST_P_1D_ARRAY_B8_TRAP
+defm SUST_P_1D_ARRAY_I8_TRAP
   : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_B16_TRAP
+defm SUST_P_1D_ARRAY_I16_TRAP
   : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_B32_TRAP
+defm SUST_P_1D_ARRAY_I32_TRAP
   : SUST_1D_ARRAY<"sust.p.a1d.b32.trap", Int32Regs>;
 
-class SUST_1D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x,
                                 intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_ARRAY_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x,
+                intype:$r, intype:$g)]>;
+  def _I : SUST_1D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_ARRAY_V2B8_CLAMP
+defm SUST_B_1D_ARRAY_V2I8_CLAMP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B16_CLAMP
+defm SUST_B_1D_ARRAY_V2I16_CLAMP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B32_CLAMP
+defm SUST_B_1D_ARRAY_V2I32_CLAMP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_1D_ARRAY_V2B64_CLAMP
+defm SUST_B_1D_ARRAY_V2I64_CLAMP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_1D_ARRAY_V2B8_TRAP
+defm SUST_B_1D_ARRAY_V2I8_TRAP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B16_TRAP
+defm SUST_B_1D_ARRAY_V2I16_TRAP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B32_TRAP
+defm SUST_B_1D_ARRAY_V2I32_TRAP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", Int32Regs>;
-defm SUST_B_1D_ARRAY_V2B64_TRAP
+defm SUST_B_1D_ARRAY_V2I64_TRAP
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", Int64Regs>;
 
-defm SUST_B_1D_ARRAY_V2B8_ZERO
+defm SUST_B_1D_ARRAY_V2I8_ZERO
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B16_ZERO
+defm SUST_B_1D_ARRAY_V2I16_ZERO
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_V2B32_ZERO
+defm SUST_B_1D_ARRAY_V2I32_ZERO
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", Int32Regs>;
-defm SUST_B_1D_ARRAY_V2B64_ZERO
+defm SUST_B_1D_ARRAY_V2I64_ZERO
   : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", Int64Regs>;
 
-defm SUST_P_1D_ARRAY_V2B8_TRAP
+defm SUST_P_1D_ARRAY_V2I8_TRAP
   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_V2B16_TRAP
+defm SUST_P_1D_ARRAY_V2I16_TRAP
   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_V2B32_TRAP
+defm SUST_P_1D_ARRAY_V2I32_TRAP
   : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b32.trap", Int32Regs>;
 
-class SUST_1D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_1D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x,
                                 intype:$r, intype:$g, intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_1D_ARRAY_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_1D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_1D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x,
+                intype:$r, intype:$g, intype:$b, intype:$a)]>;
+  def _I : SUST_1D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_1D_ARRAY_V4B8_CLAMP
+defm SUST_B_1D_ARRAY_V4I8_CLAMP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B16_CLAMP
+defm SUST_B_1D_ARRAY_V4I16_CLAMP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B32_CLAMP
+defm SUST_B_1D_ARRAY_V4I32_CLAMP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_1D_ARRAY_V4B8_TRAP
+defm SUST_B_1D_ARRAY_V4I8_TRAP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B16_TRAP
+defm SUST_B_1D_ARRAY_V4I16_TRAP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B32_TRAP
+defm SUST_B_1D_ARRAY_V4I32_TRAP
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_1D_ARRAY_V4B8_ZERO
+defm SUST_B_1D_ARRAY_V4I8_ZERO
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B16_ZERO
+defm SUST_B_1D_ARRAY_V4I16_ZERO
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", Int16Regs>;
-defm SUST_B_1D_ARRAY_V4B32_ZERO
+defm SUST_B_1D_ARRAY_V4I32_ZERO
   : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_1D_ARRAY_V4B8_TRAP
+defm SUST_P_1D_ARRAY_V4I8_TRAP
   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_V4B16_TRAP
+defm SUST_P_1D_ARRAY_V4I16_TRAP
   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", Int16Regs>;
-defm SUST_P_1D_ARRAY_V4B32_TRAP
+defm SUST_P_1D_ARRAY_V4I32_TRAP
   : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", Int32Regs>;
 
-class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r)),
                 inst # " \t[$s, \\{$x, $y\\}], \\{$r\\};",
-                []>;
+                pat>;
 multiclass SUST_2D<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, intype:$r)]>;
+  def _I : SUST_2D_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_B8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>;
-defm SUST_B_2D_B16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>;
-defm SUST_B_2D_B32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>;
-defm SUST_B_2D_B64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>;
+defm SUST_B_2D_I8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>;
+defm SUST_B_2D_I16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>;
+defm SUST_B_2D_I32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>;
+defm SUST_B_2D_I64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>;
 
-defm SUST_B_2D_B8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>;
-defm SUST_B_2D_B16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>;
-defm SUST_B_2D_B32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>;
-defm SUST_B_2D_B64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>;
+defm SUST_B_2D_I8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>;
+defm SUST_B_2D_I16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>;
+defm SUST_B_2D_I32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>;
+defm SUST_B_2D_I64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>;
 
-defm SUST_B_2D_B8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>;
-defm SUST_B_2D_B16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>;
-defm SUST_B_2D_B32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>;
-defm SUST_B_2D_B64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>;
+defm SUST_B_2D_I8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>;
+defm SUST_B_2D_I16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>;
+defm SUST_B_2D_I32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>;
+defm SUST_B_2D_I64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>;
 
-defm SUST_P_2D_B8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>;
-defm SUST_P_2D_B16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>;
-defm SUST_P_2D_B32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>;
+defm SUST_P_2D_I8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>;
+defm SUST_P_2D_I16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>;
+defm SUST_P_2D_I32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>;
 
-class SUST_2D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y,
                                 intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                intype:$r, intype:$g)]>;
+  def _I : SUST_2D_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_V2B8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_2D_V2B16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_2D_V2B32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_2D_V2B64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>;
+defm SUST_B_2D_V2I8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_2D_V2I16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_2D_V2I32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_2D_V2I64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_2D_V2B8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>;
-defm SUST_B_2D_V2B16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>;
-defm SUST_B_2D_V2B32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>;
-defm SUST_B_2D_V2B64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>;
+defm SUST_B_2D_V2I8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>;
+defm SUST_B_2D_V2I16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>;
+defm SUST_B_2D_V2I32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>;
+defm SUST_B_2D_V2I64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>;
 
-defm SUST_B_2D_V2B8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>;
-defm SUST_B_2D_V2B16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>;
-defm SUST_B_2D_V2B32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>;
-defm SUST_B_2D_V2B64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>;
+defm SUST_B_2D_V2I8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>;
+defm SUST_B_2D_V2I16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>;
+defm SUST_B_2D_V2I32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>;
+defm SUST_B_2D_V2I64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>;
 
-defm SUST_P_2D_V2B8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>;
-defm SUST_P_2D_V2B16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>;
-defm SUST_P_2D_V2B32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>;
+defm SUST_P_2D_V2I8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>;
+defm SUST_P_2D_V2I16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>;
+defm SUST_P_2D_V2I32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>;
 
-class SUST_2D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y,
                                 intype:$r, intype:$g, intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                intype:$r, intype:$g, intype:$b, intype:$a)]>;
+  def _I : SUST_2D_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_V4B8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_2D_V4B16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_2D_V4B32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>;
+defm SUST_B_2D_V4I8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_2D_V4I16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_2D_V4I32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_2D_V4B8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>;
-defm SUST_B_2D_V4B16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>;
-defm SUST_B_2D_V4B32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>;
+defm SUST_B_2D_V4I8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>;
+defm SUST_B_2D_V4I16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>;
+defm SUST_B_2D_V4I32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_2D_V4B8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>;
-defm SUST_B_2D_V4B16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>;
-defm SUST_B_2D_V4B32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>;
+defm SUST_B_2D_V4I8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>;
+defm SUST_B_2D_V4I16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>;
+defm SUST_B_2D_V4I32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_2D_V4B8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>;
-defm SUST_P_2D_V4B16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>;
-defm SUST_P_2D_V4B32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>;
+defm SUST_P_2D_V4I8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>;
+defm SUST_P_2D_V4I16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>;
+defm SUST_P_2D_V4I32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>;
 
-class SUST_2D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_ARRAY_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
                                 intype:$r)),
                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_ARRAY<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_ARRAY_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_ARRAY_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                intype:$r)]>;
+  def _I : SUST_2D_ARRAY_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_ARRAY_B8_CLAMP
+defm SUST_B_2D_ARRAY_I8_CLAMP
   : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_B16_CLAMP
+defm SUST_B_2D_ARRAY_I16_CLAMP
   : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_B32_CLAMP
+defm SUST_B_2D_ARRAY_I32_CLAMP
   : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", Int32Regs>;
-defm SUST_B_2D_ARRAY_B64_CLAMP
+defm SUST_B_2D_ARRAY_I64_CLAMP
   : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", Int64Regs>;
 
-defm SUST_B_2D_ARRAY_B8_TRAP
+defm SUST_B_2D_ARRAY_I8_TRAP
   : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_B16_TRAP
+defm SUST_B_2D_ARRAY_I16_TRAP
   : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_B32_TRAP
+defm SUST_B_2D_ARRAY_I32_TRAP
   : SUST_2D_ARRAY<"sust.b.a2d.b32.trap", Int32Regs>;
-defm SUST_B_2D_ARRAY_B64_TRAP
+defm SUST_B_2D_ARRAY_I64_TRAP
   : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", Int64Regs>;
 
-defm SUST_B_2D_ARRAY_B8_ZERO
+defm SUST_B_2D_ARRAY_I8_ZERO
   : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_B16_ZERO
+defm SUST_B_2D_ARRAY_I16_ZERO
   : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_B32_ZERO
+defm SUST_B_2D_ARRAY_I32_ZERO
   : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", Int32Regs>;
-defm SUST_B_2D_ARRAY_B64_ZERO
+defm SUST_B_2D_ARRAY_I64_ZERO
   : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", Int64Regs>;
 
-defm SUST_P_2D_ARRAY_B8_TRAP
+defm SUST_P_2D_ARRAY_I8_TRAP
   : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_B16_TRAP
+defm SUST_P_2D_ARRAY_I16_TRAP
   : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_B32_TRAP
+defm SUST_P_2D_ARRAY_I32_TRAP
   : SUST_2D_ARRAY<"sust.p.a2d.b32.trap", Int32Regs>;
 
-class SUST_2D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_ARRAY_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
                                 intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_ARRAY_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_ARRAY_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                intype:$r, intype:$g)]>;
+  def _I : SUST_2D_ARRAY_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_ARRAY_V2B8_CLAMP
+defm SUST_B_2D_ARRAY_V2I8_CLAMP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B16_CLAMP
+defm SUST_B_2D_ARRAY_V2I16_CLAMP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B32_CLAMP
+defm SUST_B_2D_ARRAY_V2I32_CLAMP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_2D_ARRAY_V2B64_CLAMP
+defm SUST_B_2D_ARRAY_V2I64_CLAMP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_2D_ARRAY_V2B8_TRAP
+defm SUST_B_2D_ARRAY_V2I8_TRAP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B16_TRAP
+defm SUST_B_2D_ARRAY_V2I16_TRAP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B32_TRAP
+defm SUST_B_2D_ARRAY_V2I32_TRAP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", Int32Regs>;
-defm SUST_B_2D_ARRAY_V2B64_TRAP
+defm SUST_B_2D_ARRAY_V2I64_TRAP
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", Int64Regs>;
 
-defm SUST_B_2D_ARRAY_V2B8_ZERO
+defm SUST_B_2D_ARRAY_V2I8_ZERO
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B16_ZERO
+defm SUST_B_2D_ARRAY_V2I16_ZERO
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_V2B32_ZERO
+defm SUST_B_2D_ARRAY_V2I32_ZERO
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", Int32Regs>;
-defm SUST_B_2D_ARRAY_V2B64_ZERO
+defm SUST_B_2D_ARRAY_V2I64_ZERO
   : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", Int64Regs>;
 
-defm SUST_P_2D_ARRAY_V2B8_TRAP
+defm SUST_P_2D_ARRAY_V2I8_TRAP
   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_V2B16_TRAP
+defm SUST_P_2D_ARRAY_V2I16_TRAP
   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_V2B32_TRAP
+defm SUST_P_2D_ARRAY_V2I32_TRAP
   : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b32.trap", Int32Regs>;
 
-class SUST_2D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_2D_ARRAY_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
                                 intype:$r, intype:$g, intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_2D_ARRAY_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_2D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_2D_ARRAY_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                intype:$r, intype:$g, intype:$b, intype:$a)]>;
+  def _I : SUST_2D_ARRAY_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_2D_ARRAY_V4B8_CLAMP
+defm SUST_B_2D_ARRAY_V4I8_CLAMP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B16_CLAMP
+defm SUST_B_2D_ARRAY_V4I16_CLAMP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B32_CLAMP
+defm SUST_B_2D_ARRAY_V4I32_CLAMP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", Int32Regs>;
 
-defm SUST_B_2D_ARRAY_V4B8_TRAP
+defm SUST_B_2D_ARRAY_V4I8_TRAP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B16_TRAP
+defm SUST_B_2D_ARRAY_V4I16_TRAP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B32_TRAP
+defm SUST_B_2D_ARRAY_V4I32_TRAP
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", Int32Regs>;
 
-defm SUST_B_2D_ARRAY_V4B8_ZERO
+defm SUST_B_2D_ARRAY_V4I8_ZERO
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B16_ZERO
+defm SUST_B_2D_ARRAY_V4I16_ZERO
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", Int16Regs>;
-defm SUST_B_2D_ARRAY_V4B32_ZERO
+defm SUST_B_2D_ARRAY_V4I32_ZERO
   : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", Int32Regs>;
 
-defm SUST_P_2D_ARRAY_V4B8_TRAP
+defm SUST_P_2D_ARRAY_V4I8_TRAP
   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_V4B16_TRAP
+defm SUST_P_2D_ARRAY_V4I16_TRAP
   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", Int16Regs>;
-defm SUST_P_2D_ARRAY_V4B32_TRAP
+defm SUST_P_2D_ARRAY_V4I32_TRAP
   : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b32.trap", Int32Regs>;
 
-class SUST_3D_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_3D_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
                                 intype:$r)),
                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
-                []>;
+                pat>;
 multiclass SUST_3D<string inst, NVPTXRegClass intype> {
-  def _R : SUST_3D_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_3D_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_3D_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                intype:$r)]>;
+  def _I : SUST_3D_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_3D_B8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>;
-defm SUST_B_3D_B16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>;
-defm SUST_B_3D_B32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>;
-defm SUST_B_3D_B64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>;
+defm SUST_B_3D_I8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>;
+defm SUST_B_3D_I16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>;
+defm SUST_B_3D_I32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>;
+defm SUST_B_3D_I64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>;
 
-defm SUST_B_3D_B8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>;
-defm SUST_B_3D_B16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>;
-defm SUST_B_3D_B32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>;
-defm SUST_B_3D_B64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>;
+defm SUST_B_3D_I8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>;
+defm SUST_B_3D_I16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>;
+defm SUST_B_3D_I32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>;
+defm SUST_B_3D_I64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>;
 
-defm SUST_B_3D_B8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>;
-defm SUST_B_3D_B16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>;
-defm SUST_B_3D_B32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>;
-defm SUST_B_3D_B64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>;
+defm SUST_B_3D_I8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>;
+defm SUST_B_3D_I16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>;
+defm SUST_B_3D_I32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>;
+defm SUST_B_3D_I64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>;
 
-defm SUST_P_3D_B8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>;
-defm SUST_P_3D_B16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>;
-defm SUST_P_3D_B32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>;
+defm SUST_P_3D_I8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>;
+defm SUST_P_3D_I16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>;
+defm SUST_P_3D_I32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>;
 
-class SUST_3D_V2_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_3D_V2_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
                                 intype:$r, intype:$g)),
                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g\\};",
-                []>;
+                pat>;
 multiclass SUST_3D_V2<string inst, NVPTXRegClass intype> {
-  def _R : SUST_3D_V2_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_3D_V2_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                intype:$r, intype:$g)]>;
+  def _I : SUST_3D_V2_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_3D_V2B8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>;
-defm SUST_B_3D_V2B16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>;
-defm SUST_B_3D_V2B32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>;
-defm SUST_B_3D_V2B64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>;
+defm SUST_B_3D_V2I8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>;
+defm SUST_B_3D_V2I16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>;
+defm SUST_B_3D_V2I32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>;
+defm SUST_B_3D_V2I64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>;
 
-defm SUST_B_3D_V2B8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>;
-defm SUST_B_3D_V2B16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>;
-defm SUST_B_3D_V2B32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>;
-defm SUST_B_3D_V2B64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>;
+defm SUST_B_3D_V2I8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>;
+defm SUST_B_3D_V2I16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>;
+defm SUST_B_3D_V2I32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>;
+defm SUST_B_3D_V2I64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>;
 
-defm SUST_B_3D_V2B8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>;
-defm SUST_B_3D_V2B16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>;
-defm SUST_B_3D_V2B32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>;
-defm SUST_B_3D_V2B64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>;
+defm SUST_B_3D_V2I8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>;
+defm SUST_B_3D_V2I16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>;
+defm SUST_B_3D_V2I32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>;
+defm SUST_B_3D_V2I64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>;
 
-defm SUST_P_3D_V2B8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>;
-defm SUST_P_3D_V2B16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>;
-defm SUST_P_3D_V2B32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>;
+defm SUST_P_3D_V2I8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>;
+defm SUST_P_3D_V2I16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>;
+defm SUST_P_3D_V2I32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>;
 
-class SUST_3D_V4_base<string inst, NVPTXRegClass intype, dag surf>
+class SUST_3D_V4_base<string inst, NVPTXRegClass intype, dag surf, list<dag> pat>
     : NVPTXInst<(outs),
                 !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
                                 intype:$r, intype:$g, intype:$b, intype:$a)),
                 inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g, $b, $a\\};",
-                []>;
+                pat>;
 multiclass SUST_3D_V4<string inst, NVPTXRegClass intype> {
-  def _R : SUST_3D_V4_base<inst, intype, (ins Int64Regs:$s)>;
-  def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s)>;
+  defvar intr = !cast<Intrinsic>("int_nvvm_" # !tolower(NAME));
+  def _R : SUST_3D_V4_base<inst, intype, (ins Int64Regs:$s),
+              [(intr Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                intype:$r, intype:$g, intype:$b, intype:$a)]>;
+  def _I : SUST_3D_V4_base<inst, intype, (ins i64imm:$s), []>;
 }
 
-defm SUST_B_3D_V4B8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>;
-defm SUST_B_3D_V4B16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>;
-defm SUST_B_3D_V4B32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>;
+defm SUST_B_3D_V4I8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>;
+defm SUST_B_3D_V4I16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>;
+defm SUST_B_3D_V4I32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>;
+
+defm SUST_B_3D_V4I8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>;
+defm SUST_B_3D_V4I16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>;
+defm SUST_B_3D_V4I32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>;
+
+defm SUST_B_3D_V4I8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>;
+defm SUST_B_3D_V4I16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>;
+defm SUST_B_3D_V4I32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>;
 
-defm SUST_B_3D_V4B8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>;
-defm SUST_B_3D_V4B16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>;
-defm SUST_B_3D_V4B32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>;
-
-defm SUST_B_3D_V4B8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>;
-defm SUST_B_3D_V4B16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>;
-defm SUST_B_3D_V4B32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>;
-
-defm SUST_P_3D_V4B8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>;
-defm SUST_P_3D_V4B16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>;
-defm SUST_P_3D_V4B32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>;
-
-}
-
-// Surface store instruction patterns
-// I'm not sure why we can't just include these in the instruction definitions,
-// but TableGen complains of type errors :(
-
-// .clamp variant
-def : Pat<(int_nvvm_sust_b_1d_i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
+defm SUST_P_3D_V4I8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>;
+defm SUST_P_3D_V4I16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>;
+defm SUST_P_3D_V4I32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>;
+
+}
 
-def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_ARRAY_B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_ARRAY_B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_2D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_2D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_ARRAY_B32_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_ARRAY_B64_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
-           Int32Regs:$g),
-          (SUST_B_2D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
-           Int64Regs:$g),
-          (SUST_B_2D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_3d_i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r),
-          (SUST_B_3D_B32_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r),
-          (SUST_B_3D_B64_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_3D_V2B32_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_3D_V2B64_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B8_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B16_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_3D_V4B32_CLAMP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-// .trap variant
-def : Pat<(int_nvvm_sust_b_1d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i64_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_ARRAY_B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i64_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_2D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_ARRAY_B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i64_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_ARRAY_B64_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
-           Int32Regs:$g),
-          (SUST_B_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
-           Int64Regs:$g),
-          (SUST_B_2D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_3d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r),
-          (SUST_B_3D_B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r),
-          (SUST_B_3D_B64_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_3D_V2B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i64_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_3D_V2B64_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_3D_V4B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-// .zero variant
-def : Pat<(int_nvvm_sust_b_1d_i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v2i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i8_zero
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i16_zero
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_v4i32_zero
-           Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_1d_array_i8_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i16_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_B_1D_ARRAY_B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i32_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
-          (SUST_B_1D_ARRAY_B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_i64_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r),
-          (SUST_B_1D_ARRAY_B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_1D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_1D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_1D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_1D_ARRAY_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_1D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i8_zero
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i16_zero
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i32_zero
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_2D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v2i64_zero
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_2D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_v4i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_2d_array_i8_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i16_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_B_2D_ARRAY_B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i32_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_B_2D_ARRAY_B32_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_i64_zero
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r),
-          (SUST_B_2D_ARRAY_B64_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_2D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
-           Int32Regs:$g),
-          (SUST_B_2D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r,
-           Int64Regs:$g),
-          (SUST_B_2D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_2D_ARRAY_V4B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_2D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_b_3d_i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_B_3D_B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r),
-          (SUST_B_3D_B32_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r),
-          (SUST_B_3D_B64_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_B_3D_V2B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g),
-          (SUST_B_3D_V2B32_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v2i64_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g),
-          (SUST_B_3D_V2B64_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int64Regs:$r, Int64Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i8_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B8_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i16_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_B_3D_V4B16_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_b_3d_v4i32_zero
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_B_3D_V4B32_ZERO_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-
-def : Pat<(int_nvvm_sust_p_1d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_P_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
-          (SUST_P_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
-          (SUST_P_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_P_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_P_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
-          (SUST_P_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
-          (SUST_P_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
-          (SUST_P_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_p_2d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_P_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_P_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_P_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
-          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
-          (SUST_P_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_P_2D_ARRAY_B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
-          (SUST_P_2D_ARRAY_B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
-          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
-          (SUST_P_2D_ARRAY_B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
-           Int32Regs:$g),
-          (SUST_P_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
-           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l,
-           Int32Regs:$x, Int32Regs:$y,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
-
-
-
-def : Pat<(int_nvvm_sust_p_3d_i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_P_3D_B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_3d_i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r),
-          (SUST_P_3D_B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_3d_i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r),
-          (SUST_P_3D_B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_3D_V2B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g),
-          (SUST_P_3D_V2B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g),
-          (SUST_P_3D_V2B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_3D_V4B8_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
-          (SUST_P_3D_V4B16_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
-
-def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
-           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
-          (SUST_P_3D_V4B32_TRAP_R Int64Regs:$s,
-           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
-           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
 
 //-----------------------------------
 // Read Special Registers
@@ -6411,13 +5040,13 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
 
 class PTX_READ_SREG_R64<string regname, Intrinsic intop, list<Predicate> Preds=[]>
   : NVPTXInst<(outs Int64Regs:$d), (ins),
-              !strconcat("mov.u64 \t$d, %", regname, ";"),
+              "mov.u64 \t$d, %" # regname # ";",
               [(set i64:$d, (intop))]>,
     Requires<Preds>;
 
 class PTX_READ_SREG_R32<string regname, Intrinsic intop, list<Predicate> Preds=[]>
   : NVPTXInst<(outs Int32Regs:$d), (ins),
-              !strconcat("mov.u32 \t$d, %", regname, ";"),
+              "mov.u32 \t$d, %" # regname # ";",
               [(set i32:$d, (intop))]>,
     Requires<Preds>;
 
@@ -6547,7 +5176,7 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
          !or(!eq(ptx_elt_type, "f16"),
              !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<60>],
 
-    !and(!eq(geom,"m8n8k4"),
+    !and(!eq(geom, "m8n8k4"),
          !eq(ptx_elt_type, "f64")) : [hasSM<80>, hasPTX<70>],
 
     // fp16 -> fp16/fp32 @ m8n32k16/m32n8k16
@@ -6557,46 +5186,46 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
              !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<61>],
 
     // u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16
-    !and(!or(!eq(geom,"m16n16k16"),
-             !eq(geom,"m8n32k16"),
-             !eq(geom,"m32n8k16")),
+    !and(!or(!eq(geom, "m16n16k16"),
+             !eq(geom, "m8n32k16"),
+             !eq(geom, "m32n8k16")),
          !or(!eq(ptx_elt_type, "u8"),
              !eq(ptx_elt_type, "s8"),
              !eq(ptx_elt_type, "s32"))) : [hasSM<72>, hasPTX<63>],
 
-    !and(!or(!eq(geom,"m16n16k16"),
-             !eq(geom,"m8n32k16"),
-             !eq(geom,"m32n8k16")),
+    !and(!or(!eq(geom, "m16n16k16"),
+             !eq(geom, "m8n32k16"),
+             !eq(geom, "m32n8k16")),
          !eq(ptx_elt_type, "bf16")) : [hasSM<80>, hasPTX<70>],
 
-    !and(!eq(geom,"m16n16k8"),
+    !and(!eq(geom, "m16n16k8"),
          !eq(ptx_elt_type, "tf32")) : [hasSM<80>, hasPTX<70>],
 
-    !and(!eq(geom,"m16n16k8"),
+    !and(!eq(geom, "m16n16k8"),
          !eq(ptx_elt_type, "f32")) : [hasSM<80>, hasPTX<70>],
 
     // b1 -> s32 @ m8n8k128(b1)
-    !and(!ne(op,"mma"),
-         !eq(geom,"m8n8k128")) : [hasSM<75>, hasPTX<63>],
+    !and(!ne(op, "mma"),
+         !eq(geom, "m8n8k128")) : [hasSM<75>, hasPTX<63>],
 
     // u4/s4 -> s32 @ m8n8k32 (u4/s4)
-    !and(!ne(op,"mma"),
-         !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<63>],
+    !and(!ne(op, "mma"),
+         !eq(geom, "m8n8k32")) : [hasSM<75>, hasPTX<63>],
 
-    !or(!eq(geom,"m16n8k8"),
-        !eq(geom,"m8n8k16")) : [hasSM<75>, hasPTX<65>],
+    !or(!eq(geom, "m16n8k8"),
+        !eq(geom, "m8n8k16")) : [hasSM<75>, hasPTX<65>],
 
-    !and(!ne(ptx_elt_type,"f64"),
+    !and(!ne(ptx_elt_type, "f64"),
          !eq(geom, "m8n8k4")) : [hasSM<70>, hasPTX<64>],
 
     // mma m8n8k32 requires higher PTX version
-    !and(!eq(op,"mma"),
-         !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<65>],
+    !and(!eq(op, "mma"),
+         !eq(geom, "m8n8k32")) : [hasSM<75>, hasPTX<65>],
 
-    !and(!eq(ptx_elt_type,"f64"),
+    !and(!eq(ptx_elt_type, "f64"),
          !eq(geom, "m8n8k4")) : [hasSM<80>, hasPTX<70>],
 
-    !and(!eq(op,"mma"),
+    !and(!eq(op, "mma"),
          !or(!eq(geom, "m16n8k16"),
              !eq(geom, "m16n8k4"),
              !eq(geom, "m16n8k32"),
@@ -6605,28 +5234,28 @@ class WMMA_REGINFO<WMMA_REGS r, string op>
              !eq(geom, "m16n8k128"),
              !eq(geom, "m16n8k256"))) : [hasSM<80>, hasPTX<70>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b16"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b16"),
          !eq(geom, "m8n8")) : [hasSM<75>, hasPTX<65>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8"),
          !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8x16.b6x16_p32"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8x16.b6x16_p32"),
          !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8x16.b4x16_p64"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8x16.b4x16_p64"),
          !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8x16.b6x16_p32"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8x16.b6x16_p32"),
          !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>],
 
-    !and(!eq(op,"ldmatrix"),
-         !eq(ptx_elt_type,"b8x16.b4x16_p64"),
+    !and(!eq(op, "ldmatrix"),
+         !eq(ptx_elt_type, "b8x16.b4x16_p64"),
          !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]);
 
   // template DAGs for instruction inputs/output.
@@ -6655,7 +5284,7 @@ class WMMA_INSTR<string _Intr, list<dag> _Args>
   : NVPTXInst<(outs), (ins), "?", []> {
   Intrinsic Intr = !cast<Intrinsic>(_Intr);
   // Concatenate all arguments into a single dag.
-  dag Args = !foldl((ins), _Args, a, b, !con(a,b));
+  dag Args = !foldl((ins), _Args, a, b, !con(a, b));
   // Pre-build the pattern to match (intrinsic arg0, arg1, ...).
   dag IntrinsicPattern = BuildPatternI<!cast<Intrinsic>(Intr), Args>.ret;
 }
@@ -6761,7 +5390,7 @@ class MMA_OP_PREDICATES<WMMA_REGINFO FragA, string b1op> {
   WMMA_REGINFO Frag = FragA;
   list<Predicate> ret = !listconcat(
     FragA.Predicates,
-    !if(!eq(b1op, ".and.popc"), [hasSM<80>,hasPTX<71>],[])
+    !if(!eq(b1op, ".and.popc"), [hasSM<80>, hasPTX<71>], [])
   );
 }
 // WMMA.MMA
@@ -7008,25 +5637,22 @@ def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>;
 // Tcgen05 intrinsics
 let isConvergent = true in {
 
-multiclass TCGEN05_ALLOC_INTR<NVPTXRegClass rc, string AS, string num, Intrinsic Intr> {
-  def NAME : NVPTXInst<(outs),
-             (ins rc:$dst, Int32Regs:$ncols),
-             !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"),
-             [(Intr rc:$dst, Int32Regs:$ncols)]>,
+multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> {
+  def "" : BasicNVPTXInst<(outs),
+             (ins ADDR:$dst, Int32Regs:$ncols),
+             "tcgen05.alloc.cta_group::" # num # ".sync.aligned" # AS # ".b32",
+             [(Intr addr:$dst, Int32Regs:$ncols)]>,
              Requires<[hasTcgen05Instructions]>;
 }
 
-defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, "", "1", int_nvvm_tcgen05_alloc_cg1>;
-defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, "", "2", int_nvvm_tcgen05_alloc_cg2>;
+defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<"", "1", int_nvvm_tcgen05_alloc_cg1>;
+defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<"", "2", int_nvvm_tcgen05_alloc_cg2>;
 
-defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
-defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<Int64Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
-
-defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
-defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR<Int32Regs, ".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
+defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>;
+defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>;
 
 multiclass TCGEN05_DEALLOC_INTR<string num, Intrinsic Intr> {
-  def NAME : BasicNVPTXInst<(outs),
+  def "" : BasicNVPTXInst<(outs),
              (ins Int32Regs:$tmem_addr, Int32Regs:$ncols),
              "tcgen05.dealloc.cta_group::" # num # ".sync.aligned.b32",
              [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>,
@@ -7036,7 +5662,7 @@ defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1
 defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>;
 
 multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> {
-  def NAME : BasicNVPTXInst<(outs), (ins),
+  def "" : BasicNVPTXInst<(outs), (ins),
              "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned",
              [(Intr)]>,
              Requires<[hasTcgen05Instructions]>;
@@ -7052,36 +5678,33 @@ def tcgen05_wait_st: BasicNVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligne
   [(int_nvvm_tcgen05_wait_st)]>,
   Requires<[hasTcgen05Instructions]>;
 
-multiclass TCGEN05_COMMIT_INTR<NVPTXRegClass rc, string AS, string num> {
-  defvar prefix = "tcgen05.commit.cta_group::" # num;
-  defvar suffix = ".mbarrier::arrive::one.shared::cluster";
+multiclass TCGEN05_COMMIT_INTR<string AS, string num> {
+  defvar prefix = "tcgen05.commit.cta_group::" # num #".mbarrier::arrive::one.shared::cluster";
 
   defvar intr_suffix = !if(!eq(AS, "shared"), "_shared", "") # "_cg" # num;
   defvar Intr = !cast<Intrinsic>("int_nvvm_tcgen05_commit" # intr_suffix);
   defvar IntrMC = !cast<Intrinsic>("int_nvvm_tcgen05_commit_mc" # intr_suffix);
 
-  def NAME : NVPTXInst<(outs), (ins rc:$mbar),
-             !strconcat(prefix, suffix, ".b64 [$mbar];"),
-             [(Intr rc:$mbar)]>,
+  def "" : BasicNVPTXInst<(outs), (ins ADDR:$mbar),
+             prefix # ".b64",
+             [(Intr addr:$mbar)]>,
              Requires<[hasTcgen05Instructions]>;
-  def NAME # _MC : NVPTXInst<(outs), (ins rc:$mbar, Int16Regs:$mc),
-                   !strconcat(prefix, suffix, ".multicast::cluster.b64 [$mbar], $mc;"),
-                   [(IntrMC rc:$mbar, Int16Regs:$mc)]>,
+  def _MC : BasicNVPTXInst<(outs), (ins ADDR:$mbar, Int16Regs:$mc),
+                   prefix # ".multicast::cluster.b64",
+                   [(IntrMC addr:$mbar, Int16Regs:$mc)]>,
                    Requires<[hasTcgen05Instructions]>;
 }
 
-defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR<Int64Regs, "", "1">;
-defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<Int64Regs, "", "2">;
-defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<Int64Regs, "shared", "1">;
-defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<Int64Regs, "shared", "2">;
-defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR<Int32Regs, "shared", "1">;
-defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR<Int32Regs, "shared", "2">;
+defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR<"", "1">;
+defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">;
+defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">;
+defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">;
 
 multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
-  def NAME : NVPTXInst<(outs),
-             (ins Int32Regs:$tmem_addr),
-             !strconcat("tcgen05.shift.cta_group::", num, ".down [$tmem_addr];"),
-             [(Intr Int32Regs:$tmem_addr)]>,
+  def "" : BasicNVPTXInst<(outs),
+             (ins ADDR:$tmem_addr),
+             "tcgen05.shift.cta_group::" # num # ".down",
+             [(Intr addr:$tmem_addr)]>,
              Requires<[hasTcgen05Instructions]>;
 }
 defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
@@ -7099,15 +5722,15 @@ multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> {
   defvar IntrCG1 = !cast<Intrinsic>(intr_prefix # "_cg1");
   defvar IntrCG2 = !cast<Intrinsic>(intr_prefix # "_cg2");
 
-  def NAME # _cg1 : NVPTXInst<(outs),
-                    (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc),
-                    "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;",
-                    [(IntrCG1 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>,
+  def _cg1 : BasicNVPTXInst<(outs),
+                    (ins ADDR:$tmem_addr, Int64Regs:$sdesc),
+                    "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm,
+                    [(IntrCG1 addr:$tmem_addr, Int64Regs:$sdesc)]>,
                     Requires<[hasTcgen05Instructions]>;
-  def NAME # _cg2 : NVPTXInst<(outs),
-                    (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc),
-                    "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;",
-                    [(IntrCG2 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>,
+  def _cg2 : BasicNVPTXInst<(outs),
+                    (ins ADDR:$tmem_addr, Int64Regs:$sdesc),
+                    "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm,
+                    [(IntrCG2 addr:$tmem_addr, Int64Regs:$sdesc)]>,
                     Requires<[hasTcgen05Instructions]>;
 }
 
@@ -7222,17 +5845,18 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
 } // isConvergent
 
 // Bulk store instructions
-                            
+def st_bulk_imm : TImmLeaf<i64, [{ return Imm == 0; }]>;
+
 def INT_NVVM_ST_BULK_GENERIC :
-  NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size),
-            "st.bulk [$dest_addr], $size, 0;",
-            [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, (i64 0))]>,
+  BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size, i64imm:$value),
+            "st.bulk",
+            [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>,
             Requires<[hasSM<100>, hasPTX<86>]>;
 
 def INT_NVVM_ST_BULK_SHARED_CTA:
-  NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size),
-            "st.bulk.shared::cta [$dest_addr], $size, 0;",
-            [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, (i64 0))]>,
+  BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size, i64imm:$value),
+            "st.bulk.shared::cta",
+            [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>,
             Requires<[hasSM<100>, hasPTX<86>]>;
 
 //
@@ -7240,17 +5864,15 @@ def INT_NVVM_ST_BULK_SHARED_CTA:
 //
 
 def CLUSTERLAUNCHCONTRL_TRY_CANCEL:
-      NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar),
-                "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 " #
-                "[$addr], [$mbar];",
+      BasicNVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar),
+                "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128",
                 [(int_nvvm_clusterlaunchcontrol_try_cancel_async_shared addr:$addr, addr:$mbar)]>,
       Requires<[hasSM<100>, hasPTX<86>]>;
 
 def CLUSTERLAUNCHCONTRL_TRY_CANCEL_MULTICAST:
-      NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar),
+      BasicNVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar),
                 "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes" #
-                ".multicast::cluster::all.b128 " #
-                "[$addr], [$mbar];",
+                ".multicast::cluster::all.b128",
                 [(int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared addr:$addr, addr:$mbar)]>,
       Requires<[hasSM<100>, hasArchAccelFeatures, hasPTX<86>]>;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index 9b5fe473521a1..320c0fb6950a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -408,426 +408,426 @@ static unsigned suldRegisterToIndexOpcode(unsigned RegOC) {
 
 static unsigned sustRegisterToIndexOpcode(unsigned RegOC) {
   switch (RegOC) {
-  case NVPTX::SUST_B_1D_B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_B64_CLAMP_R:
-    return NVPTX::SUST_B_1D_B64_CLAMP_I;
-  case NVPTX::SUST_B_1D_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_1D_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_1D_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_B64_CLAMP_R:
-    return NVPTX::SUST_B_2D_B64_CLAMP_I;
-  case NVPTX::SUST_B_2D_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_2D_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_2D_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_3D_B8_CLAMP_R:
-    return NVPTX::SUST_B_3D_B8_CLAMP_I;
-  case NVPTX::SUST_B_3D_B16_CLAMP_R:
-    return NVPTX::SUST_B_3D_B16_CLAMP_I;
-  case NVPTX::SUST_B_3D_B32_CLAMP_R:
-    return NVPTX::SUST_B_3D_B32_CLAMP_I;
-  case NVPTX::SUST_B_3D_B64_CLAMP_R:
-    return NVPTX::SUST_B_3D_B64_CLAMP_I;
-  case NVPTX::SUST_B_3D_V2B8_CLAMP_R:
-    return NVPTX::SUST_B_3D_V2B8_CLAMP_I;
-  case NVPTX::SUST_B_3D_V2B16_CLAMP_R:
-    return NVPTX::SUST_B_3D_V2B16_CLAMP_I;
-  case NVPTX::SUST_B_3D_V2B32_CLAMP_R:
-    return NVPTX::SUST_B_3D_V2B32_CLAMP_I;
-  case NVPTX::SUST_B_3D_V2B64_CLAMP_R:
-    return NVPTX::SUST_B_3D_V2B64_CLAMP_I;
-  case NVPTX::SUST_B_3D_V4B8_CLAMP_R:
-    return NVPTX::SUST_B_3D_V4B8_CLAMP_I;
-  case NVPTX::SUST_B_3D_V4B16_CLAMP_R:
-    return NVPTX::SUST_B_3D_V4B16_CLAMP_I;
-  case NVPTX::SUST_B_3D_V4B32_CLAMP_R:
-    return NVPTX::SUST_B_3D_V4B32_CLAMP_I;
-  case NVPTX::SUST_B_1D_B8_TRAP_R:
-    return NVPTX::SUST_B_1D_B8_TRAP_I;
-  case NVPTX::SUST_B_1D_B16_TRAP_R:
-    return NVPTX::SUST_B_1D_B16_TRAP_I;
-  case NVPTX::SUST_B_1D_B32_TRAP_R:
-    return NVPTX::SUST_B_1D_B32_TRAP_I;
-  case NVPTX::SUST_B_1D_B64_TRAP_R:
-    return NVPTX::SUST_B_1D_B64_TRAP_I;
-  case NVPTX::SUST_B_1D_V2B8_TRAP_R:
-    return NVPTX::SUST_B_1D_V2B8_TRAP_I;
-  case NVPTX::SUST_B_1D_V2B16_TRAP_R:
-    return NVPTX::SUST_B_1D_V2B16_TRAP_I;
-  case NVPTX::SUST_B_1D_V2B32_TRAP_R:
-    return NVPTX::SUST_B_1D_V2B32_TRAP_I;
-  case NVPTX::SUST_B_1D_V2B64_TRAP_R:
-    return NVPTX::SUST_B_1D_V2B64_TRAP_I;
-  case NVPTX::SUST_B_1D_V4B8_TRAP_R:
-    return NVPTX::SUST_B_1D_V4B8_TRAP_I;
-  case NVPTX::SUST_B_1D_V4B16_TRAP_R:
-    return NVPTX::SUST_B_1D_V4B16_TRAP_I;
-  case NVPTX::SUST_B_1D_V4B32_TRAP_R:
-    return NVPTX::SUST_B_1D_V4B32_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B8_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B8_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B16_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B16_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B32_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B32_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_B64_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_B64_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_I;
-  case NVPTX::SUST_B_2D_B8_TRAP_R:
-    return NVPTX::SUST_B_2D_B8_TRAP_I;
-  case NVPTX::SUST_B_2D_B16_TRAP_R:
-    return NVPTX::SUST_B_2D_B16_TRAP_I;
-  case NVPTX::SUST_B_2D_B32_TRAP_R:
-    return NVPTX::SUST_B_2D_B32_TRAP_I;
-  case NVPTX::SUST_B_2D_B64_TRAP_R:
-    return NVPTX::SUST_B_2D_B64_TRAP_I;
-  case NVPTX::SUST_B_2D_V2B8_TRAP_R:
-    return NVPTX::SUST_B_2D_V2B8_TRAP_I;
-  case NVPTX::SUST_B_2D_V2B16_TRAP_R:
-    return NVPTX::SUST_B_2D_V2B16_TRAP_I;
-  case NVPTX::SUST_B_2D_V2B32_TRAP_R:
-    return NVPTX::SUST_B_2D_V2B32_TRAP_I;
-  case NVPTX::SUST_B_2D_V2B64_TRAP_R:
-    return NVPTX::SUST_B_2D_V2B64_TRAP_I;
-  case NVPTX::SUST_B_2D_V4B8_TRAP_R:
-    return NVPTX::SUST_B_2D_V4B8_TRAP_I;
-  case NVPTX::SUST_B_2D_V4B16_TRAP_R:
-    return NVPTX::SUST_B_2D_V4B16_TRAP_I;
-  case NVPTX::SUST_B_2D_V4B32_TRAP_R:
-    return NVPTX::SUST_B_2D_V4B32_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B8_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B8_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B16_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B16_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B32_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B32_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_B64_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_B64_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_I;
-  case NVPTX::SUST_B_3D_B8_TRAP_R:
-    return NVPTX::SUST_B_3D_B8_TRAP_I;
-  case NVPTX::SUST_B_3D_B16_TRAP_R:
-    return NVPTX::SUST_B_3D_B16_TRAP_I;
-  case NVPTX::SUST_B_3D_B32_TRAP_R:
-    return NVPTX::SUST_B_3D_B32_TRAP_I;
-  case NVPTX::SUST_B_3D_B64_TRAP_R:
-    return NVPTX::SUST_B_3D_B64_TRAP_I;
-  case NVPTX::SUST_B_3D_V2B8_TRAP_R:
-    return NVPTX::SUST_B_3D_V2B8_TRAP_I;
-  case NVPTX::SUST_B_3D_V2B16_TRAP_R:
-    return NVPTX::SUST_B_3D_V2B16_TRAP_I;
-  case NVPTX::SUST_B_3D_V2B32_TRAP_R:
-    return NVPTX::SUST_B_3D_V2B32_TRAP_I;
-  case NVPTX::SUST_B_3D_V2B64_TRAP_R:
-    return NVPTX::SUST_B_3D_V2B64_TRAP_I;
-  case NVPTX::SUST_B_3D_V4B8_TRAP_R:
-    return NVPTX::SUST_B_3D_V4B8_TRAP_I;
-  case NVPTX::SUST_B_3D_V4B16_TRAP_R:
-    return NVPTX::SUST_B_3D_V4B16_TRAP_I;
-  case NVPTX::SUST_B_3D_V4B32_TRAP_R:
-    return NVPTX::SUST_B_3D_V4B32_TRAP_I;
-  case NVPTX::SUST_B_1D_B8_ZERO_R:
-    return NVPTX::SUST_B_1D_B8_ZERO_I;
-  case NVPTX::SUST_B_1D_B16_ZERO_R:
-    return NVPTX::SUST_B_1D_B16_ZERO_I;
-  case NVPTX::SUST_B_1D_B32_ZERO_R:
-    return NVPTX::SUST_B_1D_B32_ZERO_I;
-  case NVPTX::SUST_B_1D_B64_ZERO_R:
-    return NVPTX::SUST_B_1D_B64_ZERO_I;
-  case NVPTX::SUST_B_1D_V2B8_ZERO_R:
-    return NVPTX::SUST_B_1D_V2B8_ZERO_I;
-  case NVPTX::SUST_B_1D_V2B16_ZERO_R:
-    return NVPTX::SUST_B_1D_V2B16_ZERO_I;
-  case NVPTX::SUST_B_1D_V2B32_ZERO_R:
-    return NVPTX::SUST_B_1D_V2B32_ZERO_I;
-  case NVPTX::SUST_B_1D_V2B64_ZERO_R:
-    return NVPTX::SUST_B_1D_V2B64_ZERO_I;
-  case NVPTX::SUST_B_1D_V4B8_ZERO_R:
-    return NVPTX::SUST_B_1D_V4B8_ZERO_I;
-  case NVPTX::SUST_B_1D_V4B16_ZERO_R:
-    return NVPTX::SUST_B_1D_V4B16_ZERO_I;
-  case NVPTX::SUST_B_1D_V4B32_ZERO_R:
-    return NVPTX::SUST_B_1D_V4B32_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_B8_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_B8_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_B16_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_B16_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_B32_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_B32_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_B64_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_B64_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_I;
-  case NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_R:
-    return NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_I;
-  case NVPTX::SUST_B_2D_B8_ZERO_R:
-    return NVPTX::SUST_B_2D_B8_ZERO_I;
-  case NVPTX::SUST_B_2D_B16_ZERO_R:
-    return NVPTX::SUST_B_2D_B16_ZERO_I;
-  case NVPTX::SUST_B_2D_B32_ZERO_R:
-    return NVPTX::SUST_B_2D_B32_ZERO_I;
-  case NVPTX::SUST_B_2D_B64_ZERO_R:
-    return NVPTX::SUST_B_2D_B64_ZERO_I;
-  case NVPTX::SUST_B_2D_V2B8_ZERO_R:
-    return NVPTX::SUST_B_2D_V2B8_ZERO_I;
-  case NVPTX::SUST_B_2D_V2B16_ZERO_R:
-    return NVPTX::SUST_B_2D_V2B16_ZERO_I;
-  case NVPTX::SUST_B_2D_V2B32_ZERO_R:
-    return NVPTX::SUST_B_2D_V2B32_ZERO_I;
-  case NVPTX::SUST_B_2D_V2B64_ZERO_R:
-    return NVPTX::SUST_B_2D_V2B64_ZERO_I;
-  case NVPTX::SUST_B_2D_V4B8_ZERO_R:
-    return NVPTX::SUST_B_2D_V4B8_ZERO_I;
-  case NVPTX::SUST_B_2D_V4B16_ZERO_R:
-    return NVPTX::SUST_B_2D_V4B16_ZERO_I;
-  case NVPTX::SUST_B_2D_V4B32_ZERO_R:
-    return NVPTX::SUST_B_2D_V4B32_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_B8_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_B8_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_B16_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_B16_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_B32_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_B32_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_B64_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_B64_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_I;
-  case NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_R:
-    return NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_I;
-  case NVPTX::SUST_B_3D_B8_ZERO_R:
-    return NVPTX::SUST_B_3D_B8_ZERO_I;
-  case NVPTX::SUST_B_3D_B16_ZERO_R:
-    return NVPTX::SUST_B_3D_B16_ZERO_I;
-  case NVPTX::SUST_B_3D_B32_ZERO_R:
-    return NVPTX::SUST_B_3D_B32_ZERO_I;
-  case NVPTX::SUST_B_3D_B64_ZERO_R:
-    return NVPTX::SUST_B_3D_B64_ZERO_I;
-  case NVPTX::SUST_B_3D_V2B8_ZERO_R:
-    return NVPTX::SUST_B_3D_V2B8_ZERO_I;
-  case NVPTX::SUST_B_3D_V2B16_ZERO_R:
-    return NVPTX::SUST_B_3D_V2B16_ZERO_I;
-  case NVPTX::SUST_B_3D_V2B32_ZERO_R:
-    return NVPTX::SUST_B_3D_V2B32_ZERO_I;
-  case NVPTX::SUST_B_3D_V2B64_ZERO_R:
-    return NVPTX::SUST_B_3D_V2B64_ZERO_I;
-  case NVPTX::SUST_B_3D_V4B8_ZERO_R:
-    return NVPTX::SUST_B_3D_V4B8_ZERO_I;
-  case NVPTX::SUST_B_3D_V4B16_ZERO_R:
-    return NVPTX::SUST_B_3D_V4B16_ZERO_I;
-  case NVPTX::SUST_B_3D_V4B32_ZERO_R:
-    return NVPTX::SUST_B_3D_V4B32_ZERO_I;
-  case NVPTX::SUST_P_1D_B8_TRAP_R:
-    return NVPTX::SUST_P_1D_B8_TRAP_I;
-  case NVPTX::SUST_P_1D_B16_TRAP_R:
-    return NVPTX::SUST_P_1D_B16_TRAP_I;
-  case NVPTX::SUST_P_1D_B32_TRAP_R:
-    return NVPTX::SUST_P_1D_B32_TRAP_I;
-  case NVPTX::SUST_P_1D_V2B8_TRAP_R:
-    return NVPTX::SUST_P_1D_V2B8_TRAP_I;
-  case NVPTX::SUST_P_1D_V2B16_TRAP_R:
-    return NVPTX::SUST_P_1D_V2B16_TRAP_I;
-  case NVPTX::SUST_P_1D_V2B32_TRAP_R:
-    return NVPTX::SUST_P_1D_V2B32_TRAP_I;
-  case NVPTX::SUST_P_1D_V4B8_TRAP_R:
-    return NVPTX::SUST_P_1D_V4B8_TRAP_I;
-  case NVPTX::SUST_P_1D_V4B16_TRAP_R:
-    return NVPTX::SUST_P_1D_V4B16_TRAP_I;
-  case NVPTX::SUST_P_1D_V4B32_TRAP_R:
-    return NVPTX::SUST_P_1D_V4B32_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_B8_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_B8_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_B16_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_B16_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_B32_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_B32_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_I;
-  case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_R:
-    return NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_I;
-  case NVPTX::SUST_P_2D_B8_TRAP_R:
-    return NVPTX::SUST_P_2D_B8_TRAP_I;
-  case NVPTX::SUST_P_2D_B16_TRAP_R:
-    return NVPTX::SUST_P_2D_B16_TRAP_I;
-  case NVPTX::SUST_P_2D_B32_TRAP_R:
-    return NVPTX::SUST_P_2D_B32_TRAP_I;
-  case NVPTX::SUST_P_2D_V2B8_TRAP_R:
-    return NVPTX::SUST_P_2D_V2B8_TRAP_I;
-  case NVPTX::SUST_P_2D_V2B16_TRAP_R:
-    return NVPTX::SUST_P_2D_V2B16_TRAP_I;
-  case NVPTX::SUST_P_2D_V2B32_TRAP_R:
-    return NVPTX::SUST_P_2D_V2B32_TRAP_I;
-  case NVPTX::SUST_P_2D_V4B8_TRAP_R:
-    return NVPTX::SUST_P_2D_V4B8_TRAP_I;
-  case NVPTX::SUST_P_2D_V4B16_TRAP_R:
-    return NVPTX::SUST_P_2D_V4B16_TRAP_I;
-  case NVPTX::SUST_P_2D_V4B32_TRAP_R:
-    return NVPTX::SUST_P_2D_V4B32_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_B8_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_B8_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_B16_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_B16_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_B32_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_B32_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_I;
-  case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_R:
-    return NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_I;
-  case NVPTX::SUST_P_3D_B8_TRAP_R:
-    return NVPTX::SUST_P_3D_B8_TRAP_I;
-  case NVPTX::SUST_P_3D_B16_TRAP_R:
-    return NVPTX::SUST_P_3D_B16_TRAP_I;
-  case NVPTX::SUST_P_3D_B32_TRAP_R:
-    return NVPTX::SUST_P_3D_B32_TRAP_I;
-  case NVPTX::SUST_P_3D_V2B8_TRAP_R:
-    return NVPTX::SUST_P_3D_V2B8_TRAP_I;
-  case NVPTX::SUST_P_3D_V2B16_TRAP_R:
-    return NVPTX::SUST_P_3D_V2B16_TRAP_I;
-  case NVPTX::SUST_P_3D_V2B32_TRAP_R:
-    return NVPTX::SUST_P_3D_V2B32_TRAP_I;
-  case NVPTX::SUST_P_3D_V4B8_TRAP_R:
-    return NVPTX::SUST_P_3D_V4B8_TRAP_I;
-  case NVPTX::SUST_P_3D_V4B16_TRAP_R:
-    return NVPTX::SUST_P_3D_V4B16_TRAP_I;
-  case NVPTX::SUST_P_3D_V4B32_TRAP_R:
-    return NVPTX::SUST_P_3D_V4B32_TRAP_I;
+  case NVPTX::SUST_B_1D_I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_I64_CLAMP_R:
+    return NVPTX::SUST_B_1D_I64_CLAMP_I;
+  case NVPTX::SUST_B_1D_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_1D_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_1D_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I64_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I64_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_I64_CLAMP_R:
+    return NVPTX::SUST_B_2D_I64_CLAMP_I;
+  case NVPTX::SUST_B_2D_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_2D_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_2D_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I64_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I64_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_3D_I8_CLAMP_R:
+    return NVPTX::SUST_B_3D_I8_CLAMP_I;
+  case NVPTX::SUST_B_3D_I16_CLAMP_R:
+    return NVPTX::SUST_B_3D_I16_CLAMP_I;
+  case NVPTX::SUST_B_3D_I32_CLAMP_R:
+    return NVPTX::SUST_B_3D_I32_CLAMP_I;
+  case NVPTX::SUST_B_3D_I64_CLAMP_R:
+    return NVPTX::SUST_B_3D_I64_CLAMP_I;
+  case NVPTX::SUST_B_3D_V2I8_CLAMP_R:
+    return NVPTX::SUST_B_3D_V2I8_CLAMP_I;
+  case NVPTX::SUST_B_3D_V2I16_CLAMP_R:
+    return NVPTX::SUST_B_3D_V2I16_CLAMP_I;
+  case NVPTX::SUST_B_3D_V2I32_CLAMP_R:
+    return NVPTX::SUST_B_3D_V2I32_CLAMP_I;
+  case NVPTX::SUST_B_3D_V2I64_CLAMP_R:
+    return NVPTX::SUST_B_3D_V2I64_CLAMP_I;
+  case NVPTX::SUST_B_3D_V4I8_CLAMP_R:
+    return NVPTX::SUST_B_3D_V4I8_CLAMP_I;
+  case NVPTX::SUST_B_3D_V4I16_CLAMP_R:
+    return NVPTX::SUST_B_3D_V4I16_CLAMP_I;
+  case NVPTX::SUST_B_3D_V4I32_CLAMP_R:
+    return NVPTX::SUST_B_3D_V4I32_CLAMP_I;
+  case NVPTX::SUST_B_1D_I8_TRAP_R:
+    return NVPTX::SUST_B_1D_I8_TRAP_I;
+  case NVPTX::SUST_B_1D_I16_TRAP_R:
+    return NVPTX::SUST_B_1D_I16_TRAP_I;
+  case NVPTX::SUST_B_1D_I32_TRAP_R:
+    return NVPTX::SUST_B_1D_I32_TRAP_I;
+  case NVPTX::SUST_B_1D_I64_TRAP_R:
+    return NVPTX::SUST_B_1D_I64_TRAP_I;
+  case NVPTX::SUST_B_1D_V2I8_TRAP_R:
+    return NVPTX::SUST_B_1D_V2I8_TRAP_I;
+  case NVPTX::SUST_B_1D_V2I16_TRAP_R:
+    return NVPTX::SUST_B_1D_V2I16_TRAP_I;
+  case NVPTX::SUST_B_1D_V2I32_TRAP_R:
+    return NVPTX::SUST_B_1D_V2I32_TRAP_I;
+  case NVPTX::SUST_B_1D_V2I64_TRAP_R:
+    return NVPTX::SUST_B_1D_V2I64_TRAP_I;
+  case NVPTX::SUST_B_1D_V4I8_TRAP_R:
+    return NVPTX::SUST_B_1D_V4I8_TRAP_I;
+  case NVPTX::SUST_B_1D_V4I16_TRAP_R:
+    return NVPTX::SUST_B_1D_V4I16_TRAP_I;
+  case NVPTX::SUST_B_1D_V4I32_TRAP_R:
+    return NVPTX::SUST_B_1D_V4I32_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I8_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I8_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I16_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I16_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I32_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I32_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_I64_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_I64_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I8_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I8_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I16_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I16_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I32_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I32_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I64_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I64_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I8_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I8_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I16_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I16_TRAP_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I32_TRAP_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I32_TRAP_I;
+  case NVPTX::SUST_B_2D_I8_TRAP_R:
+    return NVPTX::SUST_B_2D_I8_TRAP_I;
+  case NVPTX::SUST_B_2D_I16_TRAP_R:
+    return NVPTX::SUST_B_2D_I16_TRAP_I;
+  case NVPTX::SUST_B_2D_I32_TRAP_R:
+    return NVPTX::SUST_B_2D_I32_TRAP_I;
+  case NVPTX::SUST_B_2D_I64_TRAP_R:
+    return NVPTX::SUST_B_2D_I64_TRAP_I;
+  case NVPTX::SUST_B_2D_V2I8_TRAP_R:
+    return NVPTX::SUST_B_2D_V2I8_TRAP_I;
+  case NVPTX::SUST_B_2D_V2I16_TRAP_R:
+    return NVPTX::SUST_B_2D_V2I16_TRAP_I;
+  case NVPTX::SUST_B_2D_V2I32_TRAP_R:
+    return NVPTX::SUST_B_2D_V2I32_TRAP_I;
+  case NVPTX::SUST_B_2D_V2I64_TRAP_R:
+    return NVPTX::SUST_B_2D_V2I64_TRAP_I;
+  case NVPTX::SUST_B_2D_V4I8_TRAP_R:
+    return NVPTX::SUST_B_2D_V4I8_TRAP_I;
+  case NVPTX::SUST_B_2D_V4I16_TRAP_R:
+    return NVPTX::SUST_B_2D_V4I16_TRAP_I;
+  case NVPTX::SUST_B_2D_V4I32_TRAP_R:
+    return NVPTX::SUST_B_2D_V4I32_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I8_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I8_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I16_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I16_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I32_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I32_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_I64_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_I64_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I8_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I8_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I16_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I16_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I32_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I32_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I64_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I64_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I8_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I8_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I16_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I16_TRAP_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I32_TRAP_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I32_TRAP_I;
+  case NVPTX::SUST_B_3D_I8_TRAP_R:
+    return NVPTX::SUST_B_3D_I8_TRAP_I;
+  case NVPTX::SUST_B_3D_I16_TRAP_R:
+    return NVPTX::SUST_B_3D_I16_TRAP_I;
+  case NVPTX::SUST_B_3D_I32_TRAP_R:
+    return NVPTX::SUST_B_3D_I32_TRAP_I;
+  case NVPTX::SUST_B_3D_I64_TRAP_R:
+    return NVPTX::SUST_B_3D_I64_TRAP_I;
+  case NVPTX::SUST_B_3D_V2I8_TRAP_R:
+    return NVPTX::SUST_B_3D_V2I8_TRAP_I;
+  case NVPTX::SUST_B_3D_V2I16_TRAP_R:
+    return NVPTX::SUST_B_3D_V2I16_TRAP_I;
+  case NVPTX::SUST_B_3D_V2I32_TRAP_R:
+    return NVPTX::SUST_B_3D_V2I32_TRAP_I;
+  case NVPTX::SUST_B_3D_V2I64_TRAP_R:
+    return NVPTX::SUST_B_3D_V2I64_TRAP_I;
+  case NVPTX::SUST_B_3D_V4I8_TRAP_R:
+    return NVPTX::SUST_B_3D_V4I8_TRAP_I;
+  case NVPTX::SUST_B_3D_V4I16_TRAP_R:
+    return NVPTX::SUST_B_3D_V4I16_TRAP_I;
+  case NVPTX::SUST_B_3D_V4I32_TRAP_R:
+    return NVPTX::SUST_B_3D_V4I32_TRAP_I;
+  case NVPTX::SUST_B_1D_I8_ZERO_R:
+    return NVPTX::SUST_B_1D_I8_ZERO_I;
+  case NVPTX::SUST_B_1D_I16_ZERO_R:
+    return NVPTX::SUST_B_1D_I16_ZERO_I;
+  case NVPTX::SUST_B_1D_I32_ZERO_R:
+    return NVPTX::SUST_B_1D_I32_ZERO_I;
+  case NVPTX::SUST_B_1D_I64_ZERO_R:
+    return NVPTX::SUST_B_1D_I64_ZERO_I;
+  case NVPTX::SUST_B_1D_V2I8_ZERO_R:
+    return NVPTX::SUST_B_1D_V2I8_ZERO_I;
+  case NVPTX::SUST_B_1D_V2I16_ZERO_R:
+    return NVPTX::SUST_B_1D_V2I16_ZERO_I;
+  case NVPTX::SUST_B_1D_V2I32_ZERO_R:
+    return NVPTX::SUST_B_1D_V2I32_ZERO_I;
+  case NVPTX::SUST_B_1D_V2I64_ZERO_R:
+    return NVPTX::SUST_B_1D_V2I64_ZERO_I;
+  case NVPTX::SUST_B_1D_V4I8_ZERO_R:
+    return NVPTX::SUST_B_1D_V4I8_ZERO_I;
+  case NVPTX::SUST_B_1D_V4I16_ZERO_R:
+    return NVPTX::SUST_B_1D_V4I16_ZERO_I;
+  case NVPTX::SUST_B_1D_V4I32_ZERO_R:
+    return NVPTX::SUST_B_1D_V4I32_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_I8_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_I8_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_I16_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_I16_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_I32_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_I32_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_I64_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_I64_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I8_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I8_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I16_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I16_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I32_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I32_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V2I64_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V2I64_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I8_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I8_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I16_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I16_ZERO_I;
+  case NVPTX::SUST_B_1D_ARRAY_V4I32_ZERO_R:
+    return NVPTX::SUST_B_1D_ARRAY_V4I32_ZERO_I;
+  case NVPTX::SUST_B_2D_I8_ZERO_R:
+    return NVPTX::SUST_B_2D_I8_ZERO_I;
+  case NVPTX::SUST_B_2D_I16_ZERO_R:
+    return NVPTX::SUST_B_2D_I16_ZERO_I;
+  case NVPTX::SUST_B_2D_I32_ZERO_R:
+    return NVPTX::SUST_B_2D_I32_ZERO_I;
+  case NVPTX::SUST_B_2D_I64_ZERO_R:
+    return NVPTX::SUST_B_2D_I64_ZERO_I;
+  case NVPTX::SUST_B_2D_V2I8_ZERO_R:
+    return NVPTX::SUST_B_2D_V2I8_ZERO_I;
+  case NVPTX::SUST_B_2D_V2I16_ZERO_R:
+    return NVPTX::SUST_B_2D_V2I16_ZERO_I;
+  case NVPTX::SUST_B_2D_V2I32_ZERO_R:
+    return NVPTX::SUST_B_2D_V2I32_ZERO_I;
+  case NVPTX::SUST_B_2D_V2I64_ZERO_R:
+    return NVPTX::SUST_B_2D_V2I64_ZERO_I;
+  case NVPTX::SUST_B_2D_V4I8_ZERO_R:
+    return NVPTX::SUST_B_2D_V4I8_ZERO_I;
+  case NVPTX::SUST_B_2D_V4I16_ZERO_R:
+    return NVPTX::SUST_B_2D_V4I16_ZERO_I;
+  case NVPTX::SUST_B_2D_V4I32_ZERO_R:
+    return NVPTX::SUST_B_2D_V4I32_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_I8_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_I8_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_I16_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_I16_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_I32_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_I32_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_I64_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_I64_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I8_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I8_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I16_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I16_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I32_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I32_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V2I64_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V2I64_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I8_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I8_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I16_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I16_ZERO_I;
+  case NVPTX::SUST_B_2D_ARRAY_V4I32_ZERO_R:
+    return NVPTX::SUST_B_2D_ARRAY_V4I32_ZERO_I;
+  case NVPTX::SUST_B_3D_I8_ZERO_R:
+    return NVPTX::SUST_B_3D_I8_ZERO_I;
+  case NVPTX::SUST_B_3D_I16_ZERO_R:
+    return NVPTX::SUST_B_3D_I16_ZERO_I;
+  case NVPTX::SUST_B_3D_I32_ZERO_R:
+    return NVPTX::SUST_B_3D_I32_ZERO_I;
+  case NVPTX::SUST_B_3D_I64_ZERO_R:
+    return NVPTX::SUST_B_3D_I64_ZERO_I;
+  case NVPTX::SUST_B_3D_V2I8_ZERO_R:
+    return NVPTX::SUST_B_3D_V2I8_ZERO_I;
+  case NVPTX::SUST_B_3D_V2I16_ZERO_R:
+    return NVPTX::SUST_B_3D_V2I16_ZERO_I;
+  case NVPTX::SUST_B_3D_V2I32_ZERO_R:
+    return NVPTX::SUST_B_3D_V2I32_ZERO_I;
+  case NVPTX::SUST_B_3D_V2I64_ZERO_R:
+    return NVPTX::SUST_B_3D_V2I64_ZERO_I;
+  case NVPTX::SUST_B_3D_V4I8_ZERO_R:
+    return NVPTX::SUST_B_3D_V4I8_ZERO_I;
+  case NVPTX::SUST_B_3D_V4I16_ZERO_R:
+    return NVPTX::SUST_B_3D_V4I16_ZERO_I;
+  case NVPTX::SUST_B_3D_V4I32_ZERO_R:
+    return NVPTX::SUST_B_3D_V4I32_ZERO_I;
+  case NVPTX::SUST_P_1D_I8_TRAP_R:
+    return NVPTX::SUST_P_1D_I8_TRAP_I;
+  case NVPTX::SUST_P_1D_I16_TRAP_R:
+    return NVPTX::SUST_P_1D_I16_TRAP_I;
+  case NVPTX::SUST_P_1D_I32_TRAP_R:
+    return NVPTX::SUST_P_1D_I32_TRAP_I;
+  case NVPTX::SUST_P_1D_V2I8_TRAP_R:
+    return NVPTX::SUST_P_1D_V2I8_TRAP_I;
+  case NVPTX::SUST_P_1D_V2I16_TRAP_R:
+    return NVPTX::SUST_P_1D_V2I16_TRAP_I;
+  case NVPTX::SUST_P_1D_V2I32_TRAP_R:
+    return NVPTX::SUST_P_1D_V2I32_TRAP_I;
+  case NVPTX::SUST_P_1D_V4I8_TRAP_R:
+    return NVPTX::SUST_P_1D_V4I8_TRAP_I;
+  case NVPTX::SUST_P_1D_V4I16_TRAP_R:
+    return NVPTX::SUST_P_1D_V4I16_TRAP_I;
+  case NVPTX::SUST_P_1D_V4I32_TRAP_R:
+    return NVPTX::SUST_P_1D_V4I32_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_I8_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_I8_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_I16_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_I16_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_I32_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_I32_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V2I8_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V2I8_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V2I16_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V2I16_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V2I32_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V2I32_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V4I8_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V4I8_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V4I16_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V4I16_TRAP_I;
+  case NVPTX::SUST_P_1D_ARRAY_V4I32_TRAP_R:
+    return NVPTX::SUST_P_1D_ARRAY_V4I32_TRAP_I;
+  case NVPTX::SUST_P_2D_I8_TRAP_R:
+    return NVPTX::SUST_P_2D_I8_TRAP_I;
+  case NVPTX::SUST_P_2D_I16_TRAP_R:
+    return NVPTX::SUST_P_2D_I16_TRAP_I;
+  case NVPTX::SUST_P_2D_I32_TRAP_R:
+    return NVPTX::SUST_P_2D_I32_TRAP_I;
+  case NVPTX::SUST_P_2D_V2I8_TRAP_R:
+    return NVPTX::SUST_P_2D_V2I8_TRAP_I;
+  case NVPTX::SUST_P_2D_V2I16_TRAP_R:
+    return NVPTX::SUST_P_2D_V2I16_TRAP_I;
+  case NVPTX::SUST_P_2D_V2I32_TRAP_R:
+    return NVPTX::SUST_P_2D_V2I32_TRAP_I;
+  case NVPTX::SUST_P_2D_V4I8_TRAP_R:
+    return NVPTX::SUST_P_2D_V4I8_TRAP_I;
+  case NVPTX::SUST_P_2D_V4I16_TRAP_R:
+    return NVPTX::SUST_P_2D_V4I16_TRAP_I;
+  case NVPTX::SUST_P_2D_V4I32_TRAP_R:
+    return NVPTX::SUST_P_2D_V4I32_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_I8_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_I8_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_I16_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_I16_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_I32_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_I32_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V2I8_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V2I8_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V2I16_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V2I16_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V2I32_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V2I32_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V4I8_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V4I8_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V4I16_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V4I16_TRAP_I;
+  case NVPTX::SUST_P_2D_ARRAY_V4I32_TRAP_R:
+    return NVPTX::SUST_P_2D_ARRAY_V4I32_TRAP_I;
+  case NVPTX::SUST_P_3D_I8_TRAP_R:
+    return NVPTX::SUST_P_3D_I8_TRAP_I;
+  case NVPTX::SUST_P_3D_I16_TRAP_R:
+    return NVPTX::SUST_P_3D_I16_TRAP_I;
+  case NVPTX::SUST_P_3D_I32_TRAP_R:
+    return NVPTX::SUST_P_3D_I32_TRAP_I;
+  case NVPTX::SUST_P_3D_V2I8_TRAP_R:
+    return NVPTX::SUST_P_3D_V2I8_TRAP_I;
+  case NVPTX::SUST_P_3D_V2I16_TRAP_R:
+    return NVPTX::SUST_P_3D_V2I16_TRAP_I;
+  case NVPTX::SUST_P_3D_V2I32_TRAP_R:
+    return NVPTX::SUST_P_3D_V2I32_TRAP_I;
+  case NVPTX::SUST_P_3D_V4I8_TRAP_R:
+    return NVPTX::SUST_P_3D_V4I8_TRAP_I;
+  case NVPTX::SUST_P_3D_V4I16_TRAP_R:
+    return NVPTX::SUST_P_3D_V4I16_TRAP_I;
+  case NVPTX::SUST_P_3D_V4I32_TRAP_R:
+    return NVPTX::SUST_P_3D_V4I32_TRAP_I;
   default:
     llvm_unreachable("Unhandled SUST opcode");
   }

From ace356bc9777e6a5b5aa0ba2335d2546ac6f330e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 11 Jun 2025 20:45:32 +0100
Subject: [PATCH 133/851] [VPlan] Always verify VPCanonicalIVPHIRecipe
 placement (NFC).

Loop regions are dissolved since dcef154b5caf6556e69bb1, remove the
check for VerifyLate and corresponding TODO.
---
 llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 45010d0021581..fba4a68f4a27b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -429,8 +429,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
     return false;
   }
 
-  // TODO: Remove once loop regions are dissolved before execution.
-  if (!VerifyLate && !isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
+  if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
     errs() << "VPlan vector loop header does not start with a "
               "VPCanonicalIVPHIRecipe\n";
     return false;

From ebc90d50b88a7c46634ea21e40ddb25c679ac874 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:03 -0700
Subject: [PATCH 134/851] [SandboxVectorizer] Use llvm::find (NFC) (#143724)

llvm::find allows us to pass a range.
---
 .../llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
index d4cb34647cf55..6d2144b14bb00 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h
@@ -68,7 +68,7 @@ class SeedBundle {
   /// the seeds in a bundle. This allows constant time evaluation
   /// and "removal" from the list.
   void setUsed(Instruction *I) {
-    auto It = std::find(begin(), end(), I);
+    auto It = llvm::find(*this, I);
     assert(It != end() && "Instruction not in the bundle!");
     auto Idx = It - begin();
     setUsed(Idx, 1, /*VerifyUnused=*/false);

From e266d6a5da6871c89747416c70a4a39181b594fb Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:11 -0700
Subject: [PATCH 135/851] [Format] Use llvm::min_element (NFC) (#143725)

llvm::min_elements allows us to pass a range.
---
 clang/lib/Format/MacroCallReconstructor.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Format/MacroCallReconstructor.cpp b/clang/lib/Format/MacroCallReconstructor.cpp
index 116bbad320e1f..895d9f93dfce3 100644
--- a/clang/lib/Format/MacroCallReconstructor.cpp
+++ b/clang/lib/Format/MacroCallReconstructor.cpp
@@ -528,10 +528,10 @@ MacroCallReconstructor::createUnwrappedLine(const ReconstructedLine &Line,
       // 1. One level below the current line's level.
       // 2. At the correct level relative to each other.
       unsigned MinChildLevel =
-          std::min_element(N->Children.begin(), N->Children.end(),
-                           [](const auto &E1, const auto &E2) {
-                             return E1->Level < E2->Level;
-                           })
+          llvm::min_element(N->Children,
+                            [](const auto &E1, const auto &E2) {
+                              return E1->Level < E2->Level;
+                            })
               ->get()
               ->Level;
       for (const auto &Child : N->Children) {

From c1d21f44340901f6a23ae7eb7c5379f5ad197b27 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:19 -0700
Subject: [PATCH 136/851] [lld] Use std::tie to implement comparison operators
 (NFC) (#143726)

std::tie facilitates lexicographical comparisons through std::tuple's
built-in operator< and operator>.
---
 lld/ELF/SyntheticSections.cpp   | 7 ++-----
 lld/MachO/UnwindInfoSection.cpp | 8 +++-----
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 785a56cdb349e..0a9c7a081eb8b 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -1939,11 +1939,8 @@ bool AndroidPackedRelocationSection<ELFT>::updateAllocSize(Ctx &ctx) {
   // For Rela, we also want to sort by r_addend when r_info is the same. This
   // enables us to group by r_addend as well.
   llvm::sort(nonRelatives, [](const Elf_Rela &a, const Elf_Rela &b) {
-    if (a.r_info != b.r_info)
-      return a.r_info < b.r_info;
-    if (a.r_addend != b.r_addend)
-      return a.r_addend < b.r_addend;
-    return a.r_offset < b.r_offset;
+    return std::tie(a.r_info, a.r_addend, a.r_offset) <
+           std::tie(b.r_info, b.r_addend, b.r_offset);
   });
 
   // Group relocations with the same r_info. Note that each group emits a group
diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp
index 624464e41d77c..6e9f6c2aba749 100644
--- a/lld/MachO/UnwindInfoSection.cpp
+++ b/lld/MachO/UnwindInfoSection.cpp
@@ -535,11 +535,9 @@ void UnwindInfoSectionImpl::finalize() {
   llvm::sort(commonEncodings,
              [](const std::pair<compact_unwind_encoding_t, size_t> &a,
                 const std::pair<compact_unwind_encoding_t, size_t> &b) {
-               if (a.second == b.second)
-                 // When frequencies match, secondarily sort on encoding
-                 // to maintain parity with validate-unwind-info.py
-                 return a.first > b.first;
-               return a.second > b.second;
+               // When frequencies match, secondarily sort on encoding
+               // to maintain parity with validate-unwind-info.py
+               return std::tie(a.second, a.first) > std::tie(b.second, b.first);
              });
 
   // Truncate the vector to 127 elements.

From 8da1ac98efa0d315824a92d8b563299eccc3e0f1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:27 -0700
Subject: [PATCH 137/851] [llvm] Use std::tie to implement operator< (NFC)
 (#143728)

std::tie facilitates lexicographical comparisons through std::tuple's
built-in operator<.
---
 .../ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h    | 12 +++---------
 llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp    |  8 ++------
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 24b03a058981a..89b20978c40e6 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -202,15 +202,9 @@ class RelocationValueRef {
            IsStubThumb == Other.IsStubThumb;
   }
   inline bool operator<(const RelocationValueRef &Other) const {
-    if (SectionID != Other.SectionID)
-      return SectionID < Other.SectionID;
-    if (Offset != Other.Offset)
-      return Offset < Other.Offset;
-    if (Addend != Other.Addend)
-      return Addend < Other.Addend;
-    if (IsStubThumb != Other.IsStubThumb)
-      return IsStubThumb < Other.IsStubThumb;
-    return SymbolName < Other.SymbolName;
+    return std::tie(SectionID, Offset, Addend, IsStubThumb, SymbolName) <
+           std::tie(Other.SectionID, Other.Offset, Other.Addend,
+                    Other.IsStubThumb, Other.SymbolName);
   }
 };
 
diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index f38e7b879e5f0..5dde47ab3de57 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -253,7 +253,7 @@ namespace {
       bool operator!=(Register R) const { return !operator==(R); }
       bool operator<(Register R) const {
         // For std::map.
-        return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub);
+        return std::tie(Reg, Sub) < std::tie(R.Reg, R.Sub);
       }
       llvm::Register Reg;
       unsigned Sub = 0;
@@ -298,11 +298,7 @@ namespace {
         return !operator==(Ex);
       }
       bool operator<(const ExtExpr &Ex) const {
-        if (Rs != Ex.Rs)
-          return Rs < Ex.Rs;
-        if (S != Ex.S)
-          return S < Ex.S;
-        return !Neg && Ex.Neg;
+        return std::tie(Rs, S, Neg) < std::tie(Ex.Rs, Ex.S, Ex.Neg);
       }
     };
 

From 43c35e858ccae05d69151ccf9712a725aae37b52 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 12:50:35 -0700
Subject: [PATCH 138/851] [mlir] Simplify calls to *Map::{insert,try_emplace}
 (NFC) (#143729)

This patch simplifies code by removing the values from
insert/try_emplace.  Note that default values inserted by try_emplace
are immediately overrideen in all these cases.
---
 mlir/lib/IR/AsmPrinter.cpp             | 3 +--
 mlir/lib/IR/SymbolTable.cpp            | 2 +-
 mlir/lib/Transforms/Utils/CFGToSCF.cpp | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index fc1806900c0aa..c7cc6a02ad208 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -1146,8 +1146,7 @@ template <typename T, typename... PrintArgs>
 std::pair<size_t, size_t> AliasInitializer::visitImpl(
     T value, llvm::MapVector<const void *, InProgressAliasInfo> &aliases,
     bool canBeDeferred, PrintArgs &&...printArgs) {
-  auto [it, inserted] =
-      aliases.insert({value.getAsOpaquePointer(), InProgressAliasInfo()});
+  auto [it, inserted] = aliases.try_emplace(value.getAsOpaquePointer());
   size_t aliasIndex = std::distance(aliases.begin(), it);
   if (!inserted) {
     // Make sure that the alias isn't deferred if we don't permit it.
diff --git a/mlir/lib/IR/SymbolTable.cpp b/mlir/lib/IR/SymbolTable.cpp
index 075a0ba15d7cd..aaa4d5617eb4f 100644
--- a/mlir/lib/IR/SymbolTable.cpp
+++ b/mlir/lib/IR/SymbolTable.cpp
@@ -1100,7 +1100,7 @@ void SymbolUserMap::replaceAllUsesWith(Operation *symbol,
   if (newSymbol != symbol) {
     // Transfer over the users to the new symbol.  The reference to the old one
     // is fetched again as the iterator is invalidated during the insertion.
-    auto newIt = symbolToUsers.try_emplace(newSymbol, SetVector<Operation *>{});
+    auto newIt = symbolToUsers.try_emplace(newSymbol);
     auto oldIt = symbolToUsers.find(symbol);
     assert(oldIt != symbolToUsers.end() && "missing old users list");
     if (newIt.second)
diff --git a/mlir/lib/Transforms/Utils/CFGToSCF.cpp b/mlir/lib/Transforms/Utils/CFGToSCF.cpp
index de380fc325f55..7c1781044d2a2 100644
--- a/mlir/lib/Transforms/Utils/CFGToSCF.cpp
+++ b/mlir/lib/Transforms/Utils/CFGToSCF.cpp
@@ -709,7 +709,7 @@ transformToReduceLoop(Block *loopHeader, Block *exitBlock,
     llvm::SmallDenseMap<Block *, bool> dominanceCache;
     // Returns true if `loopBlock` dominates `block`.
     auto loopBlockDominates = [&](Block *block) {
-      auto [iter, inserted] = dominanceCache.insert({block, false});
+      auto [iter, inserted] = dominanceCache.try_emplace(block);
       if (!inserted)
         return iter->second;
       iter->second = dominanceInfo.dominates(loopBlock, block);

From ad2a2b8eed2f3ed1e050833ea8a8d88b0878c6a7 Mon Sep 17 00:00:00 2001
From: Paul Kirth <paulkirth@google.com>
Date: Wed, 11 Jun 2025 13:05:21 -0700
Subject: [PATCH 139/851] [llvm] Add a tool to check mustache compliance
 against the public spec (#142813)

This is a cli tool to that tests the conformance of LLVM's mustache
implementation against the public Mustache spec, hosted at
https://github.com/mustache/spec. This is a revised version of the
patches in #111487.

Co-authored-by: Peter Chou <peter.chou@mail.utoronto.ca>
---
 llvm/CMakeLists.txt                           |   1 +
 llvm/docs/CommandGuide/index.rst              |   1 +
 .../CommandGuide/llvm-test-mustache-spec.rst  |  37 +++
 .../llvm-test-mustache-spec/CMakeLists.txt    |   5 +
 .../llvm-test-mustache-spec.cpp               | 268 ++++++++++++++++++
 5 files changed, 312 insertions(+)
 create mode 100644 llvm/docs/CommandGuide/llvm-test-mustache-spec.rst
 create mode 100644 llvm/utils/llvm-test-mustache-spec/CMakeLists.txt
 create mode 100644 llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 206f009b45f59..cfb67472aa71e 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1313,6 +1313,7 @@ if( LLVM_INCLUDE_UTILS )
   add_subdirectory(utils/yaml-bench)
   add_subdirectory(utils/split-file)
   add_subdirectory(utils/mlgo-utils)
+  add_subdirectory(utils/llvm-test-mustache-spec)
   if( LLVM_INCLUDE_TESTS )
     set(LLVM_SUBPROJECT_TITLE "Third-Party/Google Test")
     add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst
index 643951eca2a26..88fc1fd326b76 100644
--- a/llvm/docs/CommandGuide/index.rst
+++ b/llvm/docs/CommandGuide/index.rst
@@ -87,6 +87,7 @@ Developer Tools
    llvm-exegesis
    llvm-ifs
    llvm-locstats
+   llvm-test-mustache-spec
    llvm-pdbutil
    llvm-profgen
    llvm-tli-checker
diff --git a/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst b/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst
new file mode 100644
index 0000000000000..8cd5a349e7e49
--- /dev/null
+++ b/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst
@@ -0,0 +1,37 @@
+llvm-test-mustache-spec - LLVM tool to test Mustache library compliance
+=======================================================================
+
+.. program:: llvm-test-mustache-spec
+
+SYNOPSIS
+--------
+
+:program:`llvm-test-mustache-spec` [*inputs...*]
+
+Description
+-----------
+
+``llvm-test-mustache-spec`` tests the mustache spec conformance of the LLVM
+mustache library. The spec can be found here: https://github.com/mustache/spec
+
+To test against the spec, simply download the spec and pass the test JSON files
+to the driver. Each spec file should have a list of tests for compliance with
+the spec. These are loaded as test cases, and rendered with our Mustache
+implementation, which is then compared against the expected output from the
+spec.
+
+The current implementation only supports non-optional parts of the spec, so
+we do not expect any of the dynamic-names, inheritance, or lambda tests to
+pass. Additionally, Triple Mustache is not supported. Unsupported tests are
+marked as XFail and are removed from the XFail list as they are fixed.
+
+The tool prints the number of test failures and successes in each of the test
+files to standard output.
+
+EXAMPLE
+-------
+
+.. code-block:: console
+
+   $ llvm-test-mustache-spec path/to/specs/\*.json
+
diff --git a/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt b/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt
new file mode 100644
index 0000000000000..dc1aa73371ffc
--- /dev/null
+++ b/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_llvm_utility(llvm-test-mustache-spec
+  llvm-test-mustache-spec.cpp
+)
+
+target_link_libraries(llvm-test-mustache-spec PRIVATE LLVMSupport)
diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
new file mode 100644
index 0000000000000..28ed1b876672d
--- /dev/null
+++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
@@ -0,0 +1,268 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple drivers to test the mustache spec found at:
+// https://github.com/mustache/spec
+//
+// It is used to verify that the current implementation conforms to the spec.
+// Simply download the spec and pass the test JSON files to the driver. Each
+// spec file should have a list of tests for compliance with the spec. These
+// are loaded as test cases, and rendered with our Mustache implementation,
+// which is then compared against the expected output from the spec.
+//
+// The current implementation only supports non-optional parts of the spec, so
+// we do not expect any of the dynamic-names, inheritance, or lambda tests to
+// pass. Additionally, Triple Mustache is not supported. Unsupported tests are
+// marked as XFail and are removed from the XFail list as they are fixed.
+//
+// Usage:
+//  llvm-test-mustache-spec path/to/test/file.json path/to/test/file2.json ...
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Mustache.h"
+#include "llvm/Support/Path.h"
+#include <string>
+
+using namespace llvm;
+using namespace llvm::json;
+using namespace llvm::mustache;
+
+#define DEBUG_TYPE "llvm-test-mustache-spec"
+
+static cl::OptionCategory Cat("llvm-test-mustache-spec Options");
+
+static cl::list<std::string>
+    InputFiles(cl::Positional, cl::desc("<input files>"), cl::OneOrMore);
+
+static cl::opt<bool> ReportErrors("report-errors",
+                                  cl::desc("Report errors in spec tests"),
+                                  cl::cat(Cat));
+
+static ExitOnError ExitOnErr;
+
+static int NumXFail = 0;
+static int NumSuccess = 0;
+
+static const StringMap<StringSet<>> XFailTestNames = {{
+    {"delimiters.json",
+     {
+         "Pair Behavior",
+         "Special Characters",
+         "Sections",
+         "Inverted Sections",
+         "Partial Inheritence",
+         "Post-Partial Behavior",
+         "Standalone Tag",
+         "Indented Standalone Tag",
+         "Standalone Line Endings",
+         "Standalone Without Previous Line",
+         "Standalone Without Newline",
+     }},
+    {"~dynamic-names.json",
+     {
+         "Basic Behavior - Partial",
+         "Basic Behavior - Name Resolution",
+         "Context",
+         "Dotted Names",
+         "Dotted Names - Failed Lookup",
+         "Dotted names - Context Stacking",
+         "Dotted names - Context Stacking Under Repetition",
+         "Dotted names - Context Stacking Failed Lookup",
+         "Recursion",
+         "Surrounding Whitespace",
+         "Inline Indentation",
+         "Standalone Line Endings",
+         "Standalone Without Previous Line",
+         "Standalone Without Newline",
+         "Standalone Indentation",
+         "Padding Whitespace",
+     }},
+    {"~inheritance.json",
+     {
+         "Default",
+         "Variable",
+         "Triple Mustache",
+         "Sections",
+         "Negative Sections",
+         "Mustache Injection",
+         "Inherit",
+         "Overridden content",
+         "Data does not override block default",
+         "Two overridden parents",
+         "Override parent with newlines",
+         "Inherit indentation",
+         "Only one override",
+         "Parent template",
+         "Recursion",
+         "Multi-level inheritance, no sub child",
+         "Text inside parent",
+         "Text inside parent",
+         "Block scope",
+         "Standalone parent",
+         "Standalone block",
+         "Block reindentation",
+         "Intrinsic indentation",
+         "Nested block reindentation",
+
+     }},
+    {"~lambdas.json",
+     {
+         "Interpolation",
+         "Interpolation - Expansion",
+         "Interpolation - Alternate Delimiters",
+         "Interpolation - Multiple Calls",
+         "Escaping",
+         "Section",
+         "Section - Expansion",
+         "Section - Alternate Delimiters",
+         "Section - Multiple Calls",
+
+     }},
+    {"interpolation.json",
+     {
+         "Triple Mustache",
+         "Triple Mustache Integer Interpolation",
+         "Triple Mustache Decimal Interpolation",
+         "Triple Mustache Null Interpolation",
+         "Triple Mustache Context Miss Interpolation",
+         "Dotted Names - Triple Mustache Interpolation",
+         "Implicit Iterators - Triple Mustache",
+         "Triple Mustache - Surrounding Whitespace",
+         "Triple Mustache - Standalone",
+         "Triple Mustache With Padding",
+     }},
+    {"partials.json", {"Standalone Indentation"}},
+    {"sections.json", {"Implicit Iterator - Triple mustache"}},
+}};
+
+struct TestData {
+  static Expected<TestData> createTestData(json::Object *TestCase,
+                                           StringRef InputFile) {
+    // If any of the needed elements are missing, we cannot continue.
+    // NOTE: partials are optional in the test schema.
+    if (!TestCase || !TestCase->getString("template") ||
+        !TestCase->getString("expected") || !TestCase->getString("name") ||
+        !TestCase->get("data"))
+      return createStringError(
+          llvm::inconvertibleErrorCode(),
+          "invalid JSON schema in test file: " + InputFile + "\n");
+
+    return TestData{TestCase->getString("template").value(),
+                    TestCase->getString("expected").value(),
+                    TestCase->getString("name").value(), TestCase->get("data"),
+                    TestCase->get("partials")};
+  }
+
+  TestData() = default;
+
+  StringRef TemplateStr;
+  StringRef ExpectedStr;
+  StringRef Name;
+  Value *Data;
+  Value *Partials;
+};
+
+static void reportTestFailure(const TestData &TD, StringRef ActualStr,
+                              bool IsXFail) {
+  LLVM_DEBUG(dbgs() << "Template: " << TD.TemplateStr << "\n");
+  if (TD.Partials) {
+    LLVM_DEBUG(dbgs() << "Partial: ");
+    LLVM_DEBUG(TD.Partials->print(dbgs()));
+    LLVM_DEBUG(dbgs() << "\n");
+  }
+  LLVM_DEBUG(dbgs() << "JSON Data: ");
+  LLVM_DEBUG(TD.Data->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+  outs() << formatv("Test {}: {}\n", (IsXFail ? "XFailed" : "Failed"), TD.Name);
+  if (ReportErrors) {
+    outs() << "  Expected: \'" << TD.ExpectedStr << "\'\n"
+           << "  Actual: \'" << ActualStr << "\'\n"
+           << " ====================\n";
+  }
+}
+
+static void registerPartials(Value *Partials, Template &T) {
+  if (!Partials)
+    return;
+  for (const auto &[Partial, Str] : *Partials->getAsObject())
+    T.registerPartial(Partial.str(), Str.getAsString()->str());
+}
+
+static json::Value readJsonFromFile(StringRef &InputFile) {
+  std::unique_ptr<MemoryBuffer> Buffer =
+      ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(InputFile)));
+  return ExitOnErr(parse(Buffer->getBuffer()));
+}
+
+static bool isTestXFail(StringRef FileName, StringRef TestName) {
+  auto P = llvm::sys::path::filename(FileName);
+  auto It = XFailTestNames.find(P);
+  return It != XFailTestNames.end() && It->second.contains(TestName);
+}
+
+static bool evaluateTest(StringRef &InputFile, TestData &TestData,
+                         std::string &ActualStr) {
+  bool IsXFail = isTestXFail(InputFile, TestData.Name);
+  bool Matches = TestData.ExpectedStr == ActualStr;
+  if ((Matches && IsXFail) || (!Matches && !IsXFail)) {
+    reportTestFailure(TestData, ActualStr, IsXFail);
+    return false;
+  }
+  IsXFail ? NumXFail++ : NumSuccess++;
+  return true;
+}
+
+static void runTest(StringRef InputFile) {
+  NumXFail = 0;
+  NumSuccess = 0;
+  outs() << "Running Tests: " << InputFile << "\n";
+  json::Value Json = readJsonFromFile(InputFile);
+
+  json::Object *Obj = Json.getAsObject();
+  Array *TestArray = Obj->getArray("tests");
+  // Even though we parsed the JSON, it can have a bad format, so check it.
+  if (!TestArray)
+    ExitOnErr(createStringError(
+        llvm::inconvertibleErrorCode(),
+        "invalid JSON schema in test file: " + InputFile + "\n"));
+
+  const size_t Total = TestArray->size();
+
+  for (Value V : *TestArray) {
+    auto TestData =
+        ExitOnErr(TestData::createTestData(V.getAsObject(), InputFile));
+    Template T(TestData.TemplateStr);
+    registerPartials(TestData.Partials, T);
+
+    std::string ActualStr;
+    raw_string_ostream OS(ActualStr);
+    T.render(*TestData.Data, OS);
+    evaluateTest(InputFile, TestData, ActualStr);
+  }
+
+  const int NumFailed = Total - NumSuccess - NumXFail;
+  outs() << formatv("===Results===\n"
+                    " Suceeded: {}\n"
+                    " Expectedly Failed: {}\n"
+                    " Failed: {}\n"
+                    " Total: {}\n",
+                    NumSuccess, NumXFail, NumFailed, Total);
+}
+
+int main(int argc, char **argv) {
+  ExitOnErr.setBanner(std::string(argv[0]) + " error: ");
+  cl::ParseCommandLineOptions(argc, argv);
+  for (const auto &FileName : InputFiles)
+    runTest(FileName);
+  return 0;
+}

From e7e491f6ee2baee4e2ab2947e1c64bc54e3ebbec Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 11 Jun 2025 13:06:22 -0700
Subject: [PATCH 140/851] [SelectionDAG] Add ISD::VSELECT to
 SelectionDAG::canCreateUndefOrPoison. (#143760)

---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  1 +
 .../RISCV/rvv/combine-reduce-add-to-vcpop.ll  | 69 +++++++++----------
 .../CodeGen/RISCV/rvv/vector-interleave.ll    | 16 ++---
 .../test/CodeGen/X86/avx10_2_512bf16-arith.ll |  2 +-
 llvm/test/CodeGen/X86/avx10_2bf16-arith.ll    |  4 +-
 5 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4fc026ca562ba..45a37622a531b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5553,6 +5553,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::BUILD_VECTOR:
   case ISD::BUILD_PAIR:
   case ISD::SPLAT_VECTOR:
+  case ISD::VSELECT:
     return false;
 
   case ISD::SELECT_CC:
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
index 88894f887cc20..5dc532273b770 100644
--- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll
@@ -313,12 +313,12 @@ define i32 @test_nxv128i1(<vscale x 128 x i1> %x) {
 ; CHECK-NEXT:    vslidedown.vx v0, v6, a0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v6, v7, a1
+; CHECK-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v7, a0
 ; CHECK-NEXT:    vslidedown.vx v5, v6, a0
-; CHECK-NEXT:    vslidedown.vx v4, v7, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v4
 ; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v5
 ; CHECK-NEXT:    vadd.vi v16, v16, 1, v0.t
@@ -364,9 +364,9 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    vmv1r.v v7, v9
 ; CHECK-NEXT:    vmv1r.v v5, v8
 ; CHECK-NEXT:    vmv1r.v v4, v0
-; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a2, a0
@@ -376,7 +376,7 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v5
-; CHECK-NEXT:    vmerge.vim v8, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add a0, sp, a0
@@ -388,9 +388,8 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    vslidedown.vx v3, v4, a0
 ; CHECK-NEXT:    vslidedown.vx v2, v5, a0
 ; CHECK-NEXT:    vmv.v.v v0, v3
-; CHECK-NEXT:    vmv8r.v v8, v16
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    mv a3, a2
@@ -398,42 +397,43 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    add a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v2
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v3, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
+; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v2, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vim v24, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v16, v24, 1, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v4, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vim v16, v8, 1, v0
+; CHECK-NEXT:    vmerge.vim v8, v24, 1, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v5, a1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmerge.vim v24, v24, 1, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v6, a1
 ; CHECK-NEXT:    vslidedown.vx v5, v7, a1
-; CHECK-NEXT:    vslidedown.vx v4, v6, a1
 ; CHECK-NEXT:    vsetvli a2, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v4
-; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vadd.vi v24, v24, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v5
-; CHECK-NEXT:    vadd.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vadd.vv v8, v8, v24
 ; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
@@ -443,7 +443,7 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    vslidedown.vx v0, v4, a1
 ; CHECK-NEXT:    vslidedown.vx v3, v5, a1
 ; CHECK-NEXT:    vsetvli a0, zero, e32, m8, ta, mu
-; CHECK-NEXT:    vadd.vi v24, v24, 1, v0.t
+; CHECK-NEXT:    vadd.vi v16, v16, 1, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v3
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -451,7 +451,7 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vadd.vv v8, v8, v24
+; CHECK-NEXT:    vadd.vv v8, v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
@@ -492,16 +492,16 @@ define i32 @test_nxv256i1(<vscale x 256 x i1> %x) {
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    vadd.vi v24, v24, 1, v0.t
-; CHECK-NEXT:    vadd.vv v24, v24, v8
+; CHECK-NEXT:    vadd.vv v0, v24, v8
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v0, (a0) # vscale x 64-byte Folded Reload
-; CHECK-NEXT:    vadd.vv v8, v8, v0
-; CHECK-NEXT:    vadd.vv v16, v24, v16
+; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; CHECK-NEXT:    vadd.vv v8, v8, v24
+; CHECK-NEXT:    vadd.vv v16, v0, v16
 ; CHECK-NEXT:    vadd.vv v8, v16, v8
 ; CHECK-NEXT:    vmv.s.x v16, zero
 ; CHECK-NEXT:    vredsum.vs v8, v8, v16
@@ -537,18 +537,17 @@ entry:
 define i16 @test_narrow_nxv64i1(<vscale x 64 x i1> %x) {
 ; CHECK-LABEL: test_narrow_nxv64i1:
 ; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v16, 0
+; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v0, a0
+; CHECK-NEXT:    vslidedown.vx v0, v0, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m8, ta, mu
-; CHECK-NEXT:    vmerge.vim v16, v16, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vadd.vi v16, v16, 1, v0.t
-; CHECK-NEXT:    vmv.s.x v8, zero
-; CHECK-NEXT:    vredsum.vs v8, v16, v8
+; CHECK-NEXT:    vadd.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vmv.s.x v16, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v16
 ; CHECK-NEXT:    vmv.x.s a0, v8
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 77723609a60c7..e297e88c71f1b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -260,18 +260,18 @@ define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1
 ; ZIP-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; ZIP-NEXT:    vmv1r.v v9, v0
 ; ZIP-NEXT:    vmv1r.v v0, v8
-; ZIP-NEXT:    vmv.v.i v16, 0
-; ZIP-NEXT:    vmerge.vim v24, v16, 1, v0
+; ZIP-NEXT:    vmv.v.i v24, 0
+; ZIP-NEXT:    vmerge.vim v16, v24, 1, v0
 ; ZIP-NEXT:    vmv1r.v v0, v9
-; ZIP-NEXT:    vmerge.vim v8, v16, 1, v0
+; ZIP-NEXT:    vmerge.vim v8, v24, 1, v0
 ; ZIP-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZIP-NEXT:    ri.vzip2b.vv v4, v8, v24
-; ZIP-NEXT:    ri.vzip2b.vv v20, v12, v28
-; ZIP-NEXT:    ri.vzip2a.vv v0, v8, v24
-; ZIP-NEXT:    ri.vzip2a.vv v16, v12, v28
+; ZIP-NEXT:    ri.vzip2b.vv v4, v8, v16
+; ZIP-NEXT:    ri.vzip2b.vv v28, v12, v20
+; ZIP-NEXT:    ri.vzip2a.vv v0, v8, v16
+; ZIP-NEXT:    ri.vzip2a.vv v24, v12, v20
 ; ZIP-NEXT:    vsetvli a0, zero, e8, m8, ta, ma
 ; ZIP-NEXT:    vmsne.vi v9, v0, 0
-; ZIP-NEXT:    vmsne.vi v8, v16, 0
+; ZIP-NEXT:    vmsne.vi v8, v24, 0
 ; ZIP-NEXT:    vmv1r.v v0, v9
 ; ZIP-NEXT:    ret
   %res = call <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b)
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 1e2cf4956bd08..c22a394e6c4e0 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 42831a453cb1d..435f67a0f1e4b 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
 ; X86-NEXT:    vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
 ; X86-NEXT:    vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
 ; X86-NEXT:    vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]

From 5623b7f2d56ecba84de5d62444feed2dea2b7e25 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 11 Jun 2025 21:08:35 +0100
Subject: [PATCH 141/851] [LV] Use GeneratedRTChecks to check if safety checks
 were added (NFC).

Directly check via GeneratedRTChecks if any checks have been added,
instead of needing to go through ILV. This simplifies the code and
enables further refactoring in follow-up patches.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 23 ++++++++-----------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2a237f42e4042..d236111836391 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -505,9 +505,6 @@ class InnerLoopVectorizer {
   /// Fix the vectorized code, taking care of header phi's, and more.
   void fixVectorizedLoop(VPTransformState &State);
 
-  // Return true if any runtime check is added.
-  bool areSafetyChecksAdded() { return AddedSafetyChecks; }
-
   /// Fix the non-induction PHIs in \p Plan.
   void fixNonInductionPHIs(VPTransformState &State);
 
@@ -620,9 +617,6 @@ class InnerLoopVectorizer {
   /// The profitablity analysis.
   LoopVectorizationCostModel *Cost;
 
-  // Record whether runtime checks are added.
-  bool AddedSafetyChecks = false;
-
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
@@ -1777,6 +1771,9 @@ class GeneratedRTChecks {
   /// they have been used.
   Value *MemRuntimeCheckCond = nullptr;
 
+  /// True if any checks have been added.
+  bool AddedAnyChecks = false;
+
   DominatorTree *DT;
   LoopInfo *LI;
   TargetTransformInfo *TTI;
@@ -2038,9 +2035,9 @@ class GeneratedRTChecks {
     if (AddBranchWeights)
       setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
-
     // Mark the check as used, to prevent it from being removed during cleanup.
     SCEVCheckCond = nullptr;
+    AddedAnyChecks = true;
     return SCEVCheckBlock;
   }
 
@@ -2070,8 +2067,12 @@ class GeneratedRTChecks {
 
     // Mark the check as used, to prevent it from being removed during cleanup.
     MemRuntimeCheckCond = nullptr;
+    AddedAnyChecks = true;
     return MemCheckBlock;
   }
+
+  /// Return true if any runtime checks have been added
+  bool hasChecks() const { return AddedAnyChecks; }
 };
 } // namespace
 
@@ -2459,7 +2460,6 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
   assert((!Cost->OptForSize ||
           Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
          "Cannot SCEV check stride or overflow when optimizing for size");
-  AddedSafetyChecks = true;
 
   introduceCheckBlockInVPlan(SCEVCheckBlock);
   return SCEVCheckBlock;
@@ -2494,9 +2494,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
     });
   }
 
-
-  AddedSafetyChecks = true;
-
   introduceCheckBlockInVPlan(MemCheckBlock);
   return MemCheckBlock;
 }
@@ -10287,7 +10284,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         }
         ++LoopsEpilogueVectorized;
 
-        if (!MainILV.areSafetyChecksAdded())
+        if (!Checks.hasChecks())
           DisableRuntimeUnroll = true;
       } else {
         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
@@ -10299,7 +10296,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         // Add metadata to disable runtime unrolling a scalar loop when there
         // are no runtime checks about strides and memory. A scalar loop that is
         // rarely used is not worth unrolling.
-        if (!LB.areSafetyChecksAdded())
+        if (!Checks.hasChecks())
           DisableRuntimeUnroll = true;
       }
       // Report the vectorization decision.

From c70658e32debfc3b2c0f6c2b2228ac48e976fd51 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 11 Jun 2025 13:09:05 -0700
Subject: [PATCH 142/851] [bazel] port 5dafe9dca867b90f20dcd71c620ad823aee4262b

---
 .../llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel    | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index 40f672d8099f1..610978059d7e6 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -107,6 +107,7 @@ libc_test(
     deps = [
         "//libc:__support_fputil_fp_bits",
         "//libc:atof",
+        "//libc/test/UnitTest:errno_test_helpers",
     ],
 )
 
@@ -206,6 +207,7 @@ libc_test_library(
         "//libc:__support_macros_properties_architectures",
         "//libc:errno",
         "//libc/test/UnitTest:LibcUnitTest",
+        "//libc/test/UnitTest:errno_test_helpers",
     ],
 )
 
@@ -251,6 +253,7 @@ libc_test(
     deps = [
         "//libc:__support_fputil_fp_bits",
         "//libc:strtof",
+        "//libc/test/UnitTest:errno_test_helpers",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
 )
@@ -261,6 +264,7 @@ libc_test(
     deps = [
         "//libc:__support_fputil_fp_bits",
         "//libc:strtod",
+        "//libc/test/UnitTest:errno_test_helpers",
         "//libc/test/UnitTest:fp_test_helpers",
     ],
 )
@@ -272,5 +276,6 @@ libc_test(
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_uint128",
         "//libc:strtold",
+        "//libc/test/UnitTest:errno_test_helpers",
     ],
 )

From 52583b3ed7dd39788360361fc1e21039c8eb5479 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Wed, 11 Jun 2025 20:11:31 +0000
Subject: [PATCH 143/851] [libc] Character converter skeleton class (#143619)

Made CharacterConverter class skeleton
---
 libc/hdr/types/char32_t.h                     | 22 ++++++
 libc/hdr/types/char8_t.h                      | 22 ++++++
 libc/hdr/uchar_overlay.h                      | 69 +++++++++++++++++++
 libc/src/__support/wchar/CMakeLists.txt       | 26 +++++++
 .../__support/wchar/character_converter.cpp   | 32 +++++++++
 .../src/__support/wchar/character_converter.h | 39 +++++++++++
 libc/src/__support/wchar/mbstate.h            | 27 ++++++++
 libc/src/__support/wchar/utf_ret.h            | 21 ++++++
 8 files changed, 258 insertions(+)
 create mode 100644 libc/hdr/types/char32_t.h
 create mode 100644 libc/hdr/types/char8_t.h
 create mode 100644 libc/hdr/uchar_overlay.h
 create mode 100644 libc/src/__support/wchar/CMakeLists.txt
 create mode 100644 libc/src/__support/wchar/character_converter.cpp
 create mode 100644 libc/src/__support/wchar/character_converter.h
 create mode 100644 libc/src/__support/wchar/mbstate.h
 create mode 100644 libc/src/__support/wchar/utf_ret.h

diff --git a/libc/hdr/types/char32_t.h b/libc/hdr/types/char32_t.h
new file mode 100644
index 0000000000000..94fe5747d3415
--- /dev/null
+++ b/libc/hdr/types/char32_t.h
@@ -0,0 +1,22 @@
+//===-- Definition of char32_t.h ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_CHAR32_T_H
+#define LLVM_LIBC_HDR_TYPES_CHAR32_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/char32_t.h"
+
+#else // overlay mode
+
+#include "hdr/uchar_overlay.h"
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_CHAR32_T_H
diff --git a/libc/hdr/types/char8_t.h b/libc/hdr/types/char8_t.h
new file mode 100644
index 0000000000000..31de764658f9e
--- /dev/null
+++ b/libc/hdr/types/char8_t.h
@@ -0,0 +1,22 @@
+//===-- Definition of char8_t.h -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_TYPES_CHAR8_T_H
+#define LLVM_LIBC_HDR_TYPES_CHAR8_T_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-types/char8_t.h"
+
+#else // overlay mode
+
+#include "hdr/uchar_overlay.h"
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_TYPES_CHAR8_T_H
diff --git a/libc/hdr/uchar_overlay.h b/libc/hdr/uchar_overlay.h
new file mode 100644
index 0000000000000..44ed3d48c6c1d
--- /dev/null
+++ b/libc/hdr/uchar_overlay.h
@@ -0,0 +1,69 @@
+//===-- Including uchar.h in overlay mode ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_UCHAR_OVERLAY_H
+#define LLVM_LIBC_HDR_UCHAR_OVERLAY_H
+
+#ifdef LIBC_FULL_BUILD
+#error "This header should only be included in overlay mode"
+#endif
+
+// Overlay mode
+
+// glibc <uchar.h> header might provide extern inline definitions for few
+// functions, causing external alias errors.  They are guarded by
+// `__USE_EXTERN_INLINES` macro.  We temporarily disable `__USE_EXTERN_INLINES`
+// macro by defining `__NO_INLINE__` before including <uchar.h>.
+// And the same with `__USE_FORTIFY_LEVEL`, which will be temporarily disabled
+// with `_FORTIFY_SOURCE`.
+
+#ifdef _FORTIFY_SOURCE
+#define LIBC_OLD_FORTIFY_SOURCE _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+
+#ifndef __NO_INLINE__
+#define __NO_INLINE__ 1
+#define LIBC_SET_NO_INLINE
+#endif
+
+#ifdef __USE_EXTERN_INLINES
+#define LIBC_OLD_USE_EXTERN_INLINES
+#undef __USE_EXTERN_INLINES
+#endif
+
+#ifdef __USE_FORTIFY_LEVEL
+#define LIBC_OLD_USE_FORTIFY_LEVEL __USE_FORTIFY_LEVEL
+#undef __USE_FORTIFY_LEVEL
+#define __USE_FORTIFY_LEVEL 0
+#endif
+
+#include <uchar.h>
+
+#ifdef LIBC_OLD_FORTIFY_SOURCE
+#define _FORTIFY_SOURCE LIBC_OLD_FORTIFY_SOURCE
+#undef LIBC_OLD_FORTIFY_SOURCE
+#endif
+
+#ifdef LIBC_SET_NO_INLINE
+#undef __NO_INLINE__
+#undef LIBC_SET_NO_INLINE
+#endif
+
+#ifdef LIBC_OLD_USE_FORTIFY_LEVEL
+#undef __USE_FORTIFY_LEVEL
+#define __USE_FORTIFY_LEVEL LIBC_OLD_USE_FORTIFY_LEVEL
+#undef LIBC_OLD_USE_FORTIFY_LEVEL
+#endif
+
+#ifdef LIBC_OLD_USE_EXTERN_INLINES
+#define __USE_EXTERN_INLINES
+#undef LIBC_OLD_USE_EXTERN_INLINES
+#endif
+
+#endif // LLVM_LIBC_HDR_UCHAR_OVERLAY_H
diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..5cca58400ff45
--- /dev/null
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,26 @@
+add_header_library(
+  mbstate
+  HDRS
+    mbstate.h
+  DEPENDS
+    libc.hdr.types.char32_t    
+)
+
+add_object_library(
+  character_converter
+  HDRS
+    character_converter.h
+  SRCS 
+    character_converter.cpp
+  DEPENDS
+    libc.hdr.types.char8_t
+    libc.hdr.types.char32_t
+    .mbstate
+    .utf_ret
+)
+
+add_header_library(
+  utf_ret
+  HDRS
+    utf_ret.h
+)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
new file mode 100644
index 0000000000000..0afc2a6f59e64
--- /dev/null
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -0,0 +1,32 @@
+//===-- Implementation of a class for conversion --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+
+#include "character_converter.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; }
+
+bool CharacterConverter::isComplete() {}
+
+int CharacterConverter::push(char8_t utf8_byte) {}
+
+int CharacterConverter::push(char32_t utf32) {}
+
+utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+
+utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
new file mode 100644
index 0000000000000..a6bac43805376
--- /dev/null
+++ b/libc/src/__support/wchar/character_converter.h
@@ -0,0 +1,39 @@
+//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H
+#define LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H
+
+#include "hdr/types/char32_t.h"
+#include "hdr/types/char8_t.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+class CharacterConverter {
+private:
+  mbstate_t *state;
+
+public:
+  CharacterConverter(mbstate_t *mbstate);
+
+  bool isComplete();
+
+  int push(char8_t utf8_byte);
+  int push(char32_t utf32);
+
+  utf_ret<char8_t> pop_utf8();
+  utf_ret<char32_t> pop_utf32();
+};
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
new file mode 100644
index 0000000000000..72ec727560003
--- /dev/null
+++ b/libc/src/__support/wchar/mbstate.h
@@ -0,0 +1,27 @@
+//===-- Definition of mbstate-----------------------------------*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
+#define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
+
+#include "hdr/types/char32_t.h"
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+struct mbstate {
+  char32_t partial;
+  uint8_t bits_processed;
+  uint8_t total_bytes;
+};
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
new file mode 100644
index 0000000000000..b8a8f6f094143
--- /dev/null
+++ b/libc/src/__support/wchar/utf_ret.h
@@ -0,0 +1,21 @@
+//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
+#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
+
+namespace LIBC_NAMESPACE_DECL {
+
+template <typename T> struct utf_ret {
+  T out;
+  int error;
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H

From a2d2941830d9c141d7f43da1ff58e7b7235a9f7d Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Wed, 11 Jun 2025 13:12:37 -0700
Subject: [PATCH 144/851] [lldb][RPC] Upstream LLDB to RPC converstion Python
 script (#138028)

As part of upstreaming LLDB RPC, this commit adds a python script that
is used by LLDB RPC to modify the public lldb header files for use with
RPC.

https://discourse.llvm.org/t/rfc-upstreaming-lldb-rpc/85804
---
 .../convert-lldb-header-to-rpc-header.py      | 108 ++++++++++++++++++
 .../TestConvertScript/CheckLLDBDefines.test   |  22 ++++
 .../CheckLLDBEnumerations.test                |  17 +++
 .../TestConvertScript/CheckLLDBTypes.test     |  24 ++++
 .../TestConvertScript/CheckSBDefines.test     |  22 ++++
 .../TestConvertScript/Inputs/SBDefines.h      |  22 ++++
 .../TestConvertScript/Inputs/lldb-defines.h   |  23 ++++
 .../Inputs/lldb-enumerations.h                |  17 +++
 .../TestConvertScript/Inputs/lldb-types.h     |  23 ++++
 9 files changed, 278 insertions(+)
 create mode 100755 lldb/scripts/convert-lldb-header-to-rpc-header.py
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h
 create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h

diff --git a/lldb/scripts/convert-lldb-header-to-rpc-header.py b/lldb/scripts/convert-lldb-header-to-rpc-header.py
new file mode 100755
index 0000000000000..d7734280076ff
--- /dev/null
+++ b/lldb/scripts/convert-lldb-header-to-rpc-header.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+Usage: convert-lldb-header-to-rpc-header.py <path/to/input-header.h> <path/to/output-header.h>
+
+This scripts takes common LLDB headers (such as lldb-defines.h) and replaces references to LLDB
+with those for RPC. This happens for:
+- namespace definitions
+- namespace usage
+- version string macros
+- ifdef/ifndef lines
+"""
+
+import argparse
+import os
+import re
+
+
+INCLUDES_TO_REMOVE_REGEX = re.compile(
+    r'#include "lldb/lldb-forward.h"|#include "lldb/lldb-versioning.h"'
+)
+LLDB_GUARD_REGEX = re.compile(r"(?P<guard_type>#.+)LLDB_LLDB_\s*", re.M)
+LLDB_API_GUARD_REGEX = re.compile(r"(?P<guard_type>#.+)LLDB_API_\s*", re.M)
+LLDB_VERSION_REGEX = re.compile(r"#define LLDB_VERSION", re.M)
+LLDB_REVISION_REGEX = re.compile(r"#define LLDB_REVISION", re.M)
+LLDB_VERSION_STRING_REGEX = re.compile(r"#define LLDB_VERSION_STRING", re.M)
+LLDB_LOCAL_INCLUDE_REGEX = re.compile(r'#include "lldb/lldb-\s*', re.M)
+LLDB_NAMESPACE_DEFINITION_REGEX = re.compile(
+    r"(?P<comment_marker>//\s*){,1}namespace lldb\s{1}", re.M
+)
+LLDB_NAMESPACE_REGEX = re.compile(r"\s*.+lldb::\s*", re.M)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input")
+    parser.add_argument("output")
+    args = parser.parse_args()
+    input_path = str(args.input)
+    output_path = str(args.output)
+    with open(input_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+
+    with open(output_path, "w") as output_file:
+        # NOTE: We do not use lldb-forward.h or lldb-versioning.h in RPC, so remove
+        # all includes that are found for these files.
+        file_buffer = re.sub(INCLUDES_TO_REMOVE_REGEX, r"", file_buffer)
+
+        # For lldb-rpc-defines.h, replace the ifndef LLDB_LLDB_ portion with LLDB_RPC_ as we're not
+        # using LLDB private definitions in RPC.
+        lldb_guard_matches = LLDB_GUARD_REGEX.finditer(file_buffer)
+        for match in lldb_guard_matches:
+            file_buffer = re.sub(
+                match.group(),
+                r"{0}LLDB_RPC_".format(match.group("guard_type")),
+                file_buffer,
+            )
+
+        # Similarly to lldb-rpc-defines.h, replace the ifndef for LLDB_API in SBDefines.h to LLDB_RPC_API_ for the same reason.
+        lldb_api_guard_matches = LLDB_API_GUARD_REGEX.finditer(file_buffer)
+        for match in lldb_api_guard_matches:
+            file_buffer = re.sub(
+                match.group(),
+                r"{0}LLDB_RPC_API_".format(match.group("guard_type")),
+                file_buffer,
+            )
+
+        # Replace the references for the macros that define the versioning strings in
+        # lldb-rpc-defines.h.
+        # NOTE: Here we assume that the versioning info has already been uncommented and
+        # populated from the original lldb-defines.h.
+        file_buffer = re.sub(
+            LLDB_VERSION_REGEX, r"#define LLDB_RPC_VERSION", file_buffer
+        )
+        file_buffer = re.sub(
+            LLDB_REVISION_REGEX, r"#define LLDB_RPC_REVISION", file_buffer
+        )
+        file_buffer = re.sub(
+            LLDB_VERSION_STRING_REGEX, r"#define LLDB_RPC_VERSION_STRING", file_buffer
+        )
+
+        # For local #includes
+        file_buffer = re.sub(
+            LLDB_LOCAL_INCLUDE_REGEX, r'#include "lldb-rpc-', file_buffer
+        )
+
+        # Rename the lldb namespace definition to lldb-rpc.
+        lldb_rpc_namespace_definition_matches = (
+            LLDB_NAMESPACE_DEFINITION_REGEX.finditer(file_buffer)
+        )
+        for match in lldb_rpc_namespace_definition_matches:
+            comment_marker = (
+                match.group("comment_marker") if match.group("comment_marker") else ""
+            )
+            file_buffer = re.sub(
+                match.group(),
+                r"{0}namespace lldb_rpc ".format(comment_marker),
+                file_buffer,
+            )
+
+        # Rename the lldb namespace definition to lldb-rpc.
+        file_buffer = re.sub(LLDB_NAMESPACE_REGEX, r"lldb_rpc::", file_buffer)
+
+        output_file.write(file_buffer)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test
new file mode 100644
index 0000000000000..0d89d627cfedf
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test
@@ -0,0 +1,22 @@
+RUN: mkdir -p %t/Outputs
+
+# Run the convert script on lldb-defines.h.
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-defines.h %t/Outputs/lldb-rpc-defines.h
+
+# Check the output
+RUN: cat %t/Outputs/lldb-rpc-defines.h | FileCheck %s
+
+# The include guards must change from LLDB_LLDB_DEFINES_H to LLDB_RPC_DEFINES_H.
+CHECK: #ifndef LLDB_RPC_DEFINES_H
+CHECK: #define LLDB_RPC_DEFINES_H
+
+# Includes of other lldb headers must begin with "lldb-rpc-".
+CHECK: #include "lldb-rpc-types.h"
+
+# The version info must be changed from LLDB_VERSION to LLDB_RPC_VERSION
+CHECK: #define LLDB_RPC_VERSION 21
+CHECK: #define LLDB_RPC_REVISION 12
+CHECK: #define LLDB_RPC_VERSION_STRING "21.0.12"
+
+# The comment that closes the include guard should match the guard.
+CHECK: #endif // LLDB_RPC_DEFINES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test
new file mode 100644
index 0000000000000..0fb3c6f73dd0f
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test
@@ -0,0 +1,17 @@
+RUN: mkdir -p %t/Outputs
+
+# Run the convert script on lldb-enumerations.h.
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-enumerations.h %t/Outputs/lldb-rpc-enumerations.h
+
+# Check the output
+RUN: cat %t/Outputs/lldb-rpc-enumerations.h | FileCheck %s
+
+# The include guards must change from LLDB_LLDB_ENUMERATIONS_H to LLDB_RPC_ENUMERATIONS_H.
+CHECK: #ifndef LLDB_RPC_ENUMERATIONS_H
+CHECK: #define LLDB_RPC_ENUMERATIONS_H
+
+# Change the namespace to lldb_rpc. Also, the comment that closes the namespace should match the namespace.
+CHECK: namespace lldb_rpc {} // namespace lldb_rpc
+
+# The comment that closes the include guard should match the guard.
+CHECK: #endif // LLDB_RPC_ENUMERATIONS_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test
new file mode 100644
index 0000000000000..86f2d290209e1
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test
@@ -0,0 +1,24 @@
+RUN: mkdir -p %t/Outputs
+
+# Run the convert script on lldb-types.h.
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-types.h %t/Outputs/lldb-rpc-types.h
+
+# Check the output
+RUN: cat %t/Outputs/lldb-rpc-types.h | FileCheck %s
+
+# The include guards must change from LLDB_LLDB_TYPES_H to LLDB_RPC_TYPES_H.
+CHECK: #ifndef LLDB_RPC_TYPES_H
+CHECK: #define LLDB_RPC_TYPES_H
+
+# Includes of other lldb headers must begin with "lldb-rpc-".
+# Also, the includes for lldb-forward.h should be removed.
+CHECK: #include "lldb-rpc-enumerations.h"
+
+# Change the namespace to lldb_rpc.
+CHECK: namespace lldb_rpc
+
+# The comment that closes the namespace should match the namespace.
+CHECK: // namespace lldb_rpc
+
+# The comment that closes the include guard should match the guard.
+CHECK: #endif // LLDB_RPC_TYPES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test
new file mode 100644
index 0000000000000..72444aaf069a4
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test
@@ -0,0 +1,22 @@
+RUN: mkdir -p %t/Outputs
+
+# Run the convert script on SBDefines.h.
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/SBDefines.h %t/Outputs/SBDefines.h
+
+# Check the output
+RUN: cat %t/Outputs/SBDefines.h | FileCheck %s
+
+# The include guards must change from LLDB_LLDB_API_SBDEFINES_H to LLDB_RPC_API_SBDEFINES_H.
+CHECK: #ifndef LLDB_RPC_API_SBDEFINES_H
+CHECK: #define LLDB_RPC_API_SBDEFINES_H
+
+# Includes of other lldb headers must begin with "lldb-rpc-".
+# Also, the includes for lldb-forward.h and lldb-versioning.h should be removed.
+CHECK: #include "lldb-rpc-defines.h"
+CHECK-NOT: #include "lldb-rpc-forward.h"
+CHECK: #include "lldb-rpc-enumerations.h"
+CHECK: #include "lldb-rpc-types.h"
+CHECK-NOT: #include "lldb-rpc-versioning.h"
+
+# The comment that closes the include guard should match the guard.
+CHECK: #endif // LLDB_RPC_API_SBDEFINES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h
new file mode 100644
index 0000000000000..50476c402ba72
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h
@@ -0,0 +1,22 @@
+// This is a truncated version of SBDefines.h used to test that the script
+// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in
+// the original file to RPC references.
+
+// The include guard should change from LLDB_LLDB to LLDB_RPC.
+// LLDB_API_SBDEFINES_H -> LLDB_RPC_SBDEFINES_H
+#ifndef LLDB_API_SBDEFINES_H
+#define LLDB_API_SBDEFINES_H
+
+// Includes of public main LLDB headers should change to their RPC equivalents:
+// "lldb/lldb-defines.h" -> "lldb-rpc-defines.h"
+// Also, the includes for lldb-forward.h and lldb-versioning.h should be removed.
+#include "lldb/lldb-defines.h"
+#include "lldb/lldb-enumerations.h"
+#include "lldb/lldb-forward.h"
+#include "lldb/lldb-types.h"
+#include "lldb/lldb-versioning.h"
+
+// The comment that closes the include guard must change in the same way
+// the original guard did.
+// #endif // LLDB_API_SBDEFINES_H -> #endif // LLDB_RPC_API_SBDEFINES_H
+#endif // LLDB_API_SBDEFINES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h
new file mode 100644
index 0000000000000..32064430b3d04
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h
@@ -0,0 +1,23 @@
+// This is a truncated version of lldb-defines.h used to test that the script
+// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in
+// the original file to RPC references.
+
+// The include guard should change from LLDB_LLDB to LLDB_RPC.
+// LLDB_LLDB_DEFINES_H -> LLDB_RPC_DEFINES_H
+#ifndef LLDB_LLDB_DEFINES_H
+#define LLDB_LLDB_DEFINES_H
+
+// Includes of public main LLDB headers should change to their RPC equivalents:
+// "lldb/lldb-types.h" -> "lldb-rpc-types.h"
+#include "lldb/lldb-types.h"
+
+// The LLDB version must change from LLDB to LLDB_RPC
+// LLDB_VERSION -> LLDB_RPC_VERSION
+#define LLDB_VERSION 21
+#define LLDB_REVISION 12
+#define LLDB_VERSION_STRING "21.0.12"
+
+// The comment that closes the include guard must change in the same way
+// the original guard did.
+// #endif // LLDB_LLDB_DEFINES_H -> #endif // LLDB_RPC_DEFINES_H
+#endif // LLDB_LLDB_DEFINES_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h
new file mode 100644
index 0000000000000..42c4bb277fc45
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h
@@ -0,0 +1,17 @@
+// This is a truncated version of lldb-enumerations.h used to test that the script
+// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in
+// the original file to RPC references.
+
+// The include guard should change from LLDB_LLDB to LLDB_RPC.
+// LLDB_LLDB_ENUMERATIONS_H -> LLDB_RPC_ENUMERATIONS_H
+#ifndef LLDB_LLDB_ENUMERATIONS_H
+#define LLDB_LLDB_ENUMERATIONS_H
+
+// The namespace definition should change to the lldb_rpc namespace, so should the comment that closes it:
+// namespace lldb -> namespace lldb_rpc
+namespace lldb {} // namespace lldb
+
+// The comment that closes the include guard must change in the same way
+// the original guard did:
+// #endif // LLDB_LLDB_ENUMERATIONS_H -> #endif // LLDB_RPC_ENUMERATIONS_H
+#endif // LLDB_LLDB_ENUMERATIONS_H
diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h
new file mode 100644
index 0000000000000..5a49920405ec6
--- /dev/null
+++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h
@@ -0,0 +1,23 @@
+// This is a truncated version of lldb-types.h used to test that the script
+// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in
+// the original file to RPC references.
+
+// The include guard should change from LLDB_LLDB to LLDB_RPC.
+// LLDB_LLDB_TYPES_H -> LLDB_RPC_TYPES_H
+#ifndef LLDB_LLDB_TYPES_H
+#define LLDB_LLDB_TYPES_H
+
+// Includes of public main LLDB headers should change to their RPC equivalents:
+// "lldb/lldb-defines.h" -> "lldb-rpc-defines.h":
+// Also, the includes for lldb-forward.h should be removed.
+#include "lldb/lldb-enumerations.h"
+#include "lldb/lldb-forward.h"
+
+// The namespace definition should change to the lldb_rpc namespace, so should the comment that closes it:
+// namespace lldb -> namespace lldb_rpc
+namespace lldb {} // namespace lldb
+
+// The comment that closes the include guard must change in the same way
+// the original guard did:
+// #endif // LLDB_LLDB_TYPES_H -> #endif // LLDB_RPC_TYPES_H
+#endif // LLDB_LLDB_TYPES_H

From b42aef5e6f32a3ac6c259cb4cacf58239400b5aa Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Wed, 11 Jun 2025 13:12:59 -0700
Subject: [PATCH 145/851] [flang] Don't duplicate hermetic module file
 dependencies (#143605)

When emitting the modules on which a module depends under the
-fhermetic-module-files options, eliminate duplicates by name rather
than by symbol addresses. This way, when a dependent module is in the
symbol table more than once due to the use of a nested hermetic module,
it doesn't get emitted multiple times to the new module file.
---
 flang/lib/Semantics/mod-file.cpp   | 18 +++++++++------
 flang/test/Semantics/modfile77.F90 | 37 ++++++++++++++++++++++++++++++
 flang/test/Semantics/modfile78.F90 | 33 ++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 7 deletions(-)
 create mode 100644 flang/test/Semantics/modfile77.F90
 create mode 100644 flang/test/Semantics/modfile78.F90

diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index a72641866aa15..9f9e9f5840456 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -143,18 +143,22 @@ void ModFileWriter::Write(const Symbol &symbol) {
   std::string path{context_.moduleDirectory() + '/' +
       ModFileName(symbol.name(), ancestorName, context_.moduleFileSuffix())};
 
-  UnorderedSymbolSet hermeticModules;
-  hermeticModules.insert(symbol);
+  std::set<std::string> hermeticModuleNames;
+  hermeticModuleNames.insert(symbol.name().ToString());
   UnorderedSymbolSet additionalModules;
   PutSymbols(DEREF(symbol.scope()),
       hermeticModuleFileOutput_ ? &additionalModules : nullptr);
   auto asStr{GetAsString(symbol)};
   while (!additionalModules.empty()) {
-    for (auto ref : UnorderedSymbolSet{std::move(additionalModules)}) {
-      if (hermeticModules.insert(*ref).second &&
-          !ref->owner().IsIntrinsicModules()) {
-        PutSymbols(DEREF(ref->scope()), &additionalModules);
-        asStr += GetAsString(*ref);
+    UnorderedSymbolSet nextPass{std::move(additionalModules)};
+    additionalModules.clear();
+    for (const Symbol &modSym : nextPass) {
+      if (!modSym.owner().IsIntrinsicModules() &&
+          hermeticModuleNames.find(modSym.name().ToString()) ==
+              hermeticModuleNames.end()) {
+        hermeticModuleNames.insert(modSym.name().ToString());
+        PutSymbols(DEREF(modSym.scope()), &additionalModules);
+        asStr += GetAsString(modSym);
       }
     }
   }
diff --git a/flang/test/Semantics/modfile77.F90 b/flang/test/Semantics/modfile77.F90
new file mode 100644
index 0000000000000..a82904ebbcc22
--- /dev/null
+++ b/flang/test/Semantics/modfile77.F90
@@ -0,0 +1,37 @@
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile77c.mod | FileCheck %s
+
+#if WHICH == 1
+module modfile77a
+  interface gen
+    procedure proc
+  end interface
+ contains
+  subroutine proc
+    print *, 'ok'
+  end
+end
+#elif WHICH == 2
+module modfile77b
+  use modfile77a
+end
+#else
+module modfile77c
+  use modfile77a
+  use modfile77b
+end
+#endif
+
+!CHECK: module modfile77c
+!CHECK: use modfile77a,only:proc
+!CHECK: use modfile77a,only:gen
+!CHECK: interface gen
+!CHECK: end interface
+!CHECK: end
+!CHECK: module modfile77a
+!CHECK: interface gen
+!CHECK: procedure::proc
+!CHECK: end interface
+!CHECK: contains
+!CHECK: subroutine proc()
+!CHECK: end
+!CHECK: end
diff --git a/flang/test/Semantics/modfile78.F90 b/flang/test/Semantics/modfile78.F90
new file mode 100644
index 0000000000000..cb3eccd9a4108
--- /dev/null
+++ b/flang/test/Semantics/modfile78.F90
@@ -0,0 +1,33 @@
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile78c.mod | FileCheck %s
+
+#if WHICH == 1
+module modfile78a
+  integer :: global_variable = 0
+end
+#elif WHICH == 2
+module modfile78b
+  use modfile78a
+ contains
+  subroutine test
+  end
+end
+#else
+module modfile78c
+  use modfile78a
+  use modfile78b
+end
+#endif
+
+!CHECK: module modfile78c
+!CHECK: use modfile78a,only:global_variable
+!CHECK: use modfile78b,only:test
+!CHECK: end
+!CHECK: module modfile78a
+!CHECK: integer(4)::global_variable
+!CHECK: end
+!CHECK: module modfile78b
+!CHECK: use modfile78a,only:global_variable
+!CHECK: contains
+!CHECK: subroutine test()
+!CHECK: end
+!CHECK: end

From e389a0e7bb3d7aabbd10b9ba8f432f292de65649 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Wed, 11 Jun 2025 20:17:35 +0000
Subject: [PATCH 146/851] [libc] Switched calls to inline_memcpy to
 __builtin_memcpy for wide char utilities (#143011)

Switched calls to inline_memcpy to __builtin_memcpy for wide char
utilities
Removed unnecessary wctype_utils dependencies from the cmake file
---
 libc/src/wchar/CMakeLists.txt | 9 ---------
 libc/src/wchar/wcscpy.cpp     | 3 +--
 libc/src/wchar/wcsncpy.cpp    | 2 --
 libc/src/wchar/wmemcpy.cpp    | 3 +--
 libc/src/wchar/wmempcpy.cpp   | 3 +--
 5 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 759f708c2247a..4b8802ede5f5d 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -43,7 +43,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.types.wchar_t
-    libc.src.__support.wctype_utils
 )
 
 add_entrypoint_object(
@@ -54,7 +53,6 @@ add_entrypoint_object(
     wcschr.h
   DEPENDS
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
 )
 
 add_entrypoint_object(
@@ -75,7 +73,6 @@ add_entrypoint_object(
     wcspbrk.h
   DEPENDS
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
     libc.src.__support.macros.null_check
 )
 
@@ -109,7 +106,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.wchar_macros
     libc.hdr.types.size_t
-    libc.src.__support.wctype_utils
 )
 
 add_entrypoint_object(
@@ -121,7 +117,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
     libc.src.__support.macros.null_check
 )
 
@@ -134,7 +129,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
 )
 
 add_entrypoint_object(
@@ -205,8 +199,6 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
-    libc.src.__support.wctype_utils
-    libc.src.string.memory_utils.inline_memcpy
 )
 
 add_entrypoint_object(
@@ -218,6 +210,5 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.types.size_t
     libc.hdr.wchar_macros
-    libc.src.string.memory_utils.inline_memcpy
     libc.src.string.string_utils
 )
diff --git a/libc/src/wchar/wcscpy.cpp b/libc/src/wchar/wcscpy.cpp
index dc46b972c59f7..01ba994cecbb2 100644
--- a/libc/src/wchar/wcscpy.cpp
+++ b/libc/src/wchar/wcscpy.cpp
@@ -12,7 +12,6 @@
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/string/memory_utils/inline_memcpy.h"
 #include "src/string/string_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -20,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(wchar_t *, wcscpy,
                    (wchar_t *__restrict s1, const wchar_t *__restrict s2)) {
   size_t size = internal::string_length(s2) + 1;
-  inline_memcpy(s1, s2, size * sizeof(wchar_t));
+  __builtin_memcpy(s1, s2, size * sizeof(wchar_t));
   return s1;
 }
 
diff --git a/libc/src/wchar/wcsncpy.cpp b/libc/src/wchar/wcsncpy.cpp
index e7ae9a4a0da79..7ad6730cd776b 100644
--- a/libc/src/wchar/wcsncpy.cpp
+++ b/libc/src/wchar/wcsncpy.cpp
@@ -12,8 +12,6 @@
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/string/memory_utils/inline_memcpy.h"
-#include "src/string/string_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/wchar/wmemcpy.cpp b/libc/src/wchar/wmemcpy.cpp
index 56708d6cee496..bf92309b20944 100644
--- a/libc/src/wchar/wmemcpy.cpp
+++ b/libc/src/wchar/wmemcpy.cpp
@@ -12,14 +12,13 @@
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(wchar_t *, wmemcpy,
                    (wchar_t *__restrict s1, const wchar_t *__restrict s2,
                     size_t n)) {
-  inline_memcpy(s1, s2, n * sizeof(wchar_t));
+  __builtin_memcpy(s1, s2, n * sizeof(wchar_t));
   return s1;
 }
 
diff --git a/libc/src/wchar/wmempcpy.cpp b/libc/src/wchar/wmempcpy.cpp
index d8b89c0a88d05..21e16210a757a 100644
--- a/libc/src/wchar/wmempcpy.cpp
+++ b/libc/src/wchar/wmempcpy.cpp
@@ -11,14 +11,13 @@
 #include "hdr/types/size_t.h"
 #include "hdr/types/wchar_t.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/inline_memcpy.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(wchar_t *, wmempcpy,
                    (wchar_t *__restrict to, const wchar_t *__restrict from,
                     size_t size)) {
-  inline_memcpy(to, from, size * sizeof(wchar_t));
+  __builtin_memcpy(to, from, size * sizeof(wchar_t));
   return reinterpret_cast<wchar_t *>(to) + size;
 }
 

From fb761aa38b0bc01ab911f5dbbfb474b70aaafbb4 Mon Sep 17 00:00:00 2001
From: Rolf Morel <rolf.morel@intel.com>
Date: Wed, 11 Jun 2025 21:19:52 +0100
Subject: [PATCH 147/851] [MLIR][Transform] apply_registered_op fixes: arg
 order & python options auto-conversion (#143779)

---
 .../mlir/Dialect/Transform/IR/TransformOps.td |  6 +++---
 .../mlir/dialects/transform/__init__.py       | 18 +++++++++++-------
 .../Transform/test-pass-application.mlir      | 19 +++++++++----------
 mlir/test/python/dialects/transform.py        | 10 +++++-----
 4 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index f75ba27e58e76..0aa750e625436 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -434,10 +434,10 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass",
     of targeted ops.
   }];
 
-  let arguments = (ins StrAttr:$pass_name,
+  let arguments = (ins TransformHandleTypeInterface:$target,
+                       StrAttr:$pass_name,
                        DefaultValuedAttr<DictionaryAttr, "{}">:$options,
-                       Variadic<TransformParamTypeInterface>:$dynamic_options,
-                       TransformHandleTypeInterface:$target);
+                       Variadic<TransformParamTypeInterface>:$dynamic_options);
   let results = (outs TransformHandleTypeInterface:$result);
   let assemblyFormat = [{
     $pass_name (`with` `options` `=`
diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py
index 10a04b0cc14e0..bfe96b1b3e5d4 100644
--- a/mlir/python/mlir/dialects/transform/__init__.py
+++ b/mlir/python/mlir/dialects/transform/__init__.py
@@ -224,13 +224,13 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
     def __init__(
         self,
         result: Type,
-        pass_name: Union[str, StringAttr],
         target: Union[Operation, Value, OpView],
+        pass_name: Union[str, StringAttr],
         *,
         options: Optional[
             Dict[
                 Union[str, StringAttr],
-                Union[Attribute, Value, Operation, OpView],
+                Union[Attribute, Value, Operation, OpView, str, int, bool],
             ]
         ] = None,
         loc=None,
@@ -253,17 +253,21 @@ def __init__(
                 cur_param_operand_idx += 1
             elif isinstance(value, Attribute):
                 options_dict[key] = value
+            # The following cases auto-convert Python values to attributes.
+            elif isinstance(value, bool):
+                options_dict[key] = BoolAttr.get(value)
+            elif isinstance(value, int):
+                default_int_type = IntegerType.get_signless(64, context)
+                options_dict[key] = IntegerAttr.get(default_int_type, value)
             elif isinstance(value, str):
                 options_dict[key] = StringAttr.get(value)
             else:
                 raise TypeError(f"Unsupported option type: {type(value)}")
-        if len(options_dict) > 0:
-            print(options_dict, cur_param_operand_idx)
         super().__init__(
             result,
+            _get_op_result_or_value(target),
             pass_name,
             dynamic_options,
-            target=_get_op_result_or_value(target),
             options=DictAttr.get(options_dict),
             loc=loc,
             ip=ip,
@@ -272,13 +276,13 @@ def __init__(
 
 def apply_registered_pass(
     result: Type,
-    pass_name: Union[str, StringAttr],
     target: Union[Operation, Value, OpView],
+    pass_name: Union[str, StringAttr],
     *,
     options: Optional[
         Dict[
             Union[str, StringAttr],
-            Union[Attribute, Value, Operation, OpView],
+            Union[Attribute, Value, Operation, OpView, str, int, bool],
         ]
     ] = None,
     loc=None,
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 6e6d4eb7e249f..1d1be9eda3496 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -157,7 +157,7 @@ module attributes {transform.with_named_sequence} {
                          "test-convergence" = true,
                          "max-num-rewrites" =  %max_rewrites }
         to %1
-        : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+        : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
     transform.yield
   }
 }
@@ -171,7 +171,6 @@ func.func @invalid_options_as_str() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param
     // expected-error @+2 {{expected '{' in options dictionary}}
     %2 = transform.apply_registered_pass "canonicalize"
         with options = "top-down=false" to %1 : (!transform.any_op) -> !transform.any_op
@@ -256,7 +255,7 @@ module attributes {transform.with_named_sequence} {
     // expected-error @+2 {{expected '{' in options dictionary}}
     transform.apply_registered_pass "canonicalize"
         with options = %pass_options to %1
-        : (!transform.any_param, !transform.any_op) -> !transform.any_op
+        : (!transform.any_op, !transform.any_param) -> !transform.any_op
     transform.yield
   }
 }
@@ -276,7 +275,7 @@ module attributes {transform.with_named_sequence} {
     // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}}
     transform.apply_registered_pass "canonicalize"
         with options = { "top-down" = %topdown_options } to %1
-        : (!transform.any_param, !transform.any_op) -> !transform.any_op
+        : (!transform.any_op, !transform.any_param) -> !transform.any_op
     transform.yield
   }
 }
@@ -316,12 +315,12 @@ module attributes {transform.with_named_sequence} {
     %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op
     %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
     // expected-error @below {{dynamic option index 1 is out of bounds for the number of dynamic options: 1}}
-    %2 = "transform.apply_registered_pass"(%1, %0) <{
+    %2 = "transform.apply_registered_pass"(%0, %1) <{
       options = {"max-iterations" = #transform.param_operand<index=1 : i64>,
                  "test-convergence" = true,
                  "top-down" = false},
       pass_name = "canonicalize"}>
-    : (!transform.any_param, !transform.any_op) -> !transform.any_op
+    : (!transform.any_op, !transform.any_param) -> !transform.any_op
     "transform.yield"() : () -> ()
   }) : () -> ()
 }) {transform.with_named_sequence} : () -> ()
@@ -340,13 +339,13 @@ module attributes {transform.with_named_sequence} {
     %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
     %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param
     // expected-error @below {{dynamic option index 0 is already used in options}}
-    %3 = "transform.apply_registered_pass"(%1, %2, %0) <{
+    %3 = "transform.apply_registered_pass"(%0, %1, %2) <{
       options = {"max-iterations" = #transform.param_operand<index=0 : i64>,
                  "max-num-rewrites" = #transform.param_operand<index=0 : i64>,
                  "test-convergence" = true,
                  "top-down" = false},
       pass_name = "canonicalize"}>
-    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
     "transform.yield"() : () -> ()
   }) : () -> ()
 }) {transform.with_named_sequence} : () -> ()
@@ -364,12 +363,12 @@ module attributes {transform.with_named_sequence} {
     %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param
     %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param
     // expected-error @below {{a param operand does not have a corresponding param_operand attr in the options dict}}
-    %3 = "transform.apply_registered_pass"(%1, %2, %0) <{
+    %3 = "transform.apply_registered_pass"(%0, %1, %2) <{
       options = {"max-iterations" = #transform.param_operand<index=0 : i64>,
                  "test-convergence" = true,
                  "top-down" = false},
       pass_name = "canonicalize"}>
-    : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
     "transform.yield"() : () -> ()
   }) : () -> ()
 }) {transform.with_named_sequence} : () -> ()
diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py
index 48bc9bad37a1e..eeb95605d7a9a 100644
--- a/mlir/test/python/dialects/transform.py
+++ b/mlir/test/python/dialects/transform.py
@@ -263,12 +263,12 @@ def testApplyRegisteredPassOp(module: Module):
     )
     with InsertionPoint(sequence.body):
         mod = transform.ApplyRegisteredPassOp(
-            transform.AnyOpType.get(), "canonicalize", sequence.bodyTarget
+            transform.AnyOpType.get(), sequence.bodyTarget, "canonicalize"
         )
         mod = transform.ApplyRegisteredPassOp(
             transform.AnyOpType.get(),
-            "canonicalize",
             mod.result,
+            "canonicalize",
             options={"top-down": BoolAttr.get(False)},
         )
         max_iter = transform.param_constant(
@@ -281,12 +281,12 @@ def testApplyRegisteredPassOp(module: Module):
         )
         transform.apply_registered_pass(
             transform.AnyOpType.get(),
-            "canonicalize",
             mod,
+            "canonicalize",
             options={
                 "top-down": BoolAttr.get(False),
                 "max-iterations": max_iter,
-                "test-convergence": BoolAttr.get(True),
+                "test-convergence": True,
                 "max-rewrites": max_rewrites,
             },
         )
@@ -305,4 +305,4 @@ def testApplyRegisteredPassOp(module: Module):
     # CHECK-SAME:                    "max-rewrites" =  %[[MAX_REWRITE]],
     # CHECK-SAME:                    "test-convergence" = true,
     # CHECK-SAME:                    "top-down" = false}
-    # CHECK-SAME:    to %{{.*}} : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
+    # CHECK-SAME:    to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op

From d87eea35fac5a34a841c637db8908128409a184e Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Wed, 11 Jun 2025 16:25:27 -0400
Subject: [PATCH 148/851] [libc] Move libc_errno.h to libc/src/__support and
 make LIBC_ERRNO_MODE_SYSTEM to be header-only. (#143187)

This is the first step in preparation for:
https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450
---
 .../modules/LLVMLibCCompileOptionRules.cmake  |   4 +
 libc/config/config.json                       |   2 +-
 libc/docs/dev/code_style.rst                  |   4 +-
 libc/shared/fp_bits.h                         |   1 +
 libc/shared/libc_common.h                     |  26 +++++
 libc/shared/rpc_server.h                      |   1 +
 libc/shared/str_to_float.h                    |   1 +
 libc/shared/str_to_integer.h                  |   1 +
 libc/src/__support/CMakeLists.txt             |   9 ++
 libc/src/__support/FPUtil/FEnvImpl.h          |   2 +-
 libc/src/__support/File/dir.cpp               |   2 +-
 libc/src/__support/File/file.cpp              |   2 +-
 libc/src/__support/File/linux/file.cpp        |   2 +-
 libc/src/__support/File/linux/lseekImpl.h     |   2 +-
 libc/src/__support/HashTable/randomness.h     |   2 +-
 libc/src/__support/OSUtil/linux/fcntl.cpp     |   2 +-
 libc/src/__support/OSUtil/linux/vdso.cpp      |   2 +-
 .../tables/linux_extension_errors.h           |   2 +-
 libc/src/__support/libc_errno.h               | 108 ++++++++++++++++++
 libc/src/__support/threads/linux/thread.cpp   |   2 +-
 libc/src/dirent/closedir.cpp                  |   2 +-
 libc/src/dirent/opendir.cpp                   |   2 +-
 libc/src/dirent/readdir.cpp                   |   2 +-
 libc/src/errno/CMakeLists.txt                 |  20 +---
 libc/src/errno/libc_errno.cpp                 |  47 +-------
 libc/src/errno/libc_errno.h                   |  47 --------
 libc/src/fcntl/linux/creat.cpp                |   2 +-
 libc/src/fcntl/linux/open.cpp                 |   2 +-
 libc/src/fcntl/linux/openat.cpp               |   2 +-
 libc/src/inttypes/strtoimax.cpp               |   2 +-
 libc/src/inttypes/strtoumax.cpp               |   2 +-
 libc/src/math/generic/exp10m1f.cpp            |   2 +-
 libc/src/math/generic/exp2m1f.cpp             |   2 +-
 libc/src/math/generic/nan.cpp                 |   2 +-
 libc/src/math/generic/nanf.cpp                |   2 +-
 libc/src/math/generic/nanf128.cpp             |   2 +-
 libc/src/math/generic/nanf16.cpp              |   2 +-
 libc/src/math/generic/nanl.cpp                |   2 +-
 libc/src/poll/linux/poll.cpp                  |   2 +-
 libc/src/pthread/pthread_atfork.cpp           |   2 +-
 .../pthread/pthread_attr_setdetachstate.cpp   |   2 +-
 .../src/pthread/pthread_attr_setguardsize.cpp |   2 +-
 libc/src/pthread/pthread_attr_setstack.cpp    |   2 +-
 .../src/pthread/pthread_attr_setstacksize.cpp |   2 +-
 .../src/pthread/pthread_condattr_setclock.cpp |   2 +-
 .../pthread/pthread_condattr_setpshared.cpp   |   2 +-
 libc/src/pthread/pthread_create.cpp           |   2 +-
 libc/src/pthread/pthread_key_create.cpp       |   2 +-
 libc/src/pthread/pthread_key_delete.cpp       |   2 +-
 .../pthread/pthread_mutexattr_setpshared.cpp  |   2 +-
 .../pthread/pthread_mutexattr_setrobust.cpp   |   2 +-
 .../src/pthread/pthread_mutexattr_settype.cpp |   2 +-
 .../pthread/pthread_rwlock_timedrdlock.cpp    |   2 +-
 libc/src/pthread/pthread_rwlock_trywrlock.cpp |   2 +-
 libc/src/pthread/pthread_rwlock_unlock.cpp    |   2 +-
 .../pthread/pthread_rwlockattr_setkind_np.cpp |   2 +-
 .../pthread/pthread_rwlockattr_setpshared.cpp |   2 +-
 libc/src/pthread/pthread_setspecific.cpp      |   2 +-
 .../sched/linux/sched_get_priority_max.cpp    |   2 +-
 .../sched/linux/sched_get_priority_min.cpp    |   2 +-
 libc/src/sched/linux/sched_getaffinity.cpp    |   2 +-
 libc/src/sched/linux/sched_getparam.cpp       |   2 +-
 libc/src/sched/linux/sched_getscheduler.cpp   |   2 +-
 .../src/sched/linux/sched_rr_get_interval.cpp |   2 +-
 libc/src/sched/linux/sched_setaffinity.cpp    |   2 +-
 libc/src/sched/linux/sched_setparam.cpp       |   2 +-
 libc/src/sched/linux/sched_setscheduler.cpp   |   2 +-
 libc/src/sched/linux/sched_yield.cpp          |   2 +-
 libc/src/search/hcreate.cpp                   |   2 +-
 libc/src/search/hcreate_r.cpp                 |   2 +-
 libc/src/search/hdestroy_r.cpp                |   2 +-
 libc/src/search/hsearch.cpp                   |   2 +-
 libc/src/search/hsearch_r.cpp                 |   2 +-
 libc/src/signal/linux/kill.cpp                |   2 +-
 libc/src/signal/linux/sigaction.cpp           |   2 +-
 libc/src/signal/linux/sigaddset.cpp           |   2 +-
 libc/src/signal/linux/sigaltstack.cpp         |   2 +-
 libc/src/signal/linux/sigdelset.cpp           |   2 +-
 libc/src/signal/linux/sigemptyset.cpp         |   2 +-
 libc/src/signal/linux/sigfillset.cpp          |   2 +-
 libc/src/signal/linux/sigprocmask.cpp         |   2 +-
 .../posix_spawn_file_actions_addclose.cpp     |   2 +-
 .../posix_spawn_file_actions_adddup2.cpp      |   2 +-
 .../posix_spawn_file_actions_addopen.cpp      |   2 +-
 .../posix_spawn_file_actions_destroy.cpp      |   2 +-
 libc/src/stdio/fopencookie.cpp                |   2 +-
 libc/src/stdio/generic/fclose.cpp             |   2 +-
 libc/src/stdio/generic/fflush.cpp             |   2 +-
 libc/src/stdio/generic/fgetc.cpp              |   2 +-
 libc/src/stdio/generic/fgetc_unlocked.cpp     |   2 +-
 libc/src/stdio/generic/fgets.cpp              |   2 +-
 libc/src/stdio/generic/fopen.cpp              |   2 +-
 libc/src/stdio/generic/fputc.cpp              |   2 +-
 libc/src/stdio/generic/fputs.cpp              |   2 +-
 libc/src/stdio/generic/fread.cpp              |   2 +-
 libc/src/stdio/generic/fread_unlocked.cpp     |   2 +-
 libc/src/stdio/generic/fseek.cpp              |   2 +-
 libc/src/stdio/generic/fseeko.cpp             |   2 +-
 libc/src/stdio/generic/ftell.cpp              |   2 +-
 libc/src/stdio/generic/ftello.cpp             |   2 +-
 libc/src/stdio/generic/fwrite.cpp             |   2 +-
 libc/src/stdio/generic/fwrite_unlocked.cpp    |   2 +-
 libc/src/stdio/generic/getc.cpp               |   2 +-
 libc/src/stdio/generic/getc_unlocked.cpp      |   2 +-
 libc/src/stdio/generic/getchar.cpp            |   2 +-
 libc/src/stdio/generic/getchar_unlocked.cpp   |   2 +-
 libc/src/stdio/generic/putc.cpp               |   2 +-
 libc/src/stdio/generic/putchar.cpp            |   2 +-
 libc/src/stdio/generic/puts.cpp               |   2 +-
 libc/src/stdio/gpu/fprintf.cpp                |   2 +-
 libc/src/stdio/gpu/printf.cpp                 |   2 +-
 libc/src/stdio/linux/fdopen.cpp               |   2 +-
 libc/src/stdio/linux/remove.cpp               |   2 +-
 libc/src/stdio/linux/rename.cpp               |   2 +-
 libc/src/stdio/printf_core/parser.h           |   2 +-
 libc/src/stdio/setbuf.cpp                     |   2 +-
 libc/src/stdio/setvbuf.cpp                    |   2 +-
 libc/src/stdlib/atof.cpp                      |   2 +-
 libc/src/stdlib/atoi.cpp                      |   2 +-
 libc/src/stdlib/atol.cpp                      |   2 +-
 libc/src/stdlib/atoll.cpp                     |   2 +-
 libc/src/stdlib/strtod.cpp                    |   2 +-
 libc/src/stdlib/strtod_l.cpp                  |   2 +-
 libc/src/stdlib/strtof.cpp                    |   2 +-
 libc/src/stdlib/strtof_l.cpp                  |   2 +-
 libc/src/stdlib/strtol.cpp                    |   2 +-
 libc/src/stdlib/strtol_l.cpp                  |   2 +-
 libc/src/stdlib/strtold.cpp                   |   2 +-
 libc/src/stdlib/strtold_l.cpp                 |   2 +-
 libc/src/stdlib/strtoll.cpp                   |   2 +-
 libc/src/stdlib/strtoll_l.cpp                 |   2 +-
 libc/src/stdlib/strtoul.cpp                   |   2 +-
 libc/src/stdlib/strtoul_l.cpp                 |   2 +-
 libc/src/stdlib/strtoull.cpp                  |   2 +-
 libc/src/stdlib/strtoull_l.cpp                |   2 +-
 libc/src/string/strdup.cpp                    |   2 +-
 libc/src/sys/auxv/linux/getauxval.cpp         |   2 +-
 libc/src/sys/epoll/linux/epoll_create.cpp     |   2 +-
 libc/src/sys/epoll/linux/epoll_create1.cpp    |   2 +-
 libc/src/sys/epoll/linux/epoll_ctl.cpp        |   2 +-
 libc/src/sys/epoll/linux/epoll_pwait.cpp      |   2 +-
 libc/src/sys/epoll/linux/epoll_pwait2.cpp     |   2 +-
 libc/src/sys/epoll/linux/epoll_wait.cpp       |   2 +-
 libc/src/sys/mman/linux/madvise.cpp           |   2 +-
 libc/src/sys/mman/linux/mincore.cpp           |   2 +-
 libc/src/sys/mman/linux/mlock.cpp             |   2 +-
 libc/src/sys/mman/linux/mlock2.cpp            |   2 +-
 libc/src/sys/mman/linux/mlockall.cpp          |   2 +-
 libc/src/sys/mman/linux/mmap.cpp              |   2 +-
 libc/src/sys/mman/linux/mprotect.cpp          |   2 +-
 libc/src/sys/mman/linux/mremap.cpp            |   2 +-
 libc/src/sys/mman/linux/msync.cpp             |   2 +-
 libc/src/sys/mman/linux/munlock.cpp           |   2 +-
 libc/src/sys/mman/linux/munlockall.cpp        |   2 +-
 libc/src/sys/mman/linux/munmap.cpp            |   4 +-
 libc/src/sys/mman/linux/remap_file_pages.cpp  |   2 +-
 libc/src/sys/mman/linux/shm_common.h          |   2 +-
 libc/src/sys/prctl/linux/prctl.cpp            |   2 +-
 libc/src/sys/random/linux/getrandom.cpp       |   2 +-
 libc/src/sys/resource/linux/getrlimit.cpp     |   2 +-
 libc/src/sys/resource/linux/setrlimit.cpp     |   2 +-
 libc/src/sys/select/linux/select.cpp          |   2 +-
 libc/src/sys/sendfile/linux/sendfile.cpp      |   2 +-
 libc/src/sys/socket/linux/bind.cpp            |   2 +-
 libc/src/sys/socket/linux/recv.cpp            |   2 +-
 libc/src/sys/socket/linux/recvfrom.cpp        |   2 +-
 libc/src/sys/socket/linux/recvmsg.cpp         |   2 +-
 libc/src/sys/socket/linux/send.cpp            |   2 +-
 libc/src/sys/socket/linux/sendmsg.cpp         |   2 +-
 libc/src/sys/socket/linux/sendto.cpp          |   2 +-
 libc/src/sys/socket/linux/socket.cpp          |   2 +-
 libc/src/sys/socket/linux/socketpair.cpp      |   2 +-
 libc/src/sys/stat/linux/chmod.cpp             |   2 +-
 libc/src/sys/stat/linux/fchmod.cpp            |   2 +-
 libc/src/sys/stat/linux/fchmodat.cpp          |   2 +-
 libc/src/sys/stat/linux/fstat.cpp             |   2 +-
 libc/src/sys/stat/linux/lstat.cpp             |   2 +-
 libc/src/sys/stat/linux/mkdir.cpp             |   2 +-
 libc/src/sys/stat/linux/mkdirat.cpp           |   2 +-
 libc/src/sys/stat/linux/stat.cpp              |   2 +-
 libc/src/sys/statvfs/linux/statfs_utils.h     |   2 +-
 libc/src/sys/time/linux/getitimer.cpp         |   2 +-
 libc/src/sys/time/linux/setitimer.cpp         |   2 +-
 libc/src/sys/time/linux/utimes.cpp            |   2 +-
 libc/src/sys/uio/linux/readv.cpp              |   2 +-
 libc/src/sys/uio/linux/writev.cpp             |   2 +-
 libc/src/sys/utsname/linux/uname.cpp          |   2 +-
 libc/src/sys/wait/wait4Impl.h                 |   2 +-
 libc/src/termios/linux/cfsetispeed.cpp        |   2 +-
 libc/src/termios/linux/cfsetospeed.cpp        |   2 +-
 libc/src/termios/linux/tcdrain.cpp            |   2 +-
 libc/src/termios/linux/tcflow.cpp             |   2 +-
 libc/src/termios/linux/tcflush.cpp            |   2 +-
 libc/src/termios/linux/tcgetattr.cpp          |   2 +-
 libc/src/termios/linux/tcgetsid.cpp           |   2 +-
 libc/src/termios/linux/tcsendbreak.cpp        |   2 +-
 libc/src/termios/linux/tcsetattr.cpp          |   2 +-
 libc/src/threads/thrd_create.cpp              |   2 +-
 libc/src/time/linux/clock.cpp                 |   2 +-
 libc/src/time/linux/clock_gettime.cpp         |   2 +-
 libc/src/time/linux/gettimeofday.cpp          |   2 +-
 libc/src/time/linux/nanosleep.cpp             |   2 +-
 libc/src/time/linux/timespec_get.cpp          |   2 +-
 libc/src/time/time.cpp                        |   2 +-
 libc/src/time/time_utils.h                    |   2 +-
 libc/src/time/windows/clock_getres.cpp        |   2 +-
 libc/src/unistd/linux/access.cpp              |   2 +-
 libc/src/unistd/linux/chdir.cpp               |   2 +-
 libc/src/unistd/linux/close.cpp               |   2 +-
 libc/src/unistd/linux/dup.cpp                 |   2 +-
 libc/src/unistd/linux/dup2.cpp                |   2 +-
 libc/src/unistd/linux/dup3.cpp                |   2 +-
 libc/src/unistd/linux/execv.cpp               |   2 +-
 libc/src/unistd/linux/execve.cpp              |   2 +-
 libc/src/unistd/linux/fchdir.cpp              |   2 +-
 libc/src/unistd/linux/fork.cpp                |   2 +-
 libc/src/unistd/linux/fsync.cpp               |   2 +-
 libc/src/unistd/linux/ftruncate.cpp           |   2 +-
 libc/src/unistd/linux/getcwd.cpp              |   2 +-
 libc/src/unistd/linux/getentropy.cpp          |   2 +-
 libc/src/unistd/linux/getsid.cpp              |   2 +-
 libc/src/unistd/linux/isatty.cpp              |   2 +-
 libc/src/unistd/linux/link.cpp                |   2 +-
 libc/src/unistd/linux/linkat.cpp              |   2 +-
 libc/src/unistd/linux/lseek.cpp               |   2 +-
 libc/src/unistd/linux/pathconf.cpp            |   2 +-
 libc/src/unistd/linux/pathconf_utils.cpp      |   2 +-
 libc/src/unistd/linux/pipe.cpp                |   4 +-
 libc/src/unistd/linux/pipe2.cpp               |   2 +-
 libc/src/unistd/linux/pread.cpp               |   6 +-
 libc/src/unistd/linux/pwrite.cpp              |   2 +-
 libc/src/unistd/linux/read.cpp                |   4 +-
 libc/src/unistd/linux/readlink.cpp            |   2 +-
 libc/src/unistd/linux/readlinkat.cpp          |   2 +-
 libc/src/unistd/linux/rmdir.cpp               |   2 +-
 libc/src/unistd/linux/symlink.cpp             |   2 +-
 libc/src/unistd/linux/symlinkat.cpp           |   2 +-
 libc/src/unistd/linux/syscall.cpp             |   2 +-
 libc/src/unistd/linux/sysconf.cpp             |   2 +-
 libc/src/unistd/linux/truncate.cpp            |   2 +-
 libc/src/unistd/linux/unlink.cpp              |   2 +-
 libc/src/unistd/linux/unlinkat.cpp            |   2 +-
 libc/src/unistd/linux/write.cpp               |   2 +-
 libc/src/unistd/windows/getentropy.cpp        |   2 +-
 libc/test/IntegrationTest/test.h              |   9 +-
 libc/test/UnitTest/ErrnoCheckingTest.h        |   4 +-
 libc/test/UnitTest/ErrnoSetterMatcher.h       |   6 +-
 libc/test/UnitTest/FPMatcher.h                |   8 +-
 libc/test/UnitTest/Test.h                     |  11 +-
 .../src/pthread/pthread_create_test.cpp       |   4 +-
 .../src/pthread/pthread_join_test.cpp         |   4 +-
 .../src/pthread/pthread_name_test.cpp         |   2 +-
 .../integration/src/unistd/getcwd_test.cpp    |   6 +-
 .../integration/startup/linux/tls_test.cpp    |   2 +-
 libc/test/src/__support/str_to_fp_test.h      |   1 +
 .../src/__support/str_to_integer_test.cpp     |   1 +
 libc/test/src/dirent/dirent_test.cpp          |  10 +-
 libc/test/src/errno/errno_test.cpp            |   4 +-
 libc/test/src/fcntl/creat_test.cpp            |   2 +-
 libc/test/src/fcntl/fcntl_test.cpp            |   4 +-
 libc/test/src/fcntl/openat_test.cpp           |   2 +-
 libc/test/src/math/RoundToIntegerTest.h       |   2 +-
 libc/test/src/math/acosf_test.cpp             |   4 +-
 libc/test/src/math/acoshf16_test.cpp          |   2 +-
 libc/test/src/math/acoshf_test.cpp            |   4 +-
 libc/test/src/math/asin_test.cpp              |   2 +-
 libc/test/src/math/asinf_test.cpp             |   4 +-
 libc/test/src/math/asinhf_test.cpp            |   4 +-
 libc/test/src/math/atan2f_test.cpp            |   2 +-
 libc/test/src/math/atan_test.cpp              |   2 +-
 libc/test/src/math/atanf_test.cpp             |   4 +-
 libc/test/src/math/atanhf_test.cpp            |   4 +-
 libc/test/src/math/cosf_test.cpp              |   4 +-
 libc/test/src/math/coshf_test.cpp             |   6 +-
 libc/test/src/math/cospif_test.cpp            |   4 +-
 libc/test/src/math/exp10_test.cpp             |   4 +-
 libc/test/src/math/exp10f_test.cpp            |  15 ++-
 libc/test/src/math/exp10m1f_test.cpp          |   8 +-
 libc/test/src/math/exp2_test.cpp              |   4 +-
 libc/test/src/math/exp2f_test.cpp             |  15 ++-
 libc/test/src/math/exp2m1f_test.cpp           |   9 +-
 libc/test/src/math/exp_test.cpp               |   4 +-
 libc/test/src/math/expf_test.cpp              |  15 ++-
 libc/test/src/math/expm1_test.cpp             |   4 +-
 libc/test/src/math/expm1f_test.cpp            |  15 ++-
 libc/test/src/math/log10_test.cpp             |   4 +-
 libc/test/src/math/log1p_test.cpp             |   4 +-
 libc/test/src/math/log1pf_test.cpp            |   4 +-
 libc/test/src/math/log2_test.cpp              |   4 +-
 libc/test/src/math/log2f_test.cpp             |   7 +-
 libc/test/src/math/log_test.cpp               |   4 +-
 libc/test/src/math/powf_test.cpp              |   2 +-
 libc/test/src/math/sin_test.cpp               |   2 +-
 libc/test/src/math/sincosf_test.cpp           |   4 +-
 libc/test/src/math/sinf_test.cpp              |   4 +-
 libc/test/src/math/sinhf_test.cpp             |   6 +-
 libc/test/src/math/sinpif_test.cpp            |   4 +-
 libc/test/src/math/smoke/FModTest.h           |   2 +-
 libc/test/src/math/smoke/RoundToIntegerTest.h |   2 +-
 libc/test/src/math/smoke/acos_test.cpp        |   4 +-
 libc/test/src/math/smoke/acosf16_test.cpp     |   4 +-
 libc/test/src/math/smoke/acosf_test.cpp       |   4 +-
 libc/test/src/math/smoke/acoshf16_test.cpp    |   4 +-
 libc/test/src/math/smoke/acoshf_test.cpp      |   4 +-
 libc/test/src/math/smoke/acospif16_test.cpp   |   4 +-
 libc/test/src/math/smoke/asinf16_test.cpp     |   4 +-
 libc/test/src/math/smoke/asinf_test.cpp       |   4 +-
 libc/test/src/math/smoke/asinhf16_test.cpp    |   4 +-
 libc/test/src/math/smoke/asinhf_test.cpp      |   4 +-
 libc/test/src/math/smoke/atan2f_test.cpp      |   4 +-
 libc/test/src/math/smoke/atanf16_test.cpp     |   4 +-
 libc/test/src/math/smoke/atanf_test.cpp       |   4 +-
 libc/test/src/math/smoke/atanhf16_test.cpp    |   4 +-
 libc/test/src/math/smoke/atanhf_test.cpp      |   4 +-
 libc/test/src/math/smoke/cosf16_test.cpp      |   4 +-
 libc/test/src/math/smoke/cosf_test.cpp        |   4 +-
 libc/test/src/math/smoke/coshf16_test.cpp     |   6 +-
 libc/test/src/math/smoke/coshf_test.cpp       |   6 +-
 libc/test/src/math/smoke/cospif16_test.cpp    |   4 +-
 libc/test/src/math/smoke/cospif_test.cpp      |   4 +-
 libc/test/src/math/smoke/exp10_test.cpp       |   2 +-
 libc/test/src/math/smoke/exp10f16_test.cpp    |   8 +-
 libc/test/src/math/smoke/exp10f_test.cpp      |   6 +-
 libc/test/src/math/smoke/exp10m1f16_test.cpp  |   8 +-
 libc/test/src/math/smoke/exp10m1f_test.cpp    |   8 +-
 libc/test/src/math/smoke/exp2_test.cpp        |   2 +-
 libc/test/src/math/smoke/exp2f16_test.cpp     |   8 +-
 libc/test/src/math/smoke/exp2f_test.cpp       |   6 +-
 libc/test/src/math/smoke/exp2m1f16_test.cpp   |   8 +-
 libc/test/src/math/smoke/exp2m1f_test.cpp     |   8 +-
 libc/test/src/math/smoke/exp_test.cpp         |   2 +-
 libc/test/src/math/smoke/expf16_test.cpp      |   8 +-
 libc/test/src/math/smoke/expf_test.cpp        |   6 +-
 libc/test/src/math/smoke/expm1_test.cpp       |   2 +-
 libc/test/src/math/smoke/expm1f16_test.cpp    |   8 +-
 libc/test/src/math/smoke/expm1f_test.cpp      |   6 +-
 libc/test/src/math/smoke/log10_test.cpp       |   2 +-
 libc/test/src/math/smoke/log10f16_test.cpp    |   4 +-
 libc/test/src/math/smoke/log1p_test.cpp       |   2 +-
 libc/test/src/math/smoke/log1pf_test.cpp      |   2 +-
 libc/test/src/math/smoke/log2_test.cpp        |   2 +-
 libc/test/src/math/smoke/log2f16_test.cpp     |   4 +-
 libc/test/src/math/smoke/log2f_test.cpp       |   2 +-
 libc/test/src/math/smoke/log_test.cpp         |   2 +-
 libc/test/src/math/smoke/logf16_test.cpp      |   4 +-
 libc/test/src/math/smoke/sincosf_test.cpp     |   4 +-
 libc/test/src/math/smoke/sinf16_test.cpp      |   4 +-
 libc/test/src/math/smoke/sinf_test.cpp        |   4 +-
 libc/test/src/math/smoke/sinhf16_test.cpp     |   6 +-
 libc/test/src/math/smoke/sinhf_test.cpp       |   6 +-
 libc/test/src/math/smoke/sinpif16_test.cpp    |   4 +-
 libc/test/src/math/smoke/sinpif_test.cpp      |   4 +-
 libc/test/src/math/smoke/tanf16_test.cpp      |   4 +-
 libc/test/src/math/smoke/tanf_test.cpp        |   4 +-
 libc/test/src/math/smoke/tanhf16_test.cpp     |   6 +-
 libc/test/src/math/smoke/tanhf_test.cpp       |   4 +-
 libc/test/src/math/smoke/tanpif16_test.cpp    |   4 +-
 libc/test/src/math/tanf_test.cpp              |   4 +-
 libc/test/src/math/tanhf_test.cpp             |   4 +-
 libc/test/src/poll/poll_test.cpp              |   6 +-
 libc/test/src/sched/affinity_test.cpp         |  10 +-
 libc/test/src/sched/cpu_count_test.cpp        |   4 +-
 libc/test/src/sched/get_priority_test.cpp     |   4 +-
 .../src/sched/param_and_scheduler_test.cpp    |  49 ++++----
 .../src/sched/sched_rr_get_interval_test.cpp  |  10 +-
 libc/test/src/sched/yield_test.cpp            |   4 +-
 libc/test/src/signal/sigaltstack_test.cpp     |   4 +-
 libc/test/src/signal/signal_test.cpp          |   4 +-
 libc/test/src/signal/sigprocmask_test.cpp     |   4 +-
 .../spawn/posix_spawn_file_actions_test.cpp   |   2 +-
 libc/test/src/stdio/fdopen_test.cpp           |  10 +-
 libc/test/src/stdio/fgetc_test.cpp            |   4 +-
 libc/test/src/stdio/fgetc_unlocked_test.cpp   |   4 +-
 libc/test/src/stdio/fgets_test.cpp            |   4 +-
 libc/test/src/stdio/fileop_test.cpp           |  24 ++--
 libc/test/src/stdio/fopencookie_test.cpp      |  10 +-
 libc/test/src/stdio/remove_test.cpp           |   6 +-
 libc/test/src/stdio/rename_test.cpp           |   4 +-
 libc/test/src/stdio/setvbuf_test.cpp          |   4 +-
 libc/test/src/stdio/sprintf_test.cpp          |  76 ++++++------
 libc/test/src/stdio/unlocked_fileop_test.cpp  |   6 +-
 libc/test/src/stdlib/StrtolTest.h             |   1 +
 libc/test/src/stdlib/strtoint32_test.cpp      |   6 +-
 libc/test/src/stdlib/strtoint64_test.cpp      |   6 +-
 libc/test/src/stdlib/strtold_test.cpp         |   1 +
 libc/test/src/sys/mman/linux/mlock_test.cpp   |  17 ++-
 .../src/sys/statvfs/linux/fstatvfs_test.cpp   |   4 +-
 .../src/sys/statvfs/linux/statvfs_test.cpp    |   4 +-
 libc/test/src/sys/time/setitimer_test.cpp     |   2 +-
 libc/test/src/termios/termios_test.cpp        |  12 +-
 libc/test/src/time/asctime_r_test.cpp         |   2 +-
 libc/test/src/time/asctime_test.cpp           |   2 +-
 libc/test/src/time/ctime_r_test.cpp           |   2 +-
 libc/test/src/time/ctime_test.cpp             |   2 +-
 libc/test/src/time/gmtime_test.cpp            |   4 +-
 libc/test/src/time/nanosleep_test.cpp         |   4 +-
 .../llvm-project-overlay/libc/BUILD.bazel     |   3 +-
 397 files changed, 829 insertions(+), 783 deletions(-)
 create mode 100644 libc/shared/libc_common.h
 create mode 100644 libc/src/__support/libc_errno.h
 delete mode 100644 libc/src/errno/libc_errno.h

diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 0facb0b9be0c1..a98e7276bef80 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -106,6 +106,10 @@ function(_get_compile_options_from_config output_var)
     list(APPEND config_options "-DLIBC_MATH=${LIBC_CONF_MATH_OPTIMIZATIONS}")
   endif()
 
+  if(LIBC_CONF_ERRNO_MODE)
+    set(APPEND config_options "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}")
+  endif()
+
   set(${output_var} ${config_options} PARENT_SCOPE)
 endfunction(_get_compile_options_from_config)
 
diff --git a/libc/config/config.json b/libc/config/config.json
index bfe956855cb52..d53b2936edb07 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -2,7 +2,7 @@
   "errno": {
     "LIBC_CONF_ERRNO_MODE": {
       "value": "LIBC_ERRNO_MODE_DEFAULT",
-      "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM."
+      "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE."
     }
   },
   "printf": {
diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst
index 0bd3a69ae3ffe..86247966552f9 100644
--- a/libc/docs/dev/code_style.rst
+++ b/libc/docs/dev/code_style.rst
@@ -101,7 +101,7 @@ test infrastructure itself can be affected. To avoid perturbing the unit test
 infrastructure around the setting of ``errno``, the following rules are to be
 followed:
 
-#. A special macro named ``libc_errno`` defined in ``src/errno/libc_errno.h``
+#. A special macro named ``libc_errno`` defined in ``src/__support/libc_errno.h``
    should be used when setting ``errno`` from libc runtime code. For example,
    code to set ``errno`` to ``EINVAL`` should be:
 
@@ -117,7 +117,7 @@ followed:
    `ErrorOr <https://github.com/llvm/llvm-project/blob/main/libc/src/__support/error_or.h>`_
    to return error values.
 
-#. The header file ``src/errno/libc_errno.h`` is shipped as part of the target
+#. The header file ``src/__support/libc_errno.h`` is shipped as part of the target
    corresponding to the ``errno`` entrypoint ``libc.src.errno.errno``. We do
    not in general allow dependencies between entrypoints. However, the ``errno``
    entrypoint is the only exceptional entrypoint on which other entrypoints
diff --git a/libc/shared/fp_bits.h b/libc/shared/fp_bits.h
index 2898c508b7772..e6bb1e17b80c9 100644
--- a/libc/shared/fp_bits.h
+++ b/libc/shared/fp_bits.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SHARED_FP_BITS_H
 #define LLVM_LIBC_SHARED_FP_BITS_H
 
+#include "libc_common.h"
 #include "src/__support/FPUtil/FPBits.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/shared/libc_common.h b/libc/shared/libc_common.h
new file mode 100644
index 0000000000000..c4560bbb02763
--- /dev/null
+++ b/libc/shared/libc_common.h
@@ -0,0 +1,26 @@
+//===-- Common defines for sharing LLVM libc with LLVM projects -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_LIBC_COMMON_H
+#define LLVM_LIBC_SHARED_LIBC_COMMON_H
+
+// Use system errno.
+#ifdef LIBC_ERRNO_MODE
+#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+#error                                                                         \
+    "LIBC_ERRNO_MODE was set to something different from LIBC_ERRNO_MODE_SYSTEM_INLINE."
+#endif // LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+#else
+#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM_INLINE
+#endif // LIBC_ERRNO_MODE
+
+#ifndef LIBC_NAMESPACE
+#define LIBC_NAMESPACE __llvm_libc
+#endif // LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SHARED_LIBC_COMMON_H
diff --git a/libc/shared/rpc_server.h b/libc/shared/rpc_server.h
index 5509094b944ad..46e35f13f0eac 100644
--- a/libc/shared/rpc_server.h
+++ b/libc/shared/rpc_server.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SHARED_RPC_SERVER_H
 #define LLVM_LIBC_SHARED_RPC_SERVER_H
 
+#include "libc_common.h"
 #include "src/__support/RPC/rpc_server.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/shared/str_to_float.h b/libc/shared/str_to_float.h
index b133a28e26efc..dcc6027d6c77f 100644
--- a/libc/shared/str_to_float.h
+++ b/libc/shared/str_to_float.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SHARED_STR_TO_FLOAT_H
 #define LLVM_LIBC_SHARED_STR_TO_FLOAT_H
 
+#include "libc_common.h"
 #include "src/__support/str_to_float.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/shared/str_to_integer.h b/libc/shared/str_to_integer.h
index 15bee698d5a6b..6ed38c932662e 100644
--- a/libc/shared/str_to_integer.h
+++ b/libc/shared/str_to_integer.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SHARED_STR_TO_INTEGER_H
 #define LLVM_LIBC_SHARED_STR_TO_INTEGER_H
 
+#include "libc_common.h"
 #include "src/__support/str_to_integer.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index f92499fdbf451..327ff5e0c6a37 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -1,6 +1,15 @@
 add_subdirectory(CPP)
 add_subdirectory(macros)
 
+add_header_library(
+  libc_errno
+  HDRS
+    libc_errno.h
+  DEPENDS
+    libc.hdr.errno_macros
+    libc.src.__support.macros.config
+)
+
 add_header_library(
   block
   HDRS
diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h
index 4c8f34a435bdf..50a101f833c55 100644
--- a/libc/src/__support/FPUtil/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/FEnvImpl.h
@@ -12,10 +12,10 @@
 #include "hdr/fenv_macros.h"
 #include "hdr/math_macros.h"
 #include "hdr/types/fenv_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
-#include "src/errno/libc_errno.h"
 
 #if defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP)
 #if defined(__APPLE__)
diff --git a/libc/src/__support/File/dir.cpp b/libc/src/__support/File/dir.cpp
index 21b0106f70106..aea8862c15f7f 100644
--- a/libc/src/__support/File/dir.cpp
+++ b/libc/src/__support/File/dir.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/CPP/mutex.h" // lock_guard
 #include "src/__support/CPP/new.h"
 #include "src/__support/error_or.h"
+#include "src/__support/libc_errno.h" // For error macros
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h" // For error macros
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp
index 528542cccf324..303852dbbb717 100644
--- a/libc/src/__support/File/file.cpp
+++ b/libc/src/__support/File/file.cpp
@@ -13,8 +13,8 @@
 #include "hdr/types/off_t.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/CPP/span.h"
+#include "src/__support/libc_errno.h" // For error macros
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h" // For error macros
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp
index 824c1f200e8c5..761e352f74ead 100644
--- a/libc/src/__support/File/linux/file.cpp
+++ b/libc/src/__support/File/linux/file.cpp
@@ -15,8 +15,8 @@
 #include "src/__support/File/linux/lseekImpl.h"
 #include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/libc_errno.h"     // For error macros
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h" // For error macros
 
 #include "hdr/fcntl_macros.h" // For mode_t and other flags to the open syscall
 #include <sys/stat.h>    // For S_IS*, S_IF*, and S_IR* flags.
diff --git a/libc/src/__support/File/linux/lseekImpl.h b/libc/src/__support/File/linux/lseekImpl.h
index a034913d9f6ec..300e5c5dd55bf 100644
--- a/libc/src/__support/File/linux/lseekImpl.h
+++ b/libc/src/__support/File/linux/lseekImpl.h
@@ -13,8 +13,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <stdint.h>      // For uint64_t.
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/__support/HashTable/randomness.h b/libc/src/__support/HashTable/randomness.h
index 244dd41be3eec..6b58a4125f785 100644
--- a/libc/src/__support/HashTable/randomness.h
+++ b/libc/src/__support/HashTable/randomness.h
@@ -14,7 +14,7 @@
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #if defined(LIBC_HASHTABLE_USE_GETRANDOM)
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sys/random/getrandom.h"
 #endif
 
diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index 4742b2a00220b..99e16ad58c918 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -15,8 +15,8 @@
 #include "hdr/types/struct_flock64.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/__support/OSUtil/linux/vdso.cpp b/libc/src/__support/OSUtil/linux/vdso.cpp
index 8c9bd3e1bcc72..e4e53c3c2a0f2 100644
--- a/libc/src/__support/OSUtil/linux/vdso.cpp
+++ b/libc/src/__support/OSUtil/linux/vdso.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/threads/callonce.h"
 #include "src/__support/threads/linux/futex_word.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/auxv/getauxval.h"
 #include <linux/auxvec.h>
 
diff --git a/libc/src/__support/StringUtil/tables/linux_extension_errors.h b/libc/src/__support/StringUtil/tables/linux_extension_errors.h
index 425590f6e91c9..de637d60bea97 100644
--- a/libc/src/__support/StringUtil/tables/linux_extension_errors.h
+++ b/libc/src/__support/StringUtil/tables/linux_extension_errors.h
@@ -10,8 +10,8 @@
 #define LLVM_LIBC_SRC___SUPPORT_STRINGUTIL_TABLES_LINUX_EXTENSION_ERRORS_H
 
 #include "src/__support/StringUtil/message_mapper.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/__support/libc_errno.h b/libc/src/__support/libc_errno.h
new file mode 100644
index 0000000000000..ab5f6a9c4b9d9
--- /dev/null
+++ b/libc/src/__support/libc_errno.h
@@ -0,0 +1,108 @@
+//===-- Implementation header for libc_errno --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H
+#define LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H
+
+// This header is to be consumed by internal implementations, in which all of
+// them should refer to `libc_errno` instead of using `errno` directly from
+// <errno.h> header.
+
+// Unit and hermetic tests should:
+// - #include "src/__support/libc_errno.h"
+// - NOT #include <errno.h>
+// - Only use `libc_errno` in the code
+// - Depend on libc.src.errno.errno
+
+// Integration tests should:
+// - NOT #include "src/__support/libc_errno.h"
+// - #include <errno.h>
+// - Use regular `errno` in the code
+// - Still depend on libc.src.errno.errno
+
+// libc uses a fallback default value, either system or thread local.
+#define LIBC_ERRNO_MODE_DEFAULT 0
+// libc never stores a value; `errno` macro uses get link-time failure.
+#define LIBC_ERRNO_MODE_UNDEFINED 1
+// libc maintains per-thread state (requires C++ `thread_local` support).
+#define LIBC_ERRNO_MODE_THREAD_LOCAL 2
+// libc maintains shared state used by all threads, contrary to standard C
+// semantics unless always single-threaded; nothing prevents data races.
+#define LIBC_ERRNO_MODE_SHARED 3
+// libc doesn't maintain any internal state, instead the embedder must define
+// `int *__llvm_libc_errno(void);` C function.
+#define LIBC_ERRNO_MODE_EXTERNAL 4
+// libc uses system `<errno.h>` `errno` macro directly in the overlay mode; in
+// fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`.
+// In this mode, the public C++ symbol `LIBC_NAMESPACE::libc_errno ` is still
+// exported and get redirected to the system `errno` inside its implementation.
+
+// TODO: Investigate deprecating LIBC_ERRNO_MODE_SYSTEM in favor of
+//       LIBC_ERRNO_MODE_SYSTEM_INLINE.
+//       https://github.com/llvm/llvm-project/issues/143454
+#define LIBC_ERRNO_MODE_SYSTEM 5
+// In this mode, the libc_errno is simply a macro resolved to `errno` from the
+// system header <errno.h>.  There is no need to link against the
+// `libc.src.errno.errno` object.
+#define LIBC_ERRNO_MODE_SYSTEM_INLINE 6
+
+#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT
+#undef LIBC_ERRNO_MODE
+#if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING)
+#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL
+#else
+#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM
+#endif
+#endif // LIBC_ERRNO_MODE
+
+#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT &&                              \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED &&                            \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL &&                         \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED &&                               \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL &&                             \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM &&                               \
+    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+#error LIBC_ERRNO_MODE must be one of the following values: \
+LIBC_ERRNO_MODE_DEFAULT, \
+LIBC_ERRNO_MODE_UNDEFINED, \
+LIBC_ERRNO_MODE_THREAD_LOCAL, \
+LIBC_ERRNO_MODE_SHARED, \
+LIBC_ERRNO_MODE_EXTERNAL, \
+LIBC_ERRNO_MODE_SYSTEM, \
+LIBC_ERRNO_MODE_SYSTEM_INLINE.
+#endif
+
+#if LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_SYSTEM_INLINE
+
+#include <errno.h>
+
+#define libc_errno errno
+
+#else // !LIBC_ERRNO_MODE_SYSTEM_INLINE
+
+#include "hdr/errno_macros.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+extern "C" int *__llvm_libc_errno() noexcept;
+
+struct Errno {
+  void operator=(int);
+  operator int();
+};
+
+extern Errno libc_errno;
+
+} // namespace LIBC_NAMESPACE_DECL
+
+using LIBC_NAMESPACE::libc_errno;
+
+#endif // LIBC_ERRNO_MODE_SYSTEM_INLINE
+
+#endif // LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H
diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp
index c531d74c53355..baad26aed6851 100644
--- a/libc/src/__support/threads/linux/thread.cpp
+++ b/libc/src/__support/threads/linux/thread.cpp
@@ -14,9 +14,9 @@
 #include "src/__support/OSUtil/syscall.h" // For syscall functions.
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
+#include "src/__support/libc_errno.h" // For error macros
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/linux/futex_utils.h" // For FutexWordType
-#include "src/errno/libc_errno.h"                    // For error macros
 
 #ifdef LIBC_TARGET_ARCH_IS_AARCH64
 #include <arm_acle.h>
diff --git a/libc/src/dirent/closedir.cpp b/libc/src/dirent/closedir.cpp
index 1249ef94cf411..2f8f6f0c044db 100644
--- a/libc/src/dirent/closedir.cpp
+++ b/libc/src/dirent/closedir.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/File/dir.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <dirent.h>
 
diff --git a/libc/src/dirent/opendir.cpp b/libc/src/dirent/opendir.cpp
index fee14ef0f558d..bf47d0edac180 100644
--- a/libc/src/dirent/opendir.cpp
+++ b/libc/src/dirent/opendir.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/File/dir.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <dirent.h>
 
diff --git a/libc/src/dirent/readdir.cpp b/libc/src/dirent/readdir.cpp
index ad460b5e80b8b..f95f7c1ae8646 100644
--- a/libc/src/dirent/readdir.cpp
+++ b/libc/src/dirent/readdir.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/File/dir.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <dirent.h>
 
diff --git a/libc/src/errno/CMakeLists.txt b/libc/src/errno/CMakeLists.txt
index 1d78a5eedff96..2852044e94164 100644
--- a/libc/src/errno/CMakeLists.txt
+++ b/libc/src/errno/CMakeLists.txt
@@ -1,28 +1,16 @@
 # If we are in full build mode, we will provide the errno definition ourselves,
 # and if we are in overlay mode, we will just re-use the system's errno.
-# We are passing LIBC_FULL_BUILD flag in full build mode so that the
-# implementation of libc_errno will know if we are in full build mode or not.
-
-# TODO: Move LIBC_FULL_BUILD flag to _get_common_compile_options.
-set(full_build_flag "")
-if(LLVM_LIBC_FULL_BUILD)
-  set(full_build_flag "-DLIBC_FULL_BUILD")
-endif()
-
-if(LIBC_CONF_ERRNO_MODE)
-  set(errno_config_copts "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}")
-endif()
 
 add_entrypoint_object(
   errno
   SRCS
     libc_errno.cpp
   HDRS
-    libc_errno.h     # Include this
-  COMPILE_OPTIONS
-    ${full_build_flag}
-    ${errno_config_copts}
+    ../__support/libc_errno.h
   DEPENDS
     libc.hdr.errno_macros
     libc.src.__support.common
+    libc.src.__support.libc_errno
+    libc.src.__support.macros.attributes
+    libc.src.__support.macros.config
 )
diff --git a/libc/src/errno/libc_errno.cpp b/libc/src/errno/libc_errno.cpp
index d1600d1b050e3..8ff1eec1b1035 100644
--- a/libc/src/errno/libc_errno.cpp
+++ b/libc/src/errno/libc_errno.cpp
@@ -6,51 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "libc_errno.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 
-// libc uses a fallback default value, either system or thread local.
-#define LIBC_ERRNO_MODE_DEFAULT 0
-// libc never stores a value; `errno` macro uses get link-time failure.
-#define LIBC_ERRNO_MODE_UNDEFINED 1
-// libc maintains per-thread state (requires C++ `thread_local` support).
-#define LIBC_ERRNO_MODE_THREAD_LOCAL 2
-// libc maintains shared state used by all threads, contrary to standard C
-// semantics unless always single-threaded; nothing prevents data races.
-#define LIBC_ERRNO_MODE_SHARED 3
-// libc doesn't maintain any internal state, instead the embedder must define
-// `int *__llvm_libc_errno(void);` C function.
-#define LIBC_ERRNO_MODE_EXTERNAL 4
-// libc uses system `<errno.h>` `errno` macro directly in the overlay mode; in
-// fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`.
-#define LIBC_ERRNO_MODE_SYSTEM 5
-
-#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT
-#undef LIBC_ERRNO_MODE
-#if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING)
-#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL
-#else
-#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM
-#endif
-#endif // LIBC_ERRNO_MODE
-
-#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT &&                              \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED &&                            \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL &&                         \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED &&                               \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL &&                             \
-    LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM
-#error LIBC_ERRNO_MODE must be one of the following values: \
-LIBC_ERRNO_MODE_DEFAULT, \
-LIBC_ERRNO_MODE_UNDEFINED, \
-LIBC_ERRNO_MODE_THREAD_LOCAL, \
-LIBC_ERRNO_MODE_SHARED, \
-LIBC_ERRNO_MODE_EXTERNAL, \
-LIBC_ERRNO_MODE_SYSTEM
-#endif
-
 namespace LIBC_NAMESPACE_DECL {
 
+#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+
 #if LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_UNDEFINED
 
 void Errno::operator=(int) {}
@@ -93,4 +56,6 @@ Errno::operator int() { return errno; }
 // Define the global `libc_errno` instance.
 Errno libc_errno;
 
+#endif // LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE
+
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/errno/libc_errno.h b/libc/src/errno/libc_errno.h
deleted file mode 100644
index 44ee2714843ba..0000000000000
--- a/libc/src/errno/libc_errno.h
+++ /dev/null
@@ -1,47 +0,0 @@
-//===-- Implementation header for libc_errno --------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H
-#define LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H
-
-#include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
-
-#include "hdr/errno_macros.h"
-
-// This header is to be consumed by internal implementations, in which all of
-// them should refer to `libc_errno` instead of using `errno` directly from
-// <errno.h> header.
-
-// Unit and hermetic tests should:
-// - #include "src/errno/libc_errno.h"
-// - NOT #include <errno.h>
-// - Only use `libc_errno` in the code
-// - Depend on libc.src.errno.errno
-
-// Integration tests should:
-// - NOT #include "src/errno/libc_errno.h"
-// - #include <errno.h>
-// - Use regular `errno` in the code
-// - Still depend on libc.src.errno.errno
-
-namespace LIBC_NAMESPACE_DECL {
-
-extern "C" int *__llvm_libc_errno() noexcept;
-
-struct Errno {
-  void operator=(int);
-  operator int();
-};
-
-extern Errno libc_errno;
-
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H
diff --git a/libc/src/fcntl/linux/creat.cpp b/libc/src/fcntl/linux/creat.cpp
index 23abae243aed9..71412a8e68c53 100644
--- a/libc/src/fcntl/linux/creat.cpp
+++ b/libc/src/fcntl/linux/creat.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/fcntl/linux/open.cpp b/libc/src/fcntl/linux/open.cpp
index 8b699ecdd2043..a21a03788deaa 100644
--- a/libc/src/fcntl/linux/open.cpp
+++ b/libc/src/fcntl/linux/open.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include "hdr/types/mode_t.h"
diff --git a/libc/src/fcntl/linux/openat.cpp b/libc/src/fcntl/linux/openat.cpp
index 6063d9c00ad6c..b47ad1fb3bb0f 100644
--- a/libc/src/fcntl/linux/openat.cpp
+++ b/libc/src/fcntl/linux/openat.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/types/mode_t.h"
 #include <stdarg.h>
diff --git a/libc/src/inttypes/strtoimax.cpp b/libc/src/inttypes/strtoimax.cpp
index 85f197c75d90c..6e55a4b56aac7 100644
--- a/libc/src/inttypes/strtoimax.cpp
+++ b/libc/src/inttypes/strtoimax.cpp
@@ -8,9 +8,9 @@
 
 #include "src/inttypes/strtoimax.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/inttypes/strtoumax.cpp b/libc/src/inttypes/strtoumax.cpp
index 2e9cbc9acba75..ce5a0a782d979 100644
--- a/libc/src/inttypes/strtoumax.cpp
+++ b/libc/src/inttypes/strtoumax.cpp
@@ -8,9 +8,9 @@
 
 #include "src/inttypes/strtoumax.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/exp10m1f.cpp b/libc/src/math/generic/exp10m1f.cpp
index e973b2921c2e4..27729104e038d 100644
--- a/libc/src/math/generic/exp10m1f.cpp
+++ b/libc/src/math/generic/exp10m1f.cpp
@@ -14,9 +14,9 @@
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
-#include "src/errno/libc_errno.h"
 
 #include "explogxf.h"
 
diff --git a/libc/src/math/generic/exp2m1f.cpp b/libc/src/math/generic/exp2m1f.cpp
index 4913a5e4277e4..127c6eaa494d4 100644
--- a/libc/src/math/generic/exp2m1f.cpp
+++ b/libc/src/math/generic/exp2m1f.cpp
@@ -14,10 +14,10 @@
 #include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/macros/properties/cpu_features.h"
-#include "src/errno/libc_errno.h"
 
 #include "explogxf.h"
 
diff --git a/libc/src/math/generic/nan.cpp b/libc/src/math/generic/nan.cpp
index f92cd3ff5eb50..829a2ea435ac0 100644
--- a/libc/src/math/generic/nan.cpp
+++ b/libc/src/math/generic/nan.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nan.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/nanf.cpp b/libc/src/math/generic/nanf.cpp
index 7287182406acd..1cb66160e736e 100644
--- a/libc/src/math/generic/nanf.cpp
+++ b/libc/src/math/generic/nanf.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nanf.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/nanf128.cpp b/libc/src/math/generic/nanf128.cpp
index 3d8581afa0371..4155c5333a9c2 100644
--- a/libc/src/math/generic/nanf128.cpp
+++ b/libc/src/math/generic/nanf128.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nanf128.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/nanf16.cpp b/libc/src/math/generic/nanf16.cpp
index 27d9d165f4a85..7b166400601bc 100644
--- a/libc/src/math/generic/nanf16.cpp
+++ b/libc/src/math/generic/nanf16.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nanf16.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/math/generic/nanl.cpp b/libc/src/math/generic/nanl.cpp
index 4f698cb3c88d0..58d638c4b531d 100644
--- a/libc/src/math/generic/nanl.cpp
+++ b/libc/src/math/generic/nanl.cpp
@@ -8,9 +8,9 @@
 
 #include "src/math/nanl.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/poll/linux/poll.cpp b/libc/src/poll/linux/poll.cpp
index f82fcbcc6577c..4cac75b9687c8 100644
--- a/libc/src/poll/linux/poll.cpp
+++ b/libc/src/poll/linux/poll.cpp
@@ -13,8 +13,8 @@
 #include "hdr/types/struct_timespec.h"
 #include "src/__support/OSUtil/syscall.h" // syscall_impl
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // SYS_poll, SYS_ppoll
 
diff --git a/libc/src/pthread/pthread_atfork.cpp b/libc/src/pthread/pthread_atfork.cpp
index b2c67c78e5d94..4cad16a02de70 100644
--- a/libc/src/pthread/pthread_atfork.cpp
+++ b/libc/src/pthread/pthread_atfork.cpp
@@ -9,9 +9,9 @@
 #include "pthread_atfork.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/fork_callbacks.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // For pthread_* type definitions.
 
diff --git a/libc/src/pthread/pthread_attr_setdetachstate.cpp b/libc/src/pthread/pthread_attr_setdetachstate.cpp
index 872f694e01f3a..c482d25610c28 100644
--- a/libc/src/pthread/pthread_attr_setdetachstate.cpp
+++ b/libc/src/pthread/pthread_attr_setdetachstate.cpp
@@ -9,8 +9,8 @@
 #include "pthread_attr_setdetachstate.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_attr_setguardsize.cpp b/libc/src/pthread/pthread_attr_setguardsize.cpp
index fa4375e915ab4..c996210a61d8a 100644
--- a/libc/src/pthread/pthread_attr_setguardsize.cpp
+++ b/libc/src/pthread/pthread_attr_setguardsize.cpp
@@ -9,8 +9,8 @@
 #include "pthread_attr_setguardsize.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <linux/param.h> // For EXEC_PAGESIZE.
 #include <pthread.h>
diff --git a/libc/src/pthread/pthread_attr_setstack.cpp b/libc/src/pthread/pthread_attr_setstack.cpp
index 1154055a63a7e..767f959b14003 100644
--- a/libc/src/pthread/pthread_attr_setstack.cpp
+++ b/libc/src/pthread/pthread_attr_setstack.cpp
@@ -10,9 +10,9 @@
 #include "pthread_attr_setstacksize.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h" // For STACK_ALIGNMENT
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 #include <stdint.h>
diff --git a/libc/src/pthread/pthread_attr_setstacksize.cpp b/libc/src/pthread/pthread_attr_setstacksize.cpp
index 0a5d1af661abf..38c77ca761d69 100644
--- a/libc/src/pthread/pthread_attr_setstacksize.cpp
+++ b/libc/src/pthread/pthread_attr_setstacksize.cpp
@@ -9,8 +9,8 @@
 #include "pthread_attr_setstacksize.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp
index 5e825d5ecea69..2f63d5e9d1942 100644
--- a/libc/src/pthread/pthread_condattr_setclock.cpp
+++ b/libc/src/pthread/pthread_condattr_setclock.cpp
@@ -9,8 +9,8 @@
 #include "pthread_condattr_setclock.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/time_macros.h" // CLOCK_MONOTONIC, CLOCK_REALTIME
 #include <pthread.h>         // pthread_condattr_t
diff --git a/libc/src/pthread/pthread_condattr_setpshared.cpp b/libc/src/pthread/pthread_condattr_setpshared.cpp
index 433b2dc1d2d93..9c117499a5592 100644
--- a/libc/src/pthread/pthread_condattr_setpshared.cpp
+++ b/libc/src/pthread/pthread_condattr_setpshared.cpp
@@ -9,8 +9,8 @@
 #include "pthread_condattr_setpshared.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE
 
diff --git a/libc/src/pthread/pthread_create.cpp b/libc/src/pthread/pthread_create.cpp
index e1b1f3b325d1c..45be2807fa832 100644
--- a/libc/src/pthread/pthread_create.cpp
+++ b/libc/src/pthread/pthread_create.cpp
@@ -16,10 +16,10 @@
 #include "pthread_attr_getstack.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // For pthread_* type definitions.
 
diff --git a/libc/src/pthread/pthread_key_create.cpp b/libc/src/pthread/pthread_key_create.cpp
index 383762f273e7a..7253de14cc0d5 100644
--- a/libc/src/pthread/pthread_key_create.cpp
+++ b/libc/src/pthread/pthread_key_create.cpp
@@ -9,9 +9,9 @@
 #include "pthread_key_create.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_key_delete.cpp b/libc/src/pthread/pthread_key_delete.cpp
index b54db821ab05a..2b14d874fe31c 100644
--- a/libc/src/pthread/pthread_key_delete.cpp
+++ b/libc/src/pthread/pthread_key_delete.cpp
@@ -9,9 +9,9 @@
 #include "pthread_key_delete.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_mutexattr_setpshared.cpp b/libc/src/pthread/pthread_mutexattr_setpshared.cpp
index deeae15be2303..a87a08259c4bb 100644
--- a/libc/src/pthread/pthread_mutexattr_setpshared.cpp
+++ b/libc/src/pthread/pthread_mutexattr_setpshared.cpp
@@ -10,8 +10,8 @@
 #include "pthread_mutexattr.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_mutexattr_setrobust.cpp b/libc/src/pthread/pthread_mutexattr_setrobust.cpp
index 9fd46f4c928d7..fd7a8d7ce1d17 100644
--- a/libc/src/pthread/pthread_mutexattr_setrobust.cpp
+++ b/libc/src/pthread/pthread_mutexattr_setrobust.cpp
@@ -10,8 +10,8 @@
 #include "pthread_mutexattr.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_mutexattr_settype.cpp b/libc/src/pthread/pthread_mutexattr_settype.cpp
index c7e78271f9c38..5a65f031045d6 100644
--- a/libc/src/pthread/pthread_mutexattr_settype.cpp
+++ b/libc/src/pthread/pthread_mutexattr_settype.cpp
@@ -10,8 +10,8 @@
 #include "pthread_mutexattr.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_rwlock_timedrdlock.cpp b/libc/src/pthread/pthread_rwlock_timedrdlock.cpp
index 112ff5c9cdad3..fcddfed224906 100644
--- a/libc/src/pthread/pthread_rwlock_timedrdlock.cpp
+++ b/libc/src/pthread/pthread_rwlock_timedrdlock.cpp
@@ -9,11 +9,11 @@
 #include "src/pthread/pthread_rwlock_timedrdlock.h"
 #include "src/__support/common.h"
 #include "src/__support/libc_assert.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/threads/linux/rwlock.h"
 #include "src/__support/time/linux/abs_timeout.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_rwlock_trywrlock.cpp b/libc/src/pthread/pthread_rwlock_trywrlock.cpp
index a63dc893e7169..660c15a87b36c 100644
--- a/libc/src/pthread/pthread_rwlock_trywrlock.cpp
+++ b/libc/src/pthread/pthread_rwlock_trywrlock.cpp
@@ -9,9 +9,9 @@
 #include "src/pthread/pthread_rwlock_trywrlock.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/linux/rwlock.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_rwlock_unlock.cpp b/libc/src/pthread/pthread_rwlock_unlock.cpp
index e61290179bd62..5496bea929c51 100644
--- a/libc/src/pthread/pthread_rwlock_unlock.cpp
+++ b/libc/src/pthread/pthread_rwlock_unlock.cpp
@@ -9,9 +9,9 @@
 #include "src/pthread/pthread_rwlock_unlock.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/linux/rwlock.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp b/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp
index 80d34a35c717a..e6800311b8587 100644
--- a/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp
+++ b/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp
@@ -9,8 +9,8 @@
 #include "pthread_rwlockattr_setkind_np.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // pthread_rwlockattr_t
 
diff --git a/libc/src/pthread/pthread_rwlockattr_setpshared.cpp b/libc/src/pthread/pthread_rwlockattr_setpshared.cpp
index 5a7191aefd3d0..4fbd095ac2b46 100644
--- a/libc/src/pthread/pthread_rwlockattr_setpshared.cpp
+++ b/libc/src/pthread/pthread_rwlockattr_setpshared.cpp
@@ -9,8 +9,8 @@
 #include "pthread_rwlockattr_setpshared.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h> // pthread_rwlockattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE
 
diff --git a/libc/src/pthread/pthread_setspecific.cpp b/libc/src/pthread/pthread_setspecific.cpp
index 70c29c1670841..b147a66d2fad7 100644
--- a/libc/src/pthread/pthread_setspecific.cpp
+++ b/libc/src/pthread/pthread_setspecific.cpp
@@ -9,9 +9,9 @@
 #include "pthread_setspecific.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <pthread.h>
 
diff --git a/libc/src/sched/linux/sched_get_priority_max.cpp b/libc/src/sched/linux/sched_get_priority_max.cpp
index 77a82c77405f3..fb30b1e319e7b 100644
--- a/libc/src/sched/linux/sched_get_priority_max.cpp
+++ b/libc/src/sched/linux/sched_get_priority_max.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_get_priority_min.cpp b/libc/src/sched/linux/sched_get_priority_min.cpp
index fca66a15edb55..54f67e915fc17 100644
--- a/libc/src/sched/linux/sched_get_priority_min.cpp
+++ b/libc/src/sched/linux/sched_get_priority_min.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_getaffinity.cpp b/libc/src/sched/linux/sched_getaffinity.cpp
index 7b1fd8c5aa2af..e005819e2a978 100644
--- a/libc/src/sched/linux/sched_getaffinity.cpp
+++ b/libc/src/sched/linux/sched_getaffinity.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sched.h>
 #include <stdint.h>
diff --git a/libc/src/sched/linux/sched_getparam.cpp b/libc/src/sched/linux/sched_getparam.cpp
index 75756a65f0ede..b0576c3ac65b8 100644
--- a/libc/src/sched/linux/sched_getparam.cpp
+++ b/libc/src/sched/linux/sched_getparam.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_getscheduler.cpp b/libc/src/sched/linux/sched_getscheduler.cpp
index 545cda8e7484b..d8e02967a633d 100644
--- a/libc/src/sched/linux/sched_getscheduler.cpp
+++ b/libc/src/sched/linux/sched_getscheduler.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_rr_get_interval.cpp b/libc/src/sched/linux/sched_rr_get_interval.cpp
index 1f0ef69dfc893..5668d596bce1f 100644
--- a/libc/src/sched/linux/sched_rr_get_interval.cpp
+++ b/libc/src/sched/linux/sched_rr_get_interval.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_setaffinity.cpp b/libc/src/sched/linux/sched_setaffinity.cpp
index cad48c26bf938..93e930dcf2e3e 100644
--- a/libc/src/sched/linux/sched_setaffinity.cpp
+++ b/libc/src/sched/linux/sched_setaffinity.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sched.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sched/linux/sched_setparam.cpp b/libc/src/sched/linux/sched_setparam.cpp
index e78e78a707e05..7875d9e2f19bc 100644
--- a/libc/src/sched/linux/sched_setparam.cpp
+++ b/libc/src/sched/linux/sched_setparam.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_setscheduler.cpp b/libc/src/sched/linux/sched_setscheduler.cpp
index b6b6f667b3f9e..232e5a59b1858 100644
--- a/libc/src/sched/linux/sched_setscheduler.cpp
+++ b/libc/src/sched/linux/sched_setscheduler.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sched/linux/sched_yield.cpp b/libc/src/sched/linux/sched_yield.cpp
index 3de9d0ba35717..c1e9168f34d0e 100644
--- a/libc/src/sched/linux/sched_yield.cpp
+++ b/libc/src/sched/linux/sched_yield.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/search/hcreate.cpp b/libc/src/search/hcreate.cpp
index ac816a902e221..68bdb29e51dfb 100644
--- a/libc/src/search/hcreate.cpp
+++ b/libc/src/search/hcreate.cpp
@@ -9,8 +9,8 @@
 #include "src/search/hcreate.h"
 #include "src/__support/HashTable/randomness.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/search/hsearch/global.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/search/hcreate_r.cpp b/libc/src/search/hcreate_r.cpp
index 17acd808c19a6..c89be803b4e16 100644
--- a/libc/src/search/hcreate_r.cpp
+++ b/libc/src/search/hcreate_r.cpp
@@ -9,8 +9,8 @@
 #include "src/search/hcreate_r.h"
 #include "src/__support/HashTable/randomness.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, hcreate_r,
diff --git a/libc/src/search/hdestroy_r.cpp b/libc/src/search/hdestroy_r.cpp
index 7eff5bb6fff9d..ba5476098be29 100644
--- a/libc/src/search/hdestroy_r.cpp
+++ b/libc/src/search/hdestroy_r.cpp
@@ -8,8 +8,8 @@
 
 #include "src/search/hdestroy_r.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(void, hdestroy_r, (struct hsearch_data * htab)) {
diff --git a/libc/src/search/hsearch.cpp b/libc/src/search/hsearch.cpp
index c18b5d3d7f547..034333d170579 100644
--- a/libc/src/search/hsearch.cpp
+++ b/libc/src/search/hsearch.cpp
@@ -9,8 +9,8 @@
 #include "src/search/hsearch.h"
 #include "src/__support/HashTable/randomness.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/search/hsearch/global.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/search/hsearch_r.cpp b/libc/src/search/hsearch_r.cpp
index f93e608a190b1..323001e1b103d 100644
--- a/libc/src/search/hsearch_r.cpp
+++ b/libc/src/search/hsearch_r.cpp
@@ -8,8 +8,8 @@
 
 #include "src/search/hsearch_r.h"
 #include "src/__support/HashTable/table.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, hsearch_r,
diff --git a/libc/src/signal/linux/kill.cpp b/libc/src/signal/linux/kill.cpp
index ed117858f51ef..0f5e88757acb8 100644
--- a/libc/src/signal/linux/kill.cpp
+++ b/libc/src/signal/linux/kill.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 #include <signal.h>
diff --git a/libc/src/signal/linux/sigaction.cpp b/libc/src/signal/linux/sigaction.cpp
index 65ec36741683c..43a3e195474e5 100644
--- a/libc/src/signal/linux/sigaction.cpp
+++ b/libc/src/signal/linux/sigaction.cpp
@@ -10,8 +10,8 @@
 
 #include "hdr/types/sigset_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/signal/linux/sigaddset.cpp b/libc/src/signal/linux/sigaddset.cpp
index 628883e13b887..2091e8b51453f 100644
--- a/libc/src/signal/linux/sigaddset.cpp
+++ b/libc/src/signal/linux/sigaddset.cpp
@@ -10,8 +10,8 @@
 
 #include "hdr/types/sigset_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/signal/linux/sigaltstack.cpp b/libc/src/signal/linux/sigaltstack.cpp
index c19394cd17912..990b841c6d904 100644
--- a/libc/src/signal/linux/sigaltstack.cpp
+++ b/libc/src/signal/linux/sigaltstack.cpp
@@ -8,8 +8,8 @@
 
 #include "src/signal/sigaltstack.h"
 #include "hdr/types/stack_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 #include "src/__support/common.h"
diff --git a/libc/src/signal/linux/sigdelset.cpp b/libc/src/signal/linux/sigdelset.cpp
index 2e964051ebde7..6fce0d7a6e147 100644
--- a/libc/src/signal/linux/sigdelset.cpp
+++ b/libc/src/signal/linux/sigdelset.cpp
@@ -10,8 +10,8 @@
 
 #include "hdr/types/sigset_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/signal/linux/sigemptyset.cpp b/libc/src/signal/linux/sigemptyset.cpp
index d347477695e6c..034a9e2cbe15e 100644
--- a/libc/src/signal/linux/sigemptyset.cpp
+++ b/libc/src/signal/linux/sigemptyset.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/signal/sigemptyset.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 #include "src/__support/common.h"
diff --git a/libc/src/signal/linux/sigfillset.cpp b/libc/src/signal/linux/sigfillset.cpp
index 3e9897a03bb73..f0b499093b319 100644
--- a/libc/src/signal/linux/sigfillset.cpp
+++ b/libc/src/signal/linux/sigfillset.cpp
@@ -10,8 +10,8 @@
 
 #include "hdr/types/sigset_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/signal/linux/sigprocmask.cpp b/libc/src/signal/linux/sigprocmask.cpp
index 8838379ae5d30..af3c424c5f34e 100644
--- a/libc/src/signal/linux/sigprocmask.cpp
+++ b/libc/src/signal/linux/sigprocmask.cpp
@@ -11,8 +11,8 @@
 #include "hdr/types/sigset_t.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/spawn/posix_spawn_file_actions_addclose.cpp b/libc/src/spawn/posix_spawn_file_actions_addclose.cpp
index bb8504f655c4a..9a575bd591632 100644
--- a/libc/src/spawn/posix_spawn_file_actions_addclose.cpp
+++ b/libc/src/spawn/posix_spawn_file_actions_addclose.cpp
@@ -11,8 +11,8 @@
 #include "file_actions.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <spawn.h>
 
diff --git a/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp b/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp
index 710063d52e74d..1ad45ed942bb9 100644
--- a/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp
+++ b/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp
@@ -11,8 +11,8 @@
 #include "file_actions.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <spawn.h>
 
diff --git a/libc/src/spawn/posix_spawn_file_actions_addopen.cpp b/libc/src/spawn/posix_spawn_file_actions_addopen.cpp
index 028d6e895f3c4..9977fc2d0a218 100644
--- a/libc/src/spawn/posix_spawn_file_actions_addopen.cpp
+++ b/libc/src/spawn/posix_spawn_file_actions_addopen.cpp
@@ -11,8 +11,8 @@
 #include "file_actions.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <spawn.h>
 
diff --git a/libc/src/spawn/posix_spawn_file_actions_destroy.cpp b/libc/src/spawn/posix_spawn_file_actions_destroy.cpp
index 168118da249d1..affd338005cf4 100644
--- a/libc/src/spawn/posix_spawn_file_actions_destroy.cpp
+++ b/libc/src/spawn/posix_spawn_file_actions_destroy.cpp
@@ -12,8 +12,8 @@
 
 #include "src/__support/CPP/new.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <spawn.h>
 
diff --git a/libc/src/stdio/fopencookie.cpp b/libc/src/stdio/fopencookie.cpp
index 9f5694e8e0581..da8a132a4db6e 100644
--- a/libc/src/stdio/fopencookie.cpp
+++ b/libc/src/stdio/fopencookie.cpp
@@ -14,8 +14,8 @@
 #include "src/__support/CPP/new.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fclose.cpp b/libc/src/stdio/generic/fclose.cpp
index 388407a58d414..902b4cf972373 100644
--- a/libc/src/stdio/generic/fclose.cpp
+++ b/libc/src/stdio/generic/fclose.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fflush.cpp b/libc/src/stdio/generic/fflush.cpp
index 5bdf71ad35940..d0271d9154c87 100644
--- a/libc/src/stdio/generic/fflush.cpp
+++ b/libc/src/stdio/generic/fflush.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fgetc.cpp b/libc/src/stdio/generic/fgetc.cpp
index aa6660ca180cf..e65ce2fda49bd 100644
--- a/libc/src/stdio/generic/fgetc.cpp
+++ b/libc/src/stdio/generic/fgetc.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fgetc_unlocked.cpp b/libc/src/stdio/generic/fgetc_unlocked.cpp
index 34a27f1d1c420..5c07d4feb513e 100644
--- a/libc/src/stdio/generic/fgetc_unlocked.cpp
+++ b/libc/src/stdio/generic/fgetc_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fgets.cpp b/libc/src/stdio/generic/fgets.cpp
index de6474087a140..e0ad9b6e2f564 100644
--- a/libc/src/stdio/generic/fgets.cpp
+++ b/libc/src/stdio/generic/fgets.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fopen.cpp b/libc/src/stdio/generic/fopen.cpp
index d6e418bacf37e..57c85c2e54e16 100644
--- a/libc/src/stdio/generic/fopen.cpp
+++ b/libc/src/stdio/generic/fopen.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fputc.cpp b/libc/src/stdio/generic/fputc.cpp
index 54a38aeb2f1e2..6639f0687c87a 100644
--- a/libc/src/stdio/generic/fputc.cpp
+++ b/libc/src/stdio/generic/fputc.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fputs.cpp b/libc/src/stdio/generic/fputs.cpp
index 8aef7683b3ce3..621b40f63c912 100644
--- a/libc/src/stdio/generic/fputs.cpp
+++ b/libc/src/stdio/generic/fputs.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fread.cpp b/libc/src/stdio/generic/fread.cpp
index 3a04094ea8b4b..1b576ec34688f 100644
--- a/libc/src/stdio/generic/fread.cpp
+++ b/libc/src/stdio/generic/fread.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fread_unlocked.cpp b/libc/src/stdio/generic/fread_unlocked.cpp
index 151f43c6bbeba..257f1a212add4 100644
--- a/libc/src/stdio/generic/fread_unlocked.cpp
+++ b/libc/src/stdio/generic/fread_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fseek.cpp b/libc/src/stdio/generic/fseek.cpp
index 21820da18542a..99191e7c41949 100644
--- a/libc/src/stdio/generic/fseek.cpp
+++ b/libc/src/stdio/generic/fseek.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/fseek.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fseeko.cpp b/libc/src/stdio/generic/fseeko.cpp
index 7456b4a219079..afcfc71c7c09a 100644
--- a/libc/src/stdio/generic/fseeko.cpp
+++ b/libc/src/stdio/generic/fseeko.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/fseeko.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/ftell.cpp b/libc/src/stdio/generic/ftell.cpp
index ec15ca4e96caf..b55a806007aff 100644
--- a/libc/src/stdio/generic/ftell.cpp
+++ b/libc/src/stdio/generic/ftell.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/ftell.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/ftello.cpp b/libc/src/stdio/generic/ftello.cpp
index e3d0726ec4843..91031cb7fad70 100644
--- a/libc/src/stdio/generic/ftello.cpp
+++ b/libc/src/stdio/generic/ftello.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/ftello.h"
 #include "src/__support/File/file.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/fwrite.cpp b/libc/src/stdio/generic/fwrite.cpp
index 66eb9a3c71855..b44ecb2838118 100644
--- a/libc/src/stdio/generic/fwrite.cpp
+++ b/libc/src/stdio/generic/fwrite.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/fwrite_unlocked.cpp b/libc/src/stdio/generic/fwrite_unlocked.cpp
index a0d9014cd68de..2f9ec26f2f80c 100644
--- a/libc/src/stdio/generic/fwrite_unlocked.cpp
+++ b/libc/src/stdio/generic/fwrite_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/getc.cpp b/libc/src/stdio/generic/getc.cpp
index e988468898c53..0ac010ebc5994 100644
--- a/libc/src/stdio/generic/getc.cpp
+++ b/libc/src/stdio/generic/getc.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/getc_unlocked.cpp b/libc/src/stdio/generic/getc_unlocked.cpp
index 92d5092623ac5..eee23a18d05df 100644
--- a/libc/src/stdio/generic/getc_unlocked.cpp
+++ b/libc/src/stdio/generic/getc_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/getchar.cpp b/libc/src/stdio/generic/getchar.cpp
index 371fc70eb214f..87d24a2b1f09e 100644
--- a/libc/src/stdio/generic/getchar.cpp
+++ b/libc/src/stdio/generic/getchar.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/getchar_unlocked.cpp b/libc/src/stdio/generic/getchar_unlocked.cpp
index b898f5cb25963..f321969483e35 100644
--- a/libc/src/stdio/generic/getchar_unlocked.cpp
+++ b/libc/src/stdio/generic/getchar_unlocked.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/generic/putc.cpp b/libc/src/stdio/generic/putc.cpp
index b5f008fdce44a..83bc3d4131e76 100644
--- a/libc/src/stdio/generic/putc.cpp
+++ b/libc/src/stdio/generic/putc.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/putchar.cpp b/libc/src/stdio/generic/putchar.cpp
index e86df23d6716b..2b3509e5e414c 100644
--- a/libc/src/stdio/generic/putchar.cpp
+++ b/libc/src/stdio/generic/putchar.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/generic/puts.cpp b/libc/src/stdio/generic/puts.cpp
index 7dbe2c79f920d..4267dd546c4dc 100644
--- a/libc/src/stdio/generic/puts.cpp
+++ b/libc/src/stdio/generic/puts.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/gpu/fprintf.cpp b/libc/src/stdio/gpu/fprintf.cpp
index 5b8f01d7d5346..9877817d92099 100644
--- a/libc/src/stdio/gpu/fprintf.cpp
+++ b/libc/src/stdio/gpu/fprintf.cpp
@@ -12,7 +12,7 @@
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/stdio/gpu/vfprintf_utils.h"
 
 #include <stdarg.h>
diff --git a/libc/src/stdio/gpu/printf.cpp b/libc/src/stdio/gpu/printf.cpp
index 53fe69d5e2ebe..8a9174d7397ae 100644
--- a/libc/src/stdio/gpu/printf.cpp
+++ b/libc/src/stdio/gpu/printf.cpp
@@ -11,7 +11,7 @@
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/arg_list.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/stdio/gpu/vfprintf_utils.h"
 
 #include <stdarg.h>
diff --git a/libc/src/stdio/linux/fdopen.cpp b/libc/src/stdio/linux/fdopen.cpp
index 7d72fdc88e9fb..5623f06b7cff0 100644
--- a/libc/src/stdio/linux/fdopen.cpp
+++ b/libc/src/stdio/linux/fdopen.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/fdopen.h"
 
 #include "src/__support/File/linux/file.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/linux/remove.cpp b/libc/src/stdio/linux/remove.cpp
index dbb4491d0e6cc..ac755db0bc781 100644
--- a/libc/src/stdio/linux/remove.cpp
+++ b/libc/src/stdio/linux/remove.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h" // For AT_* macros.
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/linux/rename.cpp b/libc/src/stdio/linux/rename.cpp
index fbcb29be48f4e..426c8698e557d 100644
--- a/libc/src/stdio/linux/rename.cpp
+++ b/libc/src/stdio/linux/rename.cpp
@@ -10,8 +10,8 @@
 #include "hdr/fcntl_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index 89556f1a9e5f2..cef9b1ae58fa0 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -25,7 +25,7 @@
 #include "src/__support/fixed_point/fx_rep.h"
 #endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT
 #ifndef LIBC_COPT_PRINTF_DISABLE_STRERROR
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #endif // LIBC_COPT_PRINTF_DISABLE_STRERROR
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdio/setbuf.cpp b/libc/src/stdio/setbuf.cpp
index f3db97de58371..fcc6df12ddb08 100644
--- a/libc/src/stdio/setbuf.cpp
+++ b/libc/src/stdio/setbuf.cpp
@@ -9,8 +9,8 @@
 #include "src/stdio/setbuf.h"
 #include "hdr/stdio_macros.h"
 #include "src/__support/File/file.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdio/setvbuf.cpp b/libc/src/stdio/setvbuf.cpp
index 0a6b8cacb59c8..9fc6cb040233b 100644
--- a/libc/src/stdio/setvbuf.cpp
+++ b/libc/src/stdio/setvbuf.cpp
@@ -10,8 +10,8 @@
 #include "src/__support/File/file.h"
 
 #include "hdr/types/FILE.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/stdlib/atof.cpp b/libc/src/stdlib/atof.cpp
index 18a65c67705d3..d0d8d211dea8c 100644
--- a/libc/src/stdlib/atof.cpp
+++ b/libc/src/stdlib/atof.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/atof.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/atoi.cpp b/libc/src/stdlib/atoi.cpp
index 9e46b53b1aa0b..420bbc8143d55 100644
--- a/libc/src/stdlib/atoi.cpp
+++ b/libc/src/stdlib/atoi.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/atoi.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/atol.cpp b/libc/src/stdlib/atol.cpp
index 7f3414a4afdd2..e1110ffa449b0 100644
--- a/libc/src/stdlib/atol.cpp
+++ b/libc/src/stdlib/atol.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/atol.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/atoll.cpp b/libc/src/stdlib/atoll.cpp
index 4f1a02ad8315b..063e817f9b790 100644
--- a/libc/src/stdlib/atoll.cpp
+++ b/libc/src/stdlib/atoll.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/atoll.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtod.cpp b/libc/src/stdlib/strtod.cpp
index 2c6819163aa46..deb2390c7fcde 100644
--- a/libc/src/stdlib/strtod.cpp
+++ b/libc/src/stdlib/strtod.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtod.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtod_l.cpp b/libc/src/stdlib/strtod_l.cpp
index 247314398315b..ad333b32d2406 100644
--- a/libc/src/stdlib/strtod_l.cpp
+++ b/libc/src/stdlib/strtod_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtod_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtof.cpp b/libc/src/stdlib/strtof.cpp
index 351bf64ad4f70..fc52dc85ffc50 100644
--- a/libc/src/stdlib/strtof.cpp
+++ b/libc/src/stdlib/strtof.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtof.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtof_l.cpp b/libc/src/stdlib/strtof_l.cpp
index d54efa66e0846..c6e03ff51fa2f 100644
--- a/libc/src/stdlib/strtof_l.cpp
+++ b/libc/src/stdlib/strtof_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtof_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtol.cpp b/libc/src/stdlib/strtol.cpp
index 77f8712d7c136..42db36b2052b4 100644
--- a/libc/src/stdlib/strtol.cpp
+++ b/libc/src/stdlib/strtol.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtol.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtol_l.cpp b/libc/src/stdlib/strtol_l.cpp
index f94aff1a0d7b2..497a4403eff4b 100644
--- a/libc/src/stdlib/strtol_l.cpp
+++ b/libc/src/stdlib/strtol_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtol_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtold.cpp b/libc/src/stdlib/strtold.cpp
index 88d29c9f36278..44046c2c6f613 100644
--- a/libc/src/stdlib/strtold.cpp
+++ b/libc/src/stdlib/strtold.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtold.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtold_l.cpp b/libc/src/stdlib/strtold_l.cpp
index d0c57f50246b5..c3af30a1b9ecc 100644
--- a/libc/src/stdlib/strtold_l.cpp
+++ b/libc/src/stdlib/strtold_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtold_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoll.cpp b/libc/src/stdlib/strtoll.cpp
index 8d1b3efdcf87d..c1dca13112e0f 100644
--- a/libc/src/stdlib/strtoll.cpp
+++ b/libc/src/stdlib/strtoll.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoll.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoll_l.cpp b/libc/src/stdlib/strtoll_l.cpp
index e82971d59c48d..6f30d7794c5ca 100644
--- a/libc/src/stdlib/strtoll_l.cpp
+++ b/libc/src/stdlib/strtoll_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoll_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoul.cpp b/libc/src/stdlib/strtoul.cpp
index 1d832318c4489..d26ca5e5a10a1 100644
--- a/libc/src/stdlib/strtoul.cpp
+++ b/libc/src/stdlib/strtoul.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoul.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoul_l.cpp b/libc/src/stdlib/strtoul_l.cpp
index 74fce00a0ac3c..9a875ddee9029 100644
--- a/libc/src/stdlib/strtoul_l.cpp
+++ b/libc/src/stdlib/strtoul_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoul_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoull.cpp b/libc/src/stdlib/strtoull.cpp
index dba22611cfb09..8f929f577311e 100644
--- a/libc/src/stdlib/strtoull.cpp
+++ b/libc/src/stdlib/strtoull.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoull.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/stdlib/strtoull_l.cpp b/libc/src/stdlib/strtoull_l.cpp
index 2ea8a43a40ef2..9eb056b0e59b4 100644
--- a/libc/src/stdlib/strtoull_l.cpp
+++ b/libc/src/stdlib/strtoull_l.cpp
@@ -8,9 +8,9 @@
 
 #include "src/stdlib/strtoull_l.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/string/strdup.cpp b/libc/src/string/strdup.cpp
index 4cf4173a27bf3..dab0ab4288c9e 100644
--- a/libc/src/string/strdup.cpp
+++ b/libc/src/string/strdup.cpp
@@ -8,8 +8,8 @@
 
 #include "src/string/strdup.h"
 #include "hdr/stdlib_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/string/allocating_string_utils.h"
 #include "src/string/memory_utils/inline_memcpy.h"
 
diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp
index 236fd25698f65..f3ae7c5c4e07a 100644
--- a/libc/src/sys/auxv/linux/getauxval.cpp
+++ b/libc/src/sys/auxv/linux/getauxval.cpp
@@ -9,8 +9,8 @@
 #include "src/sys/auxv/getauxval.h"
 #include "config/app.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <linux/auxvec.h>
 
 // for guarded initialization
diff --git a/libc/src/sys/epoll/linux/epoll_create.cpp b/libc/src/sys/epoll/linux/epoll_create.cpp
index 7196ac7410c30..2e44e883ddf0a 100644
--- a/libc/src/sys/epoll/linux/epoll_create.cpp
+++ b/libc/src/sys/epoll/linux/epoll_create.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/epoll/linux/epoll_create1.cpp b/libc/src/sys/epoll/linux/epoll_create1.cpp
index efff282e2714d..3c60090fb7b41 100644
--- a/libc/src/sys/epoll/linux/epoll_create1.cpp
+++ b/libc/src/sys/epoll/linux/epoll_create1.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/epoll/linux/epoll_ctl.cpp b/libc/src/sys/epoll/linux/epoll_ctl.cpp
index 5f7dbb77b1e5b..079bd60403b09 100644
--- a/libc/src/sys/epoll/linux/epoll_ctl.cpp
+++ b/libc/src/sys/epoll/linux/epoll_ctl.cpp
@@ -11,8 +11,8 @@
 #include "hdr/types/struct_epoll_event.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp
index d7836549928c4..24fd1dbdc467d 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp
@@ -13,9 +13,9 @@
 #include "hdr/types/struct_epoll_event.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
index 14b419399fe9b..219984528efdd 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
@@ -14,9 +14,9 @@
 #include "hdr/types/struct_timespec.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp
index 1a63be5e260fb..7fae7b55992fa 100644
--- a/libc/src/sys/epoll/linux/epoll_wait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_wait.cpp
@@ -13,9 +13,9 @@
 #include "hdr/types/struct_epoll_event.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/mman/linux/madvise.cpp b/libc/src/sys/mman/linux/madvise.cpp
index 332d6c2db4acb..1bb284f62b892 100644
--- a/libc/src/sys/mman/linux/madvise.cpp
+++ b/libc/src/sys/mman/linux/madvise.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mincore.cpp b/libc/src/sys/mman/linux/mincore.cpp
index b5436fda3853a..d583f1ef85f3d 100644
--- a/libc/src/sys/mman/linux/mincore.cpp
+++ b/libc/src/sys/mman/linux/mincore.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mlock.cpp b/libc/src/sys/mman/linux/mlock.cpp
index be7eb28e29c4f..8582eb7c00632 100644
--- a/libc/src/sys/mman/linux/mlock.cpp
+++ b/libc/src/sys/mman/linux/mlock.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mlock2.cpp b/libc/src/sys/mman/linux/mlock2.cpp
index 7bc557f9bf58f..955cfe128de74 100644
--- a/libc/src/sys/mman/linux/mlock2.cpp
+++ b/libc/src/sys/mman/linux/mlock2.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mlockall.cpp b/libc/src/sys/mman/linux/mlockall.cpp
index eae3a9ea0a183..c3502fbb3af39 100644
--- a/libc/src/sys/mman/linux/mlockall.cpp
+++ b/libc/src/sys/mman/linux/mlockall.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mmap.cpp b/libc/src/sys/mman/linux/mmap.cpp
index ee9a0a32e8f55..33f9fe8ff3709 100644
--- a/libc/src/sys/mman/linux/mmap.cpp
+++ b/libc/src/sys/mman/linux/mmap.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <linux/param.h> // For EXEC_PAGESIZE.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/mman/linux/mprotect.cpp b/libc/src/sys/mman/linux/mprotect.cpp
index e2351028e2c7f..6b14915b60c94 100644
--- a/libc/src/sys/mman/linux/mprotect.cpp
+++ b/libc/src/sys/mman/linux/mprotect.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/mremap.cpp b/libc/src/sys/mman/linux/mremap.cpp
index 38bcfce833d3d..6cdda9435bb69 100644
--- a/libc/src/sys/mman/linux/mremap.cpp
+++ b/libc/src/sys/mman/linux/mremap.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <linux/param.h> // For EXEC_PAGESIZE.
 #include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sys/mman/linux/msync.cpp b/libc/src/sys/mman/linux/msync.cpp
index e2b4f81d616ad..650678bcb36e0 100644
--- a/libc/src/sys/mman/linux/msync.cpp
+++ b/libc/src/sys/mman/linux/msync.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/munlock.cpp b/libc/src/sys/mman/linux/munlock.cpp
index 93c25f844c6e8..9638949f5fcb3 100644
--- a/libc/src/sys/mman/linux/munlock.cpp
+++ b/libc/src/sys/mman/linux/munlock.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/munlockall.cpp b/libc/src/sys/mman/linux/munlockall.cpp
index f5911cb01bc28..f47eaece178e3 100644
--- a/libc/src/sys/mman/linux/munlockall.cpp
+++ b/libc/src/sys/mman/linux/munlockall.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/munmap.cpp b/libc/src/sys/mman/linux/munmap.cpp
index 9c01b15ac8dc2..61b1f1549dd18 100644
--- a/libc/src/sys/mman/linux/munmap.cpp
+++ b/libc/src/sys/mman/linux/munmap.cpp
@@ -11,9 +11,9 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
-#include <sys/syscall.h>          // For syscall numbers.
+#include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/mman/linux/remap_file_pages.cpp b/libc/src/sys/mman/linux/remap_file_pages.cpp
index f616e1915ecc5..58ae4017f6285 100644
--- a/libc/src/sys/mman/linux/remap_file_pages.cpp
+++ b/libc/src/sys/mman/linux/remap_file_pages.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/mman/linux/shm_common.h b/libc/src/sys/mman/linux/shm_common.h
index ce75c2b5b6991..69911012ff7e9 100644
--- a/libc/src/sys/mman/linux/shm_common.h
+++ b/libc/src/sys/mman/linux/shm_common.h
@@ -9,8 +9,8 @@
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/string/memory_utils/inline_memcpy.h"
 
 // TODO: Get PATH_MAX via https://github.com/llvm/llvm-project/issues/85121
diff --git a/libc/src/sys/prctl/linux/prctl.cpp b/libc/src/sys/prctl/linux/prctl.cpp
index 5d4e9046b8777..c726b0a539591 100644
--- a/libc/src/sys/prctl/linux/prctl.cpp
+++ b/libc/src/sys/prctl/linux/prctl.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/random/linux/getrandom.cpp b/libc/src/sys/random/linux/getrandom.cpp
index 9a8869a2d6d38..0b8471ed8b374 100644
--- a/libc/src/sys/random/linux/getrandom.cpp
+++ b/libc/src/sys/random/linux/getrandom.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/resource/linux/getrlimit.cpp b/libc/src/sys/resource/linux/getrlimit.cpp
index 30c2e91b036d1..d272134194949 100644
--- a/libc/src/sys/resource/linux/getrlimit.cpp
+++ b/libc/src/sys/resource/linux/getrlimit.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/resource.h> // For struct rlimit
 #include <sys/syscall.h>  // For syscall numbers.
 
diff --git a/libc/src/sys/resource/linux/setrlimit.cpp b/libc/src/sys/resource/linux/setrlimit.cpp
index 85f07900aaef4..300bad75baa63 100644
--- a/libc/src/sys/resource/linux/setrlimit.cpp
+++ b/libc/src/sys/resource/linux/setrlimit.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/resource.h> // For struct rlimit
 #include <sys/syscall.h>  // For syscall numbers.
 
diff --git a/libc/src/sys/select/linux/select.cpp b/libc/src/sys/select/linux/select.cpp
index 9ccb1e95f275c..6c434eb584596 100644
--- a/libc/src/sys/select/linux/select.cpp
+++ b/libc/src/sys/select/linux/select.cpp
@@ -13,8 +13,8 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <stddef.h>      // For size_t
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sys/sendfile/linux/sendfile.cpp b/libc/src/sys/sendfile/linux/sendfile.cpp
index 9d4174cb8c916..ec892323def50 100644
--- a/libc/src/sys/sendfile/linux/sendfile.cpp
+++ b/libc/src/sys/sendfile/linux/sendfile.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/sendfile.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/socket/linux/bind.cpp b/libc/src/sys/socket/linux/bind.cpp
index 72a3307a91ddd..83a3d06f5380b 100644
--- a/libc/src/sys/socket/linux/bind.cpp
+++ b/libc/src/sys/socket/linux/bind.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <linux/net.h>   // For SYS_SOCKET socketcall number.
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sys/socket/linux/recv.cpp b/libc/src/sys/socket/linux/recv.cpp
index 5e9f2d3233fcf..baf4de1b5eb54 100644
--- a/libc/src/sys/socket/linux/recv.cpp
+++ b/libc/src/sys/socket/linux/recv.cpp
@@ -16,8 +16,8 @@
 #include "hdr/types/struct_sockaddr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/recvfrom.cpp b/libc/src/sys/socket/linux/recvfrom.cpp
index 574e65f64a54b..3d8397b478cc4 100644
--- a/libc/src/sys/socket/linux/recvfrom.cpp
+++ b/libc/src/sys/socket/linux/recvfrom.cpp
@@ -16,8 +16,8 @@
 #include "hdr/types/struct_sockaddr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/recvmsg.cpp b/libc/src/sys/socket/linux/recvmsg.cpp
index e42b6346f330a..bc6d072dbf9a1 100644
--- a/libc/src/sys/socket/linux/recvmsg.cpp
+++ b/libc/src/sys/socket/linux/recvmsg.cpp
@@ -15,8 +15,8 @@
 #include "hdr/types/struct_msghdr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/send.cpp b/libc/src/sys/socket/linux/send.cpp
index cb3b4d5a9ece7..43b01e7e6e0f6 100644
--- a/libc/src/sys/socket/linux/send.cpp
+++ b/libc/src/sys/socket/linux/send.cpp
@@ -16,7 +16,7 @@
 #include "hdr/types/struct_sockaddr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/sendmsg.cpp b/libc/src/sys/socket/linux/sendmsg.cpp
index b4d9c9deda028..b04783ebfe7e7 100644
--- a/libc/src/sys/socket/linux/sendmsg.cpp
+++ b/libc/src/sys/socket/linux/sendmsg.cpp
@@ -15,7 +15,7 @@
 #include "hdr/types/struct_msghdr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/sendto.cpp b/libc/src/sys/socket/linux/sendto.cpp
index 2fada192b0865..9dda127f872d5 100644
--- a/libc/src/sys/socket/linux/sendto.cpp
+++ b/libc/src/sys/socket/linux/sendto.cpp
@@ -16,7 +16,7 @@
 #include "hdr/types/struct_sockaddr.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/sys/socket/linux/socket.cpp b/libc/src/sys/socket/linux/socket.cpp
index 3e6df4d487a53..69eb6cfa01ced 100644
--- a/libc/src/sys/socket/linux/socket.cpp
+++ b/libc/src/sys/socket/linux/socket.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <linux/net.h>   // For SYS_SOCKET socketcall number.
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/sys/socket/linux/socketpair.cpp b/libc/src/sys/socket/linux/socketpair.cpp
index 60612ac04d613..7ea8ca46cee58 100644
--- a/libc/src/sys/socket/linux/socketpair.cpp
+++ b/libc/src/sys/socket/linux/socketpair.cpp
@@ -10,9 +10,9 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h"
-#include "src/errno/libc_errno.h"
 #include <linux/net.h>   // For SYS_SOCKET socketcall number.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/chmod.cpp b/libc/src/sys/stat/linux/chmod.cpp
index 1b787e47e7c68..2bd0788ec1dfd 100644
--- a/libc/src/sys/stat/linux/chmod.cpp
+++ b/libc/src/sys/stat/linux/chmod.cpp
@@ -13,8 +13,8 @@
 
 #include "hdr/fcntl_macros.h"
 #include "hdr/types/mode_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/fchmod.cpp b/libc/src/sys/stat/linux/fchmod.cpp
index 0d6fd359169aa..3dadfdd1d943c 100644
--- a/libc/src/sys/stat/linux/fchmod.cpp
+++ b/libc/src/sys/stat/linux/fchmod.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/types/mode_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/fchmodat.cpp b/libc/src/sys/stat/linux/fchmodat.cpp
index e76db4d160fb8..add2192a558a4 100644
--- a/libc/src/sys/stat/linux/fchmodat.cpp
+++ b/libc/src/sys/stat/linux/fchmodat.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/fstat.cpp b/libc/src/sys/stat/linux/fstat.cpp
index 35cf8f08f782d..dea002c5e12a5 100644
--- a/libc/src/sys/stat/linux/fstat.cpp
+++ b/libc/src/sys/stat/linux/fstat.cpp
@@ -8,8 +8,8 @@
 
 #include "src/sys/stat/fstat.h"
 #include "kernel_statx.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/common.h"
 
diff --git a/libc/src/sys/stat/linux/lstat.cpp b/libc/src/sys/stat/linux/lstat.cpp
index 354c5b6e029a4..5601dd5d78a98 100644
--- a/libc/src/sys/stat/linux/lstat.cpp
+++ b/libc/src/sys/stat/linux/lstat.cpp
@@ -8,8 +8,8 @@
 
 #include "src/sys/stat/lstat.h"
 #include "kernel_statx.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
diff --git a/libc/src/sys/stat/linux/mkdir.cpp b/libc/src/sys/stat/linux/mkdir.cpp
index b319b5c8393de..0829ff4f94322 100644
--- a/libc/src/sys/stat/linux/mkdir.cpp
+++ b/libc/src/sys/stat/linux/mkdir.cpp
@@ -13,8 +13,8 @@
 
 #include "hdr/fcntl_macros.h"
 #include "hdr/types/mode_t.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/mkdirat.cpp b/libc/src/sys/stat/linux/mkdirat.cpp
index 097fc158010d1..8f4194dc32752 100644
--- a/libc/src/sys/stat/linux/mkdirat.cpp
+++ b/libc/src/sys/stat/linux/mkdirat.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/stat.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/sys/stat/linux/stat.cpp b/libc/src/sys/stat/linux/stat.cpp
index de9cdb197d687..5553eaf00be2a 100644
--- a/libc/src/sys/stat/linux/stat.cpp
+++ b/libc/src/sys/stat/linux/stat.cpp
@@ -8,8 +8,8 @@
 
 #include "src/sys/stat/stat.h"
 #include "kernel_statx.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/common.h"
 
diff --git a/libc/src/sys/statvfs/linux/statfs_utils.h b/libc/src/sys/statvfs/linux/statfs_utils.h
index 1e5be51531012..8ee4de288ef61 100644
--- a/libc/src/sys/statvfs/linux/statfs_utils.h
+++ b/libc/src/sys/statvfs/linux/statfs_utils.h
@@ -12,9 +12,9 @@
 #include "include/llvm-libc-types/struct_statvfs.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/OSUtil/syscall.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <asm/statfs.h>
 #include <sys/syscall.h>
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/time/linux/getitimer.cpp b/libc/src/sys/time/linux/getitimer.cpp
index fec06aa4086e9..b874066796940 100644
--- a/libc/src/sys/time/linux/getitimer.cpp
+++ b/libc/src/sys/time/linux/getitimer.cpp
@@ -10,7 +10,7 @@
 #include "hdr/types/struct_itimerval.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/time/linux/setitimer.cpp b/libc/src/sys/time/linux/setitimer.cpp
index def04a4740118..1de0d43297760 100644
--- a/libc/src/sys/time/linux/setitimer.cpp
+++ b/libc/src/sys/time/linux/setitimer.cpp
@@ -9,7 +9,7 @@
 #include "hdr/types/struct_itimerval.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/time/linux/utimes.cpp b/libc/src/sys/time/linux/utimes.cpp
index 76b69937a5f48..ed37b42aedf6c 100644
--- a/libc/src/sys/time/linux/utimes.cpp
+++ b/libc/src/sys/time/linux/utimes.cpp
@@ -15,7 +15,7 @@
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #include <sys/syscall.h>
 
diff --git a/libc/src/sys/uio/linux/readv.cpp b/libc/src/sys/uio/linux/readv.cpp
index f1393a9749be9..c9d8d87ddc72b 100644
--- a/libc/src/sys/uio/linux/readv.cpp
+++ b/libc/src/sys/uio/linux/readv.cpp
@@ -10,7 +10,7 @@
 #include "hdr/types/struct_iovec.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/uio/linux/writev.cpp b/libc/src/sys/uio/linux/writev.cpp
index 8992bed95c982..b0b9e15207922 100644
--- a/libc/src/sys/uio/linux/writev.cpp
+++ b/libc/src/sys/uio/linux/writev.cpp
@@ -10,7 +10,7 @@
 #include "hdr/types/struct_iovec.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/sys/utsname/linux/uname.cpp b/libc/src/sys/utsname/linux/uname.cpp
index 7bb227e801e3a..b47ba964faf0b 100644
--- a/libc/src/sys/utsname/linux/uname.cpp
+++ b/libc/src/sys/utsname/linux/uname.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 #include <sys/utsname.h>
 
diff --git a/libc/src/sys/wait/wait4Impl.h b/libc/src/sys/wait/wait4Impl.h
index f2bdeb02f8668..77ed3ad22f148 100644
--- a/libc/src/sys/wait/wait4Impl.h
+++ b/libc/src/sys/wait/wait4Impl.h
@@ -12,8 +12,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <signal.h>
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/termios/linux/cfsetispeed.cpp b/libc/src/termios/linux/cfsetispeed.cpp
index 9656b714a8ed2..47b19974d21be 100644
--- a/libc/src/termios/linux/cfsetispeed.cpp
+++ b/libc/src/termios/linux/cfsetispeed.cpp
@@ -9,8 +9,8 @@
 #include "src/termios/cfsetispeed.h"
 
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <termios.h>
 
diff --git a/libc/src/termios/linux/cfsetospeed.cpp b/libc/src/termios/linux/cfsetospeed.cpp
index 6130d266dbff0..d2f138257a47a 100644
--- a/libc/src/termios/linux/cfsetospeed.cpp
+++ b/libc/src/termios/linux/cfsetospeed.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/termios/cfsetospeed.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/common.h"
 
diff --git a/libc/src/termios/linux/tcdrain.cpp b/libc/src/termios/linux/tcdrain.cpp
index 116e3f0e0cbc5..570b15c24fe7f 100644
--- a/libc/src/termios/linux/tcdrain.cpp
+++ b/libc/src/termios/linux/tcdrain.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcflow.cpp b/libc/src/termios/linux/tcflow.cpp
index d229230b5d138..714ef6aa71298 100644
--- a/libc/src/termios/linux/tcflow.cpp
+++ b/libc/src/termios/linux/tcflow.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcflush.cpp b/libc/src/termios/linux/tcflush.cpp
index 028a5414b1960..4c7b9fadc446d 100644
--- a/libc/src/termios/linux/tcflush.cpp
+++ b/libc/src/termios/linux/tcflush.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcgetattr.cpp b/libc/src/termios/linux/tcgetattr.cpp
index 63c096ff88eba..2e768269c874d 100644
--- a/libc/src/termios/linux/tcgetattr.cpp
+++ b/libc/src/termios/linux/tcgetattr.cpp
@@ -11,8 +11,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcgetsid.cpp b/libc/src/termios/linux/tcgetsid.cpp
index c283d0e4fda9a..7487816cf2741 100644
--- a/libc/src/termios/linux/tcgetsid.cpp
+++ b/libc/src/termios/linux/tcgetsid.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcsendbreak.cpp b/libc/src/termios/linux/tcsendbreak.cpp
index 30bc91cf3de0a..1d546c1d5953e 100644
--- a/libc/src/termios/linux/tcsendbreak.cpp
+++ b/libc/src/termios/linux/tcsendbreak.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/termios/linux/tcsetattr.cpp b/libc/src/termios/linux/tcsetattr.cpp
index 8aa1e5c57b34e..8a2c7290217ba 100644
--- a/libc/src/termios/linux/tcsetattr.cpp
+++ b/libc/src/termios/linux/tcsetattr.cpp
@@ -11,8 +11,8 @@
 
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <asm/ioctls.h> // Safe to include without the risk of name pollution.
 #include <sys/syscall.h> // For syscall numbers
diff --git a/libc/src/threads/thrd_create.cpp b/libc/src/threads/thrd_create.cpp
index 4680944c2eee0..67e22e72fd0e4 100644
--- a/libc/src/threads/thrd_create.cpp
+++ b/libc/src/threads/thrd_create.cpp
@@ -8,9 +8,9 @@
 
 #include "src/threads/thrd_create.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/threads/thread.h"
-#include "src/errno/libc_errno.h"
 
 #include <threads.h> // For thrd_* type definitions.
 
diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp
index ee4fa82b4f894..c38697cd0668e 100644
--- a/libc/src/time/linux/clock.cpp
+++ b/libc/src/time/linux/clock.cpp
@@ -10,10 +10,10 @@
 #include "hdr/time_macros.h"
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
 #include "src/__support/time/units.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp
index 743c644d65d02..b3fcd2b22f9da 100644
--- a/libc/src/time/linux/clock_gettime.cpp
+++ b/libc/src/time/linux/clock_gettime.cpp
@@ -8,9 +8,9 @@
 
 #include "src/time/clock_gettime.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/linux/gettimeofday.cpp b/libc/src/time/linux/gettimeofday.cpp
index e8ddf482fc984..237b05903c70f 100644
--- a/libc/src/time/linux/gettimeofday.cpp
+++ b/libc/src/time/linux/gettimeofday.cpp
@@ -10,10 +10,10 @@
 #include "hdr/time_macros.h"
 #include "hdr/types/suseconds_t.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
 #include "src/__support/time/units.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp
index 7a856376ffb20..6b9704126a0a5 100644
--- a/libc/src/time/linux/nanosleep.cpp
+++ b/libc/src/time/linux/nanosleep.cpp
@@ -10,8 +10,8 @@
 #include "hdr/time_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For syscall functions.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <stdint.h>      // For int64_t.
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/time/linux/timespec_get.cpp b/libc/src/time/linux/timespec_get.cpp
index cf5174523aa4f..a4d4372332732 100644
--- a/libc/src/time/linux/timespec_get.cpp
+++ b/libc/src/time/linux/timespec_get.cpp
@@ -9,9 +9,9 @@
 #include "src/time/timespec_get.h"
 #include "hdr/time_macros.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/time/time.cpp b/libc/src/time/time.cpp
index 860909af7488c..2a81f0182c313 100644
--- a/libc/src/time/time.cpp
+++ b/libc/src/time/time.cpp
@@ -10,9 +10,9 @@
 
 #include "hdr/time_macros.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/time/clock_gettime.h"
-#include "src/errno/libc_errno.h"
 
 namespace LIBC_NAMESPACE_DECL {
 // avoid inconsitent clang-format behavior
diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h
index bbbb1c08a4759..0541c24ece82b 100644
--- a/libc/src/time/time_utils.h
+++ b/libc/src/time/time_utils.h
@@ -15,8 +15,8 @@
 #include "src/__support/CPP/optional.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "time_constants.h"
 
 #include <stdint.h>
diff --git a/libc/src/time/windows/clock_getres.cpp b/libc/src/time/windows/clock_getres.cpp
index b8c0c82aa6419..969bb66be2d25 100644
--- a/libc/src/time/windows/clock_getres.cpp
+++ b/libc/src/time/windows/clock_getres.cpp
@@ -13,10 +13,10 @@
 
 #include "src/__support/CPP/limits.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/time/units.h"
 #include "src/__support/time/windows/performance_counter.h"
-#include "src/errno/libc_errno.h"
 #include "src/time/clock_getres.h"
 
 #define WIN32_LEAN_AND_MEAN
diff --git a/libc/src/unistd/linux/access.cpp b/libc/src/unistd/linux/access.cpp
index 2f7ebbcdf9e81..55cd6adca779d 100644
--- a/libc/src/unistd/linux/access.cpp
+++ b/libc/src/unistd/linux/access.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/chdir.cpp b/libc/src/unistd/linux/chdir.cpp
index a30d1dc883be8..04ba509b49a56 100644
--- a/libc/src/unistd/linux/chdir.cpp
+++ b/libc/src/unistd/linux/chdir.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/close.cpp b/libc/src/unistd/linux/close.cpp
index 58d42a9673fbe..b5842f2b64d20 100644
--- a/libc/src/unistd/linux/close.cpp
+++ b/libc/src/unistd/linux/close.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/dup.cpp b/libc/src/unistd/linux/dup.cpp
index c1710a37f6119..81d30c6cdbc4c 100644
--- a/libc/src/unistd/linux/dup.cpp
+++ b/libc/src/unistd/linux/dup.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp
index 7ffc151a053c9..0a0e86573b34e 100644
--- a/libc/src/unistd/linux/dup2.cpp
+++ b/libc/src/unistd/linux/dup2.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/dup3.cpp b/libc/src/unistd/linux/dup3.cpp
index c096ba73c96bd..770fb73515b21 100644
--- a/libc/src/unistd/linux/dup3.cpp
+++ b/libc/src/unistd/linux/dup3.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/execv.cpp b/libc/src/unistd/linux/execv.cpp
index a3f2525ed7ca1..d4f2bd9a51653 100644
--- a/libc/src/unistd/linux/execv.cpp
+++ b/libc/src/unistd/linux/execv.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/execve.cpp b/libc/src/unistd/linux/execve.cpp
index 37162c4121782..2214b6df493bd 100644
--- a/libc/src/unistd/linux/execve.cpp
+++ b/libc/src/unistd/linux/execve.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/fchdir.cpp b/libc/src/unistd/linux/fchdir.cpp
index 8196dc63ab1e1..f7a7422363e6e 100644
--- a/libc/src/unistd/linux/fchdir.cpp
+++ b/libc/src/unistd/linux/fchdir.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/fork.cpp b/libc/src/unistd/linux/fork.cpp
index 8aa0477a15d58..75a76fdea50b2 100644
--- a/libc/src/unistd/linux/fork.cpp
+++ b/libc/src/unistd/linux/fork.cpp
@@ -15,7 +15,7 @@
 #include "src/__support/threads/identifier.h"
 #include "src/__support/threads/thread.h" // For thread self object
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <signal.h>      // For SIGCHLD
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/fsync.cpp b/libc/src/unistd/linux/fsync.cpp
index ae3895bab15f3..fe08aed61e250 100644
--- a/libc/src/unistd/linux/fsync.cpp
+++ b/libc/src/unistd/linux/fsync.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/ftruncate.cpp b/libc/src/unistd/linux/ftruncate.cpp
index ccbb0634664aa..f6aa6f8b48cc9 100644
--- a/libc/src/unistd/linux/ftruncate.cpp
+++ b/libc/src/unistd/linux/ftruncate.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/unistd_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stdint.h>      // For uint64_t.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/getcwd.cpp b/libc/src/unistd/linux/getcwd.cpp
index 1bb11a7c8e7ba..c0e475dd3e8ff 100644
--- a/libc/src/unistd/linux/getcwd.cpp
+++ b/libc/src/unistd/linux/getcwd.cpp
@@ -13,7 +13,7 @@
 #include "src/__support/macros/config.h"
 #include "src/string/allocating_string_utils.h" // For strdup.
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <linux/limits.h> // This is safe to include without any name pollution.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/getentropy.cpp b/libc/src/unistd/linux/getentropy.cpp
index 168a1197734ed..65bcbf27601da 100644
--- a/libc/src/unistd/linux/getentropy.cpp
+++ b/libc/src/unistd/linux/getentropy.cpp
@@ -10,7 +10,7 @@
 #include "hdr/errno_macros.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/getsid.cpp b/libc/src/unistd/linux/getsid.cpp
index 5977c5bf10e94..025b8d1691ac3 100644
--- a/libc/src/unistd/linux/getsid.cpp
+++ b/libc/src/unistd/linux/getsid.cpp
@@ -11,8 +11,8 @@
 #include "hdr/types/pid_t.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/isatty.cpp b/libc/src/unistd/linux/isatty.cpp
index e6ea22a714c78..a4d17912b57b0 100644
--- a/libc/src/unistd/linux/isatty.cpp
+++ b/libc/src/unistd/linux/isatty.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/ioctl.h>   // For ioctl numbers.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/link.cpp b/libc/src/unistd/linux/link.cpp
index 477806a70df74..205cf8a84a5cb 100644
--- a/libc/src/unistd/linux/link.cpp
+++ b/libc/src/unistd/linux/link.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/unistd/linux/linkat.cpp b/libc/src/unistd/linux/linkat.cpp
index 40f68cc90c480..ea5bc48cbedc5 100644
--- a/libc/src/unistd/linux/linkat.cpp
+++ b/libc/src/unistd/linux/linkat.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/lseek.cpp b/libc/src/unistd/linux/lseek.cpp
index 0e957498da746..26a08269fd8de 100644
--- a/libc/src/unistd/linux/lseek.cpp
+++ b/libc/src/unistd/linux/lseek.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/unistd/lseek.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "src/__support/File/linux/lseekImpl.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
diff --git a/libc/src/unistd/linux/pathconf.cpp b/libc/src/unistd/linux/pathconf.cpp
index ca1c10bb9f7f6..7dde857c1cfd8 100644
--- a/libc/src/unistd/linux/pathconf.cpp
+++ b/libc/src/unistd/linux/pathconf.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/unistd/pathconf.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/statvfs/linux/statfs_utils.h"
 #include "src/unistd/linux/pathconf_utils.h"
 
diff --git a/libc/src/unistd/linux/pathconf_utils.cpp b/libc/src/unistd/linux/pathconf_utils.cpp
index 035e628dff253..9a62e31fd1880 100644
--- a/libc/src/unistd/linux/pathconf_utils.cpp
+++ b/libc/src/unistd/linux/pathconf_utils.cpp
@@ -14,8 +14,8 @@
 #include "hdr/unistd_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/statvfs/linux/statfs_utils.h"
 
 // other linux specific includes
diff --git a/libc/src/unistd/linux/pipe.cpp b/libc/src/unistd/linux/pipe.cpp
index dfcd5bfdaf537..b9943c8338056 100644
--- a/libc/src/unistd/linux/pipe.cpp
+++ b/libc/src/unistd/linux/pipe.cpp
@@ -10,10 +10,10 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON
-#include "src/errno/libc_errno.h"
-#include <sys/syscall.h> // For syscall numbers.
+#include <sys/syscall.h>                    // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/unistd/linux/pipe2.cpp b/libc/src/unistd/linux/pipe2.cpp
index ebe7e0114ae99..d30f3b37a1adc 100644
--- a/libc/src/unistd/linux/pipe2.cpp
+++ b/libc/src/unistd/linux/pipe2.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/pread.cpp b/libc/src/unistd/linux/pread.cpp
index 3e27857f9a2b4..2f86e397feeff 100644
--- a/libc/src/unistd/linux/pread.cpp
+++ b/libc/src/unistd/linux/pread.cpp
@@ -10,11 +10,11 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON
-#include "src/errno/libc_errno.h"
-#include <stdint.h>      // For uint64_t.
-#include <sys/syscall.h> // For syscall numbers.
+#include <stdint.h>                         // For uint64_t.
+#include <sys/syscall.h>                    // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/unistd/linux/pwrite.cpp b/libc/src/unistd/linux/pwrite.cpp
index 1b81b2a059494..f4cf8e16d766f 100644
--- a/libc/src/unistd/linux/pwrite.cpp
+++ b/libc/src/unistd/linux/pwrite.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stdint.h>      // For uint64_t.
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/linux/read.cpp b/libc/src/unistd/linux/read.cpp
index 4419900f2330e..55676f3f7010a 100644
--- a/libc/src/unistd/linux/read.cpp
+++ b/libc/src/unistd/linux/read.cpp
@@ -10,10 +10,10 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON
-#include "src/errno/libc_errno.h"
-#include <sys/syscall.h> // For syscall numbers.
+#include <sys/syscall.h>                    // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
diff --git a/libc/src/unistd/linux/readlink.cpp b/libc/src/unistd/linux/readlink.cpp
index 2055e6b3400f2..b297a41ca37bd 100644
--- a/libc/src/unistd/linux/readlink.cpp
+++ b/libc/src/unistd/linux/readlink.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/readlinkat.cpp b/libc/src/unistd/linux/readlinkat.cpp
index e5e4d0d39bc9c..cd0dcb8e0ff02 100644
--- a/libc/src/unistd/linux/readlinkat.cpp
+++ b/libc/src/unistd/linux/readlinkat.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/rmdir.cpp b/libc/src/unistd/linux/rmdir.cpp
index 075af12af64c5..eca6e954ef898 100644
--- a/libc/src/unistd/linux/rmdir.cpp
+++ b/libc/src/unistd/linux/rmdir.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/unistd/linux/symlink.cpp b/libc/src/unistd/linux/symlink.cpp
index 9e1b2886ea0f5..3f43de19d2f46 100644
--- a/libc/src/unistd/linux/symlink.cpp
+++ b/libc/src/unistd/linux/symlink.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/unistd/linux/symlinkat.cpp b/libc/src/unistd/linux/symlinkat.cpp
index bcf2d0f8cc055..8cee172f39dfa 100644
--- a/libc/src/unistd/linux/symlinkat.cpp
+++ b/libc/src/unistd/linux/symlinkat.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/syscall.cpp b/libc/src/unistd/linux/syscall.cpp
index 5394bff46adfa..0f7b3da88d627 100644
--- a/libc/src/unistd/linux/syscall.cpp
+++ b/libc/src/unistd/linux/syscall.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <stdarg.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/sysconf.cpp b/libc/src/unistd/linux/sysconf.cpp
index f785ff321c7d7..03f224b150273 100644
--- a/libc/src/unistd/linux/sysconf.cpp
+++ b/libc/src/unistd/linux/sysconf.cpp
@@ -11,8 +11,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/unistd_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/auxv/getauxval.h"
 #include <sys/auxv.h>
 
diff --git a/libc/src/unistd/linux/truncate.cpp b/libc/src/unistd/linux/truncate.cpp
index 8236edb480d10..6103d4b51350b 100644
--- a/libc/src/unistd/linux/truncate.cpp
+++ b/libc/src/unistd/linux/truncate.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/unistd_macros.h"
 #include <stdint.h>      // For uint64_t.
diff --git a/libc/src/unistd/linux/unlink.cpp b/libc/src/unistd/linux/unlink.cpp
index 72d8e2398e3d7..5fde2600937b2 100644
--- a/libc/src/unistd/linux/unlink.cpp
+++ b/libc/src/unistd/linux/unlink.cpp
@@ -12,8 +12,8 @@
 #include "src/__support/common.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/unistd/linux/unlinkat.cpp b/libc/src/unistd/linux/unlinkat.cpp
index 4ed20f542f170..b2012c52b8854 100644
--- a/libc/src/unistd/linux/unlinkat.cpp
+++ b/libc/src/unistd/linux/unlinkat.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include "hdr/fcntl_macros.h"
 #include <sys/syscall.h> // For syscall numbers.
diff --git a/libc/src/unistd/linux/write.cpp b/libc/src/unistd/linux/write.cpp
index 99d5ab7e480b0..eecb74429182a 100644
--- a/libc/src/unistd/linux/write.cpp
+++ b/libc/src/unistd/linux/write.cpp
@@ -10,8 +10,8 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/src/unistd/windows/getentropy.cpp b/libc/src/unistd/windows/getentropy.cpp
index bfaec723ac63d..e25a7a8fed406 100644
--- a/libc/src/unistd/windows/getentropy.cpp
+++ b/libc/src/unistd/windows/getentropy.cpp
@@ -9,7 +9,7 @@
 #include "src/unistd/getentropy.h"
 #include "hdr/errno_macros.h"
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #define WIN32_LEAN_AND_MEAN
 #include <Windows.h>
diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h
index 5be66d9edff02..24c007d2e12e6 100644
--- a/libc/test/IntegrationTest/test.h
+++ b/libc/test/IntegrationTest/test.h
@@ -68,12 +68,9 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Errno checks.
 
-#define ASSERT_ERRNO_EQ(VAL)                                                   \
-  ASSERT_EQ(VAL, static_cast<int>(LIBC_NAMESPACE::libc_errno))
-#define ASSERT_ERRNO_SUCCESS()                                                 \
-  ASSERT_EQ(0, static_cast<int>(LIBC_NAMESPACE::libc_errno))
-#define ASSERT_ERRNO_FAILURE()                                                 \
-  ASSERT_NE(0, static_cast<int>(LIBC_NAMESPACE::libc_errno))
+#define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast<int>(libc_errno))
+#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast<int>(libc_errno))
+#define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast<int>(libc_errno))
 
 // Integration tests are compiled with -ffreestanding which stops treating
 // the main function as a non-overloadable special function. Hence, we use a
diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h
index 3d3b72f80544f..4b7ff452f409c 100644
--- a/libc/test/UnitTest/ErrnoCheckingTest.h
+++ b/libc/test/UnitTest/ErrnoCheckingTest.h
@@ -9,8 +9,8 @@
 #ifndef LLVM_LIBC_TEST_UNITTEST_ERRNOCHECKINGTEST_H
 #define LLVM_LIBC_TEST_UNITTEST_ERRNOCHECKINGTEST_H
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "test/UnitTest/Test.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -25,7 +25,7 @@ class ErrnoCheckingTest : public Test {
 public:
   void SetUp() override {
     Test::SetUp();
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
   }
 
   void TearDown() override {
diff --git a/libc/test/UnitTest/ErrnoSetterMatcher.h b/libc/test/UnitTest/ErrnoSetterMatcher.h
index c6eadd25858ea..212b7a8f83e74 100644
--- a/libc/test/UnitTest/ErrnoSetterMatcher.h
+++ b/libc/test/UnitTest/ErrnoSetterMatcher.h
@@ -12,9 +12,9 @@
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/fpbits_str.h"
 #include "src/__support/StringUtil/error_to_string.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/properties/architectures.h"
-#include "src/errno/libc_errno.h"
 #include "test/UnitTest/Test.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -114,8 +114,8 @@ template <typename T> class ErrnoSetterMatcher : public Matcher<T> {
 
   bool match(T got) {
     actual_return = got;
-    actual_errno = LIBC_NAMESPACE::libc_errno;
-    LIBC_NAMESPACE::libc_errno = 0;
+    actual_errno = libc_errno;
+    libc_errno = 0;
     if constexpr (ignore_errno())
       return return_cmp.compare(actual_return);
     else
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 21b8a45b0726f..da15cf2907f7c 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -279,8 +279,8 @@ struct ModifyMXCSR {
 #define EXPECT_MATH_ERRNO(expected)                                            \
   do {                                                                         \
     if (math_errhandling & MATH_ERRNO) {                                       \
-      int actual = LIBC_NAMESPACE::libc_errno;                                 \
-      LIBC_NAMESPACE::libc_errno = 0;                                          \
+      int actual = libc_errno;                                                 \
+      libc_errno = 0;                                                          \
       EXPECT_EQ(actual, expected);                                             \
     }                                                                          \
   } while (0)
@@ -288,8 +288,8 @@ struct ModifyMXCSR {
 #define ASSERT_MATH_ERRNO(expected)                                            \
   do {                                                                         \
     if (math_errhandling & MATH_ERRNO) {                                       \
-      int actual = LIBC_NAMESPACE::libc_errno;                                 \
-      LIBC_NAMESPACE::libc_errno = 0;                                          \
+      int actual = libc_errno;                                                 \
+      libc_errno = 0;                                                          \
       ASSERT_EQ(actual, expected);                                             \
     }                                                                          \
   } while (0)
diff --git a/libc/test/UnitTest/Test.h b/libc/test/UnitTest/Test.h
index 95d48f40914ed..a5a2a3c7cf58e 100644
--- a/libc/test/UnitTest/Test.h
+++ b/libc/test/UnitTest/Test.h
@@ -42,15 +42,14 @@
 
 #define ASSERT_ERRNO_EQ(VAL)                                                   \
   do {                                                                         \
-    ASSERT_EQ(VAL, static_cast<int>(LIBC_NAMESPACE::libc_errno));              \
-    LIBC_NAMESPACE::libc_errno = 0;                                            \
+    ASSERT_EQ(VAL, static_cast<int>(libc_errno));                              \
+    libc_errno = 0;                                                            \
   } while (0)
-#define ASSERT_ERRNO_SUCCESS()                                                 \
-  ASSERT_EQ(0, static_cast<int>(LIBC_NAMESPACE::libc_errno))
+#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast<int>(libc_errno))
 #define ASSERT_ERRNO_FAILURE()                                                 \
   do {                                                                         \
-    ASSERT_NE(0, static_cast<int>(LIBC_NAMESPACE::libc_errno));                \
-    LIBC_NAMESPACE::libc_errno = 0;                                            \
+    ASSERT_NE(0, static_cast<int>(libc_errno));                                \
+    libc_errno = 0;                                                            \
   } while (0)
 
 #endif // LLVM_LIBC_TEST_UNITTEST_TEST_H
diff --git a/libc/test/integration/src/pthread/pthread_create_test.cpp b/libc/test/integration/src/pthread/pthread_create_test.cpp
index 29da4d5c3c8d7..aecbad6514aaa 100644
--- a/libc/test/integration/src/pthread/pthread_create_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_create_test.cpp
@@ -29,7 +29,7 @@
 #include "src/__support/CPP/new.h"
 #include "src/__support/threads/thread.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #include "test/IntegrationTest/test.h"
 
@@ -332,7 +332,7 @@ static void run_failure_tests() {
 }
 
 TEST_MAIN() {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   run_success_tests();
   run_failure_tests();
   return 0;
diff --git a/libc/test/integration/src/pthread/pthread_join_test.cpp b/libc/test/integration/src/pthread/pthread_join_test.cpp
index 994fa57a6b337..5d0bcd8e23658 100644
--- a/libc/test/integration/src/pthread/pthread_join_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_join_test.cpp
@@ -9,7 +9,7 @@
 #include "src/pthread/pthread_create.h"
 #include "src/pthread/pthread_join.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 #include "test/IntegrationTest/test.h"
 #include <pthread.h>
@@ -25,7 +25,7 @@ static void nullJoinTest() {
 }
 
 TEST_MAIN() {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   nullJoinTest();
   return 0;
 }
diff --git a/libc/test/integration/src/pthread/pthread_name_test.cpp b/libc/test/integration/src/pthread/pthread_name_test.cpp
index 37ceceee880de..35dd3b165e0ee 100644
--- a/libc/test/integration/src/pthread/pthread_name_test.cpp
+++ b/libc/test/integration/src/pthread/pthread_name_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/CPP/string_view.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/pthread/pthread_create.h"
 #include "src/pthread/pthread_getname_np.h"
 #include "src/pthread/pthread_join.h"
diff --git a/libc/test/integration/src/unistd/getcwd_test.cpp b/libc/test/integration/src/unistd/getcwd_test.cpp
index 551768187bf01..1b321b01e9315 100644
--- a/libc/test/integration/src/unistd/getcwd_test.cpp
+++ b/libc/test/integration/src/unistd/getcwd_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/CPP/string_view.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/stdlib/getenv.h"
 #include "src/unistd/getcwd.h"
 
@@ -31,12 +31,12 @@ TEST_MAIN(int argc, char **argv, char **envp) {
   cwd = LIBC_NAMESPACE::getcwd(buffer, 0);
   ASSERT_TRUE(cwd == nullptr);
   ASSERT_ERRNO_EQ(EINVAL);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   // Insufficient size
   cwd = LIBC_NAMESPACE::getcwd(buffer, 2);
   ASSERT_TRUE(cwd == nullptr);
-  int err = LIBC_NAMESPACE::libc_errno;
+  int err = libc_errno;
   ASSERT_EQ(err, ERANGE);
 
   return 0;
diff --git a/libc/test/integration/startup/linux/tls_test.cpp b/libc/test/integration/startup/linux/tls_test.cpp
index ef9fd9fcb7ff4..de3bd06c39cf6 100644
--- a/libc/test/integration/startup/linux/tls_test.cpp
+++ b/libc/test/integration/startup/linux/tls_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sys/mman/mmap.h"
 #include "test/IntegrationTest/test.h"
 
diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h
index d349192f107c0..9b4844d410db2 100644
--- a/libc/test/src/__support/str_to_fp_test.h
+++ b/libc/test/src/__support/str_to_fp_test.h
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_float.h"
 #include "src/__support/uint128.h"
diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp
index 1ec882b212b8a..40cb76a8bd6a2 100644
--- a/libc/test/src/__support/str_to_integer_test.cpp
+++ b/libc/test/src/__support/str_to_integer_test.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/str_to_integer.h"
 #include <stddef.h>
 
diff --git a/libc/test/src/dirent/dirent_test.cpp b/libc/test/src/dirent/dirent_test.cpp
index 41f522a6a75fb..3f0095ca5ebe8 100644
--- a/libc/test/src/dirent/dirent_test.cpp
+++ b/libc/test/src/dirent/dirent_test.cpp
@@ -7,11 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/CPP/string_view.h"
+#include "src/__support/libc_errno.h"
 #include "src/dirent/closedir.h"
 #include "src/dirent/dirfd.h"
 #include "src/dirent/opendir.h"
 #include "src/dirent/readdir.h"
-#include "src/errno/libc_errno.h"
 
 #include "test/UnitTest/Test.h"
 
@@ -55,17 +55,17 @@ TEST(LlvmLibcDirentTest, SimpleOpenAndRead) {
 }
 
 TEST(LlvmLibcDirentTest, OpenNonExistentDir) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ::DIR *dir = LIBC_NAMESPACE::opendir("___xyz123__.non_existent__");
   ASSERT_TRUE(dir == nullptr);
   ASSERT_ERRNO_EQ(ENOENT);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 }
 
 TEST(LlvmLibcDirentTest, OpenFile) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ::DIR *dir = LIBC_NAMESPACE::opendir("testdata/file1.txt");
   ASSERT_TRUE(dir == nullptr);
   ASSERT_ERRNO_EQ(ENOTDIR);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 }
diff --git a/libc/test/src/errno/errno_test.cpp b/libc/test/src/errno/errno_test.cpp
index b0db22a85f3bc..de82b0077f177 100644
--- a/libc/test/src/errno/errno_test.cpp
+++ b/libc/test/src/errno/errno_test.cpp
@@ -6,11 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcErrnoTest, Basic) {
   int test_val = 123;
-  LIBC_NAMESPACE::libc_errno = test_val;
+  libc_errno = test_val;
   ASSERT_ERRNO_EQ(test_val);
 }
diff --git a/libc/test/src/fcntl/creat_test.cpp b/libc/test/src/fcntl/creat_test.cpp
index 4c9d2cbc33f47..d60c984934703 100644
--- a/libc/test/src/fcntl/creat_test.cpp
+++ b/libc/test/src/fcntl/creat_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/creat.h"
 #include "src/fcntl/open.h"
 #include "src/unistd/close.h"
diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp
index 1a21afe51085b..082c42481777b 100644
--- a/libc/test/src/fcntl/fcntl_test.cpp
+++ b/libc/test/src/fcntl/fcntl_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/fcntl_macros.h"
 #include "hdr/stdio_macros.h"
 #include "hdr/types/struct_flock.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/fcntl.h"
 #include "src/fcntl/open.h"
 #include "src/unistd/close.h"
@@ -166,7 +166,7 @@ TEST(LlvmLibcFcntlTest, UseAfterClose) {
 }
 
 TEST(LlvmLibcFcntlTest, SetGetOwnerTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   pid_t pid = LIBC_NAMESPACE::getpid();
   ASSERT_GT(pid, -1);
diff --git a/libc/test/src/fcntl/openat_test.cpp b/libc/test/src/fcntl/openat_test.cpp
index 213b074799c8d..1997476f16a60 100644
--- a/libc/test/src/fcntl/openat_test.cpp
+++ b/libc/test/src/fcntl/openat_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/fcntl/openat.h"
 #include "src/unistd/close.h"
diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h
index 77b465a3a0e63..6af9cfea0e0a5 100644
--- a/libc/test/src/math/RoundToIntegerTest.h
+++ b/libc/test/src/math/RoundToIntegerTest.h
@@ -55,7 +55,7 @@ class RoundToIntegerTestTemplate
 
   void test_one_input(RoundToIntegerFunc func, FloatType input,
                       IntType expected, bool expectError) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
 
     ASSERT_EQ(func(input), expected);
diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp
index 2e4c8eb2ab961..aa0128fee999b 100644
--- a/libc/test/src/math/acosf_test.cpp
+++ b/libc/test/src/math/acosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAcosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acosf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/acoshf16_test.cpp b/libc/test/src/math/acoshf16_test.cpp
index 7348018396bd7..2eb95215e4e8b 100644
--- a/libc/test/src/math/acoshf16_test.cpp
+++ b/libc/test/src/math/acoshf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acoshf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp
index 18ed5a11d50a7..3d3b827411a4a 100644
--- a/libc/test/src/math/acoshf_test.cpp
+++ b/libc/test/src/math/acoshf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acoshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acoshf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/asin_test.cpp b/libc/test/src/math/asin_test.cpp
index 385e341318aea..03ae963e9f924 100644
--- a/libc/test/src/math/asin_test.cpp
+++ b/libc/test/src/math/asin_test.cpp
@@ -38,7 +38,7 @@ TEST_F(LlvmLibcAsinTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::asin(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp
index 5197810d8bd58..1eaa6b8a51359 100644
--- a/libc/test/src/math/asinf_test.cpp
+++ b/libc/test/src/math/asinf_test.cpp
@@ -9,7 +9,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -22,7 +22,7 @@ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcAsinfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp
index ac125c3520c44..8c78f939cabf7 100644
--- a/libc/test/src/math/asinhf_test.cpp
+++ b/libc/test/src/math/asinhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinhf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/atan2f_test.cpp b/libc/test/src/math/atan2f_test.cpp
index 331f4281af839..50ab38208089a 100644
--- a/libc/test/src/math/atan2f_test.cpp
+++ b/libc/test/src/math/atan2f_test.cpp
@@ -81,7 +81,7 @@ TEST_F(LlvmLibcAtan2fTest, InFloatRange) {
         if (FPBits(w).is_nan() || FPBits(w).is_inf())
           continue;
 
-        LIBC_NAMESPACE::libc_errno = 0;
+        libc_errno = 0;
         float result = LIBC_NAMESPACE::atan2f(x, y);
         ++total_count;
         if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/atan_test.cpp b/libc/test/src/math/atan_test.cpp
index 7f52578b9efed..7fa0dffd607e2 100644
--- a/libc/test/src/math/atan_test.cpp
+++ b/libc/test/src/math/atan_test.cpp
@@ -39,7 +39,7 @@ TEST_F(LlvmLibcAtanTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::atan(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp
index 575ec89bd493c..a4bdf1867c39c 100644
--- a/libc/test/src/math/atanf_test.cpp
+++ b/libc/test/src/math/atanf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -23,7 +23,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 // TODO: This test needs to have its checks for exceptions, errno
 // tightened
 TEST_F(LlvmLibcAtanfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanf(aNaN));
   // TODO: Uncomment these checks later, RoundingMode affects running
diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp
index 8b9db1dfdd976..32272ef482ab2 100644
--- a/libc/test/src/math/atanhf_test.cpp
+++ b/libc/test/src/math/atanhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -25,7 +25,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 // tightened https://github.com/llvm/llvm-project/issues/88819.
 TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) {
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(aNaN));
   // TODO: Uncomment these checks later, RoundingMode affects running
diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp
index 2143c36f3d30b..90dc8ff6a0ea4 100644
--- a/libc/test/src/math/cosf_test.cpp
+++ b/libc/test/src/math/cosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -23,7 +23,7 @@ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcCosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cosf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp
index 0d1c322b8e622..bdaba50f1f148 100644
--- a/libc/test/src/math/coshf_test.cpp
+++ b/libc/test/src/math/coshf_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/coshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -22,7 +22,7 @@ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::coshf(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -41,7 +41,7 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcCoshfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp
index 37ec2516f6a35..cb88bfcade0dc 100644
--- a/libc/test/src/math/cospif_test.cpp
+++ b/libc/test/src/math/cospif_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cospif.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/src/math/sdcomp26094.h"
@@ -19,7 +19,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcCospifTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp
index 6fb1d2d9d925e..6126e5f211fff 100644
--- a/libc/test/src/math/exp10_test.cpp
+++ b/libc/test/src/math/exp10_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -105,7 +105,7 @@ TEST_F(LlvmLibcExp10Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::exp10(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp
index 001b37809d930..89915961c9b90 100644
--- a/libc/test/src/math/exp10f_test.cpp
+++ b/libc/test/src/math/exp10f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp10f(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -55,7 +55,7 @@ TEST_F(LlvmLibcExp10fTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExp10fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       0.0f, LIBC_NAMESPACE::exp10f(FPBits(0xff7fffffU).get_val()),
       FE_UNDERFLOW);
@@ -97,7 +97,7 @@ TEST_F(LlvmLibcExp10fTest, TrickyInputs) {
       0x41200000, // x = 10.0f
   };
   for (int i = 0; i < N; ++i) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float x = FPBits(INPUTS[i]).get_val();
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x,
                                    LIBC_NAMESPACE::exp10f(x), 0.5);
@@ -113,15 +113,14 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::exp10f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x,
                                    LIBC_NAMESPACE::exp10f(x), 0.5);
diff --git a/libc/test/src/math/exp10m1f_test.cpp b/libc/test/src/math/exp10m1f_test.cpp
index aee273384f1a2..01802bd68f7e4 100644
--- a/libc/test/src/math/exp10m1f_test.cpp
+++ b/libc/test/src/math/exp10m1f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10m1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -69,7 +69,7 @@ TEST_F(LlvmLibcExp10m1fTest, TrickyInputs) {
   };
 
   for (float x : INPUTS) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
                                    LIBC_NAMESPACE::exp10m1f(x), 0.5);
   }
@@ -82,14 +82,14 @@ TEST_F(LlvmLibcExp10m1fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_inf_or_nan())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::exp10m1f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_inf_or_nan() || LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_inf_or_nan() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x,
                                    LIBC_NAMESPACE::exp10m1f(x), 0.5);
diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp
index adfceceeef4b7..4cd95dd5486ed 100644
--- a/libc/test/src/math/exp2_test.cpp
+++ b/libc/test/src/math/exp2_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp2Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::exp2(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp
index 0c4c821534392..aeecb3e74b07a 100644
--- a/libc/test/src/math/exp2f_test.cpp
+++ b/libc/test/src/math/exp2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp2f(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -71,7 +71,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) {
       0xc3150000U, /*-0x1.2ap+7f*/
   };
   for (int i = 0; i < N; ++i) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float x = FPBits(INPUTS[i]).get_val();
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
                                    LIBC_NAMESPACE::exp2f(x), 0.5);
@@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) {
 }
 
 TEST_F(LlvmLibcExp2fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       0.0f, LIBC_NAMESPACE::exp2f(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -108,15 +108,14 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::exp2f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x,
                                    LIBC_NAMESPACE::exp2f(x), 0.5);
diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp
index 793cf0cc2cbb4..0c87657abc085 100644
--- a/libc/test/src/math/exp2m1f_test.cpp
+++ b/libc/test/src/math/exp2m1f_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2m1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -38,7 +38,7 @@ TEST_F(LlvmLibcExp2m1fTest, TrickyInputs) {
   };
 
   for (float x : INPUTS) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x,
                                    LIBC_NAMESPACE::exp2m1f(x), 0.5);
   }
@@ -51,15 +51,14 @@ TEST_F(LlvmLibcExp2m1fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::exp2m1f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x,
                                    LIBC_NAMESPACE::exp2m1f(x), 0.5);
diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp
index 0ab3a4e543464..83addaeb943d8 100644
--- a/libc/test/src/math/exp_test.cpp
+++ b/libc/test/src/math/exp_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -78,7 +78,7 @@ TEST_F(LlvmLibcExpTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::exp(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp
index 26a0bca4ce253..3c10812ff5bc2 100644
--- a/libc/test/src/math/expf_test.cpp
+++ b/libc/test/src/math/expf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expf(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -55,7 +55,7 @@ TEST_F(LlvmLibcExpfTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExpfTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       0.0f, LIBC_NAMESPACE::expf(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -76,7 +76,7 @@ TEST_F(LlvmLibcExpfTest, Underflow) {
 TEST_F(LlvmLibcExpfTest, Borderline) {
   float x;
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   x = FPBits(0x42affff8U).get_val();
   ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
                                  LIBC_NAMESPACE::expf(x), 0.5);
@@ -110,15 +110,14 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::expf(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x,
                                    LIBC_NAMESPACE::expf(x), 0.5);
diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp
index 9720773d9f960..0cf07e2e49734 100644
--- a/libc/test/src/math/expm1_test.cpp
+++ b/libc/test/src/math/expm1_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -64,7 +64,7 @@ TEST_F(LlvmLibcExpm1Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::expm1(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp
index 274fe3bb7afb0..cf3fe9c26ae18 100644
--- a/libc/test/src/math/expm1f_test.cpp
+++ b/libc/test/src/math/expm1f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expm1f(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpm1fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
@@ -55,7 +55,7 @@ TEST_F(LlvmLibcExpm1fTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExpm1fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::expm1f(FPBits(0xff7fffffU).get_val()));
 
   float x = FPBits(0xc2cffff8U).get_val();
@@ -70,7 +70,7 @@ TEST_F(LlvmLibcExpm1fTest, Underflow) {
 TEST_F(LlvmLibcExpm1fTest, Borderline) {
   float x;
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   x = FPBits(0x42affff8U).get_val();
   ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
                                  LIBC_NAMESPACE::expm1f(x), 0.5);
@@ -119,15 +119,14 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::expm1f(x);
 
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
                                    LIBC_NAMESPACE::expm1f(x), 0.5);
diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp
index 01aa1f82ae5d8..e9529d87c3885 100644
--- a/libc/test/src/math/log10_test.cpp
+++ b/libc/test/src/math/log10_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -101,7 +101,7 @@ TEST_F(LlvmLibcLog10Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::log10(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp
index 107e965a0d3ae..e5747b7e5ec0b 100644
--- a/libc/test/src/math/log1p_test.cpp
+++ b/libc/test/src/math/log1p_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log1p.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -102,7 +102,7 @@ TEST_F(LlvmLibcLog1pTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::log1p(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp
index bb181dc5e43b0..ffe2dd2c33dd6 100644
--- a/libc/test/src/math/log1pf_test.cpp
+++ b/libc/test/src/math/log1pf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log1pf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -75,7 +75,7 @@ TEST_F(LlvmLibcLog1pfTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x,
                                    LIBC_NAMESPACE::log1pf(x), 0.5);
   }
diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp
index 8a07991a68886..fc440c09b42bd 100644
--- a/libc/test/src/math/log2_test.cpp
+++ b/libc/test/src/math/log2_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -100,7 +100,7 @@ TEST_F(LlvmLibcLog2Test, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::log2(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp
index 83691fb75300e..92226c763f458 100644
--- a/libc/test/src/math/log2f_test.cpp
+++ b/libc/test/src/math/log2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -52,14 +52,13 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) {
     float x = FPBits(v).get_val();
     if (FPBits(v).is_nan() || FPBits(v).is_inf())
       continue;
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     float result = LIBC_NAMESPACE::log2f(x);
     // If the computation resulted in an error or did not produce valid result
     // in the single-precision floating point range, then ignore comparing with
     // MPFR result as MPFR can still produce valid results because of its
     // wider precision.
-    if (FPBits(result).is_nan() || FPBits(result).is_inf() ||
-        LIBC_NAMESPACE::libc_errno != 0)
+    if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0)
       continue;
     ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x,
                                    LIBC_NAMESPACE::log2f(x), 0.5);
diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp
index 969a469b2e1c6..54afaa33d1350 100644
--- a/libc/test/src/math/log_test.cpp
+++ b/libc/test/src/math/log_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -99,7 +99,7 @@ TEST_F(LlvmLibcLogTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0)
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::log(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp
index 448dcc0035e9b..4d189d813e584 100644
--- a/libc/test/src/math/powf_test.cpp
+++ b/libc/test/src/math/powf_test.cpp
@@ -78,7 +78,7 @@ TEST_F(LlvmLibcPowfTest, InFloatRange) {
         if (FPBits(w).is_nan() || FPBits(w).is_inf())
           continue;
 
-        LIBC_NAMESPACE::libc_errno = 0;
+        libc_errno = 0;
         float result = LIBC_NAMESPACE::powf(x, y);
         ++cc;
         if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp
index d4c6bd416a409..4d5d9ddf464b1 100644
--- a/libc/test/src/math/sin_test.cpp
+++ b/libc/test/src/math/sin_test.cpp
@@ -71,7 +71,7 @@ TEST_F(LlvmLibcSinTest, InDoubleRange) {
       double x = FPBits(v).get_val();
       if (FPBits(v).is_nan() || FPBits(v).is_inf())
         continue;
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
       double result = LIBC_NAMESPACE::sin(x);
       ++cc;
       if (FPBits(result).is_nan() || FPBits(result).is_inf())
diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
index 2823110331f30..ad2155f329cd9 100644
--- a/libc/test/src/math/sincosf_test.cpp
+++ b/libc/test/src/math/sincosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sincosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   float sin, cos;
 
   LIBC_NAMESPACE::sincosf(aNaN, &sin, &cos);
diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp
index 8fd3ed1577cee..e0357e6157fdc 100644
--- a/libc/test/src/math/sinf_test.cpp
+++ b/libc/test/src/math/sinf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcSinfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp
index 6867c7aec57df..74f906ebaa983 100644
--- a/libc/test/src/math/sinhf_test.cpp
+++ b/libc/test/src/math/sinhf_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -22,7 +22,7 @@ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcSinhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinhf(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -65,7 +65,7 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) {
 }
 
 TEST_F(LlvmLibcSinhfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp
index d00fd77d288c6..986c676761f0e 100644
--- a/libc/test/src/math/sinpif_test.cpp
+++ b/libc/test/src/math/sinpif_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinpif.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/src/math/sdcomp26094.h"
@@ -21,7 +21,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcSinpifTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h
index 8fbcc2a276542..04cbc659ece5d 100644
--- a/libc/test/src/math/smoke/FModTest.h
+++ b/libc/test/src/math/smoke/FModTest.h
@@ -10,7 +10,7 @@
 #define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
 
 #include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "test/UnitTest/FEnvSafeTest.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h
index 6ae97ce35a0d6..745ccbc748ecd 100644
--- a/libc/test/src/math/smoke/RoundToIntegerTest.h
+++ b/libc/test/src/math/smoke/RoundToIntegerTest.h
@@ -40,7 +40,7 @@ class RoundToIntegerTestTemplate
 
   void test_one_input(RoundToIntegerFunc func, F input, I expected,
                       bool expectError) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
     LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);
 
     ASSERT_EQ(func(input), expected);
diff --git a/libc/test/src/math/smoke/acos_test.cpp b/libc/test/src/math/smoke/acos_test.cpp
index 3a59bce264077..fe2caefb52ab8 100644
--- a/libc/test/src/math/smoke/acos_test.cpp
+++ b/libc/test/src/math/smoke/acos_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/fenv_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acos.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ TEST_F(LlvmLibcAcosTest, SpecialNumbers) {
   EXPECT_FP_EQ(0x1.921fb54442d18p0, LIBC_NAMESPACE::acos(zero));
   EXPECT_FP_EQ(0x1.921fb54442d18p0, LIBC_NAMESPACE::acos(neg_zero));
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acos(inf),
                                            FE_INVALID);
   EXPECT_MATH_ERRNO(EDOM);
diff --git a/libc/test/src/math/smoke/acosf16_test.cpp b/libc/test/src/math/smoke/acosf16_test.cpp
index c4274b8245092..7103dc33fec3a 100644
--- a/libc/test/src/math/smoke/acosf16_test.cpp
+++ b/libc/test/src/math/smoke/acosf16_test.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acosf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcAcosf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAcosf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acosf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp
index 74f68e00011aa..257c6a3d1d22c 100644
--- a/libc/test/src/math/smoke/acosf_test.cpp
+++ b/libc/test/src/math/smoke/acosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAcosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acosf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/acoshf16_test.cpp b/libc/test/src/math/smoke/acoshf16_test.cpp
index 7681c2a4e7fbc..6b9c995cf9921 100644
--- a/libc/test/src/math/smoke/acoshf16_test.cpp
+++ b/libc/test/src/math/smoke/acoshf16_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acoshf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcAcoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAcoshf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acoshf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp
index c5ba88055ac57..b6abfab999293 100644
--- a/libc/test/src/math/smoke/acoshf_test.cpp
+++ b/libc/test/src/math/smoke/acoshf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acoshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acoshf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/acospif16_test.cpp b/libc/test/src/math/smoke/acospif16_test.cpp
index 66b94706eab94..4b2f6de3f7e37 100644
--- a/libc/test/src/math/smoke/acospif16_test.cpp
+++ b/libc/test/src/math/smoke/acospif16_test.cpp
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/acospif16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
 using LlvmLibcAcospif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 TEST_F(LlvmLibcAcospif16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acospif16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/asinf16_test.cpp b/libc/test/src/math/smoke/asinf16_test.cpp
index 9f675b08319c0..b03f0a420a499 100644
--- a/libc/test/src/math/smoke/asinf16_test.cpp
+++ b/libc/test/src/math/smoke/asinf16_test.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcAsinf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAsinf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp
index d817d2b366192..2615a8ddd16bd 100644
--- a/libc/test/src/math/smoke/asinf_test.cpp
+++ b/libc/test/src/math/smoke/asinf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAsinfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/asinhf16_test.cpp b/libc/test/src/math/smoke/asinhf16_test.cpp
index dcaab217331c7..7f612ce3c4674 100644
--- a/libc/test/src/math/smoke/asinhf16_test.cpp
+++ b/libc/test/src/math/smoke/asinhf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinhf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcAsinhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAsinhf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinhf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/asinhf_test.cpp b/libc/test/src/math/smoke/asinhf_test.cpp
index 4a8743c50075f..d812a2dffe8aa 100644
--- a/libc/test/src/math/smoke/asinhf_test.cpp
+++ b/libc/test/src/math/smoke/asinhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/asinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinhf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/atan2f_test.cpp b/libc/test/src/math/smoke/atan2f_test.cpp
index 1fbcfbe96b2d7..7f8cfb9830d2a 100644
--- a/libc/test/src/math/smoke/atan2f_test.cpp
+++ b/libc/test/src/math/smoke/atan2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atan2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcAtan2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAtan2fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2f(sNaN, sNaN),
                               FE_INVALID);
diff --git a/libc/test/src/math/smoke/atanf16_test.cpp b/libc/test/src/math/smoke/atanf16_test.cpp
index af50287d9b22a..ba1e3b2fc8bef 100644
--- a/libc/test/src/math/smoke/atanf16_test.cpp
+++ b/libc/test/src/math/smoke/atanf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcAtanf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAtanf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::atanf16(aNaN));
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp
index 7d09a28beaa38..b56b9d0162b97 100644
--- a/libc/test/src/math/smoke/atanf_test.cpp
+++ b/libc/test/src/math/smoke/atanf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAtanfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
 
diff --git a/libc/test/src/math/smoke/atanhf16_test.cpp b/libc/test/src/math/smoke/atanhf16_test.cpp
index 81df6da8cee26..c2a520f7638fe 100644
--- a/libc/test/src/math/smoke/atanhf16_test.cpp
+++ b/libc/test/src/math/smoke/atanhf16_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanhf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcAtanhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcAtanhf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf16(sNaN),
                                            FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp
index 73a5b81b0240b..038cb30d89a4e 100644
--- a/libc/test/src/math/smoke/atanhf_test.cpp
+++ b/libc/test/src/math/smoke/atanhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/atanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -20,7 +20,7 @@ using LIBC_NAMESPACE::Sign;
 using LlvmLibcAtanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanhf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
   // TODO: Strengthen errno,exception checks and remove these assert macros
diff --git a/libc/test/src/math/smoke/cosf16_test.cpp b/libc/test/src/math/smoke/cosf16_test.cpp
index 2638551fb1d1b..4362a5a3a4bd1 100644
--- a/libc/test/src/math/smoke/cosf16_test.cpp
+++ b/libc/test/src/math/smoke/cosf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cosf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcCosf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcCosf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp
index 99773583dcb10..470a876c63a75 100644
--- a/libc/test/src/math/smoke/cosf_test.cpp
+++ b/libc/test/src/math/smoke/cosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcCosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/coshf16_test.cpp b/libc/test/src/math/smoke/coshf16_test.cpp
index 08d05ecce86ba..7bf62afa24c43 100644
--- a/libc/test/src/math/smoke/coshf16_test.cpp
+++ b/libc/test/src/math/smoke/coshf16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/coshf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::coshf16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcCoshf16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::coshf16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp
index 1611ea1b92926..ee8f0199df3b0 100644
--- a/libc/test/src/math/smoke/coshf_test.cpp
+++ b/libc/test/src/math/smoke/coshf_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/coshf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -19,7 +19,7 @@
 using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::coshf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -41,7 +41,7 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcCoshfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/cospif16_test.cpp b/libc/test/src/math/smoke/cospif16_test.cpp
index edd8ed97b30f6..fcde0cc79e356 100644
--- a/libc/test/src/math/smoke/cospif16_test.cpp
+++ b/libc/test/src/math/smoke/cospif16_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cospif16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcCospif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcCospif16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/cospif_test.cpp b/libc/test/src/math/smoke/cospif_test.cpp
index 20153897dc459..3d48909cca93e 100644
--- a/libc/test/src/math/smoke/cospif_test.cpp
+++ b/libc/test/src/math/smoke/cospif_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/cospif.h"
 #include "test/UnitTest/FPMatcher.h"
 
@@ -15,7 +15,7 @@
 using LlvmLibcCospifTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcCospifTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp
index baf8a76810970..50d3de0c7fe75 100644
--- a/libc/test/src/math/smoke/exp10_test.cpp
+++ b/libc/test/src/math/smoke/exp10_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/exp10f16_test.cpp b/libc/test/src/math/smoke/exp10f16_test.cpp
index 1c4ef2aa08a70..bda40348f8832 100644
--- a/libc/test/src/math/smoke/exp10f16_test.cpp
+++ b/libc/test/src/math/smoke/exp10f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcExp10f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10f16(max_normal),
                               FE_OVERFLOW);
@@ -53,7 +53,7 @@ TEST_F(LlvmLibcExp10f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExp10f16Test, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::exp10f16(neg_max_normal),
                               FE_UNDERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp
index bf39e2cc12d0c..fcd334bb9e364 100644
--- a/libc/test/src/math/smoke/exp10f_test.cpp
+++ b/libc/test/src/math/smoke/exp10f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -44,7 +44,7 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/exp10m1f16_test.cpp b/libc/test/src/math/smoke/exp10m1f16_test.cpp
index dfa7fa477d3d1..ed2d5a48b3165 100644
--- a/libc/test/src/math/smoke/exp10m1f16_test.cpp
+++ b/libc/test/src/math/smoke/exp10m1f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10m1f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10m1f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10m1f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
@@ -67,7 +67,7 @@ TEST_F(LlvmLibcExp10m1f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExp10m1f16Test, ResultNearNegOne) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
                               LIBC_NAMESPACE::exp10m1f16(neg_max_normal),
diff --git a/libc/test/src/math/smoke/exp10m1f_test.cpp b/libc/test/src/math/smoke/exp10m1f_test.cpp
index 2c2cfdbb08a3f..19369a897aaa9 100644
--- a/libc/test/src/math/smoke/exp10m1f_test.cpp
+++ b/libc/test/src/math/smoke/exp10m1f_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp10m1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcExp10m1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -34,7 +34,7 @@ TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp10m1fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f(0x1.fffffep+127f),
                               FE_OVERFLOW);
@@ -50,7 +50,7 @@ TEST_F(LlvmLibcExp10m1fTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExp10m1fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp10m1f(-max_normal),
                               FE_UNDERFLOW);
diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp
index 9ab9129416dad..aebf808350727 100644
--- a/libc/test/src/math/smoke/exp2_test.cpp
+++ b/libc/test/src/math/smoke/exp2_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/exp2f16_test.cpp b/libc/test/src/math/smoke/exp2f16_test.cpp
index f69b33a3cf37f..1eb7343dcd22f 100644
--- a/libc/test/src/math/smoke/exp2f16_test.cpp
+++ b/libc/test/src/math/smoke/exp2f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcExp2f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2f16(max_normal),
                               FE_OVERFLOW);
@@ -53,7 +53,7 @@ TEST_F(LlvmLibcExp2f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExp2f16Test, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::exp2f16(neg_max_normal),
                               FE_UNDERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp
index a928389cc41b4..c5243273d9ed4 100644
--- a/libc/test/src/math/smoke/exp2f_test.cpp
+++ b/libc/test/src/math/smoke/exp2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -45,7 +45,7 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/exp2m1f16_test.cpp b/libc/test/src/math/smoke/exp2m1f16_test.cpp
index f423196a70360..635b7a6e187d7 100644
--- a/libc/test/src/math/smoke/exp2m1f16_test.cpp
+++ b/libc/test/src/math/smoke/exp2m1f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2m1f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcExp2m1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExp2m1f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2m1f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -39,7 +39,7 @@ TEST_F(LlvmLibcExp2m1f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2m1f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
@@ -65,7 +65,7 @@ TEST_F(LlvmLibcExp2m1f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExp2m1f16Test, ResultNearNegOne) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(-1.0, LIBC_NAMESPACE::exp2m1f16(neg_max_normal),
                               FE_INEXACT);
diff --git a/libc/test/src/math/smoke/exp2m1f_test.cpp b/libc/test/src/math/smoke/exp2m1f_test.cpp
index 99bdf0035df0c..63852e11655ad 100644
--- a/libc/test/src/math/smoke/exp2m1f_test.cpp
+++ b/libc/test/src/math/smoke/exp2m1f_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp2m1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@ using LIBC_NAMESPACE::fputil::testing::ForceRoundingMode;
 using LIBC_NAMESPACE::fputil::testing::RoundingMode;
 
 TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2m1f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -34,7 +34,7 @@ TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExp2m1fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(0x1.fffffep+127),
                               FE_OVERFLOW);
@@ -50,7 +50,7 @@ TEST_F(LlvmLibcExp2m1fTest, Overflow) {
 }
 
 TEST_F(LlvmLibcExp2m1fTest, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-0x1.fffffep+127),
                               FE_UNDERFLOW);
diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp
index f86243092f1fb..c3b2ae70e1d99 100644
--- a/libc/test/src/math/smoke/exp_test.cpp
+++ b/libc/test/src/math/smoke/exp_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/exp.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/expf16_test.cpp b/libc/test/src/math/smoke/expf16_test.cpp
index ab745a3cf6f56..863f694ffc41a 100644
--- a/libc/test/src/math/smoke/expf16_test.cpp
+++ b/libc/test/src/math/smoke/expf16_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -17,7 +17,7 @@
 using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExpf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expf16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -41,7 +41,7 @@ TEST_F(LlvmLibcExpf16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpf16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expf16(max_normal),
                               FE_OVERFLOW);
@@ -54,7 +54,7 @@ TEST_F(LlvmLibcExpf16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExpf16Test, Underflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::expf16(neg_max_normal),
                               FE_UNDERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp
index eee8304999275..d34151735afa7 100644
--- a/libc/test/src/math/smoke/expf_test.cpp
+++ b/libc/test/src/math/smoke/expf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/expm1_test.cpp b/libc/test/src/math/smoke/expm1_test.cpp
index bc71c53abc7ac..c842fe3c45fe1 100644
--- a/libc/test/src/math/smoke/expm1_test.cpp
+++ b/libc/test/src/math/smoke/expm1_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/expm1f16_test.cpp b/libc/test/src/math/smoke/expm1f16_test.cpp
index f297c5dfc3c7e..4d19a9bac5eb1 100644
--- a/libc/test/src/math/smoke/expm1f16_test.cpp
+++ b/libc/test/src/math/smoke/expm1f16_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -17,7 +17,7 @@
 using LlvmLibcExpm1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expm1f16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpm1f16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expm1f16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
@@ -67,7 +67,7 @@ TEST_F(LlvmLibcExpm1f16Test, Overflow) {
 }
 
 TEST_F(LlvmLibcExpm1f16Test, ResultNearNegOne) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(-1.0),
                               LIBC_NAMESPACE::expm1f16(neg_max_normal),
diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp
index dfb474d70fb6a..214bfe8abd4d2 100644
--- a/libc/test/src/math/smoke/expm1f_test.cpp
+++ b/libc/test/src/math/smoke/expm1f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/expm1f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expm1f(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcExpm1fTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp
index ff73850c52101..49cfda85111a5 100644
--- a/libc/test/src/math/smoke/log10_test.cpp
+++ b/libc/test/src/math/smoke/log10_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log10.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log10f16_test.cpp b/libc/test/src/math/smoke/log10f16_test.cpp
index 471e198933326..53f5ac46aa60f 100644
--- a/libc/test/src/math/smoke/log10f16_test.cpp
+++ b/libc/test/src/math/smoke/log10f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log10f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcLog10f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcLog10f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log10f16(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp
index 631c24b8abcf9..61c56cd2c6ddd 100644
--- a/libc/test/src/math/smoke/log1p_test.cpp
+++ b/libc/test/src/math/smoke/log1p_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log1p.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log1pf_test.cpp b/libc/test/src/math/smoke/log1pf_test.cpp
index bd828ad58c4c9..dc3489fddf99f 100644
--- a/libc/test/src/math/smoke/log1pf_test.cpp
+++ b/libc/test/src/math/smoke/log1pf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log1pf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp
index 9993d442967cb..0534d00b1f408 100644
--- a/libc/test/src/math/smoke/log2_test.cpp
+++ b/libc/test/src/math/smoke/log2_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log2f16_test.cpp b/libc/test/src/math/smoke/log2f16_test.cpp
index 6d98482aa4499..fd20652d2f008 100644
--- a/libc/test/src/math/smoke/log2f16_test.cpp
+++ b/libc/test/src/math/smoke/log2f16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2f16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcLog2f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcLog2f16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log2f16(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp
index 8648b75b88b83..53d54ac367639 100644
--- a/libc/test/src/math/smoke/log2f_test.cpp
+++ b/libc/test/src/math/smoke/log2f_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log2f.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp
index d31eb0c1db734..09e9ab0a9a4d8 100644
--- a/libc/test/src/math/smoke/log_test.cpp
+++ b/libc/test/src/math/smoke/log_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/log.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/math/smoke/logf16_test.cpp b/libc/test/src/math/smoke/logf16_test.cpp
index c7232aa1c1e32..2784f3d5fa54d 100644
--- a/libc/test/src/math/smoke/logf16_test.cpp
+++ b/libc/test/src/math/smoke/logf16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/logf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcLogf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcLogf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::logf16(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp
index 5f66868f12a1c..8ba0d04347bba 100644
--- a/libc/test/src/math/smoke/sincosf_test.cpp
+++ b/libc/test/src/math/smoke/sincosf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sincosf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcSinCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   float sin, cos;
 
   LIBC_NAMESPACE::sincosf(sNaN, &sin, &cos);
diff --git a/libc/test/src/math/smoke/sinf16_test.cpp b/libc/test/src/math/smoke/sinf16_test.cpp
index a0e7a7ba321fd..6b168ac040db9 100644
--- a/libc/test/src/math/smoke/sinf16_test.cpp
+++ b/libc/test/src/math/smoke/sinf16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcSinf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcSinf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp
index de504b4f5335c..8173969fb2569 100644
--- a/libc/test/src/math/smoke/sinf_test.cpp
+++ b/libc/test/src/math/smoke/sinf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcSinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcSinfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/sinhf16_test.cpp b/libc/test/src/math/smoke/sinhf16_test.cpp
index 4f21d33ba78e0..d52739a9adb35 100644
--- a/libc/test/src/math/smoke/sinhf16_test.cpp
+++ b/libc/test/src/math/smoke/sinhf16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinhf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sinhf16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -38,7 +38,7 @@ TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcSinhf16Test, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::sinhf16(max_normal),
                               FE_OVERFLOW | FE_INEXACT);
diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp
index e22cfc7ea14d8..ea6a4474a7806 100644
--- a/libc/test/src/math/smoke/sinhf_test.cpp
+++ b/libc/test/src/math/smoke/sinhf_test.cpp
@@ -9,7 +9,7 @@
 #include "hdr/math_macros.h"
 #include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -19,7 +19,7 @@
 using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcSinhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinhf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
@@ -52,7 +52,7 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) {
 }
 
 TEST_F(LlvmLibcSinhfTest, Overflow) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   EXPECT_FP_EQ_WITH_EXCEPTION(
       inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW);
   EXPECT_MATH_ERRNO(ERANGE);
diff --git a/libc/test/src/math/smoke/sinpif16_test.cpp b/libc/test/src/math/smoke/sinpif16_test.cpp
index b2db6fb9f8626..9edf2cc663d4b 100644
--- a/libc/test/src/math/smoke/sinpif16_test.cpp
+++ b/libc/test/src/math/smoke/sinpif16_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinpif16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcSinpif16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/sinpif_test.cpp b/libc/test/src/math/smoke/sinpif_test.cpp
index 1ba5c1d2b720a..b840f3980eda2 100644
--- a/libc/test/src/math/smoke/sinpif_test.cpp
+++ b/libc/test/src/math/smoke/sinpif_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/sinpif.h"
 #include "test/UnitTest/FPMatcher.h"
 
@@ -15,7 +15,7 @@
 using LlvmLibcSinpifTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcSinpifTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/tanf16_test.cpp b/libc/test/src/math/smoke/tanf16_test.cpp
index f65b9fced72c4..95d200cf5591d 100644
--- a/libc/test/src/math/smoke/tanf16_test.cpp
+++ b/libc/test/src/math/smoke/tanf16_test.cpp
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -15,7 +15,7 @@
 using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcTanf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp
index 178e9065f430f..12deca5cf9417 100644
--- a/libc/test/src/math/smoke/tanf_test.cpp
+++ b/libc/test/src/math/smoke/tanf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcTanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcTanfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/tanhf16_test.cpp b/libc/test/src/math/smoke/tanhf16_test.cpp
index fa6328e9ef0a6..eb90f02a8d7c3 100644
--- a/libc/test/src/math/smoke/tanhf16_test.cpp
+++ b/libc/test/src/math/smoke/tanhf16_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/fenv_macros.h"
 #include "src/__support/FPUtil/cast.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanhf16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -16,7 +16,7 @@
 using LlvmLibcTanhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::tanhf16(aNaN));
   EXPECT_MATH_ERRNO(0);
@@ -40,7 +40,7 @@ TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) {
 }
 
 TEST_F(LlvmLibcTanhf16Test, ResultNearBounds) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast<float16>(1.0),
                               LIBC_NAMESPACE::tanhf16(max_normal), FE_INEXACT);
diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp
index c09761ef531f2..b12a331b31906 100644
--- a/libc/test/src/math/smoke/tanhf_test.cpp
+++ b/libc/test/src/math/smoke/tanhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -18,7 +18,7 @@
 using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 
 TEST_F(LlvmLibcTanhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanhf(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/smoke/tanpif16_test.cpp b/libc/test/src/math/smoke/tanpif16_test.cpp
index 74797d1649b1a..ea896d7bb3e57 100644
--- a/libc/test/src/math/smoke/tanpif16_test.cpp
+++ b/libc/test/src/math/smoke/tanpif16_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanpif16.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -14,7 +14,7 @@
 using LlvmLibcTanpif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
 
 TEST_F(LlvmLibcTanpif16Test, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanpif16(sNaN), FE_INVALID);
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp
index 9061cf6fb30b8..ecc70194b6491 100644
--- a/libc/test/src/math/tanf_test.cpp
+++ b/libc/test/src/math/tanf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcTanfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
index 389abe4d85897..966ce649e2b38 100644
--- a/libc/test/src/math/tanhf_test.cpp
+++ b/libc/test/src/math/tanhf_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/math_macros.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/math/tanhf.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
 TEST_F(LlvmLibcTanhfTest, SpecialNumbers) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanhf(aNaN));
   EXPECT_MATH_ERRNO(0);
diff --git a/libc/test/src/poll/poll_test.cpp b/libc/test/src/poll/poll_test.cpp
index 30f5e41c61ecf..97b7b02718172 100644
--- a/libc/test/src/poll/poll_test.cpp
+++ b/libc/test/src/poll/poll_test.cpp
@@ -7,18 +7,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/limits_macros.h" // UINT_MAX
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/poll/poll.h"
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcPollTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   int ret = LIBC_NAMESPACE::poll(nullptr, 0, 0);
   ASSERT_ERRNO_SUCCESS();
   ASSERT_EQ(0, ret);
 }
 TEST(LlvmLibcPollTest, SmokeFailureTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   int ret = LIBC_NAMESPACE::poll(nullptr, UINT_MAX, 0);
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_EQ(-1, ret);
diff --git a/libc/test/src/sched/affinity_test.cpp b/libc/test/src/sched/affinity_test.cpp
index b5085203e5ce0..b77f22f8e60d2 100644
--- a/libc/test/src/sched/affinity_test.cpp
+++ b/libc/test/src/sched/affinity_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/OSUtil/syscall.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_getaffinity.h"
 #include "src/sched/sched_setaffinity.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
@@ -17,7 +17,7 @@
 
 TEST(LlvmLibcSchedAffinityTest, SmokeTest) {
   cpu_set_t mask;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   pid_t tid = LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_gettid);
   ASSERT_GT(tid, pid_t(0));
@@ -32,15 +32,15 @@ TEST(LlvmLibcSchedAffinityTest, BadMask) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   pid_t tid = LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_gettid);
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(
       LIBC_NAMESPACE::sched_getaffinity(tid, sizeof(cpu_set_t), nullptr),
       Fails(EFAULT));
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(
       LIBC_NAMESPACE::sched_setaffinity(tid, sizeof(cpu_set_t), nullptr),
       Fails(EFAULT));
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 }
diff --git a/libc/test/src/sched/cpu_count_test.cpp b/libc/test/src/sched/cpu_count_test.cpp
index 5250368a26162..919f1475e1d4d 100644
--- a/libc/test/src/sched/cpu_count_test.cpp
+++ b/libc/test/src/sched/cpu_count_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/OSUtil/syscall.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_getaffinity.h"
 #include "src/sched/sched_getcpucount.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
@@ -17,7 +17,7 @@
 
 TEST(LlvmLibcSchedCpuCountTest, SmokeTest) {
   cpu_set_t mask;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   pid_t tid = LIBC_NAMESPACE::syscall_impl<pid_t>(SYS_gettid);
   ASSERT_GT(tid, pid_t(0));
diff --git a/libc/test/src/sched/get_priority_test.cpp b/libc/test/src/sched/get_priority_test.cpp
index 59205c51e4a16..bb41dc0be2019 100644
--- a/libc/test/src/sched/get_priority_test.cpp
+++ b/libc/test/src/sched/get_priority_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_get_priority_max.h"
 #include "src/sched/sched_get_priority_min.h"
 #include "test/UnitTest/Test.h"
@@ -58,7 +58,7 @@ TEST(LlvmLibcSchedGetPriorityTest, HandleBadPolicyTest) {
 }
 
 TEST(LlvmLibcSchedGetPriorityTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   // We Test:
   // SCHED_OTHER, SCHED_FIFO, SCHED_RR
diff --git a/libc/test/src/sched/param_and_scheduler_test.cpp b/libc/test/src/sched/param_and_scheduler_test.cpp
index 747c7e3409e41..4f2b6e412a4b7 100644
--- a/libc/test/src/sched/param_and_scheduler_test.cpp
+++ b/libc/test/src/sched/param_and_scheduler_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_get_priority_max.h"
 #include "src/sched/sched_get_priority_min.h"
 #include "src/sched/sched_getparam.h"
@@ -37,7 +37,7 @@
 class SchedTest : public LIBC_NAMESPACE::testing::Test {
 public:
   void testSched(int policy, bool is_mandatory) {
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     int init_policy = LIBC_NAMESPACE::sched_getscheduler(0);
     ASSERT_GE(init_policy, 0);
@@ -55,30 +55,29 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test {
     // Negative pid
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(-1, policy, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(-1), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     // Invalid Policy
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy | 128, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     // Out of bounds priority
     param.sched_priority = min_priority - 1;
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     param.sched_priority = max_priority + 1;
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, &param), -1);
     // A bit hard to test as depending on user privileges we can run into
     // different issues.
-    ASSERT_TRUE(LIBC_NAMESPACE::libc_errno == EINVAL ||
-                LIBC_NAMESPACE::libc_errno == EPERM);
-    LIBC_NAMESPACE::libc_errno = 0;
+    ASSERT_TRUE(libc_errno == EINVAL || libc_errno == EPERM);
+    libc_errno = 0;
 
     param.sched_priority = min_priority;
     // Success/unsupported policy/missing permissions.
@@ -87,10 +86,9 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test {
     ASSERT_TRUE(setscheduler_result == 0 || setscheduler_result == -1);
     ASSERT_TRUE(
         setscheduler_result != -1
-            ? (LIBC_NAMESPACE::libc_errno == 0)
-            : ((!is_mandatory && LIBC_NAMESPACE::libc_errno == EINVAL) ||
-               LIBC_NAMESPACE::libc_errno == EPERM));
-    LIBC_NAMESPACE::libc_errno = 0;
+            ? (libc_errno == 0)
+            : ((!is_mandatory && libc_errno == EINVAL) || libc_errno == EPERM));
+    libc_errno = 0;
 
     ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(0),
               setscheduler_result != -1 ? policy : init_policy);
@@ -100,12 +98,12 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test {
     param.sched_priority = -1;
     ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     param.sched_priority = max_priority + 1;
     ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, &param), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     for (int priority = min_priority; priority <= max_priority; ++priority) {
       ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, &param), 0);
@@ -117,21 +115,20 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test {
       // Negative pid
       ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(-1, &param), -1);
       ASSERT_ERRNO_EQ(EINVAL);
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
 
       ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(-1, &param), -1);
       ASSERT_ERRNO_EQ(EINVAL);
-      LIBC_NAMESPACE::libc_errno = 0;
+      libc_errno = 0;
 
       // Success/unsupported policy/missing permissions
       int setparam_result = LIBC_NAMESPACE::sched_setparam(0, &param);
       ASSERT_TRUE(setparam_result == 0 || setparam_result == -1);
       ASSERT_TRUE(setparam_result != -1
-                      ? (LIBC_NAMESPACE::libc_errno == 0)
-                      : ((setscheduler_result == -1 &&
-                          LIBC_NAMESPACE::libc_errno == EINVAL) ||
-                         LIBC_NAMESPACE::libc_errno == EPERM));
-      LIBC_NAMESPACE::libc_errno = 0;
+                      ? (libc_errno == 0)
+                      : ((setscheduler_result == -1 && libc_errno == EINVAL) ||
+                         libc_errno == EPERM));
+      libc_errno = 0;
 
       ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, &param), 0);
       ASSERT_ERRNO_SUCCESS();
@@ -143,7 +140,7 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test {
     // Null test
     ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, nullptr), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
   }
 };
 
@@ -161,13 +158,13 @@ LIST_SCHED_TESTS(SCHED_BATCH, true)
 LIST_SCHED_TESTS(SCHED_IDLE, true)
 
 TEST(LlvmLibcSchedParamAndSchedulerTest, NullParamTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, nullptr), -1);
   ASSERT_ERRNO_EQ(EINVAL);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, nullptr), -1);
   ASSERT_ERRNO_EQ(EINVAL);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 }
diff --git a/libc/test/src/sched/sched_rr_get_interval_test.cpp b/libc/test/src/sched/sched_rr_get_interval_test.cpp
index c22a2c76d743c..a0fe5edbe014e 100644
--- a/libc/test/src/sched/sched_rr_get_interval_test.cpp
+++ b/libc/test/src/sched/sched_rr_get_interval_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_get_priority_min.h"
 #include "src/sched/sched_getscheduler.h"
 #include "src/sched/sched_rr_get_interval.h"
@@ -17,7 +17,7 @@
 #include <sched.h>
 
 TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   auto SetSched = [&](int policy) {
     int min_priority = LIBC_NAMESPACE::sched_get_priority_min(policy);
     ASSERT_GE(min_priority, 0);
@@ -58,19 +58,19 @@ TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) {
     // Null timespec
     ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, nullptr), -1);
     ASSERT_ERRNO_EQ(EFAULT);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     // Negative pid
     ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(-1, &ts), -1);
     ASSERT_ERRNO_EQ(EINVAL);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
   }
 
   // Negative tests don't have SCHED_RR set
   SetSched(SCHED_OTHER);
   ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, &ts), 0);
   ASSERT_ERRNO_SUCCESS();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   // TODO: Missing unkown pid -> ESRCH. This is read only so safe to try a few
   //       unlikely values.
diff --git a/libc/test/src/sched/yield_test.cpp b/libc/test/src/sched/yield_test.cpp
index f1627a71fa9ad..4d13d50e25eb2 100644
--- a/libc/test/src/sched/yield_test.cpp
+++ b/libc/test/src/sched/yield_test.cpp
@@ -6,12 +6,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sched/sched_yield.h"
 #include "test/UnitTest/Test.h"
 
 TEST(LlvmLibcSchedYieldTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   // sched_yield() always succeeds, just do a basic test that errno/ret are
   // properly 0.
   ASSERT_EQ(LIBC_NAMESPACE::sched_yield(), 0);
diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp
index cc392da8f4731..ce4dfddae2481 100644
--- a/libc/test/src/signal/sigaltstack_test.cpp
+++ b/libc/test/src/signal/sigaltstack_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/signal_macros.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/signal/linux/signal_utils.h"
 #include "src/signal/raise.h"
 #include "src/signal/sigaction.h"
@@ -46,7 +46,7 @@ static void handler(int) {
 
 TEST(LlvmLibcSignalTest, SigaltstackRunOnAltStack) {
   struct sigaction action;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGUSR1, nullptr, &action),
               Succeeds(0));
   action.sa_handler = handler;
diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp
index bac9c3b8b68bb..62b86bf440291 100644
--- a/libc/test/src/signal/signal_test.cpp
+++ b/libc/test/src/signal/signal_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/signal/raise.h"
 #include "src/signal/signal.h"
 
@@ -17,7 +17,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
 TEST(LlvmLibcSignal, Invalid) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   auto *valid = +[](int) {};
   EXPECT_THAT((void *)LIBC_NAMESPACE::signal(0, valid),
               Fails(EINVAL, (void *)SIG_ERR));
diff --git a/libc/test/src/signal/sigprocmask_test.cpp b/libc/test/src/signal/sigprocmask_test.cpp
index 12403f68b5930..891eac0f5bf75 100644
--- a/libc/test/src/signal/sigprocmask_test.cpp
+++ b/libc/test/src/signal/sigprocmask_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/signal/raise.h"
 #include "src/signal/sigaddset.h"
 #include "src/signal/sigemptyset.h"
@@ -33,7 +33,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
 // This tests for invalid input.
 TEST_F(LlvmLibcSignalTest, SigprocmaskInvalid) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   sigset_t valid;
   // 17 and -4 are out of the range for sigprocmask's how paramater.
diff --git a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
index c1edf56bdbd87..01ccb8218ee20 100644
--- a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
+++ b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/spawn/file_actions.h"
 #include "src/spawn/posix_spawn_file_actions_addclose.h"
 #include "src/spawn/posix_spawn_file_actions_adddup2.h"
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index ef36cff2ffbd5..104fc478b100e 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,7 +9,7 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
@@ -22,7 +22,7 @@
 
 TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -53,7 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
 }
 
 TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -65,7 +65,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
 }
 
 TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -83,7 +83,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 2cc8436bd66f2..56bde5f0099a8 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -17,7 +17,7 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
@@ -33,7 +33,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 46cf12c2c253b..90429ecf4e82b 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -20,7 +20,7 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
@@ -36,7 +36,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    LIBC_NAMESPACE::libc_errno = 0;
+    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index a8a2c62f07b5e..abed3d4052939 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -14,7 +14,7 @@
 #include "src/stdio/fwrite.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
@@ -35,7 +35,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index a0368d701a676..e624181c795b8 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -21,7 +21,7 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
@@ -41,7 +41,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,7 +72,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -80,15 +80,15 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -103,10 +103,10 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -121,15 +121,15 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
-  // LIBC_NAMESPACE::libc_errno = 0;
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // LIBC_NAMESPACE::libc_errno = 0;
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // LIBC_NAMESPACE::libc_errno = 0;
+  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
@@ -165,7 +165,7 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index 61ce2a207fa19..03e1ac286b646 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -20,7 +20,7 @@
 
 #include "hdr/stdio_macros.h"
 #include "hdr/types/size_t.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
@@ -67,7 +67,7 @@ int seek_ss(void *cookie, off64_t *offset, int whence) {
   } else if (whence == SEEK_END) {
     new_offset = *offset + ss->endpos;
   } else {
-    LIBC_NAMESPACE::libc_errno = EINVAL;
+    libc_errno = EINVAL;
     return -1;
   }
   if (new_offset < 0 || size_t(new_offset) > ss->bufsize)
@@ -115,7 +115,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -149,7 +149,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -178,7 +178,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 72875600903a6..84984e26398c0 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -14,13 +14,13 @@
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
 TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -39,7 +39,7 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
 TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index a5dd734c63616..ac494a4ecaf8e 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,7 +8,7 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
@@ -19,7 +19,7 @@
 TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index a1e1fee25db31..5872943c1bb41 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -14,7 +14,7 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
@@ -102,6 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index f6af6ad3e364b..f1b545ba546f9 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -10,7 +10,7 @@
 #include "src/stdio/sprintf.h"
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "test/UnitTest/RoundingModeUtils.h"
 #include "test/UnitTest/Test.h"
 #include <inttypes.h>
@@ -3228,46 +3228,46 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) {
   char buff[1000];
   int written;
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%m");
   ASSERT_STREQ_LEN(written, buff, "Success");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%m");
   ASSERT_STREQ_LEN(written, buff, "Numerical result out of range");
 
   // Check that it correctly consumes no arguments.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%m %d", 1);
   ASSERT_STREQ_LEN(written, buff, "Success 1");
 
   // Width Tests
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%10m");
   ASSERT_STREQ_LEN(written, buff, "   Success");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%10m");
   ASSERT_STREQ_LEN(written, buff, "Numerical result out of range");
 
   // Precision Tests
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%.10m");
   ASSERT_STREQ_LEN(written, buff, "Success");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%.10m");
   ASSERT_STREQ_LEN(written, buff, "Numerical ");
 
   // Flag Tests (Only '-' since the others only affect ints)
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%-10m");
   ASSERT_STREQ_LEN(written, buff, "Success   ");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%-10m");
   ASSERT_STREQ_LEN(written, buff, "Numerical result out of range");
 
@@ -3275,93 +3275,93 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) {
   // Since alt mode here is effectively a completely separate conversion, it
   // gets separate tests.
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%#m");
   ASSERT_STREQ_LEN(written, buff, "0");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
   // Alt Mode Width
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%#10m");
   ASSERT_STREQ_LEN(written, buff, "         0");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#10m");
   ASSERT_STREQ_LEN(written, buff, "    ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#10m");
   ASSERT_STREQ_LEN(written, buff, "     -9999");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#3m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#3m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
   // Alt Mode Precision
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.10m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.10m");
   ASSERT_STREQ_LEN(written, buff, "-0000009999");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.3m");
   ASSERT_STREQ_LEN(written, buff, "ERA");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.3m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
   // We don't test precision (or int flags) on errno = 0 because it behaves
   // weirdly, see the docs for more information.
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%#.1m");
   ASSERT_STREQ_LEN(written, buff, "0");
 
   // Alt Mode Flags
 
   // '-' flag
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-10m");
   ASSERT_STREQ_LEN(written, buff, "0         ");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-10m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE    ");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-10m");
   ASSERT_STREQ_LEN(written, buff, "-9999     ");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-3m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#-3m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
   // '+' flag
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#+m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#+m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
@@ -3370,38 +3370,38 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) {
   // come up, but I've avoided it for the other %m tests for ease of
   // refactoring if necessary. Here it needs to be positive to test that the
   // flags that only affect positive signed integers are properly passed along.
-  LIBC_NAMESPACE::libc_errno = 9999;
+  libc_errno = 9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#+m");
   ASSERT_STREQ_LEN(written, buff, "+9999");
 
   // ' ' flag
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%# m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%# m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 
-  LIBC_NAMESPACE::libc_errno = 9999;
+  libc_errno = 9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%# m");
   ASSERT_STREQ_LEN(written, buff, " 9999");
 
   // '0' flag
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#010m");
   ASSERT_STREQ_LEN(written, buff, "    ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#010m");
   ASSERT_STREQ_LEN(written, buff, "-000009999");
 
-  LIBC_NAMESPACE::libc_errno = ERANGE;
+  libc_errno = ERANGE;
   written = LIBC_NAMESPACE::sprintf(buff, "%#03m");
   ASSERT_STREQ_LEN(written, buff, "ERANGE");
 
-  LIBC_NAMESPACE::libc_errno = -9999;
+  libc_errno = -9999;
   written = LIBC_NAMESPACE::sprintf(buff, "%#03m");
   ASSERT_STREQ_LEN(written, buff, "-9999");
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index 67f1b0ff513bc..5d482b70064bd 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -17,7 +17,7 @@
 #include "src/stdio/fwrite_unlocked.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 
 TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
@@ -36,7 +36,7 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,7 +57,7 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 03f0a6539c785..3eeccc5727e77 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,6 +9,7 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtoint32_test.cpp b/libc/test/src/stdlib/strtoint32_test.cpp
index 17df432fc8e68..e6da692714d28 100644
--- a/libc/test/src/stdlib/strtoint32_test.cpp
+++ b/libc/test/src/stdlib/strtoint32_test.cpp
@@ -8,9 +8,9 @@
 
 #include <stdint.h>
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 #include "StrtolTest.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ int32_t strtoint32(const char *__restrict str, char **__restrict str_end,
                    int base) {
   auto result = internal::strtointeger<int32_t>(str, base);
   if (result.has_error())
-    LIBC_NAMESPACE::libc_errno = result.error;
+    libc_errno = result.error;
 
   if (str_end != nullptr)
     *str_end = const_cast<char *>(str + result.parsed_len);
@@ -33,7 +33,7 @@ uint32_t strtouint32(const char *__restrict str, char **__restrict str_end,
                      int base) {
   auto result = internal::strtointeger<uint32_t>(str, base);
   if (result.has_error())
-    LIBC_NAMESPACE::libc_errno = result.error;
+    libc_errno = result.error;
 
   if (str_end != nullptr)
     *str_end = const_cast<char *>(str + result.parsed_len);
diff --git a/libc/test/src/stdlib/strtoint64_test.cpp b/libc/test/src/stdlib/strtoint64_test.cpp
index b5fe69dfaa701..2c5d948f5fae2 100644
--- a/libc/test/src/stdlib/strtoint64_test.cpp
+++ b/libc/test/src/stdlib/strtoint64_test.cpp
@@ -8,9 +8,9 @@
 
 #include <stdint.h>
 
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/str_to_integer.h"
-#include "src/errno/libc_errno.h"
 
 #include "StrtolTest.h"
 #include "test/UnitTest/Test.h"
@@ -21,7 +21,7 @@ int64_t strtoint64(const char *__restrict str, char **__restrict str_end,
                    int base) {
   auto result = internal::strtointeger<int64_t>(str, base);
   if (result.has_error())
-    LIBC_NAMESPACE::libc_errno = result.error;
+    libc_errno = result.error;
 
   if (str_end != nullptr)
     *str_end = const_cast<char *>(str + result.parsed_len);
@@ -33,7 +33,7 @@ uint64_t strtouint64(const char *__restrict str, char **__restrict str_end,
                      int base) {
   auto result = internal::strtointeger<uint64_t>(str, base);
   if (result.has_error())
-    LIBC_NAMESPACE::libc_errno = result.error;
+    libc_errno = result.error;
 
   if (str_end != nullptr)
     *str_end = const_cast<char *>(str + result.parsed_len);
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index eb4056dc7ba64..c2f2b9c9a11c3 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 
diff --git a/libc/test/src/sys/mman/linux/mlock_test.cpp b/libc/test/src/sys/mman/linux/mlock_test.cpp
index 88abacad554e0..6b81411ca604a 100644
--- a/libc/test/src/sys/mman/linux/mlock_test.cpp
+++ b/libc/test/src/sys/mman/linux/mlock_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/sys/mman/madvise.h"
 #include "src/sys/mman/mincore.h"
 #include "src/sys/mman/mlock.h"
@@ -149,9 +149,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) {
         Succeeds());
     auto retval = LIBC_NAMESPACE::mlockall(MCL_CURRENT);
     if (retval == -1) {
-      EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM ||
-                  LIBC_NAMESPACE::libc_errno == EPERM);
-      LIBC_NAMESPACE::libc_errno = 0;
+      EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM);
+      libc_errno = 0;
       return;
     }
     unsigned char vec;
@@ -163,9 +162,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) {
   {
     auto retval = LIBC_NAMESPACE::mlockall(MCL_FUTURE);
     if (retval == -1) {
-      EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM ||
-                  LIBC_NAMESPACE::libc_errno == EPERM);
-      LIBC_NAMESPACE::libc_errno = 0;
+      EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM);
+      libc_errno = 0;
       return;
     }
     PageHolder holder;
@@ -180,9 +178,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) {
   {
     auto retval = LIBC_NAMESPACE::mlockall(MCL_FUTURE | MCL_ONFAULT);
     if (retval == -1) {
-      EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM ||
-                  LIBC_NAMESPACE::libc_errno == EPERM);
-      LIBC_NAMESPACE::libc_errno = 0;
+      EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM);
+      libc_errno = 0;
       return;
     }
     PageHolder holder;
diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
index 455a82678e18f..ba0ee4f09109e 100644
--- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/sys/stat/mkdirat.h"
 #include "src/sys/statvfs/fstatvfs.h"
@@ -41,7 +41,7 @@ TEST_F(LlvmLibcSysFStatvfsTest, FStatvfsInvalidPath) {
 
   // Always delete the folder so that we start in a consistent state.
   LIBC_NAMESPACE::rmdir(TEST_DIR);
-  LIBC_NAMESPACE::libc_errno = 0; // Reset errno
+  libc_errno = 0; // Reset errno
 
   ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU),
               Succeeds(0));
diff --git a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
index f356bb3d277b6..327dec07a1b79 100644
--- a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
+++ b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-#include "src/errno/libc_errno.h"
 #include "src/sys/stat/mkdirat.h"
 #include "src/sys/statvfs/statvfs.h"
 #include "src/unistd/rmdir.h"
@@ -37,7 +37,7 @@ TEST_F(LlvmLibcSysStatvfsTest, StatvfsInvalidPath) {
 
   // Always delete the folder so that we start in a consistent state.
   LIBC_NAMESPACE::rmdir(TEST_DIR);
-  LIBC_NAMESPACE::libc_errno = 0; // Reset errno
+  libc_errno = 0; // Reset errno
 
   ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU),
               Succeeds(0));
diff --git a/libc/test/src/sys/time/setitimer_test.cpp b/libc/test/src/sys/time/setitimer_test.cpp
index 16d33fdf1e4f9..115f9e662ed46 100644
--- a/libc/test/src/sys/time/setitimer_test.cpp
+++ b/libc/test/src/sys/time/setitimer_test.cpp
@@ -24,7 +24,7 @@ static bool timer_fired(false);
 extern "C" void handle_sigalrm(int) { timer_fired = true; }
 
 TEST_F(LlvmLibcSysTimeSetitimerTest, SmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   struct sigaction sa;
   sa.sa_handler = handle_sigalrm;
   LIBC_NAMESPACE::sigemptyset(&sa.sa_mask);
diff --git a/libc/test/src/termios/termios_test.cpp b/libc/test/src/termios/termios_test.cpp
index f8fc09a8bbf0e..5ec169a886b1e 100644
--- a/libc/test/src/termios/termios_test.cpp
+++ b/libc/test/src/termios/termios_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/termios/cfgetispeed.h"
 #include "src/termios/cfgetospeed.h"
@@ -30,21 +30,21 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
 TEST(LlvmLibcTermiosTest, SpeedSmokeTest) {
   struct termios t;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, B50), Succeeds(0));
   ASSERT_EQ(LIBC_NAMESPACE::cfgetispeed(&t), speed_t(B50));
   ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, B75), Succeeds(0));
   ASSERT_EQ(LIBC_NAMESPACE::cfgetospeed(&t), speed_t(B75));
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, ~CBAUD), Fails(EINVAL));
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, ~CBAUD), Fails(EINVAL));
 }
 
 TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) {
   struct termios t;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY);
   if (fd < 0)
     return; // When /dev/tty is not available, no point continuing.
@@ -54,7 +54,7 @@ TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) {
 }
 
 TEST(LlvmLibcTermiosTest, TcGetSidSmokeTest) {
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY);
   if (fd < 0)
     return; // When /dev/tty is not available, no point continuing.
diff --git a/libc/test/src/time/asctime_r_test.cpp b/libc/test/src/time/asctime_r_test.cpp
index b595cfe024866..d840248b7df42 100644
--- a/libc/test/src/time/asctime_r_test.cpp
+++ b/libc/test/src/time/asctime_r_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/asctime_r.h"
 #include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/time/asctime_test.cpp b/libc/test/src/time/asctime_test.cpp
index 169a7463a3037..cad25fffc65af 100644
--- a/libc/test/src/time/asctime_test.cpp
+++ b/libc/test/src/time/asctime_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/asctime.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
diff --git a/libc/test/src/time/ctime_r_test.cpp b/libc/test/src/time/ctime_r_test.cpp
index 27011b7e0fbd6..fe43877aa499d 100644
--- a/libc/test/src/time/ctime_r_test.cpp
+++ b/libc/test/src/time/ctime_r_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/ctime_r.h"
 #include "src/time/time_constants.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/time/ctime_test.cpp b/libc/test/src/time/ctime_test.cpp
index 6f1168f0b6685..5ff69f6619b4f 100644
--- a/libc/test/src/time/ctime_test.cpp
+++ b/libc/test/src/time/ctime_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/ctime.h"
 #include "test/UnitTest/Test.h"
 #include "test/src/time/TmHelper.h"
diff --git a/libc/test/src/time/gmtime_test.cpp b/libc/test/src/time/gmtime_test.cpp
index 6af5a18d36996..41236665d2eaa 100644
--- a/libc/test/src/time/gmtime_test.cpp
+++ b/libc/test/src/time/gmtime_test.cpp
@@ -8,7 +8,7 @@
 
 #include "hdr/types/struct_tm.h"
 #include "src/__support/CPP/limits.h" // INT_MAX, INT_MIN
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/gmtime.h"
 #include "src/time/time_constants.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
@@ -30,7 +30,7 @@ TEST(LlvmLibcGmTime, OutOfRange) {
   EXPECT_TRUE(tm_data == nullptr);
   ASSERT_ERRNO_EQ(EOVERFLOW);
 
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
   seconds =
       INT_MIN *
           static_cast<int64_t>(
diff --git a/libc/test/src/time/nanosleep_test.cpp b/libc/test/src/time/nanosleep_test.cpp
index d4f98e29bd980..e0200ff3aaa26 100644
--- a/libc/test/src/time/nanosleep_test.cpp
+++ b/libc/test/src/time/nanosleep_test.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "hdr/types/struct_timespec.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/time/nanosleep.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -17,7 +17,7 @@ namespace cpp = LIBC_NAMESPACE::cpp;
 TEST(LlvmLibcNanosleep, SmokeTest) {
   // TODO: When we have the code to read clocks, test that time has passed.
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  LIBC_NAMESPACE::libc_errno = 0;
+  libc_errno = 0;
 
   struct timespec tim = {1, 500};
   struct timespec tim2 = {0, 0};
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index b86d2f27e516a..123d9ccc8310f 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1601,6 +1601,7 @@ libc_support_library(
 libc_header_library(
     name = "libcxx_shared_headers",
     hdrs = [
+        "shared/libc_common.h",
         "shared/fp_bits.h",
         "shared/str_to_float.h",
         "shared/str_to_integer.h",
@@ -1618,7 +1619,7 @@ libc_header_library(
 libc_support_library(
     name = "errno",
     srcs = ["src/errno/libc_errno.cpp"],
-    hdrs = ["src/errno/libc_errno.h"],
+    hdrs = ["src/__support/libc_errno.h"],
     deps = [
         ":__support_common",
         ":__support_cpp_atomic",

From 79108da325daec08f5b50169a9c35e03ea0645a3 Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Wed, 11 Jun 2025 20:28:55 +0000
Subject: [PATCH 149/851] [libc][obvious] Changed incorrect type (#143780)

After changing mbstate_t to mbstate we forgot to change the
character_converter files to reflect it.

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/src/__support/wchar/character_converter.cpp | 2 +-
 libc/src/__support/wchar/character_converter.h   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 0afc2a6f59e64..3cdb8ca83b7f0 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -16,7 +16,7 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; }
+CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
 bool CharacterConverter::isComplete() {}
 
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index a6bac43805376..d0602d2defe22 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -19,10 +19,10 @@ namespace internal {
 
 class CharacterConverter {
 private:
-  mbstate_t *state;
+  mbstate *state;
 
 public:
-  CharacterConverter(mbstate_t *mbstate);
+  CharacterConverter(mbstate *mbstate);
 
   bool isComplete();
 

From c0c0f60ca14422dfbfe27fddd8d47faa596165d8 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Wed, 11 Jun 2025 22:09:55 +0100
Subject: [PATCH 150/851] [GlobalOpt] Bail out on non-ConstExprs in
 isSimpleEnoughtToCommit. (#143400)

Bail out for non ConstantExpr constants in
isSimpleEnoughValueToCommitHelper to prevent crash for non-ConstantExpr
constants

PR: https://github.com/llvm/llvm-project/pull/143400
---
 llvm/lib/Transforms/Utils/Evaluator.cpp       |  4 +-
 .../global-constructor-complex-constants.ll   | 64 +++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll

diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index 2af447aadce22..d1db2ee29f3a2 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -77,7 +77,9 @@ isSimpleEnoughValueToCommitHelper(Constant *C,
   // We don't know exactly what relocations are allowed in constant expressions,
   // so we allow &global+constantoffset, which is safe and uniformly supported
   // across targets.
-  ConstantExpr *CE = cast<ConstantExpr>(C);
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
+  if (!CE)
+    return false;
   switch (CE->getOpcode()) {
   case Instruction::BitCast:
     // Bitcast is fine if the casted value is fine.
diff --git a/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll b/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll
new file mode 100644
index 0000000000000..6d9bdc41a0041
--- /dev/null
+++ b/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -p globalopt -S %s | FileCheck %s
+
+@llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_nocfi, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_dso_local_equivalent, ptr null }]
+
+@foo = internal global ptr null
+
+declare void @user(ptr)
+
+;.
+; CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_nocfi, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_dso_local_equivalent, ptr null }]
+; CHECK: @foo = internal global ptr null
+;.
+define void @ctor() {
+; CHECK-LABEL: define void @ctor() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr ptrauth (ptr @foo, i32 0), ptr [[DST]], align 8
+; CHECK-NEXT:    call void @user(ptr [[DST]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca ptr, align 8
+  store ptr ptrauth (ptr @foo, i32 0), ptr %dst, align 8
+  call void @user(ptr %dst)
+  ret void
+}
+
+define void @ctor_nocfi() {
+; CHECK-LABEL: define void @ctor_nocfi() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr no_cfi @foo, ptr [[DST]], align 8
+; CHECK-NEXT:    call void @user(ptr [[DST]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca ptr, align 8
+  store ptr no_cfi @foo, ptr %dst, align 8
+  call void @user(ptr %dst)
+  ret void
+}
+
+define void @fn() {
+; CHECK-LABEL: define void @fn() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @ctor_dso_local_equivalent() {
+; CHECK-LABEL: define void @ctor_dso_local_equivalent() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    store ptr dso_local_equivalent @fn, ptr [[DST]], align 8
+; CHECK-NEXT:    call void @user(ptr [[DST]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = alloca ptr, align 8
+  store ptr dso_local_equivalent @fn, ptr %dst, align 8
+  call void @user(ptr %dst)
+  ret void
+}

From f39f53e569f92987683626d910e9dbcbd59ff410 Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Wed, 11 Jun 2025 14:11:19 -0700
Subject: [PATCH 151/851] [Clang][NFC] Move HeadingAndSpellings to avoid
 copying (#143611)

Static analysis flagged that we could move HeadingAndSpellings and avoid
a copy of a large object.
---
 clang/utils/TableGen/ClangAttrEmitter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 21d76c12a3cce..42627f02cf356 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -5405,7 +5405,7 @@ void EmitClangAttrDocs(const RecordKeeper &Records, raw_ostream &OS) {
       // Handle Undocumented category separately - no content merging
       if (Cat == "Undocumented" && UndocumentedCategory) {
         UndocumentedDocs.push_back(
-            DocumentationData(Doc, Attr, HeadingAndSpellings));
+            DocumentationData(Doc, Attr, std::move(HeadingAndSpellings)));
         continue;
       }
 

From d7e7f22626f214766f3592341dd1737fd232c6a5 Mon Sep 17 00:00:00 2001
From: "Oleksandr T." <oleksandr.tarasiuk@outlook.com>
Date: Thu, 12 Jun 2025 00:19:25 +0300
Subject: [PATCH 152/851] [Clang] fix missing source location for errors in
 macro-expanded (#143460)

Fixes #143216

---

This patch fixes diagnostic locations for tokens from macro expansions.
---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/include/clang/Parse/Parser.h            |  4 +---
 clang/lib/Parse/ParseExprCXX.cpp              |  4 ++--
 clang/lib/Parse/ParseStmt.cpp                 |  7 ++++--
 clang/lib/Parse/Parser.cpp                    |  5 +++++
 .../test/Parser/macro-expansion-recovery.cpp  | 22 +++++++++++++++++++
 clang/test/Parser/switch-recovery.cpp         | 13 +++++++++++
 7 files changed, 49 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/Parser/macro-expansion-recovery.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 8043ab48f0b4f..b42d5f8425af6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -694,6 +694,7 @@ Bug Fixes in This Version
 - Constant evaluation now correctly runs the destructor of a variable declared in
   the second clause of a C-style ``for`` loop. (#GH139818)
 - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168)
+- Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 0b2fab4a45c96..d99de77a52919 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -290,9 +290,7 @@ class Parser : public CodeCompletionHandler {
     return ConsumeToken();
   }
 
-  SourceLocation getEndOfPreviousToken() {
-    return PP.getLocForEndOfToken(PrevTokLocation);
-  }
+  SourceLocation getEndOfPreviousToken() const;
 
   /// GetLookAheadToken - This peeks ahead N tokens and returns that token
   /// without consuming any tokens.  LookAhead(0) returns 'Tok', LookAhead(1)
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index d95260829e4a0..55ad7f256fa82 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -421,8 +421,8 @@ bool Parser::ParseOptionalCXXScopeSpecifier(
       // like we never saw it.
       Token Identifier = Tok; // Stash away the identifier.
       ConsumeToken();         // Eat the identifier, current token is now '::'.
-      Diag(PP.getLocForEndOfToken(ConsumeToken()), diag::err_expected)
-          << tok::identifier;
+      ConsumeToken();
+      Diag(getEndOfPreviousToken(), diag::err_expected) << tok::identifier;
       UnconsumeToken(Identifier); // Stick the identifier back.
       Next = NextToken();         // Point Next at the '{' token.
     }
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index c788723023c8b..c00759893b0c4 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -832,10 +832,13 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx,
           << "'case'" << tok::colon
           << FixItHint::CreateReplacement(ColonLoc, ":");
     } else {
-      SourceLocation ExpectedLoc = PP.getLocForEndOfToken(PrevTokLocation);
+      SourceLocation ExpectedLoc = getEndOfPreviousToken();
+
       Diag(ExpectedLoc, diag::err_expected_after)
           << "'case'" << tok::colon
-          << FixItHint::CreateInsertion(ExpectedLoc, ":");
+          << FixItHint::CreateInsertion(ExpectedLoc,
+                                        tok::getTokenName(tok::colon));
+
       ColonLoc = ExpectedLoc;
     }
 
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index db65c05cc114a..788ed79e0c1fa 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -1873,6 +1873,11 @@ Parser::TryAnnotateName(CorrectionCandidateCallback *CCC,
   return AnnotatedNameKind::Unresolved;
 }
 
+SourceLocation Parser::getEndOfPreviousToken() const {
+  SourceLocation TokenEndLoc = PP.getLocForEndOfToken(PrevTokLocation);
+  return TokenEndLoc.isValid() ? TokenEndLoc : Tok.getLocation();
+}
+
 bool Parser::TryKeywordIdentFallback(bool DisableKeyword) {
   assert(Tok.isNot(tok::identifier));
   Diag(Tok, diag::ext_keyword_as_ident)
diff --git a/clang/test/Parser/macro-expansion-recovery.cpp b/clang/test/Parser/macro-expansion-recovery.cpp
new file mode 100644
index 0000000000000..6826cc04e4df5
--- /dev/null
+++ b/clang/test/Parser/macro-expansion-recovery.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+namespace GH143216 {
+#define A x y
+enum { A }; // expected-error {{missing ',' between enumerators}}
+
+#define B x y
+void f() {
+    int a[2];
+    auto [B] = a; // expected-error {{expected ','}}
+}
+
+#define C <int!
+template <class T> class D;
+D C; // expected-error {{expected unqualified-id}} \
+     // expected-error {{expected '>'}} \
+     // expected-note {{to match this '<'}}
+
+#define E F::{
+class F { E }}; // expected-error {{expected identifier}} \
+                // expected-error {{expected member name or ';' after declaration specifiers}}
+}
diff --git a/clang/test/Parser/switch-recovery.cpp b/clang/test/Parser/switch-recovery.cpp
index baf703cd03aed..7b3909e3b0d32 100644
--- a/clang/test/Parser/switch-recovery.cpp
+++ b/clang/test/Parser/switch-recovery.cpp
@@ -229,3 +229,16 @@ void fn1() {
     }
 } // expected-error{{expected statement}}
 }
+
+namespace GH143216 {
+#define FOO 1 case 3:
+
+int f(int x) {
+  switch (x) {
+  case FOO // expected-error {{expected ':' after 'case'}}
+    return 0;
+  default:
+    return 1;
+  }
+}
+}

From 625bfb7179ad1acab2aba1023095826628275a60 Mon Sep 17 00:00:00 2001
From: Jiachen Yuan <jiacheny@nvidia.com>
Date: Wed, 11 Jun 2025 14:23:41 -0700
Subject: [PATCH 153/851] Workaround MSVC Linker Issue when Cross-Compiling for
 ARM64EC (#143659)

This MR presents a temporary workaround for the issue described at
https://github.com/llvm/llvm-project/issues/143575. While an [upstream
MSVC
bug](https://developercommunity.visualstudio.com/t/MSVC-Linker-Issue-When-Cross-Compiling-L/10920141)
is reported, it makes sense to apply a workaround in LLVM code to
quickly unblock anyone affected.
---
 llvm/include/llvm/IR/Mangler.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h
index e3dfe1eac6189..232101a8926b7 100644
--- a/llvm/include/llvm/IR/Mangler.h
+++ b/llvm/include/llvm/IR/Mangler.h
@@ -26,7 +26,16 @@ class Triple;
 class Twine;
 class raw_ostream;
 
-constexpr std::string_view HybridPatchableTargetSuffix = "$hp_target";
+// TODO: The weird assignment of HybridPatchableTargetSuffix below is a
+// temporary workaround for a linker failure that is only hit when compiling
+// llvm for arm64ec on windows. The description and context of the issue is at
+// https://github.com/llvm/llvm-project/issues/143575.
+// An upstream MSVC bug is filed at
+// https://developercommunity.visualstudio.com/t/MSVC-Linker-Issue-When-Cross-
+// Compiling-L/10920141.
+constexpr char HybridPatchableTargetSuffixArr[] = "$hp_target";
+constexpr std::string_view HybridPatchableTargetSuffix =
+    HybridPatchableTargetSuffixArr;
 
 class Mangler {
   /// We need to give global values the same name every time they are mangled.

From 7838fc0cd3fbe578d9554fdcd3198c2ba3616bcc Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail@gmail.com>
Date: Wed, 11 Jun 2025 23:24:33 +0200
Subject: [PATCH 154/851] [Clang] [NFC] Move diagnostics emitting code from
 `DiagnosticIDs` into `DiagnosticsEngine` (#143517)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It makes more sense for this functionality to be all in one place rather
than split up across two files—at least it caused me a bit of a headache
to try and find all places where we were actually forwarding the
diagnostic to the `DiagnosticConsumer`. Moreover, moving these functions
into `DiagnosticsEngine` simplifies the code quite a bit since we access
members of `DiagnosticsEngine` more frequently than those of
`DiagnosticIDs`. There was also a duplicated code snippet that I’ve
moved out into a new function.
---
 clang/include/clang/Basic/Diagnostic.h    | 23 +++---
 clang/include/clang/Basic/DiagnosticIDs.h | 12 ---
 clang/lib/Basic/Diagnostic.cpp            | 98 ++++++++++++++++++++---
 clang/lib/Basic/DiagnosticIDs.cpp         | 97 ----------------------
 4 files changed, 102 insertions(+), 128 deletions(-)

diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h
index e9c54c3c487c9..efee8302e7501 100644
--- a/clang/include/clang/Basic/Diagnostic.h
+++ b/clang/include/clang/Basic/Diagnostic.h
@@ -18,6 +18,7 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/Specifiers.h"
+#include "clang/Basic/UnsignedOrNone.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FunctionExtras.h"
@@ -49,6 +50,7 @@ class FileSystem;
 namespace clang {
 
 class DeclContext;
+class Diagnostic;
 class DiagnosticBuilder;
 class DiagnosticConsumer;
 class IdentifierInfo;
@@ -228,6 +230,8 @@ class DiagStorageAllocator {
 class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> {
 public:
   /// The level of the diagnostic, after it has been through mapping.
+  // FIXME: Make this an alias for DiagnosticIDs::Level as soon as
+  // we can use 'using enum'.
   enum Level {
     Ignored = DiagnosticIDs::Ignored,
     Note = DiagnosticIDs::Note,
@@ -532,7 +536,7 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> {
   ///
   /// This is used to emit continuation diagnostics with the same level as the
   /// diagnostic that they follow.
-  DiagnosticIDs::Level LastDiagLevel;
+  Level LastDiagLevel;
 
   /// Number of warnings reported
   unsigned NumWarnings;
@@ -777,18 +781,16 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> {
   /// the middle of another diagnostic.
   ///
   /// This can be used by clients who suppress diagnostics themselves.
-  void setLastDiagnosticIgnored(bool Ignored) {
-    if (LastDiagLevel == DiagnosticIDs::Fatal)
+  void setLastDiagnosticIgnored(bool IsIgnored) {
+    if (LastDiagLevel == Fatal)
       FatalErrorOccurred = true;
-    LastDiagLevel = Ignored ? DiagnosticIDs::Ignored : DiagnosticIDs::Warning;
+    LastDiagLevel = IsIgnored ? Ignored : Warning;
   }
 
   /// Determine whether the previous diagnostic was ignored. This can
   /// be used by clients that want to determine whether notes attached to a
   /// diagnostic will be suppressed.
-  bool isLastDiagnosticIgnored() const {
-    return LastDiagLevel == DiagnosticIDs::Ignored;
-  }
+  bool isLastDiagnosticIgnored() const { return LastDiagLevel == Ignored; }
 
   /// Controls whether otherwise-unmapped extension diagnostics are
   /// mapped onto ignore/warning/error.
@@ -1024,9 +1026,10 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> {
   /// Used to report a diagnostic that is finally fully formed.
   ///
   /// \returns true if the diagnostic was emitted, false if it was suppressed.
-  bool ProcessDiag(const DiagnosticBuilder &DiagBuilder) {
-    return Diags->ProcessDiag(*this, DiagBuilder);
-  }
+  bool ProcessDiag(const DiagnosticBuilder &DiagBuilder);
+
+  /// Forward a diagnostic to the DiagnosticConsumer.
+  void Report(Level DiagLevel, const Diagnostic &Info);
 
   /// @name Diagnostic Emission
   /// @{
diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h
index 80d52a0d01112..2b095f0fd6741 100644
--- a/clang/include/clang/Basic/DiagnosticIDs.h
+++ b/clang/include/clang/Basic/DiagnosticIDs.h
@@ -483,18 +483,6 @@ class DiagnosticIDs : public RefCountedBase<DiagnosticIDs> {
 
   Class getDiagClass(unsigned DiagID) const;
 
-  /// Used to report a diagnostic that is finally fully formed.
-  ///
-  /// \returns \c true if the diagnostic was emitted, \c false if it was
-  /// suppressed.
-  bool ProcessDiag(DiagnosticsEngine &Diag,
-                   const DiagnosticBuilder &DiagBuilder) const;
-
-  /// Used to emit a diagnostic that is finally fully formed,
-  /// ignoring suppression.
-  void EmitDiag(DiagnosticsEngine &Diag, const DiagnosticBuilder &DiagBuilder,
-                Level DiagLevel) const;
-
   /// Whether the diagnostic may leave the AST in a state where some
   /// invariants can break.
   bool isUnrecoverable(unsigned DiagID) const;
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 694224071347a..95d86cb153b4b 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -130,7 +130,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) {
   TrapNumErrorsOccurred = 0;
   TrapNumUnrecoverableErrorsOccurred = 0;
 
-  LastDiagLevel = DiagnosticIDs::Ignored;
+  LastDiagLevel = Ignored;
 
   if (!soft) {
     // Clear state related to #pragma diagnostic.
@@ -658,13 +658,95 @@ void DiagnosticsEngine::Report(const StoredDiagnostic &storedDiag) {
   Level DiagLevel = storedDiag.getLevel();
   Diagnostic Info(this, storedDiag.getLocation(), storedDiag.getID(),
                   DiagStorage, storedDiag.getMessage());
+  Report(DiagLevel, Info);
+}
+
+void DiagnosticsEngine::Report(Level DiagLevel, const Diagnostic &Info) {
+  assert(DiagLevel != Ignored && "Cannot emit ignored diagnostics!");
   Client->HandleDiagnostic(DiagLevel, Info);
   if (Client->IncludeInDiagnosticCounts()) {
-    if (DiagLevel == DiagnosticsEngine::Warning)
+    if (DiagLevel == Warning)
       ++NumWarnings;
   }
 }
 
+/// ProcessDiag - This is the method used to report a diagnostic that is
+/// finally fully formed.
+bool DiagnosticsEngine::ProcessDiag(const DiagnosticBuilder &DiagBuilder) {
+  Diagnostic Info(this, DiagBuilder);
+
+  assert(getClient() && "DiagnosticClient not set!");
+
+  // Figure out the diagnostic level of this message.
+  unsigned DiagID = Info.getID();
+  Level DiagLevel = getDiagnosticLevel(DiagID, Info.getLocation());
+
+  // Update counts for DiagnosticErrorTrap even if a fatal error occurred
+  // or diagnostics are suppressed.
+  if (DiagLevel >= Error) {
+    ++TrapNumErrorsOccurred;
+    if (Diags->isUnrecoverable(DiagID))
+      ++TrapNumUnrecoverableErrorsOccurred;
+  }
+
+  if (SuppressAllDiagnostics)
+    return false;
+
+  if (DiagLevel != Note) {
+    // Record that a fatal error occurred only when we see a second
+    // non-note diagnostic. This allows notes to be attached to the
+    // fatal error, but suppresses any diagnostics that follow those
+    // notes.
+    if (LastDiagLevel == Fatal)
+      FatalErrorOccurred = true;
+
+    LastDiagLevel = DiagLevel;
+  }
+
+  // If a fatal error has already been emitted, silence all subsequent
+  // diagnostics.
+  if (FatalErrorOccurred) {
+    if (DiagLevel >= Error && Client->IncludeInDiagnosticCounts())
+      ++NumErrors;
+
+    return false;
+  }
+
+  // If the client doesn't care about this message, don't issue it.  If this is
+  // a note and the last real diagnostic was ignored, ignore it too.
+  if (DiagLevel == Ignored || (DiagLevel == Note && LastDiagLevel == Ignored))
+    return false;
+
+  if (DiagLevel >= Error) {
+    if (Diags->isUnrecoverable(DiagID))
+      UnrecoverableErrorOccurred = true;
+
+    // Warnings which have been upgraded to errors do not prevent compilation.
+    if (Diags->isDefaultMappingAsError(DiagID))
+      UncompilableErrorOccurred = true;
+
+    ErrorOccurred = true;
+    if (Client->IncludeInDiagnosticCounts())
+      ++NumErrors;
+
+    // If we've emitted a lot of errors, emit a fatal error instead of it to
+    // stop a flood of bogus errors.
+    if (ErrorLimit && NumErrors > ErrorLimit && DiagLevel == Error) {
+      Report(diag::fatal_too_many_errors);
+      return false;
+    }
+  }
+
+  // Make sure we set FatalErrorOccurred to ensure that the notes from the
+  // diagnostic that caused `fatal_too_many_errors` won't be emitted.
+  if (Info.getID() == diag::fatal_too_many_errors)
+    FatalErrorOccurred = true;
+
+  // Finally, report it.
+  Report(DiagLevel, Info);
+  return true;
+}
+
 bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB,
                                        bool Force) {
   assert(getClient() && "DiagnosticClient not set!");
@@ -674,14 +756,12 @@ bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB,
     Diagnostic Info(this, DB);
 
     // Figure out the diagnostic level of this message.
-    DiagnosticIDs::Level DiagLevel =
-        Diags->getDiagnosticLevel(Info.getID(), Info.getLocation(), *this);
+    Level DiagLevel = getDiagnosticLevel(Info.getID(), Info.getLocation());
 
-    Emitted = (DiagLevel != DiagnosticIDs::Ignored);
-    if (Emitted) {
-      // Emit the diagnostic regardless of suppression level.
-      Diags->EmitDiag(*this, DB, DiagLevel);
-    }
+    // Emit the diagnostic regardless of suppression level.
+    Emitted = DiagLevel != Ignored;
+    if (Emitted)
+      Report(DiagLevel, Info);
   } else {
     // Process the diagnostic, sending the accumulated information to the
     // DiagnosticConsumer.
diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp
index 3e90b2d804773..dcf0c6cb54282 100644
--- a/clang/lib/Basic/DiagnosticIDs.cpp
+++ b/clang/lib/Basic/DiagnosticIDs.cpp
@@ -823,103 +823,6 @@ unsigned DiagnosticIDs::getCXXCompatDiagId(const LangOptions &LangOpts,
   return StdVer >= D.StdVer ? D.DiagId : D.PreDiagId;
 }
 
-/// ProcessDiag - This is the method used to report a diagnostic that is
-/// finally fully formed.
-bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag,
-                                const DiagnosticBuilder &DiagBuilder) const {
-  Diagnostic Info(&Diag, DiagBuilder);
-
-  assert(Diag.getClient() && "DiagnosticClient not set!");
-
-  // Figure out the diagnostic level of this message.
-  unsigned DiagID = Info.getID();
-  DiagnosticIDs::Level DiagLevel
-    = getDiagnosticLevel(DiagID, Info.getLocation(), Diag);
-
-  // Update counts for DiagnosticErrorTrap even if a fatal error occurred
-  // or diagnostics are suppressed.
-  if (DiagLevel >= DiagnosticIDs::Error) {
-    ++Diag.TrapNumErrorsOccurred;
-    if (isUnrecoverable(DiagID))
-      ++Diag.TrapNumUnrecoverableErrorsOccurred;
-  }
-
-  if (Diag.SuppressAllDiagnostics)
-    return false;
-
-  if (DiagLevel != DiagnosticIDs::Note) {
-    // Record that a fatal error occurred only when we see a second
-    // non-note diagnostic. This allows notes to be attached to the
-    // fatal error, but suppresses any diagnostics that follow those
-    // notes.
-    if (Diag.LastDiagLevel == DiagnosticIDs::Fatal)
-      Diag.FatalErrorOccurred = true;
-
-    Diag.LastDiagLevel = DiagLevel;
-  }
-
-  // If a fatal error has already been emitted, silence all subsequent
-  // diagnostics.
-  if (Diag.FatalErrorOccurred) {
-    if (DiagLevel >= DiagnosticIDs::Error &&
-        Diag.Client->IncludeInDiagnosticCounts()) {
-      ++Diag.NumErrors;
-    }
-
-    return false;
-  }
-
-  // If the client doesn't care about this message, don't issue it.  If this is
-  // a note and the last real diagnostic was ignored, ignore it too.
-  if (DiagLevel == DiagnosticIDs::Ignored ||
-      (DiagLevel == DiagnosticIDs::Note &&
-       Diag.LastDiagLevel == DiagnosticIDs::Ignored))
-    return false;
-
-  if (DiagLevel >= DiagnosticIDs::Error) {
-    if (isUnrecoverable(DiagID))
-      Diag.UnrecoverableErrorOccurred = true;
-
-    // Warnings which have been upgraded to errors do not prevent compilation.
-    if (isDefaultMappingAsError(DiagID))
-      Diag.UncompilableErrorOccurred = true;
-
-    Diag.ErrorOccurred = true;
-    if (Diag.Client->IncludeInDiagnosticCounts()) {
-      ++Diag.NumErrors;
-    }
-
-    // If we've emitted a lot of errors, emit a fatal error instead of it to
-    // stop a flood of bogus errors.
-    if (Diag.ErrorLimit && Diag.NumErrors > Diag.ErrorLimit &&
-        DiagLevel == DiagnosticIDs::Error) {
-      Diag.Report(diag::fatal_too_many_errors);
-      return false;
-    }
-  }
-
-  // Make sure we set FatalErrorOccurred to ensure that the notes from the
-  // diagnostic that caused `fatal_too_many_errors` won't be emitted.
-  if (Info.getID() == diag::fatal_too_many_errors)
-    Diag.FatalErrorOccurred = true;
-  // Finally, report it.
-  EmitDiag(Diag, DiagBuilder, DiagLevel);
-  return true;
-}
-
-void DiagnosticIDs::EmitDiag(DiagnosticsEngine &Diag,
-                             const DiagnosticBuilder &DiagBuilder,
-                             Level DiagLevel) const {
-  Diagnostic Info(&Diag, DiagBuilder);
-  assert(DiagLevel != DiagnosticIDs::Ignored && "Cannot emit ignored diagnostics!");
-
-  Diag.Client->HandleDiagnostic((DiagnosticsEngine::Level)DiagLevel, Info);
-  if (Diag.Client->IncludeInDiagnosticCounts()) {
-    if (DiagLevel == DiagnosticIDs::Warning)
-      ++Diag.NumWarnings;
-  }
-}
-
 bool DiagnosticIDs::isUnrecoverable(unsigned DiagID) const {
   // Only errors may be unrecoverable.
   if (getDiagClass(DiagID) < CLASS_ERROR)

From 6f2ba4712f17d7c82228a5b705570571e13a3832 Mon Sep 17 00:00:00 2001
From: Ian Wood <ianwood2024@u.northwestern.edu>
Date: Wed, 11 Jun 2025 14:34:02 -0700
Subject: [PATCH 155/851] [mlir] Fix ComposeExpandOfCollapseOp for dynamic case
 (#142663)

Changes `findCollapsingReassociation` to return nullopt in all cases
where source shape has `>=2` dynamic dims. `expand(collapse)` can
reshape to in any valid output shape but a collapse can only collapse
contiguous dimensions. When there are `>=2` dynamic dimensions it is
impossible to determine if it can be simplified to a collapse or if it
is preforming a more advanced reassociation.


This problem was uncovered by
https://github.com/llvm/llvm-project/pull/137963

---------

Signed-off-by: Ian Wood <ianwood2024@u.northwestern.edu>
---
 mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h |  9 ++++++---
 mlir/test/Dialect/Tensor/canonicalize.mlir        | 14 ++++++++++++++
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
index af575e10acc8e..61c2a50e514ca 100644
--- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
+++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h
@@ -387,11 +387,14 @@ struct ComposeExpandOfCollapseOp : public OpRewritePattern<ExpandOpTy> {
       auto resultSubShape =
           resultShape.slice(resultIndices.front(), resultIndices.size());
 
+      if (llvm::count_if(srcSubShape, ShapedType::isDynamic) >= 2 &&
+          llvm::count_if(resultSubShape, ShapedType::isDynamic) >= 2)
+        return std::nullopt;
+
       if (srcSubShape.size() == resultSubShape.size()) {
-        if (srcSubShape != resultSubShape ||
-            llvm::count_if(srcSubShape, ShapedType::isDynamic) >= 2) {
+        if (srcSubShape != resultSubShape)
           return std::nullopt;
-        }
+
         for (auto index : llvm::seq<int64_t>(0, srcSubShape.size())) {
           composedReassociation.emplace_back(1, srcIndices.front() + index);
         }
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 65c5b3e8602eb..67b03b0a3485b 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1272,6 +1272,20 @@ func.func @compose_expand_of_collapse_dynamic(%arg0 : tensor<4x?x10x64x2xf16>, %
 
 // -----
 
+func.func @no_compose_collapse_of_expand_dynamic(%arg0 : tensor<?x8x128x?xf16>, %arg1: index) -> tensor<?x128x?xf16> {
+  %collapse = tensor.collapse_shape %arg0 [[0, 1, 2, 3]] : tensor<?x8x128x?xf16> into tensor<?xf16>
+  %expanded_19 = tensor.expand_shape %collapse [[0, 1, 2]] output_shape [%arg1, 8, %arg1] : tensor<?xf16> into tensor<?x128x?xf16>
+  return %expanded_19 : tensor<?x128x?xf16>
+}
+// CHECK-LABEL: func @no_compose_collapse_of_expand_dynamic
+//  CHECK-SAME:   %[[ARG0:.+]]: tensor
+//  CHECK-SAME:   %[[ARG1:.+]]: index
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]]
+//       CHECK:   %[[EXPAND:.+]] = tensor.expand_shape %[[COLLAPSE]]
+//       CHECK:   return %[[EXPAND]]
+
+// -----
+
 // CHECK-LABEL: func @zero_rank_reshape_multi
 func.func @zero_rank_reshape_multi(%arg0: tensor<f32>) -> tensor<f32> {
   // CHECK: return %arg0

From 9c9a4a284e95ea5e27617af7235e3ab049bae680 Mon Sep 17 00:00:00 2001
From: Ellis Hoag <ellis.sparky.hoag@gmail.com>
Date: Wed, 11 Jun 2025 14:54:30 -0700
Subject: [PATCH 156/851] [LOH] Don't emit AdrpAddStr when register could be
 clobbered (#142849)

https://github.com/llvm/llvm-project/commit/b783aa89795635cbe7b25b4143b562931fcec9f6
added a check to ensure an `AdrpAddLdr` LOH isn't created when there is
an instruction between the `add` and `ldr`


https://github.com/llvm/llvm-project/blob/50c5704dc000cc0af41a511aa44db03233edf0af/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp#L419-L431

We need a similar check for `AdrpAddStr`. Although this technically
isn't implemented in LLD, it could be in the future.


https://github.com/llvm/llvm-project/blob/50c5704dc000cc0af41a511aa44db03233edf0af/lld/MachO/Arch/ARM64.cpp#L699-L702
---
 llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 37 +++++++++++--------
 .../AArch64/loh-adrp-add-ldr-clobber.mir      | 37 +++++++++++++------
 2 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index 53e8e438c5e57..064716216d1cb 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -247,6 +247,17 @@ static bool supportLoadFromLiteral(const MachineInstr &MI) {
   }
 }
 
+/// Returns \p true if there are no non-debug instructions between \p First and
+/// \p Second
+static bool areInstructionsConsecutive(const MachineInstr *First,
+                                       const MachineInstr *Second) {
+  auto It = First->getIterator();
+  auto EndIt = First->getParent()->instr_end();
+  if (It == EndIt)
+    return false;
+  return next_nodbg(It, EndIt) == Second->getIterator();
+}
+
 /// Number of GPR registers tracked by mapRegToGPRIndex()
 static const unsigned N_GPR_REGS = 31;
 /// Map register number to index from 0-30.
@@ -415,7 +426,7 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
         ++NumADRPToLDR;
       }
       break;
-    case MCLOH_AdrpAddLdr: {
+    case MCLOH_AdrpAddLdr:
       // There is a possibility that the linker may try to rewrite:
       // adrp x0, @sym@PAGE
       // add x1, x0, @sym@PAGEOFF
@@ -432,28 +443,24 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
       // FIXME: Implement proper liveness tracking for all registers. For now,
       // don't emit the LOH if there are any instructions between the add and
       // the ldr.
-      MachineInstr *AddMI = const_cast<MachineInstr *>(Info.MI1);
-      const MachineInstr *LdrMI = Info.MI0;
-      auto AddIt = MachineBasicBlock::iterator(AddMI);
-      auto EndIt = AddMI->getParent()->end();
-      if (AddMI->getIterator() == EndIt || LdrMI != &*next_nodbg(AddIt, EndIt))
+      if (!areInstructionsConsecutive(Info.MI1, Info.MI0))
         break;
-
       LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n"
                         << '\t' << MI << '\t' << *Info.MI1 << '\t'
                         << *Info.MI0);
       AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0});
       ++NumADDToLDR;
       break;
-    }
     case MCLOH_AdrpAddStr:
-      if (Info.MI1 != nullptr) {
-        LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n"
-                          << '\t' << MI << '\t' << *Info.MI1 << '\t'
-                          << *Info.MI0);
-        AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0});
-        ++NumADDToSTR;
-      }
+      if (!Info.MI1)
+        break;
+      if (!areInstructionsConsecutive(Info.MI1, Info.MI0))
+        break;
+      LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n"
+                        << '\t' << MI << '\t' << *Info.MI1 << '\t'
+                        << *Info.MI0);
+      AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0});
+      ++NumADDToSTR;
       break;
     case MCLOH_AdrpLdrGotLdr:
       LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n"
diff --git a/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir b/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir
index ce2d8f02f4cc8..a1d8bf375a19b 100644
--- a/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir
+++ b/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir
@@ -1,16 +1,34 @@
-# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s
+# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s --implicit-check-not=MCLOH_
 # REQUIRES: asserts
+
+# Check that we don't emit LOHs when there is a clobbering def of x8.
 --- |
   @sym2 = local_unnamed_addr global [10000000 x i32] zeroinitializer, align 8
   @sym = local_unnamed_addr global i32 zeroinitializer, align 8
 
-  define i32 @main() {
-    ret i32 0
-  }
+  define i32 @adrp_add_ldr() { ret i32 0 }
+  define i32 @adrp_add_str() { ret i32 0 }
+...
+
+---
+name:            adrp_add_ldr
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$x21', virtual-reg: '' }
+body:             |
+  bb.0:
+    liveins: $x21
+    renamable $x8 = ADRP target-flags(aarch64-page) @sym
+    renamable $x9 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @sym, 0
+    renamable $x8 = ADDXri killed renamable $x21, 1, 0
+    $x9 = LDRXui $x9, 0
 
+    RET undef $lr
 ...
+
 ---
-name:            main
+name:            adrp_add_str
 alignment:       4
 tracksRegLiveness: true
 liveins:
@@ -19,13 +37,10 @@ liveins:
 body:             |
   bb.0:
     liveins: $x21, $x22
-    ; Check we don't emit an loh here because there's a clobbering def of x8 before the ldr.
-    ; CHECK-LABEL: main
-    ; CHECK-NOT: MCLOH_AdrpAddLdr
     renamable $x8 = ADRP target-flags(aarch64-page) @sym
     renamable $x9 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @sym, 0
-    renamable $x8 = ADDXri killed renamable $x22, 1, 0
-    $x9 = LDRXui $x9, 0
-    RET undef $lr
+    renamable $x8 = ADDXri killed renamable $x21, 1, 0
+    STRXui $x22, $x9, 0
 
+    RET undef $lr
 ...

From 74172add65aa14e77e98b048db0074c3f273057f Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland@gmail.com>
Date: Wed, 11 Jun 2025 18:18:22 -0400
Subject: [PATCH 157/851] [mlir][generate-test-checks] Do not emit the
 autogenerated note if it exists (#143750)

Prior to this PR, the script removed the already existing autogenerated
note if we came across a line that was equal to the note. But the
default note is multiple lines, so there would never be a match.
Instead, check to see if the current line is a substring of the
autogenerated note.

Co-authored-by: Michael Maitland <michaelmaitland@meta.com>
---
 mlir/utils/generate-test-checks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py
index 11fb4e40072e7..f77c9688d9318 100755
--- a/mlir/utils/generate-test-checks.py
+++ b/mlir/utils/generate-test-checks.py
@@ -208,7 +208,7 @@ def process_source_lines(source_lines, note, args):
     source_segments = [[]]
     for line in source_lines:
         # Remove previous note.
-        if line == note:
+        if line in note:
             continue
         # Remove previous CHECK lines.
         if line.find(args.check_prefix) != -1:

From 0e457315f55889878ccbc3e35d4beb04e277733f Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland@gmail.com>
Date: Wed, 11 Jun 2025 18:19:15 -0400
Subject: [PATCH 158/851] [mlir][generate-test-checks] Emit attributes with
 rest of CHECK lines (#143759)

Prior to this patch, generating test checks in place put the ATTR
definitions at the very top of the file, above the RUN lines and
autogenerated note. All CHECK lines should below the RUN lines and
autogenerated note.

This change ensures that the attribute definitions are emitted with the
rest of the CHECK lines.

---------

Co-authored-by: Michael Maitland <michaelmaitland@meta.com>
---
 mlir/utils/generate-test-checks.py | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py
index f77c9688d9318..14a790e6d0e6e 100755
--- a/mlir/utils/generate-test-checks.py
+++ b/mlir/utils/generate-test-checks.py
@@ -220,12 +220,19 @@ def process_source_lines(source_lines, note, args):
         source_segments[-1].append(line + "\n")
     return source_segments
 
-def process_attribute_definition(line, attribute_namer, output):
+
+def process_attribute_definition(line, attribute_namer):
     m = ATTR_DEF_RE.match(line)
     if m:
         attribute_name = attribute_namer.generate_name(m.group(1))
-        line = '// CHECK: #[[' + attribute_name + ':.+]] =' + line[len(m.group(0)):] + '\n'
-        output.write(line)
+        return (
+            "// CHECK: #[["
+            + attribute_name
+            + ":.+]] ="
+            + line[len(m.group(0)) :]
+            + "\n"
+        )
+    return None
 
 def process_attribute_references(line, attribute_namer):
 
@@ -340,6 +347,9 @@ def main():
     variable_namer = VariableNamer(args.variable_names)
     attribute_namer = AttributeNamer(args.attribute_names)
 
+    # Store attribute definitions to emit at appropriate scope
+    pending_attr_defs = []
+
     # Process lines
     for input_line in input_lines:
         if not input_line:
@@ -350,8 +360,9 @@ def main():
         if input_line.startswith("// -----"):
             continue
 
-        # Check if this is an attribute definition and process it
-        process_attribute_definition(input_line, attribute_namer, output)
+        if ATTR_DEF_RE.match(input_line):
+            pending_attr_defs.append(input_line)
+            continue
 
         # Lines with blocks begin with a ^. These lines have a trailing comment
         # that needs to be stripped.
@@ -407,6 +418,13 @@ def main():
             output_line += process_line(ssa_split[1:], variable_namer)
 
         else:
+            # Emit any pending attribute definitions at the start of this scope
+            for attr in pending_attr_defs:
+                attr_line = process_attribute_definition(attr, attribute_namer)
+                if attr_line:
+                    output_segments[-1].append(attr_line)
+            pending_attr_defs.clear()
+
             # Output the first line chunk that does not contain an SSA name for the
             # label.
             output_line = "// " + args.check_prefix + "-LABEL: " + ssa_split[0] + "\n"

From ee35e342945d6825c9b2b004fd135cf16c84ea0e Mon Sep 17 00:00:00 2001
From: Nikolay Panchenko <nicholas.panchenko@gmail.com>
Date: Wed, 11 Jun 2025 19:00:29 -0400
Subject: [PATCH 159/851] [ConstantFolding] Add folding for [de]interleave2,
 insert and extract (#141301)

The change adds folding for 4 vector intrinsics: `interleave2`,
`deinterleave2`, `vector_extract` and `vector_insert`. For the last 2
intrinsics the change does not use `ShuffleVector` fold mechanism as
it's much simpler to construct result vector explicitly.
---
 llvm/lib/Analysis/ConstantFolding.cpp         | 97 +++++++++++++++++++
 .../InstSimplify/ConstProp/vector-calls.ll    | 68 +++++++++++++
 2 files changed, 165 insertions(+)
 create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 1ef0badd23757..139a0b81e299b 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1635,6 +1635,10 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::vector_reduce_smax:
   case Intrinsic::vector_reduce_umin:
   case Intrinsic::vector_reduce_umax:
+  case Intrinsic::vector_extract:
+  case Intrinsic::vector_insert:
+  case Intrinsic::vector_interleave2:
+  case Intrinsic::vector_deinterleave2:
   // Target intrinsics
   case Intrinsic::amdgcn_perm:
   case Intrinsic::amdgcn_wave_reduce_umin:
@@ -3758,6 +3762,72 @@ static Constant *ConstantFoldFixedVectorCall(
     }
     return nullptr;
   }
+  case Intrinsic::vector_extract: {
+    auto *Idx = dyn_cast<ConstantInt>(Operands[1]);
+    Constant *Vec = Operands[0];
+    if (!Idx || !isa<FixedVectorType>(Vec->getType()))
+      return nullptr;
+
+    unsigned NumElements = FVTy->getNumElements();
+    unsigned VecNumElements =
+        cast<FixedVectorType>(Vec->getType())->getNumElements();
+    unsigned StartingIndex = Idx->getZExtValue();
+
+    // Extracting entire vector is nop
+    if (NumElements == VecNumElements && StartingIndex == 0)
+      return Vec;
+
+    for (unsigned I = StartingIndex, E = StartingIndex + NumElements; I < E;
+         ++I) {
+      Constant *Elt = Vec->getAggregateElement(I);
+      if (!Elt)
+        return nullptr;
+      Result[I - StartingIndex] = Elt;
+    }
+
+    return ConstantVector::get(Result);
+  }
+  case Intrinsic::vector_insert: {
+    Constant *Vec = Operands[0];
+    Constant *SubVec = Operands[1];
+    auto *Idx = dyn_cast<ConstantInt>(Operands[2]);
+    if (!Idx || !isa<FixedVectorType>(Vec->getType()))
+      return nullptr;
+
+    unsigned SubVecNumElements =
+        cast<FixedVectorType>(SubVec->getType())->getNumElements();
+    unsigned VecNumElements =
+        cast<FixedVectorType>(Vec->getType())->getNumElements();
+    unsigned IdxN = Idx->getZExtValue();
+    // Replacing entire vector with a subvec is nop
+    if (SubVecNumElements == VecNumElements && IdxN == 0)
+      return SubVec;
+
+    for (unsigned I = 0; I < VecNumElements; ++I) {
+      Constant *Elt;
+      if (I < IdxN + SubVecNumElements)
+        Elt = SubVec->getAggregateElement(I - IdxN);
+      else
+        Elt = Vec->getAggregateElement(I);
+      if (!Elt)
+        return nullptr;
+      Result[I] = Elt;
+    }
+    return ConstantVector::get(Result);
+  }
+  case Intrinsic::vector_interleave2: {
+    unsigned NumElements =
+        cast<FixedVectorType>(Operands[0]->getType())->getNumElements();
+    for (unsigned I = 0; I < NumElements; ++I) {
+      Constant *Elt0 = Operands[0]->getAggregateElement(I);
+      Constant *Elt1 = Operands[1]->getAggregateElement(I);
+      if (!Elt0 || !Elt1)
+        return nullptr;
+      Result[2 * I] = Elt0;
+      Result[2 * I + 1] = Elt1;
+    }
+    return ConstantVector::get(Result);
+  }
   default:
     break;
   }
@@ -3919,6 +3989,33 @@ ConstantFoldStructCall(StringRef Name, Intrinsic::ID IntrinsicID,
       return nullptr;
     return ConstantStruct::get(StTy, SinResult, CosResult);
   }
+  case Intrinsic::vector_deinterleave2: {
+    auto *Vec = dyn_cast<Constant>(Operands[0]);
+    if (!Vec)
+      return nullptr;
+
+    auto *VecTy = cast<VectorType>(Vec->getType());
+    unsigned NumElements = VecTy->getElementCount().getKnownMinValue() / 2;
+    if (isa<ConstantAggregateZero>(Vec)) {
+      auto *HalfVecTy = VectorType::getHalfElementsVectorType(VecTy);
+      return ConstantStruct::get(StTy, ConstantAggregateZero::get(HalfVecTy),
+                                 ConstantAggregateZero::get(HalfVecTy));
+    }
+    if (isa<FixedVectorType>(Vec->getType())) {
+      SmallVector<Constant *, 4> Res0(NumElements), Res1(NumElements);
+      for (unsigned I = 0; I < NumElements; ++I) {
+        Constant *Elt0 = Vec->getAggregateElement(2 * I);
+        Constant *Elt1 = Vec->getAggregateElement(2 * I + 1);
+        if (!Elt0 || !Elt1)
+          return nullptr;
+        Res0[I] = Elt0;
+        Res1[I] = Elt1;
+      }
+      return ConstantStruct::get(StTy, ConstantVector::get(Res0),
+                                 ConstantVector::get(Res1));
+    }
+    return nullptr;
+  }
   default:
     // TODO: Constant folding of vector intrinsics that fall through here does
     // not work (e.g. overflow intrinsics)
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
new file mode 100644
index 0000000000000..9dbe3d4e50ee1
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instsimplify,verify -S | FileCheck %s
+
+define <3 x i32> @fold_vector_extract() {
+; CHECK-LABEL: define <3 x i32> @fold_vector_extract() {
+; CHECK-NEXT:    ret <3 x i32> <i32 3, i32 4, i32 5>
+;
+  %1 = call <3 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i64 3)
+  ret <3 x i32> %1
+}
+
+@a = external global i16, align 1
+
+define <3 x i32> @fold_vector_extract_constexpr() {
+; CHECK-LABEL: define <3 x i32> @fold_vector_extract_constexpr() {
+; CHECK-NEXT:    ret <3 x i32> <i32 ptrtoint (ptr @a to i32), i32 1, i32 2>
+;
+  %1 = call <3 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> <i32 ptrtoint (ptr @a to i32), i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i64 0)
+  ret <3 x i32> %1
+}
+
+define <8 x i32> @fold_vector_extract_nop() {
+; CHECK-LABEL: define <8 x i32> @fold_vector_extract_nop() {
+; CHECK-NEXT:    ret <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;
+  %1 = call <8 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @fold_vector_insert() {
+; CHECK-LABEL: define <8 x i32> @fold_vector_insert() {
+; CHECK-NEXT:    ret <8 x i32> <i32 9, i32 10, i32 11, i32 12, i32 5, i32 6, i32 7, i32 8>
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <4 x i32> <i32 9, i32 10, i32 11, i32 12>, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @fold_vector_insert_nop() {
+; CHECK-LABEL: define <8 x i32> @fold_vector_insert_nop() {
+; CHECK-NEXT:    ret <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @fold_vector_interleave2() {
+; CHECK-LABEL: define <8 x i32> @fold_vector_interleave2() {
+; CHECK-NEXT:    ret <8 x i32> <i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 4, i32 8>
+;
+  %1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>)
+  ret <8 x i32> %1
+}
+
+define {<4 x i32>, <4 x i32>} @fold_vector_deinterleave2() {
+; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @fold_vector_deinterleave2() {
+; CHECK-NEXT:    ret { <4 x i32>, <4 x i32> } { <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 5, i32 6, i32 7, i32 8> }
+;
+  %1 = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<8 x i32> <i32 1, i32 5, i32 2, i32 6, i32 3, i32 7, i32 4, i32 8>)
+  ret {<4 x i32>, <4 x i32>} %1
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @fold_scalable_vector_deinterleave2() {
+; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @fold_scalable_vector_deinterleave2() {
+; CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } zeroinitializer
+;
+  %1 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<vscale x 8 x i32> zeroinitializer)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %1
+}

From dc4335a2bf75c7b9928a72a7f15df0276120d7ed Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Wed, 11 Jun 2025 18:22:05 -0500
Subject: [PATCH 160/851] [libc] Perform bitfield zero initialization
 wave-parallel (#143607)

Summary:
We need to set the bitfield memory to zero because the system does not
guarantee zeroed out memory. Even if fresh pages are zero, the system
allows re-use so we would need a `kfd` level API to skip this step.

Because we can't this patch updates the logic to perform the zero
initialization wave-parallel. This reduces the amount of time it takes
to allocate a fresh by up to a tenth.

This has the unfortunate side effect that the control flow is more
convoluted and we waste some extra registers, but it's worth it to
reduce the slab allocation latency.
---
 libc/src/__support/GPU/allocator.cpp | 46 +++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp
index ecc0de1cb6ec3..66ab155e5c299 100644
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) {
   return (x + N) & ~(N - 1);
 }
 
+// Perform a lane parallel memset on a uint32_t pointer.
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
+  uint64_t mask = gpu::get_lane_mask();
+  uint32_t workers = cpp::popcount(uniform);
+  for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers)
+    s[i] = c;
+}
+
 } // namespace impl
 
 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -157,10 +165,15 @@ struct Slab {
     Header *header = reinterpret_cast<Header *>(memory);
     header->chunk_size = chunk_size;
     header->global_index = global_index;
+  }
 
-    // This memset is expensive and likely not necessary for the current 'kfd'
-    // driver. Until zeroed pages are exposed by the API we must be careful.
-    __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size));
+  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
+  // must be called before the bitfield can be accessed safely, memory is not
+  // guaranteed to be zero initialized in the current implementation.
+  void initialize(uint64_t uniform) {
+    uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
+                    sizeof(uint32_t);
+    impl::uniform_memset(get_bitfield(), 0, size, uniform);
   }
 
   // Get the number of chunks that can theoretically fit inside this slab.
@@ -354,14 +367,7 @@ struct GuardPtr {
       void *raw = impl::rpc_allocate(sizeof(Slab));
       if (!raw)
         return nullptr;
-      Slab *mem = new (raw) Slab(cpp::forward<Args>(args)...);
-
-      cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
-      ptr.store(mem, cpp::MemoryOrder::RELAXED);
-      cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
-      if (!ref.acquire(n, count))
-        ref.reset(n, count);
-      return mem;
+      return new (raw) Slab(cpp::forward<Args>(args)...);
     }
 
     if (!expected || expected == reinterpret_cast<Slab *>(SENTINEL))
@@ -374,6 +380,16 @@ struct GuardPtr {
     return ptr.load(cpp::MemoryOrder::RELAXED);
   }
 
+  // Finalize the associated memory and signal that it is ready to use by
+  // resetting the counter.
+  void finalize(Slab *mem, uint32_t n, uint64_t &count) {
+    cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    ptr.store(mem, cpp::MemoryOrder::RELAXED);
+    cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+    if (!ref.acquire(n, count))
+      ref.reset(n, count);
+  }
+
 public:
   // Attempt to lock access to the pointer, potentially creating it if empty.
   // The uniform mask represents which lanes share the same pointer. For each
@@ -392,6 +408,14 @@ struct GuardPtr {
     if (!result)
       return nullptr;
 
+    // We defer storing the newly allocated slab until now so that we can use
+    // multiple lanes to initialize it and release it for use.
+    if (count == cpp::numeric_limits<uint64_t>::max()) {
+      result->initialize(uniform);
+      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
+        finalize(result, cpp::popcount(uniform), count);
+    }
+
     if (count != cpp::numeric_limits<uint64_t>::max())
       count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1;
 

From 1ecd108cb7ceda2b11281b5d173e2827feb60c55 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 11 Jun 2025 16:22:17 -0700
Subject: [PATCH 161/851] [libc] Migrate stdio tests to ErrnoCheckingTest.
 (#143802)

Reduce the direct use of libc_errno in stdio unit tests by adopting
ErrnoCheckingTest where appropriate.

Also removes the libc_errno.h inclusions from stdlib.h tests that were
accidentally added in d87eea35fac5a34a841c637db8908128409a184e
---
 libc/test/src/stdio/CMakeLists.txt           | 10 ++++++++++
 libc/test/src/stdio/fdopen_test.cpp          | 14 ++++++--------
 libc/test/src/stdio/fgetc_test.cpp           |  5 ++---
 libc/test/src/stdio/fgetc_unlocked_test.cpp  |  5 ++---
 libc/test/src/stdio/fgets_test.cpp           |  6 +++---
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++---------------
 libc/test/src/stdio/fopencookie_test.cpp     | 15 +++++++--------
 libc/test/src/stdio/remove_test.cpp          | 10 +++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 +++++----
 libc/test/src/stdio/setvbuf_test.cpp         |  8 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 +++----
 libc/test/src/stdlib/StrtolTest.h            |  1 -
 libc/test/src/stdlib/strtold_test.cpp        |  1 -
 13 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 01904a30504ed..3627006ec28fd 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,6 +20,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -68,6 +69,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -88,6 +90,7 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -109,6 +112,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -426,6 +430,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -440,6 +445,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -456,6 +462,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -476,6 +483,7 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -498,6 +506,7 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -515,6 +524,7 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index 104fc478b100e..b53184c30be36 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,20 +9,21 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
-  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 56bde5f0099a8..7c652f666a8f3 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,12 +14,12 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -33,7 +33,6 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 90429ecf4e82b..f4471dd82df15 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,12 +17,12 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -36,7 +36,6 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index abed3d4052939..c00a9256af52d 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,11 +12,12 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
-TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -35,7 +36,6 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e624181c795b8..e097785832d56 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,17 +17,18 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST(LlvmLibcFILETest, SimpleFileOperations) {
+TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -80,15 +79,12 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST(LlvmLibcFILETest, FFlush) {
+TEST_F(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
-  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index 03e1ac286b646..bcf5e674141a7 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,6 +15,7 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -22,6 +23,7 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
+using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -88,7 +90,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 84984e26398c0..296bff1f5dc15 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,16 +11,17 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index ac494a4ecaf8e..135fb98c07fbb 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,18 +8,19 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
+using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRenameTest, RenameNonExistent) {
+TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 5872943c1bb41..4144bc1bef447 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -14,9 +14,10 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
+using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -52,7 +53,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -102,6 +103,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
-  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index 5d482b70064bd..e99b382d12112 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,11 +15,12 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
-TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 3eeccc5727e77..03f0a6539c785 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,7 +9,6 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index c2f2b9c9a11c3..eb4056dc7ba64 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 3c7af175e51c3ab08ac3c442146c2b822f38c01e Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Wed, 11 Jun 2025 16:52:21 -0700
Subject: [PATCH 162/851] [libc] Fix stdio tests after #143802 (#143810)

In #143802 the stdio test cleanup missed a few places where errno was
being set to a failing value, and one where the framework needed to
included.
---
 libc/docs/configure.rst                     | 2 +-
 libc/test/src/stdio/fgetc_test.cpp          | 1 +
 libc/test/src/stdio/fgetc_unlocked_test.cpp | 1 +
 libc/test/src/stdio/fgets_test.cpp          | 1 +
 libc/test/src/stdio/setvbuf_test.cpp        | 1 +
 5 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 8d53390ae19bf..109412225634f 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -29,7 +29,7 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack.
     - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
 * **"errno" options**
-    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
+    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE.
 * **"general" options**
     - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
 * **"math" options**
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 7c652f666a8f3..1faa49112fb63 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -33,6 +33,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    ASSERT_ERRNO_FAILURE();
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index f4471dd82df15..7b2efe642fb5e 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -36,6 +36,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    ASSERT_ERRNO_FAILURE();
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index c00a9256af52d..2d7c68d490811 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -36,6 +36,7 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  ASSERT_ERRNO_FAILURE();
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 4144bc1bef447..a0936ba79ef73 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,6 +11,7 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"

From 6c72084a578a7a1e4dc1013a1a4a30b72ad5c6ab Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Wed, 11 Jun 2025 16:56:37 -0700
Subject: [PATCH 163/851] [bazel] port 1ecd108cb7ceda2b11281b5d173e2827feb60c55

---
 utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
index 484d3e5e0a24e..505b73fd77111 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel
@@ -122,6 +122,7 @@ libc_test(
         "//libc:mkdirat",
         "//libc:open",
         "//libc:remove",
+        "//libc/test/UnitTest:errno_test_helpers",
     ],
 )
 

From bc7ea63e9c885fbe71dec29581a206bc0543d22a Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 11 Jun 2025 20:04:27 -0400
Subject: [PATCH 164/851] [MemCpyOpt] handle memcpy from memset for
 non-constant sizes (#143727)

Allows forwarding memset to memcpy for mismatching unknown sizes if
overread has undef contents. In that case we can refine the undef bytes
to the memset value.

Refs #140954 which laid some of the groundwork for this.
---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 38 +++++++++----------
 .../MemCpyOpt/variable-sized-memset-memcpy.ll |  6 +--
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 960001bf880c6..1c4ec6aa08b43 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1440,7 +1440,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
   int64_t MOffset = 0;
   const DataLayout &DL = MemCpy->getModule()->getDataLayout();
   // We can only transforms memcpy's where the dest of one is the source of the
-  // other, or the memory transfer has a known offset from the memset.
+  // other, or they have a known offset.
   if (MemCpy->getSource() != MemSet->getDest()) {
     std::optional<int64_t> Offset =
         MemCpy->getSource()->getPointerOffsetFrom(MemSet->getDest(), DL);
@@ -1451,28 +1451,28 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
 
   if (MOffset != 0 || MemSetSize != CopySize) {
     // Make sure the memcpy doesn't read any more than what the memset wrote,
-    // other than undef. Don't worry about sizes larger than i64. A known memset
-    // size is required.
+    // other than undef. Don't worry about sizes larger than i64.
     auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
-    if (!CMemSetSize)
-      return false;
-
-    // A known memcpy size is also required.
     auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
-    if (!CCopySize)
-      return false;
-    if (CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) {
+    if (!CMemSetSize || !CCopySize ||
+        CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) {
       if (!overreadUndefContents(MSSA, MemCpy, MemSet, BAA))
         return false;
-      // Clip the memcpy to the bounds of the memset
-      if (MOffset == 0)
-        CopySize = MemSetSize;
-      else
-        CopySize =
-            ConstantInt::get(CopySize->getType(),
-                             CMemSetSize->getZExtValue() <= (uint64_t)MOffset
-                                 ? 0
-                                 : CMemSetSize->getZExtValue() - MOffset);
+
+      if (CMemSetSize && CCopySize) {
+        // If both have constant sizes and offsets, clip the memcpy to the
+        // bounds of the memset if applicable.
+        assert(CCopySize->getZExtValue() + MOffset >
+               CMemSetSize->getZExtValue());
+        if (MOffset == 0)
+          CopySize = MemSetSize;
+        else
+          CopySize =
+              ConstantInt::get(CopySize->getType(),
+                               CMemSetSize->getZExtValue() <= (uint64_t)MOffset
+                                   ? 0
+                                   : CMemSetSize->getZExtValue() - MOffset);
+      }
     }
   }
 
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
index d5b1ab9b2f299..4b44f8b44f74a 100644
--- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
@@ -19,12 +19,12 @@ define void @test(ptr %src, i8 %c, i64 %size) {
 }
 
 ; Differing sizes, but would be UB if size1 < size2 since the memcpy would reference outside of the first alloca
-define void @negative_test(ptr %src, i8 %c, i64 %size1, i64 %size2) {
-; CHECK-LABEL: @negative_test(
+define void @dynsize_test(ptr %src, i8 %c, i64 %size1, i64 %size2) {
+; CHECK-LABEL: @dynsize_test(
 ; CHECK-NEXT:    [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1
 ; CHECK-NEXT:    [[DST2:%.*]] = alloca i8, i64 [[SIZE2:%.*]], align 1
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[DST1]], i8 [[C:%.*]], i64 [[SIZE1]], i1 false)
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[DST1]], i64 [[SIZE2]], i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[DST2]], i8 [[C]], i64 [[SIZE2]], i1 false)
 ; CHECK-NEXT:    ret void
 ;
   %dst1 = alloca i8, i64 %size1

From d7c6cad744bc7ed28535dc6f75629902eda559ea Mon Sep 17 00:00:00 2001
From: Jake Egan <Jake.egan@ibm.com>
Date: Wed, 11 Jun 2025 20:22:15 -0400
Subject: [PATCH 165/851] [sanitizer_common] Implement interception on AIX
 (#138606)

Adjust AIX interceptor support in sanitizer_common.

Issue: https://github.com/llvm/llvm-project/issues/138916
---
 .../sanitizer_common_interceptors.inc         | 43 ++++++++-----
 .../sanitizer_common_interceptors_ioctl.inc   |  2 +
 ...izer_common_interceptors_memintrinsics.inc |  8 ++-
 .../sanitizer_platform_interceptors.h         | 61 +++++++++++--------
 .../sanitizer_redefine_builtins.h             |  2 +-
 5 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 9272e2ab6cbd5..2d6cf7fc3282f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -481,7 +481,8 @@ INTERCEPTOR(char*, textdomain, const char *domainname) {
 #endif
 
 #if SANITIZER_INTERCEPT_STRCMP || SANITIZER_INTERCEPT_MEMCMP
-static inline int CharCmpX(unsigned char c1, unsigned char c2) {
+[[maybe_unused]] static inline int CharCmpX(unsigned char c1,
+                                            unsigned char c2) {
   return (c1 == c2) ? 0 : (c1 < c2) ? -1 : 1;
 }
 #endif
@@ -1350,7 +1351,8 @@ INTERCEPTOR(unsigned long, time, unsigned long *t) {
 #if SANITIZER_INTERCEPT_LOCALTIME_AND_FRIENDS
 static void unpoison_tm(void *ctx, __sanitizer_tm *tm) {
   COMMON_INTERCEPTOR_WRITE_RANGE(ctx, tm, sizeof(*tm));
-#if !SANITIZER_SOLARIS
+// AIX tm struct does not have tm_zone field.
+#  if !SANITIZER_SOLARIS && !SANITIZER_AIX
   if (tm->tm_zone) {
     // Can not use COMMON_INTERCEPTOR_WRITE_RANGE here, because tm->tm_zone
     // can point to shared memory and tsan would report a data race.
@@ -1735,10 +1737,12 @@ INTERCEPTOR(int, __vsprintf_chk, char *str, int flag, SIZE_T size_to,
 VSPRINTF_INTERCEPTOR_IMPL(vsprintf, str, format, ap)
 #endif
 
+#  if SANITIZER_INTERCEPT_VASPRINTF
 INTERCEPTOR(int, vasprintf, char **strp, const char *format, va_list ap)
 VASPRINTF_INTERCEPTOR_IMPL(vasprintf, strp, format, ap)
+#  endif
 
-#if SANITIZER_INTERCEPT_ISOC99_PRINTF
+#  if SANITIZER_INTERCEPT_ISOC99_PRINTF
 INTERCEPTOR(int, __isoc99_vprintf, const char *format, va_list ap)
 VPRINTF_INTERCEPTOR_IMPL(__isoc99_vprintf, format, ap)
 
@@ -1787,10 +1791,12 @@ INTERCEPTOR(int, __snprintf_chk, char *str, SIZE_T size, int flag,
 FORMAT_INTERCEPTOR_IMPL(__snprintf_chk, vsnprintf, str, size, format)
 #endif
 
+#  if SANITIZER_INTERCEPT_ASPRINTF
 INTERCEPTOR(int, asprintf, char **strp, const char *format, ...)
 FORMAT_INTERCEPTOR_IMPL(asprintf, vasprintf, strp, format)
+#  endif
 
-#if SANITIZER_INTERCEPT_ISOC99_PRINTF
+#  if SANITIZER_INTERCEPT_ISOC99_PRINTF
 INTERCEPTOR(int, __isoc99_printf, const char *format, ...)
 FORMAT_INTERCEPTOR_IMPL(__isoc99_printf, __isoc99_vprintf, format)
 
@@ -1811,17 +1817,24 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_snprintf, __isoc99_vsnprintf, str, size,
 #endif  // SANITIZER_INTERCEPT_PRINTF
 
 #if SANITIZER_INTERCEPT_PRINTF
-#define INIT_PRINTF                     \
-  COMMON_INTERCEPT_FUNCTION_LDBL(printf);    \
-  COMMON_INTERCEPT_FUNCTION_LDBL(sprintf);   \
-  COMMON_INTERCEPT_FUNCTION_LDBL(snprintf);  \
-  COMMON_INTERCEPT_FUNCTION_LDBL(asprintf);  \
-  COMMON_INTERCEPT_FUNCTION_LDBL(fprintf);   \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vprintf);   \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vsprintf);  \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vsnprintf); \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vasprintf); \
-  COMMON_INTERCEPT_FUNCTION_LDBL(vfprintf);
+#  define INIT_PRINTF_COMMON                   \
+    COMMON_INTERCEPT_FUNCTION_LDBL(printf);    \
+    COMMON_INTERCEPT_FUNCTION_LDBL(sprintf);   \
+    COMMON_INTERCEPT_FUNCTION_LDBL(snprintf);  \
+    COMMON_INTERCEPT_FUNCTION_LDBL(fprintf);   \
+    COMMON_INTERCEPT_FUNCTION_LDBL(vprintf);   \
+    COMMON_INTERCEPT_FUNCTION_LDBL(vsprintf);  \
+    COMMON_INTERCEPT_FUNCTION_LDBL(vsnprintf); \
+    COMMON_INTERCEPT_FUNCTION_LDBL(vfprintf);
+#  if !SANITIZER_AIX
+// AIX does not have [v]asprintf.
+#    define INIT_PRINTF_EXTRA                   \
+      COMMON_INTERCEPT_FUNCTION_LDBL(asprintf); \
+      COMMON_INTERCEPT_FUNCTION_LDBL(vasprintf);
+#  else
+#    define INIT_PRINTF_EXTRA
+#  endif
+#  define INIT_PRINTF INIT_PRINTF_COMMON INIT_PRINTF_EXTRA
 #else
 #define INIT_PRINTF
 #endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
index bc8f02826c614..08c2be47f5358 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
@@ -79,7 +79,9 @@ static void ioctl_table_fill() {
   _(TIOCMSET, READ, sizeof(int));
   _(TIOCNXCL, NONE, 0);
   _(TIOCOUTQ, WRITE, sizeof(int));
+#  if !SANITIZER_AIX
   _(TIOCSCTTY, NONE, 0);
+#  endif
   _(TIOCSPGRP, READ, pid_t_sz);
   _(TIOCSWINSZ, READ, struct_winsize_sz);
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc
index 1565a494140f6..0b6731c89950b 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc
@@ -33,11 +33,13 @@
 
 // Platform-specific options.
 #if SANITIZER_APPLE
-#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
+#  define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
 #elif SANITIZER_WINDOWS64
-#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
+#  define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
+#elif SANITIZER_AIX
+#  define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0
 #else
-#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1
+#  define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1
 #endif  // SANITIZER_APPLE
 
 #ifndef COMMON_INTERCEPTOR_MEMSET_IMPL
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index 4bc55d7801db7..ccc808b60ca75 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -141,6 +141,12 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SI_SOLARIS 0
 #endif
 
+#if SANITIZER_AIX
+#  define SI_NOT_AIX 0
+#else
+#  define SI_NOT_AIX 1
+#endif
+
 #if SANITIZER_SOLARIS32
 #define SI_SOLARIS32 1
 #else
@@ -161,20 +167,20 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 
 #define SANITIZER_INTERCEPT_STRLEN SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_STRNLEN (SI_NOT_MAC && SI_NOT_FUCHSIA)
-#define SANITIZER_INTERCEPT_STRCMP SI_NOT_FUCHSIA
+#define SANITIZER_INTERCEPT_STRCMP (SI_NOT_FUCHSIA && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_STRSTR SI_NOT_FUCHSIA
-#define SANITIZER_INTERCEPT_STRCASESTR SI_POSIX
+#define SANITIZER_INTERCEPT_STRCASESTR (SI_POSIX && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_STRTOK SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_STRCHR SI_NOT_FUCHSIA
-#define SANITIZER_INTERCEPT_STRCHRNUL SI_POSIX_NOT_MAC
+#define SANITIZER_INTERCEPT_STRCHRNUL (SI_POSIX_NOT_MAC && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_STRRCHR SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_STRSPN SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_STRPBRK SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_TEXTDOMAIN SI_LINUX_NOT_ANDROID || SI_SOLARIS
 #define SANITIZER_INTERCEPT_STRCASECMP SI_POSIX
 #define SANITIZER_INTERCEPT_MEMSET 1
-#define SANITIZER_INTERCEPT_MEMMOVE 1
-#define SANITIZER_INTERCEPT_MEMCPY 1
+#define SANITIZER_INTERCEPT_MEMMOVE SI_NOT_AIX
+#define SANITIZER_INTERCEPT_MEMCPY SI_NOT_AIX
 #define SANITIZER_INTERCEPT_MEMCMP SI_NOT_FUCHSIA
 #define SANITIZER_INTERCEPT_BCMP \
   SANITIZER_INTERCEPT_MEMCMP &&  \
@@ -233,9 +239,11 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_ISOC99_SCANF SI_GLIBC
 
 #ifndef SANITIZER_INTERCEPT_PRINTF
-#define SANITIZER_INTERCEPT_PRINTF SI_POSIX
-#define SANITIZER_INTERCEPT_PRINTF_L (SI_FREEBSD || SI_NETBSD)
-#define SANITIZER_INTERCEPT_ISOC99_PRINTF SI_GLIBC
+#  define SANITIZER_INTERCEPT_ASPRINTF SI_NOT_AIX
+#  define SANITIZER_INTERCEPT_VASPRINTF SI_NOT_AIX
+#  define SANITIZER_INTERCEPT_PRINTF SI_POSIX
+#  define SANITIZER_INTERCEPT_PRINTF_L (SI_FREEBSD || SI_NETBSD)
+#  define SANITIZER_INTERCEPT_ISOC99_PRINTF SI_GLIBC
 #endif
 
 #define SANITIZER_INTERCEPT_SETPROCTITLE (SI_FREEBSD || SI_NETBSD)
@@ -243,8 +251,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT___PRINTF_CHK \
   (SANITIZER_INTERCEPT_PRINTF && SI_GLIBC)
 
-#define SANITIZER_INTERCEPT_FREXP SI_NOT_FUCHSIA
-#define SANITIZER_INTERCEPT_FREXPF SI_POSIX
+// AIX libc does not export FREXP and FREXPF.
+#define SANITIZER_INTERCEPT_FREXP (SI_NOT_FUCHSIA && SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_FREXPF (SI_POSIX && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_FREXPL SI_POSIX
 
 #define SANITIZER_INTERCEPT_GETPWNAM_AND_FRIENDS SI_POSIX
@@ -294,7 +303,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_ACCEPT4 \
   (SI_LINUX_NOT_ANDROID || SI_NETBSD || SI_FREEBSD)
 #define SANITIZER_INTERCEPT_PACCEPT SI_NETBSD
-#define SANITIZER_INTERCEPT_MODF SI_POSIX
+#define SANITIZER_INTERCEPT_MODF (SI_POSIX && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_RECVMSG SI_POSIX
 #define SANITIZER_INTERCEPT_SENDMSG SI_POSIX
 #define SANITIZER_INTERCEPT_RECVMMSG SI_LINUX
@@ -329,8 +338,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT___WCSXFRM_L SI_LINUX
 #define SANITIZER_INTERCEPT_WCSNRTOMBS \
   (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
-#define SANITIZER_INTERCEPT_WCRTOMB \
-  (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
+#define SANITIZER_INTERCEPT_WCRTOMB                                           \
+  (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS || \
+   !SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_WCTOMB \
   (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
 #define SANITIZER_INTERCEPT_TCGETATTR SI_LINUX_NOT_ANDROID || SI_SOLARIS
@@ -370,7 +380,8 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_GETMNTENT_R SI_LINUX_NOT_ANDROID
 #define SANITIZER_INTERCEPT_STATFS \
   (SI_FREEBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS)
-#define SANITIZER_INTERCEPT_STATFS64 SI_GLIBC && SANITIZER_HAS_STATFS64
+#define SANITIZER_INTERCEPT_STATFS64 \
+  ((SI_GLIBC || !SI_NOT_AIX) && SANITIZER_HAS_STATFS64)
 #define SANITIZER_INTERCEPT_STATVFS \
   (SI_FREEBSD || SI_NETBSD || SI_LINUX_NOT_ANDROID)
 #define SANITIZER_INTERCEPT_STATVFS64 SI_GLIBC
@@ -419,10 +430,10 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_TTYNAME_R SI_POSIX
 #define SANITIZER_INTERCEPT_TEMPNAM SI_POSIX
 #define SANITIZER_INTERCEPT_SINCOS SI_LINUX || SI_SOLARIS
-#define SANITIZER_INTERCEPT_REMQUO SI_POSIX
-#define SANITIZER_INTERCEPT_REMQUOL (SI_POSIX && !SI_NETBSD)
-#define SANITIZER_INTERCEPT_LGAMMA SI_POSIX
-#define SANITIZER_INTERCEPT_LGAMMAL (SI_POSIX && !SI_NETBSD)
+#define SANITIZER_INTERCEPT_REMQUO (SI_POSIX && SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_REMQUOL (SI_POSIX && !SI_NETBSD && SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_LGAMMA (SI_POSIX && SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_LGAMMAL (SI_POSIX && !SI_NETBSD && SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_LGAMMA_R (SI_FREEBSD || SI_LINUX || SI_SOLARIS)
 #define SANITIZER_INTERCEPT_LGAMMAL_R SI_LINUX_NOT_ANDROID || SI_SOLARIS
 #define SANITIZER_INTERCEPT_DRAND48_R SI_GLIBC
@@ -505,11 +516,13 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE (SI_LINUX || SI_FREEBSD)
 
 #define SI_STAT_LINUX (SI_LINUX && __GLIBC_PREREQ(2, 33))
-#define SANITIZER_INTERCEPT_STAT                                        \
-  (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS ||     \
-   SI_STAT_LINUX)
-#define SANITIZER_INTERCEPT_STAT64 SI_STAT_LINUX && SANITIZER_HAS_STAT64
-#define SANITIZER_INTERCEPT_LSTAT (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX)
+#define SANITIZER_INTERCEPT_STAT                                    \
+  (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS || \
+   SI_STAT_LINUX || !SI_NOT_AIX)
+#define SANITIZER_INTERCEPT_STAT64 \
+  ((SI_STAT_LINUX || !SI_NOT_AIX) && SANITIZER_HAS_STAT64)
+#define SANITIZER_INTERCEPT_LSTAT \
+  (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX || !SI_NOT_AIX)
 #define SANITIZER_INTERCEPT___XSTAT \
   ((!SANITIZER_INTERCEPT_STAT && SI_POSIX) || SI_STAT_LINUX)
 #define SANITIZER_INTERCEPT___XSTAT64 SI_GLIBC
@@ -578,7 +591,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment,
 #define SANITIZER_INTERCEPT_PROTOENT_R SI_GLIBC
 #define SANITIZER_INTERCEPT_NETENT (SI_LINUX || SI_NETBSD || SI_FREEBSD)
 #define SANITIZER_INTERCEPT_SETVBUF \
-  (SI_NETBSD || SI_FREEBSD || SI_LINUX || SI_MAC)
+  (SI_NETBSD || SI_FREEBSD || SI_LINUX || SI_MAC || !SI_NOT_AIX)
 #define SANITIZER_INTERCEPT_GETMNTINFO (SI_NETBSD || SI_FREEBSD || SI_MAC)
 #define SANITIZER_INTERCEPT_MI_VECTOR_HASH SI_NETBSD
 #define SANITIZER_INTERCEPT_GETVFSSTAT SI_NETBSD
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h b/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h
index 41e0613d6fc13..bda0f04687693 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h
@@ -15,7 +15,7 @@
 #    define SANITIZER_REDEFINE_BUILTINS_H
 
 // The asm hack only works with GCC and Clang.
-#    if !defined(_WIN32)
+#    if !defined(_WIN32) && !defined(_AIX)
 
 asm(R"(
     .set memcpy, __sanitizer_internal_memcpy

From 7a3bcf9f7179e6904d405de36360714da07c31ba Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Wed, 11 Jun 2025 21:50:35 +0800
Subject: [PATCH 166/851] [RISCV] Add missing predicate for PseudoTHVdotVMAQA
 family instructions

---
 llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
index 2fccbcaf2cf37..89441444a994e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td
@@ -660,10 +660,12 @@ def : Pat<(i32 (sub GPR:$rd, (mul (sexti16 (i32 GPR:$rs1)),
           (TH_MULSH GPR:$rd, GPR:$rs1, GPR:$rs2)>;
 } // Predicates = [HasVendorXTHeadMac, IsRV32]
 
+let Predicates = [HasVendorXTHeadVdot] in {
 defm PseudoTHVdotVMAQA      : VPseudoVMAQA_VV_VX;
 defm PseudoTHVdotVMAQAU     : VPseudoVMAQA_VV_VX;
 defm PseudoTHVdotVMAQASU    : VPseudoVMAQA_VV_VX;
 defm PseudoTHVdotVMAQAUS    : VPseudoVMAQA_VX;
+}
 
 let Predicates = [HasVendorXTHeadVdot] in {
 defm : VPatTernaryVMAQA_VV_VX<"int_riscv_th_vmaqa",  "PseudoTHVdotVMAQA",

From 7034014d08249a1e159a668a71e96a0b78636a39 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes@amd.com>
Date: Wed, 11 Jun 2025 18:07:00 -0700
Subject: [PATCH 167/851] [InstCombine] Combine or-disjoint (and->mul),
 (and->mul) to and->mul (#136013)

The canonical pattern for bitmasked mul is currently

```
%val = and %x, %bitMask // where %bitMask is some constant
%cmp = icmp eq %val, 0
%sel = select %cmp, 0, %C // where %C is some constant = C' * %bitMask
```

In certain cases, where we are combining multiple of these bitmasked
muls with common factors, we are able to optimize into and->mul (see
https://github.com/llvm/llvm-project/pull/135274 )

This optimization lends itself to further optimizations. This PR
addresses one of such optimizations.

In cases where we have

`or-disjoint ( mul(and (X, C1), D) , mul (and (X, C2), D))`

we can combine into

`mul( and (X, (C1 + C2)), D) `

provided C1 and C2 are disjoint.

Generalized proof: https://alive2.llvm.org/ce/z/MQYMui
---
 .../InstCombine/InstCombineAndOrXor.cpp       | 130 ++++++++++++------
 .../test/Transforms/InstCombine/or-bitmask.ll | 116 ++++++++++++++--
 2 files changed, 190 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index c6c231f81c4ab..dce695a036006 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3592,6 +3592,73 @@ static Value *foldOrOfInversions(BinaryOperator &I,
   return nullptr;
 }
 
+// A decomposition of ((X & Mask) * Factor). The NUW / NSW bools
+// track these properities for preservation. Note that we can decompose
+// equivalent select form of this expression (e.g. (!(X & Mask) ? 0 : Mask *
+// Factor))
+struct DecomposedBitMaskMul {
+  Value *X;
+  APInt Factor;
+  APInt Mask;
+  bool NUW;
+  bool NSW;
+};
+
+static std::optional<DecomposedBitMaskMul> matchBitmaskMul(Value *V) {
+  Instruction *Op = dyn_cast<Instruction>(V);
+  if (!Op)
+    return std::nullopt;
+
+  // Decompose (A & N) * C) into BitMaskMul
+  Value *Original = nullptr;
+  const APInt *Mask = nullptr;
+  const APInt *MulConst = nullptr;
+  if (match(Op, m_Mul(m_And(m_Value(Original), m_APInt(Mask)),
+                      m_APInt(MulConst)))) {
+    if (MulConst->isZero() || Mask->isZero())
+      return std::nullopt;
+
+    return std::optional<DecomposedBitMaskMul>(
+        {Original, *MulConst, *Mask,
+         cast<BinaryOperator>(Op)->hasNoUnsignedWrap(),
+         cast<BinaryOperator>(Op)->hasNoSignedWrap()});
+  }
+
+  Value *Cond = nullptr;
+  const APInt *EqZero = nullptr, *NeZero = nullptr;
+
+  // Decompose ((A & N) ? 0 : N * C) into BitMaskMul
+  if (match(Op, m_Select(m_Value(Cond), m_APInt(EqZero), m_APInt(NeZero)))) {
+    auto ICmpDecompose =
+        decomposeBitTest(Cond, /*LookThruTrunc=*/true,
+                         /*AllowNonZeroC=*/false, /*DecomposeBitMask=*/true);
+    if (!ICmpDecompose.has_value())
+      return std::nullopt;
+
+    assert(ICmpInst::isEquality(ICmpDecompose->Pred) &&
+           ICmpDecompose->C.isZero());
+
+    if (ICmpDecompose->Pred == ICmpInst::ICMP_NE)
+      std::swap(EqZero, NeZero);
+
+    if (!EqZero->isZero() || NeZero->isZero())
+      return std::nullopt;
+
+    if (!ICmpDecompose->Mask.isPowerOf2() || ICmpDecompose->Mask.isZero() ||
+        NeZero->getBitWidth() != ICmpDecompose->Mask.getBitWidth())
+      return std::nullopt;
+
+    if (!NeZero->urem(ICmpDecompose->Mask).isZero())
+      return std::nullopt;
+
+    return std::optional<DecomposedBitMaskMul>(
+        {ICmpDecompose->X, NeZero->udiv(ICmpDecompose->Mask),
+         ICmpDecompose->Mask, /*NUW=*/false, /*NSW=*/false});
+  }
+
+  return std::nullopt;
+}
+
 // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
 // here. We should standardize that construct where it is needed or choose some
 // other way to ensure that commutated variants of patterns are not missed.
@@ -3674,49 +3741,26 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
                                    /*NSW=*/true, /*NUW=*/true))
       return R;
 
-    Value *Cond0 = nullptr, *Cond1 = nullptr;
-    const APInt *Op0Eq = nullptr, *Op0Ne = nullptr;
-    const APInt *Op1Eq = nullptr, *Op1Ne = nullptr;
-
-    //  (!(A & N) ? 0 : N * C) + (!(A & M) ? 0 : M * C) -> A & (N + M) * C
-    if (match(I.getOperand(0),
-              m_Select(m_Value(Cond0), m_APInt(Op0Eq), m_APInt(Op0Ne))) &&
-        match(I.getOperand(1),
-              m_Select(m_Value(Cond1), m_APInt(Op1Eq), m_APInt(Op1Ne)))) {
-
-      auto LHSDecompose =
-          decomposeBitTest(Cond0, /*LookThruTrunc=*/true,
-                           /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
-      auto RHSDecompose =
-          decomposeBitTest(Cond1, /*LookThruTrunc=*/true,
-                           /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true);
-
-      if (LHSDecompose && RHSDecompose && LHSDecompose->X == RHSDecompose->X &&
-          RHSDecompose->Mask.isPowerOf2() && LHSDecompose->Mask.isPowerOf2() &&
-          LHSDecompose->Mask != RHSDecompose->Mask &&
-          LHSDecompose->Mask.getBitWidth() == Op0Ne->getBitWidth() &&
-          RHSDecompose->Mask.getBitWidth() == Op1Ne->getBitWidth()) {
-        assert(Op0Ne->getBitWidth() == Op1Ne->getBitWidth());
-        assert(ICmpInst::isEquality(LHSDecompose->Pred));
-        if (LHSDecompose->Pred == ICmpInst::ICMP_NE)
-          std::swap(Op0Eq, Op0Ne);
-        if (RHSDecompose->Pred == ICmpInst::ICMP_NE)
-          std::swap(Op1Eq, Op1Ne);
-
-        if (!Op0Ne->isZero() && !Op1Ne->isZero() && Op0Eq->isZero() &&
-            Op1Eq->isZero() && Op0Ne->urem(LHSDecompose->Mask).isZero() &&
-            Op1Ne->urem(RHSDecompose->Mask).isZero() &&
-            Op0Ne->udiv(LHSDecompose->Mask) ==
-                Op1Ne->udiv(RHSDecompose->Mask)) {
-          auto NewAnd = Builder.CreateAnd(
-              LHSDecompose->X,
-              ConstantInt::get(LHSDecompose->X->getType(),
-                               (LHSDecompose->Mask + RHSDecompose->Mask)));
-
-          return BinaryOperator::CreateMul(
-              NewAnd, ConstantInt::get(NewAnd->getType(),
-                                       Op0Ne->udiv(LHSDecompose->Mask)));
-        }
+    // (A & N) * C + (A & M) * C -> (A & (N + M)) & C
+    // This also accepts the equivalent select form of (A & N) * C
+    // expressions i.e. !(A & N) ? 0 : N * C)
+    auto Decomp1 = matchBitmaskMul(I.getOperand(1));
+    if (Decomp1) {
+      auto Decomp0 = matchBitmaskMul(I.getOperand(0));
+      if (Decomp0 && Decomp0->X == Decomp1->X &&
+          (Decomp0->Mask & Decomp1->Mask).isZero() &&
+          Decomp0->Factor == Decomp1->Factor) {
+
+        Value *NewAnd = Builder.CreateAnd(
+            Decomp0->X, ConstantInt::get(Decomp0->X->getType(),
+                                         (Decomp0->Mask + Decomp1->Mask)));
+
+        auto *Combined = BinaryOperator::CreateMul(
+            NewAnd, ConstantInt::get(NewAnd->getType(), Decomp1->Factor));
+
+        Combined->setHasNoUnsignedWrap(Decomp0->NUW && Decomp1->NUW);
+        Combined->setHasNoSignedWrap(Decomp0->NSW && Decomp1->NSW);
+        return Combined;
       }
     }
   }
diff --git a/llvm/test/Transforms/InstCombine/or-bitmask.ll b/llvm/test/Transforms/InstCombine/or-bitmask.ll
index 3b482dc1794db..3c992dfea569a 100644
--- a/llvm/test/Transforms/InstCombine/or-bitmask.ll
+++ b/llvm/test/Transforms/InstCombine/or-bitmask.ll
@@ -36,13 +36,9 @@ define i32 @add_select_cmp_and2(i32 %in) {
 
 define i32 @add_select_cmp_and3(i32 %in) {
 ; CHECK-LABEL: @add_select_cmp_and3(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
-; CHECK-NEXT:    [[TEMP:%.*]] = mul nuw nsw i32 [[TMP1]], 72
-; CHECK-NEXT:    [[BITOP2:%.*]] = and i32 [[IN]], 4
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[BITOP2]], 0
-; CHECK-NEXT:    [[SEL2:%.*]] = select i1 [[CMP2]], i32 0, i32 288
-; CHECK-NEXT:    [[OUT:%.*]] = or disjoint i32 [[TEMP]], [[SEL2]]
-; CHECK-NEXT:    ret i32 [[OUT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 7
+; CHECK-NEXT:    [[TEMP1:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    ret i32 [[TEMP1]]
 ;
   %bitop0 = and i32 %in, 1
   %cmp0 = icmp eq i32 %bitop0, 0
@@ -60,12 +56,9 @@ define i32 @add_select_cmp_and3(i32 %in) {
 
 define i32 @add_select_cmp_and4(i32 %in) {
 ; CHECK-LABEL: @add_select_cmp_and4(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
-; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
-; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[IN]], 12
-; CHECK-NEXT:    [[TEMP3:%.*]] = mul nuw nsw i32 [[TMP2]], 72
-; CHECK-NEXT:    [[OUT1:%.*]] = or disjoint i32 [[OUT]], [[TEMP3]]
-; CHECK-NEXT:    ret i32 [[OUT1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[IN:%.*]], 15
+; CHECK-NEXT:    [[TEMP2:%.*]] = mul nuw nsw i32 [[TMP2]], 72
+; CHECK-NEXT:    ret i32 [[TEMP2]]
 ;
   %bitop0 = and i32 %in, 1
   %cmp0 = icmp eq i32 %bitop0, 0
@@ -361,6 +354,103 @@ define i64 @mask_select_types_1(i64 %in) {
   ret i64 %out
 }
 
+define i32 @add_select_cmp_mixed1(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_mixed1(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
+; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %mask = and i32 %in, 1
+  %sel0 = mul i32 %mask, 72
+  %bitop1 = and i32 %in, 2
+  %cmp1 = icmp eq i32 %bitop1, 0
+  %sel1 = select i1 %cmp1, i32 0, i32 144
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @add_select_cmp_mixed2(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_mixed2(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
+; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %bitop0 = and i32 %in, 1
+  %cmp0 = icmp eq i32 %bitop0, 0
+  %mask = and i32 %in, 2
+  %sel0 = select i1 %cmp0, i32 0, i32 72
+  %sel1 = mul i32 %mask, 72
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @add_select_cmp_and_mul(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_and_mul(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 3
+; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %mask0 = and i32 %in, 1
+  %sel0 = mul i32 %mask0, 72
+  %mask1 = and i32 %in, 2
+  %sel1 = mul i32 %mask1, 72
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @add_select_cmp_mixed2_mismatch(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_mixed2_mismatch(
+; CHECK-NEXT:    [[BITOP0:%.*]] = and i32 [[IN:%.*]], 1
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[BITOP0]], 0
+; CHECK-NEXT:    [[MASK:%.*]] = and i32 [[IN]], 2
+; CHECK-NEXT:    [[SEL0:%.*]] = select i1 [[CMP0]], i32 0, i32 73
+; CHECK-NEXT:    [[SEL1:%.*]] = mul nuw nsw i32 [[MASK]], 72
+; CHECK-NEXT:    [[OUT:%.*]] = or disjoint i32 [[SEL0]], [[SEL1]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %bitop0 = and i32 %in, 1
+  %cmp0 = icmp eq i32 %bitop0, 0
+  %mask = and i32 %in, 2
+  %sel0 = select i1 %cmp0, i32 0, i32 73
+  %sel1 = mul i32 %mask, 72
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @add_select_cmp_and_mul_mismatch(i32 %in) {
+; CHECK-LABEL: @add_select_cmp_and_mul_mismatch(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[IN:%.*]] to i1
+; CHECK-NEXT:    [[SEL0:%.*]] = select i1 [[TMP1]], i32 73, i32 0
+; CHECK-NEXT:    [[MASK1:%.*]] = and i32 [[IN]], 2
+; CHECK-NEXT:    [[SEL1:%.*]] = mul nuw nsw i32 [[MASK1]], 72
+; CHECK-NEXT:    [[OUT:%.*]] = or disjoint i32 [[SEL0]], [[SEL1]]
+; CHECK-NEXT:    ret i32 [[OUT]]
+;
+  %mask0 = and i32 %in, 1
+  %sel0 = mul i32 %mask0, 73
+  %mask1 = and i32 %in, 2
+  %sel1 = mul i32 %mask1, 72
+  %out = or disjoint i32 %sel0, %sel1
+  ret i32 %out
+}
+
+define i32 @and_mul_non_disjoint(i32 %in) {
+; CHECK-LABEL: @and_mul_non_disjoint(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[IN:%.*]], 2
+; CHECK-NEXT:    [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72
+; CHECK-NEXT:    [[MASK1:%.*]] = and i32 [[IN]], 4
+; CHECK-NEXT:    [[SEL1:%.*]] = mul nuw nsw i32 [[MASK1]], 72
+; CHECK-NEXT:    [[OUT1:%.*]] = or i32 [[OUT]], [[SEL1]]
+; CHECK-NEXT:    ret i32 [[OUT1]]
+;
+  %mask0 = and i32 %in, 2
+  %sel0 = mul i32 %mask0, 72
+  %mask1 = and i32 %in, 4
+  %sel1 = mul i32 %mask1, 72
+  %out = or i32 %sel0, %sel1
+  ret i32 %out
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CONSTSPLAT: {{.*}}
 ; CONSTVEC: {{.*}}

From c4316180418ce8de4b4c9812c7fac791d55b6102 Mon Sep 17 00:00:00 2001
From: Shunsuke Watanabe <watanabe.shu-06@fujitsu.com>
Date: Thu, 12 Jun 2025 10:19:26 +0900
Subject: [PATCH 168/851] [Clang][Driver] Override complex number calculation
 method by -fno-fast-math (#132680)

This patch fixes a bug where -fno-fast-math doesn't revert the complex
number calculation method to the default. The priority of overriding
options related to complex number calculations differs slightly from
GCC, as discussed in:


https://discourse.llvm.org/t/the-priority-of-fno-fast-math-regarding-complex-number-calculations/84679
---
 clang/lib/Driver/ToolChains/Clang.cpp |  22 +++++-
 clang/test/Driver/range.c             | 100 +++++++++++++++++++++++---
 2 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index a74fa81f3cf5b..1d11be1d82be8 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -2831,8 +2831,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
   StringRef Float16ExcessPrecision = "";
   StringRef BFloat16ExcessPrecision = "";
   LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_None;
-  std::string ComplexRangeStr = "";
-  std::string GccRangeComplexOption = "";
+  std::string ComplexRangeStr;
+  std::string GccRangeComplexOption;
+  std::string LastComplexRangeOption;
 
   auto setComplexRange = [&](LangOptions::ComplexRangeKind NewRange) {
     // Warn if user expects to perform full implementation of complex
@@ -2916,6 +2917,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
           EmitComplexRangeDiag(D, GccRangeComplexOption, "-fcx-limited-range");
       }
       GccRangeComplexOption = "-fcx-limited-range";
+      LastComplexRangeOption = A->getSpelling();
       Range = LangOptions::ComplexRangeKind::CX_Basic;
       break;
     case options::OPT_fno_cx_limited_range:
@@ -2929,6 +2931,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
                                "-fno-cx-limited-range");
       }
       GccRangeComplexOption = "-fno-cx-limited-range";
+      LastComplexRangeOption = A->getSpelling();
       Range = LangOptions::ComplexRangeKind::CX_Full;
       break;
     case options::OPT_fcx_fortran_rules:
@@ -2938,6 +2941,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       else
         EmitComplexRangeDiag(D, GccRangeComplexOption, "-fcx-fortran-rules");
       GccRangeComplexOption = "-fcx-fortran-rules";
+      LastComplexRangeOption = A->getSpelling();
       Range = LangOptions::ComplexRangeKind::CX_Improved;
       break;
     case options::OPT_fno_cx_fortran_rules:
@@ -2950,6 +2954,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
                                "-fno-cx-fortran-rules");
       }
       GccRangeComplexOption = "-fno-cx-fortran-rules";
+      LastComplexRangeOption = A->getSpelling();
       Range = LangOptions::ComplexRangeKind::CX_Full;
       break;
     case options::OPT_fcomplex_arithmetic_EQ: {
@@ -2984,6 +2989,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
                                  ComplexArithmeticStr(RangeVal));
         }
       }
+      LastComplexRangeOption =
+          Args.MakeArgString(A->getSpelling() + A->getValue());
       Range = RangeVal;
       break;
     }
@@ -3037,6 +3044,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       } else
         D.Diag(diag::err_drv_unsupported_option_argument)
             << A->getSpelling() << Val;
+      LastComplexRangeOption = A->getSpelling();
       break;
     }
 
@@ -3222,6 +3230,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       [[fallthrough]];
     case options::OPT_ffast_math:
       applyFastMath(true);
+      LastComplexRangeOption = A->getSpelling();
       if (A->getOption().getID() == options::OPT_Ofast)
         LastFpContractOverrideOption = "-Ofast";
       else
@@ -3239,6 +3248,15 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
       ApproxFunc = false;
       SignedZeros = true;
       restoreFPContractState();
+      // If the last specified option related to complex range is not
+      // -ffast-math or -ffp-model=, emit warning.
+      if (LastComplexRangeOption != "-ffast-math" &&
+          LastComplexRangeOption != "-ffp-model=" &&
+          Range != LangOptions::ComplexRangeKind::CX_Full)
+        EmitComplexRangeDiag(D, LastComplexRangeOption, "-fno-fast-math");
+      Range = LangOptions::ComplexRangeKind::CX_None;
+      LastComplexRangeOption = "";
+      GccRangeComplexOption = "";
       LastFpContractOverrideOption = "";
       break;
     } // End switch (A->getOption().getID())
diff --git a/clang/test/Driver/range.c b/clang/test/Driver/range.c
index da5748d7c723c..30140f3c208e0 100644
--- a/clang/test/Driver/range.c
+++ b/clang/test/Driver/range.c
@@ -177,14 +177,83 @@
 // RUN: %clang -### -target x86_64 -ffast-math -fcomplex-arithmetic=basic -c %s 2>&1 \
 // RUN:   | FileCheck --check-prefix=BASIC %s
 
-// BASIC: -complex-range=basic
-// FULL: -complex-range=full
-// PRMTD: -complex-range=promoted
-// BASIC-NOT: -complex-range=improved
-// CHECK-NOT: -complex-range=basic
-// IMPRVD: -complex-range=improved
-// IMPRVD-NOT: -complex-range=basic
-// CHECK-NOT: -complex-range=improved
+// RUN: %clang -### --target=x86_64 -fcx-limited-range -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN21 %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-cx-limited-range -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### --target=x86_64 -fcx-fortran-rules -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN22 %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-cx-fortran-rules -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffast-math -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=basic -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN23 %s
+
+// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=promoted -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN24 %s
+
+// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=improved -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN25 %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fcomplex-arithmetic=full -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffp-model=aggressive -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffp-model=fast -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffp-model=precise -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -ffp-model=strict -fno-fast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcx-limited-range \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fno-cx-limited-range \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcx-fortran-rules \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=IMPRVD %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fno-cx-fortran-rules \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffast-math \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=basic \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=promoted \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=PRMTD %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=improved \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=IMPRVD %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=full \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=aggressive \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=fast \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=PRMTD %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=precise \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
+
+// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=strict \
+// RUN:   -c %s 2>&1 | FileCheck --check-prefixes=FULL %s
 
 // WARN1: warning: overriding '-fcx-limited-range' option with '-fcx-fortran-rules' [-Woverriding-option]
 // WARN2: warning: overriding '-fno-cx-limited-range' option with '-fcx-fortran-rules' [-Woverriding-option]
@@ -196,5 +265,20 @@
 // WARN14: overriding '-complex-range=promoted' option with '-fcx-limited-range' [-Woverriding-option]
 // WARN17: warning: overriding '-fcomplex-arithmetic=full' option with '-fcomplex-arithmetic=basic' [-Woverriding-option]
 // WARN20: warning: overriding '-fcx-fortran-rules' option with '-fcx-limited-range' [-Woverriding-option]
+// WARN21: warning: overriding '-fcx-limited-range' option with '-fno-fast-math' [-Woverriding-option]
+// WARN22: warning: overriding '-fcx-fortran-rules' option with '-fno-fast-math' [-Woverriding-option]
+// WARN23: warning: overriding '-fcomplex-arithmetic=basic' option with '-fno-fast-math' [-Woverriding-option]
+// WARN24: warning: overriding '-fcomplex-arithmetic=promoted' option with '-fno-fast-math' [-Woverriding-option]
+// WARN25: warning: overriding '-fcomplex-arithmetic=improved' option with '-fno-fast-math' [-Woverriding-option]
+
+// BASIC: -complex-range=basic
+// FULL: -complex-range=full
+// PRMTD: -complex-range=promoted
+// BASIC-NOT: -complex-range=improved
+// CHECK-NOT: -complex-range=basic
+// IMPRVD: -complex-range=improved
+// IMPRVD-NOT: -complex-range=basic
+// CHECK-NOT: -complex-range=improved
+// RANGE-NOT: -complex-range=
 
 // ERR: error: unsupported argument 'foo' to option '-fcomplex-arithmetic='

From 52360d195b85608c677d781272534dfa61e9a1c3 Mon Sep 17 00:00:00 2001
From: Longsheng Mou <longshengmou@gmail.com>
Date: Thu, 12 Jun 2025 09:27:27 +0800
Subject: [PATCH 169/851] [NFC] Use `llvm::includes` instead of `std::includes`
 (#143542)

This PR follows up #143297.
---
 clang-tools-extra/clangd/refactor/Rename.cpp              | 2 +-
 llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 6 ++----
 llvm/tools/sancov/sancov.cpp                              | 3 +--
 llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp              | 4 ++--
 llvm/unittests/ADT/DeltaAlgorithmTest.cpp                 | 4 ++--
 llvm/utils/TableGen/AsmMatcherEmitter.cpp                 | 3 +--
 llvm/utils/TableGen/Common/CodeGenRegisters.cpp           | 7 ++-----
 7 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp
index d9b73b83e902a..c56375b1a98d3 100644
--- a/clang-tools-extra/clangd/refactor/Rename.cpp
+++ b/clang-tools-extra/clangd/refactor/Rename.cpp
@@ -1308,7 +1308,7 @@ getMappedRanges(ArrayRef<Range> Indexed, ArrayRef<SymbolRange> Lexed) {
     return std::nullopt;
   }
   // Fast check for the special subset case.
-  if (std::includes(Indexed.begin(), Indexed.end(), Lexed.begin(), Lexed.end()))
+  if (llvm::includes(Indexed, Lexed))
     return Lexed.vec();
 
   std::vector<size_t> Best;
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index d94a2fbb23d23..61fef1387d82a 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1975,12 +1975,10 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2,
   auto V1Elems = ShadowElements.find(V1);
   auto V2Elems = ShadowElements.find(V2);
   if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
-    if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
-                      V2Elems->second.begin(), V2Elems->second.end())) {
+    if (llvm::includes(V1Elems->second, V2Elems->second)) {
       return collapseToPrimitiveShadow(V1, Pos);
     }
-    if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
-                      V1Elems->second.begin(), V1Elems->second.end())) {
+    if (llvm::includes(V2Elems->second, V1Elems->second)) {
       return collapseToPrimitiveShadow(V2, Pos);
     }
   } else if (V1Elems != ShadowElements.end()) {
diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp
index 2cc84b47de6b9..aebb5effd0be7 100644
--- a/llvm/tools/sancov/sancov.cpp
+++ b/llvm/tools/sancov/sancov.cpp
@@ -889,8 +889,7 @@ symbolize(const RawCoverage &Data, const std::string ObjectFile) {
   }
 
   std::set<uint64_t> AllAddrs = findCoveragePointAddrs(ObjectFile);
-  if (!std::includes(AllAddrs.begin(), AllAddrs.end(), Data.Addrs->begin(),
-                     Data.Addrs->end())) {
+  if (!llvm::includes(AllAddrs, *Data.Addrs)) {
     fail("Coverage points in binary and .sancov file do not match.");
   }
   Coverage->Points = getCoveragePoints(ObjectFile, AllAddrs, *Data.Addrs);
diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp
index 66a67d96d1532..f543947899393 100644
--- a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp
+++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/DAGDeltaAlgorithm.h"
+#include "llvm/ADT/STLExtras.h"
 #include "gtest/gtest.h"
 #include <algorithm>
 #include <cstdarg>
@@ -23,8 +24,7 @@ class FixedDAGDeltaAlgorithm : public DAGDeltaAlgorithm {
 protected:
   bool ExecuteOneTest(const changeset_ty &Changes) override {
     ++NumTests;
-    return std::includes(Changes.begin(), Changes.end(),
-                         FailingSet.begin(), FailingSet.end());
+    return llvm::includes(Changes, FailingSet);
   }
 
 public:
diff --git a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp
index 5e284129180a0..24e18f42eb33c 100644
--- a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp
+++ b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/DeltaAlgorithm.h"
+#include "llvm/ADT/STLExtras.h"
 #include "gtest/gtest.h"
 #include <algorithm>
 #include <cstdarg>
@@ -38,8 +39,7 @@ class FixedDeltaAlgorithm final : public DeltaAlgorithm {
 protected:
   bool ExecuteOneTest(const changeset_ty &Changes) override {
     ++NumTests;
-    return std::includes(Changes.begin(), Changes.end(),
-                         FailingSet.begin(), FailingSet.end());
+    return llvm::includes(Changes, FailingSet);
   }
 
 public:
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 9792eb41ea5d7..32098e96ce721 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -1330,8 +1330,7 @@ void AsmMatcherInfo::buildRegisterClasses(
   for (const RegisterSet &RS : RegisterSets) {
     ClassInfo *CI = RegisterSetClasses[RS];
     for (const RegisterSet &RS2 : RegisterSets)
-      if (RS != RS2 && std::includes(RS2.begin(), RS2.end(), RS.begin(),
-                                     RS.end(), LessRecordByID()))
+      if (RS != RS2 && llvm::includes(RS2, RS, LessRecordByID()))
         CI->SuperClasses.push_back(RegisterSetClasses[RS2]);
   }
 
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index 4d24eb3de1ed9..f52c21e97f9c8 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -933,9 +933,7 @@ bool CodeGenRegisterClass::Key::operator<(
 static bool testSubClass(const CodeGenRegisterClass *A,
                          const CodeGenRegisterClass *B) {
   return A->RSI.isSubClassOf(B->RSI) &&
-         std::includes(A->getMembers().begin(), A->getMembers().end(),
-                       B->getMembers().begin(), B->getMembers().end(),
-                       deref<std::less<>>());
+         llvm::includes(A->getMembers(), B->getMembers(), deref<std::less<>>());
 }
 
 /// Sorting predicate for register classes.  This provides a topological
@@ -1990,8 +1988,7 @@ findRegUnitSet(const std::vector<RegUnitSet> &UniqueSets,
 // Return true if the RUSubSet is a subset of RUSuperSet.
 static bool isRegUnitSubSet(const std::vector<unsigned> &RUSubSet,
                             const std::vector<unsigned> &RUSuperSet) {
-  return std::includes(RUSuperSet.begin(), RUSuperSet.end(), RUSubSet.begin(),
-                       RUSubSet.end());
+  return llvm::includes(RUSuperSet, RUSubSet);
 }
 
 /// Iteratively prune unit sets. Prune subsets that are close to the superset,

From 082251bba4effea7f60191c6cbddacb3705c07db Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 11 Jun 2025 21:49:01 -0400
Subject: [PATCH 170/851] [AArch64] fix trampoline implementation: use X15
 (#126743)

AAPCS64 reserves any of X9-X15 for a compiler to choose to use for this
purpose, and says not to use X16 or X18 like GCC (and the previous
implementation) chose to use. The X18 register may need to get used by
the kernel in some circumstances, as specified by the platform ABI, so
it is generally an unwise choice. Simply choosing a different register
fixes the problem of this being broken on any platform that actually
follows the platform ABI (which is all of them except EABI, if I am
reading this linux kernel bug correctly
https://lkml2.uits.iu.edu/hypermail/linux/kernel/2001.2/01502.html). As
a side benefit, also generate slightly better code and avoids needing
the compiler-rt to be present. I did that by following the XCore
implementation instead of PPC (although in hindsight, following the
RISCV might have been slightly more readable). That X18 is wrong to use
for this purpose has been known for many years (e.g.
https://www.mail-archive.com/gcc@gcc.gnu.org/msg76934.html) and also
known that fixing this to use one of the correct registers is not an ABI
break, since this only appears inside of a translation unit. Some of the
other temporary registers (e.g. X9) are already reserved inside llvm for
internal use as a generic temporary register in the prologue before
saving registers, while X15 was already used in rare cases as a scratch
register in the prologue as well, so I felt that seemed the most logical
choice to choose here.
---
 compiler-rt/lib/builtins/README.txt           |   5 -
 compiler-rt/lib/builtins/trampoline_setup.c   |  42 ---
 .../builtins/Unit/trampoline_setup_test.c     |   2 +-
 .../lib/Optimizer/CodeGen/BoxedProcedure.cpp  |   8 +-
 flang/test/Fir/boxproc.fir                    |   4 +-
 .../AArch64/AArch64CallingConvention.td       |  25 +-
 .../Target/AArch64/AArch64FrameLowering.cpp   |  85 ++++--
 .../Target/AArch64/AArch64ISelLowering.cpp    |  97 ++++---
 llvm/lib/TargetParser/Triple.cpp              |   2 -
 llvm/test/CodeGen/AArch64/nest-register.ll    |  16 +-
 .../AArch64/statepoint-call-lowering.ll       |   2 +-
 llvm/test/CodeGen/AArch64/trampoline.ll       | 257 +++++++++++++++++-
 llvm/test/CodeGen/AArch64/win64cc-x18.ll      |  27 +-
 .../CodeGen/AArch64/zero-call-used-regs.ll    |  16 +-
 14 files changed, 421 insertions(+), 167 deletions(-)

diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt
index 19f26c92a0f94..2d213d95f333a 100644
--- a/compiler-rt/lib/builtins/README.txt
+++ b/compiler-rt/lib/builtins/README.txt
@@ -272,11 +272,6 @@ switch32
 switch8
 switchu8
 
-// This function generates a custom trampoline function with the specific
-// realFunc and localsPtr values.
-void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
-                        const void* realFunc, void* localsPtr);
-
 // There is no C interface to the *_vfp_d8_d15_regs functions.  There are
 // called in the prolog and epilog of Thumb1 functions.  When the C++ ABI use
 // SJLJ for exceptions, each function with a catch clause or destructors needs
diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c
index 830e25e4c0303..844eb27944142 100644
--- a/compiler-rt/lib/builtins/trampoline_setup.c
+++ b/compiler-rt/lib/builtins/trampoline_setup.c
@@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
   __clear_cache(trampOnStack, &trampOnStack[10]);
 }
 #endif // __powerpc__ && !defined(__powerpc64__)
-
-// The AArch64 compiler generates calls to __trampoline_setup() when creating
-// trampoline functions on the stack for use with nested functions.
-// This function creates a custom 36-byte trampoline function on the stack
-// which loads x18 with a pointer to the outer function's locals
-// and then jumps to the target nested function.
-// Note: x18 is a reserved platform register on Windows and macOS.
-
-#if defined(__aarch64__) && defined(__ELF__)
-COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
-                                        int trampSizeAllocated,
-                                        const void *realFunc, void *localsPtr) {
-  // This should never happen, but if compiler did not allocate
-  // enough space on stack for the trampoline, abort.
-  if (trampSizeAllocated < 36)
-    compilerrt_abort();
-
-  // create trampoline
-  // Load realFunc into x17. mov/movk 16 bits at a time.
-  trampOnStack[0] =
-      0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
-  trampOnStack[1] =
-      0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
-  trampOnStack[2] =
-      0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
-  trampOnStack[3] =
-      0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
-  // Load localsPtr into x18
-  trampOnStack[4] =
-      0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
-  trampOnStack[5] =
-      0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
-  trampOnStack[6] =
-      0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
-  trampOnStack[7] =
-      0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
-  trampOnStack[8] = 0xd61f0220; // br x17
-
-  // Clear instruction cache.
-  __clear_cache(trampOnStack, &trampOnStack[9]);
-}
-#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64)
diff --git a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
index d51d35acaa02f..da115fe764271 100644
--- a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
+++ b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
@@ -7,7 +7,7 @@
 
 /*
  * Tests nested functions
- * The ppc and aarch64 compilers generates a call to __trampoline_setup
+ * The ppc compiler generates a call to __trampoline_setup
  * The i386 and x86_64 compilers generate a call to ___enable_execute_stack
  */
 
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 82b11ad7db32a..69bdb48146a54 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -274,12 +274,12 @@ class BoxedProcedurePass
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
-            // For AArch64, PPC32 and PPC64, the thunk is populated by a call to
+            // For PPC32 and PPC64, the thunk is populated by a call to
             // __trampoline_setup, which is defined in
             // compiler-rt/lib/builtins/trampoline_setup.c and requires the
-            // thunk size greater than 32 bytes.  For RISCV and x86_64, the
-            // thunk setup doesn't go through __trampoline_setup and fits in 32
-            // bytes.
+            // thunk size greater than 32 bytes.  For AArch64, RISCV and x86_64,
+            // the thunk setup doesn't go through __trampoline_setup and fits in
+            // 32 bytes.
             fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
             mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
             auto buffer = builder.create<AllocaOp>(loc, buffTy);
diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir
index 5d82522055adc..97d9b38ed6f40 100644
--- a/flang/test/Fir/boxproc.fir
+++ b/flang/test/Fir/boxproc.fir
@@ -3,7 +3,7 @@
 // RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy()
-// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
 // CHECK-X86:     %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
 // CHECK-PPC:     %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
@@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
 }
 
 // CHECK-LABEL: define void @_QPtest_proc_dummy_char()
-// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
+// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
 // CHECK-X86:     %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
 // CHECK-PPC:     %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
 // CHECK:         %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 920cc67273146..1b5a713bffdc9 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -28,6 +28,12 @@ class CCIfSubtarget<string F, CCAction A>
 //===----------------------------------------------------------------------===//
 
 defvar AArch64_Common = [
+  // The 'nest' parameter, if any, is passed in X15.
+  // The previous register used here (X18) is also defined to be unavailable
+  // for this purpose, while all of X9-X15 were defined to be free for LLVM to
+  // use for this, so use X15 (which LLVM often already clobbers anyways).
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
@@ -117,13 +123,7 @@ defvar AArch64_Common = [
 ];
 
 let Entry = 1 in
-def CC_AArch64_AAPCS : CallingConv<!listconcat(
-  // The 'nest' parameter, if any, is passed in X18.
-  // Darwin and Windows use X18 as the platform register and hence 'nest' isn't
-  // currently supported there.
-  [CCIfNest<CCAssignToReg<[X18]>>],
-  AArch64_Common
-)>;
+def CC_AArch64_AAPCS : CallingConv<AArch64_Common>;
 
 let Entry = 1 in
 def RetCC_AArch64_AAPCS : CallingConv<[
@@ -177,6 +177,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
 // a stack layout compatible with the x64 calling convention.
 let Entry = 1 in
 def CC_AArch64_Arm64EC_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   // Convert small floating-point values to integer.
   CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -353,6 +355,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
 //     + Stack slots are sized as needed rather than being at least 64-bit.
 let Entry = 1 in
 def CC_AArch64_DarwinPCS : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -427,6 +431,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
 
 let Entry = 1 in
 def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -450,6 +456,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
 // same as the normal Darwin VarArgs handling.
 let Entry = 1 in
 def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
 
@@ -494,6 +502,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
 
 let Entry = 1 in
 def CC_AArch64_GHC : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
 
   // Handle all vector types as either f64 or v2f64.
@@ -522,6 +532,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
 
   // We can pass arguments in all general registers, except:
   // - X8, used for sret
+  // - X15 (on Windows), used as a temporary register in the prologue when allocating call frames
   // - X16/X17, used by the linker as IP0/IP1
   // - X18, the platform register
   // - X19, the base pointer
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 3335ee04bb0e0..2650c621e19f6 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -331,7 +331,9 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
 static bool produceCompactUnwindFrame(MachineFunction &MF);
 static bool needsWinCFI(const MachineFunction &MF);
 static StackOffset getSVEStackSize(const MachineFunction &MF);
-static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB);
+static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+                                                 bool HasCall = false);
+static bool requiresSaveVG(const MachineFunction &MF);
 
 /// Returns true if a homogeneous prolog or epilog code can be emitted
 /// for the size optimization. If possible, a frame helper call is injected.
@@ -1006,6 +1008,16 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
   }
 }
 
+static bool windowsRequiresStackProbe(const MachineFunction &MF,
+                                      uint64_t StackSizeInBytes) {
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
+  // TODO: When implementing stack protectors, take that into account
+  // for the probe threshold.
+  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
+         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
+}
+
 static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
                                    const MachineBasicBlock &MBB) {
   const MachineFunction *MF = MBB.getParent();
@@ -1027,7 +1039,8 @@ static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
 // but we would then have to make sure that we were in fact saving at least one
 // callee-save register in the prologue, which is additional complexity that
 // doesn't seem worth the benefit.
-static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
+static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
+                                                 bool HasCall) {
   MachineFunction *MF = MBB->getParent();
 
   // If MBB is an entry block, use X9 as the scratch register
@@ -1041,6 +1054,11 @@ static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
   const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
   LivePhysRegs LiveRegs(TRI);
   getLiveRegsForEntryMBB(LiveRegs, *MBB);
+  if (HasCall) {
+    LiveRegs.addReg(AArch64::X16);
+    LiveRegs.addReg(AArch64::X17);
+    LiveRegs.addReg(AArch64::X18);
+  }
 
   // Prefer X9 since it was historically used for the prologue scratch reg.
   const MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -1081,23 +1099,18 @@ bool AArch64FrameLowering::canUseAsPrologue(
       MBB.isLiveIn(AArch64::NZCV))
     return false;
 
-  // Don't need a scratch register if we're not going to re-align the stack or
-  // emit stack probes.
-  if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
-    return true;
-  // Otherwise, we can use any block as long as it has a scratch register
-  // available.
-  return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
-}
+  if (RegInfo->hasStackRealignment(*MF) || TLI->hasInlineStackProbe(*MF))
+    if (findScratchNonCalleeSaveRegister(TmpMBB) == AArch64::NoRegister)
+      return false;
 
-static bool windowsRequiresStackProbe(MachineFunction &MF,
-                                      uint64_t StackSizeInBytes) {
-  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
-  // TODO: When implementing stack protectors, take that into account
-  // for the probe threshold.
-  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
-         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
+  // May need a scratch register (for return value) if require making a special
+  // call
+  if (requiresSaveVG(*MF) ||
+      windowsRequiresStackProbe(*MF, std::numeric_limits<uint64_t>::max()))
+    if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister)
+      return false;
+
+  return true;
 }
 
 static bool needsWinCFI(const MachineFunction &MF) {
@@ -1378,8 +1391,8 @@ bool requiresGetVGCall(MachineFunction &MF) {
          !MF.getSubtarget<AArch64Subtarget>().hasSVE();
 }
 
-static bool requiresSaveVG(MachineFunction &MF) {
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+static bool requiresSaveVG(const MachineFunction &MF) {
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   // For Darwin platforms we don't save VG for non-SVE functions, even if SME
   // is enabled with streaming mode changes.
   if (!AFI->hasStreamingModeChanges())
@@ -2049,6 +2062,29 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     if (AFI->getSVECalleeSavedStackSize())
       report_fatal_error(
           "SVE callee saves not yet supported with stack probing");
+
+    // Find an available register to spill the value of X15 to, if X15 is being
+    // used already for nest.
+    unsigned X15Scratch = AArch64::NoRegister;
+    const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+    if (llvm::any_of(MBB.liveins(),
+                     [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+                       return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+                           AArch64::X15, LiveIn.PhysReg);
+                     })) {
+      X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true);
+      assert(X15Scratch != AArch64::NoRegister &&
+             (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
+#ifndef NDEBUG
+      LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
+#endif
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::X15, RegState::Undef)
+          .addReg(AArch64::X15, RegState::Implicit)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
     uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
     if (NeedsWinCFI) {
       HasWinCFI = true;
@@ -2171,6 +2207,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // we've set a frame pointer and already finished the SEH prologue.
       assert(!NeedsWinCFI);
     }
+    if (X15Scratch != AArch64::NoRegister) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
+          .addReg(AArch64::XZR)
+          .addReg(X15Scratch, RegState::Undef)
+          .addReg(X15Scratch, RegState::Implicit)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
   }
 
   StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
@@ -3355,7 +3398,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     unsigned X0Scratch = AArch64::NoRegister;
     if (Reg1 == AArch64::VG) {
       // Find an available register to store value of VG to.
-      Reg1 = findScratchNonCalleeSaveRegister(&MBB);
+      Reg1 = findScratchNonCalleeSaveRegister(&MBB, true);
       assert(Reg1 != AArch64::NoRegister);
       SMEAttrs Attrs = AFI->getSMEFnAttrs();
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 766599d567efd..ad5b90984188e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7126,59 +7126,80 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
 
 SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                       SelectionDAG &DAG) const {
-  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
-  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
-    report_fatal_error(
-        "ADJUST_TRAMPOLINE operation is only supported on Linux.");
-
   return Op.getOperand(0);
 }
 
 SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                     SelectionDAG &DAG) const {
-
-  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
-  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
-    report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
-
   SDValue Chain = Op.getOperand(0);
-  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
   SDValue FPtr = Op.getOperand(2); // nested function
   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
-  SDLoc dl(Op);
 
-  EVT PtrVT = getPointerTy(DAG.getDataLayout());
-  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
 
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
+  // ldr NestReg, .+16
+  // ldr x17, .+20
+  // br x17
+  // .word 0
+  // .nest: .qword nest
+  // .fptr: .qword fptr
+  SDValue OutChains[5];
 
-  Entry.Ty = IntPtrTy;
-  Entry.Node = Trmp;
-  Args.push_back(Entry);
+  const Function *Func =
+      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
+  CallingConv::ID CC = Func->getCallingConv();
+  unsigned NestReg;
 
-  if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
-    MachineFunction &MF = DAG.getMachineFunction();
-    MachineFrameInfo &MFI = MF.getFrameInfo();
-    Entry.Node =
-        DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
-  } else
-    Entry.Node = DAG.getConstant(36, dl, MVT::i64);
+  switch (CC) {
+  default:
+    NestReg = 0x0f; // X15
+  case CallingConv::ARM64EC_Thunk_Native:
+  case CallingConv::ARM64EC_Thunk_X64:
+    // Must be kept in sync with AArch64CallingConv.td
+    NestReg = 0x04; // X4
+    break;
+  }
 
-  Args.push_back(Entry);
-  Entry.Node = FPtr;
-  Args.push_back(Entry);
-  Entry.Node = Nest;
-  Args.push_back(Entry);
+  const char FptrReg = 0x11; // X17
 
-  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
-      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
+  SDValue Addr = Trmp;
 
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
-  return CallResult.second;
+  SDLoc dl(Op);
+  OutChains[0] = DAG.getStore(
+      Chain, dl, DAG.getConstant(0x58000080u | NestReg, dl, MVT::i32), Addr,
+      MachinePointerInfo(TrmpAddr));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(4, dl, MVT::i64));
+  OutChains[1] = DAG.getStore(
+      Chain, dl, DAG.getConstant(0x580000b0u | FptrReg, dl, MVT::i32), Addr,
+      MachinePointerInfo(TrmpAddr, 4));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(8, dl, MVT::i64));
+  OutChains[2] =
+      DAG.getStore(Chain, dl, DAG.getConstant(0xd61f0220u, dl, MVT::i32), Addr,
+                   MachinePointerInfo(TrmpAddr, 8));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(16, dl, MVT::i64));
+  OutChains[3] =
+      DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(24, dl, MVT::i64));
+  OutChains[4] =
+      DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
+
+  SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+
+  SDValue EndOfTrmp = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                                  DAG.getConstant(12, dl, MVT::i64));
+
+  // Call clear cache on the trampoline instructions.
+  return DAG.getNode(ISD::CLEAR_CACHE, dl, MVT::Other, StoreToken, Trmp,
+                     EndOfTrmp);
 }
 
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index bd291e1918219..5718ae385bac1 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -1754,8 +1754,6 @@ unsigned Triple::getTrampolineSize() const {
     if (isOSLinux())
       return 48;
     break;
-  case Triple::aarch64:
-    return 36;
   }
   return 32;
 }
diff --git a/llvm/test/CodeGen/AArch64/nest-register.ll b/llvm/test/CodeGen/AArch64/nest-register.ll
index 1e1c1b044bab6..2e94dfba1fa52 100644
--- a/llvm/test/CodeGen/AArch64/nest-register.ll
+++ b/llvm/test/CodeGen/AArch64/nest-register.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 ; Tests that the 'nest' parameter attribute causes the relevant parameter to be
@@ -5,18 +6,21 @@
 
 define ptr @nest_receiver(ptr nest %arg) nounwind {
 ; CHECK-LABEL: nest_receiver:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov x0, x18
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, x15
+; CHECK-NEXT:    ret
 
   ret ptr %arg
 }
 
 define ptr @nest_caller(ptr %arg) nounwind {
 ; CHECK-LABEL: nest_caller:
-; CHECK: mov x18, x0
-; CHECK-NEXT: bl nest_receiver
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x15, x0
+; CHECK-NEXT:    bl nest_receiver
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
 
   %result = call ptr @nest_receiver(ptr nest %arg)
   ret ptr %result
diff --git a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
index 9619895c450ca..32c3eaeb9c876 100644
--- a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll
@@ -207,7 +207,7 @@ define void @test_attributes(ptr byval(%struct2) %s) gc "statepoint-example" {
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    ldr x8, [sp, #64]
 ; CHECK-NEXT:    ldr q0, [sp, #48]
-; CHECK-NEXT:    mov x18, xzr
+; CHECK-NEXT:    mov x15, xzr
 ; CHECK-NEXT:    mov w0, #42 // =0x2a
 ; CHECK-NEXT:    mov w1, #17 // =0x11
 ; CHECK-NEXT:    str x8, [sp, #16]
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
index 30ac2aa283b3e..d9016b02a0f80 100644
--- a/llvm/test/CodeGen/AArch64/trampoline.ll
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -1,32 +1,265 @@
-; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK-LINUX
+; RUN: llc -mtriple=aarch64-none-eabi < %s | FileCheck %s --check-prefixes=CHECK-LINUX
+; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-PC
+; RUN: llc -mtriple=aarch64-apple-darwin < %s | FileCheck %s --check-prefixes=CHECK-APPLE
 
 @trampg = internal global [36 x i8] zeroinitializer, align 8
 
 declare void @llvm.init.trampoline(ptr, ptr, ptr);
 declare ptr @llvm.adjust.trampoline(ptr);
 
-define i64 @f(ptr nest %c, i64 %x, i64 %y) {
-  %sum = add i64 %x, %y
-  ret i64 %sum
+define ptr @f(ptr nest %x, i64 %y) {
+; CHECK-LINUX-LABEL: f:
+; CHECK-LINUX:       // %bb.0:
+; CHECK-LINUX-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-LINUX-NEXT:    sub sp, sp, #237, lsl #12 // =970752
+; CHECK-LINUX-NEXT:    sub sp, sp, #3264
+; CHECK-LINUX-NEXT:    .cfi_def_cfa_offset 974032
+; CHECK-LINUX-NEXT:    .cfi_offset w29, -16
+; CHECK-LINUX-NEXT:    add x0, x15, x0
+; CHECK-LINUX-NEXT:    add sp, sp, #237, lsl #12 // =970752
+; CHECK-LINUX-NEXT:    add sp, sp, #3264
+; CHECK-LINUX-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-LINUX-NEXT:    ret
+;
+; CHECK-PC-LABEL: f:
+; CHECK-PC:       .seh_proc f
+; CHECK-PC-NEXT:  // %bb.0:
+; CHECK-PC-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-PC-NEXT:    .seh_save_fplr_x 16
+; CHECK-PC-NEXT:    mov x9, x15
+; CHECK-PC-NEXT:    mov x15, #60876 // =0xedcc
+; CHECK-PC-NEXT:    .seh_nop
+; CHECK-PC-NEXT:    bl __chkstk
+; CHECK-PC-NEXT:    .seh_nop
+; CHECK-PC-NEXT:    sub sp, sp, x15, lsl #4
+; CHECK-PC-NEXT:    .seh_stackalloc 974016
+; CHECK-PC-NEXT:    mov x15, x9
+; CHECK-PC-NEXT:    .seh_endprologue
+; CHECK-PC-NEXT:    add x0, x15, x0
+; CHECK-PC-NEXT:    .seh_startepilogue
+; CHECK-PC-NEXT:    add sp, sp, #237, lsl #12 // =970752
+; CHECK-PC-NEXT:    .seh_stackalloc 970752
+; CHECK-PC-NEXT:    add sp, sp, #3264
+; CHECK-PC-NEXT:    .seh_stackalloc 3264
+; CHECK-PC-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-PC-NEXT:    .seh_save_fplr_x 16
+; CHECK-PC-NEXT:    .seh_endepilogue
+; CHECK-PC-NEXT:    ret
+; CHECK-PC-NEXT:    .seh_endfunclet
+; CHECK-PC-NEXT:    .seh_endproc
+;
+; CHECK-APPLE-LABEL: f:
+; CHECK-APPLE:       ; %bb.0:
+; CHECK-APPLE-NEXT:    stp x28, x27, [sp, #-16]! ; 16-byte Folded Spill
+; CHECK-APPLE-NEXT:    sub sp, sp, #237, lsl #12 ; =970752
+; CHECK-APPLE-NEXT:    sub sp, sp, #3264
+; CHECK-APPLE-NEXT:    .cfi_def_cfa_offset 974032
+; CHECK-APPLE-NEXT:    .cfi_offset w27, -8
+; CHECK-APPLE-NEXT:    .cfi_offset w28, -16
+; CHECK-APPLE-NEXT:    add x0, x15, x0
+; CHECK-APPLE-NEXT:    add sp, sp, #237, lsl #12 ; =970752
+; CHECK-APPLE-NEXT:    add sp, sp, #3264
+; CHECK-APPLE-NEXT:    ldp x28, x27, [sp], #16 ; 16-byte Folded Reload
+; CHECK-APPLE-NEXT:    ret
+  %chkstack = alloca [u0xedcba x i8]
+  %sum = getelementptr i8, ptr %x, i64 %y
+  ret ptr %sum
 }
 
 define i64 @func1() {
+; CHECK-LINUX-LABEL: func1:
+; CHECK-LINUX:       // %bb.0:
+; CHECK-LINUX-NEXT:    sub sp, sp, #64
+; CHECK-LINUX-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-LINUX-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-LINUX-NEXT:    .cfi_offset w30, -16
+; CHECK-LINUX-NEXT:    adrp x8, :got:f
+; CHECK-LINUX-NEXT:    mov w9, #544 // =0x220
+; CHECK-LINUX-NEXT:    add x0, sp, #8
+; CHECK-LINUX-NEXT:    ldr x8, [x8, :got_lo12:f]
+; CHECK-LINUX-NEXT:    movk w9, #54815, lsl #16
+; CHECK-LINUX-NEXT:    str w9, [sp, #16]
+; CHECK-LINUX-NEXT:    add x9, sp, #56
+; CHECK-LINUX-NEXT:    stp x9, x8, [sp, #24]
+; CHECK-LINUX-NEXT:    mov x8, #132 // =0x84
+; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #16
+; CHECK-LINUX-NEXT:    movk x8, #177, lsl #32
+; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #48
+; CHECK-LINUX-NEXT:    str x8, [sp, #8]
+; CHECK-LINUX-NEXT:    add x8, sp, #8
+; CHECK-LINUX-NEXT:    add x1, x8, #12
+; CHECK-LINUX-NEXT:    bl __clear_cache
+; CHECK-LINUX-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-LINUX-NEXT:    mov x0, xzr
+; CHECK-LINUX-NEXT:    add sp, sp, #64
+; CHECK-LINUX-NEXT:    ret
+;
+; CHECK-PC-LABEL: func1:
+; CHECK-PC:       .seh_proc func1
+; CHECK-PC-NEXT:  // %bb.0:
+; CHECK-PC-NEXT:    sub sp, sp, #64
+; CHECK-PC-NEXT:    .seh_stackalloc 64
+; CHECK-PC-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-PC-NEXT:    .seh_save_reg x30, 48
+; CHECK-PC-NEXT:    .seh_endprologue
+; CHECK-PC-NEXT:    adrp x8, f
+; CHECK-PC-NEXT:    add x8, x8, :lo12:f
+; CHECK-PC-NEXT:    add x9, sp, #56
+; CHECK-PC-NEXT:    stp x9, x8, [sp, #24]
+; CHECK-PC-NEXT:    mov w8, #544 // =0x220
+; CHECK-PC-NEXT:    add x0, sp, #8
+; CHECK-PC-NEXT:    movk w8, #54815, lsl #16
+; CHECK-PC-NEXT:    str w8, [sp, #16]
+; CHECK-PC-NEXT:    mov x8, #132 // =0x84
+; CHECK-PC-NEXT:    movk x8, #22528, lsl #16
+; CHECK-PC-NEXT:    movk x8, #177, lsl #32
+; CHECK-PC-NEXT:    movk x8, #22528, lsl #48
+; CHECK-PC-NEXT:    str x8, [sp, #8]
+; CHECK-PC-NEXT:    add x8, sp, #8
+; CHECK-PC-NEXT:    add x1, x8, #12
+; CHECK-PC-NEXT:    bl __clear_cache
+; CHECK-PC-NEXT:    mov x0, xzr
+; CHECK-PC-NEXT:    .seh_startepilogue
+; CHECK-PC-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-PC-NEXT:    .seh_save_reg x30, 48
+; CHECK-PC-NEXT:    add sp, sp, #64
+; CHECK-PC-NEXT:    .seh_stackalloc 64
+; CHECK-PC-NEXT:    .seh_endepilogue
+; CHECK-PC-NEXT:    ret
+; CHECK-PC-NEXT:    .seh_endfunclet
+; CHECK-PC-NEXT:    .seh_endproc
+;
+; CHECK-APPLE-LABEL: func1:
+; CHECK-APPLE:       ; %bb.0:
+; CHECK-APPLE-NEXT:    sub sp, sp, #64
+; CHECK-APPLE-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-APPLE-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-APPLE-NEXT:    .cfi_offset w30, -8
+; CHECK-APPLE-NEXT:    .cfi_offset w29, -16
+; CHECK-APPLE-NEXT:  Lloh0:
+; CHECK-APPLE-NEXT:    adrp x8, _f@PAGE
+; CHECK-APPLE-NEXT:  Lloh1:
+; CHECK-APPLE-NEXT:    add x8, x8, _f@PAGEOFF
+; CHECK-APPLE-NEXT:    add x9, sp, #40
+; CHECK-APPLE-NEXT:    stp x9, x8, [sp, #16]
+; CHECK-APPLE-NEXT:    mov w8, #544 ; =0x220
+; CHECK-APPLE-NEXT:    mov x0, sp
+; CHECK-APPLE-NEXT:    movk w8, #54815, lsl #16
+; CHECK-APPLE-NEXT:    str w8, [sp, #8]
+; CHECK-APPLE-NEXT:    mov x8, #132 ; =0x84
+; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #16
+; CHECK-APPLE-NEXT:    movk x8, #177, lsl #32
+; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #48
+; CHECK-APPLE-NEXT:    str x8, [sp]
+; CHECK-APPLE-NEXT:    mov x8, sp
+; CHECK-APPLE-NEXT:    add x1, x8, #12
+; CHECK-APPLE-NEXT:    bl ___clear_cache
+; CHECK-APPLE-NEXT:    ldp x29, x30, [sp, #48] ; 16-byte Folded Reload
+; CHECK-APPLE-NEXT:    mov x0, xzr
+; CHECK-APPLE-NEXT:    add sp, sp, #64
+; CHECK-APPLE-NEXT:    ret
+; CHECK-APPLE-NEXT:    .loh AdrpAdd Lloh0, Lloh1
   %val = alloca i64
-  %nval = bitcast ptr %val to ptr
   %tramp = alloca [36 x i8], align 8
-  ; CHECK:	mov	w1, #36
-  ; CHECK:	bl	__trampoline_setup
-  call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %nval)
+  call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %val)
   %fp = call ptr @llvm.adjust.trampoline(ptr %tramp)
   ret i64 0
 }
 
 define i64 @func2() {
+; CHECK-LINUX-LABEL: func2:
+; CHECK-LINUX:       // %bb.0:
+; CHECK-LINUX-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-LINUX-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-LINUX-NEXT:    .cfi_offset w30, -16
+; CHECK-LINUX-NEXT:    adrp x8, :got:f
+; CHECK-LINUX-NEXT:    mov w9, #544 // =0x220
+; CHECK-LINUX-NEXT:    adrp x0, trampg
+; CHECK-LINUX-NEXT:    add x0, x0, :lo12:trampg
+; CHECK-LINUX-NEXT:    ldr x8, [x8, :got_lo12:f]
+; CHECK-LINUX-NEXT:    movk w9, #54815, lsl #16
+; CHECK-LINUX-NEXT:    str w9, [x0, #8]
+; CHECK-LINUX-NEXT:    add x9, sp, #8
+; CHECK-LINUX-NEXT:    add x1, x0, #12
+; CHECK-LINUX-NEXT:    stp x9, x8, [x0, #16]
+; CHECK-LINUX-NEXT:    mov x8, #132 // =0x84
+; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #16
+; CHECK-LINUX-NEXT:    movk x8, #177, lsl #32
+; CHECK-LINUX-NEXT:    movk x8, #22528, lsl #48
+; CHECK-LINUX-NEXT:    str x8, [x0]
+; CHECK-LINUX-NEXT:    bl __clear_cache
+; CHECK-LINUX-NEXT:    mov x0, xzr
+; CHECK-LINUX-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-LINUX-NEXT:    ret
+;
+; CHECK-PC-LABEL: func2:
+; CHECK-PC:       .seh_proc func2
+; CHECK-PC-NEXT:  // %bb.0:
+; CHECK-PC-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-PC-NEXT:    .seh_save_reg_x x30, 16
+; CHECK-PC-NEXT:    .seh_endprologue
+; CHECK-PC-NEXT:    adrp x0, trampg
+; CHECK-PC-NEXT:    add x0, x0, :lo12:trampg
+; CHECK-PC-NEXT:    adrp x8, f
+; CHECK-PC-NEXT:    add x8, x8, :lo12:f
+; CHECK-PC-NEXT:    add x9, sp, #8
+; CHECK-PC-NEXT:    add x1, x0, #12
+; CHECK-PC-NEXT:    stp x9, x8, [x0, #16]
+; CHECK-PC-NEXT:    mov w8, #544 // =0x220
+; CHECK-PC-NEXT:    movk w8, #54815, lsl #16
+; CHECK-PC-NEXT:    str w8, [x0, #8]
+; CHECK-PC-NEXT:    mov x8, #132 // =0x84
+; CHECK-PC-NEXT:    movk x8, #22528, lsl #16
+; CHECK-PC-NEXT:    movk x8, #177, lsl #32
+; CHECK-PC-NEXT:    movk x8, #22528, lsl #48
+; CHECK-PC-NEXT:    str x8, [x0]
+; CHECK-PC-NEXT:    bl __clear_cache
+; CHECK-PC-NEXT:    mov x0, xzr
+; CHECK-PC-NEXT:    .seh_startepilogue
+; CHECK-PC-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-PC-NEXT:    .seh_save_reg_x x30, 16
+; CHECK-PC-NEXT:    .seh_endepilogue
+; CHECK-PC-NEXT:    ret
+; CHECK-PC-NEXT:    .seh_endfunclet
+; CHECK-PC-NEXT:    .seh_endproc
+;
+; CHECK-APPLE-LABEL: func2:
+; CHECK-APPLE:       ; %bb.0:
+; CHECK-APPLE-NEXT:    sub sp, sp, #32
+; CHECK-APPLE-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; CHECK-APPLE-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-APPLE-NEXT:    .cfi_offset w30, -8
+; CHECK-APPLE-NEXT:    .cfi_offset w29, -16
+; CHECK-APPLE-NEXT:  Lloh2:
+; CHECK-APPLE-NEXT:    adrp x0, _trampg@PAGE
+; CHECK-APPLE-NEXT:  Lloh3:
+; CHECK-APPLE-NEXT:    add x0, x0, _trampg@PAGEOFF
+; CHECK-APPLE-NEXT:  Lloh4:
+; CHECK-APPLE-NEXT:    adrp x8, _f@PAGE
+; CHECK-APPLE-NEXT:  Lloh5:
+; CHECK-APPLE-NEXT:    add x8, x8, _f@PAGEOFF
+; CHECK-APPLE-NEXT:    add x9, sp, #8
+; CHECK-APPLE-NEXT:    add x1, x0, #12
+; CHECK-APPLE-NEXT:    stp x9, x8, [x0, #16]
+; CHECK-APPLE-NEXT:    mov w8, #544 ; =0x220
+; CHECK-APPLE-NEXT:    movk w8, #54815, lsl #16
+; CHECK-APPLE-NEXT:    str w8, [x0, #8]
+; CHECK-APPLE-NEXT:    mov x8, #132 ; =0x84
+; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #16
+; CHECK-APPLE-NEXT:    movk x8, #177, lsl #32
+; CHECK-APPLE-NEXT:    movk x8, #22528, lsl #48
+; CHECK-APPLE-NEXT:    str x8, [x0]
+; CHECK-APPLE-NEXT:    bl ___clear_cache
+; CHECK-APPLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; CHECK-APPLE-NEXT:    mov x0, xzr
+; CHECK-APPLE-NEXT:    add sp, sp, #32
+; CHECK-APPLE-NEXT:    ret
+; CHECK-APPLE-NEXT:    .loh AdrpAdd Lloh4, Lloh5
+; CHECK-APPLE-NEXT:    .loh AdrpAdd Lloh2, Lloh3
   %val = alloca i64
-  %nval = bitcast ptr %val to ptr
-  ; CHECK:	mov	w1, #36
-  ; CHECK:	bl	__trampoline_setup
-  call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %nval)
+  call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %val)
   %fp = call ptr @llvm.adjust.trampoline(ptr @trampg)
   ret i64 0
 }
diff --git a/llvm/test/CodeGen/AArch64/win64cc-x18.ll b/llvm/test/CodeGen/AArch64/win64cc-x18.ll
index b3e78cc9bbb81..4b45c300e9c1d 100644
--- a/llvm/test/CodeGen/AArch64/win64cc-x18.ll
+++ b/llvm/test/CodeGen/AArch64/win64cc-x18.ll
@@ -1,35 +1,26 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;; Testing that nest uses x15 on all calling conventions (except Arm64EC)
 
-;; Testing that x18 is not clobbered when passing pointers with the nest
-;; attribute on windows
-
-; RUN: llc < %s -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=CHECK,CHECK-NO-X18
-; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-X18
+; RUN: llc < %s -mtriple=aarch64-pc-windows-msvc | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-apple-darwin- | FileCheck %s
 
 define dso_local i64 @other(ptr nest %p) #0 {
 ; CHECK-LABEL: other:
-; CHECK-X18: ldr x0, [x18]
-; CHECK-NO-X18: ldr x0, [x0]
+; CHECK:    ldr x0, [x15]
+; CHECK:    ret
   %r = load i64, ptr %p
-; CHECK: ret
   ret i64 %r
 }
 
 define dso_local void @func() #0 {
 ; CHECK-LABEL: func:
-
-
+; CHECK:    add x15, sp, #8
+; CHECK:    bl {{_?other}}
+; CHECK:    ret
 entry:
   %p = alloca i64
-; CHECK: mov w8, #1
-; CHECK: stp x30, x8, [sp, #-16]
-; CHECK-X18: add x18, sp, #8
   store i64 1, ptr %p
-; CHECK-NO-X18: add x0, sp, #8
-; CHECK: bl other
   call void @other(ptr nest %p)
-; CHECK: ldr x30, [sp], #16
-; CHECK: ret
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
index 4799ea3bcd19f..986666e015e9e 100644
--- a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
+++ b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll
@@ -93,7 +93,7 @@ define dso_local i32 @all_gpr_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c
 ; CHECK-NEXT:    mov x5, #0 // =0x0
 ; CHECK-NEXT:    mov x6, #0 // =0x0
 ; CHECK-NEXT:    mov x7, #0 // =0x0
-; CHECK-NEXT:    mov x18, #0 // =0x0
+; CHECK-NEXT:    mov x15, #0 // =0x0
 ; CHECK-NEXT:    orr w0, w8, w2
 ; CHECK-NEXT:    mov x2, #0 // =0x0
 ; CHECK-NEXT:    mov x8, #0 // =0x0
@@ -146,7 +146,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; DEFAULT-NEXT:    mov x5, #0 // =0x0
 ; DEFAULT-NEXT:    mov x6, #0 // =0x0
 ; DEFAULT-NEXT:    mov x7, #0 // =0x0
-; DEFAULT-NEXT:    mov x18, #0 // =0x0
+; DEFAULT-NEXT:    mov x15, #0 // =0x0
 ; DEFAULT-NEXT:    movi v0.2d, #0000000000000000
 ; DEFAULT-NEXT:    orr w0, w8, w2
 ; DEFAULT-NEXT:    mov x2, #0 // =0x0
@@ -169,7 +169,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; SVE-OR-SME-NEXT:    mov x5, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov x6, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov x7, #0 // =0x0
-; SVE-OR-SME-NEXT:    mov x18, #0 // =0x0
+; SVE-OR-SME-NEXT:    mov x15, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov z0.d, #0 // =0x0
 ; SVE-OR-SME-NEXT:    orr w0, w8, w2
 ; SVE-OR-SME-NEXT:    mov x2, #0 // =0x0
@@ -196,7 +196,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo
 ; STREAMING-COMPAT-NEXT:    mov x5, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    mov x6, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    mov x7, #0 // =0x0
-; STREAMING-COMPAT-NEXT:    mov x18, #0 // =0x0
+; STREAMING-COMPAT-NEXT:    mov x15, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    fmov d0, xzr
 ; STREAMING-COMPAT-NEXT:    orr w0, w8, w2
 ; STREAMING-COMPAT-NEXT:    mov x2, #0 // =0x0
@@ -492,7 +492,7 @@ define dso_local double @all_gpr_arg_float(double noundef %a, float noundef %b)
 ; CHECK-NEXT:    mov x6, #0 // =0x0
 ; CHECK-NEXT:    mov x7, #0 // =0x0
 ; CHECK-NEXT:    mov x8, #0 // =0x0
-; CHECK-NEXT:    mov x18, #0 // =0x0
+; CHECK-NEXT:    mov x15, #0 // =0x0
 ; CHECK-NEXT:    ret
 
 entry:
@@ -547,7 +547,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca
 ; DEFAULT-NEXT:    mov x6, #0 // =0x0
 ; DEFAULT-NEXT:    mov x7, #0 // =0x0
 ; DEFAULT-NEXT:    mov x8, #0 // =0x0
-; DEFAULT-NEXT:    mov x18, #0 // =0x0
+; DEFAULT-NEXT:    mov x15, #0 // =0x0
 ; DEFAULT-NEXT:    movi v1.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v2.2d, #0000000000000000
 ; DEFAULT-NEXT:    movi v3.2d, #0000000000000000
@@ -570,7 +570,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca
 ; SVE-OR-SME-NEXT:    mov x6, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov x7, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov x8, #0 // =0x0
-; SVE-OR-SME-NEXT:    mov x18, #0 // =0x0
+; SVE-OR-SME-NEXT:    mov x15, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov z1.d, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov z2.d, #0 // =0x0
 ; SVE-OR-SME-NEXT:    mov z3.d, #0 // =0x0
@@ -597,7 +597,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca
 ; STREAMING-COMPAT-NEXT:    mov x6, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    mov x7, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    mov x8, #0 // =0x0
-; STREAMING-COMPAT-NEXT:    mov x18, #0 // =0x0
+; STREAMING-COMPAT-NEXT:    mov x15, #0 // =0x0
 ; STREAMING-COMPAT-NEXT:    fmov d1, xzr
 ; STREAMING-COMPAT-NEXT:    fmov d2, xzr
 ; STREAMING-COMPAT-NEXT:    fmov d3, xzr

From bb3b8306dc226c4dc4dfde36444b43476eea66ee Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 12 Jun 2025 10:48:32 +0800
Subject: [PATCH 171/851] [NFC] [C++20] [Modules] Add a test module local
 declaration lookup

From
https://github.com/llvm/llvm-project/issues/143734, but it looks good on
trunk. Add it as tests are always good.
---
 .../Modules/module-local-declarations.cppm    | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 clang/test/Modules/module-local-declarations.cppm

diff --git a/clang/test/Modules/module-local-declarations.cppm b/clang/test/Modules/module-local-declarations.cppm
new file mode 100644
index 0000000000000..4fbcf09e4d792
--- /dev/null
+++ b/clang/test/Modules/module-local-declarations.cppm
@@ -0,0 +1,30 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/Base.cppm -emit-module-interface -o %t/Base.pcm
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fsyntax-only -verify -fprebuilt-module-path=%t
+
+//--- Base.cppm
+export module Base;
+export template <class T>
+class Base {};
+
+//--- A.cppm
+export module A;
+import Base;
+struct S {};
+
+export Base<S> a;
+
+//--- B.cppm
+// expected-no-diagnostics
+export module B;
+
+import A;
+import Base;
+
+struct S {};
+
+export Base<S> b;

From de51b2dd3c6fc995e7db56fc50b4c8dceddc0aab Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Wed, 11 Jun 2025 19:51:05 -0700
Subject: [PATCH 172/851] [lldb] Move Transport class into lldb_private (NFC)
 (#143806)

Move lldb-dap's Transport class into lldb_private so the code can be
shared between the "JSON with header" protocol used by DAP and the JSON
RPC protocol used by MCP (see [1]).

[1]: https://discourse.llvm.org/t/rfc-adding-mcp-support-to-lldb/86798
---
 lldb/include/lldb/Host/JSONTransport.h    | 126 +++++++++++++++++++
 lldb/source/Host/CMakeLists.txt           |   3 +-
 lldb/source/Host/common/JSONTransport.cpp | 147 ++++++++++++++++++++++
 lldb/tools/lldb-dap/DAP.cpp               |   7 +-
 lldb/tools/lldb-dap/Transport.cpp         | 145 +--------------------
 lldb/tools/lldb-dap/Transport.h           |  65 ++--------
 lldb/unittests/DAP/DAPTest.cpp            |   7 +-
 lldb/unittests/DAP/TestBase.cpp           |   3 +-
 lldb/unittests/DAP/TransportTest.cpp      |  16 ++-
 9 files changed, 308 insertions(+), 211 deletions(-)
 create mode 100644 lldb/include/lldb/Host/JSONTransport.h
 create mode 100644 lldb/source/Host/common/JSONTransport.cpp

diff --git a/lldb/include/lldb/Host/JSONTransport.h b/lldb/include/lldb/Host/JSONTransport.h
new file mode 100644
index 0000000000000..4db5e417ea852
--- /dev/null
+++ b/lldb/include/lldb/Host/JSONTransport.h
@@ -0,0 +1,126 @@
+//===-- JSONTransport.h ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Transport layer for encoding and decoding JSON protocol messages.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_HOST_JSONTRANSPORT_H
+#define LLDB_HOST_JSONTRANSPORT_H
+
+#include "lldb/lldb-forward.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/JSON.h"
+#include <chrono>
+#include <system_error>
+
+namespace lldb_private {
+
+class TransportEOFError : public llvm::ErrorInfo<TransportEOFError> {
+public:
+  static char ID;
+
+  TransportEOFError() = default;
+
+  void log(llvm::raw_ostream &OS) const override {
+    OS << "transport end of file reached";
+  }
+  std::error_code convertToErrorCode() const override {
+    return llvm::inconvertibleErrorCode();
+  }
+};
+
+class TransportTimeoutError : public llvm::ErrorInfo<TransportTimeoutError> {
+public:
+  static char ID;
+
+  TransportTimeoutError() = default;
+
+  void log(llvm::raw_ostream &OS) const override {
+    OS << "transport operation timed out";
+  }
+  std::error_code convertToErrorCode() const override {
+    return std::make_error_code(std::errc::timed_out);
+  }
+};
+
+class TransportClosedError : public llvm::ErrorInfo<TransportClosedError> {
+public:
+  static char ID;
+
+  TransportClosedError() = default;
+
+  void log(llvm::raw_ostream &OS) const override {
+    OS << "transport is closed";
+  }
+  std::error_code convertToErrorCode() const override {
+    return llvm::inconvertibleErrorCode();
+  }
+};
+
+/// A transport class that uses JSON for communication.
+class JSONTransport {
+public:
+  JSONTransport(lldb::IOObjectSP input, lldb::IOObjectSP output);
+  virtual ~JSONTransport() = default;
+
+  /// Transport is not copyable.
+  /// @{
+  JSONTransport(const JSONTransport &rhs) = delete;
+  void operator=(const JSONTransport &rhs) = delete;
+  /// @}
+
+  /// Writes a message to the output stream.
+  template <typename T> llvm::Error Write(const T &t) {
+    const std::string message = llvm::formatv("{0}", toJSON(t)).str();
+    return WriteImpl(message);
+  }
+
+  /// Reads the next message from the input stream.
+  template <typename T>
+  llvm::Expected<T> Read(const std::chrono::microseconds &timeout) {
+    llvm::Expected<std::string> message = ReadImpl(timeout);
+    if (!message)
+      return message.takeError();
+    return llvm::json::parse<T>(/*JSON=*/*message);
+  }
+
+protected:
+  virtual void Log(llvm::StringRef message);
+
+  virtual llvm::Error WriteImpl(const std::string &message) = 0;
+  virtual llvm::Expected<std::string>
+  ReadImpl(const std::chrono::microseconds &timeout) = 0;
+
+  lldb::IOObjectSP m_input;
+  lldb::IOObjectSP m_output;
+};
+
+/// A transport class for JSON with a HTTP header.
+class HTTPDelimitedJSONTransport : public JSONTransport {
+public:
+  HTTPDelimitedJSONTransport(lldb::IOObjectSP input, lldb::IOObjectSP output)
+      : JSONTransport(input, output) {}
+  virtual ~HTTPDelimitedJSONTransport() = default;
+
+protected:
+  virtual llvm::Error WriteImpl(const std::string &message) override;
+  virtual llvm::Expected<std::string>
+  ReadImpl(const std::chrono::microseconds &timeout) override;
+
+  // FIXME: Support any header.
+  static constexpr llvm::StringLiteral kHeaderContentLength =
+      "Content-Length: ";
+  static constexpr llvm::StringLiteral kHeaderSeparator = "\r\n\r\n";
+};
+
+} // namespace lldb_private
+
+#endif
diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt
index 5b713133afeaf..b15d72e61b6e5 100644
--- a/lldb/source/Host/CMakeLists.txt
+++ b/lldb/source/Host/CMakeLists.txt
@@ -27,8 +27,9 @@ add_host_subdirectory(common
   common/HostNativeThreadBase.cpp
   common/HostProcess.cpp
   common/HostThread.cpp
-  common/LockFileBase.cpp
+  common/JSONTransport.cpp
   common/LZMA.cpp
+  common/LockFileBase.cpp
   common/MainLoopBase.cpp
   common/MemoryMonitor.cpp
   common/MonitoringProcessLauncher.cpp
diff --git a/lldb/source/Host/common/JSONTransport.cpp b/lldb/source/Host/common/JSONTransport.cpp
new file mode 100644
index 0000000000000..103c76d25daf7
--- /dev/null
+++ b/lldb/source/Host/common/JSONTransport.cpp
@@ -0,0 +1,147 @@
+//===-- JSONTransport.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/JSONTransport.h"
+#include "lldb/Utility/IOObject.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/Utility/SelectHelper.h"
+#include "lldb/Utility/Status.h"
+#include "lldb/lldb-forward.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+using namespace lldb;
+using namespace lldb_private;
+
+/// ReadFull attempts to read the specified number of bytes. If EOF is
+/// encountered, an empty string is returned.
+static Expected<std::string>
+ReadFull(IOObject &descriptor, size_t length,
+         std::optional<std::chrono::microseconds> timeout = std::nullopt) {
+  if (!descriptor.IsValid())
+    return llvm::make_error<TransportClosedError>();
+
+  bool timeout_supported = true;
+  // FIXME: SelectHelper does not work with NativeFile on Win32.
+#if _WIN32
+  timeout_supported = descriptor.GetFdType() == IOObject::eFDTypeSocket;
+#endif
+
+  if (timeout && timeout_supported) {
+    SelectHelper sh;
+    sh.SetTimeout(*timeout);
+    sh.FDSetRead(descriptor.GetWaitableHandle());
+    Status status = sh.Select();
+    if (status.Fail()) {
+      // Convert timeouts into a specific error.
+      if (status.GetType() == lldb::eErrorTypePOSIX &&
+          status.GetError() == ETIMEDOUT)
+        return make_error<TransportTimeoutError>();
+      return status.takeError();
+    }
+  }
+
+  std::string data;
+  data.resize(length);
+  Status status = descriptor.Read(data.data(), length);
+  if (status.Fail())
+    return status.takeError();
+
+  // Read returns '' on EOF.
+  if (length == 0)
+    return make_error<TransportEOFError>();
+
+  // Return the actual number of bytes read.
+  return data.substr(0, length);
+}
+
+static Expected<std::string>
+ReadUntil(IOObject &descriptor, StringRef delimiter,
+          std::optional<std::chrono::microseconds> timeout = std::nullopt) {
+  std::string buffer;
+  buffer.reserve(delimiter.size() + 1);
+  while (!llvm::StringRef(buffer).ends_with(delimiter)) {
+    Expected<std::string> next =
+        ReadFull(descriptor, buffer.empty() ? delimiter.size() : 1, timeout);
+    if (auto Err = next.takeError())
+      return std::move(Err);
+    buffer += *next;
+  }
+  return buffer.substr(0, buffer.size() - delimiter.size());
+}
+
+JSONTransport::JSONTransport(IOObjectSP input, IOObjectSP output)
+    : m_input(std::move(input)), m_output(std::move(output)) {}
+
+void JSONTransport::Log(llvm::StringRef message) {
+  LLDB_LOG(GetLog(LLDBLog::Host), "{0}", message);
+}
+
+Expected<std::string>
+HTTPDelimitedJSONTransport::ReadImpl(const std::chrono::microseconds &timeout) {
+  if (!m_input || !m_input->IsValid())
+    return createStringError("transport output is closed");
+
+  IOObject *input = m_input.get();
+  Expected<std::string> message_header =
+      ReadFull(*input, kHeaderContentLength.size(), timeout);
+  if (!message_header)
+    return message_header.takeError();
+  if (*message_header != kHeaderContentLength)
+    return createStringError(formatv("expected '{0}' and got '{1}'",
+                                     kHeaderContentLength, *message_header)
+                                 .str());
+
+  Expected<std::string> raw_length = ReadUntil(*input, kHeaderSeparator);
+  if (!raw_length)
+    return handleErrors(raw_length.takeError(),
+                        [&](const TransportEOFError &E) -> llvm::Error {
+                          return createStringError(
+                              "unexpected EOF while reading header separator");
+                        });
+
+  size_t length;
+  if (!to_integer(*raw_length, length))
+    return createStringError(
+        formatv("invalid content length {0}", *raw_length).str());
+
+  Expected<std::string> raw_json = ReadFull(*input, length);
+  if (!raw_json)
+    return handleErrors(
+        raw_json.takeError(), [&](const TransportEOFError &E) -> llvm::Error {
+          return createStringError("unexpected EOF while reading JSON");
+        });
+
+  Log(llvm::formatv("--> {0}", *raw_json).str());
+
+  return raw_json;
+}
+
+Error HTTPDelimitedJSONTransport::WriteImpl(const std::string &message) {
+  if (!m_output || !m_output->IsValid())
+    return llvm::make_error<TransportClosedError>();
+
+  Log(llvm::formatv("<-- {0}", message).str());
+
+  std::string Output;
+  raw_string_ostream OS(Output);
+  OS << kHeaderContentLength << message.length() << kHeaderSeparator << message;
+  size_t num_bytes = Output.size();
+  return m_output->Write(Output.data(), num_bytes).takeError();
+}
+
+char TransportEOFError::ID;
+char TransportTimeoutError::ID;
+char TransportClosedError::ID;
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index b034c967594ba..9fe8227cd2d6f 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -70,6 +70,7 @@
 
 using namespace lldb_dap;
 using namespace lldb_dap::protocol;
+using namespace lldb_private;
 
 namespace {
 #ifdef _WIN32
@@ -893,14 +894,14 @@ llvm::Error DAP::Loop() {
 
         while (!disconnecting) {
           llvm::Expected<Message> next =
-              transport.Read(std::chrono::seconds(1));
-          if (next.errorIsA<EndOfFileError>()) {
+              transport.Read<protocol::Message>(std::chrono::seconds(1));
+          if (next.errorIsA<TransportEOFError>()) {
             consumeError(next.takeError());
             break;
           }
 
           // If the read timed out, continue to check if we should disconnect.
-          if (next.errorIsA<TimeoutError>()) {
+          if (next.errorIsA<TransportTimeoutError>()) {
             consumeError(next.takeError());
             continue;
           }
diff --git a/lldb/tools/lldb-dap/Transport.cpp b/lldb/tools/lldb-dap/Transport.cpp
index 4e322e9ff1358..d602920da34e3 100644
--- a/lldb/tools/lldb-dap/Transport.cpp
+++ b/lldb/tools/lldb-dap/Transport.cpp
@@ -8,152 +8,19 @@
 
 #include "Transport.h"
 #include "DAPLog.h"
-#include "Protocol/ProtocolBase.h"
-#include "lldb/Utility/IOObject.h"
-#include "lldb/Utility/SelectHelper.h"
-#include "lldb/Utility/Status.h"
 #include "lldb/lldb-forward.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
-#include <optional>
-#include <string>
-#include <utility>
 
 using namespace llvm;
 using namespace lldb;
 using namespace lldb_private;
 using namespace lldb_dap;
-using namespace lldb_dap::protocol;
 
-/// ReadFull attempts to read the specified number of bytes. If EOF is
-/// encountered, an empty string is returned.
-static Expected<std::string>
-ReadFull(IOObject &descriptor, size_t length,
-         std::optional<std::chrono::microseconds> timeout = std::nullopt) {
-  if (!descriptor.IsValid())
-    return createStringError("transport output is closed");
+Transport::Transport(llvm::StringRef client_name, lldb_dap::Log *log,
+                     lldb::IOObjectSP input, lldb::IOObjectSP output)
+    : HTTPDelimitedJSONTransport(input, output), m_client_name(client_name),
+      m_log(log) {}
 
-  bool timeout_supported = true;
-  // FIXME: SelectHelper does not work with NativeFile on Win32.
-#if _WIN32
-  timeout_supported = descriptor.GetFdType() == IOObject::eFDTypeSocket;
-#endif
-
-  if (timeout && timeout_supported) {
-    SelectHelper sh;
-    sh.SetTimeout(*timeout);
-    sh.FDSetRead(descriptor.GetWaitableHandle());
-    Status status = sh.Select();
-    if (status.Fail()) {
-      // Convert timeouts into a specific error.
-      if (status.GetType() == lldb::eErrorTypePOSIX &&
-          status.GetError() == ETIMEDOUT)
-        return make_error<TimeoutError>();
-      return status.takeError();
-    }
-  }
-
-  std::string data;
-  data.resize(length);
-  Status status = descriptor.Read(data.data(), length);
-  if (status.Fail())
-    return status.takeError();
-
-  // Read returns '' on EOF.
-  if (length == 0)
-    return make_error<EndOfFileError>();
-
-  // Return the actual number of bytes read.
-  return data.substr(0, length);
-}
-
-static Expected<std::string>
-ReadUntil(IOObject &descriptor, StringRef delimiter,
-          std::optional<std::chrono::microseconds> timeout = std::nullopt) {
-  std::string buffer;
-  buffer.reserve(delimiter.size() + 1);
-  while (!llvm::StringRef(buffer).ends_with(delimiter)) {
-    Expected<std::string> next =
-        ReadFull(descriptor, buffer.empty() ? delimiter.size() : 1, timeout);
-    if (auto Err = next.takeError())
-      return std::move(Err);
-    buffer += *next;
-  }
-  return buffer.substr(0, buffer.size() - delimiter.size());
-}
-
-/// DAP message format
-/// ```
-/// Content-Length: (?<length>\d+)\r\n\r\n(?<content>.{\k<length>})
-/// ```
-static constexpr StringLiteral kHeaderContentLength = "Content-Length: ";
-static constexpr StringLiteral kHeaderSeparator = "\r\n\r\n";
-
-namespace lldb_dap {
-
-char EndOfFileError::ID;
-char TimeoutError::ID;
-
-Transport::Transport(StringRef client_name, Log *log, IOObjectSP input,
-                     IOObjectSP output)
-    : m_client_name(client_name), m_log(log), m_input(std::move(input)),
-      m_output(std::move(output)) {}
-
-Expected<Message> Transport::Read(const std::chrono::microseconds &timeout) {
-  if (!m_input || !m_input->IsValid())
-    return createStringError("transport output is closed");
-
-  IOObject *input = m_input.get();
-  Expected<std::string> message_header =
-      ReadFull(*input, kHeaderContentLength.size(), timeout);
-  if (!message_header)
-    return message_header.takeError();
-  if (*message_header != kHeaderContentLength)
-    return createStringError(formatv("expected '{0}' and got '{1}'",
-                                     kHeaderContentLength, *message_header)
-                                 .str());
-
-  Expected<std::string> raw_length = ReadUntil(*input, kHeaderSeparator);
-  if (!raw_length)
-    return handleErrors(raw_length.takeError(),
-                        [&](const EndOfFileError &E) -> llvm::Error {
-                          return createStringError(
-                              "unexpected EOF while reading header separator");
-                        });
-
-  size_t length;
-  if (!to_integer(*raw_length, length))
-    return createStringError(
-        formatv("invalid content length {0}", *raw_length).str());
-
-  Expected<std::string> raw_json = ReadFull(*input, length);
-  if (!raw_json)
-    return handleErrors(
-        raw_json.takeError(), [&](const EndOfFileError &E) -> llvm::Error {
-          return createStringError("unexpected EOF while reading JSON");
-        });
-
-  DAP_LOG(m_log, "--> ({0}) {1}", m_client_name, *raw_json);
-
-  return json::parse<Message>(/*JSON=*/*raw_json,
-                              /*RootName=*/"protocol_message");
+void Transport::Log(llvm::StringRef message) {
+  DAP_LOG(m_log, "({0}) {1}", m_client_name, message);
 }
-
-Error Transport::Write(const Message &message) {
-  if (!m_output || !m_output->IsValid())
-    return createStringError("transport output is closed");
-
-  std::string json = formatv("{0}", toJSON(message)).str();
-
-  DAP_LOG(m_log, "<-- ({0}) {1}", m_client_name, json);
-
-  std::string Output;
-  raw_string_ostream OS(Output);
-  OS << kHeaderContentLength << json.length() << kHeaderSeparator << json;
-  size_t num_bytes = Output.size();
-  return m_output->Write(Output.data(), num_bytes).takeError();
-}
-
-} // end namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/Transport.h b/lldb/tools/lldb-dap/Transport.h
index 4e347eaa51314..51f62e718a0d0 100644
--- a/lldb/tools/lldb-dap/Transport.h
+++ b/lldb/tools/lldb-dap/Transport.h
@@ -15,70 +15,21 @@
 #define LLDB_TOOLS_LLDB_DAP_TRANSPORT_H
 
 #include "DAPForward.h"
-#include "Protocol/ProtocolBase.h"
+#include "lldb/Host/JSONTransport.h"
 #include "lldb/lldb-forward.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Error.h"
-#include <chrono>
-#include <system_error>
 
 namespace lldb_dap {
 
-class EndOfFileError : public llvm::ErrorInfo<EndOfFileError> {
-public:
-  static char ID;
-
-  EndOfFileError() = default;
-
-  void log(llvm::raw_ostream &OS) const override {
-    OS << "end of file reached";
-  }
-  std::error_code convertToErrorCode() const override {
-    return llvm::inconvertibleErrorCode();
-  }
-};
-
-class TimeoutError : public llvm::ErrorInfo<TimeoutError> {
-public:
-  static char ID;
-
-  TimeoutError() = default;
-
-  void log(llvm::raw_ostream &OS) const override {
-    OS << "operation timed out";
-  }
-  std::error_code convertToErrorCode() const override {
-    return std::make_error_code(std::errc::timed_out);
-  }
-};
-
 /// A transport class that performs the Debug Adapter Protocol communication
 /// with the client.
-class Transport {
+class Transport : public lldb_private::HTTPDelimitedJSONTransport {
 public:
-  Transport(llvm::StringRef client_name, Log *log, lldb::IOObjectSP input,
-            lldb::IOObjectSP output);
-  ~Transport() = default;
-
-  /// Transport is not copyable.
-  /// @{
-  Transport(const Transport &rhs) = delete;
-  void operator=(const Transport &rhs) = delete;
-  /// @}
-
-  /// Writes a Debug Adater Protocol message to the output stream.
-  llvm::Error Write(const protocol::Message &M);
+  Transport(llvm::StringRef client_name, lldb_dap::Log *log,
+            lldb::IOObjectSP input, lldb::IOObjectSP output);
+  virtual ~Transport() = default;
 
-  /// Reads the next Debug Adater Protocol message from the input stream.
-  ///
-  /// \param timeout[in]
-  ///     A timeout to wait for reading the initial header. Once a message
-  ///     header is recieved, this will block until the full message is
-  ///     read.
-  ///
-  /// \returns Returns the next protocol message.
-  llvm::Expected<protocol::Message>
-  Read(const std::chrono::microseconds &timeout);
+  virtual void Log(llvm::StringRef message) override;
 
   /// Returns the name of this transport client, for example `stdin/stdout` or
   /// `client_1`.
@@ -86,9 +37,7 @@ class Transport {
 
 private:
   llvm::StringRef m_client_name;
-  Log *m_log;
-  lldb::IOObjectSP m_input;
-  lldb::IOObjectSP m_output;
+  lldb_dap::Log *m_log;
 };
 
 } // namespace lldb_dap
diff --git a/lldb/unittests/DAP/DAPTest.cpp b/lldb/unittests/DAP/DAPTest.cpp
index 5fb6bf7e564ab..40ffaf87c9c45 100644
--- a/lldb/unittests/DAP/DAPTest.cpp
+++ b/lldb/unittests/DAP/DAPTest.cpp
@@ -32,7 +32,8 @@ TEST_F(DAPTest, SendProtocolMessages) {
       /*transport=*/*to_dap,
   };
   dap.Send(Event{/*event=*/"my-event", /*body=*/std::nullopt});
-  ASSERT_THAT_EXPECTED(from_dap->Read(std::chrono::milliseconds(1)),
-                       HasValue(testing::VariantWith<Event>(testing::FieldsAre(
-                           /*event=*/"my-event", /*body=*/std::nullopt))));
+  ASSERT_THAT_EXPECTED(
+      from_dap->Read<protocol::Message>(std::chrono::milliseconds(1)),
+      HasValue(testing::VariantWith<Event>(testing::FieldsAre(
+          /*event=*/"my-event", /*body=*/std::nullopt))));
 }
diff --git a/lldb/unittests/DAP/TestBase.cpp b/lldb/unittests/DAP/TestBase.cpp
index 388d1b901507e..4063b34250312 100644
--- a/lldb/unittests/DAP/TestBase.cpp
+++ b/lldb/unittests/DAP/TestBase.cpp
@@ -122,7 +122,8 @@ std::vector<Message> DAPTestBase::DrainOutput() {
   std::vector<Message> msgs;
   output.CloseWriteFileDescriptor();
   while (true) {
-    Expected<Message> next = from_dap->Read(std::chrono::milliseconds(1));
+    Expected<Message> next =
+        from_dap->Read<protocol::Message>(std::chrono::milliseconds(1));
     if (!next) {
       consumeError(next.takeError());
       break;
diff --git a/lldb/unittests/DAP/TransportTest.cpp b/lldb/unittests/DAP/TransportTest.cpp
index e6dab42e30941..aaf257993af23 100644
--- a/lldb/unittests/DAP/TransportTest.cpp
+++ b/lldb/unittests/DAP/TransportTest.cpp
@@ -26,6 +26,8 @@ using namespace lldb_dap::protocol;
 using lldb_private::File;
 using lldb_private::NativeFile;
 using lldb_private::Pipe;
+using lldb_private::TransportEOFError;
+using lldb_private::TransportTimeoutError;
 
 class TransportTest : public PipeBase {
 protected:
@@ -50,7 +52,7 @@ TEST_F(TransportTest, MalformedRequests) {
       input.Write(malformed_header.data(), malformed_header.size()),
       Succeeded());
   ASSERT_THAT_EXPECTED(
-      transport->Read(std::chrono::milliseconds(1)),
+      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
       FailedWithMessage(
           "expected 'Content-Length: ' and got 'COnTent-LenGth: '"));
 }
@@ -63,20 +65,22 @@ TEST_F(TransportTest, Read) {
   ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()),
                        Succeeded());
   ASSERT_THAT_EXPECTED(
-      transport->Read(std::chrono::milliseconds(1)),
+      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
       HasValue(testing::VariantWith<Request>(testing::FieldsAre(
           /*seq=*/1, /*command=*/"abc", /*arguments=*/std::nullopt))));
 }
 
 TEST_F(TransportTest, ReadWithTimeout) {
-  ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)),
-                       Failed<TimeoutError>());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
+      Failed<TransportTimeoutError>());
 }
 
 TEST_F(TransportTest, ReadWithEOF) {
   input.CloseWriteFileDescriptor();
-  ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)),
-                       Failed<EndOfFileError>());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
+      Failed<TransportEOFError>());
 }
 
 TEST_F(TransportTest, Write) {

From faa49d6662b4c14438cc8e63a3751c22f28d2481 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 12 Jun 2025 02:53:03 +0000
Subject: [PATCH 173/851] [gn build] Port de51b2dd3c6f

---
 llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
index ca1acf9ba8aa4..b00442d8e1ebb 100644
--- a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn
@@ -27,6 +27,7 @@ static_library("Host") {
     "common/HostNativeThreadBase.cpp",
     "common/HostProcess.cpp",
     "common/HostThread.cpp",
+    "common/JSONTransport.cpp",
     "common/LZMA.cpp",
     "common/LockFileBase.cpp",
     "common/MainLoopBase.cpp",

From d8118ed6db28a3caaf3fa4a4f8d0d51d33b09c30 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 11 Jun 2025 20:00:45 -0700
Subject: [PATCH 174/851] [ELF,test] Improve weak-undef-rw.s

---
 lld/test/ELF/weak-undef-rw.s | 54 +++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s
index bbc37ba49304a..902cad87aba9a 100644
--- a/lld/test/ELF/weak-undef-rw.s
+++ b/lld/test/ELF/weak-undef-rw.s
@@ -3,12 +3,17 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64 c.s -o c.o
-# RUN: ld.lld a.o -o nopie --export-dynamic
-# RUN: llvm-readelf -r --hex-dump=.data nopie | FileCheck %s --check-prefix=STATIC
-# RUN: ld.lld a.o -o out.pie -pie
-# RUN: llvm-readelf -r --hex-dump=.data out.pie | FileCheck %s --check-prefix=STATIC
-# RUN: ld.lld a.o -o out.so -shared
-# RUN: llvm-readobj -r out.so | FileCheck %s --check-prefix=PIC
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o s.o
+# RUN: ld.lld -shared s.o -o s.so
+
+# RUN: ld.lld a.o -o a --export-dynamic
+# RUN: llvm-readelf -r --hex-dump=.data a | FileCheck %s --check-prefix=STATIC
+# RUN: ld.lld a.o s.so -o as
+# RUN: llvm-readelf -r --hex-dump=.data as | FileCheck %s --check-prefix=STATIC
+# RUN: ld.lld a.o -o a.pie -pie
+# RUN: llvm-readelf -r --hex-dump=.data a.pie | FileCheck %s --check-prefix=STATIC
+# RUN: ld.lld a.o -o a.so -shared
+# RUN: llvm-readelf -r a.so | FileCheck %s --check-prefix=DYN
 
 ## gABI leaves the behavior of weak undefined references implementation defined.
 ## We choose to resolve them statically for static linking and produce dynamic relocations
@@ -19,35 +24,44 @@
 
 # STATIC:      no relocations
 # STATIC:      Hex dump of section '.data':
-# STATIC-NEXT: {{.*}} 00000000 00000000 .
+# STATIC-NEXT: {{.*}} 00000000 00000000 03000000 00000000 .
 # STATIC-EMPTY:
 
-# PIC:      .rela.dyn {
-# PIC-NEXT:   R_X86_64_64 foobar 0x0
-# PIC-NEXT: }
+# DYN:        Relocation section '.rela.dyn' {{.*}} contains 2
+# DYN:        R_X86_64_64 0000000000000000 foobar + 0{{$}}
 
-# RUN: ld.lld a.o b.o -o out1 -z undefs
-# RUN: llvm-readelf -r -x .data out1 | FileCheck %s --check-prefix=STATIC1
-# RUN: ld.lld a.o b.o -o out1.pie -pie -z undefs
-# RUN: llvm-readelf -r -x .data out1.pie | FileCheck %s --check-prefix=STATIC1
+# RUN: ld.lld a.o b.o -o ab -z undefs
+# RUN: llvm-readelf -r -x .data ab | FileCheck %s --check-prefix=STATIC1
+# RUN: ld.lld a.o b.o s.so -o abs -z undefs
+# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=DYN1
+# RUN: ld.lld a.o b.o -o abs.pie -pie -z undefs
+# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=STATIC1
 
 # STATIC1:      no relocations
 # STATIC1:      Hex dump of section '.data':
-# STATIC1-NEXT: {{.*}} 00000000 00000000 00000000 00000000 .
+# STATIC1-NEXT: {{.*}} 00000000 00000000 03000000 00000000 .
+# STATIC1-NEXT: {{.*}} 05000000 00000000                   .
 # STATIC1-EMPTY:
 
+# DYN1:        Relocation section '.rela.dyn' {{.*}} contains 1
+# DYN1:        Hex dump of section '.data':
+# DYN1-NEXT:   {{.*}} 00000000 00000000 03000000 00000000 .
+# DYN1-NEXT:   {{.*}} 00000000 00000000                   .
+# DYN1-EMPTY:
+
 # RUN: ld.lld a.o b.o c.o -pie -z undefs 2>&1 | count 0
 
 #--- a.s
-        .global _start
+.global _start
 _start:
-        .data
-        .weak foobar
-        .quad foobar
+.data
+.weak foobar
+.quad foobar
+.quad foobar+3
 
 #--- b.s
 .data
-.quad undef
+.quad undef+5
 
 #--- c.s
 call undef

From b46f34452e9dec50eee6ddbe07875f05e421a81c Mon Sep 17 00:00:00 2001
From: Khem Raj <raj.khem@gmail.com>
Date: Wed, 11 Jun 2025 20:22:08 -0700
Subject: [PATCH 175/851] libunwind: Do not use  __attribute__((target("gcs")))
 with non-clang compilers (#138077)

This attribute is unsupported in GCC, so far it worked because before
GCC15 did not define this macros in _CHKFEAT_GCS in arm_acle.h [1]

With gcc15 compiler libunwind's check for this macros is succeeding and
it ends up enabling 'gcs' by using function attribute, this works with
clang but not with gcc.

We can see this in rust compiler bootstrap for aarch64/musl when system
uses gcc15, it ends up with these errors

Building libunwind.a for aarch64-poky-linux-musl
```
cargo:warning=/mnt/b/yoe/master/sources/poky/build/tmp/work/cortexa57-poky-linux-musl/rust/1.85.1/rustc-1.85.1-src/src/llvm-project/libunwind/src/UnwindLevel1.c:191:1: error: arch extension 'gcs' should be prefixed by '+' cargo:warning=  191 | unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) {
cargo:warning=      | ^~~~~~~~~~~~~
cargo:warning=/mnt/b/yoe/master/sources/poky/build/tmp/work/cortexa57-poky-linux-musl/rust/1.85.1/rustc-1.85.1-src/src/llvm-project/libunwind/src/UnwindLevel1.c:337:22: error: arch extension 'gcs' should be prefixed by '+'
cargo:warning=  337 |                      _Unwind_Stop_Fn stop, void *stop_parameter) {
cargo:warning=      |                      ^~~~~~~~~~~~~~~
```

[1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=5a6af707f0af

Signed-off-by: Khem Raj <raj.khem@gmail.com>
---
 libunwind/src/UnwindLevel1.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c
index a258a832a9c31..f3b451ad9b730 100644
--- a/libunwind/src/UnwindLevel1.c
+++ b/libunwind/src/UnwindLevel1.c
@@ -188,10 +188,11 @@ extern int __unw_step_stage2(unw_cursor_t *);
 
 #if defined(_LIBUNWIND_USE_GCS)
 // Enable the GCS target feature to permit gcspop instructions to be used.
-__attribute__((target("gcs")))
+__attribute__((target("+gcs")))
 #endif
 static _Unwind_Reason_Code
-unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) {
+unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor,
+              _Unwind_Exception *exception_object) {
   __unw_init_local(cursor, uc);
 
   _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_obj=%p)",
@@ -332,12 +333,12 @@ unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *except
 
 #if defined(_LIBUNWIND_USE_GCS)
 // Enable the GCS target feature to permit gcspop instructions to be used.
-__attribute__((target("gcs")))
+__attribute__((target("+gcs")))
 #endif
 static _Unwind_Reason_Code
 unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor,
-                     _Unwind_Exception *exception_object,
-                     _Unwind_Stop_Fn stop, void *stop_parameter) {
+                     _Unwind_Exception *exception_object, _Unwind_Stop_Fn stop,
+                     void *stop_parameter) {
   __unw_init_local(cursor, uc);
 
   // uc is initialized by __unw_getcontext in the parent frame. The first stack
@@ -443,7 +444,6 @@ unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor,
   return _URC_FATAL_PHASE2_ERROR;
 }
 
-
 /// Called by __cxa_throw.  Only returns if there is a fatal error.
 _LIBUNWIND_EXPORT _Unwind_Reason_Code
 _Unwind_RaiseException(_Unwind_Exception *exception_object) {

From a71210e5abdbae80363cb5956a24a2004f625ca6 Mon Sep 17 00:00:00 2001
From: Kewen12 <Kewen.Meng@amd.com>
Date: Wed, 11 Jun 2025 20:24:56 -0700
Subject: [PATCH 176/851] Revert "[libc] Fix stdio tests after #143802"
 (#143824)

Reverts llvm/llvm-project#143810

This PR breaks our buildbot:
https://lab.llvm.org/buildbot/#/builders/10/builds/7159 revert to
unblock downstream merge.
---
 libc/docs/configure.rst                     | 2 +-
 libc/test/src/stdio/fgetc_test.cpp          | 1 -
 libc/test/src/stdio/fgetc_unlocked_test.cpp | 1 -
 libc/test/src/stdio/fgets_test.cpp          | 1 -
 libc/test/src/stdio/setvbuf_test.cpp        | 1 -
 5 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 109412225634f..8d53390ae19bf 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -29,7 +29,7 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack.
     - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
 * **"errno" options**
-    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE.
+    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
 * **"general" options**
     - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
 * **"math" options**
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 1faa49112fb63..7c652f666a8f3 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -33,7 +33,6 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    ASSERT_ERRNO_FAILURE();
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 7b2efe642fb5e..f4471dd82df15 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -36,7 +36,6 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    ASSERT_ERRNO_FAILURE();
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index 2d7c68d490811..c00a9256af52d 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -36,7 +36,6 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  ASSERT_ERRNO_FAILURE();
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index a0936ba79ef73..4144bc1bef447 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,7 +11,6 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"

From 968d8eaa44c500259fe8d56ad77ec1c71cad35e2 Mon Sep 17 00:00:00 2001
From: Yang Zaizhou <91008302+Mxfg-incense@users.noreply.github.com>
Date: Thu, 12 Jun 2025 11:28:57 +0800
Subject: [PATCH 177/851] [OpenMP][Flang]Fix omp_get_cancellation return type
 from integer to logical (#142990)

---
 openmp/runtime/src/include/omp_lib.F90.var | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/runtime/src/include/omp_lib.F90.var b/openmp/runtime/src/include/omp_lib.F90.var
index 3463b698291e1..20639f60b5d97 100644
--- a/openmp/runtime/src/include/omp_lib.F90.var
+++ b/openmp/runtime/src/include/omp_lib.F90.var
@@ -399,7 +399,7 @@
 
           function omp_get_cancellation() bind(c)
             use omp_lib_kinds
-            integer (kind=omp_integer_kind) omp_get_cancellation
+            logical (kind=omp_logical_kind) omp_get_cancellation
           end function omp_get_cancellation
 
           function omp_is_initial_device() bind(c)

From 2fcaa00d1e2317a90c9071b735eb0e758b5dd58b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 11 Jun 2025 20:37:15 -0700
Subject: [PATCH 178/851] [ELF] -z undefs: handle relocations referencing
 undefined non-weak like undefined weak

* Merge the special case into isStaticLinkTimeConstant
* Generalize isUndefWeak to isUndefined. undefined non-weak is an error
  case. We choose to be general, which also brings us in line with GNU ld.
---
 lld/ELF/Relocations.cpp      | 25 ++++++++++---------------
 lld/test/ELF/weak-undef-rw.s | 12 +++++++-----
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index 1af01e7247dce..6c4209a2b81ed 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -990,10 +990,17 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type,
   // only the low bits are used.
   if (e == R_GOT || e == R_PLT)
     return ctx.target->usesOnlyLowPageBits(type) || !ctx.arg.isPic;
-
   // R_AARCH64_AUTH_ABS64 requires a dynamic relocation.
-  if (sym.isPreemptible || e == RE_AARCH64_AUTH)
+  if (e == RE_AARCH64_AUTH)
     return false;
+
+  // The behavior of an undefined weak reference is implementation defined.
+  // (We treat undefined non-weak the same as undefined weak.) For static
+  // -no-pie linking, dynamic relocations are generally avoided (except
+  // IRELATIVE). Emitting dynamic relocations for -shared aligns with its -z
+  // undefs default. Dynamic -no-pie linking and -pie allow flexibility.
+  if (sym.isPreemptible)
+    return sym.isUndefined() && !ctx.arg.isPic;
   if (!ctx.arg.isPic)
     return true;
 
@@ -1113,19 +1120,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset,
   // If the relocation is known to be a link-time constant, we know no dynamic
   // relocation will be created, pass the control to relocateAlloc() or
   // relocateNonAlloc() to resolve it.
-  //
-  // The behavior of an undefined weak reference is implementation defined. For
-  // non-link-time constants, we resolve relocations statically (let
-  // relocate{,Non}Alloc() resolve them) for -no-pie and try producing dynamic
-  // relocations for -pie and -shared.
-  //
-  // The general expectation of -no-pie static linking is that there is no
-  // dynamic relocation (except IRELATIVE). Emitting dynamic relocations for
-  // -shared matches the spirit of its -z undefs default. -pie has freedom on
-  // choices, and we choose dynamic relocations to be consistent with the
-  // handling of GOT-generating relocations.
-  if (isStaticLinkTimeConstant(expr, type, sym, offset) ||
-      (!ctx.arg.isPic && sym.isUndefWeak())) {
+  if (isStaticLinkTimeConstant(expr, type, sym, offset)) {
     sec->addReloc({expr, type, offset, addend, &sym});
     return;
   }
diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s
index 902cad87aba9a..497228a3cf905 100644
--- a/lld/test/ELF/weak-undef-rw.s
+++ b/lld/test/ELF/weak-undef-rw.s
@@ -33,9 +33,11 @@
 # RUN: ld.lld a.o b.o -o ab -z undefs
 # RUN: llvm-readelf -r -x .data ab | FileCheck %s --check-prefix=STATIC1
 # RUN: ld.lld a.o b.o s.so -o abs -z undefs
-# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=DYN1
-# RUN: ld.lld a.o b.o -o abs.pie -pie -z undefs
-# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=STATIC1
+# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=STATIC1
+# RUN: ld.lld a.o b.o -o ab.pie -pie -z undefs
+# RUN: llvm-readelf -r -x .data ab.pie | FileCheck %s --check-prefix=STATIC1
+# RUN: ld.lld a.o b.o s.so -o abs.pie -pie -z undefs
+# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=DYN1
 
 # STATIC1:      no relocations
 # STATIC1:      Hex dump of section '.data':
@@ -43,9 +45,9 @@
 # STATIC1-NEXT: {{.*}} 05000000 00000000                   .
 # STATIC1-EMPTY:
 
-# DYN1:        Relocation section '.rela.dyn' {{.*}} contains 1
+# DYN1:        Relocation section '.rela.dyn' {{.*}} contains 3
 # DYN1:        Hex dump of section '.data':
-# DYN1-NEXT:   {{.*}} 00000000 00000000 03000000 00000000 .
+# DYN1-NEXT:   {{.*}} 00000000 00000000 00000000 00000000 .
 # DYN1-NEXT:   {{.*}} 00000000 00000000                   .
 # DYN1-EMPTY:
 

From 5f231db76482bbdd3e658d8e9797cbd46837d4e1 Mon Sep 17 00:00:00 2001
From: Brandon Wu <songwu0813@gmail.com>
Date: Thu, 12 Jun 2025 11:41:52 +0800
Subject: [PATCH 179/851] [RISCV] Use StringRef for RequiredExtensions in
 RVVIntrinsicDef (#143503)

This prevents many duplicated copies of required extensions string.
---
 clang/lib/Sema/SemaRISCV.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp
index 9f70be746eb3f..9eab0c2a0df6a 100644
--- a/clang/lib/Sema/SemaRISCV.cpp
+++ b/clang/lib/Sema/SemaRISCV.cpp
@@ -47,7 +47,7 @@ struct RVVIntrinsicDef {
   std::string BuiltinName;
 
   /// Mapping to RequiredFeatures in riscv_vector.td
-  std::string RequiredExtensions;
+  StringRef RequiredExtensions;
 
   /// Function signature, first element is return type.
   RVVTypes Signature;

From f09050fdc85074869f0b34f0d9e061a74ef549ee Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 12 Jun 2025 11:35:44 +0800
Subject: [PATCH 180/851] [C++20] [Modules] Fix module local lookup
 ambiguousity

Close https://github.com/llvm/llvm-project/issues/61360
Close https://github.com/llvm/llvm-project/issues/129525
Close https://github.com/llvm/llvm-project/issues/143734

We shouldn't identify different module local decls in different modules
as the same entity.
---
 clang/include/clang/AST/ASTContext.h          |  6 ++--
 clang/include/clang/AST/DeclBase.h            |  4 +++
 clang/lib/AST/ASTContext.cpp                  |  8 ++++-
 clang/lib/AST/DeclBase.cpp                    |  6 ++++
 .../Modules/module-local-declarations-02.cppm | 31 +++++++++++++++++++
 clang/test/Modules/pr61360.cppm               | 25 +++++++++++++++
 6 files changed, 76 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Modules/module-local-declarations-02.cppm
 create mode 100644 clang/test/Modules/pr61360.cppm

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 8d24d393eab09..3abb49312255a 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -488,8 +488,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// if possible.
   ///
   /// Not serialized intentionally.
-  llvm::StringMap<const Module *> PrimaryModuleNameMap;
-  llvm::DenseMap<const Module *, const Module *> SameModuleLookupSet;
+  mutable llvm::StringMap<const Module *> PrimaryModuleNameMap;
+  mutable llvm::DenseMap<const Module *, const Module *> SameModuleLookupSet;
 
   static constexpr unsigned ConstantArrayTypesLog2InitSize = 8;
   static constexpr unsigned GeneralTypesLog2InitSize = 9;
@@ -1151,7 +1151,7 @@ class ASTContext : public RefCountedBase<ASTContext> {
   ///
   /// FIXME: The signature may be confusing since `clang::Module` means to
   /// a module fragment or a module unit but not a C++20 module.
-  bool isInSameModule(const Module *M1, const Module *M2);
+  bool isInSameModule(const Module *M1, const Module *M2) const;
 
   TranslationUnitDecl *getTranslationUnitDecl() const {
     return TUDecl->getMostRecentDecl();
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 375e9e2592502..dd67ebc9873ff 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -646,6 +646,10 @@ class alignas(8) Decl {
     return getModuleOwnershipKind() == ModuleOwnershipKind::ModulePrivate;
   }
 
+  /// Whether this declaration was a local declaration to a C++20
+  /// named module.
+  bool isModuleLocal() const;
+
   /// Whether this declaration was exported in a lexical context.
   /// e.g.:
   ///
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index b51f7622288df..4d44f23c0f503 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1175,7 +1175,7 @@ void ASTContext::setCurrentNamedModule(Module *M) {
   CurrentCXXNamedModule = M;
 }
 
-bool ASTContext::isInSameModule(const Module *M1, const Module *M2) {
+bool ASTContext::isInSameModule(const Module *M1, const Module *M2) const {
   if (!M1 != !M2)
     return false;
 
@@ -7429,6 +7429,12 @@ bool ASTContext::isSameEntity(const NamedDecl *X, const NamedDecl *Y) const {
                           cast<Decl>(Y->getDeclContext()->getRedeclContext())))
     return false;
 
+  // If either X or Y are local to the owning module, they are only possible to
+  // be the same entity if they are in the same module.
+  if (X->isModuleLocal() || Y->isModuleLocal())
+    if (!isInSameModule(X->getOwningModule(), Y->getOwningModule()))
+      return false;
+
   // Two typedefs refer to the same entity if they have the same underlying
   // type.
   if (const auto *TypedefX = dyn_cast<TypedefNameDecl>(X))
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index a1bb62bcb68fa..48c60aa4e449a 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1132,6 +1132,12 @@ bool Decl::isInExportDeclContext() const {
   return isa_and_nonnull<ExportDecl>(DC);
 }
 
+bool Decl::isModuleLocal() const {
+  auto *M = getOwningModule();
+  return M && M->isNamedModule() &&
+         getModuleOwnershipKind() == ModuleOwnershipKind::ReachableWhenImported;
+}
+
 bool Decl::isInAnotherModuleUnit() const {
   auto *M = getOwningModule();
 
diff --git a/clang/test/Modules/module-local-declarations-02.cppm b/clang/test/Modules/module-local-declarations-02.cppm
new file mode 100644
index 0000000000000..0670c4295abc7
--- /dev/null
+++ b/clang/test/Modules/module-local-declarations-02.cppm
@@ -0,0 +1,31 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fprebuilt-module-path=%t -emit-llvm -o %t/B.ll
+
+//--- A.cppm
+export module A;
+
+export template<typename>
+struct holder {
+};
+
+struct foo {};
+
+export struct a {
+	holder<foo> m;
+};
+
+//--- B.cppm
+// expected-no-diagnostics
+export module B;
+
+import A;
+
+struct foo {};
+
+struct b {
+	holder<foo> m;
+};
\ No newline at end of file
diff --git a/clang/test/Modules/pr61360.cppm b/clang/test/Modules/pr61360.cppm
new file mode 100644
index 0000000000000..a16f65d4be2fe
--- /dev/null
+++ b/clang/test/Modules/pr61360.cppm
@@ -0,0 +1,25 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm
+// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fprebuilt-module-path=%t -emit-llvm -o %t/B.ll
+
+//--- A.cppm
+export module A;
+export template<typename>
+struct holder {
+};
+
+struct a {
+	holder<struct foo> m;
+};
+
+//--- B.cppm
+// expected-no-diagnostics
+export module B;
+import A;
+
+struct b {
+	holder<struct foo> m;
+};

From 282e471018d234f78b0990100834532389877519 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Thu, 12 Jun 2025 05:58:55 +0200
Subject: [PATCH 181/851] [flang] Erase `fir.local` ops before lowering `fir`
 to `llvm` (#143687)

`fir.local` ops are not supposed to have any uses at this point (i.e.
during lowering to LLVM). In case of serialization, the
`fir.do_concurrent` users are expected to have been lowered to
`fir.do_loop` nests. In case of parallelization, the `fir.do_concurrent`
users are expected to have been lowered to the target parallel model
(e.g. OpenMP).

This hopefully resolved a build issue introduced by
https://github.com/llvm/llvm-project/pull/142567 (see for example:
https://lab.llvm.org/buildbot/#/builders/199/builds/4009).
---
 flang/lib/Optimizer/CodeGen/CodeGen.cpp | 42 +++++++++++++++++++------
 flang/test/Fir/local.fir                | 10 ++++++
 2 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 flang/test/Fir/local.fir

diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 82d960a6fc61e..a3de3ae9d116a 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -3294,6 +3294,30 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
   }
 };
 
+struct LocalitySpecifierOpConversion
+    : public fir::FIROpConversion<fir::LocalitySpecifierOp> {
+  using FIROpConversion::FIROpConversion;
+  llvm::LogicalResult
+  matchAndRewrite(fir::LocalitySpecifierOp localizer, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+#ifdef EXPENSIVE_CHECKS
+    auto uses = mlir::SymbolTable::getSymbolUses(
+        localizer, localizer->getParentOfType<mlir::ModuleOp>());
+
+    // `fir.local` ops are not supposed to have any uses at this point (i.e.
+    // during lowering to LLVM). In case of serialization, the
+    // `fir.do_concurrent` users are expected to have been lowered to
+    // `fir.do_loop` nests. In case of parallelization, the `fir.do_concurrent`
+    // users are expected to have been lowered to the target parallel model
+    // (e.g. OpenMP).
+    assert(uses && uses->empty());
+#endif
+
+    rewriter.eraseOp(localizer);
+    return mlir::success();
+  }
+};
+
 /// Lower `fir.no_reassoc` to LLVM IR dialect.
 /// TODO: how do we want to enforce this in LLVM-IR? Can we manipulate the fast
 /// math flags?
@@ -4249,15 +4273,15 @@ void fir::populateFIRToLLVMConversionPatterns(
       FieldIndexOpConversion, FirEndOpConversion, FreeMemOpConversion,
       GlobalLenOpConversion, GlobalOpConversion, InsertOnRangeOpConversion,
       IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion,
-      MulcOpConversion, NegcOpConversion, NoReassocOpConversion,
-      SelectCaseOpConversion, SelectOpConversion, SelectRankOpConversion,
-      SelectTypeOpConversion, ShapeOpConversion, ShapeShiftOpConversion,
-      ShiftOpConversion, SliceOpConversion, StoreOpConversion,
-      StringLitOpConversion, SubcOpConversion, TypeDescOpConversion,
-      TypeInfoOpConversion, UnboxCharOpConversion, UnboxProcOpConversion,
-      UndefOpConversion, UnreachableOpConversion, XArrayCoorOpConversion,
-      XEmboxOpConversion, XReboxOpConversion, ZeroOpConversion>(converter,
-                                                                options);
+      LocalitySpecifierOpConversion, MulcOpConversion, NegcOpConversion,
+      NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion,
+      SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion,
+      ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion,
+      StoreOpConversion, StringLitOpConversion, SubcOpConversion,
+      TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion,
+      UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion,
+      XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion,
+      ZeroOpConversion>(converter, options);
 
   // Patterns that are populated without a type converter do not trigger
   // target materializations for the operands of the root op.
diff --git a/flang/test/Fir/local.fir b/flang/test/Fir/local.fir
new file mode 100644
index 0000000000000..006f5ca944670
--- /dev/null
+++ b/flang/test/Fir/local.fir
@@ -0,0 +1,10 @@
+// RUN: fir-opt --fir-to-llvm-ir %s | FileCheck %s
+
+// Tests that `fir.local` ops are dropped from the module before LLVM lowering.
+
+fir.local {type = local} @local_privatizer : i32
+func.func @foo() {
+  return
+}
+
+// CHECK-NOT: fir.local

From c3be4524a56ba01bc1f868fc37e329f24ec5041c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 11 Jun 2025 21:23:06 -0700
Subject: [PATCH 182/851] [ELF,test] Improve weak-undef-got-plt.s

---
 lld/test/ELF/weak-undef-got-pie.s | 22 --------------------
 lld/test/ELF/weak-undef-got-plt.s | 34 +++++++++++++++++++++++++++++++
 lld/test/ELF/weak-undef.s         | 31 ----------------------------
 3 files changed, 34 insertions(+), 53 deletions(-)
 delete mode 100644 lld/test/ELF/weak-undef-got-pie.s
 create mode 100644 lld/test/ELF/weak-undef-got-plt.s
 delete mode 100644 lld/test/ELF/weak-undef.s

diff --git a/lld/test/ELF/weak-undef-got-pie.s b/lld/test/ELF/weak-undef-got-pie.s
deleted file mode 100644
index 2301400f4e0b1..0000000000000
--- a/lld/test/ELF/weak-undef-got-pie.s
+++ /dev/null
@@ -1,22 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/dummy-shared.s -o %t1.o
-# RUN: ld.lld %t1.o -shared -o %t1.so
-# RUN: llvm-mc -filetype=obj -x86-relax-relocations=false -triple=x86_64 %s -o %t.o
-
-# RUN: ld.lld -pie %t.o %t1.so -o %t
-# RUN: llvm-readobj -r %t | FileCheck --check-prefix=RELOCS %s
-# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=DISASM %s
-
-# RELOCS:      Relocations [
-# RELOCS-NEXT:   Section ({{.*}}) .rela.dyn {
-# RELOCS-NEXT:     R_X86_64_GLOB_DAT foo 0x0
-# RELOCS-NEXT:   }
-# RELOCS-NEXT: ]
-
-.weak foo
-
-.globl _start
-_start:
-# DISASM: <_start>:
-# DISASM-NEXT: movq {{.*}}(%rip), %rax
-mov foo@gotpcrel(%rip), %rax
diff --git a/lld/test/ELF/weak-undef-got-plt.s b/lld/test/ELF/weak-undef-got-plt.s
new file mode 100644
index 0000000000000..0ee3da2cd3b40
--- /dev/null
+++ b/lld/test/ELF/weak-undef-got-plt.s
@@ -0,0 +1,34 @@
+# REQUIRES: x86
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64 -x86-relax-relocations=false a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o s.o
+# RUN: ld.lld -shared s.o -o s.so
+
+# RUN: ld.lld a.o -o a
+# RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC
+# RUN: ld.lld a.o s.so -o as
+# RUN: llvm-objdump -dR as | FileCheck %s
+
+# RUN: ld.lld -pie a.o s.so -o as.pie
+# RUN: llvm-objdump -dR as.pie | FileCheck %s
+
+# RUN: ld.lld -shared a.o -o a.so
+# RUN: llvm-objdump -dR a.so | FileCheck %s
+
+# NORELOC:    no relocation
+
+# CHECK:      TYPE                     VALUE
+# CHECK-NEXT: R_X86_64_GLOB_DAT        foo{{$}}
+# CHECK-NEXT: R_X86_64_JUMP_SLOT       foo{{$}}
+# CHECK-EMPTY:
+# CHECK:      <_start>:
+# CHECK-NEXT:   movq {{.*}}(%rip), %rax
+# CHECK-NEXT:   callq {{.*}} <foo@plt>
+
+#--- a.s
+.weak foo
+
+.globl _start
+_start:
+mov foo@gotpcrel(%rip), %rax
+call foo
diff --git a/lld/test/ELF/weak-undef.s b/lld/test/ELF/weak-undef.s
deleted file mode 100644
index 21488023a79e1..0000000000000
--- a/lld/test/ELF/weak-undef.s
+++ /dev/null
@@ -1,31 +0,0 @@
-# REQUIRES: x86
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
-# RUN: ld.lld %t.o -o %t --export-dynamic
-# RUN: llvm-readelf -r --dyn-syms --hex-dump=.data %t | \
-# RUN:   FileCheck %s --check-prefixes=NORELOC,COMMON
-
-# NORELOC: There are no relocations in this file.
-
-# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/dummy-shared.s -o %t1.o
-# RUN: ld.lld %t1.o -shared -o %t1.so
-# RUN: ld.lld %t.o -o %t %t1.so -pie
-# RUN: llvm-readelf -r --dyn-syms --hex-dump=.data %t | \
-# RUN:   FileCheck %s --check-prefixes=RELOC,COMMON
-
-# RELOC:      Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries:
-# RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend
-# RELOC-NEXT: {{.*}} 0000000100000001 R_X86_64_64 0000000000000000 foo + 0
-
-# NORELOC-NOT: Symbol table '.dynsym'
-# RELOC:       Symbol table '.dynsym' contains 2 entries:
-# RELOC-NEXT:  Num: Value Size Type Bind Vis Ndx Name
-# RELOC-NEXT:  0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND
-# RELOC-NEXT:  1: 0000000000000000 0 NOTYPE WEAK DEFAULT UND foo
-# COMMON:      Hex dump of section '.data':
-# COMMON-NEXT: {{.*}} 00000000 00000000 
-# COMMON-EMPTY:
-
-.weak foo
-
-.data
-  .dc.a foo

From a93e55e57ed00a55f822c64e3520c7c732b58480 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Wed, 11 Jun 2025 21:33:46 -0700
Subject: [PATCH 183/851] Revert "[libc] Migrate stdio tests to
 ErrnoCheckingTest." (#143829)

Reverts llvm/llvm-project#143802. Follow-up fix
3c7af175e51c3ab08ac3c442146c2b822f38c01e wasn't robust enough and itself
got reverted.
---
 libc/test/src/stdio/CMakeLists.txt           | 10 ----------
 libc/test/src/stdio/fdopen_test.cpp          | 14 ++++++++------
 libc/test/src/stdio/fgetc_test.cpp           |  5 +++--
 libc/test/src/stdio/fgetc_unlocked_test.cpp  |  5 +++--
 libc/test/src/stdio/fgets_test.cpp           |  6 +++---
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++++++++++++-----
 libc/test/src/stdio/fopencookie_test.cpp     | 15 ++++++++-------
 libc/test/src/stdio/remove_test.cpp          | 10 +++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 ++++-----
 libc/test/src/stdio/setvbuf_test.cpp         |  8 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 ++++---
 libc/test/src/stdlib/StrtolTest.h            |  1 +
 libc/test/src/stdlib/strtold_test.cpp        |  1 +
 13 files changed, 59 insertions(+), 52 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 3627006ec28fd..01904a30504ed 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,7 +20,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -69,7 +68,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -90,7 +88,6 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -112,7 +109,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -430,7 +426,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -445,7 +440,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -462,7 +456,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -483,7 +476,6 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -506,7 +498,6 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -524,7 +515,6 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index b53184c30be36..104fc478b100e 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,21 +9,20 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -53,7 +52,8 @@ TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
+TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,7 +64,8 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
+TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -82,6 +83,7 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
+  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 7c652f666a8f3..56bde5f0099a8 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,12 +14,12 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -33,6 +33,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index f4471dd82df15..90429ecf4e82b 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,12 +17,12 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -36,6 +36,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    libc_errno = 0;
 
     ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index c00a9256af52d..abed3d4052939 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,12 +12,11 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+#include "src/__support/libc_errno.h"
 
-TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -36,6 +35,7 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e097785832d56..e624181c795b8 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,18 +17,17 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
+TEST(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -42,6 +41,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,6 +72,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -79,12 +80,15 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -99,8 +103,10 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -115,18 +121,21 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
+  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST_F(LlvmLibcFILETest, FFlush) {
+TEST(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -147,7 +156,7 @@ TEST_F(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -156,6 +165,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
+  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index bcf5e674141a7..03e1ac286b646 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,7 +15,6 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -23,7 +22,6 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
-using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -90,7 +88,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -117,6 +115,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -125,7 +124,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -150,6 +149,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,6 +178,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -191,7 +192,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
+TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -222,7 +223,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
+TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 296bff1f5dc15..84984e26398c0 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,17 +11,16 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
+#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -37,9 +36,10 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index 135fb98c07fbb..ac494a4ecaf8e 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,19 +8,18 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
+TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -41,7 +40,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
+TEST(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 4144bc1bef447..5872943c1bb41 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -14,10 +14,9 @@
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
+TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -53,7 +52,7 @@ TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -103,5 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
+  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index e99b382d12112..5d482b70064bd 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,12 +15,11 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+#include "src/__support/libc_errno.h"
 
-TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -37,6 +36,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,6 +57,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 03f0a6539c785..3eeccc5727e77 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,6 +9,7 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index eb4056dc7ba64..c2f2b9c9a11c3 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 99638537cd19b84252685a3dd56535a4d54d690e Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Wed, 11 Jun 2025 21:56:48 -0700
Subject: [PATCH 184/851] [AArch64] Fix a warning

This patch fixes:

  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:7157:3: error:
  unannotated fall-through between switch labels
  [-Werror,-Wimplicit-fallthrough]
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ad5b90984188e..af5dfd6c9b8f4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7154,6 +7154,7 @@ SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   switch (CC) {
   default:
     NestReg = 0x0f; // X15
+    LLVM_FALLTHROUGH;
   case CallingConv::ARM64EC_Thunk_Native:
   case CallingConv::ARM64EC_Thunk_X64:
     // Must be kept in sync with AArch64CallingConv.td

From 02550da932913bd7c3987c68abc9060c9e5bde2c Mon Sep 17 00:00:00 2001
From: Fazlay Rabbi <106703039+mdfazlay@users.noreply.github.com>
Date: Wed, 11 Jun 2025 22:06:11 -0700
Subject: [PATCH 185/851] [OpenMP 60] Initial parsing/sema for
 `need_device_addr` modifier on `adjust_args` clause (#143442)

Adds initial parsing and semantic analysis for `need_device_addr`
modifier on `adjust_args` clause.
---
 clang/include/clang/Basic/Attr.td             |  1 +
 .../clang/Basic/DiagnosticParseKinds.td       |  6 ++--
 clang/include/clang/Basic/OpenMPKinds.def     |  1 +
 clang/include/clang/Sema/SemaOpenMP.h         |  1 +
 clang/lib/AST/AttrImpl.cpp                    |  6 ++++
 clang/lib/Parse/ParseOpenMP.cpp               | 28 +++++++++++++------
 clang/lib/Sema/SemaOpenMP.cpp                 |  5 ++++
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  | 11 ++++++--
 .../declare_variant_clauses_ast_print.cpp     | 26 ++++++++++-------
 .../declare_variant_clauses_messages.cpp      | 24 +++++++++++-----
 10 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 9e84462eaa660..f113cd2ba2fbf 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4630,6 +4630,7 @@ def OMPDeclareVariant : InheritableAttr {
     OMPTraitInfoArgument<"TraitInfos">,
     VariadicExprArgument<"AdjustArgsNothing">,
     VariadicExprArgument<"AdjustArgsNeedDevicePtr">,
+    VariadicExprArgument<"AdjustArgsNeedDeviceAddr">,
     VariadicOMPInteropInfoArgument<"AppendArgs">,
   ];
   let AdditionalMembers = [{
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 3aa36ad59d0b9..6c30da376dafb 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1581,8 +1581,10 @@ def err_omp_unexpected_append_op : Error<
   "unexpected operation specified in 'append_args' clause, expected 'interop'">;
 def err_omp_unexpected_execution_modifier : Error<
   "unexpected 'execution' modifier in non-executable context">;
-def err_omp_unknown_adjust_args_op : Error<
-  "incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'">;
+def err_omp_unknown_adjust_args_op
+    : Error<
+          "incorrect 'adjust_args' type, expected 'need_device_ptr'%select{|, "
+          "'need_device_addr',}0 or 'nothing'">;
 def err_omp_declare_variant_wrong_clause : Error<
   "expected %select{'match'|'match', 'adjust_args', or 'append_args'}0 clause "
   "on 'omp declare variant' directive">;
diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index b0de65df7e397..2b1dc1e0121b2 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -214,6 +214,7 @@ OPENMP_ORIGINAL_SHARING_MODIFIER(default)
 // Adjust-op kinds for the 'adjust_args' clause.
 OPENMP_ADJUST_ARGS_KIND(nothing)
 OPENMP_ADJUST_ARGS_KIND(need_device_ptr)
+OPENMP_ADJUST_ARGS_KIND(need_device_addr)
 
 // Binding kinds for the 'bind' clause.
 OPENMP_BIND_KIND(teams)
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 6498390fe96f7..be6bec2068784 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -849,6 +849,7 @@ class SemaOpenMP : public SemaBase {
       FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI,
       ArrayRef<Expr *> AdjustArgsNothing,
       ArrayRef<Expr *> AdjustArgsNeedDevicePtr,
+      ArrayRef<Expr *> AdjustArgsNeedDeviceAddr,
       ArrayRef<OMPInteropInfo> AppendArgs, SourceLocation AdjustArgsLoc,
       SourceLocation AppendArgsLoc, SourceRange SR);
 
diff --git a/clang/lib/AST/AttrImpl.cpp b/clang/lib/AST/AttrImpl.cpp
index fefb8f55a9ee2..5875a925d3fb0 100644
--- a/clang/lib/AST/AttrImpl.cpp
+++ b/clang/lib/AST/AttrImpl.cpp
@@ -224,6 +224,12 @@ void OMPDeclareVariantAttr::printPrettyPragma(
     PrintExprs(adjustArgsNeedDevicePtr_begin(), adjustArgsNeedDevicePtr_end());
     OS << ")";
   }
+  if (adjustArgsNeedDeviceAddr_size()) {
+    OS << " adjust_args(need_device_addr:";
+    PrintExprs(adjustArgsNeedDeviceAddr_begin(),
+               adjustArgsNeedDeviceAddr_end());
+    OS << ")";
+  }
 
   auto PrintInteropInfo = [&OS](OMPInteropInfo *Begin, OMPInteropInfo *End) {
     for (OMPInteropInfo *I = Begin; I != End; ++I) {
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index e41e5ba8596b9..b69c3abe0b321 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -1483,6 +1483,7 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr,
   OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo();
   SmallVector<Expr *, 6> AdjustNothing;
   SmallVector<Expr *, 6> AdjustNeedDevicePtr;
+  SmallVector<Expr *, 6> AdjustNeedDeviceAddr;
   SmallVector<OMPInteropInfo, 3> AppendArgs;
   SourceLocation AdjustArgsLoc, AppendArgsLoc;
 
@@ -1515,11 +1516,21 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr,
         SmallVector<Expr *> Vars;
         IsError = ParseOpenMPVarList(OMPD_declare_variant, OMPC_adjust_args,
                                      Vars, Data);
-        if (!IsError)
-          llvm::append_range(Data.ExtraModifier == OMPC_ADJUST_ARGS_nothing
-                                 ? AdjustNothing
-                                 : AdjustNeedDevicePtr,
-                             Vars);
+        if (!IsError) {
+          switch (Data.ExtraModifier) {
+          case OMPC_ADJUST_ARGS_nothing:
+            llvm::append_range(AdjustNothing, Vars);
+            break;
+          case OMPC_ADJUST_ARGS_need_device_ptr:
+            llvm::append_range(AdjustNeedDevicePtr, Vars);
+            break;
+          case OMPC_ADJUST_ARGS_need_device_addr:
+            llvm::append_range(AdjustNeedDeviceAddr, Vars);
+            break;
+          default:
+            llvm_unreachable("Unexpected 'adjust_args' clause modifier.");
+          }
+        }
         break;
       }
       case OMPC_append_args:
@@ -1559,8 +1570,8 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr,
   if (DeclVarData && !TI.Sets.empty())
     Actions.OpenMP().ActOnOpenMPDeclareVariantDirective(
         DeclVarData->first, DeclVarData->second, TI, AdjustNothing,
-        AdjustNeedDevicePtr, AppendArgs, AdjustArgsLoc, AppendArgsLoc,
-        SourceRange(Loc, Tok.getLocation()));
+        AdjustNeedDevicePtr, AdjustNeedDeviceAddr, AppendArgs, AdjustArgsLoc,
+        AppendArgsLoc, SourceRange(Loc, Tok.getLocation()));
 
   // Skip the last annot_pragma_openmp_end.
   (void)ConsumeAnnotationToken();
@@ -4818,7 +4829,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
         getLangOpts());
     Data.ExtraModifierLoc = Tok.getLocation();
     if (Data.ExtraModifier == OMPC_ADJUST_ARGS_unknown) {
-      Diag(Tok, diag::err_omp_unknown_adjust_args_op);
+      Diag(Tok, diag::err_omp_unknown_adjust_args_op)
+          << (getLangOpts().OpenMP >= 60 ? 1 : 0);
       SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch);
     } else {
       ConsumeToken();
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 2cbe79c5c07ca..d928b7ae2b4c2 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -7122,6 +7122,7 @@ void SemaOpenMP::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
       getASTContext(), VariantFuncRef, DVScope.TI,
       /*NothingArgs=*/nullptr, /*NothingArgsSize=*/0,
       /*NeedDevicePtrArgs=*/nullptr, /*NeedDevicePtrArgsSize=*/0,
+      /*NeedDeviceAddrArgs=*/nullptr, /*NeedDeviceAddrArgsSize=*/0,
       /*AppendArgs=*/nullptr, /*AppendArgsSize=*/0);
   for (FunctionDecl *BaseFD : Bases)
     BaseFD->addAttr(OMPDeclareVariantA);
@@ -7553,6 +7554,7 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective(
     FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI,
     ArrayRef<Expr *> AdjustArgsNothing,
     ArrayRef<Expr *> AdjustArgsNeedDevicePtr,
+    ArrayRef<Expr *> AdjustArgsNeedDeviceAddr,
     ArrayRef<OMPInteropInfo> AppendArgs, SourceLocation AdjustArgsLoc,
     SourceLocation AppendArgsLoc, SourceRange SR) {
 
@@ -7564,6 +7566,7 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective(
   SmallVector<Expr *, 8> AllAdjustArgs;
   llvm::append_range(AllAdjustArgs, AdjustArgsNothing);
   llvm::append_range(AllAdjustArgs, AdjustArgsNeedDevicePtr);
+  llvm::append_range(AllAdjustArgs, AdjustArgsNeedDeviceAddr);
 
   if (!AllAdjustArgs.empty() || !AppendArgs.empty()) {
     VariantMatchInfo VMI;
@@ -7614,6 +7617,8 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective(
       const_cast<Expr **>(AdjustArgsNothing.data()), AdjustArgsNothing.size(),
       const_cast<Expr **>(AdjustArgsNeedDevicePtr.data()),
       AdjustArgsNeedDevicePtr.size(),
+      const_cast<Expr **>(AdjustArgsNeedDeviceAddr.data()),
+      AdjustArgsNeedDeviceAddr.size(),
       const_cast<OMPInteropInfo *>(AppendArgs.data()), AppendArgs.size(), SR);
   FD->addAttr(NewAttr);
 }
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 57271415f838c..a25bfd1c48dee 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -527,6 +527,7 @@ static void instantiateOMPDeclareVariantAttr(
 
   SmallVector<Expr *, 8> NothingExprs;
   SmallVector<Expr *, 8> NeedDevicePtrExprs;
+  SmallVector<Expr *, 8> NeedDeviceAddrExprs;
   SmallVector<OMPInteropInfo, 4> AppendArgs;
 
   for (Expr *E : Attr.adjustArgsNothing()) {
@@ -541,14 +542,20 @@ static void instantiateOMPDeclareVariantAttr(
       continue;
     NeedDevicePtrExprs.push_back(ER.get());
   }
+  for (Expr *E : Attr.adjustArgsNeedDeviceAddr()) {
+    ExprResult ER = Subst(E);
+    if (ER.isInvalid())
+      continue;
+    NeedDeviceAddrExprs.push_back(ER.get());
+  }
   for (OMPInteropInfo &II : Attr.appendArgs()) {
     // When prefer_type is implemented for append_args handle them here too.
     AppendArgs.emplace_back(II.IsTarget, II.IsTargetSync);
   }
 
   S.OpenMP().ActOnOpenMPDeclareVariantDirective(
-      FD, E, TI, NothingExprs, NeedDevicePtrExprs, AppendArgs, SourceLocation(),
-      SourceLocation(), Attr.getRange());
+      FD, E, TI, NothingExprs, NeedDevicePtrExprs, NeedDeviceAddrExprs,
+      AppendArgs, SourceLocation(), SourceLocation(), Attr.getRange());
 }
 
 static void instantiateDependentAMDGPUFlatWorkGroupSizeAttr(
diff --git a/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp b/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp
index 172dd1670421d..c14e19cc8b7ec 100644
--- a/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp
+++ b/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp
@@ -54,9 +54,9 @@ void foo_v3(float *AAA, float *BBB, int *I) {return;}
 //DUMP: DeclRefExpr{{.*}}Function{{.*}}foo_v1
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA'
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB'
-//PRINT: #pragma omp declare variant(foo_v3) match(construct={dispatch}, device={arch(x86, x86_64)}) adjust_args(nothing:I) adjust_args(need_device_ptr:BBB)
+//PRINT: #pragma omp declare variant(foo_v3) match(construct={dispatch}, device={arch(x86, x86_64)}) adjust_args(nothing:I) adjust_args(need_device_ptr:BBB) adjust_args(need_device_addr:AAA)
 
-//PRINT: #pragma omp declare variant(foo_v2) match(construct={dispatch}, device={arch(ppc)}) adjust_args(need_device_ptr:AAA)
+//PRINT: #pragma omp declare variant(foo_v2) match(construct={dispatch}, device={arch(ppc)}) adjust_args(need_device_ptr:AAA) adjust_args(need_device_addr:BBB)
 
 //PRINT: omp declare variant(foo_v1) match(construct={dispatch}, device={arch(arm)}) adjust_args(need_device_ptr:AAA,BBB)
 
@@ -66,42 +66,48 @@ void foo_v3(float *AAA, float *BBB, int *I) {return;}
 
 #pragma omp declare variant(foo_v2)                        \
    match(construct={dispatch}, device={arch(ppc)}),        \
-   adjust_args(need_device_ptr:AAA)
+   adjust_args(need_device_ptr:AAA)                        \
+   adjust_args(need_device_addr:BBB)
 
 #pragma omp declare variant(foo_v3)                        \
    adjust_args(need_device_ptr:BBB) adjust_args(nothing:I) \
+   adjust_args(need_device_addr:AAA)                      \
    match(construct={dispatch}, device={arch(x86,x86_64)})
 
 void foo(float *AAA, float *BBB, int *I) {return;}
 
-void Foo_Var(float *AAA, float *BBB) {return;}
+void Foo_Var(float *AAA, float *BBB, float *CCC) {return;}
 
 #pragma omp declare variant(Foo_Var) \
    match(construct={dispatch}, device={arch(x86_64)}) \
-   adjust_args(need_device_ptr:AAA) adjust_args(nothing:BBB)
+   adjust_args(need_device_ptr:AAA) adjust_args(nothing:BBB) \
+   adjust_args(need_device_addr:CCC)
 template<typename T>
-void Foo(T *AAA, T *BBB) {return;}
+void Foo(T *AAA, T *BBB, T *CCC) {return;}
 
-//PRINT: #pragma omp declare variant(Foo_Var) match(construct={dispatch}, device={arch(x86_64)}) adjust_args(nothing:BBB) adjust_args(need_device_ptr:AAA)
-//DUMP: FunctionDecl{{.*}} Foo 'void (T *, T *)'
+//PRINT: #pragma omp declare variant(Foo_Var) match(construct={dispatch}, device={arch(x86_64)}) adjust_args(nothing:BBB) adjust_args(need_device_ptr:AAA) adjust_args(need_device_addr:CCC)
+//DUMP: FunctionDecl{{.*}} Foo 'void (T *, T *, T *)'
 //DUMP: OMPDeclareVariantAttr{{.*}}device={arch(x86_64)}
 //DUMP: DeclRefExpr{{.*}}Function{{.*}}Foo_Var
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB'
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA'
+//DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'CCC'
 //
-//DUMP: FunctionDecl{{.*}} Foo 'void (float *, float *)'
+//DUMP: FunctionDecl{{.*}} Foo 'void (float *, float *, float *)'
 //DUMP: OMPDeclareVariantAttr{{.*}}device={arch(x86_64)}
 //DUMP: DeclRefExpr{{.*}}Function{{.*}}Foo_Var
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB'
 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA'
+//DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'CCC'
 
 void func()
 {
   float *A;
   float *B;
+  float *C;
 
   //#pragma omp dispatch
-  Foo(A, B);
+  Foo(A, B, C);
 }
 
 typedef void *omp_interop_t;
diff --git a/clang/test/OpenMP/declare_variant_clauses_messages.cpp b/clang/test/OpenMP/declare_variant_clauses_messages.cpp
index 284e49bbd21b4..aadded7699ea1 100644
--- a/clang/test/OpenMP/declare_variant_clauses_messages.cpp
+++ b/clang/test/OpenMP/declare_variant_clauses_messages.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 -o - %s
-// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 \
+// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 \
 // RUN:  -DNO_INTEROP_T_DEF -o - %s
-// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 -o - %s
-// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -Wno-strict-prototypes -DC -x c -o - %s
+// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 -o - %s
+// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -Wno-strict-prototypes -DC -x c -o - %s
 // RUN: %clang_cc1 -verify -triple x86_64-pc-windows-msvc -fms-compatibility \
-// RUN:  -fopenmp -Wno-strict-prototypes -DC -DWIN -x c -o - %s
+// RUN:  -fopenmp -fopenmp-version=60 -Wno-strict-prototypes -DC -DWIN -x c -o - %s
 
 #ifdef NO_INTEROP_T_DEF
 void foo_v1(float *, void *);
@@ -114,6 +114,16 @@ void vararg_bar2(const char *fmt) { return; }
    match(construct={dispatch}, device={arch(ppc)}),          \
    adjust_args(need_device_ptr:AAA) adjust_args(nothing:AAA)
 
+// expected-error@+3 {{'adjust_arg' argument 'AAA' used in multiple clauses}}
+#pragma omp declare variant(foo_v1)                          \
+   match(construct={dispatch}, device={arch(arm)})           \
+   adjust_args(need_device_ptr:AAA,BBB) adjust_args(need_device_addr:AAA)
+
+// expected-error@+3 {{'adjust_arg' argument 'AAA' used in multiple clauses}}
+#pragma omp declare variant(foo_v1)                          \
+   match(construct={dispatch}, device={arch(ppc)}),          \
+   adjust_args(need_device_addr:AAA) adjust_args(nothing:AAA)
+
 // expected-error@+2 {{use of undeclared identifier 'J'}}
 #pragma omp declare variant(foo_v1)                          \
    adjust_args(nothing:J)                                    \
@@ -186,12 +196,12 @@ void vararg_bar2(const char *fmt) { return; }
 // expected-error@+1 {{variant in '#pragma omp declare variant' with type 'void (float *, float *, int *, omp_interop_t)' (aka 'void (float *, float *, int *, void *)') is incompatible with type 'void (float *, float *, int *)'}}
 #pragma omp declare variant(foo_v4) match(construct={dispatch})
 
-// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}}
+// expected-error@+3 {{incorrect 'adjust_args' type, expected 'need_device_ptr', 'need_device_addr', or 'nothing'}}
 #pragma omp declare variant(foo_v1)                        \
    match(construct={dispatch}, device={arch(arm)})         \
    adjust_args(badaaop:AAA,BBB)
 
-// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}}
+// expected-error@+3 {{incorrect 'adjust_args' type, expected 'need_device_ptr', 'need_device_addr', or 'nothing'}}
 #pragma omp declare variant(foo_v1)                        \
    match(construct={dispatch}, device={arch(arm)})         \
    adjust_args(badaaop AAA,BBB)

From 28bda778437fea17a25b561f1b3b84545612b565 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Wed, 11 Jun 2025 22:19:31 -0700
Subject: [PATCH 186/851] Introduce MCAsmInfo::UsesSetToEquateSymbol and prefer
 = to .set

Introduce MCAsmInfo::UsesSetToEquateSymbol to control the preferred
syntax for symbol equating. We now favor the more readable and common
`symbol = expression` syntax over `.set`. This aligns with pre- https://reviews.llvm.org/D44256 behavior.

On Apple platforms, this resolves a clang -S vs -c behavior difference (resolves #104623).

For targets whose = support is unconfirmed, UsesSetToEquateSymbol is set to false.
This also minimizes test updates.

Pull Request: https://github.com/llvm/llvm-project/pull/142289
---
 clang/test/CodeGen/alias.c                    |  6 +--
 llvm/include/llvm/MC/MCAsmInfo.h              |  4 ++
 llvm/lib/MC/MCAsmStreamer.cpp                 |  6 ++-
 .../AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp   |  1 +
 .../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp |  1 +
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp     |  2 +
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp |  1 +
 llvm/test/CodeGen/AArch64/arm64ec-alias.ll    | 14 +++---
 .../AArch64/arm64ec-hybrid-patchable.ll       | 18 +++----
 llvm/test/CodeGen/AArch64/arm64ec-symbols.ll  |  6 +--
 llvm/test/CodeGen/AArch64/arm64ec-varargs.ll  | 16 +++---
 llvm/test/CodeGen/AArch64/ehcontguard.ll      |  2 +-
 llvm/test/CodeGen/AArch64/global-merge-1.ll   |  8 +--
 llvm/test/CodeGen/AArch64/global-merge-2.ll   | 12 ++---
 llvm/test/CodeGen/AArch64/global-merge-3.ll   | 10 ++--
 .../AArch64/global-merge-hidden-minsize.ll    |  4 +-
 llvm/test/CodeGen/AArch64/ifunc-asm.ll        |  2 +-
 llvm/test/CodeGen/AArch64/seh-finally.ll      |  8 +--
 .../CodeGen/AArch64/stackguard-internal.ll    |  2 +-
 llvm/test/CodeGen/ARM/alias_store.ll          |  2 +-
 llvm/test/CodeGen/ARM/aliases.ll              | 14 +++---
 .../CodeGen/ARM/global-merge-dllexport.ll     |  4 +-
 .../CodeGen/ARM/global-merge-external-2.ll    | 12 ++---
 .../test/CodeGen/ARM/global-merge-external.ll | 12 ++---
 llvm/test/CodeGen/AVR/global-aliases.ll       | 28 +++++------
 llvm/test/CodeGen/Mips/hf16call32_body.ll     | 24 ++++-----
 llvm/test/CodeGen/Mips/mips16ex.ll            |  2 +-
 .../PowerPC/asm-printer-topological-order.ll  |  6 +--
 llvm/test/CodeGen/PowerPC/data-align.ll       | 10 ++--
 llvm/test/CodeGen/WebAssembly/aliases.ll      | 22 ++++----
 llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll |  2 +-
 llvm/test/CodeGen/WinCFGuard/cfguard.ll       |  2 +-
 .../CodeGen/X86/2007-09-06-ExtWeakAliasee.ll  |  2 +-
 llvm/test/CodeGen/X86/2009-08-12-badswitch.ll | 50 +++++++++----------
 .../CodeGen/X86/2010-05-26-DotDebugLoc.ll     |  8 +--
 llvm/test/CodeGen/X86/alias-gep.ll            |  8 +--
 llvm/test/CodeGen/X86/aliases.ll              |  8 +--
 .../CodeGen/X86/catchret-empty-fallthrough.ll |  2 +-
 llvm/test/CodeGen/X86/coff-alias-type.ll      |  2 +-
 llvm/test/CodeGen/X86/coff-comdat.ll          |  2 +-
 llvm/test/CodeGen/X86/coff-feat00.ll          |  2 +-
 llvm/test/CodeGen/X86/dllexport-x86_64.ll     | 10 ++--
 llvm/test/CodeGen/X86/dllexport.ll            |  8 +--
 llvm/test/CodeGen/X86/ehcontguard.ll          |  2 +-
 .../CodeGen/X86/fastcall-correct-mangling.ll  |  4 +-
 llvm/test/CodeGen/X86/ifunc-asm.ll            |  2 +-
 .../test/CodeGen/X86/lea-opt-memop-check-1.ll |  6 +--
 llvm/test/CodeGen/X86/linux-preemption.ll     | 16 +++---
 llvm/test/CodeGen/X86/localescape.ll          | 16 +++---
 llvm/test/CodeGen/X86/pr22019.ll              |  8 +--
 llvm/test/CodeGen/X86/seh-catch-all-win32.ll  |  4 +-
 llvm/test/CodeGen/X86/seh-catchpad.ll         |  2 +-
 llvm/test/CodeGen/X86/seh-finally.ll          |  2 +-
 llvm/test/CodeGen/X86/seh-no-invokes.ll       |  2 +-
 llvm/test/CodeGen/X86/seh-stack-realign.ll    |  4 +-
 llvm/test/CodeGen/X86/tailcall-cgp-dup.ll     | 12 ++---
 .../X86/windows-seh-EHa-TryInFinally.ll       |  2 +-
 llvm/test/CodeGen/XCore/globals.ll            |  2 +-
 llvm/test/CodeGen/XCore/linkage.ll            |  4 +-
 llvm/test/DebugInfo/X86/dbg-value-range.ll    |  4 +-
 .../X86/stmt-list-multiple-compile-units.ll   |  4 +-
 llvm/test/MC/AArch64/basic-a64-instructions.s |  2 +-
 llvm/test/MC/AsmParser/assignment.s           | 12 ++---
 llvm/test/MC/AsmParser/directive_include.s    |  2 +-
 llvm/test/MC/AsmParser/directive_set.s        |  6 +--
 llvm/test/MC/AsmParser/include.ll             |  4 +-
 llvm/test/MC/AsmParser/labels.s               |  6 +--
 llvm/test/MC/AsmParser/macro-arg-darwin.s     |  4 +-
 llvm/test/MC/AsmParser/motorola_integers.s    | 16 +++---
 llvm/test/MC/Mips/cpsetup.s                   |  2 +-
 70 files changed, 263 insertions(+), 252 deletions(-)

diff --git a/clang/test/CodeGen/alias.c b/clang/test/CodeGen/alias.c
index bc4167adf53f6..9403c55beae0b 100644
--- a/clang/test/CodeGen/alias.c
+++ b/clang/test/CodeGen/alias.c
@@ -29,20 +29,20 @@ const int wacom_usb_ids[] = {1, 1, 2, 3, 5, 8, 13, 0};
 extern const int __mod_usb_device_table __attribute__ ((alias("wacom_usb_ids")));
 // CHECKBASIC-DAG: @__mod_usb_device_table ={{.*}} alias i32, ptr @wacom_usb_ids
 // CHECKASM-DAG: .globl __mod_usb_device_table
-// CHECKASM-DAG: .set __mod_usb_device_table, wacom_usb_ids
+// CHECKASM-DAG: __mod_usb_device_table = wacom_usb_ids
 // CHECKASM-NOT: .size __mod_usb_device_table
 
 extern int g1;
 extern int g1 __attribute((alias("g0")));
 // CHECKBASIC-DAG: @g1 ={{.*}} alias i32, ptr @g0
 // CHECKASM-DAG: .globl g1
-// CHECKASM-DAG: .set g1, g0
+// CHECKASM-DAG: g1 = g0
 // CHECKASM-NOT: .size g1
 
 extern __thread int __libc_errno __attribute__ ((alias ("TL_WITH_ALIAS")));
 // CHECKBASIC-DAG: @__libc_errno ={{.*}} thread_local alias i32, ptr @TL_WITH_ALIAS
 // CHECKASM-DAG: .globl __libc_errno
-// CHECKASM-DAG: .set __libc_errno, TL_WITH_ALIAS
+// CHECKASM-DAG: __libc_errno = TL_WITH_ALIAS
 // CHECKASM-NOT: .size __libc_errno
 
 void f0(void) { }
diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 4eb50344d6384..e98cd17a9df50 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -141,6 +141,9 @@ class LLVM_ABI MCAsmInfo {
   /// This is appended to emitted labels.  Defaults to ":"
   const char *LabelSuffix;
 
+  /// Use .set instead of = to equate a symbol to an expression.
+  bool UsesSetToEquateSymbol = false;
+
   // Print the EH begin symbol with an assignment. Defaults to false.
   bool UseAssignmentForEHBegin = false;
 
@@ -525,6 +528,7 @@ class LLVM_ABI MCAsmInfo {
   bool shouldAllowAdditionalComments() const { return AllowAdditionalComments; }
   const char *getLabelSuffix() const { return LabelSuffix; }
 
+  bool usesSetToEquateSymbol() const { return UsesSetToEquateSymbol; }
   bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; }
   bool needsLocalForSize() const { return NeedsLocalForSize; }
   StringRef getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; }
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index da0d99e70d9ea..4380f74318e7b 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -695,9 +695,11 @@ void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
     if (E->inlineAssignedExpr())
       EmitSet = false;
   if (EmitSet) {
-    OS << ".set ";
+    bool UseSet = MAI->usesSetToEquateSymbol();
+    if (UseSet)
+      OS << ".set ";
     Symbol->print(OS, MAI);
-    OS << ", ";
+    OS << (UseSet ? ", " : " = ");
     Value->print(OS, MAI);
 
     EmitEOL();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 6f1d89e500ed3..fcf134aa8658f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -42,6 +42,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
   CommentString = ";";
   InlineAsmStart = ";#ASMSTART";
   InlineAsmEnd = ";#ASMEND";
+  UsesSetToEquateSymbol = true;
 
   //===--- Data Emission Directives -------------------------------------===//
   UsesELFSectionDirectiveForBSS = true;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 7675b05f106a0..ba8faaeb74a07 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -38,6 +38,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
   LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
   InlineAsmStart = "# InlineAsm Start";
   InlineAsmEnd = "# InlineAsm End";
+  UsesSetToEquateSymbol = true;
   ZeroDirective = "\t.space\t";
   AscizDirective = "\t.string\t";
 
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 160ee07fad5cc..b5be23c5a96ad 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -155,5 +155,7 @@ PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
   // Support $ as PC in inline asm
   DollarIsPC = true;
 
+  UsesSetToEquateSymbol = true;
+
   initializeVariantKinds(variantKindDescs);
 }
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 27272cdbbd230..e9d387399bf30 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -49,6 +49,7 @@ SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) {
   CalleeSaveStackSlotSize = 8;
   CodePointerSize = 8;
   CommentString = "*";
+  UsesSetToEquateSymbol = true;
   ExceptionsType = ExceptionHandling::ZOS;
   IsHLASM = true;
   IsLittleEndian = false;
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll
index 03cc873136940..18023a95a5d20 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll
@@ -13,30 +13,30 @@ define dso_local void @patchable_func() hybrid_patchable {
 @patchable_alias = alias void (), ptr @patchable_func
 
 ; CHECK:              .weak_anti_dep  func_alias
-; CHECK-NEXT: .set func_alias, "#func_alias"
+; CHECK-NEXT: func_alias = "#func_alias"
 ; CHECK-NEXT:         .weak_anti_dep  func_alias2
-; CHECK-NEXT: .set func_alias2, "#func_alias2"
+; CHECK-NEXT: func_alias2 = "#func_alias2"
 ; CHECK-NEXT:         .weak_anti_dep  func
-; CHECK-NEXT: .set func, "#func"
+; CHECK-NEXT: func = "#func"
 ; CHECK:              .weak_anti_dep  patchable_alias
-; CHECK-NEXT: .set patchable_alias, "#patchable_alias"
+; CHECK-NEXT: patchable_alias = "#patchable_alias"
 
 ; CHECK:              .globl  "#func_alias"
 ; CHECK-NEXT:         .def    "#func_alias";
 ; CHECK-NEXT:         .scl    2;
 ; CHECK-NEXT:         .type   32;
 ; CHECK-NEXT:         .endef
-; CHECK-NEXT: .set "#func_alias", "#func"
+; CHECK-NEXT: "#func_alias" = "#func"
 ; CHECK-NEXT:         .globl  "#func_alias2"
 ; CHECK-NEXT:         .def    "#func_alias2";
 ; CHECK-NEXT:         .scl    2;
 ; CHECK-NEXT:         .type   32;
 ; CHECK-NEXT:         .endef
-; CHECK-NEXT: .set "#func_alias2", "#func_alias"
+; CHECK-NEXT: "#func_alias2" = "#func_alias"
 
 ; CHECK:              .globl  "#patchable_alias"
 ; CHECK-NEXT:         .def    "#patchable_alias";
 ; CHECK-NEXT:         .scl    2;
 ; CHECK-NEXT:         .type   32;
 ; CHECK-NEXT:         .endef
-; CHECK-NEXT: .set "#patchable_alias", "#patchable_func"
+; CHECK-NEXT: "#patchable_alias" = "#patchable_func"
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
index f964484c0c2d4..7c77832a9d9a5 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
@@ -76,7 +76,7 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .p2align        2
 ; CHECK-NEXT:  "#caller":                              // @"#caller"
 ; CHECK-NEXT:      .weak_anti_dep  caller
-; CHECK-NEXT:  .set caller, "#caller"{{$}}
+; CHECK-NEXT:  caller = "#caller"{{$}}
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:      str     x30, [sp, #-16]!                // 8-byte Folded Spill
 ; CHECK-NEXT:      bl      "#func"
@@ -253,13 +253,13 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .weak  func
-; CHECK-NEXT:  .set func, "EXP+#func"{{$}}
+; CHECK-NEXT:  func = "EXP+#func"{{$}}
 ; CHECK-NEXT:      .weak  "#func"
 ; CHECK-NEXT:      .def    "#func";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
-; CHECK-NEXT:  .set "#func", "#func$hybpatch_thunk"{{$}}
+; CHECK-NEXT:  "#func" = "#func$hybpatch_thunk"{{$}}
 ; CHECK-NEXT:      .def    "EXP+#has_varargs";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -269,13 +269,13 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .weak   has_varargs
-; CHECK-NEXT:  .set has_varargs, "EXP+#has_varargs"
+; CHECK-NEXT:  has_varargs = "EXP+#has_varargs"
 ; CHECK-NEXT:      .weak   "#has_varargs"
 ; CHECK-NEXT:      .def    "#has_varargs";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
-; CHECK-NEXT:  .set "#has_varargs", "#has_varargs$hybpatch_thunk"
+; CHECK-NEXT:  "#has_varargs" = "#has_varargs$hybpatch_thunk"
 ; CHECK-NEXT:      .def    "EXP+#has_sret";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -285,13 +285,13 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .weak   has_sret
-; CHECK-NEXT:  .set has_sret, "EXP+#has_sret"
+; CHECK-NEXT:  has_sret = "EXP+#has_sret"
 ; CHECK-NEXT:      .weak   "#has_sret"
 ; CHECK-NEXT:      .def    "#has_sret";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
-; CHECK-NEXT:  .set "#has_sret", "#has_sret$hybpatch_thunk"
+; CHECK-NEXT:  "#has_sret" = "#has_sret$hybpatch_thunk"
 ; CHECK-NEXT:      .def    "EXP+#exp";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -301,13 +301,13 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .weak   exp
-; CHECK-NEXT:  .set exp, "EXP+#exp"
+; CHECK-NEXT:  exp = "EXP+#exp"
 ; CHECK-NEXT:      .weak   "#exp"
 ; CHECK-NEXT:      .def    "#exp";
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
-; CHECK-NEXT:  .set "#exp", "#exp$hybpatch_thunk"
+; CHECK-NEXT:  "#exp" = "#exp$hybpatch_thunk"
 
 ; SYM:      [53](sec 15)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #func$hybpatch_thunk
 ; SYM:      [58](sec 16)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #has_varargs$hybpatch_thunk
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll b/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll
index b79dd7d61dd60..b44f39ad7b735 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll
@@ -10,12 +10,12 @@ define void @caller() nounwind {
 }
 
 ; CHECK:      .weak_anti_dep  caller
-; CHECK-NEXT: .set caller, "#caller"{{$}}
+; CHECK-NEXT: caller = "#caller"{{$}}
 
 ; CHECK:      .weak_anti_dep  func
-; CHECK-NEXT: .set func, "#func"{{$}}
+; CHECK-NEXT: func = "#func"{{$}}
 ; CHECK-NEXT: .weak_anti_dep  "#func"
-; CHECK-NEXT: .set "#func", "#func$exit_thunk"{{$}}
+; CHECK-NEXT: "#func" = "#func$exit_thunk"{{$}}
 
 ; SYM:       [ 8](sec  4)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #caller
 ; SYM:       [21](sec  7)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #func$exit_thunk
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
index 5fab5738078dc..389969bebaea4 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
@@ -45,9 +45,9 @@ define void @varargs_caller() nounwind {
 ; CHECK-NEXT:    stp x9, x8, [sp]
 ; CHECK-NEXT:    str xzr, [sp, #16]
 ; CHECK-NEXT:    .weak_anti_dep varargs_callee
-; CHECK-NEXT:  .set varargs_callee, "#varargs_callee"
+; CHECK-NEXT:  varargs_callee = "#varargs_callee"
 ; CHECK-NEXT:    .weak_anti_dep "#varargs_callee"
-; CHECK-NEXT:  .set "#varargs_callee", varargs_callee
+; CHECK-NEXT:  "#varargs_callee" = varargs_callee
 ; CHECK-NEXT:    bl "#varargs_callee"
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
@@ -86,9 +86,9 @@ define void @varargs_many_argscalleer() nounwind {
 ; CHECK-NEXT:    stp x9, x8, [sp]
 ; CHECK-NEXT:    stp q0, q0, [sp, #16]
 ; CHECK-NEXT:    .weak_anti_dep varargs_many_argscallee
-; CHECK-NEXT:  .set varargs_many_argscallee, "#varargs_many_argscallee"
+; CHECK-NEXT:  varargs_many_argscallee = "#varargs_many_argscallee"
 ; CHECK-NEXT:    .weak_anti_dep "#varargs_many_argscallee"
-; CHECK-NEXT:  .set "#varargs_many_argscallee", varargs_many_argscallee
+; CHECK-NEXT:  "#varargs_many_argscallee" = varargs_many_argscallee
 ; CHECK-NEXT:    bl "#varargs_many_argscallee"
 ; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #64
@@ -116,9 +116,9 @@ define void @varargs_caller_tail() nounwind {
 ; CHECK-NEXT:    stp x9, x8, [sp]
 ; CHECK-NEXT:    str xzr, [sp, #16]
 ; CHECK-NEXT:    .weak_anti_dep varargs_callee
-; CHECK-NEXT:  .set varargs_callee, "#varargs_callee"
+; CHECK-NEXT:  varargs_callee = "#varargs_callee"
 ; CHECK-NEXT:    .weak_anti_dep "#varargs_callee"
-; CHECK-NEXT:  .set "#varargs_callee", varargs_callee
+; CHECK-NEXT:  "#varargs_callee" = varargs_callee
 ; CHECK-NEXT:    bl "#varargs_callee"
 ; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT:    add x4, sp, #48
@@ -129,9 +129,9 @@ define void @varargs_caller_tail() nounwind {
 ; CHECK-NEXT:    mov x5, xzr
 ; CHECK-NEXT:    add sp, sp, #48
 ; CHECK-NEXT:    .weak_anti_dep varargs_callee
-; CHECK-NEXT:  .set varargs_callee, "#varargs_callee"
+; CHECK-NEXT:  varargs_callee = "#varargs_callee"
 ; CHECK-NEXT:    .weak_anti_dep "#varargs_callee"
-; CHECK-NEXT:  .set "#varargs_callee", varargs_callee
+; CHECK-NEXT:  "#varargs_callee" = varargs_callee
 ; CHECK-NEXT:    b "#varargs_callee"
   call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> <double 0.0, double 0.0>)
   tail call void (double, ...) @varargs_callee(double 1.0, i32 4, i32 3, i32 2)
diff --git a/llvm/test/CodeGen/AArch64/ehcontguard.ll b/llvm/test/CodeGen/AArch64/ehcontguard.ll
index eecff391d0f8c..cb603a482d228 100644
--- a/llvm/test/CodeGen/AArch64/ehcontguard.ll
+++ b/llvm/test/CodeGen/AArch64/ehcontguard.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=aarch64-windows | FileCheck %s
 ; EHCont Guard is currently only available on Windows
 
-; CHECK: .set "@feat.00", 16384
+; CHECK: "@feat.00" = 16384
 
 ; CHECK: .section .gehcont$y
 
diff --git a/llvm/test/CodeGen/AArch64/global-merge-1.ll b/llvm/test/CodeGen/AArch64/global-merge-1.ll
index cc17e344c211a..626310fc4ec25 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-1.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-1.ll
@@ -23,9 +23,9 @@ define void @f1(i32 %a1, i32 %a2) {
 ;CHECK:	.type	.L_MergedGlobals,@object  // @_MergedGlobals
 ;CHECK:	.local	.L_MergedGlobals
 ;CHECK:	.comm	.L_MergedGlobals,8,4
-;CHECK: .set m, .L_MergedGlobals
-;CHECK: .set n, .L_MergedGlobals+4
+;CHECK: m = .L_MergedGlobals
+;CHECK: n = .L_MergedGlobals+4
 
 ;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,2 ; @_MergedGlobals
-;CHECK-APPLE-IOS-NOT: .set _m, l__MergedGlobals
-;CHECK-APPLE-IOS-NOT: .set _n, l__MergedGlobals+4
+;CHECK-APPLE-IOS-NOT: _m = l__MergedGlobals
+;CHECK-APPLE-IOS-NOT: _n = l__MergedGlobals+4
diff --git a/llvm/test/CodeGen/AArch64/global-merge-2.ll b/llvm/test/CodeGen/AArch64/global-merge-2.ll
index 85d814c3177b3..1b5333b907d27 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-2.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-2.ll
@@ -32,21 +32,21 @@ define dso_local void @g1(i32 %a1, i32 %a2) {
 ;CHECK:	.comm	.L_MergedGlobals,12,4
 
 ;CHECK:	.globl	x
-;CHECK: .set x, .L_MergedGlobals
+;CHECK: x = .L_MergedGlobals
 ;CHECK: .size x, 4
 ;CHECK:	.globl	y
-;CHECK: .set y, .L_MergedGlobals+4
+;CHECK: y = .L_MergedGlobals+4
 ;CHECK: .size y, 4
 ;CHECK:	.globl	z
-;CHECK: .set z, .L_MergedGlobals+8
+;CHECK: z = .L_MergedGlobals+8
 ;CHECK: .size z, 4
 
 ;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,12,2
 
 ;CHECK-APPLE-IOS: .globl	_x
-;CHECK-APPLE-IOS: .set {{.*}}, __MergedGlobals_x
+;CHECK-APPLE-IOS: {{.*}} = __MergedGlobals_x
 ;CHECK-APPLE-IOS: .globl	_y
-;CHECK-APPLE-IOS: .set _y, __MergedGlobals_x+4
+;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4
 ;CHECK-APPLE-IOS: .globl	_z
-;CHECK-APPLE-IOS: .set _z, __MergedGlobals_x+8
+;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8
 ;CHECK-APPLE-IOS: .subsections_via_symbols
diff --git a/llvm/test/CodeGen/AArch64/global-merge-3.ll b/llvm/test/CodeGen/AArch64/global-merge-3.ll
index b3f58887139f7..2a0ae12274556 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-3.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-3.ll
@@ -40,14 +40,14 @@ define dso_local void @f1(i32 %a1, i32 %a2, i32 %a3) {
 
 ;CHECK-APPLE-IOS: .globl  __MergedGlobals_x
 ;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,800,2
-;CHECK-APPLE-IOS: .set _x, __MergedGlobals_x
-;CHECK-APPLE-IOS: .set _y, __MergedGlobals_x+400
+;CHECK-APPLE-IOS: _x = __MergedGlobals_x
+;CHECK-APPLE-IOS: _y = __MergedGlobals_x+400
 
 ;CHECK: .type   .L_MergedGlobals,@object // @_MergedGlobals
 ;CHECK: .local  .L_MergedGlobals
 ;CHECK: .comm   .L_MergedGlobals,800,4
 ;CHECK: globl  x
-;CHECK: .set x, .L_MergedGlobals
+;CHECK: x = .L_MergedGlobals
 ;CHECK: globl  y
-;CHECK: .set y, .L_MergedGlobals+400
-;CHECK-NOT: .set z, .L_MergedGlobals
+;CHECK: y = .L_MergedGlobals+400
+;CHECK-NOT: z = .L_MergedGlobals
diff --git a/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll b/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll
index 9c694fc4d289c..5292aa91fc381 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll
@@ -16,10 +16,10 @@ attributes #0 = { minsize optsize }
 
 ; CHECK: .globl x
 ; CHECK: .hidden x
-; CHECK: .set x, .L_MergedGlobals
+; CHECK: x = .L_MergedGlobals
 ; CHECK: .size x, 4
 
 ; CHECK: .globl y
 ; CHECK: .hidden y
-; CHECK: .set y, .L_MergedGlobals+4
+; CHECK: y = .L_MergedGlobals+4
 ; CHECK: .size y, 4
diff --git a/llvm/test/CodeGen/AArch64/ifunc-asm.ll b/llvm/test/CodeGen/AArch64/ifunc-asm.ll
index 57fc2f0c9d7f5..7aad6cce09cf2 100644
--- a/llvm/test/CodeGen/AArch64/ifunc-asm.ll
+++ b/llvm/test/CodeGen/AArch64/ifunc-asm.ll
@@ -16,7 +16,7 @@ entry:
 @global_ifunc = ifunc i32 (i32), ptr @the_resolver
 ; ELF:             .globl global_ifunc
 ; ELF-NEXT:        .type global_ifunc,@gnu_indirect_function
-; ELF-NEXT:        .set global_ifunc, the_resolver
+; ELF-NEXT:        global_ifunc = the_resolver
 
 ; MACHO:           .section __DATA,__data
 ; MACHO-NEXT:      .p2align 3, 0x0
diff --git a/llvm/test/CodeGen/AArch64/seh-finally.ll b/llvm/test/CodeGen/AArch64/seh-finally.ll
index 04a30800d9294..fd6b3fd0bc1fc 100644
--- a/llvm/test/CodeGen/AArch64/seh-finally.ll
+++ b/llvm/test/CodeGen/AArch64/seh-finally.ll
@@ -38,7 +38,7 @@ entry:
 ; CHECK: add     x29, sp, #16
 ; CHECK: mov     x0, #-2
 ; CHECK: stur    x0, [x29, #16]
-; CHECK: .set .Lsimple_seh$frame_escape_0, -8
+; CHECK: .Lsimple_seh$frame_escape_0 = -8
 ; CHECK: ldur    w0, [x29, #-8]
 ; CHECK: bl      foo
 
@@ -89,7 +89,7 @@ entry:
 ; CHECK: mov     x19, sp
 ; CHECK: mov     x0, #-2
 ; CHECK: stur    x0, [x29, #24]
-; CHECK: .set .Lstack_realign$frame_escape_0, 0
+; CHECK: .Lstack_realign$frame_escape_0 = 0
 ; CHECK: ldr     w0, [x19]
 ; CHECK: bl      foo
 
@@ -137,7 +137,7 @@ entry:
 ; CHECK: add     x29, sp, #32
 ; CHECK: mov     x1, #-2
 ; CHECK: stur    x1, [x29, #16]
-; CHECK: .set .Lvla_present$frame_escape_0, -4
+; CHECK: .Lvla_present$frame_escape_0 = -4
 ; CHECK: stur    w0, [x29, #-4]
 ; CHECK: ldur    w8, [x29, #-4]
 ; CHECK: mov     x9, sp
@@ -204,7 +204,7 @@ entry:
 ; CHECK: mov     x19, sp
 ; CHECK: mov     x1, #-2
 ; CHECK: stur    x1, [x29, #24]
-; CHECK: .set .Lvla_and_realign$frame_escape_0, 32
+; CHECK: .Lvla_and_realign$frame_escape_0 = 32
 ; CHECK: str     w0, [x29, #36]
 ; CHECK: ldr     w8, [x29, #36]
 ; CHECK: mov     x9, sp
diff --git a/llvm/test/CodeGen/AArch64/stackguard-internal.ll b/llvm/test/CodeGen/AArch64/stackguard-internal.ll
index a70c8874edbac..7b32e8c0caab5 100644
--- a/llvm/test/CodeGen/AArch64/stackguard-internal.ll
+++ b/llvm/test/CodeGen/AArch64/stackguard-internal.ll
@@ -6,7 +6,7 @@ target triple = "aarch64-linux-gnu"
 ; is an alias.  (The alias is created by GlobalMerge.)
 ; CHECK: adrp {{.*}}, __stack_chk_guard
 ; CHECK: ldr {{.*}}, [{{.*}}, :lo12:__stack_chk_guard]
-; CHECK: .set __stack_chk_guard, .L_MergedGlobals+4
+; CHECK: __stack_chk_guard = .L_MergedGlobals+4
 
 @__stack_chk_guard = internal global [8 x i32] zeroinitializer, align 4
 @x = internal global i32 0, align 4
diff --git a/llvm/test/CodeGen/ARM/alias_store.ll b/llvm/test/CodeGen/ARM/alias_store.ll
index c6612334eaf1b..60aa58d37499c 100644
--- a/llvm/test/CodeGen/ARM/alias_store.ll
+++ b/llvm/test/CodeGen/ARM/alias_store.ll
@@ -13,4 +13,4 @@ entry:
 ; CHECK: ldr r{{.*}}, [[L:.*]]
 ; CHECK: [[L]]:
 ; CHECK-NEXT: .long XA
-; CHECK: .set XA, X+1
+; CHECK: XA = X+1
diff --git a/llvm/test/CodeGen/ARM/aliases.ll b/llvm/test/CodeGen/ARM/aliases.ll
index 6075ad813e990..8d9f938155d15 100644
--- a/llvm/test/CodeGen/ARM/aliases.ll
+++ b/llvm/test/CodeGen/ARM/aliases.ll
@@ -6,30 +6,30 @@
 ; CHECK: .size .Lstructvar, 8
 
 ; CHECK: .globl	foo1
-; CHECK: .set foo1, bar
+; CHECK: foo1 = bar
 ; CHECK-NOT: .size foo1
 
 ; CHECK: .globl	foo2
-; CHECK: .set foo2, bar
+; CHECK: foo2 = bar
 ; CHECK-NOT: .size foo2
 
 ; CHECK: .weak	bar_f
-; CHECK: .set bar_f, foo_f
+; CHECK: bar_f = foo_f
 ; CHECK-NOT: .size bar_f
 
-; CHECK: .set bar_i, bar
+; CHECK: bar_i = bar
 ; CHECK-NOT: .size bar_i
 
 ; CHECK: .globl	A
-; CHECK: .set A, bar
+; CHECK: A = bar
 ; CHECK-NOT: .size A
 
 ; CHECK: .globl elem0
-; CHECK: .set elem0, .Lstructvar
+; CHECK: elem0 = .Lstructvar
 ; CHECK: .size elem0, 4
 
 ; CHECK: .globl elem1
-; CHECK: .set elem1, .Lstructvar+4
+; CHECK: elem1 = .Lstructvar+4
 ; CHECK: .size elem1, 4
 
 @bar = global i32 42
diff --git a/llvm/test/CodeGen/ARM/global-merge-dllexport.ll b/llvm/test/CodeGen/ARM/global-merge-dllexport.ll
index 89e8a859b9393..f5961d7f79e3d 100644
--- a/llvm/test/CodeGen/ARM/global-merge-dllexport.ll
+++ b/llvm/test/CodeGen/ARM/global-merge-dllexport.ll
@@ -16,6 +16,6 @@ define void @f1(i32 %a1, i32 %a2) {
 ; CHECK: .section .drectve,"yni"
 ; CHECK: .ascii " /EXPORT:y,DATA"
 ; CHECK: .globl x
-; CHECK: .set x, .L_MergedGlobals
+; CHECK: x = .L_MergedGlobals
 ; CHECK: .globl y
-; CHECK: .set y, .L_MergedGlobals+4
+; CHECK: y = .L_MergedGlobals+4
diff --git a/llvm/test/CodeGen/ARM/global-merge-external-2.ll b/llvm/test/CodeGen/ARM/global-merge-external-2.ll
index 602533e045e0b..c9e92d98e4841 100644
--- a/llvm/test/CodeGen/ARM/global-merge-external-2.ll
+++ b/llvm/test/CodeGen/ARM/global-merge-external-2.ll
@@ -50,16 +50,16 @@ define dso_local void @g1(i32 %a1, i32 %a2) {
 ;CHECK-WIN32:   .lcomm  .L_MergedGlobals,8,4
 
 ;CHECK-MERGE:   .globl  x
-;CHECK-MERGE: .set x, .L_MergedGlobals
+;CHECK-MERGE: x = .L_MergedGlobals
 ;CHECK-MERGE: .size x, 4
 ;CHECK-MERGE:   .globl  y
-;CHECK-MERGE: .set y, .L_MergedGlobals+4
+;CHECK-MERGE: y = .L_MergedGlobals+4
 ;CHECK-MERGE: .size y, 4
-;CHECK-MERGE-NOT: .set z, .L_MergedGlobals+8
+;CHECK-MERGE-NOT: z = .L_MergedGlobals+8
 
 
 ;CHECK-WIN32:   .globl  x
-;CHECK-WIN32: .set x, .L_MergedGlobals
+;CHECK-WIN32: x = .L_MergedGlobals
 ;CHECK-WIN32:   .globl  y
-;CHECK-WIN32: .set y, .L_MergedGlobals+4
-;CHECK-WIN32-NOT: .set z, .L_MergedGlobals+8
+;CHECK-WIN32: y = .L_MergedGlobals+4
+;CHECK-WIN32-NOT: z = .L_MergedGlobals+8
diff --git a/llvm/test/CodeGen/ARM/global-merge-external.ll b/llvm/test/CodeGen/ARM/global-merge-external.ll
index 364659b36bb9a..4fe1914aae351 100644
--- a/llvm/test/CodeGen/ARM/global-merge-external.ll
+++ b/llvm/test/CodeGen/ARM/global-merge-external.ll
@@ -45,18 +45,18 @@ define dso_local void @g1(i32 %a1, i32 %a2) {
 ;CHECK-WIN32:	.lcomm	.L_MergedGlobals,12,4
 
 ;CHECK-MERGE:	.globl	x
-;CHECK-MERGE: .set x, .L_MergedGlobals
+;CHECK-MERGE: x = .L_MergedGlobals
 ;CHECK-MERGE: .size x, 4
 ;CHECK-MERGE:	.globl	y
-;CHECK-MERGE: .set y, .L_MergedGlobals+4
+;CHECK-MERGE: y = .L_MergedGlobals+4
 ;CHECK-MERGE: .size y, 4
 ;CHECK-MERGE:	.globl	z
-;CHECK-MERGE: .set z, .L_MergedGlobals+8
+;CHECK-MERGE: z = .L_MergedGlobals+8
 ;CHECK-MERGE: .size z, 4
 
 ;CHECK-WIN32:	.globl	x
-;CHECK-WIN32: .set x, .L_MergedGlobals
+;CHECK-WIN32: x = .L_MergedGlobals
 ;CHECK-WIN32:	.globl	y
-;CHECK-WIN32: .set y, .L_MergedGlobals+4
+;CHECK-WIN32: y = .L_MergedGlobals+4
 ;CHECK-WIN32:	.globl	z
-;CHECK-WIN32: .set z, .L_MergedGlobals+8
+;CHECK-WIN32: z = .L_MergedGlobals+8
diff --git a/llvm/test/CodeGen/AVR/global-aliases.ll b/llvm/test/CodeGen/AVR/global-aliases.ll
index 91bcedc7e0dba..b948003e8b88d 100644
--- a/llvm/test/CodeGen/AVR/global-aliases.ll
+++ b/llvm/test/CodeGen/AVR/global-aliases.ll
@@ -1,18 +1,18 @@
 ; RUN: llc < %s -mtriple=avr -mcpu=atxmega384c3 | FileCheck %s --check-prefixes=MEGA
 ; RUN: llc < %s -mtriple=avr -mcpu=attiny40 | FileCheck %s --check-prefixes=TINY
 
-; MEGA: .set __tmp_reg__, 0
-; MEGA: .set __zero_reg__, 1
-; MEGA: .set __SREG__, 63
-; MEGA: .set __SP_H__, 62
-; MEGA: .set __SP_L__, 61
-; MEGA: .set __EIND__, 60
-; MEGA: .set __RAMPZ__, 59
+; MEGA: __tmp_reg__ = 0
+; MEGA: __zero_reg__ = 1
+; MEGA: __SREG__ = 63
+; MEGA: __SP_H__ = 62
+; MEGA: __SP_L__ = 61
+; MEGA: __EIND__ = 60
+; MEGA: __RAMPZ__ = 59
 
-; TINY:     .set __tmp_reg__, 16
-; TINY:     .set __zero_reg__, 17
-; TINY:     .set __SREG__, 63
-; TINY-NOT: .set __SP_H__, 62
-; TINY:     .set __SP_L__, 61
-; TINY-NOT: .set __EIND__, 60
-; TINY-NOT: .set __RAMPZ__, 59
+; TINY:     __tmp_reg__ = 16
+; TINY:     __zero_reg__ = 17
+; TINY:     __SREG__ = 63
+; TINY-NOT: __SP_H__ = 62
+; TINY:     __SP_L__ = 61
+; TINY-NOT: __EIND__ = 60
+; TINY-NOT: __RAMPZ__ = 59
diff --git a/llvm/test/CodeGen/Mips/hf16call32_body.ll b/llvm/test/CodeGen/Mips/hf16call32_body.ll
index ea83f776bd40f..3bcb6f6bc0152 100644
--- a/llvm/test/CodeGen/Mips/hf16call32_body.ll
+++ b/llvm/test/CodeGen/Mips/hf16call32_body.ll
@@ -24,7 +24,7 @@ entry:
 ; stel: addiu $25, $25, %lo(v_sf)
 ; stel: mfc1 $4, $f12
 ; stel: jr $25
-; stel: .set $__fn_local_v_sf, v_sf
+; stel: $__fn_local_v_sf = v_sf
 ; stel: .end __fn_stub_v_sf
 
 declare i32 @printf(ptr, ...) #1
@@ -46,7 +46,7 @@ entry:
 ; stel: mfc1 $4, $f12
 ; stel: mfc1 $5, $f13
 ; stel: jr $25
-; stel: .set $__fn_local_v_df, v_df
+; stel: $__fn_local_v_df = v_df
 ; stel: .end __fn_stub_v_df
 
 ; Function Attrs: nounwind
@@ -70,7 +70,7 @@ entry:
 ; stel: mfc1 $4, $f12
 ; stel: mfc1 $5, $f14
 ; stel: jr $25
-; stel: .set $__fn_local_v_sf_sf, v_sf_sf
+; stel: $__fn_local_v_sf_sf = v_sf_sf
 ; stel: .end __fn_stub_v_sf_sf
 
 ; Function Attrs: nounwind
@@ -95,7 +95,7 @@ entry:
 ; stel: mfc1 $6, $f14
 ; stel: mfc1 $7, $f15
 ; stel: jr $25
-; stel: .set $__fn_local_v_sf_df, v_sf_df
+; stel: $__fn_local_v_sf_df = v_sf_df
 ; stel: .end __fn_stub_v_sf_df
 
 ; Function Attrs: nounwind
@@ -120,7 +120,7 @@ entry:
 ; stel: mfc1 $5, $f13
 ; stel: mfc1 $6, $f14
 ; stel: jr $25
-; stel: .set $__fn_local_v_df_sf, v_df_sf
+; stel: $__fn_local_v_df_sf = v_df_sf
 ; stel: .end __fn_stub_v_df_sf
 
 ; Function Attrs: nounwind
@@ -146,7 +146,7 @@ entry:
 ; stel: mfc1 $6, $f14
 ; stel: mfc1 $7, $f15
 ; stel: jr $25
-; stel: .set $__fn_local_v_df_df, v_df_df
+; stel: $__fn_local_v_df_df = v_df_df
 ; stel: .end __fn_stub_v_df_df
 
 ; Function Attrs: nounwind
@@ -174,7 +174,7 @@ entry:
 ; stel: addiu $25, $25, %lo(sf_sf)
 ; stel: mfc1 $4, $f12
 ; stel: jr $25
-; stel: .set $__fn_local_sf_sf, sf_sf
+; stel: $__fn_local_sf_sf = sf_sf
 ; stel: .end __fn_stub_sf_sf
 
 
@@ -196,7 +196,7 @@ entry:
 ; stel: mfc1 $4, $f12
 ; stel: mfc1 $5, $f13
 ; stel: jr $25
-; stel: .set $__fn_local_sf_df, sf_df
+; stel: $__fn_local_sf_df = sf_df
 ; stel: .end __fn_stub_sf_df
 
 ; Function Attrs: nounwind
@@ -221,7 +221,7 @@ entry:
 ; stel: mfc1 $4, $f12
 ; stel: mfc1 $5, $f14
 ; stel: jr $25
-; stel: .set $__fn_local_sf_sf_sf, sf_sf_sf
+; stel: $__fn_local_sf_sf_sf = sf_sf_sf
 ; stel: .end __fn_stub_sf_sf_sf
 
 ; Function Attrs: nounwind
@@ -247,7 +247,7 @@ entry:
 ; stel: mfc1 $6, $f14
 ; stel: mfc1 $7, $f15
 ; stel: jr $25
-; stel: .set $__fn_local_sf_sf_df, sf_sf_df
+; stel: $__fn_local_sf_sf_df = sf_sf_df
 ; stel: .end __fn_stub_sf_sf_df
 
 ; Function Attrs: nounwind
@@ -273,7 +273,7 @@ entry:
 ; stel: mfc1 $5, $f13
 ; stel: mfc1 $6, $f14
 ; stel: jr $25
-; stel: .set $__fn_local_sf_df_sf, sf_df_sf
+; stel: $__fn_local_sf_df_sf = sf_df_sf
 ; stel: .end __fn_stub_sf_df_sf
 
 ; Function Attrs: nounwind
@@ -300,7 +300,7 @@ entry:
 ; stel: mfc1 $6, $f14
 ; stel: mfc1 $7, $f15
 ; stel: jr $25
-; stel: .set $__fn_local_sf_df_df, sf_df_df
+; stel: $__fn_local_sf_df_df = sf_df_df
 ; stel: .end __fn_stub_sf_df_df
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/Mips/mips16ex.ll b/llvm/test/CodeGen/Mips/mips16ex.ll
index fb9a44e767516..f4d1125718a9a 100644
--- a/llvm/test/CodeGen/Mips/mips16ex.ll
+++ b/llvm/test/CodeGen/Mips/mips16ex.ll
@@ -2,7 +2,7 @@
 
 ;16: main:
 ;16-NEXT: [[TMP:.*]]:
-;16-NEXT: .set $func_begin0, [[TMP]]
+;16-NEXT: $func_begin0 = [[TMP]]
 ;16-NEXT: .cfi_startproc
 ;16-NEXT: .cfi_personality
 @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1
diff --git a/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll b/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll
index 6299b4e393d9e..3218c77f08c80 100644
--- a/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll
+++ b/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll
@@ -10,6 +10,6 @@ entry:
 }
 
 ; CHECK-LABEL: TestD:
-; CHECK: .set TestC, TestD
-; CHECK-DAG: .set TestB, TestC
-; CHECK-DAG: .set TestA, TestC
+; CHECK: TestC = TestD
+; CHECK-DAG: TestB = TestC
+; CHECK-DAG: TestA = TestC
diff --git a/llvm/test/CodeGen/PowerPC/data-align.ll b/llvm/test/CodeGen/PowerPC/data-align.ll
index bfedec139369c..42dee13d152a9 100644
--- a/llvm/test/CodeGen/PowerPC/data-align.ll
+++ b/llvm/test/CodeGen/PowerPC/data-align.ll
@@ -2,23 +2,23 @@
 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux | FileCheck %s
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux | FileCheck %s
 
-; CHECK:      .set .Li8,
+; CHECK:      .Li8 =
 ; CHECK-NEXT:  .size	.Li8, 1
 @i8 = private constant i8 42
 
-; CHECK:      .set .Li16,
+; CHECK:      .Li16 =
 ; CHECK-NEXT: .size	.Li16, 2
 @i16 = private constant i16 42
 
-; CHECK:      .set .Li32,
+; CHECK:      .Li32 =
 ; CHECK-NEXT: .size	.Li32, 4
 @i32 = private constant i32 42
 
-; CHECK:      .set .Li64,
+; CHECK:      .Li64 =
 ; CHECK-NEXT: .size	.Li64, 8
 @i64 = private constant i64 42
 
-; CHECK:        .set .Li128,
+; CHECK:        .Li128 =
 ; CHECK-NEXT:	.size	.Li128, 16
 @i128 = private constant i128 42
 
diff --git a/llvm/test/CodeGen/WebAssembly/aliases.ll b/llvm/test/CodeGen/WebAssembly/aliases.ll
index 91b57b90df1d6..87b292f53c625 100644
--- a/llvm/test/CodeGen/WebAssembly/aliases.ll
+++ b/llvm/test/CodeGen/WebAssembly/aliases.ll
@@ -4,11 +4,11 @@
 @bar = global i32 42
 
 ; CHECK-DAG: .globl	foo1
-; CHECK-DAG: .set foo1, bar
+; CHECK-DAG: foo1 = bar
 @foo1 = alias i32, ptr @bar
 
 ; CHECK-DAG: .globl	foo2
-; CHECK-DAG: .set foo2, bar
+; CHECK-DAG: foo2 = bar
 @foo2 = alias i32, ptr @bar
 
 %FunTy = type i32()
@@ -19,14 +19,14 @@ define i32 @foo_f() {
 
 ; CHECK-DAG: .weak	bar_f
 ; CHECK-DAG: .type	bar_f,@function
-; CHECK-DAG: .set bar_f, foo_f
+; CHECK-DAG: bar_f = foo_f
 @bar_f = weak alias %FunTy, ptr @foo_f
 
 ; CHECK-DAG: .weak	bar_l
-; CHECK-DAG: .set bar_l, bar
+; CHECK-DAG: bar_l = bar
 @bar_l = linkonce_odr alias i32, ptr @bar
 
-; CHECK-DAG: .set bar_i, bar
+; CHECK-DAG: bar_i = bar
 @bar_i = internal alias i32, ptr @bar
 
 ; CHECK-DAG: .globl	A
@@ -34,24 +34,24 @@ define i32 @foo_f() {
 
 ; CHECK-DAG: .globl	bar_h
 ; CHECK-DAG: .hidden	bar_h
-; CHECK-DAG: .set bar_h, bar
+; CHECK-DAG: bar_h = bar
 @bar_h = hidden alias i32, ptr @bar
 
 ; CHECK-DAG: .globl	bar_p
 ; CHECK-DAG: .protected	bar_p
-; CHECK-DAG: .set bar_p, bar
+; CHECK-DAG: bar_p = bar
 @bar_p = protected alias i32, ptr @bar
 
-; CHECK-DAG: .set test2, bar+4
+; CHECK-DAG: test2 = bar+4
 @test2 = alias i32, getelementptr(i32, ptr @bar, i32 1)
 
-; CHECK-DAG: .set test3, 42
+; CHECK-DAG: test3 = 42
 @test3 = alias i32, inttoptr(i32 42 to ptr)
 
-; CHECK-DAG: .set test4, bar
+; CHECK-DAG: test4 = bar
 @test4 = alias i32, inttoptr(i64 ptrtoint (ptr @bar to i64) to ptr)
 
-; CHECK-DAG: .set test5, test2-bar
+; CHECK-DAG: test5 = test2-bar
 @test5 = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @test2 to i32),
                                  i32 ptrtoint (ptr @bar to i32)) to ptr)
 
diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
index 7a5baa09f95e9..10985de88bf2e 100644
--- a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
+++ b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
@@ -35,7 +35,7 @@
 ; }
 ;-------------------------------------------------------------------------------
 
-; CHECK: .set @feat.00, 2048
+; CHECK: @feat.00 = 2048
 
 ; CHECK: .section .gfids$y
 ; CHECK: .symidx _ZNK7Derived4calcEv
diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard.ll b/llvm/test/CodeGen/WinCFGuard/cfguard.ll
index 2ec2e573f7164..a77d5490ef876 100644
--- a/llvm/test/CodeGen/WinCFGuard/cfguard.ll
+++ b/llvm/test/CodeGen/WinCFGuard/cfguard.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s
 ; Control Flow Guard is currently only available on Windows
 
-; CHECK: .set @feat.00, 2048
+; CHECK: @feat.00 = 2048
 
 ; CHECK: .section .gfids$y
 ; CHECK: .symidx "?address_taken@@YAXXZ"
diff --git a/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
index d59953fb4e37d..cc80f87fda311 100644
--- a/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
+++ b/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
@@ -10,4 +10,4 @@ define weak i32 @pthread_once(ptr, ptr) {
 ; CHECK: pthread_once:
 
 ; CHECK: .weak   __gthrw_pthread_once
-; CHECK: .set __gthrw_pthread_once, pthread_once
+; CHECK: __gthrw_pthread_once = pthread_once
diff --git a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
index 7050889d71029..527684f5a27db 100644
--- a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
+++ b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll
@@ -125,31 +125,31 @@ define internal fastcc i32 @foo(i64 %bar) nounwind ssp {
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:    .data_region jt32
-; CHECK-NEXT:  .set L0_0_set_3, LBB0_3-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_4, LBB0_4-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_5, LBB0_5-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_6, LBB0_6-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_7, LBB0_7-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_8, LBB0_8-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_9, LBB0_9-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_10, LBB0_10-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_11, LBB0_11-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_12, LBB0_12-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_13, LBB0_13-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_14, LBB0_14-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_15, LBB0_15-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_16, LBB0_16-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_17, LBB0_17-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_18, LBB0_18-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_19, LBB0_19-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_20, LBB0_20-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_21, LBB0_21-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_22, LBB0_22-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_23, LBB0_23-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_24, LBB0_24-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_25, LBB0_25-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_26, LBB0_26-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_27, LBB0_27-LJTI0_0
+; CHECK-NEXT:  L0_0_set_3 = LBB0_3-LJTI0_0
+; CHECK-NEXT:  L0_0_set_4 = LBB0_4-LJTI0_0
+; CHECK-NEXT:  L0_0_set_5 = LBB0_5-LJTI0_0
+; CHECK-NEXT:  L0_0_set_6 = LBB0_6-LJTI0_0
+; CHECK-NEXT:  L0_0_set_7 = LBB0_7-LJTI0_0
+; CHECK-NEXT:  L0_0_set_8 = LBB0_8-LJTI0_0
+; CHECK-NEXT:  L0_0_set_9 = LBB0_9-LJTI0_0
+; CHECK-NEXT:  L0_0_set_10 = LBB0_10-LJTI0_0
+; CHECK-NEXT:  L0_0_set_11 = LBB0_11-LJTI0_0
+; CHECK-NEXT:  L0_0_set_12 = LBB0_12-LJTI0_0
+; CHECK-NEXT:  L0_0_set_13 = LBB0_13-LJTI0_0
+; CHECK-NEXT:  L0_0_set_14 = LBB0_14-LJTI0_0
+; CHECK-NEXT:  L0_0_set_15 = LBB0_15-LJTI0_0
+; CHECK-NEXT:  L0_0_set_16 = LBB0_16-LJTI0_0
+; CHECK-NEXT:  L0_0_set_17 = LBB0_17-LJTI0_0
+; CHECK-NEXT:  L0_0_set_18 = LBB0_18-LJTI0_0
+; CHECK-NEXT:  L0_0_set_19 = LBB0_19-LJTI0_0
+; CHECK-NEXT:  L0_0_set_20 = LBB0_20-LJTI0_0
+; CHECK-NEXT:  L0_0_set_21 = LBB0_21-LJTI0_0
+; CHECK-NEXT:  L0_0_set_22 = LBB0_22-LJTI0_0
+; CHECK-NEXT:  L0_0_set_23 = LBB0_23-LJTI0_0
+; CHECK-NEXT:  L0_0_set_24 = LBB0_24-LJTI0_0
+; CHECK-NEXT:  L0_0_set_25 = LBB0_25-LJTI0_0
+; CHECK-NEXT:  L0_0_set_26 = LBB0_26-LJTI0_0
+; CHECK-NEXT:  L0_0_set_27 = LBB0_27-LJTI0_0
 ; CHECK-NEXT:  LJTI0_0:
 ; CHECK-NEXT:    .long L0_0_set_3
 ; CHECK-NEXT:    .long L0_0_set_3
diff --git a/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index cf20cfaced5d0..17df3e10fd3d9 100644
--- a/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -64,15 +64,15 @@ attributes #1 = { nounwind readnone }
 ; CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]]
 
 ; CHECK: Ldebug_loc0:
-; CHECK-NEXT: .set [[SET1:.*]], Lfunc_begin0-Lfunc_begin0
+; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0
 ; CHECK-NEXT: .quad   [[SET1]]
-; CHECK-NEXT: .set [[SET2:.*]], [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0
 ; CHECK-NEXT: .quad   [[SET2]]
 ; CHECK-NEXT: .short  1     ## Loc expr size
 ; CHECK-NEXT: .byte   85
-; CHECK-NEXT: .set [[SET3:.*]], [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0
 ; CHECK-NEXT: .quad   [[SET3]]
-; CHECK-NEXT: .set [[SET4:.*]], [[CLOBBER]]-Lfunc_begin0
+; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0
 ; CHECK-NEXT: .quad   [[SET4]]
 ; CHECK-NEXT: .short  1     ## Loc expr size
 ; CHECK-NEXT: .byte   83
diff --git a/llvm/test/CodeGen/X86/alias-gep.ll b/llvm/test/CodeGen/X86/alias-gep.ll
index 904a611f61d1c..65d2ced6df5ba 100644
--- a/llvm/test/CodeGen/X86/alias-gep.ll
+++ b/llvm/test/CodeGen/X86/alias-gep.ll
@@ -3,17 +3,17 @@
 
 ;MACHO: .globl _offsetSym0
 ;MACHO-NOT: .alt_entry
-;MACHO: .set _offsetSym0, _s
+;MACHO: _offsetSym0 = _s
 ;MACHO: .globl _offsetSym1
 ;MACHO: .alt_entry _offsetSym1
-;MACHO: .set _offsetSym1, _s+8
+;MACHO: _offsetSym1 = _s+8
 
 ;ELF: .globl offsetSym0
 ;ELF-NOT: .alt_entry
-;ELF: .set offsetSym0, s
+;ELF: offsetSym0 = s
 ;ELF: .globl offsetSym1
 ;ELF-NOT: .alt_entry
-;ELF: .set offsetSym1, s+8
+;ELF: offsetSym1 = s+8
 
 %struct.S1 = type { i32, i32, i32 }
 
diff --git a/llvm/test/CodeGen/X86/aliases.ll b/llvm/test/CodeGen/X86/aliases.ll
index 03ea2579d0f8a..d36798820fe83 100644
--- a/llvm/test/CodeGen/X86/aliases.ll
+++ b/llvm/test/CodeGen/X86/aliases.ll
@@ -48,16 +48,16 @@ define i32 @foo_f() {
 ; CHECK-DAG: .protected	bar_p
 @bar_p = protected alias i32, ptr @bar
 
-; CHECK-DAG: .set test2, bar+4
+; CHECK-DAG: test2 = bar+4
 @test2 = alias i32, getelementptr(i32, ptr @bar, i32 1)
 
-; CHECK-DAG: .set test3, 42
+; CHECK-DAG: test3 = 42
 @test3 = alias i32, inttoptr(i32 42 to ptr)
 
-; CHECK-DAG: .set test4, bar
+; CHECK-DAG: test4 = bar
 @test4 = alias i32, inttoptr(i64 ptrtoint (ptr @bar to i64) to ptr)
 
-; CHECK-DAG: .set test5, test2-bar
+; CHECK-DAG: test5 = test2-bar
 @test5 = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @test2 to i32),
                                  i32 ptrtoint (ptr @bar to i32)) to ptr)
 
diff --git a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
index 437d9698ee6bd..ab9fa2287ffad 100644
--- a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
+++ b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll
@@ -44,7 +44,7 @@ return:                                           ; preds = %catch, %entry
 ; CHECK: .LBB0_[[catch:[0-9]+]]:
 
 ; CHECK: .seh_handlerdata
-; CHECK-NEXT: .set .Lfoo$parent_frame_offset, 32
+; CHECK-NEXT: .Lfoo$parent_frame_offset = 32
 ; CHECK-NEXT: .long   (.Llsda_end0-.Llsda_begin0)/16
 ; CHECK-NEXT: .Llsda_begin0:
 ; CHECK-NEXT: .long   .Ltmp0@IMGREL
diff --git a/llvm/test/CodeGen/X86/coff-alias-type.ll b/llvm/test/CodeGen/X86/coff-alias-type.ll
index a242cd2d77d7c..6cc0638b2d4af 100644
--- a/llvm/test/CodeGen/X86/coff-alias-type.ll
+++ b/llvm/test/CodeGen/X86/coff-alias-type.ll
@@ -22,4 +22,4 @@ entry:
 ; CHECK-NEXT: .scl     2
 ; CHECK-NEXT: .type    32
 ; CHECK-NEXT: .endef
-; CHECK-NEXT: .set     _ZN8MyStructC1Ev, _ZN8MyStructC2Ev
+; CHECK-NEXT: _ZN8MyStructC1Ev = _ZN8MyStructC2Ev
diff --git a/llvm/test/CodeGen/X86/coff-comdat.ll b/llvm/test/CodeGen/X86/coff-comdat.ll
index 99b3c0a687afb..084a5a71125ee 100644
--- a/llvm/test/CodeGen/X86/coff-comdat.ll
+++ b/llvm/test/CodeGen/X86/coff-comdat.ll
@@ -89,4 +89,4 @@ $vftable = comdat largest
 ; CHECK: .globl  _f6
 ; CHECK: .section        .rdata,"dr",largest,_vftable
 ; CHECK: .globl  _vftable
-; CHECK: .set _vftable, L_some_name+4
+; CHECK: _vftable = L_some_name+4
diff --git a/llvm/test/CodeGen/X86/coff-feat00.ll b/llvm/test/CodeGen/X86/coff-feat00.ll
index 21dd04ed34c7e..1dcd4276399a9 100644
--- a/llvm/test/CodeGen/X86/coff-feat00.ll
+++ b/llvm/test/CodeGen/X86/coff-feat00.ll
@@ -4,4 +4,4 @@ define i32 @foo() {
   ret i32 0
 }
 
-; CHECK: .set @feat.00, 1
+; CHECK: @feat.00 = 1
diff --git a/llvm/test/CodeGen/X86/dllexport-x86_64.ll b/llvm/test/CodeGen/X86/dllexport-x86_64.ll
index 76add98314f5c..b640e630e47e6 100644
--- a/llvm/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/llvm/test/CodeGen/X86/dllexport-x86_64.ll
@@ -105,23 +105,23 @@ define weak_odr dllexport void @weak1() {
 ; MINGW: .ascii " -export:blob_alias"
 
 ; CHECK: .globl alias
-; CHECK: .set alias, notExported
+; CHECK: alias = notExported
 @alias = dllexport alias void(), ptr @notExported
 
 ; CHECK: .globl aliasNotExported
-; CHECK: .set aliasNotExported, f1
+; CHECK: aliasNotExported = f1
 @aliasNotExported = alias void(), ptr @f1
 
 ; CHECK: .globl alias2
-; CHECK: .set alias2, f1
+; CHECK: alias2 = f1
 @alias2 = dllexport alias void(), ptr @f1
 
 ; CHECK: .globl alias3
-; CHECK: .set alias3, notExported
+; CHECK: alias3 = notExported
 @alias3 = dllexport alias void(), ptr @notExported
 
 ; CHECK: .weak weak_alias
-; CHECK: .set weak_alias, f1
+; CHECK: weak_alias = f1
 @weak_alias = weak_odr dllexport alias void(), ptr @f1
 
 @blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
diff --git a/llvm/test/CodeGen/X86/dllexport.ll b/llvm/test/CodeGen/X86/dllexport.ll
index 09cc03e7729d9..53ecb8e7a1b4f 100644
--- a/llvm/test/CodeGen/X86/dllexport.ll
+++ b/llvm/test/CodeGen/X86/dllexport.ll
@@ -135,17 +135,17 @@ define weak_odr dllexport void @weak1() {
 ; CHECK-GCC: .ascii " -export:weak_alias"
 
 ; CHECK: .globl _alias
-; CHECK: .set _alias, _notExported
+; CHECK: _alias = _notExported
 @alias = dllexport alias void(), ptr @notExported
 
 ; CHECK: .globl _alias2
-; CHECK: .set _alias2, _f1
+; CHECK: _alias2 = _f1
 @alias2 = dllexport alias void(), ptr @f1
 
 ; CHECK: .globl _alias3
-; CHECK: .set _alias3, _notExported
+; CHECK: _alias3 = _notExported
 @alias3 = dllexport alias void(), ptr @notExported
 
 ; CHECK: .weak _weak_alias
-; CHECK: .set _weak_alias, _f1
+; CHECK: _weak_alias = _f1
 @weak_alias = weak_odr dllexport alias void(), ptr @f1
diff --git a/llvm/test/CodeGen/X86/ehcontguard.ll b/llvm/test/CodeGen/X86/ehcontguard.ll
index 740621bc5d025..e868209babce6 100644
--- a/llvm/test/CodeGen/X86/ehcontguard.ll
+++ b/llvm/test/CodeGen/X86/ehcontguard.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s
 ; EHCont Guard is currently only available on Windows
 
-; CHECK: .set @feat.00, 16384
+; CHECK: @feat.00 = 16384
 
 ; CHECK: .section .gehcont$y
 
diff --git a/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll b/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll
index 53b4bc8f1df2e..4840308a5d498 100644
--- a/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll
+++ b/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll
@@ -33,5 +33,5 @@ define private x86_fastcallcc void @dontCrash() {
 }
 
 @alias = alias void(i64, i8, i8, i16), ptr @func
-; CHECK32-LABEL: {{^}}.set @alias@20, @func@20
-; CHECK64-LABEL: {{^}}.set alias, func
+; CHECK32-LABEL: {{^}}@alias@20 = @func@20
+; CHECK64-LABEL: {{^}}alias = func
diff --git a/llvm/test/CodeGen/X86/ifunc-asm.ll b/llvm/test/CodeGen/X86/ifunc-asm.ll
index a4c47da7f4c65..bc8e7e3d7d05b 100644
--- a/llvm/test/CodeGen/X86/ifunc-asm.ll
+++ b/llvm/test/CodeGen/X86/ifunc-asm.ll
@@ -15,7 +15,7 @@ entry:
 @foo_ifunc = ifunc i32 (i32), ptr @foo_resolver
 ; ELF:             .globl foo_ifunc
 ; ELF-NEXT:        .type foo_ifunc,@gnu_indirect_function
-; ELF-NEXT:        .set foo_ifunc, foo_resolver
+; ELF-NEXT:        foo_ifunc = foo_resolver
 
 ; MACHO:           .section __DATA,__data
 ; MACHO-NEXT:      .p2align 3, 0x0
diff --git a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
index b8f0661225f82..5199b1519ebea 100644
--- a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -47,9 +47,9 @@ entry:
   call fastcc void @"\01?fin$0@0@test2@@"(ptr %tmp0)
   ret void
 ; CHECK-LABEL: test2:
-; CHECK:	.set Ltest2$frame_escape_0, 8
-; CHECK:	.set Ltest2$frame_escape_1, 4
-; CHECK:	.set Ltest2$frame_escape_2, 0
+; CHECK:	Ltest2$frame_escape_0 = 8
+; CHECK:	Ltest2$frame_escape_1 = 4
+; CHECK:	Ltest2$frame_escape_2 = 0
 ; CHECK:	calll "?fin$0@0@test2@@"
 }
 
diff --git a/llvm/test/CodeGen/X86/linux-preemption.ll b/llvm/test/CodeGen/X86/linux-preemption.ll
index 8e60b47879754..dc06a34e1c692 100644
--- a/llvm/test/CodeGen/X86/linux-preemption.ll
+++ b/llvm/test/CodeGen/X86/linux-preemption.ll
@@ -285,18 +285,18 @@ define dso_local ptr @comdat_any_local() comdat {
 ; CHECK-NEXT: .Lstrong_local_global$local:
 
 ; COMMON:      .globl strong_default_alias
-; COMMON-NEXT: .set strong_default_alias, aliasee
+; COMMON-NEXT: strong_default_alias = aliasee
 ; COMMON-NEXT: .globl strong_hidden_alias
 ; COMMON-NEXT: .hidden strong_hidden_alias
-; COMMON-NEXT: .set strong_hidden_alias, aliasee
+; COMMON-NEXT: strong_hidden_alias = aliasee
 ; COMMON-NEXT: .weak weak_default_alias
-; COMMON-NEXT: .set weak_default_alias, aliasee
+; COMMON-NEXT: weak_default_alias = aliasee
 ; COMMON-NEXT: .globl strong_local_alias
-; COMMON-NEXT: .set strong_local_alias, aliasee
-; CHECK-NEXT:  .set .Lstrong_local_alias$local, aliasee
+; COMMON-NEXT: strong_local_alias = aliasee
+; CHECK-NEXT:  .Lstrong_local_alias$local = aliasee
 ; COMMON-NEXT: .weak weak_local_alias
-; COMMON-NEXT: .set weak_local_alias, aliasee
+; COMMON-NEXT: weak_local_alias = aliasee
 ; COMMON-NEXT: .globl strong_preemptable_alias
-; COMMON-NEXT: .set strong_preemptable_alias, aliasee
+; COMMON-NEXT: strong_preemptable_alias = aliasee
 ; COMMON-NEXT: .weak weak_preemptable_alias
-; COMMON-NEXT: .set weak_preemptable_alias, aliasee
+; COMMON-NEXT: weak_preemptable_alias = aliasee
diff --git a/llvm/test/CodeGen/X86/localescape.ll b/llvm/test/CodeGen/X86/localescape.ll
index aee7613273f75..57369be489af3 100644
--- a/llvm/test/CodeGen/X86/localescape.ll
+++ b/llvm/test/CodeGen/X86/localescape.ll
@@ -76,8 +76,8 @@ define void @alloc_func(i32 %n) {
 ; X64: .seh_stackalloc 16
 ; X64: leaq    16(%rsp), %rbp
 ; X64: .seh_setframe %rbp, 16
-; X64: .set .Lalloc_func$frame_escape_0, -4
-; X64: .set .Lalloc_func$frame_escape_1, -12
+; X64: .Lalloc_func$frame_escape_0 = -4
+; X64: .Lalloc_func$frame_escape_1 = -12
 ; X64: movl $42, -4(%rbp)
 ; X64: movl $13, -12(%rbp)
 ; X64: movq 	%rbp, %rcx
@@ -88,8 +88,8 @@ define void @alloc_func(i32 %n) {
 ; X86: pushl   %ebp
 ; X86: movl    %esp, %ebp
 ; X86: subl    $12, %esp
-; X86: .set Lalloc_func$frame_escape_0, -4
-; X86: .set Lalloc_func$frame_escape_1, -12
+; X86: Lalloc_func$frame_escape_0 = -4
+; X86: Lalloc_func$frame_escape_1 = -12
 ; X86: movl    $42, -4(%ebp)
 ; X86: movl    $13, -12(%ebp)
 ; X86: pushl   %ebp
@@ -118,8 +118,8 @@ define void @alloc_func_no_frameaddr() {
 ; X64: subq    $40, %rsp
 ; X64: .seh_stackalloc 40
 ; X64: .seh_endprologue
-; X64: .set .Lalloc_func_no_frameaddr$frame_escape_0, 36
-; X64: .set .Lalloc_func_no_frameaddr$frame_escape_1, 32
+; X64: .Lalloc_func_no_frameaddr$frame_escape_0 = 36
+; X64: .Lalloc_func_no_frameaddr$frame_escape_1 = 32
 ; X64: movl $42, 36(%rsp)
 ; X64: movl $13, 32(%rsp)
 ; X64: xorl %ecx, %ecx
@@ -131,8 +131,8 @@ define void @alloc_func_no_frameaddr() {
 
 ; X86-LABEL: alloc_func_no_frameaddr:
 ; X86: subl    $8, %esp
-; X86: .set Lalloc_func_no_frameaddr$frame_escape_0, 4
-; X86: .set Lalloc_func_no_frameaddr$frame_escape_1, 0
+; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 4
+; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 0
 ; X86: movl $42, 4(%esp)
 ; X86: movl $13, (%esp)
 ; X86: pushl $0
diff --git a/llvm/test/CodeGen/X86/pr22019.ll b/llvm/test/CodeGen/X86/pr22019.ll
index 4e78bae204428..262ee5fad7375 100644
--- a/llvm/test/CodeGen/X86/pr22019.ll
+++ b/llvm/test/CodeGen/X86/pr22019.ll
@@ -5,9 +5,9 @@ target triple = "x86_64-unknown-linux-gnu"
 module asm "pselect = __pselect"
 module asm "var = __var"
 module asm "alias = __alias"
-; CHECK: .set pselect, __pselect
-; CHECK: .set var, __var
-; CHECK: .set alias, __alias
+; CHECK: pselect = __pselect
+; CHECK: var = __var
+; CHECK: alias = __alias
 
 ; CHECK: pselect:
 ; CHECK: retq
@@ -19,5 +19,5 @@ define void @pselect() {
 ; CHECK: .long 0
 @var = global i32 0
 
-; CHECK: .set alias, var
+; CHECK: alias = var
 @alias = alias i32, ptr @var
diff --git a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
index 3acf999fc4237..bd51ca76c59d1 100644
--- a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
+++ b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -58,7 +58,7 @@ entry:
 ; CHECK: pushl %edi
 ; CHECK: pushl %esi
 
-; CHECK: .set Lmain$frame_escape_0, [[code_offs:[-0-9]+]]
+; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
 ; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%ebp)
 ; CHECK: movl $L__ehtable$main,
 ;       EH state 0
@@ -78,7 +78,7 @@ entry:
 ; CHECK: calll _printf
 
 ; CHECK: .section .xdata,"dr"
-; CHECK: .set Lmain$parent_frame_offset, [[reg_offs]]
+; CHECK: Lmain$parent_frame_offset = [[reg_offs]]
 ; CHECK: .p2align 2
 ; CHECK: L__ehtable$main
 ; CHECK-NEXT: .long -1
diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll
index 7558c4389be59..d958580e5925b 100644
--- a/llvm/test/CodeGen/X86/seh-catchpad.ll
+++ b/llvm/test/CodeGen/X86/seh-catchpad.ll
@@ -119,7 +119,7 @@ __except.ret:                                     ; preds = %catch.dispatch.7
 ; CHECK:         jmp     .LBB1_[[epilogue]]
 
 ; CHECK:         .seh_handlerdata
-; CHECK-NEXT:         .set .Lmain$parent_frame_offset, 32
+; CHECK-NEXT:         .Lmain$parent_frame_offset = 32
 ; CHECK-NEXT:         .long   (.Llsda_end0-.Llsda_begin0)/16
 ; CHECK-NEXT: .Llsda_begin0:
 ; CHECK-NEXT:         .long   .Ltmp0@IMGREL
diff --git a/llvm/test/CodeGen/X86/seh-finally.ll b/llvm/test/CodeGen/X86/seh-finally.ll
index 28e5cf68dd27e..41823dfb38f0a 100644
--- a/llvm/test/CodeGen/X86/seh-finally.ll
+++ b/llvm/test/CodeGen/X86/seh-finally.ll
@@ -26,7 +26,7 @@ lpad:                                             ; preds = %entry
 ; X64: retq
 
 ; X64: .seh_handlerdata
-; X64-NEXT: .set .Lmain$parent_frame_offset, 32
+; X64-NEXT: .Lmain$parent_frame_offset = 32
 ; X64-NEXT: .long   (.Llsda_end0-.Llsda_begin0)/16 # Number of call sites
 ; X64-NEXT: .Llsda_begin0:
 ; X64-NEXT: .long   .Ltmp0@IMGREL # LabelStart
diff --git a/llvm/test/CodeGen/X86/seh-no-invokes.ll b/llvm/test/CodeGen/X86/seh-no-invokes.ll
index 99b81f0eb1bb4..63e91d33d4006 100644
--- a/llvm/test/CodeGen/X86/seh-no-invokes.ll
+++ b/llvm/test/CodeGen/X86/seh-no-invokes.ll
@@ -15,7 +15,7 @@
 ; label. This was PR30431.
 
 ; CHECK-LABEL: _f:                                     # @f
-; CHECK: .set Lf$parent_frame_offset, 0
+; CHECK: Lf$parent_frame_offset = 0
 ; CHECK: retl
 
 ; CHECK-LABEL: "?filt$0@0@f@@":                        # @"\01?filt$0@0@f@@"
diff --git a/llvm/test/CodeGen/X86/seh-stack-realign.ll b/llvm/test/CodeGen/X86/seh-stack-realign.ll
index 2869bff822314..ae687343cc504 100644
--- a/llvm/test/CodeGen/X86/seh-stack-realign.ll
+++ b/llvm/test/CodeGen/X86/seh-stack-realign.ll
@@ -51,7 +51,7 @@ entry:
 ; Check that we can get the exception code from eax to the printf.
 
 ; CHECK-LABEL: _main:
-; CHECK: .set Lmain$frame_escape_0, [[code_offs:[-0-9]+]]
+; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]]
 ; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%esi)
 ; CHECK: movl $L__ehtable$main,
 ;       EH state 0
@@ -71,7 +71,7 @@ entry:
 ; CHECK: calll _printf
 
 ; CHECK: .section .xdata,"dr"
-; CHECK: .set Lmain$parent_frame_offset, [[reg_offs]]
+; CHECK: Lmain$parent_frame_offset = [[reg_offs]]
 ; CHECK: L__ehtable$main
 ; CHECK-NEXT: .long -1
 ; CHECK-NEXT: .long _filt$main
diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
index d8fcf6d86fa4d..ecbbaf3ab362d 100644
--- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
+++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
@@ -34,12 +34,12 @@ define i32 @foo(i32 %x) nounwind ssp {
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:    .data_region jt32
-; CHECK-NEXT:  .set L0_0_set_2, LBB0_2-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_3, LBB0_3-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_4, LBB0_4-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_5, LBB0_5-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_6, LBB0_6-LJTI0_0
-; CHECK-NEXT:  .set L0_0_set_7, LBB0_7-LJTI0_0
+; CHECK-NEXT:  L0_0_set_2 = LBB0_2-LJTI0_0
+; CHECK-NEXT:  L0_0_set_3 = LBB0_3-LJTI0_0
+; CHECK-NEXT:  L0_0_set_4 = LBB0_4-LJTI0_0
+; CHECK-NEXT:  L0_0_set_5 = LBB0_5-LJTI0_0
+; CHECK-NEXT:  L0_0_set_6 = LBB0_6-LJTI0_0
+; CHECK-NEXT:  L0_0_set_7 = LBB0_7-LJTI0_0
 ; CHECK-NEXT:  LJTI0_0:
 ; CHECK-NEXT:    .long L0_0_set_2
 ; CHECK-NEXT:    .long L0_0_set_3
diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
index 16322cbe9980e..9e44299083d46 100644
--- a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
+++ b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: "?fin$0@0@main@@"
 ; CHECK:      .seh_handlerdata
-; CHECK:      .set ".L?fin$0@0@main@@$parent_frame_offset", 48
+; CHECK:      ".L?fin$0@0@main@@$parent_frame_offset" = 48
 ; CHECK-NEXT:        .long   (.Llsda_end1-.Llsda_begin1)/16
 ; CHECK-NEXT: .Llsda_begin1:
 ; CHECK-NEXT:        .long   .Ltmp
diff --git a/llvm/test/CodeGen/XCore/globals.ll b/llvm/test/CodeGen/XCore/globals.ll
index 134bbb3444b5d..186cfda97104d 100644
--- a/llvm/test/CodeGen/XCore/globals.ll
+++ b/llvm/test/CodeGen/XCore/globals.ll
@@ -127,4 +127,4 @@ entry:
 
 @array = global [10 x i16] zeroinitializer, align 2
 ; CHECK: .globl  array.globound
-; CHECK: .set array.globound, 10
+; CHECK: array.globound = 10
diff --git a/llvm/test/CodeGen/XCore/linkage.ll b/llvm/test/CodeGen/XCore/linkage.ll
index 93edf01cf8a96..5bfb83d964dfa 100644
--- a/llvm/test/CodeGen/XCore/linkage.ll
+++ b/llvm/test/CodeGen/XCore/linkage.ll
@@ -19,14 +19,14 @@ define protected void @test_protected() {
 }
 
 ; CHECK: .globl array.globound
-; CHECK: .set array.globound, 2
+; CHECK: array.globound = 2
 ; CHECK: .weak array.globound
 ; CHECK: .globl array
 ; CHECK: .weak array
 @array = weak global [2 x i32] zeroinitializer
 
 ; CHECK: .globl ac.globound
-; CHECK: .set ac.globound, 2
+; CHECK: ac.globound = 2
 ; CHECK: .weak ac.globound
 ; CHECK: .globl ac
 ; CHECK: .weak ac
diff --git a/llvm/test/DebugInfo/X86/dbg-value-range.ll b/llvm/test/DebugInfo/X86/dbg-value-range.ll
index 0d49b5eeefd1b..a6ede2814aba3 100644
--- a/llvm/test/DebugInfo/X86/dbg-value-range.ll
+++ b/llvm/test/DebugInfo/X86/dbg-value-range.ll
@@ -49,9 +49,9 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone
 ;CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]]
 
 ;CHECK:Ldebug_loc0:
-;CHECK-NEXT: .set Lset{{.*}},
+;CHECK-NEXT: Lset{{.*}} =
 ;CHECK-NEXT:	.quad
-;CHECK-NEXT: .set [[CLOBBER_OFF:Lset.*]], [[CLOBBER]]-{{.*}}
+;CHECK-NEXT: [[CLOBBER_OFF:Lset.*]] = [[CLOBBER]]-{{.*}}
 ;CHECK-NEXT:	.quad	[[CLOBBER_OFF]]
 ;CHECK-NEXT:  .short 1 ## Loc expr size
 ;CHECK-NEXT:	.byte	85 ## DW_OP_reg
diff --git a/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
index 446f31f9a9126..8d4d065641fca 100644
--- a/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
+++ b/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
@@ -64,11 +64,11 @@
 ; PR15408
 ; ASM: Lcu_begin0:
 ; ASM-NOT: Lcu_begin
-; ASM: .set Lset[[LT:[0-9]+]], Lline_table_start0-Lsection_line ## DW_AT_stmt_list
+; ASM: Lset[[LT:[0-9]+]] = Lline_table_start0-Lsection_line ## DW_AT_stmt_list
 ; ASM-NEXT: .long   Lset[[LT]]
 ; ASM: Lcu_begin1:
 ; ASM-NOT: Lcu_begin
-; ASM: .set Lset[[LT:[0-9]+]], Lline_table_start0-Lsection_line ## DW_AT_stmt_list
+; ASM: Lset[[LT:[0-9]+]] = Lline_table_start0-Lsection_line ## DW_AT_stmt_list
 ; ASM-NEXT: .long   Lset[[LT]]
 define i32 @test(i32 %a) nounwind uwtable ssp !dbg !5 {
 entry:
diff --git a/llvm/test/MC/AArch64/basic-a64-instructions.s b/llvm/test/MC/AArch64/basic-a64-instructions.s
index 14ac11f581a55..b2ec5b6ac3678 100644
--- a/llvm/test/MC/AArch64/basic-a64-instructions.s
+++ b/llvm/test/MC/AArch64/basic-a64-instructions.s
@@ -3349,7 +3349,7 @@ _func:
 
 	.equ equvalue, 0x0001
         movk x1, equvalue, lsl 16
-// CHECK: .set equvalue, 1
+// CHECK: equvalue = 1
 // CHECK-NEXT: movk x1, #1, lsl #16 // encoding: [0x21,0x00,0xa0,0xf2]
 
         movz x2, #:abs_g0:sym
diff --git a/llvm/test/MC/AsmParser/assignment.s b/llvm/test/MC/AsmParser/assignment.s
index 6f84a1c338dad..8c8984c12ac36 100644
--- a/llvm/test/MC/AsmParser/assignment.s
+++ b/llvm/test/MC/AsmParser/assignment.s
@@ -1,22 +1,22 @@
 # RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
 
 # CHECK: TEST0:
-# CHECK: .set a, 0
+# CHECK: a = 0
 TEST0:
         a = 0
 
 # CHECK: TEST1:
-# CHECK: .set b, 0
+# CHECK: b = 0
 TEST1:
-        .set b, 0
+        b = 0
 
 # CHECK: .globl	_f1
-# CHECK: .set _f1, 0
+# CHECK: _f1 = 0
         .globl _f1
         _f1 = 0
 
 # CHECK: .globl	_f2
-# CHECK: .set _f2, 0
+# CHECK: _f2 = 0
         .globl _f2
-        .set _f2, 0
+        _f2 = 0
 
diff --git a/llvm/test/MC/AsmParser/directive_include.s b/llvm/test/MC/AsmParser/directive_include.s
index 8d2ef2753b23a..f53bc671fc646 100644
--- a/llvm/test/MC/AsmParser/directive_include.s
+++ b/llvm/test/MC/AsmParser/directive_include.s
@@ -2,7 +2,7 @@
 
 # CHECK: TESTA:
 # CHECK: TEST0:
-# CHECK: .set a, 0
+# CHECK: a = 0
 # CHECK: TESTB:
 TESTA:  
 	.include       "directive\137set.s"   # "\137" is underscore "_"
diff --git a/llvm/test/MC/AsmParser/directive_set.s b/llvm/test/MC/AsmParser/directive_set.s
index 65dd33d1d54fb..4b93de01b309d 100644
--- a/llvm/test/MC/AsmParser/directive_set.s
+++ b/llvm/test/MC/AsmParser/directive_set.s
@@ -1,13 +1,13 @@
 # RUN: llvm-mc -triple i386-unknown-elf %s | FileCheck %s
 
 # CHECK: TEST0:
-# CHECK: .set a, 0
+# CHECK: a = 0
 # CHECK-NOT: .no_dead_strip a
 TEST0:  
-        .set a, 0
+        a = 0
         
 # CHECK: TEST1:
-# CHECK: .set a, 0
+# CHECK: a = 0
 # CHECK-NOT: .no_dead_strip a
 TEST1:  
         .equ a, 0
diff --git a/llvm/test/MC/AsmParser/include.ll b/llvm/test/MC/AsmParser/include.ll
index 3321f0a6a2872..22c9eaf7a36e9 100644
--- a/llvm/test/MC/AsmParser/include.ll
+++ b/llvm/test/MC/AsmParser/include.ll
@@ -10,5 +10,5 @@ entry:
   ret void
 }
 
-; CHECK: .set MODULE, 1
-; CHECK: .set FUNCTION, 1
+; CHECK: MODULE = 1
+; CHECK: FUNCTION = 1
diff --git a/llvm/test/MC/AsmParser/labels.s b/llvm/test/MC/AsmParser/labels.s
index 599ce72c44eef..6a9870b655f2f 100644
--- a/llvm/test/MC/AsmParser/labels.s
+++ b/llvm/test/MC/AsmParser/labels.s
@@ -18,12 +18,12 @@ foo:
 // CHECK: addl $24, a$b+10(%eax)
         addl $24, ("a$b" + 10)(%eax)
 
-// CHECK: .set b$c, 10
+// CHECK: b$c = 10
 "b$c" = 10
 // CHECK: addl $10, %eax
         addl $"b$c", %eax
 
-// CHECK: .set "a 0", 11
+// CHECK: "a 0" = 11
         .set "a 0", 11
 
 // CHECK: .long 11
@@ -49,7 +49,7 @@ foo:
 // CHECX: .lsym "a 8",1
 //        .lsym "a 8", 1
 
-// CHECK: .set "a 9", a-b
+// CHECK: "a 9" = a-b
         .set "a 9", a - b
 
 // CHECK: .long "a 9"
diff --git a/llvm/test/MC/AsmParser/macro-arg-darwin.s b/llvm/test/MC/AsmParser/macro-arg-darwin.s
index 8671107539ce7..88c63dd488be4 100644
--- a/llvm/test/MC/AsmParser/macro-arg-darwin.s
+++ b/llvm/test/MC/AsmParser/macro-arg-darwin.s
@@ -38,7 +38,7 @@ bar
     .endif
 .endm
 .macro bottom
-    .set fred, $0
+    fred = $0
 .endm
 
 .text
@@ -49,7 +49,7 @@ top bar, 42
 // CHECK: _foo:
 // CHECK-NOT: fred
 // CHECK: _bar
-// CHECK-NEXT: .set fred, 42
+// CHECK-NEXT: fred = 42
 
 
 .macro foo
diff --git a/llvm/test/MC/AsmParser/motorola_integers.s b/llvm/test/MC/AsmParser/motorola_integers.s
index c75d9a5e0cb14..1ec2e02e97f02 100644
--- a/llvm/test/MC/AsmParser/motorola_integers.s
+++ b/llvm/test/MC/AsmParser/motorola_integers.s
@@ -1,10 +1,10 @@
 # RUN: llvm-mc -triple i386-unknown-unknown -motorola-integers %s | FileCheck %s
 
-# CHECK: .set a, 2882400009
-.set a, $aBcDeF09
-# CHECK: .set b, 256
-.set b, $0100
-# CHECK: .set c, 10
-.set c, %01010
-# CHECK: .set d, 1
-.set d, %1
+# CHECK: a = 2882400009
+a = $aBcDeF09
+# CHECK: b = 256
+b = $0100
+# CHECK: c = 10
+c = %01010
+# CHECK: d = 1
+d = %1
diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s
index 4a027c6e796ae..f948d650da94d 100644
--- a/llvm/test/MC/Mips/cpsetup.s
+++ b/llvm/test/MC/Mips/cpsetup.s
@@ -196,7 +196,7 @@ IMM_8 = 8
 
 # ALL-LABEL: <t1b>:
 # ASM-LABEL: t1b:
-# ASM-NEXT: .set IMM_8, 8
+# ASM-NEXT: IMM_8 = 8
 
 # O32-NOT: __cerror
 

From 95bbaca6c1dcabb03bd67aabe3aaa4730a11200d Mon Sep 17 00:00:00 2001
From: Rajveer Singh Bharadwaj <rajveer.developer@icloud.com>
Date: Thu, 12 Jun 2025 10:54:01 +0530
Subject: [PATCH 187/851] [AArch64] Extend usage of `XAR` instruction for
 fixed-length operations (#139460)

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 102 +++++--
 llvm/test/CodeGen/AArch64/xar.ll              | 250 +++++++++++++++++-
 2 files changed, 324 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 11cb91fbe02d4..009d69b2b9433 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4606,7 +4606,33 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
     return false;
   }
 
-  if (!Subtarget->hasSHA3())
+  // We have Neon SHA3 XAR operation for v2i64 but for types
+  // v4i32, v8i16, v16i8 we can use SVE operations when SVE2-SHA3
+  // is available.
+  EVT SVT;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::v4i32:
+  case MVT::v2i32:
+    SVT = MVT::nxv4i32;
+    break;
+  case MVT::v8i16:
+  case MVT::v4i16:
+    SVT = MVT::nxv8i16;
+    break;
+  case MVT::v16i8:
+  case MVT::v8i8:
+    SVT = MVT::nxv16i8;
+    break;
+  case MVT::v2i64:
+  case MVT::v1i64:
+    SVT = Subtarget->hasSHA3() ? MVT::v2i64 : MVT::nxv2i64;
+    break;
+  default:
+    return false;
+  }
+
+  if ((!SVT.isScalableVector() && !Subtarget->hasSHA3()) ||
+      (SVT.isScalableVector() && !Subtarget->hasSVE2()))
     return false;
 
   if (N0->getOpcode() != AArch64ISD::VSHL ||
@@ -4632,7 +4658,8 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
   SDValue Imm = CurDAG->getTargetConstant(
       ShAmt, DL, N0.getOperand(1).getValueType(), false);
 
-  if (ShAmt + HsAmt != 64)
+  unsigned VTSizeInBits = VT.getScalarSizeInBits();
+  if (ShAmt + HsAmt != VTSizeInBits)
     return false;
 
   if (!IsXOROperand) {
@@ -4640,33 +4667,76 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
     SDNode *MOV =
         CurDAG->getMachineNode(AArch64::MOVIv2d_ns, DL, MVT::v2i64, Zero);
     SDValue MOVIV = SDValue(MOV, 0);
+
     R1 = N1->getOperand(0);
     R2 = MOVIV;
   }
 
-  // If the input is a v1i64, widen to a v2i64 to use XAR.
-  assert((VT == MVT::v1i64 || VT == MVT::v2i64) && "Unexpected XAR type!");
-  if (VT == MVT::v1i64) {
-    EVT SVT = MVT::v2i64;
+  if (SVT != VT) {
     SDValue Undef =
-        SDValue(CurDAG->getMachineNode(AArch64::IMPLICIT_DEF, DL, SVT), 0);
-    SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+        SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, SVT), 0);
+
+    if (SVT.isScalableVector() && VT.is64BitVector()) {
+      EVT QVT = VT.getDoubleNumVectorElementsVT(*CurDAG->getContext());
+
+      SDValue UndefQ = SDValue(
+          CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, QVT), 0);
+      SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+
+      R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, QVT,
+                                          UndefQ, R1, DSub),
+                   0);
+      if (R2.getValueType() == VT)
+        R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, QVT,
+                                            UndefQ, R2, DSub),
+                     0);
+    }
+
+    SDValue SubReg = CurDAG->getTargetConstant(
+        (SVT.isScalableVector() ? AArch64::zsub : AArch64::dsub), DL, MVT::i32);
+
     R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, Undef,
-                                        R1, DSub),
+                                        R1, SubReg),
                  0);
-    if (R2.getValueType() == MVT::v1i64)
+
+    if (SVT.isScalableVector() || R2.getValueType() != SVT)
       R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT,
-                                          Undef, R2, DSub),
+                                          Undef, R2, SubReg),
                    0);
   }
 
   SDValue Ops[] = {R1, R2, Imm};
-  SDNode *XAR = CurDAG->getMachineNode(AArch64::XAR, DL, MVT::v2i64, Ops);
+  SDNode *XAR = nullptr;
+
+  if (SVT.isScalableVector()) {
+    if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::Int>(
+            SVT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S,
+                  AArch64::XAR_ZZZI_D}))
+      XAR = CurDAG->getMachineNode(Opc, DL, SVT, Ops);
+  } else {
+    XAR = CurDAG->getMachineNode(AArch64::XAR, DL, SVT, Ops);
+  }
 
-  if (VT == MVT::v1i64) {
-    SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
-    XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT,
-                                 SDValue(XAR, 0), DSub);
+  assert(XAR && "Unexpected NULL value for XAR instruction in DAG");
+
+  if (SVT != VT) {
+    if (VT.is64BitVector() && SVT.isScalableVector()) {
+      EVT QVT = VT.getDoubleNumVectorElementsVT(*CurDAG->getContext());
+
+      SDValue ZSub = CurDAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+      SDNode *Q = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, QVT,
+                                         SDValue(XAR, 0), ZSub);
+
+      SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+      XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT,
+                                   SDValue(Q, 0), DSub);
+    } else {
+      SDValue SubReg = CurDAG->getTargetConstant(
+          (SVT.isScalableVector() ? AArch64::zsub : AArch64::dsub), DL,
+          MVT::i32);
+      XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT,
+                                   SDValue(XAR, 0), SubReg);
+    }
   }
   ReplaceNode(N, XAR);
   return true;
diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll
index d682f4f4a1bfb..652617b58eaf3 100644
--- a/llvm/test/CodeGen/AArch64/xar.ll
+++ b/llvm/test/CodeGen/AArch64/xar.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
 ; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s
+
+/* 128-bit vectors */
 
 define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) {
 ; SHA3-LABEL: xar:
@@ -14,6 +17,14 @@ define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) {
 ; NOSHA3-NEXT:    shl v0.2d, v1.2d, #10
 ; NOSHA3-NEXT:    usra v0.2d, v1.2d, #54
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    // kill: def $q1 killed $q1 def $z1
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #54
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
     %a = xor <2 x i64> %x, %y
     %b = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> <i64 10, i64 10>)
     ret <2 x i64> %b
@@ -34,24 +45,40 @@ define <1 x i64> @xar_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; NOSHA3-NEXT:    shl d0, d1, #1
 ; NOSHA3-NEXT:    usra d0, d1, #63
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_v1i64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #63
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
   %v.val = xor <1 x i64> %a, %b
   %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %v.val, <1 x i64> %v.val, <1 x i64> splat (i64 1))
   ret <1 x i64> %fshl
 }
 
-define <2 x i64> @xar_instead_of_or1(<2 x i64> %r) {
-; SHA3-LABEL: xar_instead_of_or1:
+define <2 x i64> @xar_instead_of_or_v2i64(<2 x i64> %r) {
+; SHA3-LABEL: xar_instead_of_or_v2i64:
 ; SHA3:       // %bb.0: // %entry
 ; SHA3-NEXT:    movi v1.2d, #0000000000000000
 ; SHA3-NEXT:    xar v0.2d, v0.2d, v1.2d, #39
 ; SHA3-NEXT:    ret
 ;
-; NOSHA3-LABEL: xar_instead_of_or1:
+; NOSHA3-LABEL: xar_instead_of_or_v2i64:
 ; NOSHA3:       // %bb.0: // %entry
 ; NOSHA3-NEXT:    shl v1.2d, v0.2d, #25
 ; NOSHA3-NEXT:    usra v1.2d, v0.2d, #39
 ; NOSHA3-NEXT:    mov v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v2i64:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #39
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
 entry:
   %or = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 25))
   ret <2 x i64> %or
@@ -72,67 +99,266 @@ define <1 x i64> @xar_instead_of_or_v1i64(<1 x i64> %v.val) {
 ; NOSHA3-NEXT:    usra d1, d0, #63
 ; NOSHA3-NEXT:    fmov d0, d1
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v1i64:
+; SVE2:       // %bb.0:
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #63
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
   %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %v.val, <1 x i64> %v.val, <1 x i64> splat (i64 1))
   ret <1 x i64> %fshl
 }
 
-define <4 x i32> @xar_instead_of_or2(<4 x i32> %r) {
-; SHA3-LABEL: xar_instead_of_or2:
+define <4 x i32> @xar_instead_of_or_v4i32(<4 x i32> %r) {
+; SHA3-LABEL: xar_instead_of_or_v4i32:
 ; SHA3:       // %bb.0: // %entry
 ; SHA3-NEXT:    shl v1.4s, v0.4s, #25
 ; SHA3-NEXT:    usra v1.4s, v0.4s, #7
 ; SHA3-NEXT:    mov v0.16b, v1.16b
 ; SHA3-NEXT:    ret
 ;
-; NOSHA3-LABEL: xar_instead_of_or2:
+; NOSHA3-LABEL: xar_instead_of_or_v4i32:
 ; NOSHA3:       // %bb.0: // %entry
 ; NOSHA3-NEXT:    shl v1.4s, v0.4s, #25
 ; NOSHA3-NEXT:    usra v1.4s, v0.4s, #7
 ; NOSHA3-NEXT:    mov v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v4i32:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #7
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
 entry:
   %or = call <4 x i32> @llvm.fshl.v2i32(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 25))
   ret <4 x i32> %or
 }
 
-define <8 x i16> @xar_instead_of_or3(<8 x i16> %r) {
-; SHA3-LABEL: xar_instead_of_or3:
+define <8 x i16> @xar_instead_of_or_v8i16(<8 x i16> %r) {
+; SHA3-LABEL: xar_instead_of_or_v8i16:
 ; SHA3:       // %bb.0: // %entry
 ; SHA3-NEXT:    shl v1.8h, v0.8h, #9
 ; SHA3-NEXT:    usra v1.8h, v0.8h, #7
 ; SHA3-NEXT:    mov v0.16b, v1.16b
 ; SHA3-NEXT:    ret
 ;
-; NOSHA3-LABEL: xar_instead_of_or3:
+; NOSHA3-LABEL: xar_instead_of_or_v8i16:
 ; NOSHA3:       // %bb.0: // %entry
 ; NOSHA3-NEXT:    shl v1.8h, v0.8h, #9
 ; NOSHA3-NEXT:    usra v1.8h, v0.8h, #7
 ; NOSHA3-NEXT:    mov v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v8i16:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #7
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
 entry:
   %or = call <8 x i16> @llvm.fshl.v2i16(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 25))
   ret <8 x i16> %or
 }
 
-define <16 x i8> @xar_instead_of_or4(<16 x i8> %r) {
-; SHA3-LABEL: xar_instead_of_or4:
+define <16 x i8> @xar_instead_of_or_v16i8(<16 x i8> %r) {
+; SHA3-LABEL: xar_instead_of_or_v16i8:
 ; SHA3:       // %bb.0: // %entry
 ; SHA3-NEXT:    add v1.16b, v0.16b, v0.16b
 ; SHA3-NEXT:    usra v1.16b, v0.16b, #7
 ; SHA3-NEXT:    mov v0.16b, v1.16b
 ; SHA3-NEXT:    ret
 ;
-; NOSHA3-LABEL: xar_instead_of_or4:
+; NOSHA3-LABEL: xar_instead_of_or_v16i8:
 ; NOSHA3:       // %bb.0: // %entry
 ; NOSHA3-NEXT:    add v1.16b, v0.16b, v0.16b
 ; NOSHA3-NEXT:    usra v1.16b, v0.16b, #7
 ; NOSHA3-NEXT:    mov v0.16b, v1.16b
 ; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v16i8:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $q0 killed $q0 def $z0
+; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #7
+; SVE2-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE2-NEXT:    ret
 entry:
   %or = call <16 x i8> @llvm.fshl.v2i8(<16 x i8> %r, <16 x i8> %r, <16 x i8> splat (i8 25))
   ret <16 x i8> %or
 }
 
+/* 64 bit vectors */
+
+define <2 x i32> @xar_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; SHA3-LABEL: xar_v2i32:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; SHA3-NEXT:    shl v0.2s, v1.2s, #25
+; SHA3-NEXT:    usra v0.2s, v1.2s, #7
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_v2i32:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; NOSHA3-NEXT:    shl v0.2s, v1.2s, #25
+; NOSHA3-NEXT:    usra v0.2s, v1.2s, #7
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_v2i32:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %a = xor <2 x i32> %x, %y
+  %b = call <2 x i32> @llvm.fshl(<2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 25, i32 25>)
+  ret <2 x i32> %b
+}
+
+define <2 x i32> @xar_instead_of_or_v2i32(<2 x i32> %r) {
+; SHA3-LABEL: xar_instead_of_or_v2i32:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    shl v1.2s, v0.2s, #25
+; SHA3-NEXT:    usra v1.2s, v0.2s, #7
+; SHA3-NEXT:    fmov d0, d1
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_instead_of_or_v2i32:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    shl v1.2s, v0.2s, #25
+; NOSHA3-NEXT:    usra v1.2s, v0.2s, #7
+; NOSHA3-NEXT:    fmov d0, d1
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v2i32:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %or = call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 25))
+  ret <2 x i32> %or
+}
+
+define <4 x i16> @xar_v4i16(<4 x i16> %x, <4 x i16> %y) {
+; SHA3-LABEL: xar_v4i16:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; SHA3-NEXT:    shl v0.4h, v1.4h, #9
+; SHA3-NEXT:    usra v0.4h, v1.4h, #7
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_v4i16:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; NOSHA3-NEXT:    shl v0.4h, v1.4h, #9
+; NOSHA3-NEXT:    usra v0.4h, v1.4h, #7
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_v4i16:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %a = xor <4 x i16> %x, %y
+  %b = call <4 x i16> @llvm.fshl(<4 x i16> %a, <4 x i16> %a, <4 x i16> splat (i16 25))
+  ret <4 x i16> %b
+}
+
+define <4 x i16> @xar_instead_of_or_v4i16(<4 x i16> %r) {
+; SHA3-LABEL: xar_instead_of_or_v4i16:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    shl v1.4h, v0.4h, #9
+; SHA3-NEXT:    usra v1.4h, v0.4h, #7
+; SHA3-NEXT:    fmov d0, d1
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_instead_of_or_v4i16:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    shl v1.4h, v0.4h, #9
+; NOSHA3-NEXT:    usra v1.4h, v0.4h, #7
+; NOSHA3-NEXT:    fmov d0, d1
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v4i16:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %or = call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 25))
+  ret <4 x i16> %or
+}
+
+define <8 x i8> @xar_v8i8(<8 x i8> %x, <8 x i8> %y) {
+; SHA3-LABEL: xar_v8i8:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; SHA3-NEXT:    add v0.8b, v1.8b, v1.8b
+; SHA3-NEXT:    usra v0.8b, v1.8b, #7
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_v8i8:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    eor v1.8b, v0.8b, v1.8b
+; NOSHA3-NEXT:    add v0.8b, v1.8b, v1.8b
+; NOSHA3-NEXT:    usra v0.8b, v1.8b, #7
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_v8i8:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    // kill: def $d1 killed $d1 def $z1
+; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %a = xor <8 x i8> %x, %y
+  %b = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %a, <8 x i8> splat (i8 25))
+  ret <8 x i8> %b
+}
+
+define <8 x i8> @xar_instead_of_or_v8i8(<8 x i8> %r) {
+; SHA3-LABEL: xar_instead_of_or_v8i8:
+; SHA3:       // %bb.0: // %entry
+; SHA3-NEXT:    add v1.8b, v0.8b, v0.8b
+; SHA3-NEXT:    usra v1.8b, v0.8b, #7
+; SHA3-NEXT:    fmov d0, d1
+; SHA3-NEXT:    ret
+;
+; NOSHA3-LABEL: xar_instead_of_or_v8i8:
+; NOSHA3:       // %bb.0: // %entry
+; NOSHA3-NEXT:    add v1.8b, v0.8b, v0.8b
+; NOSHA3-NEXT:    usra v1.8b, v0.8b, #7
+; NOSHA3-NEXT:    fmov d0, d1
+; NOSHA3-NEXT:    ret
+;
+; SVE2-LABEL: xar_instead_of_or_v8i8:
+; SVE2:       // %bb.0: // %entry
+; SVE2-NEXT:    movi v1.2d, #0000000000000000
+; SVE2-NEXT:    // kill: def $d0 killed $d0 def $z0
+; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #7
+; SVE2-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; SVE2-NEXT:    ret
+entry:
+  %or = call <8 x i8> @llvm.fshl(<8 x i8> %r, <8 x i8> %r, <8 x i8> splat (i8 25))
+  ret <8 x i8> %or
+}
+
 declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)

From 2efff47363f18966cd37461323b5db5418183534 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston@google.com>
Date: Wed, 11 Jun 2025 22:43:06 -0700
Subject: [PATCH 188/851] [NFCI][msan] Show that shadow for partially undefined
 constant vectors is computed as fully initialized (#143823)

This happens because `getShadow(Value *V)` has a special case for fully undefined/poisoned values, but partially undefined values fall-through and are given a clean shadow. This leads to false negatives (no false positives).

Note: MSan correctly handles InsertElementInst, but the shadow of the initial constant vector may still be wrong and be propagated.

Showing that the same approximation happens for other composite types is left as an exercise for the reader.
---
 .../Instrumentation/MemorySanitizer.cpp       |  4 +
 .../MemorySanitizer/partial-poison.ll         | 78 +++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index c2315d5de7041..d3c6a7151ec37 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2085,6 +2085,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       assert(ShadowPtr && "Could not find shadow for an argument");
       return ShadowPtr;
     }
+
+    // TODO: Partially undefined vectors are handled by the fall-through case
+    //       below (see partial-poison.ll); this causes false negatives.
+
     // For everything else the shadow is zero.
     return getCleanShadow(V);
   }
diff --git a/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll
new file mode 100644
index 0000000000000..5164441c17e10
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -passes='msan' 2>&1 | FileCheck %s
+;
+; Test case to show that MSan computes shadows for partially poisoned vectors
+; as fully initialized, resulting in false negatives.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <2 x i64> @left_poison(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @left_poison(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> <i64 poison, i64 42>
+;
+  ret <2 x i64> <i64 poison, i64 42>
+}
+
+define <2 x i64> @right_poison(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @right_poison(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> <i64 42, i64 poison>
+;
+  ret <2 x i64> <i64 42, i64 poison>
+}
+
+define <2 x i64> @full_poison(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @full_poison(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> splat (i64 -1), ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> poison
+;
+  ret <2 x i64> <i64 poison, i64 poison>
+}
+
+define <2 x i64> @no_poison_or_undef(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @no_poison_or_undef(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> splat (i64 42)
+;
+  ret <2 x i64> <i64 42, i64 42>
+}
+
+define <2 x i64> @left_undef(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @left_undef(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> <i64 undef, i64 42>
+;
+  ret <2 x i64> <i64 undef, i64 42>
+}
+
+define <2 x i64> @right_undef(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @right_undef(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> <i64 42, i64 undef>
+;
+  ret <2 x i64> <i64 42, i64 undef>
+}
+
+define <2 x i64> @full_undef(ptr %add.ptr) sanitize_memory {
+; CHECK-LABEL: define <2 x i64> @full_undef(
+; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    store <2 x i64> splat (i64 -1), ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> undef
+;
+  ret <2 x i64> <i64 undef, i64 undef>
+}

From bec85f3b187f57713e01191381c88134e122bd35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 12 Jun 2025 08:58:26 +0300
Subject: [PATCH 189/851] [LLD] [COFF] [test] Readd lto-late-arm.ll (#143494)

This testcase was removed in 4cafd28b7dd92080103d11cccc78d9a2f01e1242,
as a082f665f85b1002ab22af263eeafceca5288657 had made it no longer
trigger the error that it was supposed to do. (Because the latter of
those two commits makes the symbol "__rt_sdiv" be included among the
potential libcalls listed by lto::LTO::getRuntimeLibcallSymbols().)

Readd the test as a positive test, making sure that such libcalls can
get linked.

We do have preexisting test coverage for LTO libcalls overall in
libcall-archive.ll, but readd this test to cover specifically the ARM
division helper functions as well.
---
 lld/test/COFF/lto-late-arm.ll | 38 +++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 lld/test/COFF/lto-late-arm.ll

diff --git a/lld/test/COFF/lto-late-arm.ll b/lld/test/COFF/lto-late-arm.ll
new file mode 100644
index 0000000000000..1070fc52a0136
--- /dev/null
+++ b/lld/test/COFF/lto-late-arm.ll
@@ -0,0 +1,38 @@
+; REQUIRES: arm
+
+;; A bitcode file can generate undefined references to symbols that weren't
+;; listed as undefined on the bitcode file itself, when lowering produces
+;; calls to e.g. builtin helper functions. Ideally all those functions are
+;; listed by lto::LTO::getRuntimeLibcallSymbols(), then we successfully
+;; can link cases when the helper functions are provided as bitcode too.
+;; (In practice, compiler-rt builtins are always compiled with -fno-lto, so
+;; this shouldn't really happen anyway.)
+
+; RUN: rm -rf %t.dir
+; RUN: split-file %s %t.dir
+; RUN: llvm-as %t.dir/main.ll -o %t.main.obj
+; RUN: llvm-as %t.dir/sdiv.ll -o %t.sdiv.obj
+; RUN: llvm-ar rcs %t.sdiv.lib %t.sdiv.obj
+
+; RUN: lld-link /entry:entry %t.main.obj %t.sdiv.lib /out:%t.exe /subsystem:console
+
+;--- main.ll
+target datalayout = "e-m:w-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7-w64-windows-gnu"
+
+@num = dso_local global i32 100
+
+define dso_local arm_aapcs_vfpcc i32 @entry(i32 %param) {
+entry:
+  %0 = load i32, ptr @num
+  %div = sdiv i32 %0, %param
+  ret i32 %div
+}
+;--- sdiv.ll
+target datalayout = "e-m:w-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7-w64-windows-gnu"
+
+define dso_local arm_aapcs_vfpcc void @__rt_sdiv() {
+entry:
+  ret void
+}

From 9d491bc602c2d9730cb42fe25f0753471a3af389 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 12 Jun 2025 07:03:09 +0100
Subject: [PATCH 190/851] [AArch64][GlobalISel] Enable extract_vec_elt_combines
 postlegalization.

---
 llvm/lib/Target/AArch64/AArch64Combine.td     |  2 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll | 51 +++++++------------
 2 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 571e2692cbfff..ca09598464d13 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -361,7 +361,7 @@ def AArch64PostLegalizerCombiner
                         ptr_add_immed_chain, overlapping_and,
                         split_store_zero_128, undef_combines,
                         select_to_minmax, or_to_bsp, combine_concat_vector,
-                        commute_constant_to_rhs,
+                        commute_constant_to_rhs, extract_vec_elt_combines,
                         push_freeze_to_prevent_poison_from_propagating,
                         combine_mul_cmlt, combine_use_vector_truncate, extmultomull]> {
 }
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 77483ebb2235c..d6d323530946e 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -596,23 +596,15 @@ define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) {
 ; CHECK-GI-NEXT:    mov.b v1[3], w8
 ; CHECK-GI-NEXT:    cmeq.8b v0, v0, v1
 ; CHECK-GI-NEXT:    mvn.8b v0, v0
-; CHECK-GI-NEXT:    umov.b w8, v0[0]
-; CHECK-GI-NEXT:    umov.b w9, v0[1]
-; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    umov.b w8, v0[2]
-; CHECK-GI-NEXT:    mov.s v1[1], w9
-; CHECK-GI-NEXT:    umov.b w9, v0[3]
-; CHECK-GI-NEXT:    mov.s v1[2], w8
-; CHECK-GI-NEXT:    mov.s v1[3], w9
-; CHECK-GI-NEXT:    mov.s w8, v1[1]
-; CHECK-GI-NEXT:    mov.s w9, v1[2]
-; CHECK-GI-NEXT:    fmov w11, s1
-; CHECK-GI-NEXT:    mov.s w10, v1[3]
+; CHECK-GI-NEXT:    umov.b w8, v0[1]
+; CHECK-GI-NEXT:    umov.b w9, v0[0]
+; CHECK-GI-NEXT:    umov.b w10, v0[2]
+; CHECK-GI-NEXT:    umov.b w11, v0[3]
 ; CHECK-GI-NEXT:    and w8, w8, #0x1
-; CHECK-GI-NEXT:    bfi w11, w8, #1, #31
-; CHECK-GI-NEXT:    and w8, w9, #0x1
-; CHECK-GI-NEXT:    and w9, w10, #0x1
-; CHECK-GI-NEXT:    orr w8, w11, w8, lsl #2
+; CHECK-GI-NEXT:    bfi w9, w8, #1, #31
+; CHECK-GI-NEXT:    and w8, w10, #0x1
+; CHECK-GI-NEXT:    orr w8, w9, w8, lsl #2
+; CHECK-GI-NEXT:    and w9, w11, #0x1
 ; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #3
 ; CHECK-GI-NEXT:    strb w8, [sp, #15]
 ; CHECK-GI-NEXT:    and w0, w8, #0xff
@@ -871,28 +863,19 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
 ; CHECK-GI-NEXT:    cmtst.4s v1, v1, v1
 ; CHECK-GI-NEXT:    mov.s w8, v1[1]
 ; CHECK-GI-NEXT:    mov.s w9, v1[2]
+; CHECK-GI-NEXT:    fmov w11, s1
 ; CHECK-GI-NEXT:    mov.s w10, v1[3]
-; CHECK-GI-NEXT:    mov.h v1[1], w8
-; CHECK-GI-NEXT:    mov.s w8, v0[1]
-; CHECK-GI-NEXT:    mov.h v1[2], w9
-; CHECK-GI-NEXT:    mov.h v1[3], w10
-; CHECK-GI-NEXT:    mov.h v1[4], v0[0]
-; CHECK-GI-NEXT:    mov.h v1[5], w8
-; CHECK-GI-NEXT:    umov.h w8, v1[1]
-; CHECK-GI-NEXT:    umov.h w9, v1[0]
-; CHECK-GI-NEXT:    umov.h w10, v1[2]
-; CHECK-GI-NEXT:    umov.h w11, v1[3]
 ; CHECK-GI-NEXT:    and w8, w8, #0x1
-; CHECK-GI-NEXT:    bfi w9, w8, #1, #31
-; CHECK-GI-NEXT:    and w8, w10, #0x1
-; CHECK-GI-NEXT:    umov.h w10, v1[4]
-; CHECK-GI-NEXT:    orr w8, w9, w8, lsl #2
-; CHECK-GI-NEXT:    and w9, w11, #0x1
-; CHECK-GI-NEXT:    umov.h w11, v1[5]
-; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #3
+; CHECK-GI-NEXT:    bfi w11, w8, #1, #31
+; CHECK-GI-NEXT:    and w8, w9, #0x1
 ; CHECK-GI-NEXT:    and w9, w10, #0x1
+; CHECK-GI-NEXT:    mov.s w10, v0[1]
+; CHECK-GI-NEXT:    orr w8, w11, w8, lsl #2
+; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #3
+; CHECK-GI-NEXT:    fmov w9, s0
+; CHECK-GI-NEXT:    and w9, w9, #0x1
 ; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #4
-; CHECK-GI-NEXT:    and w9, w11, #0x1
+; CHECK-GI-NEXT:    and w9, w10, #0x1
 ; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #5
 ; CHECK-GI-NEXT:    and w8, w8, #0x3f
 ; CHECK-GI-NEXT:    strb w8, [sp, #15]

From 3f0cf742ac4eb3437450f8f263081ea951248851 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 12 Jun 2025 14:40:38 +0800
Subject: [PATCH 191/851] [C++20] [Modules] [Reduced BMI] Don't write
 specializations with local args

Close https://github.com/llvm/llvm-project/issues/119947

As discussed in the above thread, we shouldn't write specializations
with local args in reduced BMI. Since users can't find such
specializations any way.
---
 clang/lib/Serialization/ASTWriterDecl.cpp | 45 +++++++++++++++++++
 clang/test/Modules/pr119947.cppm          | 54 +++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 clang/test/Modules/pr119947.cppm

diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index 8f82324a27535..052cb5a253bf7 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -221,6 +221,48 @@ namespace clang {
         Record.AddDeclRef(F.second);
     }
 
+    template <typename T> bool shouldSkipWritingSpecializations(T *Spec) {
+      // Now we will only avoid writing specializations if we're generating
+      // reduced BMI.
+      if (!GeneratingReducedBMI)
+        return false;
+
+      assert((isa<FunctionDecl, ClassTemplateSpecializationDecl,
+                  VarTemplateSpecializationDecl>(Spec)));
+
+      ArrayRef<TemplateArgument> Args;
+      if (auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(Spec))
+        Args = CTSD->getTemplateArgs().asArray();
+      else if (auto *VTSD = dyn_cast<VarTemplateSpecializationDecl>(Spec))
+        Args = VTSD->getTemplateArgs().asArray();
+      else
+        Args = cast<FunctionDecl>(Spec)
+                   ->getTemplateSpecializationArgs()
+                   ->asArray();
+
+      // If there is any template argument is TULocal, we can avoid writing the
+      // specialization since the consumers of reduced BMI won't get the
+      // specialization anyway.
+      for (const TemplateArgument &TA : Args) {
+        switch (TA.getKind()) {
+        case TemplateArgument::Type: {
+          Linkage L = TA.getAsType()->getLinkage();
+          if (!isExternallyVisible(L))
+            return true;
+          break;
+        }
+        case TemplateArgument::Declaration:
+          if (!TA.getAsDecl()->isExternallyVisible())
+            return true;
+          break;
+        default:
+          break;
+        }
+      }
+
+      return false;
+    }
+
     /// Add to the record the first template specialization from each module
     /// file that provides a declaration of D. We store the DeclId and an
     /// ODRHash of the template arguments of D which should provide enough
@@ -235,6 +277,9 @@ namespace clang {
       CollectFirstDeclFromEachModule(D, /*IncludeLocal*/ true, Firsts);
 
       for (const auto &F : Firsts) {
+        if (shouldSkipWritingSpecializations(F.second))
+          continue;
+
         if (isa<ClassTemplatePartialSpecializationDecl,
                 VarTemplatePartialSpecializationDecl>(F.second))
           PartialSpecsInMap.push_back(F.second);
diff --git a/clang/test/Modules/pr119947.cppm b/clang/test/Modules/pr119947.cppm
new file mode 100644
index 0000000000000..40de2cad3c0d7
--- /dev/null
+++ b/clang/test/Modules/pr119947.cppm
@@ -0,0 +1,54 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -emit-llvm -o -
+
+
+//--- a.cppm
+export module a;
+
+struct a_inner {
+	~a_inner() {
+	}
+	void f(auto) {
+	}
+};
+
+export template<typename T>
+struct a {
+	a() {
+		struct local {};
+		inner.f(local());
+	}
+private:
+	a_inner inner;
+};
+
+
+namespace {
+
+struct s {
+};
+
+} // namespace
+
+void f() {
+	a<s> x;
+}
+
+//--- use.cpp
+import a;
+
+namespace {
+
+struct s {
+};
+
+} // namespace
+
+void g() {
+	a<s> x;
+}
+

From 6157028fea93ff14af18b173dd01eb431cfb6aef Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 09:19:50 +0200
Subject: [PATCH 192/851] [BasicAA][ValueTracking] Increase depth for
 underlying object search (#143714)

This depth limits a linear search (rather than the usual potentially
exponential one) and is not particularly important for compile-time in
practice.

The change in #137297 is going to increase the length of GEP chains, so
I'd like to increase this limit a bit to reduce the chance of
regressions (https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2419
showed a 13% increase in SearchLimitReached). There is no particular
significance to the new value of 10.

Compile-time is neutral.
---
 llvm/include/llvm/Analysis/ValueTracking.h    |  2 +-
 .../BasicAA/gep-decomposition-limit.ll        | 38 +++++++++++--------
 .../underlying-objects-2.ll                   |  5 ++-
 .../inline-noalias-unidentify-object.ll       | 22 +++++++----
 4 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h
index 32ab9733d13c9..e215c90b5a72a 100644
--- a/llvm/include/llvm/Analysis/ValueTracking.h
+++ b/llvm/include/llvm/Analysis/ValueTracking.h
@@ -47,7 +47,7 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6;
 
 /// The max limit of the search depth in DecomposeGEPExpression() and
 /// getUnderlyingObject().
-constexpr unsigned MaxLookupSearchDepth = 6;
+constexpr unsigned MaxLookupSearchDepth = 10;
 
 /// Determine which bits of V are known to be either zero or one and return
 /// them in the KnownZero/KnownOne bit sets.
diff --git a/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll b/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll
index 23a96ebca8485..a256ececbe565 100644
--- a/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll
+++ b/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll
@@ -2,22 +2,22 @@
 
 ; CHECK-LABEL: Function: test
 ;; Before limit:
-; CHECK-DAG: MustAlias: i8* %gep.add5, i8* %gep.inc5
-; CHECK-DAG: NoAlias: i8* %gep.inc3, i8* %gep.inc5
-; CHECK-DAG: NoAlias: i8* %gep.inc4, i8* %gep.inc5
+; CHECK-DAG: MustAlias: i8* %gep.add9, i8* %gep.inc9
+; CHECK-DAG: NoAlias: i8* %gep.inc7, i8* %gep.inc9
+; CHECK-DAG: NoAlias: i8* %gep.inc8, i8* %gep.inc9
 ;; At limit:
-; CHECK-DAG: MustAlias: i8* %gep.add6, i8* %gep.inc6
-; CHECK-DAG: NoAlias: i8* %gep.inc4, i8* %gep.inc6
-; CHECK-DAG: NoAlias: i8* %gep.inc5, i8* %gep.inc6
+; CHECK-DAG: MustAlias: i8* %gep.add10, i8* %gep.inc10
+; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc8
+; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc9
 ;; After limit:
-; CHECK-DAG: MayAlias: i8* %gep.add7, i8* %gep.inc7
-; CHECK-DAG: MayAlias: i8* %gep.inc5, i8* %gep.inc7
-; CHECK-DAG: NoAlias: i8* %gep.inc6, i8* %gep.inc7
+; CHECK-DAG: MayAlias: i8* %gep.add11, i8* %gep.inc11
+; CHECK-DAG: MayAlias: i8* %gep.inc11, i8* %gep.inc9
+; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc11
 
 define void @test(ptr %base) {
-  %gep.add5 = getelementptr i8, ptr %base, i64 5
-  %gep.add6 = getelementptr i8, ptr %base, i64 6
-  %gep.add7 = getelementptr i8, ptr %base, i64 7
+  %gep.add9 = getelementptr i8, ptr %base, i64 9
+  %gep.add10 = getelementptr i8, ptr %base, i64 10
+  %gep.add11 = getelementptr i8, ptr %base, i64 11
 
   %gep.inc1 = getelementptr i8, ptr %base, i64 1
   %gep.inc2 = getelementptr i8, ptr %gep.inc1, i64 1
@@ -26,15 +26,23 @@ define void @test(ptr %base) {
   %gep.inc5 = getelementptr i8, ptr %gep.inc4, i64 1
   %gep.inc6 = getelementptr i8, ptr %gep.inc5, i64 1
   %gep.inc7 = getelementptr i8, ptr %gep.inc6, i64 1
+  %gep.inc8 = getelementptr i8, ptr %gep.inc7, i64 1
+  %gep.inc9 = getelementptr i8, ptr %gep.inc8, i64 1
+  %gep.inc10 = getelementptr i8, ptr %gep.inc9, i64 1
+  %gep.inc11 = getelementptr i8, ptr %gep.inc10, i64 1
 
-  load i8, ptr %gep.add5
-  load i8, ptr %gep.add6
-  load i8, ptr %gep.add7
+  load i8, ptr %gep.add9
+  load i8, ptr %gep.add10
+  load i8, ptr %gep.add11
   load i8, ptr %gep.inc3
   load i8, ptr %gep.inc4
   load i8, ptr %gep.inc5
   load i8, ptr %gep.inc6
   load i8, ptr %gep.inc7
+  load i8, ptr %gep.inc8
+  load i8, ptr %gep.inc9
+  load i8, ptr %gep.inc10
+  load i8, ptr %gep.inc11
 
   ret void
 }
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll
index abfdff79dc113..1d3512128678e 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll
@@ -127,9 +127,12 @@ for_j.body:
   %gepB7 = getelementptr inbounds i8, ptr %gepB6, i64 0
   %gepB8 = getelementptr inbounds i8, ptr %gepB7, i64 0
   %gepB9 = getelementptr inbounds i8, ptr %gepB8, i64 0
+  %gepB10 = getelementptr inbounds i8, ptr %gepB9, i64 0
+  %gepB11 = getelementptr inbounds i8, ptr %gepB10, i64 0
+  %gepB12 = getelementptr inbounds i8, ptr %gepB11, i64 0
 
   %loadPrev = load i8, ptr %gepPrev, align 1
-  %loadB = load i8, ptr %gepB9, align 1
+  %loadB = load i8, ptr %gepB12, align 1
 
   %mul = mul i8 %loadPrev, %loadB
 
diff --git a/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll b/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll
index 54e9ee0918ae8..b7ba1b32238a7 100644
--- a/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll
+++ b/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll
@@ -3,15 +3,18 @@
 define i32 @caller(ptr %p) {
 ; CHECK-LABEL: define i32 @caller(ptr %p) {
 ; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
-; CHECK-NEXT:    [[P_8_I:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8
-; CHECK-NEXT:    [[V_I:%.*]] = load i32, ptr [[P_8_I]], align 4, !alias.scope !0
-; CHECK-NEXT:    [[P_1_I:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT:    [[P_11_I:%.*]] = getelementptr i8, ptr %p, i64 11
+; CHECK-NEXT:    [[V_I:%.*]] = load i32, ptr [[P_11_I]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[P_1_I:%.*]] = getelementptr i8, ptr %p, i64 1
 ; CHECK-NEXT:    [[P_2_I:%.*]] = getelementptr i8, ptr [[P_1_I]], i64 1
 ; CHECK-NEXT:    [[P_3_I:%.*]] = getelementptr i8, ptr [[P_2_I]], i64 1
 ; CHECK-NEXT:    [[P_4_I:%.*]] = getelementptr i8, ptr [[P_3_I]], i64 1
 ; CHECK-NEXT:    [[P_5_I:%.*]] = getelementptr i8, ptr [[P_4_I]], i64 1
 ; CHECK-NEXT:    [[P_6_I:%.*]] = getelementptr i8, ptr [[P_5_I]], i64 1
-; CHECK-NEXT:    [[P_7_I:%.*]] = getelementptr i8, ptr [[P_6_I]], i64 1
+; CHECK-NEXT:    [[P_7_I1:%.*]] = getelementptr i8, ptr [[P_6_I]], i64 1
+; CHECK-NEXT:    [[P_8_I:%.*]] = getelementptr i8, ptr [[P_7_I1]], i64 1
+; CHECK-NEXT:    [[P_9_I:%.*]] = getelementptr i8, ptr [[P_8_I]], i64 1
+; CHECK-NEXT:    [[P_7_I:%.*]] = getelementptr i8, ptr [[P_9_I]], i64 1
 ; CHECK-NEXT:    [[P_8_ALIAS_I:%.*]] = getelementptr i8, ptr [[P_7_I]], i64 1
 ; CHECK-NEXT:    store i32 42, ptr [[P_8_ALIAS_I]], align 4
 ; CHECK-NEXT:    ret i32 [[V_I]]
@@ -21,8 +24,8 @@ define i32 @caller(ptr %p) {
 }
 
 define internal i32 @callee(ptr noalias %p) {
-  %p.8 = getelementptr i8, ptr %p, i64 8
-  %v = load i32, ptr %p.8
+  %p.11 = getelementptr i8, ptr %p, i64 11
+  %v = load i32, ptr %p.11
   %p.1 = getelementptr i8, ptr %p, i64 1
   %p.2 = getelementptr i8, ptr %p.1, i64 1
   %p.3 = getelementptr i8, ptr %p.2, i64 1
@@ -30,7 +33,10 @@ define internal i32 @callee(ptr noalias %p) {
   %p.5 = getelementptr i8, ptr %p.4, i64 1
   %p.6 = getelementptr i8, ptr %p.5, i64 1
   %p.7 = getelementptr i8, ptr %p.6, i64 1
-  %p.8.alias = getelementptr i8, ptr %p.7, i64 1
-  store i32 42, ptr %p.8.alias
+  %p.8 = getelementptr i8, ptr %p.7, i64 1
+  %p.9 = getelementptr i8, ptr %p.8, i64 1
+  %p.10 = getelementptr i8, ptr %p.9, i64 1
+  %p.11.alias = getelementptr i8, ptr %p.10, i64 1
+  store i32 42, ptr %p.11.alias
   ret i32 %v
 }

From 77062244ed56be61aecda28d6fede3432545f741 Mon Sep 17 00:00:00 2001
From: Mikael Holmen <mikael.holmen@ericsson.com>
Date: Thu, 12 Jun 2025 09:29:40 +0200
Subject: [PATCH 193/851] Fix two instances of -Wparentheses warnings [NFC]

Add parentheses around the assert conditions.

Without this gcc warned like
 ../lib/Target/AMDGPU/GCNSchedStrategy.cpp:2250: warning: suggest parentheses around '&&' within '||' [-Wparentheses]
  2250 |          NewMI != RegionBounds.second && "cannot remove at region end");
and
 ../../clang/lib/Sema/SemaOverload.cpp:11326:39: warning: suggest parentheses around '&&' within '||' [-Wparentheses]
 11326 |          DeferredCandidatesCount == 0 &&
       |          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~
 11327 |              "Unexpected deferred template candidates");
       |              ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
---
 clang/lib/Sema/SemaOverload.cpp             | 6 +++---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index cf455f4588de3..89e86f49a3ca8 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -11322,9 +11322,9 @@ OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S,
                                                            SourceLocation Loc,
                                                            iterator &Best) {
 
-  assert(shouldDeferTemplateArgumentDeduction(S.getLangOpts()) ||
-         DeferredCandidatesCount == 0 &&
-             "Unexpected deferred template candidates");
+  assert((shouldDeferTemplateArgumentDeduction(S.getLangOpts()) ||
+          DeferredCandidatesCount == 0) &&
+         "Unexpected deferred template candidates");
 
   bool TwoPhaseResolution =
       DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 0f80462050cda..7165cf89ca45d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -2246,8 +2246,8 @@ void PreRARematStage::finalizeGCNSchedStage() {
 void GCNScheduleDAGMILive::updateRegionBoundaries(
     RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
     MachineInstr *NewMI) {
-  assert(!NewMI ||
-         NewMI != RegionBounds.second && "cannot remove at region end");
+  assert((!NewMI || NewMI != RegionBounds.second) &&
+         "cannot remove at region end");
 
   if (RegionBounds.first == RegionBounds.second) {
     assert(NewMI && "cannot remove from an empty region");

From 2d35b568ef949717e35df664d4d9352eddbffbfd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 09:27:24 +0100
Subject: [PATCH 194/851] [X86] bsf.ll - add icmp_ne coverage to bsf
 passthrough tests

---
 llvm/test/CodeGen/X86/bsf.ll | 56 ++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
index 58929115baf54..312f94c041235 100644
--- a/llvm/test/CodeGen/X86/bsf.ll
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -38,13 +38,13 @@ define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testb %al, %al
-; X86-NEXT:    je .LBB1_1
+; X86-NEXT:    jne .LBB1_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB1_1:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -56,8 +56,8 @@ define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind {
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 true)
-  %2 = icmp eq i8 %x, 0
-  %3 = select i1 %2, i8 %y, i8 %1
+  %2 = icmp ne i8 %x, 0
+  %3 = select i1 %2, i8 %1, i8 %y
   ret i8 %3
 }
 
@@ -66,14 +66,14 @@ define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testw %ax, %ax
-; X86-NEXT:    je .LBB2_1
+; X86-NEXT:    jne .LBB2_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    orl $65536, %eax # imm = 0x10000
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB2_1:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl $65536, %eax # imm = 0x10000
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
@@ -87,8 +87,8 @@ define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind {
 ; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
   %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
-  %2 = icmp eq i16 %x, 0
-  %3 = select i1 %2, i16 %y, i16 %1
+  %2 = icmp ne i16 %x, 0
+  %3 = select i1 %2, i16 %1, i16 %y
   ret i16 %3
 }
 
@@ -157,12 +157,12 @@ define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB5_1
+; X86-NEXT:    jne .LBB5_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB5_1:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    rep bsfl %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: cmov_bsf32_undef:
@@ -171,8 +171,8 @@ define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind {
 ; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
-  %2 = icmp eq i32 %x, 0
-  %3 = select i1 %2, i32 %y, i32 %1
+  %2 = icmp ne i32 %x, 0
+  %3 = select i1 %2, i32 %1, i32 %y
   ret i32 %3
 }
 
@@ -199,7 +199,7 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl $64, %eax
 ; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    jne .LBB6_7
-; X86-NEXT:  .LBB6_6:
+; X86-NEXT:  .LBB6_6: # %cond.end
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:  .LBB6_7: # %cond.end
@@ -218,8 +218,8 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
 ; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
-  %2 = icmp eq i64 %x, 0
-  %3 = select i1 %2, i64 %y, i64 %1
+  %2 = icmp ne i64 %x, 0
+  %3 = select i1 %2, i64 %1, i64 %y
   ret i64 %3
 }
 
@@ -375,10 +375,10 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    orl %ebx, %ebp
 ; X86-NEXT:    orl %edi, %ebp
 ; X86-NEXT:    je .LBB9_11
-; X86-NEXT:  # %bb.1: # %select.false.sink
+; X86-NEXT:  # %bb.1: # %select.true.sink
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_2
-; X86-NEXT:  # %bb.3: # %select.false.sink
+; X86-NEXT:  # %bb.3: # %select.true.sink
 ; X86-NEXT:    rep bsfl %ecx, %edi
 ; X86-NEXT:    addl $32, %edi
 ; X86-NEXT:    testl %ebx, %ebx
@@ -402,20 +402,20 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    rep bsfl %edx, %edi
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    jne .LBB9_5
-; X86-NEXT:  .LBB9_6: # %select.false.sink
+; X86-NEXT:  .LBB9_6: # %select.true.sink
 ; X86-NEXT:    rep bsfl %esi, %esi
 ; X86-NEXT:    addl $32, %esi
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    jne .LBB9_9
-; X86-NEXT:  .LBB9_8: # %select.false.sink
+; X86-NEXT:  .LBB9_8: # %select.true.sink
 ; X86-NEXT:    addl $64, %esi
 ; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:  .LBB9_9: # %select.false.sink
+; X86-NEXT:  .LBB9_9: # %select.true.sink
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
 ; X86-NEXT:    movl $0, 4(%eax)
-; X86-NEXT:  .LBB9_10: # %select.false.sink
+; X86-NEXT:  .LBB9_10: # %select.true.sink
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -427,7 +427,7 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    orq %rsi, %rax
 ; X64-NEXT:    je .LBB9_2
-; X64-NEXT:  # %bb.1: # %select.false.sink
+; X64-NEXT:  # %bb.1: # %select.true.sink
 ; X64-NEXT:    rep bsfq %rdi, %rcx
 ; X64-NEXT:    rep bsfq %rsi, %rax
 ; X64-NEXT:    addq $64, %rax
@@ -440,8 +440,8 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X64-NEXT:    movq %rcx, %rdx
 ; X64-NEXT:    retq
   %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true)
-  %2 = icmp eq i128 %x, 0
-  %3 = select i1 %2, i128 %y, i128 %1
+  %2 = icmp ne i128 %x, 0
+  %3 = select i1 %2, i128 %1, i128 %y
   ret i128 %3
 }
 

From 6e5a1423b752c66273bfcff35aaa8083075788a8 Mon Sep 17 00:00:00 2001
From: Ian Wood <ianwood2024@u.northwestern.edu>
Date: Thu, 12 Jun 2025 01:28:27 -0700
Subject: [PATCH 195/851] [mlir] Reapply "Loosen restrictions on folding
 dynamic reshapes" (#142827)

The original PR https://github.com/llvm/llvm-project/pull/137963 had a
nvidia bot failure. This appears to be a flaky test because rerunning
the build was successful.

This change needs commit 6f2ba47 to fix incorrect usage of
`getReassociationIndicesForCollapse`.

Reverts llvm/llvm-project#142639

Co-authored-by: Artem Gindinson <gindinson@roofline.ai>
---
 mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp    | 372 +++++++++++++++---
 .../Dialect/Linalg/simplify-pack-unpack.mlir  |   4 +-
 mlir/test/Dialect/Tensor/canonicalize.mlir    |  39 +-
 mlir/unittests/Dialect/Utils/CMakeLists.txt   |   1 +
 .../Dialect/Utils/ReshapeOpsUtilsTest.cpp     | 203 ++++++++++
 5 files changed, 560 insertions(+), 59 deletions(-)
 create mode 100644 mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp

diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
index 1a04d702e0559..3b1fdb69e8ef1 100644
--- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
@@ -10,6 +10,10 @@
 
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/LogicalResult.h"
 
 #include <numeric>
 #include <optional>
@@ -28,67 +32,329 @@ mlir::getReassociationIndicesForReshape(ShapedType sourceType,
   return std::nullopt;
 }
 
-std::optional<SmallVector<ReassociationIndices>>
-mlir::getReassociationIndicesForCollapse(ArrayRef<int64_t> sourceShape,
-                                         ArrayRef<int64_t> targetShape) {
-  if (sourceShape.size() <= targetShape.size())
-    return std::nullopt;
-  unsigned sourceDim = 0;
-  SmallVector<ReassociationIndices> reassociationMap;
-  reassociationMap.reserve(targetShape.size());
+namespace {
+/// A simple struct to represent ReassociationIndices as an inclusive interval.
+/// It's designed to be feasibly minimal, so the call sites should manage the
+/// validity of the range manually.
+struct ReassociationIndexRange {
+  /// FIXME: Signed type is used for consistency with ReassociationIndices.
+  /// We should consider refactoring all reassociation utilities to use unsigned
+  /// types.
+  int64_t leftIdx = 0, rightIdx = 0;
+
+  /// Util for manual checks of the range's validity
+  LogicalResult verify() const {
+    return leftIdx >= 0 && (leftIdx <= rightIdx) ? success() : failure();
+  }
+
+  /// Checks range's containment within another range. Treats the edges
+  /// non-exclusively.
+  bool isInRange(const ReassociationIndexRange &outerRange) const {
+    return leftIdx >= outerRange.leftIdx && rightIdx <= outerRange.rightIdx;
+  }
+
+  unsigned size() const {
+    assert(succeeded(verify()));
+    return rightIdx - leftIdx + 1;
+  }
+  bool containsSingleIndex() const { return size() == 1; }
+
+  /// Collects indices that do not overlap between this and another range.
+  ReassociationIndices
+  getNonOverlappingIndicesWith(ReassociationIndexRange &rhs) const {
+    if (rightIdx < rhs.leftIdx) {
+      // The intervals do not overlap - concatenate the indices from both.
+      auto jointFullIndices = getFullIndices();
+      jointFullIndices.append(rhs.getFullIndices());
+      return jointFullIndices;
+    }
+    ReassociationIndices result;
+    // Handle the chunk left of the overlapping range.
+    int64_t leftStart = std::min(leftIdx, rhs.leftIdx);
+    int64_t leftEnd = std::max(leftIdx, rhs.leftIdx);
+    llvm::append_range(result, llvm::seq(leftStart, leftEnd));
+    // Handle the chunk right of the overlapping range. Symmetrically, we should
+    // skip the edge of the overlap AND include the rightmost index.
+    int64_t rightStart = std::min(rightIdx, rhs.rightIdx) + 1;
+    int64_t rightEnd = std::max(rightIdx, rhs.rightIdx);
+    if (rightStart < rightEnd)
+      llvm::append_range(result, llvm::seq_inclusive(rightStart, rightEnd));
+    return result;
+  }
+
+  /// Converts the range into ReassociationIndices.
+  ReassociationIndices getFullIndices() const {
+    ReassociationIndices result;
+    for (int64_t idx = leftIdx; idx <= rightIdx; ++idx) {
+      result.push_back(idx);
+    }
+    return result;
+  }
+};
+} // namespace
+
+/// Starting from `sourceStartIdx`, searches `sourceShape` for the first
+/// sequence that can be collapsed into a dynamic dimension (at least one must
+/// be present in the source).
+/// By default, lazily returns once the first dynamic dimension has been found.
+/// Setting `matchGreedily` as `true` will also mark all subsequent
+/// source dimensions for collapsing into the target.
+static FailureOr<ReassociationIndexRange>
+findReassociationRangeForDynamicDim(ArrayRef<int64_t> sourceShape,
+                                    int64_t sourceStartIdx,
+                                    bool matchGreedily = false) {
+  const unsigned numSourceDims = sourceShape.size();
+  ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1};
+  std::optional<ReassociationIndexRange> resultRange = std::nullopt;
+
+  ReassociationIndexRange iterationRange{sourceStartIdx, sourceStartIdx};
+  for (; iterationRange.isInRange(sourceShapeAsRange);
+       iterationRange.rightIdx++) {
+    int64_t sourceSize = sourceShape[iterationRange.rightIdx];
+    if (sourceSize == ShapedType::kDynamic) {
+      resultRange = iterationRange;
+      break;
+    }
+  }
+  if (!resultRange)
+    return failure();
+  if (matchGreedily)
+    resultRange->rightIdx = sourceShapeAsRange.rightIdx;
+  return *resultRange;
+}
 
-  ReassociationIndices currIndices;
+/// Starting from `sourceStartIdx`, searches `sourceShape` for the first
+/// sequence of static dimensions such that their product matches `targetSize`.
+/// By default, lazily returns once the product matches the target size. Setting
+/// `matchGreedily` as `true` will append all neighboring unit dimensions
+/// (dimensions of 1) to the match.
+static FailureOr<ReassociationIndexRange>
+findReassociationRangeForSize(ArrayRef<int64_t> sourceShape,
+                              int64_t sourceStartIdx, int64_t targetSize,
+                              bool matchGreedily = false) {
+  const unsigned numSourceDims = sourceShape.size();
+  ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1};
+  std::optional<ReassociationIndexRange> resultRange = std::nullopt;
+
+  ReassociationIndexRange iterationRange{sourceStartIdx, sourceStartIdx};
   int64_t prodOfCollapsedDims = 1;
-  while (sourceDim < sourceShape.size()) {
-    unsigned targetDim = reassociationMap.size();
-    // If we have mapped all the target dimensions stop and handle the remaining
-    // tail of size-1 dimensions explicitly.
-    if (targetDim == targetShape.size())
+  while (iterationRange.isInRange(sourceShapeAsRange)) {
+    int64_t sourceSize = sourceShape[iterationRange.rightIdx];
+    if (sourceSize == ShapedType::kDynamic) {
+      // Reassociation for a static dim cannot include a dynamic dim. Reset
+      // induction variables to essentially restart the loop from the next
+      // source dimension.
+      prodOfCollapsedDims = 1;
+      iterationRange = {iterationRange.rightIdx + 1,
+                        iterationRange.rightIdx + 1};
+      continue;
+    }
+    prodOfCollapsedDims *= sourceSize;
+    // If the target size has been exceeded without matching, we need to shift
+    // the range start right. From the start of the range, roll back the
+    // multiplication until the target size exceeds the product again.
+    while (prodOfCollapsedDims > targetSize &&
+           !iterationRange.containsSingleIndex()) {
+      int64_t frontSourceSize = sourceShape[iterationRange.leftIdx];
+      prodOfCollapsedDims /= frontSourceSize;
+      // Shrink the range rightwards
+      iterationRange.leftIdx++;
+    }
+    // We could've reached the target size with the current dimension,
+    // also as a result of the above shift to right.
+    if (prodOfCollapsedDims == targetSize) {
+      resultRange = iterationRange;
       break;
+    }
+    // Increment the iteration range
+    iterationRange.rightIdx++;
+  }
+  if (!resultRange)
+    return failure();
+  if (matchGreedily) {
+    // We now want to collect all unit dimensions directly after the target
+    // product match. Advance the iterator to avoid OOB when the product match
+    // happens at the last element.
+    iterationRange.rightIdx++;
+    while (iterationRange.isInRange(sourceShapeAsRange) &&
+           sourceShape[iterationRange.rightIdx] == 1) {
+      resultRange = iterationRange;
+      iterationRange.rightIdx++;
+    }
+  }
+  return *resultRange;
+}
 
-    int64_t currTargetShape = targetShape[targetDim];
-    while (sourceDim < (sourceShape.size() - 1) &&
-           sourceShape[sourceDim] != ShapedType::kDynamic &&
-           prodOfCollapsedDims * sourceShape[sourceDim] < currTargetShape) {
-      prodOfCollapsedDims *= sourceShape[sourceDim];
-      currIndices.push_back(sourceDim++);
+/// Attempts to find a valid collapsing reassociation of `sourceShape` into
+/// `targetShape` through a simple traversal. If successful, an array of source
+/// index ranges is returned, correspondingly to each dimension in the target
+/// shape. The resulting indices shall fully cover the `sourceShape` without
+/// overlaps.
+///
+/// The algorithm is essentially a lazy one, searching for non-greedy matches -
+/// it will only yield a greedy match for the last target dimension.
+/// FIXME: The algorithm can only backtrack when it needs to append an offset
+/// for a static target dimension to the preceding dynamic one (this retains the
+/// linear complexity). As feasible, consider adding further backtracking
+/// routines to enable more reassociations, e.g.:
+/// - ?x2x?x2 into ?x2
+static FailureOr<SmallVector<ReassociationIndexRange>>
+findReassociationRangesForCollapse(ArrayRef<int64_t> sourceShape,
+                                   ArrayRef<int64_t> targetShape) {
+  unsigned numSourceDims = sourceShape.size(),
+           numTargetDims = targetShape.size();
+  assert(numSourceDims > numTargetDims);
+  ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1};
+
+  SmallVector<ReassociationIndexRange> reassocRanges;
+  reassocRanges.reserve(numTargetDims);
+  // We'll iterate in strides of 2 to enable pseudo-backtracking for simple
+  // cases, e.g.:
+  // - ?x2x3x5 into ?x15
+  std::optional<int64_t> prevTargetSize = std::nullopt;
+  for (unsigned targetDimIdx = 0, sourceDimIdx = 0;
+       targetDimIdx < numTargetDims; ++targetDimIdx) {
+    int64_t targetSize = targetShape[targetDimIdx];
+    // Simply check if there are any subsequent target dimensions left - if not,
+    // the match must be made greedily.
+    bool shouldMatchGreedily = targetDimIdx == numTargetDims - 1;
+    FailureOr<ReassociationIndexRange> sourceRange;
+    if (targetSize == ShapedType::kDynamic) {
+      sourceRange = findReassociationRangeForDynamicDim(
+          sourceShape, sourceDimIdx, shouldMatchGreedily);
+    } else {
+      sourceRange = findReassociationRangeForSize(
+          sourceShape, sourceDimIdx, targetSize, shouldMatchGreedily);
     }
 
-    // If the current expanded dimension is dynamic, then the collapsed
-    // dimensions should also be dynamic and product of all previous unprocessed
-    // dimensions of the expanded shape should be 1.
-    if (sourceShape[sourceDim] == ShapedType::kDynamic &&
-        (currTargetShape != ShapedType::kDynamic || prodOfCollapsedDims != 1))
-      return std::nullopt;
-
-    // If the collapsed dim is dynamic, the current expanded dim should also
-    // be dynamic.
-    if (currTargetShape == ShapedType::kDynamic &&
-        sourceShape[sourceDim] != ShapedType::kDynamic)
-      return std::nullopt;
-
-    // For static shapes, if the product of dimensions of the expanded shape
-    // should match the collapsed dimension shape.
-    if (prodOfCollapsedDims * sourceShape[sourceDim] != currTargetShape)
-      return std::nullopt;
-
-    currIndices.push_back(sourceDim++);
-    reassociationMap.emplace_back(ReassociationIndices{});
-    std::swap(reassociationMap.back(), currIndices);
-    prodOfCollapsedDims = 1;
+    // Run sanity checks on the returned index range.
+    if (failed(sourceRange) || failed(sourceRange->verify()) ||
+        !sourceRange->isInRange(sourceShapeAsRange))
+      return failure();
+    if (sourceRange->leftIdx > sourceDimIdx) {
+      // If some source dimensions had to be skipped in order to find a match,
+      // they must be collapsed into the directly preceding dynamic dimension.
+      if (!prevTargetSize || prevTargetSize != ShapedType::kDynamic)
+        return failure();
+      reassocRanges.back().rightIdx = sourceRange->leftIdx - 1;
+    }
+
+    // Store the gathered information as required for the next iteration.
+    prevTargetSize = targetSize;
+    sourceDimIdx = sourceRange->rightIdx + 1;
+    reassocRanges.push_back(*sourceRange);
   }
-  // All the dimensions in the target must have been processed.
-  if (reassociationMap.size() != targetShape.size())
+  // Fail if the source shape wasn't a full match for the target shape. We only
+  // need to check the last recorded index - any other gaps should have been
+  // mended by the main loop.
+  if (reassocRanges.back().rightIdx < sourceShapeAsRange.rightIdx)
+    return failure();
+  return reassocRanges;
+}
+
+/// A variant of `findReassociationRangesForCollapse(...)` that can also scan
+/// the shapes right-to-left.
+static FailureOr<SmallVector<ReassociationIndexRange>>
+findReassociationRangesForCollapse(ArrayRef<int64_t> sourceShape,
+                                   ArrayRef<int64_t> targetShape,
+                                   bool iterateRightToLeft) {
+  if (!iterateRightToLeft)
+    return findReassociationRangesForCollapse(sourceShape, targetShape);
+  // NB: To iterate right-to-left, we currently reverse the shapes and then
+  // reverse the result back. The reversed shapes must not be temporary, as
+  // we're passing through an ArrayRef.
+  // FIXME: It would be preferable to avoid the expensive copies. At the moment,
+  // this approach is chosen for readability of the main implementation.
+  std::vector<int64_t> sourceToReverse = sourceShape.vec(),
+                       targetToReverse = targetShape.vec();
+  std::reverse(sourceToReverse.begin(), sourceToReverse.end());
+  std::reverse(targetToReverse.begin(), targetToReverse.end());
+  auto invertedRanges =
+      findReassociationRangesForCollapse(sourceToReverse, targetToReverse);
+  if (failed(invertedRanges))
+    return failure();
+  SmallVector<ReassociationIndexRange> &rangesToInvert = *invertedRanges;
+  unsigned numSourceDims = sourceShape.size();
+  // We have received the ranges for inverted shapes. Now we have to invert
+  // the ranges back to correspond with the original source shape.
+  for (auto &range : rangesToInvert) {
+    int64_t invLeftIdx = range.leftIdx, invRightIdx = range.rightIdx;
+    range.leftIdx = numSourceDims - 1 - invRightIdx;
+    range.rightIdx = numSourceDims - 1 - invLeftIdx;
+  }
+  // Also invert the ordering of the ranges to correspond with the original
+  // target shape.
+  std::reverse(rangesToInvert.begin(), rangesToInvert.end());
+  return rangesToInvert;
+}
+
+std::optional<SmallVector<ReassociationIndices>>
+mlir::getReassociationIndicesForCollapse(ArrayRef<int64_t> sourceShape,
+                                         ArrayRef<int64_t> targetShape) {
+  unsigned numSourceDims = sourceShape.size(),
+           numTargetDims = targetShape.size();
+  // We're supposed to search for a collapsing reassociation. If the sizes
+  // match, there's no actual collapsing taking place - it's either a no-op or a
+  // `tensor.reshape`-style reassociation (that would be beyond the scope of
+  // this utility).
+  if (numSourceDims <= numTargetDims)
+    return std::nullopt;
+  // Early handling for scalar target types.
+  if (numTargetDims == 0) {
+    ReassociationIndices allSourceIndices;
+    allSourceIndices.reserve(numSourceDims);
+    for (unsigned sourceDimIdx = 0; sourceDimIdx < numSourceDims;
+         ++sourceDimIdx) {
+      int64_t sourceSize = sourceShape[sourceDimIdx];
+      // All source dimensions must be unit or dynamic.
+      if (sourceSize != 1 && sourceSize != ShapedType::kDynamic)
+        return std::nullopt;
+      allSourceIndices.push_back(sourceDimIdx);
+    }
+    return SmallVector<ReassociationIndices>{allSourceIndices};
+  }
+
+  // Collect source ranges by iterating over the target shape left-to-right.
+  FailureOr<SmallVector<ReassociationIndexRange>> maybeForwardRanges =
+      findReassociationRangesForCollapse(sourceShape, targetShape);
+  if (failed(maybeForwardRanges))
+    return std::nullopt;
+  auto &ranges = *maybeForwardRanges;
+  // Now do the same in reverse. We need to get another valid reassociation
+  // through some other strategy, and then compare the results in order to
+  // disambiguate mixed subshapes, such as:
+  // ?x?x? into ?x?, ?x2x? into ?x?, ?x2x3x6x? into ?x6x?
+  // This leads us to lose some of the reassociation opportunities that can only
+  // be found by iterating in a certain direction, e.g. 2x2x? into 2x? - without
+  // backtracking, the algorithm will fail right-to-left. However, this is the
+  // best way to preserve correctness.
+  FailureOr<SmallVector<ReassociationIndexRange>> maybeReverseRanges =
+      findReassociationRangesForCollapse(sourceShape, targetShape,
+                                         /*iterateRightToLeft=*/true);
+  if (failed(maybeReverseRanges))
+    return std::nullopt;
+  auto &reverseRanges = *maybeReverseRanges;
+
+  if (ranges.size() != numTargetDims || reverseRanges.size() != numTargetDims)
     return std::nullopt;
-  // Process any remaining entries in the source shape. They all need to be
-  // 1 or dynamic.
-  for (; sourceDim < sourceShape.size(); sourceDim++) {
-    if (sourceShape[sourceDim] != ShapedType::kDynamic &&
-        sourceShape[sourceDim] != 1)
-      return std::nullopt;
-    // The map is empty when the target type is a scalar.
-    if (!reassociationMap.empty())
-      reassociationMap.back().push_back(sourceDim);
+  // Now we can check for ambiguity of each target dimension's reassociation. If
+  // successful, we put the full indices into our result map for the target
+  // shape.
+  SmallVector<ReassociationIndices> reassociationMap(numTargetDims);
+  for (unsigned targetDimIdx = 0; targetDimIdx < numTargetDims;
+       ++targetDimIdx) {
+    ReassociationIndexRange &range = ranges[targetDimIdx];
+    ReassociationIndexRange &reverseRange = reverseRanges[targetDimIdx];
+    // Get non-overlapping indices between the ranges
+    ReassociationIndices nonMatchingIndices =
+        range.getNonOverlappingIndicesWith(reverseRange);
+    // Unit dimensions can be collapsed wherever - this is the only ambiguity
+    // that we allow.
+    for (int64_t sourceDimIdx : nonMatchingIndices) {
+      if (sourceShape[sourceDimIdx] != 1)
+        return std::nullopt;
+    }
+    reassociationMap[targetDimIdx] = range.getFullIndices();
   }
   return reassociationMap;
 }
diff --git a/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir
index 51350e5bc8498..6979770154bab 100644
--- a/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir
+++ b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir
@@ -158,8 +158,8 @@ func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> {
 // -----
 
 // CHECK-LABEL: func.func @unpack_dynamic
-// CHECK-NOT:     tensor.collapse
-// CHECK:         linalg.unpack
+// CHECK:     tensor.collapse
+// CHECK-NOT:         linalg.unpack
 func.func @unpack_dynamic(%arg0: tensor<?x32xf32>) -> tensor<?xf32> {
   %c32 = arith.constant 32 : index
   %c0 = arith.constant 0 : index
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 67b03b0a3485b..3251c5a4a2bfd 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1101,7 +1101,7 @@ func.func @fold_expand_of_collapse(%arg0 : tensor<3x4x4xf32>) -> tensor<3x4x4xf3
 
 // -----
 
-func.func @fold_expand_of_collapse_dynamic(%arg0 : tensor<?x4x?xf32>, %arg1: index, %arg2: index)
+func.func @fold_expand_of_collapse_mixed_subshape(%arg0 : tensor<?x4x?xf32>, %arg1: index, %arg2: index)
     -> tensor<?x4x?xf32> {
   %0 = tensor.collapse_shape %arg0 [[0, 1], [2]]
       : tensor<?x4x?xf32> into tensor<?x?xf32>
@@ -1109,12 +1109,28 @@ func.func @fold_expand_of_collapse_dynamic(%arg0 : tensor<?x4x?xf32>, %arg1: ind
       : tensor<?x?xf32> into tensor<?x4x?xf32>
   return %1 : tensor<?x4x?xf32>
 }
-// CHECK-LABEL: @fold_expand_of_collapse_dynamic
+// CHECK-LABEL: @fold_expand_of_collapse_mixed_subshape
 //   CHECK-NOT:   tensor.{{.*}}_shape
 
 // -----
 
-func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1: index, %arg2: index, %arg3: index)
+func.func @fold_expand_of_collapse_mixed_target_subshape(%arg0 : tensor<?x4x?x2xf32>, %arg1: index, %arg2: index)
+    -> tensor<?x4x?xf32> {
+  %0 = tensor.collapse_shape %arg0 [[0, 1], [2, 3]]
+      : tensor<?x4x?x2xf32> into tensor<?x?xf32>
+  %1 = tensor.expand_shape %0 [[0, 1], [2]] output_shape [%arg1, 4, %arg2]
+      : tensor<?x?xf32> into tensor<?x4x?xf32>
+  return %1 : tensor<?x4x?xf32>
+}
+// CHECK-LABEL: @fold_expand_of_collapse_mixed_target_subshape
+//   CHECK-NOT:   tensor.expand_shape
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape %arg0 {{\[}}[0], [1], [2, 3]]
+//  CHECK-SAME:     : tensor<?x4x?x2xf32> into tensor<?x4x?xf32>
+//  CHECK-NEXT:   return %[[COLLAPSE]]
+
+// -----
+
+func.func @no_fold_expand_of_collapse_fully_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1: index, %arg2: index, %arg3: index)
     -> tensor<?x?x?xf32> {
   %0 = tensor.collapse_shape %arg0 [[0, 1], [2]]
       : tensor<?x?x?xf32> into tensor<?x?xf32>
@@ -1122,7 +1138,22 @@ func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1:
       : tensor<?x?xf32> into tensor<?x?x?xf32>
   return %1 : tensor<?x?x?xf32>
 }
-// CHECK-LABEL: @no_fold_expand_of_collapse_dynamic
+// CHECK-LABEL: @no_fold_expand_of_collapse_fully_dynamic
+//       CHECK:   tensor.collapse_shape
+//       CHECK:   %[[EXPAND:.+]] = tensor.expand_shape
+//       CHECK:   return %[[EXPAND]]
+
+// -----
+
+func.func @no_fold_expand_of_collapse_adjacent_dynamic(%arg0 : tensor<?x?x?xf32>, %arg1: index, %arg2: index)
+    -> tensor<?x?xf32> {
+  %0 = tensor.collapse_shape %arg0 [[0, 1, 2]]
+      : tensor<?x?x?xf32> into tensor<?xf32>
+  %1 = tensor.expand_shape %0 [[0, 1]] output_shape [%arg1, %arg2]
+      : tensor<?xf32> into tensor<?x?xf32>
+  return %1 : tensor<?x?xf32>
+}
+// CHECK-LABEL: @no_fold_expand_of_collapse_adjacent_dynamic
 //       CHECK:   tensor.collapse_shape
 //       CHECK:   %[[EXPAND:.+]] = tensor.expand_shape
 //       CHECK:   return %[[EXPAND]]
diff --git a/mlir/unittests/Dialect/Utils/CMakeLists.txt b/mlir/unittests/Dialect/Utils/CMakeLists.txt
index 61b9cdcb3b8f3..e921c8bcfb4e5 100644
--- a/mlir/unittests/Dialect/Utils/CMakeLists.txt
+++ b/mlir/unittests/Dialect/Utils/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_unittest(MLIRDialectUtilsTests
   StructuredOpsUtilsTest.cpp
+  ReshapeOpsUtilsTest.cpp
   IndexingUtilsTest.cpp
 )
 mlir_target_link_libraries(MLIRDialectUtilsTests
diff --git a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
new file mode 100644
index 0000000000000..db1a87a4de2d5
--- /dev/null
+++ b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
@@ -0,0 +1,203 @@
+//===- ReshapeOpsUtilsTest.cpp - ReshapeOpsUtils unit tests ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Utils/ReshapeOpsUtils.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "llvm/ADT/STLExtras.h"
+#include "gtest/gtest.h"
+#include <optional>
+
+using namespace mlir;
+
+/// Helper to make constructing
+/// `std::optional<SmallVector<ReassociationIndices>>` more readable.
+static std::optional<SmallVector<ReassociationIndices>>
+makeOptionalIndices(std::initializer_list<ReassociationIndices> list) {
+  return std::optional<SmallVector<ReassociationIndices>>(list);
+}
+
+TEST(ReassociationIndicesForCollapse, ScalarTest) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({1}, {}),
+            makeOptionalIndices({{0}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, 1}, {}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic}, {}),
+            makeOptionalIndices({{0}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic,
+                                                ShapedType::kDynamic, 1,
+                                                ShapedType::kDynamic},
+                                               {}),
+            makeOptionalIndices({{0, 1, 2, 3, 4}}));
+}
+
+TEST(ReassociationIndicesForCollapse, ScalarTestFailure) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({}, {}), std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({}, {1}), std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({2}, {}), std::nullopt);
+  EXPECT_EQ(
+      getReassociationIndicesForCollapse({1, 2, ShapedType::kDynamic, 1}, {}),
+      std::nullopt);
+}
+
+TEST(ReassociationIndicesForCollapse, StaticTest) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {200}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {10, 600}),
+            makeOptionalIndices({{0}, {1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {200, 30}),
+            makeOptionalIndices({{0, 1}, {2}}));
+}
+
+TEST(ReassociationIndicesForCollapse, StaticTestFailure) {
+  // No-op reassociation
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {10, 20}),
+            std::nullopt);
+  // Invalid static reassociations
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {10}), std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {200, 300}),
+            std::nullopt);
+  // Non-collapsing (expanding) reassociation
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {1, 10, 20, 30}),
+            std::nullopt);
+}
+
+TEST(ReassociationIndicesForCollapse, StaticTestUnitDims) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, 1}, {10}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, 20, 30}, {600}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, 1, 1}, {1}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, 1, 1, 1}, {1, 1, 1}),
+            makeOptionalIndices({{0}, {1}, {2, 3}}));
+}
+
+TEST(ReassociationIndicesForCollapse, DynamicTest) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 1},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 1, 1},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {1, ShapedType::kDynamic, 1, ShapedType::kDynamic, 1},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}, {2, 3, 4}}));
+  EXPECT_EQ(
+      getReassociationIndicesForCollapse(
+          {ShapedType::kDynamic, ShapedType::kDynamic}, {ShapedType::kDynamic}),
+      makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {1, ShapedType::kDynamic, ShapedType::kDynamic},
+                {1, ShapedType::kDynamic}),
+            makeOptionalIndices({{0}, {1, 2}}));
+
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {1, ShapedType::kDynamic, ShapedType::kDynamic},
+                {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, ShapedType::kDynamic},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 1, 2, ShapedType::kDynamic, 10},
+                {ShapedType::kDynamic, 10}),
+            makeOptionalIndices({{0, 1, 2, 3}, {4}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10, 20},
+                                               {ShapedType::kDynamic, 20}),
+            makeOptionalIndices({{0, 1}, {2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({10, ShapedType::kDynamic, 20},
+                                               {ShapedType::kDynamic, 20}),
+            makeOptionalIndices({{0, 1}, {2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 3, 2, 5, 2}, {ShapedType::kDynamic, 20}),
+            makeOptionalIndices({{0, 1}, {2, 3, 4}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {10, ShapedType::kDynamic, 20, ShapedType::kDynamic, 1},
+                {ShapedType::kDynamic, 20, ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}, {2}, {3, 4}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, 1},
+                                               {ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, ShapedType::kDynamic, 1},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            makeOptionalIndices({{0}, {1, 2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {1, ShapedType::kDynamic, ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            makeOptionalIndices({{0, 1}, {2}}));
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 1, ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            makeOptionalIndices({{0}, {1, 2}}));
+}
+
+TEST(ReassociationIndicesForCollapse, DynamicTestFailure) {
+  EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10, 20},
+                                               {ShapedType::kDynamic, 10}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 10, ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {20, ShapedType::kDynamic, 10, ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 5, 3, 2, 2}, {ShapedType::kDynamic, 20}),
+            std::nullopt);
+  EXPECT_EQ(
+      getReassociationIndicesForCollapse(
+          {ShapedType::kDynamic, ShapedType::kDynamic, ShapedType::kDynamic},
+          {ShapedType::kDynamic, ShapedType::kDynamic}),
+      std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, ShapedType::kDynamic, 10, 1,
+                 ShapedType::kDynamic},
+                {ShapedType::kDynamic, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 10, 10, 10, ShapedType::kDynamic},
+                {ShapedType::kDynamic, 10, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 10, 10, 10, ShapedType::kDynamic},
+                {ShapedType::kDynamic, 2, 2, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 3, 4, 3, ShapedType::kDynamic},
+                {ShapedType::kDynamic, 12, ShapedType::kDynamic}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 8, 4, 2, 16, ShapedType::kDynamic},
+                {ShapedType::kDynamic, 32, ShapedType::kDynamic}),
+            std::nullopt);
+
+  //===----------------------------------------------------------------------===//
+  // TODO: Reassociation for the following examples can be computed, but isn't
+  // supported by `getReassociationIndicesForCollapse`.
+  //===----------------------------------------------------------------------===//
+
+  // TODO: Fails because there's no backtracking when some source dimensions
+  // remain unmatched at either edge.
+  EXPECT_EQ(getReassociationIndicesForCollapse(
+                {ShapedType::kDynamic, 10, ShapedType::kDynamic, 10},
+                {ShapedType::kDynamic, 10}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, 2, 2},
+                                               {1, ShapedType::kDynamic, 2}),
+            std::nullopt);
+  EXPECT_EQ(getReassociationIndicesForCollapse({2, 2, ShapedType::kDynamic, 1},
+                                               {2, ShapedType::kDynamic}),
+            std::nullopt);
+}

From edaac11df3f82268e8ca34bf34b3e9d115b7d475 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 09:29:41 +0100
Subject: [PATCH 196/851] [X86] combineSelect - attempt to combine with
 shuffles (#143753)

Before legalization we will convert to a vector_shuffle node - but afterward we can try to combine the select into an existing target shuffle chain
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   16 +-
 .../CodeGen/X86/combine-mask-with-shuffle.ll  |   32 +-
 llvm/test/CodeGen/X86/pr132844.ll             |   11 +-
 .../vector-interleaved-load-i8-stride-7.ll    | 1166 ++++---
 .../vector-interleaved-store-i16-stride-8.ll  | 2864 ++++++++---------
 .../vector-interleaved-store-i8-stride-5.ll   |   30 +-
 .../vector-interleaved-store-i8-stride-6.ll   | 2026 ++++++------
 .../vector-interleaved-store-i8-stride-7.ll   |  231 +-
 .../vector-interleaved-store-i8-stride-8.ll   | 1096 +++----
 .../X86/vector-shuffle-combining-avx512f.ll   |   40 +-
 10 files changed, 3610 insertions(+), 3902 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 96714adf78e43..b0553aa4b8197 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47785,13 +47785,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                                                            DL, DAG, Subtarget))
       return V;
 
-  // Convert vselects with constant condition into shuffles.
-  if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
-      (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
+  if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
     SmallVector<int, 64> Mask;
     if (createShuffleMaskFromVSELECT(Mask, Cond,
-                                     N->getOpcode() == X86ISD::BLENDV))
-      return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+                                     N->getOpcode() == X86ISD::BLENDV)) {
+      // Convert vselects with constant condition into shuffles.
+      if (DCI.isBeforeLegalizeOps())
+        return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+
+      // Attempt to combine as shuffle.
+      SDValue Op(N, 0);
+      if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+        return Res;
+    }
   }
 
   // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
diff --git a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll
index 268ac3dd31b85..7564e65a428b7 100644
--- a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll
@@ -67,11 +67,9 @@ define <16 x i32> @combine_mask_with_abs(<16 x i32> %v0) {
 define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) {
 ; CHECK-LABEL: combine_mask_with_umin:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpminud %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpminud %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    movw $-3856, %ax # imm = 0xF0F0
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpopcntd %zmm0, %zmm1 {%k1}
@@ -88,11 +86,9 @@ define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) {
 define <16 x i32> @combine_mask_with_umax(<16 x i32> %v0) {
 ; CHECK-LABEL: combine_mask_with_umax:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpmaxud %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpmaxud %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    movw $-3856, %ax # imm = 0xF0F0
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpopcntd %zmm0, %zmm1 {%k1}
@@ -109,11 +105,9 @@ define <16 x i32> @combine_mask_with_umax(<16 x i32> %v0) {
 define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) {
 ; CHECK-LABEL: combine_mask_with_smin:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpminsd %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    movw $-3856, %ax # imm = 0xF0F0
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpopcntd %zmm0, %zmm1 {%k1}
@@ -130,11 +124,9 @@ define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) {
 define <16 x i32> @combine_mask_with_smax(<16 x i32> %v0) {
 ; CHECK-LABEL: combine_mask_with_smax:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpblendmd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT:    vpmaxsd %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    movw $-3856, %ax # imm = 0xF0F0
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpopcntd %zmm0, %zmm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/pr132844.ll b/llvm/test/CodeGen/X86/pr132844.ll
index ded100b2accce..dc9f006d93d12 100644
--- a/llvm/test/CodeGen/X86/pr132844.ll
+++ b/llvm/test/CodeGen/X86/pr132844.ll
@@ -4,12 +4,11 @@
 define  { ptr, i8 } @PR132844(<4 x ptr> %0, <4 x ptr> %1) {
 ; CHECK-LABEL: PR132844:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vinserti64x2 $1, 16, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    vmovdqu %ymm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT:    vinsertf128 $1, 16, %ymm2, %ymm2
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; CHECK-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index c132c5ea2ef49..82481269022b0 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -13723,364 +13723,361 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm3
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm16
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm13
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm25
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm17
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm12
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm12
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm16
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm18
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm8
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm6
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm7
 ; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512BW-FCP-NEXT:    movw $-28382, %ax # imm = 0x9122
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm1 {%k1}
 ; AVX512BW-FCP-NEXT:    kmovq %k1, %k2
 ; AVX512BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX512BW-FCP-NEXT:    movw $8772, %ax # imm = 0x2244
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
-; AVX512BW-FCP-NEXT:    kmovq %k1, %k3
-; AVX512BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm5, %ymm5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm9 {%k6}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,2,4,6]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm14
+; AVX512BW-FCP-NEXT:    vpermd %ymm14, %ymm10, %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 240(%rdi), %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm20
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm9, %zmm9
 ; AVX512BW-FCP-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
 ; AVX512BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm1 {%k5}
-; AVX512BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm1 {%k5}
+; AVX512BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm10
+; AVX512BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm9
 ; AVX512BW-FCP-NEXT:    movw $9288, %ax # imm = 0x2448
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm8 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm8, %xmm21
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm11 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm11, %xmm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movw $3968, %ax # imm = 0xF80
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k7
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm21 {%k7}
-; AVX512BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
-; AVX512BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm21 {%k7}
+; AVX512BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm11
+; AVX512BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm7
 ; AVX512BW-FCP-NEXT:    movw $4644, %ax # imm = 0x1224
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k4
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm18 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm22
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm22
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm22
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm0, %ymm22
 ; AVX512BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm23
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm15 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm23
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movl $511, %r10d # imm = 0x1FF
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm7 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm18, %ymm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm15 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm23, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm23 = [0,0,0,0,1,3,4,6]
+; AVX512BW-FCP-NEXT:    vpermd %ymm14, %ymm23, %ymm23
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm18, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm14, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm7 {%k5}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm13, %zmm8 {%k5}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm13 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
 ; AVX512BW-FCP-NEXT:    movl $261632, %r10d # imm = 0x3FE00
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k5
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm12 {%k2}
 ; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
 ; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
-; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm15, %ymm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpermd %ymm14, %ymm15, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm12, %zmm12
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm13 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm14, %xmm13, %xmm23
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm23 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm13 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT:    vmovdqa64 208(%rdi), %xmm17
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
-; AVX512BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm24 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
+; AVX512BW-FCP-NEXT:    vmovdqa 192(%rdi), %xmm15
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm25 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm24, %xmm25, %xmm24
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm0, %ymm24
 ; AVX512BW-FCP-NEXT:    movl $-134217728, %r10d # imm = 0xF8000000
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
 ; AVX512BW-FCP-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k2}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm24, %ymm13 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm13 {%k1}
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm19, %xmm19
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm19, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm23, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm19
 ; AVX512BW-FCP-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm19 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm20
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm19, %xmm19
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm18, %ymm19 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10]
+; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm19 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm18
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm18, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm19
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm18 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm17 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm18 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm17
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm17, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm17 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,6,13],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[u,u]
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[4,11],zero,zero,xmm17[0,7,14,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm17 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm16 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm18, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm17 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm0, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm16, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm16, %zmm16
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm17, %zmm17
 ; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm7 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm15 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm15 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k2}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm15
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm16
-; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm18 {%k1}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm19
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm16 {%k4}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,0,7,14],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm18 {%k7}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm17 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm19
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm19 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    movl $8176, %eax # imm = 0x1FF0
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm16 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm16 {%k6}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
-; AVX512BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k3}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k6}
-; AVX512BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm9 {%k4}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm21, %xmm11
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm10 {%k3}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm19 {%k1}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm16, %zmm20
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm16, %zmm17
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm2, %ymm16 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[4,11],zero,zero,xmm16[0,7,14],zero,zero,xmm16[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm17 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,2,9],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,xmm17[0,7,14],zero,zero,xmm17[3,10,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm17, %xmm22, %xmm17
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm21 {%k7}
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [1,2,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm17
+; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm22, %ymm22
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm22, %zmm21, %zmm21
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm16 {%k5}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm21 {%k6}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm21, %xmm22
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,xmm22[3,10],zero,zero,zero,xmm22[6,13]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u,5,12],zero,zero,xmm21[1,8,15],zero,zero
+; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm21, %xmm21
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm18 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm18, %zmm0, %zmm18
+; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm21 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,u,u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm21, %xmm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14]
+; AVX512BW-FCP-NEXT:    vporq %xmm22, %xmm21, %xmm21
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm19 {%k2}
+; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm16 {%k1}
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm4, %ymm2, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm20 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
 ; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm20
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm11, %xmm20, %xmm11
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
-; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm21, %xmm20
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm20, %xmm20
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm20, %ymm11 {%k3}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm9, %xmm20
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm20, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm9, %ymm0, %ymm20
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512BW-FCP-NEXT:    vporq %xmm9, %xmm17, %xmm9
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm20 {%k3}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm17
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm18
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm9
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vporq %xmm2, %xmm19, %xmm2
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,2,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm19
-; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm2 {%k5}
-; AVX512BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm9
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm10, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm11, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm9 {%k5}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm10, %xmm3
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,5,6,0,0,0,0]
-; AVX512BW-FCP-NEXT:    vpermd %ymm19, %ymm10, %ymm10
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm20, %zmm10
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm3 {%k5}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm8, %ymm6 {%k2}
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm8, %zmm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm8
-; AVX512BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
-; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
-; AVX512BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT:    movl $4186112, %edi # imm = 0x3FE000
-; AVX512BW-FCP-NEXT:    kmovd %edi, %k1
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
+; AVX512BW-FCP-NEXT:    vporq %xmm21, %xmm22, %xmm21
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm20 {%k7}
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [1,3,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm21, %ymm21
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm21, %zmm20, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm18 {%k5}
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm0, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm19, %zmm18 {%k1}
+; AVX512BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm7 {%k1}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
+; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm11, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm9 {%k6}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT:    movl $4186112, %eax # imm = 0x3FE000
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm9 {%k1}
-; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm0
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm4, %zmm5
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm0, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm2 {%k3}
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k4}
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm3
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
+; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k7}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
+; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,5,6,0,0,0,0]
+; AVX512BW-FCP-NEXT:    vpermd %ymm17, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k5}
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm0
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm3
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm3, %zmm4
+; AVX512BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rsi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rdi)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, (%r9)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rdi)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -14453,362 +14450,359 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ;
 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 320(%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm24
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
 ; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm25
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm1, %zmm7
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    movw $-28382, %ax # imm = 0x9122
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm1 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %k1, %k2
 ; AVX512DQ-BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm6, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $992, %ax # imm = 0x3E0
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 128(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 160(%rdi), %ymm5
 ; AVX512DQ-BW-FCP-NEXT:    movw $8772, %ax # imm = 0x2244
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    kmovq %k1, %k3
-; AVX512DQ-BW-FCP-NEXT:    kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm5, %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm9 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 192(%rdi), %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm10, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 240(%rdi), %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm5, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm11, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm9, %zmm9
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $137438429184, %rax # imm = 0x1FFFF80000
 ; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm4, %zmm1 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm9, %zmm1 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 288(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 256(%rdi), %ymm9
 ; AVX512DQ-BW-FCP-NEXT:    movw $9288, %ax # imm = 0x2448
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm8 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm8, %xmm21
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm11 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm11, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movw $3968, %ax # imm = 0xF80
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm21 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm7, %ymm21 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 416(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 384(%rdi), %ymm7
 ; AVX512DQ-BW-FCP-NEXT:    movw $4644, %ax # imm = 0x1224
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm18 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm22
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm16 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm22
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm22
 ; AVX512DQ-BW-FCP-NEXT:    movl $-8388608, %eax # imm = 0xFF800000
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm16 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm23
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movl $511, %r10d # imm = 0x1FF
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm7 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm18, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm18, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm14, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm7 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm5, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm16[u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm16, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm14, %xmm16, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm13, %zmm8 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm13 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    movl $261632, %r10d # imm = 0x3FE00
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k5
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm13 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm12, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm12, %xmm12
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm17, %ymm15, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm17, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm12, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm15, %ymm14, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-BW-FCP-NEXT:    vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm14, %zmm12, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm2, %ymm4, %ymm13 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm14, %xmm13, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm13 {%k4}
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm13, %xmm15
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm13, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 208(%rdi), %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm23, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 208(%rdi), %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,12]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 192(%rdi), %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm25 = xmm16[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm23, %xmm25, %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm0, %ymm23
 ; AVX512DQ-BW-FCP-NEXT:    movl $-134217728, %r10d # imm = 0xF8000000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm23, %ymm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm15, %xmm19, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm15, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm19, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm19, %zmm13, %zmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm13 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm22, %ymm21 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm0, %zmm14
 ; AVX512DQ-BW-FCP-NEXT:    movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm14 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm18, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm18 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm14 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm7 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm14 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm8, %ymm6, %ymm15 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm11, %ymm7, %ymm17 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm14 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm5, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm10, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm14, %xmm14
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm15 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm15, %ymm14 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm17, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm17, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm17 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,4,11],zero,zero,xmm17[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm17, %ymm14 {%k2}
 ; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm13 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm17 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm17, %zmm18
 ; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm4, %ymm16 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,0,7,14],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm18, %ymm17 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm9, %ymm18 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm19
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm19 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    movl $8176, %eax # imm = 0x1FF0
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm6, %ymm8, %ymm16 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm16, %ymm15 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm3 {%k6}
-; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm9 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm11, %xmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm21, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm11, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm19 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm14, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm14 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm14, %ymm17 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm2, %ymm14 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm14, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm14 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm18 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm18[u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm18 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm16[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm18, %xmm22, %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm18, %ymm21 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm22 = [1,2,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm18, %ymm22, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm22[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm22, %zmm21, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm21, %zmm14 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm7, %ymm11, %ymm21 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,u,u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm21, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm22, %xmm21, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm19 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm17, %zmm0, %zmm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm4, %ymm2, %ymm17 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm17, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm17 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpblendmw %ymm5, %ymm6, %ymm20 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm20, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm11, %xmm20, %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm21, %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm20, %xmm20
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm0, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm20, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm9, %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm20, %xmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm9, %ymm0, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm9, %xmm17, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm9, %ymm20 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm2, %zmm9, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm19, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vporq %xmm2, %xmm19, %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm9 = [1,2,4,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 224(%rdi), %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm9, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm9, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm2 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vextracti32x4 $1, %ymm16, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm9, %xmm10, %xmm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm10, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm11, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm9 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm10, %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm10 = [1,3,5,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm19, %ymm10, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm10, %zmm20, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm10, %zmm3 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm8, %ymm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm8, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    movw $-512, %ax # imm = 0xFE00
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm5, %ymm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT:    movl $4186112, %edi # imm = 0x3FE000
-; AVX512DQ-BW-FCP-NEXT:    kmovd %edi, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm16[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14]
+; AVX512DQ-BW-FCP-NEXT:    vporq %xmm21, %xmm22, %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm0, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm21, %ymm20 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm21 = [1,3,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm18, %ymm21, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm21 = xmm21[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm21, %zmm20, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm17 {%k5}
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm19, %zmm0, %zmm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm11, %ymm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm11, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm9 {%k6}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    movl $4186112, %eax # imm = 0x3FE000
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm15, %zmm0, %zmm9 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm6, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm5, %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm4, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x8 $1, %ymm5, %zmm0, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm9 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm4, %ymm2 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm5 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm5, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm16[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %ymm4, %ymm3 {%k7}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm4, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} ymm2 {%k5} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} ymm0 = [1,3,5,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermd %ymm18, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $2, %xmm0, %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k5}
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512DQ-BW-FCP-NEXT:    vextracti128 $1, %ymm7, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vpor %xmm0, %xmm3, %xmm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2d %zmm4, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rsi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rdx)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm8, (%rdx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, (%rcx)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, (%r8)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%r9)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm9, (%rdi)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%r9)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, (%rdi)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %wide.vec = load <448 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
index 9c9dca82f60ca..f626dfe5daf00 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll
@@ -4093,139 +4093,125 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-NEXT:    vpermd %zmm1, %zmm26, %zmm30
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-NEXT:    movw $-30584, %r11w # imm = 0x8888
+; AVX512-NEXT:    vmovdqa (%r10), %xmm0
+; AVX512-NEXT:    vmovdqa (%rax), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm20
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512-NEXT:    vmovdqa (%r9), %xmm0
+; AVX512-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm22
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm27
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512-NEXT:    vpermt2d %zmm10, %zmm17, %zmm9
+; AVX512-NEXT:    movb $-86, %r11b
 ; AVX512-NEXT:    kmovw %r11d, %k1
-; AVX512-NEXT:    vpermd %zmm0, %zmm27, %zmm30 {%k1}
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm10
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm11
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-NEXT:    vpermd %zmm1, %zmm28, %zmm3
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512-NEXT:    movw $8738, %r11w # imm = 0x2222
-; AVX512-NEXT:    kmovw %r11d, %k2
-; AVX512-NEXT:    vpermd %zmm0, %zmm29, %zmm3 {%k2}
-; AVX512-NEXT:    vmovdqa 32(%r10), %ymm15
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11]
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm12
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-NEXT:    vpermd %zmm13, %zmm19, %zmm31
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512-NEXT:    vpermd %zmm6, %zmm18, %zmm31 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512-NEXT:    vpermd %zmm6, %zmm20, %zmm14
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm6
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11]
-; AVX512-NEXT:    vpermd %zmm4, %zmm21, %zmm14 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15]
-; AVX512-NEXT:    vmovdqa 32(%r10), %xmm2
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15]
-; AVX512-NEXT:    vmovdqa 32(%rax), %xmm7
-; AVX512-NEXT:    vpermd %zmm12, %zmm19, %zmm17
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm12
-; AVX512-NEXT:    vpermd %zmm4, %zmm18, %zmm17 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm15
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15]
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512-NEXT:    vpermd %zmm0, %zmm20, %zmm16
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512-NEXT:    vmovdqa 32(%r10), %ymm5
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm10
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11]
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm13
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512-NEXT:    vpermt2d %zmm11, %zmm18, %zmm0
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-NEXT:    vpermt2d %zmm14, %zmm19, %zmm11
+; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm5
+; AVX512-NEXT:    vmovdqa 32(%r10), %xmm6
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15]
+; AVX512-NEXT:    vmovdqa 32(%rax), %xmm10
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm19, %zmm13
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm12
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm14
 ; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512-NEXT:    vpermd %zmm4, %zmm21, %zmm16 {%k2}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512-NEXT:    vpermd %zmm4, %zmm26, %zmm23
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; AVX512-NEXT:    vpermd %zmm4, %zmm27, %zmm23 {%k1}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vpermd %zmm4, %zmm28, %zmm22
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512-NEXT:    vpermd %zmm6, %zmm29, %zmm22 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7]
-; AVX512-NEXT:    vpermd %zmm6, %zmm26, %zmm25
-; AVX512-NEXT:    vpermd %zmm2, %zmm27, %zmm25 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-NEXT:    vpermd %zmm0, %zmm28, %zmm24
-; AVX512-NEXT:    vpermd %zmm2, %zmm29, %zmm24 {%k2}
-; AVX512-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm15
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm15 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm5
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512-NEXT:    vmovdqa (%r10), %ymm8
+; AVX512-NEXT:    vmovdqa (%rax), %ymm7
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX512-NEXT:    vmovdqa (%r9), %ymm3
 ; AVX512-NEXT:    vmovdqa (%r8), %ymm4
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11]
-; AVX512-NEXT:    vpermd %zmm6, %zmm19, %zmm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-NEXT:    vpermd %zmm7, %zmm18, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15]
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512-NEXT:    vpermd %zmm2, %zmm19, %zmm2
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512-NEXT:    vpermd %zmm0, %zmm18, %zmm2 {%k1}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11]
-; AVX512-NEXT:    vpermd %zmm0, %zmm20, %zmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11]
-; AVX512-NEXT:    vpermd %zmm13, %zmm21, %zmm0 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15]
-; AVX512-NEXT:    vpermd %zmm4, %zmm20, %zmm4
-; AVX512-NEXT:    vpermd %zmm1, %zmm21, %zmm4 {%k2}
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512-NEXT:    vpermd %zmm5, %zmm26, %zmm5
-; AVX512-NEXT:    vpermd %zmm1, %zmm27, %zmm5 {%k1}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512-NEXT:    vpermd %zmm7, %zmm28, %zmm7
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm7 {%k2}
-; AVX512-NEXT:    movb $-86, %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqa64 %zmm30, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm31, %zmm14 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm6
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11]
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-NEXT:    vpermt2d %zmm14, %zmm19, %zmm10
+; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm10 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-NEXT:    vpermt2d %zmm6, %zmm18, %zmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm3
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm24, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm22, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm16, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm10, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm15, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm9, 64(%rax)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
@@ -4234,139 +4220,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm26, %zmm25
-; AVX512-FCP-NEXT:    movw $-30584, %r11w # imm = 0x8888
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm27
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm10, %zmm19
+; AVX512-FCP-NEXT:    movb $-86, %r11b
 ; AVX512-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm25 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm6, %zmm29
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT:    movw $8738, %r11w # imm = 0x2222
-; AVX512-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm29 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm19 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm12
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm16, %zmm27
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm17, %zmm27 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm15
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm18, %zmm30
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm19, %zmm30 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %xmm11
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm16, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm28 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm14
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm10
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %xmm6
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm4
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm13
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
 ; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15]
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm18, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm12 {%k1}
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm11
 ; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm18, %zmm31
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm31 {%k2}
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm21
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm21 {%k1}
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm20 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm26, %zmm23
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm26, %zmm23 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm22
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm22 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm16, %zmm11
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm11 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm11
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
 ; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermd %zmm13, %zmm18, %zmm13
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm14
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm19, %zmm13 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm16, %zmm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm17, %zmm1 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm18, %zmm2
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm2 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm26, %zmm3
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm24, %zmm4
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm4 {%k2}
-; AVX512-FCP-NEXT:    movb $-86, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm29 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm30 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm31 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, %zmm13 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm6, %zmm17, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm20, %zmm4
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 320(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 256(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rax)
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
@@ -4374,139 +4344,125 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm26, %zmm30
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-NEXT:    movw $-30584, %r11w # imm = 0x8888
+; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm20
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm21
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm22
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm23
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm25
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm0
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm27
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm17, %zmm9
+; AVX512DQ-NEXT:    movb $-86, %r11b
 ; AVX512DQ-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm27, %zmm30 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm8
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm9
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm10
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm11
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm28, %zmm3
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-NEXT:    movw $8738, %r11w # imm = 0x2222
-; AVX512DQ-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm29, %zmm3 {%k2}
-; AVX512DQ-NEXT:    vmovdqa 32(%r10), %ymm15
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm7
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm12
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm13, %zmm19, %zmm31
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm18, %zmm31 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm13
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm20, %zmm14
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm6
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm21, %zmm14 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%r10), %xmm2
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm7
-; AVX512DQ-NEXT:    vpermd %zmm12, %zmm19, %zmm17
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm12
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm18, %zmm17 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm15
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm20, %zmm16
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 32(%r10), %ymm5
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm10
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm13
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm18, %zmm0
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm12
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11]
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm19, %zmm11
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 32(%r10), %xmm6
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15]
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm10
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm19, %zmm13
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm12
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm14
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm21, %zmm16 {%k2}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm26, %zmm23
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm27, %zmm23 {%k1}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm28, %zmm22
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm29, %zmm22 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm26, %zmm25
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm27, %zmm25 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm28, %zmm24
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm29, %zmm24 {%k2}
-; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm15
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm15 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm8
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm7
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm3
 ; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm4
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm19, %zmm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-NEXT:    vpermd %zmm7, %zmm18, %zmm6 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm1
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm19, %zmm2
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm18, %zmm2 {%k1}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm20, %zmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11]
-; AVX512DQ-NEXT:    vpermd %zmm13, %zmm21, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm20, %zmm4
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm21, %zmm4 {%k2}
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512DQ-NEXT:    vpermd %zmm5, %zmm26, %zmm5
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm27, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512DQ-NEXT:    vpermd %zmm7, %zmm28, %zmm7
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm7 {%k2}
-; AVX512DQ-NEXT:    movb $-86, %al
-; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vmovdqa64 %zmm30, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm31, %zmm14 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm6
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm12
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm19, %zmm10
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm10 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm6, %zmm18, %zmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm3 {%k1}
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm22, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 64(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm9, 64(%rax)
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -4515,139 +4471,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm26, %zmm25
-; AVX512DQ-FCP-NEXT:    movw $-30584, %r11w # imm = 0x8888
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm9
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm27
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm10, %zmm19
+; AVX512DQ-FCP-NEXT:    movb $-86, %r11b
 ; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm25 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm7
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm8
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm10
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm6, %zmm29
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT:    movw $8738, %r11w # imm = 0x2222
-; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm29 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm19 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm13
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm16, %zmm27
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm17, %zmm27 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm14
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm18, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm19, %zmm30 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm16, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm28 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm14
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm10, %zmm17, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %xmm6
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm13
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm18, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm3
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm11
 ; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm18, %zmm31
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm31 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm21 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm1 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm1, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm20 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm26, %zmm23
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm26, %zmm23 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm24, %zmm22
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm22 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm5 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm16, %zmm11
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm17, %zmm11 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm11
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11]
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm12
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm13, %zmm18, %zmm13
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm15
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm14
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm19, %zmm13 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm16, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm17, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm18, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm19, %zmm2 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm26, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm26, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm24, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm24, %zmm4 {%k2}
-; AVX512DQ-FCP-NEXT:    movb $-86, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm29 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm30 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm18, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm6, %zmm17, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm20, %zmm4
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 128(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 320(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, 64(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 320(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 256(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, 64(%rax)
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
@@ -7777,1095 +7717,959 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride8_vf64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    subq $504, %rsp # imm = 0x1F8
+; AVX512-NEXT:    subq $392, %rsp # imm = 0x188
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    vmovdqa (%r10), %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r10), %xmm4
-; AVX512-NEXT:    vmovdqa (%rax), %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rax), %xmm3
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa (%r9), %xmm0
-; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-NEXT:    vpermd %zmm2, %zmm30, %zmm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-NEXT:    movw $-30584, %r11w # imm = 0x8888
-; AVX512-NEXT:    kmovw %r11d, %k2
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    movw $8738, %r11w # imm = 0x2222
+; AVX512-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm22
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512-NEXT:    movb $-86, %r11b
 ; AVX512-NEXT:    kmovw %r11d, %k1
-; AVX512-NEXT:    vmovdqa 96(%r10), %ymm2
-; AVX512-NEXT:    vmovdqa 96(%rax), %ymm5
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512-NEXT:    vmovdqa 96(%r9), %ymm8
-; AVX512-NEXT:    vmovdqa 96(%r8), %ymm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-NEXT:    vpermd %zmm10, %zmm19, %zmm0
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm18, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm10
-; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm11
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%r10), %ymm0
+; AVX512-NEXT:    vmovdqa 96(%rax), %ymm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512-NEXT:    vmovdqa 96(%r9), %ymm3
+; AVX512-NEXT:    vmovdqa 96(%r8), %ymm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm5
+; AVX512-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
 ; AVX512-NEXT:    vmovdqa 96(%rsi), %ymm12
 ; AVX512-NEXT:    vmovdqa 96(%rdi), %ymm13
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm16, %zmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-NEXT:    vpermd %zmm14, %zmm17, %zmm0 {%k1}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
-; AVX512-NEXT:    vpermd %zmm5, %zmm19, %zmm0
-; AVX512-NEXT:    vpermd %zmm2, %zmm18, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512-NEXT:    vpermd %zmm5, %zmm16, %zmm31
-; AVX512-NEXT:    vpermd %zmm2, %zmm17, %zmm31 {%k1}
-; AVX512-NEXT:    vmovdqa 96(%r10), %xmm2
-; AVX512-NEXT:    vmovdqa 96(%rax), %xmm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX512-NEXT:    vmovdqa 96(%r9), %xmm10
-; AVX512-NEXT:    vmovdqa 96(%r8), %xmm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512-NEXT:    vpermd %zmm12, %zmm30, %zmm0
-; AVX512-NEXT:    vpermd %zmm9, %zmm29, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512-NEXT:    vpermd %zmm8, %zmm30, %zmm0
-; AVX512-NEXT:    vpermd %zmm2, %zmm29, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%r10), %ymm2
-; AVX512-NEXT:    vmovdqa 64(%rax), %ymm9
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
-; AVX512-NEXT:    vmovdqa 64(%r9), %ymm10
-; AVX512-NEXT:    vmovdqa 64(%r8), %ymm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512-NEXT:    vpermd %zmm12, %zmm19, %zmm0
-; AVX512-NEXT:    vpermd %zmm8, %zmm18, %zmm0 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm12
-; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm15
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512-NEXT:    vpermt2d %zmm10, %zmm19, %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm19, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 96(%r10), %xmm0
+; AVX512-NEXT:    vmovdqa 96(%rax), %xmm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512-NEXT:    vmovdqa 96(%r8), %xmm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm2
+; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm13
+; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX512-NEXT:    vpermt2d %zmm10, %zmm17, %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%r10), %ymm6
+; AVX512-NEXT:    vmovdqa 64(%rax), %ymm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
+; AVX512-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm10
+; AVX512-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
 ; AVX512-NEXT:    vmovdqa 64(%rsi), %ymm1
 ; AVX512-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-NEXT:    vpermd %zmm8, %zmm16, %zmm26
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
-; AVX512-NEXT:    vpermd %zmm8, %zmm17, %zmm26 {%k1}
-; AVX512-NEXT:    vmovdqa 64(%r9), %xmm8
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15]
-; AVX512-NEXT:    vmovdqa 64(%r8), %xmm9
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15]
-; AVX512-NEXT:    vpermd %zmm10, %zmm19, %zmm5
-; AVX512-NEXT:    vpermd %zmm2, %zmm18, %zmm5 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-NEXT:    vpermt2d %zmm14, %zmm19, %zmm7
+; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm7 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-NEXT:    vpermt2d %zmm5, %zmm18, %zmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
 ; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512-NEXT:    vpermd %zmm0, %zmm16, %zmm24
-; AVX512-NEXT:    vpermd %zmm2, %zmm17, %zmm24 {%k1}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT:    vpermd %zmm1, %zmm30, %zmm1
-; AVX512-NEXT:    vpermd %zmm0, %zmm29, %zmm1 {%k2}
-; AVX512-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512-NEXT:    vmovdqa 32(%r10), %ymm0
-; AVX512-NEXT:    vmovdqa 32(%rax), %ymm1
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-NEXT:    vmovdqa 32(%r9), %ymm4
-; AVX512-NEXT:    vmovdqa 32(%r8), %ymm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
-; AVX512-NEXT:    vpermd %zmm9, %zmm19, %zmm28
-; AVX512-NEXT:    vpermd %zmm3, %zmm18, %zmm28 {%k2}
-; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm9
-; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
-; AVX512-NEXT:    vpermd %zmm3, %zmm16, %zmm23
-; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11]
-; AVX512-NEXT:    vpermd %zmm6, %zmm17, %zmm23 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15]
-; AVX512-NEXT:    vpermd %zmm1, %zmm19, %zmm25
-; AVX512-NEXT:    vpermd %zmm0, %zmm18, %zmm25 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15]
-; AVX512-NEXT:    vpermd %zmm0, %zmm16, %zmm21
-; AVX512-NEXT:    vpermd %zmm3, %zmm17, %zmm21 {%k1}
-; AVX512-NEXT:    vmovdqa (%r10), %ymm3
-; AVX512-NEXT:    vmovdqa (%r9), %ymm4
-; AVX512-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
-; AVX512-NEXT:    vpermd %zmm8, %zmm19, %zmm27
-; AVX512-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11]
-; AVX512-NEXT:    vpermd %zmm9, %zmm18, %zmm27 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
-; AVX512-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512-NEXT:    vpermd %zmm4, %zmm19, %zmm20
-; AVX512-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512-NEXT:    vpermd %zmm3, %zmm18, %zmm20 {%k2}
-; AVX512-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512-NEXT:    vpermd %zmm8, %zmm16, %zmm18
-; AVX512-NEXT:    vmovdqa (%rdx), %ymm8
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
-; AVX512-NEXT:    vpermd %zmm9, %zmm17, %zmm18 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512-NEXT:    vmovdqa 32(%r10), %xmm4
-; AVX512-NEXT:    vpermd %zmm3, %zmm16, %zmm16
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vmovdqa 64(%r10), %xmm7
+; AVX512-NEXT:    vmovdqa 64(%rax), %xmm2
+; AVX512-NEXT:    vmovdqa 64(%r9), %xmm8
+; AVX512-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512-NEXT:    vpermt2d %zmm4, %zmm16, %zmm5
+; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm4
+; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm9
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm15
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512-NEXT:    vmovdqa 32(%r10), %ymm8
+; AVX512-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
+; AVX512-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm18, %zmm6
+; AVX512-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-NEXT:    vpermt2d %zmm9, %zmm19, %zmm14
+; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-NEXT:    vpermt2d %zmm4, %zmm18, %zmm5
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm19, %zmm13
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512-NEXT:    vmovdqa 32(%r10), %xmm10
 ; AVX512-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512-NEXT:    vpermd %zmm6, %zmm17, %zmm16 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512-NEXT:    vpermd %zmm10, %zmm30, %zmm19
-; AVX512-NEXT:    vmovdqa 32(%r8), %xmm8
-; AVX512-NEXT:    vpermd %zmm2, %zmm29, %zmm19 {%k2}
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512-NEXT:    vpermd %zmm2, %zmm30, %zmm10
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT:    vpermd %zmm9, %zmm29, %zmm10 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; AVX512-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512-NEXT:    vpermd %zmm4, %zmm30, %zmm17
-; AVX512-NEXT:    vpermd %zmm3, %zmm29, %zmm17 {%k2}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512-NEXT:    vmovdqa64 %xmm0, %xmm22
-; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload
-; AVX512-NEXT:    # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512-NEXT:    vpermd %zmm6, %zmm30, %zmm8
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512-NEXT:    vpermd %zmm4, %zmm29, %zmm8 {%k2}
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-NEXT:    vpermd %zmm6, %zmm29, %zmm7
-; AVX512-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512-NEXT:    vpermd %zmm3, %zmm30, %zmm7 {%k1}
-; AVX512-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX512-NEXT:    vmovdqa 96(%rdi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT:    vpermd %zmm6, %zmm29, %zmm6
-; AVX512-NEXT:    vmovdqa 96(%rcx), %xmm1
-; AVX512-NEXT:    vmovdqa 96(%rdx), %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT:    vpermd %zmm14, %zmm30, %zmm6 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm5
-; AVX512-NEXT:    vpermd %zmm0, %zmm30, %zmm5 {%k1}
-; AVX512-NEXT:    vmovdqa 64(%rsi), %xmm1
-; AVX512-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpermd %zmm3, %zmm29, %zmm4
-; AVX512-NEXT:    vmovdqa 64(%rcx), %xmm14
-; AVX512-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512-NEXT:    vpermd %zmm15, %zmm30, %zmm4 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm3
-; AVX512-NEXT:    vpermd %zmm0, %zmm30, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT:    vpermd %zmm14, %zmm29, %zmm14
-; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512-NEXT:    vpermd %zmm11, %zmm30, %zmm14 {%k1}
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT:    vpermd %zmm1, %zmm29, %zmm1
-; AVX512-NEXT:    vpermd %zmm0, %zmm30, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm0
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512-NEXT:    vpermd %zmm2, %zmm29, %zmm2
-; AVX512-NEXT:    vpermd %zmm0, %zmm30, %zmm2 {%k1}
-; AVX512-NEXT:    movb $-86, %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm31 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm6 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm26 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm24 {%k1}
-; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-NEXT:    vmovdqa64 %zmm9, %zmm4 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm28, %zmm23 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm25, %zmm21 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm10, %zmm14 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm17, %zmm1 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm27, %zmm18 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm20, %zmm16 {%k1}
-; AVX512-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
+; AVX512-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm16, %zmm6
+; AVX512-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm12
+; AVX512-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm16, %zmm3
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512-NEXT:    vpermt2d %zmm1, %zmm17, %zmm4
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512-NEXT:    vmovdqa (%r10), %ymm10
+; AVX512-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
+; AVX512-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
+; AVX512-NEXT:    vpermt2d %zmm0, %zmm18, %zmm7
+; AVX512-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-NEXT:    vpermt2d %zmm11, %zmm19, %zmm0
+; AVX512-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k1}
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm18, %zmm5
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX512-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-NEXT:    vpermt2d %zmm3, %zmm19, %zmm1
+; AVX512-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
+; AVX512-NEXT:    vmovdqa64 %xmm20, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512-NEXT:    vmovdqa64 %xmm23, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512-NEXT:    vmovdqa64 %xmm25, %xmm5
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512-NEXT:    vmovdqa64 %xmm27, %xmm6
+; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm16, 192(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm18, 128(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm14, 256(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm21, 448(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm23, 384(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm3, 576(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm4, 512(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm24, 704(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm26, 640(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm5, 832(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm31, 960(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm0, 896(%rax)
-; AVX512-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512-NEXT:    addq $504, %rsp # imm = 0x1F8
+; AVX512-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 ;
 ; AVX512-FCP-LABEL: store_i16_stride8_vf64:
 ; AVX512-FCP:       # %bb.0:
-; AVX512-FCP-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX512-FCP-NEXT:    subq $328, %rsp # imm = 0x148
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-FCP-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
 ; AVX512-FCP-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm1
-; AVX512-FCP-NEXT:    movw $-30584, %r11w # imm = 0x8888
-; AVX512-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm1 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm2, %zmm17
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT:    movw $8738, %r11w # imm = 0x2222
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm2, %xmm26
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0]
+; AVX512-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm20
+; AVX512-FCP-NEXT:    movb $-86, %r11b
 ; AVX512-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm17 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 96(%r10), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm13
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm15
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm23, %zmm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm22, %zmm0 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512-FCP-NEXT:    vpermd %zmm6, %zmm20, %zmm10
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm0
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermd %zmm9, %zmm21, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm5
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm22, %zmm5 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm13
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm13 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 64(%r10), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %ymm1
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm5
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm22, %zmm5 {%k2}
-; AVX512-FCP-NEXT:    vmovdqu64 %zmm5, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm5
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11]
-; AVX512-FCP-NEXT:    vpermd %zmm8, %zmm20, %zmm16
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm8
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm21, %zmm16 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm19
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm19 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm18
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm18 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 96(%r10), %ymm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %ymm1
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %ymm4
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm25
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm22, %zmm25 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermd %zmm8, %zmm20, %zmm24
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm9
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm21, %zmm24 {%k1}
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %ymm12
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
 ; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm27
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm27 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm26
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm26 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm23, %zmm28
-; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm22, %zmm28 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm23
-; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm23 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm20, %zmm22
-; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm21, %zmm22 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm20, %zmm20
-; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm0
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm21, %zmm20 {%k1}
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm3, %zmm21
-; AVX512-FCP-NEXT:    vmovdqa 96(%r10), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm14, %zmm21 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm29
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm29 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm30
-; AVX512-FCP-NEXT:    vmovdqa 64(%r10), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm14, %zmm30 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm31
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm31 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm6
-; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm4
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm14, %zmm6 {%k2}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm1
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm14, %zmm1 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm14, %zmm9
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm9 {%k2}
-; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm5, %zmm8
-; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm5
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm7, %zmm8 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512-FCP-NEXT:    vpermd %zmm2, %zmm7, %zmm2
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm3
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 96(%r10), %xmm0
+; AVX512-FCP-NEXT:    vmovdqa 96(%rax), %xmm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 96(%r8), %xmm4
 ; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT:    vpermd %zmm5, %zmm7, %zmm5
-; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm12
-; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm14
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-FCP-NEXT:    vpermd %zmm3, %zmm7, %zmm3
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; AVX512-FCP-NEXT:    vpermd %zmm12, %zmm7, %zmm12
-; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm14
-; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512-FCP-NEXT:    vpermd %zmm15, %zmm7, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; AVX512-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm4
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512-FCP-NEXT:    # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
-; AVX512-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm11
-; AVX512-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm11 {%k1}
-; AVX512-FCP-NEXT:    movb $-86, %al
-; AVX512-FCP-NEXT:    kmovw %eax, %k1
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm21, %zmm8 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm29, %zmm2 {%k1}
-; AVX512-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm30, %zmm5 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm27, %zmm26 {%k1}
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 96(%rcx), %xmm2
+; AVX512-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-FCP-NEXT:    vmovdqa 96(%rsi), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa 96(%rdi), %xmm15
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm12, %zmm18, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%r10), %ymm6
+; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %ymm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm12
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm15, %zmm19, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm5, %zmm17, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa 64(%r10), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vmovdqa 64(%rax), %xmm2
+; AVX512-FCP-NEXT:    vmovdqa 64(%r9), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm16, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa 64(%rcx), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512-FCP-NEXT:    vmovdqa 64(%rsi), %xmm13
+; AVX512-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm9
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm15
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm14
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm4, %zmm17, %zmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm19, %zmm13
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa 32(%r10), %xmm10
+; AVX512-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
+; AVX512-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm6
+; AVX512-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm12
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm28, %zmm22 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm23, %zmm20 {%k1}
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm4
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa (%r10), %ymm10
+; AVX512-FCP-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
+; AVX512-FCP-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512-FCP-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm7
+; AVX512-FCP-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512-FCP-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512-FCP-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm0
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k1}
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm17, %zmm5
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX512-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512-FCP-NEXT:    vpermt2d %zmm3, %zmm19, %zmm1
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm24, %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm27, %xmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
+; AVX512-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm5
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm26, 448(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm24, 384(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm3, 576(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm5, 512(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm18, 704(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm16, 640(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm2, 832(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm8, 768(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 960(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm10, 896(%rax)
-; AVX512-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
-; AVX512-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
+; AVX512-FCP-NEXT:    addq $328, %rsp # imm = 0x148
 ; AVX512-FCP-NEXT:    vzeroupper
 ; AVX512-FCP-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride8_vf64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    subq $504, %rsp # imm = 0x1F8
+; AVX512DQ-NEXT:    subq $392, %rsp # imm = 0x188
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r10), %xmm4
-; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm1
-; AVX512DQ-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rax), %xmm3
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm0
-; AVX512DQ-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm7
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm30, %zmm0
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-NEXT:    movw $-30584, %r11w # imm = 0x8888
-; AVX512DQ-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    movw $8738, %r11w # imm = 0x2222
+; AVX512DQ-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512DQ-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm20
+; AVX512DQ-NEXT:    vmovdqa64 %xmm1, %xmm21
+; AVX512DQ-NEXT:    vmovdqa (%r9), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm22
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm24
+; AVX512DQ-NEXT:    vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm4, %xmm26
+; AVX512DQ-NEXT:    vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512DQ-NEXT:    movb $-86, %r11b
 ; AVX512DQ-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-NEXT:    vmovdqa 96(%r10), %ymm2
-; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm5
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm8
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm10, %zmm19, %zmm0
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm18, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm10
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm11
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%r10), %ymm0
+; AVX512DQ-NEXT:    vmovdqa 96(%rax), %ymm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %ymm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
 ; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %ymm12
 ; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %ymm13
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm16, %zmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-NEXT:    vpermd %zmm14, %zmm17, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
-; AVX512DQ-NEXT:    vpermd %zmm5, %zmm19, %zmm0
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm18, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
-; AVX512DQ-NEXT:    vpermd %zmm5, %zmm16, %zmm31
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm17, %zmm31 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 96(%r10), %xmm2
-; AVX512DQ-NEXT:    vmovdqa 96(%rax), %xmm8
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm10
-; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512DQ-NEXT:    vpermd %zmm12, %zmm30, %zmm0
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm29, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm30, %zmm0
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm29, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%r10), %ymm2
-; AVX512DQ-NEXT:    vmovdqa 64(%rax), %ymm9
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11]
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm10
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512DQ-NEXT:    vpermd %zmm12, %zmm19, %zmm0
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm18, %zmm0 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm15
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm19, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm18, %zmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm19, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 96(%r10), %xmm0
+; AVX512DQ-NEXT:    vmovdqa 96(%rax), %xmm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512DQ-NEXT:    vmovdqa 96(%r8), %xmm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm13
+; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm10, %zmm17, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm17, %zmm2
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r10), %ymm6
+; AVX512DQ-NEXT:    vmovdqa 64(%rax), %ymm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm10
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
 ; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %ymm1
 ; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm16, %zmm26
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm17, %zmm26 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15]
-; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm9
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15]
-; AVX512DQ-NEXT:    vpermd %zmm10, %zmm19, %zmm5
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm18, %zmm5 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm14, %zmm19, %zmm7
+; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm7 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm5, %zmm18, %zmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
 ; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm16, %zmm24
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm17, %zmm24 {%k1}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm30, %zmm1
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm29, %zmm1 {%k2}
-; AVX512DQ-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX512DQ-NEXT:    vmovdqa 32(%r10), %ymm0
-; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm1
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm4
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm8
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm19, %zmm28
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm18, %zmm28 {%k2}
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm9
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11]
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm16, %zmm23
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm12
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm17, %zmm23 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm19, %zmm25
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm18, %zmm25 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15]
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm16, %zmm21
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm17, %zmm21 {%k1}
-; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm3
-; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm4
-; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm6
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm19, %zmm27
-; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm8
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11]
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm18, %zmm27 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm6
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm19, %zmm20
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm4
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm18, %zmm20 {%k2}
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11]
-; AVX512DQ-NEXT:    vpermd %zmm8, %zmm16, %zmm18
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm8
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm17, %zmm18 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15]
-; AVX512DQ-NEXT:    vmovdqa 32(%r10), %xmm4
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm16, %zmm16
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vmovdqa 64(%r10), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 64(%rax), %xmm2
+; AVX512DQ-NEXT:    vmovdqa 64(%r9), %xmm8
+; AVX512DQ-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm16, %zmm5
+; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm4
+; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm14
+; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm9
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm15
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 32(%r10), %ymm8
+; AVX512DQ-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm18, %zmm6
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm9, %zmm19, %zmm14
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm4, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm19, %zmm13
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 32(%r10), %xmm10
 ; AVX512DQ-NEXT:    vmovdqa 32(%rax), %xmm3
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm17, %zmm16 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm6
-; AVX512DQ-NEXT:    vpermd %zmm10, %zmm30, %zmm19
-; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm8
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm29, %zmm19 {%k2}
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm30, %zmm10
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT:    vpermd %zmm9, %zmm29, %zmm10 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-NEXT:    vmovdqa (%rcx), %xmm0
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; AVX512DQ-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm30, %zmm17
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm29, %zmm17 {%k2}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; AVX512DQ-NEXT:    vmovdqa64 %xmm0, %xmm22
-; AVX512DQ-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-NEXT:    vmovdqa (%rsi), %xmm9
-; AVX512DQ-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload
-; AVX512DQ-NEXT:    # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm30, %zmm8
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512DQ-NEXT:    vpermd %zmm4, %zmm29, %zmm8 {%k2}
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm29, %zmm7
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm30, %zmm7 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 96(%rsi), %xmm3
-; AVX512DQ-NEXT:    vmovdqa 96(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-NEXT:    vpermd %zmm6, %zmm29, %zmm6
-; AVX512DQ-NEXT:    vmovdqa 96(%rcx), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 96(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT:    vpermd %zmm14, %zmm30, %zmm6 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm5
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm30, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 64(%rsi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 64(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpermd %zmm3, %zmm29, %zmm4
-; AVX512DQ-NEXT:    vmovdqa 64(%rcx), %xmm14
-; AVX512DQ-NEXT:    vmovdqa 64(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-NEXT:    vpermd %zmm15, %zmm30, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm3
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm30, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm1
-; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT:    vpermd %zmm14, %zmm29, %zmm14
-; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512DQ-NEXT:    vpermd %zmm11, %zmm30, %zmm14 {%k1}
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
-; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT:    vpermd %zmm1, %zmm29, %zmm1
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm30, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm0
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512DQ-NEXT:    vpermd %zmm2, %zmm29, %zmm2
-; AVX512DQ-NEXT:    vpermd %zmm0, %zmm30, %zmm2 {%k1}
-; AVX512DQ-NEXT:    movb $-86, %al
-; AVX512DQ-NEXT:    kmovw %eax, %k1
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm0 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm31 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm6 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm5 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm26 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm24 {%k1}
-; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-NEXT:    vmovdqa64 %zmm9, %zmm4 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm19, %zmm3 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm28, %zmm23 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm25, %zmm21 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm10, %zmm14 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm17, %zmm1 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm27, %zmm18 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm20, %zmm16 {%k1}
-; AVX512DQ-NEXT:    vmovdqa64 %zmm8, %zmm2 {%k1}
+; AVX512DQ-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512DQ-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm16, %zmm6
+; AVX512DQ-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512DQ-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm12
+; AVX512DQ-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm16, %zmm3
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512DQ-NEXT:    vpermt2d %zmm1, %zmm17, %zmm4
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-NEXT:    vmovdqa (%r10), %ymm10
+; AVX512DQ-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
+; AVX512DQ-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512DQ-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm0, %zmm18, %zmm7
+; AVX512DQ-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512DQ-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512DQ-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-NEXT:    vpermt2d %zmm11, %zmm19, %zmm0
+; AVX512DQ-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k1}
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm18, %zmm5
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX512DQ-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-NEXT:    vpermt2d %zmm3, %zmm19, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
+; AVX512DQ-NEXT:    vmovdqa64 %xmm20, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm21, %xmm3
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm23, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512DQ-NEXT:    vmovdqa64 %xmm24, %xmm2
+; AVX512DQ-NEXT:    vmovdqa64 %xmm25, %xmm5
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512DQ-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512DQ-NEXT:    vmovdqa64 %xmm27, %xmm6
+; AVX512DQ-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm16, 192(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm18, 128(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 320(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 256(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm21, 448(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm23, 384(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 576(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 512(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm24, 704(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm26, 640(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 832(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm6, 768(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm31, 960(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 896(%rax)
-; AVX512DQ-NEXT:    vmovdqa64 %zmm7, 64(%rax)
-; AVX512DQ-NEXT:    addq $504, %rsp # imm = 0x1F8
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512DQ-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQ-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT:    vmovaps %zmm0, 64(%rax)
+; AVX512DQ-NEXT:    addq $392, %rsp # imm = 0x188
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride8_vf64:
 ; AVX512DQ-FCP:       # %bb.0:
-; AVX512DQ-FCP-NEXT:    subq $264, %rsp # imm = 0x108
+; AVX512DQ-FCP-NEXT:    subq $328, %rsp # imm = 0x148
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %xmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm21
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm1, %xmm22
 ; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT:    movw $-30584, %r11w # imm = 0x8888
-; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm1 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm2, %zmm17
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT:    movw $8738, %r11w # imm = 0x2222
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm23
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm24
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm25
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm2, %xmm26
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm4, %xmm27
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm3, %xmm28
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0]
+; AVX512DQ-FCP-NEXT:    vpermi2d %zmm0, %zmm2, %zmm20
+; AVX512DQ-FCP-NEXT:    movb $-86, %r11b
 ; AVX512DQ-FCP-NEXT:    kmovw %r11d, %k1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r10), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm13
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm15
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm22, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm6, %zmm20, %zmm10
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm0
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm9, %zmm21, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm22, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm13
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r10), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm20 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r10), %ymm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %ymm1
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %ymm4
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm5
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm22, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm5, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm8, %zmm20, %zmm16
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm8
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm21, %zmm16 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm19
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm19 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm18
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm18 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm1
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm23, %zmm25
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm22, %zmm25 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm8, %zmm20, %zmm24
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm9
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm21, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %ymm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %ymm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
 ; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm27
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm27 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm20, %zmm26
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm21, %zmm26 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm0
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm23, %zmm28
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm22, %zmm28 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm23, %zmm23
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm22, %zmm23 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm20, %zmm22
-; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm21, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm20, %zmm20
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm21, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm3, %zmm21
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r10), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm14, %zmm21 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm29
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm29 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm30
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r10), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm14, %zmm30 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm31
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm31 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm0
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm2
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm1, %zmm14, %zmm6
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm4
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm14, %zmm6 {%k2}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm14, %zmm1 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm14, %zmm9
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm14, %zmm9 {%k2}
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm2
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm5, %zmm8
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm5
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm7, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm2, %zmm7, %zmm2
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm3
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm19, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r10), %xmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rax), %xmm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r9), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%r8), %xmm4
 ; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm5, %zmm7, %zmm5
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm14
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm3, %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm12, %zmm7, %zmm12
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm14
-; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm15, %zmm7, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
-; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm4, %zmm7, %zmm4
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQ-FCP-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT:    # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
-; AVX512DQ-FCP-NEXT:    vpermd %zmm11, %zmm7, %zmm11
-; AVX512DQ-FCP-NEXT:    vpermd %zmm0, %zmm7, %zmm11 {%k1}
-; AVX512DQ-FCP-NEXT:    movb $-86, %al
-; AVX512DQ-FCP-NEXT:    kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm21, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm29, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm19, %zmm18 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm30, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm31, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm25, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm27, %zmm26 {%k1}
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rcx), %xmm2
+; AVX512DQ-FCP-NEXT:    vpmovsxbd {{.*#+}} zmm18 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdx), %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rsi), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 96(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm12, %zmm18, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r10), %ymm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %ymm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm12
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %ymm13
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm15, %zmm19, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm5, %zmm17, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm19, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r10), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rax), %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r9), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%r8), %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rcx), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdx), %xmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rsi), %xmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm2
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm15
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, %zmm15 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %ymm4
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %ymm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %ymm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm17, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %ymm7
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %ymm1
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm9, %zmm19, %zmm14
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm14 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm4, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm19, %zmm13
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm13 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r10), %xmm10
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rax), %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r9), %xmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%r8), %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm6
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQ-FCP-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm12
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm28, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm23, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm9, %zmm11 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm1, %zmm18, %zmm4
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r10), %ymm10
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rax), %ymm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r9), %ymm5
+; AVX512DQ-FCP-NEXT:    vmovdqa (%r8), %ymm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm0, %zmm17, %zmm7
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rcx), %ymm8
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdx), %ymm9
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rsi), %ymm2
+; AVX512DQ-FCP-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm11, %zmm19, %zmm0
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm7, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm17, %zmm5
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15]
+; AVX512DQ-FCP-NEXT:    vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm3, %zmm19, %zmm1
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm24, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm16, %zmm3
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm25, %xmm2
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm26, %xmm5
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm27, %xmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %xmm28, %xmm6
+; AVX512DQ-FCP-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-FCP-NEXT:    vpermt2d %zmm2, %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, %zmm5 {%k1}
 ; AVX512DQ-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm11, (%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 192(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm12, 256(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm26, 448(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm24, 384(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm3, 576(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm5, 512(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm18, 704(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm16, 640(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm2, 832(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm8, 768(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 960(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm10, 896(%rax)
-; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
-; AVX512DQ-FCP-NEXT:    addq $264, %rsp # imm = 0x108
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm13, 448(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm14, 384(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm15, 576(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 512(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 704(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 640(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 832(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 768(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 960(%rax)
+; AVX512DQ-FCP-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT:    vmovaps %zmm0, 896(%rax)
+; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm20, 64(%rax)
+; AVX512DQ-FCP-NEXT:    addq $328, %rsp # imm = 0x148
 ; AVX512DQ-FCP-NEXT:    vzeroupper
 ; AVX512DQ-FCP-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 86efcf9c57616..ad9db98711a62 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -1190,8 +1190,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm4, %ymm5, %ymm4
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
@@ -1233,8 +1232,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm4, %ymm5, %ymm4
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT:    vpor %ymm3, %ymm4, %ymm3
 ; AVX2-FP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2]
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
@@ -1461,20 +1459,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
 ; AVX512BW-NEXT:    vpor %ymm4, %ymm3, %ymm3
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[3,19],zero,zero,zero,ymm4[28,20],zero,zero,zero,ymm4[29,21],zero,zero,zero,ymm4[30,22]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
 ; AVX512BW-NEXT:    vpor %ymm4, %ymm5, %ymm4
-; AVX512BW-NEXT:    movl $831283992, %eax # imm = 0x318C6318
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu8 %ymm3, %ymm4 {%k1}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512BW-NEXT:    vporq %zmm3, %zmm4, %zmm3
 ; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
 ; AVX512BW-NEXT:    vpermd %zmm2, %zmm4, %zmm4
 ; AVX512BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
@@ -1531,20 +1527,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[3,19],zero,zero,zero,ymm4[28,20],zero,zero,zero,ymm4[29,21],zero,zero,zero,ymm4[30,22]
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
 ; AVX512DQ-BW-NEXT:    vpor %ymm4, %ymm5, %ymm4
-; AVX512DQ-BW-NEXT:    movl $831283992, %eax # imm = 0x318C6318
-; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm3, %ymm4 {%k1}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22]
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpor %ymm3, %ymm5, %ymm3
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512DQ-BW-NEXT:    vporq %zmm3, %zmm4, %zmm3
 ; AVX512DQ-BW-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
 ; AVX512DQ-BW-NEXT:    vpermd %zmm2, %zmm4, %zmm4
 ; AVX512DQ-BW-NEXT:    movabsq $595056260442243600, %rax # imm = 0x842108421084210
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 6d499e17bfbc6..03f5b90002d34 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -2996,94 +2996,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512BW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512BW-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-NEXT:    vpermw %ymm7, %ymm8, %ymm7
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm7
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512BW-NEXT:    vpshufb %ymm8, %ymm6, %ymm4
-; AVX512BW-NEXT:    vpshufb %ymm8, %ymm5, %ymm9
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-NEXT:    vpermw %ymm9, %ymm10, %ymm9
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512BW-NEXT:    movl $1227114788, %r10d # imm = 0x49244924
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm2, %ymm6
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
+; AVX512BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm9
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512BW-NEXT:    movw $18724, %r10w # imm = 0x4924
 ; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm4 {%k1}
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm9
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm0, %ymm10
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512BW-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
-; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm4 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm9
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm9, %xmm10
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm10
+; AVX512BW-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k1}
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
+; AVX512BW-NEXT:    vpshufb %ymm9, %ymm1, %ymm6
+; AVX512BW-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-NEXT:    movw $9362, %r10w # imm = 0x2492
+; AVX512BW-NEXT:    kmovd %r10d, %k2
+; AVX512BW-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k2}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
+; AVX512BW-NEXT:    vpermi2w %ymm6, %ymm10, %ymm11
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm6
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512BW-NEXT:    vpermi2w %zmm11, %zmm6, %zmm10
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm6
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm8
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-NEXT:    vpermi2w %zmm8, %zmm13, %zmm14
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm10, %xmm13
-; AVX512BW-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512BW-NEXT:    vprold $16, %xmm13, %xmm13
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm13, %zmm8
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
-; AVX512BW-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm8, %xmm13
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
+; AVX512BW-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
+; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm11, %xmm13
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512BW-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm13, %zmm16
-; AVX512BW-NEXT:    movl $613566756, %ecx # imm = 0x24924924
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512BW-NEXT:    vpermi2w %zmm5, %zmm6, %zmm7
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512BW-NEXT:    vmovdqu16 %ymm10, %ymm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm10, %xmm13
+; AVX512BW-NEXT:    vmovdqa (%r8), %xmm15
+; AVX512BW-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512BW-NEXT:    vmovdqu16 %ymm9, %ymm7 {%k2}
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
+; AVX512BW-NEXT:    vpermi2w %ymm9, %ymm13, %ymm16
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512BW-NEXT:    vpermi2w %ymm9, %ymm16, %ymm13
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0]
+; AVX512BW-NEXT:    vpermi2w %ymm9, %ymm8, %ymm11
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512BW-NEXT:    vpermi2w %ymm8, %ymm11, %ymm9
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512BW-NEXT:    vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13]
+; AVX512BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31]
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44]
-; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm5 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512BW-NEXT:    vpermi2w %zmm0, %zmm2, %zmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm0
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -3092,93 +3086,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm7, %ymm8, %ymm7
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm7
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm4
-; AVX512BW-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm9
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm9, %ymm10, %ymm9
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT:    movl $1227114788, %r10d # imm = 0x49244924
-; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm4 {%k1}
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm9
-; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512BW-FCP-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
+; AVX512BW-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm6
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
+; AVX512BW-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm9
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
 ; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm4 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm6
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512BW-FCP-NEXT:    movw $9362, %r10w # imm = 0x2492
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k2}
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm6, %ymm10, %ymm11
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm6
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm11, %zmm6, %zmm10
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm11
+; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm12
 ; AVX512BW-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
 ; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm10, %zmm14, %zmm15
-; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm10
-; AVX512BW-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm8
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm8, %zmm10, %zmm14
-; AVX512BW-FCP-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm15, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm13
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm7 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm13
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm7, %zmm10, %zmm16
-; AVX512BW-FCP-NEXT:    movl $613566756, %ecx # imm = 0x24924924
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm5, %zmm6, %zmm7
+; AVX512BW-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm9, %ymm7 {%k2}
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm9, %ymm13, %ymm16
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm9, %ymm16, %ymm13
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm9, %ymm8, %ymm11
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm8, %ymm11, %ymm9
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13]
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31]
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44]
-; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512BW-FCP-NEXT:    vpermi2w %zmm0, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -3187,94 +3176,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512DQ-BW-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm7, %ymm8, %ymm7
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm7
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm8, %ymm6, %ymm4
-; AVX512DQ-BW-NEXT:    vpshufb %ymm8, %ymm5, %ymm9
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512DQ-BW-NEXT:    movl $1227114788, %r10d # imm = 0x49244924
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm2, %ymm6
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
+; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm4, %ymm9
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-BW-NEXT:    movw $18724, %r10w # imm = 0x4924
 ; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm7, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-BW-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
-; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm9
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm9, %xmm10
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm11
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm11, %xmm8
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm10
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k1}
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm1, %ymm6
+; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-BW-NEXT:    movw $9362, %r10w # imm = 0x2492
+; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm6, %ymm10, %ymm11
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm6
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm11, %zmm6, %zmm10
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm6
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm8
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm12
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm8, %zmm13, %zmm14
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm10, %xmm13
-; AVX512DQ-BW-NEXT:    vpshufb %xmm8, %xmm12, %xmm8
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512DQ-BW-NEXT:    vprold $16, %xmm13, %xmm13
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm13, %zmm8
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
-; AVX512DQ-BW-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm8, %xmm13
-; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
+; AVX512DQ-BW-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm11, %xmm13
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512DQ-BW-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm7, %zmm13, %zmm16
-; AVX512DQ-BW-NEXT:    movl $613566756, %ecx # imm = 0x24924924
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm5, %zmm6, %zmm7
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm10, %ymm7 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm10, %xmm13
+; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm15
+; AVX512DQ-BW-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm9, %ymm7 {%k2}
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm9, %ymm13, %ymm16
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm9, %ymm16, %ymm13
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm9, %ymm8, %ymm11
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm8, %ymm11, %ymm9
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13]
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31]
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44]
-; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512DQ-BW-NEXT:    vpermi2w %zmm0, %zmm2, %zmm3
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 128(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -3283,93 +3266,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm2
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm5
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %ymm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm2, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm7, %ymm8, %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm4, %zmm7
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm6, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm8, %ymm5, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm9, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    movl $1227114788, %r10d # imm = 0x49244924
-; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm1, %ymm9
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-BW-FCP-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm3, %ymm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm6, %ymm2, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm5, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm7, %ymm4, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm9, %zmm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm1, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movw $9362, %r10w # imm = 0x2492
+; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm6, %ymm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm6, %ymm10, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm11, %zmm0, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm11, %zmm6, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm10, %zmm8, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm8
 ; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm9, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm8, %xmm11
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm12
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm10, %xmm12, %xmm10
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm10, %zmm14, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm11, %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm8, %xmm13, %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm8, %zmm10, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    movl $1227105426, %ecx # imm = 0x49242492
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm15, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm8, %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm11, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm14, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm10, %xmm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm15, %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm7, %zmm10, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    movl $613566756, %ecx # imm = 0x24924924
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm5, %zmm6, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm9, %xmm15, %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm9, %ymm7 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm9, %ymm13, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm9, %ymm16, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm7, %zmm13, %zmm7
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm9, %ymm8, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm8, %ymm11, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23]
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm7, %zmm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13]
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm4, %ymm2, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31]
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44]
-; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm14, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %zmm0, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 128(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64
@@ -6368,726 +6346,770 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-LABEL: store_i8_stride6_vf64:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm6
-; AVX512BW-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm2, %ymm3
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-NEXT:    vpermw %ymm3, %ymm8, %ymm3
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm9
-; AVX512BW-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdx), %ymm4
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm4, %ymm10
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512BW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
-; AVX512BW-NEXT:    kmovd %r10d, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm0 {%k1}
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[4,5,6,7,4,5,6,7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
-; AVX512BW-NEXT:    vpshufb %zmm14, %zmm5, %zmm5
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512BW-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm15
+; AVX512BW-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm6, %xmm3
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512BW-NEXT:    vpermt2w %zmm0, %zmm7, %zmm3
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-NEXT:    vmovdqa64 (%rcx), %xmm16
+; AVX512BW-NEXT:    vmovdqa 32(%rcx), %xmm10
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm10, %xmm8
+; AVX512BW-NEXT:    vmovdqa64 (%rdx), %xmm17
+; AVX512BW-NEXT:    vmovdqa 32(%rdx), %xmm11
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm9
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX512BW-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
 ; AVX512BW-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512BW-NEXT:    kmovd %r10d, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm8, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa 32(%r8), %xmm9
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512BW-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX512BW-NEXT:    vpermt2w %ymm8, %ymm13, %ymm12
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm8, %zmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm12, %zmm3
+; AVX512BW-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
+; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm20, %zmm14
+; AVX512BW-NEXT:    movabsq $585610922974906400, %r10 # imm = 0x820820820820820
+; AVX512BW-NEXT:    kmovq %r10, %k2
+; AVX512BW-NEXT:    vmovdqu8 %zmm14, %zmm3 {%k2}
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm15, %xmm14
+; AVX512BW-NEXT:    vpshufb %xmm5, %xmm18, %xmm5
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
+; AVX512BW-NEXT:    vpermt2w %zmm5, %zmm7, %zmm14
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm16, %xmm5
+; AVX512BW-NEXT:    vpshufb %xmm0, %xmm17, %xmm7
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512BW-NEXT:    vprold $16, %xmm7, %xmm7
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5]
+; AVX512BW-NEXT:    vmovdqu16 %zmm5, %zmm14 {%k1}
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX512BW-NEXT:    vpermi2w %ymm7, %ymm14, %ymm13
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; AVX512BW-NEXT:    vpermt2w %zmm7, %zmm8, %zmm14
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm7
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm20, %zmm13
+; AVX512BW-NEXT:    vmovdqu8 %zmm13, %zmm7 {%k2}
+; AVX512BW-NEXT:    vmovdqa (%rcx), %ymm13
+; AVX512BW-NEXT:    vmovdqa (%rdx), %ymm14
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
+; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm21, %zmm20
+; AVX512BW-NEXT:    vmovdqa64 (%rsi), %ymm16
+; AVX512BW-NEXT:    vmovdqa64 (%rdi), %ymm17
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
+; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm18, %zmm15
+; AVX512BW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
 ; AVX512BW-NEXT:    kmovd %r10d, %k2
-; AVX512BW-NEXT:    vmovdqu16 %zmm5, %zmm0 {%k2}
-; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7,4,5,6,7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7]
-; AVX512BW-NEXT:    movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208
-; AVX512BW-NEXT:    kmovq %r10, %k3
-; AVX512BW-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k3}
-; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm9, %ymm5
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512BW-NEXT:    vpshufb %ymm7, %ymm10, %ymm6
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31]
-; AVX512BW-NEXT:    vpermw %ymm6, %ymm8, %ymm6
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %ymm17
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm17, %ymm5
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %ymm19
-; AVX512BW-NEXT:    vpshufb %ymm12, %ymm19, %ymm7
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm19[8],ymm17[8],ymm19[9],ymm17[9],ymm19[10],ymm17[10],ymm19[11],ymm17[11],ymm19[12],ymm17[12],ymm19[13],ymm17[13],ymm19[14],ymm17[14],ymm19[15],ymm17[15],ymm19[24],ymm17[24],ymm19[25],ymm17[25],ymm19[26],ymm17[26],ymm19[27],ymm17[27],ymm19[28],ymm17[28],ymm19[29],ymm17[29],ymm19[30],ymm17[30],ymm19[31],ymm17[31]
-; AVX512BW-NEXT:    vpermw %ymm7, %ymm11, %ymm7
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
-; AVX512BW-NEXT:    vmovdqu16 %zmm6, %zmm5 {%k1}
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpshufb %zmm14, %zmm13, %zmm6
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7]
-; AVX512BW-NEXT:    vmovdqu16 %zmm6, %zmm5 {%k2}
-; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm6 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7]
-; AVX512BW-NEXT:    vmovdqu8 %zmm6, %zmm5 {%k3}
-; AVX512BW-NEXT:    vmovdqa64 (%rsi), %xmm21
-; AVX512BW-NEXT:    vmovdqa 32(%rsi), %xmm7
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-NEXT:    vpshufb %xmm20, %xmm7, %xmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %xmm22
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512BW-NEXT:    vpshufb %xmm20, %xmm8, %xmm14
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm25, %zmm6
-; AVX512BW-NEXT:    vmovdqa64 (%rcx), %xmm23
-; AVX512BW-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm15, %xmm14
-; AVX512BW-NEXT:    vmovdqa64 (%rdx), %xmm24
-; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm18
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm18, %xmm16
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
-; AVX512BW-NEXT:    vprold $16, %xmm16, %xmm16
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm16, %zmm14
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5]
-; AVX512BW-NEXT:    vmovdqu16 %zmm14, %zmm6 {%k2}
-; AVX512BW-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm26 = xmm14[2,1,2,3]
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm26 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
-; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm27, %zmm16
-; AVX512BW-NEXT:    vmovdqu16 %zmm16, %zmm6 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm16
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm28 = xmm16[2,1,2,3]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT:    vpermt2w %zmm28, %zmm27, %zmm26
-; AVX512BW-NEXT:    movabsq $585610922974906400, %rcx # imm = 0x820820820820820
-; AVX512BW-NEXT:    kmovq %rcx, %k3
-; AVX512BW-NEXT:    vmovdqu8 %zmm26, %zmm6 {%k3}
-; AVX512BW-NEXT:    vpshufb %xmm20, %xmm21, %xmm26
-; AVX512BW-NEXT:    vpshufb %xmm20, %xmm22, %xmm20
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm26 = xmm20[8],xmm26[8],xmm20[9],xmm26[9],xmm20[10],xmm26[10],xmm20[11],xmm26[11],xmm20[12],xmm26[12],xmm20[13],xmm26[13],xmm20[14],xmm26[14],xmm20[15],xmm26[15]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
-; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm25, %zmm20
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm23, %xmm25
-; AVX512BW-NEXT:    vpshufb %xmm12, %xmm24, %xmm12
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm25[0],xmm12[1],xmm25[1],xmm12[2],xmm25[2],xmm12[3],xmm25[3],xmm12[4],xmm25[4],xmm12[5],xmm25[5],xmm12[6],xmm25[6],xmm12[7],xmm25[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7]
-; AVX512BW-NEXT:    vprold $16, %xmm25, %xmm25
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm25, %zmm12
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5]
-; AVX512BW-NEXT:    vmovdqu16 %zmm12, %zmm20 {%k2}
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm12 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm13[2,1,2,3]
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm25 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero
-; AVX512BW-NEXT:    vpermt2w %zmm25, %zmm27, %zmm12
-; AVX512BW-NEXT:    vmovdqu16 %zmm12, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm11[2,1,2,3]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm25[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT:    vpermt2w %zmm25, %zmm27, %zmm12
-; AVX512BW-NEXT:    vmovdqu8 %zmm12, %zmm20 {%k3}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm19[0],ymm17[0],ymm19[1],ymm17[1],ymm19[2],ymm17[2],ymm19[3],ymm17[3],ymm19[4],ymm17[4],ymm19[5],ymm17[5],ymm19[6],ymm17[6],ymm19[7],ymm17[7],ymm19[16],ymm17[16],ymm19[17],ymm17[17],ymm19[18],ymm17[18],ymm19[19],ymm17[19],ymm19[20],ymm17[20],ymm19[21],ymm17[21],ymm19[22],ymm17[22],ymm19[23],ymm17[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm17
+; AVX512BW-NEXT:    vmovdqu16 %zmm20, %zmm15 {%k2}
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm15, %ymm19
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512BW-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpshufb %ymm20, %ymm5, %ymm22
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3]
+; AVX512BW-NEXT:    movw $18724, %r10w # imm = 0x4924
+; AVX512BW-NEXT:    kmovd %r10d, %k3
+; AVX512BW-NEXT:    vmovdqu16 %ymm22, %ymm19 {%k3}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512BW-NEXT:    vpermt2w %ymm22, %ymm23, %ymm15
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm15, %zmm15
+; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512BW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpshufb %ymm19, %ymm8, %ymm22
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
+; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm25, %zmm24
+; AVX512BW-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512BW-NEXT:    kmovq %r10, %k4
+; AVX512BW-NEXT:    vmovdqu8 %zmm24, %zmm15 {%k4}
+; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %ymm22
+; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %ymm24
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
+; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm21, %zmm10
+; AVX512BW-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %ymm21
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm18, %zmm4
+; AVX512BW-NEXT:    vmovdqu16 %zmm10, %zmm4 {%k2}
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm4, %ymm6
+; AVX512BW-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512BW-NEXT:    vpshufb %ymm20, %ymm10, %ymm10
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k3}
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpermt2w %ymm9, %ymm23, %ymm4
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512BW-NEXT:    vmovdqa 32(%r9), %ymm6
+; AVX512BW-NEXT:    vpshufb %ymm19, %ymm6, %ymm6
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-NEXT:    vpermt2w %zmm6, %zmm25, %zmm9
+; AVX512BW-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k4}
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm11, %ymm9
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm21, %ymm10
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512BW-NEXT:    vpermt2w %zmm9, %zmm12, %zmm10
-; AVX512BW-NEXT:    vmovdqu16 %zmm17, %zmm10 {%k1}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm9, %ymm13, %ymm17
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm21, %zmm13
-; AVX512BW-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512BW-NEXT:    kmovd %ecx, %k2
-; AVX512BW-NEXT:    vmovdqu16 %zmm13, %zmm10 {%k2}
-; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512BW-NEXT:    vpshufb %ymm13, %ymm11, %ymm17
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm21, %zmm11
-; AVX512BW-NEXT:    movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512BW-NEXT:    vpshufb %ymm0, %ymm22, %ymm10
+; AVX512BW-NEXT:    vpshufb %ymm0, %ymm24, %ymm12
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512BW-NEXT:    vpermw %ymm12, %ymm18, %ymm12
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512BW-NEXT:    vmovdqu16 %zmm9, %zmm10 {%k2}
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
+; AVX512BW-NEXT:    vpshufb %zmm9, %zmm2, %zmm2
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    vmovdqu16 %zmm2, %zmm10 {%k1}
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
 ; AVX512BW-NEXT:    kmovq %rcx, %k3
-; AVX512BW-NEXT:    vmovdqu8 %zmm11, %zmm10 {%k3}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
-; AVX512BW-NEXT:    vpermt2w %zmm3, %zmm19, %zmm4
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm10 {%k3}
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm16, %ymm1
+; AVX512BW-NEXT:    vpshufb %ymm6, %ymm17, %ymm2
 ; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm2
-; AVX512BW-NEXT:    vmovdqu16 %zmm4, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512BW-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm21, %zmm3
-; AVX512BW-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k2}
-; AVX512BW-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512BW-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm21, %zmm3
-; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k3}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
+; AVX512BW-NEXT:    vpermw %ymm2, %ymm11, %ymm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufb %ymm0, %ymm13, %ymm2
+; AVX512BW-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31]
+; AVX512BW-NEXT:    vpermw %ymm2, %ymm18, %ymm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512BW-NEXT:    vpshufb %zmm9, %zmm5, %zmm1
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BW-FCP-LABEL: store_i8_stride6_vf64:
 ; AVX512BW-FCP:       # %bb.0:
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm2
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm3, %ymm0, %ymm3
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm5
-; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rcx), %xmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm16
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm24
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm26
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm19
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm16, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm20, %xmm13
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm12, %zmm18, %zmm13
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm15, %xmm12
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm19, %xmm14
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm14, %zmm25, %zmm12
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm28
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm28, %xmm17
-; AVX512BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm14 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm30 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm17, %zmm30, %zmm14
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm29
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm21, %xmm17
-; AVX512BW-FCP-NEXT:    vpshufb %xmm2, %xmm22, %xmm31
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm17[0],xmm31[1],xmm17[1],xmm31[2],xmm17[2],xmm31[3],xmm17[3],xmm31[4],xmm17[4],xmm31[5],xmm17[5],xmm31[6],xmm17[6],xmm31[7],xmm17[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm18, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm24, %xmm18
-; AVX512BW-FCP-NEXT:    vpshufb %xmm23, %xmm26, %xmm23
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm23, %zmm25, %zmm18
-; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm29, %xmm25
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm29[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm25, %zmm30, %zmm23
-; AVX512BW-FCP-NEXT:    vpshufb %xmm27, %xmm8, %xmm27
-; AVX512BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm25 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm27, %zmm30, %zmm25
-; AVX512BW-FCP-NEXT:    vpshufb %xmm31, %xmm9, %xmm31
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm30, %zmm27
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %ymm30
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm7[0],ymm30[1],ymm7[1],ymm30[2],ymm7[2],ymm30[3],ymm7[3],ymm30[4],ymm7[4],ymm30[5],ymm7[5],ymm30[6],ymm7[6],ymm30[7],ymm7[7],ymm30[16],ymm7[16],ymm30[17],ymm7[17],ymm30[18],ymm7[18],ymm30[19],ymm7[19],ymm30[20],ymm7[20],ymm30[21],ymm7[21],ymm30[22],ymm7[22],ymm30[23],ymm7[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm0, %zmm22
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm31 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm24, %zmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm26
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm4[0],ymm26[1],ymm4[1],ymm26[2],ymm4[2],ymm26[3],ymm4[3],ymm26[4],ymm4[4],ymm26[5],ymm4[5],ymm26[6],ymm4[6],ymm26[7],ymm4[7],ymm26[16],ymm4[16],ymm26[17],ymm4[17],ymm26[18],ymm4[18],ymm26[19],ymm4[19],ymm26[20],ymm4[20],ymm26[21],ymm4[21],ymm26[22],ymm4[22],ymm26[23],ymm4[23]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm20, %zmm0, %zmm16
-; AVX512BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm31
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm31, %zmm0, %zmm20
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15]
+; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm15
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm16
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm4
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm17
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm3
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm11
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm13, %zmm12
+; AVX512BW-FCP-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k1
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512BW-FCP-NEXT:    vmovdqa %ymm12, %ymm4
+; AVX512BW-FCP-NEXT:    vpermt2w %ymm3, %ymm14, %ymm4
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm11, %xmm3
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm19, %zmm12
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm12, %xmm4
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm23, %zmm22
+; AVX512BW-FCP-NEXT:    movabsq $585610922974906400, %r10 # imm = 0x820820820820820
+; AVX512BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
+; AVX512BW-FCP-NEXT:    vpshufb %xmm0, %xmm16, %xmm22
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm5, %zmm22
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm17, %xmm4
+; AVX512BW-FCP-NEXT:    vpshufb %xmm7, %xmm18, %xmm5
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm13, %zmm5
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm5 {%k1}
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512BW-FCP-NEXT:    vpermi2w %ymm7, %ymm5, %ymm14
+; AVX512BW-FCP-NEXT:    vpshufb %xmm20, %xmm4, %xmm7
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm7, %zmm19, %zmm5
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm14, %zmm5
+; AVX512BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm21, %xmm7, %xmm13
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm23, %zmm14
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm5 {%k2}
+; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %ymm13
+; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %ymm14
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm19, %zmm21, %zmm20
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %ymm15
+; AVX512BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm16
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm19, %zmm18, %zmm17
+; AVX512BW-FCP-NEXT:    movl $613566756, %r10d # imm = 0x24924924
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm17 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm17, %ymm19
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512BW-FCP-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm4, %ymm22
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3]
+; AVX512BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
+; AVX512BW-FCP-NEXT:    kmovd %r10d, %k3
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm22, %ymm19 {%k3}
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512BW-FCP-NEXT:    vpermt2w %ymm22, %ymm23, %ymm17
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm17, %zmm17
 ; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
 ; AVX512BW-FCP-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm24, %zmm10
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm11
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm11, %zmm0, %zmm15
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm11
-; AVX512BW-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm1, %zmm0, %zmm11
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-FCP-NEXT:    vpermt2w %zmm1, %zmm0, %zmm19
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm26, %ymm1
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm26[8],ymm4[8],ymm26[9],ymm4[9],ymm26[10],ymm4[10],ymm26[11],ymm4[11],ymm26[12],ymm4[12],ymm26[13],ymm4[13],ymm26[14],ymm4[14],ymm26[15],ymm4[15],ymm26[24],ymm4[24],ymm26[25],ymm4[25],ymm26[26],ymm4[26],ymm26[27],ymm4[27],ymm26[28],ymm4[28],ymm26[29],ymm4[29],ymm26[30],ymm4[30],ymm26[31],ymm4[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm24 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm1, %ymm24, %ymm1
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512BW-FCP-NEXT:    movl $613566756, %eax # imm = 0x24924924
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm3, %zmm4 {%k1}
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
-; AVX512BW-FCP-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512BW-FCP-NEXT:    movl $-1840700270, %eax # imm = 0x92492492
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k2
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm4 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
-; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512BW-FCP-NEXT:    movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k3}
-; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512BW-FCP-NEXT:    vpermw %ymm3, %ymm5, %ymm3
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm22
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm22, %zmm25, %zmm24
+; AVX512BW-FCP-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512BW-FCP-NEXT:    kmovq %r10, %k4
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm24, %zmm17 {%k4}
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm24
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm26, %zmm21, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm21
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm26, %zmm18, %zmm9
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm6, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm6
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm20, %ymm10, %ymm10
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k3}
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-FCP-NEXT:    vpermt2w %ymm10, %ymm23, %ymm9
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm9, %zmm6
+; AVX512BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
+; AVX512BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm9, %zmm25, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm6 {%k4}
+; AVX512BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm21, %ymm11
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512BW-FCP-NEXT:    vpermw %ymm8, %ymm11, %ymm8
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm22, %ymm10
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm24, %ymm12
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512BW-FCP-NEXT:    vpermw %ymm12, %ymm18, %ymm12
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm10 {%k2}
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
+; AVX512BW-FCP-NEXT:    vpshufb %zmm8, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-FCP-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
+; AVX512BW-FCP-NEXT:    kmovq %rcx, %k3
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm10 {%k3}
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm9, %ymm16, %ymm2
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31]
+; AVX512BW-FCP-NEXT:    vpermw %ymm2, %ymm11, %ymm2
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
 ; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm7, %ymm3
-; AVX512BW-FCP-NEXT:    vpshufb %ymm2, %ymm30, %ymm2
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm30[8],ymm7[8],ymm30[9],ymm7[9],ymm30[10],ymm7[10],ymm30[11],ymm7[11],ymm30[12],ymm7[12],ymm30[13],ymm7[13],ymm30[14],ymm7[14],ymm30[15],ymm7[15],ymm30[24],ymm7[24],ymm30[25],ymm7[25],ymm30[26],ymm7[26],ymm30[27],ymm7[27],ymm30[28],ymm7[28],ymm30[29],ymm7[29],ymm30[30],ymm7[30],ymm30[31],ymm7[31]
-; AVX512BW-FCP-NEXT:    vpermw %ymm3, %ymm24, %ymm3
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT:    vpshufb %zmm1, %zmm8, %zmm0
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k2}
-; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k3}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm21 {%k1}
-; AVX512BW-FCP-NEXT:    movl $1227133513, %eax # imm = 0x49249249
-; AVX512BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm21 {%k3}
-; AVX512BW-FCP-NEXT:    movabsq $2342443691899625602, %rax # imm = 0x2082082082082082
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k4
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm21 {%k4}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT:    movabsq $585610922974906400, %rax # imm = 0x820820820820820
-; AVX512BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm23, %zmm12 {%k5}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm18 {%k2}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm18 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm27, %zmm18 {%k5}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm10 {%k3}
-; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm10 {%k4}
-; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 256(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm12, 192(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31]
+; AVX512BW-FCP-NEXT:    vpermw %ymm2, %ymm18, %ymm2
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512BW-FCP-NEXT:    vpshufb %zmm8, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
 ; AVX512DQ-BW-LABEL: store_i8_stride6_vf64:
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm1, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm2, %ymm3
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm3, %ymm8, %ymm3
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm9
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %ymm3
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm3, %ymm0
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %ymm4
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm4, %ymm10
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512DQ-BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
-; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
-; AVX512DQ-BW-NEXT:    vpshufb %zmm14, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm15
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm4, %xmm0
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm6, %xmm3
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm0, %zmm7, %zmm3
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %xmm16
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %xmm10
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm10, %xmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %xmm17
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdx), %xmm11
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm11, %xmm9
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX512DQ-BW-NEXT:    vprold $16, %xmm9, %xmm9
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm9, %zmm8
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5]
 ; AVX512DQ-BW-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512DQ-BW-NEXT:    kmovd %r10d, %k1
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm8, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %xmm9
+; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm3, %ymm12
+; AVX512DQ-BW-NEXT:    vpermt2w %ymm8, %ymm13, %ymm12
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm14, %zmm8, %zmm3
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm3, %zmm12, %zmm3
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm19, %zmm20, %zmm14
+; AVX512DQ-BW-NEXT:    movabsq $585610922974906400, %r10 # imm = 0x820820820820820
+; AVX512DQ-BW-NEXT:    kmovq %r10, %k2
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm14, %zmm3 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm15, %xmm14
+; AVX512DQ-BW-NEXT:    vpshufb %xmm5, %xmm18, %xmm5
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm5, %zmm7, %zmm14
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm16, %xmm5
+; AVX512DQ-BW-NEXT:    vpshufb %xmm0, %xmm17, %xmm7
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512DQ-BW-NEXT:    vprold $16, %xmm7, %xmm7
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm5, %zmm14 {%k1}
+; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX512DQ-BW-NEXT:    vpermi2w %ymm7, %ymm14, %ymm13
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3]
+; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm7, %zmm8, %zmm14
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm13, %zmm7
+; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm14, %zmm20, %zmm13
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm13, %zmm7 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %ymm13
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %ymm14
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm19, %zmm21, %zmm20
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %ymm16
+; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %ymm17
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm19, %zmm18, %zmm15
+; AVX512DQ-BW-NEXT:    movl $613566756, %r10d # imm = 0x24924924
 ; AVX512DQ-BW-NEXT:    kmovd %r10d, %k2
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm5, %zmm0 {%k2}
-; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm5 = zmm5[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-NEXT:    movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208
-; AVX512DQ-BW-NEXT:    kmovq %r10, %k3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm5, %zmm0 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %ymm9
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm9, %ymm5
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %ymm10
-; AVX512DQ-BW-NEXT:    vpshufb %ymm7, %ymm10, %ymm6
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31]
-; AVX512DQ-BW-NEXT:    vpermw %ymm6, %ymm8, %ymm6
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %ymm17
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm17, %ymm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %ymm19
-; AVX512DQ-BW-NEXT:    vpshufb %ymm12, %ymm19, %ymm7
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm19[8],ymm17[8],ymm19[9],ymm17[9],ymm19[10],ymm17[10],ymm19[11],ymm17[11],ymm19[12],ymm17[12],ymm19[13],ymm17[13],ymm19[14],ymm17[14],ymm19[15],ymm17[15],ymm19[24],ymm17[24],ymm19[25],ymm17[25],ymm19[26],ymm17[26],ymm19[27],ymm17[27],ymm19[28],ymm17[28],ymm19[29],ymm17[29],ymm19[30],ymm17[30],ymm19[31],ymm17[31]
-; AVX512DQ-BW-NEXT:    vpermw %ymm7, %ymm11, %ymm7
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm6, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpshufb %zmm14, %zmm13, %zmm6
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm6, %zmm5 {%k2}
-; AVX512DQ-BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm6 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm6, %zmm5 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rsi), %xmm21
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %xmm7
-; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm7, %xmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdi), %xmm22
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rdi), %xmm8
-; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm8, %xmm14
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm14, %zmm25, %zmm6
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rcx), %xmm23
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%rcx), %xmm15
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm15, %xmm14
-; AVX512DQ-BW-NEXT:    vmovdqa64 (%rdx), %xmm24
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm18
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm18, %xmm16
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7]
-; AVX512DQ-BW-NEXT:    vprold $16, %xmm16, %xmm16
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm14, %zmm16, %zmm14
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm14, %zmm6 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %xmm14
-; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm26 = xmm14[2,1,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm26 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm27, %zmm16
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm16, %zmm6 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm16
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm28 = xmm16[2,1,2,3]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm28, %zmm27, %zmm26
-; AVX512DQ-BW-NEXT:    movabsq $585610922974906400, %rcx # imm = 0x820820820820820
-; AVX512DQ-BW-NEXT:    kmovq %rcx, %k3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm26, %zmm6 {%k3}
-; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm21, %xmm26
-; AVX512DQ-BW-NEXT:    vpshufb %xmm20, %xmm22, %xmm20
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm26 = xmm20[8],xmm26[8],xmm20[9],xmm26[9],xmm20[10],xmm26[10],xmm20[11],xmm26[11],xmm20[12],xmm26[12],xmm20[13],xmm26[13],xmm20[14],xmm26[14],xmm20[15],xmm26[15]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm25, %zmm20
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm23, %xmm25
-; AVX512DQ-BW-NEXT:    vpshufb %xmm12, %xmm24, %xmm12
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm25[0],xmm12[1],xmm25[1],xmm12[2],xmm25[2],xmm12[3],xmm25[3],xmm12[4],xmm25[4],xmm12[5],xmm25[5],xmm12[6],xmm25[6],xmm12[7],xmm25[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7]
-; AVX512DQ-BW-NEXT:    vprold $16, %xmm25, %xmm25
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm25, %zmm12
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5]
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm12, %zmm20 {%k2}
-; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm12 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm13[2,1,2,3]
-; AVX512DQ-BW-NEXT:    vpmovzxbw {{.*#+}} xmm25 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm25, %zmm27, %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm12, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpshufd {{.*#+}} xmm25 = xmm11[2,1,2,3]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm25[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm25, %zmm27, %zmm12
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm12, %zmm20 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm12 = ymm19[0],ymm17[0],ymm19[1],ymm17[1],ymm19[2],ymm17[2],ymm19[3],ymm17[3],ymm19[4],ymm17[4],ymm19[5],ymm17[5],ymm19[6],ymm17[6],ymm19[7],ymm17[7],ymm19[16],ymm17[16],ymm19[17],ymm17[17],ymm19[18],ymm17[18],ymm19[19],ymm17[19],ymm19[20],ymm17[20],ymm19[21],ymm17[21],ymm19[22],ymm17[22],ymm19[23],ymm17[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm12, %zmm19, %zmm17
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm20, %zmm15 {%k2}
+; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm15, %ymm19
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512DQ-BW-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm5, %ymm22
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3]
+; AVX512DQ-BW-NEXT:    movw $18724, %r10w # imm = 0x4924
+; AVX512DQ-BW-NEXT:    kmovd %r10d, %k3
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm22, %ymm19 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512DQ-BW-NEXT:    vpermt2w %ymm22, %ymm23, %ymm15
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm19, %zmm15, %zmm15
+; AVX512DQ-BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
+; AVX512DQ-BW-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm19, %ymm8, %ymm22
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm22, %zmm25, %zmm24
+; AVX512DQ-BW-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512DQ-BW-NEXT:    kmovq %r10, %k4
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm24, %zmm15 {%k4}
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %ymm22
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %ymm24
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm21, %zmm10
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%rsi), %ymm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %ymm21
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm18, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm10, %zmm4 {%k2}
+; AVX512DQ-BW-NEXT:    vextracti64x4 $1, %zmm4, %ymm6
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512DQ-BW-NEXT:    vpshufb %ymm20, %ymm10, %ymm10
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k3}
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-NEXT:    vpermt2w %ymm9, %ymm23, %ymm4
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqa 32(%r9), %ymm6
+; AVX512DQ-BW-NEXT:    vpshufb %ymm19, %ymm6, %ymm6
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm6, %zmm25, %zmm9
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm9, %zmm4 {%k4}
+; AVX512DQ-BW-NEXT:    vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm11, %ymm9
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm21, %ymm10
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm9, %zmm12, %zmm10
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm17, %zmm10 {%k1}
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512DQ-BW-NEXT:    # ymm9 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm13, %ymm17
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm17, %zmm21, %zmm13
-; AVX512DQ-BW-NEXT:    movl $1227133513, %ecx # imm = 0x49249249
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k2
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm13, %zmm10 {%k2}
-; AVX512DQ-BW-NEXT:    vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
-; AVX512DQ-BW-NEXT:    # ymm13 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm11, %ymm17
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm17, %zmm21, %zmm11
-; AVX512DQ-BW-NEXT:    movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512DQ-BW-NEXT:    vpermw %ymm10, %ymm11, %ymm10
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm22, %ymm10
+; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm24, %ymm12
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512DQ-BW-NEXT:    vpermw %ymm12, %ymm18, %ymm12
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm9, %zmm10 {%k2}
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
+; AVX512DQ-BW-NEXT:    vpshufb %zmm9, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm2, %zmm10 {%k1}
+; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
 ; AVX512DQ-BW-NEXT:    kmovq %rcx, %k3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm11, %zmm10 {%k3}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm3, %zmm19, %zmm4
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm10 {%k3}
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm16, %ymm1
+; AVX512DQ-BW-NEXT:    vpshufb %ymm6, %ymm17, %ymm2
 ; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm2
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm4, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%r8), %ymm1
-; AVX512DQ-BW-NEXT:    vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm21, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm3, %zmm2 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512DQ-BW-NEXT:    vpshufb %ymm13, %ymm1, %ymm1
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm21, %zmm3
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k3}
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 256(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 64(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm6, 192(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 320(%rax)
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31]
+; AVX512DQ-BW-NEXT:    vpermw %ymm2, %ymm11, %ymm2
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm13, %ymm2
+; AVX512DQ-BW-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31]
+; AVX512DQ-BW-NEXT:    vpermw %ymm2, %ymm18, %ymm2
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512DQ-BW-NEXT:    vpshufb %zmm9, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, 256(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm15, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride6_vf64:
 ; AVX512DQ-BW-FCP:       # %bb.0:
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm10
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm10, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm11, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm3, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm7
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rcx), %xmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm16
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm24
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm26
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm16, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm20, %xmm13
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm12, %zmm18, %zmm13
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm15, %xmm12
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm19, %xmm14
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm14, %zmm25, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm28
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm28, %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm14 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm30 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm17, %zmm30, %zmm14
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm29
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm21, %xmm17
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm2, %xmm22, %xmm31
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm17[0],xmm31[1],xmm17[1],xmm31[2],xmm17[2],xmm31[3],xmm17[3],xmm31[4],xmm17[4],xmm31[5],xmm17[5],xmm31[6],xmm17[6],xmm31[7],xmm17[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm18, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm24, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm23, %xmm26, %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm23, %zmm25, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm29, %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm29[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm25, %zmm30, %zmm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm27, %xmm8, %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm25 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm27, %zmm30, %zmm25
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm31, %xmm9, %xmm31
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm30, %zmm27
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %ymm30
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm7[0],ymm30[1],ymm7[1],ymm30[2],ymm7[2],ymm30[3],ymm7[3],ymm30[4],ymm7[4],ymm30[5],ymm7[5],ymm30[6],ymm7[6],ymm30[7],ymm7[7],ymm30[16],ymm7[16],ymm30[17],ymm7[17],ymm30[18],ymm7[18],ymm30[19],ymm7[19],ymm30[20],ymm7[20],ymm30[21],ymm7[21],ymm30[22],ymm7[22],ymm30[23],ymm7[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm0, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm31 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm24, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm26
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm4[0],ymm26[1],ymm4[1],ymm26[2],ymm4[2],ymm26[3],ymm4[3],ymm26[4],ymm4[4],ymm26[5],ymm4[5],ymm26[6],ymm4[6],ymm26[7],ymm4[7],ymm26[16],ymm4[16],ymm26[17],ymm4[17],ymm26[18],ymm4[18],ymm26[19],ymm4[19],ymm26[20],ymm4[20],ymm26[21],ymm4[21],ymm26[22],ymm4[22],ymm26[23],ymm4[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm20, %zmm0, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
-; AVX512DQ-BW-FCP-NEXT:    # ymm1 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm8, %ymm31
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm31, %zmm0, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15]
+; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rcx), %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm6, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdx), %xmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdx), %xmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm8, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rsi), %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm9, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %xmm18
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rdi), %xmm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm10, %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm13, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    movl $-1840700270, %r10d # imm = 0x92492492
+; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm4, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm11
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm12, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %ymm3, %ymm14, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm11, %xmm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm19, %zmm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm12, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm23, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    movabsq $585610922974906400, %r10 # imm = 0x820820820820820
+; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm22, %zmm3 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm15, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm0, %xmm16, %xmm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm5, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm17, %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm7, %xmm18, %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm13, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX512DQ-BW-FCP-NEXT:    vpermi2w %ymm7, %ymm5, %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm20, %xmm4, %xmm7
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm7, %zmm19, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm5, %zmm14, %zmm5
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm21, %xmm7, %xmm13
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm23, %zmm14
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm14, %zmm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %ymm14
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm19, %zmm21, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%rdi), %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm19, %zmm18, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    movl $613566756, %r10d # imm = 0x24924924
+; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm17 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm17, %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0]
+; AVX512DQ-BW-FCP-NEXT:    # ymm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm4, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    movw $18724, %r10w # imm = 0x4924
+; AVX512DQ-BW-FCP-NEXT:    kmovd %r10d, %k3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm22, %ymm19 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %ymm22, %ymm23, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm17, %zmm17
 ; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0]
 ; AVX512DQ-BW-FCP-NEXT:    # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm24, %zmm10
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm11, %zmm0, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm11
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm1, %ymm11, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm1, %zmm0, %zmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm1, %zmm0, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm4, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm26, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm26[8],ymm4[8],ymm26[9],ymm4[9],ymm26[10],ymm4[10],ymm26[11],ymm4[11],ymm26[12],ymm4[12],ymm26[13],ymm4[13],ymm26[14],ymm4[14],ymm26[15],ymm4[15],ymm26[24],ymm4[24],ymm26[25],ymm4[25],ymm26[26],ymm4[26],ymm26[27],ymm4[27],ymm26[28],ymm4[28],ymm26[29],ymm4[29],ymm26[30],ymm4[30],ymm26[31],ymm4[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm24 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm1, %ymm24, %ymm1
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r8), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    movl $613566756, %eax # imm = 0x24924924
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm3, %zmm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    movl $-1840700270, %eax # imm = 0x92492492
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm4 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 (%r9), %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm4 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm5, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm3, %ymm6, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm5 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm3, %ymm5, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm7, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm22, %zmm25, %zmm24
+; AVX512DQ-BW-FCP-NEXT:    movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082
+; AVX512DQ-BW-FCP-NEXT:    kmovq %r10, %k4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm24, %zmm17 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm26, %zmm21, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm26, %zmm18, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm6, %zmm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vextracti64x4 $1, %zmm9, %ymm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm20, %ymm10, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %ymm10, %ymm6 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %ymm10, %ymm23, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm6, %zmm9, %zmm6
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r9), %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm19, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm9, %zmm25, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm10, %zmm6 {%k4}
+; AVX512DQ-BW-FCP-NEXT:    vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm8, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm21, %ymm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm8, %ymm11, %ymm8
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm8, %zmm10, %zmm8
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm22, %ymm10
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm24, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm12, %ymm18, %ymm12
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm12, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm8, %zmm10 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm8, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm2, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-FCP-NEXT:    movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208
+; AVX512DQ-BW-FCP-NEXT:    kmovq %rcx, %k3
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm10 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm15, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm9, %ymm16, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm2, %ymm11, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm13, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm0, %ymm14, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
 ; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm7, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm2, %ymm30, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm30[8],ymm7[8],ymm30[9],ymm7[9],ymm30[10],ymm7[10],ymm30[11],ymm7[11],ymm30[12],ymm7[12],ymm30[13],ymm7[13],ymm30[14],ymm7[14],ymm30[15],ymm7[15],ymm30[24],ymm7[24],ymm30[25],ymm7[25],ymm30[26],ymm7[26],ymm30[27],ymm7[27],ymm30[28],ymm7[28],ymm30[29],ymm7[29],ymm30[30],ymm7[30],ymm30[31],ymm7[31]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm3, %ymm24, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm1, %zmm8, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm0 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
-; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm0, %zmm2 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm22, %zmm21 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    movl $1227133513, %eax # imm = 0x49249249
-; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm20, %zmm21 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    movabsq $2342443691899625602, %rax # imm = 0x2082082082082082
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm15, %zmm21 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm13, %zmm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm14, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    movabsq $585610922974906400, %rax # imm = 0x820820820820820
-; AVX512DQ-BW-FCP-NEXT:    kmovq %rax, %k5
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm23, %zmm12 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm17, %zmm18 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm25, %zmm18 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm27, %zmm18 {%k5}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm16, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm11, %zmm10 {%k3}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm19, %zmm10 {%k4}
-; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm12, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31]
+; AVX512DQ-BW-FCP-NEXT:    vpermw %ymm2, %ymm18, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm8, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63]
+; AVX512DQ-BW-FCP-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k3}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm10, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm6, 256(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm3, 192(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index f4055a953badd..25e489eef9d11 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -925,16 +925,14 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm5, %ymm6, %ymm5
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
 ; AVX2-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, 48(%rax)
@@ -967,16 +965,14 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm5, %ymm6, %ymm5
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT:    vpblendvb %ymm6, %ymm3, %ymm5, %ymm3
+; AVX2-FP-NEXT:    vpor %ymm3, %ymm5, %ymm3
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28]
 ; AVX2-FP-NEXT:    vpor %ymm2, %ymm4, %ymm2
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm0, %ymm1, %ymm0
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vpor %ymm0, %ymm2, %ymm0
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX2-FP-NEXT:    vextracti128 $1, %ymm3, %xmm0
 ; AVX2-FP-NEXT:    vmovq %xmm0, 48(%rax)
@@ -1205,24 +1201,21 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vporq %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT:    movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
-; AVX512BW-NEXT:    kmovq %rcx, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, 32(%rax)
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, 48(%rax)
-; AVX512BW-NEXT:    vmovdqa %ymm1, (%rax)
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm2
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
+; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT:    vmovq %xmm1, 48(%rax)
+; AVX512BW-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -1283,24 +1276,21 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512DQ-BW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vporq %zmm1, %zmm2, %zmm1
-; AVX512DQ-BW-NEXT:    movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870
-; AVX512DQ-BW-NEXT:    kmovq %rcx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT:    vextracti32x4 $2, %zmm1, 32(%rax)
-; AVX512DQ-BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
-; AVX512DQ-BW-NEXT:    vmovq %xmm0, 48(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa %ymm1, (%rax)
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-BW-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm2
+; AVX512DQ-BW-NEXT:    vextracti32x4 $2, %zmm0, 32(%rax)
+; AVX512DQ-BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512DQ-BW-NEXT:    vmovq %xmm1, 48(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa %ymm0, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -1824,8 +1814,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
-; AVX2-NEXT:    vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-NEXT:    vpor %ymm5, %ymm7, %ymm5
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
 ; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero
@@ -1903,8 +1892,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm7, %ymm5, %ymm5
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
+; AVX2-FP-NEXT:    vpor %ymm4, %ymm5, %ymm5
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
 ; AVX2-FP-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero
@@ -2323,19 +2311,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpor %ymm5, %ymm6, %ymm5
 ; AVX512BW-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
 ; AVX512BW-NEXT:    vpor %ymm7, %ymm6, %ymm6
-; AVX512BW-NEXT:    movl $202911840, %ecx # imm = 0xC183060
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vmovdqu8 %ymm5, %ymm6 {%k1}
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2]
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28]
-; AVX512BW-NEXT:    vpor %ymm5, %ymm7, %ymm5
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
+; AVX512BW-NEXT:    vporq %zmm5, %zmm6, %zmm5
 ; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm6
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
@@ -2445,12 +2431,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
 ; AVX512BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15]
-; AVX512BW-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512BW-FCP-NEXT:    movw $-7741, %cx # imm = 0xE1C3
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512BW-FCP-NEXT:    vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
 ; AVX512BW-FCP-NEXT:    vmovdqa %ymm5, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, 96(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa %xmm2, 96(%rax)
 ; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
@@ -2470,19 +2453,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpor %ymm5, %ymm6, %ymm5
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2]
+; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25]
 ; AVX512DQ-BW-NEXT:    vpor %ymm7, %ymm6, %ymm6
-; AVX512DQ-BW-NEXT:    movl $202911840, %ecx # imm = 0xC183060
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vmovdqu8 %ymm5, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2]
-; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2]
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28]
-; AVX512DQ-BW-NEXT:    vpor %ymm5, %ymm7, %ymm5
-; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQ-BW-NEXT:    vinserti64x4 $1, %ymm6, %zmm7, %zmm6
+; AVX512DQ-BW-NEXT:    vporq %zmm5, %zmm6, %zmm5
 ; AVX512DQ-BW-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm6
 ; AVX512DQ-BW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
@@ -2592,12 +2573,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15]
-; AVX512DQ-BW-FCP-NEXT:    vpor %xmm1, %xmm2, %xmm1
-; AVX512DQ-BW-FCP-NEXT:    movw $-7741, %cx # imm = 0xE1C3
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa %ymm5, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, 96(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm2, 96(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm7, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
@@ -3598,24 +3576,24 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm4
-; AVX2-NEXT:    vmovdqa (%rsi), %ymm6
-; AVX2-NEXT:    vmovdqa (%rdx), %ymm3
-; AVX2-NEXT:    vmovdqa (%rcx), %ymm5
-; AVX2-NEXT:    vmovdqa (%r8), %ymm7
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm7
 ; AVX2-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vmovdqa (%r9), %ymm2
-; AVX2-NEXT:    vmovdqa (%rax), %ymm1
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX2-NEXT:    vmovdqa (%rdx), %ymm1
 ; AVX2-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX2-NEXT:    vmovdqa (%rcx), %ymm2
+; AVX2-NEXT:    vmovdqa (%r8), %ymm5
+; AVX2-NEXT:    vmovdqa (%r9), %ymm6
+; AVX2-NEXT:    vmovdqa (%rax), %ymm4
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0]
 ; AVX2-NEXT:    # ymm9 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
-; AVX2-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u]
+; AVX2-NEXT:    vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0]
 ; AVX2-NEXT:    # ymm10 = mem[0,1,0,1]
@@ -3623,13 +3601,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,ymm5[27,28,29,30],zero,ymm5[28],zero,ymm5[26,27,30,31],zero,ymm5[29]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero
 ; AVX2-NEXT:    vpor %ymm8, %ymm9, %ymm8
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u]
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
@@ -3698,68 +3676,67 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm10
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
 ; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0]
 ; AVX2-NEXT:    # ymm7 = mem[0,1,0,1]
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20]
+; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero
 ; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero
 ; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} ymm7 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
 ; AVX2-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero
 ; AVX2-NEXT:    vpor %ymm1, %ymm7, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27]
 ; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero
 ; AVX2-NEXT:    vpor %ymm7, %ymm8, %ymm7
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
 ; AVX2-NEXT:    vpblendvb %ymm11, %ymm7, %ymm8, %ymm7
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-NEXT:    vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero
-; AVX2-NEXT:    vpor %ymm6, %ymm4, %ymm4
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpor %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero
-; AVX2-NEXT:    vpor %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
-; AVX2-NEXT:    vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX2-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,ymm5[1,2,3,0,1,14],zero,ymm5[0,1,0,1,14,15],zero,ymm5[15,16,17,18,19,16],zero,ymm5[30,31,16,17,16,17],zero,ymm5[31,30,31]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero
+; AVX2-NEXT:    vpor %ymm5, %ymm6, %ymm5
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
+; AVX2-NEXT:    vpblendvb %ymm6, %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18],zero
+; AVX2-NEXT:    vpor %ymm3, %ymm5, %ymm3
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpor %ymm2, %ymm5, %ymm2
+; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
+; AVX2-NEXT:    vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    vmovdqa %ymm2, 96(%rax)
 ; AVX2-NEXT:    vmovdqa %ymm1, 160(%rax)
@@ -3905,22 +3882,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FP-NEXT:    vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
 ; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
 ; AVX2-FP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
+; AVX2-FP-NEXT:    vpor %ymm6, %ymm8, %ymm6
+; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
+; AVX2-FP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero
 ; AVX2-FP-NEXT:    vpor %ymm3, %ymm1, %ymm1
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero
 ; AVX2-FP-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
-; AVX2-FP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
-; AVX2-FP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX2-FP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FP-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
+; AVX2-FP-NEXT:    vpblendvb %ymm1, %ymm0, %ymm6, %ymm0
 ; AVX2-FP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FP-NEXT:    vmovdqa %ymm0, 96(%rax)
 ; AVX2-FP-NEXT:    vmovdqa %ymm12, 128(%rax)
@@ -4067,22 +4043,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
 ; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-FCP-NEXT:    vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm6 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
+; AVX2-FCP-NEXT:    vpor %ymm6, %ymm8, %ymm6
+; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
+; AVX2-FCP-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18]
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero
 ; AVX2-FCP-NEXT:    vpor %ymm3, %ymm1, %ymm1
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero
 ; AVX2-FCP-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
-; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
-; AVX2-FCP-NEXT:    vpor %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
-; AVX2-FCP-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX2-FCP-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
+; AVX2-FCP-NEXT:    vpblendvb %ymm1, %ymm0, %ymm6, %ymm0
 ; AVX2-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FCP-NEXT:    vmovdqa %ymm0, 96(%rax)
 ; AVX2-FCP-NEXT:    vmovdqa %ymm10, 128(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index a9da7abaa945c..3acc94d6e1fc4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -2071,9 +2071,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6]
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63]
 ; AVX512BW-NEXT:    vpshufb %zmm10, %zmm9, %zmm9
-; AVX512BW-NEXT:    movw $-21846, %cx # imm = 0xAAAA
-; AVX512BW-NEXT:    kmovd %ecx, %k1
-; AVX512BW-NEXT:    vpord %zmm6, %zmm9, %zmm4 {%k1}
+; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | zmm4 | zmm6
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
 ; AVX512BW-NEXT:    vpshufb %zmm5, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
@@ -2083,9 +2081,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm3[1,3,1,3,5,7,5,7]
 ; AVX512BW-NEXT:    vpshufb %zmm10, %zmm2, %zmm2
-; AVX512BW-NEXT:    vpord %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512BW-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm9, (%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -2117,23 +2115,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT:    vpermq %zmm3, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63]
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm10, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT:    movw $-21846, %cx # imm = 0xAAAA
-; AVX512BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT:    vpord %zmm7, %zmm4, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3]
-; AVX512BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT:    vpermq %zmm0, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | zmm5 | zmm7
+; AVX512BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
+; AVX512BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT:    vpermq %zmm0, %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm6, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT:    vpermq %zmm1, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vpermq %zmm1, %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
 ; AVX512BW-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT:    vpermq %zmm2, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT:    vpermq %zmm2, %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm9, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT:    vpermq %zmm3, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT:    vpermq %zmm3, %zmm5, %zmm2
 ; AVX512BW-FCP-NEXT:    vpshufb %zmm10, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vpord %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -2167,9 +2163,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6]
 ; AVX512DQ-BW-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63]
 ; AVX512DQ-BW-NEXT:    vpshufb %zmm10, %zmm9, %zmm9
-; AVX512DQ-BW-NEXT:    movw $-21846, %cx # imm = 0xAAAA
-; AVX512DQ-BW-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT:    vpord %zmm6, %zmm9, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT:    vpternlogq {{.*#+}} zmm9 = zmm9 | zmm4 | zmm6
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7]
 ; AVX512DQ-BW-NEXT:    vpshufb %zmm5, %zmm0, %zmm0
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7]
@@ -2179,9 +2173,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
 ; AVX512DQ-BW-NEXT:    vpermq {{.*#+}} zmm2 = zmm3[1,3,1,3,5,7,5,7]
 ; AVX512DQ-BW-NEXT:    vpshufb %zmm10, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT:    vpord %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-BW-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, (%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -2213,23 +2207,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm3, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63]
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm10, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    movw $-21846, %cx # imm = 0xAAAA
-; AVX512DQ-BW-FCP-NEXT:    kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpord %zmm7, %zmm4, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3]
-; AVX512DQ-BW-FCP-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm0, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vpternlogq {{.*#+}} zmm4 = zmm4 | zmm5 | zmm7
+; AVX512DQ-BW-FCP-NEXT:    vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3]
+; AVX512DQ-BW-FCP-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm0, %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm6, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm1, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm1, %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vporq %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm2, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm2, %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm9, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm3, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpermq %zmm3, %zmm5, %zmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpshufb %zmm10, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vpord %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm4, (%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
@@ -8050,128 +8042,107 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm16
-; AVX512BW-NEXT:    vmovdqa 48(%rcx), %xmm14
-; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm18
-; AVX512BW-NEXT:    vmovdqa64 48(%rdx), %xmm17
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512BW-NEXT:    vmovdqa64 48(%rsi), %xmm19
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm21
-; AVX512BW-NEXT:    vmovdqa64 48(%rdi), %xmm22
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512BW-NEXT:    movl $572662306, %r11d # imm = 0x22222222
-; AVX512BW-NEXT:    kmovd %r11d, %k1
-; AVX512BW-NEXT:    vpermw %zmm4, %zmm6, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa (%r10), %xmm4
-; AVX512BW-NEXT:    vmovdqa64 48(%r10), %xmm23
-; AVX512BW-NEXT:    vmovdqa (%rax), %xmm7
-; AVX512BW-NEXT:    vmovdqa64 48(%rax), %xmm24
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; AVX512BW-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512BW-NEXT:    vmovdqa64 48(%r9), %xmm25
-; AVX512BW-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512BW-NEXT:    vmovdqa64 48(%r8), %xmm26
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
-; AVX512BW-NEXT:    vpermw %zmm11, %zmm12, %zmm11
-; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
-; AVX512BW-NEXT:    movl $-2004318072, %r11d # imm = 0x88888888
-; AVX512BW-NEXT:    kmovd %r11d, %k2
-; AVX512BW-NEXT:    vpermw %zmm9, %zmm13, %zmm11 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm15, %zmm6, %zmm9 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512BW-NEXT:    vpermw %zmm15, %zmm12, %zmm15
-; AVX512BW-NEXT:    vpermw %zmm27, %zmm13, %zmm15 {%k2}
-; AVX512BW-NEXT:    vmovdqa64 32(%r10), %xmm27
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
-; AVX512BW-NEXT:    vmovdqa64 32(%rax), %xmm28
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15]
-; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm29
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm17, %zmm6, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm30
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512BW-NEXT:    vpermw %zmm19, %zmm12, %zmm19
-; AVX512BW-NEXT:    vpermw %zmm17, %zmm13, %zmm19 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm22, %zmm6, %zmm17 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
-; AVX512BW-NEXT:    vpermw %zmm22, %zmm12, %zmm22
-; AVX512BW-NEXT:    vpermw %zmm23, %zmm13, %zmm22 {%k2}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%rcx), %xmm23
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%rdx), %xmm21
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm18, %zmm6, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 16(%rsi), %xmm24
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%rdi), %xmm25
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
-; AVX512BW-NEXT:    vpermw %zmm18, %zmm12, %zmm18
-; AVX512BW-NEXT:    vpermw %zmm20, %zmm13, %zmm18 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm21[0],xmm23[0],xmm21[1],xmm23[1],xmm21[2],xmm23[2],xmm21[3],xmm23[3],xmm21[4],xmm23[4],xmm21[5],xmm23[5],xmm21[6],xmm23[6],xmm21[7],xmm23[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7]
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm26, %zmm6, %zmm20 {%k1}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm21[8],xmm23[8],xmm21[9],xmm23[9],xmm21[10],xmm23[10],xmm21[11],xmm23[11],xmm21[12],xmm23[12],xmm21[13],xmm23[13],xmm21[14],xmm23[14],xmm21[15],xmm23[15]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15]
-; AVX512BW-NEXT:    vmovdqa64 16(%r10), %xmm24
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero,xmm23[2],zero,zero,zero,xmm23[3],zero,zero,zero,xmm23[4],zero,zero,zero,xmm23[5],zero,zero,zero,xmm23[6],zero,zero,zero,xmm23[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm21, %zmm6, %zmm23 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 16(%rax), %xmm21
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512BW-NEXT:    vmovdqa 16(%r9), %xmm2
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512BW-NEXT:    vmovdqa 16(%r8), %xmm5
-; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm6, %zmm3 {%k1}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm21[0],xmm24[0],xmm21[1],xmm24[1],xmm21[2],xmm24[2],xmm21[3],xmm24[3],xmm21[4],xmm24[4],xmm21[5],xmm24[5],xmm21[6],xmm24[6],xmm21[7],xmm24[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; AVX512BW-NEXT:    vpermw %zmm6, %zmm12, %zmm6
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm13, %zmm6 {%k2}
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm21[8],xmm24[8],xmm21[9],xmm24[9],xmm21[10],xmm24[10],xmm21[11],xmm24[11],xmm21[12],xmm24[12],xmm21[13],xmm24[13],xmm21[14],xmm24[14],xmm21[15],xmm24[15]
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
-; AVX512BW-NEXT:    vpermw %zmm2, %zmm12, %zmm2
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm13, %zmm2 {%k2}
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512BW-NEXT:    vpermw %zmm4, %zmm12, %zmm4
-; AVX512BW-NEXT:    vpermw %zmm1, %zmm13, %zmm4 {%k2}
+; AVX512BW-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512BW-NEXT:    vmovdqa64 32(%r10), %xmm16
+; AVX512BW-NEXT:    vmovdqa 48(%r10), %xmm14
+; AVX512BW-NEXT:    vmovdqa (%rax), %xmm3
+; AVX512BW-NEXT:    vmovdqa64 32(%rax), %xmm17
+; AVX512BW-NEXT:    vmovdqa 48(%rax), %xmm15
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512BW-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512BW-NEXT:    vmovdqa64 32(%r9), %xmm19
+; AVX512BW-NEXT:    vmovdqa64 48(%r9), %xmm18
+; AVX512BW-NEXT:    vmovdqa (%r8), %xmm6
+; AVX512BW-NEXT:    vmovdqa64 32(%r8), %xmm21
+; AVX512BW-NEXT:    vmovdqa64 48(%r8), %xmm20
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39]
+; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm7, %zmm0
+; AVX512BW-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512BW-NEXT:    vmovdqa64 48(%rcx), %xmm22
+; AVX512BW-NEXT:    vmovdqa (%rdx), %xmm8
+; AVX512BW-NEXT:    vmovdqa64 48(%rdx), %xmm23
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
+; AVX512BW-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512BW-NEXT:    vmovdqa64 48(%rsi), %xmm24
+; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512BW-NEXT:    vmovdqa64 48(%rdi), %xmm25
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0]
+; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm12, %zmm2
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7]
+; AVX512BW-NEXT:    vpermt2w %zmm11, %zmm7, %zmm13
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7]
+; AVX512BW-NEXT:    vpermt2w %zmm26, %zmm12, %zmm11
+; AVX512BW-NEXT:    vmovdqa64 32(%rcx), %xmm26
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
+; AVX512BW-NEXT:    vmovdqa64 32(%rdx), %xmm27
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15]
+; AVX512BW-NEXT:    vmovdqa64 32(%rsi), %xmm28
+; AVX512BW-NEXT:    vpermt2w %zmm14, %zmm7, %zmm15
+; AVX512BW-NEXT:    vmovdqa64 32(%rdi), %xmm29
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15]
+; AVX512BW-NEXT:    vpermt2w %zmm18, %zmm12, %zmm14
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
+; AVX512BW-NEXT:    vpermt2w %zmm18, %zmm7, %zmm20
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
+; AVX512BW-NEXT:    vpermt2w %zmm22, %zmm12, %zmm18
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%r10), %xmm22
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%rax), %xmm19
+; AVX512BW-NEXT:    vpermt2w %zmm17, %zmm7, %zmm16
+; AVX512BW-NEXT:    vmovdqa64 16(%r9), %xmm21
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%r8), %xmm24
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
+; AVX512BW-NEXT:    vpermt2w %zmm23, %zmm12, %zmm17
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
+; AVX512BW-NEXT:    vpermt2w %zmm23, %zmm7, %zmm25
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
+; AVX512BW-NEXT:    vmovdqa64 16(%rcx), %xmm22
+; AVX512BW-NEXT:    vpermt2w %zmm19, %zmm7, %zmm21
+; AVX512BW-NEXT:    vmovdqa64 16(%rdx), %xmm19
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512BW-NEXT:    vmovdqa 16(%rsi), %xmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm7, %zmm4
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm7
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15]
+; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
+; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm3
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm5
 ; AVX512BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm19, %zmm14 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm22, %zmm17 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm18, %zmm16 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm6, %zmm20 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm2, %zmm23 {%k1}
-; AVX512BW-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm13, %zmm11 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm20, %zmm18 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm16, %zmm17 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm25, %zmm7 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm21, %zmm3 {%k1}
+; AVX512BW-NEXT:    vmovdqa32 %zmm4, %zmm5 {%k1}
 ; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm23, 192(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm20, 128(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm17, 256(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm7, 128(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm17, 320(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm18, 256(%rax)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, 448(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm9, 384(%rax)
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
@@ -8179,172 +8150,173 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP:       # %bb.0:
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512BW-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rsi), %xmm17
+; AVX512BW-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rsi), %xmm16
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm21
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rdi), %xmm18
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm20
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rdi), %xmm17
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
-; AVX512BW-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm5 = [1284,1798]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm6
+; AVX512BW-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm5 = [2312,2826,3340,3854]
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm6 = [1284,1798]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm4
 ; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm22
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rcx), %xmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm21
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rcx), %xmm18
 ; AVX512BW-FCP-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm23
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rdx), %xmm24
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512BW-FCP-NEXT:    movl $572662306, %r11d # imm = 0x22222222
-; AVX512BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT:    vpermw %zmm6, %zmm8, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa (%r10), %xmm6
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r10), %xmm25
-; AVX512BW-FCP-NEXT:    vmovdqa (%rax), %xmm9
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm22
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rdx), %xmm23
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,32,2,3,4,33,6,7,8,42,10,11,12,43,14,15,16,36,18,19,20,37,22,23,24,46,26,27,28,47,30,31]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm4, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT:    vmovdqa (%r10), %xmm4
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r10), %xmm24
+; AVX512BW-FCP-NEXT:    vmovdqa (%rax), %xmm8
 ; AVX512BW-FCP-NEXT:    vmovdqa64 48(%rax), %xmm26
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm10, %ymm13
 ; AVX512BW-FCP-NEXT:    vmovdqa (%r9), %xmm10
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r9), %xmm27
-; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm11
-; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r8), %xmm28
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm12, %zmm13, %zmm12
-; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
-; AVX512BW-FCP-NEXT:    movl $-2004318072, %r11d # imm = 0x88888888
-; AVX512BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512BW-FCP-NEXT:    vpermw %zmm15, %zmm14, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm15, %ymm16
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm16, %ymm16
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm15, %xmm29
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm29, %ymm15, %ymm15
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm8, %zmm15 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm16, %zmm13, %zmm16
-; AVX512BW-FCP-NEXT:    vpermw %zmm29, %zmm14, %zmm16 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm17, %xmm18
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm29, %ymm18
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r10), %xmm29
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r9), %xmm28
+; AVX512BW-FCP-NEXT:    vmovdqa (%r8), %xmm12
+; AVX512BW-FCP-NEXT:    vmovdqa64 48(%r8), %xmm30
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
+; AVX512BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,1,0,32,4,5,1,33,2,1,2,42,4,5,3,43,0,1,4,36,4,5,5,37,0,1,6,46,6,5,7,47]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm13, %zmm14, %zmm11
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm13, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm25
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm13, %ymm13
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm15, %zmm9, %zmm13
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm15, %ymm25
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm25, %zmm14, %zmm15
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm16, %xmm17
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm25, %ymm17
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r10), %xmm25
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm16, %ymm16
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm16, %ymm16
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm17, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rax), %xmm29
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm27
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm17, %ymm17
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm17, %ymm17
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm18, %zmm17
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%rax), %xmm30
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15]
-; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm31
-; AVX512BW-FCP-NEXT:    vpermw %zmm18, %zmm8, %zmm17 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm18, %zmm13, %zmm18
-; AVX512BW-FCP-NEXT:    vpermw %zmm19, %zmm14, %zmm18 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm19, %ymm19, %ymm24
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm24, %ymm24
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm19, %xmm25
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm19, %ymm19
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm24, %zmm19, %zmm19
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm24, %zmm8, %zmm19 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm24, %zmm13, %zmm24
-; AVX512BW-FCP-NEXT:    vpermw %zmm25, %zmm14, %zmm24 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm20, %xmm21
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm25, %ymm21
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm17, %zmm9, %zmm16
+; AVX512BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm31
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm17, %ymm18
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm18, %zmm14, %zmm17
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm18, %ymm23
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm23, %ymm23
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm18, %xmm24
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm18, %ymm18
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm18, %zmm18
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm23, %ymm23
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm23, %zmm9, %zmm18
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm29[0],xmm25[0],xmm29[1],xmm25[1],xmm29[2],xmm25[2],xmm29[3],xmm25[3],xmm29[4],xmm25[4],xmm29[5],xmm25[5],xmm29[6],xmm25[6],xmm29[7],xmm25[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm23, %ymm24
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm14, %zmm23
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm19, %xmm20
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm24, %ymm20
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm19, %ymm19, %ymm19
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm19, %ymm19
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm20, %zmm19
+; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm24
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
+; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm22
 ; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm20, %ymm20
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm20, %ymm20
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm21, %zmm20
-; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm25
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
-; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm23
-; AVX512BW-FCP-NEXT:    vpermw %zmm21, %zmm8, %zmm20 {%k1}
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm20, %zmm9, %zmm19
 ; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rcx), %xmm26
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm13, %zmm21
-; AVX512BW-FCP-NEXT:    vpermw %zmm22, %zmm14, %zmm21 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm22
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm27, %ymm22
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm25[8],xmm29[9],xmm25[9],xmm29[10],xmm25[10],xmm29[11],xmm25[11],xmm29[12],xmm25[12],xmm29[13],xmm25[13],xmm29[14],xmm25[14],xmm29[15],xmm25[15]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm20, %ymm21
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm21, %zmm14, %zmm20
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm22[0],xmm24[0],xmm22[1],xmm24[1],xmm22[2],xmm24[2],xmm22[3],xmm24[3],xmm22[4],xmm24[4],xmm22[5],xmm24[5],xmm22[6],xmm24[6],xmm22[7],xmm24[7]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm21, %xmm25
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm27, %ymm25
 ; AVX512BW-FCP-NEXT:    vmovdqa64 16(%rdx), %xmm27
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm0, %zmm8, %zmm22 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15]
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm23
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm25, %ymm23
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm23, %zmm0
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm23, %zmm8, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512BW-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm2, %ymm2, %ymm23
-; AVX512BW-FCP-NEXT:    vpshufb %ymm4, %ymm23, %ymm4
-; AVX512BW-FCP-NEXT:    vmovdqa64 16(%r10), %xmm23
-; AVX512BW-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
-; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%rax), %xmm5
-; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT:    vmovdqa 16(%r9), %xmm4
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm21, %ymm21
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm21, %ymm21
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm25, %zmm21
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm25, %ymm25
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm25, %zmm9, %zmm21
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm24[8],xmm22[9],xmm24[9],xmm22[10],xmm24[10],xmm22[11],xmm24[11],xmm22[12],xmm24[12],xmm22[13],xmm24[13],xmm22[14],xmm24[14],xmm22[15],xmm24[15]
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm22, %xmm24
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm25, %ymm24
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm22, %ymm22
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm22, %ymm22
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm24, %zmm22
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
+; AVX512BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm24, %ymm24
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm24, %zmm9, %zmm22
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm2
+; AVX512BW-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT:    vmovdqa 16(%r10), %xmm5
+; AVX512BW-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT:    vmovdqa 16(%rax), %xmm6
+; AVX512BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT:    vmovdqa 16(%r9), %xmm2
 ; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
 ; AVX512BW-FCP-NEXT:    vmovdqa 16(%r8), %xmm7
-; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm8, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm8, %zmm13, %zmm8
-; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm8 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15]
-; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; AVX512BW-FCP-NEXT:    vpermw %zmm4, %zmm13, %zmm4
-; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm4 {%k2}
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512BW-FCP-NEXT:    vpermw %zmm5, %zmm13, %zmm5
-; AVX512BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm5 {%k2}
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm9
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; AVX512BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm2
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; AVX512BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm4
 ; AVX512BW-FCP-NEXT:    movw $-21846, %ax # imm = 0xAAAA
 ; AVX512BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm15 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm24, %zmm19 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm20 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm22 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm4, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm16 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm21 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm22 {%k1}
+; AVX512BW-FCP-NEXT:    vmovdqa32 %zmm4, %zmm1 {%k1}
 ; AVX512BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 256(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm17, 448(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm15, 384(%rax)
-; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm22, 192(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm19, 320(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm18, 256(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm16, 448(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
 ; AVX512BW-FCP-NEXT:    vzeroupper
 ; AVX512BW-FCP-NEXT:    retq
 ;
@@ -8352,128 +8324,107 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW:       # %bb.0:
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm1
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm16
-; AVX512DQ-BW-NEXT:    vmovdqa 48(%rcx), %xmm14
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm2
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm18
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdx), %xmm17
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm3
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rsi), %xmm19
-; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm5
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm21
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdi), %xmm22
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT:    movl $572662306, %r11d # imm = 0x22222222
-; AVX512DQ-BW-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-NEXT:    vpermw %zmm4, %zmm6, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa (%r10), %xmm4
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r10), %xmm23
-; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %xmm7
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rax), %xmm24
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm8
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r9), %xmm25
-; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm10
-; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r8), %xmm26
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm11, %zmm12, %zmm11
-; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
-; AVX512DQ-BW-NEXT:    movl $-2004318072, %r11d # imm = 0x88888888
-; AVX512DQ-BW-NEXT:    kmovd %r11d, %k2
-; AVX512DQ-BW-NEXT:    vpermw %zmm9, %zmm13, %zmm11 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm15, %zmm6, %zmm9 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm27 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm15, %zmm12, %zmm15
-; AVX512DQ-BW-NEXT:    vpermw %zmm27, %zmm13, %zmm15 {%k2}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r10), %xmm27
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %xmm28
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm29
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm17, %zmm6, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm30
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm19, %zmm12, %zmm19
-; AVX512DQ-BW-NEXT:    vpermw %zmm17, %zmm13, %zmm19 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm17 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm22, %zmm6, %zmm17 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm22, %zmm12, %zmm22
-; AVX512DQ-BW-NEXT:    vpermw %zmm23, %zmm13, %zmm22 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rcx), %xmm23
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdx), %xmm21
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm18, %zmm6, %zmm16 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rsi), %xmm24
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdi), %xmm25
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm18, %zmm12, %zmm18
-; AVX512DQ-BW-NEXT:    vpermw %zmm20, %zmm13, %zmm18 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm21[0],xmm23[0],xmm21[1],xmm23[1],xmm21[2],xmm23[2],xmm21[3],xmm23[3],xmm21[4],xmm23[4],xmm21[5],xmm23[5],xmm21[6],xmm23[6],xmm21[7],xmm23[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7]
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm26, %zmm6, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm21[8],xmm23[8],xmm21[9],xmm23[9],xmm21[10],xmm23[10],xmm21[11],xmm23[11],xmm21[12],xmm23[12],xmm21[13],xmm23[13],xmm21[14],xmm23[14],xmm21[15],xmm23[15]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15]
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r10), %xmm24
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero,xmm23[2],zero,zero,zero,xmm23[3],zero,zero,zero,xmm23[4],zero,zero,zero,xmm23[5],zero,zero,zero,xmm23[6],zero,zero,zero,xmm23[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm21, %zmm6, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rax), %xmm21
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%r9), %xmm2
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX512DQ-BW-NEXT:    vmovdqa 16(%r8), %xmm5
-; AVX512DQ-BW-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm6, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm21[0],xmm24[0],xmm21[1],xmm24[1],xmm21[2],xmm24[2],xmm21[3],xmm24[3],xmm21[4],xmm24[4],xmm21[5],xmm24[5],xmm21[6],xmm24[6],xmm21[7],xmm24[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm6, %zmm12, %zmm6
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm13, %zmm6 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm21[8],xmm24[8],xmm21[9],xmm24[9],xmm21[10],xmm24[10],xmm21[11],xmm24[11],xmm21[12],xmm24[12],xmm21[13],xmm24[13],xmm21[14],xmm24[14],xmm21[15],xmm24[15]
-; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
-; AVX512DQ-BW-NEXT:    vpermw %zmm2, %zmm12, %zmm2
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm13, %zmm2 {%k2}
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512DQ-BW-NEXT:    vpermw %zmm4, %zmm12, %zmm4
-; AVX512DQ-BW-NEXT:    vpermw %zmm1, %zmm13, %zmm4 {%k2}
+; AVX512DQ-BW-NEXT:    vmovdqa (%r10), %xmm1
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r10), %xmm16
+; AVX512DQ-BW-NEXT:    vmovdqa 48(%r10), %xmm14
+; AVX512DQ-BW-NEXT:    vmovdqa (%rax), %xmm3
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rax), %xmm17
+; AVX512DQ-BW-NEXT:    vmovdqa 48(%rax), %xmm15
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX512DQ-BW-NEXT:    vmovdqa (%r9), %xmm4
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r9), %xmm19
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r9), %xmm18
+; AVX512DQ-BW-NEXT:    vmovdqa (%r8), %xmm6
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%r8), %xmm21
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%r8), %xmm20
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm7 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm2, %zmm7, %zmm0
+; AVX512DQ-BW-NEXT:    vmovdqa (%rcx), %xmm5
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rcx), %xmm22
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdx), %xmm8
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdx), %xmm23
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
+; AVX512DQ-BW-NEXT:    vmovdqa (%rsi), %xmm9
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rsi), %xmm24
+; AVX512DQ-BW-NEXT:    vmovdqa (%rdi), %xmm10
+; AVX512DQ-BW-NEXT:    vmovdqa64 48(%rdi), %xmm25
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; AVX512DQ-BW-NEXT:    vpmovsxbw {{.*#+}} zmm12 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm12, %zmm2
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm11, %zmm7, %zmm13
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm26 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm26, %zmm12, %zmm11
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rcx), %xmm26
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdx), %xmm27
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rsi), %xmm28
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm14, %zmm7, %zmm15
+; AVX512DQ-BW-NEXT:    vmovdqa64 32(%rdi), %xmm29
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm14 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm18, %zmm12, %zmm14
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm18, %zmm7, %zmm20
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm22, %zmm12, %zmm18
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r10), %xmm22
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rax), %xmm19
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm17, %zmm7, %zmm16
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r9), %xmm21
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%r8), %xmm24
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm23, %zmm12, %zmm17
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm23, %zmm7, %zmm25
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15]
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rcx), %xmm22
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm19, %zmm7, %zmm21
+; AVX512DQ-BW-NEXT:    vmovdqa64 16(%rdx), %xmm19
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%rsi), %xmm3
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512DQ-BW-NEXT:    vmovdqa 16(%rdi), %xmm6
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm7, %zmm4
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm7
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15]
+; AVX512DQ-BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm3
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512DQ-BW-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX512DQ-BW-NEXT:    vpermt2w %zmm1, %zmm12, %zmm5
 ; AVX512DQ-BW-NEXT:    movw $-21846, %ax # imm = 0xAAAA
 ; AVX512DQ-BW-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm9 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm19, %zmm14 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm22, %zmm17 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm18, %zmm16 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm6, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm2, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm4, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm13, %zmm11 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm15, %zmm14 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm20, %zmm18 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm16, %zmm17 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm25, %zmm7 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm21, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT:    vmovdqa32 %zmm4, %zmm5 {%k1}
 ; AVX512DQ-BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm23, 192(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm20, 128(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm16, 320(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 256(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm7, 128(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm17, 320(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm18, 256(%rax)
 ; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm14, 448(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm9, 384(%rax)
-; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm0, 64(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm11, 384(%rax)
+; AVX512DQ-BW-NEXT:    vmovdqa64 %zmm2, 64(%rax)
 ; AVX512DQ-BW-NEXT:    vzeroupper
 ; AVX512DQ-BW-NEXT:    retq
 ;
@@ -8481,172 +8432,173 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP:       # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rsi), %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rsi), %xmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rsi), %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rsi), %xmm16
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm21
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rdi), %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdi), %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rdi), %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm5 = [1284,1798]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxwq {{.*#+}} ymm5 = [2312,2826,3340,3854]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxwq {{.*#+}} xmm6 = [1284,1798]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm0, %xmm4
 ; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm3, %zmm0, %zmm0
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rcx), %xmm3
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rcx), %xmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rcx), %xmm21
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rcx), %xmm18
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rdx), %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rdx), %xmm24
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7]
-; AVX512DQ-BW-FCP-NEXT:    movl $572662306, %r11d # imm = 0x22222222
-; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm6, %zmm8, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r10), %xmm6
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r10), %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rax), %xmm9
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rdx), %xmm22
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rdx), %xmm23
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm4, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm9 = [0,32,2,3,4,33,6,7,8,42,10,11,12,43,14,15,16,36,18,19,20,37,22,23,24,46,26,27,28,47,30,31]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm4, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r10), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r10), %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%rax), %xmm8
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%rax), %xmm26
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm10, %ymm10, %ymm13
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r9), %xmm10
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r9), %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm11
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r8), %xmm28
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm12, %zmm13, %zmm12
-; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7]
-; AVX512DQ-BW-FCP-NEXT:    movl $-2004318072, %r11d # imm = 0x88888888
-; AVX512DQ-BW-FCP-NEXT:    kmovd %r11d, %k2
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm15, %zmm14, %zmm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm15, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm16, %ymm16
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm15, %xmm29
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm29, %ymm15, %ymm15
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm15, %zmm15
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm8, %zmm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm16, %zmm13, %zmm16
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm29, %zmm14, %zmm16 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm17, %xmm18
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm29, %ymm18
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r10), %xmm29
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r9), %xmm28
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa (%r8), %xmm12
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 48(%r8), %xmm30
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
+; AVX512DQ-BW-FCP-NEXT:    vpmovsxbw {{.*#+}} zmm14 = [0,1,0,32,4,5,1,33,2,1,2,42,4,5,3,43,0,1,4,36,4,5,5,37,0,1,6,46,6,5,7,47]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm13, %zmm14, %zmm11
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm13 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm13, %ymm13, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm13, %xmm25
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm13, %ymm13
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm15, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm15, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm15, %zmm9, %zmm13
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm15, %ymm15, %ymm25
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm25, %zmm14, %zmm15
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm25, %ymm17
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r10), %xmm25
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm16, %ymm16, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm16, %ymm16
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm16, %zmm17, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rax), %xmm29
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm27
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm17, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm17, %ymm17
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm17, %zmm18, %zmm17
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%rax), %xmm30
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r9), %xmm31
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm18, %zmm8, %zmm17 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 32(%r8), %xmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm18, %zmm13, %zmm18
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm19, %zmm14, %zmm18 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm19, %ymm19, %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm24, %ymm24
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm19, %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm19, %ymm19
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm24, %zmm19, %zmm19
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm24, %zmm8, %zmm19 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm24, %zmm13, %zmm24
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm25, %zmm14, %zmm24 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm20, %xmm21
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm25, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm17, %zmm9, %zmm16
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 32(%r8), %xmm31
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm17, %ymm17, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm18, %zmm14, %zmm17
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm18, %ymm18, %ymm23
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm23, %ymm23
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm18, %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm18, %ymm18
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm23, %zmm18, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm23, %ymm23
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm23, %zmm9, %zmm18
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm29[0],xmm25[0],xmm29[1],xmm25[1],xmm29[2],xmm25[2],xmm29[3],xmm25[3],xmm29[4],xmm25[4],xmm29[5],xmm25[5],xmm29[6],xmm25[6],xmm29[7],xmm25[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm23, %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm23 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm14, %zmm23
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm19, %xmm20
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm24 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm24, %ymm20
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm19, %ymm19, %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm19, %ymm19
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm19, %zmm20, %zmm19
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15]
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm22
 ; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm20, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm20, %ymm20
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm20, %zmm21, %zmm20
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rsi), %xmm25
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rdi), %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm21, %zmm8, %zmm20 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm20, %zmm9, %zmm19
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rcx), %xmm26
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm13, %zmm21
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm22, %zmm14, %zmm21 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm22
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm27, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm25[8],xmm29[9],xmm25[9],xmm29[10],xmm25[10],xmm29[11],xmm25[11],xmm29[12],xmm25[12],xmm29[13],xmm25[13],xmm29[14],xmm25[14],xmm29[15],xmm25[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm20, %ymm20, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm20 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm21, %zmm14, %zmm20
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm21 = xmm22[0],xmm24[0],xmm22[1],xmm24[1],xmm22[2],xmm24[2],xmm22[3],xmm24[3],xmm22[4],xmm24[4],xmm22[5],xmm24[5],xmm22[6],xmm24[6],xmm22[7],xmm24[7]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm21, %xmm25
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm27 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm27, %ymm25
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%rdx), %xmm27
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm22, %zmm22
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm0, %zmm8, %zmm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15]
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm0, %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm23, %ymm25, %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm0, %zmm23, %zmm0
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm23, %zmm8, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT:    # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm2, %ymm2, %ymm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm4, %ymm23, %ymm4
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 16(%r10), %xmm23
-; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rax), %xmm5
-; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%r9), %xmm4
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm21, %ymm21, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm21, %ymm21
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm21, %zmm25, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm25 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm25, %ymm25, %ymm25
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm25, %zmm9, %zmm21
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm24[8],xmm22[9],xmm24[9],xmm22[10],xmm24[10],xmm22[11],xmm24[11],xmm22[12],xmm24[12],xmm22[13],xmm24[13],xmm22[14],xmm24[14],xmm22[15],xmm24[15]
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm22, %xmm24
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm25 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm25, %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm22, %ymm22, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm22, %ymm22
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm22, %zmm24, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm24 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti32x4 $1, %xmm24, %ymm24, %ymm24
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm24, %zmm9, %zmm22
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%r10), %xmm5
+; AVX512DQ-BW-FCP-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%rax), %xmm6
+; AVX512DQ-BW-FCP-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%r9), %xmm2
 ; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
 ; AVX512DQ-BW-FCP-NEXT:    vmovdqa 16(%r8), %xmm7
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm8, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm8, %zmm13, %zmm8
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm8 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15]
-; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm4, %zmm13, %zmm4
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm4 {%k2}
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm5, %zmm13, %zmm5
-; AVX512DQ-BW-FCP-NEXT:    vpermw %zmm3, %zmm14, %zmm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm9
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; AVX512DQ-BW-FCP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm2
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512DQ-BW-FCP-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; AVX512DQ-BW-FCP-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm3
+; AVX512DQ-BW-FCP-NEXT:    vpermt2w %zmm3, %zmm14, %zmm4
 ; AVX512DQ-BW-FCP-NEXT:    movw $-21846, %ax # imm = 0xAAAA
 ; AVX512DQ-BW-FCP-NEXT:    kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm12, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm16, %zmm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm18, %zmm17 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm24, %zmm19 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm21, %zmm20 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm8, %zmm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm4, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm5, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm11, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm15, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm17, %zmm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm23, %zmm18 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm20, %zmm19 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm9, %zmm21 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm2, %zmm22 {%k1}
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa32 %zmm4, %zmm1 {%k1}
 ; AVX512DQ-BW-FCP-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm20, 320(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm17, 448(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm15, 384(%rax)
-; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm22, 192(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm21, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm19, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm18, 256(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm16, 448(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm13, 384(%rax)
+; AVX512DQ-BW-FCP-NEXT:    vmovdqa64 %zmm0, 64(%rax)
 ; AVX512DQ-BW-FCP-NEXT:    vzeroupper
 ; AVX512DQ-BW-FCP-NEXT:    retq
   %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index 68967c2ce6536..c33776daf18fa 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -964,41 +964,11 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
 }
 
 define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8x i64> %a1) {
-; X86-AVX512F-LABEL: blend_of_permutes_v16i32:
-; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; X86-AVX512F-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; X86-AVX512F-NEXT:    movw $-25958, %ax # imm = 0x9A9A
-; X86-AVX512F-NEXT:    kmovw %eax, %k1
-; X86-AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X86-AVX512F-NEXT:    retl
-;
-; X86-AVX512BW-LABEL: blend_of_permutes_v16i32:
-; X86-AVX512BW:       # %bb.0:
-; X86-AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; X86-AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; X86-AVX512BW-NEXT:    movw $-25958, %ax # imm = 0x9A9A
-; X86-AVX512BW-NEXT:    kmovd %eax, %k1
-; X86-AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X86-AVX512BW-NEXT:    retl
-;
-; X64-AVX512F-LABEL: blend_of_permutes_v16i32:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; X64-AVX512F-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; X64-AVX512F-NEXT:    movw $-25958, %ax # imm = 0x9A9A
-; X64-AVX512F-NEXT:    kmovw %eax, %k1
-; X64-AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: blend_of_permutes_v16i32:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
-; X64-AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5]
-; X64-AVX512BW-NEXT:    movw $-25958, %ax # imm = 0x9A9A
-; X64-AVX512BW-NEXT:    kmovd %eax, %k1
-; X64-AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; X64-AVX512BW-NEXT:    retq
+; CHECK-LABEL: blend_of_permutes_v16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [4,21,6,23,16,1,2,19,12,29,14,31,24,9,10,27]
+; CHECK-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   %s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
   %x0 = bitcast <8 x i64> %s0 to <16 x i32>

From 4079ed3c9e72d64746c5d3f05fc585d844c1e8a7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 12 Jun 2025 17:35:55 +0900
Subject: [PATCH 197/851] ARM: Move setting of more runtime libcalls to
 RuntimeLibcallInfo (#143826)

These are the easy cases that do not really depend on the subtarget,
other than for the deceptive predicates on the subtarget class. Most
of the rest of the cases here also do not, but this is obscured by
going through helper predicates added onto the subtarget which hide
dependence on TargetOptions.
---
 llvm/lib/IR/RuntimeLibcalls.cpp         | 28 +++++++++++++++++++++++
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 30 -------------------------
 2 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 31013310a746d..331b319511aed 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -79,6 +79,34 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
       }
     }
   }
+
+  if (TT.isOSWindows()) {
+    static const struct {
+      const RTLIB::Libcall Op;
+      const char *const Name;
+      const CallingConv::ID CC;
+    } LibraryCalls[] = {
+        {RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP},
+        {RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP},
+    };
+
+    for (const auto &LC : LibraryCalls) {
+      Info.setLibcallName(LC.Op, LC.Name);
+      Info.setLibcallCallingConv(LC.Op, LC.CC);
+    }
+  }
+
+  // Use divmod compiler-rt calls for iOS 5.0 and later.
+  if (TT.isOSBinFormatMachO() && (!TT.isiOS() || !TT.isOSVersionLT(5, 0))) {
+    Info.setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+    Info.setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+  }
 }
 
 static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 8455eef9bad32..d2e910a248f23 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -708,36 +708,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     }
   }
 
-  if (Subtarget->isTargetWindows()) {
-    static const struct {
-      const RTLIB::Libcall Op;
-      const char * const Name;
-      const CallingConv::ID CC;
-    } LibraryCalls[] = {
-      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
-      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
-    };
-
-    for (const auto &LC : LibraryCalls) {
-      setLibcallName(LC.Op, LC.Name);
-      setLibcallCallingConv(LC.Op, LC.CC);
-    }
-  }
-
-  // Use divmod compiler-rt calls for iOS 5.0 and later.
-  if (Subtarget->isTargetMachO() &&
-      !(Subtarget->isTargetIOS() &&
-        Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
-    setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
-    setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
-  }
-
   // The half <-> float conversion functions are always soft-float on
   // non-watchos platforms, but are needed for some targets which use a
   // hard-float calling convention by default.

From 5434b85d2c7a83d9cebae06dad2f9d630e9a3927 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 12 Jun 2025 17:38:52 +0900
Subject: [PATCH 198/851] ARM: Remove fake entries for divrem libcalls
 (#143832)

This was defining aliases of the i32 divrem functions for the i8
and i16 cases. This is unnecessary and was unused. The divrem
candidate cases wouldn't have formed with illegal types in the
first place, so codegen wouldn't even query these.
---
 llvm/lib/IR/RuntimeLibcalls.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 331b319511aed..d84c56f0af5c6 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -41,13 +41,8 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
         const char *const Name;
         const CallingConv::ID CC;
       } LibraryCalls[] = {
-          {RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS},
-          {RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS},
           {RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS},
           {RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS},
-
-          {RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS},
-          {RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS},
           {RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS},
           {RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS},
       };
@@ -62,13 +57,8 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
         const char *const Name;
         const CallingConv::ID CC;
       } LibraryCalls[] = {
-          {RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
-          {RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
           {RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS},
           {RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS},
-
-          {RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
-          {RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
           {RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS},
           {RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS},
       };

From ce621041c2f162c50d630810491c2feee8eb6c64 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Thu, 12 Jun 2025 16:39:57 +0800
Subject: [PATCH 199/851] [RISCV] Get host CPU name via hwprobe (#142745)

We can get the `mvendorid/marchid/mimpid` via hwprobe and then we
can compare these IDs with those defined in processors to find the
CPU name.

With this change, `-mcpu/-mtune=native` can set the proper name.
---
 .../llvm/TargetParser/RISCVTargetParser.h     |  8 +++++
 llvm/lib/TargetParser/Host.cpp                | 30 +++++++++++++++----
 llvm/lib/TargetParser/RISCVTargetParser.cpp   | 15 +++++++---
 3 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index 41fdab6012aa0..19a8af0cb9567 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -29,6 +29,13 @@ struct CPUModel {
   uint32_t MVendorID;
   uint64_t MArchID;
   uint64_t MImpID;
+
+  bool isValid() const { return MVendorID != 0 && MArchID != 0 && MImpID != 0; }
+
+  bool operator==(const CPUModel &Other) const {
+    return MVendorID == Other.MVendorID && MArchID == Other.MArchID &&
+           MImpID == Other.MImpID;
+  }
 };
 
 struct CPUInfo {
@@ -58,6 +65,7 @@ LLVM_ABI bool hasFastScalarUnalignedAccess(StringRef CPU);
 LLVM_ABI bool hasFastVectorUnalignedAccess(StringRef CPU);
 LLVM_ABI bool hasValidCPUModel(StringRef CPU);
 LLVM_ABI CPUModel getCPUModel(StringRef CPU);
+LLVM_ABI StringRef getCPUNameFromCPUModel(const CPUModel &Model);
 
 } // namespace RISCV
 
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 14acef116708a..5957e1befe2da 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/TargetParser/RISCVTargetParser.h"
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/TargetParser/X86TargetParser.h"
 #include <string.h>
@@ -1672,8 +1673,32 @@ StringRef sys::getHostCPUName() {
   return "generic";
 }
 #elif defined(__riscv)
+#if defined(__linux__)
+// struct riscv_hwprobe
+struct RISCVHwProbe {
+  int64_t Key;
+  uint64_t Value;
+};
+#endif
+
 StringRef sys::getHostCPUName() {
 #if defined(__linux__)
+  // Try the hwprobe way first.
+  RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_MVENDORID=*/0, 0},
+                       {/*RISCV_HWPROBE_KEY_MARCHID=*/1, 0},
+                       {/*RISCV_HWPROBE_KEY_MIMPID=*/2, 0}};
+  int Ret = syscall(/*__NR_riscv_hwprobe=*/258, /*pairs=*/Query,
+                    /*pair_count=*/std::size(Query), /*cpu_count=*/0,
+                    /*cpus=*/0, /*flags=*/0);
+  if (Ret == 0) {
+    RISCV::CPUModel Model{static_cast<uint32_t>(Query[0].Value), Query[1].Value,
+                          Query[2].Value};
+    StringRef Name = RISCV::getCPUNameFromCPUModel(Model);
+    if (!Name.empty())
+      return Name;
+  }
+
+  // Then try the cpuinfo way.
   std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
   StringRef Content = P ? P->getBuffer() : "";
   StringRef Name = detail::getHostCPUNameForRISCV(Content);
@@ -2148,11 +2173,6 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   return Features;
 }
 #elif defined(__linux__) && defined(__riscv)
-// struct riscv_hwprobe
-struct RISCVHwProbe {
-  int64_t Key;
-  uint64_t Value;
-};
 const StringMap<bool> sys::getHostCPUFeatures() {
   RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_BASE_BEHAVIOR=*/3, 0},
                        {/*RISCV_HWPROBE_KEY_IMA_EXT_0=*/4, 0},
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index 2e5e8f4e50c9c..9957ec0c28d88 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -57,10 +57,7 @@ bool hasFastVectorUnalignedAccess(StringRef CPU) {
   return Info && Info->FastVectorUnalignedAccess;
 }
 
-bool hasValidCPUModel(StringRef CPU) {
-  const CPUModel Model = getCPUModel(CPU);
-  return Model.MVendorID != 0 && Model.MArchID != 0 && Model.MImpID != 0;
-}
+bool hasValidCPUModel(StringRef CPU) { return getCPUModel(CPU).isValid(); }
 
 CPUModel getCPUModel(StringRef CPU) {
   const CPUInfo *Info = getCPUInfoByName(CPU);
@@ -69,6 +66,16 @@ CPUModel getCPUModel(StringRef CPU) {
   return Info->Model;
 }
 
+StringRef getCPUNameFromCPUModel(const CPUModel &Model) {
+  if (!Model.isValid())
+    return "";
+
+  for (auto &C : RISCVCPUInfo)
+    if (C.Model == Model)
+      return C.Name;
+  return "";
+}
+
 bool parseCPU(StringRef CPU, bool IsRV64) {
   const CPUInfo *Info = getCPUInfoByName(CPU);
 

From 4551e5035565606eb04253a35f31d51685657436 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= <kadircet@google.com>
Date: Thu, 12 Jun 2025 10:49:23 +0200
Subject: [PATCH 200/851] [clang] Reset FileID based diag state mappings
 (#143695)

When sharing same compiler instance for multiple compilations, we reset
source manager's file id tables in between runs. Diagnostics engine
keeps a cache based on these file ids, that became dangling references
across compilations.

This patch makes sure we reset those whenever sourcemanager is trashing
its FileIDs.
---
 clang/include/clang/Basic/Diagnostic.h        | 13 +++--
 clang/lib/Basic/Diagnostic.cpp                |  4 +-
 clang/lib/Basic/SourceManager.cpp             |  3 ++
 .../Frontend/CompilerInstanceTest.cpp         | 51 +++++++++++++++++++
 4 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h
index efee8302e7501..7ae4ef7df138c 100644
--- a/clang/include/clang/Basic/Diagnostic.h
+++ b/clang/include/clang/Basic/Diagnostic.h
@@ -424,10 +424,13 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> {
     bool empty() const { return Files.empty(); }
 
     /// Clear out this map.
-    void clear() {
+    void clear(bool Soft) {
+      // Just clear the cache when in soft mode.
       Files.clear();
-      FirstDiagState = CurDiagState = nullptr;
-      CurDiagStateLoc = SourceLocation();
+      if (!Soft) {
+        FirstDiagState = CurDiagState = nullptr;
+        CurDiagStateLoc = SourceLocation();
+      }
     }
 
     /// Produce a debugging dump of the diagnostic state.
@@ -920,6 +923,10 @@ class DiagnosticsEngine : public RefCountedBase<DiagnosticsEngine> {
   /// Reset the state of the diagnostic object to its initial configuration.
   /// \param[in] soft - if true, doesn't reset the diagnostic mappings and state
   void Reset(bool soft = false);
+  /// We keep a cache of FileIDs for diagnostics mapped by pragmas. These might
+  /// get invalidated when diagnostics engine is shared across different
+  /// compilations. Provide users with a way to reset that.
+  void ResetPragmas();
 
   //===--------------------------------------------------------------------===//
   // DiagnosticsEngine classification and reporting interfaces.
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp
index 95d86cb153b4b..a30bfa28eca71 100644
--- a/clang/lib/Basic/Diagnostic.cpp
+++ b/clang/lib/Basic/Diagnostic.cpp
@@ -119,6 +119,8 @@ bool DiagnosticsEngine::popMappings(SourceLocation Loc) {
   return true;
 }
 
+void DiagnosticsEngine::ResetPragmas() { DiagStatesByLoc.clear(/*Soft=*/true); }
+
 void DiagnosticsEngine::Reset(bool soft /*=false*/) {
   ErrorOccurred = false;
   UncompilableErrorOccurred = false;
@@ -135,7 +137,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) {
   if (!soft) {
     // Clear state related to #pragma diagnostic.
     DiagStates.clear();
-    DiagStatesByLoc.clear();
+    DiagStatesByLoc.clear(false);
     DiagStateOnPushStack.clear();
 
     // Create a DiagState and DiagStatePoint representing diagnostic changes
diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp
index 09e5c6547fb51..053e82683a4a6 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -344,6 +344,9 @@ void SourceManager::clearIDTables() {
   NextLocalOffset = 0;
   CurrentLoadedOffset = MaxLoadedOffset;
   createExpansionLoc(SourceLocation(), SourceLocation(), SourceLocation(), 1);
+  // Diagnostics engine keeps some references to fileids, mostly for dealing
+  // with diagnostic pragmas, make sure they're reset as well.
+  Diag.ResetPragmas();
 }
 
 bool SourceManager::isMainFile(const FileEntry &SourceFile) {
diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp
index a7b258d5e537e..459a3864887e1 100644
--- a/clang/unittests/Frontend/CompilerInstanceTest.cpp
+++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp
@@ -9,9 +9,12 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Frontend/CompilerInvocation.h"
+#include "clang/Frontend/FrontendActions.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "gtest/gtest.h"
@@ -97,4 +100,52 @@ TEST(CompilerInstance, AllowDiagnosticLogWithUnownedDiagnosticConsumer) {
   ASSERT_EQ(DiagnosticOutput, "error: expected no crash\n");
 }
 
+TEST(CompilerInstance, MultipleInputsCleansFileIDs) {
+  auto VFS = makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
+  VFS->addFile("a.cc", /*ModificationTime=*/{},
+               MemoryBuffer::getMemBuffer(R"cpp(
+      #include "a.h"
+      )cpp"));
+  // Paddings of `void foo();` in the sources below are "important". We're
+  // testing against source locations from previous compilations colliding.
+  // Hence the `unused` variable in `b.h` needs to be within `#pragma clang
+  // diagnostic` block from `a.h`.
+  VFS->addFile("a.h", /*ModificationTime=*/{}, MemoryBuffer::getMemBuffer(R"cpp(
+      #include "b.h"
+      #pragma clang diagnostic push
+      #pragma clang diagnostic warning "-Wunused"
+      void foo();
+      #pragma clang diagnostic pop
+      )cpp"));
+  VFS->addFile("b.h", /*ModificationTime=*/{}, MemoryBuffer::getMemBuffer(R"cpp(
+      void foo(); void foo(); void foo(); void foo();
+      inline void foo() { int unused = 2; }
+      )cpp"));
+
+  DiagnosticOptions DiagOpts;
+  IntrusiveRefCntPtr<DiagnosticsEngine> Diags =
+      CompilerInstance::createDiagnostics(*VFS, DiagOpts);
+
+  CreateInvocationOptions CIOpts;
+  CIOpts.Diags = Diags;
+
+  const char *Args[] = {"clang", "-xc++", "a.cc"};
+  std::shared_ptr<CompilerInvocation> CInvok =
+      createInvocation(Args, std::move(CIOpts));
+  ASSERT_TRUE(CInvok) << "could not create compiler invocation";
+
+  CompilerInstance Instance(std::move(CInvok));
+  Instance.setDiagnostics(Diags.get());
+  Instance.createFileManager(VFS);
+
+  // Run once for `a.cc` and then for `a.h`. This makes sure we get the same
+  // file ID for `b.h` in the second run as `a.h` from first run.
+  const auto &OrigInputKind = Instance.getFrontendOpts().Inputs[0].getKind();
+  Instance.getFrontendOpts().Inputs.emplace_back("a.h", OrigInputKind);
+
+  SyntaxOnlyAction Act;
+  EXPECT_TRUE(Instance.ExecuteAction(Act)) << "Failed to execute action";
+  EXPECT_FALSE(Diags->hasErrorOccurred());
+  EXPECT_EQ(Diags->getNumWarnings(), 0u);
+}
 } // anonymous namespace

From db8d34db26e9ea92c08d6e813eca9cce40c48478 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 12 Jun 2025 10:04:08 +0100
Subject: [PATCH 201/851] [VPlan] Set branch weight metadata on middle term in
 VPlan (NFC) (#143035)

Manage branch weights for the BranchOnCond in the middle block in VPlan.
This requires updating VPInstruction to inherit from VPIRMetadata, which
in general makes sense as there are a number of opcodes that could take
metadata.

There are other branches (part of the skeleton) that also need branch
weights adding.

PR: https://github.com/llvm/llvm-project/pull/143035
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 48 ++++++++++-------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 53 ++++++++++---------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 ++-
 3 files changed, 62 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d236111836391..93ab3353a296a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7273,6 +7273,33 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
 }
 
+/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
+/// BranchOnCond recipe.
+static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
+                                              Loop *OrigLoop) {
+  // 4. Adjust branch weight of the branch in the middle block.
+  Instruction *LatchTerm = OrigLoop->getLoopLatch()->getTerminator();
+  if (!hasBranchWeightMD(*LatchTerm))
+    return;
+
+  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
+  auto *MiddleTerm =
+      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
+  // Only add branch metadata if there is a (conditional) terminator.
+  if (!MiddleTerm)
+    return;
+
+  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
+         "must have a BranchOnCond");
+  // Assume that `Count % VectorTripCount` is equally distributed.
+  unsigned TripCount = Plan.getUF() * VF.getKnownMinValue();
+  assert(TripCount > 0 && "trip count should not be zero");
+  MDBuilder MDB(LatchTerm->getContext());
+  MDNode *BranchWeights =
+      MDB.createBranchWeights({1, TripCount - 1}, /*IsExpected=*/false);
+  MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
+}
+
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
@@ -7295,11 +7322,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan,
                                             *Legal->getWidestInductionType());
-  // Retrieve and store the middle block before dissolving regions. Regions are
-  // dissolved after optimizing for VF and UF, which completely removes unneeded
-  // loop regions first.
-  VPBasicBlock *MiddleVPBB =
-      BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
+
+  addBranchWeightToMiddleTerminator(BestVPlan, BestVF, OrigLoop);
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7442,20 +7466,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   ILV.printDebugTracesAtEnd();
 
-  // 4. Adjust branch weight of the branch in the middle block.
-  if (HeaderVPBB) {
-    auto *MiddleTerm =
-        cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
-    if (MiddleTerm->isConditional() &&
-        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-      // Assume that `Count % VectorTripCount` is equally distributed.
-      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-      assert(TripCount > 0 && "trip count should not be zero");
-      const uint32_t Weights[] = {1, TripCount - 1};
-      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
-    }
-  }
-
   return ExpandedSCEVs;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index acc861b991975..468284168e9ca 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,11 +882,39 @@ template <unsigned PartOpIdx> class VPUnrollPartAccessor {
   unsigned getUnrollPart(VPUser &U) const;
 };
 
+/// Helper to manage IR metadata for recipes. It filters out metadata that
+/// cannot be propagated.
+class VPIRMetadata {
+  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
+
+public:
+  VPIRMetadata() {}
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I.
+  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
+  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
+
+  /// Copy constructor for cloning.
+  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
+
+  /// Add all metadata to \p I.
+  void applyMetadata(Instruction &I) const;
+
+  void addMetadata(unsigned Kind, MDNode *Node) {
+    Metadata.emplace_back(Kind, Node);
+  }
+};
+
 /// This is a concrete Recipe that models a single VPlan-level instruction.
 /// While as any Recipe it may generate a sequence of IR instructions when
 /// executed, these instructions would always form a single-def expression as
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPRecipeWithIRFlags,
+                      public VPIRMetadata,
                       public VPUnrollPartAccessor<1> {
   friend class VPlanSlp;
 
@@ -976,7 +1004,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
                 const Twine &Name = "")
       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
-        Opcode(Opcode), Name(Name.str()) {}
+        VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 const VPIRFlags &Flags, DebugLoc DL = {},
@@ -1268,29 +1296,6 @@ struct VPIRPhi : public VPIRInstruction, public VPPhiAccessors {
   const VPRecipeBase *getAsRecipe() const override { return this; }
 };
 
-/// Helper to manage IR metadata for recipes. It filters out metadata that
-/// cannot be propagated.
-class VPIRMetadata {
-  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
-
-public:
-  VPIRMetadata() {}
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I.
-  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
-  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
-
-  /// Copy constructor for cloning.
-  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
-
-  /// Add all metadata to \p I.
-  void applyMetadata(Instruction &I) const;
-};
-
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
 /// opcode and operands of the recipe. This recipe covers most of the
 /// traditional vectorization cases where each recipe transforms into a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 62b99d98a2b5e..f5a2533727b3d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -410,7 +410,7 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                              const VPIRFlags &Flags, DebugLoc DL,
                              const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
 }
@@ -591,7 +591,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case VPInstruction::BranchOnCond: {
     Value *Cond = State.get(getOperand(0), VPLane(0));
-    return createCondBranch(Cond, getParent(), State);
+    auto *Br = createCondBranch(Cond, getParent(), State);
+    applyMetadata(*Br);
+    return Br;
   }
   case VPInstruction::BranchOnCount: {
     // First create the compare.

From 2a27c059eccd96b6e46464dbdf69fd2f6237a56c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 10:46:08 +0100
Subject: [PATCH 202/851] [X86] Use BSR passthrough behaviour to fold (CMOV
 (BSR ?, X), Y, (X == 0)) -> (BSR Y, X) (#143662)

Make use of targets that support BSR "pass through behaviour" on a zero input to remove a CMOV thats performing the same function

BSF will be a trickier patch as we need to make sure it works with the "REP BSF" hack in X86MCInstLower
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++++++++++
 llvm/test/CodeGen/X86/bsr.ll            | 10 ++++------
 llvm/test/CodeGen/X86/pr40090.ll        | 11 ++++-------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b0553aa4b8197..f0fbf55e97be9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49398,6 +49398,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
   //      (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
   // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
   //    (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
+  // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X)
+  // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X)
   if ((CC == X86::COND_NE || CC == X86::COND_E) &&
       Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
     SDValue Add = TrueOp;
@@ -49406,6 +49408,14 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
     if (CC == X86::COND_E)
       std::swap(Add, Const);
 
+    // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack.
+    if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR &&
+        Add.getResNo() == 0 && Add.hasOneUse() &&
+        Add.getOperand(1) == Cond.getOperand(0)) {
+      return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const,
+                         Add.getOperand(1));
+    }
+
     // We might have replaced the constant in the cmov with the LHS of the
     // compare. If so change it to the RHS of the compare.
     if (Const == Cond.getOperand(0))
diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
index 1247b3ec59324..fbca4af425eac 100644
--- a/llvm/test/CodeGen/X86/bsr.ll
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -162,9 +162,8 @@ define i32 @cmov_bsr32(i32 %x, i32 %y) nounwind {
 ;
 ; X64-LABEL: cmov_bsr32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $63, %eax
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    bsrl %edi, %eax
-; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
   %2 = xor i32 %1, 31
@@ -188,8 +187,8 @@ define i32 @cmov_bsr32_undef(i32 %x, i32 %y) nounwind {
 ;
 ; X64-LABEL: cmov_bsr32_undef:
 ; X64:       # %bb.0:
+; X64-NEXT:    movl %esi, %eax
 ; X64-NEXT:    bsrl %edi, %eax
-; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
   %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
   %2 = xor i32 %1, 31
@@ -239,9 +238,8 @@ define i64 @cmov_bsr64(i64 %x, i64 %y) nounwind {
 ;
 ; X64-LABEL: cmov_bsr64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl $127, %eax
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    bsrq %rdi, %rax
-; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
   %2 = xor i64 %1, 63
@@ -279,8 +277,8 @@ define i64 @cmov_bsr64_undef(i64 %x, i64 %y) nounwind {
 ;
 ; X64-LABEL: cmov_bsr64_undef:
 ; X64:       # %bb.0:
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    bsrq %rdi, %rax
-; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
   %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
   %2 = xor i64 %1, 63
diff --git a/llvm/test/CodeGen/X86/pr40090.ll b/llvm/test/CodeGen/X86/pr40090.ll
index 24e957ac59f52..af933c950e111 100644
--- a/llvm/test/CodeGen/X86/pr40090.ll
+++ b/llvm/test/CodeGen/X86/pr40090.ll
@@ -4,10 +4,9 @@
 define i64 @foo(i64 %x, i64 %y) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    bsrq %rdi, %rax
-; CHECK-NEXT:    orq $64, %rax
+; CHECK-NEXT:    bsrq %rdi, %rcx
+; CHECK-NEXT:    orq $64, %rcx
 ; CHECK-NEXT:    bsrq %rsi, %rcx
-; CHECK-NEXT:    cmoveq %rax, %rcx
 ; CHECK-NEXT:    movl $63, %eax
 ; CHECK-NEXT:    subq %rcx, %rax
 ; CHECK-NEXT:    retq
@@ -25,11 +24,9 @@ define i64 @bar(i64 %x, i64 %y) {
 ; CHECK-LABEL: bar:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl $127, %ecx
-; CHECK-NEXT:    movl $127, %eax
-; CHECK-NEXT:    bsrq %rdi, %rax
-; CHECK-NEXT:    xorq $64, %rax
+; CHECK-NEXT:    bsrq %rdi, %rcx
+; CHECK-NEXT:    xorq $64, %rcx
 ; CHECK-NEXT:    bsrq %rsi, %rcx
-; CHECK-NEXT:    cmoveq %rax, %rcx
 ; CHECK-NEXT:    movl $63, %eax
 ; CHECK-NEXT:    subq %rcx, %rax
 ; CHECK-NEXT:    retq

From 1d1f9afe911c360b9505b5fd2c712cb112c8aa5f Mon Sep 17 00:00:00 2001
From: Chuanqi Xu <yedeng.yd@linux.alibaba.com>
Date: Thu, 12 Jun 2025 17:42:00 +0800
Subject: [PATCH 203/851] [C++20] [Modules] Treat directly imported internal
 partition unit as reachable

Close https://github.com/llvm/llvm-project/issues/143788

See the discussion for details.
---
 clang/lib/Sema/SemaLookup.cpp    | 23 ++++++++++++++++++-----
 clang/lib/Sema/SemaModule.cpp    | 13 +++++++------
 clang/test/Modules/pr143788.cppm | 28 ++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 11 deletions(-)
 create mode 100644 clang/test/Modules/pr143788.cppm

diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index eef134b158438..91822909f1fd3 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -1978,6 +1978,8 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) {
   if (D->isModulePrivate())
     return false;
 
+  Module *DeclTopModule = DeclModule->getTopLevelModule();
+
   // [module.reach]/p1
   //   A translation unit U is necessarily reachable from a point P if U is a
   //   module interface unit on which the translation unit containing P has an
@@ -1996,17 +1998,28 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) {
   //
   // Here we only check for the first condition. Since we couldn't see
   // DeclModule if it isn't (transitively) imported.
-  if (DeclModule->getTopLevelModule()->isModuleInterfaceUnit())
+  if (DeclTopModule->isModuleInterfaceUnit())
     return true;
 
-  // [module.reach]/p2
+  // [module.reach]/p1,2
+  //   A translation unit U is necessarily reachable from a point P if U is a
+  //   module interface unit on which the translation unit containing P has an
+  //   interface dependency, or the translation unit containing P imports U, in
+  //   either case prior to P
+  //
   //   Additional translation units on
   //   which the point within the program has an interface dependency may be
   //   considered reachable, but it is unspecified which are and under what
   //   circumstances.
-  //
-  // The decision here is to treat all additional tranditional units as
-  // unreachable.
+  Module *CurrentM = SemaRef.getCurrentModule();
+
+  // Directly imported module are necessarily reachable.
+  // Since we can't export import a module implementation partition unit, we
+  // don't need to count for Exports here.
+  if (CurrentM && CurrentM->getTopLevelModule()->Imports.count(DeclTopModule))
+    return true;
+
+  // Then we treat all module implementation partition unit as unreachable.
   return false;
 }
 
diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index 6c4df0aa35af5..9fcaad48d3058 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -712,7 +712,13 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc,
       Mod->Kind == Module::ModuleKind::ModulePartitionImplementation) {
     Diag(ExportLoc, diag::err_export_partition_impl)
         << SourceRange(ExportLoc, Path.back().getLoc());
-  } else if (!ModuleScopes.empty() && !currentModuleIsImplementation()) {
+  } else if (ExportLoc.isValid() &&
+             (ModuleScopes.empty() || currentModuleIsImplementation())) {
+    // [module.interface]p1:
+    // An export-declaration shall inhabit a namespace scope and appear in the
+    // purview of a module interface unit.
+    Diag(ExportLoc, diag::err_export_not_in_module_interface);
+  } else if (!ModuleScopes.empty()) {
     // Re-export the module if the imported module is exported.
     // Note that we don't need to add re-exported module to Imports field
     // since `Exports` implies the module is imported already.
@@ -720,11 +726,6 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc,
       getCurrentModule()->Exports.emplace_back(Mod, false);
     else
       getCurrentModule()->Imports.insert(Mod);
-  } else if (ExportLoc.isValid()) {
-    // [module.interface]p1:
-    // An export-declaration shall inhabit a namespace scope and appear in the
-    // purview of a module interface unit.
-    Diag(ExportLoc, diag::err_export_not_in_module_interface);
   }
 
   return Import;
diff --git a/clang/test/Modules/pr143788.cppm b/clang/test/Modules/pr143788.cppm
new file mode 100644
index 0000000000000..5ae36d8d0e85a
--- /dev/null
+++ b/clang/test/Modules/pr143788.cppm
@@ -0,0 +1,28 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-module-interface -o %t/M.pcm
+// RUN: %clang_cc1 -std=c++20 %t/P.cppm -emit-module-interface -o %t/P.pcm
+// RUN: %clang_cc1 -std=c++20 %t/I.cpp -fmodule-file=M:P=%t/P.pcm -fmodule-file=M=%t/M.pcm -fsyntax-only -verify
+
+//--- H.hpp
+struct S{};
+
+//--- M.cppm
+export module M;
+
+
+//--- P.cppm
+module;
+#include "H.hpp"
+module M:P;
+
+using T = S;
+
+//--- I.cpp
+// expected-no-diagnostics
+module M;
+import :P;
+
+T f() { return {}; }

From 8e4fdff6f02161d878a63900abb35aaa32ff85e9 Mon Sep 17 00:00:00 2001
From: Omair Javaid <omair.javaid@linaro.org>
Date: Thu, 12 Jun 2025 14:48:13 +0500
Subject: [PATCH 204/851] [X86] Update tailcc-ssp.ll assertions using
 update_llc_test_checks.py (#143500)

The assertions in llvm/test/CodeGen/X86/tailcc-ssp.ll were outdated. The
initial comment indicated they were generated with
`utils/update_llc_test_checks.py UTC_ARGS: --version 5`, but this was
not accurate based on the file's content.

Running `utils/update_llc_test_checks.py` regenerated the assertions,
aligning them with the current `llc` output.
This commit ensures that the test's claimed behavior accurately reflects
the actual `llc` output, even though the tests were already passing.

This was identified by @efriedma-quic during review of #136290.

Submitting a separate PR to make sure these changes stay isolated.
---
 llvm/test/CodeGen/X86/tailcc-ssp.ll | 55 ++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/X86/tailcc-ssp.ll b/llvm/test/CodeGen/X86/tailcc-ssp.ll
index 5211e4fe9eef9..7ea5dd49f0242 100644
--- a/llvm/test/CodeGen/X86/tailcc-ssp.ll
+++ b/llvm/test/CodeGen/X86/tailcc-ssp.ll
@@ -78,7 +78,7 @@ define void @tailcall_unrelated_frame() sspreq {
 ; WINDOWS-NEXT:    callq __security_check_cookie
 ; WINDOWS-NEXT:    int3
 ; WINDOWS-NEXT:    .seh_endproc
-
+;
 ; LINUX-LABEL: tailcall_unrelated_frame:
 ; LINUX:       # %bb.0:
 ; LINUX-NEXT:    pushq %rax
@@ -97,6 +97,7 @@ define void @tailcall_unrelated_frame() sspreq {
 ; LINUX-NEXT:    .cfi_def_cfa_offset 16
 ; LINUX-NEXT:    callq __stack_chk_fail@PLT
 
+
   call void @bar()
   tail call void @bar()
   ret void
@@ -105,18 +106,48 @@ define void @tailcall_unrelated_frame() sspreq {
 declare void @callee()
 define void @caller() sspreq {
 ; WINDOWS-LABEL: caller:
-; WINDOWS: callq   callee
-; WINDOWS: callq   callee
-; WINDOWS: cmpq    __security_cookie(%rip), %rcx
-; WINDOWS: jne
-; WINDOWS: callq   __security_check_cookie
-
+; WINDOWS:       # %bb.0:
+; WINDOWS-NEXT:    subq $40, %rsp
+; WINDOWS-NEXT:    .seh_stackalloc 40
+; WINDOWS-NEXT:    .seh_endprologue
+; WINDOWS-NEXT:    movq __security_cookie(%rip), %rax
+; WINDOWS-NEXT:    xorq %rsp, %rax
+; WINDOWS-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; WINDOWS-NEXT:    callq callee
+; WINDOWS-NEXT:    callq callee
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WINDOWS-NEXT:    xorq %rsp, %rcx
+; WINDOWS-NEXT:    cmpq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    jne .LBB2_2
+; WINDOWS-NEXT:  # %bb.1:
+; WINDOWS-NEXT:    .seh_startepilogue
+; WINDOWS-NEXT:    addq $40, %rsp
+; WINDOWS-NEXT:    .seh_endepilogue
+; WINDOWS-NEXT:    retq
+; WINDOWS-NEXT:  .LBB2_2:
+; WINDOWS-NEXT:    callq __security_check_cookie
+; WINDOWS-NEXT:    int3
+; WINDOWS-NEXT:    .seh_endproc
+;
 ; LINUX-LABEL: caller:
-; LINUX: callq   callee@PLT
-; LINUX: callq   callee@PLT
-; LINUX: cmpq
-; LINUX: jne
-; LINUX: callq   __stack_chk_fail@PLT
+; LINUX:       # %bb.0:
+; LINUX-NEXT:    pushq %rax
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    movq %fs:40, %rax
+; LINUX-NEXT:    movq %rax, (%rsp)
+; LINUX-NEXT:    callq callee@PLT
+; LINUX-NEXT:    callq callee@PLT
+; LINUX-NEXT:    movq %fs:40, %rax
+; LINUX-NEXT:    cmpq (%rsp), %rax
+; LINUX-NEXT:    jne .LBB2_2
+; LINUX-NEXT:  # %bb.1: # %SP_return
+; LINUX-NEXT:    popq %rax
+; LINUX-NEXT:    .cfi_def_cfa_offset 8
+; LINUX-NEXT:    retq
+; LINUX-NEXT:  .LBB2_2: # %CallStackCheckFailBlk
+; LINUX-NEXT:    .cfi_def_cfa_offset 16
+; LINUX-NEXT:    callq __stack_chk_fail@PLT
+
 
   tail call void @callee()
   call void @callee()

From 3e5d50f9c61bb266ab17919ab5209c7b08520aff Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr@nvidia.com>
Date: Thu, 12 Jun 2025 15:20:39 +0530
Subject: [PATCH 205/851] [NVPTX] Add cta_group support to TMA G2S intrinsics
 (#143178)

This patch extends the TMA G2S intrinsics with the
support for cta_group::1/2 available from Blackwell onwards.
The existing intrinsics are auto-upgraded with a default
value of '0' for the `cta_group` flag operand.

* lit tests are added for all combinations of the newer variants.
* Negative tests are added to validate the error-handling
   when the value of the cta_group flag falls out-of-range.
* The generated PTX is verified with a 12.8 ptxas executable.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
---
 llvm/docs/NVPTXUsage.rst                      |  32 +-
 llvm/include/llvm/IR/IntrinsicsNVVM.td        |  32 +-
 llvm/include/llvm/IR/NVVMIntrinsicUtils.h     |   9 +
 llvm/lib/IR/AutoUpgrade.cpp                   | 104 ++++-
 .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp   |  19 +
 .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h     |   1 +
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp   |  19 +-
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      |  17 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h        |   8 +
 .../Assembler/auto_upgrade_nvvm_intrinsics.ll |  16 +-
 .../NVPTX/cp-async-bulk-tensor-g2s-1cta.ll    | 435 ++++++++++++++++++
 .../NVPTX/cp-async-bulk-tensor-g2s-2cta.ll    | 435 ++++++++++++++++++
 .../NVPTX/cp-async-bulk-tensor-g2s-invalid.ll |  15 +
 13 files changed, 1078 insertions(+), 64 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index d51686c0b830c..abd7ca5453645 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1016,7 +1016,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
@@ -1034,18 +1034,26 @@ source tensor is preserved at the destination. The dimension of the
 tensor data ranges from 1d to 5d with the coordinates specified
 by the ``i32 %d0 ... i32 %d4`` arguments.
 
-* The last two arguments to these intrinsics are boolean flags
-  indicating support for cache_hint and/or multicast modifiers.
-  These flag arguments must be compile-time constants. The backend
-  looks through these flags and lowers the intrinsics appropriately.
+* The last three arguments to these intrinsics are flags
+  indicating support for multicast, cache_hint and cta_group::1/2
+  modifiers. These flag arguments must be compile-time constants.
+  The backend looks through these flags and lowers the intrinsics
+  appropriately.
 
-* The Nth argument (denoted by ``i1 flag_ch``) when set, indicates
+* The argument denoted by ``i1 %flag_ch`` when set, indicates
   a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
   variant of the PTX instruction.
 
-* The [N-1]th argument (denoted by ``i1 flag_mc``) when set, indicates
-  the presence of a multicast mask (``i16 %mc``) and generates the PTX
-  instruction with the ``.multicast::cluster`` modifier.
+* The argument denoted by ``i1 %flag_mc`` when set, indicates
+  the presence of a multicast mask (``i16 %mc``) and generates
+  the PTX instruction with the ``.multicast::cluster`` modifier.
+
+* The argument denoted by ``i32 %flag_cta_group`` takes values within
+  the range [0, 3) i.e. {0,1,2}. When the value of ``%flag_cta_group``
+  is not within the range, it may raise an error from the Verifier.
+  The default value is '0' with no cta_group modifier in the
+  instruction. The values of '1' and '2' lower to ``cta_group::1``
+  and ``cta_group::2`` variants of the PTX instruction respectively.
 
 For more information, refer PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
@@ -1058,7 +1066,7 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
 
@@ -1074,8 +1082,8 @@ are unrolled into a single dimensional column at the destination. In this
 mode, the tensor has to be at least three-dimensional. Along with the tensor
 coordinates, im2col offsets are also specified (denoted by
 ``i16 im2col0...i16 %im2col2``). The number of im2col offsets is two less
-than the number of dimensions of the tensor operation. The last two arguments
-to these intrinsics are boolean flags, with the same functionality as described
+than the number of dimensions of the tensor operation. The last three arguments
+to these intrinsics are flags, with the same functionality as described
 in the ``tile`` mode intrinsics above.
 
 For more information, refer PTX ISA
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 8c8e778b57061..4efdff71c0167 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2020,20 +2020,26 @@ foreach dim = 1...5 in {
     defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0);
     defvar im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets);
 
+    defvar g2s_params = !listconcat(
+                          [llvm_shared_cluster_ptr_ty, // dst_ptr
+                           llvm_shared_ptr_ty,  // mbarrier_ptr
+                           llvm_ptr_ty],        // tensormap_ptr
+                          tensor_dim_args,      // actual tensor dims
+                          im2col_offsets_args,  // im2col offsets
+                          [llvm_i16_ty,         // cta_mask
+                           llvm_i64_ty]);       // cache_hint
+    defvar g2s_flags = [llvm_i1_ty,             // Flag for cta_mask
+                        llvm_i1_ty,             // Flag for cache_hint
+                        llvm_i32_ty];           // Flag for cta_group
+    defvar cta_group_idx = !add(
+                             !size(g2s_params),
+                             !sub(!size(g2s_flags), 1));
+    defvar g2s_props = [IntrConvergent,
+                        WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
+                        // Allowed values for cta_group are {0,1,2} i.e [0, 3).
+                        Range<ArgIndex<cta_group_idx>, 0, 3>];
     def int_nvvm_cp_async_bulk_tensor_g2s_ # mode # _ # dim # d :
-      DefaultAttrsIntrinsicFlags<[],
-          !listconcat([llvm_shared_cluster_ptr_ty,  // dst_shared_cluster_ptr
-                       llvm_shared_ptr_ty,          // mbarrier_smem_ptr
-                       llvm_ptr_ty],                // tensormap_ptr
-                      tensor_dim_args,              // actual tensor dims
-                      im2col_offsets_args,          // im2col offsets
-                      [llvm_i16_ty,                 // cta_mask
-                       llvm_i64_ty]),               // cache_hint
-          [llvm_i1_ty,                              // Flag for cta_mask
-           llvm_i1_ty],                             // Flag for cache_hint
-          [IntrConvergent,
-           WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
-           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>]>;
+      DefaultAttrsIntrinsicFlags<[], g2s_params, g2s_flags, g2s_props>;
 
     def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d :
       DefaultAttrsIntrinsicFlags<[],
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index ce794e2573637..737610b73b081 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -38,6 +38,15 @@ enum class TMAReductionOp : uint8_t {
   XOR = 7,
 };
 
+// Enum to represent the cta_group::1 and
+// cta_group::2 variants in TMA/TCGEN05 family of
+// PTX instructions.
+enum class CTAGroupKind : uint8_t {
+  CG_NONE = 0, // default with no cta_group modifier
+  CG_1 = 1,    // cta_group::1 modifier
+  CG_2 = 2,    // cta_group::2 modifier
+};
+
 inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
   switch (IntrinsicID) {
   case Intrinsic::nvvm_f2i_rm_ftz:
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index a0886776ff93f..6e7254ec3e31f 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -945,6 +945,53 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
   return false; // No other 'arm.*', 'aarch64.*'.
 }
 
+static Intrinsic::ID shouldUpgradeNVPTXTMAG2SIntrinsics(Function *F,
+                                                        StringRef Name) {
+  if (Name.consume_front("cp.async.bulk.tensor.g2s.")) {
+    Intrinsic::ID ID =
+        StringSwitch<Intrinsic::ID>(Name)
+            .Case("im2col.3d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
+            .Case("im2col.4d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
+            .Case("im2col.5d",
+                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
+            .Case("tile.1d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
+            .Case("tile.2d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
+            .Case("tile.3d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
+            .Case("tile.4d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
+            .Case("tile.5d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
+            .Default(Intrinsic::not_intrinsic);
+
+    if (ID == Intrinsic::not_intrinsic)
+      return ID;
+
+    // These intrinsics may need upgrade for two reasons:
+    // (1) When the address-space of the first argument is shared[AS=3]
+    //     (and we upgrade it to use shared_cluster address-space[AS=7])
+    if (F->getArg(0)->getType()->getPointerAddressSpace() ==
+        NVPTXAS::ADDRESS_SPACE_SHARED)
+      return ID;
+
+    // (2) When there are only two boolean flag arguments at the end:
+    //
+    // The last three parameters of the older version of these
+    // intrinsics are: arg1, arg2, .. i64 ch, i1 mc_flag, i1 ch_flag
+    //
+    // The newer version reads as:
+    // arg1, arg2, .. i64 ch, i1 mc_flag, i1 ch_flag, i32 cta_group_flag
+    //
+    // So, when the type of the [N-3]rd argument is "not i1", then
+    // it is the older version and we need to upgrade.
+    size_t FlagStartIndex = F->getFunctionType()->getNumParams() - 3;
+    Type *ArgType = F->getFunctionType()->getParamType(FlagStartIndex);
+    if (!ArgType->isIntegerTy(1))
+      return ID;
+  }
+
+  return Intrinsic::not_intrinsic;
+}
+
 static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F,
                                                               StringRef Name) {
   if (Name.consume_front("mapa.shared.cluster"))
@@ -959,22 +1006,6 @@ static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F,
                   Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
             .Case("shared.cta.to.cluster",
                   Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
-            .Case("tensor.g2s.im2col.3d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
-            .Case("tensor.g2s.im2col.4d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
-            .Case("tensor.g2s.im2col.5d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
-            .Case("tensor.g2s.tile.1d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
-            .Case("tensor.g2s.tile.2d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
-            .Case("tensor.g2s.tile.3d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
-            .Case("tensor.g2s.tile.4d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
-            .Case("tensor.g2s.tile.5d",
-                  Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
             .Default(Intrinsic::not_intrinsic);
 
     if (ID != Intrinsic::not_intrinsic)
@@ -1339,6 +1370,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
         return true;
       }
 
+      // Upgrade TMA copy G2S Intrinsics
+      IID = shouldUpgradeNVPTXTMAG2SIntrinsics(F, Name);
+      if (IID != Intrinsic::not_intrinsic) {
+        rename(F);
+        NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+        return true;
+      }
+
       // The following nvvm intrinsics correspond exactly to an LLVM idiom, but
       // not to an intrinsic alone.  We expand them in UpgradeIntrinsicCall.
       //
@@ -4831,7 +4870,18 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     return;
   }
   case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
-  case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
+  case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster: {
+    // Create a new call with the correct address space.
+    SmallVector<Value *, 4> Args(CI->args());
+    Args[0] = Builder.CreateAddrSpaceCast(
+        Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    NewCall->takeName(CI);
+    CI->replaceAllUsesWith(NewCall);
+    CI->eraseFromParent();
+    return;
+  }
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
@@ -4840,10 +4890,22 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
-    // Create a new call with the correct address space.
-    SmallVector<Value *, 4> Args(CI->args());
-    Args[0] = Builder.CreateAddrSpaceCast(
-        Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
+    SmallVector<Value *, 16> Args(CI->args());
+
+    // Create AddrSpaceCast to shared_cluster if needed.
+    // This handles case (1) in shouldUpgradeNVPTXTMAG2SIntrinsics().
+    unsigned AS = CI->getArgOperand(0)->getType()->getPointerAddressSpace();
+    if (AS == NVPTXAS::ADDRESS_SPACE_SHARED)
+      Args[0] = Builder.CreateAddrSpaceCast(
+          Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
+
+    // Attach the flag argument for cta_group, with a
+    // default value of 0. This handles case (2) in
+    // shouldUpgradeNVPTXTMAG2SIntrinsics().
+    size_t NumArgs = CI->arg_size();
+    Value *FlagArg = CI->getArgOperand(NumArgs - 3);
+    if (!FlagArg->getType()->isIntegerTy(1))
+      Args.push_back(ConstantInt::get(Builder.getInt32Ty(), 0));
 
     NewCall = Builder.CreateCall(NewFn, Args);
     NewCall->takeName(CI);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index b4616b64bad15..732950deca9fa 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -437,3 +437,22 @@ void NVPTXInstPrinter::printTmaReductionMode(const MCInst *MI, int OpNum,
   llvm_unreachable(
       "Invalid Reduction Op in printCpAsyncBulkTensorReductionMode");
 }
+
+void NVPTXInstPrinter::printCTAGroup(const MCInst *MI, int OpNum,
+                                     raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  using CGTy = nvvm::CTAGroupKind;
+
+  switch (static_cast<CGTy>(MO.getImm())) {
+  case CGTy::CG_NONE:
+    O << "";
+    return;
+  case CGTy::CG_1:
+    O << ".cta_group::1";
+    return;
+  case CGTy::CG_2:
+    O << ".cta_group::2";
+    return;
+  }
+  llvm_unreachable("Invalid cta_group in printCTAGroup");
+}
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index a2dd772cd86d0..f73af7a3f2c6e 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -51,6 +51,7 @@ class NVPTXInstPrinter : public MCInstPrinter {
   void printProtoIdent(const MCInst *MI, int OpNum, raw_ostream &O);
   void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O);
   void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printCTAGroup(const MCInst *MI, int OpNum, raw_ostream &O);
 };
 
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 32223bf3d601e..a20099788d09c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2556,19 +2556,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
   // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
   // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
   // multicast, cache_hint,
-  // multicast_flag, cache_hint_flag}
+  // multicast_flag, cache_hint_flag, cta_group_flag}
   // NumOperands = {Chain, IID} + {Actual intrinsic args}
-  //             = {2}          + {7 + dims + im2col_offsets}
+  //             = {2}          + {8 + dims + im2col_offsets}
   size_t NumOps = N->getNumOperands();
   size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
-                            : (NumOps - 9);
+                            : (NumOps - 10);
   // Offsets is always 'NumDims - 2' and only for im2col mode
   size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
-  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
-  bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
+  bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
   size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
   size_t MultiCastIdx = NumBaseArgs + 2;         // for Chain and IID
 
+  unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
+  if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
+    report_fatal_error(
+        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
+                Subtarget->getSmVersion()));
+
   SDLoc DL(N);
   SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
 
@@ -2580,6 +2586,9 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
   if (IsCacheHint)
     Ops.push_back(N->getOperand(MultiCastIdx + 1));
 
+  // Flag for CTA Group
+  Ops.push_back(getI32Imm(CTAGroupVal, DL));
+
   // Finally, the chain operand
   Ops.push_back(N->getOperand(0));
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 83d7defe6d9a9..f52ff39c3e1a5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -578,10 +578,14 @@ class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
                      # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
 }
 
+def CTAGroupFlags : Operand<i32> {
+  let PrintMethod = "printCTAGroup";
+}
+
 multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> {
   defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
   defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
-  defvar asm_str_default = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
+  defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
   defvar rc = !if(is_shared32, Int32Regs, Int64Regs);
 
   defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
@@ -595,19 +599,22 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode>
     !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
 
   def "" : NVPTXInst<(outs),
-            !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag),
+            !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)),
             !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>,
             Requires<[hasPTX<80>, hasSM<90>]>;
   def _MC : NVPTXInst<(outs),
-                  !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)),
+                  !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag,
+                       (ins Int16Regs:$mc, CTAGroupFlags:$cg)),
                   !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
   def _CH : NVPTXInst<(outs),
-                  !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
+                  !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag,
+                       (ins Int64Regs:$ch, CTAGroupFlags:$cg)),
                   !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>,
                   Requires<[hasPTX<80>, hasSM<90>]>;
   def _MC_CH : NVPTXInst<(outs),
-                     !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)),
+                     !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag,
+                          (ins Int16Regs:$mc, Int64Regs:$ch, CTAGroupFlags:$cg)),
                      !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;"), []>,
                      Requires<[hasPTX<80>, hasSM<90>]>;
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 5136b1ee28502..d2eae48826829 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -117,6 +117,14 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
     return HasTcgen05 && PTXVersion >= 86;
   }
 
+  // TMA G2S copy with cta_group::1/2 support
+  bool hasCpAsyncBulkTensorCTAGroupSupport() const {
+    // TODO: Update/tidy-up after the family-conditional support arrives
+    return ((FullSmVersion == 1001 || FullSmVersion == 1011) &&
+            PTXVersion >= 86) ||
+           (FullSmVersion == 1031 && PTXVersion >= 88);
+  }
+
   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
   // terminates a basic block. Instead, it would assume that control flow
   // continued to the next instruction. The next instruction could be in the
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index b7bdca42d5596..a17f11a680aa2 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -307,9 +307,9 @@ define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace(
 
 ; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col
 define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false, i32 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 0, i1 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 0, i1 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 0, i1 0)
@@ -318,11 +318,11 @@ define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrs
 
 ; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile
 define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false)
-; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false, i32 0)
+; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false, i32 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 0, i1 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 0, i1 0)
   call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 0, i1 0)
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
new file mode 100644
index 0000000000000..5cfa25dfe55fc
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll
@@ -0,0 +1,435 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d
+define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d
+define void @test_cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<3>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d
+define void @test_cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d
+define void @test_cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d
+define void @test_cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d
+define void @test_cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d
+define void @test_cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<4>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d
+define void @test_cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 1, i32 1)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 0, i32 1)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
new file mode 100644
index 0000000000000..a7e6bec6aef10
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll
@@ -0,0 +1,435 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3);
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d
+define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d
+define void @test_cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<3>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d
+define void @test_cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d
+define void @test_cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d
+define void @test_cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d
+define void @test_cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d
+define void @test_cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<4>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<7>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d
+define void @test_cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX64-NEXT:    ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX64-NEXT:    ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<5>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 1, i32 2)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 0, i32 2)
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll
new file mode 100644
index 0000000000000..1c35fbead389e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll
@@ -0,0 +1,15 @@
+; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_100a -o /dev/null 2>&1 | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) writeonly, ptr addrspace(3), ptr readonly, i32, i16, i64, i1 immarg, i1 immarg, i32 immarg range(i32 0, 3))
+
+define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) {
+  ; CHECK: immarg value 3 out of range [0, 3)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 3)
+
+  ; CHECK: immarg value -1 out of range [0, 3)
+  tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 -1)
+
+  ret void
+}

From a8c6fb4cb8e686f733e022afc549bc085d1558f4 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 11:53:32 +0200
Subject: [PATCH 206/851] [MemCpyOpt] Fix lifetime marker sizes in tests (NFC)

As pointed out in https://github.com/llvm/llvm-project/pull/143782,
these tests were specifying the size in bits instead of bytes.

In order to preserve the intent of the tests, add a use of %src,
which prevents stack-move optimization. These are supposed to test
the handling of scoped alias metadata in call slot optimization.
---
 .../test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll | 7 +++++--
 llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll         | 9 ++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
index 989049ab67a0b..840a5172561dc 100644
--- a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
+++ b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll
@@ -1,17 +1,20 @@
 ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
 
+declare void @use(ptr)
+
 ; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains
 define i8 @test(i8 %input) {
   %tmp = alloca i8
   %dst = alloca i8
   %src = alloca i8
 ; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]]
-  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4
+  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !4
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
-  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !4
+  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !4
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !4
   %ret_value = load i8, ptr %dst
+  call void @use(ptr %src)
   ret i8 %ret_value
 }
 
diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
index efdbdce401b76..601498e36a7a3 100644
--- a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
+++ b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll
@@ -1,9 +1,11 @@
 ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s
 
+declare void @use(ptr)
+
 ; Make sure callslot optimization merges alias.scope metadata correctly when it merges instructions.
 ; Merging here naively generates:
 ;  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope !3
-;  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !0
+;  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !0
 ;   ...
 ;  !0 = !{!1}
 ;  !1 = distinct !{!1, !2, !"callee1: %a"}
@@ -18,12 +20,13 @@ define i8 @test(i8 %input) {
   %src = alloca i8
 ; NOTE: we're matching the full line and looking for the lack of !alias.scope here
 ; CHECK:   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false)
-  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !3
+  call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !3
   store i8 %input, ptr %src
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0
-  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !3
+  call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !3
   call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !3
   %ret_value = load i8, ptr %dst
+  call void @use(ptr %src)
   ret i8 %ret_value
 }
 

From 5987f1ee5cc59a05961156c04010ab0f3c857628 Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me@antoniofrighetto.com>
Date: Thu, 12 Jun 2025 11:52:28 +0200
Subject: [PATCH 207/851] [InstCombine] Regenerate `narrow-switch.ll` test
 (NFC)

`narrow-switch.ll` test has been regenerated via latest UTC using
`--prefix-filecheck-ir-name _`, so as to avoid conflicts with
scripted variable names.
---
 .../Transforms/InstCombine/narrow-switch.ll   | 194 +++++++++++++-----
 1 file changed, 148 insertions(+), 46 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/narrow-switch.ll b/llvm/test/Transforms/InstCombine/narrow-switch.ll
index 05a30b910e5ee..90f56a61fa410 100644
--- a/llvm/test/Transforms/InstCombine/narrow-switch.ll
+++ b/llvm/test/Transforms/InstCombine/narrow-switch.ll
@@ -1,15 +1,27 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name _ --version 5
 ; Vary legal integer types in data layout.
 ; RUN: opt < %s -passes=instcombine -S -data-layout=n32    | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32
 ; RUN: opt < %s -passes=instcombine -S -data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64
 
 define i32 @positive1(i64 %a) {
-; ALL-LABEL: @positive1(
-; ALL:         switch i32
-; ALL-NEXT:    i32 10, label %return
-; ALL-NEXT:    i32 100, label %sw.bb1
-; ALL-NEXT:    i32 1001, label %sw.bb2
+; ALL-LABEL: define i32 @positive1(
+; ALL-SAME: i64 [[A:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*]]:
+; ALL-NEXT:    [[TRUNC:%.*]] = trunc i64 [[A]] to i32
+; ALL-NEXT:    switch i32 [[TRUNC]], label %[[SW_DEFAULT:.*]] [
+; ALL-NEXT:      i32 10, label %[[RETURN:.*]]
+; ALL-NEXT:      i32 100, label %[[SW_BB1:.*]]
+; ALL-NEXT:      i32 1001, label %[[SW_BB2:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[SW_BB1]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_BB2]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_DEFAULT]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[RETURN]]:
+; ALL-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ]
+; ALL-NEXT:    ret i32 [[RETVAL_0]]
 ;
 entry:
   %and = and i64 %a, 4294967295
@@ -34,12 +46,24 @@ return:
 }
 
 define i32 @negative1(i64 %a) {
-; ALL-LABEL: @negative1(
-; ALL:         switch i32
-; ALL-NEXT:    i32 -10, label %return
-; ALL-NEXT:    i32 -100, label %sw.bb1
-; ALL-NEXT:    i32 -1001, label %sw.bb2
+; ALL-LABEL: define i32 @negative1(
+; ALL-SAME: i64 [[A:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*]]:
+; ALL-NEXT:    [[TRUNC:%.*]] = trunc i64 [[A]] to i32
+; ALL-NEXT:    switch i32 [[TRUNC]], label %[[SW_DEFAULT:.*]] [
+; ALL-NEXT:      i32 -10, label %[[RETURN:.*]]
+; ALL-NEXT:      i32 -100, label %[[SW_BB1:.*]]
+; ALL-NEXT:      i32 -1001, label %[[SW_BB2:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[SW_BB1]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_BB2]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_DEFAULT]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[RETURN]]:
+; ALL-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ]
+; ALL-NEXT:    ret i32 [[RETVAL_0]]
 ;
 entry:
   %or = or i64 %a, -4294967296
@@ -67,12 +91,24 @@ return:
 ; assertion.
 
 define i32 @trunc72to68(i72 %a) {
-; ALL-LABEL: @trunc72to68(
-; ALL:         switch i68
-; ALL-NEXT:    i68 10, label %return
-; ALL-NEXT:    i68 100, label %sw.bb1
-; ALL-NEXT:    i68 1001, label %sw.bb2
+; ALL-LABEL: define i32 @trunc72to68(
+; ALL-SAME: i72 [[A:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*]]:
+; ALL-NEXT:    [[TRUNC:%.*]] = trunc i72 [[A]] to i68
+; ALL-NEXT:    switch i68 [[TRUNC]], label %[[SW_DEFAULT:.*]] [
+; ALL-NEXT:      i68 10, label %[[RETURN:.*]]
+; ALL-NEXT:      i68 100, label %[[SW_BB1:.*]]
+; ALL-NEXT:      i68 1001, label %[[SW_BB2:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[SW_BB1]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_BB2]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_DEFAULT]]:
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[RETURN]]:
+; ALL-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ]
+; ALL-NEXT:    ret i32 [[RETVAL_0]]
 ;
 entry:
   %and = and i72 %a, 295147905179352825855
@@ -103,15 +139,38 @@ return:
 ; because both are illegal.
 
 define void @trunc64to58(i64 %a) {
-; ALL-LABEL: @trunc64to58(
-; CHECK32:         switch i58
-; CHECK32-NEXT:    i58 0, label %sw.bb1
-; CHECK32-NEXT:    i58 18717182647723699, label %sw.bb2
+; CHECK32-LABEL: define void @trunc64to58(
+; CHECK32-SAME: i64 [[A:%.*]]) {
+; CHECK32-NEXT:  [[ENTRY:.*:]]
+; CHECK32-NEXT:    [[TMP0:%.*]] = trunc i64 [[A]] to i58
+; CHECK32-NEXT:    [[TMP1:%.*]] = and i58 [[TMP0]], 15
+; CHECK32-NEXT:    [[TRUNC:%.*]] = mul nuw i58 [[TMP1]], 18717182647723699
+; CHECK32-NEXT:    switch i58 [[TRUNC]], label %[[SW_DEFAULT:.*]] [
+; CHECK32-NEXT:      i58 0, label %[[SW_BB1:.*]]
+; CHECK32-NEXT:      i58 18717182647723699, label %[[SW_BB2:.*]]
 ; CHECK32-NEXT:    ]
-; CHECK64:         switch i64
-; CHECK64-NEXT:    i64 0, label %sw.bb1
-; CHECK64-NEXT:    i64 18717182647723699, label %sw.bb2
+; CHECK32:       [[SW_BB1]]:
+; CHECK32-NEXT:    br label %[[SW_DEFAULT]]
+; CHECK32:       [[SW_BB2]]:
+; CHECK32-NEXT:    br label %[[SW_DEFAULT]]
+; CHECK32:       [[SW_DEFAULT]]:
+; CHECK32-NEXT:    ret void
+;
+; CHECK64-LABEL: define void @trunc64to58(
+; CHECK64-SAME: i64 [[A:%.*]]) {
+; CHECK64-NEXT:  [[ENTRY:.*:]]
+; CHECK64-NEXT:    [[_TMP0:%.*]] = and i64 [[A]], 15
+; CHECK64-NEXT:    [[TMP0:%.*]] = mul nuw nsw i64 [[_TMP0]], 18717182647723699
+; CHECK64-NEXT:    switch i64 [[TMP0]], label %[[SW_DEFAULT:.*]] [
+; CHECK64-NEXT:      i64 0, label %[[SW_BB1:.*]]
+; CHECK64-NEXT:      i64 18717182647723699, label %[[SW_BB2:.*]]
 ; CHECK64-NEXT:    ]
+; CHECK64:       [[SW_BB1]]:
+; CHECK64-NEXT:    br label %[[SW_DEFAULT]]
+; CHECK64:       [[SW_BB2]]:
+; CHECK64-NEXT:    br label %[[SW_DEFAULT]]
+; CHECK64:       [[SW_DEFAULT]]:
+; CHECK64-NEXT:    ret void
 ;
 entry:
   %tmp0 = and i64 %a, 15
@@ -136,18 +195,19 @@ sw.default:
 ; https://llvm.org/bugs/show_bug.cgi?id=31260
 
 define i8 @PR31260(i8 %x) {
-; ALL-LABEL: @PR31260(
-; ALL-NEXT:  entry:
-; ALL-NEXT:    [[T4:%.*]] = and i8 [[X:%.*]], 2
-; ALL-NEXT:    switch i8 [[T4]], label [[EXIT:%.*]] [
-; ALL-NEXT:    i8 0, label [[CASE126:%.*]]
-; ALL-NEXT:    i8 2, label [[CASE124:%.*]]
+; ALL-LABEL: define i8 @PR31260(
+; ALL-SAME: i8 [[X:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*:]]
+; ALL-NEXT:    [[T4:%.*]] = and i8 [[X]], 2
+; ALL-NEXT:    switch i8 [[T4]], label %[[EXIT:.*]] [
+; ALL-NEXT:      i8 0, label %[[CASE126:.*]]
+; ALL-NEXT:      i8 2, label %[[CASE124:.*]]
 ; ALL-NEXT:    ]
-; ALL:       exit:
+; ALL:       [[EXIT]]:
 ; ALL-NEXT:    ret i8 1
-; ALL:       case126:
+; ALL:       [[CASE126]]:
 ; ALL-NEXT:    ret i8 3
-; ALL:       case124:
+; ALL:       [[CASE124]]:
 ; ALL-NEXT:    ret i8 5
 ;
 entry:
@@ -169,12 +229,33 @@ case124:
 ; Make sure the arithmetic evaluation of the switch
 ; condition is evaluated on the original type
 define i32 @trunc32to16(i32 %a0) #0 {
-; ALL-LABEL: @trunc32to16(
-; ALL:         switch i16
-; ALL-NEXT:    i16 63, label %sw.bb
-; ALL-NEXT:    i16 1, label %sw.bb1
-; ALL-NEXT:    i16 100, label %sw.bb2
+; ALL-LABEL: define i32 @trunc32to16(
+; ALL-SAME: i32 [[A0:%.*]]) {
+; ALL-NEXT:  [[ENTRY:.*:]]
+; ALL-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+; ALL-NEXT:    [[XOR:%.*]] = lshr i32 [[A0]], 16
+; ALL-NEXT:    [[TMP0:%.*]] = trunc nuw i32 [[XOR]] to i16
+; ALL-NEXT:    [[TRUNC:%.*]] = xor i16 [[TMP0]], 15784
+; ALL-NEXT:    switch i16 [[TRUNC]], label %[[SW_EPILOG:.*]] [
+; ALL-NEXT:      i16 63, label %[[SW_BB:.*]]
+; ALL-NEXT:      i16 1, label %[[SW_BB1:.*]]
+; ALL-NEXT:      i16 100, label %[[SW_BB2:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[SW_BB]]:
+; ALL-NEXT:    store i32 90, ptr [[RETVAL]], align 4
+; ALL-NEXT:    br label %[[RETURN:.*]]
+; ALL:       [[SW_BB1]]:
+; ALL-NEXT:    store i32 91, ptr [[RETVAL]], align 4
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_BB2]]:
+; ALL-NEXT:    store i32 92, ptr [[RETVAL]], align 4
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[SW_EPILOG]]:
+; ALL-NEXT:    store i32 113, ptr [[RETVAL]], align 4
+; ALL-NEXT:    br label %[[RETURN]]
+; ALL:       [[RETURN]]:
+; ALL-NEXT:    [[RVAL:%.*]] = load i32, ptr [[RETVAL]], align 4
+; ALL-NEXT:    ret i32 [[RVAL]]
 ;
 entry:
   %retval = alloca i32, align 4
@@ -182,9 +263,9 @@ entry:
   %shr = lshr i32 %xor, 16
   %add = add i32 %shr, -917677090
   switch i32 %add, label %sw.epilog [
-    i32 -917677027, label %sw.bb
-    i32 -917677089, label %sw.bb1
-    i32 -917676990, label %sw.bb2
+  i32 -917677027, label %sw.bb
+  i32 -917677089, label %sw.bb1
+  i32 -917676990, label %sw.bb2
   ]
 
 sw.bb:                                            ; preds = %entry
@@ -219,11 +300,32 @@ declare i32 @goo()
 ; if original type is legal (i32 in this case)
 
 define void @PR29009() {
-; ALL-LABEL: @PR29009(
-; ALL:         switch i32
-; ALL-NEXT:    i32 0, label
-; ALL-NEXT:    i32 3, label
+; ALL-LABEL: define void @PR29009() {
+; ALL-NEXT:    br label %[[BB1:.*]]
+; ALL:       [[BB1]]:
+; ALL-NEXT:    [[TMP2:%.*]] = load volatile i32, ptr @njob, align 4
+; ALL-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP2]], 0
+; ALL-NEXT:    br i1 [[DOTNOT]], label %[[BB10:.*]], label %[[BB3:.*]]
+; ALL:       [[BB3]]:
+; ALL-NEXT:    [[TMP4:%.*]] = call i32 @goo()
+; ALL-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], 7
+; ALL-NEXT:    switch i32 [[TMP5]], label %[[BB6:.*]] [
+; ALL-NEXT:      i32 0, label %[[BB7:.*]]
+; ALL-NEXT:      i32 3, label %[[BB8:.*]]
 ; ALL-NEXT:    ]
+; ALL:       [[BB6]]:
+; ALL-NEXT:    store i32 6, ptr @a, align 4
+; ALL-NEXT:    br label %[[BB9:.*]]
+; ALL:       [[BB7]]:
+; ALL-NEXT:    store i32 1, ptr @a, align 4
+; ALL-NEXT:    br label %[[BB9]]
+; ALL:       [[BB8]]:
+; ALL-NEXT:    store i32 2, ptr @a, align 4
+; ALL-NEXT:    br label %[[BB9]]
+; ALL:       [[BB9]]:
+; ALL-NEXT:    br label %[[BB1]]
+; ALL:       [[BB10]]:
+; ALL-NEXT:    ret void
 ;
   br label %1
 
@@ -236,8 +338,8 @@ define void @PR29009() {
   %5 = call i32 @goo()
   %6 = and i32 %5, 7
   switch i32 %6, label %7 [
-    i32 0, label %8
-    i32 3, label %9
+  i32 0, label %8
+  i32 3, label %9
   ]
 
 ; <label>:7:                                      ; preds = %4

From 7ef77eb9984d1fb537a409cf4be89560fbb681fe Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Thu, 12 Jun 2025 11:09:09 +0100
Subject: [PATCH 208/851] [LV] Support scalable interleave groups for factors
 3,5,6 and 7 (#141865)

Currently the loop vectorizer can only vectorize interleave groups for
power-of-2 factors at scalable VFs by recursively interleaving
[de]interleave2 intrinsics.

However after https://github.com/llvm/llvm-project/pull/124825 and
#139893, we now have [de]interleave intrinsics for all factors up to 8,
which is enough to support all types of segmented loads and stores on
RISC-V.

Now that the interleaved access pass has been taught to lower these in
#139373 and #141512, this patch teaches the loop vectorizer to emit
these intrinsics for factors up to 8, which enables scalable
vectorization for non-power-of-2 factors.

As far as I'm aware, no in-tree target will vectorize a scalable
interelave group above factor 8 because the maximum interleave factor is
capped at 4 on AArch64 and 8 on RISC-V, and the
`-max-interleave-group-factor` CLI option defaults to 8, so the
recursive [de]interleaving code has been removed for now.

Factors of 3 with scalable VFs are also turned off in AArch64 since
there's no lowering for [de]interleave3 just yet either.
---
 llvm/include/llvm/Analysis/VectorUtils.h      |   6 +
 llvm/lib/Analysis/VectorUtils.cpp             |  24 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |   7 +
 .../Transforms/Vectorize/LoopVectorize.cpp    |  14 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  63 +-
 .../AArch64/sve-interleaved-accesses.ll       |  52 +-
 .../sve-interleaved-masked-accesses.ll        |  84 +--
 .../RISCV/interleaved-accesses.ll             | 626 +++++++++---------
 8 files changed, 418 insertions(+), 458 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 52fe6f6cf43f2..53ba1e8f77791 100644
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -176,6 +176,12 @@ LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(
 LLVM_ABI Intrinsic::ID
 getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI);
 
+/// Returns the corresponding llvm.vector.interleaveN intrinsic for factor N.
+LLVM_ABI Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor);
+
+/// Returns the corresponding llvm.vector.deinterleaveN intrinsic for factor N.
+LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor);
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 968fd2f8c5d7f..63fccee63c0ae 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -240,6 +240,30 @@ Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
   return Intrinsic::not_intrinsic;
 }
 
+struct InterleaveIntrinsic {
+  Intrinsic::ID Interleave, Deinterleave;
+};
+
+static InterleaveIntrinsic InterleaveIntrinsics[] = {
+    {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
+    {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
+    {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
+    {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
+    {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
+    {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
+    {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
+};
+
+Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) {
+  assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+  return InterleaveIntrinsics[Factor - 2].Interleave;
+}
+
+Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) {
+  assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
+  return InterleaveIntrinsics[Factor - 2].Deinterleave;
+}
+
 /// Given a vector and an element number, see if the scalar value is
 /// already around as a register, for example if it were inserted then extracted
 /// from the vector.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index acd37a5ae0720..0232ac421aeda 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4583,6 +4583,13 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   if (VecTy->isScalableTy() && !ST->hasSVE())
     return InstructionCost::getInvalid();
 
+  // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
+  // only have lowering for power-of-2 factors.
+  // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
+  // InterleavedAccessPass for ld3/st3
+  if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
+    return InstructionCost::getInvalid();
+
   // Vectorization for masked interleaved accesses is only enabled for scalable
   // VF.
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 93ab3353a296a..474f856d20461 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3166,10 +3166,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   if (hasIrregularType(ScalarTy, DL))
     return false;
 
-  // For scalable vectors, the only interleave factor currently supported
-  // must be power of 2 since we require the (de)interleave2 intrinsics
-  // instead of shufflevectors.
-  if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
+  // For scalable vectors, the interleave factors must be <= 8 since we require
+  // the (de)interleaveN intrinsics instead of shufflevectors.
+  if (VF.isScalable() && InterleaveFactor > 8)
     return false;
 
   // If the group involves a non-integral pointer, we may not be able to
@@ -8718,10 +8717,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
       bool Result = (VF.isVector() && // Query is illegal for VF == 1
                      CM.getWideningDecision(IG->getInsertPos(), VF) ==
                          LoopVectorizationCostModel::CM_Interleave);
-      // For scalable vectors, the only interleave factor currently supported
-      // must be power of 2 since we require the (de)interleave2 intrinsics
-      // instead of shufflevectors.
-      assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
+      // For scalable vectors, the interleave factors must be <= 8 since we
+      // require the (de)interleaveN intrinsics instead of shufflevectors.
+      assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
              "Unsupported interleave factor for scalable vectors");
       return Result;
     };
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f5a2533727b3d..8863a3fb4b31d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3296,21 +3296,13 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
-    assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
-                                    "scalable vectors, must be power of 2");
-    SmallVector<Value *> InterleavingValues(Vals);
-    // When interleaving, the number of values will be shrunk until we have the
-    // single final interleaved value.
-    auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
-    for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
-      InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
-      for (unsigned I = 0; I < Midpoint; ++I)
-        InterleavingValues[I] = Builder.CreateIntrinsic(
-            InterleaveTy, Intrinsic::vector_interleave2,
-            {InterleavingValues[I], InterleavingValues[Midpoint + I]},
-            /*FMFSource=*/nullptr, Name);
-    }
-    return InterleavingValues[0];
+    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
+    VectorType *InterleaveTy =
+        VectorType::get(VecTy->getElementType(),
+                        VecTy->getElementCount().multiplyCoefficientBy(Factor));
+    return Builder.CreateIntrinsic(InterleaveTy,
+                                   getInterleaveIntrinsicID(Factor), Vals,
+                                   /*FMFSource=*/nullptr, Name);
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
@@ -3396,7 +3388,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
                           &InterleaveFactor](Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(isPowerOf2_32(InterleaveFactor) &&
+      assert(InterleaveFactor <= 8 &&
              "Unsupported deinterleave factor for scalable vectors");
       auto *ResBlockInMask = State.get(BlockInMask);
       SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
@@ -3440,43 +3432,18 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     ArrayRef<VPValue *> VPDefs = definedValues();
     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
     if (VecTy->isScalableTy()) {
-      assert(isPowerOf2_32(InterleaveFactor) &&
-             "Unsupported deinterleave factor for scalable vectors");
-
       // Scalable vectors cannot use arbitrary shufflevectors (only splats),
       // so must use intrinsics to deinterleave.
-      SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
-      DeinterleavedValues[0] = NewLoad;
-      // For the case of InterleaveFactor > 2, we will have to do recursive
-      // deinterleaving, because the current available deinterleave intrinsic
-      // supports only Factor of 2, otherwise it will bailout after first
-      // iteration.
-      // When deinterleaving, the number of values will double until we
-      // have "InterleaveFactor".
-      for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
-           NumVectors *= 2) {
-        // Deinterleave the elements within the vector
-        SmallVector<Value *> TempDeinterleavedValues(NumVectors);
-        for (unsigned I = 0; I < NumVectors; ++I) {
-          auto *DiTy = DeinterleavedValues[I]->getType();
-          TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
-              Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
-              /*FMFSource=*/nullptr, "strided.vec");
-        }
-        // Extract the deinterleaved values:
-        for (unsigned I = 0; I < 2; ++I)
-          for (unsigned J = 0; J < NumVectors; ++J)
-            DeinterleavedValues[NumVectors * I + J] =
-                State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
-      }
+      assert(InterleaveFactor <= 8 &&
+             "Unsupported deinterleave factor for scalable vectors");
+      Value *Deinterleave = State.Builder.CreateIntrinsic(
+          getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(),
+          NewLoad,
+          /*FMFSource=*/nullptr, "strided.vec");
 
-#ifndef NDEBUG
-      for (Value *Val : DeinterleavedValues)
-        assert(Val && "NULL Deinterleaved Value");
-#endif
       for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
         Instruction *Member = Group->getMember(I);
-        Value *StridedVec = DeinterleavedValues[I];
+        Value *StridedVec = State.Builder.CreateExtractValue(Deinterleave, I);
         if (!Member) {
           // This value is not needed as it's not used
           cast<Instruction>(StridedVec)->eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 6861644fc9969..77e713256d247 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -375,8 +375,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
@@ -1479,34 +1479,24 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP7]])
-; CHECK-NEXT:    [[STRIDED_VEC7:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[WIDE_VEC8:%.*]] = load <vscale x 16 x i32>, ptr [[TMP13]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC9:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 1
-; CHECK-NEXT:    [[STRIDED_VEC10:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
-; CHECK-NEXT:    [[STRIDED_VEC11:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP15]])
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 0
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 1
-; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 3
 ; CHECK-NEXT:    [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP23]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP24]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC12]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]])
 ; CHECK-NEXT:    store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1595,18 +1585,14 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP11]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP12]])
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
 ; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT:    [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
 ; CHECK-NEXT:    [[REVERSE4:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP15]])
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 3
 ; CHECK-NEXT:    [[REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP16]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE3]], [[VEC_IND]]
@@ -1622,9 +1608,7 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
 ; CHECK-NEXT:    [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])
 ; CHECK-NEXT:    [[REVERSE8:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP19]])
 ; CHECK-NEXT:    [[REVERSE9:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP20]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE8]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE9]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC10]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> [[REVERSE6]], <vscale x 4 x i32> [[REVERSE7]], <vscale x 4 x i32> [[REVERSE8]], <vscale x 4 x i32> [[REVERSE9]])
 ; CHECK-NEXT:    store <vscale x 16 x i32> [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 469faf67a71b3..3567aff0ace4e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -469,36 +469,26 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -531,37 +521,27 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK2:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK2]], <vscale x 64 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 32 x i8>, <vscale x 32 x i8> } @llvm.vector.deinterleave2.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP11]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC3]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC4]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP19]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC5:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP18]], <vscale x 16 x i8> [[TMP20]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC6:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave2.nxv64i8(<vscale x 32 x i8> [[INTERLEAVED_VEC]], <vscale x 32 x i8> [[INTERLEAVED_VEC5]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK7:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK8:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK9:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> [[INTERLEAVED_MASK7]], <vscale x 32 x i1> [[INTERLEAVED_MASK8]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK9]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_TAIL_FOLDING:       scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index feb3b6d42b658..61a3e3561ad98 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -354,32 +354,40 @@ exit:
 define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP7]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; CHECK-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
+; CHECK-NEXT:    store <vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -458,32 +466,40 @@ define void @load_store_factor3_i32(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor3_i32(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP1]], align 4
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SCALABLE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP7]], <24 x i32> poison, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
-; SCALABLE-NEXT:    store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 12 x i32>, ptr [[TMP1]], align 4
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP8]], splat (i32 1)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP9]], splat (i32 2)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i32> [[TMP10]], splat (i32 3)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[TMP13]])
+; SCALABLE-NEXT:    store <vscale x 12 x i32> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 4
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -541,32 +557,40 @@ exit:
 define void @load_store_factor3_i64(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> [[TMP6]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP7]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
+; CHECK-NEXT:    store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -645,32 +669,40 @@ define void @load_store_factor3_i64(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor3_i64(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 2
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP7]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 3
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> [[TMP6]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP7]], <12 x i64> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SCALABLE-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP8]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 3)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP13]])
+; SCALABLE-NEXT:    store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -745,22 +777,16 @@ define void @load_store_factor4(ptr %p) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP7]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP8]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP9]])
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP16]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC3:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP17]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC]], <vscale x 4 x i64> [[INTERLEAVED_VEC3]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
 ; CHECK-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC4]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -877,22 +903,16 @@ define void @load_store_factor4(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP6:%.*]] = mul i64 [[INDEX]], 4
 ; SCALABLE-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]]
 ; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP7]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP8]])
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP9]])
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 3
 ; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
 ; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
 ; SCALABLE-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i64> [[TMP12]], splat (i64 3)
 ; SCALABLE-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i64> [[TMP13]], splat (i64 4)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP16]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC3:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP17]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC]], <vscale x 4 x i64> [[INTERLEAVED_VEC3]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC4:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64> [[TMP15]], <vscale x 2 x i64> [[TMP16]], <vscale x 2 x i64> [[TMP17]])
 ; SCALABLE-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC4]], ptr [[TMP7]], align 8
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; SCALABLE-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -970,38 +990,41 @@ exit:
 define void @load_store_factor5(ptr %p) {
 ; CHECK-LABEL: @load_store_factor5(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <10 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 0, i32 5>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 1, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 2, i32 7>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 3, i32 8>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 4, i32 9>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x i64> [[TMP11]], <10 x i64> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
-; CHECK-NEXT:    store <10 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 5 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP10]], <vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]])
+; CHECK-NEXT:    store <vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1106,38 +1129,41 @@ define void @load_store_factor5(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor5(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 5
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <10 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 0, i32 5>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 1, i32 6>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 2, i32 7>
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 3, i32 8>
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <10 x i64> [[WIDE_VEC]], <10 x i64> poison, <2 x i32> <i32 4, i32 9>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <10 x i64> [[TMP11]], <10 x i64> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>
-; SCALABLE-NEXT:    store <10 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 5 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave5.nxv5i64(<vscale x 5 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT:    [[TMP10:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 5 x i64> @llvm.vector.interleave5.nxv5i64(<vscale x 1 x i64> [[TMP10]], <vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]])
+; SCALABLE-NEXT:    store <vscale x 5 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1217,41 +1243,43 @@ exit:
 define void @load_store_factor6(ptr %p) {
 ; CHECK-LABEL: @load_store_factor6(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 0, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 1, i32 7>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 2, i32 8>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 3, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 4, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 5, i32 11>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
-; CHECK-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]])
+; CHECK-NEXT:    store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1369,41 +1397,43 @@ define void @load_store_factor6(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor6(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 6
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 0, i32 6>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 1, i32 7>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 2, i32 8>
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 3, i32 9>
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 4, i32 10>
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <2 x i32> <i32 5, i32 11>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; SCALABLE-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
-; SCALABLE-NEXT:    store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 6 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave6.nxv6i64(<vscale x 6 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 6 x i64> @llvm.vector.interleave6.nxv6i64(<vscale x 1 x i64> [[TMP11]], <vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]])
+; SCALABLE-NEXT:    store <vscale x 6 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1494,45 +1524,45 @@ exit:
 define void @load_store_factor7(ptr %p) {
 ; CHECK-LABEL: @load_store_factor7(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <14 x i64>, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 0, i32 7>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 1, i32 8>
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 2, i32 9>
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 3, i32 10>
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 4, i32 11>
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 5, i32 12>
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 6, i32 13>
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <6 x i64> [[TMP14]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP15]], <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <14 x i64> [[TMP16]], <14 x i64> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
-; CHECK-NEXT:    store <14 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 7 x i64>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; CHECK-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 7)
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]])
+; CHECK-NEXT:    store <vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1664,45 +1694,45 @@ define void @load_store_factor7(ptr %p) {
 ;
 ; SCALABLE-LABEL: @load_store_factor7(
 ; SCALABLE-NEXT:  entry:
-; SCALABLE-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP3]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE:       vector.ph:
+; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
+; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALABLE:       vector.body:
 ; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = mul i64 [[INDEX]], 7
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP0]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <14 x i64>, ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 0, i32 7>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 1, i32 8>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 2, i32 9>
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 3, i32 10>
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 4, i32 11>
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 5, i32 12>
-; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <14 x i64> [[WIDE_VEC]], <14 x i64> poison, <2 x i32> <i32 6, i32 13>
-; SCALABLE-NEXT:    [[TMP2:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; SCALABLE-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; SCALABLE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
-; SCALABLE-NEXT:    [[TMP15:%.*]] = shufflevector <6 x i64> [[TMP14]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
-; SCALABLE-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i64> [[TMP12]], <8 x i64> [[TMP15]], <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <14 x i64> [[TMP16]], <14 x i64> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
-; SCALABLE-NEXT:    store <14 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 7 x i64>, ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave7.nxv7i64(<vscale x 7 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 1 x i64> [[TMP5]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP13:%.*]] = add <vscale x 1 x i64> [[TMP6]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP14:%.*]] = add <vscale x 1 x i64> [[TMP7]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 1 x i64> [[TMP8]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP16:%.*]] = add <vscale x 1 x i64> [[TMP9]], splat (i64 5)
+; SCALABLE-NEXT:    [[TMP17:%.*]] = add <vscale x 1 x i64> [[TMP10]], splat (i64 6)
+; SCALABLE-NEXT:    [[TMP18:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 7)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 7 x i64> @llvm.vector.interleave7.nxv7i64(<vscale x 1 x i64> [[TMP12]], <vscale x 1 x i64> [[TMP13]], <vscale x 1 x i64> [[TMP14]], <vscale x 1 x i64> [[TMP15]], <vscale x 1 x i64> [[TMP16]], <vscale x 1 x i64> [[TMP17]], <vscale x 1 x i64> [[TMP18]])
+; SCALABLE-NEXT:    store <vscale x 7 x i64> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
+; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; SCALABLE-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       loop:
 ; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -1818,27 +1848,15 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP5]])
-; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
-; CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP7]])
-; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
-; CHECK-NEXT:    [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; CHECK-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
-; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
-; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
-; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
-; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 7
 ; CHECK-NEXT:    [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 3)
@@ -1847,13 +1865,7 @@ define void @load_store_factor8(ptr %p) {
 ; CHECK-NEXT:    [[TMP24:%.*]] = add <vscale x 1 x i64> [[TMP16]], splat (i64 6)
 ; CHECK-NEXT:    [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 7)
 ; CHECK-NEXT:    [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 8)
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP23]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
-; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
+; CHECK-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP24]], <vscale x 1 x i64> [[TMP25]], <vscale x 1 x i64> [[TMP26]])
 ; CHECK-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -2019,27 +2031,15 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = shl i64 [[INDEX]], 3
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP3]]
 ; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[TMP4]], align 8
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP5:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP5]])
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
-; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
-; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
-; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP7]])
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
-; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
-; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
-; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
-; SCALABLE-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
-; SCALABLE-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
-; SCALABLE-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
-; SCALABLE-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
-; SCALABLE-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave8.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 2
+; SCALABLE-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 3
+; SCALABLE-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 4
+; SCALABLE-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 5
+; SCALABLE-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 6
+; SCALABLE-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC]], 7
 ; SCALABLE-NEXT:    [[TMP19:%.*]] = add <vscale x 1 x i64> [[TMP11]], splat (i64 1)
 ; SCALABLE-NEXT:    [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 2)
 ; SCALABLE-NEXT:    [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 3)
@@ -2048,13 +2048,7 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[TMP24:%.*]] = add <vscale x 1 x i64> [[TMP16]], splat (i64 6)
 ; SCALABLE-NEXT:    [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 7)
 ; SCALABLE-NEXT:    [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 8)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP23]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave8.nxv8i64(<vscale x 1 x i64> [[TMP19]], <vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP24]], <vscale x 1 x i64> [[TMP25]], <vscale x 1 x i64> [[TMP26]])
 ; SCALABLE-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[TMP4]], align 8
 ; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
 ; SCALABLE-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

From 702b9033c115500a934a6c49c325c112b30fe47f Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Thu, 12 Jun 2025 11:27:30 +0100
Subject: [PATCH 209/851] [LLVM][CodeGen][AArch64] Lower vector-(de)interleave
 to multi-register uzp/zip instructions. (#143128)

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  48 ++
 .../AArch64/sve-vector-deinterleave.ll        | 633 ++++++++++++------
 .../CodeGen/AArch64/sve-vector-interleave.ll  | 561 ++++++++++------
 3 files changed, 850 insertions(+), 392 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index af5dfd6c9b8f4..ac545534d728b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29451,6 +29451,30 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
 
+  // Are multi-register uzp instructions available?
+  if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
+      OpVT.getVectorElementType() != MVT::i1) {
+    Intrinsic::ID IntID;
+    switch (Op->getNumOperands()) {
+    default:
+      return SDValue();
+    case 2:
+      IntID = Intrinsic::aarch64_sve_uzp_x2;
+      break;
+    case 4:
+      if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
+          OpVT.getScalarSizeInBits() == 64)
+        return SDValue();
+      IntID = Intrinsic::aarch64_sve_uzp_x4;
+      break;
+    }
+
+    SmallVector<SDValue, 5> Ops;
+    Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+    Ops.append(Op->op_values().begin(), Op->op_values().end());
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
+  }
+
   if (Op->getNumOperands() != 2)
     return SDValue();
 
@@ -29468,6 +29492,30 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
 
+  // Are multi-register zip instructions available?
+  if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
+      OpVT.getVectorElementType() != MVT::i1) {
+    Intrinsic::ID IntID;
+    switch (Op->getNumOperands()) {
+    default:
+      return SDValue();
+    case 2:
+      IntID = Intrinsic::aarch64_sve_zip_x2;
+      break;
+    case 4:
+      if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
+          OpVT.getScalarSizeInBits() == 64)
+        return SDValue();
+      IntID = Intrinsic::aarch64_sve_zip_x4;
+      break;
+    }
+
+    SmallVector<SDValue, 5> Ops;
+    Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+    Ops.append(Op->op_values().begin(), Op->op_values().end());
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
+  }
+
   if (Op->getNumOperands() != 2)
     return SDValue();
 
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 89fc10b47bb35..139ecafaff0eb 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -1,106 +1,166 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve2 | FileCheck %s
+; RUN: llc < %s -mattr=+sve | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc < %s -mattr=+sve,+sme2 | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc < %s -mattr=+sme2 -force-streaming | FileCheck %s -check-prefixes=CHECK,SME2,SME2-ALL
+; RUN: llc < %s -mattr=+sme2 -force-streaming -aarch64-sve-vector-bits-min=256 | FileCheck %s -check-prefixes=CHECK,SME2,SME2-256
+
+target triple = "aarch64-unknown-linux-gnu"
 
 define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    uunpklo z1.d, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.s, z0.s, z0.s
+; SVE-NEXT:    uzp2 z2.s, z0.s, z0.s
+; SVE-NEXT:    uunpklo z0.d, z1.s
+; SVE-NEXT:    uunpklo z1.d, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2f16_nxv4f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.d, z0.s
+; SME2-NEXT:    uunpklo z0.d, z0.s
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
   ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
 }
 
 define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv8f16(<vscale x 8 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    uzp2 z2.h, z0.h, z0.h
-; CHECK-NEXT:    uunpklo z0.s, z1.h
-; CHECK-NEXT:    uunpklo z1.s, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.h, z0.h, z0.h
+; SVE-NEXT:    uzp2 z2.h, z0.h, z0.h
+; SVE-NEXT:    uunpklo z0.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4f16_nxv8f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.s, z0.h
+; SME2-NEXT:    uunpklo z0.s, z0.h
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
   ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
 }
 
 define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv16f16(<vscale x 16 x half> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8f16_nxv16f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.h, z0.h, z1.h
+; SVE-NEXT:    uzp2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8f16_nxv16f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
   ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
 }
 
 define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv4f32(<vscale x 4 x float> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    uunpklo z1.d, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.s, z0.s, z0.s
+; SVE-NEXT:    uzp2 z2.s, z0.s, z0.s
+; SVE-NEXT:    uunpklo z0.d, z1.s
+; SVE-NEXT:    uunpklo z1.d, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2f32_nxv4f32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.d, z0.s
+; SME2-NEXT:    uunpklo z0.d, z0.s
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
   ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
 }
 
 define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv8f32(<vscale x 8 x float> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
-; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4f32_nxv8f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.s, z0.s, z1.s
+; SVE-NEXT:    uzp2 z1.s, z0.s, z1.s
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4f32_nxv8f32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
   ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
 }
 
 define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv4f64(<vscale x 4 x double> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z1.d, z0.d, z1.d
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
   ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
 }
 
 define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.s, z0.s, z0.s
-; CHECK-NEXT:    uzp2 z2.s, z0.s, z0.s
-; CHECK-NEXT:    uunpklo z0.d, z1.s
-; CHECK-NEXT:    uunpklo z1.d, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.s, z0.s, z0.s
+; SVE-NEXT:    uzp2 z2.s, z0.s, z0.s
+; SVE-NEXT:    uunpklo z0.d, z1.s
+; SVE-NEXT:    uunpklo z1.d, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.d, z0.s
+; SME2-NEXT:    uunpklo z0.d, z0.s
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave2.nxv4bf16(<vscale x 4 x bfloat> %vec)
   ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
 }
 
 define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv8bf16(<vscale x 8 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z1.h, z0.h, z0.h
-; CHECK-NEXT:    uzp2 z2.h, z0.h, z0.h
-; CHECK-NEXT:    uunpklo z0.s, z1.h
-; CHECK-NEXT:    uunpklo z1.s, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z1.h, z0.h, z0.h
+; SVE-NEXT:    uzp2 z2.h, z0.h, z0.h
+; SVE-NEXT:    uunpklo z0.s, z1.h
+; SVE-NEXT:    uunpklo z1.s, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4bf16_nxv8bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.s, z0.h
+; SME2-NEXT:    uunpklo z0.s, z0.h
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave2.nxv8bf16(<vscale x 8 x bfloat> %vec)
   ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval
 }
 
 define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv16bf16(<vscale x 16 x bfloat> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv16bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8bf16_nxv16bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.h, z0.h, z1.h
+; SVE-NEXT:    uzp2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8bf16_nxv16bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave2.nxv16bf16(<vscale x 16 x bfloat> %vec)
   ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %retval
 }
@@ -108,141 +168,259 @@ define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8b
 ; Integers
 
 define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv32i8(<vscale x 32 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.b, z0.b, z1.b
-; CHECK-NEXT:    uzp2 z1.b, z0.b, z1.b
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.b, z0.b, z1.b
+; SVE-NEXT:    uzp2 z1.b, z0.b, z1.b
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv16i8_nxv32i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.b, z1.b }, z0.b, z1.b
+; SME2-NEXT:    ret
   %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
   ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
 }
 
 define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv16i16(<vscale x 16 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    uzp2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.h, z0.h, z1.h
+; SVE-NEXT:    uzp2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8i16_nxv16i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
   ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
 }
 
 define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxvv8i32(<vscale x 8 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.s, z0.s, z1.s
-; CHECK-NEXT:    uzp2 z1.s, z0.s, z1.s
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.s, z0.s, z1.s
+; SVE-NEXT:    uzp2 z1.s, z0.s, z1.s
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4i32_nxvv8i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
   ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
 }
 
 define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv4i64(<vscale x 4 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z2.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z2.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z1.d, z0.d, z1.d
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
 define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.b, z2.b, z3.b
-; CHECK-NEXT:    uzp1 z5.b, z0.b, z1.b
-; CHECK-NEXT:    uzp2 z3.b, z2.b, z3.b
-; CHECK-NEXT:    uzp2 z6.b, z0.b, z1.b
-; CHECK-NEXT:    uzp1 z0.b, z5.b, z4.b
-; CHECK-NEXT:    uzp2 z2.b, z5.b, z4.b
-; CHECK-NEXT:    uzp1 z1.b, z6.b, z3.b
-; CHECK-NEXT:    uzp2 z3.b, z6.b, z3.b
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.b, z2.b, z3.b
+; SVE-NEXT:    uzp1 z5.b, z0.b, z1.b
+; SVE-NEXT:    uzp2 z3.b, z2.b, z3.b
+; SVE-NEXT:    uzp2 z6.b, z0.b, z1.b
+; SVE-NEXT:    uzp1 z0.b, z5.b, z4.b
+; SVE-NEXT:    uzp2 z2.b, z5.b, z4.b
+; SVE-NEXT:    uzp1 z1.b, z6.b, z3.b
+; SVE-NEXT:    uzp2 z3.b, z6.b, z3.b
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    uzp { z0.b - z3.b }, { z0.b - z3.b }
+; SME2-NEXT:    ret
   %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %vec)
   ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
 }
 
 define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv32i16(<vscale x 32 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.h, z2.h, z3.h
-; CHECK-NEXT:    uzp1 z5.h, z0.h, z1.h
-; CHECK-NEXT:    uzp2 z3.h, z2.h, z3.h
-; CHECK-NEXT:    uzp2 z6.h, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z0.h, z5.h, z4.h
-; CHECK-NEXT:    uzp2 z2.h, z5.h, z4.h
-; CHECK-NEXT:    uzp1 z1.h, z6.h, z3.h
-; CHECK-NEXT:    uzp2 z3.h, z6.h, z3.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.h, z2.h, z3.h
+; SVE-NEXT:    uzp1 z5.h, z0.h, z1.h
+; SVE-NEXT:    uzp2 z3.h, z2.h, z3.h
+; SVE-NEXT:    uzp2 z6.h, z0.h, z1.h
+; SVE-NEXT:    uzp1 z0.h, z5.h, z4.h
+; SVE-NEXT:    uzp2 z2.h, z5.h, z4.h
+; SVE-NEXT:    uzp1 z1.h, z6.h, z3.h
+; SVE-NEXT:    uzp2 z3.h, z6.h, z3.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    uzp { z0.h - z3.h }, { z0.h - z3.h }
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %vec)
   ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
 }
 
 define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv16i32(<vscale x 16 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.s, z2.s, z3.s
-; CHECK-NEXT:    uzp1 z5.s, z0.s, z1.s
-; CHECK-NEXT:    uzp2 z3.s, z2.s, z3.s
-; CHECK-NEXT:    uzp2 z6.s, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.s, z5.s, z4.s
-; CHECK-NEXT:    uzp2 z2.s, z5.s, z4.s
-; CHECK-NEXT:    uzp1 z1.s, z6.s, z3.s
-; CHECK-NEXT:    uzp2 z3.s, z6.s, z3.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.s, z2.s, z3.s
+; SVE-NEXT:    uzp1 z5.s, z0.s, z1.s
+; SVE-NEXT:    uzp2 z3.s, z2.s, z3.s
+; SVE-NEXT:    uzp2 z6.s, z0.s, z1.s
+; SVE-NEXT:    uzp1 z0.s, z5.s, z4.s
+; SVE-NEXT:    uzp2 z2.s, z5.s, z4.s
+; SVE-NEXT:    uzp1 z1.s, z6.s, z3.s
+; SVE-NEXT:    uzp2 z3.s, z6.s, z3.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    uzp { z0.s - z3.s }, { z0.s - z3.s }
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %vec)
   ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
 }
 
 define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv8i64(<vscale x 8 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z5.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z3.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z6.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.d, z5.d, z4.d
-; CHECK-NEXT:    uzp2 z2.d, z5.d, z4.d
-; CHECK-NEXT:    uzp1 z1.d, z6.d, z3.d
-; CHECK-NEXT:    uzp2 z3.d, z6.d, z3.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.d, z2.d, z3.d
+; SVE-NEXT:    uzp1 z5.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z3.d, z2.d, z3.d
+; SVE-NEXT:    uzp2 z6.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.d, z5.d, z4.d
+; SVE-NEXT:    uzp2 z2.d, z5.d, z4.d
+; SVE-NEXT:    uzp1 z1.d, z6.d, z3.d
+; SVE-NEXT:    uzp2 z3.d, z6.d, z3.d
+; SVE-NEXT:    ret
+;
+; SME2-ALL-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; SME2-ALL:       // %bb.0:
+; SME2-ALL-NEXT:    uzp { z4.d, z5.d }, z2.d, z3.d
+; SME2-ALL-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-ALL-NEXT:    uzp { z2.d, z3.d }, z0.d, z4.d
+; SME2-ALL-NEXT:    uzp { z4.d, z5.d }, z1.d, z5.d
+; SME2-ALL-NEXT:    mov z0.d, z2.d
+; SME2-ALL-NEXT:    mov z1.d, z4.d
+; SME2-ALL-NEXT:    mov z2.d, z3.d
+; SME2-ALL-NEXT:    mov z3.d, z5.d
+; SME2-ALL-NEXT:    ret
+;
+; SME2-256-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; SME2-256:       // %bb.0:
+; SME2-256-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    uzp { z0.d - z3.d }, { z0.d - z3.d }
+; SME2-256-NEXT:    ret
   %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
 define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv16i64(<vscale x 16 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z24.d, z6.d, z7.d
-; CHECK-NEXT:    uzp1 z25.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z26.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z27.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z6.d, z6.d, z7.d
-; CHECK-NEXT:    uzp2 z4.d, z4.d, z5.d
-; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z5.d, z25.d, z24.d
-; CHECK-NEXT:    uzp2 z24.d, z25.d, z24.d
-; CHECK-NEXT:    uzp1 z7.d, z27.d, z26.d
-; CHECK-NEXT:    uzp1 z28.d, z4.d, z6.d
-; CHECK-NEXT:    uzp2 z25.d, z27.d, z26.d
-; CHECK-NEXT:    uzp1 z29.d, z0.d, z2.d
-; CHECK-NEXT:    uzp2 z26.d, z4.d, z6.d
-; CHECK-NEXT:    uzp2 z27.d, z0.d, z2.d
-; CHECK-NEXT:    uzp1 z0.d, z7.d, z5.d
-; CHECK-NEXT:    uzp1 z2.d, z25.d, z24.d
-; CHECK-NEXT:    uzp2 z4.d, z7.d, z5.d
-; CHECK-NEXT:    uzp1 z1.d, z29.d, z28.d
-; CHECK-NEXT:    uzp1 z3.d, z27.d, z26.d
-; CHECK-NEXT:    uzp2 z5.d, z29.d, z28.d
-; CHECK-NEXT:    uzp2 z6.d, z25.d, z24.d
-; CHECK-NEXT:    uzp2 z7.d, z27.d, z26.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z24.d, z6.d, z7.d
+; SVE-NEXT:    uzp1 z25.d, z4.d, z5.d
+; SVE-NEXT:    uzp1 z26.d, z2.d, z3.d
+; SVE-NEXT:    uzp1 z27.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z6.d, z6.d, z7.d
+; SVE-NEXT:    uzp2 z4.d, z4.d, z5.d
+; SVE-NEXT:    uzp2 z2.d, z2.d, z3.d
+; SVE-NEXT:    uzp2 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z5.d, z25.d, z24.d
+; SVE-NEXT:    uzp2 z24.d, z25.d, z24.d
+; SVE-NEXT:    uzp1 z7.d, z27.d, z26.d
+; SVE-NEXT:    uzp1 z28.d, z4.d, z6.d
+; SVE-NEXT:    uzp2 z25.d, z27.d, z26.d
+; SVE-NEXT:    uzp1 z29.d, z0.d, z2.d
+; SVE-NEXT:    uzp2 z26.d, z4.d, z6.d
+; SVE-NEXT:    uzp2 z27.d, z0.d, z2.d
+; SVE-NEXT:    uzp1 z0.d, z7.d, z5.d
+; SVE-NEXT:    uzp1 z2.d, z25.d, z24.d
+; SVE-NEXT:    uzp2 z4.d, z7.d, z5.d
+; SVE-NEXT:    uzp1 z1.d, z29.d, z28.d
+; SVE-NEXT:    uzp1 z3.d, z27.d, z26.d
+; SVE-NEXT:    uzp2 z5.d, z29.d, z28.d
+; SVE-NEXT:    uzp2 z6.d, z25.d, z24.d
+; SVE-NEXT:    uzp2 z7.d, z27.d, z26.d
+; SVE-NEXT:    ret
+;
+; SME2-ALL-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; SME2-ALL:       // %bb.0:
+; SME2-ALL-NEXT:    uzp { z6.d, z7.d }, z6.d, z7.d
+; SME2-ALL-NEXT:    uzp { z24.d, z25.d }, z4.d, z5.d
+; SME2-ALL-NEXT:    uzp { z26.d, z27.d }, z24.d, z6.d
+; SME2-ALL-NEXT:    uzp { z2.d, z3.d }, z2.d, z3.d
+; SME2-ALL-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-ALL-NEXT:    uzp { z28.d, z29.d }, z0.d, z2.d
+; SME2-ALL-NEXT:    uzp { z4.d, z5.d }, z28.d, z26.d
+; SME2-ALL-NEXT:    uzp { z30.d, z31.d }, z25.d, z7.d
+; SME2-ALL-NEXT:    uzp { z0.d, z1.d }, z1.d, z3.d
+; SME2-ALL-NEXT:    uzp { z6.d, z7.d }, z0.d, z30.d
+; SME2-ALL-NEXT:    uzp { z24.d, z25.d }, z29.d, z27.d
+; SME2-ALL-NEXT:    uzp { z26.d, z27.d }, z1.d, z31.d
+; SME2-ALL-NEXT:    mov z0.d, z4.d
+; SME2-ALL-NEXT:    mov z1.d, z6.d
+; SME2-ALL-NEXT:    mov z2.d, z24.d
+; SME2-ALL-NEXT:    mov z3.d, z26.d
+; SME2-ALL-NEXT:    mov z4.d, z5.d
+; SME2-ALL-NEXT:    mov z5.d, z7.d
+; SME2-ALL-NEXT:    mov z6.d, z25.d
+; SME2-ALL-NEXT:    mov z7.d, z27.d
+; SME2-ALL-NEXT:    ret
+;
+; SME2-256-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; SME2-256:       // %bb.0:
+; SME2-256-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; SME2-256-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; SME2-256-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; SME2-256-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
+; SME2-256-NEXT:    uzp { z28.d - z31.d }, { z4.d - z7.d }
+; SME2-256-NEXT:    uzp { z0.d - z3.d }, { z0.d - z3.d }
+; SME2-256-NEXT:    uzp { z4.d, z5.d }, z0.d, z28.d
+; SME2-256-NEXT:    uzp { z6.d, z7.d }, z1.d, z29.d
+; SME2-256-NEXT:    uzp { z24.d, z25.d }, z2.d, z30.d
+; SME2-256-NEXT:    uzp { z26.d, z27.d }, z3.d, z31.d
+; SME2-256-NEXT:    mov z0.d, z4.d
+; SME2-256-NEXT:    mov z1.d, z6.d
+; SME2-256-NEXT:    mov z2.d, z24.d
+; SME2-256-NEXT:    mov z3.d, z26.d
+; SME2-256-NEXT:    mov z4.d, z5.d
+; SME2-256-NEXT:    mov z5.d, z7.d
+; SME2-256-NEXT:    mov z6.d, z25.d
+; SME2-256-NEXT:    mov z7.d, z27.d
+; SME2-256-NEXT:    ret
   %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave8.nxv16i64(<vscale x 16 x i64> %vec)
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
@@ -299,39 +477,65 @@ define {<vscale x 2 x i1>, <vscale x 2 x i1>} @vector_deinterleave_nxv2i1_nxv4i1
 ; Split illegal types
 
 define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv8i64(<vscale x 8 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z5.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z6.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z3.d, z2.d, z3.d
-; CHECK-NEXT:    mov z0.d, z5.d
-; CHECK-NEXT:    mov z1.d, z4.d
-; CHECK-NEXT:    mov z2.d, z6.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z4.d, z2.d, z3.d
+; SVE-NEXT:    uzp1 z5.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z6.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z3.d, z2.d, z3.d
+; SVE-NEXT:    mov z0.d, z5.d
+; SVE-NEXT:    mov z1.d, z4.d
+; SVE-NEXT:    mov z2.d, z6.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z4.d, z5.d }, z0.d, z1.d
+; SME2-NEXT:    uzp { z6.d, z7.d }, z2.d, z3.d
+; SME2-NEXT:    mov z0.d, z4.d
+; SME2-NEXT:    mov z1.d, z6.d
+; SME2-NEXT:    mov z2.d, z5.d
+; SME2-NEXT:    mov z3.d, z7.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
   ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
 }
 
 define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv16i64(<vscale x 16 x i64> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 z24.d, z2.d, z3.d
-; CHECK-NEXT:    uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z26.d, z4.d, z5.d
-; CHECK-NEXT:    uzp1 z27.d, z6.d, z7.d
-; CHECK-NEXT:    uzp2 z28.d, z0.d, z1.d
-; CHECK-NEXT:    uzp2 z29.d, z2.d, z3.d
-; CHECK-NEXT:    uzp2 z30.d, z4.d, z5.d
-; CHECK-NEXT:    uzp2 z7.d, z6.d, z7.d
-; CHECK-NEXT:    mov z0.d, z25.d
-; CHECK-NEXT:    mov z1.d, z24.d
-; CHECK-NEXT:    mov z2.d, z26.d
-; CHECK-NEXT:    mov z3.d, z27.d
-; CHECK-NEXT:    mov z4.d, z28.d
-; CHECK-NEXT:    mov z5.d, z29.d
-; CHECK-NEXT:    mov z6.d, z30.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uzp1 z24.d, z2.d, z3.d
+; SVE-NEXT:    uzp1 z25.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z26.d, z4.d, z5.d
+; SVE-NEXT:    uzp1 z27.d, z6.d, z7.d
+; SVE-NEXT:    uzp2 z28.d, z0.d, z1.d
+; SVE-NEXT:    uzp2 z29.d, z2.d, z3.d
+; SVE-NEXT:    uzp2 z30.d, z4.d, z5.d
+; SVE-NEXT:    uzp2 z7.d, z6.d, z7.d
+; SVE-NEXT:    mov z0.d, z25.d
+; SVE-NEXT:    mov z1.d, z24.d
+; SVE-NEXT:    mov z2.d, z26.d
+; SVE-NEXT:    mov z3.d, z27.d
+; SVE-NEXT:    mov z4.d, z28.d
+; SVE-NEXT:    mov z5.d, z29.d
+; SVE-NEXT:    mov z6.d, z30.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8i64_nxv16i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uzp { z24.d, z25.d }, z0.d, z1.d
+; SME2-NEXT:    uzp { z26.d, z27.d }, z2.d, z3.d
+; SME2-NEXT:    uzp { z28.d, z29.d }, z4.d, z5.d
+; SME2-NEXT:    uzp { z30.d, z31.d }, z6.d, z7.d
+; SME2-NEXT:    mov z0.d, z24.d
+; SME2-NEXT:    mov z1.d, z26.d
+; SME2-NEXT:    mov z2.d, z28.d
+; SME2-NEXT:    mov z3.d, z30.d
+; SME2-NEXT:    mov z4.d, z25.d
+; SME2-NEXT:    mov z5.d, z27.d
+; SME2-NEXT:    mov z6.d, z29.d
+; SME2-NEXT:    mov z7.d, z31.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec)
   ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
 }
@@ -340,37 +544,58 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
 ; Promote illegal type size
 
 define {<vscale x 8 x i8>, <vscale x 8 x i8>} @vector_deinterleave_nxv8i8_nxv16i8(<vscale x 16 x i8> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv8i8_nxv16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.h, z0.b
-; CHECK-NEXT:    uunpklo z2.h, z0.b
-; CHECK-NEXT:    uzp1 z0.h, z2.h, z1.h
-; CHECK-NEXT:    uzp2 z1.h, z2.h, z1.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv8i8_nxv16i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z1.h, z0.b
+; SVE-NEXT:    uunpklo z2.h, z0.b
+; SVE-NEXT:    uzp1 z0.h, z2.h, z1.h
+; SVE-NEXT:    uzp2 z1.h, z2.h, z1.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv8i8_nxv16i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.h, z0.b
+; SME2-NEXT:    uunpklo z0.h, z0.b
+; SME2-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %vec)
   ret {<vscale x 8 x i8>, <vscale x 8 x i8>} %retval
 }
 
 define {<vscale x 4 x i16>, <vscale x 4 x i16>} @vector_deinterleave_nxv4i16_nxv8i16(<vscale x 8 x i16> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv4i16_nxv8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.s, z0.h
-; CHECK-NEXT:    uunpklo z2.s, z0.h
-; CHECK-NEXT:    uzp1 z0.s, z2.s, z1.s
-; CHECK-NEXT:    uzp2 z1.s, z2.s, z1.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv4i16_nxv8i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z1.s, z0.h
+; SVE-NEXT:    uunpklo z2.s, z0.h
+; SVE-NEXT:    uzp1 z0.s, z2.s, z1.s
+; SVE-NEXT:    uzp2 z1.s, z2.s, z1.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv4i16_nxv8i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.s, z0.h
+; SME2-NEXT:    uunpklo z0.s, z0.h
+; SME2-NEXT:    uzp { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %vec)
   ret {<vscale x 4 x i16>, <vscale x 4 x i16>} %retval
 }
 
 define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv4i32(<vscale x 4 x i32> %vec) {
-; CHECK-LABEL: vector_deinterleave_nxv2i32_nxv4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    uunpkhi z1.d, z0.s
-; CHECK-NEXT:    uunpklo z2.d, z0.s
-; CHECK-NEXT:    uzp1 z0.d, z2.d, z1.d
-; CHECK-NEXT:    uzp2 z1.d, z2.d, z1.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: vector_deinterleave_nxv2i32_nxv4i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    uunpkhi z1.d, z0.s
+; SVE-NEXT:    uunpklo z2.d, z0.s
+; SVE-NEXT:    uzp1 z0.d, z2.d, z1.d
+; SVE-NEXT:    uzp2 z1.d, z2.d, z1.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: vector_deinterleave_nxv2i32_nxv4i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    uunpkhi z1.d, z0.s
+; SME2-NEXT:    uunpklo z0.d, z0.s
+; SME2-NEXT:    uzp { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
   ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
 }
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 34d026f43708c..52cb2d9ebe343 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -1,101 +1,156 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+; RUN: llc < %s -mattr=+sve | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc < %s -mattr=+sve,+sme2 | FileCheck %s -check-prefixes=CHECK,SVE
+; RUN: llc < %s -mattr=+sme2 -force-streaming | FileCheck %s -check-prefixes=CHECK,SME2,SME-ALL
+; RUN: llc < %s -mattr=+sme2 -force-streaming -aarch64-sve-vector-bits-min=256 | FileCheck %s -check-prefixes=CHECK,SME2,SME2-256
+
+target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 4 x half> @interleave2_nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1) {
-; CHECK-LABEL: interleave2_nxv4f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip1 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.s, z0.s, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1)
   ret <vscale x 4 x half> %retval
 }
 
 define <vscale x 8 x half> @interleave2_nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1) {
-; CHECK-LABEL: interleave2_nxv8f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1)
   ret <vscale x 8 x half> %retval
 }
 
 define <vscale x 16 x half> @interleave2_nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1) {
-; CHECK-LABEL: interleave2_nxv16f16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv16f16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.h, z0.h, z1.h
+; SVE-NEXT:    zip2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv16f16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1)
   ret <vscale x 16 x half> %retval
 }
 
 define <vscale x 4 x float> @interleave2_nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1) {
-; CHECK-LABEL: interleave2_nxv4f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip1 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.s, z0.s, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4f32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1)
   ret <vscale x 4 x float> %retval
 }
 
 define <vscale x 8 x float> @interleave2_nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) {
-; CHECK-LABEL: interleave2_nxv8f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8f32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip2 z1.s, z0.s, z1.s
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8f32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1)
   ret <vscale x 8 x float> %retval
 }
 
 define <vscale x 4 x double> @interleave2_nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1) {
-; CHECK-LABEL: interleave2_nxv4f64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4f64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip2 z1.d, z0.d, z1.d
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4f64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x double>@llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1)
   ret <vscale x 4 x double> %retval
 }
 
 define <vscale x 4 x bfloat> @interleave2_nxv4bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1) {
-; CHECK-LABEL: interleave2_nxv4bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip1 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.s, z0.s, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x bfloat> @llvm.vector.interleave2.nxv4bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1)
   ret <vscale x 4 x bfloat> %retval
 }
 
 define <vscale x 8 x bfloat> @interleave2_nxv8bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1) {
-; CHECK-LABEL: interleave2_nxv8bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1)
   ret <vscale x 8 x bfloat> %retval
 }
 
 define <vscale x 16 x bfloat> @interleave2_nxv16bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1) {
-; CHECK-LABEL: interleave2_nxv16bf16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv16bf16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.h, z0.h, z1.h
+; SVE-NEXT:    zip2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv16bf16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x bfloat> @llvm.vector.interleave2.nxv16bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1)
   ret <vscale x 16 x bfloat> %retval
 }
@@ -103,141 +158,237 @@ define <vscale x 16 x bfloat> @interleave2_nxv16bf16(<vscale x 8 x bfloat> %vec0
 ; Integers
 
 define <vscale x 32 x i8> @interleave2_nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1) {
-; CHECK-LABEL: interleave2_nxv32i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.b, z0.b, z1.b
-; CHECK-NEXT:    zip2 z1.b, z0.b, z1.b
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv32i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.b, z0.b, z1.b
+; SVE-NEXT:    zip2 z1.b, z0.b, z1.b
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv32i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.b, z1.b }, z0.b, z1.b
+; SME2-NEXT:    ret
   %retval = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1)
   ret <vscale x 32 x i8> %retval
 }
 
 define <vscale x 16 x i16> @interleave2_nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1) {
-; CHECK-LABEL: interleave2_nxv16i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.h, z0.h, z1.h
-; CHECK-NEXT:    zip2 z1.h, z0.h, z1.h
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv16i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.h, z0.h, z1.h
+; SVE-NEXT:    zip2 z1.h, z0.h, z1.h
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv16i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1)
   ret <vscale x 16 x i16> %retval
 }
 
 define <vscale x 8 x i32> @interleave2_nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1) {
-; CHECK-LABEL: interleave2_nxv8i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip2 z1.s, z0.s, z1.s
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip2 z1.s, z0.s, z1.s
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1)
   ret <vscale x 8 x i32> %retval
 }
 
 define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1) {
-; CHECK-LABEL: interleave2_nxv4i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip2 z1.d, z0.d, z1.d
-; CHECK-NEXT:    mov z0.d, z2.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip2 z1.d, z0.d, z1.d
+; SVE-NEXT:    mov z0.d, z2.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1)
   ret <vscale x 4 x i64> %retval
 }
 
 define <vscale x 64 x i8> @interleave4_nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3) {
-; CHECK-LABEL: interleave4_nxv16i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.b, z1.b, z3.b
-; CHECK-NEXT:    zip1 z5.b, z0.b, z2.b
-; CHECK-NEXT:    zip2 z3.b, z1.b, z3.b
-; CHECK-NEXT:    zip2 z6.b, z0.b, z2.b
-; CHECK-NEXT:    zip1 z0.b, z5.b, z4.b
-; CHECK-NEXT:    zip2 z1.b, z5.b, z4.b
-; CHECK-NEXT:    zip1 z2.b, z6.b, z3.b
-; CHECK-NEXT:    zip2 z3.b, z6.b, z3.b
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave4_nxv16i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.b, z1.b, z3.b
+; SVE-NEXT:    zip1 z5.b, z0.b, z2.b
+; SVE-NEXT:    zip2 z3.b, z1.b, z3.b
+; SVE-NEXT:    zip2 z6.b, z0.b, z2.b
+; SVE-NEXT:    zip1 z0.b, z5.b, z4.b
+; SVE-NEXT:    zip2 z1.b, z5.b, z4.b
+; SVE-NEXT:    zip1 z2.b, z6.b, z3.b
+; SVE-NEXT:    zip2 z3.b, z6.b, z3.b
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave4_nxv16i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    zip { z0.b - z3.b }, { z0.b - z3.b }
+; SME2-NEXT:    ret
   %retval = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3)
   ret <vscale x 64 x i8> %retval
 }
 
 define <vscale x 32 x i16> @interleave4_nxv8i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2, <vscale x 8 x i16> %vec3) {
-; CHECK-LABEL: interleave4_nxv8i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.h, z1.h, z3.h
-; CHECK-NEXT:    zip1 z5.h, z0.h, z2.h
-; CHECK-NEXT:    zip2 z3.h, z1.h, z3.h
-; CHECK-NEXT:    zip2 z6.h, z0.h, z2.h
-; CHECK-NEXT:    zip1 z0.h, z5.h, z4.h
-; CHECK-NEXT:    zip2 z1.h, z5.h, z4.h
-; CHECK-NEXT:    zip1 z2.h, z6.h, z3.h
-; CHECK-NEXT:    zip2 z3.h, z6.h, z3.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave4_nxv8i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.h, z1.h, z3.h
+; SVE-NEXT:    zip1 z5.h, z0.h, z2.h
+; SVE-NEXT:    zip2 z3.h, z1.h, z3.h
+; SVE-NEXT:    zip2 z6.h, z0.h, z2.h
+; SVE-NEXT:    zip1 z0.h, z5.h, z4.h
+; SVE-NEXT:    zip2 z1.h, z5.h, z4.h
+; SVE-NEXT:    zip1 z2.h, z6.h, z3.h
+; SVE-NEXT:    zip2 z3.h, z6.h, z3.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave4_nxv8i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    zip { z0.h - z3.h }, { z0.h - z3.h }
+; SME2-NEXT:    ret
   %retval = call <vscale x 32 x i16> @llvm.vector.interleave4.nxv8i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2, <vscale x 8 x i16> %vec3)
   ret <vscale x 32 x i16> %retval
 }
 
 define <vscale x 16 x i32> @interleave4_nxv4i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, <vscale x 4 x i32> %vec3) {
-; CHECK-LABEL: interleave4_nxv4i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.s, z1.s, z3.s
-; CHECK-NEXT:    zip1 z5.s, z0.s, z2.s
-; CHECK-NEXT:    zip2 z3.s, z1.s, z3.s
-; CHECK-NEXT:    zip2 z6.s, z0.s, z2.s
-; CHECK-NEXT:    zip1 z0.s, z5.s, z4.s
-; CHECK-NEXT:    zip2 z1.s, z5.s, z4.s
-; CHECK-NEXT:    zip1 z2.s, z6.s, z3.s
-; CHECK-NEXT:    zip2 z3.s, z6.s, z3.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave4_nxv4i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.s, z1.s, z3.s
+; SVE-NEXT:    zip1 z5.s, z0.s, z2.s
+; SVE-NEXT:    zip2 z3.s, z1.s, z3.s
+; SVE-NEXT:    zip2 z6.s, z0.s, z2.s
+; SVE-NEXT:    zip1 z0.s, z5.s, z4.s
+; SVE-NEXT:    zip2 z1.s, z5.s, z4.s
+; SVE-NEXT:    zip1 z2.s, z6.s, z3.s
+; SVE-NEXT:    zip2 z3.s, z6.s, z3.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave4_nxv4i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-NEXT:    zip { z0.s - z3.s }, { z0.s - z3.s }
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv4i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, <vscale x 4 x i32> %vec3)
   ret <vscale x 16 x i32> %retval
 }
 
 define <vscale x 8 x i64> @interleave4_nxv8i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3) {
-; CHECK-LABEL: interleave4_nxv8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
-; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z3.d, z1.d, z3.d
-; CHECK-NEXT:    zip2 z6.d, z0.d, z2.d
-; CHECK-NEXT:    zip1 z0.d, z5.d, z4.d
-; CHECK-NEXT:    zip2 z1.d, z5.d, z4.d
-; CHECK-NEXT:    zip1 z2.d, z6.d, z3.d
-; CHECK-NEXT:    zip2 z3.d, z6.d, z3.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave4_nxv8i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.d, z1.d, z3.d
+; SVE-NEXT:    zip1 z5.d, z0.d, z2.d
+; SVE-NEXT:    zip2 z3.d, z1.d, z3.d
+; SVE-NEXT:    zip2 z6.d, z0.d, z2.d
+; SVE-NEXT:    zip1 z0.d, z5.d, z4.d
+; SVE-NEXT:    zip2 z1.d, z5.d, z4.d
+; SVE-NEXT:    zip1 z2.d, z6.d, z3.d
+; SVE-NEXT:    zip2 z3.d, z6.d, z3.d
+; SVE-NEXT:    ret
+;
+; SME-ALL-LABEL: interleave4_nxv8i64:
+; SME-ALL:       // %bb.0:
+; SME-ALL-NEXT:    zip { z4.d, z5.d }, z1.d, z3.d
+; SME-ALL-NEXT:    zip { z2.d, z3.d }, z0.d, z2.d
+; SME-ALL-NEXT:    zip { z0.d, z1.d }, z2.d, z4.d
+; SME-ALL-NEXT:    zip { z2.d, z3.d }, z3.d, z5.d
+; SME-ALL-NEXT:    ret
+;
+; SME2-256-LABEL: interleave4_nxv8i64:
+; SME2-256:       // %bb.0:
+; SME2-256-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
+; SME2-256-NEXT:    zip { z0.d - z3.d }, { z0.d - z3.d }
+; SME2-256-NEXT:    ret
   %retval = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3)
   ret <vscale x 8 x i64> %retval
 }
 
 define <vscale x 16 x i64> @interleave8_nxv16i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3, <vscale x 2 x i64> %vec4, <vscale x 2 x i64> %vec5, <vscale x 2 x i64> %vec6, <vscale x 2 x i64> %vec7) {
-; CHECK-LABEL: interleave8_nxv16i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z24.d, z3.d, z7.d
-; CHECK-NEXT:    zip1 z25.d, z1.d, z5.d
-; CHECK-NEXT:    zip1 z26.d, z2.d, z6.d
-; CHECK-NEXT:    zip1 z27.d, z0.d, z4.d
-; CHECK-NEXT:    zip2 z3.d, z3.d, z7.d
-; CHECK-NEXT:    zip2 z1.d, z1.d, z5.d
-; CHECK-NEXT:    zip2 z2.d, z2.d, z6.d
-; CHECK-NEXT:    zip2 z0.d, z0.d, z4.d
-; CHECK-NEXT:    zip1 z4.d, z25.d, z24.d
-; CHECK-NEXT:    zip2 z6.d, z25.d, z24.d
-; CHECK-NEXT:    zip1 z5.d, z27.d, z26.d
-; CHECK-NEXT:    zip2 z7.d, z27.d, z26.d
-; CHECK-NEXT:    zip1 z24.d, z1.d, z3.d
-; CHECK-NEXT:    zip1 z25.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z26.d, z1.d, z3.d
-; CHECK-NEXT:    zip2 z27.d, z0.d, z2.d
-; CHECK-NEXT:    zip1 z0.d, z5.d, z4.d
-; CHECK-NEXT:    zip2 z1.d, z5.d, z4.d
-; CHECK-NEXT:    zip1 z2.d, z7.d, z6.d
-; CHECK-NEXT:    zip2 z3.d, z7.d, z6.d
-; CHECK-NEXT:    zip1 z4.d, z25.d, z24.d
-; CHECK-NEXT:    zip2 z5.d, z25.d, z24.d
-; CHECK-NEXT:    zip1 z6.d, z27.d, z26.d
-; CHECK-NEXT:    zip2 z7.d, z27.d, z26.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave8_nxv16i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z24.d, z3.d, z7.d
+; SVE-NEXT:    zip1 z25.d, z1.d, z5.d
+; SVE-NEXT:    zip1 z26.d, z2.d, z6.d
+; SVE-NEXT:    zip1 z27.d, z0.d, z4.d
+; SVE-NEXT:    zip2 z3.d, z3.d, z7.d
+; SVE-NEXT:    zip2 z1.d, z1.d, z5.d
+; SVE-NEXT:    zip2 z2.d, z2.d, z6.d
+; SVE-NEXT:    zip2 z0.d, z0.d, z4.d
+; SVE-NEXT:    zip1 z4.d, z25.d, z24.d
+; SVE-NEXT:    zip2 z6.d, z25.d, z24.d
+; SVE-NEXT:    zip1 z5.d, z27.d, z26.d
+; SVE-NEXT:    zip2 z7.d, z27.d, z26.d
+; SVE-NEXT:    zip1 z24.d, z1.d, z3.d
+; SVE-NEXT:    zip1 z25.d, z0.d, z2.d
+; SVE-NEXT:    zip2 z26.d, z1.d, z3.d
+; SVE-NEXT:    zip2 z27.d, z0.d, z2.d
+; SVE-NEXT:    zip1 z0.d, z5.d, z4.d
+; SVE-NEXT:    zip2 z1.d, z5.d, z4.d
+; SVE-NEXT:    zip1 z2.d, z7.d, z6.d
+; SVE-NEXT:    zip2 z3.d, z7.d, z6.d
+; SVE-NEXT:    zip1 z4.d, z25.d, z24.d
+; SVE-NEXT:    zip2 z5.d, z25.d, z24.d
+; SVE-NEXT:    zip1 z6.d, z27.d, z26.d
+; SVE-NEXT:    zip2 z7.d, z27.d, z26.d
+; SVE-NEXT:    ret
+;
+; SME-ALL-LABEL: interleave8_nxv16i64:
+; SME-ALL:       // %bb.0:
+; SME-ALL-NEXT:    zip { z24.d, z25.d }, z3.d, z7.d
+; SME-ALL-NEXT:    zip { z26.d, z27.d }, z1.d, z5.d
+; SME-ALL-NEXT:    zip { z28.d, z29.d }, z26.d, z24.d
+; SME-ALL-NEXT:    zip { z6.d, z7.d }, z2.d, z6.d
+; SME-ALL-NEXT:    zip { z4.d, z5.d }, z0.d, z4.d
+; SME-ALL-NEXT:    zip { z2.d, z3.d }, z4.d, z6.d
+; SME-ALL-NEXT:    zip { z0.d, z1.d }, z2.d, z28.d
+; SME-ALL-NEXT:    zip { z2.d, z3.d }, z3.d, z29.d
+; SME-ALL-NEXT:    zip { z24.d, z25.d }, z27.d, z25.d
+; SME-ALL-NEXT:    zip { z6.d, z7.d }, z5.d, z7.d
+; SME-ALL-NEXT:    zip { z4.d, z5.d }, z6.d, z24.d
+; SME-ALL-NEXT:    zip { z6.d, z7.d }, z7.d, z25.d
+; SME-ALL-NEXT:    ret
+;
+; SME2-256-LABEL: interleave8_nxv16i64:
+; SME2-256:       // %bb.0:
+; SME2-256-NEXT:    zip { z26.d, z27.d }, z3.d, z7.d
+; SME2-256-NEXT:    zip { z6.d, z7.d }, z2.d, z6.d
+; SME2-256-NEXT:    zip { z24.d, z25.d }, z1.d, z5.d
+; SME2-256-NEXT:    zip { z0.d, z1.d }, z0.d, z4.d
+; SME2-256-NEXT:    mov z28.d, z0.d
+; SME2-256-NEXT:    mov z29.d, z24.d
+; SME2-256-NEXT:    mov z30.d, z6.d
+; SME2-256-NEXT:    mov z31.d, z26.d
+; SME2-256-NEXT:    mov z24.d, z1.d
+; SME2-256-NEXT:    mov z26.d, z7.d
+; SME2-256-NEXT:    zip { z0.d - z3.d }, { z28.d - z31.d }
+; SME2-256-NEXT:    zip { z4.d - z7.d }, { z24.d - z27.d }
+; SME2-256-NEXT:    ret
   %retval = call <vscale x 16 x i64> @llvm.vector.interleave8.nxv16i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3, <vscale x 2 x i64> %vec4, <vscale x 2 x i64> %vec5, <vscale x 2 x i64> %vec6, <vscale x 2 x i64> %vec7)
   ret <vscale x 16 x i64> %retval
 }
@@ -291,31 +442,47 @@ define <vscale x 4 x i1> @interleave2_nxv4i1(<vscale x 2 x i1> %vec0, <vscale x
 ; Split illegal type size
 
 define <vscale x 16 x i32> @interleave2_nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1) {
-; CHECK-LABEL: interleave2_nxv16i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.s, z1.s, z3.s
-; CHECK-NEXT:    zip1 z5.s, z0.s, z2.s
-; CHECK-NEXT:    zip2 z2.s, z0.s, z2.s
-; CHECK-NEXT:    zip2 z3.s, z1.s, z3.s
-; CHECK-NEXT:    mov z0.d, z5.d
-; CHECK-NEXT:    mov z1.d, z2.d
-; CHECK-NEXT:    mov z2.d, z4.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv16i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.s, z1.s, z3.s
+; SVE-NEXT:    zip1 z5.s, z0.s, z2.s
+; SVE-NEXT:    zip2 z2.s, z0.s, z2.s
+; SVE-NEXT:    zip2 z3.s, z1.s, z3.s
+; SVE-NEXT:    mov z0.d, z5.d
+; SVE-NEXT:    mov z1.d, z2.d
+; SVE-NEXT:    mov z2.d, z4.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv16i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z4.s, z5.s }, z0.s, z2.s
+; SME2-NEXT:    zip { z2.s, z3.s }, z1.s, z3.s
+; SME2-NEXT:    mov z0.d, z4.d
+; SME2-NEXT:    mov z1.d, z5.d
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x i32>@llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %vec0, <vscale x 8 x i32> %vec1)
   ret <vscale x 16 x i32> %retval
 }
 
 define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1) {
-; CHECK-LABEL: interleave2_nxv8i64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
-; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z2.d, z0.d, z2.d
-; CHECK-NEXT:    zip2 z3.d, z1.d, z3.d
-; CHECK-NEXT:    mov z0.d, z5.d
-; CHECK-NEXT:    mov z1.d, z2.d
-; CHECK-NEXT:    mov z2.d, z4.d
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8i64:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip1 z4.d, z1.d, z3.d
+; SVE-NEXT:    zip1 z5.d, z0.d, z2.d
+; SVE-NEXT:    zip2 z2.d, z0.d, z2.d
+; SVE-NEXT:    zip2 z3.d, z1.d, z3.d
+; SVE-NEXT:    mov z0.d, z5.d
+; SVE-NEXT:    mov z1.d, z2.d
+; SVE-NEXT:    mov z2.d, z4.d
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8i64:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z4.d, z5.d }, z0.d, z2.d
+; SME2-NEXT:    zip { z2.d, z3.d }, z1.d, z3.d
+; SME2-NEXT:    mov z0.d, z4.d
+; SME2-NEXT:    mov z1.d, z5.d
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %vec0, <vscale x 4 x i64> %vec1)
   ret <vscale x 8 x i64> %retval
 }
@@ -323,34 +490,52 @@ define <vscale x 8 x i64> @interleave2_nxv8i64(<vscale x 4 x i64> %vec0, <vscale
 ; Promote illegal type size
 
 define <vscale x 16 x i8> @interleave2_nxv8i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1) {
-; CHECK-LABEL: interleave2_nxv8i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.h, z0.h, z1.h
-; CHECK-NEXT:    zip1 z0.h, z0.h, z1.h
-; CHECK-NEXT:    uzp1 z0.b, z0.b, z2.b
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv8i8:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.h, z0.h, z1.h
+; SVE-NEXT:    zip1 z0.h, z0.h, z1.h
+; SVE-NEXT:    uzp1 z0.b, z0.b, z2.b
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv8i8:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.h, z1.h }, z0.h, z1.h
+; SME2-NEXT:    uzp1 z0.b, z0.b, z1.b
+; SME2-NEXT:    ret
   %retval = call <vscale x 16 x i8> @llvm.vector.interleave2.nxv16i8(<vscale x 8 x i8> %vec0, <vscale x 8 x i8> %vec1)
   ret <vscale x 16 x i8> %retval
 }
 
 define <vscale x 8 x i16> @interleave2_nxv4i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1) {
-; CHECK-LABEL: interleave2_nxv4i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.s, z0.s, z1.s
-; CHECK-NEXT:    zip1 z0.s, z0.s, z1.s
-; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv4i16:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.s, z0.s, z1.s
+; SVE-NEXT:    zip1 z0.s, z0.s, z1.s
+; SVE-NEXT:    uzp1 z0.h, z0.h, z2.h
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv4i16:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.s, z1.s }, z0.s, z1.s
+; SME2-NEXT:    uzp1 z0.h, z0.h, z1.h
+; SME2-NEXT:    ret
   %retval = call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> %vec0, <vscale x 4 x i16> %vec1)
   ret <vscale x 8 x i16> %retval
 }
 
 define <vscale x 4 x i32> @interleave2_nxv2i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1) {
-; CHECK-LABEL: interleave2_nxv2i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    zip2 z2.d, z0.d, z1.d
-; CHECK-NEXT:    zip1 z0.d, z0.d, z1.d
-; CHECK-NEXT:    uzp1 z0.s, z0.s, z2.s
-; CHECK-NEXT:    ret
+; SVE-LABEL: interleave2_nxv2i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    zip2 z2.d, z0.d, z1.d
+; SVE-NEXT:    zip1 z0.d, z0.d, z1.d
+; SVE-NEXT:    uzp1 z0.s, z0.s, z2.s
+; SVE-NEXT:    ret
+;
+; SME2-LABEL: interleave2_nxv2i32:
+; SME2:       // %bb.0:
+; SME2-NEXT:    zip { z0.d, z1.d }, z0.d, z1.d
+; SME2-NEXT:    uzp1 z0.s, z0.s, z1.s
+; SME2-NEXT:    ret
   %retval = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %vec0, <vscale x 2 x i32> %vec1)
   ret <vscale x 4 x i32> %retval
 }

From d517f15e09e49e172387cb6deb76e4ee2d45d0e4 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 12:30:35 +0200
Subject: [PATCH 210/851] [LICM] Regenerate test checks (NFC)

---
 llvm/test/Transforms/LICM/call-hoisting.ll | 247 ++++++++++++++++-----
 1 file changed, 191 insertions(+), 56 deletions(-)

diff --git a/llvm/test/Transforms/LICM/call-hoisting.ll b/llvm/test/Transforms/LICM/call-hoisting.ll
index e6d2e42e34e81..907f13438623a 100644
--- a/llvm/test/Transforms/LICM/call-hoisting.ll
+++ b/llvm/test/Transforms/LICM/call-hoisting.ll
@@ -1,13 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=licm %s | FileCheck %s
 ; RUN: opt -aa-pipeline=basic-aa -passes='require<aa>,require<target-ir>,require<scalar-evolution>,require<opt-remark-emit>,loop-mssa(licm)' < %s -S | FileCheck %s
 
 declare i32 @load(ptr %p) argmemonly readonly nounwind
 
 define void @test_load(ptr noalias %loc, ptr noalias %sink) {
-; CHECK-LABEL: @test_load
-; CHECK-LABEL: entry:
-; CHECK: call i32 @load
-; CHECK-LABEL: loop:
+; CHECK-LABEL: define void @test_load(
+; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[SINK:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[RET:%.*]] = call i32 @load(ptr [[LOC]])
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    store volatile i32 [[RET]], ptr [[SINK]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -30,15 +41,26 @@ declare i32 @spec(ptr %p, ptr %q) readonly argmemonly nounwind speculatable
 ; However, we need not strip the nonnull attribute since it just propagates
 ; poison if the parameter was indeed null.
 define void @test_strip_attribute(ptr noalias %loc, ptr noalias %sink, ptr %q) {
-; CHECK-LABEL: @test_strip_attribute(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[RET:%.*]] = call i32 @load(ptr [[LOC:%.*]])
-; CHECK-NEXT:    [[NULLCHK:%.*]] = icmp eq ptr [[Q:%.*]], null
+; CHECK-LABEL: define void @test_strip_attribute(
+; CHECK-SAME: ptr noalias [[LOC:%.*]], ptr noalias [[SINK:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[RET:%.*]] = call i32 @load(ptr [[LOC]])
+; CHECK-NEXT:    [[NULLCHK:%.*]] = icmp eq ptr [[Q]], null
 ; CHECK-NEXT:    [[RET2:%.*]] = call i32 @spec(ptr nonnull [[Q]], ptr [[LOC]])
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[ISNULL:%.*]] ]
-; CHECK-NEXT:    br i1 [[NULLCHK]], label [[ISNULL]], label [[NONNULLBB:%.*]]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[ISNULL:.*]] ]
+; CHECK-NEXT:    br i1 [[NULLCHK]], label %[[ISNULL]], label %[[NONNULLBB:.*]]
+; CHECK:       [[NONNULLBB]]:
+; CHECK-NEXT:    br label %[[ISNULL]]
+; CHECK:       [[ISNULL]]:
+; CHECK-NEXT:    store volatile i32 [[RET]], ptr [[SINK]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -65,10 +87,19 @@ exit:
 declare void @store(i32 %val, ptr %p) argmemonly writeonly nounwind
 
 define void @test(ptr %loc) {
-; CHECK-LABEL: @test
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -84,10 +115,23 @@ exit:
 }
 
 define void @test_multiexit(ptr %loc, i1 %earlycnd) {
-; CHECK-LABEL: @test_multiexit
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: backedge:
+; CHECK-LABEL: define void @test_multiexit(
+; CHECK-SAME: ptr [[LOC:%.*]], i1 [[EARLYCND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    br i1 [[EARLYCND]], label %[[EXIT1:.*]], label %[[BACKEDGE]]
+; CHECK:       [[BACKEDGE]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT2:.*]]
+; CHECK:       [[EXIT1]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[EXIT2]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -108,10 +152,19 @@ exit2:
 }
 
 define void @neg_lv_value(ptr %loc) {
-; CHECK-LABEL: @neg_lv_value
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_lv_value(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 [[IV]], ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -127,10 +180,20 @@ exit:
 }
 
 define void @neg_lv_addr(ptr %loc) {
-; CHECK-LABEL: @neg_lv_addr
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_lv_addr(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[P:%.*]] = getelementptr i32, ptr [[LOC]], i32 [[IV]]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[P]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -147,10 +210,20 @@ exit:
 }
 
 define void @neg_mod(ptr %loc) {
-; CHECK-LABEL: @neg_mod
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_mod(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    store i32 [[IV]], ptr [[LOC]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -167,10 +240,25 @@ exit:
 }
 
 define void @neg_ref(ptr %loc) {
-; CHECK-LABEL: @neg_ref
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit1:
+; CHECK-LABEL: define void @neg_ref(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[LOC]], align 4
+; CHECK-NEXT:    [[EARLYCND:%.*]] = icmp eq i32 [[V]], 198
+; CHECK-NEXT:    br i1 [[EARLYCND]], label %[[EXIT1:.*]], label %[[BACKEDGE]]
+; CHECK:       [[BACKEDGE]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT2:.*]]
+; CHECK:       [[EXIT1]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[EXIT2]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -195,10 +283,20 @@ exit2:
 declare void @modref()
 
 define void @neg_modref(ptr %loc) {
-; CHECK-LABEL: @neg_modref
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_modref(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    call void @modref()
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -215,10 +313,20 @@ exit:
 }
 
 define void @neg_fence(ptr %loc) {
-; CHECK-LABEL: @neg_fence
-; CHECK-LABEL: loop:
-; CHECK: call void @store
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_fence(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @store(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    fence seq_cst
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -239,10 +347,19 @@ declare void @not_argmemonly(i32 %v, ptr %p) writeonly nounwind
 declare void @not_writeonly(i32 %v, ptr %p) argmemonly nounwind
 
 define void @neg_not_nounwind(ptr %loc) {
-; CHECK-LABEL: @neg_not_nounwind
-; CHECK-LABEL: loop:
-; CHECK: call void @not_nounwind
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_not_nounwind(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @not_nounwind(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -258,10 +375,19 @@ exit:
 }
 
 define void @neg_not_argmemonly(ptr %loc) {
-; CHECK-LABEL: @neg_not_argmemonly
-; CHECK-LABEL: loop:
-; CHECK: call void @not_argmemonly
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_not_argmemonly(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @not_argmemonly(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 
@@ -277,10 +403,19 @@ exit:
 }
 
 define void @neg_not_writeonly(ptr %loc) {
-; CHECK-LABEL: @neg_not_writeonly
-; CHECK-LABEL: loop:
-; CHECK: call void @not_writeonly
-; CHECK-LABEL: exit:
+; CHECK-LABEL: define void @neg_not_writeonly(
+; CHECK-SAME: ptr [[LOC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @not_writeonly(i32 0, ptr [[LOC]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV]], 200
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
 entry:
   br label %loop
 

From 971c49fbf361c22ccf20913f61a58c28b26c4e27 Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil2001@gmail.com>
Date: Thu, 12 Jun 2025 16:01:43 +0530
Subject: [PATCH 211/851] [InstCombine] Ensure Safe Handling of Flags in
 foldFNegIntoConstant (#94148)

Fix #93769

alive2: https://alive2.llvm.org/ce/z/MHShQY
---
 .../InstCombine/InstCombineAddSub.cpp         |  10 +-
 llvm/test/Transforms/InstCombine/fneg.ll      | 166 +++++++++++++++++-
 llvm/test/Transforms/InstCombine/fsub.ll      |   2 +-
 3 files changed, 173 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index fc7dd302b27a5..f0f709bb16d8a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2857,8 +2857,14 @@ static Instruction *foldFNegIntoConstant(Instruction &I, const DataLayout &DL) {
   // Fold negation into constant operand.
   // -(X * C) --> X * (-C)
   if (match(FNegOp, m_FMul(m_Value(X), m_Constant(C))))
-    if (Constant *NegC = ConstantFoldUnaryOpOperand(Instruction::FNeg, C, DL))
-      return BinaryOperator::CreateFMulFMF(X, NegC, &I);
+    if (Constant *NegC = ConstantFoldUnaryOpOperand(Instruction::FNeg, C, DL)) {
+      FastMathFlags FNegF = I.getFastMathFlags();
+      FastMathFlags OpF = FNegOp->getFastMathFlags();
+      FastMathFlags FMF = FastMathFlags::unionValue(FNegF, OpF) |
+                          FastMathFlags::intersectRewrite(FNegF, OpF);
+      FMF.setNoInfs(FNegF.noInfs() && OpF.noInfs());
+      return BinaryOperator::CreateFMulFMF(X, NegC, FMF);
+    }
   // -(X / C) --> X / (-C)
   if (match(FNegOp, m_FDiv(m_Value(X), m_Constant(C))))
     if (Constant *NegC = ConstantFoldUnaryOpOperand(Instruction::FNeg, C, DL))
diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll
index a9d1b9a4ab837..39117f56fa4e1 100644
--- a/llvm/test/Transforms/InstCombine/fneg.ll
+++ b/llvm/test/Transforms/InstCombine/fneg.ll
@@ -42,7 +42,7 @@ define float @fmul_fneg(float %x) {
 
 define float @fmul_fsub_fmf(float %x) {
 ; CHECK-LABEL: @fmul_fsub_fmf(
-; CHECK-NEXT:    [[R:%.*]] = fmul reassoc nsz float [[X:%.*]], -4.200000e+01
+; CHECK-NEXT:    [[R:%.*]] = fmul nsz float [[X:%.*]], -4.200000e+01
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %m = fmul float %x, 42.0
@@ -52,7 +52,7 @@ define float @fmul_fsub_fmf(float %x) {
 
 define float @fmul_fneg_fmf(float %x) {
 ; CHECK-LABEL: @fmul_fneg_fmf(
-; CHECK-NEXT:    [[R:%.*]] = fmul reassoc nsz float [[X:%.*]], -4.200000e+01
+; CHECK-NEXT:    [[R:%.*]] = fmul nsz float [[X:%.*]], -4.200000e+01
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %m = fmul float %x, 42.0
@@ -1142,4 +1142,166 @@ define <vscale x 2 x double> @test_fneg_select_svec_3(<vscale x 2 x i1> %cond, <
   ret <vscale x 2 x double> %2
 }
 
+define float @test_fneg_ninf_mul_with_anyzero(float %a) {
+; CHECK-LABEL: @test_fneg_ninf_mul_with_anyzero(
+; CHECK-NEXT:    [[F:%.*]] = fmul float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret float [[F]]
+;
+  %mul = fmul float %a, 0.0
+  %f = fneg ninf float %mul
+  ret float %f
+}
+
+define float @test_fsub_ninf_mul_with_anyzero(float %a) {
+; CHECK-LABEL: @test_fsub_ninf_mul_with_anyzero(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nsz float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul nsz float %a, 0.000000
+  %f2 = fsub ninf float -0.000000, %f1
+  ret float %f2
+}
+
+define float @test_fneg_nnan_mul_with_anyzero(float %a) {
+; CHECK-LABEL: @test_fneg_nnan_mul_with_anyzero(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg nnan float [[A:%.*]]
+; CHECK-NEXT:    [[F2:%.*]] = call nnan float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul ninf float %a, 0.000000
+  %f2 = fneg nnan float %f1
+  ret float %f2
+}
+
+define float @test_fneg_nsz_mul_with_anyzero(float %a) {
+; CHECK-LABEL: @test_fneg_nsz_mul_with_anyzero(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nsz float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul ninf float %a, 0.000000
+  %f2 = fneg nsz float %f1
+  ret float %f2
+}
+
+define float @test_fneg_ninf_mul_nnan_with_const(float %a) {
+; CHECK-LABEL: @test_fneg_ninf_mul_nnan_with_const(
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg float [[A:%.*]]
+; CHECK-NEXT:    [[F2:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul nnan float %a, 0.000000
+  %f2 = fneg ninf float %f1
+  ret float %f2
+}
+
+define float @test_fneg_ninf_mul_nsz_with_const(float %a) {
+; CHECK-LABEL: @test_fneg_ninf_mul_nsz_with_const(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nsz float [[A:%.*]], -0.000000e+00
+; CHECK-NEXT:    ret float [[F2]]
+;
+  %f1 = fmul nsz float %a, 0.000000
+  %f2 = fneg ninf float %f1
+  ret float %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_nnan_ninf_with_vec_const(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_nnan_ninf_with_vec_const(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nnan <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul nnan <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_nsz_ninf_with_vec_const(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_nsz_ninf_with_vec_const(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nsz <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul nsz <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_ninf_nnan_mul_with_vec_const(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_ninf_nnan_mul_with_vec_const(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nnan <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg nnan ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_nnan_ninf_with_vec_const2(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_nnan_ninf_with_vec_const2(
+; CHECK-NEXT:    [[F2:%.*]] = fmul nnan ninf <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul ninf <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg nnan ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_reassoc_ninf_with_vec_const1(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_reassoc_ninf_with_vec_const1(
+; CHECK-NEXT:    [[F2:%.*]] = fmul <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul reassoc <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_reassoc_ninf_with_vec_const2(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_reassoc_ninf_with_vec_const2(
+; CHECK-NEXT:    [[F2:%.*]] = fmul ninf <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul ninf <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg reassoc ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_reassoc_ninf_with_vec_const3(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_reassoc_ninf_with_vec_const3(
+; CHECK-NEXT:    [[F2:%.*]] = fmul reassoc <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul reassoc <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg reassoc ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_contract_ninf_with_vec_const1(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_contract_ninf_with_vec_const1(
+; CHECK-NEXT:    [[F2:%.*]] = fmul <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul contract <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_contract_ninf_with_vec_const2(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_contract_ninf_with_vec_const2(
+; CHECK-NEXT:    [[F2:%.*]] = fmul ninf <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul ninf <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg contract ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
+define <2 x float> @test_fneg_mul_combine_contract_ninf_with_vec_const3(<2 x float> %a) {
+; CHECK-LABEL: @test_fneg_mul_combine_contract_ninf_with_vec_const3(
+; CHECK-NEXT:    [[F2:%.*]] = fmul contract <2 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00>
+; CHECK-NEXT:    ret <2 x float> [[F2]]
+;
+  %f1 = fmul contract <2 x float> %a, <float 0.000000, float -0.000000>
+  %f2 = fneg contract ninf <2 x float> %f1
+  ret <2 x float> %f2
+}
+
 !0 = !{}
diff --git a/llvm/test/Transforms/InstCombine/fsub.ll b/llvm/test/Transforms/InstCombine/fsub.ll
index cffc63405ddcb..28cee50d72c12 100644
--- a/llvm/test/Transforms/InstCombine/fsub.ll
+++ b/llvm/test/Transforms/InstCombine/fsub.ll
@@ -98,7 +98,7 @@ define float @sub_sub_nsz(float %x, float %y, float %z) {
 
 define float @sub_add_neg_x(float %x, float %y) {
 ; CHECK-LABEL: @sub_add_neg_x(
-; CHECK-NEXT:    [[R:%.*]] = fmul reassoc nsz float [[X:%.*]], -5.000000e+00
+; CHECK-NEXT:    [[R:%.*]] = fmul nsz float [[X:%.*]], -5.000000e+00
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %mul = fmul float %x, 5.000000e+00

From 20d5d09e99188dfc7df6e4e4f1c37512e0ab318e Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Thu, 12 Jun 2025 11:37:25 +0100
Subject: [PATCH 212/851] [compiler-rt] remove unused default in compiler-rt
 lit tests (#143738)

In https://github.com/llvm/llvm-project/pull/143183 was mistakenly added
a default value to `python_root_dir` in lit tests of compiler-rt.

This is unused by the lit tests of compiler-rt, as it was meant to be
used by `lldb`.

This patch removes this change.
---
 compiler-rt/test/lit.common.configured.in | 1 -
 1 file changed, 1 deletion(-)

diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in
index 8ca47a8df5aed..04d1a4df5a54f 100644
--- a/compiler-rt/test/lit.common.configured.in
+++ b/compiler-rt/test/lit.common.configured.in
@@ -25,7 +25,6 @@ set_default("gold_executable", "@GOLD_EXECUTABLE@")
 set_default("clang", "@COMPILER_RT_RESOLVED_TEST_COMPILER@")
 set_default("compiler_id", "@COMPILER_RT_TEST_COMPILER_ID@")
 set_default("python_executable", "@Python3_EXECUTABLE@")
-set_default("python_root_dir", "@Python3_ROOT_DIR@")
 set_default("compiler_rt_debug", @COMPILER_RT_DEBUG_PYBOOL@)
 set_default("compiler_rt_intercept_libdispatch", @COMPILER_RT_INTERCEPT_LIBDISPATCH_PYBOOL@)
 set_default("compiler_rt_output_dir", "@COMPILER_RT_RESOLVED_OUTPUT_DIR@")

From fe28ea37b640ea4842583df3b89e08877220fb8e Mon Sep 17 00:00:00 2001
From: hev <wangrui@loongson.cn>
Date: Thu, 12 Jun 2025 18:39:16 +0800
Subject: [PATCH 213/851] [LoongArch] Add demanded bits support for [X]VMSKLTZ
 (#143528)

This patch adds a DAG combine hook for the [X]VMSKLTZ nodes to simplify
their input when possible. It also implements target-specific logic in
SimplifyDemandedBitsForTargetNode to optimize away unnecessary
computations when only a subset of the sign bits in the vector results
is actually used.
---
 .../LoongArch/LoongArchISelLowering.cpp       | 73 +++++++++++++++++++
 .../Target/LoongArch/LoongArchISelLowering.h  |  6 ++
 llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll | 15 +---
 llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll   | 16 ----
 4 files changed, 82 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index b869ad25e7852..99dae6ec3eb08 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -5634,6 +5634,21 @@ static SDValue performMOVFR2GR_SCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue performVMSKLTZCombine(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const LoongArchSubtarget &Subtarget) {
+  MVT VT = N->getSimpleValueType(0);
+  unsigned NumBits = VT.getScalarSizeInBits();
+
+  // Simplify the inputs.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  APInt DemandedMask(APInt::getAllOnes(NumBits));
+  if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -5658,6 +5673,9 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performMOVGR2FR_WCombine(N, DAG, DCI, Subtarget);
   case LoongArchISD::MOVFR2GR_S_LA64:
     return performMOVFR2GR_SCombine(N, DAG, DCI, Subtarget);
+  case LoongArchISD::VMSKLTZ:
+  case LoongArchISD::XVMSKLTZ:
+    return performVMSKLTZCombine(N, DAG, DCI, Subtarget);
   }
   return SDValue();
 }
@@ -8192,3 +8210,58 @@ unsigned LoongArchTargetLowering::getNumRegistersForCallingConv(
 
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
 }
+
+bool LoongArchTargetLowering::SimplifyDemandedBitsForTargetNode(
+    SDValue Op, const APInt &OriginalDemandedBits,
+    const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+    unsigned Depth) const {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = OriginalDemandedBits.getBitWidth();
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  default:
+    break;
+  case LoongArchISD::VMSKLTZ:
+  case LoongArchISD::XVMSKLTZ: {
+    SDValue Src = Op.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    unsigned SrcBits = SrcVT.getScalarSizeInBits();
+    unsigned NumElts = SrcVT.getVectorNumElements();
+
+    // If we don't need the sign bits at all just return zero.
+    if (OriginalDemandedBits.countr_zero() >= NumElts)
+      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+    // Only demand the vector elements of the sign bits we need.
+    APInt KnownUndef, KnownZero;
+    APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
+    if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+                                   TLO, Depth + 1))
+      return true;
+
+    Known.Zero = KnownZero.zext(BitWidth);
+    Known.Zero.setHighBits(BitWidth - NumElts);
+
+    // [X]VMSKLTZ only uses the MSB from each vector element.
+    KnownBits KnownSrc;
+    APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+    if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+                             Depth + 1))
+      return true;
+
+    if (KnownSrc.One[SrcBits - 1])
+      Known.One.setLowBits(NumElts);
+    else if (KnownSrc.Zero[SrcBits - 1])
+      Known.Zero.setLowBits(NumElts);
+
+    // Attempt to avoid multi-use ops if we don't need anything from it.
+    if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+            Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+      return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
+    return false;
+  }
+  }
+
+  return TargetLowering::SimplifyDemandedBitsForTargetNode(
+      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 53e3f1adb8d27..79aa89726191b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -314,6 +314,12 @@ class LoongArchTargetLowering : public TargetLowering {
   bool isFPImmVLDILegal(const APFloat &Imm, EVT VT) const;
   LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
 
+  bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits,
+                                         const APInt &DemandedElts,
+                                         KnownBits &Known,
+                                         TargetLoweringOpt &TLO,
+                                         unsigned Depth) const override;
+
 private:
   /// Target-specific function used to lower LoongArch calling conventions.
   typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI,
diff --git a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
index 7e015852e0abc..5a861be95977d 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll
@@ -383,9 +383,8 @@ define i8 @xvmsk_eq_vsel_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
 ; CHECK-LABEL: xvmsk_eq_vsel_slt_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvseq.w $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvslti.w $xr1, $xr2, 0
-; CHECK-NEXT:    xvrepli.b $xr2, -1
-; CHECK-NEXT:    xvbitsel.v $xr0, $xr1, $xr2, $xr0
+; CHECK-NEXT:    xvrepli.b $xr1, -1
+; CHECK-NEXT:    xvbitsel.v $xr0, $xr2, $xr1, $xr0
 ; CHECK-NEXT:    xvmskltz.w $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
@@ -408,8 +407,7 @@ define i8 @xvmsk_sel_eq_or_eq_or_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
 ; CHECK-NEXT:    xvreplgr2vr.w $xr4, $a0
 ; CHECK-NEXT:    xvand.v $xr2, $xr2, $xr4
 ; CHECK-NEXT:    xvseq.w $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvslti.w $xr1, $xr3, 0
-; CHECK-NEXT:    xvor.v $xr0, $xr1, $xr0
+; CHECK-NEXT:    xvor.v $xr0, $xr3, $xr0
 ; CHECK-NEXT:    xvor.v $xr0, $xr0, $xr2
 ; CHECK-NEXT:    xvmskltz.w $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
@@ -530,7 +528,6 @@ define i8 @xvmsk_eq_v2i64_concat_poison(<2 x i64> %vec) {
 ; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 0
 ; CHECK-NEXT:    vpackev.h $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 15
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 15
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -558,7 +555,6 @@ define i8 @xvmsk_ne_v4i32_concat_poison(<4 x i32> %vec) {
 ; CHECK-NEXT:    st.h $a0, $sp, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 15
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 15
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -586,7 +582,6 @@ define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) {
 ; CHECK-NEXT:    st.h $a0, $sp, 0
 ; CHECK-NEXT:    vld $vr0, $sp, 0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 15
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 15
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    addi.d $sp, $sp, 16
@@ -601,7 +596,6 @@ define i32 @xvmsk_trunc_i8(<32 x i8> %a) {
 ; CHECK-LABEL: xvmsk_trunc_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvslli.b $xr0, $xr0, 7
-; CHECK-NEXT:    xvsrai.b $xr0, $xr0, 7
 ; CHECK-NEXT:    xvmskltz.b $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
@@ -616,7 +610,6 @@ define i16 @xvmsk_trunc_i16(<16 x i16> %a) {
 ; CHECK-LABEL: xvmsk_trunc_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvslli.h $xr0, $xr0, 15
-; CHECK-NEXT:    xvsrai.h $xr0, $xr0, 15
 ; CHECK-NEXT:    xvmskltz.h $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
@@ -631,7 +624,6 @@ define i8 @xvmsk_trunc_i32(<8 x i32> %a) {
 ; CHECK-LABEL: xvmsk_trunc_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvslli.w $xr0, $xr0, 31
-; CHECK-NEXT:    xvsrai.w $xr0, $xr0, 31
 ; CHECK-NEXT:    xvmskltz.w $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
@@ -646,7 +638,6 @@ define i4 @xvmsk_trunc_i64(<4 x i64> %a) {
 ; CHECK-LABEL: xvmsk_trunc_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xvslli.d $xr0, $xr0, 63
-; CHECK-NEXT:    xvsrai.d $xr0, $xr0, 63
 ; CHECK-NEXT:    xvmskltz.d $xr0, $xr0
 ; CHECK-NEXT:    xvpickve2gr.wu $a0, $xr0, 0
 ; CHECK-NEXT:    xvpickve2gr.wu $a1, $xr0, 4
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
index d8098ccc9328d..0ee30120f77a6 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
@@ -181,7 +181,6 @@ define i2 @vmsk_sgt_v2i8(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 56
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -197,7 +196,6 @@ define i2 @vmsk_sgt_v2i16(<2 x i16> %a, <2 x i16> %b) {
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 48
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 48
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -212,7 +210,6 @@ define i2 @vmsk_sgt_v2i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-NEXT:    vslt.w $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -252,7 +249,6 @@ define i4 @vmsk_sgt_v4i8(<4 x i8> %a, <4 x i8> %b) {
 ; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 24
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -267,7 +263,6 @@ define i4 @vmsk_sgt_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-NEXT:    vslt.h $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -306,7 +301,6 @@ define i8 @vmsk_sgt_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-NEXT:    vslt.b $vr0, $vr1, $vr0
 ; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -349,7 +343,6 @@ define i2 @vmsk_sgt_and_sgt_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8>
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 56
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 56
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -369,7 +362,6 @@ define i2 @vmsk_sgt_and_sgt_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.w $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 48
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 48
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -388,7 +380,6 @@ define i2 @vmsk_sgt_and_sgt_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vshuf4i.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 32
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 32
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -440,7 +431,6 @@ define i4 @vmsk_sgt_and_sgt_v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
 ; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 24
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -459,7 +449,6 @@ define i4 @vmsk_sgt_and_sgt_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 16
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 16
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -510,7 +499,6 @@ define i8 @vmsk_sgt_and_sgt_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8>
 ; CHECK-NEXT:    vand.v $vr0, $vr0, $vr1
 ; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 8
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 8
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -557,7 +545,6 @@ define i16 @vmsk_trunc_i8(<16 x i8> %a) {
 ; CHECK-LABEL: vmsk_trunc_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vslli.b $vr0, $vr0, 7
-; CHECK-NEXT:    vsrai.b $vr0, $vr0, 7
 ; CHECK-NEXT:    vmskltz.b $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -570,7 +557,6 @@ define i8 @vmsk_trunc_i16(<8 x i16> %a) {
 ; CHECK-LABEL: vmsk_trunc_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vslli.h $vr0, $vr0, 15
-; CHECK-NEXT:    vsrai.h $vr0, $vr0, 15
 ; CHECK-NEXT:    vmskltz.h $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -583,7 +569,6 @@ define i4 @vmsk_trunc_i32(<4 x i32> %a) {
 ; CHECK-LABEL: vmsk_trunc_i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vslli.w $vr0, $vr0, 31
-; CHECK-NEXT:    vsrai.w $vr0, $vr0, 31
 ; CHECK-NEXT:    vmskltz.w $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret
@@ -596,7 +581,6 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) {
 ; CHECK-LABEL: vmsk_trunc_i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vslli.d $vr0, $vr0, 63
-; CHECK-NEXT:    vsrai.d $vr0, $vr0, 63
 ; CHECK-NEXT:    vmskltz.d $vr0, $vr0
 ; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
 ; CHECK-NEXT:    ret

From 97ac6483aaead89897d9bda8a12f1f4c11fad621 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Thu, 12 Jun 2025 11:51:58 +0100
Subject: [PATCH 214/851] [DebugInfo][RemoveDIs] Delete debug-info-format flag
 (#143746)

This flag was used to let us incrementally introduce debug records
into LLVM, however everything is now using records. It serves no
purpose now, so delete it.
---
 llvm/include/llvm/IR/BasicBlock.h             |   9 --
 llvm/include/llvm/IR/Function.h               |   9 --
 llvm/include/llvm/IR/Module.h                 |  20 ---
 llvm/lib/AsmParser/LLParser.cpp               |   4 -
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     |   8 --
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |   3 +-
 llvm/lib/IR/BasicBlock.cpp                    |  35 +----
 llvm/lib/IR/Core.cpp                          |   8 +-
 llvm/lib/IR/Function.cpp                      |  18 +--
 llvm/lib/IR/Instruction.cpp                   |  13 +-
 llvm/lib/IR/Module.cpp                        |   4 +-
 llvm/lib/IR/Verifier.cpp                      |  15 +--
 llvm/lib/LTO/LTO.cpp                          |   4 +-
 llvm/lib/Linker/IRMover.cpp                   |   2 -
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |   1 -
 .../AMDGPU/AMDGPUPreloadKernelArguments.cpp   |   1 -
 .../AMDGPU/AMDGPURewriteOutArguments.cpp      |   2 -
 .../WebAssemblyAddMissingPrototypes.cpp       |   1 -
 llvm/lib/Transforms/IPO/ArgumentPromotion.cpp |   1 -
 llvm/lib/Transforms/IPO/Attributor.cpp        |   6 -
 .../IPO/DeadArgumentElimination.cpp           |   2 -
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp   |   2 -
 llvm/lib/Transforms/IPO/MergeFunctions.cpp    |   2 -
 llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 120 +-----------------
 llvm/lib/Transforms/Utils/CloneFunction.cpp   |   7 -
 llvm/lib/Transforms/Utils/CloneModule.cpp     |   1 -
 llvm/lib/Transforms/Utils/CodeExtractor.cpp   |   5 -
 .../Transforms/Utils/LoopRotationUtils.cpp    |   6 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |  12 +-
 llvm/tools/llvm-as/llvm-as.cpp                |   1 -
 llvm/tools/llvm-dis/llvm-dis.cpp              |   1 -
 llvm/tools/llvm-link/llvm-link.cpp            |   8 +-
 llvm/unittests/IR/IRBuilderTest.cpp           |  11 +-
 mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp  |   3 +-
 mlir/lib/Target/LLVMIR/ModuleTranslation.cpp  |   5 +-
 35 files changed, 31 insertions(+), 319 deletions(-)

diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 07444cd6779e1..c24f01fe26cc8 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -63,9 +63,6 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
 public:
   using InstListType = SymbolTableList<Instruction, ilist_iterator_bits<true>,
                                        ilist_parent<BasicBlock>>;
-  /// Flag recording whether or not this block stores debug-info in the form
-  /// of intrinsic instructions (false) or non-instruction records (true).
-  bool IsNewDbgInfoFormat;
 
 private:
   // Allow Function to renumber blocks.
@@ -95,12 +92,6 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   /// IsNewDbgInfoFormat = false.
   LLVM_ABI void convertFromNewDbgValues();
 
-  /// Ensure the block is in "old" dbg.value format (\p NewFlag == false) or
-  /// in the new format (\p NewFlag == true), converting to the desired format
-  /// if necessary.
-  LLVM_ABI void setIsNewDbgInfoFormat(bool NewFlag);
-  LLVM_ABI void setNewDbgInfoFormatFlag(bool NewFlag);
-
   unsigned getNumber() const {
     assert(getParent() && "only basic blocks in functions have valid numbers");
     return Number;
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index f24d03635731e..c361be3e752a9 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -111,11 +111,6 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
   friend class SymbolTableListTraits<Function>;
 
 public:
-  /// Is this function using intrinsics to record the position of debugging
-  /// information, or non-intrinsic records? See IsNewDbgInfoFormat in
-  /// \ref BasicBlock.
-  bool IsNewDbgInfoFormat;
-
   /// hasLazyArguments/CheckLazyArguments - The argument list of a function is
   /// built on demand, so that the list isn't allocated until the first client
   /// needs it.  The hasLazyArguments predicate returns true if the arg list
@@ -130,9 +125,6 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
   /// \see BasicBlock::convertFromNewDbgValues.
   void convertFromNewDbgValues();
 
-  void setIsNewDbgInfoFormat(bool NewVal);
-  void setNewDbgInfoFormatFlag(bool NewVal);
-
 private:
   friend class TargetLibraryInfoImpl;
 
@@ -760,7 +752,6 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node<Function> {
   /// to the newly inserted BB.
   Function::iterator insert(Function::iterator Position, BasicBlock *BB) {
     Function::iterator FIt = BasicBlocks.insert(Position, BB);
-    BB->setIsNewDbgInfoFormat(IsNewDbgInfoFormat);
     return FIt;
   }
 
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index 7a26efb74b324..f4420f460741b 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -215,11 +215,6 @@ class LLVM_ABI Module {
 /// @name Constructors
 /// @{
 public:
-  /// Is this Module using intrinsics to record the position of debugging
-  /// information, or non-intrinsic records? See IsNewDbgInfoFormat in
-  /// \ref BasicBlock.
-  bool IsNewDbgInfoFormat;
-
   /// Used when printing this module in the new debug info format; removes all
   /// declarations of debug intrinsics that are replaced by non-intrinsic
   /// records in the new format.
@@ -230,7 +225,6 @@ class LLVM_ABI Module {
     for (auto &F : *this) {
       F.convertToNewDbgValues();
     }
-    IsNewDbgInfoFormat = true;
   }
 
   /// \see BasicBlock::convertFromNewDbgValues.
@@ -238,20 +232,6 @@ class LLVM_ABI Module {
     for (auto &F : *this) {
       F.convertFromNewDbgValues();
     }
-    IsNewDbgInfoFormat = false;
-  }
-
-  void setIsNewDbgInfoFormat(bool UseNewFormat) {
-    if (UseNewFormat && !IsNewDbgInfoFormat)
-      convertToNewDbgValues();
-    else if (!UseNewFormat && IsNewDbgInfoFormat)
-      convertFromNewDbgValues();
-  }
-  void setNewDbgInfoFormatFlag(bool NewFlag) {
-    for (auto &F : *this) {
-      F.setNewDbgInfoFormatFlag(NewFlag);
-    }
-    IsNewDbgInfoFormat = NewFlag;
   }
 
   /// The Module constructor. Note that there is no default constructor. You
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 5c007dcf00224..926dc6211eb8d 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -441,8 +441,6 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
   UpgradeNVVMAnnotations(*M);
   UpgradeSectionAttributes(*M);
 
-  M->setIsNewDbgInfoFormat(true);
-
   if (!Slots)
     return false;
   // Initialize the slot mapping.
@@ -6906,8 +6904,6 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
       if (SeenOldDbgInfoFormat)
         return error(Lex.getLoc(), "debug record should not appear in a module "
                                    "containing debug info intrinsics");
-      if (!SeenNewDbgInfoFormat)
-        M->setNewDbgInfoFormatFlag(true);
       SeenNewDbgInfoFormat = true;
       Lex.Lex();
 
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 31129b7e5cf77..fde934fbb3cf1 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4479,10 +4479,6 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
 Error BitcodeReader::parseModule(uint64_t ResumeBit,
                                  bool ShouldLazyLoadMetadata,
                                  ParserCallbacks Callbacks) {
-  // Don't allow modules to use debug-intrinsics: autoupgrading them is now
-  // mandatory.
-  TheModule->IsNewDbgInfoFormat = true;
-
   this->ValueTypeCallback = std::move(Callbacks.ValueType);
   if (ResumeBit) {
     if (Error JumpFailed = Stream.JumpToBit(ResumeBit))
@@ -6994,10 +6990,6 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
   if (Error JumpFailed = Stream.JumpToBit(DFII->second))
     return JumpFailed;
 
-  // Regardless of the debug info format we want to end up in, we need
-  // IsNewDbgInfoFormat=true to construct any debug records seen in the bitcode.
-  F->IsNewDbgInfoFormat = true;
-
   if (Error Err = parseFunctionBody(F))
     return Err;
   F->setIsMaterializable(false);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 32348a899683d..3792b456c836e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -3335,8 +3335,7 @@ class TypePromotionTransaction {
 
       // Record where we would have to re-insert the instruction in the sequence
       // of DbgRecords, if we ended up reinserting.
-      if (BB->IsNewDbgInfoFormat)
-        BeforeDbgRecord = Inst->getDbgReinsertionPosition();
+      BeforeDbgRecord = Inst->getDbgReinsertionPosition();
 
       if (HasPrevInstruction) {
         Point.PrevInst = std::prev(Inst->getIterator());
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 62a75313bb171..8b3e91750f86c 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -52,8 +52,6 @@ DbgMarker *BasicBlock::createMarker(InstListType::iterator It) {
 }
 
 void BasicBlock::convertToNewDbgValues() {
-  IsNewDbgInfoFormat = true;
-
   // Iterate over all instructions in the instruction list, collecting debug
   // info intrinsics and converting them to DbgRecords. Once we find a "real"
   // instruction, attach all those DbgRecords to a DbgMarker in that
@@ -91,7 +89,6 @@ void BasicBlock::convertToNewDbgValues() {
 
 void BasicBlock::convertFromNewDbgValues() {
   invalidateOrders();
-  IsNewDbgInfoFormat = false;
 
   // Iterate over the block, finding instructions annotated with DbgMarkers.
   // Convert any attached DbgRecords to debug intrinsics and insert ahead of the
@@ -126,16 +123,6 @@ void BasicBlock::dumpDbgValues() const {
 }
 #endif
 
-void BasicBlock::setIsNewDbgInfoFormat(bool NewFlag) {
-  if (NewFlag && !IsNewDbgInfoFormat)
-    convertToNewDbgValues();
-  else if (!NewFlag && IsNewDbgInfoFormat)
-    convertFromNewDbgValues();
-}
-void BasicBlock::setNewDbgInfoFormatFlag(bool NewFlag) {
-  IsNewDbgInfoFormat = NewFlag;
-}
-
 ValueSymbolTable *BasicBlock::getValueSymbolTable() {
   if (Function *F = getParent())
     return F->getValueSymbolTable();
@@ -157,8 +144,7 @@ template class llvm::SymbolTableListTraits<
 
 BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
                        BasicBlock *InsertBefore)
-    : Value(Type::getLabelTy(C), Value::BasicBlockVal),
-      IsNewDbgInfoFormat(true), Parent(nullptr) {
+    : Value(Type::getLabelTy(C), Value::BasicBlockVal), Parent(nullptr) {
 
   if (NewParent)
     insertInto(NewParent, InsertBefore);
@@ -168,8 +154,6 @@ BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
 
   end().getNodePtr()->setParent(this);
   setName(Name);
-  if (NewParent)
-    setIsNewDbgInfoFormat(NewParent->IsNewDbgInfoFormat);
 }
 
 void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) {
@@ -180,8 +164,6 @@ void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) {
     NewParent->insert(InsertBefore->getIterator(), this);
   else
     NewParent->insert(NewParent->end(), this);
-
-  setIsNewDbgInfoFormat(NewParent->IsNewDbgInfoFormat);
 }
 
 BasicBlock::~BasicBlock() {
@@ -725,10 +707,6 @@ void BasicBlock::flushTerminatorDbgRecords() {
   // check whether there's anything trailing at the end and move those
   // DbgRecords in front of the terminator.
 
-  // Do nothing if we're not in new debug-info format.
-  if (!IsNewDbgInfoFormat)
-    return;
-
   // If there's no terminator, there's nothing to do.
   Instruction *Term = getTerminator();
   if (!Term)
@@ -765,10 +743,6 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
   // in the iterators whether there was the intention to transfer any debug
   // info.
 
-  // If we're not in "new" debug-info format, do nothing.
-  if (!IsNewDbgInfoFormat)
-    return;
-
   assert(First == Last);
   bool InsertAtHead = Dest.getHeadBit();
   bool ReadFromHead = First.getHeadBit();
@@ -1029,8 +1003,6 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
 
 void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
                         iterator Last) {
-  assert(Src->IsNewDbgInfoFormat == IsNewDbgInfoFormat);
-
 #ifdef EXPENSIVE_CHECKS
   // Check that First is before Last.
   auto FromBBEnd = Src->end();
@@ -1045,9 +1017,7 @@ void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
     return;
   }
 
-  // Handle non-instr debug-info specific juggling.
-  if (IsNewDbgInfoFormat)
-    spliceDebugInfo(Dest, Src, First, Last);
+  spliceDebugInfo(Dest, Src, First, Last);
 
   // And move the instructions.
   getInstList().splice(Dest, Src->getInstList(), First, Last);
@@ -1056,7 +1026,6 @@ void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
 }
 
 void BasicBlock::insertDbgRecordAfter(DbgRecord *DR, Instruction *I) {
-  assert(IsNewDbgInfoFormat);
   assert(I->getParent() == this);
 
   iterator NextIt = std::next(I->getIterator());
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index a7c3a56dcc22a..9810f04cc503c 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -431,12 +431,12 @@ void LLVMAddModuleFlag(LLVMModuleRef M, LLVMModuleFlagBehavior Behavior,
                            {Key, KeyLen}, unwrap(Val));
 }
 
-LLVMBool LLVMIsNewDbgInfoFormat(LLVMModuleRef M) {
-  return unwrap(M)->IsNewDbgInfoFormat;
-}
+LLVMBool LLVMIsNewDbgInfoFormat(LLVMModuleRef M) { return true; }
 
 void LLVMSetIsNewDbgInfoFormat(LLVMModuleRef M, LLVMBool UseNewFormat) {
-  unwrap(M)->setIsNewDbgInfoFormat(UseNewFormat);
+  if (!UseNewFormat)
+    llvm_unreachable("LLVM no longer supports intrinsic based debug-info");
+  (void)M;
 }
 
 /*--.. Printing modules ....................................................--*/
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 493dec72d45af..28fb81055baf4 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -87,32 +87,17 @@ void Function::validateBlockNumbers() const {
 }
 
 void Function::convertToNewDbgValues() {
-  IsNewDbgInfoFormat = true;
   for (auto &BB : *this) {
     BB.convertToNewDbgValues();
   }
 }
 
 void Function::convertFromNewDbgValues() {
-  IsNewDbgInfoFormat = false;
   for (auto &BB : *this) {
     BB.convertFromNewDbgValues();
   }
 }
 
-void Function::setIsNewDbgInfoFormat(bool NewFlag) {
-  if (NewFlag && !IsNewDbgInfoFormat)
-    convertToNewDbgValues();
-  else if (!NewFlag && IsNewDbgInfoFormat)
-    convertFromNewDbgValues();
-}
-void Function::setNewDbgInfoFormatFlag(bool NewFlag) {
-  for (auto &BB : *this) {
-    BB.setNewDbgInfoFormatFlag(NewFlag);
-  }
-  IsNewDbgInfoFormat = NewFlag;
-}
-
 //===----------------------------------------------------------------------===//
 // Argument Implementation
 //===----------------------------------------------------------------------===//
@@ -490,7 +475,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
                    const Twine &name, Module *ParentModule)
     : GlobalObject(Ty, Value::FunctionVal, AllocMarker, Linkage, name,
                    computeAddrSpace(AddrSpace, ParentModule)),
-      NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(true) {
+      NumArgs(Ty->getNumParams()) {
   assert(FunctionType::isValidReturnType(getReturnType()) &&
          "invalid return type");
   setGlobalObjectSubClassData(0);
@@ -505,7 +490,6 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
 
   if (ParentModule) {
     ParentModule->getFunctionList().push_back(this);
-    IsNewDbgInfoFormat = ParentModule->IsNewDbgInfoFormat;
   }
 
   HasLLVMReservedName = getName().starts_with("llvm.");
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 109d516c61b7c..1b60caab6c11a 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -86,7 +86,7 @@ void Instruction::removeFromParent() {
 }
 
 void Instruction::handleMarkerRemoval() {
-  if (!getParent()->IsNewDbgInfoFormat || !DebugMarker)
+  if (!DebugMarker)
     return;
 
   DebugMarker->removeMarker();
@@ -136,9 +136,6 @@ void Instruction::insertBefore(BasicBlock &BB,
 
   BB.getInstList().insert(InsertPos, this);
 
-  if (!BB.IsNewDbgInfoFormat)
-    return;
-
   // We've inserted "this": if InsertAtHead is set then it comes before any
   // DbgVariableRecords attached to InsertPos. But if it's not set, then any
   // DbgRecords should now come before "this".
@@ -226,7 +223,7 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
 
   // If we've been given the "Preserve" flag, then just move the DbgRecords with
   // the instruction, no more special handling needed.
-  if (BB.IsNewDbgInfoFormat && DebugMarker && !Preserve) {
+  if (DebugMarker && !Preserve) {
     if (I != this->getIterator() || InsertAtHead) {
       // "this" is definitely moving in the list, or it's moving ahead of its
       // attached DbgVariableRecords. Detach any existing DbgRecords.
@@ -238,7 +235,7 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
   // the block splicer, which will do more debug-info things.
   BB.getInstList().splice(I, getParent()->getInstList(), getIterator());
 
-  if (BB.IsNewDbgInfoFormat && !Preserve) {
+  if (!Preserve) {
     DbgMarker *NextMarker = getParent()->getNextMarker(this);
 
     // If we're inserting at point I, and not in front of the DbgRecords
@@ -258,10 +255,6 @@ iterator_range<DbgRecord::self_iterator> Instruction::cloneDebugInfoFrom(
   if (!From->DebugMarker)
     return DbgMarker::getEmptyDbgRecordRange();
 
-  assert(getParent()->IsNewDbgInfoFormat);
-  assert(getParent()->IsNewDbgInfoFormat ==
-         From->getParent()->IsNewDbgInfoFormat);
-
   if (!DebugMarker)
     getParent()->createMarker(this);
 
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 0a47f98619691..37f4a72d8c20b 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -71,8 +71,7 @@ template class LLVM_EXPORT_TEMPLATE llvm::SymbolTableListTraits<GlobalIFunc>;
 
 Module::Module(StringRef MID, LLVMContext &C)
     : Context(C), ValSymTab(std::make_unique<ValueSymbolTable>(-1)),
-      ModuleID(std::string(MID)), SourceFileName(std::string(MID)),
-      IsNewDbgInfoFormat(true) {
+      ModuleID(std::string(MID)), SourceFileName(std::string(MID)) {
   Context.addModule(this);
 }
 
@@ -83,7 +82,6 @@ Module &Module::operator=(Module &&Other) {
 
   ModuleID = std::move(Other.ModuleID);
   SourceFileName = std::move(Other.SourceFileName);
-  IsNewDbgInfoFormat = std::move(Other.IsNewDbgInfoFormat);
 
   GlobalList.clear();
   GlobalList.splice(GlobalList.begin(), Other.GlobalList);
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 9ec94a8b80959..1f1041b259736 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2878,11 +2878,6 @@ void Verifier::visitFunction(const Function &F) {
   Check(verifyAttributeCount(Attrs, FT->getNumParams()),
         "Attribute after last parameter!", &F);
 
-  CheckDI(F.IsNewDbgInfoFormat == F.getParent()->IsNewDbgInfoFormat,
-          "Function debug format should match parent module", &F,
-          F.IsNewDbgInfoFormat, F.getParent(),
-          F.getParent()->IsNewDbgInfoFormat);
-
   bool IsIntrinsic = F.isIntrinsic();
 
   // Check function attributes.
@@ -3233,15 +3228,9 @@ void Verifier::visitBasicBlock(BasicBlock &BB) {
     Check(I.getParent() == &BB, "Instruction has bogus parent pointer!");
   }
 
-  CheckDI(BB.IsNewDbgInfoFormat == BB.getParent()->IsNewDbgInfoFormat,
-          "BB debug format should match parent function", &BB,
-          BB.IsNewDbgInfoFormat, BB.getParent(),
-          BB.getParent()->IsNewDbgInfoFormat);
-
   // Confirm that no issues arise from the debug program.
-  if (BB.IsNewDbgInfoFormat)
-    CheckDI(!BB.getTrailingDbgRecords(), "Basic Block has trailing DbgRecords!",
-            &BB);
+  CheckDI(!BB.getTrailingDbgRecords(), "Basic Block has trailing DbgRecords!",
+          &BB);
 }
 
 void Verifier::visitTerminator(Instruction &I) {
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index df395073359cf..adf995cbc9b18 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -599,9 +599,7 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
                                       const Config &Conf)
     : ParallelCodeGenParallelismLevel(ParallelCodeGenParallelismLevel),
       Ctx(Conf), CombinedModule(std::make_unique<Module>("ld-temp.o", Ctx)),
-      Mover(std::make_unique<IRMover>(*CombinedModule)) {
-  CombinedModule->IsNewDbgInfoFormat = true;
-}
+      Mover(std::make_unique<IRMover>(*CombinedModule)) {}
 
 LTO::ThinLTOState::ThinLTOState(ThinBackend BackendParam)
     : Backend(std::move(BackendParam)), CombinedIndex(/*HaveGVs*/ false) {
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index a449185b2b9ba..2a9709050162f 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -595,7 +595,6 @@ Function *IRLinker::copyFunctionProto(const Function *SF) {
                              SF->getAddressSpace(), SF->getName(), &DstM);
   F->copyAttributesFrom(SF);
   F->setAttributes(mapAttributeTypes(F->getContext(), F->getAttributes()));
-  F->IsNewDbgInfoFormat = SF->IsNewDbgInfoFormat;
   return F;
 }
 
@@ -1030,7 +1029,6 @@ Error IRLinker::linkFunctionBody(Function &Dst, Function &Src) {
     Dst.setPrologueData(Src.getPrologueData());
   if (Src.hasPersonalityFn())
     Dst.setPersonalityFn(Src.getPersonalityFn());
-  assert(Src.IsNewDbgInfoFormat == Dst.IsNewDbgInfoFormat);
 
   // Copy over the metadata attachments without remapping.
   Dst.copyMetadata(&Src, 0);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 0f002b016af0c..67db961e60fa3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2364,7 +2364,6 @@ static Function *moveFunctionAdaptingType(Function *OldF, FunctionType *NewTy,
   bool IsIntrinsic = OldF->isIntrinsic();
   Function *NewF =
       Function::Create(NewTy, OldF->getLinkage(), OldF->getAddressSpace());
-  NewF->IsNewDbgInfoFormat = OldF->IsNewDbgInfoFormat;
   NewF->copyAttributesFrom(OldF);
   NewF->copyMetadata(OldF, 0);
   NewF->takeName(OldF);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
index 5027705ef61de..984c1ee89309e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp
@@ -134,7 +134,6 @@ class PreloadKernelArgInfo {
 
     NF->copyAttributesFrom(&F);
     NF->copyMetadata(&F, 0);
-    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
 
     F.getParent()->getFunctionList().insert(F.getIterator(), NF);
     NF->takeName(&F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index e1008439a33a8..4b1f80c777827 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -325,8 +325,6 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
   NewFunc->removeRetAttrs(RetAttrs);
   // TODO: How to preserve metadata?
 
-  NewFunc->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
-
   // Move the body of the function into the new rewritten function, and replace
   // this function with a stub.
   NewFunc->splice(NewFunc->begin(), &F);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
index f02725efc7e0c..344a3636b431b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
@@ -135,7 +135,6 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
         Function::Create(NewType, F.getLinkage(), F.getName() + ".fixed_sig");
     NewF->setAttributes(F.getAttributes());
     NewF->removeFnAttr("no-prototype");
-    NewF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
     Replacements.emplace_back(&F, NewF);
   }
 
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 0ec5202b8cfe7..262c902d40d2d 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -179,7 +179,6 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM,
                                   F->getName());
   NF->copyAttributesFrom(F);
   NF->copyMetadata(F, 0);
-  NF->setIsNewDbgInfoFormat(F->IsNewDbgInfoFormat);
 
   // The new function will have the !dbg metadata copied from the original
   // function. The original function may not be deleted, and dbg metadata need
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index cbdbf9ae1494d..050eed376ed3f 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -2726,8 +2726,6 @@ void Attributor::createShallowWrapper(Function &F) {
       Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), F.getName());
   F.setName(""); // set the inside function anonymous
   M.getFunctionList().insert(F.getIterator(), Wrapper);
-  // Flag whether the function is using new-debug-info or not.
-  Wrapper->IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
 
   F.setLinkage(GlobalValue::InternalLinkage);
 
@@ -2808,8 +2806,6 @@ bool Attributor::internalizeFunctions(SmallPtrSetImpl<Function *> &FnSet,
       VMap[&Arg] = &(*NewFArgIt++);
     }
     SmallVector<ReturnInst *, 8> Returns;
-    // Flag whether the function is using new-debug-info or not.
-    Copied->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat;
 
     // Copy the body of the original function to the new one
     CloneFunctionInto(Copied, F, VMap,
@@ -3027,8 +3023,6 @@ ChangeStatus Attributor::rewriteFunctionSignatures(
     OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn);
     NewFn->takeName(OldFn);
     NewFn->copyAttributesFrom(OldFn);
-    // Flag whether the function is using new-debug-info or not.
-    NewFn->IsNewDbgInfoFormat = OldFn->IsNewDbgInfoFormat;
 
     // Patch the pointer to LLVM function in debug info descriptor.
     NewFn->setSubprogram(OldFn->getSubprogram());
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 2e2687a5ff6e3..d32b829e2ad79 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -170,7 +170,6 @@ bool DeadArgumentEliminationPass::deleteDeadVarargs(Function &F) {
   NF->setComdat(F.getComdat());
   F.getParent()->getFunctionList().insert(F.getIterator(), NF);
   NF->takeName(&F);
-  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
 
   // Loop over all the callers of the function, transforming the call sites
   // to pass in a smaller number of arguments into the new function.
@@ -884,7 +883,6 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) {
   // it again.
   F->getParent()->getFunctionList().insert(F->getIterator(), NF);
   NF->takeName(F);
-  NF->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat;
 
   // Loop over all the callers of the function, transforming the call sites to
   // pass in a smaller number of arguments into the new function.
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index e25f23107966d..16ffd503300ee 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -508,7 +508,6 @@ ExpandVariadics::replaceAllUsesWithNewDeclaration(Module &M,
   Function *NF = Function::Create(FTy, F.getLinkage(), F.getAddressSpace());
 
   NF->setName(F.getName() + ".varargs");
-  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
 
   F.getParent()->getFunctionList().insert(F.getIterator(), NF);
 
@@ -550,7 +549,6 @@ ExpandVariadics::deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder,
   NF->setComdat(F.getComdat());
   F.getParent()->getFunctionList().insert(F.getIterator(), NF);
   NF->setName(F.getName() + ".valist");
-  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
 
   AttrBuilder ParamAttrs(Ctx);
 
diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index e5397e94c792b..d4555e9435f1d 100644
--- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -751,7 +751,6 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
     NewG = Function::Create(G->getFunctionType(), G->getLinkage(),
                             G->getAddressSpace(), "", G->getParent());
     NewG->setComdat(G->getComdat());
-    NewG->IsNewDbgInfoFormat = G->IsNewDbgInfoFormat;
     BB = BasicBlock::Create(F->getContext(), "", NewG);
   }
 
@@ -897,7 +896,6 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
     NewF->takeName(F);
     NewF->setComdat(F->getComdat());
     F->setComdat(nullptr);
-    NewF->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat;
     // Ensure CFI type metadata is propagated to the new function.
     copyMetadataIfPresent(F, NewF, "type");
     copyMetadataIfPresent(F, NewF, "kcfi_type");
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 6608515e1cbbc..1feed14b4fed8 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -437,45 +437,7 @@ DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
 }
 
 static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
-  if (BB->IsNewDbgInfoFormat)
-    return DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BB);
-
-  SmallVector<DbgValueInst *, 8> ToBeRemoved;
-  SmallDenseSet<DebugVariable> VariableSet;
-  for (auto &I : reverse(*BB)) {
-    if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
-      DebugVariable Key(DVI->getVariable(),
-                        DVI->getExpression(),
-                        DVI->getDebugLoc()->getInlinedAt());
-      auto R = VariableSet.insert(Key);
-      // If the variable fragment hasn't been seen before then we don't want
-      // to remove this dbg intrinsic.
-      if (R.second)
-        continue;
-
-      if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI)) {
-        // Don't delete dbg.assign intrinsics that are linked to instructions.
-        if (!at::getAssignmentInsts(DAI).empty())
-          continue;
-        // Unlinked dbg.assign intrinsics can be treated like dbg.values.
-      }
-
-      // If the same variable fragment is described more than once it is enough
-      // to keep the last one (i.e. the first found since we for reverse
-      // iteration).
-      ToBeRemoved.push_back(DVI);
-      continue;
-    }
-    // Sequence with consecutive dbg.value instrs ended. Clear the map to
-    // restart identifying redundant instructions if case we find another
-    // dbg.value sequence.
-    VariableSet.clear();
-  }
-
-  for (auto &Instr : ToBeRemoved)
-    Instr->eraseFromParent();
-
-  return !ToBeRemoved.empty();
+  return DbgVariableRecordsRemoveRedundantDbgInstrsUsingBackwardScan(BB);
 }
 
 /// Remove redundant dbg.value instructions using a forward scan. This can
@@ -578,49 +540,7 @@ DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
 }
 
 static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
-  if (BB->IsNewDbgInfoFormat)
-    return DbgVariableRecordsRemoveRedundantDbgInstrsUsingForwardScan(BB);
-
-  SmallVector<DbgValueInst *, 8> ToBeRemoved;
-  SmallDenseMap<DebugVariable,
-                std::pair<SmallVector<Value *, 4>, DIExpression *>, 4>
-      VariableMap;
-  for (auto &I : *BB) {
-    if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
-      DebugVariable Key(DVI->getVariable(), std::nullopt,
-                        DVI->getDebugLoc()->getInlinedAt());
-      auto [VMI, Inserted] = VariableMap.try_emplace(Key);
-      auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI);
-      // A dbg.assign with no linked instructions can be treated like a
-      // dbg.value (i.e. can be deleted).
-      bool IsDbgValueKind = (!DAI || at::getAssignmentInsts(DAI).empty());
-
-      // Update the map if we found a new value/expression describing the
-      // variable, or if the variable wasn't mapped already.
-      SmallVector<Value *, 4> Values(DVI->getValues());
-      if (Inserted || VMI->second.first != Values ||
-          VMI->second.second != DVI->getExpression()) {
-        // Use a sentinel value (nullptr) for the DIExpression when we see a
-        // linked dbg.assign so that the next debug intrinsic will never match
-        // it (i.e. always treat linked dbg.assigns as if they're unique).
-        if (IsDbgValueKind)
-          VMI->second = {Values, DVI->getExpression()};
-        else
-          VMI->second = {Values, nullptr};
-        continue;
-      }
-
-      // Don't delete dbg.assign intrinsics that are linked to instructions.
-      if (!IsDbgValueKind)
-        continue;
-      ToBeRemoved.push_back(DVI);
-    }
-  }
-
-  for (auto &Instr : ToBeRemoved)
-    Instr->eraseFromParent();
-
-  return !ToBeRemoved.empty();
+  return DbgVariableRecordsRemoveRedundantDbgInstrsUsingForwardScan(BB);
 }
 
 /// Remove redundant undef dbg.assign intrinsic from an entry block using a
@@ -643,41 +563,7 @@ static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
 /// Possible improvements:
 /// - Keep track of non-overlapping fragments.
 static bool removeUndefDbgAssignsFromEntryBlock(BasicBlock *BB) {
-  if (BB->IsNewDbgInfoFormat)
-    return DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BB);
-
-  assert(BB->isEntryBlock() && "expected entry block");
-  SmallVector<DbgAssignIntrinsic *, 8> ToBeRemoved;
-  DenseSet<DebugVariable> SeenDefForAggregate;
-  // Returns the DebugVariable for DVI with no fragment info.
-  auto GetAggregateVariable = [](DbgValueInst *DVI) {
-    return DebugVariable(DVI->getVariable(), std::nullopt,
-                         DVI->getDebugLoc()->getInlinedAt());
-  };
-
-  // Remove undef dbg.assign intrinsics that are encountered before
-  // any non-undef intrinsics from the entry block.
-  for (auto &I : *BB) {
-    DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I);
-    if (!DVI)
-      continue;
-    auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI);
-    bool IsDbgValueKind = (!DAI || at::getAssignmentInsts(DAI).empty());
-    DebugVariable Aggregate = GetAggregateVariable(DVI);
-    if (!SeenDefForAggregate.contains(Aggregate)) {
-      bool IsKill = DVI->isKillLocation() && IsDbgValueKind;
-      if (!IsKill) {
-        SeenDefForAggregate.insert(Aggregate);
-      } else if (DAI) {
-        ToBeRemoved.push_back(DAI);
-      }
-    }
-  }
-
-  for (DbgAssignIntrinsic *DAI : ToBeRemoved)
-    DAI->eraseFromParent();
-
-  return !ToBeRemoved.empty();
+  return DbgVariableRecordsRemoveUndefDbgAssignsFromEntryBlock(BB);
 }
 
 bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) {
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 5487dbef8a434..510d9f97bf8c6 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -114,7 +114,6 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
                                   const Twine &NameSuffix, Function *F,
                                   ClonedCodeInfo *CodeInfo, bool MapAtoms) {
   BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
-  NewBB->IsNewDbgInfoFormat = BB->IsNewDbgInfoFormat;
   if (BB->hasName())
     NewBB->setName(BB->getName() + NameSuffix);
 
@@ -286,7 +285,6 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
                              const char *NameSuffix, ClonedCodeInfo *CodeInfo,
                              ValueMapTypeRemapper *TypeMapper,
                              ValueMaterializer *Materializer) {
-  NewFunc->setIsNewDbgInfoFormat(OldFunc->IsNewDbgInfoFormat);
   assert(NameSuffix && "NameSuffix cannot be null!");
 
 #ifndef NDEBUG
@@ -391,7 +389,6 @@ Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
   // Create the new function...
   Function *NewF = Function::Create(FTy, F->getLinkage(), F->getAddressSpace(),
                                     F->getName(), F->getParent());
-  NewF->setIsNewDbgInfoFormat(F->IsNewDbgInfoFormat);
 
   // Loop over the arguments, copying the names of the mapped arguments over...
   Function::arg_iterator DestI = NewF->arg_begin();
@@ -525,7 +522,6 @@ void PruningFunctionCloner::CloneBlock(
   BasicBlock *NewBB;
   Twine NewName(BB->hasName() ? Twine(BB->getName()) + NameSuffix : "");
   BBEntry = NewBB = BasicBlock::Create(BB->getContext(), NewName, NewFunc);
-  NewBB->IsNewDbgInfoFormat = BB->IsNewDbgInfoFormat;
 
   // It is only legal to clone a function if a block address within that
   // function is never referenced outside of the function.  Given that, we
@@ -549,9 +545,6 @@ void PruningFunctionCloner::CloneBlock(
   BasicBlock::const_iterator DbgCursor = StartingInst;
   auto CloneDbgRecordsToHere =
       [NewBB, &DbgCursor](Instruction *NewInst, BasicBlock::const_iterator II) {
-        if (!NewBB->IsNewDbgInfoFormat)
-          return;
-
         // Clone debug-info records onto this instruction. Iterate through any
         // source-instructions we've cloned and then subsequently optimised
         // away, so that their debug-info doesn't go missing.
diff --git a/llvm/lib/Transforms/Utils/CloneModule.cpp b/llvm/lib/Transforms/Utils/CloneModule.cpp
index 88e2bfe45d2cb..55fb0acd39eae 100644
--- a/llvm/lib/Transforms/Utils/CloneModule.cpp
+++ b/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -61,7 +61,6 @@ std::unique_ptr<Module> llvm::CloneModule(
   New->setDataLayout(M.getDataLayout());
   New->setTargetTriple(M.getTargetTriple());
   New->setModuleInlineAsm(M.getModuleInlineAsm());
-  New->IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
 
   // Loop over all of the global variables, making corresponding globals in the
   // new module.  Here we add them to the VMap and to the new Module.  We
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index c4894c90c127f..1210bdf4a1c98 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -792,7 +792,6 @@ void CodeExtractor::severSplitPHINodesOfExits() {
         NewBB = BasicBlock::Create(ExitBB->getContext(),
                                    ExitBB->getName() + ".split",
                                    ExitBB->getParent(), ExitBB);
-        NewBB->IsNewDbgInfoFormat = ExitBB->IsNewDbgInfoFormat;
         SmallVector<BasicBlock *, 4> Preds(predecessors(ExitBB));
         for (BasicBlock *PredBB : Preds)
           if (Blocks.count(PredBB))
@@ -1548,7 +1547,6 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC,
   Function *newFunction = constructFunctionDeclaration(
       inputs, outputs, EntryFreq, oldFunction->getName() + "." + SuffixToUse,
       StructValues, StructTy);
-  newFunction->IsNewDbgInfoFormat = oldFunction->IsNewDbgInfoFormat;
   SmallVector<Value *> NewValues;
 
   emitFunctionBody(inputs, outputs, StructValues, newFunction, StructTy, header,
@@ -1637,7 +1635,6 @@ void CodeExtractor::emitFunctionBody(
   // head of the region, but the entry node of a function cannot have preds.
   BasicBlock *newFuncRoot =
       BasicBlock::Create(Context, "newFuncRoot", newFunction);
-  newFuncRoot->IsNewDbgInfoFormat = oldFunction->IsNewDbgInfoFormat;
 
   // Now sink all instructions which only have non-phi uses inside the region.
   // Group the allocas at the start of the block, so that any bitcast uses of
@@ -1871,10 +1868,8 @@ CallInst *CodeExtractor::emitReplacerCall(
   // This takes place of the original loop
   BasicBlock *codeReplacer =
       BasicBlock::Create(Context, "codeRepl", oldFunction, ReplIP);
-  codeReplacer->IsNewDbgInfoFormat = oldFunction->IsNewDbgInfoFormat;
   BasicBlock *AllocaBlock =
       AllocationBlock ? AllocationBlock : &oldFunction->getEntryBlock();
-  AllocaBlock->IsNewDbgInfoFormat = oldFunction->IsNewDbgInfoFormat;
 
   // Update the entry count of the function.
   if (BFI)
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 693b1f517f8d0..6b42503b2e015 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -634,8 +634,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
           // memory access in coroutines.
           !Inst->getFunction()->isPresplitCoroutine()) {
 
-        if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&
-            !NextDbgInsts.empty()) {
+        if (!NextDbgInsts.empty()) {
           auto DbgValueRange =
               LoopEntryBranch->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
           RemapDbgRecordRange(M, DbgValueRange, ValueMap,
@@ -664,8 +663,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
       ++NumInstrsDuplicated;
 
-      if (LoopEntryBranch->getParent()->IsNewDbgInfoFormat &&
-          !NextDbgInsts.empty()) {
+      if (!NextDbgInsts.empty()) {
         auto Range = C->cloneDebugInfoFrom(Inst, NextDbgInsts.begin());
         RemapDbgRecordRange(M, Range, ValueMap,
                             RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 975ce3bef5176..f67a6414ca316 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -4055,13 +4055,11 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
 
   Module *M = BB->getModule();
 
-  if (PredBlock->IsNewDbgInfoFormat) {
-    PredBlock->getTerminator()->cloneDebugInfoFrom(BB->getTerminator());
-    for (DbgVariableRecord &DVR :
-         filterDbgVars(PredBlock->getTerminator()->getDbgRecordRange())) {
-      RemapDbgRecord(M, &DVR, VMap,
-                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-    }
+  PredBlock->getTerminator()->cloneDebugInfoFrom(BB->getTerminator());
+  for (DbgVariableRecord &DVR :
+       filterDbgVars(PredBlock->getTerminator()->getDbgRecordRange())) {
+    RemapDbgRecord(M, &DVR, VMap,
+                   RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
   }
 
   // Now that the Cond was cloned into the predecessor basic block,
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index f42a08e2e9c8b..21648674b51f1 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -139,7 +139,6 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  M->setIsNewDbgInfoFormat(true);
   M->removeDebugIntrinsicDeclarations();
 
   std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index 8937272abb92a..422eb855ba2cf 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -268,7 +268,6 @@ int main(int argc, char **argv) {
       // All that llvm-dis does is write the assembly to a file.
       if (!DontPrint) {
         if (M) {
-          M->setIsNewDbgInfoFormat(true);
           M->removeDebugIntrinsicDeclarations();
           M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
         }
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index 35b4f0af97f6e..22ea54e68358a 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -523,16 +523,10 @@ int main(int argc, char **argv) {
 
   if (Verbose)
     errs() << "Writing bitcode...\n";
-  auto SetFormat = [&](bool NewFormat) {
-    Composite->setIsNewDbgInfoFormat(NewFormat);
-    if (NewFormat)
-      Composite->removeDebugIntrinsicDeclarations();
-  };
+  Composite->removeDebugIntrinsicDeclarations();
   if (OutputAssembly) {
-    SetFormat(true);
     Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder);
   } else if (Force || !CheckBitcodeOutputToConsole(Out.os())) {
-    SetFormat(true);
     WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder);
   }
 
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index aadae5287c380..520735dfc3268 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -888,14 +888,9 @@ TEST_F(IRBuilderTest, DIBuilder) {
   };
 
   auto ExpectOrder = [&](DbgInstPtr First, BasicBlock::iterator Second) {
-    if (M->IsNewDbgInfoFormat) {
-      EXPECT_TRUE(isa<DbgRecord *>(First));
-      EXPECT_FALSE(Second->getDbgRecordRange().empty());
-      EXPECT_EQ(GetLastDbgRecord(&*Second), cast<DbgRecord *>(First));
-    } else {
-      EXPECT_TRUE(isa<Instruction *>(First));
-      EXPECT_EQ(&*std::prev(Second), cast<Instruction *>(First));
-    }
+    EXPECT_TRUE(isa<DbgRecord *>(First));
+    EXPECT_FALSE(Second->getDbgRecordRange().empty());
+    EXPECT_EQ(GetLastDbgRecord(&*Second), cast<DbgRecord *>(First));
   };
 
   auto RunTest = [&]() {
diff --git a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
index 187e2a9b75a9b..2dd0640f794e5 100644
--- a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp
@@ -70,8 +70,7 @@ void registerFromLLVMIRTranslation() {
           return nullptr;
 
         // Debug records are not currently supported in the LLVM IR translator.
-        if (llvmModule->IsNewDbgInfoFormat)
-          llvmModule->convertFromNewDbgValues();
+        llvmModule->convertFromNewDbgValues();
 
         return translateLLVMIRToModule(
             std::move(llvmModule), context, emitExpensiveWarnings,
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 2702b7aa544da..e5ca147ea98f8 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -2231,9 +2231,6 @@ prepareLLVMModule(Operation *m, llvm::LLVMContext &llvmContext,
                   StringRef name) {
   m->getContext()->getOrLoadDialect<LLVM::LLVMDialect>();
   auto llvmModule = std::make_unique<llvm::Module>(name, llvmContext);
-  // ModuleTranslation can currently only construct modules in the old debug
-  // info format, so set the flag accordingly.
-  llvmModule->setNewDbgInfoFormatFlag(false);
   if (auto dataLayoutAttr =
           m->getDiscardableAttr(LLVM::LLVMDialect::getDataLayoutAttrName())) {
     llvmModule->setDataLayout(cast<StringAttr>(dataLayoutAttr).getValue());
@@ -2329,7 +2326,7 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext,
   // Once we've finished constructing elements in the module, we should convert
   // it to use the debug info format desired by LLVM.
   // See https://llvm.org/docs/RemoveDIsDebugInfo.html
-  translator.llvmModule->setIsNewDbgInfoFormat(true);
+  translator.llvmModule->convertToNewDbgValues();
 
   // Add the necessary debug info module flags, if they were not encoded in MLIR
   // beforehand.

From 013034cd0f5ae19ef02fc35a83362874e727f13c Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Thu, 12 Jun 2025 12:04:41 +0100
Subject: [PATCH 215/851] Follow-up to 97ac6483aae, squelch an unused lambda
 capture warning

NewBB here was being captured for some code that was deleted in
97ac6483aae, and that leads to some warnings on some compilers.
---
 llvm/lib/Transforms/Utils/CloneFunction.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 510d9f97bf8c6..fccb73a36b182 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -544,7 +544,7 @@ void PruningFunctionCloner::CloneBlock(
   // Keep a cursor pointing at the last place we cloned debug-info records from.
   BasicBlock::const_iterator DbgCursor = StartingInst;
   auto CloneDbgRecordsToHere =
-      [NewBB, &DbgCursor](Instruction *NewInst, BasicBlock::const_iterator II) {
+      [&DbgCursor](Instruction *NewInst, BasicBlock::const_iterator II) {
         // Clone debug-info records onto this instruction. Iterate through any
         // source-instructions we've cloned and then subsequently optimised
         // away, so that their debug-info doesn't go missing.

From d698ede748e66f5519cb8481abc2df89a994a059 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Thu, 12 Jun 2025 13:45:19 +0200
Subject: [PATCH 216/851] [mlir][amx] Restore conversion interface for AMX
 (#143871)

Restores mistakenly removed AMX interface which ensures that the custom
tile type is converted to its LLVM equivalent within other operations
such as control flow.

Fix after #140559
---
 mlir/include/mlir/Dialect/AMX/Transforms.h    |  3 +++
 mlir/include/mlir/InitAllExtensions.h         |  2 ++
 .../AMX/Transforms/LegalizeForLLVMExport.cpp  | 19 ++++++++++++++++++
 mlir/test/Target/LLVMIR/amx.mlir              | 20 +++++++++++++++++++
 4 files changed, 44 insertions(+)

diff --git a/mlir/include/mlir/Dialect/AMX/Transforms.h b/mlir/include/mlir/Dialect/AMX/Transforms.h
index 4a751d99ceeee..7391ec2ff6b14 100644
--- a/mlir/include/mlir/Dialect/AMX/Transforms.h
+++ b/mlir/include/mlir/Dialect/AMX/Transforms.h
@@ -25,6 +25,9 @@ void populateAMXLegalizeForLLVMExportPatterns(LLVMTypeConverter &converter,
 /// intrinsics.
 void configureAMXLegalizeForExportTarget(LLVMConversionTarget &target);
 
+/// Register LLVM conversion interface for AMX dialect.
+void registerConvertAMXToLLVMInterface(DialectRegistry &registry);
+
 } // namespace mlir
 
 #endif // MLIR_DIALECT_AMX_TRANSFORMS_H
diff --git a/mlir/include/mlir/InitAllExtensions.h b/mlir/include/mlir/InitAllExtensions.h
index 7dcbabe8aafa3..f356b91b1b6c0 100644
--- a/mlir/include/mlir/InitAllExtensions.h
+++ b/mlir/include/mlir/InitAllExtensions.h
@@ -32,6 +32,7 @@
 #include "mlir/Conversion/SCFToEmitC/SCFToEmitC.h"
 #include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
+#include "mlir/Dialect/AMX/Transforms.h"
 #include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
 #include "mlir/Dialect/ArmNeon/TransformOps/ArmNeonVectorTransformOps.h"
 #include "mlir/Dialect/ArmSVE/TransformOps/ArmSVEVectorTransformOps.h"
@@ -85,6 +86,7 @@ inline void registerAllExtensions(DialectRegistry &registry) {
   registerConvertOpenMPToLLVMInterface(registry);
   registerConvertSCFToEmitCInterface(registry);
   ub::registerConvertUBToLLVMInterface(registry);
+  registerConvertAMXToLLVMInterface(registry);
   gpu::registerConvertGpuToLLVMInterface(registry);
   NVVM::registerConvertGpuToNVVMInterface(registry);
   vector::registerConvertVectorToLLVMInterface(registry);
diff --git a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
index 7471dc797e0fc..37aebc9fab3eb 100644
--- a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
@@ -60,3 +60,22 @@ void mlir::populateAMXLegalizeForLLVMExportPatterns(
 void mlir::configureAMXLegalizeForExportTarget(LLVMConversionTarget &target) {
   target.addIllegalDialect<AMXDialect>();
 }
+
+namespace {
+/// Implement the interface to convert AMX to LLVM.
+struct AMXToLLVMDialectInterface : public ConvertToLLVMPatternInterface {
+  using ConvertToLLVMPatternInterface::ConvertToLLVMPatternInterface;
+
+  void populateConvertToLLVMConversionPatterns(
+      ConversionTarget &target, LLVMTypeConverter &typeConverter,
+      RewritePatternSet &patterns) const final {
+    populateAMXLegalizeForLLVMExportPatterns(typeConverter, patterns);
+  }
+};
+} // namespace
+
+void mlir::registerConvertAMXToLLVMInterface(DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, amx::AMXDialect *dialect) {
+    dialect->addInterfaces<AMXToLLVMDialectInterface>();
+  });
+}
diff --git a/mlir/test/Target/LLVMIR/amx.mlir b/mlir/test/Target/LLVMIR/amx.mlir
index 094475040436d..abdf2fe3bd534 100644
--- a/mlir/test/Target/LLVMIR/amx.mlir
+++ b/mlir/test/Target/LLVMIR/amx.mlir
@@ -88,3 +88,23 @@ func.func @amx_tile_muli(%matA: memref<?x?xi8>, %matB: memref<?x?xi8>,
   amx.tile_store %out[%c16, %c16], %res3 : memref<?x?xi8>, !amx.tile<16x16xi32>
   return
 }
+
+// CHECK-LABEL: define void @amx_tile_type_through_cf
+func.func @amx_tile_type_through_cf(%src: memref<?x?xi8>, %out: memref<?x?xi8>,
+    %idx: index, %cond: i1) {
+  cf.cond_br %cond, ^bb1, ^bb2
+^bb1:  // pred: ^bb0
+  // CHECK: call x86_amx @llvm.x86.tileloadd64.internal
+  %0 = amx.tile_load %src[%idx, %idx] : memref<?x?xi8> into !amx.tile<16x64xi8>
+  cf.br ^bb3(%0 : !amx.tile<16x64xi8>)
+^bb2:  // pred: ^bb0
+  // CHECK: call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
+  %1 = amx.tile_zero : !amx.tile<16x64xi8>
+  cf.br ^bb3(%1 : !amx.tile<16x64xi8>)
+^bb3(%2: !amx.tile<16x64xi8>):  // 2 preds: ^bb1, ^bb2
+  cf.br ^bb4
+^bb4:  // pred: ^bb3
+  // CHECK: call void @llvm.x86.tilestored64.internal
+  amx.tile_store %out[%idx, %idx], %2 : memref<?x?xi8>, !amx.tile<16x64xi8>
+  return
+}

From 0604dc199c019b23746f4a54885ba0c75569cdae Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@chromium.org>
Date: Thu, 12 Jun 2025 13:44:18 +0200
Subject: [PATCH 217/851] Revert "[VPlan] Set branch weight metadata on middle
 term in VPlan (NFC) (#143035)"

This caused assertion failures:

  llvm/lib/Transforms/Vectorize/VPlan.h:4021:
  llvm::VPBasicBlock* llvm::VPlan::getMiddleBlock():
  Assertion `LoopRegion && "cannot call the function after vector loop region has been removed"' failed.

See comment on the PR.

> Manage branch weights for the BranchOnCond in the middle block in VPlan.
> This requires updating VPInstruction to inherit from VPIRMetadata, which
> in general makes sense as there are a number of opcodes that could take
> metadata.
>
> There are other branches (part of the skeleton) that also need branch
> weights adding.
>
> PR: https://github.com/llvm/llvm-project/pull/143035

This reverts commit db8d34db26e9ea92c08d6e813eca9cce40c48478.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 48 +++++++----------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 53 +++++++++----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 +--
 3 files changed, 45 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 474f856d20461..8177b76ad5bdf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7272,33 +7272,6 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
       BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
 }
 
-/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
-/// BranchOnCond recipe.
-static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF,
-                                              Loop *OrigLoop) {
-  // 4. Adjust branch weight of the branch in the middle block.
-  Instruction *LatchTerm = OrigLoop->getLoopLatch()->getTerminator();
-  if (!hasBranchWeightMD(*LatchTerm))
-    return;
-
-  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
-  auto *MiddleTerm =
-      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
-  // Only add branch metadata if there is a (conditional) terminator.
-  if (!MiddleTerm)
-    return;
-
-  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
-         "must have a BranchOnCond");
-  // Assume that `Count % VectorTripCount` is equally distributed.
-  unsigned TripCount = Plan.getUF() * VF.getKnownMinValue();
-  assert(TripCount > 0 && "trip count should not be zero");
-  MDBuilder MDB(LatchTerm->getContext());
-  MDNode *BranchWeights =
-      MDB.createBranchWeights({1, TripCount - 1}, /*IsExpected=*/false);
-  MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
-}
-
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
     InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
@@ -7321,8 +7294,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan,
                                             *Legal->getWidestInductionType());
-
-  addBranchWeightToMiddleTerminator(BestVPlan, BestVF, OrigLoop);
+  // Retrieve and store the middle block before dissolving regions. Regions are
+  // dissolved after optimizing for VF and UF, which completely removes unneeded
+  // loop regions first.
+  VPBasicBlock *MiddleVPBB =
+      BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7465,6 +7441,20 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   ILV.printDebugTracesAtEnd();
 
+  // 4. Adjust branch weight of the branch in the middle block.
+  if (HeaderVPBB) {
+    auto *MiddleTerm =
+        cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
+    if (MiddleTerm->isConditional() &&
+        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      const uint32_t Weights[] = {1, TripCount - 1};
+      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
+    }
+  }
+
   return ExpandedSCEVs;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 468284168e9ca..acc861b991975 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,39 +882,11 @@ template <unsigned PartOpIdx> class VPUnrollPartAccessor {
   unsigned getUnrollPart(VPUser &U) const;
 };
 
-/// Helper to manage IR metadata for recipes. It filters out metadata that
-/// cannot be propagated.
-class VPIRMetadata {
-  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
-
-public:
-  VPIRMetadata() {}
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I.
-  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
-  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
-
-  /// Copy constructor for cloning.
-  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
-
-  /// Add all metadata to \p I.
-  void applyMetadata(Instruction &I) const;
-
-  void addMetadata(unsigned Kind, MDNode *Node) {
-    Metadata.emplace_back(Kind, Node);
-  }
-};
-
 /// This is a concrete Recipe that models a single VPlan-level instruction.
 /// While as any Recipe it may generate a sequence of IR instructions when
 /// executed, these instructions would always form a single-def expression as
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPRecipeWithIRFlags,
-                      public VPIRMetadata,
                       public VPUnrollPartAccessor<1> {
   friend class VPlanSlp;
 
@@ -1004,7 +976,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
                 const Twine &Name = "")
       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
-        VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
+        Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 const VPIRFlags &Flags, DebugLoc DL = {},
@@ -1296,6 +1268,29 @@ struct VPIRPhi : public VPIRInstruction, public VPPhiAccessors {
   const VPRecipeBase *getAsRecipe() const override { return this; }
 };
 
+/// Helper to manage IR metadata for recipes. It filters out metadata that
+/// cannot be propagated.
+class VPIRMetadata {
+  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
+
+public:
+  VPIRMetadata() {}
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I.
+  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
+  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
+
+  /// Copy constructor for cloning.
+  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
+
+  /// Add all metadata to \p I.
+  void applyMetadata(Instruction &I) const;
+};
+
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
 /// opcode and operands of the recipe. This recipe covers most of the
 /// traditional vectorization cases where each recipe transforms into a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 8863a3fb4b31d..aa6b13c217bd1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -410,7 +410,7 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                              const VPIRFlags &Flags, DebugLoc DL,
                              const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
+      Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
 }
@@ -591,9 +591,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case VPInstruction::BranchOnCond: {
     Value *Cond = State.get(getOperand(0), VPLane(0));
-    auto *Br = createCondBranch(Cond, getParent(), State);
-    applyMetadata(*Br);
-    return Br;
+    return createCondBranch(Cond, getParent(), State);
   }
   case VPInstruction::BranchOnCount: {
     // First create the compare.

From 9f542f14701cdf70023790b206273ae8174e913a Mon Sep 17 00:00:00 2001
From: Ryan Buchner <92571492+bababuck@users.noreply.github.com>
Date: Thu, 12 Jun 2025 05:05:53 -0700
Subject: [PATCH 218/851] [RISCV] Add new tests for RISCV zicond extension
 (#143580)

I have a few patches to improve compilation for these tests which I will
be posting as separate MRs.
---
 llvm/test/CodeGen/RISCV/zicond-opts.ll | 289 +++++++++++++++++++++++++
 1 file changed, 289 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/zicond-opts.ll

diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll
new file mode 100644
index 0000000000000..f5a25868bd12b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll
@@ -0,0 +1,289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -O2 -verify-machineinstrs -mattr=+b,+zicond < %s | FileCheck %s -check-prefix=RV32ZICOND
+; RUN: llc -mtriple=riscv64 -O2 -verify-machineinstrs -mattr=+b,+zicond < %s | FileCheck %s -check-prefix=RV64ZICOND
+
+; (and (icmp x. 0, ne), (icmp y, 0, ne)) -> (czero.eqz (icmp x, 0, ne), y)
+define i32 @icmp_and(i64 %x, i64 %y) {
+; RV32ZICOND-LABEL: icmp_and:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    or a2, a2, a3
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    snez a1, a2
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    and a0, a0, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: icmp_and:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    snez a1, a1
+; RV64ZICOND-NEXT:    snez a0, a0
+; RV64ZICOND-NEXT:    and a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+  %3 = icmp ne i64 %y, 0
+  %4 = icmp ne i64 %x, 0
+  %5 = and i1 %4, %3
+  %6 = zext i1 %5 to i32
+  ret i32 %6
+}
+
+; (and (and (icmp x, 0, ne), (icmp y, 0, ne)), (icmp z, 0, ne)) -> (czero.eqz (czero.eqz (icmp x, 0, ne), y), z)
+define i32 @icmp_and_and(i64 %x, i64 %y, i64 %z) {
+; RV32ZICOND-LABEL: icmp_and_and:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    or a2, a2, a3
+; RV32ZICOND-NEXT:    or a0, a0, a1
+; RV32ZICOND-NEXT:    or a4, a4, a5
+; RV32ZICOND-NEXT:    snez a1, a2
+; RV32ZICOND-NEXT:    snez a0, a0
+; RV32ZICOND-NEXT:    and a0, a1, a0
+; RV32ZICOND-NEXT:    snez a1, a4
+; RV32ZICOND-NEXT:    and a0, a1, a0
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: icmp_and_and:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    snez a1, a1
+; RV64ZICOND-NEXT:    snez a0, a0
+; RV64ZICOND-NEXT:    and a0, a1, a0
+; RV64ZICOND-NEXT:    snez a1, a2
+; RV64ZICOND-NEXT:    and a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %4 = icmp ne i64 %y, 0
+  %5 = icmp ne i64 %x, 0
+  %6 = and i1 %4, %5
+  %7 = icmp ne i64 %z, 0
+  %8 = and i1 %7, %6
+  %9 = zext i1 %8 to i32
+  ret i32 %9
+}
+
+; (select cond, x, rotl(x, rot.amt)) -> (rotl x, (czero_nez rot.amt, cond))
+define i64 @rotate_l_nez(i64 %x, i64 %rot.amt, i1 %cond) {
+; RV32ZICOND-LABEL: rotate_l_nez:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    bexti a3, a2, 5
+; RV32ZICOND-NEXT:    not a5, a2
+; RV32ZICOND-NEXT:    czero.nez a6, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a7, a0, a3
+; RV32ZICOND-NEXT:    czero.nez t0, a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a3, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a0, a0, a4
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a4
+; RV32ZICOND-NEXT:    or a6, a7, a6
+; RV32ZICOND-NEXT:    or a3, a3, t0
+; RV32ZICOND-NEXT:    sll a7, a6, a2
+; RV32ZICOND-NEXT:    srli t0, a3, 1
+; RV32ZICOND-NEXT:    sll a2, a3, a2
+; RV32ZICOND-NEXT:    srli a3, a6, 1
+; RV32ZICOND-NEXT:    srl a6, t0, a5
+; RV32ZICOND-NEXT:    srl a3, a3, a5
+; RV32ZICOND-NEXT:    or a5, a7, a6
+; RV32ZICOND-NEXT:    or a2, a2, a3
+; RV32ZICOND-NEXT:    czero.nez a2, a2, a4
+; RV32ZICOND-NEXT:    czero.nez a3, a5, a4
+; RV32ZICOND-NEXT:    or a0, a0, a2
+; RV32ZICOND-NEXT:    or a1, a1, a3
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: rotate_l_nez:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a2, a2, 1
+; RV64ZICOND-NEXT:    rol a1, a0, a1
+; RV64ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a0, a1
+; RV64ZICOND-NEXT:    ret
+  %6 = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %rot.amt)
+  %7 = select i1 %cond, i64 %x, i64 %6
+  ret i64 %7
+}
+
+; (select cond, rotl(x, rot.amt), x) -> (rotl x, (czero_eqz rot.amt, cond))
+define i64 @rotate_l_eqz(i64 %x, i64 %rot.amt, i1 %cond) {
+; RV32ZICOND-LABEL: rotate_l_eqz:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    bexti a3, a2, 5
+; RV32ZICOND-NEXT:    not a5, a2
+; RV32ZICOND-NEXT:    czero.nez a6, a1, a3
+; RV32ZICOND-NEXT:    czero.eqz a7, a0, a3
+; RV32ZICOND-NEXT:    czero.nez t0, a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a3, a1, a3
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a4
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a4
+; RV32ZICOND-NEXT:    or a6, a7, a6
+; RV32ZICOND-NEXT:    or a3, a3, t0
+; RV32ZICOND-NEXT:    sll a7, a6, a2
+; RV32ZICOND-NEXT:    srli t0, a3, 1
+; RV32ZICOND-NEXT:    sll a2, a3, a2
+; RV32ZICOND-NEXT:    srli a3, a6, 1
+; RV32ZICOND-NEXT:    srl a6, t0, a5
+; RV32ZICOND-NEXT:    srl a3, a3, a5
+; RV32ZICOND-NEXT:    or a5, a7, a6
+; RV32ZICOND-NEXT:    or a2, a2, a3
+; RV32ZICOND-NEXT:    czero.eqz a2, a2, a4
+; RV32ZICOND-NEXT:    czero.eqz a3, a5, a4
+; RV32ZICOND-NEXT:    or a0, a2, a0
+; RV32ZICOND-NEXT:    or a1, a3, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: rotate_l_eqz:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a2, a2, 1
+; RV64ZICOND-NEXT:    rol a1, a0, a1
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV64ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %6 = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %rot.amt)
+  %7 = select i1 %cond, i64 %6, i64 %x
+  ret i64 %7
+}
+
+; (select cond, const, t) -> (add (czero_nez t - const, cond), const)
+define i64 @select_imm_reg(i64 %t, i1 %cond) {
+; RV32ZICOND-LABEL: select_imm_reg:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a2, a2, 1
+; RV32ZICOND-NEXT:    li a3, 3
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
+; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: select_imm_reg:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    li a2, 3
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
+; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %4 = select i1 %cond, i64 3, i64 %t
+  ret i64 %4
+}
+
+; (select cond, t, const) -> (add (czero_eqz t - const, cond), const)
+define i64 @select_reg_imm(i64 %t, i1 %cond) {
+; RV32ZICOND-LABEL: select_reg_imm:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a2, a2, 1
+; RV32ZICOND-NEXT:    li a3, 3
+; RV32ZICOND-NEXT:    czero.nez a3, a3, a2
+; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV32ZICOND-NEXT:    or a0, a0, a3
+; RV32ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: select_reg_imm:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    li a2, 3
+; RV64ZICOND-NEXT:    czero.nez a2, a2, a1
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a1
+; RV64ZICOND-NEXT:    or a0, a0, a2
+; RV64ZICOND-NEXT:    ret
+  %4 = select i1 %cond, i64 %t, i64 3
+  ret i64 %4
+}
+
+; (select cond, -2048, t) -> (xor (czero_nez (xor t, -2048), cond), -2048)
+define i64 @select_imm_reg_neg_2048(i64 %t, i1 %cond) {
+; RV32ZICOND-LABEL: select_imm_reg_neg_2048:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a2, a2, 1
+; RV32ZICOND-NEXT:    li a3, -2048
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
+; RV32ZICOND-NEXT:    neg a2, a2
+; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    or a1, a2, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: select_imm_reg_neg_2048:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    li a2, -2048
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
+; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %4 = select i1 %cond, i64 -2048, i64 %t
+  ret i64 %4
+}
+
+; (select cond, 2048, t) -> no transform
+define i64 @select_imm_reg_2048(i64 %t, i1 %cond) {
+; RV32ZICOND-LABEL: select_imm_reg_2048:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a2, a2, 1
+; RV32ZICOND-NEXT:    bseti a3, zero, 11
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
+; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: select_imm_reg_2048:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a1, a1, 1
+; RV64ZICOND-NEXT:    bseti a2, zero, 11
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
+; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %4 = select i1 %cond, i64 2048, i64 %t
+  ret i64 %4
+}
+
+; (select cond, (and f, ~x), f) -> (andn f, (czero_eqz x, cond))
+define i64 @test_inv_and_nez(i64 %f, i64 %x, i1 %cond) {
+; RV32ZICOND-LABEL: test_inv_and_nez:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    andi a4, a4, 1
+; RV32ZICOND-NEXT:    addi a4, a4, -1
+; RV32ZICOND-NEXT:    orn a3, a4, a3
+; RV32ZICOND-NEXT:    orn a2, a4, a2
+; RV32ZICOND-NEXT:    and a0, a2, a0
+; RV32ZICOND-NEXT:    and a1, a3, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: test_inv_and_nez:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a2, a2, 1
+; RV64ZICOND-NEXT:    andn a1, a0, a1
+; RV64ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %5 = xor i64 %x, -1
+  %6 = select i1 %cond, i64 %5, i64 -1
+  %7 = and i64 %6, %f
+  ret i64 %7
+}
+
+; (select cond, f, (and f, ~x)) -> (andn f, (czero_nez x, cond))
+define i64 @test_inv_and_eqz(i64 %f, i64 %x, i1 %cond) {
+; RV32ZICOND-LABEL: test_inv_and_eqz:
+; RV32ZICOND:       # %bb.0:
+; RV32ZICOND-NEXT:    slli a4, a4, 31
+; RV32ZICOND-NEXT:    srai a4, a4, 31
+; RV32ZICOND-NEXT:    orn a3, a4, a3
+; RV32ZICOND-NEXT:    orn a2, a4, a2
+; RV32ZICOND-NEXT:    and a0, a2, a0
+; RV32ZICOND-NEXT:    and a1, a3, a1
+; RV32ZICOND-NEXT:    ret
+;
+; RV64ZICOND-LABEL: test_inv_and_eqz:
+; RV64ZICOND:       # %bb.0:
+; RV64ZICOND-NEXT:    andi a2, a2, 1
+; RV64ZICOND-NEXT:    andn a1, a0, a1
+; RV64ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    ret
+  %5 = xor i64 %x, -1
+  %6 = select i1 %cond, i64 -1, i64 %5
+  %7 = and i64 %6, %f
+  ret i64 %7
+}

From 2ecbfc0beb42abbbd2c3d28bfd576b38c44a5b46 Mon Sep 17 00:00:00 2001
From: Ami-zhang <zhanglimin@loongson.cn>
Date: Thu, 12 Jun 2025 20:11:14 +0800
Subject: [PATCH 219/851] [LoongArch] Fix '-mno-lsx' option not disabling LASX
 feature (#143821)

When '-march' with LASX feature and '-mno-lsx' options are used
together, '-mno-lsx' fails to disable LASX, leaving
'HasFeatureLASX=true' and causing incorrect '__loongarch_sx/asx=1' macro
definition.

Fixes https://github.com/loongson-community/discussions/issues/95
---
 clang/lib/Driver/ToolChains/Arch/LoongArch.cpp | 1 +
 clang/test/Preprocessor/init-loongarch.c       | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 3318e498a74f9..33a655870b01b 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -252,6 +252,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
         Features.push_back("+lsx");
     } else /*-mno-lsx*/ {
       Features.push_back("-lsx");
+      Features.push_back("-lasx");
     }
   }
 
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index ac461b371162f..71a266b8a9157 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -946,6 +946,10 @@
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // RUN: %clang --target=loongarch64 -mno-lasx -mno-lsx -x c -E -dM %s -o - \
 // RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
+// RUN: %clang --target=loongarch64 -march=la464 -mno-lsx -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
+// RUN: %clang --target=loongarch64 -mno-lsx -march=la464 -x c -E -dM %s -o - \
+// RUN:   | FileCheck --match-full-lines --check-prefix=MNO-LSX %s
 // MNO-LSX-NOT: #define __loongarch_asx
 // MNO-LSX-NOT: #define __loongarch_simd_width
 // MNO-LSX-NOT: #define __loongarch_sx

From bc7fafbeea08bf8cd9a18fa10d3d3bc63f0c45a3 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 14:13:15 +0200
Subject: [PATCH 220/851] [AA] Take read-only provenance captures into account
 (#143097)

Update the AA CaptureAnalysis providers to return CaptureComponents, so
we can distinguish between full provenance and read-only provenance
captures.

Use this to restrict "other" memory effects on call from ModRef to Ref.

Ideally we would also apply the same reasoning for escape sources, but
the current API cannot actually convey the necessary information (we can
only say NoAlias or MayAlias, not MayAlias but only via Ref).
---
 llvm/include/llvm/Analysis/AliasAnalysis.h    | 29 ++++---
 llvm/include/llvm/Analysis/CaptureTracking.h  | 26 +++---
 llvm/lib/Analysis/BasicAliasAnalysis.cpp      | 86 +++++++++++--------
 llvm/lib/Analysis/CaptureTracking.cpp         | 11 ++-
 .../Scalar/DeadStoreElimination.cpp           |  3 +-
 llvm/test/Analysis/BasicAA/captures.ll        |  3 +-
 llvm/test/Transforms/GVN/captures.ll          |  3 +-
 7 files changed, 89 insertions(+), 72 deletions(-)

diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index 0e736b92e550e..b7d1251aeb723 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -149,23 +149,24 @@ LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, AliasResult AR);
 struct LLVM_ABI CaptureAnalysis {
   virtual ~CaptureAnalysis() = 0;
 
-  /// Check whether Object is not captured before instruction I. If OrAt is
-  /// true, captures by instruction I itself are also considered.
+  /// Return how Object may be captured before instruction I, considering only
+  /// provenance captures. If OrAt is true, captures by instruction I itself
+  /// are also considered.
   ///
   /// If I is nullptr, then captures at any point will be considered.
-  virtual bool isNotCapturedBefore(const Value *Object, const Instruction *I,
-                                   bool OrAt) = 0;
+  virtual CaptureComponents
+  getCapturesBefore(const Value *Object, const Instruction *I, bool OrAt) = 0;
 };
 
 /// Context-free CaptureAnalysis provider, which computes and caches whether an
 /// object is captured in the function at all, but does not distinguish whether
 /// it was captured before or after the context instruction.
 class LLVM_ABI SimpleCaptureAnalysis final : public CaptureAnalysis {
-  SmallDenseMap<const Value *, bool, 8> IsCapturedCache;
+  SmallDenseMap<const Value *, CaptureComponents, 8> IsCapturedCache;
 
 public:
-  bool isNotCapturedBefore(const Value *Object, const Instruction *I,
-                           bool OrAt) override;
+  CaptureComponents getCapturesBefore(const Value *Object, const Instruction *I,
+                                      bool OrAt) override;
 };
 
 /// Context-sensitive CaptureAnalysis provider, which computes and caches the
@@ -176,10 +177,12 @@ class LLVM_ABI EarliestEscapeAnalysis final : public CaptureAnalysis {
   const LoopInfo *LI;
 
   /// Map from identified local object to an instruction before which it does
-  /// not escape, or nullptr if it never escapes. The "earliest" instruction
-  /// may be a conservative approximation, e.g. the first instruction in the
-  /// function is always a legal choice.
-  DenseMap<const Value *, Instruction *> EarliestEscapes;
+  /// not escape (or nullptr if it never escapes) and the possible components
+  /// that may be captured (by any instruction, not necessarily the earliest
+  /// one). The "earliest" instruction may be a conservative approximation,
+  /// e.g. the first instruction in the function is always a legal choice.
+  DenseMap<const Value *, std::pair<Instruction *, CaptureComponents>>
+      EarliestEscapes;
 
   /// Reverse map from instruction to the objects it is the earliest escape for.
   /// This is used for cache invalidation purposes.
@@ -189,8 +192,8 @@ class LLVM_ABI EarliestEscapeAnalysis final : public CaptureAnalysis {
   EarliestEscapeAnalysis(DominatorTree &DT, const LoopInfo *LI = nullptr)
       : DT(DT), LI(LI) {}
 
-  bool isNotCapturedBefore(const Value *Object, const Instruction *I,
-                           bool OrAt) override;
+  CaptureComponents getCapturesBefore(const Value *Object, const Instruction *I,
+                                      bool OrAt) override;
 
   void removeInstruction(Instruction *I);
 };
diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h
index dd6a7f9b14dc6..e652bc5a0a5a6 100644
--- a/llvm/include/llvm/Analysis/CaptureTracking.h
+++ b/llvm/include/llvm/Analysis/CaptureTracking.h
@@ -95,21 +95,21 @@ namespace llvm {
       function_ref<bool(CaptureComponents)> StopFn = capturesAnything,
       const LoopInfo *LI = nullptr, unsigned MaxUsesToExplore = 0);
 
-  // Returns the 'earliest' instruction that captures \p V in \F. An instruction
-  // A is considered earlier than instruction B, if A dominates B. If 2 escapes
-  // do not dominate each other, the terminator of the common dominator is
-  // chosen. If not all uses can be analyzed, the earliest escape is set to
-  // the first instruction in the function entry block. If \p V does not escape,
-  // nullptr is returned. Note that the caller of the function has to ensure
-  // that the instruction the result value is compared against is not in a
-  // cycle.
+  // Returns the 'earliest' instruction that captures \p V in \F, and which
+  // components may be captured (by any use, not necessarily the earliest one).
+  // An instruction A is considered earlier than instruction B, if A dominates
+  // B. If 2 escapes do not dominate each other, the terminator of the common
+  // dominator is chosen. If not all uses can be analyzed, the earliest escape
+  // is set to the first instruction in the function entry block. If \p V does
+  // not escape, nullptr is returned. Note that the caller of the function has
+  // to ensure that the instruction the result value is compared against is
+  // not in a cycle.
   //
   // Only consider components that are part of \p Mask.
-  LLVM_ABI Instruction *FindEarliestCapture(const Value *V, Function &F,
-                                            bool ReturnCaptures,
-                                            const DominatorTree &DT,
-                                            CaptureComponents Mask,
-                                            unsigned MaxUsesToExplore = 0);
+  LLVM_ABI std::pair<Instruction *, CaptureComponents>
+  FindEarliestCapture(const Value *V, Function &F, bool ReturnCaptures,
+                      const DominatorTree &DT, CaptureComponents Mask,
+                      unsigned MaxUsesToExplore = 0);
 
   /// Capture information for a specific Use.
   struct UseCaptureInfo {
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index f862d6930f545..31611dfe4fd2f 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -192,18 +192,20 @@ static bool areBothVScale(const Value *V1, const Value *V2) {
 
 CaptureAnalysis::~CaptureAnalysis() = default;
 
-bool SimpleCaptureAnalysis::isNotCapturedBefore(const Value *Object,
-                                                const Instruction *I,
-                                                bool OrAt) {
+CaptureComponents SimpleCaptureAnalysis::getCapturesBefore(const Value *Object,
+                                                           const Instruction *I,
+                                                           bool OrAt) {
   if (!isIdentifiedFunctionLocal(Object))
-    return false;
+    return CaptureComponents::Provenance;
 
-  auto [CacheIt, Inserted] = IsCapturedCache.insert({Object, false});
+  auto [CacheIt, Inserted] =
+      IsCapturedCache.insert({Object, CaptureComponents::Provenance});
   if (!Inserted)
     return CacheIt->second;
 
-  bool Ret = !capturesAnything(PointerMayBeCaptured(
-      Object, /*ReturnCaptures=*/false, CaptureComponents::Provenance));
+  CaptureComponents Ret = PointerMayBeCaptured(
+      Object, /*ReturnCaptures=*/false, CaptureComponents::Provenance,
+      [](CaptureComponents CC) { return capturesFullProvenance(CC); });
   CacheIt->second = Ret;
   return Ret;
 }
@@ -216,37 +218,44 @@ static bool isNotInCycle(const Instruction *I, const DominatorTree *DT,
          !isPotentiallyReachableFromMany(Succs, BB, nullptr, DT, LI);
 }
 
-bool EarliestEscapeAnalysis::isNotCapturedBefore(const Value *Object,
-                                                 const Instruction *I,
-                                                 bool OrAt) {
+CaptureComponents
+EarliestEscapeAnalysis::getCapturesBefore(const Value *Object,
+                                          const Instruction *I, bool OrAt) {
   if (!isIdentifiedFunctionLocal(Object))
-    return false;
+    return CaptureComponents::Provenance;
 
   auto Iter = EarliestEscapes.try_emplace(Object);
   if (Iter.second) {
-    Instruction *EarliestCapture = FindEarliestCapture(
-        Object, *const_cast<Function *>(DT.getRoot()->getParent()),
-        /*ReturnCaptures=*/false, DT, CaptureComponents::Provenance);
-    if (EarliestCapture)
-      Inst2Obj[EarliestCapture].push_back(Object);
+    std::pair<Instruction *, CaptureComponents> EarliestCapture =
+        FindEarliestCapture(
+            Object, *const_cast<Function *>(DT.getRoot()->getParent()),
+            /*ReturnCaptures=*/false, DT, CaptureComponents::Provenance);
+    if (EarliestCapture.first)
+      Inst2Obj[EarliestCapture.first].push_back(Object);
     Iter.first->second = EarliestCapture;
   }
 
-  // No capturing instruction.
-  if (!Iter.first->second)
-    return true;
-
-  // No context instruction means any use is capturing.
-  if (!I)
-    return false;
+  auto IsNotCapturedBefore = [&]() {
+    // No capturing instruction.
+    Instruction *CaptureInst = Iter.first->second.first;
+    if (!CaptureInst)
+      return true;
 
-  if (I == Iter.first->second) {
-    if (OrAt)
+    // No context instruction means any use is capturing.
+    if (!I)
       return false;
-    return isNotInCycle(I, &DT, LI);
-  }
 
-  return !isPotentiallyReachable(Iter.first->second, I, nullptr, &DT, LI);
+    if (I == CaptureInst) {
+      if (OrAt)
+        return false;
+      return isNotInCycle(I, &DT, LI);
+    }
+
+    return !isPotentiallyReachable(CaptureInst, I, nullptr, &DT, LI);
+  };
+  if (IsNotCapturedBefore())
+    return CaptureComponents::None;
+  return Iter.first->second.second;
 }
 
 void EarliestEscapeAnalysis::removeInstruction(Instruction *I) {
@@ -946,9 +955,14 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
   // As an exception, ignore allocas, as setjmp is not required to preserve
   // non-volatile stores for them.
   if (isModOrRefSet(OtherMR) && !isa<Constant>(Object) && Call != Object &&
-      AAQI.CA->isNotCapturedBefore(Object, Call, /*OrAt=*/false) &&
-      (isa<AllocaInst>(Object) || !Call->hasFnAttr(Attribute::ReturnsTwice)))
-    OtherMR = ModRefInfo::NoModRef;
+      (isa<AllocaInst>(Object) || !Call->hasFnAttr(Attribute::ReturnsTwice))) {
+    CaptureComponents CC =
+        AAQI.CA->getCapturesBefore(Object, Call, /*OrAt=*/false);
+    if (capturesNothing(CC))
+      OtherMR = ModRefInfo::NoModRef;
+    else if (capturesReadProvenanceOnly(CC))
+      OtherMR = ModRefInfo::Ref;
+  }
 
   // Refine the modref info for argument memory. We only bother to do this
   // if ArgMR is not a subset of OtherMR, otherwise this won't have an impact
@@ -1614,11 +1628,13 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size,
     // temporary store the nocapture argument's value in a temporary memory
     // location if that memory location doesn't escape. Or it may pass a
     // nocapture value to other functions as long as they don't capture it.
-    if (isEscapeSource(O1) && AAQI.CA->isNotCapturedBefore(
-                                  O2, dyn_cast<Instruction>(O1), /*OrAt*/ true))
+    if (isEscapeSource(O1) &&
+        capturesNothing(AAQI.CA->getCapturesBefore(
+            O2, dyn_cast<Instruction>(O1), /*OrAt*/ true)))
       return AliasResult::NoAlias;
-    if (isEscapeSource(O2) && AAQI.CA->isNotCapturedBefore(
-                                  O1, dyn_cast<Instruction>(O2), /*OrAt*/ true))
+    if (isEscapeSource(O2) &&
+        capturesNothing(AAQI.CA->getCapturesBefore(
+            O1, dyn_cast<Instruction>(O2), /*OrAt*/ true)))
       return AliasResult::NoAlias;
   }
 
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index d08ed17a655e4..076f4176c0219 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -249,11 +249,10 @@ bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
       capturesAnything, LI, MaxUsesToExplore));
 }
 
-Instruction *llvm::FindEarliestCapture(const Value *V, Function &F,
-                                       bool ReturnCaptures,
-                                       const DominatorTree &DT,
-                                       CaptureComponents Mask,
-                                       unsigned MaxUsesToExplore) {
+std::pair<Instruction *, CaptureComponents>
+llvm::FindEarliestCapture(const Value *V, Function &F, bool ReturnCaptures,
+                          const DominatorTree &DT, CaptureComponents Mask,
+                          unsigned MaxUsesToExplore) {
   assert(!isa<GlobalValue>(V) &&
          "It doesn't make sense to ask whether a global is captured.");
 
@@ -263,7 +262,7 @@ Instruction *llvm::FindEarliestCapture(const Value *V, Function &F,
     ++NumCapturedBefore;
   else
     ++NumNotCapturedBefore;
-  return CB.EarliestCapture;
+  return {CB.EarliestCapture, CB.CC};
 }
 
 UseCaptureInfo llvm::DetermineUseCaptureKind(const Use &U, const Value *Base) {
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 49a0c88922c3e..4a2eb9284a6ea 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -2345,7 +2345,8 @@ bool isFuncLocalAndNotCaptured(Value *Arg, const CallBase *CB,
                                EarliestEscapeAnalysis &EA) {
   const Value *UnderlyingObj = getUnderlyingObject(Arg);
   return isIdentifiedFunctionLocal(UnderlyingObj) &&
-         EA.isNotCapturedBefore(UnderlyingObj, CB, /*OrAt*/ true);
+         capturesNothing(
+             EA.getCapturesBefore(UnderlyingObj, CB, /*OrAt*/ true));
 }
 
 SmallVector<MemoryLocation, 1>
diff --git a/llvm/test/Analysis/BasicAA/captures.ll b/llvm/test/Analysis/BasicAA/captures.ll
index c212a466f8ede..c9ed1ea74be88 100644
--- a/llvm/test/Analysis/BasicAA/captures.ll
+++ b/llvm/test/Analysis/BasicAA/captures.ll
@@ -17,8 +17,7 @@ define void @address_capture() {
 
 ; CHECK-LABEL: read_only_capture
 ; CHECK: MayAlias:	i32* %a, i32* %p
-; CHECK: Both ModRef:  Ptr: i32* %a	<->  %p = call ptr @get_ptr()
-; TODO: The ModRef could be just Ref.
+; CHECK: Just Ref:  Ptr: i32* %a	<->  %p = call ptr @get_ptr()
 define void @read_only_capture() {
   %a = alloca i32
   call void @capture(ptr captures(address, read_provenance) %a)
diff --git a/llvm/test/Transforms/GVN/captures.ll b/llvm/test/Transforms/GVN/captures.ll
index ae47e92da0f2b..96fce438356c4 100644
--- a/llvm/test/Transforms/GVN/captures.ll
+++ b/llvm/test/Transforms/GVN/captures.ll
@@ -43,8 +43,7 @@ define i32 @read_provenance_capture() {
 ; CHECK-NEXT:    call void @capture(ptr captures(address, read_provenance) [[A]])
 ; CHECK-NEXT:    store i32 1, ptr [[A]], align 4
 ; CHECK-NEXT:    call void @unknown_call()
-; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    ret i32 [[V]]
+; CHECK-NEXT:    ret i32 1
 ;
   %a = alloca i32
   call void @capture(ptr captures(address, read_provenance) %a)

From 3550662c040024597485d1bfac0d733340514ae1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 12 Jun 2025 21:17:48 +0900
Subject: [PATCH 221/851] ARM: Avoid using getTargetLowering in TargetLowering
 (#143833)

This is this.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 37 ++++++++++---------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d2e910a248f23..f17eb72bb2e26 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2688,8 +2688,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         unsigned RegBegin, RegEnd;
         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
 
-        EVT PtrVT =
-            DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+        EVT PtrVT = getPointerTy(DAG.getDataLayout());
         unsigned int i, j;
         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
@@ -5024,7 +5023,7 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
 SDValue
 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
   // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+  if (!isTypeLegal(Op.getValueType()))
     return SDValue();
 
   SDValue Value, OverflowCmp;
@@ -5070,7 +5069,7 @@ static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                              SelectionDAG &DAG) const {
   // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+  if (!isTypeLegal(Op.getValueType()))
     return SDValue();
 
   SDValue LHS = Op.getOperand(0);
@@ -5168,7 +5167,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   if (Cond.getResNo() == 1 &&
       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
        Opc == ISD::USUBO)) {
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+    if (!isTypeLegal(Cond->getValueType(0)))
       return SDValue();
 
     SDValue Value, OverflowCmp;
@@ -5530,8 +5529,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (isUnsupportedFloatingType(LHS.getValueType())) {
-    DAG.getTargetLoweringInfo().softenSetCCOperands(
-        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
+    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
 
     // If softenSetCCOperands only returned one value, we should compare it to
     // zero.
@@ -5736,7 +5734,7 @@ SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
        Opc == ISD::USUBO || OptimizeMul)) {
     // Only lower legal XALUO ops.
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+    if (!isTypeLegal(Cond->getValueType(0)))
       return SDValue();
 
     // The actual operation with overflow check.
@@ -5766,8 +5764,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   if (isUnsupportedFloatingType(LHS.getValueType())) {
-    DAG.getTargetLoweringInfo().softenSetCCOperands(
-        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
+    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
 
     // If softenSetCCOperands only returned one value, we should compare it to
     // zero.
@@ -5787,7 +5784,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
        Opc == ISD::USUBO || OptimizeMul) &&
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     // Only lower legal XALUO ops.
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+    if (!isTypeLegal(LHS->getValueType(0)))
       return SDValue();
 
     // The actual operation with overflow check.
@@ -6255,7 +6252,6 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
 /// vectors), since the legalizer won't know what to do with that.
 SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                                          const ARMSubtarget *Subtarget) const {
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc dl(N);
   SDValue Op = N->getOperand(0);
 
@@ -6282,7 +6278,7 @@ SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
-  if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
+  if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
     // if we can combine the bitcast with its source.
     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
@@ -6294,7 +6290,7 @@ SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
   }
 
   // Turn f64->i64 into VMOVRRD.
-  if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
+  if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
     SDValue Cvt;
     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
         SrcVT.getVectorNumElements() > 1)
@@ -9931,7 +9927,6 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // Pair of floats / doubles used to pass the result.
   Type *RetTy = StructType::get(ArgTy, ArgTy);
@@ -9945,7 +9940,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
     const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
     const Align StackAlign = DL.getPrefTypeAlign(RetTy);
     int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
-    SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
+    SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL));
 
     ArgListEntry Entry;
     Entry.Node = SRet;
@@ -10003,7 +9998,6 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Op);
 
   const auto &DL = DAG.getDataLayout();
-  const auto &TLI = DAG.getTargetLoweringInfo();
 
   const char *Name = nullptr;
   if (Signed)
@@ -10011,7 +10005,7 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
   else
     Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
 
-  SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
+  SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
 
   ARMTargetLowering::ArgListTy Args;
 
@@ -10101,7 +10095,6 @@ void ARMTargetLowering::ExpandDIV_Windows(
     SDValue Op, SelectionDAG &DAG, bool Signed,
     SmallVectorImpl<SDValue> &Results) const {
   const auto &DL = DAG.getDataLayout();
-  const auto &TLI = DAG.getTargetLoweringInfo();
 
   assert(Op.getValueType() == MVT::i64 &&
          "unexpected type for custom lowering DIV");
@@ -10113,7 +10106,7 @@ void ARMTargetLowering::ExpandDIV_Windows(
 
   SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
   SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
-                              DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
+                              DAG.getConstant(32, dl, getPointerTy(DL)));
   Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
 
   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
@@ -10525,8 +10518,8 @@ SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
   // If we don't have instructions of this float type then soften to a libcall
   // and use SETCC instead.
   if (isUnsupportedFloatingType(LHS.getValueType())) {
-    DAG.getTargetLoweringInfo().softenSetCCOperands(
-      DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
+    softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
+                        Chain, IsSignaling);
     if (!RHS.getNode()) {
       RHS = DAG.getConstant(0, dl, LHS.getValueType());
       CC = ISD::SETNE;

From 633375a29f52504b0b23a30bb767de521dd3e2a8 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 12 Jun 2025 13:20:36 +0100
Subject: [PATCH 222/851] [llvm][DWARFLinker] Fix gcc 13 -Wuninitialized
 warnings (#143867)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A bit awkward that we have to switch from public to protected and back
again, but it seemed neater than putting OS all the way down at the
bottom. Since it is a public member that you're more likely to be
looking for.

llvm-project/llvm/lib/DWARFLinker/Parallel/OutputSections.h:157:67:
warning: member
‘llvm::dwarf_linker::parallel::SectionDescriptor::Contents’ is used
uninitialized [-Wuninitialized]

Which refers to the use in the constructor:
```
  SectionDescriptor(DebugSectionKind SectionKind, LinkingGlobalData &GlobalData,
                    dwarf::FormParams Format, llvm::endianness Endianess)
      : SectionDescriptorBase(SectionKind, Format, Endianess), OS(Contents),
```
Where Contents is passed to `OS`, before Contents has been constructed.
---
 llvm/lib/DWARFLinker/Parallel/OutputSections.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/DWARFLinker/Parallel/OutputSections.h b/llvm/lib/DWARFLinker/Parallel/OutputSections.h
index da47f53b6c3d1..5043e918013e4 100644
--- a/llvm/lib/DWARFLinker/Parallel/OutputSections.h
+++ b/llvm/lib/DWARFLinker/Parallel/OutputSections.h
@@ -181,6 +181,11 @@ struct SectionDescriptor : SectionDescriptorBase {
   /// to the debug section, corresponding to this object.
   uint64_t StartOffset = 0;
 
+protected:
+  /// Section data bits.
+  OutSectionDataTy Contents;
+
+public:
   /// Stream which stores data to the Contents.
   raw_svector_ostream OS;
 
@@ -287,9 +292,6 @@ struct SectionDescriptor : SectionDescriptorBase {
 
   LinkingGlobalData &GlobalData;
 
-  /// Section data bits.
-  OutSectionDataTy Contents;
-
   /// Some sections are generated using AsmPrinter. The real section data
   /// located inside elf file in that case. Following fields points to the
   /// real section content inside elf file.

From aac603c47800bf2e167b53ddfd3bb10be292bc53 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 12 Jun 2025 21:20:45 +0900
Subject: [PATCH 223/851] ARM: Avoid repeating hardcoded windows division
 libcall names (#143834)

This is properly set in the runtime libcall info, so query
the name.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index f17eb72bb2e26..5b3664c4e961f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -9998,13 +9998,13 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
   SDLoc dl(Op);
 
   const auto &DL = DAG.getDataLayout();
-
-  const char *Name = nullptr;
+  RTLIB::Libcall LC;
   if (Signed)
-    Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64";
+    LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
   else
-    Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64";
+    LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
 
+  const char *Name = getLibcallName(LC);
   SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
 
   ARMTargetLowering::ArgListTy Args;

From b9793118423f928b8dcda933aa581f3904ae2b68 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Thu, 12 Jun 2025 14:21:29 +0200
Subject: [PATCH 224/851] [libc++] Remove allocator support from std::function
 (#140395)

The allocator support was removed in P0302R1, since it was impossible to
implement. We're currently providing the API for this, but ignore the
allocator in all cases but one (which is almost certainly an oversight).
That case is the `function(allocator_arg_t, const Alloc&, Func)`
constuctor. IMO we should remove the API entirely at a later date, but
this only removes most of the code for now, leaving only the public
functions. This not only simplifies the code quite a bit, but also
results in the constructor being instantiated ~8x faster.

Fixes #133901
---
 libcxx/docs/ReleaseNotes/21.rst        |   5 +
 libcxx/include/__functional/function.h | 238 ++++---------------------
 2 files changed, 39 insertions(+), 204 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 6cbc0baf29487..2a5b90750eafc 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -99,6 +99,11 @@ Potentially breaking changes
 
 - User-defined specializations of ``std::common_reference`` are diagnosed now. To customize the common reference type, ``std::basic_common_reference`` should be specialized instead.
 
+- ``std::function`` used to have allocator support, which was removed from the Standard by `http://wg21.link/p0302r1`
+  due to issues with its design and inconsistent support from implementations. Previously, libc++ would provide
+  allocator-aware APIs in ``std::function`` in C++11 and C++14, but ignores the allocator argument in all places but
+  one. Starting in this release, the allocator argument is always ignored.
+
 Announcements About Future Releases
 -----------------------------------
 
diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h
index 733f321925a43..e71c778386fd2 100644
--- a/libcxx/include/__functional/function.h
+++ b/libcxx/include/__functional/function.h
@@ -17,13 +17,7 @@
 #include <__functional/binary_function.h>
 #include <__functional/invoke.h>
 #include <__functional/unary_function.h>
-#include <__iterator/iterator_traits.h>
 #include <__memory/addressof.h>
-#include <__memory/allocator.h>
-#include <__memory/allocator_destructor.h>
-#include <__memory/allocator_traits.h>
-#include <__memory/compressed_pair.h>
-#include <__memory/unique_ptr.h>
 #include <__type_traits/aligned_storage.h>
 #include <__type_traits/decay.h>
 #include <__type_traits/is_core_convertible.h>
@@ -34,9 +28,7 @@
 #include <__type_traits/strip_signature.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
-#include <__utility/piecewise_construct.h>
 #include <__utility/swap.h>
-#include <__verbose_abort>
 #include <tuple>
 #include <typeinfo>
 
@@ -133,71 +125,9 @@ _LIBCPP_HIDE_FROM_ABI bool __not_null(_Rp (^__p)(_Args...)) {
 
 namespace __function {
 
-// __alloc_func holds a functor and an allocator.
-
-template <class _Fp, class _Ap, class _FB>
-class __alloc_func;
 template <class _Fp, class _FB>
 class __default_alloc_func;
 
-template <class _Fp, class _Ap, class _Rp, class... _ArgTypes>
-class __alloc_func<_Fp, _Ap, _Rp(_ArgTypes...)> {
-  _LIBCPP_COMPRESSED_PAIR(_Fp, __func_, _Ap, __alloc_);
-
-public:
-  using _Target _LIBCPP_NODEBUG = _Fp;
-  using _Alloc _LIBCPP_NODEBUG  = _Ap;
-
-  _LIBCPP_HIDE_FROM_ABI const _Target& __target() const { return __func_; }
-
-  // WIN32 APIs may define __allocator, so use __get_allocator instead.
-  _LIBCPP_HIDE_FROM_ABI const _Alloc& __get_allocator() const { return __alloc_; }
-
-  _LIBCPP_HIDE_FROM_ABI explicit __alloc_func(_Target&& __f) : __func_(std::move(__f)), __alloc_() {}
-
-  _LIBCPP_HIDE_FROM_ABI explicit __alloc_func(const _Target& __f, const _Alloc& __a) : __func_(__f), __alloc_(__a) {}
-
-  _LIBCPP_HIDE_FROM_ABI explicit __alloc_func(const _Target& __f, _Alloc&& __a)
-      : __func_(__f), __alloc_(std::move(__a)) {}
-
-  _LIBCPP_HIDE_FROM_ABI explicit __alloc_func(_Target&& __f, _Alloc&& __a)
-      : __func_(std::move(__f)), __alloc_(std::move(__a)) {}
-
-  _LIBCPP_HIDE_FROM_ABI _Rp operator()(_ArgTypes&&... __arg) {
-    return std::__invoke_r<_Rp>(__func_, std::forward<_ArgTypes>(__arg)...);
-  }
-
-  _LIBCPP_HIDE_FROM_ABI __alloc_func* __clone() const {
-    typedef allocator_traits<_Alloc> __alloc_traits;
-    typedef __rebind_alloc<__alloc_traits, __alloc_func> _AA;
-    _AA __a(__alloc_);
-    typedef __allocator_destructor<_AA> _Dp;
-    unique_ptr<__alloc_func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1));
-    ::new ((void*)__hold.get()) __alloc_func(__func_, _Alloc(__a));
-    return __hold.release();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI void destroy() _NOEXCEPT {
-    __func_.~_Fp();
-    __alloc_.~_Alloc();
-  }
-
-  _LIBCPP_HIDE_FROM_ABI static void __destroy_and_delete(__alloc_func* __f) {
-    typedef allocator_traits<_Alloc> __alloc_traits;
-    typedef __rebind_alloc<__alloc_traits, __alloc_func> _FunAlloc;
-    _FunAlloc __a(__f->__get_allocator());
-    __f->destroy();
-    __a.deallocate(__f, 1);
-  }
-};
-
-template <class _Tp>
-struct __deallocating_deleter {
-  _LIBCPP_HIDE_FROM_ABI void operator()(void* __p) const {
-    std::__libcpp_deallocate<_Tp>(static_cast<_Tp*>(__p), __element_count(1));
-  }
-};
-
 template <class _Fp, class _Rp, class... _ArgTypes>
 class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> {
   _Fp __f_;
@@ -215,20 +145,9 @@ class __default_alloc_func<_Fp, _Rp(_ArgTypes...)> {
     return std::__invoke_r<_Rp>(__f_, std::forward<_ArgTypes>(__arg)...);
   }
 
-  _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const {
-    using _Self = __default_alloc_func;
-    unique_ptr<_Self, __deallocating_deleter<_Self>> __hold(std::__libcpp_allocate<_Self>(__element_count(1)));
-    _Self* __res = ::new ((void*)__hold.get()) _Self(__f_);
-    (void)__hold.release();
-    return __res;
-  }
+  _LIBCPP_HIDE_FROM_ABI __default_alloc_func* __clone() const { return new __default_alloc_func(__f_); }
 
   _LIBCPP_HIDE_FROM_ABI void destroy() _NOEXCEPT { __f_.~_Target(); }
-
-  _LIBCPP_HIDE_FROM_ABI static void __destroy_and_delete(__default_alloc_func* __f) {
-    __f->destroy();
-    std::__libcpp_deallocate<__default_alloc_func>(__f, __element_count(1));
-  }
 };
 
 // __base provides an abstract interface for copyable functors.
@@ -257,84 +176,38 @@ class __base<_Rp(_ArgTypes...)> {
 
 // __func implements __base for a given functor type.
 
-template <class _FD, class _Alloc, class _FB>
+template <class _FD, class _FB>
 class __func;
 
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-class __func<_Fp, _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> {
-  __alloc_func<_Fp, _Alloc, _Rp(_ArgTypes...)> __f_;
+template <class _Fp, class _Rp, class... _ArgTypes>
+class __func<_Fp, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> {
+  _Fp __func_;
 
 public:
-  _LIBCPP_HIDE_FROM_ABI explicit __func(_Fp&& __f) : __f_(std::move(__f)) {}
-
-  _LIBCPP_HIDE_FROM_ABI explicit __func(const _Fp& __f, const _Alloc& __a) : __f_(__f, __a) {}
+  _LIBCPP_HIDE_FROM_ABI explicit __func(_Fp&& __f) : __func_(std::move(__f)) {}
+  _LIBCPP_HIDE_FROM_ABI explicit __func(const _Fp& __f) : __func_(__f) {}
 
-  _LIBCPP_HIDE_FROM_ABI explicit __func(const _Fp& __f, _Alloc&& __a) : __f_(__f, std::move(__a)) {}
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL __base<_Rp(_ArgTypes...)>* __clone() const override { return new __func(__func_); }
 
-  _LIBCPP_HIDE_FROM_ABI explicit __func(_Fp&& __f, _Alloc&& __a) : __f_(std::move(__f), std::move(__a)) {}
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL void __clone(__base<_Rp(_ArgTypes...)>* __p) const override {
+    ::new ((void*)__p) __func(__func_);
+  }
 
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual __base<_Rp(_ArgTypes...)>* __clone() const;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void __clone(__base<_Rp(_ArgTypes...)>*) const;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy() _NOEXCEPT;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual void destroy_deallocate() _NOEXCEPT;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual _Rp operator()(_ArgTypes&&... __arg);
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL void destroy() _NOEXCEPT override { __func_.~_Fp(); }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL void destroy_deallocate() _NOEXCEPT override { delete this; }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL _Rp operator()(_ArgTypes&&... __arg) override {
+    return std::__invoke_r<_Rp>(__func_, std::forward<_ArgTypes>(__arg)...);
+  }
 #  if _LIBCPP_HAS_RTTI
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual const void* target(const type_info&) const _NOEXCEPT;
-  _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual const std::type_info& target_type() const _NOEXCEPT;
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL const void* target(const type_info& __ti) const _NOEXCEPT override {
+    if (__ti == typeid(_Fp))
+      return std::addressof(__func_);
+    return nullptr;
+  }
+  _LIBCPP_HIDE_FROM_ABI_VIRTUAL const std::type_info& target_type() const _NOEXCEPT override { return typeid(_Fp); }
 #  endif // _LIBCPP_HAS_RTTI
 };
 
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-__base<_Rp(_ArgTypes...)>* __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::__clone() const {
-  typedef allocator_traits<_Alloc> __alloc_traits;
-  typedef __rebind_alloc<__alloc_traits, __func> _Ap;
-  _Ap __a(__f_.__get_allocator());
-  typedef __allocator_destructor<_Ap> _Dp;
-  unique_ptr<__func, _Dp> __hold(__a.allocate(1), _Dp(__a, 1));
-  ::new ((void*)__hold.get()) __func(__f_.__target(), _Alloc(__a));
-  return __hold.release();
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::__clone(__base<_Rp(_ArgTypes...)>* __p) const {
-  ::new ((void*)__p) __func(__f_.__target(), __f_.__get_allocator());
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::destroy() _NOEXCEPT {
-  __f_.destroy();
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-void __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::destroy_deallocate() _NOEXCEPT {
-  typedef allocator_traits<_Alloc> __alloc_traits;
-  typedef __rebind_alloc<__alloc_traits, __func> _Ap;
-  _Ap __a(__f_.__get_allocator());
-  __f_.destroy();
-  __a.deallocate(this, 1);
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-_Rp __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::operator()(_ArgTypes&&... __arg) {
-  return __f_(std::forward<_ArgTypes>(__arg)...);
-}
-
-#  if _LIBCPP_HAS_RTTI
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-const void* __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::target(const type_info& __ti) const _NOEXCEPT {
-  if (__ti == typeid(_Fp))
-    return std::addressof(__f_.__target());
-  return nullptr;
-}
-
-template <class _Fp, class _Alloc, class _Rp, class... _ArgTypes>
-const std::type_info& __func<_Fp, _Alloc, _Rp(_ArgTypes...)>::target_type() const _NOEXCEPT {
-  return typeid(_Fp);
-}
-
-#  endif // _LIBCPP_HAS_RTTI
-
 // __value_func creates a value-type from a __func.
 
 template <class _Fp>
@@ -354,29 +227,19 @@ class __value_func<_Rp(_ArgTypes...)> {
 public:
   _LIBCPP_HIDE_FROM_ABI __value_func() _NOEXCEPT : __f_(nullptr) {}
 
-  template <class _Fp, class _Alloc>
-  _LIBCPP_HIDE_FROM_ABI __value_func(_Fp&& __f, const _Alloc& __a) : __f_(nullptr) {
-    typedef allocator_traits<_Alloc> __alloc_traits;
-    typedef __function::__func<_Fp, _Alloc, _Rp(_ArgTypes...)> _Fun;
-    typedef __rebind_alloc<__alloc_traits, _Fun> _FunAlloc;
+  template <class _Fp, __enable_if_t<!is_same<__decay_t<_Fp>, __value_func>::value, int> = 0>
+  _LIBCPP_HIDE_FROM_ABI explicit __value_func(_Fp&& __f) : __f_(nullptr) {
+    typedef __function::__func<_Fp, _Rp(_ArgTypes...)> _Fun;
 
     if (__function::__not_null(__f)) {
-      _FunAlloc __af(__a);
-      if (sizeof(_Fun) <= sizeof(__buf_) && is_nothrow_copy_constructible<_Fp>::value &&
-          is_nothrow_copy_constructible<_FunAlloc>::value) {
-        __f_ = ::new ((void*)&__buf_) _Fun(std::move(__f), _Alloc(__af));
+      if (sizeof(_Fun) <= sizeof(__buf_) && is_nothrow_copy_constructible<_Fp>::value) {
+        __f_ = ::new (std::addressof(__buf_)) _Fun(std::move(__f));
       } else {
-        typedef __allocator_destructor<_FunAlloc> _Dp;
-        unique_ptr<__func, _Dp> __hold(__af.allocate(1), _Dp(__af, 1));
-        ::new ((void*)__hold.get()) _Fun(std::move(__f), _Alloc(__a));
-        __f_ = __hold.release();
+        __f_ = new _Fun(std::move(__f));
       }
     }
   }
 
-  template <class _Fp, __enable_if_t<!is_same<__decay_t<_Fp>, __value_func>::value, int> = 0>
-  _LIBCPP_HIDE_FROM_ABI explicit __value_func(_Fp&& __f) : __value_func(std::forward<_Fp>(__f), allocator<_Fp>()) {}
-
   _LIBCPP_HIDE_FROM_ABI __value_func(const __value_func& __f) {
     if (__f.__f_ == nullptr)
       __f_ = nullptr;
@@ -544,7 +407,7 @@ struct __policy {
 
   template <typename _Fun>
   _LIBCPP_HIDE_FROM_ABI static void __large_destroy(void* __s) {
-    _Fun::__destroy_and_delete(static_cast<_Fun*>(__s));
+    delete static_cast<_Fun*>(__s);
   }
 
   template <typename _Fun>
@@ -583,7 +446,7 @@ struct __policy {
 template <typename _Tp>
 using __fast_forward _LIBCPP_NODEBUG = __conditional_t<is_scalar<_Tp>::value, _Tp, _Tp&&>;
 
-// __policy_invoker calls an instance of __alloc_func held in __policy_storage.
+// __policy_invoker calls an instance of __default_alloc_func held in __policy_storage.
 
 template <class _Fp>
 struct __policy_invoker;
@@ -641,28 +504,6 @@ class __policy_func<_Rp(_ArgTypes...)> {
 public:
   _LIBCPP_HIDE_FROM_ABI __policy_func() : __policy_(__policy::__create_empty()) {}
 
-  template <class _Fp, class _Alloc>
-  _LIBCPP_HIDE_FROM_ABI __policy_func(_Fp&& __f, const _Alloc& __a) : __policy_(__policy::__create_empty()) {
-    typedef __alloc_func<_Fp, _Alloc, _Rp(_ArgTypes...)> _Fun;
-    typedef allocator_traits<_Alloc> __alloc_traits;
-    typedef __rebind_alloc<__alloc_traits, _Fun> _FunAlloc;
-
-    if (__function::__not_null(__f)) {
-      __invoker_ = __invoker::template __create<_Fun>();
-      __policy_  = __policy::__create<_Fun>();
-
-      _FunAlloc __af(__a);
-      if (__use_small_storage<_Fun>()) {
-        ::new ((void*)&__buf_.__small) _Fun(std::move(__f), _Alloc(__af));
-      } else {
-        typedef __allocator_destructor<_FunAlloc> _Dp;
-        unique_ptr<_Fun, _Dp> __hold(__af.allocate(1), _Dp(__af, 1));
-        ::new ((void*)__hold.get()) _Fun(std::move(__f), _Alloc(__af));
-        __buf_.__large = __hold.release();
-      }
-    }
-  }
-
   template <class _Fp, __enable_if_t<!is_same<__decay_t<_Fp>, __policy_func>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI explicit __policy_func(_Fp&& __f) : __policy_(__policy::__create_empty()) {
     typedef __default_alloc_func<_Fp, _Rp(_ArgTypes...)> _Fun;
@@ -673,9 +514,7 @@ class __policy_func<_Rp(_ArgTypes...)> {
       if (__use_small_storage<_Fun>()) {
         ::new ((void*)&__buf_.__small) _Fun(std::move(__f));
       } else {
-        unique_ptr<_Fun, __deallocating_deleter<_Fun>> __hold(std::__libcpp_allocate<_Fun>(__element_count(1)));
-        __buf_.__large = ::new ((void*)__hold.get()) _Fun(std::move(__f));
-        (void)__hold.release();
+        __buf_.__large = ::new _Fun(std::move(__f));
       }
     }
   }
@@ -750,8 +589,8 @@ class __policy_func<_Rp(_ArgTypes...)> {
 extern "C" void* _Block_copy(const void*);
 extern "C" void _Block_release(const void*);
 
-template <class _Rp1, class... _ArgTypes1, class _Alloc, class _Rp, class... _ArgTypes>
-class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> {
+template <class _Rp1, class... _ArgTypes1, class _Rp, class... _ArgTypes>
+class __func<_Rp1 (^)(_ArgTypes1...), _Rp(_ArgTypes...)> : public __base<_Rp(_ArgTypes...)> {
   typedef _Rp1 (^__block_type)(_ArgTypes1...);
   __block_type __f_;
 
@@ -767,15 +606,6 @@ class __func<_Rp1 (^)(_ArgTypes1...), _Alloc, _Rp(_ArgTypes...)> : public __base
 
   // [TODO] add && to save on a retain
 
-  _LIBCPP_HIDE_FROM_ABI explicit __func(__block_type __f, const _Alloc& /* unused */)
-#    if __has_feature(objc_arc)
-      : __f_(__f)
-#    else
-      : __f_(reinterpret_cast<__block_type>(__f ? _Block_copy(__f) : nullptr))
-#    endif
-  {
-  }
-
   _LIBCPP_HIDE_FROM_ABI_VIRTUAL virtual __base<_Rp(_ArgTypes...)>* __clone() const {
     _LIBCPP_ASSERT_INTERNAL(
         false,
@@ -954,7 +784,7 @@ function<_Rp(_ArgTypes...)>::function(_Fp __f) : __f_(std::move(__f)) {}
 #  if _LIBCPP_STD_VER <= 14
 template <class _Rp, class... _ArgTypes>
 template <class _Fp, class _Alloc, class>
-function<_Rp(_ArgTypes...)>::function(allocator_arg_t, const _Alloc& __a, _Fp __f) : __f_(std::move(__f), __a) {}
+function<_Rp(_ArgTypes...)>::function(allocator_arg_t, const _Alloc&, _Fp __f) : __f_(std::move(__f)) {}
 #  endif
 
 template <class _Rp, class... _ArgTypes>

From 5aed4800f33a72c778f3b49f6389fff099ff4ff6 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim@gymni.ch>
Date: Thu, 12 Jun 2025 14:43:40 +0200
Subject: [PATCH 225/851] [GISel] KnownFPClass ValueTracking fix handling of
 vectors (#143372)

---
 .../CodeGen/GlobalISel/GISelValueTracking.cpp | 57 ++++++-----
 .../CodeGen/GlobalISel/KnownFPClassTest.cpp   | 98 +++++++++++++++++++
 2 files changed, 129 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 6650ad25bed04..1286af864fb3f 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -1046,7 +1046,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
     //
     if ((Known.KnownFPClasses & fcZero) != fcNone &&
         !Known.isKnownNeverSubnormal()) {
-      DenormalMode Mode = MF->getDenormalMode(getFltSemanticForLLT(DstTy));
+      DenormalMode Mode =
+          MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType()));
       if (Mode != DenormalMode::getIEEE())
         Known.KnownFPClasses |= fcZero;
     }
@@ -1108,8 +1109,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
 
     // If the parent function flushes denormals, the canonical output cannot
     // be a denormal.
-    LLT Ty = MRI.getType(Val);
-    const fltSemantics &FPType = getFltSemanticForLLT(Ty.getScalarType());
+    LLT Ty = MRI.getType(Val).getScalarType();
+    const fltSemantics &FPType = getFltSemanticForLLT(Ty);
     DenormalMode DenormMode = MF->getDenormalMode(FPType);
     if (DenormMode == DenormalMode::getIEEE()) {
       if (KnownSrc.isKnownNever(fcPosZero))
@@ -1219,8 +1220,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
     if (KnownSrc.isKnownNeverNaN() && KnownSrc.cannotBeOrderedLessThanZero())
       Known.knownNot(fcNan);
 
-    LLT Ty = MRI.getType(Val);
-    const fltSemantics &FltSem = getFltSemanticForLLT(Ty.getScalarType());
+    LLT Ty = MRI.getType(Val).getScalarType();
+    const fltSemantics &FltSem = getFltSemanticForLLT(Ty);
     DenormalMode Mode = MF->getDenormalMode(FltSem);
 
     if (KnownSrc.isKnownNeverLogicalZero(Mode))
@@ -1338,19 +1339,19 @@ void GISelValueTracking::computeKnownFPClass(Register R,
           Known.knownNot(KnownFPClass::OrderedLessThanZeroMask);
 
         // (fadd x, 0.0) is guaranteed to return +0.0, not -0.0.
-        if ((KnownLHS.isKnownNeverLogicalNegZero(
-                 MF->getDenormalMode(getFltSemanticForLLT(DstTy))) ||
-             KnownRHS.isKnownNeverLogicalNegZero(
-                 MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+        if ((KnownLHS.isKnownNeverLogicalNegZero(MF->getDenormalMode(
+                 getFltSemanticForLLT(DstTy.getScalarType()))) ||
+             KnownRHS.isKnownNeverLogicalNegZero(MF->getDenormalMode(
+                 getFltSemanticForLLT(DstTy.getScalarType())))) &&
             // Make sure output negative denormal can't flush to -0
             outputDenormalIsIEEEOrPosZero(*MF, DstTy))
           Known.knownNot(fcNegZero);
       } else {
         // Only fsub -0, +0 can return -0
-        if ((KnownLHS.isKnownNeverLogicalNegZero(
-                 MF->getDenormalMode(getFltSemanticForLLT(DstTy))) ||
-             KnownRHS.isKnownNeverLogicalPosZero(
-                 MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+        if ((KnownLHS.isKnownNeverLogicalNegZero(MF->getDenormalMode(
+                 getFltSemanticForLLT(DstTy.getScalarType()))) ||
+             KnownRHS.isKnownNeverLogicalPosZero(MF->getDenormalMode(
+                 getFltSemanticForLLT(DstTy.getScalarType())))) &&
             // Make sure output negative denormal can't flush to -0
             outputDenormalIsIEEEOrPosZero(*MF, DstTy))
           Known.knownNot(fcNegZero);
@@ -1396,11 +1397,11 @@ void GISelValueTracking::computeKnownFPClass(Register R,
     }
 
     if ((KnownRHS.isKnownNeverInfinity() ||
-         KnownLHS.isKnownNeverLogicalZero(
-             MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) &&
+         KnownLHS.isKnownNeverLogicalZero(MF->getDenormalMode(
+             getFltSemanticForLLT(DstTy.getScalarType())))) &&
         (KnownLHS.isKnownNeverInfinity() ||
          KnownRHS.isKnownNeverLogicalZero(
-             MF->getDenormalMode(getFltSemanticForLLT(DstTy)))))
+             MF->getDenormalMode(getFltSemanticForLLT(DstTy.getScalarType())))))
       Known.knownNot(fcNan);
 
     break;
@@ -1452,10 +1453,10 @@ void GISelValueTracking::computeKnownFPClass(Register R,
       if (KnownLHS.isKnownNeverNaN() && KnownRHS.isKnownNeverNaN() &&
           (KnownLHS.isKnownNeverInfinity() ||
            KnownRHS.isKnownNeverInfinity()) &&
-          ((KnownLHS.isKnownNeverLogicalZero(
-               MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) ||
-           (KnownRHS.isKnownNeverLogicalZero(
-               MF->getDenormalMode(getFltSemanticForLLT(DstTy)))))) {
+          ((KnownLHS.isKnownNeverLogicalZero(MF->getDenormalMode(
+               getFltSemanticForLLT(DstTy.getScalarType())))) ||
+           (KnownRHS.isKnownNeverLogicalZero(MF->getDenormalMode(
+               getFltSemanticForLLT(DstTy.getScalarType())))))) {
         Known.knownNot(fcNan);
       }
 
@@ -1468,8 +1469,8 @@ void GISelValueTracking::computeKnownFPClass(Register R,
       // Inf REM x and x REM 0 produce NaN.
       if (KnownLHS.isKnownNeverNaN() && KnownRHS.isKnownNeverNaN() &&
           KnownLHS.isKnownNeverInfinity() &&
-          KnownRHS.isKnownNeverLogicalZero(
-              MF->getDenormalMode(getFltSemanticForLLT(DstTy)))) {
+          KnownRHS.isKnownNeverLogicalZero(MF->getDenormalMode(
+              getFltSemanticForLLT(DstTy.getScalarType())))) {
         Known.knownNot(fcNan);
       }
 
@@ -1494,10 +1495,10 @@ void GISelValueTracking::computeKnownFPClass(Register R,
     // Infinity, nan and zero propagate from source.
     computeKnownFPClass(R, DemandedElts, InterestedClasses, Known, Depth + 1);
 
-    LLT DstTy = MRI.getType(Dst);
-    const fltSemantics &DstSem = getFltSemanticForLLT(DstTy.getScalarType());
-    LLT SrcTy = MRI.getType(Src);
-    const fltSemantics &SrcSem = getFltSemanticForLLT(SrcTy.getScalarType());
+    LLT DstTy = MRI.getType(Dst).getScalarType();
+    const fltSemantics &DstSem = getFltSemanticForLLT(DstTy);
+    LLT SrcTy = MRI.getType(Src).getScalarType();
+    const fltSemantics &SrcSem = getFltSemanticForLLT(SrcTy);
 
     // All subnormal inputs should be in the normal range in the result type.
     if (APFloat::isRepresentableAsNormalIn(SrcSem, DstSem)) {
@@ -1690,6 +1691,10 @@ void GISelValueTracking::computeKnownFPClass(Register R,
   }
   case TargetOpcode::COPY: {
     Register Src = MI.getOperand(1).getReg();
+
+    if (!Src.isVirtual())
+      return;
+
     computeKnownFPClass(Src, DemandedElts, InterestedClasses, Known, Depth + 1);
     break;
   }
diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownFPClassTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownFPClassTest.cpp
index 6ee571804e69f..040f0cfc92076 100644
--- a/llvm/unittests/CodeGen/GlobalISel/KnownFPClassTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/KnownFPClassTest.cpp
@@ -96,6 +96,104 @@ TEST_F(AArch64GISelMITest, TestFPClassCstVecNegZero) {
   EXPECT_EQ(true, Known.SignBit);
 }
 
+TEST_F(AArch64GISelMITest, TestFPClassCstZeroFPExt) {
+  StringRef MIRString = R"(
+   %c0:_(s32) = G_FCONSTANT float 0.0
+   %ext:_(s64) = nnan ninf G_FPEXT %c0
+   %copy_vector:_(s64) = COPY %ext
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  Register CopyReg = Copies[Copies.size() - 1];
+  MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg);
+  Register SrcReg = FinalCopy->getOperand(1).getReg();
+
+  GISelValueTracking Info(*MF);
+
+  KnownFPClass Known = Info.computeKnownFPClass(SrcReg);
+
+  EXPECT_EQ(fcZero | fcNormal, Known.KnownFPClasses);
+  EXPECT_EQ(std::nullopt, Known.SignBit);
+}
+
+TEST_F(AArch64GISelMITest, TestFPClassCstVecZeroFPExt) {
+  StringRef MIRString = R"(
+   %c0:_(s32) = G_FCONSTANT float 0.0
+   %c1:_(s32) = G_FCONSTANT float 0.0
+   %c2:_(s32) = G_FCONSTANT float 0.0
+   %vector:_(<3 x s32>) = G_BUILD_VECTOR %c0, %c1, %c2
+   %ext:_(<3 x s64>) = nnan ninf G_FPEXT %vector
+   %copy_vector:_(<3 x s64>) = COPY %ext
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  Register CopyReg = Copies[Copies.size() - 1];
+  MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg);
+  Register SrcReg = FinalCopy->getOperand(1).getReg();
+
+  GISelValueTracking Info(*MF);
+
+  KnownFPClass Known = Info.computeKnownFPClass(SrcReg);
+
+  EXPECT_EQ(fcZero | fcNormal, Known.KnownFPClasses);
+  EXPECT_EQ(std::nullopt, Known.SignBit);
+}
+
+TEST_F(AArch64GISelMITest, TestFPClassCstZeroFPTrunc) {
+  StringRef MIRString = R"(
+   %c0:_(s64) = G_FCONSTANT double 0.0
+   %trunc:_(s32) = nnan ninf G_FPTRUNC %c0
+   %copy_vector:_(s32) = COPY %trunc
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  Register CopyReg = Copies[Copies.size() - 1];
+  MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg);
+  Register SrcReg = FinalCopy->getOperand(1).getReg();
+
+  GISelValueTracking Info(*MF);
+
+  KnownFPClass Known = Info.computeKnownFPClass(SrcReg);
+
+  EXPECT_EQ(fcZero | fcPosSubnormal | fcPosNormal, Known.KnownFPClasses);
+  EXPECT_EQ(false, Known.SignBit);
+}
+
+TEST_F(AArch64GISelMITest, TestFPClassCstVecZeroFPTrunc) {
+  StringRef MIRString = R"(
+   %c0:_(s64) = G_FCONSTANT double 0.0
+   %c1:_(s64) = G_FCONSTANT double 0.0
+   %c2:_(s64) = G_FCONSTANT double 0.0
+   %vector:_(<3 x s64>) = G_BUILD_VECTOR %c0, %c1, %c2
+   %trunc:_(<3 x s32>) = nnan ninf G_FPTRUNC %vector
+   %copy_vector:_(<3 x s32>) = COPY %trunc
+)";
+
+  setUp(MIRString);
+  if (!TM)
+    GTEST_SKIP();
+
+  Register CopyReg = Copies[Copies.size() - 1];
+  MachineInstr *FinalCopy = MRI->getVRegDef(CopyReg);
+  Register SrcReg = FinalCopy->getOperand(1).getReg();
+
+  GISelValueTracking Info(*MF);
+
+  KnownFPClass Known = Info.computeKnownFPClass(SrcReg);
+
+  EXPECT_EQ(fcZero | fcPosSubnormal | fcPosNormal, Known.KnownFPClasses);
+  EXPECT_EQ(false, Known.SignBit);
+}
+
 TEST_F(AArch64GISelMITest, TestFPClassSelectPos0) {
   StringRef MIRString = R"(
     %ptr:_(p0) = G_IMPLICIT_DEF

From 41c8df147b83026db8612ad2ca07fc0f007e3448 Mon Sep 17 00:00:00 2001
From: woruyu <99597449+woruyu@users.noreply.github.com>
Date: Thu, 12 Jun 2025 20:46:07 +0800
Subject: [PATCH 226/851] [DAG] Convert foldMaskedMerge to SDPatternMatch to
 match (m & x) | (~m & y) (#143855)

This PR resolves https://github.com/llvm/llvm-project/issues/143363

Remove foldMaskedMergeImpl entirely to use SDPatternMatch
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 45 ++++---------------
 1 file changed, 9 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e79a17e86bc87..5d62ded171f4f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8128,24 +8128,6 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
-static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1,
-                                   SDValue AndR1, const SDLoc &DL,
-                                   SelectionDAG &DAG) {
-  if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse())
-    return SDValue();
-  SDValue NotOp = AndL0->getOperand(0);
-  if (NotOp == AndR1)
-    std::swap(AndR1, AndL1);
-  if (NotOp != AndL1)
-    return SDValue();
-
-  EVT VT = AndL1.getValueType();
-  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0);
-  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
-  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0);
-  return Xor1;
-}
-
 /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
 /// equivalent `((x ^ y) & m) ^ y)` pattern.
 /// This is typically a better representation for targets without a fused
@@ -8155,29 +8137,20 @@ static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
   // Note that masked-merge variants using XOR or ADD expressions are
   // normalized to OR by InstCombine so we only check for OR.
   assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-  SDValue N0 = Node->getOperand(0);
-  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
-    return SDValue();
-  SDValue N1 = Node->getOperand(1);
-  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
-    return SDValue();
 
   // If the target supports and-not, don't fold this.
   if (TLI.hasAndNot(SDValue(Node, 0)))
     return SDValue();
 
-  SDValue N00 = N0->getOperand(0);
-  SDValue N01 = N0->getOperand(1);
-  SDValue N10 = N1->getOperand(0);
-  SDValue N11 = N1->getOperand(1);
-  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
-    return Result;
+  SDValue M, X, Y;
+  if (sd_match(Node,
+               m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))),
+                    m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) {
+    EVT VT = M.getValueType();
+    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y);
+    SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M);
+    return DAG.getNode(ISD::XOR, DL, VT, And, Y);
+  }
   return SDValue();
 }
 

From 36ac72f4e3e4752f85c16363d630f4cfbd682e48 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 12 Jun 2025 12:51:11 +0000
Subject: [PATCH 227/851] [llvm][MemProf] Fix unused variable warning in
 release build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

g++-13 warned that:
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp:1645:8: warning: variable ‘PrevIterCreatedNode’ set but not used [-Wunused-but-set-variable]
 1645 |   bool PrevIterCreatedNode = false;
      |        ^~~~~~~~~~~~~~~~~~~

When asserts were not enabled.
---
 llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index f28fe51fb6a5a..10120dd0e10c1 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1642,7 +1642,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
   // this entry.
   DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
 
-  bool PrevIterCreatedNode = false;
+  [[maybe_unused]] bool PrevIterCreatedNode = false;
   bool CreatedNode = false;
   for (unsigned I = 0; I < Calls.size();
        I++, PrevIterCreatedNode = CreatedNode) {

From a08a831515919bcc384b453799f33bc97860c73b Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Thu, 12 Jun 2025 15:06:27 +0200
Subject: [PATCH 228/851] [DLCov][NFC] Propagate annotated DebugLocs through
 transformations (#138047)

Part of the coverage-tracking feature, following #107279.

In order for DebugLoc coverage testing to work, we firstly have to set
annotations for intentionally-empty DebugLocs, and secondly we have to
ensure that we do not drop these annotations as we propagate DebugLocs
throughout compilation. As the annotations exist as part of the DebugLoc
class, and not the underlying DILocation, they will not survive a
DebugLoc->DILocation->DebugLoc roundtrip. Therefore this patch modifies
a number of places in the compiler to propagate DebugLocs directly
rather than via the underlying DILocation. This has no effect on the
output of normal builds; it only ensures that during coverage builds, we
do not drop incorrectly annotations and therefore create false
positives.

The bulk of these changes are in replacing
DILocation::getMergedLocation(s) with a DebugLoc equivalent, and in
changing the IRBuilder to store a DebugLoc directly rather than storing
DILocations in its general Metadata array. We also use a new function,
`DebugLoc::orElse`, which selects the "best" DebugLoc out of a pair
(valid location > annotated > empty), preferring the current DebugLoc on
a tie - this encapsulates the existing behaviour at a few sites where we
_may_ assign a DebugLoc to an existing instruction, while extending the
logic to handle annotation DebugLocs at the same time.
---
 .../GlobalISel/LegalizationArtifactCombiner.h |  4 +-
 llvm/include/llvm/IR/DebugInfoMetadata.h      | 24 +++++-----
 llvm/include/llvm/IR/DebugLoc.h               | 45 +++++++++++++++++++
 llvm/include/llvm/IR/IRBuilder.h              | 22 +++++++--
 llvm/include/llvm/IR/Instruction.h            |  2 +-
 llvm/lib/CodeGen/BranchFolding.cpp            |  2 +-
 llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp |  5 +--
 llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp  |  2 +-
 llvm/lib/CodeGen/MachineBasicBlock.cpp        |  2 +-
 llvm/lib/CodeGen/MachineSink.cpp              |  4 +-
 llvm/lib/IR/DebugInfo.cpp                     |  4 +-
 llvm/lib/IR/DebugLoc.cpp                      | 21 +++++++++
 llvm/lib/IR/IRBuilder.cpp                     | 17 +++----
 llvm/lib/IR/Instruction.cpp                   |  5 ++-
 .../Target/BPF/BPFPreserveStaticOffset.cpp    | 11 +++--
 .../InstCombineLoadStoreAlloca.cpp            |  4 +-
 .../InstCombine/InstructionCombining.cpp      |  3 +-
 .../Transforms/Scalar/ConstantHoisting.cpp    |  2 +-
 llvm/lib/Transforms/Scalar/LICM.cpp           |  8 ++--
 .../lib/Transforms/Scalar/SimplifyCFGPass.cpp |  4 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |  6 +--
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  2 +-
 22 files changed, 133 insertions(+), 66 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 3712a7fa06d9a..22f6a5fde546a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -100,8 +100,8 @@ class LegalizationArtifactCombiner {
       const LLT DstTy = MRI.getType(DstReg);
       if (isInstLegal({TargetOpcode::G_CONSTANT, {DstTy}})) {
         auto &CstVal = SrcMI->getOperand(1);
-        auto *MergedLocation = DILocation::getMergedLocation(
-            MI.getDebugLoc().get(), SrcMI->getDebugLoc().get());
+        auto MergedLocation =
+            DebugLoc::getMergedLocation(MI.getDebugLoc(), SrcMI->getDebugLoc());
         // Set the debug location to the merged location of the SrcMI and the MI
         // if the aext fold is successful.
         Builder.setDebugLoc(MergedLocation);
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h
index 02f0a9f677db3..18228b7757897 100644
--- a/llvm/include/llvm/IR/DebugInfoMetadata.h
+++ b/llvm/include/llvm/IR/DebugInfoMetadata.h
@@ -2437,25 +2437,21 @@ class DILocation : public MDNode {
   inline std::optional<const DILocation *>
   cloneByMultiplyingDuplicationFactor(unsigned DF) const;
 
-  /// When two instructions are combined into a single instruction we also
-  /// need to combine the original locations into a single location.
-  /// When the locations are the same we can use either location.
-  /// When they differ, we need a third location which is distinct from either.
-  /// If they share a common scope, use this scope and compare the line/column
-  /// pair of the locations with the common scope:
-  /// * if both match, keep the line and column;
-  /// * if only the line number matches, keep the line and set the column as 0;
-  /// * otherwise set line and column as 0.
-  /// If they do not share a common scope the location is ambiguous and can't be
-  /// represented in a line entry. In this case, set line and column as 0 and
-  /// use the scope of any location.
-  ///
-  /// \p LocA \p LocB: The locations to be merged.
+  /// Attempts to merge \p LocA and \p LocB into a single location; see
+  /// DebugLoc::getMergedLocation for more details.
+  /// NB: When merging the locations of instructions, prefer to use
+  /// DebugLoc::getMergedLocation(), as an instruction's DebugLoc may contain
+  /// additional metadata that will not be preserved when merging the unwrapped
+  /// DILocations.
   LLVM_ABI static DILocation *getMergedLocation(DILocation *LocA,
                                                 DILocation *LocB);
 
   /// Try to combine the vector of locations passed as input in a single one.
   /// This function applies getMergedLocation() repeatedly left-to-right.
+  /// NB: When merging the locations of instructions, prefer to use
+  /// DebugLoc::getMergedLocations(), as an instruction's DebugLoc may contain
+  /// additional metadata that will not be preserved when merging the unwrapped
+  /// DILocations.
   ///
   /// \p Locs: The locations to be merged.
   LLVM_ABI static DILocation *getMergedLocations(ArrayRef<DILocation *> Locs);
diff --git a/llvm/include/llvm/IR/DebugLoc.h b/llvm/include/llvm/IR/DebugLoc.h
index c3d0fb80354a4..2fabae9bfc66e 100644
--- a/llvm/include/llvm/IR/DebugLoc.h
+++ b/llvm/include/llvm/IR/DebugLoc.h
@@ -142,6 +142,51 @@ namespace llvm {
     static inline DebugLoc getDropped() { return DebugLoc(); }
 #endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 
+    /// When two instructions are combined into a single instruction we also
+    /// need to combine the original locations into a single location.
+    /// When the locations are the same we can use either location.
+    /// When they differ, we need a third location which is distinct from
+    /// either. If they share a common scope, use this scope and compare the
+    /// line/column pair of the locations with the common scope:
+    /// * if both match, keep the line and column;
+    /// * if only the line number matches, keep the line and set the column as
+    /// 0;
+    /// * otherwise set line and column as 0.
+    /// If they do not share a common scope the location is ambiguous and can't
+    /// be represented in a line entry. In this case, set line and column as 0
+    /// and use the scope of any location.
+    ///
+    /// \p LocA \p LocB: The locations to be merged.
+    LLVM_ABI static DebugLoc getMergedLocation(DebugLoc LocA, DebugLoc LocB);
+
+    /// Try to combine the vector of locations passed as input in a single one.
+    /// This function applies getMergedLocation() repeatedly left-to-right.
+    ///
+    /// \p Locs: The locations to be merged.
+    LLVM_ABI static DebugLoc getMergedLocations(ArrayRef<DebugLoc> Locs);
+
+    /// If this DebugLoc is non-empty, returns this DebugLoc; otherwise, selects
+    /// \p Other.
+    /// In coverage-tracking builds, this also accounts for whether this or
+    /// \p Other have an annotative DebugLocKind applied, such that if both are
+    /// empty but exactly one has an annotation, we prefer that annotated
+    /// location.
+    DebugLoc orElse(DebugLoc Other) const {
+      if (*this)
+        return *this;
+#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+      if (Other)
+        return Other;
+      if (getKind() != DebugLocKind::Normal)
+        return *this;
+      if (Other.getKind() != DebugLocKind::Normal)
+        return Other;
+      return *this;
+#else
+      return Other;
+#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+    }
+
     /// Get the underlying \a DILocation.
     ///
     /// \pre !*this or \c isa<DILocation>(getAsMDNode()).
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index a0cc20d34303a..59295089d6e91 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -113,12 +113,19 @@ class FMFSource {
 /// Common base class shared among various IRBuilders.
 class IRBuilderBase {
   /// Pairs of (metadata kind, MDNode *) that should be added to all newly
-  /// created instructions, like !dbg metadata.
+  /// created instructions, excluding !dbg metadata, which is stored in the
+  /// StoredDL field.
   SmallVector<std::pair<unsigned, MDNode *>, 2> MetadataToCopy;
+  /// The DebugLoc that will be applied to instructions inserted by this
+  /// builder.
+  DebugLoc StoredDL;
 
   /// Add or update the an entry (Kind, MD) to MetadataToCopy, if \p MD is not
   /// null. If \p MD is null, remove the entry with \p Kind.
   void AddOrRemoveMetadataToCopy(unsigned Kind, MDNode *MD) {
+    assert(Kind != LLVMContext::MD_dbg &&
+           "MD_dbg metadata must be stored in StoredDL");
+
     if (!MD) {
       erase_if(MetadataToCopy, [Kind](const std::pair<unsigned, MDNode *> &KV) {
         return KV.first == Kind;
@@ -238,7 +245,9 @@ class IRBuilderBase {
 
   /// Set location information used by debugging information.
   void SetCurrentDebugLocation(DebugLoc L) {
-    AddOrRemoveMetadataToCopy(LLVMContext::MD_dbg, L.getAsMDNode());
+    // For !dbg metadata attachments, we use DebugLoc instead of the raw MDNode
+    // to include optional introspection data for use in Debugify.
+    StoredDL = std::move(L);
   }
 
   /// Set nosanitize metadata.
@@ -252,8 +261,12 @@ class IRBuilderBase {
   /// not on \p Src will be dropped from MetadataToCopy.
   void CollectMetadataToCopy(Instruction *Src,
                              ArrayRef<unsigned> MetadataKinds) {
-    for (unsigned K : MetadataKinds)
-      AddOrRemoveMetadataToCopy(K, Src->getMetadata(K));
+    for (unsigned K : MetadataKinds) {
+      if (K == LLVMContext::MD_dbg)
+        SetCurrentDebugLocation(Src->getDebugLoc());
+      else
+        AddOrRemoveMetadataToCopy(K, Src->getMetadata(K));
+    }
   }
 
   /// Get location information used by debugging information.
@@ -267,6 +280,7 @@ class IRBuilderBase {
   void AddMetadataToInst(Instruction *I) const {
     for (const auto &KV : MetadataToCopy)
       I->setMetadata(KV.first, KV.second);
+    SetInstDebugLocation(I);
   }
 
   /// Get the return type of the current function that we're emitting
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 10fc9c1298607..8e1ef24226789 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -698,7 +698,7 @@ class Instruction : public User,
   ///     applications, thus the N-way merging should be in code path.
   /// The DebugLoc attached to this instruction will be overwritten by the
   /// merged DebugLoc.
-  LLVM_ABI void applyMergedLocation(DILocation *LocA, DILocation *LocB);
+  LLVM_ABI void applyMergedLocation(DebugLoc LocA, DebugLoc LocB);
 
   /// Updates the debug location given that the instruction has been hoisted
   /// from a block to a predecessor of that block.
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index e0f7466ceacff..ff9f0ff5d5bc3 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -862,7 +862,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
             "Reached BB end within common tail");
       }
       assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!");
-      DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc());
+      DL = DebugLoc::getMergedLocation(DL, Pos->getDebugLoc());
       NextCommonInsts[i] = ++Pos;
     }
     MI.setDebugLoc(DL);
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 10c72641ce2df..e3e6c72165ebb 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -53,8 +53,7 @@ CSEMIRBuilder::getDominatingInstrForID(FoldingSetNodeID &ID,
     } else if (!dominates(MI, CurrPos)) {
       // Update the spliced machineinstr's debug location by merging it with the
       // debug location of the instruction at the insertion point.
-      auto *Loc = DILocation::getMergedLocation(getDebugLoc().get(),
-                                                MI->getDebugLoc().get());
+      auto Loc = DebugLoc::getMergedLocation(getDebugLoc(), MI->getDebugLoc());
       MI->setDebugLoc(Loc);
       CurMBB->splice(CurrPos, CurMBB, MI);
     }
@@ -170,7 +169,7 @@ CSEMIRBuilder::generateCopiesIfRequired(ArrayRef<DstOp> DstOps,
     if (Observer)
       Observer->changingInstr(*MIB);
     MIB->setDebugLoc(
-        DILocation::getMergedLocation(MIB->getDebugLoc(), getDebugLoc()));
+        DebugLoc::getMergedLocation(MIB->getDebugLoc(), getDebugLoc()));
     if (Observer)
       Observer->changedInstr(*MIB);
   }
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index 78cd9bc7891e0..f68420ed66e4b 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -370,7 +370,7 @@ bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) {
   // For each store, compute pairwise merged debug locs.
   DebugLoc MergedLoc = Stores.front()->getDebugLoc();
   for (auto *Store : drop_begin(Stores))
-    MergedLoc = DILocation::getMergedLocation(MergedLoc, Store->getDebugLoc());
+    MergedLoc = DebugLoc::getMergedLocation(MergedLoc, Store->getDebugLoc());
 
   Builder.setInstr(*Stores.back());
   Builder.setDebugLoc(MergedLoc);
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index ccc164a0881e9..48b406e016c05 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1574,7 +1574,7 @@ MachineBasicBlock::findBranchDebugLoc() {
     DL = TI->getDebugLoc();
     for (++TI ; TI != end() ; ++TI)
       if (TI->isBranch())
-        DL = DILocation::getMergedLocation(DL, TI->getDebugLoc());
+        DL = DebugLoc::getMergedLocation(DL, TI->getDebugLoc());
   }
   return DL;
 }
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index e3f6eda8ff065..8411d5c4b09c8 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -1611,8 +1611,8 @@ static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
   // location to prevent debug-info driven tools from potentially reporting
   // wrong location information.
   if (!SuccToSinkTo.empty() && InsertPos != SuccToSinkTo.end())
-    MI.setDebugLoc(DILocation::getMergedLocation(MI.getDebugLoc(),
-                                                 InsertPos->getDebugLoc()));
+    MI.setDebugLoc(DebugLoc::getMergedLocation(MI.getDebugLoc(),
+                                               InsertPos->getDebugLoc()));
   else
     MI.setDebugLoc(DebugLoc());
 
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 2a84e7bae0f10..9527c3e0b5d67 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -960,8 +960,8 @@ unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
   return 0;
 }
 
-void Instruction::applyMergedLocation(DILocation *LocA, DILocation *LocB) {
-  setDebugLoc(DILocation::getMergedLocation(LocA, LocB));
+void Instruction::applyMergedLocation(DebugLoc LocA, DebugLoc LocB) {
+  setDebugLoc(DebugLoc::getMergedLocation(LocA, LocB));
 }
 
 void Instruction::mergeDIAssignID(
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 0e65ddcec8934..0be6d55d724e0 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -143,6 +143,27 @@ DebugLoc DebugLoc::appendInlinedAt(const DebugLoc &DL, DILocation *InlinedAt,
   return Last;
 }
 
+DebugLoc DebugLoc::getMergedLocations(ArrayRef<DebugLoc> Locs) {
+  if (Locs.empty())
+    return DebugLoc();
+  if (Locs.size() == 1)
+    return Locs[0];
+  DebugLoc Merged = Locs[0];
+  for (const DebugLoc &DL : llvm::drop_begin(Locs)) {
+    Merged = getMergedLocation(Merged, DL);
+    if (!Merged)
+      break;
+  }
+  return Merged;
+}
+DebugLoc DebugLoc::getMergedLocation(DebugLoc LocA, DebugLoc LocB) {
+  if (!LocA)
+    return LocA;
+  if (!LocB)
+    return LocB;
+  return DILocation::getMergedLocation(LocA, LocB);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void DebugLoc::dump() const { print(dbgs()); }
 #endif
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 59623b4295bb1..a33ef9c7d4a17 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -61,19 +61,12 @@ Type *IRBuilderBase::getCurrentFunctionReturnType() const {
   return BB->getParent()->getReturnType();
 }
 
-DebugLoc IRBuilderBase::getCurrentDebugLocation() const {
-  for (auto &KV : MetadataToCopy)
-    if (KV.first == LLVMContext::MD_dbg)
-      return {cast<DILocation>(KV.second)};
-
-  return {};
-}
+DebugLoc IRBuilderBase::getCurrentDebugLocation() const { return StoredDL; }
 void IRBuilderBase::SetInstDebugLocation(Instruction *I) const {
-  for (const auto &KV : MetadataToCopy)
-    if (KV.first == LLVMContext::MD_dbg) {
-      I->setDebugLoc(DebugLoc(KV.second));
-      return;
-    }
+  // We prefer to set our current debug location if any has been set, but if
+  // our debug location is empty and I has a valid location, we shouldn't
+  // overwrite it.
+  I->setDebugLoc(StoredDL.orElse(I->getDebugLoc()));
 }
 
 Value *IRBuilderBase::CreateAggregateCast(Value *V, Type *DestTy) {
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 1b60caab6c11a..cbf39b8adf1b2 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1354,6 +1354,9 @@ void Instruction::swapProfMetadata() {
 
 void Instruction::copyMetadata(const Instruction &SrcInst,
                                ArrayRef<unsigned> WL) {
+  if (WL.empty() || is_contained(WL, LLVMContext::MD_dbg))
+    setDebugLoc(SrcInst.getDebugLoc().orElse(getDebugLoc()));
+
   if (!SrcInst.hasMetadata())
     return;
 
@@ -1367,8 +1370,6 @@ void Instruction::copyMetadata(const Instruction &SrcInst,
     if (WL.empty() || WLS.count(MD.first))
       setMetadata(MD.first, MD.second);
   }
-  if (WL.empty() || WLS.count(LLVMContext::MD_dbg))
-    setDebugLoc(SrcInst.getDebugLoc());
 }
 
 Instruction *Instruction::clone() const {
diff --git a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
index 77bbeab3c2790..222eb19e3eeef 100644
--- a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp
@@ -150,10 +150,10 @@ static CallInst *isGEPAndStore(Value *I) {
 }
 
 template <class T = Instruction>
-static DILocation *mergeDILocations(SmallVector<T *> &Insns) {
-  DILocation *Merged = (*Insns.begin())->getDebugLoc();
+static DebugLoc mergeDebugLocs(SmallVector<T *> &Insns) {
+  DebugLoc Merged = (*Insns.begin())->getDebugLoc();
   for (T *I : Insns)
-    Merged = DILocation::getMergedLocation(Merged, I->getDebugLoc());
+    Merged = DebugLoc::getMergedLocation(Merged, I->getDebugLoc());
   return Merged;
 }
 
@@ -227,7 +227,7 @@ static Instruction *makeGEPAndLoad(Module *M, GEPChainInfo &GEP,
   CallInst *Call = makeIntrinsicCall(M, Intrinsic::bpf_getelementptr_and_load,
                                      {Load->getType()}, Args);
   setParamElementType(Call, 0, GEP.SourceElementType);
-  Call->applyMergedLocation(mergeDILocations(GEP.Members), Load->getDebugLoc());
+  Call->applyMergedLocation(mergeDebugLocs(GEP.Members), Load->getDebugLoc());
   Call->setName((*GEP.Members.rbegin())->getName());
   if (Load->isUnordered()) {
     Call->setOnlyReadsMemory();
@@ -251,8 +251,7 @@ static Instruction *makeGEPAndStore(Module *M, GEPChainInfo &GEP,
   setParamElementType(Call, 1, GEP.SourceElementType);
   if (Store->getValueOperand()->getType()->isPointerTy())
     setParamReadNone(Call, 0);
-  Call->applyMergedLocation(mergeDILocations(GEP.Members),
-                            Store->getDebugLoc());
+  Call->applyMergedLocation(mergeDebugLocs(GEP.Members), Store->getDebugLoc());
   if (Store->isUnordered()) {
     Call->setOnlyWritesMemory();
     Call->setOnlyAccessesArgMemory();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 324e6022f3f05..1d208de75db3b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1581,8 +1581,8 @@ bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
   // Insert a PHI node now if we need it.
   Value *MergedVal = OtherStore->getValueOperand();
   // The debug locations of the original instructions might differ. Merge them.
-  DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
-                                                     OtherStore->getDebugLoc());
+  DebugLoc MergedLoc =
+      DebugLoc::getMergedLocation(SI.getDebugLoc(), OtherStore->getDebugLoc());
   if (MergedVal != SI.getValueOperand()) {
     PHINode *PN =
         PHINode::Create(SI.getValueOperand()->getType(), 2, "storemerge");
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index e261807bbc035..dc2a8cb0115e7 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5340,8 +5340,7 @@ bool InstCombinerImpl::run() {
         // We copy the old instruction's DebugLoc to the new instruction, unless
         // InstCombine already assigned a DebugLoc to it, in which case we
         // should trust the more specifically selected DebugLoc.
-        if (!Result->getDebugLoc())
-          Result->setDebugLoc(I->getDebugLoc());
+        Result->setDebugLoc(Result->getDebugLoc().orElse(I->getDebugLoc()));
         // We also copy annotation metadata to the new instruction.
         Result->copyMetadata(*I, LLVMContext::MD_annotation);
         // Everything uses the new instruction now.
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 07bc623c3dea0..839f5933e09b0 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -883,7 +883,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
         emitBaseConstants(Base, &R);
         ReBasesNum++;
         // Use the same debug location as the last user of the constant.
-        Base->setDebugLoc(DILocation::getMergedLocation(
+        Base->setDebugLoc(DebugLoc::getMergedLocation(
             Base->getDebugLoc(), R.User.Inst->getDebugLoc()));
       }
       assert(!Base->use_empty() && "The use list is empty!?");
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 3024ccb330b1a..bd59caa6a959a 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2224,10 +2224,10 @@ bool llvm::promoteLoopAccessesToScalars(
   });
 
   // Look at all the loop uses, and try to merge their locations.
-  std::vector<DILocation *> LoopUsesLocs;
-  for (auto *U : LoopUses)
-    LoopUsesLocs.push_back(U->getDebugLoc().get());
-  auto DL = DebugLoc(DILocation::getMergedLocations(LoopUsesLocs));
+  std::vector<DebugLoc> LoopUsesLocs;
+  for (auto U : LoopUses)
+    LoopUsesLocs.push_back(U->getDebugLoc());
+  auto DL = DebugLoc::getMergedLocations(LoopUsesLocs);
 
   // We use the SSAUpdater interface to insert phi nodes as required.
   SmallVector<PHINode *, 16> NewPHIs;
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 4e437e9abeb43..d20378ece4eea 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -128,7 +128,7 @@ performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs,
 
   // Now, go through each block (with the current terminator type)
   // we've recorded, and rewrite it to branch to the new common block.
-  DILocation *CommonDebugLoc = nullptr;
+  DebugLoc CommonDebugLoc;
   for (BasicBlock *BB : BBs) {
     auto *Term = BB->getTerminator();
     assert(Term->getOpcode() == CanonicalTerm->getOpcode() &&
@@ -145,7 +145,7 @@ performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs,
       CommonDebugLoc = Term->getDebugLoc();
     else
       CommonDebugLoc =
-          DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc());
+          DebugLoc::getMergedLocation(CommonDebugLoc, Term->getDebugLoc());
 
     // And turn BB into a block that just unconditionally branches
     // to the canonical block.
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f67a6414ca316..0980f0e57aa6d 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2095,11 +2095,11 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
 
   // Ensure terminator gets a debug location, even an unknown one, in case
   // it involves inlinable calls.
-  SmallVector<DILocation *, 4> Locs;
+  SmallVector<DebugLoc, 4> Locs;
   Locs.push_back(I1->getDebugLoc());
   for (auto *OtherSuccTI : OtherSuccTIs)
     Locs.push_back(OtherSuccTI->getDebugLoc());
-  NT->setDebugLoc(DILocation::getMergedLocations(Locs));
+  NT->setDebugLoc(DebugLoc::getMergedLocations(Locs));
 
   // PHIs created below will adopt NT's merged DebugLoc.
   IRBuilder<NoFolder> Builder(NT);
@@ -2896,7 +2896,7 @@ static void mergeCompatibleInvokesImpl(ArrayRef<InvokeInst *> Invokes,
       MergedDebugLoc = II->getDebugLoc();
     else
       MergedDebugLoc =
-          DILocation::getMergedLocation(MergedDebugLoc, II->getDebugLoc());
+          DebugLoc::getMergedLocation(MergedDebugLoc, II->getDebugLoc());
 
     // And replace the old `invoke` with an unconditionally branch
     // to the block with the merged `invoke`.
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1838562f26b82..b74ef91f26e70 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -395,7 +395,7 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
       LLVM_DEBUG(dbgs() << "Failed to create new discriminator: "
                         << DIL->getFilename() << " Line: " << DIL->getLine());
   } else
-    Builder.SetCurrentDebugLocation(DIL);
+    Builder.SetCurrentDebugLocation(DL);
 }
 
 void VPTransformState::packScalarIntoVectorizedValue(const VPValue *Def,

From ce747a16328b2fbc365e1cb1cb01cb400c2c1b4c Mon Sep 17 00:00:00 2001
From: Mel Chen <mel.chen@sifive.com>
Date: Thu, 12 Jun 2025 21:06:58 +0800
Subject: [PATCH 229/851] [LV] Pre-commit test case for support
 VPWidenCastRecipe in isSingleScalar. nfc (#143498)

---
 .../LoopVectorize/single-scalar-cast-minbw.ll | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll

diff --git a/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
new file mode 100644
index 0000000000000..b8da9ac84a808
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single-scalar-cast-minbw.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
+
+define void @minbw_cast(ptr %dst, i64 %n, i1 %bool1, i1 %bool2) {
+; CHECK-LABEL: define void @minbw_cast(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[BOOL1:%.*]], i1 [[BOOL2:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[BOOL1_EXT:%.*]] = zext i1 [[BOOL1]] to i32
+; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1)
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BOOL2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[BOOL1_EXT]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc <4 x i32> [[BROADCAST_SPLAT2]] to <4 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i1> [[BROADCAST_SPLAT]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i8> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT:    store i8 [[TMP3]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[BOOL2_EXT:%.*]] = zext i1 [[BOOL2]] to i32
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 [[BOOL1_EXT]], [[BOOL2_EXT]]
+; CHECK-NEXT:    [[XOR_TRUNC:%.*]] = trunc i32 [[XOR]] to i8
+; CHECK-NEXT:    store i8 [[XOR_TRUNC]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %bool1.ext = zext i1 %bool1 to i32
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %bool2.ext = zext i1 %bool2 to i32
+  %xor = xor i32 %bool1.ext, %bool2.ext
+  %xor.trunc = trunc i32 %xor to i8
+  store i8 %xor.trunc, ptr %dst, align 1
+  %iv.next = add i64 %iv, 1
+  %cmp = icmp ult i64 %iv.next, %n
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}

From d49bc5e621c8931679b232fa28abfc89a171105e Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Thu, 12 Jun 2025 13:10:04 +0000
Subject: [PATCH 230/851] [llvm][MemProf] Correct position of LLVM_ABI macro in
 computeFrameHistogram
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous placement resulted in this warning when using g++-13:
/home/david.spickett/llvm-project/llvm/include/llvm/Support/Compiler.h:120:43: warning: attribute ignored [-Wattributes]
  120 | #define LLVM_ATTRIBUTE_VISIBILITY_DEFAULT [[gnu::visibility("default")]]
      |                                           ^
/home/david.spickett/llvm-project/llvm/include/llvm/Support/Compiler.h:213:18: note: in expansion of macro ‘LLVM_ATTRIBUTE_VISIBILITY_DEFAULT’
  213 | #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
      |                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/home/david.spickett/llvm-project/llvm/lib/ProfileData/MemProfRadixTree.cpp:245:5: note: in expansion of macro ‘LLVM_ABI’
  245 |     LLVM_ABI computeFrameHistogram<FrameId>(
      |     ^~~~~~~~
/home/david.spickett/llvm-project/llvm/include/llvm/Support/Compiler.h:120:43: note: an attribute that appertains to a type-specifier is ignored
  120 | #define LLVM_ATTRIBUTE_VISIBILITY_DEFAULT [[gnu::visibility("default")]]
      |                                           ^

According to the interface guide, that macro should go before the return
type to be effective.

https://llvm.org/docs/InterfaceExportAnnotations.html#specialized-template-functions
---
 llvm/lib/ProfileData/MemProfRadixTree.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/ProfileData/MemProfRadixTree.cpp b/llvm/lib/ProfileData/MemProfRadixTree.cpp
index ea9f5bd25534c..c0672eb6da282 100644
--- a/llvm/lib/ProfileData/MemProfRadixTree.cpp
+++ b/llvm/lib/ProfileData/MemProfRadixTree.cpp
@@ -241,13 +241,13 @@ computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
 }
 
 // Explicitly instantiate function with the utilized FrameIdTy.
-template llvm::DenseMap<FrameId, FrameStat>
-    LLVM_ABI computeFrameHistogram<FrameId>(
-        llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
-            &MemProfCallStackData);
-template llvm::DenseMap<LinearFrameId, FrameStat>
-    LLVM_ABI computeFrameHistogram<LinearFrameId>(
-        llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
-            &MemProfCallStackData);
+template LLVM_ABI llvm::DenseMap<FrameId, FrameStat>
+computeFrameHistogram<FrameId>(
+    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
+        &MemProfCallStackData);
+template LLVM_ABI llvm::DenseMap<LinearFrameId, FrameStat>
+computeFrameHistogram<LinearFrameId>(
+    llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
+        &MemProfCallStackData);
 } // namespace memprof
 } // namespace llvm

From 843f256623a68f51a80ae503c08b98433eeda04d Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Thu, 12 Jun 2025 09:23:26 -0400
Subject: [PATCH 231/851] [gn] port 20d5d09e99188

---
 llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
index 2ed60b4cc33be..a7ea1cf309b97 100644
--- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
@@ -50,7 +50,6 @@ write_cmake_config("lit_common_configured") {
         rebase_path("$root_build_dir/bin/clang") + ext,
     "COMPILER_RT_TEST_COMPILER_ID=Clang",
     "Python3_EXECUTABLE=$python_path",
-    "Python3_ROOT_DIR=",  # FIXME
     "COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL=False",
     "COMPILER_RT_DEBUG_PYBOOL=False",
     "COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER_PYBOOL=False",

From 622df892b844749440124167e8eee9e652fba613 Mon Sep 17 00:00:00 2001
From: Pavel Labath <pavel@labath.sk>
Date: Thu, 12 Jun 2025 15:27:27 +0200
Subject: [PATCH 232/851] [lldb/cmake] Remove EXTRA_CXXFLAGS arg (#143731)

We have one library using this and three libraries directly calling
`target_compile_options`. Might as well standardize on the latter.
---
 lldb/cmake/modules/AddLLDB.cmake              |  5 +----
 .../Plugins/Language/ObjC/CMakeLists.txt      | 22 +++++++++----------
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/lldb/cmake/modules/AddLLDB.cmake b/lldb/cmake/modules/AddLLDB.cmake
index 0108fb22e5a03..28bf8d816d89a 100644
--- a/lldb/cmake/modules/AddLLDB.cmake
+++ b/lldb/cmake/modules/AddLLDB.cmake
@@ -41,7 +41,7 @@ function(add_lldb_library name)
   cmake_parse_arguments(PARAM
     "MODULE;SHARED;STATIC;OBJECT;PLUGIN;FRAMEWORK;NO_INTERNAL_DEPENDENCIES;NO_PLUGIN_DEPENDENCIES"
     "INSTALL_PREFIX"
-    "EXTRA_CXXFLAGS;LINK_LIBS;CLANG_LIBS"
+    "LINK_LIBS;CLANG_LIBS"
     ${ARGN})
 
   if(PARAM_NO_INTERNAL_DEPENDENCIES)
@@ -130,9 +130,6 @@ function(add_lldb_library name)
     add_dependencies(${name} clang-tablegen-targets)
   endif()
 
-  # Add in any extra C++ compilation flags for this library.
-  target_compile_options(${name} PRIVATE ${PARAM_EXTRA_CXXFLAGS})
-
   if(PARAM_PLUGIN)
     get_property(parent_dir DIRECTORY PROPERTY PARENT_DIRECTORY)
     if(EXISTS ${parent_dir})
diff --git a/lldb/source/Plugins/Language/ObjC/CMakeLists.txt b/lldb/source/Plugins/Language/ObjC/CMakeLists.txt
index 93c23fd32524b..b9fc5ce754c49 100644
--- a/lldb/source/Plugins/Language/ObjC/CMakeLists.txt
+++ b/lldb/source/Plugins/Language/ObjC/CMakeLists.txt
@@ -1,13 +1,3 @@
-set(EXTRA_CXXFLAGS "")
-
-if (CXX_SUPPORTS_NO_GNU_ANONYMOUS_STRUCT)
-  set(EXTRA_CXXFLAGS ${EXTRA_CXXFLAGS} -Wno-gnu-anonymous-struct)
-endif ()
-
-if (CXX_SUPPORTS_NO_NESTED_ANON_TYPES)
-  set(EXTRA_CXXFLAGS ${EXTRA_CXXFLAGS} -Wno-nested-anon-types)
-endif ()
-
 add_lldb_library(lldbPluginObjCLanguage PLUGIN
   ObjCLanguage.cpp
   CF.cpp
@@ -36,6 +26,14 @@ add_lldb_library(lldbPluginObjCLanguage PLUGIN
     lldbPluginTypeSystemClang
   CLANG_LIBS
     clangAST
-
-  EXTRA_CXXFLAGS ${EXTRA_CXXFLAGS}
 )
+
+if (CXX_SUPPORTS_NO_GNU_ANONYMOUS_STRUCT)
+  target_compile_options(lldbPluginObjCLanguage
+    PRIVATE -Wno-gnu-anonymous-struct)
+endif ()
+
+if (CXX_SUPPORTS_NO_NESTED_ANON_TYPES)
+  target_compile_options(lldbPluginObjCLanguage
+    PRIVATE -Wno-nested-anon-types)
+endif ()

From b8e3e0749fb62a9845f8790f858e11f2558f94a2 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Thu, 12 Jun 2025 15:27:19 +0200
Subject: [PATCH 233/851] [InstCombine] Export logic for common base pointer
 (NFC)

Make this available to other parts of InstCombine, to be used for
pointer comparison optimization.
---
 .../InstCombine/InstCombineAddSub.cpp         | 19 +++----------------
 .../InstCombine/InstCombineInternal.h         | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index f0f709bb16d8a..86d318967403d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2068,21 +2068,8 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
   return nullptr;
 }
 
-struct CommonBase {
-  /// Common base pointer.
-  Value *Ptr = nullptr;
-  /// LHS GEPs until common base.
-  SmallVector<GEPOperator *> LHSGEPs;
-  /// RHS GEPs until common base.
-  SmallVector<GEPOperator *> RHSGEPs;
-  /// LHS GEP NoWrapFlags until common base.
-  GEPNoWrapFlags LHSNW = GEPNoWrapFlags::all();
-  /// RHS GEP NoWrapFlags until common base.
-  GEPNoWrapFlags RHSNW = GEPNoWrapFlags::all();
-};
-
-static CommonBase computeCommonBase(Value *LHS, Value *RHS) {
-  CommonBase Base;
+CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
+  CommonPointerBase Base;
 
   if (LHS->getType() != RHS->getType())
     return Base;
@@ -2136,7 +2123,7 @@ static CommonBase computeCommonBase(Value *LHS, Value *RHS) {
 /// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
 Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
                                                    Type *Ty, bool IsNUW) {
-  CommonBase Base = computeCommonBase(LHS, RHS);
+  CommonPointerBase Base = CommonPointerBase::compute(LHS, RHS);
   if (!Base.Ptr)
     return nullptr;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 334462d715f95..bf7689bbfde70 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -862,6 +862,21 @@ class Negator final {
                                      InstCombinerImpl &IC);
 };
 
+struct CommonPointerBase {
+  /// Common base pointer.
+  Value *Ptr = nullptr;
+  /// LHS GEPs until common base.
+  SmallVector<GEPOperator *> LHSGEPs;
+  /// RHS GEPs until common base.
+  SmallVector<GEPOperator *> RHSGEPs;
+  /// LHS GEP NoWrapFlags until common base.
+  GEPNoWrapFlags LHSNW = GEPNoWrapFlags::all();
+  /// RHS GEP NoWrapFlags until common base.
+  GEPNoWrapFlags RHSNW = GEPNoWrapFlags::all();
+
+  static CommonPointerBase compute(Value *LHS, Value *RHS);
+};
+
 } // end namespace llvm
 
 #undef DEBUG_TYPE

From 3100b50f78c06dcd5207140e0d6e5ba6954d8828 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot@amd.com>
Date: Thu, 12 Jun 2025 14:35:28 +0100
Subject: [PATCH 234/851] [AMDGPU] Flatten recursive register resource info
 propagation (#142766)

In #112251 I had mentioned I'd follow up with flattening of recursion
for register resource info propagation

Behaviour prior to this patch when a recursive call is used is to take
the module scope worst case function register use (even prior to
AMDGPUMCResourceInfo). With this patch it will, when a cycle is
detected, attempt to do a simple cycle avoidant dfs to find the worst
case constant within the cycle and the cycle's propagates. In other
words, it will attempt to look for the cycle scope worst case rather
than module scope worst case.
---
 .../Target/AMDGPU/AMDGPUMCResourceInfo.cpp    | 100 +++++++++++++++---
 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h |   6 ++
 .../CodeGen/AMDGPU/function-resource-usage.ll |  32 +++---
 .../AMDGPU/recursive-resource-usage-mcexpr.ll |  82 +++++++++++++-
 4 files changed, 188 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index e3e3e411c6843..593b3ab220389 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -97,6 +97,86 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
   return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
 }
 
+// Tries to flatten recursive call register resource gathering. Simple cycle
+// avoiding dfs to find the constants in the propagated symbols.
+// Assumes:
+// - RecSym has been confirmed to recurse (this means the callee symbols should
+//   all be populated, started at RecSym).
+// - Shape of the resource symbol's MCExpr (`max` args are order agnostic):
+//   RecSym.MCExpr := max(<constant>+, <callee_symbol>*)
+const MCExpr *MCResourceInfo::flattenedCycleMax(MCSymbol *RecSym,
+                                                ResourceInfoKind RIK,
+                                                MCContext &OutContext) {
+  SmallPtrSet<const MCExpr *, 8> Seen;
+  SmallVector<const MCExpr *, 8> WorkList;
+  int64_t Maximum = 0;
+
+  const MCExpr *RecExpr = RecSym->getVariableValue();
+  WorkList.push_back(RecExpr);
+
+  while (!WorkList.empty()) {
+    const MCExpr *CurExpr = WorkList.pop_back_val();
+    switch (CurExpr->getKind()) {
+    default: {
+      // Assuming the recursion is of shape `max(<constant>, <callee_symbol>)`
+      // where <callee_symbol> will eventually recurse. If this condition holds,
+      // the recursion occurs within some other (possibly unresolvable) MCExpr,
+      // thus using the worst case value then.
+      if (!AMDGPUMCExpr::isSymbolUsedInExpression(RecSym, CurExpr)) {
+        LLVM_DEBUG(dbgs() << "MCResUse:   " << RecSym->getName()
+                          << ": Recursion in unexpected sub-expression, using "
+                             "module maximum\n");
+        switch (RIK) {
+        default:
+          break;
+        case RIK_NumVGPR:
+          return MCSymbolRefExpr::create(getMaxVGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        case RIK_NumSGPR:
+          return MCSymbolRefExpr::create(getMaxSGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        case RIK_NumAGPR:
+          return MCSymbolRefExpr::create(getMaxAGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        }
+      }
+      break;
+    }
+    case MCExpr::ExprKind::Constant: {
+      int64_t Val = cast<MCConstantExpr>(CurExpr)->getValue();
+      Maximum = std::max(Maximum, Val);
+      break;
+    }
+    case MCExpr::ExprKind::SymbolRef: {
+      const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(CurExpr);
+      const MCSymbol &SymRef = SymExpr->getSymbol();
+      if (SymRef.isVariable()) {
+        const MCExpr *SymVal = SymRef.getVariableValue();
+        if (Seen.insert(SymVal).second)
+          WorkList.push_back(SymVal);
+      }
+      break;
+    }
+    case MCExpr::ExprKind::Target: {
+      const AMDGPUMCExpr *TargetExpr = cast<AMDGPUMCExpr>(CurExpr);
+      if (TargetExpr->getKind() == AMDGPUMCExpr::VariantKind::AGVK_Max) {
+        for (auto &Arg : TargetExpr->getArgs())
+          WorkList.push_back(Arg);
+      }
+      break;
+    }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "MCResUse:   " << RecSym->getName()
+                    << ": Using flattened max: << " << Maximum << '\n');
+
+  return MCConstantExpr::create(Maximum, OutContext);
+}
+
 void MCResourceInfo::assignResourceInfoExpr(
     int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
     const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees,
@@ -133,25 +213,19 @@ void MCResourceInfo::assignResourceInfoExpr(
                           << CalleeValSym->getName() << " as callee\n");
         ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
       } else {
-        LLVM_DEBUG(
-            dbgs() << "MCResUse:   " << Sym->getName()
-                   << ": Recursion found, falling back to module maximum\n");
-        // In case of recursion: make sure to use conservative register counts
-        // (i.e., specifically for VGPR/SGPR/AGPR).
+        LLVM_DEBUG(dbgs() << "MCResUse:   " << Sym->getName()
+                          << ": Recursion found, attempt flattening of cycle "
+                             "for resource usage\n");
+        // In case of recursion for vgpr/sgpr/agpr resource usage: try to
+        // flatten and use the max of the call cycle. May still end up emitting
+        // module max if not fully resolvable.
         switch (RIK) {
         default:
           break;
         case RIK_NumVGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxVGPRSymbol(OutContext), OutContext));
-          break;
         case RIK_NumSGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxSGPRSymbol(OutContext), OutContext));
-          break;
         case RIK_NumAGPR:
-          ArgExprs.push_back(MCSymbolRefExpr::create(
-              getMaxAGPRSymbol(OutContext), OutContext));
+          ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
           break;
         }
       }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index a670878948c31..fa98f82d11022 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -58,6 +58,12 @@ class MCResourceInfo {
   // Assigns expression for Max S/V/A-GPRs to the referenced symbols.
   void assignMaxRegs(MCContext &OutContext);
 
+  // Take flattened max of cyclic function calls' knowns. For example, for
+  // a cycle A->B->C->D->A, take max(A, B, C, D) for A and have B, C, D have the
+  // propgated value from A.
+  const MCExpr *flattenedCycleMax(MCSymbol *RecSym, ResourceInfoKind RIK,
+                                  MCContext &OutContext);
+
 public:
   MCResourceInfo() = default;
   void addMaxVGPRCandidate(int32_t candidate) {
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index 0a6aa05c2d212..2a18d40e0bd8a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -495,17 +495,17 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; GCN: NumVgprs: max(43, multi_stage_recurse1.num_vgpr)
 ; GCN: ScratchSize: 16+max(multi_stage_recurse1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse1:
-; GCN: .set multi_stage_recurse1.num_vgpr, max(48, amdgpu.max_num_vgpr)
-; GCN: .set multi_stage_recurse1.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, amdgpu.max_num_sgpr)
+; GCN: .set multi_stage_recurse1.num_vgpr, max(48, 43)
+; GCN: .set multi_stage_recurse1.num_agpr, max(0, 0)
+; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, 34)
 ; GCN: .set multi_stage_recurse1.private_seg_size, 16
 ; GCN: .set multi_stage_recurse1.uses_vcc, 1
 ; GCN: .set multi_stage_recurse1.uses_flat_scratch, 0
 ; GCN: .set multi_stage_recurse1.has_dyn_sized_stack, 0
 ; GCN: .set multi_stage_recurse1.has_recursion, 1
 ; GCN: .set multi_stage_recurse1.has_indirect_call, 0
-; GCN: TotalNumSgprs: multi_stage_recurse1.numbered_sgpr+4
-; GCN: NumVgprs: max(48, amdgpu.max_num_vgpr)
+; GCN: TotalNumSgprs: 38
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 16
 define void @multi_stage_recurse1(i32 %val) #2 {
   call void @multi_stage_recurse2(i32 %val)
@@ -528,8 +528,8 @@ define void @multi_stage_recurse2(i32 %val) #2 {
 ; GCN: .set usage_multi_stage_recurse.has_dyn_sized_stack, or(0, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN: .set usage_multi_stage_recurse.has_recursion, or(1, multi_stage_recurse1.has_recursion)
 ; GCN: .set usage_multi_stage_recurse.has_indirect_call, or(0, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: usage_multi_stage_recurse.numbered_sgpr+6
-; GCN: NumVgprs: usage_multi_stage_recurse.num_vgpr
+; GCN: TotalNumSgprs: 40
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 16
 define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
   call void @multi_stage_recurse1(i32 %n)
@@ -550,17 +550,17 @@ define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
 ; GCN: NumVgprs: max(41, multi_stage_recurse_noattr1.num_vgpr)
 ; GCN: ScratchSize: 16+max(multi_stage_recurse_noattr1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse_noattr1:
-; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, amdgpu.max_num_vgpr)
-; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, amdgpu.max_num_sgpr)
+; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, 41)
+; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, 0)
+; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, 54)
 ; GCN: .set multi_stage_recurse_noattr1.private_seg_size, 16
 ; GCN: .set multi_stage_recurse_noattr1.uses_vcc, 1
 ; GCN: .set multi_stage_recurse_noattr1.uses_flat_scratch, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_dyn_sized_stack, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_recursion, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_indirect_call, 0
-; GCN: TotalNumSgprs: multi_stage_recurse_noattr1.numbered_sgpr+4
-; GCN: NumVgprs: max(41, amdgpu.max_num_vgpr)
+; GCN: TotalNumSgprs: 61
+; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 16
 define void @multi_stage_recurse_noattr1(i32 %val) #0 {
   call void @multi_stage_recurse_noattr2(i32 %val)
@@ -583,8 +583,8 @@ define void @multi_stage_recurse_noattr2(i32 %val) #0 {
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_dyn_sized_stack, or(0, multi_stage_recurse_noattr1.has_dyn_sized_stack)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_recursion, or(0, multi_stage_recurse_noattr1.has_recursion)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_indirect_call, or(0, multi_stage_recurse_noattr1.has_indirect_call)
-; GCN: TotalNumSgprs: usage_multi_stage_recurse_noattrs.numbered_sgpr+6
-; GCN: NumVgprs: usage_multi_stage_recurse_noattrs.num_vgpr
+; GCN: TotalNumSgprs: 63
+; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 16
 define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
   call void @multi_stage_recurse_noattr1(i32 %n)
@@ -601,8 +601,8 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_recursion, or(1, use_stack0.has_recursion, use_stack1.has_recursion, multi_stage_recurse1.has_recursion)
 ; GCN:  .set multi_call_with_multi_stage_recurse.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: multi_call_with_multi_stage_recurse.numbered_sgpr+6
-; GCN: NumVgprs:  multi_call_with_multi_stage_recurse.num_vgpr
+; GCN: TotalNumSgprs: 59
+; GCN: NumVgprs:  48
 ; GCN: ScratchSize: 2052
 define amdgpu_kernel void @multi_call_with_multi_stage_recurse(i32 %n) #0 {
   call void @use_stack0()
diff --git a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
index 3093349bff37c..a41a06592f62f 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s
 
+; Recursion: foo -> bar -> baz -> qux -> foo
+
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, max(71, foo.num_vgpr)
 ; CHECK: .set qux.num_agpr, max(0, foo.num_agpr)
@@ -34,9 +36,9 @@
 ; CHECK: .set bar.has_indirect_call, or(0, baz.has_indirect_call)
 
 ; CHECK-LABEL: {{^}}foo
-; CHECK: .set foo.num_vgpr, max(46, amdgpu.max_num_vgpr)
-; CHECK: .set foo.num_agpr, max(0, amdgpu.max_num_agpr)
-; CHECK: .set foo.numbered_sgpr, max(71, amdgpu.max_num_sgpr)
+; CHECK: .set foo.num_vgpr, max(46, 71)
+; CHECK: .set foo.num_agpr, max(0, 0)
+; CHECK: .set foo.numbered_sgpr, max(71, 61)
 ; CHECK: .set foo.private_seg_size, 16
 ; CHECK: .set foo.uses_vcc, 1
 ; CHECK: .set foo.uses_flat_scratch, 0
@@ -91,3 +93,77 @@ define amdgpu_kernel void @usefoo() {
   ret void
 }
 
+; Recursion: A -> B -> C -> A && C -> D -> C
+
+; CHECK-LABEL: {{^}}D
+; CHECK: .set D.num_vgpr, max(71, C.num_vgpr)
+; CHECK: .set D.num_agpr, max(0, C.num_agpr)
+; CHECK: .set D.numbered_sgpr, max(71, C.numbered_sgpr)
+; CHECK: .set D.private_seg_size, 16+max(C.private_seg_size)
+; CHECK: .set D.uses_vcc, or(1, C.uses_vcc)
+; CHECK: .set D.uses_flat_scratch, or(0, C.uses_flat_scratch)
+; CHECK: .set D.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
+; CHECK: .set D.has_recursion, or(1, C.has_recursion)
+; CHECK: .set D.has_indirect_call, or(0, C.has_indirect_call)
+
+; CHECK-LABEL: {{^}}C
+; CHECK: .set C.num_vgpr, max(42, A.num_vgpr, 71)
+; CHECK: .set C.num_agpr, max(0, A.num_agpr, 0)
+; CHECK: .set C.numbered_sgpr, max(71, A.numbered_sgpr, 71)
+; CHECK: .set C.private_seg_size, 16+max(A.private_seg_size)
+; CHECK: .set C.uses_vcc, or(1, A.uses_vcc)
+; CHECK: .set C.uses_flat_scratch, or(0, A.uses_flat_scratch)
+; CHECK: .set C.has_dyn_sized_stack, or(0, A.has_dyn_sized_stack)
+; CHECK: .set C.has_recursion, or(1, A.has_recursion)
+; CHECK: .set C.has_indirect_call, or(0, A.has_indirect_call)
+
+; CHECK-LABEL: {{^}}B
+; CHECK: .set B.num_vgpr, max(42, C.num_vgpr)
+; CHECK: .set B.num_agpr, max(0, C.num_agpr)
+; CHECK: .set B.numbered_sgpr, max(71, C.numbered_sgpr)
+; CHECK: .set B.private_seg_size, 16+max(C.private_seg_size)
+; CHECK: .set B.uses_vcc, or(1, C.uses_vcc)
+; CHECK: .set B.uses_flat_scratch, or(0, C.uses_flat_scratch)
+; CHECK: .set B.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
+; CHECK: .set B.has_recursion, or(1, C.has_recursion)
+; CHECK: .set B.has_indirect_call, or(0, C.has_indirect_call)
+
+; CHECK-LABEL: {{^}}A
+; CHECK: .set A.num_vgpr, max(42, 71)
+; CHECK: .set A.num_agpr, max(0, 0)
+; CHECK: .set A.numbered_sgpr, max(71, 71)
+; CHECK: .set A.private_seg_size, 16
+; CHECK: .set A.uses_vcc, 1
+; CHECK: .set A.uses_flat_scratch, 0
+; CHECK: .set A.has_dyn_sized_stack, 0
+; CHECK: .set A.has_recursion, 1
+; CHECK: .set A.has_indirect_call, 0
+
+define void @A() {
+  call void @B()
+  call void asm sideeffect "", "~{v10}"()
+  call void asm sideeffect "", "~{s50}"()
+  ret void
+}
+
+define void @B() {
+  call void @C()
+  call void asm sideeffect "", "~{v20}"()
+  call void asm sideeffect "", "~{s30}"()
+  ret void
+}
+
+define void @C() {
+  call void @A()
+  call void @D()
+  call void asm sideeffect "", "~{v30}"()
+  call void asm sideeffect "", "~{s40}"()
+  ret void
+}
+
+define void @D() {
+  call void @C()
+  call void asm sideeffect "", "~{v70}"()
+  call void asm sideeffect "", "~{s70}"()
+  ret void
+}

From 79f4a43839386e785451c8f0a362b2d1e5850b74 Mon Sep 17 00:00:00 2001
From: Shamshura Egor <164661612+egorshamshura@users.noreply.github.com>
Date: Thu, 12 Jun 2025 17:21:05 +0300
Subject: [PATCH 235/851] [X86] VPTERNLOG comments - use "mem" just for full
 width loads and "m32bcst" / "m64bcst" for broadcast loads (#143721)

Use "mem" just for full width loads and "m32bcst" / "m64bcst" for 32-bit (D) / 64-bit (Q) broadcasts.

Fixes #143679

---------

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 .../X86/MCTargetDesc/X86InstComments.cpp      | 27 +++++++++++++--
 .../any_extend_vector_inreg_of_broadcast.ll   | 16 ++++-----
 ...d_vector_inreg_of_broadcast_from_memory.ll | 16 ++++-----
 llvm/test/CodeGen/X86/avgfloors.ll            | 12 +++----
 llvm/test/CodeGen/X86/avx512-cvt.ll           |  2 +-
 llvm/test/CodeGen/X86/avx512-logic.ll         |  4 +--
 llvm/test/CodeGen/X86/avx512fp16-arith.ll     |  6 ++--
 llvm/test/CodeGen/X86/avx512vl-logic.ll       |  8 ++---
 llvm/test/CodeGen/X86/combine-bitselect.ll    |  6 ++--
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   |  2 +-
 llvm/test/CodeGen/X86/fp-round.ll             | 34 +++++++++----------
 llvm/test/CodeGen/X86/gfni-funnel-shifts.ll   | 12 +++----
 llvm/test/CodeGen/X86/gfni-shifts.ll          |  6 ++--
 llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 10 +++---
 llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 10 +++---
 .../CodeGen/X86/min-legal-vector-width.ll     | 12 +++----
 llvm/test/CodeGen/X86/pmul.ll                 |  4 +--
 llvm/test/CodeGen/X86/psubus.ll               |  6 ++--
 llvm/test/CodeGen/X86/sadd_sat_vec.ll         |  2 +-
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    |  2 +-
 llvm/test/CodeGen/X86/ssub_sat_vec.ll         |  4 +--
 llvm/test/CodeGen/X86/usub_sat_vec.ll         |  2 +-
 llvm/test/CodeGen/X86/vector-fshl-128.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshl-256.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshl-512.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshl-rot-128.ll  | 12 +++----
 llvm/test/CodeGen/X86/vector-fshl-rot-256.ll  | 22 ++++++------
 llvm/test/CodeGen/X86/vector-fshl-rot-512.ll  | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-128.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-256.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-512.ll      | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll  | 12 +++----
 llvm/test/CodeGen/X86/vector-fshr-rot-256.ll  | 24 ++++++-------
 llvm/test/CodeGen/X86/vector-fshr-rot-512.ll  | 12 +++----
 llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll |  6 ++--
 llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll |  2 +-
 .../vector-interleaved-store-i16-stride-5.ll  |  8 ++---
 .../vector-interleaved-store-i16-stride-7.ll  | 16 ++++-----
 llvm/test/CodeGen/X86/vector-rotate-128.ll    | 12 +++----
 llvm/test/CodeGen/X86/vector-rotate-256.ll    | 22 ++++++------
 llvm/test/CodeGen/X86/vector-rotate-512.ll    | 32 ++++++++---------
 .../test/CodeGen/X86/vector-shift-ashr-128.ll |  4 +--
 .../test/CodeGen/X86/vector-shift-ashr-256.ll |  4 +--
 .../test/CodeGen/X86/vector-shift-ashr-512.ll |  2 +-
 .../CodeGen/X86/vector-shift-ashr-sub128.ll   | 12 +++----
 llvm/test/CodeGen/X86/vector-shift-shl-256.ll |  2 +-
 llvm/test/CodeGen/X86/vector-shift-shl-512.ll |  4 +--
 .../test/CodeGen/X86/vector-shuffle-avx512.ll |  2 +-
 llvm/test/CodeGen/X86/vselect-pcmp.ll         |  4 +--
 .../zero_extend_vector_inreg_of_broadcast.ll  |  8 ++---
 ...d_vector_inreg_of_broadcast_from_memory.ll |  8 ++---
 51 files changed, 269 insertions(+), 248 deletions(-)

diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 4c26fc86f9547..547745fdba9d6 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -931,10 +931,18 @@ static bool printPTERNLOGComments(const MCInst *MI, raw_ostream &OS,
     // dest, src1, mask, src2, memory, tbl
     CASE_PTERNLOG(PTERNLOGD, m)
     CASE_PTERNLOG(PTERNLOGQ, m)
+    Src2Idx = NumOperands - 7;
+    Src3Idx = -1;
+    break;
+
     CASE_PTERNLOG(PTERNLOGD, mb)
+    Src2Idx = NumOperands - 7;
+    Src3Idx = -2;
+    break;
+
     CASE_PTERNLOG(PTERNLOGQ, mb)
     Src2Idx = NumOperands - 7;
-    Src3Idx = -1;
+    Src3Idx = -3;
     break;
 
   default:
@@ -943,8 +951,21 @@ static bool printPTERNLOGComments(const MCInst *MI, raw_ostream &OS,
   StringRef DestName = getRegName(MI->getOperand(0).getReg());
   StringRef Src1Name = getRegName(MI->getOperand(1).getReg());
   StringRef Src2Name = getRegName(MI->getOperand(Src2Idx).getReg());
-  StringRef Src3Name =
-      Src3Idx != -1 ? getRegName(MI->getOperand(Src3Idx).getReg()) : "mem";
+  StringRef Src3Name;
+  switch (Src3Idx) {
+  case -1:
+    Src3Name = "mem";
+    break;
+  case -2:
+    Src3Name = "m32bcst";
+    break;
+  case -3:
+    Src3Name = "m64bcst";
+    break;
+  default:
+    Src3Name = getRegName(MI->getOperand(Src3Idx).getReg());
+    break;
+  }
   uint8_t TruthTable = MI->getOperand(NumOperands - 1).getImm();
 
   StringRef SrcNames[] = {Src1Name, Src2Name, Src3Name};
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 7d2915ddc75b1..dec829fed3535 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1235,7 +1235,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1248,7 +1248,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1359,7 +1359,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1372,7 +1372,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2702,7 +2702,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm0 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
@@ -2717,7 +2717,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm0 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
@@ -2964,7 +2964,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
 ; AVX512F-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512F-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, 32(%rcx)
@@ -2979,7 +2979,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.v
 ; AVX512DQ-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm0 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, 32(%rcx)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index dc723eb713c28..3d4cddbb94c7b 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1020,7 +1020,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1030,7 +1030,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1116,7 +1116,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1126,7 +1126,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -2125,7 +2125,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rdx)
@@ -2137,7 +2137,7 @@ define void @vec384_i8_widen_to_i32_factor4_broadcast_to_v12i32_factor12(ptr %in
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rdx)
@@ -2346,7 +2346,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (m64bcst & (ymm0 ^ ymm1))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqa %ymm1, 32(%rdx)
@@ -2358,7 +2358,7 @@ define void @vec384_i8_widen_to_i64_factor8_broadcast_to_v6i64_factor6(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 48(%rdi), %xmm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (m64bcst & (ymm0 ^ ymm1))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, 32(%rdx)
diff --git a/llvm/test/CodeGen/X86/avgfloors.ll b/llvm/test/CodeGen/X86/avgfloors.ll
index 0508e5ccb5430..9cc55c6f7a81f 100644
--- a/llvm/test/CodeGen/X86/avgfloors.ll
+++ b/llvm/test/CodeGen/X86/avgfloors.ll
@@ -53,7 +53,7 @@ define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
@@ -108,7 +108,7 @@ define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
@@ -405,7 +405,7 @@ define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
@@ -478,7 +478,7 @@ define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
@@ -966,7 +966,7 @@ define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
@@ -1078,7 +1078,7 @@ define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsrlw $1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
-; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m32bcst)
 ; AVX512-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index 3dd7b571b9215..76c87900b04d2 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -351,7 +351,7 @@ define <8 x double> @ulto8f64(<8 x i64> %a) {
 ; NODQ-LABEL: ulto8f64:
 ; NODQ:       # %bb.0:
 ; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; NODQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; NODQ-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & m64bcst)
 ; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
 ; NODQ-NEXT:    vporq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
 ; NODQ-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll
index 23f4fcb1c77c6..bdcc524545fb1 100644
--- a/llvm/test/CodeGen/X86/avx512-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512-logic.ll
@@ -889,7 +889,7 @@ define <16 x i32> @ternlog_xor_andn(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z)
 define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
 ; ALL-LABEL: ternlog_or_and_mask:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = (zmm0 & mem) | zmm1
+; ALL-NEXT:    vpternlogd {{.*#+}} zmm0 = (zmm0 & m32bcst) | zmm1
 ; ALL-NEXT:    retq
   %a = and <16 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %b = or <16 x i32> %a, %y
@@ -899,7 +899,7 @@ define <16 x i32> @ternlog_or_and_mask(<16 x i32> %x, <16 x i32> %y) {
 define <8 x i64> @ternlog_xor_and_mask(<8 x i64> %x, <8 x i64> %y) {
 ; ALL-LABEL: ternlog_xor_and_mask:
 ; ALL:       ## %bb.0:
-; ALL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; ALL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m64bcst)
 ; ALL-NEXT:    retq
   %a = and <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   %b = xor <8 x i64> %a, %y
diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll
index b264f5fc34688..d19c9bb550178 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll
@@ -384,7 +384,7 @@ declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
 define <8 x half> @fcopysignv8f16(<8 x half> %x, <8 x half> %y) {
 ; CHECK-LABEL: fcopysignv8f16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1))
+; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (m32bcst & (xmm0 ^ xmm1))
 ; CHECK-NEXT:    retq
   %a = call <8 x half> @llvm.copysign.v8f16(<8 x half> %x, <8 x half> %y)
   ret <8 x half> %a
@@ -439,7 +439,7 @@ declare <16 x half> @llvm.fabs.v16f16(<16 x half>)
 define <16 x half> @fcopysignv16f16(<16 x half> %x, <16 x half> %y) {
 ; CHECK-LABEL: fcopysignv16f16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-NEXT:    retq
   %a = call <16 x half> @llvm.copysign.v16f16(<16 x half> %x, <16 x half> %y)
   ret <16 x half> %a
@@ -494,7 +494,7 @@ declare <32 x half> @llvm.fabs.v32f16(<32 x half>)
 define <32 x half> @fcopysignv32f16(<32 x half> %x, <32 x half> %y) {
 ; CHECK-LABEL: fcopysignv32f16:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
+; CHECK-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (m32bcst & (zmm0 ^ zmm1))
 ; CHECK-NEXT:    retq
   %a = call <32 x half> @llvm.copysign.v32f16(<32 x half> %x, <32 x half> %y)
   ret <32 x half> %a
diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll
index 284a0eb33047c..c1ae0e36c2c0d 100644
--- a/llvm/test/CodeGen/X86/avx512vl-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll
@@ -1039,7 +1039,7 @@ define <4 x i32> @ternlog_xor_andn(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: ternlog_or_and_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = (xmm0 & mem) | xmm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} xmm0 = (xmm0 & m32bcst) | xmm1
 ; CHECK-NEXT:    retq
   %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
   %b = or <4 x i32> %a, %y
@@ -1049,7 +1049,7 @@ define <4 x i32> @ternlog_or_and_mask(<4 x i32> %x, <4 x i32> %y) {
 define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) {
 ; CHECK-LABEL: ternlog_or_and_mask_ymm:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd {{.*#+}} ymm0 = (ymm0 & mem) | ymm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} ymm0 = (ymm0 & m32bcst) | ymm1
 ; CHECK-NEXT:    retq
   %a = and <8 x i32> %x, <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
   %b = or <8 x i32> %a, %y
@@ -1059,7 +1059,7 @@ define <8 x i32> @ternlog_or_and_mask_ymm(<8 x i32> %x, <8 x i32> %y) {
 define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: ternlog_xor_and_mask:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; CHECK-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m64bcst)
 ; CHECK-NEXT:    retq
   %a = and <2 x i64> %x, <i64 1099511627775, i64 1099511627775>
   %b = xor <2 x i64> %a, %y
@@ -1069,7 +1069,7 @@ define <2 x i64> @ternlog_xor_and_mask(<2 x i64> %x, <2 x i64> %y) {
 define <4 x i64> @ternlog_xor_and_mask_ymm(<4 x i64> %x, <4 x i64> %y) {
 ; CHECK-LABEL: ternlog_xor_and_mask_ymm:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; CHECK-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m64bcst)
 ; CHECK-NEXT:    retq
   %a = and <4 x i64> %x, <i64 72057594037927935, i64 72057594037927935, i64 72057594037927935, i64 72057594037927935>
   %b = xor <4 x i64> %a, %y
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index 25c26d598881a..4f1c00b64fa98 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -283,7 +283,7 @@ define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, pt
 ;
 ; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (m64bcst & (xmm0 ^ xmm1))
 ; AVX512VL-NEXT:    retq
   %a2 = load i64, ptr %p2
   %1 = insertelement <2 x i64> undef, i64 %a2, i32 0
@@ -604,7 +604,7 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, pt
 ;
 ; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm1 ^ (m64bcst & (ymm0 ^ ymm1))
 ; AVX512VL-NEXT:    retq
   %a2 = load i64, ptr %p2
   %1 = insertelement <4 x i64> undef, i64 %a2, i32 0
@@ -975,7 +975,7 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, pt
 ;
 ; AVX512-LABEL: bitselect_v8i64_broadcast_rrm:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
+; AVX512-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm1 ^ (m64bcst & (zmm0 ^ zmm1))
 ; AVX512-NEXT:    retq
   %a2 = load i64, ptr %p2
   %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 95b5fcf8eac52..54390d8b66f7d 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -808,7 +808,7 @@ define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
 ; AVX512-LABEL: or_and_v2i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [7,7]
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 & (xmm0 | mem)
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 & (xmm0 | m64bcst)
 ; AVX512-NEXT:    retq
   %1 = and <2 x i64> %a0, <i64 7, i64 7>
   %2 = or <2 x i64> %1, <i64 3, i64 3>
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index e98fb8e374c0b..8595b63fc8107 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -52,7 +52,7 @@ define half @round_f16(half %h) {
 ; AVX512F:       # %bb.0: # %entry
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
@@ -98,7 +98,7 @@ define float @round_f32(float %x) {
 ; AVX512F-LABEL: round_f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -106,7 +106,7 @@ define float @round_f32(float %x) {
 ; AVX512FP16-LABEL: round_f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512FP16-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -142,7 +142,7 @@ define double @round_f64(double %x) {
 ; AVX512F-LABEL: round_f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
 ; AVX512F-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -150,7 +150,7 @@ define double @round_f64(double %x) {
 ; AVX512FP16-LABEL: round_f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
 ; AVX512FP16-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -208,7 +208,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; AVX512F-LABEL: round_v4f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512F-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -216,7 +216,7 @@ define <4 x float> @round_v4f32(<4 x float> %x) {
 ; AVX512FP16-LABEL: round_v4f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm1 | (xmm0 & m32bcst)
 ; AVX512FP16-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundps $11, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -262,7 +262,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) {
 ; AVX512F-LABEL: round_v2f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512F-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
 ; AVX512F-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX512F-NEXT:    retq
@@ -270,7 +270,7 @@ define <2 x double> @round_v2f64(<2 x double> %x) {
 ; AVX512FP16-LABEL: round_v2f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} xmm1 = xmm1 | (xmm0 & m64bcst)
 ; AVX512FP16-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    vroundpd $11, %xmm0, %xmm0
 ; AVX512FP16-NEXT:    retq
@@ -356,7 +356,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; AVX512F-LABEL: round_v8f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & m32bcst)
 ; AVX512F-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vroundps $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -364,7 +364,7 @@ define <8 x float> @round_v8f32(<8 x float> %x) {
 ; AVX512FP16-LABEL: round_v8f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 | (ymm0 & m32bcst)
 ; AVX512FP16-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    vroundps $11, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    retq
@@ -426,7 +426,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) {
 ; AVX512F-LABEL: round_v4f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & m64bcst)
 ; AVX512F-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -434,7 +434,7 @@ define <4 x double> @round_v4f64(<4 x double> %x) {
 ; AVX512FP16-LABEL: round_v4f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & m64bcst)
 ; AVX512FP16-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    vroundpd $11, %ymm0, %ymm0
 ; AVX512FP16-NEXT:    retq
@@ -582,7 +582,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; AVX512F-LABEL: round_v16f32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & m32bcst)
 ; AVX512F-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -590,7 +590,7 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
 ; AVX512FP16-LABEL: round_v16f32:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
-; AVX512FP16-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & m32bcst)
 ; AVX512FP16-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    vrndscaleps $11, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    retq
@@ -690,7 +690,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
 ; AVX512F-LABEL: round_v8f64:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & m64bcst)
 ; AVX512F-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -698,7 +698,7 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
 ; AVX512FP16-LABEL: round_v8f64:
 ; AVX512FP16:       ## %bb.0:
 ; AVX512FP16-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
-; AVX512FP16-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
+; AVX512FP16-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & m64bcst)
 ; AVX512FP16-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    vrndscalepd $11, %zmm0, %zmm0
 ; AVX512FP16-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
index 0ca3380d188b7..7001bf7f28071 100644
--- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll
@@ -492,7 +492,7 @@ define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $3, %xmm0, %xmm2
 ; GFNIAVX512-NEXT:    vpsrlw $5, %xmm1, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
   ret <16 x i8> %res
@@ -518,7 +518,7 @@ define <16 x i8> @splatconstant_fshr_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpaddw %xmm0, %xmm0, %xmm2
 ; GFNIAVX512-NEXT:    vpsrlw $7, %xmm1, %xmm0
-; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>)
   ret <16 x i8> %res
@@ -1311,7 +1311,7 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; GFNIAVX512-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <32 x i8> %res
@@ -1349,7 +1349,7 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind
 ; GFNIAVX512:       # %bb.0:
 ; GFNIAVX512-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; GFNIAVX512-NEXT:    vpsrlw $6, %ymm1, %ymm0
-; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; GFNIAVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; GFNIAVX512-NEXT:    retq
   %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> <i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6, i8 6>)
   ret <32 x i8> %res
@@ -2775,7 +2775,7 @@ define <64 x i8> @splatconstant_fshl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ; GFNIAVX512BW:       # %bb.0:
 ; GFNIAVX512BW-NEXT:    vpaddw %zmm0, %zmm0, %zmm2
 ; GFNIAVX512BW-NEXT:    vpsrlw $7, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; GFNIAVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
   ret <64 x i8> %res
@@ -2836,7 +2836,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind
 ; GFNIAVX512BW:       # %bb.0:
 ; GFNIAVX512BW-NEXT:    vpsllw $6, %zmm0, %zmm2
 ; GFNIAVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; GFNIAVX512BW-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index 5cd1a2c76762e..cd16651123b07 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -1261,7 +1261,7 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
 ; GFNIAVX512VL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
 ; GFNIAVX512VL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; GFNIAVX512VL-NEXT:    vpsllw $8, %ymm0, %ymm0
-; GFNIAVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & mem)
+; GFNIAVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: constant_shl_v32i8:
@@ -2634,7 +2634,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX512VL-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vpsllw $8, %ymm1, %ymm1
 ; GFNIAVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; GFNIAVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
+; GFNIAVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & m32bcst)
 ; GFNIAVX512VL-NEXT:    retq
 ;
 ; GFNIAVX512BW-LABEL: constant_shl_v64i8:
@@ -2642,7 +2642,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
 ; GFNIAVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
 ; GFNIAVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; GFNIAVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
+; GFNIAVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
 ; GFNIAVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index ac6b7e54ca5b5..a798f4c38f68f 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2500,7 +2500,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2729,7 +2729,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} xmm2 = ~xmm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2961,7 +2961,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm0, %xmm2, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -3192,7 +3192,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -3432,7 +3432,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index a8021e3164f34..3a4a638c7330a 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -2016,7 +2016,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2194,7 +2194,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} ymm2 = ~ymm2
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2372,7 +2372,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm2, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2550,7 +2550,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -2733,7 +2733,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
+; AVX512VL-FALLBACK-NEXT:    vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & m32bcst)
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm2, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 24c884211cf97..d752659f94a50 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1993,21 +1993,21 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-v
 ; CHECK-SKX:       # %bb.0:
 ; CHECK-SKX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-SKX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-SKX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-SKX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-SKX-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: splatconstant_rotate_v32i8:
 ; CHECK-AVX512:       # %bb.0:
 ; CHECK-AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-AVX512-NEXT:    retq
 ;
 ; CHECK-VBMI1-LABEL: splatconstant_rotate_v32i8:
 ; CHECK-VBMI1:       # %bb.0:
 ; CHECK-VBMI1-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-VBMI1-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-VBMI1-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-VBMI1-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-VBMI1-NEXT:    retq
 ;
 ; CHECK-GFNI-LABEL: splatconstant_rotate_v32i8:
@@ -2025,7 +2025,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le
 ; CHECK-SKX:       # %bb.0:
 ; CHECK-SKX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-SKX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-SKX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-SKX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-SKX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
 ; CHECK-SKX-NEXT:    retq
 ;
@@ -2033,7 +2033,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le
 ; CHECK-AVX512:       # %bb.0:
 ; CHECK-AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
 ; CHECK-AVX512-NEXT:    retq
 ;
@@ -2041,7 +2041,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le
 ; CHECK-VBMI1:       # %bb.0:
 ; CHECK-VBMI1-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; CHECK-VBMI1-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; CHECK-VBMI1-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; CHECK-VBMI1-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; CHECK-VBMI1-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
 ; CHECK-VBMI1-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index c7cc2acaf2627..9aee2f11e9ea4 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -832,7 +832,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; AVX512F-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsllw $8, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & m32bcst)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v64i8c:
@@ -840,7 +840,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind  {
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
 ; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
 ; AVX512BW-NEXT:    retq
 entry:
   %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 73ee28a7fd247..e10b360b35b56 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -146,7 +146,7 @@ define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind {
 ; AVX512-LABEL: ashr_xor_and_custom:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ m32bcst)
 ; AVX512-NEXT:    retq
   %signsplat = ashr <4 x i32> %x, <i32 undef, i32 31, i32 31, i32 31>
   %flipsign = xor <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
@@ -187,7 +187,7 @@ define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind {
 ; AVX512-LABEL: ashr_add_and_custom:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ m32bcst)
 ; AVX512-NEXT:    retq
   %signsplat = ashr <4 x i32> %x, <i32 undef, i32 31, i32 31, i32 31>
   %flipsign = add <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
@@ -230,7 +230,7 @@ define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind {
 ; AVX512-LABEL: usubsat_custom:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm1
-; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem)
+; AVX512-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ m32bcst)
 ; AVX512-NEXT:    retq
   %res = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> <i32 2147483648, i32 2147483648, i32 2147483648, i32 undef>)
   ret <4 x i32> %res
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index bd563f97b0ac4..80b55a364dba9 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -567,7 +567,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX512BW-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    retq
   %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 08d9183bd30b6..2d0778853fecd 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2461,7 +2461,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0]
 ; CHECK-AVX512VL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221]
 ; CHECK-AVX512VL-NEXT:    vpsllw $8, %ymm3, %ymm3
-; CHECK-AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
+; CHECK-AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst)
 ; CHECK-AVX512VL-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2
 ; CHECK-AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; CHECK-AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,128,1,1,1,128,1,64,128,1,128,1,128,32,1,1]
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 88df3c175ec9c..eb2ad4fdff92f 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -567,7 +567,7 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
 ; AVX512BW-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    retq
   %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
@@ -601,7 +601,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 ;
 ; AVX512BW-LABEL: v16i1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ m32bcst)
 ; AVX512BW-NEXT:    retq
   %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
index 4e17ca6fbae33..a5f768e48bae2 100644
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -543,7 +543,7 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind {
 ;
 ; AVX512BW-LABEL: v16i1:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 & (xmm1 ^ m32bcst)
 ; AVX512BW-NEXT:    retq
   %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y)
   ret <16 x i1> %z
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index fd0525e6d56a2..6b8a03ba5eb76 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -2407,7 +2407,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512F-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2416,14 +2416,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VL-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2432,7 +2432,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
 ; AVX512VBMI2-NEXT:    retq
@@ -2441,14 +2441,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index fdd0d68b89003..c6e1aa9cd90ca 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -2296,7 +2296,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -2304,14 +2304,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -2319,7 +2319,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -2327,14 +2327,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX10-LABEL: splatconstant_funnnel_v32i8:
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX10-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX10-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 1d807fa85ddc5..34ad667f01171 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -1124,7 +1124,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -1137,35 +1137,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
index b763b7bac2432..e60b56551e58d 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll
@@ -1859,7 +1859,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1868,14 +1868,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1884,14 +1884,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
 ; AVX512VBMI2-NEXT:    retq
@@ -1900,7 +1900,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index 9e872cc6d74a9..11a02f8cf754c 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -443,12 +443,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
@@ -463,17 +463,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst)
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1641,7 +1641,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -1649,14 +1649,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1664,14 +1664,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -1679,7 +1679,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
index 436fbe31f7a34..4c6680ac4a19a 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index a56b0a6351a3b..bf525442a419b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -2412,7 +2412,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512F-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -2421,14 +2421,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VL-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -2437,7 +2437,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
 ; AVX512VBMI2-NEXT:    retq
@@ -2446,14 +2446,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm2))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 0fa2c858ff000..9479174d964cd 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -2096,7 +2096,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -2104,14 +2104,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512BW-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -2119,7 +2119,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -2127,14 +2127,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX10-LABEL: splatconstant_funnnel_v32i8:
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX10-NEXT:    vpsrlw $4, %ymm1, %ymm0
-; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX10-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm2))
 ; AVX10-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 665223167fbb4..3a522ccb6214a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -1166,7 +1166,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -1179,35 +1179,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi
 ; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm2
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm1, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm2))
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
index 9ce682306f18b..d9799975cd37a 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll
@@ -1928,7 +1928,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
@@ -1937,14 +1937,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
@@ -1953,14 +1953,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    vzeroupper
 ; AVX512VBMI2-NEXT:    retq
@@ -1969,7 +1969,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_funnnel_v16i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 3d4f283260aa5..15e09c3b6737e 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -469,17 +469,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $2, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsllw $6, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
@@ -488,17 +488,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $2, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsllw $6, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1693,7 +1693,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -1701,14 +1701,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -1716,14 +1716,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -1731,7 +1731,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLVBMI2-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index 11ea650e1f02d..1d089e427bfad 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
@@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLVBMI2-NEXT:    retq
   %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
   ret <64 x i8> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 0ea754873d8b3..6bc4fcb6cc1ec 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -179,7 +179,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
@@ -500,7 +500,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm2 ^ (zmm1 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm2 ^ (zmm1 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpmovb2m %zmm1, %k0
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm2
@@ -606,7 +606,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
 ; AVX512BW-NEXT:    vpsllw $8, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %res = srem <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index fd7a4c9b8d5ad..9c56894f0c59c 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -651,7 +651,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
 ; AVX512BW-NEXT:    vpsllw $8, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %res = urem <64 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 418c987ab9a30..3311a311c8e46 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -1783,7 +1783,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2))
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0]
-; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512-NEXT:    vpternlogq {{.*#+}} ymm1 = m64bcst ^ (ymm1 & (ymm0 ^ m64bcst))
 ; AVX512-NEXT:    vmovdqa %ymm1, 128(%r9)
 ; AVX512-NEXT:    vmovdqa64 %zmm4, 64(%r9)
 ; AVX512-NEXT:    vmovdqa64 %zmm6, (%r9)
@@ -1856,7 +1856,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2))
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = m64bcst ^ (ymm1 & (ymm0 ^ m64bcst))
 ; AVX512-FCP-NEXT:    vmovdqa %ymm1, 128(%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r9)
 ; AVX512-FCP-NEXT:    vmovdqa64 %zmm6, (%r9)
@@ -1932,7 +1932,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2))
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = m64bcst ^ (ymm1 & (ymm0 ^ m64bcst))
 ; AVX512DQ-NEXT:    vmovdqa %ymm1, 128(%r9)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 64(%r9)
 ; AVX512DQ-NEXT:    vmovdqa64 %zmm6, (%r9)
@@ -2005,7 +2005,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm9 & (ymm0 ^ ymm2))
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = mem ^ (ymm1 & (ymm0 ^ mem))
+; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm1 = m64bcst ^ (ymm1 & (ymm0 ^ m64bcst))
 ; AVX512DQ-FCP-NEXT:    vmovdqa %ymm1, 128(%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm4, 64(%r9)
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 %zmm6, (%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 6f50d61f4d1f4..fafb69be0d380 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -1388,7 +1388,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1]
 ; AVX512-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2))
 ; AVX512-NEXT:    vpsrlq $48, %xmm4, %xmm2
 ; AVX512-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
@@ -1448,7 +1448,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1]
 ; AVX512-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
 ; AVX512-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6))
 ; AVX512-FCP-NEXT:    vpsrlq $48, %xmm3, %xmm3
 ; AVX512-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
@@ -1511,7 +1511,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1]
 ; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
 ; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2))
 ; AVX512DQ-NEXT:    vpsrlq $48, %xmm4, %xmm2
 ; AVX512DQ-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
@@ -1571,7 +1571,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT:    vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1]
 ; AVX512DQ-FCP-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21]
 ; AVX512DQ-FCP-NEXT:    vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7
+; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6))
 ; AVX512DQ-FCP-NEXT:    vpsrlq $48, %xmm3, %xmm3
 ; AVX512DQ-FCP-NEXT:    vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
@@ -13076,7 +13076,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 32-byte Folded Reload
 ; AVX512-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm1 ^ (mem & (zmm16 ^ zmm1))
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm18 & (ymm0 ^ mem))
+; AVX512-NEXT:    vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm18 & (ymm0 ^ m32bcst))
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
 ; AVX512-NEXT:    vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm1 & (ymm21 ^ ymm0))
 ; AVX512-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
@@ -13752,7 +13752,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm18 & (zmm5 ^ zmm6))
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm1))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm2 = mem ^ (ymm0 & (ymm2 ^ mem))
+; AVX512-FCP-NEXT:    vpternlogd {{.*#+}} ymm2 = m32bcst ^ (ymm0 & (ymm2 ^ m32bcst))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
 ; AVX512-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm2))
 ; AVX512-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
@@ -14403,7 +14403,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT:    vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 32-byte Folded Reload
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} zmm16 = zmm1 ^ (mem & (zmm16 ^ zmm1))
 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = mem ^ (ymm18 & (ymm0 ^ mem))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm18 & (ymm0 ^ m32bcst))
 ; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
 ; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm1 & (ymm21 ^ ymm0))
 ; AVX512DQ-NEXT:    vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
@@ -15079,7 +15079,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm18 & (zmm5 ^ zmm6))
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm1))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm2 = mem ^ (ymm0 & (ymm2 ^ mem))
+; AVX512DQ-FCP-NEXT:    vpternlogd {{.*#+}} ymm2 = m32bcst ^ (ymm0 & (ymm2 ^ m32bcst))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
 ; AVX512DQ-FCP-NEXT:    vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm2))
 ; AVX512DQ-FCP-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 993e6afc0eaf3..6c79be75550ed 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1581,7 +1581,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX512NOVLX:       # %bb.0:
 ; AVX512NOVLX-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512NOVLX-NEXT:    vzeroupper
 ; AVX512NOVLX-NEXT:    retq
@@ -1590,7 +1590,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VLX:       # %bb.0:
 ; AVX512VLX-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLX-NEXT:    retq
 ;
 ; XOP-LABEL: splatconstant_rotate_v16i8:
@@ -1739,7 +1739,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $5, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsrlw $11, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = m32bcst & (xmm0 | xmm1)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16:
@@ -1754,7 +1754,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $5, %xmm0, %xmm1
 ; AVX512VLBW-NEXT:    vpsrlw $11, %xmm0, %xmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1)
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = m32bcst & (xmm0 | xmm1)
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
@@ -1819,7 +1819,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
 ; AVX512NOVLX:       # %bb.0:
 ; AVX512NOVLX-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512NOVLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512NOVLX-NEXT:    vzeroupper
 ; AVX512NOVLX-NEXT:    retq
@@ -1828,7 +1828,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VLX:       # %bb.0:
 ; AVX512VLX-NEXT:    vpsllw $4, %xmm0, %xmm1
 ; AVX512VLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
+; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (m32bcst & (xmm0 ^ xmm1))
 ; AVX512VLX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; AVX512VLX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index c2c6a5f7eba57..684721f434ebd 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -387,12 +387,12 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (m32bcst & (zmm3 ^ zmm2))
 ; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
@@ -407,17 +407,17 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (m32bcst & (ymm3 ^ ymm2))
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst)
 ; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
@@ -1392,7 +1392,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX512NOVLX:       # %bb.0:
 ; AVX512NOVLX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512NOVLX-NEXT:    retq
 ;
@@ -1400,7 +1400,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VLX:       # %bb.0:
 ; AVX512VLX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLX-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
@@ -1566,7 +1566,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllw $5, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsrlw $11, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = m32bcst & (ymm0 | ymm1)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16:
@@ -1581,7 +1581,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $5, %ymm0, %ymm1
 ; AVX512VLBW-NEXT:    vpsrlw $11, %ymm0, %ymm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1)
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = m32bcst & (ymm0 | ymm1)
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
@@ -1653,7 +1653,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
 ; AVX512NOVLX:       # %bb.0:
 ; AVX512NOVLX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512NOVLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512NOVLX-NEXT:    retq
 ;
@@ -1661,7 +1661,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VLX:       # %bb.0:
 ; AVX512VLX-NEXT:    vpsllw $4, %ymm0, %ymm1
 ; AVX512VLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
+; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (m32bcst & (ymm0 ^ ymm1))
 ; AVX512VLX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512VLX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 8ac0b178a16df..2cde988ed7762 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -754,7 +754,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_rotate_v64i8:
@@ -766,35 +766,35 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_v64i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8:
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    retq
 ;
 ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8:
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLVBMI2-NEXT:    retq
   %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
   %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
@@ -844,7 +844,7 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
 ; AVX512F-NEXT:    vpsrlw $11, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $11, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst & (zmm0 | zmm1)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
@@ -856,21 +856,21 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $11, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $11, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst & (zmm0 | zmm1)
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $5, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $11, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst & (zmm0 | zmm1)
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $5, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $11, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst & (zmm0 | zmm1)
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16:
@@ -902,7 +902,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
@@ -915,7 +915,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VL-NEXT:    retq
 ;
@@ -923,7 +923,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -931,7 +931,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VLBW-NEXT:    retq
 ;
@@ -939,7 +939,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VBMI2:       # %bb.0:
 ; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VBMI2-NEXT:    retq
 ;
@@ -947,7 +947,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VLVBMI2:       # %bb.0:
 ; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
 ; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512VLVBMI2-NEXT:    retq
   %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 60295f1c145a1..02f0f53a0bb30 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -2021,7 +2021,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2029,7 +2029,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index 9f3fff34ea20c..15855e3bce46f 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -2295,7 +2295,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2303,7 +2303,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm1 ^ (ymm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 74dbee5e5d2ca..ea0745b157f58 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -540,7 +540,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm1 ^ (zmm0 & m32bcst)
 ; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
   %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index 4f8cbc07243fd..f7de8d427150f 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -2344,7 +2344,7 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2352,7 +2352,7 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    retq
 ;
@@ -2414,7 +2414,7 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2422,7 +2422,7 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    retq
 ;
@@ -2484,7 +2484,7 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    retq
 ;
@@ -2492,7 +2492,7 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
 ; AVX512BWVL:       # %bb.0:
 ; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
+; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & m32bcst)
 ; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index 4f55f7af20f47..3f238b5739f06 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1369,7 +1369,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512DQVL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
 ; AVX512DQVL-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX512DQVL-NEXT:    vpsllw $8, %ymm0, %ymm0
-; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & mem)
+; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
 ; AVX512DQVL-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: constant_shift_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index a42056be895e7..efd742956ed09 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -324,7 +324,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512DQ-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vpsllw $8, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & m32bcst)
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: constant_shift_v64i8:
@@ -332,7 +332,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
 ; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
 ; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
 ; AVX512BW-NEXT:    retq
   %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
   ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 07498c1233b5d..db3be98efa530 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -355,7 +355,7 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
 ;
 ; AVX512F-LABEL: test_mm512_mask_blend_epi16:
 ; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (m32bcst & (zmm0 ^ zmm1))
 ; AVX512F-NEXT:    ret{{[l|q]}}
 entry:
   %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32>  <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index ab487ed888981..8543e9fd919b2 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -1763,7 +1763,7 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) {
 ; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst ^ (zmm0 & (zmm1 ^ m32bcst))
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: PR110875:
@@ -1780,7 +1780,7 @@ define <64 x i8> @PR110875(<32 x i8> %a0, <32 x i8> %a1, i64 %a2) {
 ; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = mem ^ (zmm0 & (zmm1 ^ mem))
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = m32bcst ^ (zmm0 & (zmm1 ^ m32bcst))
 ; AVX512VL-NEXT:    retq
 ;
 ; XOP-LABEL: PR110875:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 7ad9fb0c27170..45ccc39fb2542 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1235,7 +1235,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1248,7 +1248,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.v
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1359,7 +1359,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1372,7 +1372,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.v
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 3bef834bbd902..31920d8348fbe 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1020,7 +1020,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1030,7 +1030,7 @@ define void @vec256_i8_widen_to_i32_factor4_broadcast_to_v8i32_factor8(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogd {{.*#+}} ymm1 = ymm1 ^ (m32bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -1116,7 +1116,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512F-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512F-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1126,7 +1126,7 @@ define void @vec256_i8_widen_to_i64_factor8_broadcast_to_v4i64_factor4(ptr %in.e
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm0
 ; AVX512DQ-NEXT:    vpbroadcastb (%rdi), %ymm1
-; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT:    vpternlogq {{.*#+}} ymm1 = ymm1 ^ (m64bcst & (ymm1 ^ ymm0))
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rdx)
 ; AVX512DQ-NEXT:    vzeroupper

From cc17f68e566ab7db4ac8e95dc857e49e10d8366c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes@amd.com>
Date: Thu, 12 Jun 2025 07:23:07 -0700
Subject: [PATCH 236/851] [SLP] NFC: Precommit test for pull/137419 (#137730)

Precommit for https://github.com/llvm/llvm-project/pull/137419
---
 .../SLPVectorizer/AMDGPU/external-shuffle.ll  | 261 ++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
new file mode 100644
index 0000000000000..ce9e47a03dee3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
@@ -0,0 +1,261 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN %s
+
+; The insertelements in the exit block use the various parts of the vectorized tree. These external uses are just creating an identity vector using a sequence
+;  of insert elements. Since these insertelements are just recreating the same vectors that were produced during vectorization, they should not increase the cost of vectorization.
+
+define void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, ptr %out2, i32 %flag) {
+; GCN-LABEL: define void @phi_4(
+; GCN-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], ptr [[OUT2:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
+; GCN-NEXT:  [[ENTRY:.*]]:
+; GCN-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8
+; GCN-NEXT:    [[GEP2:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 2
+; GCN-NEXT:    [[GEP3:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 3
+; GCN-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[GEP4:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 4
+; GCN-NEXT:    [[GEP5:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 5
+; GCN-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
+; GCN-NEXT:    [[GEP6:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 6
+; GCN-NEXT:    [[GEP7:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 7
+; GCN-NEXT:    [[TMP3:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
+; GCN-NEXT:    [[GEP8:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 8
+; GCN-NEXT:    [[GEP9:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 9
+; GCN-NEXT:    [[TMP4:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
+; GCN-NEXT:    [[GEP10:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 10
+; GCN-NEXT:    [[GEP11:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 11
+; GCN-NEXT:    [[TMP5:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
+; GCN-NEXT:    [[GEP12:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 12
+; GCN-NEXT:    [[GEP13:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 13
+; GCN-NEXT:    [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
+; GCN-NEXT:    [[GEP14:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 14
+; GCN-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2
+; GCN-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
+; GCN-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
+; GCN-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; GCN-NEXT:    [[TMP11:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; GCN-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
+; GCN-NEXT:    [[TMP13:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
+; GCN-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
+; GCN-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
+; GCN-NEXT:    [[TMP24:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
+; GCN-NEXT:    [[TMP26:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
+; GCN-NEXT:    [[TMP28:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
+; GCN-NEXT:    [[TMP38:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
+; GCN-NEXT:    br label %[[DO_BODY:.*]]
+; GCN:       [[DO_BODY]]:
+; GCN-NEXT:    [[PHI2:%.*]] = phi i16 [ [[TMP8]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI3:%.*]] = phi i16 [ [[TMP9]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI4:%.*]] = phi i16 [ [[TMP10]], %[[ENTRY]] ], [ [[TMP39:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI5:%.*]] = phi i16 [ [[TMP11]], %[[ENTRY]] ], [ [[OTHERELE5:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI6:%.*]] = phi i16 [ [[TMP12]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI7:%.*]] = phi i16 [ [[TMP13]], %[[ENTRY]] ], [ [[OTHERELE7:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI8:%.*]] = phi i16 [ [[TMP14]], %[[ENTRY]] ], [ [[TMP40:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI9:%.*]] = phi i16 [ [[TMP15]], %[[ENTRY]] ], [ [[OTHERELE9:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI10:%.*]] = phi i16 [ [[TMP24]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI11:%.*]] = phi i16 [ [[TMP26]], %[[ENTRY]] ], [ [[OTHERELE11:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI12:%.*]] = phi i16 [ [[TMP28]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[PHI13:%.*]] = phi i16 [ [[TMP38]], %[[ENTRY]] ], [ [[OTHERELE13:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP41:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP42:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP16]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8
+; GCN-NEXT:    [[OTHERELE3]] = load i16, ptr addrspace(3) [[GEP3]], align 1
+; GCN-NEXT:    [[TMP17:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[OTHERELE5]] = load i16, ptr addrspace(3) [[GEP5]], align 1
+; GCN-NEXT:    [[TMP18:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
+; GCN-NEXT:    [[OTHERELE7]] = load i16, ptr addrspace(3) [[GEP7]], align 1
+; GCN-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
+; GCN-NEXT:    [[OTHERELE9]] = load i16, ptr addrspace(3) [[GEP9]], align 1
+; GCN-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
+; GCN-NEXT:    [[OTHERELE11]] = load i16, ptr addrspace(3) [[GEP11]], align 1
+; GCN-NEXT:    [[TMP21:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
+; GCN-NEXT:    [[OTHERELE13]] = load i16, ptr addrspace(3) [[GEP13]], align 1
+; GCN-NEXT:    [[TMP22:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
+; GCN-NEXT:    [[TMP23]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2
+; GCN-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
+; GCN-NEXT:    [[TMP30]] = extractelement <2 x i16> [[TMP17]], i32 0
+; GCN-NEXT:    [[TMP39]] = extractelement <2 x i16> [[TMP18]], i32 0
+; GCN-NEXT:    [[TMP32]] = extractelement <2 x i16> [[TMP19]], i32 0
+; GCN-NEXT:    [[TMP40]] = extractelement <2 x i16> [[TMP20]], i32 0
+; GCN-NEXT:    [[TMP34]] = extractelement <2 x i16> [[TMP21]], i32 0
+; GCN-NEXT:    [[TMP35]] = extractelement <2 x i16> [[TMP22]], i32 0
+; GCN-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
+; GCN:       [[EXIT]]:
+; GCN-NEXT:    [[TMP36:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP17]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC038:%.*]] = shufflevector <16 x i16> [[TMP36]], <16 x i16> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC059:%.*]] = shufflevector <16 x i16> [[VEC038]], <16 x i16> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i16> [[TMP19]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC0710:%.*]] = shufflevector <16 x i16> [[VEC059]], <16 x i16> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP29:%.*]] = shufflevector <2 x i16> [[TMP20]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC0911:%.*]] = shufflevector <16 x i16> [[VEC0710]], <16 x i16> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i16> [[TMP21]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC01112:%.*]] = shufflevector <16 x i16> [[VEC0911]], <16 x i16> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP33:%.*]] = shufflevector <2 x i16> [[TMP22]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i16> [[VEC01112]], <16 x i16> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; GCN-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2157:%.*]] = shufflevector <16 x i16> [[TMP58]], <16 x i16> [[TMP60]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; GCN-NEXT:    [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC231:%.*]] = shufflevector <16 x i16> [[TMP50]], <16 x i16> [[TMP51]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC252:%.*]] = shufflevector <16 x i16> [[VEC231]], <16 x i16> [[TMP52]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC273:%.*]] = shufflevector <16 x i16> [[VEC252]], <16 x i16> [[TMP53]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC294:%.*]] = shufflevector <16 x i16> [[VEC273]], <16 x i16> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2115:%.*]] = shufflevector <16 x i16> [[VEC294]], <16 x i16> [[TMP55]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2136:%.*]] = shufflevector <16 x i16> [[VEC2115]], <16 x i16> [[TMP56]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
+; GCN-NEXT:    [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2151:%.*]] = shufflevector <16 x i16> [[VEC2136]], <16 x i16> [[TMP59]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; GCN-NEXT:    [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP41]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC22:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[PHI2]], i64 2
+; GCN-NEXT:    [[VEC23:%.*]] = insertelement <16 x i16> [[VEC22]], i16 [[PHI3]], i64 3
+; GCN-NEXT:    [[VEC24:%.*]] = insertelement <16 x i16> [[VEC23]], i16 [[PHI4]], i64 4
+; GCN-NEXT:    [[VEC25:%.*]] = insertelement <16 x i16> [[VEC24]], i16 [[PHI5]], i64 5
+; GCN-NEXT:    [[VEC26:%.*]] = insertelement <16 x i16> [[VEC25]], i16 [[PHI6]], i64 6
+; GCN-NEXT:    [[VEC27:%.*]] = insertelement <16 x i16> [[VEC26]], i16 [[PHI7]], i64 7
+; GCN-NEXT:    [[VEC28:%.*]] = insertelement <16 x i16> [[VEC27]], i16 [[PHI8]], i64 8
+; GCN-NEXT:    [[VEC29:%.*]] = insertelement <16 x i16> [[VEC28]], i16 [[PHI9]], i64 9
+; GCN-NEXT:    [[VEC210:%.*]] = insertelement <16 x i16> [[VEC29]], i16 [[PHI10]], i64 10
+; GCN-NEXT:    [[VEC211:%.*]] = insertelement <16 x i16> [[VEC210]], i16 [[PHI11]], i64 11
+; GCN-NEXT:    [[VEC212:%.*]] = insertelement <16 x i16> [[VEC211]], i16 [[PHI12]], i64 12
+; GCN-NEXT:    [[VEC213:%.*]] = insertelement <16 x i16> [[VEC212]], i16 [[PHI13]], i64 13
+; GCN-NEXT:    [[TMP61:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[VEC2152:%.*]] = shufflevector <16 x i16> [[VEC213]], <16 x i16> [[TMP61]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; GCN-NEXT:    store <16 x i16> [[VEC2151]], ptr [[OUT]], align 32
+; GCN-NEXT:    store <16 x i16> [[VEC2157]], ptr [[OUT1]], align 32
+; GCN-NEXT:    store <16 x i16> [[VEC2152]], ptr [[OUT2]], align 32
+; GCN-NEXT:    ret void
+;
+entry:
+  %ele0 = load i16, ptr addrspace(3) %inptr0, align 8
+  %gep1 = getelementptr i16, ptr addrspace(3) %inptr0, i32 1
+  %ele1 = load i16, ptr addrspace(3) %gep1, align 1
+  %gep2 = getelementptr i16, ptr addrspace(3) %inptr0, i32 2
+  %ele2 = load i16, ptr addrspace(3) %gep2, align 2
+  %gep3 = getelementptr i16, ptr addrspace(3) %inptr0, i32 3
+  %ele3 = load i16, ptr addrspace(3) %gep3, align 1
+  %gep4 = getelementptr i16, ptr addrspace(3) %inptr0, i32 4
+  %ele4 = load i16, ptr addrspace(3) %gep4, align 8
+  %gep5 = getelementptr i16, ptr addrspace(3) %inptr0, i32 5
+  %ele5 = load i16, ptr addrspace(3) %gep5, align 1
+  %gep6 = getelementptr i16, ptr addrspace(3) %inptr0, i32 6
+  %ele6 = load i16, ptr addrspace(3) %gep6, align 2
+  %gep7 = getelementptr i16, ptr addrspace(3) %inptr0, i32 7
+  %ele7 = load i16, ptr addrspace(3) %gep7, align 1
+  %gep8 = getelementptr i16, ptr addrspace(3) %inptr0, i32 8
+  %ele8 = load i16, ptr addrspace(3) %gep8, align 8
+  %gep9 = getelementptr i16, ptr addrspace(3) %inptr0, i32 9
+  %ele9 = load i16, ptr addrspace(3) %gep9, align 1
+  %gep10 = getelementptr i16, ptr addrspace(3) %inptr0, i32 10
+  %ele10 = load i16, ptr addrspace(3) %gep10, align 2
+  %gep11 = getelementptr i16, ptr addrspace(3) %inptr0, i32 11
+  %ele11 = load i16, ptr addrspace(3) %gep11, align 1
+  %gep12 = getelementptr i16, ptr addrspace(3) %inptr0, i32 12
+  %ele12 = load i16, ptr addrspace(3) %gep12, align 8
+  %gep13 = getelementptr i16, ptr addrspace(3) %inptr0, i32 13
+  %ele13 = load i16, ptr addrspace(3) %gep13, align 1
+  %gep14 = getelementptr i16, ptr addrspace(3) %inptr0, i32 14
+  %ele14 = load i16, ptr addrspace(3) %gep14, align 2
+  %gep15 = getelementptr i16, ptr addrspace(3) %inptr0, i32 15
+  %ele15 = load i16, ptr addrspace(3) %gep15, align 1
+  br label %do.body
+
+do.body:
+  %phi0 = phi i16 [ %ele0, %entry ], [ %otherele0, %do.body ]
+  %phi1 = phi i16 [ %ele1, %entry ], [ %otherele1, %do.body ]
+  %phi2 = phi i16 [ %ele2, %entry ], [ %otherele2, %do.body ]
+  %phi3 = phi i16 [ %ele3, %entry ], [ %otherele3, %do.body ]
+  %phi4 = phi i16 [ %ele4, %entry ], [ %otherele4, %do.body ]
+  %phi5 = phi i16 [ %ele5, %entry ], [ %otherele5, %do.body ]
+  %phi6 = phi i16 [ %ele6, %entry ], [ %otherele6, %do.body ]
+  %phi7 = phi i16 [ %ele7, %entry ], [ %otherele7, %do.body ]
+  %phi8 = phi i16 [ %ele8, %entry ], [ %otherele8, %do.body ]
+  %phi9 = phi i16 [ %ele9, %entry ], [ %otherele9, %do.body ]
+  %phi10 = phi i16 [ %ele10, %entry ], [ %otherele10, %do.body ]
+  %phi11 = phi i16 [ %ele11, %entry ], [ %otherele11, %do.body ]
+  %phi12 = phi i16 [ %ele12, %entry ], [ %otherele12, %do.body ]
+  %phi13 = phi i16 [ %ele13, %entry ], [ %otherele13, %do.body ]
+  %phi14 = phi i16 [ %ele14, %entry ], [ %otherele14, %do.body ]
+  %phi15 = phi i16 [ %ele15, %entry ], [ %otherele15, %do.body ]
+
+  %otherele0 = load i16, ptr addrspace(3) %inptr0, align 8
+  %otherele1 = load i16, ptr addrspace(3) %gep1, align 1
+  %otherele2 = load i16, ptr addrspace(3) %gep2, align 2
+  %otherele3 = load i16, ptr addrspace(3) %gep3, align 1
+  %otherele4 = load i16, ptr addrspace(3) %gep4, align 8
+  %otherele5 = load i16, ptr addrspace(3) %gep5, align 1
+  %otherele6 = load i16, ptr addrspace(3) %gep6, align 2
+  %otherele7 = load i16, ptr addrspace(3) %gep7, align 1
+  %otherele8 = load i16, ptr addrspace(3) %gep8, align 8
+  %otherele9 = load i16, ptr addrspace(3) %gep9, align 1
+  %otherele10 = load i16, ptr addrspace(3) %gep10, align 2
+  %otherele11 = load i16, ptr addrspace(3) %gep11, align 1
+  %otherele12 = load i16, ptr addrspace(3) %gep12, align 8
+  %otherele13 = load i16, ptr addrspace(3) %gep13, align 1
+  %otherele14 = load i16, ptr addrspace(3) %gep14, align 2
+  %otherele15 = load i16, ptr addrspace(3) %gep15, align 1
+  %cmp = icmp eq i32 %flag, 0
+  br i1 %cmp, label %exit, label %do.body
+
+exit:
+  %vec00 = insertelement <16 x i16> poison, i16 %otherele0, i64 0
+  %vec01 = insertelement <16 x i16> %vec00, i16 %otherele1, i64 1
+  %vec02 = insertelement <16 x i16> %vec01, i16 %otherele2, i64 2
+  %vec03 = insertelement <16 x i16> %vec02, i16 %otherele3, i64 3
+  %vec04 = insertelement <16 x i16> %vec03, i16 %otherele4, i64 4
+  %vec05 = insertelement <16 x i16> %vec04, i16 %otherele5, i64 5
+  %vec06 = insertelement <16 x i16> %vec05, i16 %otherele6, i64 6
+  %vec07 = insertelement <16 x i16> %vec06, i16 %otherele7, i64 7
+  %vec08 = insertelement <16 x i16> %vec07, i16 %otherele8, i64 8
+  %vec09 = insertelement <16 x i16> %vec08, i16 %otherele9, i64 9
+  %vec010 = insertelement <16 x i16> %vec09, i16 %otherele10, i64 10
+  %vec011 = insertelement <16 x i16> %vec010, i16 %otherele11, i64 11
+  %vec012 = insertelement <16 x i16> %vec011, i16 %otherele12, i64 12
+  %vec013 = insertelement <16 x i16> %vec012, i16 %otherele13, i64 13
+  %vec014 = insertelement <16 x i16> %vec013, i16 %otherele14, i64 14
+  %vec015 = insertelement <16 x i16> %vec014, i16 %otherele15, i64 15
+
+  %vec10 = insertelement <16 x i16> poison, i16 %ele0, i64 0
+  %vec11 = insertelement <16 x i16> %vec10, i16 %ele1, i64 1
+  %vec12 = insertelement <16 x i16> %vec11, i16 %ele2, i64 2
+  %vec13 = insertelement <16 x i16> %vec12, i16 %ele3, i64 3
+  %vec14 = insertelement <16 x i16> %vec13, i16 %ele4, i64 4
+  %vec15 = insertelement <16 x i16> %vec14, i16 %ele5, i64 5
+  %vec16 = insertelement <16 x i16> %vec15, i16 %ele6, i64 6
+  %vec17 = insertelement <16 x i16> %vec16, i16 %ele7, i64 7
+  %vec18 = insertelement <16 x i16> %vec17, i16 %ele8, i64 8
+  %vec19 = insertelement <16 x i16> %vec18, i16 %ele9, i64 9
+  %vec110 = insertelement <16 x i16> %vec19, i16 %ele10, i64 10
+  %vec111 = insertelement <16 x i16> %vec110, i16 %ele11, i64 11
+  %vec112 = insertelement <16 x i16> %vec111, i16 %ele12, i64 12
+  %vec113 = insertelement <16 x i16> %vec112, i16 %ele13, i64 13
+  %vec114 = insertelement <16 x i16> %vec113, i16 %ele14, i64 14
+  %vec115 = insertelement <16 x i16> %vec114, i16 %ele15, i64 15
+
+  %vec20 = insertelement <16 x i16> poison, i16 %phi0, i64 0
+  %vec21 = insertelement <16 x i16> %vec20, i16 %phi1, i64 1
+  %vec22 = insertelement <16 x i16> %vec21, i16 %phi2, i64 2
+  %vec23 = insertelement <16 x i16> %vec22, i16 %phi3, i64 3
+  %vec24 = insertelement <16 x i16> %vec23, i16 %phi4, i64 4
+  %vec25 = insertelement <16 x i16> %vec24, i16 %phi5, i64 5
+  %vec26 = insertelement <16 x i16> %vec25, i16 %phi6, i64 6
+  %vec27 = insertelement <16 x i16> %vec26, i16 %phi7, i64 7
+  %vec28 = insertelement <16 x i16> %vec27, i16 %phi8, i64 8
+  %vec29 = insertelement <16 x i16> %vec28, i16 %phi9, i64 9
+  %vec210 = insertelement <16 x i16> %vec29, i16 %phi10, i64 10
+  %vec211 = insertelement <16 x i16> %vec210, i16 %phi11, i64 11
+  %vec212 = insertelement <16 x i16> %vec211, i16 %phi12, i64 12
+  %vec213 = insertelement <16 x i16> %vec212, i16 %phi13, i64 13
+  %vec214 = insertelement <16 x i16> %vec213, i16 %phi14, i64 14
+  %vec215 = insertelement <16 x i16> %vec214, i16 %phi15, i64 15
+
+  store <16 x i16> %vec115, ptr %out
+  store <16 x i16> %vec015, ptr %out1
+  store <16 x i16> %vec215, ptr %out2
+
+  ret void
+}

From e1e1836bbd70e4f30bd0be97b9d81eabfd6b45c8 Mon Sep 17 00:00:00 2001
From: Omair Javaid <omair.javaid@linaro.org>
Date: Thu, 12 Jun 2025 19:38:42 +0500
Subject: [PATCH 237/851] [CodeGen] Inline stack guard check on Windows
 (#136290)

This patch optimizes the Windows security cookie check mechanism by
moving the comparison inline and only calling __security_check_cookie
when the check fails. This reduces the overhead of making a DLL call
for every function return.

Previously, we implemented this optimization through a machine pass
(X86WinFixupBufferSecurityCheckPass) in PR #95904 submitted by
@mahesh-attarde. We have reverted that pass in favor of this new
approach. Also we have abandoned the AArch64 specific implementation
of same pass in PR #121938 in favor of this more general solution.

The old machine instruction pass approach:
- Scanned the generated code to find __security_check_cookie calls
- Modified these calls by splitting basic blocks
- Added comparison logic and conditional branching
- Required complex block management and live register computation

The new approach:
- Implements the same optimization during instruction selection
- Directly emits the comparison and conditional branching
- No need for post-processing or basic block manipulation
- Disables optimization at -Oz.

Thanks @tamaspetz, @efriedma-quic and @arsenm for their help.
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  95 +++++--
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp |   8 +-
 llvm/lib/Target/X86/CMakeLists.txt            |   1 -
 llvm/lib/Target/X86/X86.h                     |   4 -
 llvm/lib/Target/X86/X86TargetMachine.cpp      |   1 -
 .../X86/X86WinFixupBufferSecurityCheck.cpp    | 245 ------------------
 .../irtranslator-stack-protector-windows.ll   |  12 +-
 llvm/test/CodeGen/X86/opt-pipeline.ll         |   1 -
 .../CodeGen/X86/stack-protector-msvc-oz.ll    | 119 +++++++++
 llvm/test/CodeGen/X86/stack-protector-msvc.ll | 125 +++++++--
 llvm/test/CodeGen/X86/tailcc-ssp.ll           |  28 +-
 llvm/test/DebugInfo/COFF/fpo-stack-protect.ll |   7 +-
 12 files changed, 333 insertions(+), 313 deletions(-)
 delete mode 100644 llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp
 create mode 100644 llvm/test/CodeGen/X86/stack-protector-msvc-oz.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e6a1dc930685c..c63eb7fc6b374 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -3037,8 +3037,9 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
 
   // First create the loads to the guard/stack slot for the comparison.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
-  EVT PtrMemTy = TLI.getPointerMemTy(DAG.getDataLayout());
+  auto &DL = DAG.getDataLayout();
+  EVT PtrTy = TLI.getFrameIndexTy(DL);
+  EVT PtrMemTy = TLI.getPointerMemTy(DL, DL.getAllocaAddrSpace());
 
   MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
   int FI = MFI.getStackProtectorIndex();
@@ -3047,8 +3048,8 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
   SDLoc dl = getCurSDLoc();
   SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
   const Module &M = *ParentBB->getParent()->getFunction().getParent();
-  Align Align =
-      DAG.getDataLayout().getPrefTypeAlign(PointerType::get(M.getContext(), 0));
+  Align Align = DL.getPrefTypeAlign(
+      PointerType::get(M.getContext(), DL.getAllocaAddrSpace()));
 
   // Generate code to load the content of the guard slot.
   SDValue GuardVal = DAG.getLoad(
@@ -3059,8 +3060,14 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
   if (TLI.useStackGuardXorFP())
     GuardVal = TLI.emitStackGuardXorFP(DAG, GuardVal, dl);
 
-  // Retrieve guard check function, nullptr if instrumentation is inlined.
-  if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) {
+  // If we're using function-based instrumentation, call the guard check
+  // function
+  if (SPD.shouldEmitFunctionBasedCheckStackProtector()) {
+    // Get the guard check function from the target and verify it exists since
+    // we're using function-based instrumentation
+    const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M);
+    assert(GuardCheckFn && "Guard check function is null");
+
     // The target provides a guard check function to validate the guard value.
     // Generate a call to that function with the content of the guard slot as
     // argument.
@@ -3101,10 +3108,9 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
   }
 
   // Perform the comparison via a getsetcc.
-  SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(),
-                                                        *DAG.getContext(),
-                                                        Guard.getValueType()),
-                             Guard, GuardVal, ISD::SETNE);
+  SDValue Cmp = DAG.getSetCC(
+      dl, TLI.getSetCCResultType(DL, *DAG.getContext(), Guard.getValueType()),
+      Guard, GuardVal, ISD::SETNE);
 
   // If the guard/stackslot do not equal, branch to failure MBB.
   SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
@@ -3126,14 +3132,69 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
 /// For a high level explanation of how this fits into the stack protector
 /// generation see the comment on the declaration of class
 /// StackProtectorDescriptor.
-void
-SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
+void SelectionDAGBuilder::visitSPDescriptorFailure(
+    StackProtectorDescriptor &SPD) {
+
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  TargetLowering::MakeLibCallOptions CallOptions;
-  CallOptions.setDiscardResult(true);
-  SDValue Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL,
-                                  MVT::isVoid, {}, CallOptions, getCurSDLoc())
-                      .second;
+  MachineBasicBlock *ParentBB = SPD.getParentMBB();
+  const Module &M = *ParentBB->getParent()->getFunction().getParent();
+  SDValue Chain;
+
+  // For -Oz builds with a guard check function, we use function-based
+  // instrumentation. Otherwise, if we have a guard check function, we call it
+  // in the failure block.
+  auto *GuardCheckFn = TLI.getSSPStackGuardCheck(M);
+  if (GuardCheckFn && !SPD.shouldEmitFunctionBasedCheckStackProtector()) {
+    // First create the loads to the guard/stack slot for the comparison.
+    auto &DL = DAG.getDataLayout();
+    EVT PtrTy = TLI.getFrameIndexTy(DL);
+    EVT PtrMemTy = TLI.getPointerMemTy(DL, DL.getAllocaAddrSpace());
+
+    MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
+    int FI = MFI.getStackProtectorIndex();
+
+    SDLoc dl = getCurSDLoc();
+    SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
+    Align Align = DL.getPrefTypeAlign(
+        PointerType::get(M.getContext(), DL.getAllocaAddrSpace()));
+
+    // Generate code to load the content of the guard slot.
+    SDValue GuardVal = DAG.getLoad(
+        PtrMemTy, dl, DAG.getEntryNode(), StackSlotPtr,
+        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), Align,
+        MachineMemOperand::MOVolatile);
+
+    if (TLI.useStackGuardXorFP())
+      GuardVal = TLI.emitStackGuardXorFP(DAG, GuardVal, dl);
+
+    // The target provides a guard check function to validate the guard value.
+    // Generate a call to that function with the content of the guard slot as
+    // argument.
+    FunctionType *FnTy = GuardCheckFn->getFunctionType();
+    assert(FnTy->getNumParams() == 1 && "Invalid function signature");
+
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = GuardVal;
+    Entry.Ty = FnTy->getParamType(0);
+    if (GuardCheckFn->hasParamAttribute(0, Attribute::AttrKind::InReg))
+      Entry.IsInReg = true;
+    Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(DAG.getEntryNode())
+        .setCallee(GuardCheckFn->getCallingConv(), FnTy->getReturnType(),
+                   getValue(GuardCheckFn), std::move(Args));
+
+    Chain = TLI.LowerCallTo(CLI).second;
+  } else {
+    TargetLowering::MakeLibCallOptions CallOptions;
+    CallOptions.setDiscardResult(true);
+    Chain = TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid,
+                            {}, CallOptions, getCurSDLoc())
+                .second;
+  }
 
   // Emit a trap instruction if we are required to do so.
   const TargetOptions &TargetOpts = DAG.getTarget().Options;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index f599637564715..b02a03c0b0cb2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1877,7 +1877,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
 
     if (SP->shouldEmitSDCheck(*LLVMBB)) {
       bool FunctionBasedInstrumentation =
-          TLI->getSSPStackGuardCheck(*Fn.getParent());
+          TLI->getSSPStackGuardCheck(*Fn.getParent()) && Fn.hasMinSize();
       SDB->SPDescriptor.initialize(LLVMBB, FuncInfo->getMBB(LLVMBB),
                                    FunctionBasedInstrumentation);
     }
@@ -1950,8 +1950,7 @@ SelectionDAGISel::FinishBasicBlock() {
 
     // Add load and check to the basicblock.
     FuncInfo->MBB = ParentMBB;
-    FuncInfo->InsertPt =
-        findSplitPointForStackProtector(ParentMBB, *TII);
+    FuncInfo->InsertPt = findSplitPointForStackProtector(ParentMBB, *TII);
     SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
     CurDAG->setRoot(SDB->getRoot());
     SDB->clear();
@@ -1973,8 +1972,7 @@ SelectionDAGISel::FinishBasicBlock() {
         findSplitPointForStackProtector(ParentMBB, *TII);
 
     // Splice the terminator of ParentMBB into SuccessMBB.
-    SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
-                       SplitPoint,
+    SuccessMBB->splice(SuccessMBB->end(), ParentMBB, SplitPoint,
                        ParentMBB->end());
 
     // Add compare/jump on neq/jump to the parent BB.
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 6627e97dd0943..1bf9f8b467993 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -85,7 +85,6 @@ set(sources
   X86VZeroUpper.cpp
   X86WinEHState.cpp
   X86WinEHUnwindV2.cpp
-  X86WinFixupBufferSecurityCheck.cpp
   X86InsertWait.cpp
   GISel/X86CallLowering.cpp
   GISel/X86InstructionSelector.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index be2ddac35cab8..6261fadf10a7a 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -73,9 +73,6 @@ FunctionPass *createX86OptimizeLEAs();
 /// Return a pass that transforms setcc + movzx pairs into xor + setcc.
 FunctionPass *createX86FixupSetCC();
 
-/// Return a pass that transform inline buffer security check into seperate bb
-FunctionPass *createX86WinFixupBufferSecurityCheckPass();
-
 /// Return a pass that avoids creating store forward block issues in the hardware.
 FunctionPass *createX86AvoidStoreForwardingBlocks();
 
@@ -195,7 +192,6 @@ void initializeX86ExpandPseudoPass(PassRegistry &);
 void initializeX86FastPreTileConfigPass(PassRegistry &);
 void initializeX86FastTileConfigPass(PassRegistry &);
 void initializeX86FixupSetCCPassPass(PassRegistry &);
-void initializeX86WinFixupBufferSecurityCheckPassPass(PassRegistry &);
 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
 void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &);
 void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 7e75c0e565863..2d4afc23f1a42 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -554,7 +554,6 @@ bool X86PassConfig::addPreISel() {
 void X86PassConfig::addPreRegAlloc() {
   if (getOptLevel() != CodeGenOptLevel::None) {
     addPass(&LiveRangeShrinkID);
-    addPass(createX86WinFixupBufferSecurityCheckPass());
     addPass(createX86FixupSetCC());
     addPass(createX86OptimizeLEAs());
     addPass(createX86CallFrameOptimization());
diff --git a/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp b/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp
deleted file mode 100644
index 5c12af1fee637..0000000000000
--- a/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-//===- X86WinFixupBufferSecurityCheck.cpp Fix Buffer Security Check Call -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Buffer Security Check implementation inserts windows specific callback into
-// code. On windows, __security_check_cookie call gets call everytime function
-// is return without fixup. Since this function is defined in runtime library,
-// it incures cost of call in dll which simply does comparison and returns most
-// time. With Fixup, We selective move to call in DLL only if comparison fails.
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86FrameLowering.h"
-#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Module.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-win-fixup-bscheck"
-
-namespace {
-
-class X86WinFixupBufferSecurityCheckPass : public MachineFunctionPass {
-public:
-  static char ID;
-
-  X86WinFixupBufferSecurityCheckPass() : MachineFunctionPass(ID) {}
-
-  StringRef getPassName() const override {
-    return "X86 Windows Fixup Buffer Security Check";
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  std::pair<MachineBasicBlock *, MachineInstr *>
-  getSecurityCheckerBasicBlock(MachineFunction &MF);
-
-  void getGuardCheckSequence(MachineBasicBlock *CurMBB, MachineInstr *CheckCall,
-                             MachineInstr *SeqMI[5]);
-
-  void SplitBasicBlock(MachineBasicBlock *CurMBB, MachineBasicBlock *NewRetMBB,
-                       MachineBasicBlock::iterator SplitIt);
-
-  void FinishBlock(MachineBasicBlock *MBB);
-
-  void FinishFunction(MachineBasicBlock *FailMBB, MachineBasicBlock *NewRetMBB);
-
-  std::pair<MachineInstr *, MachineInstr *>
-  CreateFailCheckSequence(MachineBasicBlock *CurMBB, MachineBasicBlock *FailMBB,
-                          MachineInstr *SeqMI[5]);
-};
-} // end anonymous namespace
-
-char X86WinFixupBufferSecurityCheckPass::ID = 0;
-
-INITIALIZE_PASS(X86WinFixupBufferSecurityCheckPass, DEBUG_TYPE, DEBUG_TYPE,
-                false, false)
-
-FunctionPass *llvm::createX86WinFixupBufferSecurityCheckPass() {
-  return new X86WinFixupBufferSecurityCheckPass();
-}
-
-void X86WinFixupBufferSecurityCheckPass::SplitBasicBlock(
-    MachineBasicBlock *CurMBB, MachineBasicBlock *NewRetMBB,
-    MachineBasicBlock::iterator SplitIt) {
-  NewRetMBB->splice(NewRetMBB->end(), CurMBB, SplitIt, CurMBB->end());
-}
-
-std::pair<MachineBasicBlock *, MachineInstr *>
-X86WinFixupBufferSecurityCheckPass::getSecurityCheckerBasicBlock(
-    MachineFunction &MF) {
-  MachineBasicBlock::reverse_iterator RBegin, REnd;
-
-  for (auto &MBB : llvm::reverse(MF)) {
-    for (RBegin = MBB.rbegin(), REnd = MBB.rend(); RBegin != REnd; ++RBegin) {
-      auto &MI = *RBegin;
-      if (MI.getOpcode() == X86::CALL64pcrel32 &&
-          MI.getNumExplicitOperands() == 1) {
-        auto MO = MI.getOperand(0);
-        if (MO.isGlobal()) {
-          auto Callee = dyn_cast<Function>(MO.getGlobal());
-          if (Callee && Callee->getName() == "__security_check_cookie") {
-            return std::make_pair(&MBB, &MI);
-            break;
-          }
-        }
-      }
-    }
-  }
-  return std::make_pair(nullptr, nullptr);
-}
-
-void X86WinFixupBufferSecurityCheckPass::getGuardCheckSequence(
-    MachineBasicBlock *CurMBB, MachineInstr *CheckCall,
-    MachineInstr *SeqMI[5]) {
-
-  MachineBasicBlock::iterator UIt(CheckCall);
-  MachineBasicBlock::reverse_iterator DIt(CheckCall);
-  // Seq From StackUp to Stack Down Is fixed.
-  // ADJCALLSTACKUP64
-  ++UIt;
-  SeqMI[4] = &*UIt;
-
-  // CALL __security_check_cookie
-  SeqMI[3] = CheckCall;
-
-  // COPY function slot cookie
-  ++DIt;
-  SeqMI[2] = &*DIt;
-
-  // ADJCALLSTACKDOWN64
-  ++DIt;
-  SeqMI[1] = &*DIt;
-
-  MachineBasicBlock::reverse_iterator XIt(SeqMI[1]);
-  for (; XIt != CurMBB->rbegin(); ++XIt) {
-    auto &CI = *XIt;
-    if ((CI.getOpcode() == X86::XOR64_FP) || (CI.getOpcode() == X86::XOR32_FP))
-      break;
-  }
-  SeqMI[0] = &*XIt;
-}
-
-std::pair<MachineInstr *, MachineInstr *>
-X86WinFixupBufferSecurityCheckPass::CreateFailCheckSequence(
-    MachineBasicBlock *CurMBB, MachineBasicBlock *FailMBB,
-    MachineInstr *SeqMI[5]) {
-
-  auto MF = CurMBB->getParent();
-
-  Module &M = *MF->getFunction().getParent();
-  GlobalVariable *GV = M.getGlobalVariable("__security_cookie");
-  assert(GV && " Security Cookie was not installed!");
-
-  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-
-  MachineInstr *GuardXor = SeqMI[0];
-  MachineBasicBlock::iterator InsertPt(GuardXor);
-  ++InsertPt;
-
-  // Compare security_Cookie with XOR_Val, if not same, we have violation
-  auto CMI = BuildMI(*CurMBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rm))
-                 .addReg(GuardXor->getOperand(0).getReg())
-                 .addReg(X86::RIP)
-                 .addImm(1)
-                 .addReg(X86::NoRegister)
-                 .addGlobalAddress(GV)
-                 .addReg(X86::NoRegister);
-
-  BuildMI(*CurMBB, InsertPt, DebugLoc(), TII->get(X86::JCC_1))
-      .addMBB(FailMBB)
-      .addImm(X86::COND_NE);
-
-  auto JMI = BuildMI(*CurMBB, InsertPt, DebugLoc(), TII->get(X86::JMP_1));
-
-  return std::make_pair(CMI.getInstr(), JMI.getInstr());
-}
-
-void X86WinFixupBufferSecurityCheckPass::FinishBlock(MachineBasicBlock *MBB) {
-  LivePhysRegs LiveRegs;
-  computeAndAddLiveIns(LiveRegs, *MBB);
-}
-
-void X86WinFixupBufferSecurityCheckPass::FinishFunction(
-    MachineBasicBlock *FailMBB, MachineBasicBlock *NewRetMBB) {
-  FailMBB->getParent()->RenumberBlocks();
-  // FailMBB includes call to MSCV RT  where is __security_check_cookie
-  // function is called. This function uses regcall and it expects cookie
-  // value from stack slot.( even if this is modified)
-  // Before going further we compute back livein for this block to make sure
-  // it is live and provided.
-  FinishBlock(FailMBB);
-  FinishBlock(NewRetMBB);
-}
-
-bool X86WinFixupBufferSecurityCheckPass::runOnMachineFunction(
-    MachineFunction &MF) {
-  bool Changed = false;
-  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
-
-  if (!(STI.isTargetWindowsItanium() || STI.isTargetWindowsMSVC()))
-    return Changed;
-
-  // Check if security cookie was installed or not
-  Module &M = *MF.getFunction().getParent();
-  GlobalVariable *GV = M.getGlobalVariable("__security_cookie");
-  if (!GV)
-    return Changed;
-
-  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-
-  // Check if security check cookie was installed or not
-  auto [CurMBB, CheckCall] = getSecurityCheckerBasicBlock(MF);
-
-  if (!CheckCall)
-    return Changed;
-
-  MachineBasicBlock *FailMBB = MF.CreateMachineBasicBlock();
-  MachineBasicBlock *NewRetMBB = MF.CreateMachineBasicBlock();
-
-  MF.insert(MF.end(), NewRetMBB);
-  MF.insert(MF.end(), FailMBB);
-
-  MachineInstr *SeqMI[5];
-  getGuardCheckSequence(CurMBB, CheckCall, SeqMI);
-  // MachineInstr * GuardXor  = SeqMI[0];
-
-  auto FailSeqRange = CreateFailCheckSequence(CurMBB, FailMBB, SeqMI);
-  MachineInstrBuilder JMI(MF, FailSeqRange.second);
-
-  // After Inserting JMP_1, we can not have two terminators
-  // in same block, split CurrentMBB after JMP_1
-  MachineBasicBlock::iterator SplitIt(SeqMI[4]);
-  ++SplitIt;
-  SplitBasicBlock(CurMBB, NewRetMBB, SplitIt);
-
-  // Fill up Failure Routine, move Fail Check Squence from CurMBB to FailMBB
-  MachineBasicBlock::iterator U1It(SeqMI[1]);
-  MachineBasicBlock::iterator U2It(SeqMI[4]);
-  ++U2It;
-  FailMBB->splice(FailMBB->end(), CurMBB, U1It, U2It);
-  BuildMI(*FailMBB, FailMBB->end(), DebugLoc(), TII->get(X86::INT3));
-
-  // Move left over instruction after StackUp
-  // from Current Basic BLocks into New Return Block
-  JMI.addMBB(NewRetMBB);
-  MachineBasicBlock::iterator SplicePt(JMI.getInstr());
-  ++SplicePt;
-  if (SplicePt != CurMBB->end())
-    NewRetMBB->splice(NewRetMBB->end(), CurMBB, SplicePt);
-
-  // Restructure Basic Blocks
-  CurMBB->addSuccessor(NewRetMBB);
-  CurMBB->addSuccessor(FailMBB);
-
-  FinishFunction(FailMBB, NewRetMBB);
-  return !Changed;
-}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-protector-windows.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-protector-windows.ll
index 6aefc5341da07..e7f4785d01df6 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-protector-windows.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-stack-protector-windows.ll
@@ -17,8 +17,12 @@ define void @caller() sspreq {
 ; CHECK-NEXT:    ldr x8, [x8, :lo12:__security_cookie]
 ; CHECK-NEXT:    str x8, [sp, #8]
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    ldr x0, [sp, #8]
-; CHECK-NEXT:    bl __security_check_cookie
+; CHECK-NEXT:    adrp x8, __security_cookie
+; CHECK-NEXT:    ldr x9, [sp, #8]
+; CHECK-NEXT:    ldr x8, [x8, :lo12:__security_cookie]
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    b.ne .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    .seh_startepilogue
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x30, 16
@@ -26,6 +30,10 @@ define void @caller() sspreq {
 ; CHECK-NEXT:    .seh_stackalloc 32
 ; CHECK-NEXT:    .seh_endepilogue
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %entry
+; CHECK-NEXT:    ldr x0, [sp, #8]
+; CHECK-NEXT:    bl __security_check_cookie
+; CHECK-NEXT:    brk #0x1
 ; CHECK-NEXT:    .seh_endfunclet
 ; CHECK-NEXT:    .seh_endproc
 entry:
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 540046e6a8638..8d155bd57df13 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -121,7 +121,6 @@
 ; CHECK-NEXT:       Peephole Optimizations
 ; CHECK-NEXT:       Remove dead machine instructions
 ; CHECK-NEXT:       Live Range Shrink
-; CHECK-NEXT:       X86 Windows Fixup Buffer Security Check
 ; CHECK-NEXT:       X86 Fixup SetCC
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 LEA Optimize
diff --git a/llvm/test/CodeGen/X86/stack-protector-msvc-oz.ll b/llvm/test/CodeGen/X86/stack-protector-msvc-oz.ll
new file mode 100644
index 0000000000000..d8a772efbd7ed
--- /dev/null
+++ b/llvm/test/CodeGen/X86/stack-protector-msvc-oz.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-X86 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-X64 %s
+
+; Make sure fastisel falls back and does something secure.
+; RUN: llc -mtriple=i686-pc-windows-msvc -O0 < %s -o - | FileCheck -check-prefix=MSVC-X86-O0 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -O0 < %s -o - | FileCheck -check-prefix=MSVC-X64-O0 %s
+
+@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00"    ; <ptr> [#uses=1]
+
+define void @test(ptr %a) nounwind ssp minsize {
+; MSVC-X86-LABEL: test:
+; MSVC-X86:       # %bb.0: # %entry
+; MSVC-X86-NEXT:    pushl %esi
+; MSVC-X86-NEXT:    subl $12, %esp
+; MSVC-X86-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-NEXT:    xorl %esp, %eax
+; MSVC-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; MSVC-X86-NEXT:    movl %esp, %esi
+; MSVC-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; MSVC-X86-NEXT:    pushl %esi
+; MSVC-X86-NEXT:    calll _strcpy
+; MSVC-X86-NEXT:    popl %ecx
+; MSVC-X86-NEXT:    popl %edx
+; MSVC-X86-NEXT:    pushl %esi
+; MSVC-X86-NEXT:    pushl $LC
+; MSVC-X86-NEXT:    calll _printf
+; MSVC-X86-NEXT:    popl %ecx
+; MSVC-X86-NEXT:    popl %edx
+; MSVC-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-NEXT:    xorl %esp, %ecx
+; MSVC-X86-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-NEXT:    addl $12, %esp
+; MSVC-X86-NEXT:    popl %esi
+; MSVC-X86-NEXT:    retl
+;
+; MSVC-X64-LABEL: test:
+; MSVC-X64:       # %bb.0: # %entry
+; MSVC-X64-NEXT:    pushq %rsi
+; MSVC-X64-NEXT:    subq $64, %rsp
+; MSVC-X64-NEXT:    movq %rcx, %rdx
+; MSVC-X64-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-NEXT:    xorq %rsp, %rax
+; MSVC-X64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; MSVC-X64-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; MSVC-X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
+; MSVC-X64-NEXT:    movq %rsi, %rcx
+; MSVC-X64-NEXT:    callq strcpy
+; MSVC-X64-NEXT:    leaq LC(%rip), %rcx
+; MSVC-X64-NEXT:    movq %rsi, %rdx
+; MSVC-X64-NEXT:    callq printf
+; MSVC-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-NEXT:    xorq %rsp, %rcx
+; MSVC-X64-NEXT:    callq __security_check_cookie
+; MSVC-X64-NEXT:    addq $64, %rsp
+; MSVC-X64-NEXT:    popq %rsi
+; MSVC-X64-NEXT:    retq
+;
+; MSVC-X86-O0-LABEL: test:
+; MSVC-X86-O0:       # %bb.0: # %entry
+; MSVC-X86-O0-NEXT:    subl $20, %esp
+; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MSVC-X86-O0-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-O0-NEXT:    xorl %esp, %eax
+; MSVC-X86-O0-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-O0-NEXT:    movl %esp, %eax
+; MSVC-X86-O0-NEXT:    movl %ecx, 4(%eax)
+; MSVC-X86-O0-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-O0-NEXT:    movl %ecx, (%eax)
+; MSVC-X86-O0-NEXT:    calll _strcpy
+; MSVC-X86-O0-NEXT:    leal LC, %ecx
+; MSVC-X86-O0-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; MSVC-X86-O0-NEXT:    movl %ecx, (%esp)
+; MSVC-X86-O0-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; MSVC-X86-O0-NEXT:    calll _printf
+; MSVC-X86-O0-NEXT:  # %bb.1: # %return
+; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-O0-NEXT:    xorl %esp, %ecx
+; MSVC-X86-O0-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-O0-NEXT:    addl $20, %esp
+; MSVC-X86-O0-NEXT:    retl
+;
+; MSVC-X64-O0-LABEL: test:
+; MSVC-X64-O0:       # %bb.0: # %entry
+; MSVC-X64-O0-NEXT:    subq $56, %rsp
+; MSVC-X64-O0-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-O0-NEXT:    xorq %rsp, %rax
+; MSVC-X64-O0-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; MSVC-X64-O0-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; MSVC-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; MSVC-X64-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-O0-NEXT:    callq strcpy
+; MSVC-X64-O0-NEXT:    leaq LC(%rip), %rcx
+; MSVC-X64-O0-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; MSVC-X64-O0-NEXT:    callq printf
+; MSVC-X64-O0-NEXT:  # %bb.1: # %return
+; MSVC-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-O0-NEXT:    xorq %rsp, %rcx
+; MSVC-X64-O0-NEXT:    callq __security_check_cookie
+; MSVC-X64-O0-NEXT:    addq $56, %rsp
+; MSVC-X64-O0-NEXT:    retq
+entry:
+ %a_addr = alloca ptr    ; <ptr> [#uses=2]
+ %buf = alloca [8 x i8]    ; <ptr> [#uses=2]
+ store ptr %a, ptr %a_addr
+ %0 = load ptr, ptr %a_addr, align 4    ; <ptr> [#uses=1]
+ %1 = call ptr @strcpy(ptr %buf, ptr %0) nounwind   ; <ptr> [#uses=0]
+ %2 = call i32 (ptr, ...) @printf(ptr @"\01LC", ptr %buf) nounwind    ; <i32> [#uses=0]
+ br label %return
+
+return:    ; preds = %entry
+ ret void
+}
+
+declare ptr @strcpy(ptr, ptr) nounwind
+
+declare i32 @printf(ptr, ...) nounwind
+
diff --git a/llvm/test/CodeGen/X86/stack-protector-msvc.ll b/llvm/test/CodeGen/X86/stack-protector-msvc.ll
index d718062d2c485..a868fa549296d 100644
--- a/llvm/test/CodeGen/X86/stack-protector-msvc.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-msvc.ll
@@ -25,12 +25,19 @@ define void @test(ptr %a) nounwind ssp {
 ; MSVC-X86-NEXT:    pushl $LC
 ; MSVC-X86-NEXT:    calll _printf
 ; MSVC-X86-NEXT:    addl $8, %esp
-; MSVC-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; MSVC-X86-NEXT:    xorl %esp, %ecx
-; MSVC-X86-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MSVC-X86-NEXT:    xorl %esp, %eax
+; MSVC-X86-NEXT:    movl ___security_cookie, %ecx
+; MSVC-X86-NEXT:    cmpl %eax, %ecx
+; MSVC-X86-NEXT:    jne LBB0_2
+; MSVC-X86-NEXT:  # %bb.1: # %return
 ; MSVC-X86-NEXT:    addl $12, %esp
 ; MSVC-X86-NEXT:    popl %esi
 ; MSVC-X86-NEXT:    retl
+; MSVC-X86-NEXT:  LBB0_2: # %return
+; MSVC-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-NEXT:    xorl %esp, %ecx
+; MSVC-X86-NEXT:    calll @__security_check_cookie@4
 ;
 ; MSVC-X64-LABEL: test:
 ; MSVC-X64:       # %bb.0: # %entry
@@ -47,17 +54,19 @@ define void @test(ptr %a) nounwind ssp {
 ; MSVC-X64-NEXT:    leaq LC(%rip), %rcx
 ; MSVC-X64-NEXT:    movq %rsi, %rdx
 ; MSVC-X64-NEXT:    callq printf
-; MSVC-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; MSVC-X64-NEXT:    xorq %rsp, %rcx
-; MSVC-X64-NEXT:    cmpq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; MSVC-X64-NEXT:    xorq %rsp, %rax
+; MSVC-X64-NEXT:    movq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    cmpq %rax, %rcx
 ; MSVC-X64-NEXT:    jne .LBB0_2
-; MSVC-X64-NEXT:  # %bb.1:
+; MSVC-X64-NEXT:  # %bb.1: # %return
 ; MSVC-X64-NEXT:    addq $64, %rsp
 ; MSVC-X64-NEXT:    popq %rsi
 ; MSVC-X64-NEXT:    retq
-; MSVC-X64-NEXT:  .LBB0_2:
+; MSVC-X64-NEXT:  .LBB0_2: # %return
+; MSVC-X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-NEXT:    xorq %rsp, %rcx
 ; MSVC-X64-NEXT:    callq __security_check_cookie
-; MSVC-X64-NEXT:    int3
 ;
 ; MSVC-X86-O0-LABEL: test:
 ; MSVC-X86-O0:       # %bb.0: # %entry
@@ -80,7 +89,15 @@ define void @test(ptr %a) nounwind ssp {
 ; MSVC-X86-O0-NEXT:  # %bb.1: # %return
 ; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; MSVC-X86-O0-NEXT:    xorl %esp, %ecx
+; MSVC-X86-O0-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-O0-NEXT:    subl %ecx, %eax
+; MSVC-X86-O0-NEXT:    jne LBB0_3
+; MSVC-X86-O0-NEXT:    jmp LBB0_2
+; MSVC-X86-O0-NEXT:  LBB0_3: # %return
+; MSVC-X86-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MSVC-X86-O0-NEXT:    xorl %esp, %ecx
 ; MSVC-X86-O0-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-O0-NEXT:  LBB0_2: # %return
 ; MSVC-X86-O0-NEXT:    addl $20, %esp
 ; MSVC-X86-O0-NEXT:    retl
 ;
@@ -100,9 +117,18 @@ define void @test(ptr %a) nounwind ssp {
 ; MSVC-X64-O0-NEXT:  # %bb.1: # %return
 ; MSVC-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; MSVC-X64-O0-NEXT:    xorq %rsp, %rcx
+; MSVC-X64-O0-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-O0-NEXT:    subq %rcx, %rax
+; MSVC-X64-O0-NEXT:    jne .LBB0_3
+; MSVC-X64-O0-NEXT:    jmp .LBB0_2
+; MSVC-X64-O0-NEXT:  .LBB0_3: # %return
+; MSVC-X64-O0-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; MSVC-X64-O0-NEXT:    xorq %rsp, %rcx
 ; MSVC-X64-O0-NEXT:    callq __security_check_cookie
+; MSVC-X64-O0-NEXT:  .LBB0_2: # %return
 ; MSVC-X64-O0-NEXT:    addq $56, %rsp
 ; MSVC-X64-O0-NEXT:    retq
+
 entry:
  %a_addr = alloca ptr    ; <ptr> [#uses=2]
  %buf = alloca [8 x i8]    ; <ptr> [#uses=2]
@@ -134,12 +160,19 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X86-NEXT:    pushl %eax
 ; MSVC-X86-NEXT:    calll _escape
 ; MSVC-X86-NEXT:    addl $4, %esp
-; MSVC-X86-NEXT:    movl -4(%ebp), %ecx
-; MSVC-X86-NEXT:    xorl %ebp, %ecx
-; MSVC-X86-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-NEXT:    movl -4(%ebp), %eax
+; MSVC-X86-NEXT:    xorl %ebp, %eax
+; MSVC-X86-NEXT:    movl ___security_cookie, %ecx
+; MSVC-X86-NEXT:    cmpl %eax, %ecx
+; MSVC-X86-NEXT:    jne LBB1_2
+; MSVC-X86-NEXT:  # %bb.1:
 ; MSVC-X86-NEXT:    movl %ebp, %esp
 ; MSVC-X86-NEXT:    popl %ebp
 ; MSVC-X86-NEXT:    retl
+; MSVC-X86-NEXT:  LBB1_2:
+; MSVC-X86-NEXT:    movl -4(%ebp), %ecx
+; MSVC-X86-NEXT:    xorl %ebp, %ecx
+; MSVC-X86-NEXT:    calll @__security_check_cookie@4
 ;
 ; MSVC-X64-LABEL: test_vla:
 ; MSVC-X64:       # %bb.0:
@@ -158,19 +191,20 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X64-NEXT:    subq $32, %rsp
 ; MSVC-X64-NEXT:    callq escape
 ; MSVC-X64-NEXT:    addq $32, %rsp
-; MSVC-X64-NEXT:    movq -8(%rbp), %rcx
-; MSVC-X64-NEXT:    xorq %rbp, %rcx
-; MSVC-X64-NEXT:    cmpq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    movq -8(%rbp), %rax
+; MSVC-X64-NEXT:    xorq %rbp, %rax
+; MSVC-X64-NEXT:    movq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    cmpq %rax, %rcx
 ; MSVC-X64-NEXT:    jne .LBB1_2
 ; MSVC-X64-NEXT:  # %bb.1:
 ; MSVC-X64-NEXT:    movq %rbp, %rsp
 ; MSVC-X64-NEXT:    popq %rbp
 ; MSVC-X64-NEXT:    retq
 ; MSVC-X64-NEXT:  .LBB1_2:
+; MSVC-X64-NEXT:    movq -8(%rbp), %rcx
+; MSVC-X64-NEXT:    xorq %rbp, %rcx
 ; MSVC-X64-NEXT:    subq $32, %rsp
 ; MSVC-X64-NEXT:    callq __security_check_cookie
-; MSVC-X64-NEXT:    addq $32, %rsp
-; MSVC-X64-NEXT:    int3
 ;
 ; MSVC-X86-O0-LABEL: test_vla:
 ; MSVC-X86-O0:       # %bb.0:
@@ -190,7 +224,15 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X86-O0-NEXT:    addl $4, %esp
 ; MSVC-X86-O0-NEXT:    movl -4(%ebp), %ecx
 ; MSVC-X86-O0-NEXT:    xorl %ebp, %ecx
+; MSVC-X86-O0-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-O0-NEXT:    subl %ecx, %eax
+; MSVC-X86-O0-NEXT:    jne LBB1_2
+; MSVC-X86-O0-NEXT:    jmp LBB1_1
+; MSVC-X86-O0-NEXT:  LBB1_2:
+; MSVC-X86-O0-NEXT:    movl -4(%ebp), %ecx
+; MSVC-X86-O0-NEXT:    xorl %ebp, %ecx
 ; MSVC-X86-O0-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-O0-NEXT:  LBB1_1:
 ; MSVC-X86-O0-NEXT:    movl %ebp, %esp
 ; MSVC-X86-O0-NEXT:    popl %ebp
 ; MSVC-X86-O0-NEXT:    retl
@@ -215,8 +257,16 @@ define void @test_vla(i32 %n) nounwind ssp {
 ; MSVC-X64-O0-NEXT:    addq $32, %rsp
 ; MSVC-X64-O0-NEXT:    movq -8(%rbp), %rcx
 ; MSVC-X64-O0-NEXT:    xorq %rbp, %rcx
+; MSVC-X64-O0-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-O0-NEXT:    subq %rcx, %rax
+; MSVC-X64-O0-NEXT:    jne .LBB1_2
+; MSVC-X64-O0-NEXT:    jmp .LBB1_1
+; MSVC-X64-O0-NEXT:  .LBB1_2:
+; MSVC-X64-O0-NEXT:    movq -8(%rbp), %rcx
+; MSVC-X64-O0-NEXT:    xorq %rbp, %rcx
 ; MSVC-X64-O0-NEXT:    subq $32, %rsp
 ; MSVC-X64-O0-NEXT:    callq __security_check_cookie
+; MSVC-X64-O0-NEXT:  .LBB1_1:
 ; MSVC-X64-O0-NEXT:    movq %rbp, %rsp
 ; MSVC-X64-O0-NEXT:    popq %rbp
 ; MSVC-X64-O0-NEXT:    retq
@@ -253,14 +303,21 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X86-NEXT:    pushl %edi
 ; MSVC-X86-NEXT:    calll _escape
 ; MSVC-X86-NEXT:    addl $4, %esp
-; MSVC-X86-NEXT:    movl 12(%esi), %ecx
-; MSVC-X86-NEXT:    xorl %ebp, %ecx
-; MSVC-X86-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-NEXT:    movl 12(%esi), %eax
+; MSVC-X86-NEXT:    xorl %ebp, %eax
+; MSVC-X86-NEXT:    movl ___security_cookie, %ecx
+; MSVC-X86-NEXT:    cmpl %eax, %ecx
+; MSVC-X86-NEXT:    jne LBB2_2
+; MSVC-X86-NEXT:  # %bb.1:
 ; MSVC-X86-NEXT:    leal -8(%ebp), %esp
 ; MSVC-X86-NEXT:    popl %esi
 ; MSVC-X86-NEXT:    popl %edi
 ; MSVC-X86-NEXT:    popl %ebp
 ; MSVC-X86-NEXT:    retl
+; MSVC-X86-NEXT:  LBB2_2:
+; MSVC-X86-NEXT:    movl 12(%esi), %ecx
+; MSVC-X86-NEXT:    xorl %ebp, %ecx
+; MSVC-X86-NEXT:    calll @__security_check_cookie@4
 ;
 ; MSVC-X64-LABEL: test_vla_realign:
 ; MSVC-X64:       # %bb.0:
@@ -286,9 +343,10 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X64-NEXT:    movq %rsi, %rcx
 ; MSVC-X64-NEXT:    callq escape
 ; MSVC-X64-NEXT:    addq $32, %rsp
-; MSVC-X64-NEXT:    movq 24(%rbx), %rcx
-; MSVC-X64-NEXT:    xorq %rbp, %rcx
-; MSVC-X64-NEXT:    cmpq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    movq 24(%rbx), %rax
+; MSVC-X64-NEXT:    xorq %rbp, %rax
+; MSVC-X64-NEXT:    movq __security_cookie(%rip), %rcx
+; MSVC-X64-NEXT:    cmpq %rax, %rcx
 ; MSVC-X64-NEXT:    jne .LBB2_2
 ; MSVC-X64-NEXT:  # %bb.1:
 ; MSVC-X64-NEXT:    movq %rbp, %rsp
@@ -297,10 +355,10 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X64-NEXT:    popq %rbp
 ; MSVC-X64-NEXT:    retq
 ; MSVC-X64-NEXT:  .LBB2_2:
+; MSVC-X64-NEXT:    movq 24(%rbx), %rcx
+; MSVC-X64-NEXT:    xorq %rbp, %rcx
 ; MSVC-X64-NEXT:    subq $32, %rsp
 ; MSVC-X64-NEXT:    callq __security_check_cookie
-; MSVC-X64-NEXT:    addq $32, %rsp
-; MSVC-X64-NEXT:    int3
 ;
 ; MSVC-X86-O0-LABEL: test_vla_realign:
 ; MSVC-X86-O0:       # %bb.0:
@@ -328,7 +386,15 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X86-O0-NEXT:    addl $4, %esp
 ; MSVC-X86-O0-NEXT:    movl 48(%esi), %ecx
 ; MSVC-X86-O0-NEXT:    xorl %ebp, %ecx
+; MSVC-X86-O0-NEXT:    movl ___security_cookie, %eax
+; MSVC-X86-O0-NEXT:    subl %ecx, %eax
+; MSVC-X86-O0-NEXT:    jne LBB2_2
+; MSVC-X86-O0-NEXT:    jmp LBB2_1
+; MSVC-X86-O0-NEXT:  LBB2_2:
+; MSVC-X86-O0-NEXT:    movl 48(%esi), %ecx
+; MSVC-X86-O0-NEXT:    xorl %ebp, %ecx
 ; MSVC-X86-O0-NEXT:    calll @__security_check_cookie@4
+; MSVC-X86-O0-NEXT:  LBB2_1:
 ; MSVC-X86-O0-NEXT:    leal -4(%ebp), %esp
 ; MSVC-X86-O0-NEXT:    popl %esi
 ; MSVC-X86-O0-NEXT:    popl %ebp
@@ -361,8 +427,16 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 ; MSVC-X64-O0-NEXT:    addq $32, %rsp
 ; MSVC-X64-O0-NEXT:    movq 64(%rbx), %rcx
 ; MSVC-X64-O0-NEXT:    xorq %rbp, %rcx
+; MSVC-X64-O0-NEXT:    movq __security_cookie(%rip), %rax
+; MSVC-X64-O0-NEXT:    subq %rcx, %rax
+; MSVC-X64-O0-NEXT:    jne .LBB2_2
+; MSVC-X64-O0-NEXT:    jmp .LBB2_1
+; MSVC-X64-O0-NEXT:  .LBB2_2:
+; MSVC-X64-O0-NEXT:    movq 64(%rbx), %rcx
+; MSVC-X64-O0-NEXT:    xorq %rbp, %rcx
 ; MSVC-X64-O0-NEXT:    subq $32, %rsp
 ; MSVC-X64-O0-NEXT:    callq __security_check_cookie
+; MSVC-X64-O0-NEXT:  .LBB2_1:
 ; MSVC-X64-O0-NEXT:    leaq 8(%rbp), %rsp
 ; MSVC-X64-O0-NEXT:    popq %rbx
 ; MSVC-X64-O0-NEXT:    popq %rbp
@@ -377,3 +451,4 @@ define void @test_vla_realign(i32 %n) nounwind ssp {
 declare ptr @strcpy(ptr, ptr) nounwind
 
 declare i32 @printf(ptr, ...) nounwind
+
diff --git a/llvm/test/CodeGen/X86/tailcc-ssp.ll b/llvm/test/CodeGen/X86/tailcc-ssp.ll
index 7ea5dd49f0242..ac5dda7d69bde 100644
--- a/llvm/test/CodeGen/X86/tailcc-ssp.ll
+++ b/llvm/test/CodeGen/X86/tailcc-ssp.ll
@@ -13,9 +13,10 @@ define tailcc void @tailcall_frame(ptr %0, i64 %1) sspreq {
 ; WINDOWS-NEXT:    movq __security_cookie(%rip), %rax
 ; WINDOWS-NEXT:    xorq %rsp, %rax
 ; WINDOWS-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
-; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; WINDOWS-NEXT:    xorq %rsp, %rcx
-; WINDOWS-NEXT:    cmpq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WINDOWS-NEXT:    xorq %rsp, %rax
+; WINDOWS-NEXT:    movq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    cmpq %rax, %rcx
 ; WINDOWS-NEXT:    jne .LBB0_1
 ; WINDOWS-NEXT:  # %bb.2:
 ; WINDOWS-NEXT:    xorl %ecx, %ecx
@@ -26,6 +27,8 @@ define tailcc void @tailcall_frame(ptr %0, i64 %1) sspreq {
 ; WINDOWS-NEXT:    .seh_endepilogue
 ; WINDOWS-NEXT:    jmp h # TAILCALL
 ; WINDOWS-NEXT:  .LBB0_1:
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WINDOWS-NEXT:    xorq %rsp, %rcx
 ; WINDOWS-NEXT:    callq __security_check_cookie
 ; WINDOWS-NEXT:    int3
 ; WINDOWS-NEXT:    .seh_endproc
@@ -49,7 +52,6 @@ define tailcc void @tailcall_frame(ptr %0, i64 %1) sspreq {
 ; LINUX-NEXT:  .LBB0_2: # %CallStackCheckFailBlk
 ; LINUX-NEXT:    .cfi_def_cfa_offset 32
 ; LINUX-NEXT:    callq __stack_chk_fail@PLT
-
    tail call tailcc void @h(ptr null, i64 0, ptr null)
    ret void
 }
@@ -65,9 +67,10 @@ define void @tailcall_unrelated_frame() sspreq {
 ; WINDOWS-NEXT:    xorq %rsp, %rax
 ; WINDOWS-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; WINDOWS-NEXT:    callq bar
-; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; WINDOWS-NEXT:    xorq %rsp, %rcx
-; WINDOWS-NEXT:    cmpq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WINDOWS-NEXT:    xorq %rsp, %rax
+; WINDOWS-NEXT:    movq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    cmpq %rax, %rcx
 ; WINDOWS-NEXT:    jne .LBB1_1
 ; WINDOWS-NEXT:  # %bb.2:
 ; WINDOWS-NEXT:    .seh_startepilogue
@@ -75,6 +78,8 @@ define void @tailcall_unrelated_frame() sspreq {
 ; WINDOWS-NEXT:    .seh_endepilogue
 ; WINDOWS-NEXT:    jmp bar # TAILCALL
 ; WINDOWS-NEXT:  .LBB1_1:
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WINDOWS-NEXT:    xorq %rsp, %rcx
 ; WINDOWS-NEXT:    callq __security_check_cookie
 ; WINDOWS-NEXT:    int3
 ; WINDOWS-NEXT:    .seh_endproc
@@ -115,9 +120,10 @@ define void @caller() sspreq {
 ; WINDOWS-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 ; WINDOWS-NEXT:    callq callee
 ; WINDOWS-NEXT:    callq callee
-; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; WINDOWS-NEXT:    xorq %rsp, %rcx
-; WINDOWS-NEXT:    cmpq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WINDOWS-NEXT:    xorq %rsp, %rax
+; WINDOWS-NEXT:    movq __security_cookie(%rip), %rcx
+; WINDOWS-NEXT:    cmpq %rax, %rcx
 ; WINDOWS-NEXT:    jne .LBB2_2
 ; WINDOWS-NEXT:  # %bb.1:
 ; WINDOWS-NEXT:    .seh_startepilogue
@@ -125,6 +131,8 @@ define void @caller() sspreq {
 ; WINDOWS-NEXT:    .seh_endepilogue
 ; WINDOWS-NEXT:    retq
 ; WINDOWS-NEXT:  .LBB2_2:
+; WINDOWS-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WINDOWS-NEXT:    xorq %rsp, %rcx
 ; WINDOWS-NEXT:    callq __security_check_cookie
 ; WINDOWS-NEXT:    int3
 ; WINDOWS-NEXT:    .seh_endproc
diff --git a/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll b/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll
index 566d36e87d2b6..d0d724910faf1 100644
--- a/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll
+++ b/llvm/test/DebugInfo/COFF/fpo-stack-protect.ll
@@ -15,7 +15,7 @@
 ; CHECK:         subl    $20, %esp
 ; CHECK:         .cv_fpo_stackalloc      20
 ; CHECK:         .cv_fpo_endprologue
-; CHECK:         ___security_cookie
+; CHECK:         movl    ___security_cookie, %ea
 
 ; CHECK:         movl    28(%esp), %esi
 ; CHECK:         movl    %esi, {{[0-9]*}}(%esp)
@@ -24,13 +24,16 @@
 ; CHECK:         movl    %esi, {{[0-9]*}}(%esp)
 
 ; CHECK:         calll   _escape
-; CHECK:         calll   @__security_check_cookie@4
+
+; CHECK:         movl    ___security_cookie, %ecx
+; CHECK:         cmpl    %eax, %ecx
 
 ; CHECK:         movl    %esi, %eax
 ; CHECK:         addl    $20, %esp
 ; CHECK:         popl    %esi
 ; CHECK:         retl
 ; CHECK: Ltmp4:
+; CHECK:         calll   @__security_check_cookie@4
 ; CHECK:         .cv_fpo_endproc
 
 ; ModuleID = 't.c'

From 36878158586b92e53dd615264f883e9d7530d047 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 12 Jun 2025 14:39:15 +0000
Subject: [PATCH 238/851] [gn build] Port e1e1836bbd70

---
 llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
index f17b9afcbcddc..f22ee4f31741b 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
@@ -144,7 +144,6 @@ static_library("LLVMX86CodeGen") {
     "X86VZeroUpper.cpp",
     "X86WinEHState.cpp",
     "X86WinEHUnwindV2.cpp",
-    "X86WinFixupBufferSecurityCheck.cpp",
   ]
 }
 

From b6a56b8ef26a6b612eb5f49d37024666b073481e Mon Sep 17 00:00:00 2001
From: Tobias Stadler <mail@stadler-tobias.de>
Date: Thu, 12 Jun 2025 15:50:31 +0100
Subject: [PATCH 239/851] [llvm-remarkutil] bitstream2yaml: Keep output file
 (#143220)

Keep the output file on successful exit, otherwise `llvm-remarkutil
bitstream2yaml -o filename.yaml ...` does not produce any output,
because the output file is deleted when the tool exits.
---
 llvm/test/tools/llvm-remarkutil/convert.test | 7 ++++---
 llvm/tools/llvm-remarkutil/RemarkConvert.cpp | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/test/tools/llvm-remarkutil/convert.test b/llvm/test/tools/llvm-remarkutil/convert.test
index 83023c8ce6a89..0d7ab8e4682a5 100644
--- a/llvm/test/tools/llvm-remarkutil/convert.test
+++ b/llvm/test/tools/llvm-remarkutil/convert.test
@@ -1,6 +1,7 @@
-RUN: llvm-remarkutil bitstream2yaml %p/Inputs/two-remarks.bitstream -o - | FileCheck %s -strict-whitespace
-RUN: llvm-remarkutil yaml2bitstream %p/Inputs/two-remarks.yaml -o %t
-RUN: llvm-remarkutil bitstream2yaml %t -o - | FileCheck %s -strict-whitespace
+RUN: llvm-remarkutil bitstream2yaml %p/Inputs/two-remarks.bitstream -o %t.yaml
+RUN: FileCheck %s -strict-whitespace < %t.yaml
+RUN: llvm-remarkutil yaml2bitstream %p/Inputs/two-remarks.yaml -o %t.bitstream
+RUN: llvm-remarkutil bitstream2yaml %t.bitstream -o - | FileCheck %s -strict-whitespace
 
 ; CHECK: --- !Analysis
 ; CHECK-NEXT: Pass:            prologepilog
diff --git a/llvm/tools/llvm-remarkutil/RemarkConvert.cpp b/llvm/tools/llvm-remarkutil/RemarkConvert.cpp
index 35d8dcd99b4a9..207c5e0a8048b 100644
--- a/llvm/tools/llvm-remarkutil/RemarkConvert.cpp
+++ b/llvm/tools/llvm-remarkutil/RemarkConvert.cpp
@@ -133,6 +133,7 @@ static Error tryBitstream2YAML() {
   if (!E.isA<EndOfFileError>())
     return E;
   consumeError(std::move(E));
+  OF->keep();
   return Error::success();
 }
 } // namespace bitstream2yaml

From ca5b71a4559890a9768558ddea724782fb638bfa Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Thu, 12 Jun 2025 10:52:25 -0400
Subject: [PATCH 240/851] [Matrix] Propagate shape information through Select
 insts (#141876)

---
 .../Scalar/LowerMatrixIntrinsics.cpp          |  47 +++++-
 .../LowerMatrixIntrinsics/select.ll           | 146 ++++++++++++++++++
 2 files changed, 188 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/select.ll

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index a7072ea719292..ce6eaa292d8fb 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -323,9 +323,11 @@ computeShapeInfoForInst(Instruction *I,
       return OpShape->second;
   }
 
-  if (isUniformShape(I)) {
+  if (isUniformShape(I) || isa<SelectInst>(I)) {
+    auto Ops = I->operands();
+    auto ShapedOps = isa<SelectInst>(I) ? drop_begin(Ops) : Ops;
     // Find the first operand that has a known shape and use that.
-    for (auto &Op : I->operands()) {
+    for (auto &Op : ShapedOps) {
       auto OpShape = ShapeMap.find(Op.get());
       if (OpShape != ShapeMap.end())
         return OpShape->second;
@@ -701,7 +703,8 @@ class LowerMatrixIntrinsics {
       default:
         return isUniformShape(II);
       }
-    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
+    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V) ||
+           isa<SelectInst>(V);
   }
 
   /// Propagate the shape information of instructions to their users.
@@ -788,10 +791,12 @@ class LowerMatrixIntrinsics {
       } else if (isa<StoreInst>(V)) {
         // Nothing to do.  We forward-propagated to this so we would just
         // backward propagate to an instruction with an already known shape.
-      } else if (isUniformShape(V)) {
+      } else if (isUniformShape(V) || isa<SelectInst>(V)) {
+        auto Ops = cast<Instruction>(V)->operands();
+        auto ShapedOps = isa<SelectInst>(V) ? drop_begin(Ops) : Ops;
         // Propagate to all operands.
         ShapeInfo Shape = ShapeMap[V];
-        for (Use &U : cast<Instruction>(V)->operands()) {
+        for (Use &U : ShapedOps) {
           if (setShapeInfo(U.get(), Shape))
             pushInstruction(U.get(), WorkList);
         }
@@ -1148,6 +1153,8 @@ class LowerMatrixIntrinsics {
         Result = VisitUnaryOperator(UnOp, SI);
       else if (auto *Intr = dyn_cast<IntrinsicInst>(Inst))
         Result = VisitIntrinsicInst(Intr, SI);
+      else if (auto *Select = dyn_cast<SelectInst>(Inst))
+        Result = VisitSelectInst(Select, SI);
       else if (match(Inst, m_Load(m_Value(Op1))))
         Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1);
       else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
@@ -2307,6 +2314,36 @@ class LowerMatrixIntrinsics {
                                    Result.getNumVectors());
   }
 
+  /// Lower selects.
+  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape) {
+    Value *Cond = Inst->getOperand(0);
+    Value *OpA = Inst->getOperand(1);
+    Value *OpB = Inst->getOperand(2);
+
+    IRBuilder<> Builder(Inst);
+
+    MatrixTy Result;
+    MatrixTy A = getMatrix(OpA, Shape, Builder);
+    MatrixTy B = getMatrix(OpB, Shape, Builder);
+
+    Value *CondV[2];
+    if (isa<FixedVectorType>(Cond->getType())) {
+      MatrixTy C = getMatrix(Cond, Shape, Builder);
+      CondV[0] = C.getVector(0);
+      CondV[1] = C.getVector(1);
+    } else {
+      CondV[0] = Cond;
+      CondV[1] = Cond;
+    }
+
+    for (unsigned I = 0, E = Shape.getNumVectors(); I != E; ++I)
+      Result.addVector(
+          Builder.CreateSelect(CondV[I], A.getVector(I), B.getVector(I)));
+
+    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                   Result.getNumVectors());
+  }
+
   /// Helper to linearize a matrix expression tree into a string. Currently
   /// matrix expressions are linarized by starting at an expression leaf and
   /// linearizing bottom up.
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
new file mode 100644
index 0000000000000..70b0dfdb3e7e8
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define void @select_2x2_bot(i1 %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_bot(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD4]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 4
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %rhsv = load <4 x float>, ptr %rhs
+  %op = select i1 %cond, <4 x float> %lhsv, <4 x float> %rhsv
+  call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @select_2x2_lhs(i1 %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_lhs(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD4]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %lhs, i64 2, i1 false, i32 2, i32 2)
+  %rhsv = load <4 x float>, ptr %rhs
+  %op = select i1 %cond, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_rhs(i1 %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_rhs(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS1:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD4]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 2, i1 false, i32 2, i32 2)
+  %op = select i1 %cond, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_vcond_shape1(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape1(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[CONDV:%.*]] = load <4 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x float>, ptr [[RHS1:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr float, ptr [[RHS1]], i64 2
+; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load <2 x float>, ptr [[VEC_GEP6]], align 4
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = shufflevector <4 x i1> [[CONDV]], <4 x i1> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = shufflevector <4 x i1> [[CONDV]], <4 x i1> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[COL_LOAD2]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[COL_LOAD4]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD7]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP8]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = load <4 x i1>, ptr %cond
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 2, i1 false, i32 2, i32 2)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_vcond_shape2(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <2 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[VEC_GEP3:%.*]] = getelementptr i1, ptr [[COND]], i64 2
+; CHECK-NEXT:    [[COL_LOAD4:%.*]] = load <2 x i1>, ptr [[VEC_GEP3]], align 1
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load <2 x float>, ptr [[VEC_GEP6]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[COL_LOAD2]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[COL_LOAD4]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD7]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP8]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = call <4 x i1> @llvm.matrix.column.major.load(ptr %cond, i64 2, i1 false, i32 2, i32 2)
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 2, i1 false, i32 2, i32 2)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_vcond_shape3(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape3(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <4 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP4:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <2 x float>, ptr [[VEC_GEP4]], align 4
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x i1> [[COL_LOAD2]], <4 x i1> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT6:%.*]] = shufflevector <4 x i1> [[COL_LOAD2]], <4 x i1> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[SPLIT]], <2 x float> [[COL_LOAD]], <2 x float> [[COL_LOAD3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> [[SPLIT6]], <2 x float> [[COL_LOAD1]], <2 x float> [[COL_LOAD5]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP7]], align 8
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = call <4 x i1> @llvm.matrix.column.major.load(ptr %cond, i64 4, i1 false, i32 4, i32 1)
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 2, i1 false, i32 2, i32 2)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}

From bba4ded3c2f94fe0de6011a6941b135b3cb0370a Mon Sep 17 00:00:00 2001
From: "A. Jiang" <de34@live.cn>
Date: Thu, 12 Jun 2025 22:53:41 +0800
Subject: [PATCH 241/851] [libc++] Fix constructing `bitset` from
 non-null-terminated arrays (#143691)

Unconditional evaluation of `char_traits<_CharT>::length(__str)` is problematic, because it causes
UB when `__str` points to a non-null-terminated array. We should only call `length` (currently, in
`basic_string_view`'s constructor) when `__n == npos` per [bitset.cons]/8.

Drive-by change: Reduction of conditional compilation, given that
- both `basic_string_view<_CharT>::size_type` and `basic_string<_CharT>::size_type` must be
  `size_t`, and thus
- both `basic_string_view<_CharT>::npos` and `basic_string<_CharT>::npos` must be `size_t(-1)`.

For the type sameness in the standard wording, see:
- [string.view.template.general]
- [basic.string.general]
- [allocator.traits.types]/6
- [default.allocator.general]/1

Fixes #143684
---
 libcxx/include/bitset                         | 13 ++++-----
 .../bitset.cons/char_ptr_ctor.pass.cpp        | 29 +++++++++++++++++++
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/libcxx/include/bitset b/libcxx/include/bitset
index 88dc0e08c995d..6be476e2b69d8 100644
--- a/libcxx/include/bitset
+++ b/libcxx/include/bitset
@@ -645,16 +645,13 @@ public:
   template <class _CharT, __enable_if_t<_IsCharLikeType<_CharT>::value, int> = 0>
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX23 explicit bitset(
       const _CharT* __str,
-#  if _LIBCPP_STD_VER >= 26
-      typename basic_string_view<_CharT>::size_type __n = basic_string_view<_CharT>::npos,
-#  else
-      typename basic_string<_CharT>::size_type __n = basic_string<_CharT>::npos,
-#  endif
+      size_t __n    = basic_string<_CharT>::npos,
       _CharT __zero = _CharT('0'),
       _CharT __one  = _CharT('1')) {
-
-    size_t __rlen = std::min(__n, char_traits<_CharT>::length(__str));
-    __init_from_string_view(basic_string_view<_CharT>(__str, __rlen), __zero, __one);
+    if (__n == basic_string<_CharT>::npos)
+      __init_from_string_view(basic_string_view<_CharT>(__str), __zero, __one);
+    else
+      __init_from_string_view(basic_string_view<_CharT>(__str, __n), __zero, __one);
   }
 #  if _LIBCPP_STD_VER >= 26
   template <class _CharT, class _Traits>
diff --git a/libcxx/test/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp
index 86b144ed87b70..4f9cdaeb38c0b 100644
--- a/libcxx/test/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp
+++ b/libcxx/test/std/utilities/template.bitset/bitset.cons/char_ptr_ctor.pass.cpp
@@ -72,6 +72,35 @@ TEST_CONSTEXPR_CXX23 void test_char_pointer_ctor()
     for (std::size_t i = 10; i < v.size(); ++i)
         assert(v[i] == false);
   }
+  // Verify that this constructor doesn't read over the given bound.
+  // See https://github.com/llvm/llvm-project/issues/143684
+  {
+    const char not_null_terminated[] = {'1', '0', '1', '0', '1', '0', '1', '0', '1', '0'};
+    std::bitset<N> v(not_null_terminated, 10);
+    std::size_t M = std::min<std::size_t>(v.size(), 10);
+    for (std::size_t i = 0; i < M; ++i)
+      assert(v[i] == (not_null_terminated[M - 1 - i] == '1'));
+    for (std::size_t i = 10; i < v.size(); ++i)
+      assert(!v[i]);
+  }
+  {
+    const char not_null_terminated[] = {'1', 'a', '1', 'a', '1', 'a', '1', 'a', '1', 'a'};
+    std::bitset<N> v(not_null_terminated, 10, 'a');
+    std::size_t M = std::min<std::size_t>(v.size(), 10);
+    for (std::size_t i = 0; i < M; ++i)
+      assert(v[i] == (not_null_terminated[M - 1 - i] == '1'));
+    for (std::size_t i = 10; i < v.size(); ++i)
+      assert(!v[i]);
+  }
+  {
+    const char not_null_terminated[] = {'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a'};
+    std::bitset<N> v(not_null_terminated, 10, 'a', 'b');
+    std::size_t M = std::min<std::size_t>(v.size(), 10);
+    for (std::size_t i = 0; i < M; ++i)
+      assert(v[i] == (not_null_terminated[M - 1 - i] == 'b'));
+    for (std::size_t i = 10; i < v.size(); ++i)
+      assert(!v[i]);
+  }
 }
 
 TEST_CONSTEXPR_CXX23 bool test() {

From 5c1a021f7f285f702a290d7faaaf0a274b3bf5a1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 12 Jun 2025 07:54:36 -0700
Subject: [PATCH 242/851] [libc++] Fix typos in documentation (#143912)

---
 libcxx/docs/ABIGuarantees.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcxx/docs/ABIGuarantees.rst b/libcxx/docs/ABIGuarantees.rst
index c25aaa8e42330..e6ac4f2b5b230 100644
--- a/libcxx/docs/ABIGuarantees.rst
+++ b/libcxx/docs/ABIGuarantees.rst
@@ -40,7 +40,7 @@ significantly.
 ``_LIBCPP_ABI_NO_ITERATOR_BASES``
 ---------------------------------
 This removes the ``iterator`` base class from ``back_insert_iterator``, ``front_insert_iterator``, ``insert_iterator``,
-``istream_iterator``, ``ostream_iterator``, ``ostreambuf_itreator``, ``reverse_iterator``, and ``raw_storage_iterator``.
+``istream_iterator``, ``ostream_iterator``, ``ostreambuf_iterator``, ``reverse_iterator``, and ``raw_storage_iterator``.
 This doesn't directly affect the layout of these types in most cases, but may result in more padding being used when
 they are used in combination, for example ``reverse_iterator<reverse_iterator<T>>``.
 
@@ -63,7 +63,7 @@ removes these workarounds for platforms that don't care about ABI compatibility.
 
 ``_LIBCPP_ABI_NO_COMPRESSED_PAIR_PADDING``
 ------------------------------------------
-This removes artifical padding from ``_LIBCPP_COMPRESSED_PAIR`` and ``_LIBCPP_COMPRESSED_TRIPLE``.
+This removes artificial padding from ``_LIBCPP_COMPRESSED_PAIR`` and ``_LIBCPP_COMPRESSED_TRIPLE``.
 
 These macros are used inside the associative and unordered containers, ``deque``, ``forward_list``, ``future``,
 ``list``, ``basic_string``, ``function``, ``shared_ptr``, ``unique_ptr``, and ``vector`` to stay ABI compatible with the
@@ -83,7 +83,7 @@ flag removes that artificial padding.
 
 Linking TUs which have been compiled against different releases of libc++
 =========================================================================
-libc++ supports linking TUs which have beeen compiled against different releases of libc++ by marking symbols with
+libc++ supports linking TUs which have been compiled against different releases of libc++ by marking symbols with
 hidden visibility and changing the mangling of header-only functions in every release.
 
 
@@ -104,7 +104,7 @@ behave as the flags say.
 
 Availability of symbols in the built library (both static and shared)
 =====================================================================
-In general, libc++ does not make any guarantees about forwards-compability. That is, a TU compiled against new headers
+In general, libc++ does not make any guarantees about forwards-compatibility. That is, a TU compiled against new headers
 may not work with an older library. Vendors who require such support can leverage availability markup. On the other
 hand, backwards compatibility is generally guaranteed.
 
@@ -166,7 +166,7 @@ There are multiple ABI flags which change which type an alias references:
 
 ``_LIBCPP_ABI_INCOMPLETE_TYPES_IN_DEQUE``
 -----------------------------------------
-This changes ``deque::iterator`` to avoid requring complete types for ``deque``.
+This changes ``deque::iterator`` to avoid requiring complete types for ``deque``.
 
 ``_LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE``
 -------------------------------------------------
@@ -198,7 +198,7 @@ This changes the value of ``regex_constants::syntax_option-type::ECMAScript`` to
 ``_LIBCPP_ABI_FIX_CITYHASH_IMPLEMENTATION``
 -------------------------------------------
 This flag fixes the implementation of CityHash used for ``hash<fundamental-type>``. The incorrect implementation of
-CityHash has the roblem that it drops some bits on the floor. Fixing the implementation changes the hash of values,
+CityHash has the problem that it drops some bits on the floor. Fixing the implementation changes the hash of values,
 resulting in an ABI break.
 
 inline namespaces

From 4f60321ca183ebf132e97e54d8d560643c5c3340 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 12 Jun 2025 15:59:59 +0100
Subject: [PATCH 243/851] [Offload] Add `ol_dimensions_t` and convert ranges
 from size_t -> uint32_t (#143901)

This is a three element x, y, z size_t vector that can be used any place
where a 3D vector is required. This ensures that all vectors across
liboffload are the same and don't require any resizing/reordering
dances.
---
 offload/liboffload/API/Common.td                    | 10 ++++++++++
 offload/liboffload/API/Kernel.td                    |  8 ++------
 offload/liboffload/src/OffloadImpl.cpp              | 12 ++++++------
 .../unittests/OffloadAPI/kernel/olLaunchKernel.cpp  | 13 ++++---------
 4 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index 7674da0438c29..8a2ecd6c6e8f4 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -148,6 +148,16 @@ def : Struct {
   ];
 }
 
+def : Struct {
+  let name = "ol_dimensions_t";
+  let desc = "A three element vector";
+  let members = [
+    StructMember<"uint32_t", "x", "X">,
+    StructMember<"uint32_t", "y", "Y">,
+    StructMember<"uint32_t", "z", "Z">,
+  ];
+}
+
 def : Function {
   let name = "olInit";
   let desc = "Perform initialization of the Offload library and plugins";
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 45e3d8112791c..0913a036fa04f 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -29,12 +29,8 @@ def : Struct {
     let desc = "Size-related arguments for a kernel launch.";
     let members = [
         StructMember<"size_t", "Dimensions", "Number of work dimensions">,
-        StructMember<"size_t", "NumGroupsX", "Number of work groups on the X dimension">,
-        StructMember<"size_t", "NumGroupsY", "Number of work groups on the Y dimension">,
-        StructMember<"size_t", "NumGroupsZ", "Number of work groups on the Z dimension">,
-        StructMember<"size_t", "GroupSizeX", "Size of a work group on the X dimension.">,
-        StructMember<"size_t", "GroupSizeY", "Size of a work group on the Y dimension.">,
-        StructMember<"size_t", "GroupSizeZ", "Size of a work group on the Z dimension.">,
+        StructMember<"struct ol_dimensions_t", "NumGroups", "Number of work groups in each dimension">,
+        StructMember<"struct ol_dimensions_t", "GroupSize", "Size of a work group in each dimension">,
         StructMember<"size_t", "DynSharedMemory", "Size of dynamic shared memory in bytes.">
     ];
 }
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index d2b331905ab77..0a784cddeaecb 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -499,12 +499,12 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
   auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
   AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
   KernelArgsTy LaunchArgs{};
-  LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
-  LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
-  LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumGroupsZ;
-  LaunchArgs.ThreadLimit[0] = LaunchSizeArgs->GroupSizeX;
-  LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSizeY;
-  LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSizeZ;
+  LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroups.x;
+  LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroups.y;
+  LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumGroups.z;
+  LaunchArgs.ThreadLimit[0] = LaunchSizeArgs->GroupSize.x;
+  LaunchArgs.ThreadLimit[1] = LaunchSizeArgs->GroupSize.y;
+  LaunchArgs.ThreadLimit[2] = LaunchSizeArgs->GroupSize.z;
   LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
 
   KernelLaunchParamsTy Params;
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index d466799c1acaa..157f33a363700 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -19,13 +19,8 @@ struct LaunchKernelTestBase : OffloadQueueTest {
                                    DeviceBin->getBufferSize(), &Program));
     ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
     LaunchArgs.Dimensions = 1;
-    LaunchArgs.GroupSizeX = 64;
-    LaunchArgs.GroupSizeY = 1;
-    LaunchArgs.GroupSizeZ = 1;
-
-    LaunchArgs.NumGroupsX = 1;
-    LaunchArgs.NumGroupsY = 1;
-    LaunchArgs.NumGroupsZ = 1;
+    LaunchArgs.GroupSize = {64, 1, 1};
+    LaunchArgs.NumGroups = {1, 1, 1};
 
     LaunchArgs.DynSharedMemory = 0;
   }
@@ -60,7 +55,7 @@ OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
 TEST_P(olLaunchKernelTest, Success) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
-                            LaunchArgs.GroupSizeX * sizeof(uint32_t), &Mem));
+                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
   struct {
     void *Mem;
   } Args{Mem};
@@ -88,7 +83,7 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
 TEST_P(olLaunchKernelTest, SuccessSynchronous) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
-                            LaunchArgs.GroupSizeX * sizeof(uint32_t), &Mem));
+                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
 
   struct {
     void *Mem;

From 4bd0a0e50bcfc3263c219acc9709ae234a334456 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Thu, 12 Jun 2025 17:09:55 +0200
Subject: [PATCH 244/851] Revert "[flang] Enable delayed localization by
 default for `do concurrent` (#142567)" (#143905)

This reverts commit 937be177528de156922c1b5f6cab08ba3009dbf2.

Resolves https://github.com/llvm/llvm-project/issues/143897 until the
todo is properly handled.
---
 flang/lib/Lower/Bridge.cpp                            | 6 +++++-
 flang/test/Lower/do_concurrent_delayed_locality.f90   | 2 +-
 flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +-
 flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +-
 flang/test/Lower/loops.f90                            | 2 +-
 flang/test/Lower/loops3.f90                           | 2 +-
 6 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 5ff8101dba097..64b16b3abe991 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2033,7 +2033,11 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     fir::LocalitySpecifierOperands privateClauseOps;
     auto doConcurrentLoopOp =
         mlir::dyn_cast_if_present<fir::DoConcurrentLoopOp>(info.loopOp);
-    bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp;
+    // TODO Promote to using `enableDelayedPrivatization` (which is enabled by
+    // default unlike the staging flag) once the implementation of this is more
+    // complete.
+    bool useDelayedPriv =
+        enableDelayedPrivatizationStaging && doConcurrentLoopOp;
     llvm::SetVector<const Fortran::semantics::Symbol *> allPrivatizedSymbols;
     llvm::SmallSet<const Fortran::semantics::Symbol *, 16> mightHaveReadHostSym;
 
diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90
index 039b17808d19e..6cae0eb46db13 100644
--- a/flang/test/Lower/do_concurrent_delayed_locality.f90
+++ b/flang/test/Lower/do_concurrent_delayed_locality.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
 
 subroutine do_concurrent_with_locality_specs
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
index 67f080eb2c1c5..a3d0c34ed8569 100644
--- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90
+++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
 
 subroutine local_assoc
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index 798cbb335c8c0..d643213854744 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of DO CONCURRENT LOCAL() entities.
-! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s
 
 subroutine test_ptr(p)
   interface
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 64f14ff972272..60df27a591dc3 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 34d7bcfb7d7ad..84db1972cca16 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -1,5 +1,5 @@
 ! Test do concurrent reduction
-! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test

From 62b694090093ed34d620dd1129b194fc66fa4bb0 Mon Sep 17 00:00:00 2001
From: Igor Wodiany <igor.wodiany@imgtec.com>
Date: Thu, 12 Jun 2025 16:10:33 +0100
Subject: [PATCH 245/851] [mlir][spirv] Add definition for GL
 Pack/UnpackHalf2x16 (#143889)

---
 .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td       |  84 ++++++++++++++
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 104 ++++++++++++++++++
 mlir/test/Target/SPIRV/gl-ops.mlir            |  10 +-
 3 files changed, 196 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
index f3f75240e5214..7ffe0c8da1cae 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
@@ -1317,4 +1317,88 @@ def SPIRV_GLFractOp : SPIRV_GLUnaryArithmeticOp<"Fract", 10, SPIRV_Float> {
   }];
 }
 
+// -----
+
+def SPIRV_GLPackHalf2x16Op : SPIRV_GLOp<"PackHalf2x16", 58, [Pure]> {
+  let summary = "Pack two-component vector of 32-bit floats into a 32-bit integer";
+
+  let description = [{
+    Result is the unsigned integer obtained by converting the components of a
+    two-component floating-point vector to the 16-bit OpTypeFloat, and then packing
+    these two 16-bit integers into a 32-bit unsigned integer. The first vector
+    component specifies the 16 least-significant bits of the result; the second
+    component specifies the 16 most-significant bits.
+
+    The RelaxedPrecision Decoration only affects the conversion step of the instruction.
+
+    The v operand must be a vector of 2 components whose type is a 32-bit floating-point.
+
+    Result Type must be a 32-bit integer type.
+
+    #### Example:
+
+    ```mlir
+    %1 = spirv.GL.PackHalf2x16 %0 : vector<2xf32> -> i32
+    ```
+  }];
+
+  let arguments = (ins
+    VectorOfLengthAndType<[2], [SPIRV_Float32]>:$operand
+  );
+
+  let results = (outs
+    SPIRV_Int32:$result
+  );
+
+  let assemblyFormat = [{
+    attr-dict $operand `:` type($operand) `->` type($result)
+  }];
+
+  let hasVerifier = 0;
+}
+
+// -----
+
+def SPIRV_GLUnpackHalf2x16Op : SPIRV_GLOp<"UnpackHalf2x16", 62, [Pure]> {
+  let summary = "Unpack 32-bit integer into two-component vector of 32-bit floats";
+
+  let description = [{
+    Result is the two-component floating-point vector with components obtained by
+    unpacking a 32-bit unsigned integer into a pair of 16-bit values, interpreting
+    those values as 16-bit floating-point numbers according to the OpenGL
+    Specification, and converting them to 32-bit floating-point values. Subnormal
+    numbers are either preserved or flushed to zero, consistently within an
+    implementation.
+
+    The first component of the vector is obtained from the 16 least-significant bits
+    of v; the second component is obtained from the 16 most-significant bits of v.
+
+    The RelaxedPrecision Decoration only affects the conversion step of the instruction.
+
+    The v operand must be a scalar with 32-bit integer type.
+
+    Result Type must be a vector of 2 components whose type is 32-bit floating point.
+
+    #### Example:
+
+    ```mlir
+    %1 = spirv.GL.UnpackHalf2x16 %0 : i32 -> vector<2xf32>
+    ```
+  }];
+
+  let arguments = (ins
+    SPIRV_Int32:$operand
+  );
+
+  let results = (outs
+    VectorOfLengthAndType<[2], [SPIRV_Float32]>:$result
+  );
+
+  let assemblyFormat = [{
+    attr-dict $operand `:` type($operand) `->` type($result)
+  }];
+
+  let hasVerifier = 0;
+}
+
 #endif // MLIR_DIALECT_SPIRV_IR_GL_OPS
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index 29beee5aea93c..fbcf2095dc608 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -815,3 +815,107 @@ func.func @exp2_invalid_type(%arg0 : i32) -> () {
   %0 = spirv.GL.Exp2 %arg0 : i32
   return
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.PackHalf2x16 
+//===----------------------------------------------------------------------===//
+
+func.func @pack_half_2x16(%arg0 : vector<2xf32>) -> () {
+  // CHECK: spirv.GL.PackHalf2x16 {{%.*}} : vector<2xf32> -> i32
+  %0 = spirv.GL.PackHalf2x16 %arg0 : vector<2xf32> -> i32
+  return
+}
+
+// -----
+
+func.func @pack_half_2x16_i16_output(%arg0 : vector<2xf32>) -> () {
+  // expected-error @+1 {{op result #0 must be Int32, but got 'i16'}}
+  %0 = spirv.GL.PackHalf2x16 %arg0 : vector<2xf32> -> i16
+  return
+}
+
+// -----
+
+func.func @pack_half_2x16_wrong_vec_size(%arg0 : vector<3xf32>) -> () {
+  // expected-error @+1 {{op operand #0 must be vector of Float32 values of length 2, but got 'vector<3xf32>'}}
+  %0 = spirv.GL.PackHalf2x16 %arg0 : vector<3xf32> -> i32
+  return
+}
+
+// -----
+
+func.func @pack_half_2x16_wrong_vec_type(%arg0 : vector<2xi32>) -> () {
+  // expected-error @+1 {{op operand #0 must be vector of Float32 values of length 2, but got 'vector<2xi32>'}}
+  %0 = spirv.GL.PackHalf2x16 %arg0 : vector<2xi32> -> i32
+  return
+}
+
+// -----
+
+func.func @pack_half_2x16_scalar_in(%arg0 : f32) -> () {
+  // expected-error @+1 {{invalid kind of type specified: expected builtin.vector, but found 'f32'}}
+  %0 = spirv.GL.PackHalf2x16 %arg0 : f32 -> i32
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_vector_out(%arg0 : vector<2xf32>) -> () {
+  // expected-error @+1 {{invalid kind of type specified: expected builtin.integer, but found 'vector<2xf32>'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : vector<2xf32> -> vector<2xi32>
+  return
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.UnpackHalf2x16 
+//===----------------------------------------------------------------------===//
+
+func.func @unpack_half_2x16(%arg0 : i32) -> () {
+  // CHECK: spirv.GL.UnpackHalf2x16 {{%.*}} : i32 -> vector<2xf32>
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> vector<2xf32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_i16_input(%arg0 : i16) -> () {
+  // expected-error @+1 {{op operand #0 must be Int32, but got 'i16'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i16 -> vector<2xf32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_wrong_vec_size(%arg0 : i32) -> () {
+  // expected-error @+1 {{op result #0 must be vector of Float32 values of length 2, but got 'vector<3xf32>'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> vector<3xf32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_wrong_vec_type(%arg0 : i32) -> () {
+  // expected-error @+1 {{op result #0 must be vector of Float32 values of length 2, but got 'vector<2xi32>'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> vector<2xi32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_vec_in(%arg0 : vector<2xf32>) -> () {
+  // expected-error @+1 {{invalid kind of type specified: expected builtin.integer, but found 'vector<2xf32>'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : vector<2xf32> -> vector<2xf32>
+  return
+}
+
+// -----
+
+func.func @unpack_half_2x16_scalar_out(%arg0 : i32) -> () {
+  // expected-error @+1 {{invalid kind of type specified: expected builtin.vector, but found 'f32'}}
+  %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> f32
+  return
+}
diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir
index 3dee03345e9a1..e4a6c6fb5a34e 100644
--- a/mlir/test/Target/SPIRV/gl-ops.mlir
+++ b/mlir/test/Target/SPIRV/gl-ops.mlir
@@ -96,7 +96,7 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     spirv.Return
   }
 
-spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "None" {
+  spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "None" {
     // CHECK: {{%.*}} = spirv.GL.Cross {{%.*}}, {{%.*}} : vector<3xf32>
     %0 = spirv.GL.Cross %arg1, %arg2 : vector<3xf32>
     // CHECK: {{%.*}} = spirv.GL.Normalize {{%.*}} : f32
@@ -114,5 +114,11 @@ spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "N
     spirv.Return
   }
 
-
+  spirv.func @pack_half_2x16(%arg0 : i32) "None" {
+    // CHECK: {{%.*}} = spirv.GL.UnpackHalf2x16 {{%.*}} : i32 -> vector<2xf32>
+    %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> vector<2xf32>
+    // CHECK: {{%.*}} = spirv.GL.PackHalf2x16 {{%.*}} : vector<2xf32> -> i32
+    %1 = spirv.GL.PackHalf2x16 %0 : vector<2xf32> -> i32
+    spirv.Return
+  }
 }

From e4de74ba11eadb47cf78afbabffbf2b1a50e7298 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <nicolas.vasilache@gmail.com>
Date: Thu, 12 Jun 2025 17:11:06 +0200
Subject: [PATCH 246/851] =?UTF-8?q?[mlir][Vector]=20Tighten=20up=20applica?=
 =?UTF-8?q?tion=20conditions=20in=20TransferReadAfter=E2=80=A6=20(#143869)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…WriteToBroadcast

The pattern would previously apply in spurious cases and generate
incorrect IR.

In the process, we disable the application of this pattern in the case
where there is no broadcast; this should be handled separately and may
more easily support masking.

The case {no-broadcast, yes-transpose} was previously caught by this
pattern and arguably could also generate incorrect IR (and was also
untested): this case does not apply anymore.

The last cast {yes-broadcast, yes-transpose} continues to apply but
should arguably be removed from the future because creating transposes
as part of canonicalization feels dangerous.
There are other patterns that move permutation logic:

- either into the transfer, or
- outside of the transfer

Ideally, this would be target-dependent and not a canonicalization (i.e.
does your DMA HW allow transpose on the fly or not) but this is beyond
the scope of this PR.

Co-authored-by: Nicolas Vasilache <nicolasvasilache@users.noreply.github.com>
---
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp   |  30 ++++--
 mlir/test/Dialect/Vector/canonicalize.mlir | 108 ++++++++++++++++++---
 2 files changed, 117 insertions(+), 21 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index a295bf1eb4d95..2a2357319bd23 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4668,12 +4668,15 @@ struct TransferReadAfterWriteToBroadcast
 
   LogicalResult matchAndRewrite(TransferReadOp readOp,
                                 PatternRewriter &rewriter) const override {
-    if (readOp.hasOutOfBoundsDim() ||
-        !llvm::isa<RankedTensorType>(readOp.getShapedType()))
-      return failure();
     auto defWrite = readOp.getBase().getDefiningOp<vector::TransferWriteOp>();
     if (!defWrite)
       return failure();
+    // Bail if we need an alias analysis.
+    if (!readOp.hasPureTensorSemantics() || !defWrite.hasPureTensorSemantics())
+      return failure();
+    // Bail if we need a bounds analysis.
+    if (readOp.hasOutOfBoundsDim() || defWrite.hasOutOfBoundsDim())
+      return failure();
     // TODO: If the written transfer chunk is a superset of the read transfer
     // chunk we could do an extract_strided_slice.
     if (readOp.getTransferChunkAccessed() !=
@@ -4684,15 +4687,28 @@ struct TransferReadAfterWriteToBroadcast
     if (getUnusedDimsBitVector({readOp.getPermutationMap()}) !=
         getUnusedDimsBitVector({defWrite.getPermutationMap()}))
       return failure();
-    if (readOp.getIndices() != defWrite.getIndices() ||
-        readOp.getMask() != defWrite.getMask())
+    // This pattern should only catch the broadcast case, the non-broadcast case
+    // should be done separately to keep application conditions clean and
+    // separate.
+    AffineMap readMap = compressUnusedDims(readOp.getPermutationMap());
+    AffineMap writeMap = compressUnusedDims(defWrite.getPermutationMap());
+    bool bcast = !readMap.getBroadcastDims().empty() ||
+                 !writeMap.getBroadcastDims().empty();
+    if (!bcast)
+      return failure();
+    // At this point, we know we have a bcast.
+    // Bail in the masked case (too complex atm and needed to properly account
+    // for padding).
+    if (readOp.getMask() || defWrite.getMask())
+      return failure();
+    // If indices are not the same a shift may be required, bail.
+    if (readOp.getIndices() != defWrite.getIndices())
       return failure();
+
     Value vec = defWrite.getVector();
     // TODO: loop through the chain of transfer_write if we can prove that they
     // don't overlap with the transfer_read. This requires improving
     // `isDisjointTransferIndices` helper.
-    AffineMap readMap = compressUnusedDims(readOp.getPermutationMap());
-    AffineMap writeMap = compressUnusedDims(defWrite.getPermutationMap());
     AffineMap map = readMap.compose(writeMap);
     if (map.getNumResults() == 0)
       return failure();
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index a06a9f67d54dc..6691cb52acdc0 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -408,7 +408,7 @@ func.func @extract_strided_fold_insert(%a: vector<6x4xf32>, %b: vector<8x16xf32>
 // -----
 
 // Negative test where the extract is not a subset of the element inserted.
-// CHECK-LABEL: extract_strided_fold_negative
+// CHECK-LABEL: negative_extract_strided_fold
 //  CHECK-SAME: (%[[ARG0:.*]]: vector<4x4xf32>, %[[ARG1:.*]]: vector<8x16xf32>
 //       CHECK:   %[[INS:.*]] = vector.insert_strided_slice %[[ARG0]], %[[ARG1]]
 //  CHECK-SAME:     {offsets = [2, 2], strides = [1, 1]}
@@ -417,7 +417,7 @@ func.func @extract_strided_fold_insert(%a: vector<6x4xf32>, %b: vector<8x16xf32>
 //  CHECK-SAME:     {offsets = [2, 2], sizes = [6, 4], strides = [1, 1]}
 //  CHECK-SAME:       : vector<8x16xf32> to vector<6x4xf32>
 //  CHECK-NEXT:   return %[[EXT]] : vector<6x4xf32>
-func.func @extract_strided_fold_negative(%a: vector<4x4xf32>, %b: vector<8x16xf32>)
+func.func @negative_extract_strided_fold(%a: vector<4x4xf32>, %b: vector<8x16xf32>)
   -> (vector<6x4xf32>) {
   %0 = vector.insert_strided_slice %a, %b {offsets = [2, 2], strides = [1, 1]}
     : vector<4x4xf32> into vector<8x16xf32>
@@ -753,10 +753,10 @@ func.func @fold_extract_broadcast_0dvec_input_scalar_output(%a : vector<f32>,
 
 // -----
 
-// CHECK-LABEL: fold_extract_broadcast_negative
+// CHECK-LABEL: negative_fold_extract_broadcast
 //       CHECK:   vector.broadcast %{{.*}} : vector<1x1xf32> to vector<1x1x4xf32>
 //       CHECK:   vector.extract %{{.*}}[0, 0] : vector<4xf32> from vector<1x1x4xf32>
-func.func @fold_extract_broadcast_negative(%a : vector<1x1xf32>) -> vector<4xf32> {
+func.func @negative_fold_extract_broadcast(%a : vector<1x1xf32>) -> vector<4xf32> {
   %b = vector.broadcast %a : vector<1x1xf32> to vector<1x1x4xf32>
   %r = vector.extract %b[0, 0] : vector<4xf32> from vector<1x1x4xf32>
   return %r : vector<4xf32>
@@ -895,11 +895,11 @@ func.func @fold_extract_shapecast_0d_source(%arg0 : vector<f32>) -> f32 {
 
 // -----
 
-// CHECK-LABEL: fold_extract_shapecast_negative
+// CHECK-LABEL: negative_fold_extract_shapecast
 //       CHECK:   %[[V:.*]] = vector.shape_cast %{{.*}} : vector<16xf32> to vector<2x4x2xf32>
 //       CHECK:   %[[R:.*]] = vector.extract %[[V]][1] : vector<4x2xf32> from vector<2x4x2xf32>
 //       CHECK:   return %[[R]] : vector<4x2xf32>
-func.func @fold_extract_shapecast_negative(%arg0 : vector<16xf32>) -> vector<4x2xf32> {
+func.func @negative_fold_extract_shapecast(%arg0 : vector<16xf32>) -> vector<4x2xf32> {
   %0 = vector.shape_cast %arg0 : vector<16xf32> to vector<2x4x2xf32>
   %r = vector.extract %0[1] : vector<4x2xf32> from vector<2x4x2xf32>
   return %r : vector<4x2xf32>
@@ -1460,11 +1460,11 @@ func.func @store_after_load_tensor(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
 
 // -----
 
-// CHECK-LABEL: func @store_after_load_tensor_negative
+// CHECK-LABEL: func @negative_store_after_load_tensor
 //       CHECK:   vector.transfer_read
 //       CHECK:   vector.transfer_write
 //       CHECK:   return
-func.func @store_after_load_tensor_negative(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
+func.func @negative_store_after_load_tensor(%arg0 : tensor<4x4xf32>) -> tensor<4x4xf32> {
   %c1 = arith.constant 1 : index
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
@@ -1499,12 +1499,12 @@ func.func @store_to_load_tensor(%arg0 : tensor<4x4xf32>,
 
 // -----
 
-// CHECK-LABEL: func @store_to_load_negative_tensor
+// CHECK-LABEL: func @negative_store_to_load_tensor
 //       CHECK:   vector.transfer_write
 //       CHECK:   vector.transfer_write
 //       CHECK:   %[[V:.*]] = vector.transfer_read
 //       CHECK:   return %[[V]] : vector<1x4xf32>
-func.func @store_to_load_negative_tensor(%arg0 : tensor<4x4xf32>,
+func.func @negative_store_to_load_tensor(%arg0 : tensor<4x4xf32>,
   %v0 : vector<1x4xf32>, %v1 : vector<1x4xf32>, %i : index) -> vector<1x4xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -1540,6 +1540,86 @@ func.func @store_to_load_tensor_broadcast(%arg0 : tensor<4x4xf32>,
 
 // -----
 
+// CHECK-LABEL: func @negative_store_to_load_tensor_memref
+//   CHECK-NOT:   vector.broadcast
+//   CHECK-NOT:   vector.transpose
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @negative_store_to_load_tensor_memref(
+    %arg0 : tensor<?x?xf32>,
+    %arg1 : memref<?x?xf32>,
+    %v0 : vector<4x2xf32>
+  ) -> vector<4x2xf32> 
+{
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  vector.transfer_write %v0, %arg1[%c0, %c0] {in_bounds = [true, true]} :
+    vector<4x2xf32>, memref<?x?xf32>
+  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [true, true]} :
+    tensor<?x?xf32>, vector<4x2xf32>
+  return %0 : vector<4x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_store_to_load_tensor_no_actual_broadcast
+//   CHECK-NOT:   vector.broadcast
+//   CHECK-NOT:   vector.transpose
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @negative_store_to_load_tensor_no_actual_broadcast(%arg0 : tensor<?x?xf32>,
+  %v0 : vector<4x2xf32>) -> vector<4x2xf32> {
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  %w0 = vector.transfer_write %v0, %arg0[%c0, %c0] :
+    vector<4x2xf32>, tensor<?x?xf32>
+  %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true]} :
+    tensor<?x?xf32>, vector<4x2xf32>
+  return %0 : vector<4x2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_store_to_load_tensor_broadcast_out_of_bounds
+//   CHECK-NOT:   vector.broadcast
+//   CHECK-NOT:   vector.transpose
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @negative_store_to_load_tensor_broadcast_out_of_bounds(%arg0 : tensor<?x?xf32>,
+  %v0 : vector<4x2xf32>) -> vector<4x2x6xf32> {
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  %w0 = vector.transfer_write %v0, %arg0[%c0, %c0] :
+    vector<4x2xf32>, tensor<?x?xf32>
+  %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true, true],
+  permutation_map = affine_map<(d0, d1) -> (d0, d1, 0)>} :
+    tensor<?x?xf32>, vector<4x2x6xf32>
+  return %0 : vector<4x2x6xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_store_to_load_tensor_broadcast_masked
+//   CHECK-NOT:   vector.broadcast
+//   CHECK-NOT:   vector.transpose
+//       CHECK:   vector.transfer_write
+//       CHECK:   vector.transfer_read
+func.func @negative_store_to_load_tensor_broadcast_masked(
+    %arg0 : tensor<?x?xf32>, %v0 : vector<4x2xf32>, %mask : vector<4x2xi1>)
+  -> vector<4x2x6xf32> 
+{
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  %w0 = vector.transfer_write %v0, %arg0[%c0, %c0], %mask {in_bounds = [true, true]} :
+    vector<4x2xf32>, tensor<?x?xf32>
+  %0 = vector.transfer_read %w0[%c0, %c0], %cf0 {in_bounds = [true, true, true],
+  permutation_map = affine_map<(d0, d1) -> (d0, d1, 0)>} :
+    tensor<?x?xf32>, vector<4x2x6xf32>
+  return %0 : vector<4x2x6xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @store_to_load_tensor_broadcast_scalable
 //  CHECK-SAME: (%[[ARG:.*]]: tensor<?xf32>, %[[V0:.*]]: vector<[4]xf32>)
 //       CHECK:   %[[B:.*]] = vector.broadcast %[[V0]] : vector<[4]xf32> to vector<6x[4]xf32>
@@ -1604,7 +1684,7 @@ func.func @dead_store_tensor(%arg0 : tensor<4x4xf32>,
 
 // -----
 
-// CHECK-LABEL: func @dead_store_tensor_negative
+// CHECK-LABEL: func @negative_dead_store_tensor
 //   CHECK-DAG:      %[[C0:.*]] = arith.constant 0 : index
 //   CHECK-DAG:      %[[C1:.*]] = arith.constant 1 : index
 //       CHECK:   vector.transfer_write
@@ -1612,7 +1692,7 @@ func.func @dead_store_tensor(%arg0 : tensor<4x4xf32>,
 //       CHECK:   vector.transfer_read
 //       CHECK:   %[[VTW:.*]] = vector.transfer_write {{.*}}, {{.*}}[%[[C1]], %[[C0]]]
 //       CHECK:   return %[[VTW]] : tensor<4x4xf32>
-func.func @dead_store_tensor_negative(%arg0 : tensor<4x4xf32>,
+func.func @negative_dead_store_tensor(%arg0 : tensor<4x4xf32>,
   %v0 : vector<1x4xf32>, %v1 : vector<1x4xf32>, %i : index) -> tensor<4x4xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
@@ -2063,10 +2143,10 @@ func.func @extract_insert_rank_reduce(%a: vector<4xf32>, %b: vector<8x16xf32>)
 
 // -----
 
-// CHECK-LABEL: extract_insert_negative
+// CHECK-LABEL: negative_extract_insert
 //       CHECK: vector.insert_strided_slice
 //       CHECK: vector.extract
-func.func @extract_insert_negative(%a: vector<2x15xf32>, %b: vector<12x8x16xf32>)
+func.func @negative_extract_insert(%a: vector<2x15xf32>, %b: vector<12x8x16xf32>)
   -> vector<16xf32> {
   %0 = vector.insert_strided_slice %a, %b {offsets = [4, 2, 0], strides = [1, 1]}
     : vector<2x15xf32> into vector<12x8x16xf32>

From 2e5fb77ce03748608cfad49fd62479fc3d912372 Mon Sep 17 00:00:00 2001
From: Paul Kirth <paulkirth@google.com>
Date: Thu, 12 Jun 2025 08:22:04 -0700
Subject: [PATCH 247/851] [llvm] Make TestData compatible with c++20 (#143801)

The clang-debian-cpp20 buildbot did not like direct initialization
without a matching constructor. This patch adds a new constructor taking
a json::Object that directly initializes the struct fields. We also
update an internal interface for const correctness.

https://lab.llvm.org/buildbot/#/builders/108/builds/13950
---
 .../llvm-test-mustache-spec.cpp               | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
index 28ed1b876672d..1f566e13f070a 100644
--- a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
+++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp
@@ -146,6 +146,13 @@ static const StringMap<StringSet<>> XFailTestNames = {{
 }};
 
 struct TestData {
+  TestData() = default;
+  explicit TestData(const json::Object &TestCase)
+      : TemplateStr(*TestCase.getString("template")),
+        ExpectedStr(*TestCase.getString("expected")),
+        Name(*TestCase.getString("name")), Data(TestCase.get("data")),
+        Partials(TestCase.get("partials")) {}
+
   static Expected<TestData> createTestData(json::Object *TestCase,
                                            StringRef InputFile) {
     // If any of the needed elements are missing, we cannot continue.
@@ -157,19 +164,14 @@ struct TestData {
           llvm::inconvertibleErrorCode(),
           "invalid JSON schema in test file: " + InputFile + "\n");
 
-    return TestData{TestCase->getString("template").value(),
-                    TestCase->getString("expected").value(),
-                    TestCase->getString("name").value(), TestCase->get("data"),
-                    TestCase->get("partials")};
+    return TestData(*TestCase);
   }
 
-  TestData() = default;
-
   StringRef TemplateStr;
   StringRef ExpectedStr;
   StringRef Name;
-  Value *Data;
-  Value *Partials;
+  const Value *Data;
+  const Value *Partials;
 };
 
 static void reportTestFailure(const TestData &TD, StringRef ActualStr,
@@ -191,7 +193,7 @@ static void reportTestFailure(const TestData &TD, StringRef ActualStr,
   }
 }
 
-static void registerPartials(Value *Partials, Template &T) {
+static void registerPartials(const Value *Partials, Template &T) {
   if (!Partials)
     return;
   for (const auto &[Partial, Str] : *Partials->getAsObject())

From 9b679889b596aa5076062d5fbbdd01e3532b4ff5 Mon Sep 17 00:00:00 2001
From: Cyndy Ishida <cyndy_ishida@apple.com>
Date: Thu, 12 Jun 2025 08:24:50 -0700
Subject: [PATCH 248/851] [clang][darwin] Fix assertion failure when reporting
 fatal errors when inferring OS versions (#143817)

---
 .../clang/Basic/DiagnosticDriverKinds.td      |  2 +
 clang/lib/Driver/ToolChains/Darwin.cpp        | 52 +++++++++++++------
 .../Driver/darwin-invalid-version-range.c     | 29 +++++++++++
 3 files changed, 68 insertions(+), 15 deletions(-)
 create mode 100644 clang/test/Driver/darwin-invalid-version-range.c

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 20fb47237c56f..29f6480ba935c 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -206,6 +206,8 @@ def err_drv_cannot_open_randomize_layout_seed_file : Error<
   "cannot read randomize layout seed file '%0'">;
 def err_drv_invalid_version_number : Error<
   "invalid version number in '%0'">;
+def err_drv_invalid_version_number_inferred
+    : Error<"invalid version number '%0' inferred from '%1'">;
 def err_drv_missing_version_number : Error<"missing version number in '%0'">;
 def err_drv_kcfi_arity_unsupported_target : Error<
   "target '%0' is unsupported by -fsanitize-kcfi-arity">;
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index e987ef78920e8..e5075cbcaf660 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1793,16 +1793,23 @@ struct DarwinPlatform {
     case TargetArg:
     case MTargetOSArg:
     case OSVersionArg:
-    case InferredFromSDK:
-    case InferredFromArch:
       assert(Arg && "OS version argument not yet inferred");
       return Arg->getAsString(Args);
     case DeploymentTargetEnv:
       return (llvm::Twine(EnvVarName) + "=" + OSVersionStr).str();
+    case InferredFromSDK:
+    case InferredFromArch:
+      llvm_unreachable("Cannot print arguments for inferred OS version");
     }
     llvm_unreachable("Unsupported Darwin Source Kind");
   }
 
+  // Returns the inferred source of how the OS version was resolved.
+  std::string getInferredSource() {
+    assert(!isExplicitlySpecified() && "OS version was not inferred");
+    return InferredSource.str();
+  }
+
   void setEnvironment(llvm::Triple::EnvironmentType EnvType,
                       const VersionTuple &OSVersion,
                       const std::optional<DarwinSDKInfo> &SDKInfo) {
@@ -1876,7 +1883,8 @@ struct DarwinPlatform {
     Result.EnvVarName = EnvVarName;
     return Result;
   }
-  static DarwinPlatform createFromSDK(DarwinPlatformKind Platform,
+  static DarwinPlatform createFromSDK(StringRef SDKRoot,
+                                      DarwinPlatformKind Platform,
                                       StringRef Value,
                                       bool IsSimulator = false) {
     DarwinPlatform Result(InferredFromSDK, Platform,
@@ -1884,11 +1892,15 @@ struct DarwinPlatform {
     if (IsSimulator)
       Result.Environment = DarwinEnvironmentKind::Simulator;
     Result.InferSimulatorFromArch = false;
+    Result.InferredSource = SDKRoot;
     return Result;
   }
-  static DarwinPlatform createFromArch(llvm::Triple::OSType OS,
+  static DarwinPlatform createFromArch(StringRef Arch, llvm::Triple::OSType OS,
                                        VersionTuple Version) {
-    return DarwinPlatform(InferredFromArch, getPlatformFromOS(OS), Version);
+    auto Result =
+        DarwinPlatform(InferredFromArch, getPlatformFromOS(OS), Version);
+    Result.InferredSource = Arch;
+    return Result;
   }
 
   /// Constructs an inferred SDKInfo value based on the version inferred from
@@ -1975,6 +1987,9 @@ struct DarwinPlatform {
   bool InferSimulatorFromArch = true;
   std::pair<Arg *, std::string> Arguments;
   StringRef EnvVarName;
+  // If the DarwinPlatform information is derived from an inferred source, this
+  // captures what that source input was for error reporting.
+  StringRef InferredSource;
   // When compiling for a zippered target, this value represents the target
   // triple encoded in the target variant.
   std::optional<llvm::Triple> TargetVariantTriple;
@@ -2143,26 +2158,27 @@ inferDeploymentTargetFromSDK(DerivedArgList &Args,
       [&](StringRef SDK) -> std::optional<DarwinPlatform> {
     if (SDK.starts_with("iPhoneOS") || SDK.starts_with("iPhoneSimulator"))
       return DarwinPlatform::createFromSDK(
-          Darwin::IPhoneOS, Version,
+          isysroot, Darwin::IPhoneOS, Version,
           /*IsSimulator=*/SDK.starts_with("iPhoneSimulator"));
     else if (SDK.starts_with("MacOSX"))
-      return DarwinPlatform::createFromSDK(Darwin::MacOS,
+      return DarwinPlatform::createFromSDK(isysroot, Darwin::MacOS,
                                            getSystemOrSDKMacOSVersion(Version));
     else if (SDK.starts_with("WatchOS") || SDK.starts_with("WatchSimulator"))
       return DarwinPlatform::createFromSDK(
-          Darwin::WatchOS, Version,
+          isysroot, Darwin::WatchOS, Version,
           /*IsSimulator=*/SDK.starts_with("WatchSimulator"));
     else if (SDK.starts_with("AppleTVOS") ||
              SDK.starts_with("AppleTVSimulator"))
       return DarwinPlatform::createFromSDK(
-          Darwin::TvOS, Version,
+          isysroot, Darwin::TvOS, Version,
           /*IsSimulator=*/SDK.starts_with("AppleTVSimulator"));
     else if (SDK.starts_with("XR"))
       return DarwinPlatform::createFromSDK(
-          Darwin::XROS, Version,
+          isysroot, Darwin::XROS, Version,
           /*IsSimulator=*/SDK.contains("Simulator"));
     else if (SDK.starts_with("DriverKit"))
-      return DarwinPlatform::createFromSDK(Darwin::DriverKit, Version);
+      return DarwinPlatform::createFromSDK(isysroot, Darwin::DriverKit,
+                                           Version);
     return std::nullopt;
   };
   if (auto Result = CreatePlatformFromSDKName(SDK))
@@ -2236,7 +2252,7 @@ inferDeploymentTargetFromArch(DerivedArgList &Args, const Darwin &Toolchain,
   if (OSTy == llvm::Triple::UnknownOS)
     return std::nullopt;
   return DarwinPlatform::createFromArch(
-      OSTy, getInferredOSVersion(OSTy, Triple, TheDriver));
+      MachOArchName, OSTy, getInferredOSVersion(OSTy, Triple, TheDriver));
 }
 
 /// Returns the deployment target that's specified using the -target option.
@@ -2455,9 +2471,15 @@ void Darwin::AddDeploymentTarget(DerivedArgList &Args) const {
   }
 
   assert(PlatformAndVersion && "Unable to infer Darwin variant");
-  if (!PlatformAndVersion->isValidOSVersion())
-    getDriver().Diag(diag::err_drv_invalid_version_number)
-        << PlatformAndVersion->getAsString(Args, Opts);
+  if (!PlatformAndVersion->isValidOSVersion()) {
+    if (PlatformAndVersion->isExplicitlySpecified())
+      getDriver().Diag(diag::err_drv_invalid_version_number)
+          << PlatformAndVersion->getAsString(Args, Opts);
+    else
+      getDriver().Diag(diag::err_drv_invalid_version_number_inferred)
+          << PlatformAndVersion->getOSVersion().getAsString()
+          << PlatformAndVersion->getInferredSource();
+  }
   // After the deployment OS version has been resolved, set it to the canonical
   // version before further error detection and converting to a proper target
   // triple.
diff --git a/clang/test/Driver/darwin-invalid-version-range.c b/clang/test/Driver/darwin-invalid-version-range.c
new file mode 100644
index 0000000000000..84603aec1d2f5
--- /dev/null
+++ b/clang/test/Driver/darwin-invalid-version-range.c
@@ -0,0 +1,29 @@
+/// This test validates that the various ways to assign an invalid deployment version are captured and detected.
+// REQUIRES: system-darwin && native
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// RUN: env SDKROOT=%t/iPhoneOS21.0.sdk not %clang -m64 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=SDKROOT
+
+// RUN: not %clang -isysroot %t/iPhoneOS21.0.sdk -m64 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=SYSROOT
+
+// RUN: not %clang -target arm64-apple-ios21 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=TARGET
+
+// RUN: not %clang -mtargetos=ios21 -arch arm64 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=MTARGET
+
+// RUN: env IPHONEOS_DEPLOYMENT_TARGET=21.0 not %clang -arch arm64 -c -### %s 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=DEPLOY_VAR
+
+// SDKROOT:    error: invalid version number '21.0' inferred from '{{.*}}.sdk'
+// SYSROOT:    error: invalid version number '21.0' inferred from '{{.*}}.sdk'
+// TARGET:     error: invalid version number in '-target arm64-apple-ios21'
+// MTARGET:    error: invalid version number in '-mtargetos=ios21'
+// DEPLOY_VAR: error: invalid version number in 'IPHONEOS_DEPLOYMENT_TARGET=21.0'
+
+//--- iPhoneOS21.0.sdk/SDKSettings.json
+{"Version":"21.0", "MaximumDeploymentTarget": "21.0.99"}

From f6eaa2b00cc8d6421934cc92d4b210348809d700 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Thu, 12 Jun 2025 08:29:49 -0700
Subject: [PATCH 249/851] Reland "[clang-format][NFC] Clean up fillRanges() in
 ClangFormat.cpp" (#143477)

Reapply https://github.com/llvm/llvm-project/pull/143236 and fix the bug
reported in
https://github.com/llvm/llvm-project/pull/143236#issuecomment-2957102180.
---
 clang/tools/clang-format/ClangFormat.cpp | 52 +++++++++++-------------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index b22d3aaf3183b..24ad3cb42254d 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -244,17 +244,17 @@ static bool fillRanges(MemoryBuffer *Code,
   DiagnosticsEngine Diagnostics(
       IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs), DiagOpts);
   SourceManager Sources(Diagnostics, Files);
-  FileID ID = createInMemoryFile("<irrelevant>", *Code, Sources, Files,
-                                 InMemoryFileSystem.get());
+  const auto ID = createInMemoryFile("<irrelevant>", *Code, Sources, Files,
+                                     InMemoryFileSystem.get());
   if (!LineRanges.empty()) {
     if (!Offsets.empty() || !Lengths.empty()) {
       errs() << "error: cannot use -lines with -offset/-length\n";
       return true;
     }
 
-    for (unsigned i = 0, e = LineRanges.size(); i < e; ++i) {
+    for (const auto &LineRange : LineRanges) {
       unsigned FromLine, ToLine;
-      if (parseLineRange(LineRanges[i], FromLine, ToLine)) {
+      if (parseLineRange(LineRange, FromLine, ToLine)) {
         errs() << "error: invalid <start line>:<end line> pair\n";
         return true;
       }
@@ -266,12 +266,12 @@ static bool fillRanges(MemoryBuffer *Code,
         errs() << "error: start line should not exceed end line\n";
         return true;
       }
-      SourceLocation Start = Sources.translateLineCol(ID, FromLine, 1);
-      SourceLocation End = Sources.translateLineCol(ID, ToLine, UINT_MAX);
+      const auto Start = Sources.translateLineCol(ID, FromLine, 1);
+      const auto End = Sources.translateLineCol(ID, ToLine, UINT_MAX);
       if (Start.isInvalid() || End.isInvalid())
         return true;
-      unsigned Offset = Sources.getFileOffset(Start);
-      unsigned Length = Sources.getFileOffset(End) - Offset;
+      const auto Offset = Sources.getFileOffset(Start);
+      const auto Length = Sources.getFileOffset(End) - Offset;
       Ranges.push_back(tooling::Range(Offset, Length));
     }
     return false;
@@ -279,32 +279,28 @@ static bool fillRanges(MemoryBuffer *Code,
 
   if (Offsets.empty())
     Offsets.push_back(0);
-  if (Offsets.size() != Lengths.size() &&
-      !(Offsets.size() == 1 && Lengths.empty())) {
+  const bool EmptyLengths = Lengths.empty();
+  unsigned Length = 0;
+  if (Offsets.size() == 1 && EmptyLengths) {
+    Length = Sources.getFileOffset(Sources.getLocForEndOfFile(ID)) - Offsets[0];
+  } else if (Offsets.size() != Lengths.size()) {
     errs() << "error: number of -offset and -length arguments must match.\n";
     return true;
   }
-  for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
-    if (Offsets[i] >= Code->getBufferSize()) {
-      errs() << "error: offset " << Offsets[i] << " is outside the file\n";
+  for (unsigned I = 0, E = Offsets.size(), CodeSize = Code->getBufferSize();
+       I < E; ++I) {
+    const auto Offset = Offsets[I];
+    if (Offset >= CodeSize) {
+      errs() << "error: offset " << Offset << " is outside the file\n";
       return true;
     }
-    SourceLocation Start =
-        Sources.getLocForStartOfFile(ID).getLocWithOffset(Offsets[i]);
-    SourceLocation End;
-    if (i < Lengths.size()) {
-      if (Offsets[i] + Lengths[i] > Code->getBufferSize()) {
-        errs() << "error: invalid length " << Lengths[i]
-               << ", offset + length (" << Offsets[i] + Lengths[i]
-               << ") is outside the file.\n";
-        return true;
-      }
-      End = Start.getLocWithOffset(Lengths[i]);
-    } else {
-      End = Sources.getLocForEndOfFile(ID);
+    if (!EmptyLengths)
+      Length = Lengths[I];
+    if (Offset + Length > CodeSize) {
+      errs() << "error: invalid length " << Length << ", offset + length ("
+             << Offset + Length << ") is outside the file.\n";
+      return true;
     }
-    unsigned Offset = Sources.getFileOffset(Start);
-    unsigned Length = Sources.getFileOffset(End) - Offset;
     Ranges.push_back(tooling::Range(Offset, Length));
   }
   return false;

From f12b1ed11672bc40a53fb1180541b2fda6e7d9fc Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski@arm.com>
Date: Thu, 12 Jun 2025 16:35:36 +0100
Subject: [PATCH 250/851] [flang][OpenMP] Add TODOs for target [teams|parallel]
 private (#143706)

Using the private clause on `target teams` or `target parallel` is not
currently implemented and causes crashes during lowering. Add
appropriate TODOs.

Resolves https://github.com/llvm/llvm-project/issues/116428.

Signed-off-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
---
 flang/lib/Lower/OpenMP/OpenMP.cpp                   |  7 +++++++
 .../Lower/OpenMP/Todo/target-parallel-private.f90   | 13 +++++++++++++
 .../test/Lower/OpenMP/Todo/target-teams-private.f90 | 13 +++++++++++++
 3 files changed, 33 insertions(+)
 create mode 100644 flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/target-teams-private.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index c13fa471978db..82673f0948a5b 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -4024,6 +4024,13 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
           parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(clause.id));
       TODO(clauseLocation, name + " clause is not implemented yet");
     }
+
+    if (std::holds_alternative<clause::Private>(clause.u) &&
+        origDirective == llvm::omp::Directive::OMPD_target_teams)
+      TODO(clauseLocation, "TARGET TEAMS PRIVATE is not implemented yet");
+    if (std::holds_alternative<clause::Private>(clause.u) &&
+        origDirective == llvm::omp::Directive::OMPD_target_parallel)
+      TODO(clauseLocation, "TARGET PARALLEL PRIVATE is not implemented yet");
   }
 
   llvm::omp::Directive directive =
diff --git a/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90 b/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
new file mode 100644
index 0000000000000..e820143021f9a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
@@ -0,0 +1,13 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+!===============================================================================
+! `private` clause on `target parallel`
+!===============================================================================
+
+! CHECK: not yet implemented: TARGET PARALLEL PRIVATE is not implemented yet
+subroutine target_teams_private()
+integer, dimension(3) :: i
+!$omp target parallel private(i)
+!$omp end target parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/Todo/target-teams-private.f90 b/flang/test/Lower/OpenMP/Todo/target-teams-private.f90
new file mode 100644
index 0000000000000..c8d998a5cbf94
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/target-teams-private.f90
@@ -0,0 +1,13 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+!===============================================================================
+! `private` clause on `target teams`
+!===============================================================================
+
+! CHECK: not yet implemented: TARGET TEAMS PRIVATE is not implemented yet
+subroutine target_teams_private()
+integer, dimension(3) :: i
+!$omp target teams private(i)
+!$omp end target teams
+end subroutine

From 13fe07d670e8a115929c9e595c4490ef5c75f583 Mon Sep 17 00:00:00 2001
From: tynasello-google <tynasello@google.com>
Date: Thu, 12 Jun 2025 08:39:28 -0700
Subject: [PATCH 251/851] [libc++] Expand Android libc++ test config files
 (#142846)

Parameterize (and rename) existing libc++/libc++abi test configuration
files for the Android NDK to work for both the NDK and platform.

Android LLVM downstream seeks to test libc++ for both the NDK and
platform build (currently only testing the NDK), which will use almost
identical test configuration files. The only difference is the name of
the libc++ shared object used. Because of this we parameterize the
current test files (for both libc++ and libc++abi) with the existing
LIBCXX_SHARED_OUTPUT_NAME cmake variable, and rename the file
accordingly.
---
 libcxx/cmake/caches/AndroidNDK.cmake                 |  4 ++--
 ...android-ndk.cfg.in => llvm-libc++-android.cfg.in} | 10 +++++-----
 ...roid-ndk.cfg.in => llvm-libc++abi-android.cfg.in} | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)
 rename libcxx/test/configs/{llvm-libc++-android-ndk.cfg.in => llvm-libc++-android.cfg.in} (83%)
 rename libcxxabi/test/configs/{llvm-libc++abi-android-ndk.cfg.in => llvm-libc++abi-android.cfg.in} (72%)

diff --git a/libcxx/cmake/caches/AndroidNDK.cmake b/libcxx/cmake/caches/AndroidNDK.cmake
index 298518781e9b7..1a04b7fbb217d 100644
--- a/libcxx/cmake/caches/AndroidNDK.cmake
+++ b/libcxx/cmake/caches/AndroidNDK.cmake
@@ -33,5 +33,5 @@ set(CMAKE_CXX_COMPILER_WORKS ON CACHE BOOL "")
 
 # Use adb to push tests to a locally-connected device (e.g. emulator) and run
 # them.
-set(LIBCXX_TEST_CONFIG "llvm-libc++-android-ndk.cfg.in" CACHE STRING "")
-set(LIBCXXABI_TEST_CONFIG "llvm-libc++abi-android-ndk.cfg.in" CACHE STRING "")
+set(LIBCXX_TEST_CONFIG "llvm-libc++-android.cfg.in" CACHE STRING "")
+set(LIBCXXABI_TEST_CONFIG "llvm-libc++abi-android.cfg.in" CACHE STRING "")
diff --git a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in b/libcxx/test/configs/llvm-libc++-android.cfg.in
similarity index 83%
rename from libcxx/test/configs/llvm-libc++-android-ndk.cfg.in
rename to libcxx/test/configs/llvm-libc++-android.cfg.in
index 31a07f6471651..9362c68e8f7a8 100644
--- a/libcxx/test/configs/llvm-libc++-android-ndk.cfg.in
+++ b/libcxx/test/configs/llvm-libc++-android.cfg.in
@@ -1,5 +1,5 @@
 # This testing configuration handles running the test suite against LLVM's
-# libc++ using adb and a libc++_shared.so library on Android.
+# libc++ using adb on Android.
 
 lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
 
@@ -27,11 +27,11 @@ if re.match(r'i686-linux-android(21|22|23)$', config.target_triple):
     compile_flags += ' -mstackrealign'
 config.substitutions.append(('%{compile_flags}', compile_flags))
 
-# The NDK library is called "libc++_shared.so". Use LD_LIBRARY_PATH to find
-# libc++_shared.so because older Bionic dynamic loaders don't support rpath
-# lookup.
+# The platform library is called "libc++.so" and the NDK library is called "libc++_shared.so". 
+# Use LD_LIBRARY_PATH to find the libcxx shared object because older Bionic dynamic loaders 
+# don't support rpath lookup.
 config.substitutions.append(('%{link_flags}',
-    '-nostdlib++ -L %{lib-dir} -lc++_shared'
+    '-nostdlib++ -L %{lib-dir} -l@LIBCXX_SHARED_OUTPUT_NAME@'
 ))
 config.substitutions.append(('%{exec}',
     '%{executor}' +
diff --git a/libcxxabi/test/configs/llvm-libc++abi-android-ndk.cfg.in b/libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
similarity index 72%
rename from libcxxabi/test/configs/llvm-libc++abi-android-ndk.cfg.in
rename to libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
index f2cb62a32d4e8..bc58446615361 100644
--- a/libcxxabi/test/configs/llvm-libc++abi-android-ndk.cfg.in
+++ b/libcxxabi/test/configs/llvm-libc++abi-android.cfg.in
@@ -1,5 +1,5 @@
 # This testing configuration handles running the test suite against LLVM's
-# libc++abi using adb and a libc++_shared.so library on Android.
+# libc++abi using adb on Android.
 
 lit_config.load_config(config, '@CMAKE_CURRENT_BINARY_DIR@/cmake-bridge.cfg')
 
@@ -19,12 +19,12 @@ config.substitutions.append(('%{compile_flags}',
     '-nostdinc++ -I %{include} -I %{cxx-include} -I %{cxx-target-include} %{maybe-include-libunwind} -I %{libcxx}/test/support -I %{libcxx}/src -D_LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS'
 ))
 
-# The NDK library is called "libc++_shared.so". Use LD_LIBRARY_PATH to find
-# libc++_shared.so because older Bionic dynamic loaders don't support rpath
-# lookup. The Android libc++ shared library exports libc++abi, so we don't need
-# to link with -lc++abi.
+# The platform library is called "libc++.so" and the NDK library is called "libc++_shared.so". 
+# Use LD_LIBRARY_PATH to find the libcxx shared object because older Bionic dynamic loaders 
+# don't support rpath lookup. The Android libc++ shared library exports libc++abi, so we 
+# don't need to link with -lc++abi.
 config.substitutions.append(('%{link_flags}',
-    '-nostdlib++ -L %{lib} -lc++_shared'
+    '-nostdlib++ -L %{lib} -l@LIBCXX_SHARED_OUTPUT_NAME@'
 ))
 config.substitutions.append(('%{exec}',
     '%{executor}' +

From 1c1df94d09820959c771cb4aaae4d36cdf5cab5a Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Thu, 12 Jun 2025 16:48:57 +0100
Subject: [PATCH 252/851] [lldb][Commands][NFC] Extract memory find expression
 evaluation into helpers (#143686)

This patch factors out the `-e` option logic into two helper functions.
The `EvaluateExpression` helper might seem redundant but I'll be adding
to it in a follow-up patch to fix an issue when running `memory find -e`
for Swift targets.

Also adds test coverage for the error cases that were previously
untested.

rdar://152113525
---
 lldb/source/Commands/CommandObjectMemory.cpp  | 101 ++++++++++--------
 .../memory/find/TestMemoryFind.py             |  41 +++++++
 .../API/functionalities/memory/find/main.cpp  |  15 +++
 3 files changed, 114 insertions(+), 43 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp
index 7140333bb3cde..85ae9f8f9e8cb 100644
--- a/lldb/source/Commands/CommandObjectMemory.cpp
+++ b/lldb/source/Commands/CommandObjectMemory.cpp
@@ -885,6 +885,52 @@ class CommandObjectMemoryRead : public CommandObjectParsed {
 #define LLDB_OPTIONS_memory_find
 #include "CommandOptions.inc"
 
+static llvm::Error CopyExpressionResult(ValueObject &result,
+                                        DataBufferHeap &buffer) {
+  uint64_t value = result.GetValueAsUnsigned(0);
+  auto size_or_err = result.GetCompilerType().GetByteSize(nullptr);
+  if (!size_or_err)
+    return size_or_err.takeError();
+
+  switch (*size_or_err) {
+  case 1: {
+    uint8_t byte = (uint8_t)value;
+    buffer.CopyData(&byte, 1);
+  } break;
+  case 2: {
+    uint16_t word = (uint16_t)value;
+    buffer.CopyData(&word, 2);
+  } break;
+  case 4: {
+    uint32_t lword = (uint32_t)value;
+    buffer.CopyData(&lword, 4);
+  } break;
+  case 8: {
+    buffer.CopyData(&value, 8);
+  } break;
+  default:
+    return llvm::createStringError(
+        "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are "
+        "supported. For other pattern sizes the --string (-s) option may be "
+        "used.");
+  }
+
+  return llvm::Error::success();
+}
+
+static llvm::Expected<ValueObjectSP>
+EvaluateExpression(llvm::StringRef expression, StackFrame &frame,
+                   Process &process) {
+  ValueObjectSP result_sp;
+  auto status =
+      process.GetTarget().EvaluateExpression(expression, &frame, result_sp);
+  if (status != eExpressionCompleted || !result_sp)
+    return llvm::createStringError(
+        "expression evaluation failed. pass a string instead");
+
+  return result_sp;
+}
+
 // Find the specified data in memory
 class CommandObjectMemoryFind : public CommandObjectParsed {
 public:
@@ -1026,49 +1072,18 @@ class CommandObjectMemoryFind : public CommandObjectParsed {
       }
       buffer.CopyData(str);
     } else if (m_memory_options.m_expr.OptionWasSet()) {
-      StackFrame *frame = m_exe_ctx.GetFramePtr();
-      ValueObjectSP result_sp;
-      if ((eExpressionCompleted ==
-           process->GetTarget().EvaluateExpression(
-               m_memory_options.m_expr.GetValueAs<llvm::StringRef>().value_or(
-                   ""),
-               frame, result_sp)) &&
-          result_sp) {
-        uint64_t value = result_sp->GetValueAsUnsigned(0);
-        std::optional<uint64_t> size = llvm::expectedToOptional(
-            result_sp->GetCompilerType().GetByteSize(nullptr));
-        if (!size)
-          return;
-        switch (*size) {
-        case 1: {
-          uint8_t byte = (uint8_t)value;
-          buffer.CopyData(&byte, 1);
-        } break;
-        case 2: {
-          uint16_t word = (uint16_t)value;
-          buffer.CopyData(&word, 2);
-        } break;
-        case 4: {
-          uint32_t lword = (uint32_t)value;
-          buffer.CopyData(&lword, 4);
-        } break;
-        case 8: {
-          buffer.CopyData(&value, 8);
-        } break;
-        case 3:
-        case 5:
-        case 6:
-        case 7:
-          result.AppendError("unknown type. pass a string instead");
-          return;
-        default:
-          result.AppendError(
-              "result size larger than 8 bytes. pass a string instead");
-          return;
-        }
-      } else {
-        result.AppendError(
-            "expression evaluation failed. pass a string instead");
+      auto result_or_err = EvaluateExpression(
+          m_memory_options.m_expr.GetValueAs<llvm::StringRef>().value_or(""),
+          m_exe_ctx.GetFrameRef(), *process);
+      if (!result_or_err) {
+        result.AppendError(llvm::toString(result_or_err.takeError()));
+        return;
+      }
+
+      ValueObjectSP result_sp = *result_or_err;
+
+      if (auto err = CopyExpressionResult(*result_sp, buffer)) {
+        result.AppendError(llvm::toString(std::move(err)));
         return;
       }
     } else {
diff --git a/lldb/test/API/functionalities/memory/find/TestMemoryFind.py b/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
index 09611cc808777..72426e75e013e 100644
--- a/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
+++ b/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
@@ -79,3 +79,44 @@ def test_memory_find(self):
             'memory find -s "nothere" `stringdata` `stringdata+10`',
             substrs=["data not found within the range."],
         )
+
+        # Expression results with unsupported result types.
+        self.expect(
+            'memory find -e "ThreeBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "FiveBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "SixBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "SevenBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "NineBytes{}" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Only expressions resulting in 1, 2, 4, or 8-byte-sized values are supported"
+            ],
+            error=True,
+        )
diff --git a/lldb/test/API/functionalities/memory/find/main.cpp b/lldb/test/API/functionalities/memory/find/main.cpp
index e3dcfc762ee0f..15c8df1a9fcf1 100644
--- a/lldb/test/API/functionalities/memory/find/main.cpp
+++ b/lldb/test/API/functionalities/memory/find/main.cpp
@@ -1,9 +1,24 @@
 #include <stdio.h>
 #include <stdint.h>
 
+template <size_t T> struct [[gnu::packed]] Payload {
+  uint8_t data[T];
+};
+
+using ThreeBytes = Payload<3>;
+using FiveBytes = Payload<5>;
+using SixBytes = Payload<5>;
+using SevenBytes = Payload<7>;
+using NineBytes = Payload<9>;
+
 int main (int argc, char const *argv[])
 {
     const char* stringdata = "hello world; I like to write text in const char pointers";
     uint8_t bytedata[] = {0xAA,0xBB,0xCC,0xDD,0xEE,0xFF,0x00,0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,0x99};
+    ThreeBytes b1;
+    FiveBytes b2;
+    SixBytes b3;
+    SevenBytes b4;
+    NineBytes b5;
     return 0; // break here
 }

From 2a905dd1ebb46a6865b1f4743589b50cdb2cb4f0 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Thu, 12 Jun 2025 08:41:54 -0700
Subject: [PATCH 253/851] [Matrix] Use range-for in Visit* Result construction.
 NFC

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index ce6eaa292d8fb..b32160ff275b9 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -19,6 +19,7 @@
 
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -1223,7 +1224,7 @@ class LowerMatrixIntrinsics {
       MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder);
       Builder.setFastMathFlags(getFastMathFlags(Inst));
 
-      for (auto &Vector : M.vectors()) {
+      for (auto *Vector : M.vectors()) {
         switch (Inst->getIntrinsicID()) {
         case Intrinsic::abs:
           Result.addVector(Builder.CreateBinaryIntrinsic(Intrinsic::abs, Vector,
@@ -2256,9 +2257,8 @@ class LowerMatrixIntrinsics {
 
     Builder.setFastMathFlags(getFastMathFlags(Inst));
 
-    for (unsigned I = 0; I < SI.getNumVectors(); ++I)
-      Result.addVector(Builder.CreateBinOp(Inst->getOpcode(), A.getVector(I),
-                                           B.getVector(I)));
+    for (auto [AV, BV] : llvm::zip_equal(A.vectors(), B.vectors()))
+      Result.addVector(Builder.CreateBinOp(Inst->getOpcode(), AV, BV));
 
     return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                    Result.getNumVectors());
@@ -2285,8 +2285,8 @@ class LowerMatrixIntrinsics {
       }
     };
 
-    for (unsigned I = 0; I < SI.getNumVectors(); ++I)
-      Result.addVector(BuildVectorOp(M.getVector(I)));
+    for (auto *Vector : M.vectors())
+      Result.addVector(BuildVectorOp(Vector));
 
     return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                    Result.getNumVectors());
@@ -2307,7 +2307,7 @@ class LowerMatrixIntrinsics {
     auto *NewVTy = VectorType::get(OrigVTy->getElementType(),
                                    ElementCount::getFixed(M.getStride()));
 
-    for (auto &Vector : M.vectors())
+    for (auto *Vector : M.vectors())
       Result.addVector(Builder.CreateCast(Inst->getOpcode(), Vector, NewVTy));
 
     return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
@@ -2336,9 +2336,8 @@ class LowerMatrixIntrinsics {
       CondV[1] = Cond;
     }
 
-    for (unsigned I = 0, E = Shape.getNumVectors(); I != E; ++I)
-      Result.addVector(
-          Builder.CreateSelect(CondV[I], A.getVector(I), B.getVector(I)));
+    for (auto [CV, AV, BV] : llvm::zip_equal(CondV, A.vectors(), B.vectors()))
+      Result.addVector(Builder.CreateSelect(CV, AV, BV));
 
     return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
                                    Result.getNumVectors());

From 316f530724ee2e870886e75729799afbcc1ff8d3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 16:51:09 +0100
Subject: [PATCH 254/851] [X86] getTargetConstantBitsFromNode - handle
 EXTRACT_SUBVECTOR through bitcasts (#143886)

Generalize the extraction index/width to account for any changes in type through bitcasts
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 38 ++++++++---------
 .../zero_extend_vector_inreg_of_broadcast.ll  | 41 ++++++++-----------
 ...d_vector_inreg_of_broadcast_from_memory.ll | 30 ++++++--------
 3 files changed, 50 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f0fbf55e97be9..b4670e270141f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5242,25 +5242,25 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   }
 
   // Extract constant bits from a subvector's source.
-  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-    // TODO - support extract_subvector through bitcasts.
-    if (EltSizeInBits != VT.getScalarSizeInBits())
-      return false;
-
-    if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
-                                      UndefElts, EltBits, AllowWholeUndefs,
-                                      AllowPartialUndefs)) {
-      EVT SrcVT = Op.getOperand(0).getValueType();
-      unsigned NumSrcElts = SrcVT.getVectorNumElements();
-      unsigned NumSubElts = VT.getVectorNumElements();
-      unsigned BaseIdx = Op.getConstantOperandVal(1);
-      UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
-      if ((BaseIdx + NumSubElts) != NumSrcElts)
-        EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
-      if (BaseIdx != 0)
-        EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
-      return true;
-    }
+  if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts,
+                                    EltBits, AllowWholeUndefs,
+                                    AllowPartialUndefs)) {
+    EVT SrcVT = Op.getOperand(0).getValueType();
+    unsigned NumSrcElts = SrcVT.getSizeInBits() / EltSizeInBits;
+    unsigned NumSubElts = VT.getSizeInBits() / EltSizeInBits;
+    unsigned BaseOfs = Op.getConstantOperandVal(1) * VT.getScalarSizeInBits();
+    unsigned BaseIdx = BaseOfs / EltSizeInBits;
+    assert((SrcVT.getSizeInBits() % EltSizeInBits) == 0 &&
+           (VT.getSizeInBits() % EltSizeInBits) == 0 &&
+           (BaseOfs % EltSizeInBits) == 0 && "Bad subvector index");
+
+    UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
+    if ((BaseIdx + NumSubElts) != NumSrcElts)
+      EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
+    if (BaseIdx != 0)
+      EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
+    return true;
   }
 
   // Extract constant bits from shuffle node sources.
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 45ccc39fb2542..ed53c3693c9dc 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -3567,14 +3567,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
 ; AVX-NEXT:    retq
 ;
@@ -3757,14 +3756,14 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm3, %xmm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm3, 32(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
@@ -3955,10 +3954,9 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
@@ -4181,17 +4179,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
-; AVX-NEXT:    vpaddb 16(%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
-; AVX-NEXT:    vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT:    vmovdqa %xmm0, 32(%rcx)
+; AVX-NEXT:    vmovdqa %xmm2, 32(%rcx)
 ; AVX-NEXT:    retq
 ;
 ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
@@ -4379,10 +4376,9 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpaddb 32(%rdx), %xmm0, %xmm2
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rcx)
@@ -4517,10 +4513,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
 ; AVX-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vmovaps 32(%rdx), %ymm2
 ; AVX-NEXT:    vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpaddb 16(%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    vmovaps %ymm2, 32(%rcx)
 ; AVX-NEXT:    vmovdqa %xmm1, (%rcx)
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index 31920d8348fbe..239472c5cd1c1 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -2868,14 +2868,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
 ; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
 ; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT:    vmovdqa %xmm2, 32(%rdx)
 ; AVX-NEXT:    retq
 ;
@@ -2986,7 +2985,8 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7]
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
@@ -3135,9 +3135,8 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX-NEXT:    vpaddb 32(%rsi), %xmm0, %xmm2
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
@@ -3319,13 +3318,12 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
-; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
 ; AVX-NEXT:    vmovdqa %xmm2, 32(%rdx)
 ; AVX-NEXT:    retq
 ;
@@ -3469,9 +3467,8 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpaddb 32(%rsi), %xmm0, %xmm2
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vmovdqa %xmm1, (%rdx)
@@ -3584,9 +3581,8 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT:    vmovaps 32(%rsi), %ymm2
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    vpaddb 16(%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX-NEXT:    vmovaps %ymm2, 32(%rdx)

From a53003fe23cb6c871e72d70ff2d3a075a7490da2 Mon Sep 17 00:00:00 2001
From: kotborealis <kotborealis@awooo.ru>
Date: Thu, 12 Jun 2025 18:51:22 +0300
Subject: [PATCH 255/851] [libc++] Update GDB pretty-printer to work with GDB
 17 (#142106)

This patch fixes an issue in libcxx/utils/gdb/libcxx/printers.py.

With gdb 17 (binutils 2_44) pretty-printers do not work anymore because
calls to `gdb.printing` requires `import gdb.printing` statement, which
was missing from the `printers.py`.

This was broken after commit https://github.com/bminor/binutils-gdb/commit/fc14343205d3a
and `import gdb.printing` was first referenced in https://github.com/bminor/binutils-gdb/commit/ee06c79b0f.

Co-authored-by: Dmitry Chestnykh <dm.chestnykh@gmail.com>
---
 libcxx/utils/gdb/libcxx/printers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcxx/utils/gdb/libcxx/printers.py b/libcxx/utils/gdb/libcxx/printers.py
index e3d5d87aca325..90bc54d987ee8 100644
--- a/libcxx/utils/gdb/libcxx/printers.py
+++ b/libcxx/utils/gdb/libcxx/printers.py
@@ -14,6 +14,7 @@
 
 import re
 import gdb
+import gdb.printing
 
 # One under-documented feature of the gdb pretty-printer API
 # is that clients can call any other member of the API

From 882b58a90ae0c4a91e1ecda6df3767b0fc44dab1 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Thu, 12 Jun 2025 12:12:01 -0400
Subject: [PATCH 256/851] [DirectX] Reland #142853 with Circular GEP fixes
 (#143747)

This change relands  https://github.com/llvm/llvm-project/pull/142853
It fixes the circular reference issue we were seeing in GEPs
ex `%.flat = getelementptr inbounds [16 x i32], ptr %.flat, i32 0, i32
15`
---
 llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 24 +++--
 llvm/test/CodeGen/DirectX/flatten-array.ll    | 18 ++--
 .../CodeGen/DirectX/flatten-bug-117273.ll     |  4 +-
 .../DirectX/llc-vector-load-scalarize.ll      | 88 +++++++++----------
 .../test/CodeGen/DirectX/scalar-bug-117273.ll |  4 +-
 5 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index a3163a8969642..b1f3f41a28e8b 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -42,7 +42,7 @@ class DXILFlattenArraysLegacy : public ModulePass {
 
 struct GEPData {
   ArrayType *ParentArrayType;
-  Value *ParendOperand;
+  Value *ParentOperand;
   SmallVector<Value *> Indices;
   SmallVector<uint64_t> Dims;
   bool AllIndicesAreConstInt;
@@ -211,7 +211,7 @@ bool DXILFlattenArraysVisitor::visitAllocaInst(AllocaInst &AI) {
 
   ArrayType *FattenedArrayType = ArrayType::get(BaseType, TotalElements);
   AllocaInst *FlatAlloca =
-      Builder.CreateAlloca(FattenedArrayType, nullptr, AI.getName() + ".flat");
+      Builder.CreateAlloca(FattenedArrayType, nullptr, AI.getName() + ".1dim");
   FlatAlloca->setAlignment(AI.getAlign());
   AI.replaceAllUsesWith(FlatAlloca);
   AI.eraseFromParent();
@@ -222,6 +222,10 @@ void DXILFlattenArraysVisitor::recursivelyCollectGEPs(
     GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType,
     Value *PtrOperand, unsigned &GEPChainUseCount, SmallVector<Value *> Indices,
     SmallVector<uint64_t> Dims, bool AllIndicesAreConstInt) {
+  // Check if this GEP is already in the map to avoid circular references
+  if (GEPChainMap.count(&CurrGEP) > 0)
+    return;
+
   Value *LastIndex = CurrGEP.getOperand(CurrGEP.getNumOperands() - 1);
   AllIndicesAreConstInt &= isa<ConstantInt>(LastIndex);
   Indices.push_back(LastIndex);
@@ -271,9 +275,19 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChainBase(
         genInstructionFlattenIndices(GEPInfo.Indices, GEPInfo.Dims, Builder);
 
   ArrayType *FlattenedArrayType = GEPInfo.ParentArrayType;
-  Value *FlatGEP =
-      Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParendOperand, FlatIndex,
-                        GEP.getName() + ".flat", GEP.isInBounds());
+
+  // Don't append '.flat' to an empty string. If the SSA name isn't available
+  // it could conflict with the ParentOperand's name.
+  std::string FlatName = GEP.hasName() ? GEP.getName().str() + ".flat" : "";
+
+  Value *FlatGEP = Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParentOperand,
+                                     {Builder.getInt32(0), FlatIndex}, FlatName,
+                                     GEP.getNoWrapFlags());
+
+  // Note: Old gep will become an invalid instruction after replaceAllUsesWith.
+  // Erase the old GEP in the map before to avoid invalid instructions
+  // and circular references.
+  GEPChainMap.erase(&GEP);
 
   GEP.replaceAllUsesWith(FlatGEP);
   GEP.eraseFromParent();
diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll
index 754d5a25ca905..5c761014d471f 100644
--- a/llvm/test/CodeGen/DirectX/flatten-array.ll
+++ b/llvm/test/CodeGen/DirectX/flatten-array.ll
@@ -31,7 +31,7 @@ define void @alloca_4d_test ()  {
 ; CHECK-LABEL: gep_2d_test
 define void @gep_2d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [9 x i32], align 4
-    ; CHECK-COUNT-9: getelementptr inbounds [9 x i32], ptr [[a]], i32 {{[0-8]}}
+    ; CHECK-COUNT-9: getelementptr inbounds [9 x i32], ptr [[a]], i32 0, i32 {{[0-8]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [3 x [3 x i32]], align 4
     %g2d0 = getelementptr inbounds [3 x [3 x i32]], [3 x [3 x i32]]* %1, i32 0, i32 0
@@ -53,7 +53,7 @@ define void @gep_2d_test ()  {
 ; CHECK-LABEL: gep_3d_test
 define void @gep_3d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [8 x i32], align 4
-    ; CHECK-COUNT-8: getelementptr inbounds [8 x i32], ptr [[a]], i32 {{[0-7]}}
+    ; CHECK-COUNT-8: getelementptr inbounds [8 x i32], ptr [[a]], i32 0, i32 {{[0-7]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [2 x[2 x [2 x i32]]], align 4
     %g3d0 = getelementptr inbounds [2 x[2 x [2 x i32]]], [2 x[2 x [2 x i32]]]* %1, i32 0, i32 0
@@ -76,7 +76,7 @@ define void @gep_3d_test ()  {
 ; CHECK-LABEL: gep_4d_test
 define void @gep_4d_test ()  {
     ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4
-    ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[a]], i32 {{[0-9]|1[0-5]}}
+    ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[a]], i32 0, i32 {{[0-9]|1[0-5]}}
     ; CHECK-NEXT:    ret void
     %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4
     %g4d0 = getelementptr inbounds [2x[2 x[2 x [2 x i32]]]], [2x[2 x[2 x [2 x i32]]]]* %1, i32 0, i32 0
@@ -123,8 +123,8 @@ define void @gep_4d_test ()  {
 @b = internal global [2 x [3 x [4 x i32]]] zeroinitializer, align 16
 
 define void @global_gep_load() {
-  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 6
-  ; CHECK: load i32, ptr [[GEP_PTR]], align 4
+  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 6
+  ; CHECK-NEXT: load i32, ptr [[GEP_PTR]], align 4
   ; CHECK-NEXT:    ret void
   %1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @a, i32 0, i32 0
   %2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 1
@@ -142,7 +142,7 @@ define void @global_gep_load_index(i32 %row, i32 %col, i32 %timeIndex) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[ROW]], 12
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 [[TMP6]]
+; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP6]]
 ; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [3 x [4 x i32]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [4 x i32]{{.*}}
@@ -163,7 +163,7 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 0, [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[ROW]], 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 [[TMP4]]
+; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [24 x i32], ptr @a.1dim, i32 0, i32 [[TMP4]]
 ; CHECK-NOT: getelementptr inbounds [2 x [3 x [4 x i32]]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [3 x [4 x i32]]{{.*}}
 ; CHECK-NOT: getelementptr inbounds [4 x i32]{{.*}}
@@ -177,8 +177,8 @@ define void @global_incomplete_gep_chain(i32 %row, i32 %col) {
 }
 
 define void @global_gep_store() {
-  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @b.1dim, i32 13
-  ; CHECK:  store i32 1, ptr [[GEP_PTR]], align 4
+  ; CHECK: [[GEP_PTR:%.*]] = getelementptr inbounds [24 x i32], ptr @b.1dim, i32 0, i32 13
+  ; CHECK-NEXT: store i32 1, ptr [[GEP_PTR]], align 4
   ; CHECK-NEXT:    ret void
   %1 = getelementptr inbounds [2 x [3 x [4 x i32]]], [2 x [3 x [4 x i32]]]* @b, i32 0, i32 1
   %2 = getelementptr inbounds [3 x [4 x i32]], [3 x [4 x i32]]* %1, i32 0, i32 0
diff --git a/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll b/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll
index 3ae5832ce8322..c73e5017348d1 100644
--- a/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll
+++ b/llvm/test/CodeGen/DirectX/flatten-bug-117273.ll
@@ -8,9 +8,9 @@
 define internal void @main() {
 ; CHECK-LABEL: define internal void @main() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 1
 ; CHECK-NEXT:    [[DOTI0:%.*]] = load float, ptr [[TMP0]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [24 x float], ptr @ZerroInitArr.1dim, i32 0, i32 2
 ; CHECK-NEXT:    [[DOTI03:%.*]] = load float, ptr [[TMP1]], align 16
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
index 7e5a92e1311f8..c960aad3d2627 100644
--- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
+++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
@@ -32,23 +32,23 @@ define <4 x i32> @load_array_vec_test() #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 3) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 1) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4
-; CHECK-NEXT:    [[DOTI12:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1), i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI13:%.*]] = load i32, ptr addrspace(3) [[DOTI12]], align 4
-; CHECK-NEXT:    [[DOTI24:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1), i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI25:%.*]] = load i32, ptr addrspace(3) [[DOTI24]], align 4
-; CHECK-NEXT:    [[DOTI36:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 1), i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI37:%.*]] = load i32, ptr addrspace(3) [[DOTI36]], align 4
-; CHECK-NEXT:    [[DOTI08:%.*]] = add i32 [[TMP2]], [[TMP12]]
-; CHECK-NEXT:    [[DOTI19:%.*]] = add i32 [[TMP4]], [[DOTI13]]
-; CHECK-NEXT:    [[DOTI210:%.*]] = add i32 [[TMP6]], [[DOTI25]]
-; CHECK-NEXT:    [[DOTI311:%.*]] = add i32 [[TMP8]], [[DOTI37]]
-; CHECK-NEXT:    [[DOTUPTO015:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI08]], i32 0
-; CHECK-NEXT:    [[DOTUPTO116:%.*]] = insertelement <4 x i32> [[DOTUPTO015]], i32 [[DOTI19]], i32 1
-; CHECK-NEXT:    [[DOTUPTO217:%.*]] = insertelement <4 x i32> [[DOTUPTO116]], i32 [[DOTI210]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[DOTUPTO217]], i32 [[DOTI311]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 2) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), i32 3) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4
+; CHECK-NEXT:    [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]]
+; CHECK-NEXT:    [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[DOTI27:%.*]] = add i32 [[TMP6]], [[TMP14]]
+; CHECK-NEXT:    [[DOTI38:%.*]] = add i32 [[TMP8]], [[TMP16]]
+; CHECK-NEXT:    [[DOTUPTO01215:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI05]], i32 0
+; CHECK-NEXT:    [[DOTUPTO11316:%.*]] = insertelement <4 x i32> [[DOTUPTO01215]], i32 [[DOTI16]], i32 1
+; CHECK-NEXT:    [[DOTUPTO21417:%.*]] = insertelement <4 x i32> [[DOTUPTO11316]], i32 [[DOTI27]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[DOTUPTO21417]], i32 [[DOTI38]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP17]]
 ;
   %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([2 x <4 x i32>], [2 x <4 x i32>] addrspace(3)* @"arrayofVecData", i32 0, i32 0), align 4
   %2 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([2 x <4 x i32>], [2 x <4 x i32>] addrspace(3)* @"arrayofVecData", i32 0, i32 1), align 4
@@ -81,23 +81,19 @@ define <4 x i32> @load_vec_test() #0 {
 define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 {
 ; CHECK-LABEL: define <4 x i32> @load_static_array_of_vec_test(
 ; CHECK-SAME: i32 [[INDEX:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast ptr [[DOTFLAT]] to ptr
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr [[DOTFLAT]] to ptr
-; CHECK-NEXT:    [[DOTFLAT_I1:%.*]] = getelementptr i32, ptr [[TMP3]], i32 1
+; CHECK-NEXT:    [[DOTFLAT:%.*]] = getelementptr inbounds [12 x i32], ptr @staticArrayOfVecData.scalarized.1dim, i32 0, i32 [[INDEX]]
+; CHECK-NEXT:    [[DOTI0:%.*]] = load i32, ptr [[DOTFLAT]], align 4
+; CHECK-NEXT:    [[DOTFLAT_I1:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 1
 ; CHECK-NEXT:    [[DOTI1:%.*]] = load i32, ptr [[DOTFLAT_I1]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast ptr [[DOTFLAT]] to ptr
-; CHECK-NEXT:    [[DOTFLAT_I2:%.*]] = getelementptr i32, ptr [[TMP4]], i32 2
+; CHECK-NEXT:    [[DOTFLAT_I2:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 2
 ; CHECK-NEXT:    [[DOTI2:%.*]] = load i32, ptr [[DOTFLAT_I2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast ptr [[DOTFLAT]] to ptr
-; CHECK-NEXT:    [[DOTFLAT_I3:%.*]] = getelementptr i32, ptr [[TMP5]], i32 3
+; CHECK-NEXT:    [[DOTFLAT_I3:%.*]] = getelementptr i32, ptr [[DOTFLAT]], i32 3
 ; CHECK-NEXT:    [[DOTI3:%.*]] = load i32, ptr [[DOTFLAT_I3]], align 4
-; CHECK-NEXT:    [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[DOTI1]], i32 1
-; CHECK-NEXT:    [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[DOTI2]], i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[DOTUPTO2]], i32 [[DOTI3]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
+; CHECK-NEXT:    [[DOTUPTO01:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI0]], i32 0
+; CHECK-NEXT:    [[DOTUPTO12:%.*]] = insertelement <4 x i32> [[DOTUPTO01]], i32 [[DOTI1]], i32 1
+; CHECK-NEXT:    [[DOTUPTO23:%.*]] = insertelement <4 x i32> [[DOTUPTO12]], i32 [[DOTI2]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[DOTUPTO23]], i32 [[DOTI3]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
 ;
   %3 = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* @staticArrayOfVecData, i32 0, i32 %index
   %4 = load <4 x i32>, <4 x i32>* %3, align 4
@@ -115,23 +111,23 @@ define <4 x i32> @multid_load_test() #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 1) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4
-; CHECK-NEXT:    [[DOTI12:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI13:%.*]] = load i32, ptr addrspace(3) [[DOTI12]], align 4
-; CHECK-NEXT:    [[DOTI24:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI25:%.*]] = load i32, ptr addrspace(3) [[DOTI24]], align 4
-; CHECK-NEXT:    [[DOTI36:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[DOTI37:%.*]] = load i32, ptr addrspace(3) [[DOTI36]], align 4
-; CHECK-NEXT:    [[DOTI08:%.*]] = add i32 [[TMP2]], [[TMP12]]
-; CHECK-NEXT:    [[DOTI19:%.*]] = add i32 [[TMP4]], [[DOTI13]]
-; CHECK-NEXT:    [[DOTI210:%.*]] = add i32 [[TMP6]], [[DOTI25]]
-; CHECK-NEXT:    [[DOTI311:%.*]] = add i32 [[TMP8]], [[DOTI37]]
-; CHECK-NEXT:    [[DOTUPTO015:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI08]], i32 0
-; CHECK-NEXT:    [[DOTUPTO116:%.*]] = insertelement <4 x i32> [[DOTUPTO015]], i32 [[DOTI19]], i32 1
-; CHECK-NEXT:    [[DOTUPTO217:%.*]] = insertelement <4 x i32> [[DOTUPTO116]], i32 [[DOTI210]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x i32> [[DOTUPTO217]], i32 [[DOTI311]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 2) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 3) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4
+; CHECK-NEXT:    [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]]
+; CHECK-NEXT:    [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[DOTI27:%.*]] = add i32 [[TMP6]], [[TMP14]]
+; CHECK-NEXT:    [[DOTI38:%.*]] = add i32 [[TMP8]], [[TMP16]]
+; CHECK-NEXT:    [[DOTUPTO01215:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI05]], i32 0
+; CHECK-NEXT:    [[DOTUPTO11316:%.*]] = insertelement <4 x i32> [[DOTUPTO01215]], i32 [[DOTI16]], i32 1
+; CHECK-NEXT:    [[DOTUPTO21417:%.*]] = insertelement <4 x i32> [[DOTUPTO11316]], i32 [[DOTI27]], i32 2
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[DOTUPTO21417]], i32 [[DOTI38]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP17]]
 ;
   %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 0, i32 0), align 4
   %2 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 1, i32 1), align 4
diff --git a/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll b/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll
index 2676abec1d8ae..a07ce2c24f7ac 100644
--- a/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll
+++ b/llvm/test/CodeGen/DirectX/scalar-bug-117273.ll
@@ -8,13 +8,13 @@
 define internal void @main() #1 {
 ; CHECK-LABEL: define internal void @main() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 1
 ; CHECK-NEXT:    [[DOTI0:%.*]] = load float, ptr [[TMP0]], align 16
 ; CHECK-NEXT:    [[DOTI1:%.*]] = getelementptr float, ptr [[TMP0]], i32 1
 ; CHECK-NEXT:    [[DOTI11:%.*]] = load float, ptr [[DOTI1]], align 4
 ; CHECK-NEXT:    [[DOTI2:%.*]] = getelementptr float, ptr [[TMP0]], i32 2
 ; CHECK-NEXT:    [[DOTI22:%.*]] = load float, ptr [[DOTI2]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [24 x float], ptr @StaticArr.scalarized.1dim, i32 0, i32 2
 ; CHECK-NEXT:    [[DOTI03:%.*]] = load float, ptr [[TMP1]], align 16
 ; CHECK-NEXT:    [[DOTI14:%.*]] = getelementptr float, ptr [[TMP1]], i32 1
 ; CHECK-NEXT:    [[DOTI15:%.*]] = load float, ptr [[DOTI14]], align 4

From ef1cb8277ac3cb34ce9700a313ed60410dd9f84b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= <brioche@google.com>
Date: Thu, 12 Jun 2025 18:13:29 +0200
Subject: [PATCH 257/851] [SPIR-V] Fix ExecutionMode generation (#143888)

PR #141787 added code to emit the Fragment execution model. This
required emitting the OriginUpperLeft ExecutionMode. But this was done
by using the same codepath used for OpEntrypoint.

This has 2 issues:
- the interface variables were added to both OpEntryPoint and
OpExecutionMode.
- the existing OpExecutionMode logic was not used.

This commit fixes this, regrouping OpExecutionMode handling in one
place, and fixing bad codegen issue when interface variiables are added.
---
 llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp        | 16 ++++++++++++++++
 llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp      | 13 +------------
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp    |  2 --
 .../test/CodeGen/SPIRV/ExecutionMode_Fragment.ll | 14 +++++++++++---
 4 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index d4becc2865049..26b94788b810e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -510,6 +510,22 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
       continue;
     MCRegister FReg = MAI->getFuncReg(&F);
     assert(FReg.isValid());
+
+    if (Attribute Attr = F.getFnAttribute("hlsl.shader"); Attr.isValid()) {
+      // SPIR-V common validation: Fragment requires OriginUpperLeft or
+      // OriginLowerLeft.
+      // VUID-StandaloneSpirv-OriginLowerLeft-04653: Fragment must declare
+      // OriginUpperLeft.
+      if (Attr.getValueAsString() == "pixel") {
+        MCInst Inst;
+        Inst.setOpcode(SPIRV::OpExecutionMode);
+        Inst.addOperand(MCOperand::createReg(FReg));
+        unsigned EM =
+            static_cast<unsigned>(SPIRV::ExecutionMode::OriginUpperLeft);
+        Inst.addOperand(MCOperand::createImm(EM));
+        outputMCInst(Inst);
+      }
+    }
     if (MDNode *Node = F.getMetadata("reqd_work_group_size"))
       outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::LocalSize,
                                     3, 1);
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 091368a309a82..36cc5cbe655bc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -475,21 +475,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
     // environment if we need to.
     const SPIRVSubtarget *ST =
         static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
-    SPIRV::ExecutionModel::ExecutionModel ExecutionModel =
-        getExecutionModel(*ST, F);
     auto MIB = MIRBuilder.buildInstr(SPIRV::OpEntryPoint)
-                   .addImm(static_cast<uint32_t>(ExecutionModel))
+                   .addImm(static_cast<uint32_t>(getExecutionModel(*ST, F)))
                    .addUse(FuncVReg);
     addStringImm(F.getName(), MIB);
-
-    if (ExecutionModel == SPIRV::ExecutionModel::Fragment) {
-      // SPIR-V common validation: Fragment requires OriginUpperLeft or
-      // OriginLowerLeft VUID-StandaloneSpirv-OriginLowerLeft-04653: Fragment
-      // must declare OriginUpperLeft.
-      MIRBuilder.buildInstr(SPIRV::OpExecutionMode)
-          .addUse(FuncVReg)
-          .addImm(static_cast<uint32_t>(SPIRV::ExecutionMode::OriginUpperLeft));
-    }
   } else if (F.getLinkage() != GlobalValue::InternalLinkage &&
              F.getLinkage() != GlobalValue::PrivateLinkage) {
     SPIRV::LinkageType::LinkageType LnkTy =
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 2ddd028c79412..b71a9dd68dd44 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -595,8 +595,6 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
           collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames, IS);
         } else if (OpCode == SPIRV::OpEntryPoint) {
           collectOtherInstr(MI, MAI, SPIRV::MB_EntryPoints, IS);
-        } else if (OpCode == SPIRV::OpExecutionMode) {
-          collectOtherInstr(MI, MAI, SPIRV::MB_EntryPoints, IS);
         } else if (TII->isAliasingInstr(MI)) {
           collectOtherInstr(MI, MAI, SPIRV::MB_AliasingInsts, IS);
         } else if (TII->isDecorationInstr(MI)) {
diff --git a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
index 0a62db446cc11..4fa764fe192d3 100644
--- a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
+++ b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
@@ -1,12 +1,20 @@
 ; RUN: llc -O0 -mtriple=spirv-unknown-vulkan1.3-pixel %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan1.3-pixel %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
 
-; CHECK-DAG: OpEntryPoint Fragment %[[#entry:]] "main"
+; CHECK-DAG: OpEntryPoint Fragment %[[#entry:]] "main" {{.*}}
 ; CHECK-DAG: OpExecutionMode %[[#entry]] OriginUpperLeft
 
-define void @main() #1 {
+
+define void @main() #0 {
 entry:
+  %0 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %0, i32 0)
+  store i32 1, ptr addrspace(11) %1, align 4
+
   ret void
 }
 
-attributes #1 = { "hlsl.shader"="pixel" }
+declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32, i32, i32, i32, i1) #1
+
+attributes #0 = { "hlsl.shader"="pixel" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }

From daee5eee8562d26d234f85152e803b6571b15ee2 Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough@gmail.com>
Date: Thu, 12 Jun 2025 11:14:21 -0500
Subject: [PATCH 258/851] [Offload][PGO] Fix new GPU PGO tests (#143645)

`pgo_atomic_teams.c` and `pgo_atomic_threads.c` currently are set to run
on NVPTX despite the changes for that target not being upstreamed yet.
This patch also replaces instances of `llvm-profdata` with `%profdata`
in those tests.
---
 offload/test/offloading/gpupgo/pgo_atomic_teams.c   | 6 +++---
 offload/test/offloading/gpupgo/pgo_atomic_threads.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/offload/test/offloading/gpupgo/pgo_atomic_teams.c b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
index 7bf3b1c11f28b..b3b72db080392 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_teams.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_teams.c
@@ -3,7 +3,7 @@
 // RUN:     -Xarch_device -fprofile-update=atomic
 // RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \
 // RUN:     %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.%basename_t.llvm.profraw | \
 // RUN:     %fcheck-generic --check-prefix="LLVM-PGO"
 
@@ -12,11 +12,11 @@
 // RUN:     -Xarch_device -fprofile-update=atomic
 // RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \
 // RUN:     %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.%basename_t.clang.profraw | \
 // RUN:     %fcheck-generic --check-prefix="CLANG-PGO"
 
-// REQUIRES: gpu
+// REQUIRES: amdgpu
 // REQUIRES: pgo
 
 int test1(int a) { return a / 2; }
diff --git a/offload/test/offloading/gpupgo/pgo_atomic_threads.c b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
index f0e7111f7a64b..440a6b533317d 100644
--- a/offload/test/offloading/gpupgo/pgo_atomic_threads.c
+++ b/offload/test/offloading/gpupgo/pgo_atomic_threads.c
@@ -3,7 +3,7 @@
 // RUN:     -Xarch_device -fprofile-update=atomic
 // RUN: env LLVM_PROFILE_FILE=%basename_t.llvm.profraw \
 // RUN:     %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.%basename_t.llvm.profraw | \
 // RUN:     %fcheck-generic --check-prefix="LLVM-PGO"
 
@@ -12,11 +12,11 @@
 // RUN:     -Xarch_device -fprofile-update=atomic
 // RUN: env LLVM_PROFILE_FILE=%basename_t.clang.profraw \
 // RUN:     %libomptarget-run-generic 2>&1
-// RUN: llvm-profdata show --all-functions --counts \
+// RUN: %profdata show --all-functions --counts \
 // RUN:     %target_triple.%basename_t.clang.profraw | \
 // RUN:     %fcheck-generic --check-prefix="CLANG-PGO"
 
-// REQUIRES: gpu
+// REQUIRES: amdgpu
 // REQUIRES: pgo
 
 int test1(int a) { return a / 2; }

From c6da2c877cb407c0404e58c5ca257d12036ed164 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Thu, 12 Jun 2025 17:14:31 +0100
Subject: [PATCH 259/851] [lldb][Commands] Fix memory find for Swift
 expressions (#143860)

(depends on https://github.com/llvm/llvm-project/pull/143686)

There were two issues previously preventing `memory find -e` expressions
to succeed when stopped in Swift frames:
1. We weren't getting the dynamic type of the result `ValueObject`.
   For Swift this would fail when we tried to produce a scalar value
   out of it because the static VO wasn't sufficient to get to the
integer value. Hence we add a call to
`GetQualifiedRepresentationIfAvailable`
(which is what we do for expressions in `OptionArgParser::ToAddress`
too).
2. We weren't passing an `ExecutionContextScope` to `GetByteSize`, which
   Swift relied on to get the size of the result type.

My plan is to add an API test for this on the Apple
`swiftlang/llvm-project` fork.

I considered an alternative where we use `OptionArgParser::ToAddress`
for `memory find -e` expressions, but it got a bit icky when trying to
figure out how many bytes we should copy out of the result into the
`DataBufferHeap` (currently we rely on the size of the result variable
type). This gets even trickier when we were to pass an expression that
was actually a hex digit or a number into `ToAddress`.

rdar://152113525
---
 lldb/source/Commands/CommandObjectMemory.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp
index 85ae9f8f9e8cb..ccb06d8ff4d59 100644
--- a/lldb/source/Commands/CommandObjectMemory.cpp
+++ b/lldb/source/Commands/CommandObjectMemory.cpp
@@ -886,9 +886,10 @@ class CommandObjectMemoryRead : public CommandObjectParsed {
 #include "CommandOptions.inc"
 
 static llvm::Error CopyExpressionResult(ValueObject &result,
-                                        DataBufferHeap &buffer) {
+                                        DataBufferHeap &buffer,
+                                        ExecutionContextScope *scope) {
   uint64_t value = result.GetValueAsUnsigned(0);
-  auto size_or_err = result.GetCompilerType().GetByteSize(nullptr);
+  auto size_or_err = result.GetCompilerType().GetByteSize(scope);
   if (!size_or_err)
     return size_or_err.takeError();
 
@@ -928,6 +929,11 @@ EvaluateExpression(llvm::StringRef expression, StackFrame &frame,
     return llvm::createStringError(
         "expression evaluation failed. pass a string instead");
 
+  result_sp = result_sp->GetQualifiedRepresentationIfAvailable(
+      result_sp->GetDynamicValueType(), /*synthValue=*/true);
+  if (!result_sp)
+    return llvm::createStringError("failed to get dynamic result type");
+
   return result_sp;
 }
 
@@ -1082,7 +1088,8 @@ class CommandObjectMemoryFind : public CommandObjectParsed {
 
       ValueObjectSP result_sp = *result_or_err;
 
-      if (auto err = CopyExpressionResult(*result_sp, buffer)) {
+      if (auto err = CopyExpressionResult(*result_sp, buffer,
+                                          m_exe_ctx.GetFramePtr())) {
         result.AppendError(llvm::toString(std::move(err)));
         return;
       }

From 4039fdb7ba5a0d9ead5bdc0404f036063a4ca95d Mon Sep 17 00:00:00 2001
From: "W. Turner Abney" <weebney@gmail.com>
Date: Thu, 12 Jun 2025 12:20:32 -0400
Subject: [PATCH 260/851] [libc] add ioctl (#141393)

Closes #85275
Closes #90317
Updates #97191

---------

Co-authored-by: Joseph Huber <huberjn@outlook.com>
Co-authored-by: Michael Jones <michaelrj@google.com>
---
 libc/config/linux/aarch64/entrypoints.txt     |  3 +
 libc/config/linux/arm/entrypoints.txt         |  3 +
 libc/config/linux/riscv/entrypoints.txt       |  3 +
 libc/config/linux/x86_64/entrypoints.txt      |  3 +
 libc/hdr/CMakeLists.txt                       |  9 +++
 libc/hdr/sys_ioctl_macros.h                   | 22 ++++++
 .../llvm-libc-macros/linux/sys-ioctl-macros.h |  1 +
 libc/src/sys/CMakeLists.txt                   |  1 +
 libc/src/sys/ioctl/CMakeLists.txt             | 10 +++
 libc/src/sys/ioctl/ioctl.h                    | 20 +++++
 libc/src/sys/ioctl/linux/CMakeLists.txt       | 12 +++
 libc/src/sys/ioctl/linux/ioctl.cpp            | 36 +++++++++
 libc/test/src/sys/CMakeLists.txt              |  1 +
 libc/test/src/sys/ioctl/CMakeLists.txt        |  3 +
 libc/test/src/sys/ioctl/linux/CMakeLists.txt  | 17 +++++
 libc/test/src/sys/ioctl/linux/ioctl_test.cpp  | 75 +++++++++++++++++++
 16 files changed, 219 insertions(+)
 create mode 100644 libc/hdr/sys_ioctl_macros.h
 create mode 100644 libc/src/sys/ioctl/CMakeLists.txt
 create mode 100644 libc/src/sys/ioctl/ioctl.h
 create mode 100644 libc/src/sys/ioctl/linux/CMakeLists.txt
 create mode 100644 libc/src/sys/ioctl/linux/ioctl.cpp
 create mode 100644 libc/test/src/sys/ioctl/CMakeLists.txt
 create mode 100644 libc/test/src/sys/ioctl/linux/CMakeLists.txt
 create mode 100644 libc/test/src/sys/ioctl/linux/ioctl_test.cpp

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 520046f768b5d..fcf1278eae723 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -245,6 +245,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     # https://github.com/llvm/llvm-project/issues/80060
     # libc.src.sys.epoll.epoll_pwait2
 
+    # sys/ioctl.h entrypoints
+    libc.src.sys.ioctl.ioctl
+
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
     libc.src.sys.mman.mincore
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 7432a7e912e81..1161ae260be2e 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -172,6 +172,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdlib.free
     libc.src.stdlib.malloc
 
+    # sys/ioctl.h entrypoints
+    libc.src.sys.ioctl.ioctl
+
     # sys/mman.h entrypoints
     libc.src.sys.mman.mmap
     libc.src.sys.mman.munmap
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 0b645a2d2fb8b..050fc2672a57e 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -246,6 +246,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     # https://github.com/llvm/llvm-project/issues/80060
     # libc.src.sys.epoll.epoll_pwait2
 
+    # sys/ioctl.h entrypoints
+    libc.src.sys.ioctl.ioctl
+
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
     libc.src.sys.mman.mincore
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 959bdbf08dbea..6c9d83708b92f 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -246,6 +246,9 @@ set(TARGET_LIBC_ENTRYPOINTS
     # https://github.com/llvm/llvm-project/issues/80060
     # libc.src.sys.epoll.epoll_pwait2
 
+    # sys/ioctl.h entrypoints
+    libc.src.sys.ioctl.ioctl
+
     # sys/mman.h entrypoints
     libc.src.sys.mman.madvise
     libc.src.sys.mman.mincore
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 209fcb965242f..1e9f59621a8e5 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -126,6 +126,15 @@ add_proxy_header_library(
     libc.include.llvm-libc-macros.sys_epoll_macros
 )
 
+add_proxy_header_library(
+  sys_ioctl_macros
+  HDRS
+    sys_ioctl_macros.h
+  FULL_BUILD_DEPENDS
+    libc.include.sys_ioctl
+    libc.include.llvm-libc-macros.sys_ioctl_macros
+)
+
 add_proxy_header_library(
   sys_stat_macros
   HDRS
diff --git a/libc/hdr/sys_ioctl_macros.h b/libc/hdr/sys_ioctl_macros.h
new file mode 100644
index 0000000000000..935d436273465
--- /dev/null
+++ b/libc/hdr/sys_ioctl_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from sys/ioctl.h -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_SYS_IOCTL_MACROS_H
+#define LLVM_LIBC_HDR_SYS_IOCTL_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/sys-ioctl-macros.h"
+
+#else // Overlay mode
+
+#include <sys/ioctl.h>
+
+#endif // LLVM_LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_SYS_IOCTL_MACROS_H
diff --git a/libc/include/llvm-libc-macros/linux/sys-ioctl-macros.h b/libc/include/llvm-libc-macros/linux/sys-ioctl-macros.h
index 5eb779aeeca56..41226080084c3 100644
--- a/libc/include/llvm-libc-macros/linux/sys-ioctl-macros.h
+++ b/libc/include/llvm-libc-macros/linux/sys-ioctl-macros.h
@@ -15,5 +15,6 @@
 // around the definitions of macros like _IO, _IOR, _IOW, and _IOWR that I don't
 // think is worth digging into right now.
 #define TIOCGETD 0x5424
+#define FIONREAD 0x541B
 
 #endif // LLVM_LIBC_MACROS_LINUX_SYS_IOCTL_MACROS_H
diff --git a/libc/src/sys/CMakeLists.txt b/libc/src/sys/CMakeLists.txt
index 9a73b80d35d2f..0fa11e9eee696 100644
--- a/libc/src/sys/CMakeLists.txt
+++ b/libc/src/sys/CMakeLists.txt
@@ -13,3 +13,4 @@ add_subdirectory(utsname)
 add_subdirectory(wait)
 add_subdirectory(prctl)
 add_subdirectory(uio)
+add_subdirectory(ioctl)
diff --git a/libc/src/sys/ioctl/CMakeLists.txt b/libc/src/sys/ioctl/CMakeLists.txt
new file mode 100644
index 0000000000000..099a1b96389fc
--- /dev/null
+++ b/libc/src/sys/ioctl/CMakeLists.txt
@@ -0,0 +1,10 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+endif()
+
+add_entrypoint_object(
+  ioctl
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.ioctl
+)
diff --git a/libc/src/sys/ioctl/ioctl.h b/libc/src/sys/ioctl/ioctl.h
new file mode 100644
index 0000000000000..62323ba7dd4dc
--- /dev/null
+++ b/libc/src/sys/ioctl/ioctl.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for ioctl ---------------------------*-C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SYS_IOCTL_IOCTL_H
+#define LLVM_LIBC_SRC_SYS_IOCTL_IOCTL_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+int ioctl(int fd, unsigned long request, ...);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_SYS_IOCTL_IOCTL_H
diff --git a/libc/src/sys/ioctl/linux/CMakeLists.txt b/libc/src/sys/ioctl/linux/CMakeLists.txt
new file mode 100644
index 0000000000000..876f35aaee66c
--- /dev/null
+++ b/libc/src/sys/ioctl/linux/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_entrypoint_object(
+  ioctl
+  SRCS
+    ioctl.cpp
+  HDRS
+    ../ioctl.h
+  DEPENDS
+    libc.include.sys_ioctl
+    libc.include.sys_syscall
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
diff --git a/libc/src/sys/ioctl/linux/ioctl.cpp b/libc/src/sys/ioctl/linux/ioctl.cpp
new file mode 100644
index 0000000000000..f03fea21c75bd
--- /dev/null
+++ b/libc/src/sys/ioctl/linux/ioctl.cpp
@@ -0,0 +1,36 @@
+//===---------- Linux implementation of the ioctl function ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sys/ioctl/ioctl.h"
+
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/common.h"
+#include "src/errno/libc_errno.h"
+#include <stdarg.h>
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(int, ioctl, (int fd, unsigned long request, ...)) {
+  va_list vargs;
+  va_start(vargs, request);
+  void *data_pointer = va_arg(vargs, void *);
+  int ret =
+      LIBC_NAMESPACE::syscall_impl<int>(SYS_ioctl, fd, request, data_pointer);
+  va_end(vargs);
+
+  // Some ioctls can be expected to return positive values
+  if (ret >= 0)
+    return ret;
+
+  // If there is an error, errno is set and -1 is returned.
+  libc_errno = -ret;
+  return -1;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/sys/CMakeLists.txt b/libc/test/src/sys/CMakeLists.txt
index 224cc7905ad31..13bf91eef04be 100644
--- a/libc/test/src/sys/CMakeLists.txt
+++ b/libc/test/src/sys/CMakeLists.txt
@@ -13,3 +13,4 @@ add_subdirectory(auxv)
 add_subdirectory(epoll)
 add_subdirectory(uio)
 add_subdirectory(time)
+add_subdirectory(ioctl)
diff --git a/libc/test/src/sys/ioctl/CMakeLists.txt b/libc/test/src/sys/ioctl/CMakeLists.txt
new file mode 100644
index 0000000000000..b4bbe81c92ff2
--- /dev/null
+++ b/libc/test/src/sys/ioctl/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+endif()
diff --git a/libc/test/src/sys/ioctl/linux/CMakeLists.txt b/libc/test/src/sys/ioctl/linux/CMakeLists.txt
new file mode 100644
index 0000000000000..e5095c54a729f
--- /dev/null
+++ b/libc/test/src/sys/ioctl/linux/CMakeLists.txt
@@ -0,0 +1,17 @@
+add_custom_target(libc_sys_ioctl_unittests)
+
+add_libc_unittest(
+  ioctl_test
+  SUITE
+    libc_sys_ioctl_unittests
+  SRCS
+    ioctl_test.cpp
+  DEPENDS
+    libc.hdr.ioctl_macros
+    libc.src.sys.ioctl.ioctl
+    libc.src.errno.errno
+    libc.src.fcntl.open
+    libc.src.unistd.close
+    libc.src.unistd.read
+    libc.src.unistd.write
+)
diff --git a/libc/test/src/sys/ioctl/linux/ioctl_test.cpp b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
new file mode 100644
index 0000000000000..9c56a4689b186
--- /dev/null
+++ b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
@@ -0,0 +1,75 @@
+//===-- Unittests for ioctl -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/errno/libc_errno.h"
+#include "src/fcntl/open.h"
+#include "src/sys/ioctl/ioctl.h"
+#include "src/unistd/close.h"
+#include "src/unistd/read.h"
+#include "src/unistd/write.h"
+
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/Test.h"
+
+#include "hdr/sys_stat_macros.h"
+
+#include "hdr/sys_ioctl_macros.h"
+
+using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+
+TEST(LlvmLibcSysIoctlTest, InvalidCommandAndFIONREAD) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  // Setup the test file
+  constexpr const char *TEST_FILE_NAME = "ioctl.test";
+  constexpr const char TEST_MSG[] = "ioctl test";
+  constexpr int TEST_MSG_SIZE = sizeof(TEST_MSG) - 1;
+  auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
+  int new_test_file_fd = LIBC_NAMESPACE::open(
+      TEST_FILE, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+  ASSERT_THAT(
+      (int)LIBC_NAMESPACE::write(new_test_file_fd, TEST_MSG, TEST_MSG_SIZE),
+      Succeeds(TEST_MSG_SIZE));
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_THAT(LIBC_NAMESPACE::close(new_test_file_fd), Succeeds(0));
+  ASSERT_ERRNO_SUCCESS();
+
+  // Reopen the file for testing
+  int fd = LIBC_NAMESPACE::open(TEST_FILE, O_RDONLY);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(fd, 0);
+
+  // FIONREAD reports the number of available bytes to read for the passed fd
+  // This will report the full size of the file, as we haven't read anything yet
+  int n = -1;
+  int ret = LIBC_NAMESPACE::ioctl(fd, FIONREAD, &n);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(ret, -1);
+  ASSERT_EQ(n, TEST_MSG_SIZE);
+
+  // But if we read some bytes...
+  constexpr int READ_COUNT = 5;
+  char read_buffer[READ_COUNT];
+  ASSERT_THAT((int)LIBC_NAMESPACE::read(fd, read_buffer, READ_COUNT),
+              Succeeds(READ_COUNT));
+
+  // ... n should have decreased by the number of bytes we've read
+  int n_after_reading = -1;
+  ret = LIBC_NAMESPACE::ioctl(fd, FIONREAD, &n_after_reading);
+  ASSERT_ERRNO_SUCCESS();
+  ASSERT_GT(ret, -1);
+  ASSERT_EQ(n - READ_COUNT, n_after_reading);
+
+  // 0xDEADBEEF is just a random nonexistent command;
+  // calling this should always fail with ENOTTY
+  ret = LIBC_NAMESPACE::ioctl(fd, 0xDEADBEEF, NULL);
+  ASSERT_ERRNO_EQ(ENOTTY);
+  ASSERT_EQ(ret, -1);
+
+  ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+}

From 77834a40cf350d2fe63fac26222c3918f5f348fd Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Thu, 12 Jun 2025 09:24:26 -0700
Subject: [PATCH 261/851] [CIR] Upstream support for emitting constructors
 (#143639)

This change upstreams the code to emit simple constructor defintions.
---
 clang/include/clang/CIR/MissingFeatures.h     |  4 +
 clang/lib/CIR/CodeGen/CIRGenCXX.cpp           | 40 +++++++++
 clang/lib/CIR/CodeGen/CIRGenCXXABI.h          | 11 +++
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          | 41 ++++++++++
 clang/lib/CIR/CodeGen/CIRGenClass.cpp         | 81 +++++++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      | 60 ++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenFunction.h        | 18 +++++
 clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp | 65 ++++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 15 ++--
 clang/lib/CIR/CodeGen/CIRGenModule.h          |  5 ++
 clang/lib/CIR/CodeGen/CMakeLists.txt          |  1 +
 clang/test/CIR/CodeGen/ctor.cpp               | 54 ++++++++++++-
 12 files changed, 379 insertions(+), 16 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenCXX.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 87908e2ec08ac..fbd15d5c886d2 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -81,6 +81,7 @@ struct MissingFeatures {
   static bool opFuncCPUAndFeaturesAttributes() { return false; }
   static bool opFuncSection() { return false; }
   static bool opFuncSetComdat() { return false; }
+  static bool opFuncAttributesForDefinition() { return false; }
 
   // CallOp handling
   static bool opCallPseudoDtor() { return false; }
@@ -226,6 +227,9 @@ struct MissingFeatures {
   static bool implicitConstructorArgs() { return false; }
   static bool intrinsics() { return false; }
   static bool attributeNoBuiltin() { return false; }
+  static bool emitCtorPrologue() { return false; }
+  static bool thunks() { return false; }
+  static bool runCleanupsScope() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
new file mode 100644
index 0000000000000..51751483d34e9
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This contains code dealing with C++ code generation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenFunction.h"
+#include "CIRGenModule.h"
+
+#include "clang/AST/GlobalDecl.h"
+#include "clang/CIR/MissingFeatures.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+cir::FuncOp CIRGenModule::codegenCXXStructor(GlobalDecl gd) {
+  const CIRGenFunctionInfo &fnInfo =
+      getTypes().arrangeCXXStructorDeclaration(gd);
+  cir::FuncType funcType = getTypes().getFunctionType(fnInfo);
+  cir::FuncOp fn = getAddrOfCXXStructor(gd, &fnInfo, /*FnType=*/nullptr,
+                                        /*DontDefer=*/true, ForDefinition);
+  assert(!cir::MissingFeatures::opFuncLinkage());
+  CIRGenFunction cgf{*this, builder};
+  curCGF = &cgf;
+  {
+    mlir::OpBuilder::InsertionGuard guard(builder);
+    cgf.generateCode(gd, fn, funcType);
+  }
+  curCGF = nullptr;
+
+  setNonAliasAttributes(gd, fn);
+  assert(!cir::MissingFeatures::opFuncAttributesForDefinition());
+  return fn;
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
index 107535ebc7275..2d967fd307e01 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h
@@ -37,6 +37,10 @@ class CIRGenCXXABI {
 
   void setCXXABIThisValue(CIRGenFunction &cgf, mlir::Value thisPtr);
 
+  /// Emit a single constructor/destructor with the gen type from a C++
+  /// constructor/destructor Decl.
+  virtual void emitCXXStructor(clang::GlobalDecl gd) = 0;
+
 public:
   clang::ImplicitParamDecl *getThisDecl(CIRGenFunction &cgf) {
     return cgf.cxxabiThisDecl;
@@ -55,12 +59,19 @@ class CIRGenCXXABI {
     return md->getParent();
   }
 
+  /// Return whether the given global decl needs a VTT (virtual table table)
+  /// parameter.
+  virtual bool needsVTTParameter(clang::GlobalDecl gd) { return false; }
+
   /// Build a parameter variable suitable for 'this'.
   void buildThisParam(CIRGenFunction &cgf, FunctionArgList &params);
 
   /// Loads the incoming C++ this pointer as it was passed by the caller.
   mlir::Value loadIncomingCXXThis(CIRGenFunction &cgf);
 
+  /// Emit constructor variants required by this ABI.
+  virtual void emitCXXConstructors(const clang::CXXConstructorDecl *d) = 0;
+
   /// Returns true if the given constructor or destructor is one of the kinds
   /// that the ABI says returns 'this' (only applies when called non-virtually
   /// for destructors).
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 9d25eea9e413d..da754e0806b2d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -162,6 +162,47 @@ arrangeCIRFunctionInfo(CIRGenTypes &cgt, SmallVectorImpl<CanQualType> &prefix,
   return cgt.arrangeCIRFunctionInfo(resultType, prefix, required);
 }
 
+void CIRGenFunction::emitDelegateCallArg(CallArgList &args,
+                                         const VarDecl *param,
+                                         SourceLocation loc) {
+  // StartFunction converted the ABI-lowered parameter(s) into a local alloca.
+  // We need to turn that into an r-value suitable for emitCall
+  Address local = getAddrOfLocalVar(param);
+
+  QualType type = param->getType();
+
+  if (const auto *rd = type->getAsCXXRecordDecl()) {
+    cgm.errorNYI(param->getSourceRange(),
+                 "emitDelegateCallArg: record argument");
+    return;
+  }
+
+  // GetAddrOfLocalVar returns a pointer-to-pointer for references, but the
+  // argument needs to be the original pointer.
+  if (type->isReferenceType()) {
+    args.add(
+        RValue::get(builder.createLoad(getLoc(param->getSourceRange()), local)),
+        type);
+  } else if (getLangOpts().ObjCAutoRefCount) {
+    cgm.errorNYI(param->getSourceRange(),
+                 "emitDelegateCallArg: ObjCAutoRefCount");
+    // For the most part, we just need to load the alloca, except that aggregate
+    // r-values are actually pointers to temporaries.
+  } else {
+    cgm.errorNYI(param->getSourceRange(),
+                 "emitDelegateCallArg: convertTempToRValue");
+  }
+
+  // Deactivate the cleanup for the callee-destructed param that was pushed.
+  assert(!cir::MissingFeatures::thunks());
+  if (type->isRecordType() &&
+      type->castAs<RecordType>()->getDecl()->isParamDestroyedInCallee() &&
+      param->needsDestruction(getContext())) {
+    cgm.errorNYI(param->getSourceRange(),
+                 "emitDelegateCallArg: callee-destructed param");
+  }
+}
+
 static const CIRGenFunctionInfo &
 arrangeFreeFunctionLikeCall(CIRGenTypes &cgt, CIRGenModule &cgm,
                             const CallArgList &args,
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index 8491a66ea6cb4..bb4b451c99247 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -21,6 +21,87 @@
 using namespace clang;
 using namespace clang::CIRGen;
 
+/// Checks whether the given constructor is a valid subject for the
+/// complete-to-base constructor delegation optimization, i.e. emitting the
+/// complete constructor as a simple call to the base constructor.
+bool CIRGenFunction::isConstructorDelegationValid(
+    const CXXConstructorDecl *ctor) {
+  // Currently we disable the optimization for classes with virtual bases
+  // because (1) the address of parameter variables need to be consistent across
+  // all initializers but (2) the delegate function call necessarily creates a
+  // second copy of the parameter variable.
+  //
+  // The limiting example (purely theoretical AFAIK):
+  //   struct A { A(int &c) { c++; } };
+  //   struct A : virtual A {
+  //     B(int count) : A(count) { printf("%d\n", count); }
+  //   };
+  // ...although even this example could in principle be emitted as a delegation
+  // since the address of the parameter doesn't escape.
+  if (ctor->getParent()->getNumVBases())
+    return false;
+
+  // We also disable the optimization for variadic functions because it's
+  // impossible to "re-pass" varargs.
+  if (ctor->getType()->castAs<FunctionProtoType>()->isVariadic())
+    return false;
+
+  // FIXME: Decide if we can do a delegation of a delegating constructor.
+  if (ctor->isDelegatingConstructor())
+    return false;
+
+  return true;
+}
+
+Address CIRGenFunction::loadCXXThisAddress() {
+  assert(curFuncDecl && "loading 'this' without a func declaration?");
+  assert(isa<CXXMethodDecl>(curFuncDecl));
+
+  // Lazily compute CXXThisAlignment.
+  if (cxxThisAlignment.isZero()) {
+    // Just use the best known alignment for the parent.
+    // TODO: if we're currently emitting a complete-object ctor/dtor, we can
+    // always use the complete-object alignment.
+    auto rd = cast<CXXMethodDecl>(curFuncDecl)->getParent();
+    cxxThisAlignment = cgm.getClassPointerAlignment(rd);
+  }
+
+  return Address(loadCXXThis(), cxxThisAlignment);
+}
+
+void CIRGenFunction::emitDelegateCXXConstructorCall(
+    const CXXConstructorDecl *ctor, CXXCtorType ctorType,
+    const FunctionArgList &args, SourceLocation loc) {
+  CallArgList delegateArgs;
+
+  FunctionArgList::const_iterator i = args.begin(), e = args.end();
+  assert(i != e && "no parameters to constructor");
+
+  // this
+  Address thisAddr = loadCXXThisAddress();
+  delegateArgs.add(RValue::get(thisAddr.getPointer()), (*i)->getType());
+  ++i;
+
+  // FIXME: The location of the VTT parameter in the parameter list is specific
+  // to the Itanium ABI and shouldn't be hardcoded here.
+  if (cgm.getCXXABI().needsVTTParameter(curGD)) {
+    cgm.errorNYI(loc, "emitDelegateCXXConstructorCall: VTT parameter");
+    return;
+  }
+
+  // Explicit arguments.
+  for (; i != e; ++i) {
+    const VarDecl *param = *i;
+    // FIXME: per-argument source location
+    emitDelegateCallArg(delegateArgs, param, loc);
+  }
+
+  assert(!cir::MissingFeatures::sanitizers());
+
+  emitCXXConstructorCall(ctor, ctorType, /*ForVirtualBase=*/false,
+                         /*Delegating=*/true, thisAddr, delegateArgs, loc);
+}
+
 Address CIRGenFunction::getAddressOfBaseClass(
     Address value, const CXXRecordDecl *derived,
     llvm::iterator_range<CastExpr::path_const_iterator> path,
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index e32a5c836be02..53c44c6cc7680 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -465,7 +465,7 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
     if (isa<CXXDestructorDecl>(funcDecl))
       getCIRGenModule().errorNYI(bodyRange, "C++ destructor definition");
     else if (isa<CXXConstructorDecl>(funcDecl))
-      getCIRGenModule().errorNYI(bodyRange, "C++ constructor definition");
+      emitConstructorBody(args);
     else if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
              funcDecl->hasAttr<CUDAGlobalAttr>())
       getCIRGenModule().errorNYI(bodyRange, "CUDA kernel");
@@ -496,6 +496,54 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn,
   return fn;
 }
 
+void CIRGenFunction::emitConstructorBody(FunctionArgList &args) {
+  assert(!cir::MissingFeatures::sanitizers());
+  const auto *ctor = cast<CXXConstructorDecl>(curGD.getDecl());
+  CXXCtorType ctorType = curGD.getCtorType();
+
+  assert((cgm.getTarget().getCXXABI().hasConstructorVariants() ||
+          ctorType == Ctor_Complete) &&
+         "can only generate complete ctor for this ABI");
+
+  if (ctorType == Ctor_Complete && isConstructorDelegationValid(ctor) &&
+      cgm.getTarget().getCXXABI().hasConstructorVariants()) {
+    emitDelegateCXXConstructorCall(ctor, Ctor_Base, args, ctor->getEndLoc());
+    return;
+  }
+
+  const FunctionDecl *definition = nullptr;
+  Stmt *body = ctor->getBody(definition);
+  assert(definition == ctor && "emitting wrong constructor body");
+
+  if (isa_and_nonnull<CXXTryStmt>(body)) {
+    cgm.errorNYI(ctor->getSourceRange(), "emitConstructorBody: try body");
+    return;
+  }
+
+  assert(!cir::MissingFeatures::incrementProfileCounter());
+  assert(!cir::MissingFeatures::runCleanupsScope());
+
+  // TODO: in restricted cases, we can emit the vbase initializers of a
+  // complete ctor and then delegate to the base ctor.
+
+  assert(!cir::MissingFeatures::emitCtorPrologue());
+  if (ctor->isDelegatingConstructor()) {
+    // This will be handled in emitCtorPrologue, but we should emit a diagnostic
+    // rather than silently fail to delegate.
+    cgm.errorNYI(ctor->getSourceRange(),
+                 "emitConstructorBody: delegating ctor");
+    return;
+  }
+
+  // TODO(cir): propagate this result via mlir::logical result. Just unreachable
+  // now just to have it handled.
+  if (mlir::failed(emitStmt(body, true))) {
+    cgm.errorNYI(ctor->getSourceRange(),
+                 "emitConstructorBody: emit body statement failed.");
+    return;
+  }
+}
+
 /// Given a value of type T* that may not be to a complete object, construct
 /// an l-vlaue withi the natural pointee alignment of T.
 LValue CIRGenFunction::makeNaturalAlignPointeeAddrLValue(mlir::Value val,
@@ -522,16 +570,16 @@ clang::QualType CIRGenFunction::buildFunctionArgList(clang::GlobalDecl gd,
     cgm.getCXXABI().buildThisParam(*this, args);
   }
 
-  if (isa<CXXConstructorDecl>(fd))
-    cgm.errorNYI(fd->getSourceRange(),
-                 "buildFunctionArgList: CXXConstructorDecl");
+  if (const auto *cd = dyn_cast<CXXConstructorDecl>(fd))
+    if (cd->getInheritedConstructor())
+      cgm.errorNYI(fd->getSourceRange(),
+                   "buildFunctionArgList: inherited constructor");
 
   for (auto *param : fd->parameters())
     args.push_back(param);
 
   if (md && (isa<CXXConstructorDecl>(md) || isa<CXXDestructorDecl>(md)))
-    cgm.errorNYI(fd->getSourceRange(),
-                 "buildFunctionArgList: implicit structor params");
+    assert(!cir::MissingFeatures::cxxabiStructorImplicitParam());
 
   return retTy;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 682d59d63faa8..361dcd5ef1c31 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -66,6 +66,7 @@ class CIRGenFunction : public CIRGenTypeCache {
   ImplicitParamDecl *cxxabiThisDecl = nullptr;
   mlir::Value cxxabiThisValue = nullptr;
   mlir::Value cxxThisValue = nullptr;
+  clang::CharUnits cxxThisAlignment;
 
   // Holds the Decl for the current outermost non-closure context
   const clang::Decl *curFuncDecl = nullptr;
@@ -473,6 +474,9 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   bool shouldNullCheckClassCastValue(const CastExpr *ce);
 
+  static bool
+  isConstructorDelegationValid(const clang::CXXConstructorDecl *ctor);
+
   LValue makeNaturalAlignPointeeAddrLValue(mlir::Value v, clang::QualType t);
 
   /// Construct an address with the natural alignment of T. If a pointer to T
@@ -517,6 +521,7 @@ class CIRGenFunction : public CIRGenTypeCache {
     assert(cxxThisValue && "no 'this' value for this function");
     return cxxThisValue;
   }
+  Address loadCXXThisAddress();
 
   /// Get an appropriate 'undef' rvalue for the given type.
   /// TODO: What's the equivalent for MLIR? Currently we're only using this for
@@ -753,6 +758,8 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   LValue emitCompoundAssignmentLValue(const clang::CompoundAssignOperator *e);
 
+  void emitConstructorBody(FunctionArgList &args);
+
   mlir::LogicalResult emitContinueStmt(const clang::ContinueStmt &s);
 
   void emitCXXConstructExpr(const clang::CXXConstructExpr *e,
@@ -841,6 +848,17 @@ class CIRGenFunction : public CIRGenTypeCache {
                                       mlir::Type condType,
                                       bool buildingTopLevelCase);
 
+  void emitDelegateCXXConstructorCall(const clang::CXXConstructorDecl *ctor,
+                                      clang::CXXCtorType ctorType,
+                                      const FunctionArgList &args,
+                                      clang::SourceLocation loc);
+
+  /// We are performing a delegate call; that is, the current function is
+  /// delegating to another one. Produce a r-value suitable for passing the
+  /// given parameter.
+  void emitDelegateCallArg(CallArgList &args, const clang::VarDecl *param,
+                           clang::SourceLocation loc);
+
   /// Emit an `if` on a boolean condition to the specified blocks.
   /// FIXME: Based on the condition, this might try to simplify the codegen of
   /// the conditional based on the branch.
diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
index fdd8b63fb6da0..cd9096a0188a7 100644
--- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp
@@ -20,7 +20,9 @@
 #include "CIRGenCXXABI.h"
 #include "CIRGenFunction.h"
 
+#include "clang/AST/ExprCXX.h"
 #include "clang/AST/GlobalDecl.h"
+#include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace clang;
@@ -35,8 +37,13 @@ class CIRGenItaniumCXXABI : public CIRGenCXXABI {
     assert(!cir::MissingFeatures::cxxabiUseARMGuardVarABI());
   }
 
-  void emitInstanceFunctionProlog(SourceLocation Loc,
-                                  CIRGenFunction &CGF) override;
+  bool needsVTTParameter(clang::GlobalDecl gd) override;
+
+  void emitInstanceFunctionProlog(SourceLocation loc,
+                                  CIRGenFunction &cgf) override;
+
+  void emitCXXConstructors(const clang::CXXConstructorDecl *d) override;
+  void emitCXXStructor(clang::GlobalDecl gd) override;
 };
 
 } // namespace
@@ -72,6 +79,60 @@ void CIRGenItaniumCXXABI::emitInstanceFunctionProlog(SourceLocation loc,
   }
 }
 
+void CIRGenItaniumCXXABI::emitCXXStructor(GlobalDecl gd) {
+  auto *md = cast<CXXMethodDecl>(gd.getDecl());
+  auto *cd = dyn_cast<CXXConstructorDecl>(md);
+
+  if (!cd) {
+    cgm.errorNYI(md->getSourceRange(), "CXCABI emit destructor");
+    return;
+  }
+
+  if (cgm.getCodeGenOpts().CXXCtorDtorAliases)
+    cgm.errorNYI(md->getSourceRange(), "Ctor/Dtor aliases");
+
+  auto fn = cgm.codegenCXXStructor(gd);
+
+  cgm.maybeSetTrivialComdat(*md, fn);
+}
+
+void CIRGenItaniumCXXABI::emitCXXConstructors(const CXXConstructorDecl *d) {
+  // Just make sure we're in sync with TargetCXXABI.
+  assert(cgm.getTarget().getCXXABI().hasConstructorVariants());
+
+  // The constructor used for constructing this as a base class;
+  // ignores virtual bases.
+  cgm.emitGlobal(GlobalDecl(d, Ctor_Base));
+
+  // The constructor used for constructing this as a complete class;
+  // constructs the virtual bases, then calls the base constructor.
+  if (!d->getParent()->isAbstract()) {
+    // We don't need to emit the complete ctro if the class is abstract.
+    cgm.emitGlobal(GlobalDecl(d, Ctor_Complete));
+  }
+}
+
+/// Return whether the given global decl needs a VTT (virtual table table)
+/// parameter, which it does if it's a base constructor or destructor with
+/// virtual bases.
+bool CIRGenItaniumCXXABI::needsVTTParameter(GlobalDecl gd) {
+  auto *md = cast<CXXMethodDecl>(gd.getDecl());
+
+  // We don't have any virtual bases, just return early.
+  if (!md->getParent()->getNumVBases())
+    return false;
+
+  // Check if we have a base constructor.
+  if (isa<CXXConstructorDecl>(md) && gd.getCtorType() == Ctor_Base)
+    return true;
+
+  // Check if we have a base destructor.
+  if (isa<CXXDestructorDecl>(md) && gd.getDtorType() == Dtor_Base)
+    return true;
+
+  return false;
+}
+
 CIRGenCXXABI *clang::CIRGen::CreateCIRGenItaniumCXXABI(CIRGenModule &cgm) {
   switch (cgm.getASTContext().getCXXABIKind()) {
   case TargetCXXABI::GenericItanium:
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 8407f8fad06ba..434dd376208e1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -226,11 +226,9 @@ mlir::Operation *
 CIRGenModule::getAddrOfGlobal(GlobalDecl gd, ForDefinition_t isForDefinition) {
   const Decl *d = gd.getDecl();
 
-  if (isa<CXXConstructorDecl>(d) || isa<CXXDestructorDecl>(d)) {
-    errorNYI(d->getSourceRange(),
-             "getAddrOfGlobal: C++ constructor/destructor");
-    return nullptr;
-  }
+  if (isa<CXXConstructorDecl>(d) || isa<CXXDestructorDecl>(d))
+    return getAddrOfCXXStructor(gd, /*FnInfo=*/nullptr, /*FnType=*/nullptr,
+                                /*DontDefer=*/false, isForDefinition);
 
   if (isa<CXXMethodDecl>(d)) {
     const CIRGenFunctionInfo &fi =
@@ -411,6 +409,7 @@ void CIRGenModule::emitGlobalFunctionDefinition(clang::GlobalDecl gd,
     cgf.generateCode(gd, funcOp, funcType);
   }
   curCGF = nullptr;
+  assert(!cir::MissingFeatures::opFuncAttributesForDefinition());
 }
 
 mlir::Operation *CIRGenModule::getGlobalValue(StringRef name) {
@@ -771,7 +770,7 @@ void CIRGenModule::emitGlobalDefinition(clang::GlobalDecl gd,
       // Make sure to emit the definition(s) before we emit the thunks. This is
       // necessary for the generation of certain thunks.
       if (isa<CXXConstructorDecl>(method) || isa<CXXDestructorDecl>(method))
-        errorNYI(method->getSourceRange(), "C++ ctor/dtor");
+        abi->emitCXXStructor(gd);
       else if (fd->isMultiVersion())
         errorNYI(method->getSourceRange(), "multiversion functions");
       else
@@ -1173,6 +1172,10 @@ void CIRGenModule::emitTopLevelDecl(Decl *decl) {
   case Decl::Empty:
     break;
 
+  case Decl::CXXConstructor:
+    getCXXABI().emitCXXConstructors(cast<CXXConstructorDecl>(decl));
+    break;
+
   // C++ Decls
   case Decl::LinkageSpec:
   case Decl::Namespace:
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 9748c0b3ed43a..f76fd8e733642 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -267,6 +267,11 @@ class CIRGenModule : public CIRGenTypeCache {
   // Make sure that this type is translated.
   void updateCompletedType(const clang::TagDecl *td);
 
+  // Produce code for this constructor/destructor. This method doesn't try to
+  // apply any ABI rules about which other constructors/destructors are needed
+  // or if they are alias to each other.
+  cir::FuncOp codegenCXXStructor(clang::GlobalDecl gd);
+
   bool supportsCOMDAT() const;
   void maybeSetTrivialComdat(const clang::Decl &d, mlir::Operation *op);
 
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index beaa9afb31f93..217609687eabc 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -11,6 +11,7 @@ add_clang_library(clangCIR
   CIRGenBuilder.cpp
   CIRGenCall.cpp
   CIRGenClass.cpp
+  CIRGenCXX.cpp
   CIRGenCXXABI.cpp
   CIRGenCXXExpr.cpp
   CIRGenBuiltin.cpp
diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp
index 3a1e82e338c1c..3b4191fd74c97 100644
--- a/clang/test/CIR/CodeGen/ctor.cpp
+++ b/clang/test/CIR/CodeGen/ctor.cpp
@@ -3,7 +3,7 @@
 
 struct Struk {
   int a;
-  Struk();
+  Struk() {}
 };
 
 void baz() {
@@ -12,8 +12,58 @@ void baz() {
 
 // CHECK: !rec_Struk = !cir.record<struct "Struk" {!s32i}>
 
-// CHECK:   cir.func @_ZN5StrukC1Ev(!cir.ptr<!rec_Struk>)
+// Note: In the absence of the '-mconstructor-aliases' option, we emit two
+//       constructors here. The handling of constructor aliases is currently
+//       NYI, but when it is added this test should be updated to add a RUN
+//       line that passes '-mconstructor-aliases' to clang_cc1.
+// CHECK:   cir.func @_ZN5StrukC2Ev(%arg0: !cir.ptr<!rec_Struk>
+// CHECK-NEXT:     %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr<!rec_Struk>, !cir.ptr<!cir.ptr<!rec_Struk>>, ["this", init] {alignment = 8 : i64}
+// CHECK-NEXT:     cir.store %arg0, %[[THIS_ADDR]] : !cir.ptr<!rec_Struk>, !cir.ptr<!cir.ptr<!rec_Struk>>
+// CHECK-NEXT:     %[[THIS:.*]] = cir.load %[[THIS_ADDR]] : !cir.ptr<!cir.ptr<!rec_Struk>>, !cir.ptr<!rec_Struk>
+// CHECK-NEXT:     cir.return
+
+// CHECK:   cir.func @_ZN5StrukC1Ev(%arg0: !cir.ptr<!rec_Struk>
+// CHECK-NEXT:     %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr<!rec_Struk>, !cir.ptr<!cir.ptr<!rec_Struk>>, ["this", init] {alignment = 8 : i64}
+// CHECK-NEXT:     cir.store %arg0, %[[THIS_ADDR]] : !cir.ptr<!rec_Struk>, !cir.ptr<!cir.ptr<!rec_Struk>>
+// CHECK-NEXT:     %[[THIS:.*]] = cir.load %[[THIS_ADDR]] : !cir.ptr<!cir.ptr<!rec_Struk>>, !cir.ptr<!rec_Struk>
+// CHECK-NEXT:     cir.call @_ZN5StrukC2Ev(%[[THIS]]) : (!cir.ptr<!rec_Struk>) -> ()
+// CHECK-NEXT:     cir.return
+
 // CHECK:   cir.func @_Z3bazv()
 // CHECK-NEXT:     %[[S_ADDR:.*]] = cir.alloca !rec_Struk, !cir.ptr<!rec_Struk>, ["s", init] {alignment = 4 : i64}
 // CHECK-NEXT:     cir.call @_ZN5StrukC1Ev(%[[S_ADDR]]) : (!cir.ptr<!rec_Struk>) -> ()
 // CHECK-NEXT:     cir.return
+
+struct VariadicStruk {
+  int a;
+  VariadicStruk(int n, ...) { a = n;}
+};
+
+void bar() {
+  VariadicStruk s(1, 2, 3);
+}
+
+// When a variadic constructor is present, we call the C2 constructor directly.
+
+// CHECK-NOT: cir.func @_ZN13VariadicStrukC2Eiz
+
+// CHECK:      cir.func @_ZN13VariadicStrukC1Eiz(%arg0: !cir.ptr<!rec_VariadicStruk>
+// CHECK-SAME:                                   %arg1: !s32i
+// CHECK-SAME:                                   ...) {
+// CHECK-NEXT:   %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init]
+// CHECK-NEXT:   %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init]
+// CHECK-NEXT:   cir.store %arg0, %[[THIS_ADDR]]
+// CHECK-NEXT:   cir.store %arg1, %[[N_ADDR]]
+// CHECK-NEXT:   %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[N:.*]] = cir.load{{.*}} %[[N_ADDR]]
+// CHECK-NEXT:   %[[A_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "a"}
+// CHECK-NEXT:   cir.store{{.*}} %[[N]], %[[A_ADDR]]
+// CHECK-NEXT:   cir.return
+
+// CHECK:  cir.func @_Z3barv
+// CHECK-NEXT:    %[[S_ADDR:.*]] = cir.alloca !rec_VariadicStruk, !cir.ptr<!rec_VariadicStruk>, ["s", init]
+// CHECK-NEXT:    %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
+// CHECK-NEXT:    %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
+// CHECK-NEXT:    %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
+// CHECK-NEXT:    cir.call @_ZN13VariadicStrukC1Eiz(%[[S_ADDR]], %[[ONE]], %[[TWO]], %[[THREE]])
+// CHECK-NEXT:    cir.return

From 639c19ddb688595a69ad9f83a40aa32e2187134c Mon Sep 17 00:00:00 2001
From: "long.chen" <lipracer@gmail.com>
Date: Fri, 13 Jun 2025 00:26:26 +0800
Subject: [PATCH 262/851] [NFC][mlir] make the assert consistent with the
 declared behavior (#143874)

---
 mlir/include/mlir/ExecutionEngine/MemRefUtils.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
index 918647d9feac3..f355dfb8648ec 100644
--- a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
@@ -48,7 +48,8 @@ inline std::array<int64_t, N> makeStrides(ArrayRef<int64_t> shape) {
   std::array<int64_t, N> res;
   int64_t running = 1;
   for (int64_t idx = N - 1; idx >= 0; --idx) {
-    assert(shape[idx] && "size must be non-negative for all shape dimensions");
+    assert(shape[idx] >= 0 &&
+           "size must be non-negative for all shape dimensions");
     res[idx] = running;
     running *= shape[idx];
   }

From 56548e1d9b2ed4f5d2fe3913c27af770cf0e06e5 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Thu, 12 Jun 2025 09:19:58 -0700
Subject: [PATCH 263/851] [Matrix] Fix a crash in VisitSelectInst due to
 iteration length mismatch

---
 .../Scalar/LowerMatrixIntrinsics.cpp          |  9 ++-
 .../LowerMatrixIntrinsics/select.ll           | 61 +++++++++++++++++++
 2 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index b32160ff275b9..1e37f40fa9d52 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -2326,14 +2326,13 @@ class LowerMatrixIntrinsics {
     MatrixTy A = getMatrix(OpA, Shape, Builder);
     MatrixTy B = getMatrix(OpB, Shape, Builder);
 
-    Value *CondV[2];
+    SmallVector<Value*> CondV;
     if (isa<FixedVectorType>(Cond->getType())) {
       MatrixTy C = getMatrix(Cond, Shape, Builder);
-      CondV[0] = C.getVector(0);
-      CondV[1] = C.getVector(1);
+      llvm::copy(C.vectors(), std::back_inserter(CondV));
     } else {
-      CondV[0] = Cond;
-      CondV[1] = Cond;
+      CondV.resize(A.getNumVectors());
+      std::fill(CondV.begin(), CondV.end(), Cond);
     }
 
     for (auto [CV, AV, BV] : llvm::zip_equal(CondV, A.vectors(), B.vectors()))
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
index 70b0dfdb3e7e8..bd97915759aac 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/select.ll
@@ -144,3 +144,64 @@ define void @select_2x2_vcond_shape3(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
   store <4 x float> %op, ptr %out
   ret void
 }
+
+define void @select_2x2_vcond_shape4(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape4(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <4 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <4 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[COL_LOAD2:%.*]] = load <4 x float>, ptr [[RHS:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select <4 x i1> [[COL_LOAD1]], <4 x float> [[COL_LOAD]], <4 x float> [[COL_LOAD2]]
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = call <4 x i1> @llvm.matrix.column.major.load(ptr %cond, i64 4, i1 false, i32 4, i32 1)
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 4, i1 false, i32 4, i32 1)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}
+
+define void @select_2x2_vcond_shape5(ptr %cond, ptr %lhs, ptr %rhs, ptr %out) {
+; CHECK-LABEL: @select_2x2_vcond_shape5(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <1 x float>, ptr [[LHS:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 1
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <1 x float>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[LHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load <1 x float>, ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    [[VEC_GEP4:%.*]] = getelementptr float, ptr [[LHS]], i64 3
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load <1 x float>, ptr [[VEC_GEP4]], align 4
+; CHECK-NEXT:    [[COL_LOAD6:%.*]] = load <1 x i1>, ptr [[COND:%.*]], align 1
+; CHECK-NEXT:    [[VEC_GEP7:%.*]] = getelementptr i1, ptr [[COND]], i64 1
+; CHECK-NEXT:    [[COL_LOAD8:%.*]] = load <1 x i1>, ptr [[VEC_GEP7]], align 1
+; CHECK-NEXT:    [[VEC_GEP9:%.*]] = getelementptr i1, ptr [[COND]], i64 2
+; CHECK-NEXT:    [[COL_LOAD10:%.*]] = load <1 x i1>, ptr [[VEC_GEP9]], align 1
+; CHECK-NEXT:    [[VEC_GEP11:%.*]] = getelementptr i1, ptr [[COND]], i64 3
+; CHECK-NEXT:    [[COL_LOAD12:%.*]] = load <1 x i1>, ptr [[VEC_GEP11]], align 1
+; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load <1 x float>, ptr [[RHS:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP14:%.*]] = getelementptr float, ptr [[RHS]], i64 1
+; CHECK-NEXT:    [[COL_LOAD15:%.*]] = load <1 x float>, ptr [[VEC_GEP14]], align 4
+; CHECK-NEXT:    [[VEC_GEP16:%.*]] = getelementptr float, ptr [[RHS]], i64 2
+; CHECK-NEXT:    [[COL_LOAD17:%.*]] = load <1 x float>, ptr [[VEC_GEP16]], align 4
+; CHECK-NEXT:    [[VEC_GEP18:%.*]] = getelementptr float, ptr [[RHS]], i64 3
+; CHECK-NEXT:    [[COL_LOAD19:%.*]] = load <1 x float>, ptr [[VEC_GEP18]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = select <1 x i1> [[COL_LOAD6]], <1 x float> [[COL_LOAD]], <1 x float> [[COL_LOAD13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select <1 x i1> [[COL_LOAD8]], <1 x float> [[COL_LOAD1]], <1 x float> [[COL_LOAD15]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select <1 x i1> [[COL_LOAD10]], <1 x float> [[COL_LOAD3]], <1 x float> [[COL_LOAD17]]
+; CHECK-NEXT:    [[TMP4:%.*]] = select <1 x i1> [[COL_LOAD12]], <1 x float> [[COL_LOAD5]], <1 x float> [[COL_LOAD19]]
+; CHECK-NEXT:    store <1 x float> [[TMP1]], ptr [[OUT:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP20:%.*]] = getelementptr float, ptr [[OUT]], i64 1
+; CHECK-NEXT:    store <1 x float> [[TMP2]], ptr [[VEC_GEP20]], align 4
+; CHECK-NEXT:    [[VEC_GEP21:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <1 x float> [[TMP3]], ptr [[VEC_GEP21]], align 8
+; CHECK-NEXT:    [[VEC_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 3
+; CHECK-NEXT:    store <1 x float> [[TMP4]], ptr [[VEC_GEP22]], align 4
+; CHECK-NEXT:    ret void
+;
+  %lhsv = load <4 x float>, ptr %lhs
+  %condv = call <4 x i1> @llvm.matrix.column.major.load(ptr %cond, i64 1, i1 false, i32 1, i32 4)
+  %rhsv = call <4 x float> @llvm.matrix.column.major.load(ptr %rhs, i64 1, i1 false, i32 1, i32 4)
+  %op = select <4 x i1> %condv, <4 x float> %lhsv, <4 x float> %rhsv
+  store <4 x float> %op, ptr %out
+  ret void
+}

From 31daed868d69ac1ac6f6a29340d0b5e0e6dc39ab Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Thu, 12 Jun 2025 22:01:11 +0530
Subject: [PATCH 264/851] [RISCV] Prefer QC_EXTU to ANDI for certain 12-bit
 mask immediates (#143838)

`QC_EXTU` can be compressed to `QC_C_EXTU` when the immediate is a `mask
>=63`. We currently only handle masks that don't fit in 12-bits in
`RISCVISelDAGToDAG`.

I have added ISEL patterns in `RISCVInstrInfoXqci.td` instead of
changing code in `RISCVISelDAGToDAG` since the other extract
instructions ( in `XTHeadbb` and `XAndesPerf`) don't have compressed
versions and it is a lot easier to maintain things this way.
---
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td |  8 ++++
 llvm/test/CodeGen/RISCV/xqcibm-extract.ll   | 42 +++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index dba035bab928c..9f96a3ed80561 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1441,6 +1441,14 @@ let Predicates = [HasVendorXqcibm, IsRV32] in {
 def : Pat<(sext_inreg (i32 GPR:$rs1), i16), (QC_EXT GPR:$rs1, 16, 0)>;
 def : Pat<(sext_inreg (i32 GPR:$rs1), i8), (QC_EXT GPR:$rs1, 8, 0)>;
 def : Pat<(sext_inreg (i32 GPR:$rs1), i1), (QC_EXT GPR:$rs1, 1, 0)>;
+
+// Prefer qc.extu to andi for the following cases since the former can be compressed
+def : Pat<(i32 (and GPRNoX0:$rs, 63)), (QC_EXTU GPRNoX0:$rs, 6, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 127)), (QC_EXTU GPRNoX0:$rs, 7, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 255)), (QC_EXTU GPRNoX0:$rs, 8, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 511)), (QC_EXTU GPRNoX0:$rs, 9, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>;
+def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>;
 } // Predicates = [HasVendorXqcibm, IsRV32]
 
 let Predicates = [HasVendorXqciint, IsRV32] in
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
index cb01510058da4..edf6e9a2d5019 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll
@@ -247,6 +247,48 @@ define i32 @extu_from_and_i32(i32 %x) {
   ret i32 %a
 }
 
+define i32 @no_extu_from_and_i32(i32 %x) {
+; RV32I-LABEL: no_extu_from_and_i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a0, a0, 31
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: no_extu_from_and_i32:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    andi a0, a0, 31
+; RV32XQCIBM-NEXT:    ret
+  %a = and i32 %x, 31
+  ret i32 %a
+}
+
+define i32 @extu_from_and_i32_simm12_lb(i32 %x) {
+; RV32I-LABEL: extu_from_and_i32_simm12_lb:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a0, a0, 63
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: extu_from_and_i32_simm12_lb:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.extu a0, a0, 6, 0
+; RV32XQCIBM-NEXT:    ret
+  %a = and i32 %x, 63
+  ret i32 %a
+}
+
+define i32 @extu_from_and_i32_simm12_ub(i32 %x) {
+; RV32I-LABEL: extu_from_and_i32_simm12_ub:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    andi a0, a0, 2047
+; RV32I-NEXT:    ret
+;
+; RV32XQCIBM-LABEL: extu_from_and_i32_simm12_ub:
+; RV32XQCIBM:       # %bb.0:
+; RV32XQCIBM-NEXT:    qc.extu a0, a0, 11, 0
+; RV32XQCIBM-NEXT:    ret
+  %a = and i32 %x, 2047
+  ret i32 %a
+}
+
 define i64 @extu_from_and_i64(i64 %x) {
 ; RV32I-LABEL: extu_from_and_i64:
 ; RV32I:       # %bb.0:

From cd8facebabab9b61c6af1313cd1fd1e586bc2ba6 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Thu, 12 Jun 2025 18:36:03 +0200
Subject: [PATCH 265/851] [CIR] Implement folder for VecCreateOp (#143355)

This change adds a folder for the VecCreateOp

Issue https://github.com/llvm/llvm-project/issues/136487
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |   1 +
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  10 ++
 .../Dialect/Transforms/CIRCanonicalize.cpp    |   3 +-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |   4 +
 clang/test/CIR/CodeGen/vector-ext.cpp         | 136 ++++++------------
 clang/test/CIR/CodeGen/vector.cpp             | 127 ++++++----------
 .../CIR/Transforms/vector-create-fold.cir     |  19 +++
 7 files changed, 128 insertions(+), 172 deletions(-)
 create mode 100644 clang/test/CIR/Transforms/vector-create-fold.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 634f0dd554c77..194153caa9271 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2059,6 +2059,7 @@ def VecCreateOp : CIR_Op<"vec.create", [Pure]> {
   }];
 
   let hasVerifier = 1;
+  let hasFolder = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index a6cf0a6b5d75e..8ed0ee92574dc 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1533,6 +1533,16 @@ LogicalResult cir::GetMemberOp::verify() {
 // VecCreateOp
 //===----------------------------------------------------------------------===//
 
+OpFoldResult cir::VecCreateOp::fold(FoldAdaptor adaptor) {
+  if (llvm::any_of(getElements(), [](mlir::Value value) {
+        return !mlir::isa<cir::ConstantOp>(value.getDefiningOp());
+      }))
+    return {};
+
+  return cir::ConstVectorAttr::get(
+      getType(), mlir::ArrayAttr::get(getContext(), adaptor.getElements()));
+}
+
 LogicalResult cir::VecCreateOp::verify() {
   // Verify that the number of arguments matches the number of elements in the
   // vector, and that the type of all the arguments matches the type of the
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index 29f9942638964..6f8a64ce0251e 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -142,7 +142,8 @@ void CIRCanonicalizePass::runOnOperation() {
     // Many operations are here to perform a manual `fold` in
     // applyOpPatternsGreedily.
     if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SwitchOp, SelectOp, UnaryOp,
-            VecExtractOp, VecShuffleOp, VecShuffleDynamicOp, VecTernaryOp>(op))
+            VecCreateOp, VecExtractOp, VecShuffleOp, VecShuffleDynamicOp,
+            VecTernaryOp>(op))
       ops.push_back(op);
   });
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 1642d10d427b5..619e113202c9a 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -901,6 +901,10 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
       rewriter.eraseOp(op);
       return mlir::success();
     }
+  } else if (const auto vecTy = mlir::dyn_cast<cir::VectorType>(op.getType())) {
+    rewriter.replaceOp(op, lowerCirAttrAsValue(op, op.getValue(), rewriter,
+                                               getTypeConverter()));
+    return mlir::success();
   } else {
     return op.emitError() << "unsupported constant type " << op.getType();
   }
diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp
index 965c44c9461a8..fe4919ec0478d 100644
--- a/clang/test/CIR/CodeGen/vector-ext.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext.cpp
@@ -77,12 +77,8 @@ void foo() {
 // CIR: %[[VEC_F:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["f", init]
 // CIR: %[[VEC_G:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["g", init]
 // CIR: %[[VEC_H:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["h", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_E_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_E_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_E_VAL]], %[[VEC_E]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[GLOBAL_X:.*]] = cir.get_global @x : !cir.ptr<!s32i>
 // CIR: %[[X_VAL:.*]] = cir.load{{.*}} %[[GLOBAL_X]] : !cir.ptr<!s32i>, !s32i
@@ -95,13 +91,11 @@ void foo() {
 // CIR: %[[VEC_F_VAL:.*]] = cir.vec.create(%[[X_VAL]], %[[CONST_5]], %[[CONST_6]], %[[X_PLUS_1]] :
 // CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_F_VAL]], %[[VEC_F]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_5:.*]] = cir.const #cir.int<5> : !s32i
-// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: %[[VEC_G_VAL:.*]] = cir.vec.create(%[[CONST_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_G_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<0> : !s32i,
+// CIR-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_G_VAL]], %[[VEC_G]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: %[[VEC_H_VAL:.*]] = cir.vec.create(%[[ZERO]], %[[ZERO]], %[[ZERO]], %[[ZERO]] : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_H_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+// CIR-SAME; #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_H_VAL]], %[[VEC_H]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 
 // LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
@@ -148,12 +142,8 @@ void foo3() {
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[IDX:.*]] = cir.const #cir.int<1> : !s32i
@@ -184,12 +174,8 @@ void foo4() {
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
 // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
 // CIR: cir.store{{.*}} %[[CONST_IDX]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
@@ -225,12 +211,8 @@ void foo5() {
 }
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_VAL:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
@@ -260,12 +242,8 @@ void foo6() {
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
 // CIR: %[[VAL:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["value", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
 // CIR: cir.store{{.*}} %[[CONST_IDX]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
@@ -307,12 +285,8 @@ void foo7() {
 }
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_VAL:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
@@ -353,12 +327,8 @@ void foo8() {
 // CIR: %[[PLUS_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["plus_res", init]
 // CIR: %[[MINUS_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["minus_res", init]
 // CIR: %[[NOT_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["not_res", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP1:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[PLUS:.*]] = cir.unary(plus, %[[TMP1]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
@@ -410,19 +380,11 @@ void foo9() {
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shr", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_5:.*]] = cir.const #cir.int<5> : !s32i
-// CIR: %[[CONST_6:.*]] = cir.const #cir.int<6> : !s32i
-// CIR: %[[CONST_7:.*]] = cir.const #cir.int<7> : !s32i
-// CIR: %[[CONST_8:.*]] = cir.const #cir.int<8> : !s32i
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_5]], %[[CONST_6]], %[[CONST_7]], %[[CONST_8]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -475,9 +437,11 @@ void foo10() {
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} :  !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !u32i, #cir.int<6> : !u32i,
+// CIR-SAME: #cir.int<7> : !u32i, #cir.int<8> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
@@ -534,11 +498,11 @@ void foo11() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -661,11 +625,11 @@ void foo12() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -776,11 +740,11 @@ void foo13() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !u32i, !u32i, !u32i, !u32i) :
-// CIR-SAME: !cir.vector<4 x !u32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !u32i, #cir.int<2> : !u32i,
+// CIR-SAME: #cir.int<3> : !u32i, #cir.int<4> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !u32i, !u32i, !u32i, !u32i) :
-// CIR-SAME: !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !u32i, #cir.int<6> : !u32i,
+// CIR-SAME: #cir.int<7> : !u32i, #cir.int<8> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
@@ -891,11 +855,11 @@ void foo14() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !cir.float, !cir.float, !cir.float, !cir.float) :
-// CIR-SAME: !cir.vector<4 x !cir.float>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> : !cir.float,
+// CIR-SAME: #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !cir.float, !cir.float, !cir.float, !cir.float) :
-// CIR-SAME: !cir.vector<4 x !cir.float>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> : !cir.float,
+// CIR-SAME: #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
@@ -1105,24 +1069,16 @@ void foo18() {
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i,
+// CIR-SAME: #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i
 // CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i>
 // CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !u32i, #cir.int<2> : !u32i,
+// CIR-SAME: #cir.int<3> : !u32i, #cir.int<4> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
 // CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i
diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp
index 23e91724dc0f3..d0c5b83cd5b04 100644
--- a/clang/test/CIR/CodeGen/vector.cpp
+++ b/clang/test/CIR/CodeGen/vector.cpp
@@ -66,12 +66,8 @@ void foo() {
 // CIR: %[[VEC_E:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["e", init]
 // CIR: %[[VEC_F:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["f", init]
 // CIR: %[[VEC_G:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["g", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_D_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_D_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_D_VAL]], %[[VEC_D]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[GLOBAL_X:.*]] = cir.get_global @x : !cir.ptr<!s32i>
 // CIR: %[[X_VAL:.*]] = cir.load{{.*}} %[[GLOBAL_X]] : !cir.ptr<!s32i>, !s32i
@@ -84,14 +80,11 @@ void foo() {
 // CIR: %[[VEC_E_VAL:.*]] = cir.vec.create(%[[X_VAL]], %[[CONST_5]], %[[CONST_6]], %[[X_PLUS_1]] :
 // CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_E_VAL]], %[[VEC_E]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_5:.*]] = cir.const #cir.int<5> : !s32i
-// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: %[[VEC_F_VAL:.*]] = cir.vec.create(%[[CONST_5]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_F_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<0> : !s32i,
+// CIR-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_F_VAL]], %[[VEC_F]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i
-// CIR: %[[VEC_G_VAL:.*]] = cir.vec.create(%[[CONST_0]], %[[CONST_0]], %[[CONST_0]], %[[CONST_0]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_G_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+// CIR-SAME; #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_G_VAL]], %[[VEC_G]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 
 // LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16
@@ -136,12 +129,8 @@ void foo3() {
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[IDX:.*]] = cir.const #cir.int<1> : !s32i
@@ -172,12 +161,8 @@ void foo4() {
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
 // CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["e", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
 // CIR: cir.store{{.*}} %[[CONST_IDX]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
@@ -213,12 +198,8 @@ void foo5() {
 }
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_VAL:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
@@ -248,12 +229,8 @@ void foo6() {
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[IDX:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["idx", init]
 // CIR: %[[VAL:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["value", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
 // CIR: cir.store{{.*}} %[[CONST_IDX]], %[[IDX]] : !s32i, !cir.ptr<!s32i>
@@ -295,12 +272,8 @@ void foo7() {
 }
 
 // CIR: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[CONST_VAL:.*]] = cir.const #cir.int<5> : !s32i
 // CIR: %[[CONST_IDX:.*]] = cir.const #cir.int<2> : !s32i
@@ -341,12 +314,8 @@ void foo8() {
 // CIR: %[[PLUS_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["plus_res", init]
 // CIR: %[[MINUS_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["minus_res", init]
 // CIR: %[[NOT_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["not_res", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP1:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[PLUS:.*]] = cir.unary(plus, %[[TMP1]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
@@ -398,9 +367,11 @@ void foo9() {
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shr", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} :  !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -453,9 +424,11 @@ void foo10() {
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} :  !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !u32i, #cir.int<6> : !u32i,
+// CIR-SAME: #cir.int<7> : !u32i, #cir.int<8> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
@@ -512,11 +485,11 @@ void foo11() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -639,11 +612,11 @@ void foo12() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !s32i, !s32i, !s32i, !s32i) :
-// CIR-SAME: !cir.vector<4 x !s32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !s32i, #cir.int<6> : !s32i,
+// CIR-SAME: #cir.int<7> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
@@ -754,11 +727,11 @@ void foo13() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !u32i, !u32i, !u32i, !u32i) :
-// CIR-SAME: !cir.vector<4 x !u32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !u32i, #cir.int<2> : !u32i,
+// CIR-SAME: #cir.int<3> : !u32i, #cir.int<4> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !u32i, !u32i, !u32i, !u32i) :
-// CIR-SAME: !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<5> : !u32i, #cir.int<6> : !u32i,
+// CIR-SAME: #cir.int<7> : !u32i, #cir.int<8> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
@@ -869,11 +842,11 @@ void foo14() {
 
 // CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["a", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>, ["b", init]
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !cir.float, !cir.float, !cir.float, !cir.float) :
-// CIR-SAME: !cir.vector<4 x !cir.float>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> : !cir.float,
+// CIR-SAME: #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create({{.*}}, {{.*}}, {{.*}}, {{.*}} : !cir.float, !cir.float, !cir.float, !cir.float) :
-// CIR-SAME: !cir.vector<4 x !cir.float>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00> : !cir.float,
+// CIR-SAME: #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
@@ -1083,24 +1056,16 @@ void foo18() {
 // CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>, ["shl", init]
 // CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["b", init]
 // CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>, ["shr", init]
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i
-// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+// CIR: %[[VEC_A_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i,
+// CIR-SAME: #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
 // CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
 // CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i
 // CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i>
 // CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i>
 // CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i
-// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i
-// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i
-// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] :
-// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i>
+// CIR: %[[VEC_B_VAL:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !u32i, #cir.int<2> : !u32i,
+// CIR-SAME: #cir.int<3> : !u32i, #cir.int<4> : !u32i]> : !cir.vector<4 x !u32i>
 // CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr<!cir.vector<4 x !u32i>>
 // CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr<!cir.vector<4 x !u32i>>, !cir.vector<4 x !u32i>
 // CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i
diff --git a/clang/test/CIR/Transforms/vector-create-fold.cir b/clang/test/CIR/Transforms/vector-create-fold.cir
new file mode 100644
index 0000000000000..fb8f66dc4debc
--- /dev/null
+++ b/clang/test/CIR/Transforms/vector-create-fold.cir
@@ -0,0 +1,19 @@
+// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_create_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %2 = cir.const #cir.int<1> : !s32i
+    %3 = cir.const #cir.int<2> : !s32i
+    %4 = cir.const #cir.int<3> : !s32i
+    %5 = cir.const #cir.int<4> : !s32i
+    %vec = cir.vec.create(%2, %3, %4, %5 : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i>
+    cir.return %vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_create_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[VEC:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<2> : !s32i,
+  // CHECK-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[VEC]] : !cir.vector<4 x !s32i>
+}

From ae7ea6e3a28c017485cc2401703d6fab1549123d Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Thu, 12 Jun 2025 09:38:43 -0700
Subject: [PATCH 266/851] [libc] Fix ioctl errno inclusion (#143928)

Since errno was moved in
https://github.com/llvm/llvm-project/pull/143187 the code including it
in https://github.com/llvm/llvm-project/pull/141393 was rendered
incorrect. This patch fixes the include and the cmake depends.
---
 libc/src/sys/ioctl/linux/ioctl.cpp           | 2 +-
 libc/test/src/sys/ioctl/linux/CMakeLists.txt | 3 ++-
 libc/test/src/sys/ioctl/linux/ioctl_test.cpp | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/libc/src/sys/ioctl/linux/ioctl.cpp b/libc/src/sys/ioctl/linux/ioctl.cpp
index f03fea21c75bd..9bb669c6a6f66 100644
--- a/libc/src/sys/ioctl/linux/ioctl.cpp
+++ b/libc/src/sys/ioctl/linux/ioctl.cpp
@@ -10,7 +10,7 @@
 
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
 
diff --git a/libc/test/src/sys/ioctl/linux/CMakeLists.txt b/libc/test/src/sys/ioctl/linux/CMakeLists.txt
index e5095c54a729f..2df67e9d9cbde 100644
--- a/libc/test/src/sys/ioctl/linux/CMakeLists.txt
+++ b/libc/test/src/sys/ioctl/linux/CMakeLists.txt
@@ -7,7 +7,7 @@ add_libc_unittest(
   SRCS
     ioctl_test.cpp
   DEPENDS
-    libc.hdr.ioctl_macros
+    libc.hdr.sys_ioctl_macros
     libc.src.sys.ioctl.ioctl
     libc.src.errno.errno
     libc.src.fcntl.open
@@ -15,3 +15,4 @@ add_libc_unittest(
     libc.src.unistd.read
     libc.src.unistd.write
 )
+
diff --git a/libc/test/src/sys/ioctl/linux/ioctl_test.cpp b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
index 9c56a4689b186..b76dc14824c95 100644
--- a/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
+++ b/libc/test/src/sys/ioctl/linux/ioctl_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "src/errno/libc_errno.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/sys/ioctl/ioctl.h"
 #include "src/unistd/close.h"

From e65131a56335fc6b8e47c609f17df50ea65577b4 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 12 Jun 2025 09:49:06 -0700
Subject: [PATCH 267/851] MC,test: Specify explicit triple for include.ll

The output is subject to .set or = difference.
---
 llvm/test/MC/AsmParser/include.ll | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/test/MC/AsmParser/include.ll b/llvm/test/MC/AsmParser/include.ll
index 22c9eaf7a36e9..625fdd9e1e595 100644
--- a/llvm/test/MC/AsmParser/include.ll
+++ b/llvm/test/MC/AsmParser/include.ll
@@ -1,6 +1,4 @@
-; RUN: llc -I %p/Inputs -filetype asm -o - %s | FileCheck %s
-; UNSUPPORTED: target={{.*}}-zos{{.*}},target=nvptx{{.*}}
-; REQUIRES: default_triple
+; RUN: llc -mtriple=x86_64 -I %p/Inputs -filetype asm -o - %s | FileCheck %s
 
 module asm ".include \22module.x\22"
 

From 2c20bc5112a18a8a893e8caea6fd59c097754d74 Mon Sep 17 00:00:00 2001
From: fairywreath <65404740+fairywreath@users.noreply.github.com>
Date: Thu, 12 Jun 2025 12:54:42 -0400
Subject: [PATCH 268/851] [mlir][spirv] Add definitions for GL FindILsb and
 FindSMsb (#143916)

Adds SPIRV GL FindILsb and FindSMsb instructions which correspond to GL
instruction numbers 73 and 74.
---
 .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td       | 35 ++++++++++
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 66 ++++++++++++++++++-
 mlir/test/Target/SPIRV/gl-ops.mlir            | 19 +++++-
 3 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
index 7ffe0c8da1cae..2ec61758ba8ef 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
@@ -1274,6 +1274,41 @@ def SPIRV_GLReflectOp : SPIRV_GLBinaryArithmeticOp<"Reflect", 71, SPIRV_Float> {
 
 // ----
 
+def SPIRV_GLFindILsbOp : SPIRV_GLUnaryArithmeticOp<"FindILsb", 73, SPIRV_Integer> {
+  let summary = "Integer least-significant bit";
+
+  let description = [{
+    Results in the bit number of the least-significant 1-bit in the binary
+    representation of Value. If Value is 0, the result is -1.
+
+    Result Type and the type of Value must both be integer scalar or
+    integer vector types. Result Type and operand types must have the
+    same number of components with the same component width. Results are
+    computed per component.
+  }];
+}
+
+// ----
+
+def SPIRV_GLFindSMsbOp : SPIRV_GLUnaryArithmeticOp<"FindSMsb", 74, SPIRV_Int32> {
+  let summary = "Signed-integer most-significant bit, with Value interpreted as a signed integer";
+
+  let description = [{
+    For positive numbers, the result will be the bit number of the most significant
+    1-bit. For negative numbers, the result will be the bit number of the most
+    significant 0-bit. For a Value of 0 or -1, the result is -1.
+
+    Result Type and the type of Value must both be integer scalar or
+    integer vector types. Result Type and operand types must have the
+    same number of components with the same component width. Results are
+    computed per component.
+
+    This instruction is currently limited to 32-bit width components.
+  }];
+}
+
+// ----
+
 def SPIRV_GLFindUMsbOp : SPIRV_GLUnaryArithmeticOp<"FindUMsb", 75, SPIRV_Int32> {
   let summary = "Unsigned-integer most-significant bit";
 
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index fbcf2095dc608..2b75767feaf92 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -567,7 +567,71 @@ func.func @fmix_vector(%arg0 : vector<3xf32>, %arg1 : vector<3xf32>, %arg2 : vec
 // -----
 
 //===----------------------------------------------------------------------===//
-// spirv.GL.Exp
+// spirv.GL.FindILsb
+//===----------------------------------------------------------------------===//
+
+func.func @findimsb_scalar_i32(%arg0 : i32) -> () {
+  // CHECK: spirv.GL.FindILsb {{%.*}} : i32
+  %2 = spirv.GL.FindILsb %arg0 : i32
+  return
+}
+
+func.func @findimsb_vector_i32(%arg0 : vector<3xi32>) -> () {
+  // CHECK: spirv.GL.FindILsb {{%.*}} : vector<3xi32>
+  %2 = spirv.GL.FindILsb %arg0 : vector<3xi32>
+  return
+}
+
+func.func @findimsb_scalar_i16(%arg0 : i16) -> () {
+  // CHECK: spirv.GL.FindILsb {{%.*}} : i16
+  %2 = spirv.GL.FindILsb %arg0 : i16
+  return
+}
+
+func.func @findimsb_vector_i64(%arg0 : vector<3xi64>) -> () {
+  // CHECK: spirv.GL.FindILsb {{%.*}} : vector<3xi64>
+  %2 = spirv.GL.FindILsb %arg0 : vector<3xi64>
+  return
+}
+
+// -----
+
+func.func @findimsb_error_scalar_float(%arg0 : f32) -> () {
+  // expected-error @+1 {{operand #0 must be 8/16/32/64-bit integer or vector of 8/16/32/64-bit integer values of length 2/3/4/8/1}}
+  %2 = spirv.GL.FindILsb %arg0 : f32
+  return
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.FindSMsb
+//===----------------------------------------------------------------------===//
+
+func.func @findsmsb_scalar(%arg0 : i32) -> () {
+  // CHECK: spirv.GL.FindSMsb {{%.*}} : i32
+  %2 = spirv.GL.FindSMsb %arg0 : i32
+  return
+}
+
+func.func @findsmsb_vector(%arg0 : vector<3xi32>) -> () {
+  // CHECK: spirv.GL.FindSMsb {{%.*}} : vector<3xi32>
+  %2 = spirv.GL.FindSMsb %arg0 : vector<3xi32>
+  return
+}
+
+// -----
+
+func.func @findsmsb_error_scalar_i64(%arg0 : i64) -> () {
+  // expected-error @+1 {{operand #0 must be Int32 or vector of Int32}}
+  %2 = spirv.GL.FindSMsb %arg0 : i64
+  return
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.FindUMsb
 //===----------------------------------------------------------------------===//
 
 func.func @findumsb(%arg0 : i32) -> () {
diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir
index e4a6c6fb5a34e..eacf36bfba9ce 100644
--- a/mlir/test/Target/SPIRV/gl-ops.mlir
+++ b/mlir/test/Target/SPIRV/gl-ops.mlir
@@ -90,13 +90,24 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     spirv.Return
   }
 
+  spirv.func @findilsb(%arg0 : i32) "None" {
+    // CHECK: spirv.GL.FindILsb {{%.*}} : i32
+    %2 = spirv.GL.FindILsb %arg0 : i32
+    spirv.Return
+  }
+  spirv.func @findsmsb(%arg0 : i32) "None" {
+    // CHECK: spirv.GL.FindSMsb {{%.*}} : i32
+    %2 = spirv.GL.FindSMsb %arg0 : i32
+    spirv.Return
+  }
+
   spirv.func @findumsb(%arg0 : i32) "None" {
     // CHECK: spirv.GL.FindUMsb {{%.*}} : i32
     %2 = spirv.GL.FindUMsb %arg0 : i32
     spirv.Return
   }
 
-  spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>) "None" {
+  spirv.func @vector(%arg0 : f32, %arg1 : vector<3xf32>, %arg2 : vector<3xf32>, %arg3: vector<3xi32>) "None" {
     // CHECK: {{%.*}} = spirv.GL.Cross {{%.*}}, {{%.*}} : vector<3xf32>
     %0 = spirv.GL.Cross %arg1, %arg2 : vector<3xf32>
     // CHECK: {{%.*}} = spirv.GL.Normalize {{%.*}} : f32
@@ -111,6 +122,12 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %5 = spirv.GL.Distance %arg0, %arg0 : f32, f32 -> f32
     // CHECK: {{%.*}} = spirv.GL.Distance {{%.*}}, {{%.*}} : vector<3xf32>, vector<3xf32> -> f32
     %6 = spirv.GL.Distance %arg1, %arg2 : vector<3xf32>, vector<3xf32> -> f32
+    // CHECK: {{%.*}} = spirv.GL.FindILsb {{%.*}} : vector<3xi32>
+    %7 = spirv.GL.FindILsb %arg3 : vector<3xi32>
+    // CHECK: {{%.*}} = spirv.GL.FindSMsb {{%.*}} : vector<3xi32>
+    %8 = spirv.GL.FindSMsb %arg3 : vector<3xi32>
+    // CHECK: {{%.*}} = spirv.GL.FindUMsb {{%.*}} : vector<3xi32>
+    %9 = spirv.GL.FindUMsb %arg3 : vector<3xi32>
     spirv.Return
   }
 

From 1a4cf1d3edff2d4c790f597834301702cfc6dc15 Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Thu, 12 Jun 2025 10:07:45 -0700
Subject: [PATCH 269/851] [lldb][headers] Create Python script to fix up
 framework headers (#142051)

This commit replaces the shell script that fixes up includes for the
LLDB framework with a Python script. This script will also be used when
fixing up includes for the LLDBRPC.framework.
---
 lldb/cmake/modules/LLDBFramework.cmake        |  42 +++---
 lldb/scripts/framework-header-fix.py          | 126 ++++++++++++++++++
 .../Shell/Scripts/Inputs/Main/SBAddress.h     |  13 ++
 .../Shell/Scripts/Inputs/RPC/RPCSBAddress.h   |   9 ++
 .../Shell/Scripts/TestFrameworkFixScript.test |  11 ++
 .../Scripts/TestFrameworkFixUnifdef.test      |  12 ++
 .../Scripts/TestRPCFrameworkFixScript.test    |  14 ++
 7 files changed, 206 insertions(+), 21 deletions(-)
 create mode 100755 lldb/scripts/framework-header-fix.py
 create mode 100644 lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
 create mode 100644 lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
 create mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixScript.test
 create mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
 create mode 100644 lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test

diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index 8961b1afe93ad..70010ffbf738c 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -68,24 +68,17 @@ if(NOT APPLE_EMBEDDED)
   )
 endif()
 
-# At configuration time, collect headers for the framework bundle and copy them
-# into a staging directory. Later we can copy over the entire folder.
-file(GLOB public_headers ${LLDB_SOURCE_DIR}/include/lldb/API/*.h)
-set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h)
-file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h)
-file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h)
-list(REMOVE_ITEM root_public_headers ${root_private_headers})
-
 find_program(unifdef_EXECUTABLE unifdef)
 
-set(lldb_header_staging ${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders)
-foreach(header
-    ${public_headers}
-    ${generated_public_headers}
-    ${root_public_headers})
+# All necessary header files will be staged in the include directory in the build directory,
+# so just copy the files from there into the framework's staging directory.
+set(lldb_build_dir_header_staging "${CMAKE_BINARY_DIR}/include/lldb")
+set(lldb_framework_header_staging "${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders")
+file(GLOB lldb_build_dir_header_staging_list ${lldb_build_dir_header_staging}/*)
+foreach(header ${lldb_build_dir_header_staging_list})
 
   get_filename_component(basename ${header} NAME)
-  set(staged_header ${lldb_header_staging}/${basename})
+  set(staged_header ${lldb_framework_header_staging}/${basename})
 
   if(unifdef_EXECUTABLE)
     # unifdef returns 0 when the file is unchanged and 1 if something was changed.
@@ -112,13 +105,20 @@ set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "LLDB/Resources
 add_dependencies(liblldb-resource-headers liblldb-header-staging)
 add_dependencies(liblldb liblldb-resource-headers)
 
-# At build time, copy the staged headers into the framework bundle (and do
-# some post-processing in-place).
-add_custom_command(TARGET liblldb POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy_directory ${lldb_header_staging} $<TARGET_FILE_DIR:liblldb>/Headers
-  COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.sh $<TARGET_FILE_DIR:liblldb>/Headers ${LLDB_VERSION}
-  COMMENT "LLDB.framework: copy framework headers"
-)
+# Take the headers from the staging directory and fix up their includes for the framework.
+# Then write them to the output directory.
+# Also, run unifdef to remove any specified guards from the header files.
+file(GLOB lldb_framework_header_staging_list ${lldb_framework_header_staging}/*)
+foreach(header ${lldb_framework_header_staging_list})
+
+  set(input_header ${header})
+  set(output_header $<TARGET_FILE_DIR:liblldb>/Headers/${input_header})
+
+  add_custom_command(TARGET liblldb POST_BUILD
+    COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.py -f lldb_main -i ${input_header} -o ${output_header} -p ${unifdef_EXECUTABLE} USWIG
+    COMMENT "LLDB.framework: Fix up and copy framework headers"
+  )
+endforeach()
 
 # Copy vendor-specific headers from clang (without staging).
 if(NOT APPLE_EMBEDDED)
diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py
new file mode 100755
index 0000000000000..9e4e5f860a2c0
--- /dev/null
+++ b/lldb/scripts/framework-header-fix.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+
+"""
+Usage: <path/to/input-directory> <path/to/output-directory>
+
+This script is used when building LLDB.framework or LLDBRPC.framework. For each framework, local includes are converted to their respective framework includes.
+
+This script is used in 2 ways:
+1. It is used on header files that are copied into LLDB.framework. For these files, local LLDB includes are converted into framework includes, e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>.
+
+2. It is used on header files for LLDBRPC.framework. For these files, includes of RPC common files will be converted to framework includes, e.g. #include <lldb-rpc/common/RPCCommon.h> -> #include <LLDBRPC/RPCCommon.h>. It will also change local includes to framework includes, e.g. #include "SBAddress.h" -> #include <LLDBRPC/SBAddress.h>
+"""
+
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+
+# Main header regexes
+INCLUDE_FILENAME_REGEX = re.compile(
+    r'#include "lldb/API/(?P<include_filename>.*){0,1}"'
+)
+
+# RPC header regexes
+RPC_COMMON_REGEX = re.compile(r"#include <lldb-rpc/common/(?P<include_filename>.*)>")
+RPC_INCLUDE_FILENAME_REGEX = re.compile(r'#include "(?P<include_filename>.*)"')
+
+
+def modify_rpc_includes(input_file_path, output_file_path):
+    with open(input_file_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+        with open(output_file_path, "w") as output_file:
+            # Local includes must be changed to RPC framework level includes.
+            # e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
+            # Also, RPC common code includes must change to RPC framework level includes.
+            # e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
+            rpc_common_matches = RPC_COMMON_REGEX.finditer(file_buffer)
+            rpc_include_filename_matches = RPC_INCLUDE_FILENAME_REGEX.finditer(
+                file_buffer
+            )
+            for match in rpc_common_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+            for match in rpc_include_filename_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+            output_file.write(file_buffer)
+
+
+def modify_main_includes(input_file_path, output_file_path):
+    with open(input_file_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+        with open(output_file_path, "w") as output_file:
+            # Local includes must be changed to framework level includes.
+            # e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+            regex_matches = INCLUDE_FILENAME_REGEX.finditer(file_buffer)
+            for match in regex_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDB/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+                output_file.write(file_buffer)
+
+
+def remove_guards(output_file_path, unifdef_path, unifdef_guards):
+    # The unifdef path should be passed in from CMake. If it wasn't there in CMake or is incorrect,
+    # find it using shutil. If shutil can't find it, then exit.
+    if not shutil.which(unifdef_path):
+        unifdef_path = shutil.which("unifdef")
+    if not unifdef_path:
+        print(
+            "Unable to find unifdef executable. Guards will not be removed from input files. Exiting..."
+        )
+        sys.exit(1)
+
+    subprocess_command = (
+        [unifdef_path, "-o", output_file_path] + unifdef_guards + [output_file_path]
+    )
+    subprocess.run(subprocess_command)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", "--framework", choices=["lldb_main", "lldb_rpc"])
+    parser.add_argument("-i", "--input_file")
+    parser.add_argument("-o", "--output_file")
+    parser.add_argument("-p", "--unifdef_path")
+    parser.add_argument(
+        "unifdef_guards",
+        nargs="+",
+        type=str,
+        help="Guards to be removed with unifdef. These must be specified in the same way as they would be when passed directly into unifdef.",
+    )
+    args = parser.parse_args()
+    input_file_path = str(args.input_file)
+    output_file_path = str(args.output_file)
+    framework_version = args.framework
+    unifdef_path = str(args.unifdef_path)
+    # Prepend dashes to the list of guards passed in from the command line.
+    # unifdef takes the guards to remove as arguments in their own right (e.g. -USWIG)
+    # but passing them in with dashes for this script causes argparse to think that they're
+    # arguments in and of themself, so they need to passed in without dashes.
+    unifdef_guards = ["-" + guard for guard in args.unifdef_guards]
+
+    if framework_version == "lldb_main":
+        modify_main_includes(input_file_path, output_file_path)
+    if framework_version == "lldb_rpc":
+        modify_rpc_includes(input_file_path, output_file_path)
+    # After the incldues have been modified, run unifdef on the headers to remove any guards
+    # specified at the command line.
+    remove_guards(output_file_path, unifdef_path, unifdef_guards)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
new file mode 100644
index 0000000000000..fecc69687cd74
--- /dev/null
+++ b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
@@ -0,0 +1,13 @@
+// This is a truncated version of an SB API file
+// used to test framework-header-fix.py to make sure the includes are correctly fixed
+// up for the LLDB.framework.
+
+// Local includes must be changed to framework level includes.
+// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+#include "lldb/API/SBDefines.h"
+#include "lldb/API/SBModule.h"
+
+// Any include guards specified at the command line must be removed.
+#ifndef SWIG
+int a = 10
+#endif
diff --git a/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
new file mode 100644
index 0000000000000..556afa38a9225
--- /dev/null
+++ b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
@@ -0,0 +1,9 @@
+// This is a truncated version of an SB API file generated by lldb-rpc-gen
+// used to test framework-header-fix.py to make sure the includes are correctly fixed
+// up for the LLDBRPC.framework.
+
+// Local includes must be changed to framework level includes.
+// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+#include "LLDBRPC.h"
+#include "SBDefines.h"
+#include <lldb-rpc/common/RPCPublic.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
new file mode 100644
index 0000000000000..e90c3bdfc5adb
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
@@ -0,0 +1,11 @@
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
+
+# Local includes must be changed to framework level includes.
+# e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+CHECK: #include <LLDB/SBDefines.h>
+CHECK: #include <LLDB/SBModule.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
new file mode 100644
index 0000000000000..a7e82d2f3640c
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
@@ -0,0 +1,12 @@
+# REQUIRES: system-darwin
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
+
+# Any include guards specified at the command line must be removed.
+CHECK-NOT: #ifndef SWIG
+CHECK: int a = 10
+CHECK-NOT: #endif
diff --git a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
new file mode 100644
index 0000000000000..8ba03a8c2afa8
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
@@ -0,0 +1,14 @@
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_rpc -i %p/Inputs/Main/RPCSBAddress.h -o %t/Outputs/RPCSBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/RPCSBAddress.h | FileCheck %s
+
+# Local includes must be changed to RPC framework level includes.
+# e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
+# Also, RPC common code includes must change to RPC framework level includes.
+# e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
+CHECK: #include <LLDBRPC/RPCPublic.h>
+CHECK: #include <LLDBRPC/SBDefines.h>
+CHECK: #include <LLDBRPC/LLDBRPC.h>

From 217304a09949de73a8def5ee4c7ed9510449ce4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=83=E5=9C=8B=E5=BA=AD?= <we3223@gmail.com>
Date: Fri, 13 Jun 2025 01:08:07 +0800
Subject: [PATCH 270/851] [X86] Use X86FixupInstTunings to select between
 (V)MOVSS/D and (V)BLENDPS/D (#143895)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix https://github.com/llvm/llvm-project/issues/142588
Following @RKSimon’s suggestion, the transformation applies only when
the blend mask is exactly 1, indicating that the instruction behaves
like a move. Additionally, the conversion will only be performed when
optimizing for size or when the target prefers MOVSS/D over BLENDPS/D
for performance reasons.

The switch-case instructions were identified with GPT O.O .

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    | 15 ++++
 .../test/CodeGen/X86/2012-01-12-extract-sv.ll |  4 +-
 llvm/test/CodeGen/X86/avx-insertelt.ll        |  6 +-
 .../X86/avx512-intrinsics-fast-isel.ll        |  4 +-
 .../CodeGen/X86/avx512-intrinsics-upgrade.ll  |  2 +-
 llvm/test/CodeGen/X86/avx512-intrinsics.ll    |  2 +-
 .../test/CodeGen/X86/avx512copy-intrinsics.ll |  2 +-
 llvm/test/CodeGen/X86/build-vector-512.ll     |  6 +-
 llvm/test/CodeGen/X86/buildvec-extract.ll     |  6 +-
 .../CodeGen/X86/canonicalize-vars-f16-type.ll |  8 +-
 .../CodeGen/X86/coalesce_commute_movsd.ll     |  4 +-
 llvm/test/CodeGen/X86/combine-and.ll          |  4 +-
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   | 80 ++++++++++++++-----
 .../CodeGen/X86/fminimumnum-fmaximumnum.ll    |  2 +-
 llvm/test/CodeGen/X86/fmsubadd-combine.ll     |  4 +-
 .../test/CodeGen/X86/fp-strict-scalar-fp16.ll | 14 ++--
 .../X86/fp-strict-scalar-inttofp-fp16.ll      | 24 +++---
 .../X86/fp-strict-scalar-round-fp16.ll        | 14 ++--
 llvm/test/CodeGen/X86/half-constrained.ll     |  6 +-
 llvm/test/CodeGen/X86/half-darwin.ll          |  2 +-
 llvm/test/CodeGen/X86/insertelement-zero.ll   |  4 +-
 llvm/test/CodeGen/X86/masked_expandload.ll    |  2 +-
 llvm/test/CodeGen/X86/masked_gather.ll        | 12 +--
 .../test/CodeGen/X86/masked_gather_scatter.ll |  2 +-
 llvm/test/CodeGen/X86/masked_load.ll          |  2 +-
 llvm/test/CodeGen/X86/pr40730.ll              |  4 +-
 llvm/test/CodeGen/X86/scalarize-fp.ll         |  2 +-
 .../CodeGen/X86/sse-insertelt-from-mem.ll     |  2 +-
 llvm/test/CodeGen/X86/sse-insertelt.ll        |  2 +-
 .../CodeGen/X86/sse-intrinsics-fast-isel.ll   | 16 ++--
 llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll  | 16 ++--
 .../CodeGen/X86/sse2-intrinsics-fast-isel.ll  |  8 +-
 llvm/test/CodeGen/X86/sse2.ll                 |  2 +-
 llvm/test/CodeGen/X86/sse41.ll                | 12 +--
 .../test/CodeGen/X86/stack-folding-fp-avx1.ll |  4 +-
 llvm/test/CodeGen/X86/vec-strict-128-fp16.ll  |  2 +-
 .../X86/vec-strict-fptoint-128-fp16.ll        | 32 ++++----
 llvm/test/CodeGen/X86/vec_extract-avx.ll      |  8 +-
 llvm/test/CodeGen/X86/vec_floor.ll            | 68 ++++++++--------
 llvm/test/CodeGen/X86/vec_ss_load_fold.ll     |  8 +-
 llvm/test/CodeGen/X86/vector-blend.ll         |  2 +-
 .../CodeGen/X86/vector-half-conversions.ll    |  4 +-
 .../vector-interleaved-store-i32-stride-5.ll  |  4 +-
 .../vector-interleaved-store-i32-stride-7.ll  |  2 +-
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll |  2 +-
 .../test/CodeGen/X86/vector-shuffle-128-v4.ll | 16 ++--
 .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 32 +++-----
 .../test/CodeGen/X86/vector-shuffle-256-v8.ll |  4 +-
 .../X86/vector-shuffle-combining-avx2.ll      |  2 +-
 .../X86/vector-shuffle-combining-ssse3.ll     |  4 +-
 .../X86/vector-shuffle-combining-xop.ll       |  2 +-
 llvm/test/CodeGen/X86/vector-zmov.ll          |  2 +-
 llvm/test/CodeGen/X86/vselect.ll              |  8 +-
 53 files changed, 272 insertions(+), 229 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 6bb7600dedcac..fd13305d8a73d 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -222,7 +222,22 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKToIntDomain(NewOpc);
   };
 
+  auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
+    if (MI.getOperand(NumOperands - 1).getImm() != 1)
+      return false;
+    bool Force = MF.getFunction().hasOptSize();
+    if (!Force && !NewOpcPreferable(MovOpc))
+      return false;
+    MI.setDesc(TII->get(MovOpc));
+    MI.removeOperand(NumOperands - 1);
+    return true;
+  };
+
   switch (Opc) {
+  case X86::VBLENDPSrri:
+    return ProcessBLENDToMOV(X86::VMOVSSrr);
+  case X86::VBLENDPDrri:
+    return ProcessBLENDToMOV(X86::VMOVSDrr);
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
   case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 254a53fcac4de..65273870c3dfb 100644
--- a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -11,7 +11,7 @@ define void @endless_loop() {
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vmovaps %ymm0, (%eax)
 ; AVX1-NEXT:    vmovaps %ymm1, (%eax)
 ; AVX1-NEXT:    vzeroupper
@@ -21,7 +21,7 @@ define void @endless_loop() {
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vbroadcastss (%eax), %xmm0
 ; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 95a3169a5b161..02e6c9649c9a1 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -94,7 +94,7 @@ define <8 x float> @insert_f32_firstelt_of_high_subvector(<8 x float> %x, float
 ; AVX-LABEL: insert_f32_firstelt_of_high_subvector:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
 ;
@@ -202,9 +202,9 @@ define <4 x i64> @insert_i64_firstelt_of_high_subvector(<4 x i64> %x, i64 %s) {
 define <8 x float> @insert_f32_firstelts(<8 x float> %x, float %s) {
 ; AVX-LABEL: insert_f32_firstelts:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index a8574c0b7516c..30bf1a261f4b7 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1843,7 +1843,7 @@ define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
 ; X86-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
 ; X86-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
 ; X86-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
-; X86-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_cvtu64_sd:
@@ -1891,7 +1891,7 @@ define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
 ; X86-NEXT:    fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
 ; X86-NEXT:    fstps {{[0-9]+}}(%esp)
 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    .cfi_def_cfa %esp, 4
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index c1ef500d9d3de..aae48aba93be6 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -10483,7 +10483,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
-; CHECK-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; CHECK-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %q = load float, ptr %ptr_b
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 926af4e9957af..f9b5994a18d36 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -6505,7 +6505,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %q = load float, ptr %ptr_b
   %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
diff --git a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
index a7ca23792e6fe..a2af7df44010e 100644
--- a/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512copy-intrinsics.ll
@@ -11,7 +11,7 @@ define <4 x i32> @test_mm_move_epi32(<4 x i32> %a0) nounwind {
 ; NOAVX512MOVZXC-LABEL: test_mm_move_epi32:
 ; NOAVX512MOVZXC:       # %bb.0:
 ; NOAVX512MOVZXC-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; NOAVX512MOVZXC-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; NOAVX512MOVZXC-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
 ; NOAVX512MOVZXC-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOAVX512MOVZXC-NEXT:    retq # encoding: [0xc3]
   %res = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index 789196c5e4848..69d17fe3ab69f 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -578,7 +578,7 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) {
 ; AVX-32-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,17,0,0]
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
-; AVX-32-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
 ; AVX-32-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm2[0]
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
@@ -626,7 +626,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
 ; AVX-32-NEXT:    vbroadcastss (%ecx), %xmm1
 ; AVX-32-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
-; AVX-32-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
+; AVX-32-NEXT:    vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
 ; AVX-32-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
 ; AVX-32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
@@ -640,7 +640,7 @@ define <16 x float> @test_buildvector_16f32_2_load(ptr %p0, ptr %p1) {
 ; AVX-64-NEXT:    vbroadcastss (%rdi), %xmm1
 ; AVX-64-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-64-NEXT:    vpermi2ps %zmm1, %zmm2, %zmm0
-; AVX-64-NEXT:    vblendps {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
+; AVX-64-NEXT:    vmovss {{.*#+}} xmm3 = xmm2[0],xmm1[1,2,3]
 ; AVX-64-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
 ; AVX-64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 545c57fed4b2c..9d856ed7647ca 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -42,7 +42,7 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; AVX-LABEL: extract0_i32_zext_insert0_i64_zero:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %z = zext i32 %e to i64
@@ -85,7 +85,7 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 1
   %z = zext i32 %e to i64
@@ -130,7 +130,7 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
diff --git a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
index 556b0deaf4c83..8b3aa2964db02 100644
--- a/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
+++ b/llvm/test/CodeGen/X86/canonicalize-vars-f16-type.ll
@@ -51,7 +51,7 @@ define void @v_test_canonicalize__half(half addrspace(1)* %out) nounwind {
 ; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
@@ -149,7 +149,7 @@ define half @complex_canonicalize_fmul_half(half %a, half %b) nounwind {
 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
@@ -235,12 +235,12 @@ define void @v_test_canonicalize_v2half(<2 x half> addrspace(1)* %out) nounwind
 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm2
 ; AVX512-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
 ; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX512-NEXT:    vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
index b42fd957d7f4f..086df87d1d5ff 100644
--- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -44,12 +44,12 @@ define <4 x float> @insert_f32(float %a0, <4 x float> %a1) {
 ;
 ; AVX-LABEL: insert_f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: insert_f32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    retq
  %1 = insertelement <4 x float> %a1, float %a0, i32 0
  ret <4 x float> %1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index e5594dc9c5e3c..173457ff46677 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -37,7 +37,7 @@ define <4 x i32> @test1(<4 x i32> %A) {
 ; AVX-LABEL: test1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0>
   ret <4 x i32> %1
@@ -195,7 +195,7 @@ define <4 x i32> @test11(<4 x i32> %A) {
 ; AVX-LABEL: test11:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1>
   ret <4 x i32> %1
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 54390d8b66f7d..2b5f09113ca68 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -86,10 +86,20 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test4:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test4:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test4:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test4:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
   %or = or <4 x i32> %shuf1, %shuf2
@@ -108,10 +118,20 @@ define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test5:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test5:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test5:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test5:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
   %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
   %or = or <4 x i32> %shuf1, %shuf2
@@ -241,10 +261,20 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test11:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test11:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test11:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test11:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
   %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
   %or = or <4 x i32> %and1, %and2
@@ -263,10 +293,20 @@ define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test12:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test12:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test12:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test12:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX512-NEXT:    retq
   %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
   %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
   %or = or <4 x i32> %and1, %and2
@@ -395,18 +435,18 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 ; AVX1-LABEL: test18:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test18:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; AVX2-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT:    vorps %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index 33bc93d0fe4db..95d350d45d901 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -1343,7 +1343,7 @@ define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0]
 ; AVX-NEXT:    vminpd %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX10_2-LABEL: test_fminimumnum_vector_nan:
diff --git a/llvm/test/CodeGen/X86/fmsubadd-combine.ll b/llvm/test/CodeGen/X86/fmsubadd-combine.ll
index ddf51b858cdd8..674a1d5ad779b 100644
--- a/llvm/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/llvm/test/CodeGen/X86/fmsubadd-combine.ll
@@ -12,7 +12,7 @@ define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x dou
 ; NOFMA-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; NOFMA-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
 ; NOFMA-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
-; NOFMA-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; NOFMA-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; NOFMA-NEXT:    retq
 ;
 ; FMA3-LABEL: mul_subadd_pd128:
@@ -191,7 +191,7 @@ define <2 x double> @mul_subadd_bad_commute(<2 x double> %A, <2 x double> %B, <2
 ; CHECK-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vsubpd %xmm0, %xmm2, %xmm1
 ; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    retq
 entry:
   %AB = fmul <2 x double> %A, %B
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index fbc798d8bbe48..b013ddad19a95 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -44,7 +44,7 @@ define half @fadd_f16(half %a, half %b) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -92,7 +92,7 @@ define half @fsub_f16(half %a, half %b) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -140,7 +140,7 @@ define half @fmul_f16(half %a, half %b) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -188,7 +188,7 @@ define half @fdiv_f16(half %a, half %b) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -400,7 +400,7 @@ define void @fsqrt_f16(ptr %a) nounwind strictfp {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; AVX-NEXT:    retq
@@ -469,7 +469,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    callq fmaf@PLT
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    popq %rax
 ; F16C-NEXT:    retq
@@ -490,7 +490,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
 ; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
 ; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
index c31bee5ff1030..6312a26db9bf4 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
@@ -35,7 +35,7 @@ define half @sitofp_i1tof16(i1 %x) #0 {
 ; AVX-NEXT:    movsbl %dil, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -76,7 +76,7 @@ define half @sitofp_i8tof16(i8 %x) #0 {
 ; AVX-NEXT:    movsbl %dil, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -112,7 +112,7 @@ define half @sitofp_i16tof16(i16 %x) #0 {
 ; AVX-NEXT:    movswl %di, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -146,7 +146,7 @@ define half @sitofp_i32tof16(i32 %x) #0 {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -178,7 +178,7 @@ define half @sitofp_i64tof16(i64 %x) #0 {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvtsi2ss %rdi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -213,7 +213,7 @@ define half @uitofp_i1tof16(i1 %x) #0 {
 ; AVX-NEXT:    andl $1, %edi
 ; AVX-NEXT:    vcvtsi2ss %edi, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -251,7 +251,7 @@ define half @uitofp_i8tof16(i8 %x) #0 {
 ; AVX-NEXT:    movzbl %dil, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -287,7 +287,7 @@ define half @uitofp_i16tof16(i16 %x) #0 {
 ; AVX-NEXT:    movzwl %di, %eax
 ; AVX-NEXT:    vcvtsi2ss %eax, %xmm15, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -323,7 +323,7 @@ define half @uitofp_i32tof16(i32 %x) #0 {
 ; F16C-NEXT:    movl %edi, %eax
 ; F16C-NEXT:    vcvtsi2ss %rax, %xmm15, %xmm0
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
@@ -331,7 +331,7 @@ define half @uitofp_i32tof16(i32 %x) #0 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcvtusi2ss %edi, %xmm15, %xmm0
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
@@ -387,7 +387,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
 ; F16C-NEXT:    vaddss %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:  .LBB9_2:
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; F16C-NEXT:    retq
 ;
@@ -395,7 +395,7 @@ define half @uitofp_i64tof16(i64 %x) #0 {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vcvtusi2ss %rdi, %xmm15, %xmm0
 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
index c834ddbf46f7b..85a43394a1dc8 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
@@ -31,7 +31,7 @@ define half @fceil32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -67,7 +67,7 @@ define half @ffloor32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -103,7 +103,7 @@ define half @ftrunc32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -139,7 +139,7 @@ define half @frint32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $4, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -176,7 +176,7 @@ define half @fnearbyint32(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $12, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -213,7 +213,7 @@ define half @froundeven16(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
@@ -251,7 +251,7 @@ define half @fround16(half %f) #0 {
 ; AVX-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX-NEXT:    callq roundf@PLT
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; AVX-NEXT:    popq %rax
 ; AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll
index f1874cc03000a..d5f2060ca20e3 100644
--- a/llvm/test/CodeGen/X86/half-constrained.ll
+++ b/llvm/test/CodeGen/X86/half-constrained.ll
@@ -194,7 +194,7 @@ define void @float_to_half(float %0) strictfp {
 ; X64-F16C-LABEL: float_to_half:
 ; X64-F16C:       # %bb.0:
 ; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; X64-F16C-NEXT:    movq a@GOTPCREL(%rip), %rax
 ; X64-F16C-NEXT:    vpextrw $0, %xmm0, (%rax)
@@ -350,7 +350,7 @@ define void @add() strictfp {
 ; X86-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; X86-F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; X86-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; X86-F16C-NEXT:    vpextrw $0, %xmm0, c
 ; X86-F16C-NEXT:    retl
@@ -387,7 +387,7 @@ define void @add() strictfp {
 ; X64-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
 ; X64-F16C-NEXT:    vaddss %xmm1, %xmm0, %xmm0
 ; X64-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; X64-F16C-NEXT:    movq c@GOTPCREL(%rip), %rax
 ; X64-F16C-NEXT:    vpextrw $0, %xmm0, (%rax)
diff --git a/llvm/test/CodeGen/X86/half-darwin.ll b/llvm/test/CodeGen/X86/half-darwin.ll
index 3cbf5c11235ea..8765f7dbe6d34 100644
--- a/llvm/test/CodeGen/X86/half-darwin.ll
+++ b/llvm/test/CodeGen/X86/half-darwin.ll
@@ -105,7 +105,7 @@ define void @strict_truncsfhf(float %in, ptr %ptr) nounwind strictfp {
 ; CHECK-F16C-LABEL: strict_truncsfhf:
 ; CHECK-F16C:       ## %bb.0:
 ; CHECK-F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-F16C-NEXT:    vpextrw $0, %xmm0, (%rdi)
 ; CHECK-F16C-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 952940d141808..31551360be483 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -508,8 +508,8 @@ define <8 x float> @PR41512_v8f32(float %x, float %y) {
 ; AVX-LABEL: PR41512_v8f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
   %ins1 = insertelement <8 x float> zeroinitializer, float %x, i32 0
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 4c5b67962a58b..b7fe8e053fa15 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1120,7 +1120,7 @@ define <2 x float> @expandload_v2f32_v2i1(ptr %base, <2 x float> %src0, <2 x i32
 ; AVX1OR2-NEXT:    retq
 ; AVX1OR2-NEXT:  LBB4_1: ## %cond.load
 ; AVX1OR2-NEXT:    vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1OR2-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX1OR2-NEXT:    addq $4, %rdi
 ; AVX1OR2-NEXT:    testb $2, %al
 ; AVX1OR2-NEXT:    je LBB4_4
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 559a7ec0930b9..324a371632c45 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -65,7 +65,7 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, <
 ; AVX1-NEXT:  # %bb.1: # %cond.load
 ; AVX1-NEXT:    vmovq %xmm0, %rcx
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT:  .LBB0_2: # %else
 ; AVX1-NEXT:    testb $2, %al
 ; AVX1-NEXT:    je .LBB0_4
@@ -105,7 +105,7 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x ptr> %ptr, <4 x i32> %trigger, <
 ; AVX2-NEXT:  # %bb.1: # %cond.load
 ; AVX2-NEXT:    vmovq %xmm0, %rcx
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT:  .LBB0_2: # %else
 ; AVX2-NEXT:    testb $2, %al
 ; AVX2-NEXT:    je .LBB0_4
@@ -254,7 +254,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> %idx, <4 x i32
 ; AVX1-NEXT:  # %bb.1: # %cond.load
 ; AVX1-NEXT:    vmovq %xmm0, %rcx
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT:  .LBB1_2: # %else
 ; AVX1-NEXT:    testb $2, %al
 ; AVX1-NEXT:    je .LBB1_4
@@ -299,7 +299,7 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(ptr %base, <4 x i32> %idx, <4 x i32
 ; AVX2-NEXT:  # %bb.1: # %cond.load
 ; AVX2-NEXT:    vmovq %xmm0, %rcx
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT:  .LBB1_2: # %else
 ; AVX2-NEXT:    testb $2, %al
 ; AVX2-NEXT:    je .LBB1_4
@@ -451,7 +451,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32
 ; AVX1-NEXT:  # %bb.1: # %cond.load
 ; AVX1-NEXT:    vmovq %xmm0, %rcx
 ; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX1-NEXT:  .LBB2_2: # %else
 ; AVX1-NEXT:    testb $2, %al
 ; AVX1-NEXT:    je .LBB2_4
@@ -495,7 +495,7 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(ptr %base, <4 x i64> %idx, <4 x i32
 ; AVX2-NEXT:  # %bb.1: # %cond.load
 ; AVX2-NEXT:    vmovq %xmm0, %rcx
 ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vblendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; AVX2-NEXT:    vmovss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; AVX2-NEXT:  .LBB2_2: # %else
 ; AVX2-NEXT:    testb $2, %al
 ; AVX2-NEXT:    je .LBB2_4
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index af018d83d520e..4e6f666fa05de 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -3481,7 +3481,7 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
 ; X86-SKX-NEXT:  .LBB47_1: # %cond.load
 ; X86-SKX-NEXT:    vmovd %xmm0, %ecx
 ; X86-SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SKX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; X86-SKX-NEXT:    vmovss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; X86-SKX-NEXT:    testb $2, %al
 ; X86-SKX-NEXT:    je .LBB47_4
 ; X86-SKX-NEXT:  .LBB47_3: # %cond.load1
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 89459a2d10177..e2e26da95b874 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6424,7 +6424,7 @@ define <4 x i32> @mload_constmask_v4i32(ptr %addr, <4 x i32> %dst) {
 ; AVX1:       ## %bb.0:
 ; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
 ; AVX1-NEXT:    vmaskmovps (%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: mload_constmask_v4i32:
diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll
index 164bf203d0545..304d071e5d6e5 100644
--- a/llvm/test/CodeGen/X86/pr40730.ll
+++ b/llvm/test/CodeGen/X86/pr40730.ll
@@ -5,7 +5,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: shuffle_v8i32_0dcd3f14:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
@@ -26,7 +26,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0)  {
 ; CHECK-LABEL: shuffle_v8i32_0dcd3f14_constant:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0]
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7]
diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll
index ea9b69f8f5b80..ae24d3487c4b1 100644
--- a/llvm/test/CodeGen/X86/scalarize-fp.ll
+++ b/llvm/test/CodeGen/X86/scalarize-fp.ll
@@ -911,7 +911,7 @@ define <4 x float> @merge_fcmp_cmpeqss_v4f32(<4 x float> %x, <4 x float> %y) {
 ; AVX1-LABEL: merge_fcmp_cmpeqss_v4f32:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vcmpeqss %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: merge_fcmp_cmpeqss_v4f32:
diff --git a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
index f0af8fc29969b..5ae9055835716 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
@@ -22,7 +22,7 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, ptr %s.addr) {
 ; AVX-LABEL: insert_f32_firstelt:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %s = load float, ptr %s.addr
   %i0 = insertelement <4 x float> %x, float %s, i32 0
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index 34de7e65465d1..1e4fe81abc136 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -19,7 +19,7 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
 ;
 ; AVX-LABEL: insert_f32_firstelt:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %i0 = insertelement <4 x float> %x, float %s, i32 0
   ret <4 x float> %i0
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index b1f9872f7b6eb..2e2e78a6da51e 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -190,7 +190,7 @@ define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; AVX-LABEL: test_mm_cmpge_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
@@ -232,7 +232,7 @@ define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; AVX-LABEL: test_mm_cmpgt_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
@@ -382,7 +382,7 @@ define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; AVX-LABEL: test_mm_cmpnge_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
@@ -424,7 +424,7 @@ define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
 ; AVX-LABEL: test_mm_cmpngt_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
@@ -1603,7 +1603,7 @@ define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; AVX-LABEL: test_mm_move_ss:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
@@ -2219,7 +2219,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X86-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X86-AVX1-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X86-AVX1-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2228,7 +2228,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X86-AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X86-AVX512-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X86-AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
 ;
@@ -2243,7 +2243,7 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X64-AVX-LABEL: test_mm_set_ss:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
-; X64-AVX-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
+; X64-AVX-NEXT:    vmovss %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf2,0x10,0xc0]
 ; X64-AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
   %res0  = insertelement <4 x float> undef, float %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 47d35f3636d46..006c3006350cc 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1165,7 +1165,7 @@ define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
 ; AVX-LABEL: insert_test5_sub_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}}
   %1 = fsub <4 x float> %b, %a
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1203,7 +1203,7 @@ define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
 ; AVX-LABEL: insert_test5_div_ss:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}}
   %1 = fdiv <4 x float> %b, %a
   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1241,7 +1241,7 @@ define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
 ; AVX-LABEL: insert_test5_sub_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}}
   %1 = fsub <2 x double> %b, %a
   %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
@@ -1279,7 +1279,7 @@ define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
 ; AVX-LABEL: insert_test5_div_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}}
   %1 = fdiv <2 x double> %b, %a
   %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
@@ -1318,7 +1318,7 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 ; X86-AVX1-NEXT:  # %bb.1:
 ; X86-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
 ; X86-AVX1-NEXT:  .LBB70_2:
-; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X86-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: add_ss_mask:
@@ -1360,7 +1360,7 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 ; X64-AVX1-NEXT:  # %bb.1:
 ; X64-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
 ; X64-AVX1-NEXT:  .LBB70_2:
-; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X64-AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX512-LABEL: add_ss_mask:
@@ -1412,7 +1412,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X86-AVX1-NEXT:  # %bb.1:
 ; X86-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
 ; X86-AVX1-NEXT:  .LBB71_2:
-; X86-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X86-AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X86-AVX1-NEXT:    retl
 ;
 ; X86-AVX512-LABEL: add_sd_mask:
@@ -1454,7 +1454,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X64-AVX1-NEXT:  # %bb.1:
 ; X64-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
 ; X64-AVX1-NEXT:  .LBB71_2:
-; X64-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX512-LABEL: add_sd_mask:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index dbdc45abb24d6..18a6be8aaf0b6 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -631,7 +631,7 @@ define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwi
 ; AVX-LABEL: test_mm_cmpge_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x02]
-; AVX-NEXT:    vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 2)
@@ -748,7 +748,7 @@ define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwi
 ; AVX-LABEL: test_mm_cmpgt_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x01]
-; AVX-NEXT:    vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 1)
@@ -976,7 +976,7 @@ define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounw
 ; AVX-LABEL: test_mm_cmpnge_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpnlesd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x06]
-; AVX-NEXT:    vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 6)
@@ -1021,7 +1021,7 @@ define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounw
 ; AVX-LABEL: test_mm_cmpngt_sd:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcmpnltsd %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf3,0xc2,0xc8,0x05]
-; AVX-NEXT:    vblendpd $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0d,0xc1,0x01]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %cmp = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a1, <2 x double> %a0, i8 5)
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index 3e5d76eae0bb3..e1d91b407fc28 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -670,7 +670,7 @@ define <4 x i32> @PR19721(<4 x i32> %i) {
 ; AVX-LABEL: PR19721:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}}
 ;
 ; X64-SSE-LABEL: PR19721:
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 2d7258a49f5d0..53a10ab0c26ff 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -353,7 +353,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX1-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX1-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -361,7 +361,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X86-AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -373,7 +373,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ;
 ; X64-AVX-LABEL: blendps_not_insertps_1:
 ; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; X64-AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; X64-AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
@@ -440,7 +440,7 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou
 ;
 ; AVX-LABEL: blendps_not_insertps_2:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
+; AVX-NEXT:    vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; AVX-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %tmp2 = extractelement <4 x float> %t2, i32 0
@@ -1207,7 +1207,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; AVX1-LABEL: i32_shuf_X00A:
 ; AVX1:       ## %bb.0:
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX1-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
+; AVX1-NEXT:    vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0]
 ; AVX1-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vshufps $0, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0x00]
 ; AVX1-NEXT:    ## xmm1 = xmm1[0,0,0,0]
@@ -1218,7 +1218,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; AVX512-LABEL: i32_shuf_X00A:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
-; AVX512-NEXT:    vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
+; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm0 ## encoding: [0xc5,0xea,0x10,0xc0]
 ; AVX512-NEXT:    ## xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
 ; AVX512-NEXT:    vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
index d7404c9e7c7da..665a84a26fea0 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1216,7 +1216,7 @@ define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) {
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
@@ -1307,7 +1307,7 @@ define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    popq %rbx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    popq %r12
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index 2c3d7ceb37d03..a6e288608c87b 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -154,7 +154,7 @@ define <4 x float> @f18(<4 x float> %a0, <8 x half> %a1) #0 {
 ; CHECK-LABEL: f18:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvtsh2ss %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <8 x half> %a1, i32 0
   %cvt = call float @llvm.experimental.constrained.fpext.f32.f16(half %ext,
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
index 0126685f2bb32..bde14e75dfc04 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll
@@ -32,7 +32,7 @@ define <2 x i64> @strict_vector_fptosi_v2f16_to_v2i64(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2qq %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
@@ -54,7 +54,7 @@ define <2 x i64> @strict_vector_fptoui_v2f16_to_v2i64(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2uqq %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
@@ -76,14 +76,14 @@ define <2 x i32> @strict_vector_fptosi_v2f16_to_v2i32(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2dq %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i32:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; NOVL-NEXT:    vcvttph2dq %ymm0, %zmm0
@@ -99,14 +99,14 @@ define <2 x i32> @strict_vector_fptoui_v2f16_to_v2i32(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2udq %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i32:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; NOVL-NEXT:    vcvttph2udq %ymm0, %zmm0
@@ -122,14 +122,14 @@ define <2 x i16> @strict_vector_fptosi_v2f16_to_v2i16(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2w %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i16:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
@@ -145,14 +145,14 @@ define <2 x i16> @strict_vector_fptoui_v2f16_to_v2i16(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2uw %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
 ;
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i16:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
@@ -168,7 +168,7 @@ define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2w %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovwb %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
@@ -176,7 +176,7 @@ define <2 x i8> @strict_vector_fptosi_v2f16_to_v2i8(<2 x half> %a) #0 {
 ; NOVL-LABEL: strict_vector_fptosi_v2f16_to_v2i8:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2w %zmm0, %zmm0
@@ -192,7 +192,7 @@ define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2uw %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovwb %xmm0, %xmm0
 ; CHECK-NEXT:    ret{{[l|q]}}
@@ -200,7 +200,7 @@ define <2 x i8> @strict_vector_fptoui_v2f16_to_v2i8(<2 x half> %a) #0 {
 ; NOVL-LABEL: strict_vector_fptoui_v2f16_to_v2i8:
 ; NOVL:       # %bb.0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; NOVL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
 ; NOVL-NEXT:    vcvttph2uw %zmm0, %zmm0
@@ -216,7 +216,7 @@ define <2 x i1> @strict_vector_fptosi_v2f16_to_v2i1(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptosi_v2f16_to_v2i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2w %xmm0, %xmm0
 ; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovw2m %xmm0, %k1
@@ -247,7 +247,7 @@ define <2 x i1> @strict_vector_fptoui_v2f16_to_v2i1(<2 x half> %a) #0 {
 ; CHECK-LABEL: strict_vector_fptoui_v2f16_to_v2i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    vcvttph2uw %xmm0, %xmm0
 ; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovw2m %xmm0, %k1
diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll
index 341a703a21bd5..4b70933334fb7 100644
--- a/llvm/test/CodeGen/X86/vec_extract-avx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll
@@ -119,7 +119,7 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -128,7 +128,7 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
@@ -169,7 +169,7 @@ define void @legal_vzmovl_2f32_8f32(ptr %in, ptr %out) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86-NEXT:    vmovaps %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -178,7 +178,7 @@ define void @legal_vzmovl_2f32_8f32(ptr %in, ptr %out) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT:    vmovaps %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index abb85ac83464c..0538cac12cbf7 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -828,13 +828,13 @@ define <4 x float> @floor_ss(<4 x float> %x, <4 x float> %y) nounwind {
 ; AVX-LABEL: floor_ss:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_ss:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    retq
   %s = extractelement <4 x float> %x, i32 0
   %call = call float @llvm.floor.f32(float %s)
@@ -853,13 +853,13 @@ define <2 x double> @floor_sd(<2 x double> %x, <2 x double> %y) nounwind {
 ; AVX-LABEL: floor_sd:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_sd:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX512-NEXT:    retq
   %s = extractelement <2 x double> %x, i32 0
   %call = call double @llvm.floor.f64(double %s)
@@ -1372,7 +1372,7 @@ define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB52_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_ss:
@@ -1414,7 +1414,7 @@ define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwi
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB53_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_ss:
@@ -1452,7 +1452,7 @@ define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB54_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_sd:
@@ -1494,7 +1494,7 @@ define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nou
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB55_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_sd:
@@ -1532,7 +1532,7 @@ define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x flo
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB56_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_ss_trunc:
@@ -1572,11 +1572,11 @@ define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
 ; AVX-NEXT:    jne LBB57_1
 ; AVX-NEXT:  ## %bb.2:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB57_1:
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_ss_trunc:
@@ -1613,7 +1613,7 @@ define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB58_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_sd_trunc:
@@ -1657,7 +1657,7 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB59_1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_sd_trunc:
@@ -1689,7 +1689,7 @@ define <4 x float> @floor_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x flo
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm3
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_ss_mask8:
@@ -1723,7 +1723,7 @@ define <4 x float> @floor_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwin
 ; AVX-NEXT:    vroundss $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_ss_mask8:
@@ -1756,7 +1756,7 @@ define <2 x double> @floor_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm3
 ; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_mask_sd_mask8:
@@ -1790,7 +1790,7 @@ define <2 x double> @floor_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) noun
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: floor_maskz_sd_mask8:
@@ -1818,13 +1818,13 @@ define <4 x float> @ceil_ss(<4 x float> %x, <4 x float> %y) nounwind {
 ; AVX-LABEL: ceil_ss:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_ss:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    retq
   %s = extractelement <4 x float> %x, i32 0
   %call = call float @llvm.ceil.f32(float %s)
@@ -1843,13 +1843,13 @@ define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind {
 ; AVX-LABEL: ceil_sd:
 ; AVX:       ## %bb.0:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_sd:
 ; AVX512:       ## %bb.0:
 ; AVX512-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX512-NEXT:    retq
   %s = extractelement <2 x double> %x, i32 0
   %call = call double @llvm.ceil.f64(double %s)
@@ -2362,7 +2362,7 @@ define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w,
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB78_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_ss:
@@ -2404,7 +2404,7 @@ define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwin
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB79_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_ss:
@@ -2442,7 +2442,7 @@ define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double>
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB80_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_sd:
@@ -2484,7 +2484,7 @@ define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) noun
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB81_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_sd:
@@ -2522,7 +2522,7 @@ define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x floa
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB82_2:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_ss_trunc:
@@ -2562,11 +2562,11 @@ define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
 ; AVX-NEXT:    jne LBB83_1
 ; AVX-NEXT:  ## %bb.2:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB83_1:
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_ss_trunc:
@@ -2603,7 +2603,7 @@ define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x d
 ; AVX-NEXT:  ## %bb.1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:  LBB84_2:
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_sd_trunc:
@@ -2647,7 +2647,7 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB85_1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_sd_trunc:
@@ -2679,7 +2679,7 @@ define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x floa
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm3
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvps %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_ss_mask8:
@@ -2713,7 +2713,7 @@ define <4 x float> @ceil_maskz_ss_mask8(<4 x float> %x, <4 x float> %y) nounwind
 ; AVX-NEXT:    vroundss $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_ss_mask8:
@@ -2746,7 +2746,7 @@ define <2 x double> @ceil_mask_sd_mask8(<2 x double> %x, <2 x double> %y, <2 x d
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm3
 ; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vblendvpd %xmm0, %xmm3, %xmm2, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_mask_sd_mask8:
@@ -2780,7 +2780,7 @@ define <2 x double> @ceil_maskz_sd_mask8(<2 x double> %x, <2 x double> %y) nounw
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm2
 ; AVX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: ceil_maskz_sd_mask8:
diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
index e4304f2cc214a..e73d345d0fcd4 100644
--- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -38,7 +38,7 @@ define i16 @test1(float %f) nounwind {
 ; X86_AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86_AVX1-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; X86_AVX1-NEXT:    vcvttss2si %xmm0, %eax
@@ -50,7 +50,7 @@ define i16 @test1(float %f) nounwind {
 ; X64_AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64_AVX1-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
 ; X64_AVX1-NEXT:    vcvttss2si %xmm0, %eax
@@ -63,7 +63,7 @@ define i16 @test1(float %f) nounwind {
 ; X86_AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86_AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X86_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X86_AVX512-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
 ; X86_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X86_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
@@ -76,7 +76,7 @@ define i16 @test1(float %f) nounwind {
 ; X64_AVX512-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64_AVX512-NEXT:    vminss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index bd5c9363794aa..a38028e87532f 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -54,7 +54,7 @@ define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
 ;
 ; AVX-LABEL: vsel_float2:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 entry:
   %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index 1bbf92e45fc6c..01159d4135d8e 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -5034,7 +5034,7 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
 ; F16C-LABEL: fptoui_2f16_to_4i32:
 ; F16C:       # %bb.0:
 ; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; F16C-NEXT:    vcvttps2dq %xmm0, %xmm1
 ; F16C-NEXT:    vpsrad $31, %xmm1, %xmm2
@@ -5048,7 +5048,7 @@ define <4 x i32> @fptoui_2f16_to_4i32(<2 x half> %a) nounwind {
 ; AVX512F-LABEL: fptoui_2f16_to_4i32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vcvtph2ps %xmm0, %xmm0
 ; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index b1986e7af3ec5..d83f969dd0339 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -51,7 +51,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm4[1,1],xmm0[1,1]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
@@ -452,7 +452,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm5 = [7,3,7,3,7,3,7,3]
 ; AVX2-FCP-NEXT:    vpermps %ymm2, %ymm5, %ymm2
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX2-FCP-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX2-FCP-NEXT:    vmovaps %ymm4, 32(%r9)
 ; AVX2-FCP-NEXT:    vmovaps %ymm0, (%r9)
 ; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm3[3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 769f0ec47db01..bfd1e3ece2009 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -651,7 +651,7 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FCP-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [7,3,7,3,7,3,7,3]
 ; AVX2-FCP-NEXT:    vpermps %ymm6, %ymm3, %ymm3
 ; AVX2-FCP-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX2-FCP-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
+; AVX2-FCP-NEXT:    vmovss {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
 ; AVX2-FCP-NEXT:    vmovaps %ymm2, (%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm4, 64(%rax)
 ; AVX2-FCP-NEXT:    vmovaps %ymm7, 32(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 1d389f9817229..8679c262e0bf0 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -877,7 +877,7 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
 ; AVX-LABEL: shuffle_v2i64_bitcast_z123:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %bitcast32 = bitcast <2 x i64> %x to <4 x float>
   %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> <float 1.000000e+00, float poison, float poison, float poison>, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 0eb72c8bc0be4..e1eb1a6704e39 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -506,7 +506,7 @@ define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
 ; AVX1OR2-LABEL: shuffle_v4i32_4012:
 ; AVX1OR2:       # %bb.0:
 ; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,1,2]
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1OR2-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i32_4012:
@@ -618,7 +618,7 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
 ; AVX-LABEL: shuffle_v4f32_4zzz:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle
@@ -1164,7 +1164,7 @@ define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
 ; AVX-LABEL: shuffle_v4i32_4zzz:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x i32> %shuffle
@@ -1202,14 +1202,14 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
 ; AVX1-LABEL: shuffle_v4i32_z4zz:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
 ; AVX2-SLOW:       # %bb.0:
 ; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-SLOW-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -1258,14 +1258,14 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
 ; AVX1-LABEL: shuffle_v4i32_zz4z:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
 ; AVX2-SLOW:       # %bb.0:
 ; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-SLOW-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -2138,7 +2138,7 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
 ; AVX-LABEL: insert_reg_and_zero_v4f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %v = insertelement <4 x float> poison, float %a, i32 0
   %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index d848a8b879215..94fc982d87e50 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2376,33 +2376,21 @@ define <4 x i64> @shuffle_v4i64_0zzz_pgso(<4 x i64> %a) !prof !14 {
 }
 
 define <8 x float> @shuffle_v8f32_0zzzzzzz_pgso(<8 x float> %a) !prof !14 {
-; AVX1OR2-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8f32_0zzzzzzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT:    retq
   %b = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <8 x float> %b
 }
 
 define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 {
-; AVX1OR2-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: shuffle_v8i32_0zzzzzzz_pgso:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; ALL-NEXT:    retq
   %b = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %b
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index be3258765d87e..950683cbfaeea 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2132,7 +2132,7 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_08991abb:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
@@ -3402,7 +3402,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_0dcd3f14:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
+; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 38920aa5d7a12..f4f4842e4c69d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -435,7 +435,7 @@ define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
 ; CHECK-LABEL: combine_pshufb_as_vzmovl_32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = bitcast <8 x float> %a0 to <32 x i8>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 12d494c32b656..0570e2f580c1b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -75,7 +75,7 @@ define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; AVX-LABEL: combine_pshufb_as_movss:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
   %2 = bitcast <4 x float> %1 to <16 x i8>
@@ -137,7 +137,7 @@ define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
 ; AVX-LABEL: combine_pshufb_as_vzmovl_32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %1 = bitcast <4 x float> %a0 to <16 x i8>
   %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index f53c7a3370174..e8bf5ec2b49a6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -106,7 +106,7 @@ define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x flo
 ; CHECK-LABEL: combine_vpermil2ps_blend_with_zero:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 8, i32 1, i32 2, i32 3>, i8 2)
   ret <4 x float> %res0
diff --git a/llvm/test/CodeGen/X86/vector-zmov.ll b/llvm/test/CodeGen/X86/vector-zmov.ll
index 09835d797d172..2f84723b3c081 100644
--- a/llvm/test/CodeGen/X86/vector-zmov.ll
+++ b/llvm/test/CodeGen/X86/vector-zmov.ll
@@ -63,7 +63,7 @@ define <4 x i32> @load_zmov_4i32_to_0zzz_volatile(ptr%ptr) {
 ; AVX:       # %bb.0: # %entry
 ; AVX-NEXT:    vmovaps (%rdi), %xmm0
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
 entry:
   %X = load volatile <4 x i32>, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index be6ee8f689958..9851fe64847de 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -313,7 +313,7 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: test18:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -332,7 +332,7 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; AVX-LABEL: test19:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1
@@ -390,7 +390,7 @@ define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: test22:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -410,7 +410,7 @@ define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
 ;
 ; AVX-LABEL: test23:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
   ret <4 x i32> %1

From 53e50472ff445bb946a53aba30649ae65f3534b1 Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Thu, 12 Jun 2025 10:09:02 -0700
Subject: [PATCH 271/851] [Clang][NFC] Move FatbinFileName instead of copy
 (#143827)

Static analysis flagged FatbinFileName since we can move it instead of
copying it.
---
 clang/lib/Interpreter/DeviceOffload.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Interpreter/DeviceOffload.cpp b/clang/lib/Interpreter/DeviceOffload.cpp
index 05625ddedb72f..9a25a264b2d5c 100644
--- a/clang/lib/Interpreter/DeviceOffload.cpp
+++ b/clang/lib/Interpreter/DeviceOffload.cpp
@@ -151,7 +151,7 @@ llvm::Error IncrementalCUDADeviceParser::GenerateFatbinary() {
                    llvm::StringRef(FatbinContent.data(), FatbinContent.size()),
                    "", false));
 
-  CodeGenOpts.CudaGpuBinaryFileName = FatbinFileName;
+  CodeGenOpts.CudaGpuBinaryFileName = std::move(FatbinFileName);
 
   FatbinContent.clear();
 

From 82f19674bff578b9afd164144fd6b75d042ac932 Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Thu, 12 Jun 2025 13:11:42 -0400
Subject: [PATCH 272/851] [libc] Update size_t and ssize_t definitions to use
 __SIZE_TYPE__ and __PTRDIFF_TYPE__ respectively. (#143921)

The current definition of `ssize_t` does not have the same bit width as
`size_t` on 32-bit platforms.
---
 libc/include/llvm-libc-types/size_t.h  | 7 +------
 libc/include/llvm-libc-types/ssize_t.h | 2 +-
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/libc/include/llvm-libc-types/size_t.h b/libc/include/llvm-libc-types/size_t.h
index 3b31b0820f237..26ae68abe0ee7 100644
--- a/libc/include/llvm-libc-types/size_t.h
+++ b/libc/include/llvm-libc-types/size_t.h
@@ -9,11 +9,6 @@
 #ifndef LLVM_LIBC_TYPES_SIZE_T_H
 #define LLVM_LIBC_TYPES_SIZE_T_H
 
-// Since __need_size_t is defined, we get the definition of size_t from the
-// standalone C header stddef.h. Also, because __need_size_t is defined,
-// including stddef.h will pull only the type size_t and nothing else.
-#define __need_size_t
-#include <stddef.h>
-#undef __need_size_t
+typedef __SIZE_TYPE__ size_t;
 
 #endif // LLVM_LIBC_TYPES_SIZE_T_H
diff --git a/libc/include/llvm-libc-types/ssize_t.h b/libc/include/llvm-libc-types/ssize_t.h
index 41e4b6d2c500a..8f579e2749bac 100644
--- a/libc/include/llvm-libc-types/ssize_t.h
+++ b/libc/include/llvm-libc-types/ssize_t.h
@@ -9,6 +9,6 @@
 #ifndef LLVM_LIBC_TYPES_SSIZE_T_H
 #define LLVM_LIBC_TYPES_SSIZE_T_H
 
-typedef __INT64_TYPE__ ssize_t;
+typedef __PTRDIFF_TYPE__ ssize_t;
 
 #endif // LLVM_LIBC_TYPES_SSIZE_T_H

From cbc2ef0e890e6c700023fe00c7166554f2f5ad14 Mon Sep 17 00:00:00 2001
From: Dave Lee <davelee.com@gmail.com>
Date: Thu, 12 Jun 2025 10:13:30 -0700
Subject: [PATCH 273/851] [llvm][utils] Add synthetic provider for
 llvm::DenseSet (#143631)

Add a synthetic child provider for `DenseSet`, which is a wrapper around
`DenseMap`. This provider leverages the existing `DenseMap` provider,
reshaping its dictionary structured children into a set.
---
 llvm/utils/lldbDataFormatters.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/llvm/utils/lldbDataFormatters.py b/llvm/utils/lldbDataFormatters.py
index 988827ab4aa50..c5cd627c53149 100644
--- a/llvm/utils/lldbDataFormatters.py
+++ b/llvm/utils/lldbDataFormatters.py
@@ -3,6 +3,7 @@
 
 Load into LLDB with 'command script import /path/to/lldbDataFormatters.py'
 """
+
 from __future__ import annotations
 
 import collections
@@ -82,6 +83,11 @@ def __lldb_init_module(debugger, internal_dict):
         f"-l {__name__}.DenseMapSynthetic "
         '-x "^llvm::DenseMap<.+>$"'
     )
+    debugger.HandleCommand(
+        "type synthetic add -w llvm "
+        f"-l {__name__}.DenseSetSynthetic "
+        '-x "^llvm::DenseSet<.+>$"'
+    )
 
     debugger.HandleCommand(
         "type synthetic add -w llvm "
@@ -372,7 +378,8 @@ def update(self):
         # For each key, collect a list of buckets it appears in.
         key_buckets: dict[str, list[int]] = collections.defaultdict(list)
         for index in range(num_buckets):
-            key = buckets.GetValueForExpressionPath(f"[{index}].first")
+            bucket = buckets.GetValueForExpressionPath(f"[{index}]")
+            key = bucket.GetChildAtIndex(0)
             key_buckets[str(key.data)].append(index)
 
         # Heuristic: This is not a multi-map, any repeated (non-unique) keys are
@@ -383,6 +390,26 @@ def update(self):
                 self.child_buckets.append(indexes[0])
 
 
+class DenseSetSynthetic:
+    valobj: lldb.SBValue
+    map: lldb.SBValue
+
+    def __init__(self, valobj: lldb.SBValue, _) -> None:
+        self.valobj = valobj
+
+    def num_children(self) -> int:
+        return self.map.num_children
+
+    def get_child_at_index(self, idx: int) -> lldb.SBValue:
+        map_entry = self.map.child[idx]
+        set_entry = map_entry.GetChildAtIndex(0)
+        return set_entry.Clone(f"[{idx}]")
+
+    def update(self):
+        raw_map = self.valobj.GetChildMemberWithName("TheMap")
+        self.map = raw_map.GetSyntheticValue()
+
+
 class ExpectedSynthetic:
     # The llvm::Expected<T> value.
     expected: lldb.SBValue

From eab1a1d4914a51de8383b818bf595125fb830c51 Mon Sep 17 00:00:00 2001
From: halbi2 <hehiralbi@gmail.com>
Date: Thu, 12 Jun 2025 13:15:41 -0400
Subject: [PATCH 274/851] [libc++][test] Improve test coverage for flat_set
 (lack of) SCARY iterators (#139649)

Missing from 5e94e26a7afb8db00cc123e5fc5471c1125596e3.
---
 .../flat.set/scary.compile.pass.cpp           | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.set/scary.compile.pass.cpp

diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.set/scary.compile.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.set/scary.compile.pass.cpp
new file mode 100644
index 0000000000000..99e93fc3b08b9
--- /dev/null
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.set/scary.compile.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// class flat_set
+// class flat_multiset
+
+// Extension: SCARY/N2913 iterator compatibility between flat_set and flat_multiset
+// Test for the absence of this feature
+
+#include <flat_set>
+#include <type_traits>
+
+#include "test_macros.h"
+
+void test() {
+  typedef std::flat_set<int, int> M1;
+  typedef std::flat_multiset<int, int> M2;
+
+  static_assert(!std::is_convertible_v<M1::iterator, M2::iterator>);
+  static_assert(!std::is_convertible_v<M2::iterator, M1::iterator>);
+
+  static_assert(!std::is_convertible_v<M1::const_iterator, M2::const_iterator>);
+  static_assert(!std::is_convertible_v<M2::const_iterator, M1::const_iterator>);
+}

From d1ca8d891ff038ec29e67065a446aa2f2043325e Mon Sep 17 00:00:00 2001
From: lntue <lntue@google.com>
Date: Thu, 12 Jun 2025 13:18:30 -0400
Subject: [PATCH 275/851] [libc][math] Refactor expf implementation to
 header-only in src/__support/math folder. (#143790)

This is a step in preparation for:
https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450
---
 libc/shared/math.h                            |  16 ++
 libc/shared/math/expf.h                       |  23 +++
 libc/src/__support/CMakeLists.txt             |   2 +
 libc/src/__support/math/CMakeLists.txt        |  24 +++
 libc/src/__support/math/exp_float_constants.h | 145 ++++++++++++++++++
 libc/src/__support/math/expf.h                | 116 ++++++++++++++
 libc/src/math/generic/CMakeLists.txt          |  10 +-
 libc/src/math/generic/expf.cpp                |  97 +-----------
 .../llvm-project-overlay/libc/BUILD.bazel     |  39 ++++-
 .../libc/test/libc_test_rules.bzl             |   1 +
 10 files changed, 361 insertions(+), 112 deletions(-)
 create mode 100644 libc/shared/math.h
 create mode 100644 libc/shared/math/expf.h
 create mode 100644 libc/src/__support/math/CMakeLists.txt
 create mode 100644 libc/src/__support/math/exp_float_constants.h
 create mode 100644 libc/src/__support/math/expf.h

diff --git a/libc/shared/math.h b/libc/shared/math.h
new file mode 100644
index 0000000000000..4ddc29c7ae834
--- /dev/null
+++ b/libc/shared/math.h
@@ -0,0 +1,16 @@
+//===-- Floating point math functions ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_H
+#define LLVM_LIBC_SHARED_MATH_H
+
+#include "libc_common.h"
+
+#include "math/expf.h"
+
+#endif // LLVM_LIBC_SHARED_MATH_H
diff --git a/libc/shared/math/expf.h b/libc/shared/math/expf.h
new file mode 100644
index 0000000000000..a4e8b0751bb42
--- /dev/null
+++ b/libc/shared/math/expf.h
@@ -0,0 +1,23 @@
+//===-- Shared expf function ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_EXPF_H
+#define LLVM_LIBC_SHARED_MATH_EXPF_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/expf.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::expf;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_EXPF_H
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 327ff5e0c6a37..8cf2b0cdcdccc 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -390,3 +390,5 @@ add_subdirectory(HashTable)
 add_subdirectory(fixed_point)
 
 add_subdirectory(time)
+
+add_subdirectory(math)
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
new file mode 100644
index 0000000000000..66c1d19a1cab0
--- /dev/null
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -0,0 +1,24 @@
+add_header_library(
+  exp_float_constants
+  HDRS
+    exp_float_constants.h
+  DEPENDS
+    libc.src.__support.macros.config
+)
+
+add_header_library(
+  expf
+  HDRS
+    expf.h
+  DEPENDS
+    .exp_float_constants
+    libc.src.__support.common
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.config
+    libc.src.__support.macros.optimization
+)
diff --git a/libc/src/__support/math/exp_float_constants.h b/libc/src/__support/math/exp_float_constants.h
new file mode 100644
index 0000000000000..cabb227a034b5
--- /dev/null
+++ b/libc/src/__support/math/exp_float_constants.h
@@ -0,0 +1,145 @@
+//===-- Look-up tables for exp*f functions ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+// Lookup table for exp(m) with m = -104, ..., 89.
+//   -104 = floor(log(single precision's min denormal))
+//     89 = ceil(log(single precision's max normal))
+// Table is generated with Sollya as follow:
+// > display = hexadecimal;
+// > for i from -104 to 89 do { D(exp(i)); };
+static constexpr double EXP_M1[195] = {
+    0x1.f1e6b68529e33p-151, 0x1.525be4e4e601dp-149, 0x1.cbe0a45f75eb1p-148,
+    0x1.3884e838aea68p-146, 0x1.a8c1f14e2af5dp-145, 0x1.20a717e64a9bdp-143,
+    0x1.8851d84118908p-142, 0x1.0a9bdfb02d240p-140, 0x1.6a5bea046b42ep-139,
+    0x1.ec7f3b269efa8p-138, 0x1.4eafb87eab0f2p-136, 0x1.c6e2d05bbc000p-135,
+    0x1.35208867c2683p-133, 0x1.a425b317eeacdp-132, 0x1.1d8508fa8246ap-130,
+    0x1.840fbc08fdc8ap-129, 0x1.07b7112bc1ffep-127, 0x1.666d0dad2961dp-126,
+    0x1.e726c3f64d0fep-125, 0x1.4b0dc07cabf98p-123, 0x1.c1f2daf3b6a46p-122,
+    0x1.31c5957a47de2p-120, 0x1.9f96445648b9fp-119, 0x1.1a6baeadb4fd1p-117,
+    0x1.7fd974d372e45p-116, 0x1.04da4d1452919p-114, 0x1.62891f06b3450p-113,
+    0x1.e1dd273aa8a4ap-112, 0x1.4775e0840bfddp-110, 0x1.bd109d9d94bdap-109,
+    0x1.2e73f53fba844p-107, 0x1.9b138170d6bfep-106, 0x1.175af0cf60ec5p-104,
+    0x1.7baee1bffa80bp-103, 0x1.02057d1245cebp-101, 0x1.5eafffb34ba31p-100,
+    0x1.dca23bae16424p-99,  0x1.43e7fc88b8056p-97,  0x1.b83bf23a9a9ebp-96,
+    0x1.2b2b8dd05b318p-94,  0x1.969d47321e4ccp-93,  0x1.1452b7723aed2p-91,
+    0x1.778fe2497184cp-90,  0x1.fe7116182e9ccp-89,  0x1.5ae191a99585ap-87,
+    0x1.d775d87da854dp-86,  0x1.4063f8cc8bb98p-84,  0x1.b374b315f87c1p-83,
+    0x1.27ec458c65e3cp-81,  0x1.923372c67a074p-80,  0x1.1152eaeb73c08p-78,
+    0x1.737c5645114b5p-77,  0x1.f8e6c24b5592ep-76,  0x1.571db733a9d61p-74,
+    0x1.d257d547e083fp-73,  0x1.3ce9b9de78f85p-71,  0x1.aebabae3a41b5p-70,
+    0x1.24b6031b49bdap-68,  0x1.8dd5e1bb09d7ep-67,  0x1.0e5b73d1ff53dp-65,
+    0x1.6f741de1748ecp-64,  0x1.f36bd37f42f3ep-63,  0x1.536452ee2f75cp-61,
+    0x1.cd480a1b74820p-60,  0x1.39792499b1a24p-58,  0x1.aa0de4bf35b38p-57,
+    0x1.2188ad6ae3303p-55,  0x1.898471fca6055p-54,  0x1.0b6c3afdde064p-52,
+    0x1.6b7719a59f0e0p-51,  0x1.ee001eed62aa0p-50,  0x1.4fb547c775da8p-48,
+    0x1.c8464f7616468p-47,  0x1.36121e24d3bbap-45,  0x1.a56e0c2ac7f75p-44,
+    0x1.1e642baeb84a0p-42,  0x1.853f01d6d53bap-41,  0x1.0885298767e9ap-39,
+    0x1.67852a7007e42p-38,  0x1.e8a37a45fc32ep-37,  0x1.4c1078fe9228ap-35,
+    0x1.c3527e433fab1p-34,  0x1.32b48bf117da2p-32,  0x1.a0db0d0ddb3ecp-31,
+    0x1.1b48655f37267p-29,  0x1.81056ff2c5772p-28,  0x1.05a628c699fa1p-26,
+    0x1.639e3175a689dp-25,  0x1.e355bbaee85cbp-24,  0x1.4875ca227ec38p-22,
+    0x1.be6c6fdb01612p-21,  0x1.2f6053b981d98p-19,  0x1.9c54c3b43bc8bp-18,
+    0x1.18354238f6764p-16,  0x1.7cd79b5647c9bp-15,  0x1.02cf22526545ap-13,
+    0x1.5fc21041027adp-12,  0x1.de16b9c24a98fp-11,  0x1.44e51f113d4d6p-9,
+    0x1.b993fe00d5376p-8,   0x1.2c155b8213cf4p-6,   0x1.97db0ccceb0afp-5,
+    0x1.152aaa3bf81ccp-3,   0x1.78b56362cef38p-2,   0x1.0000000000000p+0,
+    0x1.5bf0a8b145769p+1,   0x1.d8e64b8d4ddaep+2,   0x1.415e5bf6fb106p+4,
+    0x1.b4c902e273a58p+5,   0x1.28d389970338fp+7,   0x1.936dc5690c08fp+8,
+    0x1.122885aaeddaap+10,  0x1.749ea7d470c6ep+11,  0x1.fa7157c470f82p+12,
+    0x1.5829dcf950560p+14,  0x1.d3c4488ee4f7fp+15,  0x1.3de1654d37c9ap+17,
+    0x1.b00b5916ac955p+18,  0x1.259ac48bf05d7p+20,  0x1.8f0ccafad2a87p+21,
+    0x1.0f2ebd0a80020p+23,  0x1.709348c0ea4f9p+24,  0x1.f4f22091940bdp+25,
+    0x1.546d8f9ed26e1p+27,  0x1.ceb088b68e804p+28,  0x1.3a6e1fd9eecfdp+30,
+    0x1.ab5adb9c43600p+31,  0x1.226af33b1fdc1p+33,  0x1.8ab7fb5475fb7p+34,
+    0x1.0c3d3920962c9p+36,  0x1.6c932696a6b5dp+37,  0x1.ef822f7f6731dp+38,
+    0x1.50bba3796379ap+40,  0x1.c9aae4631c056p+41,  0x1.370470aec28edp+43,
+    0x1.a6b765d8cdf6dp+44,  0x1.1f43fcc4b662cp+46,  0x1.866f34a725782p+47,
+    0x1.0953e2f3a1ef7p+49,  0x1.689e221bc8d5bp+50,  0x1.ea215a1d20d76p+51,
+    0x1.4d13fbb1a001ap+53,  0x1.c4b334617cc67p+54,  0x1.33a43d282a519p+56,
+    0x1.a220d397972ebp+57,  0x1.1c25c88df6862p+59,  0x1.8232558201159p+60,
+    0x1.0672a3c9eb871p+62,  0x1.64b41c6d37832p+63,  0x1.e4cf766fe49bep+64,
+    0x1.49767bc0483e3p+66,  0x1.bfc951eb8bb76p+67,  0x1.304d6aeca254bp+69,
+    0x1.9d97010884251p+70,  0x1.19103e4080b45p+72,  0x1.7e013cd114461p+73,
+    0x1.03996528e074cp+75,  0x1.60d4f6fdac731p+76,  0x1.df8c5af17ba3bp+77,
+    0x1.45e3076d61699p+79,  0x1.baed16a6e0da7p+80,  0x1.2cffdfebde1a1p+82,
+    0x1.9919cabefcb69p+83,  0x1.160345c9953e3p+85,  0x1.79dbc9dc53c66p+86,
+    0x1.00c810d464097p+88,  0x1.5d009394c5c27p+89,  0x1.da57de8f107a8p+90,
+    0x1.425982cf597cdp+92,  0x1.b61e5ca3a5e31p+93,  0x1.29bb825dfcf87p+95,
+    0x1.94a90db0d6fe2p+96,  0x1.12fec759586fdp+98,  0x1.75c1dc469e3afp+99,
+    0x1.fbfd219c43b04p+100, 0x1.5936d44e1a146p+102, 0x1.d531d8a7ee79cp+103,
+    0x1.3ed9d24a2d51bp+105, 0x1.b15cfe5b6e17bp+106, 0x1.268038c2c0e00p+108,
+    0x1.9044a73545d48p+109, 0x1.1002ab6218b38p+111, 0x1.71b3540cbf921p+112,
+    0x1.f6799ea9c414ap+113, 0x1.55779b984f3ebp+115, 0x1.d01a210c44aa4p+116,
+    0x1.3b63da8e91210p+118, 0x1.aca8d6b0116b8p+119, 0x1.234de9e0c74e9p+121,
+    0x1.8bec7503ca477p+122, 0x1.0d0eda9796b90p+124, 0x1.6db0118477245p+125,
+    0x1.f1056dc7bf22dp+126, 0x1.51c2cc3433801p+128, 0x1.cb108ffbec164p+129,
+};
+
+// Lookup table for exp(m * 2^(-7)) with m = 0, ..., 127.
+// Table is generated with Sollya as follow:
+// > display = hexadecimal;
+// > for i from 0 to 127 do { D(exp(i / 128)); };
+static constexpr double EXP_M2[128] = {
+    0x1.0000000000000p0, 0x1.0202015600446p0, 0x1.04080ab55de39p0,
+    0x1.06122436410ddp0, 0x1.08205601127edp0, 0x1.0a32a84e9c1f6p0,
+    0x1.0c49236829e8cp0, 0x1.0e63cfa7ab09dp0, 0x1.1082b577d34edp0,
+    0x1.12a5dd543ccc5p0, 0x1.14cd4fc989cd6p0, 0x1.16f9157587069p0,
+    0x1.192937074e0cdp0, 0x1.1b5dbd3f68122p0, 0x1.1d96b0eff0e79p0,
+    0x1.1fd41afcba45ep0, 0x1.2216045b6f5cdp0, 0x1.245c7613b8a9bp0,
+    0x1.26a7793f60164p0, 0x1.28f7170a755fdp0, 0x1.2b4b58b372c79p0,
+    0x1.2da4478b620c7p0, 0x1.3001ecf601af7p0, 0x1.32645269ea829p0,
+    0x1.34cb8170b5835p0, 0x1.373783a722012p0, 0x1.39a862bd3c106p0,
+    0x1.3c1e2876834aap0, 0x1.3e98deaa11dccp0, 0x1.41188f42c3e32p0,
+    0x1.439d443f5f159p0, 0x1.462707b2bac21p0, 0x1.48b5e3c3e8186p0,
+    0x1.4b49e2ae5ac67p0, 0x1.4de30ec211e60p0, 0x1.50817263c13cdp0,
+    0x1.5325180cfacf7p0, 0x1.55ce0a4c58c7cp0, 0x1.587c53c5a7af0p0,
+    0x1.5b2fff3210fd9p0, 0x1.5de9176045ff5p0, 0x1.60a7a734ab0e8p0,
+    0x1.636bb9a983258p0, 0x1.663559cf1bc7cp0, 0x1.690492cbf9433p0,
+    0x1.6bd96fdd034a2p0, 0x1.6eb3fc55b1e76p0, 0x1.719443a03acb9p0,
+    0x1.747a513dbef6ap0, 0x1.776630c678bc1p0, 0x1.7a57ede9ea23ep0,
+    0x1.7d4f946f0ba8dp0, 0x1.804d30347b546p0, 0x1.8350cd30ac390p0,
+    0x1.865a7772164c5p0, 0x1.896a3b1f66a0ep0, 0x1.8c802477b0010p0,
+    0x1.8f9c3fd29beafp0, 0x1.92be99a09bf00p0, 0x1.95e73e6b1b75ep0,
+    0x1.99163ad4b1dccp0, 0x1.9c4b9b995509bp0, 0x1.9f876d8e8c566p0,
+    0x1.a2c9bda3a3e78p0, 0x1.a61298e1e069cp0, 0x1.a9620c6cb3374p0,
+    0x1.acb82581eee54p0, 0x1.b014f179fc3b8p0, 0x1.b3787dc80f95fp0,
+    0x1.b6e2d7fa5eb18p0, 0x1.ba540dba56e56p0, 0x1.bdcc2cccd3c85p0,
+    0x1.c14b431256446p0, 0x1.c4d15e873c193p0, 0x1.c85e8d43f7cd0p0,
+    0x1.cbf2dd7d490f2p0, 0x1.cf8e5d84758a9p0, 0x1.d3311bc7822b4p0,
+    0x1.d6db26d16cd67p0, 0x1.da8c8d4a66969p0, 0x1.de455df80e3c0p0,
+    0x1.e205a7bdab73ep0, 0x1.e5cd799c6a54ep0, 0x1.e99ce2b397649p0,
+    0x1.ed73f240dc142p0, 0x1.f152b7a07bb76p0, 0x1.f539424d90f5ep0,
+    0x1.f927a1e24bb76p0, 0x1.fd1de6182f8c9p0, 0x1.008e0f64294abp1,
+    0x1.02912df5ce72ap1, 0x1.049856cd84339p1, 0x1.06a39207f0a09p1,
+    0x1.08b2e7d2035cfp1, 0x1.0ac6606916501p1, 0x1.0cde041b0e9aep1,
+    0x1.0ef9db467dcf8p1, 0x1.1119ee5ac36b6p1, 0x1.133e45d82e952p1,
+    0x1.1566ea50201d7p1, 0x1.1793e4652cc50p1, 0x1.19c53ccb3fc6bp1,
+    0x1.1bfafc47bda73p1, 0x1.1e352bb1a74adp1, 0x1.2073d3f1bd518p1,
+    0x1.22b6fe02a3b9cp1, 0x1.24feb2f105cb8p1, 0x1.274afbdbba4a6p1,
+    0x1.299be1f3e7f1cp1, 0x1.2bf16e7d2a38cp1, 0x1.2e4baacdb6614p1,
+    0x1.30aaa04e80d05p1, 0x1.330e587b62b28p1, 0x1.3576dce33feadp1,
+    0x1.37e437282d4eep1, 0x1.3a5670ff972edp1, 0x1.3ccd9432682b4p1,
+    0x1.3f49aa9d30590p1, 0x1.41cabe304cb34p1, 0x1.4450d8f00edd4p1,
+    0x1.46dc04f4e5338p1, 0x1.496c4c6b832dap1, 0x1.4c01b9950a111p1,
+    0x1.4e9c56c731f5dp1, 0x1.513c2e6c731d7p1, 0x1.53e14b042f9cap1,
+    0x1.568bb722dd593p1, 0x1.593b7d72305bbp1,
+};
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H
diff --git a/libc/src/__support/math/expf.h b/libc/src/__support/math/expf.h
new file mode 100644
index 0000000000000..88c151492a041
--- /dev/null
+++ b/libc/src/__support/math/expf.h
@@ -0,0 +1,116 @@
+//===-- Implementation header for expf --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXPF_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_EXPF_H
+
+#include "exp_float_constants.h" // Lookup tables EXP_M1 and EXP_M2.
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+static constexpr float expf(float x) {
+  using FPBits = typename fputil::FPBits<float>;
+  FPBits xbits(x);
+
+  uint32_t x_u = xbits.uintval();
+  uint32_t x_abs = x_u & 0x7fff'ffffU;
+
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+  // Exceptional values
+  if (LIBC_UNLIKELY(x_u == 0xc236'bd8cU)) { // x = -0x1.6d7b18p+5f
+    return 0x1.108a58p-66f - x * 0x1.0p-95f;
+  }
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
+  // When |x| >= 89, |x| < 2^-25, or x is nan
+  if (LIBC_UNLIKELY(x_abs >= 0x42b2'0000U || x_abs <= 0x3280'0000U)) {
+    // |x| < 2^-25
+    if (xbits.get_biased_exponent() <= 101) {
+      return 1.0f + x;
+    }
+
+    // When x < log(2^-150) or nan
+    if (xbits.uintval() >= 0xc2cf'f1b5U) {
+      // exp(-Inf) = 0
+      if (xbits.is_inf())
+        return 0.0f;
+      // exp(nan) = nan
+      if (xbits.is_nan())
+        return x;
+      if (fputil::fenv_is_round_up())
+        return FPBits::min_subnormal().get_val();
+      fputil::set_errno_if_required(ERANGE);
+      fputil::raise_except_if_required(FE_UNDERFLOW);
+      return 0.0f;
+    }
+    // x >= 89 or nan
+    if (xbits.is_pos() && (xbits.uintval() >= 0x42b2'0000)) {
+      // x is finite
+      if (xbits.uintval() < 0x7f80'0000U) {
+        int rounding = fputil::quick_get_round();
+        if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
+          return FPBits::max_normal().get_val();
+
+        fputil::set_errno_if_required(ERANGE);
+        fputil::raise_except_if_required(FE_OVERFLOW);
+      }
+      // x is +inf or nan
+      return x + FPBits::inf().get_val();
+    }
+  }
+  // For -104 < x < 89, to compute exp(x), we perform the following range
+  // reduction: find hi, mid, lo such that:
+  //   x = hi + mid + lo, in which
+  //     hi is an integer,
+  //     mid * 2^7 is an integer
+  //     -2^(-8) <= lo < 2^-8.
+  // In particular,
+  //   hi + mid = round(x * 2^7) * 2^(-7).
+  // Then,
+  //   exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
+  // We store exp(hi) and exp(mid) in the lookup tables EXP_M1 and EXP_M2
+  // respectively.  exp(lo) is computed using a degree-4 minimax polynomial
+  // generated by Sollya.
+
+  // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
+  float kf = fputil::nearest_integer(x * 0x1.0p7f);
+  // Subtract (hi + mid) from x to get lo.
+  double xd = static_cast<double>(fputil::multiply_add(kf, -0x1.0p-7f, x));
+  int x_hi = static_cast<int>(kf);
+  x_hi += 104 << 7;
+  // hi = x_hi >> 7
+  double exp_hi = EXP_M1[x_hi >> 7];
+  // mid * 2^7 = x_hi & 0x0000'007fU;
+  double exp_mid = EXP_M2[x_hi & 0x7f];
+  // Degree-4 minimax polynomial generated by Sollya with the following
+  // commands:
+  //   > display = hexadecimal;
+  //   > Q = fpminimax(expm1(x)/x, 3, [|D...|], [-2^-8, 2^-8]);
+  //   > Q;
+  double exp_lo =
+      fputil::polyeval(xd, 0x1p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1,
+                       0x1.555566668e5e7p-3, 0x1.55555555ef243p-5);
+  return static_cast<float>(exp_hi * exp_mid * exp_lo);
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXPF_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index db3ef8886b52b..fd1e6c0d648aa 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1321,15 +1321,7 @@ add_entrypoint_object(
   HDRS
     ../expf.h
   DEPENDS
-    .common_constants
-    libc.src.__support.FPUtil.basic_operations
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.FPUtil.polyeval
-    libc.src.__support.FPUtil.rounding_mode
-    libc.src.__support.macros.optimization
+    libc.src.__support.math.expf
     libc.src.errno.errno
 )
 
diff --git a/libc/src/math/generic/expf.cpp b/libc/src/math/generic/expf.cpp
index fa507d4d9322c..de11f51ac64a0 100644
--- a/libc/src/math/generic/expf.cpp
+++ b/libc/src/math/generic/expf.cpp
@@ -7,103 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/expf.h"
-#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
-#include "src/__support/FPUtil/BasicOperations.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/math/expf.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
-LLVM_LIBC_FUNCTION(float, expf, (float x)) {
-  using FPBits = typename fputil::FPBits<float>;
-  FPBits xbits(x);
-
-  uint32_t x_u = xbits.uintval();
-  uint32_t x_abs = x_u & 0x7fff'ffffU;
-
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  // Exceptional values
-  if (LIBC_UNLIKELY(x_u == 0xc236'bd8cU)) { // x = -0x1.6d7b18p+5f
-    return 0x1.108a58p-66f - x * 0x1.0p-95f;
-  }
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
-  // When |x| >= 89, |x| < 2^-25, or x is nan
-  if (LIBC_UNLIKELY(x_abs >= 0x42b2'0000U || x_abs <= 0x3280'0000U)) {
-    // |x| < 2^-25
-    if (xbits.get_biased_exponent() <= 101) {
-      return 1.0f + x;
-    }
-
-    // When x < log(2^-150) or nan
-    if (xbits.uintval() >= 0xc2cf'f1b5U) {
-      // exp(-Inf) = 0
-      if (xbits.is_inf())
-        return 0.0f;
-      // exp(nan) = nan
-      if (xbits.is_nan())
-        return x;
-      if (fputil::fenv_is_round_up())
-        return FPBits::min_subnormal().get_val();
-      fputil::set_errno_if_required(ERANGE);
-      fputil::raise_except_if_required(FE_UNDERFLOW);
-      return 0.0f;
-    }
-    // x >= 89 or nan
-    if (xbits.is_pos() && (xbits.uintval() >= 0x42b2'0000)) {
-      // x is finite
-      if (xbits.uintval() < 0x7f80'0000U) {
-        int rounding = fputil::quick_get_round();
-        if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-          return FPBits::max_normal().get_val();
-
-        fputil::set_errno_if_required(ERANGE);
-        fputil::raise_except_if_required(FE_OVERFLOW);
-      }
-      // x is +inf or nan
-      return x + FPBits::inf().get_val();
-    }
-  }
-  // For -104 < x < 89, to compute exp(x), we perform the following range
-  // reduction: find hi, mid, lo such that:
-  //   x = hi + mid + lo, in which
-  //     hi is an integer,
-  //     mid * 2^7 is an integer
-  //     -2^(-8) <= lo < 2^-8.
-  // In particular,
-  //   hi + mid = round(x * 2^7) * 2^(-7).
-  // Then,
-  //   exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
-  // We store exp(hi) and exp(mid) in the lookup tables EXP_M1 and EXP_M2
-  // respectively.  exp(lo) is computed using a degree-4 minimax polynomial
-  // generated by Sollya.
-
-  // x_hi = (hi + mid) * 2^7 = round(x * 2^7).
-  float kf = fputil::nearest_integer(x * 0x1.0p7f);
-  // Subtract (hi + mid) from x to get lo.
-  double xd = static_cast<double>(fputil::multiply_add(kf, -0x1.0p-7f, x));
-  int x_hi = static_cast<int>(kf);
-  x_hi += 104 << 7;
-  // hi = x_hi >> 7
-  double exp_hi = EXP_M1[x_hi >> 7];
-  // mid * 2^7 = x_hi & 0x0000'007fU;
-  double exp_mid = EXP_M2[x_hi & 0x7f];
-  // Degree-4 minimax polynomial generated by Sollya with the following
-  // commands:
-  //   > display = hexadecimal;
-  //   > Q = fpminimax(expm1(x)/x, 3, [|D...|], [-2^-8, 2^-8]);
-  //   > Q;
-  double exp_lo =
-      fputil::polyeval(xd, 0x1p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1,
-                       0x1.555566668e5e7p-3, 0x1.55555555ef243p-5);
-  return static_cast<float>(exp_hi * exp_mid * exp_lo);
-}
+LLVM_LIBC_FUNCTION(float, expf, (float x)) { return math::expf(x); }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 123d9ccc8310f..0cedad2859247 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1616,13 +1616,18 @@ libc_header_library(
 
 ############################### errno ########################################
 
+libc_support_library(
+    name = "__support_libc_errno",
+    hdrs = ["src/__support/libc_errno.h"],
+)
+
 libc_support_library(
     name = "errno",
     srcs = ["src/errno/libc_errno.cpp"],
-    hdrs = ["src/__support/libc_errno.h"],
     deps = [
         ":__support_common",
         ":__support_cpp_atomic",
+        ":__support_libc_errno",
         ":__support_macros_attributes",
         ":__support_macros_properties_architectures",
         ":hdr_errno_macros",
@@ -1973,6 +1978,29 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_math_exp_float_constants",
+    hdrs = ["src/__support/math/exp_float_constants.h"],
+)
+
+libc_support_library(
+    name = "__support_math_expf",
+    hdrs = ["src/__support/math/expf.h"],
+    deps = [
+        ":__support_common",
+        ":__support_fputil_fenv_impl",
+        ":__support_fputil_fp_bits",
+        ":__support_fputil_multiply_add",
+        ":__support_fputil_nearest_integer",
+        ":__support_fputil_polyeval",
+        ":__support_fputil_rounding_mode",
+        ":__support_libc_errno",
+        ":__support_macros_config",
+        ":__support_macros_optimization",
+        ":__support_math_exp_float_constants",
+    ],
+)
+
 ############################### complex targets ################################
 
 libc_function(
@@ -2570,13 +2598,8 @@ libc_math_function(
 libc_math_function(
     name = "expf",
     additional_deps = [
-        ":__support_fputil_fma",
-        ":__support_fputil_multiply_add",
-        ":__support_fputil_nearest_integer",
-        ":__support_fputil_polyeval",
-        ":__support_fputil_rounding_mode",
-        ":__support_macros_optimization",
-        ":common_constants",
+        ":__support_math_expf",
+        ":errno",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
index 123e05727aeff..ba8753a17a855 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
@@ -31,6 +31,7 @@ def libc_test(name, copts = [], deps = [], local_defines = [], **kwargs):
         deps = [
             "//libc/test/UnitTest:LibcUnitTest",
             "//libc:__support_macros_config",
+            "//libc:__support_libc_errno",
             "//libc:errno",
             "//libc:func_aligned_alloc",
             "//libc:func_free",

From 6311f039b2678f0a1367a88679efb7b2e37949dc Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Thu, 12 Jun 2025 17:34:45 +0000
Subject: [PATCH 276/851] [libc] Build fixes for widechar characterconverter
 (#143805)

Build fixes for wchar CharacterConverter class
---
 libc/hdr/CMakeLists.txt                |  2 ++
 libc/hdr/types/CMakeLists.txt          | 22 ++++++++++++++++++++++
 libc/include/llvm-libc-types/char8_t.h |  3 +--
 libc/src/__support/CMakeLists.txt      |  2 ++
 libc/src/__support/wchar/mbstate.h     |  1 +
 libc/src/__support/wchar/utf_ret.h     |  5 ++++-
 6 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 1e9f59621a8e5..052a773a4fcec 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -221,6 +221,8 @@ add_proxy_header_library(
 
 add_header_library(wchar_overlay HDRS wchar_overlay.h)
 
+add_header_library(uchar_overlay HDRS uchar_overlay.h)
+
 add_proxy_header_library(
   wchar_macros
   HDRS
diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
index 5f6197c93d445..c88c357009072 100644
--- a/libc/hdr/types/CMakeLists.txt
+++ b/libc/hdr/types/CMakeLists.txt
@@ -1,3 +1,25 @@
+add_proxy_header_library(
+  char8_t 
+  HDRS
+    char8_t.h
+  DEPENDS
+    libc.hdr.uchar_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.char8_t
+    libc.include.uchar
+)
+
+add_proxy_header_library(
+  char32_t 
+  HDRS
+    char32_t.h
+  DEPENDS
+    libc.hdr.uchar_overlay
+  FULL_BUILD_DEPENDS
+    libc.include.llvm-libc-types.char32_t
+    libc.include.uchar
+)
+
 add_proxy_header_library(
   div_t
   HDRS
diff --git a/libc/include/llvm-libc-types/char8_t.h b/libc/include/llvm-libc-types/char8_t.h
index ddadab1afa219..a343be77d810b 100644
--- a/libc/include/llvm-libc-types/char8_t.h
+++ b/libc/include/llvm-libc-types/char8_t.h
@@ -9,8 +9,7 @@
 #ifndef LLVM_LIBC_TYPES_CHAR8_T_H
 #define LLVM_LIBC_TYPES_CHAR8_T_H
 
-#if !defined(__cplusplus) && defined(__STDC_VERSION__) &&                      \
-    __STDC_VERSION__ >= 202311L
+#if !(defined(__cplusplus) && defined(__cpp_char8_t))
 typedef unsigned char char8_t;
 #endif
 
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 8cf2b0cdcdccc..309cde76370f0 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -391,4 +391,6 @@ add_subdirectory(fixed_point)
 
 add_subdirectory(time)
 
+add_subdirectory(wchar)
+
 add_subdirectory(math)
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index 72ec727560003..cb8950374de41 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
 
 #include "hdr/types/char32_t.h"
+#include "src/__support/common.h"
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
index b8a8f6f094143..fa99b76159bd8 100644
--- a/libc/src/__support/wchar/utf_ret.h
+++ b/libc/src/__support/wchar/utf_ret.h
@@ -9,13 +9,16 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 #define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
 
-namespace LIBC_NAMESPACE_DECL {
+#include "src/__support/common.h"
 
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
 template <typename T> struct utf_ret {
   T out;
   int error;
 };
 
+} // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
 
 #endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H

From 9208b343e962b9f1140ee345c0050a3920bdcbf2 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Thu, 12 Jun 2025 13:38:13 -0400
Subject: [PATCH 277/851]  [PowerPC] frontend get target feature from backend
 with cpu name (#137670)

1. The PR proceeds with a backend target hook to allow front-ends to
determine what target features are available in a compilation based on
the CPU name.
2. Fix a backend target feature bug that supports HTM for
Power8/9/10/11. However, HTM is only supported on Power8/9 according to
the ISA.
3. All target features that are hardcoded in PPC.cpp can be retrieved
from the backend target feature. I have double-checked that the
hardcoded logic for inferring target features from the CPU in the
frontend(PPC.cpp) is the same as in PPC.td.
---
 clang/lib/Basic/Targets/PPC.cpp               | 148 +-----------------
 .../cxx11-thread-local-reference.cpp          |   2 +-
 .../Driver/aix-shared-lib-tls-model-opt.c     |   7 +-
 .../Driver/aix-small-local-exec-dynamic-tls.c |  39 +++--
 clang/test/Driver/ppc-crbits.cpp              |   4 -
 clang/test/Driver/ppc-isa-features.cpp        |  22 +--
 .../llvm/TargetParser/PPCTargetParser.h       |   6 +
 llvm/include/llvm/TargetParser/TargetParser.h |  27 ++++
 llvm/lib/Target/PowerPC/PPC.td                |   4 +-
 llvm/lib/TargetParser/CMakeLists.txt          |   8 +
 llvm/lib/TargetParser/PPCTargetParser.cpp     |  25 +++
 llvm/lib/TargetParser/TargetParser.cpp        |  47 ++++++
 llvm/utils/TableGen/SubtargetEmitter.cpp      |  50 ++++--
 13 files changed, 191 insertions(+), 198 deletions(-)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index e6ef0ecc526ba..77145e2891a8a 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -15,6 +15,7 @@
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/TargetParser/PPCTargetParser.h"
+#include <optional>
 
 using namespace clang;
 using namespace clang::targets;
@@ -516,129 +517,14 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags,
 bool PPCTargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeaturesVec) const {
-  Features["altivec"] = llvm::StringSwitch<bool>(CPU)
-                            .Case("7400", true)
-                            .Case("g4", true)
-                            .Case("7450", true)
-                            .Case("g4+", true)
-                            .Case("970", true)
-                            .Case("g5", true)
-                            .Case("pwr6", true)
-                            .Case("pwr7", true)
-                            .Case("pwr8", true)
-                            .Case("pwr9", true)
-                            .Case("ppc64", true)
-                            .Case("ppc64le", true)
-                            .Default(false);
-
-  Features["power9-vector"] = (CPU == "pwr9");
-  Features["crypto"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Default(false);
-  Features["power8-vector"] = llvm::StringSwitch<bool>(CPU)
-                                  .Case("ppc64le", true)
-                                  .Case("pwr9", true)
-                                  .Case("pwr8", true)
-                                  .Default(false);
-  Features["bpermd"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Case("pwr7", true)
-                           .Default(false);
-  Features["extdiv"] = llvm::StringSwitch<bool>(CPU)
-                           .Case("ppc64le", true)
-                           .Case("pwr9", true)
-                           .Case("pwr8", true)
-                           .Case("pwr7", true)
-                           .Default(false);
-  Features["direct-move"] = llvm::StringSwitch<bool>(CPU)
-                                .Case("ppc64le", true)
-                                .Case("pwr9", true)
-                                .Case("pwr8", true)
-                                .Default(false);
-  Features["crbits"] = llvm::StringSwitch<bool>(CPU)
-                                .Case("ppc64le", true)
-                                .Case("pwr9", true)
-                                .Case("pwr8", true)
-                                .Default(false);
-  Features["vsx"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("ppc64le", true)
-                        .Case("pwr9", true)
-                        .Case("pwr8", true)
-                        .Case("pwr7", true)
-                        .Default(false);
-  Features["htm"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("ppc64le", true)
-                        .Case("pwr9", true)
-                        .Case("pwr8", true)
-                        .Default(false);
-
-  // ROP Protect is off by default.
-  Features["rop-protect"] = false;
-  // Privileged instructions are off by default.
-  Features["privileged"] = false;
 
-  if (getTriple().isOSAIX()) {
-    // The code generated by the -maix-small-local-[exec|dynamic]-tls option is
-    // turned off by default.
-    Features["aix-small-local-exec-tls"] = false;
-    Features["aix-small-local-dynamic-tls"] = false;
-
-    // Turn off TLS model opt by default.
-    Features["aix-shared-lib-tls-model-opt"] = false;
-  }
-
-  Features["spe"] = llvm::StringSwitch<bool>(CPU)
-                        .Case("8548", true)
-                        .Case("e500", true)
-                        .Default(false);
-
-  Features["isa-v206-instructions"] = llvm::StringSwitch<bool>(CPU)
-                                          .Case("ppc64le", true)
-                                          .Case("pwr9", true)
-                                          .Case("pwr8", true)
-                                          .Case("pwr7", true)
-                                          .Case("a2", true)
-                                          .Default(false);
-
-  Features["isa-v207-instructions"] = llvm::StringSwitch<bool>(CPU)
-                                          .Case("ppc64le", true)
-                                          .Case("pwr9", true)
-                                          .Case("pwr8", true)
-                                          .Default(false);
-
-  Features["isa-v30-instructions"] =
-      llvm::StringSwitch<bool>(CPU).Case("pwr9", true).Default(false);
-
-  Features["quadword-atomics"] =
-      getTriple().isArch64Bit() && llvm::StringSwitch<bool>(CPU)
-                                       .Case("pwr9", true)
-                                       .Case("pwr8", true)
-                                       .Default(false);
-
-  // Power10 includes all the same features as Power9 plus any features specific
-  // to the Power10 core.
-  if (CPU == "pwr10" || CPU == "power10") {
-    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
-    addP10SpecificFeatures(Features);
-  }
-
-  // Power11 includes all the same features as Power10 plus any features
-  // specific to the Power11 core.
-  if (CPU == "pwr11" || CPU == "power11") {
-    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
-    addP11SpecificFeatures(Features);
-  }
+  const llvm::Triple &TheTriple = getTriple();
 
-  // Future CPU should include all of the features of Power 11 as well as any
-  // additional features (yet to be determined) specific to it.
-  if (CPU == "future") {
-    initFeatureMap(Features, Diags, "pwr11", FeaturesVec);
-    addFutureSpecificFeatures(Features);
-  }
+  std::optional<llvm::StringMap<bool>> FeaturesOpt =
+      llvm::PPC::getPPCDefaultTargetFeatures(TheTriple,
+                                             llvm::PPC::normalizeCPUName(CPU));
+  if (FeaturesOpt)
+    Features = FeaturesOpt.value();
 
   if (!ppcUserFeaturesCheck(Diags, FeaturesVec))
     return false;
@@ -700,26 +586,6 @@ bool PPCTargetInfo::initFeatureMap(
   return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
 }
 
-// Add any Power10 specific features.
-void PPCTargetInfo::addP10SpecificFeatures(
-    llvm::StringMap<bool> &Features) const {
-  Features["htm"] = false; // HTM was removed for P10.
-  Features["paired-vector-memops"] = true;
-  Features["mma"] = true;
-  Features["power10-vector"] = true;
-  Features["pcrelative-memops"] = true;
-  Features["prefix-instrs"] = true;
-  Features["isa-v31-instructions"] = true;
-}
-
-// Add any Power11 specific features.
-void PPCTargetInfo::addP11SpecificFeatures(
-    llvm::StringMap<bool> &Features) const {}
-
-// Add features specific to the "Future" CPU.
-void PPCTargetInfo::addFutureSpecificFeatures(
-    llvm::StringMap<bool> &Features) const {}
-
 bool PPCTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Case("powerpc", true)
diff --git a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
index cd5a18f39060e..a0e76e8a9a0b6 100644
--- a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
@@ -35,5 +35,5 @@ int &g() { return r; }
 // DARWIN-LABEL: define internal cxx_fast_tlscc void @__tls_init()
 // CHECK: call void @[[R_INIT]]()
 
-// LINUX_AIX: attributes [[ATTR0]] = { {{.*}}"target-features"{{.*}} }
+// LINUX_AIX: attributes [[ATTR0]] = { {{.*}} }
 // DARWIN: attributes [[ATTR1]] = { {{.*}}nounwind{{.*}}"target-features"{{.*}}  }
diff --git a/clang/test/Driver/aix-shared-lib-tls-model-opt.c b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
index 7acf091f0a049..891caf4ed3fcd 100644
--- a/clang/test/Driver/aix-shared-lib-tls-model-opt.c
+++ b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
@@ -1,5 +1,5 @@
-// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
-// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
+// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
+// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 // RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 
@@ -19,9 +19,8 @@ int test(void) {
 
 // CHECK-AIX: test() #0 {
 // CHECK-AIX: attributes #0 = {
-// CHECK-AIX-OFF-SAME: -aix-shared-lib-tls-model-opt
 // CHECK-AIX-ON-SAME: +aix-shared-lib-tls-model-opt
 
-// CHECK-LINUX-NOT: {{[-+]aix-shared-lib-tls-model-opt}}
+// CHECK-LINUX-NOT: {{[+]aix-shared-lib-tls-model-opt}}
 
 // CHECK-UNSUPPORTED-TARGET: option '-maix-shared-lib-tls-model-opt' cannot be specified on this target
diff --git a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
index 1a0619b58e891..6fc2b8efb4aed 100644
--- a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
+++ b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
@@ -1,37 +1,37 @@
-// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
-// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
-// RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
-// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
+// RUN: %clang --target=powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang --target=powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
 
-// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
+// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALEXEC_TLS
 
-// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
+// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALDYNAMIC_TLS
 
-// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
-// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
@@ -39,10 +39,9 @@ int test(void) {
   return 0;
 }
 
-// CHECK-AIX-DEFAULT: test() #0 {
-// CHECK-AIX-DEFAULT: attributes #0 = {
-// CHECK-AIX-DEFAULT-SAME: {{-aix-small-local-exec-tls,.*-aix-small-local-dynamic-tls|-aix-small-local-dynamic-tls,.*-aix-small-local-exec-tls}}
-// CHECK-LINUX-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
+// CHECK-DEFAULT: test() #0 {
+// CHECK-DEFAULT: attributes #0 = {
+// CHECK-DEFAULT-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
 
 // CHECK-UNSUPPORTED-AIX32: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
 // CHECK-UNSUPPORTED-LINUX: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
diff --git a/clang/test/Driver/ppc-crbits.cpp b/clang/test/Driver/ppc-crbits.cpp
index 3ed56308cb526..62893d3d0e87d 100644
--- a/clang/test/Driver/ppc-crbits.cpp
+++ b/clang/test/Driver/ppc-crbits.cpp
@@ -64,8 +64,6 @@
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
-// RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -emit-llvm \
-// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mno-crbits \
@@ -92,8 +90,6 @@
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
-// RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -emit-llvm \
-// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mno-crbits \
diff --git a/clang/test/Driver/ppc-isa-features.cpp b/clang/test/Driver/ppc-isa-features.cpp
index 92c5bc82f72b8..35dbfbcdf5699 100644
--- a/clang/test/Driver/ppc-isa-features.cpp
+++ b/clang/test/Driver/ppc-isa-features.cpp
@@ -5,20 +5,20 @@
 // RUN: %clang -target powerpc64-unknown-aix -mcpu=pwr9 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR9
 // RUN: %clang -target powerpc-unknown-aix -mcpu=pwr10 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR10
 
-// CHECK-PWR6: -isa-v206-instructions
-// CHECK-PWR6: -isa-v207-instructions
-// CHECK-PWR6: -isa-v30-instructions
+// CHECK-PWR6-NOT: isa-v206-instructions
+// CHECK-PWR6-NOT: isa-v207-instructions
+// CHECK-PWR6-NOT: isa-v30-instructions
 
-// CHECK-A2: +isa-v206-instructions
-// CHECK-A2: -isa-v207-instructions
-// CHECK-A2: -isa-v30-instructions
+// CHECK-A2:     +isa-v206-instructions
+// CHECK-A2-NOT: isa-v207-instructions
+// CHECK-A2-NOT: isa-v30-instructions
 
-// CHECK-PWR7: +isa-v206-instructions
-// CHECK-PWR7: -isa-v207-instructions
-// CHECK-PWR7: -isa-v30-instructions
+// CHECK-PWR7:     +isa-v206-instructions
+// CHECK-PWR7-NOT: isa-v207-instructions
+// CHECK-PWR7-NOT: isa-v30-instructions
 
-// CHECK-PWR8: +isa-v207-instructions
-// CHECK-PWR8: -isa-v30-instructions
+// CHECK-PWR8:     +isa-v207-instructions
+// CHECK-PWR8-NOT: isa-v30-instructions
 
 // CHECK-PWR9: +isa-v207-instructions
 // CHECK-PWR9: +isa-v30-instructions
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
index 59d9f867005a4..d3d44afb5f544 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.h
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TARGETPARSER_PPCTARGETPARSER_H
 #define LLVM_TARGETPARSER_PPCTARGETPARSER_H
 
+#include "TargetParser.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
@@ -37,6 +39,10 @@ LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T,
 // For PPC, there are some cpu names for same CPU, like pwr10 and power10,
 // normalize them.
 LLVM_ABI StringRef normalizeCPUName(StringRef CPUName);
+
+LLVM_ABI std::optional<llvm::StringMap<bool>>
+getPPCDefaultTargetFeatures(const Triple &T, StringRef CPUName);
+
 } // namespace PPC
 } // namespace llvm
 
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index 176205e17ae00..b4a92cc6b6c4b 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TARGETPARSER_TARGETPARSER_H
 #define LLVM_TARGETPARSER_TARGETPARSER_H
 
+#include "SubtargetFeature.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
@@ -190,6 +192,31 @@ insertWaveSizeFeature(StringRef GPU, const Triple &T,
                       StringMap<bool> &Features);
 
 } // namespace AMDGPU
+
+struct BasicSubtargetFeatureKV {
+  const char *Key;         ///< K-V key string
+  unsigned Value;          ///< K-V integer value
+  FeatureBitArray Implies; ///< K-V bit mask
+};
+
+/// Used to provide key value pairs for feature and CPU bit flags.
+struct BasicSubtargetSubTypeKV {
+  const char *Key;         ///< K-V key string
+  FeatureBitArray Implies; ///< K-V bit mask
+
+  /// Compare routine for std::lower_bound
+  bool operator<(StringRef S) const { return StringRef(Key) < S; }
+
+  /// Compare routine for std::is_sorted.
+  bool operator<(const BasicSubtargetSubTypeKV &Other) const {
+    return StringRef(Key) < StringRef(Other.Key);
+  }
+};
+
+std::optional<llvm::StringMap<bool>>
+getCPUDefaultTargetFeatures(StringRef CPU,
+                            ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
+                            ArrayRef<BasicSubtargetFeatureKV> ProcFeatures);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index fd850faf7b2fb..ea7c2203662bd 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -411,7 +411,6 @@ def ProcessorFeatures {
      FeatureP8Altivec,
      FeatureP8Vector,
      FeatureP8Crypto,
-     FeatureHTM,
      FeatureDirectMove,
      FeatureICBT,
      FeaturePartwordAtomic,
@@ -422,6 +421,7 @@ def ProcessorFeatures {
     ];
 
   list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
+                                               FeatureHTM,
                                                FeatureAddisLoadFusion];
   list<SubtargetFeature> P8InheritableFeatures =
     !listconcat(P7InheritableFeatures, P8AdditionalFeatures);
@@ -443,7 +443,7 @@ def ProcessorFeatures {
   // dispatch for vector operations than scalar ones. For the time being,
   // this list also includes scheduling-related features since we do not have
   // enough info to create custom scheduling strategies for future CPUs.
-  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits];
+  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits, FeatureHTM];
   list<SubtargetFeature> P9InheritableFeatures =
     !listconcat(P8InheritableFeatures, P9AdditionalFeatures);
   list<SubtargetFeature> P9Features =
diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt
index 8f8b3a578a1d9..66aed45ff18c6 100644
--- a/llvm/lib/TargetParser/CMakeLists.txt
+++ b/llvm/lib/TargetParser/CMakeLists.txt
@@ -8,6 +8,12 @@ if (HAS_WERROR_GLOBAL_CTORS AND NOT LLVM_HAS_NOGLOBAL_CTOR_MUTEX)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=global-constructors")
 endif()
 
+set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC/PPC.td)
+
+tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget -I${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC)
+add_public_tablegen_target(PPCGenSubtargetInfo)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
 # Solaris code uses kstat, so specify dependency explicitly for shared builds.
 if (${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
   set(system_libs kstat)
@@ -41,3 +47,5 @@ add_llvm_component_library(LLVMTargetParser
   DEPENDS
   target_parser_gen
   )
+
+add_dependencies(LLVMTargetParser PPCGenSubtargetInfo)
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index 422d758c772e1..1b637b27be3de 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -15,6 +15,10 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/TargetParser/Host.h"
 
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETFEATURES_KV
+#include "PPCGenSubtargetInfo.inc"
+
 namespace llvm {
 namespace PPC {
 
@@ -117,5 +121,26 @@ StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName) {
   return getNormalizedPPCTargetCPU(T, CPUName);
 }
 
+std::optional<StringMap<bool>> getPPCDefaultTargetFeatures(const Triple &T,
+                                                           StringRef CPU) {
+  std::optional<StringMap<bool>> FeaturesOpt =
+      getCPUDefaultTargetFeatures(CPU, BasicPPCSubTypeKV, BasicPPCFeatureKV);
+
+  if (!FeaturesOpt.has_value())
+    return std::nullopt;
+
+  StringMap<bool> Features = FeaturesOpt.value();
+  // FIXME: We need to check for the processor model 8548, since the backend
+  // does not support this processor. When this processor model is implemented
+  // within the backend, the following code can be removed.
+  if (CPU == "8548")
+    Features["spe"] = true;
+
+  // The target feature `quadword-atomics` is only supported for 64-bit
+  // POWER8 and above.
+  if (Features.find("quadword-atomics") != Features.end() && !T.isArch64Bit())
+    Features["quadword-atomics"] = false;
+  return Features;
+}
 } // namespace PPC
 } // namespace llvm
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 7c54901dae47d..03f7d3899c2e7 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -18,6 +18,53 @@
 using namespace llvm;
 using namespace AMDGPU;
 
+/// Find KV in array using binary search.
+static const BasicSubtargetSubTypeKV *
+find(StringRef S, ArrayRef<BasicSubtargetSubTypeKV> A) {
+  // Binary search the array
+  auto F = llvm::lower_bound(A, S);
+  // If not found then return NULL
+  if (F == A.end() || StringRef(F->Key) != S)
+    return nullptr;
+  // Return the found array item
+  return F;
+}
+
+/// For each feature that is (transitively) implied by this feature, set it.
+static void setImpliedBits(FeatureBitset &Bits, const FeatureBitset &Implies,
+                           ArrayRef<BasicSubtargetFeatureKV> FeatureTable) {
+  // OR the Implies bits in outside the loop. This allows the Implies for CPUs
+  // which might imply features not in FeatureTable to use this.
+  Bits |= Implies;
+  for (const auto &FE : FeatureTable)
+    if (Implies.test(FE.Value))
+      setImpliedBits(Bits, FE.Implies.getAsBitset(), FeatureTable);
+}
+
+std::optional<llvm::StringMap<bool>> llvm::getCPUDefaultTargetFeatures(
+    StringRef CPU, ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
+    ArrayRef<BasicSubtargetFeatureKV> ProcFeatures) {
+  if (CPU.empty())
+    return std::nullopt;
+
+  const BasicSubtargetSubTypeKV *CPUEntry = ::find(CPU, ProcDesc);
+  if (!CPUEntry)
+    return std::nullopt;
+
+  // Set the features implied by this CPU feature if there is a match.
+  FeatureBitset Bits;
+  llvm::StringMap<bool> DefaultFeatures;
+  setImpliedBits(Bits, CPUEntry->Implies.getAsBitset(), ProcFeatures);
+
+  unsigned BitSize = Bits.size();
+  for (const BasicSubtargetFeatureKV &FE : ProcFeatures) {
+    assert(FE.Value < BitSize && "Target Feature is out of range");
+    if (Bits[FE.Value])
+      DefaultFeatures[FE.Key] = true;
+  }
+  return DefaultFeatures;
+}
+
 namespace {
 
 struct GPUInfo {
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index ca008e256a70f..da41e981888aa 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -89,8 +89,10 @@ class SubtargetEmitter {
 
   FeatureMapTy enumeration(raw_ostream &OS);
   void emitSubtargetInfoMacroCalls(raw_ostream &OS);
-  unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
-  unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
+  unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap,
+                            bool IsEmitBasic = false);
+  unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap,
+                        bool IsEmitBasic = false);
   unsigned cpuNames(raw_ostream &OS);
   void formItineraryStageString(const std::string &Names,
                                 const Record *ItinData, std::string &ItinString,
@@ -254,7 +256,8 @@ void SubtargetEmitter::emitSubtargetInfoMacroCalls(raw_ostream &OS) {
 // command line.
 //
 unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
-                                            const FeatureMapTy &FeatureMap) {
+                                            const FeatureMapTy &FeatureMap,
+                                            bool IsEmitBasic) {
   std::vector<const Record *> FeatureList =
       Records.getAllDerivedDefinitions("SubtargetFeature");
 
@@ -270,7 +273,8 @@ unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
 
   // Begin feature table.
   OS << "// Sorted (by key) array of values for CPU features.\n"
-     << "extern const llvm::SubtargetFeatureKV " << Target
+     << "extern const llvm::" << (IsEmitBasic ? "Basic" : "")
+     << "SubtargetFeatureKV " << (IsEmitBasic ? "Basic" : "") << Target
      << "FeatureKV[] = {\n";
 
   for (const Record *Feature : FeatureList) {
@@ -281,9 +285,11 @@ unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
 
     // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in
     // } }
-    OS << "  { "
-       << "\"" << CommandLineName << "\", "
-       << "\"" << Desc << "\", " << Target << "::" << Name << ", ";
+    OS << "  { " << "\"" << CommandLineName << "\", ";
+    if (!IsEmitBasic)
+      OS << "\"" << Desc << "\", ";
+
+    OS << Target << "::" << Name << ", ";
 
     ConstRecVec ImpliesList = Feature->getValueAsListOfDefs("Implies");
 
@@ -361,7 +367,8 @@ static void checkDuplicateCPUFeatures(StringRef CPUName,
 // line.
 //
 unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
-                                        const FeatureMapTy &FeatureMap) {
+                                        const FeatureMapTy &FeatureMap,
+                                        bool IsEmitBasic) {
   // Gather and sort processor information
   std::vector<const Record *> ProcessorList =
       Records.getAllDerivedDefinitions("Processor");
@@ -374,7 +381,8 @@ unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
 
   // Begin processor table.
   OS << "// Sorted (by key) array of values for CPU subtype.\n"
-     << "extern const llvm::SubtargetSubTypeKV " << Target
+     << "extern const llvm::" << (IsEmitBasic ? "Basic" : "")
+     << "SubtargetSubTypeKV " << (IsEmitBasic ? "Basic" : "") << Target
      << "SubTypeKV[] = {\n";
 
   for (const Record *Processor : ProcessorList) {
@@ -392,13 +400,17 @@ unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
        << "\"" << Name << "\", ";
 
     printFeatureMask(OS, FeatureList, FeatureMap);
-    OS << ", ";
-    printFeatureMask(OS, TuneFeatureList, FeatureMap);
 
-    // Emit the scheduler model pointer.
-    const std::string &ProcModelName =
-        SchedModels.getModelForProc(Processor).ModelName;
-    OS << ", &" << ProcModelName << " },\n";
+    if (!IsEmitBasic) {
+      OS << ", ";
+      printFeatureMask(OS, TuneFeatureList, FeatureMap);
+
+      // Emit the scheduler model pointer.
+      const std::string &ProcModelName =
+          SchedModels.getModelForProc(Processor).ModelName;
+      OS << ", &" << ProcModelName;
+    }
+    OS << " },\n";
   }
 
   // End processor table.
@@ -2040,6 +2052,14 @@ void SubtargetEmitter::run(raw_ostream &OS) {
   OS << "} // end namespace llvm\n\n";
   OS << "#endif // GET_SUBTARGETINFO_ENUM\n\n";
 
+  OS << "\n#ifdef GET_SUBTARGETFEATURES_KV\n";
+  OS << "#undef GET_SUBTARGETFEATURES_KV\n\n";
+  OS << "namespace llvm {\n";
+  featureKeyValues(OS, FeatureMap, true);
+  cpuKeyValues(OS, FeatureMap, true);
+  OS << "} // end namespace llvm\n\n";
+  OS << "#endif // GET_SUBTARGETFEATURES_KV\n\n";
+
   emitSubtargetInfoMacroCalls(OS);
 
   OS << "namespace llvm {\n";

From 06dad352dba16fd9afa89be7abf9bb46f7552b48 Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Thu, 12 Jun 2025 10:39:53 -0700
Subject: [PATCH 278/851] Revert "[lldb][headers] Create Python script to fix
 up framework headers" (#143941)

Reverts llvm/llvm-project#142051
---
 lldb/cmake/modules/LLDBFramework.cmake        |  42 +++---
 lldb/scripts/framework-header-fix.py          | 126 ------------------
 .../Shell/Scripts/Inputs/Main/SBAddress.h     |  13 --
 .../Shell/Scripts/Inputs/RPC/RPCSBAddress.h   |   9 --
 .../Shell/Scripts/TestFrameworkFixScript.test |  11 --
 .../Scripts/TestFrameworkFixUnifdef.test      |  12 --
 .../Scripts/TestRPCFrameworkFixScript.test    |  14 --
 7 files changed, 21 insertions(+), 206 deletions(-)
 delete mode 100755 lldb/scripts/framework-header-fix.py
 delete mode 100644 lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
 delete mode 100644 lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
 delete mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixScript.test
 delete mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
 delete mode 100644 lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test

diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index 70010ffbf738c..8961b1afe93ad 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -68,17 +68,24 @@ if(NOT APPLE_EMBEDDED)
   )
 endif()
 
+# At configuration time, collect headers for the framework bundle and copy them
+# into a staging directory. Later we can copy over the entire folder.
+file(GLOB public_headers ${LLDB_SOURCE_DIR}/include/lldb/API/*.h)
+set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h)
+file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h)
+file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h)
+list(REMOVE_ITEM root_public_headers ${root_private_headers})
+
 find_program(unifdef_EXECUTABLE unifdef)
 
-# All necessary header files will be staged in the include directory in the build directory,
-# so just copy the files from there into the framework's staging directory.
-set(lldb_build_dir_header_staging "${CMAKE_BINARY_DIR}/include/lldb")
-set(lldb_framework_header_staging "${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders")
-file(GLOB lldb_build_dir_header_staging_list ${lldb_build_dir_header_staging}/*)
-foreach(header ${lldb_build_dir_header_staging_list})
+set(lldb_header_staging ${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders)
+foreach(header
+    ${public_headers}
+    ${generated_public_headers}
+    ${root_public_headers})
 
   get_filename_component(basename ${header} NAME)
-  set(staged_header ${lldb_framework_header_staging}/${basename})
+  set(staged_header ${lldb_header_staging}/${basename})
 
   if(unifdef_EXECUTABLE)
     # unifdef returns 0 when the file is unchanged and 1 if something was changed.
@@ -105,20 +112,13 @@ set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "LLDB/Resources
 add_dependencies(liblldb-resource-headers liblldb-header-staging)
 add_dependencies(liblldb liblldb-resource-headers)
 
-# Take the headers from the staging directory and fix up their includes for the framework.
-# Then write them to the output directory.
-# Also, run unifdef to remove any specified guards from the header files.
-file(GLOB lldb_framework_header_staging_list ${lldb_framework_header_staging}/*)
-foreach(header ${lldb_framework_header_staging_list})
-
-  set(input_header ${header})
-  set(output_header $<TARGET_FILE_DIR:liblldb>/Headers/${input_header})
-
-  add_custom_command(TARGET liblldb POST_BUILD
-    COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.py -f lldb_main -i ${input_header} -o ${output_header} -p ${unifdef_EXECUTABLE} USWIG
-    COMMENT "LLDB.framework: Fix up and copy framework headers"
-  )
-endforeach()
+# At build time, copy the staged headers into the framework bundle (and do
+# some post-processing in-place).
+add_custom_command(TARGET liblldb POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy_directory ${lldb_header_staging} $<TARGET_FILE_DIR:liblldb>/Headers
+  COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.sh $<TARGET_FILE_DIR:liblldb>/Headers ${LLDB_VERSION}
+  COMMENT "LLDB.framework: copy framework headers"
+)
 
 # Copy vendor-specific headers from clang (without staging).
 if(NOT APPLE_EMBEDDED)
diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py
deleted file mode 100755
index 9e4e5f860a2c0..0000000000000
--- a/lldb/scripts/framework-header-fix.py
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Usage: <path/to/input-directory> <path/to/output-directory>
-
-This script is used when building LLDB.framework or LLDBRPC.framework. For each framework, local includes are converted to their respective framework includes.
-
-This script is used in 2 ways:
-1. It is used on header files that are copied into LLDB.framework. For these files, local LLDB includes are converted into framework includes, e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>.
-
-2. It is used on header files for LLDBRPC.framework. For these files, includes of RPC common files will be converted to framework includes, e.g. #include <lldb-rpc/common/RPCCommon.h> -> #include <LLDBRPC/RPCCommon.h>. It will also change local includes to framework includes, e.g. #include "SBAddress.h" -> #include <LLDBRPC/SBAddress.h>
-"""
-
-import argparse
-import os
-import re
-import shutil
-import subprocess
-import sys
-
-# Main header regexes
-INCLUDE_FILENAME_REGEX = re.compile(
-    r'#include "lldb/API/(?P<include_filename>.*){0,1}"'
-)
-
-# RPC header regexes
-RPC_COMMON_REGEX = re.compile(r"#include <lldb-rpc/common/(?P<include_filename>.*)>")
-RPC_INCLUDE_FILENAME_REGEX = re.compile(r'#include "(?P<include_filename>.*)"')
-
-
-def modify_rpc_includes(input_file_path, output_file_path):
-    with open(input_file_path, "r") as input_file:
-        lines = input_file.readlines()
-        file_buffer = "".join(lines)
-        with open(output_file_path, "w") as output_file:
-            # Local includes must be changed to RPC framework level includes.
-            # e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
-            # Also, RPC common code includes must change to RPC framework level includes.
-            # e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
-            rpc_common_matches = RPC_COMMON_REGEX.finditer(file_buffer)
-            rpc_include_filename_matches = RPC_INCLUDE_FILENAME_REGEX.finditer(
-                file_buffer
-            )
-            for match in rpc_common_matches:
-                file_buffer = re.sub(
-                    match.group(),
-                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
-                    file_buffer,
-                )
-            for match in rpc_include_filename_matches:
-                file_buffer = re.sub(
-                    match.group(),
-                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
-                    file_buffer,
-                )
-            output_file.write(file_buffer)
-
-
-def modify_main_includes(input_file_path, output_file_path):
-    with open(input_file_path, "r") as input_file:
-        lines = input_file.readlines()
-        file_buffer = "".join(lines)
-        with open(output_file_path, "w") as output_file:
-            # Local includes must be changed to framework level includes.
-            # e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
-            regex_matches = INCLUDE_FILENAME_REGEX.finditer(file_buffer)
-            for match in regex_matches:
-                file_buffer = re.sub(
-                    match.group(),
-                    r"#include <LLDB/" + match.group("include_filename") + ">",
-                    file_buffer,
-                )
-                output_file.write(file_buffer)
-
-
-def remove_guards(output_file_path, unifdef_path, unifdef_guards):
-    # The unifdef path should be passed in from CMake. If it wasn't there in CMake or is incorrect,
-    # find it using shutil. If shutil can't find it, then exit.
-    if not shutil.which(unifdef_path):
-        unifdef_path = shutil.which("unifdef")
-    if not unifdef_path:
-        print(
-            "Unable to find unifdef executable. Guards will not be removed from input files. Exiting..."
-        )
-        sys.exit(1)
-
-    subprocess_command = (
-        [unifdef_path, "-o", output_file_path] + unifdef_guards + [output_file_path]
-    )
-    subprocess.run(subprocess_command)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-f", "--framework", choices=["lldb_main", "lldb_rpc"])
-    parser.add_argument("-i", "--input_file")
-    parser.add_argument("-o", "--output_file")
-    parser.add_argument("-p", "--unifdef_path")
-    parser.add_argument(
-        "unifdef_guards",
-        nargs="+",
-        type=str,
-        help="Guards to be removed with unifdef. These must be specified in the same way as they would be when passed directly into unifdef.",
-    )
-    args = parser.parse_args()
-    input_file_path = str(args.input_file)
-    output_file_path = str(args.output_file)
-    framework_version = args.framework
-    unifdef_path = str(args.unifdef_path)
-    # Prepend dashes to the list of guards passed in from the command line.
-    # unifdef takes the guards to remove as arguments in their own right (e.g. -USWIG)
-    # but passing them in with dashes for this script causes argparse to think that they're
-    # arguments in and of themself, so they need to passed in without dashes.
-    unifdef_guards = ["-" + guard for guard in args.unifdef_guards]
-
-    if framework_version == "lldb_main":
-        modify_main_includes(input_file_path, output_file_path)
-    if framework_version == "lldb_rpc":
-        modify_rpc_includes(input_file_path, output_file_path)
-    # After the incldues have been modified, run unifdef on the headers to remove any guards
-    # specified at the command line.
-    remove_guards(output_file_path, unifdef_path, unifdef_guards)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
deleted file mode 100644
index fecc69687cd74..0000000000000
--- a/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
+++ /dev/null
@@ -1,13 +0,0 @@
-// This is a truncated version of an SB API file
-// used to test framework-header-fix.py to make sure the includes are correctly fixed
-// up for the LLDB.framework.
-
-// Local includes must be changed to framework level includes.
-// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
-#include "lldb/API/SBDefines.h"
-#include "lldb/API/SBModule.h"
-
-// Any include guards specified at the command line must be removed.
-#ifndef SWIG
-int a = 10
-#endif
diff --git a/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
deleted file mode 100644
index 556afa38a9225..0000000000000
--- a/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
+++ /dev/null
@@ -1,9 +0,0 @@
-// This is a truncated version of an SB API file generated by lldb-rpc-gen
-// used to test framework-header-fix.py to make sure the includes are correctly fixed
-// up for the LLDBRPC.framework.
-
-// Local includes must be changed to framework level includes.
-// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
-#include "LLDBRPC.h"
-#include "SBDefines.h"
-#include <lldb-rpc/common/RPCPublic.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
deleted file mode 100644
index e90c3bdfc5adb..0000000000000
--- a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
+++ /dev/null
@@ -1,11 +0,0 @@
-# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
-RUN: mkdir -p %t/Outputs
-RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
-
-# Check the output
-RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
-
-# Local includes must be changed to framework level includes.
-# e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
-CHECK: #include <LLDB/SBDefines.h>
-CHECK: #include <LLDB/SBModule.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
deleted file mode 100644
index a7e82d2f3640c..0000000000000
--- a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
+++ /dev/null
@@ -1,12 +0,0 @@
-# REQUIRES: system-darwin
-# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
-RUN: mkdir -p %t/Outputs
-RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
-
-# Check the output
-RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
-
-# Any include guards specified at the command line must be removed.
-CHECK-NOT: #ifndef SWIG
-CHECK: int a = 10
-CHECK-NOT: #endif
diff --git a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
deleted file mode 100644
index 8ba03a8c2afa8..0000000000000
--- a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
+++ /dev/null
@@ -1,14 +0,0 @@
-# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
-RUN: mkdir -p %t/Outputs
-RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_rpc -i %p/Inputs/Main/RPCSBAddress.h -o %t/Outputs/RPCSBAddress.h -p /usr/bin/unifdef USWIG
-
-# Check the output
-RUN: cat %t/Outputs/RPCSBAddress.h | FileCheck %s
-
-# Local includes must be changed to RPC framework level includes.
-# e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
-# Also, RPC common code includes must change to RPC framework level includes.
-# e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
-CHECK: #include <LLDBRPC/RPCPublic.h>
-CHECK: #include <LLDBRPC/SBDefines.h>
-CHECK: #include <LLDBRPC/LLDBRPC.h>

From 4e765b7a6b93b5d82e90f9a112b3eca4f873f005 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 12 Jun 2025 18:40:26 +0100
Subject: [PATCH 279/851] [x86] dpbusd_i4.ll - regenerate VPTERNLOGD asm
 comment

---
 llvm/test/CodeGen/X86/dpbusd_i4.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/dpbusd_i4.ll b/llvm/test/CodeGen/X86/dpbusd_i4.ll
index a212f99680ef4..9fbac111ee16f 100644
--- a/llvm/test/CodeGen/X86/dpbusd_i4.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_i4.ll
@@ -54,7 +54,7 @@ define i32 @mul_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) {
 ; CHECK-NEXT:    vpsllw $4, %xmm1, %xmm1
 ; CHECK-NEXT:    vpsrlw $4, %xmm1, %xmm1
 ; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; CHECK-NEXT:    vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
+; CHECK-NEXT:    vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & m32bcst)
 ; CHECK-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2

From 5a6a4b6ba6945363bf366a885103a4adca11b5ef Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Thu, 12 Jun 2025 10:45:47 -0700
Subject: [PATCH 280/851] [libc] Implement perror (#143624)

The perror function writes an error message directly to stderr. This
patch adds an implementation, tests, and header generation details.
---
 libc/config/linux/aarch64/entrypoints.txt |  1 +
 libc/config/linux/riscv/entrypoints.txt   |  1 +
 libc/config/linux/x86_64/entrypoints.txt  |  1 +
 libc/include/stdio.yaml                   |  6 ++
 libc/src/stdio/CMakeLists.txt             |  1 +
 libc/src/stdio/generic/CMakeLists.txt     | 15 +++++
 libc/src/stdio/generic/perror.cpp         | 81 +++++++++++++++++++++++
 libc/src/stdio/perror.h                   | 20 ++++++
 libc/test/src/stdio/CMakeLists.txt        | 12 ++++
 libc/test/src/stdio/perror_test.cpp       | 32 +++++++++
 10 files changed, 170 insertions(+)
 create mode 100644 libc/src/stdio/generic/perror.cpp
 create mode 100644 libc/src/stdio/perror.h
 create mode 100644 libc/test/src/stdio/perror_test.cpp

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index fcf1278eae723..9e042cd4a8acb 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -972,6 +972,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.getc_unlocked
     libc.src.stdio.getchar
     libc.src.stdio.getchar_unlocked
+    libc.src.stdio.perror
     libc.src.stdio.putc
     libc.src.stdio.putchar
     libc.src.stdio.puts
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 050fc2672a57e..db8f8a7cf0b74 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -1098,6 +1098,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.getc_unlocked
     libc.src.stdio.getchar
     libc.src.stdio.getchar_unlocked
+    libc.src.stdio.perror
     libc.src.stdio.putc
     libc.src.stdio.putchar
     libc.src.stdio.puts
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 6c9d83708b92f..c993ef8303a59 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -1116,6 +1116,7 @@ if(LLVM_LIBC_FULL_BUILD)
     libc.src.stdio.getc_unlocked
     libc.src.stdio.getchar
     libc.src.stdio.getchar_unlocked
+    libc.src.stdio.perror
     libc.src.stdio.putc
     libc.src.stdio.putchar
     libc.src.stdio.puts
diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml
index 3d5164fa10ffb..2a0c563709984 100644
--- a/libc/include/stdio.yaml
+++ b/libc/include/stdio.yaml
@@ -249,6 +249,12 @@ functions:
       - POSIX
     return_type: int
     arguments: []
+  - name: perror
+    standards:
+      - stdc
+    return_type: void
+    arguments:
+      - type: const char *
   - name: printf
     standards:
       - stdc
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index 63f6ed8a11f1d..b0a6ef1e291b5 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -221,6 +221,7 @@ add_stdio_entrypoint_object(fopen)
 add_stdio_entrypoint_object(fclose)
 add_stdio_entrypoint_object(fread_unlocked)
 add_stdio_entrypoint_object(fread)
+add_stdio_entrypoint_object(perror)
 add_stdio_entrypoint_object(puts)
 add_stdio_entrypoint_object(fputs)
 add_stdio_entrypoint_object(fwrite_unlocked)
diff --git a/libc/src/stdio/generic/CMakeLists.txt b/libc/src/stdio/generic/CMakeLists.txt
index e1f4ed5c19497..6361822b61999 100644
--- a/libc/src/stdio/generic/CMakeLists.txt
+++ b/libc/src/stdio/generic/CMakeLists.txt
@@ -206,6 +206,21 @@ add_generic_entrypoint_object(
     libc.src.__support.File.platform_file
 )
 
+add_generic_entrypoint_object(
+  perror
+  SRCS
+    perror.cpp
+  HDRS
+    ../perror.h
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.__support.StringUtil.error_to_string
+    libc.src.__support.CPP.string_view
+    libc.src.__support.File.file
+    libc.src.__support.File.platform_file
+    libc.src.__support.File.platform_stderr
+)
+
 add_generic_entrypoint_object(
   fputs
   SRCS
diff --git a/libc/src/stdio/generic/perror.cpp b/libc/src/stdio/generic/perror.cpp
new file mode 100644
index 0000000000000..68b4ad644caab
--- /dev/null
+++ b/libc/src/stdio/generic/perror.cpp
@@ -0,0 +1,81 @@
+//===-- Implementation of perror ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/perror.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/File/file.h"
+#include "src/__support/StringUtil/error_to_string.h"
+#include "src/__support/libc_errno.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+static int write_out(cpp::string_view str_view, File *f) {
+  if (str_view.size() > 0) {
+    auto result = f->write_unlocked(str_view.data(), str_view.size());
+    if (result.has_error())
+      return result.error;
+  }
+  return 0;
+}
+
+// separate function so that we can return early on error but still get the
+// unlock. This function sets errno and should not be called elsewhere.
+static void write_sequence(cpp::string_view str_view,
+                           cpp::string_view err_str) {
+  int write_err;
+  // TODO: this seems like there should be some sort of queue system to
+  // deduplicate this code.
+
+  // FORMAT:
+  // if str != nullptr and doesn't start with a null byte:
+  //   "[str]: [strerror(errno)]\n"
+  // else
+  //   "[strerror(errno)]\n"
+  if (str_view.size() > 0) {
+    write_err = write_out(str_view, LIBC_NAMESPACE::stderr);
+    if (write_err != 0) {
+      libc_errno = write_err;
+      return;
+    }
+
+    write_err = write_out(": ", LIBC_NAMESPACE::stderr);
+    if (write_err != 0) {
+      libc_errno = write_err;
+      return;
+    }
+  }
+
+  write_err = write_out(err_str, LIBC_NAMESPACE::stderr);
+  if (write_err != 0) {
+    libc_errno = write_err;
+    return;
+  }
+
+  write_err = write_out("\n", LIBC_NAMESPACE::stderr);
+  if (write_err != 0) {
+    libc_errno = write_err;
+    return;
+  }
+}
+
+LLVM_LIBC_FUNCTION(void, perror, (const char *str)) {
+  const char empty_str[1] = {'\0'};
+  if (str == nullptr)
+    str = empty_str;
+  cpp::string_view str_view(str);
+
+  cpp::string_view err_str = get_error_string(libc_errno);
+
+  // We need to lock the stream to ensure the newline is always appended.
+  LIBC_NAMESPACE::stderr->lock();
+  write_sequence(str_view, err_str);
+  LIBC_NAMESPACE::stderr->unlock();
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/perror.h b/libc/src/stdio/perror.h
new file mode 100644
index 0000000000000..bf8d0af1df5d7
--- /dev/null
+++ b/libc/src/stdio/perror.h
@@ -0,0 +1,20 @@
+//===-- Implementation header of perror -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PERROR_H
+#define LLVM_LIBC_SRC_STDIO_PERROR_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+void perror(const char *s);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_STDIO_PERROR_H
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 01904a30504ed..ce2171f19597b 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -357,6 +357,18 @@ add_libc_test(
     libc.src.stdio.puts
 )
 
+add_libc_test(
+  perror_test
+  HERMETIC_TEST_ONLY # writes to libc's stderr
+  SUITE
+    libc_stdio_unittests
+  SRCS
+    perror_test.cpp
+  DEPENDS
+    libc.src.stdio.perror
+    libc.src.errno.errno
+)
+
 add_libc_test(
   fputs_test
   HERMETIC_TEST_ONLY # writes to libc's stdout and stderr
diff --git a/libc/test/src/stdio/perror_test.cpp b/libc/test/src/stdio/perror_test.cpp
new file mode 100644
index 0000000000000..9a97be2eff210
--- /dev/null
+++ b/libc/test/src/stdio/perror_test.cpp
@@ -0,0 +1,32 @@
+//===-- Unittests for perror ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/perror.h"
+
+#include "src/__support/libc_errno.h"
+#include "test/UnitTest/Test.h"
+
+// The standard says perror prints directly to stderr and returns nothing. This
+// makes it rather difficult to test automatically.
+
+// TODO: figure out redirecting stderr so this test can check correctness.
+TEST(LlvmLibcPerrorTest, PrintOut) {
+  LIBC_NAMESPACE::libc_errno = 0;
+  constexpr char simple[] = "A simple string";
+  LIBC_NAMESPACE::perror(simple);
+
+  // stick to stdc errno values, specifically 0, EDOM, ERANGE, and EILSEQ.
+  LIBC_NAMESPACE::libc_errno = EDOM;
+  LIBC_NAMESPACE::perror("Print this and an error");
+
+  LIBC_NAMESPACE::libc_errno = EILSEQ;
+  LIBC_NAMESPACE::perror("\0 shouldn't print this.");
+
+  LIBC_NAMESPACE::libc_errno = ERANGE;
+  LIBC_NAMESPACE::perror(nullptr);
+}

From f94950db89a905309ec9ea2245889df88ffd0690 Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Thu, 12 Jun 2025 18:04:26 +0000
Subject: [PATCH 281/851] [libc] Changed mbstate struct (#143942)

Changed the mbstate variable from bits processed to bytes processed and
implemented isComplete().

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/src/__support/wchar/character_converter.cpp | 4 +++-
 libc/src/__support/wchar/mbstate.h               | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3cdb8ca83b7f0..f09c7815a6cc4 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -18,7 +18,9 @@ namespace internal {
 
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
-bool CharacterConverter::isComplete() {}
+bool CharacterConverter::isComplete() {
+  return state->bytes_processed == state->total_bytes;
+}
 
 int CharacterConverter::push(char8_t utf8_byte) {}
 
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index cb8950374de41..d33ee354a5443 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -18,7 +18,7 @@ namespace internal {
 
 struct mbstate {
   char32_t partial;
-  uint8_t bits_processed;
+  uint8_t bytes_processed;
   uint8_t total_bytes;
 };
 

From fd88aef21bae75b4641472badeb2abe3757872ac Mon Sep 17 00:00:00 2001
From: Qinkun Bao <qinkun@google.com>
Date: Thu, 12 Jun 2025 14:08:36 -0400
Subject: [PATCH 282/851] [Doc][NFC] Fix Sanitizer Ignore list example errors.
 (#143755)

---
 clang/docs/SanitizerSpecialCaseList.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/docs/SanitizerSpecialCaseList.rst b/clang/docs/SanitizerSpecialCaseList.rst
index 61b6c55d8e6e4..2c50778d0f491 100644
--- a/clang/docs/SanitizerSpecialCaseList.rst
+++ b/clang/docs/SanitizerSpecialCaseList.rst
@@ -109,13 +109,13 @@ precedence. Here are a few examples.
 .. code-block:: bash
 
   $ cat ignorelist1.txt
-  # test.cc will be instrumented.
+  # test.cc will not be instrumented.
   src:*
   src:*/mylib/*=sanitize
   src:*/mylib/test.cc
 
   $ cat ignorelist2.txt
-  # test.cc will not be instrumented.
+  # test.cc will be instrumented.
   src:*
   src:*/mylib/test.cc
   src:*/mylib/*=sanitize

From 639e811434d2c21b9161fe9955acdea28ce33c7b Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Thu, 12 Jun 2025 20:10:05 +0200
Subject: [PATCH 283/851] [CIR][NFC] Fix an unused variable warning (#143933)

This fixes a warning where a variable assigned in 'if' statement wasn't
referenced again.
---
 clang/lib/CIR/CodeGen/CIRGenCall.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index da754e0806b2d..67c6a8dd3ef5a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -171,7 +171,7 @@ void CIRGenFunction::emitDelegateCallArg(CallArgList &args,
 
   QualType type = param->getType();
 
-  if (const auto *rd = type->getAsCXXRecordDecl()) {
+  if (type->getAsCXXRecordDecl()) {
     cgm.errorNYI(param->getSourceRange(),
                  "emitDelegateCallArg: record argument");
     return;

From 4a58a63280a673142fc674db1fb668b7bae00420 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Thu, 12 Jun 2025 19:26:51 +0100
Subject: [PATCH 284/851] [mlir][linalg] Remove the
 `test-linalg-to-vector-patterns` option (#142116)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch removes the `test-linalg-to-vector-patterns` option from the
`-test-linalg-transform-patterns=` test flag. It was only used in one
test, where a more specialized transform dialect op can be used instead:

* `transform.apply_patterns.linalg.pad_vectorization`

While we could preserve `test-linalg-to-vector-patterns`, it's better to
rely on finer-grained transformations — this way, we know exactly what
is being run and tested. Now that its only use has been removed, it
feels natural to delete `test-linalg-to-vector-patterns`.
---
 .../Dialect/Linalg/CPU/test-padtensor.mlir       | 13 ++++++++++++-
 .../lib/Dialect/Linalg/TestLinalgTransforms.cpp  | 16 ----------------
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
index 1361d21e7d949..63db0def1cbc5 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-linalg-to-vector-patterns \
+// RUN: mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule \
 // RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -buffer-deallocation-pipeline -convert-bufferization-to-memref \
 // RUN: -convert-linalg-to-loops -convert-scf-to-cf -expand-strided-metadata \
@@ -34,4 +34,15 @@ func.func @main() {
   return
 }
 
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
+
 func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
index 046b9a65f3359..738648b8ccdcf 100644
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -65,11 +65,6 @@ struct TestLinalgTransforms
       llvm::cl::desc(
           "Test a fused pass that forwards memref.copy to vector.transfer"),
       llvm::cl::init(false)};
-  Option<bool> testGenericToVectorPattern{
-      *this, "test-linalg-to-vector-patterns",
-      llvm::cl::desc("Test a set of patterns that rewrite a linalg contraction "
-                     "in vector.contract form"),
-      llvm::cl::init(false)};
   Option<bool> testDecomposePadTensor{
       *this, "test-decompose-pad-tensor",
       llvm::cl::desc("Test transform pad tensor by copying with generic ops"),
@@ -166,15 +161,6 @@ static void applyVectorTransferForwardingPatterns(func::FuncOp funcOp) {
   (void)applyPatternsGreedily(funcOp, std::move(forwardPattern));
 }
 
-static void applyLinalgToVectorPatterns(func::FuncOp funcOp) {
-  RewritePatternSet patterns(funcOp.getContext());
-  auto *ctx = funcOp.getContext();
-  patterns.add<CopyVectorizationPattern>(ctx);
-  populatePadOpVectorizationPatterns(patterns);
-  populateConvolutionVectorizationPatterns(patterns);
-  (void)applyPatternsGreedily(funcOp, std::move(patterns));
-}
-
 static void applyDecomposePadPatterns(func::FuncOp funcOp) {
   RewritePatternSet patterns(funcOp.getContext());
   patterns.add<DecomposePadOpPattern>(funcOp.getContext());
@@ -254,8 +240,6 @@ void TestLinalgTransforms::runOnOperation() {
     return applyPatterns(getOperation());
   if (testVectorTransferForwardingPatterns)
     return applyVectorTransferForwardingPatterns(getOperation());
-  if (testGenericToVectorPattern)
-    return applyLinalgToVectorPatterns(getOperation());
   if (testDecomposePadTensor)
     return applyDecomposePadPatterns(getOperation());
   if (testDecomposeTensorPackOp)

From 3c1053811e6925e8b9f7a044f3a18bfda1d7ccfe Mon Sep 17 00:00:00 2001
From: David Rivera <davidriverg@gmail.com>
Date: Thu, 12 Jun 2025 14:33:06 -0400
Subject: [PATCH 285/851] Revert "[clang-tidy] Improve integer comparison by
 matching valid expressions outside implicitCastExpr" (#143944)

Reverts llvm/llvm-project#134188
related: https://github.com/llvm/llvm-project/issues/143927
---
 .../UseIntegerSignComparisonCheck.cpp         | 21 ++---
 clang-tools-extra/docs/ReleaseNotes.rst       |  4 -
 .../modernize/use-integer-sign-comparison.cpp | 78 -------------------
 3 files changed, 7 insertions(+), 96 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
index c02c5dfa8756d..eeba5cce80da5 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.cpp
@@ -39,28 +39,21 @@ intCastExpression(bool IsSigned,
   // std::cmp_{} functions trigger a compile-time error if either LHS or RHS
   // is a non-integer type, char, enum or bool
   // (unsigned char/ signed char are Ok and can be used).
-  const auto HasIntegerType = hasType(hasCanonicalType(qualType(
+  auto IntTypeExpr = expr(hasType(hasCanonicalType(qualType(
       isInteger(), IsSigned ? isSignedInteger() : isUnsignedInteger(),
-      unless(isActualChar()), unless(booleanType()), unless(enumType()))));
-
-  const auto IntTypeExpr = expr(HasIntegerType);
+      unless(isActualChar()), unless(booleanType()), unless(enumType())))));
 
   const auto ImplicitCastExpr =
       CastBindName.empty() ? implicitCastExpr(hasSourceExpression(IntTypeExpr))
                            : implicitCastExpr(hasSourceExpression(IntTypeExpr))
                                  .bind(CastBindName);
 
-  const auto ExplicitCastExpr =
-      anyOf(explicitCastExpr(has(ImplicitCastExpr)),
-            ignoringImpCasts(explicitCastExpr(has(ImplicitCastExpr))));
-
-  // Match function calls or variable references not directly wrapped by an
-  // implicit cast
-  const auto CallIntExpr = CastBindName.empty()
-                               ? callExpr(HasIntegerType)
-                               : callExpr(HasIntegerType).bind(CastBindName);
+  const auto CStyleCastExpr = cStyleCastExpr(has(ImplicitCastExpr));
+  const auto StaticCastExpr = cxxStaticCastExpr(has(ImplicitCastExpr));
+  const auto FunctionalCastExpr = cxxFunctionalCastExpr(has(ImplicitCastExpr));
 
-  return expr(anyOf(ImplicitCastExpr, ExplicitCastExpr, CallIntExpr));
+  return expr(anyOf(ImplicitCastExpr, CStyleCastExpr, StaticCastExpr,
+                    FunctionalCastExpr));
 }
 
 static StringRef parseOpCode(BinaryOperator::Opcode Code) {
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 882ee0015df17..19ccd1790e757 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -237,10 +237,6 @@ Changes in existing checks
   <clang-tidy/checks/modernize/use-designated-initializers>` check by avoiding
   diagnosing designated initializers for ``std::array`` initializations.
 
-- Improved :doc:`modernize-use-integer-sign-comparison
-  <clang-tidy/checks/modernize/use-integer-sign-comparison>` check by matching
-  valid integer expressions not directly wrapped around an implicit cast.
-
 - Improved :doc:`modernize-use-ranges
   <clang-tidy/checks/modernize/use-ranges>` check by updating suppress
   warnings logic for ``nullptr`` in ``std::find``.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
index d93a05ac38050..e0a84ef5aed26 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp
@@ -121,81 +121,3 @@ int AllComparisons() {
 
     return 0;
 }
-
-namespace PR127471 {
-    int getSignedValue();
-    unsigned int getUnsignedValue();
-
-    void callExprTest() {
-
-        if (getSignedValue() < getUnsignedValue())
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES:  if (std::cmp_less(getSignedValue() , getUnsignedValue()))
-
-        int sVar = 0;
-        if (getUnsignedValue() > sVar)
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_greater(getUnsignedValue() , sVar))
-
-        unsigned int uVar = 0;
-        if (getSignedValue() > uVar)
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_greater(getSignedValue() , uVar))
-
-    }
-
-    // Add a class with member functions for testing member function calls
-    class TestClass {
-    public:
-        int getSignedValue() { return -5; }
-        unsigned int getUnsignedValue() { return 5; }
-    };
-
-    void memberFunctionTests() {
-        TestClass obj;
-
-        if (obj.getSignedValue() < obj.getUnsignedValue())
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(obj.getSignedValue() , obj.getUnsignedValue()))
-    }
-
-    void castFunctionTests() {
-        // C-style casts with function calls
-        if ((int)getUnsignedValue() < (unsigned int)getSignedValue())
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(getUnsignedValue(),getSignedValue()))
-
-
-        // Static casts with function calls
-        if (static_cast<int>(getUnsignedValue()) < static_cast<unsigned int>(getSignedValue()))
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(getUnsignedValue(),getSignedValue()))
-    }
-
-    // Define tests
-    #define SIGNED_FUNC getSignedValue()
-    #define UNSIGNED_FUNC getUnsignedValue()
-
-    void defineTests() {
-        if (SIGNED_FUNC < UNSIGNED_FUNC)
-            return;
-// CHECK-MESSAGES: :[[@LINE-2]]:13: warning: comparison between 'signed' and 'unsigned' integers [modernize-use-integer-sign-comparison]
-// CHECK-FIXES: if (std::cmp_less(SIGNED_FUNC , UNSIGNED_FUNC))
-    }
-
-    // Template tests (should not warn)
-    template <typename T1>
-    void templateFunctionTest(T1 value) {
-        if (value() < getUnsignedValue())
-            return;
-
-        if (value() < (getSignedValue() || getUnsignedValue()))
-          return;
-    }
-} // namespace PR127471

From edf636afe405ff90da7bf1834aa334bd52bc861e Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Thu, 12 Jun 2025 14:38:54 -0400
Subject: [PATCH 286/851] [PowerPC][NFC] Update lowering STXVP to STXV in Oct
 word spilling (#142220)

Remove explicit register arithmetic from spilling ACC and STXVP code.
---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 93 ++++++++++-----------
 1 file changed, 43 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 45183af0b7984..9dc69e203b0da 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1238,42 +1238,6 @@ static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
 #endif
 }
 
-static void spillRegPairs(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator II, DebugLoc DL,
-                          const TargetInstrInfo &TII, Register SrcReg,
-                          unsigned FrameIndex, bool IsLittleEndian,
-                          bool IsKilled, bool TwoPairs) {
-  unsigned Offset = 0;
-  // The register arithmetic in this function does not support virtual
-  // registers.
-  assert(!SrcReg.isVirtual() &&
-         "Spilling register pairs does not support virtual registers.");
-
-  if (TwoPairs)
-    Offset = IsLittleEndian ? 48 : 0;
-  else
-    Offset = IsLittleEndian ? 16 : 0;
-  Register Reg = (SrcReg > PPC::VSRp15) ? PPC::V0 + (SrcReg - PPC::VSRp16) * 2
-                                        : PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2;
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                        .addReg(Reg, getKillRegState(IsKilled)),
-                    FrameIndex, Offset);
-  Offset += IsLittleEndian ? -16 : 16;
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                        .addReg(Reg + 1, getKillRegState(IsKilled)),
-                    FrameIndex, Offset);
-  if (TwoPairs) {
-    Offset += IsLittleEndian ? -16 : 16;
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                          .addReg(Reg + 2, getKillRegState(IsKilled)),
-                      FrameIndex, Offset);
-    Offset += IsLittleEndian ? -16 : 16;
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                          .addReg(Reg + 3, getKillRegState(IsKilled)),
-                      FrameIndex, Offset);
-  }
-}
-
 /// Remove any STXVP[X] instructions and split them out into a pair of
 /// STXV[X] instructions if --disable-auto-paired-vec-st is specified on
 /// the command line.
@@ -1290,8 +1254,21 @@ void PPCRegisterInfo::lowerOctWordSpilling(MachineBasicBlock::iterator II,
   Register SrcReg = MI.getOperand(0).getReg();
   bool IsLittleEndian = Subtarget.isLittleEndian();
   bool IsKilled = MI.getOperand(0).isKill();
-  spillRegPairs(MBB, II, DL, TII, SrcReg, FrameIndex, IsLittleEndian, IsKilled,
-                /* TwoPairs */ false);
+
+  assert(PPC::VSRpRCRegClass.contains(SrcReg) &&
+          "Expecting STXVP to be utilizing a VSRp register.");
+
+  addFrameReference(
+      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+          .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_vsx0),
+                  getKillRegState(IsKilled)),
+      FrameIndex, IsLittleEndian ? 16 : 0);
+  addFrameReference(
+      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+          .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_vsx1),
+                  getKillRegState(IsKilled)),
+      FrameIndex, IsLittleEndian ? 0 : 16);
+
   // Discard the original instruction.
   MBB.erase(II);
 }
@@ -1325,8 +1302,6 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   bool IsKilled = MI.getOperand(0).isKill();
 
   bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
-  Register Reg =
-      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
   bool IsLittleEndian = Subtarget.isLittleEndian();
 
   emitAccSpillRestoreInfo(MBB, IsPrimed, false);
@@ -1337,16 +1312,34 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   // adjust the offset of the store that is within the 64-byte stack slot.
   if (IsPrimed)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
-  if (DisableAutoPairedVecSt)
-    spillRegPairs(MBB, II, DL, TII, Reg, FrameIndex, IsLittleEndian, IsKilled,
-                  /* TwoPairs */ true);
-  else {
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                          .addReg(Reg, getKillRegState(IsKilled)),
-                      FrameIndex, IsLittleEndian ? 32 : 0);
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                          .addReg(Reg + 1, getKillRegState(IsKilled)),
-                      FrameIndex, IsLittleEndian ? 0 : 32);
+  if (DisableAutoPairedVecSt) {
+    auto spillPair = [&](Register Reg, int Offset) {
+      addFrameReference(
+          BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+              .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx0),
+                      getKillRegState(IsKilled)),
+          FrameIndex, Offset);
+      addFrameReference(
+          BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+              .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx1),
+                      getKillRegState(IsKilled)),
+          FrameIndex, IsLittleEndian ? Offset - 16 : Offset + 16);
+    };
+    spillPair(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
+              IsLittleEndian ? 48 : 0);
+    spillPair(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
+              IsLittleEndian ? 16 : 32);
+  } else {
+    addFrameReference(
+        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
+                    getKillRegState(IsKilled)),
+        FrameIndex, IsLittleEndian ? 32 : 0);
+    addFrameReference(
+        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
+                    getKillRegState(IsKilled)),
+        FrameIndex, IsLittleEndian ? 0 : 32);
   }
   if (IsPrimed && !IsKilled)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);

From 46085d8f83623f6ea2921459de9f731d7df762d4 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Thu, 12 Jun 2025 11:41:16 -0700
Subject: [PATCH 287/851] [lld/ELF][x86-64] Place large executable sections at
 the edges of binary (#70358)

So that when mixing small and large text, large text stays out of the
way of the rest of the binary.

Place large RX sections at the beginning rather than at the end so that
with `--no-rosegment`, the large text and rodata share a single PT_LOAD
segment. Place large RWX sections at the end to keep writable and
readonly sections separate.

Clang started emitting the large section flag for `.ltext` sections in
#73037.
---
 lld/ELF/Writer.cpp                   | 23 +++++++++++-----
 lld/test/ELF/x86-64-section-layout.s | 41 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 10dc688160d1e..3d9888f576f05 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -653,15 +653,17 @@ enum RankFlags {
   RF_NOT_ADDR_SET = 1 << 27,
   RF_NOT_ALLOC = 1 << 26,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
+  RF_LARGE_EXEC_WRITE = 1 << 16,
   RF_LARGE_ALT = 1 << 15,
   RF_WRITE = 1 << 14,
   RF_EXEC_WRITE = 1 << 13,
   RF_EXEC = 1 << 12,
   RF_RODATA = 1 << 11,
-  RF_LARGE = 1 << 10,
-  RF_NOT_RELRO = 1 << 9,
-  RF_NOT_TLS = 1 << 8,
-  RF_BSS = 1 << 7,
+  RF_LARGE_EXEC = 1 << 10,
+  RF_LARGE = 1 << 9,
+  RF_NOT_RELRO = 1 << 8,
+  RF_NOT_TLS = 1 << 7,
+  RF_BSS = 1 << 6,
 };
 
 unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
@@ -691,6 +693,7 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
   // places.
   bool isExec = osec.flags & SHF_EXECINSTR;
   bool isWrite = osec.flags & SHF_WRITE;
+  bool isLarge = osec.flags & SHF_X86_64_LARGE && ctx.arg.emachine == EM_X86_64;
 
   if (!isWrite && !isExec) {
     // Among PROGBITS sections, place .lrodata further from .text.
@@ -698,7 +701,7 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
     // layout has one extra PT_LOAD, but alleviates relocation overflow
     // pressure for absolute relocations referencing small data from -fno-pic
     // relocatable files.
-    if (osec.flags & SHF_X86_64_LARGE && ctx.arg.emachine == EM_X86_64)
+    if (isLarge)
       rank |= ctx.arg.zLrodataAfterBss ? RF_LARGE_ALT : 0;
     else
       rank |= ctx.arg.zLrodataAfterBss ? 0 : RF_LARGE;
@@ -722,7 +725,13 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
     else
       rank |= RF_RODATA;
   } else if (isExec) {
-    rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
+    // Place readonly .ltext before .lrodata and writable .ltext after .lbss to
+    // keep writable and readonly segments separate.
+    if (isLarge) {
+      rank |= isWrite ? RF_LARGE_EXEC_WRITE : RF_LARGE_EXEC;
+    } else {
+      rank |= isWrite ? RF_EXEC_WRITE : RF_EXEC;
+    }
   } else {
     rank |= RF_WRITE;
     // The TLS initialization block needs to be a single contiguous block. Place
@@ -737,7 +746,7 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
     // alleviates relocation overflow pressure.
     // For -z lrodata-after-bss, place .lbss/.lrodata/.ldata after .bss.
     // .bss/.lbss being adjacent reuses the NOBITS size optimization.
-    if (osec.flags & SHF_X86_64_LARGE && ctx.arg.emachine == EM_X86_64) {
+    if (isLarge) {
       rank |= ctx.arg.zLrodataAfterBss
                   ? (osec.type == SHT_NOBITS ? 1 : RF_LARGE_ALT)
                   : RF_LARGE;
diff --git a/lld/test/ELF/x86-64-section-layout.s b/lld/test/ELF/x86-64-section-layout.s
index b03d3e6c2b999..1432271b885a8 100644
--- a/lld/test/ELF/x86-64-section-layout.s
+++ b/lld/test/ELF/x86-64-section-layout.s
@@ -18,6 +18,10 @@
 # RUN: ld.lld --section-start=.note=0x200300 a.o -z lrodata-after-bss -o a3
 # RUN: llvm-readelf -S -l -sX a3 | FileCheck %s --check-prefix=CHECK3
 
+# RUN: llvm-mc -filetype=obj -triple=x86_64 c.s -o c.o
+# RUN: ld.lld c.o -o c
+# RUN: llvm-readelf -S -l c | FileCheck %s --check-prefix=CHECK4
+
 # CHECK:       Name              Type            Address          Off    Size   ES Flg Lk Inf Al
 # CHECK-NEXT:                    NULL            0000000000000000 000000 000000 00      0   0  0
 # CHECK-NEXT:  .note             NOTE            0000000000200300 000300 000001 00   A  0   0  1
@@ -116,6 +120,18 @@
 # CHECK3-NEXT:  0000000000203307     0 NOTYPE  GLOBAL DEFAULT [[#]] (.data)   _edata
 # CHECK3-NEXT:  0000000000207d0d     0 NOTYPE  GLOBAL DEFAULT [[#]] (.ldata2) _end
 
+# CHECK4:      .note      NOTE
+# CHECK4-NEXT: .ltext     PROGBITS
+# CHECK4-NEXT: .lrodata   PROGBITS
+# CHECK4-NEXT: .rodata    PROGBITS
+# CHECK4-NEXT: .text      PROGBITS
+# CHECK4-NEXT: .data      PROGBITS
+# CHECK4-NEXT: .bss       NOBITS
+# CHECK4-NEXT: .ldata     PROGBITS
+# CHECK4-NEXT: .lbss      NOBITS
+# CHECK4-NEXT: .ltext_w   PROGBITS
+# CHECK4-NEXT: .comment   PROGBITS
+
 #--- a.s
 .globl _start, _etext, _edata, _end
 _start:
@@ -155,3 +171,28 @@ SECTIONS {
   .ldata2 : {}
   .lbss : { *(.lbss .lbss.*) }
 }
+
+#--- c.s
+## Test .ltext layout
+.section .ltext,"axl",@progbits
+.globl f
+f:
+  ret
+
+.section .ltext_w,"awxl",@progbits
+.globl g
+g:
+  ret
+
+.section .text,"ax",@progbits
+.globl h
+h:
+  ret
+
+.section .note,"a",@note; .space 1
+.section .rodata,"a",@progbits; .space 1
+.section .data,"aw",@progbits; .space 1
+.section .bss,"aw",@nobits; .space 1
+.section .lrodata,"al"; .space 1
+.section .ldata,"awl"; .space 1
+.section .lbss,"awl",@nobits; .space 1

From df7db441d4e97568a5cbf830b0810512bb702159 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Thu, 12 Jun 2025 14:56:25 -0400
Subject: [PATCH 288/851] =?UTF-8?q?Revert=20"[PowerPC][NFC]=20Update=20low?=
 =?UTF-8?q?ering=20STXVP=20to=20STXV=20in=20Oct=20word=20spil=E2=80=A6=20(?=
 =?UTF-8?q?#143948)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ling (#142220)"

This reverts commit edf636afe405ff90da7bf1834aa334bd52bc861e.
checked in wrong branch.
---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 93 +++++++++++----------
 1 file changed, 50 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9dc69e203b0da..45183af0b7984 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1238,6 +1238,42 @@ static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
 #endif
 }
 
+static void spillRegPairs(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator II, DebugLoc DL,
+                          const TargetInstrInfo &TII, Register SrcReg,
+                          unsigned FrameIndex, bool IsLittleEndian,
+                          bool IsKilled, bool TwoPairs) {
+  unsigned Offset = 0;
+  // The register arithmetic in this function does not support virtual
+  // registers.
+  assert(!SrcReg.isVirtual() &&
+         "Spilling register pairs does not support virtual registers.");
+
+  if (TwoPairs)
+    Offset = IsLittleEndian ? 48 : 0;
+  else
+    Offset = IsLittleEndian ? 16 : 0;
+  Register Reg = (SrcReg > PPC::VSRp15) ? PPC::V0 + (SrcReg - PPC::VSRp16) * 2
+                                        : PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2;
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                        .addReg(Reg, getKillRegState(IsKilled)),
+                    FrameIndex, Offset);
+  Offset += IsLittleEndian ? -16 : 16;
+  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                        .addReg(Reg + 1, getKillRegState(IsKilled)),
+                    FrameIndex, Offset);
+  if (TwoPairs) {
+    Offset += IsLittleEndian ? -16 : 16;
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                          .addReg(Reg + 2, getKillRegState(IsKilled)),
+                      FrameIndex, Offset);
+    Offset += IsLittleEndian ? -16 : 16;
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+                          .addReg(Reg + 3, getKillRegState(IsKilled)),
+                      FrameIndex, Offset);
+  }
+}
+
 /// Remove any STXVP[X] instructions and split them out into a pair of
 /// STXV[X] instructions if --disable-auto-paired-vec-st is specified on
 /// the command line.
@@ -1254,21 +1290,8 @@ void PPCRegisterInfo::lowerOctWordSpilling(MachineBasicBlock::iterator II,
   Register SrcReg = MI.getOperand(0).getReg();
   bool IsLittleEndian = Subtarget.isLittleEndian();
   bool IsKilled = MI.getOperand(0).isKill();
-
-  assert(PPC::VSRpRCRegClass.contains(SrcReg) &&
-          "Expecting STXVP to be utilizing a VSRp register.");
-
-  addFrameReference(
-      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-          .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_vsx0),
-                  getKillRegState(IsKilled)),
-      FrameIndex, IsLittleEndian ? 16 : 0);
-  addFrameReference(
-      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-          .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_vsx1),
-                  getKillRegState(IsKilled)),
-      FrameIndex, IsLittleEndian ? 0 : 16);
-
+  spillRegPairs(MBB, II, DL, TII, SrcReg, FrameIndex, IsLittleEndian, IsKilled,
+                /* TwoPairs */ false);
   // Discard the original instruction.
   MBB.erase(II);
 }
@@ -1302,6 +1325,8 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   bool IsKilled = MI.getOperand(0).isKill();
 
   bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
+  Register Reg =
+      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
   bool IsLittleEndian = Subtarget.isLittleEndian();
 
   emitAccSpillRestoreInfo(MBB, IsPrimed, false);
@@ -1312,34 +1337,16 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   // adjust the offset of the store that is within the 64-byte stack slot.
   if (IsPrimed)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
-  if (DisableAutoPairedVecSt) {
-    auto spillPair = [&](Register Reg, int Offset) {
-      addFrameReference(
-          BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-              .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx0),
-                      getKillRegState(IsKilled)),
-          FrameIndex, Offset);
-      addFrameReference(
-          BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-              .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx1),
-                      getKillRegState(IsKilled)),
-          FrameIndex, IsLittleEndian ? Offset - 16 : Offset + 16);
-    };
-    spillPair(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
-              IsLittleEndian ? 48 : 0);
-    spillPair(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
-              IsLittleEndian ? 16 : 32);
-  } else {
-    addFrameReference(
-        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
-                    getKillRegState(IsKilled)),
-        FrameIndex, IsLittleEndian ? 32 : 0);
-    addFrameReference(
-        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
-                    getKillRegState(IsKilled)),
-        FrameIndex, IsLittleEndian ? 0 : 32);
+  if (DisableAutoPairedVecSt)
+    spillRegPairs(MBB, II, DL, TII, Reg, FrameIndex, IsLittleEndian, IsKilled,
+                  /* TwoPairs */ true);
+  else {
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                          .addReg(Reg, getKillRegState(IsKilled)),
+                      FrameIndex, IsLittleEndian ? 32 : 0);
+    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+                          .addReg(Reg + 1, getKillRegState(IsKilled)),
+                      FrameIndex, IsLittleEndian ? 0 : 32);
   }
   if (IsPrimed && !IsKilled)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);

From c317eda6e3785037f16a746a1096c2cca82d9455 Mon Sep 17 00:00:00 2001
From: Lei Huang <lei@ca.ibm.com>
Date: Thu, 12 Jun 2025 15:47:02 -0400
Subject: [PATCH 289/851] [PowerPC][NFC] Update lowering STXVP to STXV in Oct
 word spilling (#143953)

Simpliy handling for spilling of acc reg with stx by removing explicit
register arithmetic and clean up code gen for register mapping used in
stxvp spilling.

Relanding: https://github.com/llvm/llvm-project/pull/142220
---
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 88 ++++++++++-----------
 llvm/lib/Target/PowerPC/PPCRegisterInfo.h   |  5 ++
 2 files changed, 47 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 45183af0b7984..ea34c1aba82e3 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1238,40 +1238,28 @@ static void emitAccSpillRestoreInfo(MachineBasicBlock &MBB, bool IsPrimed,
 #endif
 }
 
-static void spillRegPairs(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator II, DebugLoc DL,
-                          const TargetInstrInfo &TII, Register SrcReg,
-                          unsigned FrameIndex, bool IsLittleEndian,
-                          bool IsKilled, bool TwoPairs) {
-  unsigned Offset = 0;
-  // The register arithmetic in this function does not support virtual
-  // registers.
-  assert(!SrcReg.isVirtual() &&
+void PPCRegisterInfo::spillRegPair(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator II, DebugLoc DL,
+                                   const TargetInstrInfo &TII,
+                                   unsigned FrameIndex, bool IsLittleEndian,
+                                   bool IsKilled, Register Reg,
+                                   int Offset) const {
+
+  // This function does not support virtual registers.
+  assert(!Reg.isVirtual() &&
          "Spilling register pairs does not support virtual registers.");
 
-  if (TwoPairs)
-    Offset = IsLittleEndian ? 48 : 0;
-  else
-    Offset = IsLittleEndian ? 16 : 0;
-  Register Reg = (SrcReg > PPC::VSRp15) ? PPC::V0 + (SrcReg - PPC::VSRp16) * 2
-                                        : PPC::VSL0 + (SrcReg - PPC::VSRp0) * 2;
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                        .addReg(Reg, getKillRegState(IsKilled)),
-                    FrameIndex, Offset);
-  Offset += IsLittleEndian ? -16 : 16;
-  addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                        .addReg(Reg + 1, getKillRegState(IsKilled)),
-                    FrameIndex, Offset);
-  if (TwoPairs) {
-    Offset += IsLittleEndian ? -16 : 16;
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                          .addReg(Reg + 2, getKillRegState(IsKilled)),
-                      FrameIndex, Offset);
-    Offset += IsLittleEndian ? -16 : 16;
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXV))
-                          .addReg(Reg + 3, getKillRegState(IsKilled)),
-                      FrameIndex, Offset);
-  }
+  addFrameReference(
+      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+          .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx0),
+                  getKillRegState(IsKilled)),
+      FrameIndex, Offset);
+
+  addFrameReference(
+      BuildMI(MBB, II, DL, TII.get(PPC::STXV))
+          .addReg(TargetRegisterInfo::getSubReg(Reg, PPC::sub_vsx1),
+                  getKillRegState(IsKilled)),
+      FrameIndex, IsLittleEndian ? Offset - 16 : Offset + 16);
 }
 
 /// Remove any STXVP[X] instructions and split them out into a pair of
@@ -1290,8 +1278,10 @@ void PPCRegisterInfo::lowerOctWordSpilling(MachineBasicBlock::iterator II,
   Register SrcReg = MI.getOperand(0).getReg();
   bool IsLittleEndian = Subtarget.isLittleEndian();
   bool IsKilled = MI.getOperand(0).isKill();
-  spillRegPairs(MBB, II, DL, TII, SrcReg, FrameIndex, IsLittleEndian, IsKilled,
-                /* TwoPairs */ false);
+
+  spillRegPair(MBB, II, DL, TII, FrameIndex, IsLittleEndian, IsKilled, SrcReg,
+               IsLittleEndian ? 16 : 0);
+
   // Discard the original instruction.
   MBB.erase(II);
 }
@@ -1325,8 +1315,6 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   bool IsKilled = MI.getOperand(0).isKill();
 
   bool IsPrimed = PPC::ACCRCRegClass.contains(SrcReg);
-  Register Reg =
-      PPC::VSRp0 + (SrcReg - (IsPrimed ? PPC::ACC0 : PPC::UACC0)) * 2;
   bool IsLittleEndian = Subtarget.isLittleEndian();
 
   emitAccSpillRestoreInfo(MBB, IsPrimed, false);
@@ -1337,16 +1325,24 @@ void PPCRegisterInfo::lowerACCSpilling(MachineBasicBlock::iterator II,
   // adjust the offset of the store that is within the 64-byte stack slot.
   if (IsPrimed)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMFACC), SrcReg).addReg(SrcReg);
-  if (DisableAutoPairedVecSt)
-    spillRegPairs(MBB, II, DL, TII, Reg, FrameIndex, IsLittleEndian, IsKilled,
-                  /* TwoPairs */ true);
-  else {
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                          .addReg(Reg, getKillRegState(IsKilled)),
-                      FrameIndex, IsLittleEndian ? 32 : 0);
-    addFrameReference(BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
-                          .addReg(Reg + 1, getKillRegState(IsKilled)),
-                      FrameIndex, IsLittleEndian ? 0 : 32);
+  if (DisableAutoPairedVecSt) {
+    spillRegPair(MBB, II, DL, TII, FrameIndex, IsLittleEndian, IsKilled,
+                 TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
+                 IsLittleEndian ? 48 : 0);
+    spillRegPair(MBB, II, DL, TII, FrameIndex, IsLittleEndian, IsKilled,
+                 TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
+                 IsLittleEndian ? 16 : 32);
+  } else {
+    addFrameReference(
+        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair0),
+                    getKillRegState(IsKilled)),
+        FrameIndex, IsLittleEndian ? 32 : 0);
+    addFrameReference(
+        BuildMI(MBB, II, DL, TII.get(PPC::STXVP))
+            .addReg(TargetRegisterInfo::getSubReg(SrcReg, PPC::sub_pair1),
+                    getKillRegState(IsKilled)),
+        FrameIndex, IsLittleEndian ? 0 : 32);
   }
   if (IsPrimed && !IsKilled)
     BuildMI(MBB, II, DL, TII.get(PPC::XXMTACC), SrcReg).addReg(SrcReg);
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 4b66ece534112..849f856b5419e 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -58,6 +58,11 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
   DenseMap<unsigned, unsigned> ImmToIdxMap;
   const PPCTargetMachine &TM;
 
+  void spillRegPair(MachineBasicBlock &MBB, MachineBasicBlock::iterator II,
+                    DebugLoc DL, const TargetInstrInfo &TII,
+                    unsigned FrameIndex, bool IsLittleEndian, bool IsKilled,
+                    Register Reg, int Offset) const;
+
 public:
   PPCRegisterInfo(const PPCTargetMachine &TM);
 

From 030a471753421477c7ef345cc60091788252fabc Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 12 Jun 2025 20:51:58 +0100
Subject: [PATCH 290/851] [AArch64][Clang] Exclude address spaces from
 pointer-only coercion types.

As reported on #135064, the generic pointer coercion code in
CoerceIntOrPtrToIntOrPtr cannot handle address space casts (it tries to bitcast
the pointers). This bails out if an address space qualifier is found on the
pointer.
---
 clang/lib/CodeGen/Targets/AArch64.cpp         |   3 +-
 .../AArch64/struct-coerce-using-ptr.cpp       | 181 +++++++++++++++++-
 2 files changed, 181 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 3efe6ab4ea9c0..b82c46966cf0b 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -507,7 +507,8 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
         if (FDTy->isArrayType())
           FDTy = getContext().getBaseElementType(FDTy);
         return (FDTy->isPointerOrReferenceType() &&
-                getContext().getTypeSize(FDTy) == 64) ||
+                getContext().getTypeSize(FDTy) == 64 &&
+                !FDTy->getPointeeType().hasAddressSpace()) ||
                Self(Self, FDTy);
       });
     };
diff --git a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
index a41f315340b57..b1232921df363 100644
--- a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
+++ b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp
@@ -139,7 +139,7 @@ struct Srp {
 // CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRP:%.*]], align 8
 // CHECK-A64-NEXT:    store [2 x ptr] [[S_COERCE]], ptr [[S]], align 8
 // CHECK-A64-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRP]], ptr [[S]], i32 0, i32 0
-// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X]], align 8
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X]], align 8, !nonnull [[META2:![0-9]+]], !align [[META3:![0-9]+]]
 // CHECK-A64-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CHECK-A64-NEXT:    ret void
 //
@@ -149,7 +149,7 @@ struct Srp {
 // CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRP:%.*]], align 4
 // CHECK-A64_32-NEXT:    store i64 [[S_COERCE]], ptr [[S]], align 4
 // CHECK-A64_32-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRP]], ptr [[S]], i32 0, i32 0
-// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X]], align 4
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X]], align 4, !nonnull [[META2:![0-9]+]], !align [[META3:![0-9]+]]
 // CHECK-A64_32-NEXT:    store i32 1, ptr [[TMP0]], align 4
 // CHECK-A64_32-NEXT:    ret void
 //
@@ -618,3 +618,180 @@ struct SpSempty {
 // CHECK-A64_32-NEXT:    ret void
 //
 void TpSempty(SpSempty s) { *s.x = 1; }
+
+
+struct Spaddrspace {
+    __attribute__((address_space(100))) int *x;
+};
+// CHECK-A64-LABEL: define dso_local void @_Z11Tpaddrspace11Spaddrspace(
+// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SPADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr addrspace(100)
+// CHECK-A64-NEXT:    store ptr addrspace(100) [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[X]], align 8
+// CHECK-A64-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z11Tpaddrspace11Spaddrspace(
+// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SPADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32
+// CHECK-A64_32-NEXT:    store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[X]], align 4
+// CHECK-A64_32-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64_32-NEXT:    ret void
+//
+void Tpaddrspace(Spaddrspace s) { *s.x = 1; }
+// CHECK-A64-LABEL: define dso_local void @_Z11Cpaddrspacev(
+// CHECK-A64-SAME: ) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SPADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SPADDRSPACE]], align 8
+// CHECK-A64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[AGG_TMP]], ptr align 8 [[S]], i64 8, i1 false)
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    [[COERCE_VAL_PI:%.*]] = ptrtoint ptr addrspace(100) [[TMP0]] to i64
+// CHECK-A64-NEXT:    call void @_Z11Tpaddrspace11Spaddrspace(i64 [[COERCE_VAL_PI]])
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z11Cpaddrspacev(
+// CHECK-A64_32-SAME: ) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SPADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SPADDRSPACE]], align 4
+// CHECK-A64_32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_TMP]], ptr align 4 [[S]], i32 4, i1 false)
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SPADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_PI:%.*]] = ptrtoint ptr addrspace(100) [[TMP0]] to i32
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = zext i32 [[COERCE_VAL_PI]] to i64
+// CHECK-A64_32-NEXT:    call void @_Z11Tpaddrspace11Spaddrspace(i64 [[COERCE_VAL_II]])
+// CHECK-A64_32-NEXT:    ret void
+//
+void Cpaddrspace() { Spaddrspace s; Tpaddrspace(s); }
+
+struct Sp2addrspace {
+    __attribute__((address_space(100))) int *x[2];
+};
+// CHECK-A64-LABEL: define dso_local void @_Z12Tp2addrspace12Sp2addrspace(
+// CHECK-A64-SAME: [2 x i64] [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SP2ADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    store [2 x i64] [[S_COERCE]], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr addrspace(100)], ptr [[X]], i64 0, i64 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[ARRAYIDX]], align 8
+// CHECK-A64-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z12Tp2addrspace12Sp2addrspace(
+// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SP2ADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    store i64 [[S_COERCE]], ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x ptr addrspace(100)], ptr [[X]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[ARRAYIDX]], align 4
+// CHECK-A64_32-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64_32-NEXT:    ret void
+//
+void Tp2addrspace(Sp2addrspace s) { *s.x[0] = 1; }
+// CHECK-A64-LABEL: define dso_local void @_Z12Cp2addrspacev(
+// CHECK-A64-SAME: ) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SP2ADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SP2ADDRSPACE]], align 8
+// CHECK-A64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[AGG_TMP]], ptr align 8 [[S]], i64 16, i1 false)
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load [2 x i64], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    call void @_Z12Tp2addrspace12Sp2addrspace([2 x i64] [[TMP0]])
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z12Cp2addrspacev(
+// CHECK-A64_32-SAME: ) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SP2ADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SP2ADDRSPACE]], align 4
+// CHECK-A64_32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_TMP]], ptr align 4 [[S]], i32 8, i1 false)
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SP2ADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load i64, ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    call void @_Z12Tp2addrspace12Sp2addrspace(i64 [[TMP0]])
+// CHECK-A64_32-NEXT:    ret void
+//
+void Cp2addrspace() { Sp2addrspace s; Tp2addrspace(s); }
+
+struct Sraddrspace {
+    __attribute__((address_space(100))) int &x;
+};
+// CHECK-A64-LABEL: define dso_local void @_Z11Traddrspace11Sraddrspace(
+// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr addrspace(100)
+// CHECK-A64-NEXT:    store ptr addrspace(100) [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[X]], align 8, !align [[META3]]
+// CHECK-A64-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z11Traddrspace11Sraddrspace(
+// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32
+// CHECK-A64_32-NEXT:    store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[X]], align 4, !align [[META3]]
+// CHECK-A64_32-NEXT:    store i32 1, ptr addrspace(100) [[TMP0]], align 4
+// CHECK-A64_32-NEXT:    ret void
+//
+void Traddrspace(Sraddrspace s) { s.x = 1; }
+// CHECK-A64-LABEL: define dso_local void @_Z11Craddrspace11Sraddrspace(
+// CHECK-A64-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRADDRSPACE:%.*]], align 8
+// CHECK-A64-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SRADDRSPACE]], align 8
+// CHECK-A64-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[S_COERCE]] to ptr addrspace(100)
+// CHECK-A64-NEXT:    store ptr addrspace(100) [[COERCE_VAL_IP]], ptr [[COERCE_DIVE]], align 8
+// CHECK-A64-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[AGG_TMP]], ptr align 8 [[S]], i64 8, i1 false)
+// CHECK-A64-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[COERCE_DIVE1]], align 8
+// CHECK-A64-NEXT:    [[COERCE_VAL_PI:%.*]] = ptrtoint ptr addrspace(100) [[TMP0]] to i64
+// CHECK-A64-NEXT:    call void @_Z11Traddrspace11Sraddrspace(i64 [[COERCE_VAL_PI]])
+// CHECK-A64-NEXT:    ret void
+//
+// CHECK-A64_32-LABEL: define void @_Z11Craddrspace11Sraddrspace(
+// CHECK-A64_32-SAME: i64 [[S_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64_32-NEXT:  [[ENTRY:.*:]]
+// CHECK-A64_32-NEXT:    [[S:%.*]] = alloca [[STRUCT_SRADDRSPACE:%.*]], align 4
+// CHECK-A64_32-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_SRADDRSPACE]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[S]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II:%.*]] = trunc i64 [[S_COERCE]] to i32
+// CHECK-A64_32-NEXT:    store i32 [[COERCE_VAL_II]], ptr [[COERCE_DIVE]], align 4
+// CHECK-A64_32-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_TMP]], ptr align 4 [[S]], i32 4, i1 false)
+// CHECK-A64_32-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_SRADDRSPACE]], ptr [[AGG_TMP]], i32 0, i32 0
+// CHECK-A64_32-NEXT:    [[TMP0:%.*]] = load ptr addrspace(100), ptr [[COERCE_DIVE1]], align 4
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_PI:%.*]] = ptrtoint ptr addrspace(100) [[TMP0]] to i32
+// CHECK-A64_32-NEXT:    [[COERCE_VAL_II2:%.*]] = zext i32 [[COERCE_VAL_PI]] to i64
+// CHECK-A64_32-NEXT:    call void @_Z11Traddrspace11Sraddrspace(i64 [[COERCE_VAL_II2]])
+// CHECK-A64_32-NEXT:    ret void
+//
+void Craddrspace(Sraddrspace s) { Traddrspace(s); }
+
+//.
+// CHECK-A64: [[META2]] = !{}
+// CHECK-A64: [[META3]] = !{i64 4}
+//.
+// CHECK-A64_32: [[META2]] = !{}
+// CHECK-A64_32: [[META3]] = !{i64 4}
+//.

From 891f6ae783b36122b0f2fadc0c2d95d7dd590415 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 12 Jun 2025 12:50:02 -0700
Subject: [PATCH 291/851] [instcombine] Add test coverage for vp.reverse
 elimination combines

---
 .../test/Transforms/InstCombine/vp-reverse.ll | 101 ++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/vp-reverse.ll

diff --git a/llvm/test/Transforms/InstCombine/vp-reverse.ll b/llvm/test/Transforms/InstCombine/vp-reverse.ll
new file mode 100644
index 0000000000000..79e6c47bdf1b2
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/vp-reverse.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim(
+; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], [[B]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_elim2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim2(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> %m, i32 %evl)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_elim_diffmask(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m1, <vscale x 4 x i1> %m2, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim_diffmask(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M1:%.*]], i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M1]], i32 [[EVL]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> [[M2:%.*]], i32 10)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> %m1, i32 %evl)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> %m1, i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> %m2, i32 10)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_elim_diffevl(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_elim_diffevl(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> splat (i1 true), i32 10)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 10)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim(
+; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> %a.rev, splat (i32 22)
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32 %evl) {
+; CHECK-LABEL: @binop_reverse_splat_elim2(
+; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
+; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %add = add nsw <vscale x 4 x i32> splat (i32 22), %a.rev
+  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %add.rev
+}
+
+define <vscale x 4 x float> @unop_reverse_splat_elim(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 %evl) {
+; CHECK-LABEL: @unop_reverse_splat_elim(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV]]
+; CHECK-NEXT:    [[OP_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[OP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OP_REV]]
+;
+  %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %op = fneg <vscale x 4 x float> %a.rev
+  %op.rev = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> %op, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x float> %op.rev
+}

From cbf27bf711c08c34185f05ca5edbfa61bd3786e2 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 12 Jun 2025 19:53:04 +0000
Subject: [PATCH 292/851] Revert " [PowerPC] frontend get target feature from
 backend with cpu name (#137670)"

This reverts commit 9208b343e962b9f1140ee345c0050a3920bdcbf2.

TargetParser shouldn't re-run the PPC subtarget tablegen target, it
should define its own `-gen-ppc-target-def` rule like all the other
targets do in llvm/include/llvm/TargetParser/CMakeLists.txt .

One user reported that there are incorrect CMake dependencies after this
change, so I will roll this back in the meantime.
---
 clang/lib/Basic/Targets/PPC.cpp               | 148 +++++++++++++++++-
 .../cxx11-thread-local-reference.cpp          |   2 +-
 .../Driver/aix-shared-lib-tls-model-opt.c     |   7 +-
 .../Driver/aix-small-local-exec-dynamic-tls.c |  39 ++---
 clang/test/Driver/ppc-crbits.cpp              |   4 +
 clang/test/Driver/ppc-isa-features.cpp        |  22 +--
 .../llvm/TargetParser/PPCTargetParser.h       |   6 -
 llvm/include/llvm/TargetParser/TargetParser.h |  27 ----
 llvm/lib/Target/PowerPC/PPC.td                |   4 +-
 llvm/lib/TargetParser/CMakeLists.txt          |   8 -
 llvm/lib/TargetParser/PPCTargetParser.cpp     |  25 ---
 llvm/lib/TargetParser/TargetParser.cpp        |  47 ------
 llvm/utils/TableGen/SubtargetEmitter.cpp      |  50 ++----
 13 files changed, 198 insertions(+), 191 deletions(-)

diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp
index 77145e2891a8a..e6ef0ecc526ba 100644
--- a/clang/lib/Basic/Targets/PPC.cpp
+++ b/clang/lib/Basic/Targets/PPC.cpp
@@ -15,7 +15,6 @@
 #include "clang/Basic/MacroBuilder.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/TargetParser/PPCTargetParser.h"
-#include <optional>
 
 using namespace clang;
 using namespace clang::targets;
@@ -517,14 +516,129 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags,
 bool PPCTargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeaturesVec) const {
+  Features["altivec"] = llvm::StringSwitch<bool>(CPU)
+                            .Case("7400", true)
+                            .Case("g4", true)
+                            .Case("7450", true)
+                            .Case("g4+", true)
+                            .Case("970", true)
+                            .Case("g5", true)
+                            .Case("pwr6", true)
+                            .Case("pwr7", true)
+                            .Case("pwr8", true)
+                            .Case("pwr9", true)
+                            .Case("ppc64", true)
+                            .Case("ppc64le", true)
+                            .Default(false);
+
+  Features["power9-vector"] = (CPU == "pwr9");
+  Features["crypto"] = llvm::StringSwitch<bool>(CPU)
+                           .Case("ppc64le", true)
+                           .Case("pwr9", true)
+                           .Case("pwr8", true)
+                           .Default(false);
+  Features["power8-vector"] = llvm::StringSwitch<bool>(CPU)
+                                  .Case("ppc64le", true)
+                                  .Case("pwr9", true)
+                                  .Case("pwr8", true)
+                                  .Default(false);
+  Features["bpermd"] = llvm::StringSwitch<bool>(CPU)
+                           .Case("ppc64le", true)
+                           .Case("pwr9", true)
+                           .Case("pwr8", true)
+                           .Case("pwr7", true)
+                           .Default(false);
+  Features["extdiv"] = llvm::StringSwitch<bool>(CPU)
+                           .Case("ppc64le", true)
+                           .Case("pwr9", true)
+                           .Case("pwr8", true)
+                           .Case("pwr7", true)
+                           .Default(false);
+  Features["direct-move"] = llvm::StringSwitch<bool>(CPU)
+                                .Case("ppc64le", true)
+                                .Case("pwr9", true)
+                                .Case("pwr8", true)
+                                .Default(false);
+  Features["crbits"] = llvm::StringSwitch<bool>(CPU)
+                                .Case("ppc64le", true)
+                                .Case("pwr9", true)
+                                .Case("pwr8", true)
+                                .Default(false);
+  Features["vsx"] = llvm::StringSwitch<bool>(CPU)
+                        .Case("ppc64le", true)
+                        .Case("pwr9", true)
+                        .Case("pwr8", true)
+                        .Case("pwr7", true)
+                        .Default(false);
+  Features["htm"] = llvm::StringSwitch<bool>(CPU)
+                        .Case("ppc64le", true)
+                        .Case("pwr9", true)
+                        .Case("pwr8", true)
+                        .Default(false);
+
+  // ROP Protect is off by default.
+  Features["rop-protect"] = false;
+  // Privileged instructions are off by default.
+  Features["privileged"] = false;
 
-  const llvm::Triple &TheTriple = getTriple();
+  if (getTriple().isOSAIX()) {
+    // The code generated by the -maix-small-local-[exec|dynamic]-tls option is
+    // turned off by default.
+    Features["aix-small-local-exec-tls"] = false;
+    Features["aix-small-local-dynamic-tls"] = false;
+
+    // Turn off TLS model opt by default.
+    Features["aix-shared-lib-tls-model-opt"] = false;
+  }
+
+  Features["spe"] = llvm::StringSwitch<bool>(CPU)
+                        .Case("8548", true)
+                        .Case("e500", true)
+                        .Default(false);
+
+  Features["isa-v206-instructions"] = llvm::StringSwitch<bool>(CPU)
+                                          .Case("ppc64le", true)
+                                          .Case("pwr9", true)
+                                          .Case("pwr8", true)
+                                          .Case("pwr7", true)
+                                          .Case("a2", true)
+                                          .Default(false);
+
+  Features["isa-v207-instructions"] = llvm::StringSwitch<bool>(CPU)
+                                          .Case("ppc64le", true)
+                                          .Case("pwr9", true)
+                                          .Case("pwr8", true)
+                                          .Default(false);
+
+  Features["isa-v30-instructions"] =
+      llvm::StringSwitch<bool>(CPU).Case("pwr9", true).Default(false);
+
+  Features["quadword-atomics"] =
+      getTriple().isArch64Bit() && llvm::StringSwitch<bool>(CPU)
+                                       .Case("pwr9", true)
+                                       .Case("pwr8", true)
+                                       .Default(false);
+
+  // Power10 includes all the same features as Power9 plus any features specific
+  // to the Power10 core.
+  if (CPU == "pwr10" || CPU == "power10") {
+    initFeatureMap(Features, Diags, "pwr9", FeaturesVec);
+    addP10SpecificFeatures(Features);
+  }
+
+  // Power11 includes all the same features as Power10 plus any features
+  // specific to the Power11 core.
+  if (CPU == "pwr11" || CPU == "power11") {
+    initFeatureMap(Features, Diags, "pwr10", FeaturesVec);
+    addP11SpecificFeatures(Features);
+  }
 
-  std::optional<llvm::StringMap<bool>> FeaturesOpt =
-      llvm::PPC::getPPCDefaultTargetFeatures(TheTriple,
-                                             llvm::PPC::normalizeCPUName(CPU));
-  if (FeaturesOpt)
-    Features = FeaturesOpt.value();
+  // Future CPU should include all of the features of Power 11 as well as any
+  // additional features (yet to be determined) specific to it.
+  if (CPU == "future") {
+    initFeatureMap(Features, Diags, "pwr11", FeaturesVec);
+    addFutureSpecificFeatures(Features);
+  }
 
   if (!ppcUserFeaturesCheck(Diags, FeaturesVec))
     return false;
@@ -586,6 +700,26 @@ bool PPCTargetInfo::initFeatureMap(
   return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
 }
 
+// Add any Power10 specific features.
+void PPCTargetInfo::addP10SpecificFeatures(
+    llvm::StringMap<bool> &Features) const {
+  Features["htm"] = false; // HTM was removed for P10.
+  Features["paired-vector-memops"] = true;
+  Features["mma"] = true;
+  Features["power10-vector"] = true;
+  Features["pcrelative-memops"] = true;
+  Features["prefix-instrs"] = true;
+  Features["isa-v31-instructions"] = true;
+}
+
+// Add any Power11 specific features.
+void PPCTargetInfo::addP11SpecificFeatures(
+    llvm::StringMap<bool> &Features) const {}
+
+// Add features specific to the "Future" CPU.
+void PPCTargetInfo::addFutureSpecificFeatures(
+    llvm::StringMap<bool> &Features) const {}
+
 bool PPCTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
       .Case("powerpc", true)
diff --git a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
index a0e76e8a9a0b6..cd5a18f39060e 100644
--- a/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
+++ b/clang/test/CodeGenCXX/cxx11-thread-local-reference.cpp
@@ -35,5 +35,5 @@ int &g() { return r; }
 // DARWIN-LABEL: define internal cxx_fast_tlscc void @__tls_init()
 // CHECK: call void @[[R_INIT]]()
 
-// LINUX_AIX: attributes [[ATTR0]] = { {{.*}} }
+// LINUX_AIX: attributes [[ATTR0]] = { {{.*}}"target-features"{{.*}} }
 // DARWIN: attributes [[ATTR1]] = { {{.*}}nounwind{{.*}}"target-features"{{.*}}  }
diff --git a/clang/test/Driver/aix-shared-lib-tls-model-opt.c b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
index 891caf4ed3fcd..7acf091f0a049 100644
--- a/clang/test/Driver/aix-shared-lib-tls-model-opt.c
+++ b/clang/test/Driver/aix-shared-lib-tls-model-opt.c
@@ -1,5 +1,5 @@
-// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
-// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX %s
+// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
+// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefixes=CHECK-AIX,CHECK-AIX-OFF %s
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 // RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 
@@ -19,8 +19,9 @@ int test(void) {
 
 // CHECK-AIX: test() #0 {
 // CHECK-AIX: attributes #0 = {
+// CHECK-AIX-OFF-SAME: -aix-shared-lib-tls-model-opt
 // CHECK-AIX-ON-SAME: +aix-shared-lib-tls-model-opt
 
-// CHECK-LINUX-NOT: {{[+]aix-shared-lib-tls-model-opt}}
+// CHECK-LINUX-NOT: {{[-+]aix-shared-lib-tls-model-opt}}
 
 // CHECK-UNSUPPORTED-TARGET: option '-maix-shared-lib-tls-model-opt' cannot be specified on this target
diff --git a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
index 6fc2b8efb4aed..1a0619b58e891 100644
--- a/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
+++ b/clang/test/Driver/aix-small-local-exec-dynamic-tls.c
@@ -1,37 +1,37 @@
-// RUN: %clang --target=powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clang --target=powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clang --target=powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clang --target=powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-DEFAULT %s
+// RUN: %clang -target powerpc64-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
+// RUN: %clang -target powerpc-unknown-aix -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-AIX-DEFAULT %s
+// RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-LINUX %s
 
-// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
+// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALEXEC_TLS
 
-// RUN: %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
+// RUN: %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls -S -emit-llvm \
 // RUN:    %s -o - | FileCheck %s --check-prefix=CHECK-AIX_SMALL_LOCALDYNAMIC_TLS
 
-// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-exec-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
-// RUN: not %clang --target=powerpc-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-AIX32 %s
-// RUN: not %clang --target=powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc64le-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only %s 2>&1 | FileCheck --check-prefix=CHECK-UNSUPPORTED-LINUX %s
-// RUN: not %clang --target=powerpc64-unknown-aix -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc64-unknown-aix -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
-// RUN: not %clang --target=powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -maix-small-local-dynamic-tls \
 // RUN:    -fsyntax-only -fno-data-sections %s 2>&1 | \
 // RUN:    FileCheck --check-prefix=CHECK-UNSUPPORTED-NO-DATASEC %s
 
@@ -39,9 +39,10 @@ int test(void) {
   return 0;
 }
 
-// CHECK-DEFAULT: test() #0 {
-// CHECK-DEFAULT: attributes #0 = {
-// CHECK-DEFAULT-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
+// CHECK-AIX-DEFAULT: test() #0 {
+// CHECK-AIX-DEFAULT: attributes #0 = {
+// CHECK-AIX-DEFAULT-SAME: {{-aix-small-local-exec-tls,.*-aix-small-local-dynamic-tls|-aix-small-local-dynamic-tls,.*-aix-small-local-exec-tls}}
+// CHECK-LINUX-NOT: {{[-+]aix-small-local-exec-tls,.*[-+]aix-small-local-dynamic-tls|[-+]aix-small-local-dynamic-tls,.*[-+]aix-small-local-exec-tls}}
 
 // CHECK-UNSUPPORTED-AIX32: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
 // CHECK-UNSUPPORTED-LINUX: option '-maix-small-local-[exec|dynamic]-tls' cannot be specified on this target
diff --git a/clang/test/Driver/ppc-crbits.cpp b/clang/test/Driver/ppc-crbits.cpp
index 62893d3d0e87d..3ed56308cb526 100644
--- a/clang/test/Driver/ppc-crbits.cpp
+++ b/clang/test/Driver/ppc-crbits.cpp
@@ -64,6 +64,8 @@
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
+// RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -emit-llvm \
+// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -mno-crbits \
@@ -90,6 +92,8 @@
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr8 -mno-crbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 
+// RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -emit-llvm \
+// RUN:   -S %s -o - | FileCheck %s --check-prefix=HAS-NOCRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mcrbits \
 // RUN:   -emit-llvm -S %s -o - | FileCheck %s --check-prefix=HAS-CRBITS
 // RUN: %clang -target powerpc-ibm-aix -mcpu=pwr7 -mno-crbits \
diff --git a/clang/test/Driver/ppc-isa-features.cpp b/clang/test/Driver/ppc-isa-features.cpp
index 35dbfbcdf5699..92c5bc82f72b8 100644
--- a/clang/test/Driver/ppc-isa-features.cpp
+++ b/clang/test/Driver/ppc-isa-features.cpp
@@ -5,20 +5,20 @@
 // RUN: %clang -target powerpc64-unknown-aix -mcpu=pwr9 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR9
 // RUN: %clang -target powerpc-unknown-aix -mcpu=pwr10 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-PWR10
 
-// CHECK-PWR6-NOT: isa-v206-instructions
-// CHECK-PWR6-NOT: isa-v207-instructions
-// CHECK-PWR6-NOT: isa-v30-instructions
+// CHECK-PWR6: -isa-v206-instructions
+// CHECK-PWR6: -isa-v207-instructions
+// CHECK-PWR6: -isa-v30-instructions
 
-// CHECK-A2:     +isa-v206-instructions
-// CHECK-A2-NOT: isa-v207-instructions
-// CHECK-A2-NOT: isa-v30-instructions
+// CHECK-A2: +isa-v206-instructions
+// CHECK-A2: -isa-v207-instructions
+// CHECK-A2: -isa-v30-instructions
 
-// CHECK-PWR7:     +isa-v206-instructions
-// CHECK-PWR7-NOT: isa-v207-instructions
-// CHECK-PWR7-NOT: isa-v30-instructions
+// CHECK-PWR7: +isa-v206-instructions
+// CHECK-PWR7: -isa-v207-instructions
+// CHECK-PWR7: -isa-v30-instructions
 
-// CHECK-PWR8:     +isa-v207-instructions
-// CHECK-PWR8-NOT: isa-v30-instructions
+// CHECK-PWR8: +isa-v207-instructions
+// CHECK-PWR8: -isa-v30-instructions
 
 // CHECK-PWR9: +isa-v207-instructions
 // CHECK-PWR9: +isa-v30-instructions
diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h
index d3d44afb5f544..59d9f867005a4 100644
--- a/llvm/include/llvm/TargetParser/PPCTargetParser.h
+++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h
@@ -14,8 +14,6 @@
 #ifndef LLVM_TARGETPARSER_PPCTARGETPARSER_H
 #define LLVM_TARGETPARSER_PPCTARGETPARSER_H
 
-#include "TargetParser.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
@@ -39,10 +37,6 @@ LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T,
 // For PPC, there are some cpu names for same CPU, like pwr10 and power10,
 // normalize them.
 LLVM_ABI StringRef normalizeCPUName(StringRef CPUName);
-
-LLVM_ABI std::optional<llvm::StringMap<bool>>
-getPPCDefaultTargetFeatures(const Triple &T, StringRef CPUName);
-
 } // namespace PPC
 } // namespace llvm
 
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h
index b4a92cc6b6c4b..176205e17ae00 100644
--- a/llvm/include/llvm/TargetParser/TargetParser.h
+++ b/llvm/include/llvm/TargetParser/TargetParser.h
@@ -14,8 +14,6 @@
 #ifndef LLVM_TARGETPARSER_TARGETPARSER_H
 #define LLVM_TARGETPARSER_TARGETPARSER_H
 
-#include "SubtargetFeature.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Compiler.h"
@@ -192,31 +190,6 @@ insertWaveSizeFeature(StringRef GPU, const Triple &T,
                       StringMap<bool> &Features);
 
 } // namespace AMDGPU
-
-struct BasicSubtargetFeatureKV {
-  const char *Key;         ///< K-V key string
-  unsigned Value;          ///< K-V integer value
-  FeatureBitArray Implies; ///< K-V bit mask
-};
-
-/// Used to provide key value pairs for feature and CPU bit flags.
-struct BasicSubtargetSubTypeKV {
-  const char *Key;         ///< K-V key string
-  FeatureBitArray Implies; ///< K-V bit mask
-
-  /// Compare routine for std::lower_bound
-  bool operator<(StringRef S) const { return StringRef(Key) < S; }
-
-  /// Compare routine for std::is_sorted.
-  bool operator<(const BasicSubtargetSubTypeKV &Other) const {
-    return StringRef(Key) < StringRef(Other.Key);
-  }
-};
-
-std::optional<llvm::StringMap<bool>>
-getCPUDefaultTargetFeatures(StringRef CPU,
-                            ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
-                            ArrayRef<BasicSubtargetFeatureKV> ProcFeatures);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index ea7c2203662bd..fd850faf7b2fb 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -411,6 +411,7 @@ def ProcessorFeatures {
      FeatureP8Altivec,
      FeatureP8Vector,
      FeatureP8Crypto,
+     FeatureHTM,
      FeatureDirectMove,
      FeatureICBT,
      FeaturePartwordAtomic,
@@ -421,7 +422,6 @@ def ProcessorFeatures {
     ];
 
   list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
-                                               FeatureHTM,
                                                FeatureAddisLoadFusion];
   list<SubtargetFeature> P8InheritableFeatures =
     !listconcat(P7InheritableFeatures, P8AdditionalFeatures);
@@ -443,7 +443,7 @@ def ProcessorFeatures {
   // dispatch for vector operations than scalar ones. For the time being,
   // this list also includes scheduling-related features since we do not have
   // enough info to create custom scheduling strategies for future CPUs.
-  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits, FeatureHTM];
+  list<SubtargetFeature> P9SpecificFeatures = [FeatureVectorsUseTwoUnits];
   list<SubtargetFeature> P9InheritableFeatures =
     !listconcat(P8InheritableFeatures, P9AdditionalFeatures);
   list<SubtargetFeature> P9Features =
diff --git a/llvm/lib/TargetParser/CMakeLists.txt b/llvm/lib/TargetParser/CMakeLists.txt
index 66aed45ff18c6..8f8b3a578a1d9 100644
--- a/llvm/lib/TargetParser/CMakeLists.txt
+++ b/llvm/lib/TargetParser/CMakeLists.txt
@@ -8,12 +8,6 @@ if (HAS_WERROR_GLOBAL_CTORS AND NOT LLVM_HAS_NOGLOBAL_CTOR_MUTEX)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=global-constructors")
 endif()
 
-set(LLVM_TARGET_DEFINITIONS ${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC/PPC.td)
-
-tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget -I${LLVM_MAIN_SRC_DIR}/lib/Target/PowerPC)
-add_public_tablegen_target(PPCGenSubtargetInfo)
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
 # Solaris code uses kstat, so specify dependency explicitly for shared builds.
 if (${CMAKE_SYSTEM_NAME} MATCHES "SunOS")
   set(system_libs kstat)
@@ -47,5 +41,3 @@ add_llvm_component_library(LLVMTargetParser
   DEPENDS
   target_parser_gen
   )
-
-add_dependencies(LLVMTargetParser PPCGenSubtargetInfo)
diff --git a/llvm/lib/TargetParser/PPCTargetParser.cpp b/llvm/lib/TargetParser/PPCTargetParser.cpp
index 1b637b27be3de..422d758c772e1 100644
--- a/llvm/lib/TargetParser/PPCTargetParser.cpp
+++ b/llvm/lib/TargetParser/PPCTargetParser.cpp
@@ -15,10 +15,6 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/TargetParser/Host.h"
 
-#define GET_SUBTARGETINFO_ENUM
-#define GET_SUBTARGETFEATURES_KV
-#include "PPCGenSubtargetInfo.inc"
-
 namespace llvm {
 namespace PPC {
 
@@ -121,26 +117,5 @@ StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName) {
   return getNormalizedPPCTargetCPU(T, CPUName);
 }
 
-std::optional<StringMap<bool>> getPPCDefaultTargetFeatures(const Triple &T,
-                                                           StringRef CPU) {
-  std::optional<StringMap<bool>> FeaturesOpt =
-      getCPUDefaultTargetFeatures(CPU, BasicPPCSubTypeKV, BasicPPCFeatureKV);
-
-  if (!FeaturesOpt.has_value())
-    return std::nullopt;
-
-  StringMap<bool> Features = FeaturesOpt.value();
-  // FIXME: We need to check for the processor model 8548, since the backend
-  // does not support this processor. When this processor model is implemented
-  // within the backend, the following code can be removed.
-  if (CPU == "8548")
-    Features["spe"] = true;
-
-  // The target feature `quadword-atomics` is only supported for 64-bit
-  // POWER8 and above.
-  if (Features.find("quadword-atomics") != Features.end() && !T.isArch64Bit())
-    Features["quadword-atomics"] = false;
-  return Features;
-}
 } // namespace PPC
 } // namespace llvm
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 03f7d3899c2e7..7c54901dae47d 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -18,53 +18,6 @@
 using namespace llvm;
 using namespace AMDGPU;
 
-/// Find KV in array using binary search.
-static const BasicSubtargetSubTypeKV *
-find(StringRef S, ArrayRef<BasicSubtargetSubTypeKV> A) {
-  // Binary search the array
-  auto F = llvm::lower_bound(A, S);
-  // If not found then return NULL
-  if (F == A.end() || StringRef(F->Key) != S)
-    return nullptr;
-  // Return the found array item
-  return F;
-}
-
-/// For each feature that is (transitively) implied by this feature, set it.
-static void setImpliedBits(FeatureBitset &Bits, const FeatureBitset &Implies,
-                           ArrayRef<BasicSubtargetFeatureKV> FeatureTable) {
-  // OR the Implies bits in outside the loop. This allows the Implies for CPUs
-  // which might imply features not in FeatureTable to use this.
-  Bits |= Implies;
-  for (const auto &FE : FeatureTable)
-    if (Implies.test(FE.Value))
-      setImpliedBits(Bits, FE.Implies.getAsBitset(), FeatureTable);
-}
-
-std::optional<llvm::StringMap<bool>> llvm::getCPUDefaultTargetFeatures(
-    StringRef CPU, ArrayRef<BasicSubtargetSubTypeKV> ProcDesc,
-    ArrayRef<BasicSubtargetFeatureKV> ProcFeatures) {
-  if (CPU.empty())
-    return std::nullopt;
-
-  const BasicSubtargetSubTypeKV *CPUEntry = ::find(CPU, ProcDesc);
-  if (!CPUEntry)
-    return std::nullopt;
-
-  // Set the features implied by this CPU feature if there is a match.
-  FeatureBitset Bits;
-  llvm::StringMap<bool> DefaultFeatures;
-  setImpliedBits(Bits, CPUEntry->Implies.getAsBitset(), ProcFeatures);
-
-  unsigned BitSize = Bits.size();
-  for (const BasicSubtargetFeatureKV &FE : ProcFeatures) {
-    assert(FE.Value < BitSize && "Target Feature is out of range");
-    if (Bits[FE.Value])
-      DefaultFeatures[FE.Key] = true;
-  }
-  return DefaultFeatures;
-}
-
 namespace {
 
 struct GPUInfo {
diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp
index da41e981888aa..ca008e256a70f 100644
--- a/llvm/utils/TableGen/SubtargetEmitter.cpp
+++ b/llvm/utils/TableGen/SubtargetEmitter.cpp
@@ -89,10 +89,8 @@ class SubtargetEmitter {
 
   FeatureMapTy enumeration(raw_ostream &OS);
   void emitSubtargetInfoMacroCalls(raw_ostream &OS);
-  unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap,
-                            bool IsEmitBasic = false);
-  unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap,
-                        bool IsEmitBasic = false);
+  unsigned featureKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
+  unsigned cpuKeyValues(raw_ostream &OS, const FeatureMapTy &FeatureMap);
   unsigned cpuNames(raw_ostream &OS);
   void formItineraryStageString(const std::string &Names,
                                 const Record *ItinData, std::string &ItinString,
@@ -256,8 +254,7 @@ void SubtargetEmitter::emitSubtargetInfoMacroCalls(raw_ostream &OS) {
 // command line.
 //
 unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
-                                            const FeatureMapTy &FeatureMap,
-                                            bool IsEmitBasic) {
+                                            const FeatureMapTy &FeatureMap) {
   std::vector<const Record *> FeatureList =
       Records.getAllDerivedDefinitions("SubtargetFeature");
 
@@ -273,8 +270,7 @@ unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
 
   // Begin feature table.
   OS << "// Sorted (by key) array of values for CPU features.\n"
-     << "extern const llvm::" << (IsEmitBasic ? "Basic" : "")
-     << "SubtargetFeatureKV " << (IsEmitBasic ? "Basic" : "") << Target
+     << "extern const llvm::SubtargetFeatureKV " << Target
      << "FeatureKV[] = {\n";
 
   for (const Record *Feature : FeatureList) {
@@ -285,11 +281,9 @@ unsigned SubtargetEmitter::featureKeyValues(raw_ostream &OS,
 
     // Emit as { "feature", "description", { featureEnum }, { i1 , i2 , ... , in
     // } }
-    OS << "  { " << "\"" << CommandLineName << "\", ";
-    if (!IsEmitBasic)
-      OS << "\"" << Desc << "\", ";
-
-    OS << Target << "::" << Name << ", ";
+    OS << "  { "
+       << "\"" << CommandLineName << "\", "
+       << "\"" << Desc << "\", " << Target << "::" << Name << ", ";
 
     ConstRecVec ImpliesList = Feature->getValueAsListOfDefs("Implies");
 
@@ -367,8 +361,7 @@ static void checkDuplicateCPUFeatures(StringRef CPUName,
 // line.
 //
 unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
-                                        const FeatureMapTy &FeatureMap,
-                                        bool IsEmitBasic) {
+                                        const FeatureMapTy &FeatureMap) {
   // Gather and sort processor information
   std::vector<const Record *> ProcessorList =
       Records.getAllDerivedDefinitions("Processor");
@@ -381,8 +374,7 @@ unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
 
   // Begin processor table.
   OS << "// Sorted (by key) array of values for CPU subtype.\n"
-     << "extern const llvm::" << (IsEmitBasic ? "Basic" : "")
-     << "SubtargetSubTypeKV " << (IsEmitBasic ? "Basic" : "") << Target
+     << "extern const llvm::SubtargetSubTypeKV " << Target
      << "SubTypeKV[] = {\n";
 
   for (const Record *Processor : ProcessorList) {
@@ -400,17 +392,13 @@ unsigned SubtargetEmitter::cpuKeyValues(raw_ostream &OS,
        << "\"" << Name << "\", ";
 
     printFeatureMask(OS, FeatureList, FeatureMap);
+    OS << ", ";
+    printFeatureMask(OS, TuneFeatureList, FeatureMap);
 
-    if (!IsEmitBasic) {
-      OS << ", ";
-      printFeatureMask(OS, TuneFeatureList, FeatureMap);
-
-      // Emit the scheduler model pointer.
-      const std::string &ProcModelName =
-          SchedModels.getModelForProc(Processor).ModelName;
-      OS << ", &" << ProcModelName;
-    }
-    OS << " },\n";
+    // Emit the scheduler model pointer.
+    const std::string &ProcModelName =
+        SchedModels.getModelForProc(Processor).ModelName;
+    OS << ", &" << ProcModelName << " },\n";
   }
 
   // End processor table.
@@ -2052,14 +2040,6 @@ void SubtargetEmitter::run(raw_ostream &OS) {
   OS << "} // end namespace llvm\n\n";
   OS << "#endif // GET_SUBTARGETINFO_ENUM\n\n";
 
-  OS << "\n#ifdef GET_SUBTARGETFEATURES_KV\n";
-  OS << "#undef GET_SUBTARGETFEATURES_KV\n\n";
-  OS << "namespace llvm {\n";
-  featureKeyValues(OS, FeatureMap, true);
-  cpuKeyValues(OS, FeatureMap, true);
-  OS << "} // end namespace llvm\n\n";
-  OS << "#endif // GET_SUBTARGETFEATURES_KV\n\n";
-
   emitSubtargetInfoMacroCalls(OS);
 
   OS << "namespace llvm {\n";

From c19e900ce8b422f6b8c028fbbd9ef7e9d3720236 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234@gmail.com>
Date: Thu, 12 Jun 2025 16:02:51 -0400
Subject: [PATCH 293/851] [AArch64] Signed comparison using CMN is safe when
 the subtraction is nsw (#141993)

nsw means no signed wrap, and 0 - INT_MIN is a signed wrap.

Now, this is going to be a point I need to get out of the way:

So is it okay to always transform a > -b into cmn if it is a signed
comparison, even if b is INT_MIN because -INT_MIN is undefined, at least
in C, because unless fwrapv is specified, opt puts nsw on signed integer
operations, allowing for more folds anyway.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 19 ++++++--
 llvm/test/CodeGen/AArch64/cmp-to-cmn.ll       | 46 +++++++++++++++++++
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ac545534d728b..5b9e699eaa408 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3392,8 +3392,19 @@ bool isLegalCmpImmed(APInt C) {
   return isLegalArithImmed(C.abs().getZExtValue());
 }
 
-static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
-  KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
+static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) {
+  // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
+  if (Op->getFlags().hasNoSignedWrap())
+    return true;
+
+  // We can still figure out if the second operand is safe to use
+  // in a CMN instruction by checking if it is known to be not the minimum
+  // signed value. If it is not, then we can safely use CMN.
+  // Note: We can eventually remove this check and simply rely on
+  // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
+  // consistently sets them appropriately when making said nodes.
+
+  KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
   return !KnownSrc.getSignedMinValue().isMinSignedValue();
 }
 
@@ -3402,7 +3413,7 @@ static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
 // can be set differently by this operation. It comes down to whether
 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
 // everything is fine. If not then the optimization is wrong. Thus general
-// comparisons are only valid if op2 != 0.
+// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
 //
 // So, finally, the only LLVM-native comparisons that don't mention C or V
 // are the ones that aren't unsigned comparisons. They're the only ones we can
@@ -3411,7 +3422,7 @@ static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
   return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
          (isIntEqualitySetCC(CC) ||
           (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
-          (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
+          (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
 }
 
 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
diff --git a/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll b/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll
index c5fd9b63cce97..5765e0acae269 100644
--- a/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll
+++ b/llvm/test/CodeGen/AArch64/cmp-to-cmn.ll
@@ -602,3 +602,49 @@ define i1 @almost_immediate_neg_ugt_64(i64 %x) {
   %cmp = icmp ugt i64 %x, -16773121
   ret i1 %cmp
 }
+
+define i1 @cmn_nsw(i32 %a, i32 %b) {
+; CHECK-LABEL: cmn_nsw:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn w0, w1
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %sub = sub nsw i32 0, %b
+  %cmp = icmp sgt i32 %a, %sub
+  ret i1 %cmp
+}
+
+define i1 @cmn_nsw_64(i64 %a, i64 %b) {
+; CHECK-LABEL: cmn_nsw_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmn x0, x1
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %sub = sub nsw i64 0, %b
+  %cmp = icmp sgt i64 %a, %sub
+  ret i1 %cmp
+}
+
+define i1 @cmn_nsw_neg(i32 %a, i32 %b) {
+; CHECK-LABEL: cmn_nsw_neg:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg w8, w1
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %sub = sub i32 0, %b
+  %cmp = icmp sgt i32 %a, %sub
+  ret i1 %cmp
+}
+
+define i1 @cmn_nsw_neg_64(i64 %a, i64 %b) {
+; CHECK-LABEL: cmn_nsw_neg_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg x8, x1
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    cset w0, gt
+; CHECK-NEXT:    ret
+  %sub = sub i64 0, %b
+  %cmp = icmp sgt i64 %a, %sub
+  ret i1 %cmp
+}

From b1f5e26b78a9550a22ee2f24bb3f220d396c452f Mon Sep 17 00:00:00 2001
From: GeorgeHuyubo <113479859+GeorgeHuyubo@users.noreply.github.com>
Date: Thu, 12 Jun 2025 13:23:26 -0700
Subject: [PATCH 294/851] [lldb] Properly handle locate module callback when
 Target change arch (#143793)

Since this PR: https://github.com/llvm/llvm-project/pull/141670/ We
started to override the Platform/Arch for a target if needed. However we
may have already registered locate module callback with the old
platform.

This PR will move the locate module callback to the new Platform
whenever Target changes architecture.

Co-authored-by: George Hu <georgehuyubo@gmail.com>
---
 lldb/source/Target/Target.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 9660fc97970b0..45a9e1196a049 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -1706,6 +1706,8 @@ bool Target::SetArchitecture(const ArchSpec &arch_spec, bool set_platform,
         if (PlatformSP arch_platform_sp =
                 GetDebugger().GetPlatformList().GetOrCreate(other, {},
                                                             &platform_arch)) {
+          arch_platform_sp->SetLocateModuleCallback(
+              platform_sp->GetLocateModuleCallback());
           SetPlatform(arch_platform_sp);
           if (platform_arch.IsValid())
             other = platform_arch;

From d65904675ea106713937c9cce24e3d1ec0bc570a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 12 Jun 2025 21:35:56 +0100
Subject: [PATCH 295/851] [LV] Move logic to create trip count check to helper
 (NFC).

Move the logic to create the iteration count check to a separate helper,
so it can be re-used by when creating the skeleton for epilogue
vectorization as well.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8177b76ad5bdf..404ee6874d2a5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -530,6 +530,9 @@ class InnerLoopVectorizer {
   /// Returns (and creates if needed) the trip count of the widened loop.
   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
 
+  // Create a check to see if the vector loop should be executed
+  Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
+
   /// Emit a bypass check to see if the vector trip count is zero, including if
   /// it overflows.
   void emitIterationCountCheck(BasicBlock *Bypass);
@@ -2370,13 +2373,8 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
   }
 }
 
-void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
-  Value *Count = getTripCount();
-  // Reuse existing vector loop preheader for TC checks.
-  // Note that new preheader block is generated for vector loop.
-  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
-  IRBuilder<> Builder(TCCheckBlock->getTerminator());
-
+Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
+                                                      unsigned UF) const {
   // Generate code to check if the loop's trip count is less than VF * UF, or
   // equal to it in case a scalar epilogue is required; this implies that the
   // vector trip count is zero. This check also covers the case where adding one
@@ -2385,7 +2383,13 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
                                                        : ICmpInst::ICMP_ULT;
 
+  // Reuse existing vector loop preheader for TC checks.
+  // Note that new preheader block is generated for vector loop.
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  IRBuilder<> Builder(TCCheckBlock->getTerminator());
+
   // If tail is to be folded, vector loop takes care of all iterations.
+  Value *Count = getTripCount();
   Type *CountTy = Count->getType();
   Value *CheckMinIters = Builder.getFalse();
   auto CreateStep = [&]() -> Value * {
@@ -2434,7 +2438,12 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
     // Don't execute the vector loop if (UMax - n) < (VF * UF).
     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
   }
+  return CheckMinIters;
+}
 
+void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+  Value *CheckMinIters = createIterationCountCheck(VF, UF);
   // Create new preheader for vector loop.
   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                    static_cast<DominatorTree *>(nullptr), LI,

From 8ee9646b06cd128a6c55f375e4df431aee053c76 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 12 Jun 2025 13:46:06 -0700
Subject: [PATCH 296/851] [LV] Simplify creation of vp.load/vp.store/vp.reduce
 intrinsics (#143804)

The use of VectorBuilder here was simply obscuring what was actually
going on. For vp.load and vp.store, the resulting code is significantly
more idiomatic. For the vp.reduce cases, we remove several layers of
indirection, including passing parameters via implicit state on the
builder. In both cases, the code is significantly easier to follow.
---
 llvm/include/llvm/IR/VectorBuilder.h          | 120 --------
 .../include/llvm/Transforms/Utils/LoopUtils.h |  11 +-
 llvm/lib/IR/CMakeLists.txt                    |   1 -
 llvm/lib/IR/VectorBuilder.cpp                 | 116 --------
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |  31 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  23 +-
 llvm/unittests/IR/CMakeLists.txt              |   1 -
 llvm/unittests/IR/VectorBuilderTest.cpp       | 279 ------------------
 8 files changed, 31 insertions(+), 551 deletions(-)
 delete mode 100644 llvm/include/llvm/IR/VectorBuilder.h
 delete mode 100644 llvm/lib/IR/VectorBuilder.cpp
 delete mode 100644 llvm/unittests/IR/VectorBuilderTest.cpp

diff --git a/llvm/include/llvm/IR/VectorBuilder.h b/llvm/include/llvm/IR/VectorBuilder.h
deleted file mode 100644
index bc23842d8e6bd..0000000000000
--- a/llvm/include/llvm/IR/VectorBuilder.h
+++ /dev/null
@@ -1,120 +0,0 @@
-//===- llvm/VectorBuilder.h - Builder for VP Intrinsics ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the VectorBuilder class, which is used as a convenient way
-// to create VP intrinsics as if they were LLVM instructions with a consistent
-// and simplified interface.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_VECTORBUILDER_H
-#define LLVM_IR_VECTORBUILDER_H
-
-#include "llvm/Support/Compiler.h"
-#include <llvm/IR/IRBuilder.h>
-#include <llvm/IR/InstrTypes.h>
-#include <llvm/IR/Instruction.h>
-#include <llvm/IR/Value.h>
-
-namespace llvm {
-
-class VectorBuilder {
-public:
-  enum class Behavior {
-    // Abort if the requested VP intrinsic could not be created.
-    // This is useful for strict consistency.
-    ReportAndAbort = 0,
-
-    // Return a default-initialized value if the requested VP intrinsic could
-    // not be created.
-    // This is useful for a defensive fallback to non-VP code.
-    SilentlyReturnNone = 1,
-  };
-
-private:
-  IRBuilderBase &Builder;
-  Behavior ErrorHandling;
-
-  // Explicit mask parameter.
-  Value *Mask;
-  // Explicit vector length parameter.
-  Value *ExplicitVectorLength;
-  // Compile-time vector length.
-  ElementCount StaticVectorLength;
-
-  // Get mask/evl value handles for the current configuration.
-  Value &requestMask();
-  Value &requestEVL();
-
-  LLVM_ABI void handleError(const char *ErrorMsg) const;
-  template <typename RetType>
-  RetType returnWithError(const char *ErrorMsg) const {
-    handleError(ErrorMsg);
-    return RetType();
-  }
-
-  /// Helper function for creating VP intrinsic call.
-  Value *createVectorInstructionImpl(Intrinsic::ID VPID, Type *ReturnTy,
-                                     ArrayRef<Value *> VecOpArray,
-                                     const Twine &Name = Twine());
-
-public:
-  VectorBuilder(IRBuilderBase &Builder,
-                Behavior ErrorHandling = Behavior::ReportAndAbort)
-      : Builder(Builder), ErrorHandling(ErrorHandling), Mask(nullptr),
-        ExplicitVectorLength(nullptr),
-        StaticVectorLength(ElementCount::getFixed(0)) {}
-
-  LLVM_ABI Module &getModule() const;
-  LLVMContext &getContext() const { return Builder.getContext(); }
-
-  // All-true mask for the currently configured explicit vector length.
-  LLVM_ABI Value *getAllTrueMask();
-
-  VectorBuilder &setMask(Value *NewMask) {
-    Mask = NewMask;
-    return *this;
-  }
-  VectorBuilder &setEVL(Value *NewExplicitVectorLength) {
-    ExplicitVectorLength = NewExplicitVectorLength;
-    return *this;
-  }
-  VectorBuilder &setStaticVL(unsigned NewFixedVL) {
-    StaticVectorLength = ElementCount::getFixed(NewFixedVL);
-    return *this;
-  }
-
-  /// Get the flags to be applied to created floating point ops.
-  const FastMathFlags &getFastMathFlags() const {
-    return Builder.getFastMathFlags();
-  }
-
-  // TODO: setStaticVL(ElementCount) for scalable types.
-
-  // Emit a VP intrinsic call that mimics a regular instruction.
-  // This operation behaves according to the VectorBuilderBehavior.
-  // \p Opcode      The functional instruction opcode of the emitted intrinsic.
-  // \p ReturnTy    The return type of the operation.
-  // \p VecOpArray  The operand list.
-  LLVM_ABI Value *createVectorInstruction(unsigned Opcode, Type *ReturnTy,
-                                          ArrayRef<Value *> VecOpArray,
-                                          const Twine &Name = Twine());
-
-  /// Emit a VP reduction intrinsic call for recurrence kind.
-  /// \param RdxID       The intrinsic ID of llvm.vector.reduce.*
-  /// \param ValTy       The type of operand which the reduction operation is
-  ///                    performed.
-  /// \param VecOpArray  The operand list.
-  LLVM_ABI Value *createSimpleReduction(Intrinsic::ID RdxID, Type *ValTy,
-                                        ArrayRef<Value *> VecOpArray,
-                                        const Twine &Name = Twine());
-};
-
-} // namespace llvm
-
-#endif // LLVM_IR_VECTORBUILDER_H
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 6c0e06482a6de..12be3bad04d38 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -16,7 +16,6 @@
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/VectorBuilder.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
@@ -423,8 +422,9 @@ LLVM_ABI Value *createSimpleReduction(IRBuilderBase &B, Value *Src,
                                       RecurKind RdxKind);
 /// Overloaded function to generate vector-predication intrinsics for
 /// reduction.
-LLVM_ABI Value *createSimpleReduction(VectorBuilder &VB, Value *Src,
-                                      RecurKind RdxKind);
+LLVM_ABI Value *createSimpleReduction(IRBuilderBase &B, Value *Src,
+                                      RecurKind RdxKind, Value *Mask,
+                                      Value *EVL);
 
 /// Create a reduction of the given vector \p Src for a reduction of kind
 /// RecurKind::AnyOf. The start value of the reduction is \p InitVal.
@@ -442,8 +442,9 @@ LLVM_ABI Value *createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind,
                                        Value *Src, Value *Start);
 /// Overloaded function to generate vector-predication intrinsics for ordered
 /// reduction.
-LLVM_ABI Value *createOrderedReduction(VectorBuilder &VB, RecurKind RdxKind,
-                                       Value *Src, Value *Start);
+LLVM_ABI Value *createOrderedReduction(IRBuilderBase &B, RecurKind RdxKind,
+                                       Value *Src, Value *Start, Value *Mask,
+                                       Value *EVL);
 
 /// Get the intersection (logical and) of all of the potential IR flags
 /// of each scalar operation (VL) that will be converted into a vector (I).
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index eb00829fd8c70..10572ff708bd3 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -74,7 +74,6 @@ add_llvm_component_library(LLVMCore
   User.cpp
   Value.cpp
   ValueSymbolTable.cpp
-  VectorBuilder.cpp
   VectorTypeUtils.cpp
   Verifier.cpp
   VFABIDemangler.cpp
diff --git a/llvm/lib/IR/VectorBuilder.cpp b/llvm/lib/IR/VectorBuilder.cpp
deleted file mode 100644
index 737f49b1334d7..0000000000000
--- a/llvm/lib/IR/VectorBuilder.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-//===- VectorBuilder.cpp - Builder for VP Intrinsics ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the VectorBuilder class, which is used as a convenient
-// way to create VP intrinsics as if they were LLVM instructions with a
-// consistent and simplified interface.
-//
-//===----------------------------------------------------------------------===//
-
-#include <llvm/ADT/SmallVector.h>
-#include <llvm/IR/FPEnv.h>
-#include <llvm/IR/Instructions.h>
-#include <llvm/IR/IntrinsicInst.h>
-#include <llvm/IR/Intrinsics.h>
-#include <llvm/IR/VectorBuilder.h>
-
-namespace llvm {
-
-void VectorBuilder::handleError(const char *ErrorMsg) const {
-  if (ErrorHandling == Behavior::SilentlyReturnNone)
-    return;
-  report_fatal_error(ErrorMsg);
-}
-
-Module &VectorBuilder::getModule() const {
-  return *Builder.GetInsertBlock()->getModule();
-}
-
-Value *VectorBuilder::getAllTrueMask() {
-  return Builder.getAllOnesMask(StaticVectorLength);
-}
-
-Value &VectorBuilder::requestMask() {
-  if (Mask)
-    return *Mask;
-
-  return *getAllTrueMask();
-}
-
-Value &VectorBuilder::requestEVL() {
-  if (ExplicitVectorLength)
-    return *ExplicitVectorLength;
-
-  assert(!StaticVectorLength.isScalable() && "TODO vscale lowering");
-  auto *IntTy = Builder.getInt32Ty();
-  return *ConstantInt::get(IntTy, StaticVectorLength.getFixedValue());
-}
-
-Value *VectorBuilder::createVectorInstruction(unsigned Opcode, Type *ReturnTy,
-                                              ArrayRef<Value *> InstOpArray,
-                                              const Twine &Name) {
-  auto VPID = VPIntrinsic::getForOpcode(Opcode);
-  if (VPID == Intrinsic::not_intrinsic)
-    return returnWithError<Value *>("No VPIntrinsic for this opcode");
-  return createVectorInstructionImpl(VPID, ReturnTy, InstOpArray, Name);
-}
-
-Value *VectorBuilder::createSimpleReduction(Intrinsic::ID RdxID,
-                                            Type *ValTy,
-                                            ArrayRef<Value *> InstOpArray,
-                                            const Twine &Name) {
-  auto VPID = VPIntrinsic::getForIntrinsic(RdxID);
-  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
-         "No VPIntrinsic for this reduction");
-  return createVectorInstructionImpl(VPID, ValTy, InstOpArray, Name);
-}
-
-Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID,
-                                                  Type *ReturnTy,
-                                                  ArrayRef<Value *> InstOpArray,
-                                                  const Twine &Name) {
-  auto MaskPosOpt = VPIntrinsic::getMaskParamPos(VPID);
-  auto VLenPosOpt = VPIntrinsic::getVectorLengthParamPos(VPID);
-  size_t NumInstParams = InstOpArray.size();
-  size_t NumVPParams =
-      NumInstParams + MaskPosOpt.has_value() + VLenPosOpt.has_value();
-
-  SmallVector<Value *, 6> IntrinParams;
-
-  // Whether the mask and vlen parameter are at the end of the parameter list.
-  bool TrailingMaskAndVLen =
-      std::min<size_t>(MaskPosOpt.value_or(NumInstParams),
-                       VLenPosOpt.value_or(NumInstParams)) >= NumInstParams;
-
-  if (TrailingMaskAndVLen) {
-    // Fast path for trailing mask, vector length.
-    IntrinParams.append(InstOpArray.begin(), InstOpArray.end());
-    IntrinParams.resize(NumVPParams);
-  } else {
-    IntrinParams.resize(NumVPParams);
-    // Insert mask and evl operands in between the instruction operands.
-    for (size_t VPParamIdx = 0, ParamIdx = 0; VPParamIdx < NumVPParams;
-         ++VPParamIdx) {
-      if (MaskPosOpt == VPParamIdx || VLenPosOpt == VPParamIdx)
-        continue;
-      assert(ParamIdx < NumInstParams);
-      IntrinParams[VPParamIdx] = InstOpArray[ParamIdx++];
-    }
-  }
-
-  if (MaskPosOpt)
-    IntrinParams[*MaskPosOpt] = &requestMask();
-  if (VLenPosOpt)
-    IntrinParams[*VLenPosOpt] = &requestEVL();
-
-  auto *VPDecl = VPIntrinsic::getOrInsertDeclarationForParams(
-      &getModule(), VPID, ReturnTy, IntrinParams);
-  return Builder.CreateCall(VPDecl, IntrinParams, Name);
-}
-
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index ff69fa9f70c4e..cf6b183c78ac3 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1319,18 +1319,19 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
   }
 }
 
-Value *llvm::createSimpleReduction(VectorBuilder &VBuilder, Value *Src,
-                                   RecurKind Kind) {
+Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
+                                   RecurKind Kind, Value *Mask, Value *EVL) {
   assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
          !RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind) &&
          "AnyOf or FindLastIV reductions are not supported.");
   Intrinsic::ID Id = getReductionIntrinsicID(Kind);
-  auto *SrcTy = cast<VectorType>(Src->getType());
-  Type *SrcEltTy = SrcTy->getElementType();
-  Value *Iden =
-      getRecurrenceIdentity(Kind, SrcEltTy, VBuilder.getFastMathFlags());
-  Value *Ops[] = {Iden, Src};
-  return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+  auto VPID = VPIntrinsic::getForIntrinsic(Id);
+  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
+         "No VPIntrinsic for this reduction");
+  auto *EltTy = cast<VectorType>(Src->getType())->getElementType();
+  Value *Iden = getRecurrenceIdentity(Kind, EltTy, Builder.getFastMathFlags());
+  Value *Ops[] = {Iden, Src, Mask, EVL};
+  return Builder.CreateIntrinsic(EltTy, VPID, Ops);
 }
 
 Value *llvm::createOrderedReduction(IRBuilderBase &B, RecurKind Kind,
@@ -1343,17 +1344,21 @@ Value *llvm::createOrderedReduction(IRBuilderBase &B, RecurKind Kind,
   return B.CreateFAddReduce(Start, Src);
 }
 
-Value *llvm::createOrderedReduction(VectorBuilder &VBuilder, RecurKind Kind,
-                                    Value *Src, Value *Start) {
+Value *llvm::createOrderedReduction(IRBuilderBase &Builder, RecurKind Kind,
+                                    Value *Src, Value *Start, Value *Mask,
+                                    Value *EVL) {
   assert((Kind == RecurKind::FAdd || Kind == RecurKind::FMulAdd) &&
          "Unexpected reduction kind");
   assert(Src->getType()->isVectorTy() && "Expected a vector type");
   assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
 
   Intrinsic::ID Id = getReductionIntrinsicID(RecurKind::FAdd);
-  auto *SrcTy = cast<VectorType>(Src->getType());
-  Value *Ops[] = {Start, Src};
-  return VBuilder.createSimpleReduction(Id, SrcTy, Ops);
+  auto VPID = VPIntrinsic::getForIntrinsic(Id);
+  assert(VPReductionIntrinsic::isVPReduction(VPID) &&
+         "No VPIntrinsic for this reduction");
+  auto *EltTy = cast<VectorType>(Src->getType())->getElementType();
+  Value *Ops[] = {Start, Src, Mask, EVL};
+  return Builder.CreateIntrinsic(EltTy, VPID, Ops);
 }
 
 void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa6b13c217bd1..74472aaeb1675 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -30,7 +30,6 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
-#include "llvm/IR/VectorBuilder.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -2524,21 +2523,17 @@ void VPReductionEVLRecipe::execute(VPTransformState &State) {
   Value *VecOp = State.get(getVecOp());
   Value *EVL = State.get(getEVL(), VPLane(0));
 
-  VectorBuilder VBuilder(Builder);
-  VBuilder.setEVL(EVL);
   Value *Mask;
-  // TODO: move the all-true mask generation into VectorBuilder.
   if (VPValue *CondOp = getCondOp())
     Mask = State.get(CondOp);
   else
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  VBuilder.setMask(Mask);
 
   Value *NewRed;
   if (isOrdered()) {
-    NewRed = createOrderedReduction(VBuilder, Kind, VecOp, Prev);
+    NewRed = createOrderedReduction(Builder, Kind, VecOp, Prev, Mask, EVL);
   } else {
-    NewRed = createSimpleReduction(VBuilder, VecOp, Kind);
+    NewRed = createSimpleReduction(Builder, VecOp, Kind, Mask, EVL);
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
       NewRed = createMinMaxOp(Builder, Kind, NewRed, Prev);
     else
@@ -3086,10 +3081,8 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                 nullptr, "wide.masked.gather");
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Load, DataTy, Addr, "vp.op.load"));
+    NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_load,
+                                    {Addr, Mask, EVL}, nullptr, "vp.op.load");
   }
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
@@ -3204,11 +3197,9 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
                                     Intrinsic::vp_scatter,
                                     {StoredVal, Addr, Mask, EVL});
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
+    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
+                                    Intrinsic::vp_store,
+                                    {StoredVal, Addr, Mask, EVL});
   }
   NewSI->addParamAttr(
       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt
index bea6b1b46f573..b66eae93f9339 100644
--- a/llvm/unittests/IR/CMakeLists.txt
+++ b/llvm/unittests/IR/CMakeLists.txt
@@ -50,7 +50,6 @@ add_llvm_unittest(IRTests
   ValueHandleTest.cpp
   ValueMapTest.cpp
   ValueTest.cpp
-  VectorBuilderTest.cpp
   VectorTypeUtilsTest.cpp
   VectorTypesTest.cpp
   VerifierTest.cpp
diff --git a/llvm/unittests/IR/VectorBuilderTest.cpp b/llvm/unittests/IR/VectorBuilderTest.cpp
deleted file mode 100644
index e01378a2755f0..0000000000000
--- a/llvm/unittests/IR/VectorBuilderTest.cpp
+++ /dev/null
@@ -1,279 +0,0 @@
-//===--------- VectorBuilderTest.cpp - VectorBuilder unit tests -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/VectorBuilder.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-
-namespace {
-
-static unsigned VectorNumElements = 8;
-
-class VectorBuilderTest : public testing::Test {
-protected:
-  LLVMContext Context;
-
-  VectorBuilderTest() : Context() {}
-
-  std::unique_ptr<Module> createBuilderModule(Function *&Func, BasicBlock *&BB,
-                                              Value *&Mask, Value *&EVL) {
-    auto Mod = std::make_unique<Module>("TestModule", Context);
-    auto *Int32Ty = Type::getInt32Ty(Context);
-    auto *Mask8Ty =
-        FixedVectorType::get(Type::getInt1Ty(Context), VectorNumElements);
-    auto *VoidFuncTy =
-        FunctionType::get(Type::getVoidTy(Context), {Mask8Ty, Int32Ty}, false);
-    Func =
-        Function::Create(VoidFuncTy, GlobalValue::ExternalLinkage, "bla", *Mod);
-    Mask = Func->getArg(0);
-    EVL = Func->getArg(1);
-    BB = BasicBlock::Create(Context, "entry", Func);
-
-    return Mod;
-  }
-};
-
-/// Check that creating binary arithmetic VP intrinsics works.
-TEST_F(VectorBuilderTest, TestCreateBinaryInstructions) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setMask(Mask).setEVL(EVL);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-  auto *IntVecTy =
-      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
-
-#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
-  {                                                                            \
-    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
-    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
-    auto *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
-    Value *Op = PoisonValue::get(ValueTy);                                      \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
-                                             {Op, Op});                        \
-    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
-    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
-    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
-    ASSERT_EQ(VPIntrin->getMaskParam(), Mask);                                 \
-    ASSERT_EQ(VPIntrin->getVectorLengthParam(), EVL);                          \
-  }
-#include "llvm/IR/Instruction.def"
-}
-
-static bool isAllTrueMask(Value *Val, unsigned NumElements) {
-  auto *ConstMask = dyn_cast<Constant>(Val);
-  if (!ConstMask)
-    return false;
-
-  // Structure check.
-  if (!ConstMask->isAllOnesValue())
-    return false;
-
-  // Type check.
-  auto *MaskVecTy = cast<FixedVectorType>(ConstMask->getType());
-  if (MaskVecTy->getNumElements() != NumElements)
-    return false;
-
-  return MaskVecTy->getElementType()->isIntegerTy(1);
-}
-
-/// Check that creating binary arithmetic VP intrinsics works.
-TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoMask) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setEVL(EVL).setStaticVL(VectorNumElements);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-  auto *IntVecTy =
-      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
-
-#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
-  {                                                                            \
-    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
-    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
-    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
-    Value *Op = PoisonValue::get(ValueTy);                                      \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
-                                             {Op, Op});                        \
-    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
-    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
-    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
-    ASSERT_TRUE(isAllTrueMask(VPIntrin->getMaskParam(), VectorNumElements));   \
-    ASSERT_EQ(VPIntrin->getVectorLengthParam(), EVL);                          \
-  }
-#include "llvm/IR/Instruction.def"
-}
-
-static bool isLegalConstEVL(Value *Val, unsigned ExpectedEVL) {
-  auto *ConstEVL = dyn_cast<ConstantInt>(Val);
-  if (!ConstEVL)
-    return false;
-
-  // Value check.
-  if (ConstEVL->getZExtValue() != ExpectedEVL)
-    return false;
-
-  // Type check.
-  return ConstEVL->getType()->isIntegerTy(32);
-}
-
-/// Check that creating binary arithmetic VP intrinsics works.
-TEST_F(VectorBuilderTest, TestCreateBinaryInstructions_FixedVector_NoEVL) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setMask(Mask).setStaticVL(VectorNumElements);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-  auto *IntVecTy =
-      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
-
-#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
-  {                                                                            \
-    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
-    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
-    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
-    Value *Op = PoisonValue::get(ValueTy);                                      \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
-                                             {Op, Op});                        \
-    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
-    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
-    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
-    ASSERT_EQ(VPIntrin->getMaskParam(), Mask);                                 \
-    ASSERT_TRUE(                                                               \
-        isLegalConstEVL(VPIntrin->getVectorLengthParam(), VectorNumElements)); \
-  }
-#include "llvm/IR/Instruction.def"
-}
-
-/// Check that creating binary arithmetic VP intrinsics works.
-TEST_F(VectorBuilderTest,
-       TestCreateBinaryInstructions_FixedVector_NoMask_NoEVL) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setStaticVL(VectorNumElements);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-  auto *IntVecTy =
-      FixedVectorType::get(Type::getInt32Ty(Context), VectorNumElements);
-
-#define HANDLE_BINARY_INST(NUM, OPCODE, INSTCLASS)                             \
-  {                                                                            \
-    auto VPID = VPIntrinsic::getForOpcode(Instruction::OPCODE);                \
-    bool IsFP = (#INSTCLASS)[0] == 'F';                                        \
-    Type *ValueTy = IsFP ? FloatVecTy : IntVecTy;                              \
-    Value *Op = PoisonValue::get(ValueTy);                                      \
-    auto *I = VBuild.createVectorInstruction(Instruction::OPCODE, ValueTy,     \
-                                             {Op, Op});                        \
-    ASSERT_TRUE(isa<VPIntrinsic>(I));                                          \
-    auto *VPIntrin = cast<VPIntrinsic>(I);                                     \
-    ASSERT_EQ(VPIntrin->getIntrinsicID(), VPID);                               \
-    ASSERT_TRUE(isAllTrueMask(VPIntrin->getMaskParam(), VectorNumElements));   \
-    ASSERT_TRUE(                                                               \
-        isLegalConstEVL(VPIntrin->getVectorLengthParam(), VectorNumElements)); \
-  }
-#include "llvm/IR/Instruction.def"
-}
-/// Check that creating vp.load/vp.store works.
-TEST_F(VectorBuilderTest, TestCreateLoadStore) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  VectorBuilder VBuild(Builder);
-  VBuild.setMask(Mask).setEVL(EVL);
-
-  auto *FloatVecTy =
-      FixedVectorType::get(Type::getFloatTy(Context), VectorNumElements);
-
-  Value *FloatVecPtr = PoisonValue::get(Builder.getPtrTy(0));
-  Value *FloatVec = PoisonValue::get(FloatVecTy);
-
-  // vp.load
-  auto LoadVPID = VPIntrinsic::getForOpcode(Instruction::Load);
-  auto *LoadIntrin = VBuild.createVectorInstruction(Instruction::Load,
-                                                    FloatVecTy, {FloatVecPtr});
-  ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
-  auto *VPLoad = cast<VPIntrinsic>(LoadIntrin);
-  ASSERT_EQ(VPLoad->getIntrinsicID(), LoadVPID);
-  ASSERT_EQ(VPLoad->getMemoryPointerParam(), FloatVecPtr);
-
-  // vp.store
-  auto *VoidTy = Builder.getVoidTy();
-  auto StoreVPID = VPIntrinsic::getForOpcode(Instruction::Store);
-  auto *StoreIntrin = VBuild.createVectorInstruction(Instruction::Store, VoidTy,
-                                                     {FloatVec, FloatVecPtr});
-  ASSERT_TRUE(isa<VPIntrinsic>(LoadIntrin));
-  auto *VPStore = cast<VPIntrinsic>(StoreIntrin);
-  ASSERT_EQ(VPStore->getIntrinsicID(), StoreVPID);
-  ASSERT_EQ(VPStore->getMemoryPointerParam(), FloatVecPtr);
-  ASSERT_EQ(VPStore->getMemoryDataParam(), FloatVec);
-}
-
-/// Check that the SilentlyReturnNone error handling mode works.
-TEST_F(VectorBuilderTest, TestFail_SilentlyReturnNone) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  auto *VoidTy = Builder.getVoidTy();
-  VectorBuilder VBuild(Builder, VectorBuilder::Behavior::SilentlyReturnNone);
-  VBuild.setMask(Mask).setEVL(EVL);
-  auto *Val = VBuild.createVectorInstruction(Instruction::Br, VoidTy, {});
-  ASSERT_EQ(Val, nullptr);
-}
-
-/// Check that the ReportAndFail error handling mode aborts as advertised.
-TEST_F(VectorBuilderTest, TestFail_ReportAndAbort) {
-  Function *F;
-  BasicBlock *BB;
-  Value *Mask, *EVL;
-  auto Mod = createBuilderModule(F, BB, Mask, EVL);
-
-  IRBuilder<> Builder(BB);
-  auto *VoidTy = Builder.getVoidTy();
-  VectorBuilder VBuild(Builder, VectorBuilder::Behavior::ReportAndAbort);
-  VBuild.setMask(Mask).setEVL(EVL);
-  ASSERT_DEATH({ VBuild.createVectorInstruction(Instruction::Br, VoidTy, {}); },
-               "No VPIntrinsic for this opcode");
-}
-
-} // end anonymous namespace

From 741ea80446e21b4052d723765011fe3583d3fc7f Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Thu, 12 Jun 2025 20:46:30 +0000
Subject: [PATCH 297/851] [gn build] Port 8ee9646b06cd

---
 llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn       | 1 -
 llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
index 4f103d30f300b..22aa0b6418132 100644
--- a/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/IR/BUILD.gn
@@ -90,7 +90,6 @@ static_library("IR") {
     "VFABIDemangler.cpp",
     "Value.cpp",
     "ValueSymbolTable.cpp",
-    "VectorBuilder.cpp",
     "VectorTypeUtils.cpp",
     "Verifier.cpp",
   ]
diff --git a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
index 0f34231ae3216..0d162ff0f9d57 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn
@@ -54,7 +54,6 @@ unittest("IRTests") {
     "ValueHandleTest.cpp",
     "ValueMapTest.cpp",
     "ValueTest.cpp",
-    "VectorBuilderTest.cpp",
     "VectorTypeUtilsTest.cpp",
     "VectorTypesTest.cpp",
     "VerifierTest.cpp",

From 8a8ea8fec063bd64c17e463e7c3eaae5cdb4a645 Mon Sep 17 00:00:00 2001
From: Chelsea Cassanova <chelsea_cassanova@apple.com>
Date: Thu, 12 Jun 2025 13:55:44 -0700
Subject: [PATCH 298/851] =?UTF-8?q?Reland=20"[lldb][headers]=20Create=20Py?=
 =?UTF-8?q?thon=20script=20to=20fix=20up=20framework=20head=E2=80=A6=20(#1?=
 =?UTF-8?q?43945)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ers" (#143941)

Reland the script that converts lldb headers to RPC headers. The RPC
test was failing due to the incorrect input filepath being used.

Original commit message:
This commit replaces the shell script that fixes up includes for the
LLDB framework with a Python script. This script will also be used when
fixing up includes for the LLDBRPC.framework.
---
 lldb/cmake/modules/LLDBFramework.cmake        |  42 +++---
 lldb/scripts/framework-header-fix.py          | 126 ++++++++++++++++++
 .../Shell/Scripts/Inputs/Main/SBAddress.h     |  13 ++
 .../Shell/Scripts/Inputs/RPC/RPCSBAddress.h   |   9 ++
 .../Shell/Scripts/TestFrameworkFixScript.test |  11 ++
 .../Scripts/TestFrameworkFixUnifdef.test      |  12 ++
 .../Scripts/TestRPCFrameworkFixScript.test    |  14 ++
 7 files changed, 206 insertions(+), 21 deletions(-)
 create mode 100755 lldb/scripts/framework-header-fix.py
 create mode 100644 lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
 create mode 100644 lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
 create mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixScript.test
 create mode 100644 lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
 create mode 100644 lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test

diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake
index 8961b1afe93ad..70010ffbf738c 100644
--- a/lldb/cmake/modules/LLDBFramework.cmake
+++ b/lldb/cmake/modules/LLDBFramework.cmake
@@ -68,24 +68,17 @@ if(NOT APPLE_EMBEDDED)
   )
 endif()
 
-# At configuration time, collect headers for the framework bundle and copy them
-# into a staging directory. Later we can copy over the entire folder.
-file(GLOB public_headers ${LLDB_SOURCE_DIR}/include/lldb/API/*.h)
-set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h)
-file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h)
-file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h)
-list(REMOVE_ITEM root_public_headers ${root_private_headers})
-
 find_program(unifdef_EXECUTABLE unifdef)
 
-set(lldb_header_staging ${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders)
-foreach(header
-    ${public_headers}
-    ${generated_public_headers}
-    ${root_public_headers})
+# All necessary header files will be staged in the include directory in the build directory,
+# so just copy the files from there into the framework's staging directory.
+set(lldb_build_dir_header_staging "${CMAKE_BINARY_DIR}/include/lldb")
+set(lldb_framework_header_staging "${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders")
+file(GLOB lldb_build_dir_header_staging_list ${lldb_build_dir_header_staging}/*)
+foreach(header ${lldb_build_dir_header_staging_list})
 
   get_filename_component(basename ${header} NAME)
-  set(staged_header ${lldb_header_staging}/${basename})
+  set(staged_header ${lldb_framework_header_staging}/${basename})
 
   if(unifdef_EXECUTABLE)
     # unifdef returns 0 when the file is unchanged and 1 if something was changed.
@@ -112,13 +105,20 @@ set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "LLDB/Resources
 add_dependencies(liblldb-resource-headers liblldb-header-staging)
 add_dependencies(liblldb liblldb-resource-headers)
 
-# At build time, copy the staged headers into the framework bundle (and do
-# some post-processing in-place).
-add_custom_command(TARGET liblldb POST_BUILD
-  COMMAND ${CMAKE_COMMAND} -E copy_directory ${lldb_header_staging} $<TARGET_FILE_DIR:liblldb>/Headers
-  COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.sh $<TARGET_FILE_DIR:liblldb>/Headers ${LLDB_VERSION}
-  COMMENT "LLDB.framework: copy framework headers"
-)
+# Take the headers from the staging directory and fix up their includes for the framework.
+# Then write them to the output directory.
+# Also, run unifdef to remove any specified guards from the header files.
+file(GLOB lldb_framework_header_staging_list ${lldb_framework_header_staging}/*)
+foreach(header ${lldb_framework_header_staging_list})
+
+  set(input_header ${header})
+  set(output_header $<TARGET_FILE_DIR:liblldb>/Headers/${input_header})
+
+  add_custom_command(TARGET liblldb POST_BUILD
+    COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.py -f lldb_main -i ${input_header} -o ${output_header} -p ${unifdef_EXECUTABLE} USWIG
+    COMMENT "LLDB.framework: Fix up and copy framework headers"
+  )
+endforeach()
 
 # Copy vendor-specific headers from clang (without staging).
 if(NOT APPLE_EMBEDDED)
diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py
new file mode 100755
index 0000000000000..9528fdb7e30bd
--- /dev/null
+++ b/lldb/scripts/framework-header-fix.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+
+"""
+Usage: <path/to/input-directory> <path/to/output-directory>
+
+This script is used when building LLDB.framework or LLDBRPC.framework. For each framework, local includes are converted to their respective framework includes.
+
+This script is used in 2 ways:
+1. It is used on header files that are copied into LLDB.framework. For these files, local LLDB includes are converted into framework includes, e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>.
+
+2. It is used on header files for LLDBRPC.framework. For these files, includes of RPC common files will be converted to framework includes, e.g. #include <lldb-rpc/common/RPCCommon.h> -> #include <LLDBRPC/RPCCommon.h>. It will also change local includes to framework includes, e.g. #include "SBAddress.h" -> #include <LLDBRPC/SBAddress.h>
+"""
+
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+
+# Main header regexes
+INCLUDE_FILENAME_REGEX = re.compile(
+    r'#include "lldb/API/(?P<include_filename>.*){0,1}"'
+)
+
+# RPC header regexes
+RPC_COMMON_REGEX = re.compile(r"#include <lldb-rpc/common/(?P<include_filename>.*)>")
+RPC_INCLUDE_FILENAME_REGEX = re.compile(r'#include "(?P<include_filename>.*)"')
+
+
+def modify_rpc_includes(input_file_path, output_file_path):
+    with open(input_file_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+        with open(output_file_path, "w") as output_file:
+            # Local includes must be changed to RPC framework level includes.
+            # e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
+            # Also, RPC common code includes must change to RPC framework level includes.
+            # e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
+            rpc_common_matches = RPC_COMMON_REGEX.finditer(file_buffer)
+            rpc_include_filename_matches = RPC_INCLUDE_FILENAME_REGEX.finditer(
+                file_buffer
+            )
+            for match in rpc_common_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+            for match in rpc_include_filename_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDBRPC/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+            output_file.write(file_buffer)
+
+
+def modify_main_includes(input_file_path, output_file_path):
+    with open(input_file_path, "r") as input_file:
+        lines = input_file.readlines()
+        file_buffer = "".join(lines)
+        with open(output_file_path, "w") as output_file:
+            # Local includes must be changed to framework level includes.
+            # e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+            regex_matches = INCLUDE_FILENAME_REGEX.finditer(file_buffer)
+            for match in regex_matches:
+                file_buffer = re.sub(
+                    match.group(),
+                    r"#include <LLDB/" + match.group("include_filename") + ">",
+                    file_buffer,
+                )
+                output_file.write(file_buffer)
+
+
+def remove_guards(output_file_path, unifdef_path, unifdef_guards):
+    # The unifdef path should be passed in from CMake. If it wasn't there in CMake or is incorrect,
+    # find it using shutil. If shutil can't find it, then exit.
+    if not shutil.which(unifdef_path):
+        unifdef_path = shutil.which("unifdef")
+    if not unifdef_path:
+        print(
+            "Unable to find unifdef executable. Guards will not be removed from input files. Exiting..."
+        )
+        sys.exit()
+
+    subprocess_command = (
+        [unifdef_path, "-o", output_file_path] + unifdef_guards + [output_file_path]
+    )
+    subprocess.run(subprocess_command)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", "--framework", choices=["lldb_main", "lldb_rpc"])
+    parser.add_argument("-i", "--input_file")
+    parser.add_argument("-o", "--output_file")
+    parser.add_argument("-p", "--unifdef_path")
+    parser.add_argument(
+        "unifdef_guards",
+        nargs="+",
+        type=str,
+        help="Guards to be removed with unifdef. These must be specified in the same way as they would be when passed directly into unifdef.",
+    )
+    args = parser.parse_args()
+    input_file_path = str(args.input_file)
+    output_file_path = str(args.output_file)
+    framework_version = args.framework
+    unifdef_path = str(args.unifdef_path)
+    # Prepend dashes to the list of guards passed in from the command line.
+    # unifdef takes the guards to remove as arguments in their own right (e.g. -USWIG)
+    # but passing them in with dashes for this script causes argparse to think that they're
+    # arguments in and of themself, so they need to passed in without dashes.
+    unifdef_guards = ["-" + guard for guard in args.unifdef_guards]
+
+    if framework_version == "lldb_main":
+        modify_main_includes(input_file_path, output_file_path)
+    if framework_version == "lldb_rpc":
+        modify_rpc_includes(input_file_path, output_file_path)
+    # After the incldues have been modified, run unifdef on the headers to remove any guards
+    # specified at the command line.
+    remove_guards(output_file_path, unifdef_path, unifdef_guards)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
new file mode 100644
index 0000000000000..fecc69687cd74
--- /dev/null
+++ b/lldb/test/Shell/Scripts/Inputs/Main/SBAddress.h
@@ -0,0 +1,13 @@
+// This is a truncated version of an SB API file
+// used to test framework-header-fix.py to make sure the includes are correctly fixed
+// up for the LLDB.framework.
+
+// Local includes must be changed to framework level includes.
+// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+#include "lldb/API/SBDefines.h"
+#include "lldb/API/SBModule.h"
+
+// Any include guards specified at the command line must be removed.
+#ifndef SWIG
+int a = 10
+#endif
diff --git a/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
new file mode 100644
index 0000000000000..556afa38a9225
--- /dev/null
+++ b/lldb/test/Shell/Scripts/Inputs/RPC/RPCSBAddress.h
@@ -0,0 +1,9 @@
+// This is a truncated version of an SB API file generated by lldb-rpc-gen
+// used to test framework-header-fix.py to make sure the includes are correctly fixed
+// up for the LLDBRPC.framework.
+
+// Local includes must be changed to framework level includes.
+// e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+#include "LLDBRPC.h"
+#include "SBDefines.h"
+#include <lldb-rpc/common/RPCPublic.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
new file mode 100644
index 0000000000000..e90c3bdfc5adb
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test
@@ -0,0 +1,11 @@
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
+
+# Local includes must be changed to framework level includes.
+# e.g. #include "lldb/API/SBDefines.h" -> #include <LLDB/SBDefines.h>
+CHECK: #include <LLDB/SBDefines.h>
+CHECK: #include <LLDB/SBModule.h>
diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
new file mode 100644
index 0000000000000..a7e82d2f3640c
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test
@@ -0,0 +1,12 @@
+# REQUIRES: system-darwin
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/SBAddress.h | FileCheck %s
+
+# Any include guards specified at the command line must be removed.
+CHECK-NOT: #ifndef SWIG
+CHECK: int a = 10
+CHECK-NOT: #endif
diff --git a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
new file mode 100644
index 0000000000000..d015942653967
--- /dev/null
+++ b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test
@@ -0,0 +1,14 @@
+# Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir.
+RUN: mkdir -p %t/Outputs
+RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_rpc -i %p/Inputs/RPC/RPCSBAddress.h -o %t/Outputs/RPCSBAddress.h -p /usr/bin/unifdef USWIG
+
+# Check the output
+RUN: cat %t/Outputs/RPCSBAddress.h | FileCheck %s
+
+# Local includes must be changed to RPC framework level includes.
+# e.g. #include "SBDefines.h" -> #include <LLDBRPC/SBDefines.h>
+# Also, RPC common code includes must change to RPC framework level includes.
+# e.g. #include "lldb-rpc/common/RPCPublic.h" -> #include <LLDBRPC/RPCPublic.h>
+CHECK: #include <LLDBRPC/LLDBRPC.h>
+CHECK: #include <LLDBRPC/SBDefines.h>
+CHECK: #include <LLDBRPC/RPCPublic.h>

From 6f3e2c076d6e3abac9cfd756e95a1ebb5979dd88 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Thu, 12 Jun 2025 14:08:50 -0700
Subject: [PATCH 299/851] [MSAN] fork avx512vl-intrinsics and x86-vpermi2 tests
 (#143643)

---
 .../X86/avx512vl-intrinsics.ll                | 12306 ++++++++++++++++
 .../MemorySanitizer/X86/x86-vpermi2.ll        |   722 +
 2 files changed, 13028 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
new file mode 100644
index 0000000000000..14d68b449a7b6
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -0,0 +1,12306 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mattr=+avx512f -passes=msan -S | FileCheck %s
+
+; Forked from llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <2 x double> @test_mask_compress_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> %passthru, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_maskz_compress_pd_128(<2 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_maskz_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> zeroinitializer, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_compress_pd_128(<2 x double> %data, <2 x double> %data2) #0 {
+; CHECK-LABEL: define <2 x double> @test_compress_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> [[DATA]], <2 x double> [[DATA2]], <2 x i1> splat (i1 true))
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> %data2, <2 x i1> <i1 true, i1 true>)
+  ret <2 x double> %1
+}
+
+define <4 x float> @test_mask_compress_ps_128(<4 x float> %data, <4 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_compress_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> %passthru, <4 x i1> %extract)
+  ret <4 x float> %2
+}
+
+define <4 x float> @test_maskz_compress_ps_128(<4 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_maskz_compress_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> zeroinitializer, <4 x i1> %extract)
+  ret <4 x float> %2
+}
+
+define <4 x float> @test_compress_ps_128(<4 x float> %data, <4 x float> %data2) #0 {
+; CHECK-LABEL: define <4 x float> @test_compress_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> [[DATA]], <4 x float> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x float> %1
+}
+
+define <2 x i64> @test_mask_compress_q_128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_mask_compress_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> %passthru, <2 x i1> %extract)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_maskz_compress_q_128(<2 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_maskz_compress_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> zeroinitializer, <2 x i1> %extract)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_compress_q_128(<2 x i64> %data, <2 x i64> %data2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_compress_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> [[DATA]], <2 x i64> [[DATA2]], <2 x i1> splat (i1 true))
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> %data2, <2 x i1> <i1 true, i1 true>)
+  ret <2 x i64> %1
+}
+
+define <4 x i32> @test_mask_compress_d_128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_mask_compress_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> %passthru, <4 x i1> %extract)
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @test_maskz_compress_d_128(<4 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_maskz_compress_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> zeroinitializer, <4 x i1> %extract)
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @test_compress_d_128(<4 x i32> %data, <4 x i32> %data2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_compress_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> [[DATA]], <4 x i32> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x i32> %1
+}
+
+define <2 x double> @test_expand_pd_128(<2 x double> %data, <2 x double> %data2) #0 {
+; CHECK-LABEL: define <2 x double> @test_expand_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> [[DATA2]], <2 x i1> splat (i1 true))
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> %data2, <2 x i1> <i1 true, i1 true>)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_mask_expand_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_expand_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], <2 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> %passthru, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_maskz_expand_pd_128(<2 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_maskz_expand_pd_128(
+; CHECK-SAME: <2 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> [[DATA]], <2 x double> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> zeroinitializer, <2 x i1> %extract)
+  ret <2 x double> %2
+}
+
+define <4 x float> @test_expand_ps_128(<4 x float> %data, <4 x float> %data2) #0 {
+; CHECK-LABEL: define <4 x float> @test_expand_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_expand_ps_128(<4 x float> %data, <4 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_expand_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], <4 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> %passthru, <4 x i1> %extract)
+  ret <4 x float> %2
+}
+
+define <4 x float> @test_maskz_expand_ps_128(<4 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_maskz_expand_ps_128(
+; CHECK-SAME: <4 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> [[DATA]], <4 x float> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> zeroinitializer, <4 x i1> %extract)
+  ret <4 x float> %2
+}
+
+define <2 x i64> @test_expand_q_128(<2 x i64> %data, <2 x i64> %data2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_expand_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> [[DATA2]], <2 x i1> splat (i1 true))
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> %data2, <2 x i1> <i1 true, i1 true>)
+  ret <2 x i64> %1
+}
+
+define <2 x i64> @test_mask_expand_q_128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_mask_expand_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i2 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> [[PASSTHRU]], <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> %passthru, <2 x i1> %extract)
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_maskz_expand_q_128(<2 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_maskz_expand_q_128(
+; CHECK-SAME: <2 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i1> [[_MSPROP]] to i2
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i2 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> [[DATA]], <2 x i64> zeroinitializer, <2 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> zeroinitializer, <2 x i1> %extract)
+  ret <2 x i64> %2
+}
+
+define <4 x i32> @test_expand_d_128(<4 x i32> %data, <4 x i32> %data2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_expand_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x i32> %1
+}
+
+define <4 x i32> @test_mask_expand_d_128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_mask_expand_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], <4 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> %passthru, <4 x i1> %extract)
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @test_maskz_expand_d_128(<4 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_maskz_expand_d_128(
+; CHECK-SAME: <4 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> [[DATA]], <4 x i32> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> zeroinitializer, <4 x i1> %extract)
+  ret <4 x i32> %2
+}
+
+define <4 x double> @test_mask_compress_pd_256(<4 x double> %data, <4 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_compress_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> %passthru, <4 x i1> %extract)
+  ret <4 x double> %2
+}
+
+define <4 x double> @test_maskz_compress_pd_256(<4 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_maskz_compress_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> zeroinitializer, <4 x i1> %extract)
+  ret <4 x double> %2
+}
+
+define <4 x double> @test_compress_pd_256(<4 x double> %data, <4 x double> %data2) #0 {
+; CHECK-LABEL: define <4 x double> @test_compress_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> [[DATA]], <4 x double> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x double> %1
+}
+
+define <8 x float> @test_mask_compress_ps_256(<8 x float> %data, <8 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_compress_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> %passthru, <8 x i1> %1)
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_maskz_compress_ps_256(<8 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_maskz_compress_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> zeroinitializer, <8 x i1> %1)
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_compress_ps_256(<8 x float> %data, <8 x float> %data2) #0 {
+; CHECK-LABEL: define <8 x float> @test_compress_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> [[DATA]], <8 x float> [[DATA2]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> %data2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <8 x float> %1
+}
+
+define <4 x i64> @test_mask_compress_q_256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_mask_compress_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> %passthru, <4 x i1> %extract)
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_maskz_compress_q_256(<4 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_maskz_compress_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> zeroinitializer, <4 x i1> %extract)
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_compress_q_256(<4 x i64> %data, <4 x i64> %data2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_compress_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> [[DATA]], <4 x i64> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x i64> %1
+}
+
+define <8 x i32> @test_mask_compress_d_256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_mask_compress_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> %passthru, <8 x i1> %1)
+  ret <8 x i32> %2
+}
+
+define <8 x i32> @test_maskz_compress_d_256(<8 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_maskz_compress_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> zeroinitializer, <8 x i1> %1)
+  ret <8 x i32> %2
+}
+
+define <8 x i32> @test_compress_d_256(<8 x i32> %data, <8 x i32> %data2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_compress_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> [[DATA]], <8 x i32> [[DATA2]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> %data2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <8 x i32> %1
+}
+
+define <4 x double> @test_expand_pd_256(<4 x double> %data, <4 x double> %data2) #0 {
+; CHECK-LABEL: define <4 x double> @test_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x double> %1
+}
+
+define <4 x double> @test_mask_expand_pd_256(<4 x double> %data, <4 x double> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], <4 x double> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> %passthru, <4 x i1> %extract)
+  ret <4 x double> %2
+}
+
+define <4 x double> @test_maskz_expand_pd_256(<4 x double> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_maskz_expand_pd_256(
+; CHECK-SAME: <4 x double> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> [[DATA]], <4 x double> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> zeroinitializer, <4 x i1> %extract)
+  ret <4 x double> %2
+}
+
+define <8 x float> @test_expand_ps_256(<8 x float> %data, <8 x float> %data2) #0 {
+; CHECK-LABEL: define <8 x float> @test_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> [[DATA2]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> %data2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <8 x float> %1
+}
+
+define <8 x float> @test_mask_expand_ps_256(<8 x float> %data, <8 x float> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], <8 x float> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> %passthru, <8 x i1> %1)
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_maskz_expand_ps_256(<8 x float> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_maskz_expand_ps_256(
+; CHECK-SAME: <8 x float> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> [[DATA]], <8 x float> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> zeroinitializer, <8 x i1> %1)
+  ret <8 x float> %2
+}
+
+define <4 x i64> @test_expand_q_256(<4 x i64> %data, <4 x i64> %data2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> [[DATA2]], <4 x i1> splat (i1 true))
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> %data2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @test_mask_expand_q_256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_mask_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i4 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> [[PASSTHRU]], <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> %passthru, <4 x i1> %extract)
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_maskz_expand_q_256(<4 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_maskz_expand_q_256(
+; CHECK-SAME: <4 x i64> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> [[DATA]], <4 x i64> zeroinitializer, <4 x i1> [[EXTRACT]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> zeroinitializer, <4 x i1> %extract)
+  ret <4 x i64> %2
+}
+
+define <8 x i32> @test_expand_d_256(<8 x i32> %data, <8 x i32> %data2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[DATA2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> [[DATA2]], <8 x i1> splat (i1 true))
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> %data2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @test_mask_expand_d_256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_mask_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], <8 x i32> [[PASSTHRU:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> [[PASSTHRU]], <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> %passthru, <8 x i1> %1)
+  ret <8 x i32> %2
+}
+
+define <8 x i32> @test_maskz_expand_d_256(<8 x i32> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_maskz_expand_d_256(
+; CHECK-SAME: <8 x i32> [[DATA:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> [[DATA]], <8 x i32> zeroinitializer, <8 x i1> [[TMP1]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = bitcast i8 %mask to <8 x i1>
+  %2 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> zeroinitializer, <8 x i1> %1)
+  ret <8 x i32> %2
+}
+
+define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmpps_256(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> [[A]], <8 x float> [[B]], i32 2, <8 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i1> [[RES]] to i8
+; CHECK-NEXT:    store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = bitcast <8 x i1> %res to i8
+  ret i8 %1
+}
+declare <8 x i1> @llvm.x86.avx512.mask.cmp.ps.256(<8 x float>, <8 x float>, i32, <8 x i1>)
+
+define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmpps_128(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP6]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> [[A]], <4 x float> [[B]], i32 2, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[RES]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
+}
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.ps.128(<4 x float>, <4 x float>, i32, <4 x i1>)
+
+define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmppd_256(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP6]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> [[A]], <4 x double> [[B]], i32 2, <4 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i1> [[RES]], <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %res = call <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = shufflevector <4 x i1> %res, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
+}
+declare <4 x i1> @llvm.x86.avx512.mask.cmp.pd.256(<4 x double>, <4 x double>, i32, <4 x i1>)
+
+define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) #0 {
+; CHECK-LABEL: define i8 @test_cmppd_128(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP5]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP6]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> [[A]], <2 x double> [[B]], i32 2, <2 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i1> [[RES]], <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i1> [[TMP1]] to i8
+; CHECK-NEXT:    store i8 0, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %res = call <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, <2 x i1> <i1 true, i1 true>)
+  %1 = shufflevector <2 x i1> %res, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+  %2 = bitcast <8 x i1> %1 to i8
+  ret i8 %2
+}
+declare <2 x i1> @llvm.x86.avx512.mask.cmp.pd.128(<2 x double>, <2 x double>, i32, <2 x i1>)
+
+define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_maskz_max_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP7]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP8]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_mask_max_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[SRC]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[SRC]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP9]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <8 x float> @test_mm512_max_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
+
+define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_maskz_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP7]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_mask_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[SRC]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[SRC]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <4 x float> @test_mm512_max_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
+
+define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_maskz_min_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP7]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP8]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mm512_mask_min_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[SRC:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[SRC]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[SRC]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP9]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <8 x float> @test_mm512_min_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
+
+define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_maskz_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP8]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = bitcast i8 %mask2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mm512_mask_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[SRC:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP8]], [[TMP12]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP14]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[SRC]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP14]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[SRC]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = bitcast i8 %mask2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: define <4 x float> @test_mm512_min_ps_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> [[A0]], <4 x float> [[A1]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
+
+define <4 x double> @test_getexp_pd_256(<4 x double> %a0) #0 {
+; CHECK-LABEL: define <4 x double> @test_getexp_pd_256(
+; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0,  <4 x double> zeroinitializer, i8 -1)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <8 x float> @test_getexp_ps_256(<8 x float> %a0) #0 {
+; CHECK-LABEL: define <8 x float> @test_getexp_ps_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermi2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermi2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP5]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP15]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> [[X1]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1
+  ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+  ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP5]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP15]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> [[X1]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1
+  ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP6]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
+  ret <4 x i32> %3
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpermi2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermi2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP5]], <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP15]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[X1]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+  ret <8 x i32> %3
+}
+
+define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_ask_vpermt2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
+; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
+  ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermt2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP5]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP15]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[X1]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+  ret <8 x i32> %3
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP13]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP6]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
+  ret <8 x i32> %3
+}
+
+declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)
+
+define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) #0 {
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_vpermi2var_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2)
+  ret <2 x double> %1
+}
+
+define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vpermi2var_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP13]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[X1]] to <2 x double>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <2 x i64> [[TMP20]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP21]], <2 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP10]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2)
+  %2 = bitcast <2 x i64> %x1 to <2 x double>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1>
+  %4 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %2
+  ret <2 x double> %4
+}
+
+declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>)
+
+define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) #0 {
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_vpermi2var_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2)
+  ret <4 x double> %1
+}
+
+define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vpermi2var_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP13]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[X1]] to <4 x double>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i64> [[TMP20]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP21]], <4 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[TMP2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP10]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2)
+  %2 = bitcast <4 x i64> %x1 to <4 x double>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %2
+  ret <4 x double> %4
+}
+
+declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>)
+
+define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) #0 {
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_vpermi2var_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2)
+  ret <4 x float> %1
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[X1]] to <4 x float>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP20]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP21]], <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2)
+  %2 = bitcast <4 x i32> %x1 to <4 x float>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2
+  ret <4 x float> %4
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x i64> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128_cast(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <2 x i64> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    [[X1CAST:%.*]] = bitcast <2 x i64> [[X1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP12]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP14]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1CAST]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[X1CAST]] to <4 x float>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP14]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i32> [[TMP21]], [[TMP14]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP22]], <4 x i32> [[TMP17]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %x1cast = bitcast <2 x i64> %x1 to <4 x i32>
+  %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1cast, <4 x float> %x2)
+  %2 = bitcast <4 x i32> %x1cast to <4 x float>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2
+  ret <4 x float> %4
+}
+
+declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
+
+define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) #0 {
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_vpermi2var_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP4]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2)
+  ret <8 x float> %1
+}
+
+define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_vpermi2var_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP13]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[X1]] to <8 x float>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> zeroinitializer, <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i32> [[TMP20]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP21]], <8 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP1]], <8 x float> [[TMP2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP10]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2)
+  %2 = bitcast <8 x i32> %x1 to <8 x float>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2
+  ret <8 x float> %4
+}
+
+declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
+
+define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermi2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermi2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP5]], <2 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP15]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> [[X1]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1
+  ret <2 x i64> %3
+}
+
+define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermt2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
+  ret <2 x i64> %1
+}
+
+define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermt2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP5]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP15]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> [[X1]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1
+  ret <2 x i64> %3
+}
+
+define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP13]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer
+  ret <2 x i64> %3
+}
+
+declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
+
+define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermi2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  ret <4 x i64> %1
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermi2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP5]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP15]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[X1]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1
+  ret <4 x i64> %3
+}
+
+define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermt2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
+  ret <4 x i64> %1
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermt2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP5]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP15]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[X1]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1
+  ret <4 x i64> %3
+}
+
+define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP13]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP13]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer
+  ret <4 x i64> %3
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) #0 {
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_scalef_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+  ret <2 x double> %res
+}
+
+define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_scalef_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2) #0 {
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_scalef_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+  ret <4 x double> %res
+}
+
+define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_scalef_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+  ret <4 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) #0 {
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_scalef_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_scalef_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) #0 {
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_scalef_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+  ret <8 x float> %res
+}
+
+define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_scalef_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+  ret <8 x float> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_qb_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qb_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_qb_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_qb_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qb.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_qb_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qb_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_qb_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_qb_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qb.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_qw_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_qw_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_qw_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qw.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_qw_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_qw_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_qw_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qw.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmov_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES4]]
+;
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  %res3 = add <4 x i32> %res0, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovs_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES4]]
+;
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  %res3 = add <4 x i32> %res0, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovus_qd_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <4 x i32> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <4 x i32> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES4]]
+;
+  %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  %res3 = add <4 x i32> %res0, %res1
+  %res4 = add <4 x i32> %res3, %res2
+  ret <4 x i32> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(ptr %ptr, <2 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <2 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr [[PTR]], <2 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qd.mem.128(ptr %ptr, <2 x i64> %x1, i8 %x2)
+  ret void
+}
+
+define <4 x i32>@test_int_x86_avx512_pmov_qd_256(<4 x i64> %x0) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = trunc <4 x i64> %x0 to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i32> [[TMP1]], <4 x i32> [[X1]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = trunc <4 x i64> %x0 to <4 x i32>
+  %2 = bitcast i8 %x2 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1
+  ret <4 x i32> %3
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmov_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmov_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i64> [[X0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP6]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = trunc <4 x i64> %x0 to <4 x i32>
+  %2 = bitcast i8 %x2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
+  ret <4 x i32> %3
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmovs_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmovs_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_maskz_pmovus_qd_256(<4 x i64> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pmovus_qd_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> [[X0]], <4 x i32> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(ptr %ptr, <4 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr [[PTR]], <4 x i64> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.qd.mem.256(ptr %ptr, <4 x i64> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_db_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_db_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.db.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmov_db_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_db_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovs_db_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_db_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <16 x i8> @test_int_x86_avx512_mask_pmovus_db_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[RES4]]
+;
+  %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1)
+  %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2)
+  %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> zeroinitializer, i8 %x2)
+  %res3 = add <16 x i8> %res0, %res1
+  %res4 = add <16 x i8> %res3, %res2
+  ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_db_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.db.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_dw_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_dw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_dw_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_dw_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(ptr %ptr, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr [[PTR]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.dw.mem.128(ptr %ptr, <4 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmov_dw_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmov_dw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmov.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovs_dw_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovs.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_int_x86_avx512_mask_pmovus_dw_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 -1)
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT:    [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES4]]
+;
+  %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> zeroinitializer, i8 %x2)
+  %res3 = add <8 x i16> %res0, %res1
+  %res4 = add <8 x i16> %res3, %res2
+  ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(ptr %ptr, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr [[PTR]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 -1)
+  call void @llvm.x86.avx512.mask.pmovus.dw.mem.256(ptr %ptr, <8 x i32> %x1, i8 %x2)
+  ret void
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2dq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = add <4 x i32> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES2]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res2 = add <4 x i32> %res, %res1
+  ret <4 x i32> %res2
+}
+
+define <4 x i32>@test_int_x86_avx512_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2dq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res3
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2dq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES1]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res1
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1) #0 {
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_cvt_pd2ps(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_pd2ps(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2)
+  ret <4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1) #0 {
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_cvt_pd2ps_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x float> [[RES2]], <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES3]]
+;
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1)
+  %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x float> %res3
+}
+
+define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_cvt_pd2ps_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> [[X0]], <4 x float> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x float> [[RES]], <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES1]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2)
+  %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x float> %res1
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res3
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES1]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res1
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_pd2udq_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_pd2udq_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_ps2dq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ps2dq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvt_ps2dq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
+  ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvt_ps2dq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvt_ps2udq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvt_ps2udq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvt_ps2udq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
+  ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvt_ps2udq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
+  ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_ask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_ask_cvtt_pd2dq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2dq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2dq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res3
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES1]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res1
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = shufflevector <4 x i32> [[RES2]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES3]]
+;
+  %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res3
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    [[RES1:%.*]] = shufflevector <4 x i32> [[RES]], <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES1]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i32> %res1
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_pd2udq_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_pd2udq_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_cvtt_ps2udq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_cvtt_ps2udq_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> [[X0]], <4 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_cvtt_ps2udq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1)
+  ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_cvtt_ps2udq_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> [[X0]], <8 x i32> [[X1]], i8 [[X2]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[RES]]
+;
+  %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2)
+  ret <8 x i32> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_rndscale_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 4, <2 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 88, <2 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES2]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 88, <2 x double> %x2, i8 -1)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_rndscale_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 4, <4 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 88, <4 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <4 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES2]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 88, <4 x double> %x2, i8 -1)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_rndscale_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 88, <4 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 4, <4 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES2]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 4, <4 x float> %x2, i8 -1)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_rndscale_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> [[X0]], i32 5, <8 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> [[X0]], i32 66, <8 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES2]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 66, <8 x float> %x2, i8 -1)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_getmant_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 11, <2 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 12, <2 x double> zeroinitializer, i8 [[X3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP8:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT:    br i1 [[_MSOR9]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> [[X0]], i32 13, <2 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <2 x double> [[RES2]], [[RES3]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES4]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 11, <2 x double> %x2, i8 %x3)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 12, <2 x double> zeroinitializer, i8 %x3)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double> %x0, i32 13, <2 x double> %x2, i8 -1)
+  %res3 = fadd <2 x double> %res, %res1
+  %res4 = fadd <2 x double> %res2, %res3
+  ret <2 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_getmant_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> [[X0]], i32 11, <4 x double> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> [[X0]], i32 12, <4 x double> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <4 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES2]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 11, <4 x double> %x2, i8 %x3)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double> %x0, i32 12, <4 x double> %x2, i8 -1)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_getmant_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> [[X0]], i32 11, <4 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> [[X0]], i32 12, <4 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES2]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 11, <4 x float> %x2, i8 %x3)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float> %x0, i32 12, <4 x float> %x2, i8 -1)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_getmant_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> [[X0]], i32 11, <8 x float> [[X2]], i8 [[X3]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT:    br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> [[X0]], i32 12, <8 x float> [[X2]], i8 -1)
+; CHECK-NEXT:    [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES2]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 11, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float> %x0, i32 12, <8 x float> %x2, i8 -1)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32)
+
+define <4 x i32>@test_int_x86_avx512_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) #0 {
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_pternlog_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33)
+  ret <4 x i32> %1
+}
+
+define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_pternlog_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[X0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP16]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> [[X0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
+  ret <4 x i32> %3
+}
+
+declare <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8)
+
+define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_pternlog_d_128(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP10]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP6]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer
+  ret <4 x i32> %3
+}
+
+declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32)
+
+define <8 x i32>@test_int_x86_avx512_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) #0 {
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_pternlog_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33)
+  ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_pternlog_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> zeroinitializer, <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], [[X0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP16]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> [[X0]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
+  ret <8 x i32> %3
+}
+
+declare <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32, i8)
+
+define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_pternlog_d_256(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP10]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> [[TMP6]], <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP1]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP7]]
+;
+  %1 = call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
+  ret <8 x i32> %3
+}
+
+declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32)
+
+define <2 x i64>@test_int_x86_avx512_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) #0 {
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_pternlog_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP7]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33)
+  ret <2 x i64> %1
+}
+
+define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_pternlog_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], [[X0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP16]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> [[X0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x0
+  ret <2 x i64> %3
+}
+
+define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_pternlog_q_128(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP10]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP1]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[TMP7]]
+;
+  %1 = call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer
+  ret <2 x i64> %3
+}
+
+declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32)
+
+define <4 x i64>@test_int_x86_avx512_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_pternlog_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP7]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33)
+  ret <4 x i64> %1
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_pternlog_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP16]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> [[X0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x0
+  ret <4 x i64> %3
+}
+
+define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_pternlog_q_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP10]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]], i32 33)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X4]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33)
+  %2 = bitcast i8 %x4 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer
+  ret <4 x i64> %3
+}
+
+define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_x86_vcvtps2ph_128(
+; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]], <8 x i16> [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 2, <8 x i16> zeroinitializer, i8 -1)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 10, <8 x i16> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES3:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 11, <8 x i16> [[SRC]], i8 [[MASK]])
+; CHECK-NEXT:    [[RES0:%.*]] = add <8 x i16> [[RES1]], [[RES2]]
+; CHECK-NEXT:    [[RES:%.*]] = add <8 x i16> [[RES3]], [[RES0]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES]]
+;
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 11, <8 x i16> %src, i8 %mask)
+  %res0 = add <8 x i16> %res1, %res2
+  %res = add <8 x i16> %res3, %res0
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16>, i8) nounwind readonly
+
+define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %src) #0 {
+;
+; CHECK-LABEL: define <8 x i16> @test_x86_vcvtps2ph_256(
+; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]], <8 x i16> [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 2, <8 x i16> zeroinitializer, i8 -1)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 11, <8 x i16> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT:    br i1 [[_MSOR7]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES3:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 12, <8 x i16> [[SRC]], i8 [[MASK]])
+; CHECK-NEXT:    [[RES0:%.*]] = add <8 x i16> [[RES1]], [[RES2]]
+; CHECK-NEXT:    [[RES:%.*]] = add <8 x i16> [[RES3]], [[RES0]]
+; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[RES]]
+;
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
+  %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 11, <8 x i16> zeroinitializer, i8 %mask)
+  %res3 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 12, <8 x i16> %src, i8 %mask)
+  %res0 = add <8 x i16> %res1, %res2
+  %res = add <8 x i16> %res3, %res0
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly
+
+define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) #0 {
+; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rr(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrkz(
+; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_rsqrt_ps_256_rrk(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> [[A0]], <8 x float> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) #0 {
+; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rr(
+; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrkz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_rsqrt_ps_128_rrk(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> [[A0]], <4 x float> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) #0 {
+; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rr(
+; CHECK-SAME: <8 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrkz(
+; CHECK-SAME: <8 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_rcp_ps_256_rrk(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> [[A0]], <8 x float> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) #0 {
+; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rr(
+; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrkz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_rcp_ps_128_rrk(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> [[A0]], <4 x float> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) #0 {
+; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rr(
+; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrkz(
+; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_rsqrt_pd_256_rrk(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> [[A0]], <4 x double> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) #0 {
+; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rr(
+; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrkz(
+; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_rsqrt_pd_128_rrk(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> [[A0]], <2 x double> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone
+
+define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) #0 {
+; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rr(
+; CHECK-SAME: <4 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrkz(
+; CHECK-SAME: <4 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_rcp_pd_256_rrk(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> [[A0]], <4 x double> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) #0 {
+; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rr(
+; CHECK-SAME: <2 x double> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 -1)
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrkz(
+; CHECK-SAME: <2 x double> [[A0:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> zeroinitializer, i8 [[MASK]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_rcp_pd_128_rrk(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> [[A0]], <2 x double> [[A1]], i8 [[MASK]])
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
+  ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone
+
+declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>)
+
+define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) #0 {
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_permvar_df_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1)
+  ret <4 x double> %1
+}
+
+define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_permvar_df_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP7]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP17]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP9]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> %x2
+  ret <4 x double> %3
+}
+
+define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_permvar_df_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP10]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP7]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP8]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> zeroinitializer
+  ret <4 x double> %3
+}
+
+declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>)
+
+define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) #0 {
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_permvar_di_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1)
+  ret <4 x i64> %1
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_permvar_di_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[_MSPROP]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], [[X2]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i64> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP11]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP1]], <4 x i64> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> %x2
+  ret <4 x i64> %3
+}
+
+define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_permvar_di_256(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[_MSPROP]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP6]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
+;
+  %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> zeroinitializer
+  ret <4 x i64> %3
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_fixupimm_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> [[X1]], <2 x i64> [[X2]], i32 4, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 3, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES4]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 3, i8 -1)
+  %res3 = fadd <2 x double> %res, %res1
+  %res4 = fadd <2 x double> %res3, %res2
+  ret <2 x double> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8)
+
+define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_fixupimm_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 3, i8 [[X4]])
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[RES3]]
+;
+  %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4)
+  %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 3, i8 %x4)
+  ;%res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 -1)
+  %res3 = fadd <2 x double> %res, %res1
+  ;%res4 = fadd <2 x double> %res3, %res2
+  ret <2 x double> %res3
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_fixupimm_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 4, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> [[X1]], <4 x i64> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 3, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <4 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <4 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES4]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4)
+  %res2 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1)
+  %res3 = fadd <4 x double> %res, %res1
+  %res4 = fadd <4 x double> %res3, %res2
+  ret <4 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double>, <4 x double>, <4 x i64>, i32, i8)
+
+define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_fixupimm_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> zeroinitializer, i32 4, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> [[X0]], <4 x double> [[X1]], <4 x i64> [[X2]], i32 3, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <4 x double> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <4 x double> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[RES4]]
+;
+  %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4)
+  %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4)
+  %res2 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 3, i8 -1)
+  %res3 = fadd <4 x double> %res, %res1
+  %res4 = fadd <4 x double> %res3, %res2
+  ret <4 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_fixupimm_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES4]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 6, i8 %x4)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 7, i8 -1)
+  %res3 = fadd <4 x float> %res, %res1
+  %res4 = fadd <4 x float> %res3, %res2
+  ret <4 x float> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float>, <4 x float>, <4 x i32>, i32, i8)
+
+define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_fixupimm_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[RES4]]
+;
+  %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
+  %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 6, i8 %x4)
+  %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 7, i8 -1)
+  %res3 = fadd <4 x float> %res, %res1
+  %res4 = fadd <4 x float> %res3, %res2
+  ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_fixupimm_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <8 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES4]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 6, i8 %x4)
+  %res2 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 7, i8 -1)
+  %res3 = fadd <8 x float> %res, %res1
+  %res4 = fadd <8 x float> %res3, %res2
+  ret <8 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float>, <8 x float>, <8 x i32>, i32, i8)
+
+define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_fixupimm_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X4:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP6]], 0
+; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT:    br i1 [[_MSOR5]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 5, i8 [[X4]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP6:%.*]] = icmp ne i256 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP7:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT:    [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT:    [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT:    br i1 [[_MSOR10]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> zeroinitializer, i32 6, i8 [[X4]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
+; CHECK-NEXT:    [[_MSCMP11:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP12:%.*]] = icmp ne i256 [[TMP15]], 0
+; CHECK-NEXT:    [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP14:%.*]] = icmp ne i256 [[TMP16]], 0
+; CHECK-NEXT:    [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT:    br i1 [[_MSOR15]], label %[[BB17:.*]], label %[[BB18:.*]], !prof [[PROF1]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB18]]:
+; CHECK-NEXT:    [[RES2:%.*]] = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> [[X0]], <8 x float> [[X1]], <8 x i32> [[X2]], i32 7, i8 -1)
+; CHECK-NEXT:    [[RES3:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
+; CHECK-NEXT:    [[RES4:%.*]] = fadd <8 x float> [[RES3]], [[RES2]]
+; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[RES4]]
+;
+  %res = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
+  %res1 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 6, i8 %x4)
+  %res2 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 7, i8 -1)
+  %res3 = fadd <8 x float> %res, %res1
+  %res4 = fadd <8 x float> %res3, %res2
+  ret <8 x float> %res4
+}
+
+define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i128 [[TMP6]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    store <2 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i128 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i1 [[TMP8]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i128 [[TMP9]] to <2 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP16]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP12]], <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <2 x i64> [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <2 x i64> [[TMP17]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP18]], <2 x i64> [[TMP15]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[PASSTHRU]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
+  ret <2 x i64> %res2
+}
+define <2 x i64> @test_x86_avx512_maskz_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psra_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i128 [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i1 [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i128 [[TMP8]] to <2 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP15]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP13]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP14:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP11]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP11]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP14]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+  ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx512_psra_q_256(<4 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i1 [[TMP5]] to i256
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i256 [[TMP6]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    store <4 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, <4 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT:    [[TMP16:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i128 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i1 [[TMP8]] to i256
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i256 [[TMP9]] to <4 x i64>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP16]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP12]], <4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP4]], [[TMP12]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i64> [[TMP17]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP18]], <4 x i64> [[TMP15]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[PASSTHRU]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
+  ret <4 x i64> %res2
+}
+define <4 x i64> @test_x86_avx512_maskz_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, <4 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psra_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 88) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i128 [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i1 [[TMP7]] to i256
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i256 [[TMP8]] to <4 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP15]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP11]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP11]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP14]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+  ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_avx512_psrai_q_128(<2 x i64> %a0) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP3]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP5]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP6]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP7]], <8 x i1> [[TMP8]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP12]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[PASSTHRU]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
+  ret <2 x i64> %res2
+}
+define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(<2 x i64> %a0, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[TMP9]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP7]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP2]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+  ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx512_psrai_q_256(<4 x i64> %a0) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP3]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[PASSTHRU:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP5]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP6]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP7]], <8 x i1> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[PASSTHRU]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i64> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP12]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[PASSTHRU]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
+  ret <4 x i64> %res2
+}
+define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(<4 x i64> %a0, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[TMP9]], i32 7)
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> [[A0]], i32 7)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+  ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64>, i32) nounwind readnone
+
+define <2 x i64> @test_x86_avx512_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1) #0 {
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_mask_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 56) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <2 x i1> [[TMP6]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP9]], <2 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], [[A2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP15]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> [[A2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %a2
+  ret <2 x i64> %res2
+}
+
+define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(
+; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <2 x i1> [[TMP13]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[TMP1]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP7]], [[TMP14]]
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> [[A0]], <2 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP10]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[TMP8]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP6]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <2 x i1> [[MASK_EXTRACT]], <2 x i64> [[RES]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[RES2]]
+;
+  %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast, <2 x i32> <i32 0, i32 1>
+  %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
+  ret <2 x i64> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_x86_avx512_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1) #0 {
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT:    store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, i8 %mask, i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_mask_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP9]], <4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], [[A2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP4]], [[TMP9]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP5]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP15]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> [[A2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast , <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %a2
+  ret <4 x i64> %res2
+}
+
+define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask,  i8 %mask2) #0 {
+;
+; CHECK-LABEL: define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(
+; CHECK-SAME: <4 x i64> [[A0:%.*]], <4 x i64> [[A1:%.*]], i8 [[MASK:%.*]], i8 [[MASK2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <4 x i1> [[TMP13]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[TMP1]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i64> [[TMP7]], [[TMP14]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> [[A0]], <4 x i64> [[A1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT:    [[MASK_CAST:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+; CHECK-NEXT:    [[MASK2_CAST:%.*]] = bitcast i8 [[MASK2]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[MASK_EXTRACT:%.*]] = shufflevector <8 x i1> [[MASK_CAST]], <8 x i1> [[MASK2_CAST]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[TMP8]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <4 x i64> [[RES]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP6]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[RES2:%.*]] = select <4 x i1> [[MASK_EXTRACT]], <4 x i64> [[RES]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[RES2]]
+;
+  %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  %mask.cast = bitcast i8 %mask to <8 x i1>
+  %mask2.cast = bitcast i8 %mask2 to <8 x i1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> %mask2.cast , <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
+  ret <4 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <8 x float> @test_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[A1]], <8 x float> [[A2]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  ret <8 x float> %1
+}
+
+define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[A1]], <8 x float> [[A2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[A0]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP9]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %a0
+  ret <8 x float> %3
+}
+
+define <4 x float> @test_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x double> @test_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 {
+; CHECK-LABEL: define <4 x double> @test_fmadd256_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x double> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A]], <4 x double> [[B]], <4 x double> [[C]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
+  ret <4 x double> %1
+}
+
+define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_fmadd256_pd(
+; CHECK-SAME: <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x double> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A]], <4 x double> [[B]], <4 x double> [[C]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[A]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP12]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP13]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[A]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP9]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %a
+  ret <4 x double> %3
+}
+
+define <2 x double> @test_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+; CHECK-LABEL: define <2 x double> @test_fmadd128_pd(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_fmadd128_pd(
+; CHECK-SAME: <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A]], <2 x double> [[B]], <2 x double> [[C]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[TMP12]], [[TMP7]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP13]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[A]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP9]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %a
+  ret <2 x double> %3
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP13]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP9]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %x2
+  ret <2 x double> %3
+}
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_vfmadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <2 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP7]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP8]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> zeroinitializer
+  ret <2 x double> %3
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP13]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP9]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %x2
+  ret <4 x double> %3
+}
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP7]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP8]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> zeroinitializer
+  ret <4 x double> %3
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP13]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %x2
+  ret <4 x float> %3
+}
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP7]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP8]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+  ret <4 x float> %3
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <8 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP13]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP9]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %x2
+  ret <8 x float> %3
+}
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_vfmadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP7]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP8]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+  ret <8 x float> %3
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP2]], <2 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP13]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP3]], <2 x i64> [[TMP14]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP2]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP10]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %1)
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1>
+  %4 = select <2 x i1> %extract, <2 x double> %2, <2 x double> %x2
+  ret <2 x double> %4
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP2]], <4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP13]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i64> [[TMP14]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP2]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP10]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %1)
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x double> %2, <4 x double> %x2
+  ret <4 x double> %4
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP13]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP14]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP2]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %1)
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %2, <4 x float> %x2
+  ret <4 x float> %4
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[_MSPROP]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP1]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP2]], <8 x i32> [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP13]], [[TMP8]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP14]], <8 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP2]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP10]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %1)
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %x2
+  ret <8 x float> %4
+}
+
+define <8 x float> @test_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfnmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[A2]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP2]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %a2)
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfnmadd256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[A2]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[_MSPROP2]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP14]], <8 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP2]], <8 x float> [[A0]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP10]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %a2)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %a0
+  ret <8 x float> %4
+}
+
+define <4 x float> @test_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfnmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[A2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP2]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %a2)
+  ret <4 x float> %2
+}
+
+define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfnmadd128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[A2]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP2]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP14]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP2]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP10]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %a2)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x float> %2, <4 x float> %a0
+  ret <4 x float> %4
+}
+
+define <4 x double> @test_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfnmadd256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[A2]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP2]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %a2)
+  ret <4 x double> %2
+}
+
+define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfnmadd256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP2]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i64> [[TMP14]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP2]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP10]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %a2)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = select <4 x i1> %extract, <4 x double> %2, <4 x double> %a0
+  ret <4 x double> %4
+}
+
+define <2 x double> @test_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfnmadd128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP5]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[A2]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %a2)
+  ret <2 x double> %2
+}
+
+define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfnmadd128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP2]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[TMP7]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP13]], [[TMP11]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP3]], <2 x i64> [[TMP14]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP2]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP10]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %a2)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> <i32 0, i32 1>
+  %4 = select <2 x i1> %extract, <2 x double> %2, <2 x double> %a0
+  ret <2 x double> %4
+}
+
+define <8 x float> @test_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: define <8 x float> @test_vfnmsub256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %2)
+  ret <8 x float> %3
+}
+
+define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_vfnmsub256_ps(
+; CHECK-SAME: <8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]], <8 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A0]], <8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[_MSPROP3]], <8 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP3]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[A0]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i32> [[TMP15]], <8 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x float> [[TMP3]], <8 x float> [[A0]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP11]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a0, <8 x float> %1, <8 x float> %2)
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %a0
+  ret <8 x float> %5
+}
+
+define <4 x float> @test_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: define <4 x float> @test_vfnmsub128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %2)
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfnmsub128_ps(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP3]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i32> [[TMP15]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP3]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a1
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %1, <4 x float> %2)
+  %4 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x float> %3, <4 x float> %a0
+  ret <4 x float> %5
+}
+
+define <4 x double> @test_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfnmsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP3]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %2)
+  ret <4 x double> %3
+}
+
+define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfnmsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP3]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i64> [[TMP15]], <4 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP3]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %1, <4 x double> %2)
+  %4 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x double> %3, <4 x double> %a0
+  ret <4 x double> %5
+}
+
+define <2 x double> @test_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfnmsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %2)
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfnmsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[TMP12]], [[_MSPROP]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP3]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP4]], <2 x i64> [[TMP15]], <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP3]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a1
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %1, <2 x double> %2)
+  %4 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1>
+  %5 = select <2 x i1> %extract, <2 x double> %3, <2 x double> %a0
+  ret <2 x double> %5
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP3]], <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP4]], <2 x i64> [[TMP15]], <2 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP3]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x0
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %1, <2 x double> %x1, <2 x double> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <2 x i32> <i32 0, i32 1>
+  %5 = select <2 x i1> %extract, <2 x double> %3, <2 x double> %x2
+  ret <2 x double> %5
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP3]], <4 x i64> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i64> [[TMP15]], <4 x i64> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP3]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
+  %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x0
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %1, <4 x double> %x1, <4 x double> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x double> %3, <4 x double> %x2
+  ret <4 x double> %5
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP3]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP4]], <4 x i32> [[TMP15]], <4 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP3]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %1, <4 x float> %x1, <4 x float> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %4, <8 x i1> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %extract, <4 x float> %3, <4 x float> %x2
+  ret <4 x float> %5
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X0]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP13]] to <8 x i1>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[_MSPROP3]], <8 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP3]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i32> [[TMP15]], <8 x i32> [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP4]], <8 x float> [[TMP3]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP11]]
+;
+  %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x0
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %1, <8 x float> %x1, <8 x float> %2)
+  %4 = bitcast i8 %x3 to <8 x i1>
+  %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %x2
+  ret <8 x float> %5
+}
+
+define <8 x float> @test_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) #0 {
+; CHECK-LABEL: define <8 x float> @test_fmaddsub256_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP4]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x float> %4
+}
+
+define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_mask_fmaddsub256_ps(
+; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x float> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[A]], <8 x float> [[B]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x float> [[A]] to <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[A]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP12]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %a
+  ret <8 x float> %6
+}
+
+define <4 x float> @test_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+; CHECK-LABEL: define <4 x float> @test_fmaddsub128_ps(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x float> %4
+}
+
+define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_fmaddsub128_ps(
+; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[C]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[C]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A]], <4 x float> [[B]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[A]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %a
+  ret <4 x float> %6
+}
+
+define <4 x double> @test_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: define <4 x double> @test_vfmaddsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x double> %4
+}
+
+define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmaddsub256_pd(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], <4 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %a0
+  ret <4 x double> %6
+}
+
+define <2 x double> @test_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: define <2 x double> @test_vfmaddsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  ret <2 x double> %4
+}
+
+define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmaddsub128_pd(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[A2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP6]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %a0
+  ret <2 x double> %6
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %x2
+  ret <2 x double> %6
+}
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP4]], <2 x i64> [[_MSPROP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <2 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP10]], <2 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> zeroinitializer
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP11]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %3, <2 x double> %1, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> zeroinitializer
+  ret <2 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %x2
+  ret <4 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP4]], <4 x i64> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP10]], <4 x i64> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> zeroinitializer
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP11]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %3, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> zeroinitializer
+  ret <4 x double> %6
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %x2
+  ret <4 x float> %6
+}
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP4]], <4 x i32> [[_MSPROP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <4 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP10]], <4 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> zeroinitializer
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP11]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %3, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> zeroinitializer
+  ret <4 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP12]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %x2
+  ret <8 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP4]], <8 x i32> [[_MSPROP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or <8 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP15]], <8 x i32> [[TMP10]], <8 x i32> [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> zeroinitializer
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP11]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2)
+  %4 = shufflevector <8 x float> %3, <8 x float> %1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> zeroinitializer
+  ret <8 x float> %6
+}
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_128(
+; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <2 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <2 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <2 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X0]], <2 x double> [[X1]], <2 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <2 x i64> [[_MSPROP1]], <2 x i64> [[_MSPROP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP5]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[X2]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <2 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <2 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <2 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP6]], <2 x i64> [[TMP16]], <2 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP4]], <2 x double> [[X2]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP12]]
+;
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2)
+  %2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <2 x double> @llvm.fma.v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %2)
+  %4 = shufflevector <2 x double> %1, <2 x double> %3, <2 x i32> <i32 0, i32 3>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract, <2 x double> %4, <2 x double> %x2
+  ret <2 x double> %6
+}
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_256(
+; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i64> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x double> splat (double -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i64> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i64> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[X0]], <4 x double> [[X1]], <4 x double> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i64> [[_MSPROP1]], <4 x i64> [[_MSPROP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP5]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i64> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i64> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i64> [[TMP16]], <4 x i64> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP4]], <4 x double> [[X2]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP12]]
+;
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2)
+  %2 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x2
+  %3 = call <4 x double> @llvm.fma.v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %2)
+  %4 = shufflevector <4 x double> %1, <4 x double> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> %x2
+  ret <4 x double> %6
+}
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_128(
+; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <4 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <4 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[X0]], <4 x float> [[X1]], <4 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <4 x i32> [[_MSPROP1]], <4 x i32> [[_MSPROP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> [[TMP17]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[X2]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <4 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP16]], <4 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[X2]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP12]]
+;
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2)
+  %2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %2)
+  %4 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract, <4 x float> %4, <4 x float> %x2
+  ret <4 x float> %6
+}
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_256(
+; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP13]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[X2]])
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> zeroinitializer, [[TMP13]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fsub <8 x float> splat (float -0.000000e+00), [[X2]]
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <8 x i32> [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[X0]], <8 x float> [[X1]], <8 x float> [[TMP2]])
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i32> [[_MSPROP1]], <8 x i32> [[_MSPROP4]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP5]], <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x float> [[X2]] to <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <8 x i32> [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i32> [[TMP9]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i32> [[TMP15]], [[TMP13]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> [[TMP16]], <8 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP5]], <8 x float> [[TMP4]], <8 x float> [[X2]]
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x float> [[TMP12]]
+;
+  %1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2)
+  %2 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x2
+  %3 = call <8 x float> @llvm.fma.v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %2)
+  %4 = shufflevector <8 x float> %1, <8 x float> %3, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x float> %4, <8 x float> %x2
+  ret <8 x float> %6
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmk(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 16
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %a2 = load <4 x float>, ptr %ptr_a2
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmka(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 8
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %a2 = load <4 x float>, ptr %ptr_a2, align 8
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmkz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a2 = load <4 x float>, ptr %ptr_a2
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmkza(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x float>, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %a2 = load <4 x float>, ptr %ptr_a2, align 4
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmb(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[Q]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %q = load float, ptr %ptr_a2
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmba(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP15]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[Q]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP5]], <4 x i32> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[A0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i32> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP6]], [[_MSPROP5]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i32> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP6]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[A0]]
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP9]]
+;
+  %q = load float, ptr %ptr_a2, align 4
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %a0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmbz(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[Q]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %q = load float, ptr %ptr_a2
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x float> @test_mask_vfmadd128_ps_rmbza(
+; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[Q:%.*]] = load float, ptr [[PTR_A2]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[Q]], i32 0
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1
+; CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[Q]], i32 1
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2
+; CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[Q]], i32 2
+; CHECK-NEXT:    [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3
+; CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <4 x float> [[VECINIT4_I]], float [[Q]], i32 3
+; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[_MSPROP4]], [[_MSPROP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[VECINIT6_I]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %q = load float, ptr %ptr_a2, align 4
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i)
+  ret <4 x float> %1
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmadd128_pd_rmk(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <2 x double>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP15]], align 16
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[_MSPROP1]], <2 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[A0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <2 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <2 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <2 x i64> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP2]], <2 x i64> [[TMP18]], <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[A0]]
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP9]]
+;
+  %a2 = load <2 x double>, ptr %ptr_a2
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %a0
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <2 x double> @test_mask_vfmadd128_pd_rmkz(
+; CHECK-SAME: <2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <2 x double>, ptr [[PTR_A2]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 16
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %a2 = load <2 x double>, ptr %ptr_a2
+  %1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+  ret <2 x double> %1
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, ptr %ptr_a2, i8 %mask) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmadd256_pd_rmk(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]], i8 [[MASK:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x double>, ptr [[PTR_A2]], align 32
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT:    [[TMP15:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP15]], align 32
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP12]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[TMP14]] to <8 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP1]], <4 x i64> [[TMP12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[A0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP6]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i64> [[TMP17]], [[TMP12]]
+; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i64> [[TMP18]], <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[A0]]
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP9]]
+;
+  %a2 = load <4 x double>, ptr %ptr_a2
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %a0
+  ret <4 x double> %3
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, ptr %ptr_a2) #0 {
+;
+; CHECK-LABEL: define <4 x double> @test_mask_vfmadd256_pd_rmkz(
+; CHECK-SAME: <4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]], ptr [[PTR_A2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[A2:%.*]] = load <4 x double>, ptr [[PTR_A2]], align 32
+; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PTR_A2]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[_MSLD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[A0]], <4 x double> [[A1]], <4 x double> [[A2]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %a2 = load <4 x double>, ptr %ptr_a2
+  %1 = call <4 x double> @llvm.fma.v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+  ret <4 x double> %1
+}
+
+define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) #0 {
+;
+; CHECK-LABEL: define <8 x i32> @combine_vpermi2d_vpermps(
+; CHECK-SAME: <16 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> splat (i32 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> splat (i32 -1), <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> [[TMP2]])
+; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
+;
+  %1 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %2 = shufflevector <16 x i32> %a, <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %1, <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> %2)
+  ret <8 x i32> %3
+}
+
+declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double>, <2 x double>, <2 x i1>)
+declare <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float>, <4 x float>, <4 x i1>)
+declare <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64>, <2 x i64>, <2 x i1>)
+declare <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32>, <4 x i32>, <4 x i1>)
+declare <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double>, <2 x double>, <2 x i1>)
+declare <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float>, <4 x float>, <4 x i1>)
+declare <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64>, <2 x i64>, <2 x i1>)
+declare <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32>, <4 x i32>, <4 x i1>)
+declare <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double>, <4 x double>, <4 x i1>)
+declare <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float>, <8 x float>, <8 x i1>)
+declare <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64>, <4 x i64>, <4 x i1>)
+declare <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32>, <8 x i32>, <8 x i1>)
+declare <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double>, <4 x double>, <4 x i1>)
+declare <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float>, <8 x float>, <8 x i1>)
+declare <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64>, <4 x i64>, <4 x i1>)
+declare <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32>, <8 x i32>, <8 x i1>)
+
+attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
new file mode 100644
index 0000000000000..2350d75b29b44
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mattr=+avx512f -passes=msan -S | FileCheck %s
+
+; Forked from llvm/test/Transforms/InstCombine/X86/x86-vpermi2.ll
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;
+; vXi64
+;
+
+define <2 x i64> @shuffle_vpermv3_v2i64(<2 x i64> %x0, <2 x i64> %x1) #0 {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X1]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> <i64 2, i64 0>, <2 x i64> %x1)
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) #0 {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_unary(
+; CHECK-SAME: <2 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X0]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> <i64 2, i64 0>, <2 x i64> %x0)
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) #0 {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i64> [[M]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP4]], <i64 -1, i64 -5>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 4>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %t = or <2 x i64> %m, <i64 0, i64 4>
+  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1)
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %m) #0 {
+; CHECK-LABEL: define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(
+; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i64> [[M]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP4]], <i64 -1, i64 -3>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 2>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
+; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %t = or <2 x i64> %m, <i64 0, i64 2>
+  %r = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %t, <2 x i64> %x1)
+  ret <2 x i64> %r
+}
+
+define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) #0 {
+; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X1]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[R]]
+;
+  %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> %x1)
+  ret <4 x i64> %r
+}
+
+define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) #0 {
+; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_unary(
+; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X0]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[R]]
+;
+  %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> %x0)
+  ret <4 x i64> %r
+}
+
+define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %m) #0 {
+; CHECK-LABEL: define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(
+; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i64> [[M]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i64> [[TMP4]], <i64 -1, i64 -9, i64 -17, i64 -33>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <4 x i64> [[M]], <i64 0, i64 8, i64 16, i64 32>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]])
+; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i64> [[R]]
+;
+  %t = or <4 x i64> %m, <i64 0, i64 8, i64 16, i64 32>
+  %r = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %t, <4 x i64> %x1)
+  ret <4 x i64> %r
+}
+
+define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64(
+; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X1]])
+; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[R]]
+;
+  %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> %x1)
+  ret <8 x i64> %r
+}
+
+define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) #0 {
+; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_unary(
+; CHECK-SAME: <8 x i64> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X0]])
+; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[R]]
+;
+  %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> %x0)
+  ret <8 x i64> %r
+}
+
+define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %m) #0 {
+; CHECK-LABEL: define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(
+; CHECK-SAME: <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <8 x i64> [[M]], splat (i64 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i64> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i64> [[TMP4]], <i64 -1, i64 -17, i64 -33, i64 -65, i64 -257, i64 -513, i64 -1025, i64 15>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <8 x i64> [[M]], <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]])
+; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i64> [[R]]
+;
+  %t = or <8 x i64> %m, <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
+  %r = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %t, <8 x i64> %x1)
+  ret <8 x i64> %r
+}
+
+;
+; vXi32
+;
+
+define <4 x i32> @shuffle_vpermv3_v4i32(<4 x i32> %x0, <4 x i32> %x1) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X1]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> %x1)
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_unary(
+; CHECK-SAME: <4 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X0]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> %x0)
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %m) #0 {
+; CHECK-LABEL: define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(
+; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP4]], <i32 -1, i32 -9, i32 -17, i32 -33>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <4 x i32> [[M]], <i32 0, i32 8, i32 16, i32 32>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
+; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %t = or <4 x i32> %m, <i32 0, i32 8, i32 16, i32 32>
+  %r = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %t, <4 x i32> %x1)
+  ret <4 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X1]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> %x1)
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_unary(
+; CHECK-SAME: <8 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X0]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> %x0)
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %m) #0 {
+; CHECK-LABEL: define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(
+; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <8 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i32> [[TMP4]], <i32 -1, i32 -17, i32 -33, i32 -65, i32 -257, i32 -513, i32 15, i32 31>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <8 x i32> [[M]], <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
+; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i32> [[R]]
+;
+  %t = or <8 x i32> %m, <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
+  %r = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %t, <8 x i32> %x1)
+  ret <8 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32(
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> %x1)
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_unary(
+; CHECK-SAME: <16 x i32> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X0]])
+; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> %x0)
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %m) #0 {
+; CHECK-LABEL: define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(
+; CHECK-SAME: <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i32> [[M]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i32> [[TMP4]], <i32 -1, i32 -33, i32 -65, i32 -257, i32 -513, i32 -1025, i32 -2049, i32 -4097, i32 -8193, i32 31, i32 63, i32 127, i32 255, i32 511, i32 1023, i32 2047>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <16 x i32> [[M]], <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i32> [[R]]
+;
+  %t = or <16 x i32> %m, <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
+  %r = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %t, <16 x i32> %x1)
+  ret <16 x i32> %r
+}
+
+;
+; vXi16
+;
+
+define <8 x i16> @shuffle_vpermv3_v8i16(<8 x i16> %x0, <8 x i16> %x1) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16(
+; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X1]])
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[R]]
+;
+  %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> %x1)
+  ret <8 x i16> %r
+}
+
+define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_unary(
+; CHECK-SAME: <8 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X0]])
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[R]]
+;
+  %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> %x0)
+  ret <8 x i16> %r
+}
+
+define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %m) #0 {
+; CHECK-LABEL: define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(
+; CHECK-SAME: <8 x i16> [[X0:%.*]], <8 x i16> [[X1:%.*]], <8 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <8 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i16> [[TMP4]], <i16 -1, i16 -17, i16 -33, i16 -65, i16 -257, i16 -513, i16 15, i16 31>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <8 x i16> [[M]], <i16 0, i16 16, i16 32, i16 64, i16 256, i16 512, i16 -16, i16 -32>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]])
+; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <8 x i16> [[R]]
+;
+  %t = or <8 x i16> %m, <i16 0, i16 16, i16 32, i16 64, i16 256, i16 512, i16 -16, i16 -32>
+  %r = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %t, <8 x i16> %x1)
+  ret <8 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> %x0, <16 x i16> %x1) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16(
+; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X1]])
+; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i16> [[R]]
+;
+  %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> %x1)
+  ret <16 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_unary(
+; CHECK-SAME: <16 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X0]])
+; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i16> [[R]]
+;
+  %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> %x0)
+  ret <16 x i16> %r
+}
+
+define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %m) #0 {
+; CHECK-LABEL: define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(
+; CHECK-SAME: <16 x i16> [[X0:%.*]], <16 x i16> [[X1:%.*]], <16 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i16> [[TMP4]], <i16 -1, i16 -33, i16 -65, i16 -257, i16 -513, i16 -1025, i16 -2049, i16 -4097, i16 31, i16 63, i16 127, i16 255, i16 511, i16 1023, i16 2047, i16 4095>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <16 x i16> [[M]], <i16 0, i16 32, i16 64, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 -32, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]])
+; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i16> [[R]]
+;
+  %t = or <16 x i16> %m, <i16 0, i16 32, i16 64, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 -32, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
+  %r = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %t, <16 x i16> %x1)
+  ret <16 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16(
+; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X1]])
+; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i16> [[R]]
+;
+  %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> %x1)
+  ret <32 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_unary(
+; CHECK-SAME: <32 x i16> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X0]])
+; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i16> [[R]]
+;
+  %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> %x0)
+  ret <32 x i16> %r
+}
+
+define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %m) #0 {
+; CHECK-LABEL: define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(
+; CHECK-SAME: <32 x i16> [[X0:%.*]], <32 x i16> [[X1:%.*]], <32 x i16> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <32 x i16> [[M]], splat (i16 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i16> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <32 x i16> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <32 x i16> [[TMP4]], <i16 -1, i16 -65, i16 -129, i16 -257, i16 -513, i16 -1025, i16 -2049, i16 -4097, i16 -1, i16 63, i16 127, i16 255, i16 511, i16 1023, i16 2047, i16 4095, i16 -1, i16 -65, i16 -129, i16 -257, i16 -513, i16 -1025, i16 -2049, i16 -4097, i16 -1, i16 63, i16 127, i16 255, i16 511, i16 1023, i16 2047, i16 4095>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <32 x i16> [[M]], <i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096, i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]])
+; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i16> [[R]]
+;
+  %t = or <32 x i16> %m, <i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096, i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
+  %r = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %t, <32 x i16> %x1)
+  ret <32 x i16> %r
+}
+
+;
+; vXi8
+;
+
+define <16 x i8> @shuffle_vpermv3_v16i8(<16 x i8> %x0, <16 x i8> %x1) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8(
+; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X1]])
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> %x1)
+  ret <16 x i8> %r
+}
+
+define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_unary(
+; CHECK-SAME: <16 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X0]])
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> %x0)
+  ret <16 x i8> %r
+}
+
+define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %m) #0 {
+; CHECK-LABEL: define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(
+; CHECK-SAME: <16 x i8> [[X0:%.*]], <16 x i8> [[X1:%.*]], <16 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <16 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <16 x i8> [[TMP4]], <i8 -1, i8 -33, i8 -65, i8 127, i8 -1, i8 31, i8 63, i8 127, i8 -1, i8 -33, i8 -65, i8 127, i8 -1, i8 31, i8 63, i8 127>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <16 x i8> [[M]], <i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128, i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]])
+; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <16 x i8> [[R]]
+;
+  %t = or <16 x i8> %m, <i8 0, i8 32, i8 64, i8 128, i8 0, i8 -32, i8 -64, i8 -128, i8 0, i8 32, i8 64, i8 128, i8 0, i8 -32, i8 -64, i8 -128>
+  %r = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %t, <16 x i8> %x1)
+  ret <16 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8(
+; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X1]])
+; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i8> [[R]]
+;
+  %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> %x1)
+  ret <32 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_unary(
+; CHECK-SAME: <32 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X0]])
+; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i8> [[R]]
+;
+  %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> %x0)
+  ret <32 x i8> %r
+}
+
+define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %m) #0 {
+; CHECK-LABEL: define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(
+; CHECK-SAME: <32 x i8> [[X0:%.*]], <32 x i8> [[X1:%.*]], <32 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <32 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <32 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <32 x i8> [[TMP4]], <i8 -1, i8 -1, i8 -65, i8 127, i8 -1, i8 -1, i8 63, i8 127, i8 -1, i8 -1, i8 -65, i8 127, i8 -1, i8 -1, i8 63, i8 127, i8 -1, i8 -1, i8 -65, i8 127, i8 -1, i8 -1, i8 63, i8 127, i8 -1, i8 -1, i8 -65, i8 127, i8 -1, i8 -1, i8 63, i8 127>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <32 x i8> [[M]], <i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]])
+; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <32 x i8> [[R]]
+;
+  %t = or <32 x i8> %m, <i8 0, i8 0, i8 64, i8 128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 128, i8 0, i8 0, i8 -64, i8 -128>
+  %r = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %t, <32 x i8> %x1)
+  ret <32 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8(
+; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X1]])
+; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <64 x i8> [[R]]
+;
+  %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> <i8 128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> %x1)
+  ret <64 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_unary(
+; CHECK-SAME: <64 x i8> [[X0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X0]])
+; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <64 x i8> [[R]]
+;
+  %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> <i8 128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> %x0)
+  ret <64 x i8> %r
+}
+
+define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %m) #0 {
+; CHECK-LABEL: define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(
+; CHECK-SAME: <64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]], <64 x i8> [[M:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <64 x i8> [[M]], splat (i8 -1)
+; CHECK-NEXT:    [[TMP5:%.*]] = and <64 x i8> [[TMP4]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = and <64 x i8> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = and <64 x i8> [[TMP4]], <i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127, i8 -1, i8 127>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[T:%.*]] = or <64 x i8> [[M]], <i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128>
+; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]])
+; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    ret <64 x i8> [[R]]
+;
+  %t = or <64 x i8> %m, <i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128, i8 0, i8 128, i8 0, i8 -128>
+  %r = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %t, <64 x i8> %x1)
+  ret <64 x i8> %r
+}
+
+attributes #0 = { sanitize_memory }

From ee6362515dfa4fe4531c7a7690c270313669195b Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Thu, 12 Jun 2025 14:22:50 -0700
Subject: [PATCH 300/851] [RISCV][CostModel] Add additional high LMUL reverse
 tests

---
 llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll     | 6 ++++++
 llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll | 9 +++++++++
 2 files changed, 15 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
index e068ab638d3ae..e1bca71614125 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -86,6 +86,8 @@ define void @vector_reverse() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
@@ -106,6 +108,8 @@ define void @vector_reverse() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
@@ -125,6 +129,8 @@ define void @vector_reverse() {
   %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
   %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
   %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+  %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+  %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
   %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
   %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
   %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
index 7e92d8203a136..8f3219861f2fd 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
@@ -26,8 +26,11 @@ define void @reverse() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -55,8 +58,11 @@ define void @reverse() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -85,9 +91,12 @@ define void @reverse() {
   %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
   %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
   %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
   %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
   %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
   %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>

From e4c32a4147012da735205eb44a45b8be5eea048d Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour@intel.com>
Date: Thu, 12 Jun 2025 14:30:59 -0700
Subject: [PATCH 301/851] [Clang][NFC] Move Input into SmallVector instead of
 copy (#143830)

Static analysis flagged Input as a large object that would benefit from
being moved over being copied.
---
 clang/lib/Frontend/CompilerInstance.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 587b0d1af9c8d..09a66b652518f 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -1191,7 +1191,7 @@ std::unique_ptr<CompilerInstance> CompilerInstance::cloneForModuleCompileImpl(
   FrontendOpts.OriginalModuleMap = std::string(OriginalModuleMapFile);
   // Force implicitly-built modules to hash the content of the module file.
   HSOpts.ModulesHashContent = true;
-  FrontendOpts.Inputs = {Input};
+  FrontendOpts.Inputs = {std::move(Input)};
 
   // Don't free the remapped file buffers; they are owned by our caller.
   PPOpts.RetainRemappedFileBuffers = true;

From 902a991e1245537f5fc11e031409fdd69fba1c06 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Thu, 12 Jun 2025 14:46:37 -0700
Subject: [PATCH 302/851] [BOLT] Make memory profile parsing optional (#129585)

Introduce `parse-mem-profile` option to limit overheads processing
tracing data (Intel PT or ARM ETM). By default, it's enabled for
perf data (existing behavior), unless `itrace` is passed to parse
tracing data where it's extremely expensive. In this case, the flag
needs to be set explicitly if needed.
---
 bolt/lib/Profile/DataAggregator.cpp | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 308346e5d02ce..ade8478f556e9 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -61,6 +61,12 @@ FilterMemProfile("filter-mem-profile",
   cl::init(true),
   cl::cat(AggregatorCategory));
 
+static cl::opt<bool> ParseMemProfile(
+    "parse-mem-profile",
+    cl::desc("enable memory profile parsing if it's present in the input data, "
+             "on by default unless `--itrace` is set."),
+    cl::init(true), cl::cat(AggregatorCategory));
+
 static cl::opt<unsigned long long>
 FilterPID("pid",
   cl::desc("only use samples from process with specified PID"),
@@ -181,6 +187,10 @@ void DataAggregator::start() {
                       "script -F pid,event,ip",
                       /*Wait = */false);
   } else if (!opts::ITraceAggregation.empty()) {
+    // Disable parsing memory profile from trace data, unless requested by user.
+    if (!opts::ParseMemProfile.getNumOccurrences())
+      opts::ParseMemProfile = false;
+
     std::string ItracePerfScriptArgs = llvm::formatv(
         "script -F pid,brstack --itrace={0}", opts::ITraceAggregation);
     launchPerfProcess("branch events with itrace", MainEventsPPI,
@@ -191,12 +201,9 @@ void DataAggregator::start() {
                       /*Wait = */ false);
   }
 
-  // Note: we launch script for mem events regardless of the option, as the
-  //       command fails fairly fast if mem events were not collected.
-  launchPerfProcess("mem events",
-                    MemEventsPPI,
-                    "script -F pid,event,addr,ip",
-                    /*Wait = */false);
+  if (opts::ParseMemProfile)
+    launchPerfProcess("mem events", MemEventsPPI, "script -F pid,event,addr,ip",
+                      /*Wait = */ false);
 
   launchPerfProcess("process events", MMapEventsPPI,
                     "script --show-mmap-events --no-itrace",
@@ -217,7 +224,8 @@ void DataAggregator::abort() {
   sys::Wait(TaskEventsPPI.PI, 1, &Error);
   sys::Wait(MMapEventsPPI.PI, 1, &Error);
   sys::Wait(MainEventsPPI.PI, 1, &Error);
-  sys::Wait(MemEventsPPI.PI, 1, &Error);
+  if (opts::ParseMemProfile)
+    sys::Wait(MemEventsPPI.PI, 1, &Error);
 
   deleteTempFiles();
 
@@ -506,7 +514,8 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
     errs() << "PERF2BOLT: failed to parse samples\n";
 
   // Special handling for memory events
-  if (!prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback))
+  if (opts::ParseMemProfile &&
+      !prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback))
     if (const std::error_code EC = parseMemEvents())
       errs() << "PERF2BOLT: failed to parse memory events: " << EC.message()
              << '\n';

From 1ac61c8334782629462e6bf7c91b3fc8f4e663e8 Mon Sep 17 00:00:00 2001
From: Diego Caballero <dieg0ca6aller0@gmail.com>
Date: Thu, 12 Jun 2025 14:49:00 -0700
Subject: [PATCH 303/851] [mlir][Vector] Remove
 `vector.extractelement/insertelement` from sparse vectorizer (#143270)

This PR is part of the last step to remove `vector.extractelement` and `vector.insertelement` ops.
RFC: https://discourse.llvm.org/t/rfc-psa-remove-vector-extractelement-and-vector-insertelement-ops-in-favor-of-vector-extract-and-vector-insert-ops

It updates the Sparse Vectorizer to use `vector.extract` and `vector.insert` instead of `vector.extractelement` and `vector.insertelement`.
---
 .../Transforms/SparseVectorization.cpp        | 66 ++++++++++++-------
 .../SparseTensor/minipipeline_vector.mlir     |  2 +-
 .../Dialect/SparseTensor/sparse_vector.mlir   |  6 +-
 .../SparseTensor/sparse_vector_chain.mlir     |  2 +-
 .../SparseTensor/vectorize_reduction.mlir     | 10 +--
 5 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
index 3d963dea2f572..359590f2434dc 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseVectorization.cpp
@@ -198,14 +198,14 @@ static Value genVectorReducInit(PatternRewriter &rewriter, Location loc,
   case vector::CombiningKind::ADD:
   case vector::CombiningKind::XOR:
     // Initialize reduction vector to: | 0 | .. | 0 | r |
-    return rewriter.create<vector::InsertElementOp>(
-        loc, r, constantZero(rewriter, loc, vtp),
-        constantIndex(rewriter, loc, 0));
+    return rewriter.create<vector::InsertOp>(loc, r,
+                                             constantZero(rewriter, loc, vtp),
+                                             constantIndex(rewriter, loc, 0));
   case vector::CombiningKind::MUL:
     // Initialize reduction vector to: | 1 | .. | 1 | r |
-    return rewriter.create<vector::InsertElementOp>(
-        loc, r, constantOne(rewriter, loc, vtp),
-        constantIndex(rewriter, loc, 0));
+    return rewriter.create<vector::InsertOp>(loc, r,
+                                             constantOne(rewriter, loc, vtp),
+                                             constantIndex(rewriter, loc, 0));
   case vector::CombiningKind::AND:
   case vector::CombiningKind::OR:
     // Initialize reduction vector to: | r | .. | r | r |
@@ -628,31 +628,49 @@ struct ForOpRewriter : public OpRewritePattern<scf::ForOp> {
   const VL vl;
 };
 
+static LogicalResult cleanReducChain(PatternRewriter &rewriter, Operation *op,
+                                     Value inp) {
+  if (auto redOp = inp.getDefiningOp<vector::ReductionOp>()) {
+    if (auto forOp = redOp.getVector().getDefiningOp<scf::ForOp>()) {
+      if (forOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName())) {
+        rewriter.replaceOp(op, redOp.getVector());
+        return success();
+      }
+    }
+  }
+  return failure();
+}
+
 /// Reduction chain cleanup.
 ///   v = for { }
-///   s = vsum(v)               v = for { }
-///   u = expand(s)       ->    for (v) { }
+///   s = vsum(v)                  v = for { }
+///   u = broadcast(s)       ->    for (v) { }
 ///   for (u) { }
-template <typename VectorOp>
-struct ReducChainRewriter : public OpRewritePattern<VectorOp> {
+struct ReducChainBroadcastRewriter
+    : public OpRewritePattern<vector::BroadcastOp> {
 public:
-  using OpRewritePattern<VectorOp>::OpRewritePattern;
+  using OpRewritePattern<vector::BroadcastOp>::OpRewritePattern;
 
-  LogicalResult matchAndRewrite(VectorOp op,
+  LogicalResult matchAndRewrite(vector::BroadcastOp op,
                                 PatternRewriter &rewriter) const override {
-    Value inp = op.getSource();
-    if (auto redOp = inp.getDefiningOp<vector::ReductionOp>()) {
-      if (auto forOp = redOp.getVector().getDefiningOp<scf::ForOp>()) {
-        if (forOp->hasAttr(LoopEmitter::getLoopEmitterLoopAttrName())) {
-          rewriter.replaceOp(op, redOp.getVector());
-          return success();
-        }
-      }
-    }
-    return failure();
+    return cleanReducChain(rewriter, op, op.getSource());
   }
 };
 
+/// Reduction chain cleanup.
+///   v = for { }
+///   s = vsum(v)               v = for { }
+///   u = insert(s)       ->    for (v) { }
+///   for (u) { }
+struct ReducChainInsertRewriter : public OpRewritePattern<vector::InsertOp> {
+public:
+  using OpRewritePattern<vector::InsertOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::InsertOp op,
+                                PatternRewriter &rewriter) const override {
+    return cleanReducChain(rewriter, op, op.getValueToStore());
+  }
+};
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -668,6 +686,6 @@ void mlir::populateSparseVectorizationPatterns(RewritePatternSet &patterns,
   vector::populateVectorStepLoweringPatterns(patterns);
   patterns.add<ForOpRewriter>(patterns.getContext(), vectorLength,
                               enableVLAVectorization, enableSIMDIndex32);
-  patterns.add<ReducChainRewriter<vector::InsertElementOp>,
-               ReducChainRewriter<vector::BroadcastOp>>(patterns.getContext());
+  patterns.add<ReducChainInsertRewriter, ReducChainBroadcastRewriter>(
+      patterns.getContext());
 }
diff --git a/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir b/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir
index 2475aa5139da4..b2dfbeb53fde8 100755
--- a/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir
+++ b/mlir/test/Dialect/SparseTensor/minipipeline_vector.mlir
@@ -22,7 +22,7 @@
 // CHECK-NOVEC:       }
 //
 // CHECK-VEC-LABEL: func.func @sum_reduction
-// CHECK-VEC:       vector.insertelement
+// CHECK-VEC:       vector.insert
 // CHECK-VEC:       scf.for
 // CHECK-VEC:         vector.create_mask
 // CHECK-VEC:         vector.maskedload
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
index 364ba6e71ff3b..64235c7227800 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector.mlir
@@ -241,7 +241,7 @@ func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>,
 // CHECK-VEC16-DAG:   %[[c1024:.*]] = arith.constant 1024 : index
 // CHECK-VEC16-DAG:   %[[v0:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
 // CHECK-VEC16:       %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
-// CHECK-VEC16:       %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<16xf32>
+// CHECK-VEC16:       %[[r:.*]] = vector.insert %[[l]], %[[v0]] [0] : f32 into vector<16xf32>
 // CHECK-VEC16:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
 // CHECK-VEC16:         %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
 // CHECK-VEC16:         %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
@@ -258,7 +258,7 @@ func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>,
 // CHECK-VEC16-IDX32-DAG:   %[[c1024:.*]] = arith.constant 1024 : index
 // CHECK-VEC16-IDX32-DAG:   %[[v0:.*]] = arith.constant dense<0.000000e+00> : vector<16xf32>
 // CHECK-VEC16-IDX32:       %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
-// CHECK-VEC16-IDX32:       %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<16xf32>
+// CHECK-VEC16-IDX32:       %[[r:.*]] = vector.insert %[[l]], %[[v0]] [0] : f32 into vector<16xf32>
 // CHECK-VEC16-IDX32:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<16xf32>) {
 // CHECK-VEC16-IDX32:         %[[la:.*]] = vector.load %{{.*}}[%[[i]]] : memref<?xf32>, vector<16xf32>
 // CHECK-VEC16-IDX32:         %[[lb:.*]] = vector.load %{{.*}}[%[[i]]] : memref<1024xf32>, vector<16xf32>
@@ -278,7 +278,7 @@ func.func @mul_s(%arga: tensor<1024xf32, #SparseVector>,
 // CHECK-VEC4-SVE:       %[[l:.*]] = memref.load %{{.*}}[] : memref<f32>
 // CHECK-VEC4-SVE:       %[[vscale:.*]] = vector.vscale
 // CHECK-VEC4-SVE:       %[[step:.*]] = arith.muli %[[vscale]], %[[c4]] : index
-// CHECK-VEC4-SVE:       %[[r:.*]] = vector.insertelement %[[l]], %[[v0]][%[[c0]] : index] : vector<[4]xf32>
+// CHECK-VEC4-SVE:       %[[r:.*]] = vector.insert %[[l]], %[[v0]] [0] : f32 into vector<[4]xf32>
 // CHECK-VEC4-SVE:       %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c1024]] step %[[step]] iter_args(%[[red_in:.*]] = %[[r]]) -> (vector<[4]xf32>) {
 // CHECK-VEC4-SVE:         %[[sub:.*]] = affine.min #[[$map]](%[[c1024]], %[[i]])[%[[step]]]
 // CHECK-VEC4-SVE:         %[[mask:.*]] = vector.create_mask %[[sub]] : vector<[4]xi1>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
index f4b565c7f9c8a..0ab72897d7bc3 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
@@ -82,7 +82,7 @@
 // CHECK:               %[[VAL_57:.*]] = arith.select %[[VAL_39]], %[[VAL_56]], %[[VAL_32]] : index
 // CHECK:               scf.yield %[[VAL_55]], %[[VAL_57]], %[[VAL_58:.*]] : index, index, f64
 // CHECK:             } attributes {"Emitted from" = "linalg.generic"}
-// CHECK:             %[[VAL_59:.*]] = vector.insertelement %[[VAL_60:.*]]#2, %[[VAL_4]]{{\[}}%[[VAL_6]] : index] : vector<8xf64>
+// CHECK:             %[[VAL_59:.*]] = vector.insert %[[VAL_60:.*]]#2, %[[VAL_4]] [0] : f64 into vector<8xf64>
 // CHECK:             %[[VAL_61:.*]] = scf.for %[[VAL_62:.*]] = %[[VAL_60]]#0 to %[[VAL_21]] step %[[VAL_3]] iter_args(%[[VAL_63:.*]] = %[[VAL_59]]) -> (vector<8xf64>) {
 // CHECK:               %[[VAL_64:.*]] = affine.min #map(%[[VAL_21]], %[[VAL_62]]){{\[}}%[[VAL_3]]]
 // CHECK:               %[[VAL_65:.*]] = vector.create_mask %[[VAL_64]] : vector<8xi1>
diff --git a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
index 01b717090e87a..6effbbf98abb7 100644
--- a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
+++ b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
@@ -172,7 +172,7 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor<i13>,
 // CHECK-ON:           %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:           %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK-ON:           %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:           %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_4]]{{\[}}%[[VAL_3]] : index] : vector<8xi32>
+// CHECK-ON:           %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_4]] [0] : i32 into vector<8xi32>
 // CHECK-ON:           %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) {
 // CHECK-ON:             %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:             %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
@@ -247,7 +247,7 @@ func.func @sparse_reduction_subi(%argx: tensor<i32>,
 // CHECK-ON:  %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:  %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:  %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:  %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xi32>
+// CHECK-ON:  %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_3]] [0] : i32 into vector<8xi32>
 // CHECK-ON:  %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) {
 // CHECK-ON:    %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:    %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
@@ -323,7 +323,7 @@ func.func @sparse_reduction_xor(%argx: tensor<i32>,
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xi32>
+// CHECK-ON:   %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_3]] [0] : i32 into vector<8xi32>
 // CHECK-ON:   %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xi32>) {
 // CHECK-ON:     %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:     %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
@@ -399,7 +399,7 @@ func.func @sparse_reduction_addi(%argx: tensor<i32>,
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<f32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xf32>
+// CHECK-ON:   %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_3]] [0] : f32 into vector<8xf32>
 // CHECK-ON:   %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xf32>) {
 // CHECK-ON:     %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:     %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>
@@ -475,7 +475,7 @@ func.func @sparse_reduction_subf(%argx: tensor<f32>,
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<f32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
-// CHECK-ON:   %[[VAL_12:.*]] = vector.insertelement %[[VAL_9]], %[[VAL_3]]{{\[}}%[[VAL_4]] : index] : vector<8xf32>
+// CHECK-ON:   %[[VAL_12:.*]] = vector.insert %[[VAL_9]], %[[VAL_3]] [0] : f32 into vector<8xf32>
 // CHECK-ON:   %[[VAL_13:.*]] = scf.for %[[VAL_14:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_2]] iter_args(%[[VAL_15:.*]] = %[[VAL_12]]) -> (vector<8xf32>) {
 // CHECK-ON:     %[[VAL_16:.*]] = affine.min #map(%[[VAL_11]], %[[VAL_14]]){{\[}}%[[VAL_2]]]
 // CHECK-ON:     %[[VAL_17:.*]] = vector.create_mask %[[VAL_16]] : vector<8xi1>

From 4a4035c86b0dd2b1aa09bb2ff4b6788c2bf88745 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Thu, 12 Jun 2025 14:52:07 -0700
Subject: [PATCH 304/851] [CIR] Add support for delegating constructors
 (#143932)

This change adds the necessary support for handling delegating
constructors in ClangIR. The implementation is kept as small as possible
by not handling any other sort of initialization (members, base classes,
etc.). That will be added in a future commit.
---
 clang/include/clang/CIR/MissingFeatures.h     |  2 +-
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          |  3 +-
 clang/lib/CIR/CodeGen/CIRGenClass.cpp         | 38 +++++++++++++
 clang/lib/CIR/CodeGen/CIRGenDecl.cpp          |  7 ++-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          | 21 ++++++++
 clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 10 +++-
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      | 10 +---
 clang/lib/CIR/CodeGen/CIRGenFunction.h        | 13 +++++
 clang/lib/CIR/CodeGen/CIRGenValue.h           | 53 ++++++++++++++++---
 clang/test/CIR/CodeGen/ctor.cpp               | 46 ++++++++++++++++
 10 files changed, 183 insertions(+), 20 deletions(-)

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index fbd15d5c886d2..97b933657d742 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -172,6 +172,7 @@ struct MissingFeatures {
   static bool astVarDeclInterface() { return false; }
   static bool stackSaveOp() { return false; }
   static bool aggValueSlot() { return false; }
+  static bool aggValueSlotMayOverlap() { return false; }
   static bool generateDebugInfo() { return false; }
   static bool pointerOverflowSanitizer() { return false; }
   static bool fpConstraints() { return false; }
@@ -227,7 +228,6 @@ struct MissingFeatures {
   static bool implicitConstructorArgs() { return false; }
   static bool intrinsics() { return false; }
   static bool attributeNoBuiltin() { return false; }
-  static bool emitCtorPrologue() { return false; }
   static bool thunks() { return false; }
   static bool runCleanupsScope() { return false; }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 67c6a8dd3ef5a..5ec720ffd54f1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -189,8 +189,7 @@ void CIRGenFunction::emitDelegateCallArg(CallArgList &args,
     // For the most part, we just need to load the alloca, except that aggregate
     // r-values are actually pointers to temporaries.
   } else {
-    cgm.errorNYI(param->getSourceRange(),
-                 "emitDelegateCallArg: convertTempToRValue");
+    args.add(convertTempToRValue(local, type, loc), type);
   }
 
   // Deactivate the cleanup for the callee-destructed param that was pushed.
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index bb4b451c99247..e59a1fdb837cb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -53,6 +53,21 @@ bool CIRGenFunction::isConstructorDelegationValid(
   return true;
 }
 
+/// This routine generates necessary code to initialize base classes and
+/// non-static data members belonging to this constructor.
+void CIRGenFunction::emitCtorPrologue(const CXXConstructorDecl *cd,
+                                      CXXCtorType ctorType,
+                                      FunctionArgList &args) {
+  if (cd->isDelegatingConstructor())
+    return emitDelegatingCXXConstructorCall(cd, args);
+
+  if (cd->getNumCtorInitializers() != 0) {
+    // There's much more to do here.
+    cgm.errorNYI(cd->getSourceRange(), "emitCtorPrologue: any initializer");
+    return;
+  }
+}
+
 Address CIRGenFunction::loadCXXThisAddress() {
   assert(curFuncDecl && "loading 'this' without a func declaration?");
   assert(isa<CXXMethodDecl>(curFuncDecl));
@@ -102,6 +117,29 @@ void CIRGenFunction::emitDelegateCXXConstructorCall(
                          /*Delegating=*/true, thisAddr, delegateArgs, loc);
 }
 
+void CIRGenFunction::emitDelegatingCXXConstructorCall(
+    const CXXConstructorDecl *ctor, const FunctionArgList &args) {
+  assert(ctor->isDelegatingConstructor());
+
+  Address thisPtr = loadCXXThisAddress();
+
+  assert(!cir::MissingFeatures::objCGC());
+  assert(!cir::MissingFeatures::sanitizers());
+  AggValueSlot aggSlot = AggValueSlot::forAddr(
+      thisPtr, Qualifiers(), AggValueSlot::IsDestructed,
+      AggValueSlot::IsNotAliased, AggValueSlot::MayOverlap,
+      AggValueSlot::IsNotZeroed);
+
+  emitAggExpr(ctor->init_begin()[0]->getInit(), aggSlot);
+
+  const CXXRecordDecl *classDecl = ctor->getParent();
+  if (cgm.getLangOpts().Exceptions && !classDecl->hasTrivialDestructor()) {
+    cgm.errorNYI(ctor->getSourceRange(),
+                 "emitDelegatingCXXConstructorCall: exception");
+    return;
+  }
+}
+
 Address CIRGenFunction::getAddressOfBaseClass(
     Address value, const CXXRecordDecl *derived,
     llvm::iterator_range<CastExpr::path_const_iterator> path,
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 80b0172090aa3..748c2b5f6fceb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -259,7 +259,12 @@ void CIRGenFunction::emitExprAsInit(const Expr *init, const ValueDecl *d,
     return;
   }
   case cir::TEK_Aggregate:
-    emitAggExpr(init, AggValueSlot::forLValue(lvalue));
+    // The overlap flag here should be calculated.
+    assert(!cir::MissingFeatures::aggValueSlotMayOverlap());
+    emitAggExpr(init,
+                AggValueSlot::forLValue(lvalue, AggValueSlot::IsDestructed,
+                                        AggValueSlot::IsNotAliased,
+                                        AggValueSlot::MayOverlap));
     return;
   }
   llvm_unreachable("bad evaluation kind");
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index f1f86509c9a9b..5d04faf443b8d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1261,6 +1261,23 @@ Address CIRGenFunction::emitArrayToPointerDecay(const Expr *e) {
   return Address(ptr, addr.getAlignment());
 }
 
+/// Given the address of a temporary variable, produce an r-value of its type.
+RValue CIRGenFunction::convertTempToRValue(Address addr, clang::QualType type,
+                                           clang::SourceLocation loc) {
+  LValue lvalue = makeAddrLValue(addr, type, AlignmentSource::Decl);
+  switch (getEvaluationKind(type)) {
+  case cir::TEK_Complex:
+    cgm.errorNYI(loc, "convertTempToRValue: complex type");
+    return RValue::get(nullptr);
+  case cir::TEK_Aggregate:
+    cgm.errorNYI(loc, "convertTempToRValue: aggregate type");
+    return RValue::get(nullptr);
+  case cir::TEK_Scalar:
+    return RValue::get(emitLoadOfScalar(lvalue, loc));
+  }
+  llvm_unreachable("bad evaluation kind");
+}
+
 /// Emit an `if` on a boolean condition, filling `then` and `else` into
 /// appropriated regions.
 mlir::LogicalResult CIRGenFunction::emitIfOnBoolExpr(const Expr *cond,
@@ -1473,6 +1490,10 @@ void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e,
     type = Ctor_Complete;
     break;
   case CXXConstructionKind::Delegating:
+    // We should be emitting a constructor; GlobalDecl will assert this
+    type = curGD.getCtorType();
+    delegating = true;
+    break;
   case CXXConstructionKind::VirtualBase:
   case CXXConstructionKind::NonVirtualBase:
     cgm.errorNYI(e->getSourceRange(),
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index f1df1b79fc48e..061123d55b882 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -203,7 +203,11 @@ void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) {
     cgf.cgm.errorNYI("emitInitializationToLValue TEK_Complex");
     break;
   case cir::TEK_Aggregate:
-    cgf.emitAggExpr(e, AggValueSlot::forLValue(lv));
+    cgf.emitAggExpr(e, AggValueSlot::forLValue(lv, AggValueSlot::IsDestructed,
+                                               AggValueSlot::IsNotAliased,
+                                               AggValueSlot::MayOverlap,
+                                               dest.isZeroed()));
+
     return;
   case cir::TEK_Scalar:
     if (lv.isSimple())
@@ -284,6 +288,8 @@ LValue CIRGenFunction::emitAggExprToLValue(const Expr *e) {
   assert(hasAggregateEvaluationKind(e->getType()) && "Invalid argument!");
   Address temp = createMemTemp(e->getType(), getLoc(e->getSourceRange()));
   LValue lv = makeAddrLValue(temp, e->getType());
-  emitAggExpr(e, AggValueSlot::forLValue(lv));
+  emitAggExpr(e, AggValueSlot::forLValue(lv, AggValueSlot::IsNotDestructed,
+                                         AggValueSlot::IsNotAliased,
+                                         AggValueSlot::DoesNotOverlap));
   return lv;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 53c44c6cc7680..c5bd5109343d3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -526,14 +526,8 @@ void CIRGenFunction::emitConstructorBody(FunctionArgList &args) {
   // TODO: in restricted cases, we can emit the vbase initializers of a
   // complete ctor and then delegate to the base ctor.
 
-  assert(!cir::MissingFeatures::emitCtorPrologue());
-  if (ctor->isDelegatingConstructor()) {
-    // This will be handled in emitCtorPrologue, but we should emit a diagnostic
-    // rather than silently fail to delegate.
-    cgm.errorNYI(ctor->getSourceRange(),
-                 "emitConstructorBody: delegating ctor");
-    return;
-  }
+  // Emit the constructor prologue, i.e. the base and member initializers.
+  emitCtorPrologue(ctor, ctorType, args);
 
   // TODO(cir): propagate this result via mlir::logical result. Just unreachable
   // now just to have it handled.
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 361dcd5ef1c31..cf672b0c90e60 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -474,6 +474,9 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   bool shouldNullCheckClassCastValue(const CastExpr *ce);
 
+  RValue convertTempToRValue(Address addr, clang::QualType type,
+                             clang::SourceLocation loc);
+
   static bool
   isConstructorDelegationValid(const clang::CXXConstructorDecl *ctor);
 
@@ -797,6 +800,16 @@ class CIRGenFunction : public CIRGenTypeCache {
                                        const CXXMethodDecl *md,
                                        ReturnValueSlot returnValue);
 
+  void emitCtorPrologue(const clang::CXXConstructorDecl *ctor,
+                        clang::CXXCtorType ctorType, FunctionArgList &args);
+
+  // It's important not to confuse this and emitDelegateCXXConstructorCall.
+  // Delegating constructors are the C++11 feature. The constructor delegate
+  // optimization is used to reduce duplication in the base and complete
+  // constructors where they are substantially the same.
+  void emitDelegatingCXXConstructorCall(const CXXConstructorDecl *ctor,
+                                        const FunctionArgList &args);
+
   mlir::LogicalResult emitDoStmt(const clang::DoStmt &s);
 
   /// Emit an expression as an initializer for an object (variable, field, etc.)
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 208247e16e531..8f52fea31750c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -267,23 +267,64 @@ class AggValueSlot {
   Address addr;
   clang::Qualifiers quals;
 
+  /// This is set to true if some external code is responsible for setting up a
+  /// destructor for the slot.  Otherwise the code which constructs it should
+  /// push the appropriate cleanup.
+  LLVM_PREFERRED_TYPE(bool)
+  [[maybe_unused]] unsigned destructedFlag : 1;
+
   /// This is set to true if the memory in the slot is known to be zero before
   /// the assignment into it.  This means that zero fields don't need to be set.
-  bool zeroedFlag : 1;
+  LLVM_PREFERRED_TYPE(bool)
+  unsigned zeroedFlag : 1;
+
+  /// This is set to true if the slot might be aliased and it's not undefined
+  /// behavior to access it through such an alias.  Note that it's always
+  /// undefined behavior to access a C++ object that's under construction
+  /// through an alias derived from outside the construction process.
+  ///
+  /// This flag controls whether calls that produce the aggregate
+  /// value may be evaluated directly into the slot, or whether they
+  /// must be evaluated into an unaliased temporary and then memcpy'ed
+  /// over.  Since it's invalid in general to memcpy a non-POD C++
+  /// object, it's important that this flag never be set when
+  /// evaluating an expression which constructs such an object.
+  LLVM_PREFERRED_TYPE(bool)
+  [[maybe_unused]] unsigned aliasedFlag : 1;
+
+  /// This is set to true if the tail padding of this slot might overlap
+  /// another object that may have already been initialized (and whose
+  /// value must be preserved by this initialization). If so, we may only
+  /// store up to the dsize of the type. Otherwise we can widen stores to
+  /// the size of the type.
+  LLVM_PREFERRED_TYPE(bool)
+  [[maybe_unused]] unsigned overlapFlag : 1;
 
 public:
+  enum IsDestructed_t { IsNotDestructed, IsDestructed };
   enum IsZeroed_t { IsNotZeroed, IsZeroed };
+  enum IsAliased_t { IsNotAliased, IsAliased };
+  enum Overlap_t { MayOverlap, DoesNotOverlap };
 
-  AggValueSlot(Address addr, clang::Qualifiers quals, bool zeroedFlag)
-      : addr(addr), quals(quals), zeroedFlag(zeroedFlag) {}
+  AggValueSlot(Address addr, clang::Qualifiers quals, bool destructedFlag,
+               bool zeroedFlag, bool aliasedFlag, bool overlapFlag)
+      : addr(addr), quals(quals), destructedFlag(destructedFlag),
+        zeroedFlag(zeroedFlag), aliasedFlag(aliasedFlag),
+        overlapFlag(overlapFlag) {}
 
   static AggValueSlot forAddr(Address addr, clang::Qualifiers quals,
+                              IsDestructed_t isDestructed,
+                              IsAliased_t isAliased, Overlap_t mayOverlap,
                               IsZeroed_t isZeroed = IsNotZeroed) {
-    return AggValueSlot(addr, quals, isZeroed);
+    return AggValueSlot(addr, quals, isDestructed, isZeroed, isAliased,
+                        mayOverlap);
   }
 
-  static AggValueSlot forLValue(const LValue &lv) {
-    return forAddr(lv.getAddress(), lv.getQuals());
+  static AggValueSlot forLValue(const LValue &LV, IsDestructed_t isDestructed,
+                                IsAliased_t isAliased, Overlap_t mayOverlap,
+                                IsZeroed_t isZeroed = IsNotZeroed) {
+    return forAddr(LV.getAddress(), LV.getQuals(), isDestructed, isAliased,
+                   mayOverlap, isZeroed);
   }
 
   clang::Qualifiers getQualifiers() const { return quals; }
diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp
index 3b4191fd74c97..1a36eb0d9d3a6 100644
--- a/clang/test/CIR/CodeGen/ctor.cpp
+++ b/clang/test/CIR/CodeGen/ctor.cpp
@@ -67,3 +67,49 @@ void bar() {
 // CHECK-NEXT:    %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
 // CHECK-NEXT:    cir.call @_ZN13VariadicStrukC1Eiz(%[[S_ADDR]], %[[ONE]], %[[TWO]], %[[THREE]])
 // CHECK-NEXT:    cir.return
+
+struct DelegatingStruk {
+  int a;
+  DelegatingStruk(int n) { a = n; }
+  DelegatingStruk() : DelegatingStruk(0) {}
+};
+
+void bam() {
+  DelegatingStruk s;
+}
+
+// CHECK:       cir.func @_ZN15DelegatingStrukC2Ei(%arg0: !cir.ptr<!rec_DelegatingStruk>
+// CHECK-SAME:                                     %arg1: !s32i
+// CHECK-NEXT:   %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init]
+// CHECK-NEXT:   %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init]
+// CHECK-NEXT:   cir.store %arg0, %[[THIS_ADDR]]
+// CHECK-NEXT:   cir.store %arg1, %[[N_ADDR]]
+// CHECK-NEXT:   %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[N:.*]] = cir.load{{.*}} %[[N_ADDR]]
+// CHECK-NEXT:   %[[A_ADDR:.*]] = cir.get_member %[[THIS]][0] {name = "a"}
+// CHECK-NEXT:   cir.store{{.*}} %[[N]], %[[A_ADDR]]
+// CHECK-NEXT:   cir.return
+
+// CHECK:       cir.func @_ZN15DelegatingStrukC1Ei(%arg0: !cir.ptr<!rec_DelegatingStruk>
+// CHECK-SAME:                                     %arg1: !s32i
+// CHECK-NEXT:   %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init]
+// CHECK-NEXT:   %[[N_ADDR:.*]] = cir.alloca {{.*}} ["n", init]
+// CHECK-NEXT:   cir.store %arg0, %[[THIS_ADDR]]
+// CHECK-NEXT:   cir.store %arg1, %[[N_ADDR]]
+// CHECK-NEXT:   %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[N:.*]] = cir.load{{.*}} %[[N_ADDR]]
+// CHECK-NEXT:   cir.call @_ZN15DelegatingStrukC2Ei(%[[THIS]], %[[N]])
+// CHECK-NEXT:   cir.return
+
+// CHECK: cir.func @_ZN15DelegatingStrukC1Ev(%arg0: !cir.ptr<!rec_DelegatingStruk>
+// CHECK-NEXT:   %[[THIS_ADDR:.*]] = cir.alloca {{.*}} ["this", init]
+// CHECK-NEXT:   cir.store %arg0, %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[THIS:.*]] = cir.load{{.*}} %[[THIS_ADDR]]
+// CHECK-NEXT:   %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CHECK-NEXT:   cir.call @_ZN15DelegatingStrukC1Ei(%[[THIS]], %[[ZERO]])
+// CHECK-NEXT:   cir.return
+
+// CHECK: cir.func @_Z3bamv
+// CHECK-NEXT:    %[[S_ADDR:.*]] = cir.alloca {{.*}} ["s", init]
+// CHECK-NEXT:    cir.call @_ZN15DelegatingStrukC1Ev(%[[S_ADDR]])
+// CHECK-NEXT:    cir.return

From 8a2895ad89793591cd3f0114bc56cd345f651823 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Thu, 12 Jun 2025 14:52:43 -0700
Subject: [PATCH 305/851] [lldb] Implement JSON RPC (newline delimited)
 Transport (#143946)

This PR implements JSON RPC-style (i.e. newline delimited) JSON
transport. I moved the existing transport tests from DAP to Host and
moved the PipeTest base class into TestingSupport so it can be shared by
both.
---
 lldb/include/lldb/Host/JSONTransport.h        |  23 ++-
 lldb/source/Host/common/JSONTransport.cpp     |  37 +++-
 lldb/unittests/DAP/CMakeLists.txt             |   1 -
 lldb/unittests/DAP/TestBase.cpp               |   7 +-
 lldb/unittests/DAP/TestBase.h                 |  13 +-
 lldb/unittests/DAP/TransportTest.cpp          |  98 ----------
 lldb/unittests/Host/CMakeLists.txt            |   1 +
 lldb/unittests/Host/JSONTransportTest.cpp     | 176 ++++++++++++++++++
 .../TestingSupport/Host/PipeTestUtilities.h   |  28 +++
 9 files changed, 260 insertions(+), 124 deletions(-)
 delete mode 100644 lldb/unittests/DAP/TransportTest.cpp
 create mode 100644 lldb/unittests/Host/JSONTransportTest.cpp
 create mode 100644 lldb/unittests/TestingSupport/Host/PipeTestUtilities.h

diff --git a/lldb/include/lldb/Host/JSONTransport.h b/lldb/include/lldb/Host/JSONTransport.h
index 4db5e417ea852..4087cdf2b42f7 100644
--- a/lldb/include/lldb/Host/JSONTransport.h
+++ b/lldb/include/lldb/Host/JSONTransport.h
@@ -51,17 +51,17 @@ class TransportTimeoutError : public llvm::ErrorInfo<TransportTimeoutError> {
   }
 };
 
-class TransportClosedError : public llvm::ErrorInfo<TransportClosedError> {
+class TransportInvalidError : public llvm::ErrorInfo<TransportInvalidError> {
 public:
   static char ID;
 
-  TransportClosedError() = default;
+  TransportInvalidError() = default;
 
   void log(llvm::raw_ostream &OS) const override {
-    OS << "transport is closed";
+    OS << "transport IO object invalid";
   }
   std::error_code convertToErrorCode() const override {
-    return llvm::inconvertibleErrorCode();
+    return std::make_error_code(std::errc::not_connected);
   }
 };
 
@@ -121,6 +121,21 @@ class HTTPDelimitedJSONTransport : public JSONTransport {
   static constexpr llvm::StringLiteral kHeaderSeparator = "\r\n\r\n";
 };
 
+/// A transport class for JSON RPC.
+class JSONRPCTransport : public JSONTransport {
+public:
+  JSONRPCTransport(lldb::IOObjectSP input, lldb::IOObjectSP output)
+      : JSONTransport(input, output) {}
+  virtual ~JSONRPCTransport() = default;
+
+protected:
+  virtual llvm::Error WriteImpl(const std::string &message) override;
+  virtual llvm::Expected<std::string>
+  ReadImpl(const std::chrono::microseconds &timeout) override;
+
+  static constexpr llvm::StringLiteral kMessageSeparator = "\n";
+};
+
 } // namespace lldb_private
 
 #endif
diff --git a/lldb/source/Host/common/JSONTransport.cpp b/lldb/source/Host/common/JSONTransport.cpp
index 103c76d25daf7..1a0851d5c4365 100644
--- a/lldb/source/Host/common/JSONTransport.cpp
+++ b/lldb/source/Host/common/JSONTransport.cpp
@@ -31,7 +31,7 @@ static Expected<std::string>
 ReadFull(IOObject &descriptor, size_t length,
          std::optional<std::chrono::microseconds> timeout = std::nullopt) {
   if (!descriptor.IsValid())
-    return llvm::make_error<TransportClosedError>();
+    return llvm::make_error<TransportInvalidError>();
 
   bool timeout_supported = true;
   // FIXME: SelectHelper does not work with NativeFile on Win32.
@@ -92,7 +92,7 @@ void JSONTransport::Log(llvm::StringRef message) {
 Expected<std::string>
 HTTPDelimitedJSONTransport::ReadImpl(const std::chrono::microseconds &timeout) {
   if (!m_input || !m_input->IsValid())
-    return createStringError("transport output is closed");
+    return llvm::make_error<TransportInvalidError>();
 
   IOObject *input = m_input.get();
   Expected<std::string> message_header =
@@ -131,7 +131,7 @@ HTTPDelimitedJSONTransport::ReadImpl(const std::chrono::microseconds &timeout) {
 
 Error HTTPDelimitedJSONTransport::WriteImpl(const std::string &message) {
   if (!m_output || !m_output->IsValid())
-    return llvm::make_error<TransportClosedError>();
+    return llvm::make_error<TransportInvalidError>();
 
   Log(llvm::formatv("<-- {0}", message).str());
 
@@ -142,6 +142,35 @@ Error HTTPDelimitedJSONTransport::WriteImpl(const std::string &message) {
   return m_output->Write(Output.data(), num_bytes).takeError();
 }
 
+Expected<std::string>
+JSONRPCTransport::ReadImpl(const std::chrono::microseconds &timeout) {
+  if (!m_input || !m_input->IsValid())
+    return make_error<TransportInvalidError>();
+
+  IOObject *input = m_input.get();
+  Expected<std::string> raw_json =
+      ReadUntil(*input, kMessageSeparator, timeout);
+  if (!raw_json)
+    return raw_json.takeError();
+
+  Log(llvm::formatv("--> {0}", *raw_json).str());
+
+  return *raw_json;
+}
+
+Error JSONRPCTransport::WriteImpl(const std::string &message) {
+  if (!m_output || !m_output->IsValid())
+    return llvm::make_error<TransportInvalidError>();
+
+  Log(llvm::formatv("<-- {0}", message).str());
+
+  std::string Output;
+  llvm::raw_string_ostream OS(Output);
+  OS << message << kMessageSeparator;
+  size_t num_bytes = Output.size();
+  return m_output->Write(Output.data(), num_bytes).takeError();
+}
+
 char TransportEOFError::ID;
 char TransportTimeoutError::ID;
-char TransportClosedError::ID;
+char TransportInvalidError::ID;
diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt
index 37a6a81ad12a0..ee623d341ec69 100644
--- a/lldb/unittests/DAP/CMakeLists.txt
+++ b/lldb/unittests/DAP/CMakeLists.txt
@@ -7,7 +7,6 @@ add_lldb_unittest(DAPTests
   LLDBUtilsTest.cpp
   ProtocolTypesTest.cpp
   TestBase.cpp
-  TransportTest.cpp
   VariablesTest.cpp
 
   LINK_COMPONENTS
diff --git a/lldb/unittests/DAP/TestBase.cpp b/lldb/unittests/DAP/TestBase.cpp
index 4063b34250312..27ad42686fbbf 100644
--- a/lldb/unittests/DAP/TestBase.cpp
+++ b/lldb/unittests/DAP/TestBase.cpp
@@ -28,13 +28,8 @@ using lldb_private::File;
 using lldb_private::NativeFile;
 using lldb_private::Pipe;
 
-void PipeBase::SetUp() {
-  ASSERT_THAT_ERROR(input.CreateNew(false).ToError(), Succeeded());
-  ASSERT_THAT_ERROR(output.CreateNew(false).ToError(), Succeeded());
-}
-
 void TransportBase::SetUp() {
-  PipeBase::SetUp();
+  PipeTest::SetUp();
   to_dap = std::make_unique<Transport>(
       "to_dap", nullptr,
       std::make_shared<NativeFile>(input.GetReadFileDescriptor(),
diff --git a/lldb/unittests/DAP/TestBase.h b/lldb/unittests/DAP/TestBase.h
index 70b3985271a92..25d37013954d5 100644
--- a/lldb/unittests/DAP/TestBase.h
+++ b/lldb/unittests/DAP/TestBase.h
@@ -8,26 +8,17 @@
 
 #include "DAP.h"
 #include "Protocol/ProtocolBase.h"
+#include "TestingSupport/Host/PipeTestUtilities.h"
 #include "Transport.h"
-#include "lldb/Host/Pipe.h"
 #include "llvm/ADT/StringRef.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
 namespace lldb_dap_tests {
 
-/// A base class for tests that need a pair of pipes for communication.
-class PipeBase : public testing::Test {
-protected:
-  lldb_private::Pipe input;
-  lldb_private::Pipe output;
-
-  void SetUp() override;
-};
-
 /// A base class for tests that need transport configured for communicating DAP
 /// messages.
-class TransportBase : public PipeBase {
+class TransportBase : public PipeTest {
 protected:
   std::unique_ptr<lldb_dap::Transport> to_dap;
   std::unique_ptr<lldb_dap::Transport> from_dap;
diff --git a/lldb/unittests/DAP/TransportTest.cpp b/lldb/unittests/DAP/TransportTest.cpp
deleted file mode 100644
index aaf257993af23..0000000000000
--- a/lldb/unittests/DAP/TransportTest.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-//===-- TransportTest.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Transport.h"
-#include "Protocol/ProtocolBase.h"
-#include "TestBase.h"
-#include "lldb/Host/File.h"
-#include "lldb/Host/Pipe.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Testing/Support/Error.h"
-#include "gtest/gtest.h"
-#include <chrono>
-#include <memory>
-#include <optional>
-
-using namespace llvm;
-using namespace lldb;
-using namespace lldb_dap;
-using namespace lldb_dap_tests;
-using namespace lldb_dap::protocol;
-using lldb_private::File;
-using lldb_private::NativeFile;
-using lldb_private::Pipe;
-using lldb_private::TransportEOFError;
-using lldb_private::TransportTimeoutError;
-
-class TransportTest : public PipeBase {
-protected:
-  std::unique_ptr<Transport> transport;
-
-  void SetUp() override {
-    PipeBase::SetUp();
-    transport = std::make_unique<Transport>(
-        "stdio", nullptr,
-        std::make_shared<NativeFile>(input.GetReadFileDescriptor(),
-                                     File::eOpenOptionReadOnly,
-                                     NativeFile::Unowned),
-        std::make_shared<NativeFile>(output.GetWriteFileDescriptor(),
-                                     File::eOpenOptionWriteOnly,
-                                     NativeFile::Unowned));
-  }
-};
-
-TEST_F(TransportTest, MalformedRequests) {
-  std::string malformed_header = "COnTent-LenGth: -1{}\r\n\r\nnotjosn";
-  ASSERT_THAT_EXPECTED(
-      input.Write(malformed_header.data(), malformed_header.size()),
-      Succeeded());
-  ASSERT_THAT_EXPECTED(
-      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
-      FailedWithMessage(
-          "expected 'Content-Length: ' and got 'COnTent-LenGth: '"));
-}
-
-TEST_F(TransportTest, Read) {
-  std::string json =
-      R"json({"seq": 1, "type": "request", "command": "abc"})json";
-  std::string message =
-      formatv("Content-Length: {0}\r\n\r\n{1}", json.size(), json).str();
-  ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()),
-                       Succeeded());
-  ASSERT_THAT_EXPECTED(
-      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
-      HasValue(testing::VariantWith<Request>(testing::FieldsAre(
-          /*seq=*/1, /*command=*/"abc", /*arguments=*/std::nullopt))));
-}
-
-TEST_F(TransportTest, ReadWithTimeout) {
-  ASSERT_THAT_EXPECTED(
-      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
-      Failed<TransportTimeoutError>());
-}
-
-TEST_F(TransportTest, ReadWithEOF) {
-  input.CloseWriteFileDescriptor();
-  ASSERT_THAT_EXPECTED(
-      transport->Read<protocol::Message>(std::chrono::milliseconds(1)),
-      Failed<TransportEOFError>());
-}
-
-TEST_F(TransportTest, Write) {
-  ASSERT_THAT_ERROR(transport->Write(Event{"my-event", std::nullopt}),
-                    Succeeded());
-  output.CloseWriteFileDescriptor();
-  char buf[1024];
-  Expected<size_t> bytes_read =
-      output.Read(buf, sizeof(buf), std::chrono::milliseconds(1));
-  ASSERT_THAT_EXPECTED(bytes_read, Succeeded());
-  ASSERT_EQ(
-      StringRef(buf, *bytes_read),
-      StringRef("Content-Length: 43\r\n\r\n"
-                R"json({"event":"my-event","seq":0,"type":"event"})json"));
-}
diff --git a/lldb/unittests/Host/CMakeLists.txt b/lldb/unittests/Host/CMakeLists.txt
index 5b8deed00af88..3b20f1d723d18 100644
--- a/lldb/unittests/Host/CMakeLists.txt
+++ b/lldb/unittests/Host/CMakeLists.txt
@@ -6,6 +6,7 @@ set (FILES
   HostInfoTest.cpp
   HostTest.cpp
   MainLoopTest.cpp
+  JSONTransportTest.cpp
   NativeProcessProtocolTest.cpp
   PipeTest.cpp
   ProcessLaunchInfoTest.cpp
diff --git a/lldb/unittests/Host/JSONTransportTest.cpp b/lldb/unittests/Host/JSONTransportTest.cpp
new file mode 100644
index 0000000000000..f1ec5e03bbeca
--- /dev/null
+++ b/lldb/unittests/Host/JSONTransportTest.cpp
@@ -0,0 +1,176 @@
+//===-- JSONTransportTest.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Host/JSONTransport.h"
+#include "TestingSupport/Host/PipeTestUtilities.h"
+#include "lldb/Host/File.h"
+
+using namespace llvm;
+using namespace lldb_private;
+
+namespace {
+template <typename T> class JSONTransportTest : public PipeTest {
+protected:
+  std::unique_ptr<JSONTransport> transport;
+
+  void SetUp() override {
+    PipeTest::SetUp();
+    transport = std::make_unique<T>(
+        std::make_shared<NativeFile>(input.GetReadFileDescriptor(),
+                                     File::eOpenOptionReadOnly,
+                                     NativeFile::Unowned),
+        std::make_shared<NativeFile>(output.GetWriteFileDescriptor(),
+                                     File::eOpenOptionWriteOnly,
+                                     NativeFile::Unowned));
+  }
+};
+
+class HTTPDelimitedJSONTransportTest
+    : public JSONTransportTest<HTTPDelimitedJSONTransport> {
+public:
+  using JSONTransportTest::JSONTransportTest;
+};
+
+class JSONRPCTransportTest : public JSONTransportTest<JSONRPCTransport> {
+public:
+  using JSONTransportTest::JSONTransportTest;
+};
+
+struct JSONTestType {
+  std::string str;
+};
+
+llvm::json::Value toJSON(const JSONTestType &T) {
+  return llvm::json::Object{{"str", T.str}};
+}
+
+bool fromJSON(const llvm::json::Value &V, JSONTestType &T, llvm::json::Path P) {
+  llvm::json::ObjectMapper O(V, P);
+  return O && O.map("str", T.str);
+}
+} // namespace
+
+TEST_F(HTTPDelimitedJSONTransportTest, MalformedRequests) {
+  std::string malformed_header = "COnTent-LenGth: -1{}\r\n\r\nnotjosn";
+  ASSERT_THAT_EXPECTED(
+      input.Write(malformed_header.data(), malformed_header.size()),
+      Succeeded());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      FailedWithMessage(
+          "expected 'Content-Length: ' and got 'COnTent-LenGth: '"));
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, Read) {
+  std::string json = R"json({"str": "foo"})json";
+  std::string message =
+      formatv("Content-Length: {0}\r\n\r\n{1}", json.size(), json).str();
+  ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()),
+                       Succeeded());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      HasValue(testing::FieldsAre(/*str=*/"foo")));
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, ReadWithEOF) {
+  input.CloseWriteFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportEOFError>());
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, ReadAfterClosed) {
+  input.CloseReadFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, InvalidTransport) {
+  transport = std::make_unique<HTTPDelimitedJSONTransport>(nullptr, nullptr);
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportInvalidError>());
+}
+
+TEST_F(HTTPDelimitedJSONTransportTest, Write) {
+  ASSERT_THAT_ERROR(transport->Write(JSONTestType{"foo"}), Succeeded());
+  output.CloseWriteFileDescriptor();
+  char buf[1024];
+  Expected<size_t> bytes_read =
+      output.Read(buf, sizeof(buf), std::chrono::milliseconds(1));
+  ASSERT_THAT_EXPECTED(bytes_read, Succeeded());
+  ASSERT_EQ(StringRef(buf, *bytes_read), StringRef("Content-Length: 13\r\n\r\n"
+                                                   R"json({"str":"foo"})json"));
+}
+
+TEST_F(JSONRPCTransportTest, MalformedRequests) {
+  std::string malformed_header = "notjson\n";
+  ASSERT_THAT_EXPECTED(
+      input.Write(malformed_header.data(), malformed_header.size()),
+      Succeeded());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
+
+TEST_F(JSONRPCTransportTest, Read) {
+  std::string json = R"json({"str": "foo"})json";
+  std::string message = formatv("{0}\n", json).str();
+  ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()),
+                       Succeeded());
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      HasValue(testing::FieldsAre(/*str=*/"foo")));
+}
+
+TEST_F(JSONRPCTransportTest, ReadWithEOF) {
+  input.CloseWriteFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportEOFError>());
+}
+
+TEST_F(JSONRPCTransportTest, ReadAfterClosed) {
+  input.CloseReadFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
+
+TEST_F(JSONRPCTransportTest, Write) {
+  ASSERT_THAT_ERROR(transport->Write(JSONTestType{"foo"}), Succeeded());
+  output.CloseWriteFileDescriptor();
+  char buf[1024];
+  Expected<size_t> bytes_read =
+      output.Read(buf, sizeof(buf), std::chrono::milliseconds(1));
+  ASSERT_THAT_EXPECTED(bytes_read, Succeeded());
+  ASSERT_EQ(StringRef(buf, *bytes_read), StringRef(R"json({"str":"foo"})json"
+                                                   "\n"));
+}
+
+TEST_F(JSONRPCTransportTest, InvalidTransport) {
+  transport = std::make_unique<JSONRPCTransport>(nullptr, nullptr);
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportInvalidError>());
+}
+
+#ifndef _WIN32
+TEST_F(HTTPDelimitedJSONTransportTest, ReadWithTimeout) {
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportTimeoutError>());
+}
+
+TEST_F(JSONRPCTransportTest, ReadWithTimeout) {
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      Failed<TransportTimeoutError>());
+}
+#endif
diff --git a/lldb/unittests/TestingSupport/Host/PipeTestUtilities.h b/lldb/unittests/TestingSupport/Host/PipeTestUtilities.h
new file mode 100644
index 0000000000000..50d5d4117c898
--- /dev/null
+++ b/lldb/unittests/TestingSupport/Host/PipeTestUtilities.h
@@ -0,0 +1,28 @@
+//===-- PipeTestUtilities.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_UNITTESTS_TESTINGSUPPORT_PIPETESTUTILITIES_H
+#define LLDB_UNITTESTS_TESTINGSUPPORT_PIPETESTUTILITIES_H
+
+#include "lldb/Host/Pipe.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+
+/// A base class for tests that need a pair of pipes for communication.
+class PipeTest : public testing::Test {
+protected:
+  lldb_private::Pipe input;
+  lldb_private::Pipe output;
+
+  void SetUp() override {
+    ASSERT_THAT_ERROR(input.CreateNew(false).ToError(), llvm::Succeeded());
+    ASSERT_THAT_ERROR(output.CreateNew(false).ToError(), llvm::Succeeded());
+  }
+};
+
+#endif

From 26f91610011f1a23cb306d61bbc1fafded7d077d Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Thu, 12 Jun 2025 23:13:13 +0100
Subject: [PATCH 306/851] [lit] cleanup unused imports (#143930)

Remove imports that are not used in some lit test files.
---
 lld/test/Unit/lit.cfg.py         | 1 -
 lldb/test/API/lit.cfg.py         | 2 --
 lldb/test/Shell/lit.cfg.py       | 5 +----
 lldb/test/lit.cfg.py             | 3 ---
 llvm/utils/lit/lit/LitConfig.py  | 6 ++----
 llvm/utils/lit/lit/TestRunner.py | 6 ------
 llvm/utils/lit/lit/discovery.py  | 2 +-
 llvm/utils/lit/lit/worker.py     | 2 --
 8 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/lld/test/Unit/lit.cfg.py b/lld/test/Unit/lit.cfg.py
index 1cf890a05cb28..47375db517e96 100644
--- a/lld/test/Unit/lit.cfg.py
+++ b/lld/test/Unit/lit.cfg.py
@@ -3,7 +3,6 @@
 # Configuration file for the 'lit' test runner.
 
 import os
-import subprocess
 
 import lit.formats
 
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 646a446c86fdb..04b360e8d3307 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -9,8 +9,6 @@
 import subprocess
 import sys
 
-import lit.formats
-
 # name: The name of this test suite.
 config.name = "lldb-api"
 
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index ab6113767187a..6f0e017fb7cb9 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -7,12 +7,9 @@
 import shutil
 import site
 import subprocess
-import sys
 
-import lit.formats
+import lit.util
 from lit.llvm import llvm_config
-from lit.llvm.subst import FindTool
-from lit.llvm.subst import ToolSubst
 
 site.addsitedir(os.path.dirname(__file__))
 from helper import toolchain
diff --git a/lldb/test/lit.cfg.py b/lldb/test/lit.cfg.py
index eefc32aabd16d..6a4255c2627d9 100644
--- a/lldb/test/lit.cfg.py
+++ b/lldb/test/lit.cfg.py
@@ -2,9 +2,6 @@
 
 import os
 
-import lit.formats
-from lit.llvm import llvm_config
-
 # This is the top level configuration. Most of these configuration options will
 # be overriden by individual lit configuration files in the test
 # subdirectories. Anything configured here will *not* be loaded when pointing
diff --git a/llvm/utils/lit/lit/LitConfig.py b/llvm/utils/lit/lit/LitConfig.py
index cb4aef6f72a87..5bb2d3c5c986c 100644
--- a/llvm/utils/lit/lit/LitConfig.py
+++ b/llvm/utils/lit/lit/LitConfig.py
@@ -1,14 +1,12 @@
 from __future__ import absolute_import
+
 import inspect
 import os
-import platform
 import sys
 
-import lit.Test
-import lit.formats
-import lit.TestingConfig
 import lit.util
 
+
 # LitConfig must be a new style class for properties to work
 class LitConfig(object):
     """LitConfig - Configuration data for a 'lit' test runner instance, shared
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 73db67aedb739..1d3bf8e4e8df1 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -1,7 +1,4 @@
 from __future__ import absolute_import
-import errno
-import io
-import itertools
 import getopt
 import os, signal, subprocess, sys
 import re
@@ -12,11 +9,8 @@
 import shutil
 import tempfile
 import threading
-import typing
 from typing import Optional, Tuple
 
-import io
-
 try:
     from StringIO import StringIO
 except ImportError:
diff --git a/llvm/utils/lit/lit/discovery.py b/llvm/utils/lit/lit/discovery.py
index 2e7f90c6bb0c9..2e93bacc12368 100644
--- a/llvm/utils/lit/lit/discovery.py
+++ b/llvm/utils/lit/lit/discovery.py
@@ -6,8 +6,8 @@
 import os
 import sys
 
+from lit import Test, util
 from lit.TestingConfig import TestingConfig
-from lit import LitConfig, Test, util
 
 
 def chooseConfigFileFromDir(dir, config_names):
diff --git a/llvm/utils/lit/lit/worker.py b/llvm/utils/lit/lit/worker.py
index 8e78bfd45d38b..dbc3ab53bc627 100644
--- a/llvm/utils/lit/lit/worker.py
+++ b/llvm/utils/lit/lit/worker.py
@@ -12,8 +12,6 @@
 import traceback
 
 import lit.Test
-import lit.util
-
 
 _lit_config = None
 _parallelism_semaphores = None

From 2ee8fdbfddcca86ac079104718e6fda3aabed0eb Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Thu, 12 Jun 2025 22:14:28 +0000
Subject: [PATCH 307/851] [libc] Prevent building wchar on MacOS (#143978)

Prevent building wchar on macos as it depends on uchar.h which isn't
available
---
 libc/src/__support/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 309cde76370f0..7e85136c08851 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -391,6 +391,10 @@ add_subdirectory(fixed_point)
 
 add_subdirectory(time)
 
-add_subdirectory(wchar)
+# Requires access to uchar header which is not on macos
+# Therefore, cannot currently build this on macos in overlay mode
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+  add_subdirectory(wchar)
+endif()
 
 add_subdirectory(math)

From 2b8f82b8308fc9df0a74cdd61a1257d9eb51189c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Thu, 12 Jun 2025 23:40:57 +0100
Subject: [PATCH 308/851] [RISCV] Remove implicit $vl def on vleNff pseudos
 (#143935)

In #90049 we removed the side effect flag on the vleNff pseudos with the
reasoning that we modelled the effect of setting vl as an output
operand.

This extends this further by removing the implicit def on vl, inserting
it back in RISCVInsertVSETVLI when we also emit the PseudoReadVL.

The motiviation for this is to make it easier to handle vleff in more
places in RISCVVectorPeephole in a follow up patch, which in turn will
make migrating the last vmerge peephole over from RISCVISelDAGToDAG
easier.

Some of these tests claim that the vleff shouldn't be deleted when none
of its values are used, but these are from the initial commit in
3b5430eb0dad5. I'm not sure if these still hold today?

This also moves the fault-only-first predicate to
RISCVInstrPredicates.td since we can't rely on the implicit vl operand
anymore.
---
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp            |  2 +-
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp         |  5 +++--
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp             |  5 -----
 llvm/lib/Target/RISCV/RISCVInstrInfo.h               |  2 --
 llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td      |  6 ------
 llvm/lib/Target/RISCV/RISCVInstrPredicates.td        |  7 +++++++
 llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll | 12 ++++++------
 llvm/test/CodeGen/RISCV/rvv/vleff.ll                 |  5 -----
 llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll     |  4 ----
 llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll     |  4 ----
 10 files changed, 17 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 72f1596d79a02..4fb71a3ed0006 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -1101,7 +1101,7 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
   if (RISCVII::hasRoundModeOp(TSFlags))
     --NumOps;
 
-  bool hasVLOutput = RISCV::isFaultFirstLoad(*MI);
+  bool hasVLOutput = RISCVInstrInfo::isFaultOnlyFirstLoad(*MI);
   for (unsigned OpNo = 0; OpNo != NumOps; ++OpNo) {
     const MachineOperand &MO = MI->getOperand(OpNo);
     // Skip vl output. It should be the second output.
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 53192e9dfe6c6..9a513891b765d 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -1253,7 +1253,7 @@ void RISCVInsertVSETVLI::transferAfter(VSETVLIInfo &Info,
     return;
   }
 
-  if (RISCV::isFaultFirstLoad(MI)) {
+  if (RISCVInstrInfo::isFaultOnlyFirstLoad(MI)) {
     // Update AVL to vl-output of the fault first load.
     assert(MI.getOperand(1).getReg().isVirtual());
     if (LIS) {
@@ -1756,7 +1756,7 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
 void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
   for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
     MachineInstr &MI = *I++;
-    if (RISCV::isFaultFirstLoad(MI)) {
+    if (RISCVInstrInfo::isFaultOnlyFirstLoad(MI)) {
       Register VLOutput = MI.getOperand(1).getReg();
       assert(VLOutput.isVirtual());
       if (!MI.getOperand(1).isDead()) {
@@ -1774,6 +1774,7 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
       }
       // We don't use the vl output of the VLEFF/VLSEGFF anymore.
       MI.getOperand(1).setReg(RISCV::X0);
+      MI.addRegisterDefined(RISCV::VL, MRI->getTargetRegisterInfo());
     }
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 86a4e8e370ee6..e5d29e1a8b476 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4568,11 +4568,6 @@ RISCV::isRVVSpillForZvlsseg(unsigned Opcode) {
   }
 }
 
-bool RISCV::isFaultFirstLoad(const MachineInstr &MI) {
-  return MI.getNumExplicitDefs() == 2 &&
-         MI.modifiesRegister(RISCV::VL, /*TRI=*/nullptr) && !MI.isInlineAsm();
-}
-
 bool RISCV::hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2) {
   int16_t MI1FrmOpIdx =
       RISCV::getNamedOperandIdx(MI1.getOpcode(), RISCV::OpName::frm);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index b099acd81e995..8260949cf918a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -357,8 +357,6 @@ bool isRVVSpill(const MachineInstr &MI);
 std::optional<std::pair<unsigned, unsigned>>
 isRVVSpillForZvlsseg(unsigned Opcode);
 
-bool isFaultFirstLoad(const MachineInstr &MI);
-
 // Return true if both input instructions have equal rounding mode. If at least
 // one of the instructions does not have rounding mode, false will be returned.
 bool hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 281f8d55932b9..f9fc6f0be3804 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -6144,8 +6144,6 @@ defm PseudoVSUX : VPseudoIStore<Ordered=false>;
 // 7.7. Unit-stride Fault-Only-First Loads
 //===----------------------------------------------------------------------===//
 
-// vleff may update VL register
-let Defs = [VL] in
 defm PseudoVL : VPseudoFFLoad;
 
 //===----------------------------------------------------------------------===//
@@ -6159,11 +6157,7 @@ defm PseudoVSSEG : VPseudoUSSegStore;
 defm PseudoVSSSEG : VPseudoSSegStore;
 defm PseudoVSOXSEG : VPseudoISegStore<Ordered=true>;
 defm PseudoVSUXSEG : VPseudoISegStore<Ordered=false>;
-
-// vlseg<nf>e<eew>ff.v may update VL register
-let Defs = [VL] in {
 defm PseudoVLSEG : VPseudoUSSegLoadFF;
-}
 
 //===----------------------------------------------------------------------===//
 // 11. Vector Integer Arithmetic Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 4c37cb7e393bf..1057eeee31d65 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -129,6 +129,13 @@ def isVSlideInstr
                       !instances<Pseudo>("^PseudoVSLIDEUP_VI.*")
                      ])>>>;
 
+def isFaultOnlyFirstLoad
+    : TIIPredicate<"isFaultOnlyFirstLoad",
+                    MCReturnStatement<
+                      CheckOpcode<
+                       !instances<Pseudo>(
+                          "^PseudoVL(SEG[2-8])?E(8|16|32|64)FF_V.*")>>>;
+
 def isNonZeroLoadImmediate
     : TIIPredicate<"isNonZeroLoadImmediate",
                    MCReturnStatement<CheckAll<[
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index e4235d03cda31..db31866b56372 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -14,7 +14,7 @@ define i64 @test_vleff_nxv8i8(ptr %p, i64 %vl) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_:%[0-9]+]]:vr, [[PseudoVLE8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.p, align 1)
+  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_:%[0-9]+]]:vr, [[PseudoVLE8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */ :: (load unknown-size from %ir.p, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -31,7 +31,7 @@ define i64 @test_vleff_nxv8i8_tu(<vscale x 8 x i8> %passthru, ptr %p, i64 %vl) {
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vr = COPY $v8
-  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_:%[0-9]+]]:vr, [[PseudoVLE8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1 [[COPY2]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.p, align 1)
+  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_:%[0-9]+]]:vr, [[PseudoVLE8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1 [[COPY2]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */ :: (load unknown-size from %ir.p, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -50,7 +50,7 @@ define i64 @test_vleff_nxv8i8_mask(<vscale x 8 x i8> %maskedoff, ptr %p, <vscale
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_MASK:%[0-9]+]]:vrnov0, [[PseudoVLE8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vl :: (load unknown-size from %ir.p, align 1)
+  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_MASK:%[0-9]+]]:vrnov0, [[PseudoVLE8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_MASK1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -66,7 +66,7 @@ define i64 @test_vlseg2ff_nxv8i8(ptr %base, i64 %vl, ptr %outvl) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */ :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -83,7 +83,7 @@ define i64 @test_vlseg2ff_nxv8i8_tu(target("riscv.vector.tuple", <vscale x 8 x i
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vrn2m1 = COPY $v8_v9
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 [[COPY2]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 [[COPY2]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */ :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
@@ -102,7 +102,7 @@ define i64 @test_vlseg2ff_nxv8i8_mask(target("riscv.vector.tuple", <vscale x 8 x
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrn2m1nov0 = COPY $v8_v9
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_MASK:%[0-9]+]]:vrn2m1nov0, [[PseudoVLSEG2E8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_MASK:%[0-9]+]]:vrn2m1nov0, [[PseudoVLSEG2E8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */ :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_MASK1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff.ll b/llvm/test/CodeGen/RISCV/rvv/vleff.ll
index 1f3959c1eac8e..4c989ce87290e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff.ll
@@ -3016,12 +3016,9 @@ entry:
   ret void
 }
 
-; Test with both outputs dead. Make sure the vleff isn't deleted.
 define void @intrinsic_vleff_dead_all(ptr %0, iXLen %1, ptr %2) nounwind {
 ; CHECK-LABEL: intrinsic_vleff_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vle64ff.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %a = call { <vscale x 1 x double>, iXLen } @llvm.riscv.vleff.nxv1f64(
@@ -3034,8 +3031,6 @@ entry:
 define void @intrinsic_vleff_mask_dead_all(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i1> %2, iXLen %3) nounwind {
 ; CHECK-LABEL: intrinsic_vleff_mask_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu
-; CHECK-NEXT:    vle64ff.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %a = call { <vscale x 1 x double>, iXLen } @llvm.riscv.vleff.mask.nxv1f64(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
index d8bff08ea5513..333ba83f69eef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv32-dead.ll
@@ -66,8 +66,6 @@ entry:
 define void @test_vlseg2ff_dead_all(ptr %base, i32 %vl) {
 ; CHECK-LABEL: test_vlseg2ff_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vlseg2e16ff.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   tail call {target("riscv.vector.tuple", <vscale x 32 x i8>, 2), i32} @llvm.riscv.vlseg2ff.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
@@ -77,8 +75,6 @@ entry:
 define void @test_vlseg2ff_mask_dead_all(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %val, ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg2ff_mask_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vlseg2e16ff.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   tail call {target("riscv.vector.tuple", <vscale x 32 x i8>, 2), i32} @llvm.riscv.vlseg2ff.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %val, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
index 05a5be295cc71..b9e45cc190a65 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlsegff-rv64-dead.ll
@@ -66,8 +66,6 @@ entry:
 define void @test_vlseg2ff_dead_all(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg2ff_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT:    vlseg2e16ff.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   tail call {target("riscv.vector.tuple", <vscale x 32 x i8>, 2), i64} @llvm.riscv.vlseg2ff.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
@@ -77,8 +75,6 @@ entry:
 define void @test_vlseg2ff_mask_dead_all(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %val, ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg2ff_mask_dead_all:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu
-; CHECK-NEXT:    vlseg2e16ff.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   tail call {target("riscv.vector.tuple", <vscale x 32 x i8>, 2), i64} @llvm.riscv.vlseg2ff.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %val, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)

From 703e4460228fa5893dd0dff514ce44442b310b5e Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou@intel.com>
Date: Fri, 13 Jun 2025 06:45:28 +0800
Subject: [PATCH 309/851] [Clang] Add check for -mstack-alignment (#143124)

Currently the assertion in Alignment.h is triggered if a wrong value is
passed -mstack-alignment option:
```
Assertion `(Value == 0 || llvm::isPowerOf2_64(Value)) && "Alignment is neither 0 nor
a power of 2"' failed.
```

Added check in clang driver for the value of -mstack-alignment option,
and emitted an error message when the wrong value was passed.
---
 clang/lib/Driver/ToolChains/Clang.cpp | 14 +++++++++++---
 clang/test/Driver/stack-alignment.c   | 11 +++++++++++
 2 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 clang/test/Driver/stack-alignment.c

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 1d11be1d82be8..15acb88c1a8fd 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6907,9 +6907,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   Args.addOptInFlag(CmdArgs, options::OPT_mstackrealign,
                     options::OPT_mno_stackrealign);
 
-  if (Args.hasArg(options::OPT_mstack_alignment)) {
-    StringRef alignment = Args.getLastArgValue(options::OPT_mstack_alignment);
-    CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + alignment));
+  if (const Arg *A = Args.getLastArg(options::OPT_mstack_alignment)) {
+    StringRef Value = A->getValue();
+    int64_t Alignment = 0;
+    if (Value.getAsInteger(10, Alignment) || Alignment < 0)
+      D.Diag(diag::err_drv_invalid_argument_to_option)
+          << Value << A->getOption().getName();
+    else if (Alignment & (Alignment - 1))
+      D.Diag(diag::err_drv_alignment_not_power_of_two)
+          << A->getAsString(Args) << Value;
+    else
+      CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + Value));
   }
 
   if (Args.hasArg(options::OPT_mstack_probe_size)) {
diff --git a/clang/test/Driver/stack-alignment.c b/clang/test/Driver/stack-alignment.c
new file mode 100644
index 0000000000000..e1e62c05c32ab
--- /dev/null
+++ b/clang/test/Driver/stack-alignment.c
@@ -0,0 +1,11 @@
+// RUN: not %clang -### -mstack-alignment=-1 %s 2>&1 | FileCheck %s --check-prefix=CHECK_NEG_1
+// RUN: %clang -### -mstack-alignment=0 %s 2>&1 | FileCheck %s --check-prefix=CHECK_0
+// RUN: %clang -### -mstack-alignment=1 %s 2>&1 | FileCheck %s --check-prefix=CHECK_1
+// RUN: %clang -### -mstack-alignment=4 %s 2>&1 | FileCheck %s --check-prefix=CHECK_4
+// RUN: not %clang -### -mstack-alignment=5 %s 2>&1 | FileCheck %s --check-prefix=CHECK_5
+
+// CHECK_NEG_1: error: invalid argument '-1' to -mstack-alignment=
+// CHECK_0: -mstack-alignment=0
+// CHECK_1: -mstack-alignment=1
+// CHECK_4: -mstack-alignment=4
+// CHECK_5: error: alignment is not a power of 2 in '-mstack-alignment=5'

From 28c14d475fbd16d07db88c8d12edddfe9cc226ab Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Thu, 12 Jun 2025 15:57:58 -0700
Subject: [PATCH 310/851] [libc] Independent strcat/strncat/stpcpy (#142643)

The previous implementations called other entrypoints. This patch fixes
strcat, strncat, and stpcpy to be properly independent.
---
 libc/src/string/CMakeLists.txt |  3 ---
 libc/src/string/stpcpy.cpp     |  5 ++---
 libc/src/string/strcat.cpp     |  9 +++++----
 libc/src/string/strncat.cpp    | 10 +++++-----
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index c3b414d872858..8784bc3750cb1 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -87,7 +87,6 @@ add_entrypoint_object(
   HDRS
     stpcpy.h
   DEPENDS
-    .mempcpy
     .string_utils
 )
 
@@ -108,7 +107,6 @@ add_entrypoint_object(
   HDRS
     strcat.h
   DEPENDS
-    .strcpy
     .string_utils
     libc.include.llvm-libc-types.size_t
 )
@@ -265,7 +263,6 @@ add_entrypoint_object(
   HDRS
     strncat.h
   DEPENDS
-    .strncpy
     .string_utils
     libc.include.llvm-libc-types.size_t
 )
diff --git a/libc/src/string/stpcpy.cpp b/libc/src/string/stpcpy.cpp
index 979edd72c1f1d..48c0db950ace0 100644
--- a/libc/src/string/stpcpy.cpp
+++ b/libc/src/string/stpcpy.cpp
@@ -8,7 +8,6 @@
 
 #include "src/string/stpcpy.h"
 #include "src/__support/macros/config.h"
-#include "src/string/mempcpy.h"
 #include "src/string/string_utils.h"
 
 #include "src/__support/common.h"
@@ -18,8 +17,8 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(char *, stpcpy,
                    (char *__restrict dest, const char *__restrict src)) {
   size_t size = internal::string_length(src) + 1;
-  char *result =
-      reinterpret_cast<char *>(LIBC_NAMESPACE::mempcpy(dest, src, size));
+  __builtin_memcpy(dest, src, size);
+  char *result = dest + size;
 
   if (result != nullptr)
     return result - 1;
diff --git a/libc/src/string/strcat.cpp b/libc/src/string/strcat.cpp
index 6a6f068bd4759..7dce6d15a65c1 100644
--- a/libc/src/string/strcat.cpp
+++ b/libc/src/string/strcat.cpp
@@ -9,7 +9,6 @@
 #include "src/string/strcat.h"
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/null_check.h"
-#include "src/string/strcpy.h"
 #include "src/string/string_utils.h"
 
 #include "src/__support/common.h"
@@ -21,9 +20,11 @@ LLVM_LIBC_FUNCTION(char *, strcat,
   LIBC_CRASH_ON_NULLPTR(dest);
   LIBC_CRASH_ON_NULLPTR(src);
   size_t dest_length = internal::string_length(dest);
-  size_t src_length = internal::string_length(src);
-  LIBC_NAMESPACE::strcpy(dest + dest_length, src);
-  dest[dest_length + src_length] = '\0';
+  size_t i;
+  for (i = 0; src[i] != '\0'; ++i)
+    dest[dest_length + i] = src[i];
+
+  dest[dest_length + i] = '\0';
   return dest;
 }
 
diff --git a/libc/src/string/strncat.cpp b/libc/src/string/strncat.cpp
index 4926b7d244d12..6d8bb69607486 100644
--- a/libc/src/string/strncat.cpp
+++ b/libc/src/string/strncat.cpp
@@ -10,7 +10,6 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/null_check.h"
 #include "src/string/string_utils.h"
-#include "src/string/strncpy.h"
 
 #include "src/__support/common.h"
 
@@ -23,11 +22,12 @@ LLVM_LIBC_FUNCTION(char *, strncat,
     LIBC_CRASH_ON_NULLPTR(dest);
     LIBC_CRASH_ON_NULLPTR(src);
   }
-  size_t src_length = internal::string_length(src);
-  size_t copy_amount = src_length > count ? count : src_length;
   size_t dest_length = internal::string_length(dest);
-  LIBC_NAMESPACE::strncpy(dest + dest_length, src, copy_amount);
-  dest[dest_length + copy_amount] = '\0';
+  size_t i;
+  for (i = 0; i < count && src[i] != '\0'; ++i)
+    dest[dest_length + i] = src[i];
+
+  dest[dest_length + i] = '\0';
   return dest;
 }
 

From 32e1360aaa9fbf5e388f9d061fa004b02c0a1359 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Thu, 12 Jun 2025 16:16:14 -0700
Subject: [PATCH 311/851] [CIR][NFC] Fix build problems with [[maybe_unused]]
 (#143994)

A recent commit introduced the use of [[maybe_unused]] following
LLVM_PREFFERED_TYPE(bool) on a member variable declaration. I compiled
it with clang 14.0, which doesn't support the `preferred_type` attribute
so I didn't notice a problem. However, starting with clang 18.0, this
reports an error ("an attribute list cannot appear here") because of the
mixing of attribute styles.

This change fixes the problem by replacing [[maybe_unused]] with
LLVM_ATTRIBUTE_UNUSED.
---
 clang/lib/CIR/CodeGen/CIRGenValue.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 8f52fea31750c..258ae306f693d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -271,7 +271,7 @@ class AggValueSlot {
   /// destructor for the slot.  Otherwise the code which constructs it should
   /// push the appropriate cleanup.
   LLVM_PREFERRED_TYPE(bool)
-  [[maybe_unused]] unsigned destructedFlag : 1;
+  LLVM_ATTRIBUTE_UNUSED unsigned destructedFlag : 1;
 
   /// This is set to true if the memory in the slot is known to be zero before
   /// the assignment into it.  This means that zero fields don't need to be set.
@@ -290,7 +290,7 @@ class AggValueSlot {
   /// object, it's important that this flag never be set when
   /// evaluating an expression which constructs such an object.
   LLVM_PREFERRED_TYPE(bool)
-  [[maybe_unused]] unsigned aliasedFlag : 1;
+  LLVM_ATTRIBUTE_UNUSED unsigned aliasedFlag : 1;
 
   /// This is set to true if the tail padding of this slot might overlap
   /// another object that may have already been initialized (and whose
@@ -298,7 +298,7 @@ class AggValueSlot {
   /// store up to the dsize of the type. Otherwise we can widen stores to
   /// the size of the type.
   LLVM_PREFERRED_TYPE(bool)
-  [[maybe_unused]] unsigned overlapFlag : 1;
+  LLVM_ATTRIBUTE_UNUSED unsigned overlapFlag : 1;
 
 public:
   enum IsDestructed_t { IsNotDestructed, IsDestructed };

From 70f44ec6feba56b076cf65e02b8876f185efdab9 Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Thu, 12 Jun 2025 16:49:25 -0700
Subject: [PATCH 312/851] [libc][NFC] Accept doc fix (#143996)

Docgen updates the docs when the config options are changed. This update
has been waiting since https://github.com/llvm/llvm-project/pull/143187.
---
 libc/docs/configure.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 8d53390ae19bf..109412225634f 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -29,7 +29,7 @@ to learn about the defaults for your platform and target.
     - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack.
     - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
 * **"errno" options**
-    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
+    - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE.
 * **"general" options**
     - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
 * **"math" options**

From e1bb35d067568794585544b8942638c467d13bea Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Thu, 12 Jun 2025 16:52:32 -0700
Subject: [PATCH 313/851] [bazel] Fix modules build for llvm-libc (speculative)
 (#143995)

---
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel          | 7 +++++++
 .../llvm-project-overlay/libc/test/UnitTest/BUILD.bazel    | 1 +
 2 files changed, 8 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 0cedad2859247..84a6b7d230442 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1619,6 +1619,10 @@ libc_header_library(
 libc_support_library(
     name = "__support_libc_errno",
     hdrs = ["src/__support/libc_errno.h"],
+    deps = [
+        ":__support_macros_config",
+        ":hdr_errno_macros",
+    ],
 )
 
 libc_support_library(
@@ -1981,6 +1985,9 @@ libc_support_library(
 libc_support_library(
     name = "__support_math_exp_float_constants",
     hdrs = ["src/__support/math/exp_float_constants.h"],
+    deps = [
+        ":__support_macros_config",
+    ],
 )
 
 libc_support_library(
diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
index b37ec19330236..2354337da2dc5 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
@@ -58,6 +58,7 @@ libc_test_library(
         "//libc:__support_fputil_fp_bits",
         "//libc:__support_fputil_fpbits_str",
         "//libc:__support_fputil_rounding_mode",
+        "//libc:__support_libc_errno",
         "//libc:__support_macros_config",
         "//libc:__support_macros_properties_architectures",
         "//libc:__support_macros_properties_types",

From 3ddd137332237918fbb6175c20327fe765d2c4ad Mon Sep 17 00:00:00 2001
From: Zhen Wang <37195552+wangzpgi@users.noreply.github.com>
Date: Thu, 12 Jun 2025 17:08:49 -0700
Subject: [PATCH 314/851] [flang] [cuda] Move SetImplicityCUDADevice after
 symbols in block construct are converted to objects (#143791)

`SetImplicitCUDADevice` looks for `symbol.has<ObjectEntityDetails>()` to
set the device attribute before symbols inside block constructs are
converted to ObjectEntity. Fix is to move the call to
`SetImplicitCUDADevice` after those symbols are converted.
---
 flang/lib/Semantics/resolve-names.cpp | 74 ++++++++++++++-------------
 flang/test/Semantics/cuf21.cuf        | 13 +++--
 2 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 7db447aee0026..e23e91b674a73 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -2828,6 +2828,16 @@ Scope &ScopeHandler::NonDerivedTypeScope() {
   return currScope_->IsDerivedType() ? currScope_->parent() : *currScope_;
 }
 
+static void SetImplicitCUDADevice(Symbol &symbol) {
+  if (auto *object{symbol.detailsIf<ObjectEntityDetails>()}) {
+    if (!object->cudaDataAttr() && !IsValue(symbol) &&
+        !IsFunctionResult(symbol)) {
+      // Implicitly set device attribute if none is set in device context.
+      object->set_cudaDataAttr(common::CUDADataAttr::Device);
+    }
+  }
+}
+
 void ScopeHandler::PushScope(Scope::Kind kind, Symbol *symbol) {
   PushScope(currScope().MakeScope(kind, symbol));
 }
@@ -2867,9 +2877,35 @@ void ScopeHandler::PopScope() {
   // Entities that are not yet classified as objects or procedures are now
   // assumed to be objects.
   // TODO: Statement functions
+  bool inDeviceSubprogram{false};
+  const Symbol *scopeSym{currScope().GetSymbol()};
+  if (currScope().kind() == Scope::Kind::BlockConstruct) {
+    scopeSym = GetProgramUnitContaining(currScope()).GetSymbol();
+  }
+  if (scopeSym) {
+    if (auto *details{scopeSym->detailsIf<SubprogramDetails>()}) {
+      // Check the current procedure is a device procedure to apply implicit
+      // attribute at the end.
+      if (auto attrs{details->cudaSubprogramAttrs()}) {
+        if (*attrs == common::CUDASubprogramAttrs::Device ||
+            *attrs == common::CUDASubprogramAttrs::Global ||
+            *attrs == common::CUDASubprogramAttrs::Grid_Global) {
+          inDeviceSubprogram = true;
+        }
+      }
+    }
+  }
   for (auto &pair : currScope()) {
     ConvertToObjectEntity(*pair.second);
   }
+
+  // Apply CUDA device attributes if in a device subprogram
+  if (inDeviceSubprogram && currScope().kind() == Scope::Kind::BlockConstruct) {
+    for (auto &pair : currScope()) {
+      SetImplicitCUDADevice(*pair.second);
+    }
+  }
+
   funcResultStack_.Pop();
   // If popping back into a global scope, pop back to the top scope.
   Scope *hermetic{context().currentHermeticModuleFileScope()};
@@ -9555,40 +9591,11 @@ void ResolveNamesVisitor::CreateGeneric(const parser::GenericSpec &x) {
   info.Resolve(&MakeSymbol(symbolName, Attrs{}, std::move(genericDetails)));
 }
 
-static void SetImplicitCUDADevice(bool inDeviceSubprogram, Symbol &symbol) {
-  if (inDeviceSubprogram && symbol.has<ObjectEntityDetails>()) {
-    auto *object{symbol.detailsIf<ObjectEntityDetails>()};
-    if (!object->cudaDataAttr() && !IsValue(symbol) &&
-        !IsFunctionResult(symbol)) {
-      // Implicitly set device attribute if none is set in device context.
-      object->set_cudaDataAttr(common::CUDADataAttr::Device);
-    }
-  }
-}
-
 void ResolveNamesVisitor::FinishSpecificationPart(
     const std::list<parser::DeclarationConstruct> &decls) {
   misparsedStmtFuncFound_ = false;
   funcResultStack().CompleteFunctionResultType();
   CheckImports();
-  bool inDeviceSubprogram{false};
-  Symbol *scopeSym{currScope().symbol()};
-  if (currScope().kind() == Scope::Kind::BlockConstruct) {
-    scopeSym = currScope().parent().symbol();
-  }
-  if (scopeSym) {
-    if (auto *details{scopeSym->detailsIf<SubprogramDetails>()}) {
-      // Check the current procedure is a device procedure to apply implicit
-      // attribute at the end.
-      if (auto attrs{details->cudaSubprogramAttrs()}) {
-        if (*attrs == common::CUDASubprogramAttrs::Device ||
-            *attrs == common::CUDASubprogramAttrs::Global ||
-            *attrs == common::CUDASubprogramAttrs::Grid_Global) {
-          inDeviceSubprogram = true;
-        }
-      }
-    }
-  }
   for (auto &pair : currScope()) {
     auto &symbol{*pair.second};
     if (inInterfaceBlock()) {
@@ -9623,11 +9630,6 @@ void ResolveNamesVisitor::FinishSpecificationPart(
         SetBindNameOn(symbol);
       }
     }
-    if (currScope().kind() == Scope::Kind::BlockConstruct) {
-      // Only look for specification in BlockConstruct. Other cases are done in
-      // ResolveSpecificationParts.
-      SetImplicitCUDADevice(inDeviceSubprogram, symbol);
-    }
   }
   currScope().InstantiateDerivedTypes();
   for (const auto &decl : decls) {
@@ -10187,7 +10189,9 @@ void ResolveNamesVisitor::ResolveSpecificationParts(ProgramTree &node) {
     }
     ApplyImplicitRules(symbol);
     // Apply CUDA implicit attributes if needed.
-    SetImplicitCUDADevice(inDeviceSubprogram, symbol);
+    if (inDeviceSubprogram) {
+      SetImplicitCUDADevice(symbol);
+    }
     // Main program local objects usually don't have an implied SAVE attribute,
     // as one might think, but in the exceptional case of a derived type
     // local object that contains a coarray, we have to mark it as an
diff --git a/flang/test/Semantics/cuf21.cuf b/flang/test/Semantics/cuf21.cuf
index 077657c8a52d5..db32f1dbd0e7b 100644
--- a/flang/test/Semantics/cuf21.cuf
+++ b/flang/test/Semantics/cuf21.cuf
@@ -13,18 +13,21 @@ contains
     implicit none
     logical, intent(in), value :: back
     real(4) :: mval
-
-    call maxlocUpdate(mval, back)
-
+  block
+    integer(8) :: xloc
+    call maxlocUpdate(mval, xloc, back)
+  end block
   end subroutine maxlocPartialMaskR_32F1D
 
-  attributes(device) subroutine maxlocUpdateR_32F(mval, back)
+  attributes(device) subroutine maxlocUpdateR_32F(mval, xloc, back)
     real(4) :: mval
+    integer(8) :: xloc
     logical :: back
   end subroutine maxlocUpdateR_32F
 
-  attributes(device) subroutine maxlocUpdateR_64F(mval, back)
+  attributes(device) subroutine maxlocUpdateR_64F(mval, xloc, back)
     real(8) :: mval
+    integer(8) :: xloc
     logical :: back
   end subroutine maxlocUpdateR_64F
 end module

From 22f9b4aa1dad597d908be77be1e10ba4c77330ce Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Thu, 12 Jun 2025 20:08:55 -0400
Subject: [PATCH 315/851] Reland [HIP] use offload wrapper for non-device-only
 non-rdc (#132869) (#143964)

Fixed two issues:

1. assertion with -flto. the linker wrapper action is missing for
wrapping the device binary. Added it for -flto.

2. when there are two HIP files, the kernels in the second file were not
found. This is because the -r option of linker wrapper assumes offload
entries section of HIP to be hip_offloading_entries but it is actually
llvm_offload_entries, causing the offload entries sections not made
unique for different object files. Fixed and tested working for both
-fgpu-rdc and -fno-gpu-rdc case with and without -r
---
 clang/lib/CodeGen/CGCUDANV.cpp                |  3 +-
 clang/lib/Driver/Driver.cpp                   | 62 +++++++++-----
 clang/lib/Driver/ToolChains/Clang.cpp         | 18 ++++-
 clang/test/Driver/hip-binding.hip             |  6 +-
 clang/test/Driver/hip-phases.hip              | 58 ++++++++-----
 clang/test/Driver/hip-toolchain-no-rdc.hip    | 81 ++++++++++++-------
 clang/test/Driver/linker-wrapper.c            |  1 +
 .../ClangLinkerWrapper.cpp                    | 31 ++++---
 8 files changed, 168 insertions(+), 92 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 38f514304df5e..dd26be74e561b 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,7 +1280,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
     return nullptr;
   }
   if (CGM.getLangOpts().OffloadViaLLVM ||
-      (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
+      (CGM.getLangOpts().OffloadingNewDriver &&
+       (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
     createOffloadingEntries();
   else
     return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index eb60d907d2218..060f76fb653c9 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4423,6 +4423,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
                    options::OPT_no_offload_new_driver,
                    C.isOffloadingHostKind(Action::OFK_Cuda));
 
+  bool HIPNoRDC =
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
   // Builder to be used to build offloading actions.
   std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
       !UseNewOffloadingDriver
@@ -4556,7 +4560,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Check if this Linker Job should emit a static library.
     if (ShouldEmitStaticLibrary(Args)) {
       LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
-    } else if (UseNewOffloadingDriver ||
+    } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
                Args.hasArg(options::OPT_offload_link)) {
       LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
       LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4867,10 +4871,31 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
                                        const InputTy &Input, StringRef CUID,
                                        Action *HostAction) const {
   // Don't build offloading actions if explicitly disabled or we do not have a
-  // valid source input and compile action to embed it in. If preprocessing only
-  // ignore embedding.
-  if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
-      !(isa<CompileJobAction>(HostAction) ||
+  // valid source input.
+  if (offloadHostOnly() || !types::isSrcFile(Input.first))
+    return HostAction;
+
+  bool HIPNoRDC =
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
+  // For HIP non-rdc non-device-only compilation, create a linker wrapper
+  // action for each host object to link, bundle and wrap device files in
+  // it.
+  if ((isa<AssembleJobAction>(HostAction) ||
+       (isa<BackendJobAction>(HostAction) &&
+        HostAction->getType() == types::TY_LTO_BC)) &&
+      HIPNoRDC && !offloadDeviceOnly()) {
+    ActionList AL{HostAction};
+    HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
+    HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
+                                         /*BoundArch=*/nullptr);
+    return HostAction;
+  }
+
+  // Don't build offloading actions if we do not have a compile action. If
+  // preprocessing only ignore embedding.
+  if (!(isa<CompileJobAction>(HostAction) ||
         getFinalPhase(Args) == phases::Preprocess))
     return HostAction;
 
@@ -4966,12 +4991,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
       }
     }
 
-    // Compiling HIP in non-RDC mode requires linking each action individually.
+    // Compiling HIP in device-only non-RDC mode requires linking each action
+    // individually.
     for (Action *&A : DeviceActions) {
       if ((A->getType() != types::TY_Object &&
            A->getType() != types::TY_LTO_BC) ||
-          Kind != Action::OFK_HIP ||
-          Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+          !HIPNoRDC || !offloadDeviceOnly())
         continue;
       ActionList LinkerInput = {A};
       A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4995,12 +5020,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     }
   }
 
-  // HIP code in non-RDC mode will bundle the output if it invoked the linker.
+  // HIP code in device-only non-RDC mode will bundle the output if it invoked
+  // the linker.
   bool ShouldBundleHIP =
-      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      HIPNoRDC && offloadDeviceOnly() &&
       Args.hasFlag(options::OPT_gpu_bundle_output,
                    options::OPT_no_gpu_bundle_output, true) &&
-      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
       !llvm::any_of(OffloadActions,
                     [](Action *A) { return A->getType() != types::TY_Image; });
 
@@ -5020,11 +5045,9 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
              nullptr, Action::OFK_Cuda);
-  } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
-             !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                           false)) {
-    // If we are not in RDC-mode we just emit the final HIP fatbinary for each
-    // translation unit, linking each input individually.
+  } else if (HIPNoRDC && offloadDeviceOnly()) {
+    // If we are in device-only non-RDC-mode we just emit the final HIP
+    // fatbinary for each translation unit, linking each input individually.
     Action *FatbinAction =
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5177,8 +5200,11 @@ Action *Driver::ConstructPhaseAction(
         (((Input->getOffloadingToolChain() &&
            Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
           TargetDeviceOffloadKind == Action::OFK_HIP) &&
-         (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                       false) ||
+         ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                        false) ||
+           (Args.hasFlag(options::OPT_offload_new_driver,
+                         options::OPT_no_offload_new_driver, false) &&
+            !offloadDeviceOnly())) ||
           TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
       types::ID Output =
           Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 15acb88c1a8fd..8556bcadf0915 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7710,7 +7710,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fcuda-include-gpubinary");
     CmdArgs.push_back(CudaDeviceInput->getFilename());
   } else if (!HostOffloadingInputs.empty()) {
-    if ((IsCuda || IsHIP) && !IsRDCMode) {
+    if (IsCuda && !IsRDCMode) {
       assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
       CmdArgs.push_back("-fcuda-include-gpubinary");
       CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9257,8 +9257,20 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   // Add the linker arguments to be forwarded by the wrapper.
   CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
                                        LinkCommand->getExecutable()));
-  for (const char *LinkArg : LinkCommand->getArguments())
-    CmdArgs.push_back(LinkArg);
+
+  // We use action type to differentiate two use cases of the linker wrapper.
+  // TY_Image for normal linker wrapper work.
+  // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
+  // object.
+  assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
+  if (JA.getType() == types::TY_Object) {
+    CmdArgs.append({"-o", Output.getFilename()});
+    for (auto Input : Inputs)
+      CmdArgs.push_back(Input.getFilename());
+    CmdArgs.push_back("-r");
+  } else
+    for (const char *LinkArg : LinkCommand->getArguments())
+      CmdArgs.push_back(LinkArg);
 
   addOffloadCompressArgs(Args, CmdArgs);
 
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 57e57194ec87b..d8b3f1e242018 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
 // RUN:        -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
 //      LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
 // LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 5fd2c0216ccc3..d8a58b78d6d5c 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,39 +8,57 @@
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -flto -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWLTO %s
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,RDC %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
 //
 // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 
 // BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
 // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
-
-// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
+// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+
+// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
+// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
 
 //
 // Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 6c69d1d51a260..ddd251b67cc57 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
 
 // RUN: touch %t/a.o %t/b.o
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,22 +47,23 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// OLD-SAME: "-emit-obj"
+// NEW-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
 
 //
 // Compile device code in a.cu to code object for gfx900.
@@ -70,62 +71,71 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
 
 //
 // Bundle and embed device code in host object for a.cu.
 //
 
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
+// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
+// NEW:   "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
+
 //
 // Compile device code in b.hip to code object for gfx803.
 //
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" "[[OBJ_DEV_B_803]]"
 
 //
 // Compile device code in b.hip to code object for gfx900.
@@ -133,40 +143,49 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_900:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" "[[OBJ_DEV_B_900]]"
 
 //
 // Bundle and embed device code in host object for b.hip.
 //
 
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_B:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_B:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
+// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
+// NEW:   "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
+
 //
 // Link host objects.
 //
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index a7e98e7351d98..80b1a5745a123 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -223,6 +223,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a -input={{/dev/null|NUL}} -input={{.*}} -output={{.*}}
 // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r
 // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
+// RELOCATABLE-LINK-HIP: --rename-section llvm_offload_entries
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 95b6f74af1f13..b8019fac4c2ec 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -310,22 +310,21 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) {
   // Remove the old .llvm.offloading section to prevent further linking.
   ObjcopyArgs.emplace_back("--remove-section");
   ObjcopyArgs.emplace_back(".llvm.offloading");
-  for (StringRef Prefix : {"omp", "cuda", "hip"}) {
-    auto Section = (Prefix + "_offloading_entries").str();
-    // Rename the offloading entires to make them private to this link unit.
-    ObjcopyArgs.emplace_back("--rename-section");
-    ObjcopyArgs.emplace_back(
-        Args.MakeArgString(Section + "=" + Section + Suffix));
-
-    // Rename the __start_ / __stop_ symbols appropriately to iterate over the
-    // newly renamed section containing the offloading entries.
-    ObjcopyArgs.emplace_back("--redefine-sym");
-    ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
-                                                "__start_" + Section + Suffix));
-    ObjcopyArgs.emplace_back("--redefine-sym");
-    ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
-                                                "__stop_" + Section + Suffix));
-  }
+  StringRef Prefix = "llvm";
+  auto Section = (Prefix + "llvm_offload_entries").str();
+  // Rename the offloading entires to make them private to this link unit.
+  ObjcopyArgs.emplace_back("--rename-section");
+  ObjcopyArgs.emplace_back(
+      Args.MakeArgString(Section + "=" + Section + Suffix));
+
+  // Rename the __start_ / __stop_ symbols appropriately to iterate over the
+  // newly renamed section containing the offloading entries.
+  ObjcopyArgs.emplace_back("--redefine-sym");
+  ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
+                                              "__start_" + Section + Suffix));
+  ObjcopyArgs.emplace_back("--redefine-sym");
+  ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
+                                              "__stop_" + Section + Suffix));
 
   if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs))
     return Err;

From 029f8892a500594bd044507352503249fd641e6c Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Fri, 13 Jun 2025 02:00:14 +0100
Subject: [PATCH 316/851] [RISCV] Fold vmv.v.v into vleNff.v (#143981)

We currently already fold vmerge.vvm into vleNff.v via
RISCVDAGToDAGISel::performCombineVMergeAndVOps, so this teaches
RISCVVectorPeephole::foldVMV_V_V to do the same.
---
 llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp   |  7 ++++---
 llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll | 16 ++++++++++++++++
 .../test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 17 +++++++++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 6bb026378274e..c9c2413d009b7 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -611,7 +611,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
 
   MachineInstr *Src = MRI->getVRegDef(MI.getOperand(2).getReg());
   if (!Src || Src->hasUnmodeledSideEffects() ||
-      Src->getParent() != MI.getParent() || Src->getNumDefs() != 1 ||
+      Src->getParent() != MI.getParent() ||
       !RISCVII::isFirstDefTiedToFirstUse(Src->getDesc()) ||
       !RISCVII::hasVLOp(Src->getDesc().TSFlags) ||
       !RISCVII::hasVecPolicyOp(Src->getDesc().TSFlags))
@@ -622,7 +622,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
     return false;
 
   // Src needs to have the same passthru as VMV_V_V
-  MachineOperand &SrcPassthru = Src->getOperand(1);
+  MachineOperand &SrcPassthru = Src->getOperand(Src->getNumExplicitDefs());
   if (SrcPassthru.getReg() != RISCV::NoRegister &&
       SrcPassthru.getReg() != Passthru.getReg())
     return false;
@@ -643,7 +643,8 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) {
     // If Src is masked then its passthru needs to be in VRNoV0.
     if (Passthru.getReg() != RISCV::NoRegister)
       MRI->constrainRegClass(Passthru.getReg(),
-                             TII->getRegClass(Src->getDesc(), 1, TRI,
+                             TII->getRegClass(Src->getDesc(),
+                                              SrcPassthru.getOperandNo(), TRI,
                                               *Src->getParent()->getParent()));
   }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
index 6345b90db23b8..1e2e7795f6546 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.ll
@@ -206,3 +206,19 @@ define <vscale x 1 x i64> @undef_passthru(<vscale x 1 x i64> %passthru, <vscale
   %b = call <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> %a, iXLen %avl)
   ret <vscale x 1 x i64> %b
 }
+
+; Check that we can fold into vle64ff.v even if we need to move it past the
+; passthru and it's safe.
+define <vscale x 1 x i64> @vleff_move_past_passthru(ptr %p, ptr %q, iXLen %avl) {
+; CHECK-LABEL: vleff_move_past_passthru:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl1re64.v v8, (a1)
+; CHECK-NEXT:    vsetvli zero, a2, e64, m1, tu, ma
+; CHECK-NEXT:    vle64ff.v v8, (a0)
+; CHECK-NEXT:    ret
+  %a = call { <vscale x 1 x i64>, iXLen } @llvm.riscv.vleff(<vscale x 1 x i64> poison, ptr %p, iXLen %avl)
+  %vec = extractvalue { <vscale x 1 x i64>, iXLen } %a, 0
+  %passthru = load <vscale x 1 x i64>, ptr %q
+  %b = call <vscale x 1 x i64> @llvm.riscv.vmv.v.v.nxv1i64(<vscale x 1 x i64> %passthru, <vscale x 1 x i64> %vec, iXLen %avl)
+  ret <vscale x 1 x i64> %b
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
index f545ecc5e53d7..6e106e50634f0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
@@ -135,3 +135,20 @@ body: |
     %3:vrnov0 = PseudoVMV_V_V_MF2 $noreg, %2, 0, 5 /* e32 */, 0 /* tu, mu */
     %7:vmv0 = COPY $v8
     %6:vrnov0 = PseudoVLSE32_V_MF2_MASK %3, $noreg, $noreg, %7, 0, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 4)
+...
+---
+name: move_vleff
+body: |
+  bb.0:
+    liveins: $v8
+    ; CHECK-LABEL: name: move_vleff
+    ; CHECK: liveins: $v8
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %passthru:vr = COPY $v8
+    ; CHECK-NEXT: %x:vr, %vl:gpr = PseudoVLE32FF_V_M1 %passthru, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 1)
+    ; CHECK-NEXT: %y:gpr = ADDI $x0, 1
+    %x:vr, %vl:gpr = PseudoVLE32FF_V_M1 $noreg, $noreg, 4, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size)
+    %passthru:vr = COPY $v8
+    %y:gpr = ADDI $x0, 1
+    %z:vr = PseudoVMV_V_V_M1 %passthru, %x, 4, 5 /* e32 */, 0 /* tu, mu */
+...

From 8890706db67384a423773cc921302dd63d950ef5 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Thu, 12 Jun 2025 21:33:05 -0400
Subject: [PATCH 317/851] Revert "Reland [HIP] use offload wrapper for
 non-device-only non-rdc (#132869) (#143964)"

This reverts commit 22f9b4aa1dad597d908be77be1e10ba4c77330ce.
---
 clang/lib/CodeGen/CGCUDANV.cpp                |  3 +-
 clang/lib/Driver/Driver.cpp                   | 62 +++++---------
 clang/lib/Driver/ToolChains/Clang.cpp         | 18 +----
 clang/test/Driver/hip-binding.hip             |  6 +-
 clang/test/Driver/hip-phases.hip              | 58 +++++--------
 clang/test/Driver/hip-toolchain-no-rdc.hip    | 81 +++++++------------
 clang/test/Driver/linker-wrapper.c            |  1 -
 .../ClangLinkerWrapper.cpp                    | 31 +++----
 8 files changed, 92 insertions(+), 168 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index dd26be74e561b..38f514304df5e 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,8 +1280,7 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
     return nullptr;
   }
   if (CGM.getLangOpts().OffloadViaLLVM ||
-      (CGM.getLangOpts().OffloadingNewDriver &&
-       (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
+      (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
     createOffloadingEntries();
   else
     return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 060f76fb653c9..eb60d907d2218 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4423,10 +4423,6 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
                    options::OPT_no_offload_new_driver,
                    C.isOffloadingHostKind(Action::OFK_Cuda));
 
-  bool HIPNoRDC =
-      C.isOffloadingHostKind(Action::OFK_HIP) &&
-      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
-
   // Builder to be used to build offloading actions.
   std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
       !UseNewOffloadingDriver
@@ -4560,7 +4556,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Check if this Linker Job should emit a static library.
     if (ShouldEmitStaticLibrary(Args)) {
       LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
-    } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
+    } else if (UseNewOffloadingDriver ||
                Args.hasArg(options::OPT_offload_link)) {
       LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
       LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4871,31 +4867,10 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
                                        const InputTy &Input, StringRef CUID,
                                        Action *HostAction) const {
   // Don't build offloading actions if explicitly disabled or we do not have a
-  // valid source input.
-  if (offloadHostOnly() || !types::isSrcFile(Input.first))
-    return HostAction;
-
-  bool HIPNoRDC =
-      C.isOffloadingHostKind(Action::OFK_HIP) &&
-      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
-
-  // For HIP non-rdc non-device-only compilation, create a linker wrapper
-  // action for each host object to link, bundle and wrap device files in
-  // it.
-  if ((isa<AssembleJobAction>(HostAction) ||
-       (isa<BackendJobAction>(HostAction) &&
-        HostAction->getType() == types::TY_LTO_BC)) &&
-      HIPNoRDC && !offloadDeviceOnly()) {
-    ActionList AL{HostAction};
-    HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
-    HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
-                                         /*BoundArch=*/nullptr);
-    return HostAction;
-  }
-
-  // Don't build offloading actions if we do not have a compile action. If
-  // preprocessing only ignore embedding.
-  if (!(isa<CompileJobAction>(HostAction) ||
+  // valid source input and compile action to embed it in. If preprocessing only
+  // ignore embedding.
+  if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
+      !(isa<CompileJobAction>(HostAction) ||
         getFinalPhase(Args) == phases::Preprocess))
     return HostAction;
 
@@ -4991,12 +4966,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
       }
     }
 
-    // Compiling HIP in device-only non-RDC mode requires linking each action
-    // individually.
+    // Compiling HIP in non-RDC mode requires linking each action individually.
     for (Action *&A : DeviceActions) {
       if ((A->getType() != types::TY_Object &&
            A->getType() != types::TY_LTO_BC) ||
-          !HIPNoRDC || !offloadDeviceOnly())
+          Kind != Action::OFK_HIP ||
+          Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
         continue;
       ActionList LinkerInput = {A};
       A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -5020,12 +4995,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     }
   }
 
-  // HIP code in device-only non-RDC mode will bundle the output if it invoked
-  // the linker.
+  // HIP code in non-RDC mode will bundle the output if it invoked the linker.
   bool ShouldBundleHIP =
-      HIPNoRDC && offloadDeviceOnly() &&
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
       Args.hasFlag(options::OPT_gpu_bundle_output,
                    options::OPT_no_gpu_bundle_output, true) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
       !llvm::any_of(OffloadActions,
                     [](Action *A) { return A->getType() != types::TY_Image; });
 
@@ -5045,9 +5020,11 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
              nullptr, Action::OFK_Cuda);
-  } else if (HIPNoRDC && offloadDeviceOnly()) {
-    // If we are in device-only non-RDC-mode we just emit the final HIP
-    // fatbinary for each translation unit, linking each input individually.
+  } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
+             !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                           false)) {
+    // If we are not in RDC-mode we just emit the final HIP fatbinary for each
+    // translation unit, linking each input individually.
     Action *FatbinAction =
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5200,11 +5177,8 @@ Action *Driver::ConstructPhaseAction(
         (((Input->getOffloadingToolChain() &&
            Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
           TargetDeviceOffloadKind == Action::OFK_HIP) &&
-         ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                        false) ||
-           (Args.hasFlag(options::OPT_offload_new_driver,
-                         options::OPT_no_offload_new_driver, false) &&
-            !offloadDeviceOnly())) ||
+         (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                       false) ||
           TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
       types::ID Output =
           Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 8556bcadf0915..15acb88c1a8fd 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7710,7 +7710,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fcuda-include-gpubinary");
     CmdArgs.push_back(CudaDeviceInput->getFilename());
   } else if (!HostOffloadingInputs.empty()) {
-    if (IsCuda && !IsRDCMode) {
+    if ((IsCuda || IsHIP) && !IsRDCMode) {
       assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
       CmdArgs.push_back("-fcuda-include-gpubinary");
       CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9257,20 +9257,8 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   // Add the linker arguments to be forwarded by the wrapper.
   CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
                                        LinkCommand->getExecutable()));
-
-  // We use action type to differentiate two use cases of the linker wrapper.
-  // TY_Image for normal linker wrapper work.
-  // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
-  // object.
-  assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
-  if (JA.getType() == types::TY_Object) {
-    CmdArgs.append({"-o", Output.getFilename()});
-    for (auto Input : Inputs)
-      CmdArgs.push_back(Input.getFilename());
-    CmdArgs.push_back("-r");
-  } else
-    for (const char *LinkArg : LinkCommand->getArguments())
-      CmdArgs.push_back(LinkArg);
+  for (const char *LinkArg : LinkCommand->getArguments())
+    CmdArgs.push_back(LinkArg);
 
   addOffloadCompressArgs(Args, CmdArgs);
 
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index d8b3f1e242018..57e57194ec87b 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
 // RUN:        -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
 //      LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
 // LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
-// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
+// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index d8a58b78d6d5c..5fd2c0216ccc3 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,57 +8,39 @@
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
+// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
-// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
-// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -flto -c %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWLTO %s
+// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
-// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
-// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
+// RUN: | FileCheck -check-prefixes=BIN,RDC %s
 //
 // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 
 // BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
 // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
-// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
-// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
-
-// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
-// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
-// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
-// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
-// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
-// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
+// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+
+// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
 
 //
 // Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index ddd251b67cc57..6c69d1d51a260 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
 
 // RUN: touch %t/a.o %t/b.o
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,23 +47,22 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// OLD-SAME: "-emit-obj"
-// NEW-SAME: "-emit-llvm-bc"
+// CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
+// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
 
 //
 // Compile device code in a.cu to code object for gfx900.
@@ -71,71 +70,62 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
 
 //
 // Bundle and embed device code in host object for a.cu.
 //
 
-// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// OLD-SAME: "-bundle-align=4096"
-// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
-
-// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
-// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
-// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
+// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
-// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW:   "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
-
 //
 // Compile device code in b.hip to code object for gfx803.
 //
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" "[[OBJ_DEV_B_803]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
 
 //
 // Compile device code in b.hip to code object for gfx900.
@@ -143,49 +133,40 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
+// CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_900:.*(o|bc)]]" "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_900:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// OLD-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" "[[OBJ_DEV_B_900]]"
+// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
 
 //
 // Bundle and embed device code in host object for b.hip.
 //
 
-// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// OLD-SAME: "-bundle-align=4096"
-// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_B:.*hipfb]]"
-
-// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_B:.*.out]]"
-// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
-// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
+// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// CHECK-SAME: "-bundle-align=4096"
+// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
-// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
-// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
-// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
-// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
-// NEW:   "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
-
 //
 // Link host objects.
 //
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 80b1a5745a123..a7e98e7351d98 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -223,7 +223,6 @@ __attribute__((visibility("protected"), used)) int x;
 // RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a -input={{/dev/null|NUL}} -input={{.*}} -output={{.*}}
 // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r
 // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
-// RELOCATABLE-LINK-HIP: --rename-section llvm_offload_entries
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index b8019fac4c2ec..95b6f74af1f13 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -310,21 +310,22 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) {
   // Remove the old .llvm.offloading section to prevent further linking.
   ObjcopyArgs.emplace_back("--remove-section");
   ObjcopyArgs.emplace_back(".llvm.offloading");
-  StringRef Prefix = "llvm";
-  auto Section = (Prefix + "llvm_offload_entries").str();
-  // Rename the offloading entires to make them private to this link unit.
-  ObjcopyArgs.emplace_back("--rename-section");
-  ObjcopyArgs.emplace_back(
-      Args.MakeArgString(Section + "=" + Section + Suffix));
-
-  // Rename the __start_ / __stop_ symbols appropriately to iterate over the
-  // newly renamed section containing the offloading entries.
-  ObjcopyArgs.emplace_back("--redefine-sym");
-  ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
-                                              "__start_" + Section + Suffix));
-  ObjcopyArgs.emplace_back("--redefine-sym");
-  ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
-                                              "__stop_" + Section + Suffix));
+  for (StringRef Prefix : {"omp", "cuda", "hip"}) {
+    auto Section = (Prefix + "_offloading_entries").str();
+    // Rename the offloading entires to make them private to this link unit.
+    ObjcopyArgs.emplace_back("--rename-section");
+    ObjcopyArgs.emplace_back(
+        Args.MakeArgString(Section + "=" + Section + Suffix));
+
+    // Rename the __start_ / __stop_ symbols appropriately to iterate over the
+    // newly renamed section containing the offloading entries.
+    ObjcopyArgs.emplace_back("--redefine-sym");
+    ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
+                                                "__start_" + Section + Suffix));
+    ObjcopyArgs.emplace_back("--redefine-sym");
+    ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
+                                                "__stop_" + Section + Suffix));
+  }
 
   if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs))
     return Err;

From 7232c07eb97d5c21d47a661c9cca8981c7f91698 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu@amd.com>
Date: Thu, 12 Jun 2025 21:35:22 -0400
Subject: [PATCH 318/851] Reland [HIP] use offload wrapper for non-device-only
 non-rdc (#143964)

Fixed a typo:

-  auto Section = (Prefix + "llvm_offload_entries").str();
+  auto Section = (Prefix + "_offload_entries").str();

which broke buildbot e.g.

https://lab.llvm.org/buildbot/#/builders/208/builds/1948
---
 clang/lib/CodeGen/CGCUDANV.cpp                |  3 +-
 clang/lib/Driver/Driver.cpp                   | 62 +++++++++-----
 clang/lib/Driver/ToolChains/Clang.cpp         | 18 ++++-
 clang/test/Driver/hip-binding.hip             |  6 +-
 clang/test/Driver/hip-phases.hip              | 58 ++++++++-----
 clang/test/Driver/hip-toolchain-no-rdc.hip    | 81 ++++++++++++-------
 clang/test/Driver/linker-wrapper.c            |  1 +
 .../ClangLinkerWrapper.cpp                    | 31 ++++---
 8 files changed, 168 insertions(+), 92 deletions(-)

diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 38f514304df5e..dd26be74e561b 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1280,7 +1280,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
     return nullptr;
   }
   if (CGM.getLangOpts().OffloadViaLLVM ||
-      (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
+      (CGM.getLangOpts().OffloadingNewDriver &&
+       (CGM.getLangOpts().HIP || RelocatableDeviceCode)))
     createOffloadingEntries();
   else
     return makeModuleCtorFunction();
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index eb60d907d2218..060f76fb653c9 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4423,6 +4423,10 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
                    options::OPT_no_offload_new_driver,
                    C.isOffloadingHostKind(Action::OFK_Cuda));
 
+  bool HIPNoRDC =
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
   // Builder to be used to build offloading actions.
   std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
       !UseNewOffloadingDriver
@@ -4556,7 +4560,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
     // Check if this Linker Job should emit a static library.
     if (ShouldEmitStaticLibrary(Args)) {
       LA = C.MakeAction<StaticLibJobAction>(LinkerInputs, types::TY_Image);
-    } else if (UseNewOffloadingDriver ||
+    } else if ((UseNewOffloadingDriver && !HIPNoRDC) ||
                Args.hasArg(options::OPT_offload_link)) {
       LA = C.MakeAction<LinkerWrapperJobAction>(LinkerInputs, types::TY_Image);
       LA->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
@@ -4867,10 +4871,31 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
                                        const InputTy &Input, StringRef CUID,
                                        Action *HostAction) const {
   // Don't build offloading actions if explicitly disabled or we do not have a
-  // valid source input and compile action to embed it in. If preprocessing only
-  // ignore embedding.
-  if (offloadHostOnly() || !types::isSrcFile(Input.first) ||
-      !(isa<CompileJobAction>(HostAction) ||
+  // valid source input.
+  if (offloadHostOnly() || !types::isSrcFile(Input.first))
+    return HostAction;
+
+  bool HIPNoRDC =
+      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
+
+  // For HIP non-rdc non-device-only compilation, create a linker wrapper
+  // action for each host object to link, bundle and wrap device files in
+  // it.
+  if ((isa<AssembleJobAction>(HostAction) ||
+       (isa<BackendJobAction>(HostAction) &&
+        HostAction->getType() == types::TY_LTO_BC)) &&
+      HIPNoRDC && !offloadDeviceOnly()) {
+    ActionList AL{HostAction};
+    HostAction = C.MakeAction<LinkerWrapperJobAction>(AL, types::TY_Object);
+    HostAction->propagateHostOffloadInfo(C.getActiveOffloadKinds(),
+                                         /*BoundArch=*/nullptr);
+    return HostAction;
+  }
+
+  // Don't build offloading actions if we do not have a compile action. If
+  // preprocessing only ignore embedding.
+  if (!(isa<CompileJobAction>(HostAction) ||
         getFinalPhase(Args) == phases::Preprocess))
     return HostAction;
 
@@ -4966,12 +4991,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
       }
     }
 
-    // Compiling HIP in non-RDC mode requires linking each action individually.
+    // Compiling HIP in device-only non-RDC mode requires linking each action
+    // individually.
     for (Action *&A : DeviceActions) {
       if ((A->getType() != types::TY_Object &&
            A->getType() != types::TY_LTO_BC) ||
-          Kind != Action::OFK_HIP ||
-          Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+          !HIPNoRDC || !offloadDeviceOnly())
         continue;
       ActionList LinkerInput = {A};
       A = C.MakeAction<LinkJobAction>(LinkerInput, types::TY_Image);
@@ -4995,12 +5020,12 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
     }
   }
 
-  // HIP code in non-RDC mode will bundle the output if it invoked the linker.
+  // HIP code in device-only non-RDC mode will bundle the output if it invoked
+  // the linker.
   bool ShouldBundleHIP =
-      C.isOffloadingHostKind(Action::OFK_HIP) &&
+      HIPNoRDC && offloadDeviceOnly() &&
       Args.hasFlag(options::OPT_gpu_bundle_output,
                    options::OPT_no_gpu_bundle_output, true) &&
-      !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) &&
       !llvm::any_of(OffloadActions,
                     [](Action *A) { return A->getType() != types::TY_Image; });
 
@@ -5020,11 +5045,9 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_CUDA_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_Cuda>(),
              nullptr, Action::OFK_Cuda);
-  } else if (C.isOffloadingHostKind(Action::OFK_HIP) &&
-             !Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                           false)) {
-    // If we are not in RDC-mode we just emit the final HIP fatbinary for each
-    // translation unit, linking each input individually.
+  } else if (HIPNoRDC && offloadDeviceOnly()) {
+    // If we are in device-only non-RDC-mode we just emit the final HIP
+    // fatbinary for each translation unit, linking each input individually.
     Action *FatbinAction =
         C.MakeAction<LinkJobAction>(OffloadActions, types::TY_HIP_FATBIN);
     DDep.add(*FatbinAction, *C.getSingleOffloadToolChain<Action::OFK_HIP>(),
@@ -5177,8 +5200,11 @@ Action *Driver::ConstructPhaseAction(
         (((Input->getOffloadingToolChain() &&
            Input->getOffloadingToolChain()->getTriple().isAMDGPU()) ||
           TargetDeviceOffloadKind == Action::OFK_HIP) &&
-         (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
-                       false) ||
+         ((Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+                        false) ||
+           (Args.hasFlag(options::OPT_offload_new_driver,
+                         options::OPT_no_offload_new_driver, false) &&
+            !offloadDeviceOnly())) ||
           TargetDeviceOffloadKind == Action::OFK_OpenMP))) {
       types::ID Output =
           Args.hasArg(options::OPT_S) &&
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 15acb88c1a8fd..8556bcadf0915 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7710,7 +7710,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fcuda-include-gpubinary");
     CmdArgs.push_back(CudaDeviceInput->getFilename());
   } else if (!HostOffloadingInputs.empty()) {
-    if ((IsCuda || IsHIP) && !IsRDCMode) {
+    if (IsCuda && !IsRDCMode) {
       assert(HostOffloadingInputs.size() == 1 && "Only one input expected");
       CmdArgs.push_back("-fcuda-include-gpubinary");
       CmdArgs.push_back(HostOffloadingInputs.front().getFilename());
@@ -9257,8 +9257,20 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   // Add the linker arguments to be forwarded by the wrapper.
   CmdArgs.push_back(Args.MakeArgString(Twine("--linker-path=") +
                                        LinkCommand->getExecutable()));
-  for (const char *LinkArg : LinkCommand->getArguments())
-    CmdArgs.push_back(LinkArg);
+
+  // We use action type to differentiate two use cases of the linker wrapper.
+  // TY_Image for normal linker wrapper work.
+  // TY_Object for HIP fno-gpu-rdc embedding device binary in a relocatable
+  // object.
+  assert(JA.getType() == types::TY_Object || JA.getType() == types::TY_Image);
+  if (JA.getType() == types::TY_Object) {
+    CmdArgs.append({"-o", Output.getFilename()});
+    for (auto Input : Inputs)
+      CmdArgs.push_back(Input.getFilename());
+    CmdArgs.push_back("-r");
+  } else
+    for (const char *LinkArg : LinkCommand->getArguments())
+      CmdArgs.push_back(LinkArg);
 
   addOffloadCompressArgs(Args, CmdArgs);
 
diff --git a/clang/test/Driver/hip-binding.hip b/clang/test/Driver/hip-binding.hip
index 57e57194ec87b..d8b3f1e242018 100644
--- a/clang/test/Driver/hip-binding.hip
+++ b/clang/test/Driver/hip-binding.hip
@@ -93,7 +93,7 @@
 // RUN:        -nogpulib -nogpuinc -foffload-lto --offload-arch=gfx90a --offload-arch=gfx908 -c %s 2>&1 \
 // RUN: | FileCheck -check-prefix=LTO-NO-RDC %s
 //      LTO-NO-RDC: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[LTO_908:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_908]]"], output: "[[OBJ_908:.+]]"
 // LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "clang", inputs: ["[[INPUT]]"], output: "[[LTO_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[LTO_90A]]"], output: "[[OBJ_90A:.+]]"
-// LTO-NO-RDC-NEXT: # "amdgcn-amd-amdhsa" - "AMDGCN::Linker", inputs: ["[[OBJ_908]]", "[[OBJ_90A]]"], output: "[[HIPFB:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[LTO_908]]", "[[LTO_90A]]"], output: "[[PKG:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[PKG]]"], output: "[[OBJ:.+]]"
+// LTO-NO-RDC-NEXT: # "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[OBJ]]"], output: "hip-binding.o"
diff --git a/clang/test/Driver/hip-phases.hip b/clang/test/Driver/hip-phases.hip
index 5fd2c0216ccc3..d8a58b78d6d5c 100644
--- a/clang/test/Driver/hip-phases.hip
+++ b/clang/test/Driver/hip-phases.hip
@@ -8,39 +8,57 @@
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,OLD %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDN %s
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --offload-new-driver --cuda-gpu-arch=gfx803 %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,NRD,NEW %s
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWN %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -flto -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWLTO %s
 //
 // RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
 // RUN: --no-offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
-// RUN: | FileCheck -check-prefixes=BIN,RDC %s
+// RUN: | FileCheck -check-prefixes=BIN,OLD,OLDR %s
+// RUN: %clang -x hip --target=x86_64-unknown-linux-gnu -ccc-print-phases \
+// RUN: --offload-new-driver --cuda-gpu-arch=gfx803 -fgpu-rdc %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=BIN,NEW,NEWR %s
 //
 // BIN-DAG: [[P0:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T:hip]], (host-[[T]])
 // BIN-DAG: [[P1:[0-9]+]]: preprocessor, {[[P0]]}, [[T]]-cpp-output, (host-[[T]])
 // BIN-DAG: [[P2:[0-9]+]]: compiler, {[[P1]]}, ir, (host-[[T]])
-// RDC-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
-// RDC-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDR-DAG: [[P12:[0-9]+]]: backend, {[[P2]]}, assembler, (host-[[T]])
+// OLDR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
 
 // BIN-DAG: [[P3:[0-9]+]]: input, "{{.*}}hip-phases.hip", [[T]], (device-[[T]], [[ARCH:gfx803]])
 // BIN-DAG: [[P4:[0-9]+]]: preprocessor, {[[P3]]}, [[T]]-cpp-output, (device-[[T]], [[ARCH]])
 // BIN-DAG: [[P5:[0-9]+]]: compiler, {[[P4]]}, ir, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
-// NRD-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
-// RDC-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
-// BIN-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
-// NRD-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
-// RDC-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
-
-// NRD-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
-// RDC-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
-// NRD-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
-// NRD-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
-// OLD-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
-// NEW-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
-// RDC-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// OLDN-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, assembler, (device-[[T]], [[ARCH]])
+// NEW-DAG: [[P6:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLDN-DAG: [[P7:[0-9]+]]: assembler, {[[P6]]}, object, (device-[[T]], [[ARCH]])
+// OLDR-DAG: [[P7:[0-9]+]]: backend, {[[P5]]}, ir, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P8:[0-9]+]]: linker, {[[P7]]}, image, (device-[[T]], [[ARCH]])
+// OLD-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P8]]}, image
+// NEW-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa:[[ARCH]])" {[[P6]]}, ir
+// OLDN-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, hip-fatbin, (device-[[T]])
+// NEW-DAG: [[P10:[0-9]+]]: clang-offload-packager, {[[P9]]}, image, (device-[[T]])
+// OLDR-DAG: [[P10:[0-9]+]]: linker, {[[P9]]}, object, (device-[[T]])
+
+// OLDN-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, ir
+// NEW-DAG: [[P11:[0-9]+]]: offload, "host-[[T]] (x86_64-unknown-linux-gnu)" {[[P2]]}, "device-[[T]] (x86_64-unknown-linux-gnu)" {[[P10]]}, ir
+// OLDR-DAG: [[P11:[0-9]+]]: offload, "device-[[T]] (amdgcn-amd-amdhsa)" {[[P10]]}, object
+// OLDN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// OLDN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWN-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEWN-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P13:[0-9]+]]: backend, {[[P11]]}, lto-bc, (host-hip)
+// NEWR-DAG: [[P12:[0-9]+]]: backend, {[[P11]]}, assembler, (host-[[T]])
+// NEWR-DAG: [[P13:[0-9]+]]: assembler, {[[P12]]}, object, (host-[[T]])
+// OLDN-DAG: [[P14:[0-9]+]]: linker, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// NEWLTO-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, object, (host-[[T]])
+// OLDR-DAG: [[P14:[0-9]+]]: linker, {[[P13]], [[P11]]}, image, (host-[[T]])
+// NEWR-DAG: [[P14:[0-9]+]]: clang-linker-wrapper, {[[P13]]}, image, (host-[[T]])
+// NEWN-DAG: [[P15:[0-9]+]]: linker, {[[P14]]}, image
 
 //
 // Test single gpu architecture up to the assemble phase.
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index 6c69d1d51a260..ddd251b67cc57 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -7,7 +7,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,LINK,OLD %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -17,7 +17,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,OLD %s
 
 // RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc \
 // RUN:   -x hip --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 \
@@ -27,7 +27,7 @@
 // RUN:   -fuse-ld=lld -B%S/Inputs/lld -nogpuinc --offload-new-driver -c \
 // RUN:   %S/Inputs/hip_multiple_inputs/a.cu \
 // RUN:   %S/Inputs/hip_multiple_inputs/b.hip \
-// RUN: 2>&1 | FileCheck -check-prefixes=CHECK %s
+// RUN: 2>&1 | FileCheck -check-prefixes=CHECK,NEW %s
 
 // RUN: touch %t/a.o %t/b.o
 // RUN: %clang -### --target=x86_64-linux-gnu \
@@ -47,22 +47,23 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// OLD-SAME: "-emit-obj"
+// NEW-SAME: "-emit-llvm-bc"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_803:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC:".*a.cu"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]]
+// OLD: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" "[[OBJ_DEV_A_803]]"
 
 //
 // Compile device code in a.cu to code object for gfx900.
@@ -70,62 +71,71 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_A_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_A_900:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" "[[OBJ_DEV_A_900]]"
 
 //
 // Bundle and embed device code in host object for a.cu.
 //
 
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_A_803]]" "-input=[[IMG_DEV_A_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_A:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_A_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "a.cu"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_A]]"
+// OLD-SAME: {{.*}} "-o" [[A_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[A_OBJ_HOST_TMP:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[A_SRC]]
 
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
+// NEW:   "--linker-path={{.*}}" "-o" [[A_OBJ_HOST:".*o"]] [[A_OBJ_HOST_TMP]] "-r"
+
 //
 // Compile device code in b.hip to code object for gfx803.
 //
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx803"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_803:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_803:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC:".*b.hip"]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" "[[OBJ_DEV_B_803]]"
 
 //
 // Compile device code in b.hip to code object for gfx900.
@@ -133,40 +143,49 @@
 
 // CHECK: [[CLANG:".*clang.*"]] "-cc1" "-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-aux-triple" "x86_64-unknown-linux-gnu"
-// CHECK-SAME: "-emit-obj"
+// CHECK-SAME: "-emit-{{(obj|llvm-bc)}}"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
 // CHECK-SAME: "-fcuda-is-device" "-fno-threadsafe-statics" "-mllvm" "-amdgpu-internalize-symbols"
 // CHECK-SAME: "-fcuda-allow-variadic-functions" "-fvisibility=hidden"
 // CHECK-SAME: "-fapply-global-visibility-to-externs"
 // CHECK-SAME: "{{.*}}lib1.bc" "{{.*}}lib2.bc"
 // CHECK-SAME: "-target-cpu" "gfx900"
-// CHECK-SAME: {{.*}} "-o" [[OBJ_DEV_B_900:".*o"]] "-x" "hip"
+// CHECK-SAME: {{.*}} "-o" "[[OBJ_DEV_B_900:.*(o|bc)]]" "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
 // CHECK-NOT: {{".*llvm-link"}}
 // CHECK-NOT: {{".*opt"}}
 // CHECK-NOT: {{".*llc"}}
 
-// CHECK: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
-// CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]]
+// OLD: [[LLD]] "-flavor" "gnu" "-m" "elf64_amdgpu" "--no-undefined" "-shared"
+// OLD-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" "[[OBJ_DEV_B_900]]"
 
 //
 // Bundle and embed device code in host object for b.hip.
 //
 
-// CHECK: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
-// CHECK-SAME: "-bundle-align=4096"
-// CHECK-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
-// CHECK-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_A:.*hipfb]]"
+// OLD: [[BUNDLER:".*clang-offload-bundler"]] "-type=o"
+// OLD-SAME: "-bundle-align=4096"
+// OLD-SAME: "-targets={{.*}},hipv4-amdgcn-amd-amdhsa--gfx803,hipv4-amdgcn-amd-amdhsa--gfx900"
+// OLD-SAME: "-input={{.*}}" "-input=[[IMG_DEV_B_803]]" "-input=[[IMG_DEV_B_900]]" "-output=[[BUNDLE_B:.*hipfb]]"
+
+// NEW: [[PACKAGER:".*clang-offload-packager"]] "-o" "[[PACKAGE_B:.*.out]]"
+// NEW-SAME: "--image=file=[[OBJ_DEV_B_803]],triple=amdgcn-amd-amdhsa,arch=gfx803,kind=hip"
+// NEW-SAME: "--image=file=[[OBJ_DEV_B_900]],triple=amdgcn-amd-amdhsa,arch=gfx900,kind=hip"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
 // CHECK-SAME: "-aux-triple" "amdgcn-amd-amdhsa"
 // CHECK-SAME: "-emit-obj"
 // CHECK-SAME: {{.*}} "-main-file-name" "b.hip"
-// CHECK-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_A]]"
-// CHECK-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// OLD-SAME: {{.*}} "-fcuda-include-gpubinary" "[[BUNDLE_B]]"
+// NEW-SAME: {{.*}} "-fembed-offload-object=[[PACKAGE_B]]"
+// OLD-SAME: {{.*}} "-o" [[B_OBJ_HOST:".*o"]] "-x" "hip"
+// NEW-SAME: {{.*}} "-o" [[B_OBJ_HOST_TMP:".*o"]] "-x" "hip"
 // CHECK-SAME: {{.*}} [[B_SRC]]
 
+// NEW: [[WRAPPER:".*clang-linker-wrapper]]" {{.*}}"--host-triple=x86_64-unknown-linux-gnu"
+// NEW:   "--linker-path={{.*}}" "-o" [[B_OBJ_HOST:".*o"]] [[B_OBJ_HOST_TMP]] "-r"
+
 //
 // Link host objects.
 //
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index a7e98e7351d98..80b1a5745a123 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -223,6 +223,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RELOCATABLE-LINK-HIP: clang-offload-bundler{{.*}} -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hip-amdgcn-amd-amdhsa--gfx90a -input={{/dev/null|NUL}} -input={{.*}} -output={{.*}}
 // RELOCATABLE-LINK-HIP: /usr/bin/ld.lld{{.*}}-r
 // RELOCATABLE-LINK-HIP: llvm-objcopy{{.*}}a.out --remove-section .llvm.offloading
+// RELOCATABLE-LINK-HIP: --rename-section llvm_offload_entries
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_89 \
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 95b6f74af1f13..7a1007d03737e 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -310,22 +310,21 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) {
   // Remove the old .llvm.offloading section to prevent further linking.
   ObjcopyArgs.emplace_back("--remove-section");
   ObjcopyArgs.emplace_back(".llvm.offloading");
-  for (StringRef Prefix : {"omp", "cuda", "hip"}) {
-    auto Section = (Prefix + "_offloading_entries").str();
-    // Rename the offloading entires to make them private to this link unit.
-    ObjcopyArgs.emplace_back("--rename-section");
-    ObjcopyArgs.emplace_back(
-        Args.MakeArgString(Section + "=" + Section + Suffix));
-
-    // Rename the __start_ / __stop_ symbols appropriately to iterate over the
-    // newly renamed section containing the offloading entries.
-    ObjcopyArgs.emplace_back("--redefine-sym");
-    ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
-                                                "__start_" + Section + Suffix));
-    ObjcopyArgs.emplace_back("--redefine-sym");
-    ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
-                                                "__stop_" + Section + Suffix));
-  }
+  StringRef Prefix = "llvm";
+  auto Section = (Prefix + "_offload_entries").str();
+  // Rename the offloading entires to make them private to this link unit.
+  ObjcopyArgs.emplace_back("--rename-section");
+  ObjcopyArgs.emplace_back(
+      Args.MakeArgString(Section + "=" + Section + Suffix));
+
+  // Rename the __start_ / __stop_ symbols appropriately to iterate over the
+  // newly renamed section containing the offloading entries.
+  ObjcopyArgs.emplace_back("--redefine-sym");
+  ObjcopyArgs.emplace_back(Args.MakeArgString("__start_" + Section + "=" +
+                                              "__start_" + Section + Suffix));
+  ObjcopyArgs.emplace_back("--redefine-sym");
+  ObjcopyArgs.emplace_back(Args.MakeArgString("__stop_" + Section + "=" +
+                                              "__stop_" + Section + Suffix));
 
   if (Error Err = executeCommands(*ObjcopyPath, ObjcopyArgs))
     return Err;

From 07dad4ecba43bcd92453a0cd4c351025126db683 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Thu, 12 Jun 2025 19:50:41 -0700
Subject: [PATCH 319/851] [ELF] Implement -z dynamic-undefined-weak

The behavior of an undefined weak reference is implementation defined.
For static -no-pie linking, dynamic relocations are generally avoided (except
IRELATIVE). -shared linking generally emits dynamic relocations.

Dynamic -no-pie linking and -pie allow flexibility. Changes adjust the
behavior for better consistency and simpler internal representation,
e.g. https://reviews.llvm.org/D63003 https://reviews.llvm.org/D105164
(generalized to undefined non-weak in
2fcaa00d1e2317a90c9071b735eb0e758b5dd58b).

GNU ld introduced -z [no]dynamic-undefined-weak option to fine-tune the
behavior. (The option is not very effective with -no-pie, e.g. on
x86-64, `ld.bfd a.o s.so -z dynamic-undefined-weak` generates
R_X86_64_NONE relocations instead of GLOB_DAT/JUMP_SLOT)

This patch implements -z [no]dynamic-undefined-weak option.
The effects are summarized as follows:

* Static -no-pie: no-op
* Dynamic -no-pie: nodynamic-undefined-weak suppresses GLOB_DAT/JUMP_SLOT
* Static -pie: dynamic-undefined-weak generates ABS/GLOB_DAT/JUMP_SLOT.
  https://discourse.llvm.org/t/lld-weak-undefined-symbols-in-vdso-only/86749
* Dynamic -pie: nodynamic-undefined-weak suppresses ABS/GLOB_DAT/JUMP_SLOT

The -pie behavior likely stays stable while -no-pie (`!ctx.arg.isPic` in
`isStaticLinkTimeConstant`) behavior will likely change in the future.
The current default value of ctx.arg.zDynamicUndefined is selected to
prevent behavior changes.

Pull Request: https://github.com/llvm/llvm-project/pull/143831
---
 lld/ELF/Config.h                  |  1 +
 lld/ELF/Driver.cpp                |  8 ++++++++
 lld/ELF/Symbols.cpp               | 14 ++++++++------
 lld/ELF/Writer.cpp                |  6 ++----
 lld/docs/ReleaseNotes.rst         |  4 ++++
 lld/docs/ld.lld.1                 |  8 ++++++++
 lld/test/ELF/driver.test          |  3 ++-
 lld/test/ELF/weak-undef-got-plt.s |  6 ++++++
 lld/test/ELF/weak-undef-hidden.s  |  4 ++++
 lld/test/ELF/weak-undef-rw.s      | 19 ++++++++++++++++---
 10 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 0a52dfe6901bd..3a9001d2cc8b8 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -368,6 +368,7 @@ struct Config {
   bool writeAddends;
   bool zCombreloc;
   bool zCopyreloc;
+  bool zDynamicUndefined;
   bool zForceBti;
   bool zForceIbt;
   bool zGlobal;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 12dac82c614a7..87b19cf543d9f 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -591,6 +591,7 @@ static void checkZOptions(Ctx &ctx, opt::InputArgList &args) {
   args::getZOptionValue(args, OPT_z, "max-page-size", 0);
   args::getZOptionValue(args, OPT_z, "common-page-size", 0);
   getZFlag(args, "rel", "rela", false);
+  getZFlag(args, "dynamic-undefined-weak", "nodynamic-undefined-weak", false);
   for (auto *arg : args.filtered(OPT_z))
     if (!arg->isClaimed())
       Warn(ctx) << "unknown -z value: " << StringRef(arg->getValue());
@@ -3058,6 +3059,13 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
   ctx.hasDynsym = !ctx.sharedFiles.empty() || ctx.arg.isPic;
   ctx.arg.exportDynamic &= ctx.hasDynsym;
 
+  // Preemptibility of undefined symbols when ctx.hasDynsym is true. Default is
+  // true for dynamic linking.
+  ctx.arg.zDynamicUndefined =
+      getZFlag(args, "dynamic-undefined-weak", "nodynamic-undefined-weak",
+               ctx.sharedFiles.size() || ctx.arg.shared) &&
+      ctx.hasDynsym;
+
   // If an entry symbol is in a static archive, pull out that file now.
   if (Symbol *sym = ctx.symtab->find(ctx.arg.entry))
     handleUndefined(ctx, sym, "--entry");
diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp
index c461dfed0d741..de839795c50d7 100644
--- a/lld/ELF/Symbols.cpp
+++ b/lld/ELF/Symbols.cpp
@@ -333,10 +333,13 @@ bool elf::computeIsPreemptible(Ctx &ctx, const Symbol &sym) {
   if (sym.visibility() != STV_DEFAULT)
     return false;
 
-  // At this point copy relocations have not been created yet, so any
-  // symbol that is not defined locally is preemptible.
+  // At this point copy relocations have not been created yet.
+  // Shared symbols are preemptible. Undefined symbols are preemptible
+  // when zDynamicUndefined (default in dynamic linking). Weakness is not
+  // checked, though undefined non-weak would typically trigger relocation
+  // errors unless options like -z undefs are used.
   if (!sym.isDefined())
-    return true;
+    return !sym.isUndefined() || ctx.arg.zDynamicUndefined;
 
   if (!ctx.arg.shared)
     return false;
@@ -360,7 +363,6 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
   // can contain versions in the form of <name>@<version>.
   // Let them parse and update their names to exclude version suffix.
   // In addition, compute isExported and isPreemptible.
-  bool maybePreemptible = ctx.sharedFiles.size() || ctx.arg.shared;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (sym->hasVersionSuffix)
       sym->parseSymbolVersion(ctx);
@@ -369,11 +371,11 @@ void elf::parseVersionAndComputeIsPreemptible(Ctx &ctx) {
       continue;
     }
     if (!sym->isDefined() && !sym->isCommon()) {
-      sym->isPreemptible = maybePreemptible && computeIsPreemptible(ctx, *sym);
+      sym->isPreemptible = computeIsPreemptible(ctx, *sym);
     } else if (ctx.arg.exportDynamic &&
                (sym->isUsedInRegularObj || !sym->ltoCanOmit)) {
       sym->isExported = true;
-      sym->isPreemptible = maybePreemptible && computeIsPreemptible(ctx, *sym);
+      sym->isPreemptible = computeIsPreemptible(ctx, *sym);
     }
   }
 }
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 3d9888f576f05..15909daf51ab6 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -285,7 +285,6 @@ static void demoteDefined(Defined &sym, DenseMap<SectionBase *, size_t> &map) {
 static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
   llvm::TimeTraceScope timeScope("Demote symbols");
   DenseMap<InputFile *, DenseMap<SectionBase *, size_t>> sectionIndexMap;
-  bool maybePreemptible = ctx.sharedFiles.size() || ctx.arg.shared;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (auto *d = dyn_cast<Defined>(sym)) {
       if (d->section && !d->section->isLive())
@@ -301,9 +300,8 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
       }
     }
 
-    if (maybePreemptible)
-      sym->isPreemptible = (sym->isUndefined() || sym->isExported) &&
-                           computeIsPreemptible(ctx, *sym);
+    sym->isPreemptible = (sym->isUndefined() || sym->isExported) &&
+                         computeIsPreemptible(ctx, *sym);
   }
 }
 
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 5c180fd8fbeeb..064ed0828c31f 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -25,6 +25,10 @@ Non-comprehensive list of changes in this release
 
 ELF Improvements
 ----------------
+* Added ``-z dynamic-undefined-weak`` to make undefined weak symbols dynamic
+  when the dynamic symbol table is present.
+  (`#143831 <https://github.com/llvm/llvm-project/pull/143831>`_)
+
 * For AArch64, added support for ``-zgcs-report-dynamic``, enabling checks for
   GNU GCS Attribute Flags in Dynamic Objects when GCS is enabled. Inherits value
   from ``-zgcs-report`` (capped at ``warning`` level) unless user-defined,
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 57aa2be5907b5..cfacdb081a807 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -793,6 +793,14 @@ Specify how to report the missing GNU_PROPERTY_X86_FEATURE_1_IBT or GNU_PROPERTY
 .Cm none
 is the default, linker will not report the missing property otherwise will be reported as a warning or an error.
 .Pp
+.It Cm dynamic-undefined-weak
+Make undefined weak symbols dynamic when the dynamic symbol table is present, if they are referenced from
+relocatable object files and not forced local by symbol visibility or versioning. Do not make them dynamic when
+.Cm nodynamic-undefined-weak
+is specified.
+.Cm dynamic-undefined-weak
+is the default when building a shared object, or when an input shared object is present.
+.Pp
 .It Cm pauth-report Ns = Ns Ar [none|warning|error]
 Specify how to report the missing GNU_PROPERTY_AARCH64_FEATURE_PAUTH property.
 .Cm none
diff --git a/lld/test/ELF/driver.test b/lld/test/ELF/driver.test
index 45d73607c8ac6..6d5761212cc38 100644
--- a/lld/test/ELF/driver.test
+++ b/lld/test/ELF/driver.test
@@ -47,7 +47,8 @@
 # ERR9: error: cannot open output file utput=/no/such/file
 
 # RUN: ld.lld %t -z foo -o /dev/null 2>&1 | FileCheck -check-prefix=ERR10 %s --implicit-check-not=warning:
-# RUN: ld.lld %t -z foo -z rel -z rela -z max-page-size=1 -z common-page-size=1 -o /dev/null --version 2>&1 | \
+# RUN: ld.lld %t -z foo -z rel -z rela -z max-page-size=1 -z common-page-size=1 -z dynamic-undefined-weak \
+# RUN:   -z nodynamic-undefined-weak -o /dev/null --version 2>&1 | \
 # RUN:   FileCheck -check-prefix=ERR10 %s --implicit-check-not=warning:
 # ERR10: warning: unknown -z value: foo
 
diff --git a/lld/test/ELF/weak-undef-got-plt.s b/lld/test/ELF/weak-undef-got-plt.s
index 0ee3da2cd3b40..48a7914e5b987 100644
--- a/lld/test/ELF/weak-undef-got-plt.s
+++ b/lld/test/ELF/weak-undef-got-plt.s
@@ -6,11 +6,17 @@
 
 # RUN: ld.lld a.o -o a
 # RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC
+# RUN: ld.lld a.o -o a -z dynamic-undefined-weak
+# RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC
 # RUN: ld.lld a.o s.so -o as
 # RUN: llvm-objdump -dR as | FileCheck %s
+# RUN: ld.lld a.o s.so -o as -z nodynamic-undefined-weak
+# RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC
 
 # RUN: ld.lld -pie a.o s.so -o as.pie
 # RUN: llvm-objdump -dR as.pie | FileCheck %s
+# RUN: ld.lld -pie a.o s.so -o as.pie -z nodynamic-undefined-weak
+# RUN: llvm-readelf -r as.pie | FileCheck --check-prefix=NORELOC %s
 
 # RUN: ld.lld -shared a.o -o a.so
 # RUN: llvm-objdump -dR a.so | FileCheck %s
diff --git a/lld/test/ELF/weak-undef-hidden.s b/lld/test/ELF/weak-undef-hidden.s
index 2baad5738c36f..ad2ba29ec27ab 100644
--- a/lld/test/ELF/weak-undef-hidden.s
+++ b/lld/test/ELF/weak-undef-hidden.s
@@ -5,6 +5,10 @@
 // RUN: ld.lld %t.o -o %t -pie
 // RUN: llvm-readobj -r -S --section-data %t | FileCheck %s
 
+/// -z dynamic-undefined-weak does not affect hidden undefined symbols.
+// RUN: ld.lld %t.o -o %t.so -shared -z dynamic-undefined-weak
+// RUN: llvm-readobj -r -S --section-data %t.so | FileCheck %s
+
 /// This is usually guarded with a comparison. Don't report an error.
 call g
 
diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s
index 497228a3cf905..8d777669b7e16 100644
--- a/lld/test/ELF/weak-undef-rw.s
+++ b/lld/test/ELF/weak-undef-rw.s
@@ -18,9 +18,22 @@
 ## gABI leaves the behavior of weak undefined references implementation defined.
 ## We choose to resolve them statically for static linking and produce dynamic relocations
 ## for dynamic linking (-shared or at least one input DSO).
-##
-## Note: Some ports of GNU ld support -z nodynamic-undefined-weak that we don't
-## implement.
+
+## -z dynamic-undefined-weak is ignored if .dynsym is absent (-no-pie without DSO)
+# RUN: ld.lld a.o -o a.d -z dynamic-undefined-weak 2>&1 | count 0
+# RUN: llvm-readelf -r --hex-dump=.data a.d | FileCheck %s --check-prefix=STATIC
+
+## Currently no effect for S+A relocations.
+# RUN: ld.lld a.o s.so -o as.d -z dynamic-undefined-weak
+# RUN: llvm-readelf -r --hex-dump=.data as.d | FileCheck %s --check-prefix=STATIC
+
+## -z dynamic-undefined-weak forces dynamic relocations if .dynsym is present.
+# RUN: ld.lld a.o -o a.pie.d -pie -z dynamic-undefined-weak
+# RUN: llvm-readelf -r a.pie.d | FileCheck %s --check-prefix=DYN
+
+## -z nodynamic-undefined-weak suppresses dynamic relocations.
+# RUN: ld.lld a.o -o a.so.n -shared -z dynamic-undefined-weak -z nodynamic-undefined-weak
+# RUN: llvm-readelf -r --hex-dump=.data a.so.n | FileCheck %s --check-prefix=STATIC
 
 # STATIC:      no relocations
 # STATIC:      Hex dump of section '.data':

From 9992668404cfb2302f7a62f01884c210642caea1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Thu, 12 Jun 2025 20:47:58 -0700
Subject: [PATCH 320/851] [flang][cuda] Add runtime check for passing device
 arrays (#144003)

---
 flang-rt/lib/cuda/descriptor.cpp              |  8 +++++++
 flang/include/flang/Lower/LoweringOptions.def |  3 +++
 .../Builder/Runtime/CUDA/Descriptor.h         |  5 +++++
 flang/include/flang/Runtime/CUDA/descriptor.h |  4 ++++
 flang/lib/Lower/ConvertCall.cpp               | 14 ++++++++++++
 .../Builder/Runtime/CUDA/Descriptor.cpp       | 15 +++++++++++++
 flang/test/Lower/CUDA/cuda-runtime-check.cuf  | 22 +++++++++++++++++++
 flang/tools/bbc/bbc.cpp                       |  2 ++
 8 files changed, 73 insertions(+)
 create mode 100644 flang/test/Lower/CUDA/cuda-runtime-check.cuf

diff --git a/flang-rt/lib/cuda/descriptor.cpp b/flang-rt/lib/cuda/descriptor.cpp
index 7b768f91af29d..aa75d4eff0511 100644
--- a/flang-rt/lib/cuda/descriptor.cpp
+++ b/flang-rt/lib/cuda/descriptor.cpp
@@ -54,6 +54,14 @@ void RTDEF(CUFSyncGlobalDescriptor)(
   ((Descriptor *)devAddr, (Descriptor *)hostPtr, sourceFile, sourceLine);
 }
 
+void RTDEF(CUFDescriptorCheckSection)(
+    const Descriptor *desc, const char *sourceFile, int sourceLine) {
+  if (desc && !desc->IsContiguous()) {
+    Terminator terminator{sourceFile, sourceLine};
+    terminator.Crash("device array section argument is not contiguous");
+  }
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda
diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def
index b062ea1a805ac..d97abf4d864b8 100644
--- a/flang/include/flang/Lower/LoweringOptions.def
+++ b/flang/include/flang/Lower/LoweringOptions.def
@@ -63,5 +63,8 @@ ENUM_LOWERINGOPT(StackRepackArrays, unsigned, 1, 0)
 /// in the leading dimension.
 ENUM_LOWERINGOPT(RepackArraysWhole, unsigned, 1, 0)
 
+/// If true, CUDA Fortran runtime check is inserted.
+ENUM_LOWERINGOPT(CUDARuntimeCheck, unsigned, 1, 0)
+
 #undef LOWERINGOPT
 #undef ENUM_LOWERINGOPT
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
index 14d262bf22a70..bdeb7574012c6 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h
@@ -26,6 +26,11 @@ namespace fir::runtime::cuda {
 void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
                              mlir::Value hostPtr);
 
+/// Generate runtime call to check the section of a descriptor and raise an
+/// error if it is not contiguous.
+void genDescriptorCheckSection(fir::FirOpBuilder &builder, mlir::Location loc,
+                               mlir::Value desc);
+
 } // namespace fir::runtime::cuda
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
diff --git a/flang/include/flang/Runtime/CUDA/descriptor.h b/flang/include/flang/Runtime/CUDA/descriptor.h
index 0ee7feca10e44..06e4a4649db1b 100644
--- a/flang/include/flang/Runtime/CUDA/descriptor.h
+++ b/flang/include/flang/Runtime/CUDA/descriptor.h
@@ -37,6 +37,10 @@ void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
 void RTDECL(CUFSyncGlobalDescriptor)(
     void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0);
 
+/// Check descriptor passed to a kernel.
+void RTDECL(CUFDescriptorCheckSection)(
+    const Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index 7378118cfef7f..864499e6c3431 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -26,6 +26,7 @@
 #include "flang/Optimizer/Builder/IntrinsicCall.h"
 #include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
 #include "flang/Optimizer/Builder/MutableBox.h"
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
@@ -543,6 +544,19 @@ Fortran::lower::genCallOpAndResult(
   fir::FortranProcedureFlagsEnumAttr procAttrs =
       caller.getProcedureAttrs(builder.getContext());
 
+  if (converter.getLoweringOptions().getCUDARuntimeCheck()) {
+    if (caller.getCallDescription().chevrons().empty()) {
+      for (auto [oper, arg] :
+           llvm::zip(operands, caller.getPassedArguments())) {
+        if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(oper.getType())) {
+          const Fortran::semantics::Symbol *sym = caller.getDummySymbol(arg);
+          if (sym && Fortran::evaluate::IsCUDADeviceSymbol(*sym))
+            fir::runtime::cuda::genDescriptorCheckSection(builder, loc, oper);
+        }
+      }
+    }
+  }
+
   if (!caller.getCallDescription().chevrons().empty()) {
     // A call to a CUDA kernel with the chevron syntax.
 
diff --git a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
index 90662c094c65e..a943469a76728 100644
--- a/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp
@@ -32,3 +32,18 @@ void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder,
       builder, loc, fTy, hostPtr, sourceFile, sourceLine)};
   builder.create<fir::CallOp>(loc, callee, args);
 }
+
+void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder,
+                                                   mlir::Location loc,
+                                                   mlir::Value desc) {
+  mlir::func::FuncOp func =
+      fir::runtime::getRuntimeFunc<mkRTKey(CUFDescriptorCheckSection)>(loc,
+                                                                       builder);
+  auto fTy = func.getFunctionType();
+  mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+  mlir::Value sourceLine =
+      fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+      builder, loc, fTy, desc, sourceFile, sourceLine)};
+  builder.create<fir::CallOp>(loc, func, args);
+}
diff --git a/flang/test/Lower/CUDA/cuda-runtime-check.cuf b/flang/test/Lower/CUDA/cuda-runtime-check.cuf
new file mode 100644
index 0000000000000..f26d372769cab
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-runtime-check.cuf
@@ -0,0 +1,22 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Check insertion of runtime checks
+
+interface
+  subroutine foo(a)
+    real, device, dimension(:,:) :: a
+  end subroutine
+end interface
+
+  real, device, allocatable, dimension(:,:) :: a
+  allocate(a(10,10))
+  call foo(a(1:10,1:10:2))
+end
+
+subroutine foo(a)
+  real, device, dimension(:,:) :: a
+end subroutine
+
+! CHECK-LABEL: func.func @_QQmain()
+! CHECK: fir.call @_FortranACUFDescriptorCheckSection
+! CHECK: fir.call @_QPfoo
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index c80872108ac8f..015c86604a1fd 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -434,6 +434,8 @@ static llvm::LogicalResult convertFortranSourceToMLIR(
   loweringOptions.setStackRepackArrays(stackRepackArrays);
   loweringOptions.setRepackArrays(repackArrays);
   loweringOptions.setRepackArraysWhole(repackArraysWhole);
+  if (enableCUDA)
+    loweringOptions.setCUDARuntimeCheck(true);
   std::vector<Fortran::lower::EnvironmentDefault> envDefaults = {};
   Fortran::frontend::TargetOptions targetOpts;
   Fortran::frontend::CodeGenOptions cgOpts;

From 4268360003e2dc6721469aa5ccab7efbb29dcbfd Mon Sep 17 00:00:00 2001
From: Thirumalai Shaktivel
 <74826228+Thirumalai-Shaktivel@users.noreply.github.com>
Date: Fri, 13 Jun 2025 09:35:48 +0530
Subject: [PATCH 321/851] [Flang] [OpenMP] Allow any type as argument to the
 FlushOp (#143844)

Fixes: #143842
---
 flang/test/Lower/OpenMP/flush02.f90           | 32 +++++++++++++++++++
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  2 +-
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/OpenMP/flush02.f90

diff --git a/flang/test/Lower/OpenMP/flush02.f90 b/flang/test/Lower/OpenMP/flush02.f90
new file mode 100644
index 0000000000000..b372e700e1a1d
--- /dev/null
+++ b/flang/test/Lower/OpenMP/flush02.f90
@@ -0,0 +1,32 @@
+! This test checks lowering of OpenMP Flush Directive.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+module flush02_mod
+    type t1
+       integer(kind=4) :: x = 4
+    end type t1
+
+    type :: t2
+       type(t1) :: y = t1(2)
+    end type t2
+
+
+contains
+
+    subroutine sub01(pt)
+        class(t1), intent(inout) :: pt
+        type(t2)                 :: dt
+        integer, allocatable     :: a(:)
+        integer, pointer         :: b(:)
+
+        ! CHECK: omp.flush({{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+        ! CHECK: omp.flush({{.*}} : !fir.ref<f32>)
+        ! CHECK: omp.flush({{.*}} : !fir.ref<!fir.type<_QMflush02_modTt2{y:!fir.type<_QMflush02_modTt1{x:i32}>}>>)
+        ! CHECK: omp.flush({{.*}} : !fir.class<!fir.type<_QMflush02_modTt1{x:i32}>>)
+        !$omp flush(a)
+        !$omp flush(p)
+        !$omp flush(dt)
+        !$omp flush(pt)
+    end subroutine
+end module flush02_mod
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 036c6a6e350a8..ac80926053a2d 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -889,7 +889,7 @@ def FlushOp : OpenMP_Op<"flush", clauses = [
     specified or implied.
   }] # clausesDescription;
 
-  let arguments = !con((ins Variadic<OpenMP_PointerLikeType>:$varList),
+  let arguments = !con((ins Variadic<AnyType>:$varList),
                        clausesArgs);
 
   // Override inherited assembly format to include `varList`.

From cd573e0a547dba18e2a960967c1f24f124c6cb26 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 12 Jun 2025 22:45:08 -0700
Subject: [PATCH 322/851] [compiler-rt] Remove unused local variables (NFC)
 (#144010)

---
 compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp b/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp
index 93bf817a857b4..c9210c78a0631 100644
--- a/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp
+++ b/compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp
@@ -265,8 +265,6 @@ int CollectDataFlow(const std::string &DFTBinary, const std::string &DirPath,
     // we then request tags in [0,Size/2) and [Size/2, Size), and so on.
     // Function number => DFT.
     auto OutPath = DirPlusFile(DirPath, Hash(FileToVector(F.File)));
-    std::unordered_map<size_t, std::vector<uint8_t>> DFTMap;
-    std::unordered_set<std::string> Cov;
     Command Cmd;
     Cmd.addArgument(DFTBinary);
     Cmd.addArgument(F.File);

From 752538c12cf4b37499f73e1bf05ea421ab055665 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 12 Jun 2025 22:45:15 -0700
Subject: [PATCH 323/851] [llvm-pdbutil] Remove an unused local variable (NFC)
 (#144011)

---
 llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
index 4cb64bdbe8ef9..e50d19a994b6f 100644
--- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
+++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp
@@ -1375,7 +1375,6 @@ static void mergePdbs() {
 }
 
 static void explain() {
-  std::unique_ptr<IPDBSession> Session;
   InputFile IF =
       ExitOnErr(InputFile::open(opts::explain::InputFilename.front(), true));
 

From 054f4a50bb2ec1e535111d779bc5fdc93314c55a Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Thu, 12 Jun 2025 22:45:23 -0700
Subject: [PATCH 324/851] [polly] Remove an unused local variable (NFC)
 (#144012)

---
 polly/lib/Support/RegisterPasses.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/polly/lib/Support/RegisterPasses.cpp b/polly/lib/Support/RegisterPasses.cpp
index 503c3ae1e07c7..56cb8aadce3b6 100644
--- a/polly/lib/Support/RegisterPasses.cpp
+++ b/polly/lib/Support/RegisterPasses.cpp
@@ -541,7 +541,6 @@ static bool
 parseTopLevelPipeline(llvm::ModulePassManager &MPM,
                       PassInstrumentationCallbacks *PIC,
                       ArrayRef<PassBuilder::PipelineElement> Pipeline) {
-  std::vector<PassBuilder::PipelineElement> FullPipeline;
   StringRef FirstName = Pipeline.front().Name;
 
   if (!isScopPassName(FirstName))

From dfc5125946ade289840fa119716957ebce2d31d2 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Thu, 12 Jun 2025 23:00:33 -0700
Subject: [PATCH 325/851] [NVPTX] Consistently check fast-math flags when
 lowering fsqrt (#143776)

Ensure that we check the global, function-level, and instruction-level
flags when considering whether to use `sqrt.rn` or `sqrt.approx` to
lower either `@llvm.sqrt.f32` or `@llvm.nvvm.sqrt.f`
---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp |   4 +-
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h   |   2 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |  24 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h   |   3 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     |   3 -
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    |  35 +-
 llvm/test/CodeGen/NVPTX/fast-math.ll        | 467 ++++++++++++++++----
 llvm/test/CodeGen/NVPTX/sqrt-approx.ll      | 339 +++++++++++---
 8 files changed, 695 insertions(+), 182 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index a20099788d09c..79b1bfbc8072b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -71,8 +71,8 @@ NVPTXDAGToDAGISel::getDivF32Level(const SDNode *N) const {
   return Subtarget->getTargetLowering()->getDivF32Level(*MF, *N);
 }
 
-bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
-  return Subtarget->getTargetLowering()->usePrecSqrtF32();
+bool NVPTXDAGToDAGISel::usePrecSqrtF32(const SDNode *N) const {
+  return Subtarget->getTargetLowering()->usePrecSqrtF32(*MF, N);
 }
 
 bool NVPTXDAGToDAGISel::useF32FTZ() const {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 71a5b7ff8cd30..473f4781a6c38 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -44,7 +44,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool doMulWide;
 
   NVPTX::DivPrecisionLevel getDivF32Level(const SDNode *N) const;
-  bool usePrecSqrtF32() const;
+  bool usePrecSqrtF32(const SDNode *N) const;
   bool useF32FTZ() const;
   bool allowFMA() const;
   bool allowUnsafeFPMath() const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d6a134d9abafd..492f4ab76fdbb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -134,14 +134,23 @@ NVPTXTargetLowering::getDivF32Level(const MachineFunction &MF,
   return NVPTX::DivPrecisionLevel::IEEE754;
 }
 
-bool NVPTXTargetLowering::usePrecSqrtF32() const {
-  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
-    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+bool NVPTXTargetLowering::usePrecSqrtF32(const MachineFunction &MF,
+                                         const SDNode *N) const {
+  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+  if (UsePrecSqrtF32.getNumOccurrences() > 0)
     return UsePrecSqrtF32;
-  } else {
-    // Otherwise, use sqrt.approx if fast math is enabled
-    return !getTargetMachine().Options.UnsafeFPMath;
+
+  // Otherwise, use sqrt.approx if fast math is enabled
+  if (allowUnsafeFPMath(MF))
+    return false;
+
+  if (N) {
+    const SDNodeFlags Flags = N->getFlags();
+    if (Flags.hasApproximateFuncs())
+      return false;
   }
+
+  return true;
 }
 
 bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
@@ -1134,7 +1143,8 @@ SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                              bool &UseOneConst,
                                              bool Reciprocal) const {
   if (!(Enabled == ReciprocalEstimate::Enabled ||
-        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
+        (Enabled == ReciprocalEstimate::Unspecified &&
+         !usePrecSqrtF32(DAG.getMachineFunction()))))
     return SDValue();
 
   if (ExtraSteps == ReciprocalEstimate::Unspecified)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 8d71022a1f102..0a54a8fd71f32 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -225,7 +225,8 @@ class NVPTXTargetLowering : public TargetLowering {
 
   // Get whether we should use a precise or approximate 32-bit floating point
   // sqrt instruction.
-  bool usePrecSqrtF32() const;
+  bool usePrecSqrtF32(const MachineFunction &MF,
+                      const SDNode *N = nullptr) const;
 
   // Get whether we should use instructions that flush floating-point denormals
   // to sign-preserving zero.
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 9ca4e8d20650a..fa521c040e8e5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -151,9 +151,6 @@ def doRsqrtOpt : Predicate<"doRsqrtOpt()">;
 
 def doMulWide      : Predicate<"doMulWide">;
 
-def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
-def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
-
 def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
 def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
 def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index f52ff39c3e1a5..b3c1296cf0ca6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1520,15 +1520,18 @@ def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64", F64RT, F64RT, int_nvvm_sqrt_rz_
 def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64", F64RT, F64RT, int_nvvm_sqrt_rm_d>;
 def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64", F64RT, F64RT, int_nvvm_sqrt_rp_d>;
 
+def fsqrt_approx : PatFrags<(ops node:$a),
+                            [(fsqrt node:$a),
+                             (int_nvvm_sqrt_f node:$a)], [{
+  return !usePrecSqrtF32(N);
+}]>;
+
 // nvvm_sqrt intrinsic
-def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_RN_FTZ_F $a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
-def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_RN_F $a)>, Requires<[do_SQRTF32_RN]>;
-def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
-def : Pat<(int_nvvm_sqrt_f f32:$a),
-          (INT_NVVM_SQRT_APPROX_F $a)>;
+def : Pat<(int_nvvm_sqrt_f f32:$a), (INT_NVVM_SQRT_RN_FTZ_F $a)>, Requires<[doF32FTZ]>;
+def : Pat<(int_nvvm_sqrt_f f32:$a), (INT_NVVM_SQRT_RN_F $a)>;
+
+def : Pat<(fsqrt_approx f32:$a), (INT_NVVM_SQRT_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>;
+def : Pat<(fsqrt_approx f32:$a), (INT_NVVM_SQRT_APPROX_F $a)>;
 
 //
 // Rsqrt
@@ -1551,20 +1554,14 @@ def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_f f32:$a)),
 def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_approx_ftz_f f32:$a)),
          (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
          Requires<[doRsqrtOpt]>;
-// same for int_nvvm_sqrt_f when non-precision sqrt is requested
-def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_F $a)>,
-         Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
-def: Pat<(fdiv f32imm_1, (int_nvvm_sqrt_f f32:$a)),
-         (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
-         Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
 
-def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
+// same for int_nvvm_sqrt_f when non-precision sqrt is requested
+def: Pat<(fdiv f32imm_1, (fsqrt_approx f32:$a)),
          (INT_NVVM_RSQRT_APPROX_F $a)>,
-         Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doNoF32FTZ]>;
-def: Pat<(fdiv f32imm_1, (fsqrt f32:$a)),
+         Requires<[doRsqrtOpt, doNoF32FTZ]>;
+def: Pat<(fdiv f32imm_1, (fsqrt_approx f32:$a)),
          (INT_NVVM_RSQRT_APPROX_FTZ_F $a)>,
-         Requires<[doRsqrtOpt, do_SQRTF32_APPROX, doF32FTZ]>;
+         Requires<[doRsqrtOpt, doF32FTZ]>;
 //
 // Add
 //
diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll
index 4cb6a35e796fb..bc48d242f88fd 100644
--- a/llvm/test/CodeGen/NVPTX/fast-math.ll
+++ b/llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -1,58 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
 declare float @llvm.sqrt.f32(float)
 declare double @llvm.sqrt.f64(double)
 
-; CHECK-LABEL: sqrt_div(
-; CHECK: sqrt.rn.f32
-; CHECK: div.rn.f32
 define float @sqrt_div(float %a, float %b) {
+; CHECK-LABEL: sqrt_div(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_param_0];
+; CHECK-NEXT:    sqrt.rn.f32 %r2, %r1;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_param_1];
+; CHECK-NEXT:    div.rn.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast(
-; CHECK: sqrt.rn.f32
-; CHECK: div.approx.f32
 define float @sqrt_div_fast(float %a, float %b) #0 {
+; CHECK-LABEL: sqrt_div_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_fast_param_1];
+; CHECK-NEXT:    div.approx.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast_ninf(
-; CHECK: sqrt.approx.f32
-; CHECK: div.approx.f32
 define float @sqrt_div_fast_ninf(float %a, float %b) #0 {
+; CHECK-LABEL: sqrt_div_fast_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_ninf_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    abs.f32 %r3, %r1;
+; CHECK-NEXT:    setp.lt.f32 %p1, %r3, 0f00800000;
+; CHECK-NEXT:    selp.f32 %r4, 0f00000000, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r5, [sqrt_div_fast_ninf_param_1];
+; CHECK-NEXT:    div.approx.f32 %r6, %r4, %r5;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_ftz(
-; CHECK: sqrt.rn.ftz.f32
-; CHECK: div.rn.ftz.f32
 define float @sqrt_div_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: sqrt_div_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_ftz_param_0];
+; CHECK-NEXT:    sqrt.rn.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_ftz_param_1];
+; CHECK-NEXT:    div.rn.ftz.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast_ftz(
-; CHECK: sqrt.rn.ftz.f32
-; CHECK: div.approx.ftz.f32
 define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 {
+; CHECK-LABEL: sqrt_div_fast_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_ftz_param_0];
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    ld.param.b32 %r3, [sqrt_div_fast_ftz_param_1];
+; CHECK-NEXT:    div.approx.ftz.f32 %r4, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast_ftz_ninf(
-; CHECK: sqrt.approx.ftz.f32
-; CHECK: div.approx.ftz.f32
 define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #0 #1 {
+; CHECK-LABEL: sqrt_div_fast_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [sqrt_div_fast_ftz_ninf_param_0];
+; CHECK-NEXT:    setp.eq.ftz.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    selp.f32 %r3, 0f00000000, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [sqrt_div_fast_ftz_ninf_param_1];
+; CHECK-NEXT:    div.approx.ftz.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
@@ -61,69 +117,117 @@ define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #0 #1 {
 ; There are no fast-math or ftz versions of sqrt and div for f64.  We use
 ; reciprocal(rsqrt(x)) for sqrt(x), and emit a vanilla divide.
 
-; CHECK-LABEL: sqrt_div_fast_ftz_f64(
-; CHECK: sqrt.rn.f64
-; CHECK: div.rn.f64
 define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 {
+; CHECK-LABEL: sqrt_div_fast_ftz_f64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [sqrt_div_fast_ftz_f64_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [sqrt_div_fast_ftz_f64_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd4, %rd2, %rd3;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
+; CHECK-NEXT:    ret;
   %t1 = tail call double @llvm.sqrt.f64(double %a)
   %t2 = fdiv double %t1, %b
   ret double %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast_ftz_f64_ninf(
-; CHECK: rsqrt.approx.f64
-; CHECK: rcp.approx.ftz.f64
-; CHECK: div.rn.f64
 define double @sqrt_div_fast_ftz_f64_ninf(double %a, double %b) #0 #1 {
+; CHECK-LABEL: sqrt_div_fast_ftz_f64_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [sqrt_div_fast_ftz_f64_ninf_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd2, 0d0010000000000000;
+; CHECK-NEXT:    rsqrt.approx.f64 %rd3, %rd1;
+; CHECK-NEXT:    rcp.approx.ftz.f64 %rd4, %rd3;
+; CHECK-NEXT:    selp.f64 %rd5, 0d0000000000000000, %rd4, %p1;
+; CHECK-NEXT:    ld.param.b64 %rd6, [sqrt_div_fast_ftz_f64_ninf_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd7, %rd5, %rd6;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
+; CHECK-NEXT:    ret;
   %t1 = tail call ninf afn double @llvm.sqrt.f64(double %a)
   %t2 = fdiv double %t1, %b
   ret double %t2
 }
 
-; CHECK-LABEL: rsqrt(
-; CHECK-NOT: rsqrt.approx
-; CHECK: sqrt.rn.f32
-; CHECK-NOT: rsqrt.approx
 define float @rsqrt(float %a) {
+; CHECK-LABEL: rsqrt(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [rsqrt_param_0];
+; CHECK-NEXT:    sqrt.rn.f32 %r2, %r1;
+; CHECK-NEXT:    rcp.rn.f32 %r3, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %b = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %b
   ret float %ret
 }
 
-; CHECK-LABEL: rsqrt_fast(
-; CHECK-NOT: div.
-; CHECK-NOT: sqrt.
-; CHECK: rsqrt.approx.f32
-; CHECK-NOT: div.
-; CHECK-NOT: sqrt.
 define float @rsqrt_fast(float %a) #0 {
+; CHECK-LABEL: rsqrt_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [rsqrt_fast_param_0];
+; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %b = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %b
   ret float %ret
 }
 
-; CHECK-LABEL: rsqrt_fast_ftz(
-; CHECK-NOT: div.
-; CHECK-NOT: sqrt.
-; CHECK: rsqrt.approx.ftz.f32
-; CHECK-NOT: div.
-; CHECK-NOT: sqrt.
 define float @rsqrt_fast_ftz(float %a) #0 #1 {
+; CHECK-LABEL: rsqrt_fast_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [rsqrt_fast_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %b = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %b
   ret float %ret
 }
 
-; CHECK-LABEL: fadd
-; CHECK: add.rn.f32
 define float @fadd(float %a, float %b) {
+; CHECK-LABEL: fadd(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fadd_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fadd_param_1];
+; CHECK-NEXT:    add.rn.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %t1 = fadd float %a, %b
   ret float %t1
 }
 
-; CHECK-LABEL: fadd_ftz
-; CHECK: add.rn.ftz.f32
 define float @fadd_ftz(float %a, float %b) #1 {
+; CHECK-LABEL: fadd_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fadd_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [fadd_ftz_param_1];
+; CHECK-NEXT:    add.rn.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %t1 = fadd float %a, %b
   ret float %t1
 }
@@ -131,41 +235,83 @@ define float @fadd_ftz(float %a, float %b) #1 {
 declare float @llvm.sin.f32(float)
 declare float @llvm.cos.f32(float)
 
-; CHECK-LABEL: fsin_approx_afn
-; CHECK:       sin.approx.f32
 define float @fsin_approx_afn(float %a) {
+; CHECK-LABEL: fsin_approx_afn(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fsin_approx_afn_param_0];
+; CHECK-NEXT:    sin.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = tail call afn float @llvm.sin.f32(float %a)
   ret float %r
 }
 
-; CHECK-LABEL: fcos_approx_afn
-; CHECK:       cos.approx.f32
 define float @fcos_approx_afn(float %a) {
+; CHECK-LABEL: fcos_approx_afn(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fcos_approx_afn_param_0];
+; CHECK-NEXT:    cos.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = tail call afn float @llvm.cos.f32(float %a)
   ret float %r
 }
 
-; CHECK-LABEL: fsin_approx
-; CHECK:       sin.approx.f32
 define float @fsin_approx(float %a) #0 {
+; CHECK-LABEL: fsin_approx(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fsin_approx_param_0];
+; CHECK-NEXT:    sin.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = tail call float @llvm.sin.f32(float %a)
   ret float %r
 }
 
-; CHECK-LABEL: fcos_approx
-; CHECK:       cos.approx.f32
 define float @fcos_approx(float %a) #0 {
+; CHECK-LABEL: fcos_approx(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [fcos_approx_param_0];
+; CHECK-NEXT:    cos.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %r = tail call float @llvm.cos.f32(float %a)
   ret float %r
 }
 
-; CHECK-LABEL: repeated_div_recip_allowed
 define float @repeated_div_recip_allowed(i1 %pred, float %a, float %b, float %divisor) {
-; CHECK: rcp.rn.f32
-; CHECK: mul.rn.f32
-; CHECK: mul.rn.f32
-; CHECK: mul.rn.f32
-; CHECK: selp.f32
+; CHECK-LABEL: repeated_div_recip_allowed(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_param_3];
+; CHECK-NEXT:    rcp.rn.f32 %r3, %r2;
+; CHECK-NEXT:    mul.rn.f32 %r4, %r1, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [repeated_div_recip_allowed_param_2];
+; CHECK-NEXT:    mul.rn.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    mul.rn.f32 %r7, %r4, %r6;
+; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
   %z = fmul float %x, %y
@@ -173,23 +319,51 @@ define float @repeated_div_recip_allowed(i1 %pred, float %a, float %b, float %di
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_recip_allowed_sel
 define float @repeated_div_recip_allowed_sel(i1 %pred, float %a, float %b, float %divisor) {
-; CHECK: selp.f32
-; CHECK: div.rn.f32
+; CHECK-LABEL: repeated_div_recip_allowed_sel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_sel_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_sel_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_sel_param_2];
+; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_recip_allowed_sel_param_3];
+; CHECK-NEXT:    div.rn.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_recip_allowed_ftz
 define float @repeated_div_recip_allowed_ftz(i1 %pred, float %a, float %b, float %divisor) #1 {
-; CHECK: rcp.rn.ftz.f32
-; CHECK: mul.rn.ftz.f32
-; CHECK: mul.rn.ftz.f32
-; CHECK: mul.rn.ftz.f32
-; CHECK: selp.f32
+; CHECK-LABEL: repeated_div_recip_allowed_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_ftz_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_param_3];
+; CHECK-NEXT:    rcp.rn.ftz.f32 %r3, %r2;
+; CHECK-NEXT:    mul.rn.ftz.f32 %r4, %r1, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [repeated_div_recip_allowed_ftz_param_2];
+; CHECK-NEXT:    mul.rn.ftz.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    mul.rn.ftz.f32 %r7, %r4, %r6;
+; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
   %z = fmul float %x, %y
@@ -197,23 +371,51 @@ define float @repeated_div_recip_allowed_ftz(i1 %pred, float %a, float %b, float
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_recip_allowed_ftz_sel
 define float @repeated_div_recip_allowed_ftz_sel(i1 %pred, float %a, float %b, float %divisor) #1 {
-; CHECK: selp.f32
-; CHECK: div.rn.ftz.f32
+; CHECK-LABEL: repeated_div_recip_allowed_ftz_sel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_recip_allowed_ftz_sel_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_recip_allowed_ftz_sel_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_recip_allowed_ftz_sel_param_2];
+; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_recip_allowed_ftz_sel_param_3];
+; CHECK-NEXT:    div.rn.ftz.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %x = fdiv arcp float %a, %divisor
   %y = fdiv arcp float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_fast
 define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0 {
-; CHECK: rcp.approx.f32
-; CHECK: mul.f32
-; CHECK: mul.f32
-; CHECK: mul.f32
-; CHECK: selp.f32
+; CHECK-LABEL: repeated_div_fast(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_param_3];
+; CHECK-NEXT:    rcp.approx.f32 %r3, %r2;
+; CHECK-NEXT:    mul.f32 %r4, %r1, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [repeated_div_fast_param_2];
+; CHECK-NEXT:    mul.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    mul.f32 %r7, %r4, %r6;
+; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %x = fdiv float %a, %divisor
   %y = fdiv float %b, %divisor
   %z = fmul float %x, %y
@@ -221,23 +423,51 @@ define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_fast_sel
 define float @repeated_div_fast_sel(i1 %pred, float %a, float %b, float %divisor) #0 {
-; CHECK: selp.f32
-; CHECK: div.approx.f32
+; CHECK-LABEL: repeated_div_fast_sel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_sel_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_sel_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_sel_param_2];
+; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_fast_sel_param_3];
+; CHECK-NEXT:    div.approx.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %x = fdiv float %a, %divisor
   %y = fdiv float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_fast_ftz
 define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor) #0 #1 {
-; CHECK: rcp.approx.ftz.f32
-; CHECK: mul.ftz.f32
-; CHECK: mul.ftz.f32
-; CHECK: mul.ftz.f32
-; CHECK: selp.f32
+; CHECK-LABEL: repeated_div_fast_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_ftz_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_ftz_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_ftz_param_3];
+; CHECK-NEXT:    rcp.approx.ftz.f32 %r3, %r2;
+; CHECK-NEXT:    mul.ftz.f32 %r4, %r1, %r3;
+; CHECK-NEXT:    ld.param.b32 %r5, [repeated_div_fast_ftz_param_2];
+; CHECK-NEXT:    mul.ftz.f32 %r6, %r5, %r3;
+; CHECK-NEXT:    mul.ftz.f32 %r7, %r4, %r6;
+; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %x = fdiv float %a, %divisor
   %y = fdiv float %b, %divisor
   %z = fmul float %x, %y
@@ -245,33 +475,80 @@ define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor
   ret float %w
 }
 
-; CHECK-LABEL: repeated_div_fast_ftz_sel
 define float @repeated_div_fast_ftz_sel(i1 %pred, float %a, float %b, float %divisor) #0 #1 {
-; CHECK: selp.f32
-; CHECK: div.approx.ftz.f32
+; CHECK-LABEL: repeated_div_fast_ftz_sel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b8 %rs1, [repeated_div_fast_ftz_sel_param_0];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.ne.b16 %p1, %rs2, 0;
+; CHECK-NEXT:    ld.param.b32 %r1, [repeated_div_fast_ftz_sel_param_1];
+; CHECK-NEXT:    ld.param.b32 %r2, [repeated_div_fast_ftz_sel_param_2];
+; CHECK-NEXT:    selp.f32 %r3, %r1, %r2, %p1;
+; CHECK-NEXT:    ld.param.b32 %r4, [repeated_div_fast_ftz_sel_param_3];
+; CHECK-NEXT:    div.approx.ftz.f32 %r5, %r3, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
+; CHECK-NEXT:    ret;
   %x = fdiv float %a, %divisor
   %y = fdiv float %b, %divisor
   %w = select i1 %pred, float %x, float %y
   ret float %w
 }
 
-; CHECK-LABEL: frem
 define float @frem(float %a, float %b) #0 {
-  ; CHECK-NOT: testp.infinite
+; CHECK-LABEL: frem(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [frem_param_1];
+; CHECK-NEXT:    div.approx.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.f32 %r6, %r5, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %rem = frem float %a, %b
   ret float %rem
 }
 
-; CHECK-LABEL: frem_ftz
 define float @frem_ftz(float %a, float %b) #0 #1 {
-  ; CHECK-NOT: testp.infinite
+; CHECK-LABEL: frem_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [frem_ftz_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [frem_ftz_param_1];
+; CHECK-NEXT:    div.approx.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %r4, %r3;
+; CHECK-NEXT:    neg.ftz.f32 %r5, %r4;
+; CHECK-NEXT:    fma.rn.ftz.f32 %r6, %r5, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %rem = frem float %a, %b
   ret float %rem
 }
 
-; CHECK-LABEL: frem_f64
 define double @frem_f64(double %a, double %b) #0 {
-  ; CHECK-NOT: testp.infinite
+; CHECK-LABEL: frem_f64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [frem_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [frem_f64_param_1];
+; CHECK-NEXT:    div.rn.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    cvt.rzi.f64.f64 %rd4, %rd3;
+; CHECK-NEXT:    neg.f64 %rd5, %rd4;
+; CHECK-NEXT:    fma.rn.f64 %rd6, %rd5, %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
   %rem = frem double %a, %b
   ret double %rem
 }
diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
index e2a3f1cf0d2de..a28d264cd8ec0 100644
--- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
 ; RUN:   | FileCheck %s
 ; RUN: %if ptxas %{                                                                   \
@@ -12,34 +13,62 @@ declare double @llvm.sqrt.f64(double)
 
 ; -- reciprocal sqrt --
 
-; CHECK-LABEL: test_rsqrt32
 define float @test_rsqrt32(float %a) #0 {
-; CHECK: rsqrt.approx.f32
+; CHECK-LABEL: test_rsqrt32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rsqrt32_param_0];
+; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %val
   ret float %ret
 }
 
-; CHECK-LABEL: test_rsqrt_ftz
 define float @test_rsqrt_ftz(float %a) #0 #1 {
-; CHECK: rsqrt.approx.ftz.f32
+; CHECK-LABEL: test_rsqrt_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rsqrt_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %val
   ret float %ret
 }
 
-; CHECK-LABEL: test_rsqrt64
 define double @test_rsqrt64(double %a) #0 {
-; CHECK: rsqrt.approx.f64
+; CHECK-LABEL: test_rsqrt64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rsqrt64_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
   %ret = fdiv double 1.0, %val
   ret double %ret
 }
 
-; CHECK-LABEL: test_rsqrt64_ftz
-define double @test_rsqrt64_ftz(double %a) #0 #1 {
 ; There's no rsqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
-; CHECK: rsqrt.approx.f64
+define double @test_rsqrt64_ftz(double %a) #0 #1 {
+; CHECK-LABEL: test_rsqrt64_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rsqrt64_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
   %ret = fdiv double 1.0, %val
   ret double %ret
@@ -47,64 +76,135 @@ define double @test_rsqrt64_ftz(double %a) #0 #1 {
 
 ; -- sqrt --
 
-; CHECK-LABEL: test_sqrt32
 define float @test_sqrt32(float %a) #0 {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_sqrt32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_ninf
 define float @test_sqrt32_ninf(float %a) #0 {
-; CHECK: sqrt.approx.f32
+; CHECK-LABEL: test_sqrt32_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_ninf_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    abs.f32 %r3, %r1;
+; CHECK-NEXT:    setp.lt.f32 %p1, %r3, 0f00800000;
+; CHECK-NEXT:    selp.f32 %r4, 0f00000000, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt_ftz
 define float @test_sqrt_ftz(float %a) #0 #1 {
-; CHECK: sqrt.rn.ftz.f32
+; CHECK-LABEL: test_sqrt_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_ftz_param_0];
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt_ftz_ninf
 define float @test_sqrt_ftz_ninf(float %a) #0 #1 {
-; CHECK: sqrt.approx.ftz.f32
+; CHECK-LABEL: test_sqrt_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_ftz_ninf_param_0];
+; CHECK-NEXT:    setp.eq.ftz.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    selp.f32 %r3, 0f00000000, %r2, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt64
 define double @test_sqrt64(double %a) #0 {
-; CHECK: sqrt.rn.f64
+; CHECK-LABEL: test_sqrt64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_ninf
-define double @test_sqrt64_ninf(double %a) #0 {
 ; There's no sqrt.approx.f64 instruction; we emit
 ; reciprocal(rsqrt.approx.f64(x)).  There's no non-ftz approximate reciprocal,
 ; so we just use the ftz version.
-; CHECK: rsqrt.approx.f64
-; CHECK: rcp.approx.ftz.f64
+define double @test_sqrt64_ninf(double %a) #0 {
+; CHECK-LABEL: test_sqrt64_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_ninf_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd2, 0d0010000000000000;
+; CHECK-NEXT:    rsqrt.approx.f64 %rd3, %rd1;
+; CHECK-NEXT:    rcp.approx.ftz.f64 %rd4, %rd3;
+; CHECK-NEXT:    selp.f64 %rd5, 0d0000000000000000, %rd4, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd5;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_ftz
 define double @test_sqrt64_ftz(double %a) #0 #1 {
-; CHECK: sqrt.rn.f64
+; CHECK-LABEL: test_sqrt64_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_ftz_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_ftz_ninf
-define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
 ; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
-; CHECK: rsqrt.approx.f64
-; CHECK: rcp.approx.ftz.f64
+define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
+; CHECK-LABEL: test_sqrt64_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_ftz_ninf_param_0];
+; CHECK-NEXT:    abs.f64 %rd2, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd2, 0d0010000000000000;
+; CHECK-NEXT:    rsqrt.approx.f64 %rd3, %rd1;
+; CHECK-NEXT:    rcp.approx.ftz.f64 %rd4, %rd3;
+; CHECK-NEXT:    selp.f64 %rd5, 0d0000000000000000, %rd4, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd5;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
@@ -114,93 +214,224 @@ define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
 ; The sqrt and rsqrt refinement algorithms both emit an rsqrt.approx, followed
 ; by some math.
 
-; CHECK-LABEL: test_rsqrt32_refined
 define float @test_rsqrt32_refined(float %a) #0 #2 {
-; CHECK: rsqrt.approx.f32
+; CHECK-LABEL: test_rsqrt32_refined(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rsqrt32_refined_param_0];
+; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    mul.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    fma.rn.f32 %r4, %r3, %r2, 0fC0400000;
+; CHECK-NEXT:    mul.f32 %r5, %r2, 0fBF000000;
+; CHECK-NEXT:    mul.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %val
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_refined
 define float @test_sqrt32_refined(float %a) #0 #2 {
-; CHECK: sqrt.rn.f32
+; CHECK-LABEL: test_sqrt32_refined(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_refined_param_0];
+; CHECK-NEXT:    sqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_refined_ninf
 define float @test_sqrt32_refined_ninf(float %a) #0 #2 {
-; CHECK: rsqrt.approx.f32
+; CHECK-LABEL: test_sqrt32_refined_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_refined_ninf_param_0];
+; CHECK-NEXT:    rsqrt.approx.f32 %r2, %r1;
+; CHECK-NEXT:    mul.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    fma.rn.f32 %r4, %r3, %r2, 0fC0400000;
+; CHECK-NEXT:    mul.f32 %r5, %r3, 0fBF000000;
+; CHECK-NEXT:    mul.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    abs.f32 %r7, %r1;
+; CHECK-NEXT:    setp.lt.f32 %p1, %r7, 0f00800000;
+; CHECK-NEXT:    selp.f32 %r8, 0f00000000, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_rsqrt64_refined
 define double @test_rsqrt64_refined(double %a) #0 #2 {
-; CHECK: rsqrt.approx.f64
+; CHECK-LABEL: test_rsqrt64_refined(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rsqrt64_refined_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, 0dC008000000000000;
+; CHECK-NEXT:    mul.f64 %rd5, %rd2, 0dBFE0000000000000;
+; CHECK-NEXT:    mul.f64 %rd6, %rd5, %rd4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
   %ret = fdiv double 1.0, %val
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_refined
 define double @test_sqrt64_refined(double %a) #0 #2 {
-; CHECK: sqrt.rn.f64
+; CHECK-LABEL: test_sqrt64_refined(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_refined_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_refined_ninf
 define double @test_sqrt64_refined_ninf(double %a) #0 #2 {
-; CHECK: rsqrt.approx.f64
+; CHECK-LABEL: test_sqrt64_refined_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_refined_ninf_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, 0dC008000000000000;
+; CHECK-NEXT:    mul.f64 %rd5, %rd3, 0dBFE0000000000000;
+; CHECK-NEXT:    mul.f64 %rd6, %rd5, %rd4;
+; CHECK-NEXT:    abs.f64 %rd7, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd7, 0d0010000000000000;
+; CHECK-NEXT:    selp.f64 %rd8, 0d0000000000000000, %rd6, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd8;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
 ; -- refined sqrt and rsqrt with ftz enabled --
 
-; CHECK-LABEL: test_rsqrt32_refined_ftz
 define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 {
-; CHECK: rsqrt.approx.ftz.f32
+; CHECK-LABEL: test_rsqrt32_refined_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rsqrt32_refined_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    mul.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    fma.rn.ftz.f32 %r4, %r3, %r2, 0fC0400000;
+; CHECK-NEXT:    mul.ftz.f32 %r5, %r2, 0fBF000000;
+; CHECK-NEXT:    mul.ftz.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
   %ret = fdiv float 1.0, %val
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_refined_ftz
 define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 {
-; CHECK: sqrt.rn.ftz.f32
+; CHECK-LABEL: test_sqrt32_refined_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_refined_ftz_param_0];
+; CHECK-NEXT:    sqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_sqrt32_refined_ftz_ninf
 define float @test_sqrt32_refined_ftz_ninf(float %a) #0 #1 #2 {
-; CHECK: rsqrt.approx.ftz.f32
+; CHECK-LABEL: test_sqrt32_refined_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt32_refined_ftz_ninf_param_0];
+; CHECK-NEXT:    rsqrt.approx.ftz.f32 %r2, %r1;
+; CHECK-NEXT:    mul.ftz.f32 %r3, %r1, %r2;
+; CHECK-NEXT:    fma.rn.ftz.f32 %r4, %r3, %r2, 0fC0400000;
+; CHECK-NEXT:    mul.ftz.f32 %r5, %r3, 0fBF000000;
+; CHECK-NEXT:    mul.ftz.f32 %r6, %r5, %r4;
+; CHECK-NEXT:    setp.eq.ftz.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT:    selp.f32 %r7, 0f00000000, %r6, %p1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
-; CHECK-LABEL: test_rsqrt64_refined_ftz
-define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
 ; There's no rsqrt.approx.ftz.f64, so we just use the non-ftz version.
-; CHECK: rsqrt.approx.f64
+define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
+; CHECK-LABEL: test_rsqrt64_refined_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rsqrt64_refined_ftz_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, 0dC008000000000000;
+; CHECK-NEXT:    mul.f64 %rd5, %rd2, 0dBFE0000000000000;
+; CHECK-NEXT:    mul.f64 %rd6, %rd5, %rd4;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
   %ret = fdiv double 1.0, %val
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_refined_ftz
 define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 {
-; CHECK: sqrt.rn.f64
+; CHECK-LABEL: test_sqrt64_refined_ftz(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_refined_ftz_param_0];
+; CHECK-NEXT:    sqrt.rn.f64 %rd2, %rd1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
-; CHECK-LABEL: test_sqrt64_refined_ftz_ninf
 define double @test_sqrt64_refined_ftz_ninf(double %a) #0 #1 #2 {
-; CHECK: rsqrt.approx.f64
+; CHECK-LABEL: test_sqrt64_refined_ftz_ninf(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt64_refined_ftz_ninf_param_0];
+; CHECK-NEXT:    rsqrt.approx.f64 %rd2, %rd1;
+; CHECK-NEXT:    mul.f64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, 0dC008000000000000;
+; CHECK-NEXT:    mul.f64 %rd5, %rd3, 0dBFE0000000000000;
+; CHECK-NEXT:    mul.f64 %rd6, %rd5, %rd4;
+; CHECK-NEXT:    abs.f64 %rd7, %rd1;
+; CHECK-NEXT:    setp.lt.f64 %p1, %rd7, 0d0010000000000000;
+; CHECK-NEXT:    selp.f64 %rd8, 0d0000000000000000, %rd6, %p1;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd8;
+; CHECK-NEXT:    ret;
   %ret = tail call ninf afn double @llvm.sqrt.f64(double %a)
   ret double %ret
 }

From 432d06ab919ae18c4ed1e94148448501578a6c85 Mon Sep 17 00:00:00 2001
From: Saiyedul Islam <Saiyedul.Islam@amd.com>
Date: Fri, 13 Jun 2025 11:33:52 +0530
Subject: [PATCH 326/851] [NFC][AMDGPU] Fix stale links to ROCm repositories
 (#143949)

Following GitHub organizations were merged into the ROCm org:
  * ROCm-Developer-Tools
  * RadeonOpenCompute
  * ROCmSoftwarePlatform

Ensure that all hyperlinks to the old organizations now point to the new
organization at https://github.com/ROCm.
---
 clang/docs/HIPSupport.rst                          | 14 +++++++-------
 ...GPUDwarfExtensionsForHeterogeneousDebugging.rst |  2 +-
 llvm/docs/AMDGPUUsage.rst                          |  4 ++--
 llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll     |  2 +-
 mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h    |  4 ++--
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index 051a253969943..406e1c8e5a2fe 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -17,7 +17,7 @@
 HIP Support
 =============
 
-HIP (Heterogeneous-Compute Interface for Portability) `<https://github.com/ROCm-Developer-Tools/HIP>`_ is
+HIP (Heterogeneous-Compute Interface for Portability) `<https://github.com/ROCm/HIP>`_ is
 a C++ Runtime API and Kernel Language. It enables developers to create portable applications for
 offloading computation to different hardware platforms from a single source code.
 
@@ -41,9 +41,9 @@ backend or the out-of-tree LLVM-SPIRV translator. The SPIR-V is then bundled and
 .. note::
    While Clang does not directly provide HIP support for NVIDIA GPUs and CPUs, these platforms are supported via other means:
 
-   - NVIDIA GPUs: HIP support is offered through the HIP project `<https://github.com/ROCm-Developer-Tools/HIP>`_, which provides a header-only library for translating HIP runtime APIs into CUDA runtime APIs. The code is subsequently compiled using NVIDIA's `nvcc`.
+   - NVIDIA GPUs: HIP support is offered through the HIP project `<https://github.com/ROCm/HIP>`_, which provides a header-only library for translating HIP runtime APIs into CUDA runtime APIs. The code is subsequently compiled using NVIDIA's `nvcc`.
 
-   - CPUs: HIP support is available through the HIP-CPU runtime library `<https://github.com/ROCm-Developer-Tools/HIP-CPU>`_. This header-only library enables CPUs to execute unmodified HIP code.
+   - CPUs: HIP support is available through the HIP-CPU runtime library `<https://github.com/ROCm/HIP-CPU>`_. This header-only library enables CPUs to execute unmodified HIP code.
 
 
 Example Usage
@@ -328,7 +328,7 @@ The `parallel_unsequenced_policy <https://en.cppreference.com/w/cpp/algorithm/ex
 maps relatively well to the execution model of AMD GPUs. This, coupled with the
 the availability and maturity of GPU accelerated algorithm libraries that
 implement most / all corresponding algorithms in the standard library
-(e.g. `rocThrust <https://github.com/ROCmSoftwarePlatform/rocThrust>`__), makes
+(e.g. `rocThrust <https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust>`__), makes
 it feasible to provide seamless accelerator offload for supported algorithms,
 when an accelerated version exists. Thus, it becomes possible to easily access
 the computational resources of an AMD accelerator, via a well specified,
@@ -483,7 +483,7 @@ such as GPUs, work.
      allocation / deallocation functions with accelerator-aware equivalents,
      based on a pre-established table; the list of functions that can be
      interposed is available
-     `here <https://github.com/ROCmSoftwarePlatform/roc-stdpar#allocation--deallocation-interposition-status>`__;
+     `here <https://github.com/ROCm/roc-stdpar#allocation--deallocation-interposition-status>`__;
    - This is only run when compiling for the host.
 
 The second pass is optional.
@@ -627,7 +627,7 @@ Linux operating system. Support is synthesised in the following table:
 The minimum Linux kernel version for running in HMM mode is 6.4.
 
 The forwarding header can be obtained from
-`its GitHub repository <https://github.com/ROCmSoftwarePlatform/roc-stdpar>`_.
+`its GitHub repository <https://github.com/ROCm/roc-stdpar>`_.
 It will be packaged with a future `ROCm <https://rocm.docs.amd.com/en/latest/>`_
 release. Because accelerated algorithms are provided via
 `rocThrust <https://rocm.docs.amd.com/projects/rocThrust/en/latest/>`_, a
@@ -636,7 +636,7 @@ transitive dependency on
 can be obtained either by installing their associated components of the
 `ROCm <https://rocm.docs.amd.com/en/latest/>`_ stack, or from their respective
 repositories. The list algorithms that can be offloaded is available
-`here <https://github.com/ROCmSoftwarePlatform/roc-stdpar#algorithm-support-status>`_.
+`here <https://github.com/ROCm/roc-stdpar#algorithm-support-status>`_.
 
 HIP Specific Elements
 ---------------------
diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
index 0249c580964a0..95ae4f74e0ead 100644
--- a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
+++ b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
@@ -5323,7 +5323,7 @@ D. References
 
     .. _amdgpu-dwarf-AMD-ROCgdb:
 
-2.  [AMD-ROCgdb] `AMD ROCm Debugger (ROCgdb) <https://github.com/ROCm-Developer-Tools/ROCgdb>`__
+2.  [AMD-ROCgdb] `AMD ROCm Debugger (ROCgdb) <https://github.com/ROCm/ROCgdb>`__
 
     .. _amdgpu-dwarf-AMD-ROCm:
 
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 174a497c51b26..3aa8773fa506b 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -18435,8 +18435,8 @@ Additional Documentation
 .. [AMD-RADEON-HD-5000] `AMD Evergreen shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_Evergreen-Family_Instruction_Set_Architecture.pdf>`__
 .. [AMD-RADEON-HD-6000] `AMD Cayman/Trinity shader ISA <http://developer.amd.com/wordpress/media/2012/10/AMD_HD_6900_Series_Instruction_Set_Architecture.pdf>`__
 .. [AMD-ROCm] `AMD ROCm™ Platform <https://rocmdocs.amd.com/>`__
-.. [AMD-ROCm-github] `AMD ROCm™ github <http://github.com/RadeonOpenCompute>`__
-.. [AMD-ROCm-Release-Notes] `AMD ROCm Release Notes <https://github.com/RadeonOpenCompute/ROCm>`__
+.. [AMD-ROCm-github] `AMD ROCm™ github <http://github.com/ROCm>`__
+.. [AMD-ROCm-Release-Notes] `AMD ROCm Release Notes <https://github.com/ROCm/ROCm>`__
 .. [CLANG-ATTR] `Attributes in Clang <https://clang.llvm.org/docs/AttributeReference.html>`__
 .. [DWARF] `DWARF Debugging Information Format <http://dwarfstd.org/>`__
 .. [ELF] `Executable and Linkable Format (ELF) <http://www.sco.com/developers/gabi/>`__
diff --git a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
index 049142732aa15..a84e261357de0 100644
--- a/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
+++ b/llvm/test/Analysis/KernelInfo/openmp/amdgpu.ll
@@ -182,7 +182,7 @@ attributes #5 = { nounwind }
 !10 = !{i32 7, !"frame-pointer", i32 2}
 !11 = !{i32 4, !"amdgpu_hostcall", i32 1}
 !12 = !{!"clang version 20.0.0git (/tmp/llvm/clang b9447c03a9ef2eed55b685a33511df86f7f94e89)"}
-!13 = !{!"AMD clang version 17.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"}
+!13 = !{!"AMD clang version 17.0.0 (https://github.com/ROCm/llvm-project roc-6.0.2 24012 af27734ed982b52a9f1be0f035ac91726fc697e4)"}
 !14 = !{i32 2, i32 0}
 !15 = distinct !DISubprogram(name: "__omp_offloading_fd02_727e9_h_l12_debug__", scope: !16, file: !16, line: 13, type: !17, scopeLine: 13, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !22)
 !16 = !DIFile(filename: "test.c", directory: "/tmp")
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
index c2a82ffc1c43c..ce1fe5a03c494 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
@@ -13,8 +13,8 @@
 // pointed to here. However the following links contain more information about
 // ROCDL (ROCm-Device-Library)
 //
-// https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/amd-stg-open/doc/OCML.md
-// https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/amd-stg-open/doc/OCKL.md
+// https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/doc/OCML.md
+// https://github.com/ROCm/llvm-project/blob/amd-staging/amd/device-libs/doc/OCKL.md
 // https://llvm.org/docs/AMDGPUUsage.html
 //
 //===----------------------------------------------------------------------===//

From 4e80a033a1bade55bca8a32e267cf1b06d05b1ed Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Thu, 12 Jun 2025 23:09:55 -0700
Subject: [PATCH 327/851] [NVPTX] Use prmt.f4e to lower pointer alignment fshr
 idiom (#143407)

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td |  4 ++++
 llvm/test/CodeGen/NVPTX/prmt.ll         | 21 +++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index fa521c040e8e5..4c3501df57f84 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1621,6 +1621,10 @@ let hasSideEffects = false in {
 
 }
 
+// PRMT folding patterns
+def : Pat<(fshr i32:$hi, i32:$lo, (shl i32:$amt, (i32 3))),
+          (PRMT_B32rrr $lo, $hi, $amt, PrmtF4E)>;
+
 
 // byte extraction + signed/unsigned extension to i32.
 def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)),
diff --git a/llvm/test/CodeGen/NVPTX/prmt.ll b/llvm/test/CodeGen/NVPTX/prmt.ll
index 271e4c86cd23e..48b9eefb9fb30 100644
--- a/llvm/test/CodeGen/NVPTX/prmt.ll
+++ b/llvm/test/CodeGen/NVPTX/prmt.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -verify-machineinstrs | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -verify-machineinstrs | %ptxas-verify %}
+; RUN: llc < %s -verify-machineinstrs -mcpu=sm_50 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -verify-machineinstrs -mcpu=sm_50 | %ptxas-verify %}
 
 target triple = "nvptx64-nvidia-cuda"
 
@@ -111,3 +111,20 @@ define i32 @test_prmt_rc16(i32 %lo, i32 %selector) {
   %val = call i32 @llvm.nvvm.prmt.rc16(i32 %lo, i32 %selector)
   ret i32 %val
 }
+
+define i32 @test_prmt_f4e_folding(i32 %lo, i32 %hi, i32 %ptr) {
+; CHECK-LABEL: test_prmt_f4e_folding(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b32 %r1, [test_prmt_f4e_folding_param_0];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_prmt_f4e_folding_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [test_prmt_f4e_folding_param_2];
+; CHECK-NEXT:    prmt.b32.f4e %r4, %r1, %r2, %r3;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
+; CHECK-NEXT:    ret;
+  %sh_amt = shl i32 %ptr, 3
+  %val = call i32 @llvm.fshr.i32(i32 %hi, i32 %lo, i32 %sh_amt)
+  ret i32 %val
+}

From f64b3bb276e820f00911dbf6ecc484751daeb5f1 Mon Sep 17 00:00:00 2001
From: Adam Siemieniuk <adam.siemieniuk@intel.com>
Date: Fri, 13 Jun 2025 08:21:56 +0200
Subject: [PATCH 328/851] [mlir][llvm] Op interface LLVM converter (#143922)

Adds a utility conversion class for rewriting op interface instances
targeting LLVM dialect.
---
 .../mlir/Conversion/LLVMCommon/Pattern.h      | 45 +++++++++++++++++++
 .../AMX/Transforms/LegalizeForLLVMExport.cpp  | 15 ++-----
 .../Transforms/LegalizeForLLVMExport.cpp      | 15 ++-----
 3 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
index 7e946495e3e7f..503a2a7e6f0cd 100644
--- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
+++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h
@@ -92,6 +92,10 @@ class ConvertToLLVMPattern : public ConversionPattern {
                        PatternBenefit benefit = 1);
 
 protected:
+  /// See `ConversionPattern::ConversionPattern` for information on the other
+  /// available constructors.
+  using ConversionPattern::ConversionPattern;
+
   /// Returns the LLVM dialect.
   LLVM::LLVMDialect &getDialect() const;
 
@@ -234,6 +238,47 @@ class ConvertOpToLLVMPattern : public ConvertToLLVMPattern {
   using ConvertToLLVMPattern::matchAndRewrite;
 };
 
+/// Utility class for operation conversions targeting the LLVM dialect that
+/// allows for matching and rewriting against an instance of an OpInterface
+/// class.
+template <typename SourceOp>
+class ConvertOpInterfaceToLLVMPattern : public ConvertToLLVMPattern {
+public:
+  explicit ConvertOpInterfaceToLLVMPattern(
+      const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1)
+      : ConvertToLLVMPattern(typeConverter, Pattern::MatchInterfaceOpTypeTag(),
+                             SourceOp::getInterfaceID(), benefit,
+                             &typeConverter.getContext()) {}
+
+  /// Wrappers around the RewritePattern methods that pass the derived op type.
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    return matchAndRewrite(cast<SourceOp>(op), operands, rewriter);
+  }
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    return matchAndRewrite(cast<SourceOp>(op), operands, rewriter);
+  }
+
+  /// Methods that operate on the SourceOp type. One of these must be
+  /// overridden by the derived pattern class.
+  virtual LogicalResult
+  matchAndRewrite(SourceOp op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const {
+    llvm_unreachable("matchAndRewrite is not implemented");
+  }
+  virtual LogicalResult
+  matchAndRewrite(SourceOp op, ArrayRef<ValueRange> operands,
+                  ConversionPatternRewriter &rewriter) const {
+    return matchAndRewrite(op, getOneToOneAdaptorOperands(operands), rewriter);
+  }
+
+private:
+  using ConvertToLLVMPattern::matchAndRewrite;
+};
+
 /// Generic implementation of one-to-one conversion from "SourceOp" to
 /// "TargetOp" where the latter belongs to the LLVM dialect or an equivalent.
 /// Upholds a convention that multi-result operations get converted into an
diff --git a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
index 37aebc9fab3eb..06e5f7c2196d2 100644
--- a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp
@@ -24,27 +24,18 @@ namespace {
 /// Generic one-to-one conversion of simply mappable operations into calls
 /// to their respective LLVM intrinsics.
 struct AMXIntrinsicOpConversion
-    : public OpInterfaceConversionPattern<amx::AMXIntrinsicOp> {
-  using OpInterfaceConversionPattern<
-      amx::AMXIntrinsicOp>::OpInterfaceConversionPattern;
-
-  AMXIntrinsicOpConversion(const LLVMTypeConverter &typeConverter,
-                           PatternBenefit benefit = 1)
-      : OpInterfaceConversionPattern(typeConverter, &typeConverter.getContext(),
-                                     benefit),
-        typeConverter(typeConverter) {}
+    : public ConvertOpInterfaceToLLVMPattern<amx::AMXIntrinsicOp> {
+  using ConvertOpInterfaceToLLVMPattern::ConvertOpInterfaceToLLVMPattern;
 
   LogicalResult
   matchAndRewrite(amx::AMXIntrinsicOp op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override {
+    const LLVMTypeConverter &typeConverter = *getTypeConverter();
     return LLVM::detail::intrinsicRewrite(
         op, rewriter.getStringAttr(op.getIntrinsicName()),
         op.getIntrinsicOperands(operands, typeConverter, rewriter),
         typeConverter, rewriter);
   }
-
-private:
-  const LLVMTypeConverter &typeConverter;
 };
 
 } // namespace
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
index b2fc2f3f40e8c..8e062488f58c8 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
+++ b/mlir/lib/Dialect/X86Vector/Transforms/LegalizeForLLVMExport.cpp
@@ -23,27 +23,18 @@ namespace {
 /// Generic one-to-one conversion of simply mappable operations into calls
 /// to their respective LLVM intrinsics.
 struct X86IntrinsicOpConversion
-    : public OpInterfaceConversionPattern<x86vector::X86IntrinsicOp> {
-  using OpInterfaceConversionPattern<
-      x86vector::X86IntrinsicOp>::OpInterfaceConversionPattern;
-
-  X86IntrinsicOpConversion(const LLVMTypeConverter &typeConverter,
-                           PatternBenefit benefit = 1)
-      : OpInterfaceConversionPattern(typeConverter, &typeConverter.getContext(),
-                                     benefit),
-        typeConverter(typeConverter) {}
+    : public ConvertOpInterfaceToLLVMPattern<x86vector::X86IntrinsicOp> {
+  using ConvertOpInterfaceToLLVMPattern::ConvertOpInterfaceToLLVMPattern;
 
   LogicalResult
   matchAndRewrite(x86vector::X86IntrinsicOp op, ArrayRef<Value> operands,
                   ConversionPatternRewriter &rewriter) const override {
+    const LLVMTypeConverter &typeConverter = *getTypeConverter();
     return LLVM::detail::intrinsicRewrite(
         op, rewriter.getStringAttr(op.getIntrinsicName()),
         op.getIntrinsicOperands(operands, typeConverter, rewriter),
         typeConverter, rewriter);
   }
-
-private:
-  const LLVMTypeConverter &typeConverter;
 };
 
 } // namespace

From 483d19619c3221c1d54080e57e43052eb863436a Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Fri, 13 Jun 2025 14:26:50 +0800
Subject: [PATCH 329/851] [RISCV] Add tune features for Andes 45 series cpus
 (#143899)

Add tune features TuneNoDefaultUnroll, TuneShortForwardBranchOpt and
TunePostRAScheduler for Andes 45 series cpus.
---
 llvm/lib/Target/RISCV/RISCVFeatures.td   |  3 +++
 llvm/lib/Target/RISCV/RISCVProcessors.td | 19 ++++++++++++++-----
 llvm/lib/Target/RISCV/RISCVSubtarget.h   |  1 +
 llvm/test/CodeGen/RISCV/features-info.ll |  1 +
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 83eefc0858d4c..940caa4f40444 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1742,6 +1742,9 @@ def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
 def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron",
                                          "Ventana Veyron-Series processors">;
 
+def TuneAndes45 : SubtargetFeature<"andes45", "RISCVProcFamily", "Andes45",
+                                   "Andes 45-Series processors">;
+
 def TuneVXRMPipelineFlush : SubtargetFeature<"vxrm-pipeline-flush", "HasVXRMPipelineFlush",
                                              "true", "VXRM writes causes pipeline flush">;
 
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index de6f0ecfce737..32f4ab607a34c 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -722,8 +722,13 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
                                       FeatureStdExtZbc,
                                       FeatureVendorXAndesPerf]>;
 
+defvar Andes45TuneFeatures = [TuneAndes45,
+                              TuneNoDefaultUnroll,
+                              TuneShortForwardBranchOpt,
+                              TunePostRAScheduler];
+
 def ANDES_45 : RISCVTuneProcessorModel<"andes-45-series",
-                                       Andes45Model>;
+                                       Andes45Model, Andes45TuneFeatures>;
 
 def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                     Andes45Model,
@@ -737,7 +742,8 @@ def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                      FeatureStdExtD,
                                      FeatureStdExtC,
                                      FeatureStdExtB,
-                                     FeatureVendorXAndesPerf]>;
+                                     FeatureVendorXAndesPerf],
+                                    Andes45TuneFeatures>;
 
 def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                      Andes45Model,
@@ -751,7 +757,8 @@ def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                       FeatureStdExtD,
                                       FeatureStdExtC,
                                       FeatureStdExtB,
-                                      FeatureVendorXAndesPerf]>;
+                                      FeatureVendorXAndesPerf],
+                                     Andes45TuneFeatures>;
 
 def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                     Andes45Model,
@@ -765,7 +772,8 @@ def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                      FeatureStdExtD,
                                      FeatureStdExtC,
                                      FeatureStdExtB,
-                                     FeatureVendorXAndesPerf]>;
+                                     FeatureVendorXAndesPerf],
+                                    Andes45TuneFeatures>;
 
 def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                      Andes45Model,
@@ -779,4 +787,5 @@ def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                       FeatureStdExtD,
                                       FeatureStdExtC,
                                       FeatureStdExtB,
-                                      FeatureVendorXAndesPerf]>;
+                                      FeatureVendorXAndesPerf],
+                                     Andes45TuneFeatures>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 0eef7b1feaf5b..04c7ca7d0572b 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -83,6 +83,7 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     SiFive7,
     VentanaVeyron,
     MIPSP8700,
+    Andes45,
   };
   enum RISCVVRGatherCostModelEnum : uint8_t {
     Quadratic,
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index b7b27cd579fb3..fab2e94959301 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -6,6 +6,7 @@
 ; CHECK-NEXT:   32bit                            - Implements RV32.
 ; CHECK-NEXT:   64bit                            - Implements RV64.
 ; CHECK-NEXT:   a                                - 'A' (Atomic Instructions).
+; CHECK-NEXT:   andes45                          - Andes 45-Series processors.
 ; CHECK-NEXT:   auipc-addi-fusion                - Enable AUIPC+ADDI macrofusion.
 ; CHECK-NEXT:   b                                - 'B' (the collection of the Zba, Zbb, Zbs extensions).
 ; CHECK-NEXT:   c                                - 'C' (Compressed Instructions).

From 4903c11a7e144d63635b115d97936a7aecf7a2f6 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Fri, 13 Jun 2025 14:31:48 +0800
Subject: [PATCH 330/851] [RISCV] Support memcmp expansion for vectors

This patch adds the support of generating vector instructions for
`memcmp`. This implementation is inspired by X86's.

We convert integer comparisons (eq/ne only) into vector comparisons
and do a vector reduction and to get the result.

The range of supported load sizes is (XLEN, VLEN * LMUL8] and
non-power-of-2 types are not supported.

Fixes #143294.

Reviewers: lukel97, asb, preames, topperc, dtcxzyw

Reviewed By: topperc, lukel97

Pull Request: https://github.com/llvm/llvm-project/pull/114517
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |   69 +-
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   17 +
 llvm/test/CodeGen/RISCV/memcmp-optsize.ll     | 1384 +++++++++++++--
 llvm/test/CodeGen/RISCV/memcmp.ll             | 1556 +++++++++++++----
 4 files changed, 2474 insertions(+), 552 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index a157c94849f37..7839af5c16917 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16147,17 +16147,80 @@ static bool narrowIndex(SDValue &N, ISD::MemIndexType IndexType, SelectionDAG &D
   return true;
 }
 
+/// Try to map an integer comparison with size > XLEN to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue
+combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
+                                const SDLoc &DL, SelectionDAG &DAG,
+                                const RISCVSubtarget &Subtarget) {
+  assert(ISD::isIntEqualitySetCC(CC) && "Bad comparison predicate");
+
+  if (!Subtarget.hasVInstructions())
+    return SDValue();
+
+  MVT XLenVT = Subtarget.getXLenVT();
+  EVT OpVT = X.getValueType();
+  // We're looking for an oversized integer equality comparison.
+  if (!OpVT.isScalarInteger())
+    return SDValue();
+
+  unsigned OpSize = OpVT.getSizeInBits();
+  // TODO: Support non-power-of-2 types.
+  if (!isPowerOf2_32(OpSize))
+    return SDValue();
+
+  // The size should be larger than XLen and smaller than the maximum vector
+  // size.
+  if (OpSize <= Subtarget.getXLen() ||
+      OpSize > Subtarget.getRealMinVLen() *
+                   Subtarget.getMaxLMULForFixedLengthVectors())
+    return SDValue();
+
+  // Don't perform this combine if constructing the vector will be expensive.
+  auto IsVectorBitCastCheap = [](SDValue X) {
+    X = peekThroughBitcasts(X);
+    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+           X.getOpcode() == ISD::LOAD;
+  };
+  if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y))
+    return SDValue();
+
+  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat))
+    return SDValue();
+
+  unsigned VecSize = OpSize / 8;
+  EVT VecVT = MVT::getVectorVT(MVT::i8, VecSize);
+  EVT CmpVT = MVT::getVectorVT(MVT::i1, VecSize);
+
+  SDValue VecX = DAG.getBitcast(VecVT, X);
+  SDValue VecY = DAG.getBitcast(VecVT, Y);
+  SDValue Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
+  return DAG.getSetCC(DL, VT, DAG.getNode(ISD::VECREDUCE_OR, DL, XLenVT, Cmp),
+                      DAG.getConstant(0, DL, XLenVT), CC);
+}
+
 // Replace (seteq (i64 (and X, 0xffffffff)), C1) with
 // (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
 // bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
 // can become a sext.w instead of a shift pair.
 static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
+  SDLoc dl(N);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  // Looking for an equality compare.
+  if (!isIntEqualitySetCC(Cond))
+    return SDValue();
+
+  if (SDValue V =
+          combineVectorSizedSetCCEquality(VT, N0, N1, Cond, dl, DAG, Subtarget))
+    return V;
+
   if (OpVT != MVT::i64 || !Subtarget.is64Bit())
     return SDValue();
 
@@ -16172,11 +16235,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
       N0.getConstantOperandVal(1) != UINT64_C(0xffffffff))
     return SDValue();
 
-  // Looking for an equality compare.
-  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
-  if (!isIntEqualitySetCC(Cond))
-    return SDValue();
-
   // Don't do this if the sign bit is provably zero, it will be turned back into
   // an AND.
   APInt SignMask = APInt::getOneBitSet(64, 31);
@@ -16185,7 +16243,6 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
 
   const APInt &C1 = N1C->getAPIntValue();
 
-  SDLoc dl(N);
   // If the constant is larger than 2^32 - 1 it is impossible for both sides
   // to be equal.
   if (C1.getActiveBits() > 32)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index d5ea0c5d52293..bee47527cf428 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2952,5 +2952,22 @@ RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
     Options.LoadSizes = {4, 2, 1};
     Options.AllowedTailExpansions = {3};
   }
+
+  if (IsZeroCmp && ST->hasVInstructions()) {
+    unsigned RealMinVLen = ST->getRealMinVLen();
+    // Support Fractional LMULs if the lengths are larger than XLen.
+    // TODO: Support non-power-of-2 types.
+    for (unsigned FLMUL = 8; FLMUL >= 2; FLMUL /= 2) {
+      unsigned Len = RealMinVLen / FLMUL;
+      if (Len > ST->getXLen())
+        Options.LoadSizes.insert(Options.LoadSizes.begin(), Len / 8);
+    }
+    for (unsigned LMUL = 1; LMUL <= ST->getMaxLMULForFixedLengthVectors();
+         LMUL *= 2) {
+      unsigned Len = RealMinVLen * LMUL;
+      if (Len > ST->getXLen())
+        Options.LoadSizes.insert(Options.LoadSizes.begin(), Len / 8);
+    }
+  }
   return Options;
 }
diff --git a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
index 38cd51c074594..3742383675b96 100644
--- a/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp-optsize.ll
@@ -870,13 +870,11 @@ define i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_8:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
@@ -1073,18 +1071,18 @@ define i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 7(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 11(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 7(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 11(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
@@ -1284,33 +1282,21 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1319,15 +1305,15 @@ entry:
 }
 
 define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_31:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 31
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -1339,6 +1325,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
@@ -1349,6 +1345,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
@@ -1359,6 +1365,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_31:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
@@ -1369,6 +1385,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_31:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 31
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
@@ -1389,6 +1415,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_31:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 31
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
@@ -1409,6 +1445,16 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_31:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 31
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
@@ -1429,22 +1475,58 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_31:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 15(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 19(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 23(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 15(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 19(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 23(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 27(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, t3, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, t2, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t4, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a2, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a5, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a3, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a1, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_31:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 15(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 23(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 15(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 23(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
@@ -1454,15 +1536,15 @@ entry:
 }
 
 define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_32:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 32
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_32:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 32
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_32:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -1474,6 +1556,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_32:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 32
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_32:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
@@ -1484,6 +1576,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_32:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 32
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_32:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
@@ -1494,6 +1596,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_32:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 32
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_32:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
@@ -1504,6 +1616,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_32:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
@@ -1524,6 +1646,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_32:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
@@ -1544,6 +1676,16 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_32:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
@@ -1564,23 +1706,25 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_32:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v12
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1589,100 +1733,1020 @@ entry:
 }
 
 define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_63:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 63
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_63:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 63
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
-entry:
-  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 63)
-  ret i32 %bcmp
-}
-
-define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_64:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 64
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_64:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 64
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
-entry:
-  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 64)
-  ret i32 %bcmp
-}
-
-define i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_127:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 127
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_127:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 127
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
-entry:
-  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 127)
-  ret i32 %bcmp
-}
-
-define i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind optsize {
-; CHECK-RV32-LABEL: bcmp_size_128:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 128
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_128:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 128
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -48
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 28(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 28(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 47(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 51(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 55(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 59(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 31(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 35(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 39(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 43(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, t1, s2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 31(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 35(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 39(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 43(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, s1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 47(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 51(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 55(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 59(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, s6, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, s8, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, s4, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, s9, s2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s5, s1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, s7, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, s3, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, a6, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t0, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, a7, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a7, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t0, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a6, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 48
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 24(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 31(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 39(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 47(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 31(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 39(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 47(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 55(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, t3, t1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, t2, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t4, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a2, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a5, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a3, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a1, a4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
+entry:
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 63)
+  ret i32 %bcmp
+}
+
+define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind optsize {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v16
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
+entry:
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 64)
+  ret i32 %bcmp
+}
+
+define i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind optsize {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -96
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 84(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 80(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 76(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 72(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 68(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 64(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 60(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 56(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 52(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 48(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s11, 44(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 32(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 36(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 40(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 44(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 48(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 52(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 56(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 60(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 28(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 60(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 28(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 36(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 40(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 44(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 40(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a3, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 36(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a4, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 32(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 56(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 48(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a6, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 28(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a7, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 24(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t0, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 20(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t1, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 16(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 107(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t5, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 75(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, t6, s8
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 123(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s2, s2, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s0, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, t4, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 83(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 87(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 91(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s1, s1, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 107(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, t3, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 91(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, t2, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 123(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, a5, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 75(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s5, s11, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s7, s8, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 87(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 83(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s3, s10, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 115(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s6, ra, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 115(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s4, s4, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 119(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 119(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s10, s10, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 71(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 67(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 67(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 71(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 99(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 99(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, a5, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 103(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 103(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, a4, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, s9, s8
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, s11, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, ra, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a5, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 95(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 63(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 111(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 79(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 79(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 111(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 63(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 95(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s9, s9, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, s8, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t2, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t3, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t4, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, s1, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t5, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, s0, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, s2, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, t6, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t2, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t1, t2, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t2, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t3, t3, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 28(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t4, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 32(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t5, t5, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 36(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, t6, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or s0, s0, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, s0, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t5, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t3, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t1, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a6, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t4, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t0, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 84(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 80(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 76(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 72(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 68(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 64(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 60(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 56(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 48(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 44(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 96
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -96
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s0, 88(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s1, 80(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s2, 72(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s3, 64(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s4, 56(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s5, 48(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s6, 40(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s7, 32(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s8, 24(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s9, 16(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s10, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 32(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 40(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 56(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 32(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 40(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 56(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t6, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 8(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 24(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 95(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 103(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 111(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 119(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 63(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 71(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 79(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 87(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t1, t1, s2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 63(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 71(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 79(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 87(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, s1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 95(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 103(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 111(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 119(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t6
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, t2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, s10
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, s6, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t2, s8, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t4, s4, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t6, s9, s2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor s0, s5, s1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t5, s7, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t3, s3, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a6, a6, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or t0, t0, t6
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, a3, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a7, a7, t2
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a7, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, t0, a4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a6, a2
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 88(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 72(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 56(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 48(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 40(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 32(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 24(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 16(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 96
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
+entry:
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 127)
+  ret i32 %bcmp
+}
+
+define i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind optsize {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v24
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v24
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 128)
   ret i32 %bcmp
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
index df9d781a4536d..f9a6dbba04fc6 100644
--- a/llvm/test/CodeGen/RISCV/memcmp.ll
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -870,13 +870,11 @@ define i32 @bcmp_size_8(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_8:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
@@ -1073,18 +1071,18 @@ define i32 @bcmp_size_15(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 7(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 11(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 7(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 11(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
@@ -1284,33 +1282,21 @@ define i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_16:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1577,29 +1563,29 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 15(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 19(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 23(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 27(a0)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 15(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 19(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 23(a1)
 ; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 27(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t2, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t3, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, t3, t1
 ; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a6, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t0, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, t2, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t4, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a2, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a5, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a3, a6
 ; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a1, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
@@ -1607,18 +1593,18 @@ define i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 15(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 23(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 15(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 23(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
@@ -1878,57 +1864,23 @@ define i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind {
 ;
 ; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 0(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 4(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 8(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 12(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 16(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 28(a0)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 16(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 20(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 24(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 28(a1)
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, t2, t0
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t3, t1
-; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a5
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a6, a7
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t0, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
-; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_32:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a4, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 32
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v10, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v12, v8, v10
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v12
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -1937,15 +1889,15 @@ entry:
 }
 
 define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
-; CHECK-RV32-LABEL: bcmp_size_63:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 63
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_63:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -1957,6 +1909,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_63:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
@@ -1967,6 +1929,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_63:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
@@ -1977,6 +1949,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_63:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 63
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_63:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
@@ -1987,6 +1969,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
@@ -2023,6 +2015,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
@@ -2059,6 +2061,16 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 63
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
@@ -2095,6 +2107,98 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_63:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -48
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 44(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 40(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 36(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 32(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 28(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 24(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 20(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 16(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 8(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 4(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 28(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 28(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 47(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 51(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 55(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 59(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 31(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 35(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 39(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 43(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, t1, s2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a5, a5, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, a7, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 31(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 35(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 39(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 43(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a3, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, t0, s1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, a4, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 47(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 51(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 55(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 59(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, a6, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, a0, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, s6, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, s8, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, s4, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, s9, s2
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s5, s1
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, s7, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, s3, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, t3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, a6, t5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, s0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t0, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, a7, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a7, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t0, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a6, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 44(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 40(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 36(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 28(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 24(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 20(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 16(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 4(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 48
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_63:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
@@ -2105,29 +2209,29 @@ define i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 31(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 39(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 47(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 55(a0)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 31(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 39(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 47(a1)
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 55(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t2, t0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t3, t1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, t3, t1
 ; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a6, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t0, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, t2, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t4, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a2, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a5, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a3, a6
 ; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a1, a4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -2136,15 +2240,15 @@ entry:
 }
 
 define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
-; CHECK-RV32-LABEL: bcmp_size_64:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 64
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: bcmp_size_64:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
@@ -2156,6 +2260,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_64:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
@@ -2166,6 +2280,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_64:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
@@ -2176,6 +2300,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_64:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 64
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_64:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
@@ -2186,6 +2320,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_64:
 ; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
@@ -2222,6 +2366,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_64:
 ; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
@@ -2258,6 +2412,16 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_64:
 ; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
@@ -2294,39 +2458,25 @@ define i32 @bcmp_size_64(ptr %s1, ptr %s2) nounwind {
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
 ;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_64:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
 ; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_64:
 ; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 0(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 8(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 16(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 24(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, a6
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 32(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 40(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 48(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 56(a0)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 32(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 40(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 48(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 56(a1)
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, t2, t0
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t3, t1
-; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, a5
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a6, a7
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t0, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a4
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
-; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 64
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v12, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v16, v8, v12
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v16
 ; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
 ; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
@@ -2335,50 +2485,580 @@ entry:
 }
 
 define i32 @bcmp_size_127(ptr %s1, ptr %s2) nounwind {
-; CHECK-RV32-LABEL: bcmp_size_127:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 127
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_127:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 127
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_127:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 127
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 127
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, -96
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw ra, 92(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s0, 88(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s1, 84(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s2, 80(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s3, 76(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s4, 72(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s5, 68(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s6, 64(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s7, 60(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s8, 56(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s9, 52(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s10, 48(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw s11, 44(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 32(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 36(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 40(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 44(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 0(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t1, 4(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 8(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 12(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 48(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t0, 52(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 56(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 60(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 16(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 20(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 24(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 28(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 12(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 60(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 16(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 20(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 24(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 28(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 32(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 36(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 40(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 44(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 40(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a3, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 36(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a4, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 32(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 56(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 48(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a6, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 28(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 4(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a7, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 24(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t0, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 20(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 0(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 8(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t1, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 16(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 107(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, t5, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    sw a2, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 75(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t6, t6, s8
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 123(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s2, s2, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s0, s0, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t5, t4, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 83(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 87(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 91(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s1, s1, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 107(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t4, t3, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 91(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t3, t2, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 123(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t2, a5, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 75(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s5, s11, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s7, s8, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 87(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 83(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s3, s10, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 115(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s6, ra, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 115(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s4, s4, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 119(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a7, 119(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s10, s10, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 71(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 67(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 67(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a6, 71(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a4, 99(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a3, 99(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t1, a5, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 103(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 103(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor t0, a4, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a4, s9, s8
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a7, s11, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a6, ra, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a3, a5, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a5, 95(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 63(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 111(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a2, 79(a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 79(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 111(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a0, 63(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw a1, 95(a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a2, a2, s11
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor s9, s9, ra
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a0, s8, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    xor a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t2, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, t3, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t4, s9
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, s1, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, t5, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a6, s0, a6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a7, s2, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, t6, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t2, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 16(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t1, t2, t1
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t2, 20(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t2, s10
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t3, 24(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t3, t3, s4
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t4, 28(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t4, s6
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t5, 32(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t5, t5, s3
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw t6, 36(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, t6, s7
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    or s0, s0, s5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t6, s0, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t4, t5, t4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t2, t3, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or t0, t1, t0
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a4, a4, a7
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a6, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a2, a2, a5
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, t4, t6
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a5, t0, t2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a3, a3, a4
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a2
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a3
+; CHECK-UNALIGNED-RV32-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw ra, 92(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s0, 88(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s1, 84(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s2, 80(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s3, 76(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s4, 72(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s5, 68(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s6, 64(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s7, 60(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s8, 56(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s9, 52(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s10, 48(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    lw s11, 44(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-V-NEXT:    addi sp, sp, 96
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_127:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, -96
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s0, 88(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s1, 80(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s2, 72(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s3, 64(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s4, 56(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s5, 48(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s6, 40(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s7, 32(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s8, 24(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s9, 16(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    sd s10, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a2, 32(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a3, 40(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a4, 48(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a5, 56(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a6, 0(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a7, 8(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t0, 16(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t1, 24(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t2, 32(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 40(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 48(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 56(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t6, 0(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 8(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 16(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 24(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 95(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 103(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 111(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 119(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 63(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 71(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 79(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a0, 87(a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t1, t1, s2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a5, a5, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a7, a7, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t5, 63(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 71(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 79(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 87(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a3, a3, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t0, t0, s1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a4, a4, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t3, 95(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld t4, 103(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 111(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld a1, 119(a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a6, a6, t6
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a2, a2, t2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a0, a0, s10
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor a1, s6, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t2, s8, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t4, s4, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t6, s9, s2
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor s0, s5, s1
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t5, s7, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    xor t3, s3, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, t3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a6, a6, t5
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a4, a4, s0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or t0, t0, t6
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, a3, t4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a7, a7, t2
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a5, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, t1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a0, a1
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a1, a7, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a3, t0, a4
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a6, a2
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a1, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV64-V-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s0, 88(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s1, 80(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s2, 72(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s3, 64(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s4, 56(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s5, 48(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s6, 40(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s7, 32(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s8, 24(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s9, 16(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    ld s10, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-V-NEXT:    addi sp, sp, 96
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 127)
   ret i32 %bcmp
 }
 
 define i32 @bcmp_size_128(ptr %s1, ptr %s2) nounwind {
-; CHECK-RV32-LABEL: bcmp_size_128:
-; CHECK-RV32:       # %bb.0: # %entry
-; CHECK-RV32-NEXT:    addi sp, sp, -16
-; CHECK-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-RV32-NEXT:    li a2, 128
-; CHECK-RV32-NEXT:    call bcmp
-; CHECK-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
-; CHECK-RV32-NEXT:    addi sp, sp, 16
-; CHECK-RV32-NEXT:    ret
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
-; CHECK-RV64-LABEL: bcmp_size_128:
-; CHECK-RV64:       # %bb.0: # %entry
-; CHECK-RV64-NEXT:    addi sp, sp, -16
-; CHECK-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-RV64-NEXT:    li a2, 128
-; CHECK-RV64-NEXT:    call bcmp
-; CHECK-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-RV64-NEXT:    addi sp, sp, 16
-; CHECK-RV64-NEXT:    ret
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-ZBKB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-ZBKB-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-ALIGNED-RV32-V-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-V-LABEL: bcmp_size_128:
+; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 128
+; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    call bcmp
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v24
+; CHECK-UNALIGNED-RV32-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_size_128:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    li a2, 128
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v16, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v24, v8, v16
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v24
+; CHECK-UNALIGNED-RV64-V-NEXT:    snez a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
   %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 128)
   ret i32 %bcmp
@@ -2412,7 +3092,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-ALIGNED-RV32-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV32-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2423,7 +3103,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-ALIGNED-RV64-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV64-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -2434,7 +3114,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2445,7 +3125,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -2456,7 +3136,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2467,7 +3147,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
@@ -2478,7 +3158,7 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
-; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
@@ -2489,22 +3169,130 @@ define i1 @bcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
 ; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 4
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    call bcmp
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
 ; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
 ; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: bcmp_eq_zero:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a0, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lw a1, 0(a1)
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    seqz a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: bcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV64-V-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
-  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 4)
+  %bcmp = call signext i32 @bcmp(ptr %s1, ptr %s2, iXLen 16)
   %ret = icmp eq i32 %bcmp, 0
   ret i1 %ret
 }
@@ -5980,213 +6768,209 @@ entry:
 define i1 @memcmp_eq_zero(ptr %s1, ptr %s2) nounwind {
 ; CHECK-ALIGNED-RV32-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV32-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV32-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV32-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV64-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV64-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV32-ZBB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-ZBKB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-ZBKB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a2, 0(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a3, 1(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a1, 3(a1)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a5, 1(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a6, 2(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a7, 3(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lbu a0, 0(a0)
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a1, a4, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a2, a2, a3
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a3, a6, a7
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    packh a0, a0, a5
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a1, a2, a1
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    pack a0, a0, a3
-; CHECK-ALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV32-ZBKB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-ZBKB-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-ZBKB:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a2, 0(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 1(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a4, 2(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a1, 3(a1)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a5, 0(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a6, 1(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a2, a2, a3
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lbu a3, 2(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    packh a5, a5, a6
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a4, a4, 16
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a1, a1, 24
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a4
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a3
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a1, a1, a2
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-ZBKB-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-ZBKB-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV32-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV32-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 3(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    lbu a0, 3(a0)
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV32-V-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV32-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV32-V-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV32-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV32-V-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-V-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-V-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV32-V-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV32-V-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-V-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV32-V-NEXT:    ret
 ;
 ; CHECK-ALIGNED-RV64-V-LABEL: memcmp_eq_zero:
 ; CHECK-ALIGNED-RV64-V:       # %bb.0: # %entry
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a3, 2(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a4, 3(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a1, 0(a1)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a3, a3, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a4, a4, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a2, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    or a3, a4, a3
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a2, 1(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a4, 0(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lbu a5, 2(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    lb a0, 3(a0)
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a2, a2, 8
-; CHECK-ALIGNED-RV64-V-NEXT:    or a2, a2, a4
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a5, a5, 16
-; CHECK-ALIGNED-RV64-V-NEXT:    slli a0, a0, 24
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a5
-; CHECK-ALIGNED-RV64-V-NEXT:    or a1, a3, a1
-; CHECK-ALIGNED-RV64-V-NEXT:    or a0, a0, a2
-; CHECK-ALIGNED-RV64-V-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV64-V-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; CHECK-ALIGNED-RV64-V-NEXT:    li a2, 16
+; CHECK-ALIGNED-RV64-V-NEXT:    call memcmp
 ; CHECK-ALIGNED-RV64-V-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-V-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; CHECK-ALIGNED-RV64-V-NEXT:    addi sp, sp, 16
 ; CHECK-ALIGNED-RV64-V-NEXT:    ret
 ;
-; CHECK-UNALIGNED-LABEL: memcmp_eq_zero:
-; CHECK-UNALIGNED:       # %bb.0: # %entry
-; CHECK-UNALIGNED-NEXT:    lw a0, 0(a0)
-; CHECK-UNALIGNED-NEXT:    lw a1, 0(a1)
-; CHECK-UNALIGNED-NEXT:    xor a0, a0, a1
-; CHECK-UNALIGNED-NEXT:    seqz a0, a0
-; CHECK-UNALIGNED-NEXT:    ret
+; CHECK-UNALIGNED-RV32-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBB-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBB-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-ZBB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-ZBB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-ZBKB-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a3, 4(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a4, 8(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a0, 12(a0)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a5, 0(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a6, 4(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    lw a1, 12(a1)
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a2, a2, a5
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a3, a3, a6
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a4, a4, a7
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a2, a2, a3
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a4, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-ZBKB-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-ZBKB:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a0, 8(a0)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ld a1, 8(a1)
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-ZBKB-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-V-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV32-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV32-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV32-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV32-V-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-V-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-V-LABEL: memcmp_eq_zero:
+; CHECK-UNALIGNED-RV64-V:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-V-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v8, (a0)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vle8.v v9, (a1)
+; CHECK-UNALIGNED-RV64-V-NEXT:    vmsne.vv v8, v8, v9
+; CHECK-UNALIGNED-RV64-V-NEXT:    vcpop.m a0, v8
+; CHECK-UNALIGNED-RV64-V-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-V-NEXT:    ret
 entry:
-  %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 4)
+  %memcmp = call signext i32 @memcmp(ptr %s1, ptr %s2, iXLen 16)
   %ret = icmp eq i32 %memcmp, 0
   ret i1 %ret
 }

From 43be31e35ab0985ec381041762586902c2718751 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 00:12:07 -0700
Subject: [PATCH 331/851] SPARC: Simplify SparcMCExpr

Reduce direct uses of SparcMCExpr, facilitating transition to
MCSpecifierExpr in the future.
---
 .../Target/Sparc/AsmParser/SparcAsmParser.cpp | 14 ++++++------
 .../Sparc/MCTargetDesc/SparcMCAsmInfo.cpp     |  6 ++---
 .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp |  6 ++---
 .../Target/Sparc/MCTargetDesc/SparcMCExpr.cpp | 22 +++++++++----------
 .../Target/Sparc/MCTargetDesc/SparcMCExpr.h   | 22 +++++++++----------
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp     |  8 +++----
 .../Target/Sparc/SparcTargetObjectFile.cpp    |  3 +--
 7 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 3deeac38e5604..187ecbaad4bb2 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -848,14 +848,14 @@ bool SparcAsmParser::expandSETX(MCInst &Inst, SMLoc IDLoc,
   // sethi %hh(val), tmp
   Instructions.push_back(MCInstBuilder(SP::SETHIi)
                              .addReg(MCTmpOp.getReg())
-                             .addExpr(SparcMCExpr::create(
-                                 ELF::R_SPARC_HH22, ValExpr, getContext())));
+                             .addExpr(Sparc::createSpecifierExpr(
+                                 getContext(), ValExpr, ELF::R_SPARC_HH22)));
   // or    tmp, %hm(val), tmp
   Instructions.push_back(MCInstBuilder(SP::ORri)
                              .addReg(MCTmpOp.getReg())
                              .addReg(MCTmpOp.getReg())
-                             .addExpr(SparcMCExpr::create(
-                                 ELF::R_SPARC_HM10, ValExpr, getContext())));
+                             .addExpr(Sparc::createSpecifierExpr(
+                                 getContext(), ValExpr, ELF::R_SPARC_HM10)));
   // sllx  tmp, 32, tmp
   Instructions.push_back(MCInstBuilder(SP::SLLXri)
                              .addReg(MCTmpOp.getReg())
@@ -1165,7 +1165,7 @@ ParseStatus SparcAsmParser::parseTailRelocSym(OperandVector &Operands) {
     return Error(getLoc(), "expected valid identifier for operand modifier");
 
   StringRef Name = getParser().getTok().getIdentifier();
-  uint16_t RelType = SparcMCExpr::parseSpecifier(Name);
+  uint16_t RelType = Sparc::parseSpecifier(Name);
   if (RelType == 0)
     return Error(getLoc(), "invalid relocation specifier");
 
@@ -1689,7 +1689,7 @@ const SparcMCExpr *SparcAsmParser::adjustPICRelocation(uint16_t RelType,
     }
   }
 
-  return SparcMCExpr::create(RelType, subExpr, getContext());
+  return Sparc::createSpecifierExpr(getContext(), subExpr, RelType);
 }
 
 bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
@@ -1700,7 +1700,7 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
 
   StringRef name = Tok.getString();
 
-  auto VK = SparcMCExpr::parseSpecifier(name);
+  auto VK = Sparc::parseSpecifier(name);
   switch (VK) {
   case 0:
     Error(getLoc(), "invalid relocation specifier");
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 8de9a789a63bf..3049072b001cb 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -50,8 +50,7 @@ SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
                                                MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return SparcMCExpr::create(ELF::R_SPARC_DISP32,
-                               MCSymbolRefExpr::create(Sym, Ctx), Ctx);
+    return Sparc::createSpecifierExpr(Ctx, Sym, ELF::R_SPARC_DISP32);
   }
 
   return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer);
@@ -63,8 +62,7 @@ SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
                                        MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return SparcMCExpr::create(ELF::R_SPARC_DISP32,
-                               MCSymbolRefExpr::create(Sym, Ctx), Ctx);
+    return Sparc::createSpecifierExpr(Ctx, Sym, ELF::R_SPARC_DISP32);
   }
   return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
 }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 2c8dbaa5aba60..4ce9bea5d7958 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -135,7 +135,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
   assert(MO.isExpr());
   const MCExpr *Expr = MO.getExpr();
   if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
-    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getFixupKind()));
+    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
 
@@ -165,7 +165,7 @@ unsigned SparcMCCodeEmitter::getSImm5OpValue(const MCInst &MI, unsigned OpNo,
     return CE->getValue();
 
   if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
-    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getFixupKind()));
+    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
   Fixups.push_back(MCFixup::create(0, Expr, ELF::R_SPARC_5));
@@ -191,7 +191,7 @@ SparcMCCodeEmitter::getSImm13OpValue(const MCInst &MI, unsigned OpNo,
     return CE->getValue();
 
   if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
-    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getFixupKind()));
+    Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
   Fixups.push_back(MCFixup::create(0, Expr, Sparc::fixup_sparc_13));
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 09e55a66fcc6b..2e03e47399864 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -22,13 +22,18 @@ using namespace llvm;
 
 #define DEBUG_TYPE "sparcmcexpr"
 
-const SparcMCExpr *SparcMCExpr::create(uint16_t S, const MCExpr *Expr,
-                                       MCContext &Ctx) {
-  return new (Ctx) SparcMCExpr(S, Expr);
+const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
+                                              const MCExpr *Expr, uint16_t S) {
+  return new (Ctx) SparcMCExpr(Expr, S);
+}
+
+const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
+                                              const MCSymbol *Sym, uint16_t S) {
+  return new (Ctx) SparcMCExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
 }
 
 void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  StringRef S = getSpecifierName(specifier);
+  StringRef S = Sparc::getSpecifierName(specifier);
   if (!S.empty())
     OS << '%' << S << '(';
   getSubExpr()->print(OS, MAI);
@@ -36,7 +41,7 @@ void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
     OS << ')';
 }
 
-StringRef SparcMCExpr::getSpecifierName(uint16_t S) {
+StringRef Sparc::getSpecifierName(uint16_t S) {
   // clang-format off
   switch (uint16_t(S)) {
   case 0:                          return {};
@@ -83,7 +88,7 @@ StringRef SparcMCExpr::getSpecifierName(uint16_t S) {
   llvm_unreachable("Unhandled SparcMCExpr::Specifier");
 }
 
-uint16_t SparcMCExpr::parseSpecifier(StringRef name) {
+uint16_t Sparc::parseSpecifier(StringRef name) {
   return StringSwitch<uint16_t>(name)
       .Case("lo", ELF::R_SPARC_LO10)
       .Case("hi", ELF::R_SPARC_HI22)
@@ -128,8 +133,3 @@ uint16_t SparcMCExpr::parseSpecifier(StringRef name) {
       .Case("gdop", ELF::R_SPARC_GOTDATA_OP)
       .Default(0);
 }
-
-uint16_t SparcMCExpr::getFixupKind() const {
-  assert(uint16_t(specifier) < FirstTargetFixupKind);
-  return specifier;
-}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 8368e8ff8795b..612b439bfc740 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -21,21 +21,21 @@ namespace llvm {
 
 class StringRef;
 class SparcMCExpr : public MCSpecifierExpr {
-private:
-  explicit SparcMCExpr(uint16_t S, const MCExpr *Expr)
-      : MCSpecifierExpr(Expr, S) {}
-
 public:
-  static const SparcMCExpr *create(uint16_t S, const MCExpr *Expr,
-                                   MCContext &Ctx);
-  uint16_t getFixupKind() const;
-
+  explicit SparcMCExpr(const MCExpr *Expr, uint16_t S)
+      : MCSpecifierExpr(Expr, S) {}
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-
-  static uint16_t parseSpecifier(StringRef name);
-  static StringRef getSpecifierName(uint16_t S);
 };
 
+namespace Sparc {
+const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCExpr *Expr,
+                                       uint16_t S);
+const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCSymbol *Sym,
+                                       uint16_t S);
+uint16_t parseSpecifier(StringRef name);
+StringRef getSpecifierName(uint16_t S);
+} // namespace Sparc
+
 } // end namespace llvm.
 
 #endif
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index a30cf5a661bb3..ffefdf97edab1 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -82,7 +82,7 @@ class SparcAsmPrinter : public AsmPrinter {
 static MCOperand createSparcMCOperand(uint16_t Kind, MCSymbol *Sym,
                                       MCContext &OutContext) {
   const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
-  const SparcMCExpr *expr = SparcMCExpr::create(Kind, MCSym, OutContext);
+  const SparcMCExpr *expr = Sparc::createSpecifierExpr(OutContext, MCSym, Kind);
   return MCOperand::createExpr(expr);
 }
 static MCOperand createPCXCallOP(MCSymbol *Label,
@@ -101,7 +101,7 @@ static MCOperand createPCXRelExprOp(uint16_t Spec, MCSymbol *GOTLabel,
 
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext);
   const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext);
-  const SparcMCExpr *expr = SparcMCExpr::create(Spec, Add, OutContext);
+  const SparcMCExpr *expr = Sparc::createSpecifierExpr(OutContext, Add, Spec);
   return MCOperand::createExpr(expr);
 }
 
@@ -302,7 +302,7 @@ MCOperand SparcAsmPrinter::lowerOperand(const MachineOperand &MO) const {
 
     const MCExpr *expr = MCSymbolRefExpr::create(Symbol, OutContext);
     if (RelType)
-      expr = SparcMCExpr::create(RelType, expr, OutContext);
+      expr = Sparc::createSpecifierExpr(OutContext, expr, RelType);
     return MCOperand::createExpr(expr);
   }
 
@@ -374,7 +374,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   const MachineOperand &MO = MI->getOperand (opNum);
   auto TF = MO.getTargetFlags();
 
-  StringRef Spec = SparcMCExpr::getSpecifierName(TF);
+  StringRef Spec = Sparc::getSpecifierName(TF);
   O << Spec;
   switch (MO.getType()) {
   case MachineOperand::MO_Register:
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index 668e6eab4e1bc..be11ea272ed1f 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -39,8 +39,7 @@ const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
     }
 
     MCContext &Ctx = getContext();
-    return SparcMCExpr::create(ELF::R_SPARC_DISP32,
-                               MCSymbolRefExpr::create(SSym, Ctx), Ctx);
+    return Sparc::createSpecifierExpr(Ctx, SSym, ELF::R_SPARC_DISP32);
   }
 
   return TargetLoweringObjectFileELF::getTTypeGlobalReference(GV, Encoding, TM,

From 1fae5918b3d6fbed8ce6d8a2edf31bdf304ca8db Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Fri, 13 Jun 2025 00:45:52 -0700
Subject: [PATCH 332/851] [clang-format] Fix an off-by-1 bug with -length
 option (#143302)

Also validate the argument value.

Fixes #56245
---
 clang/test/Format/multiple-inputs-error.cpp |  2 +-
 clang/test/Format/ranges.cpp                | 11 ++++++++++-
 clang/tools/clang-format/ClangFormat.cpp    | 10 +++++++---
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/clang/test/Format/multiple-inputs-error.cpp b/clang/test/Format/multiple-inputs-error.cpp
index 1aa9c9f3e2fad..7cb835d39f23e 100644
--- a/clang/test/Format/multiple-inputs-error.cpp
+++ b/clang/test/Format/multiple-inputs-error.cpp
@@ -1,6 +1,6 @@
 // RUN: cp %s %t-1.cpp
 // RUN: cp %s %t-2.cpp
-// RUN: not clang-format 2>&1 >/dev/null -offset=1 -length=0 %t-1.cpp %t-2.cpp |FileCheck %s
+// RUN: not clang-format 2>&1 >/dev/null -offset=1 -length=1 %t-1.cpp %t-2.cpp |FileCheck %s
 // RUN: not clang-format 2>&1 >/dev/null -lines=1:1 %t-1.cpp %t-2.cpp |FileCheck %s -check-prefix=CHECK-LINE
 // CHECK: error: -offset, -length and -lines can only be used for single file.
 // CHECK-LINE: error: -offset, -length and -lines can only be used for single file.
diff --git a/clang/test/Format/ranges.cpp b/clang/test/Format/ranges.cpp
index 66b984e037b3c..f42492e43f84b 100644
--- a/clang/test/Format/ranges.cpp
+++ b/clang/test/Format/ranges.cpp
@@ -1,5 +1,5 @@
 // RUN: grep -Ev "// *[A-Z-]+:" %s \
-// RUN:   | clang-format -style=LLVM -offset=2 -length=0 -offset=28 -length=0 \
+// RUN:   | clang-format -style=LLVM -offset=2 -length=1 -offset=28 -length=1 -offset=35 -length=8 \
 // RUN:   | FileCheck -strict-whitespace %s
 // CHECK: {{^int\ \*i;$}}
 int*i;
@@ -9,3 +9,12 @@ int  *  i;
 
 // CHECK: {{^int\ \*i;$}}
 int   *   i;
+
+// CHECK: int I;
+// CHECK-NEXT: int J ;
+int I ;
+int J ;
+
+// RUN: not clang-format -length=0 %s 2>&1 \
+// RUN:   | FileCheck -strict-whitespace -check-prefix=CHECK0 %s
+// CHECK0: error: length should be at least 1
diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index 24ad3cb42254d..c0efbb7588ccb 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -284,7 +284,7 @@ static bool fillRanges(MemoryBuffer *Code,
   if (Offsets.size() == 1 && EmptyLengths) {
     Length = Sources.getFileOffset(Sources.getLocForEndOfFile(ID)) - Offsets[0];
   } else if (Offsets.size() != Lengths.size()) {
-    errs() << "error: number of -offset and -length arguments must match.\n";
+    errs() << "error: number of -offset and -length arguments must match\n";
     return true;
   }
   for (unsigned I = 0, E = Offsets.size(), CodeSize = Code->getBufferSize();
@@ -296,12 +296,16 @@ static bool fillRanges(MemoryBuffer *Code,
     }
     if (!EmptyLengths)
       Length = Lengths[I];
+    if (Length == 0) {
+      errs() << "error: length should be at least 1\n";
+      return true;
+    }
     if (Offset + Length > CodeSize) {
       errs() << "error: invalid length " << Length << ", offset + length ("
-             << Offset + Length << ") is outside the file.\n";
+             << Offset + Length << ") is outside the file\n";
       return true;
     }
-    Ranges.push_back(tooling::Range(Offset, Length));
+    Ranges.push_back(tooling::Range(Offset, Length - 1));
   }
   return false;
 }

From 1f4b1729851bcada646be75c2bc90e0d012525dd Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 13 Jun 2025 16:46:20 +0900
Subject: [PATCH 333/851] GVN: Fix trying to inspect uselist of constants when
 emitting remark (#144009)

---
 llvm/lib/Transforms/Scalar/GVN.cpp            | 75 +++++++++++--------
 ...opt-remark-assert-constant-uselistorder.ll | 26 +++++++
 2 files changed, 69 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/Transforms/GVN/opt-remark-assert-constant-uselistorder.ll

diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index a0eed31fde792..c8a0479358eab 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1165,7 +1165,7 @@ static bool isLifetimeStart(const Instruction *Inst) {
 /// Assuming To can be reached from both From and Between, does Between lie on
 /// every path from From to To?
 static bool liesBetween(const Instruction *From, Instruction *Between,
-                        const Instruction *To, DominatorTree *DT) {
+                        const Instruction *To, const DominatorTree *DT) {
   if (From->getParent() == Between->getParent())
     return DT->dominates(From, Between);
   SmallSet<BasicBlock *, 1> Exclusion;
@@ -1173,20 +1173,15 @@ static bool liesBetween(const Instruction *From, Instruction *Between,
   return !isPotentiallyReachable(From, To, &Exclusion, DT);
 }
 
-/// Try to locate the three instruction involved in a missed
-/// load-elimination case that is due to an intervening store.
-static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo,
-                                   DominatorTree *DT,
-                                   OptimizationRemarkEmitter *ORE) {
-  using namespace ore;
+static const Instruction *findMayClobberedPtrAccess(LoadInst *Load,
+                                                    const DominatorTree *DT) {
+  Value *PtrOp = Load->getPointerOperand();
+  if (!PtrOp->hasUseList())
+    return nullptr;
 
   Instruction *OtherAccess = nullptr;
 
-  OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", Load);
-  R << "load of type " << NV("Type", Load->getType()) << " not eliminated"
-    << setExtraArgs();
-
-  for (auto *U : Load->getPointerOperand()->users()) {
+  for (auto *U : PtrOp->users()) {
     if (U != Load && (isa<LoadInst>(U) || isa<StoreInst>(U))) {
       auto *I = cast<Instruction>(U);
       if (I->getFunction() == Load->getFunction() && DT->dominates(I, Load)) {
@@ -1202,32 +1197,48 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo,
     }
   }
 
-  if (!OtherAccess) {
-    // There is no dominating use, check if we can find a closest non-dominating
-    // use that lies between any other potentially available use and Load.
-    for (auto *U : Load->getPointerOperand()->users()) {
-      if (U != Load && (isa<LoadInst>(U) || isa<StoreInst>(U))) {
-        auto *I = cast<Instruction>(U);
-        if (I->getFunction() == Load->getFunction() &&
-            isPotentiallyReachable(I, Load, nullptr, DT)) {
-          if (OtherAccess) {
-            if (liesBetween(OtherAccess, I, Load, DT)) {
-              OtherAccess = I;
-            } else if (!liesBetween(I, OtherAccess, Load, DT)) {
-              // These uses are both partially available at Load were it not for
-              // the clobber, but neither lies strictly after the other.
-              OtherAccess = nullptr;
-              break;
-            } // else: keep current OtherAccess since it lies between U and
-              // Load.
-          } else {
+  if (OtherAccess)
+    return OtherAccess;
+
+  // There is no dominating use, check if we can find a closest non-dominating
+  // use that lies between any other potentially available use and Load.
+  for (auto *U : PtrOp->users()) {
+    if (U != Load && (isa<LoadInst>(U) || isa<StoreInst>(U))) {
+      auto *I = cast<Instruction>(U);
+      if (I->getFunction() == Load->getFunction() &&
+          isPotentiallyReachable(I, Load, nullptr, DT)) {
+        if (OtherAccess) {
+          if (liesBetween(OtherAccess, I, Load, DT)) {
             OtherAccess = I;
-          }
+          } else if (!liesBetween(I, OtherAccess, Load, DT)) {
+            // These uses are both partially available at Load were it not for
+            // the clobber, but neither lies strictly after the other.
+            OtherAccess = nullptr;
+            break;
+          } // else: keep current OtherAccess since it lies between U and
+          // Load.
+        } else {
+          OtherAccess = I;
         }
       }
     }
   }
 
+  return OtherAccess;
+}
+
+/// Try to locate the three instruction involved in a missed
+/// load-elimination case that is due to an intervening store.
+static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo,
+                                   const DominatorTree *DT,
+                                   OptimizationRemarkEmitter *ORE) {
+  using namespace ore;
+
+  OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", Load);
+  R << "load of type " << NV("Type", Load->getType()) << " not eliminated"
+    << setExtraArgs();
+
+  const Instruction *OtherAccess = findMayClobberedPtrAccess(Load, DT);
   if (OtherAccess)
     R << " in favor of " << NV("OtherAccess", OtherAccess);
 
diff --git a/llvm/test/Transforms/GVN/opt-remark-assert-constant-uselistorder.ll b/llvm/test/Transforms/GVN/opt-remark-assert-constant-uselistorder.ll
new file mode 100644
index 0000000000000..e793728815a88
--- /dev/null
+++ b/llvm/test/Transforms/GVN/opt-remark-assert-constant-uselistorder.ll
@@ -0,0 +1,26 @@
+; RUN: opt -passes='gvn' -pass-remarks-output=%t.yaml %s
+; RUN: FileCheck %s < %t.yaml
+
+; Check that there's no assert from trying to the uses of the constant
+; null.
+
+; CHECK: --- !Missed
+; CHECK-NEXT: Pass:            gvn
+; CHECK-NEXT: Name:            LoadClobbered
+; CHECK-NEXT: Function:        c
+; CHECK-NEXT: Args:
+; CHECK-NEXT:   - String:          'load of type '
+; CHECK-NEXT:   - Type:            i64
+; CHECK-NEXT:   - String:          ' not eliminated'
+; CHECK-NEXT:   - String:          ' because it is clobbered by '
+; CHECK-NEXT:   - ClobberedBy:     store
+; CHECK-NEXT: ...
+define void @c(ptr addrspace(21) %a) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %load = load i64, ptr addrspace(21) null, align 1
+  store i64 %load, ptr addrspace(21) %a, align 1
+  br label %for.cond
+}

From 02f1f6967a847bba35fc207d61732f3466f39403 Mon Sep 17 00:00:00 2001
From: Longsheng Mou <longshengmou@gmail.com>
Date: Fri, 13 Jun 2025 15:49:54 +0800
Subject: [PATCH 334/851] [mlir][linalg] Add pure tensor check for
 `winogradConv2DHelper` (#142299)

This PR adds pure tensor semantics check for `winogradConv2DHelper` to
prevent a crash. Fixes #141566.
---
 .../Dialect/Linalg/Transforms/WinogradConv2D.cpp |  4 ++++
 .../Linalg/transform-winograd-conv2d.mlir        | 16 ++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
index c6ebd3a53d981..e4221d4748415 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
@@ -904,6 +904,10 @@ static bool hasAllOneValues(DenseIntElementsAttr attr) {
 static FailureOr<Operation *>
 winogradConv2DHelper(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp,
                      int64_t m, int64_t r) {
+  if (!convOp.hasPureTensorSemantics())
+    return rewriter.notifyMatchFailure(
+        convOp, "expected pure tensor semantics for linalg.conv_2d_nhwc_fhwc");
+
   Value input = convOp.getInputs()[0];
   Value filter = convOp.getInputs()[1];
   Value output = convOp.getOutputs()[0];
diff --git a/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir b/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir
index c10e0ccebfd7c..1de861e653005 100644
--- a/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir
+++ b/mlir/test/Dialect/Linalg/transform-winograd-conv2d.mlir
@@ -61,6 +61,22 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+func.func @conv2d_unsupported_type(%arg0: memref<2x10x10x5xf32>, %arg1: memref<2x3x3x5xf32>, %arg2: memref<2x8x8x2xf32>) {
+  linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : memref<2x10x10x5xf32>, memref<2x3x3x5xf32>) outs(%arg2 : memref<2x8x8x2xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_fhwc"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @+1 {{apply Winograd Conv2D failed}}
+    %1 = transform.structured.winograd_conv2d %0 { m = 4, r = 3 } : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  }
+}
+
+// -----
+
 func.func @conv2d(%arg0: tensor<2x?x?x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<1xf32>, %arg3: tensor<2x?x?x2xf32>) -> tensor<2x?x?x2xf32> {
   %0 = linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<2x?x?x5xf32>, tensor<2x3x3x5xf32>) outs(%arg3 : tensor<2x?x?x2xf32>) -> tensor<2x?x?x2xf32>
   return %0 : tensor<2x?x?x2xf32>

From cd3d234868cad8b42e2a09a570e3e229d5ecfb08 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 08:52:48 +0100
Subject: [PATCH 335/851] [X86] X86FixupInstTuning - extend BLENDPD/S ->
 MOVSD/S handling to SSE variant (#143961)

---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    |  10 +-
 llvm/test/CodeGen/X86/combine-and.ll          |   2 +-
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   |  26 +---
 llvm/test/CodeGen/X86/insertelement-zero.ll   |   4 +-
 llvm/test/CodeGen/X86/masked_expandload.ll    |   2 +-
 llvm/test/CodeGen/X86/masked_load.ll          |   4 +-
 .../CodeGen/X86/sse-insertelt-from-mem.ll     |  16 +-
 llvm/test/CodeGen/X86/sse-insertelt.ll        |  13 +-
 llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll  | 144 ++++++------------
 llvm/test/CodeGen/X86/sse41.ll                |   6 +-
 llvm/test/CodeGen/X86/vec_floor.ll            |  32 ++--
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll |   2 +-
 .../X86/vector-shuffle-combining-ssse3.ll     |  13 +-
 llvm/test/CodeGen/X86/vector-zmov.ll          |  32 ++--
 llvm/test/CodeGen/X86/vselect.ll              |  26 +---
 15 files changed, 120 insertions(+), 212 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index fd13305d8a73d..be0a8c23ea5c4 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -234,10 +234,16 @@ bool X86FixupInstTuningPass::processInstruction(
   };
 
   switch (Opc) {
-  case X86::VBLENDPSrri:
-    return ProcessBLENDToMOV(X86::VMOVSSrr);
+  case X86::BLENDPDrri:
+    return ProcessBLENDToMOV(X86::MOVSDrr);
   case X86::VBLENDPDrri:
     return ProcessBLENDToMOV(X86::VMOVSDrr);
+
+  case X86::BLENDPSrri:
+    return ProcessBLENDToMOV(X86::MOVSSrr);
+  case X86::VBLENDPSrri:
+    return ProcessBLENDToMOV(X86::VMOVSSrr);
+
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
   case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index 173457ff46677..9ca4ebfec2774 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -189,7 +189,7 @@ define <4 x i32> @test11(<4 x i32> %A) {
 ; SSE-LABEL: test11:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test11:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 2b5f09113ca68..2f2a05fa6939b 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -108,15 +108,10 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test5:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test5:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test5:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test5:
 ; AVX1:       # %bb.0:
@@ -283,15 +278,10 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test12:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test12:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test12:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: test12:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 31551360be483..6036eddb0ca84 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -214,7 +214,7 @@ define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
 ; SSE41-LABEL: insert_v8f32_z12345z7:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
 ; SSE41-NEXT:    retq
 ;
@@ -287,7 +287,7 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
 ; SSE41-LABEL: insert_v8i32_z12345z7:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
 ; SSE41-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index b7fe8e053fa15..e81a983c07018 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -1097,7 +1097,7 @@ define <2 x float> @expandload_v2f32_v2i1(ptr %base, <2 x float> %src0, <2 x i32
 ; SSE42-NEXT:    retq
 ; SSE42-NEXT:  LBB4_1: ## %cond.load
 ; SSE42-NEXT:    movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE42-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE42-NEXT:    addq $4, %rdi
 ; SSE42-NEXT:    testb $2, %al
 ; SSE42-NEXT:    je LBB4_4
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index e2e26da95b874..37ab4276fbcca 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -817,7 +817,7 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, ptr %addr, <2 x float>
 ; SSE42-NEXT:    retq
 ; SSE42-NEXT:  LBB7_1: ## %cond.load
 ; SSE42-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE42-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE42-NEXT:    testb $2, %al
 ; SSE42-NEXT:    je LBB7_4
 ; SSE42-NEXT:  LBB7_3: ## %cond.load1
@@ -1220,7 +1220,7 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, ptr %addr) {
 ; SSE42-NEXT:    je LBB10_10
 ; SSE42-NEXT:  LBB10_9: ## %cond.load10
 ; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE42-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE42-NEXT:    testb $32, %al
 ; SSE42-NEXT:    je LBB10_12
 ; SSE42-NEXT:  LBB10_11: ## %cond.load13
diff --git a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
index 5ae9055835716..1c3cfd079e9e9 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
@@ -7,17 +7,11 @@
 ; 0'th element insertion into an SSE register.
 
 define <4 x float> @insert_f32_firstelt(<4 x float> %x, ptr %s.addr) {
-; SSE2-LABEL: insert_f32_firstelt:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: insert_f32_firstelt:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: insert_f32_firstelt:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_f32_firstelt:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index 1e4fe81abc136..f174eaaca38c2 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -7,15 +7,10 @@
 ; 0'th element insertion into an SSE register.
 
 define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
-; SSE2-LABEL: insert_f32_firstelt:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: insert_f32_firstelt:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: insert_f32_firstelt:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_f32_firstelt:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 006c3006350cc..12bfb8d4fc9cf 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE41,X86-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE41
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE41,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512
 
@@ -1150,17 +1150,11 @@ define <4 x float> @insert_test5_add_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: insert_test5_sub_ss:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    subps %xmm0, %xmm1
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE41-LABEL: insert_test5_sub_ss:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    subps %xmm0, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    ret{{[l|q]}}
+; SSE-LABEL: insert_test5_sub_ss:
+; SSE:       # %bb.0:
+; SSE-NEXT:    subps %xmm0, %xmm1
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX-LABEL: insert_test5_sub_ss:
 ; AVX:       # %bb.0:
@@ -1188,17 +1182,11 @@ define <4 x float> @insert_test5_mul_ss(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: insert_test5_div_ss:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    divps %xmm0, %xmm1
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE41-LABEL: insert_test5_div_ss:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    divps %xmm0, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    ret{{[l|q]}}
+; SSE-LABEL: insert_test5_div_ss:
+; SSE:       # %bb.0:
+; SSE-NEXT:    divps %xmm0, %xmm1
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX-LABEL: insert_test5_div_ss:
 ; AVX:       # %bb.0:
@@ -1226,17 +1214,11 @@ define <2 x double> @insert_test5_add_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: insert_test5_sub_sd:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    subpd %xmm0, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE41-LABEL: insert_test5_sub_sd:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    subpd %xmm0, %xmm1
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE41-NEXT:    ret{{[l|q]}}
+; SSE-LABEL: insert_test5_sub_sd:
+; SSE:       # %bb.0:
+; SSE-NEXT:    subpd %xmm0, %xmm1
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX-LABEL: insert_test5_sub_sd:
 ; AVX:       # %bb.0:
@@ -1264,17 +1246,11 @@ define <2 x double> @insert_test5_mul_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: insert_test5_div_sd:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    divpd %xmm0, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    ret{{[l|q]}}
-;
-; SSE41-LABEL: insert_test5_div_sd:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    divpd %xmm0, %xmm1
-; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE41-NEXT:    ret{{[l|q]}}
+; SSE-LABEL: insert_test5_div_sd:
+; SSE:       # %bb.0:
+; SSE-NEXT:    divpd %xmm0, %xmm1
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    ret{{[l|q]}}
 ;
 ; AVX-LABEL: insert_test5_div_sd:
 ; AVX:       # %bb.0:
@@ -1287,29 +1263,17 @@ define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
-; X86-SSE2-LABEL: add_ss_mask:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    jne .LBB70_1
-; X86-SSE2-NEXT:  # %bb.2:
-; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X86-SSE2-NEXT:    retl
-; X86-SSE2-NEXT:  .LBB70_1:
-; X86-SSE2-NEXT:    addss %xmm0, %xmm1
-; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: add_ss_mask:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE41-NEXT:    jne .LBB70_1
-; X86-SSE41-NEXT:  # %bb.2:
-; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X86-SSE41-NEXT:    retl
-; X86-SSE41-NEXT:  .LBB70_1:
-; X86-SSE41-NEXT:    addss %xmm0, %xmm1
-; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X86-SSE41-NEXT:    retl
+; X86-SSE-LABEL: add_ss_mask:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    jne .LBB70_1
+; X86-SSE-NEXT:  # %bb.2:
+; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X86-SSE-NEXT:    retl
+; X86-SSE-NEXT:  .LBB70_1:
+; X86-SSE-NEXT:    addss %xmm0, %xmm1
+; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: add_ss_mask:
 ; X86-AVX1:       # %bb.0:
@@ -1329,29 +1293,17 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 ; X86-AVX512-NEXT:    vmovaps %xmm2, %xmm0
 ; X86-AVX512-NEXT:    retl
 ;
-; X64-SSE2-LABEL: add_ss_mask:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    testb $1, %dil
-; X64-SSE2-NEXT:    jne .LBB70_1
-; X64-SSE2-NEXT:  # %bb.2:
-; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X64-SSE2-NEXT:    retq
-; X64-SSE2-NEXT:  .LBB70_1:
-; X64-SSE2-NEXT:    addss %xmm0, %xmm1
-; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: add_ss_mask:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    testb $1, %dil
-; X64-SSE41-NEXT:    jne .LBB70_1
-; X64-SSE41-NEXT:  # %bb.2:
-; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; X64-SSE41-NEXT:    retq
-; X64-SSE41-NEXT:  .LBB70_1:
-; X64-SSE41-NEXT:    addss %xmm0, %xmm1
-; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; X64-SSE41-NEXT:    retq
+; X64-SSE-LABEL: add_ss_mask:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    testb $1, %dil
+; X64-SSE-NEXT:    jne .LBB70_1
+; X64-SSE-NEXT:  # %bb.2:
+; X64-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; X64-SSE-NEXT:    retq
+; X64-SSE-NEXT:  .LBB70_1:
+; X64-SSE-NEXT:    addss %xmm0, %xmm1
+; X64-SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: add_ss_mask:
 ; X64-AVX1:       # %bb.0:
@@ -1402,7 +1354,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X86-SSE41-NEXT:    retl
 ; X86-SSE41-NEXT:  .LBB71_1:
 ; X86-SSE41-NEXT:    addsd %xmm0, %xmm1
-; X86-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X86-SSE41-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: add_sd_mask:
@@ -1444,7 +1396,7 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X64-SSE41-NEXT:    retq
 ; X64-SSE41-NEXT:  .LBB71_1:
 ; X64-SSE41-NEXT:    addsd %xmm0, %xmm1
-; X64-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; X64-SSE41-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: add_sd_mask:
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 53a10ab0c26ff..4f5b7ee0eaea0 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -345,7 +345,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ; X86-SSE:       ## %bb.0:
 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT:    ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
-; X86-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; X86-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
 ; X86-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-SSE-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -367,7 +367,7 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 ;
 ; X64-SSE-LABEL: blendps_not_insertps_1:
 ; X64-SSE:       ## %bb.0:
-; X64-SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; X64-SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
 ; X64-SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-SSE-NEXT:    retq ## encoding: [0xc3]
 ;
@@ -434,7 +434,7 @@ define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize noun
 define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
 ; SSE-LABEL: blendps_not_insertps_2:
 ; SSE:       ## %bb.0:
-; SSE-NEXT:    blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
+; SSE-NEXT:    movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
 ; SSE-NEXT:    ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 ;
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 0538cac12cbf7..1007969b6c6d1 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -1361,7 +1361,7 @@ define <4 x float> @floor_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB52_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1402,7 +1402,7 @@ define <4 x float> @floor_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwi
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB53_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1441,7 +1441,7 @@ define <2 x double> @floor_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB54_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1482,7 +1482,7 @@ define <2 x double> @floor_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) nou
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB55_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1521,7 +1521,7 @@ define <4 x float> @floor_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x flo
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB56_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1562,7 +1562,7 @@ define <4 x float> @floor_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
 ; SSE41-NEXT:  LBB57_1:
 ; SSE41-NEXT:    roundss $9, %xmm0, %xmm0
 ; SSE41-NEXT:  LBB57_3:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1602,7 +1602,7 @@ define <2 x double> @floor_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $9, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB58_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1643,7 +1643,7 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16
 ; SSE41-NEXT:  LBB59_1:
 ; SSE41-NEXT:    roundsd $9, %xmm0, %xmm0
 ; SSE41-NEXT:  LBB59_3:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2351,7 +2351,7 @@ define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w,
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB78_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2392,7 +2392,7 @@ define <4 x float> @ceil_maskz_ss(<4 x float> %x, <4 x float> %y, i8 %k) nounwin
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB79_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2431,7 +2431,7 @@ define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double>
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB80_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2472,7 +2472,7 @@ define <2 x double> @ceil_maskz_sd(<2 x double> %x, <2 x double> %y, i8 %k) noun
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB81_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2511,7 +2511,7 @@ define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x floa
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundss $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB82_2:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2552,7 +2552,7 @@ define <4 x float> @ceil_maskz_ss_trunc(<4 x float> %x, <4 x float> %y, i16 %k)
 ; SSE41-NEXT:  LBB83_1:
 ; SSE41-NEXT:    roundss $10, %xmm0, %xmm0
 ; SSE41-NEXT:  LBB83_3:
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2592,7 +2592,7 @@ define <2 x double> @ceil_mask_sd_trunc(<2 x double> %x, <2 x double> %y, <2 x d
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
 ; SSE41-NEXT:    roundsd $10, %xmm0, %xmm2
 ; SSE41-NEXT:  LBB84_2:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -2633,7 +2633,7 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %
 ; SSE41-NEXT:  LBB85_1:
 ; SSE41-NEXT:    roundsd $10, %xmm0, %xmm0
 ; SSE41-NEXT:  LBB85_3:
-; SSE41-NEXT:    blendpd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 8679c262e0bf0..2d3dc4c593c11 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -871,7 +871,7 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
 ; SSE41-LABEL: shuffle_v2i64_bitcast_z123:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_bitcast_z123:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 0570e2f580c1b..002a3b77dc353 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -63,15 +63,10 @@ define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1)
 }
 
 define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
-; SSSE3-LABEL: combine_pshufb_as_movss:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: combine_pshufb_as_movss:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: combine_pshufb_as_movss:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_pshufb_as_movss:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-zmov.ll b/llvm/test/CodeGen/X86/vector-zmov.ll
index 2f84723b3c081..9d84ff8c01ab4 100644
--- a/llvm/test/CodeGen/X86/vector-zmov.ll
+++ b/llvm/test/CodeGen/X86/vector-zmov.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
 
@@ -38,26 +38,12 @@ entry:
 }
 
 define <4 x i32> @load_zmov_4i32_to_0zzz_volatile(ptr%ptr) {
-; SSE2-LABEL: load_zmov_4i32_to_0zzz_volatile:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movaps (%rdi), %xmm1
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: load_zmov_4i32_to_0zzz_volatile:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movaps (%rdi), %xmm1
-; SSSE3-NEXT:    xorps %xmm0, %xmm0
-; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: load_zmov_4i32_to_0zzz_volatile:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movaps (%rdi), %xmm1
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: load_zmov_4i32_to_0zzz_volatile:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movaps (%rdi), %xmm1
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_zmov_4i32_to_0zzz_volatile:
 ; AVX:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 9851fe64847de..18a060ad910b7 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -301,15 +301,10 @@ define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: test18:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test18:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test18:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test18:
 ; AVX:       # %bb.0:
@@ -320,15 +315,10 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test19:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test19:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test19:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test19:
 ; AVX:       # %bb.0:

From 02b6ed0bf139518c704a2996418e66f3a93260a1 Mon Sep 17 00:00:00 2001
From: Shamshura Egor <164661612+egorshamshura@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:53:15 +0300
Subject: [PATCH 336/851] [Clang] Added explanation why `is_constructible`
 evaluated to false.  (#143309)

Added explanation why a is constructible evaluated to false. Also fixed
problem with ```ExtractTypeTraitFromExpression```. In case
```std::is_xxx_v<>``` with variadic pack it tries to get template
argument, but fails in expression ```Arg.getAsType()``` due to
```Arg.getKind() == TemplateArgument::ArgKind::Pack```, but not
```TemplateArgument::ArgKind::Type```.
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  8 ++-
 clang/lib/Sema/SemaTypeTraits.cpp             | 71 ++++++++++++++++++-
 clang/test/CXX/drs/cwg18xx.cpp                |  3 +-
 ...overload-resolution-deferred-templates.cpp | 19 +++--
 .../type-traits-unsatisfied-diags-std.cpp     | 66 +++++++++++++++++
 .../SemaCXX/type-traits-unsatisfied-diags.cpp | 62 ++++++++++++++++
 6 files changed, 219 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 0f77083dac9df..a2cf84d024193 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1767,7 +1767,8 @@ def note_unsatisfied_trait
     : Note<"%0 is not %enum_select<TraitName>{"
            "%TriviallyRelocatable{trivially relocatable}|"
            "%Replaceable{replaceable}|"
-           "%TriviallyCopyable{trivially copyable}"
+           "%TriviallyCopyable{trivially copyable}|"
+           "%Constructible{constructible with provided types}"
            "}1">;
 
 def note_unsatisfied_trait_reason
@@ -1797,7 +1798,10 @@ def note_unsatisfied_trait_reason
            "%DeletedAssign{has a deleted %select{copy|move}1 "
            "assignment operator}|"
            "%UnionWithUserDeclaredSMF{is a union with a user-declared "
-           "%sub{select_special_member_kind}1}"
+           "%sub{select_special_member_kind}1}|"
+           "%FunctionType{is a function type}|"
+           "%CVVoidType{is a cv void type}|"
+           "%IncompleteArrayType{is an incomplete array type}"
            "}0">;
 
 def warn_consteval_if_always_true : Warning<
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index 1738ab4466001..22c690bedc1ed 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/DeclCXX.h"
+#include "clang/AST/TemplateBase.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/DiagnosticParse.h"
 #include "clang/Basic/DiagnosticSema.h"
@@ -1947,6 +1948,7 @@ static std::optional<TypeTrait> StdNameToTypeTrait(StringRef Name) {
             TypeTrait::UTT_IsCppTriviallyRelocatable)
       .Case("is_replaceable", TypeTrait::UTT_IsReplaceable)
       .Case("is_trivially_copyable", TypeTrait::UTT_IsTriviallyCopyable)
+      .Case("is_constructible", TypeTrait::TT_IsConstructible)
       .Default(std::nullopt);
 }
 
@@ -1983,8 +1985,16 @@ static ExtractedTypeTraitInfo ExtractTypeTraitFromExpression(const Expr *E) {
     Trait = StdNameToTypeTrait(Name);
     if (!Trait)
       return std::nullopt;
-    for (const auto &Arg : VD->getTemplateArgs().asArray())
-      Args.push_back(Arg.getAsType());
+    for (const auto &Arg : VD->getTemplateArgs().asArray()) {
+      if (Arg.getKind() == TemplateArgument::ArgKind::Pack) {
+        for (const auto &InnerArg : Arg.pack_elements())
+          Args.push_back(InnerArg.getAsType());
+      } else if (Arg.getKind() == TemplateArgument::ArgKind::Type) {
+        Args.push_back(Arg.getAsType());
+      } else {
+        llvm_unreachable("Unexpected kind");
+      }
+    }
     return {{Trait.value(), std::move(Args)}};
   }
 
@@ -2257,6 +2267,60 @@ static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
   }
 }
 
+static void DiagnoseNonConstructibleReason(
+    Sema &SemaRef, SourceLocation Loc,
+    const llvm::SmallVector<clang::QualType, 1> &Ts) {
+  if (Ts.empty()) {
+    return;
+  }
+
+  bool ContainsVoid = false;
+  for (const QualType &ArgTy : Ts) {
+    ContainsVoid |= ArgTy->isVoidType();
+  }
+
+  if (ContainsVoid)
+    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+        << diag::TraitNotSatisfiedReason::CVVoidType;
+
+  QualType T = Ts[0];
+  if (T->isFunctionType())
+    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+        << diag::TraitNotSatisfiedReason::FunctionType;
+
+  if (T->isIncompleteArrayType())
+    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
+        << diag::TraitNotSatisfiedReason::IncompleteArrayType;
+
+  const CXXRecordDecl *D = T->getAsCXXRecordDecl();
+  if (!D || D->isInvalidDecl() || !D->hasDefinition())
+    return;
+
+  llvm::BumpPtrAllocator OpaqueExprAllocator;
+  SmallVector<Expr *, 2> ArgExprs;
+  ArgExprs.reserve(Ts.size() - 1);
+  for (unsigned I = 1, N = Ts.size(); I != N; ++I) {
+    QualType ArgTy = Ts[I];
+    if (ArgTy->isObjectType() || ArgTy->isFunctionType())
+      ArgTy = SemaRef.Context.getRValueReferenceType(ArgTy);
+    ArgExprs.push_back(
+        new (OpaqueExprAllocator.Allocate<OpaqueValueExpr>())
+            OpaqueValueExpr(Loc, ArgTy.getNonLValueExprType(SemaRef.Context),
+                            Expr::getValueKindForType(ArgTy)));
+  }
+
+  EnterExpressionEvaluationContext Unevaluated(
+      SemaRef, Sema::ExpressionEvaluationContext::Unevaluated);
+  Sema::ContextRAII TUContext(SemaRef,
+                              SemaRef.Context.getTranslationUnitDecl());
+  InitializedEntity To(InitializedEntity::InitializeTemporary(T));
+  InitializationKind InitKind(InitializationKind::CreateDirect(Loc, Loc, Loc));
+  InitializationSequence Init(SemaRef, To, InitKind, ArgExprs);
+
+  Init.Diagnose(SemaRef, To, InitKind, ArgExprs);
+  SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D;
+}
+
 static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
                                                SourceLocation Loc, QualType T) {
   SemaRef.Diag(Loc, diag::note_unsatisfied_trait)
@@ -2296,6 +2360,9 @@ void Sema::DiagnoseTypeTraitDetails(const Expr *E) {
   case UTT_IsTriviallyCopyable:
     DiagnoseNonTriviallyCopyableReason(*this, E->getBeginLoc(), Args[0]);
     break;
+  case TT_IsConstructible:
+    DiagnoseNonConstructibleReason(*this, E->getBeginLoc(), Args);
+    break;
   default:
     break;
   }
diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 5b4551ba0143b..9948075852135 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -564,11 +564,12 @@ struct A {
 namespace ex2 {
 #if __cplusplus >= 201103L
 struct Bar {
-  struct Baz {
+  struct Baz { // #cwg1890-Baz
     int a = 0;
   };
   static_assert(__is_constructible(Baz), "");
   // since-cxx11-error@-1 {{static assertion failed due to requirement '__is_constructible(cwg1890::ex2::Bar::Baz)'}}
+  // since-cxx11-note@#cwg1890-Baz {{'Baz' defined here}}
 };
 #endif
 } // namespace ex2
diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
index 7cb71e075d50e..46c3670848529 100644
--- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
+++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
@@ -80,21 +80,30 @@ struct ImplicitlyCopyable {
 static_assert(__is_constructible(ImplicitlyCopyable, const ImplicitlyCopyable&));
 
 
-struct Movable {
+struct Movable { // #Movable
   template <typename T>
   requires __is_constructible(Movable, T) // #err-self-constraint-1
-  explicit Movable(T op) noexcept; // #1
-  Movable(Movable&&) noexcept = default; // #2
+  explicit Movable(T op) noexcept; // #Movable1
+  Movable(Movable&&) noexcept = default; // #Movable2
 };
 static_assert(__is_constructible(Movable, Movable&&));
 static_assert(__is_constructible(Movable, const Movable&));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}}
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}} \
+// expected-error@-1 {{call to implicitly-deleted copy constructor of 'Movable'}} \
+// expected-note@#Movable  {{'Movable' defined here}} \
+// expected-note@#Movable  {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const Movable' for 1st argument}} \
+// expected-note@#Movable2  {{copy constructor is implicitly deleted because 'Movable' has a user-declared move constructor}} \
+// expected-note@#Movable2  {{candidate constructor not viable: no known conversion from 'int' to 'Movable' for 1st argument}} \
+// expected-note@#Movable1  {{candidate template ignored: constraints not satisfied [with T = int]}}
+
 
 static_assert(__is_constructible(Movable, int));
-// expected-error@-1{{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \
+// expected-error@-1 {{no matching constructor for initialization of 'Movable'}} \
 // expected-note@-1 2{{}}
 // expected-error@#err-self-constraint-1{{satisfaction of constraint '__is_constructible(Movable, T)' depends on itself}}
 // expected-note@#err-self-constraint-1 4{{}}
+// expected-note@#Movable  {{'Movable' defined here}}
 
 template <typename T>
 struct Members {
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
index 329b611110c1d..a403a0450607a 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
@@ -20,6 +20,14 @@ struct is_trivially_copyable {
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T);
+
+template <typename... Args>
+struct is_constructible {
+    static constexpr bool value = __is_constructible(Args...);
+};
+
+template <typename... Args>
+constexpr bool is_constructible_v = __is_constructible(Args...);
 #endif
 
 #ifdef STD2
@@ -44,6 +52,17 @@ using is_trivially_copyable  = __details_is_trivially_copyable<T>;
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T);
+
+template <typename... Args>
+struct __details_is_constructible{
+    static constexpr bool value = __is_constructible(Args...);
+};
+
+template <typename... Args>
+using is_constructible  = __details_is_constructible<Args...>;
+
+template <typename... Args>
+constexpr bool is_constructible_v = __is_constructible(Args...);
 #endif
 
 
@@ -73,6 +92,15 @@ using is_trivially_copyable  = __details_is_trivially_copyable<T>;
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
+
+template <typename... Args>
+struct __details_is_constructible : bool_constant<__is_constructible(Args...)> {};
+
+template <typename... Args>
+using is_constructible  = __details_is_constructible<Args...>;
+
+template <typename... Args>
+constexpr bool is_constructible_v = is_constructible<Args...>::value;
 #endif
 
 }
@@ -100,6 +128,15 @@ static_assert(std::is_trivially_copyable_v<int&>);
 // expected-note@-1 {{because it is a reference type}}
 
 
+static_assert(std::is_constructible<int, int>::value);
+
+static_assert(std::is_constructible<void>::value);
+// expected-error-re@-1 {{static assertion failed due to requirement 'std::{{.*}}is_constructible<void>::value'}} \
+// expected-note@-1 {{because it is a cv void type}}
+static_assert(std::is_constructible_v<void>);
+// expected-error@-1 {{static assertion failed due to requirement 'std::is_constructible_v<void>'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
 namespace test_namespace {
     using namespace std;
     static_assert(is_trivially_relocatable<int&>::value);
@@ -119,6 +156,13 @@ namespace test_namespace {
     // expected-error@-1 {{static assertion failed due to requirement 'is_trivially_copyable_v<int &>'}} \
     // expected-note@-1 {{'int &' is not trivially copyable}} \
     // expected-note@-1 {{because it is a reference type}}
+
+    static_assert(is_constructible<void>::value);
+    // expected-error-re@-1 {{static assertion failed due to requirement '{{.*}}is_constructible<void>::value'}} \
+    // expected-note@-1 {{because it is a cv void type}}
+    static_assert(is_constructible_v<void>);
+    // expected-error@-1 {{static assertion failed due to requirement 'is_constructible_v<void>'}} \
+    // expected-note@-1 {{because it is a cv void type}}
 }
 
 
@@ -139,6 +183,15 @@ concept C2 = std::is_trivially_copyable_v<T>; // #concept4
 
 template <C2 T> void g2();  // #cand4
 
+template <typename... Args>
+requires std::is_constructible<Args...>::value void f3();  // #cand5
+
+template <typename... Args>
+concept C3 = std::is_constructible_v<Args...>; // #concept6
+
+template <C3 T> void g3();  // #cand6
+
+
 void test() {
     f<int&>();
     // expected-error@-1 {{no matching function for call to 'f'}} \
@@ -169,6 +222,19 @@ void test() {
     // expected-note@#concept4 {{because 'std::is_trivially_copyable_v<int &>' evaluated to false}} \
     // expected-note@#concept4 {{'int &' is not trivially copyable}} \
     // expected-note@#concept4 {{because it is a reference type}}
+
+    f3<void>();
+    // expected-error@-1 {{no matching function for call to 'f3'}} \
+    // expected-note@#cand5 {{candidate template ignored: constraints not satisfied [with Args = <void>]}} \
+    // expected-note-re@#cand5 {{because '{{.*}}is_constructible<void>::value' evaluated to false}} \
+    // expected-note@#cand5 {{because it is a cv void type}}
+
+    g3<void>();
+    // expected-error@-1 {{no matching function for call to 'g3'}} \
+    // expected-note@#cand6 {{candidate template ignored: constraints not satisfied [with T = void]}} \
+    // expected-note@#cand6 {{because 'void' does not satisfy 'C3'}} \
+    // expected-note@#concept6 {{because 'std::is_constructible_v<void>' evaluated to false}} \
+    // expected-note@#concept6 {{because it is a cv void type}}
 }
 }
 
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
index a8c78f6304ca9..d0b3f294fbcab 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
@@ -488,3 +488,65 @@ static_assert(__is_trivially_copyable(S12));
 // expected-note@-1 {{'S12' is not trivially copyable}} \
 // expected-note@#tc-S12 {{'S12' defined here}}
 }
+
+namespace constructible {
+
+struct S1 {  // #c-S1
+    S1(int); // #cc-S1
+};
+static_assert(__is_constructible(S1, char*));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S1, char *)'}} \
+// expected-error@-1 {{no matching constructor for initialization of 'S1'}} \
+// expected-note@#c-S1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'char *' to 'const S1' for 1st argument}} \
+// expected-note@#c-S1 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'char *' to 'S1' for 1st argument}} \
+// expected-note@#cc-S1 {{candidate constructor not viable: no known conversion from 'char *' to 'int' for 1st argument; dereference the argument with *}} \
+// expected-note@#c-S1 {{'S1' defined here}}
+
+struct S2 { // #c-S2
+    S2(int, float, double); // #cc-S2
+};
+static_assert(__is_constructible(S2, float));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S2, float)'}} \
+// expected-note@#c-S2 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'float' to 'const S2' for 1st argument}} \
+// expected-note@#c-S2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'float' to 'S2' for 1st argument}} \
+// expected-error@-1 {{no matching constructor for initialization of 'S2'}} \
+// expected-note@#cc-S2 {{candidate constructor not viable: requires 3 arguments, but 1 was provided}} \
+// expected-note@#c-S2 {{'S2' defined here}}
+
+static_assert(__is_constructible(S2, float, void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S2, float, void)'}} \
+// expected-note@#c-S2 {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} \
+// expected-note@#c-S2 {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}} \
+// expected-note@-1{{because it is a cv void type}} \
+// expected-error@-1 {{no matching constructor for initialization of 'S2'}} \
+// expected-note@#cc-S2 {{candidate constructor not viable: requires 3 arguments, but 2 were provided}} \
+// expected-note@#c-S2 {{'S2' defined here}}
+
+static_assert(__is_constructible(int[]));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(int[])'}} \
+// expected-note@-1 {{because it is an incomplete array type}}
+
+static_assert(__is_constructible(void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void)'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
+static_assert(__is_constructible(void, void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void, void)'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
+static_assert(__is_constructible(const void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(const void)'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
+static_assert(__is_constructible(volatile void));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(volatile void)'}} \
+// expected-note@-1 {{because it is a cv void type}}
+
+static_assert(__is_constructible(int ()));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(int ())'}} \
+// expected-note@-1 {{because it is a function type}}
+
+static_assert(__is_constructible(void (int, float)));
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void (int, float))'}} \
+// expected-note@-1 {{because it is a function type}}
+}

From c4caf00bfbf10caa88f1c46a561564b4f0f723af Mon Sep 17 00:00:00 2001
From: LU-JOHN <John.Lu@amd.com>
Date: Fri, 13 Jun 2025 03:03:06 -0500
Subject: [PATCH 337/851] [AMDGPU] Convert more 64-bit lshr to 32-bit if shift
 amt>=32 (#138204)

Convert vector 64-bit lshr to 32-bit if shift amt is known to be >= 32.
Also convert scalar 64-bit lshr to 32-bit if shift amt is variable but
known to be >=32.

---------

Signed-off-by: John Lu <John.Lu@amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 130 ++++++++++-----
 llvm/test/CodeGen/AMDGPU/mad_64_32.ll         |  77 ++++-----
 llvm/test/CodeGen/AMDGPU/srl64_reduce.ll      | 150 +++++++++---------
 3 files changed, 196 insertions(+), 161 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 5f41bd7d8a617..c51cc2a2fe529 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4097,7 +4097,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
+  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
 
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
@@ -4117,12 +4117,12 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
     ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                                TargetType);
   } else {
-    SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
+    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
     const SDValue ShiftMask =
         DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
     // This AND instruction will clamp out of bounds shift values.
     // It will also be removed during later instruction selection.
-    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
+    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
   }
 
   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
@@ -4181,50 +4181,105 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
 
 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
-  auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
-  if (!RHS)
-    return SDValue();
-
+  SDValue RHS = N->getOperand(1);
+  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
   EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
-  unsigned ShiftAmt = RHS->getZExtValue();
   SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
+  unsigned RHSVal;
+
+  if (CRHS) {
+    RHSVal = CRHS->getZExtValue();
 
-  // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
-  // this improves the ability to match BFE patterns in isel.
-  if (LHS.getOpcode() == ISD::AND) {
-    if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
-      unsigned MaskIdx, MaskLen;
-      if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
-          MaskIdx == ShiftAmt) {
-        return DAG.getNode(
-            ISD::AND, SL, VT,
-            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
-            DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
+    // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
+    // this improves the ability to match BFE patterns in isel.
+    if (LHS.getOpcode() == ISD::AND) {
+      if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
+        unsigned MaskIdx, MaskLen;
+        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
+            MaskIdx == RHSVal) {
+          return DAG.getNode(ISD::AND, SL, VT,
+                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
+                                         N->getOperand(1)),
+                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
+                                         N->getOperand(1)));
+        }
       }
     }
   }
 
-  if (VT != MVT::i64)
+  if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  if (ShiftAmt < 32)
+  // for C >= 32
+  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C -32), 0)
+
+  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+  // common case, splitting this into a move and a 32-bit shift is faster and
+  // the same code size.
+  KnownBits Known = DAG.computeKnownBits(RHS);
+
+  EVT ElementType = VT.getScalarType();
+  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
+  EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
+                                 : TargetScalarType;
+
+  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
     return SDValue();
 
-  // srl i64:x, C for C >= 32
-  // =>
-  //   build_pair (srl hi_32(x), C - 32), 0
-  SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  SDValue ShiftAmt;
+  if (CRHS) {
+    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
+                               TargetType);
+  } else {
+    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
+    const SDValue ShiftMask =
+        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
+    // This AND instruction will clamp out of bounds shift values.
+    // It will also be removed during later instruction selection.
+    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
+  }
+
+  const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
+  EVT ConcatType;
+  SDValue Hi;
+  SDLoc LHSSL(LHS);
+  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
+  if (VT.isVector()) {
+    unsigned NElts = TargetType.getVectorNumElements();
+    ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
+    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
+    SmallVector<SDValue, 8> HiOps(NElts);
+    SmallVector<SDValue, 16> HiAndLoOps;
 
-  SDValue Hi = getHiHalf64(LHS, DAG);
+    DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
+    for (unsigned I = 0; I != NElts; ++I)
+      HiOps[I] = HiAndLoOps[2 * I + 1];
+    Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
+  } else {
+    const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
+    ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
+    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
+    Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
+  }
 
-  SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
-  SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
+  SDValue NewShift = DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt);
 
-  SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
+  SDValue Vec;
+  if (VT.isVector()) {
+    unsigned NElts = TargetType.getVectorNumElements();
+    SmallVector<SDValue, 8> LoOps;
+    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
 
-  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
+    DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
+    for (unsigned I = 0; I != NElts; ++I)
+      HiAndLoOps[2 * I] = LoOps[I];
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
+  } else {
+    Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
+  }
+  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
 }
 
 SDValue AMDGPUTargetLowering::performTruncateCombine(
@@ -5209,21 +5264,18 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
 
     break;
   }
-  case ISD::SHL: {
+  case ISD::SHL:
+  case ISD::SRL: {
     // Range metadata can be invalidated when loads are converted to legal types
     // (e.g. v2i64 -> v4i32).
-    // Try to convert vector shl before type legalization so that range metadata
-    // can be utilized.
+    // Try to convert vector shl/srl before type legalization so that range
+    // metadata can be utilized.
     if (!(N->getValueType(0).isVector() &&
           DCI.getDAGCombineLevel() == BeforeLegalizeTypes) &&
         DCI.getDAGCombineLevel() < AfterLegalizeDAG)
       break;
-    return performShlCombine(N, DCI);
-  }
-  case ISD::SRL: {
-    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
-      break;
-
+    if (N->getOpcode() == ISD::SHL)
+      return performShlCombine(N, DCI);
     return performSrlCombine(N, DCI);
   }
   case ISD::SRA: {
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index bb642155cd0aa..117f359be0c3b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1945,16 +1945,14 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; CI-LABEL: lshr_mad_i64_vec:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v6, v3
-; CI-NEXT:    v_mov_b32_e32 v3, v1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_mov_b32 s4, 0xffff1c18
-; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
-; CI-NEXT:    v_mov_b32_e32 v3, v1
+; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
 ; CI-NEXT:    s_mov_b32 s4, 0xffff1118
-; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
+; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; CI-NEXT:    v_sub_i32_e32 v1, vcc, v5, v1
+; CI-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
 ; CI-NEXT:    v_mov_b32_e32 v0, v4
-; CI-NEXT:    v_mov_b32_e32 v1, v5
+; CI-NEXT:    v_mov_b32_e32 v2, v6
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-LABEL: lshr_mad_i64_vec:
@@ -1977,44 +1975,28 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; GFX9-LABEL: lshr_mad_i64_vec:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v6, v3
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff1c18
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, s4, v[0:1]
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffff1118
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v3, s4, v[2:3]
+; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
+; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1100-LABEL: lshr_mad_i64_vec:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v8, v3
-; GFX1100-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v6, v[0:1]
-; GFX1100-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v8, v[2:3]
-; GFX1100-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, v7
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1150-LABEL: lshr_mad_i64_vec:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1150-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
-; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1150-NEXT:    v_mad_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
-; GFX1150-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: lshr_mad_i64_vec:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX11-NEXT:    v_mov_b32_e32 v2, v6
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: lshr_mad_i64_vec:
 ; GFX12:       ; %bb.0:
@@ -2023,13 +2005,14 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_mov_b32_e32 v3, v1
-; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[4:5], null, 0xffff1c18, v1, v[0:1]
+; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, 0xffff1118, v3, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
+; GFX12-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
   %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>
diff --git a/llvm/test/CodeGen/AMDGPU/srl64_reduce.ll b/llvm/test/CodeGen/AMDGPU/srl64_reduce.ll
index 09538c624de72..3567bafe5b1ca 100644
--- a/llvm/test/CodeGen/AMDGPU/srl64_reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl64_reduce.ll
@@ -17,9 +17,9 @@ define i64 @srl_metadata(i64 %arg0, ptr %arg1.ptr) {
 ; CHECK-LABEL: srl_metadata:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v2, v[2:3]
+; CHECK-NEXT:    flat_load_dword v0, v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load i64, ptr %arg1.ptr, !range !0, !noundef !{}
@@ -30,9 +30,9 @@ define i64 @srl_metadata(i64 %arg0, ptr %arg1.ptr) {
 define amdgpu_ps i64 @srl_metadata_sgpr_return(i64 inreg %arg0, ptr addrspace(1) inreg %arg1.ptr) {
 ; CHECK-LABEL: srl_metadata_sgpr_return:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dword s2, s[2:3], 0x0
+; CHECK-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT:    s_lshr_b32 s0, s1, s0
 ; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
   %shift.amt = load i64, ptr addrspace(1) %arg1.ptr, !range !0, !noundef !{}
@@ -45,9 +45,9 @@ define i64 @srl_exact_metadata(i64 %arg0, ptr %arg1.ptr) {
 ; CHECK-LABEL: srl_exact_metadata:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v2, v[2:3]
+; CHECK-NEXT:    flat_load_dword v0, v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load i64, ptr %arg1.ptr, !range !0, !noundef !{}
@@ -59,9 +59,9 @@ define i64 @srl_metadata_two_ranges(i64 %arg0, ptr %arg1.ptr) {
 ; CHECK-LABEL: srl_metadata_two_ranges:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v2, v[2:3]
+; CHECK-NEXT:    flat_load_dword v0, v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load i64, ptr %arg1.ptr, !range !1, !noundef !{}
@@ -106,8 +106,10 @@ define <2 x i64> @srl_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v4, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
   %srl = lshr <2 x i64> %arg0, %shift.amt
@@ -121,8 +123,10 @@ define <2 x i64> @srl_exact_v2_metadata(<2 x i64> %arg0, ptr %arg1.ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v4, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load <2 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
   %srl = lshr exact <2 x i64> %arg0, %shift.amt
@@ -133,12 +137,15 @@ define <3 x i64> @srl_v3_metadata(<3 x i64> %arg0, ptr %arg1.ptr) {
 ; CHECK-LABEL: srl_v3_metadata:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_load_dword v12, v[6:7] offset:16
+; CHECK-NEXT:    flat_load_dword v0, v[6:7] offset:16
 ; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[6:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], v12, v[4:5]
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v8, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v10, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, v0, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v8, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v10, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load <3 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
   %srl = lshr <3 x i64> %arg0, %shift.amt
@@ -153,11 +160,15 @@ define <4 x i64> @srl_v4_metadata(<4 x i64> %arg0, ptr %arg1.ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    flat_load_dwordx4 v[13:16], v[8:9] offset:16
 ; CHECK-NEXT:    ; kill: killed $vgpr8 killed $vgpr9
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v10, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v12, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v10, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v12, v3
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], v13, v[4:5]
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], v15, v[6:7]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, v13, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v6, v15, v7
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %shift.amt = load <4 x i64>, ptr %arg1.ptr, !range !0, !noundef !{}
   %srl = lshr <4 x i64> %arg0, %shift.amt
@@ -337,8 +348,7 @@ define i64 @srl_or32(i64 %arg0, i64 %shift_amt) {
 ; CHECK-LABEL: srl_or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v2, 32, v2
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v2, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or i64 %shift_amt, 32
@@ -350,10 +360,10 @@ define <2 x i64> @srl_v2_or32(<2 x i64> %arg0, <2 x i64> %shift_amt) {
 ; CHECK-LABEL: srl_v2_or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v5, 32, v6
-; CHECK-NEXT:    v_or_b32_e32 v4, 32, v4
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v4, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v6, v3
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <2 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <2 x i64> %arg0, %or
@@ -364,12 +374,12 @@ define <3 x i64> @srl_v3_or32(<3 x i64> %arg0, <3 x i64> %shift_amt) {
 ; CHECK-LABEL: srl_v3_or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v7, 32, v10
-; CHECK-NEXT:    v_or_b32_e32 v8, 32, v8
-; CHECK-NEXT:    v_or_b32_e32 v6, 32, v6
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v6, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v8, v[2:3]
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], v7, v[4:5]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v6, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v8, v3
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, v10, v5
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <3 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <3 x i64> %arg0, %or
@@ -380,14 +390,14 @@ define <4 x i64> @srl_v4_or32(<4 x i64> %arg0, <4 x i64> %shift_amt) {
 ; CHECK-LABEL: srl_v4_or32:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v9, 32, v14
-; CHECK-NEXT:    v_or_b32_e32 v11, 32, v12
-; CHECK-NEXT:    v_or_b32_e32 v10, 32, v10
-; CHECK-NEXT:    v_or_b32_e32 v8, 32, v8
-; CHECK-NEXT:    v_lshrrev_b64 v[0:1], v8, v[0:1]
-; CHECK-NEXT:    v_lshrrev_b64 v[2:3], v10, v[2:3]
-; CHECK-NEXT:    v_lshrrev_b64 v[4:5], v11, v[4:5]
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], v9, v[6:7]
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, v8, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v2, v10, v3
+; CHECK-NEXT:    v_lshrrev_b32_e32 v4, v12, v5
+; CHECK-NEXT:    v_lshrrev_b32_e32 v6, v14, v7
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <4 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <4 x i64> %arg0, %or
@@ -400,8 +410,7 @@ define i64 @srl_or32_sgpr(i64 inreg %arg0, i64 inreg %shift_amt) {
 ; CHECK-LABEL: srl_or32_sgpr:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_or_b32 s4, s18, 32
-; CHECK-NEXT:    s_lshr_b64 s[4:5], s[16:17], s4
+; CHECK-NEXT:    s_lshr_b32 s4, s17, s18
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -413,8 +422,7 @@ define i64 @srl_or32_sgpr(i64 inreg %arg0, i64 inreg %shift_amt) {
 define amdgpu_ps i64 @srl_or32_sgpr_return(i64 inreg %arg0, i64 inreg %shift_amt) {
 ; CHECK-LABEL: srl_or32_sgpr_return:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_or_b32 s2, s2, 32
-; CHECK-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
+; CHECK-NEXT:    s_lshr_b32 s0, s1, s2
 ; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
   %or = or i64 %shift_amt, 32
@@ -426,14 +434,12 @@ define <2 x i64> @srl_v2_or32_sgpr(<2 x i64> inreg %arg0, <2 x i64> inreg %shift
 ; CHECK-LABEL: srl_v2_or32_sgpr:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_or_b32 s6, s22, 32
-; CHECK-NEXT:    s_or_b32 s4, s20, 32
-; CHECK-NEXT:    s_lshr_b64 s[4:5], s[16:17], s4
-; CHECK-NEXT:    s_lshr_b64 s[6:7], s[18:19], s6
+; CHECK-NEXT:    s_lshr_b32 s4, s17, s20
+; CHECK-NEXT:    s_lshr_b32 s5, s19, s22
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s5
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <2 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <2 x i64> %arg0, %or
@@ -444,18 +450,15 @@ define <3 x i64> @srl_v3_or32_sgpr(<3 x i64> inreg %arg0, <3 x i64> inreg %shift
 ; CHECK-LABEL: srl_v3_or32_sgpr:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_or_b32 s8, s26, 32
-; CHECK-NEXT:    s_or_b32 s6, s24, 32
-; CHECK-NEXT:    s_or_b32 s4, s22, 32
-; CHECK-NEXT:    s_lshr_b64 s[4:5], s[16:17], s4
-; CHECK-NEXT:    s_lshr_b64 s[6:7], s[18:19], s6
-; CHECK-NEXT:    s_lshr_b64 s[8:9], s[20:21], s8
+; CHECK-NEXT:    s_lshr_b32 s4, s17, s22
+; CHECK-NEXT:    s_lshr_b32 s5, s19, s24
+; CHECK-NEXT:    s_lshr_b32 s6, s21, s26
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s5
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
-; CHECK-NEXT:    v_mov_b32_e32 v4, s8
-; CHECK-NEXT:    v_mov_b32_e32 v5, s9
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, s6
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <3 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <3 x i64> %arg0, %or
@@ -466,20 +469,17 @@ define <4 x i64> @srl_v4_or32_sgpr(<4 x i64> inreg %arg0, <4 x i64> inreg %shift
 ; CHECK-LABEL: srl_v4_or32_sgpr:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v0, 32, v0
-; CHECK-NEXT:    s_or_b32 s8, s28, 32
-; CHECK-NEXT:    s_or_b32 s6, s26, 32
-; CHECK-NEXT:    s_or_b32 s4, s24, 32
-; CHECK-NEXT:    s_lshr_b64 s[4:5], s[16:17], s4
-; CHECK-NEXT:    s_lshr_b64 s[6:7], s[18:19], s6
-; CHECK-NEXT:    s_lshr_b64 s[8:9], s[20:21], s8
-; CHECK-NEXT:    v_lshrrev_b64 v[6:7], v0, s[22:23]
+; CHECK-NEXT:    s_lshr_b32 s4, s17, s24
+; CHECK-NEXT:    s_lshr_b32 s5, s19, s26
+; CHECK-NEXT:    s_lshr_b32 s6, s21, s28
+; CHECK-NEXT:    v_lshrrev_b32_e64 v6, v0, s23
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
-; CHECK-NEXT:    v_mov_b32_e32 v1, s5
-; CHECK-NEXT:    v_mov_b32_e32 v2, s6
-; CHECK-NEXT:    v_mov_b32_e32 v3, s7
-; CHECK-NEXT:    v_mov_b32_e32 v4, s8
-; CHECK-NEXT:    v_mov_b32_e32 v5, s9
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s5
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_mov_b32_e32 v4, s6
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_mov_b32_e32 v7, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %or = or <4 x i64> %shift_amt, splat (i64 32)
   %srl = lshr <4 x i64> %arg0, %or

From d4826cd324d9a10abdc67c973affa62d36dff4ee Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen@arm.com>
Date: Fri, 13 Jun 2025 09:07:09 +0100
Subject: [PATCH 338/851] [AArch64] Observe Z-reg inline asm clobbers without
 SVE (#143742)

inline asm that clobbers any of the z-registers when not in streaming
mode, should still observe that the lower 128 bits of those registers
are clobbered.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 20 +++-
 llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll | 91 ++++++++++++++++++-
 2 files changed, 102 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5b9e699eaa408..781a1281db402 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12281,13 +12281,14 @@ enum class PredicateConstraint { Uph, Upl, Upa };
 // not what we want. The code here pre-empts this by matching the register
 // explicitly.
 static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
-parsePredicateRegAsConstraint(StringRef Constraint) {
+parseSVERegAsConstraint(StringRef Constraint) {
   if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
-      Constraint[1] != 'p')
+      (Constraint[1] != 'p' && Constraint[1] != 'z'))
     return std::nullopt;
 
+  bool IsPredicate = Constraint[1] == 'p';
   Constraint = Constraint.substr(2, Constraint.size() - 3);
-  bool IsPredicateAsCount = Constraint.starts_with("n");
+  bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
   if (IsPredicateAsCount)
     Constraint = Constraint.drop_front(1);
 
@@ -12297,8 +12298,9 @@ parsePredicateRegAsConstraint(StringRef Constraint) {
 
   if (IsPredicateAsCount)
     return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
-  else
+  if (IsPredicate)
     return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
+  return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
 }
 
 static std::optional<PredicateConstraint>
@@ -12548,8 +12550,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
       break;
     }
   } else {
-    if (const auto P = parsePredicateRegAsConstraint(Constraint))
+    if (const auto P = parseSVERegAsConstraint(Constraint)) {
+      // SME functions that are not in streaming mode, should
+      // still observe clobbers of Z-registers by clobbering
+      // the lower 128bits of those registers.
+      if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
+          !Subtarget->isSVEorStreamingSVEAvailable())
+        return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
+                              &AArch64::FPR128RegClass);
       return *P;
+    }
     if (const auto PC = parsePredicateConstraint(Constraint))
       if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
         return std::make_pair(0U, RegClass);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
index 63cc061cb6188..b92a524036985 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sme2-asm.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sme2 -force-streaming -stop-after=finalize-isel | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-none-linux-gnu -stop-after=finalize-isel | FileCheck %s
 
-define void @UphPNR(target("aarch64.svcount") %predcnt) {
+define void @UphPNR(target("aarch64.svcount") %predcnt) "target-features"="+sme2" "aarch64_pstate_sm_enabled" {
 entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store (<vscale x 1 x s16>) into %ir.predcnt.addr)
@@ -14,7 +14,7 @@ entry:
   ret void
 }
 
-define void @UpaPNR(target("aarch64.svcount") %predcnt) {
+define void @UpaPNR(target("aarch64.svcount") %predcnt) "target-features"="+sme2" "aarch64_pstate_sm_enabled" {
 entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store (<vscale x 1 x s16>) into %ir.predcnt.addr)
@@ -28,7 +28,7 @@ entry:
   ret void
 }
 
-define void @UplPNR(target("aarch64.svcount") %predcnt) {
+define void @UplPNR(target("aarch64.svcount") %predcnt) "target-features"="+sme2" "aarch64_pstate_sm_enabled" {
 entry:
 ; CHECK:  %0:ppr = COPY $p0
 ; CHECK:  STR_PXI %0, %stack.0.predcnt.addr, 0 :: (store (<vscale x 1 x s16>) into %ir.predcnt.addr)
@@ -41,3 +41,86 @@ entry:
   call void asm sideeffect "fadd z0.h, $0/m, z0.h, #0.5", "@3Upl"(target("aarch64.svcount") %0)
   ret void
 }
+
+; Test that the z-register clobbers result in preserving %0 across the inline asm call.
+define <2 x float> @sme_nosve_nonstreaming(ptr %in) "target-features"="+sme,-sve" {
+entry:
+; CHECK-LABEL: name: sme_nosve_nonstreaming
+; CHECK:  INLINEASM &"smstart sm; smstop sm;"
+; CHECK-SAME: implicit-def early-clobber $q0
+; CHECK-SAME: implicit-def early-clobber $q1
+; CHECK-SAME: implicit-def early-clobber $q2
+; CHECK-SAME: implicit-def early-clobber $q3
+; CHECK-SAME: implicit-def early-clobber $q4
+; CHECK-SAME: implicit-def early-clobber $q5
+; CHECK-SAME: implicit-def early-clobber $q6
+; CHECK-SAME: implicit-def early-clobber $q7
+; CHECK-SAME: implicit-def early-clobber $q8
+; CHECK-SAME: implicit-def early-clobber $q9
+; CHECK-SAME: implicit-def early-clobber $q10
+; CHECK-SAME: implicit-def early-clobber $q11
+; CHECK-SAME: implicit-def early-clobber $q12
+; CHECK-SAME: implicit-def early-clobber $q13
+; CHECK-SAME: implicit-def early-clobber $q14
+; CHECK-SAME: implicit-def early-clobber $q15
+; CHECK-SAME: implicit-def early-clobber $q16
+; CHECK-SAME: implicit-def early-clobber $q17
+; CHECK-SAME: implicit-def early-clobber $q18
+; CHECK-SAME: implicit-def early-clobber $q19
+; CHECK-SAME: implicit-def early-clobber $q20
+; CHECK-SAME: implicit-def early-clobber $q21
+; CHECK-SAME: implicit-def early-clobber $q22
+; CHECK-SAME: implicit-def early-clobber $q23
+; CHECK-SAME: implicit-def early-clobber $q24
+; CHECK-SAME: implicit-def early-clobber $q25
+; CHECK-SAME: implicit-def early-clobber $q26
+; CHECK-SAME: implicit-def early-clobber $q27
+; CHECK-SAME: implicit-def early-clobber $q28
+; CHECK-SAME: implicit-def early-clobber $q29
+; CHECK-SAME: implicit-def early-clobber $q30
+; CHECK-SAME: implicit-def early-clobber $q31
+  %0 = load <2 x float>, ptr %in, align 8
+  call void asm sideeffect "smstart sm; smstop sm;", "~{z0},~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"()
+  ret <2 x float> %0
+}
+
+define <2 x float> @sme_nosve_streaming(ptr %in) "target-features"="+sme,-sve" "aarch64_pstate_sm_enabled" {
+entry:
+; CHECK-LABEL: name: sme_nosve_streaming
+; CHECK:  INLINEASM &"smstart sm; smstop sm;"
+; CHECK-SAME: implicit-def early-clobber $z0
+; CHECK-SAME: implicit-def early-clobber $z1
+; CHECK-SAME: implicit-def early-clobber $z2
+; CHECK-SAME: implicit-def early-clobber $z3
+; CHECK-SAME: implicit-def early-clobber $z4
+; CHECK-SAME: implicit-def early-clobber $z5
+; CHECK-SAME: implicit-def early-clobber $z6
+; CHECK-SAME: implicit-def early-clobber $z7
+; CHECK-SAME: implicit-def early-clobber $z8
+; CHECK-SAME: implicit-def early-clobber $z9
+; CHECK-SAME: implicit-def early-clobber $z10
+; CHECK-SAME: implicit-def early-clobber $z11
+; CHECK-SAME: implicit-def early-clobber $z12
+; CHECK-SAME: implicit-def early-clobber $z13
+; CHECK-SAME: implicit-def early-clobber $z14
+; CHECK-SAME: implicit-def early-clobber $z15
+; CHECK-SAME: implicit-def early-clobber $z16
+; CHECK-SAME: implicit-def early-clobber $z17
+; CHECK-SAME: implicit-def early-clobber $z18
+; CHECK-SAME: implicit-def early-clobber $z19
+; CHECK-SAME: implicit-def early-clobber $z20
+; CHECK-SAME: implicit-def early-clobber $z21
+; CHECK-SAME: implicit-def early-clobber $z22
+; CHECK-SAME: implicit-def early-clobber $z23
+; CHECK-SAME: implicit-def early-clobber $z24
+; CHECK-SAME: implicit-def early-clobber $z25
+; CHECK-SAME: implicit-def early-clobber $z26
+; CHECK-SAME: implicit-def early-clobber $z27
+; CHECK-SAME: implicit-def early-clobber $z28
+; CHECK-SAME: implicit-def early-clobber $z29
+; CHECK-SAME: implicit-def early-clobber $z30
+; CHECK-SAME: implicit-def early-clobber $z31
+  %0 = load <2 x float>, ptr %in, align 8
+  call void asm sideeffect "smstart sm; smstop sm;", "~{z0},~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"()
+  ret <2 x float> %0
+}

From 0cf333878d310bf9bbc8156cb7d8a0e271fb2c6f Mon Sep 17 00:00:00 2001
From: Orlando Cazalet-Hyams <orlando.hyams@sony.com>
Date: Fri, 13 Jun 2025 09:26:08 +0100
Subject: [PATCH 339/851] [NFC] Pack MDNodeKeyImpl<DILocation> from 40 to 32
 bytes (#143891)

---
 llvm/lib/IR/LLVMContextImpl.h | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 87cd52e357be2..ef279721b9643 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -310,36 +310,33 @@ template <> struct MDNodeKeyImpl<MDTuple> : MDNodeOpsKey {
 
 /// DenseMapInfo for DILocation.
 template <> struct MDNodeKeyImpl<DILocation> {
-  unsigned Line;
-  uint16_t Column;
   Metadata *Scope;
   Metadata *InlinedAt;
-  bool ImplicitCode;
 #ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
   uint64_t AtomGroup : 61;
   uint64_t AtomRank : 3;
 #endif
+  unsigned Line;
+  uint16_t Column;
+  bool ImplicitCode;
 
   MDNodeKeyImpl(unsigned Line, uint16_t Column, Metadata *Scope,
                 Metadata *InlinedAt, bool ImplicitCode, uint64_t AtomGroup,
                 uint8_t AtomRank)
-      : Line(Line), Column(Column), Scope(Scope), InlinedAt(InlinedAt),
-        ImplicitCode(ImplicitCode)
+      : Scope(Scope), InlinedAt(InlinedAt),
 #ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
-        ,
-        AtomGroup(AtomGroup), AtomRank(AtomRank)
+        AtomGroup(AtomGroup), AtomRank(AtomRank),
 #endif
-  {
+        Line(Line), Column(Column), ImplicitCode(ImplicitCode) {
   }
 
   MDNodeKeyImpl(const DILocation *L)
-      : Line(L->getLine()), Column(L->getColumn()), Scope(L->getRawScope()),
-        InlinedAt(L->getRawInlinedAt()), ImplicitCode(L->isImplicitCode())
+      : Scope(L->getRawScope()), InlinedAt(L->getRawInlinedAt()),
 #ifdef EXPERIMENTAL_KEY_INSTRUCTIONS
-        ,
-        AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank())
+        AtomGroup(L->getAtomGroup()), AtomRank(L->getAtomRank()),
 #endif
-  {
+        Line(L->getLine()), Column(L->getColumn()),
+        ImplicitCode(L->isImplicitCode()) {
   }
 
   bool isKeyOf(const DILocation *RHS) const {

From addd98f7a5b964a5a5860d65f327f3fc3b7e0a42 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Fri, 13 Jun 2025 09:31:57 +0100
Subject: [PATCH 340/851] [lldb][test] Don't call SBDebugger::Terminate if
 TestMultipleDebuggers times out (#143732)

Fixes #101162

This test did this:
* SBDebugger::Initialize
* Spawn a bunch of threads that do:
  * SBDebugger::Create
  * some work
  * SBDebugger::Destroy
* Wait on those threads to finish then call SBDebugger::Terminate and
exit, or -
* Reach a time limit before all the threads finish, call
SBDebugger::Terminate and exit.

The problem was that in the timeout case, calling SBDebugger::Terminate
destroys data being used by threads that are still running. I expect
this test was expecting said threads to be so broken they were probably
stuck, but when the machine is just heavily loaded, one of them might
read that data before the whole program exits.

This means what should have been a timeout becomes a crash. Sometimes.
Which explains why we saw both timeouts and various signals on the
AArch64 Linux bot. It depends on the timings.

So I'm changing it not to call SBDebugger::Terminate in the timeout
case. We will have to tweak the timeout value based on what happens on
the buildbot, but we will know it's machine load not an lldb bug.

Also use _exit instead of exit, to skip more cleanup that might cause a
crash.
---
 .../API/api/multiple-debuggers/TestMultipleDebuggers.py    | 2 --
 .../API/api/multiple-debuggers/multi-process-driver.cpp    | 7 +++++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
index 1fd4806cd74f4..f0a3893f53aab 100644
--- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
+++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
@@ -12,8 +12,6 @@
 class TestMultipleSimultaneousDebuggers(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
-    # Sometimes times out on Linux, see https://github.com/llvm/llvm-project/issues/101162.
-    @skipIfLinux
     @skipIfNoSBHeaders
     @skipIfWindows
     @skipIfHostIncompatibleWithTarget
diff --git a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
index 64728fb7c29a1..5ad75e3c1e472 100644
--- a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
+++ b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
@@ -296,6 +296,9 @@ int main (int argc, char **argv)
                  NUMBER_OF_SIMULTANEOUS_DEBUG_SESSIONS);
     }
 
-    SBDebugger::Terminate();
-    exit (1);
+    // We do not call SBDebugger::Terminate() here because it will destroy
+    // data that might be being used by threads that are still running. Which
+    // would change the timeout into an unrelated crash.
+    // _exit instead of exit, to skip more things that could cause a crash.
+    _exit(1);
 }

From 8ba62fdb3d2da2f5f199ee7a07222620a451293f Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern@gmail.com>
Date: Fri, 13 Jun 2025 16:47:56 +0800
Subject: [PATCH 341/851] [CIR] Function calls with aggregate arguments and
 return values (#143377)

This patch updates cir.call operation and allows function calls with
aggregate arguments and return values.

It seems that C++ class support is still at a minimum now. I tried to
make a call to a C++ function with an argument of aggregate type but it
failed because the initialization of C++ class / struct is NYI. I also
tried to inline this part of support into this patch, but the mixed
patch quickly blows in size and becomes unsuitable for review. Thus,
tests for calling functions with aggregate arguments are added only for
C for now.
---
 clang/include/clang/CIR/MissingFeatures.h     |   6 +
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |  12 ++
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          | 103 ++++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenCall.h            |  22 +++-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          |  12 +-
 clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp |  82 +++++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp      |  11 ++
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  20 +++-
 clang/lib/CIR/CodeGen/CIRGenValue.h           |  16 +++
 clang/test/CIR/CodeGen/call.c                 | 111 ++++++++++++++++++
 clang/test/CIR/CodeGen/call.cpp               |  32 +++++
 11 files changed, 411 insertions(+), 16 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/call.c

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 97b933657d742..225e9ec89a827 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -173,6 +173,10 @@ struct MissingFeatures {
   static bool stackSaveOp() { return false; }
   static bool aggValueSlot() { return false; }
   static bool aggValueSlotMayOverlap() { return false; }
+  static bool aggValueSlotVolatile() { return false; }
+  static bool aggValueSlotDestructedFlag() { return false; }
+  static bool aggValueSlotAlias() { return false; }
+  static bool aggValueSlotGC() { return false; }
   static bool generateDebugInfo() { return false; }
   static bool pointerOverflowSanitizer() { return false; }
   static bool fpConstraints() { return false; }
@@ -230,6 +234,8 @@ struct MissingFeatures {
   static bool attributeNoBuiltin() { return false; }
   static bool thunks() { return false; }
   static bool runCleanupsScope() { return false; }
+  static bool lowerAggregateLoadStore() { return false; }
+  static bool dataLayoutTypeAllocSize() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index fb1a290c18fa2..36c89809b4d90 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -332,6 +332,18 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return Address(baseAddr, destType, addr.getAlignment());
   }
 
+  /// Cast the element type of the given address to a different type,
+  /// preserving information like the alignment.
+  Address createElementBitCast(mlir::Location loc, Address addr,
+                               mlir::Type destType) {
+    if (destType == addr.getElementType())
+      return addr;
+
+    auto ptrTy = getPointerTo(destType);
+    return Address(createBitcast(loc, addr.getPointer(), ptrTy), destType,
+                   addr.getAlignment());
+  }
+
   cir::LoadOp createLoad(mlir::Location loc, Address addr,
                          bool isVolatile = false) {
     mlir::IntegerAttr align = getAlignmentAttr(addr.getAlignment());
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 5ec720ffd54f1..0d9064425fa95 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -60,6 +60,23 @@ CIRGenCallee CIRGenCallee::prepareConcreteCallee(CIRGenFunction &cgf) const {
   return *this;
 }
 
+void CIRGenFunction::emitAggregateStore(mlir::Value value, Address dest) {
+  // In classic codegen:
+  // Function to store a first-class aggregate into memory. We prefer to
+  // store the elements rather than the aggregate to be more friendly to
+  // fast-isel.
+  // In CIR codegen:
+  // Emit the most simple cir.store possible (e.g. a store for a whole
+  // record), which can later be broken down in other CIR levels (or prior
+  // to dialect codegen).
+
+  // Stored result for the callers of this function expected to be in the same
+  // scope as the value, don't make assumptions about current insertion point.
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointAfter(value.getDefiningOp());
+  builder.createStore(*currSrcLoc, value, dest);
+}
+
 /// Returns the canonical formal type of the given C++ method.
 static CanQual<FunctionProtoType> getFormalType(const CXXMethodDecl *md) {
   return md->getType()
@@ -439,8 +456,49 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
       assert(!cir::MissingFeatures::opCallBitcastArg());
       cirCallArgs[argNo] = v;
     } else {
-      assert(!cir::MissingFeatures::opCallAggregateArgs());
-      cgm.errorNYI("emitCall: aggregate function call argument");
+      Address src = Address::invalid();
+      if (!arg.isAggregate())
+        cgm.errorNYI(loc, "emitCall: non-aggregate call argument");
+      else
+        src = arg.hasLValue() ? arg.getKnownLValue().getAddress()
+                              : arg.getKnownRValue().getAggregateAddress();
+
+      // Fast-isel and the optimizer generally like scalar values better than
+      // FCAs, so we flatten them if this is safe to do for this argument.
+      auto argRecordTy = cast<cir::RecordType>(argType);
+      mlir::Type srcTy = src.getElementType();
+      // FIXME(cir): get proper location for each argument.
+      mlir::Location argLoc = loc;
+
+      // If the source type is smaller than the destination type of the
+      // coerce-to logic, copy the source value into a temp alloca the size
+      // of the destination type to allow loading all of it. The bits past
+      // the source value are left undef.
+      // FIXME(cir): add data layout info and compare sizes instead of
+      // matching the types.
+      //
+      // uint64_t SrcSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
+      // uint64_t DstSize = CGM.getDataLayout().getTypeAllocSize(STy);
+      // if (SrcSize < DstSize) {
+      assert(!cir::MissingFeatures::dataLayoutTypeAllocSize());
+      if (srcTy != argRecordTy) {
+        cgm.errorNYI(loc, "emitCall: source type does not match argument type");
+      } else {
+        // FIXME(cir): this currently only runs when the types are exactly the
+        // same, but should be when alloc sizes are the same, fix this as soon
+        // as datalayout gets introduced.
+        assert(!cir::MissingFeatures::dataLayoutTypeAllocSize());
+      }
+
+      // assert(NumCIRArgs == STy.getMembers().size());
+      // In LLVMGen: Still only pass the struct without any gaps but mark it
+      // as such somehow.
+      //
+      // In CIRGen: Emit a load from the "whole" struct,
+      // which shall be broken later by some lowering step into multiple
+      // loads.
+      assert(!cir::MissingFeatures::lowerAggregateLoadStore());
+      cirCallArgs[argNo] = builder.createLoad(argLoc, src);
     }
   }
 
@@ -479,6 +537,7 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
 
   assert(!cir::MissingFeatures::opCallAttrs());
 
+  mlir::Location callLoc = loc;
   cir::CIRCallOpInterface theCall = emitCallLikeOp(
       *this, loc, indirectFuncTy, indirectFuncVal, directFuncOp, cirCallArgs);
 
@@ -492,6 +551,19 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
   if (isa<cir::VoidType>(retCIRTy))
     return getUndefRValue(retTy);
   switch (getEvaluationKind(retTy)) {
+  case cir::TEK_Aggregate: {
+    Address destPtr = returnValue.getValue();
+
+    if (!destPtr.isValid())
+      destPtr = createMemTemp(retTy, callLoc, getCounterAggTmpAsString());
+
+    mlir::ResultRange results = theCall->getOpResults();
+    assert(results.size() <= 1 && "multiple returns from a call");
+
+    SourceLocRAIIObject loc{*this, callLoc};
+    emitAggregateStore(results[0], destPtr);
+    return RValue::getAggregate(destPtr);
+  }
   case cir::TEK_Scalar: {
     mlir::ResultRange results = theCall->getOpResults();
     assert(results.size() == 1 && "unexpected number of returns");
@@ -508,7 +580,6 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
     return RValue::get(results[0]);
   }
   case cir::TEK_Complex:
-  case cir::TEK_Aggregate:
     cgm.errorNYI(loc, "unsupported evaluation kind of function call result");
     return getUndefRValue(retTy);
   }
@@ -527,10 +598,21 @@ void CIRGenFunction::emitCallArg(CallArgList &args, const clang::Expr *e,
 
   bool hasAggregateEvalKind = hasAggregateEvaluationKind(argType);
 
-  if (hasAggregateEvalKind) {
-    assert(!cir::MissingFeatures::opCallAggregateArgs());
-    cgm.errorNYI(e->getSourceRange(),
-                 "emitCallArg: aggregate function call argument");
+  // In the Microsoft C++ ABI, aggregate arguments are destructed by the callee.
+  // However, we still have to push an EH-only cleanup in case we unwind before
+  // we make it to the call.
+  if (argType->isRecordType() &&
+      argType->castAs<RecordType>()->getDecl()->isParamDestroyedInCallee()) {
+    assert(!cir::MissingFeatures::msabi());
+    cgm.errorNYI(e->getSourceRange(), "emitCallArg: msabi is NYI");
+  }
+
+  if (hasAggregateEvalKind && isa<ImplicitCastExpr>(e) &&
+      cast<CastExpr>(e)->getCastKind() == CK_LValueToRValue) {
+    LValue lv = emitLValue(cast<CastExpr>(e)->getSubExpr());
+    assert(lv.isSimple());
+    args.addUncopiedAggregate(lv, argType);
+    return;
   }
 
   args.add(emitAnyExprToTemp(e), argType);
@@ -551,12 +633,13 @@ QualType CIRGenFunction::getVarArgType(const Expr *arg) {
 /// Similar to emitAnyExpr(), however, the result will always be accessible
 /// even if no aggregate location is provided.
 RValue CIRGenFunction::emitAnyExprToTemp(const Expr *e) {
-  assert(!cir::MissingFeatures::opCallAggregateArgs());
+  AggValueSlot aggSlot = AggValueSlot::ignored();
 
   if (hasAggregateEvaluationKind(e->getType()))
-    cgm.errorNYI(e->getSourceRange(), "emit aggregate value to temp");
+    aggSlot = createAggTemp(e->getType(), getLoc(e->getSourceRange()),
+                            getCounterAggTmpAsString());
 
-  return emitAnyExpr(e);
+  return emitAnyExpr(e, aggSlot);
 }
 
 void CIRGenFunction::emitCallArgs(
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.h b/clang/lib/CIR/CodeGen/CIRGenCall.h
index 15c9080448c8b..0353848f3ec0d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.h
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.h
@@ -133,8 +133,16 @@ struct CallArg {
   CallArg(RValue rv, clang::QualType ty)
       : rv(rv), hasLV(false), isUsed(false), ty(ty) {}
 
+  CallArg(LValue lv, clang::QualType ty)
+      : lv(lv), hasLV(true), isUsed(false), ty(ty) {}
+
   bool hasLValue() const { return hasLV; }
 
+  LValue getKnownLValue() const {
+    assert(hasLV && !isUsed);
+    return lv;
+  }
+
   RValue getKnownRValue() const {
     assert(!hasLV && !isUsed);
     return rv;
@@ -147,6 +155,10 @@ class CallArgList : public llvm::SmallVector<CallArg, 8> {
 public:
   void add(RValue rvalue, clang::QualType type) { emplace_back(rvalue, type); }
 
+  void addUncopiedAggregate(LValue lvalue, clang::QualType type) {
+    emplace_back(lvalue, type);
+  }
+
   /// Add all the arguments from another CallArgList to this one. After doing
   /// this, the old CallArgList retains its list of arguments, but must not
   /// be used to emit a call.
@@ -162,7 +174,15 @@ class CallArgList : public llvm::SmallVector<CallArg, 8> {
 
 /// Contains the address where the return value of a function can be stored, and
 /// whether the address is volatile or not.
-class ReturnValueSlot {};
+class ReturnValueSlot {
+  Address addr = Address::invalid();
+
+public:
+  ReturnValueSlot() = default;
+  ReturnValueSlot(Address addr) : addr(addr) {}
+
+  Address getValue() const { return addr; }
+};
 
 } // namespace clang::CIRGen
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 5d04faf443b8d..99f942fcf2cd3 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1010,16 +1010,20 @@ LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) {
 
 /// Emit code to compute the specified expression which
 /// can have any type.  The result is returned as an RValue struct.
-RValue CIRGenFunction::emitAnyExpr(const Expr *e) {
+RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot) {
   switch (CIRGenFunction::getEvaluationKind(e->getType())) {
   case cir::TEK_Scalar:
     return RValue::get(emitScalarExpr(e));
   case cir::TEK_Complex:
     cgm.errorNYI(e->getSourceRange(), "emitAnyExpr: complex type");
     return RValue::get(nullptr);
-  case cir::TEK_Aggregate:
-    cgm.errorNYI(e->getSourceRange(), "emitAnyExpr: aggregate type");
-    return RValue::get(nullptr);
+  case cir::TEK_Aggregate: {
+    if (aggSlot.isIgnored())
+      aggSlot = createAggTemp(e->getType(), getLoc(e->getSourceRange()),
+                              getCounterAggTmpAsString());
+    emitAggExpr(e, aggSlot);
+    return aggSlot.asRValue();
+  }
   }
   llvm_unreachable("bad evaluation kind");
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index 061123d55b882..ffe1b701b244e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -28,6 +28,15 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
   CIRGenFunction &cgf;
   AggValueSlot dest;
 
+  // Calls `fn` with a valid return value slot, potentially creating a temporary
+  // to do so. If a temporary is created, an appropriate copy into `Dest` will
+  // be emitted, as will lifetime markers.
+  //
+  // The given function should take a ReturnValueSlot, and return an RValue that
+  // points to said slot.
+  void withReturnValueSlot(const Expr *e,
+                           llvm::function_ref<RValue(ReturnValueSlot)> fn);
+
   AggValueSlot ensureSlot(mlir::Location loc, QualType t) {
     if (!dest.isIgnored())
       return dest;
@@ -40,16 +49,28 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
   AggExprEmitter(CIRGenFunction &cgf, AggValueSlot dest)
       : cgf(cgf), dest(dest) {}
 
+  /// Given an expression with aggregate type that represents a value lvalue,
+  /// this method emits the address of the lvalue, then loads the result into
+  /// DestPtr.
+  void emitAggLoadOfLValue(const Expr *e);
+
   void emitArrayInit(Address destPtr, cir::ArrayType arrayTy, QualType arrayQTy,
                      Expr *exprToVisit, ArrayRef<Expr *> args,
                      Expr *arrayFiller);
 
+  /// Perform the final copy to DestPtr, if desired.
+  void emitFinalDestCopy(QualType type, const LValue &src);
+
   void emitInitializationToLValue(Expr *e, LValue lv);
 
   void emitNullInitializationToLValue(mlir::Location loc, LValue lv);
 
   void Visit(Expr *e) { StmtVisitor<AggExprEmitter>::Visit(e); }
 
+  void VisitCallExpr(const CallExpr *e);
+
+  void VisitDeclRefExpr(DeclRefExpr *e) { emitAggLoadOfLValue(e); }
+
   void VisitInitListExpr(InitListExpr *e);
   void VisitCXXConstructExpr(const CXXConstructExpr *e);
 
@@ -80,6 +101,17 @@ static bool isTrivialFiller(Expr *e) {
   return false;
 }
 
+/// Given an expression with aggregate type that represents a value lvalue, this
+/// method emits the address of the lvalue, then loads the result into DestPtr.
+void AggExprEmitter::emitAggLoadOfLValue(const Expr *e) {
+  LValue lv = cgf.emitLValue(e);
+
+  // If the type of the l-value is atomic, then do an atomic load.
+  assert(!cir::MissingFeatures::opLoadStoreAtomic());
+
+  emitFinalDestCopy(e->getType(), lv);
+}
+
 void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
                                    QualType arrayQTy, Expr *e,
                                    ArrayRef<Expr *> args, Expr *arrayFiller) {
@@ -182,6 +214,18 @@ void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
   }
 }
 
+/// Perform the final copy to destPtr, if desired.
+void AggExprEmitter::emitFinalDestCopy(QualType type, const LValue &src) {
+  // If dest is ignored, then we're evaluating an aggregate expression
+  // in a context that doesn't care about the result.  Note that loads
+  // from volatile l-values force the existence of a non-ignored
+  // destination.
+  if (dest.isIgnored())
+    return;
+
+  cgf.cgm.errorNYI("emitFinalDestCopy: non-ignored dest is NYI");
+}
+
 void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) {
   const QualType type = lv.getType();
 
@@ -250,6 +294,44 @@ void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc,
   cgf.emitNullInitialization(loc, lv.getAddress(), lv.getType());
 }
 
+void AggExprEmitter::VisitCallExpr(const CallExpr *e) {
+  if (e->getCallReturnType(cgf.getContext())->isReferenceType()) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "reference return type");
+    return;
+  }
+
+  withReturnValueSlot(
+      e, [&](ReturnValueSlot slot) { return cgf.emitCallExpr(e, slot); });
+}
+
+void AggExprEmitter::withReturnValueSlot(
+    const Expr *e, llvm::function_ref<RValue(ReturnValueSlot)> fn) {
+  QualType retTy = e->getType();
+
+  assert(!cir::MissingFeatures::aggValueSlotDestructedFlag());
+  bool requiresDestruction =
+      retTy.isDestructedType() == QualType::DK_nontrivial_c_struct;
+  if (requiresDestruction)
+    cgf.cgm.errorNYI(
+        e->getSourceRange(),
+        "withReturnValueSlot: return value requiring destruction is NYI");
+
+  // If it makes no observable difference, save a memcpy + temporary.
+  //
+  // We need to always provide our own temporary if destruction is required.
+  // Otherwise, fn will emit its own, notice that it's "unused", and end its
+  // lifetime before we have the chance to emit a proper destructor call.
+  assert(!cir::MissingFeatures::aggValueSlotAlias());
+  assert(!cir::MissingFeatures::aggValueSlotGC());
+
+  Address retAddr = dest.getAddress();
+  assert(!cir::MissingFeatures::emitLifetimeMarkers());
+
+  assert(!cir::MissingFeatures::aggValueSlotVolatile());
+  assert(!cir::MissingFeatures::aggValueSlotDestructedFlag());
+  fn(ReturnValueSlot(retAddr));
+}
+
 void AggExprEmitter::VisitInitListExpr(InitListExpr *e) {
   if (e->hadArrayRangeDesignator())
     llvm_unreachable("GNU array range designator extension");
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index c5bd5109343d3..fd413fe86383a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -629,6 +629,17 @@ LValue CIRGenFunction::emitLValue(const Expr *e) {
   }
 }
 
+static std::string getVersionedTmpName(llvm::StringRef name, unsigned cnt) {
+  SmallString<256> buffer;
+  llvm::raw_svector_ostream out(buffer);
+  out << name << cnt;
+  return std::string(out.str());
+}
+
+std::string CIRGenFunction::getCounterAggTmpAsString() {
+  return getVersionedTmpName("agg.tmp", counterAggTmp++);
+}
+
 void CIRGenFunction::emitNullInitialization(mlir::Location loc, Address destPtr,
                                             QualType ty) {
   // Ignore empty classes in C++.
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index cf672b0c90e60..9421ea26a429f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -316,6 +316,10 @@ class CIRGenFunction : public CIRGenTypeCache {
     ~SourceLocRAIIObject() { restore(); }
   };
 
+  /// Hold counters for incrementally naming temporaries
+  unsigned counterAggTmp = 0;
+  std::string getCounterAggTmpAsString();
+
   /// Helpers to convert Clang's SourceLocation to a MLIR Location.
   mlir::Location getLoc(clang::SourceLocation srcLoc);
   mlir::Location getLoc(clang::SourceRange srcLoc);
@@ -695,6 +699,8 @@ class CIRGenFunction : public CIRGenTypeCache {
                          mlir::OpBuilder::InsertPoint ip,
                          mlir::Value arraySize = nullptr);
 
+  void emitAggregateStore(mlir::Value value, Address dest);
+
   void emitAggExpr(const clang::Expr *e, AggValueSlot slot);
 
   LValue emitAggExprToLValue(const Expr *e);
@@ -703,7 +709,8 @@ class CIRGenFunction : public CIRGenTypeCache {
   /// result is returned as an RValue struct. If this is an aggregate
   /// expression, the aggloc/agglocvolatile arguments indicate where the result
   /// should be returned.
-  RValue emitAnyExpr(const clang::Expr *e);
+  RValue emitAnyExpr(const clang::Expr *e,
+                     AggValueSlot aggSlot = AggValueSlot::ignored());
 
   /// Similarly to emitAnyExpr(), however, the result will always be accessible
   /// even if no aggregate location is provided.
@@ -1152,6 +1159,17 @@ class CIRGenFunction : public CIRGenTypeCache {
   void emitOpenACCDeclare(const OpenACCDeclareDecl &d);
   void emitOpenACCRoutine(const OpenACCRoutineDecl &d);
 
+  /// Create a temporary memory object for the given aggregate type.
+  AggValueSlot createAggTemp(QualType ty, mlir::Location loc,
+                             const Twine &name = "tmp",
+                             Address *alloca = nullptr) {
+    assert(!cir::MissingFeatures::aggValueSlot());
+    return AggValueSlot::forAddr(
+        createMemTemp(ty, loc, name, alloca), ty.getQualifiers(),
+        AggValueSlot::IsNotDestructed, AggValueSlot::IsNotAliased,
+        AggValueSlot::DoesNotOverlap);
+  }
+
 private:
   QualType getVarArgType(const Expr *arg);
 };
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index 258ae306f693d..c1e08ba1e9b67 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -306,6 +306,13 @@ class AggValueSlot {
   enum IsAliased_t { IsNotAliased, IsAliased };
   enum Overlap_t { MayOverlap, DoesNotOverlap };
 
+  /// Returns an aggregate value slot indicating that the aggregate
+  /// value is being ignored.
+  static AggValueSlot ignored() {
+    return forAddr(Address::invalid(), clang::Qualifiers(), IsNotDestructed,
+                   IsNotAliased, DoesNotOverlap);
+  }
+
   AggValueSlot(Address addr, clang::Qualifiers quals, bool destructedFlag,
                bool zeroedFlag, bool aliasedFlag, bool overlapFlag)
       : addr(addr), quals(quals), destructedFlag(destructedFlag),
@@ -333,7 +340,16 @@ class AggValueSlot {
 
   bool isIgnored() const { return !addr.isValid(); }
 
+  mlir::Value getPointer() const { return addr.getPointer(); }
+
   IsZeroed_t isZeroed() const { return IsZeroed_t(zeroedFlag); }
+
+  RValue asRValue() const {
+    if (isIgnored())
+      return RValue::getIgnored();
+    assert(!cir::MissingFeatures::aggValueSlot());
+    return RValue::getAggregate(getAddress());
+  }
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/test/CIR/CodeGen/call.c b/clang/test/CIR/CodeGen/call.c
new file mode 100644
index 0000000000000..13f3c5a21ceb0
--- /dev/null
+++ b/clang/test/CIR/CodeGen/call.c
@@ -0,0 +1,111 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+struct S {
+  int x;
+  int y;
+};
+
+void f1(struct S);
+void f2() {
+  struct S s;
+  f1(s);
+}
+
+// CIR-LABEL: cir.func @f2()
+// CIR:         %[[S:.+]] = cir.load align(4) %{{.+}} : !cir.ptr<!rec_S>, !rec_S
+// CIR-NEXT:    cir.call @f1(%[[S]]) : (!rec_S) -> ()
+
+// LLVM-LABEL: define void @f2()
+// LLVM:         %[[S:.+]] = load %struct.S, ptr %{{.+}}, align 4
+// LLVM-NEXT:    call void @f1(%struct.S %[[S]])
+
+// OGCG-LABEL: define dso_local void @f2()
+// OGCG:         %[[S:.+]] = load i64, ptr %{{.+}}, align 4
+// OGCG-NEXT:    call void @f1(i64 %[[S]])
+
+struct S f3();
+void f4() {
+  struct S s = f3();
+}
+
+// CIR-LABEL: cir.func @f4() {
+// CIR:         %[[S:.+]] = cir.call @f3() : () -> !rec_S
+// CIR-NEXT:    cir.store align(4) %[[S]], %{{.+}} : !rec_S, !cir.ptr<!rec_S>
+
+// LLVM-LABEL: define void @f4() {
+// LLVM:         %[[S:.+]] = call %struct.S (...) @f3()
+// LLVM-NEXT:    store %struct.S %[[S]], ptr %{{.+}}, align 4
+
+// OGCG-LABEL: define dso_local void @f4() #0 {
+// OGCG:         %[[S:.+]] = call i64 (...) @f3()
+// OGCG-NEXT:    store i64 %[[S]], ptr %{{.+}}, align 4
+
+struct Big {
+  int data[10];
+};
+
+void f5(struct Big);
+struct Big f6();
+
+void f7() {
+  struct Big b;
+  f5(b);
+}
+
+// CIR-LABEL: cir.func @f7()
+// CIR:         %[[B:.+]] = cir.load align(4) %{{.+}} : !cir.ptr<!rec_Big>, !rec_Big
+// CIR-NEXT:    cir.call @f5(%[[B]]) : (!rec_Big) -> ()
+
+// LLVM-LABEL: define void @f7() {
+// LLVM:         %[[B:.+]] = load %struct.Big, ptr %{{.+}}, align 4
+// LLVM-NEXT:    call void @f5(%struct.Big %[[B]])
+
+// OGCG-LABEL: define dso_local void @f7() #0 {
+// OGCG:         %[[B:.+]] = alloca %struct.Big, align 8
+// OGCG-NEXT:    call void @f5(ptr noundef byval(%struct.Big) align 8 %[[B]])
+
+void f8() {
+  struct Big b = f6();
+}
+
+// CIR-LABEL: cir.func @f8()
+// CIR:         %[[B:.+]] = cir.call @f6() : () -> !rec_Big
+// CIR:         cir.store align(4) %[[B]], %{{.+}} : !rec_Big, !cir.ptr<!rec_Big>
+
+// LLVM-LABEL: define void @f8() {
+// LLVM:        %[[B:.+]] = call %struct.Big (...) @f6()
+// LLVM-NEXT:   store %struct.Big %[[B]], ptr %{{.+}}, align 4
+
+// OGCG-LABEL: define dso_local void @f8() #0 {
+// OGCG:         %[[B:.+]] = alloca %struct.Big, align 4
+// OGCG-NEXT:    call void (ptr, ...) @f6(ptr dead_on_unwind writable sret(%struct.Big) align 4 %[[B]])
+
+void f9() {
+  f1(f3());
+}
+
+// CIR-LABEL: cir.func @f9()
+// CIR:         %[[SLOT:.+]] = cir.alloca !rec_S, !cir.ptr<!rec_S>, ["agg.tmp0"] {alignment = 4 : i64}
+// CIR-NEXT:    %[[RET:.+]] = cir.call @f3() : () -> !rec_S
+// CIR-NEXT:    cir.store align(4) %[[RET]], %[[SLOT]] : !rec_S, !cir.ptr<!rec_S>
+// CIR-NEXT:    %[[ARG:.+]] = cir.load align(4) %[[SLOT]] : !cir.ptr<!rec_S>, !rec_S
+// CIR-NEXT:    cir.call @f1(%[[ARG]]) : (!rec_S) -> ()
+
+// LLVM-LABEL: define void @f9() {
+// LLVM:         %[[SLOT:.+]] = alloca %struct.S, i64 1, align 4
+// LLVM-NEXT:    %[[RET:.+]] = call %struct.S (...) @f3()
+// LLVM-NEXT:    store %struct.S %[[RET]], ptr %[[SLOT]], align 4
+// LLVM-NEXT:    %[[ARG:.+]] = load %struct.S, ptr %[[SLOT]], align 4
+// LLVM-NEXT:    call void @f1(%struct.S %[[ARG]])
+
+// OGCG-LABEL: define dso_local void @f9() #0 {
+// OGCG:         %[[SLOT:.+]] = alloca %struct.S, align 4
+// OGCG-NEXT:    %[[RET:.+]] = call i64 (...) @f3()
+// OGCG-NEXT:    store i64 %[[RET]], ptr %[[SLOT]], align 4
+// OGCG-NEXT:    %[[ARG:.+]] = load i64, ptr %[[SLOT]], align 4
+// OGCG-NEXT:    call void @f1(i64 %[[ARG]])
diff --git a/clang/test/CIR/CodeGen/call.cpp b/clang/test/CIR/CodeGen/call.cpp
index 741cadeb5c764..cc25afce1e5a4 100644
--- a/clang/test/CIR/CodeGen/call.cpp
+++ b/clang/test/CIR/CodeGen/call.cpp
@@ -70,3 +70,35 @@ void f9() {
 // LLVM-LABEL: define void @_Z2f9v()
 // LLVM:         call void (i32, ...) @_Z2f8iz(i32 1)
 // LLVM:         call void (i32, ...) @_Z2f8iz(i32 1, i32 2, i32 3, i32 4)
+
+struct S {
+  int x;
+  int y;
+};
+
+S f10();
+void f11() {
+  S s = f10();
+}
+
+// CIR-LABEL: cir.func @_Z3f11v()
+// CIR:         %[[#s:]] = cir.call @_Z3f10v() : () -> !rec_S
+// CIR-NEXT:    cir.store align(4) %[[#s]], %{{.+}} : !rec_S, !cir.ptr<!rec_S>
+
+// LLVM-LABEL: define void @_Z3f11v()
+// LLVM:         %[[#s:]] = call %struct.S @_Z3f10v()
+// LLVM-NEXT:    store %struct.S %[[#s]], ptr %{{.+}}, align 4
+
+void f12() {
+  f10();
+}
+
+// CIR-LABEL: cir.func @_Z3f12v()
+// CIR:         %[[#slot:]] = cir.alloca !rec_S, !cir.ptr<!rec_S>, ["agg.tmp0"]
+// CIR-NEXT:    %[[#ret:]] = cir.call @_Z3f10v() : () -> !rec_S
+// CIR-NEXT:    cir.store align(4) %[[#ret]], %[[#slot]] : !rec_S, !cir.ptr<!rec_S>
+
+// LLVM-LABEL: define void @_Z3f12v() {
+// LLVM:         %[[#slot:]] = alloca %struct.S, i64 1, align 4
+// LLVM-NEXT:    %[[#ret:]] = call %struct.S @_Z3f10v()
+// LLVM-NEXT:    store %struct.S %[[#ret]], ptr %[[#slot]], align 4

From 2d49bc01cf07434138ea01ef7b9ba4b646b54183 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Fri, 13 Jun 2025 10:02:27 +0100
Subject: [PATCH 342/851] [LV][NFC] Tidy up check-prof-info.ll test (#143884)

---
 .../LoopVectorize/check-prof-info.ll          | 144 ++++++++++--------
 1 file changed, 83 insertions(+), 61 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
index 17013c5908065..87c1ccb702277 100644
--- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -1,24 +1,43 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:"
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s |  FileCheck %s
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-MASKED
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-@a = dso_local global [1024 x i32] zeroinitializer, align 16
-@b = dso_local global [1024 x i32] zeroinitializer, align 16
+@a = global [1024 x i32] zeroinitializer, align 16
+@b = global [1024 x i32] zeroinitializer, align 16
 
 ; Check correctness of profile info for vectorization without epilog.
-; Function Attrs: nofree norecurse nounwind uwtable
-define dso_local void @_Z3foov() local_unnamed_addr #0 {
+define void @_Z3foov() {
 ; CHECK-LABEL: @_Z3foov(
-; CHECK:  [[VECTOR_BODY:vector\.body]]:
-; CHECK:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
-; CHECK:  [[FOR_BODY:for\.body]]:
-; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
-; CHECK-MASKED:  [[VECTOR_BODY:vector\.body]]:
-; CHECK-MASKED:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
-; CHECK-MASKED:  [[FOR_BODY:for\.body]]:
-; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+; CHECK:  entry:
+; CHECK:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:  vector.ph:
+; CHECK:    br label [[VECTOR_BODY:%.*]]
+; CHECK:  vector.body:
+; CHECK:    br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:  middle.block:
+; CHECK:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK:  scalar.ph:
+; CHECK:    br label [[FOR_BODY:%.*]]
+; CHECK:  for.cond.cleanup:
+; CHECK:  for.body:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+;
+; CHECK-MASKED-LABEL: @_Z3foov(
+; CHECK-MASKED:  entry:
+; CHECK-MASKED:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-MASKED:  vector.ph:
+; CHECK-MASKED:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MASKED:  vector.body:
+; CHECK-MASKED:    br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-MASKED:  middle.block:
+; CHECK-MASKED:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-MASKED:  scalar.ph:
+; CHECK-MASKED:    br label [[FOR_BODY:%.*]]
+; CHECK-MASKED:  for.cond.cleanup:
+; CHECK-MASKED:  for.body:
+; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -27,32 +46,51 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 
 for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4, !tbaa !2
-  %1 = trunc i64 %indvars.iv to i32
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = trunc i64 %iv to i32
   %mul = mul nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx2, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
   %add = add nsw i32 %2, %mul
-  store i32 %add, ptr %arrayidx2, align 4, !tbaa !2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 1024
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6
+  store i32 %add, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
 }
 
 ; Check correctness of profile info for vectorization with epilog.
-; Function Attrs: nofree norecurse nounwind uwtable
-define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
+define void @_Z3foo2v() {
 ; CHECK-LABEL: @_Z3foo2v(
-; CHECK:  [[VECTOR_BODY:vector\.body]]:
-; CHECK:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
-; CHECK:  [[FOR_BODY:for\.body]]:
-; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
-; CHECK-MASKED:  [[VECTOR_BODY:vector\.body]]:
-; CHECK-MASKED:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
-; CHECK-MASKED:  [[FOR_BODY:for\.body]]:
-; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+; CHECK:  entry:
+; CHECK:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
+; CHECK:  vector.ph:
+; CHECK:    br label [[VECTOR_BODY:%.*]]
+; CHECK:  vector.body:
+; CHECK:    br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:  middle.block:
+; CHECK:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK:  scalar.ph:
+; CHECK:    br label [[FOR_BODY:%.*]]
+; CHECK:  for.cond.cleanup:
+; CHECK:  for.body:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+;
+; CHECK-MASKED-LABEL: @_Z3foo2v(
+; CHECK-MASKED:  entry:
+; CHECK-MASKED:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
+; CHECK-MASKED:  vector.ph:
+; CHECK-MASKED:    br label [[VECTOR_BODY:%.*]]
+; CHECK-MASKED:  vector.body:
+; CHECK-MASKED:    br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-MASKED:  middle.block:
+; CHECK-MASKED:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK-MASKED:  scalar.ph:
+; CHECK-MASKED:    br label [[FOR_BODY:%.*]]
+; CHECK-MASKED:  for.cond.cleanup:
+; CHECK-MASKED:  for.body:
+; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
 entry:
   br label %for.body
@@ -61,36 +99,20 @@ for.cond.cleanup:                                 ; preds = %for.body
   ret void
 
 for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %indvars.iv
-  %0 = load i32, ptr %arrayidx, align 4, !tbaa !2
-  %1 = trunc i64 %indvars.iv to i32
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %1 = trunc i64 %iv to i32
   %mul = mul nsw i32 %0, %1
-  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %indvars.iv
-  %2 = load i32, ptr %arrayidx2, align 4, !tbaa !2
+  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
+  %2 = load i32, ptr %arrayidx2, align 4
   %add = add nsw i32 %2, %mul
-  store i32 %add, ptr %arrayidx2, align 4, !tbaa !2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, 1027
-  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7
+  store i32 %add, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1027
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !1
 }
 
-attributes #0 = { "use-soft-float"="false" }
-
-!llvm.module.flags = !{!0}
-!llvm.ident = !{!1}
-
-; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
-; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
-; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
-; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
 
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
-!2 = !{!3, !3, i64 0}
-!3 = !{!"int", !4, i64 0}
-!4 = !{!"omnipotent char", !5, i64 0}
-!5 = !{!"Simple C++ TBAA"}
-!6 = !{!"branch_weights", i32 1, i32 1023}
-!7 = !{!"branch_weights", i32 1, i32 1026}
+!0 = !{!"branch_weights", i32 1, i32 1023}
+!1 = !{!"branch_weights", i32 1, i32 1026}

From 4b59b7b94608ddbd21d14bec68400f2eb21f510d Mon Sep 17 00:00:00 2001
From: Simone Pellegrini <simone.pellegrini@arm.com>
Date: Fri, 13 Jun 2025 11:03:09 +0200
Subject: [PATCH 343/851] [mlir][Linalg] Fix fusing of indexed linalg consumer
 with different axes (#140892)

When fusing two `linalg.genericOp`, where the producer has index
semantics, invalid `affine.apply` ops can be generated where the number
of indices do not match the number of loops in the fused genericOp.

This patch fixes the issue by directly using the number of loops from
the generated fused op.
---
 .../Linalg/Transforms/ElementwiseOpFusion.cpp |  3 +-
 .../Linalg/fusion-elementwise-ops.mlir        | 37 +++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 1f5af39e604e7..f97ed3d6d5111 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -231,8 +231,7 @@ static void generateFusedElementwiseOpRegion(
   // `consumerToProducerLoopsMap` to map the producer indices.
   if (producer.hasIndexSemantics()) {
     // Add an index operation for every fused loop dimension.
-    unsigned numFusedOpLoops =
-        std::max(producer.getNumLoops(), consumer.getNumLoops());
+    unsigned numFusedOpLoops = fusedOp.getNumLoops();
     SmallVector<Value> fusedIndices;
     fusedIndices.reserve(numFusedOpLoops);
     llvm::transform(llvm::seq<uint64_t>(0, numFusedOpLoops),
diff --git a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
index 28e1291bce1fa..66fc55fadf8fa 100644
--- a/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
+++ b/mlir/test/Dialect/Linalg/fusion-elementwise-ops.mlir
@@ -860,6 +860,43 @@ func.func @fusion_different_axes(%arg0 : tensor<5000xi64>, %arg1 : tensor<5000xi
 
 // -----
 
+func.func @fusion_different_axes_indexed(%arg0: tensor<2x2xi32>) ->  tensor<2xi32> {
+  %0 = tensor.empty() : tensor<2x2xi32>
+  %1 = linalg.generic {
+        indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
+        iterator_types = ["parallel", "parallel"]}
+        ins(%arg0 : tensor<2x2xi32>) outs(%0 : tensor<2x2xi32>) {
+          ^bb0(%in: i32, %out: i32):
+            %2 = linalg.index 1 : index
+            %3 = arith.index_cast %2 : index to i32
+            linalg.yield %3 : i32
+        } -> tensor<2x2xi32>
+  %4 = tensor.empty() : tensor<2xi32>
+  %5 = linalg.generic {
+        indexing_maps = [affine_map<(d0) -> (d0, 1)>, affine_map<(d0) -> (d0)>],
+        iterator_types = ["parallel"]}
+        ins(%1 : tensor<2x2xi32>) outs(%4 : tensor<2xi32>) {
+          ^bb0(%in: i32, %out: i32):
+            linalg.yield %in : i32
+        } -> tensor<2xi32>
+  return %5 : tensor<2xi32>
+}
+
+//  CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> (d0)>
+//      CHECK: func @fusion_different_axes_indexed(
+// CHECK-SAME:     %[[ARG0:.+]]: tensor<2x2xi32>
+//  CHECK-DAG:   %[[CST:.+]] = arith.constant 1 : i32
+//  CHECK-DAG:   %[[INIT:.+]] = tensor.empty() : tensor<2xi32>
+//      CHECK:   %[[RESULT:.+]] = linalg.generic
+// CHECK-SAME:       indexing_maps = [#[[MAP]]]
+// CHECK-SAME:       outs(%[[INIT]] :
+// CHECK-NEXT:   ^bb0(
+// CHECK-SAME:       %[[B0:.+]]: i32
+//      CHECK:     linalg.yield %[[CST]] : i32
+//      CHECK:   return %[[RESULT]]
+
+// -----
+
 // CHECK-LABEL: func @fold_fill_generic_basic
 //  CHECK-SAME: (%[[ARG0:.*]]: tensor<?xf32>) -> tensor<?xf32> {
 //   CHECK-NOT: linalg.fill

From 67c590004d055b7aeb0f82787041a114c3a136b3 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tim@gymni.ch>
Date: Fri, 13 Jun 2025 11:09:11 +0200
Subject: [PATCH 344/851] [mlir][AMDGPU] Add scaled floating point conversion
 ops (#141554)

implement `ScaledExtPackedOp` and `PackedScaledTruncOp`
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  62 ++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 189 ++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |   9 +
 .../Conversion/AMDGPUToROCDL/packed-ext.mlir  | 492 ++++++++++++++++
 .../AMDGPUToROCDL/packed-trunc.mlir           | 535 ++++++++++++++++++
 mlir/test/Dialect/AMDGPU/ops.mlir             | 315 +++++++++++
 6 files changed, 1601 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 02308568c1ad1..d58558ac32884 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -112,6 +112,38 @@ def AMDGPU_ExtPackedFp8Op :
   }];
 }
 
+def AMDGPU_ScaledExtPackedOp
+    : AMDGPU_Op<"scaled_ext_packed", [Pure]>,
+      Arguments<(
+          ins AnyTypeOf<[VectorOfLengthAndType<[1, 2, 3, 4], [F8E5M2, F8E4M3FN]>,
+                         VectorOfLengthAndType<[1, 2, 3, 4, 5, 6, 7, 8],
+                                               [F4E2M1FN]>]>:$source,
+          F32:$scale,
+          ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index)>,
+      Results<(
+          outs AnyTypeOf<[FixedVectorOfLengthAndType<[2], [F32]>,
+                          FixedVectorOfLengthAndType<[2], [F16]>,
+                          FixedVectorOfLengthAndType<[2], [BF16]>]>:$res)> {
+  let summary = "Extend a vector of packed floating point values";
+
+  let description = [{
+    Extend and scale two packed floats in `source[index]` to two floats and 
+    return them.
+
+    This rather unusual signature arises from the fact that AMD GPUs cannot
+    easily work with sub 32-bit quantities, so the compiler intrinsics for
+    extending 8-bit floats (which are, currently, the only way to work with
+    this operation) take packed vectors of 2 such floats.
+
+    If the passed-in vector has fewer than two elements, or the input is scalar,
+    the remaining values in the <2 x i8> will be filled with
+    undefined values as needed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $source `[` $index `]` `,` $scale `:` type($source) `to` type($res)
+  }];
+}
+
 def AMDGPU_PackedTrunc2xFp8Op :
     AMDGPU_Op<"packed_trunc_2xfp8", [Pure, AttrSizedOperandSegments]>,
     Arguments<(ins F32:$sourceA,
@@ -139,6 +171,36 @@ def AMDGPU_PackedTrunc2xFp8Op :
   let hasVerifier = 1;
 }
 
+def AMDGPU_PackedScaledTruncOp
+    : AMDGPU_Op<"packed_scaled_trunc", [Pure]>,
+      Arguments<(ins VectorOfLengthAndType<[1, 2], [F32, F16, BF16]>:$source,
+          F32:$scale,
+          ConfinedAttr<I32Attr, [IntNonNegative, IntMaxValue<7>]>:$index,
+          Optional<AnyTypeOf<
+              [FixedVectorOfLengthAndType<[4], [F8E5M2, F8E4M3FN]>,
+               FixedVectorOfLengthAndType<[8], [F4E2M1FN]>]>>:$existing)>,
+      Results<(
+          outs AnyTypeOf<[FixedVectorOfLengthAndType<[4], [F8E5M2, F8E4M3FN]>,
+                          FixedVectorOfLengthAndType<[8], [F4E2M1FN]>]>:$res)> {
+  let summary = "Round two floats into a packed vector of floats";
+  let description = [{
+    Scale and round the inputs `source` (which is undefined if not
+    specified) into the low or high word (bottom two or top two) elements
+    of the returned vector, keeping the other two elements of `existing`
+    unchanged if present (or undefined if it was not passed in).
+
+    The reason for this odd signature is that AMD GPUs cannot easily work with
+    sub-registers, and so the conversion intrinsics take 32-bit wide
+    packed vectors of float values.
+  }];
+  let assemblyFormat = [{
+    attr-dict $source `into` ($existing^):(`undef`)? `[` $index `]`
+    `,` $scale
+    `:` type($source) `to` type($res) (`into` type($existing)^)?
+  }];
+  let hasVerifier = 1;
+}
+
 def AMDGPU_PackedStochRoundFp8Op :
     AMDGPU_Op<"packed_stoch_round_fp8", [Pure]>,
     Arguments<(ins F32:$source,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index c5094799bbef7..5e6f675a6414b 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
 #include <optional>
 
 namespace mlir {
@@ -1174,6 +1175,32 @@ struct PackedStochRoundFp8OpLowering final
                   PackedStochRoundFp8OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override;
 };
+
+struct ScaledExtPackedOpLowering final
+    : public ConvertOpToLLVMPattern<ScaledExtPackedOp> {
+  ScaledExtPackedOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<amdgpu::ScaledExtPackedOp>(converter),
+        chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+struct PackedScaledTruncOpLowering final
+    : public ConvertOpToLLVMPattern<PackedScaledTruncOp> {
+  PackedScaledTruncOpLowering(const LLVMTypeConverter &converter,
+                              Chipset chipset)
+      : ConvertOpToLLVMPattern<amdgpu::PackedScaledTruncOp>(converter),
+        chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(PackedScaledTruncOp op, PackedScaledTruncOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 } // end namespace
 
 LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
@@ -1230,6 +1257,165 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
   return success();
 }
 
+LogicalResult ScaledExtPackedOpLowering::matchAndRewrite(
+    ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Location loc = op.getLoc();
+  if (chipset != kGfx950)
+    return rewriter.notifyMatchFailure(
+        loc, "Scaled fp conversion instructions are not available on target "
+             "architecture and their emulation is not implemented");
+  Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());
+
+  Value source = adaptor.getSource();
+  Value scale = adaptor.getScale();
+
+  VectorType sourceVecType = cast<VectorType>(op.getSource().getType());
+  Type sourceElemType = sourceVecType.getElementType();
+  VectorType destVecType = cast<VectorType>(op.getResult().getType());
+  Type destElemType = destVecType.getElementType();
+
+  VectorType packedVecType;
+  if (isa<Float8E5M2Type, Float8E4M3FNType>(sourceElemType)) {
+    VectorType v4i8 = VectorType::get(4, rewriter.getI8Type());
+    packedVecType = cast<VectorType>(getTypeConverter()->convertType(v4i8));
+  } else if (isa<Float4E2M1FNType>(sourceElemType)) {
+    VectorType v8i4 = VectorType::get(8, rewriter.getI4Type());
+    packedVecType = cast<VectorType>(getTypeConverter()->convertType(v8i4));
+  } else {
+    llvm_unreachable("invalid element type for scaled ext");
+  }
+
+  // Extend to a packedVectorType
+  if (sourceVecType.getNumElements() < packedVecType.getNumElements()) {
+    Value longVec = rewriter.create<LLVM::ZeroOp>(loc, packedVecType);
+    if (!sourceVecType) {
+      longVec = rewriter.create<LLVM::InsertElementOp>(
+          loc, longVec, source, createI32Constant(rewriter, loc, 0));
+    } else {
+      for (int32_t i = 0, e = sourceVecType.getNumElements(); i < e; ++i) {
+        Value idx = createI32Constant(rewriter, loc, i);
+        Value elem = rewriter.create<LLVM::ExtractElementOp>(loc, source, idx);
+        longVec =
+            rewriter.create<LLVM::InsertElementOp>(loc, longVec, elem, idx);
+      }
+    }
+    source = longVec;
+  }
+  Value i32Source = rewriter.create<LLVM::BitcastOp>(loc, i32, source);
+
+  if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isF32())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Bf8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Bf8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E5M2Type>(sourceElemType) && destElemType.isBF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Bf8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isF32())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Fp8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Fp8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float8E4M3FNType>(sourceElemType) && destElemType.isBF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Fp8Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isF32())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF32Fp4Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkF16Fp4Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else if (isa<Float4E2M1FNType>(sourceElemType) && destElemType.isBF16())
+    rewriter.replaceOpWithNewOp<ROCDL::CvtScaleF32PkBf16Fp4Op>(
+        op, destVecType, i32Source, scale, op.getIndex());
+  else
+    return failure();
+
+  return success();
+}
+
+LogicalResult PackedScaledTruncOpLowering::matchAndRewrite(
+    PackedScaledTruncOp op, PackedScaledTruncOpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Location loc = op.getLoc();
+  if (chipset != kGfx950)
+    return rewriter.notifyMatchFailure(
+        loc, "Scaled fp conversion instructions are not available on target "
+             "architecture and their emulation is not implemented");
+  Type v2i16 = getTypeConverter()->convertType(
+      VectorType::get(2, rewriter.getI16Type()));
+  Type i32 = getTypeConverter()->convertType(rewriter.getI32Type());
+
+  Type resultType = op.getResult().getType();
+  Type resultElemType = getElementTypeOrSelf(resultType);
+  VectorType sourceVecType = cast<VectorType>(op.getSource().getType());
+  Type sourceElemType = sourceVecType.getElementType();
+
+  Type intResultType = isa<Float4E2M1FNType>(resultElemType) ? i32 : v2i16;
+
+  Value source = adaptor.getSource();
+  Value scale = adaptor.getScale();
+  Value existing = adaptor.getExisting();
+  if (existing)
+    existing = rewriter.create<LLVM::BitcastOp>(loc, intResultType, existing);
+  else
+    existing = rewriter.create<LLVM::ZeroOp>(loc, intResultType);
+
+  if (sourceVecType.getNumElements() < 2) {
+    Value c0 = createI32Constant(rewriter, loc, 0);
+    Value elem0 = rewriter.create<LLVM::ExtractElementOp>(loc, source, c0);
+    VectorType v2 = VectorType::get(2, sourceElemType);
+    source = rewriter.create<LLVM::ZeroOp>(loc, v2);
+    source = rewriter.create<LLVM::InsertElementOp>(loc, source, elem0, c0);
+  }
+
+  Value sourceA, sourceB;
+  if (sourceElemType.isF32()) {
+    Value c0 = createI32Constant(rewriter, loc, 0);
+    Value c1 = createI32Constant(rewriter, loc, 1);
+    sourceA = rewriter.create<LLVM::ExtractElementOp>(loc, source, c0);
+    sourceB = rewriter.create<LLVM::ExtractElementOp>(loc, source, c1);
+  }
+
+  Value result;
+  if (sourceElemType.isF32() && isa<Float8E5M2Type>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkBf8F32Op>(
+        loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+  else if (sourceElemType.isF16() && isa<Float8E5M2Type>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkBf8F16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isBF16() && isa<Float8E5M2Type>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkBf8Bf16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isF32() && isa<Float8E4M3FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp8F32Op>(
+        loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+  else if (sourceElemType.isF16() && isa<Float8E4M3FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp8F16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isBF16() && isa<Float8E4M3FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp8Bf16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isF32() && isa<Float4E2M1FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp4F32Op>(
+        loc, intResultType, existing, sourceA, sourceB, scale, op.getIndex());
+  else if (sourceElemType.isF16() && isa<Float4E2M1FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp4F16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else if (sourceElemType.isBF16() && isa<Float4E2M1FNType>(resultElemType))
+    result = rewriter.create<ROCDL::CvtScaleF32PkFp4Bf16Op>(
+        loc, intResultType, existing, source, scale, op.getIndex());
+  else
+    return failure();
+
+  result = rewriter.replaceOpWithNewOp<LLVM::BitcastOp>(
+      op, getTypeConverter()->convertType(resultType), result);
+  return success();
+}
+
 LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
     PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1547,7 +1733,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                ROCDL::RawPtrBufferAtomicCmpSwap>,
            AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
            MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
-           ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+           ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
+           PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
            PackedStochRoundFp8OpLowering, GatherToLDSOpLowering>(converter,
                                                                  chipset);
   patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index a0a98a4e86721..0d0add3094666 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -60,6 +60,15 @@ LogicalResult PackedStochRoundFp8Op::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// mxfp float ops
+//===----------------------------------------------------------------------===//
+LogicalResult PackedScaledTruncOp::verify() {
+  if (getExisting() && getExisting().getType() != getResult().getType())
+    return emitOpError("existing values must have same type as result");
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // FatRawBufferCastOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
new file mode 100644
index 0000000000000..ad2e7684afc4a
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-ext.mlir
@@ -0,0 +1,492 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f32
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f8e4m3_f32(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f8e4m3_f16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_bf16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f8e4m3_bf16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E4M3FN> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f32
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f8e5m2_f32(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f8e5m2_f16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_bf16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f8e5m2_bf16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf8E5M2> to vector<2xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi8>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f32
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_full_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_full_f4e2m1_f16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_bf16
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_full_f4e2m1_bf16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f32
+// CHECK-DAG:   [[CAST:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST:%.+]] = llvm.bitcast [[CAST]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_half_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C2:%.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK:       [[ELEM_2:%.+]] = llvm.extractelement [[V]]{{\[}}[[C2]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_2:%.+]] = llvm.insertelement [[ELEM_2]], [[VEC_1]]{{\[}}[[C2]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C3:%.+]] = llvm.mlir.constant(3 : i32) : i32
+// CHECK:       [[ELEM_3:%.+]] = llvm.extractelement [[V]]{{\[}}[[C3]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_3:%.+]] = llvm.insertelement [[ELEM_3]], [[VEC_2]]{{\[}}[[C3]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_3]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_half_f4e2m1_f16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<4xf4E2M1FN> to vector<4xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C2:%.+]] = llvm.mlir.constant(2 : i32) : i32
+// CHECK:       [[ELEM_2:%.+]] = llvm.extractelement [[V]]{{\[}}[[C2]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_2:%.+]] = llvm.insertelement [[ELEM_2]], [[VEC_1]]{{\[}}[[C2]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C3:%.+]] = llvm.mlir.constant(3 : i32) : i32
+// CHECK:       [[ELEM_3:%.+]] = llvm.extractelement [[V]]{{\[}}[[C3]] : i32] : vector<4xi4>
+// CHECK:       [[VEC_3:%.+]] = llvm.insertelement [[ELEM_3]], [[VEC_2]]{{\[}}[[C3]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_3]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_half_f4e2m1_bf16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_scalar_f4e2m1_f32(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_scalar_f4e2m1_f16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<2xf4E2M1FN> to vector<2xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM_1:%.+]] = llvm.extractelement [[V]]{{\[}}[[C1]] : i32] : vector<2xi4>
+// CHECK:       [[VEC_1:%.+]] = llvm.insertelement [[ELEM_1]], [[VEC_0]]{{\[}}[[C1]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_1]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_one_f8e4m3_f32(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_one_f8e4m3_f16(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e4m3_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E4M3FN> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_one_f8e4m3_bf16(%v: vector<1xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.bf8 [[BITCAST]][false], %arg1 : vector<2xf32>
+func.func @scaled_ext_one_f8e5m2_f32(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.bf8 [[BITCAST]][false], %arg1 : vector<2xf16>
+func.func @scaled_ext_one_f8e5m2_f16(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f8e5m2_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf8E5M2> to vector<1xi8>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<4xi8>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi8>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<4xi8>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<4xi8> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.bf8 [[BITCAST]][false], %arg1 : vector<2xbf16>
+func.func @scaled_ext_one_f8e5m2_bf16(%v: vector<1xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_f32
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f32.fp4 [[BITCAST]][0], %arg1 : vector<2xf32>
+func.func @scaled_ext_one_f4e2m1_f32(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_f16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.f16.fp4 [[BITCAST]][0], %arg1 : vector<2xf16>
+func.func @scaled_ext_one_f4e2m1_f16(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_one_f4e2m1_bf16
+// CHECK:       [[V:%.+]] = builtin.unrealized_conversion_cast %arg0 : vector<1xf4E2M1FN> to vector<1xi4>
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<8xi4>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[ELEM_0:%.+]] = llvm.extractelement [[V]]{{\[}}[[C0]] : i32] : vector<1xi4>
+// CHECK:       [[VEC_0:%.+]] = llvm.insertelement [[ELEM_0]], [[ZERO]]{{\[}}[[C0]] : i32] : vector<8xi4>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[VEC_0]] : vector<8xi4> to i32
+// CHECK:       rocdl.cvt.scalef32.pk.bf16.fp4 [[BITCAST]][0], %arg1 : vector<2xbf16>
+func.func @scaled_ext_one_f4e2m1_bf16(%v: vector<1xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<1xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
new file mode 100644
index 0000000000000..e9764d34cefaf
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/packed-trunc.mlir
@@ -0,0 +1,535 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f32_vec1(%v: vector<1xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf32> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f32_vec1(%v: vector<1xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 [[INSERT]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_f16_vec1(%v: vector<1xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.f16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_f16_vec1(%v: vector<1xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 [[INSERT]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_f8e4m3_bf16_vec1(%v: vector<1xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xbf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E4M3FN> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp8.bf16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E4M3FN>
+// CHECK:       return [[CAST]] : vector<4xf8E4M3FN>
+func.func @packed_scaled_trunc_into_f8e4m3_bf16_vec1(%v: vector<1xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f32_vec1(%v: vector<1xf32>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf32> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f32_vec1(%v: vector<1xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 [[INSERT]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_f16_vec1(%v: vector<1xf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.f16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_f16_vec1(%v: vector<1xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 %arg0, %arg1 -> [[ZERO]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16_vec1
+// CHECK-DAG:   [[ZERO_I16:%.+]] = llvm.mlir.zero : vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 [[INSERT]], %arg1 -> [[ZERO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_f8e5m2_bf16_vec1(%v: vector<1xbf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xbf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16
+// CHECK-DAG:   [[EXISTING_CAST_TO_I8:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST_TO_I16:%.+]] = llvm.bitcast [[EXISTING_CAST_TO_I8]] : vector<4xi8> to vector<2xi16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 %arg0, %arg2 -> [[EXISTING_BITCAST_TO_I16]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<4xf8E5M2> to vector<4xi8>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<4xi8> to vector<2xi16>
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.bf8.bf16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][false] : vector<2xi16>
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : vector<2xi16> to vector<4xi8>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<4xi8> to vector<4xf8E5M2>
+// CHECK:       return [[CAST]] : vector<4xf8E5M2>
+func.func @packed_scaled_trunc_into_f8e5m2_bf16_vec1(%v: vector<1xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : i32
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32_vec1
+// CHECK-DAG:   [[ZERO_I32:%.+]] = llvm.mlir.zero : i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg1 -> [[ZERO_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f32_vec1(%v: vector<1xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf32> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32
+// CHECK-DAG:   [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK-DAG:   [[C0:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement %arg0{{\[}}[[C0]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement %arg0{{\[}}[[C1]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<8xi4> to i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf32>
+// CHECK:       [[ZERO_F32:%.+]] = llvm.mlir.zero : vector<2xf32>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F32]]{{\[}}[[C0_I32]] : i32] : vector<2xf32>
+// CHECK-DAG:   [[C0_I32_2:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK-DAG:   [[C1_I32:%.+]] = llvm.mlir.constant(1 : i32) : i32
+// CHECK:       [[ELEM0:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C0_I32_2]] : i32] : vector<2xf32>
+// CHECK:       [[ELEM1:%.+]] = llvm.extractelement [[INSERT]]{{\[}}[[C1_I32]] : i32] : vector<2xf32>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f32 [[ELEM0]], [[ELEM1]], %arg2 -> [[EXISTING_BITCAST]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f32_vec1(%v: vector<1xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : i32
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 %arg0, %arg1 -> [[ZERO]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16_vec1
+// CHECK-DAG:   [[ZERO_I32:%.+]] = llvm.mlir.zero : i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 [[INSERT]], %arg1 -> [[ZERO_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_f16_vec1(%v: vector<1xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16
+// CHECK-DAG:   [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 %arg0, %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<8xi4> to i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xf16>
+// CHECK:       [[ZERO_F16:%.+]] = llvm.mlir.zero : vector<2xf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_F16]]{{\[}}[[C0_I32]] : i32] : vector<2xf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.f16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_f16_vec1(%v: vector<1xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16
+// CHECK-DAG:   [[ZERO:%.+]] = llvm.mlir.zero : i32
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 %arg0, %arg1 -> [[ZERO]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16_vec1
+// CHECK-DAG:   [[ZERO_I32:%.+]] = llvm.mlir.zero : i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 [[INSERT]], %arg1 -> [[ZERO_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_f4e2m1_bf16_vec1(%v: vector<1xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<1xbf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16
+// CHECK-DAG:   [[BITCAST_I4:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[BITCAST_I32:%.+]] = llvm.bitcast [[BITCAST_I4]] : vector<8xi4> to i32
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 %arg0, %arg2 -> [[BITCAST_I32]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_bf16(%v: vector<2xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16_vec1
+// CHECK-DAG:   [[EXISTING_CAST:%.+]] = builtin.unrealized_conversion_cast %arg1 : vector<8xf4E2M1FN> to vector<8xi4>
+// CHECK-DAG:   [[EXISTING_BITCAST:%.+]] = llvm.bitcast [[EXISTING_CAST]] : vector<8xi4> to i32
+// CHECK-DAG:   [[C0_I32:%.+]] = llvm.mlir.constant(0 : i32) : i32
+// CHECK:       [[EXTRACT:%.+]] = llvm.extractelement %arg0{{\[}}[[C0_I32]] : i32] : vector<1xbf16>
+// CHECK:       [[ZERO_BF16:%.+]] = llvm.mlir.zero : vector<2xbf16>
+// CHECK:       [[INSERT:%.+]] = llvm.insertelement [[EXTRACT]], [[ZERO_BF16]]{{\[}}[[C0_I32]] : i32] : vector<2xbf16>
+// CHECK:       [[CVT:%.+]] = rocdl.cvt.scalef32.pk.fp4.bf16 [[INSERT]], %arg2 -> [[EXISTING_BITCAST]][0] : i32
+// CHECK:       [[BITCAST:%.+]] = llvm.bitcast [[CVT]] : i32 to vector<8xi4>
+// CHECK:       [[CAST:%.+]] = builtin.unrealized_conversion_cast [[BITCAST]] : vector<8xi4> to vector<8xf4E2M1FN>
+// CHECK:       return [[CAST]] : vector<8xf4E2M1FN>
+func.func @packed_scaled_trunc_into_f4e2m1_bf16_vec1(%v: vector<1xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<1xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 188cfcc4eb38b..6c3ffb575f7c2 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -32,6 +32,321 @@ func.func @packed_stoch_round_fp8(%v1: f32, %stoch: i32, %others: vector<4xf8E5M
   func.return %ret : vector<4xf8E5M2FNUZ>
 }
 
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_f32(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_f16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e4m3_bf16(%v: vector<4xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_f32(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_f16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e4m3_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e4m3_bf16(%v: vector<2xf8E4M3FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E4M3FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_f32(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_f16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f8e5m2_bf16(%v: vector<4xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_f32(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_f16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f8e5m2_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f8e5m2_bf16(%v: vector<2xf8E5M2>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf8E5M2> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_f16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_full_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_full_f4e2m1_bf16(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_f32(%v: vector<8xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<8xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_f16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_half_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_half_f4e2m1_bf16(%v: vector<4xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<4xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f32
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_f32(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf32> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf32>
+  func.return %ret : vector<2xf32>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_f16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_f16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xf16>
+  func.return %ret : vector<2xf16>
+}
+
+// CHECK-LABEL: func.func @scaled_ext_scalar_f4e2m1_bf16
+// CHECK: amdgpu.scaled_ext_packed
+func.func @scaled_ext_scalar_f4e2m1_bf16(%v: vector<2xf4E2M1FN>, %scale: f32) -> vector<2xbf16> {
+  %ret = amdgpu.scaled_ext_packed %v[0], %scale : vector<2xf4E2M1FN> to vector<2xbf16>
+  func.return %ret : vector<2xbf16>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_f32(%v: vector<2xf32>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_f16(%v: vector<2xf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e4m3_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e4m3_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e4m3_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e4m3_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E4M3FN>, %scale: f32) -> vector<4xf8E4M3FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E4M3FN> into vector<4xf8E4M3FN>
+  func.return %ret : vector<4xf8E4M3FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_f32(%v: vector<2xf32>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_f32(%v: vector<2xf32>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_f16(%v: vector<2xf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_f16(%v: vector<2xf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f8e5m2_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f8e5m2_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f8e5m2_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f8e5m2_bf16(%v: vector<2xbf16>, %existing: vector<4xf8E5M2>, %scale: f32) -> vector<4xf8E5M2> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<4xf8E5M2> into vector<4xf8E5M2>
+  func.return %ret : vector<4xf8E5M2>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_f32(%v: vector<2xf32>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf32> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f32
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_f32(%v: vector<2xf32>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf32> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_f16(%v: vector<2xf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_f16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_f16(%v: vector<2xf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_f4e2m1_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_f4e2m1_bf16(%v: vector<2xbf16>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into undef[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
+// CHECK-LABEL: func.func @packed_scaled_trunc_into_f4e2m1_bf16
+// CHECK: amdgpu.packed_scaled_trunc
+func.func @packed_scaled_trunc_into_f4e2m1_bf16(%v: vector<2xbf16>, %existing: vector<8xf4E2M1FN>, %scale: f32) -> vector<8xf4E2M1FN> {
+  %ret = amdgpu.packed_scaled_trunc %v into %existing[0], %scale : vector<2xbf16> to vector<8xf4E2M1FN> into vector<8xf4E2M1FN>
+  func.return %ret : vector<8xf4E2M1FN>
+}
+
 // CHECK-LABEL: func @fat_raw_buffer_cast_easy
 // CHECK: amdgpu.fat_raw_buffer_cast
 func.func @fat_raw_buffer_cast_easy(%m: memref<8xi32>) -> memref<8xi32, #amdgpu.address_space<fat_raw_buffer>> {

From 06c783567069db169ee2d1545a4bd3ffd0e3fec0 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Fri, 13 Jun 2025 09:10:30 +0000
Subject: [PATCH 345/851] [lldb][test] Disable TestMultipleDebuggers again

I did manage to turn a crash into a non-zero return code,
but on the very first build it managed to time out.

I thought I had the appetite to tweak timeouts but
on second thought, I don't want yet another test to look
out for.

The test is not wrong, but on heavily loaded machines
it's always going to be inherently unstable.
---
 lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
index f0a3893f53aab..7d6fdd444791e 100644
--- a/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
+++ b/lldb/test/API/api/multiple-debuggers/TestMultipleDebuggers.py
@@ -12,6 +12,10 @@
 class TestMultipleSimultaneousDebuggers(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
+    # Times out on heavily loaded Linux buildbots, don't want to get into tweaking
+    # the timeout per bot. Does work when run alone. See:
+    # https://github.com/llvm/llvm-project/issues/101162
+    @skipIfLinux
     @skipIfNoSBHeaders
     @skipIfWindows
     @skipIfHostIncompatibleWithTarget

From 5762491e2a1935911c1e998a4865591d429f8559 Mon Sep 17 00:00:00 2001
From: SivanShani-Arm <sivan.shani@arm.com>
Date: Fri, 13 Jun 2025 11:02:33 +0100
Subject: [PATCH 346/851] [lld] Refactor storage of PAuth ABI core info
 (#141920)

Previously, the AArch64 PAuth ABI core values were stored as an
ArrayRef<uint8_t>, introducing unnecessary indirection.

This patch replaces the ArrayRef with two explicit uint64_t fields:
aarch64PauthAbiPlatform and aarch64PauthAbiVersion. This simplifies the
representation and improves readability.

No functional change intended, aside from improved error messages.
---
 lld/ELF/Arch/AArch64.cpp             |  3 +--
 lld/ELF/Config.h                     | 19 ++++++++++++++++++-
 lld/ELF/Driver.cpp                   | 27 +++++++++++++++++----------
 lld/ELF/InputFiles.cpp               |  6 ++++--
 lld/ELF/InputFiles.h                 |  2 +-
 lld/ELF/SyntheticSections.cpp        | 14 +++++++-------
 lld/test/ELF/aarch64-feature-pauth.s |  8 ++++++--
 7 files changed, 54 insertions(+), 25 deletions(-)

diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp
index 2b5d5e90573fe..8a225ed103eef 100644
--- a/lld/ELF/Arch/AArch64.cpp
+++ b/lld/ELF/Arch/AArch64.cpp
@@ -1043,8 +1043,7 @@ AArch64BtiPac::AArch64BtiPac(Ctx &ctx) : AArch64(ctx) {
   // instructions.
 
   if (ctx.arg.zPacPlt) {
-    if (llvm::any_of(ctx.aarch64PauthAbiCoreInfo,
-                     [](uint8_t c) { return c != 0; }))
+    if (ctx.aarch64PauthAbiCoreInfo && ctx.aarch64PauthAbiCoreInfo->isValid())
       pacEntryKind = PEK_Auth;
     else
       pacEntryKind = PEK_AuthHint;
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 3a9001d2cc8b8..a2f7759fb7d37 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -139,6 +139,23 @@ enum class GcsPolicy { Implicit, Never, Always };
 // For some options that resemble -z bti-report={none,warning,error}
 enum class ReportPolicy { None, Warning, Error };
 
+// Describes the signing schema for a file using the PAuth ABI extension.
+// Two files are considered compatible when both `platform` and `version` match.
+// The pair (0, 0) is reserved to indicate incompatibility with the PAuth ABI.
+struct AArch64PauthAbiCoreInfo {
+  uint64_t platform;
+  uint64_t version;
+  // Returns true if the core info is not the reserved (0, 0) value.
+  bool isValid() const { return platform || version; }
+  static constexpr size_t size() { return sizeof(platform) + sizeof(version); }
+  bool operator==(const AArch64PauthAbiCoreInfo &other) const {
+    return platform == other.platform && version == other.version;
+  }
+  bool operator!=(const AArch64PauthAbiCoreInfo &other) const {
+    return !(*this == other);
+  }
+};
+
 struct SymbolVersion {
   llvm::StringRef name;
   bool isExternCpp;
@@ -699,7 +716,7 @@ struct Ctx : CommonLinkerContext {
 
   llvm::raw_fd_ostream openAuxiliaryFile(llvm::StringRef, std::error_code &);
 
-  ArrayRef<uint8_t> aarch64PauthAbiCoreInfo;
+  std::optional<AArch64PauthAbiCoreInfo> aarch64PauthAbiCoreInfo;
 };
 
 // The first two elements of versionDefinitions represent VER_NDX_LOCAL and
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 87b19cf543d9f..c9ac71f7236f8 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2857,15 +2857,15 @@ static void readSecurityNotes(Ctx &ctx) {
   StringRef referenceFileName;
   if (ctx.arg.emachine == EM_AARCH64) {
     auto it = llvm::find_if(ctx.objectFiles, [](const ELFFileBase *f) {
-      return !f->aarch64PauthAbiCoreInfo.empty();
+      return f->aarch64PauthAbiCoreInfo.has_value();
     });
     if (it != ctx.objectFiles.end()) {
       ctx.aarch64PauthAbiCoreInfo = (*it)->aarch64PauthAbiCoreInfo;
       referenceFileName = (*it)->getName();
     }
   }
-  bool hasValidPauthAbiCoreInfo = llvm::any_of(
-      ctx.aarch64PauthAbiCoreInfo, [](uint8_t c) { return c != 0; });
+  bool hasValidPauthAbiCoreInfo =
+      ctx.aarch64PauthAbiCoreInfo && ctx.aarch64PauthAbiCoreInfo->isValid();
 
   auto report = [&](ReportPolicy policy) -> ELFSyncStream {
     return {ctx, toDiagLevel(policy)};
@@ -2952,10 +2952,10 @@ static void readSecurityNotes(Ctx &ctx) {
     }
     ctx.arg.andFeatures &= features;
 
-    if (ctx.aarch64PauthAbiCoreInfo.empty())
+    if (!ctx.aarch64PauthAbiCoreInfo)
       continue;
 
-    if (f->aarch64PauthAbiCoreInfo.empty()) {
+    if (!f->aarch64PauthAbiCoreInfo) {
       report(ctx.arg.zPauthReport)
           << f
           << ": -z pauth-report: file does not have AArch64 "
@@ -2965,11 +2965,18 @@ static void readSecurityNotes(Ctx &ctx) {
     }
 
     if (ctx.aarch64PauthAbiCoreInfo != f->aarch64PauthAbiCoreInfo)
-      Err(ctx) << "incompatible values of AArch64 PAuth core info found\n>>> "
-               << referenceFileName << ": 0x"
-               << toHex(ctx.aarch64PauthAbiCoreInfo, /*LowerCase=*/true)
-               << "\n>>> " << f << ": 0x"
-               << toHex(f->aarch64PauthAbiCoreInfo, /*LowerCase=*/true);
+      Err(ctx)
+          << "incompatible values of AArch64 PAuth core info found\n"
+          << "platform:\n"
+          << ">>> " << referenceFileName << ": 0x"
+          << toHex(ctx.aarch64PauthAbiCoreInfo->platform, /*LowerCase=*/true)
+          << "\n>>> " << f << ": 0x"
+          << toHex(f->aarch64PauthAbiCoreInfo->platform, /*LowerCase=*/true)
+          << "\nversion:\n"
+          << ">>> " << referenceFileName << ": 0x"
+          << toHex(ctx.aarch64PauthAbiCoreInfo->version, /*LowerCase=*/true)
+          << "\n>>> " << f << ": 0x"
+          << toHex(f->aarch64PauthAbiCoreInfo->version, /*LowerCase=*/true);
   }
 
   // Force enable Shadow Stack.
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 44e77bf57183f..71e72e7184b9f 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -948,7 +948,7 @@ static void parseGnuPropertyNote(Ctx &ctx, ELFFileBase &f,
     } else if (ctx.arg.emachine == EM_AARCH64 &&
                type == GNU_PROPERTY_AARCH64_FEATURE_PAUTH) {
       ArrayRef<uint8_t> contents = data ? *data : desc;
-      if (!f.aarch64PauthAbiCoreInfo.empty()) {
+      if (f.aarch64PauthAbiCoreInfo) {
         return void(
             err(contents.data())
             << "multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are "
@@ -959,7 +959,9 @@ static void parseGnuPropertyNote(Ctx &ctx, ELFFileBase &f,
                        "is invalid: expected 16 bytes, but got "
                     << size);
       }
-      f.aarch64PauthAbiCoreInfo = desc;
+      f.aarch64PauthAbiCoreInfo = {
+          support::endian::read64<ELFT::Endianness>(&desc[0]),
+          support::endian::read64<ELFT::Endianness>(&desc[8])};
     }
 
     // Padding is present in the note descriptor, if necessary.
diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h
index 808cb5d24079f..ba844ad18f637 100644
--- a/lld/ELF/InputFiles.h
+++ b/lld/ELF/InputFiles.h
@@ -241,7 +241,7 @@ class ELFFileBase : public InputFile {
   StringRef sourceFile;
   uint32_t andFeatures = 0;
   bool hasCommonSyms = false;
-  ArrayRef<uint8_t> aarch64PauthAbiCoreInfo;
+  std::optional<AArch64PauthAbiCoreInfo> aarch64PauthAbiCoreInfo;
 };
 
 // .o file.
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 0a9c7a081eb8b..051e5cd04ef50 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -354,11 +354,11 @@ void GnuPropertySection::writeTo(uint8_t *buf) {
     offset += 16;
   }
 
-  if (!ctx.aarch64PauthAbiCoreInfo.empty()) {
+  if (ctx.aarch64PauthAbiCoreInfo) {
     write32(ctx, buf + offset + 0, GNU_PROPERTY_AARCH64_FEATURE_PAUTH);
-    write32(ctx, buf + offset + 4, ctx.aarch64PauthAbiCoreInfo.size());
-    memcpy(buf + offset + 8, ctx.aarch64PauthAbiCoreInfo.data(),
-           ctx.aarch64PauthAbiCoreInfo.size());
+    write32(ctx, buf + offset + 4, AArch64PauthAbiCoreInfo::size());
+    write64(ctx, buf + offset + 8, ctx.aarch64PauthAbiCoreInfo->platform);
+    write64(ctx, buf + offset + 16, ctx.aarch64PauthAbiCoreInfo->version);
   }
 }
 
@@ -366,8 +366,8 @@ size_t GnuPropertySection::getSize() const {
   uint32_t contentSize = 0;
   if (ctx.arg.andFeatures != 0)
     contentSize += ctx.arg.is64 ? 16 : 12;
-  if (!ctx.aarch64PauthAbiCoreInfo.empty())
-    contentSize += 4 + 4 + ctx.aarch64PauthAbiCoreInfo.size();
+  if (ctx.aarch64PauthAbiCoreInfo)
+    contentSize += 4 + 4 + AArch64PauthAbiCoreInfo::size();
   assert(contentSize != 0);
   return contentSize + 16;
 }
@@ -4967,7 +4967,7 @@ template <class ELFT> void elf::createSyntheticSections(Ctx &ctx) {
   ctx.in.iplt = std::make_unique<IpltSection>(ctx);
   add(*ctx.in.iplt);
 
-  if (ctx.arg.andFeatures || !ctx.aarch64PauthAbiCoreInfo.empty()) {
+  if (ctx.arg.andFeatures || ctx.aarch64PauthAbiCoreInfo) {
     ctx.in.gnuProperty = std::make_unique<GnuPropertySection>(ctx);
     add(*ctx.in.gnuProperty);
   }
diff --git a/lld/test/ELF/aarch64-feature-pauth.s b/lld/test/ELF/aarch64-feature-pauth.s
index bc58f69d32f2b..e8c900b9cb134 100644
--- a/lld/test/ELF/aarch64-feature-pauth.s
+++ b/lld/test/ELF/aarch64-feature-pauth.s
@@ -13,8 +13,12 @@
 # RUN: not ld.lld tag1.o tag1a.o tag2.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR1 %s
 
 # ERR1:      error: incompatible values of AArch64 PAuth core info found
-# ERR1-NEXT: >>> tag1.o: 0x2a000000000000000{{1|2}}00000000000000
-# ERR1-NEXT: >>> tag2.o: 0x2a000000000000000{{1|2}}00000000000000
+# ERR1-NEXT: platform:
+# ERR1-NEXT: >>> tag1.o: 0x2a
+# ERR1-NEXT: >>> tag2.o: 0x2a
+# ERR1-NEXT: version:
+# ERR1-NEXT: >>> tag1.o: 0x01
+# ERR1-NEXT: >>> tag2.o: 0x02
 
 # RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-short.s -o short.o
 # RUN: not ld.lld short.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR2 %s

From 058602372e2bb7460469c5c53cc36f0a4b131f54 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 11:05:57 +0100
Subject: [PATCH 347/851] [X86] X86FixupInstTuning - fold BLENDPS -> MOVSD
 (#144029)

Reduces codesize - make use of free PS<->PD domain transfers (like we do in many other places) and replace a suitable BLENDPS mask with MOVSD if OptSize or the scheduler prefers it
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    |  15 +-
 llvm/test/CodeGen/X86/avx-insertelt.ll        |   2 +-
 .../CodeGen/X86/avx-intrinsics-x86-upgrade.ll |   4 +-
 .../CodeGen/X86/coalesce_commute_movsd.ll     |   4 +-
 llvm/test/CodeGen/X86/combine-and.ll          |   2 +-
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   | 175 +++++++-----------
 llvm/test/CodeGen/X86/commute-blend-sse41.ll  |   2 +-
 llvm/test/CodeGen/X86/horizontal-sum.ll       |   4 +-
 llvm/test/CodeGen/X86/insertelement-zero.ll   |  10 +-
 llvm/test/CodeGen/X86/masked_load.ll          |   2 +-
 llvm/test/CodeGen/X86/sse-insertelt.ll        |  15 +-
 llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll  |  76 +++-----
 .../CodeGen/X86/sse2-intrinsics-fast-isel.ll  |   4 +-
 .../X86/sse2-intrinsics-x86-upgrade.ll        |  16 +-
 llvm/test/CodeGen/X86/sse2.ll                 |   2 +-
 .../CodeGen/X86/sse41-intrinsics-fast-isel.ll |   2 +-
 .../X86/sse41-intrinsics-x86-upgrade.ll       |   4 +-
 llvm/test/CodeGen/X86/vec-strict-128-fp16.ll  |   2 +-
 llvm/test/CodeGen/X86/vec_floor.ll            |   4 +-
 llvm/test/CodeGen/X86/vector-blend.ll         |  24 +--
 .../vector-interleaved-load-i32-stride-4.ll   |  64 +++----
 .../vector-interleaved-load-i32-stride-5.ll   |  40 ++--
 llvm/test/CodeGen/X86/vector-mul.ll           |   2 +-
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll |  62 ++-----
 .../test/CodeGen/X86/vector-shuffle-128-v4.ll |   6 +-
 .../test/CodeGen/X86/vector-shuffle-256-v8.ll |   2 +-
 .../X86/vector-shuffle-combining-ssse3.ll     |   2 +-
 .../X86/vector-shuffle-concatenation.ll       |   4 +-
 llvm/test/CodeGen/X86/vselect-2.ll            |  38 ++--
 llvm/test/CodeGen/X86/vselect.ll              |  60 ++----
 30 files changed, 258 insertions(+), 391 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index be0a8c23ea5c4..ce1e4966553f5 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -222,8 +222,9 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKToIntDomain(NewOpc);
   };
 
-  auto ProcessBLENDToMOV = [&](unsigned MovOpc) -> bool {
-    if (MI.getOperand(NumOperands - 1).getImm() != 1)
+  auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
+                               unsigned MovImm) -> bool {
+    if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
       return false;
     bool Force = MF.getFunction().hasOptSize();
     if (!Force && !NewOpcPreferable(MovOpc))
@@ -235,14 +236,16 @@ bool X86FixupInstTuningPass::processInstruction(
 
   switch (Opc) {
   case X86::BLENDPDrri:
-    return ProcessBLENDToMOV(X86::MOVSDrr);
+    return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1);
   case X86::VBLENDPDrri:
-    return ProcessBLENDToMOV(X86::VMOVSDrr);
+    return ProcessBLENDToMOV(X86::VMOVSDrr, 0x3, 0x1);
 
   case X86::BLENDPSrri:
-    return ProcessBLENDToMOV(X86::MOVSSrr);
+    return ProcessBLENDToMOV(X86::MOVSSrr, 0xF, 0x1) ||
+           ProcessBLENDToMOV(X86::MOVSDrr, 0xF, 0x3);
   case X86::VBLENDPSrri:
-    return ProcessBLENDToMOV(X86::VMOVSSrr);
+    return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
+           ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
 
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
diff --git a/llvm/test/CodeGen/X86/avx-insertelt.ll b/llvm/test/CodeGen/X86/avx-insertelt.ll
index 02e6c9649c9a1..f8feceb0404b5 100644
--- a/llvm/test/CodeGen/X86/avx-insertelt.ll
+++ b/llvm/test/CodeGen/X86/avx-insertelt.ll
@@ -111,7 +111,7 @@ define <4 x double> @insert_f64_firstelt_of_high_subvector(<4 x double> %x, doub
 ; AVX-LABEL: insert_f64_firstelt_of_high_subvector:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
 ; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 966662f5f9f8f..f0203b3b889e4 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -300,8 +300,8 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
 define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_sse41_blendpd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vblendps $3, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03]
-; CHECK-NEXT:    # xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf3,0x10,0xc0]
+; CHECK-NEXT:    # xmm0 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
index 086df87d1d5ff..441c79b3fc31f 100644
--- a/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/llvm/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -19,12 +19,12 @@ define <2 x double> @insert_f64(double %a0, <2 x double> %a1) {
 ;
 ; AVX-LABEL: insert_f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: insert_f64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX512-NEXT:    retq
  %1 = insertelement <2 x double> %a1, double %a0, i32 0
  ret <2 x double> %1
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
index 9ca4ebfec2774..a476b21979cef 100644
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -127,7 +127,7 @@ define <4 x i32> @test7(<4 x i32> %A) {
 ; SSE-LABEL: test7:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test7:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 2f2a05fa6939b..14e3767f65564 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -31,15 +31,10 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
 
 
 define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
@@ -53,15 +48,10 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test3:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test3:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test3:
 ; AVX:       # %bb.0:
@@ -201,15 +191,10 @@ define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
 
 
 define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test9:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test9:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test9:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test9:
 ; AVX:       # %bb.0:
@@ -223,15 +208,10 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test10:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test10:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test10:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test10:
 ; AVX:       # %bb.0:
@@ -563,20 +543,25 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
 ; bitcast to use the mask-or blend combine.
 
 define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
-; SSE2-LABEL: test22:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
+; SSE-LABEL: test22:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
-; SSE4-LABEL: test22:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; AVX1-LABEL: test22:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    retq
 ;
-; AVX-LABEL: test22:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT:    retq
+; AVX2-LABEL: test22:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test22:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT:    retq
   %bc1 = bitcast <2 x double> %a0 to <2 x i64>
   %bc2 = bitcast <2 x double> %a1 to <2 x i64>
   %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -614,20 +599,25 @@ define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
 
 
 define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
-; SSE2-LABEL: test24:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
+; SSE-LABEL: test24:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
-; SSE4-LABEL: test24:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; AVX1-LABEL: test24:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT:    retq
 ;
-; AVX-LABEL: test24:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT:    retq
+; AVX2-LABEL: test24:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test24:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT:    retq
   %bc1 = bitcast <4 x float> %a0 to <2 x i64>
   %bc2 = bitcast <4 x float> %a1 to <2 x i64>
   %and1 = and <2 x i64> %bc1, <i64 0, i64 -1>
@@ -707,15 +697,10 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
 ; Verify that we can fold regardless of which operand is the zeroinitializer
 
 define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2b:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2b:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2b:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2b:
 ; AVX:       # %bb.0:
@@ -728,15 +713,10 @@ define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2c:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2c:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2c:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2c:
 ; AVX:       # %bb.0:
@@ -750,15 +730,10 @@ define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
 
 
 define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2d:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2d:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2d:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2d:
 ; AVX:       # %bb.0:
@@ -773,15 +748,10 @@ define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
 ; Make sure we can have an undef where an index pointing to the zero vector should be
 
 define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2e:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2e:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2e:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2e:
 ; AVX:       # %bb.0:
@@ -794,15 +764,10 @@ define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: test2f:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE4-LABEL: test2f:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE4-NEXT:    retq
+; SSE-LABEL: test2f:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2f:
 ; AVX:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/commute-blend-sse41.ll b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
index 07d6a8ba22bb1..4740bf59a69e7 100644
--- a/llvm/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-sse41.ll
@@ -57,7 +57,7 @@ define void @baz(ptr %arg, ptr %arg1) optsize {
 ; CHECK-NEXT:    movaps (%rdi), %xmm0
 ; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [3,3]
 ; CHECK-NEXT:    andps %xmm0, %xmm1
-; CHECK-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; CHECK-NEXT:    movups %xmm1, (%rsi)
 ; CHECK-NEXT:    retq
 bb:
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 443275e11459d..0afc4f784bc5e 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -577,7 +577,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
 ; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
 ; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-SLOW-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-SLOW-NEXT:    vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
 ; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
 ; AVX-SLOW-NEXT:    vaddps %xmm3, %xmm4, %xmm4
 ; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
@@ -596,7 +596,7 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
 ; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
 ; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
-; AVX-FAST-NEXT:    vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX-FAST-NEXT:    vmovsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
 ; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm4
 ; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 ; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll
index 6036eddb0ca84..b66ad07c466e1 100644
--- a/llvm/test/CodeGen/X86/insertelement-zero.ll
+++ b/llvm/test/CodeGen/X86/insertelement-zero.ll
@@ -30,13 +30,13 @@ define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
 ; SSE41-LABEL: insert_v2f64_z1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_v2f64_z1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %1 = insertelement <2 x double> %a, double 0.0, i32 0
   ret <2 x double> %1
@@ -68,7 +68,7 @@ define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_v4f64_0zz3:
@@ -103,7 +103,7 @@ define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
 ; SSE41-LABEL: insert_v2i64_z1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_v2i64_z1:
@@ -137,7 +137,7 @@ define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
 ; SSE41-LABEL: insert_v4i64_01z3:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_v4i64_01z3:
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 37ab4276fbcca..8c4bab99a5b7b 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6512,7 +6512,7 @@ define <8 x float> @mload_constmask_v8f32(ptr %addr, <8 x float> %dst) {
 ; SSE42-LABEL: mload_constmask_v8f32:
 ; SSE42:       ## %bb.0:
 ; SSE42-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE42-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; SSE42-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
 ; SSE42-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE42-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index f174eaaca38c2..72e002ed6b7db 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -21,19 +21,14 @@ define <4 x float> @insert_f32_firstelt(<4 x float> %x, float %s) {
 }
 
 define <2 x double> @insert_f64_firstelt(<2 x double> %x, double %s) {
-; SSE2-LABEL: insert_f64_firstelt:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: insert_f64_firstelt:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: insert_f64_firstelt:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: insert_f64_firstelt:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %i0 = insertelement <2 x double> %x, double %s, i32 0
   ret <2 x double> %i0
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 12bfb8d4fc9cf..325f735b09cd9 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,X86-SSE41
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512
 
@@ -1333,29 +1333,17 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
 }
 
 define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
-; X86-SSE2-LABEL: add_sd_mask:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE2-NEXT:    jne .LBB71_1
-; X86-SSE2-NEXT:  # %bb.2:
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X86-SSE2-NEXT:    retl
-; X86-SSE2-NEXT:  .LBB71_1:
-; X86-SSE2-NEXT:    addsd %xmm0, %xmm1
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: add_sd_mask:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
-; X86-SSE41-NEXT:    jne .LBB71_1
-; X86-SSE41-NEXT:  # %bb.2:
-; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; X86-SSE41-NEXT:    retl
-; X86-SSE41-NEXT:  .LBB71_1:
-; X86-SSE41-NEXT:    addsd %xmm0, %xmm1
-; X86-SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X86-SSE41-NEXT:    retl
+; X86-SSE-LABEL: add_sd_mask:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    jne .LBB71_1
+; X86-SSE-NEXT:  # %bb.2:
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X86-SSE-NEXT:    retl
+; X86-SSE-NEXT:  .LBB71_1:
+; X86-SSE-NEXT:    addsd %xmm0, %xmm1
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX1-LABEL: add_sd_mask:
 ; X86-AVX1:       # %bb.0:
@@ -1375,29 +1363,17 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
 ; X86-AVX512-NEXT:    vmovapd %xmm2, %xmm0
 ; X86-AVX512-NEXT:    retl
 ;
-; X64-SSE2-LABEL: add_sd_mask:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    testb $1, %dil
-; X64-SSE2-NEXT:    jne .LBB71_1
-; X64-SSE2-NEXT:  # %bb.2:
-; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; X64-SSE2-NEXT:    retq
-; X64-SSE2-NEXT:  .LBB71_1:
-; X64-SSE2-NEXT:    addsd %xmm0, %xmm1
-; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: add_sd_mask:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    testb $1, %dil
-; X64-SSE41-NEXT:    jne .LBB71_1
-; X64-SSE41-NEXT:  # %bb.2:
-; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; X64-SSE41-NEXT:    retq
-; X64-SSE41-NEXT:  .LBB71_1:
-; X64-SSE41-NEXT:    addsd %xmm0, %xmm1
-; X64-SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; X64-SSE41-NEXT:    retq
+; X64-SSE-LABEL: add_sd_mask:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    testb $1, %dil
+; X64-SSE-NEXT:    jne .LBB71_1
+; X64-SSE-NEXT:  # %bb.2:
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-SSE-NEXT:    retq
+; X64-SSE-NEXT:  .LBB71_1:
+; X64-SSE-NEXT:    addsd %xmm0, %xmm1
+; X64-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX1-LABEL: add_sd_mask:
 ; X64-AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 18a6be8aaf0b6..3f48b22e2b9ff 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3010,8 +3010,8 @@ define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwin
 ;
 ; AVX-LABEL: test_mm_move_sd:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; AVX-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x10,0xc1]
+; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %ext0 = extractelement <2 x double> %a1, i32 0
   %res0 = insertelement <2 x double> undef, double %ext0, i32 0
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 6dd75c8c09ce5..413b4e79257a0 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -724,8 +724,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X86-AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0xc9]
-; X86-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX1-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -734,8 +734,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X86-AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X86-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX512-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X86-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -752,8 +752,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X64-AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-AVX1-NEXT:    ## encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX1-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0xc9]
-; X64-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X64-AVX1-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X64-AVX1-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX1-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: test_x86_sse2_cvtss2sd_load:
@@ -761,8 +761,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X64-AVX512-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-AVX512-NEXT:    ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX512-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
-; X64-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X64-AVX512-NEXT:    ## xmm0 = xmm1[0,1],xmm0[2,3]
+; X64-AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1]
+; X64-AVX512-NEXT:    ## xmm0 = xmm1[0],xmm0[1]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %a1 = load <4 x float>, ptr %p1
   %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll
index e1d91b407fc28..6e77d3e4fd134 100644
--- a/llvm/test/CodeGen/X86/sse2.ll
+++ b/llvm/test/CodeGen/X86/sse2.ll
@@ -417,7 +417,7 @@ define void @test12() nounwind {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovaps 0, %xmm0
 ; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT:    vmovsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
 ; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index 47630501864a5..c6f0ec493a36c 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -33,7 +33,7 @@ define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
 ;
 ; AVX-LABEL: test_mm_blend_pd:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    ret{{[l|q]}}
   %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index bdf8033a00b0a..137606b7cfeed 100644
--- a/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -18,8 +18,8 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1)
 ;
 ; AVX-LABEL: test_x86_sse41_blendpd:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    vblendps $3, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x03]
-; AVX-NEXT:    ## xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf3,0x10,0xc0]
+; AVX-NEXT:    ## xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
diff --git a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
index a6e288608c87b..35688e59fc9f4 100644
--- a/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128-fp16.ll
@@ -93,7 +93,7 @@ define <2 x double> @f12(<2 x double> %a0, <8 x half> %a1) #0 {
 ; CHECK-LABEL: f12:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vcvtsh2sd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; CHECK-NEXT:    ret{{[l|q]}}
   %ext = extractelement <8 x half> %a1, i32 0
   %cvt = call double @llvm.experimental.constrained.fpext.f64.f16(half %ext,
diff --git a/llvm/test/CodeGen/X86/vec_floor.ll b/llvm/test/CodeGen/X86/vec_floor.ll
index 1007969b6c6d1..7f4ed3394d10d 100644
--- a/llvm/test/CodeGen/X86/vec_floor.ll
+++ b/llvm/test/CodeGen/X86/vec_floor.ll
@@ -1653,7 +1653,7 @@ define <2 x double> @floor_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16
 ; AVX-NEXT:    jne LBB59_1
 ; AVX-NEXT:  ## %bb.2:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB59_1:
 ; AVX-NEXT:    vroundsd $9, %xmm0, %xmm0, %xmm0
@@ -2643,7 +2643,7 @@ define <2 x double> @ceil_maskz_sd_trunc(<2 x double> %x, <2 x double> %y, i16 %
 ; AVX-NEXT:    jne LBB85_1
 ; AVX-NEXT:  ## %bb.2:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  LBB85_1:
 ; AVX-NEXT:    vroundsd $10, %xmm0, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll
index a38028e87532f..2d2fc6b6ee0d7 100644
--- a/llvm/test/CodeGen/X86/vector-blend.ll
+++ b/llvm/test/CodeGen/X86/vector-blend.ll
@@ -172,7 +172,7 @@ define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
 ;
 ; AVX-LABEL: vsel_double:
 ; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
 entry:
   %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
@@ -732,23 +732,11 @@ entry:
 }
 
 define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
-; SSE2-LABEL: blend_shufflevector_4xi64:
-; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movaps %xmm3, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: blend_shufflevector_4xi64:
-; SSSE3:       # %bb.0: # %entry
-; SSSE3-NEXT:    movaps %xmm3, %xmm1
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: blend_shufflevector_4xi64:
-; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    movaps %xmm3, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: blend_shufflevector_4xi64:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movaps %xmm3, %xmm1
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: blend_shufflevector_4xi64:
 ; AVX:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 0bf1260738439..822d31eb45139 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -250,10 +250,10 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm5 = xmm5[0],xmm6[1]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm6 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm2[2],xmm3[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
@@ -584,14 +584,14 @@ define void @load_i32_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm10[2,0],ymm9[2,3],ymm10[6,4],ymm9[6,7]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm10 = xmm5[1],xmm6[1],zero,zero
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm11 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm10 = xmm10[0],xmm11[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm10 = ymm1[1],ymm2[1],ymm1[3],ymm2[3]
 ; AVX-NEXT:    vunpckhps {{.*#+}} ymm11 = ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[6],ymm0[6],ymm4[7],ymm0[7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm11 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm12 = zero,zero,xmm7[2],xmm8[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm11 = xmm11[0],xmm12[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
 ; AVX-NEXT:    vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm4[3,0],ymm0[7,4],ymm4[7,4]
@@ -1080,7 +1080,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm2[0],xmm10[1],xmm2[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm3[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovaps %ymm12, %ymm0
@@ -1094,7 +1094,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -1105,7 +1105,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = zero,zero,xmm10[2],xmm3[2]
 ; AVX-NEXT:    vmovaps %xmm10, %xmm14
-; AVX-NEXT:    vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm6 = xmm6[0],xmm7[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
@@ -1115,7 +1115,7 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm7 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm9[2],xmm1[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
@@ -2120,7 +2120,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = xmm9[0],mem[0],xmm9[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5]
@@ -2131,7 +2131,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -2147,7 +2147,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = xmm5[0],mem[0],xmm5[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -2164,7 +2164,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -2176,7 +2176,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = zero,zero,xmm9[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm8 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
@@ -2187,7 +2187,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm14 = zero,zero,xmm5[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -2203,7 +2203,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,zero,xmm2[2],xmm4[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
@@ -2215,7 +2215,7 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm14 = zero,zero,xmm7[2],xmm13[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm14[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX-NEXT:    vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -4239,7 +4239,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm13[0],xmm3[1],xmm13[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -4253,7 +4253,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{.*#+}} xmm15 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4269,7 +4269,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4285,7 +4285,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4301,7 +4301,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4317,7 +4317,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4333,7 +4333,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4349,7 +4349,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = xmm15[0],mem[0],xmm15[1],mem[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm10[1],ymm2[3],ymm10[3]
@@ -4358,7 +4358,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = zero,zero,xmm3[2],xmm7[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
@@ -4368,7 +4368,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX-NEXT:    vunpckhps {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = zero,zero,xmm6[2],xmm11[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -4384,7 +4384,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm15 = zero,zero,xmm2[2],xmm3[2]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
@@ -4400,7 +4400,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4416,7 +4416,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4432,7 +4432,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4448,7 +4448,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4464,7 +4464,7 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; AVX-NEXT:    vinsertps $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
 ; AVX-NEXT:    # xmm15 = zero,zero,xmm15[2],mem[0]
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm15[1]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; AVX-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index c08442f9d9d01..4f80140bc6c1b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -359,7 +359,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
 ; AVX-NEXT:    vmovaps (%rdi), %xmm3
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm4
-; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm5 = xmm3[0],xmm4[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3]
 ; AVX-NEXT:    vmovaps 64(%rdi), %xmm6
@@ -369,7 +369,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3]
 ; AVX-NEXT:    vshufpd {{.*#+}} xmm7 = xmm7[1,0]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1]
-; AVX-NEXT:    vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[3,0],mem[1,3]
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[2]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
@@ -787,7 +787,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vshufps {{.*#+}} ymm7 = ymm9[2,0],ymm7[3,0],ymm9[6,4],ymm7[7,4]
 ; AVX-NEXT:    vmovaps (%rdi), %xmm9
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm10
-; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm10[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm11 = xmm9[0],xmm10[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm11 = xmm11[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7]
@@ -806,7 +806,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT:    vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6]
-; AVX-NEXT:    vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm11[1,0],ymm0[6,4],ymm11[5,4]
@@ -1552,7 +1552,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4]
 ; AVX-NEXT:    vmovaps (%rdi), %xmm15
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm10
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm15[0,1],xmm10[2,3]
+; AVX-NEXT:    vmovsd %xmm15, %xmm10, %xmm4
 ; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -1565,7 +1565,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4]
 ; AVX-NEXT:    vmovaps 160(%rdi), %xmm9
 ; AVX-NEXT:    vmovaps 192(%rdi), %xmm8
-; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3]
+; AVX-NEXT:    vmovsd %xmm9, %xmm8, %xmm4
 ; AVX-NEXT:    vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -1597,7 +1597,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm10 = xmm10[0],xmm15[1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm5[2,2],ymm13[6,4],ymm5[6,6]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3]
@@ -1605,7 +1605,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT:    vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm8 = xmm8[0],xmm9[1]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4]
 ; AVX-NEXT:    vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3]
@@ -3086,7 +3086,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 192(%rdi), %xmm0
 ; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7]
@@ -3102,7 +3102,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 512(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3118,7 +3118,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3134,7 +3134,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6148,7 +6148,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 192(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6165,7 +6165,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 512(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6182,7 +6182,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 832(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6198,7 +6198,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 1152(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6214,7 +6214,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 32(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6230,7 +6230,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 352(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6247,7 +6247,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 672(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6265,7 +6265,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT:    vmovaps %xmm2, (%rsp) # 16-byte Spill
 ; AVX-NEXT:    vmovaps 992(%rdi), %xmm1
 ; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
 ; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
 ; AVX-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
 ; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index 98b5bab98c4f9..13b21a747878b 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -1579,7 +1579,7 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
 ; SSE4-LABEL: mul_v2i64_0_1:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    xorps %xmm1, %xmm1
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE4-NEXT:    ret{{[l|q]}}
 ;
 ; X64-AVX-LABEL: mul_v2i64_0_1:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 2d3dc4c593c11..baaae507ae15c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -242,35 +242,20 @@ define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
 ;
 ; AVX-LABEL: shuffle_v2f64_03:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: shuffle_v2f64_21:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_v2f64_21:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v2f64_21:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v2f64_21:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v2f64_21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_21:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %shuffle
@@ -523,25 +508,10 @@ define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: shuffle_v2i64_21:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE3-LABEL: shuffle_v2i64_21:
-; SSE3:       # %bb.0:
-; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE3-NEXT:    retq
-;
-; SSSE3-LABEL: shuffle_v2i64_21:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v2i64_21:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: shuffle_v2i64_21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_21:
 ; AVX:       # %bb.0:
@@ -572,7 +542,7 @@ define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
 ; SSE41-LABEL: shuffle_v2i64_21_copy:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_21_copy:
@@ -740,7 +710,7 @@ define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
 ; SSE41-LABEL: shuffle_v2i64_z1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2i64_z1:
@@ -821,13 +791,13 @@ define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
 ; SSE41-LABEL: shuffle_v2f64_z1:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v2f64_z1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %shuffle
@@ -1102,7 +1072,7 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
 ;
 ; AVX-LABEL: insert_reg_lo_v2f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %v = insertelement <2 x double> poison, double %a, i32 0
   %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3>
@@ -1334,7 +1304,7 @@ define <2 x double> @shuffle_mem_v2f64_21(<2 x double> %a, ptr %pb) {
 ; SSE41-LABEL: shuffle_mem_v2f64_21:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movups (%rdi), %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_mem_v2f64_21:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index e1eb1a6704e39..9ec24c447c2cc 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2222,7 +2222,7 @@ define <4 x i32> @insert_mem_lo_v4i32(ptr %ptr, <4 x i32> %b) {
 ; SSE41-LABEL: insert_mem_lo_v4i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: insert_mem_lo_v4i32:
@@ -2295,7 +2295,7 @@ define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: insert_reg_lo_v4f32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %a.cast = bitcast double %a to <2 x float>
   %v = shufflevector <2 x float> %a.cast, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -2489,7 +2489,7 @@ define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, ptr %pb) {
 ; SSE41-LABEL: shuffle_mem_v4f32_4523:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movups (%rdi), %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_mem_v4f32_4523:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 950683cbfaeea..bce50db4d952e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -468,7 +468,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 002a3b77dc353..bd2710139d584 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -53,7 +53,7 @@ define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1)
 ;
 ; AVX-LABEL: combine_pshufb_as_movsd:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
   %2 = bitcast <2 x double> %1 to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
index 2812bf3489101..925f8d5104510 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
@@ -173,7 +173,7 @@ define void @concat_a_to_shuf_of_ab(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movaps (%rdi), %xmm0
 ; SSE42-NEXT:    movaps (%rsi), %xmm1
-; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; SSE42-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE42-NEXT:    movaps %xmm0, 16(%rdx)
 ; SSE42-NEXT:    movaps %xmm1, (%rdx)
 ; SSE42-NEXT:    retq
@@ -288,7 +288,7 @@ define void @concat_shuf_of_ab_to_a(ptr %a.ptr, ptr %b.ptr, ptr %dst) {
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movaps (%rdi), %xmm0
 ; SSE42-NEXT:    movaps (%rsi), %xmm1
-; SSE42-NEXT:    blendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; SSE42-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE42-NEXT:    movaps %xmm1, 16(%rdx)
 ; SSE42-NEXT:    movaps %xmm0, (%rdx)
 ; SSE42-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vselect-2.ll b/llvm/test/CodeGen/X86/vselect-2.ll
index c02cbcf55408d..429ae88fe6d6f 100644
--- a/llvm/test/CodeGen/X86/vselect-2.ll
+++ b/llvm/test/CodeGen/X86/vselect-2.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
 
 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
 ; SSE2-LABEL: test1:
@@ -24,15 +24,10 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
 }
 
 define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
-; SSE2-LABEL: test2:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test2:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test2:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
@@ -55,26 +50,21 @@ define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
 ;
 ; AVX-LABEL: test3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
   ret <4 x float> %select
 }
 
 define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
-; SSE2-LABEL: test4:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test4:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test4:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test4:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
   ret <4 x float> %select
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index 18a060ad910b7..f70145d6b21c2 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -69,26 +69,21 @@ define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
 ;
 ; AVX-LABEL: test2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
 }
 
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: test3:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test3:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
   ret <4 x float> %1
@@ -152,15 +147,10 @@ define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: test8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test8:
 ; AVX:       # %bb.0:
@@ -329,34 +319,24 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
 }
 
 define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
-; SSE2-LABEL: test20:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test20:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test20:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test20:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; AVX-NEXT:    retq
   %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1
 }
 
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
-; SSE2-LABEL: test21:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test21:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: test21:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test21:
 ; AVX:       # %bb.0:
@@ -419,7 +399,7 @@ define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
 ;
 ; AVX-LABEL: test24:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; AVX-NEXT:    retq
   %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
   ret <2 x double> %1

From e2c27fd66a13c7a37cccbf4309532fcbce86c09b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 11:11:01 +0100
Subject: [PATCH 348/851] [X86] X86FixupInstTuning - hoist OptSize flag. NFC.

Allow reuse in a future patch.
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index ce1e4966553f5..8c1ff523c975a 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -81,6 +81,7 @@ bool X86FixupInstTuningPass::processInstruction(
   MachineInstr &MI = *I;
   unsigned Opc = MI.getOpcode();
   unsigned NumOperands = MI.getDesc().getNumOperands();
+  bool OptSize = MF.getFunction().hasOptSize();
 
   auto GetInstTput = [&](unsigned Opcode) -> std::optional<double> {
     // We already checked that SchedModel exists in `NewOpcPreferable`.
@@ -226,8 +227,7 @@ bool X86FixupInstTuningPass::processInstruction(
                                unsigned MovImm) -> bool {
     if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
       return false;
-    bool Force = MF.getFunction().hasOptSize();
-    if (!Force && !NewOpcPreferable(MovOpc))
+    if (!OptSize && !NewOpcPreferable(MovOpc))
       return false;
     MI.setDesc(TII->get(MovOpc));
     MI.removeOperand(NumOperands - 1);

From 6fc8ec720ea590bbdb94e19acefaf5bafdfcf817 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 13 Jun 2025 12:29:50 +0200
Subject: [PATCH 349/851] [InstCombine] Restore splat gep support in
 OptimizePointerDifference() (#143906)

When looking for the common base pointer, support the case where the
type changes because the GEP goes from pointer to vector of pointers.
This was supported prior to #142958.
---
 .../InstCombine/InstCombineAddSub.cpp         |  7 +++--
 llvm/test/Transforms/InstCombine/sub-gep.ll   | 30 +++++++++++++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 86d318967403d..0d91e7d77e4a7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2088,8 +2088,6 @@ CommonPointerBase CommonPointerBase::compute(Value *LHS, Value *RHS) {
   // Find common base and collect RHS GEPs.
   while (true) {
     if (Ptrs.contains(RHS)) {
-      if (LHS->getType() != RHS->getType())
-        return Base;
       Base.Ptr = RHS;
       break;
     }
@@ -2132,12 +2130,15 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
   // TODO: We should probably do this even if there is only one GEP.
   bool RewriteGEPs = !Base.LHSGEPs.empty() && !Base.RHSGEPs.empty();
 
-  Type *IdxTy = DL.getIndexType(Base.Ptr->getType());
+  Type *IdxTy = DL.getIndexType(LHS->getType());
   auto EmitOffsetFromBase = [&](ArrayRef<GEPOperator *> GEPs,
                                 GEPNoWrapFlags NW) -> Value * {
     Value *Sum = nullptr;
     for (GEPOperator *GEP : reverse(GEPs)) {
       Value *Offset = EmitGEPOffset(GEP, RewriteGEPs);
+      if (Offset->getType() != IdxTy)
+        Offset = Builder.CreateVectorSplat(
+            cast<VectorType>(IdxTy)->getElementCount(), Offset);
       if (Sum)
         Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
                                 NW.isInBounds());
diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll
index 9444fef1887d3..375be8a3d69c3 100644
--- a/llvm/test/Transforms/InstCombine/sub-gep.ll
+++ b/llvm/test/Transforms/InstCombine/sub-gep.ll
@@ -995,3 +995,33 @@ define i64 @multiple_geps_inbounds_nuw(ptr %base, i64 %idx, i64 %idx2) {
   %d = sub i64 %i2, %i1
   ret i64 %d
 }
+
+define <2 x i64> @splat_geps(ptr %base, <2 x i64> %idx1, <2 x i64> %idx2) {
+; CHECK-LABEL: @splat_geps(
+; CHECK-NEXT:    [[D:%.*]] = sub nsw <2 x i64> [[IDX2:%.*]], [[IDX1:%.*]]
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %gep1 = getelementptr inbounds i8, ptr %base, <2 x i64> %idx1
+  %gep2 = getelementptr inbounds i8, ptr %base, <2 x i64> %idx2
+  %gep1.int = ptrtoint <2 x ptr> %gep1 to <2 x i64>
+  %gep2.int = ptrtoint <2 x ptr> %gep2 to <2 x i64>
+  %d = sub <2 x i64> %gep2.int, %gep1.int
+  ret <2 x i64> %d
+}
+
+define <2 x i64> @splat_geps_multiple(ptr %base, i64 %idx0, <2 x i64> %idx1, <2 x i64> %idx2) {
+; CHECK-LABEL: @splat_geps_multiple(
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[IDX0:%.*]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i64> [[DOTSPLAT]], [[IDX1:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = sub nsw <2 x i64> [[IDX2:%.*]], [[TMP1]]
+; CHECK-NEXT:    ret <2 x i64> [[D]]
+;
+  %gep0 = getelementptr inbounds i8, ptr %base, i64 %idx0
+  %gep1 = getelementptr inbounds i8, ptr %gep0, <2 x i64> %idx1
+  %gep2 = getelementptr inbounds i8, ptr %base, <2 x i64> %idx2
+  %gep1.int = ptrtoint <2 x ptr> %gep1 to <2 x i64>
+  %gep2.int = ptrtoint <2 x ptr> %gep2 to <2 x i64>
+  %d = sub <2 x i64> %gep2.int, %gep1.int
+  ret <2 x i64> %d
+}

From 2019553a0b8811a23d7546cbace52a8e241a3b37 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 13 Jun 2025 12:34:18 +0200
Subject: [PATCH 350/851] [InstCombine] Extract EmitGEPOffsets() helper (NFC)

Extract a reusable helper for emitting a sum of multiple GEP
offsets.
---
 .../InstCombine/InstCombineAddSub.cpp         | 23 ++-----------------
 .../InstCombine/InstCombineInternal.h         |  4 ++++
 .../InstCombine/InstructionCombining.cpp      | 20 ++++++++++++++++
 3 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0d91e7d77e4a7..c1ce364eb1794 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2131,27 +2131,8 @@ Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
   bool RewriteGEPs = !Base.LHSGEPs.empty() && !Base.RHSGEPs.empty();
 
   Type *IdxTy = DL.getIndexType(LHS->getType());
-  auto EmitOffsetFromBase = [&](ArrayRef<GEPOperator *> GEPs,
-                                GEPNoWrapFlags NW) -> Value * {
-    Value *Sum = nullptr;
-    for (GEPOperator *GEP : reverse(GEPs)) {
-      Value *Offset = EmitGEPOffset(GEP, RewriteGEPs);
-      if (Offset->getType() != IdxTy)
-        Offset = Builder.CreateVectorSplat(
-            cast<VectorType>(IdxTy)->getElementCount(), Offset);
-      if (Sum)
-        Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
-                                NW.isInBounds());
-      else
-        Sum = Offset;
-    }
-    if (!Sum)
-      return Constant::getNullValue(IdxTy);
-    return Sum;
-  };
-
-  Value *Result = EmitOffsetFromBase(Base.LHSGEPs, Base.LHSNW);
-  Value *Offset2 = EmitOffsetFromBase(Base.RHSGEPs, Base.RHSNW);
+  Value *Result = EmitGEPOffsets(Base.LHSGEPs, Base.LHSNW, IdxTy, RewriteGEPs);
+  Value *Offset2 = EmitGEPOffsets(Base.RHSGEPs, Base.RHSNW, IdxTy, RewriteGEPs);
 
   // If this is a single inbounds GEP and the original sub was nuw,
   // then the final multiplication is also nuw.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index bf7689bbfde70..ce0e843437b53 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -378,6 +378,10 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   }
 
   Value *EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP = false);
+  /// Emit sum of multiple GEP offsets. The GEPs are processed in reverse
+  /// order.
+  Value *EmitGEPOffsets(ArrayRef<GEPOperator *> GEPs, GEPNoWrapFlags NW,
+                        Type *IdxTy, bool RewriteGEPs);
   Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
   Instruction *foldBitcastExtElt(ExtractElementInst &ExtElt);
   Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index dc2a8cb0115e7..29582939fa06a 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -217,6 +217,26 @@ Value *InstCombinerImpl::EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP) {
   return Offset;
 }
 
+Value *InstCombinerImpl::EmitGEPOffsets(ArrayRef<GEPOperator *> GEPs,
+                                        GEPNoWrapFlags NW, Type *IdxTy,
+                                        bool RewriteGEPs) {
+  Value *Sum = nullptr;
+  for (GEPOperator *GEP : reverse(GEPs)) {
+    Value *Offset = EmitGEPOffset(GEP, RewriteGEPs);
+    if (Offset->getType() != IdxTy)
+      Offset = Builder.CreateVectorSplat(
+          cast<VectorType>(IdxTy)->getElementCount(), Offset);
+    if (Sum)
+      Sum = Builder.CreateAdd(Sum, Offset, "", NW.hasNoUnsignedWrap(),
+                              NW.isInBounds());
+    else
+      Sum = Offset;
+  }
+  if (!Sum)
+    return Constant::getNullValue(IdxTy);
+  return Sum;
+}
+
 /// Legal integers and common types are considered desirable. This is used to
 /// avoid creating instructions with types that may not be supported well by the
 /// the backend.

From 541e5118ce570c9bed74cb5ff836f88cf1c0e644 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Fri, 13 Jun 2025 11:43:50 +0100
Subject: [PATCH 351/851] [LV] Use getFixedValue instead of getKnownMinValue
 when appropriate (#143526)

There are many places in VPlan and LoopVectorize where we use
getKnownMinValue to discover the number of elements in a vector. Where
we expect the vector to have a fixed length, I have used the stronger
getFixedValue call. I believe this is clearer and adds extra protection
in the form of an assert in getFixedValue that the vector is not
scalable.

While looking at VPFirstOrderRecurrencePHIRecipe::computeCost I also
took the liberty of simplifying the code.

In theory I believe this patch should be NFC, but I'm reluctant to add
that to the title in case we're just missing tests for some of the VPlan
changes. I built and ran the LLVM test suite when targeting neoverse-v1
and it seemed ok.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 33 +++++++++++--------
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  7 ++--
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 19 +++++------
 3 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 404ee6874d2a5..fa313243a57da 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3116,12 +3116,13 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
     // that we will create. This cost is likely to be zero. The phi node
     // cost, if any, should be scaled by the block probability because it
     // models a copy at the end of each predicated block.
-    ScalarizationCost += VF.getKnownMinValue() *
-      TTI.getCFInstrCost(Instruction::PHI, CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
 
     // The cost of the non-predicated instruction.
-    ScalarizationCost += VF.getKnownMinValue() *
-      TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+    ScalarizationCost +=
+        VF.getFixedValue() *
+        TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
 
     // The cost of insertelement and extractelement instructions needed for
     // scalarization.
@@ -4289,7 +4290,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
           return NumLegalParts <= VF.getKnownMinValue();
         }
         // Two or more elements that share a register - are vectorized.
-        return NumLegalParts < VF.getKnownMinValue();
+        return NumLegalParts < VF.getFixedValue();
       };
 
       // If no def nor is a store, e.g., branches, continue - no value to check.
@@ -4574,8 +4575,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
         assert(!isa<SCEVCouldNotCompute>(TC) &&
                "Trip count SCEV must be computable");
         RemainingIterations = SE.getURemExpr(
-            TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
-        MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
+            TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+        MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
         if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
                                 SE.getConstant(TCType, MaxTripCount))) {
           MaxTripCount =
@@ -4586,7 +4587,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
       }
       if (SE.isKnownPredicate(
               CmpInst::ICMP_UGT,
-              SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+              SE.getConstant(TCType, NextVF.Width.getFixedValue()),
               RemainingIterations))
         continue;
     }
@@ -5257,14 +5258,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // Get the cost of the scalar memory instruction and address computation.
   InstructionCost Cost =
-      VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+      VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
 
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   const Align Alignment = getLoadStoreAlignment(I);
-  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
-                                                      ValTy->getScalarType(),
-                                                      Alignment, AS, CostKind);
+  Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
+                                                   ValTy->getScalarType(),
+                                                   Alignment, AS, CostKind);
 
   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
@@ -5280,7 +5281,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
     auto *VecI1Ty =
         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
     Cost += TTI.getScalarizationOverhead(
-        VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
+        VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
         /*Insert=*/false, /*Extract=*/true, CostKind);
     Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
 
@@ -5341,6 +5342,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
   StoreInst *SI = cast<StoreInst>(I);
 
   bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
+  // TODO: We have existing tests that request the cost of extracting element
+  // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
+  // the actual generated code, which involves extracting the last element of
+  // a scalable vector where the lane to extract is unknown at compile time.
   return TTI.getAddressComputationCost(ValTy) +
          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
                              CostKind) +
@@ -5623,7 +5628,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
 
     for (Type *VectorTy : getContainedTypes(RetTy)) {
       Cost += TTI.getScalarizationOverhead(
-          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+          cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
           /*Insert=*/true,
           /*Extract=*/false, CostKind);
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index b74ef91f26e70..10906d9a30df8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -331,7 +331,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
 
   bool IsSingleScalar = vputils::isSingleScalar(Def);
 
-  VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1);
+  VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
   // Check if there is a scalar value for the selected lane.
   if (!hasScalarValue(Def, LastLane)) {
     // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
@@ -368,7 +368,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
     Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
     set(Def, Undef);
-    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+    for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
       packScalarIntoVectorizedValue(Def, Lane);
     VectorValue = get(Def);
   }
@@ -789,8 +789,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
   ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
       Entry);
   State->Lane = VPLane(0);
-  for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
-       ++Lane) {
+  for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
     State->Lane = VPLane(Lane, VPLane::Kind::First);
     // Visit the VPBlocks connected to \p this, starting from it.
     for (VPBlockBase *Block : RPOT) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 74472aaeb1675..ccce0e07e4d0a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -871,7 +871,7 @@ void VPInstruction::execute(VPTransformState &State) {
                                     isVectorToScalar() || isSingleScalar());
   bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
   if (GeneratesPerAllLanes) {
-    for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
+    for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
          Lane != NumLanes; ++Lane) {
       Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
       assert(GeneratedValue && "generatePerLane must produce a value");
@@ -2787,8 +2787,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   }
 
   // Generate scalar instances for all VF lanes.
-  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
-  const unsigned EndLane = State.VF.getKnownMinValue();
+  const unsigned EndLane = State.VF.getFixedValue();
   for (unsigned Lane = 0; Lane < EndLane; ++Lane)
     scalarizeInstruction(UI, this, VPLane(Lane), State);
 }
@@ -2841,7 +2840,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                UI->getOpcode(), ResultTy, CostKind,
                {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
                Op2Info, Operands, UI, &Ctx.TLI) *
-           (isSingleScalar() ? 1 : VF.getKnownMinValue());
+           (isSingleScalar() ? 1 : VF.getFixedValue());
   }
   }
 
@@ -3390,7 +3389,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     Value *ResBlockInMask = State.get(BlockInMask);
     Value *ShuffledMask = State.Builder.CreateShuffleVector(
         ResBlockInMask,
-        createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+        createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
         "interleaved.mask");
     return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
                                                    ShuffledMask, MaskForGaps)
@@ -3402,8 +3401,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
   if (isa<LoadInst>(Instr)) {
     Value *MaskForGaps = nullptr;
     if (NeedsMaskForGaps) {
-      MaskForGaps = createBitMaskForGaps(State.Builder,
-                                         State.VF.getKnownMinValue(), *Group);
+      MaskForGaps =
+          createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
       assert(MaskForGaps && "Mask for Gaps is required but it is null");
     }
 
@@ -3454,6 +3453,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
 
       return;
     }
+    assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
 
     // For each member in the group, shuffle out the appropriate data from the
     // wide loads.
@@ -3466,13 +3466,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
         continue;
 
       auto StrideMask =
-          createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+          createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
       Value *StridedVec =
           State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
 
       // If this member has different type, cast the result type.
       if (Member->getType() != ScalarTy) {
-        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
         VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
         StridedVec =
             createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
@@ -3808,7 +3807,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
   if (VF.isScalar())
     return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 
-  if (VF.isScalable() && VF.getKnownMinValue() == 1)
+  if (VF == ElementCount::getScalable(1))
     return InstructionCost::getInvalid();
 
   return 0;

From 9eef4d1c5fa6b1bcbbe675c14ca8301d5d346f7b Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Fri, 13 Jun 2025 06:45:40 -0400
Subject: [PATCH 352/851] Remove delayed typo expressions (#143423)

This removes the delayed typo correction functionality from Clang
(regular typo correction still remains) due to fragility of the
solution.

An RFC was posted here:
https://discourse.llvm.org/t/rfc-removing-support-for-delayed-typo-correction/86631
and while that RFC was asking for folks to consider stepping up to be
maintainers, and we did have a few new contributors show some interest,
experiments show that it's likely worth it to remove this functionality
entirely and focus efforts on improving regular typo correction.

This removal fixes ~20 open issues (quite possibly more), improves
compile time performance by roughly .3-.4%
(https://llvm-compile-time-tracker.com/?config=Overview&stat=instructions%3Au&remote=AaronBallman&sortBy=date),
and does not appear to regress diagnostic behavior in a way we wouldn't
find acceptable.

Fixes #142457
Fixes #139913
Fixes #138850
Fixes #137867
Fixes #137860
Fixes #107840
Fixes #93308
Fixes #69470
Fixes #59391
Fixes #58172
Fixes #46215
Fixes #45915
Fixes #45891
Fixes #44490
Fixes #36703
Fixes #32903
Fixes #23312
Fixes #69874
---
 .../clangd/unittests/HoverTests.cpp           |   2 +-
 clang/docs/ReleaseNotes.rst                   |   8 +
 clang/include/clang/AST/Expr.h                |  33 +-
 clang/include/clang/AST/RecursiveASTVisitor.h |   1 -
 clang/include/clang/Basic/StmtNodes.td        |   1 -
 clang/include/clang/Parse/Parser.h            |   3 +-
 clang/include/clang/Sema/Sema.h               | 126 +-----
 clang/include/clang/Sema/SemaInternal.h       |  14 -
 clang/lib/AST/Expr.cpp                        |   1 -
 clang/lib/AST/ExprClassification.cpp          |   1 -
 clang/lib/AST/ExprConstant.cpp                |   1 -
 clang/lib/AST/ItaniumMangle.cpp               |   1 -
 clang/lib/AST/StmtPrinter.cpp                 |   5 -
 clang/lib/AST/StmtProfile.cpp                 |   4 -
 clang/lib/Parse/ParseCXXInlineMethods.cpp     |   1 -
 clang/lib/Parse/ParseDecl.cpp                 |  33 +-
 clang/lib/Parse/ParseDeclCXX.cpp              |   8 +-
 clang/lib/Parse/ParseExpr.cpp                 |  94 +----
 clang/lib/Parse/ParseExprCXX.cpp              |  13 +-
 clang/lib/Parse/ParseInit.cpp                 |   2 -
 clang/lib/Parse/ParseObjc.cpp                 |  23 +-
 clang/lib/Parse/ParseOpenACC.cpp              |  28 +-
 clang/lib/Parse/ParseOpenMP.cpp               |  27 +-
 clang/lib/Parse/ParseStmt.cpp                 |  16 +-
 clang/lib/Parse/ParseStmtAsm.cpp              |   2 +-
 clang/lib/Parse/ParseTemplate.cpp             |   3 +-
 clang/lib/Sema/Sema.cpp                       |   9 -
 clang/lib/Sema/SemaChecking.cpp               |   2 -
 clang/lib/Sema/SemaCoroutine.cpp              |  12 -
 clang/lib/Sema/SemaDecl.cpp                   |  39 +-
 clang/lib/Sema/SemaDeclCXX.cpp                |  27 +-
 clang/lib/Sema/SemaExceptionSpec.cpp          |   1 -
 clang/lib/Sema/SemaExpr.cpp                   | 156 +-------
 clang/lib/Sema/SemaExprCXX.cpp                | 376 +-----------------
 clang/lib/Sema/SemaExprMember.cpp             | 116 +-----
 clang/lib/Sema/SemaLookup.cpp                 |  60 ---
 clang/lib/Sema/SemaObjC.cpp                   |   7 +-
 clang/lib/Sema/SemaOverload.cpp               |   6 +-
 clang/lib/Sema/SemaStmt.cpp                   |  14 +-
 clang/lib/Sema/SemaStmtAttr.cpp               |   5 +-
 clang/lib/Sema/SemaTemplateVariadic.cpp       |  12 +-
 clang/lib/Sema/TreeTransform.h                |   6 -
 clang/lib/Serialization/ASTReaderStmt.cpp     |   4 -
 clang/lib/Serialization/ASTWriterStmt.cpp     |   6 -
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp  |   1 -
 clang/test/AST/ByteCode/literals.cpp          |   8 +-
 clang/test/AST/ast-dump-recovery.c            |  30 --
 clang/test/AST/ast-dump-recovery.cpp          | 214 ----------
 clang/test/AST/ast-dump-recovery.m            |  32 --
 clang/test/CXX/drs/cwg1xx.cpp                 |   3 +-
 clang/test/CXX/drs/cwg26xx.cpp                |   1 -
 .../test/CXX/module/basic/basic.link/p2.cppm  |   4 +-
 clang/test/FixIt/typo.cpp                     | 137 -------
 clang/test/Index/complete-switch.c            |  10 -
 clang/test/Index/fix-its.c                    |  19 +-
 clang/test/Lexer/raw-string-ext.c             |  10 +-
 clang/test/Modules/diagnose-missing-import.m  |   2 -
 .../OpenMP/begin_declare_variant_messages.c   |   2 +-
 .../OpenMP/declare_reduction_messages.cpp     |   2 +-
 clang/test/OpenMP/declare_variant_messages.c  |   6 +-
 .../test/OpenMP/declare_variant_messages.cpp  |   4 +-
 clang/test/OpenMP/target_update_messages.cpp  |  15 +-
 clang/test/Parser/cxx1z-decomposition.cpp     |   6 +-
 clang/test/Parser/cxx1z-fold-expressions.cpp  |   6 +-
 clang/test/Parser/cxx2c-pack-indexing.cpp     |   3 +-
 clang/test/Parser/objc-foreach-syntax.m       |   3 +-
 clang/test/Parser/opencl-atomics-cl20.cl      |  18 +-
 clang/test/Parser/recovery.c                  |   2 +-
 clang/test/Parser/switch-recovery.cpp         |   2 +-
 clang/test/Parser/switch-typo-correction.cpp  |   4 +-
 .../ParserOpenACC/parse-cache-construct.cpp   |  10 +-
 clang/test/ParserOpenACC/parse-clauses.c      |  24 +-
 clang/test/ParserOpenACC/parse-constructs.cpp |   4 +-
 clang/test/ParserOpenACC/parse-wait-clause.c  |   5 +-
 .../test/ParserOpenACC/parse-wait-construct.c |   9 +-
 clang/test/Sema/PR28181.c                     |   8 +-
 clang/test/Sema/builtin-unary-fp.c            |   1 -
 .../c23-delayed-typo-correction-crashes.c     |  18 +
 .../Sema/delayed-typo-correction-crashes.c    |  18 +
 clang/test/Sema/invalid-member.cpp            |   6 +-
 clang/test/Sema/typo-correction-ambiguity.cpp |   4 +-
 clang/test/Sema/typo-correction-no-hang.c     |   9 +-
 clang/test/Sema/typo-correction-no-hang.cpp   |  12 +-
 clang/test/Sema/typo-correction-recursive.cpp |  28 +-
 clang/test/Sema/typo-correction.c             |  26 +-
 clang/test/SemaCXX/arrow-operator.cpp         |   9 +-
 .../SemaCXX/constant-expression-cxx11.cpp     |   7 +-
 clang/test/SemaCXX/conversion-function.cpp    |   2 +-
 clang/test/SemaCXX/coroutines.cpp             |  20 +-
 .../cxx-delayed-typo-correction-crashes.cpp   |  67 ++++
 clang/test/SemaCXX/cxx1z-decomposition.cpp    |   3 +-
 .../cxx20-delayed-typo-correction-crashes.cpp |  19 +
 .../SemaCXX/cxx2a-adl-only-template-id.cpp    |   2 +-
 clang/test/SemaCXX/destructor.cpp             |  15 +-
 clang/test/SemaCXX/invalid-if-constexpr.cpp   |  10 +-
 clang/test/SemaCXX/member-expr.cpp            |   8 +-
 clang/test/SemaCXX/nested-name-spec.cpp       |   6 +-
 .../test/SemaCXX/pr13394-crash-on-invalid.cpp |  29 --
 clang/test/SemaCXX/return.cpp                 |   2 +-
 clang/test/SemaCXX/typo-correction-crash.cpp  |  19 +-
 clang/test/SemaCXX/typo-correction-cxx11.cpp  |  11 +-
 .../test/SemaCXX/typo-correction-delayed.cpp  | 216 ----------
 clang/test/SemaCXX/typo-correction.cpp        |  38 +-
 clang/test/SemaCXX/virtuals.cpp               |   4 +-
 clang/test/SemaObjC/call-super-2.m            |   2 +-
 .../test/SemaObjC/typo-correction-subscript.m |   3 +-
 .../SemaObjC/undef-arg-super-method-call.m    |   8 +-
 .../SemaObjCXX/block-for-lambda-conversion.mm |   7 +-
 .../compute-construct-num_gangs-clause.cpp    |   6 +-
 clang/test/SemaOpenCL/atomic-ops.cl           |   2 +-
 .../test/SemaOpenCL/clang-builtin-version.cl  |   8 +-
 .../SemaTemplate/concepts-recovery-expr.cpp   |   4 +-
 clang/test/SemaTemplate/concepts.cpp          |   6 +-
 clang/test/SemaTemplate/typo-variadic.cpp     |   2 +-
 clang/tools/libclang/CXCursor.cpp             |   1 -
 .../unittests/Sema/ExternalSemaSourceTest.cpp |  14 -
 116 files changed, 438 insertions(+), 2147 deletions(-)
 delete mode 100644 clang/test/AST/ast-dump-recovery.m
 delete mode 100644 clang/test/FixIt/typo.cpp
 delete mode 100644 clang/test/Index/complete-switch.c
 create mode 100644 clang/test/Sema/c23-delayed-typo-correction-crashes.c
 create mode 100644 clang/test/Sema/delayed-typo-correction-crashes.c
 create mode 100644 clang/test/SemaCXX/cxx-delayed-typo-correction-crashes.cpp
 create mode 100644 clang/test/SemaCXX/cxx20-delayed-typo-correction-crashes.cpp
 delete mode 100644 clang/test/SemaCXX/pr13394-crash-on-invalid.cpp
 delete mode 100644 clang/test/SemaCXX/typo-correction-delayed.cpp

diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp
index 69f6df46c87ce..775278ccf694b 100644
--- a/clang-tools-extra/clangd/unittests/HoverTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp
@@ -974,7 +974,7 @@ class Foo final {})cpp";
          HI.Name = "abc";
          HI.Kind = index::SymbolKind::Variable;
          HI.NamespaceScope = "";
-         HI.Definition = "int abc = <recovery - expr>()";
+         HI.Definition = "int abc";
          HI.Type = "int";
          HI.AccessSpecifier = "public";
        }},
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b42d5f8425af6..9ab8031b9ea8c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -622,6 +622,14 @@ Improvements to Clang's diagnostics
 
 - Improved the FixIts for unused lambda captures.
 
+- Delayed typo correction was removed from the compiler; immediate typo
+  correction behavior remains the same. Delayed typo correction facilities were
+  fragile and unmaintained, and the removal closed the following issues:
+  #GH142457, #GH139913, #GH138850, #GH137867, #GH137860, #GH107840, #GH93308,
+  #GH69470, #GH59391, #GH58172, #GH46215, #GH45915, #GH45891, #GH44490,
+  #GH36703, #GH32903, #GH23312, #GH69874.
+
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index 43c28c8bf649f..9fc23d30b733f 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -240,8 +240,7 @@ class Expr : public ValueStmt {
     return static_cast<bool>(getDependence() & ExprDependence::UnexpandedPack);
   }
 
-  /// Whether this expression contains subexpressions which had errors, e.g. a
-  /// TypoExpr.
+  /// Whether this expression contains subexpressions which had errors.
   bool containsErrors() const {
     return static_cast<bool>(getDependence() & ExprDependence::Error);
   }
@@ -6965,36 +6964,6 @@ class AtomicExpr : public Expr {
   }
 };
 
-/// TypoExpr - Internal placeholder for expressions where typo correction
-/// still needs to be performed and/or an error diagnostic emitted.
-class TypoExpr : public Expr {
-  // The location for the typo name.
-  SourceLocation TypoLoc;
-
-public:
-  TypoExpr(QualType T, SourceLocation TypoLoc)
-      : Expr(TypoExprClass, T, VK_LValue, OK_Ordinary), TypoLoc(TypoLoc) {
-    assert(T->isDependentType() && "TypoExpr given a non-dependent type");
-    setDependence(ExprDependence::TypeValueInstantiation |
-                  ExprDependence::Error);
-  }
-
-  child_range children() {
-    return child_range(child_iterator(), child_iterator());
-  }
-  const_child_range children() const {
-    return const_child_range(const_child_iterator(), const_child_iterator());
-  }
-
-  SourceLocation getBeginLoc() const LLVM_READONLY { return TypoLoc; }
-  SourceLocation getEndLoc() const LLVM_READONLY { return TypoLoc; }
-
-  static bool classof(const Stmt *T) {
-    return T->getStmtClass() == TypoExprClass;
-  }
-
-};
-
 /// This class represents BOTH the OpenMP Array Section and OpenACC 'subarray',
 /// with a boolean differentiator.
 /// OpenMP 5.0 [2.1.5, Array Sections].
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index b0f8ae621cf6d..5cb2f57edffe4 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2956,7 +2956,6 @@ DEF_TRAVERSE_STMT(CXXRewrittenBinaryOperator, {
   }
 })
 DEF_TRAVERSE_STMT(OpaqueValueExpr, {})
-DEF_TRAVERSE_STMT(TypoExpr, {})
 DEF_TRAVERSE_STMT(RecoveryExpr, {})
 DEF_TRAVERSE_STMT(CUDAKernelCallExpr, {})
 
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index 9526fa5808aa5..c9c173f5c7469 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -202,7 +202,6 @@ def ShuffleVectorExpr : StmtNode<Expr>;
 def ConvertVectorExpr : StmtNode<Expr>;
 def BlockExpr : StmtNode<Expr>;
 def OpaqueValueExpr : StmtNode<Expr>;
-def TypoExpr : StmtNode<Expr>;
 def RecoveryExpr : StmtNode<Expr>;
 def BuiltinBitCastExpr : StmtNode<ExplicitCastExpr>;
 def EmbedExpr : StmtNode<Expr>;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index d99de77a52919..3243b94c5e5e6 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -4169,8 +4169,7 @@ class Parser : public CodeCompletionHandler {
   bool ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
                            llvm::function_ref<void()> ExpressionStarts =
                                llvm::function_ref<void()>(),
-                           bool FailImmediatelyOnInvalidExpr = false,
-                           bool EarlyTypoCorrection = false);
+                           bool FailImmediatelyOnInvalidExpr = false);
 
   /// ParseSimpleExpressionList - A simple comma-separated list of expressions,
   /// used for misc language extensions.
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 0dad07e55a820..29452bb37260d 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -6713,10 +6713,6 @@ class Sema final : public SemaBase {
     /// this expression evaluation context.
     unsigned NumCleanupObjects;
 
-    /// The number of typos encountered during this expression evaluation
-    /// context (i.e. the number of TypoExprs created).
-    unsigned NumTypos;
-
     MaybeODRUseExprSet SavedMaybeODRUseExprs;
 
     /// The lambdas that are present within this context, if it
@@ -6813,7 +6809,7 @@ class Sema final : public SemaBase {
                                       Decl *ManglingContextDecl,
                                       ExpressionKind ExprContext)
         : Context(Context), ParentCleanup(ParentCleanup),
-          NumCleanupObjects(NumCleanupObjects), NumTypos(0),
+          NumCleanupObjects(NumCleanupObjects),
           ManglingContextDecl(ManglingContextDecl), ExprContext(ExprContext),
           InDiscardedStatement(false), InImmediateFunctionContext(false),
           InImmediateEscalatingFunctionContext(false) {}
@@ -7146,8 +7142,7 @@ class Sema final : public SemaBase {
                       CorrectionCandidateCallback &CCC,
                       TemplateArgumentListInfo *ExplicitTemplateArgs = nullptr,
                       ArrayRef<Expr *> Args = {},
-                      DeclContext *LookupCtx = nullptr,
-                      TypoExpr **Out = nullptr);
+                      DeclContext *LookupCtx = nullptr);
 
   /// If \p D cannot be odr-used in the current expression evaluation context,
   /// return a reason explaining why. Otherwise, return NOUR_None.
@@ -8748,40 +8743,6 @@ class Sema final : public SemaBase {
 
   ExprResult CheckUnevaluatedOperand(Expr *E);
 
-  /// Process any TypoExprs in the given Expr and its children,
-  /// generating diagnostics as appropriate and returning a new Expr if there
-  /// were typos that were all successfully corrected and ExprError if one or
-  /// more typos could not be corrected.
-  ///
-  /// \param E The Expr to check for TypoExprs.
-  ///
-  /// \param InitDecl A VarDecl to avoid because the Expr being corrected is its
-  /// initializer.
-  ///
-  /// \param RecoverUncorrectedTypos If true, when typo correction fails, it
-  /// will rebuild the given Expr with all TypoExprs degraded to RecoveryExprs.
-  ///
-  /// \param Filter A function applied to a newly rebuilt Expr to determine if
-  /// it is an acceptable/usable result from a single combination of typo
-  /// corrections. As long as the filter returns ExprError, different
-  /// combinations of corrections will be tried until all are exhausted.
-  ExprResult CorrectDelayedTyposInExpr(
-      Expr *E, VarDecl *InitDecl = nullptr,
-      bool RecoverUncorrectedTypos = false,
-      llvm::function_ref<ExprResult(Expr *)> Filter =
-          [](Expr *E) -> ExprResult { return E; });
-
-  ExprResult CorrectDelayedTyposInExpr(
-      ExprResult ER, VarDecl *InitDecl = nullptr,
-      bool RecoverUncorrectedTypos = false,
-      llvm::function_ref<ExprResult(Expr *)> Filter =
-          [](Expr *E) -> ExprResult { return E; }) {
-    return ER.isInvalid()
-               ? ER
-               : CorrectDelayedTyposInExpr(ER.get(), InitDecl,
-                                           RecoverUncorrectedTypos, Filter);
-  }
-
   IfExistsResult
   CheckMicrosoftIfExistsSymbol(Scope *S, CXXScopeSpec &SS,
                                const DeclarationNameInfo &TargetNameInfo);
@@ -9283,12 +9244,6 @@ class Sema final : public SemaBase {
   /// for C++ records.
   llvm::FoldingSet<SpecialMemberOverloadResultEntry> SpecialMemberCache;
 
-  /// Holds TypoExprs that are created from `createDelayedTypo`. This is used by
-  /// `TransformTypos` in order to keep track of any TypoExprs that are created
-  /// recursively during typo correction and wipe them away if the correction
-  /// fails.
-  llvm::SmallVector<TypoExpr *, 2> TypoExprs;
-
   enum class AcceptableKind { Visible, Reachable };
 
   // Members have to be NamespaceDecl* or TranslationUnitDecl*.
@@ -9376,10 +9331,6 @@ class Sema final : public SemaBase {
                       bool VolatileArg, bool RValueThis, bool ConstThis,
                       bool VolatileThis);
 
-  typedef std::function<void(const TypoCorrection &)> TypoDiagnosticGenerator;
-  typedef std::function<ExprResult(Sema &, TypoExpr *, TypoCorrection)>
-      TypoRecoveryCallback;
-
   RedeclarationKind forRedeclarationInCurContext() const;
 
   /// Look up a name, looking for a single declaration.  Return
@@ -9733,51 +9684,6 @@ class Sema final : public SemaBase {
                              const ObjCObjectPointerType *OPT = nullptr,
                              bool RecordFailure = true);
 
-  /// Try to "correct" a typo in the source code by finding
-  /// visible declarations whose names are similar to the name that was
-  /// present in the source code.
-  ///
-  /// \param TypoName the \c DeclarationNameInfo structure that contains
-  /// the name that was present in the source code along with its location.
-  ///
-  /// \param LookupKind the name-lookup criteria used to search for the name.
-  ///
-  /// \param S the scope in which name lookup occurs.
-  ///
-  /// \param SS the nested-name-specifier that precedes the name we're
-  /// looking for, if present.
-  ///
-  /// \param CCC A CorrectionCandidateCallback object that provides further
-  /// validation of typo correction candidates. It also provides flags for
-  /// determining the set of keywords permitted.
-  ///
-  /// \param TDG A TypoDiagnosticGenerator functor that will be used to print
-  /// diagnostics when the actual typo correction is attempted.
-  ///
-  /// \param TRC A TypoRecoveryCallback functor that will be used to build an
-  /// Expr from a typo correction candidate.
-  ///
-  /// \param MemberContext if non-NULL, the context in which to look for
-  /// a member access expression.
-  ///
-  /// \param EnteringContext whether we're entering the context described by
-  /// the nested-name-specifier SS.
-  ///
-  /// \param OPT when non-NULL, the search for visible declarations will
-  /// also walk the protocols in the qualified interfaces of \p OPT.
-  ///
-  /// \returns a new \c TypoExpr that will later be replaced in the AST with an
-  /// Expr representing the result of performing typo correction, or nullptr if
-  /// typo correction is not possible. If nullptr is returned, no diagnostics
-  /// will be emitted and it is the responsibility of the caller to emit any
-  /// that are needed.
-  TypoExpr *CorrectTypoDelayed(
-      const DeclarationNameInfo &Typo, Sema::LookupNameKind LookupKind,
-      Scope *S, CXXScopeSpec *SS, CorrectionCandidateCallback &CCC,
-      TypoDiagnosticGenerator TDG, TypoRecoveryCallback TRC,
-      CorrectTypoKind Mode, DeclContext *MemberContext = nullptr,
-      bool EnteringContext = false, const ObjCObjectPointerType *OPT = nullptr);
-
   /// Kinds of missing import. Note, the values of these enumerators correspond
   /// to %select values in diagnostics.
   enum class MissingImportKind {
@@ -9796,20 +9702,6 @@ class Sema final : public SemaBase {
                              SourceLocation DeclLoc, ArrayRef<Module *> Modules,
                              MissingImportKind MIK, bool Recover);
 
-  struct TypoExprState {
-    std::unique_ptr<TypoCorrectionConsumer> Consumer;
-    TypoDiagnosticGenerator DiagHandler;
-    TypoRecoveryCallback RecoveryHandler;
-    TypoExprState();
-    TypoExprState(TypoExprState &&other) noexcept;
-    TypoExprState &operator=(TypoExprState &&other) noexcept;
-  };
-
-  const TypoExprState &getTypoExprState(TypoExpr *TE) const;
-
-  /// Clears the state of the given TypoExpr.
-  void clearDelayedTypo(TypoExpr *TE);
-
   /// Called on #pragma clang __debug dump II
   void ActOnPragmaDump(Scope *S, SourceLocation Loc, IdentifierInfo *II);
 
@@ -9832,23 +9724,15 @@ class Sema final : public SemaBase {
   /// Determine if we could use all the declarations in the module.
   bool isUsableModule(const Module *M);
 
-  /// Helper for CorrectTypo and CorrectTypoDelayed used to create and
-  /// populate a new TypoCorrectionConsumer. Returns nullptr if typo correction
-  /// should be skipped entirely.
+  /// Helper for CorrectTypo used to create and populate a new
+  /// TypoCorrectionConsumer. Returns nullptr if typo correction should be
+  /// skipped entirely.
   std::unique_ptr<TypoCorrectionConsumer> makeTypoCorrectionConsumer(
       const DeclarationNameInfo &Typo, Sema::LookupNameKind LookupKind,
       Scope *S, CXXScopeSpec *SS, CorrectionCandidateCallback &CCC,
       DeclContext *MemberContext, bool EnteringContext,
       const ObjCObjectPointerType *OPT, bool ErrorRecovery);
 
-  /// The set of unhandled TypoExprs and their associated state.
-  llvm::MapVector<TypoExpr *, TypoExprState> DelayedTypos;
-
-  /// Creates a new TypoExpr AST node.
-  TypoExpr *createDelayedTypo(std::unique_ptr<TypoCorrectionConsumer> TCC,
-                              TypoDiagnosticGenerator TDG,
-                              TypoRecoveryCallback TRC, SourceLocation TypoLoc);
-
   /// Cache for module units which is usable for current module.
   llvm::DenseSet<const Module *> UsableModuleUnitsCache;
 
diff --git a/clang/include/clang/Sema/SemaInternal.h b/clang/include/clang/Sema/SemaInternal.h
index 95874077050a9..4d0da1102bb59 100644
--- a/clang/include/clang/Sema/SemaInternal.h
+++ b/clang/include/clang/Sema/SemaInternal.h
@@ -314,20 +314,6 @@ class TypoCorrectionConsumer : public VisibleDeclConsumer {
   bool SearchNamespaces;
 };
 
-inline Sema::TypoExprState::TypoExprState() {}
-
-inline Sema::TypoExprState::TypoExprState(TypoExprState &&other) noexcept {
-  *this = std::move(other);
-}
-
-inline Sema::TypoExprState &Sema::TypoExprState::
-operator=(Sema::TypoExprState &&other) noexcept {
-  Consumer = std::move(other.Consumer);
-  DiagHandler = std::move(other.DiagHandler);
-  RecoveryHandler = std::move(other.RecoveryHandler);
-  return *this;
-}
-
 } // end namespace clang
 
 #endif
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index 17d2cb4a30f30..c3722c65abf6e 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -3611,7 +3611,6 @@ bool Expr::HasSideEffects(const ASTContext &Ctx,
   case PackExpansionExprClass:
   case SubstNonTypeTemplateParmPackExprClass:
   case FunctionParmPackExprClass:
-  case TypoExprClass:
   case RecoveryExprClass:
   case CXXFoldExprClass:
     // Make a conservative assumption for dependent nodes.
diff --git a/clang/lib/AST/ExprClassification.cpp b/clang/lib/AST/ExprClassification.cpp
index 3f37d06cc8f3a..ad66335138a42 100644
--- a/clang/lib/AST/ExprClassification.cpp
+++ b/clang/lib/AST/ExprClassification.cpp
@@ -129,7 +129,6 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) {
     // FIXME: Is this wise? Should they get their own kind?
   case Expr::UnresolvedLookupExprClass:
   case Expr::UnresolvedMemberExprClass:
-  case Expr::TypoExprClass:
   case Expr::DependentCoawaitExprClass:
   case Expr::CXXDependentScopeMemberExprClass:
   case Expr::DependentScopeDeclRefExprClass:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 27ea55e981446..f1580255a462a 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -17327,7 +17327,6 @@ static ICEDiag CheckICE(const Expr* E, const ASTContext &Ctx) {
   case Expr::CXXDeleteExprClass:
   case Expr::CXXPseudoDestructorExprClass:
   case Expr::UnresolvedLookupExprClass:
-  case Expr::TypoExprClass:
   case Expr::RecoveryExprClass:
   case Expr::DependentScopeDeclRefExprClass:
   case Expr::CXXConstructExprClass:
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index ecf5be220439b..487933a748ab8 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -4994,7 +4994,6 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity,
   case Expr::ParenListExprClass:
   case Expr::MSPropertyRefExprClass:
   case Expr::MSPropertySubscriptExprClass:
-  case Expr::TypoExprClass: // This should no longer exist in the AST by now.
   case Expr::RecoveryExprClass:
   case Expr::ArraySectionExprClass:
   case Expr::OMPArrayShapingExprClass:
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 13c3bc0387890..28317911d825b 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -2914,11 +2914,6 @@ void StmtPrinter::VisitOpaqueValueExpr(OpaqueValueExpr *Node) {
   PrintExpr(Node->getSourceExpr());
 }
 
-void StmtPrinter::VisitTypoExpr(TypoExpr *Node) {
-  // TODO: Print something reasonable for a TypoExpr, if necessary.
-  llvm_unreachable("Cannot print TypoExpr nodes");
-}
-
 void StmtPrinter::VisitRecoveryExpr(RecoveryExpr *Node) {
   OS << "<recovery-expr>(";
   const char *Sep = "";
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index f7d1655f67ed1..c666d966a6e58 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -2361,10 +2361,6 @@ void StmtProfiler::VisitOpaqueValueExpr(const OpaqueValueExpr *E) {
   VisitExpr(E);
 }
 
-void StmtProfiler::VisitTypoExpr(const TypoExpr *E) {
-  VisitExpr(E);
-}
-
 void StmtProfiler::VisitSourceLocExpr(const SourceLocExpr *E) {
   VisitExpr(E);
 }
diff --git a/clang/lib/Parse/ParseCXXInlineMethods.cpp b/clang/lib/Parse/ParseCXXInlineMethods.cpp
index e215c64cccd11..9a010fb5f3427 100644
--- a/clang/lib/Parse/ParseCXXInlineMethods.cpp
+++ b/clang/lib/Parse/ParseCXXInlineMethods.cpp
@@ -422,7 +422,6 @@ void Parser::ParseLexedMethodDeclaration(LateParsedMethodDeclaration &LM) {
         DefArgResult = ParseBraceInitializer();
       } else
         DefArgResult = ParseAssignmentExpression();
-      DefArgResult = Actions.CorrectDelayedTyposInExpr(DefArgResult, Param);
       if (DefArgResult.isInvalid()) {
         Actions.ActOnParamDefaultArgumentError(Param, EqualLoc,
                                                /*DefaultArg=*/nullptr);
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index f469e466e4634..647ee34efcabc 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -436,7 +436,6 @@ bool Parser::ParseAttributeArgumentList(
     } else {
       Expr = ParseAssignmentExpression();
     }
-    Expr = Actions.CorrectDelayedTyposInExpr(Expr);
 
     if (Tok.is(tok::ellipsis))
       Expr = Actions.ActOnPackExpansion(Expr.get(), ConsumeToken());
@@ -472,15 +471,6 @@ bool Parser::ParseAttributeArgumentList(
     Arg++;
   }
 
-  if (SawError) {
-    // Ensure typos get diagnosed when errors were encountered while parsing the
-    // expression list.
-    for (auto &E : Exprs) {
-      ExprResult Expr = Actions.CorrectDelayedTyposInExpr(E);
-      if (Expr.isUsable())
-        E = Expr.get();
-    }
-  }
   return SawError;
 }
 
@@ -565,9 +555,7 @@ unsigned Parser::ParseAttributeArgsCommon(
               nullptr,
               Sema::ExpressionEvaluationContextRecord::EK_AttrArgument);
 
-          ExprResult ArgExpr(
-              Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()));
-
+          ExprResult ArgExpr = ParseAssignmentExpression();
           if (ArgExpr.isInvalid()) {
             SkipUntil(tok::r_paren, StopAtSemi);
             return 0;
@@ -3212,9 +3200,7 @@ void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
       Actions, Sema::ExpressionEvaluationContext::PotentiallyEvaluated, nullptr,
       ExpressionKind::EK_AttrArgument);
 
-  ExprResult ArgExpr(
-      Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()));
-
+  ExprResult ArgExpr = ParseAssignmentExpression();
   if (ArgExpr.isInvalid()) {
     Parens.skipToEnd();
     return;
@@ -6890,8 +6876,8 @@ void Parser::ParseDirectDeclarator(Declarator &D) {
       //   void (f()) requires true;
       Diag(Tok, diag::err_requires_clause_inside_parens);
       ConsumeToken();
-      ExprResult TrailingRequiresClause = Actions.CorrectDelayedTyposInExpr(
-         ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true));
+      ExprResult TrailingRequiresClause =
+          ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true);
       if (TrailingRequiresClause.isUsable() && D.isFunctionDeclarator() &&
           !D.hasTrailingRequiresClause())
         // We're already ill-formed if we got here but we'll accept it anyway.
@@ -7538,8 +7524,7 @@ void Parser::ParseParameterDeclarationClause(
       Diag(Tok,
            diag::err_requires_clause_on_declarator_not_declaring_a_function);
       ConsumeToken();
-      Actions.CorrectDelayedTyposInExpr(
-         ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true));
+      ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true);
     }
 
     // Remember this parsed parameter in ParamInfo.
@@ -7653,7 +7638,6 @@ void Parser::ParseParameterDeclarationClause(
             }
             DefArgResult = ParseAssignmentExpression();
           }
-          DefArgResult = Actions.CorrectDelayedTyposInExpr(DefArgResult);
           if (DefArgResult.isInvalid()) {
             Actions.ActOnParamDefaultArgumentError(Param, EqualLoc,
                                                    /*DefaultArg=*/nullptr);
@@ -7799,8 +7783,7 @@ void Parser::ParseBracketDeclarator(Declarator &D) {
     } else {
       EnterExpressionEvaluationContext Unevaluated(
           Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated);
-      NumElements =
-          Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+      NumElements = ParseAssignmentExpression();
     }
   } else {
     if (StaticLoc.isValid()) {
@@ -7937,8 +7920,8 @@ void Parser::ParseTypeofSpecifier(DeclSpec &DS) {
   bool isCastExpr;
   ParsedType CastTy;
   SourceRange CastRange;
-  ExprResult Operand = Actions.CorrectDelayedTyposInExpr(
-      ParseExprAfterUnaryExprOrTypeTrait(OpTok, isCastExpr, CastTy, CastRange));
+  ExprResult Operand =
+      ParseExprAfterUnaryExprOrTypeTrait(OpTok, isCastExpr, CastTy, CastRange);
   if (HasParens)
     DS.setTypeArgumentRange(CastRange);
 
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 0b5f56fea0b14..5f34370aeeb2d 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1071,10 +1071,7 @@ SourceLocation Parser::ParseDecltypeSpecifier(DeclSpec &DS) {
       EnterExpressionEvaluationContext Unevaluated(
           Actions, Sema::ExpressionEvaluationContext::Unevaluated, nullptr,
           Sema::ExpressionEvaluationContextRecord::EK_Decltype);
-      Result = Actions.CorrectDelayedTyposInExpr(
-          ParseExpression(), /*InitDecl=*/nullptr,
-          /*RecoverUncorrectedTypos=*/false,
-          [](Expr *E) { return E->hasPlaceholderType() ? ExprError() : E; });
+      Result = ParseExpression();
       if (Result.isInvalid()) {
         DS.SetTypeSpecError();
         if (SkipUntil(tok::r_paren, StopAtSemi | StopBeforeMatch)) {
@@ -4465,8 +4462,7 @@ bool Parser::ParseCXXAssumeAttributeArg(
       Actions, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
 
   TentativeParsingAction TPA(*this);
-  ExprResult Res(
-      Actions.CorrectDelayedTyposInExpr(ParseConditionalExpression()));
+  ExprResult Res = ParseConditionalExpression();
   if (Res.isInvalid()) {
     TPA.Commit();
     SkipUntil(tok::r_paren, tok::r_square, StopAtSemi | StopBeforeMatch);
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 951a157305ddc..a27a44455b621 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -183,7 +183,6 @@ ExprResult Parser::ParseConstraintExpression() {
   ExprResult LHS(ParseCastExpression(CastParseKind::AnyCastExpr));
   ExprResult Res(ParseRHSOfBinaryExpression(LHS, prec::LogicalOr));
   if (Res.isUsable() && !Actions.CheckConstraintExpression(Res.get())) {
-    Actions.CorrectDelayedTyposInExpr(Res);
     return ExprError();
   }
   return Res;
@@ -244,7 +243,6 @@ Parser::ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause) {
       // the rest of the addition expression). Try to parse the rest of it here.
       if (PossibleNonPrimary)
         E = RecoverFromNonPrimary(E, /*Note=*/!IsConstraintExpr);
-      Actions.CorrectDelayedTyposInExpr(E);
       return ExprError();
     }
     return E;
@@ -256,14 +254,11 @@ Parser::ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause) {
     SourceLocation LogicalAndLoc = ConsumeToken();
     ExprResult RHS = ParsePrimary();
     if (RHS.isInvalid()) {
-      Actions.CorrectDelayedTyposInExpr(LHS);
       return ExprError();
     }
     ExprResult Op = Actions.ActOnBinOp(getCurScope(), LogicalAndLoc,
                                        tok::ampamp, LHS.get(), RHS.get());
     if (!Op.isUsable()) {
-      Actions.CorrectDelayedTyposInExpr(RHS);
-      Actions.CorrectDelayedTyposInExpr(LHS);
       return ExprError();
     }
     LHS = Op;
@@ -281,14 +276,11 @@ Parser::ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause) {
     ExprResult RHS =
         ParseConstraintLogicalAndExpression(IsTrailingRequiresClause);
     if (!RHS.isUsable()) {
-      Actions.CorrectDelayedTyposInExpr(LHS);
       return ExprError();
     }
     ExprResult Op = Actions.ActOnBinOp(getCurScope(), LogicalOrLoc,
                                        tok::pipepipe, LHS.get(), RHS.get());
     if (!Op.isUsable()) {
-      Actions.CorrectDelayedTyposInExpr(RHS);
-      Actions.CorrectDelayedTyposInExpr(LHS);
       return ExprError();
     }
     LHS = Op;
@@ -408,7 +400,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
       }
 
       if (TernaryMiddle.isInvalid()) {
-        Actions.CorrectDelayedTyposInExpr(LHS);
         LHS = ExprError();
         TernaryMiddle = nullptr;
       }
@@ -466,11 +457,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
       RHS = ParseCastExpression(CastParseKind::AnyCastExpr);
 
     if (RHS.isInvalid()) {
-      // FIXME: Errors generated by the delayed typo correction should be
-      // printed before errors from parsing the RHS, not after.
-      Actions.CorrectDelayedTyposInExpr(LHS);
-      if (TernaryMiddle.isUsable())
-        TernaryMiddle = Actions.CorrectDelayedTyposInExpr(TernaryMiddle);
       LHS = ExprError();
     }
 
@@ -503,11 +489,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
       RHSIsInitList = false;
 
       if (RHS.isInvalid()) {
-        // FIXME: Errors generated by the delayed typo correction should be
-        // printed before errors from ParseRHSOfBinaryExpression, not after.
-        Actions.CorrectDelayedTyposInExpr(LHS);
-        if (TernaryMiddle.isUsable())
-          TernaryMiddle = Actions.CorrectDelayedTyposInExpr(TernaryMiddle);
         LHS = ExprError();
       }
 
@@ -571,17 +552,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
 
         LHS = CondOp;
       }
-      // In this case, ActOnBinOp or ActOnConditionalOp performed the
-      // CorrectDelayedTyposInExpr check.
-      if (!getLangOpts().CPlusPlus)
-        continue;
-    }
-
-    // Ensure potential typos aren't left undiagnosed.
-    if (LHS.isInvalid()) {
-      Actions.CorrectDelayedTyposInExpr(OrigLHS);
-      Actions.CorrectDelayedTyposInExpr(TernaryMiddle);
-      Actions.CorrectDelayedTyposInExpr(RHS);
     }
   }
 }
@@ -1711,7 +1681,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
       // Reject array indices starting with a lambda-expression. '[[' is
       // reserved for attributes.
       if (CheckProhibitedCXX11Attribute()) {
-        (void)Actions.CorrectDelayedTyposInExpr(LHS);
         return ExprError();
       }
       BalancedDelimiterTracker T(*this, tok::l_square);
@@ -1737,8 +1706,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
           } else {
             Idx = ParseExpression(); // May be a comma expression
           }
-          LHS = Actions.CorrectDelayedTyposInExpr(LHS);
-          Idx = Actions.CorrectDelayedTyposInExpr(Idx);
           if (Idx.isInvalid()) {
             HasError = true;
           } else {
@@ -1746,7 +1713,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
           }
         } else if (Tok.isNot(tok::r_square)) {
           if (ParseExpressionList(ArgExprs)) {
-            LHS = Actions.CorrectDelayedTyposInExpr(LHS);
             HasError = true;
           }
         }
@@ -1762,7 +1728,7 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
           // Consume ':'
           ColonLocFirst = ConsumeToken();
           if (Tok.isNot(tok::r_square))
-            Length = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+            Length = ParseExpression();
         }
       } else if (ArgExprs.size() <= 1 && getLangOpts().OpenMP) {
         ColonProtectionRAIIObject RAII(*this);
@@ -1773,7 +1739,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
               (getLangOpts().OpenMP < 50 ||
                ((Tok.isNot(tok::colon) && getLangOpts().OpenMP >= 50)))) {
             Length = ParseExpression();
-            Length = Actions.CorrectDelayedTyposInExpr(Length);
           }
         }
         if (getLangOpts().OpenMP >= 50 &&
@@ -1789,8 +1754,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
       }
 
       SourceLocation RLoc = Tok.getLocation();
-      LHS = Actions.CorrectDelayedTyposInExpr(LHS);
-
       if (!LHS.isInvalid() && !HasError && !Length.isInvalid() &&
           !Stride.isInvalid() && Tok.is(tok::r_square)) {
         if (ColonLocFirst.isValid() || ColonLocSecond.isValid()) {
@@ -1838,7 +1801,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
         SourceLocation OpenLoc = ConsumeToken();
 
         if (ParseSimpleExpressionList(ExecConfigExprs)) {
-          (void)Actions.CorrectDelayedTyposInExpr(LHS);
           LHS = ExprError();
         }
 
@@ -1889,16 +1851,12 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
                  PreferredType.enterFunctionArgument(Tok.getLocation(),
                                                      RunSignatureHelp);
                }))) {
-            (void)Actions.CorrectDelayedTyposInExpr(LHS);
             // If we got an error when parsing expression list, we don't call
             // the CodeCompleteCall handler inside the parser. So call it here
             // to make sure we get overload suggestions even when we are in the
             // middle of a parameter.
             if (PP.isCodeCompletionReached() && !CalledSignatureHelp)
               RunSignatureHelp();
-          } else if (LHS.isInvalid()) {
-            for (auto &E : ArgExprs)
-              Actions.CorrectDelayedTyposInExpr(E);
           }
         }
       }
@@ -1913,16 +1871,16 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
                                          ArgExprs);
         SkipUntil(tok::r_paren, StopAtSemi);
       } else if (Tok.isNot(tok::r_paren)) {
-        bool HadDelayedTypo = false;
-        if (Actions.CorrectDelayedTyposInExpr(LHS).get() != LHS.get())
-          HadDelayedTypo = true;
+        bool HadErrors = false;
+        if (LHS.get()->containsErrors())
+          HadErrors = true;
         for (auto &E : ArgExprs)
-          if (Actions.CorrectDelayedTyposInExpr(E).get() != E)
-            HadDelayedTypo = true;
-        // If there were delayed typos in the LHS or ArgExprs, call SkipUntil
-        // instead of PT.consumeClose() to avoid emitting extra diagnostics for
-        // the unmatched l_paren.
-        if (HadDelayedTypo)
+          if (E->containsErrors())
+            HadErrors = true;
+        // If there were errors in the LHS or ArgExprs, call SkipUntil instead
+        // of PT.consumeClose() to avoid emitting extra diagnostics for the
+        // unmatched l_paren.
+        if (HadErrors)
           SkipUntil(tok::r_paren, StopAtSemi);
         else
           PT.consumeClose();
@@ -2050,7 +2008,6 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) {
                      /*AllowConstructorName=*/
                      getLangOpts().MicrosoftExt && SS.isNotEmpty(),
                      /*AllowDeductionGuide=*/false, &TemplateKWLoc, Name)) {
-        (void)Actions.CorrectDelayedTyposInExpr(LHS);
         LHS = ExprError();
       }
 
@@ -2921,8 +2878,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr,
     do {
       BalancedDelimiterTracker TS(*this, tok::l_square);
       TS.consumeOpen();
-      ExprResult NumElements =
-          Actions.CorrectDelayedTyposInExpr(ParseExpression());
+      ExprResult NumElements = ParseExpression();
       if (!NumElements.isUsable()) {
         ErrorFound = true;
         while (!SkipUntil(tok::r_square, tok::r_paren,
@@ -2936,7 +2892,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr,
     // Match the ')'.
     T.consumeClose();
     RParenLoc = T.getCloseLocation();
-    Result = Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+    Result = ParseAssignmentExpression();
     if (ErrorFound) {
       Result = ExprError();
     } else if (!Result.isInvalid()) {
@@ -2948,12 +2904,6 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr,
     InMessageExpressionRAIIObject InMessage(*this, false);
 
     Result = ParseExpression(TypeCastState::MaybeTypeCast);
-    if (!getLangOpts().CPlusPlus && Result.isUsable()) {
-      // Correct typos in non-C++ code earlier so that implicit-cast-like
-      // expressions are parsed correctly.
-      Result = Actions.CorrectDelayedTyposInExpr(Result);
-    }
-
     if (ExprType >= ParenParseOption::FoldExpr &&
         isFoldOperator(Tok.getKind()) && NextToken().is(tok::ellipsis)) {
       ExprType = ParenParseOption::FoldExpr;
@@ -3057,8 +3007,7 @@ ExprResult Parser::ParseGenericSelectionExpression() {
     // not evaluated."
     EnterExpressionEvaluationContext Unevaluated(
         Actions, Sema::ExpressionEvaluationContext::Unevaluated);
-    ControllingExpr =
-        Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+    ControllingExpr = ParseAssignmentExpression();
     if (ControllingExpr.isInvalid()) {
       SkipUntil(tok::r_paren, StopAtSemi);
       return ExprError();
@@ -3104,8 +3053,7 @@ ExprResult Parser::ParseGenericSelectionExpression() {
 
     // FIXME: These expressions should be parsed in a potentially potentially
     // evaluated context.
-    ExprResult ER(
-        Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression()));
+    ExprResult ER = ParseAssignmentExpression();
     if (ER.isInvalid()) {
       SkipUntil(tok::r_paren, StopAtSemi);
       return ExprError();
@@ -3199,8 +3147,7 @@ void Parser::injectEmbedTokens() {
 
 bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
                                  llvm::function_ref<void()> ExpressionStarts,
-                                 bool FailImmediatelyOnInvalidExpr,
-                                 bool EarlyTypoCorrection) {
+                                 bool FailImmediatelyOnInvalidExpr) {
   bool SawError = false;
   while (true) {
     if (ExpressionStarts)
@@ -3213,9 +3160,6 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
     } else
       Expr = ParseAssignmentExpression();
 
-    if (EarlyTypoCorrection)
-      Expr = Actions.CorrectDelayedTyposInExpr(Expr);
-
     if (Tok.is(tok::ellipsis))
       Expr = Actions.ActOnPackExpansion(Expr.get(), ConsumeToken());
     else if (Tok.is(tok::code_completion)) {
@@ -3244,14 +3188,6 @@ bool Parser::ParseExpressionList(SmallVectorImpl<Expr *> &Exprs,
     ConsumeToken();
     checkPotentialAngleBracketDelimiter(Comma);
   }
-  if (SawError) {
-    // Ensure typos get diagnosed when errors were encountered while parsing the
-    // expression list.
-    for (auto &E : Exprs) {
-      ExprResult Expr = Actions.CorrectDelayedTyposInExpr(E);
-      if (Expr.isUsable()) E = Expr.get();
-    }
-  }
   return SawError;
 }
 
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 55ad7f256fa82..329572047da04 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -972,8 +972,6 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
           SourceLocation StartLoc = Tok.getLocation();
           InMessageExpressionRAIIObject MaybeInMessageExpression(*this, true);
           Init = ParseInitializer();
-          if (!Init.isInvalid())
-            Init = Actions.CorrectDelayedTyposInExpr(Init.get());
 
           if (Tok.getLocation() != StartLoc) {
             // Back out the lexing of the token after the initializer.
@@ -1065,8 +1063,6 @@ bool Parser::ParseLambdaIntroducer(LambdaIntroducer &Intro,
     // enclosing the lambda-expression, rather than in the context of the
     // lambda-expression itself.
     ParsedType InitCaptureType;
-    if (Init.isUsable())
-      Init = Actions.CorrectDelayedTyposInExpr(Init.get());
     if (Init.isUsable()) {
       NonTentativeAction([&] {
         // Get the pointer and store it in an lvalue, so we can use it as an
@@ -3202,8 +3198,7 @@ ExprResult Parser::ParseRequiresExpression() {
         //             cv-qualifier-seq[opt] abstract-declarator[opt]
         BalancedDelimiterTracker ExprBraces(*this, tok::l_brace);
         ExprBraces.consumeOpen();
-        ExprResult Expression =
-            Actions.CorrectDelayedTyposInExpr(ParseExpression());
+        ExprResult Expression = ParseExpression();
         if (!Expression.isUsable()) {
           ExprBraces.skipToEnd();
           SkipUntil(tok::semi, tok::r_brace, SkipUntilFlags::StopBeforeMatch);
@@ -3306,8 +3301,7 @@ ExprResult Parser::ParseRequiresExpression() {
             // C++ [expr.prim.req.nested]
             //     nested-requirement:
             //         'requires' constraint-expression ';'
-            ExprResult ConstraintExpr =
-                Actions.CorrectDelayedTyposInExpr(ParseConstraintExpression());
+            ExprResult ConstraintExpr = ParseConstraintExpression();
             if (ConstraintExpr.isInvalid() || !ConstraintExpr.isUsable()) {
               SkipUntil(tok::semi, tok::r_brace,
                         SkipUntilFlags::StopBeforeMatch);
@@ -3373,8 +3367,7 @@ ExprResult Parser::ParseRequiresExpression() {
         //     simple-requirement:
         //         expression ';'
         SourceLocation StartLoc = Tok.getLocation();
-        ExprResult Expression =
-            Actions.CorrectDelayedTyposInExpr(ParseExpression());
+        ExprResult Expression = ParseExpression();
         if (!Expression.isUsable()) {
           SkipUntil(tok::semi, tok::r_brace, SkipUntilFlags::StopBeforeMatch);
           break;
diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp
index df8372b995e55..a3be3744a9327 100644
--- a/clang/lib/Parse/ParseInit.cpp
+++ b/clang/lib/Parse/ParseInit.cpp
@@ -477,8 +477,6 @@ ExprResult Parser::ParseBraceInitializer() {
     if (Tok.is(tok::ellipsis))
       SubElt = Actions.ActOnPackExpansion(SubElt.get(), ConsumeToken());
 
-    SubElt = Actions.CorrectDelayedTyposInExpr(SubElt.get());
-
     // If we couldn't parse the subelement, bail out.
     if (SubElt.isUsable()) {
       InitExprs.push_back(SubElt.get());
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 6afb7809d3cd2..8ef16a4d3808a 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -2629,10 +2629,7 @@ bool Parser::ParseObjCXXMessageReceiver(bool &IsExpr, void *&TypeOrExpr) {
   if (!Tok.isSimpleTypeSpecifier(getLangOpts())) {
     //   objc-receiver:
     //     expression
-    // Make sure any typos in the receiver are corrected or diagnosed, so that
-    // proper recovery can happen. FIXME: Perhaps filter the corrected expr to
-    // only the things that are valid ObjC receivers?
-    ExprResult Receiver = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+    ExprResult Receiver = ParseExpression();
     if (Receiver.isInvalid())
       return true;
 
@@ -2809,7 +2806,7 @@ ExprResult Parser::ParseObjCMessageExpression() {
   }
 
   // Otherwise, an arbitrary expression can be the receiver of a send.
-  ExprResult Res = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+  ExprResult Res = ParseExpression();
   if (Res.isInvalid()) {
     SkipUntil(tok::r_square, StopAtSemi);
     return Res;
@@ -2930,8 +2927,6 @@ Parser::ParseObjCMessageExpressionBody(SourceLocation LBracLoc,
       SourceLocation commaLoc = ConsumeToken(); // Eat the ','.
       ///  Parse the expression after ','
       ExprResult Res(ParseAssignmentExpression());
-      if (Tok.is(tok::colon))
-        Res = Actions.CorrectDelayedTyposInExpr(Res);
       if (Res.isInvalid()) {
         if (Tok.is(tok::colon)) {
           Diag(commaLoc, diag::note_extra_comma_message_arg) <<
@@ -3078,10 +3073,6 @@ ExprResult Parser::ParseObjCArrayLiteral(SourceLocation AtLoc) {
       return Res;
     }
 
-    Res = Actions.CorrectDelayedTyposInExpr(Res.get());
-    if (Res.isInvalid())
-      HasInvalidEltExpr = true;
-
     // Parse the ellipsis that indicates a pack expansion.
     if (Tok.is(tok::ellipsis))
       Res = Actions.ActOnPackExpansion(Res.get(), ConsumeToken());
@@ -3108,7 +3099,6 @@ ExprResult Parser::ParseObjCArrayLiteral(SourceLocation AtLoc) {
 ExprResult Parser::ParseObjCDictionaryLiteral(SourceLocation AtLoc) {
   SmallVector<ObjCDictionaryElement, 4> Elements; // dictionary elements.
   ConsumeBrace(); // consume the l_square.
-  bool HasInvalidEltExpr = false;
   while (Tok.isNot(tok::r_brace)) {
     // Parse the comma separated key : value expressions.
     ExprResult KeyExpr;
@@ -3138,12 +3128,6 @@ ExprResult Parser::ParseObjCDictionaryLiteral(SourceLocation AtLoc) {
       return ValueExpr;
     }
 
-    // Check the key and value for possible typos
-    KeyExpr = Actions.CorrectDelayedTyposInExpr(KeyExpr.get());
-    ValueExpr = Actions.CorrectDelayedTyposInExpr(ValueExpr.get());
-    if (KeyExpr.isInvalid() || ValueExpr.isInvalid())
-      HasInvalidEltExpr = true;
-
     // Parse the ellipsis that designates this as a pack expansion. Do not
     // ActOnPackExpansion here, leave it to template instantiation time where
     // we can get better diagnostics.
@@ -3163,9 +3147,6 @@ ExprResult Parser::ParseObjCDictionaryLiteral(SourceLocation AtLoc) {
   }
   SourceLocation EndLoc = ConsumeBrace();
 
-  if (HasInvalidEltExpr)
-    return ExprError();
-
   // Create the ObjCDictionaryLiteral.
   return Actions.ObjC().BuildObjCDictionaryLiteral(SourceRange(AtLoc, EndLoc),
                                                    Elements);
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index ca4f878464c4f..f2849c4eac7cc 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -653,7 +653,7 @@ ExprResult Parser::ParseOpenACCConditionExpr() {
   // it does in an if/while/etc (See ParseCXXCondition), however as it was
   // written with Fortran/C in mind, we're going to assume it just means an
   // 'expression evaluating to boolean'.
-  ExprResult ER = getActions().CorrectDelayedTyposInExpr(ParseExpression());
+  ExprResult ER = ParseExpression();
 
   if (!ER.isUsable())
     return ER;
@@ -761,12 +761,6 @@ Parser::ParseOpenACCIntExpr(OpenACCDirectiveKind DK, OpenACCClauseKind CK,
   if (!ER.isUsable())
     return {ER, OpenACCParseCanContinue::Cannot};
 
-  // Parsing can continue after the initial assignment expression parsing, so
-  // even if there was a typo, we can continue.
-  ER = getActions().CorrectDelayedTyposInExpr(ER);
-  if (!ER.isUsable())
-    return {ER, OpenACCParseCanContinue::Can};
-
   return {getActions().OpenACC().ActOnIntExpr(DK, CK, Loc, ER.get()),
           OpenACCParseCanContinue::Can};
 }
@@ -836,8 +830,7 @@ ExprResult Parser::ParseOpenACCSizeExpr(OpenACCClauseKind CK) {
     return getActions().OpenACC().ActOnOpenACCAsteriskSizeExpr(AsteriskLoc);
   }
 
-  ExprResult SizeExpr =
-      getActions().CorrectDelayedTyposInExpr(ParseConstantExpression());
+  ExprResult SizeExpr = ParseConstantExpression();
 
   if (!SizeExpr.isUsable())
     return SizeExpr;
@@ -891,8 +884,7 @@ Parser::OpenACCGangArgRes Parser::ParseOpenACCGangArg(SourceLocation GangLoc) {
     ConsumeToken();
     // Parse this as a const-expression, and we'll check its integer-ness/value
     // in CheckGangExpr.
-    ExprResult Res =
-        getActions().CorrectDelayedTyposInExpr(ParseConstantExpression());
+    ExprResult Res = ParseConstantExpression();
     return {OpenACCGangKind::Dim, Res};
   }
 
@@ -1089,8 +1081,7 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams(
     case OpenACCClauseKind::Collapse: {
       bool HasForce = tryParseAndConsumeSpecialTokenKind(
           *this, OpenACCSpecialTokenKind::Force, ClauseKind);
-      ExprResult LoopCount =
-          getActions().CorrectDelayedTyposInExpr(ParseConstantExpression());
+      ExprResult LoopCount = ParseConstantExpression();
       if (LoopCount.isInvalid()) {
         Parens.skipToEnd();
         return OpenACCCanContinue();
@@ -1387,7 +1378,7 @@ ExprResult Parser::ParseOpenACCIDExpression() {
                                     /*isAddressOfOperand=*/false);
   }
 
-  return getActions().CorrectDelayedTyposInExpr(Res);
+  return Res;
 }
 
 std::variant<std::monostate, clang::StringLiteral *, IdentifierInfo *>
@@ -1414,9 +1405,8 @@ Parser::ParseOpenACCBindClauseArgument() {
     return std::monostate{};
   }
 
-  ExprResult Res =
-      getActions().CorrectDelayedTyposInExpr(ParseStringLiteralExpression(
-          /*AllowUserDefinedLiteral=*/false, /*Unevaluated=*/true));
+  ExprResult Res = ParseStringLiteralExpression(
+      /*AllowUserDefinedLiteral=*/false, /*Unevaluated=*/true);
   if (!Res.isUsable())
     return std::monostate{};
   return cast<StringLiteral>(Res.get());
@@ -1430,10 +1420,6 @@ Parser::OpenACCVarParseResult Parser::ParseOpenACCVar(OpenACCDirectiveKind DK,
   if (!Res.isUsable())
     return {Res, OpenACCParseCanContinue::Cannot};
 
-  Res = getActions().CorrectDelayedTyposInExpr(Res.get());
-  if (!Res.isUsable())
-    return {Res, OpenACCParseCanContinue::Can};
-
   Res = getActions().OpenACC().ActOnVar(DK, CK, Res.get());
 
   return {Res, OpenACCParseCanContinue::Can};
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index b69c3abe0b321..def1a52ba7d4a 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3600,8 +3600,7 @@ bool Parser::ParseOMPInteropInfo(OMPInteropInfo &InteropInfo,
       while (Tok.isNot(tok::r_paren)) {
         SourceLocation Loc = Tok.getLocation();
         ExprResult LHS = ParseCastExpression(CastParseKind::AnyCastExpr);
-        ExprResult PTExpr = Actions.CorrectDelayedTyposInExpr(
-            ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+        ExprResult PTExpr = ParseRHSOfBinaryExpression(LHS, prec::Conditional);
         PTExpr = Actions.ActOnFinishFullExpr(PTExpr.get(), Loc,
                                              /*DiscardedValue=*/false);
         if (PTExpr.isUsable()) {
@@ -3662,8 +3661,7 @@ OMPClause *Parser::ParseOpenMPInteropClause(OpenMPClauseKind Kind,
 
   // Parse the variable.
   SourceLocation VarLoc = Tok.getLocation();
-  ExprResult InteropVarExpr =
-      Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+  ExprResult InteropVarExpr = ParseAssignmentExpression();
   if (!InteropVarExpr.isUsable()) {
     SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end,
               StopBeforeMatch);
@@ -4288,8 +4286,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
     // Parse <begin>
     SourceLocation Loc = Tok.getLocation();
     ExprResult LHS = ParseCastExpression(CastParseKind::AnyCastExpr);
-    ExprResult Begin = Actions.CorrectDelayedTyposInExpr(
-        ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+    ExprResult Begin = ParseRHSOfBinaryExpression(LHS, prec::Conditional);
     Begin = Actions.ActOnFinishFullExpr(Begin.get(), Loc,
                                         /*DiscardedValue=*/false);
     // Parse ':'.
@@ -4300,8 +4297,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
     // Parse <end>
     Loc = Tok.getLocation();
     LHS = ParseCastExpression(CastParseKind::AnyCastExpr);
-    ExprResult End = Actions.CorrectDelayedTyposInExpr(
-        ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+    ExprResult End = ParseRHSOfBinaryExpression(LHS, prec::Conditional);
     End = Actions.ActOnFinishFullExpr(End.get(), Loc,
                                       /*DiscardedValue=*/false);
 
@@ -4314,8 +4310,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
       // Parse <step>
       Loc = Tok.getLocation();
       LHS = ParseCastExpression(CastParseKind::AnyCastExpr);
-      Step = Actions.CorrectDelayedTyposInExpr(
-          ParseRHSOfBinaryExpression(LHS, prec::Conditional));
+      Step = ParseRHSOfBinaryExpression(LHS, prec::Conditional);
       Step = Actions.ActOnFinishFullExpr(Step.get(), Loc,
                                          /*DiscardedValue=*/false);
     }
@@ -4797,7 +4792,6 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
       EnterScope(Scope::OpenMPDirectiveScope | Scope::DeclScope);
       Tail = ParseOpenMPIteratorsExpr();
     }
-    Tail = Actions.CorrectDelayedTyposInExpr(Tail);
     Tail = Actions.ActOnFinishFullExpr(Tail.get(), T.getOpenLocation(),
                                        /*DiscardedValue=*/false);
     if (Tail.isUsable() || Data.AllocateAlignment) {
@@ -4858,8 +4852,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
     ColonProtectionRAIIObject ColonRAII(*this, MayHaveTail);
     if (!ParseOpenMPReservedLocator(Kind, Data, getLangOpts())) {
       // Parse variable
-      ExprResult VarExpr =
-          Actions.CorrectDelayedTyposInExpr(ParseAssignmentExpression());
+      ExprResult VarExpr = ParseAssignmentExpression();
       if (VarExpr.isUsable()) {
         Vars.push_back(VarExpr.get());
       } else {
@@ -4896,6 +4889,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
     SourceLocation ELoc = ConsumeToken();
 
     if (getLangOpts().OpenMP >= 52 && Kind == OMPC_linear) {
+      bool Malformed = false;
       while (Tok.isNot(tok::r_paren)) {
         if (Tok.is(tok::identifier)) {
           // identifier could be a linear kind (val, uval, ref) or step
@@ -4932,6 +4926,11 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
             ModifierFound = true;
           } else {
             StepFound = parseStepSize(*this, Data, Kind, Tok.getLocation());
+            if (!StepFound) {
+              Malformed = true;
+              SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end,
+                        StopBeforeMatch);
+            }
           }
         } else {
           // parse an integer expression as step size
@@ -4943,7 +4942,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
         if (Tok.is(tok::r_paren) || Tok.is(tok::annot_pragma_openmp_end))
           break;
       }
-      if (!StepFound && !ModifierFound)
+      if (!Malformed && !StepFound && !ModifierFound)
         Diag(ELoc, diag::err_expected_expression);
     } else {
       // for OMPC_aligned and OMPC_linear (with OpenMP <= 5.1)
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index c00759893b0c4..434ea68442819 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -602,7 +602,7 @@ StmtResult Parser::ParseSEHExceptBlock(SourceLocation ExceptLoc) {
   {
     ParseScopeFlags FilterScope(this, getCurScope()->getFlags() |
                                           Scope::SEHFilterScope);
-    FilterExpr = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+    FilterExpr = ParseExpression();
   }
 
   if (getLangOpts().Borland) {
@@ -1832,11 +1832,7 @@ StmtResult Parser::ParseDoStatement() {
 
   SourceLocation Start = Tok.getLocation();
   ExprResult Cond = ParseExpression();
-  // Correct the typos in condition before closing the scope.
-  if (Cond.isUsable())
-    Cond = Actions.CorrectDelayedTyposInExpr(Cond, /*InitDecl=*/nullptr,
-                                             /*RecoverUncorrectedTypos=*/true);
-  else {
+  if (!Cond.isUsable()) {
     if (!Tok.isOneOf(tok::r_paren, tok::r_square, tok::r_brace))
       SkipUntil(tok::semi);
     Cond = Actions.CreateRecoveryExpr(
@@ -2018,7 +2014,7 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) {
     }
   } else {
     ProhibitAttributes(attrs);
-    Value = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+    Value = ParseExpression();
 
     ForEach = isTokIdentifier_in();
 
@@ -2177,12 +2173,10 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) {
   StmtResult ForEachStmt;
 
   if (ForRangeInfo.ParsedForRangeDecl()) {
-    ExprResult CorrectedRange =
-        Actions.CorrectDelayedTyposInExpr(ForRangeInfo.RangeExpr.get());
     ForRangeStmt = Actions.ActOnCXXForRangeStmt(
         getCurScope(), ForLoc, CoawaitLoc, FirstPart.get(),
-        ForRangeInfo.LoopVar.get(), ForRangeInfo.ColonLoc, CorrectedRange.get(),
-        T.getCloseLocation(), Sema::BFRK_Build,
+        ForRangeInfo.LoopVar.get(), ForRangeInfo.ColonLoc,
+        ForRangeInfo.RangeExpr.get(), T.getCloseLocation(), Sema::BFRK_Build,
         ForRangeInfo.LifetimeExtendTemps);
   } else if (ForEach) {
     // Similarly, we need to do the semantic analysis for a for-range
diff --git a/clang/lib/Parse/ParseStmtAsm.cpp b/clang/lib/Parse/ParseStmtAsm.cpp
index f2417479a0e78..182907df56070 100644
--- a/clang/lib/Parse/ParseStmtAsm.cpp
+++ b/clang/lib/Parse/ParseStmtAsm.cpp
@@ -864,7 +864,7 @@ bool Parser::ParseAsmOperandsOpt(SmallVectorImpl<IdentifierInfo *> &Names,
     // Read the parenthesized expression.
     BalancedDelimiterTracker T(*this, tok::l_paren);
     T.consumeOpen();
-    ExprResult Res = Actions.CorrectDelayedTyposInExpr(ParseExpression());
+    ExprResult Res = ParseExpression();
     T.consumeClose();
     if (Res.isInvalid()) {
       SkipUntil(tok::r_paren, StopAtSemi);
diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp
index d3c9ca029c9aa..a16dbe95b788d 100644
--- a/clang/lib/Parse/ParseTemplate.cpp
+++ b/clang/lib/Parse/ParseTemplate.cpp
@@ -296,8 +296,7 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo,
     return nullptr;
   }
 
-  ExprResult ConstraintExprResult =
-      Actions.CorrectDelayedTyposInExpr(ParseConstraintExpression());
+  ExprResult ConstraintExprResult = ParseConstraintExpression();
   if (ConstraintExprResult.isInvalid()) {
     SkipUntil(tok::semi);
     if (D)
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 9826abc0c3b40..42ebf2a508a26 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -1227,15 +1227,6 @@ void Sema::ActOnEndOfTranslationUnitFragment(TUFragmentKind Kind) {
   assert(LateParsedInstantiations.empty() &&
          "end of TU template instantiation should not create more "
          "late-parsed templates");
-
-  // Report diagnostics for uncorrected delayed typos. Ideally all of them
-  // should have been corrected by that time, but it is very hard to cover all
-  // cases in practice.
-  for (const auto &Typo : DelayedTypos) {
-    // We pass an empty TypoCorrection to indicate no correction was performed.
-    Typo.second.DiagHandler(TypoCorrection());
-  }
-  DelayedTypos.clear();
 }
 
 void Sema::ActOnEndOfTranslationUnit() {
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 8f8e1ceb7197e..69276ce418fa6 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2648,8 +2648,6 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     bool IsDelete = BuiltinID == Builtin::BI__builtin_operator_delete;
     ExprResult Res =
         BuiltinOperatorNewDeleteOverloaded(TheCallResult, IsDelete);
-    if (Res.isInvalid())
-      CorrectDelayedTyposInExpr(TheCallResult.get());
     return Res;
   }
   case Builtin::BI__builtin_dump_struct:
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index 425b32e53a7b7..a1389c6c034b1 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -309,15 +309,6 @@ static ExprResult buildMemberCall(Sema &S, Expr *Base, SourceLocation Loc,
   if (Result.isInvalid())
     return ExprError();
 
-  // We meant exactly what we asked for. No need for typo correction.
-  if (auto *TE = dyn_cast<TypoExpr>(Result.get())) {
-    S.clearDelayedTypo(TE);
-    S.Diag(Loc, diag::err_no_member)
-        << NameInfo.getName() << Base->getType()->getAsCXXRecordDecl()
-        << Base->getSourceRange();
-    return ExprError();
-  }
-
   auto EndLoc = Args.empty() ? Loc : Args.back()->getEndLoc();
   return S.BuildCallExpr(nullptr, Result.get(), Loc, Args, EndLoc, nullptr);
 }
@@ -811,7 +802,6 @@ ExprResult Sema::ActOnCoawaitExpr(Scope *S, SourceLocation Loc, Expr *E) {
     return ExprError();
 
   if (!ActOnCoroutineBodyStart(S, Loc, "co_await")) {
-    CorrectDelayedTyposInExpr(E);
     return ExprError();
   }
 
@@ -970,7 +960,6 @@ ExprResult Sema::ActOnCoyieldExpr(Scope *S, SourceLocation Loc, Expr *E) {
     return ExprError();
 
   if (!ActOnCoroutineBodyStart(S, Loc, "co_yield")) {
-    CorrectDelayedTyposInExpr(E);
     return ExprError();
   }
 
@@ -1025,7 +1014,6 @@ ExprResult Sema::BuildCoyieldExpr(SourceLocation Loc, Expr *E) {
 
 StmtResult Sema::ActOnCoreturnStmt(Scope *S, SourceLocation Loc, Expr *E) {
   if (!ActOnCoroutineBodyStart(S, Loc, "co_return")) {
-    CorrectDelayedTyposInExpr(E);
     return StmtError();
   }
   return BuildCoreturnStmt(Loc, E);
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index bbd63372c168b..c152f406b4977 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13584,7 +13584,6 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
   // If there is no declaration, there was an error parsing it.  Just ignore
   // the initializer.
   if (!RealDecl) {
-    CorrectDelayedTyposInExpr(Init, dyn_cast_or_null<VarDecl>(RealDecl));
     return;
   }
 
@@ -13607,12 +13606,8 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
   }
 
   if (VDecl->isInvalidDecl()) {
-    ExprResult Res = CorrectDelayedTyposInExpr(Init, VDecl);
-    SmallVector<Expr *> SubExprs;
-    if (Res.isUsable())
-      SubExprs.push_back(Res.get());
     ExprResult Recovery =
-        CreateRecoveryExpr(Init->getBeginLoc(), Init->getEndLoc(), SubExprs);
+        CreateRecoveryExpr(Init->getBeginLoc(), Init->getEndLoc(), {Init});
     if (Expr *E = Recovery.get())
       VDecl->setInit(E);
     return;
@@ -13627,23 +13622,12 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
 
   // C++11 [decl.spec.auto]p6. Deduce the type which 'auto' stands in for.
   if (VDecl->getType()->isUndeducedType()) {
-    // Attempt typo correction early so that the type of the init expression can
-    // be deduced based on the chosen correction if the original init contains a
-    // TypoExpr.
-    ExprResult Res = CorrectDelayedTyposInExpr(Init, VDecl);
-    if (!Res.isUsable()) {
-      // There are unresolved typos in Init, just drop them.
-      // FIXME: improve the recovery strategy to preserve the Init.
-      RealDecl->setInvalidDecl();
-      return;
-    }
-    if (Res.get()->containsErrors()) {
+    if (Init->containsErrors()) {
       // Invalidate the decl as we don't know the type for recovery-expr yet.
       RealDecl->setInvalidDecl();
-      VDecl->setInit(Res.get());
+      VDecl->setInit(Init);
       return;
     }
-    Init = Res.get();
 
     if (DeduceVariableDeclarationType(VDecl, DirectInit, Init))
       return;
@@ -13789,23 +13773,6 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
       InitializedFromParenListExpr = true;
     }
 
-    // Try to correct any TypoExprs in the initialization arguments.
-    for (size_t Idx = 0; Idx < Args.size(); ++Idx) {
-      ExprResult Res = CorrectDelayedTyposInExpr(
-          Args[Idx], VDecl, /*RecoverUncorrectedTypos=*/true,
-          [this, Entity, Kind](Expr *E) {
-            InitializationSequence Init(*this, Entity, Kind, MultiExprArg(E));
-            return Init.Failed() ? ExprError() : E;
-          });
-      if (!Res.isUsable()) {
-        VDecl->setInvalidDecl();
-      } else if (Res.get() != Args[Idx]) {
-        Args[Idx] = Res.get();
-      }
-    }
-    if (VDecl->isInvalidDecl())
-      return;
-
     InitializationSequence InitSeq(*this, Entity, Kind, Args,
                                    /*TopLevelOfInitList=*/false,
                                    /*TreatUnavailableAsInvalid=*/false);
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 39d4d49a0fe79..31e2834336742 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -4154,10 +4154,6 @@ ExprResult Sema::ActOnRequiresClause(ExprResult ConstraintExpr) {
   if (ConstraintExpr.isInvalid())
     return ExprError();
 
-  ConstraintExpr = CorrectDelayedTyposInExpr(ConstraintExpr);
-  if (ConstraintExpr.isInvalid())
-    return ExprError();
-
   if (DiagnoseUnexpandedParameterPack(ConstraintExpr.get(),
                                       UPPC_RequiresClause))
     return ExprError();
@@ -4207,23 +4203,20 @@ void Sema::ActOnFinishCXXInClassMemberInitializer(Decl *D,
     return;
   }
 
-  ExprResult Init = CorrectDelayedTyposInExpr(InitExpr, /*InitDecl=*/nullptr,
-                                              /*RecoverUncorrectedTypos=*/true);
-  assert(Init.isUsable() && "Init should at least have a RecoveryExpr");
-  if (!FD->getType()->isDependentType() && !Init.get()->isTypeDependent()) {
-    Init = ConvertMemberDefaultInitExpression(FD, Init.get(), InitLoc);
+  if (!FD->getType()->isDependentType() && !InitExpr.get()->isTypeDependent()) {
+    InitExpr = ConvertMemberDefaultInitExpression(FD, InitExpr.get(), InitLoc);
     // C++11 [class.base.init]p7:
     //   The initialization of each base and member constitutes a
     //   full-expression.
-    if (!Init.isInvalid())
-      Init = ActOnFinishFullExpr(Init.get(), /*DiscarededValue=*/false);
-    if (Init.isInvalid()) {
+    if (!InitExpr.isInvalid())
+      InitExpr = ActOnFinishFullExpr(InitExpr.get(), /*DiscarededValue=*/false);
+    if (InitExpr.isInvalid()) {
       FD->setInvalidDecl();
       return;
     }
   }
 
-  FD->setInClassInitializer(Init.get());
+  FD->setInClassInitializer(InitExpr.get());
 }
 
 /// Find the direct and/or virtual base specifiers that
@@ -4393,13 +4386,7 @@ Sema::BuildMemInitializer(Decl *ConstructorD,
                           SourceLocation IdLoc,
                           Expr *Init,
                           SourceLocation EllipsisLoc) {
-  ExprResult Res = CorrectDelayedTyposInExpr(Init, /*InitDecl=*/nullptr,
-                                             /*RecoverUncorrectedTypos=*/true);
-  if (!Res.isUsable())
-    return true;
-  Init = Res.get();
-
-  if (!ConstructorD)
+  if (!ConstructorD || !Init)
     return true;
 
   AdjustDeclIfTemplate(ConstructorD);
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index c692f824da422..0a6cea8869c14 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1368,7 +1368,6 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
   case Expr::UnaryExprOrTypeTraitExprClass:
   case Expr::UnresolvedLookupExprClass:
   case Expr::UnresolvedMemberExprClass:
-  case Expr::TypoExprClass:
     // FIXME: Many of the above can throw.
     return CT_Cannot;
 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index c7abbbd6993de..b7031bc8c0220 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2544,8 +2544,7 @@ bool Sema::DiagnoseDependentMemberLookup(const LookupResult &R) {
 bool Sema::DiagnoseEmptyLookup(Scope *S, CXXScopeSpec &SS, LookupResult &R,
                                CorrectionCandidateCallback &CCC,
                                TemplateArgumentListInfo *ExplicitTemplateArgs,
-                               ArrayRef<Expr *> Args, DeclContext *LookupCtx,
-                               TypoExpr **Out) {
+                               ArrayRef<Expr *> Args, DeclContext *LookupCtx) {
   DeclarationName Name = R.getLookupName();
   SourceRange NameRange = R.getLookupNameInfo().getSourceRange();
 
@@ -2604,21 +2603,9 @@ bool Sema::DiagnoseEmptyLookup(Scope *S, CXXScopeSpec &SS, LookupResult &R,
 
   // We didn't find anything, so try to correct for a typo.
   TypoCorrection Corrected;
-  if (S && Out) {
-    assert(!ExplicitTemplateArgs &&
-           "Diagnosing an empty lookup with explicit template args!");
-    *Out = CorrectTypoDelayed(
-        R.getLookupNameInfo(), R.getLookupKind(), S, &SS, CCC,
-        [=](const TypoCorrection &TC) {
-          emitEmptyLookupTypoDiagnostic(TC, *this, SS, Name, NameRange,
-                                        diagnostic, diagnostic_suggest);
-        },
-        nullptr, CorrectTypoKind::ErrorRecovery, LookupCtx);
-    if (*Out)
-      return true;
-  } else if (S && (Corrected = CorrectTypo(
-                       R.getLookupNameInfo(), R.getLookupKind(), S, &SS, CCC,
-                       CorrectTypoKind::ErrorRecovery, LookupCtx))) {
+  if (S && (Corrected =
+                CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), S, &SS,
+                            CCC, CorrectTypoKind::ErrorRecovery, LookupCtx))) {
     std::string CorrectedStr(Corrected.getAsString(getLangOpts()));
     bool DroppedSpecifier =
         Corrected.WillReplaceSpecifier() && Name.getAsString() == CorrectedStr;
@@ -2880,7 +2867,6 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
 
     // If this name wasn't predeclared and if this is not a function
     // call, diagnose the problem.
-    TypoExpr *TE = nullptr;
     DefaultFilterCCC DefaultValidator(II, SS.isValid() ? SS.getScopeRep()
                                                        : nullptr);
     DefaultValidator.IsAddressOfOperand = IsAddressOfOperand;
@@ -2896,29 +2882,8 @@ Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
     // a template name, but we happen to have always already looked up the name
     // before we get here if it must be a template name.
     if (DiagnoseEmptyLookup(S, SS, R, CCC ? *CCC : DefaultValidator, nullptr,
-                            {}, nullptr, &TE)) {
-      if (TE && KeywordReplacement) {
-        auto &State = getTypoExprState(TE);
-        auto BestTC = State.Consumer->getNextCorrection();
-        if (BestTC.isKeyword()) {
-          auto *II = BestTC.getCorrectionAsIdentifierInfo();
-          if (State.DiagHandler)
-            State.DiagHandler(BestTC);
-          KeywordReplacement->startToken();
-          KeywordReplacement->setKind(II->getTokenID());
-          KeywordReplacement->setIdentifierInfo(II);
-          KeywordReplacement->setLocation(BestTC.getCorrectionRange().getBegin());
-          // Clean up the state associated with the TypoExpr, since it has
-          // now been diagnosed (without a call to CorrectDelayedTyposInExpr).
-          clearDelayedTypo(TE);
-          // Signal that a correction to a keyword was performed by returning a
-          // valid-but-null ExprResult.
-          return (Expr*)nullptr;
-        }
-        State.Consumer->resetCorrectionStream();
-      }
-      return TE ? TE : ExprError();
-    }
+                            {}, nullptr))
+      return ExprError();
 
     assert(!R.empty() &&
            "DiagnoseEmptyLookup returned false but added no results");
@@ -7009,40 +6974,6 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl,
                          CurFPFeatureOverrides(), NumParams, UsesADL);
   }
 
-  if (!Context.isDependenceAllowed()) {
-    // Forget about the nulled arguments since typo correction
-    // do not handle them well.
-    TheCall->shrinkNumArgs(Args.size());
-    // C cannot always handle TypoExpr nodes in builtin calls and direct
-    // function calls as their argument checking don't necessarily handle
-    // dependent types properly, so make sure any TypoExprs have been
-    // dealt with.
-    ExprResult Result = CorrectDelayedTyposInExpr(TheCall);
-    if (!Result.isUsable()) return ExprError();
-    CallExpr *TheOldCall = TheCall;
-    TheCall = dyn_cast<CallExpr>(Result.get());
-    bool CorrectedTypos = TheCall != TheOldCall;
-    if (!TheCall) return Result;
-    Args = llvm::ArrayRef(TheCall->getArgs(), TheCall->getNumArgs());
-
-    // A new call expression node was created if some typos were corrected.
-    // However it may not have been constructed with enough storage. In this
-    // case, rebuild the node with enough storage. The waste of space is
-    // immaterial since this only happens when some typos were corrected.
-    if (CorrectedTypos && Args.size() < NumParams) {
-      if (Config)
-        TheCall = CUDAKernelCallExpr::Create(
-            Context, Fn, cast<CallExpr>(Config), Args, ResultTy, VK_PRValue,
-            RParenLoc, CurFPFeatureOverrides(), NumParams);
-      else
-        TheCall =
-            CallExpr::Create(Context, Fn, Args, ResultTy, VK_PRValue, RParenLoc,
-                             CurFPFeatureOverrides(), NumParams, UsesADL);
-    }
-    // We can now handle the nulled arguments for the default arguments.
-    TheCall->setNumArgsUnsafe(std::max<unsigned>(Args.size(), NumParams));
-  }
-
   // Bail out early if calling a builtin with custom type checking.
   if (BuiltinID && Context.BuiltinInfo.hasCustomTypechecking(BuiltinID)) {
     ExprResult E = CheckBuiltinFunctionCall(FDecl, BuiltinID, TheCall);
@@ -7933,12 +7864,6 @@ Sema::ActOnCastExpr(Scope *S, SourceLocation LParenLoc,
   if (getLangOpts().CPlusPlus) {
     // Check that there are no default arguments (C++ only).
     CheckExtraCXXDefaultArguments(D);
-  } else {
-    // Make sure any TypoExprs have been dealt with.
-    ExprResult Res = CorrectDelayedTyposInExpr(CastExpr);
-    if (!Res.isUsable())
-      return ExprError();
-    CastExpr = Res.get();
   }
 
   checkUnusedDeclAttributes(D);
@@ -8984,30 +8909,6 @@ ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc,
                                     SourceLocation ColonLoc,
                                     Expr *CondExpr, Expr *LHSExpr,
                                     Expr *RHSExpr) {
-  if (!Context.isDependenceAllowed()) {
-    // C cannot handle TypoExpr nodes in the condition because it
-    // doesn't handle dependent types properly, so make sure any TypoExprs have
-    // been dealt with before checking the operands.
-    ExprResult CondResult = CorrectDelayedTyposInExpr(CondExpr);
-    ExprResult LHSResult = CorrectDelayedTyposInExpr(LHSExpr);
-    ExprResult RHSResult = CorrectDelayedTyposInExpr(RHSExpr);
-
-    if (!CondResult.isUsable())
-      return ExprError();
-
-    if (LHSExpr) {
-      if (!LHSResult.isUsable())
-        return ExprError();
-    }
-
-    if (!RHSResult.isUsable())
-      return ExprError();
-
-    CondExpr = CondResult.get();
-    LHSExpr = LHSResult.get();
-    RHSExpr = RHSResult.get();
-  }
-
   // If this is the gnu "x ?: y" extension, analyze the types as though the LHS
   // was the condition.
   OpaqueValueExpr *opaqueValue = nullptr;
@@ -15068,28 +14969,6 @@ static ExprResult convertHalfVecBinOp(Sema &S, ExprResult LHS, ExprResult RHS,
   return convertVector(BO, ResultTy->castAs<VectorType>()->getElementType(), S);
 }
 
-static std::pair<ExprResult, ExprResult>
-CorrectDelayedTyposInBinOp(Sema &S, BinaryOperatorKind Opc, Expr *LHSExpr,
-                           Expr *RHSExpr) {
-  ExprResult LHS = LHSExpr, RHS = RHSExpr;
-  if (!S.Context.isDependenceAllowed()) {
-    // C cannot handle TypoExpr nodes on either side of a binop because it
-    // doesn't handle dependent types properly, so make sure any TypoExprs have
-    // been dealt with before checking the operands.
-    LHS = S.CorrectDelayedTyposInExpr(LHS);
-    RHS = S.CorrectDelayedTyposInExpr(
-        RHS, /*InitDecl=*/nullptr, /*RecoverUncorrectedTypos=*/false,
-        [Opc, LHS](Expr *E) {
-          if (Opc != BO_Assign)
-            return ExprResult(E);
-          // Avoid correcting the RHS to the same Expr as the LHS.
-          Decl *D = getDeclFromExpr(E);
-          return (D && D == getDeclFromExpr(LHS.get())) ? ExprError() : E;
-        });
-  }
-  return std::make_pair(LHS, RHS);
-}
-
 /// Returns true if conversion between vectors of halfs and vectors of floats
 /// is needed.
 static bool needsConversionOfHalfVec(bool OpRequiresConversion, ASTContext &Ctx,
@@ -15146,7 +15025,6 @@ ExprResult Sema::CreateBuiltinBinOp(SourceLocation OpLoc,
   ExprObjectKind OK = OK_Ordinary;
   bool ConvertHalfVec = false;
 
-  std::tie(LHS, RHS) = CorrectDelayedTyposInBinOp(*this, Opc, LHSExpr, RHSExpr);
   if (!LHS.isUsable() || !RHS.isUsable())
     return ExprError();
 
@@ -15662,12 +15540,8 @@ static ExprResult BuildOverloadedBinOp(Sema &S, Scope *Sc, SourceLocation OpLoc,
 ExprResult Sema::BuildBinOp(Scope *S, SourceLocation OpLoc,
                             BinaryOperatorKind Opc, Expr *LHSExpr,
                             Expr *RHSExpr, bool ForFoldExpression) {
-  ExprResult LHS, RHS;
-  std::tie(LHS, RHS) = CorrectDelayedTyposInBinOp(*this, Opc, LHSExpr, RHSExpr);
-  if (!LHS.isUsable() || !RHS.isUsable())
+  if (!LHSExpr || !RHSExpr)
     return ExprError();
-  LHSExpr = LHS.get();
-  RHSExpr = RHS.get();
 
   // We want to end up calling one of SemaPseudoObject::checkAssignment
   // (if the LHS is a pseudo-object), BuildOverloadedBinOp (if
@@ -18194,8 +18068,6 @@ HandleImmediateInvocations(Sema &SemaRef,
 
 void Sema::PopExpressionEvaluationContext() {
   ExpressionEvaluationContextRecord& Rec = ExprEvalContexts.back();
-  unsigned NumTypos = Rec.NumTypos;
-
   if (!Rec.Lambdas.empty()) {
     using ExpressionKind = ExpressionEvaluationContextRecord::ExpressionKind;
     if (!getLangOpts().CPlusPlus20 &&
@@ -18263,9 +18135,6 @@ void Sema::PopExpressionEvaluationContext() {
 
   // Pop the current expression evaluation context off the stack.
   ExprEvalContexts.pop_back();
-
-  // The global expression evaluation context record is never popped.
-  ExprEvalContexts.back().NumTypos += NumTypos;
 }
 
 void Sema::DiscardCleanupsInEvaluationContext() {
@@ -20023,8 +19892,6 @@ ExprResult Sema::CheckLValueToRValueConversionOperand(Expr *E) {
 }
 
 ExprResult Sema::ActOnConstantExpression(ExprResult Res) {
-  Res = CorrectDelayedTyposInExpr(Res);
-
   if (!Res.isUsable())
     return Res;
 
@@ -21350,15 +21217,6 @@ static ExprResult diagnoseUnknownAnyExpr(Sema &S, Expr *E) {
 }
 
 ExprResult Sema::CheckPlaceholderExpr(Expr *E) {
-  if (!Context.isDependenceAllowed()) {
-    // C cannot handle TypoExpr nodes on either side of a binop because it
-    // doesn't handle dependent types properly, so make sure any TypoExprs have
-    // been dealt with before checking the operands.
-    ExprResult Result = CorrectDelayedTyposInExpr(E);
-    if (!Result.isUsable()) return ExprError();
-    E = Result.get();
-  }
-
   const BuiltinType *placeholderType = E->getType()->getAsPlaceholderType();
   if (!placeholderType) return E;
 
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index c106ea749170f..c653cb56351cb 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1500,13 +1500,7 @@ Sema::ActOnCXXTypeConstructExpr(ParsedType TypeRep,
 
   auto Result = BuildCXXTypeConstructExpr(TInfo, LParenOrBraceLoc, exprs,
                                           RParenOrBraceLoc, ListInitialization);
-  // Avoid creating a non-type-dependent expression that contains typos.
-  // Non-type-dependent expressions are liable to be discarded without
-  // checking for embedded typos.
-  if (!Result.isInvalid() && Result.get()->isInstantiationDependent() &&
-      !Result.get()->isTypeDependent())
-    Result = CorrectDelayedTyposInExpr(Result.get());
-  else if (Result.isInvalid())
+  if (Result.isInvalid())
     Result = CreateRecoveryExpr(TInfo->getTypeLoc().getBeginLoc(),
                                 RParenOrBraceLoc, exprs, Ty);
   return Result;
@@ -7698,372 +7692,6 @@ static ExprResult attemptRecovery(Sema &SemaRef,
                                           /*AcceptInvalidDecl*/ true);
 }
 
-namespace {
-class FindTypoExprs : public DynamicRecursiveASTVisitor {
-  llvm::SmallSetVector<TypoExpr *, 2> &TypoExprs;
-
-public:
-  explicit FindTypoExprs(llvm::SmallSetVector<TypoExpr *, 2> &TypoExprs)
-      : TypoExprs(TypoExprs) {}
-  bool VisitTypoExpr(TypoExpr *TE) override {
-    TypoExprs.insert(TE);
-    return true;
-  }
-};
-
-class TransformTypos : public TreeTransform<TransformTypos> {
-  typedef TreeTransform<TransformTypos> BaseTransform;
-
-  VarDecl *InitDecl; // A decl to avoid as a correction because it is in the
-                     // process of being initialized.
-  llvm::function_ref<ExprResult(Expr *)> ExprFilter;
-  llvm::SmallSetVector<TypoExpr *, 2> TypoExprs, AmbiguousTypoExprs;
-  llvm::SmallDenseMap<TypoExpr *, ExprResult, 2> TransformCache;
-  llvm::SmallDenseMap<OverloadExpr *, Expr *, 4> OverloadResolution;
-
-  /// Emit diagnostics for all of the TypoExprs encountered.
-  ///
-  /// If the TypoExprs were successfully corrected, then the diagnostics should
-  /// suggest the corrections. Otherwise the diagnostics will not suggest
-  /// anything (having been passed an empty TypoCorrection).
-  ///
-  /// If we've failed to correct due to ambiguous corrections, we need to
-  /// be sure to pass empty corrections and replacements. Otherwise it's
-  /// possible that the Consumer has a TypoCorrection that failed to ambiguity
-  /// and we don't want to report those diagnostics.
-  void EmitAllDiagnostics(bool IsAmbiguous) {
-    for (TypoExpr *TE : TypoExprs) {
-      auto &State = SemaRef.getTypoExprState(TE);
-      if (State.DiagHandler) {
-        TypoCorrection TC = IsAmbiguous
-            ? TypoCorrection() : State.Consumer->getCurrentCorrection();
-        ExprResult Replacement = IsAmbiguous ? ExprError() : TransformCache[TE];
-
-        // Extract the NamedDecl from the transformed TypoExpr and add it to the
-        // TypoCorrection, replacing the existing decls. This ensures the right
-        // NamedDecl is used in diagnostics e.g. in the case where overload
-        // resolution was used to select one from several possible decls that
-        // had been stored in the TypoCorrection.
-        if (auto *ND = getDeclFromExpr(
-                Replacement.isInvalid() ? nullptr : Replacement.get()))
-          TC.setCorrectionDecl(ND);
-
-        State.DiagHandler(TC);
-      }
-      SemaRef.clearDelayedTypo(TE);
-    }
-  }
-
-  /// Try to advance the typo correction state of the first unfinished TypoExpr.
-  /// We allow advancement of the correction stream by removing it from the
-  /// TransformCache which allows `TransformTypoExpr` to advance during the
-  /// next transformation attempt.
-  ///
-  /// Any substitution attempts for the previous TypoExprs (which must have been
-  /// finished) will need to be retried since it's possible that they will now
-  /// be invalid given the latest advancement.
-  ///
-  /// We need to be sure that we're making progress - it's possible that the
-  /// tree is so malformed that the transform never makes it to the
-  /// `TransformTypoExpr`.
-  ///
-  /// Returns true if there are any untried correction combinations.
-  bool CheckAndAdvanceTypoExprCorrectionStreams() {
-    for (auto *TE : TypoExprs) {
-      auto &State = SemaRef.getTypoExprState(TE);
-      TransformCache.erase(TE);
-      if (!State.Consumer->hasMadeAnyCorrectionProgress())
-        return false;
-      if (!State.Consumer->finished())
-        return true;
-      State.Consumer->resetCorrectionStream();
-    }
-    return false;
-  }
-
-  NamedDecl *getDeclFromExpr(Expr *E) {
-    if (auto *OE = dyn_cast_or_null<OverloadExpr>(E))
-      E = OverloadResolution[OE];
-
-    if (!E)
-      return nullptr;
-    if (auto *DRE = dyn_cast<DeclRefExpr>(E))
-      return DRE->getFoundDecl();
-    if (auto *ME = dyn_cast<MemberExpr>(E))
-      return ME->getFoundDecl();
-    // FIXME: Add any other expr types that could be seen by the delayed typo
-    // correction TreeTransform for which the corresponding TypoCorrection could
-    // contain multiple decls.
-    return nullptr;
-  }
-
-  ExprResult TryTransform(Expr *E) {
-    Sema::SFINAETrap Trap(SemaRef);
-    ExprResult Res = TransformExpr(E);
-    if (Trap.hasErrorOccurred() || Res.isInvalid())
-      return ExprError();
-
-    return ExprFilter(Res.get());
-  }
-
-  // Since correcting typos may intoduce new TypoExprs, this function
-  // checks for new TypoExprs and recurses if it finds any. Note that it will
-  // only succeed if it is able to correct all typos in the given expression.
-  ExprResult CheckForRecursiveTypos(ExprResult Res, bool &IsAmbiguous) {
-    if (Res.isInvalid()) {
-      return Res;
-    }
-    // Check to see if any new TypoExprs were created. If so, we need to recurse
-    // to check their validity.
-    Expr *FixedExpr = Res.get();
-
-    auto SavedTypoExprs = std::move(TypoExprs);
-    auto SavedAmbiguousTypoExprs = std::move(AmbiguousTypoExprs);
-    TypoExprs.clear();
-    AmbiguousTypoExprs.clear();
-
-    FindTypoExprs(TypoExprs).TraverseStmt(FixedExpr);
-    if (!TypoExprs.empty()) {
-      // Recurse to handle newly created TypoExprs. If we're not able to
-      // handle them, discard these TypoExprs.
-      ExprResult RecurResult =
-          RecursiveTransformLoop(FixedExpr, IsAmbiguous);
-      if (RecurResult.isInvalid()) {
-        Res = ExprError();
-        // Recursive corrections didn't work, wipe them away and don't add
-        // them to the TypoExprs set. Remove them from Sema's TypoExpr list
-        // since we don't want to clear them twice. Note: it's possible the
-        // TypoExprs were created recursively and thus won't be in our
-        // Sema's TypoExprs - they were created in our `RecursiveTransformLoop`.
-        auto &SemaTypoExprs = SemaRef.TypoExprs;
-        for (auto *TE : TypoExprs) {
-          TransformCache.erase(TE);
-          SemaRef.clearDelayedTypo(TE);
-
-          auto SI = find(SemaTypoExprs, TE);
-          if (SI != SemaTypoExprs.end()) {
-            SemaTypoExprs.erase(SI);
-          }
-        }
-      } else {
-        // TypoExpr is valid: add newly created TypoExprs since we were
-        // able to correct them.
-        Res = RecurResult;
-        SavedTypoExprs.set_union(TypoExprs);
-      }
-    }
-
-    TypoExprs = std::move(SavedTypoExprs);
-    AmbiguousTypoExprs = std::move(SavedAmbiguousTypoExprs);
-
-    return Res;
-  }
-
-  // Try to transform the given expression, looping through the correction
-  // candidates with `CheckAndAdvanceTypoExprCorrectionStreams`.
-  //
-  // If valid ambiguous typo corrections are seen, `IsAmbiguous` is set to
-  // true and this method immediately will return an `ExprError`.
-  ExprResult RecursiveTransformLoop(Expr *E, bool &IsAmbiguous) {
-    ExprResult Res;
-    auto SavedTypoExprs = std::move(SemaRef.TypoExprs);
-    SemaRef.TypoExprs.clear();
-
-    while (true) {
-      Res = CheckForRecursiveTypos(TryTransform(E), IsAmbiguous);
-
-      // Recursion encountered an ambiguous correction. This means that our
-      // correction itself is ambiguous, so stop now.
-      if (IsAmbiguous)
-        break;
-
-      // If the transform is still valid after checking for any new typos,
-      // it's good to go.
-      if (!Res.isInvalid())
-        break;
-
-      // The transform was invalid, see if we have any TypoExprs with untried
-      // correction candidates.
-      if (!CheckAndAdvanceTypoExprCorrectionStreams())
-        break;
-    }
-
-    // If we found a valid result, double check to make sure it's not ambiguous.
-    if (!IsAmbiguous && !Res.isInvalid() && !AmbiguousTypoExprs.empty()) {
-      auto SavedTransformCache =
-          llvm::SmallDenseMap<TypoExpr *, ExprResult, 2>(TransformCache);
-
-      // Ensure none of the TypoExprs have multiple typo correction candidates
-      // with the same edit length that pass all the checks and filters.
-      while (!AmbiguousTypoExprs.empty()) {
-        auto TE  = AmbiguousTypoExprs.back();
-
-        // TryTransform itself can create new Typos, adding them to the TypoExpr map
-        // and invalidating our TypoExprState, so always fetch it instead of storing.
-        SemaRef.getTypoExprState(TE).Consumer->saveCurrentPosition();
-
-        TypoCorrection TC = SemaRef.getTypoExprState(TE).Consumer->peekNextCorrection();
-        TypoCorrection Next;
-        do {
-          // Fetch the next correction by erasing the typo from the cache and calling
-          // `TryTransform` which will iterate through corrections in
-          // `TransformTypoExpr`.
-          TransformCache.erase(TE);
-          ExprResult AmbigRes = CheckForRecursiveTypos(TryTransform(E), IsAmbiguous);
-
-          if (!AmbigRes.isInvalid() || IsAmbiguous) {
-            SemaRef.getTypoExprState(TE).Consumer->resetCorrectionStream();
-            SavedTransformCache.erase(TE);
-            Res = ExprError();
-            IsAmbiguous = true;
-            break;
-          }
-        } while ((Next = SemaRef.getTypoExprState(TE).Consumer->peekNextCorrection()) &&
-                 Next.getEditDistance(false) == TC.getEditDistance(false));
-
-        if (IsAmbiguous)
-          break;
-
-        AmbiguousTypoExprs.remove(TE);
-        SemaRef.getTypoExprState(TE).Consumer->restoreSavedPosition();
-        TransformCache[TE] = SavedTransformCache[TE];
-      }
-      TransformCache = std::move(SavedTransformCache);
-    }
-
-    // Wipe away any newly created TypoExprs that we don't know about. Since we
-    // clear any invalid TypoExprs in `CheckForRecursiveTypos`, this is only
-    // possible if a `TypoExpr` is created during a transformation but then
-    // fails before we can discover it.
-    auto &SemaTypoExprs = SemaRef.TypoExprs;
-    for (auto Iterator = SemaTypoExprs.begin(); Iterator != SemaTypoExprs.end();) {
-      auto TE = *Iterator;
-      auto FI = find(TypoExprs, TE);
-      if (FI != TypoExprs.end()) {
-        Iterator++;
-        continue;
-      }
-      SemaRef.clearDelayedTypo(TE);
-      Iterator = SemaTypoExprs.erase(Iterator);
-    }
-    SemaRef.TypoExprs = std::move(SavedTypoExprs);
-
-    return Res;
-  }
-
-public:
-  TransformTypos(Sema &SemaRef, VarDecl *InitDecl, llvm::function_ref<ExprResult(Expr *)> Filter)
-      : BaseTransform(SemaRef), InitDecl(InitDecl), ExprFilter(Filter) {}
-
-  ExprResult RebuildCallExpr(Expr *Callee, SourceLocation LParenLoc,
-                                   MultiExprArg Args,
-                                   SourceLocation RParenLoc,
-                                   Expr *ExecConfig = nullptr) {
-    auto Result = BaseTransform::RebuildCallExpr(Callee, LParenLoc, Args,
-                                                 RParenLoc, ExecConfig);
-    if (auto *OE = dyn_cast<OverloadExpr>(Callee)) {
-      if (Result.isUsable()) {
-        Expr *ResultCall = Result.get();
-        if (auto *BE = dyn_cast<CXXBindTemporaryExpr>(ResultCall))
-          ResultCall = BE->getSubExpr();
-        if (auto *CE = dyn_cast<CallExpr>(ResultCall))
-          OverloadResolution[OE] = CE->getCallee();
-      }
-    }
-    return Result;
-  }
-
-  ExprResult TransformLambdaExpr(LambdaExpr *E) { return Owned(E); }
-
-  ExprResult TransformBlockExpr(BlockExpr *E) { return Owned(E); }
-
-  ExprResult Transform(Expr *E) {
-    bool IsAmbiguous = false;
-    ExprResult Res = RecursiveTransformLoop(E, IsAmbiguous);
-
-    if (!Res.isUsable())
-      FindTypoExprs(TypoExprs).TraverseStmt(E);
-
-    EmitAllDiagnostics(IsAmbiguous);
-
-    return Res;
-  }
-
-  ExprResult TransformTypoExpr(TypoExpr *E) {
-    // If the TypoExpr hasn't been seen before, record it. Otherwise, return the
-    // cached transformation result if there is one and the TypoExpr isn't the
-    // first one that was encountered.
-    auto &CacheEntry = TransformCache[E];
-    if (!TypoExprs.insert(E) && !CacheEntry.isUnset()) {
-      return CacheEntry;
-    }
-
-    auto &State = SemaRef.getTypoExprState(E);
-    assert(State.Consumer && "Cannot transform a cleared TypoExpr");
-
-    // For the first TypoExpr and an uncached TypoExpr, find the next likely
-    // typo correction and return it.
-    while (TypoCorrection TC = State.Consumer->getNextCorrection()) {
-      if (InitDecl && TC.getFoundDecl() == InitDecl)
-        continue;
-      // FIXME: If we would typo-correct to an invalid declaration, it's
-      // probably best to just suppress all errors from this typo correction.
-      ExprResult NE = State.RecoveryHandler ?
-          State.RecoveryHandler(SemaRef, E, TC) :
-          attemptRecovery(SemaRef, *State.Consumer, TC);
-      if (!NE.isInvalid()) {
-        // Check whether there may be a second viable correction with the same
-        // edit distance; if so, remember this TypoExpr may have an ambiguous
-        // correction so it can be more thoroughly vetted later.
-        TypoCorrection Next;
-        if ((Next = State.Consumer->peekNextCorrection()) &&
-            Next.getEditDistance(false) == TC.getEditDistance(false)) {
-          AmbiguousTypoExprs.insert(E);
-        } else {
-          AmbiguousTypoExprs.remove(E);
-        }
-        assert(!NE.isUnset() &&
-               "Typo was transformed into a valid-but-null ExprResult");
-        return CacheEntry = NE;
-      }
-    }
-    return CacheEntry = ExprError();
-  }
-};
-}
-
-ExprResult
-Sema::CorrectDelayedTyposInExpr(Expr *E, VarDecl *InitDecl,
-                                bool RecoverUncorrectedTypos,
-                                llvm::function_ref<ExprResult(Expr *)> Filter) {
-  // If the current evaluation context indicates there are uncorrected typos
-  // and the current expression isn't guaranteed to not have typos, try to
-  // resolve any TypoExpr nodes that might be in the expression.
-  if (E && !ExprEvalContexts.empty() && ExprEvalContexts.back().NumTypos &&
-      (E->isTypeDependent() || E->isValueDependent() ||
-       E->isInstantiationDependent())) {
-    auto TyposResolved = DelayedTypos.size();
-    auto Result = TransformTypos(*this, InitDecl, Filter).Transform(E);
-    TyposResolved -= DelayedTypos.size();
-    if (Result.isInvalid() || Result.get() != E) {
-      ExprEvalContexts.back().NumTypos -= TyposResolved;
-      if (Result.isInvalid() && RecoverUncorrectedTypos) {
-        struct TyposReplace : TreeTransform<TyposReplace> {
-          TyposReplace(Sema &SemaRef) : TreeTransform(SemaRef) {}
-          ExprResult TransformTypoExpr(clang::TypoExpr *E) {
-            return this->SemaRef.CreateRecoveryExpr(E->getBeginLoc(),
-                                                    E->getEndLoc(), {});
-          }
-        } TT(*this);
-        return TT.TransformExpr(E);
-      }
-      return Result;
-    }
-    assert(TyposResolved == 0 && "Corrected typo but got same Expr back?");
-  }
-  return E;
-}
-
 ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC,
                                      bool DiscardedValue, bool IsConstexpr,
                                      bool IsTemplateArgument) {
@@ -8095,8 +7723,6 @@ ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC,
     DiagnoseUnusedExprResult(FullExpr.get(), diag::warn_unused_expr);
   }
 
-  FullExpr = CorrectDelayedTyposInExpr(FullExpr.get(), /*InitDecl=*/nullptr,
-                                       /*RecoverUncorrectedTypos=*/true);
   if (FullExpr.isInvalid())
     return ExprError();
 
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index 39c162c3b835d..5dca509d46fdb 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -650,64 +650,11 @@ bool Sema::CheckQualifiedMemberReference(Expr *BaseExpr,
   return true;
 }
 
-namespace {
-
-// Callback to only accept typo corrections that are either a ValueDecl or a
-// FunctionTemplateDecl and are declared in the current record or, for a C++
-// classes, one of its base classes.
-class RecordMemberExprValidatorCCC final : public CorrectionCandidateCallback {
-public:
-  explicit RecordMemberExprValidatorCCC(QualType RTy)
-      : Record(RTy->getAsRecordDecl()) {
-    // Don't add bare keywords to the consumer since they will always fail
-    // validation by virtue of not being associated with any decls.
-    WantTypeSpecifiers = false;
-    WantExpressionKeywords = false;
-    WantCXXNamedCasts = false;
-    WantFunctionLikeCasts = false;
-    WantRemainingKeywords = false;
-  }
-
-  bool ValidateCandidate(const TypoCorrection &candidate) override {
-    NamedDecl *ND = candidate.getCorrectionDecl();
-    // Don't accept candidates that cannot be member functions, constants,
-    // variables, or templates.
-    if (!ND || !(isa<ValueDecl>(ND) || isa<FunctionTemplateDecl>(ND)))
-      return false;
-
-    // Accept candidates that occur in the current record.
-    if (Record->containsDecl(ND))
-      return true;
-
-    if (const auto *RD = dyn_cast<CXXRecordDecl>(Record)) {
-      // Accept candidates that occur in any of the current class' base classes.
-      for (const auto &BS : RD->bases()) {
-        if (const auto *BSTy = BS.getType()->getAs<RecordType>()) {
-          if (BSTy->getDecl()->containsDecl(ND))
-            return true;
-        }
-      }
-    }
-
-    return false;
-  }
-
-  std::unique_ptr<CorrectionCandidateCallback> clone() override {
-    return std::make_unique<RecordMemberExprValidatorCCC>(*this);
-  }
-
-private:
-  const RecordDecl *const Record;
-};
-
-}
-
 static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R,
                                      Expr *BaseExpr, QualType RTy,
                                      SourceLocation OpLoc, bool IsArrow,
                                      CXXScopeSpec &SS, bool HasTemplateArgs,
-                                     SourceLocation TemplateKWLoc,
-                                     TypoExpr *&TE) {
+                                     SourceLocation TemplateKWLoc) {
   SourceRange BaseRange = BaseExpr ? BaseExpr->getSourceRange() : SourceRange();
   if (!RTy->isDependentType() &&
       !SemaRef.isThisOutsideMemberFunctionBody(RTy) &&
@@ -724,56 +671,6 @@ static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R,
                                       /*EnteringContext=*/false, TemplateKWLoc);
 
   SemaRef.LookupParsedName(R, /*S=*/nullptr, &SS, ObjectType);
-
-  if (!R.empty() || R.wasNotFoundInCurrentInstantiation())
-    return false;
-
-  DeclarationName Typo = R.getLookupName();
-  SourceLocation TypoLoc = R.getNameLoc();
-  // Recompute the lookup context.
-  DeclContext *DC = SS.isSet() ? SemaRef.computeDeclContext(SS)
-                               : SemaRef.computeDeclContext(RTy);
-
-  struct QueryState {
-    Sema &SemaRef;
-    DeclarationNameInfo NameInfo;
-    Sema::LookupNameKind LookupKind;
-    RedeclarationKind Redecl;
-  };
-  QueryState Q = {R.getSema(), R.getLookupNameInfo(), R.getLookupKind(),
-                  R.redeclarationKind()};
-  RecordMemberExprValidatorCCC CCC(RTy);
-  TE = SemaRef.CorrectTypoDelayed(
-      R.getLookupNameInfo(), R.getLookupKind(), nullptr, &SS, CCC,
-      [=, &SemaRef](const TypoCorrection &TC) {
-        if (TC) {
-          assert(!TC.isKeyword() &&
-                 "Got a keyword as a correction for a member!");
-          bool DroppedSpecifier =
-              TC.WillReplaceSpecifier() &&
-              Typo.getAsString() == TC.getAsString(SemaRef.getLangOpts());
-          SemaRef.diagnoseTypo(TC, SemaRef.PDiag(diag::err_no_member_suggest)
-                                       << Typo << DC << DroppedSpecifier
-                                       << SS.getRange());
-        } else {
-          SemaRef.Diag(TypoLoc, diag::err_no_member)
-              << Typo << DC << (SS.isSet() ? SS.getRange() : BaseRange);
-        }
-      },
-      [=](Sema &SemaRef, TypoExpr *TE, TypoCorrection TC) mutable {
-        LookupResult R(Q.SemaRef, Q.NameInfo, Q.LookupKind, Q.Redecl);
-        R.clear(); // Ensure there's no decls lingering in the shared state.
-        R.suppressDiagnostics();
-        R.setLookupName(TC.getCorrection());
-        for (NamedDecl *ND : TC)
-          R.addDecl(ND);
-        R.resolveKind();
-        return SemaRef.BuildMemberReferenceExpr(
-            BaseExpr, BaseExpr->getType(), OpLoc, IsArrow, SS, SourceLocation(),
-            nullptr, R, nullptr, nullptr);
-      },
-      CorrectTypoKind::ErrorRecovery, DC);
-
   return false;
 }
 
@@ -793,15 +690,11 @@ ExprResult Sema::BuildMemberReferenceExpr(
 
   // Implicit member accesses.
   if (!Base) {
-    TypoExpr *TE = nullptr;
     QualType RecordTy = BaseType;
     if (IsArrow) RecordTy = RecordTy->castAs<PointerType>()->getPointeeType();
     if (LookupMemberExprInRecord(*this, R, nullptr, RecordTy, OpLoc, IsArrow,
-                                 SS, TemplateArgs != nullptr, TemplateKWLoc,
-                                 TE))
+                                 SS, TemplateArgs != nullptr, TemplateKWLoc))
       return ExprError();
-    if (TE)
-      return TE;
 
   // Explicit member accesses.
   } else {
@@ -1396,16 +1289,15 @@ static ExprResult LookupMemberExpr(Sema &S, LookupResult &R,
 
   // Handle field access to simple records.
   if (BaseType->getAsRecordDecl()) {
-    TypoExpr *TE = nullptr;
     if (LookupMemberExprInRecord(S, R, BaseExpr.get(), BaseType, OpLoc, IsArrow,
-                                 SS, HasTemplateArgs, TemplateKWLoc, TE))
+                                 SS, HasTemplateArgs, TemplateKWLoc))
       return ExprError();
 
     // Returning valid-but-null is how we indicate to the caller that
     // the lookup result was filled in. If typo correction was attempted and
     // failed, the lookup result will have been cleared--that combined with the
     // valid-but-null ExprResult will trigger the appropriate diagnostics.
-    return ExprResult(TE);
+    return ExprResult{};
   } else if (BaseType->isDependentType()) {
     R.setNotFoundInCurrentInstantiation();
     return ExprEmpty();
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index 91822909f1fd3..5ad9dd8ed0d3e 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -5444,40 +5444,6 @@ TypoCorrection Sema::CorrectTypo(const DeclarationNameInfo &TypoName,
   return FailedCorrection(Typo, TypoName.getLoc(), RecordFailure && !SecondBestTC);
 }
 
-TypoExpr *Sema::CorrectTypoDelayed(
-    const DeclarationNameInfo &TypoName, Sema::LookupNameKind LookupKind,
-    Scope *S, CXXScopeSpec *SS, CorrectionCandidateCallback &CCC,
-    TypoDiagnosticGenerator TDG, TypoRecoveryCallback TRC, CorrectTypoKind Mode,
-    DeclContext *MemberContext, bool EnteringContext,
-    const ObjCObjectPointerType *OPT) {
-  auto Consumer = makeTypoCorrectionConsumer(
-      TypoName, LookupKind, S, SS, CCC, MemberContext, EnteringContext, OPT,
-      Mode == CorrectTypoKind::ErrorRecovery);
-
-  // Give the external sema source a chance to correct the typo.
-  TypoCorrection ExternalTypo;
-  if (ExternalSource && Consumer) {
-    ExternalTypo = ExternalSource->CorrectTypo(
-        TypoName, LookupKind, S, SS, *Consumer->getCorrectionValidator(),
-        MemberContext, EnteringContext, OPT);
-    if (ExternalTypo)
-      Consumer->addCorrection(ExternalTypo);
-  }
-
-  if (!Consumer || Consumer->empty())
-    return nullptr;
-
-  // Make sure the best edit distance (prior to adding any namespace qualifiers)
-  // is not more that about a third of the length of the typo's identifier.
-  unsigned ED = Consumer->getBestEditDistance(true);
-  IdentifierInfo *Typo = TypoName.getName().getAsIdentifierInfo();
-  if (!ExternalTypo && ED > 0 && Typo->getName().size() / ED < 3)
-    return nullptr;
-  ExprEvalContexts.back().NumTypos++;
-  return createDelayedTypo(std::move(Consumer), std::move(TDG), std::move(TRC),
-                           TypoName.getLoc());
-}
-
 void TypoCorrection::addCorrectionDecl(NamedDecl *CDecl) {
   if (!CDecl) return;
 
@@ -5802,32 +5768,6 @@ void Sema::diagnoseTypo(const TypoCorrection &Correction,
     Diag(Correction.getCorrectionRange().getBegin(), PD);
 }
 
-TypoExpr *Sema::createDelayedTypo(std::unique_ptr<TypoCorrectionConsumer> TCC,
-                                  TypoDiagnosticGenerator TDG,
-                                  TypoRecoveryCallback TRC,
-                                  SourceLocation TypoLoc) {
-  assert(TCC && "createDelayedTypo requires a valid TypoCorrectionConsumer");
-  auto TE = new (Context) TypoExpr(Context.DependentTy, TypoLoc);
-  auto &State = DelayedTypos[TE];
-  State.Consumer = std::move(TCC);
-  State.DiagHandler = std::move(TDG);
-  State.RecoveryHandler = std::move(TRC);
-  if (TE)
-    TypoExprs.push_back(TE);
-  return TE;
-}
-
-const Sema::TypoExprState &Sema::getTypoExprState(TypoExpr *TE) const {
-  auto Entry = DelayedTypos.find(TE);
-  assert(Entry != DelayedTypos.end() &&
-         "Failed to get the state for a TypoExpr!");
-  return Entry->second;
-}
-
-void Sema::clearDelayedTypo(TypoExpr *TE) {
-  DelayedTypos.erase(TE);
-}
-
 void Sema::ActOnPragmaDump(Scope *S, SourceLocation IILoc, IdentifierInfo *II) {
   DeclarationNameInfo Name(II, IILoc);
   LookupResult R(*this, Name, LookupAnyName,
diff --git a/clang/lib/Sema/SemaObjC.cpp b/clang/lib/Sema/SemaObjC.cpp
index 56815cd2731a1..0f39a9817ce7f 100644
--- a/clang/lib/Sema/SemaObjC.cpp
+++ b/clang/lib/Sema/SemaObjC.cpp
@@ -124,17 +124,12 @@ ExprResult SemaObjC::CheckObjCForCollectionOperand(SourceLocation forLoc,
   if (!collection)
     return ExprError();
 
-  ExprResult result = SemaRef.CorrectDelayedTyposInExpr(collection);
-  if (!result.isUsable())
-    return ExprError();
-  collection = result.get();
-
   // Bail out early if we've got a type-dependent expression.
   if (collection->isTypeDependent())
     return collection;
 
   // Perform normal l-value conversion.
-  result = SemaRef.DefaultFunctionArrayLvalueConversion(collection);
+  ExprResult result = SemaRef.DefaultFunctionArrayLvalueConversion(collection);
   if (result.isInvalid())
     return ExprError();
   collection = result.get();
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 89e86f49a3ca8..49e5a311e239e 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -14055,8 +14055,10 @@ FunctionDecl *Sema::ResolveSingleFunctionTemplateSpecialization(
     //   specified and it, along with any default template arguments,
     //   identifies a single function template specialization, then the
     //   template-id is an lvalue for the function template specialization.
-    FunctionTemplateDecl *FunctionTemplate
-      = cast<FunctionTemplateDecl>((*I)->getUnderlyingDecl());
+    FunctionTemplateDecl *FunctionTemplate =
+        dyn_cast<FunctionTemplateDecl>((*I)->getUnderlyingDecl());
+    if (!FunctionTemplate)
+      continue;
 
     // C++ [over.over]p2:
     //   If the name is a function template, template argument deduction is
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 50f5757dff5bc..923a9e81fbd6a 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -535,12 +535,7 @@ Sema::ActOnCaseExpr(SourceLocation CaseLoc, ExprResult Val) {
     return ER;
   };
 
-  ExprResult Converted = CorrectDelayedTyposInExpr(
-      Val, /*InitDecl=*/nullptr, /*RecoverUncorrectedTypos=*/false,
-      CheckAndFinish);
-  if (Converted.get() == Val.get())
-    Converted = CheckAndFinish(Val.get());
-  return Converted;
+  return CheckAndFinish(Val.get());
 }
 
 StmtResult
@@ -2344,7 +2339,7 @@ StmtResult Sema::ActOnForEachLValueExpr(Expr *E) {
 static bool FinishForRangeVarDecl(Sema &SemaRef, VarDecl *Decl, Expr *Init,
                                   SourceLocation Loc, int DiagID) {
   if (Decl->getType()->isUndeducedType()) {
-    ExprResult Res = SemaRef.CorrectDelayedTyposInExpr(Init);
+    ExprResult Res = Init;
     if (!Res.isUsable()) {
       Decl->setInvalidDecl();
       return true;
@@ -3845,10 +3840,7 @@ bool Sema::DeduceFunctionTypeFromReturnExpr(FunctionDecl *FD,
 StmtResult
 Sema::ActOnReturnStmt(SourceLocation ReturnLoc, Expr *RetValExp,
                       Scope *CurScope) {
-  // Correct typos, in case the containing function returns 'auto' and
-  // RetValExp should determine the deduced type.
-  ExprResult RetVal = CorrectDelayedTyposInExpr(
-      RetValExp, nullptr, /*RecoverUncorrectedTypos=*/true);
+  ExprResult RetVal = RetValExp;
   if (RetVal.isInvalid())
     return StmtError();
 
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 17da5fd8325be..b78080c991763 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -782,11 +782,10 @@ ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A,
 ExprResult Sema::BuildCXXAssumeExpr(Expr *Assumption,
                                     const IdentifierInfo *AttrName,
                                     SourceRange Range) {
-  ExprResult Res = CorrectDelayedTyposInExpr(Assumption);
-  if (Res.isInvalid())
+  if (!Assumption)
     return ExprError();
 
-  Res = CheckPlaceholderExpr(Res.get());
+  ExprResult Res = CheckPlaceholderExpr(Assumption);
   if (Res.isInvalid())
     return ExprError();
 
diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp
index 5f0e968ff18c4..572dbf2e7393f 100644
--- a/clang/lib/Sema/SemaTemplateVariadic.cpp
+++ b/clang/lib/Sema/SemaTemplateVariadic.cpp
@@ -741,7 +741,6 @@ ExprResult Sema::CheckPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc,
   if (!Pattern->containsUnexpandedParameterPack()) {
     Diag(EllipsisLoc, diag::err_pack_expansion_without_parameter_packs)
     << Pattern->getSourceRange();
-    CorrectDelayedTyposInExpr(Pattern);
     return ExprError();
   }
 
@@ -1201,11 +1200,9 @@ ExprResult Sema::ActOnPackIndexingExpr(Scope *S, Expr *PackExpression,
                                        SourceLocation RSquareLoc) {
   bool isParameterPack = ::isParameterPack(PackExpression);
   if (!isParameterPack) {
-    if (!PackExpression->containsErrors()) {
-      CorrectDelayedTyposInExpr(IndexExpr);
+    if (!PackExpression->containsErrors())
       Diag(PackExpression->getBeginLoc(), diag::err_expected_name_of_pack)
           << PackExpression;
-    }
     return ExprError();
   }
   ExprResult Res =
@@ -1403,11 +1400,6 @@ ExprResult Sema::ActOnCXXFoldExpr(Scope *S, SourceLocation LParenLoc, Expr *LHS,
   CheckFoldOperand(*this, LHS);
   CheckFoldOperand(*this, RHS);
 
-  auto DiscardOperands = [&] {
-    CorrectDelayedTyposInExpr(LHS);
-    CorrectDelayedTyposInExpr(RHS);
-  };
-
   // [expr.prim.fold]p3:
   //   In a binary fold, op1 and op2 shall be the same fold-operator, and
   //   either e1 shall contain an unexpanded parameter pack or e2 shall contain
@@ -1415,7 +1407,6 @@ ExprResult Sema::ActOnCXXFoldExpr(Scope *S, SourceLocation LParenLoc, Expr *LHS,
   if (LHS && RHS &&
       LHS->containsUnexpandedParameterPack() ==
           RHS->containsUnexpandedParameterPack()) {
-    DiscardOperands();
     return Diag(EllipsisLoc,
                 LHS->containsUnexpandedParameterPack()
                     ? diag::err_fold_expression_packs_both_sides
@@ -1430,7 +1421,6 @@ ExprResult Sema::ActOnCXXFoldExpr(Scope *S, SourceLocation LParenLoc, Expr *LHS,
     Expr *Pack = LHS ? LHS : RHS;
     assert(Pack && "fold expression with neither LHS nor RHS");
     if (!Pack->containsUnexpandedParameterPack()) {
-      DiscardOperands();
       return Diag(EllipsisLoc, diag::err_pack_expansion_without_parameter_packs)
              << Pack->getSourceRange();
     }
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index c8d29f0a625f8..3e33fb73e01b4 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -13121,12 +13121,6 @@ TreeTransform<Derived>::TransformOpaqueValueExpr(OpaqueValueExpr *E) {
   return E;
 }
 
-template<typename Derived>
-ExprResult
-TreeTransform<Derived>::TransformTypoExpr(TypoExpr *E) {
-  return E;
-}
-
 template <typename Derived>
 ExprResult TreeTransform<Derived>::TransformRecoveryExpr(RecoveryExpr *E) {
   llvm::SmallVector<Expr *, 8> Children;
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 01c838b955755..65102b64030c6 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2310,10 +2310,6 @@ void ASTStmtReader::VisitOpaqueValueExpr(OpaqueValueExpr *E) {
   E->setIsUnique(Record.readInt());
 }
 
-void ASTStmtReader::VisitTypoExpr(TypoExpr *E) {
-  llvm_unreachable("Cannot read TypoExpr nodes");
-}
-
 void ASTStmtReader::VisitRecoveryExpr(RecoveryExpr *E) {
   VisitExpr(E);
   unsigned NumArgs = Record.readInt();
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 767e7405752c2..a6e320c7f3eb0 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2314,12 +2314,6 @@ void ASTStmtWriter::VisitOpaqueValueExpr(OpaqueValueExpr *E) {
   Code = serialization::EXPR_OPAQUE_VALUE;
 }
 
-void ASTStmtWriter::VisitTypoExpr(TypoExpr *E) {
-  VisitExpr(E);
-  // TODO: Figure out sane writer behavior for a TypoExpr, if necessary
-  llvm_unreachable("Cannot write TypoExpr nodes");
-}
-
 //===----------------------------------------------------------------------===//
 // CUDA Expressions and Statements.
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index b28deee41d1c5..c77ef26da568d 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1732,7 +1732,6 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
     case Stmt::ExpressionTraitExprClass:
     case Stmt::UnresolvedLookupExprClass:
     case Stmt::UnresolvedMemberExprClass:
-    case Stmt::TypoExprClass:
     case Stmt::RecoveryExprClass:
     case Stmt::CXXNoexceptExprClass:
     case Stmt::PackExpansionExprClass:
diff --git a/clang/test/AST/ByteCode/literals.cpp b/clang/test/AST/ByteCode/literals.cpp
index 2fa7b69b93470..699746c0b2c4a 100644
--- a/clang/test/AST/ByteCode/literals.cpp
+++ b/clang/test/AST/ByteCode/literals.cpp
@@ -910,7 +910,8 @@ namespace CompoundLiterals {
   constexpr int f2(int *x =(int[]){1,2,3}) {
     return x[0];
   }
-  constexpr int g = f2(); // Should evaluate to 1?
+  // Should evaluate to 1?
+  constexpr int g = f2(); // #g_decl
   static_assert(g == 1, "");
 
   // This example should be rejected because the lifetime of the compound
@@ -1347,7 +1348,10 @@ namespace NTTP {
 namespace UnaryOpError {
   constexpr int foo() {
     int f = 0;
-    ++g; // both-error {{use of undeclared identifier 'g'}}
+    ++g; // both-error {{use of undeclared identifier 'g'}} \
+            both-error {{cannot assign to variable 'g' with const-qualified type 'const int'}} \
+            both-note@#g_decl {{'CompoundLiterals::g' declared here}} \
+            both-note@#g_decl {{variable 'g' declared const here}}
     return f;
   }
 }
diff --git a/clang/test/AST/ast-dump-recovery.c b/clang/test/AST/ast-dump-recovery.c
index 68d3f182dd9f6..09a03fb9d6fdf 100644
--- a/clang/test/AST/ast-dump-recovery.c
+++ b/clang/test/AST/ast-dump-recovery.c
@@ -23,13 +23,6 @@ int postfix_inc = a++;
 // CHECK-NEXT:      `-IntegerLiteral {{.*}} 'int'
 int unary_address = &(a + 1);
 
-// CHECK:       VarDecl {{.*}} ternary 'int' cinit
-// CHECK-NEXT:  `-ConditionalOperator {{.*}}
-// CHECK-NEXT:    |-DeclRefExpr {{.*}} 'a'
-// CHECK-NEXT:    |-RecoveryExpr {{.*}}
-// CHECK-NEXT:    `-DeclRefExpr {{.*}} 'a'
-int ternary = a ? undef : a;
-
 void test1() {
   // CHECK:     `-RecoveryExpr {{.*}} contains-errors
   // CHECK-NEXT:  `-DeclRefExpr {{.*}} 'a' 'const int'
@@ -91,12 +84,6 @@ void test3() {
   // CHECK-NEXT: |   `-DeclRefExpr {{.*}} '__builtin_classify_type'
   // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1
   (*__builtin_classify_type)(1);
-
-  extern void ext();
-  // CHECK:     CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: |-DeclRefExpr {{.*}} 'ext'
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>'
-  ext(undef_var);
 }
 
 // Verify no crash.
@@ -110,23 +97,6 @@ void test4() {
   };
 }
 
-// Verify no crash
-void test5_GH62711() {
-  // CHECK:      VAArgExpr {{.*}} 'int' contains-errors
-  // CHECK-NEXT: | `-ImplicitCastExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  if (__builtin_va_arg(undef, int) << 1);
-}
-
-void test6_GH50244() {
-  double array[16];
-  // CHECK:      UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' contains-errors sizeof
-  // CHECK-NEXT: `-CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-DeclRefExpr {{.*}} 'int ()'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} '<dependent type>'
-  sizeof array / sizeof foo(undef);
-}
-
 // No crash on DeclRefExpr that refers to ValueDecl with invalid initializers.
 void test7() {
   int b[] = {""()};
diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp
index b8195950f2fa1..a8e30f1759e9f 100644
--- a/clang/test/AST/ast-dump-recovery.cpp
+++ b/clang/test/AST/ast-dump-recovery.cpp
@@ -9,28 +9,6 @@ int some_func(int *);
 // CHECK-NEXT:    `-IntegerLiteral {{.*}} 123
 // DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
 int invalid_call = some_func(123);
-void test_invalid_call_1(int s) {
-  // CHECK:      CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: |-UnresolvedLookupExpr {{.*}} 'some_func'
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} <col:13>
-  // CHECK-NEXT: `-BinaryOperator {{.*}}
-  // CHECK-NEXT:   |-RecoveryExpr {{.*}}
-  // CHECK-NEXT:   `-IntegerLiteral {{.*}} <col:28> 'int' 1
-  some_func(undef1, undef2+1);
-
-  // CHECK:      BinaryOperator {{.*}} '<dependent type>' contains-errors '='
-  // CHECK-NEXT: |-DeclRefExpr {{.*}} 's'
-  // CHECK-NEXT: `-CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-UnresolvedLookupExpr {{.*}} 'some_func'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-  s = some_func(undef1);
-
-  // CHECK:     VarDecl {{.*}} var 'int'
-  // CHECK-NEXT: `-CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-UnresolvedLookupExpr {{.*}} 'some_func'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-  int var = some_func(undef1);
-}
 
 int some_func2(int a, int b);
 void test_invalid_call_2() {
@@ -63,22 +41,6 @@ int ambig_func(float);
 // DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
 int ambig_call = ambig_func(123);
 
-// CHECK:     VarDecl {{.*}} unresolved_call1
-// CHECK-NEXT:`-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-// CHECK-NEXT:  `-UnresolvedLookupExpr {{.*}} 'bar'
-// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
-int unresolved_call1 = bar();
-
-// CHECK:     VarDecl {{.*}} unresolved_call2
-// CHECK-NEXT:`-CallExpr {{.*}} contains-errors
-// CHECK-NEXT:  |-UnresolvedLookupExpr {{.*}} 'bar'
-// CHECK-NEXT:  |-RecoveryExpr {{.*}} contains-errors
-// CHECK-NEXT:  | `-UnresolvedLookupExpr {{.*}} 'baz'
-// CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-// CHECK-NEXT:     `-UnresolvedLookupExpr {{.*}} 'qux'
-// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors
-int unresolved_call2 = bar(baz(), qux());
-
 constexpr int a = 10;
 
 // CHECK:     VarDecl {{.*}} postfix_inc
@@ -177,11 +139,6 @@ void test2(Foo2 f) {
   f.overload(1);
 }
 
-// CHECK:     |-AlignedAttr {{.*}} alignas
-// CHECK-NEXT:| `-RecoveryExpr {{.*}} contains-errors
-// CHECK-NEXT:|   `-UnresolvedLookupExpr {{.*}} 'invalid'
-struct alignas(invalid()) Aligned {};
-
 auto f();
 int f(double);
 // CHECK:      VarDecl {{.*}} unknown_type_call 'int'
@@ -203,16 +160,6 @@ void InvalidInitalizer(int x) {
   // CHECK-NEXT:  `-InitListExpr
   // CHECK-NEDT:   `-DeclRefExpr {{.*}} 'x'
   Bar a3{x};
-  // CHECK:     `-VarDecl {{.*}} a4 'Bar'
-  // CHECK-NEXT: `-ParenListExpr {{.*}} 'NULL TYPE' contains-errors
-  // CHECK-NEXT:  `-RecoveryExpr {{.*}} contains-errors
-  // CHECK-NEXT:   `-UnresolvedLookupExpr {{.*}} 'invalid'
-  Bar a4(invalid());
-  // CHECK:     `-VarDecl {{.*}} a5 'Bar'
-  // CHECK-NEXT: `-InitListExpr {{.*}} contains-errors
-  // CHECK-NEXT:  `-RecoveryExpr {{.*}} contains-errors
-  // CHECK-NEXT:   `-UnresolvedLookupExpr {{.*}} 'invalid'
-  Bar a5{invalid()};
 
   // CHECK:     `-VarDecl {{.*}} b1 'Bar'
   // CHECK-NEXT: `-RecoveryExpr {{.*}} contains-errors
@@ -231,52 +178,12 @@ void InvalidInitalizer(int x) {
   // CHECK-NEXT:    `-InitListExpr {{.*}} 'void'
   // CHECK-NEXT:      `-DeclRefExpr {{.*}} 'x' 'int'
   Bar b4 = Bar{x};
-  // CHECK:     `-VarDecl {{.*}} b5 'Bar'
-  // CHECK-NEXT: `-CXXUnresolvedConstructExpr {{.*}} 'Bar' contains-errors 'Bar'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-  // CHECK-NEXT:     `-UnresolvedLookupExpr {{.*}} 'invalid'
-  Bar b5 = Bar(invalid());
-  // CHECK:     `-VarDecl {{.*}} b6 'Bar'
-  // CHECK-NEXT: `-CXXUnresolvedConstructExpr {{.*}} 'Bar' contains-errors 'Bar'
-  // CHECK-NEXT:  `-InitListExpr {{.*}} contains-errors
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} contains-errors
-  // CHECK-NEXT:     `-UnresolvedLookupExpr {{.*}} 'invalid'
-  Bar b6 = Bar{invalid()};
 
   // CHECK:     RecoveryExpr {{.*}} 'Bar' contains-errors
   // CHECK-NEXT:  `-IntegerLiteral {{.*}} 'int' 1
   Bar(1);
-
-  // CHECK:     `-VarDecl {{.*}} var1
-  // CHECK-NEXT: `-BinaryOperator {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   `-IntegerLiteral {{.*}} 'int' 1
-  int var1 = undef + 1;
-}
-void InitializerForAuto() {
-  // CHECK:     `-VarDecl {{.*}} invalid a 'auto'
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   `-UnresolvedLookupExpr {{.*}} 'invalid'
-  auto a = invalid();
-
-  // CHECK:     `-VarDecl {{.*}} invalid b 'auto'
-  // CHECK-NEXT: `-CallExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   |-UnresolvedLookupExpr {{.*}} 'some_func'
-  // CHECK-NEXT:   `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:     `-UnresolvedLookupExpr {{.*}} 'invalid'
-  auto b = some_func(invalid());
-
-  decltype(ned);
-  // very bad initailizer: there is an unresolved typo expr internally, we just
-  // drop it.
-  // CHECK: `-VarDecl {{.*}} invalid unresolved_typo 'auto'
-  auto unresolved_typo = gned.*[] {};
 }
 
-// Verified that the generated call operator is invalid.
-// CHECK: |-CXXMethodDecl {{.*}} invalid operator() 'auto () const -> auto'
-using Escape = decltype([] { return undef(); }());
-
 // CHECK:      VarDecl {{.*}} NoCrashOnInvalidInitList
 // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors lvalue
 // CHECK-NEXT:   `-InitListExpr
@@ -301,56 +208,8 @@ void ValueCategory() {
   xvalue(); // call to a function (rvalue reference return type) yields an xvalue.
 }
 
-void InvalidCondition() {
-  // CHECK:      IfStmt {{.*}}
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} <col:7, col:15> '<dependent type>' contains-errors
-  // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} <col:7>
-  if (invalid()) {}
-
-  // CHECK:      WhileStmt {{.*}}
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} <col:10, col:18> '<dependent type>' contains-errors
-  // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} <col:10>
-  while (invalid()) {}
-
-  // CHECK:      SwitchStmt {{.*}}
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} <col:10>
-  switch(invalid()) {
-    case 1:
-      break;
-  }
-  // FIXME: figure out why the type of ConditionalOperator is not int.
-  // CHECK:      ConditionalOperator {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}}
-  // CHECK-NEXT: |-IntegerLiteral {{.*}} 'int' 1
-  // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 2
-  invalid() ? 1 : 2;
-}
-
 void CtorInitializer() {
   struct S{int m};
-  class MemberInit {
-    int x, y, z;
-    S s;
-    MemberInit() : x(invalid), y(invalid, invalid), z(invalid()), s(1,2) {}
-    // CHECK:      CXXConstructorDecl {{.*}} MemberInit 'void ()'
-    // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 'x' 'int'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
-    // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 'y' 'int'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   |-RecoveryExpr {{.*}} '<dependent type>'
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
-    // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 'z' 'int'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
-    // CHECK-NEXT: |     `-UnresolvedLookupExpr {{.*}} '<overloaded function type>'
-    // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 's' 'S'
-    // CHECK-NEXT: | `-RecoveryExpr {{.*}} 'S' contains-errors
-    // CHECK-NEXT: |   |-IntegerLiteral {{.*}} 1
-    // CHECK-NEXT: |   `-IntegerLiteral {{.*}} 2
-  };
   class BaseInit : S {
     BaseInit(float) : S("no match") {}
     // CHECK:      CXXConstructorDecl {{.*}} BaseInit 'void (float)'
@@ -358,13 +217,6 @@ void CtorInitializer() {
     // CHECK-NEXT: |-CXXCtorInitializer 'S'
     // CHECK-NEXT: | `-RecoveryExpr {{.*}} 'S'
     // CHECK-NEXT: |   `-StringLiteral
-
-    BaseInit(double) : S(invalid) {}
-    // CHECK:      CXXConstructorDecl {{.*}} BaseInit 'void (double)'
-    // CHECK-NEXT: |-ParmVarDecl
-    // CHECK-NEXT: |-CXXCtorInitializer 'S'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
   };
   class DelegatingInit {
     DelegatingInit(float) : DelegatingInit("no match") {}
@@ -373,13 +225,6 @@ void CtorInitializer() {
     // CHECK-NEXT: |-CXXCtorInitializer 'DelegatingInit'
     // CHECK-NEXT: | `-RecoveryExpr {{.*}} 'DelegatingInit'
     // CHECK-NEXT: |   `-StringLiteral
-
-    DelegatingInit(double) : DelegatingInit(invalid) {}
-    // CHECK:      CXXConstructorDecl {{.*}} DelegatingInit 'void (double)'
-    // CHECK-NEXT: |-ParmVarDecl
-    // CHECK-NEXT: |-CXXCtorInitializer 'DelegatingInit'
-    // CHECK-NEXT: | `-ParenListExpr
-    // CHECK-NEXT: |   `-RecoveryExpr {{.*}} '<dependent type>'
   };
 }
 
@@ -423,65 +268,6 @@ void returnInitListFromVoid() {
   // CHECK-NEXT:   `-IntegerLiteral {{.*}} 'int' 8
 }
 
-void RecoveryExprForInvalidDecls(Unknown InvalidDecl) {
-  InvalidDecl + 1;
-  // CHECK:      BinaryOperator {{.*}}
-  // CHECK-NEXT: |-RecoveryExpr {{.*}} '<dependent type>'
-  // CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'InvalidDecl' 'int'
-  // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1
-  InvalidDecl();
-  // CHECK:      CallExpr {{.*}}
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>'
-}
-
-void InitializerOfInvalidDecl() {
-  int ValidDecl;
-  Unkown InvalidDecl = ValidDecl;
-  // CHECK:      VarDecl {{.*}} invalid InvalidDecl
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'ValidDecl'
-
-  Unknown InvalidDeclWithInvalidInit = Invalid;
-  // CHECK:      VarDecl {{.*}} invalid InvalidDeclWithInvalidInit
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NOT:    `-TypoExpr
-}
-
-void RecoverToAnInvalidDecl() {
-  Unknown* foo; // invalid decl
-  goo; // the typo was correct to the invalid foo.
-  // Verify that RecoveryExpr has an inner DeclRefExpr.
-  // CHECK:      RecoveryExpr {{.*}} '<dependent type>' contains-errors lvalue
-  // CHECK-NEXT: `-DeclRefExpr {{.*}} 'foo' 'int *'
-}
-
-void RecoveryToDoWhileStmtCond() {
-  // CHECK:       FunctionDecl {{.*}} RecoveryToDoWhileStmtCond
-  // CHECK:       `-DoStmt {{.*}}
-  // CHECK-NEXT:    |-CompoundStmt {{.*}}
-  // CHECK-NEXT:    `-BinaryOperator {{.*}} '<dependent type>' contains-errors '<'
-  // CHECK-NEXT:      |-BinaryOperator {{.*}} '<dependent type>' contains-errors '+'
-  // CHECK-NEXT:      | |-RecoveryExpr {{.*}} '<dependent type>' contains-errors lvalue
-  // CHECK-NEXT:      | `-IntegerLiteral {{.*}} 'int' 1
-  // CHECK-NEXT:      `-IntegerLiteral {{.*}} 'int' 10
-  do {} while (some_invalid_val + 1 < 10);
-}
-
-void RecoveryForStmtCond() {
-  // CHECK:FunctionDecl {{.*}} RecoveryForStmtCond
-  // CHECK-NEXT:`-CompoundStmt {{.*}}
-  // CHECK-NEXT:  `-ForStmt {{.*}}
-  // CHECK-NEXT:    |-DeclStmt {{.*}}
-  // CHECK-NEXT:    | `-VarDecl {{.*}}
-  // CHECK-NEXT:    |   `-IntegerLiteral {{.*}} <col:16> 'int' 0
-  // CHECK-NEXT:    |-<<<NULL>>>
-  // CHECK-NEXT:    |-RecoveryExpr {{.*}} 'bool' contains-errors
-  // CHECK-NEXT:    |-UnaryOperator {{.*}} 'int' lvalue prefix '++'
-  // CHECK-NEXT:    | `-DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'i' 'int'
-  // CHECK-NEXT:    `-CompoundStmt {{.*}}
-  for (int i = 0; i < invalid; ++i) {}
-}
-
 // Fix crash issue https://github.com/llvm/llvm-project/issues/112560.
 // Make sure clang compiles the following code without crashing:
 
diff --git a/clang/test/AST/ast-dump-recovery.m b/clang/test/AST/ast-dump-recovery.m
deleted file mode 100644
index 37fa8045c0b94..0000000000000
--- a/clang/test/AST/ast-dump-recovery.m
+++ /dev/null
@@ -1,32 +0,0 @@
-// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -frecovery-ast -frecovery-ast-type -fblocks -ast-dump %s | FileCheck -strict-whitespace %s
-
-@interface Foo
-- (void)method:(int)n;
-@end
-
-void k(Foo *foo) {
-  // CHECK:       ObjCMessageExpr {{.*}} 'void' contains-errors
-  // CHECK-CHECK:  |-ImplicitCastExpr {{.*}} 'Foo *' <LValueToRValue>
-  // CHECK-CHECK:  | `-DeclRefExpr {{.*}} 'foo'
-  // CHECK-CHECK:  `-RecoveryExpr {{.*}}
-  [foo method:undef];
-
-  // CHECK:      ImplicitCastExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT: `-RecoveryExpr {{.*}} '<dependent type>' contains-errors
-  // CHECK-NEXT:   `-DeclRefExpr {{.*}} 'foo'
-  foo.undef;
-}
-
-// CHECK:      |-VarDecl {{.*}} 'int (^)()' cinit
-// CHECK-NEXT: | `-RecoveryExpr {{.*}} '<dependent type> (^)(void)' contains-errors lvalue
-// CHECK-NEXT: |   `-BlockExpr {{.*}} '<dependent type> (^)(void)'
-// CHECK-NEXT: |     `-BlockDecl {{.*}} invalid
-int (^gh63863)() = ^() {
-  return undef;
-};
-
-// CHECK:      `-BlockExpr {{.*}} 'int (^)(int, int)'
-// CHECK-NEXT:   `-BlockDecl {{.*}} invalid
-int (^gh64005)(int, int) = ^(int, undefined b) {
-   return 1;
-};
diff --git a/clang/test/CXX/drs/cwg1xx.cpp b/clang/test/CXX/drs/cwg1xx.cpp
index 8b84de0ab5a9a..c9dce77b772dc 100644
--- a/clang/test/CXX/drs/cwg1xx.cpp
+++ b/clang/test/CXX/drs/cwg1xx.cpp
@@ -702,8 +702,7 @@ namespace cwg141 { // cwg141: 3.1
     // cxx98-error@#cwg141-a {{lookup of 'S' in member access expression is ambiguous; using member of 'struct A'}}
     //   cxx98-note@#cwg141-A-S {{lookup in the object type 'struct A' refers here}}
     //   cxx98-note@#cwg141-S {{lookup from the current scope refers here}}
-    // expected-error@#cwg141-a {{no member named 'n' in 'cwg141::A::S<int>'; did you mean '::cwg141::S<int>::n'?}}
-    //   expected-note@#cwg141-S {{'::cwg141::S<int>::n' declared here}}
+    // expected-error@#cwg141-a {{no member named 'n' in 'cwg141::A::S<int>'}}
     // FIXME: we issue a useful diagnostic first, then some bogus ones.
     b.f<int>();
     // expected-error@-1 {{no member named 'f' in 'cwg141::B'}}
diff --git a/clang/test/CXX/drs/cwg26xx.cpp b/clang/test/CXX/drs/cwg26xx.cpp
index ab4d3695b6e22..60d896443ecd1 100644
--- a/clang/test/CXX/drs/cwg26xx.cpp
+++ b/clang/test/CXX/drs/cwg26xx.cpp
@@ -220,7 +220,6 @@ int x = cwg2640_a\N{abc});
 int y = cwg2640_a\N{LOTUS});
 // expected-error@-1 {{character <U+1FAB7> not allowed in an identifier}}
 // expected-error@-2 {{use of undeclared identifier 'cwg2640_a🪷'}}
-// expected-error@-3 {{extraneous ')' before ';'}}
 } // namespace cwg2640
 
 // cwg2642: na
diff --git a/clang/test/CXX/module/basic/basic.link/p2.cppm b/clang/test/CXX/module/basic/basic.link/p2.cppm
index d7d2b5992a235..6a2c67526c9a1 100644
--- a/clang/test/CXX/module/basic/basic.link/p2.cppm
+++ b/clang/test/CXX/module/basic/basic.link/p2.cppm
@@ -51,7 +51,7 @@ void use_from_module_impl() {
   (void)external_linkage_var;
   (void)module_linkage_var;
 
-  (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} //expected-error{{}}
+  (void)internal_linkage_class{}; // expected-error {{use of undeclared identifier 'internal_linkage_class'}} // expected-note@* {{}}
   (void)internal_linkage_var; // expected-error {{use of undeclared identifier 'internal_linkage_var'}}
 }
 
@@ -64,7 +64,7 @@ void use_from_module_impl() {
   internal_linkage_fn(); // expected-error {{use of undeclared identifier 'internal_linkage_fn'}}
   (void)external_linkage_class{};
   (void)module_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}}
-  (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}}
+  (void)internal_linkage_class{}; // expected-error {{undeclared identifier}} expected-error 0+{{}} // expected-note@* {{}}
   (void)external_linkage_var;
   (void)module_linkage_var; // expected-error {{undeclared identifier}}
   (void)internal_linkage_var; // expected-error {{undeclared identifier}}
diff --git a/clang/test/FixIt/typo.cpp b/clang/test/FixIt/typo.cpp
deleted file mode 100644
index e489fbbcaa1df..0000000000000
--- a/clang/test/FixIt/typo.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: cp %s %t
-// RUN: not %clang_cc1 -fixit -x c++ %t
-// RUN: %clang_cc1 -fsyntax-only -pedantic -Werror -x c++ %t
-// RUN: grep test_string %t
-
-namespace std {
-  template<typename T> class basic_string { // expected-note 3{{'basic_string' declared here}}
-  public:
-    int find(const char *substr); // expected-note{{'find' declared here}}
-    static const int npos = -1; // expected-note{{'npos' declared here}}
-  };
-
-  typedef basic_string<char> string; // expected-note 2{{'string' declared here}}
-}
-
-namespace otherstd { // expected-note 2{{'otherstd' declared here}} \
-                     // expected-note{{namespace 'otherstd' defined here}}
-  using namespace std;
-}
-
-using namespace std;
-
-other_std::strng str1; // expected-error{{use of undeclared identifier 'other_std'; did you mean 'otherstd'?}} \
-// expected-error{{no type named 'strng' in namespace 'otherstd'; did you mean 'string'?}}
-tring str2; // expected-error{{unknown type name 'tring'; did you mean 'string'?}}
-
-::other_std::string str3; // expected-error{{no member named 'other_std' in the global namespace; did you mean 'otherstd'?}}
-
-float area(float radius, // expected-note{{'radius' declared here}}
-           float pi) {
-  return radious * pi; // expected-error{{did you mean 'radius'?}}
-}
-
-using namespace othestd; // expected-error{{no namespace named 'othestd'; did you mean 'otherstd'?}}
-namespace blargh = otherstd; // expected-note 3{{namespace 'blargh' defined here}}
-using namespace ::blarg; // expected-error{{no namespace named 'blarg' in the global namespace; did you mean 'blargh'?}}
-
-namespace wibble = blarg; // expected-error{{no namespace named 'blarg'; did you mean 'blargh'?}}
-namespace wobble = ::blarg; // expected-error{{no namespace named 'blarg' in the global namespace; did you mean 'blargh'?}}
-
-bool test_string(std::string s) {
-  basc_string<char> b1; // expected-error{{no template named 'basc_string'; did you mean 'basic_string'?}}
-  std::basic_sting<char> b2; // expected-error{{no template named 'basic_sting' in namespace 'std'; did you mean 'basic_string'?}}
-  (void)b1;
-  (void)b2;
-  return s.fnd("hello") // expected-error{{no member named 'fnd' in 'std::basic_string<char>'; did you mean 'find'?}}
-    == std::string::pos; // expected-error{{no member named 'pos' in 'std::basic_string<char>'; did you mean 'npos'?}}
-}
-
-struct Base { };
-struct Derived : public Base { // expected-note{{base class 'Base' specified here}}
-  int member; // expected-note 3{{'member' declared here}}
-
-  Derived() : base(), // expected-error{{initializer 'base' does not name a non-static data member or base class; did you mean the base class 'Base'?}}
-              ember() { } // expected-error{{initializer 'ember' does not name a non-static data member or base class; did you mean the member 'member'?}}
-
-  int getMember() const {
-    return ember; // expected-error{{use of undeclared identifier 'ember'; did you mean 'member'?}}
-  }
-
-  int &getMember();
-};
-
-int &Derived::getMember() {
-  return ember; // expected-error{{use of undeclared identifier 'ember'; did you mean 'member'?}}
-}
-
-typedef int Integer; // expected-note{{'Integer' declared here}}
-int global_value; // expected-note{{'global_value' declared here}}
-
-int foo() {
-  integer * i = 0; // expected-error{{unknown type name 'integer'; did you mean 'Integer'?}}
-  unsinged *ptr = 0; // expected-error{{use of undeclared identifier 'unsinged'; did you mean 'unsigned'?}}
-  return *i + *ptr + global_val; // expected-error{{use of undeclared identifier 'global_val'; did you mean 'global_value'?}}
-}
-
-namespace nonstd {
-  typedef std::basic_string<char> yarn; // expected-note 2 {{'nonstd::yarn' declared here}}
-  int narf; // expected-note{{'nonstd::narf' declared here}}
-}
-
-yarn str4; // expected-error{{unknown type name 'yarn'; did you mean 'nonstd::yarn'?}}
-wibble::yarn str5; // expected-error{{no type named 'yarn' in namespace 'otherstd'; did you mean 'nonstd::yarn'?}}
-
-namespace another {
-  template<typename T> class wide_string {}; // expected-note {{'another::wide_string' declared here}}
-}
-int poit() {
-  nonstd::basic_string<char> str; // expected-error{{no template named 'basic_string' in namespace 'nonstd'; did you mean simply 'basic_string'?}}
-  nonstd::wide_string<char> str2; // expected-error{{no template named 'wide_string' in namespace 'nonstd'; did you mean 'another::wide_string'?}}
-  return wibble::narf; // expected-error{{no member named 'narf' in namespace 'otherstd'; did you mean 'nonstd::narf'?}}
-}
-
-namespace check_bool {
-  void f() {
-    Bool b; // expected-error{{use of undeclared identifier 'Bool'; did you mean 'bool'?}}
-  }
-}
-
-namespace outr {
-}
-namespace outer {
-  namespace inner { // expected-note{{'outer::inner' declared here}} \
-                    // expected-note{{namespace 'outer::inner' defined here}} \
-                    // expected-note{{'inner' declared here}}
-    int i;
-  }
-}
-
-using namespace outr::inner; // expected-error{{no namespace named 'inner' in namespace 'outr'; did you mean 'outer::inner'?}}
-
-void func() {
-  outr::inner::i = 3; // expected-error{{no member named 'inner' in namespace 'outr'; did you mean 'outer::inner'?}}
-  outer::innr::i = 4; // expected-error{{no member named 'innr' in namespace 'outer'; did you mean 'inner'?}}
-}
-
-struct base {
-};
-struct derived : base {
-  int i;
-};
-
-void func2() {
-  derived d;
-  // FIXME: we should offer a fix here. We do if the 'i' is misspelled, but we don't do name qualification changes
-  //        to replace base::i with derived::i as we would for other qualified name misspellings.
-  // d.base::i = 3;
-}
-
-class A {
-  void bar(int);
-};
-void bar(int, int);  // expected-note{{'::bar' declared here}}
-void A::bar(int x) {
-  bar(x, 5);  // expected-error{{too many arguments to function call, expected 1, have 2; did you mean '::bar'?}}
-}
diff --git a/clang/test/Index/complete-switch.c b/clang/test/Index/complete-switch.c
deleted file mode 100644
index 4a78854595543..0000000000000
--- a/clang/test/Index/complete-switch.c
+++ /dev/null
@@ -1,10 +0,0 @@
-void f() {
-  auto foo = bar;
-  switch(foo) {
-    case x:
-      break;
-  }
-}
-
-// RUN: not %clang_cc1 -fsyntax-only -fno-recovery-ast -code-completion-at=%s:4:10 %s | FileCheck %s -allow-empty
-// CHECK-NOT: COMPLETION: foo
diff --git a/clang/test/Index/fix-its.c b/clang/test/Index/fix-its.c
index 1e710c28afcca..8378fd9da9b43 100644
--- a/clang/test/Index/fix-its.c
+++ b/clang/test/Index/fix-its.c
@@ -1,27 +1,12 @@
-// RUN: c-index-test -test-load-source all -fspell-checking %s 2> %t  
+// RUN: c-index-test -test-load-source all -fspell-checking %s 2> %t
 // RUN: FileCheck %s < %t
-struct X {
-  int wibble;
-};
-
 #define MACRO(X) X
 
-void f(struct X *x) {
-  // CHECK: error: no member named 'wobble' in 'struct X'; did you mean 'wibble'?
-  // CHECK: FIX-IT: Replace [13:12 - 13:18] with "wibble"
-  // CHECK: note: 'wibble' declared here
-  MACRO(x->wobble = 17);
-  // CHECK: error: no member named 'wabble' in 'struct X'; did you mean 'wibble'?
-  // CHECK: FIX-IT: Replace [17:6 - 17:12] with "wibble"
-  // CHECK: note: 'wibble' declared here
-  x->wabble = 17;
-}
-
 int printf(const char *restrict, ...);
 
 void f2() {
   unsigned long index;
   // CHECK: warning: format specifies type 'int' but the argument has type 'unsigned long'
-  // CHECK: FIX-IT: Replace [26:17 - 26:19] with "%lu"
+  // CHECK: FIX-IT: Replace [11:17 - 11:19] with "%lu"
   MACRO(printf("%d", index));
 }
diff --git a/clang/test/Lexer/raw-string-ext.c b/clang/test/Lexer/raw-string-ext.c
index de318b616df70..8ed96e5c19f0d 100644
--- a/clang/test/Lexer/raw-string-ext.c
+++ b/clang/test/Lexer/raw-string-ext.c
@@ -27,13 +27,13 @@
 // no-warning@* {{ignoring '-fno-raw-string-literals'}}
 
 void f() {
-  (void) R"foo()foo"; // unsupported-error {{use of undeclared identifier 'R'}} cxx-unsupported-error {{expected ';' after expression}}
-  (void) LR"foo()foo"; // unsupported-error {{use of undeclared identifier 'LR'}} cxx-unsupported-error {{expected ';' after expression}}
+  (void) R"foo()foo"; // unsupported-error {{use of undeclared identifier 'R'}}
+  (void) LR"foo()foo"; // unsupported-error {{use of undeclared identifier 'LR'}}
 
 #ifdef UNICODE
-  (void) uR"foo()foo"; // unsupported-error {{use of undeclared identifier 'uR'}} cxx-unsupported-error {{expected ';' after expression}}
-  (void) u8R"foo()foo"; // unsupported-error {{use of undeclared identifier 'u8R'}} cxx-unsupported-error {{expected ';' after expression}}
-  (void) UR"foo()foo"; // unsupported-error {{use of undeclared identifier 'UR'}} cxx-unsupported-error {{expected ';' after expression}}
+  (void) uR"foo()foo"; // unsupported-error {{use of undeclared identifier 'uR'}}
+  (void) u8R"foo()foo"; // unsupported-error {{use of undeclared identifier 'u8R'}}
+  (void) UR"foo()foo"; // unsupported-error {{use of undeclared identifier 'UR'}}
 #endif
 }
 
diff --git a/clang/test/Modules/diagnose-missing-import.m b/clang/test/Modules/diagnose-missing-import.m
index 8fb8e6b25f68a..b34bc1a62b6bc 100644
--- a/clang/test/Modules/diagnose-missing-import.m
+++ b/clang/test/Modules/diagnose-missing-import.m
@@ -7,11 +7,9 @@
 void foo(void) {
   XYZLogEvent(xyzRiskyCloseOpenParam, xyzRiskyCloseOpenParam); // expected-error {{call to undeclared function 'XYZLogEvent'; ISO C99 and later do not support implicit function declarations}} \
                                                                   expected-error {{declaration of 'XYZLogEvent' must be imported}} \
-                                                                  expected-error {{declaration of 'xyzRiskyCloseOpenParam' must be imported from module 'NCI.A'}} \
                                                                   expected-error {{declaration of 'xyzRiskyCloseOpenParam' must be imported from module 'NCI.A'}}
 }
 
-// expected-note@Inputs/diagnose-missing-import/a.h:5 {{declaration here is not visible}}
 // expected-note@Inputs/diagnose-missing-import/a.h:5 {{declaration here is not visible}}
 // expected-note@Inputs/diagnose-missing-import/a.h:6 {{declaration here is not visible}}
 
diff --git a/clang/test/OpenMP/begin_declare_variant_messages.c b/clang/test/OpenMP/begin_declare_variant_messages.c
index d8d8f4211678f..8878188e7ceb2 100644
--- a/clang/test/OpenMP/begin_declare_variant_messages.c
+++ b/clang/test/OpenMP/begin_declare_variant_messages.c
@@ -83,7 +83,7 @@ const int var;
 #pragma omp end declare variant
 #pragma omp begin declare variant match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<invalid>'); score ignored}}
 #pragma omp end declare variant
-#pragma omp begin declare variant match(device = {kind(score(ibm) }) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<recovery-expr>()'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
+#pragma omp begin declare variant match(device = {kind(score(ibm) }) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<invalid>'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp end declare variant
 #pragma omp begin declare variant match(device={kind(score(2 gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('2'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp end declare variant
diff --git a/clang/test/OpenMP/declare_reduction_messages.cpp b/clang/test/OpenMP/declare_reduction_messages.cpp
index 752cc4fb05a12..f91d952dfa14f 100644
--- a/clang/test/OpenMP/declare_reduction_messages.cpp
+++ b/clang/test/OpenMP/declare_reduction_messages.cpp
@@ -69,7 +69,7 @@ class Class2 : public Class1<T> {
 #pragma omp declare reduction(fun77 : long : omp_out += omp_in) initializer(omp_priv Class2 < int > ()) // expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare reduction(fun8 : long : omp_out += omp_in) initializer(omp_priv 23)                 // expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare reduction(fun88 : long : omp_out += omp_in) initializer(omp_priv 23))               // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
-#pragma omp declare reduction(fun9 : long : omp_out += omp_priv) initializer(omp_in = 23)               // expected-error {{use of undeclared identifier 'omp_priv'; did you mean 'omp_in'?}} expected-note {{'omp_in' declared here}}
+#pragma omp declare reduction(fun9 : long : omp_out += omp_priv) initializer(omp_in = 23)               // expected-error {{use of undeclared identifier 'omp_priv'}}
 #pragma omp declare reduction(fun10 : long : omp_out += omp_in) initializer(omp_priv = 23)
 
 template <typename T>
diff --git a/clang/test/OpenMP/declare_variant_messages.c b/clang/test/OpenMP/declare_variant_messages.c
index 32e365cc415bd..d1e36e5d1e7e9 100644
--- a/clang/test/OpenMP/declare_variant_messages.c
+++ b/clang/test/OpenMP/declare_variant_messages.c
@@ -11,7 +11,7 @@ int foo(void);
 #pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}}
 #pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo // expected-error {{expected ')'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}} expected-note {{to match this '('}}
-#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} omp50-error {{expected 'match' clause on}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
+#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}}
 #pragma omp declare variant(foo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foo) xxx // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
@@ -42,7 +42,7 @@ int foo(void);
 #pragma omp declare variant(foo) match(device={kind(}) // expected-error {{expected ')'}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo) match(device={kind()}) // expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}}
 #pragma omp declare variant(foo) match(device={kind(score cpu)}) // expected-error {{expected '(' after 'score'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<invalid>'); score ignored}}
-#pragma omp declare variant(foo) match(device = {kind(score(ibm) }) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<recovery-expr>()'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
+#pragma omp declare variant(foo) match(device = {kind(score(ibm) }) // expected-error {{use of undeclared identifier 'ibm'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('<invalid>'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo) match(device={kind(score(2 gpu)}) // expected-error {{expected ')'}} expected-error {{expected ')'}} expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('2'); score ignored}} expected-warning {{expected identifier or string literal describing a context property; property skipped}} expected-note {{to match this '('}} expected-note {{context property options are: 'host' 'nohost' 'cpu' 'gpu' 'fpga' 'any'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo) match(device={kind(score(foo()) ibm)}) // expected-warning {{expected '':'' after the score expression; '':'' assumed}} expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('foo()'); score ignored}} expected-warning {{'ibm' is not a valid context property for the context selector 'kind' and the context set 'device'; property ignored}} expected-note {{try 'match(implementation={vendor(ibm)})'}} expected-note {{the ignored property spans until here}}
 #pragma omp declare variant(foo) match(device={kind(score(5): host), kind(llvm)}) // expected-warning {{the context selector 'kind' in the context set 'device' cannot have a score ('5'); score ignored}} expected-warning {{the context selector 'kind' was used already in the same 'omp declare variant' directive; selector ignored}} expected-note {{the previous context selector 'kind' used here}} expected-note {{the ignored selector spans until here}}
@@ -56,7 +56,7 @@ int foo(void);
 #pragma omp declare variant(foo) match(target_device={device_num}) // expected-warning {{the context selector 'device_num' in context set 'target_device' requires a context property defined in parentheses; selector ignored}} expected-note {{the ignored selector spans until here}}
 #pragma omp declare variant(foo) match(target_device={device_num()}) // expected-error {{expected expression}}
 #pragma omp declare variant(foo) match(target_device={device_num(-1)}) // expected-error {{argument to 'device_num' clause must be a non-negative integer value}}
-#pragma omp declare variant(foo) match(target_device={device_num(abc)}) // expected-error {{expected expression}} expected-error {{use of undeclared identifier 'abc'}}
+#pragma omp declare variant(foo) match(target_device={device_num(abc)}) // expected-error {{use of undeclared identifier 'abc'}}
 int bar(void);
 
 
diff --git a/clang/test/OpenMP/declare_variant_messages.cpp b/clang/test/OpenMP/declare_variant_messages.cpp
index 8eb37bc64cbc1..06da8a8e5b058 100644
--- a/clang/test/OpenMP/declare_variant_messages.cpp
+++ b/clang/test/OpenMP/declare_variant_messages.cpp
@@ -16,7 +16,7 @@ T foofoo();
 #pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}}
 #pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foo // expected-error {{expected ')'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}} expected-note {{to match this '('}}
-#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
+#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}}
 #pragma omp declare variant(foo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foofoo <int>) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foofoo <int>) xxx // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
@@ -57,7 +57,7 @@ int bar();
 #pragma omp declare variant // expected-error {{expected '(' after 'declare variant'}}
 #pragma omp declare variant( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
 #pragma omp declare variant(foofoo <T> // expected-error {{expected ')'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}} expected-note {{to match this '('}}
-#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}} omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
+#pragma omp declare variant(x) // expected-error {{use of undeclared identifier 'x'}}
 #pragma omp declare variant(foo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foofoo) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
 #pragma omp declare variant(foofoo <T>) // omp50-error {{expected 'match' clause on 'omp declare variant' directive}} omp51-error {{expected 'match', 'adjust_args', or 'append_args' clause on 'omp declare variant' directive}}
diff --git a/clang/test/OpenMP/target_update_messages.cpp b/clang/test/OpenMP/target_update_messages.cpp
index 83191059202ca..000cc80e513e6 100644
--- a/clang/test/OpenMP/target_update_messages.cpp
+++ b/clang/test/OpenMP/target_update_messages.cpp
@@ -113,9 +113,11 @@ int main(int argc, char **argv) {
   // Check parsing with two modifiers.
   // lt51-warning@+1 {{missing ':' after ) - ignoring}}
   #pragma omp target update to(mapper(id), present: s)
-  // lt51-error@+3 {{use of undeclared identifier 'present'}}
-  // lt51-error@+2 {{use of undeclared identifier 'id'}}
-  // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+  // lt51-error@+5 {{use of undeclared identifier 'present'}}
+  // lt51-error@+4 {{use of undeclared identifier 'id'}}
+  // lt51-error@+3 {{expected ',' or ')' in 'to' clause}}
+  // lt51-error@+2 {{expected ')'}}
+  // lt51-note@+1 {{to match this '('}}
   #pragma omp target update to(present, mapper(id): s)
   // lt51-warning@+1 {{missing ':' after ) - ignoring}}
   #pragma omp target update to(mapper(id) present: s)
@@ -141,10 +143,9 @@ int main(int argc, char **argv) {
   #pragma omp target update to(present,,: s)
   // lt51-warning@+1 {{missing ':' after ) - ignoring}}
   #pragma omp target update to(mapper(id), present,: s)
-  // lt51-error@+4 {{use of undeclared identifier 'present'}}
-  // lt51-error@+3 {{use of undeclared identifier 'id'}}
-  // lt51-error@+2 {{expected expression}}
-  // lt51-error@+1 {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+  // lt51-error@+3 {{use of undeclared identifier 'present'}}
+  // lt51-error@+2 {{use of undeclared identifier 'id'}}
+  // lt51-error@+1 {{expected expression}}
   #pragma omp target update to(present, mapper(id),: s)
 
   #pragma omp target update from(m) allocate(m) // expected-error {{unexpected OpenMP clause 'allocate' in directive '#pragma omp target update'}}
diff --git a/clang/test/Parser/cxx1z-decomposition.cpp b/clang/test/Parser/cxx1z-decomposition.cpp
index 3e2526979be8b..b7a8d30bd16c5 100644
--- a/clang/test/Parser/cxx1z-decomposition.cpp
+++ b/clang/test/Parser/cxx1z-decomposition.cpp
@@ -3,7 +3,7 @@
 // RUN: %clang_cc1 -std=c++2c %s -triple x86_64-unknown-linux-gnu -verify=expected,cxx2c,post2b -fcxx-exceptions
 // RUN: not %clang_cc1 -std=c++17 %s -triple x86_64-unknown-linux-gnu -emit-llvm-only -fcxx-exceptions
 
-struct S { int a, b, c; };
+struct S { int a, b, c; }; // expected-note 2 {{'S::a' declared here}}
 
 // A simple-declaration can be a decompsition declaration.
 namespace SimpleDecl {
@@ -32,7 +32,7 @@ namespace ForRangeDecl {
 namespace OtherDecl {
   // A parameter-declaration is not a simple-declaration.
   // This parses as an array declaration.
-  void f(auto [a, b, c]); // cxx17-error {{'auto' not allowed in function prototype}} expected-error {{'a'}}
+  void f(auto [a, b, c]); // cxx17-error {{'auto' not allowed in function prototype}} expected-error 1+{{'a'}}
 
   void g() {
     // A condition is allowed as a Clang extension.
@@ -46,7 +46,7 @@ namespace OtherDecl {
 
     // An exception-declaration is not a simple-declaration.
     try {}
-    catch (auto [a, b, c]) {} // expected-error {{'auto' not allowed in exception declaration}} expected-error {{'a'}}
+    catch (auto [a, b, c]) {} // expected-error {{'auto' not allowed in exception declaration}} expected-error 1+{{'a'}}
   }
 
   // A member-declaration is not a simple-declaration.
diff --git a/clang/test/Parser/cxx1z-fold-expressions.cpp b/clang/test/Parser/cxx1z-fold-expressions.cpp
index 4a329646b799f..d798a9cbb99b7 100644
--- a/clang/test/Parser/cxx1z-fold-expressions.cpp
+++ b/clang/test/Parser/cxx1z-fold-expressions.cpp
@@ -37,14 +37,14 @@ template<int ...N> int bad12() { return (... N); } // expected-error {{expected
 
 template<typename ...T> void as_operand_of_cast(int a, T ...t) {
   return
-    (int)(a + ... + undeclared_junk) + // expected-error {{undeclared}} expected-error {{does not contain any unexpanded}}
+    (int)(a + ... + undeclared_junk) + // expected-error {{undeclared}}
     (int)(t + ... + undeclared_junk) + // expected-error {{undeclared}}
-    (int)(... + undeclared_junk) + // expected-error {{undeclared}} expected-error {{does not contain any unexpanded}}
+    (int)(... + undeclared_junk) + // expected-error {{undeclared}}
     (int)(undeclared_junk + ...) + // expected-error {{undeclared}}
     (int)(a + ...) + // expected-error {{does not contain any unexpanded}}
     (int)(a, ...) + // expected-error {{does not contain any unexpanded}}
     (int)(..., a) + // expected-error {{does not contain any unexpanded}}
-    (int)(a, ..., undeclared_junk) + // expected-error {{undeclared}} expected-error {{does not contain any unexpanded}}
+    (int)(a, ..., undeclared_junk) + // expected-error {{undeclared}}
     (int)(t, ...) +
     (int)(..., t) +
     (int)(t, ..., a);
diff --git a/clang/test/Parser/cxx2c-pack-indexing.cpp b/clang/test/Parser/cxx2c-pack-indexing.cpp
index 72e286322fa97..79069a86ea706 100644
--- a/clang/test/Parser/cxx2c-pack-indexing.cpp
+++ b/clang/test/Parser/cxx2c-pack-indexing.cpp
@@ -69,7 +69,8 @@ template <typename... T>
 requires( ); // expected-error {{expected expression}}
 struct SS {
     void f( ) {
-        (*p).~T...[](); // expected-error {{use of undeclared identifier 'p'}}
+        (*p).~T...[](); // expected-error {{use of undeclared identifier 'p'}} \
+                           expected-error {{undeclared identifier 'T' in destructor name}}
     }
 };
 }
diff --git a/clang/test/Parser/objc-foreach-syntax.m b/clang/test/Parser/objc-foreach-syntax.m
index 2158d8062f6cd..1ff84f393b9f4 100644
--- a/clang/test/Parser/objc-foreach-syntax.m
+++ b/clang/test/Parser/objc-foreach-syntax.m
@@ -21,6 +21,5 @@ - (void)compilerTestAgainst {
 
 
 static int test7(id keys) {
-  for (id key; in keys) ;  // expected-error {{use of undeclared identifier 'in'}} \
-                           // expected-error {{expected ';' in 'for' statement specifier}}
+  for (id key; in keys) ;  // expected-error {{use of undeclared identifier 'in'}}
 }
diff --git a/clang/test/Parser/opencl-atomics-cl20.cl b/clang/test/Parser/opencl-atomics-cl20.cl
index 2648142f28e7c..2cd2c6ca133e1 100644
--- a/clang/test/Parser/opencl-atomics-cl20.cl
+++ b/clang/test/Parser/opencl-atomics-cl20.cl
@@ -39,23 +39,17 @@ void atomic_types_test(void) {
 // expected-error@-11 {{use of undeclared identifier 'atomic_ulong'}}
 // expected-error@-11 {{use of undeclared identifier 'atomic_double'}}
 #if defined(LANG_VER_OK)
-// expected-error@-15 {{expected ';' after expression}}
-// expected-error@-16 {{use of undeclared identifier 'l'}}
-// expected-error@-16 {{expected ';' after expression}}
-// expected-error@-17 {{use of undeclared identifier 'ul'}}
 #endif
 #if !defined(LANG_VER_OK) || defined(__SPIR64__)
-// expected-error@-18 {{use of undeclared identifier 'atomic_size_t'}}
-// expected-error@-16 {{use of undeclared identifier 'atomic_ptrdiff_t'}}
+// expected-error@-14 {{use of undeclared identifier 'atomic_size_t'}}
+// expected-error@-12 {{use of undeclared identifier 'atomic_ptrdiff_t'}}
 #if !defined(LANG_VER_OK)
-// expected-error@-20 {{use of undeclared identifier 'atomic_intptr_t'}}
-// expected-error@-20 {{use of undeclared identifier 'atomic_uintptr_t'}}
+// expected-error@-16 {{use of undeclared identifier 'atomic_intptr_t'}}
+// expected-error@-16 {{use of undeclared identifier 'atomic_uintptr_t'}}
 #else
-// expected-error@-24 {{expected ';' after expression}}
-// expected-error@-25 {{use of undeclared identifier 's'}}
-// expected-error@-25 {{unknown type name 'atomic_intptr_t'; did you mean 'atomic_int'?}}
+// expected-error@-19 {{unknown type name 'atomic_intptr_t'; did you mean 'atomic_int'?}}
 // expected-note@* {{'atomic_int' declared here}}
-// expected-error@-26 {{unknown type name 'atomic_uintptr_t'; did you mean 'atomic_uint'?}}
+// expected-error@-20 {{unknown type name 'atomic_uintptr_t'; did you mean 'atomic_uint'?}}
 // expected-note@* {{'atomic_uint' declared here}}
 #endif
 #endif
diff --git a/clang/test/Parser/recovery.c b/clang/test/Parser/recovery.c
index 6fdbedffd236a..0d86bd0608bf1 100644
--- a/clang/test/Parser/recovery.c
+++ b/clang/test/Parser/recovery.c
@@ -11,7 +11,7 @@ float test2241[2] = {
 static void f (char * (*g) (char **, int), char **p, ...) {
   char *s;
   va_list v;                              // expected-error {{identifier}}
-  s = g (p, __builtin_va_arg(v, int));    // expected-error {{identifier}}
+  s = g (p, __builtin_va_arg(v, int));    // expected-error {{identifier}} expected-error {{extraneous ')' before ';'}}
 }
 
 
diff --git a/clang/test/Parser/switch-recovery.cpp b/clang/test/Parser/switch-recovery.cpp
index 7b3909e3b0d32..40712799933c2 100644
--- a/clang/test/Parser/switch-recovery.cpp
+++ b/clang/test/Parser/switch-recovery.cpp
@@ -104,7 +104,7 @@ void test9(int x) { // expected-note {{'x' declared here}}
               expected-error {{expected expression}}
     8:: x; // expected-error {{expected ';' after expression}} \
               expected-error {{no member named 'x' in the global namespace; did you mean simply 'x'?}} \
-              expected-warning {{expression result unused}}
+              expected-warning 2 {{expression result unused}}
     9:: :y; // expected-error {{expected ';' after expression}} \
                expected-error {{expected unqualified-id}} \
                expected-warning {{expression result unused}}
diff --git a/clang/test/Parser/switch-typo-correction.cpp b/clang/test/Parser/switch-typo-correction.cpp
index ebf1c18f2b86a..95d610b9cdd25 100644
--- a/clang/test/Parser/switch-typo-correction.cpp
+++ b/clang/test/Parser/switch-typo-correction.cpp
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 
-namespace c { double xxx; } // expected-note{{'c::xxx' declared here}}
+namespace c { double xxx; }
 namespace d { float xxx; }
 namespace z { namespace xxx {} }
 
 void crash() {
-  switch (xxx) {} // expected-error{{use of undeclared identifier 'xxx'; did you mean }}
+  switch (xxx) {} // expected-error{{use of undeclared identifier 'xxx'}}
 }
diff --git a/clang/test/ParserOpenACC/parse-cache-construct.cpp b/clang/test/ParserOpenACC/parse-cache-construct.cpp
index a5a1e58028c33..948f2e30f149c 100644
--- a/clang/test/ParserOpenACC/parse-cache-construct.cpp
+++ b/clang/test/ParserOpenACC/parse-cache-construct.cpp
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 %s -verify -fopenacc
 
 namespace NS {
-  static char* NSArray;// expected-note{{declared here}}
-  static int NSInt;// expected-note 2{{declared here}}
+  static char* NSArray; // expected-note {{'NS::NSArray' declared here}}
+  static int NSInt;     // expected-note 2 {{'NS::NSInt' declared here}}
 }
 char *getArrayPtr();
 template<typename T, int I>
@@ -21,17 +21,17 @@ void func() {
   }
 
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+1{{use of undeclared identifier 'NSArray'; did you mean 'NS::NSArray'}}
+    // expected-error@+1{{use of undeclared identifier 'NSArray'}}
     #pragma acc cache(NSArray[NS::NSInt : NS::NSInt])
   }
 
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+1{{use of undeclared identifier 'NSInt'; did you mean 'NS::NSInt'}}
+    // expected-error@+1{{use of undeclared identifier 'NSInt'}}
     #pragma acc cache(NS::NSArray[NSInt : NS::NSInt])
   }
 
   for (int i = 0; i < 10; ++i) {
-    // expected-error@+1{{use of undeclared identifier 'NSInt'; did you mean 'NS::NSInt'}}
+    // expected-error@+1{{use of undeclared identifier 'NSInt'}}
     #pragma acc cache(NS::NSArray[NS::NSInt : NSInt])
   }
 }
diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c
index 6d771e858d243..a9ad7ab176cbc 100644
--- a/clang/test/ParserOpenACC/parse-clauses.c
+++ b/clang/test/ParserOpenACC/parse-clauses.c
@@ -347,9 +347,7 @@ void SelfUpdate() {
 #pragma acc update host(s) self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+3{{use of undeclared identifier 'zero'}}
-  // expected-error@+2{{expected ','}}
-  // expected-error@+1{{expected expression}}
+  // expected-error@+1{{use of undeclared identifier 'zero'}}
 #pragma acc update self(zero : s.array[s.value : 5], s.value), if_present
   for(int i = 0; i < 5;++i) {}
 
@@ -453,8 +451,6 @@ void VarListClauses() {
 #pragma acc parallel copy(always, alwaysin, always: HasMem.MemArr[3:]) self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+3{{use of undeclared identifier 'always'}}
-  // expected-error@+2{{use of undeclared identifier 'alwaysin'}}
   // expected-error@+1{{use of undeclared identifier 'always'}}
 #pragma acc parallel copy(always, alwaysin, always, HasMem.MemArr[3:]) self
   for(int i = 0; i < 5;++i) {}
@@ -591,8 +587,7 @@ void VarListClauses() {
 #pragma acc serial copyout(zero : s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'zero'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'zero'}}
 #pragma acc serial copyout(zero s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -608,8 +603,7 @@ void VarListClauses() {
 #pragma acc serial copyout(invalid:s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'invalid'}}
 #pragma acc serial copyout(invalid s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -657,8 +651,7 @@ void VarListClauses() {
 #pragma acc serial create(zero : s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'zero'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'zero'}}
 #pragma acc serial create(zero s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -674,8 +667,7 @@ void VarListClauses() {
 #pragma acc serial create(invalid:s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'invalid'}}
 #pragma acc serial create(invalid s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -700,8 +692,7 @@ void VarListClauses() {
 #pragma acc serial copyin(readonly : s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'readonly'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'readonly'}}
 #pragma acc serial copyin(readonly s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
@@ -717,8 +708,7 @@ void VarListClauses() {
 #pragma acc serial copyin(invalid:s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
-  // expected-error@+2{{use of undeclared identifier 'invalid'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'invalid'}}
 #pragma acc serial copyin(invalid s.array[s.value : 5], s.value), self
   for(int i = 0; i < 5;++i) {}
 
diff --git a/clang/test/ParserOpenACC/parse-constructs.cpp b/clang/test/ParserOpenACC/parse-constructs.cpp
index 814f6a1fd09f2..69b04bcbad9e3 100644
--- a/clang/test/ParserOpenACC/parse-constructs.cpp
+++ b/clang/test/ParserOpenACC/parse-constructs.cpp
@@ -18,13 +18,13 @@ namespace NS {
 #pragma acc routine(NS::foo) seq
 
 // expected-error@+2{{use of undeclared identifier 'templ'; did you mean 'NS::templ'?}}
-// expected-error@+1{{OpenACC routine name 'NS::templ' names a set of overloads}}
+// expected-error@+1{{OpenACC routine name 'templ' names a set of overloads}}
 #pragma acc routine(templ) seq
 // expected-error@+1{{OpenACC routine name 'NS::templ' names a set of overloads}}
 #pragma acc routine(NS::templ) seq
 
 // expected-error@+2{{use of undeclared identifier 'templ'; did you mean 'NS::templ'?}}
-// expected-error@+1{{OpenACC routine name 'NS::templ' names a set of overloads}}
+// expected-error@+1{{OpenACC routine name 'templ<int>' names a set of overloads}}
 #pragma acc routine(templ<int>) seq
 // expected-error@+1{{OpenACC routine name 'NS::templ<int>' names a set of overloads}}
 #pragma acc routine(NS::templ<int>) seq
diff --git a/clang/test/ParserOpenACC/parse-wait-clause.c b/clang/test/ParserOpenACC/parse-wait-clause.c
index 16e31a67c094f..5c006b4379a27 100644
--- a/clang/test/ParserOpenACC/parse-wait-clause.c
+++ b/clang/test/ParserOpenACC/parse-wait-clause.c
@@ -85,19 +85,16 @@ void func() {
   #pragma acc parallel wait (devnum: i + j:queues:) clause-list
     {}
 
-  // expected-error@+4{{use of undeclared identifier 'devnum'}}
-  // expected-error@+3{{expected ','}}
+  // expected-error@+3{{use of undeclared identifier 'devnum'}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc parallel wait (queues:devnum: i + j
     {}
 
-  // expected-error@+2{{expected ','}}
   // expected-error@+1{{use of undeclared identifier 'devnum'}}
   #pragma acc parallel wait (queues:devnum: i + j)
     {}
 
-  // expected-error@+3{{expected ','}}
   // expected-error@+2{{use of undeclared identifier 'devnum'}}
   // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc parallel wait (queues:devnum: i + j) clause-list
diff --git a/clang/test/ParserOpenACC/parse-wait-construct.c b/clang/test/ParserOpenACC/parse-wait-construct.c
index 491c3bee4ac5a..27a3a02dc2637 100644
--- a/clang/test/ParserOpenACC/parse-wait-construct.c
+++ b/clang/test/ParserOpenACC/parse-wait-construct.c
@@ -68,18 +68,15 @@ void func() {
   // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc wait (devnum: i + j:queues:) clause-list
 
-  // expected-error@+4{{use of undeclared identifier 'devnum'}}
-  // expected-error@+3{{expected ','}}
+  // expected-error@+3{{use of undeclared identifier 'devnum'}}
   // expected-error@+2{{expected ')'}}
   // expected-note@+1{{to match this '('}}
   #pragma acc wait (queues:devnum: i + j
 
-  // expected-error@+2{{use of undeclared identifier 'devnum'}}
-  // expected-error@+1{{expected ','}}
+  // expected-error@+1{{use of undeclared identifier 'devnum'}}
   #pragma acc wait (queues:devnum: i + j)
 
-  // expected-error@+3{{use of undeclared identifier 'devnum'}}
-  // expected-error@+2{{expected ','}}
+  // expected-error@+2{{use of undeclared identifier 'devnum'}}
   // expected-error@+1{{invalid OpenACC clause 'clause'}}
   #pragma acc wait (queues:devnum: i + j) clause-list
 
diff --git a/clang/test/Sema/PR28181.c b/clang/test/Sema/PR28181.c
index 8d0a4ad33562a..7e9d5cc91038d 100644
--- a/clang/test/Sema/PR28181.c
+++ b/clang/test/Sema/PR28181.c
@@ -5,9 +5,9 @@ struct spinlock_t {
 } audit_skb_queue;
 
 void fn1(void) {
-  audit_skb_queue = (lock); // expected-error {{use of undeclared identifier 'lock'; did you mean 'long'?}}
-}                           // expected-error@-1 {{assigning to 'struct spinlock_t' from incompatible type '<overloaded function type>'}}
+  audit_skb_queue = (lock); // expected-error {{use of undeclared identifier 'lock'}}
+}
 
 void fn2(void) {
-  audit_skb_queue + (lock); // expected-error {{use of undeclared identifier 'lock'; did you mean 'long'?}}
-}                           // expected-error@-1 {{reference to overloaded function could not be resolved; did you mean to call it?}}
+  audit_skb_queue + (lock); // expected-error {{use of undeclared identifier 'lock'}}
+}
diff --git a/clang/test/Sema/builtin-unary-fp.c b/clang/test/Sema/builtin-unary-fp.c
index fb8e341156a59..9bfcb30b9eba3 100644
--- a/clang/test/Sema/builtin-unary-fp.c
+++ b/clang/test/Sema/builtin-unary-fp.c
@@ -17,5 +17,4 @@ void a(void) {
 
   check(__builtin_fpclassify(0,0,0,0,0, (invalid))); // expected-error{{use of undeclared identifier 'invalid'}}
   check(__builtin_fpclassify(0,0,0,0,0, (inf))); // expected-error{{use of undeclared identifier 'inf'}}
-                                                // expected-error@-1{{reference to overloaded function could not be resolved}}
 }
diff --git a/clang/test/Sema/c23-delayed-typo-correction-crashes.c b/clang/test/Sema/c23-delayed-typo-correction-crashes.c
new file mode 100644
index 0000000000000..6afd3fd32c366
--- /dev/null
+++ b/clang/test/Sema/c23-delayed-typo-correction-crashes.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -std=c23 -fsyntax-only -verify %s
+
+void GH139913(...);
+void GH139913_test() {
+  GH139913(CONCAT(foo, )); // expected-error {{use of undeclared identifier 'CONCAT'}} \
+                              expected-error {{use of undeclared identifier 'foo'}} \
+                              expected-error {{expected expression}}
+}
+
+struct GH137867 {
+ char value;
+};
+void GH137867_test() {
+  _Atomic(struct GH137867) t;
+  while (!atomic_load(&t.value)->value) // expected-error {{use of undeclared identifier 'atomic_load'}} \
+                                           expected-error {{accessing a member of an atomic structure or union is undefined behavior}}
+    ;
+}
diff --git a/clang/test/Sema/delayed-typo-correction-crashes.c b/clang/test/Sema/delayed-typo-correction-crashes.c
new file mode 100644
index 0000000000000..81c966789ccb5
--- /dev/null
+++ b/clang/test/Sema/delayed-typo-correction-crashes.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -fsyntax-only -fblocks -ffixed-point -verify %s
+
+void GH137860_test(void) {
+  struct S {
+    char h;
+  };
+  _Atomic struct S s = { .h = UINT8_MIN }; // expected-error {{use of undeclared identifier 'UINT8_MIN'}}
+  __c11_atomic_fetch_add(&s.h, UINT8_MIN); // expected-error {{use of undeclared identifier 'UINT8_MIN'}} \
+                                              expected-error {{accessing a member of an atomic structure or union is undefined behavior}}
+}
+
+int (^GH69470) (int i, int j) = ^(int i, int j)
+{ return i / j; }/ j; // expected-error {{use of undeclared identifier 'j'}}
+
+void GH69874(void) {
+  *a = (a_struct){0}; // expected-error {{use of undeclared identifier 'a'}} \
+                         expected-error {{use of undeclared identifier 'a_struct'}}
+}
diff --git a/clang/test/Sema/invalid-member.cpp b/clang/test/Sema/invalid-member.cpp
index 57ee187ccf4d5..0e3fec1b18eec 100644
--- a/clang/test/Sema/invalid-member.cpp
+++ b/clang/test/Sema/invalid-member.cpp
@@ -20,10 +20,12 @@ class Z {
 // Should be able to evaluate sizeof without crashing.
 static_assert(sizeof(Z) == 1, "No valid members");
 
-constexpr int N = undef; // expected-error {{use of undeclared identifier}}
+constexpr int N = undef; // expected-error {{use of undeclared identifier}} \
+                            expected-note {{declared here}}
 template<int a>
 class ABC {};
 class T {
-  ABC<N> abc;
+  ABC<N> abc; // expected-error {{non-type template argument is not a constant expression}} \
+                 expected-note {{initializer of 'N' is unknown}}
 };
 static_assert(sizeof(T) == 1, "No valid members");
diff --git a/clang/test/Sema/typo-correction-ambiguity.cpp b/clang/test/Sema/typo-correction-ambiguity.cpp
index 9dcff3d68c823..b2dae1d7696c3 100644
--- a/clang/test/Sema/typo-correction-ambiguity.cpp
+++ b/clang/test/Sema/typo-correction-ambiguity.cpp
@@ -18,12 +18,12 @@ void testAmbiguousNoSuggestions()
 
 namespace MultipleCorrectionsButNotAmbiguous
 {
-  int PrefixType_Name(int value);  // expected-note {{'PrefixType_Name' declared here}}
+  int PrefixType_Name(int value);
   int PrefixType_MIN();
   int PrefixType_MAX();
 };
 
 int testMultipleCorrectionsButNotAmbiguous() {
-  int val = MultipleCorrectionsButNotAmbiguous::PrefixType_Enum(0);  // expected-error {{no member named 'PrefixType_Enum' in namespace 'MultipleCorrectionsButNotAmbiguous'; did you mean 'PrefixType_Name'?}}
+  int val = MultipleCorrectionsButNotAmbiguous::PrefixType_Enum(0);  // expected-error {{no member named 'PrefixType_Enum' in namespace 'MultipleCorrectionsButNotAmbiguous'}}
   return val;
 }
diff --git a/clang/test/Sema/typo-correction-no-hang.c b/clang/test/Sema/typo-correction-no-hang.c
index e6041704ff324..da234a2c7373c 100644
--- a/clang/test/Sema/typo-correction-no-hang.c
+++ b/clang/test/Sema/typo-correction-no-hang.c
@@ -2,16 +2,15 @@
 
 // PR50797
 struct a {
-  int xxx; // expected-note {{'xxx' declared here}}
+  int xxx;
 };
 
 int g_107;
 int g_108;
 int g_109;
 
-struct a g_999; // expected-note 4{{'g_999' declared here}}
+struct a g_999;
 
-void b(void) { (g_910.xxx = g_910.xxx); } //expected-error 2{{use of undeclared identifier 'g_910'; did you mean 'g_999'}}
+void b(void) { (g_910.xxx = g_910.xxx); } //expected-error 2{{use of undeclared identifier 'g_910'}}
 
-void c(void) { (g_910.xxx = g_910.xxx1); } //expected-error 2{{use of undeclared identifier 'g_910'; did you mean 'g_999'}} \
-                                             expected-error {{no member named 'xxx1' in 'struct a'; did you mean 'xxx'}}
+void c(void) { (g_910.xxx = g_910.xxx1); } //expected-error 2{{use of undeclared identifier 'g_910'}}
diff --git a/clang/test/Sema/typo-correction-no-hang.cpp b/clang/test/Sema/typo-correction-no-hang.cpp
index 3c591645be25c..34b8486bed902 100644
--- a/clang/test/Sema/typo-correction-no-hang.cpp
+++ b/clang/test/Sema/typo-correction-no-hang.cpp
@@ -8,10 +8,12 @@ struct rdar38642201 {
 
 void rdar38642201_callee(int x, int y);
 void rdar38642201_caller() {
-  struct rdar38642201 structVar;
+  struct rdar38642201 structVar;      //expected-note 2{{'structVar' declared here}}
   rdar38642201_callee(
-      structVar1.fieldName1.member1,  //expected-error{{use of undeclared identifier 'structVar1'}}
-      structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}}
+      structVar1.fieldName1.member1,  //expected-error{{use of undeclared identifier 'structVar1'}} \
+                                        expected-error{{no member named 'fieldName1' in 'rdar38642201'}}
+      structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}} \
+                                        expected-error{{no member named 'fieldName2' in 'rdar38642201'}}
 }
 
 // Similar reproducer.
@@ -20,7 +22,7 @@ class A {
   int minut() const = delete;
   int hour() const = delete;
 
-  int longit() const; //expected-note{{'longit' declared here}}
+  int longit() const;
   int latit() const;
 };
 
@@ -35,6 +37,6 @@ int Foo(const B &b) {
 }
 
 int Bar(const B &b) {
-  return b.depar().longitude() + //expected-error{{no member named 'longitude' in 'A'; did you mean 'longit'?}}
+  return b.depar().longitude() + //expected-error{{no member named 'longitude' in 'A'}}
          b.depar().latitude();   //expected-error{{no member named 'latitude' in 'A'}}
 }
diff --git a/clang/test/Sema/typo-correction-recursive.cpp b/clang/test/Sema/typo-correction-recursive.cpp
index b39beb5493f65..a7d7127564b75 100644
--- a/clang/test/Sema/typo-correction-recursive.cpp
+++ b/clang/test/Sema/typo-correction-recursive.cpp
@@ -8,13 +8,13 @@
 class DeepClass
 {
 public:
-  void trigger() const;  // expected-note {{'trigger' declared here}}
+  void trigger() const;
 };
 
 class Y
 {
 public:
-  const DeepClass& getX() const { return m_deepInstance; }  // expected-note {{'getX' declared here}}
+  const DeepClass& getX() const { return m_deepInstance; }
 private:
   DeepClass m_deepInstance;
   int m_n;
@@ -23,7 +23,7 @@ class Y
 class Z
 {
 public:
-  const Y& getY0() const { return m_y0; }  // expected-note {{'getY0' declared here}}
+  const Y& getY0() const { return m_y0; }
   const Y& getActiveY() const { return m_y0; }
 
 private:
@@ -35,9 +35,9 @@ Z z_obj;
 
 void testMultipleCorrections()
 {
-  z_obj.getY2().  // expected-error {{no member named 'getY2' in 'Z'; did you mean 'getY0'}}
-      getM().     // expected-error {{no member named 'getM' in 'Y'; did you mean 'getX'}}
-      triggee();  // expected-error {{no member named 'triggee' in 'DeepClass'; did you mean 'trigger'}}
+  z_obj.getY2().  // expected-error {{no member named 'getY2' in 'Z'}}
+      getM().
+      triggee();
 }
 
 void testNoCorrections()
@@ -53,19 +53,19 @@ struct A {
   C get_me_a_C();
 };
 struct B {
-  D get_me_a_D();  // expected-note {{'get_me_a_D' declared here}}
+  D get_me_a_D();
 };
 class Scope {
 public:
   A make_an_A();
-  B make_a_B();  // expected-note {{'make_a_B' declared here}}
+  B make_a_B();
 };
 
 Scope scope_obj;
 
 int testDiscardedCorrections() {
-  return scope_obj.make_an_E().  // expected-error {{no member named 'make_an_E' in 'Scope'; did you mean 'make_a_B'}}
-      get_me_a_Z().value;        // expected-error {{no member named 'get_me_a_Z' in 'B'; did you mean 'get_me_a_D'}}
+  return scope_obj.make_an_E().  // expected-error {{no member named 'make_an_E' in 'Scope'}}
+      get_me_a_Z().value;
 }
 
 class AmbiguousHelper {
@@ -120,13 +120,13 @@ int testDeepAmbiguity() {
 }
 
 struct Dog {
-  int age;  //expected-note{{'age' declared here}}
-  int size; //expected-note{{'size' declared here}}
+  int age;
+  int size;
 };
 
 int from_dog_years(int DogYears, int DogSize);
 int get_dog_years() {
   struct Dog doggo;
-  return from_dog_years(doggo.agee,   //expected-error{{no member named 'agee' in 'Dog'; did you mean 'age'}}
-                        doggo.sizee); //expected-error{{no member named 'sizee' in 'Dog'; did you mean 'size'}}
+  return from_dog_years(doggo.agee,   //expected-error{{no member named 'agee' in 'Dog'}}
+                        doggo.sizee); //expected-error{{no member named 'sizee' in 'Dog'}}
 }
diff --git a/clang/test/Sema/typo-correction.c b/clang/test/Sema/typo-correction.c
index 4157207a9ac42..510a67e725f9c 100644
--- a/clang/test/Sema/typo-correction.c
+++ b/clang/test/Sema/typo-correction.c
@@ -50,10 +50,12 @@ void fn1(void) {
   cabs(errij);  // expected-error {{use of undeclared identifier 'errij'}}
 }
 
-extern long afunction(int);
+extern long afunction(int); // expected-note {{'afunction' declared here}} \
+                               expected-note {{passing argument to parameter here}}
 void fn2(void) {
   f(THIS_IS_AN_ERROR,       // expected-error {{use of undeclared identifier 'THIS_IS_AN_ERROR'}}
-    afunction(afunction_)); // expected-error {{use of undeclared identifier 'afunction_'}}
+    afunction(afunction_)); // expected-error {{use of undeclared identifier 'afunction_'}} \
+                               expected-error {{incompatible pointer to integer conversion passing 'long (int)' to parameter of type 'int'}}
 }
 
 int d = X ? d : L; // expected-error 2 {{use of undeclared identifier}}
@@ -94,22 +96,24 @@ struct rdar38642201 {
 
 void rdar38642201_callee(int x, int y);
 void rdar38642201_caller(void) {
-  struct rdar38642201 structVar;
+  struct rdar38642201 structVar;     // expected-note 2{{'structVar' declared here}}
   rdar38642201_callee(
-      structVar1.fieldName1.member1, //expected-error{{use of undeclared identifier 'structVar1'}}
-      structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}}
+      structVar1.fieldName1.member1, //expected-error{{use of undeclared identifier 'structVar1'}} \
+                                       expected-error{{no member named 'fieldName1' in 'struct rdar38642201'}}
+      structVar2.fieldName2.member2); //expected-error{{use of undeclared identifier 'structVar2'}} \
+                                        expected-error{{no member named 'fieldName2' in 'struct rdar38642201'}}
 }
 
 void PR40286_g(int x, int y);
 void PR40286_h(int x, int y, int z);
-void PR40286_1(int the_value) {
-  PR40286_g(the_walue); // expected-error {{use of undeclared identifier 'the_walue'}}
+void PR40286_1(int the_value) { // expected-note {{'the_value' declared here}}
+  PR40286_g(the_walue, 0); // expected-error {{use of undeclared identifier 'the_walue'}}
 }
-void PR40286_2(int the_value) {
-  PR40286_h(the_value, the_walue); // expected-error {{use of undeclared identifier 'the_walue'}}
+void PR40286_2(int the_value) { // expected-note {{'the_value' declared here}}
+  PR40286_h(the_value, the_walue, 0); // expected-error {{use of undeclared identifier 'the_walue'}}
 }
-void PR40286_3(int the_value) {
-  PR40286_h(the_walue); // expected-error {{use of undeclared identifier 'the_walue'}}
+void PR40286_3(int the_value) { // expected-note {{'the_value' declared here}}
+  PR40286_h(the_walue, 0, 0); // expected-error {{use of undeclared identifier 'the_walue'}}
 }
 void PR40286_4(int the_value) { // expected-note {{'the_value' declared here}}
   PR40286_h(the_value, the_value, the_walue); // expected-error {{use of undeclared identifier 'the_walue'; did you mean 'the_value'?}}
diff --git a/clang/test/SemaCXX/arrow-operator.cpp b/clang/test/SemaCXX/arrow-operator.cpp
index 295dea3c1756c..a789c4e36e4c9 100644
--- a/clang/test/SemaCXX/arrow-operator.cpp
+++ b/clang/test/SemaCXX/arrow-operator.cpp
@@ -47,23 +47,22 @@ class wrapped_ptr {
  public:
   wrapped_ptr(T* ptr) : ptr_(ptr) {}
   T* operator->() { return ptr_; }
-  void Check(); // expected-note {{'Check' declared here}}
+  void Check();
  private:
   T *ptr_;
 };
 
 class Worker {
  public:
-  void DoSomething(); // expected-note {{'DoSomething' declared here}}
+  void DoSomething();
   void Chuck();
 };
 
 void test() {
   wrapped_ptr<Worker> worker(new Worker);
   worker.DoSomething(); // expected-error {{no member named 'DoSomething' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'; did you mean to use '->' instead of '.'?}}
-  worker.DoSamething(); // expected-error {{no member named 'DoSamething' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'; did you mean to use '->' instead of '.'?}} \
-                        // expected-error {{no member named 'DoSamething' in 'arrow_suggest::Worker'; did you mean 'DoSomething'?}}
-  worker.Chuck(); // expected-error {{no member named 'Chuck' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'; did you mean 'Check'?}}
+  worker.DoSamething(); // expected-error {{no member named 'DoSamething' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'}}
+  worker.Chuck(); // expected-error {{no member named 'Chuck' in 'arrow_suggest::wrapped_ptr<arrow_suggest::Worker>'}}
 }
 
 } // namespace arrow_suggest
diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index eeeb58f1a771a..ab4e50072f654 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -1888,10 +1888,11 @@ namespace PR15884 {
 }
 
 namespace AfterError {
-  constexpr int error() {
+  constexpr int error() { // pre-cxx23-error {{no return statement in constexpr function}}
     return foobar; // expected-error {{undeclared identifier}}
-  }
-  constexpr int k = error(); // expected-error {{constexpr variable 'k' must be initialized by a constant expression}}
+  } // cxx23-note {{control reached end of constexpr function}}
+  constexpr int k = error(); // cxx23-error {{constexpr variable 'k' must be initialized by a constant expression}} \
+                                cxx23-note {{in call to 'error()'}}
 }
 
 namespace std {
diff --git a/clang/test/SemaCXX/conversion-function.cpp b/clang/test/SemaCXX/conversion-function.cpp
index b653a3bf1a1d2..717c73c4786eb 100644
--- a/clang/test/SemaCXX/conversion-function.cpp
+++ b/clang/test/SemaCXX/conversion-function.cpp
@@ -458,7 +458,7 @@ namespace PR18234 {
 #endif
   } a;
   A::S s = a; // expected-error {{no viable conversion from 'struct A' to 'A::S'}}
-  A::E e = a;
+  A::E e = a; // expected-note {{'e' declared here}}
   bool k1 = e == A::e; // expected-error {{no member named 'e'}}
   bool k2 = e.n == 0;
 }
diff --git a/clang/test/SemaCXX/coroutines.cpp b/clang/test/SemaCXX/coroutines.cpp
index 068fdab4bfe38..c9cefeb30c15a 100644
--- a/clang/test/SemaCXX/coroutines.cpp
+++ b/clang/test/SemaCXX/coroutines.cpp
@@ -8,19 +8,16 @@
 // RUN: not %clang_cc1 -std=c++20 -fsyntax-only %s -fcxx-exceptions -fexceptions -Wunused-result 2>&1 | FileCheck %s
 
 void no_coroutine_traits_bad_arg_await() {
-  co_await a; // expected-error {{include <coroutine>}}
-  // expected-error@-1 {{use of undeclared identifier 'a'}}
+  co_await a; // expected-error {{use of undeclared identifier 'a'}}
 }
 
 void no_coroutine_traits_bad_arg_yield() {
-  co_yield a; // expected-error {{include <coroutine>}}
-  // expected-error@-1 {{use of undeclared identifier 'a'}}
+  co_yield a; // expected-error {{use of undeclared identifier 'a'}}
 }
 
 
 void no_coroutine_traits_bad_arg_return() {
-  co_return a; // expected-error {{include <coroutine>}}
-  // expected-error@-1 {{use of undeclared identifier 'a'}}
+  co_return a; // expected-error {{use of undeclared identifier 'a'}}
 }
 
 void no_coroutine_traits() {
@@ -208,8 +205,7 @@ void mixed_yield() {
 
 void mixed_yield_invalid() {
   co_yield blah; // expected-error {{use of undeclared identifier}}
-  // expected-note@-1 {{function is a coroutine due to use of 'co_yield'}}
-  return; // expected-error {{return statement not allowed in coroutine}}
+  return;
 }
 
 void mixed_yield_return_first(bool b) {
@@ -231,8 +227,7 @@ void mixed_return_for_range(bool b, T t) {
 template <class T>
 void mixed_yield_template(T) {
   co_yield blah; // expected-error {{use of undeclared identifier}}
-  // expected-note@-1 {{function is a coroutine due to use of 'co_yield'}}
-  return; // expected-error {{return statement not allowed in coroutine}}
+  return;
 }
 
 template <class T>
@@ -314,10 +309,9 @@ template void mixed_coreturn_template(void_tag, bool, int); // expected-note {{r
 template <class T>
 void mixed_coreturn_template2(bool b, T) {
   if (b)
-    co_return v; // expected-note {{use of 'co_return'}}
-    // expected-error@-1 {{use of undeclared identifier 'v'}}
+    co_return v; // expected-error {{use of undeclared identifier 'v'}}
   else
-    return; // expected-error {{not allowed in coroutine}}
+    return;
 }
 
 struct promise_handle;
diff --git a/clang/test/SemaCXX/cxx-delayed-typo-correction-crashes.cpp b/clang/test/SemaCXX/cxx-delayed-typo-correction-crashes.cpp
new file mode 100644
index 0000000000000..f3aa051532815
--- /dev/null
+++ b/clang/test/SemaCXX/cxx-delayed-typo-correction-crashes.cpp
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+namespace GH138850 {
+void test() {
+  int tmp = add(int, 0, 0);    // expected-error {{expected '(' for function-style cast or type construction}} \
+                                  expected-note {{previous definition is here}}
+  uint tmp = add(uint, 1, 1);  // expected-error {{use of undeclared identifier 'uint'; did you mean 'int'?}} \
+                                  expected-error {{redefinition of 'tmp'}} \
+                                  expected-error {{use of undeclared identifier 'uint'}}
+  call(void, f, (int)tmp);     // expected-error {{expected '(' for function-style cast or type construction}} \
+                                  expected-error {{use of undeclared identifier 'f'}}
+}
+}
+
+namespace GH107840 {
+struct tm {};          // expected-note {{'tm' declared here}}
+
+auto getCache = [&] {  // expected-error {{non-local lambda expression cannot have a capture-default}}
+  ::foo([=] {          // expected-error {{no member named 'foo' in the global namespace}}
+    tms time;          // expected-error {{unknown type name 'tms'; did you mean 'tm'?}}
+    (void)time;
+  });
+};
+}
+
+namespace GH59391 {
+template <typename b> class c {
+  c(b);
+  b e;
+  void f() {
+    for (auto core : a::c(cores)) { // expected-error {{use of undeclared identifier 'cores'}} \
+                                       expected-error {{use of undeclared identifier 'a'}}
+    }
+  }
+};
+}
+
+namespace GH45915 {
+short g_volatile_ushort;                   // expected-note {{'g_volatile_ushort' declared here}}
+namespace a {
+   int b = l_volatile_uwchar.a ::c ::~d<>; // expected-error {{use of undeclared identifier 'l_volatile_uwchar'}} \
+                                              expected-error {{no member named 'd' in namespace 'GH45915::a'}}
+}
+}
+
+namespace GH45891 {
+int a = b.c < enum , > :: template ~d < > [ e; // expected-error {{use of undeclared identifier 'b'}} \
+                                                  expected-error {{expected identifier or '{'}} \
+                                                  expected-error {{expected ';' after top level declarator}}
+}
+
+namespace GH32903 {
+void
+B(
+  char cat_dog_3, char cat_dog_2, char cat_dog_1, char cat_dog_0, char pigeon_dog_3, char pigeon_dog_2,
+  char pigeon_dog_1, char pigeon_dog_0, short &elefant15_lion, short &elefant14_lion, short &elefant13_lion,       // expected-note 3 {{declared here}}
+  short &elefant12_lion, short &elefant11_lion, short &elefant10_lion, short &elefant9_lion, short &elefant8_lion, // expected-note 5 {{declared here}}
+  short &elefant7_lion, short &elefant6_lion, short &elefant5_lion, short &elefant4_lion, short &elefant3_lion,    // expected-note 2 {{declared here}}
+  short &elefant2_lion, short &elefant1_lion, short &elefant0_lion, char& no_animal)
+{
+
+    A(  // FIXME: it's surprising that we don't issue a "use of undeclared identifier" diagnostic for the call itself.
+        elefant_15_lion, elefant_14_lion, elefant_13_lion, elefant_12_lion, elefant_11_lion, elefant_10_lion, elefant_9_lion, // expected-error 7 {{use of undeclared identifier}}
+        elefant_8_lion, elefant_7_lion, elefant_6_lion, elefant_5_lion, elefant_4_lion, elefant_3_lion, elefant_2_lion,       // expected-error 7 {{use of undeclared identifier}}
+        elefant_1_lion, elefant_0_lion, no_animal, other_mammal);                                                             // expected-error 3 {{use of undeclared identifier}}
+}
+}
diff --git a/clang/test/SemaCXX/cxx1z-decomposition.cpp b/clang/test/SemaCXX/cxx1z-decomposition.cpp
index 95c64bc3b8bff..6ee1249a66c3f 100644
--- a/clang/test/SemaCXX/cxx1z-decomposition.cpp
+++ b/clang/test/SemaCXX/cxx1z-decomposition.cpp
@@ -121,7 +121,8 @@ void for_range() {
 }
 
 int error_recovery() {
-  auto [foobar]; // expected-error {{requires an initializer}}
+  auto [foobar]; // expected-error {{requires an initializer}} \
+                    expected-note {{'foobar' declared here}}
   return foobar_; // expected-error {{undeclared identifier 'foobar_'}}
 }
 
diff --git a/clang/test/SemaCXX/cxx20-delayed-typo-correction-crashes.cpp b/clang/test/SemaCXX/cxx20-delayed-typo-correction-crashes.cpp
new file mode 100644
index 0000000000000..a16a7f8255f7c
--- /dev/null
+++ b/clang/test/SemaCXX/cxx20-delayed-typo-correction-crashes.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++20 %s
+
+#include "Inputs/std-coroutine.h"
+
+namespace GH58172 {
+template<typename Fn>
+int f2(int, Fn&&)
+{
+  return 0;
+}
+
+int f1()
+{
+  return f2(v1, []() -> task<int> {   // expected-error {{no template named 'task'}} \
+                                         expected-error {{use of undeclared identifier 'v1'}}
+    co_return v2;                     // expected-error {{use of undeclared identifier 'v2'}}
+  });
+}
+}
diff --git a/clang/test/SemaCXX/cxx2a-adl-only-template-id.cpp b/clang/test/SemaCXX/cxx2a-adl-only-template-id.cpp
index 5c0d89d9125f2..1bc7f2cce3c92 100644
--- a/clang/test/SemaCXX/cxx2a-adl-only-template-id.cpp
+++ b/clang/test/SemaCXX/cxx2a-adl-only-template-id.cpp
@@ -61,7 +61,7 @@ struct A : X<int> { // expected-error {{no template named 'X'}}
 // Similarly for treating overload sets of functions as template names.
 struct g<int> {}; // expected-error {{'g' refers to a function template}}
 g<int>::Y xy; // expected-error {{no template named 'g'}} FIXME lies
-void xf(g<int> x); // expected-error {{variable has incomplete type 'void'}} expected-error 1+{{}} expected-note {{}}
+void xf(g<int> x); // expected-error {{variable has incomplete type 'void'}} expected-error 1+{{}}
 struct B : g<int> { // expected-error {{expected class name}}
   B() : g<int>() {} // expected-error {{expected class member or base class name}}
 };
diff --git a/clang/test/SemaCXX/destructor.cpp b/clang/test/SemaCXX/destructor.cpp
index ed4802943ad3f..b9e0b17d510ab 100644
--- a/clang/test/SemaCXX/destructor.cpp
+++ b/clang/test/SemaCXX/destructor.cpp
@@ -553,14 +553,11 @@ namespace crash_on_invalid_base_dtor {
 struct Test {
   virtual ~Test();
 };
-struct Baz : public Test { // expected-warning {{non-virtual destructor}}
+struct Baz : public Test {
   Baz() {}
-  ~Baz() = defaul; // expected-error {{undeclared identifier 'defaul'}} \
-                   // expected-error {{initializer on function}} \
-                   // expected-note {{overridden virtual function is here}}
+  ~Baz() = defaul; // expected-error {{undeclared identifier 'defaul'}}
 };
-struct Foo : public Baz { // expected-error {{cannot override a non-deleted function}} \
-                          // expected-note {{destructor of 'Foo' is implicitly deleted}}
+struct Foo : public Baz {
   Foo() {}
 };
 }
@@ -579,11 +576,9 @@ static_assert(!__is_trivially_constructible(Foo, Foo &&), "");
 
 namespace GH97230 {
 struct X {
-  ~X() = defaul; // expected-error {{initializer on function does not look like a pure-specifier}} \
-                 // expected-error {{use of undeclared identifier 'defaul'}}
+  ~X() = defaul; // expected-error {{use of undeclared identifier 'defaul'}}
 };
-struct Y : X {} y1{ }; // expected-error {{call to implicitly-deleted default constructor of 'struct Y'}} \
-                       // expected-note {{default constructor of 'Y' is implicitly deleted because base class 'X' has no destructor}}
+struct Y : X {} y1{ };
 }
 
 namespace GH121706 {
diff --git a/clang/test/SemaCXX/invalid-if-constexpr.cpp b/clang/test/SemaCXX/invalid-if-constexpr.cpp
index 0007f2739cbbd..9f27741871484 100644
--- a/clang/test/SemaCXX/invalid-if-constexpr.cpp
+++ b/clang/test/SemaCXX/invalid-if-constexpr.cpp
@@ -2,12 +2,16 @@
 
 namespace GH61885 {
 void similar() { // expected-note {{'similar' declared here}}
-  if constexpr (similer<>) {} // expected-error {{use of undeclared identifier 'similer'; did you mean 'similar'?}}
+  if constexpr (similer<>) {} // expected-error {{use of undeclared identifier 'similer'; did you mean 'similar'?}} \
+                                 expected-warning {{address of function 'similar<>' will always evaluate to 'true'}} \
+                                 expected-note {{prefix with the address-of operator to silence this warning}}
 }
-void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'; did you mean '__sync_swap'?}}
+void a() { if constexpr (__adl_swap<>) {}} // expected-error{{use of undeclared identifier '__adl_swap'}}
 
 int AA() { return true;} // expected-note {{'AA' declared here}}
 
-void b() { if constexpr (AAA<>) {}} // expected-error {{use of undeclared identifier 'AAA'; did you mean 'AA'?}}
+void b() { if constexpr (AAA<>) {}} // expected-error {{use of undeclared identifier 'AAA'; did you mean 'AA'?}} \
+                                       expected-warning {{address of function 'AA<>' will always evaluate to 'true'}} \
+                                       expected-note {{prefix with the address-of operator to silence this warning}}
 }
 
diff --git a/clang/test/SemaCXX/member-expr.cpp b/clang/test/SemaCXX/member-expr.cpp
index 0596e40f6c2f6..902b09097a120 100644
--- a/clang/test/SemaCXX/member-expr.cpp
+++ b/clang/test/SemaCXX/member-expr.cpp
@@ -96,11 +96,11 @@ namespace test5 {
 namespace PR7508 {
   struct A {
     struct CleanupScope {};
-    void PopCleanupBlock(); // expected-note{{'PopCleanupBlock' declared here}}
+    void PopCleanupBlock();
   };
 
   void foo(A &a) {
-    a.PopCleanupScope(); // expected-error{{no member named 'PopCleanupScope' in 'PR7508::A'; did you mean 'PopCleanupBlock'?}}
+    a.PopCleanupScope(); // expected-error{{no member named 'PopCleanupScope' in 'PR7508::A'}}
   }
 }
 
@@ -189,7 +189,7 @@ namespace PR15045 {
   }
 
   struct bar {
-    void func();  // expected-note {{'func' declared here}}
+    void func();
   };
 
   struct foo {
@@ -207,7 +207,7 @@ namespace PR15045 {
 
     // Show that recovery has happened by also triggering typo correction
     e->Func();  // expected-error {{member reference type 'bar' is not a pointer; did you mean to use '.'?}} \
-                // expected-error {{no member named 'Func' in 'PR15045::bar'; did you mean 'func'?}}
+                // expected-error {{no member named 'Func' in 'PR15045::bar'}}
 
     // Make sure a fixit isn't given in the case that the '->' isn't actually
     // the problem (the problem is with the return value of an operator->).
diff --git a/clang/test/SemaCXX/nested-name-spec.cpp b/clang/test/SemaCXX/nested-name-spec.cpp
index 36398aed7ac5f..abeaba9d8dde2 100644
--- a/clang/test/SemaCXX/nested-name-spec.cpp
+++ b/clang/test/SemaCXX/nested-name-spec.cpp
@@ -409,7 +409,8 @@ T1<C2::N1> var_1a;
 T1<C2:N1> var_1b;  // expected-error{{unexpected ':' in nested name specifier; did you mean '::'?}}
 template<int N> int F() {}
 int (*X1)() = (B1::B2 ? F<1> : F<2>);
-int (*X2)() = (B1:B2 ? F<1> : F<2>);  // expected-error{{unexpected ':' in nested name specifier; did you mean '::'?}}
+int (*X2)() = (B1:B2 ? F<1> : F<2>);  // expected-error{{unexpected ':' in nested name specifier; did you mean '::'?}} \
+                                         expected-note{{'PR18587::X2' declared here}}
 
 // Bit fields + templates
 struct S7a {
@@ -445,7 +446,8 @@ namespace PR16951 {
 
   int x4 = enumerator_2::ENUMERATOR_2; // expected-warning{{use of enumeration in a nested name specifier is a C++11 extension}}
   int x5 = enumerator_2::X2; // expected-warning{{use of enumeration in a nested name specifier is a C++11 extension}} \
-                             // expected-error{{no member named 'X2' in 'PR16951::enumerator_2'}}
+                             // expected-error{{no member named 'X2' in 'PR16951::enumerator_2'}} \
+                             // expected-error{{cannot initialize a variable of type 'int' with an lvalue of type 'int (*)()'}}
 
 }
 
diff --git a/clang/test/SemaCXX/pr13394-crash-on-invalid.cpp b/clang/test/SemaCXX/pr13394-crash-on-invalid.cpp
deleted file mode 100644
index 304ee92f6a8da..0000000000000
--- a/clang/test/SemaCXX/pr13394-crash-on-invalid.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-// Don't crash (PR13394).
-
-namespace stretch_v1 {
-  struct closure_t {
-    const stretch_v1::ops_t* d_methods; // expected-error {{no type named 'ops_t' in namespace 'stretch_v1'}}
-  };
-}
-namespace gatekeeper_v1 {
-  namespace gatekeeper_factory_v1 {
-    struct closure_t { // expected-note {{'closure_t' declared here}} expected-note {{'gatekeeper_factory_v1::closure_t' declared here}}
-      gatekeeper_v1::closure_t* create(); // expected-error {{no type named 'closure_t' in namespace 'gatekeeper_v1'; did you mean simply 'closure_t'?}}
-    };
-  }
-  // FIXME: Typo correction should remove the 'gatekeeper_v1::' name specifier
-  gatekeeper_v1::closure_t *x; // expected-error {{no type named 'closure_t' in namespace 'gatekeeper_v1'; did you mean 'gatekeeper_factory_v1::closure_t'}}
-}
-
-namespace Foo {
-struct Base {
-  void Bar() {} // expected-note{{'Bar' declared here}}
-};
-}
-
-struct Derived : public Foo::Base {
-  void test() {
-    Foo::Bar(); // expected-error{{no member named 'Bar' in namespace 'Foo'; did you mean simply 'Bar'?}}
-  }
-};
diff --git a/clang/test/SemaCXX/return.cpp b/clang/test/SemaCXX/return.cpp
index 17d7892d8dbd9..796c9ae91dedc 100644
--- a/clang/test/SemaCXX/return.cpp
+++ b/clang/test/SemaCXX/return.cpp
@@ -130,5 +130,5 @@ void cxx_unresolved_expr() {
   // CXXUnresolvedConstructExpr, and the missing ')' gives it an invalid source
   // location for its rparen.  Check that emitting a diag on the range of the
   // expr doesn't assert.
-  return int(undeclared, 4; // expected-error {{expected ')'}} expected-note{{to match this '('}} expected-error {{use of undeclared identifier 'undeclared'}}
+  return int(undeclared, 4; // expected-error {{use of undeclared identifier 'undeclared'}}
 }
diff --git a/clang/test/SemaCXX/typo-correction-crash.cpp b/clang/test/SemaCXX/typo-correction-crash.cpp
index 2a77c9df505e8..434b70e3c5097 100644
--- a/clang/test/SemaCXX/typo-correction-crash.cpp
+++ b/clang/test/SemaCXX/typo-correction-crash.cpp
@@ -4,10 +4,10 @@ auto check1() {
   return s; // expected-error {{use of undeclared identifier 's'}}
 }
 
-int test = 11; // expected-note 2 {{'test' declared here}}
+int test = 11; // expected-note 3 {{'test' declared here}}
 auto check2() {
   return "s";
-  return tes; // expected-error {{use of undeclared identifier 'tes'; did you mean 'test'?}}
+  return tes; // expected-error {{use of undeclared identifier 'tes'}}
               // expected-error@-1 {{deduced as 'int' here but deduced as 'const char *' in earlier}}
 }
 
@@ -16,9 +16,8 @@ template <class A> struct is_same<A,A> { static constexpr bool value = true; };
 
 auto L1 = [] { return s; }; // expected-error {{use of undeclared identifier 's'}}
 using T1 = decltype(L1());
-// FIXME: Suppress the 'undeclared identifier T1' diagnostic, the UsingDecl T1 is discarded because of an invalid L1().
-static_assert(is_same<T1, void>::value, "Return statement should be discarded"); // expected-error {{use of undeclared identifier 'T1'}}
-auto L2 = [] { return tes; }; // expected-error {{use of undeclared identifier 'tes'; did you mean 'test'?}}
+static_assert(is_same<T1, void>::value, "Return statement should be discarded");
+auto L2 = [] { return tes; }; // expected-error {{use of undeclared identifier 'tes'}}
 using T2 = decltype(L2());
 static_assert(is_same<T2, int>::value, "Return statement was corrected");
 
@@ -32,13 +31,13 @@ FooRecord::NestedNamespace::type x; // expected-error {{no member named 'NestedN
 
 void cast_expr(int g) { +int(n)(g); } // expected-error {{undeclared identifier 'n'}}
 
-void bind() { for (const auto& [test,_] : _test_) { }; } // expected-error {{undeclared identifier '_test_'}}
+void bind() { for (const auto& [test,_] : _test_) { }; } // expected-error {{undeclared identifier '_test_'}} \
+                                                            expected-error {{invalid range expression of type 'int'; no viable 'begin' function available}}
 
 namespace NoCrash {
 class S {
   void Function(int a) {
-    unknown1(unknown2, Function, unknown3); // expected-error 2{{use of undeclared identifier}} \
-                                               expected-error {{reference to non-static member function must be called}}
+    unknown1(unknown2, Function, unknown3); // expected-error 2{{use of undeclared identifier}}
   }
 };
 }
@@ -46,8 +45,6 @@ class S {
 namespace NoCrashOnCheckArgAlignment {
 template <typename a> void b(a &);
 void test() {
-  for (auto file_data :b(files_db_data)); // expected-error {{use of undeclared identifier 'files_db_data'; did you mean 'file_data'?}} \
-                                          // expected-note {{'file_data' declared here}} \
-                                          // expected-error {{cannot use type 'void' as a range}}
+  for (auto file_data :b(files_db_data)); // expected-error {{use of undeclared identifier 'files_db_data'}}
 }
 }
diff --git a/clang/test/SemaCXX/typo-correction-cxx11.cpp b/clang/test/SemaCXX/typo-correction-cxx11.cpp
index 8c588203cc128..9eb5f9c299629 100644
--- a/clang/test/SemaCXX/typo-correction-cxx11.cpp
+++ b/clang/test/SemaCXX/typo-correction-cxx11.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
 
 namespace PR23186 {
-decltype(ned);  // expected-error-re {{use of undeclared identifier 'ned'{{$}}}}
+decltype(ned);  // expected-error {{use of undeclared identifier 'ned'}}
 // The code below was triggering an UNREACHABLE in ASTContext::getTypeInfoImpl
 // once the above code failed to recover properly after making the bogus
 // correction of 'ned' to 'new'.
@@ -19,8 +19,9 @@ struct S {
 namespace PR23140 {
 auto lneed = gned.*[] {};  // expected-error-re {{use of undeclared identifier 'gned'{{$}}}}
 
-void test(int aaa, int bbb, int thisvar) {  // expected-note {{'thisvar' declared here}}
-  int thatval = aaa * (bbb + thatvar);  // expected-error {{use of undeclared identifier 'thatvar'; did you mean 'thisvar'?}}
+void test(int aaa, int bbb, int thisvar) {
+  int thatval = aaa * (bbb + thatvar);  // expected-error {{use of undeclared identifier 'thatvar'; did you mean 'thatval'}} \
+                                           expected-note {{'thatval' declared here}}
 }
 }
 
@@ -54,7 +55,7 @@ void run(A *annotations) {
 
   auto &annotation = *annotations;
   auto new_it = new_annotations.find(5);
-  auto &new_anotation = new_it.second;  // expected-note {{'new_anotation' declared here}}
-  new_annotation->Swap(&annotation);  // expected-error {{use of undeclared identifier 'new_annotation'; did you mean 'new_anotation'?}}
+  auto &new_anotation = new_it.second;
+  new_annotation->Swap(&annotation);  // expected-error {{use of undeclared identifier 'new_annotation'}}
 }
 }
diff --git a/clang/test/SemaCXX/typo-correction-delayed.cpp b/clang/test/SemaCXX/typo-correction-delayed.cpp
deleted file mode 100644
index fdb1f740fda6a..0000000000000
--- a/clang/test/SemaCXX/typo-correction-delayed.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -Wno-c++11-extensions %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 -Wno-c++11-extensions %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-
-struct A {};
-struct B {};
-struct D {
-  A fizbin;  // expected-note 2 {{declared here}}
-  A foobar;  // expected-note 2 {{declared here}}
-  B roxbin;  // expected-note 2 {{declared here}}
-  B toobad;  // expected-note 2 {{declared here}}
-  void BooHoo();
-  void FoxBox();
-};
-
-void something(A, B);
-void test() {
-  D obj;
-  something(obj.fixbin,   // expected-error {{did you mean 'fizbin'?}}
-            obj.toobat);  // expected-error {{did you mean 'toobad'?}}
-  something(obj.toobat,   // expected-error {{did you mean 'foobar'?}}
-            obj.fixbin);  // expected-error {{did you mean 'roxbin'?}}
-  something(obj.fixbin,   // expected-error {{did you mean 'fizbin'?}}
-            obj.fixbin);  // expected-error {{did you mean 'roxbin'?}}
-  something(obj.toobat,   // expected-error {{did you mean 'foobar'?}}
-            obj.toobat);  // expected-error {{did you mean 'toobad'?}}
-  // Both members could be corrected to methods, but that isn't valid.
-  something(obj.boohoo,   // expected-error-re {{no member named 'boohoo' in 'D'{{$}}}}
-            obj.foxbox);  // expected-error-re {{no member named 'foxbox' in 'D'{{$}}}}
-  // The first argument has a usable correction but the second doesn't.
-  something(obj.boobar,   // expected-error-re {{no member named 'boobar' in 'D'{{$}}}}
-            obj.foxbox);  // expected-error-re {{no member named 'foxbox' in 'D'{{$}}}}
-}
-
-// Ensure the delayed typo correction does the right thing when trying to
-// recover using a seemingly-valid correction for which a valid expression to
-// replace the TypoExpr cannot be created (but which does have a second
-// correction candidate that would be a valid and usable correction).
-class Foo {
-public:
-  template <> void testIt();  // expected-error {{no function template matches}}
-  void textIt();  // expected-note {{'textIt' declared here}}
-};
-void testMemberExpr(Foo *f) {
-  f->TestIt();  // expected-error {{no member named 'TestIt' in 'Foo'; did you mean 'textIt'?}}
-}
-
-void callee(double, double);
-void testNoCandidates() {
-  callee(xxxxxx,   // expected-error-re {{use of undeclared identifier 'xxxxxx'{{$}}}}
-         zzzzzz);  // expected-error-re {{use of undeclared identifier 'zzzzzz'{{$}}}}
-}
-
-class string {};
-struct Item {
-  void Nest();
-  string text();
-  Item* next();  // expected-note {{'next' declared here}}
-};
-void testExprFilter(Item *i) {
-  Item *j;
-  j = i->Next();  // expected-error {{no member named 'Next' in 'Item'; did you mean 'next'?}}
-}
-
-// Test that initializer expressions are handled correctly and that the type
-// being initialized is taken into account when choosing a correction.
-namespace initializerCorrections {
-struct Node {
-  string text() const;
-  // Node* Next() is not implemented yet
-};
-void f(Node *node) {
-  // text is only an edit distance of 1 from Next, but would trigger type
-  // conversion errors if used in this initialization expression.
-  Node *next = node->Next();  // expected-error-re {{no member named 'Next' in 'initializerCorrections::Node'{{$}}}}
-}
-
-struct LinkedNode {
-  LinkedNode* next();  // expected-note {{'next' declared here}}
-  string text() const;
-};
-void f(LinkedNode *node) {
-  // text and next are equidistant from Next, but only one results in a valid
-  // initialization expression.
-  LinkedNode *next = node->Next();  // expected-error {{no member named 'Next' in 'initializerCorrections::LinkedNode'; did you mean 'next'?}}
-}
-
-struct NestedNode {
-  NestedNode* Nest();
-  NestedNode* next();
-  string text() const;
-};
-void f(NestedNode *node) {
-  // There are two equidistant, usable corrections for Next: next and Nest
-  NestedNode *next = node->Next();  // expected-error-re {{no member named 'Next' in 'initializerCorrections::NestedNode'{{$}}}}
-}
-}
-
-namespace PR21669 {
-void f(int *i) {
-  // Check that arguments to a builtin with custom type checking are corrected
-  // properly, since calls to such builtins bypass much of the normal code path
-  // for building and checking the call.
-  __atomic_load(i, i, something_something);  // expected-error-re {{use of undeclared identifier 'something_something'{{$}}}}
-}
-}
-
-const int DefaultArg = 9;  // expected-note {{'DefaultArg' declared here}}
-template <int I = defaultArg> struct S {};  // expected-error {{use of undeclared identifier 'defaultArg'; did you mean 'DefaultArg'?}}
-S<1> s;
-
-namespace foo {}
-void test_paren_suffix() {
-  foo::bar({5, 6});  // expected-error-re {{no member named 'bar' in namespace 'foo'{{$}}}}
-#if __cplusplus <= 199711L
-  // expected-error@-2 {{expected expression}}
-#endif
-}
-
-const int kNum = 10;  // expected-note {{'kNum' declared here}}
-class SomeClass {
-  int Kind;
-public:
-  explicit SomeClass() : Kind(kSum) {}  // expected-error {{use of undeclared identifier 'kSum'; did you mean 'kNum'?}}
-};
-
-// There used to be an issue with typo resolution inside overloads.
-struct AssertionResult { ~AssertionResult(); };
-AssertionResult Overload(const char *a);
-AssertionResult Overload(int a);
-void UseOverload() {
-  // expected-note@+1 {{'result' declared here}}
-  const char *result;
-  // expected-error@+1 {{use of undeclared identifier 'resulta'; did you mean 'result'?}}
-  Overload(resulta);
-}
-
-namespace PR21925 {
-struct X {
-  int get() { return 7; }  // expected-note {{'get' declared here}}
-};
-void test() {
-  X variable;  // expected-note {{'variable' declared here}}
-
-  // expected-error@+2 {{use of undeclared identifier 'variableX'; did you mean 'variable'?}}
-  // expected-error@+1 {{no member named 'getX' in 'PR21925::X'; did you mean 'get'?}}
-  int x = variableX.getX();
-}
-}
-
-namespace PR21905 {
-int (*a)() = (void)Z; // expected-error-re {{use of undeclared identifier 'Z'{{$}}}} \
-                      // expected-error {{cannot initialize a variable of type 'int (*)()' with an rvalue of type 'void'}}
-}
-
-namespace PR21947 {
-int blue;  // expected-note {{'blue' declared here}}
-__typeof blur y;  // expected-error {{use of undeclared identifier 'blur'; did you mean 'blue'?}}
-}
-
-namespace PR22092 {
-a = b ? : 0;  // expected-error {{a type specifier is required for all declarations}} \
-              // expected-error-re {{use of undeclared identifier 'b'{{$}}}}
-}
-
-extern long clock (void);
-struct Pointer {
-  void set_xpos(int);
-  void set_ypos(int);
-};
-void MovePointer(Pointer &Click, int x, int y) {  // expected-note 2 {{'Click' declared here}}
-  click.set_xpos(x);  // expected-error {{use of undeclared identifier 'click'; did you mean 'Click'?}}
-  click.set_ypos(x);  // expected-error {{use of undeclared identifier 'click'; did you mean 'Click'?}}
-}
-
-namespace PR22250 {
-// expected-error@+4 {{use of undeclared identifier 'size_t'; did you mean 'sizeof'?}}
-// expected-error-re@+3 {{use of undeclared identifier 'y'{{$}}}}
-// expected-error-re@+2 {{use of undeclared identifier 'z'{{$}}}}
-// expected-error@+1 {{expected ';' after top level declarator}}
-int getenv_s(size_t *y, char(&z)) {}
-}
-
-namespace PR22291 {
-template <unsigned I> void f() {
-  unsigned *prio_bits_array;  // expected-note {{'prio_bits_array' declared here}}
-  // expected-error@+1 {{use of undeclared identifier 'prio_op_array'; did you mean 'prio_bits_array'?}}
-  __atomic_store_n(prio_op_array + I, false, __ATOMIC_RELAXED);
-}
-}
-
-namespace PR22297 {
-double pow(double x, double y);
-struct TimeTicks {
-  static void Now();  // expected-note {{'Now' declared here}}
-};
-void f() {
-  TimeTicks::now();  // expected-error {{no member named 'now' in 'PR22297::TimeTicks'; did you mean 'Now'?}}
-}
-}
-
-namespace PR23005 {
-void f() { int a = Unknown::b(c); }  // expected-error {{use of undeclared identifier 'Unknown'}}
-// expected-error@-1 {{use of undeclared identifier 'c'}}
-}
-
-namespace PR23350 {
-int z = 1 ? N : ;  // expected-error {{expected expression}}
-// expected-error-re@-1 {{use of undeclared identifier 'N'{{$}}}}
-}
-
-// PR 23285. This test must be at the end of the file to avoid additional,
-// unwanted diagnostics.
-// expected-error-re@+2 {{use of undeclared identifier 'uintmax_t'{{$}}}}
-// expected-error@+1 {{expected ';' after top level declarator}}
-unsigned int a = 0(uintmax_t
diff --git a/clang/test/SemaCXX/typo-correction.cpp b/clang/test/SemaCXX/typo-correction.cpp
index 45f42c4260358..e4dadf83e0a08 100644
--- a/clang/test/SemaCXX/typo-correction.cpp
+++ b/clang/test/SemaCXX/typo-correction.cpp
@@ -3,7 +3,6 @@
 
 namespace PR21817{
 int a(-rsing[2]); // expected-error {{undeclared identifier 'rsing'; did you mean 'using'?}}
-                  // expected-error@-1 {{expected expression}}
 }
 
 struct errc {
@@ -43,14 +42,14 @@ inline error_condition make_error_condition(errc _e) {
 // refer to a base class or non-static data member.
 struct BaseType { };
 struct Derived : public BaseType { // expected-note {{base class 'BaseType' specified here}}
-  static int base_type; // expected-note {{'base_type' declared here}}
+  static int base_type;
   Derived() : basetype() {} // expected-error{{initializer 'basetype' does not name a non-static data member or base class; did you mean the base class 'BaseType'?}}
 };
 
 // Test the improvement from passing a callback object to CorrectTypo in
 // the helper function LookupMemberExprInRecord.
 int get_type(struct Derived *st) {
-  return st->Base_Type; // expected-error{{no member named 'Base_Type' in 'Derived'; did you mean 'base_type'?}}
+  return st->Base_Type; // expected-error{{no member named 'Base_Type' in 'Derived'}}
 }
 
 // In this example, somename should not be corrected to the cached correction
@@ -212,12 +211,11 @@ namespace PR13051 {
   };
 
   void foo(); // expected-note{{'foo' declared here}}
-  void g(void(*)()); // expected-note{{candidate function not viable}}
-  void g(bool(S<int>::*)() const); // expected-note{{candidate function not viable}}
+  void g(void(*)());
+  void g(bool(S<int>::*)() const);
 
   void test() {
-    g(&S<int>::tempalte f<int>); // expected-error{{did you mean 'template'?}} \
-                                 // expected-error{{no matching function for call to 'g'}}
+    g(&S<int>::tempalte f<int>); // expected-error{{did you mean 'template'?}}
     g(&S<int>::opeartor bool); // expected-error{{did you mean 'operator'?}}
     g(&S<int>::foo); // expected-error{{no member named 'foo' in 'PR13051::S<int>'; did you mean simply 'foo'?}}
   }
@@ -251,13 +249,13 @@ namespace b6956809_test1 {
 
   struct S1 {
     void method(A*);  // no note here
-    void method(B*);  // expected-note{{'method' declared here}}
+    void method(B*);
   };
 
   void test1() {
     B b;
     S1 s;
-    s.methodd(&b);  // expected-error{{no member named 'methodd' in 'b6956809_test1::S1'; did you mean 'method'}}
+    s.methodd(&b);  // expected-error{{no member named 'methodd' in 'b6956809_test1::S1'}}
   }
 
   struct S2 {
@@ -275,15 +273,15 @@ namespace b6956809_test1 {
 }
 
 namespace b6956809_test2 {
-  template<typename T> struct Err { typename T::error n; };  // expected-error{{type 'void *' cannot be used prior to '::' because it has no members}}
+  template<typename T> struct Err { typename T::error n; };
   struct S {
-    template<typename T> typename Err<T>::type method(T);  // expected-note{{in instantiation of template class 'b6956809_test2::Err<void *>' requested here}}
-    template<typename T> int method(T *);  // expected-note{{'method' declared here}}
+    template<typename T> typename Err<T>::type method(T);
+    template<typename T> int method(T *);
   };
 
   void test() {
     S s;
-    int k = s.methodd((void*)0);  // expected-error{{no member named 'methodd' in 'b6956809_test2::S'; did you mean 'method'?}} expected-note{{while substituting deduced template arguments into function template 'method' [with T = void *]}}
+    int k = s.methodd((void*)0);  // expected-error{{no member named 'methodd' in 'b6956809_test2::S'}}
   }
 }
 
@@ -309,12 +307,12 @@ struct A {
   void CreateBar(float, float);
 };
 struct B : A {
-  using A::CreateFoo; // expected-note {{'CreateFoo' declared here}}
-  void CreateFoo(int, int);  // expected-note {{'CreateFoo' declared here}}
+  using A::CreateFoo;
+  void CreateFoo(int, int);
 };
 void f(B &x) {
-  x.Createfoo(0,0);  // expected-error {{no member named 'Createfoo' in 'PR13387::B'; did you mean 'CreateFoo'?}}
-  x.Createfoo(0.f,0.f);  // expected-error {{no member named 'Createfoo' in 'PR13387::B'; did you mean 'CreateFoo'?}}
+  x.Createfoo(0,0);  // expected-error {{no member named 'Createfoo' in 'PR13387::B'}}
+  x.Createfoo(0.f,0.f);  // expected-error {{no member named 'Createfoo' in 'PR13387::B'}}
 }
 }
 
@@ -649,12 +647,12 @@ class AddObservation { // expected-note {{declared here}}
 
 namespace testNonStaticMemberHandling {
 struct Foo {
-  bool usesMetadata;  // expected-note {{'usesMetadata' declared here}}
+  bool usesMetadata;
 };
 int test(Foo f) {
   if (UsesMetadata)  // expected-error-re {{use of undeclared identifier 'UsesMetadata'{{$}}}}
     return 5;
-  if (f.UsesMetadata)  // expected-error {{no member named 'UsesMetadata' in 'testNonStaticMemberHandling::Foo'; did you mean 'usesMetadata'?}}
+  if (f.UsesMetadata)  // expected-error {{no member named 'UsesMetadata' in 'testNonStaticMemberHandling::Foo'}}
     return 11;
   return 0;
 }
@@ -707,7 +705,7 @@ using C::D::Foofoo;  // expected-error {{no member named 'Foofoo' in namespace '
 int d = ? L : d; // expected-error {{expected expression}} expected-error {{undeclared identifier}}
 
 struct B0 {
-  int : 0 |         // expected-error {{invalid operands to binary expression}}
+  int : 0 |
       (struct B0)e; // expected-error {{use of undeclared identifier}}
 };
 
diff --git a/clang/test/SemaCXX/virtuals.cpp b/clang/test/SemaCXX/virtuals.cpp
index 2a22ab9fc2b09..f6f52d51f650c 100644
--- a/clang/test/SemaCXX/virtuals.cpp
+++ b/clang/test/SemaCXX/virtuals.cpp
@@ -58,10 +58,8 @@ struct Base {
 };
 
 struct Derived final : Base {
-  virtual ~Derived() = defaul; // #default
+  virtual ~Derived() = defaul; // expected-error {{use of undeclared identifier 'defaul'}}
 } do_not_crash;
-// expected-error@#default {{initializer on function does not look like a pure-specifier}}
-// expected-error@#default {{use of undeclared identifier 'defaul'}}
 }
 
 namespace VirtualFriend {
diff --git a/clang/test/SemaObjC/call-super-2.m b/clang/test/SemaObjC/call-super-2.m
index 01acff70c2301..885f392e353a6 100644
--- a/clang/test/SemaObjC/call-super-2.m
+++ b/clang/test/SemaObjC/call-super-2.m
@@ -115,7 +115,7 @@ @interface B : A
 @end
 
 @implementation B
-- (instancetype)initWithCoder:(C *)coder {
+- (instancetype)initWithCoder:(C *)coder {     // expected-note {{'coder' declared here}}
   if (0 != (self = [super initWithCode:code])) // expected-error {{use of undeclared identifier 'code'}} expected-warning {{instance method '-initWithCode:' not found}}
     return (void *)0;
   return (void *)0;
diff --git a/clang/test/SemaObjC/typo-correction-subscript.m b/clang/test/SemaObjC/typo-correction-subscript.m
index 340f3cfe2743c..6c09127dbb8d6 100644
--- a/clang/test/SemaObjC/typo-correction-subscript.m
+++ b/clang/test/SemaObjC/typo-correction-subscript.m
@@ -7,8 +7,7 @@ @interface Test
 @implementation Test
 - (void)rdar47403222:(Dictionary *)opts {
   [self undeclaredMethod:undeclaredArg];
-  // expected-error@-1{{no visible @interface for 'Test' declares the selector 'undeclaredMethod:'}}
-  // expected-error@-2{{use of undeclared identifier 'undeclaredArg}}
+  // expected-error@-1{{use of undeclared identifier 'undeclaredArg}}
   opts[(__bridge id)undeclaredKey] = 0;
   // expected-error@-1{{use of undeclared identifier 'undeclaredKey'}}
 }
diff --git a/clang/test/SemaObjC/undef-arg-super-method-call.m b/clang/test/SemaObjC/undef-arg-super-method-call.m
index 11fd97f2c00d8..b8cbe7f69f2f5 100644
--- a/clang/test/SemaObjC/undef-arg-super-method-call.m
+++ b/clang/test/SemaObjC/undef-arg-super-method-call.m
@@ -11,12 +11,12 @@ @interface DBGViewDebuggerSupport_iOS : DBGViewDebuggerSupport
 @end
 
 @implementation DBGViewDebuggerSupport_iOS
-+ (void)addViewLayerInfo:(id)aView; // expected-note {{'aView' declared here}}
++ (void)addViewLayerInfo:(id)aView;
 {
-    [super addViewLayerInfo:view]; // expected-error {{use of undeclared identifier 'view'; did you mean 'aView'?}}
+    [super addViewLayerInfo:view]; // expected-error {{use of undeclared identifier 'view'}}
 }
-- (void)addInstViewLayerInfo:(id)aView; // expected-note {{'aView' declared here}}
+- (void)addInstViewLayerInfo:(id)aView;
 {
-    [super addInstViewLayerInfo:view]; // expected-error {{use of undeclared identifier 'view'; did you mean 'aView'?}}
+    [super addInstViewLayerInfo:view]; // expected-error {{use of undeclared identifier 'view'}}
 }
 @end
diff --git a/clang/test/SemaObjCXX/block-for-lambda-conversion.mm b/clang/test/SemaObjCXX/block-for-lambda-conversion.mm
index 671e83dc22019..a3bcfab677197 100644
--- a/clang/test/SemaObjCXX/block-for-lambda-conversion.mm
+++ b/clang/test/SemaObjCXX/block-for-lambda-conversion.mm
@@ -8,19 +8,20 @@
   NSEventMaskLeftMouseDown = 1
 };
 
-static const NSEventType NSFlagsChanged = NSEventTypeFlagsChanged;
+static const NSEventType NSFlagsChanged = NSEventTypeFlagsChanged; // expected-note {{'NSFlagsChanged' declared here}}
 
 @interface NSObject
 @end
 @interface NSEvent : NSObject {
 }
 + (nullable id)
-addMonitor:(NSEventMask)mask handler:(NSEvent *_Nullable (^)(NSEvent *))block;
+addMonitor:(NSEventMask)mask handler:(NSEvent *_Nullable (^)(NSEvent *))block; // expected-note {{passing argument to parameter 'mask' here}}
 @end
 
 void test(id weakThis) {
   id m_flagsChangedEventMonitor = [NSEvent
-      addMonitor:NSFlagsChangedMask //expected-error {{use of undeclared identifier 'NSFlagsChangedMask'}}
+      addMonitor:NSFlagsChangedMask //expected-error {{use of undeclared identifier 'NSFlagsChangedMask'}} \
+                                      expected-error {{cannot initialize a parameter of type 'NSEventMask' with an lvalue of type 'const NSEventType'}}
          handler:[weakThis](NSEvent *flagsChangedEvent) {
              return flagsChangedEvent;
          }];
diff --git a/clang/test/SemaOpenACC/compute-construct-num_gangs-clause.cpp b/clang/test/SemaOpenACC/compute-construct-num_gangs-clause.cpp
index c6dbe4db2be64..0cf27666dd030 100644
--- a/clang/test/SemaOpenACC/compute-construct-num_gangs-clause.cpp
+++ b/clang/test/SemaOpenACC/compute-construct-num_gangs-clause.cpp
@@ -119,8 +119,7 @@ struct HasInt {
 
 template <typename T>
 void TestInst() {
-  // expected-error@+2{{no member named 'Invalid' in 'HasInt'}}
-  // expected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'serial' directive}}
+  // expected-error@+1{{no member named 'Invalid' in 'HasInt'}}
 #pragma acc serial num_gangs(HasInt::Invalid)
   while(1);
 
@@ -137,8 +136,7 @@ void TestInst() {
 #pragma acc parallel num_gangs(T::Invalid, 1)
   while(1);
 
-  // expected-error@+2{{no member named 'Invalid' in 'HasInt'}}
-  // expected-error@+1{{OpenACC 'num_gangs' clause is not valid on 'serial' directive}}
+  // expected-error@+1{{no member named 'Invalid' in 'HasInt'}}
 #pragma acc serial num_gangs(1, HasInt::Invalid)
   while(1);
 
diff --git a/clang/test/SemaOpenCL/atomic-ops.cl b/clang/test/SemaOpenCL/atomic-ops.cl
index 7a273546db772..babebba31e82b 100644
--- a/clang/test/SemaOpenCL/atomic-ops.cl
+++ b/clang/test/SemaOpenCL/atomic-ops.cl
@@ -167,7 +167,7 @@ void syncscope_checks(atomic_int *Ap, int scope) {
   (void)__opencl_atomic_load(Ap, memory_order_relaxed, memory_scope_all_devices);
 #if __OPENCL_C_VERSION__ < CL_VERSION_3_0
   // expected-error@-2{{use of undeclared identifier 'memory_scope_all_devices'}}
-  // expected-note@* {{'memory_scope_all_svm_devices' declared here}}
+  // expected-note@opencl-c-base.h:*{{'memory_scope_all_svm_devices' declared here}}
 #endif
   (void)__opencl_atomic_load(Ap, memory_order_relaxed, memory_scope_sub_group);
   (void)__opencl_atomic_load(Ap, memory_order_relaxed, scope);
diff --git a/clang/test/SemaOpenCL/clang-builtin-version.cl b/clang/test/SemaOpenCL/clang-builtin-version.cl
index ec6eecee3106c..21cbf2d8f28d4 100644
--- a/clang/test/SemaOpenCL/clang-builtin-version.cl
+++ b/clang/test/SemaOpenCL/clang-builtin-version.cl
@@ -17,12 +17,8 @@ kernel void dse_builtins(void) {
   });
 #if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0) && !defined(__opencl_c_device_enqueue)
 // expected-error@-10{{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
-// FIXME: the typo correction for the undeclared identifiers finds alternative
-// suggestions, but instantiating the typo correction causes us to
-// re-instantiate the argument to the call, which triggers the support
-// diagnostic a second time.
-// expected-error@-12 2{{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
-// expected-error@-10 2{{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
+// expected-error@-8 {{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
+// expected-error@-6 {{support disabled - compile with -fblocks or for OpenCL C 2.0 or OpenCL C 3.0 with __opencl_c_device_enqueue feature}}
 #endif
 }
 
diff --git a/clang/test/SemaTemplate/concepts-recovery-expr.cpp b/clang/test/SemaTemplate/concepts-recovery-expr.cpp
index b338f3bc271bf..6bed1790051f3 100644
--- a/clang/test/SemaTemplate/concepts-recovery-expr.cpp
+++ b/clang/test/SemaTemplate/concepts-recovery-expr.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=c++20 -verify %s
 
-// expected-error@+1{{use of undeclared identifier 'b'}}
-constexpr bool CausesRecoveryExpr = b;
+// expected-error@+1 {{invalid operands to binary expression ('const char[5]' and 'float')}}
+constexpr bool CausesRecoveryExpr = "test" + 1.0f;
 
 template<typename T>
 concept ReferencesCRE = CausesRecoveryExpr;
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index a99df2390a551..62a4f95d79c74 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -814,11 +814,7 @@ static_assert(invalid<int> also here ; // expected-error{{use of undeclared iden
 
 int foo() {
     bool b;
-    b = invalid<int> not just in declarations; // expected-error{{expected ';' after expression}}
-                                               // expected-error@-1{{use of undeclared identifier 'invalid'}}
-                                               // expected-error@-2{{expected ';' after expression}}
-                                               // expected-error@-3{{use of undeclared identifier 'just'}}
-                                               // expected-error@-4{{unknown type name 'in'}}
+    b = invalid<int> not just in declarations; // expected-error{{use of undeclared identifier 'invalid'}}
     return b;
 }
 } // namespace GH48182
diff --git a/clang/test/SemaTemplate/typo-variadic.cpp b/clang/test/SemaTemplate/typo-variadic.cpp
index c9b777aebbe91..48306fb9ce805 100644
--- a/clang/test/SemaTemplate/typo-variadic.cpp
+++ b/clang/test/SemaTemplate/typo-variadic.cpp
@@ -1,2 +1,2 @@
 // RUN: %clang_cc1 -fsyntax-only %s -verify
-int x = m(s...); // expected-error{{pack expansion does not}} expected-error{{undeclared identifier}}
+int x = m(s...); // expected-error{{undeclared identifier}}
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 635d03a88d105..a6301daa672c3 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -598,7 +598,6 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
   case Stmt::SubstNonTypeTemplateParmPackExprClass:
   case Stmt::FunctionParmPackExprClass:
   case Stmt::UnresolvedLookupExprClass:
-  case Stmt::TypoExprClass: // A typo could actually be a DeclRef or a MemberRef
     K = CXCursor_DeclRefExpr;
     break;
 
diff --git a/clang/unittests/Sema/ExternalSemaSourceTest.cpp b/clang/unittests/Sema/ExternalSemaSourceTest.cpp
index 2b271d4bf7825..cc9dd4175af55 100644
--- a/clang/unittests/Sema/ExternalSemaSourceTest.cpp
+++ b/clang/unittests/Sema/ExternalSemaSourceTest.cpp
@@ -268,20 +268,6 @@ TEST(ExternalSemaSource, ExternalTypoCorrectionOrdering) {
   ASSERT_EQ(1, Watcher.SeenCount);
 }
 
-TEST(ExternalSemaSource, ExternalDelayedTypoCorrection) {
-  auto Installer = std::make_unique<ExternalSemaSourceInstaller>();
-  auto Provider = makeIntrusiveRefCnt<FunctionTypoProvider>("aaa", "bbb");
-  DiagnosticWatcher Watcher("aaa", "bbb");
-  Installer->PushSource(Provider.get());
-  Installer->PushWatcher(&Watcher);
-  std::vector<std::string> Args(1, "-std=c++11");
-  ASSERT_TRUE(clang::tooling::runToolOnCodeWithArgs(
-      std::move(Installer), "namespace AAA { } void foo() { AAA::aaa(); }",
-      Args));
-  ASSERT_LE(0, Provider->CallCount);
-  ASSERT_EQ(1, Watcher.SeenCount);
-}
-
 // We should only try MaybeDiagnoseMissingCompleteType if we can't otherwise
 // solve the problem.
 TEST(ExternalSemaSource, TryOtherTacticsBeforeDiagnosing) {

From a5cbd2ab0bebc722f836cd3b04dbab691ef9ed2f Mon Sep 17 00:00:00 2001
From: Diana Picus <Diana-Magda.Picus@amd.com>
Date: Fri, 13 Jun 2025 12:48:24 +0200
Subject: [PATCH 353/851] =?UTF-8?q?Revert=20"[AMDGPU]=20Skip=20register=20?=
 =?UTF-8?q?uses=20in=20AMDGPUResourceUsageAnalysis=20(#=E2=80=A6=20(#14403?=
 =?UTF-8?q?9)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…133242)"

This reverts commit 130080fab11cde5efcb338b77f5c3b31097df6e6 because it
causes issues in testcases similar to coalescer_remat.ll [1], i.e. when
we use a VGPR tuple but only write to its lower parts. The high VGPRs
would then not be included in the vgpr_count, and accessing them would
be an out of bounds violation.

[1]
https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
---
 llvm/docs/AMDGPUUsage.rst                     |  11 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  11 +-
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    | 283 ++++++++++++++++--
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  15 -
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |  14 -
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |   5 -
 .../AMDGPU/GlobalISel/extractelement.ll       |  38 +--
 .../AMDGPU/amdgpu-no-agprs-violations.ll      |   7 +-
 .../amdhsa-kernarg-preload-num-sgprs.ll       |  28 +-
 llvm/test/CodeGen/AMDGPU/amdpal-callable.ll   |  12 +-
 llvm/test/CodeGen/AMDGPU/amdpal-elf.ll        |  16 +-
 .../attr-amdgpu-flat-work-group-size.ll       |   8 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        |  24 +-
 .../AMDGPU/call-alias-register-usage-agpr.ll  |   2 +-
 .../AMDGPU/call-alias-register-usage0.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage1.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage2.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage3.ll      |   2 +-
 .../AMDGPU/call-graph-register-usage.ll       |  10 +-
 llvm/test/CodeGen/AMDGPU/coalescer_remat.ll   |   2 +-
 llvm/test/CodeGen/AMDGPU/code-object-v3.ll    |   6 +-
 llvm/test/CodeGen/AMDGPU/elf-notes.ll         |   2 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  | 106 +++----
 .../CodeGen/AMDGPU/function-resource-usage.ll |  24 +-
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/hsa.ll               |   2 +-
 .../init-whole-wave-vgpr-count-large.ll       |  72 -----
 .../AMDGPU/init-whole-wave-vgpr-count-leaf.ll |  46 ---
 ...init-whole-wave-vgpr-count-use-inactive.ll |  74 -----
 .../AMDGPU/init-whole-wave-vgpr-count.ll      |  71 -----
 llvm/test/CodeGen/AMDGPU/ipra.ll              |   2 +-
 ...-knownbits-assign-crash-gh-issue-110930.ll |  24 +-
 .../multi-call-resource-usage-mcexpr.ll       |   2 +-
 .../AMDGPU/pal-metadata-3.0-callable.ll       |   8 +-
 .../CodeGen/AMDGPU/ps-shader-arg-count.ll     |   6 +-
 .../CodeGen/AMDGPU/register-count-comments.ll |   4 +-
 .../AMDGPU/resource-optimization-remarks.ll   |   4 +-
 .../AMDGPU/schedule-amdgpu-tracker-physreg.ll |   4 +-
 .../AMDGPU/schedule-amdgpu-trackers.ll        |   4 +-
 .../AMDGPU/schedule-regpressure-limit2.ll     |   6 +-
 .../CodeGen/AMDGPU/stack-realign-kernel.ll    |  12 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll  |   4 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll  |   4 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll   |   4 +-
 .../AMDGPU/unnamed-function-resource-info.ll  |   4 +-
 .../CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll  |   4 +-
 .../test/CodeGen/AMDGPU/vgpr-count-compute.ll |  30 --
 .../CodeGen/AMDGPU/vgpr-count-graphics.ll     |  35 ---
 48 files changed, 473 insertions(+), 587 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 3aa8773fa506b..e0a43225e81be 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4263,9 +4263,10 @@ same *vendor-name*.
                                                                   wavefront for
                                                                   GFX6-GFX9. A register
                                                                   is required if it is
-                                                                  written to, or
+                                                                  used explicitly, or
                                                                   if a higher numbered
-                                                                  register is written to. This
+                                                                  register is used
+                                                                  explicitly. This
                                                                   includes the special
                                                                   SGPRs for VCC, Flat
                                                                   Scratch (GFX7-GFX9)
@@ -4283,10 +4284,10 @@ same *vendor-name*.
                                                                   each work-item for
                                                                   GFX6-GFX9. A register
                                                                   is required if it is
-                                                                  written to, or
+                                                                  used explicitly, or
                                                                   if a higher numbered
-                                                                  register is
-                                                                  written to.
+                                                                  register is used
+                                                                  explicitly.
      ".agpr_count"                       integer        Required  Number of accumulator
                                                                   registers required by
                                                                   each work-item for
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d4fea30f21f45..491314daf2d81 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // dispatch registers are function args.
   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
 
-  if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
+  if (isShader(F.getCallingConv())) {
     bool IsPixelShader =
         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
 
@@ -1060,6 +1060,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 
     ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
         ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+  } else if (isKernel(F.getCallingConv()) &&
+             MFI->getNumKernargPreloadedSGPRs()) {
+    // Consider cases where the total number of UserSGPRs with trailing
+    // allocated preload SGPRs, is greater than the number of explicitly
+    // referenced SGPRs.
+    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
+        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
+    ProgInfo.NumSGPR =
+        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 7bde59412d905..9a609a1752de0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -137,29 +137,274 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   if (MFI->isStackRealigned())
     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
 
-  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
-
-  Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
-  Info.NumExplicitSGPR =
-      TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
-  if (ST.hasMAIInsts())
-    Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
-
-  // Preloaded registers are written by the hardware, not defined in the
-  // function body, so they need special handling.
-  if (MFI->isEntryFunction()) {
-    Info.NumExplicitSGPR =
-        std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
-    Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
-  }
-
-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
+  Info.UsesVCC =
+      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+  // If there are no calls, MachineRegisterInfo can tell us the used register
+  // count easily.
+  // A tail call isn't considered a call for MachineFrameInfo's purposes.
+  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
+    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
+    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
+    if (ST.hasMAIInsts())
+      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
     return Info;
+  }
 
+  int32_t MaxVGPR = -1;
+  int32_t MaxAGPR = -1;
+  int32_t MaxSGPR = -1;
   Info.CalleeSegmentSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
+      // TODO: Check regmasks? Do they occur anywhere except calls?
+      for (const MachineOperand &MO : MI.operands()) {
+        unsigned Width = 0;
+        bool IsSGPR = false;
+        bool IsAGPR = false;
+
+        if (!MO.isReg())
+          continue;
+
+        Register Reg = MO.getReg();
+        switch (Reg) {
+        case AMDGPU::EXEC:
+        case AMDGPU::EXEC_LO:
+        case AMDGPU::EXEC_HI:
+        case AMDGPU::SCC:
+        case AMDGPU::M0:
+        case AMDGPU::M0_LO16:
+        case AMDGPU::M0_HI16:
+        case AMDGPU::SRC_SHARED_BASE_LO:
+        case AMDGPU::SRC_SHARED_BASE:
+        case AMDGPU::SRC_SHARED_LIMIT_LO:
+        case AMDGPU::SRC_SHARED_LIMIT:
+        case AMDGPU::SRC_PRIVATE_BASE_LO:
+        case AMDGPU::SRC_PRIVATE_BASE:
+        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
+        case AMDGPU::SRC_PRIVATE_LIMIT:
+        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
+        case AMDGPU::SGPR_NULL:
+        case AMDGPU::SGPR_NULL64:
+        case AMDGPU::MODE:
+          continue;
+
+        case AMDGPU::NoRegister:
+          assert(MI.isDebugInstr() &&
+                 "Instruction uses invalid noreg register");
+          continue;
+
+        case AMDGPU::VCC:
+        case AMDGPU::VCC_LO:
+        case AMDGPU::VCC_HI:
+        case AMDGPU::VCC_LO_LO16:
+        case AMDGPU::VCC_LO_HI16:
+        case AMDGPU::VCC_HI_LO16:
+        case AMDGPU::VCC_HI_HI16:
+          Info.UsesVCC = true;
+          continue;
+
+        case AMDGPU::FLAT_SCR:
+        case AMDGPU::FLAT_SCR_LO:
+        case AMDGPU::FLAT_SCR_HI:
+          continue;
+
+        case AMDGPU::XNACK_MASK:
+        case AMDGPU::XNACK_MASK_LO:
+        case AMDGPU::XNACK_MASK_HI:
+          llvm_unreachable("xnack_mask registers should not be used");
+
+        case AMDGPU::LDS_DIRECT:
+          llvm_unreachable("lds_direct register should not be used");
+
+        case AMDGPU::TBA:
+        case AMDGPU::TBA_LO:
+        case AMDGPU::TBA_HI:
+        case AMDGPU::TMA:
+        case AMDGPU::TMA_LO:
+        case AMDGPU::TMA_HI:
+          llvm_unreachable("trap handler registers should not be used");
+
+        case AMDGPU::SRC_VCCZ:
+          llvm_unreachable("src_vccz register should not be used");
+
+        case AMDGPU::SRC_EXECZ:
+          llvm_unreachable("src_execz register should not be used");
+
+        case AMDGPU::SRC_SCC:
+          llvm_unreachable("src_scc register should not be used");
+
+        default:
+          break;
+        }
+
+        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
+            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
+            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 1;
+        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 1;
+        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 1;
+        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 2;
+        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 2;
+        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 3;
+        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 3;
+        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 3;
+        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 4;
+        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 4;
+        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 5;
+        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 5;
+        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 5;
+        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 6;
+        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 6;
+        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 6;
+        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 7;
+        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 7;
+        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 7;
+        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 8;
+        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 8;
+        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 8;
+        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 9;
+        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 9;
+        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 9;
+        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 10;
+        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 10;
+        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 10;
+        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 11;
+        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 11;
+        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 11;
+        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 12;
+        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 12;
+        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 12;
+        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 16;
+        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 16;
+        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 16;
+        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
+          IsSGPR = true;
+          Width = 32;
+        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          Width = 32;
+        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
+          IsSGPR = false;
+          IsAGPR = true;
+          Width = 32;
+        } else {
+          // We only expect TTMP registers or registers that do not belong to
+          // any RC.
+          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
+                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
+                  !TRI.getPhysRegBaseClass(Reg)) &&
+                 "Unknown register class");
+        }
+        unsigned HWReg = TRI.getHWRegIndex(Reg);
+        int MaxUsed = HWReg + Width - 1;
+        if (IsSGPR) {
+          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+        } else if (IsAGPR) {
+          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
+        } else {
+          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
+        }
+      }
+
       if (MI.isCall()) {
         // Pseudo used just to encode the underlying global. Is there a better
         // way to track this?
@@ -219,5 +464,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     }
   }
 
+  Info.NumExplicitSGPR = MaxSGPR + 1;
+  Info.NumVGPR = MaxVGPR + 1;
+  Info.NumAGPR = MaxAGPR + 1;
+
   return Info;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 01718faaf5c2e..0e7635a045588 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -970,25 +970,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return NumUserSGPRs;
   }
 
-  // Get the number of preloaded SGPRs for compute kernels.
   unsigned getNumPreloadedSGPRs() const {
     return NumUserSGPRs + NumSystemSGPRs;
   }
 
-  // Get the number of preloaded VGPRs for compute kernels.
-  unsigned getNumPreloadedVGPRs() const {
-    if (hasWorkItemIDZ())
-      return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
-
-    if (hasWorkItemIDY())
-      return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
-
-    if (hasWorkItemIDX())
-      return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
-
-    return 0;
-  }
-
   unsigned getNumKernargPreloadedSGPRs() const {
     return UserSGPRInfo.getNumKernargPreloadSGPRs();
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index b76823a128e07..e41189adfb46f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4055,20 +4055,6 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
   return 0;
 }
 
-unsigned
-SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
-                                      const TargetRegisterClass &RC) const {
-  for (MCPhysReg Reg : reverse(RC.getRegisters())) {
-    for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) {
-      if (llvm::any_of(MRI.def_instructions(*AI), [](const MachineInstr &MI) {
-            return !MI.isImplicitDef();
-          }))
-        return getHWRegIndex(Reg) + 1;
-    }
-  }
-  return 0;
-}
-
 SmallVector<StringLiteral>
 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
                                   const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7726762ad0e6d..a4b135d5e0b59 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -486,11 +486,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass &RC) const;
 
-  // \returns the number of registers of a given \p RC defined in a function.
-  // Does not go inside function calls.
-  unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
-                                 const TargetRegisterClass &RC) const;
-
   std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
     return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
                              : std::optional<uint8_t>{};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index bdd86c1af6248..9b35920f8547a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     wavefront_sgpr_count = 17
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
+; GFX10-NEXT:     wavefront_sgpr_count = 10
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 7
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
-; GPRIDX-NEXT:     workitem_vgpr_count = 3
+; GPRIDX-NEXT:     wavefront_sgpr_count = 16
+; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
-; GFX10-NEXT:     workitem_vgpr_count = 3
+; GFX10-NEXT:     wavefront_sgpr_count = 10
+; GFX10-NEXT:     workitem_vgpr_count = 2
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
 ; GFX10-NEXT:     reserved_sgpr_first = 0
@@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 6
 ; GFX11-NEXT:     workitem_vgpr_count = 2
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     wavefront_sgpr_count = 16
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 18
+; GFX10-NEXT:     wavefront_sgpr_count = 10
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 16
+; GFX11-NEXT:     wavefront_sgpr_count = 7
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
index cc614bb24839c..7bf9a29e9ff44 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
@@ -13,9 +13,8 @@
 ; CHECK: {{^}}kernel_illegal_agpr_use_asm:
 ; CHECK: ; use a0
 
-; GFX908: NumVgprs: 3
-; GFX90A: NumVgprs: 1
-; CHECK: NumAgprs: 0
+; CHECK: NumVgprs: 0
+; CHECK: NumAgprs: 1
 define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
@@ -25,7 +24,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
 ; CHECK: ; use a0
 
 ; CHECK: NumVgprs: 0
-; CHECK: NumAgprs: 0
+; CHECK: NumAgprs: 1
 define void @func_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index 7851de641c5a3..dd760c2a215ca 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -10,9 +10,9 @@
 
 ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
 ; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 15
-; ASM: ; TotalNumSgprs: 21
-; ASM: ; NumSGPRsForWavesPerEU: 21
+; ASM: .amdhsa_next_free_sgpr 12
+; ASM: ; TotalNumSgprs: 18
+; ASM: ; NumSGPRsForWavesPerEU: 18
 
 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
 ; feild that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
 ; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
 ; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000  ................
+; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
 
 ; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 11
-; ASM: ; TotalNumSgprs: 17
-; ASM: ; NumSGPRsForWavesPerEU: 17
+; ASM: .amdhsa_next_free_sgpr 10
+; ASM: ; TotalNumSgprs: 16
+; ASM: ; NumSGPRsForWavesPerEU: 16
 
 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
 ; implicit, and 6 extra.
@@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {
 
 ; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 4
-; ASM: ; TotalNumSgprs: 10
-; ASM: ; NumSGPRsForWavesPerEU: 10
+; ASM: .amdhsa_next_free_sgpr 3
+; ASM: ; TotalNumSgprs: 9
+; ASM: ; NumSGPRsForWavesPerEU: 9
 
 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
 
@@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
 ; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
 ; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000  @...............
+; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
 
 ; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM: .amdhsa_next_free_sgpr 0
+; ASM: ; TotalNumSgprs: 6
+; ASM: ; NumSGPRsForWavesPerEU: 6
 
 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
 ; Encoded like '00'.
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 494ade73cb5f8..f4d17e50cf18c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; GFX8-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}}
-; GFX9-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}}
+; SDAG-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GISEL-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
 ; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -164,13 +164,13 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      multiple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x24{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x21{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x24{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      no_stack:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x20{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      no_stack_call:
@@ -203,7 +203,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_lds:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x20{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      simple_lds_recurse:
@@ -215,7 +215,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x14{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x1{{$}}
+; GCN-NEXT:        .sgpr_count:     0x21{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      simple_stack_call:
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index 5ccf41c408b72..f52ba7000edeb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
 
 ; ELF: Section {
 ; ELF: Name: .text
@@ -23,16 +23,8 @@
 ; ELF: Section: .text (0x2)
 ; ELF: }
 
-; GFX10: NumSGPRsForWavesPerEU: 12
-; GFX10: NumVGPRsForWavesPerEU: 3
-
-; Wave32 and 64 behave differently due to the UserSGPRInit16Bug,
-; which only affects Wave32.
-; GFX11W32: NumSGPRsForWavesPerEU: 16
-; GFX11W32: NumVGPRsForWavesPerEU: 1
-
-; GFX11W64: NumSGPRsForWavesPerEU: 11
-; GFX11W64: NumVGPRsForWavesPerEU: 1
+; GFX10: NumSGPRsForWavesPerEU: 6
+; GFX10: NumVGPRsForWavesPerEU: 1
 
 define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 0e0a81d4657df..616867481d177 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
 
 ; CHECK-LABEL: {{^}}min_64_max_64:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @min_64_max_64() #0 {
 entry:
   ret void
@@ -13,10 +13,10 @@ entry:
 attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
 
 ; CHECK-LABEL: {{^}}min_64_max_128:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @min_64_max_128() #1 {
 entry:
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 5617a80fc94b4..e9fe4f3c618c7 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
 
 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_exactly_10() #2 {
 entry:
   ret void
@@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
 
 ; At least 1 wave per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_1:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_1() #3 {
 entry:
   ret void
@@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
 
 ; At least 5 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_5:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_5() #4 {
 entry:
   ret void
@@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
 
 ; At least 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_least_10() #5 {
 entry:
   ret void
@@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64
 
 ; At most 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_most_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_at_most_10() #7 {
 entry:
   ret void
@@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
 
 ; Between 5 and 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_between_5_and_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 3
+; CHECK: NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty_between_5_and_10() #8 {
 entry:
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index efa416e301ccc..2e79d8bab46a6 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -28,7 +28,7 @@ bb:
 }
 ; ALL:      .set .Laliasee_default.num_vgpr, 0
 ; ALL-NEXT: .set .Laliasee_default.num_agpr, 27
-; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0
+; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32
 
 attributes #0 = { noinline norecurse nounwind optnone }
 attributes #1 = { noinline norecurse nounwind readnone willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index 62ca985bc6400..337da5d0ecbe0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -18,7 +18,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
 define internal void @aliasee_default_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index 344f8200608f6..075eddd2763d3 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -24,7 +24,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27
 ; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32
 define internal void @aliasee_vgpr32_sgpr76() #1 {
 bb:
   call void asm sideeffect "; clobber v26 ", "~{v26}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index 3d36f8a514c47..4fd181d3c0f43 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -21,7 +21,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32
 define internal void @aliasee_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 2274c437daf62..00f72d5d8b1dd 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -21,7 +21,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253
 ; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0
+; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33
 define internal void @aliasee_vgpr256_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v252 ", "~{v252}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index db1269e8e95c2..dbd00f09943c0 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -7,7 +7,7 @@
 ; Make sure to run a GPU with the SGPR allocation bug.
 
 ; GCN-LABEL: {{^}}use_vcc:
-; GCN: ; TotalNumSgprs: 2
+; GCN: ; TotalNumSgprs: 34
 ; GCN: ; NumVgprs: 0
 define void @use_vcc() #1 {
   call void asm sideeffect "", "~{vcc}" () #0
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 }
 
 ; GCN-LABEL: {{^}}use_flat_scratch:
-; CI: ; TotalNumSgprs: 4
-; VI: ; TotalNumSgprs: 6
+; CI: ; TotalNumSgprs: 36
+; VI: ; TotalNumSgprs: 38
 ; GCN: ; NumVgprs: 0
 define void @use_flat_scratch() #1 {
   call void asm sideeffect "", "~{flat_scratch}" () #0
@@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
@@ -249,7 +249,7 @@ entry:
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
index 55dc394628176..61830f18ad7a7 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 5
+; CHECK: ; NumVgprs: 8
 define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index d8d7494d0dc1c..3fe3cafd729a7 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -16,7 +16,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -35,7 +35,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -93,7 +93,7 @@ entry:
 ; registers used.
 ;
 ; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM:     .amdhsa_next_free_vgpr 3
+; ALL-ASM:     .amdhsa_next_free_vgpr 1
 ; ALL-ASM:     .amdhsa_next_free_sgpr 1
 define amdgpu_kernel void @empty(
     i32 %i,
diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
index 59cf9825116fa..22d699a8f4809 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
@@ -43,7 +43,7 @@
 ; OSABI-HSA-ELF:     .sgpr_count:     96
 ; OSABI-HSA-ELF:     .sgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .symbol:         elf_notes.kd
-; OSABI-HSA-ELF:     .vgpr_count:     1
+; OSABI-HSA-ELF:     .vgpr_count:     0
 ; OSABI-HSA-ELF:     .vgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .wavefront_size: 64
 ; OSABI-HSA-ELF: amdhsa.target:   amdgcn-amd-amdhsa--gfx802
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index ed1f3e1397abc..a59382ba20dc5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -27,15 +27,15 @@
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 12
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 18
-; VI-XNACK: ; TotalNumSgprs: 16
-; HSA-VI-XNACK: ; TotalNumSgprs: 22
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; CI: ; TotalNumSgprs: 8
+; VI-NOXNACK: ; TotalNumSgprs: 8
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
+; VI-XNACK: ; TotalNumSgprs: 12
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -50,15 +50,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 14
-; VI-NOXNACK: ; TotalNumSgprs: 14
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 20
-; VI-XNACK: ; TotalNumSgprs: 16
-; HSA-VI-XNACK: ; TotalNumSgprs: 22
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
+; CI: ; TotalNumSgprs: 10
+; VI-NOXNACK: ; TotalNumSgprs: 10
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
+; VI-XNACK: ; TotalNumSgprs: 12
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -73,15 +73,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 16
-; VI-NOXNACK: ; TotalNumSgprs: 18
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 14
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -96,15 +96,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 16
-; VI-NOXNACK: ; TotalNumSgprs: 18
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 14
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -122,15 +122,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -143,15 +143,15 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -166,15 +166,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 16
-; VI-NOXNACK: NumSgprs: 18
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 18
+; VI-XNACK: NumSgprs: 6
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index 2a18d40e0bd8a..59bcc5d8be9b1 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -5,14 +5,14 @@
 ; GCN-LABEL: {{^}}use_vcc:
 ; GCN: .set use_vcc.num_vgpr, 0
 ; GCN: .set use_vcc.num_agpr, 0
-; GCN: .set use_vcc.numbered_sgpr, 0
+; GCN: .set use_vcc.numbered_sgpr, 32
 ; GCN: .set use_vcc.private_seg_size, 0
 ; GCN: .set use_vcc.uses_vcc, 1
 ; GCN: .set use_vcc.uses_flat_scratch, 0
 ; GCN: .set use_vcc.has_dyn_sized_stack, 0
 ; GCN: .set use_vcc.has_recursion, 0
 ; GCN: .set use_vcc.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_vcc() #1 {
@@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 ; GCN-LABEL: {{^}}use_flat_scratch:
 ; GCN: .set use_flat_scratch.num_vgpr, 0
 ; GCN: .set use_flat_scratch.num_agpr, 0
-; GCN: .set use_flat_scratch.numbered_sgpr, 0
+; GCN: .set use_flat_scratch.numbered_sgpr, 32
 ; GCN: .set use_flat_scratch.private_seg_size, 0
 ; GCN: .set use_flat_scratch.uses_vcc, 0
 ; GCN: .set use_flat_scratch.uses_flat_scratch, 1
 ; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
 ; GCN: .set use_flat_scratch.has_recursion, 0
 ; GCN: .set use_flat_scratch.has_indirect_call, 0
-; GCN: TotalNumSgprs: 6
+; GCN: TotalNumSgprs: 38
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_flat_scratch() #1 {
@@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
 ; GCN-LABEL: {{^}}use_10_vgpr:
 ; GCN: .set use_10_vgpr.num_vgpr, 10
 ; GCN: .set use_10_vgpr.num_agpr, 0
-; GCN: .set use_10_vgpr.numbered_sgpr, 0
+; GCN: .set use_10_vgpr.numbered_sgpr, 32
 ; GCN: .set use_10_vgpr.private_seg_size, 0
 ; GCN: .set use_10_vgpr.uses_vcc, 0
 ; GCN: .set use_10_vgpr.uses_flat_scratch, 0
 ; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
 ; GCN: .set use_10_vgpr.has_recursion, 0
 ; GCN: .set use_10_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 10
 ; GCN: ScratchSize: 0
 define void @use_10_vgpr() #1 {
@@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
 ; GCN-LABEL: {{^}}use_50_vgpr:
 ; GCN:	.set use_50_vgpr.num_vgpr, 50
 ; GCN:	.set use_50_vgpr.num_agpr, 0
-; GCN:	.set use_50_vgpr.numbered_sgpr, 0
+; GCN:	.set use_50_vgpr.numbered_sgpr, 32
 ; GCN:	.set use_50_vgpr.private_seg_size, 0
 ; GCN:	.set use_50_vgpr.uses_vcc, 0
 ; GCN:	.set use_50_vgpr.uses_flat_scratch, 0
 ; GCN:	.set use_50_vgpr.has_dyn_sized_stack, 0
 ; GCN:	.set use_50_vgpr.has_recursion, 0
 ; GCN:	.set use_50_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 36
 ; GCN: NumVgprs: 50
 ; GCN: ScratchSize: 0
 define void @use_50_vgpr() #1 {
@@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
 ; GCN-LABEL: {{^}}use_stack0:
 ; GCN:	.set use_stack0.num_vgpr, 1
 ; GCN:	.set use_stack0.num_agpr, 0
-; GCN:	.set use_stack0.numbered_sgpr, 0
+; GCN:	.set use_stack0.numbered_sgpr, 33
 ; GCN:	.set use_stack0.private_seg_size, 2052
 ; GCN:	.set use_stack0.uses_vcc, 0
 ; GCN:	.set use_stack0.uses_flat_scratch, 0
 ; GCN:	.set use_stack0.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack0.has_recursion, 0
 ; GCN:	.set use_stack0.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 37
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 2052
 define void @use_stack0() #1 {
@@ -277,14 +277,14 @@ define void @use_stack0() #1 {
 ; GCN-LABEL: {{^}}use_stack1:
 ; GCN:	.set use_stack1.num_vgpr, 1
 ; GCN:	.set use_stack1.num_agpr, 0
-; GCN:	.set use_stack1.numbered_sgpr, 0
+; GCN:	.set use_stack1.numbered_sgpr, 33
 ; GCN:	.set use_stack1.private_seg_size, 404
 ; GCN:	.set use_stack1.uses_vcc, 0
 ; GCN:	.set use_stack1.uses_flat_scratch, 0
 ; GCN:	.set use_stack1.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack1.has_recursion, 0
 ; GCN:	.set use_stack1.has_indirect_call, 0
-; GCN: TotalNumSgprs: 4
+; GCN: TotalNumSgprs: 37
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 404
 define void @use_stack1() #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index bf452a9e38e01..cd89a36fe538b 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,7 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; CHECK:     .sgpr_count:     16
+; CHECK:     .sgpr_count:     10
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index f7e3498907005..024593c49dba1 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -63,7 +63,7 @@
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
-; ELF:   0250: 6770725F 636F756E 7401B12E 76677072
+; ELF:   0250: 6770725F 636F756E 7402B12E 76677072
 ; ELF:   0260: 5F737069 6C6C5F63 6F756E74 00AF2E77
 ; ELF:   0270: 61766566 726F6E74 5F73697A 6540AD61
 ; ELF:   0280: 6D646873 612E7461 72676574 BD616D64
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
deleted file mode 100644
index 45de8a79fe88d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Use VGPRs above the input arguments.
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  call void asm sideeffect "; clobber v28", "~{v28}"()
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
deleted file mode 100644
index 9c636d4516a80..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; CHECK-LABEL: leaf_shader:
-; CHECK: .vgpr_count:{{.*}}0x1{{$}}
-
-; Function without calls.
-define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
-                              i32 %active.vgpr1, i32 %active.vgpr2,
-                              i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                              i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
-                              local_unnamed_addr {
-entry:
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %compute, label %merge
-
-compute:
-  ; Perform a more complex computation using active VGPRs
-  %square = mul i32 %active.vgpr1, %active.vgpr1
-  %product = mul i32 %square, %active.vgpr2
-  %sum = add i32 %product, %input.value
-  %result = add i32 %sum, 42
-  br label %merge
-
-merge:
-  %final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
-
-  store i32 %final.result, ptr %output.ptr, align 4
-
-  ret void
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
deleted file mode 100644
index 1b0d33cec7052..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
+++ /dev/null
@@ -1,74 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
-; In that case, the VGPR should be included in the .vgpr_count
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0xd{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
deleted file mode 100644
index 9408501718784..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
-
-; CHECK-LABEL: .shader_functions:
-
-; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; CHECK-LABEL: _miss_1:
-; CHECK: .vgpr_count:{{.*}}0xa{{$}}
-
-define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
-                                    i32 %vcr, { i32 } %system.data,
-                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
-                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
-                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
-                                    local_unnamed_addr {
-entry:
-  %system.data.value = extractvalue { i32 } %system.data, 0
-  %dead.val = call i32 @llvm.amdgcn.dead.i32()
-  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
-  br i1 %is.whole.wave, label %shader, label %tail
-
-shader:
-  %system.data.extract = extractvalue { i32 } %system.data, 0
-  %data.mul = mul i32 %system.data.extract, 2
-  %data.add = add i32 %data.mul, 1
-  br label %tail
-
-tail:
-  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
-  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
-  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
-  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
-  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
-  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
-  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
-  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
-  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
-  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
-  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
-  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
-
-  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
-  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
-  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
-  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
-  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
-  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
-  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
-  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
-  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
-  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
-  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
-  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
-
-  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
-  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
-  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
-  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
-
-  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
-        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
-        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
-        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
-        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
-  unreachable
-}
-
-declare i32 @llvm.amdgcn.dead.i32()
-declare i1 @llvm.amdgcn.init.whole.wave()
-declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
-
-declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index c3b033113431f..464cd820028cc 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
 ; GCN-NEXT: s_addc_u32 s17,
 ; GCN-NEXT: s_setpc_b64 s[16:17]
 
-; GCN: ; TotalNumSgprs: 18
+; GCN: ; TotalNumSgprs: 32
 ; GCN: ; NumVgprs: 8
 define void @func_tail_call() #1 {
   tail call void @func()
diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
index 03694b913d6e0..60bbf4646ee03 100644
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -24,7 +24,7 @@ define void @I_Quit() {
 ; CHECK-LABEL: P_RemoveMobj:
 ; CHECK: .set P_RemoveMobj.num_vgpr, 0
 ; CHECK: .set P_RemoveMobj.num_agpr, 0
-; CHECK: .set P_RemoveMobj.numbered_sgpr, 0
+; CHECK: .set P_RemoveMobj.numbered_sgpr, 32
 ; CHECK: .set P_RemoveMobj.private_seg_size, 0
 ; CHECK: .set P_RemoveMobj.uses_vcc, 0
 ; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0
@@ -38,7 +38,7 @@ define void @P_RemoveMobj() {
 ; CHECK-LABEL: P_SpawnMobj:
 ; CHECK: .set P_SpawnMobj.num_vgpr, 0
 ; CHECK: .set P_SpawnMobj.num_agpr, 0
-; CHECK: .set P_SpawnMobj.numbered_sgpr, 0
+; CHECK: .set P_SpawnMobj.numbered_sgpr, 32
 ; CHECK: .set P_SpawnMobj.private_seg_size, 0
 ; CHECK: .set P_SpawnMobj.uses_vcc, 0
 ; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0
@@ -52,7 +52,7 @@ define void @P_SpawnMobj() {
 ; CHECK-LABEL: G_PlayerReborn:
 ; CHECK: .set G_PlayerReborn.num_vgpr, 0
 ; CHECK: .set G_PlayerReborn.num_agpr, 0
-; CHECK: .set G_PlayerReborn.numbered_sgpr, 0
+; CHECK: .set G_PlayerReborn.numbered_sgpr, 32
 ; CHECK: .set G_PlayerReborn.private_seg_size, 0
 ; CHECK: .set G_PlayerReborn.uses_vcc, 0
 ; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0
@@ -66,7 +66,7 @@ define void @G_PlayerReborn() {
 ; CHECK-LABEL: P_SetThingPosition:
 ; CHECK: .set P_SetThingPosition.num_vgpr, 0
 ; CHECK: .set P_SetThingPosition.num_agpr, 0
-; CHECK: .set P_SetThingPosition.numbered_sgpr, 0
+; CHECK: .set P_SetThingPosition.numbered_sgpr, 32
 ; CHECK: .set P_SetThingPosition.private_seg_size, 0
 ; CHECK: .set P_SetThingPosition.uses_vcc, 0
 ; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0
@@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) {
 ; CHECK-LABEL: HU_Start:
 ; CHECK: .set HU_Start.num_vgpr, 0
 ; CHECK: .set HU_Start.num_agpr, 0
-; CHECK: .set HU_Start.numbered_sgpr, 0
+; CHECK: .set HU_Start.numbered_sgpr, 32
 ; CHECK: .set HU_Start.private_seg_size, 0
 ; CHECK: .set HU_Start.uses_vcc, 0
 ; CHECK: .set HU_Start.uses_flat_scratch, 0
@@ -162,7 +162,7 @@ define void @G_DoReborn() {
 ; CHECK-LABEL: AM_Stop:
 ; CHECK: .set AM_Stop.num_vgpr, 0
 ; CHECK: .set AM_Stop.num_agpr, 0
-; CHECK: .set AM_Stop.numbered_sgpr, 0
+; CHECK: .set AM_Stop.numbered_sgpr, 32
 ; CHECK: .set AM_Stop.private_seg_size, 0
 ; CHECK: .set AM_Stop.uses_vcc, 0
 ; CHECK: .set AM_Stop.uses_flat_scratch, 0
@@ -176,7 +176,7 @@ define void @AM_Stop() {
 ; CHECK-LABEL: D_AdvanceDemo:
 ; CHECK: .set D_AdvanceDemo.num_vgpr, 0
 ; CHECK: .set D_AdvanceDemo.num_agpr, 0
-; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0
+; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32
 ; CHECK: .set D_AdvanceDemo.private_seg_size, 0
 ; CHECK: .set D_AdvanceDemo.uses_vcc, 0
 ; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0
@@ -190,7 +190,7 @@ define void @D_AdvanceDemo() {
 ; CHECK-LABEL: F_StartFinale:
 ; CHECK: .set F_StartFinale.num_vgpr, 0
 ; CHECK: .set F_StartFinale.num_agpr, 0
-; CHECK: .set F_StartFinale.numbered_sgpr, 0
+; CHECK: .set F_StartFinale.numbered_sgpr, 32
 ; CHECK: .set F_StartFinale.private_seg_size, 0
 ; CHECK: .set F_StartFinale.uses_vcc, 0
 ; CHECK: .set F_StartFinale.uses_flat_scratch, 0
@@ -204,7 +204,7 @@ define void @F_StartFinale() {
 ; CHECK-LABEL: F_Ticker:
 ; CHECK: .set F_Ticker.num_vgpr, 0
 ; CHECK: .set F_Ticker.num_agpr, 0
-; CHECK: .set F_Ticker.numbered_sgpr, 0
+; CHECK: .set F_Ticker.numbered_sgpr, 32
 ; CHECK: .set F_Ticker.private_seg_size, 0
 ; CHECK: .set F_Ticker.uses_vcc, 0
 ; CHECK: .set F_Ticker.uses_flat_scratch, 0
@@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() {
 ; CHECK-LABEL: P_TempSaveGameFile:
 ; CHECK: .set P_TempSaveGameFile.num_vgpr, 2
 ; CHECK: .set P_TempSaveGameFile.num_agpr, 0
-; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0
+; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32
 ; CHECK: .set P_TempSaveGameFile.private_seg_size, 0
 ; CHECK: .set P_TempSaveGameFile.uses_vcc, 0
 ; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0
@@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() {
 ; CHECK-LABEL: P_SaveGameFile:
 ; CHECK: .set P_SaveGameFile.num_vgpr, 2
 ; CHECK: .set P_SaveGameFile.num_agpr, 0
-; CHECK: .set P_SaveGameFile.numbered_sgpr, 0
+; CHECK: .set P_SaveGameFile.numbered_sgpr, 32
 ; CHECK: .set P_SaveGameFile.private_seg_size, 0
 ; CHECK: .set P_SaveGameFile.uses_vcc, 0
 ; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
 ; CHECK-LABEL: R_FlatNumForName:
 ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
 ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
-; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr)
+; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
 ; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size)
 ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
 ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)
diff --git a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
index 83f58db1aa67f..7a810d0067c17 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, 13
 ; CHECK: .set qux.num_agpr, 0
-; CHECK: .set qux.numbered_sgpr, 0
+; CHECK: .set qux.numbered_sgpr, 32
 ; CHECK: .set qux.private_seg_size, 0
 ; CHECK: .set qux.uses_vcc, 0
 ; CHECK: .set qux.uses_flat_scratch, 0
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 28c3131302a31..638dc8965987e 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -83,13 +83,13 @@
 ; CHECK-NEXT:      multiple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x24
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x21
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x24
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      no_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x20
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      no_stack_call:
@@ -122,7 +122,7 @@
 ; CHECK-NEXT:      simple_lds:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0x100
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x20
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      simple_lds_recurse:
@@ -134,7 +134,7 @@
 ; CHECK-NEXT:      simple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x14
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x1
+; CHECK-NEXT:        .sgpr_count:     0x21
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x14
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      simple_stack_call:
diff --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
index a71fd7fe782ff..5b9b0feea9900 100644
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -2,7 +2,7 @@
 ;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
 
 ; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
-; ;CHECK: NumVgprs: 2
+; ;CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
   %i1 = extractelement <2 x float> %arg3, i32 1
@@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float>
 
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
-; CHECK: NumVgprs: 2
+; CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
   ret { <4 x float> } undef
@@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; Additionally set the PSInputAddr to 0 via the metadata
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
-; CHECK: NumVgprs: 2
+; CHECK: NumVgprs: 4
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
 .entry:
   ret { <4 x float> } undef
diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index bfcf90037bfd3..35e11ad6a648b 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -24,9 +24,7 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %
 
 ; SI-LABEL: {{^}}one_vgpr_used:
 ; SI: NumVgprs: 1
-define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 {
+define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
   store i32 %x, ptr addrspace(1) %out, align 4
   ret void
 }
-
-attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index a2470a60cb19f..afb77ed190896 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -122,8 +122,8 @@ define void @test_func() !dbg !6 {
 }
 
 ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 22
-; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 3
+; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 4
+; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     Dynamic Stack: False
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
index 557ffd27a07f6..0d25bc97ff775 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -4,8 +4,8 @@
 ; CHECK-LABEL: {{^}}spill:
 ; GCN:    NumSgprs: 104
 ; GCN-GCNTRACKERS:    NumSgprs: 104
-; GCN:    NumVgprs: 3
-; GCN-GCNTRACKERS:    NumVgprs: 3
+; GCN:    NumVgprs: 1
+; GCN-GCNTRACKERS:    NumVgprs: 2
 ; GCN:    ScratchSize: 0
 ; GCN-GCNTRACKERS:    ScratchSize: 0
 ; GCN:    Occupancy: 5
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index 95d707aee5662..c5732531f5423 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -11,8 +11,8 @@
 ; allow scheduling of other instructions which reduce RP
 
 ; CHECK-LABEL: {{^}}return_72xi32:
-; GFX11-PAL:    NumSgprs: 0
-; GFX11-PAL-GCNTRACKERS:    NumSgprs: 0
+; GFX11-PAL:    NumSgprs: 33
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 33
 ; GFX11-PAL:    NumVgprs: 64
 ; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
 ; GFX11-PAL:    ScratchSize: 220
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 8300a52955b91..462ac23ec7e0e 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -7,14 +7,14 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
 
-; SI-MINREG: NumSgprs: {{[1]?[1-9]$}}
-; SI-MINREG: NumVgprs: {{[1]?[1-9]$}}
+; SI-MINREG: NumSgprs: {{[1-9]$}}
+; SI-MINREG: NumVgprs: {{[1-9]$}}
 
 ; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}}
 ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}}
 
 ; stores may alias loads
-; VI-MINREG: NumSgprs: {{[1]?[0-9]$}}
+; VI-MINREG: NumSgprs: {{[0-9]$}}
 ; VI-MINREG: NumVgprs: {{[1-3][0-9]$}}
 
 ; stores may alias loads
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 682bbdedb37a3..6ddf0986755f9 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 3
+; VI-NEXT:     .amdhsa_next_free_vgpr 1
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index d3def45c4f9d2..30accc846d2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     12
+; ELF: .sgpr_count:     9
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index ad831e040d722..4f84b31f1877b 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 0
 
 ; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     8
+; ELF: .sgpr_count:     5
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index d1e28e11601ce..644f434923368 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 8
+; ASM: .amdhsa_next_free_sgpr 5
 ; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     12
+; ELF: .sgpr_count:     9
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index 4802ec861d685..cf5b95a729974 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: __unnamed_1:
 ; CHECK: .set __unnamed_1.num_vgpr, 0
 ; CHECK: .set __unnamed_1.num_agpr, 0
-; CHECK: .set __unnamed_1.numbered_sgpr, 0
+; CHECK: .set __unnamed_1.numbered_sgpr, 32
 ; CHECK: .set __unnamed_1.private_seg_size, 0
 ; CHECK: .set __unnamed_1.uses_vcc, 0
 ; CHECK: .set __unnamed_1.uses_flat_scratch, 0
@@ -16,7 +16,7 @@ entry:
 }
 
 ; CHECK-LABEL: __unnamed_2:
-; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
+; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
 ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
 ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
 ; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index ee35dc4cddade..2cb5e309c8c21 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
 }
 
 ; GCN-LABEL: {{^}}f1024_0:
-; GFX90A: NumVgprs: 1
+; GFX90A: NumVgprs: 32
 ; GFX90A: NumAgprs: 1
-; GFX90A: TotalNumVgprs: 5
+; GFX90A: TotalNumVgprs: 33
 define void @f1024_0() #1024 {
   call void @foo()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
deleted file mode 100644
index 8c8182db7b479..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
-; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
-target triple = "amdgcn-amd-amdhsa"
-
-@global = addrspace(1) global i32 poison, align 4
-
-; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
-; Only hardware-initialized VGPRs (v0) are read in this kernel.
-
-; CHECK-LABEL: amdhsa.kernels:
-; CHECK-LABEL: kernel_x
-; CHECK: .vgpr_count:     1
-define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
-entry:
-  %id = call i32 @llvm.amdgcn.workitem.id.x()
-  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: kernel_z
-; PACKED: .vgpr_count:     1
-; NOTPACKED: .vgpr_count:     3
-define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
-entry:
-  %id = call i32 @llvm.amdgcn.workitem.id.z()
-  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
-  ret void
-}
-
-attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
deleted file mode 100644
index f5d28a0ae1628..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
-; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
-target triple = "amdgcn--amdpal"
-
-@global = addrspace(1) global i32 poison, align 4
-
-; CHECK-LABEL: amdpal.pipelines:
-
-; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
-; CHECK-LABEL: .entry_point_symbol: kernel_use
-; CHECK: .vgpr_count:     0x20
-define amdgpu_cs void @kernel_use([32 x i32] %args) {
-entry:
-  %a = extractvalue [32 x i32] %args, 14
-  store i32 %a, ptr addrspace(1) @global
-  ret void
-}
-
-; Neither uses not writes a VGPR
-; CHECK-LABEL: chain_func:
-; CHECK: .vgpr_count:     0x1
-define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
-entry:
-  call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
-        ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
-  unreachable
-}
-
-; Neither uses not writes a VGPR
-; CHECK-LABEL: gfx_func:
-; CHECK: .vgpr_count:     0x1
-define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
-entry:
-  ret [32 x i32] %args
-}

From be9994b09206a84a32c3029b409587008d179b95 Mon Sep 17 00:00:00 2001
From: Abhina Sree <Abhina.Sreeskantharajan@ibm.com>
Date: Fri, 13 Jun 2025 07:00:36 -0400
Subject: [PATCH 354/851] [SystemZ][z/OS] Refactor AutoConvert more (#143955)

This patch removes the C++
disablezOSAutoConversion,enablezOSAutoConversion declarations and also
updates Path.inc to use the common function.
---
 llvm/include/llvm/Support/AutoConvert.h | 45 +++++++++----------------
 llvm/lib/Support/AutoConvert.cpp        |  1 -
 llvm/lib/Support/Unix/Path.inc          |  4 +--
 llvm/lib/Support/Unix/Program.inc       |  2 +-
 4 files changed, 19 insertions(+), 33 deletions(-)

diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h
index 56ad91425bcc3..1e6792636e169 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -18,7 +18,7 @@
 #include <_Ccsid.h>
 #endif
 #ifdef __cplusplus
-#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/Error.h"
 #include <system_error>
 #endif /* __cplusplus */
 
@@ -41,6 +41,21 @@ int restorezOSStdHandleAutoConversion(int FD);
 #ifdef __cplusplus
 namespace llvm {
 
+#ifdef __MVS__
+
+/** \brief Set the tag information for a file descriptor. */
+std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
+
+/** \brief Get the the tag ccsid for a file name or a file descriptor. */
+ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
+
+/** \brief Query the file tag to determine if it needs conversion to UTF-8
+ *  codepage.
+ */
+ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
+
+#endif /* __MVS__*/
+
 inline std::error_code disableAutoConversion(int FD) {
 #ifdef __MVS__
   if (::disablezOSAutoConversion(FD) == -1)
@@ -79,34 +94,6 @@ inline ErrorOr<bool> needConversion(const char *FileName, const int FD = -1) {
   return false;
 }
 
-#ifdef __MVS__
-
-/** \brief Disable the z/OS enhanced ASCII auto-conversion for the file
- * descriptor.
- */
-std::error_code disablezOSAutoConversion(int FD);
-
-/** \brief Query the z/OS enhanced ASCII auto-conversion status of a file
- * descriptor and force the conversion if the file is not tagged with a
- * codepage.
- */
-std::error_code enablezOSAutoConversion(int FD);
-
-/** Restore the z/OS enhanced ASCII auto-conversion for the std handle. */
-std::error_code restorezOSStdHandleAutoConversion(int FD);
-
-/** \brief Set the tag information for a file descriptor. */
-std::error_code setzOSFileTag(int FD, int CCSID, bool Text);
-
-/** \brief Get the the tag ccsid for a file name or a file descriptor. */
-ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1);
-
-/** \brief Query the file tag to determine if it needs conversion to UTF-8
- *  codepage.
- */
-ErrorOr<bool> needzOSConversion(const char *FileName, const int FD = -1);
-
-#endif /* __MVS__*/
 } /* namespace llvm */
 #endif /* __cplusplus */
 
diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp
index c69e9a8f97c0e..0b6928e10ef5a 100644
--- a/llvm/lib/Support/AutoConvert.cpp
+++ b/llvm/lib/Support/AutoConvert.cpp
@@ -14,7 +14,6 @@
 #ifdef __MVS__
 
 #include "llvm/Support/AutoConvert.h"
-#include "llvm/Support/Error.h"
 #include <cassert>
 #include <fcntl.h>
 #include <sys/stat.h>
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 0728413f4db6e..277247e3cc236 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -1097,14 +1097,14 @@ std::error_code openFile(const Twine &Name, int &ResultFD,
                     !Stat.st_tag.ft_txtflag && !Stat.st_tag.ft_ccsid &&
                     Stat.st_size == 0;
     if (Flags & OF_Text) {
-      if (auto EC = llvm::enablezOSAutoConversion(ResultFD))
+      if (auto EC = llvm::enableAutoConversion(ResultFD))
         return EC;
       if (DoSetTag) {
         if (auto EC = llvm::setzOSFileTag(ResultFD, CCSID_IBM_1047, true))
           return EC;
       }
     } else {
-      if (auto EC = llvm::disablezOSAutoConversion(ResultFD))
+      if (auto EC = llvm::disableAutoConversion(ResultFD))
         return EC;
       if (DoSetTag) {
         if (auto EC = llvm::setzOSFileTag(ResultFD, FT_BINARY, false))
diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc
index 6d68369ad191c..4f17b2257a756 100644
--- a/llvm/lib/Support/Unix/Program.inc
+++ b/llvm/lib/Support/Unix/Program.inc
@@ -516,7 +516,7 @@ std::error_code llvm::sys::ChangeStdoutMode(fs::OpenFlags Flags) {
 
 std::error_code llvm::sys::ChangeStdinToBinary() {
 #ifdef __MVS__
-  return disablezOSAutoConversion(STDIN_FILENO);
+  return disableAutoConversion(STDIN_FILENO);
 #else
   // Do nothing, as Unix doesn't differentiate between text and binary.
   return std::error_code();

From 30725efe671bc82bf9095a575aece60fc40fbef5 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Fri, 13 Jun 2025 07:12:41 -0400
Subject: [PATCH 355/851] Fix build after removing delayed typo expression

This addresses issues found by:
  https://lab.llvm.org/buildbot/#/builders/64/builds/4220
  https://lab.llvm.org/buildbot/#/builders/51/builds/17890
---
 clang/lib/Parse/ParseExpr.cpp  |  1 -
 clang/lib/Sema/SemaExpr.cpp    | 48 ----------------------------------
 clang/lib/Sema/SemaExprCXX.cpp | 45 -------------------------------
 3 files changed, 94 deletions(-)

diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index a27a44455b621..3cf3d4ea7d705 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -513,7 +513,6 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) {
       }
     }
 
-    ExprResult OrigLHS = LHS;
     if (!LHS.isInvalid()) {
       // Combine the LHS and RHS into the LHS (e.g. build AST).
       if (TernaryMiddle.isInvalid()) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index b7031bc8c0220..413eff4aa294a 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -2446,42 +2446,6 @@ Sema::DecomposeUnqualifiedId(const UnqualifiedId &Id,
   }
 }
 
-static void emitEmptyLookupTypoDiagnostic(const TypoCorrection &TC,
-                                          Sema &SemaRef, const CXXScopeSpec &SS,
-                                          DeclarationName Typo,
-                                          SourceRange TypoRange,
-                                          unsigned DiagnosticID,
-                                          unsigned DiagnosticSuggestID) {
-  DeclContext *Ctx =
-      SS.isEmpty() ? nullptr : SemaRef.computeDeclContext(SS, false);
-  if (!TC) {
-    // Emit a special diagnostic for failed member lookups.
-    // FIXME: computing the declaration context might fail here (?)
-    if (Ctx)
-      SemaRef.Diag(TypoRange.getBegin(), diag::err_no_member)
-          << Typo << Ctx << TypoRange;
-    else
-      SemaRef.Diag(TypoRange.getBegin(), DiagnosticID) << Typo << TypoRange;
-    return;
-  }
-
-  std::string CorrectedStr = TC.getAsString(SemaRef.getLangOpts());
-  bool DroppedSpecifier =
-      TC.WillReplaceSpecifier() && Typo.getAsString() == CorrectedStr;
-  unsigned NoteID = TC.getCorrectionDeclAs<ImplicitParamDecl>()
-                        ? diag::note_implicit_param_decl
-                        : diag::note_previous_decl;
-  if (!Ctx)
-    SemaRef.diagnoseTypo(
-        TC, SemaRef.PDiag(DiagnosticSuggestID) << Typo << TypoRange,
-        SemaRef.PDiag(NoteID));
-  else
-    SemaRef.diagnoseTypo(TC,
-                         SemaRef.PDiag(diag::err_no_member_suggest)
-                             << Typo << Ctx << DroppedSpecifier << TypoRange,
-                         SemaRef.PDiag(NoteID));
-}
-
 bool Sema::DiagnoseDependentMemberLookup(const LookupResult &R) {
   // During a default argument instantiation the CurContext points
   // to a CXXMethodDecl; but we can't apply a this-> fixit inside a
@@ -14922,18 +14886,6 @@ static void checkObjCPointerIntrospection(Sema &S, ExprResult &L, ExprResult &R,
   }
 }
 
-static NamedDecl *getDeclFromExpr(Expr *E) {
-  if (!E)
-    return nullptr;
-  if (auto *DRE = dyn_cast<DeclRefExpr>(E))
-    return DRE->getDecl();
-  if (auto *ME = dyn_cast<MemberExpr>(E))
-    return ME->getMemberDecl();
-  if (auto *IRE = dyn_cast<ObjCIvarRefExpr>(E))
-    return IRE->getDecl();
-  return nullptr;
-}
-
 // This helper function promotes a binary operator's operands (which are of a
 // half vector type) to a vector of floats and then truncates the result to
 // a vector of either half or short.
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index c653cb56351cb..ba52e8f8932d3 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -7647,51 +7647,6 @@ static void CheckIfAnyEnclosingLambdasMustCaptureAnyPotentialCaptures(
   CurrentLSI->clearPotentialCaptures();
 }
 
-static ExprResult attemptRecovery(Sema &SemaRef,
-                                  const TypoCorrectionConsumer &Consumer,
-                                  const TypoCorrection &TC) {
-  LookupResult R(SemaRef, Consumer.getLookupResult().getLookupNameInfo(),
-                 Consumer.getLookupResult().getLookupKind());
-  const CXXScopeSpec *SS = Consumer.getSS();
-  CXXScopeSpec NewSS;
-
-  // Use an approprate CXXScopeSpec for building the expr.
-  if (auto *NNS = TC.getCorrectionSpecifier())
-    NewSS.MakeTrivial(SemaRef.Context, NNS, TC.getCorrectionRange());
-  else if (SS && !TC.WillReplaceSpecifier())
-    NewSS = *SS;
-
-  if (auto *ND = TC.getFoundDecl()) {
-    R.setLookupName(ND->getDeclName());
-    R.addDecl(ND);
-    if (ND->isCXXClassMember()) {
-      // Figure out the correct naming class to add to the LookupResult.
-      CXXRecordDecl *Record = nullptr;
-      if (auto *NNS = TC.getCorrectionSpecifier())
-        Record = NNS->getAsType()->getAsCXXRecordDecl();
-      if (!Record)
-        Record =
-            dyn_cast<CXXRecordDecl>(ND->getDeclContext()->getRedeclContext());
-      if (Record)
-        R.setNamingClass(Record);
-
-      // Detect and handle the case where the decl might be an implicit
-      // member.
-      if (SemaRef.isPotentialImplicitMemberAccess(
-              NewSS, R, Consumer.isAddressOfOperand()))
-        return SemaRef.BuildPossibleImplicitMemberExpr(
-            NewSS, /*TemplateKWLoc*/ SourceLocation(), R,
-            /*TemplateArgs*/ nullptr, /*S*/ nullptr);
-    } else if (auto *Ivar = dyn_cast<ObjCIvarDecl>(ND)) {
-      return SemaRef.ObjC().LookupInObjCMethod(R, Consumer.getScope(),
-                                               Ivar->getIdentifier());
-    }
-  }
-
-  return SemaRef.BuildDeclarationNameExpr(NewSS, R, /*NeedsADL*/ false,
-                                          /*AcceptInvalidDecl*/ true);
-}
-
 ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC,
                                      bool DiscardedValue, bool IsConstexpr,
                                      bool IsTemplateArgument) {

From 4236423ee863be5903819db57205fc83a4bd21e1 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin <ikuklin@accesssoftek.com>
Date: Fri, 13 Jun 2025 16:31:25 +0500
Subject: [PATCH 356/851] [LLDB] Add bit extraction to DIL (#141422)

---
 lldb/include/lldb/ValueObject/DILAST.h        | 27 +++++++++
 lldb/include/lldb/ValueObject/DILEval.h       |  2 +
 lldb/include/lldb/ValueObject/DILLexer.h      |  1 +
 lldb/source/ValueObject/DILAST.cpp            |  5 ++
 lldb/source/ValueObject/DILEval.cpp           | 32 +++++++++++
 lldb/source/ValueObject/DILLexer.cpp          |  7 ++-
 lldb/source/ValueObject/DILParser.cpp         | 22 ++++++--
 .../TestFrameVarDILArraySubscript.py          |  2 +-
 .../basics/BitFieldExtraction/Makefile        |  3 +
 .../TestFrameVarDILBitFieldExtraction.py      | 56 +++++++++++++++++++
 .../basics/BitFieldExtraction/main.cpp        |  9 +++
 11 files changed, 159 insertions(+), 7 deletions(-)
 create mode 100644 lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/Makefile
 create mode 100644 lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/TestFrameVarDILBitFieldExtraction.py
 create mode 100644 lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/main.cpp

diff --git a/lldb/include/lldb/ValueObject/DILAST.h b/lldb/include/lldb/ValueObject/DILAST.h
index 6c7838e05c93c..709f0639135f1 100644
--- a/lldb/include/lldb/ValueObject/DILAST.h
+++ b/lldb/include/lldb/ValueObject/DILAST.h
@@ -19,6 +19,7 @@ namespace lldb_private::dil {
 /// The various types DIL AST nodes (used by the DIL parser).
 enum class NodeKind {
   eArraySubscriptNode,
+  eBitExtractionNode,
   eErrorNode,
   eIdentifierNode,
   eMemberOfNode,
@@ -153,6 +154,30 @@ class ArraySubscriptNode : public ASTNode {
   int64_t m_index;
 };
 
+class BitFieldExtractionNode : public ASTNode {
+public:
+  BitFieldExtractionNode(uint32_t location, ASTNodeUP base, int64_t first_index,
+                         int64_t last_index)
+      : ASTNode(location, NodeKind::eBitExtractionNode),
+        m_base(std::move(base)), m_first_index(first_index),
+        m_last_index(last_index) {}
+
+  llvm::Expected<lldb::ValueObjectSP> Accept(Visitor *v) const override;
+
+  ASTNode *GetBase() const { return m_base.get(); }
+  int64_t GetFirstIndex() const { return m_first_index; }
+  int64_t GetLastIndex() const { return m_last_index; }
+
+  static bool classof(const ASTNode *node) {
+    return node->GetKind() == NodeKind::eBitExtractionNode;
+  }
+
+private:
+  ASTNodeUP m_base;
+  int64_t m_first_index;
+  int64_t m_last_index;
+};
+
 /// This class contains one Visit method for each specialized type of
 /// DIL AST node. The Visit methods are used to dispatch a DIL AST node to
 /// the correct function in the DIL expression evaluator for evaluating that
@@ -168,6 +193,8 @@ class Visitor {
   Visit(const UnaryOpNode *node) = 0;
   virtual llvm::Expected<lldb::ValueObjectSP>
   Visit(const ArraySubscriptNode *node) = 0;
+  virtual llvm::Expected<lldb::ValueObjectSP>
+  Visit(const BitFieldExtractionNode *node) = 0;
 };
 
 } // namespace lldb_private::dil
diff --git a/lldb/include/lldb/ValueObject/DILEval.h b/lldb/include/lldb/ValueObject/DILEval.h
index 9d0fa53c6622a..2a0cb548a810f 100644
--- a/lldb/include/lldb/ValueObject/DILEval.h
+++ b/lldb/include/lldb/ValueObject/DILEval.h
@@ -54,6 +54,8 @@ class Interpreter : Visitor {
   llvm::Expected<lldb::ValueObjectSP> Visit(const UnaryOpNode *node) override;
   llvm::Expected<lldb::ValueObjectSP>
   Visit(const ArraySubscriptNode *node) override;
+  llvm::Expected<lldb::ValueObjectSP>
+  Visit(const BitFieldExtractionNode *node) override;
 
   // Used by the interpreter to create objects, perform casts, etc.
   lldb::TargetSP m_target;
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
index 7d70f88f9a718..9c1ba97680253 100644
--- a/lldb/include/lldb/ValueObject/DILLexer.h
+++ b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -31,6 +31,7 @@ class Token {
     identifier,
     l_paren,
     l_square,
+    minus,
     numeric_constant,
     period,
     r_paren,
diff --git a/lldb/source/ValueObject/DILAST.cpp b/lldb/source/ValueObject/DILAST.cpp
index 8b5e64ad462cc..b1cd824c2299e 100644
--- a/lldb/source/ValueObject/DILAST.cpp
+++ b/lldb/source/ValueObject/DILAST.cpp
@@ -32,4 +32,9 @@ ArraySubscriptNode::Accept(Visitor *v) const {
   return v->Visit(this);
 }
 
+llvm::Expected<lldb::ValueObjectSP>
+BitFieldExtractionNode::Accept(Visitor *v) const {
+  return v->Visit(this);
+}
+
 } // namespace lldb_private::dil
diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp
index c8cb54aa18a93..b2bb4e20ddc24 100644
--- a/lldb/source/ValueObject/DILEval.cpp
+++ b/lldb/source/ValueObject/DILEval.cpp
@@ -430,4 +430,36 @@ Interpreter::Visit(const ArraySubscriptNode *node) {
   return base->GetSyntheticArrayMember(signed_child_idx, true);
 }
 
+llvm::Expected<lldb::ValueObjectSP>
+Interpreter::Visit(const BitFieldExtractionNode *node) {
+  auto lhs_or_err = Evaluate(node->GetBase());
+  if (!lhs_or_err)
+    return lhs_or_err;
+  lldb::ValueObjectSP base = *lhs_or_err;
+  int64_t first_index = node->GetFirstIndex();
+  int64_t last_index = node->GetLastIndex();
+
+  // if the format given is [high-low], swap range
+  if (first_index > last_index)
+    std::swap(first_index, last_index);
+
+  Status error;
+  if (base->GetCompilerType().IsReferenceType()) {
+    base = base->Dereference(error);
+    if (error.Fail())
+      return error.ToError();
+  }
+  lldb::ValueObjectSP child_valobj_sp =
+      base->GetSyntheticBitFieldChild(first_index, last_index, true);
+  if (!child_valobj_sp) {
+    std::string message = llvm::formatv(
+        "bitfield range {0}-{1} is not valid for \"({2}) {3}\"", first_index,
+        last_index, base->GetTypeName().AsCString("<invalid type>"),
+        base->GetName().AsCString());
+    return llvm::make_error<DILDiagnosticError>(m_expr, message,
+                                                node->GetLocation());
+  }
+  return child_valobj_sp;
+}
+
 } // namespace lldb_private::dil
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
index 99182d2da1131..eaefaf484bc18 100644
--- a/lldb/source/ValueObject/DILLexer.cpp
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -34,6 +34,8 @@ llvm::StringRef Token::GetTokenName(Kind kind) {
     return "l_paren";
   case Kind::l_square:
     return "l_square";
+  case Kind::minus:
+    return "minus";
   case Kind::numeric_constant:
     return "numeric_constant";
   case Kind::period:
@@ -113,8 +115,9 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
 
   constexpr std::pair<Token::Kind, const char *> operators[] = {
       {Token::amp, "&"},     {Token::arrow, "->"},   {Token::coloncolon, "::"},
-      {Token::l_paren, "("}, {Token::l_square, "["}, {Token::period, "."},
-      {Token::r_paren, ")"}, {Token::r_square, "]"}, {Token::star, "*"},
+      {Token::l_paren, "("}, {Token::l_square, "["}, {Token::minus, "-"},
+      {Token::period, "."},  {Token::r_paren, ")"},  {Token::r_square, "]"},
+      {Token::star, "*"},
   };
   for (auto [kind, str] : operators) {
     if (remainder.consume_front(str))
diff --git a/lldb/source/ValueObject/DILParser.cpp b/lldb/source/ValueObject/DILParser.cpp
index 9667885734f21..32af0820acb98 100644
--- a/lldb/source/ValueObject/DILParser.cpp
+++ b/lldb/source/ValueObject/DILParser.cpp
@@ -120,6 +120,7 @@ ASTNodeUP DILParser::ParseUnaryExpression() {
 //  postfix_expression:
 //    primary_expression
 //    postfix_expression "[" integer_literal "]"
+//    postfix_expression "[" integer_literal "-" integer_literal "]"
 //    postfix_expression "." id_expression
 //    postfix_expression "->" id_expression
 //
@@ -131,17 +132,30 @@ ASTNodeUP DILParser::ParsePostfixExpression() {
     switch (token.GetKind()) {
     case Token::l_square: {
       m_dil_lexer.Advance();
-      std::optional<int64_t> rhs = ParseIntegerConstant();
-      if (!rhs) {
+      std::optional<int64_t> index = ParseIntegerConstant();
+      if (!index) {
         BailOut(
             llvm::formatv("failed to parse integer constant: {0}", CurToken()),
             CurToken().GetLocation(), CurToken().GetSpelling().length());
         return std::make_unique<ErrorNode>();
       }
+      if (CurToken().GetKind() == Token::minus) {
+        m_dil_lexer.Advance();
+        std::optional<int64_t> last_index = ParseIntegerConstant();
+        if (!last_index) {
+          BailOut(llvm::formatv("failed to parse integer constant: {0}",
+                                CurToken()),
+                  CurToken().GetLocation(), CurToken().GetSpelling().length());
+          return std::make_unique<ErrorNode>();
+        }
+        lhs = std::make_unique<BitFieldExtractionNode>(
+            loc, std::move(lhs), std::move(*index), std::move(*last_index));
+      } else {
+        lhs = std::make_unique<ArraySubscriptNode>(loc, std::move(lhs),
+                                                   std::move(*index));
+      }
       Expect(Token::r_square);
       m_dil_lexer.Advance();
-      lhs = std::make_unique<ArraySubscriptNode>(loc, std::move(lhs),
-                                                 std::move(*rhs));
       break;
     }
     case Token::period:
diff --git a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
index 0d91f804ce565..c90e0eaa63638 100644
--- a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
+++ b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py
@@ -63,7 +63,7 @@ def test_subscript(self):
         self.expect(
             "frame var 'int_arr[-1]'",
             error=True,
-            substrs=["unrecognized token"],
+            substrs=["failed to parse integer constant"],
         )
 
         # Test for floating point index
diff --git a/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/Makefile b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/Makefile
new file mode 100644
index 0000000000000..99998b20bcb05
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/TestFrameVarDILBitFieldExtraction.py b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/TestFrameVarDILBitFieldExtraction.py
new file mode 100644
index 0000000000000..7b5ef0650b6e1
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/TestFrameVarDILBitFieldExtraction.py
@@ -0,0 +1,56 @@
+"""
+Test DIL BifField extraction.
+"""
+
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+from lldbsuite.test import lldbutil
+
+
+class TestFrameVarDILBitFieldExtraction(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def expect_var_path(self, expr, compare_to_framevar=False, value=None, type=None):
+        value_dil = super().expect_var_path(expr, value=value, type=type)
+        if compare_to_framevar:
+            self.runCmd("settings set target.experimental.use-DIL false")
+            value_frv = super().expect_var_path(expr, value=value, type=type)
+            self.runCmd("settings set target.experimental.use-DIL true")
+            self.assertEqual(value_dil.GetValue(), value_frv.GetValue())
+
+    def test_bitfield_extraction(self):
+        self.build()
+        lldbutil.run_to_source_breakpoint(
+            self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp")
+        )
+
+        self.runCmd("settings set target.experimental.use-DIL true")
+
+        # Test ranges and type
+        self.expect_var_path("value[0-1]", True, value="3", type="int:2")
+        self.expect_var_path("value[4-7]", True, value="7", type="int:4")
+        self.expect_var_path("value[7-0]", True, value="115", type="int:8")
+
+        # Test reference and dereferenced pointer
+        self.expect_var_path("value_ref[0-1]", value="3", type="int:2")
+        self.expect_var_path("(*value_ptr)[0-1]", value="3", type="int:2")
+
+        # Test array and pointer
+        self.expect(
+            "frame var 'int_arr[0-2]'",
+            error=True,
+            substrs=["bitfield range 0-2 is not valid"],
+        )
+        self.expect(
+            "frame var 'value_ptr[0-1]'",
+            error=True,
+            substrs=["bitfield range 0-1 is not valid"],
+        )
+
+        # Test invalid input
+        self.expect(
+            "frame var 'value[1-]'",
+            error=True,
+            substrs=["failed to parse integer constant"],
+        )
diff --git a/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/main.cpp
new file mode 100644
index 0000000000000..a35f68a9e30a8
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/BitFieldExtraction/main.cpp
@@ -0,0 +1,9 @@
+int main(int argc, char **argv) {
+  int value = 0b01110011;
+  int &value_ref = value;
+  int *value_ptr = &value;
+
+  int int_arr[] = {7, 3, 1};
+
+  return 0; // Set a breakpoint here
+}

From 41b37f05554ae59974675ae219430b5598c6159f Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Fri, 13 Jun 2025 12:43:27 +0100
Subject: [PATCH 357/851] [lldb] CommandObjectMemoryFind: Improve expression
 evaluation error messages (#144036)

We now bubble up the expression evaluation diagnostics to the user and
also distinguish between "expression failed to parse/run" versus other
ways in which expressions didn't complete (e.g., setup errors, etc.).

Before:
```
(lldb) memory find -e "" 0x16fdfedc0 0x16fdfede0
error: expression evaluation failed. pass a string instead
(lldb) memory find -e "invalid" 0x16fdfedc0 0x16fdfede0
error: expression evaluation failed. pass a string instead
```

After:
```
(lldb) memory find -e "" 0x16fdfedc0 0x16fdfede0
error: Expression evaluation failed:
error: No result returned from expression. Exit status: 1
(lldb) memory find -e "invalid" 0x16fdfedc0 0x16fdfede0
error: Expression evaluation failed:
error: <user expression 0>:1:1: use of undeclared identifier 'invalid'
    1 | invalid
      | ^~~~~~~
```
---
 lldb/source/Commands/CommandObjectMemory.cpp      |  8 ++++++--
 .../functionalities/memory/find/TestMemoryFind.py | 15 ++++++++++++++-
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectMemory.cpp b/lldb/source/Commands/CommandObjectMemory.cpp
index ccb06d8ff4d59..5792c13373c1e 100644
--- a/lldb/source/Commands/CommandObjectMemory.cpp
+++ b/lldb/source/Commands/CommandObjectMemory.cpp
@@ -925,9 +925,12 @@ EvaluateExpression(llvm::StringRef expression, StackFrame &frame,
   ValueObjectSP result_sp;
   auto status =
       process.GetTarget().EvaluateExpression(expression, &frame, result_sp);
-  if (status != eExpressionCompleted || !result_sp)
+  if (!result_sp)
     return llvm::createStringError(
-        "expression evaluation failed. pass a string instead");
+        "No result returned from expression. Exit status: %d", status);
+
+  if (status != eExpressionCompleted)
+    return result_sp->GetError().ToError();
 
   result_sp = result_sp->GetQualifiedRepresentationIfAvailable(
       result_sp->GetDynamicValueType(), /*synthValue=*/true);
@@ -1082,6 +1085,7 @@ class CommandObjectMemoryFind : public CommandObjectParsed {
           m_memory_options.m_expr.GetValueAs<llvm::StringRef>().value_or(""),
           m_exe_ctx.GetFrameRef(), *process);
       if (!result_or_err) {
+        result.AppendError("Expression evaluation failed: ");
         result.AppendError(llvm::toString(result_or_err.takeError()));
         return;
       }
diff --git a/lldb/test/API/functionalities/memory/find/TestMemoryFind.py b/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
index 72426e75e013e..a06b0d960889c 100644
--- a/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
+++ b/lldb/test/API/functionalities/memory/find/TestMemoryFind.py
@@ -56,10 +56,23 @@ def test_memory_find(self):
         # Invalid expr is an error.
         self.expect(
             'memory find -e "not_a_symbol" `&bytedata[0]` `&bytedata[15]`',
+            substrs=[
+                "Expression evaluation failed:",
+                "use of undeclared identifier 'not_a_symbol'",
+            ],
+            error=True,
+        )
+
+        self.expect(
+            'memory find -e "" `&bytedata[0]` `&bytedata[2]`',
+            substrs=[
+                "Expression evaluation failed:",
+                "No result returned from expression. Exit status: 1",
+            ],
             error=True,
-            substrs=["error: expression evaluation failed. pass a string instead"],
         )
 
+        # Valid expressions/strings
         self.expect(
             'memory find -e "(uint8_t)0x22" `&bytedata[0]` `&bytedata[15]`',
             substrs=["data found at location: 0x", "22 33 44 55 66"],

From f1036d844e4b886ac702859ccf8a19cf2153c7f7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 12:49:22 +0100
Subject: [PATCH 358/851] [X86] X86InstrInfo::commuteInstructionImpl - remove
 (V)BLENDPD/S commutation to (V)MOVSD/S optsize handling (#144051)

Just commute with (V)BLENDPD/S like all other BLEND instructions

This is now handled more generally by the X86FixupInstTuningPass (OptSize fold occurs even without a scheduler model).

First step towards #142972
---
 llvm/lib/Target/X86/X86InstrInfo.cpp | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 212e134c512a7..abf365eedec39 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2353,33 +2353,9 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     break;
   case X86::BLENDPDrri:
   case X86::BLENDPSrri:
+  case X86::PBLENDWrri:
   case X86::VBLENDPDrri:
   case X86::VBLENDPSrri:
-    // If we're optimizing for size, try to use MOVSD/MOVSS.
-    if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
-      unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 0x03: 0x0F;
-      if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
-#define FROM_TO(FROM, TO)                                                      \
-  case X86::FROM:                                                              \
-    Opc = X86::TO;                                                             \
-    break;
-        switch (Opc) {
-        default:
-          llvm_unreachable("Unreachable!");
-        FROM_TO(BLENDPDrri, MOVSDrr)
-        FROM_TO(BLENDPSrri, MOVSSrr)
-        FROM_TO(VBLENDPDrri, VMOVSDrr)
-        FROM_TO(VBLENDPSrri, VMOVSSrr)
-        }
-        WorkingMI = CloneIfNew(MI);
-        WorkingMI->setDesc(get(Opc));
-        WorkingMI->removeOperand(3);
-        break;
-      }
-#undef FROM_TO
-    }
-    [[fallthrough]];
-  case X86::PBLENDWrri:
   case X86::VBLENDPDYrri:
   case X86::VBLENDPSYrri:
   case X86::VPBLENDDrri:

From cc365331af423de99ae98655d035e4892842fe97 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer@sony.com>
Date: Fri, 13 Jun 2025 13:54:30 +0200
Subject: [PATCH 359/851] [DLCov] Origin-Tracking: Add config options (#143590)

This patch is part of a series that adds origin-tracking to the debugify
source location coverage checks, allowing us to report symbolized stack
traces of the point where missing source locations appear.

This patch adds the configuration options needed to enable this feature,
in the form of a new CMake option that enables a flag in
`llvm-config.h`; this is not an entirely new CMake flag, but a new
option, `COVERAGE_AND_ORIGIN`, for the existing flag
`LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING`. This patch contains
documentation, but no actual implementation for the flag itself.
---
 llvm/CMakeLists.txt                          |  4 ++--
 llvm/cmake/modules/HandleLLVMOptions.cmake   |  3 +++
 llvm/docs/CMake.rst                          | 13 ++++++++-----
 llvm/include/llvm/Config/llvm-config.h.cmake |  4 ++++
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index cfb67472aa71e..0849bec26d56a 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -569,8 +569,8 @@ endif()
 option(LLVM_ENABLE_CRASH_DUMPS "Turn on memory dumps on crashes. Currently only implemented on Windows." OFF)
 
 set(LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING "DISABLED" CACHE STRING
-  "Enhance Debugify's line number coverage tracking; enabling this is ABI-breaking. Can be DISABLED, or COVERAGE.")
-set_property(CACHE LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING PROPERTY STRINGS DISABLED COVERAGE)
+  "Enhance Debugify's line number coverage tracking; enabling this is ABI-breaking. Can be DISABLED, COVERAGE, or COVERAGE_AND_ORIGIN.")
+set_property(CACHE LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING PROPERTY STRINGS DISABLED COVERAGE COVERAGE_AND_ORIGIN)
 
 option(LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS
   "Add additional fields to DILocations to support Key Instructions" OFF)
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 9721dacbcbe84..c35d9763a3301 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -200,6 +200,9 @@ string(TOUPPER "${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}" uppercase_LLVM_ENABLE
 
 if( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE" )
   set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
+elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE_AND_ORIGIN" )
+  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
+  set( LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "DISABLED" OR NOT DEFINED LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING )
   # The DISABLED setting is default and requires no additional defines.
 else()
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 674e4969c6912..72f19fd353922 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -482,11 +482,14 @@ enabled sub-projects. Nearly all of these variable names begin with
 **LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING**:STRING
   Enhances Debugify's ability to detect line number errors by storing extra
   information inside Instructions, removing false positives from Debugify's
-  results at the cost of performance. Allowed values are `DISABLED` (default)
-  and `COVERAGE`. `COVERAGE` tracks whether and why a line number was
-  intentionally dropped or not generated for an instruction, allowing Debugify
-  to avoid reporting these as errors; this comes with a small performance cost
-  of ~0.1%. `COVERAGE` is an ABI-breaking option.
+  results at the cost of performance. Allowed values are `DISABLED` (default),
+  `COVERAGE`, and `COVERAGE_AND_ORIGIN`. `COVERAGE` tracks whether and why a
+  line number was intentionally dropped or not generated for an instruction,
+  allowing Debugify to avoid reporting these as errors; this comes with a small
+  performance cost of ~0.1%. `COVERAGE_AND_ORIGIN` additionally stores a
+  stacktrace of the point where each DebugLoc is unintentionally dropped,
+  allowing for much easier bug triaging at the cost of a ~10x performance
+  slowdown. `COVERAGE` and `COVERAGE_AND_ORIGIN` are ABI-breaking options.
 
 **LLVM_ENABLE_DIA_SDK**:BOOL
   Enable building with MSVC DIA SDK for PDB debugging support. Available
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index dbc882937b4f4..6d3c37cc8b194 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -133,4 +133,8 @@
    and to 0 otherwise. */
 #cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 
+/* Define to 1 to enable expensive tracking of the origin of debug location
+   coverage bugs, and to 0 otherwise. */
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+
 #endif

From fbea0fc5c77713a4d62db2512b1b51cc76ed6a25 Mon Sep 17 00:00:00 2001
From: Martin Wehking <martin.wehking@arm.com>
Date: Fri, 13 Jun 2025 13:33:46 +0100
Subject: [PATCH 360/851] Add Macro for CSSC Feature (#143148)

Add a new __ARM_FEATURE_CSSC macro that can be utilized during the
preprocessing stage.

__ARM_FEATURE_CSSC is defined to 1 if there is hardware support for
CSSC.

Implements the ACLE change:
https://github.com/ARM-software/acle/pull/394
---
 clang/lib/Basic/Targets/AArch64.cpp               | 6 ++++++
 clang/lib/Basic/Targets/AArch64.h                 | 1 +
 clang/test/Preprocessor/aarch64-target-features.c | 5 ++++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index e8abdf9aafd82..124b340b62d9f 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -625,6 +625,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasCRC)
     Builder.defineMacro("__ARM_FEATURE_CRC32", "1");
 
+  if (HasCSSC)
+    Builder.defineMacro("__ARM_FEATURE_CSSC", "1");
+
   if (HasRCPC3)
     Builder.defineMacro("__ARM_FEATURE_RCPC", "3");
   else if (HasRCPC)
@@ -874,6 +877,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const {
       .Case("rdm", HasRDM)
       .Case("lse", HasLSE)
       .Case("crc", HasCRC)
+      .Case("cssc", HasCSSC)
       .Case("sha2", HasSHA2)
       .Case("sha3", HasSHA3)
       .Cases("aes", "pmull", HasAES)
@@ -1249,6 +1253,8 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasPAuthLR = true;
       HasPAuth = true;
     }
+    if (Feature == "+cssc")
+      HasCSSC = true;
   }
 
   // Check features that are manually disabled by command line options.
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index a4c65361105e4..1951e0679d2ec 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -66,6 +66,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
 
   unsigned FPU = FPUMode;
   bool HasCRC = false;
+  bool HasCSSC = false;
   bool HasAES = false;
   bool HasSHA2 = false;
   bool HasSHA3 = false;
diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c
index 4cb9b6ce53b0d..fd83e4b689a2a 100644
--- a/clang/test/Preprocessor/aarch64-target-features.c
+++ b/clang/test/Preprocessor/aarch64-target-features.c
@@ -744,7 +744,10 @@
 // CHECK-SMEB16B16: __ARM_FEATURE_SME2 1
 // CHECK-SMEB16B16: __ARM_FEATURE_SME_B16B16 1
 // CHECK-SMEB16B16: __ARM_FEATURE_SVE_B16B16 1
-//
+
+// RUN: %clang --target=aarch64 -march=armv9-a+cssc -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-CSSC %s
+// CHECK-CSSC: __ARM_FEATURE_CSSC 1
+
 //  RUN: %clang --target=aarch64 -march=armv9-a+fp8 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-FP8 %s
 // CHECK-FP8: __ARM_FEATURE_FP8 1
 

From 9a237f35ef58c838a461d560908e380c481aadad Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Fri, 13 Jun 2025 08:39:00 -0400
Subject: [PATCH 361/851] [AMDGPU][AsmParser] Support true16 register suffix
 for valid register range (#143997)

---
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      | 19 ++++++++++++++++---
 llvm/test/MC/AMDGPU/gfx11_asm_vop1.s          |  6 ++++++
 llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s  |  6 ++++++
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b43876582daa8..0dc1d13773229 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1395,7 +1395,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   MCRegister ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
                           unsigned &RegWidth,
                           SmallVectorImpl<AsmToken> &Tokens);
-  bool ParseRegRange(unsigned& Num, unsigned& Width);
+  bool ParseRegRange(unsigned &Num, unsigned &Width, unsigned &SubReg);
   MCRegister getRegularReg(RegisterKind RegKind, unsigned RegNum,
                            unsigned SubReg, unsigned RegWidth, SMLoc Loc);
 
@@ -2857,7 +2857,8 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
   return Reg;
 }
 
-bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) {
+bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth,
+                                    unsigned &SubReg) {
   int64_t RegLo, RegHi;
   if (!skipToken(AsmToken::LBrac, "missing register index"))
     return false;
@@ -2894,8 +2895,20 @@ bool AMDGPUAsmParser::ParseRegRange(unsigned &Num, unsigned &RegWidth) {
     return false;
   }
 
+  if (RegHi == RegLo) {
+    StringRef RegSuffix = getTokenStr();
+    if (RegSuffix == ".l") {
+      SubReg = AMDGPU::lo16;
+      lex();
+    } else if (RegSuffix == ".h") {
+      SubReg = AMDGPU::hi16;
+      lex();
+    }
+  }
+
   Num = static_cast<unsigned>(RegLo);
   RegWidth = 32 * ((RegHi - RegLo) + 1);
+
   return true;
 }
 
@@ -2949,7 +2962,7 @@ MCRegister AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
     RegWidth = 32;
   } else {
     // Range of registers: v[XX:YY]. ":YY" is optional.
-    if (!ParseRegRange(RegNum, RegWidth))
+    if (!ParseRegRange(RegNum, RegWidth, SubReg))
       return MCRegister();
   }
 
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
index 93e01954bea55..f1438532d7c5e 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1.s
@@ -3808,3 +3808,9 @@ v_trunc_f64 v[5:6], src_scc
 
 v_trunc_f64 v[254:255], 0xaf123456
 // GFX11: v_trunc_f64_e32 v[254:255], 0xaf123456  ; encoding: [0xff,0x2e,0xfc,0x7f,0x56,0x34,0x12,0xaf]
+
+v_trunc_f16 v[5].l, v[1].h
+// GFX11: v_trunc_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbb,0x0a,0x7e]
+
+v_trunc_f16 v[5:5].l, v[1:1].h
+// GFX11: v_trunc_f16_e32 v5.l, v1.h              ; encoding: [0x81,0xbb,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
index 55a25ad3ec81b..d19220867f29f 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_t16_err.s
@@ -1231,3 +1231,9 @@ v_trunc_f16_e32 v5.l, v199.l dpp8:[7,6,5,4,3,2,1,0]
 
 v_trunc_f16_e32 v5.l, v199.l quad_perm:[3,2,1,0]
 // GFX11: :[[@LINE-1]]:23: error: invalid operand for instruction
+
+v_ceil_f16_e32 v[5:5].s, 0xfe0b
+// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+
+v_ceil_f16_e32 v[6:7].l, 0xfe0b
+// GFX11: :[[@LINE-1]]:16: error: invalid operand for instruction

From d7ddd461162cc5585408417f64dd160929dd0691 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 13:47:45 +0100
Subject: [PATCH 362/851] [X86] Add start/end debug messages for the
 X86CompressEVEXPass and X86PadShortFunctionPass (#144056)

---
 llvm/lib/Target/X86/X86CompressEVEX.cpp     | 3 ++-
 llvm/lib/Target/X86/X86PadShortFunction.cpp | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
index fe593aa307df5..4ea30de78402f 100644
--- a/llvm/lib/Target/X86/X86CompressEVEX.cpp
+++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -300,6 +300,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
 }
 
 bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "Start X86CompressEVEXPass\n";);
 #ifndef NDEBUG
   // Make sure the tables are sorted.
   static std::atomic<bool> TableChecked(false);
@@ -320,7 +321,7 @@ bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
     for (MachineInstr &MI : MBB)
       Changed |= CompressEVEXImpl(MI, ST);
   }
-
+  LLVM_DEBUG(dbgs() << "End X86CompressEVEXPass\n";);
   return Changed;
 }
 
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
index 170ca2a932502..049384eefa188 100644
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -100,6 +100,7 @@ FunctionPass *llvm::createX86PadShortFunctions() {
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// NOOP instructions before early exits.
 bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "Start X86PadShortFunctionPass\n";);
   if (skipFunction(MF.getFunction()))
     return false;
 
@@ -149,7 +150,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
       MadeChange = true;
     }
   }
-
+  LLVM_DEBUG(dbgs() << "End X86PadShortFunctionPass\n";);
   return MadeChange;
 }
 

From 4f8187c0dc6e7a818ebf3272a0c022203f901e96 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Fri, 13 Jun 2025 14:53:58 +0200
Subject: [PATCH 363/851] [TSan] Regenerate test checks (NFC)

---
 .../ThreadSanitizer/atomic-non-integer.ll     | 76 ++++++++++++++-----
 1 file changed, 58 insertions(+), 18 deletions(-)

diff --git a/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll b/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
index 40c4bef3bff9b..8bcabaecf0fdc 100644
--- a/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
+++ b/llvm/test/Instrumentation/ThreadSanitizer/atomic-non-integer.ll
@@ -1,51 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=tsan -S | FileCheck %s
 ; Check that atomic memory operations on floating-point types are converted to calls into ThreadSanitizer runtime.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 define float @load_float(ptr %fptr) {
+; CHECK-LABEL: define float @load_float(
+; CHECK-SAME: ptr [[FPTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @__tsan_atomic32_load(ptr [[FPTR]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; CHECK-NEXT:    [[V:%.*]] = load atomic float, ptr [[FPTR]] unordered, align 4
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret float [[TMP3]]
+;
   %v = load atomic float, ptr %fptr unordered, align 4
   ret float %v
-  ; CHECK-LABEL: load_float
-  ; CHECK: call i32 @__tsan_atomic32_load(ptr %{{.+}}, i32 0)
-  ; CHECK: bitcast i32 {{.+}} to float
 }
 
 define double @load_double(ptr %fptr) {
+; CHECK-LABEL: define double @load_double(
+; CHECK-SAME: ptr [[FPTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @__tsan_atomic64_load(ptr [[FPTR]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
+; CHECK-NEXT:    [[V:%.*]] = load atomic double, ptr [[FPTR]] unordered, align 8
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret double [[TMP3]]
+;
   %v = load atomic double, ptr %fptr unordered, align 8
   ret double %v
-  ; CHECK-LABEL: load_double
-  ; CHECK: call i64 @__tsan_atomic64_load(ptr %{{.+}}, i32 0)
-  ; CHECK: bitcast i64 {{.+}} to double
 }
 
 define fp128 @load_fp128(ptr %fptr) {
+; CHECK-LABEL: define fp128 @load_fp128(
+; CHECK-SAME: ptr [[FPTR:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call i128 @__tsan_atomic128_load(ptr [[FPTR]], i32 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128 [[TMP2]] to fp128
+; CHECK-NEXT:    [[V:%.*]] = load atomic fp128, ptr [[FPTR]] unordered, align 16
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret fp128 [[TMP3]]
+;
   %v = load atomic fp128, ptr %fptr unordered, align 16
   ret fp128 %v
-  ; CHECK-LABEL: load_fp128
-  ; CHECK: call i128 @__tsan_atomic128_load(ptr %{{.+}}, i32 0)
-  ; CHECK: bitcast i128 {{.+}} to fp128
 }
 
 define void @store_float(ptr %fptr, float %v) {
+; CHECK-LABEL: define void @store_float(
+; CHECK-SAME: ptr [[FPTR:%.*]], float [[V:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[V]] to i32
+; CHECK-NEXT:    call void @__tsan_atomic32_store(ptr [[FPTR]], i32 [[TMP2]], i32 0)
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret void
+;
   store atomic float %v, ptr %fptr unordered, align 4
   ret void
-  ; CHECK-LABEL: store_float
-  ; CHECK: bitcast float %v to i32
-  ; CHECK: call void @__tsan_atomic32_store(ptr %{{.+}}, i32 %{{.+}}, i32 0)
 }
 
 define void @store_double(ptr %fptr, double %v) {
+; CHECK-LABEL: define void @store_double(
+; CHECK-SAME: ptr [[FPTR:%.*]], double [[V:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[V]] to i64
+; CHECK-NEXT:    call void @__tsan_atomic64_store(ptr [[FPTR]], i64 [[TMP2]], i32 0)
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret void
+;
   store atomic double %v, ptr %fptr unordered, align 8
   ret void
-  ; CHECK-LABEL: store_double
-  ; CHECK: bitcast double %v to i64
-  ; CHECK: call void @__tsan_atomic64_store(ptr %{{.+}}, i64 %{{.+}}, i32 0)
 }
 
 define void @store_fp128(ptr %fptr, fp128 %v) {
+; CHECK-LABEL: define void @store_fp128(
+; CHECK-SAME: ptr [[FPTR:%.*]], fp128 [[V:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT:    call void @__tsan_func_entry(ptr [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast fp128 [[V]] to i128
+; CHECK-NEXT:    call void @__tsan_atomic128_store(ptr [[FPTR]], i128 [[TMP2]], i32 0)
+; CHECK-NEXT:    call void @__tsan_func_exit()
+; CHECK-NEXT:    ret void
+;
   store atomic fp128 %v, ptr %fptr unordered, align 16
   ret void
-  ; CHECK-LABEL: store_fp128
-  ; CHECK: bitcast fp128 %v to i128
-  ; CHECK: call void @__tsan_atomic128_store(ptr %{{.+}}, i128 %{{.+}}, i32 0)
 }

From a59e4acd753007c83594a6a56654025d4202a528 Mon Sep 17 00:00:00 2001
From: Ryan Buchner <92571492+bababuck@users.noreply.github.com>
Date: Fri, 13 Jun 2025 05:57:46 -0700
Subject: [PATCH 364/851] [RISCV] Lower SELECT's with one constant more
 efficiently using Zicond (#143581)

See #143580 for MR with the test commit.

Performs the following transformations:
(select c, c1, t) -> (add (czero_nez t - c1, c), c1)
(select c, t, c1) -> (add (czero_eqz t - c1, c), c1)


@mgudim
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 26 +++++++++++
 .../CodeGen/RISCV/short-forward-branch-opt.ll | 46 ++++++++++++-------
 llvm/test/CodeGen/RISCV/zicond-opts.ll        | 38 +++++++--------
 3 files changed, 72 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7839af5c16917..7cfada6c0601c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9096,6 +9096,32 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       return DAG.getNode(ISD::ADD, DL, VT, CMOV, RHSVal);
     }
 
+    // (select c, c1, t) -> (add (czero_nez t - c1, c), c1)
+    // (select c, t, c1) -> (add (czero_eqz t - c1, c), c1)
+    if (isa<ConstantSDNode>(TrueV) != isa<ConstantSDNode>(FalseV)) {
+      bool IsCZERO_NEZ = isa<ConstantSDNode>(TrueV);
+      SDValue ConstVal = IsCZERO_NEZ ? TrueV : FalseV;
+      SDValue RegV = IsCZERO_NEZ ? FalseV : TrueV;
+      int64_t RawConstVal = cast<ConstantSDNode>(ConstVal)->getSExtValue();
+      // Fall back to XORI if Const == -0x800
+      if (RawConstVal == -0x800) {
+        SDValue XorOp = DAG.getNode(ISD::XOR, DL, VT, RegV, ConstVal);
+        SDValue CMOV =
+            DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
+                        DL, VT, XorOp, CondV);
+        return DAG.getNode(ISD::XOR, DL, VT, CMOV, ConstVal);
+      }
+      // Efficient only if the constant and its negation fit into `ADDI`
+      // Prefer Add/Sub over Xor since can be compressed for small immediates
+      if (isInt<12>(RawConstVal)) {
+        SDValue SubOp = DAG.getNode(ISD::SUB, DL, VT, RegV, ConstVal);
+        SDValue CMOV =
+            DAG.getNode(IsCZERO_NEZ ? RISCVISD::CZERO_NEZ : RISCVISD::CZERO_EQZ,
+                        DL, VT, SubOp, CondV);
+        return DAG.getNode(ISD::ADD, DL, VT, CMOV, ConstVal);
+      }
+    }
+
     // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c))
     // Unless we have the short forward branch optimization.
     if (!Subtarget.hasConditionalMoveFusion())
diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
index b7b88584f3bdb..13c43a3875a08 100644
--- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll
@@ -173,14 +173,21 @@ define signext i32 @test6(i32 signext %x, i32 signext %z) {
 ; NOSFB-NEXT:    or a0, a0, a1
 ; NOSFB-NEXT:    ret
 ;
-; SFB-LABEL: test6:
-; SFB:       # %bb.0:
-; SFB-NEXT:    li a2, -1
-; SFB-NEXT:    beqz a1, .LBB5_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    mv a0, a2
-; SFB-NEXT:  .LBB5_2:
-; SFB-NEXT:    ret
+; NOZICOND-LABEL: test6:
+; NOZICOND:       # %bb.0:
+; NOZICOND-NEXT:    li a2, -1
+; NOZICOND-NEXT:    beqz a1, .LBB5_2
+; NOZICOND-NEXT:  # %bb.1:
+; NOZICOND-NEXT:    mv a0, a2
+; NOZICOND-NEXT:  .LBB5_2:
+; NOZICOND-NEXT:    ret
+;
+; ZICOND-LABEL: test6:
+; ZICOND:       # %bb.0:
+; ZICOND-NEXT:    addi a0, a0, 1
+; ZICOND-NEXT:    czero.nez a0, a0, a1
+; ZICOND-NEXT:    addi a0, a0, -1
+; ZICOND-NEXT:    ret
   %c = icmp eq i32 %z, 0
   %b = select i1 %c, i32 %x, i32 -1
   ret i32 %b
@@ -195,14 +202,21 @@ define signext i32 @test7(i32 signext %x, i32 signext %z) {
 ; NOSFB-NEXT:    or a0, a0, a1
 ; NOSFB-NEXT:    ret
 ;
-; SFB-LABEL: test7:
-; SFB:       # %bb.0:
-; SFB-NEXT:    li a2, -1
-; SFB-NEXT:    bnez a1, .LBB6_2
-; SFB-NEXT:  # %bb.1:
-; SFB-NEXT:    mv a0, a2
-; SFB-NEXT:  .LBB6_2:
-; SFB-NEXT:    ret
+; NOZICOND-LABEL: test7:
+; NOZICOND:       # %bb.0:
+; NOZICOND-NEXT:    li a2, -1
+; NOZICOND-NEXT:    bnez a1, .LBB6_2
+; NOZICOND-NEXT:  # %bb.1:
+; NOZICOND-NEXT:    mv a0, a2
+; NOZICOND-NEXT:  .LBB6_2:
+; NOZICOND-NEXT:    ret
+;
+; ZICOND-LABEL: test7:
+; ZICOND:       # %bb.0:
+; ZICOND-NEXT:    addi a0, a0, 1
+; ZICOND-NEXT:    czero.eqz a0, a0, a1
+; ZICOND-NEXT:    addi a0, a0, -1
+; ZICOND-NEXT:    ret
   %c = icmp eq i32 %z, 0
   %b = select i1 %c, i32 -1, i32 %x
   ret i32 %b
diff --git a/llvm/test/CodeGen/RISCV/zicond-opts.ll b/llvm/test/CodeGen/RISCV/zicond-opts.ll
index f5a25868bd12b..2512ba803cf48 100644
--- a/llvm/test/CodeGen/RISCV/zicond-opts.ll
+++ b/llvm/test/CodeGen/RISCV/zicond-opts.ll
@@ -146,20 +146,18 @@ define i64 @select_imm_reg(i64 %t, i1 %cond) {
 ; RV32ZICOND-LABEL: select_imm_reg:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a2, a2, 1
-; RV32ZICOND-NEXT:    li a3, 3
-; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
-; RV32ZICOND-NEXT:    or a0, a3, a0
+; RV32ZICOND-NEXT:    addi a0, a0, -3
 ; RV32ZICOND-NEXT:    czero.nez a1, a1, a2
+; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
+; RV32ZICOND-NEXT:    addi a0, a0, 3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: select_imm_reg:
 ; RV64ZICOND:       # %bb.0:
 ; RV64ZICOND-NEXT:    andi a1, a1, 1
-; RV64ZICOND-NEXT:    li a2, 3
+; RV64ZICOND-NEXT:    addi a0, a0, -3
 ; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
-; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
-; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    addi a0, a0, 3
 ; RV64ZICOND-NEXT:    ret
   %4 = select i1 %cond, i64 3, i64 %t
   ret i64 %4
@@ -170,20 +168,18 @@ define i64 @select_reg_imm(i64 %t, i1 %cond) {
 ; RV32ZICOND-LABEL: select_reg_imm:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a2, a2, 1
-; RV32ZICOND-NEXT:    li a3, 3
-; RV32ZICOND-NEXT:    czero.nez a3, a3, a2
-; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
-; RV32ZICOND-NEXT:    or a0, a0, a3
+; RV32ZICOND-NEXT:    addi a0, a0, -3
 ; RV32ZICOND-NEXT:    czero.eqz a1, a1, a2
+; RV32ZICOND-NEXT:    czero.eqz a0, a0, a2
+; RV32ZICOND-NEXT:    addi a0, a0, 3
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: select_reg_imm:
 ; RV64ZICOND:       # %bb.0:
 ; RV64ZICOND-NEXT:    andi a1, a1, 1
-; RV64ZICOND-NEXT:    li a2, 3
-; RV64ZICOND-NEXT:    czero.nez a2, a2, a1
+; RV64ZICOND-NEXT:    addi a0, a0, -3
 ; RV64ZICOND-NEXT:    czero.eqz a0, a0, a1
-; RV64ZICOND-NEXT:    or a0, a0, a2
+; RV64ZICOND-NEXT:    addi a0, a0, 3
 ; RV64ZICOND-NEXT:    ret
   %4 = select i1 %cond, i64 %t, i64 3
   ret i64 %4
@@ -194,21 +190,19 @@ define i64 @select_imm_reg_neg_2048(i64 %t, i1 %cond) {
 ; RV32ZICOND-LABEL: select_imm_reg_neg_2048:
 ; RV32ZICOND:       # %bb.0:
 ; RV32ZICOND-NEXT:    andi a2, a2, 1
-; RV32ZICOND-NEXT:    li a3, -2048
+; RV32ZICOND-NEXT:    xori a0, a0, -2048
+; RV32ZICOND-NEXT:    neg a3, a2
 ; RV32ZICOND-NEXT:    czero.nez a0, a0, a2
-; RV32ZICOND-NEXT:    czero.eqz a3, a3, a2
-; RV32ZICOND-NEXT:    neg a2, a2
-; RV32ZICOND-NEXT:    or a0, a3, a0
-; RV32ZICOND-NEXT:    or a1, a2, a1
+; RV32ZICOND-NEXT:    or a1, a3, a1
+; RV32ZICOND-NEXT:    xori a0, a0, -2048
 ; RV32ZICOND-NEXT:    ret
 ;
 ; RV64ZICOND-LABEL: select_imm_reg_neg_2048:
 ; RV64ZICOND:       # %bb.0:
 ; RV64ZICOND-NEXT:    andi a1, a1, 1
-; RV64ZICOND-NEXT:    li a2, -2048
+; RV64ZICOND-NEXT:    xori a0, a0, -2048
 ; RV64ZICOND-NEXT:    czero.nez a0, a0, a1
-; RV64ZICOND-NEXT:    czero.eqz a1, a2, a1
-; RV64ZICOND-NEXT:    or a0, a1, a0
+; RV64ZICOND-NEXT:    xori a0, a0, -2048
 ; RV64ZICOND-NEXT:    ret
   %4 = select i1 %cond, i64 -2048, i64 %t
   ret i64 %4

From 85a9f2e14859b472750f13fb441291e6e9c893a0 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Fri, 13 Jun 2025 09:14:48 -0400
Subject: [PATCH 365/851] [PowerPC] enable
 AtomicExpandImpl::expandAtomicCmpXchg for powerpc (#142395)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In PowerPC, the AtomicCmpXchgInst is lowered to
ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS. However, this node does not handle
the weak attribute of AtomicCmpXchgInst. As a result, when compiling C++
atomic_compare_exchange_weak_explicit, the generated assembly includes a
"reservation lost" loop — i.e., it branches back and retries if the
stwcx. (store-conditional) fails. This differs from GCC’s codegen, which
does not include that loop for weak compare-exchange.

Since PowerPC uses LL/SC-style atomic instructions, the patch enables
AtomicExpandImpl::expandAtomicCmpXchg for PowerPC. With this, the weak
attribute is properly respected, and the "reservation lost" loop is
removed for weak operations.

---------

Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   20 +-
 llvm/include/llvm/IR/IntrinsicsPowerPC.td     |   15 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   |   73 +-
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |    6 +
 llvm/lib/Target/PowerPC/PPCInstr64Bit.td      |    2 +
 llvm/lib/Target/PowerPC/PPCInstrInfo.td       |    8 +-
 .../CodeGen/PowerPC/PR35812-neg-cmpxchg.ll    |  142 +-
 llvm/test/CodeGen/PowerPC/all-atomics.ll      | 1672 +++++++++-------
 llvm/test/CodeGen/PowerPC/atomic-2.ll         |    4 +-
 .../PowerPC/atomic-compare-exchange-weak.ll   |   52 +-
 llvm/test/CodeGen/PowerPC/atomic-float.ll     |  108 +-
 .../PowerPC/atomicrmw-cond-sub-clamp.ll       |  526 +++--
 .../PowerPC/atomicrmw-uinc-udec-wrap.ll       |  524 +++--
 .../CodeGen/PowerPC/atomics-regression.ll     | 1740 +++++++++++------
 llvm/test/CodeGen/PowerPC/atomics.ll          |  227 +--
 llvm/test/CodeGen/PowerPC/loop-comment.ll     |    9 +-
 .../AtomicExpand/PowerPC/atomicrmw-fp.ll      |  116 +-
 17 files changed, 3133 insertions(+), 2111 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 04bc0e9353101..4ed81d25e8e22 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -254,20 +254,20 @@ class LLVM_ABI TargetLoweringBase {
   /// support for these atomic instructions, and also have different options
   /// w.r.t. what they should expand to.
   enum class AtomicExpansionKind {
-    None,    // Don't expand the instruction.
-    CastToInteger,    // Cast the atomic instruction to another type, e.g. from
-                      // floating-point to integer type.
+    None,          // Don't expand the instruction.
+    CastToInteger, // Cast the atomic instruction to another type, e.g. from
+                   // floating-point to integer type.
     LLSC,    // Expand the instruction into loadlinked/storeconditional; used
-             // by ARM/AArch64.
+             // by ARM/AArch64/PowerPC.
     LLOnly,  // Expand the (load) instruction into just a load-linked, which has
              // greater atomic guarantees than a normal load.
     CmpXChg, // Expand the instruction into cmpxchg; used by at least X86.
-    MaskedIntrinsic,  // Use a target-specific intrinsic for the LL/SC loop.
-    BitTestIntrinsic, // Use a target-specific intrinsic for special bit
-                      // operations; used by X86.
-    CmpArithIntrinsic,// Use a target-specific intrinsic for special compare
-                      // operations; used by X86.
-    Expand,           // Generic expansion in terms of other atomic operations.
+    MaskedIntrinsic,   // Use a target-specific intrinsic for the LL/SC loop.
+    BitTestIntrinsic,  // Use a target-specific intrinsic for special bit
+                       // operations; used by X86.
+    CmpArithIntrinsic, // Use a target-specific intrinsic for special compare
+                       // operations; used by X86.
+    Expand,            // Generic expansion in terms of other atomic operations.
 
     // Rewrite to a non-atomic form for use in a known non-preemptible
     // environment.
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 751628cee58c0..84c26599b5b70 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1835,6 +1835,19 @@ let TargetPrefix = "ppc" in {
                       Intrinsic<[],[],[]>;
   def int_ppc_iospace_eieio : ClangBuiltin<"__builtin_ppc_iospace_eieio">,
                               Intrinsic<[],[],[]>;
+  def int_ppc_lbarx :
+    Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+              [IntrReadMem, IntrArgMemOnly]>;
+  def int_ppc_lharx :
+    Intrinsic<[llvm_i32_ty],[llvm_ptr_ty],
+              [IntrReadMem, IntrArgMemOnly]>;
+  def int_ppc_lwarx :
+    Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+              [IntrReadMem, IntrArgMemOnly]>;
+  def int_ppc_ldarx :
+    Intrinsic<[llvm_i64_ty],[llvm_ptr_ty],
+              [IntrReadMem, IntrArgMemOnly]>;
+
   def int_ppc_stdcx :
     ClangBuiltin<"__builtin_ppc_stdcx">,
     Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i64_ty],
@@ -1844,7 +1857,7 @@ let TargetPrefix = "ppc" in {
     Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
               [IntrWriteMem, IntrArgMemOnly]>;
   def int_ppc_sthcx :
-    Intrinsic<[llvm_i32_ty], [ llvm_ptr_ty, llvm_i32_ty ],
+    Intrinsic<[llvm_i32_ty], [ llvm_ptr_ty, llvm_i32_ty],
               [IntrWriteMem, IntrArgMemOnly, IntrNoDuplicate]>;
   def int_ppc_stbcx :
     ClangBuiltin<"__builtin_ppc_stbcx">,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 59c89985c6cff..0f8e5e57c58b7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1442,6 +1442,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
 
   setMinFunctionAlignment(Align(4));
+  setMinCmpXchgSizeInBits(Subtarget.hasPartwordAtomics() ? 8 : 32);
 
   auto CPUDirective = Subtarget.getCPUDirective();
   switch (CPUDirective) {
@@ -12690,6 +12691,76 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
   return Builder.CreateIntrinsic(Id, {});
 }
 
+Value *PPCTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
+                                         Value *Addr,
+                                         AtomicOrdering Ord) const {
+  unsigned SZ = ValueTy->getPrimitiveSizeInBits();
+
+  assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
+         "Only 8/16/32/64-bit atomic loads supported");
+  Intrinsic::ID IntID;
+  switch (SZ) {
+  default:
+    llvm_unreachable("Unexpected PrimitiveSize");
+  case 8:
+    IntID = Intrinsic::ppc_lbarx;
+    assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+    break;
+  case 16:
+    IntID = Intrinsic::ppc_lharx;
+    assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+    break;
+  case 32:
+    IntID = Intrinsic::ppc_lwarx;
+    break;
+  case 64:
+    IntID = Intrinsic::ppc_ldarx;
+    break;
+  }
+  Value *Call =
+      Builder.CreateIntrinsic(IntID, Addr, /*FMFSource=*/nullptr, "larx");
+
+  return Builder.CreateTruncOrBitCast(Call, ValueTy);
+}
+
+// Perform a store-conditional operation to Addr. Return the status of the
+// store. This should be 0 if the store succeeded, non-zero otherwise.
+Value *PPCTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
+                                               Value *Val, Value *Addr,
+                                               AtomicOrdering Ord) const {
+  Type *Ty = Val->getType();
+  unsigned SZ = Ty->getPrimitiveSizeInBits();
+
+  assert((SZ == 8 || SZ == 16 || SZ == 32 || SZ == 64) &&
+         "Only 8/16/32/64-bit atomic loads supported");
+  Intrinsic::ID IntID;
+  switch (SZ) {
+  default:
+    llvm_unreachable("Unexpected PrimitiveSize");
+  case 8:
+    IntID = Intrinsic::ppc_stbcx;
+    assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+    break;
+  case 16:
+    IntID = Intrinsic::ppc_sthcx;
+    assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
+    break;
+  case 32:
+    IntID = Intrinsic::ppc_stwcx;
+    break;
+  case 64:
+    IntID = Intrinsic::ppc_stdcx;
+    break;
+  }
+
+  if (SZ == 8 || SZ == 16)
+    Val = Builder.CreateZExt(Val, Builder.getInt32Ty());
+
+  Value *Call = Builder.CreateIntrinsic(IntID, {Addr, Val},
+                                        /*FMFSource=*/nullptr, "stcx");
+  return Builder.CreateXor(Call, Builder.getInt32(1));
+}
+
 // The mappings for emitLeading/TrailingFence is taken from
 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
 Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
@@ -19651,7 +19722,7 @@ PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
   unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
   if (shouldInlineQuadwordAtomics() && Size == 128)
     return AtomicExpansionKind::MaskedIntrinsic;
-  return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI);
+  return AtomicExpansionKind::LLSC;
 }
 
 static Intrinsic::ID
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 2c55b5427297a..4c88bd372b106 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -927,6 +927,12 @@ namespace llvm {
       return true;
     }
 
+    Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr,
+                          AtomicOrdering Ord) const override;
+
+    Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr,
+                                AtomicOrdering Ord) const override;
+
     Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
                                   AtomicOrdering Ord) const override;
     Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 659c1a9079c33..fd2084398c857 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -2023,6 +2023,8 @@ def SLBSYNC : XForm_0<31, 338, (outs), (ins), "slbsync", IIC_SprSLBSYNC, []>;
 
 } // IsISA3_0
 
+def : Pat<(int_ppc_ldarx ForceXForm:$ptr),
+          (LDARX ForceXForm:$ptr)>;
 def : Pat<(int_ppc_stdcx ForceXForm:$dst, g8rc:$A),
           (RLWINM (STDCX g8rc:$A, ForceXForm:$dst), 31, 31, 31)>;
 def : Pat<(PPCStoreCond ForceXForm:$dst, g8rc:$A, 8),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index b70290df07b1c..99ef89a7fdc0c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -5143,7 +5143,6 @@ def : Pat<(int_ppc_store2r gprc:$a, ForceXForm:$ptr),
 def : Pat<(int_ppc_store4r gprc:$a, ForceXForm:$ptr),
           (STWBRX gprc:$a, ForceXForm:$ptr)>;
 
-
 // Fast 32-bit reverse bits algorithm:
 // Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
 // n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xAAAAAAAA);
@@ -5324,10 +5323,14 @@ def CFENCE : PPCPostRAExpPseudo<(outs), (ins gprc:$cr), "#CFENCE", []>;
 def : Pat<(i64 (bitreverse i64:$A)),
   (OR8 (RLDICR DWBytes7654.DWord, 32, 31), DWBytes3210.DWord)>;
 
+def : Pat<(int_ppc_lwarx ForceXForm:$ptr),
+          (LWARX ForceXForm:$ptr)>;
 def : Pat<(int_ppc_stwcx ForceXForm:$dst, gprc:$A),
           (RLWINM (STWCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>;
 def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 4),
           (RLWINM (STWCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>;
+def : Pat<(int_ppc_lbarx ForceXForm:$ptr),
+          (LBARX ForceXForm:$ptr)>;
 def : Pat<(int_ppc_stbcx ForceXForm:$dst, gprc:$A),
           (RLWINM (STBCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>;
 def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 1),
@@ -5360,6 +5363,9 @@ def : Pat<(int_ppc_mtmsr gprc:$RS),
           (MTMSR $RS, 0)>;
 
 let Predicates = [IsISA2_07] in {
+  def : Pat<(int_ppc_lharx ForceXForm:$ptr),
+          (LHARX ForceXForm:$ptr)>;
+
   def : Pat<(int_ppc_sthcx ForceXForm:$dst, gprc:$A),
             (RLWINM (STHCX gprc:$A, ForceXForm:$dst), 31, 31, 31)>;
   def : Pat<(PPCStoreCond ForceXForm:$dst, gprc:$A, 2),
diff --git a/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll b/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
index 8517783e3ebd7..1a8dabc5ad719 100644
--- a/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
+++ b/llvm/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
@@ -15,50 +15,57 @@ define signext i32 @main() nounwind {
 ; CHECK-NEXT:    stdu 1, -48(1)
 ; CHECK-NEXT:    li 3, -32477
 ; CHECK-NEXT:    std 0, 64(1)
-; CHECK-NEXT:    li 4, 234
-; CHECK-NEXT:    addi 6, 1, 46
 ; CHECK-NEXT:    sth 3, 46(1)
-; CHECK-NEXT:    lis 3, 0
+; CHECK-NEXT:    addi 3, 1, 46
+; CHECK-NEXT:    lharx 4, 0, 3
+; CHECK-NEXT:    clrlwi  4, 4, 16
+; CHECK-NEXT:    cmplwi  4, 33059
+; CHECK-NEXT:    bne     0, .LBB0_4
+; CHECK-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    ori 3, 3, 33059
-; CHECK-NEXT:  .LBB0_1: # %L.entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 5, 0, 6
-; CHECK-NEXT:    cmpw 5, 3
-; CHECK-NEXT:    bne 0, .LBB0_3
-; CHECK-NEXT:  # %bb.2: # %L.entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 4, 0, 6
-; CHECK-NEXT:    bne 0, .LBB0_1
-; CHECK-NEXT:  .LBB0_3: # %L.entry
-; CHECK-NEXT:    cmplwi 5, 33059
+; CHECK-NEXT:    li 4, 234
+; CHECK-NEXT:    .p2align        5
+; CHECK-NEXT:  .LBB0_2:                                # %cmpxchg.trystore
+; CHECK-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sthcx. 4, 0, 3
+; CHECK-NEXT:    beq     0, .LBB0_7
+; CHECK-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    lharx 5, 0, 3
+; CHECK-NEXT:    clrlwi  5, 5, 16
+; CHECK-NEXT:    cmplwi  5, 33059
+; CHECK-NEXT:    beq     0, .LBB0_2
+; CHECK-NEXT:  .LBB0_4:                                # %cmpxchg.nostore
 ; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    bne 0, .LBB0_6
-; CHECK-NEXT:  # %bb.4: # %L.B0000
+; CHECK-NEXT:    b .LBB0_8
+; CHECK-NEXT:  .LBB0_5:                                # %L.B0000
 ; CHECK-NEXT:    lhz 3, 46(1)
-; CHECK-NEXT:    cmplwi 3, 234
-; CHECK-NEXT:    bne 0, .LBB0_7
-; CHECK-NEXT:  # %bb.5: # %L.B0001
+; CHECK-NEXT:    cmplwi  3, 234
+; CHECK-NEXT:    bne     0, .LBB0_9
+; CHECK-NEXT:  # %bb.6:                                # %L.B0001
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    bl puts
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    b .LBB0_9
-; CHECK-NEXT:  .LBB0_6: # %L.B0003
+; CHECK-NEXT:    b .LBB0_11
+; CHECK-NEXT:  .LBB0_7:                                # %cmpxchg.success
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    b .LBB0_5
+; CHECK-NEXT:  .LBB0_8:                                # %L.B0003
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    addi 3, 3, 16
-; CHECK-NEXT:    b .LBB0_8
-; CHECK-NEXT:  .LBB0_7: # %L.B0005
+; CHECK-NEXT:    b .LBB0_10
+; CHECK-NEXT:  .LBB0_9:                                # %L.B0005
 ; CHECK-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-NEXT:    addi 3, 3, 64
-; CHECK-NEXT:  .LBB0_8: # %L.B0003
+; CHECK-NEXT:  .LBB0_10:                               # %L.B0003
 ; CHECK-NEXT:    bl puts
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:  .LBB0_9: # %L.B0003
+; CHECK-NEXT:  .LBB0_11:                               # %L.B0003
 ; CHECK-NEXT:    addi 1, 1, 48
 ; CHECK-NEXT:    ld 0, 16(1)
 ; CHECK-NEXT:    mtlr 0
@@ -69,64 +76,69 @@ define signext i32 @main() nounwind {
 ; CHECK-P7-NEXT:    mflr 0
 ; CHECK-P7-NEXT:    stdu 1, -48(1)
 ; CHECK-P7-NEXT:    li 3, -32477
-; CHECK-P7-NEXT:    std 0, 64(1)
 ; CHECK-P7-NEXT:    addi 4, 1, 46
-; CHECK-P7-NEXT:    li 6, 234
+; CHECK-P7-NEXT:    std 0, 64(1)
 ; CHECK-P7-NEXT:    sth 3, 46(1)
-; CHECK-P7-NEXT:    lis 3, 0
+; CHECK-P7-NEXT:    rldicr 3, 4, 0, 61
+; CHECK-P7-NEXT:    rlwinm 4, 4, 3, 27, 27
+; CHECK-P7-NEXT:    lwarx 5, 0, 3
+; CHECK-P7-NEXT:    srw 6, 5, 4
+; CHECK-P7-NEXT:    clrlwi  6, 6, 16
+; CHECK-P7-NEXT:    cmplwi  6, 33059
+; CHECK-P7-NEXT:    bne     0, .LBB0_4
+; CHECK-P7-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
+; CHECK-P7-NEXT:    lis 6, 0
+; CHECK-P7-NEXT:    li 7, 234
 ; CHECK-P7-NEXT:    sync
-; CHECK-P7-NEXT:    ori 5, 3, 33059
-; CHECK-P7-NEXT:    rlwinm 3, 4, 3, 27, 27
-; CHECK-P7-NEXT:    rldicr 4, 4, 0, 61
-; CHECK-P7-NEXT:    slw 7, 5, 3
-; CHECK-P7-NEXT:    li 5, 0
-; CHECK-P7-NEXT:    slw 6, 6, 3
-; CHECK-P7-NEXT:    ori 5, 5, 65535
-; CHECK-P7-NEXT:    slw 5, 5, 3
-; CHECK-P7-NEXT:    and 6, 6, 5
-; CHECK-P7-NEXT:    and 7, 7, 5
-; CHECK-P7-NEXT:  .LBB0_1: # %L.entry
-; CHECK-P7-NEXT:    #
-; CHECK-P7-NEXT:    lwarx 9, 0, 4
-; CHECK-P7-NEXT:    and 8, 9, 5
-; CHECK-P7-NEXT:    cmpw 8, 7
-; CHECK-P7-NEXT:    bne 0, .LBB0_3
-; CHECK-P7-NEXT:  # %bb.2: # %L.entry
-; CHECK-P7-NEXT:    #
-; CHECK-P7-NEXT:    andc 9, 9, 5
-; CHECK-P7-NEXT:    or 9, 9, 6
-; CHECK-P7-NEXT:    stwcx. 9, 0, 4
-; CHECK-P7-NEXT:    bne 0, .LBB0_1
-; CHECK-P7-NEXT:  .LBB0_3: # %L.entry
-; CHECK-P7-NEXT:    srw 3, 8, 3
+; CHECK-P7-NEXT:    ori 6, 6, 65535
+; CHECK-P7-NEXT:    slw 7, 7, 4
+; CHECK-P7-NEXT:    slw 6, 6, 4
+; CHECK-P7-NEXT:    not     6, 6
+; CHECK-P7-NEXT:    .p2align        4
+; CHECK-P7-NEXT:  .LBB0_2:                                # %cmpxchg.trystore
+; CHECK-P7-NEXT:                                        # =>This Inner Loop Header: Depth=1
+; CHECK-P7-NEXT:    and 5, 5, 6
+; CHECK-P7-NEXT:    or 5, 5, 7
+; CHECK-P7-NEXT:    stwcx. 5, 0, 3
+; CHECK-P7-NEXT:    beq     0, .LBB0_7
+; CHECK-P7-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; CHECK-P7-NEXT:                                        #   in Loop: Header=BB0_2 Depth=1
+; CHECK-P7-NEXT:    lwarx 5, 0, 3
+; CHECK-P7-NEXT:    srw 8, 5, 4
+; CHECK-P7-NEXT:    clrlwi  8, 8, 16
+; CHECK-P7-NEXT:    cmplwi  8, 33059
+; CHECK-P7-NEXT:    beq     0, .LBB0_2
+; CHECK-P7-NEXT:  .LBB0_4:                                # %cmpxchg.nostore
 ; CHECK-P7-NEXT:    lwsync
-; CHECK-P7-NEXT:    cmplwi 3, 33059
-; CHECK-P7-NEXT:    bne 0, .LBB0_6
-; CHECK-P7-NEXT:  # %bb.4: # %L.B0000
+; CHECK-P7-NEXT:    b .LBB0_8
+; CHECK-P7-NEXT:  .LBB0_5:                                # %L.B0000
 ; CHECK-P7-NEXT:    lhz 3, 46(1)
-; CHECK-P7-NEXT:    cmplwi 3, 234
-; CHECK-P7-NEXT:    bne 0, .LBB0_7
-; CHECK-P7-NEXT:  # %bb.5: # %L.B0001
+; CHECK-P7-NEXT:    cmplwi  3, 234
+; CHECK-P7-NEXT:    bne     0, .LBB0_9
+; CHECK-P7-NEXT:  # %bb.6:                                # %L.B0001
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    bl puts
 ; CHECK-P7-NEXT:    nop
 ; CHECK-P7-NEXT:    li 3, 0
-; CHECK-P7-NEXT:    b .LBB0_9
-; CHECK-P7-NEXT:  .LBB0_6: # %L.B0003
+; CHECK-P7-NEXT:    b .LBB0_11
+; CHECK-P7-NEXT:  .LBB0_7:                                # %cmpxchg.success
+; CHECK-P7-NEXT:    lwsync
+; CHECK-P7-NEXT:    b .LBB0_5
+; CHECK-P7-NEXT:  .LBB0_8:                                # %L.B0003
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    addi 3, 3, 16
-; CHECK-P7-NEXT:    b .LBB0_8
-; CHECK-P7-NEXT:  .LBB0_7: # %L.B0005
+; CHECK-P7-NEXT:    b .LBB0_10
+; CHECK-P7-NEXT:  .LBB0_9:                                # %L.B0005
 ; CHECK-P7-NEXT:    addis 3, 2, .L_MergedGlobals@toc@ha
 ; CHECK-P7-NEXT:    addi 3, 3, .L_MergedGlobals@toc@l
 ; CHECK-P7-NEXT:    addi 3, 3, 64
-; CHECK-P7-NEXT:  .LBB0_8: # %L.B0003
+; CHECK-P7-NEXT:  .LBB0_10:                               # %L.B0003
 ; CHECK-P7-NEXT:    bl puts
 ; CHECK-P7-NEXT:    nop
 ; CHECK-P7-NEXT:    li 3, 1
-; CHECK-P7-NEXT:  .LBB0_9: # %L.B0003
+; CHECK-P7-NEXT:  .LBB0_11:                               # %L.B0003
 ; CHECK-P7-NEXT:    addi 1, 1, 48
 ; CHECK-P7-NEXT:    ld 0, 16(1)
 ; CHECK-P7-NEXT:    mtlr 0
diff --git a/llvm/test/CodeGen/PowerPC/all-atomics.ll b/llvm/test/CodeGen/PowerPC/all-atomics.ll
index 531e559ea7309..67cee358882ff 100644
--- a/llvm/test/CodeGen/PowerPC/all-atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/all-atomics.ll
@@ -4336,704 +4336,959 @@ entry:
 define dso_local void @test_compare_and_swap() local_unnamed_addr #0 {
 ; CHECK-LABEL: test_compare_and_swap:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addis 3, 2, uc@toc@ha
-; CHECK-NEXT:    addis 4, 2, sc@toc@ha
-; CHECK-NEXT:    std 27, -40(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 28, -32(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 29, -24(1) # 8-byte Folded Spill
-; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; CHECK-NEXT:    lbz 5, uc@toc@l(3)
-; CHECK-NEXT:    lbz 8, sc@toc@l(4)
-; CHECK-NEXT:    addi 6, 3, uc@toc@l
-; CHECK-NEXT:    addi 0, 4, sc@toc@l
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_1: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lbarx 7, 0, 0
-; CHECK-NEXT:    cmpw 7, 5
-; CHECK-NEXT:    bne 0, .LBB3_3
-; CHECK-NEXT:  # %bb.2: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stbcx. 8, 0, 0
-; CHECK-NEXT:    bne 0, .LBB3_1
-; CHECK-NEXT:  .LBB3_3: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    stb 7, sc@toc@l(4)
-; CHECK-NEXT:    lbz 8, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_4: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lbarx 5, 0, 6
-; CHECK-NEXT:    cmpw 5, 8
-; CHECK-NEXT:    bne 0, .LBB3_6
-; CHECK-NEXT:  # %bb.5: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stbcx. 7, 0, 6
-; CHECK-NEXT:    bne 0, .LBB3_4
-; CHECK-NEXT:  .LBB3_6: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    stb 5, uc@toc@l(3)
-; CHECK-NEXT:    lbz 7, sc@toc@l(4)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 8, 7
-; CHECK-NEXT:    addis 7, 2, ss@toc@ha
-; CHECK-NEXT:    addi 12, 7, ss@toc@l
-; CHECK-NEXT:  .LBB3_7: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 9, 0, 12
-; CHECK-NEXT:    cmpw 9, 5
-; CHECK-NEXT:    bne 0, .LBB3_9
-; CHECK-NEXT:  # %bb.8: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 8, 0, 12
-; CHECK-NEXT:    bne 0, .LBB3_7
-; CHECK-NEXT:  .LBB3_9: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    sth 9, ss@toc@l(7)
-; CHECK-NEXT:    lbz 7, sc@toc@l(4)
-; CHECK-NEXT:    lbz 5, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 8, 7
-; CHECK-NEXT:    addis 7, 2, us@toc@ha
-; CHECK-NEXT:    addi 11, 7, us@toc@l
-; CHECK-NEXT:  .LBB3_10: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 9, 0, 11
-; CHECK-NEXT:    cmpw 9, 5
-; CHECK-NEXT:    bne 0, .LBB3_12
-; CHECK-NEXT:  # %bb.11: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 8, 0, 11
-; CHECK-NEXT:    bne 0, .LBB3_10
-; CHECK-NEXT:  .LBB3_12: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    sth 9, us@toc@l(7)
-; CHECK-NEXT:    lbz 7, sc@toc@l(4)
-; CHECK-NEXT:    lbz 5, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 8, 7
-; CHECK-NEXT:    addis 7, 2, si@toc@ha
-; CHECK-NEXT:    addi 10, 7, si@toc@l
-; CHECK-NEXT:  .LBB3_13: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lwarx 9, 0, 10
-; CHECK-NEXT:    cmpw 9, 5
-; CHECK-NEXT:    bne 0, .LBB3_15
-; CHECK-NEXT:  # %bb.14: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. 8, 0, 10
-; CHECK-NEXT:    bne 0, .LBB3_13
-; CHECK-NEXT:  .LBB3_15: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    stw 9, si@toc@l(7)
-; CHECK-NEXT:    lbz 5, sc@toc@l(4)
-; CHECK-NEXT:    lbz 7, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 8, 5
-; CHECK-NEXT:    addis 5, 2, ui@toc@ha
-; CHECK-NEXT:    addi 9, 5, ui@toc@l
-; CHECK-NEXT:  .LBB3_16: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lwarx 30, 0, 9
-; CHECK-NEXT:    cmpw 30, 7
-; CHECK-NEXT:    bne 0, .LBB3_18
-; CHECK-NEXT:  # %bb.17: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. 8, 0, 9
-; CHECK-NEXT:    bne 0, .LBB3_16
-; CHECK-NEXT:  .LBB3_18: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    stw 30, ui@toc@l(5)
-; CHECK-NEXT:    addis 30, 2, sll@toc@ha
-; CHECK-NEXT:    lbz 8, sc@toc@l(4)
-; CHECK-NEXT:    lbz 7, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 29, 8
-; CHECK-NEXT:    addi 8, 30, sll@toc@l
-; CHECK-NEXT:  .LBB3_19: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 28, 0, 8
-; CHECK-NEXT:    cmpd 28, 7
-; CHECK-NEXT:    bne 0, .LBB3_21
-; CHECK-NEXT:  # %bb.20: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 29, 0, 8
-; CHECK-NEXT:    bne 0, .LBB3_19
-; CHECK-NEXT:  .LBB3_21: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    addis 29, 2, ull@toc@ha
-; CHECK-NEXT:    std 28, sll@toc@l(30)
-; CHECK-NEXT:    lbz 7, sc@toc@l(4)
-; CHECK-NEXT:    lbz 30, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:    extsb 28, 7
-; CHECK-NEXT:    addi 7, 29, ull@toc@l
-; CHECK-NEXT:  .LBB3_22: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 27, 0, 7
-; CHECK-NEXT:    cmpd 27, 30
-; CHECK-NEXT:    bne 0, .LBB3_24
-; CHECK-NEXT:  # %bb.23: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 28, 0, 7
-; CHECK-NEXT:    bne 0, .LBB3_22
-; CHECK-NEXT:  .LBB3_24: # %entry
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    std 27, ull@toc@l(29)
-; CHECK-NEXT:    lbz 30, uc@toc@l(3)
-; CHECK-NEXT:    lbz 29, sc@toc@l(4)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_25: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lbarx 28, 0, 0
-; CHECK-NEXT:    cmpw 28, 30
-; CHECK-NEXT:    bne 0, .LBB3_27
-; CHECK-NEXT:  # %bb.26: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stbcx. 29, 0, 0
-; CHECK-NEXT:    bne 0, .LBB3_25
-; CHECK-NEXT:  .LBB3_27: # %entry
-; CHECK-NEXT:    xor 0, 28, 30
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 30, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 0, 0
-; CHECK-NEXT:    srwi 0, 0, 5
-; CHECK-NEXT:    stw 0, ui@toc@l(5)
-; CHECK-NEXT:    lbz 0, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_28: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lbarx 29, 0, 6
-; CHECK-NEXT:    cmpw 29, 0
-; CHECK-NEXT:    bne 0, .LBB3_30
-; CHECK-NEXT:  # %bb.29: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stbcx. 30, 0, 6
-; CHECK-NEXT:    bne 0, .LBB3_28
-; CHECK-NEXT:  .LBB3_30: # %entry
-; CHECK-NEXT:    xor 6, 29, 0
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 0, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 0, 0
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_31: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 30, 0, 12
-; CHECK-NEXT:    cmpw 30, 6
-; CHECK-NEXT:    bne 0, .LBB3_33
-; CHECK-NEXT:  # %bb.32: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 0, 0, 12
-; CHECK-NEXT:    bne 0, .LBB3_31
-; CHECK-NEXT:  .LBB3_33: # %entry
-; CHECK-NEXT:    xor 6, 30, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 12, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 12, 12
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_34: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lharx 0, 0, 11
-; CHECK-NEXT:    cmpw 0, 6
-; CHECK-NEXT:    bne 0, .LBB3_36
-; CHECK-NEXT:  # %bb.35: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    sthcx. 12, 0, 11
-; CHECK-NEXT:    bne 0, .LBB3_34
-; CHECK-NEXT:  .LBB3_36: # %entry
-; CHECK-NEXT:    xor 6, 0, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 11, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 11, 11
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_37: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lwarx 12, 0, 10
-; CHECK-NEXT:    cmpw 12, 6
-; CHECK-NEXT:    bne 0, .LBB3_39
-; CHECK-NEXT:  # %bb.38: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. 11, 0, 10
-; CHECK-NEXT:    bne 0, .LBB3_37
-; CHECK-NEXT:  .LBB3_39: # %entry
-; CHECK-NEXT:    xor 6, 12, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 10, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 10, 10
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_40: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    lwarx 11, 0, 9
-; CHECK-NEXT:    cmpw 11, 6
-; CHECK-NEXT:    bne 0, .LBB3_42
-; CHECK-NEXT:  # %bb.41: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. 10, 0, 9
-; CHECK-NEXT:    bne 0, .LBB3_40
-; CHECK-NEXT:  .LBB3_42: # %entry
-; CHECK-NEXT:    xor 6, 11, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 9, sc@toc@l(4)
-; CHECK-NEXT:    cntlzw 6, 6
-; CHECK-NEXT:    extsb 9, 9
-; CHECK-NEXT:    srwi 6, 6, 5
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    lbz 6, uc@toc@l(3)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_43: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 10, 0, 8
-; CHECK-NEXT:    cmpd 10, 6
-; CHECK-NEXT:    bne 0, .LBB3_45
-; CHECK-NEXT:  # %bb.44: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 9, 0, 8
-; CHECK-NEXT:    bne 0, .LBB3_43
-; CHECK-NEXT:  .LBB3_45: # %entry
-; CHECK-NEXT:    xor 6, 10, 6
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    lbz 4, sc@toc@l(4)
-; CHECK-NEXT:    lbz 3, uc@toc@l(3)
-; CHECK-NEXT:    cntlzd 6, 6
-; CHECK-NEXT:    extsb 4, 4
-; CHECK-NEXT:    rldicl 6, 6, 58, 63
-; CHECK-NEXT:    stw 6, ui@toc@l(5)
-; CHECK-NEXT:    sync
-; CHECK-NEXT:  .LBB3_46: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 6, 0, 7
-; CHECK-NEXT:    cmpd 6, 3
-; CHECK-NEXT:    bne 0, .LBB3_48
-; CHECK-NEXT:  # %bb.47: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 4, 0, 7
-; CHECK-NEXT:    bne 0, .LBB3_46
-; CHECK-NEXT:  .LBB3_48: # %entry
-; CHECK-NEXT:    xor 3, 6, 3
-; CHECK-NEXT:    lwsync
-; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
-; CHECK-NEXT:    cntlzd 3, 3
-; CHECK-NEXT:    ld 28, -32(1) # 8-byte Folded Reload
-; CHECK-NEXT:    ld 27, -40(1) # 8-byte Folded Reload
-; CHECK-NEXT:    rldicl 3, 3, 58, 63
-; CHECK-NEXT:    stw 3, ui@toc@l(5)
-; CHECK-NEXT:    blr
+; CHECK-NEXT:   addis 4, 2, sc@toc@ha
+; CHECK-NEXT:   addis 3, 2, uc@toc@ha
+; CHECK-NEXT:   std 27, -40(1)                          # 8-byte Folded Spill
+; CHECK-NEXT:   std 28, -32(1)                          # 8-byte Folded Spill
+; CHECK-NEXT:   std 29, -24(1)                          # 8-byte Folded Spill
+; CHECK-NEXT:   std 30, -16(1)                          # 8-byte Folded Spill
+; CHECK-NEXT:   addi 6, 4, sc@toc@l
+; CHECK-NEXT:   lbz 7, uc@toc@l(3)
+; CHECK-NEXT:   lbz 8, sc@toc@l(4)
+; CHECK-NEXT:   lbarx 5, 0, 6
+; CHECK-NEXT:   clrlwi  9, 5, 24
+; CHECK-NEXT:   cmplw   9, 7
+; CHECK-NEXT:   bne     0, .LBB3_4
+; CHECK-NEXT: # %bb.1:                                # %cmpxchg.fencedstore276
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_2:                                # %cmpxchg.trystore275
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stbcx. 8, 0, 6
+; CHECK-NEXT:   beq     0, .LBB3_4
+; CHECK-NEXT: # %bb.3:                                # %cmpxchg.releasedload274
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:   lbarx 5, 0, 6
+; CHECK-NEXT:   clrlwi  9, 5, 24
+; CHECK-NEXT:   cmplw   9, 7
+; CHECK-NEXT:   beq     0, .LBB3_2
+; CHECK-NEXT: .LBB3_4:                                # %cmpxchg.nostore272
+; CHECK-NEXT:   addi 7, 3, uc@toc@l
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   stb 5, sc@toc@l(4)
+; CHECK-NEXT:   lbz 9, uc@toc@l(3)
+; CHECK-NEXT:   lbarx 8, 0, 7
+; CHECK-NEXT:   clrlwi  10, 8, 24
+; CHECK-NEXT:   cmplw   10, 9
+; CHECK-NEXT:   bne     0, .LBB3_8
+; CHECK-NEXT: # %bb.5:                                # %cmpxchg.fencedstore257
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  5, 5, 24
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_6:                                # %cmpxchg.trystore256
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stbcx. 5, 0, 7
+; CHECK-NEXT:   beq     0, .LBB3_8
+; CHECK-NEXT: # %bb.7:                                # %cmpxchg.releasedload255
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_6 Depth=1
+; CHECK-NEXT:   lbarx 8, 0, 7
+; CHECK-NEXT:   clrlwi  10, 8, 24
+; CHECK-NEXT:   cmplw   10, 9
+; CHECK-NEXT:   beq     0, .LBB3_6
+; CHECK-NEXT: .LBB3_8:                                # %cmpxchg.nostore253
+; CHECK-NEXT:   addis 5, 2, ss@toc@ha
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   stb 8, uc@toc@l(3)
+; CHECK-NEXT:   clrlwi  10, 8, 24
+; CHECK-NEXT:   lbz 11, sc@toc@l(4)
+; CHECK-NEXT:   addi 8, 5, ss@toc@l
+; CHECK-NEXT:   lharx 9, 0, 8
+; CHECK-NEXT:   clrlwi  12, 9, 16
+; CHECK-NEXT:   cmplw   12, 10
+; CHECK-NEXT:   bne     0, .LBB3_12
+; CHECK-NEXT: # %bb.9:                                # %cmpxchg.fencedstore238
+; CHECK-NEXT:   extsb 11, 11
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  11, 11, 16
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_10:                               # %cmpxchg.trystore237
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   sthcx. 11, 0, 8
+; CHECK-NEXT:   beq     0, .LBB3_12
+; CHECK-NEXT: # %bb.11:                               # %cmpxchg.releasedload236
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_10 Depth=1
+; CHECK-NEXT:   lharx 9, 0, 8
+; CHECK-NEXT:   clrlwi  12, 9, 16
+; CHECK-NEXT:   cmplw   12, 10
+; CHECK-NEXT:   beq     0, .LBB3_10
+; CHECK-NEXT: .LBB3_12:                               # %cmpxchg.nostore234
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   sth 9, ss@toc@l(5)
+; CHECK-NEXT:   addis 5, 2, us@toc@ha
+; CHECK-NEXT:   lbz 11, uc@toc@l(3)
+; CHECK-NEXT:   lbz 12, sc@toc@l(4)
+; CHECK-NEXT:   addi 9, 5, us@toc@l
+; CHECK-NEXT:   lharx 10, 0, 9
+; CHECK-NEXT:   clrlwi  0, 10, 16
+; CHECK-NEXT:   cmplw   0, 11
+; CHECK-NEXT:   bne     0, .LBB3_16
+; CHECK-NEXT: # %bb.13:                               # %cmpxchg.fencedstore219
+; CHECK-NEXT:   extsb 12, 12
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  12, 12, 16
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_14:                               # %cmpxchg.trystore218
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   sthcx. 12, 0, 9
+; CHECK-NEXT:   beq     0, .LBB3_16
+; CHECK-NEXT: # %bb.15:                               # %cmpxchg.releasedload217
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_14 Depth=1
+; CHECK-NEXT:   lharx 10, 0, 9
+; CHECK-NEXT:   clrlwi  0, 10, 16
+; CHECK-NEXT:   cmplw   0, 11
+; CHECK-NEXT:   beq     0, .LBB3_14
+; CHECK-NEXT: .LBB3_16:                               # %cmpxchg.nostore215
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   sth 10, us@toc@l(5)
+; CHECK-NEXT:   addis 5, 2, si@toc@ha
+; CHECK-NEXT:   lbz 12, uc@toc@l(3)
+; CHECK-NEXT:   lbz 0, sc@toc@l(4)
+; CHECK-NEXT:   addi 10, 5, si@toc@l
+; CHECK-NEXT:   lwarx 11, 0, 10
+; CHECK-NEXT:   cmplw   11, 12
+; CHECK-NEXT:   bne     0, .LBB3_20
+; CHECK-NEXT: # %bb.17:                               # %cmpxchg.fencedstore200
+; CHECK-NEXT:   extsb 0, 0
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_18:                               # %cmpxchg.trystore199
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stwcx. 0, 0, 10
+; CHECK-NEXT:   beq     0, .LBB3_20
+; CHECK-NEXT: # %bb.19:                               # %cmpxchg.releasedload198
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_18 Depth=1
+; CHECK-NEXT:   lwarx 11, 0, 10
+; CHECK-NEXT:   cmplw   11, 12
+; CHECK-NEXT:   beq     0, .LBB3_18
+; CHECK-NEXT: .LBB3_20:                               # %cmpxchg.nostore196
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   stw 11, si@toc@l(5)
+; CHECK-NEXT:   addis 5, 2, ui@toc@ha
+; CHECK-NEXT:   lbz 0, uc@toc@l(3)
+; CHECK-NEXT:   lbz 30, sc@toc@l(4)
+; CHECK-NEXT:   addi 11, 5, ui@toc@l
+; CHECK-NEXT:   lwarx 12, 0, 11
+; CHECK-NEXT:   cmplw   12, 0
+; CHECK-NEXT:   bne     0, .LBB3_24
+; CHECK-NEXT: # %bb.21:                               # %cmpxchg.fencedstore181
+; CHECK-NEXT:   extsb 30, 30
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_22:                               # %cmpxchg.trystore180
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stwcx. 30, 0, 11
+; CHECK-NEXT:   beq     0, .LBB3_24
+; CHECK-NEXT: # %bb.23:                               # %cmpxchg.releasedload179
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_22 Depth=1
+; CHECK-NEXT:   lwarx 12, 0, 11
+; CHECK-NEXT:   cmplw   12, 0
+; CHECK-NEXT:   beq     0, .LBB3_22
+; CHECK-NEXT: .LBB3_24:                               # %cmpxchg.nostore177
+; CHECK-NEXT:   addis 30, 2, sll@toc@ha
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   stw 12, ui@toc@l(5)
+; CHECK-NEXT:   lbz 29, uc@toc@l(3)
+; CHECK-NEXT:   lbz 28, sc@toc@l(4)
+; CHECK-NEXT:   addi 12, 30, sll@toc@l
+; CHECK-NEXT:   ldarx 0, 0, 12
+; CHECK-NEXT:   cmpld   0, 29
+; CHECK-NEXT:   bne     0, .LBB3_28
+; CHECK-NEXT: # %bb.25:                               # %cmpxchg.fencedstore162
+; CHECK-NEXT:   extsb 28, 28
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_26:                               # %cmpxchg.trystore161
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stdcx. 28, 0, 12
+; CHECK-NEXT:   beq     0, .LBB3_28
+; CHECK-NEXT: # %bb.27:                               # %cmpxchg.releasedload160
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_26 Depth=1
+; CHECK-NEXT:   ldarx 0, 0, 12
+; CHECK-NEXT:   cmpld   0, 29
+; CHECK-NEXT:   beq     0, .LBB3_26
+; CHECK-NEXT: .LBB3_28:                               # %cmpxchg.nostore158
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   std 0, sll@toc@l(30)
+; CHECK-NEXT:   addis 30, 2, ull@toc@ha
+; CHECK-NEXT:   lbz 28, uc@toc@l(3)
+; CHECK-NEXT:   lbz 27, sc@toc@l(4)
+; CHECK-NEXT:   addi 0, 30, ull@toc@l
+; CHECK-NEXT:   ldarx 29, 0, 0
+; CHECK-NEXT:   cmpld   29, 28
+; CHECK-NEXT:   bne     0, .LBB3_32
+; CHECK-NEXT: # %bb.29:                               # %cmpxchg.fencedstore143
+; CHECK-NEXT:   extsb 27, 27
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_30:                               # %cmpxchg.trystore142
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stdcx. 27, 0, 0
+; CHECK-NEXT:   beq     0, .LBB3_32
+; CHECK-NEXT: # %bb.31:                               # %cmpxchg.releasedload141
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_30 Depth=1
+; CHECK-NEXT:   ldarx 29, 0, 0
+; CHECK-NEXT:   cmpld   29, 28
+; CHECK-NEXT:   beq     0, .LBB3_30
+; CHECK-NEXT: .LBB3_32:                               # %cmpxchg.nostore139
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   std 29, ull@toc@l(30)
+; CHECK-NEXT:   lbz 30, uc@toc@l(3)
+; CHECK-NEXT:   lbz 29, sc@toc@l(4)
+; CHECK-NEXT:   lbarx 28, 0, 6
+; CHECK-NEXT:   clrlwi  28, 28, 24
+; CHECK-NEXT:   cmplw   28, 30
+; CHECK-NEXT:   bne     0, .LBB3_36
+; CHECK-NEXT: # %bb.33:                               # %cmpxchg.fencedstore124
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_34:                               # %cmpxchg.trystore123
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stbcx. 29, 0, 6
+; CHECK-NEXT:   beq     0, .LBB3_37
+; CHECK-NEXT: # %bb.35:                               # %cmpxchg.releasedload122
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_34 Depth=1
+; CHECK-NEXT:   lbarx 28, 0, 6
+; CHECK-NEXT:   clrlwi  28, 28, 24
+; CHECK-NEXT:   cmplw   28, 30
+; CHECK-NEXT:   beq     0, .LBB3_34
+; CHECK-NEXT: .LBB3_36:                               # %cmpxchg.nostore120
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_38
+; CHECK-NEXT: .LBB3_37:                               # %cmpxchg.success121
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_38:                               # %cmpxchg.end118
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 30, 1
+; CHECK-NEXT:   isel 6, 30, 6, 20
+; CHECK-NEXT:   lbz 30, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lbarx 29, 0, 7
+; CHECK-NEXT:   clrlwi  29, 29, 24
+; CHECK-NEXT:   cmplw   29, 6
+; CHECK-NEXT:   bne     0, .LBB3_42
+; CHECK-NEXT: # %bb.39:                               # %cmpxchg.fencedstore105
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_40:                               # %cmpxchg.trystore104
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stbcx. 30, 0, 7
+; CHECK-NEXT:   beq     0, .LBB3_43
+; CHECK-NEXT: # %bb.41:                               # %cmpxchg.releasedload103
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_40 Depth=1
+; CHECK-NEXT:   lbarx 29, 0, 7
+; CHECK-NEXT:   clrlwi  29, 29, 24
+; CHECK-NEXT:   cmplw   29, 6
+; CHECK-NEXT:   beq     0, .LBB3_40
+; CHECK-NEXT: .LBB3_42:                               # %cmpxchg.nostore101
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_44
+; CHECK-NEXT: .LBB3_43:                               # %cmpxchg.success102
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_44:                               # %cmpxchg.end99
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lharx 30, 0, 8
+; CHECK-NEXT:   clrlwi  30, 30, 16
+; CHECK-NEXT:   cmplw   30, 6
+; CHECK-NEXT:   bne     0, .LBB3_48
+; CHECK-NEXT: # %bb.45:                               # %cmpxchg.fencedstore86
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  7, 7, 16
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_46:                               # %cmpxchg.trystore85
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   sthcx. 7, 0, 8
+; CHECK-NEXT:   beq     0, .LBB3_49
+; CHECK-NEXT: # %bb.47:                               # %cmpxchg.releasedload84
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_46 Depth=1
+; CHECK-NEXT:   lharx 30, 0, 8
+; CHECK-NEXT:   clrlwi  30, 30, 16
+; CHECK-NEXT:   cmplw   30, 6
+; CHECK-NEXT:   beq     0, .LBB3_46
+; CHECK-NEXT: .LBB3_48:                               # %cmpxchg.nostore82
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_50
+; CHECK-NEXT: .LBB3_49:                               # %cmpxchg.success83
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_50:                               # %cmpxchg.end80
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lharx 8, 0, 9
+; CHECK-NEXT:   clrlwi  8, 8, 16
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   bne     0, .LBB3_54
+; CHECK-NEXT: # %bb.51:                               # %cmpxchg.fencedstore67
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   clrlwi  7, 7, 16
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_52:                               # %cmpxchg.trystore66
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   sthcx. 7, 0, 9
+; CHECK-NEXT:   beq     0, .LBB3_55
+; CHECK-NEXT: # %bb.53:                               # %cmpxchg.releasedload65
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_52 Depth=1
+; CHECK-NEXT:   lharx 8, 0, 9
+; CHECK-NEXT:   clrlwi  8, 8, 16
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   beq     0, .LBB3_52
+; CHECK-NEXT: .LBB3_54:                               # %cmpxchg.nostore63
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_56
+; CHECK-NEXT: .LBB3_55:                               # %cmpxchg.success64
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_56:                               # %cmpxchg.end61
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lwarx 8, 0, 10
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   bne     0, .LBB3_60
+; CHECK-NEXT: # %bb.57:                               # %cmpxchg.fencedstore48
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_58:                               # %cmpxchg.trystore47
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stwcx. 7, 0, 10
+; CHECK-NEXT:   beq     0, .LBB3_61
+; CHECK-NEXT: # %bb.59:                               # %cmpxchg.releasedload46
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_58 Depth=1
+; CHECK-NEXT:   lwarx 8, 0, 10
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   beq     0, .LBB3_58
+; CHECK-NEXT: .LBB3_60:                               # %cmpxchg.nostore44
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_62
+; CHECK-NEXT: .LBB3_61:                               # %cmpxchg.success45
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_62:                               # %cmpxchg.end42
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   lwarx 8, 0, 11
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   bne     0, .LBB3_66
+; CHECK-NEXT: # %bb.63:                               # %cmpxchg.fencedstore29
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_64:                               # %cmpxchg.trystore28
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stwcx. 7, 0, 11
+; CHECK-NEXT:   beq     0, .LBB3_67
+; CHECK-NEXT: # %bb.65:                               # %cmpxchg.releasedload27
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_64 Depth=1
+; CHECK-NEXT:   lwarx 8, 0, 11
+; CHECK-NEXT:   cmplw   8, 6
+; CHECK-NEXT:   beq     0, .LBB3_64
+; CHECK-NEXT: .LBB3_66:                               # %cmpxchg.nostore25
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_68
+; CHECK-NEXT: .LBB3_67:                               # %cmpxchg.success26
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_68:                               # %cmpxchg.end23
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   lbz 7, sc@toc@l(4)
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   lbz 6, uc@toc@l(3)
+; CHECK-NEXT:   ldarx 8, 0, 12
+; CHECK-NEXT:   cmpld   8, 6
+; CHECK-NEXT:   bne     0, .LBB3_72
+; CHECK-NEXT: # %bb.69:                               # %cmpxchg.fencedstore10
+; CHECK-NEXT:   extsb 7, 7
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_70:                               # %cmpxchg.trystore9
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stdcx. 7, 0, 12
+; CHECK-NEXT:   beq     0, .LBB3_73
+; CHECK-NEXT: # %bb.71:                               # %cmpxchg.releasedload8
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_70 Depth=1
+; CHECK-NEXT:   ldarx 8, 0, 12
+; CHECK-NEXT:   cmpld   8, 6
+; CHECK-NEXT:   beq     0, .LBB3_70
+; CHECK-NEXT: .LBB3_72:                               # %cmpxchg.nostore6
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_74
+; CHECK-NEXT: .LBB3_73:                               # %cmpxchg.success7
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_74:                               # %cmpxchg.end4
+; CHECK-NEXT:   li 6, 0
+; CHECK-NEXT:   li 7, 1
+; CHECK-NEXT:   lbz 3, uc@toc@l(3)
+; CHECK-NEXT:   lbz 4, sc@toc@l(4)
+; CHECK-NEXT:   isel 6, 7, 6, 20
+; CHECK-NEXT:   stw 6, ui@toc@l(5)
+; CHECK-NEXT:   ldarx 6, 0, 0
+; CHECK-NEXT:   cmpld   6, 3
+; CHECK-NEXT:   bne     0, .LBB3_78
+; CHECK-NEXT: # %bb.75:                               # %cmpxchg.fencedstore
+; CHECK-NEXT:   extsb 4, 4
+; CHECK-NEXT:   sync
+; CHECK-NEXT:   .p2align        5
+; CHECK-NEXT: .LBB3_76:                               # %cmpxchg.trystore
+; CHECK-NEXT:                                         # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:   stdcx. 4, 0, 0
+; CHECK-NEXT:   beq     0, .LBB3_79
+; CHECK-NEXT: # %bb.77:                               # %cmpxchg.releasedload
+; CHECK-NEXT:                                         #   in Loop: Header=BB3_76 Depth=1
+; CHECK-NEXT:   ldarx 6, 0, 0
+; CHECK-NEXT:   cmpld   6, 3
+; CHECK-NEXT:   beq     0, .LBB3_76
+; CHECK-NEXT: .LBB3_78:                               # %cmpxchg.nostore
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT:   b .LBB3_80
+; CHECK-NEXT: .LBB3_79:                               # %cmpxchg.success
+; CHECK-NEXT:   lwsync
+; CHECK-NEXT:   creqv 20, 20, 20
+; CHECK-NEXT: .LBB3_80:                               # %cmpxchg.end
+; CHECK-NEXT:   li 3, 0
+; CHECK-NEXT:   li 4, 1
+; CHECK-NEXT:   ld 30, -16(1)                           # 8-byte Folded Reload
+; CHECK-NEXT:   ld 29, -24(1)                           # 8-byte Folded Reload
+; CHECK-NEXT:   ld 28, -32(1)                           # 8-byte Folded Reload
+; CHECK-NEXT:   ld 27, -40(1)                           # 8-byte Folded Reload
+; CHECK-NEXT:   isel 3, 4, 3, 20
+; CHECK-NEXT:   stw 3, ui@toc@l(5)
+; CHECK-NEXT:   blr
 ;
 ; AIX32-LABEL: test_compare_and_swap:
 ; AIX32:       # %bb.0: # %entry
 ; AIX32-NEXT:    mflr 0
-; AIX32-NEXT:    stwu 1, -128(1)
-; AIX32-NEXT:    stw 0, 136(1)
-; AIX32-NEXT:    stw 28, 112(1) # 4-byte Folded Spill
-; AIX32-NEXT:    lwz 28, L..C0(2) # @sc
-; AIX32-NEXT:    stw 29, 116(1) # 4-byte Folded Spill
-; AIX32-NEXT:    lwz 29, L..C1(2) # @uc
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    rlwinm 5, 28, 3, 27, 28
-; AIX32-NEXT:    stw 21, 84(1) # 4-byte Folded Spill
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    stw 17, 68(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 18, 72(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 19, 76(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 20, 80(1) # 4-byte Folded Spill
-; AIX32-NEXT:    xori 21, 5, 24
-; AIX32-NEXT:    stw 22, 88(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 23, 92(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 24, 96(1) # 4-byte Folded Spill
-; AIX32-NEXT:    slw 5, 3, 21
-; AIX32-NEXT:    li 3, 255
-; AIX32-NEXT:    slw 4, 4, 21
-; AIX32-NEXT:    stw 25, 100(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 26, 104(1) # 4-byte Folded Spill
-; AIX32-NEXT:    slw 3, 3, 21
-; AIX32-NEXT:    stw 27, 108(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 30, 120(1) # 4-byte Folded Spill
-; AIX32-NEXT:    stw 31, 124(1) # 4-byte Folded Spill
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    rlwinm 18, 28, 0, 0, 29
-; AIX32-NEXT:    and 4, 4, 3
-; AIX32-NEXT:    and 5, 5, 3
-; AIX32-NEXT:  L..BB3_1: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 7, 0, 18
-; AIX32-NEXT:    and 6, 7, 3
-; AIX32-NEXT:    cmpw 6, 5
-; AIX32-NEXT:    bne 0, L..BB3_3
-; AIX32-NEXT:  # %bb.2: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 7, 7, 3
-; AIX32-NEXT:    or 7, 7, 4
-; AIX32-NEXT:    stwcx. 7, 0, 18
-; AIX32-NEXT:    bne 0, L..BB3_1
-; AIX32-NEXT:  L..BB3_3: # %entry
-; AIX32-NEXT:    rlwinm 5, 29, 3, 27, 28
-; AIX32-NEXT:    srw 3, 6, 21
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    rlwinm 20, 29, 0, 0, 29
-; AIX32-NEXT:    xori 25, 5, 24
-; AIX32-NEXT:    slw 5, 3, 25
-; AIX32-NEXT:    stb 3, 0(28)
-; AIX32-NEXT:    li 3, 255
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 6, 4, 25
-; AIX32-NEXT:    slw 3, 3, 25
-; AIX32-NEXT:    and 4, 5, 3
-; AIX32-NEXT:    and 5, 6, 3
-; AIX32-NEXT:  L..BB3_4: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 7, 0, 20
-; AIX32-NEXT:    and 6, 7, 3
-; AIX32-NEXT:    cmpw 6, 5
-; AIX32-NEXT:    bne 0, L..BB3_6
-; AIX32-NEXT:  # %bb.5: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 7, 7, 3
-; AIX32-NEXT:    or 7, 7, 4
-; AIX32-NEXT:    stwcx. 7, 0, 20
-; AIX32-NEXT:    bne 0, L..BB3_4
-; AIX32-NEXT:  L..BB3_6: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    srw 4, 6, 25
-; AIX32-NEXT:    lbz 3, 0(28)
-; AIX32-NEXT:    extsb 5, 3
-; AIX32-NEXT:    lwz 3, L..C2(2) # @ss
-; AIX32-NEXT:    stb 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    rlwinm 6, 3, 3, 27, 27
-; AIX32-NEXT:    rlwinm 22, 3, 0, 0, 29
-; AIX32-NEXT:    xori 26, 6, 16
-; AIX32-NEXT:    slw 6, 4, 26
-; AIX32-NEXT:    li 4, 0
-; AIX32-NEXT:    slw 5, 5, 26
-; AIX32-NEXT:    ori 4, 4, 65535
-; AIX32-NEXT:    slw 4, 4, 26
-; AIX32-NEXT:    and 5, 5, 4
-; AIX32-NEXT:    and 6, 6, 4
-; AIX32-NEXT:  L..BB3_7: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 8, 0, 22
-; AIX32-NEXT:    and 7, 8, 4
-; AIX32-NEXT:    cmpw 7, 6
-; AIX32-NEXT:    bne 0, L..BB3_9
-; AIX32-NEXT:  # %bb.8: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 8, 8, 4
-; AIX32-NEXT:    or 8, 8, 5
-; AIX32-NEXT:    stwcx. 8, 0, 22
-; AIX32-NEXT:    bne 0, L..BB3_7
-; AIX32-NEXT:  L..BB3_9: # %entry
-; AIX32-NEXT:    srw 4, 7, 26
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    sth 4, 0(3)
-; AIX32-NEXT:    lbz 3, 0(28)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 5, 3
-; AIX32-NEXT:    lwz 3, L..C3(2) # @us
-; AIX32-NEXT:    rlwinm 6, 3, 3, 27, 27
-; AIX32-NEXT:    rlwinm 19, 3, 0, 0, 29
-; AIX32-NEXT:    xori 24, 6, 16
-; AIX32-NEXT:    slw 6, 4, 24
-; AIX32-NEXT:    li 4, 0
-; AIX32-NEXT:    slw 5, 5, 24
-; AIX32-NEXT:    ori 4, 4, 65535
-; AIX32-NEXT:    slw 4, 4, 24
-; AIX32-NEXT:    and 5, 5, 4
-; AIX32-NEXT:    and 6, 6, 4
-; AIX32-NEXT:  L..BB3_10: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 8, 0, 19
-; AIX32-NEXT:    and 7, 8, 4
-; AIX32-NEXT:    cmpw 7, 6
-; AIX32-NEXT:    bne 0, L..BB3_12
-; AIX32-NEXT:  # %bb.11: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 8, 8, 4
-; AIX32-NEXT:    or 8, 8, 5
-; AIX32-NEXT:    stwcx. 8, 0, 19
-; AIX32-NEXT:    bne 0, L..BB3_10
-; AIX32-NEXT:  L..BB3_12: # %entry
-; AIX32-NEXT:    srw 4, 7, 24
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    lwz 17, L..C4(2) # @si
-; AIX32-NEXT:    sth 4, 0(3)
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 4, 4
-; AIX32-NEXT:  L..BB3_13: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 5, 0, 17
-; AIX32-NEXT:    cmpw 5, 3
-; AIX32-NEXT:    bne 0, L..BB3_15
-; AIX32-NEXT:  # %bb.14: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    stwcx. 4, 0, 17
-; AIX32-NEXT:    bne 0, L..BB3_13
-; AIX32-NEXT:  L..BB3_15: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    stw 5, 0(17)
-; AIX32-NEXT:    lwz 27, L..C5(2) # @ui
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 4, 4
-; AIX32-NEXT:  L..BB3_16: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 5, 0, 27
-; AIX32-NEXT:    cmpw 5, 3
-; AIX32-NEXT:    bne 0, L..BB3_18
-; AIX32-NEXT:  # %bb.17: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    stwcx. 4, 0, 27
-; AIX32-NEXT:    bne 0, L..BB3_16
-; AIX32-NEXT:  L..BB3_18: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    lwz 31, L..C6(2) # @sll
-; AIX32-NEXT:    stw 5, 0(27)
-; AIX32-NEXT:    lbz 3, 0(28)
-; AIX32-NEXT:    li 23, 0
-; AIX32-NEXT:    addi 4, 1, 56
-; AIX32-NEXT:    li 7, 5
-; AIX32-NEXT:    li 8, 5
-; AIX32-NEXT:    stw 23, 56(1)
-; AIX32-NEXT:    extsb 6, 3
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    srawi 5, 6, 31
-; AIX32-NEXT:    stw 3, 60(1)
-; AIX32-NEXT:    mr 3, 31
-; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:    nop
-; AIX32-NEXT:    lwz 3, 60(1)
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    lwz 30, L..C7(2) # @ull
-; AIX32-NEXT:    li 7, 5
-; AIX32-NEXT:    li 8, 5
-; AIX32-NEXT:    stw 3, 4(31)
-; AIX32-NEXT:    lwz 3, 56(1)
-; AIX32-NEXT:    extsb 6, 4
-; AIX32-NEXT:    addi 4, 1, 56
-; AIX32-NEXT:    srawi 5, 6, 31
-; AIX32-NEXT:    stw 23, 56(1)
-; AIX32-NEXT:    stw 3, 0(31)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    stw 3, 60(1)
-; AIX32-NEXT:    mr 3, 30
-; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:    nop
-; AIX32-NEXT:    lwz 4, 60(1)
-; AIX32-NEXT:    lwz 3, 56(1)
-; AIX32-NEXT:    stw 4, 4(30)
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    stw 3, 0(30)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 5, 4, 21
-; AIX32-NEXT:    li 4, 255
-; AIX32-NEXT:    slw 6, 3, 21
-; AIX32-NEXT:    slw 4, 4, 21
-; AIX32-NEXT:    and 5, 5, 4
-; AIX32-NEXT:    and 6, 6, 4
-; AIX32-NEXT:  L..BB3_19: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 8, 0, 18
-; AIX32-NEXT:    and 7, 8, 4
-; AIX32-NEXT:    cmpw 7, 6
-; AIX32-NEXT:    bne 0, L..BB3_21
-; AIX32-NEXT:  # %bb.20: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 8, 8, 4
-; AIX32-NEXT:    or 8, 8, 5
-; AIX32-NEXT:    stwcx. 8, 0, 18
-; AIX32-NEXT:    bne 0, L..BB3_19
-; AIX32-NEXT:  L..BB3_21: # %entry
-; AIX32-NEXT:    srw 4, 7, 21
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    cmpw 4, 3
-; AIX32-NEXT:    li 3, 1
-; AIX32-NEXT:    iseleq 4, 3, 23
-; AIX32-NEXT:    slw 6, 5, 25
-; AIX32-NEXT:    li 5, 255
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    slw 5, 5, 25
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 7, 4, 25
-; AIX32-NEXT:    and 6, 6, 5
-; AIX32-NEXT:    and 7, 7, 5
-; AIX32-NEXT:  L..BB3_22: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 9, 0, 20
-; AIX32-NEXT:    and 8, 9, 5
-; AIX32-NEXT:    cmpw 8, 7
-; AIX32-NEXT:    bne 0, L..BB3_24
-; AIX32-NEXT:  # %bb.23: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 9, 9, 5
-; AIX32-NEXT:    or 9, 9, 6
-; AIX32-NEXT:    stwcx. 9, 0, 20
-; AIX32-NEXT:    bne 0, L..BB3_22
-; AIX32-NEXT:  L..BB3_24: # %entry
-; AIX32-NEXT:    srw 5, 8, 25
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    cmpw 5, 4
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    iseleq 4, 3, 23
-; AIX32-NEXT:    extsb 5, 5
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 6, 5, 26
-; AIX32-NEXT:    li 5, 0
-; AIX32-NEXT:    slw 7, 4, 26
-; AIX32-NEXT:    ori 5, 5, 65535
-; AIX32-NEXT:    slw 5, 5, 26
-; AIX32-NEXT:    and 6, 6, 5
-; AIX32-NEXT:    and 7, 7, 5
-; AIX32-NEXT:  L..BB3_25: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 9, 0, 22
-; AIX32-NEXT:    and 8, 9, 5
-; AIX32-NEXT:    cmpw 8, 7
-; AIX32-NEXT:    bne 0, L..BB3_27
-; AIX32-NEXT:  # %bb.26: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 9, 9, 5
-; AIX32-NEXT:    or 9, 9, 6
-; AIX32-NEXT:    stwcx. 9, 0, 22
-; AIX32-NEXT:    bne 0, L..BB3_25
-; AIX32-NEXT:  L..BB3_27: # %entry
-; AIX32-NEXT:    srw 5, 8, 26
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    cmpw 5, 4
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    iseleq 4, 3, 23
-; AIX32-NEXT:    extsb 5, 5
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    slw 6, 5, 24
-; AIX32-NEXT:    li 5, 0
-; AIX32-NEXT:    slw 7, 4, 24
-; AIX32-NEXT:    ori 5, 5, 65535
-; AIX32-NEXT:    slw 5, 5, 24
-; AIX32-NEXT:    and 6, 6, 5
-; AIX32-NEXT:    and 7, 7, 5
-; AIX32-NEXT:  L..BB3_28: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 9, 0, 19
-; AIX32-NEXT:    and 8, 9, 5
-; AIX32-NEXT:    cmpw 8, 7
-; AIX32-NEXT:    bne 0, L..BB3_30
-; AIX32-NEXT:  # %bb.29: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    andc 9, 9, 5
-; AIX32-NEXT:    or 9, 9, 6
-; AIX32-NEXT:    stwcx. 9, 0, 19
-; AIX32-NEXT:    bne 0, L..BB3_28
-; AIX32-NEXT:  L..BB3_30: # %entry
-; AIX32-NEXT:    srw 5, 8, 24
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    cmpw 5, 4
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    iseleq 4, 3, 23
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 5, 5
-; AIX32-NEXT:  L..BB3_31: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 6, 0, 17
-; AIX32-NEXT:    cmpw 1, 6, 4
-; AIX32-NEXT:    bne 1, L..BB3_33
-; AIX32-NEXT:  # %bb.32: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    stwcx. 5, 0, 17
-; AIX32-NEXT:    bne 0, L..BB3_31
-; AIX32-NEXT:  L..BB3_33: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    isel 4, 3, 23, 6
-; AIX32-NEXT:    lbz 5, 0(28)
-; AIX32-NEXT:    stw 4, 0(27)
-; AIX32-NEXT:    lbz 4, 0(29)
-; AIX32-NEXT:    sync
-; AIX32-NEXT:    extsb 5, 5
-; AIX32-NEXT:  L..BB3_34: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    lwarx 6, 0, 27
-; AIX32-NEXT:    cmpw 1, 6, 4
-; AIX32-NEXT:    bne 1, L..BB3_36
-; AIX32-NEXT:  # %bb.35: # %entry
-; AIX32-NEXT:    #
-; AIX32-NEXT:    stwcx. 5, 0, 27
-; AIX32-NEXT:    bne 0, L..BB3_34
-; AIX32-NEXT:  L..BB3_36: # %entry
-; AIX32-NEXT:    lwsync
-; AIX32-NEXT:    isel 3, 3, 23, 6
-; AIX32-NEXT:    li 7, 5
-; AIX32-NEXT:    li 8, 5
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    stw 3, 0(27)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    stw 23, 56(1)
-; AIX32-NEXT:    extsb 6, 4
-; AIX32-NEXT:    addi 4, 1, 56
-; AIX32-NEXT:    stw 3, 60(1)
-; AIX32-NEXT:    mr 3, 31
-; AIX32-NEXT:    srawi 5, 6, 31
-; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:    nop
-; AIX32-NEXT:    lbz 4, 0(28)
-; AIX32-NEXT:    stw 3, 0(27)
-; AIX32-NEXT:    lbz 3, 0(29)
-; AIX32-NEXT:    li 7, 5
-; AIX32-NEXT:    li 8, 5
-; AIX32-NEXT:    extsb 6, 4
-; AIX32-NEXT:    addi 4, 1, 56
-; AIX32-NEXT:    stw 3, 60(1)
-; AIX32-NEXT:    mr 3, 30
-; AIX32-NEXT:    stw 23, 56(1)
-; AIX32-NEXT:    srawi 5, 6, 31
-; AIX32-NEXT:    bl .__atomic_compare_exchange_8[PR]
-; AIX32-NEXT:    nop
-; AIX32-NEXT:    stw 3, 0(27)
-; AIX32-NEXT:    lwz 31, 124(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 30, 120(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 29, 116(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 28, 112(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 27, 108(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 26, 104(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 25, 100(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 24, 96(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 23, 92(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 22, 88(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 21, 84(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 20, 80(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 19, 76(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 18, 72(1) # 4-byte Folded Reload
-; AIX32-NEXT:    lwz 17, 68(1) # 4-byte Folded Reload
-; AIX32-NEXT:    addi 1, 1, 128
-; AIX32-NEXT:    lwz 0, 8(1)
-; AIX32-NEXT:    mtlr 0
+; AIX32-NEXT:   stwu 1, -144(1)
+; AIX32-NEXT:   stw 0, 152(1)
+; AIX32-NEXT:   stw 29, 132(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   lwz 29, L..C0(2)                        # @sc
+; AIX32-NEXT:   stw 26, 120(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   not     3, 29
+; AIX32-NEXT:   stw 30, 136(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   lwz 30, L..C1(2)                        # @uc
+; AIX32-NEXT:   lbz 4, 0(30)
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   stw 27, 124(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   rlwinm 27, 29, 0, 0, 29
+; AIX32-NEXT:   stw 14, 72(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 15, 76(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   rlwinm 26, 3, 3, 27, 28
+; AIX32-NEXT:   li 3, 255
+; AIX32-NEXT:   slw 3, 3, 26
+; AIX32-NEXT:   stw 16, 80(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 17, 84(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 18, 88(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 19, 92(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 20, 96(1)                           # 4-byte Folded Spill
+; AIX32-NEXT:   stw 21, 100(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 22, 104(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 23, 108(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 24, 112(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 25, 116(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 28, 128(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   stw 31, 140(1)                          # 4-byte Folded Spill
+; AIX32-NEXT:   not     25, 3
+; AIX32-NEXT:   lwarx 3, 0, 27
+; AIX32-NEXT:   srw 6, 3, 26
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 4
+; AIX32-NEXT:   bne     0, L..BB3_4
+; AIX32-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore289
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   slw 5, 5, 26
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_2:                               # %cmpxchg.trystore288
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 6, 3, 25
+; AIX32-NEXT:   or 6, 6, 5
+; AIX32-NEXT:   stwcx. 6, 0, 27
+; AIX32-NEXT:   beq     0, L..BB3_4
+; AIX32-NEXT:  # %bb.3:                                # %cmpxchg.releasedload287
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
+; AIX32-NEXT:   lwarx 3, 0, 27
+; AIX32-NEXT:   srw 6, 3, 26
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 4
+; AIX32-NEXT:   beq     0, L..BB3_2
+; AIX32-NEXT:  L..BB3_4:                               # %cmpxchg.nostore285
+; AIX32-NEXT:   not     4, 30
+; AIX32-NEXT:   srw 5, 3, 26
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   rlwinm 24, 30, 0, 0, 29
+; AIX32-NEXT:   rlwinm 23, 4, 3, 27, 28
+; AIX32-NEXT:   li 4, 255
+; AIX32-NEXT:   stb 5, 0(29)
+; AIX32-NEXT:   slw 4, 4, 23
+; AIX32-NEXT:   not     22, 4
+; AIX32-NEXT:   lwarx 4, 0, 24
+; AIX32-NEXT:   srw 6, 4, 23
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_8
+; AIX32-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore256
+; AIX32-NEXT:   clrlwi  5, 5, 24
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   slw 5, 5, 23
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_6:                               # %cmpxchg.trystore255
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 6, 4, 22
+; AIX32-NEXT:   or 6, 6, 5
+; AIX32-NEXT:   stwcx. 6, 0, 24
+; AIX32-NEXT:   beq     0, L..BB3_8
+; AIX32-NEXT:  # %bb.7:                                # %cmpxchg.releasedload254
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_6 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 24
+; AIX32-NEXT:   srw 6, 4, 23
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_6
+; AIX32-NEXT:  L..BB3_8:                               # %cmpxchg.nostore252
+; AIX32-NEXT:   srw 4, 4, 23
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lis 3, 0
+; AIX32-NEXT:   lbz 7, 0(29)
+; AIX32-NEXT:   stb 4, 0(30)
+; AIX32-NEXT:   clrlwi  6, 4, 24
+; AIX32-NEXT:   lwz 4, L..C2(2)                         # @ss
+; AIX32-NEXT:   ori 3, 3, 65535
+; AIX32-NEXT:   clrlwi  5, 4, 30
+; AIX32-NEXT:   rlwinm 21, 4, 0, 0, 29
+; AIX32-NEXT:   xori 5, 5, 2
+; AIX32-NEXT:   slwi 20, 5, 3
+; AIX32-NEXT:   slw 5, 3, 20
+; AIX32-NEXT:   not     19, 5
+; AIX32-NEXT:   lwarx 5, 0, 21
+; AIX32-NEXT:   srw 8, 5, 20
+; AIX32-NEXT:   clrlwi  8, 8, 16
+; AIX32-NEXT:   cmplw   8, 6
+; AIX32-NEXT:   bne     0, L..BB3_12
+; AIX32-NEXT:  # %bb.9:                                # %cmpxchg.fencedstore223
+; AIX32-NEXT:   extsb 7, 7
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   clrlwi  7, 7, 16
+; AIX32-NEXT:   slw 7, 7, 20
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_10:                              # %cmpxchg.trystore222
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 8, 5, 19
+; AIX32-NEXT:   or 8, 8, 7
+; AIX32-NEXT:   stwcx. 8, 0, 21
+; AIX32-NEXT:   beq     0, L..BB3_12
+; AIX32-NEXT:  # %bb.11:                               # %cmpxchg.releasedload221
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_10 Depth=1
+; AIX32-NEXT:   lwarx 5, 0, 21
+; AIX32-NEXT:   srw 8, 5, 20
+; AIX32-NEXT:   clrlwi  8, 8, 16
+; AIX32-NEXT:   cmplw   8, 6
+; AIX32-NEXT:   beq     0, L..BB3_10
+; AIX32-NEXT:  L..BB3_12:                              # %cmpxchg.nostore219
+; AIX32-NEXT:   srw 5, 5, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lbz 6, 0(29)
+; AIX32-NEXT:   sth 5, 0(4)
+; AIX32-NEXT:   lwz 4, L..C3(2)                         # @us
+; AIX32-NEXT:   lbz 5, 0(30)
+; AIX32-NEXT:   clrlwi  7, 4, 30
+; AIX32-NEXT:   rlwinm 18, 4, 0, 0, 29
+; AIX32-NEXT:   xori 7, 7, 2
+; AIX32-NEXT:   slwi 17, 7, 3
+; AIX32-NEXT:   slw 3, 3, 17
+; AIX32-NEXT:   not     16, 3
+; AIX32-NEXT:   lwarx 3, 0, 18
+; AIX32-NEXT:   srw 7, 3, 17
+; AIX32-NEXT:   clrlwi  7, 7, 16
+; AIX32-NEXT:   cmplw   7, 5
+; AIX32-NEXT:   bne     0, L..BB3_16
+; AIX32-NEXT:  # %bb.13:                               # %cmpxchg.fencedstore190
+; AIX32-NEXT:   extsb 6, 6
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   slw 6, 6, 17
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_14:                              # %cmpxchg.trystore189
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 7, 3, 16
+; AIX32-NEXT:   or 7, 7, 6
+; AIX32-NEXT:   stwcx. 7, 0, 18
+; AIX32-NEXT:   beq     0, L..BB3_16
+; AIX32-NEXT:  # %bb.15:                               # %cmpxchg.releasedload188
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_14 Depth=1
+; AIX32-NEXT:   lwarx 3, 0, 18
+; AIX32-NEXT:   srw 7, 3, 17
+; AIX32-NEXT:   clrlwi  7, 7, 16
+; AIX32-NEXT:   cmplw   7, 5
+; AIX32-NEXT:   beq     0, L..BB3_14
+; AIX32-NEXT:  L..BB3_16:                              # %cmpxchg.nostore186
+; AIX32-NEXT:   srw 3, 3, 17
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lwz 15, L..C4(2)                        # @si
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   sth 3, 0(4)
+; AIX32-NEXT:   lbz 4, 0(30)
+; AIX32-NEXT:   lwarx 3, 0, 15
+; AIX32-NEXT:   cmplw   3, 4
+; AIX32-NEXT:   bne     0, L..BB3_20
+; AIX32-NEXT:  # %bb.17:                               # %cmpxchg.fencedstore171
+; AIX32-NEXT:   extsb 5, 5
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   .align  5
+; AIX32-NEXT:  L..BB3_18:                              # %cmpxchg.trystore170
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   stwcx. 5, 0, 15
+; AIX32-NEXT:   beq     0, L..BB3_20
+; AIX32-NEXT:  # %bb.19:                               # %cmpxchg.releasedload169
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_18 Depth=1
+; AIX32-NEXT:   lwarx 3, 0, 15
+; AIX32-NEXT:   cmplw   3, 4
+; AIX32-NEXT:   beq     0, L..BB3_18
+; AIX32-NEXT:  L..BB3_20:                              # %cmpxchg.nostore167
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   lwz 28, L..C5(2)                        # @ui
+; AIX32-NEXT:   stw 3, 0(15)
+; AIX32-NEXT:   lbz 4, 0(30)
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   lwarx 3, 0, 28
+; AIX32-NEXT:   cmplw   3, 4
+; AIX32-NEXT:   bne     0, L..BB3_24
+; AIX32-NEXT:  # %bb.21:                               # %cmpxchg.fencedstore152
+; AIX32-NEXT:   extsb 5, 5
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   .align  5
+; AIX32-NEXT:  L..BB3_22:                              # %cmpxchg.trystore151
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   stwcx. 5, 0, 28
+; AIX32-NEXT:   beq     0, L..BB3_24
+; AIX32-NEXT:  # %bb.23:                               # %cmpxchg.releasedload150
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_22 Depth=1
+; AIX32-NEXT:   lwarx 3, 0, 28
+; AIX32-NEXT:   cmplw   3, 4
+; AIX32-NEXT:   beq     0, L..BB3_22
+; AIX32-NEXT:  L..BB3_24:                              # %cmpxchg.nostore148
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lwz 31, L..C6(2)                        # @sll
+; AIX32-NEXT:   lbz 3, 0(29)
+; AIX32-NEXT:   li 14, 0
+; AIX32-NEXT:   addi 4, 1, 64
+; AIX32-NEXT:   li 7, 5
+; AIX32-NEXT:   li 8, 5
+; AIX32-NEXT:   stw 14, 64(1)
+; AIX32-NEXT:   extsb 6, 3
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   srawi 5, 6, 31
+; AIX32-NEXT:   stw 3, 68(1)
+; AIX32-NEXT:   mr      3, 31
+; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:   nop
+; AIX32-NEXT:   lwz 3, 68(1)
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   li 7, 5
+; AIX32-NEXT:   li 8, 5
+; AIX32-NEXT:   stw 3, 4(31)
+; AIX32-NEXT:   lwz 3, 64(1)
+; AIX32-NEXT:   extsb 6, 4
+; AIX32-NEXT:   addi 4, 1, 64
+; AIX32-NEXT:   stw 14, 64(1)
+; AIX32-NEXT:   srawi 5, 6, 31
+; AIX32-NEXT:   stw 3, 0(31)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwz 31, L..C7(2)                        # @ull
+; AIX32-NEXT:   stw 3, 68(1)
+; AIX32-NEXT:   mr      3, 31
+; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:   nop
+; AIX32-NEXT:   lwz 3, 64(1)
+; AIX32-NEXT:   lwz 4, 68(1)
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   stw 4, 4(31)
+; AIX32-NEXT:   stw 3, 0(31)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 4, 0, 27
+; AIX32-NEXT:   srw 6, 4, 26
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_28
+; AIX32-NEXT:  # %bb.25:                               # %cmpxchg.fencedstore119
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   slw 5, 5, 26
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_26:                              # %cmpxchg.trystore118
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 4, 4, 25
+; AIX32-NEXT:   or 4, 4, 5
+; AIX32-NEXT:   stwcx. 4, 0, 27
+; AIX32-NEXT:   beq     0, L..BB3_29
+; AIX32-NEXT:  # %bb.27:                               # %cmpxchg.releasedload117
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_26 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 27
+; AIX32-NEXT:   srw 6, 4, 26
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_26
+; AIX32-NEXT:  L..BB3_28:                              # %cmpxchg.nostore115
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_30
+; AIX32-NEXT:  L..BB3_29:                              # %cmpxchg.success116
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_30:                              # %cmpxchg.end113
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 4, 0, 24
+; AIX32-NEXT:   srw 6, 4, 23
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_34
+; AIX32-NEXT:  # %bb.31:                               # %cmpxchg.fencedstore86
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   slw 5, 5, 23
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_32:                              # %cmpxchg.trystore85
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 4, 4, 22
+; AIX32-NEXT:   or 4, 4, 5
+; AIX32-NEXT:   stwcx. 4, 0, 24
+; AIX32-NEXT:   beq     0, L..BB3_35
+; AIX32-NEXT:  # %bb.33:                               # %cmpxchg.releasedload84
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_32 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 24
+; AIX32-NEXT:   srw 6, 4, 23
+; AIX32-NEXT:   clrlwi  6, 6, 24
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_32
+; AIX32-NEXT:  L..BB3_34:                              # %cmpxchg.nostore82
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_36
+; AIX32-NEXT:  L..BB3_35:                              # %cmpxchg.success83
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_36:                              # %cmpxchg.end80
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 4, 0, 21
+; AIX32-NEXT:   srw 6, 4, 20
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_40
+; AIX32-NEXT:  # %bb.37:                               # %cmpxchg.fencedstore53
+; AIX32-NEXT:   extsb 5, 5
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   clrlwi  5, 5, 16
+; AIX32-NEXT:   slw 5, 5, 20
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_38:                              # %cmpxchg.trystore52
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 4, 4, 19
+; AIX32-NEXT:   or 4, 4, 5
+; AIX32-NEXT:   stwcx. 4, 0, 21
+; AIX32-NEXT:   beq     0, L..BB3_41
+; AIX32-NEXT:  # %bb.39:                               # %cmpxchg.releasedload51
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_38 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 21
+; AIX32-NEXT:   srw 6, 4, 20
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_38
+; AIX32-NEXT:  L..BB3_40:                              # %cmpxchg.nostore49
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_42
+; AIX32-NEXT:  L..BB3_41:                              # %cmpxchg.success50
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_42:                              # %cmpxchg.end47
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   lbz 5, 0(29)
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 4, 0, 18
+; AIX32-NEXT:   srw 6, 4, 17
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   bne     0, L..BB3_46
+; AIX32-NEXT:  # %bb.43:                               # %cmpxchg.fencedstore29
+; AIX32-NEXT:   extsb 5, 5
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   clrlwi  5, 5, 16
+; AIX32-NEXT:   slw 5, 5, 17
+; AIX32-NEXT:   .align  4
+; AIX32-NEXT:  L..BB3_44:                              # %cmpxchg.trystore28
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   and 4, 4, 16
+; AIX32-NEXT:   or 4, 4, 5
+; AIX32-NEXT:   stwcx. 4, 0, 18
+; AIX32-NEXT:   beq     0, L..BB3_47
+; AIX32-NEXT:  # %bb.45:                               # %cmpxchg.releasedload27
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_44 Depth=1
+; AIX32-NEXT:   lwarx 4, 0, 18
+; AIX32-NEXT:   srw 6, 4, 17
+; AIX32-NEXT:   clrlwi  6, 6, 16
+; AIX32-NEXT:   cmplw   6, 3
+; AIX32-NEXT:   beq     0, L..BB3_44
+; AIX32-NEXT:  L..BB3_46:                              # %cmpxchg.nostore25
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_48
+; AIX32-NEXT:  L..BB3_47:                              # %cmpxchg.success26
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_48:                              # %cmpxchg.end23
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 5, 0, 15
+; AIX32-NEXT:   cmplw   5, 3
+; AIX32-NEXT:   bne     0, L..BB3_52
+; AIX32-NEXT:  # %bb.49:                               # %cmpxchg.fencedstore10
+; AIX32-NEXT:   extsb 4, 4
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   .align  5
+; AIX32-NEXT:  L..BB3_50:                              # %cmpxchg.trystore9
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   stwcx. 4, 0, 15
+; AIX32-NEXT:   beq     0, L..BB3_53
+; AIX32-NEXT:  # %bb.51:                               # %cmpxchg.releasedload8
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_50 Depth=1
+; AIX32-NEXT:   lwarx 5, 0, 15
+; AIX32-NEXT:   cmplw   5, 3
+; AIX32-NEXT:   beq     0, L..BB3_50
+; AIX32-NEXT:  L..BB3_52:                              # %cmpxchg.nostore6
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_54
+; AIX32-NEXT:  L..BB3_53:                              # %cmpxchg.success7
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_54:                              # %cmpxchg.end4
+; AIX32-NEXT:   li 3, 0
+; AIX32-NEXT:   li 4, 1
+; AIX32-NEXT:   isel 3, 4, 3, 20
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   lwarx 5, 0, 28
+; AIX32-NEXT:   cmplw   5, 3
+; AIX32-NEXT:   bne     0, L..BB3_58
+; AIX32-NEXT:  # %bb.55:                               # %cmpxchg.fencedstore
+; AIX32-NEXT:   extsb 4, 4
+; AIX32-NEXT:   sync
+; AIX32-NEXT:   .align  5
+; AIX32-NEXT:  L..BB3_56:                              # %cmpxchg.trystore
+; AIX32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; AIX32-NEXT:   stwcx. 4, 0, 28
+; AIX32-NEXT:   beq     0, L..BB3_59
+; AIX32-NEXT:  # %bb.57:                               # %cmpxchg.releasedload
+; AIX32-NEXT:                                          #   in Loop: Header=BB3_56 Depth=1
+; AIX32-NEXT:   lwarx 5, 0, 28
+; AIX32-NEXT:   cmplw   5, 3
+; AIX32-NEXT:   beq     0, L..BB3_56
+; AIX32-NEXT:  L..BB3_58:                              # %cmpxchg.nostore
+; AIX32-NEXT:   crxor 20, 20, 20
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   b L..BB3_60
+; AIX32-NEXT:  L..BB3_59:                              # %cmpxchg.success
+; AIX32-NEXT:   lwsync
+; AIX32-NEXT:   creqv 20, 20, 20
+; AIX32-NEXT:  L..BB3_60:                              # %cmpxchg.end
+; AIX32-NEXT:   li 3, 1
+; AIX32-NEXT:   li 31, 0
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   isel 3, 3, 31, 20
+; AIX32-NEXT:   li 7, 5
+; AIX32-NEXT:   li 8, 5
+; AIX32-NEXT:   extsb 6, 4
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   addi 4, 1, 64
+; AIX32-NEXT:   stw 31, 64(1)
+; AIX32-NEXT:   srawi 5, 6, 31
+; AIX32-NEXT:   stw 3, 68(1)
+; AIX32-NEXT:   lwz 3, L..C6(2)                         # @sll
+; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:   nop
+; AIX32-NEXT:   lbz 4, 0(29)
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lbz 3, 0(30)
+; AIX32-NEXT:   li 7, 5
+; AIX32-NEXT:   li 8, 5
+; AIX32-NEXT:   extsb 6, 4
+; AIX32-NEXT:   stw 3, 68(1)
+; AIX32-NEXT:   lwz 3, L..C7(2)                         # @ull
+; AIX32-NEXT:   addi 4, 1, 64
+; AIX32-NEXT:   stw 31, 64(1)
+; AIX32-NEXT:   srawi 5, 6, 31
+; AIX32-NEXT:   bl .__atomic_compare_exchange_8[PR]
+; AIX32-NEXT:   nop
+; AIX32-NEXT:   stw 3, 0(28)
+; AIX32-NEXT:   lwz 31, 140(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 30, 136(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 29, 132(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 28, 128(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 27, 124(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 26, 120(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 25, 116(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 24, 112(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 23, 108(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 22, 104(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 21, 100(1)                          # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 20, 96(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 19, 92(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 18, 88(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 17, 84(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 16, 80(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 15, 76(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   lwz 14, 72(1)                           # 4-byte Folded Reload
+; AIX32-NEXT:   addi 1, 1, 144
+; AIX32-NEXT:   lwz 0, 8(1)
+; AIX32-NEXT:   mtlr 0
 ; AIX32-NEXT:    blr
 entry:
   %0 = load i8, ptr @uc, align 1
@@ -5597,21 +5852,20 @@ entry:
 define dso_local i64 @cmpswplp(ptr noundef %ptr, ptr nocapture noundef readnone %oldval, i64 noundef %newval) local_unnamed_addr #0 {
 ; CHECK-LABEL: cmpswplp:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi 4, 5, 1
-; CHECK-NEXT:  .LBB6_1: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    ldarx 6, 0, 3
-; CHECK-NEXT:    cmpd 1, 6, 5
-; CHECK-NEXT:    bne 1, .LBB6_3
-; CHECK-NEXT:  # %bb.2: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stdcx. 4, 0, 3
-; CHECK-NEXT:    bne 0, .LBB6_1
-; CHECK-NEXT:  .LBB6_3: # %entry
-; CHECK-NEXT:    li 3, 66
-; CHECK-NEXT:    li 4, 55
-; CHECK-NEXT:    isel 3, 4, 3, 6
-; CHECK-NEXT:    blr
+; CHECK-NEXT:   ldarx 4, 0, 3
+; CHECK-NEXT:   cmpld   4, 5
+; CHECK-NEXT:   bne     0, .LBB6_2
+; CHECK-NEXT: # %bb.1:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:   addi 4, 5, 1
+; CHECK-NEXT:   stdcx. 4, 0, 3
+; CHECK-NEXT:   beq     0, .LBB6_4
+; CHECK-NEXT: .LBB6_2:                                # %cmpxchg.failure
+; CHECK-NEXT:   crxor 20, 20, 20
+; CHECK-NEXT: .LBB6_3:                                # %cmpxchg.end
+; CHECK-NEXT:   li 3, 66
+; CHECK-NEXT:   li 4, 55
+; CHECK-NEXT:   isel 3, 4, 3, 20
+; CHECK-NEXT:   blr
 ;
 ; AIX32-LABEL: cmpswplp:
 ; AIX32:       # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/PowerPC/atomic-2.ll b/llvm/test/CodeGen/PowerPC/atomic-2.ll
index 10476541870f9..8fa0d767b329d 100644
--- a/llvm/test/CodeGen/PowerPC/atomic-2.ll
+++ b/llvm/test/CodeGen/PowerPC/atomic-2.ll
@@ -42,8 +42,8 @@ define i64 @exchange_and_cmp(ptr %mem) nounwind {
 
 define i8 @exchange_and_cmp8(ptr %mem) nounwind {
 ; CHECK-LABEL: exchange_and_cmp8:
-; CHECK-BE: xori
-; CHECK-LE-NOT: xori
+; CHECK-BE: or r{{.*}} r{{.*}} r{{.*}}
+; CHECK-LE-NOT: or r{{.*}} r{{.*}} r{{.*}}
 ; CHECK-P8U: lbarx
   %tmppair = cmpxchg ptr %mem, i8 0, i8 1 monotonic monotonic
   %tmp = extractvalue { i8, i1 } %tmppair, 0
diff --git a/llvm/test/CodeGen/PowerPC/atomic-compare-exchange-weak.ll b/llvm/test/CodeGen/PowerPC/atomic-compare-exchange-weak.ll
index 399645f671f7b..65a12a6222f24 100644
--- a/llvm/test/CodeGen/PowerPC/atomic-compare-exchange-weak.ll
+++ b/llvm/test/CodeGen/PowerPC/atomic-compare-exchange-weak.ll
@@ -12,62 +12,60 @@
 define i32 @foo(ptr noundef %cp, ptr noundef %old, i32 noundef %c)  {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lwz r7, 0(r4)
 ; CHECK-NEXT:    stw r3, -4(r1)
 ; CHECK-NEXT:    stw r4, -8(r1)
+; CHECK-NEXT:    lwz r7, 0(r4)
 ; CHECK-NEXT:    stw r5, -12(r1)
 ; CHECK-NEXT:    stw r5, -16(r1)
-; CHECK-NEXT:  L..BB0_1: # %entry
-; CHECK-NEXT:    #
 ; CHECK-NEXT:    lwarx r6, 0, r3
-; CHECK-NEXT:    cmpw cr1, r6, r7
-; CHECK-NEXT:    bne cr1, L..BB0_3
-; CHECK-NEXT:  # %bb.2: # %entry
-; CHECK-NEXT:    #
-; CHECK-NEXT:    stwcx. r5, 0, r3
-; CHECK-NEXT:    bne cr0, L..BB0_1
-; CHECK-NEXT:  L..BB0_3: # %entry
 ; CHECK-NEXT:    cmplw r6, r7
+; CHECK-NEXT:    bne cr0, L..BB0_2
+; CHECK-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; CHECK-NEXT:    stwcx. r5, 0, r3
 ; CHECK-NEXT:    beq cr0, L..BB0_5
-; CHECK-NEXT:  # %bb.4: # %cmpxchg.store_expected
+; CHECK-NEXT:  L..BB0_2: # %cmpxchg.failure
+; CHECK-NEXT:    crxor 4*cr5+lt, 4*cr5+lt, 4*cr5+lt
+; CHECK-NEXT:  # %bb.3: # %cmpxchg.store_expected
 ; CHECK-NEXT:    stw r6, 0(r4)
-; CHECK-NEXT:  L..BB0_5: # %cmpxchg.continue
+; CHECK-NEXT:  L..BB0_4: # %cmpxchg.continue
 ; CHECK-NEXT:    li r3, 0
 ; CHECK-NEXT:    li r4, 1
-; CHECK-NEXT:    isel r3, r4, r3, 4*cr1+eq
+; CHECK-NEXT:    isel r3, r4, r3, 4*cr5+lt
 ; CHECK-NEXT:    stb r3, -17(r1)
 ; CHECK-NEXT:    blr
+; CHECK-NEXT:  L..BB0_5:
+; CHECK-NEXT:    creqv 4*cr5+lt, 4*cr5+lt, 4*cr5+lt
+; CHECK-NEXT:    b L..BB0_4
 ;
 ; CHECK64-LABEL: foo:
 ; CHECK64:       # %bb.0: # %entry
-; CHECK64-NEXT:    lwz r7, 0(r4)
 ; CHECK64-NEXT:    std r3, -8(r1)
 ; CHECK64-NEXT:    std r4, -16(r1)
+; CHECK64-NEXT:    lwz r7, 0(r4)
 ; CHECK64-NEXT:    stw r5, -20(r1)
 ; CHECK64-NEXT:    stw r5, -24(r1)
-; CHECK64-NEXT:  L..BB0_1: # %entry
-; CHECK64-NEXT:    #
 ; CHECK64-NEXT:    lwarx r6, 0, r3
-; CHECK64-NEXT:    cmpw cr1, r6, r7
-; CHECK64-NEXT:    bne cr1, L..BB0_3
-; CHECK64-NEXT:  # %bb.2: # %entry
-; CHECK64-NEXT:    #
-; CHECK64-NEXT:    stwcx. r5, 0, r3
-; CHECK64-NEXT:    bne cr0, L..BB0_1
-; CHECK64-NEXT:  L..BB0_3: # %entry
 ; CHECK64-NEXT:    cmplw r6, r7
+; CHECK64-NEXT:    bne cr0, L..BB0_2
+; CHECK64-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; CHECK64-NEXT:    stwcx. r5, 0, r3
 ; CHECK64-NEXT:    beq cr0, L..BB0_5
-; CHECK64-NEXT:  # %bb.4: # %cmpxchg.store_expected
+; CHECK64-NEXT:  L..BB0_2: # %cmpxchg.failure
+; CHECK64-NEXT:    crxor 4*cr5+lt, 4*cr5+lt, 4*cr5+lt
+; CHECK64-NEXT:  # %bb.3: # %cmpxchg.store_expected
 ; CHECK64-NEXT:    stw r6, 0(r4)
-; CHECK64-NEXT:  L..BB0_5: # %cmpxchg.continue
+; CHECK64-NEXT:  L..BB0_4: # %cmpxchg.continue
 ; CHECK64-NEXT:    li r3, 0
 ; CHECK64-NEXT:    li r4, 1
-; CHECK64-NEXT:    isel r3, r4, r3, 4*cr1+eq
+; CHECK64-NEXT:    isel r3, r4, r3, 4*cr5+lt
 ; CHECK64-NEXT:    li r4, 1
 ; CHECK64-NEXT:    stb r3, -25(r1)
 ; CHECK64-NEXT:    li r3, 0
-; CHECK64-NEXT:    isel r3, r4, r3, 4*cr1+eq
+; CHECK64-NEXT:    isel r3, r4, r3, 4*cr5+lt
 ; CHECK64-NEXT:    blr
+; CHECK64-NEXT:  L..BB0_5:
+; CHECK64-NEXT:    creqv 4*cr5+lt, 4*cr5+lt, 4*cr5+lt
+; CHECK64-NEXT:    b L..BB0_4
 entry:
   %cp.addr = alloca ptr, align 4
   %old.addr = alloca ptr, align 4
diff --git a/llvm/test/CodeGen/PowerPC/atomic-float.ll b/llvm/test/CodeGen/PowerPC/atomic-float.ll
index e2a46130ab797..600d28936c162 100644
--- a/llvm/test/CodeGen/PowerPC/atomic-float.ll
+++ b/llvm/test/CodeGen/PowerPC/atomic-float.ll
@@ -9,33 +9,37 @@ define float @test_add(ptr %ptr, float %incr) {
 ; CHECK-64:       # %bb.0: # %entry
 ; CHECK-64-NEXT:    sync
 ; CHECK-64-NEXT:    lfs 0, 0(3)
-; CHECK-64-NEXT:    b .LBB0_2
-; CHECK-64-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-64-NEXT:    #
-; CHECK-64-NEXT:    stw 6, -4(1)
-; CHECK-64-NEXT:    cmplw 6, 4
-; CHECK-64-NEXT:    lfs 0, -4(1)
-; CHECK-64-NEXT:    beq 0, .LBB0_5
-; CHECK-64-NEXT:  .LBB0_2: # %atomicrmw.start
-; CHECK-64-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-64-NEXT:    # Child Loop BB0_3 Depth 2
+; CHECK-64-NEXT:    b .LBB0_3
+; CHECK-64-NEXT:  .LBB0_1:                                # %cmpxchg.nostore
+; CHECK-64-NEXT:                                          #   in Loop: Header=BB0_3 Depth=1
+; CHECK-64-NEXT:    crxor 20, 20, 20
+; CHECK-64-NEXT:  .LBB0_2:                                # %cmpxchg.end
+; CHECK-64-NEXT:                                          #   in Loop: Header=BB0_3 Depth=1
+; CHECK-64-NEXT:    stw 4, -12(1)
+; CHECK-64-NEXT:    lfs 0, -12(1)
+; CHECK-64-NEXT:    bc 12, 20, .LBB0_7
+; CHECK-64-NEXT:  .LBB0_3:                                # %atomicrmw.start
+; CHECK-64-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-64-NEXT:                                          #     Child Loop BB0_4 Depth 2
 ; CHECK-64-NEXT:    fadds 2, 0, 1
-; CHECK-64-NEXT:    stfs 2, -8(1)
-; CHECK-64-NEXT:    stfs 0, -12(1)
-; CHECK-64-NEXT:    lwz 5, -8(1)
-; CHECK-64-NEXT:    lwz 4, -12(1)
-; CHECK-64-NEXT:  .LBB0_3: # %atomicrmw.start
-; CHECK-64-NEXT:    # Parent Loop BB0_2 Depth=1
-; CHECK-64-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-64-NEXT:    lwarx 6, 0, 3
-; CHECK-64-NEXT:    cmpw 6, 4
-; CHECK-64-NEXT:    bne 0, .LBB0_1
-; CHECK-64-NEXT:  # %bb.4: # %atomicrmw.start
-; CHECK-64-NEXT:    #
+; CHECK-64-NEXT:    stfs 2, -4(1)
+; CHECK-64-NEXT:    stfs 0, -8(1)
+; CHECK-64-NEXT:    lwz 5, -4(1)
+; CHECK-64-NEXT:    lwz 6, -8(1)
+; CHECK-64-NEXT:  .LBB0_4:                                # %cmpxchg.start
+; CHECK-64-NEXT:                                          #   Parent Loop BB0_3 Depth=1
+; CHECK-64-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-64-NEXT:    lwarx 4, 0, 3
+; CHECK-64-NEXT:    cmplw   4, 6
+; CHECK-64-NEXT:    bne     0, .LBB0_1
+; CHECK-64-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-64-NEXT:                                          #   in Loop: Header=BB0_4 Depth=2
 ; CHECK-64-NEXT:    stwcx. 5, 0, 3
-; CHECK-64-NEXT:    bne 0, .LBB0_3
-; CHECK-64-NEXT:    b .LBB0_1
-; CHECK-64-NEXT:  .LBB0_5: # %atomicrmw.end
+; CHECK-64-NEXT:    bne     0, .LBB0_4
+; CHECK-64-NEXT:  # %bb.6:                                #   in Loop: Header=BB0_3 Depth=1
+; CHECK-64-NEXT:    creqv 20, 20, 20
+; CHECK-64-NEXT:    b .LBB0_2
+; CHECK-64-NEXT:  .LBB0_7:                                # %atomicrmw.end
 ; CHECK-64-NEXT:    fmr 1, 0
 ; CHECK-64-NEXT:    lwsync
 ; CHECK-64-NEXT:    blr
@@ -46,33 +50,37 @@ define float @test_add(ptr %ptr, float %incr) {
 ; CHECK-32-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-32-NEXT:    sync
 ; CHECK-32-NEXT:    lfs 0, 0(3)
-; CHECK-32-NEXT:    b .LBB0_2
-; CHECK-32-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-32-NEXT:    #
-; CHECK-32-NEXT:    stw 6, 28(1)
-; CHECK-32-NEXT:    cmplw 6, 4
-; CHECK-32-NEXT:    lfs 0, 28(1)
-; CHECK-32-NEXT:    beq 0, .LBB0_5
-; CHECK-32-NEXT:  .LBB0_2: # %atomicrmw.start
-; CHECK-32-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-32-NEXT:    # Child Loop BB0_3 Depth 2
+; CHECK-32-NEXT:    b .LBB0_3
+; CHECK-32-NEXT:  .LBB0_1:                                # %cmpxchg.nostore
+; CHECK-32-NEXT:                                          #   in Loop: Header=BB0_3 Depth=1
+; CHECK-32-NEXT:    crxor 20, 20, 20
+; CHECK-32-NEXT:  .LBB0_2:                                # %cmpxchg.end
+; CHECK-32-NEXT:                                          #   in Loop: Header=BB0_3 Depth=1
+; CHECK-32-NEXT:    stw 4, 20(1)
+; CHECK-32-NEXT:    lfs 0, 20(1)
+; CHECK-32-NEXT:    bc 12, 20, .LBB0_7
+; CHECK-32-NEXT:  .LBB0_3:                                # %atomicrmw.start
+; CHECK-32-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-32-NEXT:                                          #     Child Loop BB0_4 Depth 2
 ; CHECK-32-NEXT:    fadds 2, 0, 1
-; CHECK-32-NEXT:    stfs 2, 24(1)
-; CHECK-32-NEXT:    stfs 0, 20(1)
-; CHECK-32-NEXT:    lwz 5, 24(1)
-; CHECK-32-NEXT:    lwz 4, 20(1)
-; CHECK-32-NEXT:  .LBB0_3: # %atomicrmw.start
-; CHECK-32-NEXT:    # Parent Loop BB0_2 Depth=1
-; CHECK-32-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-32-NEXT:    lwarx 6, 0, 3
-; CHECK-32-NEXT:    cmpw 6, 4
-; CHECK-32-NEXT:    bne 0, .LBB0_1
-; CHECK-32-NEXT:  # %bb.4: # %atomicrmw.start
-; CHECK-32-NEXT:    #
+; CHECK-32-NEXT:    stfs 2, 28(1)
+; CHECK-32-NEXT:    stfs 0, 24(1)
+; CHECK-32-NEXT:    lwz 5, 28(1)
+; CHECK-32-NEXT:    lwz 6, 24(1)
+; CHECK-32-NEXT:  .LBB0_4:                                # %cmpxchg.start
+; CHECK-32-NEXT:                                          #   Parent Loop BB0_3 Depth=1
+; CHECK-32-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-32-NEXT:    lwarx 4, 0, 3
+; CHECK-32-NEXT:    cmplw   4, 6
+; CHECK-32-NEXT:    bne     0, .LBB0_1
+; CHECK-32-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-32-NEXT:                                          #   in Loop: Header=BB0_4 Depth=2
 ; CHECK-32-NEXT:    stwcx. 5, 0, 3
-; CHECK-32-NEXT:    bne 0, .LBB0_3
-; CHECK-32-NEXT:    b .LBB0_1
-; CHECK-32-NEXT:  .LBB0_5: # %atomicrmw.end
+; CHECK-32-NEXT:    bne     0, .LBB0_4
+; CHECK-32-NEXT:  # %bb.6:                                #   in Loop: Header=BB0_3 Depth=1
+; CHECK-32-NEXT:    creqv 20, 20, 20
+; CHECK-32-NEXT:    b .LBB0_2
+; CHECK-32-NEXT:  .LBB0_7:                                # %atomicrmw.end
 ; CHECK-32-NEXT:    fmr 1, 0
 ; CHECK-32-NEXT:    lwsync
 ; CHECK-32-NEXT:    addi 1, 1, 32
diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
index 4f00cff83942a..27a26aaca8b26 100644
--- a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
+++ b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll
@@ -5,49 +5,47 @@ define i8 @atomicrmw_usub_cond_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_usub_cond_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    rlwinm 7, 5, 3, 27, 28
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    xori 7, 7, 24
-; CHECK-NEXT:    li 8, 255
-; CHECK-NEXT:    clrlwi 6, 4, 24
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 8, 8, 7
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    not     3, 3
+; CHECK-NEXT:    li 6, 255
+; CHECK-NEXT:    lwz 8, 0(5)
+; CHECK-NEXT:    rlwinm 3, 3, 3, 27, 28
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  7, 4, 24
 ; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 11, 7
-; CHECK-NEXT:    cmplw 3, 9
-; CHECK-NEXT:    beq 0, .LBB0_7
-; CHECK-NEXT:  .LBB0_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    clrlwi 9, 3, 24
-; CHECK-NEXT:    cmplw 9, 6
-; CHECK-NEXT:    blt 0, .LBB0_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:  .LBB0_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 3, 7
-; CHECK-NEXT:    slw 10, 9, 7
-; CHECK-NEXT:    and 3, 3, 8
-; CHECK-NEXT:    and 10, 10, 8
-; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB0_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 12, 0, 5
-; CHECK-NEXT:    and 11, 12, 8
-; CHECK-NEXT:    cmpw 11, 10
-; CHECK-NEXT:    bne 0, .LBB0_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 12, 12, 8
-; CHECK-NEXT:    or 12, 12, 3
-; CHECK-NEXT:    stwcx. 12, 0, 5
-; CHECK-NEXT:    bne 0, .LBB0_5
-; CHECK-NEXT:    b .LBB0_1
-; CHECK-NEXT:  .LBB0_7: # %atomicrmw.end
+; CHECK-NEXT:  .LBB0_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  .LBB0_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB0_5 Depth 2
+; CHECK-NEXT:    srw 9, 8, 3
+; CHECK-NEXT:    clrlwi  10, 9, 24
+; CHECK-NEXT:    cmplw   10, 7
+; CHECK-NEXT:    blt     0, .LBB0_4
+; CHECK-NEXT:  # %bb.3:                                #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    sub     9, 9, 4
+; CHECK-NEXT:  .LBB0_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    clrlwi  9, 9, 24
+; CHECK-NEXT:    slw 9, 9, 3
+; CHECK-NEXT:    and 10, 8, 6
+; CHECK-NEXT:    or 10, 10, 9
+; CHECK-NEXT:  .LBB0_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 9, 0, 5
+; CHECK-NEXT:    cmplw   9, 8
+; CHECK-NEXT:    bne     0, .LBB0_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_5 Depth=2
+; CHECK-NEXT:    stwcx. 10, 0, 5
+; CHECK-NEXT:    bne     0, .LBB0_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 9, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw usub_cond ptr %ptr, i8 %val seq_cst
@@ -58,50 +56,49 @@ define i16 @atomicrmw_usub_cond_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_usub_cond_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    li 8, 0
-; CHECK-NEXT:    lhz 3, 0(3)
-; CHECK-NEXT:    rlwinm 7, 5, 3, 27, 27
-; CHECK-NEXT:    xori 7, 7, 16
-; CHECK-NEXT:    ori 8, 8, 65535
-; CHECK-NEXT:    clrlwi 6, 4, 16
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 8, 8, 7
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    clrlwi  3, 3, 30
+; CHECK-NEXT:    lis 6, 0
+; CHECK-NEXT:    xori 3, 3, 2
+; CHECK-NEXT:    lwz 8, 0(5)
+; CHECK-NEXT:    ori 6, 6, 65535
+; CHECK-NEXT:    slwi 3, 3, 3
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  7, 4, 16
 ; CHECK-NEXT:    b .LBB1_2
-; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 11, 7
-; CHECK-NEXT:    cmplw 3, 9
-; CHECK-NEXT:    beq 0, .LBB1_7
-; CHECK-NEXT:  .LBB1_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB1_5 Depth 2
-; CHECK-NEXT:    clrlwi 9, 3, 16
-; CHECK-NEXT:    cmplw 9, 6
-; CHECK-NEXT:    blt 0, .LBB1_4
-; CHECK-NEXT:  # %bb.3:
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:  .LBB1_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 3, 7
-; CHECK-NEXT:    slw 10, 9, 7
-; CHECK-NEXT:    and 3, 3, 8
-; CHECK-NEXT:    and 10, 10, 8
-; CHECK-NEXT:  .LBB1_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB1_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 12, 0, 5
-; CHECK-NEXT:    and 11, 12, 8
-; CHECK-NEXT:    cmpw 11, 10
-; CHECK-NEXT:    bne 0, .LBB1_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 12, 12, 8
-; CHECK-NEXT:    or 12, 12, 3
-; CHECK-NEXT:    stwcx. 12, 0, 5
-; CHECK-NEXT:    bne 0, .LBB1_5
-; CHECK-NEXT:    b .LBB1_1
-; CHECK-NEXT:  .LBB1_7: # %atomicrmw.end
+; CHECK-NEXT:  .LBB1_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  .LBB1_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB1_5 Depth 2
+; CHECK-NEXT:    srw 9, 8, 3
+; CHECK-NEXT:    clrlwi  10, 9, 16
+; CHECK-NEXT:    cmplw   10, 7
+; CHECK-NEXT:    blt     0, .LBB1_4
+; CHECK-NEXT:  # %bb.3:                                #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    sub     9, 9, 4
+; CHECK-NEXT:  .LBB1_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    clrlwi  9, 9, 16
+; CHECK-NEXT:    slw 9, 9, 3
+; CHECK-NEXT:    and 10, 8, 6
+; CHECK-NEXT:    or 10, 10, 9
+; CHECK-NEXT:  .LBB1_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB1_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 9, 0, 5
+; CHECK-NEXT:    cmplw   9, 8
+; CHECK-NEXT:    bne     0, .LBB1_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_5 Depth=2
+; CHECK-NEXT:    stwcx. 10, 0, 5
+; CHECK-NEXT:    bne     0, .LBB1_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 9, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw usub_cond ptr %ptr, i16 %val seq_cst
@@ -114,34 +111,33 @@ define i32 @atomicrmw_usub_cond_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    lwz 6, 0(3)
 ; CHECK-NEXT:    b .LBB2_2
-; CHECK-NEXT:  .LBB2_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB2_7
-; CHECK-NEXT:  .LBB2_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB2_5 Depth 2
-; CHECK-NEXT:    cmplw 6, 4
+; CHECK-NEXT:  .LBB2_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB2_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB2_5 Depth 2
+; CHECK-NEXT:    cmplw   6, 4
 ; CHECK-NEXT:    bge 0, .LBB2_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 6
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    mr      7, 6
 ; CHECK-NEXT:    b .LBB2_5
-; CHECK-NEXT:  .LBB2_4:
-; CHECK-NEXT:    sub 7, 6, 4
-; CHECK-NEXT:  .LBB2_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB2_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB2_4:                                #   in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    sub     7, 6, 4
+; CHECK-NEXT:  .LBB2_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB2_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lwarx 5, 0, 3
-; CHECK-NEXT:    cmpw 5, 6
-; CHECK-NEXT:    bne 0, .LBB2_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmplw   5, 6
+; CHECK-NEXT:    bne     0, .LBB2_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_5 Depth=2
 ; CHECK-NEXT:    stwcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB2_5
-; CHECK-NEXT:    b .LBB2_1
-; CHECK-NEXT:  .LBB2_7: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB2_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -155,34 +151,33 @@ define i64 @atomicrmw_usub_cond_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    ld 6, 0(3)
 ; CHECK-NEXT:    b .LBB3_2
-; CHECK-NEXT:  .LBB3_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB3_7
-; CHECK-NEXT:  .LBB3_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB3_5 Depth 2
-; CHECK-NEXT:    cmpld 6, 4
+; CHECK-NEXT:  .LBB3_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB3_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB3_5 Depth 2
+; CHECK-NEXT:    cmpld   6, 4
 ; CHECK-NEXT:    bge 0, .LBB3_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 6
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    mr      7, 6
 ; CHECK-NEXT:    b .LBB3_5
-; CHECK-NEXT:  .LBB3_4:
-; CHECK-NEXT:    sub 7, 6, 4
-; CHECK-NEXT:  .LBB3_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB3_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB3_4:                                #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    sub     7, 6, 4
+; CHECK-NEXT:  .LBB3_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB3_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldarx 5, 0, 3
-; CHECK-NEXT:    cmpd 5, 6
-; CHECK-NEXT:    bne 0, .LBB3_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld   5, 6
+; CHECK-NEXT:    bne     0, .LBB3_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_5 Depth=2
 ; CHECK-NEXT:    stdcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB3_5
-; CHECK-NEXT:    b .LBB3_1
-; CHECK-NEXT:  .LBB3_7: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB3_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -194,52 +189,49 @@ define i8 @atomicrmw_usub_sat_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_usub_sat_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    rlwinm 6, 5, 3, 27, 28
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    xori 6, 6, 24
-; CHECK-NEXT:    li 7, 255
-; CHECK-NEXT:    clrlwi 4, 4, 24
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 7, 7, 6
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    not     3, 3
+; CHECK-NEXT:    li 6, 255
+; CHECK-NEXT:    lwz 7, 0(5)
+; CHECK-NEXT:    rlwinm 3, 3, 3, 27, 28
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  4, 4, 24
 ; CHECK-NEXT:    b .LBB4_2
-; CHECK-NEXT:  .LBB4_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 10, 6
-; CHECK-NEXT:    cmplw 3, 8
-; CHECK-NEXT:    beq 0, .LBB4_7
-; CHECK-NEXT:  .LBB4_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB4_5 Depth 2
-; CHECK-NEXT:    clrlwi 8, 3, 24
-; CHECK-NEXT:    sub 3, 8, 4
-; CHECK-NEXT:    cmplw 3, 8
+; CHECK-NEXT:  .LBB4_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  .LBB4_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB4_5 Depth 2
+; CHECK-NEXT:    srw 8, 7, 3
+; CHECK-NEXT:    clrlwi  9, 8, 24
+; CHECK-NEXT:    sub     8, 9, 4
+; CHECK-NEXT:    cmplw   8, 9
 ; CHECK-NEXT:    li 9, 0
-; CHECK-NEXT:    bgt 0, .LBB4_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 9, 3
-; CHECK-NEXT:  .LBB4_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 9, 6
-; CHECK-NEXT:    slw 9, 8, 6
-; CHECK-NEXT:    and 3, 3, 7
-; CHECK-NEXT:    and 9, 9, 7
-; CHECK-NEXT:  .LBB4_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB4_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 11, 0, 5
-; CHECK-NEXT:    and 10, 11, 7
-; CHECK-NEXT:    cmpw 10, 9
-; CHECK-NEXT:    bne 0, .LBB4_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 11, 11, 7
-; CHECK-NEXT:    or 11, 11, 3
-; CHECK-NEXT:    stwcx. 11, 0, 5
-; CHECK-NEXT:    bne 0, .LBB4_5
-; CHECK-NEXT:    b .LBB4_1
-; CHECK-NEXT:  .LBB4_7: # %atomicrmw.end
+; CHECK-NEXT:    bgt     0, .LBB4_4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mr      9, 8
+; CHECK-NEXT:  .LBB4_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    slw 8, 9, 3
+; CHECK-NEXT:    and 9, 7, 6
+; CHECK-NEXT:    or 9, 9, 8
+; CHECK-NEXT:  .LBB4_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB4_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 8, 0, 5
+; CHECK-NEXT:    cmplw   8, 7
+; CHECK-NEXT:    bne     0, .LBB4_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_5 Depth=2
+; CHECK-NEXT:    stwcx. 9, 0, 5
+; CHECK-NEXT:    bne     0, .LBB4_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 8, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw usub_sat ptr %ptr, i8 %val seq_cst
@@ -250,53 +242,51 @@ define i16 @atomicrmw_usub_sat_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_usub_sat_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    li 7, 0
-; CHECK-NEXT:    lhz 3, 0(3)
-; CHECK-NEXT:    rlwinm 6, 5, 3, 27, 27
-; CHECK-NEXT:    xori 6, 6, 16
-; CHECK-NEXT:    ori 7, 7, 65535
-; CHECK-NEXT:    clrlwi 4, 4, 16
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 7, 7, 6
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    clrlwi  3, 3, 30
+; CHECK-NEXT:    lis 6, 0
+; CHECK-NEXT:    xori 3, 3, 2
+; CHECK-NEXT:    lwz 7, 0(5)
+; CHECK-NEXT:    ori 6, 6, 65535
+; CHECK-NEXT:    slwi 3, 3, 3
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  4, 4, 16
 ; CHECK-NEXT:    b .LBB5_2
-; CHECK-NEXT:  .LBB5_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 10, 6
-; CHECK-NEXT:    cmplw 3, 8
-; CHECK-NEXT:    beq 0, .LBB5_7
-; CHECK-NEXT:  .LBB5_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB5_5 Depth 2
-; CHECK-NEXT:    clrlwi 8, 3, 16
-; CHECK-NEXT:    sub 3, 8, 4
-; CHECK-NEXT:    cmplw 3, 8
+; CHECK-NEXT:  .LBB5_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  .LBB5_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB5_5 Depth 2
+; CHECK-NEXT:    srw 8, 7, 3
+; CHECK-NEXT:    clrlwi  9, 8, 16
+; CHECK-NEXT:    sub     8, 9, 4
+; CHECK-NEXT:    cmplw   8, 9
 ; CHECK-NEXT:    li 9, 0
-; CHECK-NEXT:    bgt 0, .LBB5_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 9, 3
-; CHECK-NEXT:  .LBB5_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 9, 6
-; CHECK-NEXT:    slw 9, 8, 6
-; CHECK-NEXT:    and 3, 3, 7
-; CHECK-NEXT:    and 9, 9, 7
-; CHECK-NEXT:  .LBB5_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB5_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 11, 0, 5
-; CHECK-NEXT:    and 10, 11, 7
-; CHECK-NEXT:    cmpw 10, 9
-; CHECK-NEXT:    bne 0, .LBB5_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 11, 11, 7
-; CHECK-NEXT:    or 11, 11, 3
-; CHECK-NEXT:    stwcx. 11, 0, 5
-; CHECK-NEXT:    bne 0, .LBB5_5
-; CHECK-NEXT:    b .LBB5_1
-; CHECK-NEXT:  .LBB5_7: # %atomicrmw.end
+; CHECK-NEXT:    bgt     0, .LBB5_4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    mr      9, 8
+; CHECK-NEXT:  .LBB5_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    slw 8, 9, 3
+; CHECK-NEXT:    and 9, 7, 6
+; CHECK-NEXT:    or 9, 9, 8
+; CHECK-NEXT:  .LBB5_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB5_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 8, 0, 5
+; CHECK-NEXT:    cmplw   8, 7
+; CHECK-NEXT:    bne     0, .LBB5_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_5 Depth=2
+; CHECK-NEXT:    stwcx. 9, 0, 5
+; CHECK-NEXT:    bne     0, .LBB5_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 8, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw usub_sat ptr %ptr, i16 %val seq_cst
@@ -309,33 +299,32 @@ define i32 @atomicrmw_usub_sat_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    lwz 6, 0(3)
 ; CHECK-NEXT:    b .LBB6_2
-; CHECK-NEXT:  .LBB6_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB6_6
-; CHECK-NEXT:  .LBB6_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB6_4 Depth 2
-; CHECK-NEXT:    sub 5, 6, 4
-; CHECK-NEXT:    cmplw 5, 6
+; CHECK-NEXT:  .LBB6_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB6_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB6_4 Depth 2
+; CHECK-NEXT:    sub     5, 6, 4
+; CHECK-NEXT:    cmplw   5, 6
 ; CHECK-NEXT:    li 7, 0
-; CHECK-NEXT:    bgt 0, .LBB6_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 5
-; CHECK-NEXT:  .LBB6_4: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB6_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    bgt     0, .LBB6_4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT:    mr      7, 5
+; CHECK-NEXT:  .LBB6_4:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB6_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lwarx 5, 0, 3
-; CHECK-NEXT:    cmpw 5, 6
-; CHECK-NEXT:    bne 0, .LBB6_1
-; CHECK-NEXT:  # %bb.5: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmplw   5, 6
+; CHECK-NEXT:    bne     0, .LBB6_1
+; CHECK-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_4 Depth=2
 ; CHECK-NEXT:    stwcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB6_4
-; CHECK-NEXT:    b .LBB6_1
-; CHECK-NEXT:  .LBB6_6: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB6_4
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.7:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -349,33 +338,32 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    ld 6, 0(3)
 ; CHECK-NEXT:    b .LBB7_2
-; CHECK-NEXT:  .LBB7_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB7_6
-; CHECK-NEXT:  .LBB7_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB7_4 Depth 2
-; CHECK-NEXT:    subc 5, 6, 4
+; CHECK-NEXT:  .LBB7_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB7_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB7_4 Depth 2
+; CHECK-NEXT:    subc    5, 6, 4
 ; CHECK-NEXT:    li 7, 0
 ; CHECK-NEXT:    addze. 8, 7
-; CHECK-NEXT:    beq 0, .LBB7_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    mr 7, 5
-; CHECK-NEXT:  .LBB7_4: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB7_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    beq     0, .LBB7_4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    mr      7, 5
+; CHECK-NEXT:  .LBB7_4:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB7_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldarx 5, 0, 3
-; CHECK-NEXT:    cmpd 5, 6
-; CHECK-NEXT:    bne 0, .LBB7_1
-; CHECK-NEXT:  # %bb.5: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld   5, 6
+; CHECK-NEXT:    bne     0, .LBB7_1
+; CHECK-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_4 Depth=2
 ; CHECK-NEXT:    stdcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB7_4
-; CHECK-NEXT:    b .LBB7_1
-; CHECK-NEXT:  .LBB7_6: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB7_4
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.7:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
index 2882dc420b608..6ced47bd6bcba 100644
--- a/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/PowerPC/atomicrmw-uinc-udec-wrap.ll
@@ -5,51 +5,49 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    rlwinm 6, 5, 3, 27, 28
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    xori 6, 6, 24
-; CHECK-NEXT:    li 7, 255
-; CHECK-NEXT:    clrlwi 4, 4, 24
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 7, 7, 6
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    not     3, 3
+; CHECK-NEXT:    li 6, 255
+; CHECK-NEXT:    lwz 7, 0(5)
+; CHECK-NEXT:    rlwinm 3, 3, 3, 27, 28
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  4, 4, 24
 ; CHECK-NEXT:    b .LBB0_2
-; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 10, 6
-; CHECK-NEXT:    cmplw 3, 8
-; CHECK-NEXT:    beq 0, .LBB0_7
-; CHECK-NEXT:  .LBB0_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB0_5 Depth 2
-; CHECK-NEXT:    clrlwi 8, 3, 24
-; CHECK-NEXT:    cmplw 8, 4
+; CHECK-NEXT:  .LBB0_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  .LBB0_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB0_5 Depth 2
+; CHECK-NEXT:    srw 8, 7, 3
+; CHECK-NEXT:    clrlwi  9, 8, 24
+; CHECK-NEXT:    cmplw   9, 4
 ; CHECK-NEXT:    li 9, 0
 ; CHECK-NEXT:    bge 0, .LBB0_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 9, 3, 1
-; CHECK-NEXT:  .LBB0_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 9, 6
-; CHECK-NEXT:    slw 9, 8, 6
-; CHECK-NEXT:    and 3, 3, 7
-; CHECK-NEXT:    and 9, 9, 7
-; CHECK-NEXT:  .LBB0_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB0_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 11, 0, 5
-; CHECK-NEXT:    and 10, 11, 7
-; CHECK-NEXT:    cmpw 10, 9
-; CHECK-NEXT:    bne 0, .LBB0_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 11, 11, 7
-; CHECK-NEXT:    or 11, 11, 3
-; CHECK-NEXT:    stwcx. 11, 0, 5
-; CHECK-NEXT:    bne 0, .LBB0_5
-; CHECK-NEXT:    b .LBB0_1
-; CHECK-NEXT:  .LBB0_7: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    addi 9, 8, 1
+; CHECK-NEXT:  .LBB0_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    clrlwi  8, 9, 24
+; CHECK-NEXT:    slw 8, 8, 3
+; CHECK-NEXT:    and 9, 7, 6
+; CHECK-NEXT:    or 9, 9, 8
+; CHECK-NEXT:  .LBB0_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 8, 0, 5
+; CHECK-NEXT:    cmplw   8, 7
+; CHECK-NEXT:    bne     0, .LBB0_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB0_5 Depth=2
+; CHECK-NEXT:    stwcx. 9, 0, 5
+; CHECK-NEXT:    bne     0, .LBB0_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 8, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
@@ -60,52 +58,51 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_uinc_wrap_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    li 7, 0
-; CHECK-NEXT:    lhz 3, 0(3)
-; CHECK-NEXT:    rlwinm 6, 5, 3, 27, 27
-; CHECK-NEXT:    xori 6, 6, 16
-; CHECK-NEXT:    ori 7, 7, 65535
-; CHECK-NEXT:    clrlwi 4, 4, 16
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 7, 7, 6
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    clrlwi  3, 3, 30
+; CHECK-NEXT:    lis 6, 0
+; CHECK-NEXT:    xori 3, 3, 2
+; CHECK-NEXT:    lwz 7, 0(5)
+; CHECK-NEXT:    ori 6, 6, 65535
+; CHECK-NEXT:    slwi 3, 3, 3
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  4, 4, 16
 ; CHECK-NEXT:    b .LBB1_2
-; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 10, 6
-; CHECK-NEXT:    cmplw 3, 8
-; CHECK-NEXT:    beq 0, .LBB1_7
-; CHECK-NEXT:  .LBB1_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB1_5 Depth 2
-; CHECK-NEXT:    clrlwi 8, 3, 16
-; CHECK-NEXT:    cmplw 8, 4
+; CHECK-NEXT:  .LBB1_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  .LBB1_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB1_5 Depth 2
+; CHECK-NEXT:    srw 8, 7, 3
+; CHECK-NEXT:    clrlwi  9, 8, 16
+; CHECK-NEXT:    cmplw   9, 4
 ; CHECK-NEXT:    li 9, 0
 ; CHECK-NEXT:    bge 0, .LBB1_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 9, 3, 1
-; CHECK-NEXT:  .LBB1_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 9, 6
-; CHECK-NEXT:    slw 9, 8, 6
-; CHECK-NEXT:    and 3, 3, 7
-; CHECK-NEXT:    and 9, 9, 7
-; CHECK-NEXT:  .LBB1_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB1_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 11, 0, 5
-; CHECK-NEXT:    and 10, 11, 7
-; CHECK-NEXT:    cmpw 10, 9
-; CHECK-NEXT:    bne 0, .LBB1_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 11, 11, 7
-; CHECK-NEXT:    or 11, 11, 3
-; CHECK-NEXT:    stwcx. 11, 0, 5
-; CHECK-NEXT:    bne 0, .LBB1_5
-; CHECK-NEXT:    b .LBB1_1
-; CHECK-NEXT:  .LBB1_7: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    addi 9, 8, 1
+; CHECK-NEXT:  .LBB1_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    clrlwi  8, 9, 16
+; CHECK-NEXT:    slw 8, 8, 3
+; CHECK-NEXT:    and 9, 7, 6
+; CHECK-NEXT:    or 9, 9, 8
+; CHECK-NEXT:  .LBB1_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB1_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 8, 0, 5
+; CHECK-NEXT:    cmplw   8, 7
+; CHECK-NEXT:    bne     0, .LBB1_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB1_5 Depth=2
+; CHECK-NEXT:    stwcx. 9, 0, 5
+; CHECK-NEXT:    bne     0, .LBB1_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      7, 8
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 8, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
@@ -118,32 +115,31 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    lwz 6, 0(3)
 ; CHECK-NEXT:    b .LBB2_2
-; CHECK-NEXT:  .LBB2_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB2_6
-; CHECK-NEXT:  .LBB2_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB2_4 Depth 2
-; CHECK-NEXT:    cmplw 6, 4
+; CHECK-NEXT:  .LBB2_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB2_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB2_4 Depth 2
+; CHECK-NEXT:    cmplw   6, 4
 ; CHECK-NEXT:    li 7, 0
 ; CHECK-NEXT:    bge 0, .LBB2_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT:    addi 7, 6, 1
-; CHECK-NEXT:  .LBB2_4: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB2_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB2_4:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB2_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lwarx 5, 0, 3
-; CHECK-NEXT:    cmpw 5, 6
-; CHECK-NEXT:    bne 0, .LBB2_1
-; CHECK-NEXT:  # %bb.5: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmplw   5, 6
+; CHECK-NEXT:    bne     0, .LBB2_1
+; CHECK-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB2_4 Depth=2
 ; CHECK-NEXT:    stwcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB2_4
-; CHECK-NEXT:    b .LBB2_1
-; CHECK-NEXT:  .LBB2_6: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB2_4
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.7:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -157,32 +153,31 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    ld 6, 0(3)
 ; CHECK-NEXT:    b .LBB3_2
-; CHECK-NEXT:  .LBB3_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB3_6
-; CHECK-NEXT:  .LBB3_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB3_4 Depth 2
-; CHECK-NEXT:    cmpld 6, 4
+; CHECK-NEXT:  .LBB3_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB3_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB3_4 Depth 2
+; CHECK-NEXT:    cmpld   6, 4
 ; CHECK-NEXT:    li 7, 0
 ; CHECK-NEXT:    bge 0, .LBB3_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_2 Depth=1
 ; CHECK-NEXT:    addi 7, 6, 1
-; CHECK-NEXT:  .LBB3_4: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB3_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB3_4:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB3_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldarx 5, 0, 3
-; CHECK-NEXT:    cmpd 5, 6
-; CHECK-NEXT:    bne 0, .LBB3_1
-; CHECK-NEXT:  # %bb.5: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld   5, 6
+; CHECK-NEXT:    bne     0, .LBB3_1
+; CHECK-NEXT:  # %bb.5:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB3_4 Depth=2
 ; CHECK-NEXT:    stdcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB3_4
-; CHECK-NEXT:    b .LBB3_1
-; CHECK-NEXT:  .LBB3_6: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB3_4
+; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.7:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -194,52 +189,50 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; CHECK-LABEL: atomicrmw_udec_wrap_i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    rlwinm 7, 5, 3, 27, 28
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    xori 7, 7, 24
-; CHECK-NEXT:    li 8, 255
-; CHECK-NEXT:    clrlwi 6, 4, 24
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 8, 8, 7
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    not     3, 3
+; CHECK-NEXT:    li 6, 255
+; CHECK-NEXT:    lwz 8, 0(5)
+; CHECK-NEXT:    rlwinm 3, 3, 3, 27, 28
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  7, 4, 24
 ; CHECK-NEXT:    b .LBB4_2
-; CHECK-NEXT:  .LBB4_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 11, 7
-; CHECK-NEXT:    cmplw 3, 9
-; CHECK-NEXT:    beq 0, .LBB4_7
-; CHECK-NEXT:  .LBB4_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB4_5 Depth 2
-; CHECK-NEXT:    andi. 9, 3, 255
-; CHECK-NEXT:    cmplw 1, 9, 6
+; CHECK-NEXT:  .LBB4_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  .LBB4_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB4_5 Depth 2
+; CHECK-NEXT:    srw 9, 8, 3
+; CHECK-NEXT:    andi. 10, 9, 255
+; CHECK-NEXT:    cmplw 1, 10, 7
 ; CHECK-NEXT:    cror 20, 2, 5
-; CHECK-NEXT:    mr 10, 4
+; CHECK-NEXT:    mr      10, 4
 ; CHECK-NEXT:    bc 12, 20, .LBB4_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 10, 3, -1
-; CHECK-NEXT:  .LBB4_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 10, 7
-; CHECK-NEXT:    slw 10, 9, 7
-; CHECK-NEXT:    and 3, 3, 8
-; CHECK-NEXT:    and 10, 10, 8
-; CHECK-NEXT:  .LBB4_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB4_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 12, 0, 5
-; CHECK-NEXT:    and 11, 12, 8
-; CHECK-NEXT:    cmpw 11, 10
-; CHECK-NEXT:    bne 0, .LBB4_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 12, 12, 8
-; CHECK-NEXT:    or 12, 12, 3
-; CHECK-NEXT:    stwcx. 12, 0, 5
-; CHECK-NEXT:    bne 0, .LBB4_5
-; CHECK-NEXT:    b .LBB4_1
-; CHECK-NEXT:  .LBB4_7: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    addi 10, 9, -1
+; CHECK-NEXT:  .LBB4_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_2 Depth=1
+; CHECK-NEXT:    clrlwi  9, 10, 24
+; CHECK-NEXT:    slw 9, 9, 3
+; CHECK-NEXT:    and 10, 8, 6
+; CHECK-NEXT:    or 10, 10, 9
+; CHECK-NEXT:  .LBB4_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB4_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 9, 0, 5
+; CHECK-NEXT:    cmplw   9, 8
+; CHECK-NEXT:    bne     0, .LBB4_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB4_5 Depth=2
+; CHECK-NEXT:    stwcx. 10, 0, 5
+; CHECK-NEXT:    bne     0, .LBB4_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 9, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
@@ -250,53 +243,52 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; CHECK-LABEL: atomicrmw_udec_wrap_i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    sync
-; CHECK-NEXT:    mr 5, 3
-; CHECK-NEXT:    li 8, 0
-; CHECK-NEXT:    lhz 3, 0(3)
-; CHECK-NEXT:    rlwinm 7, 5, 3, 27, 27
-; CHECK-NEXT:    xori 7, 7, 16
-; CHECK-NEXT:    ori 8, 8, 65535
-; CHECK-NEXT:    clrlwi 6, 4, 16
-; CHECK-NEXT:    rldicr 5, 5, 0, 61
-; CHECK-NEXT:    slw 8, 8, 7
+; CHECK-NEXT:    rldicr 5, 3, 0, 61
+; CHECK-NEXT:    clrlwi  3, 3, 30
+; CHECK-NEXT:    lis 6, 0
+; CHECK-NEXT:    xori 3, 3, 2
+; CHECK-NEXT:    lwz 8, 0(5)
+; CHECK-NEXT:    ori 6, 6, 65535
+; CHECK-NEXT:    slwi 3, 3, 3
+; CHECK-NEXT:    slw 6, 6, 3
+; CHECK-NEXT:    not     6, 6
+; CHECK-NEXT:    clrlwi  7, 4, 16
 ; CHECK-NEXT:    b .LBB5_2
-; CHECK-NEXT:  .LBB5_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    srw 3, 11, 7
-; CHECK-NEXT:    cmplw 3, 9
-; CHECK-NEXT:    beq 0, .LBB5_7
-; CHECK-NEXT:  .LBB5_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB5_5 Depth 2
-; CHECK-NEXT:    andi. 9, 3, 65535
-; CHECK-NEXT:    cmplw 1, 9, 6
+; CHECK-NEXT:  .LBB5_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  .LBB5_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB5_5 Depth 2
+; CHECK-NEXT:    srw 9, 8, 3
+; CHECK-NEXT:    andi. 10, 9, 65535
+; CHECK-NEXT:    cmplw 1, 10, 7
 ; CHECK-NEXT:    cror 20, 2, 5
-; CHECK-NEXT:    mr 10, 4
+; CHECK-NEXT:    mr      10, 4
 ; CHECK-NEXT:    bc 12, 20, .LBB5_4
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    addi 10, 3, -1
-; CHECK-NEXT:  .LBB5_4: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    slw 3, 10, 7
-; CHECK-NEXT:    slw 10, 9, 7
-; CHECK-NEXT:    and 3, 3, 8
-; CHECK-NEXT:    and 10, 10, 8
-; CHECK-NEXT:  .LBB5_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB5_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    lwarx 12, 0, 5
-; CHECK-NEXT:    and 11, 12, 8
-; CHECK-NEXT:    cmpw 11, 10
-; CHECK-NEXT:    bne 0, .LBB5_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    andc 12, 12, 8
-; CHECK-NEXT:    or 12, 12, 3
-; CHECK-NEXT:    stwcx. 12, 0, 5
-; CHECK-NEXT:    bne 0, .LBB5_5
-; CHECK-NEXT:    b .LBB5_1
-; CHECK-NEXT:  .LBB5_7: # %atomicrmw.end
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    addi 10, 9, -1
+; CHECK-NEXT:  .LBB5_4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_2 Depth=1
+; CHECK-NEXT:    clrlwi  9, 10, 16
+; CHECK-NEXT:    slw 9, 9, 3
+; CHECK-NEXT:    and 10, 8, 6
+; CHECK-NEXT:    or 10, 10, 9
+; CHECK-NEXT:  .LBB5_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB5_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
+; CHECK-NEXT:    lwarx 9, 0, 5
+; CHECK-NEXT:    cmplw   9, 8
+; CHECK-NEXT:    bne     0, .LBB5_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB5_5 Depth=2
+; CHECK-NEXT:    stwcx. 10, 0, 5
+; CHECK-NEXT:    bne     0, .LBB5_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      8, 9
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    srw 3, 9, 3
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
@@ -309,37 +301,36 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    lwz 6, 0(3)
 ; CHECK-NEXT:    b .LBB6_2
-; CHECK-NEXT:  .LBB6_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB6_7
-; CHECK-NEXT:  .LBB6_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB6_5 Depth 2
-; CHECK-NEXT:    cmpwi 6, 0
-; CHECK-NEXT:    mr 7, 4
+; CHECK-NEXT:  .LBB6_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB6_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB6_5 Depth 2
+; CHECK-NEXT:    cmpwi   6, 0
+; CHECK-NEXT:    mr      7, 4
 ; CHECK-NEXT:    bc 12, 2, .LBB6_5
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmplw 6, 4
-; CHECK-NEXT:    mr 7, 4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
+; CHECK-NEXT:    cmplw   6, 4
+; CHECK-NEXT:    mr      7, 4
 ; CHECK-NEXT:    bc 12, 1, .LBB6_5
-; CHECK-NEXT:  # %bb.4: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:  # %bb.4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_2 Depth=1
 ; CHECK-NEXT:    addi 7, 6, -1
-; CHECK-NEXT:  .LBB6_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB6_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB6_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB6_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    lwarx 5, 0, 3
-; CHECK-NEXT:    cmpw 5, 6
-; CHECK-NEXT:    bne 0, .LBB6_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmplw   5, 6
+; CHECK-NEXT:    bne     0, .LBB6_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB6_5 Depth=2
 ; CHECK-NEXT:    stwcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB6_5
-; CHECK-NEXT:    b .LBB6_1
-; CHECK-NEXT:  .LBB6_7: # %atomicrmw.end
+; CHECK-NEXT:    bne     0, .LBB6_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
 ; CHECK-NEXT:    mr 3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
@@ -353,38 +344,37 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; CHECK-NEXT:    sync
 ; CHECK-NEXT:    ld 6, 0(3)
 ; CHECK-NEXT:    b .LBB7_2
-; CHECK-NEXT:  .LBB7_1: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    mr 6, 5
-; CHECK-NEXT:    beq 0, .LBB7_7
-; CHECK-NEXT:  .LBB7_2: # %atomicrmw.start
-; CHECK-NEXT:    # =>This Loop Header: Depth=1
-; CHECK-NEXT:    # Child Loop BB7_5 Depth 2
-; CHECK-NEXT:    cmpdi 6, 0
-; CHECK-NEXT:    mr 7, 4
+; CHECK-NEXT:  .LBB7_1:                                # %cmpxchg.nostore
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  .LBB7_2:                                # %atomicrmw.start
+; CHECK-NEXT:                                          # =>This Loop Header: Depth=1
+; CHECK-NEXT:                                          #     Child Loop BB7_5 Depth 2
+; CHECK-NEXT:    cmpdi   6, 0
+; CHECK-NEXT:    mr      7, 4
 ; CHECK-NEXT:    bc 12, 2, .LBB7_5
-; CHECK-NEXT:  # %bb.3: # %atomicrmw.start
-; CHECK-NEXT:    #
-; CHECK-NEXT:    cmpld 6, 4
-; CHECK-NEXT:    mr 7, 4
+; CHECK-NEXT:  # %bb.3:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
+; CHECK-NEXT:    cmpld   6, 4
+; CHECK-NEXT:    mr      7, 4
 ; CHECK-NEXT:    bc 12, 1, .LBB7_5
-; CHECK-NEXT:  # %bb.4: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:  # %bb.4:                                # %atomicrmw.start
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_2 Depth=1
 ; CHECK-NEXT:    addi 7, 6, -1
-; CHECK-NEXT:  .LBB7_5: # %atomicrmw.start
-; CHECK-NEXT:    # Parent Loop BB7_2 Depth=1
-; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  .LBB7_5:                                # %cmpxchg.start
+; CHECK-NEXT:                                          #   Parent Loop BB7_2 Depth=1
+; CHECK-NEXT:                                          # =>  This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    ldarx 5, 0, 3
-; CHECK-NEXT:    cmpd 5, 6
-; CHECK-NEXT:    bne 0, .LBB7_1
-; CHECK-NEXT:  # %bb.6: # %atomicrmw.start
-; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld   5, 6
+; CHECK-NEXT:    bne     0, .LBB7_1
+; CHECK-NEXT:  # %bb.6:                                # %cmpxchg.fencedstore
+; CHECK-NEXT:                                          #   in Loop: Header=BB7_5 Depth=2
 ; CHECK-NEXT:    stdcx. 7, 0, 3
-; CHECK-NEXT:    bne 0, .LBB7_5
-; CHECK-NEXT:    b .LBB7_1
-; CHECK-NEXT:  .LBB7_7: # %atomicrmw.end
-; CHECK-NEXT:    mr 3, 5
+; CHECK-NEXT:    bne     0, .LBB7_5
+; CHECK-NEXT:  # %bb.7:
+; CHECK-NEXT:    mr      6, 5
+; CHECK-NEXT:  # %bb.8:                                # %atomicrmw.end
+; CHECK-NEXT:    mr      3, 5
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %result = atomicrmw udec_wrap ptr %ptr, i64 %val seq_cst
diff --git a/llvm/test/CodeGen/PowerPC/atomics-regression.ll b/llvm/test/CodeGen/PowerPC/atomics-regression.ll
index b31be701454da..280c4299c30b7 100644
--- a/llvm/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics-regression.ll
@@ -400,15 +400,20 @@ define void @test39() {
 define void @test40(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test40:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB40_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB40_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB40_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val monotonic monotonic
   ret void
@@ -417,15 +422,20 @@ define void @test40(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test41(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test41:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB41_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB41_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB41_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB41_1
-; PPC64LE-NEXT:  .LBB41_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire monotonic
@@ -435,15 +445,20 @@ define void @test41(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test42(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test42:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB42_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB42_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB42_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB42_1
-; PPC64LE-NEXT:  .LBB42_3:
+; PPC64LE-NEXT:  .LBB42_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire acquire
@@ -452,17 +467,26 @@ define void @test42(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test43(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test43:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB43_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB43_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB43_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB43_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val release monotonic
   ret void
@@ -470,17 +494,27 @@ define void @test43(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test44(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test44:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB44_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB44_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB44_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB44_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB44_1
-; PPC64LE-NEXT:  .LBB44_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB44_2
+; PPC64LE-NEXT:  .LBB44_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val release acquire
@@ -489,17 +523,29 @@ define void @test44(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test45(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test45:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB45_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB45_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB45_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB45_1
-; PPC64LE-NEXT:  .LBB45_3:
+; PPC64LE-NEXT:    beq 0, .LBB45_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB45_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB45_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acq_rel monotonic
@@ -508,17 +554,27 @@ define void @test45(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test46(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test46:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB46_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB46_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB46_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB46_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB46_1
-; PPC64LE-NEXT:  .LBB46_3:
+; PPC64LE-NEXT:    beq 0, .LBB46_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB46_2
+; PPC64LE-NEXT:  .LBB46_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acq_rel acquire
@@ -527,17 +583,29 @@ define void @test46(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test47(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test47:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB47_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB47_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB47_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB47_1
-; PPC64LE-NEXT:  .LBB47_3:
+; PPC64LE-NEXT:    beq 0, .LBB47_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB47_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB47_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val seq_cst monotonic
@@ -546,17 +614,27 @@ define void @test47(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test48(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test48:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB48_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB48_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB48_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB48_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB48_1
-; PPC64LE-NEXT:  .LBB48_3:
+; PPC64LE-NEXT:    beq 0, .LBB48_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB48_2
+; PPC64LE-NEXT:  .LBB48_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val seq_cst acquire
@@ -565,17 +643,27 @@ define void @test48(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test49(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test49:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB49_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB49_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB49_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB49_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB49_1
-; PPC64LE-NEXT:  .LBB49_3:
+; PPC64LE-NEXT:    beq 0, .LBB49_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB49_2
+; PPC64LE-NEXT:  .LBB49_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val seq_cst seq_cst
@@ -585,15 +673,20 @@ define void @test49(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test50(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test50:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB50_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB50_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB50_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val monotonic monotonic
   ret void
@@ -602,15 +695,20 @@ define void @test50(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test51(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test51:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB51_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB51_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB51_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB51_1
-; PPC64LE-NEXT:  .LBB51_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire monotonic
@@ -620,15 +718,20 @@ define void @test51(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test52(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test52:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB52_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB52_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB52_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB52_1
-; PPC64LE-NEXT:  .LBB52_3:
+; PPC64LE-NEXT:  .LBB52_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire acquire
@@ -637,17 +740,26 @@ define void @test52(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test53(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test53:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB53_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB53_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB53_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB53_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val release monotonic
   ret void
@@ -655,17 +767,27 @@ define void @test53(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test54(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test54:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB54_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB54_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB54_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB54_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB54_1
-; PPC64LE-NEXT:  .LBB54_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB54_2
+; PPC64LE-NEXT:  .LBB54_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val release acquire
@@ -674,17 +796,29 @@ define void @test54(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test55(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test55:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB55_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB55_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB55_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB55_1
-; PPC64LE-NEXT:  .LBB55_3:
+; PPC64LE-NEXT:    beq 0, .LBB55_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB55_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB55_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acq_rel monotonic
@@ -693,17 +827,27 @@ define void @test55(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test56(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test56:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB56_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB56_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB56_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB56_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB56_1
-; PPC64LE-NEXT:  .LBB56_3:
+; PPC64LE-NEXT:    beq 0, .LBB56_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB56_2
+; PPC64LE-NEXT:  .LBB56_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acq_rel acquire
@@ -712,17 +856,29 @@ define void @test56(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test57(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test57:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB57_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB57_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB57_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB57_1
-; PPC64LE-NEXT:  .LBB57_3:
+; PPC64LE-NEXT:    beq 0, .LBB57_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB57_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB57_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val seq_cst monotonic
@@ -731,17 +887,27 @@ define void @test57(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test58(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test58:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB58_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB58_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB58_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB58_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB58_1
-; PPC64LE-NEXT:  .LBB58_3:
+; PPC64LE-NEXT:    beq 0, .LBB58_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB58_2
+; PPC64LE-NEXT:  .LBB58_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val seq_cst acquire
@@ -750,17 +916,27 @@ define void @test58(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test59(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test59:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB59_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB59_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB59_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB59_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB59_1
-; PPC64LE-NEXT:  .LBB59_3:
+; PPC64LE-NEXT:    beq 0, .LBB59_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB59_2
+; PPC64LE-NEXT:  .LBB59_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val seq_cst seq_cst
@@ -770,14 +946,17 @@ define void @test59(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test60(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test60:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB60_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB60_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB60_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val monotonic monotonic
   ret void
@@ -786,14 +965,17 @@ define void @test60(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test61(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test61:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB61_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB61_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB61_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB61_1
-; PPC64LE-NEXT:  .LBB61_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire monotonic
@@ -803,14 +985,17 @@ define void @test61(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test62(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test62:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB62_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB62_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB62_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB62_1
-; PPC64LE-NEXT:  .LBB62_3:
+; PPC64LE-NEXT:  .LBB62_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire
@@ -819,16 +1004,22 @@ define void @test62(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test63(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test63:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB63_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB63_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB63_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB63_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val release monotonic
   ret void
@@ -836,16 +1027,23 @@ define void @test63(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test64(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test64:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB64_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB64_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB64_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB64_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB64_1
-; PPC64LE-NEXT:  .LBB64_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB64_2
+; PPC64LE-NEXT:  .LBB64_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val release acquire
@@ -854,16 +1052,25 @@ define void @test64(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test65(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test65:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB65_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB65_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB65_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB65_1
-; PPC64LE-NEXT:  .LBB65_3:
+; PPC64LE-NEXT:    beq 0, .LBB65_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB65_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB65_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acq_rel monotonic
@@ -872,16 +1079,23 @@ define void @test65(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test66(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test66:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB66_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB66_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB66_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB66_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB66_1
-; PPC64LE-NEXT:  .LBB66_3:
+; PPC64LE-NEXT:    beq 0, .LBB66_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB66_2
+; PPC64LE-NEXT:  .LBB66_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acq_rel acquire
@@ -890,16 +1104,25 @@ define void @test66(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test67(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test67:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB67_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB67_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB67_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB67_1
-; PPC64LE-NEXT:  .LBB67_3:
+; PPC64LE-NEXT:    beq 0, .LBB67_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB67_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB67_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val seq_cst monotonic
@@ -908,16 +1131,23 @@ define void @test67(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test68(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test68:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB68_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB68_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB68_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB68_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB68_1
-; PPC64LE-NEXT:  .LBB68_3:
+; PPC64LE-NEXT:    beq 0, .LBB68_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB68_2
+; PPC64LE-NEXT:  .LBB68_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val seq_cst acquire
@@ -926,16 +1156,23 @@ define void @test68(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test69(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test69:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB69_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB69_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB69_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB69_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB69_1
-; PPC64LE-NEXT:  .LBB69_3:
+; PPC64LE-NEXT:    beq 0, .LBB69_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB69_2
+; PPC64LE-NEXT:  .LBB69_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val seq_cst seq_cst
@@ -945,14 +1182,17 @@ define void @test69(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test70(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test70:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB70_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB70_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB70_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val monotonic monotonic
   ret void
@@ -961,14 +1201,17 @@ define void @test70(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test71(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test71:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB71_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB71_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB71_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB71_1
-; PPC64LE-NEXT:  .LBB71_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire monotonic
@@ -978,14 +1221,17 @@ define void @test71(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test72(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test72:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB72_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB72_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB72_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB72_1
-; PPC64LE-NEXT:  .LBB72_3:
+; PPC64LE-NEXT:  .LBB72_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire acquire
@@ -994,16 +1240,22 @@ define void @test72(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test73(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test73:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB73_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB73_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB73_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB73_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val release monotonic
   ret void
@@ -1011,16 +1263,23 @@ define void @test73(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test74(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test74:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB74_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB74_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB74_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB74_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB74_1
-; PPC64LE-NEXT:  .LBB74_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB74_2
+; PPC64LE-NEXT:  .LBB74_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val release acquire
@@ -1029,16 +1288,25 @@ define void @test74(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test75(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test75:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB75_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB75_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB75_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB75_1
-; PPC64LE-NEXT:  .LBB75_3:
+; PPC64LE-NEXT:    beq 0, .LBB75_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB75_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB75_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acq_rel monotonic
@@ -1047,16 +1315,23 @@ define void @test75(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test76(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test76:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB76_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB76_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB76_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB76_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB76_1
-; PPC64LE-NEXT:  .LBB76_3:
+; PPC64LE-NEXT:    beq 0, .LBB76_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB76_2
+; PPC64LE-NEXT:  .LBB76_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acq_rel acquire
@@ -1065,16 +1340,25 @@ define void @test76(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test77(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test77:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB77_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB77_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB77_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB77_1
-; PPC64LE-NEXT:  .LBB77_3:
+; PPC64LE-NEXT:    beq 0, .LBB77_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB77_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB77_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val seq_cst monotonic
@@ -1083,16 +1367,23 @@ define void @test77(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test78(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test78:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB78_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB78_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB78_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB78_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB78_1
-; PPC64LE-NEXT:  .LBB78_3:
+; PPC64LE-NEXT:    beq 0, .LBB78_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB78_2
+; PPC64LE-NEXT:  .LBB78_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val seq_cst acquire
@@ -1101,16 +1392,23 @@ define void @test78(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test79(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test79:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB79_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB79_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB79_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB79_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB79_1
-; PPC64LE-NEXT:  .LBB79_3:
+; PPC64LE-NEXT:    beq 0, .LBB79_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB79_2
+; PPC64LE-NEXT:  .LBB79_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val seq_cst seq_cst
@@ -1120,15 +1418,20 @@ define void @test79(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test80(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test80:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB80_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB80_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB80_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") monotonic monotonic
   ret void
@@ -1137,15 +1440,20 @@ define void @test80(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test81(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test81:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB81_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB81_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB81_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB81_1
-; PPC64LE-NEXT:  .LBB81_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire monotonic
@@ -1155,15 +1463,20 @@ define void @test81(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test82(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test82:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB82_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB82_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB82_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB82_1
-; PPC64LE-NEXT:  .LBB82_3:
+; PPC64LE-NEXT:  .LBB82_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") acquire acquire
@@ -1172,17 +1485,26 @@ define void @test82(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test83(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test83:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB83_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB83_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB83_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB83_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") release monotonic
   ret void
@@ -1190,17 +1512,27 @@ define void @test83(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test84(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test84:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB84_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB84_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB84_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB84_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB84_1
-; PPC64LE-NEXT:  .LBB84_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB84_2
+; PPC64LE-NEXT:  .LBB84_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") release acquire
@@ -1209,17 +1541,29 @@ define void @test84(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test85(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test85:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB85_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB85_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB85_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB85_1
-; PPC64LE-NEXT:  .LBB85_3:
+; PPC64LE-NEXT:    beq 0, .LBB85_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB85_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB85_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel monotonic
@@ -1228,17 +1572,27 @@ define void @test85(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test86(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test86:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB86_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB86_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB86_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB86_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB86_1
-; PPC64LE-NEXT:  .LBB86_3:
+; PPC64LE-NEXT:    beq 0, .LBB86_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB86_2
+; PPC64LE-NEXT:  .LBB86_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") acq_rel acquire
@@ -1247,17 +1601,29 @@ define void @test86(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test87(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test87:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB87_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB87_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB87_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB87_1
-; PPC64LE-NEXT:  .LBB87_3:
+; PPC64LE-NEXT:    beq 0, .LBB87_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB87_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB87_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst monotonic
@@ -1266,17 +1632,27 @@ define void @test87(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test88(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test88:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB88_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB88_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB88_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB88_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB88_1
-; PPC64LE-NEXT:  .LBB88_3:
+; PPC64LE-NEXT:    beq 0, .LBB88_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB88_2
+; PPC64LE-NEXT:  .LBB88_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst acquire
@@ -1285,17 +1661,27 @@ define void @test88(ptr %ptr, i8 %cmp, i8 %val) {
 
 define void @test89(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test89:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB89_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB89_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 24
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB89_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB89_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB89_1
-; PPC64LE-NEXT:  .LBB89_3:
+; PPC64LE-NEXT:    beq 0, .LBB89_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB89_2
+; PPC64LE-NEXT:  .LBB89_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i8 %cmp, i8 %val syncscope("singlethread") seq_cst seq_cst
@@ -1305,15 +1691,20 @@ define void @test89(ptr %ptr, i8 %cmp, i8 %val) {
 define void @test90(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test90:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB90_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB90_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB90_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") monotonic monotonic
   ret void
@@ -1322,15 +1713,20 @@ define void @test90(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test91(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test91:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB91_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB91_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB91_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB91_1
-; PPC64LE-NEXT:  .LBB91_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire monotonic
@@ -1340,15 +1736,20 @@ define void @test91(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test92(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test92:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
 ; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB92_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB92_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB92_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB92_1
-; PPC64LE-NEXT:  .LBB92_3:
+; PPC64LE-NEXT:  .LBB92_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") acquire acquire
@@ -1357,17 +1758,26 @@ define void @test92(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test93(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test93:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB93_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB93_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB93_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB93_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") release monotonic
   ret void
@@ -1375,17 +1785,27 @@ define void @test93(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test94(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test94:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB94_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB94_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB94_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB94_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB94_1
-; PPC64LE-NEXT:  .LBB94_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB94_2
+; PPC64LE-NEXT:  .LBB94_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") release acquire
@@ -1394,17 +1814,29 @@ define void @test94(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test95(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test95:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB95_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB95_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB95_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB95_1
-; PPC64LE-NEXT:  .LBB95_3:
+; PPC64LE-NEXT:    beq 0, .LBB95_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB95_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB95_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel monotonic
@@ -1413,17 +1845,27 @@ define void @test95(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test96(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test96:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB96_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB96_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB96_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB96_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB96_1
-; PPC64LE-NEXT:  .LBB96_3:
+; PPC64LE-NEXT:    beq 0, .LBB96_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB96_2
+; PPC64LE-NEXT:  .LBB96_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") acq_rel acquire
@@ -1432,17 +1874,29 @@ define void @test96(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test97(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test97:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB97_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB97_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB97_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB97_1
-; PPC64LE-NEXT:  .LBB97_3:
+; PPC64LE-NEXT:    beq 0, .LBB97_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB97_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB97_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst monotonic
@@ -1451,17 +1905,27 @@ define void @test97(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test98(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test98:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB98_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB98_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB98_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB98_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB98_1
-; PPC64LE-NEXT:  .LBB98_3:
+; PPC64LE-NEXT:    beq 0, .LBB98_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB98_2
+; PPC64LE-NEXT:  .LBB98_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst acquire
@@ -1470,17 +1934,27 @@ define void @test98(ptr %ptr, i16 %cmp, i16 %val) {
 
 define void @test99(ptr %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test99:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:    clrlwi 4, 4, 16
-; PPC64LE-NEXT:  .LBB99_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lharx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB99_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    clrlwi 4, 4, 16
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB99_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    clrlwi 5, 5, 16
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB99_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB99_1
-; PPC64LE-NEXT:  .LBB99_3:
+; PPC64LE-NEXT:    beq 0, .LBB99_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    clrlwi 6, 6, 16
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB99_2
+; PPC64LE-NEXT:  .LBB99_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i16 %cmp, i16 %val syncscope("singlethread") seq_cst seq_cst
@@ -1490,14 +1964,17 @@ define void @test99(ptr %ptr, i16 %cmp, i16 %val) {
 define void @test100(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test100:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB100_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB100_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB100_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") monotonic monotonic
   ret void
@@ -1506,14 +1983,17 @@ define void @test100(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test101(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test101:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB101_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB101_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB101_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB101_1
-; PPC64LE-NEXT:  .LBB101_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire monotonic
@@ -1523,14 +2003,17 @@ define void @test101(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test102(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test102:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB102_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB102_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB102_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB102_1
-; PPC64LE-NEXT:  .LBB102_3:
+; PPC64LE-NEXT:  .LBB102_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") acquire acquire
@@ -1539,16 +2022,22 @@ define void @test102(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test103(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test103:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB103_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB103_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB103_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB103_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") release monotonic
   ret void
@@ -1556,16 +2045,23 @@ define void @test103(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test104(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test104:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB104_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB104_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB104_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB104_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB104_1
-; PPC64LE-NEXT:  .LBB104_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB104_2
+; PPC64LE-NEXT:  .LBB104_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") release acquire
@@ -1574,16 +2070,25 @@ define void @test104(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test105(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test105:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB105_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB105_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB105_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB105_1
-; PPC64LE-NEXT:  .LBB105_3:
+; PPC64LE-NEXT:    beq 0, .LBB105_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB105_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB105_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel monotonic
@@ -1592,16 +2097,23 @@ define void @test105(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test106(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test106:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB106_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB106_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB106_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB106_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB106_1
-; PPC64LE-NEXT:  .LBB106_3:
+; PPC64LE-NEXT:    beq 0, .LBB106_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB106_2
+; PPC64LE-NEXT:  .LBB106_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") acq_rel acquire
@@ -1610,16 +2122,25 @@ define void @test106(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test107(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test107:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB107_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB107_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB107_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB107_1
-; PPC64LE-NEXT:  .LBB107_3:
+; PPC64LE-NEXT:    beq 0, .LBB107_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB107_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB107_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst monotonic
@@ -1628,16 +2149,23 @@ define void @test107(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test108(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test108:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB108_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB108_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB108_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB108_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB108_1
-; PPC64LE-NEXT:  .LBB108_3:
+; PPC64LE-NEXT:    beq 0, .LBB108_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB108_2
+; PPC64LE-NEXT:  .LBB108_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst acquire
@@ -1646,16 +2174,23 @@ define void @test108(ptr %ptr, i32 %cmp, i32 %val) {
 
 define void @test109(ptr %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test109:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB109_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB109_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB109_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB109_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB109_1
-; PPC64LE-NEXT:  .LBB109_3:
+; PPC64LE-NEXT:    beq 0, .LBB109_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmplw 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB109_2
+; PPC64LE-NEXT:  .LBB109_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i32 %cmp, i32 %val syncscope("singlethread") seq_cst seq_cst
@@ -1665,14 +2200,17 @@ define void @test109(ptr %ptr, i32 %cmp, i32 %val) {
 define void @test110(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test110:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB110_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB110_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB110_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.end
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") monotonic monotonic
   ret void
@@ -1681,14 +2219,17 @@ define void @test110(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test111(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test111:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB111_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB111_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB111_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB111_1
-; PPC64LE-NEXT:  .LBB111_3:
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire monotonic
@@ -1698,14 +2239,17 @@ define void @test111(ptr %ptr, i64 %cmp, i64 %val) {
 define void @test112(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test112:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:  .LBB112_1:
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB112_1: # %cmpxchg.start
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bne 0, .LBB112_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.2: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB112_1
-; PPC64LE-NEXT:  .LBB112_3:
+; PPC64LE-NEXT:  .LBB112_3: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") acquire acquire
@@ -1714,16 +2258,22 @@ define void @test112(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test113(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test113:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB113_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
+; PPC64LE-NEXT:    cmpld 6, 4
 ; PPC64LE-NEXT:    bnelr 0
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB113_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB113_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB113_2
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") release monotonic
   ret void
@@ -1731,16 +2281,23 @@ define void @test113(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test114(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test114:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB114_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB114_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB114_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB114_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB114_1
-; PPC64LE-NEXT:  .LBB114_3:
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB114_2
+; PPC64LE-NEXT:  .LBB114_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") release acquire
@@ -1749,16 +2306,25 @@ define void @test114(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test115(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test115:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB115_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB115_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB115_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB115_1
-; PPC64LE-NEXT:  .LBB115_3:
+; PPC64LE-NEXT:    beq 0, .LBB115_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB115_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB115_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel monotonic
@@ -1767,16 +2333,23 @@ define void @test115(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test116(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test116:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:  .LBB116_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB116_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB116_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB116_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB116_1
-; PPC64LE-NEXT:  .LBB116_3:
+; PPC64LE-NEXT:    beq 0, .LBB116_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB116_2
+; PPC64LE-NEXT:  .LBB116_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") acq_rel acquire
@@ -1785,16 +2358,25 @@ define void @test116(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test117(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test117:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB117_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB117_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bnelr 0
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB117_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB117_1
-; PPC64LE-NEXT:  .LBB117_3:
+; PPC64LE-NEXT:    beq 0, .LBB117_5
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB117_2
+; PPC64LE-NEXT:  # %bb.4: # %cmpxchg.end
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB117_5: # %cmpxchg.success
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst monotonic
@@ -1803,16 +2385,23 @@ define void @test117(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test118(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test118:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB118_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB118_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB118_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB118_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB118_1
-; PPC64LE-NEXT:  .LBB118_3:
+; PPC64LE-NEXT:    beq 0, .LBB118_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB118_2
+; PPC64LE-NEXT:  .LBB118_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst acquire
@@ -1821,16 +2410,23 @@ define void @test118(ptr %ptr, i64 %cmp, i64 %val) {
 
 define void @test119(ptr %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test119:
-; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    sync
-; PPC64LE-NEXT:  .LBB119_1:
+; PPC64LE:       # %bb.0: # %cmpxchg.start
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
-; PPC64LE-NEXT:    cmpd 6, 4
-; PPC64LE-NEXT:    bne 0, .LBB119_3
-; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    bne 0, .LBB119_4
+; PPC64LE-NEXT:  # %bb.1: # %cmpxchg.fencedstore
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB119_2: # %cmpxchg.trystore
+; PPC64LE-NEXT:    #
 ; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    bne 0, .LBB119_1
-; PPC64LE-NEXT:  .LBB119_3:
+; PPC64LE-NEXT:    beq 0, .LBB119_4
+; PPC64LE-NEXT:  # %bb.3: # %cmpxchg.releasedload
+; PPC64LE-NEXT:    #
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpld 6, 4
+; PPC64LE-NEXT:    beq 0, .LBB119_2
+; PPC64LE-NEXT:  .LBB119_4: # %cmpxchg.nostore
 ; PPC64LE-NEXT:    lwsync
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg ptr %ptr, i64 %cmp, i64 %val syncscope("singlethread") seq_cst seq_cst
diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll
index 24e71c87414e8..40786057ead5f 100644
--- a/llvm/test/CodeGen/PowerPC/atomics.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics.ll
@@ -139,59 +139,67 @@ define void @store_i64_seq_cst(ptr %mem) {
 define i8 @cas_strong_i8_sc_sc(ptr %mem) {
 ; PPC32-LABEL: cas_strong_i8_sc_sc:
 ; PPC32:       # %bb.0:
-; PPC32-NEXT:    rlwinm r8, r3, 3, 27, 28
-; PPC32-NEXT:    li r5, 1
-; PPC32-NEXT:    li r6, 0
-; PPC32-NEXT:    li r7, 255
-; PPC32-NEXT:    rlwinm r4, r3, 0, 0, 29
-; PPC32-NEXT:    xori r3, r8, 24
-; PPC32-NEXT:    slw r8, r5, r3
-; PPC32-NEXT:    slw r9, r6, r3
-; PPC32-NEXT:    slw r5, r7, r3
-; PPC32-NEXT:    and r6, r8, r5
-; PPC32-NEXT:    and r7, r9, r5
+; PPC32-NEXT:    rlwinm r5, r3, 0, 0, 29
+; PPC32-NEXT:    lwarx r4, 0, r5
+; PPC32-NEXT:    not     r3, r3
+; PPC32-NEXT:    rlwinm r3, r3, 3, 27, 28
+; PPC32-NEXT:    srw r6, r4, r3
+; PPC32-NEXT:    andi. r6, r6, 255
+; PPC32-NEXT:    bne     cr0, .LBB8_4
+; PPC32-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
+; PPC32-NEXT:    li r6, 255
+; PPC32-NEXT:    li r7, 1
+; PPC32-NEXT:    slw r6, r6, r3
+; PPC32-NEXT:    not     r6, r6
+; PPC32-NEXT:    slw r7, r7, r3
 ; PPC32-NEXT:    sync
-; PPC32-NEXT:  .LBB8_1:
-; PPC32-NEXT:    lwarx r9, 0, r4
-; PPC32-NEXT:    and r8, r9, r5
-; PPC32-NEXT:    cmpw r8, r7
-; PPC32-NEXT:    bne cr0, .LBB8_3
-; PPC32-NEXT:  # %bb.2:
-; PPC32-NEXT:    andc r9, r9, r5
-; PPC32-NEXT:    or r9, r9, r6
-; PPC32-NEXT:    stwcx. r9, 0, r4
-; PPC32-NEXT:    bne cr0, .LBB8_1
-; PPC32-NEXT:  .LBB8_3:
-; PPC32-NEXT:    srw r3, r8, r3
+; PPC32-NEXT:  .LBB8_2:                                # %cmpxchg.trystore
+; PPC32-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; PPC32-NEXT:    and r8, r4, r6
+; PPC32-NEXT:    or r8, r8, r7
+; PPC32-NEXT:    stwcx. r8, 0, r5
+; PPC32-NEXT:    beq     cr0, .LBB8_4
+; PPC32-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; PPC32-NEXT:                                          #   in Loop: Header=BB8_2 Depth=1
+; PPC32-NEXT:    lwarx r4, 0, r5
+; PPC32-NEXT:    srw r8, r4, r3
+; PPC32-NEXT:    andi. r8, r8, 255
+; PPC32-NEXT:    beq     cr0, .LBB8_2
+; PPC32-NEXT:  .LBB8_4:                                # %cmpxchg.nostore
+; PPC32-NEXT:    srw r3, r4, r3
 ; PPC32-NEXT:    lwsync
 ; PPC32-NEXT:    blr
 ;
 ; PPC64-LABEL: cas_strong_i8_sc_sc:
 ; PPC64:       # %bb.0:
-; PPC64-NEXT:    rlwinm r8, r3, 3, 27, 28
-; PPC64-NEXT:    li r5, 1
-; PPC64-NEXT:    li r6, 0
-; PPC64-NEXT:    li r7, 255
-; PPC64-NEXT:    rldicr r4, r3, 0, 61
-; PPC64-NEXT:    xori r3, r8, 24
-; PPC64-NEXT:    slw r8, r5, r3
-; PPC64-NEXT:    slw r9, r6, r3
-; PPC64-NEXT:    slw r5, r7, r3
-; PPC64-NEXT:    and r6, r8, r5
-; PPC64-NEXT:    and r7, r9, r5
+; PPC64-NEXT:    rldicr r5, r3, 0, 61
+; PPC64-NEXT:    not     r3, r3
+; PPC64-NEXT:    lwarx r4, 0, r5
+; PPC64-NEXT:    rlwinm r3, r3, 3, 27, 28
+; PPC64-NEXT:    srw r6, r4, r3
+; PPC64-NEXT:    andi. r6, r6, 255
+; PPC64-NEXT:    bne     cr0, .LBB8_4
+; PPC64-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
+; PPC64-NEXT:    li r6, 255
+; PPC64-NEXT:    li r7, 1
+; PPC64-NEXT:    slw r6, r6, r3
+; PPC64-NEXT:    not     r6, r6
+; PPC64-NEXT:    slw r7, r7, r3
 ; PPC64-NEXT:    sync
-; PPC64-NEXT:  .LBB8_1:
-; PPC64-NEXT:    lwarx r9, 0, r4
-; PPC64-NEXT:    and r8, r9, r5
-; PPC64-NEXT:    cmpw r8, r7
-; PPC64-NEXT:    bne cr0, .LBB8_3
-; PPC64-NEXT:  # %bb.2:
-; PPC64-NEXT:    andc r9, r9, r5
-; PPC64-NEXT:    or r9, r9, r6
-; PPC64-NEXT:    stwcx. r9, 0, r4
-; PPC64-NEXT:    bne cr0, .LBB8_1
-; PPC64-NEXT:  .LBB8_3:
-; PPC64-NEXT:    srw r3, r8, r3
+; PPC64-NEXT:  .LBB8_2:                                # %cmpxchg.trystore
+; PPC64-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; PPC64-NEXT:    and r8, r4, r6
+; PPC64-NEXT:    or r8, r8, r7
+; PPC64-NEXT:    stwcx. r8, 0, r5
+; PPC64-NEXT:    beq     cr0, .LBB8_4
+; PPC64-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; PPC64-NEXT:                                          #   in Loop: Header=BB8_2 Depth=1
+; PPC64-NEXT:    lwarx r4, 0, r5
+; PPC64-NEXT:    srw r8, r4, r3
+; PPC64-NEXT:    andi. r8, r8, 255
+; PPC64-NEXT:    beq     cr0, .LBB8_2
+; PPC64-NEXT:  .LBB8_4:                                # %cmpxchg.nostore
+; PPC64-NEXT:    srw r3, r4, r3
 ; PPC64-NEXT:    lwsync
 ; PPC64-NEXT:    blr
   %val = cmpxchg ptr %mem, i8 0, i8 1 seq_cst seq_cst
@@ -201,57 +209,53 @@ define i8 @cas_strong_i8_sc_sc(ptr %mem) {
 define i16 @cas_weak_i16_acquire_acquire(ptr %mem) {
 ; PPC32-LABEL: cas_weak_i16_acquire_acquire:
 ; PPC32:       # %bb.0:
-; PPC32-NEXT:    li r6, 0
-; PPC32-NEXT:    rlwinm r4, r3, 3, 27, 27
-; PPC32-NEXT:    li r5, 1
-; PPC32-NEXT:    ori r7, r6, 65535
-; PPC32-NEXT:    xori r4, r4, 16
-; PPC32-NEXT:    slw r8, r5, r4
-; PPC32-NEXT:    slw r9, r6, r4
-; PPC32-NEXT:    slw r5, r7, r4
-; PPC32-NEXT:    rlwinm r3, r3, 0, 0, 29
-; PPC32-NEXT:    and r6, r8, r5
-; PPC32-NEXT:    and r7, r9, r5
-; PPC32-NEXT:  .LBB9_1:
-; PPC32-NEXT:    lwarx r9, 0, r3
-; PPC32-NEXT:    and r8, r9, r5
-; PPC32-NEXT:    cmpw r8, r7
-; PPC32-NEXT:    bne cr0, .LBB9_3
-; PPC32-NEXT:  # %bb.2:
-; PPC32-NEXT:    andc r9, r9, r5
-; PPC32-NEXT:    or r9, r9, r6
-; PPC32-NEXT:    stwcx. r9, 0, r3
-; PPC32-NEXT:    bne cr0, .LBB9_1
-; PPC32-NEXT:  .LBB9_3:
-; PPC32-NEXT:    srw r3, r8, r4
+; PPC32-NEXT:    rlwinm r4, r3, 0, 0, 29
+; PPC32-NEXT:    lwarx r5, 0, r4
+; PPC32-NEXT:    clrlwi  r3, r3, 30
+; PPC32-NEXT:    xori r3, r3, 2
+; PPC32-NEXT:    slwi r6, r3, 3
+; PPC32-NEXT:    srw r3, r5, r6
+; PPC32-NEXT:    andi. r7, r3, 65535
+; PPC32-NEXT:    beq     cr0, .LBB9_2
+; PPC32-NEXT:  # %bb.1:                                # %cmpxchg.failure
+; PPC32-NEXT:    lwsync
+; PPC32-NEXT:    blr
+; PPC32-NEXT:  .LBB9_2:                                # %cmpxchg.fencedstore
+; PPC32-NEXT:    lis r7, 0
+; PPC32-NEXT:    ori r7, r7, 65535
+; PPC32-NEXT:    slw r7, r7, r6
+; PPC32-NEXT:    li r8, 1
+; PPC32-NEXT:    not     r7, r7
+; PPC32-NEXT:    slw r6, r8, r6
+; PPC32-NEXT:    and r5, r5, r7
+; PPC32-NEXT:    or r5, r5, r6
+; PPC32-NEXT:    stwcx. r5, 0, r4
 ; PPC32-NEXT:    lwsync
 ; PPC32-NEXT:    blr
 ;
 ; PPC64-LABEL: cas_weak_i16_acquire_acquire:
 ; PPC64:       # %bb.0:
-; PPC64-NEXT:    li r6, 0
-; PPC64-NEXT:    rlwinm r4, r3, 3, 27, 27
-; PPC64-NEXT:    li r5, 1
-; PPC64-NEXT:    ori r7, r6, 65535
-; PPC64-NEXT:    xori r4, r4, 16
-; PPC64-NEXT:    slw r8, r5, r4
-; PPC64-NEXT:    slw r9, r6, r4
-; PPC64-NEXT:    slw r5, r7, r4
-; PPC64-NEXT:    rldicr r3, r3, 0, 61
-; PPC64-NEXT:    and r6, r8, r5
-; PPC64-NEXT:    and r7, r9, r5
-; PPC64-NEXT:  .LBB9_1:
-; PPC64-NEXT:    lwarx r9, 0, r3
-; PPC64-NEXT:    and r8, r9, r5
-; PPC64-NEXT:    cmpw r8, r7
-; PPC64-NEXT:    bne cr0, .LBB9_3
-; PPC64-NEXT:  # %bb.2:
-; PPC64-NEXT:    andc r9, r9, r5
-; PPC64-NEXT:    or r9, r9, r6
-; PPC64-NEXT:    stwcx. r9, 0, r3
-; PPC64-NEXT:    bne cr0, .LBB9_1
-; PPC64-NEXT:  .LBB9_3:
-; PPC64-NEXT:    srw r3, r8, r4
+; PPC64-NEXT:   rldicr r4, r3, 0, 61
+; PPC64-NEXT:    clrlwi  r3, r3, 30
+; PPC64-NEXT:    lwarx r5, 0, r4
+; PPC64-NEXT:    xori r3, r3, 2
+; PPC64-NEXT:    slwi r6, r3, 3
+; PPC64-NEXT:    srw r3, r5, r6
+; PPC64-NEXT:    andi. r7, r3, 65535
+; PPC64-NEXT:    beq     cr0, .LBB9_2
+; PPC64-NEXT:  # %bb.1:                                # %cmpxchg.failure
+; PPC64-NEXT:    lwsync
+; PPC64-NEXT:    blr
+; PPC64-NEXT:  .LBB9_2:                                # %cmpxchg.fencedstore
+; PPC64-NEXT:    lis r7, 0
+; PPC64-NEXT:    ori r7, r7, 65535
+; PPC64-NEXT:    slw r7, r7, r6
+; PPC64-NEXT:    li r8, 1
+; PPC64-NEXT:    not     r7, r7
+; PPC64-NEXT:    slw r6, r8, r6
+; PPC64-NEXT:    and r5, r5, r7
+; PPC64-NEXT:    or r5, r5, r6
+; PPC64-NEXT:    stwcx. r5, 0, r4
 ; PPC64-NEXT:    lwsync
 ; PPC64-NEXT:    blr
   %val = cmpxchg weak ptr %mem, i16 0, i16 1 acquire acquire
@@ -261,17 +265,23 @@ define i16 @cas_weak_i16_acquire_acquire(ptr %mem) {
 define i32 @cas_strong_i32_acqrel_acquire(ptr %mem) {
 ; CHECK-LABEL: cas_strong_i32_acqrel_acquire:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    mr      r4, r3
+; CHECK-NEXT:    lwarx r3, 0, r3
+; CHECK-NEXT:    cmplwi  r3, 0
+; CHECK-NEXT:    bne     cr0, .LBB10_4
+; CHECK-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
 ; CHECK-NEXT:    li r5, 1
 ; CHECK-NEXT:    lwsync
-; CHECK-NEXT:  .LBB10_1:
-; CHECK-NEXT:    lwarx r4, 0, r3
-; CHECK-NEXT:    cmpwi r4, 0
-; CHECK-NEXT:    bne cr0, .LBB10_3
-; CHECK-NEXT:  # %bb.2:
-; CHECK-NEXT:    stwcx. r5, 0, r3
-; CHECK-NEXT:    bne cr0, .LBB10_1
-; CHECK-NEXT:  .LBB10_3:
-; CHECK-NEXT:    mr r3, r4
+; CHECK-NEXT:  .LBB10_2:                               # %cmpxchg.trystore
+; CHECK-NEXT:                                          # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    stwcx. r5, 0, r4
+; CHECK-NEXT:    beq     cr0, .LBB10_4
+; CHECK-NEXT:  # %bb.3:                                # %cmpxchg.releasedload
+; CHECK-NEXT:                                          #   in Loop: Header=BB10_2 Depth=1
+; CHECK-NEXT:    lwarx r3, 0, r4
+; CHECK-NEXT:    cmplwi  r3, 0
+; CHECK-NEXT:    beq     cr0, .LBB10_2
+; CHECK-NEXT:  .LBB10_4:                               # %cmpxchg.nostore
 ; CHECK-NEXT:    lwsync
 ; CHECK-NEXT:    blr
   %val = cmpxchg ptr %mem, i32 0, i32 1 acq_rel acquire
@@ -304,17 +314,14 @@ define i64 @cas_weak_i64_release_monotonic(ptr %mem) {
 ;
 ; PPC64-LABEL: cas_weak_i64_release_monotonic:
 ; PPC64:       # %bb.0:
+; PPC64-NEXT:    mr      r4, r3
+; PPC64-NEXT:    ldarx r3, 0, r3
+; PPC64-NEXT:    cmpldi  r3, 0
+; PPC64-NEXT:    bnelr   cr0
+; PPC64-NEXT:  # %bb.1:                                # %cmpxchg.fencedstore
 ; PPC64-NEXT:    li r5, 1
 ; PPC64-NEXT:    lwsync
-; PPC64-NEXT:  .LBB11_1:
-; PPC64-NEXT:    ldarx r4, 0, r3
-; PPC64-NEXT:    cmpdi r4, 0
-; PPC64-NEXT:    bne cr0, .LBB11_3
-; PPC64-NEXT:  # %bb.2:
-; PPC64-NEXT:    stdcx. r5, 0, r3
-; PPC64-NEXT:    bne cr0, .LBB11_1
-; PPC64-NEXT:  .LBB11_3:
-; PPC64-NEXT:    mr r3, r4
+; PPC64-NEXT:    stdcx. r5, 0, r4
 ; PPC64-NEXT:    blr
   %val = cmpxchg weak ptr %mem, i64 0, i64 1 release monotonic
   %loaded = extractvalue { i64, i1} %val, 0
diff --git a/llvm/test/CodeGen/PowerPC/loop-comment.ll b/llvm/test/CodeGen/PowerPC/loop-comment.ll
index 14f6791fc7792..1fa9dda51ef9e 100644
--- a/llvm/test/CodeGen/PowerPC/loop-comment.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-comment.ll
@@ -4,12 +4,17 @@
 define void @test(ptr %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test:
 ; PPC64LE:       # %bb.0:
+; PPC64LE-NEXT:    clrlwi 5, 5, 24
 ; PPC64LE-NEXT:    clrlwi 4, 4, 24
-; PPC64LE-NEXT:  .LBB0_1:
+; PPC64LE-NEXT:    .p2align        5
+; PPC64LE-NEXT:  .LBB0_1:                                # %cmpxchg.start
+; PPC64LE-NEXT:                                          # =>This Inner Loop Header: Depth=1
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
-; PPC64LE-NEXT:    cmpw 6, 4
+; PPC64LE-NEXT:    clrlwi  6, 6, 24
+; PPC64LE-NEXT:    cmplw 6, 4
 ; PPC64LE-NEXT:    bnelr 0
 ; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:                                          #   in Loop: Header=BB0_1 Depth=1
 ; PPC64LE-NEXT:    stbcx. 5, 0, 3
 ; PPC64LE-NEXT:    bne 0, .LBB0_1
 ; PPC64LE-NEXT:  # %bb.3:
diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
index f787aa7f6a42b..840e2d3eee553 100644
--- a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
@@ -7,19 +7,51 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    %loaded = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %cmpxchg.end ]
+; CHECK-NEXT:    %new = fadd float %loaded, %value
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float %new to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float %loaded to i32
+; CHECK-NEXT:    br label %cmpxchg.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.start:                                    ; preds = %cmpxchg.trystore, %atomicrmw.start
+; CHECK-NEXT:    %larx = call i32 @llvm.ppc.lwarx(ptr %ptr)
+; CHECK-NEXT:    %should_store = icmp eq i32 %larx, [[TMP3]]
+; CHECK-NEXT:    br i1 %should_store, label %cmpxchg.fencedstore, label %cmpxchg.nostore
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.fencedstore:                              ; preds = %cmpxchg.start
+; CHECK-NEXT:    br label %cmpxchg.trystore
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.trystore:                                 ; preds = %cmpxchg.fencedstore
+; CHECK-NEXT:    %loaded.trystore = phi i32 [ %larx, %cmpxchg.fencedstore ]
+; CHECK-NEXT:    %stcx = call i32 @llvm.ppc.stwcx(ptr %ptr, i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 %stcx, 1
+; CHECK-NEXT:    %success1 = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 %success1, label %cmpxchg.success, label %cmpxchg.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.releasedload:                             ; No predecessors!
+; CHECK-NEXT:    unreachable
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.success:                                  ; preds = %cmpxchg.trystore
+; CHECK-NEXT:    br label %cmpxchg.end
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.nostore:                                  ; preds = %cmpxchg.start
+; CHECK-NEXT:    %loaded.nostore = phi i32 [ %larx, %cmpxchg.start ]
+; CHECK-NEXT:    br label %cmpxchg.failure
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.failure:                                  ; preds = %cmpxchg.nostore
+; CHECK-NEXT:    %loaded.failure = phi i32 [ %loaded.nostore, %cmpxchg.nostore ]
+; CHECK-NEXT:    br label %cmpxchg.end
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.end:                                      ; preds = %cmpxchg.failure, %cmpxchg.success
+; CHECK-NEXT:    %loaded.exit = phi i32 [ %loaded.trystore, %cmpxchg.success ], [ %loaded.failure, %cmpxchg.failure ]
+; CHECK-NEXT:    %success2 = phi i1 [ true, %cmpxchg.success ], [ false, %cmpxchg.failure ]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 %loaded.exit to float
+; CHECK-NEXT:    br i1 %success2, label %atomicrmw.end, label %atomicrmw.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  atomicrmw.end:                                    ; preds = %cmpxchg.end
 ; CHECK-NEXT:    call void @llvm.ppc.lwsync()
 ; CHECK-NEXT:    ret float [[TMP5]]
-;
+; CHECK-NEXT:  }
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst
   ret float %res
 }
@@ -28,22 +60,56 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-LABEL: @test_atomicrmw_fsub_f32(
 ; CHECK-NEXT:    call void @llvm.ppc.sync()
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
-; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; CHECK:       atomicrmw.end:
+; CHECK-NEXT:    br label %atomicrmw.start 
+; CHECK-EMPTY:
+; CHECK-NEXT:  atomicrmw.start:
+; CHECK-NEXT:    %loaded = phi float [ [[TMP1]], %0 ], [ [[TMP5:%.*]], %cmpxchg.end ]
+; CHECK-NEXT:    %new = fsub float %loaded, %value
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float %new to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float %loaded to i32
+; CHECK-NEXT:    br label %cmpxchg.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.start:
+; CHECK-NEXT:    %larx = call i32 @llvm.ppc.lwarx(ptr %ptr)
+; CHECK-NEXT:    %should_store = icmp eq i32 %larx, [[TMP3]]
+; CHECK-NEXT:    br i1 %should_store, label %cmpxchg.fencedstore, label %cmpxchg.nostore
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.fencedstore:                              ; preds = %cmpxchg.start
+; CHECK-NEXT:    br label %cmpxchg.trystore
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.trystore:                                 ; preds = %cmpxchg.fencedstore
+; CHECK-NEXT:    %loaded.trystore = phi i32 [ %larx, %cmpxchg.fencedstore ]
+; CHECK-NEXT:    %stcx = call i32 @llvm.ppc.stwcx(ptr %ptr, i32 %2)
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i32 %stcx, 1
+; CHECK-NEXT:    %success1 = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    br i1 %success1, label %cmpxchg.success, label %cmpxchg.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.releasedload:                             ; No predecessors!
+; CHECK-NEXT:    unreachable
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.success:                                  ; preds = %cmpxchg.trystore
+; CHECK-NEXT:    br label %cmpxchg.end
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.nostore:                                  ; preds = %cmpxchg.start
+; CHECK-NEXT:    %loaded.nostore = phi i32 [ %larx, %cmpxchg.start ]
+; CHECK-NEXT:    br label %cmpxchg.failure
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.failure:                                  ; preds = %cmpxchg.nostore
+; CHECK-NEXT:    %loaded.failure = phi i32 [ %loaded.nostore, %cmpxchg.nostore ]
+; CHECK-NEXT:    br label %cmpxchg.end
+; CHECK-EMPTY:
+; CHECK-NEXT:  cmpxchg.end:                                      ; preds = %cmpxchg.failure, %cmpxchg.success
+; CHECK-NEXT:    %loaded.exit = phi i32 [ %loaded.trystore, %cmpxchg.success ], [ %loaded.failure, %cmpxchg.failure ]
+; CHECK-NEXT:    %success2 = phi i1 [ true, %cmpxchg.success ], [ false, %cmpxchg.failure ]
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 %loaded.exit to float
+; CHECK-NEXT:    br i1 %success2, label %atomicrmw.end, label %atomicrmw.start
+; CHECK-EMPTY:
+; CHECK-NEXT:  atomicrmw.end:                                    ; preds = %cmpxchg.end
 ; CHECK-NEXT:    call void @llvm.ppc.lwsync()
 ; CHECK-NEXT:    ret float [[TMP5]]
-;
-  %res = atomicrmw fsub ptr %ptr, float %value seq_cst
+; CHECK-NEXT:  }
+
+%res = atomicrmw fsub ptr %ptr, float %value seq_cst
   ret float %res
 }
 

From 4a47634a0075c49051cb4708a7f54577ecb080f4 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Fri, 13 Jun 2025 14:16:58 +0100
Subject: [PATCH 366/851] [flang][OpenMP] Support substrings and complex part
 refs for DEPEND (#143907)

Fixes #142404

The parser can't tell the difference between array indexing and a
substring: that has to be done in semantics once we have types.
Substrings can only be in the form string([lower]:[higher]) not
string(index) or string(lower:higher:step). I added semantic checks to
catch this for the DEPEND clause.

This patch also adds lowering for correct substrings and for complex
part references.
---
 flang/include/flang/Evaluate/tools.h          |  18 +--
 flang/lib/Lower/OpenMP/ClauseProcessor.cpp    |  13 +--
 flang/lib/Lower/OpenMP/Clauses.cpp            |  11 +-
 flang/lib/Semantics/check-omp-structure.cpp   |  34 ++++++
 flang/test/Lower/OpenMP/depend-complex.f90    |  22 ++++
 flang/test/Lower/OpenMP/depend-substring.f90  | 108 ++++++++++++++++++
 .../Semantics/OpenMP/depend-substring.f90     |  65 +++++++++++
 7 files changed, 250 insertions(+), 21 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/depend-complex.f90
 create mode 100644 flang/test/Lower/OpenMP/depend-substring.f90
 create mode 100644 flang/test/Semantics/OpenMP/depend-substring.f90

diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 4dce1257a6507..1959d5f3a5899 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -490,26 +490,30 @@ template <typename A> std::optional<CoarrayRef> ExtractCoarrayRef(const A &x) {
   }
 }
 
-struct ExtractSubstringHelper {
-  template <typename T> static std::optional<Substring> visit(T &&) {
+template <typename TARGET> struct ExtractFromExprDesignatorHelper {
+  template <typename T> static std::optional<TARGET> visit(T &&) {
     return std::nullopt;
   }
 
-  static std::optional<Substring> visit(const Substring &e) { return e; }
+  static std::optional<TARGET> visit(const TARGET &t) { return t; }
 
   template <typename T>
-  static std::optional<Substring> visit(const Designator<T> &e) {
+  static std::optional<TARGET> visit(const Designator<T> &e) {
     return common::visit([](auto &&s) { return visit(s); }, e.u);
   }
 
-  template <typename T>
-  static std::optional<Substring> visit(const Expr<T> &e) {
+  template <typename T> static std::optional<TARGET> visit(const Expr<T> &e) {
     return common::visit([](auto &&s) { return visit(s); }, e.u);
   }
 };
 
 template <typename A> std::optional<Substring> ExtractSubstring(const A &x) {
-  return ExtractSubstringHelper::visit(x);
+  return ExtractFromExprDesignatorHelper<Substring>::visit(x);
+}
+
+template <typename A>
+std::optional<ComplexPart> ExtractComplexPart(const A &x) {
+  return ExtractFromExprDesignatorHelper<ComplexPart>::visit(x);
 }
 
 // If an expression is simply a whole symbol data designator,
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 88baad8827e92..b5c8de8c2ce8b 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -926,14 +926,10 @@ bool ClauseProcessor::processDepend(lower::SymMap &symMap,
     for (const omp::Object &object : objects) {
       assert(object.ref() && "Expecting designator");
       mlir::Value dependVar;
+      SomeExpr expr = *object.ref();
 
-      if (evaluate::ExtractSubstring(*object.ref())) {
-        TODO(converter.getCurrentLocation(),
-             "substring not supported for task depend");
-      } else if (evaluate::IsArrayElement(*object.ref())) {
-        // Array Section
-        SomeExpr expr = *object.ref();
-
+      if (evaluate::IsArrayElement(expr) || evaluate::ExtractSubstring(expr)) {
+        // Array Section or character (sub)string
         if (isVectorSubscript(expr)) {
           // OpenMP needs the address of the first indexed element (required by
           // the standard to be the lowest index) to identify the dependency. We
@@ -947,7 +943,8 @@ bool ClauseProcessor::processDepend(lower::SymMap &symMap,
               converter.getCurrentLocation(), converter, expr, symMap, stmtCtx);
           dependVar = entity.getBase();
         }
-      } else if (evaluate::isStructureComponent(*object.ref())) {
+      } else if (evaluate::isStructureComponent(expr) ||
+                 evaluate::ExtractComplexPart(expr)) {
         SomeExpr expr = *object.ref();
         hlfir::EntityWithAttributes entity = convertExprToHLFIR(
             converter.getCurrentLocation(), converter, expr, symMap, stmtCtx);
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index f3088b18b77ff..4d0f5c3a127e1 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -70,19 +70,18 @@ struct SymbolAndDesignatorExtractor {
 
   static void verify(const SymbolWithDesignator &sd) {
     const semantics::Symbol *symbol = std::get<0>(sd);
-    assert(symbol && "Expecting symbol");
-    auto &maybeDsg = std::get<1>(sd);
+    const std::optional<evaluate::Expr<evaluate::SomeType>> &maybeDsg =
+        std::get<1>(sd);
     if (!maybeDsg)
       return; // Symbol with no designator -> OK
-    std::optional<evaluate::DataRef> maybeRef =
-        evaluate::ExtractDataRef(*maybeDsg);
+    assert(symbol && "Expecting symbol");
+    std::optional<evaluate::DataRef> maybeRef = evaluate::ExtractDataRef(
+        *maybeDsg, /*intoSubstring=*/true, /*intoComplexPart=*/true);
     if (maybeRef) {
       if (&maybeRef->GetLastSymbol() == symbol)
         return; // Symbol with a designator for it -> OK
       llvm_unreachable("Expecting designator for given symbol");
     } else {
-      // This could still be a Substring or ComplexPart, but at least Substring
-      // is not allowed in OpenMP.
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
       maybeDsg->dump();
 #endif
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 4dccb0e88e324..58d28dce7094a 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -11,6 +11,7 @@
 #include "resolve-names-utils.h"
 #include "flang/Evaluate/check-expression.h"
 #include "flang/Evaluate/expression.h"
+#include "flang/Evaluate/shape.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/expression.h"
@@ -6524,6 +6525,29 @@ void OmpStructureChecker::CheckDependList(const parser::DataRef &d) {
 void OmpStructureChecker::CheckArraySection(
     const parser::ArrayElement &arrayElement, const parser::Name &name,
     const llvm::omp::Clause clause) {
+  // Sometimes substring operations are incorrectly parsed as array accesses.
+  // Detect this by looking for array accesses on character variables which are
+  // not arrays.
+  bool isSubstring{false};
+  evaluate::ExpressionAnalyzer ea{context_};
+  if (MaybeExpr expr = ea.Analyze(arrayElement.base)) {
+    std::optional<evaluate::Shape> shape = evaluate::GetShape(expr);
+    // Not an array: rank 0
+    if (shape && shape->size() == 0) {
+      if (std::optional<evaluate::DynamicType> type = expr->GetType()) {
+        if (type->category() == evaluate::TypeCategory::Character) {
+          // Substrings are explicitly denied by the standard [6.0:163:9-11].
+          // This is supported as an extension. This restriction was added in
+          // OpenMP 5.2.
+          isSubstring = true;
+          context_.Say(GetContext().clauseSource,
+              "The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2."_port_en_US);
+        } else {
+          llvm_unreachable("Array indexing on a variable that isn't an array");
+        }
+      }
+    }
+  }
   if (!arrayElement.subscripts.empty()) {
     for (const auto &subscript : arrayElement.subscripts) {
       if (const auto *triplet{
@@ -6541,6 +6565,10 @@ void OmpStructureChecker::CheckArraySection(
                   name.ToString(),
                   parser::ToUpperCaseLetters(getClauseName(clause).str()));
             }
+            if (isSubstring) {
+              context_.Say(GetContext().clauseSource,
+                  "Cannot specify a step for a substring"_err_en_US);
+            }
           }
           const auto &lower{std::get<0>(triplet->t)};
           const auto &upper{std::get<1>(triplet->t)};
@@ -6564,6 +6592,12 @@ void OmpStructureChecker::CheckArraySection(
             }
           }
         }
+      } else if (std::get_if<parser::IntExpr>(&subscript.u)) {
+        // base(n) is valid as an array index but not as a substring operation
+        if (isSubstring) {
+          context_.Say(GetContext().clauseSource,
+              "Substrings must be in the form parent-string(lb:ub)"_err_en_US);
+        }
       }
     }
   }
diff --git a/flang/test/Lower/OpenMP/depend-complex.f90 b/flang/test/Lower/OpenMP/depend-complex.f90
new file mode 100644
index 0000000000000..488696b565077
--- /dev/null
+++ b/flang/test/Lower/OpenMP/depend-complex.f90
@@ -0,0 +1,22 @@
+! RUN: %flang_fc1 -fopenmp -emit-hlfir -o - %s | FileCheck %s
+
+subroutine depend_complex(z)
+! CHECK-LABEL:   func.func @_QPdepend_complex(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<complex<f32>> {fir.bindc_name = "z"}) {
+  complex :: z
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {uniq_name = "_QFdepend_complexEz"} : (!fir.ref<complex<f32>>, !fir.dscope) -> (!fir.ref<complex<f32>>, !fir.ref<complex<f32>>)
+  !$omp task depend(in:z%re)
+! CHECK:           %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0  real : (!fir.ref<complex<f32>>) -> !fir.ref<f32>
+! CHECK:           omp.task depend(taskdependin -> %[[VAL_2]] : !fir.ref<f32>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+  !$omp end task
+  !$omp task depend(in:z%im)
+! CHECK:           %[[VAL_3:.*]] = hlfir.designate %[[VAL_1]]#0  imag : (!fir.ref<complex<f32>>) -> !fir.ref<f32>
+! CHECK:           omp.task depend(taskdependin -> %[[VAL_3]] : !fir.ref<f32>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+  !$omp end task
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/depend-substring.f90 b/flang/test/Lower/OpenMP/depend-substring.f90
new file mode 100644
index 0000000000000..5de11e06cc10b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/depend-substring.f90
@@ -0,0 +1,108 @@
+! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+subroutine substring_0(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c(:))
+  !$omp end task
+end
+! CHECK-LABEL:   func.func @_QPsubstring_0(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_0Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_5:.*]] = fir.box_elesize %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_6:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_5]] : (!fir.ptr<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_9:.*]] = fir.box_elesize %[[VAL_8]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i64) -> index
+! CHECK:           %[[VAL_12:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_7]] : index
+! CHECK:           %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index
+! CHECK:           %[[VAL_15:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_16:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_15]] : index
+! CHECK:           %[[VAL_17:.*]] = arith.select %[[VAL_16]], %[[VAL_14]], %[[VAL_15]] : index
+! CHECK:           %[[VAL_18:.*]] = hlfir.designate %[[VAL_6]]  substr %[[VAL_7]], %[[VAL_11]]  typeparams %[[VAL_17]] : (!fir.boxchar<1>, index, index, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_19:.*]] = fir.box_addr %[[VAL_18]] : (!fir.boxchar<1>) -> !fir.ref<!fir.char<1,?>>
+! CHECK:           omp.task depend(taskdependout -> %[[VAL_19]] : !fir.ref<!fir.char<1,?>>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine substring_1(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c(2:))
+  !$omp end task
+end
+! CHECK-LABEL:   func.func @_QPsubstring_1(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_1Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_5:.*]] = fir.box_elesize %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_6:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_5]] : (!fir.ptr<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_9:.*]] = fir.box_elesize %[[VAL_8]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i64) -> index
+! CHECK:           %[[VAL_12:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_13:.*]] = arith.subi %[[VAL_11]], %[[VAL_7]] : index
+! CHECK:           %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_12]] : index
+! CHECK:           %[[VAL_15:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_16:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_15]] : index
+! CHECK:           %[[VAL_17:.*]] = arith.select %[[VAL_16]], %[[VAL_14]], %[[VAL_15]] : index
+! CHECK:           %[[VAL_18:.*]] = hlfir.designate %[[VAL_6]]  substr %[[VAL_7]], %[[VAL_11]]  typeparams %[[VAL_17]] : (!fir.boxchar<1>, index, index, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_19:.*]] = fir.box_addr %[[VAL_18]] : (!fir.boxchar<1>) -> !fir.ref<!fir.char<1,?>>
+! CHECK:           omp.task depend(taskdependout -> %[[VAL_19]] : !fir.ref<!fir.char<1,?>>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine substring_2(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c(:2))
+  !$omp end task
+end
+! CHECK-LABEL:   func.func @_QPsubstring_2(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_2Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_5:.*]] = fir.box_elesize %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> index
+! CHECK:           %[[VAL_6:.*]] = fir.emboxchar %[[VAL_3]], %[[VAL_5]] : (!fir.ptr<!fir.char<1,?>>, index) -> !fir.boxchar<1>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : index
+! CHECK:           %[[VAL_8:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_9:.*]] = arith.constant 2 : index
+! CHECK:           %[[VAL_10:.*]] = hlfir.designate %[[VAL_6]]  substr %[[VAL_7]], %[[VAL_8]]  typeparams %[[VAL_9]] : (!fir.boxchar<1>, index, index, index) -> !fir.ref<!fir.char<1,2>>
+! CHECK:           omp.task depend(taskdependout -> %[[VAL_10]] : !fir.ref<!fir.char<1,2>>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine substring_4(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c)
+  !$omp end task
+end
+! CHECK-LABEL:   func.func @_QPsubstring_4(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>> {fir.bindc_name = "c"}) {
+! CHECK:           %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsubstring_4Ec"} : (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>)
+! CHECK:           %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.char<1,?>>>>
+! CHECK:           %[[VAL_3:.*]] = fir.box_addr %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.char<1,?>>>) -> !fir.ptr<!fir.char<1,?>>
+! CHECK:           omp.task depend(taskdependout -> %[[VAL_3]] : !fir.ptr<!fir.char<1,?>>) {
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
diff --git a/flang/test/Semantics/OpenMP/depend-substring.f90 b/flang/test/Semantics/OpenMP/depend-substring.f90
new file mode 100644
index 0000000000000..23d6bb4c0b7b3
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/depend-substring.f90
@@ -0,0 +1,65 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! Test for parsing confusion between array indexing and string subscripts
+
+! This is okay: selects the whole substring
+subroutine substring_0(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !$omp task depend(out:c(:))
+  !$omp end task
+end
+
+! This is okay: selects from the second character onwards
+subroutine substring_1(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !$omp task depend(out:c(2:))
+  !$omp end task
+end
+
+! This is okay: selects the first 2 characters
+subroutine substring_2(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !$omp task depend(out:c(:2))
+  !$omp end task
+end
+
+! Error
+subroutine substring_3(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !ERROR: Substrings must be in the form parent-string(lb:ub)
+  !$omp task depend(out:c(2))
+  !$omp end task
+end
+
+! This is okay: interpreted as indexing into the array not as a substring
+subroutine substring_3b(c)
+  character(:), pointer :: c(:)
+  !$omp task depend(out:c(2))
+  !$omp end task
+end
+
+! This is okay: no indexing or substring at all
+subroutine substring_4(c)
+  character(:), pointer :: c
+  !$omp task depend(out:c)
+  !$omp end task
+end
+
+! This is not okay: substrings can't have a stride
+subroutine substring_5(c)
+  character(:), pointer :: c
+  !PORTABILITY: The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2.
+  !ERROR: Cannot specify a step for a substring
+  !$omp task depend(out:c(1:20:5))
+  !$omp end task
+end
+
+! This is okay: interpreted as indexing the array
+subroutine substring_5b(c)
+  character(:), pointer :: c(:)
+  !$omp task depend(out:c(1:20:5))
+  !$omp end task
+end

From 6ca31ad720ba32bff3664af218ec2d3c29bdd1b0 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Fri, 13 Jun 2025 14:17:39 +0100
Subject: [PATCH 367/851] [flang][OpenMP] improve semantic check for invalid
 goto (#144040)

Fixes #143229
---
 flang/lib/Semantics/resolve-directives.cpp        |  8 ++++++--
 .../Semantics/OpenMP/parallel-master-goto.f90     | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Semantics/OpenMP/parallel-master-goto.f90

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 93bf510fbc3c7..b5f8667fe36f2 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -3023,10 +3023,14 @@ void OmpAttributeVisitor::CheckSourceLabel(const parser::Label &label) {
 void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
     const parser::CharBlock target, std::optional<DirContext> sourceContext,
     std::optional<DirContext> targetContext) {
+  auto dirContextsSame = [](DirContext &lhs, DirContext &rhs) -> bool {
+    // Sometimes nested constructs share a scope but are different contexts
+    return (lhs.scope == rhs.scope) && (lhs.directive == rhs.directive);
+  };
   unsigned version{context_.langOptions().OpenMPVersion};
   if (targetContext &&
       (!sourceContext ||
-          (sourceContext->scope != targetContext->scope &&
+          (!dirContextsSame(*targetContext, *sourceContext) &&
               !DoesScopeContain(
                   &targetContext->scope, sourceContext->scope)))) {
     context_
@@ -3038,7 +3042,7 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
   }
   if (sourceContext &&
       (!targetContext ||
-          (sourceContext->scope != targetContext->scope &&
+          (!dirContextsSame(*sourceContext, *targetContext) &&
               !DoesScopeContain(
                   &sourceContext->scope, targetContext->scope)))) {
     context_
diff --git a/flang/test/Semantics/OpenMP/parallel-master-goto.f90 b/flang/test/Semantics/OpenMP/parallel-master-goto.f90
new file mode 100644
index 0000000000000..72c8002ab4c59
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/parallel-master-goto.f90
@@ -0,0 +1,15 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! Regression test for #143229
+
+!$omp parallel
+do i = 1, 2
+!ERROR: invalid branch into an OpenMP structured block
+!ERROR: invalid branch leaving an OpenMP structured block
+  goto 10
+end do
+!WARNING: OpenMP directive MASTER has been deprecated, please use MASKED instead.
+!$omp master
+10 print *, i
+!$omp end master
+!$omp end parallel
+end

From 9c2e0bd59ce0438fcad61b0468fd939c6282d048 Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Fri, 13 Jun 2025 09:19:10 -0400
Subject: [PATCH 368/851] [PowerPC][NFC] Pre-commit test case for checking
 whether  `mtvsrbmi` power10 instruction not used (#143956)

Verify whether the generated assembly for the following function
includes the mtvsrbmi instruction.
 vector unsigned char v00FF()
{
 vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
 return x;
 }
---
 llvm/test/CodeGen/PowerPC/mtvsrbmi.ll | 44 +++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 llvm/test/CodeGen/PowerPC/mtvsrbmi.ll

diff --git a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
new file mode 100644
index 0000000000000..7ed57c300ec71
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Verify whether the generated assembly for the following function includes the mtvsrbmi instruction.
+; vector unsigned char v00FF()
+; {
+; vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
+; return x;
+; }
+
+; RUN: llc < %s -ppc-asm-full-reg-names  -mtriple=powerpc-ibm-aix -mcpu=pwr10  -verify-machineinstrs \
+; RUN:   | FileCheck %s --check-prefix=CHECK
+
+define dso_local noundef range(i8 -1, 1) <16 x i8> @_Z5v00FFv() {
+; CHECK-LABEL: _Z5v00FFv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lwz r3, L..C0(r2) # %const.0
+; CHECK-NEXT:    lxv vs34, 0(r3)
+; CHECK-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}
+
+; CHECK:      L..CPI0_0:
+; CHECK-NEXT:   .byte   255                             # 0xff
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+; CHECK-NEXT:   .byte   0                               # 0x0
+
+; CHECK:      ._Z5v00FFv:
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT:   lwz r3, L..C0(r2)
+; CHECK-NEXT:   lxv vs34, 0(r3)
+; CHECK-NEXT:   blr

From 7e0bb2b0b9f66715c07c5eeaadb367d1a084d4c7 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Fri, 13 Jun 2025 15:21:23 +0200
Subject: [PATCH 369/851] [flang][fir] Extend locality specs lowering to
 support `init` and `dealloc` regions (#144027)

Extending `fir.do_concurrent` to `fir.do_loop ... unordered` lowering by
adding support for lowring/inlining non-empty `init` and `dealloc`
regions.

Resolves https://github.com/llvm/llvm-project/issues/143897 (actually
handles the todo).
---
 .../Transforms/SimplifyFIROperations.cpp      |  51 ++++---
 ...do-concurrent-localizer-dealloc-region.fir | 126 ++++++++++++++++++
 .../do-concurrent-localizer-init-region.fir   | 102 ++++++++++++++
 3 files changed, 258 insertions(+), 21 deletions(-)
 create mode 100644 flang/test/Transforms/do-concurrent-localizer-dealloc-region.fir
 create mode 100644 flang/test/Transforms/do-concurrent-localizer-init-region.fir

diff --git a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
index cb9e48cced2a1..e440852b3103a 100644
--- a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp
@@ -180,41 +180,50 @@ class DoConcurrentConversion
 
       std::optional<mlir::ArrayAttr> localSyms = loop.getLocalSyms();
 
-      for (auto [localVar, localArg, localizerSym] : llvm::zip_equal(
+      for (auto localInfo : llvm::zip_equal(
                loop.getLocalVars(), loop.getRegionLocalArgs(), *localSyms)) {
+        mlir::Value localVar = std::get<0>(localInfo);
+        mlir::BlockArgument localArg = std::get<1>(localInfo);
+        mlir::Attribute localizerSym = std::get<2>(localInfo);
         mlir::SymbolRefAttr localizerName =
             llvm::cast<mlir::SymbolRefAttr>(localizerSym);
         fir::LocalitySpecifierOp localizer = findLocalizer(loop, localizerName);
 
-        if (!localizer.getInitRegion().empty() ||
-            !localizer.getDeallocRegion().empty())
-          TODO(localizer.getLoc(), "localizers with `init` and `dealloc` "
-                                   "regions are not handled yet.");
-
         // TODO Should this be a heap allocation instead? For now, we allocate
         // on the stack for each loop iteration.
         mlir::Value localAlloc =
             rewriter.create<fir::AllocaOp>(loop.getLoc(), localizer.getType());
 
-        if (localizer.getLocalitySpecifierType() ==
-            fir::LocalitySpecifierType::LocalInit) {
+        auto cloneLocalizerRegion = [&](mlir::Region &region,
+                                        mlir::ValueRange regionArgs,
+                                        mlir::Block::iterator insertionPoint) {
           // It is reasonable to make this assumption since, at this stage,
           // control-flow ops are not converted yet. Therefore, things like `if`
           // conditions will still be represented by their encapsulating `fir`
           // dialect ops.
-          assert(localizer.getCopyRegion().hasOneBlock() &&
-                 "Expected localizer to have a single block.");
-          mlir::Block *beforeLocalInit = rewriter.getInsertionBlock();
-          mlir::Block *afterLocalInit = rewriter.splitBlock(
-              rewriter.getInsertionBlock(), rewriter.getInsertionPoint());
-          rewriter.cloneRegionBefore(localizer.getCopyRegion(), afterLocalInit);
-          mlir::Block *copyRegionBody = beforeLocalInit->getNextNode();
-
-          rewriter.eraseOp(copyRegionBody->getTerminator());
-          rewriter.mergeBlocks(afterLocalInit, copyRegionBody);
-          rewriter.mergeBlocks(copyRegionBody, beforeLocalInit,
-                               {localVar, localArg});
-        }
+          assert(region.hasOneBlock() &&
+                 "Expected localizer region to have a single block.");
+          mlir::OpBuilder::InsertionGuard guard(rewriter);
+          rewriter.setInsertionPoint(rewriter.getInsertionBlock(),
+                                     insertionPoint);
+          mlir::IRMapping mapper;
+          mapper.map(region.getArguments(), regionArgs);
+          for (mlir::Operation &op : region.front().without_terminator())
+            (void)rewriter.clone(op, mapper);
+        };
+
+        if (!localizer.getInitRegion().empty())
+          cloneLocalizerRegion(localizer.getInitRegion(), {localVar, localArg},
+                               rewriter.getInsertionPoint());
+
+        if (localizer.getLocalitySpecifierType() ==
+            fir::LocalitySpecifierType::LocalInit)
+          cloneLocalizerRegion(localizer.getCopyRegion(), {localVar, localArg},
+                               rewriter.getInsertionPoint());
+
+        if (!localizer.getDeallocRegion().empty())
+          cloneLocalizerRegion(localizer.getDeallocRegion(), {localArg},
+                               rewriter.getInsertionBlock()->end());
 
         rewriter.replaceAllUsesWith(localArg, localAlloc);
       }
diff --git a/flang/test/Transforms/do-concurrent-localizer-dealloc-region.fir b/flang/test/Transforms/do-concurrent-localizer-dealloc-region.fir
new file mode 100644
index 0000000000000..b59ffdfb34adf
--- /dev/null
+++ b/flang/test/Transforms/do-concurrent-localizer-dealloc-region.fir
@@ -0,0 +1,126 @@
+// Tests converting `fir.local` ops that have `dealloc` regions.
+
+// RUN: fir-opt --split-input-file --simplify-fir-operations %s | FileCheck %s
+
+fir.local {type = local} @_QFlocalizer_with_dealloc_regionEa_private_box_Uxi32 : !fir.box<!fir.array<?xi32>> init {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>, %arg1: !fir.ref<!fir.box<!fir.array<?xi32>>>):
+  %c0 = arith.constant 0 : index
+  %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+  %1:3 = fir.box_dims %0, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+  %2 = fir.shape %1#1 : (index) -> !fir.shape<1>
+  %3 = fir.allocmem !fir.array<?xi32>, %1#1 {bindc_name = ".tmp", uniq_name = ""}
+  %4 = fir.declare %3(%2) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
+  %5 = fir.embox %4(%2) : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+  %6 = fir.shape_shift %1#0, %1#1 : (index, index) -> !fir.shapeshift<1>
+  %7 = fir.rebox %5(%6) : (!fir.box<!fir.array<?xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?xi32>>
+  fir.store %7 to %arg1 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+  fir.yield(%arg1 : !fir.ref<!fir.box<!fir.array<?xi32>>>)
+} dealloc {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.array<?xi32>>>):
+  %c0_i64 = arith.constant 0 : i64
+  %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+  %1 = fir.box_addr %0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+  %2 = fir.convert %1 : (!fir.ref<!fir.array<?xi32>>) -> i64
+  %3 = arith.cmpi ne, %2, %c0_i64 : i64
+  fir.if %3 {
+    %4 = fir.convert %1 : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+    fir.freemem %4 : !fir.heap<!fir.array<?xi32>>
+  }
+  fir.yield
+}
+
+func.func @_QPlocalizer_with_dealloc_region(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}) {
+  %c42_i32 = arith.constant 42 : i32
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.alloca !fir.box<!fir.array<?xi32>>
+  %1 = fir.dummy_scope : !fir.dscope
+  %2 = fir.declare %arg0 dummy_scope %1 {uniq_name = "_QFlocalizer_with_dealloc_regionEn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+  %3 = fir.load %2 : !fir.ref<i32>
+  %4 = fir.convert %3 : (i32) -> index
+  %5 = arith.cmpi sgt, %4, %c0 : index
+  %6 = arith.select %5, %4, %c0 : index
+  %7 = fir.alloca !fir.array<?xi32>, %6 {bindc_name = "a", uniq_name = "_QFlocalizer_with_dealloc_regionEa"}
+  %8 = fir.shape %6 : (index) -> !fir.shape<1>
+  %9 = fir.declare %7(%8) {uniq_name = "_QFlocalizer_with_dealloc_regionEa"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+  %10 = fir.embox %9(%8) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+  fir.store %10 to %0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+  fir.do_concurrent {
+    %11 = fir.alloca i32 {bindc_name = "i"}
+    %12 = fir.declare %11 {uniq_name = "_QFlocalizer_with_dealloc_regionEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    fir.do_concurrent.loop (%arg1) = (%c1) to (%4) step (%c1) local(@_QFlocalizer_with_dealloc_regionEa_private_box_Uxi32 %0 -> %arg2 : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
+      %13 = fir.convert %arg1 : (index) -> i32
+      fir.store %13 to %12 : !fir.ref<i32>
+      %14 = fir.declare %arg2 {uniq_name = "_QFlocalizer_with_dealloc_regionEa"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.array<?xi32>>>
+      %15 = fir.load %14 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+      %16 = fir.load %12 : !fir.ref<i32>
+      %17 = fir.convert %16 : (i32) -> i64
+      %18:3 = fir.box_dims %15, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+      %19 = fir.shift %18#0 : (index) -> !fir.shift<1>
+      %20 = fir.array_coor %15(%19) %17 : (!fir.box<!fir.array<?xi32>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+      fir.store %c42_i32 to %20 : !fir.ref<i32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL:   func.func @_QPlocalizer_with_dealloc_region(
+// CHECK-SAME:      %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i64
+// CHECK:           %[[VAL_1:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 42 : i32
+// CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "i"}
+// CHECK:           %[[VAL_5:.*]] = fir.declare %[[VAL_4]] {uniq_name = "_QFlocalizer_with_dealloc_regionEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_8:.*]] = fir.declare %[[ARG0]] dummy_scope %[[VAL_7]] {uniq_name = "_QFlocalizer_with_dealloc_regionEn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+// CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref<i32>
+// CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index
+// CHECK:           %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_1]] : index
+// CHECK:           %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_1]] : index
+// CHECK:           %[[VAL_13:.*]] = fir.alloca !fir.array<?xi32>, %[[VAL_12]] {bindc_name = "a", uniq_name = "_QFlocalizer_with_dealloc_regionEa"}
+// CHECK:           %[[VAL_14:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_15:.*]] = fir.declare %[[VAL_13]](%[[VAL_14]]) {uniq_name = "_QFlocalizer_with_dealloc_regionEa"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+// CHECK:           %[[VAL_16:.*]] = fir.embox %[[VAL_15]](%[[VAL_14]]) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           fir.store %[[VAL_16]] to %[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:           fir.do_loop %[[VAL_17:.*]] = %[[VAL_2]] to %[[VAL_10]] step %[[VAL_2]] unordered {
+
+// Local allocation
+// CHECK:             %[[VAL_18:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+
+// `init` region body
+// CHECK:             %[[VAL_19:.*]] = fir.load %[[VAL_6]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:             %[[VAL_20:.*]]:3 = fir.box_dims %[[VAL_19]], %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_21:.*]] = fir.shape %[[VAL_20]]#1 : (index) -> !fir.shape<1>
+// CHECK:             %[[VAL_22:.*]] = fir.allocmem !fir.array<?xi32>, %[[VAL_20]]#1 {bindc_name = ".tmp", uniq_name = ""}
+// CHECK:             %[[VAL_23:.*]] = fir.declare %[[VAL_22]](%[[VAL_21]]) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
+// CHECK:             %[[VAL_24:.*]] = fir.embox %[[VAL_23]](%[[VAL_21]]) : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:             %[[VAL_25:.*]] = fir.shape_shift %[[VAL_20]]#0, %[[VAL_20]]#1 : (index, index) -> !fir.shapeshift<1>
+// CHECK:             %[[VAL_26:.*]] = fir.rebox %[[VAL_24]](%[[VAL_25]]) : (!fir.box<!fir.array<?xi32>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:             fir.store %[[VAL_26]] to %[[VAL_18]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+
+// Loop body
+// CHECK:             %[[VAL_27:.*]] = fir.convert %[[VAL_17]] : (index) -> i32
+// CHECK:             fir.store %[[VAL_27]] to %[[VAL_5]] : !fir.ref<i32>
+// CHECK:             %[[VAL_28:.*]] = fir.declare %[[VAL_18]] {uniq_name = "_QFlocalizer_with_dealloc_regionEa"} : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:             %[[VAL_29:.*]] = fir.load %[[VAL_28]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:             %[[VAL_30:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+// CHECK:             %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i32) -> i64
+// CHECK:             %[[VAL_32:.*]]:3 = fir.box_dims %[[VAL_29]], %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_33:.*]] = fir.shift %[[VAL_32]]#0 : (index) -> !fir.shift<1>
+// CHECK:             %[[VAL_34:.*]] = fir.array_coor %[[VAL_29]](%[[VAL_33]]) %[[VAL_31]] : (!fir.box<!fir.array<?xi32>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+// CHECK:             fir.store %[[VAL_3]] to %[[VAL_34]] : !fir.ref<i32>
+
+// `dealloc` region
+// CHECK:             %[[VAL_35:.*]] = fir.load %[[VAL_18]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+// CHECK:             %[[VAL_36:.*]] = fir.box_addr %[[VAL_35]] : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+// CHECK:             %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (!fir.ref<!fir.array<?xi32>>) -> i64
+// CHECK:             %[[VAL_38:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_0]] : i64
+// CHECK:             fir.if %[[VAL_38]] {
+// CHECK:               %[[VAL_39:.*]] = fir.convert %[[VAL_36]] : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+// CHECK:               fir.freemem %[[VAL_39]] : !fir.heap<!fir.array<?xi32>>
+// CHECK:             }
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
diff --git a/flang/test/Transforms/do-concurrent-localizer-init-region.fir b/flang/test/Transforms/do-concurrent-localizer-init-region.fir
new file mode 100644
index 0000000000000..ebb56aec278f6
--- /dev/null
+++ b/flang/test/Transforms/do-concurrent-localizer-init-region.fir
@@ -0,0 +1,102 @@
+// Tests converting `fir.local` ops that have `init` regions.
+
+// RUN: fir-opt --split-input-file --simplify-fir-operations %s | FileCheck %s
+
+fir.local {type = local_init} @_QFlocalizer_with_init_regionEp_firstprivate_box_ptr_Uxi32 : !fir.box<!fir.ptr<!fir.array<?xi32>>> init {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, %arg1: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
+  %c0 = arith.constant 0 : index
+  %0 = fir.shape %c0 : (index) -> !fir.shape<1>
+  %1 = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+  %2 = fir.embox %1(%0) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+  fir.store %2 to %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  fir.yield(%arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+} copy {
+^bb0(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, %arg1: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
+  %0 = fir.load %arg0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  fir.store %0 to %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  fir.yield(%arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
+}
+
+func.func @_QPlocalizer_with_init_region() {
+  %c42_i32 = arith.constant 42 : i32
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFlocalizer_with_init_regionEn"}
+  %2 = fir.declare %1 {uniq_name = "_QFlocalizer_with_init_regionEn"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  %3 = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "p", uniq_name = "_QFlocalizer_with_init_regionEp"}
+  %4 = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+  %5 = fir.shape %c0 : (index) -> !fir.shape<1>
+  %6 = fir.embox %4(%5) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+  fir.store %6 to %3 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  %7 = fir.declare %3 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFlocalizer_with_init_regionEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+  %8 = fir.load %2 : !fir.ref<i32>
+  %9 = fir.convert %8 : (i32) -> index
+
+  fir.do_concurrent {
+    %10 = fir.alloca i32 {bindc_name = "i"}
+    %11 = fir.declare %10 {uniq_name = "_QFlocalizer_with_init_regionEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    fir.do_concurrent.loop (%arg0) = (%c1) to (%9) step (%c1) local(@_QFlocalizer_with_init_regionEp_firstprivate_box_ptr_Uxi32 %7 -> %arg1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) {
+      %12 = fir.convert %arg0 : (index) -> i32
+      fir.store %12 to %11 : !fir.ref<i32>
+      %13 = fir.declare %arg1 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFlocalizer_with_init_regionEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      %14 = fir.load %13 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      %15 = fir.load %11 : !fir.ref<i32>
+      %16 = fir.convert %15 : (i32) -> i64
+      %17:3 = fir.box_dims %14, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+      %18 = fir.shift %17#0 : (index) -> !fir.shift<1>
+      %19 = fir.array_coor %14(%18) %16 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+      fir.store %c42_i32 to %19 : !fir.ref<i32>
+    }
+  }
+
+  return
+}
+
+// CHECK-LABEL:   func.func @_QPlocalizer_with_init_region() {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 42 : i32
+// CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i"}
+// CHECK:           %[[VAL_4:.*]] = fir.declare %[[VAL_3]] {uniq_name = "_QFlocalizer_with_init_regionEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope
+// CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFlocalizer_with_init_regionEn"}
+// CHECK:           %[[VAL_7:.*]] = fir.declare %[[VAL_6]] {uniq_name = "_QFlocalizer_with_init_regionEn"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "p", uniq_name = "_QFlocalizer_with_init_regionEp"}
+// CHECK:           %[[VAL_9:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+// CHECK:           %[[VAL_10:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1>
+// CHECK:           %[[VAL_11:.*]] = fir.embox %[[VAL_9]](%[[VAL_10]]) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:           fir.store %[[VAL_11]] to %[[VAL_8]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:           %[[VAL_12:.*]] = fir.declare %[[VAL_8]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFlocalizer_with_init_regionEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:           %[[VAL_13:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+// CHECK:           %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> index
+// CHECK:           fir.do_loop %[[VAL_15:.*]] = %[[VAL_1]] to %[[VAL_14]] step %[[VAL_1]] unordered {
+
+// Local allocation
+// CHECK:             %[[VAL_16:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+
+// `init` region body
+// CHECK:             %[[VAL_17:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1>
+// CHECK:             %[[VAL_18:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+// CHECK:             %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_17]]) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:             fir.store %[[VAL_19]] to %[[VAL_16]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+
+// `copy` region body
+// CHECK:             %[[VAL_20:.*]] = fir.load %[[VAL_12]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:             fir.store %[[VAL_20]] to %[[VAL_16]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+
+// loop body
+// CHECK:             %[[VAL_21:.*]] = fir.convert %[[VAL_15]] : (index) -> i32
+// CHECK:             fir.store %[[VAL_21]] to %[[VAL_4]] : !fir.ref<i32>
+// CHECK:             %[[VAL_22:.*]] = fir.declare %[[VAL_16]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFlocalizer_with_init_regionEp"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:             %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:             %[[VAL_24:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
+// CHECK:             %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> i64
+// CHECK:             %[[VAL_26:.*]]:3 = fir.box_dims %[[VAL_23]], %[[VAL_0]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+// CHECK:             %[[VAL_27:.*]] = fir.shift %[[VAL_26]]#0 : (index) -> !fir.shift<1>
+// CHECK:             %[[VAL_28:.*]] = fir.array_coor %[[VAL_23]](%[[VAL_27]]) %[[VAL_25]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+// CHECK:             fir.store %[[VAL_2]] to %[[VAL_28]] : !fir.ref<i32>
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+

From ea73fc5f079d1849ca3bed902e598191105a95dc Mon Sep 17 00:00:00 2001
From: zhijian lin <zhijian@ca.ibm.com>
Date: Fri, 13 Jun 2025 09:38:54 -0400
Subject: [PATCH 370/851] [PowerPC] fixed mtvsrbmi.ll test case error caused by
 run the update_llc_test_checks.py (#144075)

fixed mtvsrbmi.ll test case error which caused by run the
update_llc_test_checks.py
---
 llvm/test/CodeGen/PowerPC/mtvsrbmi.ll | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
index 7ed57c300ec71..5486dc02faf90 100644
--- a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
+++ b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
@@ -10,15 +10,6 @@
 ; RUN:   | FileCheck %s --check-prefix=CHECK
 
 define dso_local noundef range(i8 -1, 1) <16 x i8> @_Z5v00FFv() {
-; CHECK-LABEL: _Z5v00FFv:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    lwz r3, L..C0(r2) # %const.0
-; CHECK-NEXT:    lxv vs34, 0(r3)
-; CHECK-NEXT:    blr
-entry:
-  ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
-}
-
 ; CHECK:      L..CPI0_0:
 ; CHECK-NEXT:   .byte   255                             # 0xff
 ; CHECK-NEXT:   .byte   0                               # 0x0
@@ -37,8 +28,11 @@ entry:
 ; CHECK-NEXT:   .byte   0                               # 0x0
 ; CHECK-NEXT:   .byte   0                               # 0x0
 
-; CHECK:      ._Z5v00FFv:
-; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT:   lwz r3, L..C0(r2)
-; CHECK-NEXT:   lxv vs34, 0(r3)
-; CHECK-NEXT:   blr
+; CHECK-LABEL: _Z5v00FFv:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lwz r3, L..C0(r2) # %const.0
+; CHECK-NEXT:    lxv vs34, 0(r3)
+; CHECK-NEXT:    blr
+entry:
+  ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+}

From c3ec9e3f6553b43caf2b9d754f128abbf44cf80e Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Fri, 13 Jun 2025 14:40:27 +0100
Subject: [PATCH 371/851] [lldb][DWARF] Don't try to compute address range
 information of forward declarations (#144059)

This fixes the error reported in
https://github.com/llvm/llvm-project/pull/144037.

When computing the aranges table of a CU, LLDB would currently visit all
`DW_TAG_subprogram` DIEs and check their
`DW_AT_low_pc`/`DW_AT_high_pc`/`DW_AT_ranges` attributes. If those don't
exist it would error out and spam the console. Some subprograms
(particularly forward declarations) don't have low/high pc attributes,
so it's not really an "error". See DWARFv5 spec section `3.3.3
Subroutine and Entry Point Locations`:
```
A subroutine entry may have either a DW_AT_low_pc and DW_AT_high_pc
pair of attributes or a DW_AT_ranges attribute whose values encode the
contiguous or non-contiguous address ranges, respectively, of the machine
instructions generated for the subroutine (see Section 2.17 on page 51).
...
A subroutine entry representing a subroutine declaration that is not also a
definition does not have code address or range attributes.
```

We should just ignore those DIEs.
---
 .../source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
index 5196ce89a2c13..8217c85f86014 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
@@ -611,7 +611,11 @@ void DWARFDebugInfoEntry::BuildFunctionAddressRangeTable(
     DWARFUnit *cu, DWARFDebugAranges *debug_aranges) const {
   Log *log = GetLog(DWARFLog::DebugInfo);
   if (m_tag) {
-    if (m_tag == DW_TAG_subprogram) {
+    // Subprogram forward declarations don't have
+    // DW_AT_ranges/DW_AT_low_pc/DW_AT_high_pc attributes, so don't even try
+    // getting address range information for them.
+    if (m_tag == DW_TAG_subprogram &&
+        !GetAttributeValueAsOptionalUnsigned(cu, DW_AT_declaration)) {
       if (llvm::Expected<llvm::DWARFAddressRangesVector> ranges =
               GetAttributeAddressRanges(cu, /*check_hi_lo_pc=*/true)) {
         for (const auto &r : *ranges)

From 6f999a5d99e5cb21520d8a7878ed0d3a32971af6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 14:52:17 +0100
Subject: [PATCH 372/851] [x86] vector-pcmp.ll - regenerate VPTERNLOGD asm
 comment

---
 llvm/test/CodeGen/X86/vector-pcmp.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index 5b43acbe52375..30eb2279bda85 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -1749,7 +1749,7 @@ define <16 x i1> @is_positive_mask_v16i16_v16i1(<16 x i16> %x, <16 x i1> %y) {
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq

From a361a3dc7a12b776507f48035f245e764c45455d Mon Sep 17 00:00:00 2001
From: Yash Solanki <67216443+yashnator@users.noreply.github.com>
Date: Fri, 13 Jun 2025 19:23:34 +0530
Subject: [PATCH 373/851] [llvm][InstCombine] Fold select to cmp for weak and
 inverted inequalities (#143445)

---
 .../InstCombine/InstCombineSelect.cpp         |  22 ++
 .../Transforms/InstCombine/select-to-cmp.ll   | 293 ++++++++++++++++++
 2 files changed, 315 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/select-to-cmp.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 979a803a79ed8..320b827bdbe86 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3631,6 +3631,28 @@ Instruction *InstCombinerImpl::foldSelectToCmp(SelectInst &SI) {
   if (!LHS->getType()->isIntOrIntVectorTy())
     return nullptr;
 
+  // If there is no -1, 0 or 1 at TV, then invert the select statement and try
+  // to canonicalize to one of the forms above
+  if (!isa<Constant>(TV)) {
+    if (!isa<Constant>(FV))
+      return nullptr;
+    Pred = ICmpInst::getInverseCmpPredicate(Pred);
+    std::swap(TV, FV);
+  }
+
+  if (ICmpInst::isNonStrictPredicate(Pred)) {
+    if (Constant *C = dyn_cast<Constant>(RHS)) {
+      auto FlippedPredAndConst =
+          getFlippedStrictnessPredicateAndConstant(Pred, C);
+      if (!FlippedPredAndConst)
+        return nullptr;
+      Pred = FlippedPredAndConst->first;
+      RHS = FlippedPredAndConst->second;
+    } else {
+      return nullptr;
+    }
+  }
+
   // Try to swap operands and the predicate. We need to be careful when doing
   // so because two of the patterns have opposite predicates, so use the
   // constant inside select to determine if swapping operands would be
diff --git a/llvm/test/Transforms/InstCombine/select-to-cmp.ll b/llvm/test/Transforms/InstCombine/select-to-cmp.ll
new file mode 100644
index 0000000000000..a76d4b138686b
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-to-cmp.ll
@@ -0,0 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+; Tests for select to scmp
+
+define i32 @scmp_x_0_inverted(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 0)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %2 = icmp ne i32 %x, 0
+  %3 = zext i1 %2 to i32
+  %4 = icmp sgt i32 %x, -1
+  %5 = select i1 %4, i32 %3, i32 -1
+  ret i32 %5
+}
+
+; y = -10
+define i32 @scmp_x_0_inverted_const_neg10(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_const_neg10(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 -10)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp ne i32 %x, -10
+  %2 = zext i1 %1 to i32
+  %3 = icmp sgt i32 %x, -11
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; y = 7 (i8)
+define i8 @scmp_x_0_inverted_i8(i8 %x) {
+; CHECK-LABEL: define i8 @scmp_x_0_inverted_i8(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.scmp.i8.i8(i8 [[X]], i8 7)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = icmp ne i8 %x, 7
+  %2 = zext i1 %1 to i8
+  %3 = icmp sgt i8 %x, 6
+  %4 = select i1 %3, i8 %2, i8 -1
+  ret i8 %4
+}
+
+; scmp using ints of two kinds- i32 and i64
+define i32 @scmp_x_0_inverted_i64_neq(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_i64_neq(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = call i64 @llvm.scmp.i64.i32(i32 [[X]], i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = trunc i64 [[SEL]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %x64 = sext i32 %x to i64
+  %cmp1 = icmp ne i64 %x64, 0
+  %zext = zext i1 %cmp1 to i64
+  %cmp2 = icmp sgt i64 %x64, -1
+  %sel = select i1 %cmp2, i64 %zext, i64 -1
+  %ret = trunc i64 %sel to i32
+  ret i32 %ret
+}
+
+; Same example as previous but with inequality
+define i32 @scmp_x_0_inverted_i64_sgt(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_i64_sgt(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = call i64 @llvm.scmp.i64.i32(i32 [[X]], i32 0)
+; CHECK-NEXT:    [[RET:%.*]] = trunc i64 [[SEL]] to i32
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+  %x64 = sext i32 %x to i64
+  %cmp1 = icmp sgt i64 %x64, 0
+  %zext = zext i1 %cmp1 to i64
+  %cmp2 = icmp sgt i64 %x64, -1
+  %sel = select i1 %cmp2, i64 %zext, i64 -1
+  %ret = trunc i64 %sel to i32
+  ret i32 %ret
+}
+
+; y = -1000
+define i32 @scmp_x_0_inverted_const_neg1000(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_const_neg1000(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 -1000)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp sgt i32 %x, -1000
+  %2 = zext i1 %1 to i32
+  %3 = icmp sgt i32 %x, -1001
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; y = 1729
+define i32 @scmp_x_0_inverted_const_1729_sgt(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_const_1729_sgt(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.scmp.i32.i32(i32 [[X]], i32 1729)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp sgt i32 %x, 1729
+  %2 = zext i1 %1 to i32
+  %3 = icmp sgt i32 %x, 1728
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; ucmp with 10
+define i32 @ucmp_x_10_inverted(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_10_inverted(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[X]], i32 10)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp ne i32 %x, 10
+  %2 = zext i1 %1 to i32
+  %3 = icmp ugt i32 %x, 9
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; ucmp with -3, wraps around
+define i32 @ucmp_x_neg1_inverted(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_neg1_inverted(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ucmp.i32.i32(i32 [[X]], i32 -3)
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+  %1 = icmp ne i32 %x, -3
+  %2 = zext i1 %1 to i32
+  %3 = icmp ugt i32 %x, -4
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; ucmp with -4, wraps around
+define i8 @ucmp_x_neg4_i8_ugt(i8 %x) {
+; CHECK-LABEL: define i8 @ucmp_x_neg4_i8_ugt(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.ucmp.i8.i8(i8 [[X]], i8 -4)
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = icmp ugt i8 %x, -4
+  %2 = zext i1 %1 to i8
+  %3 = icmp ugt i8 %x, -5
+  %4 = select i1 %3, i8 %2, i8 -1
+  ret i8 %4
+}
+
+; Vector tests
+
+; Test with splat vec
+define <4 x i32> @scmp_x_0_inverted_splat_vec(<4 x i32> %x) {
+; CHECK-LABEL: define <4 x i32> @scmp_x_0_inverted_splat_vec(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> [[X]], <4 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %2 = icmp ne <4 x i32> %x, zeroinitializer
+  %3 = zext <4 x i1> %2 to <4 x i32>
+  %4 = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %5
+}
+
+; Test with non-splat vector and different bitwidth
+define <4 x i32> @non_splat_vec_scmp_diff_bitwidth(<4 x i32> %x) {
+; CHECK-LABEL: define <4 x i32> @non_splat_vec_scmp_diff_bitwidth(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[SEL:%.*]] = call <4 x i64> @llvm.scmp.v4i64.v4i32(<4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 -1, i32 5>)
+; CHECK-NEXT:    [[RET:%.*]] = trunc <4 x i64> [[SEL]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[RET]]
+;
+  %x64 = sext <4 x i32> %x to <4 x i64>
+  %cmp1 = icmp slt <4 x i64> %x64, <i64 0, i64 1, i64 -1, i64 5>
+  %sext = sext <4 x i1> %cmp1 to <4 x i64>
+  %cmp2 = icmp slt <4 x i64> %x64, <i64 1, i64 2, i64 0, i64 6>
+  %sel = select <4 x i1> %cmp2, <4 x i64> %sext, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
+  %ret = trunc <4 x i64> %sel to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Negative examples
+
+; Not scmp due to wrong RHS of the predicate
+define i32 @scmp_ne_0(i32 %0) {
+; CHECK-LABEL: define i32 @scmp_ne_0(
+; CHECK-SAME: i32 [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp sgt i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 -1
+; CHECK-NEXT:    ret i32 [[TMP5]]
+;
+  %2 = icmp ne i32 %0, 0
+  %3 = zext i1 %2 to i32
+  %4 = icmp sgt i32 %0, 1
+  %5 = select i1 %4, i32 %3, i32 -1
+  ret i32 %5
+}
+
+; y = 0 with unsigned compare but RHS wraps
+define i32 @ucmp_x_0_inverted(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_0_inverted(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    ret i32 -1
+;
+  %1 = icmp ne i32 %x, 0
+  %2 = zext i1 %1 to i32
+  %3 = icmp ugt i32 %x, -1
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; Don't fold with INT32_MIN
+define i32 @scmp_x_0_inverted_const_min(i32 %x) {
+; CHECK-LABEL: define i32 @scmp_x_0_inverted_const_min(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 [[X]], -2147483648
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = icmp ne i32 %x, -2147483648
+  %2 = zext i1 %1 to i32
+  %3 = icmp sge i32 %x, -2147483648
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; Unsigned cmp of zext of i32 with i64 -1 should always be -1
+define i32 @ucmp_x_0_inverted_i64_ugt(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_0_inverted_i64_ugt(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    ret i32 -1
+;
+  %x64 = zext i32 %x to i64
+  %cmp1 = icmp ugt i64 %x64, 0
+  %zext = zext i1 %cmp1 to i64
+  %cmp2 = icmp ugt i64 %x64, -1
+  %sel = select i1 %cmp2, i64 %zext, i64 -1
+  %ret = trunc i64 %sel to i32
+  ret i32 %ret
+}
+
+; y = 4294967295 (UINT32_MAX), simply sign extend neq
+define i32 @ucmp_x_const_u32max(i32 %x) {
+; CHECK-LABEL: define i32 @ucmp_x_const_u32max(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 [[X]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext i1 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %1 = icmp ugt i32 %x, 4294967295
+  %2 = zext i1 %1 to i32
+  %3 = icmp ugt i32 %x, 4294967294
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; Don't fold with different signedness
+define i32 @different_signedness_neg(i32 %x) {
+; CHECK-LABEL: define i32 @different_signedness_neg(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[X]], -10
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[X]], -11
+; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 -1
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %1 = icmp ugt i32 %x, -10
+  %2 = zext i1 %1 to i32
+  %3 = icmp sgt i32 %x, -11
+  %4 = select i1 %3, i32 %2, i32 -1
+  ret i32 %4
+}
+
+; Test with wrong false value
+define <4 x i32> @scmp_x_0_inverted_vec(<4 x i32> %x) {
+; CHECK-LABEL: define <4 x i32> @scmp_x_0_inverted_vec(
+; CHECK-SAME: <4 x i32> [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne <4 x i32> [[X]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 -1)
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP2]], <4 x i32> <i32 -1, i32 -2, i32 -1, i32 -1>
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+;
+  %2 = icmp ne <4 x i32> %x, zeroinitializer
+  %3 = zext <4 x i1> %2 to <4 x i32>
+  %4 = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -1, i32 -2, i32 -1, i32 -1>
+  ret <4 x i32> %5
+}

From 8b11de70681355d7e7a4f8f3da85afa31fa7fc74 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter@amd.com>
Date: Fri, 13 Jun 2025 15:59:58 +0200
Subject: [PATCH 374/851] [AMDGPU][SDAG] Initial support for ISD::PTRADD
 (#141725)

Enable generation of PTRADD SelectionDAG nodes for pointer arithmetic for SI,
for now behind an internal CLI option. Also add basic patterns to match these
nodes. Optimizations will come in follow-up PRs. Basic tests for SDAG codegen
with PTRADD are in test/CodeGen/AMDGPU/ptradd-sdag.ll

Only affects 64-bit address spaces for now, since the immediate use case only
affects the flat address space.

For SWDEV-516125.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  13 +
 llvm/lib/Target/AMDGPU/SIISelLowering.h   |   2 +
 llvm/lib/Target/AMDGPU/SIInstructions.td  |  14 +
 llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll   | 537 ++++++++++++++++++++++
 4 files changed, 566 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 53dc540cbd635..30535ae88f7ba 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -61,6 +61,14 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
     cl::desc("Use indirect register addressing for divergent indexes"),
     cl::init(false));
 
+// TODO: This option should be removed once we switch to always using PTRADD in
+// the SelectionDAG.
+static cl::opt<bool> UseSelectionDAGPTRADD(
+    "amdgpu-use-sdag-ptradd", cl::Hidden,
+    cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
+             "SelectionDAG ISel"),
+    cl::init(false));
+
 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
@@ -10457,6 +10465,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   }
 }
 
+bool SITargetLowering::shouldPreservePtrArith(const Function &F,
+                                              EVT PtrVT) const {
+  return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
+}
+
 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
 // offset (the offset that is included in bounds checking and swizzling, to be
 // split between the instruction's voffset and immoffset fields) and soffset
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index e14611d999641..d71a22722129e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -260,6 +260,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   bool shouldExpandVectorDynExt(SDNode *N) const;
 
+  bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
+
 private:
   // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
   // the three offsets (voffset, soffset and instoffset) into the SDValue[3]
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 360fd05cb3d96..1419f63202a7c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1376,6 +1376,20 @@ def : GCNPat <
       (i32 (V_MOV_B32_e32 (i32 0))), sub1)
 >;
 
+//===----------------------------------------------------------------------===//
+// PTRADD Patterns
+//===----------------------------------------------------------------------===//
+
+// GlobalISel shouldn't generate 64-bit addition pseudos.
+let GISelShouldIgnore = 1 in {
+def : GCNPat<
+  (DivergentBinFrag<ptradd> i64:$src0, i64:$src1),
+  (V_ADD_U64_PSEUDO $src0, $src1)>;
+def : GCNPat<
+  (UniformBinFrag<ptradd> i64:$src0, i64:$src1),
+  (S_ADD_U64_PSEUDO $src0, $src1)>;
+}
+
 /********** ============================================ **********/
 /********** Extraction, Insertion, Building and Casting  **********/
 /********** ============================================ **********/
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
new file mode 100644
index 0000000000000..653d4b85a9a5b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
@@ -0,0 +1,537 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_PTRADD
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_LEGACY
+
+; Tests for the ISD::PTRADD SelectionDAG opcode. This only tests 64-bit address
+; spaces since PTRADD is currently only used for these.
+
+; Check that basic pointer arithmetic can be lowered.
+define ptr @gep_as0(ptr %p, i64 %offset) {
+; GFX8-LABEL: gep_as0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942_PTRADD-LABEL: gep_as0:
+; GFX942_PTRADD:       ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: gep_as0:
+; GFX942_LEGACY:       ; %bb.0: ; %entry
+; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: gep_as0:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: gep_as0:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: gep_as0:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 %offset
+  %gep2 = getelementptr inbounds i8, ptr %gep1, i64 5
+  ret ptr %gep2
+}
+
+define amdgpu_kernel void @gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) {
+; GFX8-LABEL: gep_as0_uniform:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT:    s_add_i32 s12, s12, s17
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_endpgm
+;
+; GFX942-LABEL: gep_as0_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, s1, s3
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: gep_as0_uniform:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_add_u32 s0, s0, s2
+; GFX10-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: gep_as0_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: gep_as0_uniform:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_endpgm
+entry:
+  %gep = getelementptr inbounds i32, ptr %p, i64 %offset
+  store ptr %gep, ptr %ret
+  ret void
+}
+
+; Check that pointer arithmetic with multiple indexing steps can be lowered.
+define ptr @multi_gep_as0(ptr %p, i64 %offset) {
+; GFX8-LABEL: multi_gep_as0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 5, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942_PTRADD-LABEL: multi_gep_as0:
+; GFX942_PTRADD:       ; %bb.0: ; %entry
+; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942_LEGACY-LABEL: multi_gep_as0:
+; GFX942_LEGACY:       ; %bb.0: ; %entry
+; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: multi_gep_as0:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: multi_gep_as0:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: multi_gep_as0:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b64_e32 v[2:3], 2, v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
+; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 5
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 %offset
+  %gep2 = getelementptr inbounds i8, ptr %gep1, i64 5
+  ret ptr %gep2
+}
+
+define amdgpu_kernel void @multi_gep_as0_uniform(ptr %p, i64 %offset, ptr %ret) {
+; GFX8-LABEL: multi_gep_as0_uniform:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT:    s_add_i32 s12, s12, s17
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX8-NEXT:    s_add_u32 s0, s0, s2
+; GFX8-NEXT:    s_addc_u32 s1, s1, s3
+; GFX8-NEXT:    s_add_u32 s0, s0, 5
+; GFX8-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    s_add_u32 s0, s4, 4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    s_addc_u32 s1, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_endpgm
+;
+; GFX942-LABEL: multi_gep_as0_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, s1, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, 5
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX10-LABEL: multi_gep_as0_uniform:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_add_u32 s12, s12, s17
+; GFX10-NEXT:    s_addc_u32 s13, s13, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_add_u32 s0, s0, s2
+; GFX10-NEXT:    s_addc_u32 s1, s1, s3
+; GFX10-NEXT:    s_add_u32 s0, s0, 5
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: multi_gep_as0_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT:    s_add_u32 s0, s0, s2
+; GFX11-NEXT:    s_addc_u32 s1, s1, s3
+; GFX11-NEXT:    s_add_u32 s0, s0, 5
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: multi_gep_as0_uniform:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GFX12-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 5
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    flat_store_b64 v[0:1], v[2:3]
+; GFX12-NEXT:    s_endpgm
+entry:
+  %gep1 = getelementptr inbounds i32, ptr %p, i64 %offset
+  %gep2 = getelementptr inbounds i8, ptr %gep1, i64 5
+  store ptr %gep2, ptr %ret
+  ret void
+}
+
+; Check that constant offsets are folded into memory instructions.
+
+define void @fold_as0(ptr %from, ptr %to) {
+; GFX8-LABEL: fold_as0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: fold_as0:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dword v0, v[0:1] offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_store_dword v[2:3], v0 offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fold_as0:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v0, v[0:1] offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_store_dword v[2:3], v0 offset:8
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fold_as0:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v0, v[0:1] offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_store_b32 v[2:3], v0 offset:8
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fold_as0:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v0, v[0:1] offset:8
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    flat_store_b32 v[2:3], v0 offset:8
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep.from = getelementptr inbounds i8, ptr %from, i64 8
+  %val = load i32, ptr %gep.from, align 4
+  %gep.to = getelementptr inbounds i8, ptr %to, i64 8
+  store i32 %val, ptr %gep.to, align 4
+  ret void
+}
+
+define void @fold_as1(ptr addrspace(1) %from, ptr addrspace(1) %to) {
+; GFX8-LABEL: fold_as1:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: fold_as1:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v0, v[0:1], off offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_store_dword v[2:3], v0, off offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fold_as1:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off offset:8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fold_as1:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off offset:8
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fold_as1:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[2:3], v0, off offset:8
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep.from = getelementptr inbounds i8, ptr addrspace(1) %from, i64 8
+  %val = load i32, ptr addrspace(1) %gep.from, align 4
+  %gep.to = getelementptr inbounds i8, ptr addrspace(1) %to, i64 8
+  store i32 %val, ptr addrspace(1) %gep.to, align 4
+  ret void
+}
+
+define void @fold_as4(ptr addrspace(4) %from, ptr addrspace(1) %to) {
+; GFX8-LABEL: fold_as4:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[0:1], v4
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: fold_as4:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v0, v[0:1], off offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    global_store_dword v[2:3], v0, off offset:8
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: fold_as4:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v0, v[0:1], off offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off offset:8
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fold_as4:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off offset:8
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: fold_as4:
+; GFX12:       ; %bb.0: ; %entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v0, v[0:1], off offset:8
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_store_b32 v[2:3], v0, off offset:8
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %gep.from = getelementptr inbounds i8, ptr addrspace(4) %from, i64 8
+  %val = load i32, ptr addrspace(4) %gep.from, align 4
+  %gep.to = getelementptr inbounds i8, ptr addrspace(1) %to, i64 8
+  store i32 %val, ptr addrspace(1) %gep.to, align 4
+  ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX10_LEGACY: {{.*}}
+; GFX10_PTRADD: {{.*}}
+; GFX11_LEGACY: {{.*}}
+; GFX11_PTRADD: {{.*}}
+; GFX12_LEGACY: {{.*}}
+; GFX12_PTRADD: {{.*}}
+; GFX8_LEGACY: {{.*}}
+; GFX8_PTRADD: {{.*}}

From 0a0960dac69fc88a3c8bd5e2099f8d45b0292c78 Mon Sep 17 00:00:00 2001
From: Darren Wihandi <65404740+fairywreath@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:14:45 -0400
Subject: [PATCH 375/851] [mlir][spirv] Add bfloat16 support (#141458)

Adds bf16 support to SPIRV by using the `SPV_KHR_bfloat16` extension.
Only a few operations are supported, including loading from and storing
to memory, conversion to/from other types, cooperative matrix operations
(including coop matrix arithmetic ops) and dot product support.

This PR adds the type definition and implements the basic cast
operations. Arithmetic/coop matrix ops will be added in a separate PR.
---
 .../mlir/Dialect/SPIRV/IR/SPIRVBase.td        | 41 ++++++++++--
 .../mlir/Dialect/SPIRV/IR/SPIRVCastOps.td     | 12 ++--
 mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp    |  5 +-
 mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp      | 18 +++++-
 .../SPIRV/Deserialization/Deserializer.cpp    | 27 ++++++--
 .../Target/SPIRV/Serialization/Serializer.cpp | 11 +++-
 .../FuncToSPIRV/types-to-spirv.mlir           | 18 ++----
 .../test/Dialect/SPIRV/IR/arithmetic-ops.mlir | 64 +++++++++++++++++++
 mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir    |  8 +++
 mlir/test/Dialect/SPIRV/IR/cast-ops.mlir      | 56 ++++++++++++++++
 mlir/test/Dialect/SPIRV/IR/composite-ops.mlir |  7 ++
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 17 +++++
 mlir/test/Dialect/SPIRV/IR/logical-ops.mlir   |  8 +++
 .../Dialect/SPIRV/IR/non-uniform-ops.mlir     | 16 +++++
 mlir/test/Dialect/SPIRV/IR/types.mlir         |  8 +--
 .../SPIRV/Transforms/vce-deduction.mlir       | 14 ++++
 mlir/test/Target/SPIRV/cast-ops.mlir          | 32 +++++++++-
 mlir/test/Target/SPIRV/logical-ops.mlir       | 23 +++++++
 18 files changed, 343 insertions(+), 42 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
index b143cf9a5f509..e413503bbd672 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
@@ -344,6 +344,7 @@ def SPV_KHR_subgroup_rotate                  : I32EnumAttrCase<"SPV_KHR_subgroup
 def SPV_KHR_non_semantic_info                : I32EnumAttrCase<"SPV_KHR_non_semantic_info", 29>;
 def SPV_KHR_terminate_invocation             : I32EnumAttrCase<"SPV_KHR_terminate_invocation", 30>;
 def SPV_KHR_cooperative_matrix               : I32EnumAttrCase<"SPV_KHR_cooperative_matrix", 31>;
+def SPV_KHR_bfloat16                         : I32EnumAttrCase<"SPV_KHR_bfloat16", 32>;
 
 def SPV_EXT_demote_to_helper_invocation  : I32EnumAttrCase<"SPV_EXT_demote_to_helper_invocation", 1000>;
 def SPV_EXT_descriptor_indexing          : I32EnumAttrCase<"SPV_EXT_descriptor_indexing", 1001>;
@@ -436,7 +437,7 @@ def SPIRV_ExtensionAttr :
       SPV_KHR_fragment_shader_barycentric, SPV_KHR_ray_cull_mask,
       SPV_KHR_uniform_group_instructions, SPV_KHR_subgroup_rotate,
       SPV_KHR_non_semantic_info, SPV_KHR_terminate_invocation,
-      SPV_KHR_cooperative_matrix,
+      SPV_KHR_cooperative_matrix, SPV_KHR_bfloat16,
       SPV_EXT_demote_to_helper_invocation, SPV_EXT_descriptor_indexing,
       SPV_EXT_fragment_fully_covered, SPV_EXT_fragment_invocation_density,
       SPV_EXT_fragment_shader_interlock, SPV_EXT_physical_storage_buffer,
@@ -1412,6 +1413,23 @@ def SPIRV_C_ShaderStereoViewNV                          : I32EnumAttrCase<"Shade
     Extension<[SPV_NV_stereo_view_rendering]>
   ];
 }
+def SPIRV_C_BFloat16TypeKHR                             : I32EnumAttrCase<"BFloat16TypeKHR", 5116> {
+  list<Availability> availability = [
+    Extension<[SPV_KHR_bfloat16]>
+  ];
+}
+def SPIRV_C_BFloat16DotProductKHR                       : I32EnumAttrCase<"BFloat16DotProductKHR", 5117> {
+  list<I32EnumAttrCase> implies = [SPIRV_C_BFloat16TypeKHR];
+  list<Availability> availability = [
+    Extension<[SPV_KHR_bfloat16]>
+  ];
+}
+def SPIRV_C_BFloat16CooperativeMatrixKHR                : I32EnumAttrCase<"BFloat16CooperativeMatrixKHR", 5118> {
+  list<I32EnumAttrCase> implies = [SPIRV_C_BFloat16TypeKHR, SPIRV_C_CooperativeMatrixKHR];
+  list<Availability> availability = [
+    Extension<[SPV_KHR_bfloat16]>
+  ];
+}
 
 def SPIRV_C_Bfloat16ConversionINTEL                         : I32EnumAttrCase<"Bfloat16ConversionINTEL", 6115> {
   list<Availability> availability = [
@@ -1518,7 +1536,8 @@ def SPIRV_CapabilityAttr :
       SPIRV_C_StorageTexelBufferArrayNonUniformIndexing,
       SPIRV_C_ShaderViewportIndexLayerEXT, SPIRV_C_ShaderViewportMaskNV,
       SPIRV_C_ShaderStereoViewNV, SPIRV_C_Bfloat16ConversionINTEL,
-      SPIRV_C_CacheControlsINTEL
+      SPIRV_C_CacheControlsINTEL, SPIRV_C_BFloat16TypeKHR,
+      SPIRV_C_BFloat16DotProductKHR, SPIRV_C_BFloat16CooperativeMatrixKHR
     ]>;
 
 def SPIRV_AM_Logical                 : I32EnumAttrCase<"Logical", 0>;
@@ -3217,6 +3236,16 @@ def SPIRV_ExecutionModelAttr :
       SPIRV_EM_TaskEXT, SPIRV_EM_MeshEXT
     ]>;
 
+def SPIRV_FPE_BFloat16KHR : I32EnumAttrCase<"BFloat16KHR", 0> {
+  list<Availability> availability = [
+    Capability<[SPIRV_C_BFloat16TypeKHR]>
+  ];
+}
+def SPIRV_FPEncodingAttr :
+    SPIRV_I32EnumAttr<"FPEncoding", "valid SPIR-V FPEncoding", "f_p_encoding", [
+      SPIRV_FPE_BFloat16KHR
+    ]>;
+
 def SPIRV_FC_None         : I32BitEnumAttrCaseNone<"None">;
 def SPIRV_FC_Inline       : I32BitEnumAttrCaseBit<"Inline", 0>;
 def SPIRV_FC_DontInline   : I32BitEnumAttrCaseBit<"DontInline", 1>;
@@ -4161,10 +4190,12 @@ def SPIRV_Integer : AnyIntOfWidths<[8, 16, 32, 64]>;
 def SPIRV_Int16 : TypeAlias<I16, "Int16">;
 def SPIRV_Int32 : TypeAlias<I32, "Int32">;
 def SPIRV_Float32 : TypeAlias<F32, "Float32">;
+def SPIRV_BFloat16KHR : TypeAlias<BF16, "BFloat16">;
 def SPIRV_Float : FloatOfWidths<[16, 32, 64]>;
 def SPIRV_Float16or32 : FloatOfWidths<[16, 32]>;
+def SPIRV_AnyFloat : AnyTypeOf<[SPIRV_Float, SPIRV_BFloat16KHR]>;
 def SPIRV_Vector : VectorOfLengthAndType<[2, 3, 4, 8, 16],
-                                       [SPIRV_Bool, SPIRV_Integer, SPIRV_Float]>;
+                                       [SPIRV_Bool, SPIRV_Integer, SPIRV_AnyFloat]>;
 // Component type check is done in the type parser for the following SPIR-V
 // dialect-specific types so we use "Any" here.
 def SPIRV_AnyPtr : DialectType<SPIRV_Dialect, SPIRV_IsPtrType,
@@ -4187,14 +4218,14 @@ def SPIRV_AnyStruct : DialectType<SPIRV_Dialect, SPIRV_IsStructType,
 def SPIRV_AnySampledImage : DialectType<SPIRV_Dialect, SPIRV_IsSampledImageType,
                                 "any SPIR-V sampled image type">;
 
-def SPIRV_Numerical : AnyTypeOf<[SPIRV_Integer, SPIRV_Float]>;
+def SPIRV_Numerical : AnyTypeOf<[SPIRV_Integer, SPIRV_AnyFloat]>;
 def SPIRV_Scalar : AnyTypeOf<[SPIRV_Numerical, SPIRV_Bool]>;
 def SPIRV_Aggregate : AnyTypeOf<[SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct]>;
 def SPIRV_Composite :
     AnyTypeOf<[SPIRV_Vector, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct,
                SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix]>;
 def SPIRV_Type : AnyTypeOf<[
-    SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_Float, SPIRV_Vector,
+    SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_AnyFloat, SPIRV_Vector,
     SPIRV_AnyPtr, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct,
     SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix, SPIRV_AnySampledImage,
     SPIRV_AnyImage
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td
index b05ee0251df5b..a5c8aa8fb450c 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCastOps.td
@@ -86,7 +86,7 @@ def SPIRV_BitcastOp : SPIRV_Op<"Bitcast", [Pure]> {
 
 // -----
 
-def SPIRV_ConvertFToSOp : SPIRV_CastOp<"ConvertFToS", SPIRV_Integer, SPIRV_Float, []> {
+def SPIRV_ConvertFToSOp : SPIRV_CastOp<"ConvertFToS", SPIRV_Integer, SPIRV_AnyFloat, []> {
   let summary = [{
     Convert value numerically from floating point to signed integer, with
     round toward 0.0.
@@ -111,7 +111,7 @@ def SPIRV_ConvertFToSOp : SPIRV_CastOp<"ConvertFToS", SPIRV_Integer, SPIRV_Float
 
 // -----
 
-def SPIRV_ConvertFToUOp : SPIRV_CastOp<"ConvertFToU", SPIRV_Integer, SPIRV_Float, []> {
+def SPIRV_ConvertFToUOp : SPIRV_CastOp<"ConvertFToU", SPIRV_Integer, SPIRV_AnyFloat, []> {
   let summary = [{
     Convert value numerically from floating point to unsigned integer, with
     round toward 0.0.
@@ -138,7 +138,7 @@ def SPIRV_ConvertFToUOp : SPIRV_CastOp<"ConvertFToU", SPIRV_Integer, SPIRV_Float
 // -----
 
 def SPIRV_ConvertSToFOp : SPIRV_CastOp<"ConvertSToF",
-                                   SPIRV_Float,
+                                   SPIRV_AnyFloat,
                                    SPIRV_Integer,
                                    [SignedOp]> {
   let summary = [{
@@ -165,7 +165,7 @@ def SPIRV_ConvertSToFOp : SPIRV_CastOp<"ConvertSToF",
 // -----
 
 def SPIRV_ConvertUToFOp : SPIRV_CastOp<"ConvertUToF",
-                                   SPIRV_Float,
+                                   SPIRV_AnyFloat,
                                    SPIRV_Integer,
                                    [UnsignedOp]> {
   let summary = [{
@@ -192,8 +192,8 @@ def SPIRV_ConvertUToFOp : SPIRV_CastOp<"ConvertUToF",
 // -----
 
 def SPIRV_FConvertOp : SPIRV_CastOp<"FConvert",
-                                SPIRV_Float,
-                                SPIRV_Float,
+                                SPIRV_AnyFloat,
+                                SPIRV_AnyFloat,
                                 [UsableInSpecConstantOp]> {
   let summary = [{
     Convert value numerically from one floating-point width to another
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
index 0cf5f0823be63..a21acef1c4b43 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp
@@ -175,10 +175,7 @@ static Type parseAndVerifyType(SPIRVDialect const &dialect,
 
   // Check other allowed types
   if (auto t = llvm::dyn_cast<FloatType>(type)) {
-    if (type.isBF16()) {
-      parser.emitError(typeLoc, "cannot use 'bf16' to compose SPIR-V types");
-      return Type();
-    }
+    // TODO: All float types are allowed for now, but this should be fixed.
   } else if (auto t = llvm::dyn_cast<IntegerType>(type)) {
     if (!ScalarType::isValid(t)) {
       parser.emitError(typeLoc,
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
index 1aff43c301334..93e0c9b33c546 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp
@@ -526,7 +526,7 @@ bool ScalarType::classof(Type type) {
 }
 
 bool ScalarType::isValid(FloatType type) {
-  return llvm::is_contained({16u, 32u, 64u}, type.getWidth()) && !type.isBF16();
+  return llvm::is_contained({16u, 32u, 64u}, type.getWidth());
 }
 
 bool ScalarType::isValid(IntegerType type) {
@@ -535,6 +535,11 @@ bool ScalarType::isValid(IntegerType type) {
 
 void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions,
                                std::optional<StorageClass> storage) {
+  if (isa<BFloat16Type>(*this)) {
+    static const Extension ext = Extension::SPV_KHR_bfloat16;
+    extensions.push_back(ext);
+  }
+
   // 8- or 16-bit integer/floating-point numbers will require extra extensions
   // to appear in interface storage classes. See SPV_KHR_16bit_storage and
   // SPV_KHR_8bit_storage for more details.
@@ -640,7 +645,16 @@ void ScalarType::getCapabilities(
   } else {
     assert(llvm::isa<FloatType>(*this));
     switch (bitwidth) {
-      WIDTH_CASE(Float, 16);
+    case 16: {
+      if (isa<BFloat16Type>(*this)) {
+        static const Capability cap = Capability::BFloat16TypeKHR;
+        capabilities.push_back(cap);
+      } else {
+        static const Capability cap = Capability::Float16;
+        capabilities.push_back(cap);
+      }
+      break;
+    }
       WIDTH_CASE(Float, 64);
     case 32:
       break;
diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
index c43d584d7b913..b9d9a9015eb61 100644
--- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
@@ -867,11 +867,15 @@ LogicalResult spirv::Deserializer::processType(spirv::Opcode opcode,
     typeMap[operands[0]] = IntegerType::get(context, operands[1], sign);
   } break;
   case spirv::Opcode::OpTypeFloat: {
-    if (operands.size() != 2)
-      return emitError(unknownLoc, "OpTypeFloat must have bitwidth parameter");
+    if (operands.size() != 2 && operands.size() != 3)
+      return emitError(unknownLoc,
+                       "OpTypeFloat expects either 2 operands (type, bitwidth) "
+                       "or 3 operands (type, bitwidth, encoding), but got ")
+             << operands.size();
+    uint32_t bitWidth = operands[1];
 
     Type floatTy;
-    switch (operands[1]) {
+    switch (bitWidth) {
     case 16:
       floatTy = opBuilder.getF16Type();
       break;
@@ -883,8 +887,20 @@ LogicalResult spirv::Deserializer::processType(spirv::Opcode opcode,
       break;
     default:
       return emitError(unknownLoc, "unsupported OpTypeFloat bitwidth: ")
-             << operands[1];
+             << bitWidth;
+    }
+
+    if (operands.size() == 3) {
+      if (spirv::FPEncoding(operands[2]) != spirv::FPEncoding::BFloat16KHR)
+        return emitError(unknownLoc, "unsupported OpTypeFloat FP encoding: ")
+               << operands[2];
+      if (bitWidth != 16)
+        return emitError(unknownLoc,
+                         "invalid OpTypeFloat bitwidth for bfloat16 encoding: ")
+               << bitWidth << " (expected 16)";
+      floatTy = opBuilder.getBF16Type();
     }
+
     typeMap[operands[0]] = floatTy;
   } break;
   case spirv::Opcode::OpTypeVector: {
@@ -1399,6 +1415,9 @@ LogicalResult spirv::Deserializer::processConstant(ArrayRef<uint32_t> operands,
     } else if (floatType.isF16()) {
       APInt data(16, operands[2]);
       value = APFloat(APFloat::IEEEhalf(), data);
+    } else if (floatType.isBF16()) {
+      APInt data(16, operands[2]);
+      value = APFloat(APFloat::BFloat(), data);
     }
 
     auto attr = opBuilder.getFloatAttr(floatType, value);
diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index 647535809554c..d258bfd852961 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -523,6 +523,9 @@ LogicalResult Serializer::prepareBasicType(
   if (auto floatType = dyn_cast<FloatType>(type)) {
     typeEnum = spirv::Opcode::OpTypeFloat;
     operands.push_back(floatType.getWidth());
+    if (floatType.isBF16()) {
+      operands.push_back(static_cast<uint32_t>(spirv::FPEncoding::BFloat16KHR));
+    }
     return success();
   }
 
@@ -1022,21 +1025,23 @@ uint32_t Serializer::prepareConstantFp(Location loc, FloatAttr floatAttr,
 
   auto resultID = getNextID();
   APFloat value = floatAttr.getValue();
+  const llvm::fltSemantics *semantics = &value.getSemantics();
 
   auto opcode =
       isSpec ? spirv::Opcode::OpSpecConstant : spirv::Opcode::OpConstant;
 
-  if (&value.getSemantics() == &APFloat::IEEEsingle()) {
+  if (semantics == &APFloat::IEEEsingle()) {
     uint32_t word = llvm::bit_cast<uint32_t>(value.convertToFloat());
     encodeInstructionInto(typesGlobalValues, opcode, {typeID, resultID, word});
-  } else if (&value.getSemantics() == &APFloat::IEEEdouble()) {
+  } else if (semantics == &APFloat::IEEEdouble()) {
     struct DoubleWord {
       uint32_t word1;
       uint32_t word2;
     } words = llvm::bit_cast<DoubleWord>(value.convertToDouble());
     encodeInstructionInto(typesGlobalValues, opcode,
                           {typeID, resultID, words.word1, words.word2});
-  } else if (&value.getSemantics() == &APFloat::IEEEhalf()) {
+  } else if (semantics == &APFloat::IEEEhalf() ||
+             semantics == &APFloat::BFloat()) {
     uint32_t word =
         static_cast<uint32_t>(value.bitcastToAPInt().getZExtValue());
     encodeInstructionInto(typesGlobalValues, opcode, {typeID, resultID, word});
diff --git a/mlir/test/Conversion/FuncToSPIRV/types-to-spirv.mlir b/mlir/test/Conversion/FuncToSPIRV/types-to-spirv.mlir
index 82d750755ffe2..1737f4a906bf8 100644
--- a/mlir/test/Conversion/FuncToSPIRV/types-to-spirv.mlir
+++ b/mlir/test/Conversion/FuncToSPIRV/types-to-spirv.mlir
@@ -173,6 +173,12 @@ func.func @float16(%arg0: f16) { return }
 // NOEMU-SAME: f64
 func.func @float64(%arg0: f64) { return }
 
+// CHECK-LABEL: spirv.func @bfloat16
+// CHECK-SAME: f32
+// NOEMU-LABEL: func.func @bfloat16
+// NOEMU-SAME: bf16
+func.func @bfloat16(%arg0: bf16) { return }
+
 // f80 is not supported by SPIR-V.
 // CHECK-LABEL: func.func @float80
 // CHECK-SAME: f80
@@ -206,18 +212,6 @@ func.func @float64(%arg0: f64) { return }
 
 // -----
 
-// Check that bf16 is not supported.
-module attributes {
-  spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [], []>, #spirv.resource_limits<>>
-} {
-
-// CHECK-NOT: spirv.func @bf16_type
-func.func @bf16_type(%arg0: bf16) { return }
-
-} // end module
-
-// -----
-
 //===----------------------------------------------------------------------===//
 // Complex types
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir b/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir
index 2d0c86e08de5a..d58c27598f2b8 100644
--- a/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/arithmetic-ops.mlir
@@ -12,6 +12,14 @@ func.func @fadd_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fadd_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FAdd %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FDiv
 //===----------------------------------------------------------------------===//
@@ -24,6 +32,14 @@ func.func @fdiv_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fdiv_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FDiv %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FMod
 //===----------------------------------------------------------------------===//
@@ -36,6 +52,14 @@ func.func @fmod_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fmod_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FMod %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FMul
 //===----------------------------------------------------------------------===//
@@ -70,6 +94,14 @@ func.func @fmul_bf16(%arg: bf16) -> bf16 {
 
 // -----
 
+func.func @fmul_bf16_vector(%arg: vector<4xbf16>) -> vector<4xbf16> {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FMul %arg, %arg : vector<4xbf16>
+  return %0 : vector<4xbf16>
+}
+
+// -----
+
 func.func @fmul_tensor(%arg: tensor<4xf32>) -> tensor<4xf32> {
   // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
   %0 = spirv.FMul %arg, %arg : tensor<4xf32>
@@ -90,6 +122,14 @@ func.func @fnegate_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fnegate_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FNegate %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FRem
 //===----------------------------------------------------------------------===//
@@ -102,6 +142,14 @@ func.func @frem_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @frem_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FRem %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FSub
 //===----------------------------------------------------------------------===//
@@ -114,6 +162,14 @@ func.func @fsub_scalar(%arg: f32) -> f32 {
 
 // -----
 
+func.func @fsub_bf16_scalar(%arg: bf16) -> bf16 {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %0 = spirv.FSub %arg, %arg : bf16
+  return %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.IAdd
 //===----------------------------------------------------------------------===//
@@ -489,3 +545,11 @@ func.func @vector_times_scalar(%vector: vector<4xf32>, %scalar: f32) -> vector<3
   %0 = spirv.VectorTimesScalar %vector, %scalar : (vector<4xf32>, f32) -> vector<3xf32>
   return %0 : vector<3xf32>
 }
+
+// -----
+
+func.func @vector_bf16_times_scalar_bf16(%vector: vector<4xbf16>, %scalar: bf16) -> vector<4xbf16> {
+  // expected-error @+1 {{op operand #0 must be vector of 16/32/64-bit float values of length 2/3/4}}
+  %0 = spirv.VectorTimesScalar %vector, %scalar : (vector<4xbf16>, bf16) -> vector<4xbf16>
+  return %0 : vector<4xbf16>
+}
diff --git a/mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir b/mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir
index cc0abd3a42dcb..661497d5fff38 100644
--- a/mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/atomic-ops.mlir
@@ -272,3 +272,11 @@ func.func @atomic_fadd(%ptr : !spirv.ptr<f32, StorageBuffer>, %value : f32) -> f
   %0 = spirv.EXT.AtomicFAdd <Device> <Acquire|Release> %ptr, %value : !spirv.ptr<f32, StorageBuffer>
   return %0 : f32
 }
+
+// -----
+
+func.func @atomic_bf16_fadd(%ptr : !spirv.ptr<bf16, StorageBuffer>, %value : bf16) -> bf16 {
+  // expected-error @+1 {{op operand #1 must be 16/32/64-bit float, but got 'bf16'}}
+  %0 = spirv.EXT.AtomicFAdd <Device> <None> %ptr, %value : !spirv.ptr<bf16, StorageBuffer>
+  return %0 : bf16
+}
diff --git a/mlir/test/Dialect/SPIRV/IR/cast-ops.mlir b/mlir/test/Dialect/SPIRV/IR/cast-ops.mlir
index 34d0109e6bb44..4480a1f3720f2 100644
--- a/mlir/test/Dialect/SPIRV/IR/cast-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/cast-ops.mlir
@@ -110,6 +110,14 @@ func.func @convert_f_to_s_vector(%arg0 : vector<3xf32>) -> vector<3xi32> {
 
 // -----
 
+func.func @convert_bf16_to_s32_scalar(%arg0 : bf16) -> i32 {
+  // CHECK: {{%.*}} = spirv.ConvertFToS {{%.*}} : bf16 to i32
+  %0 = spirv.ConvertFToS %arg0 : bf16 to i32
+  spirv.ReturnValue %0 : i32
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.ConvertFToU
 //===----------------------------------------------------------------------===//
@@ -146,6 +154,14 @@ func.func @convert_f_to_u.coopmatrix(%arg0 : !spirv.coopmatrix<8x16xf32, Subgrou
 
 // -----
 
+func.func @convert_bf16_to_u32_scalar(%arg0 : bf16) -> i32 {
+  // CHECK: {{%.*}} = spirv.ConvertFToU {{%.*}} : bf16 to i32
+  %0 = spirv.ConvertFToU %arg0 : bf16 to i32
+  spirv.ReturnValue %0 : i32
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.ConvertSToF
 //===----------------------------------------------------------------------===//
@@ -174,6 +190,14 @@ func.func @convert_s_to_f_vector(%arg0 : vector<3xi32>) -> vector<3xf32> {
 
 // -----
 
+func.func @convert_s32_to_bf16_scalar(%arg0 : i32) -> bf16 {
+  // CHECK: {{%.*}} = spirv.ConvertSToF {{%.*}} : i32 to bf16
+  %0 = spirv.ConvertSToF %arg0 : i32 to bf16
+  spirv.ReturnValue %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.ConvertUToF
 //===----------------------------------------------------------------------===//
@@ -202,6 +226,14 @@ func.func @convert_u_to_f_vector(%arg0 : vector<3xi32>) -> vector<3xf32> {
 
 // -----
 
+func.func @convert_u32_to_bf16_scalar(%arg0 : i32) -> bf16 {
+  // CHECK: {{%.*}} = spirv.ConvertUToF {{%.*}} : i32 to bf16
+  %0 = spirv.ConvertUToF %arg0 : i32 to bf16
+  spirv.ReturnValue %0 : bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.FConvert
 //===----------------------------------------------------------------------===//
@@ -238,6 +270,30 @@ func.func @f_convert_vector(%arg0 : f32) -> f32 {
 
 // -----
 
+func.func @f_convert_bf16_to_f32_scalar(%arg0 : bf16) -> f32 {
+  // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : bf16 to f32
+  %0 = spirv.FConvert %arg0 : bf16 to f32
+  spirv.ReturnValue %0 : f32
+}
+
+// -----
+
+func.func @f_convert_f32_to_bf16_vector(%arg0 : vector<3xf32>) -> vector<3xbf16> {
+  // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : vector<3xf32> to vector<3xbf16>
+  %0 = spirv.FConvert %arg0 : vector<3xf32> to vector<3xbf16>
+  spirv.ReturnValue %0 : vector<3xbf16>
+}
+
+// -----
+
+func.func @f_convert_f32_to_bf16_coop_matrix(%arg0 : !spirv.coopmatrix<8x16xf32, Subgroup, MatrixA>) -> !spirv.coopmatrix<8x16xbf16, Subgroup, MatrixA> {
+  // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : !spirv.coopmatrix<8x16xf32, Subgroup, MatrixA> to !spirv.coopmatrix<8x16xbf16, Subgroup, MatrixA>
+  %0 = spirv.FConvert %arg0 : !spirv.coopmatrix<8x16xf32, Subgroup, MatrixA> to !spirv.coopmatrix<8x16xbf16, Subgroup, MatrixA>
+  spirv.ReturnValue %0 : !spirv.coopmatrix<8x16xbf16, Subgroup, MatrixA>
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.SConvert
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/composite-ops.mlir b/mlir/test/Dialect/SPIRV/IR/composite-ops.mlir
index 3fc8dfb2767d1..e71b545de11df 100644
--- a/mlir/test/Dialect/SPIRV/IR/composite-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/composite-ops.mlir
@@ -11,6 +11,13 @@ func.func @composite_construct_vector(%arg0: f32, %arg1: f32, %arg2 : f32) -> ve
   return %0: vector<3xf32>
 }
 
+// CHECK-LABEL: func @composite_construct_bf16_vector
+func.func @composite_construct_bf16_vector(%arg0: bf16, %arg1: bf16, %arg2 : bf16) -> vector<3xbf16> {
+  // CHECK: spirv.CompositeConstruct {{%.*}}, {{%.*}}, {{%.*}} : (bf16, bf16, bf16) -> vector<3xbf16>
+  %0 = spirv.CompositeConstruct %arg0, %arg1, %arg2 : (bf16, bf16, bf16) -> vector<3xbf16>
+  return %0: vector<3xbf16>
+}
+
 // CHECK-LABEL: func @composite_construct_struct
 func.func @composite_construct_struct(%arg0: vector<3xf32>, %arg1: !spirv.array<4xf32>, %arg2 : !spirv.struct<(f32)>) -> !spirv.struct<(vector<3xf32>, !spirv.array<4xf32>, !spirv.struct<(f32)>)> {
   // CHECK: spirv.CompositeConstruct
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index 2b75767feaf92..642346cc40b0d 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -50,6 +50,14 @@ func.func @exp(%arg0 : i32) -> () {
 
 // -----
 
+func.func @exp_bf16(%arg0 : bf16) -> () {
+  // expected-error @+1 {{op operand #0 must be 16/32-bit float or vector of 16/32-bit float values of length 2/3/4}}
+  %2 = spirv.GL.Exp %arg0 : bf16
+  return
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.GL.{F|S|U}{Max|Min}
 //===----------------------------------------------------------------------===//
@@ -92,6 +100,15 @@ func.func @iminmax(%arg0: i32, %arg1: i32) {
 
 // -----
 
+func.func @fmaxminbf16vec(%arg0 : vector<3xbf16>, %arg1 : vector<3xbf16>) {
+  // expected-error @+1 {{operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values}}
+  %1 = spirv.GL.FMax %arg0, %arg1 : vector<3xbf16>
+  %2 = spirv.GL.FMin %arg0, %arg1 : vector<3xbf16>
+  return
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.GL.InverseSqrt
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
index 5c24f0e6a7d33..d6c34645f5746 100644
--- a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
@@ -201,6 +201,14 @@ func.func @select_op_float(%arg0: i1) -> () {
   return
 }
 
+func.func @select_op_bfloat16(%arg0: i1) -> () {
+  %0 = spirv.Constant 2.0 : bf16
+  %1 = spirv.Constant 3.0 : bf16
+  // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, bf16
+  %2 = spirv.Select %arg0, %0, %1 : i1, bf16
+  return
+}
+
 func.func @select_op_ptr(%arg0: i1) -> () {
   %0 = spirv.Variable : !spirv.ptr<f32, Function>
   %1 = spirv.Variable : !spirv.ptr<f32, Function>
diff --git a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir
index 5f56de6ad1fa9..7ab94f17360d5 100644
--- a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir
@@ -184,6 +184,14 @@ func.func @group_non_uniform_fmul_clustered_reduce(%val: vector<2xf32>) -> vecto
 
 // -----
 
+func.func @group_non_uniform_bf16_fmul_reduce(%val: bf16) -> bf16 {
+  // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'bf16'}}
+  %0 = spirv.GroupNonUniformFMul <Workgroup> <Reduce> %val : bf16 -> bf16
+  return %0: bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.GroupNonUniformFMax
 //===----------------------------------------------------------------------===//
@@ -197,6 +205,14 @@ func.func @group_non_uniform_fmax_reduce(%val: f32) -> f32 {
 
 // -----
 
+func.func @group_non_uniform_bf16_fmax_reduce(%val: bf16) -> bf16 {
+  // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'bf16'}}
+  %0 = spirv.GroupNonUniformFMax <Workgroup> <Reduce> %val : bf16 -> bf16
+  return %0: bf16
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.GroupNonUniformFMin
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/types.mlir b/mlir/test/Dialect/SPIRV/IR/types.mlir
index b63a08d96e6af..c23894c62826b 100644
--- a/mlir/test/Dialect/SPIRV/IR/types.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/types.mlir
@@ -15,6 +15,9 @@ func.func private @vector_array_type(!spirv.array< 32 x vector<4xf32> >) -> ()
 // CHECK: func private @array_type_stride(!spirv.array<4 x !spirv.array<4 x f32, stride=4>, stride=128>)
 func.func private @array_type_stride(!spirv.array< 4 x !spirv.array<4 x f32, stride=4>, stride = 128>) -> ()
 
+// CHECK: func private @vector_array_type_bf16(!spirv.array<32 x vector<4xbf16>>)
+func.func private @vector_array_type_bf16(!spirv.array<32 x vector<4xbf16> >) -> ()
+
 // -----
 
 // expected-error @+1 {{expected '<'}}
@@ -57,11 +60,6 @@ func.func private @tensor_type(!spirv.array<4xtensor<4xf32>>) -> ()
 
 // -----
 
-// expected-error @+1 {{cannot use 'bf16' to compose SPIR-V types}}
-func.func private @bf16_type(!spirv.array<4xbf16>) -> ()
-
-// -----
-
 // expected-error @+1 {{only 1/8/16/32/64-bit integer type allowed but found 'i256'}}
 func.func private @i256_type(!spirv.array<4xi256>) -> ()
 
diff --git a/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir b/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir
index ff5ac7cea8fc6..2b237665ffc4a 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir
@@ -217,3 +217,17 @@ spirv.module Logical GLSL450 attributes {
   spirv.GlobalVariable @data : !spirv.ptr<!spirv.struct<(i8 [0], f16 [2], i64 [4])>, Uniform>
   spirv.GlobalVariable @img  : !spirv.ptr<!spirv.image<f32, Buffer, NoDepth, NonArrayed, SingleSampled, SamplerUnknown, Rg32f>, UniformConstant>
 }
+
+// Using bfloat16 requires BFloat16TypeKHR capability and SPV_KHR_bfloat16 extension.
+// CHECK: requires #spirv.vce<v1.0, [StorageBuffer16BitAccess, Shader, BFloat16TypeKHR], [SPV_KHR_bfloat16, SPV_KHR_16bit_storage, SPV_KHR_storage_buffer_storage_class]>
+spirv.module Logical GLSL450 attributes {
+  spirv.target_env = #spirv.target_env<
+    #spirv.vce<v1.0, [Shader, StorageBuffer16BitAccess, BFloat16TypeKHR], [SPV_KHR_bfloat16, SPV_KHR_16bit_storage, SPV_KHR_storage_buffer_storage_class]>,
+    #spirv.resource_limits<>
+  >
+} {
+  spirv.func @load_bf16(%ptr : !spirv.ptr<bf16, StorageBuffer>) -> bf16 "None" {
+    %val = spirv.Load "StorageBuffer" %ptr : bf16
+    spirv.ReturnValue %val : bf16
+  }
+}
diff --git a/mlir/test/Target/SPIRV/cast-ops.mlir b/mlir/test/Target/SPIRV/cast-ops.mlir
index ede0bf30511ef..04a468b39b645 100644
--- a/mlir/test/Target/SPIRV/cast-ops.mlir
+++ b/mlir/test/Target/SPIRV/cast-ops.mlir
@@ -25,6 +25,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.ConvertFToS %arg0 : f64 to i32
     spirv.ReturnValue %0 : i32
   }
+  spirv.func @convert_bf16_to_s32(%arg0 : bf16) -> i32 "None" {
+    // CHECK: {{%.*}} = spirv.ConvertFToS {{%.*}} : bf16 to i32
+    %0 = spirv.ConvertFToS %arg0 : bf16 to i32
+    spirv.ReturnValue %0 : i32
+  }
   spirv.func @convert_f_to_u(%arg0 : f32) -> i32 "None" {
     // CHECK: {{%.*}} = spirv.ConvertFToU {{%.*}} : f32 to i32
     %0 = spirv.ConvertFToU %arg0 : f32 to i32
@@ -35,6 +40,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.ConvertFToU %arg0 : f64 to i32
     spirv.ReturnValue %0 : i32
   }
+  spirv.func @convert_bf16_to_u32(%arg0 : bf16) -> i32 "None" {
+    // CHECK: {{%.*}} = spirv.ConvertFToU {{%.*}} : bf16 to i32
+    %0 = spirv.ConvertFToU %arg0 : bf16 to i32
+    spirv.ReturnValue %0 : i32
+  }
   spirv.func @convert_s_to_f(%arg0 : i32) -> f32 "None" {
     // CHECK: {{%.*}} = spirv.ConvertSToF {{%.*}} : i32 to f32
     %0 = spirv.ConvertSToF %arg0 : i32 to f32
@@ -45,6 +55,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.ConvertSToF %arg0 : i64 to f32
     spirv.ReturnValue %0 : f32
   }
+  spirv.func @convert_s64_to_bf16(%arg0 : i64) -> bf16 "None" {
+    // CHECK: {{%.*}} = spirv.ConvertSToF {{%.*}} : i64 to bf16
+    %0 = spirv.ConvertSToF %arg0 : i64 to bf16
+    spirv.ReturnValue %0 : bf16
+  }
   spirv.func @convert_u_to_f(%arg0 : i32) -> f32 "None" {
     // CHECK: {{%.*}} = spirv.ConvertUToF {{%.*}} : i32 to f32
     %0 = spirv.ConvertUToF %arg0 : i32 to f32
@@ -55,11 +70,26 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %0 = spirv.ConvertUToF %arg0 : i64 to f32
     spirv.ReturnValue %0 : f32
   }
-  spirv.func @f_convert(%arg0 : f32) -> f64 "None" {
+  spirv.func @convert_u64_to_bf16(%arg0 : i64) -> bf16 "None" {
+    // CHECK: {{%.*}} = spirv.ConvertUToF {{%.*}} : i64 to bf16
+    %0 = spirv.ConvertUToF %arg0 : i64 to bf16
+    spirv.ReturnValue %0 : bf16
+  }
+  spirv.func @convert_f32_to_f64(%arg0 : f32) -> f64 "None" {
     // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : f32 to f64
     %0 = spirv.FConvert %arg0 : f32 to f64
     spirv.ReturnValue %0 : f64
   }
+  spirv.func @convert_f32_to_bf16(%arg0 : f32) -> bf16 "None" {
+    // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : f32 to bf16
+    %0 = spirv.FConvert %arg0 : f32 to bf16
+    spirv.ReturnValue %0 : bf16
+  }
+  spirv.func @convert_bf16_to_f32(%arg0 : bf16) -> f32 "None" {
+    // CHECK: {{%.*}} = spirv.FConvert {{%.*}} : bf16 to f32
+    %0 = spirv.FConvert %arg0 : bf16 to f32
+    spirv.ReturnValue %0 : f32
+  }
   spirv.func @s_convert(%arg0 : i32) -> i64 "None" {
     // CHECK: {{%.*}} = spirv.SConvert {{%.*}} : i32 to i64
     %0 = spirv.SConvert %arg0 : i32 to i64
diff --git a/mlir/test/Target/SPIRV/logical-ops.mlir b/mlir/test/Target/SPIRV/logical-ops.mlir
index 16846ac84e38c..b2008719b021c 100644
--- a/mlir/test/Target/SPIRV/logical-ops.mlir
+++ b/mlir/test/Target/SPIRV/logical-ops.mlir
@@ -108,3 +108,26 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     spirv.Return
   }
 }
+
+// -----
+
+// Test select works with bf16 scalar and vectors.
+
+spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+  spirv.SpecConstant @condition_scalar = true
+  spirv.func @select_bf16() -> () "None" {
+    %0 = spirv.Constant 4.0 : bf16
+    %1 = spirv.Constant 5.0 : bf16
+    %2 = spirv.mlir.referenceof @condition_scalar : i1
+    // CHECK: spirv.Select {{.*}}, {{.*}}, {{.*}} : i1, bf16
+    %3 = spirv.Select %2, %0, %1 : i1, bf16
+    %4 = spirv.Constant dense<[2.0, 3.0, 4.0, 5.0]> : vector<4xbf16>
+    %5 = spirv.Constant dense<[6.0, 7.0, 8.0, 9.0]> : vector<4xbf16>
+    // CHECK: spirv.Select {{.*}}, {{.*}}, {{.*}} : i1, vector<4xbf16>
+    %6 = spirv.Select %2, %4, %5 : i1, vector<4xbf16>
+    %7 = spirv.Constant dense<[true, true, true, true]> : vector<4xi1>
+    // CHECK: spirv.Select {{.*}}, {{.*}}, {{.*}} : vector<4xi1>, vector<4xbf16>
+    %8 = spirv.Select %7, %4, %5 : vector<4xi1>, vector<4xbf16>
+    spirv.Return
+  }
+}

From e6a3579653196af337f191ed2a3acbbf0e6d01bb Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Fri, 13 Jun 2025 15:22:47 +0100
Subject: [PATCH 376/851] [Offload] Replace device info queue with a tree
 (#144050)

Previously, device info was returned as a queue with each element having
a "Level" field indicating its nesting level. This replaces this queue
with a more traditional tree-like structure.

This should not result in a change to the output of
`llvm-offload-device-info`.
---
 offload/liboffload/src/OffloadImpl.cpp        |  15 +-
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |  45 +++---
 .../common/include/PluginInterface.h          | 139 ++++++++++--------
 .../common/src/PluginInterface.cpp            |   6 +-
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  21 +--
 offload/plugins-nextgen/host/src/rtl.cpp      |   5 +-
 6 files changed, 125 insertions(+), 106 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 0a784cddeaecb..770c212d804d2 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -229,26 +229,19 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
 
   // Find the info if it exists under any of the given names
   auto GetInfo = [&](std::vector<std::string> Names) {
-    InfoQueueTy DevInfo;
     if (Device == HostDevice())
       return std::string("Host");
 
     if (!Device->Device)
       return std::string("");
 
-    if (auto Err = Device->Device->obtainInfoImpl(DevInfo))
+    auto Info = Device->Device->obtainInfoImpl();
+    if (auto Err = Info.takeError())
       return std::string("");
 
     for (auto Name : Names) {
-      auto InfoKeyMatches = [&](const InfoQueueTy::InfoQueueEntryTy &Info) {
-        return Info.Key == Name;
-      };
-      auto Item = std::find_if(DevInfo.getQueue().begin(),
-                               DevInfo.getQueue().end(), InfoKeyMatches);
-
-      if (Item != std::end(DevInfo.getQueue())) {
-        return Item->Value;
-      }
+      if (auto Entry = Info->get(Name))
+        return (*Entry)->Value;
     }
 
     return std::string("");
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e4c32713e2c15..73e1e66928fac 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2551,7 +2551,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Print information about the device.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
+  Expected<InfoTreeNode> obtainInfoImpl() override {
     char TmpChar[1000];
     const char *TmpCharPtr = "Unknown";
     uint16_t Major, Minor;
@@ -2562,6 +2562,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     uint16_t WorkgrpMaxDim[3];
     hsa_dim3_t GridMaxDim;
     hsa_status_t Status, Status2;
+    InfoTreeNode Info;
 
     Status = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major);
     Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor);
@@ -2617,11 +2618,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // runtime.
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_CACHE_SIZE, CacheSize);
     if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Cache");
+      auto &Cache = *Info.add("Cache");
 
       for (int I = 0; I < 4; I++)
         if (CacheSize[I])
-          Info.add<InfoLevel2>("L" + std::to_string(I), CacheSize[I]);
+          Cache.add("L" + std::to_string(I), CacheSize[I]);
     }
 
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_CACHELINE_SIZE, TmpUInt);
@@ -2654,10 +2655,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim);
     if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Workgroup Max Size per Dimension");
-      Info.add<InfoLevel2>("x", WorkgrpMaxDim[0]);
-      Info.add<InfoLevel2>("y", WorkgrpMaxDim[1]);
-      Info.add<InfoLevel2>("z", WorkgrpMaxDim[2]);
+      auto &MaxSize = *Info.add("Workgroup Max Size per Dimension");
+      MaxSize.add("x", WorkgrpMaxDim[0]);
+      MaxSize.add("y", WorkgrpMaxDim[1]);
+      MaxSize.add("z", WorkgrpMaxDim[2]);
     }
 
     Status = getDeviceAttrRaw(
@@ -2673,17 +2674,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_GRID_MAX_DIM, GridMaxDim);
     if (Status == HSA_STATUS_SUCCESS) {
-      Info.add("Grid Max Size per Dimension");
-      Info.add<InfoLevel2>("x", GridMaxDim.x);
-      Info.add<InfoLevel2>("y", GridMaxDim.y);
-      Info.add<InfoLevel2>("z", GridMaxDim.z);
+      auto &MaxDim = *Info.add("Grid Max Size per Dimension");
+      MaxDim.add("x", GridMaxDim.x);
+      MaxDim.add("y", GridMaxDim.y);
+      MaxDim.add("z", GridMaxDim.z);
     }
 
     Status = getDeviceAttrRaw(HSA_AGENT_INFO_FBARRIER_MAX_SIZE, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Max fbarriers/Workgrp", TmpUInt);
 
-    Info.add("Memory Pools");
+    auto &RootPool = *Info.add("Memory Pools");
     for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
       std::string TmpStr, TmpStr2;
 
@@ -2698,7 +2699,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       else
         TmpStr = "Unknown";
 
-      Info.add<InfoLevel2>(std::string("Pool ") + TmpStr);
+      auto &PoolNode = *RootPool.add(std::string("Pool ") + TmpStr);
 
       if (Pool->isGlobal()) {
         if (Pool->isFineGrained())
@@ -2708,39 +2709,39 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
         if (Pool->supportsKernelArgs())
           TmpStr2 += "Kernarg ";
 
-        Info.add<InfoLevel3>("Flags", TmpStr2);
+        PoolNode.add("Flags", TmpStr2);
       }
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, TmpSt);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Size", TmpSt, "bytes");
+        PoolNode.add("Size", TmpSt, "bytes");
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
                                 TmpBool);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Allocatable", TmpBool);
+        PoolNode.add("Allocatable", TmpBool);
 
       Status = Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
                                 TmpSt);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Runtime Alloc Granule", TmpSt, "bytes");
+        PoolNode.add("Runtime Alloc Granule", TmpSt, "bytes");
 
       Status = Pool->getAttrRaw(
           HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, TmpSt);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Runtime Alloc Alignment", TmpSt, "bytes");
+        PoolNode.add("Runtime Alloc Alignment", TmpSt, "bytes");
 
       Status =
           Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Accessible by all", TmpBool);
+        PoolNode.add("Accessible by all", TmpBool);
     }
 
-    Info.add("ISAs");
+    auto &ISAs = *Info.add("ISAs");
     auto Err = hsa_utils::iterateAgentISAs(getAgent(), [&](hsa_isa_t ISA) {
       Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, TmpChar);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel2>("Name", TmpChar);
+        ISAs.add("Name", TmpChar);
 
       return Status;
     });
@@ -2749,7 +2750,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (Err)
       consumeError(std::move(Err));
 
-    return Plugin::success();
+    return Info;
   }
 
   /// Returns true if auto zero-copy the best configuration for the current
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d2437908a0a6f..f5d995532b7a5 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -112,77 +112,100 @@ struct AsyncInfoWrapperTy {
   __tgt_async_info *AsyncInfoPtr;
 };
 
-/// The information level represents the level of a key-value property in the
-/// info tree print (i.e. indentation). The first level should be the default.
-enum InfoLevelKind { InfoLevel1 = 1, InfoLevel2, InfoLevel3 };
-
-/// Class for storing device information and later be printed. An object of this
-/// type acts as a queue of key-value properties. Each property has a key, a
-/// a value, and an optional unit for the value. For printing purposes, the
-/// information can be classified into several levels. These levels are useful
-/// for defining sections and subsections. Thus, each key-value property also
-/// has an additional field indicating to which level belongs to. Notice that
-/// we use the level to determine the indentation of the key-value property at
-/// printing time. See the enum InfoLevelKind for the list of accepted levels.
-class InfoQueueTy {
-public:
-  struct InfoQueueEntryTy {
-    std::string Key;
-    std::string Value;
-    std::string Units;
-    uint64_t Level;
-  };
-
-private:
-  std::deque<InfoQueueEntryTy> Queue;
-
-public:
-  /// Add a new info entry to the queue. The entry requires at least a key
-  /// string in \p Key. The value in \p Value is optional and can be any type
-  /// that is representable as a string. The units in \p Units is optional and
-  /// must be a string. The info level is a template parameter that defaults to
-  /// the first level (top level).
-  template <InfoLevelKind L = InfoLevel1, typename T = std::string>
-  void add(const std::string &Key, T Value = T(),
-           const std::string &Units = std::string()) {
+/// Tree node for device information
+///
+/// This information is either printed or used by liboffload to extract certain
+/// device queries. Each property has an optional key, an optional value
+/// and optional children. The children can be used to store additional
+/// information (such as x, y and z components of ranges).
+struct InfoTreeNode {
+  static constexpr uint64_t IndentSize = 4;
+
+  std::string Key;
+  std::string Value;
+  std::string Units;
+  // Need to specify a default value number of elements here as `InfoTreeNode`'s
+  // size is unknown. This is a vector (rather than a Key->Value map) since:
+  // * The keys need to be owned and thus `std::string`s
+  // * The order of keys is important
+  // * The same key can appear multiple times
+  std::unique_ptr<llvm::SmallVector<InfoTreeNode, 8>> Children;
+
+  InfoTreeNode() : InfoTreeNode("", "", "") {}
+  InfoTreeNode(std::string Key, std::string Value, std::string Units)
+      : Key(Key), Value(Value), Units(Units) {}
+
+  /// Add a new info entry as a child of this node. The entry requires at least
+  /// a key string in \p Key. The value in \p Value is optional and can be any
+  /// type that is representable as a string. The units in \p Units is optional
+  /// and must be a string.
+  template <typename T = std::string>
+  InfoTreeNode *add(std::string Key, T Value = T(),
+                    const std::string &Units = std::string()) {
     assert(!Key.empty() && "Invalid info key");
 
-    // Convert the value to a string depending on its type.
+    if (!Children)
+      Children = std::make_unique<llvm::SmallVector<InfoTreeNode, 8>>();
+
+    std::string ValueStr;
     if constexpr (std::is_same_v<T, bool>)
-      Queue.push_back({Key, Value ? "Yes" : "No", Units, L});
+      ValueStr = Value ? "Yes" : "No";
     else if constexpr (std::is_arithmetic_v<T>)
-      Queue.push_back({Key, std::to_string(Value), Units, L});
+      ValueStr = std::to_string(Value);
     else
-      Queue.push_back({Key, Value, Units, L});
+      ValueStr = Value;
+
+    return &Children->emplace_back(Key, ValueStr, Units);
   }
 
-  const std::deque<InfoQueueEntryTy> &getQueue() const { return Queue; }
+  std::optional<InfoTreeNode *> get(StringRef Key) {
+    if (!Children)
+      return std::nullopt;
 
-  /// Print all info entries added to the queue.
-  void print() const {
-    // We print four spances for each level.
-    constexpr uint64_t IndentSize = 4;
+    auto It = std::find_if(Children->begin(), Children->end(),
+                           [&](auto &V) { return V.Key == Key; });
+    if (It == Children->end())
+      return std::nullopt;
+    return It;
+  }
 
-    // Find the maximum key length (level + key) to compute the individual
-    // indentation of each entry.
-    uint64_t MaxKeySize = 0;
-    for (const auto &Entry : Queue) {
-      uint64_t KeySize = Entry.Key.size() + Entry.Level * IndentSize;
-      if (KeySize > MaxKeySize)
-        MaxKeySize = KeySize;
-    }
+  /// Print all info entries in the tree
+  void print() const {
+    // Fake an additional indent so that values are offset from the keys
+    doPrint(0, maxKeySize(1));
+  }
 
-    // Print all info entries.
-    for (const auto &Entry : Queue) {
+private:
+  void doPrint(int Level, uint64_t MaxKeySize) const {
+    if (Key.size()) {
       // Compute the indentations for the current entry.
-      uint64_t KeyIndentSize = Entry.Level * IndentSize;
+      uint64_t KeyIndentSize = Level * IndentSize;
       uint64_t ValIndentSize =
-          MaxKeySize - (Entry.Key.size() + KeyIndentSize) + IndentSize;
+          MaxKeySize - (Key.size() + KeyIndentSize) + IndentSize;
 
-      llvm::outs() << std::string(KeyIndentSize, ' ') << Entry.Key
-                   << std::string(ValIndentSize, ' ') << Entry.Value
-                   << (Entry.Units.empty() ? "" : " ") << Entry.Units << "\n";
+      llvm::outs() << std::string(KeyIndentSize, ' ') << Key
+                   << std::string(ValIndentSize, ' ') << Value
+                   << (Units.empty() ? "" : " ") << Units << "\n";
     }
+
+    // Print children
+    if (Children)
+      for (const auto &Entry : *Children)
+        Entry.doPrint(Level + 1, MaxKeySize);
+  }
+
+  // Recursively calculates the maximum width of each key, including indentation
+  uint64_t maxKeySize(int Level) const {
+    uint64_t MaxKeySize = 0;
+
+    if (Children)
+      for (const auto &Entry : *Children) {
+        uint64_t KeySize = Entry.Key.size() + Level * IndentSize;
+        MaxKeySize = std::max(MaxKeySize, KeySize);
+        MaxKeySize = std::max(MaxKeySize, Entry.maxKeySize(Level + 1));
+      }
+
+    return MaxKeySize;
   }
 };
 
@@ -871,7 +894,7 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   /// Print information about the device.
   Error printInfo();
-  virtual Error obtainInfoImpl(InfoQueueTy &Info) = 0;
+  virtual Expected<InfoTreeNode> obtainInfoImpl() = 0;
 
   /// Getters of the grid values.
   uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f9a6b3c1f4324..6fd3405d03afa 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1578,14 +1578,14 @@ Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
 }
 
 Error GenericDeviceTy::printInfo() {
-  InfoQueueTy InfoQueue;
+  auto Info = obtainInfoImpl();
 
   // Get the vendor-specific info entries describing the device properties.
-  if (auto Err = obtainInfoImpl(InfoQueue))
+  if (auto Err = Info.takeError())
     return Err;
 
   // Print all info entries.
-  InfoQueue.print();
+  Info->print();
 
   return Plugin::success();
 }
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..9943f533ef5a8 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -922,11 +922,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Print information about the device.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
+  Expected<InfoTreeNode> obtainInfoImpl() override {
     char TmpChar[1000];
     const char *TmpCharPtr;
     size_t TmpSt;
     int TmpInt;
+    InfoTreeNode Info;
 
     CUresult Res = cuDriverGetVersion(&TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -971,27 +972,27 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_SUCCESS)
       Info.add("Maximum Threads per Block", TmpInt);
 
-    Info.add("Maximum Block Dimensions", "");
+    auto &MaxBlock = *Info.add("Maximum Block Dimensions", "");
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("x", TmpInt);
+      MaxBlock.add("x", TmpInt);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("y", TmpInt);
+      MaxBlock.add("y", TmpInt);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("z", TmpInt);
+      MaxBlock.add("z", TmpInt);
 
-    Info.add("Maximum Grid Dimensions", "");
+    auto &MaxGrid = *Info.add("Maximum Grid Dimensions", "");
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("x", TmpInt);
+      MaxGrid.add("x", TmpInt);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("y", TmpInt);
+      MaxGrid.add("y", TmpInt);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
     if (Res == CUDA_SUCCESS)
-      Info.add<InfoLevel2>("z", TmpInt);
+      MaxGrid.add("z", TmpInt);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
     if (Res == CUDA_SUCCESS)
@@ -1087,7 +1088,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
     Info.add("Compute Capabilities", ComputeCapability.str());
 
-    return Plugin::success();
+    return Info;
   }
 
   virtual bool shouldSetupDeviceMemoryPool() const override {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9916f4d0ab250..ced9208acaedc 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -326,9 +326,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
 
   /// Print information about the device.
-  Error obtainInfoImpl(InfoQueueTy &Info) override {
+  Expected<InfoTreeNode> obtainInfoImpl() override {
+    InfoTreeNode Info;
     Info.add("Device Type", "Generic-elf-64bit");
-    return Plugin::success();
+    return Info;
   }
 
   /// This plugin should not setup the device environment or memory pool.

From 82911f188be7ce7cb0a04b7fd648ea8b4aad2e59 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Fri, 13 Jun 2025 14:23:02 +0000
Subject: [PATCH 377/851] [lldb][test] Skip ReadAfterClose JSON Transport tests
 on Windows

These were failing on our Windows on Arm bot, or more precisely,
not even completing.

This is because Microsoft's C runtime does extra parameter validation.
So when we called _read with an invalid fd, it called an invalid
parameter handler instead of returning an error.

https://learn.microsoft.com/en-us/%20cpp/c-runtime-library/reference/read?view=msvc-170
https://learn.microsoft.com/en-us/%20cpp/c-runtime-library/parameter-validation?view=msvc-170

(lldb) run
Process 8440 launched: 'C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\tools\lldb\unittests\Host\HostTests.exe' (aarch64)
Process 8440 stopped
* thread #1, stop reason = Exception 0xc0000409 encountered at address 0x7ffb7453564c
    frame #0: 0x00007ffb7453564c ucrtbase.dll`_get_thread_local_invalid_parameter_handler + 652
ucrtbase.dll`_get_thread_local_invalid_parameter_handler:
->  0x7ffb7453564c <+652>: brk    #0xf003

ucrtbase.dll`_invalid_parameter_noinfo:
    0x7ffb74535650 <+0>:   b      0x7ffb745354d8 ; _get_thread_local_invalid_parameter_handler + 280
    0x7ffb74535654 <+4>:   nop
    0x7ffb74535658 <+8>:   nop

You can override this handler but I'm assuming that this reading
after close isn't a crucial feature, so disabling the tests seems
like the way to go.

If it is crucial, we can check the fd before we use it.

Tests added by https://github.com/llvm/llvm-project/pull/143946.
---
 lldb/unittests/Host/JSONTransportTest.cpp | 31 +++++++++++++----------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/lldb/unittests/Host/JSONTransportTest.cpp b/lldb/unittests/Host/JSONTransportTest.cpp
index f1ec5e03bbeca..4621869887ac8 100644
--- a/lldb/unittests/Host/JSONTransportTest.cpp
+++ b/lldb/unittests/Host/JSONTransportTest.cpp
@@ -84,12 +84,6 @@ TEST_F(HTTPDelimitedJSONTransportTest, ReadWithEOF) {
       Failed<TransportEOFError>());
 }
 
-TEST_F(HTTPDelimitedJSONTransportTest, ReadAfterClosed) {
-  input.CloseReadFileDescriptor();
-  ASSERT_THAT_EXPECTED(
-      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
-      llvm::Failed());
-}
 
 TEST_F(HTTPDelimitedJSONTransportTest, InvalidTransport) {
   transport = std::make_unique<HTTPDelimitedJSONTransport>(nullptr, nullptr);
@@ -136,13 +130,6 @@ TEST_F(JSONRPCTransportTest, ReadWithEOF) {
       Failed<TransportEOFError>());
 }
 
-TEST_F(JSONRPCTransportTest, ReadAfterClosed) {
-  input.CloseReadFileDescriptor();
-  ASSERT_THAT_EXPECTED(
-      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
-      llvm::Failed());
-}
-
 TEST_F(JSONRPCTransportTest, Write) {
   ASSERT_THAT_ERROR(transport->Write(JSONTestType{"foo"}), Succeeded());
   output.CloseWriteFileDescriptor();
@@ -173,4 +160,22 @@ TEST_F(JSONRPCTransportTest, ReadWithTimeout) {
       transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
       Failed<TransportTimeoutError>());
 }
+
+// Windows CRT _read checks that the file descriptor is valid and calls a
+// handler if not. This handler is normally a breakpoint, which looks like a
+// crash when not handled by a debugger.
+// https://learn.microsoft.com/en-us/%20cpp/c-runtime-library/reference/read?view=msvc-170
+TEST_F(HTTPDelimitedJSONTransportTest, ReadAfterClosed) {
+  input.CloseReadFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
+
+TEST_F(JSONRPCTransportTest, ReadAfterClosed) {
+  input.CloseReadFileDescriptor();
+  ASSERT_THAT_EXPECTED(
+      transport->Read<JSONTestType>(std::chrono::milliseconds(1)),
+      llvm::Failed());
+}
 #endif

From 9670e09d0eac596fba6bf03ef1a6f3229dddee46 Mon Sep 17 00:00:00 2001
From: Devon Loehr <DKLoehr@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:29:42 -0400
Subject: [PATCH 378/851] Enable unique-object-duplication warning for windows
 (#143537)

Followup to #125526. This expands the logic of the
unique-object-duplication warning so that it also works for windows
code.

For the most part, the logic is unchanged, merely substituting "has no
import/export annotation" in place of "has hidden visibility". However,
there are some small inconsistencies between the two; namely, visibility
is propagated through nested classes, while import/export annotations
aren't.

This PR:
1. Updates the logic for the warning to account for the differences
between posix and windows
2. Changes the warning message and documentation appropriately
3. Updates the tests to cover windows, and adds new test cases for the
places where behavior differs.

This PR was tested by building chromium (cross compiling linux->windows)
with the changes in place. After accounting for the differences in
semantics, no new warnings were discovered.
---
 clang/include/clang/Basic/DiagnosticGroups.td | 16 ++--
 .../clang/Basic/DiagnosticSemaKinds.td        | 21 +++--
 clang/lib/Sema/SemaDecl.cpp                   | 31 +++++--
 .../SemaCXX/unique_object_duplication.cpp     | 10 ++-
 .../test/SemaCXX/unique_object_duplication.h  | 90 +++++++++++++------
 5 files changed, 114 insertions(+), 54 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index beda73e675fc6..38b4f581fa5c9 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -806,7 +806,9 @@ def UniqueObjectDuplication : DiagGroup<"unique-object-duplication"> {
 Warns when objects which are supposed to be globally unique might get duplicated
 when built into a shared library.
 
-If an object with hidden visibility is built into a shared library, each instance
+This can occur to objects which are hidden from the dynamic linker, due to
+having hidden visibility (on posix) or lacking a dllimport/dllexport attribute
+(on windows). If such an object is built into a shared library, each instance
 of the library will get its own copy. This can cause very subtle bugs if there was
 only supposed to be one copy of the object in question: singletons aren't single,
 changes to one object won't affect the others, the object's initializer will run
@@ -815,7 +817,7 @@ once per copy, etc.
 Specifically, this warning fires when it detects an object which:
 1. Is defined as ``inline`` in a header file (so it might get compiled into multiple libaries), and
 2. Has external linkage (otherwise it's supposed to be duplicated), and
-3. Has hidden visibility.
+3. Has hidden visibility (posix) or lacks a dllimport/dllexport attribute (windows).
 
 As well as one of the following:
 1. The object is mutable, or
@@ -825,13 +827,15 @@ The warning can be resolved by removing one of the conditions above. In rough
 order of preference, this may be done by:
 1. Marking the object ``const`` (if possible)
 2. Moving the object's definition to a source file
-3. Giving the object non-hidden visibility, e.g. using ``__attribute((visibility("default")))``.
+3. Making the object visible using ``__attribute((visibility("default")))``,
+   ``__declspec(dllimport)``, or ``__declspec(dllexport)``.
+
+When annotating an object with ``__declspec(dllimport)`` or ``__declspec(dllexport)``,
+take care to ensure that the object is only exported from one dll, and is imported
+everywhere else.
 
 Note that for (2), all levels of a pointer variable must be constant;
 ``const int*`` will trigger the warning because the pointer itself is mutable.
-
-This warning is not yet implemented for Windows, since Windows uses
-import/export rules instead of visibility.
 }];
 }
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a2cf84d024193..95d24e9f1e6b5 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6267,14 +6267,19 @@ def warn_static_local_in_extern_inline : Warning<
 def note_convert_inline_to_static : Note<
   "use 'static' to give inline function %0 internal linkage">;
 
-def warn_possible_object_duplication_mutable : Warning<
-  "%0 may be duplicated when built into a shared library: "
-  "it is mutable, has hidden visibility, and external linkage">,
-  InGroup<UniqueObjectDuplication>, DefaultIgnore;
-def warn_possible_object_duplication_init : Warning<
-  "initialization of %0 may run twice when built into a shared library: "
-  "it has hidden visibility and external linkage">,
-  InGroup<UniqueObjectDuplication>, DefaultIgnore;
+def warn_possible_object_duplication_mutable
+    : Warning<"%0 may be duplicated when built into a shared library: "
+              "it is mutable, with external linkage and "
+              "%select{hidden visibility|no import/export annotation}1">,
+      InGroup<UniqueObjectDuplication>,
+      DefaultIgnore;
+def warn_possible_object_duplication_init
+    : Warning<"initialization of %0 may run twice when built into a shared "
+              "library: "
+              "it has external linkage and "
+              "%select{hidden visibility|no import/export annotation}1">,
+      InGroup<UniqueObjectDuplication>,
+      DefaultIgnore;
 
 def ext_redefinition_of_typedef : ExtWarn<
   "redefinition of typedef %0 is a C11 feature">,
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index c152f406b4977..5cffd82e3372e 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13518,8 +13518,28 @@ bool Sema::GloballyUniqueObjectMightBeAccidentallyDuplicated(
 
   // If the object isn't hidden, the dynamic linker will prevent duplication.
   clang::LinkageInfo Lnk = Target->getLinkageAndVisibility();
-  if (Lnk.getVisibility() != HiddenVisibility)
+
+  // The target is "hidden" (from the dynamic linker) if:
+  // 1. On posix, it has hidden visibility, or
+  // 2. On windows, it has no import/export annotation
+  if (Context.getTargetInfo().shouldDLLImportComdatSymbols()) {
+    if (Target->hasAttr<DLLExportAttr>() || Target->hasAttr<DLLImportAttr>())
+      return false;
+
+    // If the variable isn't directly annotated, check to see if it's a member
+    // of an annotated class.
+    const VarDecl *VD = dyn_cast<VarDecl>(Target);
+
+    if (VD && VD->isStaticDataMember()) {
+      const CXXRecordDecl *Ctx = dyn_cast<CXXRecordDecl>(VD->getDeclContext());
+      if (Ctx &&
+          (Ctx->hasAttr<DLLExportAttr>() || Ctx->hasAttr<DLLImportAttr>()))
+        return false;
+    }
+  } else if (Lnk.getVisibility() != HiddenVisibility) {
+    // Posix case
     return false;
+  }
 
   // If the obj doesn't have external linkage, it's supposed to be duplicated.
   if (!isExternalFormalLinkage(Lnk.getLinkage()))
@@ -13550,19 +13570,16 @@ void Sema::DiagnoseUniqueObjectDuplication(const VarDecl *VD) {
   // duplicated when built into a shared library, which causes problems if it's
   // mutable (since the copies won't be in sync) or its initialization has side
   // effects (since it will run once per copy instead of once globally).
-  // FIXME: Windows uses dllexport/dllimport instead of visibility, and we don't
-  // handle that yet. Disable the warning on Windows for now.
 
   // Don't diagnose if we're inside a template, because it's not practical to
   // fix the warning in most cases.
-  if (!Context.getTargetInfo().shouldDLLImportComdatSymbols() &&
-      !VD->isTemplated() &&
+  if (!VD->isTemplated() &&
       GloballyUniqueObjectMightBeAccidentallyDuplicated(VD)) {
 
     QualType Type = VD->getType();
     if (looksMutable(Type, VD->getASTContext())) {
       Diag(VD->getLocation(), diag::warn_possible_object_duplication_mutable)
-          << VD;
+          << VD << Context.getTargetInfo().shouldDLLImportComdatSymbols();
     }
 
     // To keep false positives low, only warn if we're certain that the
@@ -13575,7 +13592,7 @@ void Sema::DiagnoseUniqueObjectDuplication(const VarDecl *VD) {
                              /*IncludePossibleEffects=*/false) &&
         !isa<CXXNewExpr>(Init->IgnoreParenImpCasts())) {
       Diag(Init->getExprLoc(), diag::warn_possible_object_duplication_init)
-          << VD;
+          << VD << Context.getTargetInfo().shouldDLLImportComdatSymbols();
     }
   }
 }
diff --git a/clang/test/SemaCXX/unique_object_duplication.cpp b/clang/test/SemaCXX/unique_object_duplication.cpp
index 4b41bfbfdc2f7..ff3b85d19fa67 100644
--- a/clang/test/SemaCXX/unique_object_duplication.cpp
+++ b/clang/test/SemaCXX/unique_object_duplication.cpp
@@ -1,7 +1,9 @@
-// RUN: %clang_cc1 -fsyntax-only -verify=hidden -Wunique-object-duplication -fvisibility=hidden -Wno-unused-value %s
-// RUN: %clang_cc1 -fsyntax-only -verify -Wunique-object-duplication -Wno-unused-value %s
-// The check is currently disabled on windows in MSVC-like environments. The test should fail because we're not getting the expected warnings.
-// XFAIL: target={{.*}}-windows-msvc, {{.*}}-ps{{(4|5)(-.+)?}}
+// RUN: %clang_cc1 -fsyntax-only -Wunique-object-duplication -Wno-unused-value \
+// RUN:   -verify -triple=x86_64-pc-linux-gnu %s
+// RUN: %clang_cc1 -fsyntax-only -Wunique-object-duplication -Wno-unused-value \
+// RUN:   -verify=hidden -triple=x86_64-pc-linux-gnu -fvisibility=hidden  %s
+// RUN: %clang_cc1 -fsyntax-only -Wunique-object-duplication -Wno-unused-value \
+// RUN:   -verify=windows -triple=x86_64-windows-msvc -DWINDOWS_TEST -fdeclspec %s
 
 #include "unique_object_duplication.h"
 
diff --git a/clang/test/SemaCXX/unique_object_duplication.h b/clang/test/SemaCXX/unique_object_duplication.h
index 537429d9ebdaa..bd0ee6bd14d64 100644
--- a/clang/test/SemaCXX/unique_object_duplication.h
+++ b/clang/test/SemaCXX/unique_object_duplication.h
@@ -3,8 +3,14 @@
  * See the warning's documentation for more information.
  */
 
+#ifdef WINDOWS_TEST
+#define HIDDEN
+// dllimport also suffices for visibility, but those can't have definitions
+#define VISIBLE __declspec(dllexport)
+#else
 #define HIDDEN __attribute__((visibility("hidden")))
-#define DEFAULT __attribute__((visibility("default")))
+#define VISIBLE __attribute__((visibility("default")))
+#endif
 
 // Helper functions
 constexpr int init_constexpr(int x) { return x; };
@@ -17,10 +23,11 @@ namespace StaticLocalTest {
 
 inline void has_static_locals_external() {
   // Mutable
-  static int disallowedStatic1 = 0; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  static int disallowedStatic1 = 0; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                    // windows-warning@-1 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
   // Initialization might run more than once
-  static const double disallowedStatic2 = disallowedStatic1++; // hidden-warning {{initialization of 'disallowedStatic2' may run twice when built into a shared library: it has hidden visibility and external linkage}}
-  
+  static const double disallowedStatic2 = disallowedStatic1++; // hidden-warning {{initialization of 'disallowedStatic2' may run twice when built into a shared library: it has external linkage and hidden visibility}}
+                                                               // windows-warning@-1 {{initialization of 'disallowedStatic2' may run twice when built into a shared library: it has external linkage and no import/export annotation}}
   // OK, because immutable and compile-time-initialized
   static constexpr int allowedStatic1 = 0;
   static const float allowedStatic2 = 1;
@@ -53,29 +60,33 @@ void has_static_locals_anon() {
   static double allowedStatic2 = init_dynamic(2);
   static char allowedStatic3 = []() { return allowedStatic1++; }();
   static constexpr int allowedStatic4 = init_constexpr(3);
-} 
+}
 
 } // Anonymous namespace
 
 HIDDEN inline void static_local_always_hidden() {
-    static int disallowedStatic1 = 3; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-                                      // expected-warning@-1 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+    static int disallowedStatic1 = 3; // hidden-warning {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                      // expected-warning@-1 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                      // windows-warning@-2 {{'disallowedStatic1' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
     {
-      static int disallowedStatic2 = 3; // hidden-warning {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-                                        // expected-warning@-1 {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+      static int disallowedStatic2 = 3; // hidden-warning {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                        // expected-warning@-1 {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                        // windows-warning@-2 {{'disallowedStatic2' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
     }
 
     auto lmb = []() {
-      static int disallowedStatic3 = 3; // hidden-warning {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-                                        // expected-warning@-1 {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+      static int disallowedStatic3 = 3; // hidden-warning {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                        // expected-warning@-1 {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                        // windows-warning@-2 {{'disallowedStatic3' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
     };
 }
 
-DEFAULT void static_local_never_hidden() {
-    static int allowedStatic1 = 3; 
+// Always visible
+VISIBLE void static_local_never_hidden() {
+    static int allowedStatic1 = 3;
 
     {
-      static int allowedStatic2 = 3; 
+      static int allowedStatic2 = 3;
     }
 
     auto lmb = []() {
@@ -96,7 +107,8 @@ inline void has_regular_local() {
 
 inline void has_thread_local() {
   // thread_local variables are static by default
-  thread_local int disallowedThreadLocal = 0; // hidden-warning {{'disallowedThreadLocal' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  thread_local int disallowedThreadLocal = 0; // hidden-warning {{'disallowedThreadLocal' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                              // windows-warning@-1 {{'disallowedThreadLocal' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
 }
 
 // Functions themselves are always immutable, so referencing them is okay
@@ -109,11 +121,13 @@ inline auto& allowedFunctionReference = has_static_locals_external;
  ******************************************************************************/
 namespace GlobalTest {
   // Mutable
-  inline float disallowedGlobal1 = 3.14; // hidden-warning {{'disallowedGlobal1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-  
-  // Initialization might run more than once
-  inline const double disallowedGlobal5 = disallowedGlobal1++; // hidden-warning {{initialization of 'disallowedGlobal5' may run twice when built into a shared library: it has hidden visibility and external linkage}}
+  inline float disallowedGlobal1 = 3.14; // hidden-warning {{'disallowedGlobal1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                         // windows-warning@-1 {{'disallowedGlobal1' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
+
 
+  // Initialization might run more than once
+  inline const double disallowedGlobal5 = disallowedGlobal1++; // hidden-warning {{initialization of 'disallowedGlobal5' may run twice when built into a shared library: it has external linkage and hidden visibility}}
+                                                               // windows-warning@-1 {{initialization of 'disallowedGlobal5' may run twice when built into a shared library: it has external linkage and no import/export annotation}}
   // OK because internal linkage, so duplication is intended
   static float allowedGlobal1 = 3.14;
   const double allowedGlobal2 = init_dynamic(2);
@@ -129,34 +143,52 @@ namespace GlobalTest {
   // We don't warn on this because non-inline variables can't (legally) appear
   // in more than one TU.
   float allowedGlobal9 = 3.14;
-  
+
   // Pointers need to be double-const-qualified
-  inline float& nonConstReference = disallowedGlobal1; // hidden-warning {{'nonConstReference' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline float& nonConstReference = disallowedGlobal1; // hidden-warning {{'nonConstReference' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                       // windows-warning@-1 {{'nonConstReference' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
   const inline int& constReference = allowedGlobal5;
 
-  inline int* nonConstPointerToNonConst = nullptr; // hidden-warning {{'nonConstPointerToNonConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-  inline int const* nonConstPointerToConst = nullptr; // hidden-warning {{'nonConstPointerToConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
-  inline int* const constPointerToNonConst = nullptr; // hidden-warning {{'constPointerToNonConst' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline int* nonConstPointerToNonConst = nullptr; // hidden-warning {{'nonConstPointerToNonConst' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                   // windows-warning@-1 {{'nonConstPointerToNonConst' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
+  inline int const* nonConstPointerToConst = nullptr; // hidden-warning {{'nonConstPointerToConst' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                      // windows-warning@-1 {{'nonConstPointerToConst' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
+  inline int* const constPointerToNonConst = nullptr; // hidden-warning {{'constPointerToNonConst' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                      // windows-warning@-1 {{'constPointerToNonConst' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
   inline int const* const constPointerToConst = nullptr;
   // Don't warn on new because it tends to generate false positives
   inline int const* const constPointerToConstNew = new int(7);
 
   inline int const * const * const * const nestedConstPointer = nullptr;
-  inline int const * const ** const * const nestedNonConstPointer = nullptr; // hidden-warning {{'nestedNonConstPointer' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline int const * const ** const * const nestedNonConstPointer = nullptr; // hidden-warning {{'nestedNonConstPointer' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                                             // windows-warning@-1 {{'nestedNonConstPointer' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
 
   struct Test {
-    static inline float disallowedStaticMember1; // hidden-warning {{'disallowedStaticMember1' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}       
+    static inline float disallowedStaticMember1; // hidden-warning {{'disallowedStaticMember1' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                 // windows-warning@-1 {{'disallowedStaticMember1' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
     // Defined below, in the header file
-    static float disallowedStaticMember2;                                       
+    static float disallowedStaticMember2;
     // Defined in the cpp file, so won't get duplicated
     static float allowedStaticMember1;
 
+    // Always visible
+    VISIBLE static inline float allowedStaticMember2 = 0.0;
+
     // Tests here are sparse because the AddrTest case below will define plenty
     // more, which aren't problematic to define (because they're immutable), but
     // may still cause problems if their address is taken.
   };
 
-  inline float Test::disallowedStaticMember2 = 2.3; // hidden-warning {{'disallowedStaticMember2' may be duplicated when built into a shared library: it is mutable, has hidden visibility, and external linkage}}
+  inline float Test::disallowedStaticMember2 = 2.3; // hidden-warning {{'disallowedStaticMember2' may be duplicated when built into a shared library: it is mutable, with external linkage and hidden visibility}}
+                                                    // windows-warning@-1 {{'disallowedStaticMember2' may be duplicated when built into a shared library: it is mutable, with external linkage and no import/export annotation}}
+
+  // This is always visible, so nothing inside it will get duplicated
+  struct VISIBLE NeverHidden {
+    static inline float allowedStaticMember3;
+    static float allowedStaticMember4;
+  };
+
+  inline float NeverHidden::allowedStaticMember4 = 3.4;
 } // namespace GlobalTest
 
 /******************************************************************************
@@ -165,7 +197,7 @@ namespace GlobalTest {
 
 namespace TemplateTest {
 
-// We never warn inside templates because it's frequently infeasible to actually
+// We never warn inside templates because it's usually infeasible to actually
 // fix the warning.
 
 template <typename T>

From cf6ae065a042aae6324b28e99628c40bc53be0b7 Mon Sep 17 00:00:00 2001
From: nicebert <110385235+nicebert@users.noreply.github.com>
Date: Fri, 13 Jun 2025 16:46:36 +0200
Subject: [PATCH 379/851] [OpenMP] Remove declaration and usage of
 __AMDGCN_WAVEFRONT_SIZE (#143761)

Removes usage of __AMDGCN_WAVEFRONT_SIZE as compile time constant.

---------

Co-authored-by: Shilei Tian <i@tianshilei.me>
---
 openmp/runtime/src/include/ompx.h.var | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 623f0b9c315bd..6884745f4240c 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -9,13 +9,21 @@
 #ifndef __OMPX_H
 #define __OMPX_H
 
-#ifdef __AMDGCN_WAVEFRONT_SIZE
-#define __WARP_SIZE __AMDGCN_WAVEFRONT_SIZE
-#else
-#define __WARP_SIZE 32
+#if (defined(__NVPTX__) || defined(__AMDGPU__))
+#include <gpuintrin.h>
+#define __OMPX_TARGET_IS_GPU
 #endif
 
 typedef unsigned long uint64_t;
+typedef unsigned int uint32_t;
+
+static inline uint32_t __warpSize(void) {
+#ifdef __OMPX_TARGET_IS_GPU
+  return __gpu_num_lanes();
+#else
+  __builtin_trap();
+#endif
+}
 
 #ifdef __cplusplus
 extern "C" {
@@ -212,7 +220,7 @@ static inline uint64_t ballot_sync(uint64_t mask, int pred) {
 ///{
 #define _TGT_KERNEL_LANGUAGE_SHFL_DOWN_SYNC(TYPE, TY)                          \
   static inline TYPE shfl_down_sync(uint64_t mask, TYPE var, unsigned delta,   \
-                                    int width = __WARP_SIZE) {                 \
+                                    int width = __warpSize()) {                \
     return ompx_shfl_down_sync_##TY(mask, var, delta, width);                  \
   }
 

From ebd7f7539b1c2bc7d5e391bbb00cb56dc245b2dd Mon Sep 17 00:00:00 2001
From: Orlando Cazalet-Hyams <orlando.hyams@sony.com>
Date: Fri, 13 Jun 2025 13:26:29 +0100
Subject: [PATCH 380/851] [KeyInstr][NFC] Fix incorrect atomGroup/rank uint
 size in computeKeyInstructions

---
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 5fb74a016a75e..0edfca78b0886 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2368,8 +2368,8 @@ void DwarfDebug::computeKeyInstructions(const MachineFunction *MF) {
   // Map {(InlinedAt, Group): (Rank, Instructions)}.
   // NOTE: Anecdotally, for a large C++ blob, 99% of the instruction
   // SmallVectors contain 2 or fewer elements; use 2 inline elements.
-  DenseMap<std::pair<DILocation *, uint32_t>,
-           std::pair<uint16_t, SmallVector<const MachineInstr *, 2>>>
+  DenseMap<std::pair<DILocation *, uint64_t>,
+           std::pair<uint8_t, SmallVector<const MachineInstr *, 2>>>
       GroupCandidates;
 
   // For each instruction:

From 9e622986526a35f3f8bc60a7fc756b5c7bf825c0 Mon Sep 17 00:00:00 2001
From: Darren Wihandi <65404740+fairywreath@users.noreply.github.com>
Date: Fri, 13 Jun 2025 11:06:31 -0400
Subject: [PATCH 381/851] [mlir][spirv] Fix FuncOpVectorUnroll to process
 placeholder values in all blocks (#142339)

`FuncOpVectorUnroll` contains logic that replaces function arguments by
placeholders values. These replacements also involve changing all
instructions in the function that use the arguments to use these
placeholders. These placeholder values will later be changed back to use
the function arguments (either new or original if already legal).

The current implementation however only replaces back (the second
replacement, i.e. replacing the placeholder values to new/legal
arguments) the first block of instructions and not all of the blocks.
This may leave some instructions to use these placeholder values (which
for already legal arguments are just zeroattr values that will get
DCE'd) instead of the arguments, which is incorrect.

Closes #132158.
---
 .../SPIRV/Transforms/SPIRVConversion.cpp      | 26 +++----
 .../func-signature-vector-unroll.mlir         | 73 +++++++++++++++++++
 2 files changed, 86 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
index 62a24646d0662..f5a58c58e05df 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
@@ -1020,22 +1020,22 @@ struct FuncOpVectorUnroll final : OpRewritePattern<func::FuncOp> {
     SmallVector<Location> locs(convertedTypes.size(), newFuncOp.getLoc());
     entryBlock.addArguments(convertedTypes, locs);
 
-    // Replace the placeholder values with the new arguments. We assume there is
-    // only one block for now.
+    // Replace all uses of placeholders for initially legal arguments with their
+    // original function arguments (that were added to `newFuncOp`).
+    for (auto &[placeholderOp, argIdx] : tmpOps) {
+      if (!placeholderOp)
+        continue;
+      Value replacement = newFuncOp.getArgument(argIdx);
+      rewriter.replaceAllUsesWith(placeholderOp->getResult(0), replacement);
+    }
+
+    // Replace dummy operands of new `vector.insert_strided_slice` ops with
+    // their corresponding new function arguments. The new
+    // `vector.insert_strided_slice` ops are inserted only into the entry block,
+    // so iterating over that block is sufficient.
     size_t unrolledInputIdx = 0;
     for (auto [count, op] : enumerate(entryBlock.getOperations())) {
-      // We first look for operands that are placeholders for initially legal
-      // arguments.
       Operation &curOp = op;
-      for (auto [operandIdx, operandVal] : llvm::enumerate(op.getOperands())) {
-        Operation *operandOp = operandVal.getDefiningOp();
-        if (auto it = tmpOps.find(operandOp); it != tmpOps.end()) {
-          size_t idx = operandIdx;
-          rewriter.modifyOpInPlace(&curOp, [&curOp, &newFuncOp, it, idx] {
-            curOp.setOperand(idx, newFuncOp.getArgument(it->second));
-          });
-        }
-      }
       // Since all newly created operations are in the beginning, reaching the
       // end of them means that any later `vector.insert_strided_slice` should
       // not be touched.
diff --git a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
index c018ccb924983..211d6c90243bd 100644
--- a/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
+++ b/mlir/test/Conversion/ConvertToSPIRV/func-signature-vector-unroll.mlir
@@ -189,3 +189,76 @@ func.func @unsupported_scalable(%arg0 : vector<[8]xi32>) -> (vector<[8]xi32>) {
   return %arg0 : vector<[8]xi32>
 }
 
+// -----
+
+// Check that already legal function parameters are properly preserved across multiple blocks.
+
+// CHECK-LABEL: func.func @legal_params_multiple_blocks_simple
+// CHECK-SAME: (%[[ARG0:.+]]: i32, %[[ARG1:.+]]: i32) -> i32
+func.func @legal_params_multiple_blocks_simple(%arg0: i32, %arg1: i32) -> i32 {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ADD0]], %[[ARG1]] : i32
+  // CHECK: return %[[ADD1]] : i32
+  cf.br ^bb1(%arg0 : i32)
+^bb1(%acc0: i32):
+  %acc1_val = arith.addi %acc0, %arg1 : i32
+  cf.br ^bb2(%acc1_val : i32)
+^bb2(%acc1: i32):
+  %acc2_val = arith.addi %acc1, %arg1 : i32
+  cf.br ^bb3(%acc2_val : i32)
+^bb3(%acc_final: i32):
+  return %acc_final : i32
+}
+
+// -----
+
+// Check that legal parameters and existing `vector.insert_strided_slice`s are properly preserved across multiple blocks.
+
+// CHECK-LABEL: func.func @legal_params_with_vec_insert_multiple_blocks
+// CHECK-SAME: (%[[ARG0:.+]]: i32, %[[ARG1:.+]]: i32, %[[ARG2:.+]]: vector<4xi32>) -> vector<4xi32>
+func.func @legal_params_with_vec_insert_multiple_blocks(%arg0: i32, %arg1: i32, %arg2: vector<4xi32>) -> vector<4xi32> {
+  // CHECK: %[[ADD0:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+  // CHECK: %[[ADD1:.*]] = arith.addi %[[ADD0]], %[[ARG1]] : i32
+  // CHECK: %[[VEC1D:.*]] = vector.broadcast %[[ADD1]] : i32 to vector<1xi32>
+  // CHECK: %[[VEC0:.*]] = vector.insert_strided_slice %[[VEC1D]], %[[ARG2]] {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
+  // CHECK: %[[VEC1:.*]] = vector.insert_strided_slice %[[VEC1D]], %[[VEC0]] {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
+  // CHECK: %[[RESULT:.*]] = vector.insert_strided_slice %[[VEC1D]], %[[VEC1]] {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
+  // CHECK: return %[[RESULT]] : vector<4xi32>
+  cf.br ^bb1(%arg0 : i32)
+^bb1(%acc0: i32):
+  %acc1_val = arith.addi %acc0, %arg1 : i32
+  cf.br ^bb2(%acc1_val : i32)
+^bb2(%acc1: i32):
+  %acc2_val = arith.addi %acc1, %arg1 : i32
+  cf.br ^bb3(%acc2_val : i32)
+^bb3(%acc_final: i32):
+  %scalar_vec = vector.broadcast %acc_final : i32 to vector<1xi32>
+  %vec0 = vector.insert_strided_slice %scalar_vec, %arg2 {offsets = [1], strides = [1]} : vector<1xi32> into vector<4xi32>
+  %vec1 = vector.insert_strided_slice %scalar_vec, %vec0 {offsets = [2], strides = [1]} : vector<1xi32> into vector<4xi32>
+  %result = vector.insert_strided_slice %scalar_vec, %vec1 {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32>
+  return %result : vector<4xi32>
+}
+
+// -----
+
+// Check that already legal function parameters are preserved across a loop (which contains multiple blocks).
+
+// CHECK-LABEL: @legal_params_for_loop
+// CHECK-SAME: (%[[ARG0:.+]]: i32, %[[ARG1:.+]]: i32, %[[ARG2:.+]]: i32)
+func.func @legal_params_for_loop(%arg0: i32, %arg1: i32, %arg2: i32) -> i32 {
+  // CHECK: %[[CST0:.*]] = arith.constant 0 : index
+  // CHECK: %[[CST1:.*]] = arith.constant 1 : index
+  // CHECK: %[[UB:.*]] = arith.index_cast %[[ARG2]] : i32 to index
+  // CHECK: %[[RESULT:.*]] = scf.for %[[STEP:.*]] = %[[CST0]] to %[[UB]] step %[[CST1]] iter_args(%[[ACC:.*]] = %[[ARG0]]) -> (i32) {
+  // CHECK:   %[[ADD:.*]] = arith.addi %[[ACC]], %[[ARG1]] : i32
+  // CHECK:   scf.yield %[[ADD]] : i32
+  // CHECK: return %[[RESULT]] : i32
+  %zero = arith.constant 0 : index
+  %one = arith.constant 1 : index
+  %ub = arith.index_cast %arg2 : i32 to index
+  %result = scf.for %i = %zero to %ub step %one iter_args(%acc = %arg0) -> (i32) {
+    %new_acc = arith.addi %acc, %arg1 : i32
+    scf.yield %new_acc : i32
+  }
+  return %result : i32
+}

From bcfbba12e6754e0a2a5a1c8e3aac3a24316bba2d Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 08:11:20 -0700
Subject: [PATCH 382/851] [llvm] Compare std::optional<T> to values directly
 (NFC) (#143913)

This patch transforms:

  X && *X == Y

to:

  X == Y

where X is of std::optional<T>, and Y is of T or similar.
---
 llvm/lib/Analysis/ConstantFolding.cpp | 4 ++--
 llvm/lib/IR/Attributes.cpp            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 139a0b81e299b..64a0f4641250c 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -2132,7 +2132,7 @@ static bool mayFoldConstrained(ConstrainedFPIntrinsic *CI,
 
   // If evaluation raised FP exception, the result can depend on rounding
   // mode. If the latter is unknown, folding is not possible.
-  if (ORM && *ORM == RoundingMode::Dynamic)
+  if (ORM == RoundingMode::Dynamic)
     return false;
 
   // If FP exceptions are ignored, fold the call, even if such exception is
@@ -2418,7 +2418,7 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
         if (IntrinsicID == Intrinsic::experimental_constrained_rint &&
             St == APFloat::opInexact) {
           std::optional<fp::ExceptionBehavior> EB = CI->getExceptionBehavior();
-          if (EB && *EB == fp::ebStrict)
+          if (EB == fp::ebStrict)
             return nullptr;
         }
       } else if (U.isSignaling()) {
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index ed485f9656996..bfb32ff9995d1 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -295,7 +295,7 @@ Attribute Attribute::getWithCaptureInfo(LLVMContext &Context, CaptureInfo CI) {
 Attribute
 Attribute::getWithAllocSizeArgs(LLVMContext &Context, unsigned ElemSizeArg,
                                 const std::optional<unsigned> &NumElemsArg) {
-  assert(!(ElemSizeArg == 0 && NumElemsArg && *NumElemsArg == 0) &&
+  assert(!(ElemSizeArg == 0 && NumElemsArg == 0) &&
          "Invalid allocsize arguments -- given allocsize(0, 0)");
   return get(Context, AllocSize, packAllocSizeArgs(ElemSizeArg, NumElemsArg));
 }

From 6751b3a549ebef78a7e75b100d61742c20945592 Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Fri, 13 Jun 2025 16:16:09 +0100
Subject: [PATCH 383/851] Revert "[lit] cleanup unused imports" (#144054)

Reverts llvm/llvm-project#143930 as it causes build failures:
https://github.com/llvm/llvm-project/pull/143930#issuecomment-2969115461
---
 lld/test/Unit/lit.cfg.py         | 1 +
 lldb/test/API/lit.cfg.py         | 2 ++
 lldb/test/Shell/lit.cfg.py       | 5 ++++-
 lldb/test/lit.cfg.py             | 3 +++
 llvm/utils/lit/lit/LitConfig.py  | 6 ++++--
 llvm/utils/lit/lit/TestRunner.py | 6 ++++++
 llvm/utils/lit/lit/discovery.py  | 2 +-
 llvm/utils/lit/lit/worker.py     | 2 ++
 8 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/lld/test/Unit/lit.cfg.py b/lld/test/Unit/lit.cfg.py
index 47375db517e96..1cf890a05cb28 100644
--- a/lld/test/Unit/lit.cfg.py
+++ b/lld/test/Unit/lit.cfg.py
@@ -3,6 +3,7 @@
 # Configuration file for the 'lit' test runner.
 
 import os
+import subprocess
 
 import lit.formats
 
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 04b360e8d3307..646a446c86fdb 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -9,6 +9,8 @@
 import subprocess
 import sys
 
+import lit.formats
+
 # name: The name of this test suite.
 config.name = "lldb-api"
 
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index 6f0e017fb7cb9..ab6113767187a 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -7,9 +7,12 @@
 import shutil
 import site
 import subprocess
+import sys
 
-import lit.util
+import lit.formats
 from lit.llvm import llvm_config
+from lit.llvm.subst import FindTool
+from lit.llvm.subst import ToolSubst
 
 site.addsitedir(os.path.dirname(__file__))
 from helper import toolchain
diff --git a/lldb/test/lit.cfg.py b/lldb/test/lit.cfg.py
index 6a4255c2627d9..eefc32aabd16d 100644
--- a/lldb/test/lit.cfg.py
+++ b/lldb/test/lit.cfg.py
@@ -2,6 +2,9 @@
 
 import os
 
+import lit.formats
+from lit.llvm import llvm_config
+
 # This is the top level configuration. Most of these configuration options will
 # be overriden by individual lit configuration files in the test
 # subdirectories. Anything configured here will *not* be loaded when pointing
diff --git a/llvm/utils/lit/lit/LitConfig.py b/llvm/utils/lit/lit/LitConfig.py
index 5bb2d3c5c986c..cb4aef6f72a87 100644
--- a/llvm/utils/lit/lit/LitConfig.py
+++ b/llvm/utils/lit/lit/LitConfig.py
@@ -1,12 +1,14 @@
 from __future__ import absolute_import
-
 import inspect
 import os
+import platform
 import sys
 
+import lit.Test
+import lit.formats
+import lit.TestingConfig
 import lit.util
 
-
 # LitConfig must be a new style class for properties to work
 class LitConfig(object):
     """LitConfig - Configuration data for a 'lit' test runner instance, shared
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 1d3bf8e4e8df1..73db67aedb739 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -1,4 +1,7 @@
 from __future__ import absolute_import
+import errno
+import io
+import itertools
 import getopt
 import os, signal, subprocess, sys
 import re
@@ -9,8 +12,11 @@
 import shutil
 import tempfile
 import threading
+import typing
 from typing import Optional, Tuple
 
+import io
+
 try:
     from StringIO import StringIO
 except ImportError:
diff --git a/llvm/utils/lit/lit/discovery.py b/llvm/utils/lit/lit/discovery.py
index 2e93bacc12368..2e7f90c6bb0c9 100644
--- a/llvm/utils/lit/lit/discovery.py
+++ b/llvm/utils/lit/lit/discovery.py
@@ -6,8 +6,8 @@
 import os
 import sys
 
-from lit import Test, util
 from lit.TestingConfig import TestingConfig
+from lit import LitConfig, Test, util
 
 
 def chooseConfigFileFromDir(dir, config_names):
diff --git a/llvm/utils/lit/lit/worker.py b/llvm/utils/lit/lit/worker.py
index dbc3ab53bc627..8e78bfd45d38b 100644
--- a/llvm/utils/lit/lit/worker.py
+++ b/llvm/utils/lit/lit/worker.py
@@ -12,6 +12,8 @@
 import traceback
 
 import lit.Test
+import lit.util
+
 
 _lit_config = None
 _parallelism_semaphores = None

From 3ea45a65edb2f033e59a12f71a8241f220791ac8 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Fri, 13 Jun 2025 16:18:54 +0100
Subject: [PATCH 384/851] [AArch64] Add fixed-length SVE USDOT support
 (#143730)

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  11 +
 .../sve-fixed-length-partial-reduce.ll        | 230 +++++++++++++++++-
 2 files changed, 238 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 781a1281db402..7519ac5260a64 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2272,6 +2272,17 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
       setPartialReduceMLAAction(MLAOps, VT,
                                 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
     }
+
+    if (Subtarget->hasMatMulInt8()) {
+      if (VT.getVectorElementType() == MVT::i32)
+        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
+                                  MVT::getVectorVT(MVT::i8, NumElts * 4),
+                                  Custom);
+      else if (VT.getVectorElementType() == MVT::i64)
+        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
+                                  MVT::getVectorVT(MVT::i8, NumElts * 8),
+                                  Custom);
+    }
   }
 
   // Lower fixed length vector operations to scalable equivalents.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
index 79d766d1b9908..af813ff16a202 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+dotprod -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,NEON
-; RUN: llc -mattr=+sve,+dotprod -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,SVE
-; RUN: llc -mattr=+sme -aarch64-enable-partial-reduce-nodes=true -force-streaming < %s | FileCheck %s --check-prefix=SME
+; RUN: llc -mattr=+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,NEON
+; RUN: llc -mattr=+sve,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,SVE
+; RUN: llc -mattr=+sme,+i8mm -aarch64-enable-partial-reduce-nodes=true -force-streaming < %s | FileCheck %s --check-prefix=SME
 
 target triple = "aarch64"
 
@@ -407,6 +407,154 @@ define <4 x i32> @four_way_i8_i32_vl128(ptr %accptr, ptr %uptr, ptr %sptr) {
   ret <4 x i32> %partial.reduce
 }
 
+define <4 x i32> @four_way_i8_i32_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; COMMON-LABEL: four_way_i8_i32_vl128_usdot:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    ldr q0, [x0]
+; COMMON-NEXT:    ldr q1, [x1]
+; COMMON-NEXT:    ldr q2, [x2]
+; COMMON-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; COMMON-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i32_vl128_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    ldr q0, [x0]
+; SME-NEXT:    ldr q1, [x1]
+; SME-NEXT:    ldr q2, [x2]
+; SME-NEXT:    usdot z0.s, z1.b, z2.b
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    ret
+  %acc = load <4 x i32>, ptr %accptr
+  %u = load <16 x i8>, ptr %uptr
+  %s = load <16 x i8>, ptr %sptr
+  %u.wide = zext <16 x i8> %u to <16 x i32>
+  %s.wide = sext <16 x i8> %s to <16 x i32>
+  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult)
+  ret <4 x i32> %partial.reduce
+}
+
+define <4 x i32> @four_way_i8_i32_vl128_sudot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; COMMON-LABEL: four_way_i8_i32_vl128_sudot:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    ldr q0, [x0]
+; COMMON-NEXT:    ldr q1, [x1]
+; COMMON-NEXT:    ldr q2, [x2]
+; COMMON-NEXT:    usdot v0.4s, v2.16b, v1.16b
+; COMMON-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i32_vl128_sudot:
+; SME:       // %bb.0:
+; SME-NEXT:    ldr q0, [x0]
+; SME-NEXT:    ldr q1, [x1]
+; SME-NEXT:    ldr q2, [x2]
+; SME-NEXT:    usdot z0.s, z2.b, z1.b
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    ret
+  %acc = load <4 x i32>, ptr %accptr
+  %u = load <16 x i8>, ptr %uptr
+  %s = load <16 x i8>, ptr %sptr
+  %u.wide = sext <16 x i8> %u to <16 x i32>
+  %s.wide = zext <16 x i8> %s to <16 x i32>
+  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %acc, <16 x i32> %mult)
+  ret <4 x i32> %partial.reduce
+}
+
+define <2 x i64> @four_way_i8_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; NEON-LABEL: four_way_i8_i64_vl128_usdot:
+; NEON:       // %bb.0:
+; NEON-NEXT:    movi v0.2d, #0000000000000000
+; NEON-NEXT:    ldr q1, [x1]
+; NEON-NEXT:    ldr q2, [x2]
+; NEON-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; NEON-NEXT:    ldr q1, [x0]
+; NEON-NEXT:    saddw v1.2d, v1.2d, v0.2s
+; NEON-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: four_way_i8_i64_vl128_usdot:
+; SVE:       // %bb.0:
+; SVE-NEXT:    movi v0.2d, #0000000000000000
+; SVE-NEXT:    ldr q1, [x1]
+; SVE-NEXT:    ldr q2, [x2]
+; SVE-NEXT:    usdot z0.s, z1.b, z2.b
+; SVE-NEXT:    ldr q2, [x0]
+; SVE-NEXT:    sunpklo z1.d, z0.s
+; SVE-NEXT:    sunpkhi z0.d, z0.s
+; SVE-NEXT:    add z1.d, z2.d, z1.d
+; SVE-NEXT:    add z0.d, z1.d, z0.d
+; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i64_vl128_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    mov z0.s, #0 // =0x0
+; SME-NEXT:    ldr q1, [x1]
+; SME-NEXT:    ldr q2, [x2]
+; SME-NEXT:    usdot z0.s, z1.b, z2.b
+; SME-NEXT:    ldr q1, [x0]
+; SME-NEXT:    saddwb z1.d, z1.d, z0.s
+; SME-NEXT:    saddwt z0.d, z1.d, z0.s
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    ret
+  %acc = load <2 x i64>, ptr %accptr
+  %u = load <16 x i8>, ptr %uptr
+  %s = load <16 x i8>, ptr %sptr
+  %u.wide = zext <16 x i8> %u to <16 x i64>
+  %s.wide = sext <16 x i8> %s to <16 x i64>
+  %mult = mul nuw nsw <16 x i64> %s.wide, %u.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <16 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
+define <2 x i64> @four_way_i16_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+; COMMON-LABEL: four_way_i16_i64_vl128_usdot:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    ldr q1, [x1]
+; COMMON-NEXT:    ldr q2, [x2]
+; COMMON-NEXT:    ldr q0, [x0]
+; COMMON-NEXT:    ushll v3.4s, v1.4h, #0
+; COMMON-NEXT:    sshll v4.4s, v2.4h, #0
+; COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; COMMON-NEXT:    sshll2 v2.4s, v2.8h, #0
+; COMMON-NEXT:    smlal v0.2d, v4.2s, v3.2s
+; COMMON-NEXT:    smlal2 v0.2d, v4.4s, v3.4s
+; COMMON-NEXT:    smlal v0.2d, v2.2s, v1.2s
+; COMMON-NEXT:    smlal2 v0.2d, v2.4s, v1.4s
+; COMMON-NEXT:    ret
+;
+; SME-LABEL: four_way_i16_i64_vl128_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    ptrue p0.d, vl2
+; SME-NEXT:    ldr q2, [x0]
+; SME-NEXT:    mov x8, #2 // =0x2
+; SME-NEXT:    ld1h { z0.d }, p0/z, [x1]
+; SME-NEXT:    ld1sh { z1.d }, p0/z, [x2]
+; SME-NEXT:    mad z0.d, p0/m, z1.d, z2.d
+; SME-NEXT:    ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT:    ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT:    mov x8, #4 // =0x4
+; SME-NEXT:    mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT:    ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT:    ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT:    mov x8, #6 // =0x6
+; SME-NEXT:    mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT:    ld1h { z1.d }, p0/z, [x1, x8, lsl #1]
+; SME-NEXT:    ld1sh { z2.d }, p0/z, [x2, x8, lsl #1]
+; SME-NEXT:    mla z0.d, p0/m, z2.d, z1.d
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    ret
+  %acc = load <2 x i64>, ptr %accptr
+  %u = load <8 x i16>, ptr %uptr
+  %s = load <8 x i16>, ptr %sptr
+  %u.wide = zext <8 x i16> %u to <8 x i64>
+  %s.wide = sext <8 x i16> %s to <8 x i64>
+  %mult = mul nuw nsw <8 x i64> %s.wide, %u.wide
+  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %mult)
+  ret <2 x i64> %partial.reduce
+}
+
 define <8 x i32> @four_way_i8_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr %sptr) {
 ;
 ; COMMON-LABEL: four_way_i8_i32_vl128_double_width:
@@ -438,6 +586,37 @@ define <8 x i32> @four_way_i8_i32_vl128_double_width(ptr %accptr, ptr %uptr, ptr
   ret <8 x i32> %partial.reduce
 }
 
+define <8 x i32> @four_way_i8_i32_vl128_double_width_usdot(ptr %accptr, ptr %uptr, ptr %sptr) {
+;
+; COMMON-LABEL: four_way_i8_i32_vl128_double_width_usdot:
+; COMMON:       // %bb.0:
+; COMMON-NEXT:    ldp q0, q1, [x0]
+; COMMON-NEXT:    ldp q3, q2, [x1]
+; COMMON-NEXT:    ldp q5, q4, [x2]
+; COMMON-NEXT:    usdot v0.4s, v3.16b, v5.16b
+; COMMON-NEXT:    usdot v1.4s, v2.16b, v4.16b
+; COMMON-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i32_vl128_double_width_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    ldp q0, q1, [x0]
+; SME-NEXT:    ldp q3, q2, [x1]
+; SME-NEXT:    ldp q5, q4, [x2]
+; SME-NEXT:    usdot z0.s, z3.b, z5.b
+; SME-NEXT:    usdot z1.s, z2.b, z4.b
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
+; SME-NEXT:    ret
+  %acc = load <8 x i32>, ptr %accptr
+  %u = load <32 x i8>, ptr %uptr
+  %s = load <32 x i8>, ptr %sptr
+  %u.wide = zext <32 x i8> %u to <32 x i32>
+  %s.wide = sext <32 x i8> %s to <32 x i32>
+  %mult = mul nuw nsw <32 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult)
+  ret <8 x i32> %partial.reduce
+}
+
 define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
 ;
 ;
@@ -483,6 +662,51 @@ define <8 x i32> @four_way_i8_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal
   ret <8 x i32> %partial.reduce
 }
 
+define <8 x i32> @four_way_i8_i32_vl256_usdot(ptr %accptr, ptr %uptr, ptr %sptr) vscale_range(2,2) {
+;
+;
+; NEON-LABEL: four_way_i8_i32_vl256_usdot:
+; NEON:       // %bb.0:
+; NEON-NEXT:    ldp q0, q1, [x0]
+; NEON-NEXT:    ldp q3, q2, [x1]
+; NEON-NEXT:    ldp q5, q4, [x2]
+; NEON-NEXT:    usdot v0.4s, v3.16b, v5.16b
+; NEON-NEXT:    usdot v1.4s, v2.16b, v4.16b
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: four_way_i8_i32_vl256_usdot:
+; SVE:       // %bb.0:
+; SVE-NEXT:    ldr z0, [x0]
+; SVE-NEXT:    ldr z1, [x1]
+; SVE-NEXT:    ldr z2, [x2]
+; SVE-NEXT:    usdot z0.s, z1.b, z2.b
+; SVE-NEXT:    mov z1.d, z0.d
+; SVE-NEXT:    ext z1.b, z1.b, z0.b, #16
+; SVE-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SVE-NEXT:    // kill: def $q1 killed $q1 killed $z1
+; SVE-NEXT:    ret
+;
+; SME-LABEL: four_way_i8_i32_vl256_usdot:
+; SME:       // %bb.0:
+; SME-NEXT:    ldr z0, [x0]
+; SME-NEXT:    ldr z1, [x1]
+; SME-NEXT:    ldr z2, [x2]
+; SME-NEXT:    usdot z0.s, z1.b, z2.b
+; SME-NEXT:    mov z1.d, z0.d
+; SME-NEXT:    ext z1.b, z1.b, z0.b, #16
+; SME-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; SME-NEXT:    // kill: def $q1 killed $q1 killed $z1
+; SME-NEXT:    ret
+  %acc = load <8 x i32>, ptr %accptr
+  %u = load <32 x i8>, ptr %uptr
+  %s = load <32 x i8>, ptr %sptr
+  %u.wide = zext <32 x i8> %u to <32 x i32>
+  %s.wide = sext <32 x i8> %s to <32 x i32>
+  %mult = mul nuw nsw <32 x i32> %s.wide, %u.wide
+  %partial.reduce = tail call <8 x i32> @llvm.experimental.vector.partial.reduce.add(<8 x i32> %acc, <32 x i32> %mult)
+  ret <8 x i32> %partial.reduce
+}
+
 ;
 ; Four-way dot (i16 -> i64)
 ;

From eba63cd76f7ba7f9e9964b1263f76409d08fcd04 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Fri, 13 Jun 2025 10:29:31 -0500
Subject: [PATCH 385/851] [flang][OpenMP] Improve handling of REQUIRES
 ATOMIC_DEFAULT_MEM_ORDER (#143917)

According to OpenMP 5.0 rules, the ACQ_REL ordering coming from a
REQUIRES directive may need to be replaced with ACQUIRE or RELEASE
depending on the directive in the ATOMIC construct. This was not done,
leading to an incorrect "memory-order" clause appearing in the generated
HLFIR.

This may need to be relaxed a bit to fully comply with later spec
versions, that will be done in a future PR.
---
 flang/lib/Semantics/rewrite-directives.cpp    | 15 ++++++++++++-
 .../requires-atomic-default-mem-order.f90     | 22 +++++++++++++++++++
 .../Semantics/OpenMP/requires-atomic02.f90    |  8 +++----
 3 files changed, 40 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/requires-atomic-default-mem-order.f90

diff --git a/flang/lib/Semantics/rewrite-directives.cpp b/flang/lib/Semantics/rewrite-directives.cpp
index b4fef2c881b67..91b60ea151dee 100644
--- a/flang/lib/Semantics/rewrite-directives.cpp
+++ b/flang/lib/Semantics/rewrite-directives.cpp
@@ -112,9 +112,22 @@ bool OmpRewriteMutator::Pre(parser::OpenMPAtomicConstruct &x) {
 
   // Add a memory order clause to the atomic directive.
   atomicDirectiveDefaultOrderFound_ = true;
+  llvm::omp::Clause kind{x.GetKind()};
   switch (*defaultMemOrder) {
   case common::OmpMemoryOrderType::Acq_Rel:
-    clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::AcqRel{}});
+    // FIXME: Implement 5.0 rules, pending clarification on later spec
+    // versions.
+    // [5.0:62:22-26]
+    if (kind == llvm::omp::Clause::OMPC_read) {
+      clauseList->v.emplace_back(
+          parser::OmpClause{parser::OmpClause::Acquire{}});
+    } else if (kind == llvm::omp::Clause::OMPC_update && x.IsCapture()) {
+      clauseList->v.emplace_back(
+          parser::OmpClause{parser::OmpClause::AcqRel{}});
+    } else {
+      clauseList->v.emplace_back(
+          parser::OmpClause{parser::OmpClause::Release{}});
+    }
     break;
   case common::OmpMemoryOrderType::Relaxed:
     clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::Relaxed{}});
diff --git a/flang/test/Lower/OpenMP/requires-atomic-default-mem-order.f90 b/flang/test/Lower/OpenMP/requires-atomic-default-mem-order.f90
new file mode 100644
index 0000000000000..91cb654aeeb3a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/requires-atomic-default-mem-order.f90
@@ -0,0 +1,22 @@
+!RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=50 %s -o - | FileCheck %s
+
+module m
+!$omp requires atomic_default_mem_order(acq_rel)
+
+contains
+
+!CHECK: %[[V:[0-9]+]]:2 = hlfir.declare {{.*}} {uniq_name = "_QMmFf00Ev"}
+!CHECK: %[[X:[0-9]+]]:2 = hlfir.declare {{.*}} {uniq_name = "_QMmFf00Ex"}
+!CHECK: omp.atomic.read %[[V]]#0 = %[[X]]#0 memory_order(acquire)
+!CHECK: omp.atomic.write %[[X]]#0 = %{{[0-9]+}} memory_order(release)
+
+subroutine f00(x, v)
+  integer :: x, v
+  !$omp atomic read
+    v = x
+
+  !$omp atomic write
+    x = v
+end
+
+end module
diff --git a/flang/test/Semantics/OpenMP/requires-atomic02.f90 b/flang/test/Semantics/OpenMP/requires-atomic02.f90
index a3724a83456fd..04a9b7a09aa98 100644
--- a/flang/test/Semantics/OpenMP/requires-atomic02.f90
+++ b/flang/test/Semantics/OpenMP/requires-atomic02.f90
@@ -12,7 +12,7 @@ program requires
 
   ! CHECK-LABEL: OpenMPAtomicConstruct
   ! CHECK: OmpClause -> Read
-  ! CHECK: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Acquire
   !$omp atomic read
   i = j
 
@@ -36,7 +36,7 @@ program requires
 
   ! CHECK-LABEL: OpenMPAtomicConstruct
   ! CHECK: OmpClause -> Write
-  ! CHECK: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Release
   !$omp atomic write
   i = j
 
@@ -60,7 +60,7 @@ program requires
 
   ! CHECK-LABEL: OpenMPAtomicConstruct
   ! CHECK: OmpClause -> Update
-  ! CHECK: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Release
   !$omp atomic update
   i = i + j
 
@@ -79,7 +79,7 @@ program requires
   i = i + j
 
   ! CHECK-LABEL: OpenMPAtomicConstruct
-  ! CHECK: OmpClause -> AcqRel
+  ! CHECK: OmpClause -> Release
   !$omp atomic
   i = i + j
 

From ec21b0fc9f64e8cffe689699d1e39533c62fcfc3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 16:26:10 +0100
Subject: [PATCH 386/851] [X86] Add X86FixupInstTuning test coverage for
 (V)BLENDPD/S <-> (V)MOVSD/S patterns for various scheduler models

---
 llvm/test/CodeGen/X86/fixup-blend.ll | 713 +++++++++++++++++++++++++++
 1 file changed, 713 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/fixup-blend.ll

diff --git a/llvm/test/CodeGen/X86/fixup-blend.ll b/llvm/test/CodeGen/X86/fixup-blend.ll
new file mode 100644
index 0000000000000..3126e4823bee6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fixup-blend.ll
@@ -0,0 +1,713 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64      | FileCheck %s -check-prefixes=SSE,SSE-MOV,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2   | FileCheck %s -check-prefixes=SSE,SSE4,SSE4-BLEND
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=slm         | FileCheck %s -check-prefixes=SSE,SSE-MOV,SSE4,SSE4-MOV
+
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=sandybridge | FileCheck %s -check-prefixes=AVX,AVX1,AVX-BLEND,AVX1-BLEND
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2      | FileCheck %s -check-prefixes=AVX,AVX1,AVX-MOV,AVX1-MOV
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3   | FileCheck %s -check-prefixes=AVX,AVX2,AVX-BLEND,AVX2-BLEND
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4   | FileCheck %s -check-prefixes=AVX,AVX2,AVX-BLEND,AVX2-BLEND
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=alderlake   | FileCheck %s -check-prefixes=AVX,AVX2,AVX-MOV,AVX2-MOV
+
+;
+; v2f64 patterns
+;
+
+define <2 x double> @test_v2f64_blend_movsd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; SSE-MOV-LABEL: test_v2f64_blend_movsd:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-MOV-NEXT:    addpd %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v2f64_blend_movsd:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE4-BLEND-NEXT:    addpd %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v2f64_blend_movsd:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-BLEND-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v2f64_blend_movsd:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-MOV-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_v2f64_blend_movsd_optsize(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) optsize {
+; SSE-LABEL: test_v2f64_blend_movsd_optsize:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    addpd %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v2f64_blend_movsd_optsize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_v2f64_blend_movsd_load(<2 x double> %a0, ptr %p1, <2 x double> %a2) {
+; SSE2-LABEL: test_v2f64_blend_movsd_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2f64_blend_movsd_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE4-NEXT:    addpd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v2f64_blend_movsd_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a1 = load <2 x double>, ptr %p1
+  %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_v2f64_blend_movsd_zero(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; SSE-MOV-LABEL: test_v2f64_blend_movsd_zero:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    xorpd %xmm1, %xmm1
+; SSE-MOV-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-MOV-NEXT:    addpd %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v2f64_blend_movsd_zero:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    xorpd %xmm1, %xmm1
+; SSE4-BLEND-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE4-BLEND-NEXT:    addpd %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v2f64_blend_movsd_zero:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-BLEND-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-BLEND-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v2f64_blend_movsd_zero:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-MOV-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-MOV-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+;
+; v2i64 patterns
+;
+
+define <2 x i64> @test_v2i64_blend_movsd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; SSE2-LABEL: test_v2i64_blend_movsd:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @test_v2i64_blend_movsd_optsize(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) optsize {
+; SSE2-LABEL: test_v2i64_blend_movsd_optsize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd_optsize:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd_optsize:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd_optsize:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @test_v2i64_blend_movsd_load(<2 x i64> %a0, ptr %p1, <2 x i64> %a2) {
+; SSE2-LABEL: test_v2i64_blend_movsd_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd_load:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a1 = load <2 x i64>, ptr %p1
+  %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @test_v2i64_blend_movsd_zero(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; SSE2-LABEL: test_v2i64_blend_movsd_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd_zero:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd_zero:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd_zero:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+;
+; v4f32 patterns
+;
+
+define <4 x float> @test_v4f32_blend_movss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; SSE-MOV-LABEL: test_v4f32_blend_movss:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-MOV-NEXT:    addps %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v4f32_blend_movss:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE4-BLEND-NEXT:    addps %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v4f32_blend_movss:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-BLEND-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v4f32_blend_movss:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-MOV-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; SSE-MOV-LABEL: test_v4f32_blend_movsd:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-MOV-NEXT:    addps %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v4f32_blend_movsd:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-BLEND-NEXT:    addps %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v4f32_blend_movsd:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-BLEND-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v4f32_blend_movsd:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-MOV-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movss_optsize(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) optsize {
+; SSE-LABEL: test_v4f32_blend_movss_optsize:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-NEXT:    addps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movss_optsize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd_optsize(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) optsize {
+; SSE-LABEL: test_v4f32_blend_movsd_optsize:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE-NEXT:    addps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movsd_optsize:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movss_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movss_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps (%rdi), %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4f32_blend_movss_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; SSE4-NEXT:    addps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movss_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a1 = load <4 x float>, ptr %p1
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movsd_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4f32_blend_movsd_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE4-NEXT:    addps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movsd_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a1 = load <4 x float>, ptr %p1
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movss_zero(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; SSE-MOV-LABEL: test_v4f32_blend_movss_zero:
+; SSE-MOV:       # %bb.0:
+; SSE-MOV-NEXT:    xorps %xmm1, %xmm1
+; SSE-MOV-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE-MOV-NEXT:    addps %xmm2, %xmm0
+; SSE-MOV-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v4f32_blend_movss_zero:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    xorps %xmm1, %xmm1
+; SSE4-BLEND-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE4-BLEND-NEXT:    addps %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v4f32_blend_movss_zero:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-BLEND-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-BLEND-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v4f32_blend_movss_zero:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-MOV-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX-MOV-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd_zero(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movsd_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorpd %xmm1, %xmm1
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    addps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-BLEND-LABEL: test_v4f32_blend_movsd_zero:
+; SSE4-BLEND:       # %bb.0:
+; SSE4-BLEND-NEXT:    xorps %xmm1, %xmm1
+; SSE4-BLEND-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; SSE4-BLEND-NEXT:    addps %xmm2, %xmm0
+; SSE4-BLEND-NEXT:    retq
+;
+; SSE4-MOV-LABEL: test_v4f32_blend_movsd_zero:
+; SSE4-MOV:       # %bb.0:
+; SSE4-MOV-NEXT:    xorps %xmm1, %xmm1
+; SSE4-MOV-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE4-MOV-NEXT:    addps %xmm2, %xmm0
+; SSE4-MOV-NEXT:    retq
+;
+; AVX-BLEND-LABEL: test_v4f32_blend_movsd_zero:
+; AVX-BLEND:       # %bb.0:
+; AVX-BLEND-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-BLEND-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-BLEND-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-BLEND-NEXT:    retq
+;
+; AVX-MOV-LABEL: test_v4f32_blend_movsd_zero:
+; AVX-MOV:       # %bb.0:
+; AVX-MOV-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-MOV-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX-MOV-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; AVX-MOV-NEXT:    retq
+  %s = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+;
+; v4i32 patterns
+;
+
+define <4 x i32> @test_v4i32_blend_movss(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movss:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movsd:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movss_optsize(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) optsize {
+; SSE2-LABEL: test_v4i32_blend_movss_optsize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss_optsize:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss_optsize:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss_optsize:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd_optsize(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) optsize {
+; SSE2-LABEL: test_v4i32_blend_movsd_optsize:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd_optsize:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd_optsize:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd_optsize:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movss_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movss_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps (%rdi), %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss_load:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a1 = load <4 x i32>, ptr %p1
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movsd_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd_load:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a1 = load <4 x i32>, ptr %p1
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movss_zero(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movss_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss_zero:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss_zero:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss_zero:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd_zero(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movsd_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd_zero:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pxor %xmm1, %xmm1
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm2, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd_zero:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd_zero:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX1-BLEND: {{.*}}
+; AVX1-MOV: {{.*}}
+; AVX2-BLEND: {{.*}}
+; AVX2-MOV: {{.*}}

From ca5040990ed17fa444d30c22fffcfa7ddc72612f Mon Sep 17 00:00:00 2001
From: Aleksandr Platonov <platonov.aleksandr@huawei.com>
Date: Fri, 13 Jun 2025 18:32:42 +0300
Subject: [PATCH 387/851] [clangd] Collect references in array designators
 (#140356)

---
 clang-tools-extra/clangd/unittests/XRefsTests.cpp | 8 ++++++++
 clang/lib/Index/IndexBody.cpp                     | 7 +++++++
 2 files changed, 15 insertions(+)

diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
index 1892f87c8e82a..b04d6431f89f9 100644
--- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
@@ -2311,6 +2311,14 @@ TEST(FindReferences, WithinAST) {
             $(S::deleteObject)[[de^lete]] S;
           }
         };
+      )cpp",
+      // Array designators
+      R"cpp(
+        const int $def[[F^oo]] = 0;
+        int Bar[] = {
+          [$(Bar)[[F^oo]]...$(Bar)[[Fo^o]] + 1] = 0,
+          [$(Bar)[[^Foo]] + 2] = 1
+        };
       )cpp"};
   for (const char *Test : Tests)
     checkFindRefs(Test);
diff --git a/clang/lib/Index/IndexBody.cpp b/clang/lib/Index/IndexBody.cpp
index 2ed20df22bda0..98ce6f73ec849 100644
--- a/clang/lib/Index/IndexBody.cpp
+++ b/clang/lib/Index/IndexBody.cpp
@@ -435,6 +435,13 @@ class BodyIndexer : public RecursiveASTVisitor<BodyIndexer> {
                                             ParentDC, SymbolRoleSet(),
                                             /*Relations=*/{}, E);
           }
+        } else {
+          if (D.isArrayDesignator())
+            TraverseStmt(E->getArrayIndex(D));
+          else if (D.isArrayRangeDesignator()) {
+            TraverseStmt(E->getArrayRangeStart(D));
+            TraverseStmt(E->getArrayRangeEnd(D));
+          }
         }
       }
       return true;

From dc9e300f12f3b9c8160dbfb0bc32252ad99c3ba7 Mon Sep 17 00:00:00 2001
From: Fabian Meumertzheim <fabian@meumertzhe.im>
Date: Fri, 13 Jun 2025 17:49:30 +0200
Subject: [PATCH 388/851] [llvm-cov] Add support for baseline coverage
 (#117910)

When no profile is provided, but the new --empty-profile option is
specifed, the export/report/show commands now emit coverage data
equivalent to that obtained from a profile with all zero counters
("baseline coverage").

This is useful for build systems (e.g. Bazel) that can track coverage
information for each build target, even those that are never linked into
tests and thus don't have runtime coverage data recorded. By merging in
baseline coverage, lines in files that aren't linked into tests are
correctly reported as uncovered.
---
 llvm/docs/CommandGuide/llvm-cov.rst           |  15 +++
 .../ProfileData/Coverage/CoverageMapping.h    |  24 ++--
 .../ProfileData/Coverage/CoverageMapping.cpp  | 123 +++++++++++-------
 ...showLineExecutionCounts-lcov-baseline.test |  37 ++++++
 llvm/tools/llvm-cov/CodeCoverage.cpp          |  78 +++++++----
 .../ProfileData/CoverageMappingTest.cpp       |   4 +-
 6 files changed, 195 insertions(+), 86 deletions(-)
 create mode 100644 llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test

diff --git a/llvm/docs/CommandGuide/llvm-cov.rst b/llvm/docs/CommandGuide/llvm-cov.rst
index 968f3c452f558..f4db60cf06fa7 100644
--- a/llvm/docs/CommandGuide/llvm-cov.rst
+++ b/llvm/docs/CommandGuide/llvm-cov.rst
@@ -380,6 +380,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Display the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 .. program:: llvm-cov report
 
 .. _llvm-cov-report:
@@ -470,6 +475,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Display the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 .. program:: llvm-cov export
 
 .. _llvm-cov-export:
@@ -562,6 +572,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Export the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 CONVERT-FOR-TESTING COMMAND
 ---------------------------
 
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index e62ce5e3d8fa6..d1230b0ba7c58 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -991,18 +991,23 @@ class CoverageMapping {
   // Load coverage records from readers.
   static Error loadFromReaders(
       ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-      IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage);
+      std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+          &ProfileReader,
+      CoverageMapping &Coverage);
 
   // Load coverage records from file.
   static Error
   loadFromFile(StringRef Filename, StringRef Arch, StringRef CompilationDir,
-               IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
-               bool &DataFound,
+               std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+                   &ProfileReader,
+               CoverageMapping &Coverage, bool &DataFound,
                SmallVectorImpl<object::BuildID> *FoundBinaryIDs = nullptr);
 
   /// Add a function record corresponding to \p Record.
-  Error loadFunctionRecord(const CoverageMappingRecord &Record,
-                           IndexedInstrProfReader &ProfileReader);
+  Error loadFunctionRecord(
+      const CoverageMappingRecord &Record,
+      const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+          &ProfileReader);
 
   /// Look up the indices for function records which are at least partially
   /// defined in the specified file. This is guaranteed to return a superset of
@@ -1018,15 +1023,16 @@ class CoverageMapping {
   /// Load the coverage mapping using the given readers.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
   load(ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-       IndexedInstrProfReader &ProfileReader);
+       std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+           &ProfileReader);
 
   /// Load the coverage mapping from the given object files and profile. If
   /// \p Arches is non-empty, it must specify an architecture for each object.
   /// Ignores non-instrumented object files unless all are not instrumented.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
-  load(ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
-       vfs::FileSystem &FS, ArrayRef<StringRef> Arches = {},
-       StringRef CompilationDir = "",
+  load(ArrayRef<StringRef> ObjectFilenames,
+       std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
+       ArrayRef<StringRef> Arches = {}, StringRef CompilationDir = "",
        const object::BuildIDFetcher *BIDFetcher = nullptr,
        bool CheckBinaryIDs = false);
 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index dd74eb054a34c..429ec5c19f1f8 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -823,7 +823,8 @@ class MCDCDecisionRecorder {
 
 Error CoverageMapping::loadFunctionRecord(
     const CoverageMappingRecord &Record,
-    IndexedInstrProfReader &ProfileReader) {
+    const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader) {
   StringRef OrigFuncName = Record.FunctionName;
   if (OrigFuncName.empty())
     return make_error<CoverageMapError>(coveragemap_error::malformed,
@@ -837,35 +838,44 @@ Error CoverageMapping::loadFunctionRecord(
   CounterMappingContext Ctx(Record.Expressions);
 
   std::vector<uint64_t> Counts;
-  if (Error E = ProfileReader.getFunctionCounts(Record.FunctionName,
-                                                Record.FunctionHash, Counts)) {
-    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-    if (IPE == instrprof_error::hash_mismatch) {
-      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                      Record.FunctionHash);
-      return Error::success();
+  if (ProfileReader) {
+    if (Error E = ProfileReader.value().get().getFunctionCounts(
+            Record.FunctionName, Record.FunctionHash, Counts)) {
+      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+      if (IPE == instrprof_error::hash_mismatch) {
+        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                        Record.FunctionHash);
+        return Error::success();
+      }
+      if (IPE != instrprof_error::unknown_function)
+        return make_error<InstrProfError>(IPE);
+      Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
     }
-    if (IPE != instrprof_error::unknown_function)
-      return make_error<InstrProfError>(IPE);
+  } else {
     Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
   }
   Ctx.setCounts(Counts);
 
   bool IsVersion11 =
-      ProfileReader.getVersion() < IndexedInstrProf::ProfVersion::Version12;
+      ProfileReader && ProfileReader.value().get().getVersion() <
+                           IndexedInstrProf::ProfVersion::Version12;
 
   BitVector Bitmap;
-  if (Error E = ProfileReader.getFunctionBitmap(Record.FunctionName,
-                                                Record.FunctionHash, Bitmap)) {
-    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-    if (IPE == instrprof_error::hash_mismatch) {
-      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                      Record.FunctionHash);
-      return Error::success();
+  if (ProfileReader) {
+    if (Error E = ProfileReader.value().get().getFunctionBitmap(
+            Record.FunctionName, Record.FunctionHash, Bitmap)) {
+      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+      if (IPE == instrprof_error::hash_mismatch) {
+        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                        Record.FunctionHash);
+        return Error::success();
+      }
+      if (IPE != instrprof_error::unknown_function)
+        return make_error<InstrProfError>(IPE);
+      Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
     }
-    if (IPE != instrprof_error::unknown_function)
-      return make_error<InstrProfError>(IPE);
-    Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
+  } else {
+    Bitmap = BitVector(getMaxBitmapSize(Record, false));
   }
   Ctx.setBitmap(std::move(Bitmap));
 
@@ -959,10 +969,14 @@ Error CoverageMapping::loadFunctionRecord(
 // of CoverageMappingReader instances.
 Error CoverageMapping::loadFromReaders(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage) {
-  assert(!Coverage.SingleByteCoverage ||
-         *Coverage.SingleByteCoverage == ProfileReader.hasSingleByteCoverage());
-  Coverage.SingleByteCoverage = ProfileReader.hasSingleByteCoverage();
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader,
+    CoverageMapping &Coverage) {
+  assert(!Coverage.SingleByteCoverage || !ProfileReader ||
+         *Coverage.SingleByteCoverage ==
+             ProfileReader.value().get().hasSingleByteCoverage());
+  Coverage.SingleByteCoverage =
+      !ProfileReader || ProfileReader.value().get().hasSingleByteCoverage();
   for (const auto &CoverageReader : CoverageReaders) {
     for (auto RecordOrErr : *CoverageReader) {
       if (Error E = RecordOrErr.takeError())
@@ -977,7 +991,8 @@ Error CoverageMapping::loadFromReaders(
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    IndexedInstrProfReader &ProfileReader) {
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader) {
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   if (Error E = loadFromReaders(CoverageReaders, ProfileReader, *Coverage))
     return std::move(E);
@@ -986,18 +1001,19 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
 // If E is a no_data_found error, returns success. Otherwise returns E.
 static Error handleMaybeNoDataFoundError(Error E) {
-  return handleErrors(
-      std::move(E), [](const CoverageMapError &CME) {
-        if (CME.get() == coveragemap_error::no_data_found)
-          return static_cast<Error>(Error::success());
-        return make_error<CoverageMapError>(CME.get(), CME.getMessage());
-      });
+  return handleErrors(std::move(E), [](const CoverageMapError &CME) {
+    if (CME.get() == coveragemap_error::no_data_found)
+      return static_cast<Error>(Error::success());
+    return make_error<CoverageMapError>(CME.get(), CME.getMessage());
+  });
 }
 
 Error CoverageMapping::loadFromFile(
     StringRef Filename, StringRef Arch, StringRef CompilationDir,
-    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
-    bool &DataFound, SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader,
+    CoverageMapping &Coverage, bool &DataFound,
+    SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
   auto CovMappingBufOrErr = MemoryBuffer::getFileOrSTDIN(
       Filename, /*IsText=*/false, /*RequiresNullTerminator=*/false);
   if (std::error_code EC = CovMappingBufOrErr.getError())
@@ -1033,13 +1049,23 @@ Error CoverageMapping::loadFromFile(
 }
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
-    ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
-    vfs::FileSystem &FS, ArrayRef<StringRef> Arches, StringRef CompilationDir,
+    ArrayRef<StringRef> ObjectFilenames,
+    std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
+    ArrayRef<StringRef> Arches, StringRef CompilationDir,
     const object::BuildIDFetcher *BIDFetcher, bool CheckBinaryIDs) {
-  auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename, FS);
-  if (Error E = ProfileReaderOrErr.takeError())
-    return createFileError(ProfileFilename, std::move(E));
-  auto ProfileReader = std::move(ProfileReaderOrErr.get());
+  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
+  if (ProfileFilename) {
+    auto ProfileReaderOrErr =
+        IndexedInstrProfReader::create(ProfileFilename.value(), FS);
+    if (Error E = ProfileReaderOrErr.takeError())
+      return createFileError(ProfileFilename.value(), std::move(E));
+    ProfileReader = std::move(ProfileReaderOrErr.get());
+  }
+  auto ProfileReaderRef =
+      ProfileReader
+          ? std::optional<std::reference_wrapper<IndexedInstrProfReader>>(
+                *ProfileReader)
+          : std::nullopt;
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   bool DataFound = false;
 
@@ -1053,16 +1079,17 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
   SmallVector<object::BuildID> FoundBinaryIDs;
   for (const auto &File : llvm::enumerate(ObjectFilenames)) {
-    if (Error E =
-            loadFromFile(File.value(), GetArch(File.index()), CompilationDir,
-                         *ProfileReader, *Coverage, DataFound, &FoundBinaryIDs))
+    if (Error E = loadFromFile(File.value(), GetArch(File.index()),
+                               CompilationDir, ProfileReaderRef, *Coverage,
+                               DataFound, &FoundBinaryIDs))
       return std::move(E);
   }
 
   if (BIDFetcher) {
     std::vector<object::BuildID> ProfileBinaryIDs;
-    if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
-      return createFileError(ProfileFilename, std::move(E));
+    if (ProfileReader)
+      if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
+        return createFileError(ProfileFilename.value(), std::move(E));
 
     SmallVector<object::BuildIDRef> BinaryIDsToFetch;
     if (!ProfileBinaryIDs.empty()) {
@@ -1082,12 +1109,12 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
       if (PathOpt) {
         std::string Path = std::move(*PathOpt);
         StringRef Arch = Arches.size() == 1 ? Arches.front() : StringRef();
-        if (Error E = loadFromFile(Path, Arch, CompilationDir, *ProfileReader,
-                                  *Coverage, DataFound))
+        if (Error E = loadFromFile(Path, Arch, CompilationDir, ProfileReaderRef,
+                                   *Coverage, DataFound))
           return std::move(E);
       } else if (CheckBinaryIDs) {
         return createFileError(
-            ProfileFilename,
+            ProfileFilename.value(),
             createStringError(errc::no_such_file_or_directory,
                               "Missing binary ID: " +
                                   llvm::toHex(BinaryID, /*LowerCase=*/true)));
diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
new file mode 100644
index 0000000000000..bce886bdf510b
--- /dev/null
+++ b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
@@ -0,0 +1,37 @@
+// FULL: SF:{{.*}}showLineExecutionCounts.cpp
+// FULL: FN:6,main
+// FULL: FNDA:0,main
+// FULL: FNF:1
+// FULL: FNH:0
+int main() {                              // FULL: DA:[[@LINE]],0
+  int x = 0;                              // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  if (x) {                                // FULL: DA:[[@LINE]],0
+    x = 0;                                // FULL: DA:[[@LINE]],0
+  } else {                                // FULL: DA:[[@LINE]],0
+    x = 1;                                // FULL: DA:[[@LINE]],0
+  }                                       // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  for (int i = 0; i < 100; ++i) {         // FULL: DA:[[@LINE]],0
+    x = 1;                                // FULL: DA:[[@LINE]],0
+  }                                       // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  x = x < 10 ? x + 1 : x - 1;             // FULL: DA:[[@LINE]],0
+  x = x > 10 ?                            // FULL: DA:[[@LINE]],0
+        x - 1:                            // FULL: DA:[[@LINE]],0
+        x + 1;                            // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  return 0;                               // FULL: DA:[[@LINE]],0
+}                                         // FULL: DA:[[@LINE]],0
+// FULL: LF:20
+// FULL: LH:0
+// FULL: end_of_record
+// RUN: llvm-cov export -format=lcov %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=FULL %s
+
+// RUN: llvm-cov export -format=lcov -summary-only %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=SUMMARYONLY %s
+// SUMMARYONLY: SF:{{.*}}showLineExecutionCounts.cpp
+// SUMMARYONLY: FNF:1
+// SUMMARYONLY: FNH:0
+// SUMMARYONLY: LF:20
+// SUMMARYONLY: LH:0
+// SUMMARYONLY: end_of_record
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 1f2484cd4dda9..6c66858c4de8c 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -153,7 +153,7 @@ class CodeCoverageTool {
   bool HadSourceFiles = false;
 
   /// The path to the indexed profile.
-  std::string PGOFilename;
+  std::optional<std::string> PGOFilename;
 
   /// A list of input source files.
   std::vector<std::string> SourceFiles;
@@ -455,10 +455,12 @@ static bool modifiedTimeGT(StringRef LHS, StringRef RHS) {
 }
 
 std::unique_ptr<CoverageMapping> CodeCoverageTool::load() {
-  for (StringRef ObjectFilename : ObjectFilenames)
-    if (modifiedTimeGT(ObjectFilename, PGOFilename))
-      warning("profile data may be out of date - object is newer",
-              ObjectFilename);
+  if (PGOFilename) {
+    for (StringRef ObjectFilename : ObjectFilenames)
+      if (modifiedTimeGT(ObjectFilename, PGOFilename.value()))
+        warning("profile data may be out of date - object is newer",
+                ObjectFilename);
+  }
   auto FS = vfs::getRealFileSystem();
   auto CoverageOrErr = CoverageMapping::load(
       ObjectFilenames, PGOFilename, *FS, CoverageArches,
@@ -668,11 +670,16 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       "dump-collected-paths", cl::Optional, cl::Hidden,
       cl::desc("Show the collected paths to source files"));
 
-  cl::opt<std::string, true> PGOFilename(
-      "instr-profile", cl::Required, cl::location(this->PGOFilename),
+  cl::opt<std::string> PGOFilename(
+      "instr-profile", cl::Optional,
       cl::desc(
           "File with the profile data obtained after an instrumented run"));
 
+  cl::opt<bool> EmptyProfile(
+      "empty-profile", cl::Optional,
+      cl::desc("Use a synthetic profile with no data to generate "
+               "baseline coverage"));
+
   cl::list<std::string> Arches(
       "arch", cl::desc("architectures of the coverage mapping binaries"));
 
@@ -805,6 +812,15 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
     }
     this->CheckBinaryIDs = CheckBinaryIDs;
 
+    if (!PGOFilename.empty() == EmptyProfile) {
+      error(
+          "exactly one of -instr-profile and -empty-profile must be specified");
+      return 1;
+    }
+    if (!PGOFilename.empty()) {
+      this->PGOFilename = std::make_optional(PGOFilename.getValue());
+    }
+
     if (!CovFilename.empty())
       ObjectFilenames.emplace_back(CovFilename);
     for (const std::string &Filename : CovFilenames)
@@ -1116,20 +1132,22 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
     }
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
-  }
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
 
-  if (ShowCreatedTime) {
-    auto ModifiedTime = Status.getLastModificationTime();
-    std::string ModifiedTimeStr = to_string(ModifiedTime);
-    size_t found = ModifiedTimeStr.rfind(':');
-    ViewOpts.CreatedTimeStr =
-        (found != std::string::npos)
-            ? "Created: " + ModifiedTimeStr.substr(0, found)
-            : "Created: " + ModifiedTimeStr;
+    if (ShowCreatedTime) {
+      auto ModifiedTime = Status.getLastModificationTime();
+      std::string ModifiedTimeStr = to_string(ModifiedTime);
+      size_t found = ModifiedTimeStr.rfind(':');
+      ViewOpts.CreatedTimeStr =
+          (found != std::string::npos)
+              ? "Created: " + ModifiedTimeStr.substr(0, found)
+              : "Created: " + ModifiedTimeStr;
+    }
   }
 
   auto Coverage = load();
@@ -1238,10 +1256,12 @@ int CodeCoverageTool::doReport(int argc, const char **argv,
     return 1;
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
   }
 
   auto Coverage = load();
@@ -1303,10 +1323,12 @@ int CodeCoverageTool::doExport(int argc, const char **argv,
     return 1;
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
   }
 
   auto Coverage = load();
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index 46f881ecddb5f..c0e99cf80b944 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -277,7 +277,9 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
       CoverageReaders.push_back(
           std::make_unique<CoverageMappingReaderMock>(Funcs));
     }
-    return CoverageMapping::load(CoverageReaders, *ProfileReader);
+    auto ProfileReaderRef =
+        std::make_optional(std::reference_wrapper(*ProfileReader));
+    return CoverageMapping::load(CoverageReaders, ProfileReaderRef);
   }
 
   Error loadCoverageMapping(bool EmitFilenames = true) {

From 18b67a7a102c0052e5ae0e76ef1297902ffeb22d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 08:58:19 -0700
Subject: [PATCH 389/851] MC: Add MCAsmInfo::printExpr to replace MCExpr::print

* Make relocation specifier code closer (MCAsmInfo defines specifiers).
* MCExpr::print has an optional MCAsmInfo argument, which is
  error-prone when omitted.
* Enable MCSpecifierExpr
---
 llvm/include/llvm/MC/MCAsmInfo.h |  3 +++
 llvm/lib/MC/MCAsmInfo.cpp        |  5 +++++
 llvm/lib/MC/MCAsmStreamer.cpp    | 24 ++++++++++++------------
 llvm/lib/MC/MCStreamer.cpp       |  2 +-
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index e98cd17a9df50..18303e028f623 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -32,6 +32,7 @@ class MCSection;
 class MCStreamer;
 class MCSubtargetInfo;
 class MCSymbol;
+class raw_ostream;
 
 namespace WinEH {
 
@@ -709,6 +710,8 @@ class LLVM_ABI MCAsmInfo {
 
   StringRef getSpecifierName(uint32_t S) const;
   std::optional<uint32_t> getSpecifierForName(StringRef Name) const;
+
+  void printExpr(raw_ostream &, const MCExpr &) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index 86759c32bb752..fbacca4f56796 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace llvm;
@@ -148,3 +149,7 @@ std::optional<uint32_t> MCAsmInfo::getSpecifierForName(StringRef Name) const {
     return It->second;
   return {};
 }
+
+void MCAsmInfo::printExpr(raw_ostream &OS, const MCExpr &Expr) const {
+  Expr.print(OS, this);
+}
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 4380f74318e7b..c43619d712172 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -700,7 +700,7 @@ void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
       OS << ".set ";
     Symbol->print(OS, MAI);
     OS << (UseSet ? ", " : " = ");
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
 
     EmitEOL();
   }
@@ -713,7 +713,7 @@ void MCAsmStreamer::emitConditionalAssignment(MCSymbol *Symbol,
   OS << ".lto_set_conditional ";
   Symbol->print(OS, MAI);
   OS << ", ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   EmitEOL();
 }
 
@@ -1065,7 +1065,7 @@ void MCAsmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
   OS << "\t.size\t";
   Symbol->print(OS, MAI);
   OS << ", ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   EmitEOL();
 }
 
@@ -1399,7 +1399,7 @@ void MCAsmStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
   if (MCTargetStreamer *TS = getTargetStreamer()) {
     TS->emitValue(Value);
   } else {
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
     EmitEOL();
   }
 }
@@ -1411,7 +1411,7 @@ void MCAsmStreamer::emitULEB128Value(const MCExpr *Value) {
     return;
   }
   OS << "\t.uleb128 ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   EmitEOL();
 }
 
@@ -1422,7 +1422,7 @@ void MCAsmStreamer::emitSLEB128Value(const MCExpr *Value) {
     return;
   }
   OS << "\t.sleb128 ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   EmitEOL();
 }
 
@@ -1437,7 +1437,7 @@ void MCAsmStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
     if (!MAI->isAIX() || FillValue == 0) {
       // FIXME: Emit location directives
       OS << ZeroDirective;
-      NumBytes.print(OS, MAI);
+      MAI->printExpr(OS, NumBytes);
       if (FillValue != 0)
         OS << ',' << (int)FillValue;
       EmitEOL();
@@ -1460,7 +1460,7 @@ void MCAsmStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
                              int64_t Expr, SMLoc Loc) {
   // FIXME: Emit location directives
   OS << "\t.fill\t";
-  NumValues.print(OS, MAI);
+  MAI->printExpr(OS, NumValues);
   OS << ", " << Size << ", 0x";
   OS.write_hex(truncateToSize(Expr, 4));
   EmitEOL();
@@ -1558,7 +1558,7 @@ void MCAsmStreamer::emitValueToOffset(const MCExpr *Offset,
                                       SMLoc Loc) {
   // FIXME: Verify that Offset is associated with the current section.
   OS << ".org ";
-  Offset->print(OS, MAI);
+  MAI->printExpr(OS, *Offset);
   OS << ", " << (unsigned)Value;
   EmitEOL();
 }
@@ -2417,7 +2417,7 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
     MCFixup &F = Fixups[i];
     OS << "  fixup " << char('A' + i) << " - "
        << "offset: " << F.getOffset() << ", value: ";
-    F.getValue()->print(OS, MAI);
+    MAI->printExpr(OS, *F.getValue());
     auto Kind = F.getKind();
     if (mc::isRelocation(Kind))
       OS << ", relocation type: " << Kind;
@@ -2496,11 +2496,11 @@ MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name,
                                   const MCExpr *Expr, SMLoc,
                                   const MCSubtargetInfo &STI) {
   OS << "\t.reloc ";
-  Offset.print(OS, MAI);
+  MAI->printExpr(OS, Offset);
   OS << ", " << Name;
   if (Expr) {
     OS << ", ";
-    Expr->print(OS, MAI);
+    MAI->printExpr(OS, *Expr);
   }
   EmitEOL();
   return std::nullopt;
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index d70639b7bfe20..5f1fd57802c7b 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -72,7 +72,7 @@ void MCTargetStreamer::emitValue(const MCExpr *Value) {
   SmallString<128> Str;
   raw_svector_ostream OS(Str);
 
-  Value->print(OS, Streamer.getContext().getAsmInfo());
+  Streamer.getContext().getAsmInfo()->printExpr(OS, *Value);
   Streamer.emitRawText(OS.str());
 }
 

From d688df52ba9012197b3716ae85f818fafee7cf62 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Fri, 13 Jun 2025 08:56:49 -0700
Subject: [PATCH 390/851] [instsimplify] Add tests for missing vp.reverse
 simplifications

---
 .../Transforms/InstSimplify/vp-reverse.ll     | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 llvm/test/Transforms/InstSimplify/vp-reverse.ll

diff --git a/llvm/test/Transforms/InstSimplify/vp-reverse.ll b/llvm/test/Transforms/InstSimplify/vp-reverse.ll
new file mode 100644
index 0000000000000..3c3bb871dc610
--- /dev/null
+++ b/llvm/test/Transforms/InstSimplify/vp-reverse.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
+
+define <vscale x 4 x i32> @rev_of_rev(<vscale x 4 x i32> %a, i32 %evl) {
+; CHECK-LABEL: @rev_of_rev(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_REV]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %res = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.rev, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @rev_of_rev_diffevl(<vscale x 4 x i32> %a, i32 %evl) {
+; CHECK-LABEL: @rev_of_rev_diffevl(
+; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_REV]], <vscale x 4 x i1> splat (i1 true), i32 10)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+;
+  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %res = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.rev, <vscale x 4 x i1> splat (i1 true), i32 10)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @rev_of_poison(i32 %evl) {
+; CHECK-LABEL: @rev_of_poison(
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> poison, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}
+
+define <vscale x 4 x i32> @rev_of_undef(i32 %evl) {
+; CHECK-LABEL: @rev_of_undef(
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}
+
+define <vscale x 4 x i32> @rev_of_zero(i32 %evl) {
+; CHECK-LABEL: @rev_of_zero(
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}
+
+define <vscale x 4 x i32> @rev_of_splat(i32 %a, i32 %evl) {
+; CHECK-LABEL: @rev_of_splat(
+; CHECK-NEXT:    [[A_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[A_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_VEC]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %a.ins = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+  %a.vec = shufflevector <vscale x 4 x i32> %a.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.vec, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}
+
+define <vscale x 4 x i32> @rev_of_splat2(i32 %a, <vscale x 4 x i1> %m, i32 %evl) {
+; CHECK-LABEL: @rev_of_splat2(
+; CHECK-NEXT:    [[A_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
+; CHECK-NEXT:    [[A_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[A_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_VEC]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+;
+  %a.ins = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+  %a.vec = shufflevector <vscale x 4 x i32> %a.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+  %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.vec, <vscale x 4 x i1> %m, i32 %evl)
+  ret <vscale x 4 x i32> %rev
+}

From dec576514cb7106c59a5059ac6d52ebdf5de5275 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 13 Jun 2025 17:11:18 +0100
Subject: [PATCH 391/851] [X86] X86FixupInstTuning - add dbg message for each
 instruction replacement (#144083)

Help debug the changes the pass makes
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp | 57 +++++++++++++++-------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 8c1ff523c975a..89093b2e1a3f5 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -132,11 +132,15 @@ bool X86FixupInstTuningPass::processInstruction(
   auto ProcessVPERMILPDri = [&](unsigned NewOpc) -> bool {
     if (!NewOpcPreferable(NewOpc))
       return false;
-    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
-    MI.removeOperand(NumOperands - 1);
-    MI.addOperand(MI.getOperand(NumOperands - 2));
-    MI.setDesc(TII->get(NewOpc));
-    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
+      MI.removeOperand(NumOperands - 1);
+      MI.addOperand(MI.getOperand(NumOperands - 2));
+      MI.setDesc(TII->get(NewOpc));
+      MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -147,11 +151,15 @@ bool X86FixupInstTuningPass::processInstruction(
   auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool {
     if (!NewOpcPreferable(NewOpc))
       return false;
-    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
-    MI.removeOperand(NumOperands - 1);
-    MI.addOperand(MI.getOperand(NumOperands - 2));
-    MI.setDesc(TII->get(NewOpc));
-    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
+      MI.removeOperand(NumOperands - 1);
+      MI.addOperand(MI.getOperand(NumOperands - 2));
+      MI.setDesc(TII->get(NewOpc));
+      MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -164,7 +172,11 @@ bool X86FixupInstTuningPass::processInstruction(
     if (!ST->hasNoDomainDelayShuffle() ||
         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
-    MI.setDesc(TII->get(NewOpc));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(NewOpc));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -185,9 +197,12 @@ bool X86FixupInstTuningPass::processInstruction(
   auto ProcessUNPCK = [&](unsigned NewOpc, unsigned MaskImm) -> bool {
     if (!NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
-
-    MI.setDesc(TII->get(NewOpc));
-    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(NewOpc));
+      MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -198,7 +213,11 @@ bool X86FixupInstTuningPass::processInstruction(
     if (!ST->hasNoDomainDelayShuffle() ||
         !NewOpcPreferable(NewOpc, /*ReplaceInTie*/ false))
       return false;
-    MI.setDesc(TII->get(NewOpc));
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(NewOpc));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 
@@ -229,8 +248,12 @@ bool X86FixupInstTuningPass::processInstruction(
       return false;
     if (!OptSize && !NewOpcPreferable(MovOpc))
       return false;
-    MI.setDesc(TII->get(MovOpc));
-    MI.removeOperand(NumOperands - 1);
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(MovOpc));
+      MI.removeOperand(NumOperands - 1);
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
     return true;
   };
 

From bd33eef7f1013bea24289a898f788a2efe9d8282 Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Fri, 13 Jun 2025 12:21:38 -0400
Subject: [PATCH 392/851] [HLSL][SPIRV] Use resource names (#143412)

The SPIR-V backend does not have access to the original name of a
resource in the source, so it tries to create a name. This leads to some
problems with reflection.

That is why start to pass the name of the resource from Clang to the
SPIR-V backend.

Fixes #138533
---
 clang/lib/CodeGen/CGHLSLBuiltins.cpp          | 17 ++--
 clang/lib/CodeGen/CGHLSLRuntime.cpp           | 49 ++--------
 clang/lib/CodeGen/CGHLSLRuntime.h             | 13 +--
 llvm/include/llvm/IR/IntrinsicsSPIRV.td       | 16 ++--
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 94 +------------------
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h   |  1 +
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 16 ++--
 llvm/lib/Target/SPIRV/SPIRVUtils.cpp          | 10 ++
 llvm/lib/Target/SPIRV/SPIRVUtils.h            |  4 +
 .../SPIRV/hlsl-resources/BufferLoad.ll        |  8 +-
 .../SPIRV/hlsl-resources/BufferLoadStore.ll   | 14 +--
 .../SPIRV/hlsl-resources/BufferStore.ll       |  4 +-
 .../CodeGen/SPIRV/hlsl-resources/Packed.ll    |  8 +-
 .../hlsl-resources/ScalarResourceType.ll      | 11 ++-
 .../hlsl-resources/StorageImageDynIdx.ll      |  6 +-
 .../StorageImageNonUniformIdx.ll              |  6 +-
 .../SPIRV/hlsl-resources/StructuredBuffer.ll  | 13 +--
 .../SPIRV/hlsl-resources/UnknownBufferLoad.ll |  7 +-
 .../hlsl-resources/UnknownBufferStore.ll      |  4 +-
 .../SPIRV/hlsl-resources/spirv.layout.type.ll | 16 +++-
 .../pointers/resource-addrspacecast-2.ll      |  6 +-
 .../SPIRV/pointers/resource-addrspacecast.ll  |  6 +-
 .../CodeGen/SPIRV/spirv-explicit-layout.ll    | 29 +++---
 23 files changed, 135 insertions(+), 223 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
index abebc201808b0..ccf45c0c6ff1d 100644
--- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp
+++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp
@@ -295,17 +295,16 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     Value *SpaceOp = EmitScalarExpr(E->getArg(2));
     Value *RangeOp = EmitScalarExpr(E->getArg(3));
     Value *IndexOp = EmitScalarExpr(E->getArg(4));
+    Value *Name = EmitScalarExpr(E->getArg(5));
     // FIXME: NonUniformResourceIndex bit is not yet implemented
     // (llvm/llvm-project#135452)
     Value *NonUniform =
         llvm::ConstantInt::get(llvm::Type::getInt1Ty(getLLVMContext()), false);
 
-    auto [IntrinsicID, HasNameArg] =
+    llvm::Intrinsic::ID IntrinsicID =
         CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic();
-    SmallVector<Value *> Args{SpaceOp, RegisterOp, RangeOp, IndexOp,
-                              NonUniform};
-    if (HasNameArg)
-      Args.push_back(EmitScalarExpr(E->getArg(5)));
+    SmallVector<Value *> Args{SpaceOp, RegisterOp, RangeOp,
+                              IndexOp, NonUniform, Name};
     return Builder.CreateIntrinsic(HandleTy, IntrinsicID, Args);
   }
   case Builtin::BI__builtin_hlsl_resource_handlefromimplicitbinding: {
@@ -314,16 +313,16 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
     Value *RangeOp = EmitScalarExpr(E->getArg(2));
     Value *IndexOp = EmitScalarExpr(E->getArg(3));
     Value *OrderID = EmitScalarExpr(E->getArg(4));
+    Value *Name = EmitScalarExpr(E->getArg(5));
     // FIXME: NonUniformResourceIndex bit is not yet implemented
     // (llvm/llvm-project#135452)
     Value *NonUniform =
         llvm::ConstantInt::get(llvm::Type::getInt1Ty(getLLVMContext()), false);
 
-    auto [IntrinsicID, HasNameArg] =
+    llvm::Intrinsic::ID IntrinsicID =
         CGM.getHLSLRuntime().getCreateHandleFromImplicitBindingIntrinsic();
-    SmallVector<Value *> Args{OrderID, SpaceOp, RangeOp, IndexOp, NonUniform};
-    if (HasNameArg)
-      Args.push_back(EmitScalarExpr(E->getArg(5)));
+    SmallVector<Value *> Args{OrderID, SpaceOp,    RangeOp,
+                              IndexOp, NonUniform, Name};
     return Builder.CreateIntrinsic(HandleTy, IntrinsicID, Args);
   }
   case Builtin::BI__builtin_hlsl_all: {
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 720dac8383c05..977ff792bae2c 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -237,35 +237,6 @@ static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
   }
 }
 
-std::pair<llvm::Intrinsic::ID, bool>
-CGHLSLRuntime::getCreateHandleFromBindingIntrinsic() {
-  switch (getArch()) {
-  case llvm::Triple::dxil:
-    return std::pair(llvm::Intrinsic::dx_resource_handlefrombinding, true);
-  case llvm::Triple::spirv:
-    return std::pair(llvm::Intrinsic::spv_resource_handlefrombinding, false);
-  default:
-    llvm_unreachable("Intrinsic resource_handlefrombinding not supported by "
-                     "target architecture");
-  }
-}
-
-std::pair<llvm::Intrinsic::ID, bool>
-CGHLSLRuntime::getCreateHandleFromImplicitBindingIntrinsic() {
-  switch (getArch()) {
-  case llvm::Triple::dxil:
-    return std::pair(llvm::Intrinsic::dx_resource_handlefromimplicitbinding,
-                     true);
-  case llvm::Triple::spirv:
-    return std::pair(llvm::Intrinsic::spv_resource_handlefromimplicitbinding,
-                     false);
-  default:
-    llvm_unreachable(
-        "Intrinsic resource_handlefromimplicitbinding not supported by "
-        "target architecture");
-  }
-}
-
 // Codegen for HLSLBufferDecl
 void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) {
 
@@ -625,31 +596,27 @@ void CGHLSLRuntime::initializeBufferFromBinding(const HLSLBufferDecl *BufDecl,
       llvm::ConstantInt::get(CGM.IntTy, RBA ? RBA->getSpaceNumber() : 0);
   Value *Name = nullptr;
 
-  auto [IntrinsicID, HasNameArg] =
+  llvm::Intrinsic::ID IntrinsicID =
       RBA->hasRegisterSlot()
           ? CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic()
           : CGM.getHLSLRuntime().getCreateHandleFromImplicitBindingIntrinsic();
 
-  if (HasNameArg) {
-    std::string Str(BufDecl->getName());
-    std::string GlobalName(Str + ".str");
-    Name = CGM.GetAddrOfConstantCString(Str, GlobalName.c_str()).getPointer();
-  }
+  std::string Str(BufDecl->getName());
+  std::string GlobalName(Str + ".str");
+  Name = CGM.GetAddrOfConstantCString(Str, GlobalName.c_str()).getPointer();
 
   // buffer with explicit binding
   if (RBA->hasRegisterSlot()) {
     auto *RegSlot = llvm::ConstantInt::get(CGM.IntTy, RBA->getSlotNumber());
-    SmallVector<Value *> Args{Space, RegSlot, RangeSize, Index, NonUniform};
-    if (Name)
-      Args.push_back(Name);
+    SmallVector<Value *> Args{Space, RegSlot,    RangeSize,
+                              Index, NonUniform, Name};
     initializeBuffer(CGM, GV, IntrinsicID, Args);
   } else {
     // buffer with implicit binding
     auto *OrderID =
         llvm::ConstantInt::get(CGM.IntTy, RBA->getImplicitBindingOrderID());
-    SmallVector<Value *> Args{OrderID, Space, RangeSize, Index, NonUniform};
-    if (Name)
-      Args.push_back(Name);
+    SmallVector<Value *> Args{OrderID, Space,      RangeSize,
+                              Index,   NonUniform, Name};
     initializeBuffer(CGM, GV, IntrinsicID, Args);
   }
 }
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index bb2b82fa1f5aa..89d2aff85d913 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -118,6 +118,10 @@ class CGHLSLRuntime {
 
   GENERATE_HLSL_INTRINSIC_FUNCTION(CreateResourceGetPointer,
                                    resource_getpointer)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromBinding,
+                                   resource_handlefrombinding)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromImplicitBinding,
+                                   resource_handlefromimplicitbinding)
   GENERATE_HLSL_INTRINSIC_FUNCTION(BufferUpdateCounter, resource_updatecounter)
   GENERATE_HLSL_INTRINSIC_FUNCTION(GroupMemoryBarrierWithGroupSync,
                                    group_memory_barrier_with_group_sync)
@@ -126,15 +130,6 @@ class CGHLSLRuntime {
   // End of reserved area for HLSL intrinsic getters.
   //===----------------------------------------------------------------------===//
 
-  // Returns ID of the intrinsic that initializes resource handle from binding
-  // and a bool value indicating whether the last argument of the intrinsic is
-  // the resource name (not all targets need that).
-  std::pair<llvm::Intrinsic::ID, bool> getCreateHandleFromBindingIntrinsic();
-
-  // Same as above but for implicit binding.
-  std::pair<llvm::Intrinsic::ID, bool>
-  getCreateHandleFromImplicitBindingIntrinsic();
-
 protected:
   CodeGenModule &CGM;
 
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index e1c4a7aaf5a2f..43335f81ed87f 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -117,15 +117,15 @@ let TargetPrefix = "spv" in {
   // array size of the binding, as well as an index and an indicator
   // whether that index may be non-uniform.
   def int_spv_resource_handlefrombinding
-      : DefaultAttrsIntrinsic<
-            [llvm_any_ty],
-            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
-            [IntrNoMem]>;
+      : DefaultAttrsIntrinsic<[llvm_any_ty],
+                              [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                               llvm_i32_ty, llvm_i1_ty, llvm_ptr_ty],
+                              [IntrNoMem]>;
   def int_spv_resource_handlefromimplicitbinding
-      : DefaultAttrsIntrinsic<
-            [llvm_any_ty],
-            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
-            [IntrNoMem]>;
+      : DefaultAttrsIntrinsic<[llvm_any_ty],
+                              [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+                               llvm_i32_ty, llvm_i1_ty, llvm_ptr_ty],
+                              [IntrNoMem]>;
 
   def int_spv_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
   def int_spv_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index c5e8269efd25a..292b83e05b56d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -799,107 +799,15 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
   return Reg;
 }
 
-static std::string GetSpirvImageTypeName(const SPIRVType *Type,
-                                         MachineIRBuilder &MIRBuilder,
-                                         const std::string &Prefix,
-                                         SPIRVGlobalRegistry &GR);
-
 // Returns a name based on the Type. Notes that this does not look at
 // decorations, and will return the same string for two types that are the same
 // except for decorations.
-static std::string buildSpirvTypeName(const SPIRVType *Type,
-                                      MachineIRBuilder &MIRBuilder,
-                                      SPIRVGlobalRegistry &GR) {
-  switch (Type->getOpcode()) {
-  case SPIRV::OpTypeSampledImage: {
-    return GetSpirvImageTypeName(Type, MIRBuilder, "sampled_image_", GR);
-  }
-  case SPIRV::OpTypeImage: {
-    return GetSpirvImageTypeName(Type, MIRBuilder, "image_", GR);
-  }
-  case SPIRV::OpTypeArray: {
-    MachineRegisterInfo *MRI = MIRBuilder.getMRI();
-    Register ElementTypeReg = Type->getOperand(1).getReg();
-    auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg);
-    uint32_t ArraySize = getArrayComponentCount(MRI, Type);
-    return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") +
-            Twine(ArraySize) + Twine("]"))
-        .str();
-  }
-  case SPIRV::OpTypeFloat:
-    return ("f" + Twine(Type->getOperand(1).getImm())).str();
-  case SPIRV::OpTypeSampler:
-    return ("sampler");
-  case SPIRV::OpTypeInt:
-    if (Type->getOperand(2).getImm())
-      return ("i" + Twine(Type->getOperand(1).getImm())).str();
-    return ("u" + Twine(Type->getOperand(1).getImm())).str();
-  case SPIRV::OpTypePointer: {
-    uint32_t StorageClass = GR.getPointerStorageClass(Type);
-    SPIRVType *PointeeType = GR.getPointeeType(Type);
-    return ("p_" + Twine(StorageClass) + Twine("_") +
-            buildSpirvTypeName(PointeeType, MIRBuilder, GR))
-        .str();
-  }
-  case SPIRV::OpTypeStruct: {
-    std::string TypeName = "{";
-    for (uint32_t I = 1; I < Type->getNumOperands(); ++I) {
-      SPIRVType *MemberType =
-          GR.getSPIRVTypeForVReg(Type->getOperand(I).getReg());
-      TypeName += '_' + buildSpirvTypeName(MemberType, MIRBuilder, GR);
-    }
-    return TypeName + "}";
-  }
-  case SPIRV::OpTypeVector: {
-    MachineRegisterInfo *MRI = MIRBuilder.getMRI();
-    Register ElementTypeReg = Type->getOperand(1).getReg();
-    auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg);
-    uint32_t VectorSize = GR.getScalarOrVectorComponentCount(Type);
-    return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") +
-            Twine(VectorSize) + Twine("]"))
-        .str();
-  }
-  case SPIRV::OpTypeRuntimeArray: {
-    MachineRegisterInfo *MRI = MIRBuilder.getMRI();
-    Register ElementTypeReg = Type->getOperand(1).getReg();
-    auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg);
-    uint32_t ArraySize = 0;
-    return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") +
-            Twine(ArraySize) + Twine("]"))
-        .str();
-  }
-  default:
-    llvm_unreachable("Trying to the the name of an unknown type.");
-  }
-}
-
-static std::string GetSpirvImageTypeName(const SPIRVType *Type,
-                                         MachineIRBuilder &MIRBuilder,
-                                         const std::string &Prefix,
-                                         SPIRVGlobalRegistry &GR) {
-  Register SampledTypeReg = Type->getOperand(1).getReg();
-  auto *SampledType = MIRBuilder.getMRI()->getUniqueVRegDef(SampledTypeReg);
-  std::string TypeName =
-      Prefix + buildSpirvTypeName(SampledType, MIRBuilder, GR);
-  for (uint32_t I = 2; I < Type->getNumOperands(); ++I) {
-    TypeName = (TypeName + '_' + Twine(Type->getOperand(I).getImm())).str();
-  }
-  return TypeName;
-}
-
 Register SPIRVGlobalRegistry::getOrCreateGlobalVariableWithBinding(
-    const SPIRVType *VarType, uint32_t Set, uint32_t Binding,
+    const SPIRVType *VarType, uint32_t Set, uint32_t Binding, StringRef Name,
     MachineIRBuilder &MIRBuilder) {
   Register VarReg =
       MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass);
 
-  // TODO(138533): The name should come from the llvm-ir, but how that name will
-  // be passed from the HLSL to the backend has not been decided. Using this
-  // place holder for now.
-  std::string Name =
-      ("__resource_" + buildSpirvTypeName(VarType, MIRBuilder, *this) + "_" +
-       Twine(Set) + "_" + Twine(Binding))
-          .str();
   buildGlobalVariable(VarReg, VarType, Name, nullptr,
                       getPointerStorageClass(VarType), nullptr, false, false,
                       SPIRV::LinkageType::Import, MIRBuilder, false);
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 3b481b3aba0c1..35f616a1981d2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -548,6 +548,7 @@ class SPIRVGlobalRegistry : public SPIRVIRMapping {
                                bool IsInstSelector);
   Register getOrCreateGlobalVariableWithBinding(const SPIRVType *VarType,
                                                 uint32_t Set, uint32_t Binding,
+                                                StringRef Name,
                                                 MachineIRBuilder &MIRBuilder);
 
   // Convenient helpers for getting types with check for duplicates.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 2dae0721886c7..8edd0b533b9fa 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -322,7 +322,7 @@ class SPIRVInstructionSelector : public InstructionSelector {
                                   SPIRV::StorageClass::StorageClass SC,
                                   uint32_t Set, uint32_t Binding,
                                   uint32_t ArraySize, Register IndexReg,
-                                  bool IsNonUniform,
+                                  bool IsNonUniform, StringRef Name,
                                   MachineIRBuilder MIRBuilder) const;
   SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const;
   bool extractSubvector(Register &ResVReg, const SPIRVType *ResType,
@@ -3380,14 +3380,14 @@ bool SPIRVInstructionSelector::selectImageWriteIntrinsic(
 Register SPIRVInstructionSelector::buildPointerToResource(
     const SPIRVType *SpirvResType, SPIRV::StorageClass::StorageClass SC,
     uint32_t Set, uint32_t Binding, uint32_t ArraySize, Register IndexReg,
-    bool IsNonUniform, MachineIRBuilder MIRBuilder) const {
+    bool IsNonUniform, StringRef Name, MachineIRBuilder MIRBuilder) const {
   const Type *ResType = GR.getTypeForSPIRVType(SpirvResType);
   if (ArraySize == 1) {
     SPIRVType *PtrType =
         GR.getOrCreateSPIRVPointerType(ResType, MIRBuilder, SC);
     assert(GR.getPointeeType(PtrType) == SpirvResType &&
            "SpirvResType did not have an explicit layout.");
-    return GR.getOrCreateGlobalVariableWithBinding(PtrType, Set, Binding,
+    return GR.getOrCreateGlobalVariableWithBinding(PtrType, Set, Binding, Name,
                                                    MIRBuilder);
   }
 
@@ -3395,7 +3395,7 @@ Register SPIRVInstructionSelector::buildPointerToResource(
   SPIRVType *VarPointerType =
       GR.getOrCreateSPIRVPointerType(VarType, MIRBuilder, SC);
   Register VarReg = GR.getOrCreateGlobalVariableWithBinding(
-      VarPointerType, Set, Binding, MIRBuilder);
+      VarPointerType, Set, Binding, Name, MIRBuilder);
 
   SPIRVType *ResPointerType =
       GR.getOrCreateSPIRVPointerType(ResType, MIRBuilder, SC);
@@ -4081,6 +4081,9 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition(
   uint32_t ArraySize = foldImm(HandleDef.getOperand(4), MRI);
   Register IndexReg = HandleDef.getOperand(5).getReg();
   bool IsNonUniform = ArraySize > 1 && foldImm(HandleDef.getOperand(6), MRI);
+  std::string Name =
+      getStringValueFromReg(HandleDef.getOperand(7).getReg(), *MRI);
+
   bool IsStructuredBuffer = ResType->getOpcode() == SPIRV::OpTypePointer;
   MachineIRBuilder MIRBuilder(HandleDef);
   SPIRVType *VarType = ResType;
@@ -4091,8 +4094,9 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition(
     SC = GR.getPointerStorageClass(ResType);
   }
 
-  Register VarReg = buildPointerToResource(VarType, SC, Set, Binding, ArraySize,
-                                           IndexReg, IsNonUniform, MIRBuilder);
+  Register VarReg =
+      buildPointerToResource(VarType, SC, Set, Binding, ArraySize, IndexReg,
+                             IsNonUniform, Name, MIRBuilder);
 
   if (IsNonUniform)
     buildOpDecorate(HandleReg, HandleDef, TII, SPIRV::Decoration::NonUniformEXT,
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 725a7979d3e5b..768efb96a53e9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -80,6 +80,16 @@ std::string getStringImm(const MachineInstr &MI, unsigned StartIndex) {
   return getSPIRVStringOperand(MI, StartIndex);
 }
 
+std::string getStringValueFromReg(Register Reg, MachineRegisterInfo &MRI) {
+  MachineInstr *Def = getVRegDef(MRI, Reg);
+  assert(Def && Def->getOpcode() == TargetOpcode::G_GLOBAL_VALUE &&
+         "Expected G_GLOBAL_VALUE");
+  const GlobalValue *GV = Def->getOperand(1).getGlobal();
+  Value *V = GV->getOperand(0);
+  const ConstantDataArray *CDA = cast<ConstantDataArray>(V);
+  return CDA->getAsCString().str();
+}
+
 void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB) {
   const auto Bitwidth = Imm.getBitWidth();
   if (Bitwidth == 1)
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index f14a7d356ea58..d732188f9289f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -125,6 +125,10 @@ void addStringImm(const StringRef &Str, IRBuilder<> &B,
 // the reverse of the logic in addStringImm.
 std::string getStringImm(const MachineInstr &MI, unsigned StartIndex);
 
+// Returns the string constant that the register refers to. It is assumed that
+// Reg is a global value that contains a string.
+std::string getStringValueFromReg(Register Reg, MachineRegisterInfo &MRI);
+
 // Add the given numerical immediate to MIB.
 void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB);
 
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
index 58252fe297f3e..b14b6af156caf 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoad.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 ; CHECK-NOT: OpCapability StorageImageReadWithoutFormat
 
 ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
@@ -20,7 +22,7 @@ define void @RWBufferLoad_Vec4_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: OpImageRead [[v4_int]] [[buffer]] [[zero]]
   %data0 = call <4 x i32> @llvm.spv.resource.load.typedbuffer(
@@ -35,7 +37,7 @@ define void @RWBufferLoad_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[V:%[0-9]+]] = OpImageRead [[v4_int]] [[buffer]] [[zero]]
 ; CHECK: OpCompositeExtract [[int]] [[V]] 0
@@ -51,7 +53,7 @@ define void @RWBufferLoad_Vec2_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[V:%[0-9]+]] = OpImageRead [[v4_int]] [[buffer]] [[zero]]
 ; CHECK: [[e0:%[0-9]+]] = OpCompositeExtract [[int]] [[V]] 0
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll
index d810ef9ccecc4..22fb4c3e78dcc 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferLoadStore.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 ; CHECK-DAG: [[float:%[0-9]+]] = OpTypeFloat 32
 ; CHECK-DAG: [[v2float:%[0-9]+]] = OpTypeVector [[float]] 2
 ; CHECK-DAG: [[v4float:%[0-9]+]] = OpTypeVector [[float]] 4
@@ -18,7 +20,7 @@
 define void @main_scalar() local_unnamed_addr #0 {
 entry:
 ; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]]
-  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false)
+  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[R:%[0-9]+]] = OpImageRead [[v4float]] [[H]] [[one]]
 ; CHECK: [[V:%[0-9]+]] = OpCompositeExtract [[float]] [[R]] 0
@@ -57,7 +59,7 @@ bb_both:
 define void @main_vector2() local_unnamed_addr #0 {
 entry:
 ; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]]
-  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false)
+  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[R:%[0-9]+]] = OpImageRead [[v4float]] [[H]] [[one]]
 ; CHECK: [[E0:%[0-9]+]] = OpCompositeExtract [[float]] [[R]] 0
@@ -100,7 +102,7 @@ bb_both:
 define void @main_vector4() local_unnamed_addr #0 {
 entry:
 ; CHECK: [[H:%[0-9]+]] = OpLoad [[ImageType]] [[Var]]
-  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false)
+  %s_h.i = tail call target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32 3, i32 5, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
 
 ; CHECK: [[R:%[0-9]+]] = OpImageRead [[v4float]] [[H]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1) %s_h.i, i32 1)
@@ -132,11 +134,5 @@ bb_both:
   ret void
 }
 
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-declare ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 1), i32) #1
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-declare target("spirv.Image", float, 5, 2, 0, 0, 2, 1) @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0t(i32, i32, i32, i32, i1) #1
-
 attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll
index 812e20e45565b..ee976f1a4110e 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/BufferStore.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O3 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b = private unnamed_addr constant [2 x i8] c"B\00", align 1
+
 ; CHECK-NOT: OpCapability StorageImageReadWithoutFormat
 
 ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
@@ -22,7 +24,7 @@ declare <4 x i32> @get_data() #1
 define void @RWBufferStore_Vec4_I32() #0 {
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_i32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b)
 
 ; CHECK: [[data:%[0-9]+]] = OpFunctionCall
   %data = call <4 x i32> @get_data()
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll
index d5f6545180147..5e9d88fd9af0e 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/Packed.ll
@@ -3,6 +3,10 @@
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
 
+
+@.str.unpacked = private unnamed_addr constant [12 x i8] c"UnpackedRes\00", align 1
+@.str.packed = private unnamed_addr constant [10 x i8] c"PackedRes\00", align 1
+
 ; CHECK-DAG: OpName [[unpacked:%[0-9]+]] "unpacked"
 ; CHECK-DAG: OpName [[packed:%[0-9]+]] "packed"
 
@@ -22,7 +26,7 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
 
 define external i32 @unpacked_vulkan_buffer_load() {
 entry:
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %unpacked], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %unpacked], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.unpacked)
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %unpacked], 12, 0) %handle, i32 1)
   %1 = load i32, ptr addrspace(11) %0, align 4
   ret i32 %1
@@ -30,7 +34,7 @@ entry:
 
 define external i32 @packed_vulkan_buffer_load() {
 entry:
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %packed], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %packed], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.packed)
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %packed], 12, 0) %handle, i32 1)
   %1 = load i32, ptr addrspace(11) %0, align 4
   ret i32 %1
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll
index f52fd44bf3801..03b41ae0df31f 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/ScalarResourceType.ll
@@ -1,6 +1,9 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.int_buf = private unnamed_addr constant [7 x i8] c"IntBuf\00", align 1
+@.str.float_buf = private unnamed_addr constant [9 x i8] c"FloatBuf\00", align 1
+
 ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
 ; CHECK-DAG: OpDecorate [[IntBufferVar]] Binding 7
 ; CHECK-DAG: OpDecorate [[FloatBufferVar:%[0-9]+]] DescriptorSet 16
@@ -21,7 +24,7 @@ define void @RWBufferLoad() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.int_buf)
   %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer0, i32 0)
   store i32 0, ptr %ptr0, align 4
 
@@ -29,7 +32,7 @@ define void @RWBufferLoad() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.int_buf)
   %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer1, i32 0)
   store i32 0, ptr %ptr1, align 4
   ret void
@@ -43,7 +46,7 @@ define void @UseDifferentGlobalVar() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeFloat]] [[FloatBufferVar]]
   %buffer0 = call target("spirv.Image", float, 5, 2, 0, 0, 2, 3)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_3(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.float_buf )
   %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", float, 5, 2, 0, 0, 2, 3) %buffer0, i32 0)
   store float 0.0, ptr %ptr0, align 4
   ret void
@@ -57,7 +60,7 @@ define void @ReuseGlobalVarFromFirstFunction() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer1 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_24(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.int_buf)
   %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 5, 2, 0, 0, 2, 24) %buffer1, i32 0)
   store i32 0, ptr %ptr1, align 4
   ret void
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll
index 6a6d810e6babd..236c5e4ea56a5 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageDynIdx.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 ; CHECK-DAG: OpCapability Shader
 ; CHECK-DAG: OpCapability StorageImageArrayDynamicIndexing
 ; CHECK-DAG: OpCapability Image1D
@@ -26,7 +28,7 @@ define void @main() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]]
   %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24(
-          i32 3, i32 4, i32 3, i32 0, i1 false)
+          i32 3, i32 4, i32 3, i32 0, i1 false, ptr nonnull @.str.b0)
   %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer0, i32 0)
   store i32 0, ptr %ptr0, align 4
 
@@ -34,7 +36,7 @@ define void @main() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[BufferType]] [[ac]]
   %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24(
-          i32 3, i32 4, i32 3, i32 1, i1 false)
+          i32 3, i32 4, i32 3, i32 1, i1 false, ptr nonnull @.str.b0)
   %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer1, i32 0)
   store i32 0, ptr %ptr1, align 4
   ret void
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll
index 16f3724d5d10a..5693f797c798e 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 ; CHECK-DAG: OpCapability Shader
 ; CHECK-DAG: OpCapability ShaderNonUniformEXT
 ; CHECK-DAG: OpCapability StorageImageArrayNonUniformIndexing
@@ -33,7 +35,7 @@ define void @main() #0 {
 ; CHECK: [[ld0]] = OpLoad [[BufferType]] [[ac0]]
   %buffer0 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24(
-          i32 3, i32 4, i32 3, i32 0, i1 true)
+          i32 3, i32 4, i32 3, i32 0, i1 true, ptr nonnull @.str.b0)
   %ptr0 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer0, i32 0)
   store i32 0, ptr %ptr0, align 4
 
@@ -41,7 +43,7 @@ define void @main() #0 {
 ; CHECK: [[ld1]] = OpLoad [[BufferType]] [[ac1]]
   %buffer1 = call target("spirv.Image", i32, 0, 2, 0, 0, 2, 24)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_0_2_0_0_2_24(
-          i32 3, i32 4, i32 3, i32 1, i1 true)
+          i32 3, i32 4, i32 3, i32 1, i1 true, ptr nonnull @.str.b0)
   %ptr1 = tail call noundef nonnull align 4 dereferenceable(4) ptr @llvm.spv.resource.getpointer.p0.tspirv.Image_f32_5_2_0_0_2_0t(target("spirv.Image", i32, 0, 2, 0, 0, 2, 24) %buffer1, i32 0)
   store i32 0, ptr %ptr1, align 4
   ret void
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
index f539fdefa3fa2..e47685cd38a2a 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll
@@ -3,11 +3,8 @@
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
 
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-declare target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32, i32, i32, i32, i1) #0
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
-declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32, i32, i32, i32, i1) #0
+@.str.b = private unnamed_addr constant [2 x i8] c"B\00", align 1
+@.str.rwb = private unnamed_addr constant [4 x i8] c"RWB\00", align 1
 
 ; CHECK: OpDecorate [[BufferVar:%.+]] DescriptorSet 0
 ; CHECK: OpDecorate [[BufferVar]] Binding 0
@@ -40,9 +37,9 @@ entry:
 ; CHECK-DAG: [[BufferHandle:%.+]] = OpCopyObject [[BufferPtrType]] [[BufferVar]]
 ; CHECK-DAG: [[BufferHandle2:%.+]] = OpCopyObject [[BufferPtrType]] [[BufferVar]]
 ; CHECK-DAG: [[RWBufferHandle:%.+]] = OpCopyObject [[RWBufferPtrType]] [[RWBufferVar]]
-  %BufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %BufferHandle2 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false)
-  %RWBufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %BufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.b)
+  %BufferHandle2 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.b)
+  %RWBufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.rwb)
 
 ; CHECK: [[AC:%.+]] = OpAccessChain {{.*}} [[BufferHandle]] [[zero]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_0t(target("spirv.VulkanBuffer", [0 x i32], 12, 0) %BufferHandle,  i32 1)
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll
index 4ec8605f68137..704665d7e52ea 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferLoad.ll
@@ -1,8 +1,11 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %}
 
+@.str = private unnamed_addr constant [4 x i8] c"Buf\00", align 1
+
 ; CHECK: OpCapability StorageImageReadWithoutFormat
-; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
+; CHECK: OpName [[IntBufferVar:%[0-9]+]] "Buf"
+; CHECK-DAG: OpDecorate [[IntBufferVar]] DescriptorSet 16
 ; CHECK-DAG: OpDecorate [[IntBufferVar]] Binding 7
 
 ; CHECK-DAG: [[int:%[0-9]+]] = OpTypeInt 32 0
@@ -18,7 +21,7 @@ define void @RWBufferLoad_Vec4_I32() #0 {
 ; CHECK: [[buffer:%[0-9]+]] = OpLoad [[RWBufferTypeInt]] [[IntBufferVar]]
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 0)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str)
 
 ; CHECK: OpImageRead [[v4_int]] [[buffer]] [[zero]]
   %data0 = call <4 x i32> @llvm.spv.resource.load.typedbuffer(
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll
index a4123c36a4488..27ae6a03797c3 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UnknownBufferStore.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %}
 
+@.str.b = private unnamed_addr constant [2 x i8] c"B\00", align 1
+
 ; CHECK: OpCapability StorageImageWriteWithoutFormat
 ; CHECK-DAG: OpDecorate [[IntBufferVar:%[0-9]+]] DescriptorSet 16
 ; CHECK-DAG: OpDecorate [[IntBufferVar]] Binding 7
@@ -20,7 +22,7 @@ declare <4 x i32> @get_data() #1
 define void @RWBufferLoad_Vec4_I32() #0 {
   %buffer0 = call target("spirv.Image", i32, 5, 2, 0, 0, 2, 0)
       @llvm.spv.resource.handlefrombinding.tspirv.Image_f32_5_2_0_0_2_0(
-          i32 16, i32 7, i32 1, i32 0, i1 false)
+          i32 16, i32 7, i32 1, i32 0, i1 false, ptr nonnull @.str.b)
 
 ; CHECK: [[data:%[0-9]+]] = OpFunctionCall
   %data = call <4 x i32> @get_data()
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/spirv.layout.type.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/spirv.layout.type.ll
index 14c98b2fd55a5..064251a57dfc6 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/spirv.layout.type.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/spirv.layout.type.ll
@@ -3,6 +3,12 @@
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G10"
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+@.str.b1 = private unnamed_addr constant [3 x i8] c"B1\00", align 1
+@.str.b2 = private unnamed_addr constant [3 x i8] c"B2\00", align 1
+@.str.b3 = private unnamed_addr constant [3 x i8] c"B3\00", align 1
+@.str.b4 = private unnamed_addr constant [3 x i8] c"B4\00", align 1
+
 ; CHECK-DAG: OpName [[standard_layout:%[0-9]+]] "standard_layout"
 ; CHECK-DAG: OpMemberDecorate [[standard_layout]] 0 Offset 0
 ; CHECK-DAG: OpMemberDecorate [[standard_layout]] 1 Offset 4
@@ -33,11 +39,11 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
 
 define void @main() local_unnamed_addr #1 {
 entry:
-  %standard_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %standard_layout, 8, 0, 4), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
-  %standard_handle_with_different_offset = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %standard_layout, 12, 0, 8), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
-  %backwards_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %backwards_layout, 8, 4, 0), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
-  %large_gap_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %large_gap, 1024, 0, 64, 1020, 4), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
-  %mixed_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %mixed_layout, 16, 0, 8, 4, 12), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %standard_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %standard_layout, 8, 0, 4), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
+  %standard_handle_with_different_offset = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %standard_layout, 12, 0, 8), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b1)
+  %backwards_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %backwards_layout, 8, 4, 0), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b2)
+  %large_gap_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %large_gap, 1024, 0, 64, 1020, 4), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b3)
+  %mixed_handle = tail call target("spirv.VulkanBuffer", target("spirv.Layout", %mixed_layout, 16, 0, 8, 4, 12), 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_tspirv.Layout_s___cblayout_Bs_8_0_4t_2_0t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b4)
   ret void
 }
 
diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll
index d608529b421cc..d87c175c36916 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast-2.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s --match-full-lines
 ; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
 
+@.str = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 %S2 = type { { [10 x { i32, i32 } ] }, i32 }
 
 ; CHECK-DAG:                     %[[#uint:]] = OpTypeInt 32 0
@@ -21,11 +23,9 @@
 ; CHECK-DAG:              %[[#rarr_struct:]] = OpTypeStruct %[[#rarr]]
 ; CHECK-DAG:       %[[#spirv_VulkanBuffer:]] = OpTypePointer StorageBuffer %[[#rarr_struct]]
 
-declare target("spirv.VulkanBuffer", [0 x %S2], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_Ss_12_1t(i32, i32, i32, i32, i1)
-
 define void @main() "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" {
 entry:
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %S2], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false)
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %S2], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str)
 ; CHECK:      %[[#resource:]] = OpVariable %[[#spirv_VulkanBuffer]] StorageBuffer
 
   %ptr = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0s_Ss_12_1t(target("spirv.VulkanBuffer", [0 x %S2], 12, 1) %handle, i32 0)
diff --git a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll
index b1446b7529ea4..5a469a4515b79 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/resource-addrspacecast.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -verify-machineinstrs -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O3 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val %}
 
+@.str = private unnamed_addr constant [3 x i8] c"B0\00", align 1
+
 %struct.S = type { i32 }
 
 ; CHECK-DAG:                     %[[#uint:]] = OpTypeInt 32 0
@@ -13,11 +15,9 @@
 ; CHECK-DAG:              %[[#rarr_struct:]] = OpTypeStruct %[[#rarr]]
 ; CHECK-DAG:       %[[#spirv_VulkanBuffer:]] = OpTypePointer StorageBuffer %[[#rarr_struct]]
 
-declare target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(i32, i32, i32, i32, i1)
-
 define void @main() "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" {
 entry:
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false)
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str)
 ; CHECK:      %[[#resource:]] = OpVariable %[[#spirv_VulkanBuffer]] StorageBuffer
 
   %ptr = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0s_struct.Ss_12_1t(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 1) %handle, i32 0)
diff --git a/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll b/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll
index 7303471c9929c..4cc07c249be93 100644
--- a/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll
+++ b/llvm/test/CodeGen/SPIRV/spirv-explicit-layout.ll
@@ -3,9 +3,14 @@
 
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1"
 
-; CHECK-DAG: OpName [[ScalarBlock_var:%[0-9]+]] "__resource_p_12_{_u32[0]}_0_0"
-; CHECK-DAG: OpName [[buffer_var:%[0-9]+]] "__resource_p_12_{_{_{_u32_f32[3]}[10]}[0]}_0_0"
-; CHECK-DAG: OpName [[array_buffer_var:%[0-9]+]] "__resource_p_12_{_{_{_u32_f32[3]}[10]}[0]}[10]_0_0"
+@.str.scalarblock = private unnamed_addr constant [12 x i8] c"ScalarBlock\00", align 1
+@.str.buffervar = private unnamed_addr constant [10 x i8] c"BufferVar\00", align 1
+@.str.arraybuffervar = private unnamed_addr constant [15 x i8] c"ArrayBufferVar\00", align 1
+
+
+; CHECK-DAG: OpName [[ScalarBlock_var:%[0-9]+]] "ScalarBlock"
+; CHECK-DAG: OpName [[buffer_var:%[0-9]+]] "BufferVar"
+; CHECK-DAG: OpName [[array_buffer_var:%[0-9]+]] "ArrayBufferVar"
 
 ; CHECK-DAG: OpMemberDecorate [[ScalarBlock:%[0-9]+]] 0 Offset 0
 ; CHECK-DAG: OpDecorate [[ScalarBlock]] Block
@@ -63,8 +68,8 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
 define external i32 @scalar_vulkan_buffer_load() {
 ; CHECK-NEXT: OpLabel
 entry:
-; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[ScalarBlock_ptr]] [[ScalarBlock_var]]
-  %handle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+; CHECK: [[handle:%[0-9]+]] = OpCopyObject [[ScalarBlock_ptr]] [[ScalarBlock_var]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.scalarblock)
 
 ; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_int_ptr]] [[handle]] [[zero]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x i32], 12, 0) %handle, i32 1)
@@ -83,7 +88,7 @@ define external %struct.S @private_load() {
 ; CHECK-NEXT: OpLabel
 entry:
 
-; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[S]] [[private_var]] Aligned 4
+; CHECK: [[ld:%[0-9]+]] = OpLoad [[S]] [[private_var]] Aligned 4
   %1 = load %struct.S, ptr addrspace(10) @private, align 4
 
 ; CHECK-NEXT: OpReturnValue [[ld]]
@@ -97,7 +102,7 @@ define external %struct.S @storage_buffer_load() {
 ; CHECK-NEXT: OpLabel
 entry:
 
-; CHECK-NEXT: [[ld:%[0-9]+]] = OpLoad [[S_explicit]] [[storage_buffer]] Aligned 4
+; CHECK: [[ld:%[0-9]+]] = OpLoad [[S_explicit]] [[storage_buffer]] Aligned 4
 ; CHECK-NEXT: [[copy:%[0-9]+]] = OpCopyLogical [[S]] [[ld]]
   %1 = load %struct.S, ptr addrspace(11) @storage_buffer, align 4
 
@@ -111,8 +116,8 @@ entry:
 define external %struct.S @vulkan_buffer_load() {
 ; CHECK-NEXT: OpLabel
 entry:
-; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[buffer_var]]
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false)
+; CHECK: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[buffer_var]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, i1 false, ptr nonnull @.str.buffervar)
 
 ; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_S_ptr]] [[handle]] [[zero]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) %handle, i32 1)
@@ -131,9 +136,9 @@ entry:
 define external %struct.S @array_of_vulkan_buffers_load() {
 ; CHECK-NEXT: OpLabel
 entry:
-; CHECK-NEXT: [[h:%[0-9]+]] = OpAccessChain [[buffer_ptr]] [[array_buffer_var]] [[one]]
-; CHECK-NEXT: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[h]]
-  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 1, i1 false)
+; CHECK: [[h:%[0-9]+]] = OpAccessChain [[buffer_ptr]] [[array_buffer_var]] [[one]]
+; CHECK: [[handle:%[0-9]+]] = OpCopyObject [[buffer_ptr]] [[h]]
+  %handle = tail call target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 1, i1 false, ptr nonnull @.str.arraybuffervar)
 
 ; CHECK-NEXT: [[ptr:%[0-9]+]] = OpAccessChain [[storagebuffer_S_ptr]] [[handle]] [[zero]] [[one]]
   %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.VulkanBuffer", [0 x %struct.S], 12, 0) %handle, i32 1)

From 68b6f392ed446ff8edfbb2a52899c9361d45ba28 Mon Sep 17 00:00:00 2001
From: Daniel Hernandez-Juarez <danherna@amd.com>
Date: Fri, 13 Jun 2025 18:33:51 +0200
Subject: [PATCH 393/851] [MLIR][AMDGPU] Fix bug in GatherToLDSOpLowering, get
 the correct MemRefType for destination (#142915)

This PR fixes a bug in GatherToLDSOpLowering, we were getting the
MemRefType of source for the destination. Additionally, some related
typos are corrected.

CC: @krzysz00 @umangyadav @lialan
---
 llvm/docs/AMDGPUUsage.rst                     |  4 ++--
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  2 +-
 .../Conversion/AMDGPUToROCDL/load_lds.mlir    | 20 ++++++++++---------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e0a43225e81be..39f04f8e01b85 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1215,12 +1215,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
                                                    denormalization mode, enabled traps, and floating point exceptions.
                                                    The format is a 64-bit concatenation of the MODE and TRAPSTS registers.
 
-  :ref:`llvm.set.fpenv<int_set_fpenv>`             Sets the floating point environment to the specifies state.
+  :ref:`llvm.set.fpenv<int_set_fpenv>`             Sets the floating point environment to the specified state.
   llvm.amdgcn.load.to.lds.p<1/7>                   Loads values from global memory (either in the form of a global
                                                    a raw fat buffer pointer) to LDS. The size of the data copied can be 1, 2,
                                                    or 4 bytes (and gfx950 also allows 12 or 16 bytes). The LDS pointer
                                                    argument should be wavefront-uniform; the global pointer need not be.
-                                                   The LDS pointer is implicitly offset by 4 * lane_id bytes for sies <= 4 bytes
+                                                   The LDS pointer is implicitly offset by 4 * lane_id bytes for size <= 4 bytes
                                                    and 16 * lane_id bytes for larger sizes. This lowers to `global_load_lds`,
                                                    `buffer_load_* ... lds`, or `global_load__* ... lds` depending on address
                                                    space and architecture. `amdgcn.global.load.lds` has the same semantics as
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5e6f675a6414b..074404add47f1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1101,7 +1101,7 @@ struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
     Location loc = op.getLoc();
 
     auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
-    auto dstMemRefType = cast<MemRefType>(op.getSrc().getType());
+    auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
 
     // TODO: instead of only transfering one element per thread, we could
     // augment it to transfer multiple elements per thread by issuing multiple
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
index cb3539dd11be3..581346e03b893 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
@@ -31,8 +31,8 @@ func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_add
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
 
-  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
-  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
   // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
 
   // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
@@ -65,8 +65,8 @@ func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrs
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
 
-  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
-  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
   // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
 
   // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
@@ -103,8 +103,8 @@ func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_add
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
 
-  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
-  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[C128:.*]] = llvm.mlir.constant(128 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C128]] : i64
   // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
 
   // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]
@@ -130,7 +130,9 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g
   // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
-  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX_CAST]]]
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64
+  // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]]
   // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4
   %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace>
   %c0 = arith.constant 0 : index
@@ -166,8 +168,8 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat
   // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
   // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
 
-  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
-  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C64]] : i64
   // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64
 
   // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]]

From 3b09a3d5ae41faac3c0046b93a9c6e0297cc860b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 09:36:38 -0700
Subject: [PATCH 394/851] MC,SPARC: Replace SparcMCExpr with MCSpecifierExpr

Add a hook printSpecifierExpr so that targets can implement
relocation specifier printing without inheriting from MCSpecifierExpr.
---
 llvm/include/llvm/MC/MCAsmInfo.h                    |  2 ++
 llvm/include/llvm/MC/MCExpr.h                       |  7 +++++--
 llvm/lib/MC/MCAsmInfo.cpp                           | 12 +++++++++++-
 llvm/lib/MC/MCExpr.cpp                              |  2 ++
 .../Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp  |  5 +++--
 .../Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp    | 10 ++++++++++
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h |  2 ++
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp  | 13 ++-----------
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h    |  7 +------
 9 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 18303e028f623..1f2ea0cfaaff0 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -28,6 +28,7 @@ namespace llvm {
 class MCContext;
 class MCCFIInstruction;
 class MCExpr;
+class MCSpecifierExpr;
 class MCSection;
 class MCStreamer;
 class MCSubtargetInfo;
@@ -712,6 +713,7 @@ class LLVM_ABI MCAsmInfo {
   std::optional<uint32_t> getSpecifierForName(StringRef Name) const;
 
   void printExpr(raw_ostream &, const MCExpr &) const;
+  virtual void printSpecifierExpr(raw_ostream &, const MCSpecifierExpr &) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 0b8af09fe1c2f..1c72269e53e29 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -82,6 +82,7 @@ class MCExpr {
   /// \name Utility Methods
   /// @{
 
+  // TODO: Make this private. Users should call MCAsmInfo::printExpr instead.
   LLVM_ABI void print(raw_ostream &OS, const MCAsmInfo *MAI,
                       int SurroundingPrec = 0) const;
   LLVM_ABI void dump() const;
@@ -509,7 +510,7 @@ class LLVM_ABI MCSpecifierExpr : public MCExpr {
   // Target-specific relocation specifier code
   const Spec specifier;
 
-protected:
+public:
   explicit MCSpecifierExpr(const MCExpr *Expr, Spec S)
       : MCExpr(Specifier, SMLoc()), Expr(Expr), specifier(S) {}
   virtual ~MCSpecifierExpr() = default;
@@ -518,7 +519,9 @@ class LLVM_ABI MCSpecifierExpr : public MCExpr {
   Spec getSpecifier() const { return specifier; }
   const MCExpr *getSubExpr() const { return Expr; }
 
-  virtual void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const = 0;
+  virtual void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+    llvm_unreachable("Replace MCExpr::print calls with MCAsmInfo::printExpr");
+  }
   virtual bool evaluateAsRelocatableImpl(MCValue &Res,
                                          const MCAssembler *Asm) const;
 
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index fbacca4f56796..13b077349a587 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -151,5 +151,15 @@ std::optional<uint32_t> MCAsmInfo::getSpecifierForName(StringRef Name) const {
 }
 
 void MCAsmInfo::printExpr(raw_ostream &OS, const MCExpr &Expr) const {
-  Expr.print(OS, this);
+  if (auto *SE = dyn_cast<MCSpecifierExpr>(&Expr))
+    printSpecifierExpr(OS, *SE);
+  else
+    Expr.print(OS, this);
+}
+
+void MCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                   const MCSpecifierExpr &Expr) const {
+  // TODO: Switch to unreachable after all targets that use MCSpecifierExpr
+  // migrate to MCAsmInfo::printSpecifierExpr.
+  Expr.printImpl(OS, this);
 }
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index aec698721d9d7..2ae440cba46f9 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -174,6 +174,8 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI,
   }
 
   case MCExpr::Specifier:
+    // TODO: Remove after all targets that use MCSpecifierExpr migrate to
+    // MCAsmInfo::printSpecifierExpr.
     return cast<MCSpecifierExpr>(this)->printImpl(OS, MAI);
   }
 
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
index 936518da35110..2d1a4fe94d180 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
@@ -12,6 +12,7 @@
 
 #include "SparcInstPrinter.h"
 #include "Sparc.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -142,7 +143,7 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void SparcInstPrinter::printMemOperand(const MCInst *MI, int opNum,
@@ -288,5 +289,5 @@ void SparcInstPrinter::printCTILabel(const MCInst *MI, uint64_t Address,
   }
 
   // Otherwise, just print the expression.
-  Op.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *Op.getExpr());
 }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 3049072b001cb..4156780e962dc 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -66,3 +66,13 @@ SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
   }
   return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
 }
+
+void SparcELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                           const MCSpecifierExpr &Expr) const {
+  StringRef S = Sparc::getSpecifierName(Expr.getSpecifier());
+  if (!S.empty())
+    OS << '%' << S << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  if (!S.empty())
+    OS << ')';
+}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index c9162f2dc8a53..7ea800f119174 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -32,6 +32,8 @@ class SparcELFMCAsmInfo : public MCAsmInfoELF {
                                     unsigned Encoding,
                                     MCStreamer &Streamer) const override;
 
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 2e03e47399864..6a08fa5c9f3f7 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -24,21 +24,12 @@ using namespace llvm;
 
 const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
                                               const MCExpr *Expr, uint16_t S) {
-  return new (Ctx) SparcMCExpr(Expr, S);
+  return new (Ctx) MCSpecifierExpr(Expr, S);
 }
 
 const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
                                               const MCSymbol *Sym, uint16_t S) {
-  return new (Ctx) SparcMCExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
-}
-
-void SparcMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  StringRef S = Sparc::getSpecifierName(specifier);
-  if (!S.empty())
-    OS << '%' << S << '(';
-  getSubExpr()->print(OS, MAI);
-  if (!S.empty())
-    OS << ')';
+  return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
 }
 
 StringRef Sparc::getSpecifierName(uint16_t S) {
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 612b439bfc740..78af9a8150200 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -20,12 +20,7 @@
 namespace llvm {
 
 class StringRef;
-class SparcMCExpr : public MCSpecifierExpr {
-public:
-  explicit SparcMCExpr(const MCExpr *Expr, uint16_t S)
-      : MCSpecifierExpr(Expr, S) {}
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-};
+using SparcMCExpr = MCSpecifierExpr;
 
 namespace Sparc {
 const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCExpr *Expr,

From 36c710c40e8a59f74f56eb0e04e438cec5532ec5 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 09:42:58 -0700
Subject: [PATCH 395/851] [CIR] Change default assumption about allowing
 builtins (#144004)

The code to read the "nobuiltins" attributes hasn't been implemented
yet, but we were defaulting to the assumption that use of builtins is
allowed for function calls that we recognize as standard C library calls
and have builtin equivalents of. This change reverses that assumption so
that when such calls are encountered, we just emit the call. This is a
better default assumption, and since our builtin handling for these
functions isn't implemented yet, it also allows us to compile more
programs.
---
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp |  3 +-
 clang/test/CIR/CodeGen/libc.c        | 55 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CIR/CodeGen/libc.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 99f942fcf2cd3..42d0c78013f57 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1056,7 +1056,8 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) {
 
     bool isPredefinedLibFunction =
         cgm.getASTContext().BuiltinInfo.isPredefinedLibFunction(builtinID);
-    bool hasAttributeNoBuiltin = false;
+    // Assume nobuiltins everywhere until we actually read the attributes.
+    bool hasAttributeNoBuiltin = true;
     assert(!cir::MissingFeatures::attributeNoBuiltin());
 
     // When directing calling an inline builtin, call it through it's mangled
diff --git a/clang/test/CIR/CodeGen/libc.c b/clang/test/CIR/CodeGen/libc.c
new file mode 100644
index 0000000000000..f65fe92cd36a0
--- /dev/null
+++ b/clang/test/CIR/CodeGen/libc.c
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+// Note: In the final implementation, we will want these to generate
+//       CIR-specific libc operations. This test is just a placeholder
+//       to make sure we can compile these to normal function calls
+//       until the special handling is implemented.
+
+void *memcpy(void *, const void *, unsigned long);
+void testMemcpy(void *dst, const void *src, unsigned long size) {
+  memcpy(dst, src, size);
+  // CHECK: cir.call @memcpy
+}
+
+void *memmove(void *, const void *, unsigned long);
+void testMemmove(void *src, const void *dst, unsigned long size) {
+  memmove(dst, src, size);
+  // CHECK: cir.call @memmove
+}
+
+void *memset(void *, int, unsigned long);
+void testMemset(void *dst, int val, unsigned long size) {
+  memset(dst, val, size);
+  // CHECK: cir.call @memset
+}
+
+double fabs(double);
+double testFabs(double x) {
+  return fabs(x);
+  // CHECK: cir.call @fabs
+}
+
+float fabsf(float);
+float testFabsf(float x) {
+  return fabsf(x);
+  // CHECK: cir.call @fabsf
+}
+
+int abs(int);
+int testAbs(int x) {
+  return abs(x);
+  // CHECK: cir.call @abs
+}
+
+long labs(long);
+long testLabs(long x) {
+  return labs(x);
+  // CHECK: cir.call @labs
+}
+
+long long llabs(long long);
+long long testLlabs(long long x) {
+  return llabs(x);
+  // CHECK: cir.call @llabs
+}

From 3bf1e1f79ce5b4921586b24014acf5888c35e03f Mon Sep 17 00:00:00 2001
From: Igor Wodiany <igor.wodiany@imgtec.com>
Date: Fri, 13 Jun 2025 17:47:06 +0100
Subject: [PATCH 396/851] [mlir][spirv] Add definition of OpImageRead (#144038)

---
 .../mlir/Dialect/SPIRV/IR/SPIRVBase.td        |  4 +-
 .../mlir/Dialect/SPIRV/IR/SPIRVImageOps.td    | 57 +++++++++++++++++++
 mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp        | 17 ++++++
 mlir/test/Dialect/SPIRV/IR/image-ops.mlir     | 28 +++++++++
 mlir/test/Target/SPIRV/image-ops.mlir         | 11 +++-
 5 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
index e413503bbd672..d2ba76cdad904 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td
@@ -4370,6 +4370,7 @@ def SPIRV_OC_OpImageSampleImplicitLod         : I32EnumAttrCase<"OpImageSampleIm
 def SPIRV_OC_OpImageSampleExplicitLod         : I32EnumAttrCase<"OpImageSampleExplicitLod", 88>;
 def SPIRV_OC_OpImageSampleProjDrefImplicitLod : I32EnumAttrCase<"OpImageSampleProjDrefImplicitLod", 93>;
 def SPIRV_OC_OpImageDrefGather                : I32EnumAttrCase<"OpImageDrefGather", 97>;
+def SPIRV_OC_OpImageRead                      : I32EnumAttrCase<"OpImageRead", 98>;
 def SPIRV_OC_OpImageWrite                     : I32EnumAttrCase<"OpImageWrite", 99>;
 def SPIRV_OC_OpImage                          : I32EnumAttrCase<"OpImage", 100>;
 def SPIRV_OC_OpImageQuerySize                 : I32EnumAttrCase<"OpImageQuerySize", 104>;
@@ -4577,7 +4578,8 @@ def SPIRV_OpcodeAttr :
       SPIRV_OC_OpCompositeInsert, SPIRV_OC_OpTranspose,
       SPIRV_OC_OpImageSampleImplicitLod, SPIRV_OC_OpImageSampleExplicitLod,
       SPIRV_OC_OpImageSampleProjDrefImplicitLod, SPIRV_OC_OpImageDrefGather,
-      SPIRV_OC_OpImageWrite, SPIRV_OC_OpImage, SPIRV_OC_OpImageQuerySize,
+      SPIRV_OC_OpImageRead, SPIRV_OC_OpImageWrite, SPIRV_OC_OpImage,
+      SPIRV_OC_OpImageQuerySize,
       SPIRV_OC_OpConvertFToU, SPIRV_OC_OpConvertFToS, SPIRV_OC_OpConvertSToF,
       SPIRV_OC_OpConvertUToF, SPIRV_OC_OpUConvert, SPIRV_OC_OpSConvert,
       SPIRV_OC_OpFConvert, SPIRV_OC_OpConvertPtrToU, SPIRV_OC_OpConvertUToPtr,
diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
index 9999e5cc07b86..7610966b84be3 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVImageOps.td
@@ -186,6 +186,63 @@ def SPIRV_ImageQuerySizeOp : SPIRV_Op<"ImageQuerySize", [Pure]> {
 
 // -----
 
+def SPIRV_ImageReadOp : SPIRV_Op<"ImageRead",
+    [SPIRV_SampledOperandIs<"image", ["SamplerUnknown", "NoSampler"]>,
+     SPIRV_NoneOrElementMatchImage<"result", "image">]> {
+  let summary = "Read a texel from an image without a sampler.";
+
+  let description = [{
+    Result Type must be a scalar or vector of floating-point type or integer
+    type. It must be a scalar or vector with component type the same as Sampled
+    Type of the OpTypeImage (unless that Sampled Type is OpTypeVoid).
+
+    Image must be an object whose type is OpTypeImage with a Sampled operand of
+    0 or 2. If the Arrayed operand is 1, then additional capabilities may be
+    required; e.g., ImageCubeArray, or ImageMSArray.
+
+    Coordinate must be a scalar or vector of floating-point type or integer
+    type. It contains non-normalized texel coordinates (u[, v] ... [, array
+    layer]) as needed by the definition of Image. See the client API
+    specification for handling of coordinates outside the image.
+
+    If the Image Dim operand is SubpassData, Coordinate is relative to the
+    current fragment location. See the client API specification for more detail
+    on how these coordinates are applied.
+
+    If the Image Dim operand is not SubpassData, the Image Format must not be
+    Unknown, unless the StorageImageReadWithoutFormat Capability was declared.
+
+    Image Operands encodes what operands follow, as per Image Operands.
+
+    <!-- End of AutoGen section -->
+
+    #### Example:
+
+    ```mlir
+    %0 = spirv.ImageRead %1, %2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, R32f>, vector<2xsi32> -> vector<4xf32>
+    ```
+  }];
+
+  let arguments = (ins
+    SPIRV_AnyImage:$image,
+    AnyTypeOf<[SPIRV_ScalarOrVectorOf<SPIRV_Float>, SPIRV_ScalarOrVectorOf<SPIRV_Integer>]>:$coordinate,
+    OptionalAttr<SPIRV_ImageOperandsAttr>:$image_operands,
+    Variadic<SPIRV_Type>:$operand_arguments
+  );
+
+  let results = (outs
+    AnyTypeOf<[SPIRV_ScalarOrVectorOf<SPIRV_Float>, SPIRV_ScalarOrVectorOf<SPIRV_Integer>]>:$result
+  );
+
+  let assemblyFormat = [{
+    $image `,` $coordinate custom<ImageOperands>($image_operands) ( `,` $operand_arguments^ )? attr-dict
+    `:` type($image) `,` type($coordinate) ( `,` type($operand_arguments)^ )?
+    `->` type($result)
+  }];
+}
+
+// -----
+
 def SPIRV_ImageWriteOp : SPIRV_Op<"ImageWrite",
     [SPIRV_SampledOperandIs<"image", ["SamplerUnknown", "NoSampler"]>,
      SPIRV_DimIsNot<"image", ["SubpassData"]>,
diff --git a/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp b/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp
index a021931425fb0..f7af79ceefa82 100644
--- a/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp
@@ -204,6 +204,23 @@ LogicalResult spirv::ImageDrefGatherOp::verify() {
                              getOperandArguments());
 }
 
+//===----------------------------------------------------------------------===//
+// spirv.ImageReadOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult spirv::ImageReadOp::verify() {
+  // TODO: Do we need check for: "If the Arrayed operand is 1, then additional
+  // capabilities may be required; e.g., ImageCubeArray, or ImageMSArray."?
+
+  // TODO: Ideally it should be somewhere verified that "If the Image Dim
+  // operand is not SubpassData, the Image Format must not be Unknown, unless
+  // the StorageImageReadWithoutFormat Capability was declared." This function
+  // however may not be the suitable place for such verification.
+
+  return verifyImageOperands(getOperation(), getImageOperandsAttr(),
+                             getOperandArguments());
+}
+
 //===----------------------------------------------------------------------===//
 // spirv.ImageWriteOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SPIRV/IR/image-ops.mlir b/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
index 1ebdfdb41de1b..484a54023edc0 100644
--- a/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/image-ops.mlir
@@ -116,6 +116,34 @@ func.func @image_query_size_error_result2(%arg0 : !spirv.image<f32, Buffer, NoDe
 
 // -----
 
+//===----------------------------------------------------------------------===//
+// spirv.ImageRead
+//===----------------------------------------------------------------------===//
+
+func.func @image_read(%arg0: !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1: vector<2xsi32>) -> () {
+  // CHECK: {{%.*}} = spirv.ImageRead {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf32>
+  %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf32>
+  spirv.Return
+}
+
+// -----
+
+func.func @image_read_type_mismatch(%arg0: !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1: vector<2xsi32>) -> () {
+  // expected-error @+1 {{op failed to verify that the result component type must match the image sampled type}}
+  %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf16>
+  spirv.Return
+}
+
+// -----
+
+func.func @image_read_need_sampler(%arg0: !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Rgba8>, %arg1: vector<2xsi32>) -> () {
+  // expected-error @+1 {{op failed to verify that the sampled operand of the underlying image must be SamplerUnknown or NoSampler}}
+  %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NeedSampler, Rgba8>, vector<2xsi32> -> vector<4xf16>
+  spirv.Return
+}
+
+// -----
+
 //===----------------------------------------------------------------------===//
 // spirv.ImageWrite
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Target/SPIRV/image-ops.mlir b/mlir/test/Target/SPIRV/image-ops.mlir
index 6dd23844d46a9..b8d19f0f9a7d1 100644
--- a/mlir/test/Target/SPIRV/image-ops.mlir
+++ b/mlir/test/Target/SPIRV/image-ops.mlir
@@ -13,6 +13,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ImageQuery, Link
     %0 = spirv.ImageQuerySize %arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown> -> vector<2xi32>
     spirv.Return
   }
+  spirv.func @image_read(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1 : vector<2xsi32>) "None" {
+    // CHECK: {{.*}} = spirv.ImageRead {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf32>
+    %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32> -> vector<4xf32>
+    spirv.Return
+  }
   spirv.func @image_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) "None" {
     // CHECK: spirv.ImageWrite {{%.*}}, {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32>, vector<4xf32>
     spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Rgba8>, vector<2xsi32>, vector<4xf32>
@@ -38,9 +43,11 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, ImageQuery, Link
 // -----
 
 spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader, StorageImageWriteWithoutFormat, Linkage], []> {
-  spirv.func @image_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, %arg1 : vector<2xsi32>, %arg2 : vector<4xf32>) "None" {
+  spirv.func @image_read_write(%arg0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, %arg1 : vector<2xsi32>) "None" {
+    // CHECK: spirv.ImageRead {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32> -> vector<4xf32>
+    %0 = spirv.ImageRead %arg0, %arg1 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32> -> vector<4xf32>
     // CHECK: spirv.ImageWrite {{%.*}}, {{%.*}}, {{%.*}} : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32>, vector<4xf32>
-    spirv.ImageWrite %arg0, %arg1, %arg2 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32>, vector<4xf32>
+    spirv.ImageWrite %arg0, %arg1, %0 : !spirv.image<f32, Dim2D, NoDepth, NonArrayed, SingleSampled, NoSampler, Unknown>, vector<2xsi32>, vector<4xf32>
     spirv.Return
   }
 }

From b184672ec7f1433e5dc698cda7e61be8a6085aa6 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Fri, 13 Jun 2025 16:48:24 +0000
Subject: [PATCH 397/851] [libc] Implemented wmemmove (#142245)

Implemented wmemmove and added tests
---
 libc/config/linux/x86_64/entrypoints.txt |   1 +
 libc/include/wchar.yaml                  |   8 ++
 libc/src/wchar/CMakeLists.txt            |  12 +++
 libc/src/wchar/wmemmove.cpp              |  27 ++++++
 libc/src/wchar/wmemmove.h                |  22 +++++
 libc/test/src/wchar/CMakeLists.txt       |  10 ++
 libc/test/src/wchar/wmemmove_test.cpp    | 111 +++++++++++++++++++++++
 7 files changed, 191 insertions(+)
 create mode 100644 libc/src/wchar/wmemmove.cpp
 create mode 100644 libc/src/wchar/wmemmove.h
 create mode 100644 libc/test/src/wchar/wmemmove_test.cpp

diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index c993ef8303a59..aa2079faed409 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -367,6 +367,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.wchar.btowc
     libc.src.wchar.wcslen
     libc.src.wchar.wctob
+    libc.src.wchar.wmemmove
     libc.src.wchar.wmemset
     libc.src.wchar.wcschr
     libc.src.wchar.wcsncmp
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 57f4f6660827e..1af15a6c112b5 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -112,6 +112,14 @@ functions:
       - type: __restrict wchar_t *
       - type: const __restrict wchar_t *
       - type: size_t
+  - name: wmemmove
+    standards:
+      - stdc
+    return_type: wchar_t *
+    arguments:
+      - type: wchar_t *
+      - type: const wchar_t *
+      - type: size_t
   - name: wcsncpy
     standards:
       - stdc
diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt
index 4b8802ede5f5d..491dd5b34340a 100644
--- a/libc/src/wchar/CMakeLists.txt
+++ b/libc/src/wchar/CMakeLists.txt
@@ -131,6 +131,18 @@ add_entrypoint_object(
     libc.hdr.wchar_macros
 )
 
+add_entrypoint_object(
+  wmemmove
+  SRCS
+    wmemmove.cpp
+  HDRS
+    wmemmove.h
+  DEPENDS
+    libc.hdr.types.size_t
+    libc.hdr.wchar_macros
+    libc.src.__support.macros.null_check
+)
+
 add_entrypoint_object(
   wcsncpy
   SRCS
diff --git a/libc/src/wchar/wmemmove.cpp b/libc/src/wchar/wmemmove.cpp
new file mode 100644
index 0000000000000..3282077003bd7
--- /dev/null
+++ b/libc/src/wchar/wmemmove.cpp
@@ -0,0 +1,27 @@
+//===-- Implementation of wmemmove ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/wchar/wmemmove.h"
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/null_check.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+LLVM_LIBC_FUNCTION(wchar_t *, wmemmove,
+                   (wchar_t * dest, const wchar_t *src, size_t n)) {
+  LIBC_CRASH_ON_NULLPTR(dest);
+  LIBC_CRASH_ON_NULLPTR(src);
+
+  __builtin_memmove(dest, src, n * sizeof(wchar_t));
+  return dest;
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/wchar/wmemmove.h b/libc/src/wchar/wmemmove.h
new file mode 100644
index 0000000000000..b4c31ac7b397c
--- /dev/null
+++ b/libc/src/wchar/wmemmove.h
@@ -0,0 +1,22 @@
+//===-- Implementation header for wmemmove --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_WCHAR_WMEMMOVE_H
+#define LLVM_LIBC_SRC_WCHAR_WMEMMOVE_H
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+wchar_t *wmemmove(wchar_t *dest, const wchar_t *src, size_t n);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_WCHAR_WMEMMOVE_H
diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt
index 6293e8e3d55cf..4990b6953348b 100644
--- a/libc/test/src/wchar/CMakeLists.txt
+++ b/libc/test/src/wchar/CMakeLists.txt
@@ -145,6 +145,16 @@ add_libc_test(
     libc.src.wchar.wmemcpy
 )
 
+add_libc_test(
+  wmemmove_test
+  SUITE
+    libc_wchar_unittests
+  SRCS
+    wmemmove_test.cpp
+  DEPENDS
+    libc.src.wchar.wmemmove
+)
+
 add_libc_test(
   wcsncpy_test
   SUITE
diff --git a/libc/test/src/wchar/wmemmove_test.cpp b/libc/test/src/wchar/wmemmove_test.cpp
new file mode 100644
index 0000000000000..d23aa0f0b3af1
--- /dev/null
+++ b/libc/test/src/wchar/wmemmove_test.cpp
@@ -0,0 +1,111 @@
+//===-- Unittests for wmemmove --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wchar_t.h"
+#include "src/wchar/wmemmove.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcWMemmoveTest, MoveZeroByte) {
+  wchar_t buffer[] = {L'a', L'b', L'y', L'z'};
+
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(buffer, buffer + 2, 0);
+  EXPECT_EQ(ret, buffer);
+
+  const wchar_t expected[] = {L'a', L'b', L'y', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+}
+
+TEST(LlvmLibcWMemmoveTest, DstAndSrcPointToSameAddress) {
+  wchar_t buffer[] = {L'a', L'b'};
+
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(buffer, buffer, 1);
+  EXPECT_EQ(ret, buffer);
+
+  const wchar_t expected[] = {L'a', L'b'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+}
+
+TEST(LlvmLibcWMemmoveTest, DstStartsBeforeSrc) {
+  // Set boundary at beginning and end for not overstepping when
+  // copy forward or backward.
+  wchar_t buffer[] = {L'z', L'a', L'b', L'c', L'z'};
+
+  wchar_t *dst = buffer + 1;
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(dst, buffer + 2, 2);
+  EXPECT_EQ(ret, dst);
+
+  const wchar_t expected[] = {L'z', L'b', L'c', L'c', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+  EXPECT_TRUE(buffer[4] == expected[4]);
+}
+
+TEST(LlvmLibcWMemmoveTest, DstStartsAfterSrc) {
+  wchar_t buffer[] = {L'z', L'a', L'b', L'c', L'z'};
+
+  wchar_t *dst = buffer + 2;
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(dst, buffer + 1, 2);
+  EXPECT_EQ(ret, dst);
+
+  const wchar_t expected[] = {L'z', L'a', L'a', L'b', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+  EXPECT_TRUE(buffer[4] == expected[4]);
+}
+
+// e.g. `Dst` follow `src`.
+// str: [abcdefghij]
+//      [__src_____]
+//      [_____Dst__]
+TEST(LlvmLibcWMemmoveTest, SrcFollowDst) {
+  wchar_t buffer[] = {L'z', L'a', L'b', L'z'};
+
+  wchar_t *dst = buffer + 1;
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(dst, buffer + 2, 1);
+  EXPECT_EQ(ret, dst);
+
+  const char expected[] = {L'z', L'b', L'b', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+}
+
+TEST(LlvmLibcWMemmoveTest, DstFollowSrc) {
+  wchar_t buffer[] = {L'z', L'a', L'b', L'z'};
+
+  wchar_t *dst = buffer + 2;
+  wchar_t *ret = LIBC_NAMESPACE::wmemmove(dst, buffer + 1, 1);
+  EXPECT_EQ(ret, dst);
+
+  const char expected[] = {L'z', L'a', L'a', L'z'};
+  EXPECT_TRUE(buffer[0] == expected[0]);
+  EXPECT_TRUE(buffer[1] == expected[1]);
+  EXPECT_TRUE(buffer[2] == expected[2]);
+  EXPECT_TRUE(buffer[3] == expected[3]);
+}
+
+#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER)
+TEST(LlvmLibcWMemmoveTest, NullptrCrash) {
+  wchar_t buffer[] = {L'a', L'b'};
+  // Passing in a nullptr should crash the program.
+  EXPECT_DEATH([&buffer] { LIBC_NAMESPACE::wmemmove(buffer, nullptr, 2); },
+               WITH_SIGNAL(-1));
+  EXPECT_DEATH([&buffer] { LIBC_NAMESPACE::wmemmove(nullptr, buffer, 2); },
+               WITH_SIGNAL(-1));
+}
+#endif // LIBC_HAS_ADDRESS_SANITIZER

From c403cf1e38faa456fdd6f1301efabea3f36c3e6b Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 09:49:55 -0700
Subject: [PATCH 398/851] VE: Replace VEMCExpr::printImpl with
 printSpecifierExpr

Prepare for removing the VEMCExpr subclass.
VEMCExpr overrides evaluateAsRelocatableImpl, so it cannot be removed
yet.
---
 llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp | 3 ++-
 llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp   | 8 ++++++++
 llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h     | 2 ++
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp      | 8 --------
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h        | 1 -
 5 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp
index 77bd30e96f7b2..b78b86f70f39c 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp
@@ -12,6 +12,7 @@
 
 #include "VEInstPrinter.h"
 #include "VE.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -61,7 +62,7 @@ void VEInstPrinter::printOperand(const MCInst *MI, int OpNum,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void VEInstPrinter::printMemASXOperand(const MCInst *MI, int OpNum,
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
index f290804ae4497..fdde46f09d5b1 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
@@ -56,3 +56,11 @@ VEELFMCAsmInfo::VEELFMCAsmInfo(const Triple &TheTriple) {
 
   initializeVariantKinds(variantKindDescs);
 }
+
+void VEELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                        const MCSpecifierExpr &Expr) const {
+  printExpr(OS, *Expr.getSubExpr());
+  auto specifier = Expr.getSpecifier();
+  if (specifier && specifier != VEMCExpr::VK_REFLONG)
+    OS << '@' << getSpecifierName(specifier);
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
index 6557d68b383cd..444f422c7ec12 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
@@ -24,6 +24,8 @@ class VEELFMCAsmInfo : public MCAsmInfoELF {
 
 public:
   explicit VEELFMCAsmInfo(const Triple &TheTriple);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
index a7986ab9006dc..fa4d9b18a9ad9 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
@@ -27,14 +27,6 @@ const VEMCExpr *VEMCExpr::create(Specifier S, const MCExpr *Expr,
   return new (Ctx) VEMCExpr(Expr, S);
 }
 
-void VEMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-
-  const MCExpr *Expr = getSubExpr();
-  Expr->print(OS, MAI);
-  if (specifier != VK_None && specifier != VK_REFLONG)
-    OS << '@' << MAI->getSpecifierName(specifier);
-}
-
 VE::Fixups VEMCExpr::getFixupKind(MCSpecifierExpr::Spec S) {
   switch (S) {
   default:
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
index 80ea350a61661..4d191149d4aa0 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
@@ -50,7 +50,6 @@ class VEMCExpr : public MCSpecifierExpr {
   static const VEMCExpr *create(Specifier Kind, const MCExpr *Expr,
                                 MCContext &Ctx);
 
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
   bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAssembler *Asm) const override;
 

From 6e988bd33f5fa8a529ef9208d3e147945b7bb7ed Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 09:56:57 -0700
Subject: [PATCH 399/851] [mlir] Forward **kwargs through gentbl_shard_rule
 (#144001)

This allows clients to pass additional cc_library arguments through this
macro to the build rules it calls.
---
 utils/bazel/llvm-project-overlay/mlir/tblgen.bzl | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
index 16a7ecadeaffa..884d6f381b02d 100644
--- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
+++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
@@ -484,7 +484,8 @@ def gentbl_sharded_ops(
         test = False,
         includes = [],
         strip_include_prefix = None,
-        deps = []):
+        deps = [],
+        **kwargs):
     """Generate sharded op declarations and definitions.
 
     This special build rule shards op definitions in a TableGen file and generates multiple copies
@@ -524,6 +525,7 @@ def gentbl_sharded_ops(
         td_file = td_file,
         test = test,
         deps = deps,
+        **kwargs,
     )
     all_files = [hdr_out, src_out]
     for i in range(0, shard_count):
@@ -535,9 +537,14 @@ def gentbl_sharded_ops(
             out = out_file,
             sharder = sharder,
             src_file = src_file,
+            **kwargs,
         )
         all_files.append(out_file)
-    native.filegroup(name = name, srcs = all_files)
+    native.filegroup(
+        name = name,
+        srcs = all_files,
+        **kwargs,
+    )
 
 def gentbl_sharded_op_defs(name, source_file, shard_count):
     """Generates multiple copies of a source file that includes sharded op definitions.

From 2704b27a0b452f4aaf87ab26d315fdc92857373a Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Fri, 13 Jun 2025 10:02:41 -0700
Subject: [PATCH 400/851] [lldb] Include unistd.h for _exit  in
 multi-process-driver.cpp

This test fails to build on macOS without the correct header include.
---
 lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
index 5ad75e3c1e472..68d73f1dee011 100644
--- a/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
+++ b/lldb/test/API/api/multiple-debuggers/multi-process-driver.cpp
@@ -13,10 +13,11 @@
 // that are hit when lldb is being used to debug multiple processes
 // simultaneously.
 
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <inttypes.h>
+#include <unistd.h>
 
 #include "lldb/API/LLDB.h"
 #include "lldb/API/SBCommandInterpreter.h"

From 65d88d31ea279bbab8a0fa2c8abfb3f723a1715b Mon Sep 17 00:00:00 2001
From: Keith Smiley <keithbsmiley@gmail.com>
Date: Fri, 13 Jun 2025 10:04:45 -0700
Subject: [PATCH 401/851] Revert "[llvm-cov] Add support for baseline coverage"
 (#144121)

Reverts llvm/llvm-project#117910

```
/home/buildbots/llvm-external-buildbots/workers/ppc64le-lld-multistage-test/ppc64le-lld-multistage-test/llvm-project/llvm/unittests/ProfileData/CoverageMappingTest.cpp
/home/buildbots/llvm-external-buildbots/workers/ppc64le-lld-multistage-test/ppc64le-lld-multistage-test/llvm-project/llvm/unittests/ProfileData/CoverageMappingTest.cpp:281:28: error: 'std::reference_wrapper' may not intend to support class template argument deduction [-Werror,-Wctad-maybe-unsupported]
  281 |         std::make_optional(std::reference_wrapper(*ProfileReader));
      |                            ^
/usr/lib/gcc/ppc64le-redhat-linux/8/../../../../include/c++/8/bits/refwrap.h:289:11: note: add a deduction guide to suppress this warning
  289 |     class reference_wrapper
      |           ^
```
---
 llvm/docs/CommandGuide/llvm-cov.rst           |  15 ---
 .../ProfileData/Coverage/CoverageMapping.h    |  24 ++--
 .../ProfileData/Coverage/CoverageMapping.cpp  | 123 +++++++-----------
 ...showLineExecutionCounts-lcov-baseline.test |  37 ------
 llvm/tools/llvm-cov/CodeCoverage.cpp          |  78 ++++-------
 .../ProfileData/CoverageMappingTest.cpp       |   4 +-
 6 files changed, 86 insertions(+), 195 deletions(-)
 delete mode 100644 llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test

diff --git a/llvm/docs/CommandGuide/llvm-cov.rst b/llvm/docs/CommandGuide/llvm-cov.rst
index f4db60cf06fa7..968f3c452f558 100644
--- a/llvm/docs/CommandGuide/llvm-cov.rst
+++ b/llvm/docs/CommandGuide/llvm-cov.rst
@@ -380,11 +380,6 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
-.. option:: -empty-profile
-
- Display the baseline coverage of the binaries with all zero execution counts.
- Mutually exclusive with -instr-profile.
-
 .. program:: llvm-cov report
 
 .. _llvm-cov-report:
@@ -475,11 +470,6 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
-.. option:: -empty-profile
-
- Display the baseline coverage of the binaries with all zero execution counts.
- Mutually exclusive with -instr-profile.
-
 .. program:: llvm-cov export
 
 .. _llvm-cov-export:
@@ -572,11 +562,6 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
-.. option:: -empty-profile
-
- Export the baseline coverage of the binaries with all zero execution counts.
- Mutually exclusive with -instr-profile.
-
 CONVERT-FOR-TESTING COMMAND
 ---------------------------
 
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index d1230b0ba7c58..e62ce5e3d8fa6 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -991,23 +991,18 @@ class CoverageMapping {
   // Load coverage records from readers.
   static Error loadFromReaders(
       ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-      std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-          &ProfileReader,
-      CoverageMapping &Coverage);
+      IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage);
 
   // Load coverage records from file.
   static Error
   loadFromFile(StringRef Filename, StringRef Arch, StringRef CompilationDir,
-               std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-                   &ProfileReader,
-               CoverageMapping &Coverage, bool &DataFound,
+               IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
+               bool &DataFound,
                SmallVectorImpl<object::BuildID> *FoundBinaryIDs = nullptr);
 
   /// Add a function record corresponding to \p Record.
-  Error loadFunctionRecord(
-      const CoverageMappingRecord &Record,
-      const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-          &ProfileReader);
+  Error loadFunctionRecord(const CoverageMappingRecord &Record,
+                           IndexedInstrProfReader &ProfileReader);
 
   /// Look up the indices for function records which are at least partially
   /// defined in the specified file. This is guaranteed to return a superset of
@@ -1023,16 +1018,15 @@ class CoverageMapping {
   /// Load the coverage mapping using the given readers.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
   load(ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-       std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-           &ProfileReader);
+       IndexedInstrProfReader &ProfileReader);
 
   /// Load the coverage mapping from the given object files and profile. If
   /// \p Arches is non-empty, it must specify an architecture for each object.
   /// Ignores non-instrumented object files unless all are not instrumented.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
-  load(ArrayRef<StringRef> ObjectFilenames,
-       std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
-       ArrayRef<StringRef> Arches = {}, StringRef CompilationDir = "",
+  load(ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
+       vfs::FileSystem &FS, ArrayRef<StringRef> Arches = {},
+       StringRef CompilationDir = "",
        const object::BuildIDFetcher *BIDFetcher = nullptr,
        bool CheckBinaryIDs = false);
 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 429ec5c19f1f8..dd74eb054a34c 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -823,8 +823,7 @@ class MCDCDecisionRecorder {
 
 Error CoverageMapping::loadFunctionRecord(
     const CoverageMappingRecord &Record,
-    const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-        &ProfileReader) {
+    IndexedInstrProfReader &ProfileReader) {
   StringRef OrigFuncName = Record.FunctionName;
   if (OrigFuncName.empty())
     return make_error<CoverageMapError>(coveragemap_error::malformed,
@@ -838,44 +837,35 @@ Error CoverageMapping::loadFunctionRecord(
   CounterMappingContext Ctx(Record.Expressions);
 
   std::vector<uint64_t> Counts;
-  if (ProfileReader) {
-    if (Error E = ProfileReader.value().get().getFunctionCounts(
-            Record.FunctionName, Record.FunctionHash, Counts)) {
-      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-      if (IPE == instrprof_error::hash_mismatch) {
-        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                        Record.FunctionHash);
-        return Error::success();
-      }
-      if (IPE != instrprof_error::unknown_function)
-        return make_error<InstrProfError>(IPE);
-      Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
+  if (Error E = ProfileReader.getFunctionCounts(Record.FunctionName,
+                                                Record.FunctionHash, Counts)) {
+    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+    if (IPE == instrprof_error::hash_mismatch) {
+      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                      Record.FunctionHash);
+      return Error::success();
     }
-  } else {
+    if (IPE != instrprof_error::unknown_function)
+      return make_error<InstrProfError>(IPE);
     Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
   }
   Ctx.setCounts(Counts);
 
   bool IsVersion11 =
-      ProfileReader && ProfileReader.value().get().getVersion() <
-                           IndexedInstrProf::ProfVersion::Version12;
+      ProfileReader.getVersion() < IndexedInstrProf::ProfVersion::Version12;
 
   BitVector Bitmap;
-  if (ProfileReader) {
-    if (Error E = ProfileReader.value().get().getFunctionBitmap(
-            Record.FunctionName, Record.FunctionHash, Bitmap)) {
-      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-      if (IPE == instrprof_error::hash_mismatch) {
-        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                        Record.FunctionHash);
-        return Error::success();
-      }
-      if (IPE != instrprof_error::unknown_function)
-        return make_error<InstrProfError>(IPE);
-      Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
+  if (Error E = ProfileReader.getFunctionBitmap(Record.FunctionName,
+                                                Record.FunctionHash, Bitmap)) {
+    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+    if (IPE == instrprof_error::hash_mismatch) {
+      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                      Record.FunctionHash);
+      return Error::success();
     }
-  } else {
-    Bitmap = BitVector(getMaxBitmapSize(Record, false));
+    if (IPE != instrprof_error::unknown_function)
+      return make_error<InstrProfError>(IPE);
+    Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
   }
   Ctx.setBitmap(std::move(Bitmap));
 
@@ -969,14 +959,10 @@ Error CoverageMapping::loadFunctionRecord(
 // of CoverageMappingReader instances.
 Error CoverageMapping::loadFromReaders(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-        &ProfileReader,
-    CoverageMapping &Coverage) {
-  assert(!Coverage.SingleByteCoverage || !ProfileReader ||
-         *Coverage.SingleByteCoverage ==
-             ProfileReader.value().get().hasSingleByteCoverage());
-  Coverage.SingleByteCoverage =
-      !ProfileReader || ProfileReader.value().get().hasSingleByteCoverage();
+    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage) {
+  assert(!Coverage.SingleByteCoverage ||
+         *Coverage.SingleByteCoverage == ProfileReader.hasSingleByteCoverage());
+  Coverage.SingleByteCoverage = ProfileReader.hasSingleByteCoverage();
   for (const auto &CoverageReader : CoverageReaders) {
     for (auto RecordOrErr : *CoverageReader) {
       if (Error E = RecordOrErr.takeError())
@@ -991,8 +977,7 @@ Error CoverageMapping::loadFromReaders(
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-        &ProfileReader) {
+    IndexedInstrProfReader &ProfileReader) {
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   if (Error E = loadFromReaders(CoverageReaders, ProfileReader, *Coverage))
     return std::move(E);
@@ -1001,19 +986,18 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
 // If E is a no_data_found error, returns success. Otherwise returns E.
 static Error handleMaybeNoDataFoundError(Error E) {
-  return handleErrors(std::move(E), [](const CoverageMapError &CME) {
-    if (CME.get() == coveragemap_error::no_data_found)
-      return static_cast<Error>(Error::success());
-    return make_error<CoverageMapError>(CME.get(), CME.getMessage());
-  });
+  return handleErrors(
+      std::move(E), [](const CoverageMapError &CME) {
+        if (CME.get() == coveragemap_error::no_data_found)
+          return static_cast<Error>(Error::success());
+        return make_error<CoverageMapError>(CME.get(), CME.getMessage());
+      });
 }
 
 Error CoverageMapping::loadFromFile(
     StringRef Filename, StringRef Arch, StringRef CompilationDir,
-    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
-        &ProfileReader,
-    CoverageMapping &Coverage, bool &DataFound,
-    SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
+    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
+    bool &DataFound, SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
   auto CovMappingBufOrErr = MemoryBuffer::getFileOrSTDIN(
       Filename, /*IsText=*/false, /*RequiresNullTerminator=*/false);
   if (std::error_code EC = CovMappingBufOrErr.getError())
@@ -1049,23 +1033,13 @@ Error CoverageMapping::loadFromFile(
 }
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
-    ArrayRef<StringRef> ObjectFilenames,
-    std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
-    ArrayRef<StringRef> Arches, StringRef CompilationDir,
+    ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
+    vfs::FileSystem &FS, ArrayRef<StringRef> Arches, StringRef CompilationDir,
     const object::BuildIDFetcher *BIDFetcher, bool CheckBinaryIDs) {
-  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
-  if (ProfileFilename) {
-    auto ProfileReaderOrErr =
-        IndexedInstrProfReader::create(ProfileFilename.value(), FS);
-    if (Error E = ProfileReaderOrErr.takeError())
-      return createFileError(ProfileFilename.value(), std::move(E));
-    ProfileReader = std::move(ProfileReaderOrErr.get());
-  }
-  auto ProfileReaderRef =
-      ProfileReader
-          ? std::optional<std::reference_wrapper<IndexedInstrProfReader>>(
-                *ProfileReader)
-          : std::nullopt;
+  auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename, FS);
+  if (Error E = ProfileReaderOrErr.takeError())
+    return createFileError(ProfileFilename, std::move(E));
+  auto ProfileReader = std::move(ProfileReaderOrErr.get());
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   bool DataFound = false;
 
@@ -1079,17 +1053,16 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
   SmallVector<object::BuildID> FoundBinaryIDs;
   for (const auto &File : llvm::enumerate(ObjectFilenames)) {
-    if (Error E = loadFromFile(File.value(), GetArch(File.index()),
-                               CompilationDir, ProfileReaderRef, *Coverage,
-                               DataFound, &FoundBinaryIDs))
+    if (Error E =
+            loadFromFile(File.value(), GetArch(File.index()), CompilationDir,
+                         *ProfileReader, *Coverage, DataFound, &FoundBinaryIDs))
       return std::move(E);
   }
 
   if (BIDFetcher) {
     std::vector<object::BuildID> ProfileBinaryIDs;
-    if (ProfileReader)
-      if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
-        return createFileError(ProfileFilename.value(), std::move(E));
+    if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
+      return createFileError(ProfileFilename, std::move(E));
 
     SmallVector<object::BuildIDRef> BinaryIDsToFetch;
     if (!ProfileBinaryIDs.empty()) {
@@ -1109,12 +1082,12 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
       if (PathOpt) {
         std::string Path = std::move(*PathOpt);
         StringRef Arch = Arches.size() == 1 ? Arches.front() : StringRef();
-        if (Error E = loadFromFile(Path, Arch, CompilationDir, ProfileReaderRef,
-                                   *Coverage, DataFound))
+        if (Error E = loadFromFile(Path, Arch, CompilationDir, *ProfileReader,
+                                  *Coverage, DataFound))
           return std::move(E);
       } else if (CheckBinaryIDs) {
         return createFileError(
-            ProfileFilename.value(),
+            ProfileFilename,
             createStringError(errc::no_such_file_or_directory,
                               "Missing binary ID: " +
                                   llvm::toHex(BinaryID, /*LowerCase=*/true)));
diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
deleted file mode 100644
index bce886bdf510b..0000000000000
--- a/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
+++ /dev/null
@@ -1,37 +0,0 @@
-// FULL: SF:{{.*}}showLineExecutionCounts.cpp
-// FULL: FN:6,main
-// FULL: FNDA:0,main
-// FULL: FNF:1
-// FULL: FNH:0
-int main() {                              // FULL: DA:[[@LINE]],0
-  int x = 0;                              // FULL: DA:[[@LINE]],0
-                                          // FULL: DA:[[@LINE]],0
-  if (x) {                                // FULL: DA:[[@LINE]],0
-    x = 0;                                // FULL: DA:[[@LINE]],0
-  } else {                                // FULL: DA:[[@LINE]],0
-    x = 1;                                // FULL: DA:[[@LINE]],0
-  }                                       // FULL: DA:[[@LINE]],0
-                                          // FULL: DA:[[@LINE]],0
-  for (int i = 0; i < 100; ++i) {         // FULL: DA:[[@LINE]],0
-    x = 1;                                // FULL: DA:[[@LINE]],0
-  }                                       // FULL: DA:[[@LINE]],0
-                                          // FULL: DA:[[@LINE]],0
-  x = x < 10 ? x + 1 : x - 1;             // FULL: DA:[[@LINE]],0
-  x = x > 10 ?                            // FULL: DA:[[@LINE]],0
-        x - 1:                            // FULL: DA:[[@LINE]],0
-        x + 1;                            // FULL: DA:[[@LINE]],0
-                                          // FULL: DA:[[@LINE]],0
-  return 0;                               // FULL: DA:[[@LINE]],0
-}                                         // FULL: DA:[[@LINE]],0
-// FULL: LF:20
-// FULL: LH:0
-// FULL: end_of_record
-// RUN: llvm-cov export -format=lcov %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=FULL %s
-
-// RUN: llvm-cov export -format=lcov -summary-only %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=SUMMARYONLY %s
-// SUMMARYONLY: SF:{{.*}}showLineExecutionCounts.cpp
-// SUMMARYONLY: FNF:1
-// SUMMARYONLY: FNH:0
-// SUMMARYONLY: LF:20
-// SUMMARYONLY: LH:0
-// SUMMARYONLY: end_of_record
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 6c66858c4de8c..1f2484cd4dda9 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -153,7 +153,7 @@ class CodeCoverageTool {
   bool HadSourceFiles = false;
 
   /// The path to the indexed profile.
-  std::optional<std::string> PGOFilename;
+  std::string PGOFilename;
 
   /// A list of input source files.
   std::vector<std::string> SourceFiles;
@@ -455,12 +455,10 @@ static bool modifiedTimeGT(StringRef LHS, StringRef RHS) {
 }
 
 std::unique_ptr<CoverageMapping> CodeCoverageTool::load() {
-  if (PGOFilename) {
-    for (StringRef ObjectFilename : ObjectFilenames)
-      if (modifiedTimeGT(ObjectFilename, PGOFilename.value()))
-        warning("profile data may be out of date - object is newer",
-                ObjectFilename);
-  }
+  for (StringRef ObjectFilename : ObjectFilenames)
+    if (modifiedTimeGT(ObjectFilename, PGOFilename))
+      warning("profile data may be out of date - object is newer",
+              ObjectFilename);
   auto FS = vfs::getRealFileSystem();
   auto CoverageOrErr = CoverageMapping::load(
       ObjectFilenames, PGOFilename, *FS, CoverageArches,
@@ -670,16 +668,11 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       "dump-collected-paths", cl::Optional, cl::Hidden,
       cl::desc("Show the collected paths to source files"));
 
-  cl::opt<std::string> PGOFilename(
-      "instr-profile", cl::Optional,
+  cl::opt<std::string, true> PGOFilename(
+      "instr-profile", cl::Required, cl::location(this->PGOFilename),
       cl::desc(
           "File with the profile data obtained after an instrumented run"));
 
-  cl::opt<bool> EmptyProfile(
-      "empty-profile", cl::Optional,
-      cl::desc("Use a synthetic profile with no data to generate "
-               "baseline coverage"));
-
   cl::list<std::string> Arches(
       "arch", cl::desc("architectures of the coverage mapping binaries"));
 
@@ -812,15 +805,6 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
     }
     this->CheckBinaryIDs = CheckBinaryIDs;
 
-    if (!PGOFilename.empty() == EmptyProfile) {
-      error(
-          "exactly one of -instr-profile and -empty-profile must be specified");
-      return 1;
-    }
-    if (!PGOFilename.empty()) {
-      this->PGOFilename = std::make_optional(PGOFilename.getValue());
-    }
-
     if (!CovFilename.empty())
       ObjectFilenames.emplace_back(CovFilename);
     for (const std::string &Filename : CovFilenames)
@@ -1132,22 +1116,20 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
     }
   }
 
-  if (PGOFilename) {
-    sys::fs::file_status Status;
-    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
-      error("could not read profile data!" + EC.message(), PGOFilename.value());
-      return 1;
-    }
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
+    error("could not read profile data!" + EC.message(), PGOFilename);
+    return 1;
+  }
 
-    if (ShowCreatedTime) {
-      auto ModifiedTime = Status.getLastModificationTime();
-      std::string ModifiedTimeStr = to_string(ModifiedTime);
-      size_t found = ModifiedTimeStr.rfind(':');
-      ViewOpts.CreatedTimeStr =
-          (found != std::string::npos)
-              ? "Created: " + ModifiedTimeStr.substr(0, found)
-              : "Created: " + ModifiedTimeStr;
-    }
+  if (ShowCreatedTime) {
+    auto ModifiedTime = Status.getLastModificationTime();
+    std::string ModifiedTimeStr = to_string(ModifiedTime);
+    size_t found = ModifiedTimeStr.rfind(':');
+    ViewOpts.CreatedTimeStr =
+        (found != std::string::npos)
+            ? "Created: " + ModifiedTimeStr.substr(0, found)
+            : "Created: " + ModifiedTimeStr;
   }
 
   auto Coverage = load();
@@ -1256,12 +1238,10 @@ int CodeCoverageTool::doReport(int argc, const char **argv,
     return 1;
   }
 
-  if (PGOFilename) {
-    sys::fs::file_status Status;
-    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
-      error("could not read profile data!" + EC.message(), PGOFilename.value());
-      return 1;
-    }
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
+    error("could not read profile data!" + EC.message(), PGOFilename);
+    return 1;
   }
 
   auto Coverage = load();
@@ -1323,12 +1303,10 @@ int CodeCoverageTool::doExport(int argc, const char **argv,
     return 1;
   }
 
-  if (PGOFilename) {
-    sys::fs::file_status Status;
-    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
-      error("could not read profile data!" + EC.message(), PGOFilename.value());
-      return 1;
-    }
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
+    error("could not read profile data!" + EC.message(), PGOFilename);
+    return 1;
   }
 
   auto Coverage = load();
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index c0e99cf80b944..46f881ecddb5f 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -277,9 +277,7 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
       CoverageReaders.push_back(
           std::make_unique<CoverageMappingReaderMock>(Funcs));
     }
-    auto ProfileReaderRef =
-        std::make_optional(std::reference_wrapper(*ProfileReader));
-    return CoverageMapping::load(CoverageReaders, ProfileReaderRef);
+    return CoverageMapping::load(CoverageReaders, *ProfileReader);
   }
 
   Error loadCoverageMapping(bool EmitFilenames = true) {

From 9e23e85d6597bd59ff316a3ce93bb8ec41919b19 Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Sat, 14 Jun 2025 02:10:56 +0900
Subject: [PATCH 402/851] [LLD][Cygwin] Implement --dll-search-prefix (#143263)

GCC on Cygwin environment invokes linker with passing
`--dll-search-prefix=cyg`.
Implementing this option makes lld-mingw invokable by `gcc -fuse-ld=lld`.

---------

Co-authored-by: jeremyd2019 <github@jdrake.com>
---
 lld/MinGW/Driver.cpp    | 14 ++++++++++----
 lld/MinGW/Options.td    |  2 ++
 lld/test/MinGW/lib.test |  7 +++++++
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp
index 8996293fdfa1b..98d48bdfcf311 100644
--- a/lld/MinGW/Driver.cpp
+++ b/lld/MinGW/Driver.cpp
@@ -138,8 +138,9 @@ static std::optional<std::string> findFile(StringRef path1,
 }
 
 // This is for -lfoo. We'll look for libfoo.dll.a or libfoo.a from search paths.
-static std::string
-searchLibrary(StringRef name, ArrayRef<StringRef> searchPaths, bool bStatic) {
+static std::string searchLibrary(StringRef name,
+                                 ArrayRef<StringRef> searchPaths, bool bStatic,
+                                 StringRef prefix) {
   if (name.starts_with(":")) {
     for (StringRef dir : searchPaths)
       if (std::optional<std::string> s = findFile(dir, name.substr(1)))
@@ -160,7 +161,7 @@ searchLibrary(StringRef name, ArrayRef<StringRef> searchPaths, bool bStatic) {
     if (std::optional<std::string> s = findFile(dir, name + ".lib"))
       return *s;
     if (!bStatic) {
-      if (std::optional<std::string> s = findFile(dir, "lib" + name + ".dll"))
+      if (std::optional<std::string> s = findFile(dir, prefix + name + ".dll"))
         return *s;
       if (std::optional<std::string> s = findFile(dir, name + ".dll"))
         return *s;
@@ -554,6 +555,10 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     add("-libpath:" + StringRef(a->getValue()));
   }
 
+  StringRef dllPrefix = "lib";
+  if (auto *arg = args.getLastArg(OPT_dll_search_prefix))
+    dllPrefix = arg->getValue();
+
   StringRef prefix = "";
   bool isStatic = false;
   for (auto *a : args) {
@@ -565,7 +570,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
         add(prefix + StringRef(a->getValue()));
       break;
     case OPT_l:
-      add(prefix + searchLibrary(a->getValue(), searchPaths, isStatic));
+      add(prefix +
+          searchLibrary(a->getValue(), searchPaths, isStatic, dllPrefix));
       break;
     case OPT_whole_archive:
       prefix = "-wholearchive:";
diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td
index 01b01972112a1..e6cf48e685b74 100644
--- a/lld/MinGW/Options.td
+++ b/lld/MinGW/Options.td
@@ -79,6 +79,8 @@ defm exclude_symbols: Eq<"exclude-symbols",
     "Exclude symbols from automatic export">, MetaVarName<"<symbol[,symbol,...]>">;
 def export_all_symbols: F<"export-all-symbols">,
     HelpText<"Export all symbols even if a def file or dllexport attributes are used">;
+defm dll_search_prefix:Eq<"dll-search-prefix", "Specify DLL prefix instead of 'lib'">,
+    MetaVarName<"<dll_search_prefix>">;
 defm fatal_warnings: B<"fatal-warnings",
     "Treat warnings as errors",
     "Do not treat warnings as errors (default)">;
diff --git a/lld/test/MinGW/lib.test b/lld/test/MinGW/lib.test
index 8bd8a0e9304da..ac002e7549b43 100644
--- a/lld/test/MinGW/lib.test
+++ b/lld/test/MinGW/lib.test
@@ -5,6 +5,7 @@ LIB1: unable to find library -lfoo
 
 RUN: echo > %t/lib/libfoo.dll.a
 RUN: ld.lld -### -m i386pep -lfoo -L%t/lib 2>&1 | FileCheck -check-prefix=LIB2 %s
+RUN: ld.lld -### -m i386pep -lfoo --dll-search-prefix=cyg -L%t/lib 2>&1 | FileCheck -check-prefix=LIB2 %s
 LIB2: libfoo.dll.a
 
 RUN: not ld.lld -### -m i386pep -l:barefilename -L%t/lib 2>&1 | FileCheck -check-prefix=LIB-LITERAL-FAIL %s
@@ -22,6 +23,7 @@ LIB3: unable to find library -lfoo
 
 RUN: echo > %t/lib/libfoo.a
 RUN: ld.lld -### -m i386pep -Bstatic -lfoo -L%t/lib 2>&1 | FileCheck -check-prefix=LIB4 %s
+RUN: ld.lld -### -m i386pep -Bstatic -lfoo --dll-search-prefix=cyg -L%t/lib 2>&1 | FileCheck -check-prefix=LIB4 %s
 LIB4: libfoo.a
 
 RUN: echo > %t/lib/libbar.dll.a
@@ -46,12 +48,17 @@ MSVCSTYLE: msvcstyle.lib
 
 RUN: echo > %t/lib/libnoimplib.dll
 RUN: echo > %t/lib/noprefix_noimplib.dll
+RUN: echo > %t/lib/cygnoimplib2.dll
 RUN: ld.lld -### -m i386pep -L%t/lib -lnoimplib 2>&1 | FileCheck -check-prefix=DLL1 %s
 RUN: ld.lld -### -m i386pep -L%t/lib -lnoprefix_noimplib 2>&1 | FileCheck -check-prefix=DLL2 %s
+RUN: ld.lld -### -m i386pep -L%t/lib -lnoimplib2 --dll-search-prefix=cyg 2>&1 | FileCheck -check-prefix=DLL3 %s
 DLL1: libnoimplib.dll
 DLL2: noprefix_noimplib.dll
+DLL3: cygnoimplib2.dll
 
 RUN: not ld.lld -### -m i386pep -L%t/lib -static -lnoimplib 2>&1 | FileCheck -check-prefix=ERROR-NOIMPLIB %s
 RUN: not ld.lld -### -m i386pep -L%t/lib -static -lnoprefix_noimplib 2>&1 | FileCheck -check-prefix=ERROR-NOPREFIX-NOIMPLIB %s
+RUN: not ld.lld -### -m i386pep -L%t/lib -static -lnoimplib2 --dll-search-prefix=cyg 2>&1 | FileCheck -check-prefix=ERROR-CYG-NOIMPLIB %s
 ERROR-NOIMPLIB: unable to find library -lnoimplib
 ERROR-NOPREFIX-NOIMPLIB: unable to find library -lnoprefix_noimplib
+ERROR-CYG-NOIMPLIB: unable to find library -lnoimplib2

From 1072196c2737fcf921ad52e9a44c13423789111b Mon Sep 17 00:00:00 2001
From: Tai Ly <tai.ly@arm.com>
Date: Fri, 13 Jun 2025 12:12:25 -0500
Subject: [PATCH 403/851] [tosa] Add duplicate indices check for Scatter
 (#143736)

Tosa scatter operator disallow duplicate indices (per batch)
This patch adds, to the validation pass, checking for duplicate values
in scatter operator's constant indices values.

Signed-off-by: Tai Ly <tai.ly@arm.com>
---
 .../mlir/Dialect/Tosa/Utils/ConversionUtils.h |  5 ++++
 .../Tosa/Transforms/TosaValidation.cpp        | 28 ++++++++++++++++++-
 .../Dialect/Tosa/Utils/ConversionUtils.cpp    | 27 ++++++++++++++++++
 mlir/test/Dialect/Tosa/invalid.mlir           | 10 +++++++
 4 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
index 096510a09e324..6f3b0916a7a60 100644
--- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
+++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
@@ -243,6 +243,11 @@ bool getConstShapeValues(Operation *op,
 // returns a small vector of int64_t values that attr contains
 SmallVector<int64_t> convertFromIntAttr(const DenseElementsAttr &attr,
                                         const int rank);
+
+// returns true iff constant indices for scatter op contains unique indices
+// per batch
+bool hasUniqueConstantScatterIndices(ShapedType indicesType,
+                                     DenseIntElementsAttr indicesAttr);
 } // namespace tosa
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index d33fc902de3a1..229f42d3178b5 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -1244,10 +1244,36 @@ bool checkErrorIfCondIf(Operation *op) {
   return true;
 }
 
+bool checkErrorIfScatter(Operation *op) {
+  auto scatterOp = dyn_cast<tosa::ScatterOp>(op);
+  if (!scatterOp)
+    return true;
+
+  // for constant indices, check that there are no duplicate values
+  DenseIntElementsAttr indicesAttr;
+  if (!matchPattern(scatterOp.getIndices(), m_Constant(&indicesAttr)))
+    return true;
+
+  auto const indicesType =
+      dyn_cast<ShapedType>(scatterOp.getIndices().getType());
+  if (!indicesType || !indicesType.hasRank()) {
+    op->emitOpError("expect ranked indices tensor");
+    return false;
+  }
+
+  if (!hasUniqueConstantScatterIndices(indicesType, indicesAttr)) {
+    op->emitOpError("indices values contain duplicates");
+    return false;
+  }
+
+  return true;
+}
+
 LogicalResult TosaValidation::applyErrorIfCheck(Operation *op) {
   if (!checkErrorIfResize(op) || !checkErrorIfMul(op) ||
       !checkErrorIfTable(op) || !checkErrorIfRescale(op) ||
-      !checkErrorIfPad(op) || !checkErrorIfCondIf(op))
+      !checkErrorIfPad(op) || !checkErrorIfCondIf(op) ||
+      !checkErrorIfScatter(op))
     return failure();
   return success();
 }
diff --git a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
index e1b3be74b50fd..9844abcc34cb1 100644
--- a/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
+++ b/mlir/lib/Dialect/Tosa/Utils/ConversionUtils.cpp
@@ -213,3 +213,30 @@ mlir::tosa::convertFromIntAttr(const DenseElementsAttr &attr, const int rank) {
   }
   return {};
 }
+
+bool mlir::tosa::hasUniqueConstantScatterIndices(
+    ShapedType indicesType, DenseIntElementsAttr indicesAttr) {
+  llvm::ArrayRef<int64_t> const indicesShape = indicesType.getShape();
+  const unsigned int indicesRank = indicesShape.size();
+  const unsigned int lastDimSize = indicesShape[indicesRank - 1];
+
+  // check each batch of indices from the flat indicesAttr values
+  // for duplicates
+  auto const indicesValues = indicesAttr.getValues<int32_t>();
+  assert(
+      (indicesValues.size() % lastDimSize == 0) &&
+      "Constant indices data length should be a multiple of indicesShape[-1]");
+
+  std::vector<uint64_t> indices(lastDimSize);
+  for (auto beg = indicesValues.begin(); beg < indicesValues.end();
+       beg += lastDimSize) {
+    std::copy(beg, beg + lastDimSize, indices.begin());
+    std::sort(indices.begin(), indices.end());
+    if (std::adjacent_find(indices.begin(), indices.end()) != indices.end()) {
+      // found duplicate values in indices in batch
+      return false;
+    }
+  }
+
+  return true;
+}
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index a4617fc6fba8b..805522799a6d8 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -2015,3 +2015,13 @@ func.func @test_rescale_output_unsigned(%arg0: tensor<1x1xi8>) -> (tensor<1x1xui
   %r = tosa.rescale %arg0, %1, %0, %3, %2 {input_unsigned = false, output_unsigned = true, per_channel = false, rounding_mode = "SINGLE_ROUND", scale32 = true} : (tensor<1x1xi8>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x1xui8>
   return %r : tensor<1x1xui8>
 }
+
+// -----
+
+// CHECK-LABEL: test_scatter_duplicate_indices
+func.func @test_scatter_duplicate_indices(%arg0: tensor<2x52x3xf32>, %arg2: tensor<2x12x3xf32>) -> tensor<2x52x3xf32> {
+  %indices = "tosa.const"() { values = dense<[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], [1, 2, 3, 4, 5, 6, 7, 8, 9, 3, 11, 12]]> : tensor<2x12xi32> } : () -> tensor<2x12xi32>
+  // expected-error@+1 {{'tosa.scatter' op indices values contain duplicates}}
+  %0 = tosa.scatter %arg0, %indices, %arg2 : (tensor<2x52x3xf32>, tensor<2x12xi32>, tensor<2x12x3xf32>) -> tensor<2x52x3xf32>
+  return %0 : tensor<2x52x3xf32>
+}

From b81d5e06c7cba8c9f1f5380daed4b9ee139214ba Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Fri, 13 Jun 2025 18:25:07 +0100
Subject: [PATCH 404/851] [InstCombine] Fold shuffles through all trivially
 vectorizable intrinsics (#141979)

This addresses a TODO in foldShuffledIntrinsicOperands to use
isTriviallyVectorizable instead of a hardcoded list of intrinsics, which
in turn allows more intriniscs to be scalarized by VectorCombine.

From what I can tell every intrinsic here should be speculatable so an
assertion was added.

Because this enables intrinsics like abs which have a scalar operand, we
need to also check isVectorIntrinsicWithScalarOpAtArg.
---
 .../InstCombine/InstCombineCalls.cpp          | 52 +++++++++++--------
 llvm/test/Transforms/InstCombine/abs-1.ll     | 11 ++++
 llvm/test/Transforms/InstCombine/fma.ll       | 13 +++++
 .../InstCombine/minmax-intrinsics.ll          | 15 ++++++
 llvm/test/Transforms/InstCombine/powi.ll      | 26 ++++++++++
 llvm/test/Transforms/InstCombine/scmp.ll      | 13 +++++
 llvm/test/Transforms/InstCombine/sqrt.ll      | 11 ++++
 .../AMDGPU/add_sub_sat-inseltpoison.ll        | 30 ++++-------
 .../SLPVectorizer/AMDGPU/add_sub_sat.ll       | 30 ++++-------
 .../X86/alternate-calls-inseltpoison.ll       | 36 ++++++-------
 .../SLPVectorizer/X86/alternate-calls.ll      | 36 ++++++-------
 11 files changed, 175 insertions(+), 98 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c169ab25b2106..8c8cc0859e4af 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1400,42 +1400,46 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
 /// try to shuffle after the intrinsic.
 Instruction *
 InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
-  // TODO: This should be extended to handle other intrinsics like fshl, ctpop,
-  //       etc. Use llvm::isTriviallyVectorizable() and related to determine
-  //       which intrinsics are safe to shuffle?
-  switch (II->getIntrinsicID()) {
-  case Intrinsic::smax:
-  case Intrinsic::smin:
-  case Intrinsic::umax:
-  case Intrinsic::umin:
-  case Intrinsic::fma:
-  case Intrinsic::fshl:
-  case Intrinsic::fshr:
-    break;
-  default:
+  if (!isTriviallyVectorizable(II->getIntrinsicID()) ||
+      !II->getCalledFunction()->isSpeculatable())
+    return nullptr;
+
+  // fabs is canonicalized to fabs (shuffle ...) in foldShuffleOfUnaryOps, so
+  // avoid undoing it.
+  if (match(II, m_FAbs(m_Value())))
     return nullptr;
-  }
 
   Value *X;
   Constant *C;
   ArrayRef<int> Mask;
-  auto *NonConstArg = find_if_not(II->args(), IsaPred<Constant>);
+  auto *NonConstArg = find_if_not(II->args(), [&II](Use &Arg) {
+    return isa<Constant>(Arg.get()) ||
+           isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
+                                              Arg.getOperandNo(), nullptr);
+  });
   if (!NonConstArg ||
       !match(NonConstArg, m_Shuffle(m_Value(X), m_Poison(), m_Mask(Mask))))
     return nullptr;
 
-  // At least 1 operand must have 1 use because we are creating 2 instructions.
-  if (none_of(II->args(), [](Value *V) { return V->hasOneUse(); }))
+  // At least 1 operand must be a shuffle with 1 use because we are creating 2
+  // instructions.
+  if (none_of(II->args(), [](Value *V) {
+        return isa<ShuffleVectorInst>(V) && V->hasOneUse();
+      }))
     return nullptr;
 
   // See if all arguments are shuffled with the same mask.
   SmallVector<Value *, 4> NewArgs;
   Type *SrcTy = X->getType();
-  for (Value *Arg : II->args()) {
-    if (match(Arg, m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
-        X->getType() == SrcTy)
+  for (Use &Arg : II->args()) {
+    if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
+                                           Arg.getOperandNo(), nullptr))
+      NewArgs.push_back(Arg);
+    else if (match(&Arg,
+                   m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
+             X->getType() == SrcTy)
       NewArgs.push_back(X);
-    else if (match(Arg, m_ImmConstant(C))) {
+    else if (match(&Arg, m_ImmConstant(C))) {
       // If it's a constant, try find the constant that would be shuffled to C.
       if (Constant *ShuffledC =
               unshuffleConstant(Mask, C, cast<VectorType>(SrcTy)))
@@ -1448,8 +1452,12 @@ InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
 
   // intrinsic (shuf X, M), (shuf Y, M), ... --> shuf (intrinsic X, Y, ...), M
   Instruction *FPI = isa<FPMathOperator>(II) ? II : nullptr;
+  // Result type might be a different vector width.
+  // TODO: Check that the result type isn't widened?
+  VectorType *ResTy =
+      VectorType::get(II->getType()->getScalarType(), cast<VectorType>(SrcTy));
   Value *NewIntrinsic =
-      Builder.CreateIntrinsic(II->getIntrinsicID(), SrcTy, NewArgs, FPI);
+      Builder.CreateIntrinsic(ResTy, II->getIntrinsicID(), NewArgs, FPI);
   return new ShuffleVectorInst(NewIntrinsic, Mask);
 }
 
diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll
index 7037647d116ba..fd67fc3421498 100644
--- a/llvm/test/Transforms/InstCombine/abs-1.ll
+++ b/llvm/test/Transforms/InstCombine/abs-1.ll
@@ -978,3 +978,14 @@ define i32 @abs_diff_signed_slt_no_nsw_swap(i32 %a, i32 %b) {
   %cond = select i1 %cmp, i32 %sub_ba, i32 %sub_ab
   ret i32 %cond
 }
+
+define <2 x i32> @abs_unary_shuffle_ops(<2 x i32> %x) {
+; CHECK-LABEL: @abs_unary_shuffle_ops(
+; CHECK-NEXT:    [[R2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[R1:%.*]], i1 false)
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[R2]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x i32> @llvm.abs(<2 x i32> %a, i1 false)
+  ret <2 x i32> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll
index f0d4f776a5d90..e3d3e722bcc23 100644
--- a/llvm/test/Transforms/InstCombine/fma.ll
+++ b/llvm/test/Transforms/InstCombine/fma.ll
@@ -972,6 +972,19 @@ define <2 x half> @fma_negone_vec_partial_undef(<2 x half> %x, <2 x half> %y) {
   ret <2 x half> %sub
 }
 
+define <2 x float> @fmuladd_unary_shuffle_ops(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
+; CHECK-LABEL: @fmuladd_unary_shuffle_ops(
+; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x float> [[R1]]
+;
+  %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %b = shufflevector <2 x float> %y, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %c = shufflevector <2 x float> %z, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+  ret <2 x float> %r
+}
+
 ; negative tests
 
 define half @fma_non_negone(half %x, half %y) {
diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
index 38930956eda2f..52bc3636be359 100644
--- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
+++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll
@@ -2511,6 +2511,21 @@ define <3 x i8> @smin_unary_shuffle_ops_uses(<3 x i8> %x, <3 x i8> %y) {
   ret <3 x i8> %r
 }
 
+; negative test - too many uses
+
+define <3 x i8> @smin_unary_shuffle_ops_uses_const(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @smin_unary_shuffle_ops_uses_const(
+; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; CHECK-NEXT:    call void @use_vec(<3 x i8> [[SX]])
+; CHECK-NEXT:    [[R:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[SX]], <3 x i8> <i8 1, i8 2, i8 3>)
+; CHECK-NEXT:    ret <3 x i8> [[R]]
+;
+  %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  call void @use_vec(<3 x i8> %sx)
+  %r = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %sx, <3 x i8> <i8 1, i8 2, i8 3>)
+  ret <3 x i8> %r
+}
+
 ; This would assert/crash because we tried to zext to i1.
 
 @g = external dso_local global i32, align 4
diff --git a/llvm/test/Transforms/InstCombine/powi.ll b/llvm/test/Transforms/InstCombine/powi.ll
index d76f92c1849af..422792a5a2c28 100644
--- a/llvm/test/Transforms/InstCombine/powi.ll
+++ b/llvm/test/Transforms/InstCombine/powi.ll
@@ -564,3 +564,29 @@ define double @powi_fmul_powi_x_overflow(double noundef %x) {
   %mul = fmul reassoc double %p1, %x
   ret double %mul
 }
+
+define <3 x float> @powi_unary_shuffle_ops(<3 x float> %x, i32 %power) {
+; CHECK-LABEL: @powi_unary_shuffle_ops(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> [[X:%.*]], i32 [[POWER:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  %r = call <3 x float> @llvm.powi(<3 x float> %sx, i32 %power)
+  ret <3 x float> %r
+}
+
+; Negative test - multiple uses
+
+define <3 x float> @powi_unary_shuffle_ops_use(<3 x float> %x, i32 %power, ptr %p) {
+; CHECK-LABEL: @powi_unary_shuffle_ops_use(
+; CHECK-NEXT:    [[SX:%.*]] = shufflevector <3 x float> [[X:%.*]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; CHECK-NEXT:    store <3 x float> [[SX]], ptr [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> [[SX]], i32 [[POWER:%.*]])
+; CHECK-NEXT:    ret <3 x float> [[R]]
+;
+  %sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  store <3 x float> %sx, ptr %p
+  %r = call <3 x float> @llvm.powi(<3 x float> %sx, i32 %power)
+  ret <3 x float> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll
index 2140a59de3fa9..2bf22aeb7a6e9 100644
--- a/llvm/test/Transforms/InstCombine/scmp.ll
+++ b/llvm/test/Transforms/InstCombine/scmp.ll
@@ -423,6 +423,19 @@ define i8 @scmp_from_select_eq_and_gt_commuted3(i32 %x, i32 %y) {
   ret i8 %r
 }
 
+define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: define <3 x i2> @scmp_unary_shuffle_ops(
+; CHECK-SAME: <3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x i2> @llvm.scmp.v3i2.v3i8(<3 x i8> [[X]], <3 x i8> [[Y]])
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x i2> [[TMP1]], <3 x i2> poison, <3 x i32> <i32 1, i32 0, i32 2>
+; CHECK-NEXT:    ret <3 x i2> [[R]]
+;
+  %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  %sy = shufflevector <3 x i8> %y, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2>
+  %r = call <3 x i2> @llvm.scmp(<3 x i8> %sx, <3 x i8> %sy)
+  ret <3 x i2> %r
+}
+
 ; Negative test: true value of outer select is not zero
 define i8 @scmp_from_select_eq_and_gt_neg1(i32 %x, i32 %y) {
 ; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_neg1(
diff --git a/llvm/test/Transforms/InstCombine/sqrt.ll b/llvm/test/Transforms/InstCombine/sqrt.ll
index 0f4db3b3a65ae..2fda5bc37d023 100644
--- a/llvm/test/Transforms/InstCombine/sqrt.ll
+++ b/llvm/test/Transforms/InstCombine/sqrt.ll
@@ -201,6 +201,17 @@ define <2 x float> @sqrt_exp_vec(<2 x float> %x) {
   ret <2 x float> %res
 }
 
+define <2 x float> @sqrt_unary_shuffle_ops(<2 x float> %x) {
+; CHECK-LABEL: @sqrt_unary_shuffle_ops(
+; CHECK-NEXT:    [[R:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A:%.*]])
+; CHECK-NEXT:    [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    ret <2 x float> [[R1]]
+;
+  %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
+  %r = call <2 x float> @llvm.sqrt(<2 x float> %a)
+  ret <2 x float> %r
+}
+
 declare i32 @foo(double)
 declare double @sqrt(double) readnone
 declare float @sqrtf(float)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 3749bdf1bba39..783a1e83c6724 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 0bb641371825b..7e31ec9a0b39a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX8-NEXT:  bb:
 ; GFX8-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX8-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX8-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX8-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX8-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX8-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
 ; GFX9-NEXT:  bb:
 ; GFX9-NEXT:    [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
 ; GFX9-NEXT:    [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT:    [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
 ; GFX9-NEXT:    [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
 ; GFX9-NEXT:    [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
 ; GFX9-NEXT:    ret <3 x i16> [[INS_2]]
 ;
@@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
 ;
 ; GFX8-LABEL: @uadd_sat_v4i16(
 ; GFX8-NEXT:  bb:
-; GFX8-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX8-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX8-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX8-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX8-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX8-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 ; GFX9-LABEL: @uadd_sat_v4i16(
 ; GFX9-NEXT:  bb:
-; GFX9-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX9-NEXT:    [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT:    [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
+; GFX9-NEXT:    [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
 ; GFX9-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; GFX9-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
-; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; GFX9-NEXT:    [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; GFX9-NEXT:    ret <4 x i16> [[INS_31]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index b790e6f3c99c6..77d36f0107665 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -38,13 +38,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -59,13 +59,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP8]])
+; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -80,13 +80,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index ef1a67032c237..18d79752b0b44 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -38,13 +38,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; SLM-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; SLM-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; SLM-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; SLM-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; SLM-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; SLM-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; SLM-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; SLM-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SLM-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -59,13 +59,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT:    [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX-NEXT:    [[TMP9:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP8]])
+; AVX-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT:    [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX-NEXT:    [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -80,13 +80,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
 ; AVX2-NEXT:    [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
 ; AVX2-NEXT:    [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
 ; AVX2-NEXT:    [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
+; AVX2-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
 ; AVX2-NEXT:    [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX2-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
-; AVX2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX2-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX2-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]])
+; AVX2-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
 ; AVX2-NEXT:    [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
 ; AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX2-NEXT:    [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>

From c609112a5383c10272e3afceedd4d03f26437cf0 Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Fri, 13 Jun 2025 10:25:26 -0700
Subject: [PATCH 405/851] Fix/reapply "[libc] Migrate stdio tests to
 ErrnoCheckingTest." (#143972)

This reverts commit a93e55e57ed00a55f822c64e3520c7c732b58480 and fixes
build and test failures:

* Proper include added to setvbuf_test.cpp
* fgetc/fgetc_unlocked/fgets tests are ported to ErrnoSetterMatcher and
are made more precise. This fixes inconsistencies between expectations
in regular and GPU builds - ErrnoSetterMatcher is configured to omit
errno matching on GPUs, as fgetc implementation on GPU doesn't set
errno, in contrast to Linux.
---
 libc/test/src/stdio/CMakeLists.txt           | 13 ++++++++++++
 libc/test/src/stdio/fdopen_test.cpp          | 14 ++++++-------
 libc/test/src/stdio/fgetc_test.cpp           | 22 +++++++++++---------
 libc/test/src/stdio/fgetc_unlocked_test.cpp  | 22 +++++++++++---------
 libc/test/src/stdio/fgets_test.cpp           | 18 +++++++++-------
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++-------------
 libc/test/src/stdio/fopencookie_test.cpp     | 15 +++++++------
 libc/test/src/stdio/remove_test.cpp          | 10 ++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 ++++----
 libc/test/src/stdio/setvbuf_test.cpp         |  9 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 +++----
 libc/test/src/stdlib/StrtolTest.h            |  1 -
 libc/test/src/stdlib/strtold_test.cpp        |  1 -
 13 files changed, 84 insertions(+), 77 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index ce2171f19597b..4aa8b95880018 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,6 +20,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -68,6 +69,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -88,6 +90,7 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -109,6 +112,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -438,6 +442,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -452,6 +457,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -468,6 +474,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -488,6 +495,8 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -510,6 +519,8 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -527,6 +538,8 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index 104fc478b100e..b53184c30be36 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,20 +9,21 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
-  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 56bde5f0099a8..be2e50271b510 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,12 +14,15 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -27,29 +30,28 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+                Succeeds(WRITE_SIZE));
     // This is a write-only file so reads should fail.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Fails(EBADF, EOF));
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      int c = func(file);
-      ASSERT_EQ(c, int('1' + i));
+      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Succeeds(EOF));
     ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
   }
 };
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 90429ecf4e82b..bef9dafd3d87c 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,12 +17,15 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -30,31 +33,30 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+                Succeeds(WRITE_SIZE));
     // This is a write-only file so reads should fail.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Fails(EBADF, EOF));
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     LIBC_NAMESPACE::flockfile(file);
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      int c = func(file);
-      ASSERT_EQ(c, int('1' + i));
+      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Succeeds(EOF));
     ASSERT_NE(LIBC_NAMESPACE::feof_unlocked(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(file), 0);
 
     LIBC_NAMESPACE::funlockfile(file);
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
   }
 };
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index abed3d4052939..ca8d4d4546635 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,11 +12,14 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
-TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -29,15 +32,15 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   char buff[8];
   char *output;
 
-  ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+  ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+              Succeeds(WRITE_SIZE));
   // This is a write-only file so reads should fail.
-  ASSERT_TRUE(LIBC_NAMESPACE::fgets(buff, 8, file) == nullptr);
+  ASSERT_THAT(LIBC_NAMESPACE::fgets(buff, 8, file), Fails(EBADF, nullptr));
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
-  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
   file = LIBC_NAMESPACE::fopen(FILENAME, "r");
   ASSERT_FALSE(file == nullptr);
@@ -55,6 +58,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is also implementation defined.
   output = LIBC_NAMESPACE::fgets(buff, 0, file);
   ASSERT_TRUE(output == nullptr);
+  ASSERT_ERRNO_SUCCESS();
 #endif
 
   const char *output_arr[] = {
@@ -86,5 +90,5 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_ERRNO_SUCCESS();
 
-  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 }
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e624181c795b8..e097785832d56 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,17 +17,18 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST(LlvmLibcFILETest, SimpleFileOperations) {
+TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -80,15 +79,12 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST(LlvmLibcFILETest, FFlush) {
+TEST_F(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
-  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index 03e1ac286b646..bcf5e674141a7 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,6 +15,7 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -22,6 +23,7 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
+using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -88,7 +90,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 84984e26398c0..296bff1f5dc15 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,16 +11,17 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index ac494a4ecaf8e..135fb98c07fbb 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,18 +8,19 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
+using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRenameTest, RenameNonExistent) {
+TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 5872943c1bb41..a0936ba79ef73 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,12 +11,14 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
+using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -52,7 +54,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -102,6 +104,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
-  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index 5d482b70064bd..e99b382d12112 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,11 +15,12 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
-TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 3eeccc5727e77..03f0a6539c785 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,7 +9,6 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index c2f2b9c9a11c3..eb4056dc7ba64 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 493c1612d6f8f7a40d0bf0ba28fb753be83fac1c Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Fri, 13 Jun 2025 13:26:26 -0400
Subject: [PATCH 406/851] [SPIRV] Fix ExecutionMode_fragment.ll test (#144116)

Fix test broken by https://github.com/llvm/llvm-project/pull/143412.
---
 llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
index 4fa764fe192d3..aab0ae05753fa 100644
--- a/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
+++ b/llvm/test/CodeGen/SPIRV/ExecutionMode_Fragment.ll
@@ -4,17 +4,16 @@
 ; CHECK-DAG: OpEntryPoint Fragment %[[#entry:]] "main" {{.*}}
 ; CHECK-DAG: OpExecutionMode %[[#entry]] OriginUpperLeft
 
+@.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1
 
 define void @main() #0 {
 entry:
-  %0 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false)
+  %0 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, i32 1, i32 1, i32 0, i1 false, ptr nonnull @.str.b0)
   %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %0, i32 0)
   store i32 1, ptr addrspace(11) %1, align 4
 
   ret void
 }
 
-declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32, i32, i32, i32, i1) #1
-
 attributes #0 = { "hlsl.shader"="pixel" }
 attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }

From fd432151a607a997c417f32cb70650fc7728629a Mon Sep 17 00:00:00 2001
From: William Huynh <113542065+saturn691@users.noreply.github.com>
Date: Fri, 13 Jun 2025 18:26:40 +0100
Subject: [PATCH 407/851] [libc] Fix bugs found when testing with all headers
 (#144049)

Fixes a couple of bugs found when building. The PR to enable the headers
can be found here: #144114.

- math.yaml: float128 guard
- wchar.yaml: __restrict keyword order
---
 libc/include/math.yaml                        |  2 +-
 libc/include/wchar.yaml                       | 20 +++++++++----------
 .../src/stdio/printf_core/converter_test.cpp  |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/libc/include/math.yaml b/libc/include/math.yaml
index 466c08ade6fc4..11bead0745954 100644
--- a/libc/include/math.yaml
+++ b/libc/include/math.yaml
@@ -734,7 +734,7 @@ functions:
       - type: float128
       - type: float128
       - type: float128
-    guards: LIBC_TYPES_HAS_FLOAT128
+    guard: LIBC_TYPES_HAS_FLOAT128
   - name: ffmal
     standards:
       - stdc
diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml
index 1af15a6c112b5..84db73d8f01ea 100644
--- a/libc/include/wchar.yaml
+++ b/libc/include/wchar.yaml
@@ -109,8 +109,8 @@ functions:
       - stdc
     return_type: wchar_t *
     arguments: 
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict 
+      - type: const wchar_t *__restrict
       - type: size_t
   - name: wmemmove
     standards:
@@ -125,16 +125,16 @@ functions:
       - stdc
     return_type: wchar_t *
     arguments:
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict
+      - type: const wchar_t *__restrict
       - type: size_t
   - name: wcscat
     standards:
       - stdc
     return_type: wchar_t *
     arguments: 
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict
+      - type: const wchar_t *__restrict
   - name: wcsstr
     standards:
       - stdc
@@ -147,13 +147,13 @@ functions:
       - stdc
     return_type: wchar_t *
     arguments:
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict
+      - type: const wchar_t *__restrict
       - type: size_t
   - name: wcscpy
     standards:
       - stdc
     return_type: wchar_t *
     arguments:
-      - type: __restrict wchar_t *
-      - type: const __restrict wchar_t *
+      - type: wchar_t *__restrict
+      - type: const wchar_t *__restrict
diff --git a/libc/test/src/stdio/printf_core/converter_test.cpp b/libc/test/src/stdio/printf_core/converter_test.cpp
index 96a00ae598ec2..bf088937e4104 100644
--- a/libc/test/src/stdio/printf_core/converter_test.cpp
+++ b/libc/test/src/stdio/printf_core/converter_test.cpp
@@ -124,7 +124,7 @@ TEST_F(LlvmLibcPrintfConverterTest, StringConversionSimple) {
 TEST_F(LlvmLibcPrintfConverterTest, StringConversionPrecisionHigh) {
   LIBC_NAMESPACE::printf_core::FormatSection high_precision_conv;
   high_precision_conv.has_conv = true;
-  high_precision_conv.raw_string = "%4s";
+  high_precision_conv.raw_string = "%.4s";
   high_precision_conv.conv_name = 's';
   high_precision_conv.precision = 4;
   high_precision_conv.conv_val_ptr = const_cast<char *>("456");

From 9a3082276d21873a37925d0c6ad89bd28d065cea Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 10:28:03 -0700
Subject: [PATCH 408/851] [CIR][NFC] Fix forrange.cpp test (#144123)

A recent change has cause the begin and end iterators in the
forrange.cpp CIR codegen test to be marked as 'init' causing the test to
fail. This change fixes the checks in the test.
---
 clang/test/CIR/CodeGen/forrange.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CIR/CodeGen/forrange.cpp b/clang/test/CIR/CodeGen/forrange.cpp
index 6b6ccc79e59dd..45e146e9091d0 100644
--- a/clang/test/CIR/CodeGen/forrange.cpp
+++ b/clang/test/CIR/CodeGen/forrange.cpp
@@ -115,8 +115,8 @@ void for_range3() {
 // CIR:    %[[C_ADDR:.*]] = cir.alloca !rec_C3{{.*}} ["c"]
 // CIR:    cir.scope {
 // CIR:      %[[RANGE_ADDR:.*]] = cir.alloca !cir.ptr<!rec_C3>{{.*}} ["__range1", init, const]
-// CIR:      %[[BEGIN_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr<!rec_Iterator>{{.*}} ["__begin1"]
-// CIR:      %[[END_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr<!rec_Iterator>{{.*}} ["__end1"]
+// CIR:      %[[BEGIN_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr<!rec_Iterator>{{.*}} ["__begin1", init]
+// CIR:      %[[END_ADDR:.*]] = cir.alloca !rec_Iterator, !cir.ptr<!rec_Iterator>{{.*}} ["__end1", init]
 // CIR:      %[[E_ADDR:.*]] = cir.alloca !cir.ptr<!rec_Element>{{.*}} ["e", init, const]
 // CIR:      cir.store{{.*}} %[[C_ADDR]], %[[RANGE_ADDR]]
 // CIR:      cir.for : cond {

From 62eea86424c4eacd38ad8a03f4bdae78687e3ade Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Fri, 13 Jun 2025 19:29:21 +0200
Subject: [PATCH 409/851] [CIR] Update isSized with upstreamed types (#143960)

Update `isSized` function with the upstreamed types
---
 clang/lib/CIR/CodeGen/CIRGenBuilder.h |  5 +++--
 clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 13 +++++++++++--
 clang/test/CIR/CodeGen/array.cpp      | 23 +++++++++++++++++++++++
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 36c89809b4d90..a4bc69619d60c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -139,8 +139,9 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
   }
 
   bool isSized(mlir::Type ty) {
-    if (mlir::isa<cir::PointerType, cir::ArrayType, cir::BoolType,
-                  cir::IntType>(ty))
+    if (mlir::isa<cir::PointerType, cir::ArrayType, cir::BoolType, cir::IntType,
+                  cir::CIRFPTypeInterface, cir::ComplexType, cir::RecordType>(
+            ty))
       return true;
 
     if (const auto vt = mlir::dyn_cast<cir::VectorType>(ty))
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index eaba3dfd1105e..bab47924dd719 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -419,6 +419,15 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
   case Type::ConstantArray: {
     const ConstantArrayType *arrTy = cast<ConstantArrayType>(ty);
     mlir::Type elemTy = convertTypeForMem(arrTy->getElementType());
+
+    // TODO(CIR): In LLVM, "lower arrays of undefined struct type to arrays of
+    // i8 just to have a concrete type"
+    if (!builder.isSized(elemTy)) {
+      cgm.errorNYI(SourceLocation(), "arrays of undefined struct type", type);
+      resultType = cgm.UInt32Ty;
+      break;
+    }
+
     resultType = cir::ArrayType::get(elemTy, arrTy->getSize().getZExtValue());
     break;
   }
@@ -432,8 +441,8 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
   }
 
   case Type::Enum: {
-    const EnumDecl *ED = cast<EnumType>(ty)->getDecl();
-    if (auto integerType = ED->getIntegerType(); !integerType.isNull())
+    const EnumDecl *ed = cast<EnumType>(ty)->getDecl();
+    if (auto integerType = ed->getIntegerType(); !integerType.isNull())
       return convertType(integerType);
     // Return a placeholder 'i32' type.  This can be changed later when the
     // type is defined (see UpdateCompletedType), but is likely to be the
diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp
index 7b90c1682ec45..26e172a006451 100644
--- a/clang/test/CIR/CodeGen/array.cpp
+++ b/clang/test/CIR/CodeGen/array.cpp
@@ -473,3 +473,26 @@ void func10(int *a) {
 // OGCG:  %[[ELE:.*]] = getelementptr inbounds i32, ptr %[[TMP_1]], i64 5
 // OGCG:  %[[TMP_2:.*]] = load i32, ptr %[[ELE]], align 4
 // OGCG:  store i32 %[[TMP_2]], ptr %[[INIT]], align 4
+
+void func11() { int _Complex a[4]; }
+
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!cir.complex<!s32i> x 4>, !cir.ptr<!cir.array<!cir.complex<!s32i> x 4>>, ["a"]
+
+// LLVM: %[[ARR:.*]] = alloca [4 x { i32, i32 }], i64 1, align 16
+
+// OGCG: %[[ARR:.*]] = alloca [4 x { i32, i32 }], align 16
+
+void func12() {
+  struct Point {
+    int x;
+    int y;
+  };
+
+  Point a[4];
+}
+
+// CIR: %[[ARR:.*]] = cir.alloca !cir.array<!rec_Point x 4>, !cir.ptr<!cir.array<!rec_Point x 4>>, ["a"]
+
+// LLVM: %[[ARR:.*]] = alloca [4 x %struct.Point], i64 1, align 16
+
+// OGCG: %[[ARR:.*]] = alloca [4 x %struct.Point], align 16

From ec330cf6701793525da9eb471e7ff796938ab54a Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 10:31:35 -0700
Subject: [PATCH 410/851] [bazel] Update llvm-config.h and disable DebugLoc
 tracking (#144125)

In c588224ca797886064a7a79f6c0114a6963c325e, @chapuni set
LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING to 1, but from what I can tell,
this is not the default setting for CMake builds. I think the intention
was mostly just to update llvm-config.h to fix the Bazel build.

I'm adding LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING as well to fix the build
for the same purpose.
---
 .../llvm/include/llvm/Config/llvm-config.h                  | 6 +++++-
 utils/bazel/llvm_configs/llvm-config.h.cmake                | 4 ++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
index 97626d4665092..5dd53cffb7bd7 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
@@ -132,6 +132,10 @@
 
 /* Define to 1 to enable expensive checks for debug location coverage checking,
    and to 0 otherwise. */
-#define LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1
+#define LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0
+
+/* Define to 1 to enable expensive tracking of the origin of debug location
+   coverage bugs, and to 0 otherwise. */
+#define LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 0
 
 #endif
diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake
index dbc882937b4f4..6d3c37cc8b194 100644
--- a/utils/bazel/llvm_configs/llvm-config.h.cmake
+++ b/utils/bazel/llvm_configs/llvm-config.h.cmake
@@ -133,4 +133,8 @@
    and to 0 otherwise. */
 #cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 
+/* Define to 1 to enable expensive tracking of the origin of debug location
+   coverage bugs, and to 0 otherwise. */
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+
 #endif

From 51689c9df2fbb81aab1ff802f3efb86cac926853 Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Fri, 13 Jun 2025 10:31:47 -0700
Subject: [PATCH 411/851] [libc][NFC] clean internal fd handling (#143991)

The previous internal fcntl implementation modified errno directly, this
patch fixes that. This patch also moves open and close into OSUtil since
they are used in multiple places. There are more places that need
similar cleanup but only got comments in this patch to keep it
relatively reviewable.

Related to: https://github.com/llvm/llvm-project/issues/143937
---
 libc/src/__support/File/linux/file.cpp        | 14 ++--
 libc/src/__support/OSUtil/fcntl.h             |  8 +-
 .../src/__support/OSUtil/linux/CMakeLists.txt |  1 -
 libc/src/__support/OSUtil/linux/fcntl.cpp     | 83 ++++++++++++-------
 libc/src/fcntl/linux/CMakeLists.txt           |  1 +
 libc/src/fcntl/linux/fcntl.cpp                | 10 ++-
 libc/src/fcntl/linux/open.cpp                 | 24 ++----
 libc/src/sys/auxv/linux/getauxval.cpp         | 37 ++++++---
 libc/src/sys/mman/linux/shm_common.h          |  5 ++
 libc/src/sys/mman/linux/shm_open.cpp          | 16 +++-
 libc/src/sys/mman/linux/shm_unlink.cpp        |  9 +-
 libc/src/unistd/linux/close.cpp               | 12 +--
 .../llvm-project-overlay/libc/BUILD.bazel     | 58 ++++++++++++-
 13 files changed, 197 insertions(+), 81 deletions(-)

diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp
index 761e352f74ead..4594dadf1ccdf 100644
--- a/libc/src/__support/File/linux/file.cpp
+++ b/libc/src/__support/File/linux/file.cpp
@@ -19,8 +19,8 @@
 #include "src/__support/macros/config.h"
 
 #include "hdr/fcntl_macros.h" // For mode_t and other flags to the open syscall
-#include <sys/stat.h>    // For S_IS*, S_IF*, and S_IR* flags.
-#include <sys/syscall.h> // For syscall numbers
+#include <sys/stat.h>         // For S_IS*, S_IF*, and S_IR* flags.
+#include <sys/syscall.h>      // For syscall numbers
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -128,10 +128,11 @@ ErrorOr<LinuxFile *> create_file_from_fd(int fd, const char *mode) {
     return Error(EINVAL);
   }
 
-  int fd_flags = internal::fcntl(fd, F_GETFL);
-  if (fd_flags == -1) {
+  auto result = internal::fcntl(fd, F_GETFL);
+  if (!result.has_value()) {
     return Error(EBADF);
   }
+  int fd_flags = result.value();
 
   using OpenMode = File::OpenMode;
   if (((fd_flags & O_ACCMODE) == O_RDONLY &&
@@ -145,8 +146,9 @@ ErrorOr<LinuxFile *> create_file_from_fd(int fd, const char *mode) {
   if ((modeflags & static_cast<ModeFlags>(OpenMode::APPEND)) &&
       !(fd_flags & O_APPEND)) {
     do_seek = true;
-    if (internal::fcntl(fd, F_SETFL,
-                        reinterpret_cast<void *>(fd_flags | O_APPEND)) == -1) {
+    if (!internal::fcntl(fd, F_SETFL,
+                         reinterpret_cast<void *>(fd_flags | O_APPEND))
+             .has_value()) {
       return Error(EBADF);
     }
   }
diff --git a/libc/src/__support/OSUtil/fcntl.h b/libc/src/__support/OSUtil/fcntl.h
index 46f7d28132396..3983d78f7f89c 100644
--- a/libc/src/__support/OSUtil/fcntl.h
+++ b/libc/src/__support/OSUtil/fcntl.h
@@ -8,12 +8,18 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_FCNTL_H
 #define LLVM_LIBC_SRC___SUPPORT_OSUTIL_FCNTL_H
 
+#include "hdr/types/mode_t.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-int fcntl(int fd, int cmd, void *arg = nullptr);
+ErrorOr<int> fcntl(int fd, int cmd, void *arg = nullptr);
+
+ErrorOr<int> open(const char *path, int flags, mode_t mode_flags = 0);
+
+ErrorOr<int> close(int fd);
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt
index b9704d42cd33b..4681d8c2bb73c 100644
--- a/libc/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt
@@ -16,7 +16,6 @@ add_object_library(
     .${LIBC_TARGET_ARCHITECTURE}.linux_${LIBC_TARGET_ARCHITECTURE}_util
     libc.src.__support.common
     libc.src.__support.CPP.string_view
-    libc.src.errno.errno
     libc.hdr.fcntl_macros
     libc.hdr.types.struct_flock
     libc.hdr.types.struct_flock64
diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp
index 99e16ad58c918..bb76eee90efd2 100644
--- a/libc/src/__support/OSUtil/linux/fcntl.cpp
+++ b/libc/src/__support/OSUtil/linux/fcntl.cpp
@@ -8,23 +8,24 @@
 
 #include "src/__support/OSUtil/fcntl.h"
 
+#include "hdr/errno_macros.h"
 #include "hdr/fcntl_macros.h"
+#include "hdr/types/mode_t.h"
 #include "hdr/types/off_t.h"
 #include "hdr/types/struct_f_owner_ex.h"
 #include "hdr/types/struct_flock.h"
 #include "hdr/types/struct_flock64.h"
 #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
 #include "src/__support/common.h"
-#include "src/__support/libc_errno.h"
+#include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 
-#include <stdarg.h>
 #include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
-int fcntl(int fd, int cmd, void *arg) {
+ErrorOr<int> fcntl(int fd, int cmd, void *arg) {
 #if SYS_fcntl
   constexpr auto FCNTL_SYSCALL_ID = SYS_fcntl;
 #elif defined(SYS_fcntl64)
@@ -33,8 +34,7 @@ int fcntl(int fd, int cmd, void *arg) {
 #error "fcntl and fcntl64 syscalls not available."
 #endif
 
-  int new_cmd = cmd;
-  switch (new_cmd) {
+  switch (cmd) {
   case F_OFD_SETLKW: {
     struct flock *flk = reinterpret_cast<struct flock *>(arg);
     // convert the struct to a flock64
@@ -45,8 +45,11 @@ int fcntl(int fd, int cmd, void *arg) {
     flk64.l_len = flk->l_len;
     flk64.l_pid = flk->l_pid;
     // create a syscall
-    return LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, new_cmd,
-                                             &flk64);
+    int ret =
+        LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
+    if (ret < 0)
+      return Error(-ret);
+    return ret;
   }
   case F_OFD_GETLK:
   case F_OFD_SETLK: {
@@ -59,60 +62,80 @@ int fcntl(int fd, int cmd, void *arg) {
     flk64.l_len = flk->l_len;
     flk64.l_pid = flk->l_pid;
     // create a syscall
-    int retVal = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd,
-                                                   new_cmd, &flk64);
+    int ret =
+        LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd, &flk64);
     // On failure, return
-    if (retVal == -1)
-      return -1;
+    if (ret < 0)
+      return Error(-1);
     // Check for overflow, i.e. the offsets are not the same when cast
     // to off_t from off64_t.
     if (static_cast<off_t>(flk64.l_len) != flk64.l_len ||
-        static_cast<off_t>(flk64.l_start) != flk64.l_start) {
-      libc_errno = EOVERFLOW;
-      return -1;
-    }
+        static_cast<off_t>(flk64.l_start) != flk64.l_start)
+      return Error(EOVERFLOW);
+
     // Now copy back into flk, in case flk64 got modified
     flk->l_type = flk64.l_type;
     flk->l_whence = flk64.l_whence;
     flk->l_start = static_cast<decltype(flk->l_start)>(flk64.l_start);
     flk->l_len = static_cast<decltype(flk->l_len)>(flk64.l_len);
     flk->l_pid = flk64.l_pid;
-    return retVal;
+    return ret;
   }
   case F_GETOWN: {
     struct f_owner_ex fex;
     int ret = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd,
                                                 F_GETOWN_EX, &fex);
-    if (ret >= 0)
-      return fex.type == F_OWNER_PGRP ? -fex.pid : fex.pid;
-    libc_errno = -ret;
-    return -1;
+    if (ret < 0)
+      return Error(-ret);
+    return fex.type == F_OWNER_PGRP ? -fex.pid : fex.pid;
   }
 #ifdef SYS_fcntl64
   case F_GETLK: {
     if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_GETLK64;
+      cmd = F_GETLK64;
     break;
   }
   case F_SETLK: {
     if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_SETLK64;
+      cmd = F_SETLK64;
     break;
   }
   case F_SETLKW: {
     if constexpr (FCNTL_SYSCALL_ID == SYS_fcntl64)
-      new_cmd = F_SETLKW64;
+      cmd = F_SETLKW64;
     break;
   }
 #endif
   }
-  int retVal = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, new_cmd,
-                                                 reinterpret_cast<void *>(arg));
-  if (retVal >= 0) {
-    return retVal;
-  }
-  libc_errno = -retVal;
-  return -1;
+
+  // default, but may use rewritten cmd from above.
+  int ret = LIBC_NAMESPACE::syscall_impl<int>(FCNTL_SYSCALL_ID, fd, cmd,
+                                              reinterpret_cast<void *>(arg));
+  if (ret < 0)
+    return Error(-ret);
+  return ret;
+}
+
+ErrorOr<int> open(const char *path, int flags, mode_t mode_flags) {
+#ifdef SYS_open
+  int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_open, path, flags, mode_flags);
+#else
+  int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_openat, AT_FDCWD, path, flags,
+                                             mode_flags);
+#endif
+  if (fd < 0)
+    return Error(-fd);
+
+  return fd;
+}
+
+ErrorOr<int> close(int fd) {
+  int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_close, fd);
+
+  if (ret < 0)
+    return Error(-ret);
+
+  return ret;
 }
 
 } // namespace internal
diff --git a/libc/src/fcntl/linux/CMakeLists.txt b/libc/src/fcntl/linux/CMakeLists.txt
index 580db16cd4132..c31eb3f438c10 100644
--- a/libc/src/fcntl/linux/CMakeLists.txt
+++ b/libc/src/fcntl/linux/CMakeLists.txt
@@ -19,6 +19,7 @@ add_entrypoint_object(
   DEPENDS
     libc.hdr.fcntl_macros
     libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
 )
 
 add_entrypoint_object(
diff --git a/libc/src/fcntl/linux/fcntl.cpp b/libc/src/fcntl/linux/fcntl.cpp
index a0c8459ced342..fd9c48eb562f7 100644
--- a/libc/src/fcntl/linux/fcntl.cpp
+++ b/libc/src/fcntl/linux/fcntl.cpp
@@ -10,6 +10,7 @@
 
 #include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/common.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 
 #include <stdarg.h>
@@ -22,7 +23,14 @@ LLVM_LIBC_FUNCTION(int, fcntl, (int fd, int cmd, ...)) {
   va_start(varargs, cmd);
   arg = va_arg(varargs, void *);
   va_end(varargs);
-  return LIBC_NAMESPACE::internal::fcntl(fd, cmd, arg);
+
+  auto result = LIBC_NAMESPACE::internal::fcntl(fd, cmd, arg);
+
+  if (!result.has_value()) {
+    libc_errno = result.error();
+    return -1;
+  }
+  return result.value();
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/fcntl/linux/open.cpp b/libc/src/fcntl/linux/open.cpp
index a21a03788deaa..3a56d10554198 100644
--- a/libc/src/fcntl/linux/open.cpp
+++ b/libc/src/fcntl/linux/open.cpp
@@ -8,15 +8,13 @@
 
 #include "src/fcntl/open.h"
 
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "hdr/fcntl_macros.h"
+#include "hdr/types/mode_t.h"
+#include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/common.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
-
-#include "hdr/fcntl_macros.h"
-#include "hdr/types/mode_t.h"
 #include <stdarg.h>
-#include <sys/syscall.h> // For syscall numbers.
 
 namespace LIBC_NAMESPACE_DECL {
 
@@ -31,17 +29,13 @@ LLVM_LIBC_FUNCTION(int, open, (const char *path, int flags, ...)) {
     va_end(varargs);
   }
 
-#ifdef SYS_open
-  int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_open, path, flags, mode_flags);
-#else
-  int fd = LIBC_NAMESPACE::syscall_impl<int>(SYS_openat, AT_FDCWD, path, flags,
-                                             mode_flags);
-#endif
-  if (fd > 0)
-    return fd;
+  auto result = internal::open(path, flags, mode_flags);
 
-  libc_errno = -fd;
-  return -1;
+  if (!result.has_value()) {
+    libc_errno = result.error();
+    return -1;
+  }
+  return result.value();
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp
index f3ae7c5c4e07a..b50c5845bcc2b 100644
--- a/libc/src/sys/auxv/linux/getauxval.cpp
+++ b/libc/src/sys/auxv/linux/getauxval.cpp
@@ -8,6 +8,8 @@
 
 #include "src/sys/auxv/getauxval.h"
 #include "config/app.h"
+#include "hdr/fcntl_macros.h"
+#include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/common.h"
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
@@ -17,14 +19,18 @@
 #include "src/__support/threads/callonce.h"
 #include "src/__support/threads/linux/futex_word.h"
 
+// -----------------------------------------------------------------------------
+// TODO: This file should not include other public libc functions. Calling other
+// public libc functions is an antipattern within LLVM-libc. This needs to be
+// cleaned up. DO NOT COPY THIS.
+// -----------------------------------------------------------------------------
+
 // for mallocing the global auxv
 #include "src/sys/mman/mmap.h"
 #include "src/sys/mman/munmap.h"
 
 // for reading /proc/self/auxv
-#include "src/fcntl/open.h"
 #include "src/sys/prctl/prctl.h"
-#include "src/unistd/close.h"
 #include "src/unistd/read.h"
 
 // getauxval will work either with or without __cxa_atexit support.
@@ -60,17 +66,18 @@ class AuxvMMapGuard {
   constexpr static size_t AUXV_MMAP_SIZE = sizeof(AuxEntry) * MAX_AUXV_ENTRIES;
 
   AuxvMMapGuard()
-      : ptr(mmap(nullptr, AUXV_MMAP_SIZE, PROT_READ | PROT_WRITE,
-                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) {}
+      : ptr(LIBC_NAMESPACE::mmap(nullptr, AUXV_MMAP_SIZE,
+                                 PROT_READ | PROT_WRITE,
+                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) {}
   ~AuxvMMapGuard() {
     if (ptr != MAP_FAILED)
-      munmap(ptr, AUXV_MMAP_SIZE);
+      LIBC_NAMESPACE::munmap(ptr, AUXV_MMAP_SIZE);
   }
   void submit_to_global() {
     // atexit may fail, we do not set it to global in that case.
     int ret = __cxa_atexit(
         [](void *) {
-          munmap(auxv, AUXV_MMAP_SIZE);
+          LIBC_NAMESPACE::munmap(auxv, AUXV_MMAP_SIZE);
           auxv = nullptr;
         },
         nullptr, nullptr);
@@ -90,10 +97,16 @@ class AuxvMMapGuard {
 
 class AuxvFdGuard {
 public:
-  AuxvFdGuard() : fd(open("/proc/self/auxv", O_RDONLY | O_CLOEXEC)) {}
+  AuxvFdGuard() {
+    auto result = internal::open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
+    if (!result.has_value())
+      fd = -1;
+
+    fd = result.value();
+  }
   ~AuxvFdGuard() {
     if (fd != -1)
-      close(fd);
+      internal::close(fd);
   }
   bool valid() const { return fd != -1; }
   int get() const { return fd; }
@@ -135,7 +148,8 @@ static void initialize_auxv_once(void) {
   bool error_detected = false;
   // Read until we use up all the available space or we finish reading the file.
   while (available_size != 0) {
-    ssize_t bytes_read = read(fd_guard.get(), buf, available_size);
+    ssize_t bytes_read =
+        LIBC_NAMESPACE::read(fd_guard.get(), buf, available_size);
     if (bytes_read <= 0) {
       if (libc_errno == EINTR)
         continue;
@@ -158,7 +172,7 @@ static AuxEntry read_entry(int fd) {
   size_t size = sizeof(AuxEntry);
   char *ptr = reinterpret_cast<char *>(&buf);
   while (size > 0) {
-    ssize_t ret = read(fd, ptr, size);
+    ssize_t ret = LIBC_NAMESPACE::read(fd, ptr, size);
     if (ret < 0) {
       if (libc_errno == EINTR)
         continue;
@@ -195,7 +209,8 @@ LLVM_LIBC_FUNCTION(unsigned long, getauxval, (unsigned long id)) {
     return search_auxv(app.auxv_ptr, id);
 
   static FutexWordType once_flag;
-  callonce(reinterpret_cast<CallOnceFlag *>(&once_flag), initialize_auxv_once);
+  LIBC_NAMESPACE::callonce(reinterpret_cast<CallOnceFlag *>(&once_flag),
+                           initialize_auxv_once);
   if (auxv != nullptr)
     return search_auxv(auxv, id);
 
diff --git a/libc/src/sys/mman/linux/shm_common.h b/libc/src/sys/mman/linux/shm_common.h
index 69911012ff7e9..29d1401821e49 100644
--- a/libc/src/sys/mman/linux/shm_common.h
+++ b/libc/src/sys/mman/linux/shm_common.h
@@ -13,6 +13,11 @@
 #include "src/__support/macros/config.h"
 #include "src/string/memory_utils/inline_memcpy.h"
 
+// TODO: clean this up.
+//  1. Change from optional to ErrorOr, and return the errno instead of setting
+//    it here.
+//  2. Replace inline memcpy with __builtin_memcpy
+
 // TODO: Get PATH_MAX via https://github.com/llvm/llvm-project/issues/85121
 #include <linux/limits.h>
 
diff --git a/libc/src/sys/mman/linux/shm_open.cpp b/libc/src/sys/mman/linux/shm_open.cpp
index 11de482272d00..3099062eace98 100644
--- a/libc/src/sys/mman/linux/shm_open.cpp
+++ b/libc/src/sys/mman/linux/shm_open.cpp
@@ -7,9 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/sys/mman/shm_open.h"
+#include "hdr/fcntl_macros.h"
 #include "hdr/types/mode_t.h"
+#include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/macros/config.h"
-#include "src/fcntl/open.h"
 #include "src/sys/mman/linux/shm_common.h"
 
 namespace LIBC_NAMESPACE_DECL {
@@ -17,9 +18,16 @@ namespace LIBC_NAMESPACE_DECL {
 static constexpr int DEFAULT_OFLAGS = O_NOFOLLOW | O_CLOEXEC | O_NONBLOCK;
 
 LLVM_LIBC_FUNCTION(int, shm_open, (const char *name, int oflags, mode_t mode)) {
-  using namespace shm_common;
-  if (cpp::optional<SHMPath> buffer = translate_name(name))
-    return open(buffer->data(), oflags | DEFAULT_OFLAGS, mode);
+  if (cpp::optional<shm_common::SHMPath> buffer =
+          shm_common::translate_name(name)) {
+    auto result = internal::open(buffer->data(), oflags | DEFAULT_OFLAGS, mode);
+
+    if (!result.has_value()) {
+      libc_errno = result.error();
+      return -1;
+    }
+    return result.value();
+  }
   return -1;
 }
 
diff --git a/libc/src/sys/mman/linux/shm_unlink.cpp b/libc/src/sys/mman/linux/shm_unlink.cpp
index 6a76301512201..4c61c7cd16bad 100644
--- a/libc/src/sys/mman/linux/shm_unlink.cpp
+++ b/libc/src/sys/mman/linux/shm_unlink.cpp
@@ -13,10 +13,13 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
+// TODO: stop calling the public unlink function. It should be calling an
+// internal shared utility.
+
 LLVM_LIBC_FUNCTION(int, shm_unlink, (const char *name)) {
-  using namespace shm_common;
-  if (cpp::optional<SHMPath> buffer = translate_name(name))
-    return unlink(buffer->data());
+  if (cpp::optional<shm_common::SHMPath> buffer =
+          shm_common::translate_name(name))
+    return LIBC_NAMESPACE::unlink(buffer->data());
   return -1;
 }
 
diff --git a/libc/src/unistd/linux/close.cpp b/libc/src/unistd/linux/close.cpp
index b5842f2b64d20..6ef3a3c6d63f0 100644
--- a/libc/src/unistd/linux/close.cpp
+++ b/libc/src/unistd/linux/close.cpp
@@ -8,9 +8,8 @@
 
 #include "src/unistd/close.h"
 
-#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/OSUtil/fcntl.h"
 #include "src/__support/common.h"
-
 #include "src/__support/libc_errno.h"
 #include "src/__support/macros/config.h"
 #include <sys/syscall.h> // For syscall numbers.
@@ -18,12 +17,13 @@
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(int, close, (int fd)) {
-  int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_close, fd);
-  if (ret < 0) {
-    libc_errno = -ret;
+  auto result = internal::close(fd);
+
+  if (!result.has_value()) {
+    libc_errno = result.error();
     return -1;
   }
-  return ret;
+  return result.value();
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 84a6b7d230442..7901de161b7ac 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -344,6 +344,21 @@ libc_support_library(
     hdrs = ["hdr/types/struct_epoll_event.h"],
 )
 
+libc_support_library(
+    name = "types_struct_f_owner_ex",
+    hdrs = ["hdr/types/struct_f_owner_ex.h"],
+)
+
+libc_support_library(
+    name = "types_struct_flock",
+    hdrs = ["hdr/types/struct_flock.h"],
+)
+
+libc_support_library(
+    name = "types_struct_flock64",
+    hdrs = ["hdr/types/struct_flock64.h"],
+)
+
 libc_support_library(
     name = "types_struct_timespec",
     hdrs = ["hdr/types/struct_timespec.h"],
@@ -1380,6 +1395,28 @@ libc_support_library(
     ],
 )
 
+libc_support_library(
+    name = "__support_osutil_fcntl",
+    srcs = ["src/__support/OSUtil/linux/fcntl.cpp"],
+    hdrs = ["src/__support/OSUtil/fcntl.h"],
+    target_compatible_with = select({
+        "@platforms//os:linux": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    deps = [
+        ":__support_common",
+        ":__support_error_or",
+        ":__support_osutil_syscall",
+        ":hdr_errno_macros",
+        ":hdr_fcntl_macros",
+        ":types_mode_t",
+        ":types_off_t",
+        ":types_struct_f_owner_ex",
+        ":types_struct_flock",
+        ":types_struct_flock64",
+    ],
+)
+
 libc_support_library(
     name = "__support_osutil_exit",
     srcs = ["src/__support/OSUtil/linux/exit.cpp"],
@@ -1601,8 +1638,8 @@ libc_support_library(
 libc_header_library(
     name = "libcxx_shared_headers",
     hdrs = [
-        "shared/libc_common.h",
         "shared/fp_bits.h",
+        "shared/libc_common.h",
         "shared/str_to_float.h",
         "shared/str_to_integer.h",
     ],
@@ -4475,13 +4512,28 @@ libc_function(
     }),
     deps = [
         ":__support_common",
-        ":__support_osutil_syscall",
+        ":__support_osutil_fcntl",
         ":errno",
         ":hdr_fcntl_macros",
         ":types_mode_t",
     ],
 )
 
+libc_function(
+    name = "fcntl",
+    srcs = ["src/fcntl/linux/fcntl.cpp"],
+    hdrs = ["src/fcntl/fcntl.h"],
+    target_compatible_with = select({
+        "@platforms//os:linux": [],
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    deps = [
+        ":__support_common",
+        ":__support_osutil_fcntl",
+        ":errno",
+    ],
+)
+
 libc_function(
     name = "openat",
     srcs = ["src/fcntl/linux/openat.cpp"],
@@ -4542,7 +4594,7 @@ libc_function(
     hdrs = ["src/unistd/close.h"],
     deps = [
         ":__support_common",
-        ":__support_osutil_syscall",
+        ":__support_osutil_fcntl",
         ":errno",
     ],
 )

From 5578bcbcfd25c797d4d14b8dfb3f83360712513d Mon Sep 17 00:00:00 2001
From: Chao Chen <chao.chen@intel.com>
Date: Fri, 13 Jun 2025 12:32:46 -0500
Subject: [PATCH 412/851] [mlir][xegpu] add support for structure control flow
 ops in workgroup to subgroup distribution (#142618)

This PR introduces support for `scf::ForOp`, `scf::WhileOp`, `scf::If`,
and `scf::Condition` within the workgroup-subgroup-distribution pass,
leveraging the `SCFStructuralTypeConversionsAndLegality`.
---
 .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h     |   3 +
 .../Transforms/XeGPUWgToSgDistribute.cpp      | 220 ++++++++++++++++--
 mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp   |   6 +-
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir |  98 +++++++-
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   | 134 ++++++++++-
 5 files changed, 430 insertions(+), 31 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
index f9327d63869c0..6fea10185402a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h
@@ -26,6 +26,9 @@ class TensorDescType;
 
 namespace xegpu {
 
+/// Flatten a set of ValueRange into a single SmallVector<Value>
+SmallVector<Value> flattenValues(ArrayRef<ValueRange> values);
+
 /// If tensor descriptor has a layout attribute it is used in SIMT mode.
 /// In this mode, the distributed vector shape is determined as follows:
 /// Definitions:
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 3bf76af674ba0..a26c6b52f0ddc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -13,9 +13,11 @@
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Transforms/DialectConversion.h"
 
 namespace mlir {
@@ -29,6 +31,29 @@ using namespace mlir;
 
 namespace {
 
+static std::pair<SmallVector<int64_t>, int>
+getSgShapeAndCount(ArrayRef<int64_t> shape, xegpu::LayoutAttr layout) {
+  int count = 1;
+  SmallVector<int64_t> sgShape(shape);
+
+  if (layout && layout.isWgLayout()) {
+    DenseI32ArrayAttr sgLayoutAttr = layout.getSgLayout();
+    auto sgLayout = llvm::to_vector_of<int64_t>(sgLayoutAttr.asArrayRef());
+    if (DenseI32ArrayAttr sgDataAttr = layout.getSgData())
+      sgShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
+    else
+      sgShape = computeShapeRatio(shape, sgLayout).value_or(sgShape);
+    SmallVector<int64_t> distUnit = computeElementwiseMul(sgLayout, sgShape);
+    // Clamp distUnit to the original shape to handle cases where data is
+    // shared among subgroups, which may cause distUnit to exceed the original
+    // shape.
+    for (size_t i = 0; i < distUnit.size(); ++i)
+      distUnit[i] = std::min(shape[i], distUnit[i]);
+    count = computeProduct(shape) / computeProduct(distUnit);
+  }
+  return std::make_pair(sgShape, count);
+}
+
 /// This pattern transforms the CreateNdDescOp to create a subgroup descriptor
 /// from a workgroup descriptor. It replaces the offsets and sizes with
 /// appropriate values for the subgroup.
@@ -129,18 +154,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
       return rewriter.notifyMatchFailure(
           op, "sgLayout attribute is required in layout");
 
-    SmallVector<int64_t> sgShape;
-    if (auto sgDataAttr = layout.getSgData()) {
-      sgShape = llvm::to_vector_of<int64_t>(sgDataAttr.asArrayRef());
-    } else {
-      assert(wgShape.size() == sgLayout.size() &&
-             "sgLayout and wgShape must have the same rank");
-      sgShape.reserve(wgShape.size());
-      for (size_t i = 0; i < wgShape.size(); ++i) {
-        assert(sgLayout[i] != 0 && "sgLayout elements must be non-zero");
-        sgShape.push_back(wgShape[i] / sgLayout[i]);
-      }
-    }
+    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
 
     // TODO : Handle order attribute
     // Get the subgroup ID
@@ -266,15 +280,15 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
     if (resultTy.getRank() != 2)
       return failure();
 
-    auto originalLayout =
-        llvm::dyn_cast_or_null<xegpu::LayoutAttr>(op->getAttr("layout"));
+    auto originalLayout = xegpu::getLayoutAttr(op.getResult());
     if (!originalLayout)
       return failure();
 
-    SmallVector<Value> newDpasOps;
     size_t i = 0;
+    SmallVector<Value> newDpasOps;
     for (auto aVec : adaptor.getLhs()) {
       for (auto bVec : adaptor.getRhs()) {
+
         llvm::SmallVector<Value> operands({aVec, bVec});
         Value tmpC;
         if (op.getAcc()) {
@@ -288,10 +302,10 @@ struct WgToSgDpasOp : public OpConversionPattern<xegpu::DpasOp> {
             llvm::cast<VectorType>(bVec.getType()).getShape();
         VectorType resTy = VectorType::get({aVecShape[0], bVecShape[1]},
                                            resultTy.getElementType());
-        tmpC = rewriter.create<xegpu::DpasOp>(
-            loc, resTy, operands,
-            llvm::ArrayRef<NamedAttribute>(
-                {"layout_result_0", originalLayout.dropSgLayoutAndData()}));
+        tmpC = rewriter.create<xegpu::DpasOp>(loc, resTy, operands);
+        xegpu::setLayoutAttr(cast<OpResult>(tmpC),
+                             originalLayout.dropSgLayoutAndData());
+
         newDpasOps.push_back(tmpC);
       }
     }
@@ -314,14 +328,90 @@ struct WgToSgPrefetchNdOp : public OpConversionPattern<xegpu::PrefetchNdOp> {
   }
 };
 
+// Handles UnrealizedConversionCastOp generated during
+// SCFStructuralTypeConversions (step 1). This op may appear as either a
+// target or source materialization for Vector values, e.g.:
+// 1. unrealized_cast %1 : vector<256xf32> to vector<16xf32>, ...
+// 2. unrealized_cast %1 : vector<16xf32>, ... to vector<256xf32>
+// it could be either 1:N or N:1 cast. In both cases, the pattern
+// simply forwards the inputs to the outputs using 1:1 or 1:N interface.
+// for example, the following scf::forOp
+// ```
+// %for = scf.for ... iter_args(%arg1 = %0)->(vector<128x128xf16>) {
+//     %n = use(%arg1): vector<128x128xf16>
+//     scf.yield %n : vector<128x128xf16>
+// }
+// ```
+// Could be converted to:
+// ```
+// %1 = unrealized_conversion_cast %0
+//          : vector<128x128xf16> to vector<16x16xf16>, vector<16x16xf16>
+// %for:2 = scf.for ... iter_args(%arg1 = %1#1, %arg2 = %1#2)
+//                    -> (vector<16x16xf16>, vector<16x16xf16) {
+//     %m = unrealized_conversion_cast %arg1, %arg2
+//            : vector<16x16xf16>, vector<16x16xf16> to vector<128x128xf16>
+//     %n = use(%m): vector<128x128xf16>
+//     %b = unrealized_conversion_cast %n
+//            : vector<128x128xf16> to vector<16x16xf16>, vector<16x16xf16>
+//     scf.yield %b#1, %b#2 : vector<16x16xf16>, vector<16x16xf16>
+// }
+// %cast = unrealized_conversion_cast %for:2
+//          : vector<16x16xf16>, vector<16x16xf16> to vector<128x128xf16>
+// ```
+// TODO: remove it when context-aware type converter is ready.
+struct UnrealizedConversionCastOpPattern
+    : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
+  using OpConversionPattern<
+      mlir::UnrealizedConversionCastOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(mlir::UnrealizedConversionCastOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    SmallVector<Value> inputs = xegpu::flattenValues(adaptor.getInputs());
+
+    auto inputTy = dyn_cast<VectorType>(inputs[0].getType());
+    auto outputTy = dyn_cast<VectorType>(op->getOpResult(0).getType());
+
+    if (!inputTy || !outputTy || !llvm::all_equal(op->getResultTypes()) ||
+        !llvm::all_equal(ValueRange(inputs).getTypes()))
+      return failure();
+
+    // Handles the case "cast %1 : vector<256xf32> to vector<16xf32>, ...".
+    // It is generated by source materialization (e.g., inits to scf forOp).
+    // The input values provided by the adaptor should already be distributed,
+    // and their types should correspond exactly to the result types of the
+    // operation.
+    if (op.getNumOperands() == 1 &&
+        llvm::equal(ValueRange(inputs).getTypes(), op->getResultTypes())) {
+      rewriter.replaceOp(op, inputs);
+      return success();
+    }
+
+    // Handles the case "cast %1 : vector<16xf32>, ... to vector<256xf32>".
+    // It is generated by target materialization (e.g., arguments/results
+    // of scf forOp). All input values must have the same vector type, and
+    // their shape must be evenly divisible by the output vector's shape
+    // (determined by the nature of the workgroup to subgroup distribution).
+    // TODO: it is not safe to do such forward, since such N:1 cast could be
+    // from others.
+    if (op.getNumResults() == 1 &&
+        computeShapeRatio(outputTy.getShape(), inputTy.getShape())) {
+      rewriter.replaceOpWithMultiple(op, {inputs});
+      return success();
+    }
+
+    return mlir::failure();
+  }
+};
+
 } // namespace
 
 namespace mlir {
 namespace xegpu {
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
   patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
-               WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp>(
-      patterns.getContext());
+               WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
+               UnrealizedConversionCastOpPattern>(patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -334,9 +424,68 @@ struct XeGPUWgToSgDistributePass
 } // namespace
 
 void XeGPUWgToSgDistributePass::runOnOperation() {
+  // Track existing UnrealizedConversionCastOps
+  SmallVector<Operation *> existingCastOps;
+  getOperation()->walk([&](UnrealizedConversionCastOp castOp) {
+    existingCastOps.push_back(castOp.getOperation());
+  });
+
+  {
+    // Step 1: Apply SCFStructuralTypeConversions to SCF operations with
+    // VectorType operands. This first converts such operands to
+    // RankedTensorType, propagates the layout attribute into the encoding
+    // attribute, and finally converts the RankedTensorType to VectorType based
+    // on the encoding.
+
+    TypeConverter converter;
+    converter.addConversion([&](Type type) -> Type { return type; });
+    converter.addConversion(
+        [&](RankedTensorType type,
+            SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+          Type elemTy = type.getElementType();
+          ArrayRef<int64_t> shape = type.getShape();
+
+          int count;
+          SmallVector<int64_t> subShape;
+          std::tie(subShape, count) = getSgShapeAndCount(
+              shape,
+              dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding()));
+
+          auto newTy = VectorType::get(subShape, elemTy);
+          result.append(count, newTy);
+          return success();
+        });
+
+    xegpu::doSCFStructuralTypeConversionWithTensorType(getOperation(),
+                                                       converter);
+  }
+
+  // Step 2: Perform workgroup to subgroup distribution for TensorDesc values,
+  // as well as XeGPU, Arith, and Vector operations.
   MLIRContext *ctx = &getContext();
   RewritePatternSet patterns(ctx);
   ConversionTarget target(*ctx);
+  TypeConverter converter;
+  converter.addConversion([&](Type type) -> Type { return type; });
+  converter.addConversion(
+      [&](xegpu::TensorDescType type,
+          SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+        Type elemTy = type.getElementType();
+        ArrayRef<int64_t> shape = type.getShape();
+
+        int count;
+        SmallVector<int64_t> subShape;
+        xegpu::LayoutAttr layout = type.getLayoutAttr();
+        std::tie(subShape, count) = getSgShapeAndCount(shape, layout);
+
+        if (layout)
+          layout = layout.dropSgLayoutAndData();
+
+        auto newTy = xegpu::TensorDescType::get(
+            type.getContext(), subShape, elemTy, type.getEncoding(), layout);
+        result.append(count, newTy);
+        return success();
+      });
 
   auto getTensorDescType = [](Operation *op) -> xegpu::TensorDescType {
     if (auto createOp = dyn_cast<xegpu::CreateNdDescOp>(op))
@@ -353,26 +502,49 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
   };
 
   auto isLegal = [&](xegpu::LayoutAttr layout) -> bool {
-    return !layout || layout.getSgLayout() == nullptr;
+    return !layout || !layout.isWgLayout();
   };
 
   target.addDynamicallyLegalOp<xegpu::CreateNdDescOp, xegpu::LoadNdOp,
                                xegpu::StoreNdOp, xegpu::UpdateNdOffsetOp,
                                xegpu::PrefetchNdOp>([=](Operation *op) -> bool {
     auto tdescTy = getTensorDescType(op);
-    auto layout = dyn_cast_or_null<xegpu::LayoutAttr>(tdescTy.getLayout());
+    auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(tdescTy.getLayout());
     return isLegal(layout);
   });
 
   target.addDynamicallyLegalOp<xegpu::DpasOp>([=](xegpu::DpasOp op) -> bool {
-    auto layout = dyn_cast_or_null<xegpu::LayoutAttr>(op->getAttr("layout"));
+    auto layout = xegpu::getLayoutAttr(op.getResult());
     return isLegal(layout);
   });
 
+  target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+      [=](UnrealizedConversionCastOp op) {
+        return llvm::is_contained(existingCastOps, op.getOperation());
+      });
+
   target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
 
+  scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
+                                                       target);
   xegpu::populateXeGPUWgToSgDistributePatterns(patterns);
   if (failed(
           applyPartialConversion(getOperation(), target, std::move(patterns))))
     return signalPassFailure();
+
+  // Remove sg_layout and sg_data attributes from the Layout
+  // attribute for each VectorType result of the operation.
+  // For Structured Control Flow ops, the layout is simply removed,
+  // since in 1:N case, the layout for new results are missing.
+  // Layout propagation pass will activated.
+  getOperation()->walk([](Operation *op) {
+    for (OpResult result : op->getOpResults()) {
+      std::string name = xegpu::getLayoutName(result);
+      if (auto layout = op->getAttrOfType<xegpu::LayoutAttr>(name)) {
+        op->removeAttr(name);
+        if (!isa<scf::IfOp, scf::ForOp, scf::WhileOp, scf::ConditionOp>(op))
+          op->setAttr(name, layout.dropSgLayoutAndData());
+      }
+    }
+  });
 }
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index dcaf4e85a82c5..6b85a66a8bd36 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -27,7 +27,7 @@
 using namespace mlir;
 
 /// convert ArrayRef<ValueRange> into SmallVector<Value>
-static SmallVector<Value> flattenValues(ArrayRef<ValueRange> values) {
+SmallVector<Value> xegpu::flattenValues(ArrayRef<ValueRange> values) {
   SmallVector<Value> result;
   for (const auto &vals : values)
     llvm::append_range(result, vals);
@@ -271,7 +271,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
       auto resultTy = dyn_cast<RankedTensorType>(result.getType());
 
       // Only look at ops casting from VectorType to RankedTensorType
-      if (!isa<VectorType>(inputTy) || !isa<RankedTensorType>(resultTy))
+      if (!inputTy || !resultTy)
         return WalkResult::skip();
 
       xegpu::LayoutAttr layout = xegpu::getLayoutAttr(input);
@@ -342,7 +342,7 @@ void xegpu::doSCFStructuralTypeConversionWithTensorType(
         }
 
         if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
-          SmallVector<Value> values = flattenValues(adaptor.getInputs());
+          SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
           auto newOp = rewriter.create<UnrealizedConversionCastOp>(
               op.getLoc(), outputTy, values);
           rewriter.replaceOp(op, newOp);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index bee026eb2084d..35ad16d8cd9a9 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -85,7 +85,7 @@ gpu.module @test_round_robin_assignment {
     %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<8x8xf32>
       -> !xegpu.tensor_desc<8x8xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     %dpas = xegpu.dpas %load_a, %load_b
-      {layout =  #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      {layout_result_0 =  #xegpu.layout<sg_layout = [2, 2], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<8x8xf32>, vector<8x8xf32> -> vector<8x8xf32>
     gpu.return
   }
@@ -102,4 +102,100 @@ gpu.module @test_round_robin_assignment {
       : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     gpu.return
   }
+
+  gpu.func @test_scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c1 = arith.constant 1 : index
+    %c10 = arith.constant 10 : index
+    %c0 = arith.constant 0 : index
+    %c256 = arith.constant 256 : index
+    %c1024 = arith.constant 1024 : index
+    %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %1 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    // CHECK-LABEL: scf.for
+    // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
+    %2:2 = scf.for %arg2 = %c0 to %c1024 step %c256 iter_args(%arg3 = %0, %arg4 = %1)
+        -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) {
+      %3 = xegpu.load_nd %0  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      xegpu.store_nd %3, %arg3  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %4 = xegpu.update_nd_offset %arg3, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %5 = xegpu.update_nd_offset %arg4, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
+      scf.yield %4, %5 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    }
+    gpu.return
+  }
+
+  gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+    %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    //CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32)
+    %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
+      %4 = arith.cmpi slt, %arg3, %c10_i32 : i32
+      //CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32
+      scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32
+    } do {
+    // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32)
+    ^bb0(%arg2: vector<256xf32>, %arg3: i32):
+      xegpu.store_nd %arg2, %2  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %4 = arith.addi %arg3, %c1_i32 : i32
+      %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      %6 = xegpu.load_nd %5  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      scf.yield %6, %4 : vector<256xf32>, i32
+    }
+    gpu.return
+  }
+
+  gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c10 = arith.constant 10 : index
+    %0 = gpu.subgroup_id : index
+    %1 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %3 = arith.cmpi eq, %0, %c10 : index
+    // CHECK-LABEL: scf.if
+    //  CHECK-SAME: (vector<16xf32>, vector<16xf32>)
+    %4 = scf.if %3 -> (vector<256xf32>) {
+      %5 = xegpu.load_nd %1  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: vector<16xf32>, vector<16xf32>
+      scf.yield %5 : vector<256xf32>
+    } else {
+      %5 = xegpu.load_nd %2  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: vector<16xf32>, vector<16xf32>
+      scf.yield %5 : vector<256xf32>
+    } {layout_result_0 = #xegpu.layout<sg_layout = [8], sg_data = [16]>}
+    xegpu.store_nd %4, %1  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    gpu.return
+  }
+
+  gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c10 = arith.constant 10 : index
+    %id = gpu.subgroup_id : index
+
+    %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32>
+
+    %0 = arith.cmpi eq, %id, %c10 : index
+    // CHECK-LABEL: scf.if
+    //  CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>)
+    %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) {
+      %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
+      scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    } else {
+      %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
+      scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    }
+    xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
+    gpu.return
+  }
+
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 7e89ada934071..466842c968448 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -5,7 +5,7 @@
 gpu.module @test_1_1_assignment {
   // CHECK-LABEL: test_create_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {  
+  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
   // CHECK: %[[SGID:.*]] = gpu.subgroup_id
   // CHECK: %[[C12:.*]] = arith.constant 12 : index
   // CHECK: %[[C4:.*]] = arith.constant 4 : index
@@ -108,7 +108,7 @@ gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
       : !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 12], lane_layout = [8, 2], lane_data = [1, 1]>>
       -> vector<32x24xf32>
     %dpas = xegpu.dpas %load_a, %load_b
-      {layout =  #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
+      {layout_result_0 =  #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
     gpu.return
   }
@@ -142,7 +142,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
       : !xegpu.tensor_desc<32x24xf32, #xegpu.layout<sg_layout = [4, 2], lane_layout = [8, 2], lane_data = [1, 1]>>
       -> vector<32x24xf32>
     %dpas = xegpu.dpas %load_a, %load_b
-      {layout =  #xegpu.layout<sg_layout = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      {layout_result_0 =  #xegpu.layout<sg_layout = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
       : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
     gpu.return
   }
@@ -169,4 +169,132 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
       : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32>
     gpu.return
   }
+
+  gpu.func @test_scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
+    //CHECK: [[c0:%.+]] = arith.constant 0 : index
+    //CHECK: [[c128:%.+]] = arith.constant 128 : index
+    //CHECK: [[c1024:%.+]] = arith.constant 1024 : index
+    %c0 = arith.constant 0 : index
+    %c128 = arith.constant 128 : index
+    %c1024 = arith.constant 1024 : index
+    %block_id_x = gpu.block_id  x
+    %block_id_y = gpu.block_id  y
+    %0 = arith.muli %block_id_x, %c128 : index
+    %1 = arith.muli %block_id_y, %c128 : index
+    %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+    %3 = xegpu.load_nd %2 : !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>> -> vector<128x128xf32>
+    %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
+    %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
+
+    //      CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]]
+    // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) ->
+    // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>)
+    //      CHECK: [[a:%.+]] = xegpu.load_nd [[arg4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16>
+    //      CHECK: [[b:%.+]] = xegpu.load_nd [[arg5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16>
+    //      CHECK: [[c:%.+]] = xegpu.dpas [[a]], [[b]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32>
+    //      CHECK: [[at:%.+]] = xegpu.update_nd_offset [[arg4]], [[[c0]], [[c128]]] : !xegpu.tensor_desc<16x128xf16>
+    //      CHECK: [[bt:%.+]] = xegpu.update_nd_offset [[arg5]], [[[c128]], [[c0]]] : !xegpu.tensor_desc<128x16xf16>
+    //      CHECK: scf.yield [[at]], [[bt]], [[c]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>
+    %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3)
+        -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
+            !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) {
+      %8 = xegpu.load_nd %arg4  : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> -> vector<128x128xf16>
+      %9 = xegpu.load_nd %arg5  : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> -> vector<128x128xf16>
+      %10 = xegpu.dpas %8, %9, %arg6 {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>}
+                          : vector<128x128xf16>, vector<128x128xf16>, vector<128x128xf32> -> vector<128x128xf32>
+      %11 = xegpu.update_nd_offset %arg4, [%c0, %c128] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>
+      %12 = xegpu.update_nd_offset %arg5, [%c128, %c0] : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>
+      scf.yield %11, %12, %10 : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>,
+                                !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>
+    }
+    %7 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32>
+            -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+    xegpu.store_nd %6#2, %7  : vector<128x128xf32>, !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16]>>
+    gpu.return
+  }
+
+  gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+    %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+
+    // CHECK: scf.while {{.*}} : (vector<16xf32>, i32) -> (vector<16xf32>, i32)
+    %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) {
+      %4 = arith.cmpi slt, %arg3, %c10_i32 : i32
+      // CHECK: scf.condition{{.*}} : vector<16xf32>, i32
+      scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32
+    } do {
+    // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: i32)
+    ^bb0(%arg2: vector<256xf32>, %arg3: i32):
+      xegpu.store_nd %arg2, %2  : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      %4 = arith.addi %arg3, %c1_i32 : i32
+      %5 = xegpu.update_nd_offset %0, [256] : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      %6 = xegpu.load_nd %5  : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      scf.yield %6, %4 : vector<256xf32>, i32
+    }
+    gpu.return
+  }
+
+  gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c10 = arith.constant 10 : index
+    %id = gpu.subgroup_id : index
+
+    %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    %1 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+
+    %4 = arith.cmpi eq, %id, %c10 : index
+    // CHECK-LABEL: scf.if
+    //  CHECK-SAME: (vector<16xf32>)
+    %5 = scf.if %4 -> (vector<256xf32>) {
+      // CHECK-LABEL: xegpu.load_nd
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %2 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: vector<16xf32>
+      scf.yield %2 : vector<256xf32>
+    } else {
+      // CHECK-LABEL: xegpu.load_nd
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32> -> vector<16xf32>
+      %3 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: vector<16xf32>
+      scf.yield %3 : vector<256xf32>
+    } {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [16]>}
+    xegpu.store_nd %5, %0 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    gpu.return
+  }
+
+  gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+    %c10 = arith.constant 10 : index
+    %id = gpu.subgroup_id : index
+
+    %t = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    %d = xegpu.load_nd %t : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> -> vector<256xf32>
+
+    %0 = arith.cmpi eq, %id, %c10 : index
+    // CHECK-LABEL: scf.if
+    //  CHECK-SAME: (!xegpu.tensor_desc<16xf32>)
+    %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>) {
+      // CHECK-LABEL: xegpu.create_nd_tdesc
+      //  CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
+      %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32>
+      scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    } else {
+      // CHECK-LABEL: xegpu.create_nd_tdesc
+      //  CHECK-SAME: memref<1024xf32> -> !xegpu.tensor_desc<16xf32>
+      %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+      // CHECK-LABEL: scf.yield
+      //  CHECK-SAME: !xegpu.tensor_desc<16xf32>
+      scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    }
+    xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>>
+    gpu.return
+  }
+
+
 }

From ecdb549e6de60b3211cfa860eec498270e3980f1 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Fri, 13 Jun 2025 10:36:09 -0700
Subject: [PATCH 413/851] [TableGen] Avoid evaluating RHS of a BinOp until
 short-circuit is complete (#144021)

This patch adds an even more aggressive short-circuit on `!and` and
`!or` that completely avoids the evaluation of RHS operand until short
circuiting decisions are made.
---
 llvm/lib/TableGen/Record.cpp     | 11 ++++++-----
 llvm/test/TableGen/true-false.td |  9 +++++++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 4c8b41237c604..7f2ed77a74099 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -1557,8 +1557,7 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
 }
 
 const Init *BinOpInit::resolveReferences(Resolver &R) const {
-  const Init *lhs = LHS->resolveReferences(R);
-  const Init *rhs = RHS->resolveReferences(R);
+  const Init *NewLHS = LHS->resolveReferences(R);
 
   unsigned Opc = getOpcode();
   if (Opc == AND || Opc == OR) {
@@ -1570,15 +1569,17 @@ const Init *BinOpInit::resolveReferences(Resolver &R) const {
     // limited version of short-circuit against all ones (`true` is casted
     // to 1 rather than all ones before we evaluate `!or`).
     if (const auto *LHSi = dyn_cast_or_null<IntInit>(
-            lhs->convertInitializerTo(IntRecTy::get(getRecordKeeper())))) {
+            NewLHS->convertInitializerTo(IntRecTy::get(getRecordKeeper())))) {
       if ((Opc == AND && !LHSi->getValue()) ||
           (Opc == OR && LHSi->getValue() == -1))
         return LHSi;
     }
   }
 
-  if (LHS != lhs || RHS != rhs)
-    return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))
+  const Init *NewRHS = RHS->resolveReferences(R);
+
+  if (LHS != NewLHS || RHS != NewRHS)
+    return (BinOpInit::get(getOpcode(), NewLHS, NewRHS, getType()))
         ->Fold(R.getCurrentRecord());
   return this;
 }
diff --git a/llvm/test/TableGen/true-false.td b/llvm/test/TableGen/true-false.td
index 5a59f20b21d25..5fa5702314489 100644
--- a/llvm/test/TableGen/true-false.td
+++ b/llvm/test/TableGen/true-false.td
@@ -67,13 +67,18 @@ def rec7 {
   bits<3> flags = { true, false, true };
 }
 
-// `!and` and `!or` should be short-circuit such that `!tail` on empty list will never
-// be evaluated.
+// `!and` and `!or` should be short-circuited such that any of the `!head` or
+// `!tail` on empty list below will never be evaluated.
 // CHECK: def rec8
+// CHECK:   bit v = 0;
+// CHECK:   int v2 = -1;
 // CHECK:   list<int> newSeq = [];
 // CHECK:   list<int> newSeq2 = [];
 
 class Foo <list<int> seq = []> {
+  bit v = !and(false, !head(seq));
+  int v2 = !or(-1, !head(seq));
+
   bit unresolved = !ne(!find(NAME, "BAR"), -1);
   list<int> newSeq  = !if(!and(false, unresolved), !tail(seq), seq);
   list<int> newSeq2 = !if(!or(-1, unresolved), seq, !tail(seq));

From 09c54c2e9e044fa0857831e6ce1bf77c8ce16ecc Mon Sep 17 00:00:00 2001
From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com>
Date: Fri, 13 Jun 2025 10:43:22 -0700
Subject: [PATCH 414/851] [IR2Vec] Minor vocab changes and exposing weights
 (#143200)

This PR changes some asserts in Vocab to hard checks that emit error and exposes flags and constructor to help in unit tests.

(Tracking issue - #141817)
---
 llvm/include/llvm/Analysis/IR2Vec.h    |  11 ++
 llvm/lib/Analysis/IR2Vec.cpp           |  82 +++++++++------
 llvm/unittests/Analysis/IR2VecTest.cpp | 137 ++++++++++++++++++-------
 3 files changed, 164 insertions(+), 66 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 8bf21b0e75d67..de67955d85d7c 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -31,7 +31,9 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/JSON.h"
 #include <map>
 
 namespace llvm {
@@ -43,6 +45,7 @@ class Function;
 class Type;
 class Value;
 class raw_ostream;
+class LLVMContext;
 
 /// IR2Vec computes two kinds of embeddings: Symbolic and Flow-aware.
 /// Symbolic embeddings capture the "syntactic" and "statistical correlation"
@@ -53,6 +56,11 @@ class raw_ostream;
 enum class IR2VecKind { Symbolic };
 
 namespace ir2vec {
+
+extern cl::opt<float> OpcWeight;
+extern cl::opt<float> TypeWeight;
+extern cl::opt<float> ArgWeight;
+
 /// Embedding is a datatype that wraps std::vector<double>. It provides
 /// additional functionality for arithmetic and comparison operations.
 /// It is meant to be used *like* std::vector<double> but is more restrictive
@@ -226,10 +234,13 @@ class IR2VecVocabResult {
 class IR2VecVocabAnalysis : public AnalysisInfoMixin<IR2VecVocabAnalysis> {
   ir2vec::Vocab Vocabulary;
   Error readVocabulary();
+  void emitError(Error Err, LLVMContext &Ctx);
 
 public:
   static AnalysisKey Key;
   IR2VecVocabAnalysis() = default;
+  explicit IR2VecVocabAnalysis(const ir2vec::Vocab &Vocab);
+  explicit IR2VecVocabAnalysis(ir2vec::Vocab &&Vocab);
   using Result = IR2VecVocabResult;
   Result run(Module &M, ModuleAnalysisManager &MAM);
 };
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 25ce35d4ace37..0f7303c1b0917 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -16,13 +16,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/JSON.h"
 #include "llvm/Support/MemoryBuffer.h"
 
 using namespace llvm;
@@ -33,6 +31,8 @@ using namespace ir2vec;
 STATISTIC(VocabMissCounter,
           "Number of lookups to entites not present in the vocabulary");
 
+namespace llvm {
+namespace ir2vec {
 static cl::OptionCategory IR2VecCategory("IR2Vec Options");
 
 // FIXME: Use a default vocab when not specified
@@ -40,18 +40,17 @@ static cl::opt<std::string>
     VocabFile("ir2vec-vocab-path", cl::Optional,
               cl::desc("Path to the vocabulary file for IR2Vec"), cl::init(""),
               cl::cat(IR2VecCategory));
-static cl::opt<float> OpcWeight("ir2vec-opc-weight", cl::Optional,
-                                cl::init(1.0),
-                                cl::desc("Weight for opcode embeddings"),
-                                cl::cat(IR2VecCategory));
-static cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional,
-                                 cl::init(0.5),
-                                 cl::desc("Weight for type embeddings"),
-                                 cl::cat(IR2VecCategory));
-static cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional,
-                                cl::init(0.2),
-                                cl::desc("Weight for argument embeddings"),
-                                cl::cat(IR2VecCategory));
+cl::opt<float> OpcWeight("ir2vec-opc-weight", cl::Optional, cl::init(1.0),
+                         cl::desc("Weight for opcode embeddings"),
+                         cl::cat(IR2VecCategory));
+cl::opt<float> TypeWeight("ir2vec-type-weight", cl::Optional, cl::init(0.5),
+                          cl::desc("Weight for type embeddings"),
+                          cl::cat(IR2VecCategory));
+cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional, cl::init(0.2),
+                         cl::desc("Weight for argument embeddings"),
+                         cl::cat(IR2VecCategory));
+} // namespace ir2vec
+} // namespace llvm
 
 AnalysisKey IR2VecVocabAnalysis::Key;
 
@@ -251,9 +250,9 @@ bool IR2VecVocabResult::invalidate(
 // by auto-generating a default vocabulary during the build time.
 Error IR2VecVocabAnalysis::readVocabulary() {
   auto BufOrError = MemoryBuffer::getFileOrSTDIN(VocabFile, /*IsText=*/true);
-  if (!BufOrError) {
+  if (!BufOrError)
     return createFileError(VocabFile, BufOrError.getError());
-  }
+
   auto Content = BufOrError.get()->getBuffer();
   json::Path::Root Path("");
   Expected<json::Value> ParsedVocabValue = json::parse(Content);
@@ -261,39 +260,60 @@ Error IR2VecVocabAnalysis::readVocabulary() {
     return ParsedVocabValue.takeError();
 
   bool Res = json::fromJSON(*ParsedVocabValue, Vocabulary, Path);
-  if (!Res) {
+  if (!Res)
     return createStringError(errc::illegal_byte_sequence,
                              "Unable to parse the vocabulary");
-  }
-  assert(Vocabulary.size() > 0 && "Vocabulary is empty");
+
+  if (Vocabulary.empty())
+    return createStringError(errc::illegal_byte_sequence,
+                             "Vocabulary is empty");
 
   unsigned Dim = Vocabulary.begin()->second.size();
-  assert(Dim > 0 && "Dimension of vocabulary is zero");
-  (void)Dim;
-  assert(std::all_of(Vocabulary.begin(), Vocabulary.end(),
-                     [Dim](const std::pair<StringRef, Embedding> &Entry) {
-                       return Entry.second.size() == Dim;
-                     }) &&
-         "All vectors in the vocabulary are not of the same dimension");
+  if (Dim == 0)
+    return createStringError(errc::illegal_byte_sequence,
+                             "Dimension of vocabulary is zero");
+
+  if (!std::all_of(Vocabulary.begin(), Vocabulary.end(),
+                   [Dim](const std::pair<StringRef, Embedding> &Entry) {
+                     return Entry.second.size() == Dim;
+                   }))
+    return createStringError(
+        errc::illegal_byte_sequence,
+        "All vectors in the vocabulary are not of the same dimension");
+
   return Error::success();
 }
 
+IR2VecVocabAnalysis::IR2VecVocabAnalysis(const Vocab &Vocabulary)
+    : Vocabulary(Vocabulary) {}
+
+IR2VecVocabAnalysis::IR2VecVocabAnalysis(Vocab &&Vocabulary)
+    : Vocabulary(std::move(Vocabulary)) {}
+
+void IR2VecVocabAnalysis::emitError(Error Err, LLVMContext &Ctx) {
+  handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
+    Ctx.emitError("Error reading vocabulary: " + EI.message());
+  });
+}
+
 IR2VecVocabAnalysis::Result
 IR2VecVocabAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
   auto Ctx = &M.getContext();
+  // FIXME: Scale the vocabulary once. This would avoid scaling per use later.
+  // If vocabulary is already populated by the constructor, use it.
+  if (!Vocabulary.empty())
+    return IR2VecVocabResult(std::move(Vocabulary));
+
+  // Otherwise, try to read from the vocabulary file.
   if (VocabFile.empty()) {
     // FIXME: Use default vocabulary
     Ctx->emitError("IR2Vec vocabulary file path not specified");
     return IR2VecVocabResult(); // Return invalid result
   }
   if (auto Err = readVocabulary()) {
-    handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) {
-      Ctx->emitError("Error reading vocabulary: " + EI.message());
-    });
+    emitError(std::move(Err), *Ctx);
     return IR2VecVocabResult();
   }
-  // FIXME: Scale the vocabulary here once. This would avoid scaling per use
-  // later.
   return IR2VecVocabResult(std::move(Vocabulary));
 }
 
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 053b9f75e7a66..90d07d080443b 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -281,25 +281,30 @@ TEST(IR2VecTest, IR2VecVocabResultValidity) {
   EXPECT_EQ(validResult.getDimension(), 2u);
 }
 
-// Helper to create a minimal function and embedder for getter tests
-struct GetterTestEnv {
-  Vocab V = {};
+// Fixture for IR2Vec tests requiring IR setup and weight management.
+class IR2VecTestFixture : public ::testing::Test {
+protected:
+  Vocab V;
   LLVMContext Ctx;
-  std::unique_ptr<Module> M = nullptr;
+  std::unique_ptr<Module> M;
   Function *F = nullptr;
   BasicBlock *BB = nullptr;
-  Instruction *Add = nullptr;
-  Instruction *Ret = nullptr;
-  std::unique_ptr<Embedder> Emb = nullptr;
+  Instruction *AddInst = nullptr;
+  Instruction *RetInst = nullptr;
 
-  GetterTestEnv() {
+  float OriginalOpcWeight = ::OpcWeight;
+  float OriginalTypeWeight = ::TypeWeight;
+  float OriginalArgWeight = ::ArgWeight;
+
+  void SetUp() override {
     V = {{"add", {1.0, 2.0}},
          {"integerTy", {0.5, 0.5}},
          {"constant", {0.2, 0.3}},
          {"variable", {0.0, 0.0}},
          {"unknownTy", {0.0, 0.0}}};
 
-    M = std::make_unique<Module>("M", Ctx);
+    // Setup IR
+    M = std::make_unique<Module>("TestM", Ctx);
     FunctionType *FTy = FunctionType::get(
         Type::getInt32Ty(Ctx), {Type::getInt32Ty(Ctx), Type::getInt32Ty(Ctx)},
         false);
@@ -308,61 +313,82 @@ struct GetterTestEnv {
     Argument *Arg = F->getArg(0);
     llvm::Value *Const = ConstantInt::get(Type::getInt32Ty(Ctx), 42);
 
-    Add = BinaryOperator::CreateAdd(Arg, Const, "add", BB);
-    Ret = ReturnInst::Create(Ctx, Add, BB);
+    AddInst = BinaryOperator::CreateAdd(Arg, Const, "add", BB);
+    RetInst = ReturnInst::Create(Ctx, AddInst, BB);
+  }
+
+  void setWeights(float OpcWeight, float TypeWeight, float ArgWeight) {
+    ::OpcWeight = OpcWeight;
+    ::TypeWeight = TypeWeight;
+    ::ArgWeight = ArgWeight;
+  }
 
-    auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
-    EXPECT_TRUE(static_cast<bool>(Result));
-    Emb = std::move(*Result);
+  void TearDown() override {
+    // Restore original global weights
+    ::OpcWeight = OriginalOpcWeight;
+    ::TypeWeight = OriginalTypeWeight;
+    ::ArgWeight = OriginalArgWeight;
   }
 };
 
-TEST(IR2VecTest, GetInstVecMap) {
-  GetterTestEnv Env;
-  const auto &InstMap = Env.Emb->getInstVecMap();
+TEST_F(IR2VecTestFixture, GetInstVecMap) {
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &InstMap = Emb->getInstVecMap();
 
   EXPECT_EQ(InstMap.size(), 2u);
-  EXPECT_TRUE(InstMap.count(Env.Add));
-  EXPECT_TRUE(InstMap.count(Env.Ret));
+  EXPECT_TRUE(InstMap.count(AddInst));
+  EXPECT_TRUE(InstMap.count(RetInst));
 
-  EXPECT_EQ(InstMap.at(Env.Add).size(), 2u);
-  EXPECT_EQ(InstMap.at(Env.Ret).size(), 2u);
+  EXPECT_EQ(InstMap.at(AddInst).size(), 2u);
+  EXPECT_EQ(InstMap.at(RetInst).size(), 2u);
 
   // Check values for add: {1.29, 2.31}
-  EXPECT_THAT(InstMap.at(Env.Add),
+  EXPECT_THAT(InstMap.at(AddInst),
               ElementsAre(DoubleNear(1.29, 1e-6), DoubleNear(2.31, 1e-6)));
 
   // Check values for ret: {0.0, 0.}; Neither ret nor voidTy are present in
   // vocab
-  EXPECT_THAT(InstMap.at(Env.Ret), ElementsAre(0.0, 0.0));
+  EXPECT_THAT(InstMap.at(RetInst), ElementsAre(0.0, 0.0));
 }
 
-TEST(IR2VecTest, GetBBVecMap) {
-  GetterTestEnv Env;
-  const auto &BBMap = Env.Emb->getBBVecMap();
+TEST_F(IR2VecTestFixture, GetBBVecMap) {
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &BBMap = Emb->getBBVecMap();
 
   EXPECT_EQ(BBMap.size(), 1u);
-  EXPECT_TRUE(BBMap.count(Env.BB));
-  EXPECT_EQ(BBMap.at(Env.BB).size(), 2u);
+  EXPECT_TRUE(BBMap.count(BB));
+  EXPECT_EQ(BBMap.at(BB).size(), 2u);
 
   // BB vector should be sum of add and ret: {1.29, 2.31} + {0.0, 0.0} =
   // {1.29, 2.31}
-  EXPECT_THAT(BBMap.at(Env.BB),
+  EXPECT_THAT(BBMap.at(BB),
               ElementsAre(DoubleNear(1.29, 1e-6), DoubleNear(2.31, 1e-6)));
 }
 
-TEST(IR2VecTest, GetBBVector) {
-  GetterTestEnv Env;
-  const auto &BBVec = Env.Emb->getBBVector(*Env.BB);
+TEST_F(IR2VecTestFixture, GetBBVector) {
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &BBVec = Emb->getBBVector(*BB);
 
   EXPECT_EQ(BBVec.size(), 2u);
   EXPECT_THAT(BBVec,
               ElementsAre(DoubleNear(1.29, 1e-6), DoubleNear(2.31, 1e-6)));
 }
 
-TEST(IR2VecTest, GetFunctionVector) {
-  GetterTestEnv Env;
-  const auto &FuncVec = Env.Emb->getFunctionVector();
+TEST_F(IR2VecTestFixture, GetFunctionVector) {
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &FuncVec = Emb->getFunctionVector();
 
   EXPECT_EQ(FuncVec.size(), 2u);
 
@@ -371,4 +397,45 @@ TEST(IR2VecTest, GetFunctionVector) {
               ElementsAre(DoubleNear(1.29, 1e-6), DoubleNear(2.31, 1e-6)));
 }
 
+TEST_F(IR2VecTestFixture, GetFunctionVectorWithCustomWeights) {
+  setWeights(1.0, 1.0, 1.0);
+
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  ASSERT_TRUE(static_cast<bool>(Result));
+  auto Emb = std::move(*Result);
+
+  const auto &FuncVec = Emb->getFunctionVector();
+
+  EXPECT_EQ(FuncVec.size(), 2u);
+
+  // Expected: 1*([1.0 2.0] + [0.0 0.0]) + 1*([0.5 0.5] + [0.0 0.0]) + 1*([0.2
+  // 0.3] + [0.0 0.0])
+  EXPECT_THAT(FuncVec,
+              ElementsAre(DoubleNear(1.7, 1e-6), DoubleNear(2.8, 1e-6)));
+}
+
+TEST(IR2VecTest, IR2VecVocabAnalysisWithPrepopulatedVocab) {
+  Vocab InitialVocab = {{"key1", {1.1, 2.2}}, {"key2", {3.3, 4.4}}};
+  Vocab ExpectedVocab = InitialVocab;
+  unsigned ExpectedDim = InitialVocab.begin()->second.size();
+
+  IR2VecVocabAnalysis VocabAnalysis(std::move(InitialVocab));
+
+  LLVMContext TestCtx;
+  Module TestMod("TestModuleForVocabAnalysis", TestCtx);
+  ModuleAnalysisManager MAM;
+  IR2VecVocabResult Result = VocabAnalysis.run(TestMod, MAM);
+
+  EXPECT_TRUE(Result.isValid());
+  ASSERT_FALSE(Result.getVocabulary().empty());
+  EXPECT_EQ(Result.getDimension(), ExpectedDim);
+
+  const auto &ResultVocab = Result.getVocabulary();
+  EXPECT_EQ(ResultVocab.size(), ExpectedVocab.size());
+  for (const auto &pair : ExpectedVocab) {
+    EXPECT_TRUE(ResultVocab.count(pair.first));
+    EXPECT_THAT(ResultVocab.at(pair.first), ElementsAreArray(pair.second));
+  }
+}
+
 } // end anonymous namespace

From 9d49b82de077c730d687593604dfa00770f11965 Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig <naveen.hanig@outlook.com>
Date: Fri, 13 Jun 2025 19:48:05 +0200
Subject: [PATCH 415/851] [clang-scan-deps] Implement P2223R2 for
 DependencyDirectiveScanner.cpp (#143950)

P2223R2 allows the line-continuation slash `\` to be followed by
additional whitespace. The Clang lexer already follows this behavior,
also for versions prior to C++23. The dependency directive scanner
however only implements it for `#define` directives (15d5f5d).

This fully implements P2223R2 for the dependency directive scanner (for
any C++ standard) and aligns the dependency directive scanner's splicing
behavior with that of the Clang lexer.

For example, the following code was previously not scanned correctly by
`clang-scan-deps` but now works as expected:

```cpp
import \<whitespace here>
A;
```
---
 clang/lib/Lex/DependencyDirectivesScanner.cpp | 32 +++++--
 .../Lex/DependencyDirectivesScannerTest.cpp   | 91 +++++++++++++++++++
 2 files changed, 113 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 4606b85d42fe7..1b6b16c561141 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -323,10 +323,6 @@ static unsigned skipNewline(const char *&First, const char *End) {
   return Len;
 }
 
-static bool wasLineContinuation(const char *First, unsigned EOLLen) {
-  return *(First - (int)EOLLen - 1) == '\\';
-}
-
 static void skipToNewlineRaw(const char *&First, const char *const End) {
   for (;;) {
     if (First == End)
@@ -336,13 +332,16 @@ static void skipToNewlineRaw(const char *&First, const char *const End) {
     if (Len)
       return;
 
+    char LastNonWhitespace = ' ';
     do {
+      if (!isHorizontalWhitespace(*First))
+        LastNonWhitespace = *First;
       if (++First == End)
         return;
       Len = isEOL(First, End);
     } while (!Len);
 
-    if (First[-1] != '\\')
+    if (LastNonWhitespace != '\\')
       return;
 
     First += Len;
@@ -394,6 +393,7 @@ static bool isQuoteCppDigitSeparator(const char *const Start,
 }
 
 void Scanner::skipLine(const char *&First, const char *const End) {
+  char LastNonWhitespace = ' ';
   for (;;) {
     assert(First <= End);
     if (First == End)
@@ -419,6 +419,8 @@ void Scanner::skipLine(const char *&First, const char *const End) {
       // Iterate over comments correctly.
       if (*First != '/' || End - First < 2) {
         LastTokenPtr = First;
+        if (!isWhitespace(*First))
+          LastNonWhitespace = *First;
         ++First;
         continue;
       }
@@ -431,6 +433,8 @@ void Scanner::skipLine(const char *&First, const char *const End) {
 
       if (First[1] != '*') {
         LastTokenPtr = First;
+        if (!isWhitespace(*First))
+          LastNonWhitespace = *First;
         ++First;
         continue;
       }
@@ -442,8 +446,9 @@ void Scanner::skipLine(const char *&First, const char *const End) {
       return;
 
     // Skip over the newline.
-    unsigned Len = skipNewline(First, End);
-    if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
+    skipNewline(First, End);
+
+    if (LastNonWhitespace != '\\')
       break;
   }
 }
@@ -468,9 +473,16 @@ static void skipWhitespace(const char *&First, const char *const End) {
     if (End - First < 2)
       return;
 
-    if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
-      skipNewline(++First, End);
-      continue;
+    if (*First == '\\') {
+      const char *Ptr = First + 1;
+      while (Ptr < End && isHorizontalWhitespace(*Ptr))
+        ++Ptr;
+      if (Ptr != End && isVerticalWhitespace(*Ptr)) {
+        skipNewline(Ptr, End);
+        First = Ptr;
+        continue;
+      }
+      return;
     }
 
     // Check for a non-comment character.
diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
index 377c066f031d3..61f74929c1e98 100644
--- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
+++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp
@@ -789,6 +789,97 @@ TEST(MinimizeSourceToDependencyDirectivesTest,
                Out.data());
 }
 
+TEST(MinimizeSourceToDependencyDirectivesTest,
+     WhitespaceAfterLineContinuationSlashLineComment) {
+  SmallVector<char, 128> Out;
+
+  ASSERT_FALSE(minimizeSourceToDependencyDirectives("// some comment \\  \n"
+                                                    "module A;\n",
+                                                    Out));
+  EXPECT_STREQ("", Out.data());
+}
+
+TEST(MinimizeSourceToDependencyDirectivesTest,
+     WhitespaceAfterLineContinuationSlashAllDirectives) {
+  SmallVector<char, 512> Out;
+  SmallVector<dependency_directives_scan::Token, 16> Tokens;
+  SmallVector<Directive, 16> Directives;
+
+  StringRef Input = "#define \\   \n"
+                    "A\n"
+                    "#undef\t\\   \n"
+                    "A\n"
+                    "#endif \\\t\t\n"
+                    "\n"
+                    "#if \\     \t\n"
+                    "A\n"
+                    "#ifdef\t\\   \n"
+                    "A\n"
+                    "#ifndef \\ \t\n"
+                    "A\n"
+                    "#elifdef \\  \n"
+                    "A\n"
+                    "#elifndef \\ \n"
+                    "A\n"
+                    "#elif \\\t\t \n"
+                    "A\n"
+                    "#else \\\t \t\n"
+                    "\n"
+                    "#include \\  \n"
+                    "<A>\n"
+                    "#include_next \\    \n"
+                    "<A>\n"
+                    "#__include_macros\\ \n"
+                    "<A>\n"
+                    "#import \\ \t\n"
+                    "<A>\n"
+                    "@import \\\t \n"
+                    "A;\n"
+                    "#pragma clang \\   \n"
+                    "module \\    \n"
+                    "import A\n"
+                    "#pragma \\   \n"
+                    "push_macro(A)\n"
+                    "#pragma \\\t \n"
+                    "pop_macro(A)\n"
+                    "#pragma \\   \n"
+                    "include_alias(<A>,\\ \n"
+                    "<B>)\n"
+                    "export \\    \n"
+                    "module m;\n"
+                    "import\t\\\t \n"
+                    "m;\n"
+                    "#pragma\t\\  \n"
+                    "clang\t\\  \t\n"
+                    "system_header\n";
+  ASSERT_FALSE(
+      minimizeSourceToDependencyDirectives(Input, Out, Tokens, Directives));
+
+  EXPECT_EQ(pp_define, Directives[0].Kind);
+  EXPECT_EQ(pp_undef, Directives[1].Kind);
+  EXPECT_EQ(pp_endif, Directives[2].Kind);
+  EXPECT_EQ(pp_if, Directives[3].Kind);
+  EXPECT_EQ(pp_ifdef, Directives[4].Kind);
+  EXPECT_EQ(pp_ifndef, Directives[5].Kind);
+  EXPECT_EQ(pp_elifdef, Directives[6].Kind);
+  EXPECT_EQ(pp_elifndef, Directives[7].Kind);
+  EXPECT_EQ(pp_elif, Directives[8].Kind);
+  EXPECT_EQ(pp_else, Directives[9].Kind);
+  EXPECT_EQ(pp_include, Directives[10].Kind);
+  EXPECT_EQ(pp_include_next, Directives[11].Kind);
+  EXPECT_EQ(pp___include_macros, Directives[12].Kind);
+  EXPECT_EQ(pp_import, Directives[13].Kind);
+  EXPECT_EQ(decl_at_import, Directives[14].Kind);
+  EXPECT_EQ(pp_pragma_import, Directives[15].Kind);
+  EXPECT_EQ(pp_pragma_push_macro, Directives[16].Kind);
+  EXPECT_EQ(pp_pragma_pop_macro, Directives[17].Kind);
+  EXPECT_EQ(pp_pragma_include_alias, Directives[18].Kind);
+  EXPECT_EQ(cxx_export_module_decl, Directives[19].Kind);
+  EXPECT_EQ(cxx_import_decl, Directives[20].Kind);
+  EXPECT_EQ(pp_pragma_system_header, Directives[21].Kind);
+  EXPECT_EQ(pp_eof, Directives[22].Kind);
+}
+
 TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) {
   SmallVector<char, 128> Out;
 

From 92a116c4ef822950f8c57eaa5164c844c73a1f7e Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Fri, 13 Jun 2025 10:48:34 -0700
Subject: [PATCH 416/851] Revert "Fix/reapply "[libc] Migrate stdio tests to
 ErrnoCheckingTest."" (#144129)

Reverts llvm/llvm-project#143972 - matcher seems to be pedantic for
fgets tests, reverting to verify and fix.
---
 libc/test/src/stdio/CMakeLists.txt           | 13 ------------
 libc/test/src/stdio/fdopen_test.cpp          | 14 +++++++------
 libc/test/src/stdio/fgetc_test.cpp           | 22 +++++++++-----------
 libc/test/src/stdio/fgetc_unlocked_test.cpp  | 22 +++++++++-----------
 libc/test/src/stdio/fgets_test.cpp           | 18 +++++++---------
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++++++++++-----
 libc/test/src/stdio/fopencookie_test.cpp     | 15 ++++++-------
 libc/test/src/stdio/remove_test.cpp          | 10 ++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 ++++----
 libc/test/src/stdio/setvbuf_test.cpp         |  9 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 ++++---
 libc/test/src/stdlib/StrtolTest.h            |  1 +
 libc/test/src/stdlib/strtold_test.cpp        |  1 +
 13 files changed, 77 insertions(+), 84 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 4aa8b95880018..ce2171f19597b 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,7 +20,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -69,7 +68,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -90,7 +88,6 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
-    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -112,7 +109,6 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -442,7 +438,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -457,7 +452,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -474,7 +468,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
-      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -495,8 +488,6 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
-    libc.test.UnitTest.ErrnoCheckingTest
-    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -519,8 +510,6 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
-    libc.test.UnitTest.ErrnoCheckingTest
-    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -538,8 +527,6 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
-    libc.test.UnitTest.ErrnoCheckingTest
-    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index b53184c30be36..104fc478b100e 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,21 +9,20 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -53,7 +52,8 @@ TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
+TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,7 +64,8 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
+TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
+  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -82,6 +83,7 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
+  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index be2e50271b510..56bde5f0099a8 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,15 +14,12 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
-#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
-
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -30,28 +27,29 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
-                Succeeds(WRITE_SIZE));
+    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
     // This is a write-only file so reads should fail.
-    ASSERT_THAT(func(file), Fails(EBADF, EOF));
+    ASSERT_EQ(func(file), EOF);
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    libc_errno = 0;
 
-    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
+      int c = func(file);
+      ASSERT_EQ(c, int('1' + i));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_THAT(func(file), Succeeds(EOF));
+    ASSERT_EQ(func(file), EOF);
     ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
   }
 };
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index bef9dafd3d87c..90429ecf4e82b 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,15 +17,12 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
-#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
-
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -33,30 +30,31 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
-                Succeeds(WRITE_SIZE));
+    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
     // This is a write-only file so reads should fail.
-    ASSERT_THAT(func(file), Fails(EBADF, EOF));
+    ASSERT_EQ(func(file), EOF);
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+    libc_errno = 0;
 
-    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     LIBC_NAMESPACE::flockfile(file);
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
+      int c = func(file);
+      ASSERT_EQ(c, int('1' + i));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_THAT(func(file), Succeeds(EOF));
+    ASSERT_EQ(func(file), EOF);
     ASSERT_NE(LIBC_NAMESPACE::feof_unlocked(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(file), 0);
 
     LIBC_NAMESPACE::funlockfile(file);
-    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
   }
 };
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index ca8d4d4546635..abed3d4052939 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,14 +12,11 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
-#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+#include "src/__support/libc_errno.h"
 
-TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -32,15 +29,15 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   char buff[8];
   char *output;
 
-  ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
-              Succeeds(WRITE_SIZE));
+  ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
   // This is a write-only file so reads should fail.
-  ASSERT_THAT(LIBC_NAMESPACE::fgets(buff, 8, file), Fails(EBADF, nullptr));
+  ASSERT_TRUE(LIBC_NAMESPACE::fgets(buff, 8, file) == nullptr);
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
-  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
   file = LIBC_NAMESPACE::fopen(FILENAME, "r");
   ASSERT_FALSE(file == nullptr);
@@ -58,7 +55,6 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is also implementation defined.
   output = LIBC_NAMESPACE::fgets(buff, 0, file);
   ASSERT_TRUE(output == nullptr);
-  ASSERT_ERRNO_SUCCESS();
 #endif
 
   const char *output_arr[] = {
@@ -90,5 +86,5 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_ERRNO_SUCCESS();
 
-  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
+  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 }
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e097785832d56..e624181c795b8 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,18 +17,17 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
+TEST(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -42,6 +41,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,6 +72,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -79,12 +80,15 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
+  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -99,8 +103,10 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
+  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
+  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -115,18 +121,21 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
+  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
+  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST_F(LlvmLibcFILETest, FFlush) {
+TEST(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -147,7 +156,7 @@ TEST_F(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -156,6 +165,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
+  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index bcf5e674141a7..03e1ac286b646 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,7 +15,6 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -23,7 +22,6 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
-using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -90,7 +88,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -117,6 +115,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -125,7 +124,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -150,6 +149,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
+TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,6 +178,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -191,7 +192,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
+TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -222,7 +223,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
+TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 296bff1f5dc15..84984e26398c0 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,17 +11,16 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
+#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -37,9 +36,10 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index 135fb98c07fbb..ac494a4ecaf8e 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,19 +8,18 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
+#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
+TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
+  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -41,7 +40,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
+TEST(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index a0936ba79ef73..5872943c1bb41 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,14 +11,12 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
+#include "src/__support/libc_errno.h"
 
-using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
-
-TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
+TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -54,7 +52,7 @@ TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -104,5 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
+  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index e99b382d12112..5d482b70064bd 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,12 +15,11 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
-#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+#include "src/__support/libc_errno.h"
 
-TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -37,6 +36,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,6 +57,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
+  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 03f0a6539c785..3eeccc5727e77 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,6 +9,7 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index eb4056dc7ba64..c2f2b9c9a11c3 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 452276ecc0f5d1cb9bf5e1655e422a68eafdb8b9 Mon Sep 17 00:00:00 2001
From: Michael Jones <michaelrj@google.com>
Date: Fri, 13 Jun 2025 11:00:08 -0700
Subject: [PATCH 417/851] [libc] Fix missing errno include in fuzzer (#144132)

The printf parser uses errno for setting up the %m conversion. It was
presumably getting this include indirectly until a recent change. This
patch adds a direct dependency to fix it.
---
 libc/fuzzing/stdio/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libc/fuzzing/stdio/CMakeLists.txt b/libc/fuzzing/stdio/CMakeLists.txt
index 8f89baa702000..401785a30469c 100644
--- a/libc/fuzzing/stdio/CMakeLists.txt
+++ b/libc/fuzzing/stdio/CMakeLists.txt
@@ -4,6 +4,7 @@ add_libc_fuzzer(
     printf_parser_fuzz.cpp
   DEPENDS
     libc.src.stdio.printf_core.parser
+    libc.src.errno.errno # needed for the strerror conversion
 )
 
 add_libc_fuzzer(

From 0c7ce6883a04dadd9daf0d41cba58c2f9eec19ad Mon Sep 17 00:00:00 2001
From: Charitha Saumya <136391709+charithaintc@users.noreply.github.com>
Date: Fri, 13 Jun 2025 11:02:05 -0700
Subject: [PATCH 418/851] Revert "[mlir][vector] Fix for WarpOpScfForOp failure
 when scf.for has results that are unused." (#144124)

Reverts llvm/llvm-project#141853

Reverting the bug fix because it does not handle all cases correctly.
---
 .../Vector/Transforms/VectorDistribute.cpp    | 39 +++++--------------
 .../Vector/vector-warp-distribute.mlir        | 36 -----------------
 2 files changed, 10 insertions(+), 65 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 52a9cedb43cc0..045c192787f10 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1554,36 +1554,22 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
     llvm::SmallSetVector<Value, 32> escapingValues;
     SmallVector<Type> inputTypes;
     SmallVector<Type> distTypes;
-    auto collectEscapingValues = [&](Value value) {
-      if (!escapingValues.insert(value))
-        return;
-      Type distType = value.getType();
-      if (auto vecType = dyn_cast<VectorType>(distType)) {
-        AffineMap map = distributionMapFn(value);
-        distType = getDistributedType(vecType, map, warpOp.getWarpSize());
-      }
-      inputTypes.push_back(value.getType());
-      distTypes.push_back(distType);
-    };
-
     mlir::visitUsedValuesDefinedAbove(
         forOp.getBodyRegion(), [&](OpOperand *operand) {
           Operation *parent = operand->get().getParentRegion()->getParentOp();
           if (warpOp->isAncestor(parent)) {
-            collectEscapingValues(operand->get());
+            if (!escapingValues.insert(operand->get()))
+              return;
+            Type distType = operand->get().getType();
+            if (auto vecType = dyn_cast<VectorType>(distType)) {
+              AffineMap map = distributionMapFn(operand->get());
+              distType = getDistributedType(vecType, map, warpOp.getWarpSize());
+            }
+            inputTypes.push_back(operand->get().getType());
+            distTypes.push_back(distType);
           }
         });
 
-    // Any forOp result that is not already yielded by the warpOp
-    // region is also considered escaping and must be returned by the
-    // original warpOp.
-    for (OpResult forResult : forOp.getResults()) {
-      // Check if this forResult is already yielded by the yield op.
-      if (llvm::is_contained(yield->getOperands(), forResult))
-        continue;
-      collectEscapingValues(forResult);
-    }
-
     if (llvm::is_contained(distTypes, Type{}))
       return failure();
 
@@ -1623,12 +1609,7 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
                                     forOp.getResultTypes().end());
     llvm::SmallDenseMap<Value, int64_t> argIndexMapping;
     for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) {
-      auto newWarpResult = newWarpOp.getResult(retIdx);
-      // Unused forOp results yielded by the warpOp region are already included
-      // in the new ForOp.
-      if (llvm::is_contained(newOperands, newWarpResult))
-        continue;
-      warpInput.push_back(newWarpResult);
+      warpInput.push_back(newWarpOp.getResult(retIdx));
       argIndexMapping[escapingValues[i]] = warpInputType.size();
       warpInputType.push_back(inputTypes[i]);
     }
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 6c7ac7a5196a7..38771f2593449 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -584,42 +584,6 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2
   return
 }
 
-// -----
-// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_yield(
-//       CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
-//       CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32>
-//       CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
-//       CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32>
-//       CHECK-PROP: }
-//       CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) {
-//       CHECK-PROP: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
-//       CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32>
-//       CHECK-PROP: %[[ACC1:.*]] = "some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32>
-//       CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<128xf32>, vector<128xf32>
-//       CHECK-PROP: }
-//       CHECK-PROP: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<4xf32>, vector<4xf32>
-//       CHECK-PROP: }
-//       CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> ()
-func.func @warp_scf_for_unused_yield(%arg0: index) {
-  %c128 = arith.constant 128 : index
-  %c1 = arith.constant 1 : index
-  %c0 = arith.constant 0 : index
-  %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
-    %ini = "some_def"() : () -> (vector<128xf32>)
-    %ini1 = "some_def"() : () -> (vector<128xf32>)
-    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini, %arg5 = %ini1) -> (vector<128xf32>, vector<128xf32>) {
-      %add = arith.addi %arg3, %c1 : index
-      %1  = "some_def"(%arg5, %add) : (vector<128xf32>, index) -> (vector<128xf32>)
-      %acc = "some_def"(%add, %arg4, %1) : (index, vector<128xf32>, vector<128xf32>) -> (vector<128xf32>)
-      scf.yield %acc, %1 : vector<128xf32>, vector<128xf32>
-    }
-    gpu.yield %3#0 : vector<128xf32>
-  }
-  "some_use"(%0) : (vector<4xf32>) -> ()
-  return
-}
-
-
 // -----
 
 // CHECK-PROP-LABEL: func @vector_reduction(

From f82cf7442029d3376813db82eca60800e999bfb9 Mon Sep 17 00:00:00 2001
From: Artem Gindinson <gindinson@roofline.ai>
Date: Fri, 13 Jun 2025 20:03:24 +0200
Subject: [PATCH 419/851] =?UTF-8?q?[mlir][tensor]=20Fix=20`getReassociatio?=
 =?UTF-8?q?nForCollapse`=20for=20tensor/scalar=20re=E2=80=A6=20(#144118)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…shapes

Commit 6e5a142 changed the behavior of the function when computing
reassociations between tensors (consisting of unit/dynamic dimensions)
and scalars/0d vectors. The IR representation for such reshapes actually
expects an empty reassociation, like so:
```
func.func @example(%arg0 : tensor<?x?x?xf32>) -> tensor<f32> {
  %0 = tensor.collapse_shape %arg0 [] : tensor<?x?x?xf32> into tensor<f32>
}
```

Restore the original behavior - the routine should resort to reporting
failures when compile time-known non-unit dimensions are part of the
attempted reassociation.

Signed-off-by: Artem Gindinson <gindinson@roofline.ai>
---
 mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp           | 10 ++++------
 mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp |  8 ++++----
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
index 3b1fdb69e8ef1..aa566c0086a2f 100644
--- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
+++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp
@@ -299,19 +299,17 @@ mlir::getReassociationIndicesForCollapse(ArrayRef<int64_t> sourceShape,
   // this utility).
   if (numSourceDims <= numTargetDims)
     return std::nullopt;
-  // Early handling for scalar target types.
+  // Early handling for scalar target types. We should report an invalid
+  // reassociation for non-unit static dimensions - no chance to collapse these
+  // into a scalar.
   if (numTargetDims == 0) {
-    ReassociationIndices allSourceIndices;
-    allSourceIndices.reserve(numSourceDims);
     for (unsigned sourceDimIdx = 0; sourceDimIdx < numSourceDims;
          ++sourceDimIdx) {
       int64_t sourceSize = sourceShape[sourceDimIdx];
-      // All source dimensions must be unit or dynamic.
       if (sourceSize != 1 && sourceSize != ShapedType::kDynamic)
         return std::nullopt;
-      allSourceIndices.push_back(sourceDimIdx);
     }
-    return SmallVector<ReassociationIndices>{allSourceIndices};
+    return SmallVector<ReassociationIndices>{};
   }
 
   // Collect source ranges by iterating over the target shape left-to-right.
diff --git a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
index db1a87a4de2d5..05f97e875e2dc 100644
--- a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
+++ b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp
@@ -23,16 +23,16 @@ makeOptionalIndices(std::initializer_list<ReassociationIndices> list) {
 
 TEST(ReassociationIndicesForCollapse, ScalarTest) {
   EXPECT_EQ(getReassociationIndicesForCollapse({1}, {}),
-            makeOptionalIndices({{0}}));
+            makeOptionalIndices({}));
   EXPECT_EQ(getReassociationIndicesForCollapse({1, 1}, {}),
-            makeOptionalIndices({{0, 1}}));
+            makeOptionalIndices({}));
   EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic}, {}),
-            makeOptionalIndices({{0}}));
+            makeOptionalIndices({}));
   EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic,
                                                 ShapedType::kDynamic, 1,
                                                 ShapedType::kDynamic},
                                                {}),
-            makeOptionalIndices({{0, 1, 2, 3, 4}}));
+            makeOptionalIndices({}));
 }
 
 TEST(ReassociationIndicesForCollapse, ScalarTestFailure) {

From 52d34865b9db3485c8a671a88cc571270349f720 Mon Sep 17 00:00:00 2001
From: FYK <fanju110@163.com>
Date: Sat, 14 Jun 2025 02:05:16 +0800
Subject: [PATCH 420/851] Fix and reapply IR PGO support for Flang (#142892)

This PR resubmits the changes from #136098, which was previously
reverted due to a build failure during the linking stage:

```
undefined reference to `llvm::DebugInfoCorrelate'
undefined reference to `llvm::ProfileCorrelate'
```

The root cause was that `llvm/lib/Frontend/Driver/CodeGenOptions.cpp`
references symbols from the `Instrumentation` component, but the
`LINK_COMPONENTS` in the `llvm/lib/Frontend/CMakeLists.txt` for
`LLVMFrontendDriver` did not include it. As a result, linking failed in
configurations where these components were not transitively linked.

### Fix:

This updated patch explicitly adds `Instrumentation` to
`LINK_COMPONENTS` in the relevant `llvm/lib/Frontend/CMakeLists.txt`
file to ensure the required symbols are properly resolved.

---------

Co-authored-by: ict-ql <168183727+ict-ql@users.noreply.github.com>
Co-authored-by: Chyaka <52224511+liliumshade@users.noreply.github.com>
Co-authored-by: Tarun Prabhu <tarunprabhu@gmail.com>
---
 clang/include/clang/Basic/CodeGenOptions.def  |  6 ++-
 clang/include/clang/Basic/CodeGenOptions.h    | 32 +++++++---------
 clang/include/clang/Basic/ProfileList.h       |  9 ++---
 clang/include/clang/Driver/Options.td         |  6 +--
 clang/lib/Basic/ProfileList.cpp               | 22 +++++------
 clang/lib/CodeGen/BackendUtil.cpp             |  9 +----
 clang/lib/CodeGen/CodeGenAction.cpp           |  4 +-
 clang/lib/CodeGen/CodeGenFunction.cpp         |  3 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |  2 +-
 clang/lib/Driver/ToolChains/Flang.cpp         |  4 ++
 clang/lib/Frontend/CompilerInvocation.cpp     |  6 +--
 .../include/flang/Frontend/CodeGenOptions.def |  7 ++++
 flang/include/flang/Frontend/CodeGenOptions.h | 38 +++++++++++++++++++
 flang/lib/Frontend/CompilerInvocation.cpp     | 10 +++++
 flang/lib/Frontend/FrontendActions.cpp        | 26 +++++++++++++
 flang/test/Driver/flang-f-opts.f90            |  5 +++
 .../Inputs/gcc-flag-compatibility_IR.proftext | 18 +++++++++
 .../gcc-flag-compatibility_IR_entry.proftext  | 11 ++++++
 flang/test/Profile/gcc-flag-compatibility.f90 | 32 ++++++++++++++++
 .../llvm/Frontend/Driver/CodeGenOptions.h     | 13 +++++++
 llvm/lib/Frontend/Driver/CMakeLists.txt       |  1 +
 llvm/lib/Frontend/Driver/CodeGenOptions.cpp   | 13 +++++++
 22 files changed, 223 insertions(+), 54 deletions(-)
 create mode 100644 flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext
 create mode 100644 flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext
 create mode 100644 flang/test/Profile/gcc-flag-compatibility.f90

diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index fa9474d63ae42..2a30ff11464dd 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -223,9 +223,11 @@ AFFECTING_VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is
 CODEGENOPT(AtomicProfileUpdate , 1, 0) ///< Set -fprofile-update=atomic
 CODEGENOPT(ContinuousProfileSync, 1, 0) ///< Enable continuous instrumentation profiling
 /// Choose profile instrumenation kind or no instrumentation.
-ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 4, ProfileNone)
+
+ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 4, llvm::driver::ProfileInstrKind::ProfileNone)
+
 /// Choose profile kind for PGO use compilation.
-ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
+ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone)
 /// Partition functions into N groups and select only functions in group i to be
 /// instrumented. Selected group numbers can be 0 to N-1 inclusive.
 VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1)
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index a77232c281f7f..7ba21fca6dd6b 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -80,16 +80,6 @@ class CodeGenOptions : public CodeGenOptionsBase {
     SRCK_InRegs    // Small structs in registers (-freg-struct-return).
   };
 
-  enum ProfileInstrKind {
-    ProfileNone,       // Profile instrumentation is turned off.
-    ProfileClangInstr, // Clang instrumentation to generate execution counts
-                       // to use with PGO.
-    ProfileIRInstr,    // IR level PGO instrumentation in LLVM.
-    ProfileCSIRInstr, // IR level PGO context sensitive instrumentation in LLVM.
-    ProfileIRSampleColdCov, // IR level sample pgo based cold function coverage
-                            // instrumentation in LLVM.
-  };
-
   enum EmbedBitcodeKind {
     Embed_Off,      // No embedded bitcode.
     Embed_All,      // Embed both bitcode and commandline in the output.
@@ -522,35 +512,41 @@ class CodeGenOptions : public CodeGenOptionsBase {
 
   /// Check if Clang profile instrumenation is on.
   bool hasProfileClangInstr() const {
-    return getProfileInstr() == ProfileClangInstr;
+    return getProfileInstr() ==
+           llvm::driver::ProfileInstrKind::ProfileClangInstr;
   }
 
   /// Check if IR level profile instrumentation is on.
   bool hasProfileIRInstr() const {
-    return getProfileInstr() == ProfileIRInstr;
+    return getProfileInstr() == llvm::driver::ProfileInstrKind::ProfileIRInstr;
   }
 
   /// Check if CS IR level profile instrumentation is on.
   bool hasProfileCSIRInstr() const {
-    return getProfileInstr() == ProfileCSIRInstr;
+    return getProfileInstr() ==
+           llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
   }
 
   /// Check if any form of instrumentation is on.
-  bool hasProfileInstr() const { return getProfileInstr() != ProfileNone; }
+  bool hasProfileInstr() const {
+    return getProfileInstr() != llvm::driver::ProfileInstrKind::ProfileNone;
+  }
 
   /// Check if Clang profile use is on.
   bool hasProfileClangUse() const {
-    return getProfileUse() == ProfileClangInstr;
+    return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileClangInstr;
   }
 
   /// Check if IR level profile use is on.
   bool hasProfileIRUse() const {
-    return getProfileUse() == ProfileIRInstr ||
-           getProfileUse() == ProfileCSIRInstr;
+    return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileIRInstr ||
+           getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
   }
 
   /// Check if CSIR profile use is on.
-  bool hasProfileCSIRUse() const { return getProfileUse() == ProfileCSIRInstr; }
+  bool hasProfileCSIRUse() const {
+    return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
+  }
 
   /// Check if type and variable info should be emitted.
   bool hasReducedDebugInfo() const {
diff --git a/clang/include/clang/Basic/ProfileList.h b/clang/include/clang/Basic/ProfileList.h
index b4217e49c18a3..5338ef3992ade 100644
--- a/clang/include/clang/Basic/ProfileList.h
+++ b/clang/include/clang/Basic/ProfileList.h
@@ -49,17 +49,16 @@ class ProfileList {
   ~ProfileList();
 
   bool isEmpty() const { return Empty; }
-  ExclusionType getDefault(CodeGenOptions::ProfileInstrKind Kind) const;
+  ExclusionType getDefault(llvm::driver::ProfileInstrKind Kind) const;
 
   std::optional<ExclusionType>
   isFunctionExcluded(StringRef FunctionName,
-                     CodeGenOptions::ProfileInstrKind Kind) const;
+                     llvm::driver::ProfileInstrKind Kind) const;
   std::optional<ExclusionType>
   isLocationExcluded(SourceLocation Loc,
-                     CodeGenOptions::ProfileInstrKind Kind) const;
+                     llvm::driver::ProfileInstrKind Kind) const;
   std::optional<ExclusionType>
-  isFileExcluded(StringRef FileName,
-                 CodeGenOptions::ProfileInstrKind Kind) const;
+  isFileExcluded(StringRef FileName, llvm::driver::ProfileInstrKind Kind) const;
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 152df89118a6a..5951687b095e4 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1772,7 +1772,7 @@ def fmcdc_max_test_vectors_EQ : Joined<["-"], "fmcdc-max-test-vectors=">,
   HelpText<"Maximum number of test vectors in MC/DC coverage">,
   MarshallingInfoInt<CodeGenOpts<"MCDCMaxTVs">, "0x7FFFFFFE">;
 def fprofile_generate : Flag<["-"], "fprofile-generate">,
-    Group<f_Group>, Visibility<[ClangOption, CLOption]>,
+    Group<f_Group>, Visibility<[ClangOption, CLOption, FlangOption, FC1Option]>,
     HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
 def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">,
     Group<f_Group>, Visibility<[ClangOption, CLOption]>,
@@ -1789,7 +1789,7 @@ def fprofile_use : Flag<["-"], "fprofile-use">, Group<f_Group>,
     Visibility<[ClangOption, CLOption]>, Alias<fprofile_instr_use>;
 def fprofile_use_EQ : Joined<["-"], "fprofile-use=">,
     Group<f_Group>,
-    Visibility<[ClangOption, CLOption]>,
+    Visibility<[ClangOption, CLOption, FlangOption, FC1Option]>,
     MetaVarName<"<pathname>">,
     HelpText<"Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from <pathname>/default.profdata. Otherwise, it reads from file <pathname>.">;
 def fno_profile_instr_generate : Flag<["-"], "fno-profile-instr-generate">,
@@ -7761,7 +7761,7 @@ def fpatchable_function_entry_section_EQ
       MarshallingInfoString<CodeGenOpts<"PatchableFunctionEntrySection">>;
 def fprofile_instrument_EQ : Joined<["-"], "fprofile-instrument=">,
     HelpText<"Enable PGO instrumentation">, Values<"none,clang,llvm,csllvm,sample-coldcov">,
-    NormalizedValuesScope<"CodeGenOptions">,
+    NormalizedValuesScope<"llvm::driver::ProfileInstrKind">,
     NormalizedValues<["ProfileNone", "ProfileClangInstr", "ProfileIRInstr", "ProfileCSIRInstr", "ProfileIRSampleColdCov"]>,
     MarshallingInfoEnum<CodeGenOpts<"ProfileInstr">, "ProfileNone">;
 def fprofile_instrument_path_EQ : Joined<["-"], "fprofile-instrument-path=">,
diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp
index aaea5a00ab6ae..8481deffe2a7b 100644
--- a/clang/lib/Basic/ProfileList.cpp
+++ b/clang/lib/Basic/ProfileList.cpp
@@ -69,24 +69,24 @@ ProfileList::ProfileList(ArrayRef<std::string> Paths, SourceManager &SM)
 
 ProfileList::~ProfileList() = default;
 
-static StringRef getSectionName(CodeGenOptions::ProfileInstrKind Kind) {
+static StringRef getSectionName(llvm::driver::ProfileInstrKind Kind) {
   switch (Kind) {
-  case CodeGenOptions::ProfileNone:
+  case llvm::driver::ProfileInstrKind::ProfileNone:
     return "";
-  case CodeGenOptions::ProfileClangInstr:
+  case llvm::driver::ProfileInstrKind::ProfileClangInstr:
     return "clang";
-  case CodeGenOptions::ProfileIRInstr:
+  case llvm::driver::ProfileInstrKind::ProfileIRInstr:
     return "llvm";
-  case CodeGenOptions::ProfileCSIRInstr:
+  case llvm::driver::ProfileInstrKind::ProfileCSIRInstr:
     return "csllvm";
-  case CodeGenOptions::ProfileIRSampleColdCov:
+  case llvm::driver::ProfileInstrKind::ProfileIRSampleColdCov:
     return "sample-coldcov";
   }
-  llvm_unreachable("Unhandled CodeGenOptions::ProfileInstrKind enum");
+  llvm_unreachable("Unhandled llvm::driver::ProfileInstrKind enum");
 }
 
 ProfileList::ExclusionType
-ProfileList::getDefault(CodeGenOptions::ProfileInstrKind Kind) const {
+ProfileList::getDefault(llvm::driver::ProfileInstrKind Kind) const {
   StringRef Section = getSectionName(Kind);
   // Check for "default:<type>"
   if (SCL->inSection(Section, "default", "allow"))
@@ -117,7 +117,7 @@ ProfileList::inSection(StringRef Section, StringRef Prefix,
 
 std::optional<ProfileList::ExclusionType>
 ProfileList::isFunctionExcluded(StringRef FunctionName,
-                                CodeGenOptions::ProfileInstrKind Kind) const {
+                                llvm::driver::ProfileInstrKind Kind) const {
   StringRef Section = getSectionName(Kind);
   // Check for "function:<regex>=<case>"
   if (auto V = inSection(Section, "function", FunctionName))
@@ -131,13 +131,13 @@ ProfileList::isFunctionExcluded(StringRef FunctionName,
 
 std::optional<ProfileList::ExclusionType>
 ProfileList::isLocationExcluded(SourceLocation Loc,
-                                CodeGenOptions::ProfileInstrKind Kind) const {
+                                llvm::driver::ProfileInstrKind Kind) const {
   return isFileExcluded(SM.getFilename(SM.getFileLoc(Loc)), Kind);
 }
 
 std::optional<ProfileList::ExclusionType>
 ProfileList::isFileExcluded(StringRef FileName,
-                            CodeGenOptions::ProfileInstrKind Kind) const {
+                            llvm::driver::ProfileInstrKind Kind) const {
   StringRef Section = getSectionName(Kind);
   // Check for "source:<regex>=<case>"
   if (auto V = inSection(Section, "source", FileName))
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 88b3a4943e0d8..7e0a3cf5591ce 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -124,17 +124,10 @@ namespace clang {
 extern llvm::cl::opt<bool> ClSanitizeGuardChecks;
 }
 
-// Default filename used for profile generation.
-static std::string getDefaultProfileGenName() {
-  return DebugInfoCorrelate || ProfileCorrelate != InstrProfCorrelator::NONE
-             ? "default_%m.proflite"
-             : "default_%m.profraw";
-}
-
 // Path and name of file used for profile generation
 static std::string getProfileGenName(const CodeGenOptions &CodeGenOpts) {
   std::string FileName = CodeGenOpts.InstrProfileOutput.empty()
-                             ? getDefaultProfileGenName()
+                             ? llvm::driver::getDefaultProfileGenName()
                              : CodeGenOpts.InstrProfileOutput;
   if (CodeGenOpts.ContinuousProfileSync)
     FileName = "%c" + FileName;
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index 1f5eb427b566f..5493cc92bd8b0 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -273,8 +273,8 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) {
   std::unique_ptr<llvm::ToolOutputFile> OptRecordFile =
     std::move(*OptRecordFileOrErr);
 
-  if (OptRecordFile &&
-      CodeGenOpts.getProfileUse() != CodeGenOptions::ProfileNone)
+  if (OptRecordFile && CodeGenOpts.getProfileUse() !=
+                           llvm::driver::ProfileInstrKind::ProfileNone)
     Ctx.setDiagnosticsHotnessRequested(true);
 
   if (CodeGenOpts.MisExpect) {
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 56562002e7194..13d0633e9b1c0 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -943,7 +943,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
     }
   }
 
-  if (CGM.getCodeGenOpts().getProfileInstr() != CodeGenOptions::ProfileNone) {
+  if (CGM.getCodeGenOpts().getProfileInstr() !=
+      llvm::driver::ProfileInstrKind::ProfileNone) {
     switch (CGM.isFunctionBlockedFromProfileInstr(Fn, Loc)) {
     case ProfileList::Skip:
       Fn->addFnAttr(llvm::Attribute::SkipProfile);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 16e49aab4fe61..451792dca40c5 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -3608,7 +3608,7 @@ CodeGenModule::isFunctionBlockedByProfileList(llvm::Function *Fn,
   // If the profile list is empty, then instrument everything.
   if (ProfileList.isEmpty())
     return ProfileList::Allow;
-  CodeGenOptions::ProfileInstrKind Kind = getCodeGenOpts().getProfileInstr();
+  llvm::driver::ProfileInstrKind Kind = getCodeGenOpts().getProfileInstr();
   // First, check the function name.
   if (auto V = ProfileList.isFunctionExcluded(Fn->getName(), Kind))
     return *V;
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index a20879dad94d4..47d0e345086b2 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -887,6 +887,10 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
   // TODO: Handle interactions between -w, -pedantic, -Wall, -WOption
   Args.AddLastArg(CmdArgs, options::OPT_w);
 
+  // recognise options: fprofile-generate -fprofile-use=
+  Args.addAllArgs(
+      CmdArgs, {options::OPT_fprofile_generate, options::OPT_fprofile_use_EQ});
+
   // Forward flags for OpenMP. We don't do this if the current action is an
   // device offloading action other than OpenMP.
   if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 2c02719121c73..dd021ad2e441b 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1492,11 +1492,11 @@ static void setPGOUseInstrumentor(CodeGenOptions &Opts,
   // which is available (might be one or both).
   if (PGOReader->isIRLevelProfile() || PGOReader->hasMemoryProfile()) {
     if (PGOReader->hasCSIRLevelProfile())
-      Opts.setProfileUse(CodeGenOptions::ProfileCSIRInstr);
+      Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileCSIRInstr);
     else
-      Opts.setProfileUse(CodeGenOptions::ProfileIRInstr);
+      Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr);
   } else
-    Opts.setProfileUse(CodeGenOptions::ProfileClangInstr);
+    Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileClangInstr);
 }
 
 void CompilerInvocation::setDefaultPointerAuthOptions(
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def
index a697872836569..ae12aec518108 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -24,8 +24,15 @@ CODEGENOPT(OptimizationLevel, 2, 0) ///< The -O[0-3] option specified.
 CODEGENOPT(DebugPassManager, 1, 0) ///< Prints debug information for the new
                                    ///< pass manager.
 
+
+/// Choose profile instrumenation kind or no instrumentation.
+ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone)
+/// Choose profile kind for PGO use compilation.
+ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone)
+
 CODEGENOPT(InstrumentFunctions, 1, 0) ///< Set when -finstrument_functions is
                                       ///< enabled on the compile step.
+
 CODEGENOPT(IsPIE, 1, 0) ///< PIE level is the same as PIC Level.
 CODEGENOPT(PICLevel, 2, 0) ///< PIC level of the LLVM module.
 CODEGENOPT(PrepareForFullLTO , 1, 0) ///< Set when -flto is enabled on the
diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h
index e939f10f3c3e7..bad17c8309eb8 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.h
+++ b/flang/include/flang/Frontend/CodeGenOptions.h
@@ -154,6 +154,44 @@ class CodeGenOptions : public CodeGenOptionsBase {
   /// OpenMP is enabled.
   using DoConcurrentMappingKind = flangomp::DoConcurrentMappingKind;
 
+  /// Name of the profile file to use as output for -fprofile-instr-generate,
+  /// -fprofile-generate, and -fcs-profile-generate.
+  std::string InstrProfileOutput;
+
+  /// Name of the profile file to use as input for -fmemory-profile-use.
+  std::string MemoryProfileUsePath;
+
+  /// Name of the profile file to use as input for -fprofile-instr-use
+  std::string ProfileInstrumentUsePath;
+
+  /// Name of the profile remapping file to apply to the profile data supplied
+  /// by -fprofile-sample-use or -fprofile-instr-use.
+  std::string ProfileRemappingFile;
+
+  /// Check if Clang profile instrumenation is on.
+  bool hasProfileClangInstr() const {
+    return getProfileInstr() == llvm::driver::ProfileClangInstr;
+  }
+
+  /// Check if IR level profile instrumentation is on.
+  bool hasProfileIRInstr() const {
+    return getProfileInstr() == llvm::driver::ProfileIRInstr;
+  }
+
+  /// Check if CS IR level profile instrumentation is on.
+  bool hasProfileCSIRInstr() const {
+    return getProfileInstr() == llvm::driver::ProfileCSIRInstr;
+  }
+  /// Check if IR level profile use is on.
+  bool hasProfileIRUse() const {
+    return getProfileUse() == llvm::driver::ProfileIRInstr ||
+           getProfileUse() == llvm::driver::ProfileCSIRInstr;
+  }
+  /// Check if CSIR profile use is on.
+  bool hasProfileCSIRUse() const {
+    return getProfileUse() == llvm::driver::ProfileCSIRInstr;
+  }
+
   // Define accessors/mutators for code generation options of enumeration type.
 #define CODEGENOPT(Name, Bits, Default)
 #define ENUM_CODEGENOPT(Name, Type, Bits, Default)                             \
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 15bcff254756e..147849b0b7d2a 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Frontend/Debug/Options.h"
+#include "llvm/Frontend/Driver/CodeGenOptions.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptTable.h"
@@ -441,6 +442,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
       opts.IsPIE = 1;
   }
 
+  if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) {
+    opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr);
+  }
+
+  if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) {
+    opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr);
+    opts.ProfileInstrumentUsePath = A->getValue();
+  }
+
   // -mcmodel option.
   if (const llvm::opt::Arg *a =
           args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) {
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 1c8a419188b89..d684eeb696755 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -56,10 +56,12 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Passes/PassPlugin.h"
 #include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/ProfileData/InstrProfCorrelator.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/PGOOptions.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
@@ -67,6 +69,7 @@
 #include "llvm/TargetParser/RISCVISAInfo.h"
 #include "llvm/TargetParser/RISCVTargetParser.h"
 #include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <memory>
 #include <system_error>
@@ -919,6 +922,29 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) {
   llvm::PassInstrumentationCallbacks pic;
   llvm::PipelineTuningOptions pto;
   std::optional<llvm::PGOOptions> pgoOpt;
+
+  if (opts.hasProfileIRInstr()) {
+    // -fprofile-generate.
+    pgoOpt = llvm::PGOOptions(opts.InstrProfileOutput.empty()
+                                  ? llvm::driver::getDefaultProfileGenName()
+                                  : opts.InstrProfileOutput,
+                              "", "", opts.MemoryProfileUsePath, nullptr,
+                              llvm::PGOOptions::IRInstr,
+                              llvm::PGOOptions::NoCSAction,
+                              llvm::PGOOptions::ColdFuncOpt::Default, false,
+                              /*PseudoProbeForProfiling=*/false, false);
+  } else if (opts.hasProfileIRUse()) {
+    llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS =
+        llvm::vfs::getRealFileSystem();
+    // -fprofile-use.
+    auto CSAction = opts.hasProfileCSIRUse() ? llvm::PGOOptions::CSIRUse
+                                             : llvm::PGOOptions::NoCSAction;
+    pgoOpt = llvm::PGOOptions(
+        opts.ProfileInstrumentUsePath, "", opts.ProfileRemappingFile,
+        opts.MemoryProfileUsePath, VFS, llvm::PGOOptions::IRUse, CSAction,
+        llvm::PGOOptions::ColdFuncOpt::Default, false);
+  }
+
   llvm::StandardInstrumentations si(llvmModule->getContext(),
                                     opts.DebugPassManager);
   si.registerCallbacks(pic, &mam);
diff --git a/flang/test/Driver/flang-f-opts.f90 b/flang/test/Driver/flang-f-opts.f90
index 4493a519e2010..b972b9b7b2a59 100644
--- a/flang/test/Driver/flang-f-opts.f90
+++ b/flang/test/Driver/flang-f-opts.f90
@@ -8,3 +8,8 @@
 ! CHECK-LABEL: "-fc1"
 ! CHECK: -ffp-contract=off
 ! CHECK: -O3
+
+! RUN: %flang -### -S -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-LLVM %s
+! CHECK-PROFILE-GENERATE-LLVM: "-fprofile-generate"
+! RUN: %flang -### -S -fprofile-use=%S %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-USE-DIR %s
+! CHECK-PROFILE-USE-DIR: "-fprofile-use={{.*}}"
diff --git a/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext
new file mode 100644
index 0000000000000..2650fb5ebfd35
--- /dev/null
+++ b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext
@@ -0,0 +1,18 @@
+# IR level Instrumentation Flag
+:ir
+_QQmain
+# Func Hash:
+146835646621254984
+# Num Counters:
+2
+# Counter Values:
+100
+1
+
+main
+# Func Hash:
+742261418966908927
+# Num Counters:
+1
+# Counter Values:
+1
\ No newline at end of file
diff --git a/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext
new file mode 100644
index 0000000000000..c4a2a26557e80
--- /dev/null
+++ b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext
@@ -0,0 +1,11 @@
+# IR level Instrumentation Flag
+:ir
+:entry_first
+_QQmain
+# Func Hash:
+146835646621254984
+# Num Counters:
+2
+# Counter Values:
+100
+1
\ No newline at end of file
diff --git a/flang/test/Profile/gcc-flag-compatibility.f90 b/flang/test/Profile/gcc-flag-compatibility.f90
new file mode 100644
index 0000000000000..4490c45232d28
--- /dev/null
+++ b/flang/test/Profile/gcc-flag-compatibility.f90
@@ -0,0 +1,32 @@
+! Tests for -fprofile-generate and -fprofile-use flag compatibility. These two
+! flags behave similarly to their GCC counterparts:
+!
+! -fprofile-generate         Generates the profile file ./default.profraw
+! -fprofile-use=<dir>/file   Uses the profile file <dir>/file
+
+! On AIX, -flto used to be required with -fprofile-generate. gcc-flag-compatibility-aix.c is used to do the testing on AIX with -flto
+! RUN: %flang %s -c -S -o - -emit-llvm -fprofile-generate | FileCheck -check-prefix=PROFILE-GEN %s
+! PROFILE-GEN: @__profc_{{_?}}main = {{(private|internal)}} global [1 x i64] zeroinitializer, section
+! PROFILE-GEN: @__profd_{{_?}}main =
+
+! Check that -fprofile-use=some/path/file.prof reads some/path/file.prof
+! This uses LLVM IR format profile.
+! RUN: rm -rf %t.dir
+! RUN: mkdir -p %t.dir/some/path
+! RUN: llvm-profdata merge %S/Inputs/gcc-flag-compatibility_IR.proftext -o %t.dir/some/path/file.prof
+! RUN: %flang %s -o - -emit-llvm -S -fprofile-use=%t.dir/some/path/file.prof | FileCheck -check-prefix=PROFILE-USE-IR1 %s
+! RUN: llvm-profdata merge %S/Inputs/gcc-flag-compatibility_IR_entry.proftext -o %t.dir/some/path/file.prof
+! RUN: %flang %s -o - -emit-llvm -S -fprofile-use=%t.dir/some/path/file.prof | FileCheck -check-prefix=PROFILE-USE-IR2 %s
+! PROFILE-USE-IR1: = !{!"branch_weights", i32 100, i32 1}
+! PROFILE-USE-IR2: = !{!"branch_weights", i32 1, i32 100}
+
+program main
+  implicit none
+  integer :: i
+  integer :: X = 0
+
+  do i = 0, 99
+     X = X + i
+  end do
+
+end program main
diff --git a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
index e8e70c0e126a9..f0168c0407884 100644
--- a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
+++ b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
@@ -14,6 +14,7 @@
 #define LLVM_FRONTEND_DRIVER_CODEGENOPTIONS_H
 
 #include "llvm/Support/Compiler.h"
+#include <string>
 
 namespace llvm {
 class Triple;
@@ -51,6 +52,18 @@ enum class VectorLibrary {
 LLVM_ABI TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,
                                            VectorLibrary Veclib);
 
+enum ProfileInstrKind {
+  ProfileNone,       // Profile instrumentation is turned off.
+  ProfileClangInstr, // Clang instrumentation to generate execution counts
+                     // to use with PGO.
+  ProfileIRInstr,    // IR level PGO instrumentation in LLVM.
+  ProfileCSIRInstr,  // IR level PGO context sensitive instrumentation in LLVM.
+  ProfileIRSampleColdCov, // IR level sample pgo based cold function coverage
+                          // instrumentation in LLVM.
+};
+
+// Default filename used for profile generation.
+std::string getDefaultProfileGenName();
 } // end namespace llvm::driver
 
 #endif
diff --git a/llvm/lib/Frontend/Driver/CMakeLists.txt b/llvm/lib/Frontend/Driver/CMakeLists.txt
index 23de4994a300d..9feee6fe6929b 100644
--- a/llvm/lib/Frontend/Driver/CMakeLists.txt
+++ b/llvm/lib/Frontend/Driver/CMakeLists.txt
@@ -12,4 +12,5 @@ add_llvm_component_library(LLVMFrontendDriver
   Core
   Support
   Analysis
+  Instrumentation
   )
diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
index 52080dea93c98..df884908845d2 100644
--- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
+++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp
@@ -8,8 +8,15 @@
 
 #include "llvm/Frontend/Driver/CodeGenOptions.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/ProfileData/InstrProfCorrelator.h"
 #include "llvm/TargetParser/Triple.h"
 
+namespace llvm {
+extern llvm::cl::opt<bool> DebugInfoCorrelate;
+extern llvm::cl::opt<llvm::InstrProfCorrelator::ProfCorrelatorKind>
+    ProfileCorrelate;
+} // namespace llvm
+
 namespace llvm::driver {
 
 TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,
@@ -56,4 +63,10 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple,
   return TLII;
 }
 
+std::string getDefaultProfileGenName() {
+  return llvm::DebugInfoCorrelate ||
+                 llvm::ProfileCorrelate != InstrProfCorrelator::NONE
+             ? "default_%m.proflite"
+             : "default_%m.profraw";
+}
 } // namespace llvm::driver

From f6bf3bd5e001918780e7b1e8fceeb02604d65783 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 11:08:15 -0700
Subject: [PATCH 421/851] [bazel] Fix XeGpu deps for 5578bcbcfd25c (#144133)

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 7bcb1d4ca883c..b62d5595fe941 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3506,6 +3506,7 @@ cc_library(
         ":LoopLikeInterface",
         ":MemRefDialect",
         ":Pass",
+        ":SCFTransforms",
         ":TransformUtils",
         ":VectorDialect",
         ":VectorTransforms",

From 59388fb0b92d7efd5737efd6c7b6d5c82f1bc6a8 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Fri, 13 Jun 2025 11:16:44 -0700
Subject: [PATCH 422/851] [InstCombine] Preserve NSW/NUW flags when folding
 const BOp with min/max (#143471)

When folding `X Pred C2 ? X BOp C1 : C2 BOp C1` to `min/max(X, C2) BOp
C1`, if NUW/NSW flags are present on `X BOp C1` and could be safely
applied to `C2 BOp C1`, then they may be added on the BOp after the fold
is complete. https://alive2.llvm.org/ce/z/n_3aNJ

Preserving these flags can allow subsequent transforms to re-order the
min/max and BOp, which in the case of NVPTX would allow for some
potential future transformations which would improve
instruction-selection.
---
 .../InstCombine/InstCombineInternal.h         |  2 +
 .../InstCombine/InstCombineSelect.cpp         | 36 ++++++--
 .../InstCombine/canonicalize-const-to-bop.ll  | 83 ++++++++++++++++++-
 3 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index ce0e843437b53..8c9de862fe8f2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -771,6 +771,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
                             Value *A, Value *B, Instruction &Outer,
                             SelectPatternFlavor SPF2, Value *C);
   Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+  Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
+                                      Value *FalseVal);
   Instruction *foldSelectValueEquivalence(SelectInst &SI, CmpInst &CI);
   bool replaceInInstruction(Value *V, Value *Old, Value *New,
                             unsigned Depth = 0);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 320b827bdbe86..73ba0f78e8053 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1879,9 +1879,9 @@ static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI,
 
 /// Fold `X Pred C1 ? X BOp C2 : C1 BOp C2` to `min/max(X, C1) BOp C2`.
 /// This allows for better canonicalization.
-static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
-                                           Value *FalseVal,
-                                           IRBuilderBase &Builder) {
+Value *InstCombinerImpl::foldSelectWithConstOpToBinOp(ICmpInst *Cmp,
+                                                      Value *TrueVal,
+                                                      Value *FalseVal) {
   Constant *C1, *C2, *C3;
   Value *X;
   CmpPredicate Predicate;
@@ -1945,11 +1945,29 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal,
     return nullptr;
   }
 
-  Intrinsic::ID IntrinsicID = getMinMaxIntrinsic(SPF);
-  Value *Intrinsic = Builder.CreateBinaryIntrinsic(IntrinsicID, X, RHS);
-  return IsIntrinsic ? Builder.CreateBinaryIntrinsic(Opcode, Intrinsic, C2)
-                     : Builder.CreateBinOp(Instruction::BinaryOps(Opcode),
-                                           Intrinsic, C2);
+  Intrinsic::ID MinMaxID = getMinMaxIntrinsic(SPF);
+  Value *MinMax = Builder.CreateBinaryIntrinsic(MinMaxID, X, RHS);
+  if (IsIntrinsic)
+    return Builder.CreateBinaryIntrinsic(Opcode, MinMax, C2);
+
+  const auto BinOpc = Instruction::BinaryOps(Opcode);
+  Value *BinOp = Builder.CreateBinOp(BinOpc, MinMax, C2);
+
+  // If we can attach no-wrap flags to the new instruction, do so if the
+  // old instruction had them and C1 BinOp C2 does not overflow.
+  if (Instruction *BinOpInst = dyn_cast<Instruction>(BinOp)) {
+    if (BinOpc == Instruction::Add || BinOpc == Instruction::Sub ||
+        BinOpc == Instruction::Mul) {
+      Instruction *OldBinOp = cast<BinaryOperator>(TrueVal);
+      if (OldBinOp->hasNoSignedWrap() &&
+          willNotOverflow(BinOpc, RHS, C2, *BinOpInst, /*IsSigned=*/true))
+        BinOpInst->setHasNoSignedWrap();
+      if (OldBinOp->hasNoUnsignedWrap() &&
+          willNotOverflow(BinOpc, RHS, C2, *BinOpInst, /*IsSigned=*/false))
+        BinOpInst->setHasNoUnsignedWrap();
+    }
+  }
+  return BinOp;
 }
 
 /// Visit a SelectInst that has an ICmpInst as its first operand.
@@ -2027,7 +2045,7 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
   if (Value *V = foldAbsDiff(ICI, TrueVal, FalseVal, Builder))
     return replaceInstUsesWith(SI, V);
 
-  if (Value *V = foldSelectWithConstOpToBinOp(ICI, TrueVal, FalseVal, Builder))
+  if (Value *V = foldSelectWithConstOpToBinOp(ICI, TrueVal, FalseVal))
     return replaceInstUsesWith(SI, V);
 
   return Changed ? &SI : nullptr;
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll b/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll
index c08ec1bb7de0d..b3093a92624ae 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll
@@ -5,7 +5,7 @@ define i8 @add_and_sgt(i8 %x) {
 ; CHECK-LABEL: define i8 @add_and_sgt(
 ; CHECK-SAME: i8 [[X:%.*]]) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 8)
-; CHECK-NEXT:    [[S:%.*]] = add nuw i8 [[TMP1]], 16
+; CHECK-NEXT:    [[S:%.*]] = add nuw nsw i8 [[TMP1]], 16
 ; CHECK-NEXT:    ret i8 [[S]]
 ;
   %add = add nsw i8 %x, 16
@@ -155,7 +155,7 @@ define i8 @multi_use_cond_and_sel(i8 %x) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[X]], 8
 ; CHECK-NEXT:    call void @use(i1 [[CMP]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 8)
-; CHECK-NEXT:    [[S:%.*]] = add nuw i8 [[TMP1]], 16
+; CHECK-NEXT:    [[S:%.*]] = add nuw nsw i8 [[TMP1]], 16
 ; CHECK-NEXT:    call void @use_byte(i8 [[S]])
 ; CHECK-NEXT:    ret i8 [[S]]
 ;
@@ -450,3 +450,82 @@ define i8 @umax_sgt(i8 %x) {
   %s = select i1 %cmp, i8 100, i8 %umax
   ret i8 %s
 }
+
+define i8 @add_sgt_nuw_nsw_safe(i8 %x) {
+; CHECK-LABEL: define i8 @add_sgt_nuw_nsw_safe(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100)
+; CHECK-NEXT:    [[S:%.*]] = add nuw nsw i8 [[TMP1]], 1
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %add = add nuw nsw i8 %x, 1
+  %cmp = icmp sgt i8 %x, 100
+  %s = select i1 %cmp, i8 101, i8 %add
+  ret i8 %s
+}
+
+define i8 @add_sgt_nuw_only(i8 %x) {
+; CHECK-LABEL: define i8 @add_sgt_nuw_only(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100)
+; CHECK-NEXT:    [[S:%.*]] = add nuw i8 [[TMP1]], 50
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %add = add nuw nsw i8 %x, 50
+  %cmp = icmp sgt i8 %x, 100
+  %s = select i1 %cmp, i8 150, i8 %add
+  ret i8 %s
+}
+
+define i8 @add_sgt_nsw_only(i8 %x) {
+; CHECK-LABEL: define i8 @add_sgt_nsw_only(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100)
+; CHECK-NEXT:    [[S:%.*]] = add nsw i8 [[TMP1]], -99
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %add = add nuw nsw i8 %x, -99
+  %cmp = icmp sgt i8 %x, 100
+  %s = select i1 %cmp, i8 1, i8 %add
+  ret i8 %s
+}
+
+
+define i8 @mul_ult_nuw_nsw_safe(i8 %x) {
+; CHECK-LABEL: define i8 @mul_ult_nuw_nsw_safe(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 10)
+; CHECK-NEXT:    [[S:%.*]] = mul nuw nsw i8 [[TMP1]], 3
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %mul = mul nuw nsw i8 %x, 3
+  %cmp = icmp ult i8 %x, 10
+  %s = select i1 %cmp, i8 30, i8 %mul
+  ret i8 %s
+}
+
+define i8 @mul_ult_nuw_only(i8 %x) {
+; CHECK-LABEL: define i8 @mul_ult_nuw_only(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 10)
+; CHECK-NEXT:    [[S:%.*]] = mul nuw i8 [[TMP1]], 25
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %mul = mul nuw nsw i8 %x, 25
+  %cmp = icmp ult i8 %x, 10
+  %s = select i1 %cmp, i8 250, i8 %mul
+  ret i8 %s
+}
+
+define i8 @mul_ult_nsw_only(i8 %x) {
+; CHECK-LABEL: define i8 @mul_ult_nsw_only(
+; CHECK-SAME: i8 [[X:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 40)
+; CHECK-NEXT:    [[S:%.*]] = mul nsw i8 [[TMP1]], -2
+; CHECK-NEXT:    ret i8 [[S]]
+;
+  %mul = mul nuw nsw i8 %x, -2
+  %cmp = icmp ult i8 %x, 40
+  %s = select i1 %cmp, i8 -80, i8 %mul
+  ret i8 %s
+}

From f68848015f62156b8c3539b44f16d9c8b0a93a89 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 13 Jun 2025 19:17:01 +0100
Subject: [PATCH 423/851] [VPlan] Manage Sentinel value for FindLastIV in
 VPlan. (#142291)

Similar to modeling the start value as operand, also model the sentinel
value as operand explicitly. This makes all require information for
code-gen available directly in VPlan.

PR: https://github.com/llvm/llvm-project/pull/142291
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 20 +++++++++++--------
 .../Transforms/Vectorize/VPlanPatternMatch.h  | 19 ++++++++++++++++++
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  8 ++++----
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp |  2 +-
 .../vplan-printing-reductions.ll              |  2 +-
 5 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fa313243a57da..69b60c7b93208 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7266,9 +7266,11 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
     using namespace llvm::PatternMatch;
     Value *Cmp, *OrigResumeV, *CmpOp;
     bool IsExpectedPattern =
-        match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
-                                        m_Specific(RdxDesc.getSentinelValue()),
-                                        m_Value(OrigResumeV))) &&
+        match(MainResumeValue,
+              m_Select(
+                  m_OneUse(m_Value(Cmp)),
+                  m_Specific(EpiRedResult->getOperand(2)->getLiveInIRValue()),
+                  m_Value(OrigResumeV))) &&
         (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
                                    m_Value(CmpOp))) &&
          ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
@@ -9235,9 +9237,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
             RdxDesc.getRecurrenceKind())) {
       VPValue *Start = PhiR->getStartValue();
-      FinalReductionResult =
-          Builder.createNaryOp(VPInstruction::ComputeFindLastIVResult,
-                               {PhiR, Start, NewExitingVPV}, ExitDL);
+      FinalReductionResult = Builder.createNaryOp(
+          VPInstruction::ComputeFindLastIVResult,
+          {PhiR, Start, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()),
+           NewExitingVPV},
+          ExitDL);
     } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                    RdxDesc.getRecurrenceKind())) {
       VPValue *Start = PhiR->getStartValue();
@@ -9825,8 +9829,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
         BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
         IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
         Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
-        ResumeV =
-            Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
+        ResumeV = Builder.CreateSelect(
+            Cmp, RdxResult->getOperand(2)->getLiveInIRValue(), ResumeV);
       } else {
         VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
         auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index dfd9fc3d4d719..b2535fe3aa578 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -318,6 +318,25 @@ m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
       {Op0, Op1, Op2});
 }
 
+template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t,
+          unsigned Opcode, bool Commutative, typename... RecipeTys>
+using Recipe4Op_match = Recipe_match<std::tuple<Op0_t, Op1_t, Op2_t, Op3_t>,
+                                     Opcode, Commutative, RecipeTys...>;
+
+template <typename Op0_t, typename Op1_t, typename Op2_t, typename Op3_t,
+          unsigned Opcode>
+using VPInstruction4Op_match =
+    Recipe4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode, /*Commutative*/ false,
+                    VPInstruction>;
+
+template <unsigned Opcode, typename Op0_t, typename Op1_t, typename Op2_t,
+          typename Op3_t>
+inline VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>
+m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2,
+                const Op3_t &Op3) {
+  return VPInstruction4Op_match<Op0_t, Op1_t, Op2_t, Op3_t, Opcode>(
+      {Op0, Op1, Op2, Op3});
+}
 template <typename Op0_t>
 inline UnaryVPInstruction_match<Op0_t, Instruction::Freeze>
 m_Freeze(const Op0_t &Op0) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index ccce0e07e4d0a..d59cec892d405 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -645,16 +645,16 @@ Value *VPInstruction::generate(VPTransformState &State) {
 
     // The recipe's operands are the reduction phi, followed by one operand for
     // each part of the reduction.
-    unsigned UF = getNumOperands() - 2;
-    Value *ReducedPartRdx = State.get(getOperand(2));
+    unsigned UF = getNumOperands() - 3;
+    Value *ReducedPartRdx = State.get(getOperand(3));
     for (unsigned Part = 1; Part < UF; ++Part) {
       ReducedPartRdx = createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx,
-                                      State.get(getOperand(2 + Part)));
+                                      State.get(getOperand(3 + Part)));
     }
 
     return createFindLastIVReduction(Builder, ReducedPartRdx,
                                      State.get(getOperand(1), true),
-                                     RdxDesc.getSentinelValue());
+                                     getOperand(2)->getLiveInIRValue());
   }
   case VPInstruction::ComputeReductionResult: {
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index e4c068ef175bc..dfb5bfabd22b8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -347,7 +347,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
         match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                       m_VPValue(), m_VPValue(Op1))) ||
         match(&R, m_VPInstruction<VPInstruction::ComputeFindLastIVResult>(
-                      m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
+                      m_VPValue(), m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
       addUniformForAllParts(cast<VPInstruction>(&R));
       for (unsigned Part = 1; Part != UF; ++Part)
         R.addOperand(getValueForPart(Op1, Part));
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
index 95fbc4260587a..978f1b80d26da 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll
@@ -240,7 +240,7 @@ define i64 @find_last_iv(ptr %a, i64 %n, i64 %start) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<%cond>
+; CHECK-NEXT:   EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<-9223372036854775808>, ir<%cond>
 ; CHECK-NEXT:   EMIT vp<[[EXT:%.+]]> = extract-last-element vp<[[RDX_RES]]>
 ; CHECK-NEXT:   EMIT vp<%cmp.n> = icmp eq ir<%n>, vp<{{.+}}>
 ; CHECK-NEXT:   EMIT branch-on-cond vp<%cmp.n>

From 24bbc820701b49ab8bc7b9670034e39e11da8a16 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 11:20:32 -0700
Subject: [PATCH 424/851] [CIR] Support for static variables (#143980)

This adds support for emitting static variables and their initializers.
---
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      |  14 +
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |  18 ++
 clang/lib/CIR/CodeGen/CIRGenDecl.cpp          | 248 +++++++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |   6 +
 clang/lib/CIR/CodeGen/CIRGenModule.h          |  13 +
 clang/test/CIR/CodeGen/static-vars.c          |  37 +++
 clang/test/CIR/CodeGen/static-vars.cpp        |  49 ++++
 7 files changed, 383 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/static-vars.c
 create mode 100644 clang/test/CIR/CodeGen/static-vars.cpp

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index a3754f4de66b0..502d58d7db8b5 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -185,11 +185,25 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
                                     global.getSymName());
   }
 
+  mlir::Value createGetGlobal(cir::GlobalOp global) {
+    return createGetGlobal(global.getLoc(), global);
+  }
+
   cir::StoreOp createStore(mlir::Location loc, mlir::Value val, mlir::Value dst,
                            mlir::IntegerAttr align = {}) {
     return create<cir::StoreOp>(loc, val, dst, align);
   }
 
+  [[nodiscard]] cir::GlobalOp createGlobal(mlir::ModuleOp mlirModule,
+                                           mlir::Location loc,
+                                           mlir::StringRef name,
+                                           mlir::Type type,
+                                           cir::GlobalLinkageKind linkage) {
+    mlir::OpBuilder::InsertionGuard guard(*this);
+    setInsertionPointToStart(mlirModule.getBody());
+    return create<cir::GlobalOp>(loc, name, type, linkage);
+  }
+
   cir::GetMemberOp createGetMember(mlir::Location loc, mlir::Type resultTy,
                                    mlir::Value base, llvm::StringRef name,
                                    unsigned index) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index a4bc69619d60c..adf7cb77f1a5d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -24,6 +24,7 @@ namespace clang::CIRGen {
 class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
   const CIRGenTypeCache &typeCache;
   llvm::StringMap<unsigned> recordNames;
+  llvm::StringMap<unsigned> globalsVersioning;
 
 public:
   CIRGenBuilderTy(mlir::MLIRContext &mlirContext, const CIRGenTypeCache &tc)
@@ -371,6 +372,23 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
   /// pointed to by \p arrayPtr.
   mlir::Value maybeBuildArrayDecay(mlir::Location loc, mlir::Value arrayPtr,
                                    mlir::Type eltTy);
+
+  /// Creates a versioned global variable. If the symbol is already taken, an ID
+  /// will be appended to the symbol. The returned global must always be queried
+  /// for its name so it can be referenced correctly.
+  [[nodiscard]] cir::GlobalOp
+  createVersionedGlobal(mlir::ModuleOp module, mlir::Location loc,
+                        mlir::StringRef name, mlir::Type type,
+                        cir::GlobalLinkageKind linkage) {
+    // Create a unique name if the given name is already taken.
+    std::string uniqueName;
+    if (unsigned version = globalsVersioning[name.str()]++)
+      uniqueName = name.str() + "." + std::to_string(version);
+    else
+      uniqueName = name.str();
+
+    return createGlobal(module, loc, uniqueName, type, linkage);
+  }
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 748c2b5f6fceb..1941b5066edb4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -208,8 +208,25 @@ void CIRGenFunction::emitVarDecl(const VarDecl &d) {
   if (d.hasExternalStorage())
     return;
 
-  if (d.getStorageDuration() != SD_Automatic)
-    cgm.errorNYI(d.getSourceRange(), "emitVarDecl automatic storage duration");
+  if (d.getStorageDuration() != SD_Automatic) {
+    // Static sampler variables translated to function calls.
+    if (d.getType()->isSamplerT()) {
+      // Nothing needs to be done here, but let's flag it as an error until we
+      // have a test. It requires OpenCL support.
+      cgm.errorNYI(d.getSourceRange(), "emitVarDecl static sampler type");
+      return;
+    }
+
+    cir::GlobalLinkageKind linkage =
+        cgm.getCIRLinkageVarDefinition(&d, /*IsConstant=*/false);
+
+    // FIXME: We need to force the emission/use of a guard variable for
+    // some variables even if we can constant-evaluate them because
+    // we can't guarantee every translation unit will constant-evaluate them.
+
+    return emitStaticVarDecl(d, linkage);
+  }
+
   if (d.getType().getAddressSpace() == LangAS::opencl_local)
     cgm.errorNYI(d.getSourceRange(), "emitVarDecl openCL address space");
 
@@ -219,6 +236,233 @@ void CIRGenFunction::emitVarDecl(const VarDecl &d) {
   return emitAutoVarDecl(d);
 }
 
+static std::string getStaticDeclName(CIRGenModule &cgm, const VarDecl &d) {
+  if (cgm.getLangOpts().CPlusPlus)
+    return cgm.getMangledName(&d).str();
+
+  // If this isn't C++, we don't need a mangled name, just a pretty one.
+  assert(!d.isExternallyVisible() && "name shouldn't matter");
+  std::string contextName;
+  const DeclContext *dc = d.getDeclContext();
+  if (auto *cd = dyn_cast<CapturedDecl>(dc))
+    dc = cast<DeclContext>(cd->getNonClosureContext());
+  if (const auto *fd = dyn_cast<FunctionDecl>(dc))
+    contextName = std::string(cgm.getMangledName(fd));
+  else if (isa<BlockDecl>(dc))
+    cgm.errorNYI(d.getSourceRange(), "block decl context for static var");
+  else if (isa<ObjCMethodDecl>(dc))
+    cgm.errorNYI(d.getSourceRange(), "ObjC decl context for static var");
+  else
+    cgm.errorNYI(d.getSourceRange(), "Unknown context for static var decl");
+
+  contextName += "." + d.getNameAsString();
+  return contextName;
+}
+
+// TODO(cir): LLVM uses a Constant base class. Maybe CIR could leverage an
+// interface for all constants?
+cir::GlobalOp
+CIRGenModule::getOrCreateStaticVarDecl(const VarDecl &d,
+                                       cir::GlobalLinkageKind linkage) {
+  // In general, we don't always emit static var decls once before we reference
+  // them. It is possible to reference them before emitting the function that
+  // contains them, and it is possible to emit the containing function multiple
+  // times.
+  if (cir::GlobalOp existingGV = getStaticLocalDeclAddress(&d))
+    return existingGV;
+
+  QualType ty = d.getType();
+  assert(ty->isConstantSizeType() && "VLAs can't be static");
+
+  // Use the label if the variable is renamed with the asm-label extension.
+  if (d.hasAttr<AsmLabelAttr>())
+    errorNYI(d.getSourceRange(), "getOrCreateStaticVarDecl: asm label");
+
+  std::string name = getStaticDeclName(*this, d);
+
+  mlir::Type lty = getTypes().convertTypeForMem(ty);
+  assert(!cir::MissingFeatures::addressSpace());
+
+  if (d.hasAttr<LoaderUninitializedAttr>() || d.hasAttr<CUDASharedAttr>())
+    errorNYI(d.getSourceRange(),
+             "getOrCreateStaticVarDecl: LoaderUninitializedAttr");
+  assert(!cir::MissingFeatures::addressSpace());
+
+  mlir::Attribute init = builder.getZeroInitAttr(convertType(ty));
+
+  cir::GlobalOp gv = builder.createVersionedGlobal(
+      getModule(), getLoc(d.getLocation()), name, lty, linkage);
+  // TODO(cir): infer visibility from linkage in global op builder.
+  gv.setVisibility(getMLIRVisibilityFromCIRLinkage(linkage));
+  gv.setInitialValueAttr(init);
+  gv.setAlignment(getASTContext().getDeclAlign(&d).getAsAlign().value());
+
+  if (supportsCOMDAT() && gv.isWeakForLinker())
+    gv.setComdat(true);
+
+  assert(!cir::MissingFeatures::opGlobalThreadLocal());
+
+  setGVProperties(gv, &d);
+
+  // OG checks if the expected address space, denoted by the type, is the
+  // same as the actual address space indicated by attributes. If they aren't
+  // the same, an addrspacecast is emitted when this variable is accessed.
+  // In CIR however, cir.get_global already carries that information in
+  // !cir.ptr type - if this global is in OpenCL local address space, then its
+  // type would be !cir.ptr<..., addrspace(offload_local)>. Therefore we don't
+  // need an explicit address space cast in CIR: they will get emitted when
+  // lowering to LLVM IR.
+
+  // Ensure that the static local gets initialized by making sure the parent
+  // function gets emitted eventually.
+  const Decl *dc = cast<Decl>(d.getDeclContext());
+
+  // We can't name blocks or captured statements directly, so try to emit their
+  // parents.
+  if (isa<BlockDecl>(dc) || isa<CapturedDecl>(dc)) {
+    dc = dc->getNonClosureContext();
+    // FIXME: Ensure that global blocks get emitted.
+    if (!dc)
+      errorNYI(d.getSourceRange(), "non-closure context");
+  }
+
+  GlobalDecl gd;
+  if (isa<CXXConstructorDecl>(dc))
+    errorNYI(d.getSourceRange(), "C++ constructors static var context");
+  else if (isa<CXXDestructorDecl>(dc))
+    errorNYI(d.getSourceRange(), "C++ destructors static var context");
+  else if (const auto *fd = dyn_cast<FunctionDecl>(dc))
+    gd = GlobalDecl(fd);
+  else {
+    // Don't do anything for Obj-C method decls or global closures. We should
+    // never defer them.
+    assert(isa<ObjCMethodDecl>(dc) && "unexpected parent code decl");
+  }
+  if (gd.getDecl() && cir::MissingFeatures::openMP()) {
+    // Disable emission of the parent function for the OpenMP device codegen.
+    errorNYI(d.getSourceRange(), "OpenMP");
+  }
+
+  return gv;
+}
+
+/// Add the initializer for 'd' to the global variable that has already been
+/// created for it. If the initializer has a different type than gv does, this
+/// may free gv and return a different one. Otherwise it just returns gv.
+cir::GlobalOp CIRGenFunction::addInitializerToStaticVarDecl(
+    const VarDecl &d, cir::GlobalOp gv, cir::GetGlobalOp gvAddr) {
+  ConstantEmitter emitter(*this);
+  mlir::TypedAttr init =
+      mlir::cast<mlir::TypedAttr>(emitter.tryEmitForInitializer(d));
+
+  // If constant emission failed, then this should be a C++ static
+  // initializer.
+  if (!init) {
+    cgm.errorNYI(d.getSourceRange(), "static var without initializer");
+    return gv;
+  }
+
+  // TODO(cir): There should be debug code here to assert that the decl size
+  // matches the CIR data layout type alloc size, but the code for calculating
+  // the type alloc size is not implemented yet.
+  assert(!cir::MissingFeatures::dataLayoutTypeAllocSize());
+
+  // The initializer may differ in type from the global. Rewrite
+  // the global to match the initializer.  (We have to do this
+  // because some types, like unions, can't be completely represented
+  // in the LLVM type system.)
+  if (gv.getSymType() != init.getType()) {
+    gv.setSymType(init.getType());
+
+    // Normally this should be done with a call to cgm.replaceGlobal(oldGV, gv),
+    // but since at this point the current block hasn't been really attached,
+    // there's no visibility into the GetGlobalOp corresponding to this Global.
+    // Given those constraints, thread in the GetGlobalOp and update it
+    // directly.
+    assert(!cir::MissingFeatures::addressSpace());
+    gvAddr.getAddr().setType(builder.getPointerTo(init.getType()));
+  }
+
+  bool needsDtor =
+      d.needsDestruction(getContext()) == QualType::DK_cxx_destructor;
+
+  assert(!cir::MissingFeatures::opGlobalConstant());
+  gv.setInitialValueAttr(init);
+
+  emitter.finalize(gv);
+
+  if (needsDtor) {
+    // We have a constant initializer, but a nontrivial destructor. We still
+    // need to perform a guarded "initialization" in order to register the
+    // destructor.
+    cgm.errorNYI(d.getSourceRange(), "C++ guarded init");
+  }
+
+  return gv;
+}
+
+void CIRGenFunction::emitStaticVarDecl(const VarDecl &d,
+                                       cir::GlobalLinkageKind linkage) {
+  // Check to see if we already have a global variable for this
+  // declaration.  This can happen when double-emitting function
+  // bodies, e.g. with complete and base constructors.
+  cir::GlobalOp globalOp = cgm.getOrCreateStaticVarDecl(d, linkage);
+  // TODO(cir): we should have a way to represent global ops as values without
+  // having to emit a get global op. Sometimes these emissions are not used.
+  mlir::Value addr = builder.createGetGlobal(globalOp);
+  auto getAddrOp = mlir::cast<cir::GetGlobalOp>(addr.getDefiningOp());
+
+  CharUnits alignment = getContext().getDeclAlign(&d);
+
+  // Store into LocalDeclMap before generating initializer to handle
+  // circular references.
+  mlir::Type elemTy = convertTypeForMem(d.getType());
+  setAddrOfLocalVar(&d, Address(addr, elemTy, alignment));
+
+  // We can't have a VLA here, but we can have a pointer to a VLA,
+  // even though that doesn't really make any sense.
+  // Make sure to evaluate VLA bounds now so that we have them for later.
+  if (d.getType()->isVariablyModifiedType()) {
+    cgm.errorNYI(d.getSourceRange(),
+                 "emitStaticVarDecl: variably modified type");
+  }
+
+  // Save the type in case adding the initializer forces a type change.
+  mlir::Type expectedType = addr.getType();
+
+  cir::GlobalOp var = globalOp;
+
+  assert(!cir::MissingFeatures::cudaSupport());
+
+  // If this value has an initializer, emit it.
+  if (d.getInit())
+    var = addInitializerToStaticVarDecl(d, var, getAddrOp);
+
+  var.setAlignment(alignment.getAsAlign().value());
+
+  // There are a lot of attributes that need to be handled here. Until
+  // we start to support them, we just report an error if there are any.
+  if (d.hasAttrs())
+    cgm.errorNYI(d.getSourceRange(), "static var with attrs");
+
+  if (cgm.getCodeGenOpts().KeepPersistentStorageVariables)
+    cgm.errorNYI(d.getSourceRange(), "static var keep persistent storage");
+
+  // From traditional codegen:
+  // We may have to cast the constant because of the initializer
+  // mismatch above.
+  //
+  // FIXME: It is really dangerous to store this in the map; if anyone
+  // RAUW's the GV uses of this constant will be invalid.
+  mlir::Value castedAddr =
+      builder.createBitcast(getAddrOp.getAddr(), expectedType);
+  localDeclMap.find(&d)->second = Address(castedAddr, elemTy, alignment);
+  cgm.setStaticLocalDeclAddress(&d, var);
+
+  assert(!cir::MissingFeatures::sanitizers());
+  assert(!cir::MissingFeatures::generateDebugInfo());
+}
+
 void CIRGenFunction::emitScalarInit(const Expr *init, mlir::Location loc,
                                     LValue lvalue, bool capturedByInit) {
   assert(!cir::MissingFeatures::objCLifetime());
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 9421ea26a429f..318d3fbf3f9e1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -469,6 +469,10 @@ class CIRGenFunction : public CIRGenTypeCache {
   /// compare the result against zero, returning an Int1Ty value.
   mlir::Value evaluateExprAsBool(const clang::Expr *e);
 
+  cir::GlobalOp addInitializerToStaticVarDecl(const VarDecl &d,
+                                              cir::GlobalOp gv,
+                                              cir::GetGlobalOp gvAddr);
+
   /// Set the address of a local variable.
   void setAddrOfLocalVar(const clang::VarDecl *vd, Address addr) {
     assert(!localDeclMap.count(vd) && "Decl already exists in LocalDeclMap!");
@@ -955,6 +959,8 @@ class CIRGenFunction : public CIRGenTypeCache {
   void emitScalarInit(const clang::Expr *init, mlir::Location loc,
                       LValue lvalue, bool capturedByInit = false);
 
+  void emitStaticVarDecl(const VarDecl &d, cir::GlobalLinkageKind linkage);
+
   void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile,
                          clang::QualType ty, bool isInit = false,
                          bool isNontemporal = false);
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index f76fd8e733642..03606dba200fd 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -113,8 +113,21 @@ class CIRGenModule : public CIRGenTypeCache {
 
   mlir::Operation *lastGlobalOp = nullptr;
 
+  llvm::DenseMap<const Decl *, cir::GlobalOp> staticLocalDeclMap;
+
   mlir::Operation *getGlobalValue(llvm::StringRef ref);
 
+  cir::GlobalOp getStaticLocalDeclAddress(const VarDecl *d) {
+    return staticLocalDeclMap[d];
+  }
+
+  void setStaticLocalDeclAddress(const VarDecl *d, cir::GlobalOp c) {
+    staticLocalDeclMap[d] = c;
+  }
+
+  cir::GlobalOp getOrCreateStaticVarDecl(const VarDecl &d,
+                                         cir::GlobalLinkageKind linkage);
+
   /// If the specified mangled name is not in the module, create and return an
   /// mlir::GlobalOp value
   cir::GlobalOp getOrCreateCIRGlobal(llvm::StringRef mangledName, mlir::Type ty,
diff --git a/clang/test/CIR/CodeGen/static-vars.c b/clang/test/CIR/CodeGen/static-vars.c
new file mode 100644
index 0000000000000..f45a41d9a00fc
--- /dev/null
+++ b/clang/test/CIR/CodeGen/static-vars.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+void func1(void) {
+  // Should lower default-initialized static vars.
+  static int i;
+  // CHECK-DAG: cir.global "private" internal dsolocal @func1.i = #cir.int<0> : !s32i
+
+  // Should lower constant-initialized static vars.
+  static int j = 1;
+  // CHECK-DAG: cir.global "private" internal dsolocal @func1.j = #cir.int<1> : !s32i
+
+  // Should properly shadow static vars in nested scopes.
+  {
+    static int j = 2;
+    // CHECK-DAG: cir.global "private" internal dsolocal @func1.j.1 = #cir.int<2> : !s32i
+  }
+  {
+    static int j = 3;
+    // CHECK-DAG: cir.global "private" internal dsolocal @func1.j.2 = #cir.int<3> : !s32i
+  }
+
+  // Should lower basic static vars arithmetics.
+  j++;
+  // CHECK-DAG: %[[#V2:]] = cir.get_global @func1.j : !cir.ptr<!s32i>
+  // CHECK-DAG: %[[#V3:]] = cir.load{{.*}} %[[#V2]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-DAG: %[[#V4:]] = cir.unary(inc, %[[#V3]]) nsw : !s32i, !s32i
+  // CHECK-DAG: cir.store{{.*}} %[[#V4]], %[[#V2]] : !s32i, !cir.ptr<!s32i>
+}
+
+// Should shadow static vars on different functions.
+void func2(void) {
+  static char i;
+  // CHECK-DAG: cir.global "private" internal dsolocal @func2.i = #cir.int<0> : !s8i
+  static float j;
+  // CHECK-DAG: cir.global "private" internal dsolocal @func2.j = #cir.fp<0.000000e+00> : !cir.float
+}
diff --git a/clang/test/CIR/CodeGen/static-vars.cpp b/clang/test/CIR/CodeGen/static-vars.cpp
new file mode 100644
index 0000000000000..9b892c69a6fed
--- /dev/null
+++ b/clang/test/CIR/CodeGen/static-vars.cpp
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t1.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t1.ll %s
+
+void func1(void) {
+  // Should lower default-initialized static vars.
+  static int i;
+  // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1i = #cir.int<0> : !s32i
+
+  // Should lower constant-initialized static vars.
+  static int j = 1;
+  // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j = #cir.int<1> : !s32i
+
+  // Should properly shadow static vars in nested scopes.
+  {
+    static int j = 2;
+    // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j_0 = #cir.int<2> : !s32i
+  }
+  {
+    static int j = 3;
+    // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j_1 = #cir.int<3> : !s32i
+  }
+
+  // Should lower basic static vars arithmetics.
+  j++;
+  // CHECK-DAG: %[[#V2:]] = cir.get_global @_ZZ5func1vE1j : !cir.ptr<!s32i>
+  // CHECK-DAG: %[[#V3:]] = cir.load{{.*}} %[[#V2]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-DAG: %[[#V4:]] = cir.unary(inc, %[[#V3]]) nsw : !s32i, !s32i
+  // CHECK-DAG: cir.store{{.*}} %[[#V4]], %[[#V2]] : !s32i, !cir.ptr<!s32i>
+}
+
+// Should shadow static vars on different functions.
+void func2(void) {
+  static char i;
+  // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func2vE1i = #cir.int<0> : !s8i
+  static float j;
+  // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func2vE1j = #cir.fp<0.000000e+00> : !cir.float
+}
+
+// CHECK-DAG: cir.global linkonce_odr comdat @_ZZ4testvE1c = #cir.int<0> : !s32i
+
+// LLVM-DAG: $_ZZ4testvE1c = comdat any
+// LLVM-DAG: @_ZZ4testvE1c = linkonce_odr global i32 0, comdat, align 4
+
+inline void test() { static int c; }
+// CHECK-LABEL: @_Z4testv
+// CHECK: {{%.*}} = cir.get_global @_ZZ4testvE1c : !cir.ptr<!s32i>
+void foo() { test(); }

From 79e06bf1ae9961c5045134288fd8acc9173f6be2 Mon Sep 17 00:00:00 2001
From: zGoldthorpe <Zach.Goldthorpe@amd.com>
Date: Fri, 13 Jun 2025 12:22:21 -0600
Subject: [PATCH 425/851] [AMDGPU] Extended vector promotion to aggregate
 types. (#143784)

Extends the `amdgpu-promote-alloca-to-vector` pass to also promote
aggregate types whose elements are all the same type to vector
registers.

The motivation for this extension was to account for IR generated by the
frontend containing several singleton struct types containing vectors or
vector-like elements, though the implementation is strictly more
general.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 106 ++++---
 .../CodeGen/AMDGPU/promote-alloca-structs.ll  | 286 ++++++++++++++++++
 2 files changed, 351 insertions(+), 41 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 700dc87d2f821..e90a3a275f67c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -818,6 +818,39 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
   return I;
 }
 
+/// Get the underlying type of a homogeneous aggregate type, or nullptr if the
+/// type is non-homogeneous.
+static Type *getHomogeneousType(Type *Ty) {
+  Type *ElemTy = nullptr;
+  SmallVector<Type *> WorkList;
+  WorkList.push_back(Ty);
+  while (!WorkList.empty()) {
+    Type *CurTy = WorkList.pop_back_val();
+
+    // Check if the current type is an aggregate type.
+    if (auto *VectorTy = dyn_cast<FixedVectorType>(CurTy)) {
+      WorkList.push_back(VectorTy->getElementType());
+      continue;
+    }
+    if (auto *ArrayTy = dyn_cast<ArrayType>(CurTy)) {
+      WorkList.push_back(ArrayTy->getElementType());
+      continue;
+    }
+    if (auto *StructTy = dyn_cast<StructType>(CurTy)) {
+      WorkList.append(StructTy->element_begin(), StructTy->element_end());
+      continue;
+    }
+
+    // If not, it must be the same as all other non-aggregate types.
+    if (!ElemTy)
+      ElemTy = CurTy;
+    else if (ElemTy != CurTy)
+      return nullptr;
+  }
+
+  return ElemTy;
+}
+
 // FIXME: Should try to pick the most likely to be profitable allocas first.
 bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -828,42 +861,42 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   }
 
   Type *AllocaTy = Alloca.getAllocatedType();
-  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
-  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
-    uint64_t NumElems = 1;
-    Type *ElemTy;
-    do {
-      NumElems *= ArrayTy->getNumElements();
-      ElemTy = ArrayTy->getElementType();
-    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
-
-    // Check for array of vectors
-    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
-    if (InnerVectorTy) {
-      NumElems *= InnerVectorTy->getNumElements();
-      ElemTy = InnerVectorTy->getElementType();
-    }
+  Type *ElemTy = getHomogeneousType(AllocaTy);
 
-    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
-      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
-      if (ElementSize > 0) {
-        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-        // Expand vector if required to match padding of inner type,
-        // i.e. odd size subvectors.
-        // Storage size of new vector must match that of alloca for correct
-        // behaviour of byte offsets and GEP computation.
-        if (NumElems * ElementSize != AllocaSize)
-          NumElems = AllocaSize / ElementSize;
-        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
-          VectorTy = FixedVectorType::get(ElemTy, NumElems);
-      }
-    }
+  if (!ElemTy || !VectorType::isValidElementType(ElemTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
+    return false;
   }
 
-  if (!VectorTy) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  if (ElementSize == 0) {
+    LLVM_DEBUG(dbgs() << "  Cannot create vector of zero-sized elements\n");
+    return false;
+  }
+
+  // Calculate the size of the corresponding vector, accounting for padding of
+  // inner types, e.g., odd-sized subvectors. Storage size of new vector must
+  // match that of alloca for correct behaviour of byte offsets and GEP
+  // computation.
+  unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+  unsigned NumElems = AllocaSize / ElementSize;
+  if (NumElems == 0) {
+    LLVM_DEBUG(dbgs() << "  Cannot vectorize an empty aggregate type\n");
     return false;
   }
+  if (NumElems * ElementSize != AllocaSize) {
+    LLVM_DEBUG(
+        dbgs() << "  Cannot convert type into vector of the same size\n");
+    return false;
+  }
+  auto *VectorTy = FixedVectorType::get(ElemTy, NumElems);
+  assert(VectorTy && "Failed to create vector type.");
 
   const unsigned MaxElements =
       (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
@@ -895,15 +928,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
-  Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
-  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
-                         "does not match the type's size\n");
-    return false;
-  }
-  unsigned ElementSize = ElementSizeInBits / 8;
-  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
@@ -943,7 +967,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
new file mode 100644
index 0000000000000..1cdd027fef89d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
@@ -0,0 +1,286 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 %s | FileCheck %s
+
+define i8 @test_v4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_v4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca <4 x i8>, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [4 x i8], align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a2v4i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a2v4i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a2v3i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a2v3i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [2 x <3 x i8>], align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a2a4i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a2a4i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [2 x [4 x i8]], align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_a2a3i8(i48 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_a2a3i8(
+; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca [2 x [3 x i8]], align 4, addrspace(5)
+  store i48 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s1v4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s1v4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {<4 x i8>}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s1a4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s1a4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {[4 x i8]}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s4i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s4i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2v4i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2v4i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2v2i8v4i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2v2i8v4i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2v2i8v3i8(i64 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2v2i8v3i8(
+; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5)
+  store i64 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2s2i8s4i8(i48 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2s2i8s4i8(
+; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <6 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5)
+  store i48 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s2s2i8s3i8(i40 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s2s2i8s3i8(
+; CHECK-SAME: i40 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <5 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i40 [[BITS]] to <5 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <5 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5)
+  store i40 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s3i8s1i8v2i8(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s3i8s1i8v2i8(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+define i8 @test_s3i8i8s0(i16 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_s3i8i8s0(
+; CHECK-SAME: i16 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = freeze <2 x i8> poison
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[BITS]] to <2 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 [[IDX]]
+; CHECK-NEXT:    ret i8 [[TMP2]]
+;
+  %stack = alloca {i8, i8, {}}, align 4, addrspace(5)
+  store i16 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+; heterogeneous element types are not supported
+define i8 @test_heterogeneous(i32 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_heterogeneous(
+; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5)
+; CHECK-NEXT:    store i32 [[BITS]], ptr addrspace(5) [[STACK]], align 4
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT:    ret i8 [[VAL]]
+;
+  %stack = alloca {i8, i8, i16}, align 4, addrspace(5)
+  store i32 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}
+
+; empty types are not supported
+define void @test_empty() {
+; CHECK-LABEL: define void @test_empty() {
+; CHECK-NEXT:    [[STACK:%.*]] = alloca {}, align 4, addrspace(5)
+; CHECK-NEXT:    ret void
+;
+  %stack = alloca {}, align 4, addrspace(5)
+  ret void
+}
+
+; singleton types are not supported
+define i8 @test_singleton(i8 %bits, i64 %idx) {
+; CHECK-LABEL: define i8 @test_singleton(
+; CHECK-SAME: i8 [[BITS:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT:    [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
+; CHECK-NEXT:    store i8 [[BITS]], ptr addrspace(5) [[STACK]], align 1
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
+; CHECK-NEXT:    ret i8 [[VAL]]
+;
+  %stack = alloca {i8, {}}, align 4, addrspace(5)
+  store i8 %bits, ptr addrspace(5) %stack
+  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
+  %val = load i8, ptr addrspace(5) %ptr, align 1
+  ret i8 %val
+}

From a08de429e4ae0baaed23060cbae5c73dc6ffcc5d Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Fri, 13 Jun 2025 14:46:54 -0400
Subject: [PATCH 426/851] [gn] port cc365331af42

---
 llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index fec917c25b190..ca05ac1b24647 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -299,6 +299,7 @@ write_cmake_config("llvm-config") {
     "LLVM_ENABLE_TELEMETRY=",
     "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple",
     "LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING=",
+    "LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING=",
     "LLVM_ENABLE_DUMP=",
     "LLVM_ENABLE_HTTPLIB=",
     "LLVM_FORCE_USE_OLD_TOOLCHAIN=",

From 2f1e6eb6c3e731266052536c3f98cce3a71a316e Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs@fb.com>
Date: Fri, 13 Jun 2025 11:58:48 -0700
Subject: [PATCH 427/851] [BPF] Report an warning if certain insn imm operand
 cannot fit in 32bit (#142989)

Ihor Solodrai reported a case ([1]) where gcc reports an error but clang
ignores that error and proceeds to generate incorrect code. More
specifically, the problematic code looks like:
   if r1 == 0xcafefeeddeadbeef goto <label>

Here, 0xcafefeeddeadbeef needs to be encoded in a 32-bit imm field
of the insns and the 32-bit imm allows sign extenstion to 64-bit imm.
Obviously, 0xcafefeeddeadbeef cannot encode properly.

The compilation failed for gcc with the following error:
  Error: immediate out of range, shall fit in 32 bits

Given a 64-bit imm value, converting to the proper 32-bit imm value
must satisfy the following 64-bit patterns:
  00000000 00000000 00000000 00000000 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx
  11111111 11111111 11111111 11111111 1xxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx

So if the top 32-bits is 0 or the top 33-bits is 0x1ffffffff, then the 64-bit imm
value can be truncated into proper 32-bit imm. Otherwise, a warning
message, the same as gcc, will be issued. If -Werror is enabled during
compilation, the warning will turn into an error.

[1] https://lore.kernel.org/bpf/70affb12-327b-4882-bd1d-afda8b8c6f56@linux.dev/
---
 .../BPF/MCTargetDesc/BPFMCCodeEmitter.cpp     | 22 ++++++++++++++-----
 llvm/test/CodeGen/BPF/warn-cmp.ll             | 15 +++++++++++++
 2 files changed, 31 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/warn-cmp.ll

diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index 10a46f100bbea..bd9d2de58c8b2 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -14,6 +14,7 @@
 #include "MCTargetDesc/BPFMCTargetDesc.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCInst.h"
@@ -33,11 +34,12 @@ namespace {
 class BPFMCCodeEmitter : public MCCodeEmitter {
   const MCRegisterInfo &MRI;
   bool IsLittleEndian;
+  MCContext &Ctx;
 
 public:
   BPFMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri,
-                   bool IsLittleEndian)
-      : MRI(mri), IsLittleEndian(IsLittleEndian) { }
+                   bool IsLittleEndian, MCContext &ctx)
+      : MRI(mri), IsLittleEndian(IsLittleEndian), Ctx(ctx) {}
   BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
   void operator=(const BPFMCCodeEmitter &) = delete;
   ~BPFMCCodeEmitter() override = default;
@@ -67,12 +69,12 @@ class BPFMCCodeEmitter : public MCCodeEmitter {
 
 MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII,
                                             MCContext &Ctx) {
-  return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), true);
+  return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), true, Ctx);
 }
 
 MCCodeEmitter *llvm::createBPFbeMCCodeEmitter(const MCInstrInfo &MCII,
                                               MCContext &Ctx) {
-  return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), false);
+  return new BPFMCCodeEmitter(MCII, *Ctx.getRegisterInfo(), false, Ctx);
 }
 
 unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
@@ -81,8 +83,16 @@ unsigned BPFMCCodeEmitter::getMachineOpValue(const MCInst &MI,
                                              const MCSubtargetInfo &STI) const {
   if (MO.isReg())
     return MRI.getEncodingValue(MO.getReg());
-  if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
+  if (MO.isImm()) {
+    uint64_t Imm = MO.getImm();
+    uint64_t High32Bits = Imm >> 32, High33Bits = Imm >> 31;
+    if (MI.getOpcode() != BPF::LD_imm64 && High32Bits != 0 &&
+        High33Bits != 0x1FFFFFFFFULL) {
+      Ctx.reportWarning(MI.getLoc(),
+                        "immediate out of range, shall fit in 32 bits");
+    }
+    return static_cast<unsigned>(Imm);
+  }
 
   assert(MO.isExpr());
 
diff --git a/llvm/test/CodeGen/BPF/warn-cmp.ll b/llvm/test/CodeGen/BPF/warn-cmp.ll
new file mode 100644
index 0000000000000..109d177b0fb42
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/warn-cmp.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=bpfel -filetype=obj < %s 2>&1 >/dev/null | FileCheck %s
+
+; CHECK: warning: immediate out of range, shall fit in 32 bits
+define dso_local void @test_1() naked {
+  tail call void asm sideeffect
+    "r1 = 40; if r1 == 0x1deadbeef goto +0; r0 = 0; exit;", "~{r0},~{r1}"()
+  unreachable
+}
+
+; CHECK: warning: immediate out of range, shall fit in 32 bits
+define dso_local void @test_2() naked {
+  tail call void asm sideeffect
+    "r1 = 40; if r1 == 0xffffffff00000000 goto +0; r0 = 0; exit;", "~{r0},~{r1}"()
+  unreachable
+}

From 90d98a38b273f5d62424a3815447675860947927 Mon Sep 17 00:00:00 2001
From: Corentin Jabot <corentinjabot@gmail.com>
Date: Fri, 13 Jun 2025 21:05:44 +0200
Subject: [PATCH 428/851] Revert "[Clang] Added explanation why
 `is_constructible` evaluated to false. " (#144127)

Reverts llvm/llvm-project#143309

Someone needs to go through the libc++ tests and update the diagnostics
checks in those tests (ie, i don't believe there was anything wrong with
the PR, but it impacts libc++ tests nonetheless
---
 .../clang/Basic/DiagnosticSemaKinds.td        |  8 +--
 clang/lib/Sema/SemaTypeTraits.cpp             | 71 +------------------
 clang/test/CXX/drs/cwg18xx.cpp                |  3 +-
 ...overload-resolution-deferred-templates.cpp | 19 ++---
 .../type-traits-unsatisfied-diags-std.cpp     | 66 -----------------
 .../SemaCXX/type-traits-unsatisfied-diags.cpp | 62 ----------------
 6 files changed, 10 insertions(+), 219 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 95d24e9f1e6b5..8fe7ad6138aa0 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1767,8 +1767,7 @@ def note_unsatisfied_trait
     : Note<"%0 is not %enum_select<TraitName>{"
            "%TriviallyRelocatable{trivially relocatable}|"
            "%Replaceable{replaceable}|"
-           "%TriviallyCopyable{trivially copyable}|"
-           "%Constructible{constructible with provided types}"
+           "%TriviallyCopyable{trivially copyable}"
            "}1">;
 
 def note_unsatisfied_trait_reason
@@ -1798,10 +1797,7 @@ def note_unsatisfied_trait_reason
            "%DeletedAssign{has a deleted %select{copy|move}1 "
            "assignment operator}|"
            "%UnionWithUserDeclaredSMF{is a union with a user-declared "
-           "%sub{select_special_member_kind}1}|"
-           "%FunctionType{is a function type}|"
-           "%CVVoidType{is a cv void type}|"
-           "%IncompleteArrayType{is an incomplete array type}"
+           "%sub{select_special_member_kind}1}"
            "}0">;
 
 def warn_consteval_if_always_true : Warning<
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index 22c690bedc1ed..1738ab4466001 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/AST/DeclCXX.h"
-#include "clang/AST/TemplateBase.h"
 #include "clang/AST/Type.h"
 #include "clang/Basic/DiagnosticParse.h"
 #include "clang/Basic/DiagnosticSema.h"
@@ -1948,7 +1947,6 @@ static std::optional<TypeTrait> StdNameToTypeTrait(StringRef Name) {
             TypeTrait::UTT_IsCppTriviallyRelocatable)
       .Case("is_replaceable", TypeTrait::UTT_IsReplaceable)
       .Case("is_trivially_copyable", TypeTrait::UTT_IsTriviallyCopyable)
-      .Case("is_constructible", TypeTrait::TT_IsConstructible)
       .Default(std::nullopt);
 }
 
@@ -1985,16 +1983,8 @@ static ExtractedTypeTraitInfo ExtractTypeTraitFromExpression(const Expr *E) {
     Trait = StdNameToTypeTrait(Name);
     if (!Trait)
       return std::nullopt;
-    for (const auto &Arg : VD->getTemplateArgs().asArray()) {
-      if (Arg.getKind() == TemplateArgument::ArgKind::Pack) {
-        for (const auto &InnerArg : Arg.pack_elements())
-          Args.push_back(InnerArg.getAsType());
-      } else if (Arg.getKind() == TemplateArgument::ArgKind::Type) {
-        Args.push_back(Arg.getAsType());
-      } else {
-        llvm_unreachable("Unexpected kind");
-      }
-    }
+    for (const auto &Arg : VD->getTemplateArgs().asArray())
+      Args.push_back(Arg.getAsType());
     return {{Trait.value(), std::move(Args)}};
   }
 
@@ -2267,60 +2257,6 @@ static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
   }
 }
 
-static void DiagnoseNonConstructibleReason(
-    Sema &SemaRef, SourceLocation Loc,
-    const llvm::SmallVector<clang::QualType, 1> &Ts) {
-  if (Ts.empty()) {
-    return;
-  }
-
-  bool ContainsVoid = false;
-  for (const QualType &ArgTy : Ts) {
-    ContainsVoid |= ArgTy->isVoidType();
-  }
-
-  if (ContainsVoid)
-    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
-        << diag::TraitNotSatisfiedReason::CVVoidType;
-
-  QualType T = Ts[0];
-  if (T->isFunctionType())
-    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
-        << diag::TraitNotSatisfiedReason::FunctionType;
-
-  if (T->isIncompleteArrayType())
-    SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason)
-        << diag::TraitNotSatisfiedReason::IncompleteArrayType;
-
-  const CXXRecordDecl *D = T->getAsCXXRecordDecl();
-  if (!D || D->isInvalidDecl() || !D->hasDefinition())
-    return;
-
-  llvm::BumpPtrAllocator OpaqueExprAllocator;
-  SmallVector<Expr *, 2> ArgExprs;
-  ArgExprs.reserve(Ts.size() - 1);
-  for (unsigned I = 1, N = Ts.size(); I != N; ++I) {
-    QualType ArgTy = Ts[I];
-    if (ArgTy->isObjectType() || ArgTy->isFunctionType())
-      ArgTy = SemaRef.Context.getRValueReferenceType(ArgTy);
-    ArgExprs.push_back(
-        new (OpaqueExprAllocator.Allocate<OpaqueValueExpr>())
-            OpaqueValueExpr(Loc, ArgTy.getNonLValueExprType(SemaRef.Context),
-                            Expr::getValueKindForType(ArgTy)));
-  }
-
-  EnterExpressionEvaluationContext Unevaluated(
-      SemaRef, Sema::ExpressionEvaluationContext::Unevaluated);
-  Sema::ContextRAII TUContext(SemaRef,
-                              SemaRef.Context.getTranslationUnitDecl());
-  InitializedEntity To(InitializedEntity::InitializeTemporary(T));
-  InitializationKind InitKind(InitializationKind::CreateDirect(Loc, Loc, Loc));
-  InitializationSequence Init(SemaRef, To, InitKind, ArgExprs);
-
-  Init.Diagnose(SemaRef, To, InitKind, ArgExprs);
-  SemaRef.Diag(D->getLocation(), diag::note_defined_here) << D;
-}
-
 static void DiagnoseNonTriviallyCopyableReason(Sema &SemaRef,
                                                SourceLocation Loc, QualType T) {
   SemaRef.Diag(Loc, diag::note_unsatisfied_trait)
@@ -2360,9 +2296,6 @@ void Sema::DiagnoseTypeTraitDetails(const Expr *E) {
   case UTT_IsTriviallyCopyable:
     DiagnoseNonTriviallyCopyableReason(*this, E->getBeginLoc(), Args[0]);
     break;
-  case TT_IsConstructible:
-    DiagnoseNonConstructibleReason(*this, E->getBeginLoc(), Args);
-    break;
   default:
     break;
   }
diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 9948075852135..5b4551ba0143b 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -564,12 +564,11 @@ struct A {
 namespace ex2 {
 #if __cplusplus >= 201103L
 struct Bar {
-  struct Baz { // #cwg1890-Baz
+  struct Baz {
     int a = 0;
   };
   static_assert(__is_constructible(Baz), "");
   // since-cxx11-error@-1 {{static assertion failed due to requirement '__is_constructible(cwg1890::ex2::Bar::Baz)'}}
-  // since-cxx11-note@#cwg1890-Baz {{'Baz' defined here}}
 };
 #endif
 } // namespace ex2
diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
index 46c3670848529..7cb71e075d50e 100644
--- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
+++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
@@ -80,30 +80,21 @@ struct ImplicitlyCopyable {
 static_assert(__is_constructible(ImplicitlyCopyable, const ImplicitlyCopyable&));
 
 
-struct Movable { // #Movable
+struct Movable {
   template <typename T>
   requires __is_constructible(Movable, T) // #err-self-constraint-1
-  explicit Movable(T op) noexcept; // #Movable1
-  Movable(Movable&&) noexcept = default; // #Movable2
+  explicit Movable(T op) noexcept; // #1
+  Movable(Movable&&) noexcept = default; // #2
 };
 static_assert(__is_constructible(Movable, Movable&&));
 static_assert(__is_constructible(Movable, const Movable&));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}} \
-// expected-error@-1 {{call to implicitly-deleted copy constructor of 'Movable'}} \
-// expected-note@#Movable  {{'Movable' defined here}} \
-// expected-note@#Movable  {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const Movable' for 1st argument}} \
-// expected-note@#Movable2  {{copy constructor is implicitly deleted because 'Movable' has a user-declared move constructor}} \
-// expected-note@#Movable2  {{candidate constructor not viable: no known conversion from 'int' to 'Movable' for 1st argument}} \
-// expected-note@#Movable1  {{candidate template ignored: constraints not satisfied [with T = int]}}
-
+// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, const Movable &)'}}
 
 static_assert(__is_constructible(Movable, int));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \
-// expected-error@-1 {{no matching constructor for initialization of 'Movable'}} \
+// expected-error@-1{{static assertion failed due to requirement '__is_constructible(Movable, int)'}} \
 // expected-note@-1 2{{}}
 // expected-error@#err-self-constraint-1{{satisfaction of constraint '__is_constructible(Movable, T)' depends on itself}}
 // expected-note@#err-self-constraint-1 4{{}}
-// expected-note@#Movable  {{'Movable' defined here}}
 
 template <typename T>
 struct Members {
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
index a403a0450607a..329b611110c1d 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags-std.cpp
@@ -20,14 +20,6 @@ struct is_trivially_copyable {
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T);
-
-template <typename... Args>
-struct is_constructible {
-    static constexpr bool value = __is_constructible(Args...);
-};
-
-template <typename... Args>
-constexpr bool is_constructible_v = __is_constructible(Args...);
 #endif
 
 #ifdef STD2
@@ -52,17 +44,6 @@ using is_trivially_copyable  = __details_is_trivially_copyable<T>;
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = __is_trivially_copyable(T);
-
-template <typename... Args>
-struct __details_is_constructible{
-    static constexpr bool value = __is_constructible(Args...);
-};
-
-template <typename... Args>
-using is_constructible  = __details_is_constructible<Args...>;
-
-template <typename... Args>
-constexpr bool is_constructible_v = __is_constructible(Args...);
 #endif
 
 
@@ -92,15 +73,6 @@ using is_trivially_copyable  = __details_is_trivially_copyable<T>;
 
 template <typename T>
 constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
-
-template <typename... Args>
-struct __details_is_constructible : bool_constant<__is_constructible(Args...)> {};
-
-template <typename... Args>
-using is_constructible  = __details_is_constructible<Args...>;
-
-template <typename... Args>
-constexpr bool is_constructible_v = is_constructible<Args...>::value;
 #endif
 
 }
@@ -128,15 +100,6 @@ static_assert(std::is_trivially_copyable_v<int&>);
 // expected-note@-1 {{because it is a reference type}}
 
 
-static_assert(std::is_constructible<int, int>::value);
-
-static_assert(std::is_constructible<void>::value);
-// expected-error-re@-1 {{static assertion failed due to requirement 'std::{{.*}}is_constructible<void>::value'}} \
-// expected-note@-1 {{because it is a cv void type}}
-static_assert(std::is_constructible_v<void>);
-// expected-error@-1 {{static assertion failed due to requirement 'std::is_constructible_v<void>'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
 namespace test_namespace {
     using namespace std;
     static_assert(is_trivially_relocatable<int&>::value);
@@ -156,13 +119,6 @@ namespace test_namespace {
     // expected-error@-1 {{static assertion failed due to requirement 'is_trivially_copyable_v<int &>'}} \
     // expected-note@-1 {{'int &' is not trivially copyable}} \
     // expected-note@-1 {{because it is a reference type}}
-
-    static_assert(is_constructible<void>::value);
-    // expected-error-re@-1 {{static assertion failed due to requirement '{{.*}}is_constructible<void>::value'}} \
-    // expected-note@-1 {{because it is a cv void type}}
-    static_assert(is_constructible_v<void>);
-    // expected-error@-1 {{static assertion failed due to requirement 'is_constructible_v<void>'}} \
-    // expected-note@-1 {{because it is a cv void type}}
 }
 
 
@@ -183,15 +139,6 @@ concept C2 = std::is_trivially_copyable_v<T>; // #concept4
 
 template <C2 T> void g2();  // #cand4
 
-template <typename... Args>
-requires std::is_constructible<Args...>::value void f3();  // #cand5
-
-template <typename... Args>
-concept C3 = std::is_constructible_v<Args...>; // #concept6
-
-template <C3 T> void g3();  // #cand6
-
-
 void test() {
     f<int&>();
     // expected-error@-1 {{no matching function for call to 'f'}} \
@@ -222,19 +169,6 @@ void test() {
     // expected-note@#concept4 {{because 'std::is_trivially_copyable_v<int &>' evaluated to false}} \
     // expected-note@#concept4 {{'int &' is not trivially copyable}} \
     // expected-note@#concept4 {{because it is a reference type}}
-
-    f3<void>();
-    // expected-error@-1 {{no matching function for call to 'f3'}} \
-    // expected-note@#cand5 {{candidate template ignored: constraints not satisfied [with Args = <void>]}} \
-    // expected-note-re@#cand5 {{because '{{.*}}is_constructible<void>::value' evaluated to false}} \
-    // expected-note@#cand5 {{because it is a cv void type}}
-
-    g3<void>();
-    // expected-error@-1 {{no matching function for call to 'g3'}} \
-    // expected-note@#cand6 {{candidate template ignored: constraints not satisfied [with T = void]}} \
-    // expected-note@#cand6 {{because 'void' does not satisfy 'C3'}} \
-    // expected-note@#concept6 {{because 'std::is_constructible_v<void>' evaluated to false}} \
-    // expected-note@#concept6 {{because it is a cv void type}}
 }
 }
 
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
index d0b3f294fbcab..a8c78f6304ca9 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
@@ -488,65 +488,3 @@ static_assert(__is_trivially_copyable(S12));
 // expected-note@-1 {{'S12' is not trivially copyable}} \
 // expected-note@#tc-S12 {{'S12' defined here}}
 }
-
-namespace constructible {
-
-struct S1 {  // #c-S1
-    S1(int); // #cc-S1
-};
-static_assert(__is_constructible(S1, char*));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S1, char *)'}} \
-// expected-error@-1 {{no matching constructor for initialization of 'S1'}} \
-// expected-note@#c-S1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'char *' to 'const S1' for 1st argument}} \
-// expected-note@#c-S1 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'char *' to 'S1' for 1st argument}} \
-// expected-note@#cc-S1 {{candidate constructor not viable: no known conversion from 'char *' to 'int' for 1st argument; dereference the argument with *}} \
-// expected-note@#c-S1 {{'S1' defined here}}
-
-struct S2 { // #c-S2
-    S2(int, float, double); // #cc-S2
-};
-static_assert(__is_constructible(S2, float));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S2, float)'}} \
-// expected-note@#c-S2 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'float' to 'const S2' for 1st argument}} \
-// expected-note@#c-S2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'float' to 'S2' for 1st argument}} \
-// expected-error@-1 {{no matching constructor for initialization of 'S2'}} \
-// expected-note@#cc-S2 {{candidate constructor not viable: requires 3 arguments, but 1 was provided}} \
-// expected-note@#c-S2 {{'S2' defined here}}
-
-static_assert(__is_constructible(S2, float, void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(constructible::S2, float, void)'}} \
-// expected-note@#c-S2 {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} \
-// expected-note@#c-S2 {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}} \
-// expected-note@-1{{because it is a cv void type}} \
-// expected-error@-1 {{no matching constructor for initialization of 'S2'}} \
-// expected-note@#cc-S2 {{candidate constructor not viable: requires 3 arguments, but 2 were provided}} \
-// expected-note@#c-S2 {{'S2' defined here}}
-
-static_assert(__is_constructible(int[]));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(int[])'}} \
-// expected-note@-1 {{because it is an incomplete array type}}
-
-static_assert(__is_constructible(void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void)'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
-static_assert(__is_constructible(void, void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void, void)'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
-static_assert(__is_constructible(const void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(const void)'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
-static_assert(__is_constructible(volatile void));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(volatile void)'}} \
-// expected-note@-1 {{because it is a cv void type}}
-
-static_assert(__is_constructible(int ()));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(int ())'}} \
-// expected-note@-1 {{because it is a function type}}
-
-static_assert(__is_constructible(void (int, float)));
-// expected-error@-1 {{static assertion failed due to requirement '__is_constructible(void (int, float))'}} \
-// expected-note@-1 {{because it is a function type}}
-}

From 83f215b0350289f3bd349c1f85826a58d8d80f03 Mon Sep 17 00:00:00 2001
From: Fabian Meumertzheim <fabian@meumertzhe.im>
Date: Fri, 13 Jun 2025 21:09:58 +0200
Subject: [PATCH 429/851] Reland "[llvm-cov] Add support for baseline coverage"
 (#144130)

When no profile is provided, but the new --empty-profile option is
specified, the export/report/show commands now emit coverage data
equivalent to that obtained from a profile with all zero counters
("baseline coverage").

This is useful for build systems (e.g. Bazel) that can track coverage
information for each build target, even those that are never linked into
tests and thus don't have runtime coverage data recorded. By merging in
baseline coverage, lines in files that aren't linked into tests are
correctly reported as uncovered.

Reland with fixes to `CoverageMappingTest.cpp`.

Reverts llvm/llvm-project#144121
---
 llvm/docs/CommandGuide/llvm-cov.rst           |  15 +++
 .../ProfileData/Coverage/CoverageMapping.h    |  24 ++--
 .../ProfileData/Coverage/CoverageMapping.cpp  | 123 +++++++++++-------
 ...showLineExecutionCounts-lcov-baseline.test |  37 ++++++
 llvm/tools/llvm-cov/CodeCoverage.cpp          |  78 +++++++----
 .../ProfileData/CoverageMappingTest.cpp       |   4 +-
 6 files changed, 195 insertions(+), 86 deletions(-)
 create mode 100644 llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test

diff --git a/llvm/docs/CommandGuide/llvm-cov.rst b/llvm/docs/CommandGuide/llvm-cov.rst
index 968f3c452f558..f4db60cf06fa7 100644
--- a/llvm/docs/CommandGuide/llvm-cov.rst
+++ b/llvm/docs/CommandGuide/llvm-cov.rst
@@ -380,6 +380,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Display the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 .. program:: llvm-cov report
 
 .. _llvm-cov-report:
@@ -470,6 +475,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Display the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 .. program:: llvm-cov export
 
 .. _llvm-cov-export:
@@ -562,6 +572,11 @@ OPTIONS
  Fail if an object file cannot be found for a binary ID present in the profile,
  neither on the command line nor via binary ID lookup.
 
+.. option:: -empty-profile
+
+ Export the baseline coverage of the binaries with all zero execution counts.
+ Mutually exclusive with -instr-profile.
+
 CONVERT-FOR-TESTING COMMAND
 ---------------------------
 
diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
index e62ce5e3d8fa6..d1230b0ba7c58 100644
--- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -991,18 +991,23 @@ class CoverageMapping {
   // Load coverage records from readers.
   static Error loadFromReaders(
       ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-      IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage);
+      std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+          &ProfileReader,
+      CoverageMapping &Coverage);
 
   // Load coverage records from file.
   static Error
   loadFromFile(StringRef Filename, StringRef Arch, StringRef CompilationDir,
-               IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
-               bool &DataFound,
+               std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+                   &ProfileReader,
+               CoverageMapping &Coverage, bool &DataFound,
                SmallVectorImpl<object::BuildID> *FoundBinaryIDs = nullptr);
 
   /// Add a function record corresponding to \p Record.
-  Error loadFunctionRecord(const CoverageMappingRecord &Record,
-                           IndexedInstrProfReader &ProfileReader);
+  Error loadFunctionRecord(
+      const CoverageMappingRecord &Record,
+      const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+          &ProfileReader);
 
   /// Look up the indices for function records which are at least partially
   /// defined in the specified file. This is guaranteed to return a superset of
@@ -1018,15 +1023,16 @@ class CoverageMapping {
   /// Load the coverage mapping using the given readers.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
   load(ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-       IndexedInstrProfReader &ProfileReader);
+       std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+           &ProfileReader);
 
   /// Load the coverage mapping from the given object files and profile. If
   /// \p Arches is non-empty, it must specify an architecture for each object.
   /// Ignores non-instrumented object files unless all are not instrumented.
   LLVM_ABI static Expected<std::unique_ptr<CoverageMapping>>
-  load(ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
-       vfs::FileSystem &FS, ArrayRef<StringRef> Arches = {},
-       StringRef CompilationDir = "",
+  load(ArrayRef<StringRef> ObjectFilenames,
+       std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
+       ArrayRef<StringRef> Arches = {}, StringRef CompilationDir = "",
        const object::BuildIDFetcher *BIDFetcher = nullptr,
        bool CheckBinaryIDs = false);
 
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index dd74eb054a34c..429ec5c19f1f8 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -823,7 +823,8 @@ class MCDCDecisionRecorder {
 
 Error CoverageMapping::loadFunctionRecord(
     const CoverageMappingRecord &Record,
-    IndexedInstrProfReader &ProfileReader) {
+    const std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader) {
   StringRef OrigFuncName = Record.FunctionName;
   if (OrigFuncName.empty())
     return make_error<CoverageMapError>(coveragemap_error::malformed,
@@ -837,35 +838,44 @@ Error CoverageMapping::loadFunctionRecord(
   CounterMappingContext Ctx(Record.Expressions);
 
   std::vector<uint64_t> Counts;
-  if (Error E = ProfileReader.getFunctionCounts(Record.FunctionName,
-                                                Record.FunctionHash, Counts)) {
-    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-    if (IPE == instrprof_error::hash_mismatch) {
-      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                      Record.FunctionHash);
-      return Error::success();
+  if (ProfileReader) {
+    if (Error E = ProfileReader.value().get().getFunctionCounts(
+            Record.FunctionName, Record.FunctionHash, Counts)) {
+      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+      if (IPE == instrprof_error::hash_mismatch) {
+        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                        Record.FunctionHash);
+        return Error::success();
+      }
+      if (IPE != instrprof_error::unknown_function)
+        return make_error<InstrProfError>(IPE);
+      Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
     }
-    if (IPE != instrprof_error::unknown_function)
-      return make_error<InstrProfError>(IPE);
+  } else {
     Counts.assign(getMaxCounterID(Ctx, Record) + 1, 0);
   }
   Ctx.setCounts(Counts);
 
   bool IsVersion11 =
-      ProfileReader.getVersion() < IndexedInstrProf::ProfVersion::Version12;
+      ProfileReader && ProfileReader.value().get().getVersion() <
+                           IndexedInstrProf::ProfVersion::Version12;
 
   BitVector Bitmap;
-  if (Error E = ProfileReader.getFunctionBitmap(Record.FunctionName,
-                                                Record.FunctionHash, Bitmap)) {
-    instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
-    if (IPE == instrprof_error::hash_mismatch) {
-      FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
-                                      Record.FunctionHash);
-      return Error::success();
+  if (ProfileReader) {
+    if (Error E = ProfileReader.value().get().getFunctionBitmap(
+            Record.FunctionName, Record.FunctionHash, Bitmap)) {
+      instrprof_error IPE = std::get<0>(InstrProfError::take(std::move(E)));
+      if (IPE == instrprof_error::hash_mismatch) {
+        FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+                                        Record.FunctionHash);
+        return Error::success();
+      }
+      if (IPE != instrprof_error::unknown_function)
+        return make_error<InstrProfError>(IPE);
+      Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
     }
-    if (IPE != instrprof_error::unknown_function)
-      return make_error<InstrProfError>(IPE);
-    Bitmap = BitVector(getMaxBitmapSize(Record, IsVersion11));
+  } else {
+    Bitmap = BitVector(getMaxBitmapSize(Record, false));
   }
   Ctx.setBitmap(std::move(Bitmap));
 
@@ -959,10 +969,14 @@ Error CoverageMapping::loadFunctionRecord(
 // of CoverageMappingReader instances.
 Error CoverageMapping::loadFromReaders(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage) {
-  assert(!Coverage.SingleByteCoverage ||
-         *Coverage.SingleByteCoverage == ProfileReader.hasSingleByteCoverage());
-  Coverage.SingleByteCoverage = ProfileReader.hasSingleByteCoverage();
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader,
+    CoverageMapping &Coverage) {
+  assert(!Coverage.SingleByteCoverage || !ProfileReader ||
+         *Coverage.SingleByteCoverage ==
+             ProfileReader.value().get().hasSingleByteCoverage());
+  Coverage.SingleByteCoverage =
+      !ProfileReader || ProfileReader.value().get().hasSingleByteCoverage();
   for (const auto &CoverageReader : CoverageReaders) {
     for (auto RecordOrErr : *CoverageReader) {
       if (Error E = RecordOrErr.takeError())
@@ -977,7 +991,8 @@ Error CoverageMapping::loadFromReaders(
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
     ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
-    IndexedInstrProfReader &ProfileReader) {
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader) {
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   if (Error E = loadFromReaders(CoverageReaders, ProfileReader, *Coverage))
     return std::move(E);
@@ -986,18 +1001,19 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
 // If E is a no_data_found error, returns success. Otherwise returns E.
 static Error handleMaybeNoDataFoundError(Error E) {
-  return handleErrors(
-      std::move(E), [](const CoverageMapError &CME) {
-        if (CME.get() == coveragemap_error::no_data_found)
-          return static_cast<Error>(Error::success());
-        return make_error<CoverageMapError>(CME.get(), CME.getMessage());
-      });
+  return handleErrors(std::move(E), [](const CoverageMapError &CME) {
+    if (CME.get() == coveragemap_error::no_data_found)
+      return static_cast<Error>(Error::success());
+    return make_error<CoverageMapError>(CME.get(), CME.getMessage());
+  });
 }
 
 Error CoverageMapping::loadFromFile(
     StringRef Filename, StringRef Arch, StringRef CompilationDir,
-    IndexedInstrProfReader &ProfileReader, CoverageMapping &Coverage,
-    bool &DataFound, SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
+    std::optional<std::reference_wrapper<IndexedInstrProfReader>>
+        &ProfileReader,
+    CoverageMapping &Coverage, bool &DataFound,
+    SmallVectorImpl<object::BuildID> *FoundBinaryIDs) {
   auto CovMappingBufOrErr = MemoryBuffer::getFileOrSTDIN(
       Filename, /*IsText=*/false, /*RequiresNullTerminator=*/false);
   if (std::error_code EC = CovMappingBufOrErr.getError())
@@ -1033,13 +1049,23 @@ Error CoverageMapping::loadFromFile(
 }
 
 Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
-    ArrayRef<StringRef> ObjectFilenames, StringRef ProfileFilename,
-    vfs::FileSystem &FS, ArrayRef<StringRef> Arches, StringRef CompilationDir,
+    ArrayRef<StringRef> ObjectFilenames,
+    std::optional<StringRef> ProfileFilename, vfs::FileSystem &FS,
+    ArrayRef<StringRef> Arches, StringRef CompilationDir,
     const object::BuildIDFetcher *BIDFetcher, bool CheckBinaryIDs) {
-  auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename, FS);
-  if (Error E = ProfileReaderOrErr.takeError())
-    return createFileError(ProfileFilename, std::move(E));
-  auto ProfileReader = std::move(ProfileReaderOrErr.get());
+  std::unique_ptr<IndexedInstrProfReader> ProfileReader;
+  if (ProfileFilename) {
+    auto ProfileReaderOrErr =
+        IndexedInstrProfReader::create(ProfileFilename.value(), FS);
+    if (Error E = ProfileReaderOrErr.takeError())
+      return createFileError(ProfileFilename.value(), std::move(E));
+    ProfileReader = std::move(ProfileReaderOrErr.get());
+  }
+  auto ProfileReaderRef =
+      ProfileReader
+          ? std::optional<std::reference_wrapper<IndexedInstrProfReader>>(
+                *ProfileReader)
+          : std::nullopt;
   auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
   bool DataFound = false;
 
@@ -1053,16 +1079,17 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
 
   SmallVector<object::BuildID> FoundBinaryIDs;
   for (const auto &File : llvm::enumerate(ObjectFilenames)) {
-    if (Error E =
-            loadFromFile(File.value(), GetArch(File.index()), CompilationDir,
-                         *ProfileReader, *Coverage, DataFound, &FoundBinaryIDs))
+    if (Error E = loadFromFile(File.value(), GetArch(File.index()),
+                               CompilationDir, ProfileReaderRef, *Coverage,
+                               DataFound, &FoundBinaryIDs))
       return std::move(E);
   }
 
   if (BIDFetcher) {
     std::vector<object::BuildID> ProfileBinaryIDs;
-    if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
-      return createFileError(ProfileFilename, std::move(E));
+    if (ProfileReader)
+      if (Error E = ProfileReader->readBinaryIds(ProfileBinaryIDs))
+        return createFileError(ProfileFilename.value(), std::move(E));
 
     SmallVector<object::BuildIDRef> BinaryIDsToFetch;
     if (!ProfileBinaryIDs.empty()) {
@@ -1082,12 +1109,12 @@ Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
       if (PathOpt) {
         std::string Path = std::move(*PathOpt);
         StringRef Arch = Arches.size() == 1 ? Arches.front() : StringRef();
-        if (Error E = loadFromFile(Path, Arch, CompilationDir, *ProfileReader,
-                                  *Coverage, DataFound))
+        if (Error E = loadFromFile(Path, Arch, CompilationDir, ProfileReaderRef,
+                                   *Coverage, DataFound))
           return std::move(E);
       } else if (CheckBinaryIDs) {
         return createFileError(
-            ProfileFilename,
+            ProfileFilename.value(),
             createStringError(errc::no_such_file_or_directory,
                               "Missing binary ID: " +
                                   llvm::toHex(BinaryID, /*LowerCase=*/true)));
diff --git a/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
new file mode 100644
index 0000000000000..bce886bdf510b
--- /dev/null
+++ b/llvm/test/tools/llvm-cov/showLineExecutionCounts-lcov-baseline.test
@@ -0,0 +1,37 @@
+// FULL: SF:{{.*}}showLineExecutionCounts.cpp
+// FULL: FN:6,main
+// FULL: FNDA:0,main
+// FULL: FNF:1
+// FULL: FNH:0
+int main() {                              // FULL: DA:[[@LINE]],0
+  int x = 0;                              // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  if (x) {                                // FULL: DA:[[@LINE]],0
+    x = 0;                                // FULL: DA:[[@LINE]],0
+  } else {                                // FULL: DA:[[@LINE]],0
+    x = 1;                                // FULL: DA:[[@LINE]],0
+  }                                       // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  for (int i = 0; i < 100; ++i) {         // FULL: DA:[[@LINE]],0
+    x = 1;                                // FULL: DA:[[@LINE]],0
+  }                                       // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  x = x < 10 ? x + 1 : x - 1;             // FULL: DA:[[@LINE]],0
+  x = x > 10 ?                            // FULL: DA:[[@LINE]],0
+        x - 1:                            // FULL: DA:[[@LINE]],0
+        x + 1;                            // FULL: DA:[[@LINE]],0
+                                          // FULL: DA:[[@LINE]],0
+  return 0;                               // FULL: DA:[[@LINE]],0
+}                                         // FULL: DA:[[@LINE]],0
+// FULL: LF:20
+// FULL: LH:0
+// FULL: end_of_record
+// RUN: llvm-cov export -format=lcov %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=FULL %s
+
+// RUN: llvm-cov export -format=lcov -summary-only %S/Inputs/lineExecutionCounts.covmapping -empty-profile %s | FileCheck -check-prefixes=SUMMARYONLY %s
+// SUMMARYONLY: SF:{{.*}}showLineExecutionCounts.cpp
+// SUMMARYONLY: FNF:1
+// SUMMARYONLY: FNH:0
+// SUMMARYONLY: LF:20
+// SUMMARYONLY: LH:0
+// SUMMARYONLY: end_of_record
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
index 1f2484cd4dda9..6c66858c4de8c 100644
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -153,7 +153,7 @@ class CodeCoverageTool {
   bool HadSourceFiles = false;
 
   /// The path to the indexed profile.
-  std::string PGOFilename;
+  std::optional<std::string> PGOFilename;
 
   /// A list of input source files.
   std::vector<std::string> SourceFiles;
@@ -455,10 +455,12 @@ static bool modifiedTimeGT(StringRef LHS, StringRef RHS) {
 }
 
 std::unique_ptr<CoverageMapping> CodeCoverageTool::load() {
-  for (StringRef ObjectFilename : ObjectFilenames)
-    if (modifiedTimeGT(ObjectFilename, PGOFilename))
-      warning("profile data may be out of date - object is newer",
-              ObjectFilename);
+  if (PGOFilename) {
+    for (StringRef ObjectFilename : ObjectFilenames)
+      if (modifiedTimeGT(ObjectFilename, PGOFilename.value()))
+        warning("profile data may be out of date - object is newer",
+                ObjectFilename);
+  }
   auto FS = vfs::getRealFileSystem();
   auto CoverageOrErr = CoverageMapping::load(
       ObjectFilenames, PGOFilename, *FS, CoverageArches,
@@ -668,11 +670,16 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
       "dump-collected-paths", cl::Optional, cl::Hidden,
       cl::desc("Show the collected paths to source files"));
 
-  cl::opt<std::string, true> PGOFilename(
-      "instr-profile", cl::Required, cl::location(this->PGOFilename),
+  cl::opt<std::string> PGOFilename(
+      "instr-profile", cl::Optional,
       cl::desc(
           "File with the profile data obtained after an instrumented run"));
 
+  cl::opt<bool> EmptyProfile(
+      "empty-profile", cl::Optional,
+      cl::desc("Use a synthetic profile with no data to generate "
+               "baseline coverage"));
+
   cl::list<std::string> Arches(
       "arch", cl::desc("architectures of the coverage mapping binaries"));
 
@@ -805,6 +812,15 @@ int CodeCoverageTool::run(Command Cmd, int argc, const char **argv) {
     }
     this->CheckBinaryIDs = CheckBinaryIDs;
 
+    if (!PGOFilename.empty() == EmptyProfile) {
+      error(
+          "exactly one of -instr-profile and -empty-profile must be specified");
+      return 1;
+    }
+    if (!PGOFilename.empty()) {
+      this->PGOFilename = std::make_optional(PGOFilename.getValue());
+    }
+
     if (!CovFilename.empty())
       ObjectFilenames.emplace_back(CovFilename);
     for (const std::string &Filename : CovFilenames)
@@ -1116,20 +1132,22 @@ int CodeCoverageTool::doShow(int argc, const char **argv,
     }
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
-  }
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
 
-  if (ShowCreatedTime) {
-    auto ModifiedTime = Status.getLastModificationTime();
-    std::string ModifiedTimeStr = to_string(ModifiedTime);
-    size_t found = ModifiedTimeStr.rfind(':');
-    ViewOpts.CreatedTimeStr =
-        (found != std::string::npos)
-            ? "Created: " + ModifiedTimeStr.substr(0, found)
-            : "Created: " + ModifiedTimeStr;
+    if (ShowCreatedTime) {
+      auto ModifiedTime = Status.getLastModificationTime();
+      std::string ModifiedTimeStr = to_string(ModifiedTime);
+      size_t found = ModifiedTimeStr.rfind(':');
+      ViewOpts.CreatedTimeStr =
+          (found != std::string::npos)
+              ? "Created: " + ModifiedTimeStr.substr(0, found)
+              : "Created: " + ModifiedTimeStr;
+    }
   }
 
   auto Coverage = load();
@@ -1238,10 +1256,12 @@ int CodeCoverageTool::doReport(int argc, const char **argv,
     return 1;
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
   }
 
   auto Coverage = load();
@@ -1303,10 +1323,12 @@ int CodeCoverageTool::doExport(int argc, const char **argv,
     return 1;
   }
 
-  sys::fs::file_status Status;
-  if (std::error_code EC = sys::fs::status(PGOFilename, Status)) {
-    error("could not read profile data!" + EC.message(), PGOFilename);
-    return 1;
+  if (PGOFilename) {
+    sys::fs::file_status Status;
+    if (std::error_code EC = sys::fs::status(PGOFilename.value(), Status)) {
+      error("could not read profile data!" + EC.message(), PGOFilename.value());
+      return 1;
+    }
   }
 
   auto Coverage = load();
diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
index 46f881ecddb5f..ec81e5f274efa 100644
--- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp
@@ -277,7 +277,9 @@ struct CoverageMappingTest : ::testing::TestWithParam<std::tuple<bool, bool>> {
       CoverageReaders.push_back(
           std::make_unique<CoverageMappingReaderMock>(Funcs));
     }
-    return CoverageMapping::load(CoverageReaders, *ProfileReader);
+    auto ProfileReaderRef = std::make_optional(
+        std::reference_wrapper<IndexedInstrProfReader>(*ProfileReader));
+    return CoverageMapping::load(CoverageReaders, ProfileReaderRef);
   }
 
   Error loadCoverageMapping(bool EmitFilenames = true) {

From f952af30fd2efbf6effa3e845f0e49a9f0e2302d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 12:25:31 -0700
Subject: [PATCH 430/851] [clang][docs][RISCV] Prepend the HelpText for
 -mrvv-vector-bits into the DocBrief. (#144128)

The DocBrief is used to generate the webpage description of the option.
The current text only talks about the possible values, but not what the
option does.
---
 clang/include/clang/Driver/Options.td | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 5951687b095e4..1b07deb4a8482 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5056,9 +5056,10 @@ def mrvv_vector_bits_EQ : Joined<["-"], "mrvv-vector-bits=">, Group<m_Group>,
   Visibility<[ClangOption, FlangOption]>,
   HelpText<"Specify the size in bits of an RVV vector register">,
   DocBrief<!strconcat(
-    "Defaults to the vector length agnostic value of \"scalable\". "
-    "Accepts power of 2 values between 64 and 65536. Also accepts "
-    "\"zvl\" to use the value implied by -march/-mcpu.",
+    "Specify the size in bits of an RVV vector register. Defaults to the "
+    "vector length agnostic value of \"scalable\". Accepts power of 2 values "
+    "between 64 and 65536. Also accepts \"zvl\" to use the value implied by "
+    "-march/-mcpu.",
     !cond(
       // Flang does not set the preprocessor define.
       !eq(GlobalDocumentation.Program, "Flang") : "",

From acc58ac8bf792d78233daf913565e2cbb61a8f5c Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 12:37:20 -0700
Subject: [PATCH 431/851] [bazel] Add missing dep for 52d34865b9db3485c
 (#144147)

---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index b54ac1728a598..31855cd5444c2 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -1631,6 +1631,7 @@ cc_library(
     copts = llvm_copts,
     deps = [
         ":Analysis",
+        ":ProfileData",
         ":Support",
         ":TargetParser",
     ],

From b7cb34840cd1e8cea932f04d5b4e34b4056cb6de Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 12:40:44 -0700
Subject: [PATCH 432/851] [CIR] Enable floating point casts (#144142)

We already had the code in place to emit CIR floating point cast ops
that get lowered to fpext or fptrunc, but we weren't calling the
function to emit that cast from ScalarExprEmitter::emitScalarCast. This
change adds that call.
---
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 4 ++--
 clang/test/CIR/CodeGen/cast.cpp            | 8 ++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 30d231e2c61de..baaef022ccc68 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -346,8 +346,8 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
         assert(!cir::MissingFeatures::fpConstraints());
         castKind = cir::CastKind::float_to_int;
       } else if (mlir::isa<cir::CIRFPTypeInterface>(dstTy)) {
-        cgf.getCIRGenModule().errorNYI("floating point casts");
-        return cgf.createDummyValue(src.getLoc(), dstType);
+        // TODO: split this to createFPExt/createFPTrunc
+        return builder.createFloatingCast(src, fullDstTy);
       } else {
         llvm_unreachable("Internal error: Cast to unexpected type");
       }
diff --git a/clang/test/CIR/CodeGen/cast.cpp b/clang/test/CIR/CodeGen/cast.cpp
index a7c11b1939ba5..84f55242a6118 100644
--- a/clang/test/CIR/CodeGen/cast.cpp
+++ b/clang/test/CIR/CodeGen/cast.cpp
@@ -73,6 +73,14 @@ int cStyleCasts_0(unsigned x1, int x2, float x3, short x4, double x5) {
   // LLVM: %{{[0-9]+}} = fcmp une float %{{[0-9]+}}, 0.000000e+00
   // LLVM: %{{[0-9]+}} = zext i1 %{{[0-9]+}} to i8
 
+  double d2 = f; // float to double
+  // CIR: %{{[0-9]+}} = cir.cast(floating, %{{[0-9]+}} : !cir.float), !cir.double
+  // LLVM: %{{[0-9]+}} = fpext float %{{[0-9]+}} to double
+
+  f = d2; // double to float
+  // CIR: %{{[0-9]+}} = cir.cast(floating, %{{[0-9]+}} : !cir.double), !cir.float
+  // LLVM: %{{[0-9]+}} = fptrunc double %{{[0-9]+}} to float
+
   return 0;
 }
 

From 65eaed7d5a08210cd5b419f45845d5de81435d7e Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Fri, 13 Jun 2025 12:40:57 -0700
Subject: [PATCH 433/851] [CIR] Handle character literal values (#144141)

This change adds a handler for emitting a cir.constant op when a
character literal is encountered outside an initializer expression.
---
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp |  6 ++++++
 clang/test/CIR/CodeGen/basic.c             | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index baaef022ccc68..75b4d2a637e6e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -162,6 +162,12 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
         builder.getAttr<cir::FPAttr>(type, e->getValue()));
   }
 
+  mlir::Value VisitCharacterLiteral(const CharacterLiteral *e) {
+    mlir::Type ty = cgf.convertType(e->getType());
+    auto init = cir::IntAttr::get(ty, e->getValue());
+    return builder.create<cir::ConstantOp>(cgf.getLoc(e->getExprLoc()), init);
+  }
+
   mlir::Value VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *e) {
     return builder.getBool(e->getValue(), cgf.getLoc(e->getExprLoc()));
   }
diff --git a/clang/test/CIR/CodeGen/basic.c b/clang/test/CIR/CodeGen/basic.c
index abc1a45fd433f..7ff73ee95f799 100644
--- a/clang/test/CIR/CodeGen/basic.c
+++ b/clang/test/CIR/CodeGen/basic.c
@@ -309,3 +309,17 @@ size_type max_size(void) {
 // CHECK:   %6 = cir.load{{.*}} %0 : !cir.ptr<!u64i>, !u64i
 // CHECK:   cir.return %6 : !u64i
 // CHECK:   }
+
+void test_char_literal() {
+  char c;
+  c = 'X';
+}
+
+// CIR: cir.func @test_char_literal
+// CIR:   cir.const #cir.int<88>
+
+// LLVM: define void @test_char_literal()
+// LLVM:   store i8 88, ptr %{{.*}}, align 1
+
+// OGCG: define{{.*}} void @test_char_literal()
+// OGCG:   store i8 88, ptr %{{.*}}, align 1

From f5df231d8caece81fd800b921cf4fbd7774e2885 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Fri, 13 Jun 2025 12:45:34 -0700
Subject: [PATCH 434/851] [LV] Fix test line and regen an autogen test

---
 .../RISCV/riscv-vector-reverse.ll             | 611 +++++++++++++-----
 1 file changed, 466 insertions(+), 145 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 7ec9749eb87ed..b026e68685812 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; This is the loop in c++ being vectorize in this file with
 ;vector.reverse
 ;  #pragma clang loop vectorize_width(4, scalable)
@@ -46,66 +46,100 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV: Using user VF vscale x 4.
-; CHECK:       LV: Loop does not require scalar epilogue
-; CHECK:       LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  Creating VPBasicBlock for for.body
+; CHECK-NEXT:  VPlan 'Plain CFG
+; CHECK-NEXT:   for UF>=1' {
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    IR %0 = zext i32 %n to i64
+; CHECK-NEXT:  Successor(s): for.body
+; CHECK-EMPTY:
+; CHECK-NEXT:  for.body:
+; CHECK-NEXT:    WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
+; CHECK-NEXT:    EMIT ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:    EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
+; CHECK-NEXT:    EMIT ir<%1> = load ir<%arrayidx>
+; CHECK-NEXT:    EMIT ir<%add9> = add ir<%1>, ir<1>
+; CHECK-NEXT:    EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom>
+; CHECK-NEXT:    EMIT store ir<%add9>, ir<%arrayidx3>
+; CHECK-NEXT:    EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1>
+; CHECK-NEXT:    EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1>
+; CHECK-NEXT:    EMIT branch-on-cond ir<%cmp>
+; CHECK-NEXT:  Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
 ; CHECK-NEXT:  LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
+; CHECK-NEXT:  Live-in vp<%0> = VF
+; CHECK-NEXT:  Live-in vp<%1> = VF * UF
+; CHECK-NEXT:  Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
 ; CHECK-NEXT:    IR %0 = zext i32 %n to i64
-; CHECK-NEXT:    EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1>
+; CHECK-NEXT:    vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:      vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT:      EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
+; CHECK-NEXT:      vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
+; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
 ; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<%1> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
+; CHECK-NEXT:      WIDEN ir<%1> = load vp<%9>
 ; CHECK-NEXT:      WIDEN ir<%add9> = add ir<%1>, ir<1>
 ; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
-; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:      vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
+; CHECK-NEXT:      WIDEN store vp<%10>, ir<%add9>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<%2>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
-; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT:    IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph)
+; CHECK-NEXT:    IR %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    IR %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:    IR %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:    IR %add9 = add i32 %1, 1
+; CHECK-NEXT:    IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:    IR store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:    IR %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
@@ -151,85 +185,212 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
-; CHECK:       Executing best plan with VF=vscale x 4, UF=1
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT:  Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
-; CHECK-NEXT:  Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  ir<%0> = original trip-count
+; CHECK-NEXT:  Live-in ir<%18> = VF
+; CHECK-NEXT:  Live-in ir<%18>.1 = VF * UF
+; CHECK-NEXT:  Live-in ir<%n.vec> = vector-trip-count
+; CHECK-NEXT:  Live-in ir<%0> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
 ; CHECK-NEXT:    IR %0 = zext i32 %n to i64
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.scevcheck>:
-; CHECK-NEXT:    IR   %3 = add nsw i64 %0, -1
-; CHECK-NEXT:    IR   %4 = add i32 %n, -1
-; CHECK-NEXT:    IR   %5 = trunc i64 %3 to i32
-; CHECK-NEXT:    IR   %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
-; CHECK-NEXT:    IR   %mul.result = extractvalue { i32, i1 } %mul, 0
-; CHECK-NEXT:    IR   %mul.overflow = extractvalue { i32, i1 } %mul, 1
-; CHECK-NEXT:    IR   %6 = sub i32 %4, %mul.result
-; CHECK-NEXT:    IR   %7 = icmp ugt i32 %6, %4
-; CHECK-NEXT:    IR   %8 = or i1 %7, %mul.overflow
-; CHECK-NEXT:    IR   %9 = icmp ugt i64 %3, 4294967295
-; CHECK-NEXT:    IR   %10 = or i1 %8, %9
+; CHECK-NEXT:    IR %3 = add nsw i64 %0, -1
+; CHECK-NEXT:    IR %4 = add i32 %n, -1
+; CHECK-NEXT:    IR %5 = trunc i64 %3 to i32
+; CHECK-NEXT:    IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+; CHECK-NEXT:    IR %mul.result = extractvalue { i32, i1 } %mul, 0
+; CHECK-NEXT:    IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
+; CHECK-NEXT:    IR %6 = sub i32 %4, %mul.result
+; CHECK-NEXT:    IR %7 = icmp ugt i32 %6, %4
+; CHECK-NEXT:    IR %8 = or i1 %7, %mul.overflow
+; CHECK-NEXT:    IR %9 = icmp ugt i64 %3, 4294967295
+; CHECK-NEXT:    IR %10 = or i1 %8, %9
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.memcheck>:
-; CHECK-NEXT:    IR   %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %12 = mul i64 %11, 4
-; CHECK-NEXT:    IR   %13 = mul i64 %12, 4
-; CHECK-NEXT:    IR   %14 = sub i64 %B1, %A2
-; CHECK-NEXT:    IR   %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT:    IR %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %12 = mul i64 %11, 4
+; CHECK-NEXT:    IR %13 = mul i64 %12, 4
+; CHECK-NEXT:    IR %14 = sub i64 %B1, %A2
+; CHECK-NEXT:    IR %diff.check = icmp ult i64 %14, %13
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.ph>:
-; CHECK-NEXT:    IR   %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
-; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
-; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    IR %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %16 = mul i64 %15, 4
+; CHECK-NEXT:    IR %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT:    IR %n.vec = sub i64 %0, %n.mod.vf
+; CHECK-NEXT:    IR %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT:    vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.body:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT:    vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT:    EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:    vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
+; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
 ; CHECK-NEXT:    CLONE ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT:    WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:    WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
+; CHECK-NEXT:    vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
+; CHECK-NEXT:    WIDEN ir<%19> = load vp<%4>
+; CHECK-NEXT:    WIDEN ir<%add9> = add ir<%19>, ir<1>
 ; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
-; CHECK-NEXT:    EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
-; CHECK-NEXT:    EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+; CHECK-NEXT:    vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT:    WIDEN store vp<%5>, ir<%add9>
+; CHECK-NEXT:    EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
+; CHECK-NEXT:    EMIT branch-on-count vp<%index.next>, ir<%n.vec>
 ; CHECK-NEXT:  Successor(s): middle.block, vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME_1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME_2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME_1]]> from ir-bb<scalar.ph>)
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME_2]]>.1 from ir-bb<scalar.ph>)
-; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT:    IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>)
+; CHECK-NEXT:    IR %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    IR %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    IR %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:    IR %19 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:    IR %add9 = add i32 %19, 1
+; CHECK-NEXT:    IR %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:    IR store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:    IR %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.body.preheader> in BB:for.body.preheader
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.body.preheader: ; preds = %entry
+; CHECK-NEXT:    %0 = zext i32 %n to i64
+; CHECK-NEXT:    %1 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %2 = mul i64 %1, 4
+; CHECK-NEXT:    %min.iters.check = icmp ult i64 %0, %2
+; CHECK-NEXT:    br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.scevcheck> in BB:vector.scevcheck
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.scevcheck: ; preds = %for.body.preheader
+; CHECK-NEXT:    %3 = add nsw i64 %0, -1
+; CHECK-NEXT:    %4 = add i32 %n, -1
+; CHECK-NEXT:    %5 = trunc i64 %3 to i32
+; CHECK-NEXT:    %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+; CHECK-NEXT:    %mul.result = extractvalue { i32, i1 } %mul, 0
+; CHECK-NEXT:    %mul.overflow = extractvalue { i32, i1 } %mul, 1
+; CHECK-NEXT:    %6 = sub i32 %4, %mul.result
+; CHECK-NEXT:    %7 = icmp ugt i32 %6, %4
+; CHECK-NEXT:    %8 = or i1 %7, %mul.overflow
+; CHECK-NEXT:    %9 = icmp ugt i64 %3, 4294967295
+; CHECK-NEXT:    %10 = or i1 %8, %9
+; CHECK-NEXT:    br i1 %10, label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:  LV: draw edge fromfor.body.preheader
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.memcheck> in BB:vector.memcheck
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.memcheck: ; preds = %vector.scevcheck
+; CHECK-NEXT:    %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %12 = mul i64 %11, 4
+; CHECK-NEXT:    %13 = mul i64 %12, 4
+; CHECK-NEXT:    %14 = sub i64 %B1, %A2
+; CHECK-NEXT:    %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT:    br i1 %diff.check, label %scalar.ph, label %vector.ph
+; CHECK-NEXT:  LV: draw edge fromvector.scevcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.ph> in BB:vector.ph
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.ph: ; preds = %vector.memcheck
+; CHECK-NEXT:    %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %16 = mul i64 %15, 4
+; CHECK-NEXT:    %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT:    %n.vec = sub i64 %0, %n.mod.vf
+; CHECK-NEXT:    %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %18 = mul i64 %17, 4
+; CHECK-NEXT:    %19 = sub i64 %0, %n.vec
+; CHECK-NEXT:    %.cast = trunc i64 %n.vec to i32
+; CHECK-NEXT:    %20 = sub i32 %n, %.cast
+; CHECK-NEXT:    br
+; CHECK-NEXT:  LV: draw edge fromvector.memcheck
+; CHECK-NEXT:  LV: created vector.body
+; CHECK-NEXT:  LV: draw edge fromvector.ph
+; CHECK-NEXT:  LV: vectorizing VPBB:vector.body in BB:vector.body
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.body: ; preds = %vector.body, %vector.ph
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ]
+; CHECK-NEXT:    %.cast3 = trunc i64 %index to i32
+; CHECK-NEXT:    %offset.idx = sub i32 %n, %.cast3
+; CHECK-NEXT:    %21 = add nsw i32 %offset.idx, -1
+; CHECK-NEXT:    %22 = zext i32 %21 to i64
+; CHECK-NEXT:    %23 = getelementptr inbounds i32, ptr %B, i64 %22
+; CHECK-NEXT:    %24 = mul i64 0, %18
+; CHECK-NEXT:    %25 = sub i64 1, %18
+; CHECK-NEXT:    %26 = getelementptr inbounds i32, ptr %23, i64 %24
+; CHECK-NEXT:    %27 = getelementptr inbounds i32, ptr %26, i64 %25
+; CHECK-NEXT:    %wide.load = load <vscale x 4 x i32>, ptr %27, align 4
+; CHECK-NEXT:    %reverse = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %wide.load)
+; CHECK-NEXT:    %28 = add <vscale x 4 x i32> %reverse, splat (i32 1)
+; CHECK-NEXT:    %29 = getelementptr inbounds i32, ptr %A, i64 %22
+; CHECK-NEXT:    %30 = mul i64 0, %18
+; CHECK-NEXT:    %31 = sub i64 1, %18
+; CHECK-NEXT:    %32 = getelementptr inbounds i32, ptr %29, i64 %30
+; CHECK-NEXT:    %33 = getelementptr inbounds i32, ptr %32, i64 %31
+; CHECK-NEXT:    %reverse4 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %28)
+; CHECK-NEXT:    store <vscale x 4 x i32> %reverse4, ptr %33, align 4
+; CHECK-NEXT:    %index.next = add nuw i64 %index, %18
+; CHECK-NEXT:    %34 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT:    br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT:  LV: created middle.block
+; CHECK-NEXT:  LV: draw edge fromvector.body
+; CHECK-NEXT:  LV: vectorizing VPBB:middle.block in BB:middle.block
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  middle.block: ; preds = %vector.body
+; CHECK-NEXT:    %cmp.n = icmp eq i64 %0, %n.vec
+; CHECK-NEXT:    br i1 %cmp.n, <null operand!>, <null operand!>
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.cond.cleanup.loopexit> in BB:for.cond.cleanup.loopexit
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.cond.cleanup.loopexit: ; preds = %for.body
+; CHECK-NEXT:    br label %for.cond.cleanup
+; CHECK-NEXT:  LV: draw edge frommiddle.block
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<scalar.ph> in BB:scalar.ph
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
+; CHECK-NEXT:    %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
+; CHECK-NEXT:    %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT:    br label %for.body
+; CHECK-NEXT:  LV: draw edge frommiddle.block
+; CHECK-NEXT:  LV: draw edge fromfor.body.preheader
+; CHECK-NEXT:  LV: draw edge fromvector.scevcheck
+; CHECK-NEXT:  LV: draw edge fromvector.memcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.body> in BB:for.body
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.body: ; preds = %for.body, %scalar.ph
+; CHECK-NEXT:    %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:    %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT:    %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
+; CHECK-NEXT:    %35 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:    %add9 = add i32 %35, 1
+; CHECK-NEXT:    %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
+; CHECK-NEXT:    store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:    %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: draw edge fromscalar.ph
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Vectorizing: innermost loop.
+; CHECK-EMPTY:
 ;
 entry:
   %cmp7 = icmp sgt i32 %n, 0
@@ -294,66 +455,100 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
 ; CHECK-NEXT:  LV: Using user VF vscale x 4.
-; CHECK:       LV: Loop does not require scalar epilogue
-; CHECK:       LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:  Creating VPBasicBlock for for.body
+; CHECK-NEXT:  VPlan 'Plain CFG
+; CHECK-NEXT:   for UF>=1' {
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body.preheader>:
+; CHECK-NEXT:    IR %0 = zext i32 %n to i64
+; CHECK-NEXT:  Successor(s): for.body
+; CHECK-EMPTY:
+; CHECK-NEXT:  for.body:
+; CHECK-NEXT:    WIDEN-PHI ir<%indvars.iv> = phi [ ir<%indvars.iv.next>, for.body ], [ ir<%0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    WIDEN-PHI ir<%i.0.in8> = phi [ ir<%i.0>, for.body ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT ir<%i.0> = add ir<%i.0.in8>, ir<-1>
+; CHECK-NEXT:    EMIT ir<%idxprom> = zext ir<%i.0>
+; CHECK-NEXT:    EMIT ir<%arrayidx> = getelementptr ir<%B>, ir<%idxprom>
+; CHECK-NEXT:    EMIT ir<%1> = load ir<%arrayidx>
+; CHECK-NEXT:    EMIT ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
+; CHECK-NEXT:    EMIT ir<%arrayidx3> = getelementptr ir<%A>, ir<%idxprom>
+; CHECK-NEXT:    EMIT store ir<%conv1>, ir<%arrayidx3>
+; CHECK-NEXT:    EMIT ir<%cmp> = icmp ir<%indvars.iv>, ir<1>
+; CHECK-NEXT:    EMIT ir<%indvars.iv.next> = add ir<%indvars.iv>, ir<-1>
+; CHECK-NEXT:    EMIT branch-on-cond ir<%cmp>
+; CHECK-NEXT:  Successor(s): for.body, ir-bb<for.cond.cleanup.loopexit>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
 ; CHECK-NEXT:  LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
 ; CHECK-NEXT:  LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
-; CHECK-NEXT:  Live-in vp<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in vp<[[VFxUF:%.+]]> = VF * UF
-; CHECK-NEXT:  Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  vp<[[TC:%.+]]> = original trip-count
+; CHECK-NEXT:  Live-in vp<%0> = VF
+; CHECK-NEXT:  Live-in vp<%1> = VF * UF
+; CHECK-NEXT:  Live-in vp<%2> = vector-trip-count
+; CHECK-NEXT:  vp<%3> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
 ; CHECK-NEXT:    IR %0 = zext i32 %n to i64
-; CHECK-NEXT:    EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
+; CHECK-NEXT:    EMIT vp<%3> = EXPAND SCEV (zext i32 %n to i64)
 ; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.ph:
-; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    vp<%4> = DERIVED-IV ir<%0> + vp<%2> * ir<-1>
+; CHECK-NEXT:    vp<%5> = DERIVED-IV ir<%n> + vp<%2> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector loop
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <x1> vector loop: {
 ; CHECK-NEXT:    vector.body:
-; CHECK-NEXT:      EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
-; CHECK-NEXT:      vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
-; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
+; CHECK-NEXT:      EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:      vp<%7> = DERIVED-IV ir<%n> + vp<%6> * ir<-1>
+; CHECK-NEXT:      vp<%8> = SCALAR-STEPS vp<%7>, ir<-1>, vp<%0>
+; CHECK-NEXT:      CLONE ir<%i.0> = add nsw vp<%8>, ir<-1>
 ; CHECK-NEXT:      CLONE ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:      CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN ir<%1> = load vp<[[VEC_PTR]]>
+; CHECK-NEXT:      vp<%9> = vector-end-pointer inbounds ir<%arrayidx>, vp<%0>
+; CHECK-NEXT:      WIDEN ir<%1> = load vp<%9>
 ; CHECK-NEXT:      WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
 ; CHECK-NEXT:      CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:      vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
-; CHECK-NEXT:      WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
-; CHECK-NEXT:      EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
-; CHECK-NEXT:      EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:      vp<%10> = vector-end-pointer inbounds ir<%arrayidx3>, vp<%0>
+; CHECK-NEXT:      WIDEN store vp<%10>, ir<%conv1>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<%6>, vp<%1>
+; CHECK-NEXT:      EMIT branch-on-count vp<%index.next>, vp<%2>
 ; CHECK-NEXT:    No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  scalar.ph:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%5>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
-; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT:    IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from scalar.ph)
+; CHECK-NEXT:    IR %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    IR %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:    IR %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:    IR %conv1 = fadd float %1, 1.000000e+00
+; CHECK-NEXT:    IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:    IR store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:    IR %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
@@ -399,85 +594,211 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Interleaving is not beneficial.
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
-; CHECK:       Executing best plan with VF=vscale x 4, UF=1
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  LV: Loop does not require scalar epilogue
+; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
 ; CHECK-NEXT:  VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
-; CHECK-NEXT:  Live-in ir<[[VF:%.+]]> = VF
-; CHECK-NEXT:  Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
-; CHECK-NEXT:  Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
-; CHECK-NEXT:  ir<%0> = original trip-count
+; CHECK-NEXT:  Live-in ir<%18> = VF
+; CHECK-NEXT:  Live-in ir<%18>.1 = VF * UF
+; CHECK-NEXT:  Live-in ir<%n.vec> = vector-trip-count
+; CHECK-NEXT:  Live-in ir<%0> = original trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body.preheader>:
 ; CHECK-NEXT:    IR %0 = zext i32 %n to i64
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.scevcheck>:
-; CHECK-NEXT:    IR   %3 = add nsw i64 %0, -1
-; CHECK-NEXT:    IR   %4 = add i32 %n, -1
-; CHECK-NEXT:    IR   %5 = trunc i64 %3 to i32
-; CHECK-NEXT:    IR   %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
-; CHECK-NEXT:    IR   %mul.result = extractvalue { i32, i1 } %mul, 0
-; CHECK-NEXT:    IR   %mul.overflow = extractvalue { i32, i1 } %mul, 1
-; CHECK-NEXT:    IR   %6 = sub i32 %4, %mul.result
-; CHECK-NEXT:    IR   %7 = icmp ugt i32 %6, %4
-; CHECK-NEXT:    IR   %8 = or i1 %7, %mul.overflow
-; CHECK-NEXT:    IR   %9 = icmp ugt i64 %3, 4294967295
-; CHECK-NEXT:    IR   %10 = or i1 %8, %9
+; CHECK-NEXT:    IR %3 = add nsw i64 %0, -1
+; CHECK-NEXT:    IR %4 = add i32 %n, -1
+; CHECK-NEXT:    IR %5 = trunc i64 %3 to i32
+; CHECK-NEXT:    IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+; CHECK-NEXT:    IR %mul.result = extractvalue { i32, i1 } %mul, 0
+; CHECK-NEXT:    IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
+; CHECK-NEXT:    IR %6 = sub i32 %4, %mul.result
+; CHECK-NEXT:    IR %7 = icmp ugt i32 %6, %4
+; CHECK-NEXT:    IR %8 = or i1 %7, %mul.overflow
+; CHECK-NEXT:    IR %9 = icmp ugt i64 %3, 4294967295
+; CHECK-NEXT:    IR %10 = or i1 %8, %9
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.memcheck>:
-; CHECK-NEXT:    IR   %11 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %12 = mul i64 %11, 4
-; CHECK-NEXT:    IR   %13 = mul i64 %12, 4
-; CHECK-NEXT:    IR   %14 = sub i64 %B1, %A2
-; CHECK-NEXT:    IR   %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT:    IR %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %12 = mul i64 %11, 4
+; CHECK-NEXT:    IR %13 = mul i64 %12, 4
+; CHECK-NEXT:    IR %14 = sub i64 %B1, %A2
+; CHECK-NEXT:    IR %diff.check = icmp ult i64 %14, %13
 ; CHECK-NEXT:  Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<vector.ph>:
-; CHECK-NEXT:    IR   %15 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %16 = mul i64 %15, 4
-; CHECK-NEXT:    IR   %n.mod.vf = urem i64 %0, %16
-; CHECK-NEXT:    IR   %n.vec = sub i64 %0, %n.mod.vf
-; CHECK-NEXT:    IR   %17 = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    IR   %18 = mul i64 %17, 4
-; CHECK-NEXT:    vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
-; CHECK-NEXT:    vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
+; CHECK-NEXT:    IR %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %16 = mul i64 %15, 4
+; CHECK-NEXT:    IR %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT:    IR %n.vec = sub i64 %0, %n.mod.vf
+; CHECK-NEXT:    IR %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    IR %18 = mul i64 %17, 4
+; CHECK-NEXT:    vp<%1> = DERIVED-IV ir<%0> + ir<%n.vec> * ir<-1>
+; CHECK-NEXT:    vp<%2> = DERIVED-IV ir<%n> + ir<%n.vec> * ir<-1>
 ; CHECK-NEXT:  Successor(s): vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  vector.body:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[CAN_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<[[CAN_IV_NEXT:%.+]]>, vector.body ]
-; CHECK-NEXT:    vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
-; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<[[DEV_IV]]>, ir<-1>
+; CHECK-NEXT:    EMIT-SCALAR vp<%index> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:    vp<%3> = DERIVED-IV ir<%n> + vp<%index> * ir<-1>
+; CHECK-NEXT:    CLONE ir<%i.0> = add nsw vp<%3>, ir<-1>
 ; CHECK-NEXT:    CLONE ir<%idxprom> = zext ir<%i.0>
 ; CHECK-NEXT:    CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
-; CHECK-NEXT:    WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT:    WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
+; CHECK-NEXT:    vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
+; CHECK-NEXT:    WIDEN ir<%19> = load vp<%4>
+; CHECK-NEXT:    WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
 ; CHECK-NEXT:    CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
-; CHECK-NEXT:    vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
-; CHECK-NEXT:    WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
-; CHECK-NEXT:    EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
-; CHECK-NEXT:    EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
+; CHECK-NEXT:    vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
+; CHECK-NEXT:    WIDEN store vp<%5>, ir<%conv1>
+; CHECK-NEXT:    EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
+; CHECK-NEXT:    EMIT branch-on-count vp<%index.next>, ir<%n.vec>
 ; CHECK-NEXT:  Successor(s): middle.block, vector.body
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  middle.block:
-; CHECK-NEXT:    EMIT vp<[[CMP:%.+]]> = icmp eq ir<%0>, ir<[[VEC_TC]]>
-; CHECK-NEXT:    EMIT branch-on-cond vp<[[CMP]]>
+; CHECK-NEXT:    EMIT vp<%cmp.n> = icmp eq ir<%0>, ir<%n.vec>
+; CHECK-NEXT:    EMIT branch-on-cond vp<%cmp.n>
 ; CHECK-NEXT:  Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.cond.cleanup.loopexit>:
 ; CHECK-NEXT:  No successors
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<scalar.ph>:
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME1:%.+]]> = phi [ vp<[[END1]]>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ]
-; CHECK-NEXT:    EMIT-SCALAR vp<[[RESUME2:%.+]]>.1 = phi [ vp<[[END2]]>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<%0>, ir-bb<for.body.preheader> ], [ ir<%0>, ir-bb<vector.scevcheck> ], [ ir<%0>, ir-bb<vector.memcheck> ]
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val>.1 = phi [ vp<%2>, middle.block ], [ ir<%n>, ir-bb<for.body.preheader> ], [ ir<%n>, ir-bb<vector.scevcheck> ], [ ir<%n>, ir-bb<vector.memcheck> ]
 ; CHECK-NEXT:  Successor(s): ir-bb<for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  ir-bb<for.body>:
-; CHECK-NEXT:    IR   %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from ir-bb<scalar.ph>)
-; CHECK-NEXT:    IR   %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from ir-bb<scalar.ph>)
-; CHECK:         IR   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>)
+; CHECK-NEXT:    IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<%bc.resume.val>.1 from ir-bb<scalar.ph>)
+; CHECK-NEXT:    IR %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    IR %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    IR %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:    IR %19 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:    IR %conv1 = fadd float %19, 1.000000e+00
+; CHECK-NEXT:    IR %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:    IR store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:    IR %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  No successors
 ; CHECK-NEXT:  }
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.body.preheader> in BB:for.body.preheader
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.body.preheader: ; preds = %entry
+; CHECK-NEXT:    %0 = zext i32 %n to i64
+; CHECK-NEXT:    %1 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %2 = mul i64 %1, 4
+; CHECK-NEXT:    %min.iters.check = icmp ult i64 %0, %2
+; CHECK-NEXT:    br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.scevcheck> in BB:vector.scevcheck
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.scevcheck: ; preds = %for.body.preheader
+; CHECK-NEXT:    %3 = add nsw i64 %0, -1
+; CHECK-NEXT:    %4 = add i32 %n, -1
+; CHECK-NEXT:    %5 = trunc i64 %3 to i32
+; CHECK-NEXT:    %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
+; CHECK-NEXT:    %mul.result = extractvalue { i32, i1 } %mul, 0
+; CHECK-NEXT:    %mul.overflow = extractvalue { i32, i1 } %mul, 1
+; CHECK-NEXT:    %6 = sub i32 %4, %mul.result
+; CHECK-NEXT:    %7 = icmp ugt i32 %6, %4
+; CHECK-NEXT:    %8 = or i1 %7, %mul.overflow
+; CHECK-NEXT:    %9 = icmp ugt i64 %3, 4294967295
+; CHECK-NEXT:    %10 = or i1 %8, %9
+; CHECK-NEXT:    br i1 %10, label %scalar.ph, label %vector.memcheck
+; CHECK-NEXT:  LV: draw edge fromfor.body.preheader
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.memcheck> in BB:vector.memcheck
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.memcheck: ; preds = %vector.scevcheck
+; CHECK-NEXT:    %11 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %12 = mul i64 %11, 4
+; CHECK-NEXT:    %13 = mul i64 %12, 4
+; CHECK-NEXT:    %14 = sub i64 %B1, %A2
+; CHECK-NEXT:    %diff.check = icmp ult i64 %14, %13
+; CHECK-NEXT:    br i1 %diff.check, label %scalar.ph, label %vector.ph
+; CHECK-NEXT:  LV: draw edge fromvector.scevcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<vector.ph> in BB:vector.ph
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.ph: ; preds = %vector.memcheck
+; CHECK-NEXT:    %15 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %16 = mul i64 %15, 4
+; CHECK-NEXT:    %n.mod.vf = urem i64 %0, %16
+; CHECK-NEXT:    %n.vec = sub i64 %0, %n.mod.vf
+; CHECK-NEXT:    %17 = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    %18 = mul i64 %17, 4
+; CHECK-NEXT:    %19 = sub i64 %0, %n.vec
+; CHECK-NEXT:    %.cast = trunc i64 %n.vec to i32
+; CHECK-NEXT:    %20 = sub i32 %n, %.cast
+; CHECK-NEXT:    br
+; CHECK-NEXT:  LV: draw edge fromvector.memcheck
+; CHECK-NEXT:  LV: created vector.body
+; CHECK-NEXT:  LV: draw edge fromvector.ph
+; CHECK-NEXT:  LV: vectorizing VPBB:vector.body in BB:vector.body
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  vector.body: ; preds = %vector.body, %vector.ph
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ]
+; CHECK-NEXT:    %.cast3 = trunc i64 %index to i32
+; CHECK-NEXT:    %offset.idx = sub i32 %n, %.cast3
+; CHECK-NEXT:    %21 = add nsw i32 %offset.idx, -1
+; CHECK-NEXT:    %22 = zext i32 %21 to i64
+; CHECK-NEXT:    %23 = getelementptr inbounds float, ptr %B, i64 %22
+; CHECK-NEXT:    %24 = mul i64 0, %18
+; CHECK-NEXT:    %25 = sub i64 1, %18
+; CHECK-NEXT:    %26 = getelementptr inbounds float, ptr %23, i64 %24
+; CHECK-NEXT:    %27 = getelementptr inbounds float, ptr %26, i64 %25
+; CHECK-NEXT:    %wide.load = load <vscale x 4 x float>, ptr %27, align 4
+; CHECK-NEXT:    %reverse = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %wide.load)
+; CHECK-NEXT:    %28 = fadd <vscale x 4 x float> %reverse, splat (float 1.000000e+00)
+; CHECK-NEXT:    %29 = getelementptr inbounds float, ptr %A, i64 %22
+; CHECK-NEXT:    %30 = mul i64 0, %18
+; CHECK-NEXT:    %31 = sub i64 1, %18
+; CHECK-NEXT:    %32 = getelementptr inbounds float, ptr %29, i64 %30
+; CHECK-NEXT:    %33 = getelementptr inbounds float, ptr %32, i64 %31
+; CHECK-NEXT:    %reverse4 = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %28)
+; CHECK-NEXT:    store <vscale x 4 x float> %reverse4, ptr %33, align 4
+; CHECK-NEXT:    %index.next = add nuw i64 %index, %18
+; CHECK-NEXT:    %34 = icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT:    br i1 %34, <null operand!>, label %vector.body
+; CHECK-NEXT:  LV: created middle.block
+; CHECK-NEXT:  LV: draw edge fromvector.body
+; CHECK-NEXT:  LV: vectorizing VPBB:middle.block in BB:middle.block
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  middle.block: ; preds = %vector.body
+; CHECK-NEXT:    %cmp.n = icmp eq i64 %0, %n.vec
+; CHECK-NEXT:    br i1 %cmp.n, <null operand!>, <null operand!>
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.cond.cleanup.loopexit> in BB:for.cond.cleanup.loopexit
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.cond.cleanup.loopexit: ; preds = %for.body
+; CHECK-NEXT:    br label %for.cond.cleanup
+; CHECK-NEXT:  LV: draw edge frommiddle.block
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<scalar.ph> in BB:scalar.ph
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  scalar.ph: ; preds = %vector.memcheck, %vector.scevcheck, %for.body.preheader
+; CHECK-NEXT:    %bc.resume.val = phi i64 [ %19, %middle.block ], [ %0, %for.body.preheader ], [ %0, %vector.scevcheck ], [ %0, %vector.memcheck ]
+; CHECK-NEXT:    %bc.resume.val5 = phi i32 [ %20, %middle.block ], [ %n, %for.body.preheader ], [ %n, %vector.scevcheck ], [ %n, %vector.memcheck ]
+; CHECK-NEXT:    br label %for.body
+; CHECK-NEXT:  LV: draw edge frommiddle.block
+; CHECK-NEXT:  LV: draw edge fromfor.body.preheader
+; CHECK-NEXT:  LV: draw edge fromvector.scevcheck
+; CHECK-NEXT:  LV: draw edge fromvector.memcheck
+; CHECK-NEXT:  LV: vectorizing VPBB:ir-bb<for.body> in BB:for.body
+; CHECK-NEXT:  LV: filled BB:
+; CHECK-NEXT:  for.body: ; preds = %for.body, %scalar.ph
+; CHECK-NEXT:    %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+; CHECK-NEXT:    %i.0.in8 = phi i32 [ %bc.resume.val5, %scalar.ph ], [ %i.0, %for.body ]
+; CHECK-NEXT:    %i.0 = add nsw i32 %i.0.in8, -1
+; CHECK-NEXT:    %idxprom = zext i32 %i.0 to i64
+; CHECK-NEXT:    %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
+; CHECK-NEXT:    %35 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:    %conv1 = fadd float %35, 1.000000e+00
+; CHECK-NEXT:    %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
+; CHECK-NEXT:    store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:    %cmp = icmp ugt i64 %indvars.iv, 1
+; CHECK-NEXT:    %indvars.iv.next = add nsw i64 %indvars.iv, -1
+; CHECK-NEXT:    br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+; CHECK-NEXT:  LV: draw edge fromscalar.ph
+; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ;
 entry:
   %cmp7 = icmp sgt i32 %n, 0

From 1ded2c599fd230b2d355386c019a3054f5745d55 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 13 Jun 2025 21:01:11 +0100
Subject: [PATCH 435/851] [LV] Use createIterationCountCheck during epilogue
 skeleton creation.

Use helper already used for minimum trip count checks for the regular
ILV skeleton creation also for epilogue skeleton creation.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 21 ++++---------------
 .../partial-reduce-dot-product-epilogue.ll    |  3 +--
 ...ctor-loop-backedge-elimination-epilogue.ll |  3 +--
 3 files changed, 6 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 69b60c7b93208..93f53996425d3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7535,25 +7535,12 @@ BasicBlock *
 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                                     bool ForEpilogue) {
   assert(Bypass && "Expected valid bypass basic block.");
-  ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
-  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
   Value *Count = getTripCount();
-  // Reuse existing vector loop preheader for TC checks.
-  // Note that new preheader block is generated for vector loop.
-  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
-  IRBuilder<> Builder(TCCheckBlock->getTerminator());
-
-  // Generate code to check if the loop's trip count is less than VF * UF of the
-  // main vector loop.
-  auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
-                                                    : VF.isVector())
-               ? ICmpInst::ICMP_ULE
-               : ICmpInst::ICMP_ULT;
-
-  Value *CheckMinIters = Builder.CreateICmp(
-      P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
-      "min.iters.check");
+  MinProfitableTripCount = ElementCount::getFixed(0);
+  Value *CheckMinIters = createIterationCountCheck(
+      ForEpilogue ? EPI.EpilogueVF : VF, ForEpilogue ? EPI.EpilogueUF : UF);
 
+  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
   if (!ForEpilogue)
     TCCheckBlock->setName("vector.main.loop.iter.check");
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index c3fc91c4574f1..66dbcff2c123d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -10,8 +10,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-NEXT:  iter.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
 ; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
index cb966e4088dbb..2705d6910bb2d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll
@@ -11,8 +11,7 @@ define void @test_remove_vector_loop_region_epilogue(ptr %dst, i1 %c)  {
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
 ; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TC]], 64
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TC]], 64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]]

From c42912b8c96ff1130437e47c163aeb5c1191fe5d Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang@google.com>
Date: Fri, 13 Jun 2025 13:07:39 -0700
Subject: [PATCH 436/851] Fix string_length function so that it always returns.
 (#144148)

Previously setting LIBC_COPT_STRING_UNSAFE_WIDE_READ would cause a build
error because there is a path in the ifdef that doesn't return anything.
---
 libc/src/string/string_utils.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index dcbfc7584a30e..4f56263fce8ec 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -90,12 +90,11 @@ template <typename T> LIBC_INLINE size_t string_length(const T *src) {
   // string a block at a time.
   if constexpr (cpp::is_same_v<T, char>)
     return string_length_wide_read<unsigned int>(src);
-#else
+#endif
   size_t length;
   for (length = 0; *src; ++src, ++length)
     ;
   return length;
-#endif
 }
 
 template <typename Word>

From 938e91e4fe10a9ff810b41ee74f5c0af8d3ac490 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 13:16:08 -0700
Subject: [PATCH 437/851] [memprof] Use testing::IsEmpty (NFC) (#144096)

This patch replaces testing::IsEmpty with IsEmpty because we already
have:

  using ::testing::IsEmpty;

near the beginning of the file.
---
 .../ProfileData/DataAccessProfTest.cpp        | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/llvm/unittests/ProfileData/DataAccessProfTest.cpp b/llvm/unittests/ProfileData/DataAccessProfTest.cpp
index 13af3390557d7..2f93e16f28c1c 100644
--- a/llvm/unittests/ProfileData/DataAccessProfTest.cpp
+++ b/llvm/unittests/ProfileData/DataAccessProfTest.cpp
@@ -92,10 +92,10 @@ TEST(MemProf, DataAccessProfile) {
 
     EXPECT_THAT(
         Data.getProfileRecord("foo.llvm.123"),
-        ValueIs(AllOf(
-            Field(&DataAccessProfRecord::SymHandle,
-                  testing::VariantWith<std::string>(testing::Eq("foo"))),
-            Field(&DataAccessProfRecord::Locations, testing::IsEmpty()))));
+        ValueIs(
+            AllOf(Field(&DataAccessProfRecord::SymHandle,
+                        testing::VariantWith<std::string>(testing::Eq("foo"))),
+                  Field(&DataAccessProfRecord::Locations, IsEmpty()))));
     EXPECT_THAT(
         Data.getProfileRecord("bar.__uniq.321"),
         ValueIs(AllOf(
@@ -130,7 +130,7 @@ TEST(MemProf, DataAccessProfile) {
         reinterpret_cast<const unsigned char *>(serializedData.data());
     ASSERT_THAT(llvm::to_vector(llvm::make_first_range(
                     deserializedData.getStrToIndexMapRef())),
-                testing::IsEmpty());
+                IsEmpty());
     EXPECT_FALSE(deserializedData.deserialize(p));
 
     EXPECT_THAT(
@@ -153,11 +153,10 @@ TEST(MemProf, DataAccessProfile) {
     EXPECT_THAT(
         Records,
         ElementsAre(
-            AllOf(
-                Field(&DataAccessProfRecordRef::SymbolID, 0),
-                Field(&DataAccessProfRecordRef::AccessCount, 100),
-                Field(&DataAccessProfRecordRef::IsStringLiteral, false),
-                Field(&DataAccessProfRecordRef::Locations, testing::IsEmpty())),
+            AllOf(Field(&DataAccessProfRecordRef::SymbolID, 0),
+                  Field(&DataAccessProfRecordRef::AccessCount, 100),
+                  Field(&DataAccessProfRecordRef::IsStringLiteral, false),
+                  Field(&DataAccessProfRecordRef::Locations, IsEmpty())),
             AllOf(Field(&DataAccessProfRecordRef::SymbolID, 2),
                   Field(&DataAccessProfRecordRef::AccessCount, 123),
                   Field(&DataAccessProfRecordRef::IsStringLiteral, false),

From 6d0cfc2ca51e9365f1c6f216df30a612958aca70 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 13:16:15 -0700
Subject: [PATCH 438/851] [Vectorize] Use llvm::drop_begin (NFC) (#144098)

We can pass a range to llvm::drop_begin.
---
 llvm/lib/Transforms/Vectorize/VPlan.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index acc861b991975..53619b39219e6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2114,7 +2114,7 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors {
   VPWidenPHIRecipe *clone() override {
     auto *C = new VPWidenPHIRecipe(cast<PHINode>(getUnderlyingValue()),
                                    getOperand(0), getDebugLoc(), Name);
-    for (VPValue *Op : make_range(std::next(op_begin()), op_end()))
+    for (VPValue *Op : llvm::drop_begin(operands()))
       C->addOperand(Op);
     return C;
   }

From 2a805589f56b30b27057c7549dd0ad2963ae16b1 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 13:16:22 -0700
Subject: [PATCH 439/851] [SPIRV] Use llvm::all_of (NFC) (#144099)

We can pass a range to llvm::all_of.
---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp   | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 8edd0b533b9fa..911a6966aaef0 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -1564,15 +1564,14 @@ static bool isUSMStorageClass(SPIRV::StorageClass::StorageClass SC) {
 static bool isASCastInGVar(MachineRegisterInfo *MRI, Register ResVReg) {
   bool IsGRef = false;
   bool IsAllowedRefs =
-      std::all_of(MRI->use_instr_begin(ResVReg), MRI->use_instr_end(),
-                  [&IsGRef](auto const &It) {
-                    unsigned Opcode = It.getOpcode();
-                    if (Opcode == SPIRV::OpConstantComposite ||
-                        Opcode == SPIRV::OpVariable ||
-                        isSpvIntrinsic(It, Intrinsic::spv_init_global))
-                      return IsGRef = true;
-                    return Opcode == SPIRV::OpName;
-                  });
+      llvm::all_of(MRI->use_instructions(ResVReg), [&IsGRef](auto const &It) {
+        unsigned Opcode = It.getOpcode();
+        if (Opcode == SPIRV::OpConstantComposite ||
+            Opcode == SPIRV::OpVariable ||
+            isSpvIntrinsic(It, Intrinsic::spv_init_global))
+          return IsGRef = true;
+        return Opcode == SPIRV::OpName;
+      });
   return IsAllowedRefs && IsGRef;
 }
 

From 5064a5bc3e958aeb18bf3f8c7144c99cc3103a91 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Fri, 13 Jun 2025 13:16:31 -0700
Subject: [PATCH 440/851] [IR] Remove a redundant control flow statement (NFC)
 (#144100)

---
 llvm/lib/IR/DebugInfo.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 9527c3e0b5d67..e6b1f76dfacf6 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -2127,7 +2127,6 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest,
       &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL);
   (void)Assign;
   LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n");
-  return;
 }
 
 #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h).

From a89df72ec0864301f102296dcf7b3bd22844adf5 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Fri, 13 Jun 2025 13:30:18 -0700
Subject: [PATCH 441/851] WholeProgramDevirt: Fix importing in
 llvm.type.checked.load case.

We were clearing SummaryTypeCheckedLoadUsers to prevent devirtualized
llvm.type.checked.load calls from being converted to llvm.type.test,
which meant that AddCalls would not see them in the list of
callsites and they would not get imported. Fix that by not clearing
SummaryTypeCheckedLoadUsers so that the list survives to AddCalls and
using AllCallSitesDevirted to control whether to convert them instead.

Reviewers: teresajohnson

Reviewed By: teresajohnson

Pull Request: https://github.com/llvm/llvm-project/pull/144019
---
 .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 39 +++++++++----------
 .../Inputs/devirt_single_hybrid_foo_tcl.ll    | 31 +++++++++++++++
 llvm/test/ThinLTO/X86/devirt_single_hybrid.ll | 26 ++++++++++++-
 3 files changed, 74 insertions(+), 22 deletions(-)
 create mode 100644 llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_foo_tcl.ll

diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index a7d9f3ba24b24..30e1dc7167a39 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -494,28 +494,28 @@ struct CallSiteInfo {
   /// Whether all call sites represented by this CallSiteInfo, including those
   /// in summaries, have been devirtualized. This starts off as true because a
   /// default constructed CallSiteInfo represents no call sites.
+  ///
+  /// If at the end of the pass there are still undevirtualized calls, we will
+  /// need to add a use of llvm.type.test to each of the function summaries in
+  /// the vector.
   bool AllCallSitesDevirted = true;
 
   // These fields are used during the export phase of ThinLTO and reflect
   // information collected from function summaries.
 
-  /// Whether any function summary contains an llvm.assume(llvm.type.test) for
-  /// this slot.
-  bool SummaryHasTypeTestAssumeUsers = false;
-
   /// CFI-specific: a vector containing the list of function summaries that use
   /// the llvm.type.checked.load intrinsic and therefore will require
   /// resolutions for llvm.type.test in order to implement CFI checks if
-  /// devirtualization was unsuccessful. If devirtualization was successful, the
-  /// pass will clear this vector by calling markDevirt(). If at the end of the
-  /// pass the vector is non-empty, we will need to add a use of llvm.type.test
-  /// to each of the function summaries in the vector.
+  /// devirtualization was unsuccessful.
   std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
+
+  /// A vector containing the list of function summaries that use
+  /// assume(llvm.type.test).
   std::vector<FunctionSummary *> SummaryTypeTestAssumeUsers;
 
   bool isExported() const {
-    return SummaryHasTypeTestAssumeUsers ||
-           !SummaryTypeCheckedLoadUsers.empty();
+    return !SummaryTypeCheckedLoadUsers.empty() ||
+           !SummaryTypeTestAssumeUsers.empty();
   }
 
   void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) {
@@ -525,16 +525,10 @@ struct CallSiteInfo {
 
   void addSummaryTypeTestAssumeUser(FunctionSummary *FS) {
     SummaryTypeTestAssumeUsers.push_back(FS);
-    SummaryHasTypeTestAssumeUsers = true;
     AllCallSitesDevirted = false;
   }
 
-  void markDevirt() {
-    AllCallSitesDevirted = true;
-
-    // As explained in the comment for SummaryTypeCheckedLoadUsers.
-    SummaryTypeCheckedLoadUsers.clear();
-  }
+  void markDevirt() { AllCallSitesDevirted = true; }
 };
 
 // Call site information collected for a specific VTableSlot.
@@ -2465,11 +2459,14 @@ bool DevirtModule::run() {
     if (ExportSummary && isa<MDString>(S.first.TypeID)) {
       auto GUID = GlobalValue::getGUIDAssumingExternalLinkage(
           cast<MDString>(S.first.TypeID)->getString());
-      for (auto *FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
-        FS->addTypeTest(GUID);
+      auto AddTypeTestsForTypeCheckedLoads = [&](CallSiteInfo &CSI) {
+        if (!CSI.AllCallSitesDevirted)
+          for (auto *FS : CSI.SummaryTypeCheckedLoadUsers)
+            FS->addTypeTest(GUID);
+      };
+      AddTypeTestsForTypeCheckedLoads(S.second.CSInfo);
       for (auto &CCS : S.second.ConstCSInfo)
-        for (auto *FS : CCS.second.SummaryTypeCheckedLoadUsers)
-          FS->addTypeTest(GUID);
+        AddTypeTestsForTypeCheckedLoads(CCS.second);
     }
   }
 
diff --git a/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_foo_tcl.ll b/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_foo_tcl.ll
new file mode 100644
index 0000000000000..4a696837bc8e9
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_foo_tcl.ll
@@ -0,0 +1,31 @@
+; ModuleID = 'foo.cpp'
+source_filename = "foo.cpp"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+
+; Function Attrs: uwtable
+define hidden i32 @_Z3fooP1A(ptr %pA) local_unnamed_addr {
+entry:
+  %vtable = load ptr, ptr %pA, align 8, !tbaa !2
+  %0 = call { ptr, i1 } @llvm.type.checked.load(ptr %vtable, i32 0, metadata !"_ZTS1A")
+  %1 = extractvalue { ptr, i1 } %0, 0
+  %call = tail call i32 %1(ptr %pA)
+  %add = add nsw i32 %call, 10
+  ret i32 %add
+}
+
+declare { ptr, i1 } @llvm.type.checked.load(ptr, i32, metadata)
+
+; Function Attrs: nounwind willreturn
+declare void @llvm.assume(i1)
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 10.0.0 (trunk 373596)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"vtable pointer", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}
diff --git a/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll b/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
index 90fdf0d7dfa09..53c001efc132e 100644
--- a/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_single_hybrid.ll
@@ -2,8 +2,32 @@
 ; when we're running hybrid LTO.
 ;
 ; RUN: opt -thinlto-bc -thinlto-split-lto-unit %s -o %t-main.bc
-; RUN: opt -thinlto-bc -thinlto-split-lto-unit %p/Inputs/devirt_single_hybrid_foo.ll -o %t-foo.bc
 ; RUN: opt -thinlto-bc -thinlto-split-lto-unit %p/Inputs/devirt_single_hybrid_bar.ll -o %t-bar.bc
+
+; Test the assume(type.test) case.
+ 
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit %p/Inputs/devirt_single_hybrid_foo.ll -o %t-foo.bc
+; RUN: llvm-lto2 run -save-temps %t-main.bc %t-foo.bc %t-bar.bc -pass-remarks=. -o %t \
+; RUN:   -whole-program-visibility \
+; RUN:    -r=%t-foo.bc,_Z3fooP1A,pl \
+; RUN:    -r=%t-main.bc,main,plx \
+; RUN:    -r=%t-main.bc,_Z3barv,l \
+; RUN:    -r=%t-bar.bc,_Z3barv,pl \
+; RUN:    -r=%t-bar.bc,_Z3fooP1A, \
+; RUN:    -r=%t-bar.bc,_ZNK1A1fEv,pl \
+; RUN:    -r=%t-bar.bc,_ZTV1A,l \
+; RUN:    -r=%t-bar.bc,_ZTVN10__cxxabiv117__class_type_infoE, \
+; RUN:    -r=%t-bar.bc,_ZTS1A,pl \
+; RUN:    -r=%t-bar.bc,_ZTI1A,pl \
+; RUN:    -r=%t-bar.bc,_ZNK1A1fEv, \
+; RUN:    -r=%t-bar.bc,_ZTV1A,pl \
+; RUN:    -r=%t-bar.bc,_ZTI1A, 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
+; RUN: llvm-dis %t.1.5.precodegen.bc -o - | FileCheck %s --check-prefix=CODEGEN
+
+; Test the type.checked.load case.
+ 
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit %p/Inputs/devirt_single_hybrid_foo_tcl.ll -o %t-foo.bc
 ; RUN: llvm-lto2 run -save-temps %t-main.bc %t-foo.bc %t-bar.bc -pass-remarks=. -o %t \
 ; RUN:   -whole-program-visibility \
 ; RUN:    -r=%t-foo.bc,_Z3fooP1A,pl \

From 52a6492136ef43462c68efa88a0276bb66ee8c52 Mon Sep 17 00:00:00 2001
From: Reid Kleckner <rnk@google.com>
Date: Fri, 13 Jun 2025 20:30:12 +0000
Subject: [PATCH 442/851] [bazel] Add missing errno deps one more time

---
 utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel  | 1 +
 .../bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
index 2354337da2dc5..2c5ad7d27ce84 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
@@ -81,6 +81,7 @@ libc_test_library(
     ],
     deps = [
         ":LibcUnitTest",
+        "//libc:__support_libc_errno",
         "//libc:__support_macros_config",
         "//libc:errno",
     ],
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
index 610978059d7e6..4f66793d44dfe 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel
@@ -204,6 +204,7 @@ libc_test_library(
         "//libc:__support_cpp_limits",
         "//libc:__support_cpp_type_traits",
         "//libc:__support_ctype_utils",
+        "//libc:__support_libc_errno",
         "//libc:__support_macros_properties_architectures",
         "//libc:errno",
         "//libc/test/UnitTest:LibcUnitTest",

From 60d000496b5485c89c51e64b2b339210d48263be Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Sat, 14 Jun 2025 05:44:08 +0900
Subject: [PATCH 443/851] [Cygwin] Define LLVM_ABI for Cygwin (#143222)

592243c1cb3ea53b34033132a87b0d14af9d1079 should be also applied to
LLVM_ABI.
---
 llvm/include/llvm/Support/Compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h
index f6bc05011e3c3..0de789ec68c49 100644
--- a/llvm/include/llvm/Support/Compiler.h
+++ b/llvm/include/llvm/Support/Compiler.h
@@ -209,7 +209,7 @@
 #define LLVM_ABI_FRIEND LLVM_ABI
 #define LLVM_ABI_EXPORT __declspec(dllexport)
 #elif defined(__ELF__) || defined(__MINGW32__) || defined(_AIX) ||             \
-    defined(__MVS__)
+    defined(__MVS__) || defined(__CYGWIN__)
 #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT
 #define LLVM_ABI_FRIEND
 #define LLVM_TEMPLATE_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT

From be5c96bfac328fed548c532bbe1710fe23460a85 Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek@codeweavers.com>
Date: Fri, 13 Jun 2025 13:48:29 -0700
Subject: [PATCH 444/851] [CodeGen][COFF] Always emit CodeView compiler info on
 Windows targets (#142970)

MSVC always emits minimal CodeView metadata with compiler information,
even when debug info is otherwise disabled. Other tools may rely on this
metadata being present. For example, linkers use it to determine whether
hotpatching is enabled for the object file.
---
 clang/lib/CodeGen/CodeGenModule.cpp           |  7 +++-
 clang/test/CodeGen/debug-info-version-coff.c  |  8 ++++
 clang/test/CodeGen/debug-info-version.c       |  1 +
 clang/test/CodeGenCXX/debug-info-coff.cpp     | 37 ++++++++++++++++++
 .../debug-info-hotpatch-aarch64.cpp           |  7 +---
 .../CodeGenCXX/debug-info-hotpatch-arm.cpp    |  7 +---
 clang/test/Frontend/ast-main.c                |  4 +-
 clang/test/Frontend/ast-main.cpp              |  4 +-
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |  7 +++-
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 34 ++++++++++++-----
 llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h   |  4 ++
 .../Generic/selection-dag-determinism.ll      | 10 ++---
 llvm/test/DebugInfo/COFF/dwarf-headers.ll     | 27 +++++++++++++
 .../COFF/emission-kind-no-codeview.ll         | 38 +++++++++++++++++++
 .../DebugInfo/COFF/emission-kind-no-debug.ll  | 28 ++++++++++++--
 llvm/test/DebugInfo/COFF/fission-cu.ll        | 10 ++---
 llvm/test/DebugInfo/COFF/fission-sections.ll  | 15 ++++----
 llvm/test/DebugInfo/COFF/uefi-nodebug.ll      | 16 ++++++++
 .../test/DebugInfo/Generic/directives-only.ll |  2 +-
 19 files changed, 219 insertions(+), 47 deletions(-)
 create mode 100644 clang/test/CodeGen/debug-info-version-coff.c
 create mode 100644 clang/test/CodeGenCXX/debug-info-coff.cpp
 create mode 100644 llvm/test/DebugInfo/COFF/emission-kind-no-codeview.ll
 create mode 100644 llvm/test/DebugInfo/COFF/uefi-nodebug.ll

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 451792dca40c5..c036902b0b130 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -414,6 +414,11 @@ CodeGenModule::CodeGenModule(ASTContext &C,
       CodeGenOpts.CoverageNotesFile.size() ||
       CodeGenOpts.CoverageDataFile.size())
     DebugInfo.reset(new CGDebugInfo(*this));
+  else if (getTriple().isOSWindows())
+    // On Windows targets, we want to emit compiler info even if debug info is
+    // otherwise disabled. Use a temporary CGDebugInfo instance to emit only
+    // basic compiler metadata.
+    CGDebugInfo(*this);
 
   Block.GlobalUniqueCount = 0;
 
@@ -1051,7 +1056,7 @@ void CodeGenModule::Release() {
                               "StrictVTablePointersRequirement",
                               llvm::MDNode::get(VMContext, Ops));
   }
-  if (getModuleDebugInfo())
+  if (getModuleDebugInfo() || getTriple().isOSWindows())
     // We support a single version in the linked module. The LLVM
     // parser will drop debug info with a different version number
     // (and warn about it, too).
diff --git a/clang/test/CodeGen/debug-info-version-coff.c b/clang/test/CodeGen/debug-info-version-coff.c
new file mode 100644
index 0000000000000..6497a58292362
--- /dev/null
+++ b/clang/test/CodeGen/debug-info-version-coff.c
@@ -0,0 +1,8 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang --target=x86_64-windows -g -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=x86_64-windows -S -emit-llvm -o - %s | FileCheck %s
+int main (void) {
+  return 0;
+}
+
+// CHECK:  i32 2, !"Debug Info Version", i32 3}
diff --git a/clang/test/CodeGen/debug-info-version.c b/clang/test/CodeGen/debug-info-version.c
index fa7e20e7f5279..c7c2bb95017a2 100644
--- a/clang/test/CodeGen/debug-info-version.c
+++ b/clang/test/CodeGen/debug-info-version.c
@@ -1,3 +1,4 @@
+// REQUIRES: !system-windows
 // RUN: %clang -g -S -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang -S -emit-llvm -o - %s | FileCheck %s --check-prefix=NO_DEBUG
 int main (void) {
diff --git a/clang/test/CodeGenCXX/debug-info-coff.cpp b/clang/test/CodeGenCXX/debug-info-coff.cpp
new file mode 100644
index 0000000000000..4507f5f40d411
--- /dev/null
+++ b/clang/test/CodeGenCXX/debug-info-coff.cpp
@@ -0,0 +1,37 @@
+// REQUIRES: x86-registered-target
+
+// Check that CodeView compiler version is emitted even when debug info is otherwise disabled.
+
+// RUN: %clang --target=i686-pc-windows-msvc -S -emit-llvm %s -o - | FileCheck --check-prefix=IR %s
+// IR: !llvm.dbg.cu = !{!0}
+// IR: !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None)
+
+// RUN: %clang --target=i686-pc-windows-msvc -c %s -o %t.o
+// RUN: llvm-readobj --codeview %t.o | FileCheck %s
+// CHECK:      CodeViewDebugInfo [
+// CHECK-NEXT:   Section: .debug$S (4)
+// CHECK-NEXT:   Magic: 0x4
+// CHECK-NEXT:   Subsection [
+// CHECK-NEXT:     SubSectionType: Symbols (0xF1)
+// CHECK-NEXT:     SubSectionSize:
+// CHECK-NEXT:     ObjNameSym {
+// CHECK-NEXT:       Kind: S_OBJNAME (0x1101)
+// CHECK-NEXT:       Signature: 0x0
+// CHECK-NEXT:       ObjectName:
+// CHECK-NEXT:     }
+// CHECK-NEXT:     Compile3Sym {
+// CHECK-NEXT:       Kind: S_COMPILE3 (0x113C)
+// CHECK-NEXT:       Language: Cpp (0x1)
+// CHECK-NEXT:       Flags [ (0x0)
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       Machine: Pentium3 (0x7)
+// CHECK-NEXT:       FrontendVersion:
+// CHECK-NEXT:       BackendVersion:
+// CHECK-NEXT:       VersionName: clang version
+// CHECK-NEXT:     }
+// CHECK-NEXT:   ]
+// CHECK-NEXT: ]
+
+int main() {
+  return 0;
+}
diff --git a/clang/test/CodeGenCXX/debug-info-hotpatch-aarch64.cpp b/clang/test/CodeGenCXX/debug-info-hotpatch-aarch64.cpp
index 10fb1750f2c55..ff2dfc19961c0 100644
--- a/clang/test/CodeGenCXX/debug-info-hotpatch-aarch64.cpp
+++ b/clang/test/CodeGenCXX/debug-info-hotpatch-aarch64.cpp
@@ -11,12 +11,9 @@
 // RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=HOTPATCH
 // HOTPATCH: S_COMPILE3 [size = [[#]]]
 // HOTPATCH: flags = hot patchable
-///
-/// Unfortunately we need /Z7, Clang does not systematically generate S_COMPILE3.
-///
+//
 // RUN: %clang_cl --target=aarch64-pc-windows-msvc /c -o %t.obj -- %s
-// RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=NO-HOTPATCH
-// NO-HOTPATCH-NOT: flags = hot patchable
+// RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=HOTPATCH
 
 int main() {
   return 0;
diff --git a/clang/test/CodeGenCXX/debug-info-hotpatch-arm.cpp b/clang/test/CodeGenCXX/debug-info-hotpatch-arm.cpp
index 48a61f7fb1977..e31c762b08872 100644
--- a/clang/test/CodeGenCXX/debug-info-hotpatch-arm.cpp
+++ b/clang/test/CodeGenCXX/debug-info-hotpatch-arm.cpp
@@ -11,12 +11,9 @@
 // RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=HOTPATCH
 // HOTPATCH: S_COMPILE3 [size = [[#]]]
 // HOTPATCH: flags = hot patchable
-///
-/// Unfortunately we need /Z7, Clang does not systematically generate S_COMPILE3.
-///
+//
 // RUN: %clang_cl --target=arm-pc-windows-msvc /c -o %t.obj -- %s
-// RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=NO-HOTPATCH
-// NO-HOTPATCH-NOT: flags = hot patchable
+// RUN: llvm-pdbutil dump -symbols %t.obj | FileCheck %s --check-prefix=HOTPATCH
 
 int main() {
   return 0;
diff --git a/clang/test/Frontend/ast-main.c b/clang/test/Frontend/ast-main.c
index cdc74219f73ac..6a64497f4109c 100644
--- a/clang/test/Frontend/ast-main.c
+++ b/clang/test/Frontend/ast-main.c
@@ -1,6 +1,6 @@
-// RUN: env SDKROOT="/" %clang -emit-llvm -S -o %t1.ll -x c - < %s
+// RUN: env SDKROOT="/" %clang -emit-llvm -S -o - -x c - < %s | grep -v DIFile > %t1.ll
 // RUN: env SDKROOT="/" %clang -emit-ast -o %t.ast %s
-// RUN: env SDKROOT="/" %clang -emit-llvm -S -o %t2.ll -x ast - < %t.ast
+// RUN: env SDKROOT="/" %clang -emit-llvm -S -o - -x ast - < %t.ast | grep -v DIFile > %t2.ll
 // RUN: diff %t1.ll %t2.ll
 
 int main(void) {
diff --git a/clang/test/Frontend/ast-main.cpp b/clang/test/Frontend/ast-main.cpp
index fe47ce435f068..fc09e6437f93f 100644
--- a/clang/test/Frontend/ast-main.cpp
+++ b/clang/test/Frontend/ast-main.cpp
@@ -1,6 +1,6 @@
-// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o %t1.ll -x c++ - < %s
+// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o - -x c++ - < %s | grep -v DIFile > %t1.ll
 // RUN: env SDKROOT="/" %clang -Wno-error=return-type -fno-delayed-template-parsing -emit-ast -o %t.ast %s
-// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o %t2.ll -x ast - < %t.ast
+// RUN: env SDKROOT="/" %clang -Wno-error=return-type -emit-llvm -S -o - -x ast - < %t.ast | grep -v DIFile > %t2.ll
 // RUN: diff %t1.ll %t2.ll
 
 // http://llvm.org/bugs/show_bug.cgi?id=15377
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index e13e92378d4aa..a2c3b50b24670 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -565,8 +565,11 @@ bool AsmPrinter::doInitialization(Module &M) {
 
   if (MAI->doesSupportDebugInformation()) {
     bool EmitCodeView = M.getCodeViewFlag();
-    if (EmitCodeView &&
-        (TM.getTargetTriple().isOSWindows() || TM.getTargetTriple().isUEFI()))
+    // On Windows targets, emit minimal CodeView compiler info even when debug
+    // info is disabled.
+    if ((TM.getTargetTriple().isOSWindows() &&
+         M.getNamedMetadata("llvm.dbg.cu")) ||
+        (TM.getTargetTriple().isUEFI() && EmitCodeView))
       Handlers.push_back(std::make_unique<CodeViewDebug>(this));
     if (!EmitCodeView || M.getDwarfVersion()) {
       if (hasDebugInfo()) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index fc43bc6f7776d..ea57a8fa1f793 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -125,6 +125,8 @@ static CPUType mapArchToCVCPUType(Triple::ArchType Type) {
     return CPUType::ARM64;
   case Triple::ArchType::mipsel:
     return CPUType::MIPS;
+  case Triple::ArchType::UnknownArch:
+    return CPUType::Unknown;
   default:
     report_fatal_error("target architecture doesn't map to a CodeView CPUType");
   }
@@ -611,21 +613,33 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
 }
 
 void CodeViewDebug::beginModule(Module *M) {
-  // If module doesn't have named metadata anchors or COFF debug section
-  // is not available, skip any debug info related stuff.
-  if (!Asm->hasDebugInfo() ||
-      !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
+  // If COFF debug section is not available, skip any debug info related stuff.
+  if (!Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) {
     Asm = nullptr;
     return;
   }
 
+  CompilerInfoAsm = Asm;
   TheCPU = mapArchToCVCPUType(M->getTargetTriple().getArch());
 
   // Get the current source language.
-  const MDNode *Node = *M->debug_compile_units_begin();
+  const MDNode *Node;
+  if (Asm->hasDebugInfo()) {
+    Node = *M->debug_compile_units_begin();
+  } else {
+    // When emitting only compiler information, we may have only NoDebug CUs,
+    // which would be skipped by debug_compile_units_begin.
+    NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+    Node = *CUs->operands().begin();
+  }
   const auto *CU = cast<DICompileUnit>(Node);
 
   CurrentSourceLanguage = MapDWLangToCVLang(CU->getSourceLanguage());
+  if (!M->getCodeViewFlag() ||
+      CU->getEmissionKind() == DICompileUnit::NoDebug) {
+    Asm = nullptr;
+    return;
+  }
 
   collectGlobalVariableInfo();
 
@@ -636,7 +650,7 @@ void CodeViewDebug::beginModule(Module *M) {
 }
 
 void CodeViewDebug::endModule() {
-  if (!Asm || !Asm->hasDebugInfo())
+  if (!CompilerInfoAsm)
     return;
 
   // The COFF .debug$S section consists of several subsections, each starting
@@ -652,6 +666,8 @@ void CodeViewDebug::endModule() {
   emitObjName();
   emitCompilerInformation();
   endCVSubsection(CompilerInfo);
+  if (!Asm)
+    return;
 
   emitInlineeLinesSubsection();
 
@@ -788,7 +804,7 @@ void CodeViewDebug::emitTypeGlobalHashes() {
 void CodeViewDebug::emitObjName() {
   MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_OBJNAME);
 
-  StringRef PathRef(Asm->TM.Options.ObjectFilenameForDebug);
+  StringRef PathRef(CompilerInfoAsm->TM.Options.ObjectFilenameForDebug);
   llvm::SmallString<256> PathStore(PathRef);
 
   if (PathRef.empty() || PathRef == "-") {
@@ -846,7 +862,7 @@ void CodeViewDebug::emitCompilerInformation() {
   }
   using ArchType = llvm::Triple::ArchType;
   ArchType Arch = MMI->getModule()->getTargetTriple().getArch();
-  if (Asm->TM.Options.Hotpatch || Arch == ArchType::thumb ||
+  if (CompilerInfoAsm->TM.Options.Hotpatch || Arch == ArchType::thumb ||
       Arch == ArchType::aarch64) {
     Flags |= static_cast<uint32_t>(CompileSym3Flags::HotPatch);
   }
@@ -1015,7 +1031,7 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
   const MCSymbol *KeySym = GVSec ? GVSec->getCOMDATSymbol() : nullptr;
 
   MCSectionCOFF *DebugSec = cast<MCSectionCOFF>(
-      Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
+      CompilerInfoAsm->getObjFileLowering().getCOFFDebugSymbolsSection());
   DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym);
 
   OS.switchSection(DebugSec);
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index d13b315135ad9..5f4f30271d9cb 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -98,6 +98,10 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
   /// The codeview CPU type used by the translation unit.
   codeview::CPUType TheCPU;
 
+  /// The AsmPrinter used for emitting compiler metadata. When only compiler
+  /// info is being emitted, DebugHandlerBase::Asm may be null.
+  AsmPrinter *CompilerInfoAsm = nullptr;
+
   static LocalVarDef createDefRangeMem(uint16_t CVRegister, int Offset);
 
   /// Similar to DbgVariable in DwarfDebug, but not dwarf-specific.
diff --git a/llvm/test/CodeGen/Generic/selection-dag-determinism.ll b/llvm/test/CodeGen/Generic/selection-dag-determinism.ll
index 1adff3d61ba2c..5228942368075 100644
--- a/llvm/test/CodeGen/Generic/selection-dag-determinism.ll
+++ b/llvm/test/CodeGen/Generic/selection-dag-determinism.ll
@@ -1,8 +1,8 @@
-; RUN: llc -O2 -o %t1.o < %s
-; RUN: llc -O2 -o %t2.o < %s
-; RUN: llc -O2 -o %t3.o < %s
-; RUN: llc -O2 -o %t4.o < %s
-; RUN: llc -O2 -o %t5.o < %s
+; RUN: llc -O2 < %s > %t1.o
+; RUN: llc -O2 < %s > %t2.o
+; RUN: llc -O2 < %s > %t3.o
+; RUN: llc -O2 < %s > %t4.o
+; RUN: llc -O2 < %s > %t5.o
 ; RUN: cmp %t1.o %t2.o
 ; RUN: cmp %t1.o %t3.o
 ; RUN: cmp %t1.o %t4.o
diff --git a/llvm/test/DebugInfo/COFF/dwarf-headers.ll b/llvm/test/DebugInfo/COFF/dwarf-headers.ll
index 9d515f6cec640..919068e966041 100644
--- a/llvm/test/DebugInfo/COFF/dwarf-headers.ll
+++ b/llvm/test/DebugInfo/COFF/dwarf-headers.ll
@@ -43,6 +43,33 @@
 ; DWO-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004, abbr_offset
 ; DWO-4: 0x0000000b: DW_TAG_compile_unit
 
+; Check that basic CodeView compiler info is emitted even when the DWARF debug format is used.
+; RUN: llc -dwarf-version=4 \
+; RUN:     -filetype=obj -O0 -mtriple=x86_64-unknown-windows-msvc < %s \
+; RUN:     | llvm-readobj --codeview - | FileCheck %s --check-prefix=CODEVIEW
+; CODEVIEW:      CodeViewDebugInfo [
+; CODEVIEW-NEXT:   Section: .debug$S (4)
+; CODEVIEW-NEXT:   Magic: 0x4
+; CODEVIEW-NEXT:   Subsection [
+; CODEVIEW-NEXT:     SubSectionType: Symbols (0xF1)
+; CODEVIEW-NEXT:     SubSectionSize: 0x90
+; CODEVIEW-NEXT:     ObjNameSym {
+; CODEVIEW-NEXT:       Kind: S_OBJNAME (0x1101)
+; CODEVIEW-NEXT:       Signature: 0x0
+; CODEVIEW-NEXT:       ObjectName:
+; CODEVIEW-NEXT:     }
+; CODEVIEW-NEXT:     Compile3Sym {
+; CODEVIEW-NEXT:       Kind: S_COMPILE3 (0x113C)
+; CODEVIEW-NEXT:       Language: Cpp (0x1)
+; CODEVIEW-NEXT:       Flags [ (0x0)
+; CODEVIEW-NEXT:       ]
+; CODEVIEW-NEXT:       Machine: X64 (0xD0)
+; CODEVIEW-NEXT:       FrontendVersion: 17.0.0.0
+; CODEVIEW-NEXT:       BackendVersion:
+; CODEVIEW-NEXT:       VersionName: clang version 17.0.0
+; CODEVIEW-NEXT:     }
+; CODEVIEW-NEXT:   ]
+; CODEVIEW-NEXT: ]
 
 ; ModuleID = 't.cpp'
 source_filename = "t.cpp"
diff --git a/llvm/test/DebugInfo/COFF/emission-kind-no-codeview.ll b/llvm/test/DebugInfo/COFF/emission-kind-no-codeview.ll
new file mode 100644
index 0000000000000..792aaeef483f1
--- /dev/null
+++ b/llvm/test/DebugInfo/COFF/emission-kind-no-codeview.ll
@@ -0,0 +1,38 @@
+; RUN: llc -filetype=obj -o - < %s | llvm-readobj --codeview - | FileCheck %s
+; Check that basic CodeView compiler info is emitted even when the CodeView flag is not set.
+
+; CHECK-NOT:  CodeViewTypes
+; CHECK:      CodeViewDebugInfo [
+; CHECK-NEXT:   Section: .debug$S (4)
+; CHECK-NEXT:   Magic: 0x4
+; CHECK-NEXT:   Subsection [
+; CHECK-NEXT:     SubSectionType: Symbols (0xF1)
+; CHECK-NEXT:     SubSectionSize: 0x2C
+; CHECK-NEXT:     ObjNameSym {
+; CHECK-NEXT:       Kind: S_OBJNAME (0x1101)
+; CHECK-NEXT:       Signature: 0x0
+; CHECK-NEXT:       ObjectName:
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Compile3Sym {
+; CHECK-NEXT:       Kind: S_COMPILE3 (0x113C)
+; CHECK-NEXT:       Language: C (0x0)
+; CHECK-NEXT:       Flags [ (0x0)
+; CHECK-NEXT:       ]
+; CHECK-NEXT:       Machine: X64 (0xD0)
+; CHECK-NEXT:       FrontendVersion:
+; CHECK-NEXT:       BackendVersion:
+; CHECK-NEXT:       VersionName: clang
+; CHECK-NEXT:     }
+; CHECK-NEXT:   ]
+; CHECK-NEXT: ]
+
+source_filename = "empty"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "clang", emissionKind: NoDebug)
+!1 = !DIFile(filename: "empty", directory: "path/to")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/DebugInfo/COFF/emission-kind-no-debug.ll b/llvm/test/DebugInfo/COFF/emission-kind-no-debug.ll
index 4204df512ac31..94fee0e1812fd 100644
--- a/llvm/test/DebugInfo/COFF/emission-kind-no-debug.ll
+++ b/llvm/test/DebugInfo/COFF/emission-kind-no-debug.ll
@@ -1,8 +1,30 @@
 ; RUN: llc -filetype=obj -o - < %s | llvm-readobj --codeview - | FileCheck %s
-; Check that debug info isn't emitted for CodeView with emissionKind NoDebug
+; Check that only basic compiler info is emitted for CodeView with emissionKind NoDebug
 
-; CHECK-NOT:      CodeViewTypes
-; CHECK-NOT:      CodeViewDebugInfo
+; CHECK-NOT:  CodeViewTypes
+; CHECK:      CodeViewDebugInfo [
+; CHECK-NEXT:   Section: .debug$S (4)
+; CHECK-NEXT:   Magic: 0x4
+; CHECK-NEXT:   Subsection [
+; CHECK-NEXT:     SubSectionType: Symbols (0xF1)
+; CHECK-NEXT:     SubSectionSize: 0x2C
+; CHECK-NEXT:     ObjNameSym {
+; CHECK-NEXT:       Kind: S_OBJNAME (0x1101)
+; CHECK-NEXT:       Signature: 0x0
+; CHECK-NEXT:       ObjectName:
+; CHECK-NEXT:     }
+; CHECK-NEXT:     Compile3Sym {
+; CHECK-NEXT:       Kind: S_COMPILE3 (0x113C)
+; CHECK-NEXT:       Language: C (0x0)
+; CHECK-NEXT:       Flags [ (0x0)
+; CHECK-NEXT:       ]
+; CHECK-NEXT:       Machine: X64 (0xD0)
+; CHECK-NEXT:       FrontendVersion:
+; CHECK-NEXT:       BackendVersion:
+; CHECK-NEXT:       VersionName: clang
+; CHECK-NEXT:     }
+; CHECK-NEXT:   ]
+; CHECK-NEXT: ]
 
 source_filename = "empty"
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/DebugInfo/COFF/fission-cu.ll b/llvm/test/DebugInfo/COFF/fission-cu.ll
index 3afcb8717e31f..dcc3fdd2efa75 100644
--- a/llvm/test/DebugInfo/COFF/fission-cu.ll
+++ b/llvm/test/DebugInfo/COFF/fission-cu.ll
@@ -107,11 +107,11 @@ source_filename = "test/DebugInfo/X86/fission-cu.ll"
 ; For COFF we should have this set of relocations for the debug info section
 ;
 ; OBJ: .debug_info
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_abbrev (6)
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_line (26)
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_str (10)
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_str (10)
-; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_addr (20)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_abbrev (8)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_line (28)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_str (12)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_str (12)
+; OBJ-NEXT: IMAGE_REL_AMD64_SECREL .debug_addr (22)
 ; OBJ-NEXT: }
 
 ; HDR-NOT: .debug_aranges
diff --git a/llvm/test/DebugInfo/COFF/fission-sections.ll b/llvm/test/DebugInfo/COFF/fission-sections.ll
index 754e2b888c202..c16a4d072909e 100644
--- a/llvm/test/DebugInfo/COFF/fission-sections.ll
+++ b/llvm/test/DebugInfo/COFF/fission-sections.ll
@@ -27,13 +27,14 @@ source_filename = "test/DebugInfo/X86/fission-cu.ll"
 ; OBJ-NEXT:  0 .text
 ; OBJ-NEXT:  1 .data
 ; OBJ-NEXT:  2 .bss
-; OBJ-NEXT:  3 .debug_abbrev
-; OBJ-NEXT:  4 .debug_info
-; OBJ-NEXT:  5 .debug_str
-; OBJ-NEXT:  6 .debug_addr
-; OBJ-NEXT:  7 .debug_pubnames
-; OBJ-NEXT:  8 .debug_pubtypes
-; OBJ-NEXT:  9 .debug_line
+; OBJ-NEXT:  3 .debug$S
+; OBJ-NEXT:  4 .debug_abbrev
+; OBJ-NEXT:  5 .debug_info
+; OBJ-NEXT:  6 .debug_str
+; OBJ-NEXT:  7 .debug_addr
+; OBJ-NEXT:  8 .debug_pubnames
+; OBJ-NEXT:  9 .debug_pubtypes
+; OBJ-NEXT: 10 .debug_line
 
 ; OBJ:     .debug_abbrev
 ; OBJ:     .debug_info
diff --git a/llvm/test/DebugInfo/COFF/uefi-nodebug.ll b/llvm/test/DebugInfo/COFF/uefi-nodebug.ll
new file mode 100644
index 0000000000000..92e5fd6b5796b
--- /dev/null
+++ b/llvm/test/DebugInfo/COFF/uefi-nodebug.ll
@@ -0,0 +1,16 @@
+; RUN: llc -filetype=obj -o - < %s | llvm-readobj --codeview - | FileCheck %s
+; Check that compiler info is not emitted when CodeView flag is not specified
+
+; CHECK-NOT:  CodeViewTypes
+; CHECK-NOT:  CodeViewDebugInfo
+
+source_filename = "empty"
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-uefi"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "clang", emissionKind: NoDebug)
+!1 = !DIFile(filename: "empty", directory: "path/to")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/DebugInfo/Generic/directives-only.ll b/llvm/test/DebugInfo/Generic/directives-only.ll
index ff9393221e2fe..4754df7186faa 100644
--- a/llvm/test/DebugInfo/Generic/directives-only.ll
+++ b/llvm/test/DebugInfo/Generic/directives-only.ll
@@ -18,7 +18,7 @@
 ; CHECK: .loc 1 4 15
 ; CHECK: .loc 1 5 1
 
-; CHECK-NOT: .section .{{debug.*}}
+; CHECK-NOT: .section .{{debug_.*}}
 
 ; Function Attrs: nounwind uwtable
 define void @f2() #0 !dbg !4 {

From f62a8ab9304fb8b8b3ac3519a7addd7d3d234b04 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Fri, 13 Jun 2025 22:51:33 +0200
Subject: [PATCH 445/851] [CIR] Extend VecShuffleOp verifier to catch invalid
 index (#143262)

Extend the verifier to catch index larger than the size of vector
elements in VecShuffleOp

Issue https://github.com/llvm/llvm-project/issues/136487
---
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp          |  9 +++++++++
 .../IR/invalid-vector-shuffle-wrong-index.cir    | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 clang/test/CIR/IR/invalid-vector-shuffle-wrong-index.cir

diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 8ed0ee92574dc..a685253b7d821 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1643,6 +1643,15 @@ LogicalResult cir::VecShuffleOp::verify() {
                          << " and " << getResult().getType() << " don't match";
   }
 
+  const uint64_t maxValidIndex =
+      getVec1().getType().getSize() + getVec2().getType().getSize() - 1;
+  if (llvm::any_of(
+          getIndices().getAsRange<cir::IntAttr>(), [&](cir::IntAttr idxAttr) {
+            return idxAttr.getSInt() != -1 && idxAttr.getUInt() > maxValidIndex;
+          })) {
+    return emitOpError() << ": index for __builtin_shufflevector must be "
+                            "less than the total number of vector elements";
+  }
   return success();
 }
 
diff --git a/clang/test/CIR/IR/invalid-vector-shuffle-wrong-index.cir b/clang/test/CIR/IR/invalid-vector-shuffle-wrong-index.cir
new file mode 100644
index 0000000000000..375b2d3dc563e
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-vector-shuffle-wrong-index.cir
@@ -0,0 +1,16 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+!s32i = !cir.int<s, 32>
+!s64i = !cir.int<s, 64>
+
+module  {
+  cir.func @fold_shuffle_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+
+    // expected-error @below {{index for __builtin_shufflevector must be less than the total number of vector elements}}
+    %new_vec = cir.vec.shuffle(%vec_1, %vec_2 : !cir.vector<4 x !s32i>) [#cir.int<9> : !s64i, #cir.int<4> : !s64i,
+      #cir.int<1> : !s64i, #cir.int<5> : !s64i] : !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+}

From 5ab285e0a60ad914bda893dbe18b6c1c562f3db6 Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek@codeweavers.com>
Date: Fri, 13 Jun 2025 14:10:30 -0700
Subject: [PATCH 446/851] [LLD][COFF] Fix ARM64X CHPE exception data size
 relocation when no x86 .pdata is present (#144085)

Fixes an issue where we incorrectly skip setting the relocation value if
`hybridPdata.first` is null.
---
 lld/COFF/Writer.cpp              | 16 +++-------
 lld/test/COFF/pdata-arm64ec.test | 53 ++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index cb9d0001015bd..5f1da5e79daca 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -2495,22 +2495,16 @@ void Writer::setECSymbols() {
               offsetof(data_directory, Size),
           ctx.symtab.edataEnd->getRVA() - ctx.symtab.edataStart->getRVA() +
               ctx.symtab.edataEnd->getSize());
-    if (hybridPdata.first) {
+    if (hybridPdata.first)
       ctx.dynamicRelocs->set(
           dataDirOffset64 + EXCEPTION_TABLE * sizeof(data_directory) +
               offsetof(data_directory, Size),
           hybridPdata.last->getRVA() - hybridPdata.first->getRVA() +
               hybridPdata.last->getSize());
-      if (chpeSym) {
-        size_t size = 0;
-        if (pdata.first)
-          size = pdata.last->getRVA() + pdata.last->getSize() -
-                 pdata.first->getRVA();
-        ctx.dynamicRelocs->set(chpeSym->getRVA() +
-                                   offsetof(chpe_metadata, ExtraRFETableSize),
-                               size);
-      }
-    }
+    if (chpeSym && pdata.first)
+      ctx.dynamicRelocs->set(
+          chpeSym->getRVA() + offsetof(chpe_metadata, ExtraRFETableSize),
+          pdata.last->getRVA() + pdata.last->getSize() - pdata.first->getRVA());
   }
 }
 
diff --git a/lld/test/COFF/pdata-arm64ec.test b/lld/test/COFF/pdata-arm64ec.test
index cf59330b23543..6bdcc5c5682bd 100644
--- a/lld/test/COFF/pdata-arm64ec.test
+++ b/lld/test/COFF/pdata-arm64ec.test
@@ -80,10 +80,63 @@ DIR3-NEXT:     ExtraRFETableSize: 0x10
 DIR3:        ]
 DIR3:      }
 
+arm64x with no x86 .pdata:
+
 RUN: llvm-objdump -s --section=.pdata test4.dll | FileCheck -check-prefix=DATA4 %s
 DATA4: 180006000 00100000 11000001 00200000 11000001  ......... ......
 DATA4: 180006010 00300000 0e300000
 
+RUN: lld-link -out:testx2.dll -machine:arm64x arm64-func-sym.obj arm64ec-func-sym.obj \
+RUN:          loadconfig-arm64.obj loadconfig-arm64ec.obj -dll -noentry
+
+RUN: llvm-readobj --headers --coff-load-config testx2.dll | FileCheck -check-prefix=DIR4 %s
+DIR4:      ImageOptionalHeader {
+DIR4:        DataDirectory {
+DIR4:          ExceptionTableRVA: 0x5000
+DIR4-NEXT:     ExceptionTableSize: 0x10
+DIR4:        }
+DIR4:      }
+DIR4:      CHPEMetadata [
+DIR4:        ExtraRFETable: 0x0
+DIR4-NEXT:   ExtraRFETableSize: 0x0
+DIR4:      ]
+DIR4:      HybridObject {
+DIR4:        ImageOptionalHeader {
+DIR4:          ExceptionTableRVA: 0x0
+DIR4-NEXT:     ExceptionTableSize: 0x0
+DIR4:        }
+DIR4:        CHPEMetadata [
+DIR4:          ExtraRFETable: 0x5000
+DIR4-NEXT:     ExtraRFETableSize: 0x10
+DIR4:        ]
+DIR4:      }
+
+arm64x with no ARM .pdata:
+
+RUN: lld-link -out:testx3.dll -machine:arm64x x86_64-func-sym.obj loadconfig-arm64.obj loadconfig-arm64ec.obj -dll -noentry
+
+RUN: llvm-readobj --headers --coff-load-config testx3.dll | FileCheck -check-prefix=DIR5 %s
+DIR5:      ImageOptionalHeader {
+DIR5:        DataDirectory {
+DIR5:          ExceptionTableRVA: 0x0
+DIR5-NEXT:     ExceptionTableSize: 0x0
+DIR5:        }
+DIR5:      }
+DIR5:      CHPEMetadata [
+DIR5:        ExtraRFETable: 0x4000
+DIR5-NEXT:   ExtraRFETableSize: 0xC
+DIR5:      ]
+DIR5:      HybridObject {
+DIR5:        ImageOptionalHeader {
+DIR5:          ExceptionTableRVA: 0x4000
+DIR5-NEXT:     ExceptionTableSize: 0xC
+DIR5:        }
+DIR5:        CHPEMetadata [
+DIR5:          ExtraRFETable: 0x0
+DIR5-NEXT:     ExtraRFETableSize: 0x0
+DIR5:        ]
+DIR5:      }
+
 Order of inputs doesn't matter, the data is sorted by type and RVA:
 
 RUN: lld-link -out:test5.dll -machine:arm64ec x86_64-func-sym.obj arm64ec-func-sym.obj \

From 8229628cf1812e126ff72ee9f4b5f267db4c91da Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek@codeweavers.com>
Date: Fri, 13 Jun 2025 23:22:37 +0200
Subject: [PATCH 447/851] [Clang] Relax DICompileUnit producer check in
 debug-info-coff.cpp test (NFC)

Fixes test from #142970 on Fuchsia CI, which uses "Fuchsia clang version" prefix.
---
 clang/test/CodeGenCXX/debug-info-coff.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/test/CodeGenCXX/debug-info-coff.cpp b/clang/test/CodeGenCXX/debug-info-coff.cpp
index 4507f5f40d411..2535c5cc7511f 100644
--- a/clang/test/CodeGenCXX/debug-info-coff.cpp
+++ b/clang/test/CodeGenCXX/debug-info-coff.cpp
@@ -4,7 +4,7 @@
 
 // RUN: %clang --target=i686-pc-windows-msvc -S -emit-llvm %s -o - | FileCheck --check-prefix=IR %s
 // IR: !llvm.dbg.cu = !{!0}
-// IR: !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None)
+// IR: !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "{{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None)
 
 // RUN: %clang --target=i686-pc-windows-msvc -c %s -o %t.o
 // RUN: llvm-readobj --codeview %t.o | FileCheck %s
@@ -27,7 +27,7 @@
 // CHECK-NEXT:       Machine: Pentium3 (0x7)
 // CHECK-NEXT:       FrontendVersion:
 // CHECK-NEXT:       BackendVersion:
-// CHECK-NEXT:       VersionName: clang version
+// CHECK-NEXT:       VersionName: {{.*}}clang version
 // CHECK-NEXT:     }
 // CHECK-NEXT:   ]
 // CHECK-NEXT: ]

From 3afc2be1f0a4d3e3f646403a7495bcb12ef94246 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Fri, 13 Jun 2025 14:35:40 -0700
Subject: [PATCH 448/851] llvm-lto2: Add print-guid subcommand.

This is useful for debugging ThinLTO issues.

Reviewers: teresajohnson

Reviewed By: teresajohnson

Pull Request: https://github.com/llvm/llvm-project/pull/143992
---
 llvm/test/tools/llvm-lto2/print-guid.test | 2 ++
 llvm/tools/llvm-lto2/llvm-lto2.cpp        | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/llvm-lto2/print-guid.test

diff --git a/llvm/test/tools/llvm-lto2/print-guid.test b/llvm/test/tools/llvm-lto2/print-guid.test
new file mode 100644
index 0000000000000..a3d3f202ea437
--- /dev/null
+++ b/llvm/test/tools/llvm-lto2/print-guid.test
@@ -0,0 +1,2 @@
+# RUN: llvm-lto2 print-guid foo | FileCheck %s
+# CHECK: 6699318081062747564
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index 2bbb65be3b31e..fbde66666a596 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -247,7 +247,7 @@ template <typename T> static T check(ErrorOr<T> E, std::string Msg) {
 }
 
 static int usage() {
-  errs() << "Available subcommands: dump-symtab run\n";
+  errs() << "Available subcommands: dump-symtab run print-guid\n";
   return 1;
 }
 
@@ -610,5 +610,11 @@ int main(int argc, char **argv) {
     return dumpSymtab(argc - 1, argv + 1);
   if (Subcommand == "run")
     return run(argc - 1, argv + 1);
+  if (Subcommand == "print-guid" && argc > 2) {
+    // Note the name of the function we're calling: this won't return the right
+    // answer for internal linkage symbols.
+    outs() << GlobalValue::getGUIDAssumingExternalLinkage(argv[2]) << '\n';
+    return 0;
+  }
   return usage();
 }

From 473dea9b0b86d48db805079fa3e68b37e1dbcdd9 Mon Sep 17 00:00:00 2001
From: William Huynh <William.Huynh@arm.com>
Date: Fri, 13 Jun 2025 22:37:25 +0100
Subject: [PATCH 449/851] [libc] Output all headers with
 LIBC_CONF_OUTPUT_ALL_HEADERS (#144114)

Following discussion from
https://discourse.llvm.org/t/missing-declarations-in-header-files/86678,
we decided to add a flag to output all headers. Requires #144049.

- Allows outputting all headers
- Minor whitespace change for alignment

---------

Co-authored-by: Michael Jones <michaelrj@google.com>
---
 libc/CMakeLists.txt                          | 1 +
 libc/cmake/modules/LLVMLibCHeaderRules.cmake | 7 ++++++-
 libc/test/UnitTest/CMakeLists.txt            | 2 +-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index f21fc2fba7305..9907adfc55a5f 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -135,6 +135,7 @@ endif()
 option(LLVM_LIBC_FULL_BUILD "Build and test LLVM libc as if it is the full libc" ${default_to_full_build})
 option(LLVM_LIBC_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR "Build LLVM libc tests assuming our implementation-defined behavior" ON)
 option(LLVM_LIBC_ENABLE_LINTING "Enables linting of libc source files" OFF)
+option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless of whether they are enabled on this target" OFF)
 
 option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF)
 
diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
index 99f90244e0134..01c288f0b9198 100644
--- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake
+++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake
@@ -97,8 +97,13 @@ function(add_gen_header target_name)
   set(out_file ${LIBC_INCLUDE_DIR}/${relative_path})
   set(dep_file "${out_file}.d")
   set(yaml_file ${CMAKE_SOURCE_DIR}/${ADD_GEN_HDR_YAML_FILE})
+  
+  if(LLVM_LIBC_ALL_HEADERS)
+    set(entry_points "")
+  else()
+    set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
+  endif()
 
-  set(entry_points "${TARGET_ENTRYPOINT_NAME_LIST}")
   list(TRANSFORM entry_points PREPEND "--entry-point=")
 
   add_custom_command(
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index b0a3a7431c222..c32809da577d4 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -35,7 +35,7 @@ function(add_unittest_framework_library name)
   else()
     _get_common_test_compile_options(compile_options "" "")
     target_compile_options(${name}.unit PRIVATE ${compile_options})
-endif()
+  endif()
 
   _get_hermetic_test_compile_options(compile_options "")
   target_include_directories(${name}.hermetic PRIVATE ${LIBC_INCLUDE_DIR})

From 2c440232e261746970cdf6f74d6588464eecd48b Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Fri, 13 Jun 2025 15:07:06 -0700
Subject: [PATCH 450/851] [bazel][libc] Add missing deps after
 51689c9df2fbb81aab1ff802f3efb86cac926853

---
 utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 7901de161b7ac..8e629270c89d2 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -347,16 +347,19 @@ libc_support_library(
 libc_support_library(
     name = "types_struct_f_owner_ex",
     hdrs = ["hdr/types/struct_f_owner_ex.h"],
+    deps = [":hdr_fcntl_overlay"],
 )
 
 libc_support_library(
     name = "types_struct_flock",
     hdrs = ["hdr/types/struct_flock.h"],
+    deps = [":hdr_fcntl_overlay"],
 )
 
 libc_support_library(
     name = "types_struct_flock64",
     hdrs = ["hdr/types/struct_flock64.h"],
+    deps = [":hdr_fcntl_overlay"],
 )
 
 libc_support_library(

From a591bd222b2e0356b8132b515422fe480b87322b Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang@google.com>
Date: Fri, 13 Jun 2025 15:09:57 -0700
Subject: [PATCH 451/851] Turn LIBC_COPT_STRING_UNSAFE_WIDE_READ on by default
 (#144163)

Configure strlen to use unsafe implementation because it is faster.

Because this is undefined behavior it could cause sanitizers to fail.
---
 libc/config/config.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/config/config.json b/libc/config/config.json
index d53b2936edb07..0354b16997cdd 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -59,7 +59,7 @@
   },
   "string": {
     "LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
-      "value": false,
+      "value": true,
       "doc": "Read more than a byte at a time to perform byte-string operations like strlen."
     },
     "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {

From ca38027c036593ae487ccef250ebd5133803bb55 Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang@google.com>
Date: Fri, 13 Jun 2025 15:30:28 -0700
Subject: [PATCH 452/851] Revert "Turn LIBC_COPT_STRING_UNSAFE_WIDE_READ on by
 default" (#144167)

Reverts llvm/llvm-project#144163 because for some reason I didn't
realize there are ASan tests.
---
 libc/config/config.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/config/config.json b/libc/config/config.json
index 0354b16997cdd..d53b2936edb07 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -59,7 +59,7 @@
   },
   "string": {
     "LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
-      "value": true,
+      "value": false,
       "doc": "Read more than a byte at a time to perform byte-string operations like strlen."
     },
     "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {

From d7e64d9594d241d6a9186fadad2b0d40a8fba8a7 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Fri, 13 Jun 2025 15:48:46 -0700
Subject: [PATCH 453/851] [MSAN] handle assorted AVX permutations (#143462)

---
 .../Instrumentation/MemorySanitizer.cpp       |  82 ++-
 .../X86/avx2-intrinsics-x86.ll                |  38 +-
 .../X86/avx512-intrinsics-upgrade.ll          | 430 +++++++------
 .../MemorySanitizer/X86/avx512-intrinsics.ll  | 428 +++++++------
 .../X86/avx512vl-intrinsics.ll                | 595 ++++++++++++------
 .../MemorySanitizer/X86/x86-vpermi2.ll        | 205 +++---
 .../i386/avx2-intrinsics-i386.ll              |  38 +-
 7 files changed, 1164 insertions(+), 652 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index d3c6a7151ec37..fb55bd7bfe567 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -4173,7 +4173,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   // Instrument AVX permutation intrinsic.
   // We apply the same permutation (argument index 1) to the shadow.
-  void handleAVXVpermilvar(IntrinsicInst &I) {
+  void handleAVXPermutation(IntrinsicInst &I) {
+    assert(I.arg_size() == 2);
+    assert(isa<FixedVectorType>(I.getArgOperand(0)->getType()));
+    assert(isa<FixedVectorType>(I.getArgOperand(1)->getType()));
+    [[maybe_unused]] auto ArgVectorSize =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
+    assert(cast<FixedVectorType>(I.getArgOperand(1)->getType())
+               ->getNumElements() == ArgVectorSize);
+    assert(I.getType() == I.getArgOperand(0)->getType());
     IRBuilder<> IRB(&I);
     Value *Shadow = getShadow(&I, 0);
     insertShadowCheck(I.getArgOperand(1), &I);
@@ -4187,6 +4195,38 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I)));
     setOriginForNaryOp(I);
   }
+  // Instrument AVX permutation intrinsic.
+  // We apply the same permutation (argument index 1) to the shadows.
+  void handleAVXVpermil2var(IntrinsicInst &I) {
+    assert(I.arg_size() == 3);
+    assert(isa<FixedVectorType>(I.getArgOperand(0)->getType()));
+    assert(isa<FixedVectorType>(I.getArgOperand(1)->getType()));
+    assert(isa<FixedVectorType>(I.getArgOperand(2)->getType()));
+    [[maybe_unused]] auto ArgVectorSize =
+        cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
+    assert(cast<FixedVectorType>(I.getArgOperand(1)->getType())
+               ->getNumElements() == ArgVectorSize);
+    assert(cast<FixedVectorType>(I.getArgOperand(2)->getType())
+               ->getNumElements() == ArgVectorSize);
+    assert(I.getArgOperand(0)->getType() == I.getArgOperand(2)->getType());
+    assert(I.getType() == I.getArgOperand(0)->getType());
+    assert(I.getArgOperand(1)->getType()->isIntOrIntVectorTy());
+    IRBuilder<> IRB(&I);
+    Value *AShadow = getShadow(&I, 0);
+    Value *Idx = I.getArgOperand(1);
+    Value *BShadow = getShadow(&I, 2);
+    insertShadowCheck(Idx, &I);
+
+    // Shadows are integer-ish types but some intrinsics require a
+    // different (e.g., floating-point) type.
+    AShadow = IRB.CreateBitCast(AShadow, I.getArgOperand(0)->getType());
+    BShadow = IRB.CreateBitCast(BShadow, I.getArgOperand(2)->getType());
+    CallInst *CI = IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(),
+                                       {AShadow, Idx, BShadow});
+
+    setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I)));
+    setOriginForNaryOp(I);
+  }
 
   // Instrument BMI / BMI2 intrinsics.
   // All of these intrinsics are Z = I(X, Y)
@@ -5132,16 +5172,52 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       assert(Success);
       break;
     }
-
+    case Intrinsic::x86_avx2_permd:
+    case Intrinsic::x86_avx2_permps:
+    case Intrinsic::x86_ssse3_pshuf_b_128:
+    case Intrinsic::x86_avx2_pshuf_b:
+    case Intrinsic::x86_avx512_pshuf_b_512:
+    case Intrinsic::x86_avx512_permvar_df_256:
+    case Intrinsic::x86_avx512_permvar_df_512:
+    case Intrinsic::x86_avx512_permvar_di_256:
+    case Intrinsic::x86_avx512_permvar_di_512:
+    case Intrinsic::x86_avx512_permvar_hi_128:
+    case Intrinsic::x86_avx512_permvar_hi_256:
+    case Intrinsic::x86_avx512_permvar_hi_512:
+    case Intrinsic::x86_avx512_permvar_qi_128:
+    case Intrinsic::x86_avx512_permvar_qi_256:
+    case Intrinsic::x86_avx512_permvar_qi_512:
+    case Intrinsic::x86_avx512_permvar_sf_512:
+    case Intrinsic::x86_avx512_permvar_si_512:
     case Intrinsic::x86_avx_vpermilvar_pd:
     case Intrinsic::x86_avx_vpermilvar_pd_256:
     case Intrinsic::x86_avx512_vpermilvar_pd_512:
     case Intrinsic::x86_avx_vpermilvar_ps:
     case Intrinsic::x86_avx_vpermilvar_ps_256:
     case Intrinsic::x86_avx512_vpermilvar_ps_512: {
-      handleAVXVpermilvar(I);
+      handleAVXPermutation(I);
       break;
     }
+    case Intrinsic::x86_avx512_vpermi2var_d_128:
+    case Intrinsic::x86_avx512_vpermi2var_d_256:
+    case Intrinsic::x86_avx512_vpermi2var_d_512:
+    case Intrinsic::x86_avx512_vpermi2var_hi_128:
+    case Intrinsic::x86_avx512_vpermi2var_hi_256:
+    case Intrinsic::x86_avx512_vpermi2var_hi_512:
+    case Intrinsic::x86_avx512_vpermi2var_pd_128:
+    case Intrinsic::x86_avx512_vpermi2var_pd_256:
+    case Intrinsic::x86_avx512_vpermi2var_pd_512:
+    case Intrinsic::x86_avx512_vpermi2var_ps_128:
+    case Intrinsic::x86_avx512_vpermi2var_ps_256:
+    case Intrinsic::x86_avx512_vpermi2var_ps_512:
+    case Intrinsic::x86_avx512_vpermi2var_q_128:
+    case Intrinsic::x86_avx512_vpermi2var_q_256:
+    case Intrinsic::x86_avx512_vpermi2var_q_512:
+    case Intrinsic::x86_avx512_vpermi2var_qi_128:
+    case Intrinsic::x86_avx512_vpermi2var_qi_256:
+    case Intrinsic::x86_avx512_vpermi2var_qi_512:
+      handleAVXVpermil2var(I);
+      break;
 
     case Intrinsic::x86_avx512fp16_mask_add_sh_round:
     case Intrinsic::x86_avx512fp16_mask_sub_sh_round:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
index f916130fe53e5..9649f2dc71f1f 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll
@@ -740,8 +740,15 @@ define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[TMP1]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[RES]]
 ;
@@ -969,8 +976,15 @@ define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[TMP1]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -985,18 +999,18 @@ define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[TMP3]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP6]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
-; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1]])
+; CHECK-NEXT:    store <8 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[RES]]
 ;
   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
index 5aeaa1221cd21..3eeb5886b5fca 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
@@ -13171,18 +13171,18 @@ define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i6
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP3]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
+; CHECK-NEXT:    store <8 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP7]]
 ;
   %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -13197,24 +13197,24 @@ define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP7]], <8 x i64> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
@@ -13232,23 +13232,23 @@ define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP6]], <8 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
@@ -13266,8 +13266,15 @@ define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP3]]
 ;
@@ -13283,8 +13290,15 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
@@ -13307,8 +13321,15 @@ define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i6
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
@@ -13331,18 +13352,18 @@ define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP3]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP7]]
 ;
   %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -13357,24 +13378,24 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP7]], <16 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
@@ -13392,23 +13413,23 @@ define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
@@ -13426,8 +13447,15 @@ define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
 ;
@@ -13443,8 +13471,15 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
@@ -13467,8 +13502,15 @@ define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
@@ -13700,8 +13742,8 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT:    [[TMP14:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
@@ -13714,9 +13756,15 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X4:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[TMP4]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[TMP14]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X4:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP10]]
 ;
@@ -13744,9 +13792,15 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
@@ -13768,25 +13822,23 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x
 define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)  #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP8]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
-; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP9]]
 ;
   %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -13797,32 +13849,30 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0,
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP8]], <8 x i64> [[TMP2]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
@@ -13838,25 +13888,23 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x
 define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)  #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP8]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP9]]
 ;
   %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -13867,32 +13915,30 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0,
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP8]], <16 x i32> [[TMP2]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
@@ -13908,12 +13954,18 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <
 define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)  #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP8]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP4]]
 ;
@@ -13925,13 +13977,19 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
@@ -13968,9 +14026,15 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X0:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -13999,7 +14063,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
 ; CHECK:       6:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
@@ -14013,26 +14077,24 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ; CHECK-NEXT:    [[X2INS:%.*]] = insertelement <8 x double> [[EXTRA_PARAM:%.*]], double [[X2S]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> [[TMP5]], <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> [[EXTRA_PARAM]], <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to <8 x double>
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP11]], <8 x i64> [[X0:%.*]], <8 x double> [[TMP24]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x double> [[TMP13]] to <8 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
 ; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
-; CHECK:       14:
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
+; CHECK:       16:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       15:
-; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
+; CHECK:       17:
+; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0]], <8 x double> [[X2]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP14]], <8 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
@@ -14052,30 +14114,28 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X0:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP9]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
@@ -14093,13 +14153,19 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i64> [[TMP13]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
@@ -14120,12 +14186,18 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>
 define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)  #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP8]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -14137,13 +14209,19 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16
 ;
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i32> [[TMP13]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
index 1644a5e3a045c..4b559bc9fb8eb 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
@@ -5467,9 +5467,15 @@ define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK:       11:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       12:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP9]]
 ;
@@ -5496,9 +5502,15 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X1:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
@@ -5522,24 +5534,22 @@ declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>,
 define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP8]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP12:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
-; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
+; CHECK-NEXT:    store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP9]]
 ;
   %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
@@ -5549,32 +5559,30 @@ define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x
 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to <8 x double>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]], <8 x double> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]], <8 x double> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP8]], <8 x i64> [[TMP2]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
@@ -5593,24 +5601,22 @@ declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>
 define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP8]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP12:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
+; CHECK-NEXT:    store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP9]]
 ;
   %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
@@ -5620,32 +5626,30 @@ define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16
 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP8]], <16 x i32> [[TMP2]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
@@ -5664,12 +5668,18 @@ declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i
 define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP8]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP4]]
 ;
@@ -5680,13 +5690,19 @@ define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %
 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
@@ -5722,9 +5738,15 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
 ; CHECK-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP2]], <16 x i32> [[X0:%.*]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP18]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       12:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       13:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -5753,7 +5775,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
 ; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
@@ -5767,26 +5789,24 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 ; CHECK-NEXT:    [[X2INS:%.*]] = insertelement <8 x double> [[EXTRA_PARAM:%.*]], double [[X2S]], i32 0
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> [[TMP6]], <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> [[EXTRA_PARAM2:%.*]], <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT:    [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i64> [[TMP2]] to <8 x double>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP14:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[TMP24]], <8 x i64> [[X0:%.*]], <8 x double> [[TMP13]])
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast <8 x double> [[TMP14]] to <8 x i64>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
 ; CHECK-NEXT:    [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT:    br i1 [[_MSOR5]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK:       15:
+; CHECK-NEXT:    br i1 [[_MSCMP4]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
+; CHECK:       17:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       16:
-; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
+; CHECK:       18:
+; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0]], <8 x double> [[X2]])
 ; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
@@ -5805,30 +5825,28 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to <16 x float>
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[TMP5]], <16 x i32> [[X0:%.*]], <16 x float> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP19]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP9]] to i512
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK:       8:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label [[TMP20:%.*]], label [[TMP21:%.*]], !prof [[PROF1]]
+; CHECK:       10:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       9:
-; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
+; CHECK:       11:
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0]], <16 x float> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
@@ -5844,13 +5862,19 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i64> [[TMP13]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0]], <8 x i64> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
@@ -5871,12 +5895,18 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
 define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP8]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -5887,13 +5917,19 @@ define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32
 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
 ; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i32> [[TMP13]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0]], <16 x i32> [[X2:%.*]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
@@ -9441,18 +9477,18 @@ define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i6
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP3]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
+; CHECK-NEXT:    store <8 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x double> [[TMP7]]
 ;
   %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
@@ -9466,24 +9502,24 @@ define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP5]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP7]], <8 x i64> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
@@ -9502,23 +9538,23 @@ define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[TMP4]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP6]], <8 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
@@ -9538,8 +9574,15 @@ define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[TMP3]]
 ;
@@ -9554,8 +9597,15 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
@@ -9579,8 +9629,15 @@ define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i6
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
@@ -9605,18 +9662,18 @@ define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP3]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK:       5:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK:       7:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       6:
-; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT:    store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
+; CHECK-NEXT:    store <16 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x float> [[TMP7]]
 ;
   %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
@@ -9630,24 +9687,24 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP5]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK:       7:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]]
+; CHECK:       9:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       8:
-; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK:       10:
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP7]], <16 x i32> [[TMP4]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
@@ -9666,23 +9723,23 @@ define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <16 x float>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[TMP4]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK:       9:
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
@@ -9702,8 +9759,15 @@ define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32>
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK:       5:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       6:
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP3]]
 ;
@@ -9718,8 +9782,15 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK:       7:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       8:
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
@@ -9743,8 +9814,15 @@ define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
index 14d68b449a7b6..40b5e9338e45e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -3,6 +3,79 @@
 
 ; Forked from llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
 
+; Strictly handled instructions:
+; * llvm.x86.avx512.mask.cmp.pd
+; * llvm.x86.avx512.mask.cmp.ps
+; * llvm.x86.avx512.mask.compress
+; * llvm.x86.avx512.mask.cvtpd2dq
+; * llvm.x86.avx512.mask.cvtp
+; * llvm.x86.avx512.mask.cvtpd2udq
+; * llvm.x86.avx512.mask.cvtps2dq
+; * llvm.x86.avx512.mask.cvtps2udq
+; * llvm.x86.avx512.mask.cvttpd2dq
+; * llvm.x86.avx512.mask.cvttpd2udq
+; * llvm.x86.avx512.mask.cvttps2udq
+; * llvm.x86.avx512.mask.expand
+; * llvm.x86.avx512.mask.fixupimm.pd
+; * llvm.x86.avx512.mask.fixupimm.ps
+; * llvm.x86.avx512.mask.getexp.pd
+; * llvm.x86.avx512.mask.getexp.ps
+; * llvm.x86.avx512.mask.getmant.pd
+; * llvm.x86.avx512.mask.getmant.ps
+; * llvm.x86.avx512.mask.pmov.db
+; * llvm.x86.avx512.mask.pmov.db.mem
+; * llvm.x86.avx512.mask.pmov.dw
+; * llvm.x86.avx512.mask.pmov.dw.mem
+; * llvm.x86.avx512.mask.pmov.qb
+; * llvm.x86.avx512.mask.pmov.qb.mem
+; * llvm.x86.avx512.mask.pmov.qd
+; * llvm.x86.avx512.mask.pmov.qd.mem
+; * llvm.x86.avx512.mask.pmov.qw
+; * llvm.x86.avx512.mask.pmov.qw.mem
+; * llvm.x86.avx512.mask.pmovs.db
+; * llvm.x86.avx512.mask.pmovs.db.mem
+; * llvm.x86.avx512.mask.pmovs.dw
+; * llvm.x86.avx512.mask.pmovs.dw.mem
+; * llvm.x86.avx512.mask.pmovs.qb
+; * llvm.x86.avx512.mask.pmovs.qb.mem
+; * llvm.x86.avx512.mask.pmovs.qd
+; * llvm.x86.avx512.mask.pmovs.qd.mem
+; * llvm.x86.avx512.mask.pmovs.qw
+; * llvm.x86.avx512.mask.pmovs.qw.mem
+; * llvm.x86.avx512.mask.pmovus.db
+; * llvm.x86.avx512.mask.pmovus.db.mem
+; * llvm.x86.avx512.mask.pmovus.dw
+; * llvm.x86.avx512.mask.pmovus.dw.mem
+; * llvm.x86.avx512.mask.pmovus.qb
+; * llvm.x86.avx512.mask.pmovus.qb.mem
+; * llvm.x86.avx512.mask.pmovus.qd
+; * llvm.x86.avx512.mask.pmovus.qd.mem
+; * llvm.x86.avx512.mask.pmovus.qw
+; * llvm.x86.avx512.mask.pmovus.qw.mem
+; * llvm.x86.avx512.mask.rndscale.pd
+; * llvm.x86.avx512.mask.rndscale.ps
+; * llvm.x86.avx512.mask.scalef.pd
+; * llvm.x86.avx512.mask.scalef.ps
+; * llvm.x86.avx512.mask.vcvtps2ph
+; * llvm.x86.avx512.maskz.fixupimm.pd
+; * llvm.x86.avx512.maskz.fixupimm.ps
+; * llvm.x86.avx512.pternlog.d
+; * llvm.x86.avx512.pternlog.q
+; * llvm.x86.avx512.rcp14.pd
+; * llvm.x86.avx512.rcp14.ps
+; * llvm.x86.avx512.rsqrt14.pd
+; * llvm.x86.avx512.rsqrt14.ps
+;
+; Heuristically handled instructions:
+; * llvm.fma.v2f64
+; * llvm.fma.v4f32
+; * llvm.fma.v4f64
+; * llvm.fma.v8f32
+; * llvm.x86.avx.max.ps.256
+; * llvm.x86.avx.min.ps.256
+; * llvm.x86.sse.max.ps
+; * llvm.x86.sse.min.ps
+
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -1901,11 +1974,17 @@ define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermi2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X1]], <4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
@@ -1919,12 +1998,18 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermi2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X1]], <4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -1950,11 +2035,17 @@ define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_vpermt2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[X0]], <4 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
@@ -1968,12 +2059,18 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_mask_vpermt2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2000,12 +2097,18 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_128(
 ; CHECK-SAME: <4 x i32> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i32> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP8]], <4 x i32> [[X0]], <4 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X1]], <4 x i32> [[X0]], <4 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2033,11 +2136,17 @@ define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_vpermi2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X1]], <8 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -2051,12 +2160,18 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermi2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X1]], <8 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2079,11 +2194,17 @@ define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i3
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_ask_vpermt2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[X0]], <8 x i32> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP1]]
@@ -2097,12 +2218,18 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_mask_vpermt2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2126,12 +2253,18 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx512_maskz_vpermt2var_d_256(
 ; CHECK-SAME: <8 x i32> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x i32> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP8]], <8 x i32> [[X0]], <8 x i32> [[TMP9]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X1]], <8 x i32> [[X0]], <8 x i32> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2156,24 +2289,22 @@ define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_vpermi2var_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to i128
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP9]] to <2 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <2 x double>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[TMP8]], <2 x i64> [[X1]], <2 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP10]] to <2 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
-; CHECK-NEXT:    store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2)
@@ -2185,34 +2316,32 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0,
 ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_vpermi2var_pd_128(
 ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP13]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP8]] to i128
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP11]] to <2 x double>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <2 x i64> [[TMP8]] to <2 x double>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[TMP9]], <2 x i64> [[X1]], <2 x double> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <2 x double> [[TMP17]] to <2 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <2 x i64> [[TMP13]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK:       [[BB8]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB9]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> [[X0]], <2 x i64> [[X1]], <2 x double> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[X1]] to <2 x double>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> zeroinitializer, <2 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select <2 x i1> [[EXTRACT]], <2 x i64> [[TMP18]], <2 x i64> [[TMP13]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <2 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = or <2 x i64> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <2 x i64> [[TMP20]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <2 x i1> [[_MSPROP]], <2 x i64> [[TMP21]], <2 x i64> [[TMP16]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <2 x i1> [[EXTRACT]], <2 x double> [[TMP1]], <2 x double> [[TMP2]]
@@ -2233,24 +2362,22 @@ define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_vpermi2var_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP4]] to i256
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i64> [[TMP9]] to <4 x double>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP4]] to <4 x double>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[TMP8]], <4 x i64> [[X1]], <4 x double> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP10]] to <4 x i64>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
-; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2)
@@ -2262,34 +2389,32 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0,
 ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_vpermi2var_pd_256(
 ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP13]] to i256
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP11]] to <4 x double>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP8]] to <4 x double>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[TMP9]], <4 x i64> [[X1]], <4 x double> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x double> [[TMP17]] to <4 x i64>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP13]] to i256
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK:       [[BB8]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB9]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> [[X0]], <4 x i64> [[X1]], <4 x double> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[X1]] to <4 x double>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP18]], <4 x i64> [[TMP13]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i64> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i64> [[TMP20]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP21]], <4 x i64> [[TMP16]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x double> [[TMP1]], <4 x double> [[TMP2]]
@@ -2310,24 +2435,22 @@ define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_vpermi2var_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to i128
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP9]] to <4 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP8]], <4 x i32> [[X1]], <4 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
-; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2)
@@ -2339,34 +2462,32 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <
 ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_vpermi2var_ps_128(
 ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x i32> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP8]] to i128
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP11]] to <4 x float>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP9]], <4 x i32> [[X1]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x float> [[TMP17]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK:       [[BB8]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB9]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1]], <4 x float> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[X1]] to <4 x float>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP18]], <4 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = or <4 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP20]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP21]], <4 x i32> [[TMP16]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
@@ -2392,30 +2513,28 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP11]] to <4 x i32>
 ; CHECK-NEXT:    [[X1CAST:%.*]] = bitcast <2 x i64> [[X1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <4 x i32> [[TMP12]] to i128
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP19]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP14]] to i128
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP13]] to i128
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x i32> [[TMP12]] to <4 x float>
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x i32> [[TMP13]] to <4 x float>
+; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[TMP16]], <4 x i32> [[X1CAST]], <4 x float> [[TMP18]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x float> [[TMP19]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP14]] to i128
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
-; CHECK:       [[BB9]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]]
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB10]]:
+; CHECK:       [[BB12]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> [[X0]], <4 x i32> [[X1CAST]], <4 x float> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[X1CAST]] to <4 x float>
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> [[TMP15]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP9]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = or <4 x i32> [[TMP7]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or <4 x i32> [[TMP21]], [[TMP14]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i32> [[TMP22]], <4 x i32> [[TMP17]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP1]], <4 x float> [[TMP2]]
@@ -2437,24 +2556,22 @@ define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_vpermi2var_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP4]] to i256
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i32> [[TMP9]] to <8 x float>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[TMP8]], <8 x i32> [[X1]], <8 x float> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x float> [[TMP10]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP3]] to i256
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
-; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[TMP1]]
 ;
   %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2)
@@ -2466,32 +2583,30 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
 ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_vpermi2var_ps_256(
 ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x i32> [[X1:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP13]] to i256
-; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP8]] to i256
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i32> [[TMP11]] to <8 x float>
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i32> [[TMP8]] to <8 x float>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[TMP9]], <8 x i32> [[X1]], <8 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <8 x float> [[TMP17]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i32> [[TMP13]] to i256
 ; CHECK-NEXT:    [[_MSCMP2:%.*]] = icmp ne i256 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT:    br i1 [[_MSOR3]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK:       [[BB8]]:
+; CHECK-NEXT:    br i1 [[_MSCMP2]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]]
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB9]]:
+; CHECK:       [[BB11]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> [[X0]], <8 x i32> [[X1]], <8 x float> [[X2]])
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i32> [[X1]] to <8 x float>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> zeroinitializer, <8 x i32> [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP18]], <8 x i32> [[TMP13]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP1]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = or <8 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = or <8 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = or <8 x i32> [[TMP20]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP14]], <8 x i32> [[TMP21]], <8 x i32> [[TMP16]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = select <8 x i1> [[TMP3]], <8 x float> [[TMP1]], <8 x float> [[TMP2]]
@@ -2511,11 +2626,17 @@ define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermi2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X1]], <2 x i64> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
@@ -2529,12 +2650,18 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermi2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X1]], <2 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[X1]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2560,11 +2687,17 @@ define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_vpermt2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[X0]], <2 x i64> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[TMP1]]
@@ -2578,12 +2711,18 @@ define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_mask_vpermt2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2610,12 +2749,18 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x
 ; CHECK-LABEL: define <2 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_128(
 ; CHECK-SAME: <2 x i64> [[X0:%.*]], <2 x i64> [[X1:%.*]], <2 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <2 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP8]], <2 x i64> [[X0]], <2 x i64> [[TMP9]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X1]], <2 x i64> [[X0]], <2 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2643,11 +2788,17 @@ define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermi2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X1]], <4 x i64> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -2661,12 +2812,18 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermi2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X1]], <4 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[X1]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2692,11 +2849,17 @@ define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_vpermt2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[X0]], <4 x i64> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -2710,12 +2873,18 @@ define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_mask_vpermt2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP6]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP6]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -2742,12 +2911,18 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x
 ; CHECK-LABEL: define <4 x i64> @test_int_x86_avx512_maskz_vpermt2var_q_256(
 ; CHECK-SAME: <4 x i64> [[X0:%.*]], <4 x i64> [[X1:%.*]], <4 x i64> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[TMP8]], [[TMP3]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[_MSPROP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP8]], <4 x i64> [[X0]], <4 x i64> [[TMP9]])
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP3]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X1]], <4 x i64> [[X0]], <4 x i64> [[X2]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP11]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -8458,18 +8633,18 @@ define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i6
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP5]] to <4 x double>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[TMP3]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x double> [[TMP6]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK:       [[BB5]]:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB6]]:
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
-; CHECK-NEXT:    store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x double> [[TMP1]]
 ;
   %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1)
@@ -8485,26 +8660,26 @@ define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP8]] to <4 x double>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[TMP14]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <4 x double> [[TMP16]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP15]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
-; CHECK:       [[BB7]]:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]]
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB8]]:
+; CHECK:       [[BB10]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> [[TMP13]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP18]], <4 x i64> [[TMP13]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[X2]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i64> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i64> [[TMP7]], [[TMP13]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP17]], <4 x i64> [[TMP12]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> [[X2]]
@@ -8526,25 +8701,25 @@ define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP10]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP10]] to <4 x double>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[TMP13]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <4 x double> [[TMP15]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <4 x i64> [[TMP11]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP14]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB7]]:
+; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[EXTRACT1:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> zeroinitializer, <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x i64> [[TMP16]], <4 x i64> zeroinitializer
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i64> [[TMP5]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i64> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP]], <4 x i64> [[TMP7]], <4 x i64> [[TMP3]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[EXTRACT1]], <4 x double> [[TMP1]], <4 x double> zeroinitializer
@@ -8566,7 +8741,14 @@ define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[TMP3]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB6]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
@@ -8584,7 +8766,14 @@ define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[TMP5]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB8]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -8614,7 +8803,14 @@ define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i6
 ; CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[TMP8]], <4 x i64> [[X1]])
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP12]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> [[X0]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[X3]] to <8 x i1>
@@ -12267,8 +12463,7 @@ define <8 x i32> @combine_vpermi2d_vpermps(<16 x i32> noundef %a) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> splat (i32 -1), <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <8 x i32> [[_MSPROP]], zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = or <8 x i32> [[_MSPROP2]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[_MSPROP]], <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> [[_MSPROP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 14, i32 13, i32 6, i32 3, i32 5, i32 15, i32 0, i32 1>, <8 x i32> [[TMP2]])
 ; CHECK-NEXT:    store <8 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP3]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
index 2350d75b29b44..35e1feb3aa201 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/x86-vpermi2.ll
@@ -16,8 +16,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64(<2 x i64> %x0, <2 x i64> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP1]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X1]])
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
@@ -31,8 +30,7 @@ define <2 x i64> @shuffle_vpermv3_v2i64_unary(<2 x i64> %x0) #0 {
 ; CHECK-SAME: <2 x i64> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP1]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> <i64 2, i64 0>, <2 x i64> [[X0]])
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
@@ -55,8 +53,14 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits(<2 x i64> %x0, <2 x i64> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 4>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[T]], <2 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
@@ -80,8 +84,14 @@ define <2 x i64> @shuffle_vpermv3_v2i64_demandedbits_negative(<2 x i64> %x0, <2
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <2 x i64> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <2 x i64> [[M]], <i64 0, i64 2>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <2 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[TMP6]], <2 x i64> [[T]], <2 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> [[X0]], <2 x i64> [[T]], <2 x i64> [[X1]])
 ; CHECK-NEXT:    store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
@@ -97,8 +107,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64(<4 x i64> %x0, <4 x i64> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP1]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X1]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
@@ -112,8 +121,7 @@ define <4 x i64> @shuffle_vpermv3_v4i64_unary(<4 x i64> %x0) #0 {
 ; CHECK-SAME: <4 x i64> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP1]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> <i64 7, i64 2, i64 6, i64 0>, <4 x i64> [[X0]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
@@ -136,8 +144,14 @@ define <4 x i64> @shuffle_vpermv3_v4i64_demandedbits(<4 x i64> %x0, <4 x i64> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i64> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <4 x i64> [[M]], <i64 0, i64 8, i64 16, i64 32>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[TMP6]], <4 x i64> [[T]], <4 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> [[X0]], <4 x i64> [[T]], <4 x i64> [[X1]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
@@ -153,8 +167,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64(<8 x i64> %x0, <8 x i64> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X1]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[R]]
@@ -168,8 +181,7 @@ define <8 x i64> @shuffle_vpermv3_v8i64_unary(<8 x i64> %x0) #0 {
 ; CHECK-SAME: <8 x i64> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP1]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> <i64 8, i64 6, i64 10, i64 4, i64 12, i64 2, i64 14, i64 0>, <8 x i64> [[X0]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[R]]
@@ -192,8 +204,14 @@ define <8 x i64> @shuffle_vpermv3_v8i64_demandedbits(<8 x i64> %x0, <8 x i64> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <8 x i64> [[M]], <i64 0, i64 16, i64 32, i64 64, i64 256, i64 512, i64 1024, i64 -16>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[TMP6]], <8 x i64> [[T]], <8 x i64> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i64> [[TMP9]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0]], <8 x i64> [[T]], <8 x i64> [[X1]])
 ; CHECK-NEXT:    store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i64> [[R]]
@@ -213,8 +231,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32(<4 x i32> %x0, <4 x i32> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP1]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X1]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
@@ -228,8 +245,7 @@ define <4 x i32> @shuffle_vpermv3_v4i32_unary(<4 x i32> %x0) #0 {
 ; CHECK-SAME: <4 x i32> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP1]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> <i32 7, i32 2, i32 6, i32 0>, <4 x i32> [[X0]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
@@ -252,8 +268,14 @@ define <4 x i32> @shuffle_vpermv3_v4i32_demandedbits(<4 x i32> %x0, <4 x i32> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <4 x i32> [[M]], <i32 0, i32 8, i32 16, i32 32>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[TMP6]], <4 x i32> [[T]], <4 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> [[X0]], <4 x i32> [[T]], <4 x i32> [[X1]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
@@ -269,8 +291,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32(<8 x i32> %x0, <8 x i32> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X1]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
@@ -284,8 +305,7 @@ define <8 x i32> @shuffle_vpermv3_v8i32_unary(<8 x i32> %x0) #0 {
 ; CHECK-SAME: <8 x i32> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> <i32 8, i32 6, i32 10, i32 4, i32 12, i32 2, i32 14, i32 0>, <8 x i32> [[X0]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
@@ -308,8 +328,14 @@ define <8 x i32> @shuffle_vpermv3_v8i32_demandedbits(<8 x i32> %x0, <8 x i32> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i32> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <8 x i32> [[M]], <i32 0, i32 16, i32 32, i32 64, i32 256, i32 512, i32 -16, i32 -32>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[TMP6]], <8 x i32> [[T]], <8 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i32> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> [[X0]], <8 x i32> [[T]], <8 x i32> [[X1]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
@@ -325,8 +351,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32(<16 x i32> %x0, <16 x i32> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X1]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
@@ -340,8 +365,7 @@ define <16 x i32> @shuffle_vpermv3_v16i32_unary(<16 x i32> %x0) #0 {
 ; CHECK-SAME: <16 x i32> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP1]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> <i32 16, i32 14, i32 18, i32 12, i32 20, i32 10, i32 22, i32 8, i32 24, i32 6, i32 26, i32 4, i32 28, i32 2, i32 30, i32 0>, <16 x i32> [[X0]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
@@ -364,8 +388,14 @@ define <16 x i32> @shuffle_vpermv3_v16i32_demandedbits(<16 x i32> %x0, <16 x i32
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <16 x i32> [[M]], <i32 0, i32 32, i32 64, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 -32, i32 -64, i32 -128, i32 -256, i32 -512, i32 -1024, i32 -2048>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[TMP6]], <16 x i32> [[T]], <16 x i32> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i32> [[TMP9]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0]], <16 x i32> [[T]], <16 x i32> [[X1]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[R]]
@@ -385,8 +415,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16(<8 x i16> %x0, <8 x i16> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP1]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X1]])
 ; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
@@ -400,8 +429,7 @@ define <8 x i16> @shuffle_vpermv3_v8i16_unary(<8 x i16> %x0) #0 {
 ; CHECK-SAME: <8 x i16> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP1]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> <i16 8, i16 6, i16 10, i16 4, i16 12, i16 2, i16 14, i16 0>, <8 x i16> [[X0]])
 ; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
@@ -424,8 +452,14 @@ define <8 x i16> @shuffle_vpermv3_v8i16_demandedbits(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <8 x i16> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <8 x i16> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <8 x i16> [[M]], <i16 0, i16 16, i16 32, i16 64, i16 256, i16 512, i16 -16, i16 -32>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[TMP6]], <8 x i16> [[T]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> [[X0]], <8 x i16> [[T]], <8 x i16> [[X1]])
 ; CHECK-NEXT:    store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
@@ -441,8 +475,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16(<16 x i16> %x0, <16 x i16> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP1]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X1]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[R]]
@@ -456,8 +489,7 @@ define <16 x i16> @shuffle_vpermv3_v16i16_unary(<16 x i16> %x0) #0 {
 ; CHECK-SAME: <16 x i16> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP1]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> <i16 16, i16 14, i16 18, i16 12, i16 20, i16 10, i16 22, i16 8, i16 24, i16 6, i16 26, i16 4, i16 28, i16 2, i16 30, i16 0>, <16 x i16> [[X0]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[R]]
@@ -480,8 +512,14 @@ define <16 x i16> @shuffle_vpermv3_v16i16_demandedbits(<16 x i16> %x0, <16 x i16
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i16> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i16> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <16 x i16> [[M]], <i16 0, i16 32, i16 64, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 -32, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[TMP6]], <16 x i16> [[T]], <16 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i16> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> [[X0]], <16 x i16> [[T]], <16 x i16> [[X1]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[R]]
@@ -497,8 +535,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16(<32 x i16> %x0, <32 x i16> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X1]])
 ; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[R]]
@@ -512,8 +549,7 @@ define <32 x i16> @shuffle_vpermv3_v32i16_unary(<32 x i16> %x0) #0 {
 ; CHECK-SAME: <32 x i16> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP1]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> <i16 33, i16 17, i16 35, i16 19, i16 37, i16 21, i16 39, i16 23, i16 41, i16 25, i16 43, i16 27, i16 45, i16 29, i16 47, i16 31, i16 49, i16 14, i16 51, i16 12, i16 53, i16 10, i16 55, i16 8, i16 57, i16 6, i16 59, i16 4, i16 61, i16 2, i16 63, i16 0>, <32 x i16> [[X0]])
 ; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[R]]
@@ -536,8 +572,14 @@ define <32 x i16> @shuffle_vpermv3_v32i16_demandedbits(<32 x i16> %x0, <32 x i16
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i16> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i16> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <32 x i16> [[M]], <i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096, i16 0, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 0, i16 -64, i16 -128, i16 -256, i16 -512, i16 -1024, i16 -2048, i16 -4096>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i16> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i16> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP6]], <32 x i16> [[T]], <32 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <32 x i16> [[TMP9]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[X0]], <32 x i16> [[T]], <32 x i16> [[X1]])
 ; CHECK-NEXT:    store <32 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i16> [[R]]
@@ -557,8 +599,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8(<16 x i8> %x0, <16 x i8> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP1]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X1]])
 ; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
@@ -572,8 +613,7 @@ define <16 x i8> @shuffle_vpermv3_v16i8_unary(<16 x i8> %x0) #0 {
 ; CHECK-SAME: <16 x i8> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP1]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> <i8 16, i8 14, i8 18, i8 12, i8 20, i8 10, i8 22, i8 8, i8 24, i8 6, i8 26, i8 4, i8 28, i8 2, i8 30, i8 0>, <16 x i8> [[X0]])
 ; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
@@ -596,8 +636,14 @@ define <16 x i8> @shuffle_vpermv3_v16i8_demandedbits(<16 x i8> %x0, <16 x i8> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i8> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i8> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <16 x i8> [[M]], <i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128, i8 0, i8 32, i8 64, i8 -128, i8 0, i8 -32, i8 -64, i8 -128>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[TMP6]], <16 x i8> [[T]], <16 x i8> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP9]] to i128
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> [[X0]], <16 x i8> [[T]], <16 x i8> [[X1]])
 ; CHECK-NEXT:    store <16 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i8> [[R]]
@@ -613,8 +659,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8(<32 x i8> %x0, <32 x i8> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP1]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X1]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[R]]
@@ -628,8 +673,7 @@ define <32 x i8> @shuffle_vpermv3_v32i8_unary(<32 x i8> %x0) #0 {
 ; CHECK-SAME: <32 x i8> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP1]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> <i8 33, i8 17, i8 35, i8 19, i8 37, i8 21, i8 39, i8 23, i8 41, i8 25, i8 43, i8 27, i8 45, i8 29, i8 47, i8 31, i8 49, i8 14, i8 51, i8 12, i8 53, i8 10, i8 55, i8 8, i8 57, i8 6, i8 59, i8 4, i8 61, i8 2, i8 63, i8 0>, <32 x i8> [[X0]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[R]]
@@ -652,8 +696,14 @@ define <32 x i8> @shuffle_vpermv3_v32i8_demandedbits(<32 x i8> %x0, <32 x i8> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <32 x i8> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <32 x i8> [[M]], <i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128, i8 0, i8 0, i8 64, i8 -128, i8 0, i8 0, i8 -64, i8 -128>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <32 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[TMP6]], <32 x i8> [[T]], <32 x i8> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <32 x i8> [[TMP9]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> [[X0]], <32 x i8> [[T]], <32 x i8> [[X1]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[R]]
@@ -669,8 +719,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8(<64 x i8> %x0, <64 x i8> %x1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP1]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[TMP2]])
 ; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X1]])
 ; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <64 x i8> [[R]]
@@ -684,8 +733,7 @@ define <64 x i8> @shuffle_vpermv3_v64i8_unary(<64 x i8> %x0) #0 {
 ; CHECK-SAME: <64 x i8> [[X0:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], zeroinitializer
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP1]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP1]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[TMP1]])
 ; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> <i8 -128, i8 127, i8 126, i8 125, i8 124, i8 123, i8 122, i8 121, i8 120, i8 119, i8 118, i8 115, i8 51, i8 50, i8 49, i8 48, i8 47, i8 46, i8 45, i8 44, i8 43, i8 42, i8 41, i8 40, i8 39, i8 38, i8 37, i8 36, i8 35, i8 34, i8 33, i8 32, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <64 x i8> [[X0]])
 ; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <64 x i8> [[R]]
@@ -708,8 +756,14 @@ define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x
 ; CHECK-NEXT:    [[TMP8:%.*]] = or <64 x i8> [[TMP5]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[T:%.*]] = or <64 x i8> [[M]], <i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128, i8 0, i8 -128>
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <64 x i8> [[TMP6]], [[TMP9]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <64 x i8> [[_MSPROP]], [[TMP3]]
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[TMP6]], <64 x i8> [[T]], <64 x i8> [[TMP3]])
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <64 x i8> [[TMP9]] to i512
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i512 [[TMP11]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
+; CHECK:       [[BB12]]:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR4]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[BB13]]:
 ; CHECK-NEXT:    [[R:%.*]] = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> [[X0]], <64 x i8> [[T]], <64 x i8> [[X1]])
 ; CHECK-NEXT:    store <64 x i8> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <64 x i8> [[R]]
@@ -720,3 +774,6 @@ define <64 x i8> @shuffle_vpermv3_v64i8_demandedbits(<64 x i8> %x0, <64 x i8> %x
 }
 
 attributes #0 = { sanitize_memory }
+;.
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
+;.
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
index 5cc56baf0e0de..9d3e9d63eed28 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll
@@ -780,8 +780,15 @@ define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[TMP1]], <32 x i8> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i8> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1]])
 ; CHECK-NEXT:    store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <32 x i8> [[RES]]
 ;
@@ -1021,8 +1028,15 @@ define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[TMP1]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK:       6:
+; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
+; CHECK-NEXT:    unreachable
+; CHECK:       7:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> [[A0:%.*]], <8 x i32> [[A1]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -1038,18 +1052,18 @@ define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
+; CHECK-NEXT:    [[TMP10:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[TMP7]], <8 x i32> [[A1:%.*]])
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x float> [[TMP10]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
 ; CHECK-NEXT:    [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT:    [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT:    br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK:       6:
+; CHECK-NEXT:    br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK:       8:
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR6]]
 ; CHECK-NEXT:    unreachable
-; CHECK:       7:
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1:%.*]])
-; CHECK-NEXT:    store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK:       9:
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx2.permps(<8 x float> [[A0:%.*]], <8 x i32> [[A1]])
+; CHECK-NEXT:    store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x float> [[RES]]
 ;
   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]

From 7f69cd578de899f8b00525a02d1fe25dab567bcf Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvelez7@gmail.com>
Date: Fri, 13 Jun 2025 16:35:30 -0700
Subject: [PATCH 454/851] [clang-doc] remove default label on some switches
 (#143919)

LLVM style prefers no default label on fully covered switches to warn if
new enums are added. This patch removes the default label for that
purpose or uses IT_default instead of default if that was the only enum
not covered.
---
 clang-tools-extra/clang-doc/BitcodeReader.cpp         |  4 +---
 clang-tools-extra/clang-doc/BitcodeWriter.cpp         |  2 +-
 clang-tools-extra/clang-doc/Representation.cpp        |  2 +-
 clang-tools-extra/clang-doc/Serialize.cpp             | 11 ++++++++---
 clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp |  2 +-
 5 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/clang-tools-extra/clang-doc/BitcodeReader.cpp b/clang-tools-extra/clang-doc/BitcodeReader.cpp
index 57dd514b90a2b..35058abab0663 100644
--- a/clang-tools-extra/clang-doc/BitcodeReader.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeReader.cpp
@@ -54,10 +54,8 @@ static llvm::Error decodeRecord(const Record &R, AccessSpecifier &Field,
   case AS_none:
     Field = (AccessSpecifier)R[0];
     return llvm::Error::success();
-  default:
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "invalid value for AccessSpecifier");
   }
+  llvm_unreachable("invalid value for AccessSpecifier");
 }
 
 static llvm::Error decodeRecord(const Record &R, TagTypeKind &Field,
diff --git a/clang-tools-extra/clang-doc/BitcodeWriter.cpp b/clang-tools-extra/clang-doc/BitcodeWriter.cpp
index 708ce09d9e5b2..f8a6859169b01 100644
--- a/clang-tools-extra/clang-doc/BitcodeWriter.cpp
+++ b/clang-tools-extra/clang-doc/BitcodeWriter.cpp
@@ -664,7 +664,7 @@ bool ClangDocBitcodeWriter::dispatchInfoForWrite(Info *I) {
   case InfoType::IT_typedef:
     emitBlock(*static_cast<clang::doc::TypedefInfo *>(I));
     break;
-  default:
+  case InfoType::IT_default:
     llvm::errs() << "Unexpected info, unable to write.\n";
     return true;
   }
diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp
index 3ce930c6965db..820d644ef8b83 100644
--- a/clang-tools-extra/clang-doc/Representation.cpp
+++ b/clang-tools-extra/clang-doc/Representation.cpp
@@ -143,7 +143,7 @@ mergeInfos(std::vector<std::unique_ptr<Info>> &Values) {
     return reduce<FunctionInfo>(Values);
   case InfoType::IT_typedef:
     return reduce<TypedefInfo>(Values);
-  default:
+  case InfoType::IT_default:
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "unexpected info type");
   }
diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp
index 3cda38115ff7f..e8f1a9cee2675 100644
--- a/clang-tools-extra/clang-doc/Serialize.cpp
+++ b/clang-tools-extra/clang-doc/Serialize.cpp
@@ -388,7 +388,8 @@ std::string serialize(std::unique_ptr<Info> &I) {
     return serialize(*static_cast<EnumInfo *>(I.get()));
   case InfoType::IT_function:
     return serialize(*static_cast<FunctionInfo *>(I.get()));
-  default:
+  case InfoType::IT_typedef:
+  case InfoType::IT_default:
     return "";
   }
 }
@@ -525,9 +526,13 @@ static std::unique_ptr<Info> makeAndInsertIntoParent(ChildType Child) {
     InsertChild(ParentRec->Children, std::forward<ChildType>(Child));
     return ParentRec;
   }
-  default:
-    llvm_unreachable("Invalid reference type for parent namespace");
+  case InfoType::IT_default:
+  case InfoType::IT_enum:
+  case InfoType::IT_function:
+  case InfoType::IT_typedef:
+    break;
   }
+  llvm_unreachable("Invalid reference type for parent namespace");
 }
 
 // There are two uses for this function.
diff --git a/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp b/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
index bbe158ed50e28..659870d2a5c0d 100644
--- a/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/BitcodeTest.cpp
@@ -37,7 +37,7 @@ static std::string writeInfo(Info *I) {
     return writeInfo(*static_cast<FunctionInfo *>(I));
   case InfoType::IT_typedef:
     return writeInfo(*static_cast<TypedefInfo *>(I));
-  default:
+  case InfoType::IT_default:
     return "";
   }
 }

From 417ab37d85ad1bb3e5623dff487ef108404e37f5 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 17:11:52 -0700
Subject: [PATCH 455/851] [ConstantFolding] Fold deinterleave2 of any splat
 vector not just zeroinitializer (#144144)

While there remove an unnecessary dyn_cast from Constant to Constant.
Reverse a branch condition into an early out to reduce nesting.
---
 llvm/lib/Analysis/ConstantFolding.cpp         | 43 +++++++++----------
 .../InstSimplify/ConstProp/vector-calls.ll    | 16 +++++++
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 64a0f4641250c..2b7a438a9ef01 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -3990,31 +3990,30 @@ ConstantFoldStructCall(StringRef Name, Intrinsic::ID IntrinsicID,
     return ConstantStruct::get(StTy, SinResult, CosResult);
   }
   case Intrinsic::vector_deinterleave2: {
-    auto *Vec = dyn_cast<Constant>(Operands[0]);
-    if (!Vec)
-      return nullptr;
-
+    auto *Vec = Operands[0];
     auto *VecTy = cast<VectorType>(Vec->getType());
-    unsigned NumElements = VecTy->getElementCount().getKnownMinValue() / 2;
-    if (isa<ConstantAggregateZero>(Vec)) {
-      auto *HalfVecTy = VectorType::getHalfElementsVectorType(VecTy);
-      return ConstantStruct::get(StTy, ConstantAggregateZero::get(HalfVecTy),
-                                 ConstantAggregateZero::get(HalfVecTy));
+
+    if (auto *EltC = Vec->getSplatValue()) {
+      ElementCount HalfEC = VecTy->getElementCount().divideCoefficientBy(2);
+      auto *HalfVec = ConstantVector::getSplat(HalfEC, EltC);
+      return ConstantStruct::get(StTy, HalfVec, HalfVec);
     }
-    if (isa<FixedVectorType>(Vec->getType())) {
-      SmallVector<Constant *, 4> Res0(NumElements), Res1(NumElements);
-      for (unsigned I = 0; I < NumElements; ++I) {
-        Constant *Elt0 = Vec->getAggregateElement(2 * I);
-        Constant *Elt1 = Vec->getAggregateElement(2 * I + 1);
-        if (!Elt0 || !Elt1)
-          return nullptr;
-        Res0[I] = Elt0;
-        Res1[I] = Elt1;
-      }
-      return ConstantStruct::get(StTy, ConstantVector::get(Res0),
-                                 ConstantVector::get(Res1));
+
+    if (!isa<FixedVectorType>(Vec->getType()))
+      return nullptr;
+
+    unsigned NumElements = VecTy->getElementCount().getFixedValue() / 2;
+    SmallVector<Constant *, 4> Res0(NumElements), Res1(NumElements);
+    for (unsigned I = 0; I < NumElements; ++I) {
+      Constant *Elt0 = Vec->getAggregateElement(2 * I);
+      Constant *Elt1 = Vec->getAggregateElement(2 * I + 1);
+      if (!Elt0 || !Elt1)
+        return nullptr;
+      Res0[I] = Elt0;
+      Res1[I] = Elt1;
     }
-    return nullptr;
+    return ConstantStruct::get(StTy, ConstantVector::get(Res0),
+                               ConstantVector::get(Res1));
   }
   default:
     // TODO: Constant folding of vector intrinsics that fall through here does
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
index 9dbe3d4e50ee1..14543f339db5d 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll
@@ -66,3 +66,19 @@ define {<vscale x 4 x i32>, <vscale x 4 x i32>} @fold_scalable_vector_deinterlea
   %1 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<vscale x 8 x i32> zeroinitializer)
   ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %1
 }
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @fold_scalable_vector_deinterleave2_splat() {
+; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @fold_scalable_vector_deinterleave2_splat() {
+; CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i32> } { <vscale x 4 x i32> splat (i32 1), <vscale x 4 x i32> splat (i32 1) }
+;
+  %1 = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<vscale x 8 x i32> splat (i32 1))
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %1
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>} @fold_scalable_vector_deinterleave2_splatfp() {
+; CHECK-LABEL: define { <vscale x 4 x float>, <vscale x 4 x float> } @fold_scalable_vector_deinterleave2_splatfp() {
+; CHECK-NEXT:    ret { <vscale x 4 x float>, <vscale x 4 x float> } { <vscale x 4 x float> splat (float 1.000000e+00), <vscale x 4 x float> splat (float 1.000000e+00) }
+;
+  %1 = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.v4f32.v8f32(<vscale x 8 x float> splat (float 1.0))
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %1
+}

From 15f100d1445846cdb55c24e588a74fde522fc9c9 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Fri, 13 Jun 2025 17:17:20 -0700
Subject: [PATCH 456/851] [bazel] fix mlir/tblgen.bzl formatting after
 6e988bd33f5fa8a529ef9208d3e147945b7bb7ed

---
 utils/bazel/llvm-project-overlay/mlir/tblgen.bzl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
index 884d6f381b02d..89b17735e005b 100644
--- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
+++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl
@@ -525,7 +525,7 @@ def gentbl_sharded_ops(
         td_file = td_file,
         test = test,
         deps = deps,
-        **kwargs,
+        **kwargs
     )
     all_files = [hdr_out, src_out]
     for i in range(0, shard_count):
@@ -537,13 +537,13 @@ def gentbl_sharded_ops(
             out = out_file,
             sharder = sharder,
             src_file = src_file,
-            **kwargs,
+            **kwargs
         )
         all_files.append(out_file)
     native.filegroup(
         name = name,
         srcs = all_files,
-        **kwargs,
+        **kwargs
     )
 
 def gentbl_sharded_op_defs(name, source_file, shard_count):

From bd319d9071fb0c6e1bda9db500d039d32a49c28a Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Sat, 14 Jun 2025 09:42:39 +0900
Subject: [PATCH 457/851] [Cygwin] CYGWIN is not WIN32 in current CMake
 (#143130)

On old CMake, Cygwin were also WIN32 but currently not. LLVM_ON_UNIX=1
and LLVM_HAVE_LINK_VERSION_SCRIPT=0 should be defined for Cygwin target.
---
 llvm/cmake/config-ix.cmake                 | 16 +++++-----------
 llvm/cmake/modules/HandleLLVMOptions.cmake |  6 +++---
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 687f5077cbfd2..9895469973e47 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -1,8 +1,3 @@
-if( WIN32 AND NOT CYGWIN )
-  # We consider Cygwin as another Unix
-  set(PURE_WINDOWS 1)
-endif()
-
 include(CheckIncludeFile)
 include(CheckLibraryExists)
 include(CheckSymbolExists)
@@ -31,7 +26,7 @@ elseif (APPLE)
   set(HAVE_SYS_MMAN_H 1)
   set(HAVE_SYSEXITS_H 1)
   set(HAVE_UNISTD_H 1)
-elseif (PURE_WINDOWS)
+elseif (WIN32)
   set(HAVE_MACH_MACH_H 0)
   set(HAVE_MALLOC_MALLOC_H 0)
   set(HAVE_PTHREAD_H 0)
@@ -132,7 +127,7 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
 endif()
 
 # library checks
-if( NOT PURE_WINDOWS )
+if(NOT WIN32)
   check_library_exists(pthread pthread_create "" HAVE_LIBPTHREAD)
   if (HAVE_LIBPTHREAD)
     check_library_exists(pthread pthread_rwlock_init "" HAVE_PTHREAD_RWLOCK_INIT)
@@ -275,7 +270,7 @@ endif()
 # party code may call MSan interceptors like strlen, leading to false positives.
 if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*")
   # Don't look for these libraries on Windows.
-  if (NOT PURE_WINDOWS)
+  if (NOT WIN32)
     # Skip libedit if using ASan as it contains memory leaks.
     if (LLVM_ENABLE_LIBEDIT AND NOT LLVM_USE_SANITIZER MATCHES ".*Address.*")
       if(LLVM_ENABLE_LIBEDIT STREQUAL FORCE_ON)
@@ -384,7 +379,7 @@ check_symbol_exists(sbrk unistd.h HAVE_SBRK)
 check_symbol_exists(strerror_r string.h HAVE_STRERROR_R)
 check_symbol_exists(strerror_s string.h HAVE_DECL_STRERROR_S)
 check_symbol_exists(setenv stdlib.h HAVE_SETENV)
-if( PURE_WINDOWS )
+if(WIN32)
   check_symbol_exists(_chsize_s io.h HAVE__CHSIZE_S)
 
   check_function_exists(_alloca HAVE__ALLOCA)
@@ -420,8 +415,7 @@ else()
       "sys/types.h;sys/stat.h" HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC)
 endif()
 
-# This check requires _GNU_SOURCE.
-if (NOT PURE_WINDOWS)
+if (NOT WIN32)
   if (LLVM_PTHREAD_LIB)
     list(APPEND CMAKE_REQUIRED_LIBRARIES ${LLVM_PTHREAD_LIB})
   endif()
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index c35d9763a3301..e2f9826d39818 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -217,15 +217,15 @@ if( LLVM_REVERSE_ITERATION )
   set( LLVM_ENABLE_REVERSE_ITERATION 1 )
 endif()
 
-if(WIN32)
+if(WIN32 OR CYGWIN)
   set(LLVM_HAVE_LINK_VERSION_SCRIPT 0)
   if(CYGWIN)
     set(LLVM_ON_WIN32 0)
     set(LLVM_ON_UNIX 1)
-  else(CYGWIN)
+  else()
     set(LLVM_ON_WIN32 1)
     set(LLVM_ON_UNIX 0)
-  endif(CYGWIN)
+  endif()
 elseif(FUCHSIA OR UNIX)
   set(LLVM_ON_WIN32 0)
   set(LLVM_ON_UNIX 1)

From e37707b1e85cfc07fe75fd6b7e5d41963c52a8ec Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 18:06:03 -0700
Subject: [PATCH 458/851] [RISCV] Use unsigned instead of uint16_t for the
 Opcode argument to getVectorLowDemandedScalarBits. NFC

All the callers pass an unsigned and uint16_t arguments are unusual.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index e5d29e1a8b476..107f645709c70 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4581,7 +4581,7 @@ bool RISCV::hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2) {
 }
 
 std::optional<unsigned>
-RISCV::getVectorLowDemandedScalarBits(uint16_t Opcode, unsigned Log2SEW) {
+RISCV::getVectorLowDemandedScalarBits(unsigned Opcode, unsigned Log2SEW) {
   switch (Opcode) {
   default:
     return std::nullopt;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 8260949cf918a..020be91e90e0b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -364,7 +364,7 @@ bool hasEqualFRM(const MachineInstr &MI1, const MachineInstr &MI2);
 // If \p Opcode is a .vx vector instruction, returns the lower number of bits
 // that are used from the scalar .x operand for a given \p Log2SEW. Otherwise
 // returns null.
-std::optional<unsigned> getVectorLowDemandedScalarBits(uint16_t Opcode,
+std::optional<unsigned> getVectorLowDemandedScalarBits(unsigned Opcode,
                                                        unsigned Log2SEW);
 
 // Returns the MC opcode of RVV pseudo instruction.

From d4c7d0be1f5235555393313bb1f8e46c97f76766 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Fri, 13 Jun 2025 19:12:12 -0700
Subject: [PATCH 459/851] MCObjectStreamer: Replace getAssemblerPtr with
 getAssembler

In general getAssemblerPtr should only be called by MCParse.
Revert some changes from https://reviews.llvm.org/D45164?id=143128
---
 llvm/lib/MC/MCObjectStreamer.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index e3d5a5a9a1327..1bb2143ed6ab2 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -261,7 +261,7 @@ void MCObjectStreamer::emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc,
 
 void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) {
   int64_t IntValue;
-  if (Value->evaluateAsAbsolute(IntValue, getAssemblerPtr())) {
+  if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
     emitULEB128IntValue(IntValue);
     return;
   }
@@ -270,7 +270,7 @@ void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) {
 
 void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) {
   int64_t IntValue;
-  if (Value->evaluateAsAbsolute(IntValue, getAssemblerPtr())) {
+  if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
     emitSLEB128IntValue(IntValue);
     return;
   }
@@ -727,7 +727,7 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
                                 int64_t Expr, SMLoc Loc) {
   int64_t IntNumValues;
   // Do additional checking now if we can resolve the value.
-  if (NumValues.evaluateAsAbsolute(IntNumValues, getAssemblerPtr())) {
+  if (NumValues.evaluateAsAbsolute(IntNumValues, getAssembler())) {
     if (IntNumValues < 0) {
       getContext().getSourceManager()->PrintMessage(
           Loc, SourceMgr::DK_Warning,

From 709ba084c5632b786f2e6c503d3f9f27e1f1c433 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 22:18:12 -0700
Subject: [PATCH 460/851] [RISCV] Use RISCVII::getVecPolicyOpNum instead of
 making assumptions.  NFC (#144175)

---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 107f645709c70..7d868bf6e2ab0 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3696,7 +3696,8 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case CASE_VMA_OPCODE_LMULS(NMSAC, VV): {
     // If the tail policy is undisturbed we can't commute.
     assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags));
-    if ((MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 1) == 0)
+    if ((MI.getOperand(RISCVII::getVecPolicyOpNum(MI.getDesc())).getImm() &
+         1) == 0)
       return false;
 
     // For these instructions we can only swap operand 1 and operand 3 by
@@ -3716,7 +3717,8 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   case CASE_VMA_OPCODE_LMULS(NMSUB, VV): {
     // If the tail policy is undisturbed we can't commute.
     assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags));
-    if ((MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 1) == 0)
+    if ((MI.getOperand(RISCVII::getVecPolicyOpNum(MI.getDesc())).getImm() &
+         1) == 0)
       return false;
 
     // For these instructions we have more freedom. We can commute with the
@@ -4331,7 +4333,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
     // If the tail policy is undisturbed we can't convert.
     assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags) &&
            MI.getNumExplicitOperands() == 6);
-    if ((MI.getOperand(5).getImm() & 1) == 0)
+    if ((MI.getOperand(RISCVII::getVecPolicyOpNum(MI.getDesc())).getImm() &
+         1) == 0)
       return nullptr;
 
     // clang-format off

From ef265ed23038a3719829a08fcbf7384fbdfe0451 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 22:19:04 -0700
Subject: [PATCH 461/851] [RISCV] Simplify macros used by
 RISCVInstrInfo::convertToThreeAddress. NFC (#144173)

Merge some macros that are only used once by another macro.
Rename macros to remove _MF4 where not needed.

I suspect these are artifacts from FP being split from integer in the
past.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 36 ++++++++----------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 7d868bf6e2ab0..949d78b3940e7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -4226,38 +4226,32 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const {
 #define CASE_WIDEOP_OPCODE_COMMON(OP, LMUL)                                    \
   RISCV::PseudoV##OP##_##LMUL##_TIED
 
-#define CASE_WIDEOP_OPCODE_LMULS_MF4(OP)                                       \
-  CASE_WIDEOP_OPCODE_COMMON(OP, MF4):                                          \
+#define CASE_WIDEOP_OPCODE_LMULS(OP)                                           \
+  CASE_WIDEOP_OPCODE_COMMON(OP, MF8):                                          \
+  case CASE_WIDEOP_OPCODE_COMMON(OP, MF4):                                     \
   case CASE_WIDEOP_OPCODE_COMMON(OP, MF2):                                     \
   case CASE_WIDEOP_OPCODE_COMMON(OP, M1):                                      \
   case CASE_WIDEOP_OPCODE_COMMON(OP, M2):                                      \
   case CASE_WIDEOP_OPCODE_COMMON(OP, M4)
 
-#define CASE_WIDEOP_OPCODE_LMULS(OP)                                           \
-  CASE_WIDEOP_OPCODE_COMMON(OP, MF8):                                          \
-  case CASE_WIDEOP_OPCODE_LMULS_MF4(OP)
-
 #define CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, LMUL)                             \
   case RISCV::PseudoV##OP##_##LMUL##_TIED:                                     \
     NewOpc = RISCV::PseudoV##OP##_##LMUL;                                      \
     break;
 
-#define CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)                                \
+#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP)                                    \
+  CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8)                                    \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4)                                    \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2)                                    \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M1)                                     \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2)                                     \
   CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4)
 
-#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP)                                    \
-  CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8)                                    \
-  CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)
-
 // FP Widening Ops may by SEW aware. Create SEW aware cases for these cases.
 #define CASE_FP_WIDEOP_OPCODE_COMMON(OP, LMUL, SEW)                            \
   RISCV::PseudoV##OP##_##LMUL##_##SEW##_TIED
 
-#define CASE_FP_WIDEOP_OPCODE_LMULS_MF4(OP)                                    \
+#define CASE_FP_WIDEOP_OPCODE_LMULS(OP)                                        \
   CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF4, E16):                                  \
   case CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF2, E16):                             \
   case CASE_FP_WIDEOP_OPCODE_COMMON(OP, MF2, E32):                             \
@@ -4273,7 +4267,7 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const {
     NewOpc = RISCV::PseudoV##OP##_##LMUL##_##SEW;                              \
     break;
 
-#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)                             \
+#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(OP)                                 \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4, E16)                            \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2, E16)                            \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2, E32)                            \
@@ -4283,9 +4277,6 @@ bool RISCVInstrInfo::simplifyInstruction(MachineInstr &MI) const {
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2, E32)                             \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E16)                             \
   CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4, E32)                             \
-
-#define CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(OP)                                 \
-  CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)
 // clang-format on
 
 MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
@@ -4295,8 +4286,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
   switch (MI.getOpcode()) {
   default:
     return nullptr;
-  case CASE_FP_WIDEOP_OPCODE_LMULS_MF4(FWADD_WV):
-  case CASE_FP_WIDEOP_OPCODE_LMULS_MF4(FWSUB_WV): {
+  case CASE_FP_WIDEOP_OPCODE_LMULS(FWADD_WV):
+  case CASE_FP_WIDEOP_OPCODE_LMULS(FWSUB_WV): {
     assert(RISCVII::hasVecPolicyOp(MI.getDesc().TSFlags) &&
            MI.getNumExplicitOperands() == 7 &&
            "Expect 7 explicit operands rd, rs2, rs1, rm, vl, sew, policy");
@@ -4309,8 +4300,8 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
     switch (MI.getOpcode()) {
     default:
       llvm_unreachable("Unexpected opcode");
-    CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4(FWADD_WV)
-    CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4(FWSUB_WV)
+    CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWADD_WV)
+    CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS(FWSUB_WV)
     }
     // clang-format on
 
@@ -4390,15 +4381,12 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
 }
 
 #undef CASE_WIDEOP_OPCODE_COMMON
-#undef CASE_WIDEOP_OPCODE_LMULS_MF4
 #undef CASE_WIDEOP_OPCODE_LMULS
 #undef CASE_WIDEOP_CHANGE_OPCODE_COMMON
-#undef CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4
 #undef CASE_WIDEOP_CHANGE_OPCODE_LMULS
 #undef CASE_FP_WIDEOP_OPCODE_COMMON
-#undef CASE_FP_WIDEOP_OPCODE_LMULS_MF4
+#undef CASE_FP_WIDEOP_OPCODE_LMULS
 #undef CASE_FP_WIDEOP_CHANGE_OPCODE_COMMON
-#undef CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS_MF4
 #undef CASE_FP_WIDEOP_CHANGE_OPCODE_LMULS
 
 void RISCVInstrInfo::mulImm(MachineFunction &MF, MachineBasicBlock &MBB,

From 35e3c50731870cc37a73ef1286a92f49347ccea4 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Fri, 13 Jun 2025 22:19:25 -0700
Subject: [PATCH 462/851] [RISCV] Simplify macros used for commuting vector
 multiply-accumulate instructions. NFC (#144169)

Inline some macros that were only instantiated once.
Remove unused macros.
#undef macros when finished with them
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 65 ++++++++----------------
 1 file changed, 22 insertions(+), 43 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 949d78b3940e7..d9ef911b9a32e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3573,24 +3573,15 @@ std::string RISCVInstrInfo::createMIROperandComment(
 #define CASE_VMA_OPCODE_COMMON(OP, TYPE, LMUL)                                 \
   RISCV::PseudoV##OP##_##TYPE##_##LMUL
 
-#define CASE_VMA_OPCODE_LMULS_M1(OP, TYPE)                                     \
-  CASE_VMA_OPCODE_COMMON(OP, TYPE, M1):                                        \
+#define CASE_VMA_OPCODE_LMULS(OP, TYPE)                                        \
+  CASE_VMA_OPCODE_COMMON(OP, TYPE, MF8):                                       \
+  case CASE_VMA_OPCODE_COMMON(OP, TYPE, MF4):                                  \
+  case CASE_VMA_OPCODE_COMMON(OP, TYPE, MF2):                                  \
+  case CASE_VMA_OPCODE_COMMON(OP, TYPE, M1):                                   \
   case CASE_VMA_OPCODE_COMMON(OP, TYPE, M2):                                   \
   case CASE_VMA_OPCODE_COMMON(OP, TYPE, M4):                                   \
   case CASE_VMA_OPCODE_COMMON(OP, TYPE, M8)
 
-#define CASE_VMA_OPCODE_LMULS_MF2(OP, TYPE)                                    \
-  CASE_VMA_OPCODE_COMMON(OP, TYPE, MF2):                                       \
-  case CASE_VMA_OPCODE_LMULS_M1(OP, TYPE)
-
-#define CASE_VMA_OPCODE_LMULS_MF4(OP, TYPE)                                    \
-  CASE_VMA_OPCODE_COMMON(OP, TYPE, MF4):                                       \
-  case CASE_VMA_OPCODE_LMULS_MF2(OP, TYPE)
-
-#define CASE_VMA_OPCODE_LMULS(OP, TYPE)                                        \
-  CASE_VMA_OPCODE_COMMON(OP, TYPE, MF8):                                       \
-  case CASE_VMA_OPCODE_LMULS_MF4(OP, TYPE)
-
 // VFMA instructions are SEW specific.
 #define CASE_VFMA_OPCODE_COMMON(OP, TYPE, LMUL, SEW)                           \
   RISCV::PseudoV##OP##_##TYPE##_##LMUL##_##SEW
@@ -3790,29 +3781,15 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
     Opc = RISCV::PseudoV##NEWOP##_##TYPE##_##LMUL;                             \
     break;
 
-#define CASE_VMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE)                    \
+#define CASE_VMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE)                       \
+  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8)                       \
+  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4)                       \
+  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2)                       \
   CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M1)                        \
   CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M2)                        \
   CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M4)                        \
   CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M8)
 
-#define CASE_VMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE)                   \
-  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE)
-
-#define CASE_VMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE)                   \
-  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE)
-
-#define CASE_VMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE)                       \
-  CASE_VMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE)
-
-#define CASE_VMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP)                            \
-  CASE_VMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VFPR16)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VFPR32)                       \
-  CASE_VMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VFPR64)
-
 // VFMA depends on SEW.
 #define CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, LMUL, SEW)          \
   case RISCV::PseudoV##OLDOP##_##TYPE##_##LMUL##_##SEW:                        \
@@ -3829,18 +3806,14 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
   CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2, SEW)                 \
   CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE, SEW)
 
-#define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP)                               \
-  CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16)                     \
-  CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32)                     \
-  CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64)
-
 #define CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE, SEW)             \
   CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4, SEW)                 \
   CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE, SEW)
 
-#define CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE, SEW)                 \
-  CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8, SEW)                 \
-  CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE, SEW)
+#define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP)                               \
+  CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16)                     \
+  CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32)                     \
+  CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64)
 
 #define CASE_VFMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP)                           \
   CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VFPR16, E16)                 \
@@ -3963,6 +3936,15 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
   return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
 }
 
+#undef CASE_VMA_CHANGE_OPCODE_COMMON
+#undef CASE_VMA_CHANGE_OPCODE_LMULS
+#undef CASE_VFMA_CHANGE_OPCODE_COMMON
+#undef CASE_VFMA_CHANGE_OPCODE_LMULS_M1
+#undef CASE_VFMA_CHANGE_OPCODE_LMULS_MF2
+#undef CASE_VFMA_CHANGE_OPCODE_LMULS_MF4
+#undef CASE_VFMA_CHANGE_OPCODE_VV
+#undef CASE_VFMA_CHANGE_OPCODE_SPLATS
+
 #undef CASE_RVV_OPCODE_UNMASK_LMUL
 #undef CASE_RVV_OPCODE_MASK_LMUL
 #undef CASE_RVV_OPCODE_LMUL
@@ -3974,9 +3956,6 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
 #undef CASE_RVV_OPCODE
 
 #undef CASE_VMA_OPCODE_COMMON
-#undef CASE_VMA_OPCODE_LMULS_M1
-#undef CASE_VMA_OPCODE_LMULS_MF2
-#undef CASE_VMA_OPCODE_LMULS_MF4
 #undef CASE_VMA_OPCODE_LMULS
 #undef CASE_VFMA_OPCODE_COMMON
 #undef CASE_VFMA_OPCODE_LMULS_M1

From 0bd614a8ee11cfc5cee8719b3209f40b163d5a62 Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Sat, 14 Jun 2025 14:36:14 +0900
Subject: [PATCH 463/851] [Cygwin] Don't use version script for Cygwin target
 (#143133)

Cygwin is a COFF platform and does not support version-script.
I guess I should use LLVM_HAVE_LINK_VERSION_SCRIPT here, but I don't
know why this is not currently the case.
---
 llvm/tools/llvm-shlib/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-shlib/CMakeLists.txt b/llvm/tools/llvm-shlib/CMakeLists.txt
index 089255f361045..9a2015f61f2bf 100644
--- a/llvm/tools/llvm-shlib/CMakeLists.txt
+++ b/llvm/tools/llvm-shlib/CMakeLists.txt
@@ -66,7 +66,7 @@ if(LLVM_BUILD_LLVM_DYLIB)
     else()
       # GNU ld doesn't resolve symbols in the version script.
       set(LIB_NAMES -Wl,--whole-archive ${LIB_NAMES} -Wl,--no-whole-archive)
-      if (NOT LLVM_LINKER_IS_SOLARISLD AND NOT MINGW)
+      if (NOT LLVM_LINKER_IS_SOLARISLD AND NOT MINGW AND NOT CYGWIN)
         # Solaris ld does not accept global: *; so there is no way to version *all* global symbols
         set(LIB_NAMES -Wl,--version-script,${LLVM_LIBRARY_DIR}/tools/llvm-shlib/simple_version_script.map ${LIB_NAMES})
       endif()

From 07fa6d1d90c714fa269529c3e5004a063d814c4a Mon Sep 17 00:00:00 2001
From: Konstantin Bogdanov <thevar1able@users.noreply.github.com>
Date: Sat, 14 Jun 2025 09:32:54 +0300
Subject: [PATCH 464/851] [InstCombine] Avoid folding `select(umin(X, Y), X)`
 with min/max values in false arm (#143020)

Fixes https://github.com/llvm/llvm-project/issues/139050.

This patch adds a check to avoid folding min/max reduction into select, which may block loop vectorization.

The issue is that the following snippet:
```
declare i8 @llvm.umin.i8(i8, i8)

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
; CHECK-LABEL: @masked_min_fold_bug(
; CHECK:       %cond = icmp eq i8 %mask, 0
; CHECK:       %masked_val = select i1 %cond, i8 %val, i8 255
; CHECK:       call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
;
  %cond = icmp eq i8 %mask, 0
  %masked_val = select i1 %cond, i8 %val, i8 255
  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
  ret i8 %res
}
```

is being optimized to the following code, which can not be vectorized
later.
```
declare i8 @llvm.umin.i8(i8, i8) #0

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
  %cond = icmp eq i8 %mask, 0
  %1 = call i8 @llvm.umin.i8(i8 %acc, i8 %val)
  %res = select i1 %cond, i8 %1, i8 %acc
  ret i8 %res
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```

Expected:
```
declare i8 @llvm.umin.i8(i8, i8) #0

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
  %cond = icmp eq i8 %mask, 0
  %masked_val = select i1 %cond, i8 %val, i8 -1
  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
  ret i8 %res
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```

https://godbolt.org/z/cYMheKE5r
---
 .../InstCombine/InstructionCombining.cpp      |  9 ++++
 llvm/test/Transforms/InstCombine/select.ll    | 47 +++++++++++++++++
 .../PhaseOrdering/X86/vector-reductions.ll    | 50 ++++++++++++++-----
 3 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 29582939fa06a..4fe900e9421f8 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1739,6 +1739,15 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
   if (SI->getType()->isIntOrIntVectorTy(1))
     return nullptr;
 
+  // Avoid breaking min/max reduction pattern,
+  // which is necessary for vectorization later.
+  if (isa<MinMaxIntrinsic>(&Op))
+    for (Value *IntrinOp : Op.operands())
+      if (auto *PN = dyn_cast<PHINode>(IntrinOp))
+        for (Value *PhiOp : PN->operands())
+          if (PhiOp == &Op)
+            return nullptr;
+
   // Test if a FCmpInst instruction is used exclusively by a select as
   // part of a minimum or maximum operation. If so, refrain from doing
   // any other folding. This helps out other analyses which understand
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index e16f6ad2cfc9b..ef5874ffd46ad 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -5047,3 +5047,50 @@ define <2 x ptr> @select_freeze_constant_expression_vector_gep(i1 %cond, <2 x pt
   %sel = select i1 %cond, <2 x ptr> %y, <2 x ptr> %freeze
   ret <2 x ptr> %sel
 }
+
+define void @no_fold_masked_min_loop(ptr nocapture readonly %vals, ptr nocapture readonly %masks, ptr nocapture %out, i64 %n) {
+; CHECK-LABEL: @no_fold_masked_min_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXT_INDEX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ACC:%.*]] = phi i8 [ -1, [[ENTRY]] ], [ [[RES:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[VAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[VALS:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[MASK_PTR:%.*]] = getelementptr inbounds i8, ptr [[MASKS:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[VAL_PTR]], align 1
+; CHECK-NEXT:    [[MASK:%.*]] = load i8, ptr [[MASK_PTR]], align 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[MASK]], 0
+; CHECK-NEXT:    [[MASKED_VAL:%.*]] = select i1 [[COND]], i8 [[VAL]], i8 -1
+; CHECK-NEXT:    [[RES]] = call i8 @llvm.umin.i8(i8 [[ACC]], i8 [[MASKED_VAL]])
+; CHECK-NEXT:    [[NEXT_INDEX]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXT_INDEX]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store i8 [[RES]], ptr [[OUT:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [0, %entry], [%next_index, %loop]
+  %acc = phi i8 [255, %entry], [%res, %loop]
+
+  %val_ptr = getelementptr inbounds i8, ptr %vals, i64 %index
+  %mask_ptr = getelementptr inbounds i8, ptr %masks, i64 %index
+
+  %val = load i8, ptr %val_ptr, align 1
+  %mask = load i8, ptr %mask_ptr, align 1
+
+  %cond = icmp eq i8 %mask, 0
+  %masked_val = select i1 %cond, i8 %val, i8 -1
+  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
+
+  %next_index = add i64 %index, 1
+  %done = icmp eq i64 %next_index, %n
+  br i1 %done, label %exit, label %loop
+
+exit:
+  store i8 %res, ptr %out, align 1
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index f8450766037b2..2ec48a8637dae 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -326,26 +326,52 @@ cleanup:
   ret i1 %retval.0
 }
 
-; From https://github.com/llvm/llvm-project/issues/139050.
-; FIXME: This should be vectorized.
 define i8 @masked_min_reduction(ptr %data, ptr %mask) {
 ; CHECK-LABEL: @masked_min_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       loop:
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACC:%.*]] = phi i8 [ -1, [[ENTRY]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[DATA:%.*]] = getelementptr i8, ptr [[DATA1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[DATA]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[DATA]], i64 32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DATA]], i64 64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DATA]], i64 96
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[DATA]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <32 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[MASK:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[M:%.*]] = load i8, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[M]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.umin.i8(i8 [[ACC]], i8 [[VAL]])
-; CHECK-NEXT:    [[TMP21]] = select i1 [[COND]], i8 [[TMP0]], i8 [[ACC]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <32 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD10]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP8]], <32 x i8> [[WIDE_LOAD]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP13:%.*]] = select <32 x i1> [[TMP9]], <32 x i8> [[WIDE_LOAD4]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP14:%.*]] = select <32 x i1> [[TMP10]], <32 x i8> [[WIDE_LOAD5]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <32 x i1> [[TMP11]], <32 x i8> [[WIDE_LOAD6]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP16]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI]], <32 x i8> [[TMP12]])
+; CHECK-NEXT:    [[TMP17]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI1]], <32 x i8> [[TMP13]])
+; CHECK-NEXT:    [[TMP18]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI2]], <32 x i8> [[TMP14]])
+; CHECK-NEXT:    [[TMP19]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI3]], <32 x i8> [[TMP15]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP20]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
-; CHECK:       exit:
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP16]], <32 x i8> [[TMP17]])
+; CHECK-NEXT:    [[RDX_MINMAX11:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[RDX_MINMAX]], <32 x i8> [[TMP18]])
+; CHECK-NEXT:    [[RDX_MINMAX12:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[RDX_MINMAX11]], <32 x i8> [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[RDX_MINMAX12]])
 ; CHECK-NEXT:    ret i8 [[TMP21]]
 ;
 entry:

From 2796c412499a276ad23ae184daac33175c32424f Mon Sep 17 00:00:00 2001
From: Kunqiu Chen <camsyn@foxmail.com>
Date: Sat, 14 Jun 2025 14:59:36 +0800
Subject: [PATCH 465/851] [MSan] Fix minor issues in testcases (#144073)

Previously,
1. ifaddrs.cpp : mistake `size_t (xxx)` as `sizeof (xxx)`, resulting in
inadequate checks.
2. qsort.cpp : mistake `kSize2` as `kSize1`, resulting in an unexpected
buffer overlow issue.
---
 compiler-rt/test/msan/ifaddrs.cpp | 10 +++++-----
 compiler-rt/test/msan/qsort.cpp   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/compiler-rt/test/msan/ifaddrs.cpp b/compiler-rt/test/msan/ifaddrs.cpp
index 91730a01f2d8a..e06775db3251a 100644
--- a/compiler-rt/test/msan/ifaddrs.cpp
+++ b/compiler-rt/test/msan/ifaddrs.cpp
@@ -16,10 +16,10 @@
 
 #include <sanitizer/msan_interface.h>
 
-#define CHECK_AND_PUSH(addr, size)                                \
-  if (addr) {                                                     \
-    assert(-1 == __msan_test_shadow(addr, sizeof(size)));         \
-    ranges.push_back(std::make_pair((void *)addr, (size_t)size)); \
+#define CHECK_AND_PUSH(addr, size)                                             \
+  if (addr) {                                                                  \
+    assert(-1 == __msan_test_shadow(addr, (size_t)(size)));                    \
+    ranges.push_back(std::make_pair((void *)addr, (size_t)size));              \
   }
 
 int main(int argc, char *argv[]) {
@@ -35,7 +35,7 @@ int main(int argc, char *argv[]) {
   assert(res == 0);
   assert(-1 == __msan_test_shadow(&ifas, sizeof(ifaddrs *)));
 
-  std::vector<std::pair<void *, size_t> > ranges;
+  std::vector<std::pair<void *, size_t>> ranges;
   ifaddrs *p = ifas;
   while (p) {
     CHECK_AND_PUSH(p, sizeof(ifaddrs));
diff --git a/compiler-rt/test/msan/qsort.cpp b/compiler-rt/test/msan/qsort.cpp
index af287ed64357e..93e6845e1ea7a 100644
--- a/compiler-rt/test/msan/qsort.cpp
+++ b/compiler-rt/test/msan/qsort.cpp
@@ -52,7 +52,7 @@ int compar1(const void *a, const void *b) {
   // kind of random
   for (int i = 0; i < kSize2; ++i)
     p[i] = i * 2 + (i % 3 - 1) * 3;
-  qsort(p, kSize1, sizeof(long), compar2);
+  qsort(p, kSize2, sizeof(long), compar2);
   __msan_check_mem_is_initialized(p, sizeof(long) * kSize2);
   delete[] p;
 

From 2e7fbb94bc268d37996408a525781961989d8627 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Sat, 14 Jun 2025 00:21:39 -0700
Subject: [PATCH 466/851] [clang-format] Fix a bug in annotating braces
 (#144095)

Stop looking for function decls after hitting a BK_BracedInit brace.

Fixes #144057.
---
 clang/lib/Format/TokenAnnotator.cpp           | 2 +-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index aed1672afac66..d2f8b2703a9a3 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -3978,7 +3978,7 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const {
   for (auto *Tok = FirstNonComment && FirstNonComment->isNot(tok::kw_using)
                        ? FirstNonComment->Next
                        : nullptr;
-       Tok; Tok = Tok->Next) {
+       Tok && Tok->isNot(BK_BracedInit); Tok = Tok->Next) {
     if (Tok->is(TT_StartOfName))
       SeenName = true;
     if (Tok->Previous->EndsCppAttributeGroup)
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 873c6c492d18c..a1285e4bc9bf8 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3754,6 +3754,13 @@ TEST_F(TokenAnnotatorTest, BraceKind) {
   ASSERT_EQ(Tokens.size(), 9u) << Tokens;
   EXPECT_BRACE_KIND(Tokens[4], BK_BracedInit);
   EXPECT_BRACE_KIND(Tokens[6], BK_BracedInit);
+
+  Tokens = annotate("auto f1{&T::operator()};");
+  ASSERT_EQ(Tokens.size(), 12u) << Tokens;
+  EXPECT_BRACE_KIND(Tokens[2], BK_BracedInit);
+  // Not TT_FunctionDeclarationName.
+  EXPECT_TOKEN(Tokens[6], tok::kw_operator, TT_Unknown);
+  EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) {

From f46c44dbc0d225277178cf5b6646a96f591fdeaa Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002@gmail.com>
Date: Sat, 14 Jun 2025 10:55:42 +0300
Subject: [PATCH 467/851] [clang-tidy][NFC] change patterns 'anyOf(...,
 anything())' to 'optionally(...)' (#143558)

Writing `optionally()` instead of `anyOf(..., anything())` lowers code
size and gives the author's intention better.
---
 .../bugprone/NotNullTerminatedResultCheck.cpp | 21 +++++++++----------
 .../hicpp/ExceptionBaseclassCheck.cpp         | 12 +++++------
 .../clang-tidy/misc/StaticAssertCheck.cpp     | 13 +++++-------
 .../modernize/UseBoolLiteralsCheck.cpp        | 13 ++++++------
 .../ImplicitBoolConversionCheck.cpp           |  4 ++--
 5 files changed, 28 insertions(+), 35 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
index bedecb60569e8..203170d55f694 100644
--- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.cpp
@@ -702,17 +702,16 @@ void NotNullTerminatedResultCheck::registerMatchers(MatchFinder *Finder) {
     return hasArgument(
         CC.LengthPos,
         allOf(
-            anyOf(
-                ignoringImpCasts(integerLiteral().bind(WrongLengthExprName)),
-                allOf(unless(hasDefinition(SizeOfCharExpr)),
-                      allOf(CC.WithIncrease
-                                ? ignoringImpCasts(hasDefinition(HasIncOp))
-                                : ignoringImpCasts(allOf(
-                                      unless(hasDefinition(HasIncOp)),
-                                      anyOf(hasDefinition(binaryOperator().bind(
-                                                UnknownLengthName)),
-                                            hasDefinition(anything())))),
-                            AnyOfWrongLengthInit))),
+            anyOf(ignoringImpCasts(integerLiteral().bind(WrongLengthExprName)),
+                  allOf(unless(hasDefinition(SizeOfCharExpr)),
+                        allOf(CC.WithIncrease
+                                  ? ignoringImpCasts(hasDefinition(HasIncOp))
+                                  : ignoringImpCasts(
+                                        allOf(unless(hasDefinition(HasIncOp)),
+                                              hasDefinition(optionally(
+                                                  binaryOperator().bind(
+                                                      UnknownLengthName))))),
+                              AnyOfWrongLengthInit))),
             expr().bind(LengthExprName)));
   };
 
diff --git a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp
index 2b2acfdf5b08e..ed39568ea554a 100644
--- a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp
+++ b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.cpp
@@ -24,14 +24,12 @@ void ExceptionBaseclassCheck::registerMatchers(MatchFinder *Finder) {
                   isSameOrDerivedFrom(hasName("::std::exception")))))))))),
           // This condition is always true, but will bind to the
           // template value if the thrown type is templated.
-          anyOf(has(expr(
-                    hasType(substTemplateTypeParmType().bind("templ_type")))),
-                anything()),
+          optionally(has(
+              expr(hasType(substTemplateTypeParmType().bind("templ_type"))))),
           // Bind to the declaration of the type of the value that
-          // is thrown. 'anything()' is necessary to always succeed
-          // in the 'eachOf' because builtin types are not
-          // 'namedDecl'.
-          eachOf(has(expr(hasType(namedDecl().bind("decl")))), anything()))
+          // is thrown. 'optionally' is necessary because builtin types
+          // are not 'namedDecl'.
+          optionally(has(expr(hasType(namedDecl().bind("decl"))))))
           .bind("bad_throw"),
       this);
 }
diff --git a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp
index faff1c17fc61e..37fbd8c0d725f 100644
--- a/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/StaticAssertCheck.cpp
@@ -38,8 +38,7 @@ void StaticAssertCheck::registerMatchers(MatchFinder *Finder) {
       binaryOperator(
           hasAnyOperatorName("&&", "=="),
           hasEitherOperand(ignoringImpCasts(stringLiteral().bind("assertMSG"))),
-          anyOf(binaryOperator(hasEitherOperand(IsAlwaysFalseWithCast)),
-                anything()))
+          optionally(binaryOperator(hasEitherOperand(IsAlwaysFalseWithCast))))
           .bind("assertExprRoot"),
       IsAlwaysFalse);
   auto NonConstexprFunctionCall =
@@ -52,12 +51,10 @@ void StaticAssertCheck::registerMatchers(MatchFinder *Finder) {
   auto NonConstexprCode =
       expr(anyOf(NonConstexprFunctionCall, NonConstexprVariableReference));
   auto AssertCondition =
-      expr(
-          anyOf(expr(ignoringParenCasts(anyOf(
-                    AssertExprRoot, unaryOperator(hasUnaryOperand(
-                                        ignoringParenCasts(AssertExprRoot)))))),
-                anything()),
-          unless(NonConstexprCode), unless(hasDescendant(NonConstexprCode)))
+      expr(optionally(expr(ignoringParenCasts(anyOf(
+               AssertExprRoot, unaryOperator(hasUnaryOperand(
+                                   ignoringParenCasts(AssertExprRoot))))))),
+           unless(NonConstexprCode), unless(hasDescendant(NonConstexprCode)))
           .bind("condition");
   auto Condition =
       anyOf(ignoringParenImpCasts(callExpr(
diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp
index c8e6bf47bb82f..339462093a6d6 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.cpp
@@ -26,13 +26,12 @@ void UseBoolLiteralsCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
 
 void UseBoolLiteralsCheck::registerMatchers(MatchFinder *Finder) {
   Finder->addMatcher(
-      traverse(
-          TK_AsIs,
-          implicitCastExpr(
-              has(ignoringParenImpCasts(integerLiteral().bind("literal"))),
-              hasImplicitDestinationType(qualType(booleanType())),
-              unless(isInTemplateInstantiation()),
-              anyOf(hasParent(explicitCastExpr().bind("cast")), anything()))),
+      traverse(TK_AsIs,
+               implicitCastExpr(
+                   has(ignoringParenImpCasts(integerLiteral().bind("literal"))),
+                   hasImplicitDestinationType(qualType(booleanType())),
+                   unless(isInTemplateInstantiation()),
+                   optionally(hasParent(explicitCastExpr().bind("cast"))))),
       this);
 
   Finder->addMatcher(
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index f9fd1d903e231..20c73299915a9 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -348,8 +348,8 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
               implicitCastExpr().bind("implicitCastFromBool"),
               unless(hasParent(BitfieldConstruct)),
               // Check also for nested casts, for example: bool -> int -> float.
-              anyOf(hasParent(implicitCastExpr().bind("furtherImplicitCast")),
-                    anything()),
+              optionally(
+                  hasParent(implicitCastExpr().bind("furtherImplicitCast"))),
               unless(isInTemplateInstantiation()),
               unless(IsInCompilerGeneratedFunction))),
       this);

From 892513e51864f3e21120eab87c0c5a6aa37cae31 Mon Sep 17 00:00:00 2001
From: Zhikai Zeng <backlight.zzk@gmail.com>
Date: Sat, 14 Jun 2025 17:14:16 +0800
Subject: [PATCH 468/851] [clang] fix infinite recursion (#143244)

fix https://github.com/llvm/llvm-project/issues/141789

The direct cause of infinite recursion is that `T` is changing from
`struct X` and `S<X>` infinitely, this pr add a check that if `T`
visited before then return false directly.

```plaintext
/home/backlight/llvm-project/clang/lib/Sema/SemaDeclCXX.cpp:7196] FD->getType().getAsString()=struct X, T.getAsString()=S<X>, FD->getType().getCanonicalType().getUnqualifiedType().getAsString()=struct X, CanUnqualT.getAsString()=struct S<struct X>,
/home/backlight/llvm-project/clang/lib/Sema/SemaDeclCXX.cpp:7196] FD->getType().getAsString()=S<X>, T.getAsString()=struct X, FD->getType().getCanonicalType().getUnqualifiedType().getAsString()=struct S<struct X>, CanUnqualT.getAsString()=struct X,
```

https://github.com/llvm/llvm-project/pull/104829 fix similar infinite
recursion, but I think it is no longer needed so I kind of revert it.
---
 clang/docs/ReleaseNotes.rst     |  1 +
 clang/lib/Sema/SemaDeclCXX.cpp  | 14 ++++++--------
 clang/test/SemaCXX/gh102293.cpp | 17 +++++++++++++++++
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9ab8031b9ea8c..33ee8a53b5f37 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -703,6 +703,7 @@ Bug Fixes in This Version
   the second clause of a C-style ``for`` loop. (#GH139818)
 - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168)
 - Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216)
+- Fixed an infinite recursion when checking constexpr destructors. (#GH141789)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 31e2834336742..6f62c53aaf04d 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -7159,7 +7159,10 @@ void Sema::CheckCompletedCXXClass(Scope *S, CXXRecordDecl *Record) {
     // "effectively constexpr" for better compatibility.
     // See https://github.com/llvm/llvm-project/issues/102293 for more info.
     if (isa<CXXDestructorDecl>(M)) {
-      auto Check = [](QualType T, auto &&Check) -> bool {
+      llvm::SmallDenseSet<QualType> Visited;
+      auto Check = [&Visited](QualType T, auto &&Check) -> bool {
+        if (!Visited.insert(T->getCanonicalTypeUnqualified()).second)
+          return false;
         const CXXRecordDecl *RD =
             T->getBaseElementTypeUnsafe()->getAsCXXRecordDecl();
         if (!RD || !RD->isCompleteDefinition())
@@ -7168,16 +7171,11 @@ void Sema::CheckCompletedCXXClass(Scope *S, CXXRecordDecl *Record) {
         if (!RD->hasConstexprDestructor())
           return false;
 
-        QualType CanUnqualT = T.getCanonicalType().getUnqualifiedType();
         for (const CXXBaseSpecifier &B : RD->bases())
-          if (B.getType().getCanonicalType().getUnqualifiedType() !=
-                  CanUnqualT &&
-              !Check(B.getType(), Check))
+          if (!Check(B.getType(), Check))
             return false;
         for (const FieldDecl *FD : RD->fields())
-          if (FD->getType().getCanonicalType().getUnqualifiedType() !=
-                  CanUnqualT &&
-              !Check(FD->getType(), Check))
+          if (!Check(FD->getType(), Check))
             return false;
         return true;
       };
diff --git a/clang/test/SemaCXX/gh102293.cpp b/clang/test/SemaCXX/gh102293.cpp
index d4218cc13dcec..fe417e697841b 100644
--- a/clang/test/SemaCXX/gh102293.cpp
+++ b/clang/test/SemaCXX/gh102293.cpp
@@ -45,3 +45,20 @@ class quux : quux { // expected-error {{base class has incomplete type}} \
   virtual int c();
 };
 }
+
+// Ensure we don't get infinite recursion from the check, however. See GH141789
+namespace GH141789 {
+template <typename Ty>
+struct S {
+  Ty t; // expected-error {{field has incomplete type 'GH141789::X'}}
+};
+
+struct T {
+  ~T();
+};
+
+struct X { // expected-note {{definition of 'GH141789::X' is not complete until the closing '}'}}
+  S<X> next; // expected-note {{in instantiation of template class 'GH141789::S<GH141789::X>' requested here}}
+  T m;
+};
+}

From 732ebf803b80a8a3fc3aaaceb600cebdf659118e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 14 Jun 2025 10:44:20 +0100
Subject: [PATCH 469/851] [VPlan] Address post-commit comments for
 f68848015f62.

Assign sentinel value to named variable to clarify naming and update
comments.

Addresses post-commit comments from
https://github.com/llvm/llvm-project/pull/142291.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 20 +++++++++----------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 10 +++++-----
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 93f53996425d3..7c006ae326ecb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7263,14 +7263,13 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
   } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
                  RdxDesc.getRecurrenceKind())) {
     Value *StartV = getStartValueFromReductionResult(EpiRedResult);
+    Value *SentinelV = EpiRedResult->getOperand(2)->getLiveInIRValue();
     using namespace llvm::PatternMatch;
     Value *Cmp, *OrigResumeV, *CmpOp;
     bool IsExpectedPattern =
         match(MainResumeValue,
-              m_Select(
-                  m_OneUse(m_Value(Cmp)),
-                  m_Specific(EpiRedResult->getOperand(2)->getLiveInIRValue()),
-                  m_Value(OrigResumeV))) &&
+              m_Select(m_OneUse(m_Value(Cmp)), m_Specific(SentinelV),
+                       m_Value(OrigResumeV))) &&
         (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
                                    m_Value(CmpOp))) &&
          ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp))));
@@ -9224,11 +9223,10 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
             RdxDesc.getRecurrenceKind())) {
       VPValue *Start = PhiR->getStartValue();
-      FinalReductionResult = Builder.createNaryOp(
-          VPInstruction::ComputeFindLastIVResult,
-          {PhiR, Start, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()),
-           NewExitingVPV},
-          ExitDL);
+      VPValue *Sentinel = Plan->getOrAddLiveIn(RdxDesc.getSentinelValue());
+      FinalReductionResult =
+          Builder.createNaryOp(VPInstruction::ComputeFindLastIVResult,
+                               {PhiR, Start, Sentinel, NewExitingVPV}, ExitDL);
     } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                    RdxDesc.getRecurrenceKind())) {
       VPValue *Start = PhiR->getStartValue();
@@ -9816,8 +9814,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
         BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
         IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
         Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]);
-        ResumeV = Builder.CreateSelect(
-            Cmp, RdxResult->getOperand(2)->getLiveInIRValue(), ResumeV);
+        Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue();
+        ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV);
       } else {
         VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
         auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index d59cec892d405..c64bda167b854 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -643,8 +643,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
     assert(!PhiR->isInLoop() &&
            "In-loop FindLastIV reduction is not supported yet");
 
-    // The recipe's operands are the reduction phi, followed by one operand for
-    // each part of the reduction.
+    // The recipe's operands are the reduction phi, the start value, the
+    // sentinel value, followed by one operand for each part of the reduction.
     unsigned UF = getNumOperands() - 3;
     Value *ReducedPartRdx = State.get(getOperand(3));
     for (unsigned Part = 1; Part < UF; ++Part) {
@@ -652,9 +652,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                       State.get(getOperand(3 + Part)));
     }
 
-    return createFindLastIVReduction(Builder, ReducedPartRdx,
-                                     State.get(getOperand(1), true),
-                                     getOperand(2)->getLiveInIRValue());
+    Value *Start = State.get(getOperand(1), true);
+    Value *Sentinel = getOperand(2)->getLiveInIRValue();
+    return createFindLastIVReduction(Builder, ReducedPartRdx, Start, Sentinel);
   }
   case VPInstruction::ComputeReductionResult: {
     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary

From 1bc0b08e19788f2b34f46b183e89f5049468da2a Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi <geek4civic@gmail.com>
Date: Sat, 14 Jun 2025 19:02:42 +0900
Subject: [PATCH 470/851] CMake: Fix LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING to
 be 1 or 0.

It has been introduced in #107278 but it was passing
"DISABLED" of LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING to cmakedefine01.

cmakadefine01 treats non-false-like strings as 1.
"DISABLED" is replaced with 1.
---
 llvm/cmake/modules/HandleLLVMOptions.cmake | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index e2f9826d39818..743eb6f5529f2 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -204,10 +204,13 @@ elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE_AND_
   set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
   set( LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "DISABLED" OR NOT DEFINED LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING )
-  # The DISABLED setting is default and requires no additional defines.
+  # The DISABLED setting is default.
+  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0 )
 else()
   message(FATAL_ERROR "Unknown value for LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING: \"${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}\"!")
 endif()
+# LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING (non-cached) is expected to be
+# 1 or 0 here, assuming referenced in #cmakedefine01.
 
 if(LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS)
   add_compile_definitions(EXPERIMENTAL_KEY_INSTRUCTIONS)

From 64640667871990e4d73ae6221b9c4f05d0b36ea6 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Sat, 14 Jun 2025 13:26:03 +0200
Subject: [PATCH 471/851] [CIR] Upstream CreateOp for ComplexType with folder
 (#143192)

This change adds support for the create op for ComplexType with folder
and support for empty init list

https://github.com/llvm/llvm-project/issues/141365
---
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td  |   4 +-
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |  32 ++++
 .../include/clang/CIR/Dialect/IR/CIRTypes.td  |   3 +-
 clang/include/clang/CIR/MissingFeatures.h     |   1 -
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |   6 +
 clang/lib/CIR/CodeGen/CIRGenDecl.cpp          |  12 +-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          |  11 ++
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp   |  79 ++++++++++
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |   9 ++
 clang/lib/CIR/CodeGen/CMakeLists.txt          |   1 +
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  27 ++++
 .../Dialect/Transforms/CIRCanonicalize.cpp    |   5 +-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  48 +++++-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   |  10 ++
 clang/test/CIR/CodeGen/complex.cpp            | 149 ++++++++++++++++++
 .../CIR/Transforms/complex-create-fold.cir    |  30 ++++
 16 files changed, 415 insertions(+), 12 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
 create mode 100644 clang/test/CIR/Transforms/complex-create-fold.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index d22d265e82425..b48f4ed461ccb 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -307,9 +307,9 @@ def ConstComplexAttr : CIR_Attr<"ConstComplex", "const_complex",
   );
 
   let builders = [
-    AttrBuilderWithInferredContext<(ins "cir::ComplexType":$type,
-                                        "mlir::TypedAttr":$real,
+    AttrBuilderWithInferredContext<(ins "mlir::TypedAttr":$real,
                                         "mlir::TypedAttr":$imag), [{
+      auto type = cir::ComplexType::get(real.getType());
       return $_get(type.getContext(), type, real, imag);
     }]>,
   ];
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 194153caa9271..bd36d228578b7 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2353,4 +2353,36 @@ def BaseClassAddrOp : CIR_Op<"base_class_addr"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// ComplexCreateOp
+//===----------------------------------------------------------------------===//
+
+def ComplexCreateOp : CIR_Op<"complex.create", [Pure, SameTypeOperands]> {
+  let summary = "Create a complex value from its real and imaginary parts";
+  let description = [{
+    The `cir.complex.create` operation takes two operands that represent the
+    real and imaginary part of a complex number, and yields the complex number.
+
+    ```mlir
+    %0 = cir.const #cir.fp<1.000000e+00> : !cir.double
+    %1 = cir.const #cir.fp<2.000000e+00> : !cir.double
+    %2 = cir.complex.create %0, %1 : !cir.double -> !cir.complex<!cir.double>
+    ```
+  }];
+
+  let results = (outs CIR_ComplexType:$result);
+  let arguments = (ins
+    CIR_AnyIntOrFloatType:$real,
+    CIR_AnyIntOrFloatType:$imag
+  );
+
+  let assemblyFormat = [{
+    $real `,` $imag
+    `:` qualified(type($real)) `->` qualified(type($result)) attr-dict
+  }];
+
+  let hasVerifier = 1;
+  let hasFolder = 1;
+}
+
 #endif // CLANG_CIR_DIALECT_IR_CIROPS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index fb96976075130..41d7d725a09e0 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -600,7 +600,8 @@ def CIRRecordType : Type<
 
 def CIR_AnyType : AnyTypeOf<[
   CIR_VoidType, CIR_BoolType, CIR_ArrayType, CIR_VectorType, CIR_IntType,
-  CIR_AnyFloatType, CIR_PointerType, CIR_FuncType, CIR_RecordType
+  CIR_AnyFloatType, CIR_PointerType, CIR_FuncType, CIR_RecordType,
+  CIR_ComplexType
 ]>;
 
 #endif // MLIR_CIR_DIALECT_CIR_TYPES
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 225e9ec89a827..13ddc77835fbc 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -251,7 +251,6 @@ struct MissingFeatures {
   // Future CIR operations
   static bool awaitOp() { return false; }
   static bool callOp() { return false; }
-  static bool complexCreateOp() { return false; }
   static bool complexImagOp() { return false; }
   static bool complexRealOp() { return false; }
   static bool ifOp() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index adf7cb77f1a5d..e38faba83b80c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -360,6 +360,12 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return CIRBaseBuilderTy::createStore(loc, val, dst.getPointer(), align);
   }
 
+  mlir::Value createComplexCreate(mlir::Location loc, mlir::Value real,
+                                  mlir::Value imag) {
+    auto resultComplexTy = cir::ComplexType::get(real.getType());
+    return create<cir::ComplexCreateOp>(loc, resultComplexTy, real, imag);
+  }
+
   /// Create a cir.ptr_stride operation to get access to an array element.
   /// \p idx is the index of the element to access, \p shouldDecay is true if
   /// the result should decay to a pointer to the element type.
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 1941b5066edb4..afbe92aded804 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -499,7 +499,13 @@ void CIRGenFunction::emitExprAsInit(const Expr *init, const ValueDecl *d,
     emitScalarInit(init, getLoc(d->getSourceRange()), lvalue);
     return;
   case cir::TEK_Complex: {
-    cgm.errorNYI(init->getSourceRange(), "emitExprAsInit: complex type");
+    mlir::Value complex = emitComplexExpr(init);
+    if (capturedByInit)
+      cgm.errorNYI(init->getSourceRange(),
+                   "emitExprAsInit: complex type captured by init");
+    mlir::Location loc = getLoc(init->getExprLoc());
+    emitStoreOfComplex(loc, complex, lvalue,
+                       /*isInit*/ true);
     return;
   }
   case cir::TEK_Aggregate:
@@ -593,8 +599,8 @@ void CIRGenFunction::emitDecl(const Decl &d) {
     // None of these decls require codegen support.
     return;
 
-  case Decl::Enum:   // enum X;
-  case Decl::Record: // struct/union/class X;
+  case Decl::Enum:      // enum X;
+  case Decl::Record:    // struct/union/class X;
   case Decl::CXXRecord: // struct/union/class X; [C++]
   case Decl::NamespaceAlias:
   case Decl::Using:          // using X; [C++]
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 42d0c78013f57..2e43f10be132c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1690,3 +1690,14 @@ mlir::Value CIRGenFunction::emitScalarConstant(
   }
   return builder.getConstant(getLoc(e->getSourceRange()), constant.getValue());
 }
+
+/// An LValue is a candidate for having its loads and stores be made atomic if
+/// we are operating under /volatile:ms *and* the LValue itself is volatile and
+/// performing such an operation can be performed without a libcall.
+bool CIRGenFunction::isLValueSuitableForInlineAtomic(LValue lv) {
+  if (!cgm.getLangOpts().MSVolatile)
+    return false;
+
+  cgm.errorNYI("LValueSuitableForInlineAtomic LangOpts MSVolatile");
+  return false;
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
new file mode 100644
index 0000000000000..2ffe75a388e98
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -0,0 +1,79 @@
+#include "CIRGenBuilder.h"
+#include "CIRGenFunction.h"
+
+#include "clang/AST/StmtVisitor.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+class ComplexExprEmitter : public StmtVisitor<ComplexExprEmitter, mlir::Value> {
+  CIRGenFunction &cgf;
+  CIRGenBuilderTy &builder;
+
+public:
+  explicit ComplexExprEmitter(CIRGenFunction &cgf)
+      : cgf(cgf), builder(cgf.getBuilder()) {}
+
+  /// Store the specified real/imag parts into the
+  /// specified value pointer.
+  void emitStoreOfComplex(mlir::Location loc, mlir::Value val, LValue lv,
+                          bool isInit);
+
+  mlir::Value VisitInitListExpr(InitListExpr *e);
+};
+
+} // namespace
+
+static const ComplexType *getComplexType(QualType type) {
+  type = type.getCanonicalType();
+  if (const ComplexType *comp = dyn_cast<ComplexType>(type))
+    return comp;
+  return cast<ComplexType>(cast<AtomicType>(type)->getValueType());
+}
+
+void ComplexExprEmitter::emitStoreOfComplex(mlir::Location loc, mlir::Value val,
+                                            LValue lv, bool isInit) {
+  if (lv.getType()->isAtomicType() ||
+      (!isInit && cgf.isLValueSuitableForInlineAtomic(lv))) {
+    cgf.cgm.errorNYI("StoreOfComplex with Atomic LV");
+    return;
+  }
+
+  const Address destAddr = lv.getAddress();
+  builder.createStore(loc, val, destAddr);
+}
+
+mlir::Value ComplexExprEmitter::VisitInitListExpr(InitListExpr *e) {
+  mlir::Location loc = cgf.getLoc(e->getExprLoc());
+  if (e->getNumInits() == 2) {
+    mlir::Value real = cgf.emitScalarExpr(e->getInit(0));
+    mlir::Value imag = cgf.emitScalarExpr(e->getInit(1));
+    return builder.createComplexCreate(loc, real, imag);
+  }
+
+  if (e->getNumInits() == 1) {
+    cgf.cgm.errorNYI("Create Complex with InitList with size 1");
+    return {};
+  }
+
+  assert(e->getNumInits() == 0 && "Unexpected number of inits");
+  QualType complexElemTy =
+      e->getType()->castAs<clang::ComplexType>()->getElementType();
+  mlir::Type complexElemLLVMTy = cgf.convertType(complexElemTy);
+  mlir::TypedAttr defaultValue = builder.getZeroInitAttr(complexElemLLVMTy);
+  auto complexAttr = cir::ConstComplexAttr::get(defaultValue, defaultValue);
+  return builder.create<cir::ConstantOp>(loc, complexAttr);
+}
+
+mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) {
+  assert(e && getComplexType(e->getType()) &&
+         "Invalid complex expression to emit");
+
+  return ComplexExprEmitter(*this).Visit(const_cast<Expr *>(e));
+}
+
+void CIRGenFunction::emitStoreOfComplex(mlir::Location loc, mlir::Value v,
+                                        LValue dest, bool isInit) {
+  ComplexExprEmitter(*this).emitStoreOfComplex(loc, v, dest, isInit);
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 318d3fbf3f9e1..de6ef2a69faf1 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -338,6 +338,8 @@ class CIRGenFunction : public CIRGenTypeCache {
     PrototypeWrapper(const clang::ObjCMethodDecl *md) : p(md) {}
   };
 
+  bool isLValueSuitableForInlineAtomic(LValue lv);
+
   /// An abstract representation of regular/ObjC call/message targets.
   class AbstractCallee {
     /// The function declaration of the callee.
@@ -860,6 +862,10 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   mlir::LogicalResult emitForStmt(const clang::ForStmt &s);
 
+  /// Emit the computation of the specified expression of complex type,
+  /// returning the result.
+  mlir::Value emitComplexExpr(const Expr *e);
+
   void emitCompoundStmt(const clang::CompoundStmt &s);
 
   void emitCompoundStmtWithoutScope(const clang::CompoundStmt &s);
@@ -961,6 +967,9 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   void emitStaticVarDecl(const VarDecl &d, cir::GlobalLinkageKind linkage);
 
+  void emitStoreOfComplex(mlir::Location loc, mlir::Value v, LValue dest,
+                          bool isInit);
+
   void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile,
                          clang::QualType ty, bool isInit = false,
                          bool isNontemporal = false);
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 217609687eabc..385bea066c61c 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -19,6 +19,7 @@ add_clang_library(clangCIR
   CIRGenDeclOpenACC.cpp
   CIRGenExpr.cpp
   CIRGenExprAggregate.cpp
+  CIRGenExprComplex.cpp
   CIRGenExprConstant.cpp
   CIRGenExprScalar.cpp
   CIRGenFunction.cpp
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index a685253b7d821..5578d4f5825a9 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1748,6 +1748,33 @@ OpFoldResult cir::VecTernaryOp::fold(FoldAdaptor adaptor) {
       vecTy, mlir::ArrayAttr::get(getContext(), elements));
 }
 
+//===----------------------------------------------------------------------===//
+// ComplexCreateOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult cir::ComplexCreateOp::verify() {
+  if (getType().getElementType() != getReal().getType()) {
+    emitOpError()
+        << "operand type of cir.complex.create does not match its result type";
+    return failure();
+  }
+
+  return success();
+}
+
+OpFoldResult cir::ComplexCreateOp::fold(FoldAdaptor adaptor) {
+  mlir::Attribute real = adaptor.getReal();
+  mlir::Attribute imag = adaptor.getImag();
+  if (!real || !imag)
+    return {};
+
+  // When both of real and imag are constants, we can fold the operation into an
+  // `#cir.const_complex` operation.
+  auto realAttr = mlir::cast<mlir::TypedAttr>(real);
+  auto imagAttr = mlir::cast<mlir::TypedAttr>(imag);
+  return cir::ConstComplexAttr::get(realAttr, imagAttr);
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd op method definitions
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index 6f8a64ce0251e..20c634d6c66f6 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -134,7 +134,6 @@ void CIRCanonicalizePass::runOnOperation() {
   getOperation()->walk([&](Operation *op) {
     assert(!cir::MissingFeatures::switchOp());
     assert(!cir::MissingFeatures::tryOp());
-    assert(!cir::MissingFeatures::complexCreateOp());
     assert(!cir::MissingFeatures::complexRealOp());
     assert(!cir::MissingFeatures::complexImagOp());
     assert(!cir::MissingFeatures::callOp());
@@ -142,8 +141,8 @@ void CIRCanonicalizePass::runOnOperation() {
     // Many operations are here to perform a manual `fold` in
     // applyOpPatternsGreedily.
     if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SwitchOp, SelectOp, UnaryOp,
-            VecCreateOp, VecExtractOp, VecShuffleOp, VecShuffleDynamicOp,
-            VecTernaryOp>(op))
+            ComplexCreateOp, VecCreateOp, VecExtractOp, VecShuffleOp,
+            VecShuffleDynamicOp, VecTernaryOp>(op))
       ops.push_back(op);
   });
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 619e113202c9a..6a4e4e4a7df3b 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -905,7 +905,32 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
     rewriter.replaceOp(op, lowerCirAttrAsValue(op, op.getValue(), rewriter,
                                                getTypeConverter()));
     return mlir::success();
-  } else {
+  } else if (auto complexTy = mlir::dyn_cast<cir::ComplexType>(op.getType())) {
+    auto complexAttr = mlir::cast<cir::ConstComplexAttr>(op.getValue());
+    mlir::Type complexElemTy = complexTy.getElementType();
+    mlir::Type complexElemLLVMTy = typeConverter->convertType(complexElemTy);
+
+    mlir::Attribute components[2];
+    if (mlir::isa<cir::IntType>(complexElemTy)) {
+      components[0] = rewriter.getIntegerAttr(
+          complexElemLLVMTy,
+          mlir::cast<cir::IntAttr>(complexAttr.getReal()).getValue());
+      components[1] = rewriter.getIntegerAttr(
+          complexElemLLVMTy,
+          mlir::cast<cir::IntAttr>(complexAttr.getImag()).getValue());
+    } else {
+      components[0] = rewriter.getFloatAttr(
+          complexElemLLVMTy,
+          mlir::cast<cir::FPAttr>(complexAttr.getReal()).getValue());
+      components[1] = rewriter.getFloatAttr(
+          complexElemLLVMTy,
+          mlir::cast<cir::FPAttr>(complexAttr.getImag()).getValue());
+    }
+
+    attr = rewriter.getArrayAttr(components);
+  }
+
+  else {
     return op.emitError() << "unsupported constant type " << op.getType();
   }
 
@@ -1810,7 +1835,8 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                CIRToLLVMVecSplatOpLowering,
                CIRToLLVMVecShuffleOpLowering,
                CIRToLLVMVecShuffleDynamicOpLowering,
-               CIRToLLVMVecTernaryOpLowering
+               CIRToLLVMVecTernaryOpLowering,
+               CIRToLLVMComplexCreateOpLowering
       // clang-format on
       >(converter, patterns.getContext());
 
@@ -2096,6 +2122,24 @@ mlir::LogicalResult CIRToLLVMVecTernaryOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMComplexCreateOpLowering::matchAndRewrite(
+    cir::ComplexCreateOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  mlir::Type complexLLVMTy =
+      getTypeConverter()->convertType(op.getResult().getType());
+  auto initialComplex =
+      rewriter.create<mlir::LLVM::UndefOp>(op->getLoc(), complexLLVMTy);
+
+  auto realComplex = rewriter.create<mlir::LLVM::InsertValueOp>(
+      op->getLoc(), initialComplex, adaptor.getReal(), 0);
+
+  auto complex = rewriter.create<mlir::LLVM::InsertValueOp>(
+      op->getLoc(), realComplex, adaptor.getImag(), 1);
+
+  rewriter.replaceOp(op, complex);
+  return mlir::success();
+}
+
 std::unique_ptr<mlir::Pass> createConvertCIRToLLVMPass() {
   return std::make_unique<ConvertCIRToLLVMPass>();
 }
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index 2eda568c84bdb..a809818063547 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -408,6 +408,16 @@ class CIRToLLVMVecTernaryOpLowering
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMComplexCreateOpLowering
+    : public mlir::OpConversionPattern<cir::ComplexCreateOp> {
+public:
+  using mlir::OpConversionPattern<cir::ComplexCreateOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::ComplexCreateOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 } // namespace direct
 } // namespace cir
 
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index 6fa7bca3749cf..d193b9f32efbc 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -27,3 +27,152 @@ float _Complex cf2 = { 1.0f, 2.0f };
 // OGCG: {{.*}} = global { float, float } zeroinitializer, align 4
 // OGCG: {{.*}} = global { i32, i32 } { i32 1, i32 2 }, align 4
 // OGCG: {{.*}} = global { float, float } { float 1.000000e+00, float 2.000000e+00 }, align 4
+
+void foo() { int _Complex c = {}; }
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<0> : !s32i> : !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: store { i32, i32 } zeroinitializer, ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 0, ptr %[[C_IMAG_PTR]], align 4
+
+void foo2() { int _Complex c = {1, 2}; }
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: store { i32, i32 } { i32 1, i32 2 }, ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 1, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 2, ptr %[[C_IMAG_PTR]], align 4
+
+void foo3() {
+  int a;
+  int b;
+  int _Complex c = {a, b};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!s32i>
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!s32i>
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[TMP_A]], %[[TMP_B]] : !s32i -> !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load i32, ptr {{.*}}, align 4
+// LLVM: %[[TMP_B:.*]] = load i32, ptr {{.*}}, align 4
+// LLVM: %[[TMP:.*]] = insertvalue { i32, i32 } undef, i32 %[[TMP_A]], 0
+// LLVM: %[[TMP_2:.*]] = insertvalue { i32, i32 } %[[TMP]], i32 %[[TMP_B]], 1
+// LLVM: store { i32, i32 } %[[TMP_2]], ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[REAL_VAL:.*]] = load i32, ptr {{.*}}, align 4
+// OGCG: %[[IMAG_VAL:.*]] = load i32, ptr {{.*}}, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 %[[REAL_VAL]], ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 %[[IMAG_VAL]], ptr %[[C_IMAG_PTR]], align 4
+
+void foo4() {
+  int a;
+  int _Complex c = {1, a};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!s32i>, !s32i
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[CONST_1]], %[[TMP_A]] : !s32i -> !cir.complex<!s32i>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+
+// LLVM: %[[INIT:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load i32, ptr {{.*}}, align 4
+// LLVM: %[[COMPLEX:.*]] = insertvalue { i32, i32 } { i32 1, i32 undef }, i32 %[[TMP_A]], 1
+// LLVM: store { i32, i32 } %[[COMPLEX]], ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[TMP_A:.*]] = load i32, ptr {{.*}}, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 1, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 %[[TMP_A]], ptr %[[C_IMAG_PTR]], align 4
+
+void foo5() {
+  float _Complex c = {1.0f, 2.0f};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["c", init]
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00> : !cir.float> : !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+
+// LLVM: %[[INIT:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: store { float, float } { float 1.000000e+00, float 2.000000e+00 }, ptr %[[INIT]], align 4
+
+// OGCG: %[[COMPLEX]] = alloca { float, float }, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store float 1.000000e+00, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store float 2.000000e+00, ptr %[[C_IMAG_PTR]], align 4
+
+void foo6() {
+  float a;
+  float b;
+  float _Complex c = {a, b};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["c", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!cir.float>, !cir.float
+// CIR: %[[TMP_B:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!cir.float>, !cir.float
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[TMP_A]], %[[TMP_B]] : !cir.float -> !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load float, ptr {{.*}}, align 4
+// LLVM: %[[TMP_B:.*]] = load float, ptr {{.*}}, align 4
+// LLVM: %[[TMP:.*]] = insertvalue { float, float } undef, float %[[TMP_A]], 0
+// LLVM: %[[TMP_2:.*]] = insertvalue { float, float } %[[TMP]], float %[[TMP_B]], 1
+// LLVM: store { float, float } %[[TMP_2]], ptr %[[COMPLEX]], align 4
+
+// OGCG: %[[COMPLEX]] = alloca { float, float }, align 4
+// OGCG: %[[TMP_A:.*]] = load float, ptr {{.*}}, align 4
+// OGCG: %[[TMP_B:.*]] = load float, ptr {{.*}}, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store float %[[TMP_A]], ptr %[[C_REAL_PTR]], align 4
+// OGCG: store float %[[TMP_B]], ptr %[[C_IMAG_PTR]], align 4
+
+void foo7() {
+  float a;
+  float _Complex c = {a, 2.0f};
+}
+
+// CIR: %[[INIT:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["c", init]
+// CIR: %[[TMP_A:.*]] = cir.load{{.*}} {{.*}} : !cir.ptr<!cir.float>, !cir.float
+// CIR: %[[CONST_2F:.*]] = cir.const #cir.fp<2.000000e+00> : !cir.float
+// CIR: %[[COMPLEX:.*]] = cir.complex.create %[[TMP_A]], %[[CONST_2F]] : !cir.float -> !cir.complex<!cir.float>
+// CIR: cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4
+// LLVM: %[[TMP_A:.*]] = load float, ptr {{.*}}, align 4
+// LLVM: %[[TMP:.*]] = insertvalue { float, float } undef, float %[[TMP_A]], 0
+// LLVM: %[[TMP_2:.*]] = insertvalue { float, float } %[[TMP]], float 2.000000e+00, 1
+// LLVM: store { float, float } %[[TMP_2]], ptr %[[COMPLEX]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { float, float }, align 4
+// OGCG: %[[TMP_A:.*]] = load float, ptr {{.*}}, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store float %[[TMP_A]], ptr %[[C_REAL_PTR]], align 4
+// OGCG: store float 2.000000e+00, ptr %[[C_IMAG_PTR]], align 4
+
diff --git a/clang/test/CIR/Transforms/complex-create-fold.cir b/clang/test/CIR/Transforms/complex-create-fold.cir
new file mode 100644
index 0000000000000..5d9d22112c8b7
--- /dev/null
+++ b/clang/test/CIR/Transforms/complex-create-fold.cir
@@ -0,0 +1,30 @@
+// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module {
+  cir.func @fold_complex_create_test() -> !cir.complex<!s32i>  {
+    %0 = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["__retval"]
+    %1 = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+    %2 = cir.const #cir.int<1> : !s32i
+    %3 = cir.const #cir.int<2> : !s32i
+    %4 = cir.complex.create %2, %3 : !s32i -> !cir.complex<!s32i>
+    cir.store align(4) %4, %1 : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+    %5 = cir.load align(4) %1 : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+    cir.store align(4) %5, %0 : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+    %6 = cir.load %0 : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+    cir.return %6 : !cir.complex<!s32i>
+  }
+
+// CHECK: cir.func @fold_complex_create_test() -> !cir.complex<!s32i> {
+// CHECK:   %[[RET:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["__retval"]
+// CHECK:   %[[INIT:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["c", init]
+// CHECK:   %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex<!s32i>
+// CHECK:   cir.store{{.*}} %[[COMPLEX]], %[[INIT]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+// CHECK:   %[[TMP:.*]] = cir.load{{.*}} %[[INIT]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+// CHECK:   cir.store{{.*}} %[[TMP:.*]], %[[RET]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
+// CHECK:   %[[TMP_2:.*]] = cir.load %[[RET]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
+// CHECK:   cir.return %[[TMP_2]] : !cir.complex<!s32i>
+// CHECK: }
+
+}

From 2cb32e29408a6c598072ea0f066a246957be69f9 Mon Sep 17 00:00:00 2001
From: Ross Kirsling <rkirsling@gmail.com>
Date: Sat, 14 Jun 2025 22:03:23 +0900
Subject: [PATCH 472/851] [Clang] Fix fix-it hint regression from #143460
 (#144069)

Following #143460, `:` began displaying as `colon` in the fix-it hint
for a `case` with a missing colon, as is visible in the description of
(the separate bug) #144052.

This PR simply reverts a line that didn't need to be changed.
---
 clang/lib/Parse/ParseStmt.cpp                  |  3 +--
 clang/test/FixIt/fixit-punctuator-spelling.cpp | 10 ++++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/FixIt/fixit-punctuator-spelling.cpp

diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index 434ea68442819..c0c9bbc2e15c6 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -836,8 +836,7 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx,
 
       Diag(ExpectedLoc, diag::err_expected_after)
           << "'case'" << tok::colon
-          << FixItHint::CreateInsertion(ExpectedLoc,
-                                        tok::getTokenName(tok::colon));
+          << FixItHint::CreateInsertion(ExpectedLoc, ":");
 
       ColonLoc = ExpectedLoc;
     }
diff --git a/clang/test/FixIt/fixit-punctuator-spelling.cpp b/clang/test/FixIt/fixit-punctuator-spelling.cpp
new file mode 100644
index 0000000000000..3cba0e7b64594
--- /dev/null
+++ b/clang/test/FixIt/fixit-punctuator-spelling.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: not %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+
+void f(int x) {
+  switch (x) {
+    case 1 // expected-error {{expected ':' after 'case'}}
+      break;
+  }
+}
+// CHECK: fix-it:"{{.*}}":{6:11-6:11}:":"

From 42595d34bda74e0d6e3b6ec0cf253875330f9c42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= <mgorny@gentoo.org>
Date: Sat, 14 Jun 2025 16:11:41 +0200
Subject: [PATCH 473/851] [llvm] [cmake] Use pkg-config to obtain libffi search
 hints (#144221)

Extend `FindFFI.cmake` to include the paths obtained from pkg-config
when searching for libffi. This is going to help systems where libffi is
installed in nonstandard directory such as Gentoo, saving us from having
to copy the paths from pkg-config to `FFI_*` variables explicitly. The
logic is inspired by `FindLibEdit.cmake`.
---
 llvm/cmake/modules/FindFFI.cmake | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/cmake/modules/FindFFI.cmake b/llvm/cmake/modules/FindFFI.cmake
index 8e67c5d8c6d17..b1f64522b2682 100644
--- a/llvm/cmake/modules/FindFFI.cmake
+++ b/llvm/cmake/modules/FindFFI.cmake
@@ -23,7 +23,10 @@
 # Additionally, the following import target will be defined:
 # FFI::ffi
 
-find_path(FFI_INCLUDE_DIRS ffi.h PATHS ${FFI_INCLUDE_DIR})
+find_package(PkgConfig QUIET)
+pkg_check_modules(PC_LIBFFI QUIET libffi)
+
+find_path(FFI_INCLUDE_DIRS ffi.h PATHS ${FFI_INCLUDE_DIR} ${PC_LIBFFI_INCLUDE_DIRS})
 if( EXISTS "${FFI_INCLUDE_DIRS}/ffi.h" )
   set(FFI_HEADER ffi.h CACHE INTERNAL "")
   set(HAVE_FFI_H 1 CACHE INTERNAL "")
@@ -35,8 +38,8 @@ else()
   endif()
 endif()
 
-find_library(FFI_LIBRARIES NAMES ffi PATHS ${FFI_LIBRARY_DIR})
-find_library(FFI_STATIC_LIBRARIES NAMES libffi.a PATHS ${FFI_LIBRARY_DIR})
+find_library(FFI_LIBRARIES NAMES ffi PATHS ${FFI_LIBRARY_DIR} ${PC_LIBFFI_LIBRARY_DIRS})
+find_library(FFI_STATIC_LIBRARIES NAMES libffi.a PATHS ${FFI_LIBRARY_DIR} ${PC_LIBFFI_LIBRARY_DIRS})
 
 if(FFI_LIBRARIES)
   include(CMakePushCheckState)

From ff295d2f3429a5a2a93b2c86099af40544f467d4 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler@amd.com>
Date: Sat, 14 Jun 2025 16:17:08 +0200
Subject: [PATCH 474/851] [OpenMP][clang] declare mapper: fix handling of
 nested types (#143504)

Fix a crash that happened during parsing of a "declare mapper" construct
for a struct that contains an element for which we also declared a
custom default mapper.
---
 clang/include/clang/Sema/SemaOpenMP.h         |  2 +-
 clang/lib/Parse/ParseOpenMP.cpp               | 12 ++++++---
 clang/test/OpenMP/declare_mapper_ast_print.c  | 25 ++++++++++++++++++
 .../test/OpenMP/declare_mapper_ast_print.cpp  | 26 +++++++++++++++++++
 4 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index be6bec2068784..7b169f56b6807 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -283,7 +283,7 @@ class SemaOpenMP : public SemaBase {
   /// mapper' construct.
   QualType ActOnOpenMPDeclareMapperType(SourceLocation TyLoc,
                                         TypeResult ParsedType);
-  /// Called on start of '#pragma omp declare mapper'.
+  /// Called for '#pragma omp declare mapper'.
   DeclGroupPtrTy ActOnOpenMPDeclareMapperDirective(
       Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType,
       SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS,
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index def1a52ba7d4a..78d3503d8eb68 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -576,6 +576,7 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) {
     return DeclGroupPtrTy();
   }
 
+  Scope *OuterScope = getCurScope();
   // Enter scope.
   DeclarationNameInfo DirName;
   SourceLocation Loc = Tok.getLocation();
@@ -614,12 +615,17 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) {
     IsCorrect = false;
   }
 
+  // This needs to be called within the scope because
+  // processImplicitMapsWithDefaultMappers may add clauses when analyzing nested
+  // types. The scope used for calling ActOnOpenMPDeclareMapperDirective,
+  // however, needs to be the outer one, otherwise declared mappers don't become
+  // visible.
+  DeclGroupPtrTy DG = Actions.OpenMP().ActOnOpenMPDeclareMapperDirective(
+      OuterScope, Actions.getCurLexicalContext(), MapperId, MapperType,
+      Range.getBegin(), VName, AS, MapperVarRef.get(), Clauses);
   // Exit scope.
   Actions.OpenMP().EndOpenMPDSABlock(nullptr);
   OMPDirectiveScope.Exit();
-  DeclGroupPtrTy DG = Actions.OpenMP().ActOnOpenMPDeclareMapperDirective(
-      getCurScope(), Actions.getCurLexicalContext(), MapperId, MapperType,
-      Range.getBegin(), VName, AS, MapperVarRef.get(), Clauses);
   if (!IsCorrect)
     return DeclGroupPtrTy();
 
diff --git a/clang/test/OpenMP/declare_mapper_ast_print.c b/clang/test/OpenMP/declare_mapper_ast_print.c
index 3c554a106fe49..bb83f23a0c18a 100644
--- a/clang/test/OpenMP/declare_mapper_ast_print.c
+++ b/clang/test/OpenMP/declare_mapper_ast_print.c
@@ -49,6 +49,23 @@ struct dat {
 #pragma omp declare mapper(struct dat d) map(to: d.d)
 // CHECK: #pragma omp declare mapper (default : struct dat d) map(to: d.d){{$}}
 
+// Verify that nested default mappers do not lead to a crash during parsing / sema.
+// CHECK: struct inner {
+struct inner {
+  int size;
+  int *data;
+};
+#pragma omp declare mapper(struct inner i) map(i, i.data[0 : i.size])
+// CHECK: #pragma omp declare mapper (default : struct inner i) map(tofrom: default::i,i.data[0:i.size]){{$}}
+
+// CHECK: struct outer {
+struct outer {
+  int a;
+  struct inner i;
+};
+#pragma omp declare mapper(struct outer o) map(o)
+// CHECK: #pragma omp declare mapper (default : struct outer o) map(tofrom: default::o) map(tofrom: o.i){{$}}
+
 // CHECK: int main(void) {
 int main(void) {
 #pragma omp declare mapper(id: struct vec v) map(v.len)
@@ -77,6 +94,14 @@ int main(void) {
 #pragma omp declare mapper(id1: struct vec vvec) map(iterator(it=0:vvec.len:2), tofrom:vvec.data[it])
 // OMP52: #pragma omp declare mapper (id1 : struct vec vvec) map(iterator(int it = 0:vvec.len:2),tofrom: vvec.data[it]);
 #endif
+
+  {
+    struct outer outer;
+#pragma omp target map(outer)
+// CHECK: #pragma omp target map(tofrom: outer)
+    { }
+  }
+
   return 0;
 }
 // CHECK: }
diff --git a/clang/test/OpenMP/declare_mapper_ast_print.cpp b/clang/test/OpenMP/declare_mapper_ast_print.cpp
index 422fa9981672e..9ca3412e3e3d9 100644
--- a/clang/test/OpenMP/declare_mapper_ast_print.cpp
+++ b/clang/test/OpenMP/declare_mapper_ast_print.cpp
@@ -34,6 +34,28 @@ class vecchild : public vec {
 // CHECK: }
 // CHECK: ;
 
+// Verify that nested default mappers do not lead to a crash during parsing / sema.
+// CHECK: namespace N2 {
+namespace N2
+{
+// CHECK: struct inner {
+struct inner {
+  int size;
+  int *data;
+};
+#pragma omp declare mapper(struct inner i) map(i, i.data[0 : i.size])
+// CHECK: #pragma omp declare mapper (default : struct inner i) map(tofrom: N2::default::i,i.data[0:i.size]){{$}}
+
+// CHECK: struct outer {
+struct outer {
+  int a;
+  struct inner i;
+};
+#pragma omp declare mapper(struct outer o) map(o)
+// CHECK: #pragma omp declare mapper (default : struct outer o) map(tofrom: N2::default::o) map(tofrom: o.i){{$}}
+} // namespace N2
+// CHECK: }
+
 template <class T>
 class dat {
 public:
@@ -122,6 +144,7 @@ T foo(T a) {
 int main() {
   N1::vec vv, vvv;
   N1::vecchild vc;
+  N2::outer outer;
   dat<double> dd;
 #pragma omp target map(mapper(N1::id) tofrom: vv) map(mapper(dat<double>::id) alloc: vvv)
 // CHECK: #pragma omp target map(mapper(N1::id),tofrom: vv) map(mapper(dat<double>::id),alloc: vvv)
@@ -132,6 +155,9 @@ int main() {
 #pragma omp target map(mapper(default) tofrom: dd)
 // CHECK: #pragma omp target map(mapper(default),tofrom: dd)
   { dd.d++; }
+#pragma omp target map(outer)
+// CHECK: #pragma omp target map(tofrom: outer)
+  { }
 
 #pragma omp target update to(mapper(N1::id) : vc)
 // CHECK: #pragma omp target update to(mapper(N1::id): vc)

From 10bc17fc3676b82c7240046a948d2925dd2045d3 Mon Sep 17 00:00:00 2001
From: Tom Vijlbrief <tvijlbrief@gmail.com>
Date: Sat, 14 Jun 2025 17:10:04 +0200
Subject: [PATCH 475/851] [AVR] Add support for many new AVR MCUs (#143914)

fixes https://github.com/llvm/llvm-project/issues/116116
---
 clang/lib/Basic/Targets/AVR.cpp     | 69 ++++++++++++++++++++++++++
 clang/lib/Driver/ToolChains/AVR.cpp | 70 ++++++++++++++++++++++++++
 llvm/lib/Target/AVR/AVRDevices.td   | 76 +++++++++++++++++++++++++++++
 3 files changed, 215 insertions(+)

diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
index 85ca4bc30c461..bbe7b01ca036d 100644
--- a/clang/lib/Basic/Targets/AVR.cpp
+++ b/clang/lib/Basic/Targets/AVR.cpp
@@ -336,6 +336,9 @@ static MCUInfo AVRMcus[] = {
     {"attiny1624", "__AVR_ATtiny1624__", "103", 1},
     {"attiny1626", "__AVR_ATtiny1626__", "103", 1},
     {"attiny1627", "__AVR_ATtiny1627__", "103", 1},
+    {"attiny3224", "__AVR_ATtiny3224__", "103", 1},
+    {"attiny3226", "__AVR_ATtiny3226__", "103", 1},
+    {"attiny3227", "__AVR_ATtiny3227__", "103", 1},
     {"atmega808", "__AVR_ATmega808__", "103", 1},
     {"atmega809", "__AVR_ATmega809__", "103", 1},
     {"atmega1608", "__AVR_ATmega1608__", "103", 1},
@@ -344,6 +347,72 @@ static MCUInfo AVRMcus[] = {
     {"atmega3209", "__AVR_ATmega3209__", "103", 1},
     {"atmega4808", "__AVR_ATmega4808__", "103", 1},
     {"atmega4809", "__AVR_ATmega4809__", "103", 1},
+
+    // gcc 14 additions:
+
+    {"avr64da28", "__AVR_AVR64DA28__", "102", 1},
+    {"avr64da32", "__AVR_AVR64DA32__", "102", 1},
+    {"avr64da48", "__AVR_AVR64DA48__", "102", 1},
+    {"avr64da64", "__AVR_AVR64DA64__", "102", 1},
+    {"avr64db28", "__AVR_AVR64DB28__", "102", 1},
+    {"avr64db32", "__AVR_AVR64DB32__", "102", 1},
+    {"avr64db48", "__AVR_AVR64DB48__", "102", 1},
+    {"avr64db64", "__AVR_AVR64DB64__", "102", 1},
+    {"avr64dd14", "__AVR_AVR64DD14__", "102", 1},
+    {"avr64dd20", "__AVR_AVR64DD20__", "102", 1},
+    {"avr64dd28", "__AVR_AVR64DD28__", "102", 1},
+    {"avr64dd32", "__AVR_AVR64DD32__", "102", 1},
+    {"avr64du28", "__AVR_AVR64DU28__", "102", 1},
+    {"avr64du32", "__AVR_AVR64DU32__", "102", 1},
+    {"avr64ea28", "__AVR_AVR64EA28__", "102", 1},
+    {"avr64ea32", "__AVR_AVR64EA32__", "102", 1},
+    {"avr64ea48", "__AVR_AVR64EA48__", "102", 1},
+    {"avr64sd28", "__AVR_AVR64SD28__", "102", 1},
+    {"avr64sd32", "__AVR_AVR64SD32__", "102", 1},
+    {"avr64sd48", "__AVR_AVR64SD48__", "102", 1},
+
+    {"avr16dd20", "__AVR_AVR16DD20__", "103", 1},
+    {"avr16dd28", "__AVR_AVR16DD28__", "103", 1},
+    {"avr16dd32", "__AVR_AVR16DD32__", "103", 1},
+    {"avr16du14", "__AVR_AVR16DU14__", "103", 1},
+    {"avr16du20", "__AVR_AVR16DU20__", "103", 1},
+    {"avr16du28", "__AVR_AVR16DU28__", "103", 1},
+    {"avr16du32", "__AVR_AVR16DU32__", "103", 1},
+    {"avr32da28", "__AVR_AVR32DA28__", "103", 1},
+    {"avr32da32", "__AVR_AVR32DA32__", "103", 1},
+    {"avr32da48", "__AVR_AVR32DA48__", "103", 1},
+    {"avr32db28", "__AVR_AVR32DB28__", "103", 1},
+    {"avr32db32", "__AVR_AVR32DB32__", "103", 1},
+    {"avr32db48", "__AVR_AVR32DB48__", "103", 1},
+    {"avr32dd14", "__AVR_AVR32DD14__", "103", 1},
+    {"avr32dd20", "__AVR_AVR32DD20__", "103", 1},
+    {"avr32dd28", "__AVR_AVR32DD28__", "103", 1},
+    {"avr32dd32", "__AVR_AVR32DD32__", "103", 1},
+    {"avr32du14", "__AVR_AVR32DU14__", "103", 1},
+    {"avr32du20", "__AVR_AVR32DU20__", "103", 1},
+    {"avr32du28", "__AVR_AVR32DU28__", "103", 1},
+    {"avr32du32", "__AVR_AVR32DU32__", "103", 1},
+    {"avr16eb14", "__AVR_AVR16EB14__", "103", 1},
+    {"avr16eb20", "__AVR_AVR16EB20__", "103", 1},
+    {"avr16eb28", "__AVR_AVR16EB28__", "103", 1},
+    {"avr16eb32", "__AVR_AVR16EB32__", "103", 1},
+    {"avr16ea28", "__AVR_AVR16EA28__", "103", 1},
+    {"avr16ea32", "__AVR_AVR16EA32__", "103", 1},
+    {"avr16ea48", "__AVR_AVR16EA48__", "103", 1},
+    {"avr32ea28", "__AVR_AVR32EA28__", "103", 1},
+    {"avr32ea32", "__AVR_AVR32EA32__", "103", 1},
+    {"avr32ea48", "__AVR_AVR32EA48__", "103", 1},
+    {"avr32sd20", "__AVR_AVR32SD20__", "103", 1},
+    {"avr32sd28", "__AVR_AVR32SD28__", "103", 1},
+    {"avr32sd32", "__AVR_AVR32SD32__", "103", 1},
+    {"avr128da28", "__AVR_AVR128DA28__", "104", 2},
+    {"avr128da32", "__AVR_AVR128DA32__", "104", 2},
+    {"avr128da48", "__AVR_AVR128DA48__", "104", 2},
+    {"avr128da64", "__AVR_AVR128DA64__", "104", 2},
+    {"avr128db28", "__AVR_AVR128DB28__", "104", 2},
+    {"avr128db32", "__AVR_AVR128DB32__", "104", 2},
+    {"avr128db48", "__AVR_AVR128DB48__", "104", 2},
+    {"avr128db64", "__AVR_AVR128DB64__", "104", 2},
 };
 
 } // namespace targets
diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
index b0523a7f4e40e..731076d9754a9 100644
--- a/clang/lib/Driver/ToolChains/AVR.cpp
+++ b/clang/lib/Driver/ToolChains/AVR.cpp
@@ -326,8 +326,78 @@ constexpr struct {
     {"attiny1624", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1626", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1627", "avrxmega3", "avrxmega3", 0x803800},
+    {"attiny3224", "avrxmega3", "avrxmega3", 0x803400},
+    {"attiny3226", "avrxmega3", "avrxmega3", 0x803400},
+    {"attiny3227", "avrxmega3", "avrxmega3", 0x803400},
     {"attiny3216", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny3217", "avrxmega3", "avrxmega3", 0x803800},
+
+    // gcc 14 additions:
+
+    {"avr64da28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da48", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da64", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db48", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db64", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd14", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd20", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64du28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64du32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64ea28", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64ea32", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64ea48", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64sd28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64sd32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64sd48", "avrxmega2", "avrxmega2", 0x806000},
+
+    {"avr16dd20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16dd28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16dd32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du14", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr32da28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32da32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32da48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd14", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du14", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr16eb14", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea48", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr32ea28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32ea32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32ea48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr128da28", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da32", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da48", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da64", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db28", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db32", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db48", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db64", "avrxmega4", "avrxmega4", 0x804000},
+
 };
 
 std::string GetMCUSubPath(StringRef MCUName) {
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index 56147bb473bc4..efe78391f7319 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -215,6 +215,13 @@ def FamilyXMEGA3 : Family<"xmega3",
                            FeatureMultiplication, FeatureMOVW, FeatureLPMX,
                            FeatureBREAK, FeatureLowByteFirst]>;
 
+def FamilyXMEGA4 : Family<"xmega4",
+                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
+                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
+                           FeatureMultiplication, FeatureMOVW, FeatureLPMX,
+                           FeatureELPM,
+                           FeatureBREAK, FeatureLowByteFirst]>;
+
 def FamilyXMEGA : Family<"xmega",
                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
@@ -567,6 +574,9 @@ def : Device<"attiny3217", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1624", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1626", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1627", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3224", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3226", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3227", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega809", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega1608", FamilyXMEGA3, ELFArchXMEGA3>;
@@ -575,3 +585,69 @@ def : Device<"atmega3208", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega3209", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4809", FamilyXMEGA3, ELFArchXMEGA3>;
+
+// Additions from gcc 14:
+
+def : Device<"avr64da28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da64", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db64", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd14", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd20", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64du28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64du32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd48", FamilyXMEGA2, ELFArchXMEGA2>;
+
+def : Device<"avr16dd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16dd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16dd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr128da28", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da32", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da48", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da64", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db28", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db32", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db48", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db64", FamilyXMEGA4, ELFArchXMEGA4>;

From 62d8e001dac4b1a68f5b33c8784adba1335003f4 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 14 Jun 2025 08:18:50 -0700
Subject: [PATCH 476/851] Revert "[AVR] Add support for many new AVR MCUs
 (#143914)"

This reverts commit 10bc17fc3676b82c7240046a948d2925dd2045d3.

Multiple buildbot failures have been reported:
https://github.com/llvm/llvm-project/pull/143914
---
 clang/lib/Basic/Targets/AVR.cpp     | 69 --------------------------
 clang/lib/Driver/ToolChains/AVR.cpp | 70 --------------------------
 llvm/lib/Target/AVR/AVRDevices.td   | 76 -----------------------------
 3 files changed, 215 deletions(-)

diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
index bbe7b01ca036d..85ca4bc30c461 100644
--- a/clang/lib/Basic/Targets/AVR.cpp
+++ b/clang/lib/Basic/Targets/AVR.cpp
@@ -336,9 +336,6 @@ static MCUInfo AVRMcus[] = {
     {"attiny1624", "__AVR_ATtiny1624__", "103", 1},
     {"attiny1626", "__AVR_ATtiny1626__", "103", 1},
     {"attiny1627", "__AVR_ATtiny1627__", "103", 1},
-    {"attiny3224", "__AVR_ATtiny3224__", "103", 1},
-    {"attiny3226", "__AVR_ATtiny3226__", "103", 1},
-    {"attiny3227", "__AVR_ATtiny3227__", "103", 1},
     {"atmega808", "__AVR_ATmega808__", "103", 1},
     {"atmega809", "__AVR_ATmega809__", "103", 1},
     {"atmega1608", "__AVR_ATmega1608__", "103", 1},
@@ -347,72 +344,6 @@ static MCUInfo AVRMcus[] = {
     {"atmega3209", "__AVR_ATmega3209__", "103", 1},
     {"atmega4808", "__AVR_ATmega4808__", "103", 1},
     {"atmega4809", "__AVR_ATmega4809__", "103", 1},
-
-    // gcc 14 additions:
-
-    {"avr64da28", "__AVR_AVR64DA28__", "102", 1},
-    {"avr64da32", "__AVR_AVR64DA32__", "102", 1},
-    {"avr64da48", "__AVR_AVR64DA48__", "102", 1},
-    {"avr64da64", "__AVR_AVR64DA64__", "102", 1},
-    {"avr64db28", "__AVR_AVR64DB28__", "102", 1},
-    {"avr64db32", "__AVR_AVR64DB32__", "102", 1},
-    {"avr64db48", "__AVR_AVR64DB48__", "102", 1},
-    {"avr64db64", "__AVR_AVR64DB64__", "102", 1},
-    {"avr64dd14", "__AVR_AVR64DD14__", "102", 1},
-    {"avr64dd20", "__AVR_AVR64DD20__", "102", 1},
-    {"avr64dd28", "__AVR_AVR64DD28__", "102", 1},
-    {"avr64dd32", "__AVR_AVR64DD32__", "102", 1},
-    {"avr64du28", "__AVR_AVR64DU28__", "102", 1},
-    {"avr64du32", "__AVR_AVR64DU32__", "102", 1},
-    {"avr64ea28", "__AVR_AVR64EA28__", "102", 1},
-    {"avr64ea32", "__AVR_AVR64EA32__", "102", 1},
-    {"avr64ea48", "__AVR_AVR64EA48__", "102", 1},
-    {"avr64sd28", "__AVR_AVR64SD28__", "102", 1},
-    {"avr64sd32", "__AVR_AVR64SD32__", "102", 1},
-    {"avr64sd48", "__AVR_AVR64SD48__", "102", 1},
-
-    {"avr16dd20", "__AVR_AVR16DD20__", "103", 1},
-    {"avr16dd28", "__AVR_AVR16DD28__", "103", 1},
-    {"avr16dd32", "__AVR_AVR16DD32__", "103", 1},
-    {"avr16du14", "__AVR_AVR16DU14__", "103", 1},
-    {"avr16du20", "__AVR_AVR16DU20__", "103", 1},
-    {"avr16du28", "__AVR_AVR16DU28__", "103", 1},
-    {"avr16du32", "__AVR_AVR16DU32__", "103", 1},
-    {"avr32da28", "__AVR_AVR32DA28__", "103", 1},
-    {"avr32da32", "__AVR_AVR32DA32__", "103", 1},
-    {"avr32da48", "__AVR_AVR32DA48__", "103", 1},
-    {"avr32db28", "__AVR_AVR32DB28__", "103", 1},
-    {"avr32db32", "__AVR_AVR32DB32__", "103", 1},
-    {"avr32db48", "__AVR_AVR32DB48__", "103", 1},
-    {"avr32dd14", "__AVR_AVR32DD14__", "103", 1},
-    {"avr32dd20", "__AVR_AVR32DD20__", "103", 1},
-    {"avr32dd28", "__AVR_AVR32DD28__", "103", 1},
-    {"avr32dd32", "__AVR_AVR32DD32__", "103", 1},
-    {"avr32du14", "__AVR_AVR32DU14__", "103", 1},
-    {"avr32du20", "__AVR_AVR32DU20__", "103", 1},
-    {"avr32du28", "__AVR_AVR32DU28__", "103", 1},
-    {"avr32du32", "__AVR_AVR32DU32__", "103", 1},
-    {"avr16eb14", "__AVR_AVR16EB14__", "103", 1},
-    {"avr16eb20", "__AVR_AVR16EB20__", "103", 1},
-    {"avr16eb28", "__AVR_AVR16EB28__", "103", 1},
-    {"avr16eb32", "__AVR_AVR16EB32__", "103", 1},
-    {"avr16ea28", "__AVR_AVR16EA28__", "103", 1},
-    {"avr16ea32", "__AVR_AVR16EA32__", "103", 1},
-    {"avr16ea48", "__AVR_AVR16EA48__", "103", 1},
-    {"avr32ea28", "__AVR_AVR32EA28__", "103", 1},
-    {"avr32ea32", "__AVR_AVR32EA32__", "103", 1},
-    {"avr32ea48", "__AVR_AVR32EA48__", "103", 1},
-    {"avr32sd20", "__AVR_AVR32SD20__", "103", 1},
-    {"avr32sd28", "__AVR_AVR32SD28__", "103", 1},
-    {"avr32sd32", "__AVR_AVR32SD32__", "103", 1},
-    {"avr128da28", "__AVR_AVR128DA28__", "104", 2},
-    {"avr128da32", "__AVR_AVR128DA32__", "104", 2},
-    {"avr128da48", "__AVR_AVR128DA48__", "104", 2},
-    {"avr128da64", "__AVR_AVR128DA64__", "104", 2},
-    {"avr128db28", "__AVR_AVR128DB28__", "104", 2},
-    {"avr128db32", "__AVR_AVR128DB32__", "104", 2},
-    {"avr128db48", "__AVR_AVR128DB48__", "104", 2},
-    {"avr128db64", "__AVR_AVR128DB64__", "104", 2},
 };
 
 } // namespace targets
diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
index 731076d9754a9..b0523a7f4e40e 100644
--- a/clang/lib/Driver/ToolChains/AVR.cpp
+++ b/clang/lib/Driver/ToolChains/AVR.cpp
@@ -326,78 +326,8 @@ constexpr struct {
     {"attiny1624", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1626", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1627", "avrxmega3", "avrxmega3", 0x803800},
-    {"attiny3224", "avrxmega3", "avrxmega3", 0x803400},
-    {"attiny3226", "avrxmega3", "avrxmega3", 0x803400},
-    {"attiny3227", "avrxmega3", "avrxmega3", 0x803400},
     {"attiny3216", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny3217", "avrxmega3", "avrxmega3", 0x803800},
-
-    // gcc 14 additions:
-
-    {"avr64da28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64da32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64da48", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64da64", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64db28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64db32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64db48", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64db64", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64dd14", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64dd20", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64dd28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64dd32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64du28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64du32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64ea28", "avrxmega2", "avrxmega2", 0x806800},
-    {"avr64ea32", "avrxmega2", "avrxmega2", 0x806800},
-    {"avr64ea48", "avrxmega2", "avrxmega2", 0x806800},
-    {"avr64sd28", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64sd32", "avrxmega2", "avrxmega2", 0x806000},
-    {"avr64sd48", "avrxmega2", "avrxmega2", 0x806000},
-
-    {"avr16dd20", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16dd28", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16dd32", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16du14", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16du20", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16du28", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16du32", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr32da28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32da32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32da48", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32db28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32db32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32db48", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32dd14", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32dd20", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32dd28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32dd32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32du14", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32du20", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32du28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32du32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr16eb14", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16eb20", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16eb28", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16eb32", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16ea28", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16ea32", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr16ea48", "avrxmega3", "avrxmega3", 0x807800},
-    {"avr32ea28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32ea32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32ea48", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32sd20", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32sd28", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr32sd32", "avrxmega3", "avrxmega3", 0x807000},
-    {"avr128da28", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128da32", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128da48", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128da64", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128db28", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128db32", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128db48", "avrxmega4", "avrxmega4", 0x804000},
-    {"avr128db64", "avrxmega4", "avrxmega4", 0x804000},
-
 };
 
 std::string GetMCUSubPath(StringRef MCUName) {
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index efe78391f7319..56147bb473bc4 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -215,13 +215,6 @@ def FamilyXMEGA3 : Family<"xmega3",
                            FeatureMultiplication, FeatureMOVW, FeatureLPMX,
                            FeatureBREAK, FeatureLowByteFirst]>;
 
-def FamilyXMEGA4 : Family<"xmega4",
-                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
-                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
-                           FeatureMultiplication, FeatureMOVW, FeatureLPMX,
-                           FeatureELPM,
-                           FeatureBREAK, FeatureLowByteFirst]>;
-
 def FamilyXMEGA : Family<"xmega",
                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
@@ -574,9 +567,6 @@ def : Device<"attiny3217", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1624", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1626", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1627", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"attiny3224", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"attiny3226", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"attiny3227", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega809", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega1608", FamilyXMEGA3, ELFArchXMEGA3>;
@@ -585,69 +575,3 @@ def : Device<"atmega3208", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega3209", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4809", FamilyXMEGA3, ELFArchXMEGA3>;
-
-// Additions from gcc 14:
-
-def : Device<"avr64da28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64da32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64da48", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64da64", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64db28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64db32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64db48", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64db64", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64dd14", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64dd20", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64dd28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64dd32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64du28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64du32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64ea28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64ea32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64ea48", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64sd28", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64sd32", FamilyXMEGA2, ELFArchXMEGA2>;
-def : Device<"avr64sd48", FamilyXMEGA2, ELFArchXMEGA2>;
-
-def : Device<"avr16dd20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16dd28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16dd32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16du14", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16du20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16du28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16du32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32da28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32da32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32da48", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32db28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32db32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32db48", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32dd14", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32dd20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32dd28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32dd32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32du14", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32du20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32du28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32du32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16eb14", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16eb20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16eb28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16eb32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16ea28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16ea32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr16ea48", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32ea28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32ea32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32ea48", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32sd20", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32sd28", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr32sd32", FamilyXMEGA3, ELFArchXMEGA3>;
-def : Device<"avr128da28", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128da32", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128da48", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128da64", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128db28", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128db32", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128db48", FamilyXMEGA4, ELFArchXMEGA4>;
-def : Device<"avr128db64", FamilyXMEGA4, ELFArchXMEGA4>;

From 72f99b75afc12bb15a7730544339bcc1ca11e8ee Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 14 Jun 2025 16:48:44 +0100
Subject: [PATCH 477/851] [LV] Add test case with branch weights.

Add test case with branch weights where the vector loop can
be removed. Exposed a crash with db8d34db26e9
(https://github.com/llvm/llvm-project/pull/143035).
---
 ...oop-backedge-elimination-branch-weights.ll | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll

diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll
new file mode 100644
index 0000000000000..d5acf5c38f768
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-branch-weights.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF8UF1 %s
+; RUN: opt -p loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -S %s | FileCheck --check-prefixes=VF8UF2 %s
+; RUN: opt -p loop-vectorize -force-vector-width=16 -force-vector-interleave=1 -S %s | FileCheck --check-prefixes=VF16UF1 %s
+
+; Check if the vector loop condition can be simplified to true for a given
+; VF/IC combination.
+define void @test_tc_between_8_and_17(ptr %A, i64 range(i64 8, 17) %N) {
+; VF8UF1-LABEL: define void @test_tc_between_8_and_17(
+; VF8UF1-SAME: ptr [[A:%.*]], i64 range(i64 8, 17) [[N:%.*]]) {
+; VF8UF1-NEXT:  [[ENTRY:.*]]:
+; VF8UF1-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; VF8UF1:       [[VECTOR_PH]]:
+; VF8UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; VF8UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF1:       [[VECTOR_BODY]]:
+; VF8UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VF8UF1-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; VF8UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
+; VF8UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF1-NEXT:    [[TMP2:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF1-NEXT:    store <8 x i8> [[TMP2]], ptr [[TMP1]], align 1
+; VF8UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; VF8UF1-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF8UF1:       [[MIDDLE_BLOCK]]:
+; VF8UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; VF8UF1:       [[SCALAR_PH]]:
+; VF8UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF1-NEXT:    br label %[[LOOP:.*]]
+; VF8UF1:       [[LOOP]]:
+; VF8UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF8UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF8UF1:       [[EXIT]]:
+; VF8UF1-NEXT:    ret void
+;
+; VF8UF2-LABEL: define void @test_tc_between_8_and_17(
+; VF8UF2-SAME: ptr [[A:%.*]], i64 range(i64 8, 17) [[N:%.*]]) {
+; VF8UF2-NEXT:  [[ENTRY:.*]]:
+; VF8UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF8UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; VF8UF2:       [[VECTOR_PH]]:
+; VF8UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF8UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF8UF2-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF8UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF8UF2:       [[VECTOR_BODY]]:
+; VF8UF2-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 0
+; VF8UF2-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[A]], i32 8
+; VF8UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; VF8UF2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; VF8UF2-NEXT:    [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP4:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10)
+; VF8UF2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i32 0
+; VF8UF2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i32 8
+; VF8UF2-NEXT:    store <8 x i8> [[TMP3]], ptr [[TMP5]], align 1
+; VF8UF2-NEXT:    store <8 x i8> [[TMP4]], ptr [[TMP6]], align 1
+; VF8UF2-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF8UF2:       [[MIDDLE_BLOCK]]:
+; VF8UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF8UF2-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2:       [[SCALAR_PH]]:
+; VF8UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF8UF2-NEXT:    br label %[[LOOP:.*]]
+; VF8UF2:       [[LOOP]]:
+; VF8UF2-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF8UF2-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF8UF2-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF8UF2-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF8UF2-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF8UF2-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF8UF2-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF8UF2:       [[EXIT]]:
+; VF8UF2-NEXT:    ret void
+;
+; VF16UF1-LABEL: define void @test_tc_between_8_and_17(
+; VF16UF1-SAME: ptr [[A:%.*]], i64 range(i64 8, 17) [[N:%.*]]) {
+; VF16UF1-NEXT:  [[ENTRY:.*]]:
+; VF16UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; VF16UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; VF16UF1:       [[VECTOR_PH]]:
+; VF16UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; VF16UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; VF16UF1-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
+; VF16UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; VF16UF1:       [[VECTOR_BODY]]:
+; VF16UF1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i32 0
+; VF16UF1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; VF16UF1-NEXT:    [[TMP2:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10)
+; VF16UF1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i32 0
+; VF16UF1-NEXT:    store <16 x i8> [[TMP2]], ptr [[TMP3]], align 1
+; VF16UF1-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
+; VF16UF1:       [[MIDDLE_BLOCK]]:
+; VF16UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; VF16UF1-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1:       [[SCALAR_PH]]:
+; VF16UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ]
+; VF16UF1-NEXT:    br label %[[LOOP:.*]]
+; VF16UF1:       [[LOOP]]:
+; VF16UF1-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ]
+; VF16UF1-NEXT:    [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1
+; VF16UF1-NEXT:    [[L:%.*]] = load i8, ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[ADD:%.*]] = add nsw i8 [[L]], 10
+; VF16UF1-NEXT:    store i8 [[ADD]], ptr [[P_SRC]], align 1
+; VF16UF1-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 1
+; VF16UF1-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; VF16UF1-NEXT:    br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF16UF1:       [[EXIT]]:
+; VF16UF1-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+  %p.src.next = getelementptr inbounds i8, ptr %p.src, i64 1
+  %l = load i8, ptr %p.src, align 1
+  %add = add nsw i8 %l, 10
+  store i8 %add, ptr %p.src
+  %iv.next = add nsw i64 %iv, 1
+  %cmp = icmp eq i64 %iv.next, %N
+  br i1 %cmp, label %exit, label %loop, !prof !0
+
+exit:
+  ret void
+}
+
+!0 = !{!"branch_weights", !"expected", i32 1, i32 2000}

From 577199f9221ebc805a69372a2b19f4c8ebaf1daf Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 14 Jun 2025 17:18:36 +0100
Subject: [PATCH 478/851] Reapply "[VPlan] Set branch weight metadata on middle
 term in VPlan (NFC) (#143035)"

This reverts commit 0604dc199c019b23746f4a54885ba0c75569cdae.

The recommitted version addresses post-commit comments and adjusts the
place the branch weights are added. It now runs before VPlans are optimized
for VF and UF, which may remove the vector loop region, causing a crash
trying to get the middle block after that. Test case added in
72f99b75afc12bb.

Original message:
Manage branch weights for the BranchOnCond in the middle block in VPlan.
This requires updating VPInstruction to inherit from VPIRMetadata, which
in general makes sense as there are a number of opcodes that could take
metadata.

There are other branches (part of the skeleton) that also need branch
weights adding.

PR: https://github.com/llvm/llvm-project/pull/143035
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 24 ++-------
 llvm/lib/Transforms/Vectorize/VPlan.h         | 54 ++++++++++---------
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  6 ++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 23 ++++++++
 .../Transforms/Vectorize/VPlanTransforms.h    |  4 ++
 5 files changed, 66 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7c006ae326ecb..9b5ad16589539 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7300,6 +7300,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
                            OrigLoop->getHeader()->getContext());
   VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
+  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+    VPlanTransforms::runPass(VPlanTransforms::addBranchWeightToMiddleTerminator,
+                             BestVPlan, BestVF);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
   VPlanTransforms::simplifyRecipes(BestVPlan, *Legal->getWidestInductionType());
   VPlanTransforms::narrowInterleaveGroups(
@@ -7309,11 +7312,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   VPlanTransforms::convertToConcreteRecipes(BestVPlan,
                                             *Legal->getWidestInductionType());
-  // Retrieve and store the middle block before dissolving regions. Regions are
-  // dissolved after optimizing for VF and UF, which completely removes unneeded
-  // loop regions first.
-  VPBasicBlock *MiddleVPBB =
-      BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr;
+  // Regions are dissolved after optimizing for VF and UF, which completely
+  // removes unneeded loop regions first.
   VPlanTransforms::dissolveLoopRegions(BestVPlan);
   // Perform the actual loop transformation.
   VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
@@ -7456,20 +7456,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
 
   ILV.printDebugTracesAtEnd();
 
-  // 4. Adjust branch weight of the branch in the middle block.
-  if (HeaderVPBB) {
-    auto *MiddleTerm =
-        cast<BranchInst>(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator());
-    if (MiddleTerm->isConditional() &&
-        hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
-      // Assume that `Count % VectorTripCount` is equally distributed.
-      unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
-      assert(TripCount > 0 && "trip count should not be zero");
-      const uint32_t Weights[] = {1, TripCount - 1};
-      setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
-    }
-  }
-
   return ExpandedSCEVs;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 53619b39219e6..5a3c4a514a5dd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -882,11 +882,40 @@ template <unsigned PartOpIdx> class VPUnrollPartAccessor {
   unsigned getUnrollPart(VPUser &U) const;
 };
 
+/// Helper to manage IR metadata for recipes. It filters out metadata that
+/// cannot be propagated.
+class VPIRMetadata {
+  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
+
+public:
+  VPIRMetadata() {}
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I.
+  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
+
+  /// Adds metatadata that can be preserved from the original instruction
+  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
+  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
+
+  /// Copy constructor for cloning.
+  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
+
+  /// Add all metadata to \p I.
+  void applyMetadata(Instruction &I) const;
+
+  /// Add metadata with kind \p Kind and \p Node.
+  void addMetadata(unsigned Kind, MDNode *Node) {
+    Metadata.emplace_back(Kind, Node);
+  }
+};
+
 /// This is a concrete Recipe that models a single VPlan-level instruction.
 /// While as any Recipe it may generate a sequence of IR instructions when
 /// executed, these instructions would always form a single-def expression as
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPRecipeWithIRFlags,
+                      public VPIRMetadata,
                       public VPUnrollPartAccessor<1> {
   friend class VPlanSlp;
 
@@ -976,7 +1005,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL = {},
                 const Twine &Name = "")
       : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL),
-        Opcode(Opcode), Name(Name.str()) {}
+        VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                 const VPIRFlags &Flags, DebugLoc DL = {},
@@ -1268,29 +1297,6 @@ struct VPIRPhi : public VPIRInstruction, public VPPhiAccessors {
   const VPRecipeBase *getAsRecipe() const override { return this; }
 };
 
-/// Helper to manage IR metadata for recipes. It filters out metadata that
-/// cannot be propagated.
-class VPIRMetadata {
-  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
-
-public:
-  VPIRMetadata() {}
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I.
-  VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); }
-
-  /// Adds metatadata that can be preserved from the original instruction
-  /// \p I and noalias metadata guaranteed by runtime checks using \p LVer.
-  VPIRMetadata(Instruction &I, LoopVersioning *LVer);
-
-  /// Copy constructor for cloning.
-  VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {}
-
-  /// Add all metadata to \p I.
-  void applyMetadata(Instruction &I) const;
-};
-
 /// VPWidenRecipe is a recipe for producing a widened instruction using the
 /// opcode and operands of the recipe. This recipe covers most of the
 /// traditional vectorization cases where each recipe transforms into a
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index c64bda167b854..3bdfa6724f691 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -409,7 +409,7 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                              const VPIRFlags &Flags, DebugLoc DL,
                              const Twine &Name)
     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
-      Opcode(Opcode), Name(Name.str()) {
+      VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
   assert(flagsValidForOpcode(getOpcode()) &&
          "Set flags not supported for the provided opcode");
 }
@@ -590,7 +590,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
   }
   case VPInstruction::BranchOnCond: {
     Value *Cond = State.get(getOperand(0), VPLane(0));
-    return createCondBranch(Cond, getParent(), State);
+    auto *Br = createCondBranch(Cond, getParent(), State);
+    applyMetadata(*Br);
+    return Br;
   }
   case VPInstruction::BranchOnCount: {
     // First create the compare.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index dc3c7bfe5cd1a..44a72755b9cf8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/TypeSize.h"
@@ -3203,3 +3204,25 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
       Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
   removeDeadRecipes(Plan);
 }
+
+/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
+/// BranchOnCond recipe.
+void VPlanTransforms::addBranchWeightToMiddleTerminator(VPlan &Plan,
+                                                        ElementCount VF) {
+  VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
+  auto *MiddleTerm =
+      dyn_cast_or_null<VPInstruction>(MiddleVPBB->getTerminator());
+  // Only add branch metadata if there is a (conditional) terminator.
+  if (!MiddleTerm)
+    return;
+
+  assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
+         "must have a BranchOnCond");
+  // Assume that `TripCount % VectorStep ` is equally distributed.
+  unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
+  assert(VectorStep > 0 && "trip count should not be zero");
+  MDBuilder MDB(Plan.getScalarHeader()->getIRBasicBlock()->getContext());
+  MDNode *BranchWeights =
+      MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
+  MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights);
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 34e2de4eb3b74..5a03bdb7c6882 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -234,6 +234,10 @@ struct VPlanTransforms {
   /// removed in the future.
   static DenseMap<VPBasicBlock *, VPValue *>
   introduceMasksAndLinearize(VPlan &Plan, bool FoldTail);
+
+  /// Add branch weight metadata, if the \p Plan's middle block is terminated by
+  /// a BranchOnCond recipe.
+  static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF);
 };
 
 } // namespace llvm

From d6e25c4d21ebe20aaa6cbf6e2b9afde8f6713160 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu@sifive.com>
Date: Sat, 14 Jun 2025 10:21:08 -0700
Subject: [PATCH 479/851] [SelectionDAG] Take passthru into account when
 widening ISD::MLOAD (#144170)

#140595 used vp.load in the cases where we need to widen masked.load.
However, we didn't account for the passthru operand so it might
miscompile when the passthru is not undef. While we can simply avoid
using vp.load to widen when passthru is not undef, doing so will ran
into the exact same crash described in #140198 , so for scalable vector,
this patch manually merges the vp.load result with passthru when the
latter is not undef.
---
 .../SelectionDAG/LegalizeVectorTypes.cpp      | 17 ++++++++++++--
 .../rvv/fixed-vectors-masked-load-int.ll      | 13 +++++++++++
 .../test/CodeGen/RISCV/rvv/masked-load-int.ll | 22 ++++++++++++++++++-
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f63fe17da51ff..c56cfec81acdd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6149,7 +6149,12 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
 
   if (ExtType == ISD::NON_EXTLOAD &&
       TLI.isOperationLegalOrCustom(ISD::VP_LOAD, WidenVT) &&
-      TLI.isTypeLegal(WideMaskVT)) {
+      TLI.isTypeLegal(WideMaskVT) &&
+      // If there is a passthru, we shouldn't use vp.load. However,
+      // type legalizer will struggle on masked.load with
+      // scalable vectors, so for scalable vectors, we still use vp.load
+      // but manually merge the load result with the passthru using vp.select.
+      (N->getPassThru()->isUndef() || VT.isScalableVector())) {
     Mask = DAG.getInsertSubvector(dl, DAG.getUNDEF(WideMaskVT), Mask, 0);
     SDValue EVL = DAG.getElementCount(dl, TLI.getVPExplicitVectorLengthTy(),
                                       VT.getVectorElementCount());
@@ -6157,12 +6162,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
         DAG.getLoadVP(N->getAddressingMode(), ISD::NON_EXTLOAD, WidenVT, dl,
                       N->getChain(), N->getBasePtr(), N->getOffset(), Mask, EVL,
                       N->getMemoryVT(), N->getMemOperand());
+    SDValue NewVal = NewLoad;
+
+    // Manually merge with vp.select
+    if (!N->getPassThru()->isUndef()) {
+      assert(WidenVT.isScalableVector());
+      NewVal =
+          DAG.getNode(ISD::VP_SELECT, dl, WidenVT, Mask, NewVal, PassThru, EVL);
+    }
 
     // Modified the chain - switch anything that used the old chain to use
     // the new one.
     ReplaceValueWith(SDValue(N, 1), NewLoad.getValue(1));
 
-    return NewLoad;
+    return NewVal;
   }
 
   // The mask should be widened as well
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
index 545c89495e621..ed60d91308495 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-int.ll
@@ -341,3 +341,16 @@ define <7 x i8> @masked_load_v7i8(ptr %a, <7 x i1> %mask) {
   ret <7 x i8> %load
 }
 
+define <7 x i8> @masked_load_passthru_v7i8(ptr %a, <7 x i1> %mask) {
+; CHECK-LABEL: masked_load_passthru_v7i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 127
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vmv.s.x v8, a1
+; CHECK-NEXT:    vmand.mm v0, v0, v8
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vle8.v v8, (a0), v0.t
+; CHECK-NEXT:    ret
+  %load = call <7 x i8> @llvm.masked.load.v7i8(ptr %a, i32 8, <7 x i1> %mask, <7 x i8> zeroinitializer)
+  ret <7 x i8> %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
index d992669306fb1..75537406f3515 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
@@ -21,7 +21,27 @@ define <vscale x 1 x i8> @masked_load_nxv1i8(ptr %a, <vscale x 1 x i1> %mask) no
   %load = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr %a, i32 1, <vscale x 1 x i1> %mask, <vscale x 1 x i8> undef)
   ret <vscale x 1 x i8> %load
 }
-declare <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr, i32, <vscale x 1 x i1>, <vscale x 1 x i8>)
+
+define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1> %mask) nounwind {
+; V-LABEL: masked_load_passthru_nxv1i8:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a1, zero, e8, mf8, ta, mu
+; V-NEXT:    vmv.v.i v8, 0
+; V-NEXT:    vle8.v v8, (a0), v0.t
+; V-NEXT:    ret
+;
+; ZVE32-LABEL: masked_load_passthru_nxv1i8:
+; ZVE32:       # %bb.0:
+; ZVE32-NEXT:    csrr a1, vlenb
+; ZVE32-NEXT:    srli a1, a1, 3
+; ZVE32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
+; ZVE32-NEXT:    vmv.v.i v8, 0
+; ZVE32-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu
+; ZVE32-NEXT:    vle8.v v8, (a0), v0.t
+; ZVE32-NEXT:    ret
+  %load = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr %a, i32 1, <vscale x 1 x i1> %mask, <vscale x 1 x i8> zeroinitializer)
+  ret <vscale x 1 x i8> %load
+}
 
 define <vscale x 1 x i16> @masked_load_nxv1i16(ptr %a, <vscale x 1 x i1> %mask) nounwind {
 ; V-LABEL: masked_load_nxv1i16:

From db682a721aabf3c33dfda471bf6a7908fbf656b4 Mon Sep 17 00:00:00 2001
From: Tomer Shafir <tomer.shafir8@gmail.com>
Date: Sat, 14 Jun 2025 21:06:43 +0300
Subject: [PATCH 480/851] [utils] Add "aarch64-apple-macosx" triple to
 update_llc_test_checks.py (#144023)

Add a missing valid triple "aarch64-apple-macosx" for usability.
---
 llvm/utils/UpdateTestChecks/asm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py
index da7e7ecc24bdc..3754aa2eeba85 100644
--- a/llvm/utils/UpdateTestChecks/asm.py
+++ b/llvm/utils/UpdateTestChecks/asm.py
@@ -561,6 +561,7 @@ def get_run_handler(triple):
         "aarch64": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_RE),
         "aarch64-apple-darwin": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE),
         "aarch64-apple-ios": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE),
+        "aarch64-apple-macosx": (scrub_asm_arm_eabi, ASM_FUNCTION_AARCH64_DARWIN_RE),
         "bpf": (scrub_asm_bpf, ASM_FUNCTION_BPF_RE),
         "bpfel": (scrub_asm_bpf, ASM_FUNCTION_BPF_RE),
         "bpfeb": (scrub_asm_bpf, ASM_FUNCTION_BPF_RE),

From 0ff95c9eb1e3b0785724d3e33df1e1f77f2c7473 Mon Sep 17 00:00:00 2001
From: "Oleksandr T." <oleksandr.tarasiuk@outlook.com>
Date: Sun, 15 Jun 2025 00:01:25 +0300
Subject: [PATCH 481/851] [Clang] add fix-it hints for unknown attributes
 (#141305)

This patch adds fix-it hints for unknown attribute names when Clang
suggests a correction
---
 .../include/clang/Basic/AttributeCommonInfo.h |  50 +++--
 .../include/clang/Basic/AttributeScopeInfo.h  |  48 +++++
 clang/include/clang/Sema/ParsedAttr.h         | 184 +++++++++---------
 clang/lib/AST/ASTImporter.cpp                 |   5 +-
 clang/lib/Basic/Attributes.cpp                |  55 ++++--
 clang/lib/Parse/ParseDecl.cpp                 |  96 ++++-----
 clang/lib/Parse/ParseDeclCXX.cpp              |  37 ++--
 clang/lib/Parse/ParseExprCXX.cpp              |   4 +-
 clang/lib/Parse/ParseHLSL.cpp                 |   4 +-
 clang/lib/Parse/ParseObjc.cpp                 |   2 +-
 clang/lib/Parse/ParsePragma.cpp               |   2 +-
 clang/lib/Parse/ParseStmt.cpp                 |   4 +-
 clang/lib/Sema/SemaAPINotes.cpp               |   7 +-
 clang/lib/Sema/SemaDeclAttr.cpp               |  58 ++++--
 clang/lib/Sema/SemaDeclCXX.cpp                |   3 +-
 clang/lib/Sema/SemaStmtAttr.cpp               |  14 +-
 clang/lib/Sema/SemaType.cpp                   |  10 +-
 clang/lib/Serialization/ASTReaderDecl.cpp     |   4 +-
 .../dcl.module/dcl.module.import/p1.cppm      |   2 +-
 clang/test/FixIt/fixit-unknown-attributes.cpp |  74 +++++++
 .../Parser/cxx11-base-spec-attributes.cpp     |   2 +-
 clang/test/Parser/objcxx11-attributes.mm      |   2 +-
 clang/test/Sema/unknown-attributes.c          |  11 +-
 ...attr-non-x86-no_caller_saved_registers.cpp |   2 +-
 24 files changed, 432 insertions(+), 248 deletions(-)
 create mode 100644 clang/include/clang/Basic/AttributeScopeInfo.h
 create mode 100644 clang/test/FixIt/fixit-unknown-attributes.cpp

diff --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h
index 34fc774362557..21a7a88a3fb98 100644
--- a/clang/include/clang/Basic/AttributeCommonInfo.h
+++ b/clang/include/clang/Basic/AttributeCommonInfo.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_CLANG_BASIC_ATTRIBUTECOMMONINFO_H
 #define LLVM_CLANG_BASIC_ATTRIBUTECOMMONINFO_H
 
+#include "clang/Basic/AttributeScopeInfo.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/TokenKinds.h"
 
@@ -61,6 +62,7 @@ class AttributeCommonInfo {
     /// implicitly.
     AS_Implicit
   };
+
   enum Kind {
 #define PARSED_ATTR(NAME) AT_##NAME,
 #include "clang/Basic/AttrParsedAttrList.inc"
@@ -78,9 +80,9 @@ class AttributeCommonInfo {
 
 private:
   const IdentifierInfo *AttrName = nullptr;
-  const IdentifierInfo *ScopeName = nullptr;
+  AttributeScopeInfo AttrScope;
   SourceRange AttrRange;
-  const SourceLocation ScopeLoc;
+
   // Corresponds to the Kind enum.
   LLVM_PREFERRED_TYPE(Kind)
   unsigned AttrKind : 16;
@@ -146,11 +148,10 @@ class AttributeCommonInfo {
   };
 
   AttributeCommonInfo(const IdentifierInfo *AttrName,
-                      const IdentifierInfo *ScopeName, SourceRange AttrRange,
-                      SourceLocation ScopeLoc, Kind AttrKind, Form FormUsed)
-      : AttrName(AttrName), ScopeName(ScopeName), AttrRange(AttrRange),
-        ScopeLoc(ScopeLoc), AttrKind(AttrKind),
-        SyntaxUsed(FormUsed.getSyntax()),
+                      AttributeScopeInfo AttrScope, SourceRange AttrRange,
+                      Kind AttrKind, Form FormUsed)
+      : AttrName(AttrName), AttrScope(AttrScope), AttrRange(AttrRange),
+        AttrKind(AttrKind), SyntaxUsed(FormUsed.getSyntax()),
         SpellingIndex(FormUsed.getSpellingIndex()),
         IsAlignas(FormUsed.isAlignas()),
         IsRegularKeywordAttribute(FormUsed.isRegularKeywordAttribute()) {
@@ -158,21 +159,20 @@ class AttributeCommonInfo {
            "Invalid syntax!");
   }
 
-  AttributeCommonInfo(const IdentifierInfo *AttrName,
-                      const IdentifierInfo *ScopeName, SourceRange AttrRange,
-                      SourceLocation ScopeLoc, Form FormUsed)
+  AttributeCommonInfo(const IdentifierInfo *AttrName, AttributeScopeInfo Scope,
+                      SourceRange AttrRange, Form FormUsed)
       : AttributeCommonInfo(
-            AttrName, ScopeName, AttrRange, ScopeLoc,
-            getParsedKind(AttrName, ScopeName, FormUsed.getSyntax()),
+            AttrName, Scope, AttrRange,
+            getParsedKind(AttrName, Scope.getName(), FormUsed.getSyntax()),
             FormUsed) {}
 
   AttributeCommonInfo(const IdentifierInfo *AttrName, SourceRange AttrRange,
                       Form FormUsed)
-      : AttributeCommonInfo(AttrName, nullptr, AttrRange, SourceLocation(),
+      : AttributeCommonInfo(AttrName, AttributeScopeInfo(), AttrRange,
                             FormUsed) {}
 
   AttributeCommonInfo(SourceRange AttrRange, Kind K, Form FormUsed)
-      : AttributeCommonInfo(nullptr, nullptr, AttrRange, SourceLocation(), K,
+      : AttributeCommonInfo(nullptr, AttributeScopeInfo(), AttrRange, K,
                             FormUsed) {}
 
   AttributeCommonInfo(AttributeCommonInfo &&) = default;
@@ -190,17 +190,27 @@ class AttributeCommonInfo {
   SourceRange getRange() const { return AttrRange; }
   void setRange(SourceRange R) { AttrRange = R; }
 
-  bool hasScope() const { return ScopeName; }
-  const IdentifierInfo *getScopeName() const { return ScopeName; }
-  SourceLocation getScopeLoc() const { return ScopeLoc; }
+  bool hasScope() const { return AttrScope.isValid(); }
+  bool isExplicitScope() const { return AttrScope.isExplicit(); }
+
+  const IdentifierInfo *getScopeName() const { return AttrScope.getName(); }
+  SourceLocation getScopeLoc() const { return AttrScope.getNameLoc(); }
 
   /// Gets the normalized full name, which consists of both scope and name and
   /// with surrounding underscores removed as appropriate (e.g.
   /// __gnu__::__attr__ will be normalized to gnu::attr).
   std::string getNormalizedFullName() const;
-  std::optional<std::string>
-  getCorrectedFullName(const TargetInfo &Target,
-                       const LangOptions &LangOpts) const;
+  std::string getNormalizedFullName(StringRef ScopeName,
+                                    StringRef AttrName) const;
+  StringRef getNormalizedScopeName() const;
+  StringRef getNormalizedAttrName(StringRef ScopeName) const;
+
+  std::optional<StringRef> tryGetCorrectedScopeName(StringRef ScopeName) const;
+  std::optional<StringRef>
+  tryGetCorrectedAttrName(StringRef ScopeName, StringRef AttrName,
+                          const TargetInfo &Target,
+                          const LangOptions &LangOpts) const;
+
   SourceRange getNormalizedRange() const;
 
   bool isDeclspecAttribute() const { return SyntaxUsed == AS_Declspec; }
diff --git a/clang/include/clang/Basic/AttributeScopeInfo.h b/clang/include/clang/Basic/AttributeScopeInfo.h
new file mode 100644
index 0000000000000..cca4df7c11b02
--- /dev/null
+++ b/clang/include/clang/Basic/AttributeScopeInfo.h
@@ -0,0 +1,48 @@
+//==- AttributeScopeInfo.h - Base info about an Attribute Scope --*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the AttributeScopeInfo type, which represents information
+// about the scope of an attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_ATTRIBUTESCOPEINFO_H
+#define LLVM_CLANG_BASIC_ATTRIBUTESCOPEINFO_H
+
+#include "clang/Basic/SourceLocation.h"
+
+namespace clang {
+
+class IdentifierInfo;
+
+class AttributeScopeInfo {
+public:
+  AttributeScopeInfo() = default;
+
+  AttributeScopeInfo(const IdentifierInfo *Name, SourceLocation NameLoc)
+      : Name(Name), NameLoc(NameLoc) {}
+
+  AttributeScopeInfo(const IdentifierInfo *Name, SourceLocation NameLoc,
+                     SourceLocation CommonScopeLoc)
+      : Name(Name), NameLoc(NameLoc), CommonScopeLoc(CommonScopeLoc) {}
+
+  const IdentifierInfo *getName() const { return Name; }
+  SourceLocation getNameLoc() const { return NameLoc; }
+
+  bool isValid() const { return Name != nullptr; }
+  bool isExplicit() const { return CommonScopeLoc.isInvalid(); }
+
+private:
+  const IdentifierInfo *Name = nullptr;
+  SourceLocation NameLoc;
+  SourceLocation CommonScopeLoc;
+};
+
+} // namespace clang
+
+#endif // LLVM_CLANG_BASIC_ATTRIBUTESCOPEINFO_H
diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h
index 9e050ab9a620e..6b3c5a173417a 100644
--- a/clang/include/clang/Sema/ParsedAttr.h
+++ b/clang/include/clang/Sema/ParsedAttr.h
@@ -204,10 +204,9 @@ class ParsedAttr final
 
   /// Constructor for attributes with expression arguments.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             ArgsUnion *args, unsigned numArgs, Form formUsed,
-             SourceLocation ellipsisLoc)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
+             AttributeScopeInfo scope, ArgsUnion *args, unsigned numArgs,
+             Form formUsed, SourceLocation ellipsisLoc)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed),
         EllipsisLoc(ellipsisLoc), NumArgs(numArgs), Invalid(false),
         UsedAsTypeAttr(false), IsAvailability(false),
         IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
@@ -219,14 +218,14 @@ class ParsedAttr final
 
   /// Constructor for availability attributes.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             IdentifierLoc *Parm, const AvailabilityChange &introduced,
+             AttributeScopeInfo scope, IdentifierLoc *Parm,
+             const AvailabilityChange &introduced,
              const AvailabilityChange &deprecated,
              const AvailabilityChange &obsoleted, SourceLocation unavailable,
              const Expr *messageExpr, Form formUsed, SourceLocation strict,
              const Expr *replacementExpr, const IdentifierLoc *environmentLoc)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
-        NumArgs(1), Invalid(false), UsedAsTypeAttr(false), IsAvailability(true),
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed), NumArgs(1),
+        Invalid(false), UsedAsTypeAttr(false), IsAvailability(true),
         IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
         HasProcessingCache(false), IsPragmaClangAttribute(false),
         UnavailableLoc(unavailable), MessageExpr(messageExpr),
@@ -240,14 +239,13 @@ class ParsedAttr final
 
   /// Constructor for objc_bridge_related attributes.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             IdentifierLoc *Parm1, IdentifierLoc *Parm2, IdentifierLoc *Parm3,
-             Form formUsed)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
-        NumArgs(3), Invalid(false), UsedAsTypeAttr(false),
-        IsAvailability(false), IsTypeTagForDatatype(false), IsProperty(false),
-        HasParsedType(false), HasProcessingCache(false),
-        IsPragmaClangAttribute(false), Info(ParsedAttrInfo::get(*this)) {
+             AttributeScopeInfo scope, IdentifierLoc *Parm1,
+             IdentifierLoc *Parm2, IdentifierLoc *Parm3, Form formUsed)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed), NumArgs(3),
+        Invalid(false), UsedAsTypeAttr(false), IsAvailability(false),
+        IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(false),
+        HasProcessingCache(false), IsPragmaClangAttribute(false),
+        Info(ParsedAttrInfo::get(*this)) {
     ArgsUnion *Args = getArgsBuffer();
     Args[0] = Parm1;
     Args[1] = Parm2;
@@ -256,14 +254,14 @@ class ParsedAttr final
 
   /// Constructor for type_tag_for_datatype attribute.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             IdentifierLoc *ArgKind, ParsedType matchingCType,
-             bool layoutCompatible, bool mustBeNull, Form formUsed)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
-        NumArgs(1), Invalid(false), UsedAsTypeAttr(false),
-        IsAvailability(false), IsTypeTagForDatatype(true), IsProperty(false),
-        HasParsedType(false), HasProcessingCache(false),
-        IsPragmaClangAttribute(false), Info(ParsedAttrInfo::get(*this)) {
+             AttributeScopeInfo scope, IdentifierLoc *ArgKind,
+             ParsedType matchingCType, bool layoutCompatible, bool mustBeNull,
+             Form formUsed)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed), NumArgs(1),
+        Invalid(false), UsedAsTypeAttr(false), IsAvailability(false),
+        IsTypeTagForDatatype(true), IsProperty(false), HasParsedType(false),
+        HasProcessingCache(false), IsPragmaClangAttribute(false),
+        Info(ParsedAttrInfo::get(*this)) {
     ArgsUnion PVal(ArgKind);
     memcpy(getArgsBuffer(), &PVal, sizeof(ArgsUnion));
     detail::TypeTagForDatatypeData &ExtraData = getTypeTagForDatatypeDataSlot();
@@ -274,9 +272,9 @@ class ParsedAttr final
 
   /// Constructor for attributes with a single type argument.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             ParsedType typeArg, Form formUsed, SourceLocation ellipsisLoc)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
+             AttributeScopeInfo scope, ParsedType typeArg, Form formUsed,
+             SourceLocation ellipsisLoc)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed),
         EllipsisLoc(ellipsisLoc), NumArgs(0), Invalid(false),
         UsedAsTypeAttr(false), IsAvailability(false),
         IsTypeTagForDatatype(false), IsProperty(false), HasParsedType(true),
@@ -287,13 +285,13 @@ class ParsedAttr final
 
   /// Constructor for microsoft __declspec(property) attribute.
   ParsedAttr(IdentifierInfo *attrName, SourceRange attrRange,
-             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-             IdentifierInfo *getterId, IdentifierInfo *setterId, Form formUsed)
-      : AttributeCommonInfo(attrName, scopeName, attrRange, scopeLoc, formUsed),
-        NumArgs(0), Invalid(false), UsedAsTypeAttr(false),
-        IsAvailability(false), IsTypeTagForDatatype(false), IsProperty(true),
-        HasParsedType(false), HasProcessingCache(false),
-        IsPragmaClangAttribute(false), Info(ParsedAttrInfo::get(*this)) {
+             AttributeScopeInfo scope, IdentifierInfo *getterId,
+             IdentifierInfo *setterId, Form formUsed)
+      : AttributeCommonInfo(attrName, scope, attrRange, formUsed), NumArgs(0),
+        Invalid(false), UsedAsTypeAttr(false), IsAvailability(false),
+        IsTypeTagForDatatype(false), IsProperty(true), HasParsedType(false),
+        HasProcessingCache(false), IsPragmaClangAttribute(false),
+        Info(ParsedAttrInfo::get(*this)) {
     new (&getPropertyDataBuffer()) detail::PropertyData(getterId, setterId);
   }
 
@@ -735,21 +733,21 @@ class AttributePool {
   void takeFrom(ParsedAttributesView &List, AttributePool &Pool);
 
   ParsedAttr *create(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     ArgsUnion *args, unsigned numArgs, ParsedAttr::Form form,
+                     AttributeScopeInfo scope, ArgsUnion *args,
+                     unsigned numArgs, ParsedAttr::Form form,
                      SourceLocation ellipsisLoc = SourceLocation()) {
     void *memory = allocate(
         ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::AvailabilityData,
                                      detail::TypeTagForDatatypeData, ParsedType,
                                      detail::PropertyData>(numArgs, 0, 0, 0,
                                                            0));
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       args, numArgs, form, ellipsisLoc));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, args,
+                                       numArgs, form, ellipsisLoc));
   }
 
   ParsedAttr *create(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierLoc *Param, const AvailabilityChange &introduced,
+                     AttributeScopeInfo scope, IdentifierLoc *Param,
+                     const AvailabilityChange &introduced,
                      const AvailabilityChange &deprecated,
                      const AvailabilityChange &obsoleted,
                      SourceLocation unavailable, const Expr *MessageExpr,
@@ -757,58 +755,54 @@ class AttributePool {
                      const Expr *ReplacementExpr,
                      IdentifierLoc *EnvironmentLoc) {
     void *memory = allocate(AttributeFactory::AvailabilityAllocSize);
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       Param, introduced, deprecated, obsoleted,
-                                       unavailable, MessageExpr, form, strict,
-                                       ReplacementExpr, EnvironmentLoc));
+    return add(new (memory)
+                   ParsedAttr(attrName, attrRange, scope, Param, introduced,
+                              deprecated, obsoleted, unavailable, MessageExpr,
+                              form, strict, ReplacementExpr, EnvironmentLoc));
   }
 
   ParsedAttr *create(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierLoc *Param1, IdentifierLoc *Param2,
-                     IdentifierLoc *Param3, ParsedAttr::Form form) {
+                     AttributeScopeInfo scope, IdentifierLoc *Param1,
+                     IdentifierLoc *Param2, IdentifierLoc *Param3,
+                     ParsedAttr::Form form) {
     void *memory = allocate(
         ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::AvailabilityData,
                                      detail::TypeTagForDatatypeData, ParsedType,
                                      detail::PropertyData>(3, 0, 0, 0, 0));
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       Param1, Param2, Param3, form));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, Param1,
+                                       Param2, Param3, form));
   }
 
-  ParsedAttr *
-  createTypeTagForDatatype(IdentifierInfo *attrName, SourceRange attrRange,
-                           IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                           IdentifierLoc *argumentKind,
-                           ParsedType matchingCType, bool layoutCompatible,
-                           bool mustBeNull, ParsedAttr::Form form) {
+  ParsedAttr *createTypeTagForDatatype(
+      IdentifierInfo *attrName, SourceRange attrRange, AttributeScopeInfo scope,
+      IdentifierLoc *argumentKind, ParsedType matchingCType,
+      bool layoutCompatible, bool mustBeNull, ParsedAttr::Form form) {
     void *memory = allocate(AttributeFactory::TypeTagForDatatypeAllocSize);
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       argumentKind, matchingCType,
-                                       layoutCompatible, mustBeNull, form));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, argumentKind,
+                                       matchingCType, layoutCompatible,
+                                       mustBeNull, form));
   }
 
   ParsedAttr *createTypeAttribute(IdentifierInfo *attrName,
                                   SourceRange attrRange,
-                                  IdentifierInfo *scopeName,
-                                  SourceLocation scopeLoc, ParsedType typeArg,
+                                  AttributeScopeInfo scope, ParsedType typeArg,
                                   ParsedAttr::Form formUsed,
                                   SourceLocation ellipsisLoc) {
     void *memory = allocate(
         ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::AvailabilityData,
                                      detail::TypeTagForDatatypeData, ParsedType,
                                      detail::PropertyData>(0, 0, 0, 1, 0));
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       typeArg, formUsed, ellipsisLoc));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, typeArg,
+                                       formUsed, ellipsisLoc));
   }
 
   ParsedAttr *
   createPropertyAttribute(IdentifierInfo *attrName, SourceRange attrRange,
-                          IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                          IdentifierInfo *getterId, IdentifierInfo *setterId,
-                          ParsedAttr::Form formUsed) {
+                          AttributeScopeInfo scope, IdentifierInfo *getterId,
+                          IdentifierInfo *setterId, ParsedAttr::Form formUsed) {
     void *memory = allocate(AttributeFactory::PropertyAllocSize);
-    return add(new (memory) ParsedAttr(attrName, attrRange, scopeName, scopeLoc,
-                                       getterId, setterId, formUsed));
+    return add(new (memory) ParsedAttr(attrName, attrRange, scope, getterId,
+                                       setterId, formUsed));
   }
 };
 
@@ -982,19 +976,19 @@ class ParsedAttributes : public ParsedAttributesView {
 
   /// Add attribute with expression arguments.
   ParsedAttr *addNew(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     ArgsUnion *args, unsigned numArgs, ParsedAttr::Form form,
+                     AttributeScopeInfo scope, ArgsUnion *args,
+                     unsigned numArgs, ParsedAttr::Form form,
                      SourceLocation ellipsisLoc = SourceLocation()) {
-    ParsedAttr *attr = pool.create(attrName, attrRange, scopeName, scopeLoc,
-                                   args, numArgs, form, ellipsisLoc);
+    ParsedAttr *attr = pool.create(attrName, attrRange, scope, args, numArgs,
+                                   form, ellipsisLoc);
     addAtEnd(attr);
     return attr;
   }
 
   /// Add availability attribute.
   ParsedAttr *addNew(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierLoc *Param, const AvailabilityChange &introduced,
+                     AttributeScopeInfo scope, IdentifierLoc *Param,
+                     const AvailabilityChange &introduced,
                      const AvailabilityChange &deprecated,
                      const AvailabilityChange &obsoleted,
                      SourceLocation unavailable, const Expr *MessageExpr,
@@ -1002,33 +996,31 @@ class ParsedAttributes : public ParsedAttributesView {
                      const Expr *ReplacementExpr,
                      IdentifierLoc *EnvironmentLoc) {
     ParsedAttr *attr =
-        pool.create(attrName, attrRange, scopeName, scopeLoc, Param, introduced,
-                    deprecated, obsoleted, unavailable, MessageExpr, form,
-                    strict, ReplacementExpr, EnvironmentLoc);
+        pool.create(attrName, attrRange, scope, Param, introduced, deprecated,
+                    obsoleted, unavailable, MessageExpr, form, strict,
+                    ReplacementExpr, EnvironmentLoc);
     addAtEnd(attr);
     return attr;
   }
 
   /// Add objc_bridge_related attribute.
   ParsedAttr *addNew(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierLoc *Param1, IdentifierLoc *Param2,
-                     IdentifierLoc *Param3, ParsedAttr::Form form) {
-    ParsedAttr *attr = pool.create(attrName, attrRange, scopeName, scopeLoc,
-                                   Param1, Param2, Param3, form);
+                     AttributeScopeInfo scope, IdentifierLoc *Param1,
+                     IdentifierLoc *Param2, IdentifierLoc *Param3,
+                     ParsedAttr::Form form) {
+    ParsedAttr *attr =
+        pool.create(attrName, attrRange, scope, Param1, Param2, Param3, form);
     addAtEnd(attr);
     return attr;
   }
 
   /// Add type_tag_for_datatype attribute.
-  ParsedAttr *
-  addNewTypeTagForDatatype(IdentifierInfo *attrName, SourceRange attrRange,
-                           IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                           IdentifierLoc *argumentKind,
-                           ParsedType matchingCType, bool layoutCompatible,
-                           bool mustBeNull, ParsedAttr::Form form) {
+  ParsedAttr *addNewTypeTagForDatatype(
+      IdentifierInfo *attrName, SourceRange attrRange, AttributeScopeInfo scope,
+      IdentifierLoc *argumentKind, ParsedType matchingCType,
+      bool layoutCompatible, bool mustBeNull, ParsedAttr::Form form) {
     ParsedAttr *attr = pool.createTypeTagForDatatype(
-        attrName, attrRange, scopeName, scopeLoc, argumentKind, matchingCType,
+        attrName, attrRange, scope, argumentKind, matchingCType,
         layoutCompatible, mustBeNull, form);
     addAtEnd(attr);
     return attr;
@@ -1036,12 +1028,11 @@ class ParsedAttributes : public ParsedAttributesView {
 
   /// Add an attribute with a single type argument.
   ParsedAttr *addNewTypeAttr(IdentifierInfo *attrName, SourceRange attrRange,
-                             IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                             ParsedType typeArg, ParsedAttr::Form formUsed,
+                             AttributeScopeInfo scope, ParsedType typeArg,
+                             ParsedAttr::Form formUsed,
                              SourceLocation ellipsisLoc = SourceLocation()) {
-    ParsedAttr *attr =
-        pool.createTypeAttribute(attrName, attrRange, scopeName, scopeLoc,
-                                 typeArg, formUsed, ellipsisLoc);
+    ParsedAttr *attr = pool.createTypeAttribute(attrName, attrRange, scope,
+                                                typeArg, formUsed, ellipsisLoc);
     addAtEnd(attr);
     return attr;
   }
@@ -1049,11 +1040,10 @@ class ParsedAttributes : public ParsedAttributesView {
   /// Add microsoft __delspec(property) attribute.
   ParsedAttr *
   addNewPropertyAttr(IdentifierInfo *attrName, SourceRange attrRange,
-                     IdentifierInfo *scopeName, SourceLocation scopeLoc,
-                     IdentifierInfo *getterId, IdentifierInfo *setterId,
-                     ParsedAttr::Form formUsed) {
+                     AttributeScopeInfo scope, IdentifierInfo *getterId,
+                     IdentifierInfo *setterId, ParsedAttr::Form formUsed) {
     ParsedAttr *attr = pool.createPropertyAttribute(
-        attrName, attrRange, scopeName, scopeLoc, getterId, setterId, formUsed);
+        attrName, attrRange, scope, getterId, setterId, formUsed);
     addAtEnd(attr);
     return attr;
   }
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 003bad225e30c..5c44353d8b987 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -9333,8 +9333,9 @@ class AttrImporter {
     if (Err)
       return;
 
-    AttributeCommonInfo ToI(ToAttrName, ToScopeName, ToAttrRange, ToScopeLoc,
-                            FromAttr->getParsedKind(), FromAttr->getForm());
+    AttributeCommonInfo ToI(
+        ToAttrName, AttributeScopeInfo(ToScopeName, ToScopeLoc), ToAttrRange,
+        FromAttr->getParsedKind(), FromAttr->getForm());
     // The "SemanticSpelling" is not needed to be passed to the constructor.
     // That value is recalculated from the SpellingListIndex if needed.
     ToAttr = T::Create(Importer.getToContext(),
diff --git a/clang/lib/Basic/Attributes.cpp b/clang/lib/Basic/Attributes.cpp
index 905046685934b..004e5209a44a7 100644
--- a/clang/lib/Basic/Attributes.cpp
+++ b/clang/lib/Basic/Attributes.cpp
@@ -119,7 +119,6 @@ normalizeAttrScopeName(const IdentifierInfo *ScopeName,
                        AttributeCommonInfo::Syntax SyntaxUsed) {
   if (ScopeName)
     return normalizeAttrScopeName(ScopeName->getName(), SyntaxUsed);
-
   return "";
 }
 
@@ -141,12 +140,23 @@ static StringRef normalizeAttrName(StringRef AttrName,
   return AttrName;
 }
 
+StringRef AttributeCommonInfo::getNormalizedScopeName() const {
+  return normalizeAttrScopeName(getScopeName(), getSyntax());
+}
+
+StringRef
+AttributeCommonInfo::getNormalizedAttrName(StringRef ScopeName) const {
+  return normalizeAttrName(getAttrName()->getName(), ScopeName, getSyntax());
+}
+
 bool AttributeCommonInfo::isGNUScope() const {
-  return ScopeName && (ScopeName->isStr("gnu") || ScopeName->isStr("__gnu__"));
+  return AttrScope.isValid() && (AttrScope.getName()->isStr("gnu") ||
+                                 AttrScope.getName()->isStr("__gnu__"));
 }
 
 bool AttributeCommonInfo::isClangScope() const {
-  return ScopeName && (ScopeName->isStr("clang") || ScopeName->isStr("_Clang"));
+  return AttrScope.isValid() && (AttrScope.getName()->isStr("clang") ||
+                                 AttrScope.getName()->isStr("_Clang"));
 }
 
 #include "clang/Sema/AttrParsedAttrKinds.inc"
@@ -198,8 +208,16 @@ std::string AttributeCommonInfo::getNormalizedFullName() const {
       normalizeName(getAttrName(), getScopeName(), getSyntax()));
 }
 
+std::string
+AttributeCommonInfo::getNormalizedFullName(StringRef ScopeName,
+                                           StringRef AttrName) const {
+  return static_cast<std::string>(
+      normalizeName(AttrName, ScopeName, getSyntax()));
+}
+
 SourceRange AttributeCommonInfo::getNormalizedRange() const {
-  return hasScope() ? SourceRange(ScopeLoc, AttrRange.getEnd()) : AttrRange;
+  return hasScope() ? SourceRange(AttrScope.getNameLoc(), AttrRange.getEnd())
+                    : AttrRange;
 }
 
 static AttributeCommonInfo::Scope
@@ -239,10 +257,8 @@ static constexpr const char *AttrScopeSpellingList[] = {
 #include "clang/Basic/AttributeSpellingList.inc"
 };
 
-std::optional<std::string>
-AttributeCommonInfo::getCorrectedFullName(const TargetInfo &Target,
-                                          const LangOptions &LangOpts) const {
-  StringRef ScopeName = normalizeAttrScopeName(getScopeName(), getSyntax());
+std::optional<StringRef>
+AttributeCommonInfo::tryGetCorrectedScopeName(StringRef ScopeName) const {
   if (ScopeName.size() > 0 &&
       llvm::none_of(AttrScopeSpellingList,
                     [&](const char *S) { return S == ScopeName; })) {
@@ -251,25 +267,26 @@ AttributeCommonInfo::getCorrectedFullName(const TargetInfo &Target,
       STC.add(Scope);
 
     if (auto CorrectedScopeName = STC.getCorrection())
-      ScopeName = *CorrectedScopeName;
+      return CorrectedScopeName;
   }
+  return std::nullopt;
+}
 
-  StringRef AttrName =
-      normalizeAttrName(getAttrName()->getName(), ScopeName, getSyntax());
+std::optional<StringRef> AttributeCommonInfo::tryGetCorrectedAttrName(
+    StringRef ScopeName, StringRef AttrName, const TargetInfo &Target,
+    const LangOptions &LangOpts) const {
   if (llvm::none_of(AttrSpellingList,
                     [&](const char *A) { return A == AttrName; })) {
     SimpleTypoCorrection STC(AttrName);
     for (const auto &Attr : AttrSpellingList)
       STC.add(Attr);
 
-    if (auto CorrectedAttrName = STC.getCorrection())
-      AttrName = *CorrectedAttrName;
+    if (auto CorrectedAttrName = STC.getCorrection()) {
+      if (hasAttribute(getSyntax(), ScopeName, *CorrectedAttrName, Target,
+                       LangOpts,
+                       /*CheckPlugins=*/true))
+        return CorrectedAttrName;
+    }
   }
-
-  if (hasAttribute(getSyntax(), ScopeName, AttrName, Target, LangOpts,
-                   /*CheckPlugins=*/true))
-    return static_cast<std::string>(
-        normalizeName(AttrName, ScopeName, getSyntax()));
-
   return std::nullopt;
 }
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 647ee34efcabc..02f33511dbd61 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -151,7 +151,7 @@ bool Parser::ParseSingleGNUAttribute(ParsedAttributes &Attrs,
   SourceLocation AttrNameLoc = ConsumeToken();
 
   if (Tok.isNot(tok::l_paren)) {
-    Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+    Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                  ParsedAttr::Form::GNU());
     return false;
   }
@@ -396,12 +396,12 @@ void Parser::ParseAttributeWithTypeArg(IdentifierInfo &AttrName,
     return;
 
   if (T.isUsable())
-    Attrs.addNewTypeAttr(&AttrName,
-                         SourceRange(AttrNameLoc, Parens.getCloseLocation()),
-                         ScopeName, ScopeLoc, T.get(), Form);
+    Attrs.addNewTypeAttr(
+        &AttrName, SourceRange(AttrNameLoc, Parens.getCloseLocation()),
+        AttributeScopeInfo(ScopeName, ScopeLoc), T.get(), Form);
   else
     Attrs.addNew(&AttrName, SourceRange(AttrNameLoc, Parens.getCloseLocation()),
-                 ScopeName, ScopeLoc, nullptr, 0, Form);
+                 AttributeScopeInfo(ScopeName, ScopeLoc), nullptr, 0, Form);
 }
 
 ExprResult
@@ -609,10 +609,12 @@ unsigned Parser::ParseAttributeArgsCommon(
 
     if (AttributeIsTypeArgAttr && !TheParsedType.get().isNull()) {
       Attrs.addNewTypeAttr(AttrName, SourceRange(AttrNameLoc, RParen),
-                           ScopeName, ScopeLoc, TheParsedType, Form);
+                           AttributeScopeInfo(ScopeName, ScopeLoc),
+                           TheParsedType, Form);
     } else {
-      Attrs.addNew(AttrName, SourceRange(AttrLoc, RParen), ScopeName, ScopeLoc,
-                   ArgExprs.data(), ArgExprs.size(), Form);
+      Attrs.addNew(AttrName, SourceRange(AttrLoc, RParen),
+                   AttributeScopeInfo(ScopeName, ScopeLoc), ArgExprs.data(),
+                   ArgExprs.size(), Form);
     }
   }
 
@@ -854,7 +856,7 @@ bool Parser::ParseMicrosoftDeclSpecArgs(IdentifierInfo *AttrName,
 
     // Only add the property attribute if it was well-formed.
     if (!HasInvalidAccessor)
-      Attrs.addNewPropertyAttr(AttrName, AttrNameLoc, nullptr, SourceLocation(),
+      Attrs.addNewPropertyAttr(AttrName, AttrNameLoc, AttributeScopeInfo(),
                                AccessorNames[AK_Get], AccessorNames[AK_Put],
                                ParsedAttr::Form::Declspec());
     T.skipToEnd();
@@ -940,7 +942,7 @@ void Parser::ParseMicrosoftDeclSpecs(ParsedAttributes &Attrs) {
             << AttrName->getName();
 
       if (!AttrHandled)
-        Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+        Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                      ParsedAttr::Form::Declspec());
     }
     T.consumeClose();
@@ -968,7 +970,7 @@ void Parser::ParseMicrosoftTypeAttributes(ParsedAttributes &attrs) {
     case tok::kw___uptr: {
       IdentifierInfo *AttrName = Tok.getIdentifierInfo();
       SourceLocation AttrNameLoc = ConsumeToken();
-      attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+      attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                    Kind);
       break;
     }
@@ -989,9 +991,8 @@ void Parser::ParseWebAssemblyFuncrefTypeAttribute(ParsedAttributes &attrs) {
 
   IdentifierInfo *AttrName = Tok.getIdentifierInfo();
   SourceLocation AttrNameLoc = ConsumeToken();
-  attrs.addNew(AttrName, AttrNameLoc, /*ScopeName=*/nullptr,
-               /*ScopeLoc=*/SourceLocation{}, /*Args=*/nullptr, /*numArgs=*/0,
-               tok::kw___funcref);
+  attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), /*Args=*/nullptr,
+               /*numArgs=*/0, tok::kw___funcref);
 }
 
 void Parser::DiagnoseAndSkipExtendedMicrosoftTypeAttributes() {
@@ -1035,7 +1036,7 @@ void Parser::ParseBorlandTypeAttributes(ParsedAttributes &attrs) {
   while (Tok.is(tok::kw___pascal)) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                  tok::kw___pascal);
   }
 }
@@ -1045,7 +1046,7 @@ void Parser::ParseOpenCLKernelAttributes(ParsedAttributes &attrs) {
   while (Tok.is(tok::kw___kernel)) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                  tok::kw___kernel);
   }
 }
@@ -1054,7 +1055,7 @@ void Parser::ParseCUDAFunctionAttributes(ParsedAttributes &attrs) {
   while (Tok.is(tok::kw___noinline__)) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                  tok::kw___noinline__);
   }
 }
@@ -1062,7 +1063,7 @@ void Parser::ParseCUDAFunctionAttributes(ParsedAttributes &attrs) {
 void Parser::ParseOpenCLQualifiers(ParsedAttributes &Attrs) {
   IdentifierInfo *AttrName = Tok.getIdentifierInfo();
   SourceLocation AttrNameLoc = Tok.getLocation();
-  Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+  Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                Tok.getKind());
 }
 
@@ -1074,7 +1075,7 @@ void Parser::ParseHLSLQualifiers(ParsedAttributes &Attrs) {
   IdentifierInfo *AttrName = Tok.getIdentifierInfo();
   auto Kind = Tok.getKind();
   SourceLocation AttrNameLoc = ConsumeToken();
-  Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0, Kind);
+  Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0, Kind);
 }
 
 void Parser::ParseNullabilityTypeSpecifiers(ParsedAttributes &attrs) {
@@ -1091,7 +1092,7 @@ void Parser::ParseNullabilityTypeSpecifiers(ParsedAttributes &attrs) {
       if (!getLangOpts().ObjC)
         Diag(AttrNameLoc, diag::ext_nullability)
           << AttrName;
-      attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+      attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                    Kind);
       break;
     }
@@ -1435,10 +1436,11 @@ void Parser::ParseAvailabilityAttribute(
 
   // Record this attribute
   attrs.addNew(&Availability,
-               SourceRange(AvailabilityLoc, T.getCloseLocation()), ScopeName,
-               ScopeLoc, Platform, Changes[Introduced], Changes[Deprecated],
-               Changes[Obsoleted], UnavailableLoc, MessageExpr.get(), Form,
-               StrictLoc, ReplacementExpr.get(), EnvironmentLoc);
+               SourceRange(AvailabilityLoc, T.getCloseLocation()),
+               AttributeScopeInfo(ScopeName, ScopeLoc), Platform,
+               Changes[Introduced], Changes[Deprecated], Changes[Obsoleted],
+               UnavailableLoc, MessageExpr.get(), Form, StrictLoc,
+               ReplacementExpr.get(), EnvironmentLoc);
 }
 
 void Parser::ParseExternalSourceSymbolAttribute(
@@ -1556,7 +1558,8 @@ void Parser::ParseExternalSourceSymbolAttribute(
   ArgsUnion Args[] = {Language.get(), DefinedInExpr.get(), GeneratedDeclaration,
                       USR.get()};
   Attrs.addNew(&ExternalSourceSymbol, SourceRange(Loc, T.getCloseLocation()),
-               ScopeName, ScopeLoc, Args, std::size(Args), Form);
+               AttributeScopeInfo(ScopeName, ScopeLoc), Args, std::size(Args),
+               Form);
 }
 
 void Parser::ParseObjCBridgeRelatedAttribute(
@@ -1624,8 +1627,8 @@ void Parser::ParseObjCBridgeRelatedAttribute(
   // Record this attribute
   Attrs.addNew(&ObjCBridgeRelated,
                SourceRange(ObjCBridgeRelatedLoc, T.getCloseLocation()),
-               ScopeName, ScopeLoc, RelatedClass, ClassMethod, InstanceMethod,
-               Form);
+               AttributeScopeInfo(ScopeName, ScopeLoc), RelatedClass,
+               ClassMethod, InstanceMethod, Form);
 }
 
 void Parser::ParseSwiftNewTypeAttribute(
@@ -1666,7 +1669,8 @@ void Parser::ParseSwiftNewTypeAttribute(
 
   ArgsUnion Args[] = {SwiftType};
   Attrs.addNew(&AttrName, SourceRange(AttrNameLoc, T.getCloseLocation()),
-               ScopeName, ScopeLoc, Args, std::size(Args), Form);
+               AttributeScopeInfo(ScopeName, ScopeLoc), Args, std::size(Args),
+               Form);
 }
 
 void Parser::ParseTypeTagForDatatypeAttribute(
@@ -1719,9 +1723,9 @@ void Parser::ParseTypeTagForDatatypeAttribute(
   }
 
   if (!T.consumeClose()) {
-    Attrs.addNewTypeTagForDatatype(&AttrName, AttrNameLoc, ScopeName, ScopeLoc,
-                                   ArgumentKind, MatchingCType.get(),
-                                   LayoutCompatible, MustBeNull, Form);
+    Attrs.addNewTypeTagForDatatype(
+        &AttrName, AttrNameLoc, AttributeScopeInfo(ScopeName, ScopeLoc),
+        ArgumentKind, MatchingCType.get(), LayoutCompatible, MustBeNull, Form);
   }
 
   if (EndLoc)
@@ -1828,9 +1832,10 @@ void Parser::ProhibitCXX11Attributes(ParsedAttributes &Attrs,
     if (!AL.isStandardAttributeSyntax())
       continue;
     if (AL.getKind() == ParsedAttr::UnknownAttribute) {
-      if (WarnOnUnknownAttrs)
-        Diag(AL.getLoc(), diag::warn_unknown_attribute_ignored)
-            << AL << AL.getRange();
+      if (WarnOnUnknownAttrs) {
+        Actions.DiagnoseUnknownAttribute(AL);
+        AL.setInvalid();
+      }
     } else {
       Diag(AL.getLoc(), AttrDiagID) << AL;
       AL.setInvalid();
@@ -3117,12 +3122,12 @@ void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs,
     *EndLoc = T.getCloseLocation();
 
   if (IsType) {
-    Attrs.addNewTypeAttr(KWName, KWLoc, nullptr, KWLoc, TypeResult, Kind,
+    Attrs.addNewTypeAttr(KWName, KWLoc, AttributeScopeInfo(), TypeResult, Kind,
                          EllipsisLoc);
   } else {
     ArgsVector ArgExprs;
     ArgExprs.push_back(ArgExpr.get());
-    Attrs.addNew(KWName, KWLoc, nullptr, KWLoc, ArgExprs.data(), 1, Kind,
+    Attrs.addNew(KWName, KWLoc, AttributeScopeInfo(), ArgExprs.data(), 1, Kind,
                  EllipsisLoc);
   }
 }
@@ -3168,9 +3173,8 @@ void Parser::ParsePtrauthQualifier(ParsedAttributes &Attrs) {
     return;
   }
 
-  Attrs.addNew(KwName, SourceRange(KwLoc, EndLoc),
-               /*scope*/ nullptr, SourceLocation(), ArgExprs.data(),
-               ArgExprs.size(),
+  Attrs.addNew(KwName, SourceRange(KwLoc, EndLoc), AttributeScopeInfo(),
+               ArgExprs.data(), ArgExprs.size(),
                ParsedAttr::Form::Keyword(/*IsAlignAs=*/false,
                                          /*IsRegularKeywordAttribute=*/false));
 }
@@ -3216,7 +3220,7 @@ void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
       Ctx.getSizeType(), SourceLocation()));
 
   Attrs.addNew(&AttrName, SourceRange(AttrNameLoc, Parens.getCloseLocation()),
-               ScopeName, ScopeLoc, ArgExprs.data(), ArgExprs.size(), Form);
+               AttributeScopeInfo(), ArgExprs.data(), ArgExprs.size(), Form);
 }
 
 ExprResult Parser::ParseExtIntegerArgument() {
@@ -3995,7 +3999,7 @@ void Parser::ParseDeclarationSpecifiers(
       isInvalid = DS.setFunctionSpecForceInline(Loc, PrevSpec, DiagID);
       IdentifierInfo *AttrName = Tok.getIdentifierInfo();
       SourceLocation AttrNameLoc = Tok.getLocation();
-      DS.getAttributes().addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc,
+      DS.getAttributes().addNew(AttrName, AttrNameLoc, AttributeScopeInfo(),
                                 nullptr, 0, tok::kw___forceinline);
       break;
     }
@@ -4053,8 +4057,9 @@ void Parser::ParseDeclarationSpecifiers(
 
     // Objective-C 'kindof' types.
     case tok::kw___kindof:
-      DS.getAttributes().addNew(Tok.getIdentifierInfo(), Loc, nullptr, Loc,
-                                nullptr, 0, tok::kw___kindof);
+      DS.getAttributes().addNew(Tok.getIdentifierInfo(), Loc,
+                                AttributeScopeInfo(), nullptr, 0,
+                                tok::kw___kindof);
       (void)ConsumeToken();
       continue;
 
@@ -6238,8 +6243,9 @@ void Parser::ParseTypeQualifierListOpt(
 
     // Objective-C 'kindof' types.
     case tok::kw___kindof:
-      DS.getAttributes().addNew(Tok.getIdentifierInfo(), Loc, nullptr, Loc,
-                                nullptr, 0, tok::kw___kindof);
+      DS.getAttributes().addNew(Tok.getIdentifierInfo(), Loc,
+                                AttributeScopeInfo(), nullptr, 0,
+                                tok::kw___kindof);
       (void)ConsumeToken();
       continue;
 
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 5f34370aeeb2d..f31c9265a0074 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1430,7 +1430,7 @@ void Parser::ParseMicrosoftInheritanceClassAttributes(ParsedAttributes &attrs) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     auto Kind = Tok.getKind();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0, Kind);
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0, Kind);
   }
 }
 
@@ -1439,7 +1439,7 @@ void Parser::ParseNullabilityClassAttributes(ParsedAttributes &attrs) {
     IdentifierInfo *AttrName = Tok.getIdentifierInfo();
     auto Kind = Tok.getKind();
     SourceLocation AttrNameLoc = ConsumeToken();
-    attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0, Kind);
+    attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0, Kind);
   }
 }
 
@@ -4493,8 +4493,8 @@ bool Parser::ParseCXXAssumeAttributeArg(
   ArgsUnion Assumption = Res.get();
   auto RParen = Tok.getLocation();
   T.consumeClose();
-  Attrs.addNew(AttrName, SourceRange(AttrNameLoc, RParen), ScopeName, ScopeLoc,
-               &Assumption, 1, Form);
+  Attrs.addNew(AttrName, SourceRange(AttrNameLoc, RParen),
+               AttributeScopeInfo(ScopeName, ScopeLoc), &Assumption, 1, Form);
 
   if (EndLoc)
     *EndLoc = RParen;
@@ -4574,7 +4574,7 @@ bool Parser::ParseCXX11AttributeArgs(
 
     // Ignore attributes that don't exist for the target.
     if (!Attr.existsInTarget(getTargetInfo())) {
-      Diag(LParenLoc, diag::warn_unknown_attribute_ignored) << AttrName;
+      Actions.DiagnoseUnknownAttribute(Attr);
       Attr.setInvalid(true);
       return true;
     }
@@ -4629,7 +4629,7 @@ void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs,
                                  /*ScopeName*/ nullptr,
                                  /*ScopeLoc*/ Loc, Form);
     } else
-      Attrs.addNew(AttrName, Loc, nullptr, Loc, nullptr, 0, Form);
+      Attrs.addNew(AttrName, Loc, AttributeScopeInfo(), nullptr, 0, Form);
     return;
   }
 
@@ -4724,12 +4724,15 @@ void Parser::ParseCXX11AttributeSpecifierInternal(ParsedAttributes &Attrs,
                                            ScopeName, ScopeLoc, OpenMPTokens);
 
     if (!AttrParsed) {
-      Attrs.addNew(
-          AttrName,
-          SourceRange(ScopeLoc.isValid() ? ScopeLoc : AttrLoc, AttrLoc),
-          ScopeName, ScopeLoc, nullptr, 0,
-          getLangOpts().CPlusPlus ? ParsedAttr::Form::CXX11()
-                                  : ParsedAttr::Form::C23());
+      Attrs.addNew(AttrName,
+                   SourceRange(ScopeLoc.isValid() && CommonScopeLoc.isInvalid()
+                                   ? ScopeLoc
+                                   : AttrLoc,
+                               AttrLoc),
+                   AttributeScopeInfo(ScopeName, ScopeLoc, CommonScopeLoc),
+                   nullptr, 0,
+                   getLangOpts().CPlusPlus ? ParsedAttr::Form::CXX11()
+                                           : ParsedAttr::Form::C23());
       AttrParsed = true;
     }
 
@@ -4890,8 +4893,8 @@ void Parser::ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs) {
   }
 
   if (!T.consumeClose()) {
-    Attrs.addNew(UuidIdent, SourceRange(UuidLoc, T.getCloseLocation()), nullptr,
-                 SourceLocation(), ArgExprs.data(), ArgExprs.size(),
+    Attrs.addNew(UuidIdent, SourceRange(UuidLoc, T.getCloseLocation()),
+                 AttributeScopeInfo(), ArgExprs.data(), ArgExprs.size(),
                  ParsedAttr::Form::Microsoft());
   }
 }
@@ -4975,8 +4978,8 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
 
   if (!T.consumeClose())
     Attrs.addNew(RootSignatureIdent,
-                 SourceRange(RootSignatureLoc, T.getCloseLocation()), nullptr,
-                 SourceLocation(), Args.data(), Args.size(),
+                 SourceRange(RootSignatureLoc, T.getCloseLocation()),
+                 AttributeScopeInfo(), Args.data(), Args.size(),
                  ParsedAttr::Form::Microsoft());
 }
 
@@ -5026,7 +5029,7 @@ void Parser::ParseMicrosoftAttributes(ParsedAttributes &Attrs) {
             ReplayOpenMPAttributeTokens(OpenMPTokens);
           }
           if (!AttrParsed) {
-            Attrs.addNew(II, NameLoc, nullptr, SourceLocation(), nullptr, 0,
+            Attrs.addNew(II, NameLoc, AttributeScopeInfo(), nullptr, 0,
                          ParsedAttr::Form::Microsoft());
           }
         }
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 329572047da04..1ea0cf52933f6 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -1234,8 +1234,8 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
       if (Tok.is(tok::kw___noinline__)) {
         IdentifierInfo *AttrName = Tok.getIdentifierInfo();
         SourceLocation AttrNameLoc = ConsumeToken();
-        Attributes.addNew(AttrName, AttrNameLoc, /*ScopeName=*/nullptr,
-                          AttrNameLoc, /*ArgsUnion=*/nullptr,
+        Attributes.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(),
+                          /*ArgsUnion=*/nullptr,
                           /*numArgs=*/0, tok::kw___noinline__);
       } else if (Tok.is(tok::kw___attribute))
         ParseGNUAttributes(Attributes, /*LatePArsedAttrList=*/nullptr, &D);
diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp
index 53d46465e3362..e6caa81b309ca 100644
--- a/clang/lib/Parse/ParseHLSL.cpp
+++ b/clang/lib/Parse/ParseHLSL.cpp
@@ -296,6 +296,6 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs,
     break;
   }
 
-  Attrs.addNew(II, Loc, nullptr, SourceLocation(), ArgExprs.data(),
-               ArgExprs.size(), ParsedAttr::Form::HLSLAnnotation());
+  Attrs.addNew(II, Loc, AttributeScopeInfo(), ArgExprs.data(), ArgExprs.size(),
+               ParsedAttr::Form::HLSLAnnotation());
 }
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 8ef16a4d3808a..291c70e7bad4b 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -370,7 +370,7 @@ static void addContextSensitiveTypeNullability(Parser &P,
   // Create the attribute.
   auto getNullabilityAttr = [&](AttributePool &Pool) -> ParsedAttr * {
     return Pool.create(P.getNullabilityKeyword(nullability),
-                       SourceRange(nullabilityLoc), nullptr, SourceLocation(),
+                       SourceRange(nullabilityLoc), AttributeScopeInfo(),
                        nullptr, 0, ParsedAttr::Form::ContextSensitiveKeyword());
   };
 
diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp
index 6341e565b5042..98933811265e8 100644
--- a/clang/lib/Parse/ParsePragma.cpp
+++ b/clang/lib/Parse/ParsePragma.cpp
@@ -1926,7 +1926,7 @@ void Parser::HandlePragmaAttribute() {
       SourceLocation AttrNameLoc = ConsumeToken();
 
       if (Tok.isNot(tok::l_paren))
-        Attrs.addNew(AttrName, AttrNameLoc, nullptr, AttrNameLoc, nullptr, 0,
+        Attrs.addNew(AttrName, AttrNameLoc, AttributeScopeInfo(), nullptr, 0,
                      ParsedAttr::Form::GNU());
       else
         ParseGNUAttributeArgs(AttrName, AttrNameLoc, Attrs, /*EndLoc=*/nullptr,
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index c0c9bbc2e15c6..bc40b726bf41b 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -2345,8 +2345,8 @@ StmtResult Parser::ParsePragmaLoopHint(StmtVector &Stmts,
     ArgsUnion ArgHints[] = {Hint.PragmaNameLoc, Hint.OptionLoc, Hint.StateLoc,
                             ArgsUnion(Hint.ValueExpr)};
     TempAttrs.addNew(Hint.PragmaNameLoc->getIdentifierInfo(), Hint.Range,
-                     /*scopeName=*/nullptr, Hint.PragmaNameLoc->getLoc(),
-                     ArgHints, /*numArgs=*/4, ParsedAttr::Form::Pragma());
+                     AttributeScopeInfo(), ArgHints, /*numArgs=*/4,
+                     ParsedAttr::Form::Pragma());
   }
 
   // Get the next statement.
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index def909fc2478d..f21cbbbdb44ee 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -303,10 +303,9 @@ static void ProcessAPINotes(Sema &S, Decl *D,
           AttributeFactory AF{};
           AttributePool AP{AF};
           auto &C = S.getASTContext();
-          ParsedAttr *SNA =
-              AP.create(&C.Idents.get("swift_name"), SourceRange(), nullptr,
-                        SourceLocation(), nullptr, nullptr, nullptr,
-                        ParsedAttr::Form::GNU());
+          ParsedAttr *SNA = AP.create(
+              &C.Idents.get("swift_name"), SourceRange(), AttributeScopeInfo(),
+              nullptr, nullptr, nullptr, ParsedAttr::Form::GNU());
 
           if (!S.Swift().DiagnoseName(D, Info.SwiftName, D->getLocation(), *SNA,
                                       /*IsAsync=*/false))
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 1aeae41042a1c..9c985e6bd5e03 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1986,14 +1986,13 @@ bool Sema::CheckAttrNoArgs(const ParsedAttr &Attrs) {
 bool Sema::CheckAttrTarget(const ParsedAttr &AL) {
   // Check whether the attribute is valid on the current target.
   if (!AL.existsInTarget(Context.getTargetInfo())) {
-    Diag(AL.getLoc(), AL.isRegularKeywordAttribute()
-                          ? diag::err_keyword_not_supported_on_target
-                          : diag::warn_unknown_attribute_ignored)
-        << AL << AL.getRange();
+    if (AL.isRegularKeywordAttribute())
+      Diag(AL.getLoc(), diag::err_keyword_not_supported_on_target);
+    else
+      DiagnoseUnknownAttribute(AL);
     AL.setInvalid();
     return true;
   }
-
   return false;
 }
 
@@ -7956,8 +7955,7 @@ static void checkUnusedDeclAttributes(Sema &S, const ParsedAttributesView &A) {
       continue;
 
     if (AL.getKind() == ParsedAttr::UnknownAttribute) {
-      S.Diag(AL.getLoc(), diag::warn_unknown_attribute_ignored)
-          << AL << AL.getRange();
+      S.DiagnoseUnknownAttribute(AL);
     } else {
       S.Diag(AL.getLoc(), diag::warn_attribute_not_on_decl) << AL
                                                             << AL.getRange();
@@ -7975,15 +7973,45 @@ void Sema::checkUnusedDeclAttributes(Declarator &D) {
 
 void Sema::DiagnoseUnknownAttribute(const ParsedAttr &AL) {
   std::string NormalizedFullName = '\'' + AL.getNormalizedFullName() + '\'';
-  if (auto CorrectedFullName =
-          AL.getCorrectedFullName(Context.getTargetInfo(), getLangOpts())) {
-    Diag(AL.getNormalizedRange().getBegin(),
-         diag::warn_unknown_attribute_ignored_suggestion)
-        << NormalizedFullName << *CorrectedFullName << AL.getNormalizedRange();
+  SourceRange NR = AL.getNormalizedRange();
+
+  StringRef ScopeName = AL.getNormalizedScopeName();
+  std::optional<StringRef> CorrectedScopeName =
+      AL.tryGetCorrectedScopeName(ScopeName);
+  if (CorrectedScopeName) {
+    ScopeName = *CorrectedScopeName;
+  }
+
+  StringRef AttrName = AL.getNormalizedAttrName(ScopeName);
+  std::optional<StringRef> CorrectedAttrName = AL.tryGetCorrectedAttrName(
+      ScopeName, AttrName, Context.getTargetInfo(), getLangOpts());
+  if (CorrectedAttrName) {
+    AttrName = *CorrectedAttrName;
+  }
+
+  if (CorrectedScopeName || CorrectedAttrName) {
+    std::string CorrectedFullName =
+        AL.getNormalizedFullName(ScopeName, AttrName);
+    SemaDiagnosticBuilder D =
+        Diag(CorrectedScopeName ? NR.getBegin() : AL.getRange().getBegin(),
+             diag::warn_unknown_attribute_ignored_suggestion);
+
+    D << NormalizedFullName << CorrectedFullName;
+
+    if (AL.isExplicitScope()) {
+      D << FixItHint::CreateReplacement(NR, CorrectedFullName) << NR;
+    } else {
+      if (CorrectedScopeName) {
+        D << FixItHint::CreateReplacement(SourceRange(AL.getScopeLoc()),
+                                          ScopeName);
+      }
+      if (CorrectedAttrName) {
+        D << FixItHint::CreateReplacement(AL.getRange(), AttrName);
+      }
+    }
   } else {
-    Diag(AL.getNormalizedRange().getBegin(),
-         diag::warn_unknown_attribute_ignored)
-        << NormalizedFullName << AL.getNormalizedRange();
+    Diag(NR.getBegin(), diag::warn_unknown_attribute_ignored)
+        << NormalizedFullName << NR;
   }
 }
 
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 6f62c53aaf04d..16645ecf411e5 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -2865,8 +2865,7 @@ BaseResult Sema::ActOnBaseSpecifier(Decl *classdecl, SourceRange SpecifierRange,
     if (AL.isInvalid() || AL.getKind() == ParsedAttr::IgnoredAttribute)
       continue;
     if (AL.getKind() == ParsedAttr::UnknownAttribute)
-      Diag(AL.getLoc(), diag::warn_unknown_attribute_ignored)
-          << AL << AL.getRange();
+      DiagnoseUnknownAttribute(AL);
     else
       Diag(AL.getLoc(), diag::err_base_specifier_attribute)
           << AL << AL.isRegularKeywordAttribute() << AL.getRange();
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index b78080c991763..857d46af9ada9 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -672,12 +672,14 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
       !(A.existsInTarget(S.Context.getTargetInfo()) ||
         (S.Context.getLangOpts().SYCLIsDevice && Aux &&
          A.existsInTarget(*Aux)))) {
-    S.Diag(A.getLoc(), A.isRegularKeywordAttribute()
-                           ? (unsigned)diag::err_keyword_not_supported_on_target
-                       : A.isDeclspecAttribute()
-                           ? (unsigned)diag::warn_unhandled_ms_attribute_ignored
-                           : (unsigned)diag::warn_unknown_attribute_ignored)
-        << A << A.getRange();
+    if (A.isRegularKeywordAttribute() || A.isDeclspecAttribute()) {
+      S.Diag(A.getLoc(), A.isRegularKeywordAttribute()
+                             ? diag::err_keyword_not_supported_on_target
+                             : diag::warn_unhandled_ms_attribute_ignored)
+          << A << A.getRange();
+    } else {
+      S.DiagnoseUnknownAttribute(A);
+    }
     return nullptr;
   }
 
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index a0cd2d1615243..785d7b89e778e 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -4552,7 +4552,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
                                           false /*IsRegularKeywordAttribute*/);
       ParsedAttr *nullabilityAttr = Pool.create(
           S.getNullabilityKeyword(*inferNullability), SourceRange(pointerLoc),
-          nullptr, SourceLocation(), nullptr, 0, form);
+          AttributeScopeInfo(), nullptr, 0, form);
 
       attrs.addAtEnd(nullabilityAttr);
 
@@ -5735,10 +5735,10 @@ static void transferARCOwnershipToDeclaratorChunk(TypeProcessingState &state,
 
   // If there wasn't one, add one (with an invalid source location
   // so that we don't make an AttributedType for it).
-  ParsedAttr *attr = D.getAttributePool().create(
-      &S.Context.Idents.get("objc_ownership"), SourceLocation(),
-      /*scope*/ nullptr, SourceLocation(),
-      /*args*/ &Args, 1, ParsedAttr::Form::GNU());
+  ParsedAttr *attr =
+      D.getAttributePool().create(&S.Context.Idents.get("objc_ownership"),
+                                  SourceLocation(), AttributeScopeInfo(),
+                                  /*args*/ &Args, 1, ParsedAttr::Form::GNU());
   chunk.getAttrs().addAtEnd(attr);
   // TODO: mark whether we did this inference?
 }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index 8dafefb9696bf..a1368a48351c6 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -3198,8 +3198,8 @@ Attr *ASTRecordReader::readAttr() {
                     SpellingIndex == AlignedAttr::Keyword_alignas);
   bool IsRegularKeywordAttribute = Record.readBool();
 
-  AttributeCommonInfo Info(AttrName, ScopeName, AttrRange, ScopeLoc,
-                           AttributeCommonInfo::Kind(ParsedKind),
+  AttributeCommonInfo Info(AttrName, AttributeScopeInfo(ScopeName, ScopeLoc),
+                           AttrRange, AttributeCommonInfo::Kind(ParsedKind),
                            {AttributeCommonInfo::Syntax(Syntax), SpellingIndex,
                             IsAlignas, IsRegularKeywordAttribute});
 
diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
index 873e4c0edeac2..3670f9430ed4b 100644
--- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
+++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.import/p1.cppm
@@ -45,7 +45,7 @@ import x;
 import x [[]];
 import x [[foo]]; // expected-warning {{unknown attribute 'foo' ignored}}
 import x [[noreturn]]; // expected-error {{'noreturn' attribute cannot be applied to a module import}}
-import x [[blarg::noreturn]]; // expected-warning {{unknown attribute 'noreturn' ignored}}
+import x [[blarg::noreturn]]; // expected-warning {{unknown attribute 'blarg::noreturn' ignored}}
 
 import x.y;
 import x.; // expected-error {{expected a module name after 'import'}}
diff --git a/clang/test/FixIt/fixit-unknown-attributes.cpp b/clang/test/FixIt/fixit-unknown-attributes.cpp
new file mode 100644
index 0000000000000..7dff510f5ddf0
--- /dev/null
+++ b/clang/test/FixIt/fixit-unknown-attributes.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -Wunknown-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -Wunknown-attributes -fsyntax-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+
+[[gmu::deprected]] // expected-warning {{unknown attribute 'gmu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f1(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:3-[[@LINE-4]]:17}:"gnu::deprecated"
+
+[[gmu::deprecated]] // expected-warning {{unknown attribute 'gmu::deprecated' ignored; did you mean 'gnu::deprecated'?}}
+int f2(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:3-[[@LINE-4]]:18}:"gnu::deprecated"
+
+[[gnu::deprected]] // expected-warning {{unknown attribute 'gnu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f3(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:3-[[@LINE-4]]:17}:"gnu::deprecated"
+
+[[deprected]] // expected-warning {{unknown attribute 'deprected' ignored; did you mean 'deprecated'?}}
+int f4(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:3-[[@LINE-4]]:12}:"deprecated"
+
+[[using gmu : deprected]] // expected-warning {{unknown attribute 'gmu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f5(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:9-[[@LINE-4]]:12}:"gnu"
+// CHECK: fix-it:"{{.*}}":{[[@LINE-5]]:15-[[@LINE-5]]:24}:"deprecated"
+
+[[using gmu : deprecated]] // expected-warning {{unknown attribute 'gmu::deprecated' ignored; did you mean 'gnu::deprecated'?}}
+int f6(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:9-[[@LINE-4]]:12}:"gnu"
+
+[[using gnu : deprected]] // expected-warning {{unknown attribute 'gnu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f7(void) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:15-[[@LINE-4]]:24}:"deprecated"
+
+[[using gnu : deprecated, noretyrn]] // expected-warning {{unknown attribute 'gnu::noretyrn' ignored; did you mean 'gnu::noreturn'?}}
+void f8(void) {
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-3]]:27-[[@LINE-3]]:35}:"noreturn"
+
+[[using gmu : deprected, noretyrn]] // expected-warning {{unknown attribute 'gmu::deprected' ignored; did you mean 'gnu::deprecated'?}} \
+                                    // expected-warning {{unknown attribute 'gmu::noretyrn' ignored; did you mean 'gnu::noreturn'?}}
+void f9(void) {
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:9-[[@LINE-4]]:12}:"gnu"
+// CHECK: fix-it:"{{.*}}":{[[@LINE-5]]:15-[[@LINE-5]]:24}:"deprecated"
+
+// CHECK: fix-it:"{{.*}}":{[[@LINE-7]]:9-[[@LINE-7]]:12}:"gnu"
+// CHECK: fix-it:"{{.*}}":{[[@LINE-8]]:26-[[@LINE-8]]:34}:"noreturn"
+
+__attribute__((cont, deprected)) // expected-warning {{unknown attribute 'cont' ignored; did you mean 'const'?}} \
+                                 // expected-warning {{unknown attribute 'deprected' ignored; did you mean 'deprecated'?}}
+int f10(int) {
+  return 0;
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-5]]:16-[[@LINE-5]]:20}:"const"
+// CHECK: fix-it:"{{.*}}":{[[@LINE-6]]:22-[[@LINE-6]]:31}:"deprecated"
+
+[[using gnu: noretyrn, address_spaci(0)]] // expected-warning {{unknown attribute 'gnu::noretyrn' ignored; did you mean 'gnu::noreturn'?}} \
+                                          // expected-warning {{unknown attribute 'gnu::address_spaci' ignored}}
+void f11(void) {
+}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:14-[[@LINE-4]]:22}:"noreturn"
diff --git a/clang/test/Parser/cxx11-base-spec-attributes.cpp b/clang/test/Parser/cxx11-base-spec-attributes.cpp
index 7338c5116c16c..6f2f54ead62bc 100644
--- a/clang/test/Parser/cxx11-base-spec-attributes.cpp
+++ b/clang/test/Parser/cxx11-base-spec-attributes.cpp
@@ -7,4 +7,4 @@ struct D : [[]] public virtual A {};
 struct E : public [[]] virtual A {}; // expected-error {{an attribute list cannot appear here}}
 struct F : virtual [[]] public A {}; // expected-error {{an attribute list cannot appear here}}
 struct G : [[noreturn]] A {}; // expected-error {{'noreturn' attribute cannot be applied to a base specifier}}
-struct H : [[unknown::foobar]] A {}; // expected-warning {{unknown attribute 'foobar' ignored}}
+struct H : [[unknown::foobar]] A {}; // expected-warning {{unknown attribute 'unknown::foobar' ignored}}
diff --git a/clang/test/Parser/objcxx11-attributes.mm b/clang/test/Parser/objcxx11-attributes.mm
index d7ba609ebd74b..88fa3103593ef 100644
--- a/clang/test/Parser/objcxx11-attributes.mm
+++ b/clang/test/Parser/objcxx11-attributes.mm
@@ -57,7 +57,7 @@ void f(X *noreturn) {
 
 template<typename...Ts> void f(Ts ...x) {
   [[test::foo(bar, baz)...]]; // expected-error {{attribute 'foo' cannot be used as an attribute pack}} \
-  // expected-warning {{unknown attribute 'foo' ignored}}
+  // expected-warning {{unknown attribute 'test::foo' ignored}}
 
   [[used(x)...]]; // expected-error {{attribute 'used' cannot be used as an attribute pack}} \
   // expected-warning {{unknown attribute 'used' ignored}}
diff --git a/clang/test/Sema/unknown-attributes.c b/clang/test/Sema/unknown-attributes.c
index a701650c9e056..4711c9fa667ba 100644
--- a/clang/test/Sema/unknown-attributes.c
+++ b/clang/test/Sema/unknown-attributes.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -Wunknown-attributes -fsyntax-only -verify %s
-// RUN: %clang_cc1 -x c++ -Wunknown-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -Wunknown-attributes -fsyntax-only -verify=expected,c %s
+// RUN: %clang_cc1 -x c++ -Wunknown-attributes -fsyntax-only -verify=expected,cxx %s
 
 [[gmu::deprected]] // expected-warning {{unknown attribute 'gmu::deprected' ignored; did you mean 'gnu::deprecated'?}}
 int f1(void) {
@@ -20,3 +20,10 @@ int f3(void) {
 int f4(void) {
   return 0;
 }
+
+[[using gnu : deprected]] // c-error {{expected ','}} \
+                          // c-warning {{unknown attribute 'using' ignored}} \
+                          // cxx-warning {{unknown attribute 'gnu::deprected' ignored; did you mean 'gnu::deprecated'?}}
+int f5(void) {
+  return 0;
+}
diff --git a/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp b/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
index 00fa5bd7336b6..acd9846bb20fb 100644
--- a/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
+++ b/clang/test/SemaCXX/attr-non-x86-no_caller_saved_registers.cpp
@@ -11,7 +11,7 @@ __attribute__((no_caller_saved_registers(999))) void bar(int *) {} // expected-w
 
 __attribute__((no_caller_saved_registers)) void foo(int *){} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
 
-[[gnu::no_caller_saved_registers]] void foo2(int *) {} // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
+[[gnu::no_caller_saved_registers]] void foo2(int *) {} // expected-warning {{unknown attribute 'gnu::no_caller_saved_registers' ignored}}
 
 typedef __attribute__((no_caller_saved_registers)) void (*foo3)(int *); // expected-warning {{unknown attribute 'no_caller_saved_registers' ignored}}
 

From 951ea8b681451ff2db8b895f1dcfe0fbc91d939a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Sat, 14 Jun 2025 18:20:47 -0700
Subject: [PATCH 482/851] [mlir][nvvm][NFC] Fix typo in TargetAttr (#144159)

---
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 026c1fae0eb89..2dd7ac29cfedd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -3835,7 +3835,7 @@ def NVVM_DotAccumulate2WayOp : NVVM_Op<"dot.accumulate.2way"> {
 // NVVM target attribute.
 //===----------------------------------------------------------------------===//
 
-def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", "target", 
+def NVVM_TargetAttr : NVVM_Attr<"NVVMTarget", "target", 
   [DeclareAttrInterfaceMethods<GPUTargetAttrVerifyInterface>]> {
   let description = [{
     GPU target attribute for controlling compilation of NVIDIA targets. All

From 4ed10db85919d3d87bf0b3353340b58354a75994 Mon Sep 17 00:00:00 2001
From: Sam James <sam@gentoo.org>
Date: Sat, 14 Jun 2025 14:07:14 +0100
Subject: [PATCH 483/851] [clang][cmake] Don't pass -fno-strict-aliasing for
 GCC

This was added a long time ago..
* to the Makefiles in 40fee6313df688d43d1f8bbe85bc35161689afca;
* first to CMake in b3ce035c7155644d5bced46c45ae5ac865b7aedc;
* then moved to only apply when building Clang with GCC in
  c5635a6af7c643169f81145bfae8c895f2207792.

This shouldn't be needed these days. If an issue does arise, it really
ought to be documented better and the cause will certainly be different
than it was back then.

The two GCC bugs cited in 40fee6313df688d43d1f8bbe85bc35161689afca were:
* https://gcc.gnu.org/PR41874
* https://gcc.gnu.org/PR41838
and both are long-fixed. Not only that, if those issues did come up again,
we'd be better off doing -Wno-strict-aliasing where appropriate if there
weren't a real code issue or some suppression that was tighter in scope
wasn't appropriate.
---
 clang/CMakeLists.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index ab2ac9bc6b9ad..94607a8e8473c 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -345,9 +345,6 @@ configure_file(
 # Add appropriate flags for GCC
 if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-common -Woverloaded-virtual")
-  if (NOT "${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-strict-aliasing")
-  endif ()
 
   # Enable -pedantic for Clang even if it's not enabled for LLVM.
   if (NOT LLVM_ENABLE_PEDANTIC)

From 24c8d900c47edeefb85643a06bc32235d9f42ea3 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Sun, 15 Jun 2025 11:38:04 +0800
Subject: [PATCH 484/851] [RISCV] Remove B and Zbc extension from Andes series
 cpus. (#144022)

The Andes CPU is configurable with optional extensions. The minimal
required extension set does not include `B` and `Zbc` extensions. So we
decided to remove them.
---
 .../Driver/print-enabled-extensions/riscv-andes-a25.c     | 7 +------
 .../Driver/print-enabled-extensions/riscv-andes-a45.c     | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-ax25.c    | 7 +------
 .../Driver/print-enabled-extensions/riscv-andes-ax45.c    | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-n45.c     | 6 +-----
 .../Driver/print-enabled-extensions/riscv-andes-nx45.c    | 6 +-----
 llvm/lib/Target/RISCV/RISCVProcessors.td                  | 8 --------
 llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s              | 2 +-
 8 files changed, 7 insertions(+), 41 deletions(-)

diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
index d8b3848d84520..cfb4d0ed58d11 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,12 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
index a0a1c35911409..3c3c554dffc57 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,11 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
index 3f933ecd8ac83..70100a0a8df13 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,12 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
index 6460d701411bc..d2b1a32e321e5 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,11 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
index 4d9c514b756e6..1a2c30bfc7a2e 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -19,11 +18,8 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
index 5eaada3f9e164..50c38da3bd034 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
@@ -10,7 +10,6 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
-// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,11 +17,8 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
-// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
-// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
-// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 32f4ab607a34c..d7e6c71ea062e 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -703,8 +703,6 @@ def ANDES_A25 : RISCVProcessorModel<"andes-a25",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
-                                     FeatureStdExtZbc,
                                      FeatureVendorXAndesPerf]>;
 
 def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
@@ -718,8 +716,6 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
-                                      FeatureStdExtZbc,
                                       FeatureVendorXAndesPerf]>;
 
 defvar Andes45TuneFeatures = [TuneAndes45,
@@ -741,7 +737,6 @@ def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -756,7 +751,6 @@ def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
 
@@ -771,7 +765,6 @@ def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
-                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -786,6 +779,5 @@ def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
-                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
diff --git a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
index f6dc6eef3f0ff..d90dce8c5c3fc 100644
--- a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
+++ b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+zbc -timeline -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+b,+zbc -timeline -iterations=1 < %s | FileCheck %s
 
 # Two ALUs without dependency can be dispatched in the same cycle.
 add a0, a0, a0

From a0c00ccd5ff180c721def8001c870338d5de319e Mon Sep 17 00:00:00 2001
From: Hristo Hristov <hghristov.rmm@gmail.com>
Date: Sun, 15 Jun 2025 07:45:48 +0300
Subject: [PATCH 485/851] [libc++] P2944R3: Constrained comparisons - update
 `reference_wrapper` implementation (#139368)

Updates the implementation `std::reference_wrapper` -
[P2944R3](https://wg21.link/P2944R3) as discussed in
https://github.com/llvm/llvm-project/pull/117664#discussion_r1857826166
This PR also refactors the tests in preparation to implements the
constrained comparisons for `optional`, `variant` etc.

- Moves the test helpers (concepts and types) for testing constrained
comparisons to `test_comparisons.h`.
- Updates the `std::reference_wrapper` implementation to use the concept
`__core_convertible_to<bool>` as per comments in #135759.

Closes #138233

# References:
- [refwrap.comparisons](https://wg21.link/refwrap.comparisons)

---------

Co-authored-by: Hristo Hristov <zingam@outlook.com>
Co-authored-by: Nikolas Klauser <nikolasklauser@berlin.de>
---
 libcxx/docs/Status/Cxx2cPapers.csv            |  2 +-
 .../include/__functional/reference_wrapper.h  |  8 ++--
 .../array/compare.three_way.pass.cpp          |  1 -
 ...mpare.three_way.refwrap.const_ref.pass.cpp | 13 +++----
 ...compare.three_way.refwrap.refwrap.pass.cpp | 14 +++----
 ...e.three_way.refwrap.refwrap_const.pass.cpp | 17 ++++-----
 .../equal.refwrap.const_ref.pass.cpp          | 11 +++---
 .../equal.refwrap.refwrap.pass.cpp            |  9 ++---
 .../equal.refwrap.refwrap_const.pass.cpp      | 13 +++----
 .../refwrap.comparissons/helper_concepts.h    | 38 -------------------
 .../refwrap.comparissons/helper_types.h       | 30 ---------------
 libcxx/test/support/test_comparisons.h        | 25 +++++++++++-
 .../test/support/test_container_comparisons.h |  4 --
 13 files changed, 59 insertions(+), 126 deletions(-)
 delete mode 100644 libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h
 delete mode 100644 libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h

diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv
index 3809446a57896..8a0417e120d75 100644
--- a/libcxx/docs/Status/Cxx2cPapers.csv
+++ b/libcxx/docs/Status/Cxx2cPapers.csv
@@ -59,7 +59,7 @@
 "`P2248R8 <https://wg21.link/P2248R8>`__","Enabling list-initialization for algorithms","2024-03 (Tokyo)","","",""
 "`P2810R4 <https://wg21.link/P2810R4>`__","``is_debugger_present`` ``is_replaceable``","2024-03 (Tokyo)","","",""
 "`P1068R11 <https://wg21.link/P1068R11>`__","Vector API for random number generation","2024-03 (Tokyo)","","",""
-"`P2944R3 <https://wg21.link/P2944R3>`__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Partial|","","Implemented changes to ``reference_wrapper`` and ``pair``"
+"`P2944R3 <https://wg21.link/P2944R3>`__","Comparisons for ``reference_wrapper``","2024-03 (Tokyo)","|Partial|","","The changes to ``optional``, ``tuple`` and ``variant`` are not yet implemented"
 "`P2642R6 <https://wg21.link/P2642R6>`__","Padded ``mdspan`` layouts","2024-03 (Tokyo)","","",""
 "`P3029R1 <https://wg21.link/P3029R1>`__","Better ``mdspan``'s CTAD","2024-03 (Tokyo)","|Complete|","19",""
 "","","","","",""
diff --git a/libcxx/include/__functional/reference_wrapper.h b/libcxx/include/__functional/reference_wrapper.h
index b409ad7511f6c..c46203a4ca9a4 100644
--- a/libcxx/include/__functional/reference_wrapper.h
+++ b/libcxx/include/__functional/reference_wrapper.h
@@ -11,7 +11,6 @@
 #define _LIBCPP___FUNCTIONAL_REFERENCE_WRAPPER_H
 
 #include <__compare/synth_three_way.h>
-#include <__concepts/boolean_testable.h>
 #include <__config>
 #include <__functional/weak_result_type.h>
 #include <__memory/addressof.h>
@@ -19,6 +18,7 @@
 #include <__type_traits/enable_if.h>
 #include <__type_traits/invoke.h>
 #include <__type_traits/is_const.h>
+#include <__type_traits/is_core_convertible.h>
 #include <__type_traits/remove_cvref.h>
 #include <__type_traits/void_t.h>
 #include <__utility/declval.h>
@@ -75,7 +75,7 @@ class reference_wrapper : public __weak_result_type<_Tp> {
 
   _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, reference_wrapper __y)
     requires requires {
-      { __x.get() == __y.get() } -> __boolean_testable;
+      { __x.get() == __y.get() } -> __core_convertible_to<bool>;
     }
   {
     return __x.get() == __y.get();
@@ -83,7 +83,7 @@ class reference_wrapper : public __weak_result_type<_Tp> {
 
   _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, const _Tp& __y)
     requires requires {
-      { __x.get() == __y } -> __boolean_testable;
+      { __x.get() == __y } -> __core_convertible_to<bool>;
     }
   {
     return __x.get() == __y;
@@ -91,7 +91,7 @@ class reference_wrapper : public __weak_result_type<_Tp> {
 
   _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(reference_wrapper __x, reference_wrapper<const _Tp> __y)
     requires(!is_const_v<_Tp>) && requires {
-      { __x.get() == __y.get() } -> __boolean_testable;
+      { __x.get() == __y.get() } -> __core_convertible_to<bool>;
     }
   {
     return __x.get() == __y.get();
diff --git a/libcxx/test/std/containers/sequences/array/compare.three_way.pass.cpp b/libcxx/test/std/containers/sequences/array/compare.three_way.pass.cpp
index 01be1db73041b..671747f89a82e 100644
--- a/libcxx/test/std/containers/sequences/array/compare.three_way.pass.cpp
+++ b/libcxx/test/std/containers/sequences/array/compare.three_way.pass.cpp
@@ -26,7 +26,6 @@ constexpr std::size_t N{1};
 static_assert(std::three_way_comparable<std::array<int, N>>);
 
 // Thanks to SFINAE, the following is not a compiler error but returns `false`
-struct NonComparable {};
 static_assert(!std::three_way_comparable<std::array<NonComparable, N>>);
 
 // Implementation detail of `test_sequence_container_array_spaceship`
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp
index 85106c18ec35a..4a2ae963e3bdb 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.const_ref.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -23,16 +23,13 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<StrongOrder>>);
-static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<WeakOrder>>);
-static_assert(HasSpaceshipOperatorWithInt<std::reference_wrapper<PartialOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<StrongOrder>, int>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<WeakOrder>, int>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<PartialOrder>, int>);
 
-static_assert(!HasSpaceshipOperatorWithInt<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<NonComparable>, int>);
 
 // Test comparisons.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp
index 794fac00de8a6..3d72459bc5a19 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -22,17 +22,13 @@
 
 #include "test_comparisons.h"
 #include "test_macros.h"
-
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(std::three_way_comparable<std::reference_wrapper<StrongOrder>>);
-static_assert(std::three_way_comparable<std::reference_wrapper<WeakOrder>>);
-static_assert(std::three_way_comparable<std::reference_wrapper<PartialOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<StrongOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<WeakOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<PartialOrder>>);
 
-static_assert(!std::three_way_comparable<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<NonComparable>>);
 
 // Test comparisons.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp
index 9b1302affa851..1ae22b4ac58e0 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/compare.three_way.refwrap.refwrap_const.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -23,18 +23,15 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(std::three_way_comparable_with<std::reference_wrapper<StrongOrder>, const StrongOrder>);
-static_assert(std::three_way_comparable_with<std::reference_wrapper<WeakOrder>, const WeakOrder>);
-static_assert(std::three_way_comparable_with<std::reference_wrapper<PartialOrder>, const PartialOrder>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<StrongOrder>, std::reference_wrapper<const StrongOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<WeakOrder>, std::reference_wrapper<const WeakOrder>>);
+static_assert(HasOperatorSpaceship<std::reference_wrapper<PartialOrder>, std::reference_wrapper<const PartialOrder>>);
 
-static_assert(!std::three_way_comparable_with<std::reference_wrapper<StrongOrder>, const NonComparable>);
-static_assert(!std::three_way_comparable_with<std::reference_wrapper<WeakOrder>, const NonComparable>);
-static_assert(!std::three_way_comparable_with<std::reference_wrapper<PartialOrder>, const NonComparable>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<StrongOrder>, std::reference_wrapper<const NonComparable>>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<WeakOrder>, std::reference_wrapper<const NonComparable>>);
+static_assert(!HasOperatorSpaceship<std::reference_wrapper<PartialOrder>, std::reference_wrapper<const NonComparable>>);
 
 // Test comparisons.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp
index 465326818f17c..316ff7c303315 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.const_ref.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -23,14 +23,13 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(HasEqualityOperatorWithInt<std::reference_wrapper<EqualityComparable>>);
+static_assert(HasOperatorEqual<std::reference_wrapper<EqualityComparable>>);
+static_assert(HasOperatorEqual<std::reference_wrapper<EqualityComparable>, int>);
 
-static_assert(!HasEqualityOperatorWithInt<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorEqual<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorEqual<std::reference_wrapper<NonComparable>, int>);
 
 // Test equality.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp
index a50b530bbc6e1..70e79d399861a 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -22,14 +22,11 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(std::equality_comparable<std::reference_wrapper<EqualityComparable>>);
+static_assert(HasOperatorEqual<std::reference_wrapper<EqualityComparable>>);
 
-static_assert(!std::equality_comparable<std::reference_wrapper<NonComparable>>);
+static_assert(!HasOperatorEqual<std::reference_wrapper<NonComparable>>);
 
 // Test equality.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp
index 10f017742a87f..c68ad5c4aa527 100644
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/equal.refwrap.refwrap_const.pass.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23
+// REQUIRES: std-at-least-c++26
 
 // <functional>
 
@@ -23,16 +23,13 @@
 #include "test_comparisons.h"
 #include "test_macros.h"
 
-#include "helper_concepts.h"
-#include "helper_types.h"
-
 // Test SFINAE.
 
-static_assert(std::equality_comparable_with<std::reference_wrapper<EqualityComparable>,
-                                            std::reference_wrapper<const EqualityComparable>>);
+static_assert(
+    HasOperatorEqual<std::reference_wrapper<EqualityComparable>, std::reference_wrapper<const EqualityComparable>>);
 
-static_assert(!std::equality_comparable_with<std::reference_wrapper<EqualityComparable>,
-                                             std::reference_wrapper<const NonComparable>>);
+static_assert(
+    !HasOperatorEqual<std::reference_wrapper<EqualityComparable>, std::reference_wrapper<const NonComparable>>);
 
 // Test equality.
 
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h
deleted file mode 100644
index 2dbb304f8af63..0000000000000
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_concepts.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
-#define TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
-
-#include <concepts>
-#include <utility>
-
-// Equality
-
-template <typename T>
-concept HasEqualityOperatorWithInt = requires(T t, int i) {
-  { t.get() == i } -> std::convertible_to<bool>;
-};
-
-// Spaceship
-
-template <class T>
-concept BooleanTestableImpl = std::convertible_to<T, bool>;
-
-template <class T>
-concept BooleanTestable = BooleanTestableImpl<T> && requires(T&& t) {
-  { !std::forward<T>(t) } -> BooleanTestableImpl;
-};
-
-template <typename T>
-concept HasSpaceshipOperatorWithInt = requires(T t, int i) {
-  { t < i } -> BooleanTestable;
-  { i < t } -> BooleanTestable;
-};
-
-#endif // TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_CONCEPTS_H
diff --git a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h b/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h
deleted file mode 100644
index cf5e568dbf936..0000000000000
--- a/libcxx/test/std/utilities/function.objects/refwrap/refwrap.comparissons/helper_types.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
-#define TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
-
-#include <concepts>
-
-struct EqualityComparable {
-  constexpr EqualityComparable(int value) : value_{value} {};
-
-  friend constexpr bool operator==(const EqualityComparable&, const EqualityComparable&) noexcept = default;
-
-  int value_;
-};
-
-static_assert(std::equality_comparable<EqualityComparable>);
-static_assert(EqualityComparable{94} == EqualityComparable{94});
-static_assert(EqualityComparable{94} != EqualityComparable{82});
-
-struct NonComparable {};
-
-static_assert(!std::three_way_comparable<NonComparable>);
-
-#endif // TEST_STD_FUNCTIONOBJECTS_REFWRAP_HELPER_TYPES_H
diff --git a/libcxx/test/support/test_comparisons.h b/libcxx/test/support/test_comparisons.h
index db6977a96a2fe..d9729e0451b49 100644
--- a/libcxx/test/support/test_comparisons.h
+++ b/libcxx/test/support/test_comparisons.h
@@ -268,6 +268,29 @@ struct PartialOrder {
   }
 };
 
-#endif
+template <typename T1, typename T2 = T1>
+concept HasOperatorEqual = requires(T1 t1, T2 t2) { t1 == t2; };
+
+template <typename T1, typename T2 = T1>
+concept HasOperatorSpaceship = requires(T1 t1, T2 t2) { t1 <=> t2; };
+
+struct NonComparable {};
+static_assert(!std::equality_comparable<NonComparable>);
+static_assert(!HasOperatorEqual<NonComparable>);
+static_assert(!HasOperatorSpaceship<NonComparable>);
+
+class EqualityComparable {
+public:
+  constexpr EqualityComparable(int value) : value_{value} {};
+
+  friend constexpr bool operator==(const EqualityComparable&, const EqualityComparable&) noexcept = default;
+
+private:
+  int value_;
+};
+static_assert(std::equality_comparable<EqualityComparable>);
+static_assert(HasOperatorEqual<EqualityComparable>);
+
+#endif // TEST_STD_VER >= 20
 
 #endif // TEST_COMPARISONS_H
diff --git a/libcxx/test/support/test_container_comparisons.h b/libcxx/test/support/test_container_comparisons.h
index f7bf78e48a1f8..53db5ba99ce47 100644
--- a/libcxx/test/support/test_container_comparisons.h
+++ b/libcxx/test/support/test_container_comparisons.h
@@ -88,7 +88,6 @@ constexpr bool test_sequence_container_spaceship() {
                                               std::weak_ordering>();
 
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
-  struct NonComparable {};
   static_assert(!std::three_way_comparable<Container<NonComparable>>);
 
   return true;
@@ -163,7 +162,6 @@ constexpr void test_sequence_container_adaptor_spaceship_with_type() {
 template <template <typename...> typename ContainerAdaptor, template <typename...> typename Container>
 constexpr bool test_sequence_container_adaptor_spaceship() {
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
-  struct NonComparable {};
   static_assert(!std::three_way_comparable<ContainerAdaptor<NonComparable>>);
 
   // The container should fulfill `std::three_way_comparable`
@@ -301,7 +299,6 @@ constexpr void test_ordered_map_container_spaceship_with_type(Compare comp) {
 template <template <typename...> typename Container>
 constexpr bool test_ordered_map_container_spaceship() {
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
-  struct NonComparable {};
   static_assert(!std::three_way_comparable<Container<int, NonComparable>>);
 
   // The container should fulfill `std::three_way_comparable`
@@ -444,7 +441,6 @@ constexpr void test_ordered_set_spaceship_with_type(Compare comp) {
 template <template <typename...> typename Container>
 constexpr bool test_ordered_set_container_spaceship() {
   // Thanks to SFINAE, the following is not a compiler error but returns `false`
-  struct NonComparable {};
   static_assert(!std::three_way_comparable<Container<NonComparable>>);
 
   // The container should fulfill `std::three_way_comparable`

From c4ba734993ac7ca39cc101db62797aad3a2a265a Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 14 Jun 2025 23:23:42 -0700
Subject: [PATCH 486/851] [mlir] Compare std::optional<T> to values directly
 (NFC) (#144241)

This patch transforms:

  X && *X == Y

to:

  X == Y

where X is of std::optional<T>, and Y is of T or similar.
---
 mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp  | 2 +-
 mlir/lib/Dialect/Affine/IR/AffineOps.cpp              | 9 ++++-----
 mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp           | 6 ++----
 mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp   | 3 +--
 mlir/lib/Dialect/Linalg/Utils/Utils.cpp               | 2 +-
 mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp | 2 +-
 mlir/lib/Dialect/Utils/StaticValueUtils.cpp           | 3 +--
 7 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp b/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp
index dab15d23f6e0f..ac8ed4fdff7c3 100644
--- a/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp
+++ b/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp
@@ -173,7 +173,7 @@ getTreePredicates(std::vector<PositionalPredicate> &predList, Value val,
 
       // Ignore the specified operand, usually because this position was
       // visited in an upward traversal via an iterative choice.
-      if (ignoreOperand && *ignoreOperand == operandIt.index())
+      if (ignoreOperand == operandIt.index())
         continue;
 
       Position *pos =
diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
index 8a708eb29210c..3d09c6a9b2c24 100644
--- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
+++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp
@@ -2367,7 +2367,7 @@ struct AffineForEmptyLoopFolder : public OpRewritePattern<AffineForOp> {
     if (forOp.getNumResults() == 0)
       return success();
     std::optional<uint64_t> tripCount = getTrivialConstantTripCount(forOp);
-    if (tripCount && *tripCount == 0) {
+    if (tripCount == 0) {
       // The initial values of the iteration arguments would be the op's
       // results.
       rewriter.replaceOp(forOp, forOp.getInits());
@@ -2447,7 +2447,7 @@ void AffineForOp::getSuccessorRegions(
 
   // From the loop body, if the trip count is one, we can only branch back to
   // the parent.
-  if (!point.isParent() && tripCount && *tripCount == 1) {
+  if (!point.isParent() && tripCount == 1) {
     regions.push_back(RegionSuccessor(getResults()));
     return;
   }
@@ -2460,8 +2460,7 @@ void AffineForOp::getSuccessorRegions(
 
 /// Returns true if the affine.for has zero iterations in trivial cases.
 static bool hasTrivialZeroTripCount(AffineForOp op) {
-  std::optional<uint64_t> tripCount = getTrivialConstantTripCount(op);
-  return tripCount && *tripCount == 0;
+  return getTrivialConstantTripCount(op) == 0;
 }
 
 LogicalResult AffineForOp::fold(FoldAdaptor adaptor,
@@ -4789,7 +4788,7 @@ struct DropUnitExtentBasis
          llvm::enumerate(delinearizeOp.getPaddedBasis())) {
       std::optional<int64_t> basisVal =
           basis ? getConstantIntValue(basis) : std::nullopt;
-      if (basisVal && *basisVal == 1)
+      if (basisVal == 1)
         replacements[index] = getZero();
       else
         newBasis.push_back(basis);
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 0d4ba3940c48e..4aa1fe318efa8 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -1015,8 +1015,7 @@ LogicalResult mlir::affine::loopUnrollByFactor(
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   if (unrollFactor == 1) {
-    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
-        failed(promoteIfSingleIteration(forOp)))
+    if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();
   }
@@ -1103,8 +1102,7 @@ LogicalResult mlir::affine::loopUnrollJamByFactor(AffineForOp forOp,
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   if (unrollJamFactor == 1) {
-    if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
-        failed(promoteIfSingleIteration(forOp)))
+    if (mayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();
   }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
index bd4ffabfbb929..5e6dde36d7f9f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -606,8 +606,7 @@ struct DropPadUnitDims : public OpRewritePattern<tensor::PadOp> {
     int64_t padRank = sourceShape.size();
 
     auto isStaticZero = [](OpFoldResult f) {
-      std::optional<int64_t> maybeInt = getConstantIntValue(f);
-      return maybeInt && *maybeInt == 0;
+      return getConstantIntValue(f) == 0;
     };
 
     llvm::SmallDenseSet<unsigned> unitDimsFilter(allowedUnitDims.begin(),
diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
index bae06c003fd97..2527d90cfa2e6 100644
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -688,7 +688,7 @@ computeSliceParameters(OpBuilder &builder, Location loc, Value valueToTile,
     //    tensors with "0" dimensions would never be constructed.
     int64_t shapeSize = shape[r];
     std::optional<int64_t> sizeCst = getConstantIntValue(size);
-    auto hasTileSizeOne = sizeCst && *sizeCst == 1;
+    auto hasTileSizeOne = sizeCst == 1;
     auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) &&
                          ((shapeSize % *sizeCst) == 0);
     if (!hasTileSizeOne && !dividesEvenly) {
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
index f5a58c58e05df..1e7bb046d3752 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
@@ -737,7 +737,7 @@ static spirv::GlobalVariableOp getBuiltinVariable(Block &body,
             spirv::SPIRVDialect::getAttributeName(
                 spirv::Decoration::BuiltIn))) {
       auto varBuiltIn = spirv::symbolizeBuiltIn(builtinAttr.getValue());
-      if (varBuiltIn && *varBuiltIn == builtin) {
+      if (varBuiltIn == builtin) {
         return varOp;
       }
     }
diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index 29f7bd6857c27..8e3f796af54df 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -142,8 +142,7 @@ getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
 }
 
 bool isConstantIntValue(OpFoldResult ofr, int64_t value) {
-  auto val = getConstantIntValue(ofr);
-  return val && *val == value;
+  return getConstantIntValue(ofr) == value;
 }
 
 bool areAllConstantIntValue(ArrayRef<OpFoldResult> ofrs, int64_t value) {

From 84ff1bda2977e580265997ad2d4c47b18cd3bf9f Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sat, 14 Jun 2025 23:23:49 -0700
Subject: [PATCH 487/851] [RISCV] Use StringRef in a range-based for loop (NFC)
 (#144243)

When we iterate over std::vector<std::string>, we can directly assign
each element to StringRef.  We do not need to go through a separate
statement.
---
 llvm/lib/TargetParser/RISCVISAInfo.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index e76ddd4b648dc..17c98332ab0af 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -449,8 +449,7 @@ RISCVISAInfo::parseFeatures(unsigned XLen,
   assert(XLen == 32 || XLen == 64);
   std::unique_ptr<RISCVISAInfo> ISAInfo(new RISCVISAInfo(XLen));
 
-  for (auto &Feature : Features) {
-    StringRef ExtName = Feature;
+  for (StringRef ExtName : Features) {
     assert(ExtName.size() > 1 && (ExtName[0] == '+' || ExtName[0] == '-'));
     bool Add = ExtName[0] == '+';
     ExtName = ExtName.drop_front(1); // Drop '+' or '-'

From 9e16792639242a86314e5d6531010953a0a96216 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Sun, 15 Jun 2025 02:35:20 -0400
Subject: [PATCH 488/851] [mlir][bzl] Add CAPIIndex rule. (#144248)

---
 .../bazel/llvm-project-overlay/mlir/BUILD.bazel | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index b62d5595fe941..e7398a696beaa 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -535,6 +535,23 @@ mlir_c_api_cc_library(
     ],
 )
 
+mlir_c_api_cc_library(
+    name = "CAPIIndex",
+    srcs = [
+        "lib/CAPI/Dialect/Index.cpp",
+    ],
+    hdrs = [
+        "include/mlir-c/Dialect/Index.h",
+    ],
+    capi_deps = [
+        ":CAPIIR",
+    ],
+    includes = ["include"],
+    deps = [
+        ":IndexDialect",
+    ],
+)
+
 mlir_c_api_cc_library(
     name = "CAPILinalg",
     srcs = [

From 149cb5c43c3a75ecb827b8b7ae853250c3c09449 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Sun, 15 Jun 2025 15:17:53 +0800
Subject: [PATCH 489/851] [ValueTracking] Infer `X | Y != 0` from `X != Y`
 (#117443)

Alive2: https://alive2.llvm.org/ce/z/cJ75Ya

Closes https://github.com/llvm/llvm-project/issues/117436.
---
 llvm/lib/Analysis/ValueTracking.cpp          | 4 ++++
 llvm/test/Transforms/InstCombine/icmp-dom.ll | 5 +----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index d8c1096049dce..99670b92187c5 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3043,6 +3043,10 @@ static bool isKnownNonZeroFromOperator(const Operator *I,
     // (X | (X != 0)) is non zero
     if (matchOpWithOpEqZero(I->getOperand(0), I->getOperand(1)))
       return true;
+    // X | Y != 0 if X != Y.
+    if (isKnownNonEqual(I->getOperand(0), I->getOperand(1), DemandedElts, Q,
+                        Depth))
+      return true;
     // X | Y != 0 if X != 0 or Y != 0.
     return isKnownNonZero(I->getOperand(1), DemandedElts, Q, Depth) ||
            isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth);
diff --git a/llvm/test/Transforms/InstCombine/icmp-dom.ll b/llvm/test/Transforms/InstCombine/icmp-dom.ll
index 6613bbeb8d6ae..a72b5e0bbfa03 100644
--- a/llvm/test/Transforms/InstCombine/icmp-dom.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-dom.ll
@@ -535,16 +535,13 @@ else:
   ret i1 %cmp1
 }
 
-; TODO: X != Y implies X | Y != 0
 define i1 @or_nonzero_from_nonequal(i8 %x, i8 %y) {
 ; CHECK-LABEL: @or_nonzero_from_nonequal(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[OR:%.*]] = or i8 [[X]], [[Y]]
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[OR]], 0
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    ret i1 false
 ; CHECK:       if.else:
 ; CHECK-NEXT:    ret i1 false
 ;

From 30a41a642358d0f427c3cbc0299ea48fbc0cf79e Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234@gmail.com>
Date: Sun, 15 Jun 2025 03:32:34 -0400
Subject: [PATCH 490/851] [ValueTracking] Add subtraction support for
 setLimitsForBinOp (#143618)

We can determine the range from a subtraction if it has nsw or nuw.

https://alive2.llvm.org/ce/z/tXAKVV
---
 llvm/lib/Analysis/ValueTracking.cpp          | 36 ++++++++++++++++++--
 llvm/test/Transforms/InstCombine/div.ll      |  8 ++---
 llvm/test/Transforms/InstCombine/icmp-sub.ll |  3 +-
 3 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 99670b92187c5..e7a1f07c0270d 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9580,15 +9580,45 @@ static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
   unsigned Width = Lower.getBitWidth();
   const APInt *C;
   switch (BO.getOpcode()) {
-  case Instruction::Add:
-    if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) {
+  case Instruction::Sub:
+    if (match(BO.getOperand(0), m_APInt(C))) {
       bool HasNSW = IIQ.hasNoSignedWrap(&BO);
       bool HasNUW = IIQ.hasNoUnsignedWrap(&BO);
 
       // If the caller expects a signed compare, then try to use a signed range.
       // Otherwise if both no-wraps are set, use the unsigned range because it
       // is never larger than the signed range. Example:
-      // "add nuw nsw i8 X, -2" is unsigned [254,255] vs. signed [-128, 125].
+      // "sub nuw nsw i8 -2, x" is unsigned [0, 254] vs. signed [-128, 126].
+      // "sub nuw nsw i8 2, x" is unsigned [0, 2] vs. signed [-125, 127].
+      if (PreferSignedRange && HasNSW && HasNUW)
+        HasNUW = false;
+
+      if (HasNUW) {
+        // 'sub nuw c, x' produces [0, C].
+        Upper = *C + 1;
+      } else if (HasNSW) {
+        if (C->isNegative()) {
+          // 'sub nsw -C, x' produces [SINT_MIN, -C - SINT_MIN].
+          Lower = APInt::getSignedMinValue(Width);
+          Upper = *C - APInt::getSignedMaxValue(Width);
+        } else {
+          // Note that sub 0, INT_MIN is not NSW. It techically is a signed wrap
+          // 'sub nsw C, x' produces [C - SINT_MAX, SINT_MAX].
+          Lower = *C - APInt::getSignedMaxValue(Width);
+          Upper = APInt::getSignedMinValue(Width);
+        }
+      }
+    }
+    break;
+  case Instruction::Add:
+    if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) {
+      bool HasNSW = IIQ.hasNoSignedWrap(&BO);
+      bool HasNUW = IIQ.hasNoUnsignedWrap(&BO);
+
+      // If the caller expects a signed compare, then try to use a signed
+      // range. Otherwise if both no-wraps are set, use the unsigned range
+      // because it is never larger than the signed range. Example: "add nuw
+      // nsw i8 X, -2" is unsigned [254,255] vs. signed [-128, 125].
       if (PreferSignedRange && HasNSW && HasNUW)
         HasNUW = false;
 
diff --git a/llvm/test/Transforms/InstCombine/div.ll b/llvm/test/Transforms/InstCombine/div.ll
index 7e93612150e8c..f0fdc5f54366a 100644
--- a/llvm/test/Transforms/InstCombine/div.ll
+++ b/llvm/test/Transforms/InstCombine/div.ll
@@ -494,9 +494,7 @@ define <2 x i8> @sdiv_exact_negated_dividend_constant_divisor_vec_splat(<2 x i8>
 
 define i8 @sdiv_negated_dividend_constant_divisor_smin(i8 %x) {
 ; CHECK-LABEL: @sdiv_negated_dividend_constant_divisor_smin(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i8 [[X:%.*]], -128
-; CHECK-NEXT:    [[D:%.*]] = zext i1 [[TMP1]] to i8
-; CHECK-NEXT:    ret i8 [[D]]
+; CHECK-NEXT:    ret i8 0
 ;
   %neg = sub nsw i8 0, %x
   %d = sdiv i8 %neg, -128
@@ -505,9 +503,7 @@ define i8 @sdiv_negated_dividend_constant_divisor_smin(i8 %x) {
 
 define <2 x i8> @sdiv_negated_dividend_constant_divisor_vec_splat_smin(<2 x i8> %x) {
 ; CHECK-LABEL: @sdiv_negated_dividend_constant_divisor_vec_splat_smin(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[X:%.*]], splat (i8 -128)
-; CHECK-NEXT:    [[D:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8>
-; CHECK-NEXT:    ret <2 x i8> [[D]]
+; CHECK-NEXT:    ret <2 x i8> zeroinitializer
 ;
   %neg = sub nsw <2 x i8> zeroinitializer, %x
   %d = sdiv <2 x i8> %neg, <i8 -128, i8 -128>
diff --git a/llvm/test/Transforms/InstCombine/icmp-sub.ll b/llvm/test/Transforms/InstCombine/icmp-sub.ll
index 4143902bc9c46..13ed7ba0c1703 100644
--- a/llvm/test/Transforms/InstCombine/icmp-sub.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-sub.ll
@@ -290,8 +290,7 @@ define i1 @subC_nsw_ne(i32 %x) {
 ; CHECK-LABEL: @subC_nsw_ne(
 ; CHECK-NEXT:    [[SUBX:%.*]] = sub nsw i32 -2147483647, [[X:%.*]]
 ; CHECK-NEXT:    call void @use(i32 [[SUBX]])
-; CHECK-NEXT:    [[R:%.*]] = icmp ne i32 [[X]], 2147483603
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %subx = sub nsw i32 -2147483647, %x
   call void @use(i32 %subx)

From 48e54f3a225062b5d229e6fd3b06140f76c0613b Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 15 Jun 2025 08:51:59 +0100
Subject: [PATCH 491/851] [CostModel] Mark all TTIImpls as final. NFC (#143404)

In the AArch64 version this helps reduce the number of blr instruction
(indirect jumps) in from 325 to 87, and reduces the size of the object
file by 4%. It seems to help make the code more efficient even if it
doesn't greatly affect compile time.

The AMDGPU variants are already marked as final.
---
 llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 2 +-
 llvm/lib/Target/ARC/ARCTargetTransformInfo.h         | 2 +-
 llvm/lib/Target/ARM/ARMTargetTransformInfo.h         | 2 +-
 llvm/lib/Target/BPF/BPFTargetTransformInfo.h         | 2 +-
 llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h | 2 +-
 llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h | 2 +-
 llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/Mips/MipsTargetTransformInfo.h       | 2 +-
 llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h     | 2 +-
 llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h | 2 +-
 llvm/lib/Target/VE/VETargetTransformInfo.h           | 2 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.h         | 2 +-
 llvm/lib/Target/XCore/XCoreTargetTransformInfo.h     | 2 +-
 16 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 664c360032ea3..0184e748b3d86 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -39,7 +39,7 @@ class Type;
 class Value;
 class VectorType;
 
-class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
+class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
   using BaseT = BasicTTIImplBase<AArch64TTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/ARC/ARCTargetTransformInfo.h b/llvm/lib/Target/ARC/ARCTargetTransformInfo.h
index bb7f2a0a459c2..3d5ff6dc256d9 100644
--- a/llvm/lib/Target/ARC/ARCTargetTransformInfo.h
+++ b/llvm/lib/Target/ARC/ARCTargetTransformInfo.h
@@ -26,7 +26,7 @@ class ARCSubtarget;
 class ARCTargetLowering;
 class ARCTargetMachine;
 
-class ARCTTIImpl : public BasicTTIImplBase<ARCTTIImpl> {
+class ARCTTIImpl final : public BasicTTIImplBase<ARCTTIImpl> {
   using BaseT = BasicTTIImplBase<ARCTTIImpl>;
   friend BaseT;
 
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 20a2c59511087..c1af4e3dc5da6 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -54,7 +54,7 @@ namespace TPLoop {
 enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
 }
 
-class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
+class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
   using BaseT = BasicTTIImplBase<ARMTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
index e94497896f681..d7b2ceff105c9 100644
--- a/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
+++ b/llvm/lib/Target/BPF/BPFTargetTransformInfo.h
@@ -21,7 +21,7 @@
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 
 namespace llvm {
-class BPFTTIImpl : public BasicTTIImplBase<BPFTTIImpl> {
+class BPFTTIImpl final : public BasicTTIImplBase<BPFTTIImpl> {
   typedef BasicTTIImplBase<BPFTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
index 9f344d7d52ba0..e2dd4354a8167 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.h
@@ -17,7 +17,7 @@
 #include "llvm/IR/Function.h"
 
 namespace llvm {
-class DirectXTTIImpl : public BasicTTIImplBase<DirectXTTIImpl> {
+class DirectXTTIImpl final : public BasicTTIImplBase<DirectXTTIImpl> {
   using BaseT = BasicTTIImplBase<DirectXTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index d7509c3bb1d2f..c03cad4713e40 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -30,7 +30,7 @@ class ScalarEvolution;
 class User;
 class Value;
 
-class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> {
+class HexagonTTIImpl final : public BasicTTIImplBase<HexagonTTIImpl> {
   using BaseT = BasicTTIImplBase<HexagonTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
index f17abf4c8af04..0342af65c1ef7 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -25,7 +25,7 @@
 #include "llvm/Support/MathExtras.h"
 
 namespace llvm {
-class LanaiTTIImpl : public BasicTTIImplBase<LanaiTTIImpl> {
+class LanaiTTIImpl final : public BasicTTIImplBase<LanaiTTIImpl> {
   typedef BasicTTIImplBase<LanaiTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/Mips/MipsTargetTransformInfo.h b/llvm/lib/Target/Mips/MipsTargetTransformInfo.h
index 5e3884cd80161..8f8173915b2fb 100644
--- a/llvm/lib/Target/Mips/MipsTargetTransformInfo.h
+++ b/llvm/lib/Target/Mips/MipsTargetTransformInfo.h
@@ -15,7 +15,7 @@
 
 namespace llvm {
 
-class MipsTTIImpl : public BasicTTIImplBase<MipsTTIImpl> {
+class MipsTTIImpl final : public BasicTTIImplBase<MipsTTIImpl> {
   using BaseT = BasicTTIImplBase<MipsTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 98aea4e535f0a..aa7850acbd64a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -26,7 +26,7 @@
 
 namespace llvm {
 
-class NVPTXTTIImpl : public BasicTTIImplBase<NVPTXTTIImpl> {
+class NVPTXTTIImpl final : public BasicTTIImplBase<NVPTXTTIImpl> {
   typedef BasicTTIImplBase<NVPTXTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 361b2ff223ea0..8618f3064c188 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -24,7 +24,7 @@
 
 namespace llvm {
 
-class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
+class PPCTTIImpl final : public BasicTTIImplBase<PPCTTIImpl> {
   typedef BasicTTIImplBase<PPCTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 0a784461d67bf..dd7e9f7709f8e 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -25,7 +25,7 @@
 
 namespace llvm {
 
-class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
+class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   using BaseT = BasicTTIImplBase<RISCVTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
index 3f211b5a8b168..40e561ba38881 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetTransformInfo.h
@@ -22,7 +22,7 @@
 #include "llvm/CodeGen/BasicTTIImpl.h"
 
 namespace llvm {
-class SPIRVTTIImpl : public BasicTTIImplBase<SPIRVTTIImpl> {
+class SPIRVTTIImpl final : public BasicTTIImplBase<SPIRVTTIImpl> {
   using BaseT = BasicTTIImplBase<SPIRVTTIImpl>;
   using TTI = TargetTransformInfo;
 
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index b4bc41974b70b..368a4af768b3e 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -15,7 +15,7 @@
 
 namespace llvm {
 
-class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
+class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
   typedef BasicTTIImplBase<SystemZTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/VE/VETargetTransformInfo.h b/llvm/lib/Target/VE/VETargetTransformInfo.h
index 5cb0286087821..5c0ddca62c761 100644
--- a/llvm/lib/Target/VE/VETargetTransformInfo.h
+++ b/llvm/lib/Target/VE/VETargetTransformInfo.h
@@ -49,7 +49,7 @@ static bool isVectorLaneType(llvm::Type &ElemTy) {
 
 namespace llvm {
 
-class VETTIImpl : public BasicTTIImplBase<VETTIImpl> {
+class VETTIImpl final : public BasicTTIImplBase<VETTIImpl> {
   using BaseT = BasicTTIImplBase<VETTIImpl>;
   friend BaseT;
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 72673d6fbd80f..8045f1b1d6637 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -25,7 +25,7 @@ namespace llvm {
 
 class InstCombiner;
 
-class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
+class X86TTIImpl final : public BasicTTIImplBase<X86TTIImpl> {
   typedef BasicTTIImplBase<X86TTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;
diff --git a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
index f2c10518109dc..cb809b992396a 100644
--- a/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
+++ b/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h
@@ -24,7 +24,7 @@
 
 namespace llvm {
 
-class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> {
+class XCoreTTIImpl final : public BasicTTIImplBase<XCoreTTIImpl> {
   typedef BasicTTIImplBase<XCoreTTIImpl> BaseT;
   typedef TargetTransformInfo TTI;
   friend BaseT;

From 89f692a24f6a13ae5cf9e37f91abe6f34c403258 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Sun, 15 Jun 2025 09:43:18 +0100
Subject: [PATCH 492/851] [GlobalISel] Split Legalizer debug ouput into
 paragraphs. NFC (#143427)

This helps keep the legalizer output easier to read, splitting each
instructions legalization into a separate block.
---
 llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 83ba71e4c9d49..028bffd1bf5a7 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -118,7 +118,7 @@ LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
 LegalizerHelper::LegalizeResult
 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
                                    LostDebugLocObserver &LocObserver) {
-  LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
+  LLVM_DEBUG(dbgs() << "\nLegalizing: " << MI);
 
   MIRBuilder.setInstrAndDebugLoc(MI);
 

From 147a4c7743c44af3537bae69dcf513153b03b00e Mon Sep 17 00:00:00 2001
From: Chris Apple <cja-private@pm.me>
Date: Sun, 15 Jun 2025 06:54:11 -0700
Subject: [PATCH 493/851] [rtsan] Fix issue where close test would lead to
 crash (#144017)

---
 .../tests/rtsan_test_interceptors_posix.cpp   | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
index c2d07400593d5..2ee35555c24de 100644
--- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp
@@ -449,12 +449,6 @@ TEST_F(RtsanFileTest, FcntlSetFdDiesWhenRealtime) {
   close(fd);
 }
 
-TEST(TestRtsanInterceptors, CloseDiesWhenRealtime) {
-  auto Func = []() { close(0); };
-  ExpectRealtimeDeath(Func, "close");
-  ExpectNonRealtimeSurvival(Func);
-}
-
 TEST(TestRtsanInterceptors, ChdirDiesWhenRealtime) {
   auto Func = []() { chdir("."); };
   ExpectRealtimeDeath(Func, "chdir");
@@ -606,8 +600,10 @@ class RtsanOpenedFileTest : public RtsanFileTest {
   }
 
   void TearDown() override {
-    if (file != nullptr)
+    const bool is_open = fcntl(fd, F_GETFD) != -1;
+    if (is_open && file != nullptr)
       fclose(file);
+
     RtsanFileTest::TearDown();
   }
 
@@ -620,6 +616,16 @@ class RtsanOpenedFileTest : public RtsanFileTest {
   int fd = -1;
 };
 
+TEST_F(RtsanOpenedFileTest, CloseDiesWhenRealtime) {
+  auto Func = [this]() { close(GetOpenFd()); };
+  ExpectRealtimeDeath(Func, "close");
+}
+
+TEST_F(RtsanOpenedFileTest, CloseSurvivesWhenNotRealtime) {
+  auto Func = [this]() { close(GetOpenFd()); };
+  ExpectNonRealtimeSurvival(Func);
+}
+
 #if SANITIZER_INTERCEPT_FSEEK
 TEST_F(RtsanOpenedFileTest, FgetposDieWhenRealtime) {
   auto Func = [this]() {

From b983431c281a0acb9e446c7c9d72474f4d09e8e0 Mon Sep 17 00:00:00 2001
From: Chris Apple <cja-private@pm.me>
Date: Sun, 15 Jun 2025 06:55:22 -0700
Subject: [PATCH 494/851] [rtsan] Fix issue when intercepted function was not
 execve in test (#144018)

---
 compiler-rt/test/rtsan/fork_exec.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/test/rtsan/fork_exec.cpp b/compiler-rt/test/rtsan/fork_exec.cpp
index 3b2d2e5ca2f5d..5890a0936a2f7 100644
--- a/compiler-rt/test/rtsan/fork_exec.cpp
+++ b/compiler-rt/test/rtsan/fork_exec.cpp
@@ -45,7 +45,12 @@ int main() MAYBE_NONBLOCKING {
 }
 
 // CHECK-NOHALT: Intercepted call to {{.*}} `fork` {{.*}}
-// CHECK-NOHALT: Intercepted call to {{.*}} `execve` {{.*}}
+
+// We should also get some other intercepted call. On some systems this
+// is `execve`, on others, it's a lock to set up `execve`. In either
+// case, just check that we get a second intercepted call, don't sweat
+// the name.
+// CHECK-NOHALT: Intercepted call to {{.*}}
 
 // usleep checks that rtsan is still enabled in the parent process
 // See note in our interceptors file for why we don't look for `wait`

From 567647888ea3dd292827bbac445d316d6a6b0ecb Mon Sep 17 00:00:00 2001
From: Sirui Mu <msrlancern@gmail.com>
Date: Sun, 15 Jun 2025 23:00:16 +0800
Subject: [PATCH 495/851] [clang][bytecode] Avoid revisiting decomposition decl
 in visitDeclRef (#144226)

This simple patch removes the code to revisit `DecompositionDecl` in
`visitDeclRef`. The revisit will try to emit the initializer of the
`DecompositionDecl`, which could result in evaluation errors if the
`DecompositionDecl` is not within a constexpr context.
---
 clang/lib/AST/ByteCode/Compiler.cpp | 4 ----
 clang/test/AST/ByteCode/cxx17.cpp   | 8 ++++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index bf38b2e5d537d..9fe4803ce98ec 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -6591,10 +6591,6 @@ bool Compiler<Emitter>::visitDeclRef(const ValueDecl *D, const Expr *E) {
     return T->isReferenceType();
   };
 
-  // DecompositionDecls are just proxies for us.
-  if (isa<DecompositionDecl>(VD))
-    return revisit(VD);
-
   if ((VD->hasGlobalStorage() || VD->isStaticDataMember()) &&
       typeShouldBeVisited(VD->getType())) {
     if (const Expr *Init = VD->getAnyInitializer();
diff --git a/clang/test/AST/ByteCode/cxx17.cpp b/clang/test/AST/ByteCode/cxx17.cpp
index 08a40e0a92862..0cf3a4f666d63 100644
--- a/clang/test/AST/ByteCode/cxx17.cpp
+++ b/clang/test/AST/ByteCode/cxx17.cpp
@@ -141,3 +141,11 @@ template <int x> constexpr auto c() {
 }
 
 auto y = c<1>(); // both-note {{in instantiation of function template specialization 'c<1>' requested here}}
+
+namespace NonConstexprStructuredBinding {
+  void f1() {
+    int arr[2] = {};
+    auto [a, b] = arr;
+    static_assert(&a != &b);
+  }
+}

From 886174a835208ecd2d06b378d2094b10611030d5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sun, 15 Jun 2025 17:43:14 +0100
Subject: [PATCH 496/851] [X86] shuffle-blend.ll - regenerate test checks

---
 .../X86/avx512-shuffles/shuffle-blend.ll      | 35 +++++++------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
index 59e9fb1c4a9f3..78957d10301c9 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X86-AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X64-AVX512F
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X86-AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X64-AVX512BW
 
@@ -61,27 +61,16 @@ entry:
 }
 
 define <64 x i8> @addb_selectw_64xi8(<64 x i8> %t0, <64 x i8> %t1) {
-; X86-AVX512F-LABEL: addb_selectw_64xi8:
-; X86-AVX512F:       # %bb.0:
-; X86-AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; X86-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; X86-AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
-; X86-AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm3
-; X86-AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; X86-AVX512F-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
-; X86-AVX512F-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}, %zmm2, %zmm0
-; X86-AVX512F-NEXT:    retl
-;
-; X64-AVX512F-LABEL: addb_selectw_64xi8:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; X64-AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; X64-AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
-; X64-AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm3
-; X64-AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; X64-AVX512F-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
-; X64-AVX512F-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
-; X64-AVX512F-NEXT:    retq
+; AVX512F-LABEL: addb_selectw_64xi8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
+; AVX512F-NEXT:    ret{{[l|q]}}
 ;
 ; X86-AVX512BW-LABEL: addb_selectw_64xi8:
 ; X86-AVX512BW:       # %bb.0:

From 2669664605d00e1b3a9c479545b95a6844786d0c Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 10:32:30 -0700
Subject: [PATCH 497/851] [modularize] Use range-based for loops (NFC)
 (#144244)

---
 clang-tools-extra/modularize/CoverageChecker.cpp     | 6 ++----
 clang-tools-extra/modularize/Modularize.cpp          | 4 ++--
 clang-tools-extra/modularize/ModularizeUtilities.cpp | 3 +--
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp
index fe6711398ab7d..1345a6ef8f489 100644
--- a/clang-tools-extra/modularize/CoverageChecker.cpp
+++ b/clang-tools-extra/modularize/CoverageChecker.cpp
@@ -329,10 +329,8 @@ bool CoverageChecker::collectFileSystemHeaders() {
   else {
     // Otherwise we only look at the sub-trees specified by the
     // include paths.
-    for (std::vector<std::string>::const_iterator I = IncludePaths.begin(),
-      E = IncludePaths.end();
-      I != E; ++I) {
-      if (!collectFileSystemHeaders(*I))
+    for (const std::string &IncludePath : IncludePaths) {
+      if (!collectFileSystemHeaders(IncludePath))
         return false;
     }
   }
diff --git a/clang-tools-extra/modularize/Modularize.cpp b/clang-tools-extra/modularize/Modularize.cpp
index 7f8a19280b111..2a90c5e3f6782 100644
--- a/clang-tools-extra/modularize/Modularize.cpp
+++ b/clang-tools-extra/modularize/Modularize.cpp
@@ -339,8 +339,8 @@ static std::string findInputFile(const CommandLineArguments &CLArgs) {
   llvm::opt::Visibility VisibilityMask(options::CC1Option);
   unsigned MissingArgIndex, MissingArgCount;
   SmallVector<const char *, 256> Argv;
-  for (auto I = CLArgs.begin(), E = CLArgs.end(); I != E; ++I)
-    Argv.push_back(I->c_str());
+  for (const std::string &CLArg : CLArgs)
+    Argv.push_back(CLArg.c_str());
   InputArgList Args = getDriverOptTable().ParseArgs(
       Argv, MissingArgIndex, MissingArgCount, VisibilityMask);
   std::vector<std::string> Inputs = Args.getAllArgValues(OPT_INPUT);
diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp
index 9ad1731915a8b..8a24f21d658df 100644
--- a/clang-tools-extra/modularize/ModularizeUtilities.cpp
+++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp
@@ -69,8 +69,7 @@ ModularizeUtilities *ModularizeUtilities::createModularizeUtilities(
 // Load all header lists and dependencies.
 std::error_code ModularizeUtilities::loadAllHeaderListsAndDependencies() {
   // For each input file.
-  for (auto I = InputFilePaths.begin(), E = InputFilePaths.end(); I != E; ++I) {
-    llvm::StringRef InputPath = *I;
+  for (llvm::StringRef InputPath : InputFilePaths) {
     // If it's a module map.
     if (InputPath.ends_with(".modulemap")) {
       // Load the module map.

From fef5df9d843745b2c4ed163911ed1305028350ca Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 10:32:37 -0700
Subject: [PATCH 498/851] [TableGen] Use range-based for loops (NFC) (#144250)

---
 .../utils/TableGen/ClangCommentCommandInfoEmitter.cpp | 11 +++++------
 ...ClangCommentHTMLNamedCharacterReferenceEmitter.cpp |  4 ++--
 clang/utils/TableGen/ClangDiagnosticsEmitter.cpp      |  4 ++--
 clang/utils/TableGen/ClangOpcodesEmitter.cpp          |  3 +--
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
index f15e30cd3f8f4..161dd425fbc7b 100644
--- a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
@@ -78,10 +78,10 @@ void clang::EmitClangCommentCommandInfo(const RecordKeeper &Records,
 
 static std::string MangleName(StringRef Str) {
   std::string Mangled;
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    switch (Str[i]) {
+  for (char C : Str) {
+    switch (C) {
     default:
-      Mangled += Str[i];
+      Mangled += C;
       break;
     case '(':
       Mangled += "lparen";
@@ -122,9 +122,8 @@ void clang::EmitClangCommentCommandList(const RecordKeeper &Records,
      << "#endif\n";
 
   ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Command");
-  for (size_t i = 0, e = Tags.size(); i != e; ++i) {
-    const Record &Tag = *Tags[i];
-    std::string MangledName = MangleName(Tag.getValueAsString("Name"));
+  for (const Record *Tag : Tags) {
+    std::string MangledName = MangleName(Tag->getValueAsString("Name"));
 
     OS << "COMMENT_COMMAND(" << MangledName << ")\n";
   }
diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
index b8d8ac853a5c0..e5eec5e7ca8d4 100644
--- a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
@@ -37,9 +37,9 @@ static bool translateCodePointToUTF8(unsigned CodePoint,
 
   raw_svector_ostream OS(CLiteral);
   OS << "\"";
-  for (size_t i = 0, e = UTF8.size(); i != e; ++i) {
+  for (char C : UTF8) {
     OS << "\\x";
-    OS.write_hex(static_cast<unsigned char>(UTF8[i]));
+    OS.write_hex(static_cast<unsigned char>(C));
   }
   OS << "\"";
 
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
index e347b89a85d46..bfc60f485cd32 100644
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -1794,8 +1794,8 @@ static std::string getDiagCategoryEnum(StringRef name) {
   if (name.empty())
     return "DiagCat_None";
   SmallString<256> enumName = StringRef("DiagCat_");
-  for (StringRef::iterator I = name.begin(), E = name.end(); I != E; ++I)
-    enumName += isalnum(*I) ? *I : '_';
+  for (char C : name)
+    enumName += isalnum(C) ? C : '_';
   return std::string(enumName);
 }
 
diff --git a/clang/utils/TableGen/ClangOpcodesEmitter.cpp b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
index 5d6d90994cf37..9d0773e1aff8f 100644
--- a/clang/utils/TableGen/ClangOpcodesEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
@@ -224,8 +224,7 @@ void ClangOpcodesEmitter::EmitProto(raw_ostream &OS, StringRef N,
   auto Args = R->getValueAsListOfDefs("Args");
   Enumerate(R, N, [&OS, &Args](ArrayRef<const Record *> TS, const Twine &ID) {
     OS << "bool emit" << ID << "(";
-    for (size_t I = 0, N = Args.size(); I < N; ++I) {
-      const auto *Arg = Args[I];
+    for (const Record *Arg : Args) {
       bool AsRef = Arg->getValueAsBit("AsRef");
       auto Name = Arg->getValueAsString("Name");
 

From d78eec864c60729685487c884724f27edd53b3b8 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 10:32:45 -0700
Subject: [PATCH 499/851] [lld] Use range-based for loops (NFC) (#144251)

---
 lld/ELF/Arch/ARM.cpp            | 6 +++---
 lld/ELF/SyntheticSections.cpp   | 7 +++----
 lld/MachO/SyntheticSections.cpp | 4 +---
 lld/wasm/Driver.cpp             | 6 +++---
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index aa90fecc533e3..91a673f13d68e 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -1317,11 +1317,11 @@ void elf::processArmCmseSymbols(Ctx &ctx) {
   // with its corresponding special symbol __acle_se_<sym>.
   parallelForEach(ctx.objectFiles, [&](InputFile *file) {
     MutableArrayRef<Symbol *> syms = file->getMutableSymbols();
-    for (size_t i = 0, e = syms.size(); i != e; ++i) {
-      StringRef symName = syms[i]->getName();
+    for (Symbol *&sym : syms) {
+      StringRef symName = sym->getName();
       auto it = ctx.symtab->cmseSymMap.find(symName);
       if (it != ctx.symtab->cmseSymMap.end())
-        syms[i] = it->second.acleSeSym;
+        sym = it->second.acleSeSym;
     }
   });
 }
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
index 051e5cd04ef50..efec41a737b62 100644
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -4026,10 +4026,9 @@ void MergeNoTailSection::finalizeContents() {
   // So far, section pieces have offsets from beginning of shards, but
   // we want offsets from beginning of the whole section. Fix them.
   parallelForEach(sections, [&](MergeInputSection *sec) {
-    for (size_t i = 0, e = sec->pieces.size(); i != e; ++i)
-      if (sec->pieces[i].live)
-        sec->pieces[i].outputOff +=
-            shardOffsets[getShardId(sec->pieces[i].hash)];
+    for (SectionPiece &piece : sec->pieces)
+      if (piece.live)
+        piece.outputOff += shardOffsets[getShardId(piece.hash)];
   });
 }
 
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 0b7f233042487..979a4ee6d8133 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -947,9 +947,7 @@ uint64_t ObjCStubsSection::getSize() const {
 
 void ObjCStubsSection::writeTo(uint8_t *buf) const {
   uint64_t stubOffset = 0;
-  for (size_t i = 0, n = symbols.size(); i < n; ++i) {
-    Defined *sym = symbols[i];
-
+  for (Defined *sym : symbols) {
     auto methname = getMethname(sym);
     InputSection *selRef = ObjCSelRefsHelper::getSelRef(methname);
     assert(selRef != nullptr && "no selref for methname");
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index 2b1fb945f41c8..1c5d21c06f5af 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -1226,9 +1226,9 @@ static void wrapSymbols(ArrayRef<WrappedSymbol> wrapped) {
   // Update pointers in input files.
   parallelForEach(ctx.objectFiles, [&](InputFile *file) {
     MutableArrayRef<Symbol *> syms = file->getMutableSymbols();
-    for (size_t i = 0, e = syms.size(); i != e; ++i)
-      if (Symbol *s = map.lookup(syms[i]))
-        syms[i] = s;
+    for (Symbol *&sym : syms)
+      if (Symbol *s = map.lookup(sym))
+        sym = s;
   });
 
   // Update pointers in the symbol table.

From 8f5c338b89a22abc3191a0d931071c09630d6195 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 10:32:52 -0700
Subject: [PATCH 500/851] [Sema] Use a range-based for loop (NFC) (#144252)

Note that LLVM Coding Standards discourages for_each.
---
 clang/lib/Sema/SemaOverload.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 49e5a311e239e..8c5f81f126c7a 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -9272,11 +9272,10 @@ class BuiltinOperatorOverloadBuilder {
     /// the candidates into a unique set, then move from that set into the list
     /// of arithmetic types.
     llvm::SmallSetVector<CanQualType, 2> BitIntCandidates;
-    llvm::for_each(CandidateTypes, [&BitIntCandidates](
-                                       BuiltinCandidateTypeSet &Candidate) {
+    for (BuiltinCandidateTypeSet &Candidate : CandidateTypes) {
       for (QualType BitTy : Candidate.bitint_types())
         BitIntCandidates.insert(CanQualType::CreateUnsafe(BitTy));
-    });
+    }
     llvm::move(BitIntCandidates, std::back_inserter(ArithmeticTypes));
     LastPromotedIntegralType = ArithmeticTypes.size();
     LastPromotedArithmeticType = ArithmeticTypes.size();

From b16d43a874748a496da5cd774dd864c95b78d6b0 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 10:46:47 -0700
Subject: [PATCH 501/851] VE: Rename VEMCExpr::VK_ to VE::S_

Prepare for removing VEMCExpr. Adopt the newer naming convention adopted
by AMDGPU/WebAssembly.
---
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp  | 66 +++++++++----------
 .../VE/MCTargetDesc/VEELFObjectWriter.cpp     | 10 +--
 .../Target/VE/MCTargetDesc/VEMCAsmInfo.cpp    | 30 ++++-----
 .../VE/MCTargetDesc/VEMCCodeEmitter.cpp       |  2 +-
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp  | 32 ++++-----
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h    | 44 +++++++------
 llvm/lib/Target/VE/VEAsmPrinter.cpp           | 26 +++-----
 llvm/lib/Target/VE/VEISelLowering.cpp         | 49 +++++++-------
 8 files changed, 127 insertions(+), 132 deletions(-)

diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index a58ef127bbd5d..418587947e1ec 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -1042,7 +1042,7 @@ bool VEAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
 const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
                                             VEMCExpr::Specifier &Variant) {
   MCContext &Context = getParser().getContext();
-  Variant = VEMCExpr::VK_None;
+  Variant = VE::S_None;
 
   switch (E->getKind()) {
   case MCExpr::Target:
@@ -1055,51 +1055,51 @@ const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
 
     switch (SRE->getSpecifier()) {
-    case VEMCExpr::VK_None:
+    case VE::S_None:
       // Use VK_REFLONG to a symbol without modifiers.
-      Variant = VEMCExpr::VK_REFLONG;
+      Variant = VE::S_REFLONG;
       break;
-    case VEMCExpr::VK_HI32:
-      Variant = VEMCExpr::VK_HI32;
+    case VE::S_HI32:
+      Variant = VE::S_HI32;
       break;
-    case VEMCExpr::VK_LO32:
-      Variant = VEMCExpr::VK_LO32;
+    case VE::S_LO32:
+      Variant = VE::S_LO32;
       break;
-    case VEMCExpr::VK_PC_HI32:
-      Variant = VEMCExpr::VK_PC_HI32;
+    case VE::S_PC_HI32:
+      Variant = VE::S_PC_HI32;
       break;
-    case VEMCExpr::VK_PC_LO32:
-      Variant = VEMCExpr::VK_PC_LO32;
+    case VE::S_PC_LO32:
+      Variant = VE::S_PC_LO32;
       break;
-    case VEMCExpr::VK_GOT_HI32:
-      Variant = VEMCExpr::VK_GOT_HI32;
+    case VE::S_GOT_HI32:
+      Variant = VE::S_GOT_HI32;
       break;
-    case VEMCExpr::VK_GOT_LO32:
-      Variant = VEMCExpr::VK_GOT_LO32;
+    case VE::S_GOT_LO32:
+      Variant = VE::S_GOT_LO32;
       break;
-    case VEMCExpr::VK_GOTOFF_HI32:
-      Variant = VEMCExpr::VK_GOTOFF_HI32;
+    case VE::S_GOTOFF_HI32:
+      Variant = VE::S_GOTOFF_HI32;
       break;
-    case VEMCExpr::VK_GOTOFF_LO32:
-      Variant = VEMCExpr::VK_GOTOFF_LO32;
+    case VE::S_GOTOFF_LO32:
+      Variant = VE::S_GOTOFF_LO32;
       break;
-    case VEMCExpr::VK_PLT_HI32:
-      Variant = VEMCExpr::VK_PLT_HI32;
+    case VE::S_PLT_HI32:
+      Variant = VE::S_PLT_HI32;
       break;
-    case VEMCExpr::VK_PLT_LO32:
-      Variant = VEMCExpr::VK_PLT_LO32;
+    case VE::S_PLT_LO32:
+      Variant = VE::S_PLT_LO32;
       break;
-    case VEMCExpr::VK_TLS_GD_HI32:
-      Variant = VEMCExpr::VK_TLS_GD_HI32;
+    case VE::S_TLS_GD_HI32:
+      Variant = VE::S_TLS_GD_HI32;
       break;
-    case VEMCExpr::VK_TLS_GD_LO32:
-      Variant = VEMCExpr::VK_TLS_GD_LO32;
+    case VE::S_TLS_GD_LO32:
+      Variant = VE::S_TLS_GD_LO32;
       break;
-    case VEMCExpr::VK_TPOFF_HI32:
-      Variant = VEMCExpr::VK_TPOFF_HI32;
+    case VE::S_TPOFF_HI32:
+      Variant = VE::S_TPOFF_HI32;
       break;
-    case VEMCExpr::VK_TPOFF_LO32:
-      Variant = VEMCExpr::VK_TPOFF_LO32;
+    case VE::S_TPOFF_LO32:
+      Variant = VE::S_TPOFF_LO32;
       break;
     default:
       return nullptr;
@@ -1130,9 +1130,9 @@ const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
     if (!RHS)
       RHS = BE->getRHS();
 
-    if (LHSVariant == VEMCExpr::VK_None)
+    if (LHSVariant == VE::S_None)
       Variant = RHSVariant;
-    else if (RHSVariant == VEMCExpr::VK_None)
+    else if (RHSVariant == VE::S_None)
       Variant = LHSVariant;
     else if (LHSVariant == RHSVariant)
       Variant = LHSVariant;
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
index e707bb2fe3e1d..bdedde505295f 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -40,10 +40,10 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                          const MCValue &Target,
                                          bool IsPCRel) const {
   switch (Target.getSpecifier()) {
-  case VEMCExpr::VK_TLS_GD_HI32:
-  case VEMCExpr::VK_TLS_GD_LO32:
-  case VEMCExpr::VK_TPOFF_HI32:
-  case VEMCExpr::VK_TPOFF_LO32:
+  case VE::S_TLS_GD_HI32:
+  case VE::S_TLS_GD_LO32:
+  case VE::S_TPOFF_HI32:
+  case VE::S_TPOFF_LO32:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -51,7 +51,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
     break;
   }
   if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Fixup.getValue())) {
-    if (SExpr->getSpecifier() == VEMCExpr::VK_PC_LO32)
+    if (SExpr->getSpecifier() == VE::S_PC_LO32)
       return ELF::R_VE_PC_LO32;
   }
 
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
index fdde46f09d5b1..ac580f79a77b0 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
@@ -19,20 +19,20 @@
 using namespace llvm;
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {VEMCExpr::VK_HI32, "hi"},
-    {VEMCExpr::VK_LO32, "lo"},
-    {VEMCExpr::VK_PC_HI32, "pc_hi"},
-    {VEMCExpr::VK_PC_LO32, "pc_lo"},
-    {VEMCExpr::VK_GOT_HI32, "got_hi"},
-    {VEMCExpr::VK_GOT_LO32, "got_lo"},
-    {VEMCExpr::VK_GOTOFF_HI32, "gotoff_hi"},
-    {VEMCExpr::VK_GOTOFF_LO32, "gotoff_lo"},
-    {VEMCExpr::VK_PLT_HI32, "plt_hi"},
-    {VEMCExpr::VK_PLT_LO32, "plt_lo"},
-    {VEMCExpr::VK_TLS_GD_HI32, "tls_gd_hi"},
-    {VEMCExpr::VK_TLS_GD_LO32, "tls_gd_lo"},
-    {VEMCExpr::VK_TPOFF_HI32, "tpoff_hi"},
-    {VEMCExpr::VK_TPOFF_LO32, "tpoff_lo"},
+    {VE::S_HI32, "hi"},
+    {VE::S_LO32, "lo"},
+    {VE::S_PC_HI32, "pc_hi"},
+    {VE::S_PC_LO32, "pc_lo"},
+    {VE::S_GOT_HI32, "got_hi"},
+    {VE::S_GOT_LO32, "got_lo"},
+    {VE::S_GOTOFF_HI32, "gotoff_hi"},
+    {VE::S_GOTOFF_LO32, "gotoff_lo"},
+    {VE::S_PLT_HI32, "plt_hi"},
+    {VE::S_PLT_LO32, "plt_lo"},
+    {VE::S_TLS_GD_HI32, "tls_gd_hi"},
+    {VE::S_TLS_GD_LO32, "tls_gd_lo"},
+    {VE::S_TPOFF_HI32, "tpoff_hi"},
+    {VE::S_TPOFF_LO32, "tpoff_lo"},
 };
 
 void VEELFMCAsmInfo::anchor() {}
@@ -61,6 +61,6 @@ void VEELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
                                         const MCSpecifierExpr &Expr) const {
   printExpr(OS, *Expr.getSubExpr());
   auto specifier = Expr.getSpecifier();
-  if (specifier && specifier != VEMCExpr::VK_REFLONG)
+  if (specifier && specifier != VE::S_REFLONG)
     OS << '@' << getSpecifierName(specifier);
 }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
index 7dece1b309a96..c3fae1a0c77d4 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
@@ -99,7 +99,7 @@ unsigned VEMCCodeEmitter::getMachineOpValue(const MCInst &MI,
 
   const MCExpr *Expr = MO.getExpr();
   if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Expr)) {
-    auto Kind = VEMCExpr::getFixupKind(SExpr->getSpecifier());
+    auto Kind = VE::getFixupKind(SExpr->getSpecifier());
     Fixups.push_back(MCFixup::create(0, Expr, Kind));
     return 0;
   }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
index fa4d9b18a9ad9..ed0eafc75888f 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
@@ -27,39 +27,39 @@ const VEMCExpr *VEMCExpr::create(Specifier S, const MCExpr *Expr,
   return new (Ctx) VEMCExpr(Expr, S);
 }
 
-VE::Fixups VEMCExpr::getFixupKind(MCSpecifierExpr::Spec S) {
+VE::Fixups VE::getFixupKind(uint8_t S) {
   switch (S) {
   default:
     llvm_unreachable("Unhandled VEMCExpr::Specifier");
-  case VK_REFLONG:
+  case VE::S_REFLONG:
     return VE::fixup_ve_reflong;
-  case VK_HI32:
+  case VE::S_HI32:
     return VE::fixup_ve_hi32;
-  case VK_LO32:
+  case VE::S_LO32:
     return VE::fixup_ve_lo32;
-  case VK_PC_HI32:
+  case VE::S_PC_HI32:
     return VE::fixup_ve_pc_hi32;
-  case VK_PC_LO32:
+  case VE::S_PC_LO32:
     return VE::fixup_ve_pc_lo32;
-  case VK_GOT_HI32:
+  case VE::S_GOT_HI32:
     return VE::fixup_ve_got_hi32;
-  case VK_GOT_LO32:
+  case VE::S_GOT_LO32:
     return VE::fixup_ve_got_lo32;
-  case VK_GOTOFF_HI32:
+  case VE::S_GOTOFF_HI32:
     return VE::fixup_ve_gotoff_hi32;
-  case VK_GOTOFF_LO32:
+  case VE::S_GOTOFF_LO32:
     return VE::fixup_ve_gotoff_lo32;
-  case VK_PLT_HI32:
+  case VE::S_PLT_HI32:
     return VE::fixup_ve_plt_hi32;
-  case VK_PLT_LO32:
+  case VE::S_PLT_LO32:
     return VE::fixup_ve_plt_lo32;
-  case VK_TLS_GD_HI32:
+  case VE::S_TLS_GD_HI32:
     return VE::fixup_ve_tls_gd_hi32;
-  case VK_TLS_GD_LO32:
+  case VE::S_TLS_GD_LO32:
     return VE::fixup_ve_tls_gd_lo32;
-  case VK_TPOFF_HI32:
+  case VE::S_TPOFF_HI32:
     return VE::fixup_ve_tpoff_hi32;
-  case VK_TPOFF_LO32:
+  case VE::S_TPOFF_LO32:
     return VE::fixup_ve_tpoff_lo32;
   }
 }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
index 4d191149d4aa0..d4e0f77c8ece8 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
@@ -22,25 +22,7 @@ namespace llvm {
 class StringRef;
 class VEMCExpr : public MCSpecifierExpr {
 public:
-  enum Specifier {
-    VK_None,
-
-    VK_REFLONG = MCSymbolRefExpr::FirstTargetSpecifier,
-    VK_HI32,        // @hi
-    VK_LO32,        // @lo
-    VK_PC_HI32,     // @pc_hi
-    VK_PC_LO32,     // @pc_lo
-    VK_GOT_HI32,    // @got_hi
-    VK_GOT_LO32,    // @got_lo
-    VK_GOTOFF_HI32, // @gotoff_hi
-    VK_GOTOFF_LO32, // @gotoff_lo
-    VK_PLT_HI32,    // @plt_hi
-    VK_PLT_LO32,    // @plt_lo
-    VK_TLS_GD_HI32, // @tls_gd_hi
-    VK_TLS_GD_LO32, // @tls_gd_lo
-    VK_TPOFF_HI32,  // @tpoff_hi
-    VK_TPOFF_LO32,  // @tpoff_lo
-  };
+  using Specifier = uint8_t;
 
 private:
   explicit VEMCExpr(const MCExpr *Expr, Specifier S)
@@ -52,10 +34,32 @@ class VEMCExpr : public MCSpecifierExpr {
 
   bool evaluateAsRelocatableImpl(MCValue &Res,
                                  const MCAssembler *Asm) const override;
+};
+
+namespace VE {
+enum Specifier {
+  S_None,
 
-  static VE::Fixups getFixupKind(Spec S);
+  S_REFLONG = MCSymbolRefExpr::FirstTargetSpecifier,
+  S_HI32,        // @hi
+  S_LO32,        // @lo
+  S_PC_HI32,     // @pc_hi
+  S_PC_LO32,     // @pc_lo
+  S_GOT_HI32,    // @got_hi
+  S_GOT_LO32,    // @got_lo
+  S_GOTOFF_HI32, // @gotoff_hi
+  S_GOTOFF_LO32, // @gotoff_lo
+  S_PLT_HI32,    // @plt_hi
+  S_PLT_LO32,    // @plt_lo
+  S_TLS_GD_HI32, // @tls_gd_hi
+  S_TLS_GD_LO32, // @tls_gd_lo
+  S_TPOFF_HI32,  // @tpoff_hi
+  S_TPOFF_LO32,  // @tpoff_lo
 };
 
+VE::Fixups getFixupKind(uint8_t S);
+} // namespace VE
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index ee347cda05217..f0d6f52268544 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -194,8 +194,8 @@ void VEAsmPrinter::lowerGETGOTAndEmitMCInsts(const MachineInstr *MI,
     case CodeModel::Small:
     case CodeModel::Medium:
     case CodeModel::Large:
-      emitHiLo(*OutStreamer, GOTLabel, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32,
-               MCRegOP, OutContext, STI);
+      emitHiLo(*OutStreamer, GOTLabel, VE::S_HI32, VE::S_LO32, MCRegOP,
+               OutContext, STI);
       break;
     }
     return;
@@ -209,14 +209,12 @@ void VEAsmPrinter::lowerGETGOTAndEmitMCInsts(const MachineInstr *MI,
   // sic %plt
   // lea.sl %got, _GLOBAL_OFFSET_TABLE_@PC_HI(%plt, %got)
   MCOperand cim24 = MCOperand::createImm(-24);
-  MCOperand loImm =
-      createGOTRelExprOp(VEMCExpr::VK_PC_LO32, GOTLabel, OutContext);
+  MCOperand loImm = createGOTRelExprOp(VE::S_PC_LO32, GOTLabel, OutContext);
   emitLEAzii(*OutStreamer, cim24, loImm, MCRegOP, STI);
   MCOperand M032 = MCOperand::createImm(M0(32));
   emitANDrm(*OutStreamer, MCRegOP, M032, MCRegOP, STI);
   emitSIC(*OutStreamer, RegPLT, STI);
-  MCOperand hiImm =
-      createGOTRelExprOp(VEMCExpr::VK_PC_HI32, GOTLabel, OutContext);
+  MCOperand hiImm = createGOTRelExprOp(VE::S_PC_HI32, GOTLabel, OutContext);
   emitLEASLrri(*OutStreamer, RegGOT, RegPLT, hiImm, MCRegOP, STI);
 }
 
@@ -257,14 +255,12 @@ void VEAsmPrinter::lowerGETFunPLTAndEmitMCInsts(const MachineInstr *MI,
   // sic %plt                            ; FIXME: is it safe to use %plt here?
   // lea.sl %dst, func@plt_hi(%plt, %dst)
   MCOperand cim24 = MCOperand::createImm(-24);
-  MCOperand loImm =
-      createGOTRelExprOp(VEMCExpr::VK_PLT_LO32, AddrSym, OutContext);
+  MCOperand loImm = createGOTRelExprOp(VE::S_PLT_LO32, AddrSym, OutContext);
   emitLEAzii(*OutStreamer, cim24, loImm, MCRegOP, STI);
   MCOperand M032 = MCOperand::createImm(M0(32));
   emitANDrm(*OutStreamer, MCRegOP, M032, MCRegOP, STI);
   emitSIC(*OutStreamer, RegPLT, STI);
-  MCOperand hiImm =
-      createGOTRelExprOp(VEMCExpr::VK_PLT_HI32, AddrSym, OutContext);
+  MCOperand hiImm = createGOTRelExprOp(VE::S_PLT_HI32, AddrSym, OutContext);
   emitLEASLrri(*OutStreamer, MCRegOP, RegPLT, hiImm, MCRegOP, STI);
 }
 
@@ -305,22 +301,20 @@ void VEAsmPrinter::lowerGETTLSAddrAndEmitMCInsts(const MachineInstr *MI,
   // lea.sl %s12, __tls_get_addr@plt_hi(%s12, %lr)
   // bsic %lr, (, %s12)
   MCOperand cim24 = MCOperand::createImm(-24);
-  MCOperand loImm =
-      createGOTRelExprOp(VEMCExpr::VK_TLS_GD_LO32, AddrSym, OutContext);
+  MCOperand loImm = createGOTRelExprOp(VE::S_TLS_GD_LO32, AddrSym, OutContext);
   emitLEAzii(*OutStreamer, cim24, loImm, RegS0, STI);
   MCOperand M032 = MCOperand::createImm(M0(32));
   emitANDrm(*OutStreamer, RegS0, M032, RegS0, STI);
   emitSIC(*OutStreamer, RegLR, STI);
-  MCOperand hiImm =
-      createGOTRelExprOp(VEMCExpr::VK_TLS_GD_HI32, AddrSym, OutContext);
+  MCOperand hiImm = createGOTRelExprOp(VE::S_TLS_GD_HI32, AddrSym, OutContext);
   emitLEASLrri(*OutStreamer, RegS0, RegLR, hiImm, RegS0, STI);
   MCOperand ci8 = MCOperand::createImm(8);
   MCOperand loImm2 =
-      createGOTRelExprOp(VEMCExpr::VK_PLT_LO32, GetTLSLabel, OutContext);
+      createGOTRelExprOp(VE::S_PLT_LO32, GetTLSLabel, OutContext);
   emitLEAzii(*OutStreamer, ci8, loImm2, RegS12, STI);
   emitANDrm(*OutStreamer, RegS12, M032, RegS12, STI);
   MCOperand hiImm2 =
-      createGOTRelExprOp(VEMCExpr::VK_PLT_HI32, GetTLSLabel, OutContext);
+      createGOTRelExprOp(VE::S_PLT_HI32, GetTLSLabel, OutContext);
   emitLEASLrri(*OutStreamer, RegS12, RegLR, hiImm2, RegS12, STI);
   emitBSIC(*OutStreamer, RegLR, RegS12, STI);
 }
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 313c894cafa85..b5a0d26abbf8e 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -664,7 +664,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
-      Callee = makeHiLoPair(Callee, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32, DAG);
+      Callee = makeHiLoPair(Callee, VE::S_HI32, VE::S_LO32, DAG);
     }
   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     if (IsPICCall) {
@@ -673,7 +673,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
     } else {
-      Callee = makeHiLoPair(Callee, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32, DAG);
+      Callee = makeHiLoPair(Callee, VE::S_HI32, VE::S_LO32, DAG);
     }
   }
 
@@ -1020,8 +1020,8 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
       //     lea %reg, label@gotoff_lo
       //     and %reg, %reg, (32)0
       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
-      SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_GOTOFF_HI32,
-                                  VEMCExpr::VK_GOTOFF_LO32, DAG);
+      SDValue HiLo =
+          makeHiLoPair(Op, VE::S_GOTOFF_HI32, VE::S_GOTOFF_LO32, DAG);
       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
     }
@@ -1030,8 +1030,7 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
     //     and %reg, %reg, (32)0
     //     lea.sl %reg, label@got_hi(%reg)
     //     ld %reg, (%reg, %got)
-    SDValue HiLo =
-        makeHiLoPair(Op, VEMCExpr::VK_GOT_HI32, VEMCExpr::VK_GOT_LO32, DAG);
+    SDValue HiLo = makeHiLoPair(Op, VE::S_GOT_HI32, VE::S_GOT_LO32, DAG);
     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
@@ -1046,7 +1045,7 @@ SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
   case CodeModel::Medium:
   case CodeModel::Large:
     // abs64.
-    return makeHiLoPair(Op, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32, DAG);
+    return makeHiLoPair(Op, VE::S_HI32, VE::S_LO32, DAG);
   }
 }
 
@@ -1782,12 +1781,11 @@ SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
     SDValue Addr =
         DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
     if (isPositionIndependent()) {
-      Addr = makeHiLoPair(Addr, VEMCExpr::VK_GOTOFF_HI32,
-                          VEMCExpr::VK_GOTOFF_LO32, DAG);
+      Addr = makeHiLoPair(Addr, VE::S_GOTOFF_HI32, VE::S_GOTOFF_LO32, DAG);
       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
       return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
     }
-    return makeHiLoPair(Addr, VEMCExpr::VK_HI32, VEMCExpr::VK_LO32, DAG);
+    return makeHiLoPair(Addr, VE::S_HI32, VE::S_LO32, DAG);
   }
   }
 }
@@ -2011,8 +2009,7 @@ SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
   // In order to do so, we need to genarate correctly marked DAG node using
   // makeHiLoPair.
   SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
-  SDValue HiLo =
-      makeHiLoPair(Op, VEMCExpr::VK_GOTOFF_HI32, VEMCExpr::VK_GOTOFF_LO32, DAG);
+  SDValue HiLo = makeHiLoPair(Op, VE::S_GOTOFF_HI32, VE::S_GOTOFF_LO32, DAG);
   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
   return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
 }
@@ -2038,14 +2035,14 @@ Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addMBB(TargetBB, VEMCExpr::VK_GOTOFF_LO32);
+        .addMBB(TargetBB, VE::S_GOTOFF_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
         .addReg(VE::SX15)
         .addReg(Tmp2, getKillRegState(true))
-        .addMBB(TargetBB, VEMCExpr::VK_GOTOFF_HI32);
+        .addMBB(TargetBB, VE::S_GOTOFF_HI32);
   } else {
     // Create following instructions for non-PIC code.
     //     lea     %Tmp1, TargetBB@lo
@@ -2054,14 +2051,14 @@ Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addMBB(TargetBB, VEMCExpr::VK_LO32);
+        .addMBB(TargetBB, VE::S_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
-        .addMBB(TargetBB, VEMCExpr::VK_HI32);
+        .addMBB(TargetBB, VE::S_HI32);
   }
   return Result;
 }
@@ -2099,14 +2096,14 @@ Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
           .addImm(0)
           .addImm(0)
-          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_GOTOFF_LO32);
+          .addExternalSymbol(Symbol.data(), VE::S_GOTOFF_LO32);
       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
           .addReg(Tmp1, getKillRegState(true))
           .addImm(M0(32));
       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
           .addReg(VE::SX15)
           .addReg(Tmp2, getKillRegState(true))
-          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_GOTOFF_HI32);
+          .addExternalSymbol(Symbol.data(), VE::S_GOTOFF_HI32);
     } else {
       Register Tmp1 = MRI.createVirtualRegister(RC);
       Register Tmp2 = MRI.createVirtualRegister(RC);
@@ -2119,14 +2116,14 @@ Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
           .addImm(0)
           .addImm(0)
-          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_GOT_LO32);
+          .addExternalSymbol(Symbol.data(), VE::S_GOT_LO32);
       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
           .addReg(Tmp1, getKillRegState(true))
           .addImm(M0(32));
       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
           .addReg(VE::SX15)
           .addReg(Tmp2, getKillRegState(true))
-          .addExternalSymbol(Symbol.data(), VEMCExpr::VK_GOT_HI32);
+          .addExternalSymbol(Symbol.data(), VE::S_GOT_HI32);
       BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
           .addReg(Tmp3, getKillRegState(true))
           .addImm(0)
@@ -2142,14 +2139,14 @@ Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addExternalSymbol(Symbol.data(), VEMCExpr::VK_LO32);
+        .addExternalSymbol(Symbol.data(), VE::S_LO32);
     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
-        .addExternalSymbol(Symbol.data(), VEMCExpr::VK_HI32);
+        .addExternalSymbol(Symbol.data(), VE::S_HI32);
   }
   return Result;
 }
@@ -2528,14 +2525,14 @@ VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addJumpTableIndex(MJTI, VEMCExpr::VK_GOTOFF_LO32);
+        .addJumpTableIndex(MJTI, VE::S_GOTOFF_LO32);
     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
         .addReg(VE::SX15)
         .addReg(Tmp2, getKillRegState(true))
-        .addJumpTableIndex(MJTI, VEMCExpr::VK_GOTOFF_HI32);
+        .addJumpTableIndex(MJTI, VE::S_GOTOFF_HI32);
   } else {
     // Create following instructions for non-PIC code.
     //     lea     %Tmp1, .LJTI0_0@lo
@@ -2544,14 +2541,14 @@ VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
         .addImm(0)
         .addImm(0)
-        .addJumpTableIndex(MJTI, VEMCExpr::VK_LO32);
+        .addJumpTableIndex(MJTI, VE::S_LO32);
     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
         .addReg(Tmp1, getKillRegState(true))
         .addImm(M0(32));
     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
         .addReg(Tmp2, getKillRegState(true))
         .addImm(0)
-        .addJumpTableIndex(MJTI, VEMCExpr::VK_HI32);
+        .addJumpTableIndex(MJTI, VE::S_HI32);
   }
 
   switch (JTE) {

From df54a2d9357fe7f56ca3c6fa2f07889449b50325 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 15 Jun 2025 19:31:30 +0100
Subject: [PATCH 502/851] [VPlan] Only skip induction phis in
 planContainsAdditionalSimps (NFC).

Skip induction phis when checking for simplifications, as they may not
be lowered directly be lowered to a corresponding PHI recipe. Reductions
and first-order recurrences will get lowered to phi recipes, unless they
are removed. Considering them for simplifications allows removing them
if there are no remaining users.

NFC as currently reduction and recurrence phis are not
simplified/removed if dead.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9b5ad16589539..eb04e2d5ca7b4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7061,7 +7061,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
   return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                     TheLoop](BasicBlock *BB) {
     return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
-      if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
+      // Skip induction phis when checking for simplifications, as they may not
+      // be lowered directly be lowered to a corresponding PHI recipe.
+      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
+          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
         return false;
       return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
     });

From 254a92d49a4c1e1f7f747b1c2f1ccbfd7f217880 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 11:41:33 -0700
Subject: [PATCH 503/851] MC: Add MCSpecifierExpr::create

as a target-agnostic implementation to replace target-specific
XXXMCExpr::create.
---
 llvm/include/llvm/MC/MCExpr.h                         | 10 +++++++---
 llvm/lib/MC/MCExpr.cpp                                | 10 ++++++++++
 llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp    | 10 +++++-----
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp |  4 ++--
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp    | 10 ----------
 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h      |  4 ----
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp             |  6 +++---
 llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp       |  2 +-
 8 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 1c72269e53e29..cd57fafc50b56 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -510,12 +510,16 @@ class LLVM_ABI MCSpecifierExpr : public MCExpr {
   // Target-specific relocation specifier code
   const Spec specifier;
 
-public:
-  explicit MCSpecifierExpr(const MCExpr *Expr, Spec S)
-      : MCExpr(Specifier, SMLoc()), Expr(Expr), specifier(S) {}
+  explicit MCSpecifierExpr(const MCExpr *Expr, Spec S, SMLoc Loc = SMLoc())
+      : MCExpr(Specifier, Loc), Expr(Expr), specifier(S) {}
   virtual ~MCSpecifierExpr() = default;
 
 public:
+  LLVM_ABI static const MCSpecifierExpr *
+  create(const MCExpr *Expr, Spec S, MCContext &Ctx, SMLoc Loc = SMLoc());
+  LLVM_ABI static const MCSpecifierExpr *
+  create(const MCSymbol *Sym, Spec S, MCContext &Ctx, SMLoc Loc = SMLoc());
+
   Spec getSpecifier() const { return specifier; }
   const MCExpr *getSubExpr() const { return Expr; }
 
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 2ae440cba46f9..e83ce05b37a89 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -737,6 +737,16 @@ MCFragment *MCExpr::findAssociatedFragment() const {
   llvm_unreachable("Invalid assembly expression kind!");
 }
 
+const MCSpecifierExpr *MCSpecifierExpr::create(const MCExpr *Expr, Spec S,
+                                               MCContext &Ctx, SMLoc Loc) {
+  return new (Ctx) MCSpecifierExpr(Expr, S, Loc);
+}
+
+const MCSpecifierExpr *MCSpecifierExpr::create(const MCSymbol *Sym, Spec S,
+                                               MCContext &Ctx, SMLoc Loc) {
+  return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S, Loc);
+}
+
 bool MCSpecifierExpr::evaluateAsRelocatableImpl(MCValue &Res,
                                                 const MCAssembler *Asm) const {
   if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 187ecbaad4bb2..90aacacd8ed2d 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -848,14 +848,14 @@ bool SparcAsmParser::expandSETX(MCInst &Inst, SMLoc IDLoc,
   // sethi %hh(val), tmp
   Instructions.push_back(MCInstBuilder(SP::SETHIi)
                              .addReg(MCTmpOp.getReg())
-                             .addExpr(Sparc::createSpecifierExpr(
-                                 getContext(), ValExpr, ELF::R_SPARC_HH22)));
+                             .addExpr(MCSpecifierExpr::create(
+                                 ValExpr, ELF::R_SPARC_HH22, getContext())));
   // or    tmp, %hm(val), tmp
   Instructions.push_back(MCInstBuilder(SP::ORri)
                              .addReg(MCTmpOp.getReg())
                              .addReg(MCTmpOp.getReg())
-                             .addExpr(Sparc::createSpecifierExpr(
-                                 getContext(), ValExpr, ELF::R_SPARC_HM10)));
+                             .addExpr(MCSpecifierExpr::create(
+                                 ValExpr, ELF::R_SPARC_HM10, getContext())));
   // sllx  tmp, 32, tmp
   Instructions.push_back(MCInstBuilder(SP::SLLXri)
                              .addReg(MCTmpOp.getReg())
@@ -1689,7 +1689,7 @@ const SparcMCExpr *SparcAsmParser::adjustPICRelocation(uint16_t RelType,
     }
   }
 
-  return Sparc::createSpecifierExpr(getContext(), subExpr, RelType);
+  return MCSpecifierExpr::create(subExpr, RelType, getContext());
 }
 
 bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 4156780e962dc..800567bf58ffa 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -50,7 +50,7 @@ SparcELFMCAsmInfo::getExprForPersonalitySymbol(const MCSymbol *Sym,
                                                MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return Sparc::createSpecifierExpr(Ctx, Sym, ELF::R_SPARC_DISP32);
+    return MCSpecifierExpr::create(Sym, ELF::R_SPARC_DISP32, Ctx);
   }
 
   return MCAsmInfo::getExprForPersonalitySymbol(Sym, Encoding, Streamer);
@@ -62,7 +62,7 @@ SparcELFMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
                                        MCStreamer &Streamer) const {
   if (Encoding & dwarf::DW_EH_PE_pcrel) {
     MCContext &Ctx = Streamer.getContext();
-    return Sparc::createSpecifierExpr(Ctx, Sym, ELF::R_SPARC_DISP32);
+    return MCSpecifierExpr::create(Sym, ELF::R_SPARC_DISP32, Ctx);
   }
   return MCAsmInfo::getExprForFDESymbol(Sym, Encoding, Streamer);
 }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 6a08fa5c9f3f7..6d43b93713906 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -22,16 +22,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "sparcmcexpr"
 
-const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
-                                              const MCExpr *Expr, uint16_t S) {
-  return new (Ctx) MCSpecifierExpr(Expr, S);
-}
-
-const SparcMCExpr *Sparc::createSpecifierExpr(MCContext &Ctx,
-                                              const MCSymbol *Sym, uint16_t S) {
-  return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
-}
-
 StringRef Sparc::getSpecifierName(uint16_t S) {
   // clang-format off
   switch (uint16_t(S)) {
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index 78af9a8150200..8e7c173c70ccb 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -23,10 +23,6 @@ class StringRef;
 using SparcMCExpr = MCSpecifierExpr;
 
 namespace Sparc {
-const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCExpr *Expr,
-                                       uint16_t S);
-const SparcMCExpr *createSpecifierExpr(MCContext &Ctx, const MCSymbol *Sym,
-                                       uint16_t S);
 uint16_t parseSpecifier(StringRef name);
 StringRef getSpecifierName(uint16_t S);
 } // namespace Sparc
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index ffefdf97edab1..dab2de7d56c01 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -82,7 +82,7 @@ class SparcAsmPrinter : public AsmPrinter {
 static MCOperand createSparcMCOperand(uint16_t Kind, MCSymbol *Sym,
                                       MCContext &OutContext) {
   const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
-  const SparcMCExpr *expr = Sparc::createSpecifierExpr(OutContext, MCSym, Kind);
+  const SparcMCExpr *expr = MCSpecifierExpr::create(MCSym, Kind, OutContext);
   return MCOperand::createExpr(expr);
 }
 static MCOperand createPCXCallOP(MCSymbol *Label,
@@ -101,7 +101,7 @@ static MCOperand createPCXRelExprOp(uint16_t Spec, MCSymbol *GOTLabel,
 
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext);
   const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext);
-  const SparcMCExpr *expr = Sparc::createSpecifierExpr(OutContext, Add, Spec);
+  const SparcMCExpr *expr = MCSpecifierExpr::create(Add, Spec, OutContext);
   return MCOperand::createExpr(expr);
 }
 
@@ -302,7 +302,7 @@ MCOperand SparcAsmPrinter::lowerOperand(const MachineOperand &MO) const {
 
     const MCExpr *expr = MCSymbolRefExpr::create(Symbol, OutContext);
     if (RelType)
-      expr = Sparc::createSpecifierExpr(OutContext, expr, RelType);
+      expr = MCSpecifierExpr::create(expr, RelType, OutContext);
     return MCOperand::createExpr(expr);
   }
 
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index be11ea272ed1f..a42a67d91d848 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -39,7 +39,7 @@ const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
     }
 
     MCContext &Ctx = getContext();
-    return Sparc::createSpecifierExpr(Ctx, SSym, ELF::R_SPARC_DISP32);
+    return MCSpecifierExpr::create(SSym, ELF::R_SPARC_DISP32, Ctx);
   }
 
   return TargetLoweringObjectFileELF::getTTypeGlobalReference(GV, Encoding, TM,

From 72de33a406383cb8555234c40e7b31db593e164f Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 11:52:43 -0700
Subject: [PATCH 504/851] MC: Add MCAsmInfo::evaluateAsRelocatableImpl and
 replace VEMCExpr with MCSpecifierExpr

Expressions with specifier can only be folded during relocation
generatin. At parse time the `MCAssembler *` argument might be null, and
targets should not rely on the evaluateAsRelocatable result.

Therefore, we can move evaluateAsRelocatableImpl from MCSpecifierExpr to
MCAsmInfo, so that targets do not need to inherit from MCSpecifierExpr.
---
 llvm/include/llvm/MC/MCAsmInfo.h               |  4 ++++
 llvm/lib/MC/MCAsmInfo.cpp                      |  8 ++++++++
 llvm/lib/MC/MCExpr.cpp                         |  5 ++++-
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp   | 14 +++++++-------
 .../VE/MCTargetDesc/VEELFObjectWriter.cpp      |  2 +-
 .../lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp | 10 ++++++++++
 llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h  |  2 ++
 .../Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp |  2 +-
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp   | 13 -------------
 llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h     | 15 ---------------
 llvm/lib/Target/VE/VEAsmPrinter.cpp            | 18 ++++++++----------
 llvm/lib/Target/VE/VEMCInstLower.cpp           |  4 ++--
 12 files changed, 47 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index 1f2ea0cfaaff0..a7bf1b965bf2d 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -25,6 +25,7 @@
 
 namespace llvm {
 
+class MCAssembler;
 class MCContext;
 class MCCFIInstruction;
 class MCExpr;
@@ -33,6 +34,7 @@ class MCSection;
 class MCStreamer;
 class MCSubtargetInfo;
 class MCSymbol;
+class MCValue;
 class raw_ostream;
 
 namespace WinEH {
@@ -714,6 +716,8 @@ class LLVM_ABI MCAsmInfo {
 
   void printExpr(raw_ostream &, const MCExpr &) const;
   virtual void printSpecifierExpr(raw_ostream &, const MCSpecifierExpr &) const;
+  virtual bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &Res,
+                                         const MCAssembler *Asm) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index 13b077349a587..e8eaf4619df51 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -163,3 +163,11 @@ void MCAsmInfo::printSpecifierExpr(raw_ostream &OS,
   // migrate to MCAsmInfo::printSpecifierExpr.
   Expr.printImpl(OS, this);
 }
+
+bool MCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                          MCValue &Res,
+                                          const MCAssembler *Asm) const {
+  // TODO: Remove after all targets that use MCSpecifierExpr migrate to
+  // MCAsmInfo::evaluateAsRelocatableImpl.
+  return Expr.evaluateAsRelocatableImpl(Res, Asm);
+}
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index e83ce05b37a89..5ccad6d487973 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -680,7 +680,10 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
     return true;
   }
   case Specifier:
-    return cast<MCSpecifierExpr>(this)->evaluateAsRelocatableImpl(Res, Asm);
+    // Fold the expression during relocation generation. As parse time Asm might
+    // be null, and targets should not rely on the folding.
+    return Asm && Asm->getContext().getAsmInfo()->evaluateAsRelocatableImpl(
+                      cast<MCSpecifierExpr>(*this), Res, Asm);
   }
 
   llvm_unreachable("Invalid assembly expression kind!");
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index 418587947e1ec..c54ce40de45ff 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -73,7 +73,7 @@ class VEAsmParser : public MCTargetAsmParser {
   ParseStatus parseVEAsmOperand(std::unique_ptr<VEOperand> &Operand);
 
   // Helper function to parse expression with a symbol.
-  const MCExpr *extractSpecifier(const MCExpr *E, VEMCExpr::Specifier &Variant);
+  const MCExpr *extractSpecifier(const MCExpr *E, VE::Specifier &Variant);
   bool parseExpression(const MCExpr *&EVal);
 
   // Split the mnemonic stripping conditional code and quantifiers
@@ -1036,11 +1036,11 @@ bool VEAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
 /// Extract \code @lo32/@hi32/etc \endcode specifier from expression.
 /// Recursively scan the expression and check for VK_HI32/LO32/etc
 /// symbol variants.  If all symbols with modifier use the same
-/// variant, return the corresponding VEMCExpr::Specifier,
+/// variant, return the corresponding VE::Specifier,
 /// and a modified expression using the default symbol variant.
 /// Otherwise, return NULL.
 const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
-                                            VEMCExpr::Specifier &Variant) {
+                                            VE::Specifier &Variant) {
   MCContext &Context = getParser().getContext();
   Variant = VE::S_None;
 
@@ -1118,7 +1118,7 @@ const MCExpr *VEAsmParser::extractSpecifier(const MCExpr *E,
 
   case MCExpr::Binary: {
     const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
-    VEMCExpr::Specifier LHSVariant, RHSVariant;
+    VE::Specifier LHSVariant, RHSVariant;
     const MCExpr *LHS = extractSpecifier(BE->getLHS(), LHSVariant);
     const MCExpr *RHS = extractSpecifier(BE->getRHS(), RHSVariant);
 
@@ -1153,11 +1153,11 @@ bool VEAsmParser::parseExpression(const MCExpr *&EVal) {
   if (getParser().parseExpression(EVal))
     return true;
 
-  // Convert MCSymbolRefExpr with VK_* to MCExpr with VK_*.
-  VEMCExpr::Specifier Specifier;
+  // Convert MCSymbolRefExpr with specifier to MCSpecifierExpr.
+  VE::Specifier Specifier;
   const MCExpr *E = extractSpecifier(EVal, Specifier);
   if (E)
-    EVal = VEMCExpr::create(Specifier, E, getParser().getContext());
+    EVal = MCSpecifierExpr::create(E, Specifier, getParser().getContext());
 
   return false;
 }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
index bdedde505295f..0e3f5d18de07f 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -50,7 +50,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup,
   default:
     break;
   }
-  if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Fixup.getValue())) {
+  if (auto *SExpr = dyn_cast<MCSpecifierExpr>(Fixup.getValue())) {
     if (SExpr->getSpecifier() == VE::S_PC_LO32)
       return ELF::R_VE_PC_LO32;
   }
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
index ac580f79a77b0..8eb3aedd668e4 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
@@ -14,6 +14,7 @@
 #include "VEMCExpr.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -64,3 +65,12 @@ void VEELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
   if (specifier && specifier != VE::S_REFLONG)
     OS << '@' << getSpecifierName(specifier);
 }
+
+bool VEELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                               MCValue &Res,
+                                               const MCAssembler *Asm) const {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(Expr.getSpecifier());
+  return true;
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
index 444f422c7ec12..2d73c94e2113e 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h
@@ -26,6 +26,8 @@ class VEELFMCAsmInfo : public MCAsmInfoELF {
   explicit VEELFMCAsmInfo(const Triple &TheTriple);
   void printSpecifierExpr(raw_ostream &OS,
                           const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
index c3fae1a0c77d4..712de5accce51 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
@@ -98,7 +98,7 @@ unsigned VEMCCodeEmitter::getMachineOpValue(const MCInst &MI,
   assert(MO.isExpr());
 
   const MCExpr *Expr = MO.getExpr();
-  if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Expr)) {
+  if (const auto *SExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
     auto Kind = VE::getFixupKind(SExpr->getSpecifier());
     Fixups.push_back(MCFixup::create(0, Expr, Kind));
     return 0;
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
index ed0eafc75888f..ca13aba095e25 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
@@ -22,11 +22,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "vemcexpr"
 
-const VEMCExpr *VEMCExpr::create(Specifier S, const MCExpr *Expr,
-                                 MCContext &Ctx) {
-  return new (Ctx) VEMCExpr(Expr, S);
-}
-
 VE::Fixups VE::getFixupKind(uint8_t S) {
   switch (S) {
   default:
@@ -63,11 +58,3 @@ VE::Fixups VE::getFixupKind(uint8_t S) {
     return VE::fixup_ve_tpoff_lo32;
   }
 }
-
-bool VEMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                         const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(specifier);
-  return true;
-}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
index d4e0f77c8ece8..b7913513bd51e 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
@@ -20,21 +20,6 @@
 namespace llvm {
 
 class StringRef;
-class VEMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint8_t;
-
-private:
-  explicit VEMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const VEMCExpr *create(Specifier Kind, const MCExpr *Expr,
-                                MCContext &Ctx);
-
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-};
 
 namespace VE {
 enum Specifier {
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index f0d6f52268544..af0dc0404d3cc 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -67,18 +67,17 @@ class VEAsmPrinter : public AsmPrinter {
 };
 } // end of anonymous namespace
 
-static MCOperand createVEMCOperand(VEMCExpr::Specifier Kind, MCSymbol *Sym,
+static MCOperand createVEMCOperand(VE::Specifier Kind, MCSymbol *Sym,
                                    MCContext &OutContext) {
   const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
-  const VEMCExpr *expr = VEMCExpr::create(Kind, MCSym, OutContext);
-  return MCOperand::createExpr(expr);
+  return MCOperand::createExpr(
+      MCSpecifierExpr::create(MCSym, Kind, OutContext));
 }
 
-static MCOperand createGOTRelExprOp(VEMCExpr::Specifier Kind,
-                                    MCSymbol *GOTLabel, MCContext &OutContext) {
+static MCOperand createGOTRelExprOp(VE::Specifier Kind, MCSymbol *GOTLabel,
+                                    MCContext &OutContext) {
   const MCSymbolRefExpr *GOT = MCSymbolRefExpr::create(GOTLabel, OutContext);
-  const VEMCExpr *expr = VEMCExpr::create(Kind, GOT, OutContext);
-  return MCOperand::createExpr(expr);
+  return MCOperand::createExpr(MCSpecifierExpr::create(GOT, Kind, OutContext));
 }
 
 static void emitSIC(MCStreamer &OutStreamer, MCOperand &RD,
@@ -166,9 +165,8 @@ static void emitANDrm(MCStreamer &OutStreamer, MCOperand &RS1, MCOperand &Imm,
 }
 
 static void emitHiLo(MCStreamer &OutStreamer, MCSymbol *GOTSym,
-                     VEMCExpr::Specifier HiKind, VEMCExpr::Specifier LoKind,
-                     MCOperand &RD, MCContext &OutContext,
-                     const MCSubtargetInfo &STI) {
+                     VE::Specifier HiKind, VE::Specifier LoKind, MCOperand &RD,
+                     MCContext &OutContext, const MCSubtargetInfo &STI) {
 
   MCOperand hi = createVEMCOperand(HiKind, GOTSym, OutContext);
   MCOperand lo = createVEMCOperand(LoKind, GOTSym, OutContext);
diff --git a/llvm/lib/Target/VE/VEMCInstLower.cpp b/llvm/lib/Target/VE/VEMCInstLower.cpp
index bed71df3921cf..a438d8740cd09 100644
--- a/llvm/lib/Target/VE/VEMCInstLower.cpp
+++ b/llvm/lib/Target/VE/VEMCInstLower.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 static MCOperand LowerSymbolOperand(const MachineInstr *MI,
                                     const MachineOperand &MO,
                                     const MCSymbol *Symbol, AsmPrinter &AP) {
-  VEMCExpr::Specifier Kind = (VEMCExpr::Specifier)MO.getTargetFlags();
+  auto Kind = (VE::Specifier)MO.getTargetFlags();
 
   const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, AP.OutContext);
   // Add offset iff MO is not jump table info or machine basic block.
@@ -36,7 +36,7 @@ static MCOperand LowerSymbolOperand(const MachineInstr *MI,
     Expr = MCBinaryExpr::createAdd(
         Expr, MCConstantExpr::create(MO.getOffset(), AP.OutContext),
         AP.OutContext);
-  Expr = VEMCExpr::create(Kind, Expr, AP.OutContext);
+  Expr = MCSpecifierExpr::create(Expr, Kind, AP.OutContext);
   return MCOperand::createExpr(Expr);
 }
 

From 490d7bb89a029edd037ed5e46747d0085a649ee8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 12:18:46 -0700
Subject: [PATCH 505/851] Xtensa: Remove unneeded XtensaMCExpr::create calls

MCSpecifierExpr and its subclasses should only be used with the
relocation specifier is not zero.
---
 llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp | 12 +++---------
 llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp          |  1 -
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
index e0bbbc79b201b..1f6cfec8edf4e 100644
--- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
+++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
@@ -393,9 +393,7 @@ bool XtensaAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
   case Xtensa::L32R: {
     const MCSymbolRefExpr *OpExpr =
         static_cast<const MCSymbolRefExpr *>(Inst.getOperand(1).getExpr());
-    XtensaMCExpr::Specifier Kind = XtensaMCExpr::VK_None;
-    const MCExpr *NewOpExpr = XtensaMCExpr::create(OpExpr, Kind, getContext());
-    Inst.getOperand(1).setExpr(NewOpExpr);
+    Inst.getOperand(1).setExpr(OpExpr);
     break;
   }
   case Xtensa::MOVI: {
@@ -413,10 +411,8 @@ bool XtensaAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
         const MCExpr *Value = MCConstantExpr::create(ImmOp64, getContext());
         MCSymbol *Sym = getContext().createTempSymbol();
         const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
-        const MCExpr *OpExpr =
-            XtensaMCExpr::create(Expr, XtensaMCExpr::VK_None, getContext());
         TmpInst.addOperand(Inst.getOperand(0));
-        MCOperand Op1 = MCOperand::createExpr(OpExpr);
+        MCOperand Op1 = MCOperand::createExpr(Expr);
         TmpInst.addOperand(Op1);
         TS.emitLiteral(Sym, Value, true, IDLoc);
         Inst = TmpInst;
@@ -428,10 +424,8 @@ bool XtensaAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
       const MCExpr *Value = Inst.getOperand(1).getExpr();
       MCSymbol *Sym = getContext().createTempSymbol();
       const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
-      const MCExpr *OpExpr =
-          XtensaMCExpr::create(Expr, XtensaMCExpr::VK_None, getContext());
       TmpInst.addOperand(Inst.getOperand(0));
-      MCOperand Op1 = MCOperand::createExpr(OpExpr);
+      MCOperand Op1 = MCOperand::createExpr(Expr);
       TmpInst.addOperand(Op1);
       Inst = TmpInst;
       TS.emitLiteral(Sym, Value, true, IDLoc);
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
index 9182ea272befe..4f3a2e791a3ca 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
@@ -257,7 +257,6 @@ XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO,
   }
 
   const MCExpr *ME = MCSymbolRefExpr::create(Symbol, OutContext);
-  ME = XtensaMCExpr::create(ME, Kind, OutContext);
 
   if (Offset) {
     // Assume offset is never negative.

From cf9665dd2bcef3ff2f3e22d3f44e8603f4ba9577 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 12:32:10 -0700
Subject: [PATCH 506/851] Xtensa: Migrate to newer relocation specifier
 representation

* Rename specifier constants from XtensaMCExpr::Specifier::VK_ to
  Xtensa::S_, following Sparc and VE.
* Use MCAsmInfo::printSpecifierExpr instead of MCExpr::print.
* Remove unneeded XtensaMCExpr. Just use MCSpecifierExpr when a
  specifier is needed.
---
 .../Target/Xtensa/MCTargetDesc/CMakeLists.txt |  1 -
 .../Xtensa/MCTargetDesc/XtensaInstPrinter.cpp | 21 +++-----
 .../Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp   | 21 ++++++++
 .../Xtensa/MCTargetDesc/XtensaMCAsmInfo.h     |  3 ++
 .../Xtensa/MCTargetDesc/XtensaMCExpr.cpp      | 52 -------------------
 .../Target/Xtensa/MCTargetDesc/XtensaMCExpr.h | 21 ++------
 llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp   | 13 ++---
 7 files changed, 40 insertions(+), 92 deletions(-)
 delete mode 100644 llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp

diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Xtensa/MCTargetDesc/CMakeLists.txt
index dc12863394c7a..6c5a6bef5e242 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/CMakeLists.txt
@@ -4,7 +4,6 @@ add_llvm_component_library(LLVMXtensaDesc
   XtensaInstPrinter.cpp
   XtensaMCAsmInfo.cpp
   XtensaMCCodeEmitter.cpp
-  XtensaMCExpr.cpp
   XtensaMCTargetDesc.cpp
   XtensaTargetStreamer.cpp
 
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
index fc5e1780de2e5..408a6ac01de9e 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
@@ -15,6 +15,7 @@
 #include "XtensaInstPrinter.h"
 #include "MCTargetDesc/XtensaMCExpr.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegister.h"
@@ -35,14 +36,7 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
   if (!(SRE = cast<MCSymbolRefExpr>(Expr)))
     assert(false && "Unexpected MCExpr type.");
 
-  auto Spec = XtensaMCExpr::Specifier(SRE->getKind());
-  switch (Spec) {
-  case XtensaMCExpr::VK_None:
-    break;
-  // TODO
-  default:
-    report_fatal_error("Invalid kind!");
-  }
+  assert(SRE->getSpecifier() == 0);
 
   OS << SRE->getSymbol();
 
@@ -51,9 +45,6 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
       OS << '+';
     OS << Offset;
   }
-
-  if (Spec != XtensaMCExpr::VK_None)
-    OS << ')';
 }
 
 void XtensaInstPrinter::printOperand(const MCOperand &MC, raw_ostream &O) {
@@ -97,7 +88,7 @@ void XtensaInstPrinter::printBranchTarget(const MCInst *MI, uint64_t Address,
     int64_t Val = MC.getImm() + 4;
     printPCRelImm(Address, Val, O);
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }
@@ -109,7 +100,7 @@ void XtensaInstPrinter::printLoopTarget(const MCInst *MI, uint64_t Address,
     int64_t Val = MC.getImm() + 4;
     printPCRelImm(Address, Val, O);
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI, true);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }
@@ -121,7 +112,7 @@ void XtensaInstPrinter::printJumpTarget(const MCInst *MI, uint64_t Address,
     int64_t Val = MC.getImm() + 4;
     printPCRelImm(Address, Val, O);
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
   ;
@@ -168,7 +159,7 @@ void XtensaInstPrinter::printL32RTarget(const MCInst *MI, uint64_t Address,
       printPCRelImm(Address, Value, O);
     }
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp
index 28764d369247a..0b20f2e14a841 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.cpp
@@ -13,6 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "XtensaMCAsmInfo.h"
+#include "XtensaMCExpr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -30,3 +32,22 @@ XtensaMCAsmInfo::XtensaMCAsmInfo(const Triple &TT) {
   ExceptionsType = ExceptionHandling::DwarfCFI;
   AlignmentIsInBytes = false;
 }
+
+void XtensaMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                         const MCSpecifierExpr &Expr) const {
+  StringRef S = Xtensa::getSpecifierName(Expr.getSpecifier());
+  if (!S.empty())
+    OS << '%' << S << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  if (!S.empty())
+    OS << ')';
+}
+
+uint8_t Xtensa::parseSpecifier(StringRef name) { return 0; }
+
+StringRef Xtensa::getSpecifierName(uint8_t S) {
+  switch (S) {
+  default:
+    llvm_unreachable("Invalid ELF symbol kind");
+  }
+}
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.h
index a86a95f6be37e..6f6f4bcb7047e 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.h
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCAsmInfo.h
@@ -23,6 +23,9 @@ class Triple;
 class XtensaMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit XtensaMCAsmInfo(const Triple &TT);
+
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp
deleted file mode 100644
index f7f92e1646c3f..0000000000000
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===-- XtensaMCExpr.cpp - Xtensa specific MC expression classes ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the assembly expression modifiers
-// accepted by the Xtensa architecture
-//
-//===----------------------------------------------------------------------===//
-
-#include "XtensaMCExpr.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "xtensamcexpr"
-
-const XtensaMCExpr *XtensaMCExpr::create(const MCExpr *Expr, Specifier S,
-                                         MCContext &Ctx) {
-  return new (Ctx) XtensaMCExpr(Expr, S);
-}
-
-void XtensaMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool HasSpecifier = getSpecifier() != VK_None;
-  if (HasSpecifier)
-    OS << '%' << getSpecifierName(getSpecifier()) << '(';
-  Expr->print(OS, MAI);
-  if (HasSpecifier)
-    OS << ')';
-}
-
-XtensaMCExpr::Specifier XtensaMCExpr::parseSpecifier(StringRef name) {
-  return StringSwitch<XtensaMCExpr::Specifier>(name).Default(VK_None);
-}
-
-StringRef XtensaMCExpr::getSpecifierName(Specifier S) {
-  switch (S) {
-  default:
-    llvm_unreachable("Invalid ELF symbol kind");
-  }
-}
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.h
index 54b5ad30516bd..5a7b1ee9880f8 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.h
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.h
@@ -20,24 +20,13 @@
 namespace llvm {
 
 class StringRef;
-class XtensaMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-  enum { VK_None, VK_TPOFF };
 
-private:
-  explicit XtensaMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
+namespace Xtensa {
+enum Specifier { S_None, S_TPOFF };
 
-public:
-  static const XtensaMCExpr *create(const MCExpr *Expr, Specifier,
-                                    MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-
-  static Specifier parseSpecifier(StringRef name);
-  static StringRef getSpecifierName(Specifier Kind);
-};
+uint8_t parseSpecifier(StringRef name);
+StringRef getSpecifierName(uint8_t S);
+} // namespace Xtensa
 
 } // end namespace llvm.
 
diff --git a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
index 4f3a2e791a3ca..4e3ed4b9e8ee5 100644
--- a/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaAsmPrinter.cpp
@@ -32,13 +32,13 @@
 
 using namespace llvm;
 
-static XtensaMCExpr::Specifier
+static Xtensa::Specifier
 getModifierSpecifier(XtensaCP::XtensaCPModifier Modifier) {
   switch (Modifier) {
   case XtensaCP::no_modifier:
-    return XtensaMCExpr::VK_None;
+    return Xtensa::S_None;
   case XtensaCP::TPOFF:
-    return XtensaMCExpr::VK_TPOFF;
+    return Xtensa::S_TPOFF;
   }
   report_fatal_error("Invalid XtensaCPModifier!");
 }
@@ -92,7 +92,7 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue(
   MCSymbol *LblSym = GetCPISymbol(ACPV->getLabelId());
   auto *TS =
       static_cast<XtensaTargetStreamer *>(OutStreamer->getTargetStreamer());
-  XtensaMCExpr::Specifier VK = getModifierSpecifier(ACPV->getModifier());
+  auto Spec = getModifierSpecifier(ACPV->getModifier());
 
   if (ACPV->getModifier() != XtensaCP::no_modifier) {
     std::string SymName(MCSym->getName());
@@ -101,7 +101,7 @@ void XtensaAsmPrinter::emitMachineConstantPoolValue(
     MCSym = OutContext.getOrCreateSymbol(SymName);
   }
 
-  const MCExpr *Expr = MCSymbolRefExpr::create(MCSym, VK, OutContext);
+  const MCExpr *Expr = MCSymbolRefExpr::create(MCSym, Spec, OutContext);
   TS->emitLiteral(LblSym, Expr, false);
 }
 
@@ -227,8 +227,6 @@ XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO,
                                      MachineOperand::MachineOperandType MOTy,
                                      unsigned Offset) const {
   const MCSymbol *Symbol;
-  XtensaMCExpr::Specifier Kind = XtensaMCExpr::VK_None;
-
   switch (MOTy) {
   case MachineOperand::MO_GlobalAddress:
     Symbol = getSymbol(MO.getGlobal());
@@ -257,7 +255,6 @@ XtensaAsmPrinter::LowerSymbolOperand(const MachineOperand &MO,
   }
 
   const MCExpr *ME = MCSymbolRefExpr::create(Symbol, OutContext);
-
   if (Offset) {
     // Assume offset is never negative.
     assert(Offset > 0);

From 7c22612b2948d8657b4a22ce59870ddd708c4677 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 12:43:37 -0700
Subject: [PATCH 507/851] SPARC: Remove dead specifier code from asm operand
 printer

We don't currently print %specifier( ) for asm operands.
The old code was also incorrect - as it did not print "(".
---
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index dab2de7d56c01..f4201f9a8dc1a 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -371,11 +371,7 @@ void SparcAsmPrinter::emitFunctionBodyStart() {
 void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                    raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
-  const MachineOperand &MO = MI->getOperand (opNum);
-  auto TF = MO.getTargetFlags();
-
-  StringRef Spec = Sparc::getSpecifierName(TF);
-  O << Spec;
+  const MachineOperand &MO = MI->getOperand(opNum);
   switch (MO.getType()) {
   case MachineOperand::MO_Register:
     O << "%" << StringRef(getRegisterName(MO.getReg())).lower();
@@ -406,8 +402,6 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   default:
     llvm_unreachable("<unknown operand type>");
   }
-  if (!Spec.empty())
-    O << ")";
 }
 
 void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,

From d3e9e2d433a666d6620afb00a1533ef4937c667f Mon Sep 17 00:00:00 2001
From: Ross Kirsling <ross.kirsling@sony.com>
Date: Sun, 15 Jun 2025 12:45:08 -0700
Subject: [PATCH 508/851] [Clang] Fix typo in is_replaceable diagnostic
 (#144247)

Adjustment to #143265; `because it not` should be `because it is not`.
---
 clang/include/clang/Basic/DiagnosticSemaKinds.td     | 2 +-
 clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 8fe7ad6138aa0..979ff60b73b75 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -1779,7 +1779,7 @@ def note_unsatisfied_trait_reason
            "%HasArcLifetime{has an ARC lifetime qualifier}|"
            "%VLA{is a variably-modified type}|"
            "%VBase{has a virtual base %1}|"
-           "%NotScalarOrClass{not %select{a|an array of objects of}1 scalar or "
+           "%NotScalarOrClass{is not %select{a|an array of objects of}1 scalar or "
            "class type}|"
            "%NTRBase{has a non-trivially-relocatable base %1}|"
            "%NTRField{has a non-trivially-relocatable member %1 of type %2}|"
diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
index a8c78f6304ca9..5210354a66d43 100644
--- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
+++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp
@@ -166,7 +166,7 @@ static_assert(__builtin_is_replaceable(const volatile int));
 static_assert(__builtin_is_replaceable(void()));
 // expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(void ())}} \
 // expected-note@-1 {{'void ()' is not replaceable}} \
-// expected-note@-1 {{because it not a scalar or class type}}
+// expected-note@-1 {{because it is not a scalar or class type}}
 
 struct B {
  virtual ~B();

From 5cf138a68744904562e81436181df668b00cdb1f Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 12:53:30 -0700
Subject: [PATCH 509/851] M68k: Replace M68kMCExpr::VK_ to M68k::S_

Prepare for removing VEMCExpr. Adopt the newer naming convention adopted
by AMDGPU/WebAssembly/VE.
---
 llvm/lib/Target/M68k/M68kISelLowering.cpp     |  2 +-
 llvm/lib/Target/M68k/M68kMCInstLower.cpp      | 20 +++++------
 .../M68k/MCTargetDesc/M68kELFObjectWriter.cpp | 32 ++++++++---------
 .../M68k/MCTargetDesc/M68kMCAsmInfo.cpp       | 12 +++----
 .../Target/M68k/MCTargetDesc/M68kMCExpr.cpp   |  2 +-
 .../lib/Target/M68k/MCTargetDesc/M68kMCExpr.h | 34 +++++++++----------
 6 files changed, 48 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index 9d3ab606ab8cd..c1860fa88a83b 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -2833,7 +2833,7 @@ unsigned M68kTargetLowering::getJumpTableEncoding() const {
 const MCExpr *M68kTargetLowering::LowerCustomJumpTableEntry(
     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
     unsigned uid, MCContext &Ctx) const {
-  return MCSymbolRefExpr::create(MBB->getSymbol(), M68kMCExpr::VK_GOTOFF, Ctx);
+  return MCSymbolRefExpr::create(MBB->getSymbol(), M68k::S_GOTOFF, Ctx);
 }
 
 SDValue M68kTargetLowering::getPICJumpTableRelocBase(SDValue Table,
diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.cpp b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
index 8698fc0de4710..b256d56c032c5 100644
--- a/llvm/lib/Target/M68k/M68kMCInstLower.cpp
+++ b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
@@ -76,7 +76,7 @@ MCOperand M68kMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   // FIXME We would like an efficient form for this, so we don't have to do a
   // lot of extra uniquing. This fixme is originally from X86
   const MCExpr *Expr = nullptr;
-  M68kMCExpr::Specifier RefKind = M68kMCExpr::VK_None;
+  M68k::Specifier RefKind = M68k::S_None;
 
   switch (MO.getTargetFlags()) {
   default:
@@ -86,31 +86,31 @@ MCOperand M68kMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   case M68kII::MO_PC_RELATIVE_ADDRESS:
     break;
   case M68kII::MO_GOTPCREL:
-    RefKind = M68kMCExpr::VK_GOTPCREL;
+    RefKind = M68k::S_GOTPCREL;
     break;
   case M68kII::MO_GOT:
-    RefKind = M68kMCExpr::VK_GOT;
+    RefKind = M68k::S_GOT;
     break;
   case M68kII::MO_GOTOFF:
-    RefKind = M68kMCExpr::VK_GOTOFF;
+    RefKind = M68k::S_GOTOFF;
     break;
   case M68kII::MO_PLT:
-    RefKind = M68kMCExpr::VK_PLT;
+    RefKind = M68k::S_PLT;
     break;
   case M68kII::MO_TLSGD:
-    RefKind = M68kMCExpr::VK_TLSGD;
+    RefKind = M68k::S_TLSGD;
     break;
   case M68kII::MO_TLSLD:
-    RefKind = M68kMCExpr::VK_TLSLD;
+    RefKind = M68k::S_TLSLD;
     break;
   case M68kII::MO_TLSLDM:
-    RefKind = M68kMCExpr::VK_TLSLDM;
+    RefKind = M68k::S_TLSLDM;
     break;
   case M68kII::MO_TLSIE:
-    RefKind = M68kMCExpr::VK_GOTTPOFF;
+    RefKind = M68k::S_GOTTPOFF;
     break;
   case M68kII::MO_TLSLE:
-    RefKind = M68kMCExpr::VK_TPOFF;
+    RefKind = M68k::S_TPOFF;
     break;
   }
 
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
index 1a61325008aab..3f7593cf4352c 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
@@ -45,7 +45,7 @@ M68kELFObjectWriter::~M68kELFObjectWriter() {}
 
 enum M68kRelType { RT_32, RT_16, RT_8 };
 
-static M68kRelType getType(unsigned Kind, M68kMCExpr::Specifier &Modifier,
+static M68kRelType getType(unsigned Kind, M68k::Specifier &Modifier,
                            bool &IsPCRel) {
   switch (Kind) {
   case FK_Data_4:
@@ -64,15 +64,15 @@ static M68kRelType getType(unsigned Kind, M68kMCExpr::Specifier &Modifier,
 unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                            const MCValue &Target,
                                            bool IsPCRel) const {
-  auto Specifier = M68kMCExpr::Specifier(Target.getSpecifier());
+  auto Specifier = M68k::Specifier(Target.getSpecifier());
   unsigned Kind = Fixup.getKind();
   M68kRelType Type = getType(Kind, Specifier, IsPCRel);
   switch (Specifier) {
-  case M68kMCExpr::VK_GOTTPOFF:
-  case M68kMCExpr::VK_TLSGD:
-  case M68kMCExpr::VK_TLSLD:
-  case M68kMCExpr::VK_TLSLDM:
-  case M68kMCExpr::VK_TPOFF:
+  case M68k::S_GOTTPOFF:
+  case M68k::S_TLSGD:
+  case M68k::S_TLSLD:
+  case M68k::S_TLSLDM:
+  case M68k::S_TPOFF:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -84,7 +84,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
   default:
     llvm_unreachable("Unimplemented");
 
-  case M68kMCExpr::VK_TLSGD:
+  case M68k::S_TLSGD:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_GD32;
@@ -94,7 +94,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_GD8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_TLSLDM:
+  case M68k::S_TLSLDM:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_LDM32;
@@ -104,7 +104,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_LDM8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_TLSLD:
+  case M68k::S_TLSLD:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_LDO32;
@@ -114,7 +114,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_LDO8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_GOTTPOFF:
+  case M68k::S_GOTTPOFF:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_IE32;
@@ -124,7 +124,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_IE8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_TPOFF:
+  case M68k::S_TPOFF:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_TLS_LE32;
@@ -134,7 +134,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_TLS_LE8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_None:
+  case M68k::S_None:
     switch (Type) {
     case RT_32:
       return IsPCRel ? ELF::R_68K_PC32 : ELF::R_68K_32;
@@ -144,7 +144,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return IsPCRel ? ELF::R_68K_PC8 : ELF::R_68K_8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_GOTPCREL:
+  case M68k::S_GOTPCREL:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_GOTPCREL32;
@@ -154,7 +154,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_GOTPCREL8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_GOTOFF:
+  case M68k::S_GOTOFF:
     assert(!IsPCRel);
     switch (Type) {
     case RT_32:
@@ -165,7 +165,7 @@ unsigned M68kELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_68K_GOTOFF8;
     }
     llvm_unreachable("Unrecognized size");
-  case M68kMCExpr::VK_PLT:
+  case M68k::S_PLT:
     switch (Type) {
     case RT_32:
       return ELF::R_68K_PLT32;
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
index ba1b0dc2bb090..8259546fbae57 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
@@ -20,14 +20,10 @@
 using namespace llvm;
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {M68kMCExpr::VK_GOTOFF, "GOTOFF"},
-    {M68kMCExpr::VK_GOTPCREL, "GOTPCREL"},
-    {M68kMCExpr::VK_GOTTPOFF, "GOTTPOFF"},
-    {M68kMCExpr::VK_PLT, "PLT"},
-    {M68kMCExpr::VK_TLSGD, "TLSGD"},
-    {M68kMCExpr::VK_TLSLD, "TLSLD"},
-    {M68kMCExpr::VK_TLSLDM, "TLSLDM"},
-    {M68kMCExpr::VK_TPOFF, "TPOFF"},
+    {M68k::S_GOTOFF, "GOTOFF"},     {M68k::S_GOTPCREL, "GOTPCREL"},
+    {M68k::S_GOTTPOFF, "GOTTPOFF"}, {M68k::S_PLT, "PLT"},
+    {M68k::S_TLSGD, "TLSGD"},       {M68k::S_TLSLD, "TLSLD"},
+    {M68k::S_TLSLDM, "TLSLDM"},     {M68k::S_TPOFF, "TPOFF"},
 };
 
 void M68kELFMCAsmInfo::anchor() {}
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
index 22d8da263cea5..18301d7ea9b39 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
@@ -13,7 +13,7 @@
 
 using namespace llvm;
 
-const M68kMCExpr *M68kMCExpr::create(const MCExpr *Expr, Specifier S,
+const M68kMCExpr *M68kMCExpr::create(const MCExpr *Expr, Spec S,
                                      MCContext &Ctx) {
   return new (Ctx) M68kMCExpr(Expr, S);
 }
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
index 02bffdcb2889b..39a2898e2eda7 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
@@ -19,30 +19,28 @@
 namespace llvm {
 
 class M68kMCExpr : public MCSpecifierExpr {
-public:
-  enum Specifier {
-    VK_None,
-
-    VK_GOT = MCSymbolRefExpr::FirstTargetSpecifier,
-    VK_GOTOFF,
-    VK_GOTPCREL,
-    VK_GOTTPOFF,
-    VK_PLT,
-    VK_TLSGD,
-    VK_TLSLD,
-    VK_TLSLDM,
-    VK_TPOFF,
-  };
-
 protected:
-  explicit M68kMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
+  explicit M68kMCExpr(const MCExpr *Expr, Spec S) : MCSpecifierExpr(Expr, S) {}
 
 public:
-  static const M68kMCExpr *create(const MCExpr *, Specifier, MCContext &);
+  static const M68kMCExpr *create(const MCExpr *, Spec, MCContext &);
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
 };
+namespace M68k {
+enum Specifier {
+  S_None,
+  S_GOT,
+  S_GOTOFF,
+  S_GOTPCREL,
+  S_GOTTPOFF,
+  S_PLT,
+  S_TLSGD,
+  S_TLSLD,
+  S_TLSLDM,
+  S_TPOFF,
+};
+}
 } // namespace llvm
 
 #endif

From 444c6ae530e4814af2cfd6918e3f852ef14ff50d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 13:02:41 -0700
Subject: [PATCH 510/851] M68k: Remove M68kMCExpr

---
 llvm/lib/Target/M68k/M68kISelLowering.cpp     |  2 +-
 llvm/lib/Target/M68k/M68kMCInstLower.cpp      |  2 +-
 .../Target/M68k/MCTargetDesc/CMakeLists.txt   |  1 -
 .../M68k/MCTargetDesc/M68kELFObjectWriter.cpp |  2 +-
 .../M68k/MCTargetDesc/M68kMCAsmInfo.cpp       |  1 -
 .../Target/M68k/MCTargetDesc/M68kMCAsmInfo.h  | 15 ++++++
 .../Target/M68k/MCTargetDesc/M68kMCExpr.cpp   | 21 ---------
 .../lib/Target/M68k/MCTargetDesc/M68kMCExpr.h | 46 -------------------
 8 files changed, 18 insertions(+), 72 deletions(-)
 delete mode 100644 llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
 delete mode 100644 llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h

diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index c1860fa88a83b..594ea9f48c201 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -19,7 +19,7 @@
 #include "M68kSubtarget.h"
 #include "M68kTargetMachine.h"
 #include "M68kTargetObjectFile.h"
-#include "MCTargetDesc/M68kMCExpr.h"
+#include "MCTargetDesc/M68kMCAsmInfo.h"
 
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/CallingConvLower.h"
diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.cpp b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
index b256d56c032c5..301112c41efb7 100644
--- a/llvm/lib/Target/M68k/M68kMCInstLower.cpp
+++ b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
@@ -18,7 +18,7 @@
 #include "M68kInstrInfo.h"
 
 #include "MCTargetDesc/M68kBaseInfo.h"
-#include "MCTargetDesc/M68kMCExpr.h"
+#include "MCTargetDesc/M68kMCAsmInfo.h"
 
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt
index 0146e21acf363..1127b3b547f1e 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/M68k/MCTargetDesc/CMakeLists.txt
@@ -4,7 +4,6 @@ add_llvm_component_library(LLVMM68kDesc
   M68kInstPrinter.cpp
   M68kMCAsmInfo.cpp
   M68kMCCodeEmitter.cpp
-  M68kMCExpr.cpp
   M68kMCTargetDesc.cpp
 
   LINK_COMPONENTS
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
index 3f7593cf4352c..03416df639cf3 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/M68kFixupKinds.h"
-#include "MCTargetDesc/M68kMCExpr.h"
+#include "MCTargetDesc/M68kMCAsmInfo.h"
 #include "MCTargetDesc/M68kMCTargetDesc.h"
 
 #include "llvm/BinaryFormat/ELF.h"
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
index 8259546fbae57..b0a19309b50f7 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "M68kMCAsmInfo.h"
-#include "MCTargetDesc/M68kMCExpr.h"
 
 #include "llvm/MC/MCExpr.h"
 #include "llvm/TargetParser/Triple.h"
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
index 873264d88674c..1ab36260cef18 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
@@ -26,6 +26,21 @@ class M68kELFMCAsmInfo : public MCAsmInfoELF {
   explicit M68kELFMCAsmInfo(const Triple &Triple);
 };
 
+namespace M68k {
+enum Specifier {
+  S_None,
+  S_GOT,
+  S_GOTOFF,
+  S_GOTPCREL,
+  S_GOTTPOFF,
+  S_PLT,
+  S_TLSGD,
+  S_TLSLD,
+  S_TLSLDM,
+  S_TPOFF,
+};
+}
+
 } // namespace llvm
 
 #endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCASMINFO_H
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
deleted file mode 100644
index 18301d7ea9b39..0000000000000
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//===- M68k specific MC expression classes ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "M68kMCExpr.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-
-using namespace llvm;
-
-const M68kMCExpr *M68kMCExpr::create(const MCExpr *Expr, Spec S,
-                                     MCContext &Ctx) {
-  return new (Ctx) M68kMCExpr(Expr, S);
-}
-
-void M68kMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {}
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
deleted file mode 100644
index 39a2898e2eda7..0000000000000
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.h
+++ /dev/null
@@ -1,46 +0,0 @@
-//===- M68k specific MC expression classes ----------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The MCTargetExpr subclass describes a relocatable expression with a
-// M68k-specific relocation specifier.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCEXPR_H
-#define LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-
-namespace llvm {
-
-class M68kMCExpr : public MCSpecifierExpr {
-protected:
-  explicit M68kMCExpr(const MCExpr *Expr, Spec S) : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const M68kMCExpr *create(const MCExpr *, Spec, MCContext &);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-};
-namespace M68k {
-enum Specifier {
-  S_None,
-  S_GOT,
-  S_GOTOFF,
-  S_GOTPCREL,
-  S_GOTTPOFF,
-  S_PLT,
-  S_TLSGD,
-  S_TLSLD,
-  S_TLSLDM,
-  S_TPOFF,
-};
-}
-} // namespace llvm
-
-#endif

From b839632bf44f56e6f17777857f4b23d4eccb6f33 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 13:17:22 -0700
Subject: [PATCH 511/851] PowerPC: Rename PPCMCExpr::VK_ to PPC::S_

Prepare for removing PPCMCExpr. Adopt the newer naming convention with
AMDGPU/WebAssembly/VE/M68k.
---
 .../Target/PowerPC/AsmParser/PPCAsmParser.cpp |  60 ++--
 .../PowerPC/MCTargetDesc/PPCAsmBackend.cpp    |   2 +-
 .../MCTargetDesc/PPCELFObjectWriter.cpp       | 272 +++++++++---------
 .../PowerPC/MCTargetDesc/PPCELFStreamer.cpp   |   8 +-
 .../PowerPC/MCTargetDesc/PPCInstPrinter.cpp   |   8 +-
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp     | 152 +++++-----
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.h       |  84 ++++++
 .../PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp |  16 +-
 .../Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp |  23 +-
 .../Target/PowerPC/MCTargetDesc/PPCMCExpr.h   |  96 +------
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp  |   6 +-
 .../MCTargetDesc/PPCXCOFFObjectWriter.cpp     |  34 +--
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     | 118 ++++----
 llvm/lib/Target/PowerPC/PPCMCInstLower.cpp    |  44 +--
 .../Target/PowerPC/PPCTargetObjectFile.cpp    |   6 +-
 15 files changed, 459 insertions(+), 470 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 9d3d04e6b8add..7e79d85d60173 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -745,8 +745,8 @@ struct PPCOperand : public MCParsedAsmOperand {
       return CreateImm(CE->getValue(), S, E, IsPPC64);
 
     if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val))
-      if (getSpecifier(SRE) == PPCMCExpr::VK_TLS ||
-          getSpecifier(SRE) == PPCMCExpr::VK_TLS_PCREL)
+      if (getSpecifier(SRE) == PPC::S_TLS ||
+          getSpecifier(SRE) == PPC::S_TLS_PCREL)
         return CreateTLSReg(SRE, S, E, IsPPC64);
 
     if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
@@ -1378,25 +1378,25 @@ const MCExpr *PPCAsmParser::extractSpecifier(const MCExpr *E,
     auto *TE = cast<PPCMCExpr>(E);
     Spec = TE->getSpecifier();
     (void)extractSpecifier(TE->getSubExpr(), Spec);
-    Spec = PPCMCExpr::VK_None;
+    Spec = PPC::S_None;
   } break;
 
   case MCExpr::SymbolRef: {
     const auto *SRE = cast<MCSymbolRefExpr>(E);
     switch (getSpecifier(SRE)) {
-    case PPCMCExpr::VK_None:
+    case PPC::S_None:
     default:
       break;
-    case PPCMCExpr::VK_LO:
-    case PPCMCExpr::VK_HI:
-    case PPCMCExpr::VK_HA:
-    case PPCMCExpr::VK_HIGH:
-    case PPCMCExpr::VK_HIGHA:
-    case PPCMCExpr::VK_HIGHER:
-    case PPCMCExpr::VK_HIGHERA:
-    case PPCMCExpr::VK_HIGHEST:
-    case PPCMCExpr::VK_HIGHESTA:
-      if (Spec == PPCMCExpr::VK_None)
+    case PPC::S_LO:
+    case PPC::S_HI:
+    case PPC::S_HA:
+    case PPC::S_HIGH:
+    case PPC::S_HIGHA:
+    case PPC::S_HIGHER:
+    case PPC::S_HIGHERA:
+    case PPC::S_HIGHEST:
+    case PPC::S_HIGHESTA:
+      if (Spec == PPC::S_None)
         Spec = getSpecifier(SRE);
       else
         Error(E->getLoc(), "cannot contain more than one relocation specifier");
@@ -1408,7 +1408,7 @@ const MCExpr *PPCAsmParser::extractSpecifier(const MCExpr *E,
   case MCExpr::Unary: {
     const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
     const MCExpr *Sub = extractSpecifier(UE->getSubExpr(), Spec);
-    if (Spec != PPCMCExpr::VK_None)
+    if (Spec != PPC::S_None)
       return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
     break;
   }
@@ -1417,7 +1417,7 @@ const MCExpr *PPCAsmParser::extractSpecifier(const MCExpr *E,
     const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
     const MCExpr *LHS = extractSpecifier(BE->getLHS(), Spec);
     const MCExpr *RHS = extractSpecifier(BE->getRHS(), Spec);
-    if (Spec != PPCMCExpr::VK_None)
+    if (Spec != PPC::S_None)
       return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
     break;
   }
@@ -1436,9 +1436,9 @@ bool PPCAsmParser::parseExpression(const MCExpr *&EVal) {
   if (getParser().parseExpression(EVal))
     return true;
 
-  uint16_t Spec = PPCMCExpr::VK_None;
+  uint16_t Spec = PPC::S_None;
   const MCExpr *E = extractSpecifier(EVal, Spec);
-  if (Spec != PPCMCExpr::VK_None)
+  if (Spec != PPC::S_None)
     EVal = PPCMCExpr::create(Spec, E, getParser().getContext());
 
   return false;
@@ -1512,9 +1512,9 @@ bool PPCAsmParser::parseOperand(OperandVector &Operands) {
       if (!(parseOptionalToken(AsmToken::Identifier) &&
             Tok.getString().compare_insensitive("plt") == 0))
         return Error(Tok.getLoc(), "expected 'plt'");
-      EVal = MCSymbolRefExpr::create(
-          getContext().getOrCreateSymbol(TlsGetAddr),
-          MCSymbolRefExpr::VariantKind(PPCMCExpr::VK_PLT), getContext());
+      EVal = MCSymbolRefExpr::create(getContext().getOrCreateSymbol(TlsGetAddr),
+                                     MCSymbolRefExpr::VariantKind(PPC::S_PLT),
+                                     getContext());
       if (parseOptionalToken(AsmToken::Plus)) {
         const MCExpr *Addend = nullptr;
         SMLoc EndLoc;
@@ -1826,15 +1826,15 @@ const MCExpr *PPCAsmParser::applySpecifier(const MCExpr *E, uint32_t Spec,
                                            MCContext &Ctx) {
   if (isa<MCConstantExpr>(E)) {
     switch (PPCMCExpr::Specifier(Spec)) {
-    case PPCMCExpr::VK_LO:
-    case PPCMCExpr::VK_HI:
-    case PPCMCExpr::VK_HA:
-    case PPCMCExpr::VK_HIGH:
-    case PPCMCExpr::VK_HIGHA:
-    case PPCMCExpr::VK_HIGHER:
-    case PPCMCExpr::VK_HIGHERA:
-    case PPCMCExpr::VK_HIGHEST:
-    case PPCMCExpr::VK_HIGHESTA:
+    case PPC::S_LO:
+    case PPC::S_HI:
+    case PPC::S_HA:
+    case PPC::S_HIGH:
+    case PPC::S_HIGHA:
+    case PPC::S_HIGHER:
+    case PPC::S_HIGHERA:
+    case PPC::S_HIGHEST:
+    case PPC::S_HIGHESTA:
       break;
     default:
       return nullptr;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 84cd122488428..d4b86d5e2811d 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -140,7 +140,7 @@ class PPCAsmBackend : public MCAsmBackend {
     // In PPC64 ELFv1, .quad .TOC.@tocbase in the .opd section is expected to
     // reference the null symbol.
     auto Target = TargetVal;
-    if (Target.getSpecifier() == PPCMCExpr::VK_TOCBASE)
+    if (Target.getSpecifier() == PPC::S_TOCBASE)
       Target.setAddSym(nullptr);
     return MCAsmBackend::addReloc(F, Fixup, Target, FixedValue, IsResolved);
   }
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 096c019f8556e..8e885c3d86a0e 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -43,49 +43,49 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
   SMLoc Loc = Fixup.getValue()->getLoc();
   auto Spec = static_cast<PPCMCExpr::Specifier>(Target.getSpecifier());
   switch (Spec) {
-  case PPCMCExpr::VK_DTPMOD:
-  case PPCMCExpr::VK_DTPREL:
-  case PPCMCExpr::VK_DTPREL_HA:
-  case PPCMCExpr::VK_DTPREL_HI:
-  case PPCMCExpr::VK_DTPREL_HIGH:
-  case PPCMCExpr::VK_DTPREL_HIGHA:
-  case PPCMCExpr::VK_DTPREL_HIGHER:
-  case PPCMCExpr::VK_DTPREL_HIGHERA:
-  case PPCMCExpr::VK_DTPREL_HIGHEST:
-  case PPCMCExpr::VK_DTPREL_HIGHESTA:
-  case PPCMCExpr::VK_DTPREL_LO:
-  case PPCMCExpr::VK_GOT_DTPREL:
-  case PPCMCExpr::VK_GOT_DTPREL_HA:
-  case PPCMCExpr::VK_GOT_DTPREL_HI:
-  case PPCMCExpr::VK_GOT_DTPREL_LO:
-  case PPCMCExpr::VK_GOT_TLSGD:
-  case PPCMCExpr::VK_GOT_TLSGD_HA:
-  case PPCMCExpr::VK_GOT_TLSGD_HI:
-  case PPCMCExpr::VK_GOT_TLSGD_LO:
-  case PPCMCExpr::VK_GOT_TLSGD_PCREL:
-  case PPCMCExpr::VK_GOT_TLSLD:
-  case PPCMCExpr::VK_GOT_TLSLD_HA:
-  case PPCMCExpr::VK_GOT_TLSLD_HI:
-  case PPCMCExpr::VK_GOT_TLSLD_LO:
-  case PPCMCExpr::VK_GOT_TPREL:
-  case PPCMCExpr::VK_GOT_TPREL_HA:
-  case PPCMCExpr::VK_GOT_TPREL_HI:
-  case PPCMCExpr::VK_GOT_TPREL_LO:
-  case PPCMCExpr::VK_GOT_TPREL_PCREL:
-  case PPCMCExpr::VK_TLS:
-  case PPCMCExpr::VK_TLSGD:
-  case PPCMCExpr::VK_TLSLD:
-  case PPCMCExpr::VK_TLS_PCREL:
-  case PPCMCExpr::VK_TPREL:
-  case PPCMCExpr::VK_TPREL_HA:
-  case PPCMCExpr::VK_TPREL_HI:
-  case PPCMCExpr::VK_TPREL_HIGH:
-  case PPCMCExpr::VK_TPREL_HIGHA:
-  case PPCMCExpr::VK_TPREL_HIGHER:
-  case PPCMCExpr::VK_TPREL_HIGHERA:
-  case PPCMCExpr::VK_TPREL_HIGHEST:
-  case PPCMCExpr::VK_TPREL_HIGHESTA:
-  case PPCMCExpr::VK_TPREL_LO:
+  case PPC::S_DTPMOD:
+  case PPC::S_DTPREL:
+  case PPC::S_DTPREL_HA:
+  case PPC::S_DTPREL_HI:
+  case PPC::S_DTPREL_HIGH:
+  case PPC::S_DTPREL_HIGHA:
+  case PPC::S_DTPREL_HIGHER:
+  case PPC::S_DTPREL_HIGHERA:
+  case PPC::S_DTPREL_HIGHEST:
+  case PPC::S_DTPREL_HIGHESTA:
+  case PPC::S_DTPREL_LO:
+  case PPC::S_GOT_DTPREL:
+  case PPC::S_GOT_DTPREL_HA:
+  case PPC::S_GOT_DTPREL_HI:
+  case PPC::S_GOT_DTPREL_LO:
+  case PPC::S_GOT_TLSGD:
+  case PPC::S_GOT_TLSGD_HA:
+  case PPC::S_GOT_TLSGD_HI:
+  case PPC::S_GOT_TLSGD_LO:
+  case PPC::S_GOT_TLSGD_PCREL:
+  case PPC::S_GOT_TLSLD:
+  case PPC::S_GOT_TLSLD_HA:
+  case PPC::S_GOT_TLSLD_HI:
+  case PPC::S_GOT_TLSLD_LO:
+  case PPC::S_GOT_TPREL:
+  case PPC::S_GOT_TPREL_HA:
+  case PPC::S_GOT_TPREL_HI:
+  case PPC::S_GOT_TPREL_LO:
+  case PPC::S_GOT_TPREL_PCREL:
+  case PPC::S_TLS:
+  case PPC::S_TLSGD:
+  case PPC::S_TLSLD:
+  case PPC::S_TLS_PCREL:
+  case PPC::S_TPREL:
+  case PPC::S_TPREL_HA:
+  case PPC::S_TPREL_HI:
+  case PPC::S_TPREL_HIGH:
+  case PPC::S_TPREL_HIGHA:
+  case PPC::S_TPREL_HIGHER:
+  case PPC::S_TPREL_HIGHERA:
+  case PPC::S_TPREL_HIGHEST:
+  case PPC::S_TPREL_HIGHESTA:
+  case PPC::S_TPREL_LO:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -106,16 +106,16 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         Type = ELF::R_PPC_REL24;
         break;
-      case PPCMCExpr::VK_PLT:
+      case PPC::S_PLT:
         Type = ELF::R_PPC_PLTREL24;
         break;
-      case PPCMCExpr::VK_LOCAL:
+      case PPC::S_LOCAL:
         Type = ELF::R_PPC_LOCAL24PC;
         break;
-      case PPCMCExpr::VK_NOTOC:
+      case PPC::S_NOTOC:
         Type = ELF::R_PPC64_REL24_NOTOC;
         break;
       }
@@ -129,13 +129,13 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         return ELF::R_PPC_NONE;
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         return ELF::R_PPC_REL16;
-      case PPCMCExpr::VK_LO:
+      case PPC::S_LO:
         return ELF::R_PPC_REL16_LO;
-      case PPCMCExpr::VK_HI:
+      case PPC::S_HI:
         return ELF::R_PPC_REL16_HI;
-      case PPCMCExpr::VK_HA:
+      case PPC::S_HA:
         return ELF::R_PPC_REL16_HA;
       }
       break;
@@ -148,19 +148,19 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_PCREL:
+      case PPC::S_PCREL:
         Type = ELF::R_PPC64_PCREL34;
         break;
-      case PPCMCExpr::VK_GOT_PCREL:
+      case PPC::S_GOT_PCREL:
         Type = ELF::R_PPC64_GOT_PCREL34;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD_PCREL:
+      case PPC::S_GOT_TLSGD_PCREL:
         Type = ELF::R_PPC64_GOT_TLSGD_PCREL34;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD_PCREL:
+      case PPC::S_GOT_TLSLD_PCREL:
         Type = ELF::R_PPC64_GOT_TLSLD_PCREL34;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_PCREL:
+      case PPC::S_GOT_TPREL_PCREL:
         Type = ELF::R_PPC64_GOT_TPREL_PCREL34;
         break;
       }
@@ -186,172 +186,172 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_LO:
+      case PPC::S_LO:
         return ELF::R_PPC_ADDR16_LO;
-      case PPCMCExpr::VK_HI:
+      case PPC::S_HI:
         return ELF::R_PPC_ADDR16_HI;
-      case PPCMCExpr::VK_HA:
+      case PPC::S_HA:
         return ELF::R_PPC_ADDR16_HA;
-      case PPCMCExpr::VK_HIGH:
+      case PPC::S_HIGH:
         return ELF::R_PPC64_ADDR16_HIGH;
-      case PPCMCExpr::VK_HIGHA:
+      case PPC::S_HIGHA:
         return ELF::R_PPC64_ADDR16_HIGHA;
-      case PPCMCExpr::VK_HIGHER:
+      case PPC::S_HIGHER:
         return ELF::R_PPC64_ADDR16_HIGHER;
-      case PPCMCExpr::VK_HIGHERA:
+      case PPC::S_HIGHERA:
         return ELF::R_PPC64_ADDR16_HIGHERA;
-      case PPCMCExpr::VK_HIGHEST:
+      case PPC::S_HIGHEST:
         return ELF::R_PPC64_ADDR16_HIGHEST;
-      case PPCMCExpr::VK_HIGHESTA:
+      case PPC::S_HIGHESTA:
         return ELF::R_PPC64_ADDR16_HIGHESTA;
 
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         Type = ELF::R_PPC_ADDR16;
         break;
-      case PPCMCExpr::VK_GOT:
+      case PPC::S_GOT:
         Type = ELF::R_PPC_GOT16;
         break;
-      case PPCMCExpr::VK_GOT_LO:
+      case PPC::S_GOT_LO:
         Type = ELF::R_PPC_GOT16_LO;
         break;
-      case PPCMCExpr::VK_GOT_HI:
+      case PPC::S_GOT_HI:
         Type = ELF::R_PPC_GOT16_HI;
         break;
-      case PPCMCExpr::VK_GOT_HA:
+      case PPC::S_GOT_HA:
         Type = ELF::R_PPC_GOT16_HA;
         break;
-      case PPCMCExpr::VK_TOC:
+      case PPC::S_TOC:
         Type = ELF::R_PPC64_TOC16;
         break;
-      case PPCMCExpr::VK_TOC_LO:
+      case PPC::S_TOC_LO:
         Type = ELF::R_PPC64_TOC16_LO;
         break;
-      case PPCMCExpr::VK_TOC_HI:
+      case PPC::S_TOC_HI:
         Type = ELF::R_PPC64_TOC16_HI;
         break;
-      case PPCMCExpr::VK_TOC_HA:
+      case PPC::S_TOC_HA:
         Type = ELF::R_PPC64_TOC16_HA;
         break;
-      case PPCMCExpr::VK_TPREL:
+      case PPC::S_TPREL:
         Type = ELF::R_PPC_TPREL16;
         break;
-      case PPCMCExpr::VK_TPREL_LO:
+      case PPC::S_TPREL_LO:
         Type = ELF::R_PPC_TPREL16_LO;
         break;
-      case PPCMCExpr::VK_TPREL_HI:
+      case PPC::S_TPREL_HI:
         Type = ELF::R_PPC_TPREL16_HI;
         break;
-      case PPCMCExpr::VK_TPREL_HA:
+      case PPC::S_TPREL_HA:
         Type = ELF::R_PPC_TPREL16_HA;
         break;
-      case PPCMCExpr::VK_TPREL_HIGH:
+      case PPC::S_TPREL_HIGH:
         Type = ELF::R_PPC64_TPREL16_HIGH;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHA:
+      case PPC::S_TPREL_HIGHA:
         Type = ELF::R_PPC64_TPREL16_HIGHA;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHER:
+      case PPC::S_TPREL_HIGHER:
         Type = ELF::R_PPC64_TPREL16_HIGHER;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHERA:
+      case PPC::S_TPREL_HIGHERA:
         Type = ELF::R_PPC64_TPREL16_HIGHERA;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHEST:
+      case PPC::S_TPREL_HIGHEST:
         Type = ELF::R_PPC64_TPREL16_HIGHEST;
         break;
-      case PPCMCExpr::VK_TPREL_HIGHESTA:
+      case PPC::S_TPREL_HIGHESTA:
         Type = ELF::R_PPC64_TPREL16_HIGHESTA;
         break;
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC64_DTPREL16;
         break;
-      case PPCMCExpr::VK_DTPREL_LO:
+      case PPC::S_DTPREL_LO:
         Type = ELF::R_PPC64_DTPREL16_LO;
         break;
-      case PPCMCExpr::VK_DTPREL_HI:
+      case PPC::S_DTPREL_HI:
         Type = ELF::R_PPC64_DTPREL16_HI;
         break;
-      case PPCMCExpr::VK_DTPREL_HA:
+      case PPC::S_DTPREL_HA:
         Type = ELF::R_PPC64_DTPREL16_HA;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGH:
+      case PPC::S_DTPREL_HIGH:
         Type = ELF::R_PPC64_DTPREL16_HIGH;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHA:
+      case PPC::S_DTPREL_HIGHA:
         Type = ELF::R_PPC64_DTPREL16_HIGHA;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHER:
+      case PPC::S_DTPREL_HIGHER:
         Type = ELF::R_PPC64_DTPREL16_HIGHER;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHERA:
+      case PPC::S_DTPREL_HIGHERA:
         Type = ELF::R_PPC64_DTPREL16_HIGHERA;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHEST:
+      case PPC::S_DTPREL_HIGHEST:
         Type = ELF::R_PPC64_DTPREL16_HIGHEST;
         break;
-      case PPCMCExpr::VK_DTPREL_HIGHESTA:
+      case PPC::S_DTPREL_HIGHESTA:
         Type = ELF::R_PPC64_DTPREL16_HIGHESTA;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD:
+      case PPC::S_GOT_TLSGD:
         if (is64Bit())
           Type = ELF::R_PPC64_GOT_TLSGD16;
         else
           Type = ELF::R_PPC_GOT_TLSGD16;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD_LO:
+      case PPC::S_GOT_TLSGD_LO:
         Type = ELF::R_PPC64_GOT_TLSGD16_LO;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD_HI:
+      case PPC::S_GOT_TLSGD_HI:
         Type = ELF::R_PPC64_GOT_TLSGD16_HI;
         break;
-      case PPCMCExpr::VK_GOT_TLSGD_HA:
+      case PPC::S_GOT_TLSGD_HA:
         Type = ELF::R_PPC64_GOT_TLSGD16_HA;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD:
+      case PPC::S_GOT_TLSLD:
         if (is64Bit())
           Type = ELF::R_PPC64_GOT_TLSLD16;
         else
           Type = ELF::R_PPC_GOT_TLSLD16;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD_LO:
+      case PPC::S_GOT_TLSLD_LO:
         Type = ELF::R_PPC64_GOT_TLSLD16_LO;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD_HI:
+      case PPC::S_GOT_TLSLD_HI:
         Type = ELF::R_PPC64_GOT_TLSLD16_HI;
         break;
-      case PPCMCExpr::VK_GOT_TLSLD_HA:
+      case PPC::S_GOT_TLSLD_HA:
         Type = ELF::R_PPC64_GOT_TLSLD16_HA;
         break;
-      case PPCMCExpr::VK_GOT_TPREL:
+      case PPC::S_GOT_TPREL:
         /* We don't have R_PPC64_GOT_TPREL16, but since GOT offsets
            are always 4-aligned, we can use R_PPC64_GOT_TPREL16_DS.  */
         Type = ELF::R_PPC64_GOT_TPREL16_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_LO:
+      case PPC::S_GOT_TPREL_LO:
         /* We don't have R_PPC64_GOT_TPREL16_LO, but since GOT offsets
            are always 4-aligned, we can use R_PPC64_GOT_TPREL16_LO_DS.  */
         Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_HI:
+      case PPC::S_GOT_TPREL_HI:
         Type = ELF::R_PPC64_GOT_TPREL16_HI;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL:
+      case PPC::S_GOT_DTPREL:
         /* We don't have R_PPC64_GOT_DTPREL16, but since GOT offsets
            are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_DS.  */
         Type = ELF::R_PPC64_GOT_DTPREL16_DS;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL_LO:
+      case PPC::S_GOT_DTPREL_LO:
         /* We don't have R_PPC64_GOT_DTPREL16_LO, but since GOT offsets
            are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_LO_DS.  */
         Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_HA:
+      case PPC::S_GOT_TPREL_HA:
         Type = ELF::R_PPC64_GOT_TPREL16_HA;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL_HI:
+      case PPC::S_GOT_DTPREL_HI:
         Type = ELF::R_PPC64_GOT_DTPREL16_HI;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL_HA:
+      case PPC::S_GOT_DTPREL_HA:
         Type = ELF::R_PPC64_GOT_DTPREL16_HA;
         break;
       }
@@ -362,45 +362,45 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_LO:
+      case PPC::S_LO:
         return ELF::R_PPC64_ADDR16_LO_DS;
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         Type = ELF::R_PPC64_ADDR16_DS;
         break;
-      case PPCMCExpr::VK_GOT:
+      case PPC::S_GOT:
         Type = ELF::R_PPC64_GOT16_DS;
         break;
-      case PPCMCExpr::VK_GOT_LO:
+      case PPC::S_GOT_LO:
         Type = ELF::R_PPC64_GOT16_LO_DS;
         break;
-      case PPCMCExpr::VK_TOC:
+      case PPC::S_TOC:
         Type = ELF::R_PPC64_TOC16_DS;
         break;
-      case PPCMCExpr::VK_TOC_LO:
+      case PPC::S_TOC_LO:
         Type = ELF::R_PPC64_TOC16_LO_DS;
         break;
-      case PPCMCExpr::VK_TPREL:
+      case PPC::S_TPREL:
         Type = ELF::R_PPC64_TPREL16_DS;
         break;
-      case PPCMCExpr::VK_TPREL_LO:
+      case PPC::S_TPREL_LO:
         Type = ELF::R_PPC64_TPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC64_DTPREL16_DS;
         break;
-      case PPCMCExpr::VK_DTPREL_LO:
+      case PPC::S_DTPREL_LO:
         Type = ELF::R_PPC64_DTPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL:
+      case PPC::S_GOT_TPREL:
         Type = ELF::R_PPC64_GOT_TPREL16_DS;
         break;
-      case PPCMCExpr::VK_GOT_TPREL_LO:
+      case PPC::S_GOT_TPREL_LO:
         Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL:
+      case PPC::S_GOT_DTPREL:
         Type = ELF::R_PPC64_GOT_DTPREL16_DS;
         break;
-      case PPCMCExpr::VK_GOT_DTPREL_LO:
+      case PPC::S_GOT_DTPREL_LO:
         Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
         break;
       }
@@ -410,25 +410,25 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_TLSGD:
+      case PPC::S_TLSGD:
         if (is64Bit())
           Type = ELF::R_PPC64_TLSGD;
         else
           Type = ELF::R_PPC_TLSGD;
         break;
-      case PPCMCExpr::VK_TLSLD:
+      case PPC::S_TLSLD:
         if (is64Bit())
           Type = ELF::R_PPC64_TLSLD;
         else
           Type = ELF::R_PPC_TLSLD;
         break;
-      case PPCMCExpr::VK_TLS:
+      case PPC::S_TLS:
         if (is64Bit())
           Type = ELF::R_PPC64_TLS;
         else
           Type = ELF::R_PPC_TLS;
         break;
-      case PPCMCExpr::VK_TLS_PCREL:
+      case PPC::S_TLS_PCREL:
         Type = ELF::R_PPC64_TLS;
         break;
       }
@@ -438,10 +438,10 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC64_DTPREL34;
         break;
-      case PPCMCExpr::VK_TPREL:
+      case PPC::S_TPREL:
         Type = ELF::R_PPC64_TPREL34;
         break;
       }
@@ -451,26 +451,26 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup,
       default:
         reportError(Loc, "unsupported relocation type");
         break;
-      case PPCMCExpr::VK_TOCBASE:
+      case PPC::S_TOCBASE:
         Type = ELF::R_PPC64_TOC;
         break;
-      case PPCMCExpr::VK_None:
+      case PPC::S_None:
         Type = ELF::R_PPC64_ADDR64;
         break;
-      case PPCMCExpr::VK_DTPMOD:
+      case PPC::S_DTPMOD:
         Type = ELF::R_PPC64_DTPMOD64;
         break;
-      case PPCMCExpr::VK_TPREL:
+      case PPC::S_TPREL:
         Type = ELF::R_PPC64_TPREL64;
         break;
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC64_DTPREL64;
         break;
       }
       break;
     case FK_Data_4:
       switch (Spec) {
-      case PPCMCExpr::VK_DTPREL:
+      case PPC::S_DTPREL:
         Type = ELF::R_PPC_DTPREL32;
         break;
       default:
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
index 501ef460b6938..78065541f0d03 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -139,7 +139,7 @@ void PPCELFStreamer::emitGOTToPCRelReloc(const MCInst &Inst) {
   // Cast the last operand to MCSymbolRefExpr to get the symbol.
   const MCExpr *Expr = Operand.getExpr();
   const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
-  assert(getSpecifier(SymExpr) == PPCMCExpr::VK_PCREL_OPT &&
+  assert(getSpecifier(SymExpr) == PPC::S_PCREL_OPT &&
          "Expecting a symbol of type VK_PCREL_OPT");
   MCSymbol *LabelSym =
       getContext().getOrCreateSymbol(SymExpr->getSymbol().getName());
@@ -174,7 +174,7 @@ void PPCELFStreamer::emitGOTToPCRelLabel(const MCInst &Inst) {
   // Cast the last operand to MCSymbolRefExpr to get the symbol.
   const MCExpr *Expr = Operand.getExpr();
   const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
-  assert(getSpecifier(SymExpr) == PPCMCExpr::VK_PCREL_OPT &&
+  assert(getSpecifier(SymExpr) == PPC::S_PCREL_OPT &&
          "Expecting a symbol of type VK_PCREL_OPT");
   MCSymbol *LabelSym =
       getContext().getOrCreateSymbol(SymExpr->getSymbol().getName());
@@ -190,7 +190,7 @@ void PPCELFStreamer::emitGOTToPCRelLabel(const MCInst &Inst) {
 // The above is a pair of such instructions and this function will not return
 // std::nullopt for either one of them. In both cases we are looking for the
 // last operand <MCOperand Expr:(.Lpcrel@<<invalid>>)> which needs to be an
-// MCExpr and has the flag PPCMCExpr::VK_PCREL_OPT. After that we just
+// MCExpr and has the flag PPC::S_PCREL_OPT. After that we just
 // look at the opcode and in the case of PLDpc we will return true. For the load
 // (or store) this function will return false indicating it has found the second
 // instruciton in the pair.
@@ -212,7 +212,7 @@ std::optional<bool> llvm::isPartOfGOTToPCRelPair(const MCInst &Inst,
   // Check for the variant kind VK_PCREL_OPT in this expression.
   const MCExpr *Expr = Operand.getExpr();
   const MCSymbolRefExpr *SymExpr = static_cast<const MCSymbolRefExpr *>(Expr);
-  if (!SymExpr || getSpecifier(SymExpr) != PPCMCExpr::VK_PCREL_OPT)
+  if (!SymExpr || getSpecifier(SymExpr) != PPC::S_PCREL_OPT)
     return std::nullopt;
 
   return (Inst.getOpcode() == PPC::PLDpc);
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index 0e1b28af691d5..bd01767f41bd5 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -92,7 +92,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
       const MCSymbolRefExpr *SymExpr =
           static_cast<const MCSymbolRefExpr *>(Expr);
 
-      if (SymExpr && getSpecifier(SymExpr) == PPCMCExpr::VK_PCREL_OPT) {
+      if (SymExpr && getSpecifier(SymExpr) == PPC::S_PCREL_OPT) {
         const MCSymbol &Symbol = SymExpr->getSymbol();
         if (MI->getOpcode() == PPC::PLDpc) {
           printInstruction(MI, Address, STI, O);
@@ -579,13 +579,13 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
   // because we do not want the assembly to print out the @notoc at the
   // end like __tls_get_addr(x@tlsgd)@notoc. Instead we want it to look
   // like __tls_get_addr@notoc(x@tlsgd).
-  if (getSpecifier(RefExp) == PPCMCExpr::VK_NOTOC)
+  if (getSpecifier(RefExp) == PPC::S_NOTOC)
     O << '@' << MAI.getSpecifierName(RefExp->getKind());
   O << '(';
   printOperand(MI, OpNo + 1, STI, O);
   O << ')';
-  if (getSpecifier(RefExp) != PPCMCExpr::VK_None &&
-      getSpecifier(RefExp) != PPCMCExpr::VK_NOTOC)
+  if (getSpecifier(RefExp) != PPC::S_None &&
+      getSpecifier(RefExp) != PPC::S_NOTOC)
     O << '@' << MAI.getSpecifierName(RefExp->getKind());
   if (Rhs) {
     SmallString<0> Buf;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index b5be23c5a96ad..bb1f21d8f0327 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -20,82 +20,82 @@ using namespace llvm;
 void PPCELFMCAsmInfo::anchor() { }
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {PPCMCExpr::VK_DTPREL, "DTPREL"},
-    {PPCMCExpr::VK_GOT, "GOT"},
-    {PPCMCExpr::VK_GOT_HA, "got@ha"},
-    {PPCMCExpr::VK_GOT_HI, "got@h"},
-    {PPCMCExpr::VK_GOT_LO, "got@l"},
-    {PPCMCExpr::VK_HA, "ha"},
-    {PPCMCExpr::VK_HI, "h"},
-    {PPCMCExpr::VK_HIGH, "high"},
-    {PPCMCExpr::VK_HIGHA, "higha"},
-    {PPCMCExpr::VK_HIGHER, "higher"},
-    {PPCMCExpr::VK_HIGHERA, "highera"},
-    {PPCMCExpr::VK_HIGHEST, "highest"},
-    {PPCMCExpr::VK_HIGHESTA, "highesta"},
-    {PPCMCExpr::VK_LO, "l"},
-    {PPCMCExpr::VK_L, "l"}, // FIXME: share the name with VK_LO
-    {PPCMCExpr::VK_PCREL, "PCREL"},
-    {PPCMCExpr::VK_PLT, "PLT"},
-    {PPCMCExpr::VK_TLSGD, "tlsgd"},
-    {PPCMCExpr::VK_TLSLD, "tlsld"},
-    {PPCMCExpr::VK_TOC, "toc"},
-    {PPCMCExpr::VK_TOCBASE, "tocbase"},
-    {PPCMCExpr::VK_TOC_HA, "toc@ha"},
-    {PPCMCExpr::VK_TOC_HI, "toc@h"},
-    {PPCMCExpr::VK_TOC_LO, "toc@l"},
-    {PPCMCExpr::VK_TPREL, "TPREL"},
-    {PPCMCExpr::VK_AIX_TLSGD, "gd"},
-    {PPCMCExpr::VK_AIX_TLSGDM, "m"},
-    {PPCMCExpr::VK_AIX_TLSIE, "ie"},
-    {PPCMCExpr::VK_AIX_TLSLD, "ld"},
-    {PPCMCExpr::VK_AIX_TLSLE, "le"},
-    {PPCMCExpr::VK_AIX_TLSML, "ml"},
-    {PPCMCExpr::VK_DTPMOD, "dtpmod"},
-    {PPCMCExpr::VK_DTPREL_HA, "dtprel@ha"},
-    {PPCMCExpr::VK_DTPREL_HI, "dtprel@h"},
-    {PPCMCExpr::VK_DTPREL_HIGH, "dtprel@high"},
-    {PPCMCExpr::VK_DTPREL_HIGHA, "dtprel@higha"},
-    {PPCMCExpr::VK_DTPREL_HIGHER, "dtprel@higher"},
-    {PPCMCExpr::VK_DTPREL_HIGHERA, "dtprel@highera"},
-    {PPCMCExpr::VK_DTPREL_HIGHEST, "dtprel@highest"},
-    {PPCMCExpr::VK_DTPREL_HIGHESTA, "dtprel@highesta"},
-    {PPCMCExpr::VK_DTPREL_LO, "dtprel@l"},
-    {PPCMCExpr::VK_GOT_DTPREL, "got@dtprel"},
-    {PPCMCExpr::VK_GOT_DTPREL_HA, "got@dtprel@ha"},
-    {PPCMCExpr::VK_GOT_DTPREL_HI, "got@dtprel@h"},
-    {PPCMCExpr::VK_GOT_DTPREL_LO, "got@dtprel@l"},
-    {PPCMCExpr::VK_GOT_PCREL, "got@pcrel"},
-    {PPCMCExpr::VK_GOT_TLSGD, "got@tlsgd"},
-    {PPCMCExpr::VK_GOT_TLSGD_HA, "got@tlsgd@ha"},
-    {PPCMCExpr::VK_GOT_TLSGD_HI, "got@tlsgd@h"},
-    {PPCMCExpr::VK_GOT_TLSGD_LO, "got@tlsgd@l"},
-    {PPCMCExpr::VK_GOT_TLSGD_PCREL, "got@tlsgd@pcrel"},
-    {PPCMCExpr::VK_GOT_TLSLD, "got@tlsld"},
-    {PPCMCExpr::VK_GOT_TLSLD_HA, "got@tlsld@ha"},
-    {PPCMCExpr::VK_GOT_TLSLD_HI, "got@tlsld@h"},
-    {PPCMCExpr::VK_GOT_TLSLD_LO, "got@tlsld@l"},
-    {PPCMCExpr::VK_GOT_TLSLD_PCREL, "got@tlsld@pcrel"},
-    {PPCMCExpr::VK_GOT_TPREL, "got@tprel"},
-    {PPCMCExpr::VK_GOT_TPREL_HA, "got@tprel@ha"},
-    {PPCMCExpr::VK_GOT_TPREL_HI, "got@tprel@h"},
-    {PPCMCExpr::VK_GOT_TPREL_LO, "got@tprel@l"},
-    {PPCMCExpr::VK_GOT_TPREL_PCREL, "got@tprel@pcrel"},
-    {PPCMCExpr::VK_LOCAL, "local"},
-    {PPCMCExpr::VK_NOTOC, "notoc"},
-    {PPCMCExpr::VK_PCREL_OPT, "<<invalid>>"},
-    {PPCMCExpr::VK_TLS, "tls"},
-    {PPCMCExpr::VK_TLS_PCREL, "tls@pcrel"},
-    {PPCMCExpr::VK_TPREL_HA, "tprel@ha"},
-    {PPCMCExpr::VK_TPREL_HI, "tprel@h"},
-    {PPCMCExpr::VK_TPREL_HIGH, "tprel@high"},
-    {PPCMCExpr::VK_TPREL_HIGHA, "tprel@higha"},
-    {PPCMCExpr::VK_TPREL_HIGHER, "tprel@higher"},
-    {PPCMCExpr::VK_TPREL_HIGHERA, "tprel@highera"},
-    {PPCMCExpr::VK_TPREL_HIGHEST, "tprel@highest"},
-    {PPCMCExpr::VK_TPREL_HIGHESTA, "tprel@highesta"},
-    {PPCMCExpr::VK_TPREL_LO, "tprel@l"},
-    {PPCMCExpr::VK_U, "u"},
+    {PPC::S_DTPREL, "DTPREL"},
+    {PPC::S_GOT, "GOT"},
+    {PPC::S_GOT_HA, "got@ha"},
+    {PPC::S_GOT_HI, "got@h"},
+    {PPC::S_GOT_LO, "got@l"},
+    {PPC::S_HA, "ha"},
+    {PPC::S_HI, "h"},
+    {PPC::S_HIGH, "high"},
+    {PPC::S_HIGHA, "higha"},
+    {PPC::S_HIGHER, "higher"},
+    {PPC::S_HIGHERA, "highera"},
+    {PPC::S_HIGHEST, "highest"},
+    {PPC::S_HIGHESTA, "highesta"},
+    {PPC::S_LO, "l"},
+    {PPC::S_L, "l"}, // FIXME: share the name with VK_LO
+    {PPC::S_PCREL, "PCREL"},
+    {PPC::S_PLT, "PLT"},
+    {PPC::S_TLSGD, "tlsgd"},
+    {PPC::S_TLSLD, "tlsld"},
+    {PPC::S_TOC, "toc"},
+    {PPC::S_TOCBASE, "tocbase"},
+    {PPC::S_TOC_HA, "toc@ha"},
+    {PPC::S_TOC_HI, "toc@h"},
+    {PPC::S_TOC_LO, "toc@l"},
+    {PPC::S_TPREL, "TPREL"},
+    {PPC::S_AIX_TLSGD, "gd"},
+    {PPC::S_AIX_TLSGDM, "m"},
+    {PPC::S_AIX_TLSIE, "ie"},
+    {PPC::S_AIX_TLSLD, "ld"},
+    {PPC::S_AIX_TLSLE, "le"},
+    {PPC::S_AIX_TLSML, "ml"},
+    {PPC::S_DTPMOD, "dtpmod"},
+    {PPC::S_DTPREL_HA, "dtprel@ha"},
+    {PPC::S_DTPREL_HI, "dtprel@h"},
+    {PPC::S_DTPREL_HIGH, "dtprel@high"},
+    {PPC::S_DTPREL_HIGHA, "dtprel@higha"},
+    {PPC::S_DTPREL_HIGHER, "dtprel@higher"},
+    {PPC::S_DTPREL_HIGHERA, "dtprel@highera"},
+    {PPC::S_DTPREL_HIGHEST, "dtprel@highest"},
+    {PPC::S_DTPREL_HIGHESTA, "dtprel@highesta"},
+    {PPC::S_DTPREL_LO, "dtprel@l"},
+    {PPC::S_GOT_DTPREL, "got@dtprel"},
+    {PPC::S_GOT_DTPREL_HA, "got@dtprel@ha"},
+    {PPC::S_GOT_DTPREL_HI, "got@dtprel@h"},
+    {PPC::S_GOT_DTPREL_LO, "got@dtprel@l"},
+    {PPC::S_GOT_PCREL, "got@pcrel"},
+    {PPC::S_GOT_TLSGD, "got@tlsgd"},
+    {PPC::S_GOT_TLSGD_HA, "got@tlsgd@ha"},
+    {PPC::S_GOT_TLSGD_HI, "got@tlsgd@h"},
+    {PPC::S_GOT_TLSGD_LO, "got@tlsgd@l"},
+    {PPC::S_GOT_TLSGD_PCREL, "got@tlsgd@pcrel"},
+    {PPC::S_GOT_TLSLD, "got@tlsld"},
+    {PPC::S_GOT_TLSLD_HA, "got@tlsld@ha"},
+    {PPC::S_GOT_TLSLD_HI, "got@tlsld@h"},
+    {PPC::S_GOT_TLSLD_LO, "got@tlsld@l"},
+    {PPC::S_GOT_TLSLD_PCREL, "got@tlsld@pcrel"},
+    {PPC::S_GOT_TPREL, "got@tprel"},
+    {PPC::S_GOT_TPREL_HA, "got@tprel@ha"},
+    {PPC::S_GOT_TPREL_HI, "got@tprel@h"},
+    {PPC::S_GOT_TPREL_LO, "got@tprel@l"},
+    {PPC::S_GOT_TPREL_PCREL, "got@tprel@pcrel"},
+    {PPC::S_LOCAL, "local"},
+    {PPC::S_NOTOC, "notoc"},
+    {PPC::S_PCREL_OPT, "<<invalid>>"},
+    {PPC::S_TLS, "tls"},
+    {PPC::S_TLS_PCREL, "tls@pcrel"},
+    {PPC::S_TPREL_HA, "tprel@ha"},
+    {PPC::S_TPREL_HI, "tprel@h"},
+    {PPC::S_TPREL_HIGH, "tprel@high"},
+    {PPC::S_TPREL_HIGHA, "tprel@higha"},
+    {PPC::S_TPREL_HIGHER, "tprel@higher"},
+    {PPC::S_TPREL_HIGHERA, "tprel@highera"},
+    {PPC::S_TPREL_HIGHEST, "tprel@highest"},
+    {PPC::S_TPREL_HIGHESTA, "tprel@highesta"},
+    {PPC::S_TPREL_LO, "tprel@l"},
+    {PPC::S_U, "u"},
 };
 
 PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 48806051f5814..9fbb73c2e3182 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -33,6 +33,90 @@ class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
   explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
 };
 
+namespace PPC {
+enum Specifier {
+  S_None,
+
+  S_LO,
+  S_HI,
+  S_HA,
+  S_HIGH,
+  S_HIGHA,
+  S_HIGHER,
+  S_HIGHERA,
+  S_HIGHEST,
+  S_HIGHESTA,
+
+  S_AIX_TLSGD,       // symbol@gd
+  S_AIX_TLSGDM,      // symbol@m
+  S_AIX_TLSIE,       // symbol@ie
+  S_AIX_TLSLD,       // symbol@ld
+  S_AIX_TLSLE,       // symbol@le
+  S_AIX_TLSML,       // symbol@ml
+  S_DTPMOD,          // symbol@dtpmod
+  S_DTPREL,          // symbol@dprel
+  S_DTPREL_HA,       // symbol@dtprel@ha
+  S_DTPREL_HI,       // symbol@dtprel@h
+  S_DTPREL_HIGH,     // symbol@dtprel@high
+  S_DTPREL_HIGHA,    // symbol@dtprel@higha
+  S_DTPREL_HIGHER,   // symbol@dtprel@higher
+  S_DTPREL_HIGHERA,  // symbol@dtprel@highera
+  S_DTPREL_HIGHEST,  // symbol@dtprel@highest
+  S_DTPREL_HIGHESTA, // symbol@dtprel@highesta
+  S_DTPREL_LO,       // symbol@dtprel@l
+  S_GOT,             // symbol@got
+  S_GOT_DTPREL,      // symbol@got@dtprel
+  S_GOT_DTPREL_HA,   // symbol@got@dtprel@ha
+  S_GOT_DTPREL_HI,   // symbol@got@dtprel@h
+  S_GOT_DTPREL_LO,   // symbol@got@dtprel@l
+  S_GOT_HA,          // symbol@got@ha
+  S_GOT_HI,          // symbol@got@h
+  S_GOT_LO,          // symbol@got@l
+  S_GOT_PCREL,       // symbol@got@pcrel
+  S_GOT_TLSGD,       // symbol@got@tlsgd
+  S_GOT_TLSGD_HA,    // symbol@got@tlsgd@ha
+  S_GOT_TLSGD_HI,    // symbol@got@tlsgd@h
+  S_GOT_TLSGD_LO,    // symbol@got@tlsgd@l
+  S_GOT_TLSGD_PCREL, // symbol@got@tlsgd@pcrel
+  S_GOT_TLSLD,       // symbol@got@tlsld
+  S_GOT_TLSLD_HA,    // symbol@got@tlsld@ha
+  S_GOT_TLSLD_HI,    // symbol@got@tlsld@h
+  S_GOT_TLSLD_LO,    // symbol@got@tlsld@l
+  S_GOT_TLSLD_PCREL, // symbol@got@tlsld@pcrel
+  S_GOT_TPREL,       // symbol@got@tprel
+  S_GOT_TPREL_HA,    // symbol@got@tprel@ha
+  S_GOT_TPREL_HI,    // symbol@got@tprel@h
+  S_GOT_TPREL_LO,    // symbol@got@tprel@l
+  S_GOT_TPREL_PCREL, // symbol@got@tprel@pcrel
+  S_L,               // symbol@l
+  S_LOCAL,           // symbol@local
+  S_NOTOC,           // symbol@notoc
+  S_PCREL,
+  S_PCREL_OPT,      // .reloc expr, R_PPC64_PCREL_OPT, expr
+  S_PLT,            // symbol@plt
+  S_TLS,            // symbol@tls
+  S_TLSGD,          // symbol@tlsgd
+  S_TLSLD,          // symbol@tlsld
+  S_TLS_PCREL,      // symbol@tls@pcrel
+  S_TOC,            // symbol@toc
+  S_TOCBASE,        // symbol@tocbase
+  S_TOC_HA,         // symbol@toc@ha
+  S_TOC_HI,         // symbol@toc@h
+  S_TOC_LO,         // symbol@toc@l
+  S_TPREL,          // symbol@tprel
+  S_TPREL_HA,       // symbol@tprel@ha
+  S_TPREL_HI,       // symbol@tprel@h
+  S_TPREL_HIGH,     // symbol@tprel@high
+  S_TPREL_HIGHA,    // symbol@tprel@higha
+  S_TPREL_HIGHER,   // symbol@tprel@higher
+  S_TPREL_HIGHERA,  // symbol@tprel@highera
+  S_TPREL_HIGHEST,  // symbol@tprel@highest
+  S_TPREL_HIGHESTA, // symbol@tprel@highesta
+  S_TPREL_LO,       // symbol@tprel@l
+  S_U,              // symbol@u
+};
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index ef067f745239c..b1b1c5280f2ae 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -329,11 +329,11 @@ PPCMCCodeEmitter::getDispRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(Expr);
     (void)SRE;
     // Currently these are the only valid PCRelative Relocations.
-    assert((getSpecifier(SRE) == PPCMCExpr::VK_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_TLSGD_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_TLSLD_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_TPREL_PCREL) &&
+    assert((getSpecifier(SRE) == PPC::S_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_TLSGD_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_TLSLD_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_TPREL_PCREL) &&
            "VariantKind must be VK_PCREL or VK_GOT_PCREL or "
            "VK_GOT_TLSGD_PCREL or VK_GOT_TLSLD_PCREL or "
            "VK_GOT_TPREL_PCREL.");
@@ -368,8 +368,8 @@ PPCMCCodeEmitter::getDispRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
            "Value must fit in 34 bits.");
 
     // Currently these are the only valid PCRelative Relocations.
-    assert((getSpecifier(SRE) == PPCMCExpr::VK_PCREL ||
-            getSpecifier(SRE) == PPCMCExpr::VK_GOT_PCREL) &&
+    assert((getSpecifier(SRE) == PPC::S_PCREL ||
+            getSpecifier(SRE) == PPC::S_GOT_PCREL) &&
            "VariantKind must be VK_PCREL or VK_GOT_PCREL");
     // Generate the fixup for the relocation.
     Fixups.push_back(
@@ -433,7 +433,7 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
   // if using PC relative memops.
   const MCExpr *Expr = MO.getExpr();
   const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(Expr);
-  bool IsPCRel = getSpecifier(SRE) == PPCMCExpr::VK_TLS_PCREL;
+  bool IsPCRel = getSpecifier(SRE) == PPC::S_TLS_PCREL;
   Fixups.push_back(MCFixup::create(IsPCRel ? 1 : 0, Expr,
                                    (MCFixupKind)PPC::fixup_ppc_nofixup));
   const Triple &TT = STI.getTargetTriple();
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 728b6799f94dc..49ae6bb5fa451 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -21,6 +21,11 @@ const PPCMCExpr *PPCMCExpr::create(Specifier S, const MCExpr *Expr,
   return new (Ctx) PPCMCExpr(S, Expr);
 }
 
+const PPCMCExpr *PPCMCExpr::create(const MCExpr *Expr, Specifier S,
+                                   MCContext &Ctx) {
+  return new (Ctx) PPCMCExpr(S, Expr);
+}
+
 void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   getSubExpr()->print(OS, MAI);
   OS << '@' << MAI->getSpecifierName(specifier);
@@ -44,23 +49,23 @@ PPCMCExpr::evaluateAsConstant(int64_t &Res) const {
 
 std::optional<int64_t> PPCMCExpr::evaluateAsInt64(int64_t Value) const {
   switch (specifier) {
-  case VK_LO:
+  case PPC::S_LO:
     return Value & 0xffff;
-  case VK_HI:
+  case PPC::S_HI:
     return (Value >> 16) & 0xffff;
-  case VK_HA:
+  case PPC::S_HA:
     return ((Value + 0x8000) >> 16) & 0xffff;
-  case VK_HIGH:
+  case PPC::S_HIGH:
     return (Value >> 16) & 0xffff;
-  case VK_HIGHA:
+  case PPC::S_HIGHA:
     return ((Value + 0x8000) >> 16) & 0xffff;
-  case VK_HIGHER:
+  case PPC::S_HIGHER:
     return (Value >> 32) & 0xffff;
-  case VK_HIGHERA:
+  case PPC::S_HIGHERA:
     return ((Value + 0x8000) >> 32) & 0xffff;
-  case VK_HIGHEST:
+  case PPC::S_HIGHEST:
     return (Value >> 48) & 0xffff;
-  case VK_HIGHESTA:
+  case PPC::S_HIGHESTA:
     return ((Value + 0x8000) >> 48) & 0xffff;
   default:
     return {};
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 3d0511da2749f..814217ea060e0 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
 #define LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCEXPR_H
 
+#include "PPCMCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCValue.h"
 #include <optional>
@@ -18,87 +19,6 @@ namespace llvm {
 class PPCMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = uint16_t;
-  enum {
-    VK_None,
-
-    VK_LO = MCSymbolRefExpr::FirstTargetSpecifier,
-    VK_HI,
-    VK_HA,
-    VK_HIGH,
-    VK_HIGHA,
-    VK_HIGHER,
-    VK_HIGHERA,
-    VK_HIGHEST,
-    VK_HIGHESTA,
-
-    VK_AIX_TLSGD,       // symbol@gd
-    VK_AIX_TLSGDM,      // symbol@m
-    VK_AIX_TLSIE,       // symbol@ie
-    VK_AIX_TLSLD,       // symbol@ld
-    VK_AIX_TLSLE,       // symbol@le
-    VK_AIX_TLSML,       // symbol@ml
-    VK_DTPMOD,          // symbol@dtpmod
-    VK_DTPREL,          // symbol@dprel
-    VK_DTPREL_HA,       // symbol@dtprel@ha
-    VK_DTPREL_HI,       // symbol@dtprel@h
-    VK_DTPREL_HIGH,     // symbol@dtprel@high
-    VK_DTPREL_HIGHA,    // symbol@dtprel@higha
-    VK_DTPREL_HIGHER,   // symbol@dtprel@higher
-    VK_DTPREL_HIGHERA,  // symbol@dtprel@highera
-    VK_DTPREL_HIGHEST,  // symbol@dtprel@highest
-    VK_DTPREL_HIGHESTA, // symbol@dtprel@highesta
-    VK_DTPREL_LO,       // symbol@dtprel@l
-    VK_GOT,             // symbol@got
-    VK_GOT_DTPREL,      // symbol@got@dtprel
-    VK_GOT_DTPREL_HA,   // symbol@got@dtprel@ha
-    VK_GOT_DTPREL_HI,   // symbol@got@dtprel@h
-    VK_GOT_DTPREL_LO,   // symbol@got@dtprel@l
-    VK_GOT_HA,          // symbol@got@ha
-    VK_GOT_HI,          // symbol@got@h
-    VK_GOT_LO,          // symbol@got@l
-    VK_GOT_PCREL,       // symbol@got@pcrel
-    VK_GOT_TLSGD,       // symbol@got@tlsgd
-    VK_GOT_TLSGD_HA,    // symbol@got@tlsgd@ha
-    VK_GOT_TLSGD_HI,    // symbol@got@tlsgd@h
-    VK_GOT_TLSGD_LO,    // symbol@got@tlsgd@l
-    VK_GOT_TLSGD_PCREL, // symbol@got@tlsgd@pcrel
-    VK_GOT_TLSLD,       // symbol@got@tlsld
-    VK_GOT_TLSLD_HA,    // symbol@got@tlsld@ha
-    VK_GOT_TLSLD_HI,    // symbol@got@tlsld@h
-    VK_GOT_TLSLD_LO,    // symbol@got@tlsld@l
-    VK_GOT_TLSLD_PCREL, // symbol@got@tlsld@pcrel
-    VK_GOT_TPREL,       // symbol@got@tprel
-    VK_GOT_TPREL_HA,    // symbol@got@tprel@ha
-    VK_GOT_TPREL_HI,    // symbol@got@tprel@h
-    VK_GOT_TPREL_LO,    // symbol@got@tprel@l
-    VK_GOT_TPREL_PCREL, // symbol@got@tprel@pcrel
-    VK_L,               // symbol@l
-    VK_LOCAL,           // symbol@local
-    VK_NOTOC,           // symbol@notoc
-    VK_PCREL,
-    VK_PCREL_OPT,      // .reloc expr, R_PPC64_PCREL_OPT, expr
-    VK_PLT,            // symbol@plt
-    VK_TLS,            // symbol@tls
-    VK_TLSGD,          // symbol@tlsgd
-    VK_TLSLD,          // symbol@tlsld
-    VK_TLS_PCREL,      // symbol@tls@pcrel
-    VK_TOC,            // symbol@toc
-    VK_TOCBASE,        // symbol@tocbase
-    VK_TOC_HA,         // symbol@toc@ha
-    VK_TOC_HI,         // symbol@toc@h
-    VK_TOC_LO,         // symbol@toc@l
-    VK_TPREL,          // symbol@tprel
-    VK_TPREL_HA,       // symbol@tprel@ha
-    VK_TPREL_HI,       // symbol@tprel@h
-    VK_TPREL_HIGH,     // symbol@tprel@high
-    VK_TPREL_HIGHA,    // symbol@tprel@higha
-    VK_TPREL_HIGHER,   // symbol@tprel@higher
-    VK_TPREL_HIGHERA,  // symbol@tprel@highera
-    VK_TPREL_HIGHEST,  // symbol@tprel@highest
-    VK_TPREL_HIGHESTA, // symbol@tprel@highesta
-    VK_TPREL_LO,       // symbol@tprel@l
-    VK_U,              // symbol@u
-  };
 
 private:
   std::optional<int64_t> evaluateAsInt64(int64_t Value) const;
@@ -109,18 +29,8 @@ class PPCMCExpr : public MCSpecifierExpr {
 public:
   static const PPCMCExpr *create(Specifier S, const MCExpr *Expr,
                                  MCContext &Ctx);
-
-  static const PPCMCExpr *createLo(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_LO, Expr, Ctx);
-  }
-
-  static const PPCMCExpr *createHi(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HI, Expr, Ctx);
-  }
-
-  static const PPCMCExpr *createHa(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HA, Expr, Ctx);
-  }
+  static const PPCMCExpr *create(const MCExpr *Expr, Specifier S,
+                                 MCContext &Ctx);
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
   bool evaluateAsRelocatableImpl(MCValue &Res,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 03a034182ae15..7f80c101bcc9c 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -221,9 +221,9 @@ class PPCTargetAsmStreamer : public PPCTargetStreamer {
       // variables. Finally for local-exec and initial-exec, we have a thread
       // pointer, in r13 for 64-bit mode and returned by .__get_tpointer for
       // 32-bit mode.
-      if (Kind == PPCMCExpr::VK_AIX_TLSGD || Kind == PPCMCExpr::VK_AIX_TLSGDM ||
-          Kind == PPCMCExpr::VK_AIX_TLSIE || Kind == PPCMCExpr::VK_AIX_TLSLE ||
-          Kind == PPCMCExpr::VK_AIX_TLSLD || Kind == PPCMCExpr::VK_AIX_TLSML)
+      if (Kind == PPC::S_AIX_TLSGD || Kind == PPC::S_AIX_TLSGDM ||
+          Kind == PPC::S_AIX_TLSIE || Kind == PPC::S_AIX_TLSLE ||
+          Kind == PPC::S_AIX_TLSLD || Kind == PPC::S_AIX_TLSML)
         OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << "@"
            << getContext().getAsmInfo()->getSpecifierName(Kind) << '\n';
       else
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index 1b5fe08bea49d..8532f537e2d6e 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -61,15 +61,15 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
     switch (Specifier) {
     default:
       report_fatal_error("Unsupported modifier for half16 fixup.");
-    case PPCMCExpr::VK_None:
+    case PPC::S_None:
       return {XCOFF::RelocationType::R_TOC, SignAndSizeForHalf16};
-    case PPCMCExpr::VK_U:
+    case PPC::S_U:
       return {XCOFF::RelocationType::R_TOCU, SignAndSizeForHalf16};
-    case PPCMCExpr::VK_L:
+    case PPC::S_L:
       return {XCOFF::RelocationType::R_TOCL, SignAndSizeForHalf16};
-    case PPCMCExpr::VK_AIX_TLSLE:
+    case PPC::S_AIX_TLSLE:
       return {XCOFF::RelocationType::R_TLS_LE, SignAndSizeForHalf16};
-    case PPCMCExpr::VK_AIX_TLSLD:
+    case PPC::S_AIX_TLSLD:
       return {XCOFF::RelocationType::R_TLS_LD, SignAndSizeForHalf16};
     }
   } break;
@@ -80,13 +80,13 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
     switch (Specifier) {
     default:
       llvm_unreachable("Unsupported Modifier");
-    case PPCMCExpr::VK_None:
+    case PPC::S_None:
       return {XCOFF::RelocationType::R_TOC, 15};
-    case PPCMCExpr::VK_L:
+    case PPC::S_L:
       return {XCOFF::RelocationType::R_TOCL, 15};
-    case PPCMCExpr::VK_AIX_TLSLE:
+    case PPC::S_AIX_TLSLE:
       return {XCOFF::RelocationType::R_TLS_LE, 15};
-    case PPCMCExpr::VK_AIX_TLSLD:
+    case PPC::S_AIX_TLSLD:
       return {XCOFF::RelocationType::R_TLS_LD, 15};
     }
   } break;
@@ -97,7 +97,7 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
   case PPC::fixup_ppc_br24abs:
     return {XCOFF::RelocationType::R_RBA, EncodedSignednessIndicator | 25};
   case PPC::fixup_ppc_nofixup: {
-    if (Specifier == PPCMCExpr::VK_None)
+    if (Specifier == PPC::S_None)
       return {XCOFF::RelocationType::R_REF, 0};
     else
       llvm_unreachable("Unsupported Modifier");
@@ -110,19 +110,19 @@ std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
     switch (Specifier) {
     default:
       report_fatal_error("Unsupported modifier");
-    case PPCMCExpr::VK_AIX_TLSGD:
+    case PPC::S_AIX_TLSGD:
       return {XCOFF::RelocationType::R_TLS, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSGDM:
+    case PPC::S_AIX_TLSGDM:
       return {XCOFF::RelocationType::R_TLSM, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSIE:
+    case PPC::S_AIX_TLSIE:
       return {XCOFF::RelocationType::R_TLS_IE, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSLE:
+    case PPC::S_AIX_TLSLE:
       return {XCOFF::RelocationType::R_TLS_LE, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSLD:
+    case PPC::S_AIX_TLSLD:
       return {XCOFF::RelocationType::R_TLS_LD, SignAndSizeForFKData};
-    case PPCMCExpr::VK_AIX_TLSML:
+    case PPC::S_AIX_TLSML:
       return {XCOFF::RelocationType::R_TLSML, SignAndSizeForFKData};
-    case PPCMCExpr::VK_None:
+    case PPC::S_None:
       return {XCOFF::RelocationType::R_POS, SignAndSizeForFKData};
     }
   }
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 0fe615a95894f..8a1357c5fd555 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -108,9 +108,9 @@ template <>
 struct DenseMapInfo<std::pair<const MCSymbol *, PPCMCExpr::Specifier>> {
   using TOCKey = std::pair<const MCSymbol *, PPCMCExpr::Specifier>;
 
-  static inline TOCKey getEmptyKey() { return {nullptr, PPCMCExpr::VK_None}; }
+  static inline TOCKey getEmptyKey() { return {nullptr, PPC::S_None}; }
   static inline TOCKey getTombstoneKey() {
-    return {(const MCSymbol *)1, PPCMCExpr::VK_None};
+    return {(const MCSymbol *)1, PPC::S_None};
   }
   static unsigned getHashValue(const TOCKey &PairVal) {
     return detail::combineHashValue(
@@ -174,9 +174,8 @@ class PPCAsmPrinter : public AsmPrinter {
     TOCType_EHBlock
   };
 
-  MCSymbol *
-  lookUpOrCreateTOCEntry(const MCSymbol *Sym, TOCEntryType Type,
-                         PPCMCExpr::Specifier Kind = PPCMCExpr::VK_None);
+  MCSymbol *lookUpOrCreateTOCEntry(const MCSymbol *Sym, TOCEntryType Type,
+                                   PPCMCExpr::Specifier Kind = PPC::S_None);
 
   bool doInitialization(Module &M) override {
     if (!TOC.empty())
@@ -691,13 +690,13 @@ void PPCAsmPrinter::EmitAIXTlsCallHelper(const MachineInstr *MI) {
 /// the current output stream.
 void PPCAsmPrinter::emitTlsCall(const MachineInstr *MI,
                                 PPCMCExpr::Specifier VK) {
-  PPCMCExpr::Specifier Kind = PPCMCExpr::VK_None;
+  PPCMCExpr::Specifier Kind = PPC::S_None;
   unsigned Opcode = PPC::BL8_NOP_TLS;
 
   assert(MI->getNumOperands() >= 3 && "Expecting at least 3 operands from MI");
   if (MI->getOperand(2).getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG ||
       MI->getOperand(2).getTargetFlags() == PPCII::MO_GOT_TLSLD_PCREL_FLAG) {
-    Kind = PPCMCExpr::VK_NOTOC;
+    Kind = PPC::S_NOTOC;
     Opcode = PPC::BL8_NOTOC_TLS;
   }
   const Module *M = MF->getFunction().getParent();
@@ -730,13 +729,13 @@ void PPCAsmPrinter::emitTlsCall(const MachineInstr *MI,
   MCSymbol *TlsGetAddr = OutContext.getOrCreateSymbol("__tls_get_addr");
 
   if (Subtarget->is32BitELFABI() && isPositionIndependent())
-    Kind = PPCMCExpr::VK_PLT;
+    Kind = PPC::S_PLT;
 
   const MCExpr *TlsRef = MCSymbolRefExpr::create(
       TlsGetAddr, MCSymbolRefExpr::VariantKind(Kind), OutContext);
 
   // Add 32768 offset to the symbol so we follow up the latest GOT/PLT ABI.
-  if (Kind == PPCMCExpr::VK_PLT && Subtarget->isSecurePlt() &&
+  if (Kind == PPC::S_PLT && Subtarget->isSecurePlt() &&
       M->getPICLevel() == PICLevel::BigPIC)
     TlsRef = MCBinaryExpr::createAdd(
         TlsRef, MCConstantExpr::create(32768, OutContext), OutContext);
@@ -861,7 +860,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   auto getTOCEntryLoadingExprForXCOFF =
       [IsPPC64, getTOCRelocAdjustedExprForXCOFF,
        this](const MCSymbol *MOSymbol, const MCExpr *Expr,
-             PPCMCExpr::Specifier VK = PPCMCExpr::VK_None) -> const MCExpr * {
+             PPCMCExpr::Specifier VK = PPC::S_None) -> const MCExpr * {
     const unsigned EntryByteSize = IsPPC64 ? 8 : 4;
     const auto TOCEntryIter = TOC.find({MOSymbol, VK});
     assert(TOCEntryIter != TOC.end() &&
@@ -886,9 +885,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       assert(MO.isGlobal() && "Only expecting a global MachineOperand here!\n");
       TLSModel::Model Model = TM.getTLSModel(MO.getGlobal());
       if (Model == TLSModel::LocalExec)
-        return PPCMCExpr::VK_AIX_TLSLE;
+        return PPC::S_AIX_TLSLE;
       if (Model == TLSModel::InitialExec)
-        return PPCMCExpr::VK_AIX_TLSIE;
+        return PPC::S_AIX_TLSIE;
       // On AIX, TLS model opt may have turned local-dynamic accesses into
       // initial-exec accesses.
       PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
@@ -896,7 +895,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
           FuncInfo->isAIXFuncUseTLSIEForLD()) {
         LLVM_DEBUG(
             dbgs() << "Current function uses IE access for default LD vars.\n");
-        return PPCMCExpr::VK_AIX_TLSIE;
+        return PPC::S_AIX_TLSIE;
       }
       llvm_unreachable("Only expecting local-exec or initial-exec accesses!");
     }
@@ -904,17 +903,17 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // the variable offset and the other for the region handle). They are
     // differentiated by MO_TLSGD_FLAG and MO_TLSGDM_FLAG.
     if (Flag == PPCII::MO_TLSGDM_FLAG)
-      return PPCMCExpr::VK_AIX_TLSGDM;
+      return PPC::S_AIX_TLSGDM;
     if (Flag == PPCII::MO_TLSGD_FLAG || Flag == PPCII::MO_GOT_TLSGD_PCREL_FLAG)
-      return PPCMCExpr::VK_AIX_TLSGD;
+      return PPC::S_AIX_TLSGD;
     // For local-dynamic TLS access on AIX, we have one TOC entry for the symbol
     // (the variable offset) and one shared TOC entry for the module handle.
     // They are differentiated by MO_TLSLD_FLAG and MO_TLSLDM_FLAG.
     if (Flag == PPCII::MO_TLSLD_FLAG && IsAIX)
-      return PPCMCExpr::VK_AIX_TLSLD;
+      return PPC::S_AIX_TLSLD;
     if (Flag == PPCII::MO_TLSLDM_FLAG && IsAIX)
-      return PPCMCExpr::VK_AIX_TLSML;
-    return PPCMCExpr::VK_None;
+      return PPC::S_AIX_TLSML;
+    return PPC::S_None;
   };
 
   // Lower multi-instruction pseudo operations.
@@ -955,8 +954,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
     const MCExpr *OffsExpr = MCBinaryExpr::createSub(
         MCSymbolRefExpr::create(
-            GOTSymbol, MCSymbolRefExpr::VariantKind(PPCMCExpr::VK_LOCAL),
-            OutContext),
+            GOTSymbol, MCSymbolRefExpr::VariantKind(PPC::S_LOCAL), OutContext),
         MCConstantExpr::create(4, OutContext), OutContext);
 
     // Emit the 'bl'.
@@ -1002,12 +1000,14 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       const MCExpr *DeltaExpr = MCBinaryExpr::createSub(
           MCSymbolRefExpr::create(BaseSymbol, OutContext), PB, OutContext);
 
-      const MCExpr *DeltaHi = PPCMCExpr::createHa(DeltaExpr, OutContext);
+      const MCExpr *DeltaHi =
+          PPCMCExpr::create(DeltaExpr, PPC::S_HA, OutContext);
       EmitToStreamer(
           *OutStreamer,
           MCInstBuilder(PPC::ADDIS).addReg(PICR).addReg(PICR).addExpr(DeltaHi));
 
-      const MCExpr *DeltaLo = PPCMCExpr::createLo(DeltaExpr, OutContext);
+      const MCExpr *DeltaLo =
+          PPCMCExpr::create(DeltaExpr, PPC::S_LO, OutContext);
       EmitToStreamer(
           *OutStreamer,
           MCInstBuilder(PPC::ADDI).addReg(PICR).addReg(PICR).addExpr(DeltaLo));
@@ -1055,7 +1055,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // Create a reference to the GOT entry for the symbol. The GOT entry will be
     // synthesized later.
     if (PL == PICLevel::SmallPIC && !IsAIX) {
-      const MCExpr *Exp = symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_GOT);
+      const MCExpr *Exp = symbolWithSpecifier(MOSymbol, PPC::S_GOT);
       TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
       EmitToStreamer(*OutStreamer, TmpInst);
       return;
@@ -1144,8 +1144,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     MCSymbol *TOCEntry =
         lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
-    PPCMCExpr::Specifier VKExpr =
-        IsAIX ? PPCMCExpr::VK_None : PPCMCExpr::VK_TOC;
+    PPCMCExpr::Specifier VKExpr = IsAIX ? PPC::S_None : PPC::S_TOC;
     const MCExpr *Exp = symbolWithSpecifier(TOCEntry, VKExpr);
     TmpInst.getOperand(1) = MCOperand::createExpr(
         IsAIX ? getTOCEntryLoadingExprForXCOFF(MOSymbol, Exp, VK) : Exp);
@@ -1195,7 +1194,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
     }
 
-    const MCExpr *Exp = symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_U);
+    const MCExpr *Exp = symbolWithSpecifier(MOSymbol, PPC::S_U);
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
@@ -1227,7 +1226,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // 'MOSymbol'.
     MCSymbol *TOCEntry =
         lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
-    const MCExpr *Exp = symbolWithSpecifier(TOCEntry, PPCMCExpr::VK_L);
+    const MCExpr *Exp = symbolWithSpecifier(TOCEntry, PPC::S_L);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
@@ -1260,7 +1259,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
         (MO.isCPI() && CM == CodeModel::Large))
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
-    VK = IsAIX ? PPCMCExpr::VK_U : PPCMCExpr::VK_TOC_HA;
+    VK = IsAIX ? PPC::S_U : PPC::S_TOC_HA;
 
     const MCExpr *Exp = symbolWithSpecifier(MOSymbol, VK);
 
@@ -1302,7 +1301,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     if (!MO.isCPI() || CM == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
 
-    VK = IsAIX ? PPCMCExpr::VK_L : PPCMCExpr::VK_TOC_LO;
+    VK = IsAIX ? PPC::S_L : PPC::S_TOC_LO;
     const MCExpr *Exp = symbolWithSpecifier(MOSymbol, VK);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
@@ -1332,8 +1331,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
     const MCExpr *Exp = MCSymbolRefExpr::create(
         MOSymbol,
-        MCSymbolRefExpr::VariantKind(IsAIX ? PPCMCExpr::VK_L
-                                           : PPCMCExpr::VK_TOC_LO),
+        MCSymbolRefExpr::VariantKind(IsAIX ? PPC::S_L : PPC::S_TOC_LO),
         OutContext);
 
     TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
@@ -1348,7 +1346,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTprel =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_GOT_TPREL_HA);
+        symbolWithSpecifier(MOSymbol, PPC::S_GOT_TPREL_HA);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addReg(MI->getOperand(1).getReg())
@@ -1365,9 +1363,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(1);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *Exp =
-        symbolWithSpecifier(MOSymbol, IsPPC64 ? PPCMCExpr::VK_GOT_TPREL_LO
-                                              : PPCMCExpr::VK_GOT_TPREL);
+    const MCExpr *Exp = symbolWithSpecifier(
+        MOSymbol, IsPPC64 ? PPC::S_GOT_TPREL_LO : PPC::S_GOT_TPREL);
     TmpInst.getOperand(1) = MCOperand::createExpr(Exp);
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
@@ -1405,11 +1402,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     MCSymbol *GOTSymbol =
         OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
     const MCExpr *SymGotTlsL = PPCMCExpr::create(
-        PPCMCExpr::VK_LO, MCSymbolRefExpr::create(GOTSymbol, OutContext),
-        OutContext);
+        PPC::S_LO, MCSymbolRefExpr::create(GOTSymbol, OutContext), OutContext);
     const MCExpr *SymGotTlsHA = PPCMCExpr::create(
-        PPCMCExpr::VK_HA, MCSymbolRefExpr::create(GOTSymbol, OutContext),
-        OutContext);
+        PPC::S_HA, MCSymbolRefExpr::create(GOTSymbol, OutContext), OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addExpr(SymGotTlsL));
@@ -1427,7 +1422,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsGD =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_GOT_TLSGD_HA);
+        symbolWithSpecifier(MOSymbol, PPC::S_GOT_TLSGD_HA);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addReg(MI->getOperand(1).getReg())
@@ -1443,9 +1438,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymGotTlsGD =
-        symbolWithSpecifier(MOSymbol, IsPPC64 ? PPCMCExpr::VK_GOT_TLSGD_LO
-                                              : PPCMCExpr::VK_GOT_TLSGD);
+    const MCExpr *SymGotTlsGD = symbolWithSpecifier(
+        MOSymbol, IsPPC64 ? PPC::S_GOT_TLSGD_LO : PPC::S_GOT_TLSGD);
     EmitToStreamer(*OutStreamer,
                    MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
                    .addReg(MI->getOperand(0).getReg())
@@ -1470,7 +1464,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::GETtlsADDR32: {
     // Transform: %r3 = GETtlsADDR32 %r3, @sym
     // Into: BL_TLS __tls_get_addr(sym at tlsgd)@PLT
-    emitTlsCall(MI, PPCMCExpr::VK_TLSGD);
+    emitTlsCall(MI, PPC::S_TLSGD);
     return;
   }
   case PPC::GETtlsTpointer32AIX: {
@@ -1487,7 +1481,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsLD =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_GOT_TLSLD_HA);
+        symbolWithSpecifier(MOSymbol, PPC::S_GOT_TLSLD_HA);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS8)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addReg(MI->getOperand(1).getReg())
@@ -1503,9 +1497,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymGotTlsLD =
-        symbolWithSpecifier(MOSymbol, IsPPC64 ? PPCMCExpr::VK_GOT_TLSLD_LO
-                                              : PPCMCExpr::VK_GOT_TLSLD);
+    const MCExpr *SymGotTlsLD = symbolWithSpecifier(
+        MOSymbol, IsPPC64 ? PPC::S_GOT_TLSLD_LO : PPC::S_GOT_TLSLD);
     EmitToStreamer(*OutStreamer,
                    MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
                        .addReg(MI->getOperand(0).getReg())
@@ -1520,7 +1513,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::GETtlsldADDR32: {
     // Transform: %r3 = GETtlsldADDR32 %r3, @sym
     // Into: BL_TLS __tls_get_addr(sym at tlsld)@PLT
-    emitTlsCall(MI, PPCMCExpr::VK_TLSLD);
+    emitTlsCall(MI, PPC::S_TLSLD);
     return;
   }
   case PPC::ADDISdtprelHA:
@@ -1532,8 +1525,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymDtprel =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_DTPREL_HA);
+    const MCExpr *SymDtprel = symbolWithSpecifier(MOSymbol, PPC::S_DTPREL_HA);
     EmitToStreamer(
         *OutStreamer,
         MCInstBuilder(IsPPC64 ? PPC::ADDIS8 : PPC::ADDIS)
@@ -1548,8 +1540,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymDtprel =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_DTPREL);
+    const MCExpr *SymDtprel = symbolWithSpecifier(MOSymbol, PPC::S_DTPREL);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::PADDI8)
                                      .addReg(MI->getOperand(0).getReg())
                                      .addReg(MI->getOperand(1).getReg())
@@ -1566,8 +1557,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
     MCSymbol *MOSymbol = getSymbol(GValue);
-    const MCExpr *SymDtprel =
-        symbolWithSpecifier(MOSymbol, PPCMCExpr::VK_DTPREL_LO);
+    const MCExpr *SymDtprel = symbolWithSpecifier(MOSymbol, PPC::S_DTPREL_LO);
     EmitToStreamer(*OutStreamer,
                    MCInstBuilder(IsPPC64 ? PPC::ADDI8 : PPC::ADDI)
                        .addReg(MI->getOperand(0).getReg())
@@ -1737,9 +1727,8 @@ PPCAsmPrinter::getAdjustedFasterLocalExpr(const MachineOperand &MO,
   // assume that the address of extern TLS variables are zero.
   const MCExpr *Expr = MCSymbolRefExpr::create(
       getSymbol(GValue),
-      MCSymbolRefExpr::VariantKind(Model == TLSModel::LocalExec
-                                       ? PPCMCExpr::VK_AIX_TLSLE
-                                       : PPCMCExpr::VK_AIX_TLSLD),
+      MCSymbolRefExpr::VariantKind(
+          Model == TLSModel::LocalExec ? PPC::S_AIX_TLSLE : PPC::S_AIX_TLSLD),
       OutContext);
   Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
@@ -2028,8 +2017,7 @@ void PPCLinuxAsmPrinter::emitFunctionEntryLabel() {
   // Generates a R_PPC64_TOC relocation for TOC base insertion.
   OutStreamer->emitValue(
       MCSymbolRefExpr::create(
-          Symbol2, MCSymbolRefExpr::VariantKind(PPCMCExpr::VK_TOCBASE),
-          OutContext),
+          Symbol2, MCSymbolRefExpr::VariantKind(PPC::S_TOCBASE), OutContext),
       8 /*size*/);
   // Emit a null environment pointer.
   OutStreamer->emitIntValue(0, 8 /* size */);
@@ -2136,13 +2124,15 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() {
         MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
                                 GlobalEntryLabelExp, OutContext);
 
-      const MCExpr *TOCDeltaHi = PPCMCExpr::createHa(TOCDeltaExpr, OutContext);
+      const MCExpr *TOCDeltaHi =
+          PPCMCExpr::create(TOCDeltaExpr, PPC::S_HA, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
                                    .addReg(PPC::X2)
                                    .addReg(PPC::X12)
                                    .addExpr(TOCDeltaHi));
 
-      const MCExpr *TOCDeltaLo = PPCMCExpr::createLo(TOCDeltaExpr, OutContext);
+      const MCExpr *TOCDeltaLo =
+          PPCMCExpr::create(TOCDeltaExpr, PPC::S_LO, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
                                    .addReg(PPC::X2)
                                    .addReg(PPC::X2)
@@ -3007,9 +2997,9 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
     // new symbol to prefix the name with a dot.
     // If TLS model opt is turned on, create a new symbol to prefix the name
     // with a dot.
-    if (I.first.second == PPCMCExpr::VK_AIX_TLSGDM ||
+    if (I.first.second == PPC::S_AIX_TLSGDM ||
         (Subtarget->hasAIXShLibTLSModelOpt() &&
-         I.first.second == PPCMCExpr::VK_AIX_TLSLD)) {
+         I.first.second == PPC::S_AIX_TLSLD)) {
       SmallString<128> Name;
       StringRef Prefix = ".";
       Name += Prefix;
diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index 0a04b7fb8d169..f6624ec989ee2 100644
--- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -54,31 +54,31 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
 static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
                               AsmPrinter &Printer) {
   MCContext &Ctx = Printer.OutContext;
-  PPCMCExpr::Specifier RefKind = PPCMCExpr::VK_None;
+  PPCMCExpr::Specifier RefKind = PPC::S_None;
 
   unsigned access = MO.getTargetFlags();
 
   switch (access) {
     case PPCII::MO_TPREL_LO:
-      RefKind = PPCMCExpr::VK_TPREL_LO;
+      RefKind = PPC::S_TPREL_LO;
       break;
     case PPCII::MO_TPREL_HA:
-      RefKind = PPCMCExpr::VK_TPREL_HA;
+      RefKind = PPC::S_TPREL_HA;
       break;
     case PPCII::MO_DTPREL_LO:
-      RefKind = PPCMCExpr::VK_DTPREL_LO;
+      RefKind = PPC::S_DTPREL_LO;
       break;
     case PPCII::MO_TLSLD_LO:
-      RefKind = PPCMCExpr::VK_GOT_TLSLD_LO;
+      RefKind = PPC::S_GOT_TLSLD_LO;
       break;
     case PPCII::MO_TOC_LO:
-      RefKind = PPCMCExpr::VK_TOC_LO;
+      RefKind = PPC::S_TOC_LO;
       break;
     case PPCII::MO_TLS:
-      RefKind = PPCMCExpr::VK_TLS;
+      RefKind = PPC::S_TLS;
       break;
     case PPCII::MO_TLS_PCREL_FLAG:
-      RefKind = PPCMCExpr::VK_TLS_PCREL;
+      RefKind = PPC::S_TLS_PCREL;
       break;
   }
 
@@ -87,19 +87,19 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   const MachineFunction *MF = MI->getMF();
 
   if (MO.getTargetFlags() == PPCII::MO_PLT)
-    RefKind = PPCMCExpr::VK_PLT;
+    RefKind = PPC::S_PLT;
   else if (MO.getTargetFlags() == PPCII::MO_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_PCREL;
+    RefKind = PPC::S_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_GOT_PCREL;
+    RefKind = PPC::S_GOT_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_TPREL_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_TPREL;
+    RefKind = PPC::S_TPREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSGD_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_GOT_TLSGD_PCREL;
+    RefKind = PPC::S_GOT_TLSGD_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_TLSLD_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_GOT_TLSLD_PCREL;
+    RefKind = PPC::S_GOT_TLSLD_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_GOT_TPREL_PCREL_FLAG)
-    RefKind = PPCMCExpr::VK_GOT_TPREL_PCREL;
+    RefKind = PPC::S_GOT_TPREL_PCREL;
   else if (MO.getTargetFlags() == PPCII::MO_TPREL_FLAG ||
            MO.getTargetFlags() == PPCII::MO_TLSLD_FLAG) {
     assert(MO.isGlobal() && "Only expecting a global MachineOperand here!");
@@ -110,14 +110,14 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
     // the relocation type in case the result is used for purposes other than a
     // TOC reference. In TOC reference cases, this result is discarded.
     if (Model == TLSModel::LocalExec)
-      RefKind = PPCMCExpr::VK_AIX_TLSLE;
+      RefKind = PPC::S_AIX_TLSLE;
     else if (Model == TLSModel::LocalDynamic &&
              FuncInfo->isAIXFuncUseTLSIEForLD())
       // On AIX, TLS model opt may have turned local-dynamic accesses into
       // initial-exec accesses.
-      RefKind = PPCMCExpr::VK_AIX_TLSIE;
+      RefKind = PPC::S_AIX_TLSIE;
     else if (Model == TLSModel::LocalDynamic)
-      RefKind = PPCMCExpr::VK_AIX_TLSLD;
+      RefKind = PPC::S_AIX_TLSLD;
   }
 
   const Module *M = MF->getFunction().getParent();
@@ -130,10 +130,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
     if (MIOpcode == PPC::TAILB || MIOpcode == PPC::TAILB8 ||
         MIOpcode == PPC::TCRETURNdi || MIOpcode == PPC::TCRETURNdi8 ||
         MIOpcode == PPC::BL8_NOTOC || MIOpcode == PPC::BL8_NOTOC_RM) {
-      RefKind = PPCMCExpr::VK_NOTOC;
+      RefKind = PPC::S_NOTOC;
     }
     if (MO.getTargetFlags() == PPCII::MO_PCREL_OPT_FLAG)
-      RefKind = PPCMCExpr::VK_PCREL_OPT;
+      RefKind = PPC::S_PCREL_OPT;
   }
 
   const MCExpr *Expr = MCSymbolRefExpr::create(
@@ -164,11 +164,11 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   switch (access) {
     case PPCII::MO_LO:
     case PPCII::MO_PIC_LO_FLAG:
-      Expr = PPCMCExpr::createLo(Expr, Ctx);
+      Expr = PPCMCExpr::create(Expr, PPC::S_LO, Ctx);
       break;
     case PPCII::MO_HA:
     case PPCII::MO_PIC_HA_FLAG:
-      Expr = PPCMCExpr::createHa(Expr, Ctx);
+      Expr = PPCMCExpr::create(Expr, PPC::S_HA, Ctx);
       break;
   }
 
diff --git a/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index 078f4b1effbb1..29e4286cf4ada 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPCTargetObjectFile.h"
-#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCMCAsmInfo.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
@@ -49,8 +49,8 @@ MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
 
 const MCExpr *PPC64LinuxTargetObjectFile::
 getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
-  const MCExpr *Expr = MCSymbolRefExpr::create(
-      Sym, MCSymbolRefExpr::VariantKind(PPCMCExpr::VK_DTPREL), getContext());
+  const MCExpr *Expr =
+      MCSymbolRefExpr::create(Sym, PPC::S_DTPREL, getContext());
   return MCBinaryExpr::createAdd(Expr,
                                  MCConstantExpr::create(0x8000, getContext()),
                                  getContext());

From a8d76acdd88b25a98e50ac2da9e6f311fc2c2cb8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 13:22:52 -0700
Subject: [PATCH 512/851] PowerPC: Replace MCExpr::print with
 MCAsmInfo::printExpr

Follow-up to 18b67a7a102c0052e5ae0e76ef1297902ffeb22d
---
 llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp  | 4 ++--
 llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index bd01767f41bd5..d587e7d339e81 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -590,7 +590,7 @@ void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
   if (Rhs) {
     SmallString<0> Buf;
     raw_svector_ostream Tmp(Buf);
-    Rhs->print(Tmp, &MAI);
+    MAI.printExpr(Tmp, *Rhs);
     if (isdigit(Buf[0]))
       O << '+';
     O << Buf;
@@ -671,5 +671,5 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(Op.isExpr() && "unknown operand kind in printOperand");
-  Op.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *Op.getExpr());
 }
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 7f80c101bcc9c..44b5732be6e3e 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -255,7 +255,7 @@ class PPCTargetAsmStreamer : public PPCTargetStreamer {
     OS << "\t.localentry\t";
     S->print(OS, MAI);
     OS << ", ";
-    LocalOffset->print(OS, MAI);
+    MAI->printExpr(OS, *LocalOffset);
     OS << '\n';
   }
 };

From 087a6ac420ad99c523b9dd517351e0c6d1f1a980 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 15 Jun 2025 21:22:29 +0100
Subject: [PATCH 513/851] [LV] Add users to some first-order recurrence tests.

Add extra users to ensure the recurrence cannot be DCE'd.

Also re-generates some checks.
---
 .../partial-reduce-dot-product-neon.ll        |  89 +-
 .../first-order-recurrence-scalable-vf1.ll    |  25 +-
 .../first-order-recurrence-chains.ll          | 927 +++++++++++++-----
 .../LoopVectorize/first-order-recurrence.ll   | 263 +++--
 .../scalable-first-order-recurrence.ll        | 851 ++++++++++++++--
 5 files changed, 1697 insertions(+), 458 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
index e6687fe767c0a..0fc324f720e60 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll
@@ -589,88 +589,100 @@ for.exit:                        ; preds = %for.body
   ret i32 %add
 }
 
-define i32 @not_dotp_not_phi(ptr %a, ptr %b) {
+define i32 @not_dotp_not_phi(ptr %a, ptr noalias %b, ptr noalias %c) {
 ; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVE1-NEXT:  entry:
 ; CHECK-INTERLEAVE1-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1:       vector.ph:
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVE1-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVE1-NEXT:    store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-INTERLEAVE1:       middle.block:
-; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
 ; CHECK-INTERLEAVE1-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVE1:       scalar.ph:
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-INTERLEAVED-NEXT:  entry:
 ; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED:       vector.ph:
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-INTERLEAVED-NEXT:    store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-INTERLEAVED:       middle.block:
-; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
 ; CHECK-INTERLEAVED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-INTERLEAVED:       scalar.ph:
 ;
 ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_phi(
-; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-MAXBW-NEXT:  entry:
 ; CHECK-MAXBW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW:       vector.ph:
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
-; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
-; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-MAXBW-NEXT:    [[TMP8]] = add <16 x i32> [[TMP7]], [[TMP6]]
+; CHECK-MAXBW-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = mul <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-MAXBW-NEXT:    [[TMP7]] = add <16 x i32> [[TMP6]], [[TMP5]]
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[C]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; CHECK-MAXBW-NEXT:    store <16 x i32> [[TMP8]], ptr [[TMP10]], align 4
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK-MAXBW:       middle.block:
-; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
-; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP8]], i32 15
+; CHECK-MAXBW-NEXT:    [[TMP12:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
+; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15
 ; CHECK-MAXBW-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW:       scalar.ph:
 ;
@@ -688,6 +700,8 @@ for.body:                                         ; preds = %for.body, %entry
   %ext.b = zext i8 %load.b to i32
   %mul = mul i32 %ext.b, %ext.a
   %add = add i32 %mul, %ext.b
+  %gep.c = getelementptr i32, ptr %c, i64 %iv
+  store i32 %accum, ptr %gep.c
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %for.exit, label %for.body
@@ -946,6 +960,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) {
 ; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK-MAXBW:       scalar.ph:
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
index 1d12f11b20e18..d340985457168 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll
@@ -6,9 +6,9 @@ target triple = "riscv64-unknown-linux-gnu"
 
 ; Make sure we do not pick <vscale x 1 x i64> as VF for a loop with a
 ; first-order recurrence.
-define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
+define i64 @pr97452_scalable_vf1_for(ptr %src, ptr noalias %dst) #0 {
 ; CHECK-LABEL: define i64 @pr97452_scalable_vf1_for(
-; CHECK-SAME: ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -17,8 +17,17 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[WIDE_LOAD1]] = load <4 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> [[WIDE_LOAD1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4
+; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
 ; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -31,15 +40,17 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 {
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[L]] = load i64, ptr [[GEP]], align 8
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i64 [[FOR]], ptr [[GEP_DST]], align 8
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 22
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[SCALAR_RECUR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = phi i64 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i64 [[RES]]
 ;
 entry:
@@ -49,8 +60,10 @@ loop:
   %for = phi i64 [ 0, %entry ], [ %l, %loop ]
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   %iv.next = add i64 %iv, 1
-  %gep = getelementptr inbounds i64, ptr %src, i64 %iv
-  %l = load i64, ptr %gep, align 8
+  %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
+  %l = load i64, ptr %gep.src, align 8
+  %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv
+  store i64 %for, ptr %gep.dst
   %ec = icmp eq i64 %iv, 22
   br i1 %ec, label %exit, label %loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
index 89268ac25c345..0c5784b32fc9f 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains.ll
@@ -1,26 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
 ; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
 
 define i16 @test_chained_first_order_recurrences_1(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_1
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_1(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    store <4 x i16> [[TMP4]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES]]
 ;
 entry:
   br label %loop
@@ -43,26 +71,53 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_2(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_2
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    store <4 x i16> [[TMP4]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP8]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:      middle.block:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1]] = phi i16 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES]]
 ;
 entry:
   br label %loop
@@ -85,31 +140,63 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
+; CHECK-NEXT:    store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -135,8 +222,23 @@ exit:
 }
 
 define void @test_cyclic_phis(ptr %ptr) {
-; CHECK-LABEL: @test_cyclic_phis
-; CHECK-NOT: vector.body:
+; CHECK-LABEL: define void @test_cyclic_phis(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ 22, %[[ENTRY]] ], [ [[FOR_2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2]] = phi i16 [ 33, %[[ENTRY]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT:%.*]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -158,19 +260,47 @@ exit:
 }
 
 define void @test_first_order_recurrences_incoming_cycle_preheader(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_incoming_cycle_preheader
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP4]], splat (i16 10)
-; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define void @test_first_order_recurrences_incoming_cycle_preheader(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[LOOP_1:.*]]
+; CHECK:       [[LOOP_1]]:
+; CHECK-NEXT:    br i1 true, label %[[LOOP_PREHEADER:.*]], label %[[LOOP_1]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[TMP2]], splat (i16 10)
+; CHECK-NEXT:    store <4 x i16> [[TMP3]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label %middle.block, label %vector.body
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD:%.*]] = add i16 [[FOR_1]], 10
+; CHECK-NEXT:    store i16 [[ADD]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop.1
@@ -195,31 +325,63 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3_reordered_1(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3_reordered_1
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_reordered_1(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_2:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
+; CHECK-NEXT:    store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -245,31 +407,63 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3_reordered_2(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3_reordered_2
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_reordered_2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
+; CHECK-NEXT:    store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -295,31 +489,63 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3_for2_no_other_uses
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i16> [[TMP4]], splat (i16 10)
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_for2_no_other_uses(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP2]], splat (i16 10)
+; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i16> [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    store <4 x i16> [[TMP6]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], 10
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
+; CHECK-NEXT:    store i16 [[ADD_2]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -345,30 +571,61 @@ exit:
 }
 
 define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3_for1_for2_no_other_uses
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %vector.ph ], [ [[TMP5:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP2]], align 2
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i16> [[TMP6]], splat (i16 10)
-; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP2]], align 2
+; CHECK-LABEL: define i16 @test_chained_first_order_recurrences_3_for1_for2_no_other_uses(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 22>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 33>, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3]] = shufflevector <4 x i16> [[VECTOR_RECUR1]], <4 x i16> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i16> [[TMP4]], splat (i16 10)
+; CHECK-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP10]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:      middle.block:
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI8:%.*]] = extractelement <4 x i16> [[TMP5]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT5:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT7:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT5]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT8:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR_2]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_3]], 10
+; CHECK-NEXT:    store i16 [[ADD_1]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i16 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi i16 [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi i16 [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI4]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = add i16 [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = add i16 [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret i16 [[RES_2]]
 ;
 entry:
   br label %loop
@@ -393,28 +650,56 @@ exit:
 }
 
 define double @test_chained_first_order_recurrence_sink_users_1(ptr %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrence_sink_users_1
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 1.000000e+01>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 2.000000e+01>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
+; CHECK-LABEL: define double @test_chained_first_order_recurrence_sink_users_1(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 1.000000e+01>, %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 2.000000e+01>, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[PTR:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x double> splat (double 1.000000e+01), [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds double, ptr [[PTR]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x double>, ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[TMP2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> splat (double 1.000000e+01), [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <4 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
-; CHECK-NEXT:    br i1 [[TMP9]], label %middle.block, label %vector.body, !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 996
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI3:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI2:%.*]] = extractelement <4 x double> [[TMP2]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 1.000000e+01, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT4:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 2.000000e+01, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 997, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi double [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR_1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd double 1.000000e+01, [[FOR_2]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd double [[ADD_1]], [[FOR_1]]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds double, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[FOR_1_NEXT]] = load double, ptr [[GEP_PTR]], align 8
+; CHECK-NEXT:    store double [[ADD_2]], ptr [[GEP_PTR]], align 8
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi double [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES:%.*]] = fadd double [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    ret double [[RES]]
 ;
 entry:
   br label %loop
@@ -438,8 +723,25 @@ exit:
 }
 
 define void @test_first_order_recurrences_and_reduction(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_reduction(
-; CHECK-NOT:   vector.body:
+; CHECK-LABEL: define void @test_first_order_recurrences_and_reduction(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i16 [ 22, %[[ENTRY]] ], [ [[RED:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED]] = phi i16 [ 33, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[LV:%.*]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[FOR_1_NEXT:%.*]] = load i16, ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i16 [[FOR_1]], 10
+; CHECK-NEXT:    [[RED_NEXT]] = add i16 [[RED]], [[LV]]
+; CHECK-NEXT:    store i16 [[ADD_1]], ptr [[GEP_PTR]], align 2
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %loop
@@ -463,24 +765,46 @@ exit:
 }
 
 define i64 @test_first_order_recurrences_and_induction(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_induction(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 22>, %vector.ph ], [ [[VEC_IND:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_IND]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-LABEL: define i64 @test_first_order_recurrences_and_induction(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 22>, %[[VECTOR_PH]] ], [ [[VEC_IND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP1]], splat (i64 10)
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
-; CHECK-NEXT:    br i1 true
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i64 [[FOR_1]], 10
+; CHECK-NEXT:    store i64 [[ADD_1]], ptr [[GEP_PTR]], align 4
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i64 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[FOR_1_LCSSA]]
+;
 
 entry:
   br label %loop
@@ -502,24 +826,45 @@ exit:
 ; Same as @test_first_order_recurrences_and_induction but with order of phis
 ; flipped.
 define i64 @test_first_order_recurrences_and_induction2(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_induction2(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 22>, %vector.ph ], [ [[VEC_IND]], %vector.body ]
+; CHECK-LABEL: define i64 @test_first_order_recurrences_and_induction2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 22>, %[[VECTOR_PH]] ], [ [[VEC_IND]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP1]], splat (i64 10)
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP3]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[VEC_IND]], i32 3
-; CHECK-NEXT:    br i1 true
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i64 [[FOR_1]], 10
+; CHECK-NEXT:    store i64 [[ADD_1]], ptr [[GEP_PTR]], align 4
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi i64 [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[FOR_1_LCSSA]]
 ;
 entry:
   br label %loop
@@ -539,26 +884,50 @@ exit:
 }
 
 define ptr @test_first_order_recurrences_and_pointer_induction1(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_pointer_induction1(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 4000
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %vector.ph ], [ [[TMP0:%.*]], %vector.body ]
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
+; CHECK-LABEL: define ptr @test_first_order_recurrences_and_pointer_induction1(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 4000
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %[[VECTOR_PH]] ], [ [[TMP0:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
 ; CHECK-NEXT:    store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    store <4 x ptr> [[TMP1]], ptr [[TMP4]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 true
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[PTR_IV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_PTR:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[IV]]
+; CHECK-NEXT:    store ptr [[PTR_IV]], ptr [[GEP_PTR]], align 8
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i32, ptr [[PTR_IV]], i64 1
+; CHECK-NEXT:    store ptr [[FOR_1]], ptr [[GEP_PTR]], align 8
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP25:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi ptr [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret ptr [[FOR_1_LCSSA]]
 ;
 entry:
   br label %loop
@@ -571,6 +940,7 @@ loop:
   %gep.ptr = getelementptr inbounds ptr, ptr %ptr, i64 %iv
   store ptr %ptr.iv, ptr %gep.ptr
   %ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1
+  store ptr %for.1, ptr %gep.ptr
   %exitcond.not = icmp eq i64 %iv.next, 1000
   br i1 %exitcond.not, label %exit, label %loop
 
@@ -581,26 +951,50 @@ exit:
 ; same as @test_first_order_recurrences_and_pointer_induction1 but with order
 ; of phis flipped.
 define ptr @test_first_order_recurrences_and_pointer_induction2(ptr %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_and_pointer_induction2(
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 4000
-; CHECK-NEXT:    br label %vector.body
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %vector.ph ], [ [[PTR_IND:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %vector.ph ], [ [[TMP0:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP0]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    store <4 x ptr> [[TMP0]], ptr [[TMP4]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-LABEL: define ptr @test_first_order_recurrences_and_pointer_induction2(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 4000
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ <ptr poison, ptr poison, ptr poison, ptr null>, %[[VECTOR_PH]] ], [ [[VECTOR_GEP:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_GEP]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x ptr> [[VECTOR_RECUR]], <4 x ptr> [[VECTOR_GEP]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    store <4 x ptr> [[TMP1]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 4
 ; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[VECTOR_GEP]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[PTR]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi ptr [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi ptr [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[PTR_IV]], %[[LOOP]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[PTR]], i64 [[INDEX]]
+; CHECK-NEXT:    store ptr [[PTR_IV]], ptr [[TMP3]], align 8
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i32, ptr [[PTR_IV]], i64 1
+; CHECK-NEXT:    store ptr [[FOR_1]], ptr [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP5]], label %middle.block, label %vector.body
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 2
-; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x ptr> [[TMP0]], i32 3
-; CHECK-NEXT:    br i1 true
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi ptr [ [[FOR_1]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret ptr [[FOR_1_LCSSA]]
 ;
 entry:
   br label %loop
@@ -613,6 +1007,7 @@ loop:
   %gep.ptr = getelementptr inbounds ptr, ptr %ptr, i64 %iv
   store ptr %ptr.iv, ptr %gep.ptr
   %ptr.iv.next = getelementptr i32, ptr %ptr.iv, i64 1
+  store ptr %for.1, ptr %gep.ptr
   %exitcond.not = icmp eq i64 %iv.next, 1000
   br i1 %exitcond.not, label %exit, label %loop
 
@@ -623,39 +1018,64 @@ exit:
 ; In this test case, %USE_2_FORS uses 2 different fixed-order recurrences and
 ; it needs to be sunk past the previous value for both recurrences.
 define double @test_resinking_required(ptr %p, ptr noalias %a, ptr noalias %b) {
-; CHECK-LABEL: @test_resinking_required(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[BROADCAST_SPLAT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[BROADCAST_SPLAT4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[TMP4:%.*]], %vector.body ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr %a, align 8
+; CHECK-LABEL: define double @test_resinking_required(
+; CHECK-SAME: ptr [[P:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %[[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %[[VECTOR_PH]] ], [ [[BROADCAST_SPLAT4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR2:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[A]], align 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP0]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP2:%.*]] = fdiv <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr %b, align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[B]], align 8
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x double> poison, double [[TMP3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT3]], <4 x double> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP4]] = shufflevector <4 x double> [[VECTOR_RECUR1]], <4 x double> [[BROADCAST_SPLAT4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP2]], i32 3
-; CHECK-NEXT:    store double [[TMP6]], ptr [[P:%.*]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR2]], <4 x double> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP8]], i32 3
+; CHECK-NEXT:    store double [[TMP6]], ptr [[P]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI10:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT9:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
-; CHECK-NEXT:    br i1 true, label %End, label %scalar.ph
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    phi double [ [[TMP0]], %middle.block ], [ 0.000000e+00, %Entry ]
-; CHECK-NEXT:    phi double [ [[TMP3]], %middle.block ], [ 0.000000e+00, %Entry ]
-; CHECK-NEXT:    phi double [ [[VECTOR_RECUR_EXTRACT9]], %middle.block ], [ 0.000000e+00, %Entry ]
-; CHECK-NEXT:    %bc.resume.val = phi i64 [ 1000, %middle.block ], [ 0, %Entry ]
-; CHECK:      End:
-; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP0]], %middle.block ]
-; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[TMP3]], %middle.block ]
-; CHECK-NEXT:    = phi double [ {{.+}}, %Loop ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI10]], %middle.block ]
+; CHECK-NEXT:    br i1 true, label %[[END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT5:%.*]] = phi double [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT6:%.*]] = phi double [ [[VECTOR_RECUR_EXTRACT9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[FOR_1:%.*]] = phi double [ [[L1:%.*]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[FOR_2:%.*]] = phi double [ [[L2:%.*]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT5]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[FOR_3:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[USE_2_FORS:%.*]] = fdiv double [[FOR_3]], [[FOR_1]]
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double 0.000000e+00, [[FOR_1]]
+; CHECK-NEXT:    [[L1]] = load double, ptr [[A]], align 8
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[L2]] = load double, ptr [[B]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[DIV]], [[FOR_3]]
+; CHECK-NEXT:    store double [[ADD]], ptr [[P]], align 8
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[COND]], label %[[END]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[FOR_1_LCSSA:%.*]] = phi double [ [[FOR_1]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_2_LCSSA:%.*]] = phi double [ [[FOR_2]], %[[LOOP]] ], [ [[TMP3]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[FOR_3_LCSSA:%.*]] = phi double [ [[FOR_3]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RES_1:%.*]] = fadd double [[FOR_1_LCSSA]], [[FOR_2_LCSSA]]
+; CHECK-NEXT:    [[RES_2:%.*]] = fadd double [[RES_1]], [[FOR_3_LCSSA]]
+; CHECK-NEXT:    ret double [[RES_2]]
 ;
 Entry:
   br label %Loop
@@ -670,7 +1090,8 @@ Loop:
   %l1 = load double, ptr %a, align 8
   %iv.next= add nuw nsw i64 %iv, 1
   %l2 = load double, ptr %b, align 8
-  store double %div, ptr %p, align 8
+  %add = fadd double %div, %for.3
+  store double %add, ptr %p, align 8
   %cond = icmp eq i64 %iv.next, 1000
   br i1 %cond, label %End, label %Loop
 
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 594b8ff70feb8..d28db1c77efaa 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -870,7 +870,7 @@ for.end:
 ; }
 ;
 ;
-define i32 @PR27246() {
+define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC-LABEL: @PR27246(
 ; UNROLL-NO-IC-NEXT:  entry:
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
@@ -882,21 +882,25 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[FOR_COND1]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 -4)
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
@@ -904,19 +908,21 @@ define i32 @PR27246() {
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND2:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
 ; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; UNROLL-NO-IC-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; UNROLL-NO-IC:       for.cond1:
-; UNROLL-NO-IC-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
 ; UNROLL-NO-IC-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; UNROLL-NO-IC-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; UNROLL-NO-IC-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
+; UNROLL-NO-IC-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-IC:       for.cond.cleanup3:
-; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -932,34 +938,39 @@ define i32 @PR27246() {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[FOR_COND1]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]]
-; UNROLL-NO-VF-NEXT:    [[TMP1]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
+; UNROLL-NO-VF-NEXT:    [[TMP3]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND2:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; UNROLL-NO-VF:       for.cond1:
-; UNROLL-NO-VF-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; UNROLL-NO-VF-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; UNROLL-NO-VF-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; UNROLL-NO-VF-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
 ; UNROLL-NO-VF-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; UNROLL-NO-VF-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-VF:       for.cond.cleanup3:
-; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; UNROLL-NO-VF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; UNROLL-NO-VF-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -975,20 +986,24 @@ define i32 @PR27246() {
 ; SINK-AFTER:       vector.ph:
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; SINK-AFTER-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3
 ; SINK-AFTER-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; SINK-AFTER-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
 ; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[FOR_COND1]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ]
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; SINK-AFTER-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
-; SINK-AFTER-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SINK-AFTER-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SINK-AFTER-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
@@ -996,19 +1011,21 @@ define i32 @PR27246() {
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; SINK-AFTER-NEXT:    br label [[FOR_COND2:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
 ; SINK-AFTER-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; SINK-AFTER-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; SINK-AFTER:       for.cond1:
-; SINK-AFTER-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; SINK-AFTER-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; SINK-AFTER-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; SINK-AFTER-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
 ; SINK-AFTER-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; SINK-AFTER-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; SINK-AFTER-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; SINK-AFTER-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
+; SINK-AFTER-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SINK-AFTER:       for.cond.cleanup3:
-; SINK-AFTER-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
 ; SINK-AFTER-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; SINK-AFTER-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; SINK-AFTER-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -1028,8 +1045,10 @@ for.cond.cleanup:
 for.cond1:
   %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
   %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
+  %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %i.016
   %cmp2 = icmp sgt i32 %k.0, 1
   %dec = add nsw i32 %k.0, -1
+  store i32 %e.1, ptr %gep.dst
   br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
 
 for.cond.cleanup3:
@@ -1056,7 +1075,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10
@@ -1074,10 +1093,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP17]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP22]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP18]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP16]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP17]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP24]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP18]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1160,7 +1179,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; SINK-AFTER-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
@@ -1178,10 +1197,10 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
 ; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
-; SINK-AFTER-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; SINK-AFTER-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 1
-; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP17]], i32 2
-; SINK-AFTER-NEXT:    [[TMP22]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP18]], i32 3
+; SINK-AFTER-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
+; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP16]], i32 1
+; SINK-AFTER-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP17]], i32 2
+; SINK-AFTER-NEXT:    [[TMP24]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP18]], i32 3
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1371,19 +1390,19 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT:    [[TMP3]] = add i32 [[TMP1]], [[X]]
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i32 [[VECTOR_RECUR]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP4]] = add i32 [[TMP0]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 96
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
 ; UNROLL-NO-VF-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
@@ -2650,7 +2669,7 @@ for.end:
   ret void
 }
 
-define i32 @sink_into_replication_region(i32 %y) {
+define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-IC-LABEL: @sink_into_replication_region(
 ; UNROLL-NO-IC-NEXT:  bb:
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
@@ -2741,18 +2760,74 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC:       pred.udiv.continue18:
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP37]], [[PRED_UDIV_IF17]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]]
 ; UNROLL-NO-IC:       pred.udiv.if19:
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE21]]
 ; UNROLL-NO-IC:       pred.udiv.continue20:
 ; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP42]], [[PRED_UDIV_IF19]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
 ; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI1]], [[TMP45]]
+; UNROLL-NO-IC-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP64]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NO-IC:       pred.store.if:
+; UNROLL-NO-IC-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[TMP44]], i32 0
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP65]], ptr [[DST:%.*]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; UNROLL-NO-IC:       pred.store.continue:
+; UNROLL-NO-IC-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; UNROLL-NO-IC:       pred.store.if21:
+; UNROLL-NO-IC-NEXT:    [[TMP67:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP67]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE22]]
+; UNROLL-NO-IC:       pred.store.continue22:
+; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP52]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; UNROLL-NO-IC:       pred.store.if23:
+; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP53]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE24]]
+; UNROLL-NO-IC:       pred.store.continue24:
+; UNROLL-NO-IC-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; UNROLL-NO-IC:       pred.store.if25:
+; UNROLL-NO-IC-NEXT:    [[TMP55:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP55]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE26]]
+; UNROLL-NO-IC:       pred.store.continue26:
+; UNROLL-NO-IC-NEXT:    [[TMP56:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP56]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; UNROLL-NO-IC:       pred.store.if27:
+; UNROLL-NO-IC-NEXT:    [[TMP57:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP57]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE28]]
+; UNROLL-NO-IC:       pred.store.continue28:
+; UNROLL-NO-IC-NEXT:    [[TMP58:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP58]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; UNROLL-NO-IC:       pred.store.if29:
+; UNROLL-NO-IC-NEXT:    [[TMP59:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP59]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE30]]
+; UNROLL-NO-IC:       pred.store.continue30:
+; UNROLL-NO-IC-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
+; UNROLL-NO-IC:       pred.store.if31:
+; UNROLL-NO-IC-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP61]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE32]]
+; UNROLL-NO-IC:       pred.store.continue32:
+; UNROLL-NO-IC-NEXT:    [[TMP62:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP62]], label [[PRED_STORE_IF33:%.*]], label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC:       pred.store.if33:
+; UNROLL-NO-IC-NEXT:    [[TMP63:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; UNROLL-NO-IC-NEXT:    store i32 [[TMP63]], ptr [[DST]], align 4
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
+; UNROLL-NO-IC:       pred.store.continue34:
 ; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -2777,6 +2852,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; UNROLL-NO-IC-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-IC-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
+; UNROLL-NO-IC-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; UNROLL-NO-IC-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2808,15 +2884,25 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.udiv.continue:
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
 ; UNROLL-NO-VF:       pred.udiv.if3:
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = udiv i32 219220132, [[TMP7]]
-; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
 ; UNROLL-NO-VF:       pred.udiv.continue4:
 ; UNROLL-NO-VF-NEXT:    [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]]
 ; UNROLL-NO-VF-NEXT:    [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP6]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; UNROLL-NO-VF:       pred.store.if:
+; UNROLL-NO-VF-NEXT:    store i32 [[VECTOR_RECUR]], ptr [[DST:%.*]], align 4
+; UNROLL-NO-VF-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; UNROLL-NO-VF:       pred.store.continue:
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF5:%.*]], label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF:       pred.store.if5:
+; UNROLL-NO-VF-NEXT:    store i32 [[TMP6]], ptr [[DST]], align 4
+; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-VF:       pred.store.continue6:
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]]
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2840,6 +2926,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-VF-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; UNROLL-NO-VF-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-VF-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
+; UNROLL-NO-VF-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; UNROLL-NO-VF-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2894,16 +2981,44 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER:       pred.udiv.continue6:
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP16]], [[PRED_UDIV_IF5]] ]
 ; SINK-AFTER-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]]
+; SINK-AFTER-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
 ; SINK-AFTER:       pred.udiv.if7:
 ; SINK-AFTER-NEXT:    [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; SINK-AFTER-NEXT:    [[TMP20:%.*]] = udiv i32 219220132, [[TMP19]]
 ; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i32 3
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
 ; SINK-AFTER:       pred.udiv.continue8:
 ; SINK-AFTER-NEXT:    [[TMP22]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ]
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]]
+; SINK-AFTER-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
+; SINK-AFTER-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; SINK-AFTER:       pred.store.if:
+; SINK-AFTER-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
+; SINK-AFTER-NEXT:    store i32 [[TMP34]], ptr [[DST:%.*]], align 4
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; SINK-AFTER:       pred.store.continue:
+; SINK-AFTER-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; SINK-AFTER-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; SINK-AFTER:       pred.store.if9:
+; SINK-AFTER-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
+; SINK-AFTER-NEXT:    store i32 [[TMP28]], ptr [[DST]], align 4
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; SINK-AFTER:       pred.store.continue10:
+; SINK-AFTER-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
+; SINK-AFTER-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; SINK-AFTER:       pred.store.if11:
+; SINK-AFTER-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
+; SINK-AFTER-NEXT:    store i32 [[TMP30]], ptr [[DST]], align 4
+; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE12]]
+; SINK-AFTER:       pred.store.continue12:
+; SINK-AFTER-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
+; SINK-AFTER-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF13:%.*]], label [[PRED_UDIV_CONTINUE8]]
+; SINK-AFTER:       pred.store.if13:
+; SINK-AFTER-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
+; SINK-AFTER-NEXT:    store i32 [[TMP32]], ptr [[DST]], align 4
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
+; SINK-AFTER:       pred.store.continue14:
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2926,6 +3041,7 @@ define i32 @sink_into_replication_region(i32 %y) {
 ; SINK-AFTER-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; SINK-AFTER-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; SINK-AFTER-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
+; SINK-AFTER-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; SINK-AFTER-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; SINK-AFTER-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2943,6 +3059,7 @@ bb:
   %var6 = add i32 %var5, %var4
   %var7 = udiv i32 219220132, %var3
   %var8 = add nsw i32 %var3, -1
+  store i32 %var4, ptr %dst
   %var9 = icmp slt i32 %var3, 2
   br i1 %var9, label %bb1, label %bb2, !prof !2
 }
@@ -3430,28 +3547,28 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]]
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP7]] = zext i16 [[TMP5]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP10]] = zext i16 [[TMP5]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP8]], align 4
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP9]], align 4
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP7]] = add nuw i32 [[VECTOR_RECUR]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[LOOP:%.*]]
 ; UNROLL-NO-VF:       loop:
 ; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 59727aeb8249a..e1b264620261b 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "(for.body:|scalar.body:)" --filter-out-after "for.body:" --version 5
 ; RUN: opt -passes=loop-vectorize -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=1 -force-target-supports-scalable-vectors=true -S < %s | FileCheck %s --check-prefix=CHECK-VF4UF1
 ; RUN: opt -passes=loop-vectorize -scalable-vectorization=on -force-vector-width=4 -force-vector-interleave=2 -force-target-supports-scalable-vectors=true -S < %s | FileCheck %s --check-prefix=CHECK-VF4UF2
 
@@ -7,30 +8,150 @@
 ; }
 ;
 define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
-; CHECK-VF4UF1-LABEL: @recurrence_1
-; CHECK-VF4UF1: for.preheader
-; CHECK-VF4UF1: %[[SUB_1:.*]] = add i32 %n, -1
-; CHECK-VF4UF1: %[[ZEXT:.*]] = zext i32 %[[SUB_1]] to i64
-; CHECK-VF4UF1: %[[ADD:.*]] = add nuw nsw i64 %[[ZEXT]], 1
-; CHECK-VF4UF1: vector.ph:
-; CHECK-VF4UF1: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], 4
-; CHECK-VF4UF1: %[[SUB1:.*]] = sub i32 %[[MUL1]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %pre_load, i32 %[[SUB1]]
-; CHECK-VF4UF1: vector.body:
-; CHECK-VF4UF1: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %[[NEXT_IDX:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
-; CHECK-VF4UF1: middle.block:
-; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 %[[MUL2]], 2
-; CHECK-VF4UF1: %[[VEC_RECUR_FOR_PHI:.*]] =  extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB3]]
-; CHECK-VF4UF1: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4
-; CHECK-VF4UF1: %[[SUB3:.*]] = sub i32 %[[MUL3]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_EXT:.*]] = extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB3]]
+; CHECK-VF4UF1-LABEL: define i32 @recurrence_1(
+; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF1-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-VF4UF1-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_PREHEADER:.*]]
+; CHECK-VF4UF1:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF1-NEXT:    [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = add i64 [[B1]], -4
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
+; CHECK-VF4UF1-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP7]]
+; CHECK-VF4UF1-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP11]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[PRE_LOAD]], i32 [[TMP16]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP20]]
+; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
+; CHECK-VF4UF1-NEXT:    store <vscale x 4 x i32> [[TMP22]], ptr [[TMP23]], align 4
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP26]], 2
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP27]]
+; CHECK-VF4UF1-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP29]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP30]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], %[[FOR_PREHEADER]] ], [ [[PRE_LOAD]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF1:       [[SCALAR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define i32 @recurrence_1(
+; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF2-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-VF4UF2-NEXT:    [[B1:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_PREHEADER:.*]]
+; CHECK-VF4UF2:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF2-NEXT:    [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = add i64 [[B1]], -4
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP8]], [[A2]]
+; CHECK-VF4UF2-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP9]], [[TMP7]]
+; CHECK-VF4UF2-NEXT:    br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP11]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[PRE_LOAD]], i32 [[TMP16]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP17]]
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = mul i64 [[TMP20]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 [[TMP21]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD3]] = load <vscale x 4 x i32>, ptr [[TMP22]], align 4
+; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD3]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP23]]
+; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD3]], [[TMP24]]
+; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP29]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP30]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP26]], ptr [[TMP28]], align 4
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP27]], ptr [[TMP31]], align 4
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = mul i32 [[TMP33]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], 2
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD3]], i32 [[TMP35]]
+; CHECK-VF4UF2-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul i32 [[TMP36]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD3]], i32 [[TMP38]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], %[[FOR_PREHEADER]] ], [ [[PRE_LOAD]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF2:       [[SCALAR_BODY]]:
+;
 entry:
   br label %for.preheader
 
@@ -63,21 +184,142 @@ for.exit:
 ; }
 ;
 define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
-; CHECK-VF4UF1-LABEL: @recurrence_2
-; CHECK-VF4UF1: vector.ph:
-; CHECK-VF4UF1: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], 4
-; CHECK-VF4UF1: %[[SUB1:.*]] = sub i32 %[[MUL1]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 %.pre, i32 %[[SUB1]]
-; CHECK-VF4UF1: vector.body:
-; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i32>, ptr
-; CHECK-VF4UF1: %[[REVERSE:.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %[[VEC_RECUR]], <vscale x 4 x i32> %[[LOAD]], i32 -1)
-; CHECK-VF4UF1: middle.block:
-; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF1: %[[SUB2:.*]] = sub i32 %[[MUL2]], 1
-; CHECK-VF4UF1: %[[VEC_RECUR_EXT:.*]] = extractelement <vscale x 4 x i32> %[[LOAD]], i32 %[[SUB2]]
+; CHECK-VF4UF1-LABEL: define i32 @recurrence_2(
+; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF1-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK-VF4UF1:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF1-NEXT:    [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
+; CHECK-VF4UF1-NEXT:    [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DOTPRE]], i32 [[TMP9]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP12]]
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = icmp sgt <vscale x 4 x i32> [[TMP13]], zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = icmp slt <vscale x 4 x i32> [[VEC_PHI]], [[TMP15]]
+; CHECK-VF4UF1-NEXT:    [[TMP17]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP15]]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP17]])
+; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP21]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD]], i32 [[TMP22]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF1:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-VF4UF1-NEXT:    [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP19]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK-VF4UF1:       [[FOR_COND_CLEANUP]]:
+; CHECK-VF4UF1-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-VF4UF1-NEXT:    ret i32 [[MINMAX_0_LCSSA]]
+; CHECK-VF4UF1:       [[SCALAR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define i32 @recurrence_2(
+; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], i32 [[N:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF2-NEXT:    [[CMP27:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP27]], label %[[FOR_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK-VF4UF2:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF2-NEXT:    [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
+; CHECK-VF4UF2-NEXT:    [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = mul i64 [[TMP1]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DOTPRE]], i32 [[TMP9]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD2:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP25:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ undef, %[[VECTOR_PH]] ], [ [[TMP26:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]]
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD2]] = load <vscale x 4 x i32>, ptr [[TMP14]], align 4
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32> [[WIDE_LOAD2]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = sub nsw <vscale x 4 x i32> [[WIDE_LOAD]], [[TMP15]]
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = sub nsw <vscale x 4 x i32> [[WIDE_LOAD2]], [[TMP16]]
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = icmp sgt <vscale x 4 x i32> [[TMP17]], zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = icmp sgt <vscale x 4 x i32> [[TMP18]], zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = select <vscale x 4 x i1> [[TMP19]], <vscale x 4 x i32> [[TMP17]], <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = select <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> [[TMP18]], <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = icmp slt <vscale x 4 x i32> [[VEC_PHI]], [[TMP21]]
+; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = icmp slt <vscale x 4 x i32> [[VEC_PHI1]], [[TMP22]]
+; CHECK-VF4UF2-NEXT:    [[TMP25]] = select <vscale x 4 x i1> [[TMP23]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP21]]
+; CHECK-VF4UF2-NEXT:    [[TMP26]] = select <vscale x 4 x i1> [[TMP24]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> [[TMP22]]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP27]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[RDX_MINMAX:%.*]] = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> [[TMP25]], <vscale x 4 x i32> [[TMP26]])
+; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[RDX_MINMAX]])
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[WIDE_LOAD2]], i32 [[TMP31]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ undef, %[[FOR_PREHEADER]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF2:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-VF4UF2-NEXT:    [[MINMAX_0_COND_LCSSA:%.*]] = phi i32 [ [[MINMAX_0_COND:%.*]], %[[SCALAR_BODY]] ], [ [[TMP28]], %[[MIDDLE_BLOCK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK-VF4UF2:       [[FOR_COND_CLEANUP]]:
+; CHECK-VF4UF2-NEXT:    [[MINMAX_0_LCSSA:%.*]] = phi i32 [ undef, %[[ENTRY]] ], [ [[MINMAX_0_COND_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-VF4UF2-NEXT:    ret i32 [[MINMAX_0_LCSSA]]
+; CHECK-VF4UF2:       [[SCALAR_BODY]]:
+;
 entry:
   %cmp27 = icmp sgt i32 %n, 0
   br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
@@ -113,23 +355,180 @@ scalar.body:
 }
 
 define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, float %f, i16 %p) {
-; CHECK-VF4UF1: vector.ph:
-; CHECK-VF4UF1: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], 4
-; CHECK-VF4UF1: %[[SUB1:.*]] = sub i32 %[[MUL1]], 1
-; CHECK-VF4UF1: %vector.recur.init = insertelement <vscale x 4 x i16> poison, i16 %0, i32 %[[SUB1]]
-; CHECK-VF4UF1: vector.body:
-; CHECK-VF4UF1: %vector.recur = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[L1:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[L1]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %vector.recur, <vscale x 4 x i16> %[[L1]], i32 -1)
+; CHECK-VF4UF1-LABEL: define void @recurrence_3(
+; CHECK-VF4UF1-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]], float [[F:%.*]], i16 [[P:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-VF4UF1-NEXT:    [[CONV:%.*]] = sitofp i16 [[TMP0]] to double
+; CHECK-VF4UF1-NEXT:    [[CONV1:%.*]] = fpext float [[F]] to double
+; CHECK-VF4UF1-NEXT:    [[CONV2:%.*]] = sitofp i16 [[P]] to double
+; CHECK-VF4UF1-NEXT:    [[MUL:%.*]] = fmul fast double [[CONV2]], [[CONV1]]
+; CHECK-VF4UF1-NEXT:    [[SUB:%.*]] = fsub fast double [[CONV]], [[MUL]]
+; CHECK-VF4UF1-NEXT:    store double [[SUB]], ptr [[B]], align 8
+; CHECK-VF4UF1-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[N]], 1
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP25]], label %[[FOR_PREHEADER:.*]], [[FOR_END:label %.*]]
+; CHECK-VF4UF1:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = add i32 [[N]], -2
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 16
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 2
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP6]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 4
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-VF4UF1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]]
+; CHECK-VF4UF1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]]
+; CHECK-VF4UF1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP12]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = add i64 1, [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x double> poison, double [[CONV1]], i64 0
+; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x double> [[BROADCAST_SPLATINSERT]], <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP17]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP0]], i32 [[TMP18]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
+; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i16>, ptr [[TMP20]], align 2, !alias.scope [[META6:![0-9]+]]
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = sitofp <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x double>
+; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = sitofp <vscale x 4 x i16> [[TMP21]] to <vscale x 4 x double>
+; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = fmul fast <vscale x 4 x double> [[TMP23]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF1-NEXT:    [[TMP25:%.*]] = fsub fast <vscale x 4 x double> [[TMP22]], [[TMP24]]
+; CHECK-VF4UF1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4UF1-NEXT:    [[TMP27:%.*]] = getelementptr inbounds double, ptr [[TMP26]], i32 0
+; CHECK-VF4UF1-NEXT:    store <vscale x 4 x double> [[TMP25]], ptr [[TMP27]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-VF4UF1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP30:%.*]] = mul i32 [[TMP29]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP30]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP31]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ 1, %[[FOR_PREHEADER]] ], [ 1, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF1:       [[SCALAR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define void @recurrence_3(
+; CHECK-VF4UF2-SAME: ptr readonly captures(none) [[A:%.*]], ptr captures(none) [[B:%.*]], i32 [[N:%.*]], float [[F:%.*]], i16 [[P:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*:]]
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-VF4UF2-NEXT:    [[CONV:%.*]] = sitofp i16 [[TMP0]] to double
+; CHECK-VF4UF2-NEXT:    [[CONV1:%.*]] = fpext float [[F]] to double
+; CHECK-VF4UF2-NEXT:    [[CONV2:%.*]] = sitofp i16 [[P]] to double
+; CHECK-VF4UF2-NEXT:    [[MUL:%.*]] = fmul fast double [[CONV2]], [[CONV1]]
+; CHECK-VF4UF2-NEXT:    [[SUB:%.*]] = fsub fast double [[CONV]], [[MUL]]
+; CHECK-VF4UF2-NEXT:    store double [[SUB]], ptr [[B]], align 8
+; CHECK-VF4UF2-NEXT:    [[CMP25:%.*]] = icmp sgt i32 [[N]], 1
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP25]], label %[[FOR_PREHEADER:.*]], [[FOR_END:label %.*]]
+; CHECK-VF4UF2:       [[FOR_PREHEADER]]:
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = add i32 [[N]], -2
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 3
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP7]], 16
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP8]]
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 2
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP6]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 4
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP10]]
+; CHECK-VF4UF2-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]]
+; CHECK-VF4UF2-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]]
+; CHECK-VF4UF2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP12]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = add i64 1, [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x double> poison, double [[CONV1]], i64 0
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x double> [[BROADCAST_SPLATINSERT]], <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = mul i32 [[TMP16]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = sub i32 [[TMP17]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP0]], i32 [[TMP18]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[OFFSET_IDX]]
+; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[TMP19]], i64 [[TMP22]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP20]], align 2, !alias.scope [[META6:![0-9]+]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD4]] = load <vscale x 4 x i16>, ptr [[TMP23]], align 2, !alias.scope [[META6]]
+; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP25:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16> [[WIDE_LOAD4]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = sitofp <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x double>
+; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = sitofp <vscale x 4 x i16> [[WIDE_LOAD4]] to <vscale x 4 x double>
+; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = sitofp <vscale x 4 x i16> [[TMP24]] to <vscale x 4 x double>
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = sitofp <vscale x 4 x i16> [[TMP25]] to <vscale x 4 x double>
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = fmul fast <vscale x 4 x double> [[TMP28]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = fmul fast <vscale x 4 x double> [[TMP29]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = fsub fast <vscale x 4 x double> [[TMP26]], [[TMP30]]
+; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = fsub fast <vscale x 4 x double> [[TMP27]], [[TMP31]]
+; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-VF4UF2-NEXT:    [[TMP35:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP36:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP37:%.*]] = mul i64 [[TMP36]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP38:%.*]] = getelementptr inbounds double, ptr [[TMP34]], i64 [[TMP37]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x double> [[TMP32]], ptr [[TMP35]], align 8, !alias.scope [[META9:![0-9]+]], !noalias [[META6]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x double> [[TMP33]], ptr [[TMP38]], align 8, !alias.scope [[META9]], !noalias [[META6]]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
+; CHECK-VF4UF2-NEXT:    [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP39]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP40:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP41:%.*]] = mul i32 [[TMP40]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP42:%.*]] = sub i32 [[TMP41]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD4]], i32 [[TMP42]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[FOR_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP15]], %[[MIDDLE_BLOCK]] ], [ 1, %[[FOR_PREHEADER]] ], [ 1, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF2:       [[SCALAR_BODY]]:
+;
 ; Check also that the casts were not moved needlessly.
-; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[L1]] to <vscale x 4 x double>
-; CHECK-VF4UF1: sitofp <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x double>
-; CHECK-VF4UF1: middle.block:
-; CHECK-VF4UF1: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF1: %[[SUB2:.*]] = sub i32 %[[MUL2]], 1
-; CHECK-VF4UF1: %vector.recur.extract = extractelement <vscale x 4 x i16> %[[L1]], i32 %[[SUB2]]
 entry:
   %0 = load i16, ptr %a, align 2
   %conv = sitofp i16 %0 to double
@@ -168,10 +567,72 @@ for.end:
 }
 
 define i64 @constant_folded_previous_value() {
-; CHECK-VF4UF2-LABEL: @constant_folded_previous_value
-; CHECK-VF4UF2: vector.body
-; CHECK-VF4UF2: %[[VECTOR_RECUR:.*]] = phi <vscale x 4 x i64> [ %vector.recur.init, %vector.ph ], [ splat (i64 1), %vector.body ]
-; CHECK-VF4UF2: br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK-VF4UF1-LABEL: define i64 @constant_folded_previous_value() {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 0, i32 [[TMP8]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ splat (i64 1), %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF1:       [[SCALAR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define i64 @constant_folded_previous_value() {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 0, i32 [[TMP8]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ splat (i64 1), %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[SCALAR_BODY:.*]]
+; CHECK-VF4UF2:       [[SCALAR_BODY]]:
+;
 entry:
   br label %scalar.body
 
@@ -180,7 +641,7 @@ scalar.body:
   %tmp2 = phi i64 [ 0, %entry ], [ %tmp3, %scalar.body ]
   %tmp3 = add i64 0, 1
   %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, undef
+  %cond = icmp eq i64 %i.next, 1000
   br i1 %cond, label %for.end, label %scalar.body, !llvm.loop !0
 
 for.end:
@@ -193,28 +654,113 @@ for.end:
 ; the first order recurrence phi is used outside the loop, so we require the phi
 ; itself and not its update (addx).
 define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
-; CHECK-VF4UF2-LABEL: @extract_second_last_iteration
-; CHECK-VF4UF2: vector.ph
-; CHECK-VF4UF2: call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: %[[SPLAT_INS1:.*]] = insertelement <vscale x 4 x i32> poison, i32 %x, i64 0
-; CHECK-VF4UF2: %[[SPLAT1:.*]] = shufflevector <vscale x 4 x i32> %[[SPLAT_INS1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: %[[MUL1:.*]] = mul i32 %[[VSCALE1]], 4
-; CHECK-VF4UF2: %[[SUB1:.*]] = sub i32 %[[MUL1]], 1
-; CHECK-VF4UF2: %[[VEC_RECUR_INIT:.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 %[[SUB1]]
-; ; CHECK-VF4UF2: vector.body
-; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i32> [ %[[VEC_RECUR_INIT]], %vector.ph ], [ %[[ADD2:.*]], %vector.body ]
-; CHECK-VF4UF2: %[[ADD1:.*]] = add <vscale x 4 x i32> %{{.*}}, %[[SPLAT1]]
-; CHECK-VF4UF2: middle.block
-; CHECK-VF4UF2: %[[VSCALE2:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: %[[MUL2:.*]] = mul i32 %[[VSCALE2]], 4
-; CHECK-VF4UF2: %[[SUB3:.*]] = sub i32 %[[MUL2]], 2
-; CHECK-VF4UF2: %vector.recur.extract.for.phi = extractelement <vscale x 4 x i32> %[[ADD2]], i32 %[[SUB3]]
-; CHECK-VF4UF2: %[[VSCALE3:.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: %[[MUL3:.*]] = mul i32 %[[VSCALE3]], 4
-; CHECK-VF4UF2: %[[SUB2:.*]] = sub i32 %[[MUL3]], 1
-; CHECK-VF4UF2: %vector.recur.extract = extractelement <vscale x 4 x i32> %[[ADD2]], i32 %[[SUB2]]
+; CHECK-VF4UF1-LABEL: define i32 @extract_second_last_iteration(
+; CHECK-VF4UF1-SAME: ptr [[CVAL:%.*]], i32 [[X:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 96, [[TMP1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i32 96, [[TMP3]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i32> [[TMP6]], splat (i32 1)
+; CHECK-VF4UF1-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP7]]
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 1, [[TMP5]]
+; CHECK-VF4UF1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP8]], i64 0
+; CHECK-VF4UF1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP19]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
+; CHECK-VF4UF1-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 2
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP13]]
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP16]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4UF1:       [[FOR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define i32 @extract_second_last_iteration(
+; CHECK-VF4UF2-SAME: ptr [[CVAL:%.*]], i32 [[X:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 96, [[TMP1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i32 96, [[TMP3]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i32> [[TMP7]], splat (i32 1)
+; CHECK-VF4UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP5]], i64 0
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP19]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; CHECK-VF4UF2-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
+; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP12]], 2
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP13]]
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP9]], i32 [[TMP16]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i32 96, [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4UF2:       [[FOR_BODY]]:
+;
 entry:
   br label %for.body
 
@@ -238,13 +784,140 @@ for.end:
 
 ; Check that the sext sank after the load in the vector loop.
 define void @sink_after(ptr %a, ptr %b, i64 %n) {
-; CHECK-VF4UF1-LABEL: @sink_after
-; CHECK-VF4UF1: vector.body
-; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[LOAD:.*]], %vector.body ]
-; CHECK-VF4UF1: %[[LOAD]] = load <vscale x 4 x i16>, ptr
-; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[LOAD]], i32 -1)
-; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
-; CHECK-VF4UF1-NEXT: sext <vscale x 4 x i16> %[[LOAD]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-LABEL: define void @sink_after(
+; CHECK-VF4UF1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4UF1-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF1-NEXT:    [[DOTPRE:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-VF4UF1-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-VF4UF1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF1:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP2:%.*]] = shl i64 [[N]], 2
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A]], i64 2
+; CHECK-VF4UF1-NEXT:    [[TMP3:%.*]] = shl i64 [[N]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 2
+; CHECK-VF4UF1-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-VF4UF1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-VF4UF1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
+; CHECK-VF4UF1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VF4UF1-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF1:       [[VECTOR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-VF4UF1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP11]]
+; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF1:       [[VECTOR_BODY]]:
+; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF1-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-VF4UF1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP12]]
+; CHECK-VF4UF1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
+; CHECK-VF4UF1-NEXT:    [[WIDE_LOAD]] = load <vscale x 4 x i16>, ptr [[TMP14]], align 2, !alias.scope [[META17:![0-9]+]]
+; CHECK-VF4UF1-NEXT:    [[TMP15:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF1-NEXT:    [[TMP16:%.*]] = sext <vscale x 4 x i16> [[TMP15]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = mul nsw <vscale x 4 x i32> [[TMP17]], [[TMP16]]
+; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4UF1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
+; CHECK-VF4UF1-NEXT:    store <vscale x 4 x i32> [[TMP18]], ptr [[TMP20]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
+; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-VF4UF1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4UF1:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF1-NEXT:    [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF1-NEXT:    [[TMP23:%.*]] = mul i32 [[TMP22]], 4
+; CHECK-VF4UF1-NEXT:    [[TMP24:%.*]] = sub i32 [[TMP23]], 1
+; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP24]]
+; CHECK-VF4UF1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4UF1-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF1:       [[SCALAR_PH]]:
+; CHECK-VF4UF1-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[ENTRY]] ], [ [[DOTPRE]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF1-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4UF1:       [[FOR_BODY]]:
+;
+; CHECK-VF4UF2-LABEL: define void @sink_after(
+; CHECK-VF4UF2-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-VF4UF2-NEXT:  [[ENTRY:.*]]:
+; CHECK-VF4UF2-NEXT:    [[DOTPRE:%.*]] = load i16, ptr [[A]], align 2
+; CHECK-VF4UF2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-VF4UF2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-VF4UF2:       [[VECTOR_MEMCHECK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP2:%.*]] = shl i64 [[N]], 2
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]]
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A]], i64 2
+; CHECK-VF4UF2-NEXT:    [[TMP3:%.*]] = shl i64 [[N]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 2
+; CHECK-VF4UF2-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]]
+; CHECK-VF4UF2-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]]
+; CHECK-VF4UF2-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
+; CHECK-VF4UF2-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-VF4UF2-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-VF4UF2:       [[VECTOR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
+; CHECK-VF4UF2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]]
+; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP10]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP11]]
+; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-VF4UF2:       [[VECTOR_BODY]]:
+; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[WIDE_LOAD3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-VF4UF2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP12]]
+; CHECK-VF4UF2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP13]], i64 [[TMP16]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP14]], align 2, !alias.scope [[META17:![0-9]+]]
+; CHECK-VF4UF2-NEXT:    [[WIDE_LOAD3]] = load <vscale x 4 x i16>, ptr [[TMP17]], align 2, !alias.scope [[META17]]
+; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_LOAD]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = call <vscale x 4 x i16> @llvm.vector.splice.nxv4i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16> [[WIDE_LOAD3]], i32 -1)
+; CHECK-VF4UF2-NEXT:    [[TMP20:%.*]] = sext <vscale x 4 x i16> [[TMP18]] to <vscale x 4 x i32>
+; CHECK-VF4UF2-NEXT:    [[TMP21:%.*]] = sext <vscale x 4 x i16> [[TMP19]] to <vscale x 4 x i32>
+; CHECK-VF4UF2-NEXT:    [[TMP22:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; CHECK-VF4UF2-NEXT:    [[TMP23:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD3]] to <vscale x 4 x i32>
+; CHECK-VF4UF2-NEXT:    [[TMP24:%.*]] = mul nsw <vscale x 4 x i32> [[TMP22]], [[TMP20]]
+; CHECK-VF4UF2-NEXT:    [[TMP25:%.*]] = mul nsw <vscale x 4 x i32> [[TMP23]], [[TMP21]]
+; CHECK-VF4UF2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-VF4UF2-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0
+; CHECK-VF4UF2-NEXT:    [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT:    [[TMP29:%.*]] = mul i64 [[TMP28]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i64 [[TMP29]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP24]], ptr [[TMP27]], align 4, !alias.scope [[META20:![0-9]+]], !noalias [[META17]]
+; CHECK-VF4UF2-NEXT:    store <vscale x 4 x i32> [[TMP25]], ptr [[TMP30]], align 4, !alias.scope [[META20]], !noalias [[META17]]
+; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-VF4UF2-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:
+; CHECK-VF4UF2-NEXT:    [[TMP32:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-VF4UF2-NEXT:    [[TMP33:%.*]] = mul i32 [[TMP32]], 4
+; CHECK-VF4UF2-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP33]], 1
+; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD3]], i32 [[TMP34]]
+; CHECK-VF4UF2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-VF4UF2-NEXT:    br i1 [[CMP_N]], [[FOR_END:label %.*]], label %[[SCALAR_PH]]
+; CHECK-VF4UF2:       [[SCALAR_PH]]:
+; CHECK-VF4UF2-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[DOTPRE]], %[[ENTRY]] ], [ [[DOTPRE]], %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-VF4UF2-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-VF4UF2:       [[FOR_BODY]]:
+;
 entry:
   %.pre = load i16, ptr %a
   br label %for.body

From 790df93298b3ad6c57dafb55fc6d18bddff16c4a Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sun, 15 Jun 2025 21:59:30 +0100
Subject: [PATCH 514/851] [VPlan] Mark VPFirstOrderRecurrencePHI as not
 reading/writing memory.

First-order recurrence phis don't have side-effects and don't read or
write memory. Mark them as such.
---
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |   9 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   7 +
 .../LoopVectorize/AArch64/induction-costs.ll  |   9 +-
 .../AArch64/loop-vectorization-factors.ll     |   6 +-
 .../AArch64/partial-reduce-dot-product.ll     |  28 +---
 .../RISCV/blocks-with-dead-instructions.ll    |  40 -----
 .../X86/fixed-order-recurrence.ll             |  13 +-
 .../LoopVectorize/X86/induction-costs.ll      |   3 +-
 .../X86/pr131359-dead-for-splice.ll           |   6 +-
 .../Transforms/LoopVectorize/X86/pr72969.ll   |   3 +-
 .../first-order-recurrence-interleave-only.ll |   7 +-
 .../LoopVectorize/first-order-recurrence.ll   | 145 +++++++-----------
 .../interleave-and-scalarize-only.ll          |   3 +-
 llvm/test/Transforms/LoopVectorize/optsize.ll |   3 +-
 .../scalable-first-order-recurrence.ll        |  30 +---
 15 files changed, 99 insertions(+), 213 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 10906d9a30df8..cca3d32c0783e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1071,7 +1071,14 @@ void VPlan::execute(VPTransformState *State) {
 InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
   // For now only return the cost of the vector loop region, ignoring any other
   // blocks, like the preheader or middle blocks.
-  return getVectorLoopRegion()->cost(VF, Ctx);
+  InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
+
+  // If any instructions in the middle block are invalid return invalid.
+  // TODO: Remove once no VPlans with VF == vscale x 1 and first-order recurrences are created.
+  if (!getMiddleBlock()->cost(VF, Ctx).isValid())
+    return InstructionCost::getInvalid();
+
+  return Cost;
 }
 
 VPRegionBlock *VPlan::getVectorLoopRegion() {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3bdfa6724f691..048286d7a97bc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -66,6 +66,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
   case VPBranchOnMaskSC:
+  case VPFirstOrderRecurrencePHISC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
     return false;
@@ -113,6 +114,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayReadFromMemory();
   case VPBranchOnMaskSC:
+  case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
   case VPScalarIVStepsSC:
   case VPWidenStoreEVLSC:
@@ -146,6 +148,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
 bool VPRecipeBase::mayHaveSideEffects() const {
   switch (getVPDefID()) {
   case VPDerivedIVSC:
+  case VPFirstOrderRecurrencePHISC:
   case VPPredInstPHISC:
   case VPVectorEndPointerSC:
     return false;
@@ -837,6 +840,10 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
                                   I32Ty, {Arg0Ty, I32Ty, I1Ty});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
   }
+  case VPInstruction::ExtractPenultimateElement:
+    if (VF == ElementCount::getScalable(1))
+      return InstructionCost::getInvalid();
+  LLVM_FALLTHROUGH;
   default:
     // TODO: Compute cost other VPInstructions once the legacy cost model has
     // been retired.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index 6ed9c856f50cc..4af4929fad521 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -105,12 +105,11 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i64> [ <i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 2
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP9]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -162,12 +161,11 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4
 ; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64>
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -306,7 +304,6 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
@@ -316,7 +313,7 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[WIDE_LOAD]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i32> [[TMP7]] to <4 x i64>
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i32> [[STEP_ADD]] to <4 x i64>
-; CHECK-NEXT:    [[TMP10]] = shl <4 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shl <4 x i64> [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index e97bb857fdba3..31be8862a8872 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -750,11 +750,10 @@ define void @add_phifail(ptr noalias nocapture readonly %p, ptr noalias nocaptur
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP3]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <16 x i32> [[TMP3]], splat (i32 2)
 ; CHECK-NEXT:    [[TMP5:%.*]] = trunc <16 x i32> [[TMP4]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
@@ -832,11 +831,10 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw <16 x i32> [[TMP5]], splat (i32 2)
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i8>
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[INDEX]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 52dcba69d036a..b091452e28b4a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -890,14 +890,9 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
@@ -917,7 +912,7 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP25:%.*]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1)
 ; CHECK-INTERLEAVED-NEXT:    [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
@@ -1020,14 +1015,9 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVE1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-INTERLEAVE1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVE1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1:       vector.body:
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP11]], align 1
@@ -1037,7 +1027,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP16:%.*]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]]
-; CHECK-INTERLEAVE1-NEXT:    [[TMP17]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
+; CHECK-INTERLEAVE1-NEXT:    [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]]
 ; CHECK-INTERLEAVE1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVE1-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVE1-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1067,14 +1057,9 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 16
-; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED:       vector.body:
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 8
@@ -1088,7 +1073,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP20]], align 1
 ; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
 ; CHECK-INTERLEAVED-NEXT:    [[TMP30:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT:    [[TMP21]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]]
 ; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-INTERLEAVED-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1118,14 +1103,9 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-MAXBW-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]]
 ; CHECK-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW:       vector.body:
 ; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-MAXBW-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1
@@ -1135,7 +1115,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 {
 ; CHECK-MAXBW-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP18]], align 1
 ; CHECK-MAXBW-NEXT:    [[TMP19:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
 ; CHECK-MAXBW-NEXT:    [[TMP20:%.*]] = mul <vscale x 8 x i32> [[TMP19]], [[TMP14]]
-; CHECK-MAXBW-NEXT:    [[TMP21]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
+; CHECK-MAXBW-NEXT:    [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]]
 ; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-MAXBW-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-MAXBW-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
index ea8df1669288d..d41caca97e1fa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll
@@ -31,15 +31,10 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 3, [[TMP10]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 8
-; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 1, i32 [[TMP19]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -121,15 +116,10 @@ define void @block_with_dead_inst_2(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 3, [[TMP7]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 0, i32 [[TMP16]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
@@ -211,15 +201,10 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 3, [[TMP7]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 0, i32 [[TMP16]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
@@ -311,15 +296,10 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 3, [[TMP10]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 8
-; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 1, i32 [[TMP19]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -413,15 +393,10 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 3, [[TMP7]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP15]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 1, i32 [[TMP16]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i16.nxv4p0(<vscale x 4 x i16> zeroinitializer, <vscale x 4 x ptr> [[TMP17]], i32 2, <vscale x 4 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
@@ -523,15 +498,10 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 %
 ; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 3, [[TMP10]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP16]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 8
-; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 1, i32 [[TMP19]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i16> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], <vscale x 8 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> [[TMP20]], i32 2, <vscale x 8 x i1> splat (i1 true))
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
@@ -621,14 +591,9 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 1, i32 [[TMP8]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
@@ -705,14 +670,9 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP6]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 1, i32 [[TMP8]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ zeroinitializer, %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP11]], align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
index 83e9d6146755d..3361068c99220 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll
@@ -217,14 +217,12 @@ define i64 @test_pr62954_scalar_epilogue_required(ptr %A, ptr noalias %B, ptr %C
 ; CHECK-NEXT:    [[REC_START:%.*]] = load i64, ptr [[GEP]], align 8
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <2 x i64> poison, i64 [[REC_START]], i32 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 1, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i64> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 4)
-; CHECK-NEXT:    [[TMP1]] = sub nsw <2 x i64> zeroinitializer, [[STEP_ADD]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nsw <2 x i64> zeroinitializer, [[STEP_ADD]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
 ; CHECK-NEXT:    store i64 [[TMP2]], ptr [[GEP]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
@@ -299,13 +297,12 @@ define void @for_iv_trunc_optimized(ptr %dst) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 1>, [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1, i32 2, i32 3, i32 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP0]], splat (i32 3)
-; CHECK-NEXT:    [[TMP3]] = or <4 x i32> [[TMP1]], splat (i32 3)
+; CHECK-NEXT:    [[TMP3:%.*]] = or <4 x i32> [[TMP1]], splat (i32 3)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr [[DST:%.*]], align 4
@@ -364,11 +361,9 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = select i1 [[TMP3]], i64 8, i64 [[N_MOD_VF]]
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP4]]
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x ptr> poison, ptr [[A:%.*]], i32 3
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x ptr> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 2
@@ -377,7 +372,7 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr nusw [3 x float], ptr [[A:%.*]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr nusw [3 x float], ptr [[A]], i64 [[TMP8]]
@@ -392,7 +387,7 @@ define void @test_for_tried_to_force_scalar(ptr noalias %A, ptr noalias %B, ptr
 ; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP21]], i32 0
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x ptr> [[TMP25]], ptr [[TMP22]], i32 1
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x ptr> [[TMP26]], ptr [[TMP23]], i32 2
-; CHECK-NEXT:    [[TMP28]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x ptr> [[TMP27]], ptr [[TMP24]], i32 3
 ; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <4 x ptr> [[TMP20]], <4 x ptr> [[TMP28]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x float>, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x float> [[WIDE_VEC]], <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 19424e44a9022..7aeb32afe43be 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -407,11 +407,10 @@ define i16 @iv_and_step_trunc() {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <2 x i16> [ <i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i16>
-; CHECK-NEXT:    [[TMP2]] = mul <2 x i16> [[VEC_IND1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i16> [[VEC_IND1]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll b/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
index bcfa212cf3644..c02ec91c4a0c6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr131359-dead-for-splice.ll
@@ -15,9 +15,8 @@ define void @no_use() {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, %[[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
@@ -60,9 +59,8 @@ define void @dead_use() {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, %[[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
index 0cd746590e0f1..368842634c374 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr72969.ll
@@ -56,7 +56,6 @@ define void @test(ptr %p) {
 ; VEC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; VEC:       vector.body:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VEC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 1>, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
 ; VEC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 1, i16 2, i16 3, i16 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VEC-NEXT:    [[TMP15:%.*]] = add i64 [[INDEX]], 0
 ; VEC-NEXT:    [[TMP16:%.*]] = add i64 [[INDEX]], 1
@@ -75,7 +74,7 @@ define void @test(ptr %p) {
 ; VEC-NEXT:    store i64 0, ptr [[TMP25]], align 8
 ; VEC-NEXT:    store i64 0, ptr [[TMP26]], align 8
 ; VEC-NEXT:    [[TMP27:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; VEC-NEXT:    [[TMP28]] = zext <4 x i16> [[TMP27]] to <4 x i64>
+; VEC-NEXT:    [[TMP28:%.*]] = zext <4 x i16> [[TMP27]] to <4 x i64>
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VEC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; VEC-NEXT:    [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
index 6b7736fa9f61d..53113b2bdf49b 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll
@@ -11,13 +11,12 @@ define float @for_load_interleave_only(ptr %src) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi float [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 16
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]]
 ; CHECK-NEXT:    [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT:    [[TMP3]] = load float, ptr [[NEXT_GEP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[NEXT_GEP2]], align 4
 ; CHECK-NEXT:    store float 0.000000e+00, ptr [[NEXT_GEP]], align 4
 ; CHECK-NEXT:    store float 0.000000e+00, ptr [[NEXT_GEP2]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
@@ -27,12 +26,12 @@ define float @for_load_interleave_only(ptr %src) {
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1001, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ]
 ; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[FOR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 16
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index d28db1c77efaa..13dc53559d283 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -884,23 +884,21 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[STEP_ADD:%.*]], [[FOR_COND1]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
 ; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 -4)
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
@@ -943,10 +941,9 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[FOR_COND1]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
-; UNROLL-NO-VF-NEXT:    [[TMP3]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -1071,32 +1068,26 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD:%.*]], i32 3
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 8
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 12
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 14
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP17]], i32 2
-; UNROLL-NO-IC-NEXT:    [[TMP24]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP18]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1105,7 +1096,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD:%.*]], [[ENTRY]] ]
 ; UNROLL-NO-IC-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-IC:       scalar.body:
 ; UNROLL-NO-IC-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1133,7 +1124,6 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ [[PRE_LOAD:%.*]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[OFFSET_IDX]], 2
@@ -1141,7 +1131,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP5]]
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
-; UNROLL-NO-VF-NEXT:    [[TMP10]] = load i32, ptr [[TMP8]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 4
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1150,7 +1140,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
 ; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD:%.*]], [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; UNROLL-NO-VF:       scalar.body:
 ; UNROLL-NO-VF-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1175,32 +1165,26 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; SINK-AFTER-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[PRE_LOAD:%.*]], i32 3
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; SINK-AFTER-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 6
-; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 2
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 4
+; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 6
 ; SINK-AFTER-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; SINK-AFTER-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
-; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; SINK-AFTER-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2
+; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; SINK-AFTER-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
+; SINK-AFTER-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
-; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
-; SINK-AFTER-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
-; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP16]], i32 1
-; SINK-AFTER-NEXT:    [[TMP22:%.*]] = insertelement <4 x i32> [[TMP21]], i32 [[TMP17]], i32 2
-; SINK-AFTER-NEXT:    [[TMP24]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP18]], i32 3
+; SINK-AFTER-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4
+; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4
+; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1209,7 +1193,7 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD]], [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[PRE_LOAD:%.*]], [[ENTRY]] ]
 ; SINK-AFTER-NEXT:    br label [[SCALAR_BODY:%.*]]
 ; SINK-AFTER:       scalar.body:
 ; SINK-AFTER-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ]
@@ -1248,7 +1232,6 @@ define i64 @constant_folded_previous_value() {
 ; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ splat (i64 1), [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1276,7 +1259,6 @@ define i64 @constant_folded_previous_value() {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 1, [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1304,7 +1286,6 @@ define i64 @constant_folded_previous_value() {
 ; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ splat (i64 1), [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; SINK-AFTER-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1357,9 +1338,8 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
-; UNROLL-NO-IC-NEXT:    [[TMP0]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4)
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
@@ -1391,29 +1371,28 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i32 [[VECTOR_RECUR]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT:    [[TMP4]] = add i32 [[TMP0]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], [[X]]
 ; UNROLL-NO-VF-NEXT:    [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 96
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 96
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT1:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
-; UNROLL-NO-VF-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADDX:%.*]], [[FOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[INC]] = add i32 [[INC_PHI]], 1
-; UNROLL-NO-VF-NEXT:    [[BC:%.*]] = zext i32 [[INC_PHI]] to i64
-; UNROLL-NO-VF-NEXT:    [[ADDX]] = add i32 [[INC_PHI]], [[X]]
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
+; UNROLL-NO-VF-NEXT:    [[VAL_PHI1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT1]], [[SCALAR_PH]] ], [ [[ADDX1:%.*]], [[FOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[ADDX]] = add i32 [[VAL_PHI]], 1
+; UNROLL-NO-VF-NEXT:    [[BC:%.*]] = zext i32 [[VAL_PHI]] to i64
+; UNROLL-NO-VF-NEXT:    [[ADDX1]] = add i32 [[VAL_PHI]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[VAL_PHI]], 95
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI1]], [[FOR_BODY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[VAL_PHI_LCSSA]]
 ;
 ; SINK-AFTER-LABEL: @extract_second_last_iteration(
@@ -1426,8 +1405,7 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[TMP0]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; SINK-AFTER-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
@@ -2507,13 +2485,12 @@ define void @sink_dead_inst(ptr %a) {
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 -27>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP1]], splat (i16 5)
 ; UNROLL-NO-IC-NEXT:    [[TMP5]] = add <4 x i16> [[TMP2]], splat (i16 5)
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -2561,13 +2538,12 @@ define void @sink_dead_inst(ptr %a) {
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ -27, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP4]] = zext i16 [[TMP3]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[TMP2]], 5
 ; UNROLL-NO-VF-NEXT:    [[TMP6]] = add i16 [[TMP3]], 5
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sub i16 [[VECTOR_RECUR]], 10
@@ -2610,11 +2586,10 @@ define void @sink_dead_inst(ptr %a) {
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR1:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 -27>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
 ; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP2]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
 ; SINK-AFTER-NEXT:    [[TMP3]] = add <4 x i16> [[TMP1]], splat (i16 5)
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[TMP4]], splat (i16 10)
@@ -3500,12 +3475,11 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
 ; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
 ; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 4
@@ -3548,15 +3522,14 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = or i16 [[TMP5]], [[TMP5]]
 ; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]]
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP10]] = zext i16 [[TMP5]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP3]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP4]] to i32
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
 ; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
 ; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP8]], align 4
@@ -3596,11 +3569,10 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
 ; SINK-AFTER-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; SINK-AFTER-NEXT:    [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
 ; SINK-AFTER-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
 ; SINK-AFTER-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
 ; SINK-AFTER-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
@@ -3668,10 +3640,9 @@ define void @unused_recurrence(ptr %a) {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP1]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
 ; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
@@ -3701,12 +3672,11 @@ define void @unused_recurrence(ptr %a) {
 ; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 1
 ; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2]] = add i16 [[TMP1]], 5
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 5
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
 ; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
@@ -3735,9 +3705,8 @@ define void @unused_recurrence(ptr %a) {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP1]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
 ; SINK-AFTER-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
diff --git a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
index f54d3bad95127..db25e7bede5c4 100644
--- a/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleave-and-scalarize-only.ll
@@ -243,10 +243,9 @@ define void @first_order_recurrence_using_induction(i32 %n, ptr %dst) {
 ; CHECK-LABEL: @first_order_recurrence_using_induction(
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDUCTION1:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDEX]] to i32
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add i32 [[TMP3]], 0
-; CHECK-NEXT:    [[INDUCTION1]] = add i32 [[TMP3]], 1
+; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i32 [[TMP3]], 1
 ; CHECK-NEXT:    store i32 [[INDUCTION]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], %n.vec
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
index c7149b0845981..ebddca2294d9c 100644
--- a/llvm/test/Transforms/LoopVectorize/optsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -621,8 +621,7 @@ define i32 @pr45526_pgso() !prof !14 {
 ; NPGSO:       [[VECTOR_BODY]]:
 ; NPGSO-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; NPGSO-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 5>, %[[VECTOR_PH]] ], [ [[TMP0:%.*]], %[[VECTOR_BODY]] ]
-; NPGSO-NEXT:    [[TMP0]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
+; NPGSO-NEXT:    [[TMP0:%.*]] = add nuw nsw <4 x i32> [[VEC_IND]], splat (i32 1)
 ; NPGSO-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; NPGSO-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
 ; NPGSO-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 508
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index e1b264620261b..a70d8f72c8a33 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -580,14 +580,9 @@ define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP9]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 0, i32 [[TMP8]]
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ splat (i64 1), %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4UF1-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -613,14 +608,9 @@ define i64 @constant_folded_previous_value() {
 ; CHECK-VF4UF2-NEXT:    [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]]
 ; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP9]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 0, i32 [[TMP8]]
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i64> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ splat (i64 1), %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
@@ -676,16 +666,11 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 1, [[TMP5]]
 ; CHECK-VF4UF1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP8]], i64 0
 ; CHECK-VF4UF1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF1-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP19]]
 ; CHECK-VF4UF1-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF1:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF1-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF1-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF1-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; CHECK-VF4UF1-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
 ; CHECK-VF4UF1-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-VF4UF1-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -729,19 +714,14 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP5]], i64 0
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 4
-; CHECK-VF4UF2-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP19]]
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; CHECK-VF4UF2-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT:%.*]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> [[VEC_IND_NEXT]], [[BROADCAST_SPLAT]]
 ; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
-; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
+; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT1]] = add <vscale x 4 x i32> [[VEC_IND_NEXT]], [[BROADCAST_SPLAT2]]
 ; CHECK-VF4UF2-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-VF4UF2-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK-VF4UF2:       [[MIDDLE_BLOCK]]:

From f4a63523b88631e224496435bea0940ac05897bf Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 14:51:20 -0700
Subject: [PATCH 515/851] PowerPC: Migrate to newer relocation specifier
 representation

* Use MCAsmInfo::printSpecifierExpr instead of MCExpr::print.
* Replace PPCMCExpr with MCSpecifierExpr.
---
 .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 10 ++--
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp     | 24 ++++++++
 .../PowerPC/MCTargetDesc/PPCMCAsmInfo.h       |  8 +++
 .../Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp | 60 +++++++------------
 .../Target/PowerPC/MCTargetDesc/PPCMCExpr.h   | 31 +++-------
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     | 16 ++---
 llvm/lib/Target/PowerPC/PPCMCInstLower.cpp    |  4 +-
 7 files changed, 78 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 7e79d85d60173..bb4c2fd3e5cf8 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -749,9 +749,9 @@ struct PPCOperand : public MCParsedAsmOperand {
           getSpecifier(SRE) == PPC::S_TLS_PCREL)
         return CreateTLSReg(SRE, S, E, IsPPC64);
 
-    if (const PPCMCExpr *TE = dyn_cast<PPCMCExpr>(Val)) {
+    if (const auto *SE = dyn_cast<MCSpecifierExpr>(Val)) {
       int64_t Res;
-      if (TE->evaluateAsConstant(Res))
+      if (PPC::evaluateAsConstant(*SE, Res))
         return CreateContextImm(Res, S, E, IsPPC64);
     }
 
@@ -1375,7 +1375,7 @@ const MCExpr *PPCAsmParser::extractSpecifier(const MCExpr *E,
     break;
   case MCExpr::Specifier: {
     // Detect error but do not return a modified expression.
-    auto *TE = cast<PPCMCExpr>(E);
+    auto *TE = cast<MCSpecifierExpr>(E);
     Spec = TE->getSpecifier();
     (void)extractSpecifier(TE->getSubExpr(), Spec);
     Spec = PPC::S_None;
@@ -1439,7 +1439,7 @@ bool PPCAsmParser::parseExpression(const MCExpr *&EVal) {
   uint16_t Spec = PPC::S_None;
   const MCExpr *E = extractSpecifier(EVal, Spec);
   if (Spec != PPC::S_None)
-    EVal = PPCMCExpr::create(Spec, E, getParser().getContext());
+    EVal = MCSpecifierExpr::create(E, Spec, getParser().getContext());
 
   return false;
 }
@@ -1841,5 +1841,5 @@ const MCExpr *PPCAsmParser::applySpecifier(const MCExpr *E, uint32_t Spec,
     }
   }
 
-  return PPCMCExpr::create(PPCMCExpr::Specifier(Spec), E, Ctx);
+  return MCSpecifierExpr::create(E, Spec, Ctx);
 }
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index bb1f21d8f0327..971b592643dc6 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -13,6 +13,7 @@
 #include "PPCMCAsmInfo.h"
 #include "PPCMCExpr.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -136,6 +137,18 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
   initializeVariantKinds(variantKindDescs);
 }
 
+void PPCELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                         const MCSpecifierExpr &Expr) const {
+  printExpr(OS, *Expr.getSubExpr());
+  OS << '@' << getSpecifierName(Expr.getSpecifier());
+}
+
+bool PPCELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                                MCValue &Res,
+                                                const MCAssembler *Asm) const {
+  return PPC::evaluateAsRelocatableImpl(Expr, Res, Asm);
+}
+
 void PPCXCOFFMCAsmInfo::anchor() {}
 
 PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
@@ -159,3 +172,14 @@ PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
 
   initializeVariantKinds(variantKindDescs);
 }
+
+void PPCXCOFFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                           const MCSpecifierExpr &Expr) const {
+  printExpr(OS, *Expr.getSubExpr());
+  OS << '@' << getSpecifierName(Expr.getSpecifier());
+}
+
+bool PPCXCOFFMCAsmInfo::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return PPC::evaluateAsRelocatableImpl(Expr, Res, Asm);
+}
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 9fbb73c2e3182..172fe81c2bce2 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -24,6 +24,10 @@ class PPCELFMCAsmInfo : public MCAsmInfoELF {
 
 public:
   explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
@@ -31,6 +35,10 @@ class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
 
 public:
   explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 namespace PPC {
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 49ae6bb5fa451..8d9c0892ae16f 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -16,38 +16,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "ppcmcexpr"
 
-const PPCMCExpr *PPCMCExpr::create(Specifier S, const MCExpr *Expr,
-                                   MCContext &Ctx) {
-  return new (Ctx) PPCMCExpr(S, Expr);
-}
-
-const PPCMCExpr *PPCMCExpr::create(const MCExpr *Expr, Specifier S,
-                                   MCContext &Ctx) {
-  return new (Ctx) PPCMCExpr(S, Expr);
-}
-
-void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  getSubExpr()->print(OS, MAI);
-  OS << '@' << MAI->getSpecifierName(specifier);
-}
-
-bool
-PPCMCExpr::evaluateAsConstant(int64_t &Res) const {
-  MCValue Value;
-
-  if (!getSubExpr()->evaluateAsRelocatable(Value, nullptr))
-    return false;
-
-  if (!Value.isAbsolute())
-    return false;
-  auto Tmp = evaluateAsInt64(Value.getConstant());
-  if (!Tmp)
-    return false;
-  Res = *Tmp;
-  return true;
-}
-
-std::optional<int64_t> PPCMCExpr::evaluateAsInt64(int64_t Value) const {
+static std::optional<int64_t> evaluateAsInt64(uint16_t specifier,
+                                              int64_t Value) {
   switch (specifier) {
   case PPC::S_LO:
     return Value & 0xffff;
@@ -72,21 +42,35 @@ std::optional<int64_t> PPCMCExpr::evaluateAsInt64(int64_t Value) const {
   }
 }
 
-bool PPCMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                          const MCAssembler *Asm) const {
-  if (!Asm)
+bool PPC::evaluateAsConstant(const MCSpecifierExpr &Expr, int64_t &Res) {
+  MCValue Value;
+
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Value, nullptr))
+    return false;
+
+  if (!Value.isAbsolute())
+    return false;
+  auto Tmp = evaluateAsInt64(Expr.getSpecifier(), Value.getConstant());
+  if (!Tmp)
     return false;
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
+  Res = *Tmp;
+  return true;
+}
+
+bool PPC::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                    const MCAssembler *Asm) {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
     return false;
 
   // The signedness of the result is dependent on the instruction operand. E.g.
   // in addis 3,3,65535@l, 65535@l is signed. In the absence of information at
   // parse time (!Asm), disable the folding.
-  std::optional<int64_t> MaybeInt = evaluateAsInt64(Res.getConstant());
+  std::optional<int64_t> MaybeInt =
+      evaluateAsInt64(Expr.getSpecifier(), Res.getConstant());
   if (Res.isAbsolute() && MaybeInt) {
     Res = MCValue::get(*MaybeInt);
   } else {
-    Res.setSpecifier(specifier);
+    Res.setSpecifier(Expr.getSpecifier());
   }
 
   return true;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 814217ea060e0..d97a1204efbca 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -16,33 +16,20 @@
 
 namespace llvm {
 
-class PPCMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-
-private:
-  std::optional<int64_t> evaluateAsInt64(int64_t Value) const;
-
-  explicit PPCMCExpr(Specifier S, const MCExpr *Expr)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const PPCMCExpr *create(Specifier S, const MCExpr *Expr,
-                                 MCContext &Ctx);
-  static const PPCMCExpr *create(const MCExpr *Expr, Specifier S,
-                                 MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-
-  bool evaluateAsConstant(int64_t &Res) const;
-};
+namespace PPCMCExpr {
+using Specifier = uint16_t;
+}
 
 static inline PPCMCExpr::Specifier getSpecifier(const MCSymbolRefExpr *SRE) {
   return PPCMCExpr::Specifier(SRE->getKind());
 }
 
+namespace PPC {
+bool evaluateAsConstant(const MCSpecifierExpr &Expr, int64_t &Res);
+bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                               const MCAssembler *Asm);
+} // namespace PPC
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 8a1357c5fd555..d5d51e3ca6386 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1001,13 +1001,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
           MCSymbolRefExpr::create(BaseSymbol, OutContext), PB, OutContext);
 
       const MCExpr *DeltaHi =
-          PPCMCExpr::create(DeltaExpr, PPC::S_HA, OutContext);
+          MCSpecifierExpr::create(DeltaExpr, PPC::S_HA, OutContext);
       EmitToStreamer(
           *OutStreamer,
           MCInstBuilder(PPC::ADDIS).addReg(PICR).addReg(PICR).addExpr(DeltaHi));
 
       const MCExpr *DeltaLo =
-          PPCMCExpr::create(DeltaExpr, PPC::S_LO, OutContext);
+          MCSpecifierExpr::create(DeltaExpr, PPC::S_LO, OutContext);
       EmitToStreamer(
           *OutStreamer,
           MCInstBuilder(PPC::ADDI).addReg(PICR).addReg(PICR).addExpr(DeltaLo));
@@ -1401,10 +1401,10 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
   case PPC::PPC32GOT: {
     MCSymbol *GOTSymbol =
         OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_"));
-    const MCExpr *SymGotTlsL = PPCMCExpr::create(
-        PPC::S_LO, MCSymbolRefExpr::create(GOTSymbol, OutContext), OutContext);
-    const MCExpr *SymGotTlsHA = PPCMCExpr::create(
-        PPC::S_HA, MCSymbolRefExpr::create(GOTSymbol, OutContext), OutContext);
+    const MCExpr *SymGotTlsL =
+        MCSpecifierExpr::create(GOTSymbol, PPC::S_LO, OutContext);
+    const MCExpr *SymGotTlsHA =
+        MCSpecifierExpr::create(GOTSymbol, PPC::S_HA, OutContext);
     EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI)
                                  .addReg(MI->getOperand(0).getReg())
                                  .addExpr(SymGotTlsL));
@@ -2125,14 +2125,14 @@ void PPCLinuxAsmPrinter::emitFunctionBodyStart() {
                                 GlobalEntryLabelExp, OutContext);
 
       const MCExpr *TOCDeltaHi =
-          PPCMCExpr::create(TOCDeltaExpr, PPC::S_HA, OutContext);
+          MCSpecifierExpr::create(TOCDeltaExpr, PPC::S_HA, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
                                    .addReg(PPC::X2)
                                    .addReg(PPC::X12)
                                    .addExpr(TOCDeltaHi));
 
       const MCExpr *TOCDeltaLo =
-          PPCMCExpr::create(TOCDeltaExpr, PPC::S_LO, OutContext);
+          MCSpecifierExpr::create(TOCDeltaExpr, PPC::S_LO, OutContext);
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
                                    .addReg(PPC::X2)
                                    .addReg(PPC::X2)
diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index f6624ec989ee2..cbd53651bbbfc 100644
--- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -164,11 +164,11 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   switch (access) {
     case PPCII::MO_LO:
     case PPCII::MO_PIC_LO_FLAG:
-      Expr = PPCMCExpr::create(Expr, PPC::S_LO, Ctx);
+      Expr = MCSpecifierExpr::create(Expr, PPC::S_LO, Ctx);
       break;
     case PPCII::MO_HA:
     case PPCII::MO_PIC_HA_FLAG:
-      Expr = PPCMCExpr::create(Expr, PPC::S_HA, Ctx);
+      Expr = MCSpecifierExpr::create(Expr, PPC::S_HA, Ctx);
       break;
   }
 

From 34c85ed2bc1adfa375745db6de7f62d350a8f768 Mon Sep 17 00:00:00 2001
From: Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com>
Date: Mon, 16 Jun 2025 00:07:51 +0200
Subject: [PATCH 516/851] [clang-reorder-fields] Use expanded location for
 macros (#142147)

Fixes macros being replaced instead of their expansion.

Closes #52632
---
 .../ReorderFieldsAction.cpp                   |  4 ++++
 .../MacroExpansionField.cpp                   | 24 +++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 clang-tools-extra/test/clang-reorder-fields/MacroExpansionField.cpp

diff --git a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
index ea0207619fb2b..3b1cd18d80346 100644
--- a/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
+++ b/clang-tools-extra/clang-reorder-fields/ReorderFieldsAction.cpp
@@ -86,6 +86,10 @@ getNewFieldsOrder(const RecordDecl *Definition,
 static void
 addReplacement(SourceRange Old, SourceRange New, const ASTContext &Context,
                std::map<std::string, tooling::Replacements> &Replacements) {
+  if (Old.getBegin().isMacroID())
+    Old = Context.getSourceManager().getExpansionRange(Old).getAsRange();
+  if (New.getBegin().isMacroID())
+    New = Context.getSourceManager().getExpansionRange(New).getAsRange();
   StringRef NewText =
       Lexer::getSourceText(CharSourceRange::getTokenRange(New),
                            Context.getSourceManager(), Context.getLangOpts());
diff --git a/clang-tools-extra/test/clang-reorder-fields/MacroExpansionField.cpp b/clang-tools-extra/test/clang-reorder-fields/MacroExpansionField.cpp
new file mode 100644
index 0000000000000..a4c3cbc1e12f4
--- /dev/null
+++ b/clang-tools-extra/test/clang-reorder-fields/MacroExpansionField.cpp
@@ -0,0 +1,24 @@
+// RUN: clang-reorder-fields -record-name ::bar::Foo -fields-order z,y,x %s -- | FileCheck %s
+
+namespace bar {
+
+#define INT_DECL(NAME) int NAME // CHECK:      {{^#define INT_DECL\(NAME\) int NAME}}
+#define MACRO_DECL int x;       // CHECK-NEXT: {{^#define MACRO_DECL int x;}}
+
+struct Foo {
+  MACRO_DECL   // CHECK:      {{^ INT_DECL\(z\);}}
+  int y;       // CHECK-NEXT: {{^ int y;}}
+  INT_DECL(z); // CHECK-NEXT: {{^ MACRO_DECL}}
+};
+
+#define FOO 0 // CHECK:      {{^#define FOO 0}}
+#define BAR 1 // CHECK-NEXT: {{^#define BAR 1}}
+#define BAZ 2 // CHECK-NEXT: {{^#define BAZ 2}}
+
+struct Foo foo = {
+  FOO, // CHECK:      {{^ BAZ,}}
+  BAR, // CHECK-NEXT: {{^ BAR,}}
+  BAZ, // CHECK-NEXT: {{^ FOO,}}
+};
+
+} // end namespace bar

From e448c3e5fc2ab4244356e29c9c9135b6ccf5f6ff Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 15:36:45 -0700
Subject: [PATCH 517/851] LoongArch: Migrate to MCAsmInfo::printExpr

---
 .../LoongArch/AsmParser/LoongArchAsmParser.cpp |  3 ++-
 .../Target/LoongArch/LoongArchAsmPrinter.cpp   |  8 +++++---
 .../MCTargetDesc/LoongArchInstPrinter.cpp      |  2 +-
 .../MCTargetDesc/LoongArchMCAsmInfo.cpp        | 13 +++++++++++++
 .../MCTargetDesc/LoongArchMCAsmInfo.h          |  7 +++++++
 .../LoongArch/MCTargetDesc/LoongArchMCExpr.cpp | 18 ++++--------------
 .../LoongArch/MCTargetDesc/LoongArchMCExpr.h   |  5 -----
 7 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index f5c540728852a..7d58270089575 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -8,6 +8,7 @@
 
 #include "MCTargetDesc/LoongArchBaseInfo.h"
 #include "MCTargetDesc/LoongArchInstPrinter.h"
+#include "MCTargetDesc/LoongArchMCAsmInfo.h"
 #include "MCTargetDesc/LoongArchMCExpr.h"
 #include "MCTargetDesc/LoongArchMCTargetDesc.h"
 #include "MCTargetDesc/LoongArchMatInt.h"
@@ -755,7 +756,7 @@ LoongArchAsmParser::parseOperandWithModifier(OperandVector &Operands) {
   if (getLexer().getKind() != AsmToken::Identifier)
     return Error(getLoc(), "expected valid identifier for operand modifier");
   StringRef Identifier = getParser().getTok().getIdentifier();
-  LoongArchMCExpr::Specifier VK = LoongArchMCExpr::parseSpecifier(Identifier);
+  auto VK = LoongArch::parseSpecifier(Identifier);
   if (VK == LoongArchMCExpr::VK_None)
     return Error(getLoc(), "invalid relocation specifier");
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index 9181e539f75cb..64ac7c03c0419 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCSectionELF.h"
@@ -160,9 +161,10 @@ bool LoongArchAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   else if (OffsetMO.isImm())
     OS << ", " << OffsetMO.getImm();
   else if (OffsetMO.isGlobal() || OffsetMO.isBlockAddress() ||
-           OffsetMO.isMCSymbol())
-    OS << ", " << *MCO.getExpr();
-  else
+           OffsetMO.isMCSymbol()) {
+    OS << ", ";
+    MAI->printExpr(OS, *MCO.getExpr());
+  } else
     return true;
 
   return false;
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
index e59cac7726a67..f912af330e34e 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchInstPrinter.cpp
@@ -84,7 +84,7 @@ void LoongArchInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void LoongArchInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp
index 9b7fccd0078e4..dc55ceab2dd30 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.cpp
@@ -11,7 +11,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "LoongArchMCAsmInfo.h"
+#include "LoongArchMCExpr.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/TargetParser/Triple.h"
 
@@ -32,3 +34,14 @@ LoongArchMCAsmInfo::LoongArchMCAsmInfo(const Triple &TT) {
   DwarfRegNumForCFI = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
 }
+
+void LoongArchMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                            const MCSpecifierExpr &Expr) const {
+  auto S = Expr.getSpecifier();
+  bool HasSpecifier = S != 0 && S != ELF::R_LARCH_B26;
+  if (HasSpecifier)
+    OS << '%' << LoongArch::getSpecifierName(S) << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  if (HasSpecifier)
+    OS << ')';
+}
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h
index ed1abbf461534..58ffb723d62cd 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCAsmInfo.h
@@ -23,8 +23,15 @@ class LoongArchMCAsmInfo : public MCAsmInfoELF {
 
 public:
   explicit LoongArchMCAsmInfo(const Triple &TargetTriple);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
+namespace LoongArch {
+StringRef getSpecifierName(uint16_t S);
+uint16_t parseSpecifier(StringRef name);
+} // namespace LoongArch
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_LOONGARCH_MCTARGETDESC_LOONGARCHMCASMINFO_H
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index 7eec236475000..c763aaa7276f8 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "LoongArchMCExpr.h"
+#include "LoongArchMCAsmInfo.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -27,18 +28,7 @@ const LoongArchMCExpr *LoongArchMCExpr::create(const MCExpr *Expr, uint16_t S,
   return new (Ctx) LoongArchMCExpr(Expr, Specifier(S), Hint);
 }
 
-void LoongArchMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  Specifier S = getSpecifier();
-  bool HasVariant = S != VK_None && S != ELF::R_LARCH_B26;
-
-  if (HasVariant)
-    OS << '%' << getSpecifierName(specifier) << '(';
-  Expr->print(OS, MAI);
-  if (HasVariant)
-    OS << ')';
-}
-
-StringRef LoongArchMCExpr::getSpecifierName(uint16_t S) {
+StringRef LoongArch::getSpecifierName(uint16_t S) {
   switch (S) {
   default:
     llvm_unreachable("Invalid ELF symbol kind");
@@ -149,7 +139,7 @@ StringRef LoongArchMCExpr::getSpecifierName(uint16_t S) {
   }
 }
 
-LoongArchMCExpr::Specifier LoongArchMCExpr::parseSpecifier(StringRef name) {
+LoongArchMCExpr::Specifier LoongArch::parseSpecifier(StringRef name) {
   return StringSwitch<LoongArchMCExpr::Specifier>(name)
       .Case("plt", ELF::R_LARCH_B26)
       .Case("b16", ELF::R_LARCH_B16)
@@ -205,5 +195,5 @@ LoongArchMCExpr::Specifier LoongArchMCExpr::parseSpecifier(StringRef name) {
       .Case("ld_pcrel_20", ELF::R_LARCH_TLS_LD_PCREL20_S2)
       .Case("gd_pcrel_20", ELF::R_LARCH_TLS_GD_PCREL20_S2)
       .Case("desc_pcrel_20", ELF::R_LARCH_TLS_DESC_PCREL20_S2)
-      .Default(VK_None);
+      .Default(0);
 }
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
index 06370001fa412..36563d8a6b609 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h
@@ -36,11 +36,6 @@ class LoongArchMCExpr : public MCSpecifierExpr {
                                        MCContext &Ctx, bool Hint = false);
 
   bool getRelaxHint() const { return RelaxHint; }
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-
-  static StringRef getSpecifierName(uint16_t S);
-  static Specifier parseSpecifier(StringRef name);
 };
 
 } // end namespace llvm

From e3025c95090f74b26e36106d2aa394b213f713a1 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 15:51:10 -0700
Subject: [PATCH 518/851] RISCV: Rename RISCVMCExpr::VK_ to RISCV::S_

Prepare for removing RISCVMCExpr. Adopt the newer naming convention (S_)
used by AMDGPU/WebAssembly/VE/M68k/PowerPC.
---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 43 +++++++++----------
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |  8 ++--
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 20 ++++-----
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.h   | 27 ++++++------
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     | 10 ++---
 5 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 1f434beca5388..040900064b90d 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -544,9 +544,9 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isShiftedInt<N - 1, 1>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           VK == RISCVMCExpr::VK_None;
+           VK == RISCV::S_None;
   }
 
   // True if operand is a symbol with no modifiers, or a constant with no
@@ -559,9 +559,9 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<N>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           VK == RISCVMCExpr::VK_None;
+           VK == RISCV::S_None;
   }
 
   // Predicate methods for AsmOperands defined in RISCVInstrInfo.td
@@ -572,9 +572,9 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           VK == RISCVMCExpr::VK_None;
+           VK == RISCV::S_None;
   }
 
   bool isCallSymbol() const {
@@ -583,7 +583,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_CALL_PLT;
   }
@@ -594,7 +594,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_CALL_PLT;
   }
@@ -605,7 +605,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_TPREL_ADD;
   }
@@ -616,7 +616,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_TLSDESC_CALL;
   }
@@ -870,11 +870,10 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<12>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           (VK == RISCVMCExpr::VK_LO || VK == RISCVMCExpr::VK_PCREL_LO ||
-            VK == RISCVMCExpr::VK_TPREL_LO ||
-            VK == ELF::R_RISCV_TLSDESC_LOAD_LO12 ||
+           (VK == RISCV::S_LO || VK == RISCV::S_PCREL_LO ||
+            VK == RISCV::S_TPREL_LO || VK == ELF::R_RISCV_TLSDESC_LOAD_LO12 ||
             VK == ELF::R_RISCV_TLSDESC_ADD_LO12);
   }
 
@@ -903,9 +902,9 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<20>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
-           VK == RISCVMCExpr::VK_QC_ABS20;
+           VK == RISCV::S_QC_ABS20;
   }
 
   bool isUImm20LUI() const {
@@ -916,7 +915,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isUInt<20>(Imm);
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == ELF::R_RISCV_HI20 || VK == ELF::R_RISCV_TPREL_HI20);
   }
@@ -929,7 +928,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isUInt<20>(Imm);
 
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
+    RISCVMCExpr::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == ELF::R_RISCV_PCREL_HI20 || VK == ELF::R_RISCV_GOT_HI20 ||
             VK == ELF::R_RISCV_TLS_GOT_HI20 || VK == ELF::R_RISCV_TLS_GD_HI20 ||
@@ -2920,7 +2919,7 @@ bool RISCVAsmParser::parseInstruction(ParseInstructionInfo &Info,
 
 bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
                                        RISCVMCExpr::Specifier &Kind) {
-  Kind = RISCVMCExpr::VK_None;
+  Kind = RISCV::S_None;
 
   if (const RISCVMCExpr *RE = dyn_cast<RISCVMCExpr>(Expr)) {
     Kind = RE->getSpecifier();
@@ -2929,14 +2928,14 @@ bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
 
   MCValue Res;
   if (Expr->evaluateAsRelocatable(Res, nullptr))
-    return Res.getSpecifier() == RISCVMCExpr::VK_None;
+    return Res.getSpecifier() == RISCV::S_None;
   return false;
 }
 
 bool RISCVAsmParser::isSymbolDiff(const MCExpr *Expr) {
   MCValue Res;
   if (Expr->evaluateAsRelocatable(Res, nullptr)) {
-    return Res.getSpecifier() == RISCVMCExpr::VK_None && Res.getAddSym() &&
+    return Res.getSpecifier() == RISCV::S_None && Res.getAddSym() &&
            Res.getSubSym();
   }
   return false;
@@ -3451,7 +3450,7 @@ void RISCVAsmParser::emitAuipcInstPair(MCRegister DestReg, MCRegister TmpReg,
                  MCInstBuilder(RISCV::AUIPC).addReg(TmpReg).addExpr(SymbolHi));
 
   const MCExpr *RefToLinkTmpLabel = RISCVMCExpr::create(
-      MCSymbolRefExpr::create(TmpLabel, Ctx), RISCVMCExpr::VK_PCREL_LO, Ctx);
+      MCSymbolRefExpr::create(TmpLabel, Ctx), RISCV::S_PCREL_LO, Ctx);
 
   emitToStreamer(Out, MCInstBuilder(SecondOpcode)
                           .addReg(DestReg)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index dd5540038c437..20014611499c1 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -580,7 +580,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       // encounter it here is an error.
       llvm_unreachable(
           "ELF::R_RISCV_TPREL_ADD should not represent an instruction operand");
-    case RISCVMCExpr::VK_LO:
+    case RISCV::S_LO:
       if (MIFrm == RISCVII::InstFormatI)
         FixupKind = RISCV::fixup_riscv_lo12_i;
       else if (MIFrm == RISCVII::InstFormatS)
@@ -593,7 +593,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       FixupKind = RISCV::fixup_riscv_hi20;
       RelaxCandidate = true;
       break;
-    case RISCVMCExpr::VK_PCREL_LO:
+    case RISCV::S_PCREL_LO:
       if (MIFrm == RISCVII::InstFormatI)
         FixupKind = RISCV::fixup_riscv_pcrel_lo12_i;
       else if (MIFrm == RISCVII::InstFormatS)
@@ -606,7 +606,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       FixupKind = RISCV::fixup_riscv_pcrel_hi20;
       RelaxCandidate = true;
       break;
-    case RISCVMCExpr::VK_TPREL_LO:
+    case RISCV::S_TPREL_LO:
       if (MIFrm == RISCVII::InstFormatI)
         FixupKind = ELF::R_RISCV_TPREL_LO12_I;
       else if (MIFrm == RISCVII::InstFormatS)
@@ -622,7 +622,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       FixupKind = RISCV::fixup_riscv_call_plt;
       RelaxCandidate = true;
       break;
-    case RISCVMCExpr::VK_QC_ABS20:
+    case RISCV::S_QC_ABS20:
       FixupKind = RISCV::fixup_riscv_qc_abs20_u;
       RelaxCandidate = true;
       break;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index f5f40ad44ac19..ce0ac067cb278 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -33,7 +33,7 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, Specifier S,
 
 void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   Specifier S = getSpecifier();
-  bool HasVariant = S != VK_None && S != ELF::R_RISCV_CALL_PLT;
+  bool HasVariant = S != RISCV::S_None && S != ELF::R_RISCV_CALL_PLT;
 
   if (HasVariant)
     OS << '%' << getSpecifierName(S) << '(';
@@ -90,12 +90,12 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const {
 std::optional<RISCVMCExpr::Specifier>
 RISCVMCExpr::getSpecifierForName(StringRef name) {
   return StringSwitch<std::optional<RISCVMCExpr::Specifier>>(name)
-      .Case("lo", VK_LO)
+      .Case("lo", RISCV::S_LO)
       .Case("hi", ELF::R_RISCV_HI20)
-      .Case("pcrel_lo", VK_PCREL_LO)
+      .Case("pcrel_lo", RISCV::S_PCREL_LO)
       .Case("pcrel_hi", ELF::R_RISCV_PCREL_HI20)
       .Case("got_pcrel_hi", ELF::R_RISCV_GOT_HI20)
-      .Case("tprel_lo", VK_TPREL_LO)
+      .Case("tprel_lo", RISCV::S_TPREL_LO)
       .Case("tprel_hi", ELF::R_RISCV_TPREL_HI20)
       .Case("tprel_add", ELF::R_RISCV_TPREL_ADD)
       .Case("tls_ie_pcrel_hi", ELF::R_RISCV_TLS_GOT_HI20)
@@ -104,7 +104,7 @@ RISCVMCExpr::getSpecifierForName(StringRef name) {
       .Case("tlsdesc_load_lo", ELF::R_RISCV_TLSDESC_LOAD_LO12)
       .Case("tlsdesc_add_lo", ELF::R_RISCV_TLSDESC_ADD_LO12)
       .Case("tlsdesc_call", ELF::R_RISCV_TLSDESC_CALL)
-      .Case("qc.abs20", VK_QC_ABS20)
+      .Case("qc.abs20", RISCV::S_QC_ABS20)
       // Used in data directives
       .Case("pltpcrel", ELF::R_RISCV_PLT32)
       .Case("gotpcrel", ELF::R_RISCV_GOT32_PCREL)
@@ -113,19 +113,19 @@ RISCVMCExpr::getSpecifierForName(StringRef name) {
 
 StringRef RISCVMCExpr::getSpecifierName(Specifier S) {
   switch (S) {
-  case VK_None:
+  case RISCV::S_None:
     llvm_unreachable("not used as %specifier()");
-  case VK_LO:
+  case RISCV::S_LO:
     return "lo";
   case ELF::R_RISCV_HI20:
     return "hi";
-  case VK_PCREL_LO:
+  case RISCV::S_PCREL_LO:
     return "pcrel_lo";
   case ELF::R_RISCV_PCREL_HI20:
     return "pcrel_hi";
   case ELF::R_RISCV_GOT_HI20:
     return "got_pcrel_hi";
-  case VK_TPREL_LO:
+  case RISCV::S_TPREL_LO:
     return "tprel_lo";
   case ELF::R_RISCV_TPREL_HI20:
     return "tprel_hi";
@@ -151,7 +151,7 @@ StringRef RISCVMCExpr::getSpecifierName(Specifier S) {
     return "gotpcrel";
   case ELF::R_RISCV_PLT32:
     return "pltpcrel";
-  case VK_QC_ABS20:
+  case RISCV::S_QC_ABS20:
     return "qc.abs20";
   }
   llvm_unreachable("Invalid ELF symbol kind");
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index d3b4a94f2f281..7e3acdfcb87b2 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -24,18 +24,6 @@ class StringRef;
 class RISCVMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = uint16_t;
-  // Specifiers mapping to relocation types below FirstTargetFixupKind are
-  // encoded literally, with these exceptions:
-  enum {
-    VK_None,
-    // Specifiers mapping to distinct relocation types.
-    VK_LO = FirstTargetFixupKind,
-    VK_PCREL_LO,
-    VK_TPREL_LO,
-    // Vendor-specific relocation types might conflict across vendors.
-    // Refer to them using Specifier constants.
-    VK_QC_ABS20,
-  };
 
 private:
   explicit RISCVMCExpr(const MCExpr *Expr, Specifier S)
@@ -57,6 +45,21 @@ class RISCVMCExpr : public MCSpecifierExpr {
   static std::optional<Specifier> getSpecifierForName(StringRef name);
   static StringRef getSpecifierName(Specifier Kind);
 };
+
+namespace RISCV {
+// Specifiers mapping to relocation types below FirstTargetFixupKind are
+// encoded literally, with these exceptions:
+enum Specifier {
+  S_None,
+  // Specifiers mapping to distinct relocation types.
+  S_LO = FirstTargetFixupKind,
+  S_PCREL_LO,
+  S_TPREL_LO,
+  // Vendor-specific relocation types might conflict across vendors.
+  // Refer to them using Specifier constants.
+  S_QC_ABS20,
+};
+} // namespace RISCV
 } // end namespace llvm.
 
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 4fb71a3ed0006..4a75a559a9277 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -963,19 +963,19 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
   default:
     llvm_unreachable("Unknown target flag on GV operand");
   case RISCVII::MO_None:
-    Kind = RISCVMCExpr::VK_None;
+    Kind = RISCV::S_None;
     break;
   case RISCVII::MO_CALL:
     Kind = ELF::R_RISCV_CALL_PLT;
     break;
   case RISCVII::MO_LO:
-    Kind = RISCVMCExpr::VK_LO;
+    Kind = RISCV::S_LO;
     break;
   case RISCVII::MO_HI:
     Kind = ELF::R_RISCV_HI20;
     break;
   case RISCVII::MO_PCREL_LO:
-    Kind = RISCVMCExpr::VK_PCREL_LO;
+    Kind = RISCV::S_PCREL_LO;
     break;
   case RISCVII::MO_PCREL_HI:
     Kind = ELF::R_RISCV_PCREL_HI20;
@@ -984,7 +984,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
     Kind = ELF::R_RISCV_GOT_HI20;
     break;
   case RISCVII::MO_TPREL_LO:
-    Kind = RISCVMCExpr::VK_TPREL_LO;
+    Kind = RISCV::S_TPREL_LO;
     break;
   case RISCVII::MO_TPREL_HI:
     Kind = ELF::R_RISCV_TPREL_HI20;
@@ -1018,7 +1018,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
     ME = MCBinaryExpr::createAdd(
         ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
 
-  if (Kind != RISCVMCExpr::VK_None)
+  if (Kind != RISCV::S_None)
     ME = RISCVMCExpr::create(ME, Kind, Ctx);
   return MCOperand::createExpr(ME);
 }

From 4635b6076dc1933b7ebd9fcca9f22ec93e2f9c0c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 16:01:28 -0700
Subject: [PATCH 519/851] RISCV: Rename RISCVMCExpr::VK_ to RISCV::S_

---
 bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index 18be125d53aeb..ee6f067ff3a36 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -438,12 +438,12 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
       return RISCVMCExpr::create(Expr, ELF::R_RISCV_PCREL_HI20, Ctx);
     case ELF::R_RISCV_PCREL_LO12_I:
     case ELF::R_RISCV_PCREL_LO12_S:
-      return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_PCREL_LO, Ctx);
+      return RISCVMCExpr::create(Expr, RISCV::S_PCREL_LO, Ctx);
     case ELF::R_RISCV_HI20:
       return RISCVMCExpr::create(Expr, ELF::R_RISCV_HI20, Ctx);
     case ELF::R_RISCV_LO12_I:
     case ELF::R_RISCV_LO12_S:
-      return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_LO, Ctx);
+      return RISCVMCExpr::create(Expr, RISCV::S_LO, Ctx);
     case ELF::R_RISCV_CALL:
       return RISCVMCExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
     case ELF::R_RISCV_CALL_PLT:

From fedf6c68ddfb43730578837aad394afcd97fe65a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 15:43:41 -0700
Subject: [PATCH 520/851] RISCV: Move RISCVMCExpr functions to RISCVMCAsmInfo
 or RISCVMCAsmBackend

* Move getPCRelHiFixup closer to the only caller RISCVAsmBackend::evaluateTargetFixup.
* Declare getSpecifierForName in RISCVMCAsmInfo, in align with other
  targets that have migrated to the new relocation specifier representation.
---
 bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp  |  2 +-
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp |  6 +-
 .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp    | 55 +++++++++++++++-
 .../MCTargetDesc/RISCVELFObjectWriter.cpp     |  6 +-
 .../RISCV/MCTargetDesc/RISCVInstPrinter.cpp   |  3 +-
 .../RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp     | 12 +++-
 .../RISCV/MCTargetDesc/RISCVMCAsmInfo.h       | 23 +++++++
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |  2 +-
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 66 ++-----------------
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.h   | 27 --------
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     |  2 +-
 11 files changed, 102 insertions(+), 102 deletions(-)

diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index ee6f067ff3a36..cf30ad272d1c4 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "bolt/Core/MCPlusBuilder.h"
 #include "llvm/BinaryFormat/ELF.h"
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 040900064b90d..612ac428dd1bf 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -9,7 +9,7 @@
 #include "MCTargetDesc/RISCVAsmBackend.h"
 #include "MCTargetDesc/RISCVBaseInfo.h"
 #include "MCTargetDesc/RISCVInstPrinter.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "MCTargetDesc/RISCVTargetStreamer.h"
@@ -2087,7 +2087,7 @@ bool RISCVAsmParser::parseExprWithSpecifier(const MCExpr *&Res, SMLoc &E) {
   if (getLexer().getKind() != AsmToken::Identifier)
     return Error(getLoc(), "expected '%' relocation specifier");
   StringRef Identifier = getParser().getTok().getIdentifier();
-  auto Spec = RISCVMCExpr::getSpecifierForName(Identifier);
+  auto Spec = RISCV::parseSpecifierName(Identifier);
   if (!Spec)
     return Error(getLoc(), "invalid relocation specifier");
 
@@ -2099,7 +2099,7 @@ bool RISCVAsmParser::parseExprWithSpecifier(const MCExpr *&Res, SMLoc &E) {
   if (getParser().parseParenExpression(SubExpr, E))
     return true;
 
-  Res = RISCVMCExpr::create(SubExpr, *Spec, getContext());
+  Res = RISCVMCExpr::create(SubExpr, Spec, getContext());
   return false;
 }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 338e5a4772830..2f37c351baf9f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -8,7 +8,6 @@
 
 #include "RISCVAsmBackend.h"
 #include "RISCVFixupKinds.h"
-#include "RISCVMCExpr.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
@@ -591,6 +590,57 @@ bool RISCVAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA,
   return !Res.getSubSym();
 }
 
+// Get the corresponding PC-relative HI fixup that a S_PCREL_LO points to, and
+// optionally the fragment containing it.
+//
+// \returns nullptr if this isn't a S_PCREL_LO pointing to a known PC-relative
+// HI fixup.
+static const MCFixup *getPCRelHiFixup(const MCSpecifierExpr &Expr,
+                                      const MCFragment **DFOut) {
+  MCValue AUIPCLoc;
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(AUIPCLoc, nullptr))
+    return nullptr;
+
+  const MCSymbol *AUIPCSymbol = AUIPCLoc.getAddSym();
+  if (!AUIPCSymbol)
+    return nullptr;
+  const auto *DF = dyn_cast_or_null<MCDataFragment>(AUIPCSymbol->getFragment());
+
+  if (!DF)
+    return nullptr;
+
+  uint64_t Offset = AUIPCSymbol->getOffset();
+  if (DF->getContents().size() == Offset) {
+    DF = dyn_cast_or_null<MCDataFragment>(DF->getNext());
+    if (!DF)
+      return nullptr;
+    Offset = 0;
+  }
+
+  for (const MCFixup &F : DF->getFixups()) {
+    if (F.getOffset() != Offset)
+      continue;
+    auto Kind = F.getTargetKind();
+    if (!mc::isRelocation(F.getKind())) {
+      if (Kind == RISCV::fixup_riscv_pcrel_hi20) {
+        *DFOut = DF;
+        return &F;
+      }
+      break;
+    }
+    switch (Kind) {
+    case ELF::R_RISCV_GOT_HI20:
+    case ELF::R_RISCV_TLS_GOT_HI20:
+    case ELF::R_RISCV_TLS_GD_HI20:
+    case ELF::R_RISCV_TLSDESC_HI20:
+      *DFOut = DF;
+      return &F;
+    }
+  }
+
+  return nullptr;
+}
+
 bool RISCVAsmBackend::evaluateTargetFixup(const MCFixup &Fixup,
                                           const MCValue &Target,
                                           uint64_t &Value) {
@@ -602,7 +652,8 @@ bool RISCVAsmBackend::evaluateTargetFixup(const MCFixup &Fixup,
     llvm_unreachable("Unexpected fixup kind!");
   case RISCV::fixup_riscv_pcrel_lo12_i:
   case RISCV::fixup_riscv_pcrel_lo12_s: {
-    AUIPCFixup = cast<RISCVMCExpr>(Fixup.getValue())->getPCRelHiFixup(&AUIPCDF);
+    AUIPCFixup =
+        getPCRelHiFixup(cast<MCSpecifierExpr>(*Fixup.getValue()), &AUIPCDF);
     if (!AUIPCFixup) {
       getContext().reportError(Fixup.getLoc(),
                                "could not find corresponding %pcrel_hi");
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index a0bf378f3c767..1d81096d6b600 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/RISCVFixupKinds.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -49,7 +49,7 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                             const MCValue &Target,
                                             bool IsPCRel) const {
   unsigned Kind = Fixup.getTargetKind();
-  auto Spec = RISCVMCExpr::Specifier(Target.getSpecifier());
+  auto Spec = Target.getSpecifier();
   switch (Spec) {
   case ELF::R_RISCV_TPREL_HI20:
   case ELF::R_RISCV_TLS_GOT_HI20:
@@ -62,7 +62,7 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
   case ELF::R_RISCV_GOT32_PCREL:
     if (Kind == FK_Data_4)
       break;
-    reportError(Fixup.getLoc(), "%" + RISCVMCExpr::getSpecifierName(Spec) +
+    reportError(Fixup.getLoc(), "%" + RISCV::getSpecifierName(Spec) +
                                     " can only be used in a .word directive");
     return ELF::R_RISCV_NONE;
   default:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 1f4a77414db6b..8c9ab8effa71b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -12,7 +12,6 @@
 
 #include "RISCVInstPrinter.h"
 #include "RISCVBaseInfo.h"
-#include "RISCVMCExpr.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
@@ -102,7 +101,7 @@ void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void RISCVInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index e75bc521d47ca..88b1d21f86c51 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCVMCAsmInfo.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCExpr.h"
@@ -47,3 +46,14 @@ const MCExpr *RISCVMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
   assert(Encoding & dwarf::DW_EH_PE_sdata4 && "Unexpected encoding");
   return RISCVMCExpr::create(ME, ELF::R_RISCV_32_PCREL, Ctx);
 }
+
+void RISCVMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                        const MCSpecifierExpr &Expr) const {
+  auto S = Expr.getSpecifier();
+  bool HasSpecifier = S != 0 && S != ELF::R_RISCV_CALL_PLT;
+  if (HasSpecifier)
+    OS << '%' << RISCV::getSpecifierName(S) << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  if (HasSpecifier)
+    OS << ')';
+}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
index bceeb1256471d..05f04a6185600 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
 #define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
 
+#include "RISCVMCExpr.h"
 #include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCFixup.h"
 
 namespace llvm {
 class Triple;
@@ -26,8 +28,29 @@ class RISCVMCAsmInfo : public MCAsmInfoELF {
 
   const MCExpr *getExprForFDESymbol(const MCSymbol *Sym, unsigned Encoding,
                                     MCStreamer &Streamer) const override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
+namespace RISCV {
+using Specifier = uint16_t;
+// Specifiers mapping to relocation types below FirstTargetFixupKind are
+// encoded literally, with these exceptions:
+enum {
+  S_None,
+  // Specifiers mapping to distinct relocation types.
+  S_LO = FirstTargetFixupKind,
+  S_PCREL_LO,
+  S_TPREL_LO,
+  // Vendor-specific relocation types might conflict across vendors.
+  // Refer to them using Specifier constants.
+  S_QC_ABS20,
+};
+
+Specifier parseSpecifierName(StringRef name);
+StringRef getSpecifierName(Specifier Kind);
+} // namespace RISCV
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 20014611499c1..03c6701a17958 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -12,7 +12,7 @@
 
 #include "MCTargetDesc/RISCVBaseInfo.h"
 #include "MCTargetDesc/RISCVFixupKinds.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCAsmInfo.h"
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index ce0ac067cb278..1f6f940cac6f2 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -13,6 +13,7 @@
 
 #include "RISCVMCExpr.h"
 #include "MCTargetDesc/RISCVAsmBackend.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "RISCVFixupKinds.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
@@ -31,65 +32,8 @@ const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, Specifier S,
   return new (Ctx) RISCVMCExpr(Expr, S);
 }
 
-void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  Specifier S = getSpecifier();
-  bool HasVariant = S != RISCV::S_None && S != ELF::R_RISCV_CALL_PLT;
-
-  if (HasVariant)
-    OS << '%' << getSpecifierName(S) << '(';
-  Expr->print(OS, MAI);
-  if (HasVariant)
-    OS << ')';
-}
-
-const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const {
-  MCValue AUIPCLoc;
-  if (!getSubExpr()->evaluateAsRelocatable(AUIPCLoc, nullptr))
-    return nullptr;
-
-  const MCSymbol *AUIPCSymbol = AUIPCLoc.getAddSym();
-  if (!AUIPCSymbol)
-    return nullptr;
-  const auto *DF = dyn_cast_or_null<MCDataFragment>(AUIPCSymbol->getFragment());
-
-  if (!DF)
-    return nullptr;
-
-  uint64_t Offset = AUIPCSymbol->getOffset();
-  if (DF->getContents().size() == Offset) {
-    DF = dyn_cast_or_null<MCDataFragment>(DF->getNext());
-    if (!DF)
-      return nullptr;
-    Offset = 0;
-  }
-
-  for (const MCFixup &F : DF->getFixups()) {
-    if (F.getOffset() != Offset)
-      continue;
-    auto Kind = F.getTargetKind();
-    if (!mc::isRelocation(F.getKind())) {
-      if (Kind == RISCV::fixup_riscv_pcrel_hi20) {
-        *DFOut = DF;
-        return &F;
-      }
-      break;
-    }
-    switch (Kind) {
-    case ELF::R_RISCV_GOT_HI20:
-    case ELF::R_RISCV_TLS_GOT_HI20:
-    case ELF::R_RISCV_TLS_GD_HI20:
-    case ELF::R_RISCV_TLSDESC_HI20:
-      *DFOut = DF;
-      return &F;
-    }
-  }
-
-  return nullptr;
-}
-
-std::optional<RISCVMCExpr::Specifier>
-RISCVMCExpr::getSpecifierForName(StringRef name) {
-  return StringSwitch<std::optional<RISCVMCExpr::Specifier>>(name)
+RISCV::Specifier RISCV::parseSpecifierName(StringRef name) {
+  return StringSwitch<RISCV::Specifier>(name)
       .Case("lo", RISCV::S_LO)
       .Case("hi", ELF::R_RISCV_HI20)
       .Case("pcrel_lo", RISCV::S_PCREL_LO)
@@ -108,10 +52,10 @@ RISCVMCExpr::getSpecifierForName(StringRef name) {
       // Used in data directives
       .Case("pltpcrel", ELF::R_RISCV_PLT32)
       .Case("gotpcrel", ELF::R_RISCV_GOT32_PCREL)
-      .Default(std::nullopt);
+      .Default(0);
 }
 
-StringRef RISCVMCExpr::getSpecifierName(Specifier S) {
+StringRef RISCV::getSpecifierName(Specifier S) {
   switch (S) {
   case RISCV::S_None:
     llvm_unreachable("not used as %specifier()");
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index 7e3acdfcb87b2..3e842abc8da7f 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -32,34 +32,7 @@ class RISCVMCExpr : public MCSpecifierExpr {
 public:
   static const RISCVMCExpr *create(const MCExpr *Expr, Specifier S,
                                    MCContext &Ctx);
-
-  /// Get the corresponding PC-relative HI fixup that a VK_PCREL_LO
-  /// points to, and optionally the fragment containing it.
-  ///
-  /// \returns nullptr if this isn't a VK_PCREL_LO pointing to a
-  /// known PC-relative HI fixup.
-  const MCFixup *getPCRelHiFixup(const MCFragment **DFOut) const;
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-
-  static std::optional<Specifier> getSpecifierForName(StringRef name);
-  static StringRef getSpecifierName(Specifier Kind);
-};
-
-namespace RISCV {
-// Specifiers mapping to relocation types below FirstTargetFixupKind are
-// encoded literally, with these exceptions:
-enum Specifier {
-  S_None,
-  // Specifiers mapping to distinct relocation types.
-  S_LO = FirstTargetFixupKind,
-  S_PCREL_LO,
-  S_TPREL_LO,
-  // Vendor-specific relocation types might conflict across vendors.
-  // Refer to them using Specifier constants.
-  S_QC_ABS20,
 };
-} // namespace RISCV
 } // end namespace llvm.
 
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 4a75a559a9277..33dbed5f7861f 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -13,7 +13,7 @@
 
 #include "MCTargetDesc/RISCVBaseInfo.h"
 #include "MCTargetDesc/RISCVInstPrinter.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "MCTargetDesc/RISCVTargetStreamer.h"
 #include "RISCV.h"

From 51b63bbee56c2253643f41c53bc3592af261b82d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 16:27:20 -0700
Subject: [PATCH 521/851] RISCV: Replace MCExpr::print with
 MCAsmInfo::printExpr

Follow-up to 18b67a7a102c0052e5ae0e76ef1297902ffeb22d
---
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 33dbed5f7861f..2c636c4efadc7 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -432,7 +432,7 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   if (Offset.isImm())
     OS << MCO.getImm();
   else if (Offset.isGlobal() || Offset.isBlockAddress() || Offset.isMCSymbol())
-    OS << *MCO.getExpr();
+    MAI->printExpr(OS, *MCO.getExpr());
 
   if (Offset.isMCSymbol())
     MMI->getContext().registerInlineAsmLabel(Offset.getMCSymbol());

From f11dd116e0aa8cf35bdb82dba0a3a926538c05c2 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 16:51:08 -0700
Subject: [PATCH 522/851] RISCV: Replace RISCVMCExpr with MCSpecifierExpr

---
 bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp  | 24 +++++-----
 .../CodeGen/TargetLoweringObjectFileImpl.h    |  4 --
 .../CodeGen/TargetLoweringObjectFileImpl.cpp  |  2 +-
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 46 +++++++++----------
 .../RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp     |  2 +-
 .../RISCV/MCTargetDesc/RISCVMCAsmInfo.h       |  1 -
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |  6 +--
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp |  6 ---
 .../Target/RISCV/MCTargetDesc/RISCVMCExpr.h   | 38 ---------------
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp     | 10 ++--
 .../Target/RISCV/RISCVTargetObjectFile.cpp    | 10 +---
 llvm/lib/Target/RISCV/RISCVTargetObjectFile.h |  3 --
 12 files changed, 45 insertions(+), 107 deletions(-)
 delete mode 100644 llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h

diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index cf30ad272d1c4..c7d664ab09d46 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -33,8 +33,8 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
 
   bool equals(const MCSpecifierExpr &A, const MCSpecifierExpr &B,
               CompFuncTy Comp) const override {
-    const auto &RISCVExprA = cast<RISCVMCExpr>(A);
-    const auto &RISCVExprB = cast<RISCVMCExpr>(B);
+    const auto &RISCVExprA = cast<MCSpecifierExpr>(A);
+    const auto &RISCVExprB = cast<MCSpecifierExpr>(B);
     if (RISCVExprA.getSpecifier() != RISCVExprB.getSpecifier())
       return false;
 
@@ -245,7 +245,7 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
                   MCContext *Ctx) {
     Inst.setOpcode(Opcode);
     Inst.clear();
-    Inst.addOperand(MCOperand::createExpr(RISCVMCExpr::create(
+    Inst.addOperand(MCOperand::createExpr(MCSpecifierExpr::create(
         MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
         ELF::R_RISCV_CALL_PLT, *Ctx)));
   }
@@ -342,7 +342,7 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
   }
 
   const MCSymbol *getTargetSymbol(const MCExpr *Expr) const override {
-    auto *RISCVExpr = dyn_cast<RISCVMCExpr>(Expr);
+    auto *RISCVExpr = dyn_cast<MCSpecifierExpr>(Expr);
     if (RISCVExpr && RISCVExpr->getSubExpr())
       return getTargetSymbol(RISCVExpr->getSubExpr());
 
@@ -435,19 +435,19 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
     case ELF::R_RISCV_TLS_GD_HI20:
       // The GOT is reused so no need to create GOT relocations
     case ELF::R_RISCV_PCREL_HI20:
-      return RISCVMCExpr::create(Expr, ELF::R_RISCV_PCREL_HI20, Ctx);
+      return MCSpecifierExpr::create(Expr, ELF::R_RISCV_PCREL_HI20, Ctx);
     case ELF::R_RISCV_PCREL_LO12_I:
     case ELF::R_RISCV_PCREL_LO12_S:
-      return RISCVMCExpr::create(Expr, RISCV::S_PCREL_LO, Ctx);
+      return MCSpecifierExpr::create(Expr, RISCV::S_PCREL_LO, Ctx);
     case ELF::R_RISCV_HI20:
-      return RISCVMCExpr::create(Expr, ELF::R_RISCV_HI20, Ctx);
+      return MCSpecifierExpr::create(Expr, ELF::R_RISCV_HI20, Ctx);
     case ELF::R_RISCV_LO12_I:
     case ELF::R_RISCV_LO12_S:
-      return RISCVMCExpr::create(Expr, RISCV::S_LO, Ctx);
+      return MCSpecifierExpr::create(Expr, RISCV::S_LO, Ctx);
     case ELF::R_RISCV_CALL:
-      return RISCVMCExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
+      return MCSpecifierExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
     case ELF::R_RISCV_CALL_PLT:
-      return RISCVMCExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
+      return MCSpecifierExpr::create(Expr, ELF::R_RISCV_CALL_PLT, Ctx);
     }
   }
 
@@ -466,10 +466,10 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
       return false;
 
     const auto *ImmExpr = ImmOp.getExpr();
-    if (!isa<RISCVMCExpr>(ImmExpr))
+    if (!isa<MCSpecifierExpr>(ImmExpr))
       return false;
 
-    switch (cast<RISCVMCExpr>(ImmExpr)->getSpecifier()) {
+    switch (cast<MCSpecifierExpr>(ImmExpr)->getSpecifier()) {
     default:
       return false;
     case ELF::R_RISCV_CALL_PLT:
diff --git a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index fa6cb75338d4f..00e681e6bf53e 100644
--- a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -117,10 +117,6 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile {
   MCSection *getStaticDtorSection(unsigned Priority,
                                   const MCSymbol *KeySym) const override;
 
-  virtual const MCExpr *createTargetMCExpr(const MCExpr *Expr,
-                                           uint8_t Specifier) const {
-    return nullptr;
-  }
   const MCExpr *
   lowerSymbolDifference(const MCSymbol *LHS, const MCSymbol *RHS,
                         int64_t Addend,
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index b8c632d11f2e0..c804a179d8865 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1219,7 +1219,7 @@ const MCExpr *TargetLoweringObjectFileELF::lowerSymbolDifference(
     if (Addend)
       Res = MCBinaryExpr::createAdd(Res, MCConstantExpr::create(Addend, Ctx),
                                     Ctx);
-    return createTargetMCExpr(Res, PLTPCRelativeSpecifier);
+    return MCSpecifierExpr::create(Res, PLTPCRelativeSpecifier, getContext());
   }
 
   if (!PLTRelativeSpecifier)
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 612ac428dd1bf..510ca5f8c0d92 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -137,7 +137,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
   // Helper to emit a combination of AUIPC and SecondOpcode. Used to implement
   // helpers such as emitLoadLocalAddress and emitLoadAddress.
   void emitAuipcInstPair(MCRegister DestReg, MCRegister TmpReg,
-                         const MCExpr *Symbol, RISCVMCExpr::Specifier VKHi,
+                         const MCExpr *Symbol, RISCV::Specifier VKHi,
                          unsigned SecondOpcode, SMLoc IDLoc, MCStreamer &Out);
 
   // Helper to emit pseudo instruction "lla" used in PC-rel addressing.
@@ -295,8 +295,7 @@ class RISCVAsmParser : public MCTargetAsmParser {
 #undef GET_OPERAND_DIAGNOSTIC_TYPES
   };
 
-  static bool classifySymbolRef(const MCExpr *Expr,
-                                RISCVMCExpr::Specifier &Kind);
+  static bool classifySymbolRef(const MCExpr *Expr, RISCV::Specifier &Kind);
   static bool isSymbolDiff(const MCExpr *Expr);
 
   RISCVAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
@@ -544,7 +543,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isShiftedInt<N - 1, 1>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == RISCV::S_None;
   }
@@ -559,7 +558,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<N>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == RISCV::S_None;
   }
@@ -572,7 +571,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == RISCV::S_None;
   }
@@ -583,7 +582,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_CALL_PLT;
   }
@@ -594,7 +593,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_CALL_PLT;
   }
@@ -605,7 +604,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_TPREL_ADD;
   }
@@ -616,7 +615,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (!isImm() || evaluateConstantImm(getImm(), Imm))
       return false;
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == ELF::R_RISCV_TLSDESC_CALL;
   }
@@ -870,7 +869,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<12>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == RISCV::S_LO || VK == RISCV::S_PCREL_LO ||
             VK == RISCV::S_TPREL_LO || VK == ELF::R_RISCV_TLSDESC_LOAD_LO12 ||
@@ -902,7 +901,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isInt<20>(fixImmediateForRV32(Imm, isRV64Imm()));
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            VK == RISCV::S_QC_ABS20;
   }
@@ -915,7 +914,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isUInt<20>(Imm);
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == ELF::R_RISCV_HI20 || VK == ELF::R_RISCV_TPREL_HI20);
   }
@@ -928,7 +927,7 @@ struct RISCVOperand final : public MCParsedAsmOperand {
     if (evaluateConstantImm(getImm(), Imm))
       return isUInt<20>(Imm);
 
-    RISCVMCExpr::Specifier VK = RISCV::S_None;
+    RISCV::Specifier VK = RISCV::S_None;
     return RISCVAsmParser::classifySymbolRef(getImm(), VK) &&
            (VK == ELF::R_RISCV_PCREL_HI20 || VK == ELF::R_RISCV_GOT_HI20 ||
             VK == ELF::R_RISCV_TLS_GOT_HI20 || VK == ELF::R_RISCV_TLS_GD_HI20 ||
@@ -2099,7 +2098,7 @@ bool RISCVAsmParser::parseExprWithSpecifier(const MCExpr *&Res, SMLoc &E) {
   if (getParser().parseParenExpression(SubExpr, E))
     return true;
 
-  Res = RISCVMCExpr::create(SubExpr, Spec, getContext());
+  Res = MCSpecifierExpr::create(SubExpr, Spec, getContext());
   return false;
 }
 
@@ -2183,11 +2182,11 @@ ParseStatus RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
   }
 
   SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size());
-  RISCVMCExpr::Specifier Kind = ELF::R_RISCV_CALL_PLT;
+  RISCV::Specifier Kind = ELF::R_RISCV_CALL_PLT;
 
   MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
   Res = MCSymbolRefExpr::create(Sym, getContext());
-  Res = RISCVMCExpr::create(Res, Kind, getContext());
+  Res = MCSpecifierExpr::create(Res, Kind, getContext());
   Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
   return ParseStatus::Success;
 }
@@ -2203,7 +2202,7 @@ ParseStatus RISCVAsmParser::parsePseudoJumpSymbol(OperandVector &Operands) {
   if (Res->getKind() != MCExpr::ExprKind::SymbolRef)
     return Error(S, "operand must be a valid jump target");
 
-  Res = RISCVMCExpr::create(Res, ELF::R_RISCV_CALL_PLT, getContext());
+  Res = MCSpecifierExpr::create(Res, ELF::R_RISCV_CALL_PLT, getContext());
   Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
   return ParseStatus::Success;
 }
@@ -2918,10 +2917,9 @@ bool RISCVAsmParser::parseInstruction(ParseInstructionInfo &Info,
 }
 
 bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr,
-                                       RISCVMCExpr::Specifier &Kind) {
+                                       RISCV::Specifier &Kind) {
   Kind = RISCV::S_None;
-
-  if (const RISCVMCExpr *RE = dyn_cast<RISCVMCExpr>(Expr)) {
+  if (const auto *RE = dyn_cast<MCSpecifierExpr>(Expr)) {
     Kind = RE->getSpecifier();
     Expr = RE->getSubExpr();
   }
@@ -3434,7 +3432,7 @@ void RISCVAsmParser::emitLoadImm(MCRegister DestReg, int64_t Value,
 
 void RISCVAsmParser::emitAuipcInstPair(MCRegister DestReg, MCRegister TmpReg,
                                        const MCExpr *Symbol,
-                                       RISCVMCExpr::Specifier VKHi,
+                                       RISCV::Specifier VKHi,
                                        unsigned SecondOpcode, SMLoc IDLoc,
                                        MCStreamer &Out) {
   // A pair of instructions for PC-relative addressing; expands to
@@ -3445,11 +3443,11 @@ void RISCVAsmParser::emitAuipcInstPair(MCRegister DestReg, MCRegister TmpReg,
   MCSymbol *TmpLabel = Ctx.createNamedTempSymbol("pcrel_hi");
   Out.emitLabel(TmpLabel);
 
-  const RISCVMCExpr *SymbolHi = RISCVMCExpr::create(Symbol, VKHi, Ctx);
+  const auto *SymbolHi = MCSpecifierExpr::create(Symbol, VKHi, Ctx);
   emitToStreamer(Out,
                  MCInstBuilder(RISCV::AUIPC).addReg(TmpReg).addExpr(SymbolHi));
 
-  const MCExpr *RefToLinkTmpLabel = RISCVMCExpr::create(
+  const MCExpr *RefToLinkTmpLabel = MCSpecifierExpr::create(
       MCSymbolRefExpr::create(TmpLabel, Ctx), RISCV::S_PCREL_LO, Ctx);
 
   emitToStreamer(Out, MCInstBuilder(SecondOpcode)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index 88b1d21f86c51..090d331d99cab 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -44,7 +44,7 @@ const MCExpr *RISCVMCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
   MCContext &Ctx = Streamer.getContext();
   const MCExpr *ME = MCSymbolRefExpr::create(Sym, Ctx);
   assert(Encoding & dwarf::DW_EH_PE_sdata4 && "Unexpected encoding");
-  return RISCVMCExpr::create(ME, ELF::R_RISCV_32_PCREL, Ctx);
+  return MCSpecifierExpr::create(ME, ELF::R_RISCV_32_PCREL, Ctx);
 }
 
 void RISCVMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
index 05f04a6185600..097e94b6117c7 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
 #define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCASMINFO_H
 
-#include "RISCVMCExpr.h"
 #include "llvm/MC/MCAsmInfoELF.h"
 #include "llvm/MC/MCFixup.h"
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 03c6701a17958..1185e3558b002 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -177,7 +177,7 @@ void RISCVMCCodeEmitter::expandTLSDESCCall(const MCInst &MI,
   MCOperand SrcSymbol = MI.getOperand(3);
   assert(SrcSymbol.isExpr() &&
          "Expected expression as first input to TLSDESCCALL");
-  const RISCVMCExpr *Expr = dyn_cast<RISCVMCExpr>(SrcSymbol.getExpr());
+  const auto *Expr = dyn_cast<MCSpecifierExpr>(SrcSymbol.getExpr());
   MCRegister Link = MI.getOperand(0).getReg();
   MCRegister Dest = MI.getOperand(1).getReg();
   int64_t Imm = MI.getOperand(2).getImm();
@@ -205,7 +205,7 @@ void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI,
   assert(SrcSymbol.isExpr() &&
          "Expected expression as third input to TP-relative add");
 
-  const RISCVMCExpr *Expr = dyn_cast<RISCVMCExpr>(SrcSymbol.getExpr());
+  const auto *Expr = dyn_cast<MCSpecifierExpr>(SrcSymbol.getExpr());
   assert(Expr && Expr->getSpecifier() == ELF::R_RISCV_TPREL_ADD &&
          "Expected tprel_add relocation on TP-relative symbol");
 
@@ -566,7 +566,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
   unsigned FixupKind = RISCV::fixup_riscv_invalid;
   bool RelaxCandidate = false;
   if (Kind == MCExpr::Specifier) {
-    const RISCVMCExpr *RVExpr = cast<RISCVMCExpr>(Expr);
+    const auto *RVExpr = cast<MCSpecifierExpr>(Expr);
     FixupKind = RVExpr->getSpecifier();
     switch (RVExpr->getSpecifier()) {
     default:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index 1f6f940cac6f2..baa508ad3a688 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "RISCVMCExpr.h"
 #include "MCTargetDesc/RISCVAsmBackend.h"
 #include "MCTargetDesc/RISCVMCAsmInfo.h"
 #include "RISCVFixupKinds.h"
@@ -27,11 +26,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "riscvmcexpr"
 
-const RISCVMCExpr *RISCVMCExpr::create(const MCExpr *Expr, Specifier S,
-                                       MCContext &Ctx) {
-  return new (Ctx) RISCVMCExpr(Expr, S);
-}
-
 RISCV::Specifier RISCV::parseSpecifierName(StringRef name) {
   return StringSwitch<RISCV::Specifier>(name)
       .Case("lo", RISCV::S_LO)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
deleted file mode 100644
index 3e842abc8da7f..0000000000000
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- RISCVMCExpr.h - RISC-V specific MC expression classes----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes RISC-V specific MCExprs, used for modifiers like
-// "%hi" or "%lo" etc.,
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCEXPR_H
-#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixup.h"
-
-namespace llvm {
-
-class StringRef;
-
-class RISCVMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-
-private:
-  explicit RISCVMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const RISCVMCExpr *create(const MCExpr *Expr, Specifier S,
-                                   MCContext &Ctx);
-};
-} // end namespace llvm.
-
-#endif
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 2c636c4efadc7..83e9b4b4d7c5c 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -630,7 +630,7 @@ void RISCVAsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
     Sym = OutContext.getOrCreateSymbol(SymName);
   }
   auto Res = MCSymbolRefExpr::create(Sym, OutContext);
-  auto Expr = RISCVMCExpr::create(Res, ELF::R_RISCV_CALL_PLT, OutContext);
+  auto Expr = MCSpecifierExpr::create(Res, ELF::R_RISCV_CALL_PLT, OutContext);
 
   EmitToStreamer(*OutStreamer, MCInstBuilder(RISCV::PseudoCALL).addExpr(Expr));
 }
@@ -741,8 +741,8 @@ void RISCVAsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
 
   const MCSymbolRefExpr *HwasanTagMismatchV2Ref =
       MCSymbolRefExpr::create(HwasanTagMismatchV2Sym, OutContext);
-  auto Expr = RISCVMCExpr::create(HwasanTagMismatchV2Ref, ELF::R_RISCV_CALL_PLT,
-                                  OutContext);
+  auto Expr = MCSpecifierExpr::create(HwasanTagMismatchV2Ref,
+                                      ELF::R_RISCV_CALL_PLT, OutContext);
 
   for (auto &P : HwasanMemaccessSymbols) {
     unsigned Reg = std::get<0>(P.first);
@@ -957,7 +957,7 @@ void RISCVAsmPrinter::emitNoteGnuProperty(const Module &M) {
 static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
                                     const AsmPrinter &AP) {
   MCContext &Ctx = AP.OutContext;
-  RISCVMCExpr::Specifier Kind;
+  RISCV::Specifier Kind;
 
   switch (MO.getTargetFlags()) {
   default:
@@ -1019,7 +1019,7 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
         ME, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
 
   if (Kind != RISCV::S_None)
-    ME = RISCVMCExpr::create(ME, Kind, Ctx);
+    ME = MCSpecifierExpr::create(ME, Kind, Ctx);
   return MCOperand::createExpr(ME);
 }
 
diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
index 3cb5c7e13dd78..bc90cf8f53aba 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCVTargetObjectFile.h"
-#include "MCTargetDesc/RISCVMCExpr.h"
 #include "MCTargetDesc/RISCVMCObjectFileInfo.h"
 #include "RISCVTargetMachine.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -53,7 +52,7 @@ const MCExpr *RISCVELFTargetObjectFile::getIndirectSymViaGOTPCRel(
   const MCExpr *Res = MCSymbolRefExpr::create(Sym, Ctx);
   Res = MCBinaryExpr::createAdd(
       Res, MCConstantExpr::create(Offset + MV.getConstant(), Ctx), Ctx);
-  return RISCVMCExpr::create(Res, ELF::R_RISCV_GOT32_PCREL, Ctx);
+  return MCSpecifierExpr::create(Res, ELF::R_RISCV_GOT32_PCREL, Ctx);
 }
 
 // A address must be loaded from a small section if its size is less than the
@@ -180,10 +179,3 @@ MCSection *RISCVELFTargetObjectFile::getSectionForConstant(
   return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C,
                                                             Alignment);
 }
-
-const MCExpr *
-RISCVELFTargetObjectFile::createTargetMCExpr(const MCExpr *Expr,
-                                             uint8_t Specifier) const {
-  return RISCVMCExpr::create(Expr, RISCVMCExpr::Specifier(Specifier),
-                             getContext());
-}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
index b6da3f4721f4b..ff7e3e4c752c3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
@@ -48,9 +48,6 @@ class RISCVELFTargetObjectFile : public TargetLoweringObjectFileELF {
 
   bool isInSmallSection(uint64_t Size) const;
 
-  const MCExpr *createTargetMCExpr(const MCExpr *Expr,
-                                   uint8_t Specifier) const override;
-
   const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV,
                                           const MCSymbol *Sym,
                                           const MCValue &MV, int64_t Offset,

From 489dcc9e5233b52152272e6e5377784a56a12f1d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 16:55:45 -0700
Subject: [PATCH 523/851] AArch64: Replace MCExpr::print with
 MCAsmInfo::printExpr

Follow-up to 18b67a7a102c0052e5ae0e76ef1297902ffeb22d
---
 .../AArch64/MCTargetDesc/AArch64InstPrinter.cpp  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 9d9e23e99ab3b..bbe83821eca8e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -278,7 +278,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
     {
       WithMarkup M = markup(O, Markup::Immediate);
       O << "#";
-      MI->getOperand(1).getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *MI->getOperand(1).getExpr());
     }
     return;
   }
@@ -291,7 +291,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
     {
       WithMarkup M = markup(O, Markup::Immediate);
       O << "#";
-      MI->getOperand(2).getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *MI->getOperand(2).getExpr());
     }
     return;
   }
@@ -1163,7 +1163,7 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     printImm(MI, OpNo, STI, O);
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -1240,7 +1240,7 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
     }
   } else {
     assert(MO.isExpr() && "Unexpected operand type!");
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
     printShifter(MI, OpNum + 1, STI, O);
   }
 }
@@ -1431,7 +1431,7 @@ void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
     markup(O, Markup::Immediate) << '#' << formatImm(MO.getImm() * Scale);
   } else {
     assert(MO.isExpr() && "Unexpected operand type!");
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
   }
 }
 
@@ -1446,7 +1446,7 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
   } else {
     assert(MO1.isExpr() && "Unexpected operand type!");
     O << ", ";
-    MO1.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO1.getExpr());
   }
   O << ']';
 }
@@ -1805,7 +1805,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address,
     markup(O, Markup::Target) << formatHex((uint64_t)TargetAddress);
   } else {
     // Otherwise, just print the expression.
-    MI->getOperand(OpNum).getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MI->getOperand(OpNum).getExpr());
   }
 }
 
@@ -1832,7 +1832,7 @@ void AArch64InstPrinter::printAdrAdrpLabel(const MCInst *MI, uint64_t Address,
   }
 
   // Otherwise, just print the expression.
-  MI->getOperand(OpNum).getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MI->getOperand(OpNum).getExpr());
 }
 
 void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,

From 9a87c94622863cf712c6ab432931dfdb704fae3e Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:00:53 -0700
Subject: [PATCH 524/851] MIPS: Replace MCExpr::print with MCAsmInfo::printExpr

---
 .../lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp |  3 ++-
 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp     |  2 +-
 .../Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp  | 12 ++++++------
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
index d743f00da273b..f67356c105a4d 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
@@ -13,6 +13,7 @@
 #include "MipsInstPrinter.h"
 #include "Mips.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -138,7 +139,7 @@ void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(Op.isExpr() && "unknown operand kind in printOperand");
-  Op.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *Op.getExpr());
 }
 
 void MipsInstPrinter::printJumpOperand(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index d5d64ae8a0cdb..704ee0375f7a6 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -47,7 +47,7 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   case MEK_DTPREL:
     // MEK_DTPREL is used for marking TLS DIEExpr only
     // and contains a regular sub-expression.
-    getSubExpr()->print(OS, MAI);
+    MAI->printExpr(OS, *getSubExpr());
     return;
   case MEK_CALL_HI16:
     OS << "%call_hi";
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 4919d4f108567..49aea9c691629 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -398,42 +398,42 @@ MipsTargetAsmStreamer::MipsTargetAsmStreamer(MCStreamer &S,
 void MipsTargetAsmStreamer::emitDTPRel32Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.dtprelword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitDTPRel64Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.dtpreldword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitTPRel32Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.tprelword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitTPRel64Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.tpreldword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitGPRel32Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.gpword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
 void MipsTargetAsmStreamer::emitGPRel64Value(const MCExpr *Value) {
   auto *MAI = getStreamer().getContext().getAsmInfo();
   OS << "\t.gpdword\t";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 

From 81d8c89da056a7751f6c7714fccb30c071dbc31a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:02:48 -0700
Subject: [PATCH 525/851] M68k: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

Follow-up to 18b67a7a102c0052e5ae0e76ef1297902ffeb22d
---
 llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
index 68ac15b57508c..778d31280adc8 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
@@ -27,6 +27,7 @@
 #include "M68kBaseInfo.h"
 
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -68,7 +69,7 @@ void M68kInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 
   assert(MO.isExpr() && "Unknown operand kind in printOperand");
-  MO.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *MO.getExpr());
 }
 
 void M68kInstPrinter::printImmediate(const MCInst *MI, unsigned opNum,
@@ -78,7 +79,7 @@ void M68kInstPrinter::printImmediate(const MCInst *MI, unsigned opNum,
     O << '#' << MO.getImm();
   else if (MO.isExpr()) {
     O << '#';
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
   } else
     llvm_unreachable("Unknown immediate kind");
 }
@@ -144,7 +145,7 @@ void M68kInstPrinter::printDisp(const MCInst *MI, unsigned opNum,
     return;
   }
   assert(Op.isExpr() && "Unknown operand kind in printOperand");
-  Op.getExpr()->print(O, &MAI);
+  MAI.printExpr(O, *Op.getExpr());
 }
 
 // NOTE forcing (W,L) size available since M68020 only
@@ -153,7 +154,7 @@ void M68kInstPrinter::printAbsMem(const MCInst *MI, unsigned opNum,
   const MCOperand &MO = MI->getOperand(opNum);
 
   if (MO.isExpr()) {
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
     return;
   }
 

From 95acd6199f3799da00e45b62fd1045ece7142cad Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:11:20 -0700
Subject: [PATCH 526/851] AMDGPU: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp   | 5 +++--
 llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp        | 4 ++--
 .../lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 5 +++--
 llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp     | 6 ++++--
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a56bca514aff3..a6ce42dca92be 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -12,6 +12,7 @@
 #include "SIDefines.h"
 #include "Utils/AMDGPUAsmUtils.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -51,7 +52,7 @@ void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                                            raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isExpr()) {
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
     return;
   }
 
@@ -787,7 +788,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
     }
   } else if (Op.isExpr()) {
     const MCExpr *Exp = Op.getExpr();
-    Exp->print(O, &MAI);
+    MAI.printExpr(O, *Exp);
   } else {
     O << "/*INV_OP*/";
   }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 7b04fb576f438..dc1445621c7ad 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -77,7 +77,7 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
     break;
   }
   for (const auto *It = Args.begin(); It != Args.end(); ++It) {
-    (*It)->print(OS, MAI);
+    MAI->printExpr(OS, **It);
     if ((It + 1) != Args.end())
       OS << ", ";
   }
@@ -709,5 +709,5 @@ void llvm::AMDGPU::printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS,
     return;
   }
 
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
 }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index a6c97a02cb959..6d69bb75f2935 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -18,6 +18,7 @@
 #include "Utils/AMDKernelCodeTUtils.h"
 #include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -282,7 +283,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
   OS << "\t.set ";                                                             \
   ARG->print(OS, getContext().getAsmInfo());                                   \
   OS << ", ";                                                                  \
-  ARG->getVariableValue()->print(OS, getContext().getAsmInfo());               \
+  getContext().getAsmInfo()->printExpr(OS, *ARG->getVariableValue());          \
   Streamer.addBlankLine();
 
   PRINT_RES_INFO(NumVGPR);
@@ -304,7 +305,7 @@ void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
   OS << "\t.set ";                                                             \
   ARG->print(OS, getContext().getAsmInfo());                                   \
   OS << ", ";                                                                  \
-  ARG->getVariableValue()->print(OS, getContext().getAsmInfo());               \
+  getContext().getAsmInfo()->printExpr(OS, *ARG->getVariableValue());          \
   Streamer.addBlankLine();
 
   PRINT_RES_INFO(MaxVGPR);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
index 71e06dbbd1514..46728e59a6446 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
@@ -10,6 +10,7 @@
 #include "R600InstPrinter.h"
 #include "AMDGPUInstPrinter.h"
 #include "R600MCTargetDesc.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -98,7 +99,8 @@ void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
     O << Imm << '(' << llvm::bit_cast<float>(static_cast<uint32_t>(Imm)) << ')';
   }
   if (Op.isExpr()) {
-    Op.getExpr()->print(O << '@', &MAI);
+    O << '@';
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -160,7 +162,7 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     }
   } else if (Op.isExpr()) {
     const MCExpr *Exp = Op.getExpr();
-    Exp->print(O, &MAI);
+    MAI.printExpr(O, *Exp);
   } else {
     O << "/*INV_OP*/";
   }

From 0894094efdfb1ff4f93f818cef9f2aec9c1ea1a8 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:14:23 -0700
Subject: [PATCH 527/851] X86: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp   | 7 ++++---
 .../lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp | 2 +-
 llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp | 9 +++++----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index cd89b88f46194..6614eea3901bd 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -14,6 +14,7 @@
 #include "X86ATTInstPrinter.h"
 #include "X86BaseInfo.h"
 #include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrAnalysis.h"
@@ -414,7 +415,7 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     WithMarkup M = markup(O, Markup::Immediate);
     O << '$';
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -445,7 +446,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
       O << formatImm(DispVal);
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
-    DispSpec.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *DispSpec.getExpr());
   }
 
   if (IndexReg.getReg() || BaseReg.getReg()) {
@@ -500,7 +501,7 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
     O << formatImm(DispSpec.getImm());
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement?");
-    DispSpec.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *DispSpec.getExpr());
   }
 }
 
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index 8470d26011cd8..7523d2aedcced 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -374,7 +374,7 @@ void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address,
       markup(O, Markup::Immediate) << formatHex((uint64_t)Address);
     } else {
       // Otherwise, just print the expression.
-      Op.getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *Op.getExpr());
     }
   }
 }
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index 0d92609b3a63d..b8e117be465eb 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -14,6 +14,7 @@
 #include "X86IntelInstPrinter.h"
 #include "X86BaseInfo.h"
 #include "X86InstComments.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrAnalysis.h"
@@ -372,7 +373,7 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     O << "offset ";
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -415,7 +416,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
   if (!DispSpec.isImm()) {
     if (NeedPlus) O << " + ";
     assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
-    DispSpec.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *DispSpec.getExpr());
   } else {
     int64_t DispVal = DispSpec.getImm();
     if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
@@ -470,7 +471,7 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
     markup(O, Markup::Immediate) << formatImm(DispSpec.getImm());
   } else {
     assert(DispSpec.isExpr() && "non-immediate displacement?");
-    DispSpec.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *DispSpec.getExpr());
   }
 
   O << ']';
@@ -479,7 +480,7 @@ void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
 void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op,
                                      raw_ostream &O) {
   if (MI->getOperand(Op).isExpr())
-    return MI->getOperand(Op).getExpr()->print(O, &MAI);
+    return MAI.printExpr(O, *MI->getOperand(Op).getExpr());
 
   markup(O, Markup::Immediate) << formatImm(MI->getOperand(Op).getImm() & 0xff);
 }

From dcb8cd8ecdd74eb2ceca2365e0fb4c9545e3cd97 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:19:14 -0700
Subject: [PATCH 528/851] ARM: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp |  2 +-
 llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp | 10 +++++-----
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp      |  3 ++-
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp      |  3 ++-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 49b89cad6d475..2b959768d2135 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -288,7 +288,7 @@ void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
   OS << "\t.thumb_set\t";
   Symbol->print(OS, MAI);
   OS << ", ";
-  Value->print(OS, MAI);
+  MAI->printExpr(OS, *Value);
   OS << '\n';
 }
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index ac90095a20be4..ad00b171aaf63 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -350,7 +350,7 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     switch (Expr->getKind()) {
     case MCExpr::Binary:
       O << '#';
-      Expr->print(O, &MAI);
+      MAI.printExpr(O, *Expr);
       break;
     case MCExpr::Constant: {
       // If a symbolic branch target was added as a constant expression then
@@ -360,7 +360,7 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       int64_t TargetAddress;
       if (!Constant->evaluateAsAbsolute(TargetAddress)) {
         O << '#';
-        Expr->print(O, &MAI);
+        MAI.printExpr(O, *Expr);
       } else {
         O << "0x";
         O.write_hex(static_cast<uint32_t>(TargetAddress));
@@ -370,7 +370,7 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     default:
       // FIXME: Should we always treat this as if it is a constant literal and
       // prefix it with '#'?
-      Expr->print(O, &MAI);
+      MAI.printExpr(O, *Expr);
       break;
     }
   }
@@ -395,7 +395,7 @@ void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
                                                raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
   if (MO1.isExpr()) {
-    MO1.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO1.getExpr());
     return;
   }
 
@@ -1081,7 +1081,7 @@ void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
   const MCOperand &MO = MI->getOperand(OpNum);
 
   if (MO.isExpr()) {
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
     return;
   }
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index b5e17e3c2da0b..1035a9e131c48 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 using namespace llvm;
@@ -44,7 +45,7 @@ void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   const MCExpr *Expr = getSubExpr();
   if (Expr->getKind() != MCExpr::SymbolRef)
     OS << '(';
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
   if (Expr->getKind() != MCExpr::SymbolRef)
     OS << ')';
 }
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 49784e806b887..5be799093d2c1 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -8,6 +8,7 @@
 
 #include "AVRMCExpr.h"
 
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -44,7 +45,7 @@ void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   OS << getName() << '(';
   if (isNegated())
     OS << '-' << '(';
-  getSubExpr()->print(OS, MAI);
+  MAI->printExpr(OS, *getSubExpr());
   if (isNegated())
     OS << ')';
   OS << ')';

From a7e5de472314a891604abee390beb8af5493b29a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:23:13 -0700
Subject: [PATCH 529/851] SystemZ: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 .../Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp | 6 +++---
 .../SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp       | 4 ++--
 llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp      | 3 ++-
 .../Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp   | 3 ++-
 llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp               | 2 +-
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
index 165feec7a7d43..ec8c810809301 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp
@@ -209,7 +209,7 @@ void SystemZHLASMAsmStreamer::emitHLASMValueImpl(const MCExpr *Value,
   switch (Value->getKind()) {
   case MCExpr::Constant: {
     OS << "XL" << Size << '\'';
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
     OS << '\'';
     return;
   }
@@ -258,12 +258,12 @@ void SystemZHLASMAsmStreamer::emitHLASMValueImpl(const MCExpr *Value,
     return;
   }
   case MCExpr::Target:
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
     return;
   default:
     if (Parens)
       OS << "A(";
-    Value->print(OS, MAI);
+    MAI->printExpr(OS, *Value);
     if (Parens)
       OS << ')';
     return;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
index 5ba55e27a6136..7fd1a1c2d801a 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
@@ -53,7 +53,7 @@ void SystemZInstPrinterCommon::printOperand(const MCOperand &MO,
   } else if (MO.isImm())
     markup(O, Markup::Immediate) << MO.getImm();
   else if (MO.isExpr())
-    MO.getExpr()->print(O, MAI);
+    MAI->printExpr(O, *MO.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }
@@ -171,7 +171,7 @@ void SystemZInstPrinterCommon::printPCRelOperand(const MCInst *MI,
     markup(O, Markup::Target) << formatHex((uint64_t)TargetAddress);
   } else {
     // Otherwise, just print the expression.
-    MO.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MO.getExpr());
   }
 }
 
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
index 0167eae60452e..6dcca60dcedda 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 using namespace llvm;
 
@@ -32,7 +33,7 @@ StringRef SystemZMCExpr::getVariantKindName() const {
 
 void SystemZMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   OS << getVariantKindName() << '(';
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
   OS << ')';
 }
 
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp
index a4506eddaa69b..7720678097440 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZTargetStreamer.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZTargetStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 
 using namespace llvm;
@@ -44,7 +45,7 @@ const MCExpr *SystemZTargetHLASMStreamer::createWordDiffExpr(
   OS << Temp->getName() << " EQU ";
   const MCBinaryExpr *TempExpr = MCBinaryExpr::createSub(
       MCSymbolRefExpr::create(Hi, Ctx), MCSymbolRefExpr::create(Lo, Ctx), Ctx);
-  TempExpr->print(OS, Ctx.getAsmInfo());
+  Ctx.getAsmInfo()->printExpr(OS, *TempExpr);
   OS << "\n";
   return MCBinaryExpr::createLShr(MCSymbolRefExpr::create(Temp, Ctx),
                                   MCConstantExpr::create(1, Ctx), Ctx);
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 57911ac1ec2f4..eb4b4c1647a13 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -1043,7 +1043,7 @@ static void printOperand(const MCOperand &MCOp, const MCAsmInfo *MAI,
   else if (MCOp.isImm())
     OS << MCOp.getImm();
   else if (MCOp.isExpr())
-    MCOp.getExpr()->print(OS, MAI);
+    MAI->printExpr(OS, *MCOp.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }

From dca2b261d77a9b758587b660e5b88b6a312d057c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:25:13 -0700
Subject: [PATCH 530/851] Lanai: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 .../Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp   | 12 ++++++------
 llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp   |  5 +++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
index 837d8fea1c896..add4096ef9365 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
@@ -152,7 +152,7 @@ void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     OS << formatHex(Op.getImm());
   else {
     assert(Op.isExpr() && "Expected an expression");
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
   }
 }
 
@@ -165,7 +165,7 @@ void LanaiInstPrinter::printMemImmOperand(const MCInst *MI, unsigned OpNo,
     // Symbolic operand will be lowered to immediate value by linker
     assert(Op.isExpr() && "Expected an expression");
     OS << '[';
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
     OS << ']';
   }
 }
@@ -178,7 +178,7 @@ void LanaiInstPrinter::printHi16ImmOperand(const MCInst *MI, unsigned OpNo,
   } else {
     // Symbolic operand will be lowered to immediate value by linker
     assert(Op.isExpr() && "Expected an expression");
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
   }
 }
 
@@ -190,7 +190,7 @@ void LanaiInstPrinter::printHi16AndImmOperand(const MCInst *MI, unsigned OpNo,
   } else {
     // Symbolic operand will be lowered to immediate value by linker
     assert(Op.isExpr() && "Expected an expression");
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
   }
 }
 
@@ -202,7 +202,7 @@ void LanaiInstPrinter::printLo16AndImmOperand(const MCInst *MI, unsigned OpNo,
   } else {
     // Symbolic operand will be lowered to immediate value by linker
     assert(Op.isExpr() && "Expected an expression");
-    Op.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *Op.getExpr());
   }
 }
 
@@ -227,7 +227,7 @@ static void printMemoryImmediateOffset(const MCAsmInfo &MAI,
     assert(isInt<SizeInBits>(OffsetOp.getImm()) && "Constant value truncated");
     OS << OffsetOp.getImm();
   } else
-    OffsetOp.getExpr()->print(OS, &MAI);
+    MAI.printExpr(OS, *OffsetOp.getExpr());
 }
 
 void LanaiInstPrinter::printMemRiOperand(const MCInst *MI, int OpNo,
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
index eec1b7f482f18..b75a09915660c 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "LanaiMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -21,7 +22,7 @@ const LanaiMCExpr *LanaiMCExpr::create(Spec S, const MCExpr *Expr,
 
 void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   if (specifier == VK_Lanai_None) {
-    Expr->print(OS, MAI);
+    MAI->printExpr(OS, *Expr);
     return;
   }
 
@@ -38,6 +39,6 @@ void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
 
   OS << '(';
   const MCExpr *Expr = getSubExpr();
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
   OS << ')';
 }

From 178fac3d61aa7fc4eb9e4a3d385ae02e660c0d3a Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:31:08 -0700
Subject: [PATCH 531/851] Hexagon: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp      | 3 ++-
 llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp | 4 ++--
 llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp      | 3 ++-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index ae0305d570dc2..f9b4bc0d14fd9 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/HexagonMCAsmInfo.h"
 #include "MCTargetDesc/HexagonMCChecker.h"
 #include "MCTargetDesc/HexagonMCELFStreamer.h"
 #include "MCTargetDesc/HexagonMCExpr.h"
@@ -457,7 +458,7 @@ struct HexagonOperand : public MCParsedAsmOperand {
 void HexagonOperand::print(raw_ostream &OS) const {
   switch (Kind) {
   case Immediate:
-    getImm()->print(OS, nullptr);
+    HexagonMCAsmInfo(Triple()).printExpr(OS, *getImm());
     break;
   case Register:
     OS << "<register R";
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 16b8cec541998..9030e43b7149f 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -72,7 +72,7 @@ void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
     if (MO.getExpr()->evaluateAsAbsolute(Value))
       O << formatImm(Value);
     else
-      MO.getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *MO.getExpr());
   } else {
     llvm_unreachable("Unknown operand");
   }
@@ -90,6 +90,6 @@ void HexagonInstPrinter::printBrtarget(MCInst const *MI, unsigned OpNo,
     if (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI))
       if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo)
         O << "##";
-    Expr.print(O, &MAI);
+    MAI.printExpr(O, Expr);
   }
 }
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
index 92a8be359d739..d96e9601bf9e4 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "HexagonMCExpr.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCValue.h"
@@ -58,7 +59,7 @@ HexagonMCExpr::HexagonMCExpr(MCExpr const *Expr)
       SignMismatch(false) {}
 
 void HexagonMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  Expr->print(OS, MAI);
+  MAI->printExpr(OS, *Expr);
 }
 
 void HexagonMCExpr::setSignMismatch(bool Val) {

From 22ad0359f9006f47a1707170896f359abbd6e10d Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:34:31 -0700
Subject: [PATCH 532/851] NVPTX: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 3 ++-
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp               | 2 +-
 llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp                   | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 732950deca9fa..cc79257fb9c86 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -15,6 +15,7 @@
 #include "NVPTXUtilities.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/NVVMIntrinsicUtils.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -90,7 +91,7 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     markup(O, Markup::Immediate) << formatImm(Op.getImm());
   } else {
     assert(Op.isExpr() && "Unknown operand kind in printOperand");
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index a1f528c4379eb..b4e2c46b94440 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1850,7 +1850,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV,
 }
 
 void NVPTXAsmPrinter::printMCExpr(const MCExpr &Expr, raw_ostream &OS) const {
-  Expr.print(OS, OutContext.getAsmInfo());
+  OutContext.getAsmInfo()->printExpr(OS, Expr);
 }
 
 /// PrintAsmOperand - Print out an operand for an inline asm expression.
diff --git a/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
index 95125eb41bc05..8cde0873d4d2b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -8,6 +8,7 @@
 
 #include "NVPTXMCExpr.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Support/Format.h"
@@ -64,6 +65,6 @@ NVPTXGenericMCSymbolRefExpr::create(const MCSymbolRefExpr *SymExpr,
 void NVPTXGenericMCSymbolRefExpr::printImpl(raw_ostream &OS,
                                             const MCAsmInfo *MAI) const {
   OS << "generic(";
-  SymExpr->print(OS, MAI);
+  MAI->printExpr(OS, *SymExpr);
   OS << ")";
 }

From c9d511bc642fbf612014eee4749ad7ee2646af32 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 17:41:17 -0700
Subject: [PATCH 533/851] Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 bolt/lib/Passes/RetpolineInsertion.cpp                      | 2 +-
 llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp   | 6 +++---
 .../WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp     | 3 ++-
 llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp   | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/bolt/lib/Passes/RetpolineInsertion.cpp b/bolt/lib/Passes/RetpolineInsertion.cpp
index 98e5a8fba6454..bda26206e16c3 100644
--- a/bolt/lib/Passes/RetpolineInsertion.cpp
+++ b/bolt/lib/Passes/RetpolineInsertion.cpp
@@ -195,7 +195,7 @@ std::string createRetpolineFunctionTag(BinaryContext &BC,
 
   TagOS << "+";
   if (MemRef.DispExpr)
-    MemRef.DispExpr->print(TagOS, BC.AsmInfo.get());
+    BC.AsmInfo->printExpr(TagOS, *MemRef.DispExpr);
   else
     TagOS << MemRef.DispImm;
 
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
index f925a1efc88f5..80a1e85e4a5db 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
@@ -48,7 +48,7 @@ void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
     O << Imm;
   } else {
     assert(Op.isExpr() && "unknown pcrel immediate operand");
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -62,7 +62,7 @@ void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     O << '#';
-    Op.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 
@@ -83,7 +83,7 @@ void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
     O << '&';
 
   if (Disp.isExpr())
-    Disp.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *Disp.getExpr());
   else {
     assert(Disp.isImm() && "Expected immediate in displacement field");
     O << Disp.getImm();
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index 344ccec58affd..321aee4720829 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -344,7 +345,7 @@ void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       auto &Sym = static_cast<const MCSymbolWasm &>(SRE->getSymbol());
       O << WebAssembly::signatureToString(Sym.getSignature());
     } else {
-      Op.getExpr()->print(O, &MAI);
+      MAI.printExpr(O, *Op.getExpr());
     }
   }
 }
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
index 408a6ac01de9e..6f9f29765452e 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp
@@ -135,7 +135,7 @@ void XtensaInstPrinter::printCallOperand(const MCInst *MI, uint64_t Address,
       O << Val;
     }
   } else if (MC.isExpr())
-    MC.getExpr()->print(O, &MAI);
+    MAI.printExpr(O, *MC.getExpr());
   else
     llvm_unreachable("Invalid operand");
 }

From d793168e3b1a0343debfdfe143d7fb4127f9038c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 18:04:22 -0700
Subject: [PATCH 534/851] MIPS: Rename MipsMCExpr::MEK_ to Mips::S_

Prepare for removing MipsMCExpr. Adopt the newer naming convention (S_)
used by AMDGPU/WebAssembly/VE/M68k/PowerPC/LoongArch/RISCV.
---
 .../Target/Mips/AsmParser/MipsAsmParser.cpp   | 113 ++++++++-------
 .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp |  18 +--
 .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp       | 132 +++++++++++++++++
 .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h  |  34 +++++
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp   |  56 ++++----
 .../Target/Mips/MCTargetDesc/MipsMCExpr.cpp   | 134 +-----------------
 .../lib/Target/Mips/MCTargetDesc/MipsMCExpr.h |  29 ----
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp  |   9 +-
 llvm/lib/Target/Mips/MipsAsmPrinter.cpp       |   3 +-
 llvm/lib/Target/Mips/MipsMCInstLower.cpp      |  68 ++++-----
 llvm/lib/Target/Mips/MipsTargetObjectFile.cpp |   4 +-
 11 files changed, 305 insertions(+), 295 deletions(-)

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 76bbdef7ae22d..8d9c3a96b32a1 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -9,7 +9,7 @@
 #include "MCTargetDesc/MipsABIFlagsSection.h"
 #include "MCTargetDesc/MipsABIInfo.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MCTargetDesc/MipsTargetStreamer.h"
 #include "TargetInfo/MipsTargetInfo.h"
@@ -2964,10 +2964,10 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
         Res.getConstant() == 0 && !IsLocalSym) {
       if (UseXGOT) {
-        const MCExpr *CallHiExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_HI16,
-                                                      SymExpr, getContext());
-        const MCExpr *CallLoExpr = MipsMCExpr::create(MipsMCExpr::MEK_CALL_LO16,
-                                                      SymExpr, getContext());
+        const MCExpr *CallHiExpr =
+            MipsMCExpr::create(Mips::S_CALL_HI16, SymExpr, getContext());
+        const MCExpr *CallLoExpr =
+            MipsMCExpr::create(Mips::S_CALL_LO16, SymExpr, getContext());
         TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(CallHiExpr), IDLoc,
                     STI);
         TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, DstReg, GPReg,
@@ -2976,7 +2976,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
                      MCOperand::createExpr(CallLoExpr), IDLoc, STI);
       } else {
         const MCExpr *CallExpr =
-            MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
+            MipsMCExpr::create(Mips::S_GOT_CALL, SymExpr, getContext());
         TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, GPReg,
                      MCOperand::createExpr(CallExpr), IDLoc, STI);
       }
@@ -3009,9 +3009,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // this happens then the last instruction must use $rd as the result
       // register.
       const MCExpr *CallHiExpr =
-          MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, SymExpr, getContext());
-      const MCExpr *CallLoExpr = MipsMCExpr::create(
-          Res.getAddSym(), MipsMCExpr::MEK_GOT_LO16, getContext());
+          MipsMCExpr::create(Mips::S_GOT_HI16, SymExpr, getContext());
+      const MCExpr *CallLoExpr =
+          MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_LO16, getContext());
 
       TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc,
                   STI);
@@ -3042,8 +3042,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // The daddiu's marked with a '>' may be omitted if they are redundant. If
       // this happens then the last instruction must use $rd as the result
       // register.
-      GotExpr = MipsMCExpr::create(Res.getAddSym(), MipsMCExpr::MEK_GOT_DISP,
-                                   getContext());
+      GotExpr =
+          MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_DISP, getContext());
       if (Res.getConstant() != 0) {
         // Symbols fully resolve with just the %got_disp(symbol) but we
         // must still account for any offset to the symbol for
@@ -3070,15 +3070,14 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // this happens then the last instruction must use $rd as the result
       // register.
       if (IsLocalSym) {
-        GotExpr =
-            MipsMCExpr::create(MipsMCExpr::MEK_GOT, SymExpr, getContext());
-        LoExpr = MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+        GotExpr = MipsMCExpr::create(Mips::S_GOT, SymExpr, getContext());
+        LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
       } else {
         // External symbols fully resolve the symbol with just the %got(symbol)
         // but we must still account for any offset to the symbol for
         // expressions like symbol+8.
-        GotExpr = MipsMCExpr::create(Res.getAddSym(), MipsMCExpr::MEK_GOT,
-                                     getContext());
+        GotExpr =
+            MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT, getContext());
         if (Res.getConstant() != 0)
           LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
       }
@@ -3099,9 +3098,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
   }
 
   const MipsMCExpr *HiExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext());
+      MipsMCExpr::create(Mips::S_HI, SymExpr, getContext());
   const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_LO, SymExpr, getContext());
+      MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
 
   // This is the 64-bit symbol address expansion.
   if (ABI.ArePtrs64bit() && isGP64bit()) {
@@ -3113,9 +3112,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     // source register.
 
     const MipsMCExpr *HighestExpr =
-        MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
+        MipsMCExpr::create(Mips::S_HIGHEST, SymExpr, getContext());
     const MipsMCExpr *HigherExpr =
-        MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
+        MipsMCExpr::create(Mips::S_HIGHER, SymExpr, getContext());
 
     bool RdRegIsRsReg =
         UseSrcReg &&
@@ -3314,7 +3313,7 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
   if(IsPicEnabled) {
     const MCExpr *GotSym = MCSymbolRefExpr::create(Sym, getContext());
     const MipsMCExpr *GotExpr =
-        MipsMCExpr::create(MipsMCExpr::MEK_GOT, GotSym, getContext());
+        MipsMCExpr::create(Mips::S_GOT, GotSym, getContext());
 
     if(isABI_O32() || isABI_N32()) {
       TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr),
@@ -3326,7 +3325,7 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
   } else { //!IsPicEnabled
     const MCExpr *HiSym = MCSymbolRefExpr::create(Sym, getContext());
     const MipsMCExpr *HiExpr =
-        MipsMCExpr::create(MipsMCExpr::MEK_HI, HiSym, getContext());
+        MipsMCExpr::create(Mips::S_HI, HiSym, getContext());
 
     // FIXME: This is technically correct but gives a different result to gas,
     // but gas is incomplete there (it has a fixme noting it doesn't work with
@@ -3339,10 +3338,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
     } else { //isABI_N64()
       const MCExpr *HighestSym = MCSymbolRefExpr::create(Sym, getContext());
       const MipsMCExpr *HighestExpr =
-          MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, HighestSym, getContext());
+          MipsMCExpr::create(Mips::S_HIGHEST, HighestSym, getContext());
       const MCExpr *HigherSym = MCSymbolRefExpr::create(Sym, getContext());
       const MipsMCExpr *HigherExpr =
-          MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, HigherSym, getContext());
+          MipsMCExpr::create(Mips::S_HIGHER, HigherSym, getContext());
 
       TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
                   STI);
@@ -3430,7 +3429,7 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
   const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3481,7 +3480,7 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
   const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3562,7 +3561,7 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
   const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3785,15 +3784,15 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
       //                  sw  $8,  %lo(sym)($at)
       const MCExpr *OffExpr = OffsetOp.getExpr();
       MCOperand LoOperand = MCOperand::createExpr(
-          MipsMCExpr::create(MipsMCExpr::MEK_LO, OffExpr, getContext()));
+          MipsMCExpr::create(Mips::S_LO, OffExpr, getContext()));
       MCOperand HiOperand = MCOperand::createExpr(
-          MipsMCExpr::create(MipsMCExpr::MEK_HI, OffExpr, getContext()));
+          MipsMCExpr::create(Mips::S_HI, OffExpr, getContext()));
 
       if (ABI.IsN64()) {
         MCOperand HighestOperand = MCOperand::createExpr(
-            MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, OffExpr, getContext()));
+            MipsMCExpr::create(Mips::S_HIGHEST, OffExpr, getContext()));
         MCOperand HigherOperand = MCOperand::createExpr(
-            MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, OffExpr, getContext()));
+            MipsMCExpr::create(Mips::S_HIGHER, OffExpr, getContext()));
 
         TOut.emitRX(Mips::LUi, TmpReg, HighestOperand, IDLoc, STI);
         TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, HigherOperand, IDLoc, STI);
@@ -6355,31 +6354,31 @@ MCRegister MipsAsmParser::getReg(int RC, int RegNo) {
 const MCExpr *MipsAsmParser::parseRelocExpr() {
   auto getOp = [](StringRef Op) {
     return StringSwitch<MipsMCExpr::Specifier>(Op)
-        .Case("call16", MipsMCExpr::MEK_GOT_CALL)
-        .Case("call_hi", MipsMCExpr::MEK_CALL_HI16)
-        .Case("call_lo", MipsMCExpr::MEK_CALL_LO16)
-        .Case("dtprel_hi", MipsMCExpr::MEK_DTPREL_HI)
-        .Case("dtprel_lo", MipsMCExpr::MEK_DTPREL_LO)
-        .Case("got", MipsMCExpr::MEK_GOT)
-        .Case("got_disp", MipsMCExpr::MEK_GOT_DISP)
-        .Case("got_hi", MipsMCExpr::MEK_GOT_HI16)
-        .Case("got_lo", MipsMCExpr::MEK_GOT_LO16)
-        .Case("got_ofst", MipsMCExpr::MEK_GOT_OFST)
-        .Case("got_page", MipsMCExpr::MEK_GOT_PAGE)
-        .Case("gottprel", MipsMCExpr::MEK_GOTTPREL)
-        .Case("gp_rel", MipsMCExpr::MEK_GPREL)
-        .Case("hi", MipsMCExpr::MEK_HI)
-        .Case("higher", MipsMCExpr::MEK_HIGHER)
-        .Case("highest", MipsMCExpr::MEK_HIGHEST)
-        .Case("lo", MipsMCExpr::MEK_LO)
-        .Case("neg", MipsMCExpr::MEK_NEG)
-        .Case("pcrel_hi", MipsMCExpr::MEK_PCREL_HI16)
-        .Case("pcrel_lo", MipsMCExpr::MEK_PCREL_LO16)
-        .Case("tlsgd", MipsMCExpr::MEK_TLSGD)
-        .Case("tlsldm", MipsMCExpr::MEK_TLSLDM)
-        .Case("tprel_hi", MipsMCExpr::MEK_TPREL_HI)
-        .Case("tprel_lo", MipsMCExpr::MEK_TPREL_LO)
-        .Default(MipsMCExpr::MEK_None);
+        .Case("call16", Mips::S_GOT_CALL)
+        .Case("call_hi", Mips::S_CALL_HI16)
+        .Case("call_lo", Mips::S_CALL_LO16)
+        .Case("dtprel_hi", Mips::S_DTPREL_HI)
+        .Case("dtprel_lo", Mips::S_DTPREL_LO)
+        .Case("got", Mips::S_GOT)
+        .Case("got_disp", Mips::S_GOT_DISP)
+        .Case("got_hi", Mips::S_GOT_HI16)
+        .Case("got_lo", Mips::S_GOT_LO16)
+        .Case("got_ofst", Mips::S_GOT_OFST)
+        .Case("got_page", Mips::S_GOT_PAGE)
+        .Case("gottprel", Mips::S_GOTTPREL)
+        .Case("gp_rel", Mips::S_GPREL)
+        .Case("hi", Mips::S_HI)
+        .Case("higher", Mips::S_HIGHER)
+        .Case("highest", Mips::S_HIGHEST)
+        .Case("lo", Mips::S_LO)
+        .Case("neg", Mips::S_NEG)
+        .Case("pcrel_hi", Mips::S_PCREL_HI16)
+        .Case("pcrel_lo", Mips::S_PCREL_LO16)
+        .Case("tlsgd", Mips::S_TLSGD)
+        .Case("tlsldm", Mips::S_TLSLDM)
+        .Case("tprel_hi", Mips::S_TPREL_HI)
+        .Case("tprel_lo", Mips::S_TPREL_LO)
+        .Default(Mips::S_None);
   };
 
   MCAsmParser &Parser = getParser();
@@ -6391,7 +6390,7 @@ const MCExpr *MipsAsmParser::parseRelocExpr() {
         Parser.parseToken(AsmToken::LParen, "expected '('"))
       return nullptr;
     auto Op = getOp(Name);
-    if (Op == MipsMCExpr::MEK_None) {
+    if (Op == Mips::S_None) {
       Error(Parser.getTok().getLoc(), "invalid relocation operator");
       return nullptr;
     }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index d5e19ccaa1689..58aa374e5302d 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/MipsFixupKinds.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -157,14 +157,14 @@ unsigned MipsELFObjectWriter::getRelocType(const MCFixup &Fixup,
   // Determine the type of the relocation.
   unsigned Kind = Fixup.getTargetKind();
   switch (Target.getSpecifier()) {
-  case MipsMCExpr::MEK_DTPREL:
-  case MipsMCExpr::MEK_DTPREL_HI:
-  case MipsMCExpr::MEK_DTPREL_LO:
-  case MipsMCExpr::MEK_TLSLDM:
-  case MipsMCExpr::MEK_TLSGD:
-  case MipsMCExpr::MEK_GOTTPREL:
-  case MipsMCExpr::MEK_TPREL_HI:
-  case MipsMCExpr::MEK_TPREL_LO:
+  case Mips::S_DTPREL:
+  case Mips::S_DTPREL_HI:
+  case Mips::S_DTPREL_LO:
+  case Mips::S_TLSLDM:
+  case Mips::S_TLSGD:
+  case Mips::S_GOTTPREL:
+  case Mips::S_TPREL_HI:
+  case Mips::S_TPREL_LO:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 11df6fecaf37b..97c173618167b 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "MipsMCAsmInfo.h"
 #include "MipsABIInfo.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -57,3 +58,134 @@ MipsCOFFMCAsmInfo::MipsCOFFMCAsmInfo() {
   PrivateLabelPrefix = ".L";
   AllowAtInName = true;
 }
+
+void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  int64_t AbsVal;
+
+  switch (specifier) {
+  case Mips::S_None:
+  case Mips::S_Special:
+    llvm_unreachable("Mips::S_None and MEK_Special are invalid");
+    break;
+  case Mips::S_DTPREL:
+    // Mips::S_DTPREL is used for marking TLS DIEExpr only
+    // and contains a regular sub-expression.
+    MAI->printExpr(OS, *getSubExpr());
+    return;
+  case Mips::S_CALL_HI16:
+    OS << "%call_hi";
+    break;
+  case Mips::S_CALL_LO16:
+    OS << "%call_lo";
+    break;
+  case Mips::S_DTPREL_HI:
+    OS << "%dtprel_hi";
+    break;
+  case Mips::S_DTPREL_LO:
+    OS << "%dtprel_lo";
+    break;
+  case Mips::S_GOT:
+    OS << "%got";
+    break;
+  case Mips::S_GOTTPREL:
+    OS << "%gottprel";
+    break;
+  case Mips::S_GOT_CALL:
+    OS << "%call16";
+    break;
+  case Mips::S_GOT_DISP:
+    OS << "%got_disp";
+    break;
+  case Mips::S_GOT_HI16:
+    OS << "%got_hi";
+    break;
+  case Mips::S_GOT_LO16:
+    OS << "%got_lo";
+    break;
+  case Mips::S_GOT_PAGE:
+    OS << "%got_page";
+    break;
+  case Mips::S_GOT_OFST:
+    OS << "%got_ofst";
+    break;
+  case Mips::S_GPREL:
+    OS << "%gp_rel";
+    break;
+  case Mips::S_HI:
+    OS << "%hi";
+    break;
+  case Mips::S_HIGHER:
+    OS << "%higher";
+    break;
+  case Mips::S_HIGHEST:
+    OS << "%highest";
+    break;
+  case Mips::S_LO:
+    OS << "%lo";
+    break;
+  case Mips::S_NEG:
+    OS << "%neg";
+    break;
+  case Mips::S_PCREL_HI16:
+    OS << "%pcrel_hi";
+    break;
+  case Mips::S_PCREL_LO16:
+    OS << "%pcrel_lo";
+    break;
+  case Mips::S_TLSGD:
+    OS << "%tlsgd";
+    break;
+  case Mips::S_TLSLDM:
+    OS << "%tlsldm";
+    break;
+  case Mips::S_TPREL_HI:
+    OS << "%tprel_hi";
+    break;
+  case Mips::S_TPREL_LO:
+    OS << "%tprel_lo";
+    break;
+  }
+
+  OS << '(';
+  if (Expr->evaluateAsAbsolute(AbsVal))
+    OS << AbsVal;
+  else
+    Expr->print(OS, MAI);
+  OS << ')';
+}
+
+bool MipsMCExpr::isGpOff(Specifier &S) const {
+  if (getSpecifier() == Mips::S_HI || getSpecifier() == Mips::S_LO) {
+    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(getSubExpr())) {
+      if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
+        if (S1->getSpecifier() == Mips::S_NEG &&
+            S2->getSpecifier() == Mips::S_GPREL) {
+          S = getSpecifier();
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                           const MCAssembler *Asm) const {
+  // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X)))
+  // special cases.
+  if (isGpOff()) {
+    const MCExpr *SubExpr =
+        cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
+            ->getSubExpr();
+    if (!SubExpr->evaluateAsRelocatable(Res, Asm))
+      return false;
+
+    Res.setSpecifier(Mips::S_Special);
+    return true;
+  }
+
+  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(specifier);
+  return !Res.getSubSym();
+}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 3a2895a79f9c7..d8b96f8b568c7 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -13,8 +13,10 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 
+#include "MCTargetDesc/MipsMCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCFixup.h"
 
 namespace llvm {
 class Triple;
@@ -34,6 +36,38 @@ class MipsCOFFMCAsmInfo : public MCAsmInfoGNUCOFF {
   explicit MipsCOFFMCAsmInfo();
 };
 
+namespace Mips {
+enum {
+  S_None,
+  S_CALL_HI16 = FirstTargetFixupKind,
+  S_CALL_LO16,
+  S_DTPREL,
+  S_DTPREL_HI,
+  S_DTPREL_LO,
+  S_GOT,
+  S_GOTTPREL,
+  S_GOT_CALL,
+  S_GOT_DISP,
+  S_GOT_HI16,
+  S_GOT_LO16,
+  S_GOT_OFST,
+  S_GOT_PAGE,
+  S_GPREL,
+  S_HI,
+  S_HIGHER,
+  S_HIGHEST,
+  S_LO,
+  S_NEG,
+  S_PCREL_HI16,
+  S_PCREL_LO16,
+  S_TLSGD,
+  S_TLSLDM,
+  S_TPREL_HI,
+  S_TPREL_LO,
+  S_Special,
+};
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index a426ca7360ce7..4035618e02526 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -13,7 +13,7 @@
 #include "MipsMCCodeEmitter.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MCTargetDesc/MipsFixupKinds.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
@@ -585,62 +585,62 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
 
     Mips::Fixups FixupKind = Mips::Fixups(0);
     switch (MipsExpr->getSpecifier()) {
-    case MipsMCExpr::MEK_None:
-    case MipsMCExpr::MEK_Special:
+    case Mips::S_None:
+    case Mips::S_Special:
       llvm_unreachable("Unhandled fixup kind!");
       break;
-    case MipsMCExpr::MEK_DTPREL:
+    case Mips::S_DTPREL:
       // MEK_DTPREL is used for marking TLS DIEExpr only
       // and contains a regular sub-expression.
       return getExprOpValue(MipsExpr->getSubExpr(), Fixups, STI);
-    case MipsMCExpr::MEK_CALL_HI16:
+    case Mips::S_CALL_HI16:
       FixupKind = Mips::fixup_Mips_CALL_HI16;
       break;
-    case MipsMCExpr::MEK_CALL_LO16:
+    case Mips::S_CALL_LO16:
       FixupKind = Mips::fixup_Mips_CALL_LO16;
       break;
-    case MipsMCExpr::MEK_DTPREL_HI:
+    case Mips::S_DTPREL_HI:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
                                    : Mips::fixup_Mips_DTPREL_HI;
       break;
-    case MipsMCExpr::MEK_DTPREL_LO:
+    case Mips::S_DTPREL_LO:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
                                    : Mips::fixup_Mips_DTPREL_LO;
       break;
-    case MipsMCExpr::MEK_GOTTPREL:
+    case Mips::S_GOTTPREL:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOTTPREL
                                    : Mips::fixup_Mips_GOTTPREL;
       break;
-    case MipsMCExpr::MEK_GOT:
+    case Mips::S_GOT:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
                                    : Mips::fixup_Mips_GOT;
       break;
-    case MipsMCExpr::MEK_GOT_CALL:
+    case Mips::S_GOT_CALL:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_CALL16
                                    : Mips::fixup_Mips_CALL16;
       break;
-    case MipsMCExpr::MEK_GOT_DISP:
+    case Mips::S_GOT_DISP:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_DISP
                                    : Mips::fixup_Mips_GOT_DISP;
       break;
-    case MipsMCExpr::MEK_GOT_HI16:
+    case Mips::S_GOT_HI16:
       FixupKind = Mips::fixup_Mips_GOT_HI16;
       break;
-    case MipsMCExpr::MEK_GOT_LO16:
+    case Mips::S_GOT_LO16:
       FixupKind = Mips::fixup_Mips_GOT_LO16;
       break;
-    case MipsMCExpr::MEK_GOT_PAGE:
+    case Mips::S_GOT_PAGE:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_PAGE
                                    : Mips::fixup_Mips_GOT_PAGE;
       break;
-    case MipsMCExpr::MEK_GOT_OFST:
+    case Mips::S_GOT_OFST:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT_OFST
                                    : Mips::fixup_Mips_GOT_OFST;
       break;
-    case MipsMCExpr::MEK_GPREL:
+    case Mips::S_GPREL:
       FixupKind = Mips::fixup_Mips_GPREL16;
       break;
-    case MipsMCExpr::MEK_LO:
+    case Mips::S_LO:
       // Check for %lo(%neg(%gp_rel(X)))
       if (MipsExpr->isGpOff())
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_LO
@@ -649,15 +649,15 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
                                      : Mips::fixup_Mips_LO16;
       break;
-    case MipsMCExpr::MEK_HIGHEST:
+    case Mips::S_HIGHEST:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HIGHEST
                                    : Mips::fixup_Mips_HIGHEST;
       break;
-    case MipsMCExpr::MEK_HIGHER:
+    case Mips::S_HIGHER:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HIGHER
                                    : Mips::fixup_Mips_HIGHER;
       break;
-    case MipsMCExpr::MEK_HI:
+    case Mips::S_HI:
       // Check for %hi(%neg(%gp_rel(X)))
       if (MipsExpr->isGpOff())
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_HI
@@ -666,29 +666,29 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_HI16
                                      : Mips::fixup_Mips_HI16;
       break;
-    case MipsMCExpr::MEK_PCREL_HI16:
+    case Mips::S_PCREL_HI16:
       FixupKind = Mips::fixup_MIPS_PCHI16;
       break;
-    case MipsMCExpr::MEK_PCREL_LO16:
+    case Mips::S_PCREL_LO16:
       FixupKind = Mips::fixup_MIPS_PCLO16;
       break;
-    case MipsMCExpr::MEK_TLSGD:
+    case Mips::S_TLSGD:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_GD
                                    : Mips::fixup_Mips_TLSGD;
       break;
-    case MipsMCExpr::MEK_TLSLDM:
+    case Mips::S_TLSLDM:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_LDM
                                    : Mips::fixup_Mips_TLSLDM;
       break;
-    case MipsMCExpr::MEK_TPREL_HI:
+    case Mips::S_TPREL_HI:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
                                    : Mips::fixup_Mips_TPREL_HI;
       break;
-    case MipsMCExpr::MEK_TPREL_LO:
+    case Mips::S_TPREL_LO:
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
                                    : Mips::fixup_Mips_TPREL_LO;
       break;
-    case MipsMCExpr::MEK_NEG:
+    case Mips::S_NEG:
       FixupKind =
           isMicroMips(STI) ? Mips::fixup_MICROMIPS_SUB : Mips::fixup_Mips_SUB;
       break;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 704ee0375f7a6..821f662f0cbfb 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
@@ -33,135 +34,6 @@ const MipsMCExpr *MipsMCExpr::create(const MCSymbol *Sym, Specifier S,
 
 const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
                                           const MCExpr *Expr, MCContext &Ctx) {
-  return create(S, create(MEK_NEG, create(MEK_GPREL, Expr, Ctx), Ctx), Ctx);
-}
-
-void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  int64_t AbsVal;
-
-  switch (specifier) {
-  case MEK_None:
-  case MEK_Special:
-    llvm_unreachable("MEK_None and MEK_Special are invalid");
-    break;
-  case MEK_DTPREL:
-    // MEK_DTPREL is used for marking TLS DIEExpr only
-    // and contains a regular sub-expression.
-    MAI->printExpr(OS, *getSubExpr());
-    return;
-  case MEK_CALL_HI16:
-    OS << "%call_hi";
-    break;
-  case MEK_CALL_LO16:
-    OS << "%call_lo";
-    break;
-  case MEK_DTPREL_HI:
-    OS << "%dtprel_hi";
-    break;
-  case MEK_DTPREL_LO:
-    OS << "%dtprel_lo";
-    break;
-  case MEK_GOT:
-    OS << "%got";
-    break;
-  case MEK_GOTTPREL:
-    OS << "%gottprel";
-    break;
-  case MEK_GOT_CALL:
-    OS << "%call16";
-    break;
-  case MEK_GOT_DISP:
-    OS << "%got_disp";
-    break;
-  case MEK_GOT_HI16:
-    OS << "%got_hi";
-    break;
-  case MEK_GOT_LO16:
-    OS << "%got_lo";
-    break;
-  case MEK_GOT_PAGE:
-    OS << "%got_page";
-    break;
-  case MEK_GOT_OFST:
-    OS << "%got_ofst";
-    break;
-  case MEK_GPREL:
-    OS << "%gp_rel";
-    break;
-  case MEK_HI:
-    OS << "%hi";
-    break;
-  case MEK_HIGHER:
-    OS << "%higher";
-    break;
-  case MEK_HIGHEST:
-    OS << "%highest";
-    break;
-  case MEK_LO:
-    OS << "%lo";
-    break;
-  case MEK_NEG:
-    OS << "%neg";
-    break;
-  case MEK_PCREL_HI16:
-    OS << "%pcrel_hi";
-    break;
-  case MEK_PCREL_LO16:
-    OS << "%pcrel_lo";
-    break;
-  case MEK_TLSGD:
-    OS << "%tlsgd";
-    break;
-  case MEK_TLSLDM:
-    OS << "%tlsldm";
-    break;
-  case MEK_TPREL_HI:
-    OS << "%tprel_hi";
-    break;
-  case MEK_TPREL_LO:
-    OS << "%tprel_lo";
-    break;
-  }
-
-  OS << '(';
-  if (Expr->evaluateAsAbsolute(AbsVal))
-    OS << AbsVal;
-  else
-    Expr->print(OS, MAI);
-  OS << ')';
-}
-
-bool MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                           const MCAssembler *Asm) const {
-  // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X)))
-  // special cases.
-  if (isGpOff()) {
-    const MCExpr *SubExpr =
-        cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
-            ->getSubExpr();
-    if (!SubExpr->evaluateAsRelocatable(Res, Asm))
-      return false;
-
-    Res.setSpecifier(MEK_Special);
-    return true;
-  }
-
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(specifier);
-  return !Res.getSubSym();
-}
-
-bool MipsMCExpr::isGpOff(Specifier &S) const {
-  if (getSpecifier() == MEK_HI || getSpecifier() == MEK_LO) {
-    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(getSubExpr())) {
-      if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
-        if (S1->getSpecifier() == MEK_NEG && S2->getSpecifier() == MEK_GPREL) {
-          S = getSpecifier();
-          return true;
-        }
-      }
-    }
-  }
-  return false;
+  return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx),
+                Ctx);
 }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 5aad02c38d6ec..216077a1aa489 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -17,35 +17,6 @@ namespace llvm {
 class MipsMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = Spec;
-  enum {
-    MEK_None,
-    MEK_CALL_HI16,
-    MEK_CALL_LO16,
-    MEK_DTPREL,
-    MEK_DTPREL_HI,
-    MEK_DTPREL_LO,
-    MEK_GOT,
-    MEK_GOTTPREL,
-    MEK_GOT_CALL,
-    MEK_GOT_DISP,
-    MEK_GOT_HI16,
-    MEK_GOT_LO16,
-    MEK_GOT_OFST,
-    MEK_GOT_PAGE,
-    MEK_GPREL,
-    MEK_HI,
-    MEK_HIGHER,
-    MEK_HIGHEST,
-    MEK_LO,
-    MEK_NEG,
-    MEK_PCREL_HI16,
-    MEK_PCREL_LO16,
-    MEK_TLSGD,
-    MEK_TLSLDM,
-    MEK_TPREL_HI,
-    MEK_TPREL_LO,
-    MEK_Special,
-  };
 
 private:
   explicit MipsMCExpr(const MCExpr *Expr, Specifier S)
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 49aea9c691629..80a854c799014 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -12,6 +12,7 @@
 
 #include "MipsTargetStreamer.h"
 #include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MipsBaseInfo.h"
 #include "MipsELFStreamer.h"
 #include "MipsInstPrinter.h"
@@ -1266,7 +1267,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   TmpInst.setOpcode(Mips::LUi);
   TmpInst.addOperand(MCOperand::createReg(GPReg));
   const MCExpr *HiSym = MipsMCExpr::create(
-      MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
+      Mips::S_HI, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
       MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(HiSym));
   getStreamer().emitInstruction(TmpInst, STI);
@@ -1277,7 +1278,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   TmpInst.addOperand(MCOperand::createReg(GPReg));
   TmpInst.addOperand(MCOperand::createReg(GPReg));
   const MCExpr *LoSym = MipsMCExpr::create(
-      MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
+      Mips::S_LO, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
       MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(LoSym));
   getStreamer().emitInstruction(TmpInst, STI);
@@ -1342,10 +1343,10 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
   }
 
   const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
-      MipsMCExpr::MEK_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+      Mips::S_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
       MCA.getContext());
   const MipsMCExpr *LoExpr = MipsMCExpr::createGpOff(
-      MipsMCExpr::MEK_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+      Mips::S_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
       MCA.getContext());
 
   // lui $gp, %hi(%neg(%gp_rel(funcSym)))
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 1a3e99ec7f68f..da3f7cb55b301 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -15,6 +15,7 @@
 #include "MCTargetDesc/MipsABIInfo.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MCTargetDesc/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MCTargetDesc/MipsMCNaCl.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MCTargetDesc/MipsTargetStreamer.h"
@@ -1244,7 +1245,7 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
 // and value for debug thread local expression.
 void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const {
   if (auto *MipsExpr = dyn_cast<MipsMCExpr>(Value)) {
-    if (MipsExpr && MipsExpr->getSpecifier() == MipsMCExpr::MEK_DTPREL) {
+    if (MipsExpr && MipsExpr->getSpecifier() == Mips::S_DTPREL) {
       switch (Size) {
       case 4:
         getTargetStreamer().emitDTPRel32Value(MipsExpr->getSubExpr());
diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
index d1eef1775aa6d..3c3690a7f983d 100644
--- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -13,7 +13,7 @@
 
 #include "MipsMCInstLower.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MipsAsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -35,7 +35,7 @@ void MipsMCInstLower::Initialize(MCContext *C) {
 MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                               MachineOperandType MOTy,
                                               int64_t Offset) const {
-  MipsMCExpr::Specifier TargetKind = MipsMCExpr::MEK_None;
+  MipsMCExpr::Specifier TargetKind = Mips::S_None;
   bool IsGpOff = false;
   const MCSymbol *Symbol;
   SmallString<128> Name;
@@ -53,75 +53,75 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   case MipsII::MO_NO_FLAG:
     break;
   case MipsII::MO_GPREL:
-    TargetKind = MipsMCExpr::MEK_GPREL;
+    TargetKind = Mips::S_GPREL;
     break;
   case MipsII::MO_GOT_CALL:
-    TargetKind = MipsMCExpr::MEK_GOT_CALL;
+    TargetKind = Mips::S_GOT_CALL;
     break;
   case MipsII::MO_GOT:
-    TargetKind = MipsMCExpr::MEK_GOT;
+    TargetKind = Mips::S_GOT;
     break;
   case MipsII::MO_ABS_HI:
-    TargetKind = MipsMCExpr::MEK_HI;
+    TargetKind = Mips::S_HI;
     break;
   case MipsII::MO_ABS_LO:
-    TargetKind = MipsMCExpr::MEK_LO;
+    TargetKind = Mips::S_LO;
     break;
   case MipsII::MO_TLSGD:
-    TargetKind = MipsMCExpr::MEK_TLSGD;
+    TargetKind = Mips::S_TLSGD;
     break;
   case MipsII::MO_TLSLDM:
-    TargetKind = MipsMCExpr::MEK_TLSLDM;
+    TargetKind = Mips::S_TLSLDM;
     break;
   case MipsII::MO_DTPREL_HI:
-    TargetKind = MipsMCExpr::MEK_DTPREL_HI;
+    TargetKind = Mips::S_DTPREL_HI;
     break;
   case MipsII::MO_DTPREL_LO:
-    TargetKind = MipsMCExpr::MEK_DTPREL_LO;
+    TargetKind = Mips::S_DTPREL_LO;
     break;
   case MipsII::MO_GOTTPREL:
-    TargetKind = MipsMCExpr::MEK_GOTTPREL;
+    TargetKind = Mips::S_GOTTPREL;
     break;
   case MipsII::MO_TPREL_HI:
-    TargetKind = MipsMCExpr::MEK_TPREL_HI;
+    TargetKind = Mips::S_TPREL_HI;
     break;
   case MipsII::MO_TPREL_LO:
-    TargetKind = MipsMCExpr::MEK_TPREL_LO;
+    TargetKind = Mips::S_TPREL_LO;
     break;
   case MipsII::MO_GPOFF_HI:
-    TargetKind = MipsMCExpr::MEK_HI;
+    TargetKind = Mips::S_HI;
     IsGpOff = true;
     break;
   case MipsII::MO_GPOFF_LO:
-    TargetKind = MipsMCExpr::MEK_LO;
+    TargetKind = Mips::S_LO;
     IsGpOff = true;
     break;
   case MipsII::MO_GOT_DISP:
-    TargetKind = MipsMCExpr::MEK_GOT_DISP;
+    TargetKind = Mips::S_GOT_DISP;
     break;
   case MipsII::MO_GOT_HI16:
-    TargetKind = MipsMCExpr::MEK_GOT_HI16;
+    TargetKind = Mips::S_GOT_HI16;
     break;
   case MipsII::MO_GOT_LO16:
-    TargetKind = MipsMCExpr::MEK_GOT_LO16;
+    TargetKind = Mips::S_GOT_LO16;
     break;
   case MipsII::MO_GOT_PAGE:
-    TargetKind = MipsMCExpr::MEK_GOT_PAGE;
+    TargetKind = Mips::S_GOT_PAGE;
     break;
   case MipsII::MO_GOT_OFST:
-    TargetKind = MipsMCExpr::MEK_GOT_OFST;
+    TargetKind = Mips::S_GOT_OFST;
     break;
   case MipsII::MO_HIGHER:
-    TargetKind = MipsMCExpr::MEK_HIGHER;
+    TargetKind = Mips::S_HIGHER;
     break;
   case MipsII::MO_HIGHEST:
-    TargetKind = MipsMCExpr::MEK_HIGHEST;
+    TargetKind = Mips::S_HIGHEST;
     break;
   case MipsII::MO_CALL_HI16:
-    TargetKind = MipsMCExpr::MEK_CALL_HI16;
+    TargetKind = Mips::S_CALL_HI16;
     break;
   case MipsII::MO_CALL_LO16:
-    TargetKind = MipsMCExpr::MEK_CALL_LO16;
+    TargetKind = Mips::S_CALL_LO16;
     break;
   case MipsII::MO_JALR:
     return MCOperand();
@@ -176,7 +176,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
 
   if (IsGpOff)
     Expr = MipsMCExpr::createGpOff(TargetKind, Expr, *Ctx);
-  else if (TargetKind != MipsMCExpr::MEK_None)
+  else if (TargetKind != Mips::S_None)
     Expr = MipsMCExpr::create(TargetKind, Expr, *Ctx);
 
   return MCOperand::createExpr(Expr);
@@ -230,16 +230,16 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
   unsigned TargetFlags = MI->getOperand(1).getTargetFlags();
   switch (TargetFlags) {
   case MipsII::MO_HIGHEST:
-    Spec = MipsMCExpr::MEK_HIGHEST;
+    Spec = Mips::S_HIGHEST;
     break;
   case MipsII::MO_HIGHER:
-    Spec = MipsMCExpr::MEK_HIGHER;
+    Spec = Mips::S_HIGHER;
     break;
   case MipsII::MO_ABS_HI:
-    Spec = MipsMCExpr::MEK_HI;
+    Spec = Mips::S_HI;
     break;
   case MipsII::MO_ABS_LO:
-    Spec = MipsMCExpr::MEK_LO;
+    Spec = Mips::S_LO;
     break;
   default:
     report_fatal_error("Unexpected flags for lowerLongBranchLUi");
@@ -265,16 +265,16 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
   unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
   switch (TargetFlags) {
   case MipsII::MO_HIGHEST:
-    Spec = MipsMCExpr::MEK_HIGHEST;
+    Spec = Mips::S_HIGHEST;
     break;
   case MipsII::MO_HIGHER:
-    Spec = MipsMCExpr::MEK_HIGHER;
+    Spec = Mips::S_HIGHER;
     break;
   case MipsII::MO_ABS_HI:
-    Spec = MipsMCExpr::MEK_HI;
+    Spec = Mips::S_HI;
     break;
   case MipsII::MO_ABS_LO:
-    Spec = MipsMCExpr::MEK_LO;
+    Spec = Mips::S_LO;
     break;
   default:
     report_fatal_error("Unexpected flags for lowerLongBranchADDiu");
diff --git a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
index e74b3ad5ebca3..23aa699318a2e 100644
--- a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MipsTargetObjectFile.h"
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "MipsSubtarget.h"
 #include "MipsTargetMachine.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -189,5 +189,5 @@ MipsTargetObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
   const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
   Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(0x8000, getContext()), getContext());
-  return MipsMCExpr::create(MipsMCExpr::MEK_DTPREL, Expr, getContext());
+  return MipsMCExpr::create(Mips::S_DTPREL, Expr, getContext());
 }

From ad94f77a6a0c421e1f5815d1b8e5aa86d8dd2e92 Mon Sep 17 00:00:00 2001
From: Tom Vijlbrief <tvijlbrief@gmail.com>
Date: Mon, 16 Jun 2025 03:25:40 +0200
Subject: [PATCH 535/851] [AVR] Add many new AVR MCU model definitions
 (#144229)

1. Added the missing XMEGA2 definition. The avr64 devices use xmega2 which has SPM(X) defined.

2. The avr16/avr32 devices do have SPM and SPMX features, but the current xmega3 definition has not.
   Xmega3 is also used for modern attiny series which do not have SPM(X), so that is correct.
   Leave the avr16/avr32 devices unchanged (using xmega3 to be in sync with gcc definitions).

Fixes https://github.com/llvm/llvm-project/issues/116116
---
 clang/lib/Basic/Targets/AVR.cpp               | 69 +++++++++++++++
 clang/lib/Driver/ToolChains/AVR.cpp           | 70 ++++++++++++++++
 clang/test/Misc/target-invalid-cpu-note/avr.c | 65 ++++++++++++++
 llvm/lib/Target/AVR/AVRDevices.td             | 84 +++++++++++++++++++
 4 files changed, 288 insertions(+)

diff --git a/clang/lib/Basic/Targets/AVR.cpp b/clang/lib/Basic/Targets/AVR.cpp
index 85ca4bc30c461..bbe7b01ca036d 100644
--- a/clang/lib/Basic/Targets/AVR.cpp
+++ b/clang/lib/Basic/Targets/AVR.cpp
@@ -336,6 +336,9 @@ static MCUInfo AVRMcus[] = {
     {"attiny1624", "__AVR_ATtiny1624__", "103", 1},
     {"attiny1626", "__AVR_ATtiny1626__", "103", 1},
     {"attiny1627", "__AVR_ATtiny1627__", "103", 1},
+    {"attiny3224", "__AVR_ATtiny3224__", "103", 1},
+    {"attiny3226", "__AVR_ATtiny3226__", "103", 1},
+    {"attiny3227", "__AVR_ATtiny3227__", "103", 1},
     {"atmega808", "__AVR_ATmega808__", "103", 1},
     {"atmega809", "__AVR_ATmega809__", "103", 1},
     {"atmega1608", "__AVR_ATmega1608__", "103", 1},
@@ -344,6 +347,72 @@ static MCUInfo AVRMcus[] = {
     {"atmega3209", "__AVR_ATmega3209__", "103", 1},
     {"atmega4808", "__AVR_ATmega4808__", "103", 1},
     {"atmega4809", "__AVR_ATmega4809__", "103", 1},
+
+    // gcc 14 additions:
+
+    {"avr64da28", "__AVR_AVR64DA28__", "102", 1},
+    {"avr64da32", "__AVR_AVR64DA32__", "102", 1},
+    {"avr64da48", "__AVR_AVR64DA48__", "102", 1},
+    {"avr64da64", "__AVR_AVR64DA64__", "102", 1},
+    {"avr64db28", "__AVR_AVR64DB28__", "102", 1},
+    {"avr64db32", "__AVR_AVR64DB32__", "102", 1},
+    {"avr64db48", "__AVR_AVR64DB48__", "102", 1},
+    {"avr64db64", "__AVR_AVR64DB64__", "102", 1},
+    {"avr64dd14", "__AVR_AVR64DD14__", "102", 1},
+    {"avr64dd20", "__AVR_AVR64DD20__", "102", 1},
+    {"avr64dd28", "__AVR_AVR64DD28__", "102", 1},
+    {"avr64dd32", "__AVR_AVR64DD32__", "102", 1},
+    {"avr64du28", "__AVR_AVR64DU28__", "102", 1},
+    {"avr64du32", "__AVR_AVR64DU32__", "102", 1},
+    {"avr64ea28", "__AVR_AVR64EA28__", "102", 1},
+    {"avr64ea32", "__AVR_AVR64EA32__", "102", 1},
+    {"avr64ea48", "__AVR_AVR64EA48__", "102", 1},
+    {"avr64sd28", "__AVR_AVR64SD28__", "102", 1},
+    {"avr64sd32", "__AVR_AVR64SD32__", "102", 1},
+    {"avr64sd48", "__AVR_AVR64SD48__", "102", 1},
+
+    {"avr16dd20", "__AVR_AVR16DD20__", "103", 1},
+    {"avr16dd28", "__AVR_AVR16DD28__", "103", 1},
+    {"avr16dd32", "__AVR_AVR16DD32__", "103", 1},
+    {"avr16du14", "__AVR_AVR16DU14__", "103", 1},
+    {"avr16du20", "__AVR_AVR16DU20__", "103", 1},
+    {"avr16du28", "__AVR_AVR16DU28__", "103", 1},
+    {"avr16du32", "__AVR_AVR16DU32__", "103", 1},
+    {"avr32da28", "__AVR_AVR32DA28__", "103", 1},
+    {"avr32da32", "__AVR_AVR32DA32__", "103", 1},
+    {"avr32da48", "__AVR_AVR32DA48__", "103", 1},
+    {"avr32db28", "__AVR_AVR32DB28__", "103", 1},
+    {"avr32db32", "__AVR_AVR32DB32__", "103", 1},
+    {"avr32db48", "__AVR_AVR32DB48__", "103", 1},
+    {"avr32dd14", "__AVR_AVR32DD14__", "103", 1},
+    {"avr32dd20", "__AVR_AVR32DD20__", "103", 1},
+    {"avr32dd28", "__AVR_AVR32DD28__", "103", 1},
+    {"avr32dd32", "__AVR_AVR32DD32__", "103", 1},
+    {"avr32du14", "__AVR_AVR32DU14__", "103", 1},
+    {"avr32du20", "__AVR_AVR32DU20__", "103", 1},
+    {"avr32du28", "__AVR_AVR32DU28__", "103", 1},
+    {"avr32du32", "__AVR_AVR32DU32__", "103", 1},
+    {"avr16eb14", "__AVR_AVR16EB14__", "103", 1},
+    {"avr16eb20", "__AVR_AVR16EB20__", "103", 1},
+    {"avr16eb28", "__AVR_AVR16EB28__", "103", 1},
+    {"avr16eb32", "__AVR_AVR16EB32__", "103", 1},
+    {"avr16ea28", "__AVR_AVR16EA28__", "103", 1},
+    {"avr16ea32", "__AVR_AVR16EA32__", "103", 1},
+    {"avr16ea48", "__AVR_AVR16EA48__", "103", 1},
+    {"avr32ea28", "__AVR_AVR32EA28__", "103", 1},
+    {"avr32ea32", "__AVR_AVR32EA32__", "103", 1},
+    {"avr32ea48", "__AVR_AVR32EA48__", "103", 1},
+    {"avr32sd20", "__AVR_AVR32SD20__", "103", 1},
+    {"avr32sd28", "__AVR_AVR32SD28__", "103", 1},
+    {"avr32sd32", "__AVR_AVR32SD32__", "103", 1},
+    {"avr128da28", "__AVR_AVR128DA28__", "104", 2},
+    {"avr128da32", "__AVR_AVR128DA32__", "104", 2},
+    {"avr128da48", "__AVR_AVR128DA48__", "104", 2},
+    {"avr128da64", "__AVR_AVR128DA64__", "104", 2},
+    {"avr128db28", "__AVR_AVR128DB28__", "104", 2},
+    {"avr128db32", "__AVR_AVR128DB32__", "104", 2},
+    {"avr128db48", "__AVR_AVR128DB48__", "104", 2},
+    {"avr128db64", "__AVR_AVR128DB64__", "104", 2},
 };
 
 } // namespace targets
diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp
index b0523a7f4e40e..731076d9754a9 100644
--- a/clang/lib/Driver/ToolChains/AVR.cpp
+++ b/clang/lib/Driver/ToolChains/AVR.cpp
@@ -326,8 +326,78 @@ constexpr struct {
     {"attiny1624", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1626", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny1627", "avrxmega3", "avrxmega3", 0x803800},
+    {"attiny3224", "avrxmega3", "avrxmega3", 0x803400},
+    {"attiny3226", "avrxmega3", "avrxmega3", 0x803400},
+    {"attiny3227", "avrxmega3", "avrxmega3", 0x803400},
     {"attiny3216", "avrxmega3", "avrxmega3", 0x803800},
     {"attiny3217", "avrxmega3", "avrxmega3", 0x803800},
+
+    // gcc 14 additions:
+
+    {"avr64da28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da48", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64da64", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db48", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64db64", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd14", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd20", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64dd32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64du28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64du32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64ea28", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64ea32", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64ea48", "avrxmega2", "avrxmega2", 0x806800},
+    {"avr64sd28", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64sd32", "avrxmega2", "avrxmega2", 0x806000},
+    {"avr64sd48", "avrxmega2", "avrxmega2", 0x806000},
+
+    {"avr16dd20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16dd28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16dd32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du14", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16du32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr32da28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32da32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32da48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32db48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd14", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32dd32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du14", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32du32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr16eb14", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb20", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16eb32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea28", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea32", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr16ea48", "avrxmega3", "avrxmega3", 0x807800},
+    {"avr32ea28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32ea32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32ea48", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd20", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd28", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr32sd32", "avrxmega3", "avrxmega3", 0x807000},
+    {"avr128da28", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da32", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da48", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128da64", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db28", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db32", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db48", "avrxmega4", "avrxmega4", 0x804000},
+    {"avr128db64", "avrxmega4", "avrxmega4", 0x804000},
+
 };
 
 std::string GetMCUSubPath(StringRef MCUName) {
diff --git a/clang/test/Misc/target-invalid-cpu-note/avr.c b/clang/test/Misc/target-invalid-cpu-note/avr.c
index 86ffbb6838582..49d68bcc2edf8 100644
--- a/clang/test/Misc/target-invalid-cpu-note/avr.c
+++ b/clang/test/Misc/target-invalid-cpu-note/avr.c
@@ -311,6 +311,9 @@
 // CHECK-SAME: {{^}}, attiny1624
 // CHECK-SAME: {{^}}, attiny1626
 // CHECK-SAME: {{^}}, attiny1627
+// CHECK-SAME: {{^}}, attiny3224
+// CHECK-SAME: {{^}}, attiny3226
+// CHECK-SAME: {{^}}, attiny3227
 // CHECK-SAME: {{^}}, atmega808
 // CHECK-SAME: {{^}}, atmega809
 // CHECK-SAME: {{^}}, atmega1608
@@ -319,4 +322,66 @@
 // CHECK-SAME: {{^}}, atmega3209
 // CHECK-SAME: {{^}}, atmega4808
 // CHECK-SAME: {{^}}, atmega4809
+// CHECK-SAME: {{^}}, avr64da28
+// CHECK-SAME: {{^}}, avr64da32
+// CHECK-SAME: {{^}}, avr64da48
+// CHECK-SAME: {{^}}, avr64da64
+// CHECK-SAME: {{^}}, avr64db28
+// CHECK-SAME: {{^}}, avr64db32
+// CHECK-SAME: {{^}}, avr64db48
+// CHECK-SAME: {{^}}, avr64db64
+// CHECK-SAME: {{^}}, avr64dd14
+// CHECK-SAME: {{^}}, avr64dd20
+// CHECK-SAME: {{^}}, avr64dd28
+// CHECK-SAME: {{^}}, avr64dd32
+// CHECK-SAME: {{^}}, avr64du28
+// CHECK-SAME: {{^}}, avr64du32
+// CHECK-SAME: {{^}}, avr64ea28
+// CHECK-SAME: {{^}}, avr64ea32
+// CHECK-SAME: {{^}}, avr64ea48
+// CHECK-SAME: {{^}}, avr64sd28
+// CHECK-SAME: {{^}}, avr64sd32
+// CHECK-SAME: {{^}}, avr64sd48
+// CHECK-SAME: {{^}}, avr16dd20
+// CHECK-SAME: {{^}}, avr16dd28
+// CHECK-SAME: {{^}}, avr16dd32
+// CHECK-SAME: {{^}}, avr16du14
+// CHECK-SAME: {{^}}, avr16du20
+// CHECK-SAME: {{^}}, avr16du28
+// CHECK-SAME: {{^}}, avr16du32
+// CHECK-SAME: {{^}}, avr32da28
+// CHECK-SAME: {{^}}, avr32da32
+// CHECK-SAME: {{^}}, avr32da48
+// CHECK-SAME: {{^}}, avr32db28
+// CHECK-SAME: {{^}}, avr32db32
+// CHECK-SAME: {{^}}, avr32db48
+// CHECK-SAME: {{^}}, avr32dd14
+// CHECK-SAME: {{^}}, avr32dd20
+// CHECK-SAME: {{^}}, avr32dd28
+// CHECK-SAME: {{^}}, avr32dd32
+// CHECK-SAME: {{^}}, avr32du14
+// CHECK-SAME: {{^}}, avr32du20
+// CHECK-SAME: {{^}}, avr32du28
+// CHECK-SAME: {{^}}, avr32du32
+// CHECK-SAME: {{^}}, avr16eb14
+// CHECK-SAME: {{^}}, avr16eb20
+// CHECK-SAME: {{^}}, avr16eb28
+// CHECK-SAME: {{^}}, avr16eb32
+// CHECK-SAME: {{^}}, avr16ea28
+// CHECK-SAME: {{^}}, avr16ea32
+// CHECK-SAME: {{^}}, avr16ea48
+// CHECK-SAME: {{^}}, avr32ea28
+// CHECK-SAME: {{^}}, avr32ea32
+// CHECK-SAME: {{^}}, avr32ea48
+// CHECK-SAME: {{^}}, avr32sd20
+// CHECK-SAME: {{^}}, avr32sd28
+// CHECK-SAME: {{^}}, avr32sd32
+// CHECK-SAME: {{^}}, avr128da28
+// CHECK-SAME: {{^}}, avr128da32
+// CHECK-SAME: {{^}}, avr128da48
+// CHECK-SAME: {{^}}, avr128da64
+// CHECK-SAME: {{^}}, avr128db28
+// CHECK-SAME: {{^}}, avr128db32
+// CHECK-SAME: {{^}}, avr128db48
+// CHECK-SAME: {{^}}, avr128db64
 // CHECK-SAME: {{$}}
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index 56147bb473bc4..ad760d7403573 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -209,12 +209,27 @@ def FamilyTiny
              [FamilyAVR0, FeatureBREAK, FeatureSRAM, FeatureTinyEncoding,
               FeatureSmallStack]>;
 
+def FamilyXMEGA2 : Family<"xmega2",
+                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
+                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
+                           FeatureMultiplication, FeatureMOVW, FeatureLPMX,
+                           FeatureSPM, FeatureSPMX,
+                           FeatureBREAK, FeatureLowByteFirst]>;
+
 def FamilyXMEGA3 : Family<"xmega3",
                           [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
                            FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
                            FeatureMultiplication, FeatureMOVW, FeatureLPMX,
                            FeatureBREAK, FeatureLowByteFirst]>;
 
+def FamilyXMEGA4 : Family<"xmega4",
+                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
+                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
+                           FeatureMultiplication, FeatureMOVW, FeatureLPMX,
+                           FeatureELPM, FeatureELPMX,
+                           FeatureSPM, FeatureSPMX,
+                           FeatureBREAK, FeatureLowByteFirst]>;
+
 def FamilyXMEGA : Family<"xmega",
                          [FamilyAVR0, FeatureLPM, FeatureIJMPCALL,
                           FeatureADDSUBIW, FeatureSRAM, FeatureJMPCALL,
@@ -567,6 +582,9 @@ def : Device<"attiny3217", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1624", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1626", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"attiny1627", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3224", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3226", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"attiny3227", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega809", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega1608", FamilyXMEGA3, ELFArchXMEGA3>;
@@ -575,3 +593,69 @@ def : Device<"atmega3208", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega3209", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4808", FamilyXMEGA3, ELFArchXMEGA3>;
 def : Device<"atmega4809", FamilyXMEGA3, ELFArchXMEGA3>;
+
+// Additions from gcc 14:
+
+def : Device<"avr64da28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64da64", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64db64", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd14", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd20", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64dd32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64du28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64du32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64ea48", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd28", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd32", FamilyXMEGA2, ELFArchXMEGA2>;
+def : Device<"avr64sd48", FamilyXMEGA2, ELFArchXMEGA2>;
+
+def : Device<"avr16dd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16dd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16dd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16du32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32da48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32db48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32dd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32du32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb14", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16eb32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr16ea48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32ea48", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd20", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd28", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr32sd32", FamilyXMEGA3, ELFArchXMEGA3>;
+def : Device<"avr128da28", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da32", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da48", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128da64", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db28", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db32", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db48", FamilyXMEGA4, ELFArchXMEGA4>;
+def : Device<"avr128db64", FamilyXMEGA4, ELFArchXMEGA4>;

From 1506ba95d7c3dca1abff0190550945f6cc263a99 Mon Sep 17 00:00:00 2001
From: Owen Pan <owenpiano@gmail.com>
Date: Sun, 15 Jun 2025 18:28:06 -0700
Subject: [PATCH 536/851] [clang-format][NFC] Clean up DisallowLineBreaks
 lambda (#144255)

See also
https://github.com/llvm/llvm-project/pull/141576/files#r2141808121
---
 clang/lib/Format/ContinuationIndenter.cpp | 78 ++++++++++++-----------
 1 file changed, 42 insertions(+), 36 deletions(-)

diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 424b6dbc0da79..b4745477b96ef 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -329,9 +329,9 @@ bool ContinuationIndenter::canBreak(const LineState &State) {
   // statement and we are aligning lambda blocks to their signatures.
   if (Previous.is(tok::l_brace) && State.Stack.size() > 1 &&
       State.Stack[State.Stack.size() - 2].NestedBlockInlined &&
-      State.Stack[State.Stack.size() - 2].HasMultipleNestedBlocks &&
-      Style.LambdaBodyIndentation == FormatStyle::LBI_Signature) {
-    return false;
+      State.Stack[State.Stack.size() - 2].HasMultipleNestedBlocks) {
+    return Style.isCpp() &&
+           Style.LambdaBodyIndentation == FormatStyle::LBI_OuterScope;
   }
 
   // Don't break after very short return types (e.g. "void") as that is often
@@ -706,42 +706,48 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,
   const FormatToken &Previous = *State.NextToken->Previous;
   auto &CurrentState = State.Stack.back();
 
-  bool DisallowLineBreaksOnThisLine =
-      Style.LambdaBodyIndentation == FormatStyle::LBI_Signature &&
-      // Deal with lambda arguments in C++. The aim here is to ensure that we
-      // don't over-indent lambda function bodies when lambdas are passed as
-      // arguments to function calls. We do this by ensuring that either all
-      // arguments (including any lambdas) go on the same line as the function
-      // call, or we break before the first argument.
-      Style.isCpp() && [&] {
-        // For example, `/*Newline=*/false`.
-        if (Previous.is(TT_BlockComment) && Current.SpacesRequiredBefore == 0)
-          return false;
-        const auto *PrevNonComment = Current.getPreviousNonComment();
-        if (!PrevNonComment || PrevNonComment->isNot(tok::l_paren))
-          return false;
-        if (Current.isOneOf(tok::comment, tok::l_paren, TT_LambdaLSquare))
-          return false;
-        auto BlockParameterCount = PrevNonComment->BlockParameterCount;
-        if (BlockParameterCount == 0)
-          return false;
+  // Deal with lambda arguments in C++. The aim here is to ensure that we don't
+  // over-indent lambda function bodies when lambdas are passed as arguments to
+  // function calls. We do this by ensuring that either all arguments (including
+  // any lambdas) go on the same line as the function call, or we break before
+  // the first argument.
+  auto DisallowLineBreaks = [&] {
+    if (!Style.isCpp() ||
+        Style.LambdaBodyIndentation == FormatStyle::LBI_OuterScope) {
+      return false;
+    }
 
-        // Multiple lambdas in the same function call.
-        if (BlockParameterCount > 1)
-          return true;
+    // For example, `/*Newline=*/false`.
+    if (Previous.is(TT_BlockComment) && Current.SpacesRequiredBefore == 0)
+      return false;
 
-        // A lambda followed by another arg.
-        if (!PrevNonComment->Role)
-          return false;
-        auto Comma = PrevNonComment->Role->lastComma();
-        if (!Comma)
-          return false;
-        auto Next = Comma->getNextNonComment();
-        return Next &&
-               !Next->isOneOf(TT_LambdaLSquare, tok::l_brace, tok::caret);
-      }();
+    if (Current.isOneOf(tok::comment, tok::l_paren, TT_LambdaLSquare))
+      return false;
+
+    const auto *Prev = Current.getPreviousNonComment();
+    if (!Prev || Prev->isNot(tok::l_paren))
+      return false;
+
+    if (Prev->BlockParameterCount == 0)
+      return false;
+
+    // Multiple lambdas in the same function call.
+    if (Prev->BlockParameterCount > 1)
+      return true;
+
+    // A lambda followed by another arg.
+    if (!Prev->Role)
+      return false;
+
+    const auto *Comma = Prev->Role->lastComma();
+    if (!Comma)
+      return false;
+
+    const auto *Next = Comma->getNextNonComment();
+    return Next && !Next->isOneOf(TT_LambdaLSquare, tok::l_brace, tok::caret);
+  };
 
-  if (DisallowLineBreaksOnThisLine)
+  if (DisallowLineBreaks())
     State.NoLineBreak = true;
 
   if (Current.is(tok::equal) &&

From f23b841f0fa7576b90fe226e66192b861a8cf1cf Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 18:35:07 -0700
Subject: [PATCH 537/851] MIPS: Move MipsMCExpr functions to MipsMCAsmInfo

---
 .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp       | 52 +++++++++++++------
 .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h  | 10 ++++
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp   |  4 +-
 .../Target/Mips/MCTargetDesc/MipsMCExpr.cpp   |  7 +++
 .../lib/Target/Mips/MCTargetDesc/MipsMCExpr.h |  8 ---
 5 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 97c173618167b..9b2b25c60c946 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -59,10 +59,11 @@ MipsCOFFMCAsmInfo::MipsCOFFMCAsmInfo() {
   AllowAtInName = true;
 }
 
-void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+static void printImpl(const MCAsmInfo &MAI, raw_ostream &OS,
+                      const MCSpecifierExpr &Expr) {
   int64_t AbsVal;
 
-  switch (specifier) {
+  switch (Expr.getSpecifier()) {
   case Mips::S_None:
   case Mips::S_Special:
     llvm_unreachable("Mips::S_None and MEK_Special are invalid");
@@ -70,7 +71,7 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   case Mips::S_DTPREL:
     // Mips::S_DTPREL is used for marking TLS DIEExpr only
     // and contains a regular sub-expression.
-    MAI->printExpr(OS, *getSubExpr());
+    MAI.printExpr(OS, *Expr.getSubExpr());
     return;
   case Mips::S_CALL_HI16:
     OS << "%call_hi";
@@ -147,20 +148,20 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   }
 
   OS << '(';
-  if (Expr->evaluateAsAbsolute(AbsVal))
+  if (Expr.evaluateAsAbsolute(AbsVal))
     OS << AbsVal;
   else
-    Expr->print(OS, MAI);
+    MAI.printExpr(OS, *Expr.getSubExpr());
   OS << ')';
 }
 
-bool MipsMCExpr::isGpOff(Specifier &S) const {
-  if (getSpecifier() == Mips::S_HI || getSpecifier() == Mips::S_LO) {
-    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(getSubExpr())) {
+bool Mips::isGpOff(const MCSpecifierExpr &E) {
+  if (E.getSpecifier() == Mips::S_HI || E.getSpecifier() == Mips::S_LO) {
+    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(E.getSubExpr())) {
       if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
         if (S1->getSpecifier() == Mips::S_NEG &&
             S2->getSpecifier() == Mips::S_GPREL) {
-          S = getSpecifier();
+          // S = E.getSpecifier();
           return true;
         }
       }
@@ -169,13 +170,13 @@ bool MipsMCExpr::isGpOff(Specifier &S) const {
   return false;
 }
 
-bool MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                           const MCAssembler *Asm) const {
+static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
+                     const MCAssembler *Asm) {
   // Look for the %hi(%neg(%gp_rel(X))) and %lo(%neg(%gp_rel(X)))
   // special cases.
-  if (isGpOff()) {
+  if (Mips::isGpOff(Expr)) {
     const MCExpr *SubExpr =
-        cast<MipsMCExpr>(cast<MipsMCExpr>(getSubExpr())->getSubExpr())
+        cast<MipsMCExpr>(cast<MipsMCExpr>(Expr.getSubExpr())->getSubExpr())
             ->getSubExpr();
     if (!SubExpr->evaluateAsRelocatable(Res, Asm))
       return false;
@@ -184,8 +185,29 @@ bool MipsMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
     return true;
   }
 
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
     return false;
-  Res.setSpecifier(specifier);
+  Res.setSpecifier(Expr.getSpecifier());
   return !Res.getSubSym();
 }
+
+void MipsELFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                          const MCSpecifierExpr &Expr) const {
+  printImpl(*this, OS, Expr);
+}
+
+bool MipsELFMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                                 MCValue &Res,
+                                                 const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
+void MipsCOFFMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                           const MCSpecifierExpr &Expr) const {
+  printImpl(*this, OS, Expr);
+}
+
+bool MipsCOFFMCAsmInfo::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index d8b96f8b568c7..39699fdb98272 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -27,6 +27,10 @@ class MipsELFMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit MipsELFMCAsmInfo(const Triple &TheTriple,
                             const MCTargetOptions &Options);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 class MipsCOFFMCAsmInfo : public MCAsmInfoGNUCOFF {
@@ -34,6 +38,10 @@ class MipsCOFFMCAsmInfo : public MCAsmInfoGNUCOFF {
 
 public:
   explicit MipsCOFFMCAsmInfo();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 namespace Mips {
@@ -66,6 +74,8 @@ enum {
   S_TPREL_LO,
   S_Special,
 };
+
+bool isGpOff(const MCSpecifierExpr &E);
 }
 
 } // namespace llvm
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 4035618e02526..d2981c4ad4d20 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -642,7 +642,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
       break;
     case Mips::S_LO:
       // Check for %lo(%neg(%gp_rel(X)))
-      if (MipsExpr->isGpOff())
+      if (Mips::isGpOff(*MipsExpr))
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_LO
                                      : Mips::fixup_Mips_GPOFF_LO;
       else
@@ -659,7 +659,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
       break;
     case Mips::S_HI:
       // Check for %hi(%neg(%gp_rel(X)))
-      if (MipsExpr->isGpOff())
+      if (Mips::isGpOff(*MipsExpr))
         FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GPOFF_HI
                                      : Mips::fixup_Mips_GPOFF_HI;
       else
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 821f662f0cbfb..280d944f2fbb3 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -37,3 +37,10 @@ const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
   return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx),
                 Ctx);
 }
+
+void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  if (MAI)
+    MAI->printExpr(OS, *this);
+  else // llc -asm-show-inst
+    MipsELFMCAsmInfo(Triple(), MCTargetOptions()).printExpr(OS, *this);
+}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 216077a1aa489..91ec094821857 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -31,14 +31,6 @@ class MipsMCExpr : public MCSpecifierExpr {
                                        MCContext &Ctx);
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-
-  bool isGpOff(Specifier &S) const;
-  bool isGpOff() const {
-    Specifier S;
-    return isGpOff(S);
-  }
 };
 
 } // end namespace llvm

From ba7369c49c6f638a4ce6f6be3acbdab5e0b5f418 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 16 Jun 2025 10:46:05 +0900
Subject: [PATCH 538/851] WebAssembly: Move runtime libcall setting out of
 TargetLowering (#142624)

RuntimeLibcallInfo needs to be correct outside of codegen contexts.
---
 .../wasm/lto/Inputs/libcall-return-addr.ll     |  6 ------
 lld/test/wasm/lto/libcall-return-addr.ll       | 18 ------------------
 llvm/lib/IR/RuntimeLibcalls.cpp                |  5 +++++
 .../WebAssembly/WebAssemblyISelLowering.cpp    |  5 -----
 4 files changed, 5 insertions(+), 29 deletions(-)
 delete mode 100644 lld/test/wasm/lto/Inputs/libcall-return-addr.ll
 delete mode 100644 lld/test/wasm/lto/libcall-return-addr.ll

diff --git a/lld/test/wasm/lto/Inputs/libcall-return-addr.ll b/lld/test/wasm/lto/Inputs/libcall-return-addr.ll
deleted file mode 100644
index 271bdae11e49d..0000000000000
--- a/lld/test/wasm/lto/Inputs/libcall-return-addr.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20"
-target triple = "wasm32-unknown-emscripten"
-
-define ptr @emscripten_return_address() {
-  ret ptr null
-}
diff --git a/lld/test/wasm/lto/libcall-return-addr.ll b/lld/test/wasm/lto/libcall-return-addr.ll
deleted file mode 100644
index 74eba74f97018..0000000000000
--- a/lld/test/wasm/lto/libcall-return-addr.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llvm-as %s -o %t.o
-; RUN: llvm-as %p/Inputs/libcall-return-addr.ll -o %t.return-addr.o
-; RUN: rm -f %t.a
-; RUN: llvm-ar rcs %t.a %t.return-addr.o
-; RUN: not wasm-ld --export-all %t.o %t.a -o %t.wasm 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20"
-target triple = "wasm32-unknown-emscripten"
-
-@g_ptr = global ptr null
-
-define void @_start() {
-  %addr = call ptr @llvm.returnaddress(i32 1)
-  store ptr %addr, ptr @g_ptr
-  ret void
-}
-
-; CHECK: wasm-ld: error: {{.*}}return-addr.o): attempt to add bitcode file after LTO (emscripten_return_address)
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index d84c56f0af5c6..d655f84b37c50 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -531,6 +531,11 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
       setLibcallName(RTLIB::MULO_I64, nullptr);
     }
     setLibcallName(RTLIB::MULO_I128, nullptr);
+  } else {
+    // Define the emscripten name for return address helper.
+    // TODO: when implementing other Wasm backends, make this generic or only do
+    // this on emscripten depending on what they end up doing.
+    setLibcallName(RTLIB::RETURN_ADDRESS, "emscripten_return_address");
   }
 
   if (TT.isSystemZ() && TT.isOSzOS()) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index aac3473311192..3cd923c0ba058 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -385,11 +385,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
 
   setMaxAtomicSizeInBitsSupported(64);
 
-  // Define the emscripten name for return address helper.
-  // TODO: when implementing other Wasm backends, make this generic or only do
-  // this on emscripten depending on what they end up doing.
-  setLibcallName(RTLIB::RETURN_ADDRESS, "emscripten_return_address");
-
   // Always convert switches to br_tables unless there is only one case, which
   // is equivalent to a simple branch. This reduces code size for wasm, and we
   // defer possible jump table optimizations to the VM.

From 993c158a30b9ddc881e55efcd33e33abc10f3a5c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 18:46:50 -0700
Subject: [PATCH 539/851] MIPS: Reduce MipsMCExpr uses

---
 .../Target/Mips/AsmParser/MipsAsmParser.cpp   | 35 ++++++++-----------
 .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp       |  7 ++--
 .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h  |  1 +
 llvm/lib/Target/Mips/MipsMCInstLower.cpp      | 12 +++----
 llvm/lib/Target/Mips/MipsMCInstLower.h        |  4 +--
 5 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 8d9c3a96b32a1..7ea7c58f1a512 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -3032,7 +3032,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       return false;
     }
 
-    const MipsMCExpr *GotExpr = nullptr;
+    const MCSpecifierExpr *GotExpr = nullptr;
     const MCExpr *LoExpr = nullptr;
     if (ABI.IsN32() || ABI.IsN64()) {
       // The remaining cases are:
@@ -3097,10 +3097,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     return false;
   }
 
-  const MipsMCExpr *HiExpr =
-      MipsMCExpr::create(Mips::S_HI, SymExpr, getContext());
-  const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
+  const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, SymExpr, getContext());
+  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
 
   // This is the 64-bit symbol address expansion.
   if (ABI.ArePtrs64bit() && isGP64bit()) {
@@ -3111,9 +3109,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     // If it is not available we exit if the destination is the same as the
     // source register.
 
-    const MipsMCExpr *HighestExpr =
+    const auto *HighestExpr =
         MipsMCExpr::create(Mips::S_HIGHEST, SymExpr, getContext());
-    const MipsMCExpr *HigherExpr =
+    const auto *HigherExpr =
         MipsMCExpr::create(Mips::S_HIGHER, SymExpr, getContext());
 
     bool RdRegIsRsReg =
@@ -3312,8 +3310,7 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
 
   if(IsPicEnabled) {
     const MCExpr *GotSym = MCSymbolRefExpr::create(Sym, getContext());
-    const MipsMCExpr *GotExpr =
-        MipsMCExpr::create(Mips::S_GOT, GotSym, getContext());
+    const auto *GotExpr = MipsMCExpr::create(Mips::S_GOT, GotSym, getContext());
 
     if(isABI_O32() || isABI_N32()) {
       TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr),
@@ -3324,8 +3321,7 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
     }
   } else { //!IsPicEnabled
     const MCExpr *HiSym = MCSymbolRefExpr::create(Sym, getContext());
-    const MipsMCExpr *HiExpr =
-        MipsMCExpr::create(Mips::S_HI, HiSym, getContext());
+    const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, HiSym, getContext());
 
     // FIXME: This is technically correct but gives a different result to gas,
     // but gas is incomplete there (it has a fixme noting it doesn't work with
@@ -3337,10 +3333,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
       TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
     } else { //isABI_N64()
       const MCExpr *HighestSym = MCSymbolRefExpr::create(Sym, getContext());
-      const MipsMCExpr *HighestExpr =
+      const auto *HighestExpr =
           MipsMCExpr::create(Mips::S_HIGHEST, HighestSym, getContext());
       const MCExpr *HigherSym = MCSymbolRefExpr::create(Sym, getContext());
-      const MipsMCExpr *HigherExpr =
+      const auto *HigherExpr =
           MipsMCExpr::create(Mips::S_HIGHER, HigherSym, getContext());
 
       TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
@@ -3428,8 +3424,7 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3479,8 +3474,7 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3560,8 +3554,7 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const MipsMCExpr *LoExpr =
-      MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -6353,7 +6346,7 @@ MCRegister MipsAsmParser::getReg(int RC, int RegNo) {
 // e.g. "%lo foo", "(%lo(foo))", "%lo(foo)+1".
 const MCExpr *MipsAsmParser::parseRelocExpr() {
   auto getOp = [](StringRef Op) {
-    return StringSwitch<MipsMCExpr::Specifier>(Op)
+    return StringSwitch<Mips::Specifier>(Op)
         .Case("call16", Mips::S_GOT_CALL)
         .Case("call_hi", Mips::S_CALL_HI16)
         .Case("call_lo", Mips::S_CALL_LO16)
@@ -6384,7 +6377,7 @@ const MCExpr *MipsAsmParser::parseRelocExpr() {
   MCAsmParser &Parser = getParser();
   StringRef Name;
   const MCExpr *Res = nullptr;
-  SmallVector<MipsMCExpr::Specifier, 0> Ops;
+  SmallVector<Mips::Specifier, 0> Ops;
   while (parseOptionalToken(AsmToken::Percent)) {
     if (Parser.parseIdentifier(Name) ||
         Parser.parseToken(AsmToken::LParen, "expected '('"))
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 9b2b25c60c946..b64f86f382974 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -157,8 +157,8 @@ static void printImpl(const MCAsmInfo &MAI, raw_ostream &OS,
 
 bool Mips::isGpOff(const MCSpecifierExpr &E) {
   if (E.getSpecifier() == Mips::S_HI || E.getSpecifier() == Mips::S_LO) {
-    if (const MipsMCExpr *S1 = dyn_cast<const MipsMCExpr>(E.getSubExpr())) {
-      if (const MipsMCExpr *S2 = dyn_cast<const MipsMCExpr>(S1->getSubExpr())) {
+    if (const auto *S1 = dyn_cast<const MCSpecifierExpr>(E.getSubExpr())) {
+      if (const auto *S2 = dyn_cast<const MCSpecifierExpr>(S1->getSubExpr())) {
         if (S1->getSpecifier() == Mips::S_NEG &&
             S2->getSpecifier() == Mips::S_GPREL) {
           // S = E.getSpecifier();
@@ -176,7 +176,8 @@ static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
   // special cases.
   if (Mips::isGpOff(Expr)) {
     const MCExpr *SubExpr =
-        cast<MipsMCExpr>(cast<MipsMCExpr>(Expr.getSubExpr())->getSubExpr())
+        cast<MCSpecifierExpr>(
+            cast<MCSpecifierExpr>(Expr.getSubExpr())->getSubExpr())
             ->getSubExpr();
     if (!SubExpr->evaluateAsRelocatable(Res, Asm))
       return false;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 39699fdb98272..0975116328fc1 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -45,6 +45,7 @@ class MipsCOFFMCAsmInfo : public MCAsmInfoGNUCOFF {
 };
 
 namespace Mips {
+using Specifier = uint16_t;
 enum {
   S_None,
   S_CALL_HI16 = FirstTargetFixupKind,
diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
index 3c3690a7f983d..935fcd8fa7154 100644
--- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -35,7 +35,7 @@ void MipsMCInstLower::Initialize(MCContext *C) {
 MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                               MachineOperandType MOTy,
                                               int64_t Offset) const {
-  MipsMCExpr::Specifier TargetKind = Mips::S_None;
+  Mips::Specifier TargetKind = Mips::S_None;
   bool IsGpOff = false;
   const MCSymbol *Symbol;
   SmallString<128> Name;
@@ -211,7 +211,7 @@ MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
 
 MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
                                      MachineBasicBlock *BB2,
-                                     MipsMCExpr::Specifier Kind) const {
+                                     Mips::Specifier Kind) const {
   const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::create(BB1->getSymbol(), *Ctx);
   const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx);
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Sym1, Sym2, *Ctx);
@@ -226,7 +226,7 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
   // Lower register operand.
   OutMI.addOperand(LowerOperand(MI->getOperand(0)));
 
-  MipsMCExpr::Specifier Spec;
+  Mips::Specifier Spec;
   unsigned TargetFlags = MI->getOperand(1).getTargetFlags();
   switch (TargetFlags) {
   case MipsII::MO_HIGHEST:
@@ -248,7 +248,7 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
   if (MI->getNumOperands() == 2) {
     const MCExpr *Expr =
         MCSymbolRefExpr::create(MI->getOperand(1).getMBB()->getSymbol(), *Ctx);
-    const MipsMCExpr *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
+    const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
     OutMI.addOperand(MCOperand::createExpr(MipsExpr));
   } else if (MI->getNumOperands() == 3) {
     // Create %hi($tgt-$baltgt).
@@ -261,7 +261,7 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
                                            MCInst &OutMI, int Opcode) const {
   OutMI.setOpcode(Opcode);
 
-  MipsMCExpr::Specifier Spec;
+  Mips::Specifier Spec;
   unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
   switch (TargetFlags) {
   case MipsII::MO_HIGHEST:
@@ -290,7 +290,7 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
     // Lower register operand.
     const MCExpr *Expr =
         MCSymbolRefExpr::create(MI->getOperand(2).getMBB()->getSymbol(), *Ctx);
-    const MipsMCExpr *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
+    const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
     OutMI.addOperand(MCOperand::createExpr(MipsExpr));
   } else if (MI->getNumOperands() == 4) {
     // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.h b/llvm/lib/Target/Mips/MipsMCInstLower.h
index b6ddbe98955d1..a618c6fb7bfab 100644
--- a/llvm/lib/Target/Mips/MipsMCInstLower.h
+++ b/llvm/lib/Target/Mips/MipsMCInstLower.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
 #define LLVM_LIB_TARGET_MIPS_MIPSMCINSTLOWER_H
 
-#include "MCTargetDesc/MipsMCExpr.h"
+#include "MCTargetDesc/MipsMCAsmInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/Support/Compiler.h"
 
@@ -41,7 +41,7 @@ class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
   MCOperand LowerSymbolOperand(const MachineOperand &MO,
                                MachineOperandType MOTy, int64_t Offset) const;
   MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
-                      MipsMCExpr::Specifier Kind) const;
+                      Mips::Specifier Kind) const;
   void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
   void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI,
                             int Opcode) const;

From cf679e66fade71220535775cca895628bf7692af Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 18:59:16 -0700
Subject: [PATCH 540/851] SystemZ: Rename SystemZMCExpr::VK_ to SystemZ::S_

Prepare for removing SystemZMCExpr. Adopt the newer naming convention
used by most other targets.
---
 .../SystemZ/AsmParser/SystemZAsmParser.cpp    |  6 +-
 .../MCTargetDesc/SystemZELFObjectWriter.cpp   | 38 ++++-----
 .../MCTargetDesc/SystemZInstPrinterCommon.cpp |  6 +-
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 14 ++--
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.h   | 22 +++++
 .../SystemZ/MCTargetDesc/SystemZMCExpr.cpp    |  7 +-
 .../SystemZ/MCTargetDesc/SystemZMCExpr.h      | 18 ----
 llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 82 +++++++++----------
 .../lib/Target/SystemZ/SystemZMCInstLower.cpp |  8 +-
 .../SystemZ/SystemZTargetObjectFile.cpp       |  4 +-
 10 files changed, 100 insertions(+), 105 deletions(-)

diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 83c74c8a976dc..74a8822a12ac7 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -1707,12 +1707,12 @@ ParseStatus SystemZAsmParser::parsePCRel(OperandVector &Operands,
     if (Parser.getTok().isNot(AsmToken::Identifier))
       return Error(Parser.getTok().getLoc(), "unexpected token");
 
-    SystemZMCExpr::Specifier Kind = SystemZMCExpr::VK_None;
+    SystemZMCExpr::Specifier Kind = SystemZ::S_None;
     StringRef Name = Parser.getTok().getString();
     if (Name == "tls_gdcall")
-      Kind = SystemZMCExpr::VK_TLSGD;
+      Kind = SystemZ::S_TLSGD;
     else if (Name == "tls_ldcall")
-      Kind = SystemZMCExpr::VK_TLSLDM;
+      Kind = SystemZ::S_TLSLDM;
     else
       return Error(Parser.getTok().getLoc(), "unknown TLS tag");
     Parser.Lex();
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
index b44859d75df0f..8b5587ab71255 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "MCTargetDesc/SystemZMCFixups.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -103,14 +103,14 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                               bool IsPCRel) const {
   SMLoc Loc = Fixup.getLoc();
   unsigned Kind = Fixup.getKind();
-  auto Specifier = SystemZMCExpr::Specifier(Target.getSpecifier());
+  auto Specifier = SystemZ::Specifier(Target.getSpecifier());
   switch (Specifier) {
-  case SystemZMCExpr::VK_INDNTPOFF:
-  case SystemZMCExpr::VK_NTPOFF:
-  case SystemZMCExpr::VK_TLSGD:
-  case SystemZMCExpr::VK_TLSLD:
-  case SystemZMCExpr::VK_TLSLDM:
-  case SystemZMCExpr::VK_DTPOFF:
+  case SystemZ::S_INDNTPOFF:
+  case SystemZ::S_NTPOFF:
+  case SystemZ::S_TLSGD:
+  case SystemZ::S_TLSLD:
+  case SystemZ::S_TLSLDM:
+  case SystemZ::S_DTPOFF:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -119,12 +119,12 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
   }
 
   switch (Specifier) {
-  case SystemZMCExpr::VK_None:
+  case SystemZ::S_None:
     if (IsPCRel)
       return getPCRelReloc(Loc, Kind);
     return getAbsoluteReloc(Loc, Kind);
 
-  case SystemZMCExpr::VK_NTPOFF:
+  case SystemZ::S_NTPOFF:
     assert(!IsPCRel && "NTPOFF shouldn't be PC-relative");
     switch (Kind) {
     case FK_Data_4:
@@ -135,14 +135,14 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
     reportError(Loc, "Unsupported thread-local address (local-exec)");
     return 0;
 
-  case SystemZMCExpr::VK_INDNTPOFF:
+  case SystemZ::S_INDNTPOFF:
     if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
       return ELF::R_390_TLS_IEENT;
     reportError(Loc,
                 "Only PC-relative INDNTPOFF accesses are supported for now");
     return 0;
 
-  case SystemZMCExpr::VK_DTPOFF:
+  case SystemZ::S_DTPOFF:
     assert(!IsPCRel && "DTPOFF shouldn't be PC-relative");
     switch (Kind) {
     case FK_Data_4:
@@ -153,7 +153,7 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
     reportError(Loc, "Unsupported thread-local address (local-dynamic)");
     return 0;
 
-  case SystemZMCExpr::VK_TLSLDM:
+  case SystemZ::S_TLSLDM:
     assert(!IsPCRel && "TLSLDM shouldn't be PC-relative");
     switch (Kind) {
     case FK_Data_4:
@@ -166,7 +166,7 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
     reportError(Loc, "Unsupported thread-local address (local-dynamic)");
     return 0;
 
-  case SystemZMCExpr::VK_TLSGD:
+  case SystemZ::S_TLSGD:
     assert(!IsPCRel && "TLSGD shouldn't be PC-relative");
     switch (Kind) {
     case FK_Data_4:
@@ -179,14 +179,14 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
     reportError(Loc, "Unsupported thread-local address (general-dynamic)");
     return 0;
 
-  case SystemZMCExpr::VK_GOT:
-  case SystemZMCExpr::VK_GOTENT:
+  case SystemZ::S_GOT:
+  case SystemZ::S_GOTENT:
     if (IsPCRel && Kind == SystemZ::FK_390_PC32DBL)
       return ELF::R_390_GOTENT;
     reportError(Loc, "Only PC-relative GOT accesses are supported for now");
     return 0;
 
-  case SystemZMCExpr::VK_PLT:
+  case SystemZ::S_PLT:
     assert(IsPCRel && "@PLT shouldn't be PC-relative");
     switch (Kind) {
     case SystemZ::FK_390_PC12DBL:
@@ -209,8 +209,8 @@ unsigned SystemZELFObjectWriter::getRelocType(const MCFixup &Fixup,
 bool SystemZELFObjectWriter::needsRelocateWithSymbol(const MCValue &V,
                                                      unsigned Type) const {
   switch (V.getSpecifier()) {
-  case SystemZMCExpr::VK_GOT:
-  case SystemZMCExpr::VK_PLT:
+  case SystemZ::S_GOT:
+  case SystemZ::S_PLT:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
index 7fd1a1c2d801a..297fdc8325928 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinterCommon.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZInstPrinterCommon.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegister.h"
@@ -186,10 +186,10 @@ void SystemZInstPrinterCommon::printPCRelTLSOperand(const MCInst *MI,
     const MCOperand &MO = MI->getOperand(OpNum + 1);
     const MCSymbolRefExpr &refExp = cast<MCSymbolRefExpr>(*MO.getExpr());
     switch (refExp.getSpecifier()) {
-    case SystemZMCExpr::VK_TLSGD:
+    case SystemZ::S_TLSGD:
       O << ":tls_gdcall:";
       break;
-    case SystemZMCExpr::VK_TLSLDM:
+    case SystemZ::S_TLSLDM:
       O << ":tls_ldcall:";
       break;
     default:
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index e9d387399bf30..0f7341e6d03bb 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -14,15 +14,11 @@
 using namespace llvm;
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {SystemZMCExpr::VK_DTPOFF, "DTPOFF"},
-    {SystemZMCExpr::VK_GOT, "GOT"},
-    {SystemZMCExpr::VK_GOTENT, "GOTENT"},
-    {SystemZMCExpr::VK_INDNTPOFF, "INDNTPOFF"},
-    {SystemZMCExpr::VK_NTPOFF, "NTPOFF"},
-    {SystemZMCExpr::VK_PLT, "PLT"},
-    {SystemZMCExpr::VK_TLSGD, "TLSGD"},
-    {SystemZMCExpr::VK_TLSLD, "TLSLD"},
-    {SystemZMCExpr::VK_TLSLDM, "TLSLDM"},
+    {SystemZ::S_DTPOFF, "DTPOFF"}, {SystemZ::S_GOT, "GOT"},
+    {SystemZ::S_GOTENT, "GOTENT"}, {SystemZ::S_INDNTPOFF, "INDNTPOFF"},
+    {SystemZ::S_NTPOFF, "NTPOFF"}, {SystemZ::S_PLT, "PLT"},
+    {SystemZ::S_TLSGD, "TLSGD"},   {SystemZ::S_TLSLD, "TLSLD"},
+    {SystemZ::S_TLSLDM, "TLSLDM"},
 };
 
 SystemZMCAsmInfoELF::SystemZMCAsmInfoELF(const Triple &TT) {
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index 58b9a3dd652ef..6d7d669fa8e12 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -28,6 +28,28 @@ class SystemZMCAsmInfoGOFF : public MCAsmInfoGOFF {
   bool isAcceptableChar(char C) const override;
 };
 
+namespace SystemZ {
+using Specifier = uint16_t;
+enum {
+  S_None,
+
+  S_DTPOFF,
+  S_GOT,
+  S_GOTENT,
+  S_INDNTPOFF,
+  S_NTPOFF,
+  S_PLT,
+  S_TLSGD,
+  S_TLSLD,
+  S_TLSLDM,
+
+  // HLASM docs for address constants:
+  // https://www.ibm.com/docs/en/hla-and-tf/1.6?topic=value-address-constants
+  S_RCon, // Address of ADA of symbol.
+  S_VCon, // Address of external function symbol.
+};
+} // namespace SystemZ
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
index 6dcca60dcedda..7b82c0cb6609c 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMCExpr.h"
+#include "SystemZMCAsmInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 using namespace llvm;
@@ -20,11 +21,11 @@ const SystemZMCExpr *SystemZMCExpr::create(MCSpecifierExpr::Spec S,
 
 StringRef SystemZMCExpr::getVariantKindName() const {
   switch (getSpecifier()) {
-  case VK_None:
+  case SystemZ::S_None:
     return "A";
-  case VK_SystemZ_RCon:
+  case SystemZ::S_RCon:
     return "R";
-  case VK_SystemZ_VCon:
+  case SystemZ::S_VCon:
     return "V";
   default:
     llvm_unreachable("Invalid kind");
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
index ac1de97ecf0a1..8e730e50ae9dd 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
@@ -18,24 +18,6 @@ namespace llvm {
 class SystemZMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = Spec;
-  enum {
-    VK_None,
-
-    VK_DTPOFF = MCSymbolRefExpr::FirstTargetSpecifier,
-    VK_GOT,
-    VK_GOTENT,
-    VK_INDNTPOFF,
-    VK_NTPOFF,
-    VK_PLT,
-    VK_TLSGD,
-    VK_TLSLD,
-    VK_TLSLDM,
-
-    // HLASM docs for address constants:
-    // https://www.ibm.com/docs/en/hla-and-tf/1.6?topic=value-address-constants
-    VK_SystemZ_RCon, // Address of ADA of symbol.
-    VK_SystemZ_VCon, // Address of external function symbol.
-  };
 
 private:
   explicit SystemZMCExpr(const MCExpr *Expr, Spec S)
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index eb4b4c1647a13..d5e034b5a0096 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -14,7 +14,7 @@
 #include "SystemZAsmPrinter.h"
 #include "MCTargetDesc/SystemZGNUInstPrinter.h"
 #include "MCTargetDesc/SystemZHLASMInstPrinter.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "SystemZConstantPoolValue.h"
 #include "SystemZMCInstLower.h"
@@ -79,7 +79,7 @@ static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
 static const MCSymbolRefExpr *getTLSGetOffset(MCContext &Context) {
   StringRef Name = "__tls_get_offset";
   return MCSymbolRefExpr::create(Context.getOrCreateSymbol(Name),
-                                 SystemZMCExpr::VK_PLT, Context);
+                                 SystemZ::S_PLT, Context);
 }
 
 static const MCSymbolRefExpr *getGlobalOffsetTable(MCContext &Context) {
@@ -319,11 +319,10 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
     break;
 
   case SystemZ::CallBRASL_XPLINK64:
-    EmitToStreamer(
-        *OutStreamer,
-        MCInstBuilder(SystemZ::BRASL)
-            .addReg(SystemZ::R7D)
-            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZMCExpr::VK_PLT)));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRASL)
+                                     .addReg(SystemZ::R7D)
+                                     .addExpr(Lower.getExpr(MI->getOperand(0),
+                                                            SystemZ::S_PLT)));
     emitCallInformation(CallType::BRASL7);
     return;
 
@@ -380,10 +379,9 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
     return;
   }
   case SystemZ::CallBRASL:
-    LoweredMI =
-        MCInstBuilder(SystemZ::BRASL)
-            .addReg(SystemZ::R14D)
-            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZMCExpr::VK_PLT));
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+                    .addReg(SystemZ::R14D)
+                    .addExpr(Lower.getExpr(MI->getOperand(0), SystemZ::S_PLT));
     break;
 
   case SystemZ::CallBASR:
@@ -393,17 +391,15 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
     break;
 
   case SystemZ::CallJG:
-    LoweredMI =
-        MCInstBuilder(SystemZ::JG)
-            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZMCExpr::VK_PLT));
+    LoweredMI = MCInstBuilder(SystemZ::JG)
+                    .addExpr(Lower.getExpr(MI->getOperand(0), SystemZ::S_PLT));
     break;
 
   case SystemZ::CallBRCL:
-    LoweredMI =
-        MCInstBuilder(SystemZ::BRCL)
-            .addImm(MI->getOperand(0).getImm())
-            .addImm(MI->getOperand(1).getImm())
-            .addExpr(Lower.getExpr(MI->getOperand(2), SystemZMCExpr::VK_PLT));
+    LoweredMI = MCInstBuilder(SystemZ::BRCL)
+                    .addImm(MI->getOperand(0).getImm())
+                    .addImm(MI->getOperand(1).getImm())
+                    .addExpr(Lower.getExpr(MI->getOperand(2), SystemZ::S_PLT));
     break;
 
   case SystemZ::CallBR:
@@ -495,15 +491,15 @@ void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
         MCInstBuilder(SystemZ::BRASL)
             .addReg(SystemZ::R14D)
             .addExpr(getTLSGetOffset(MF->getContext()))
-            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZMCExpr::VK_TLSGD));
+            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZ::S_TLSGD));
     break;
 
   case SystemZ::TLS_LDCALL:
-    LoweredMI = MCInstBuilder(SystemZ::BRASL)
-                    .addReg(SystemZ::R14D)
-                    .addExpr(getTLSGetOffset(MF->getContext()))
-                    .addExpr(Lower.getExpr(MI->getOperand(0),
-                                           SystemZMCExpr::VK_TLSLDM));
+    LoweredMI =
+        MCInstBuilder(SystemZ::BRASL)
+            .addReg(SystemZ::R14D)
+            .addExpr(getTLSGetOffset(MF->getContext()))
+            .addExpr(Lower.getExpr(MI->getOperand(0), SystemZ::S_TLSLDM));
     break;
 
   case SystemZ::GOT:
@@ -798,7 +794,7 @@ void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
 
   MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
   const MCSymbolRefExpr *Op =
-      MCSymbolRefExpr::create(fentry, SystemZMCExpr::VK_PLT, Ctx);
+      MCSymbolRefExpr::create(fentry, SystemZ::S_PLT, Ctx);
   OutStreamer->emitInstruction(
       MCInstBuilder(SystemZ::BRASL).addReg(SystemZ::R0D).addExpr(Op),
       getSubtargetInfo());
@@ -880,7 +876,7 @@ void SystemZAsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
       EncodedBytes += 2;
     }
   } else if (CalleeMO.isGlobal()) {
-    const MCExpr *Expr = Lower.getExpr(CalleeMO, SystemZMCExpr::VK_PLT);
+    const MCExpr *Expr = Lower.getExpr(CalleeMO, SystemZ::S_PLT);
     EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRASL)
                                    .addReg(SystemZ::R14D)
                                    .addExpr(Expr));
@@ -923,11 +919,10 @@ void SystemZAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(
   EmitNop(OutContext, *OutStreamer, 2, getSubtargetInfo());
   EmitToStreamer(*OutStreamer,
                  MCInstBuilder(SystemZ::LLILF).addReg(SystemZ::R2D).addImm(0));
-  EmitToStreamer(*OutStreamer,
-                 MCInstBuilder(SystemZ::BRASL)
-                     .addReg(SystemZ::R14D)
-                     .addExpr(MCSymbolRefExpr::create(
-                         FuncEntry, SystemZMCExpr::VK_PLT, OutContext)));
+  EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::BRASL)
+                                   .addReg(SystemZ::R14D)
+                                   .addExpr(MCSymbolRefExpr::create(
+                                       FuncEntry, SystemZ::S_PLT, OutContext)));
   OutStreamer->emitLabel(EndOfSled);
   recordSled(BeginOfSled, MI, SledKind::FUNCTION_ENTER, 2);
 }
@@ -967,10 +962,9 @@ void SystemZAsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
   EmitNop(OutContext, *OutStreamer, 4, getSubtargetInfo());
   EmitToStreamer(*OutStreamer,
                  MCInstBuilder(SystemZ::LLILF).addReg(SystemZ::R2D).addImm(0));
-  EmitToStreamer(*OutStreamer,
-                 MCInstBuilder(SystemZ::J)
-                     .addExpr(MCSymbolRefExpr::create(
-                         FuncExit, SystemZMCExpr::VK_PLT, OutContext)));
+  EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::J)
+                                   .addExpr(MCSymbolRefExpr::create(
+                                       FuncExit, SystemZ::S_PLT, OutContext)));
   if (FallthroughLabel)
     OutStreamer->emitLabel(FallthroughLabel);
   recordSled(BeginOfSled, MI, SledKind::FUNCTION_EXIT, 2);
@@ -992,13 +986,13 @@ void SystemZAsmPrinter::emitAttributes(Module &M) {
 static uint8_t getSpecifierFromModifier(SystemZCP::SystemZCPModifier Modifier) {
   switch (Modifier) {
   case SystemZCP::TLSGD:
-    return SystemZMCExpr::VK_TLSGD;
+    return SystemZ::S_TLSGD;
   case SystemZCP::TLSLDM:
-    return SystemZMCExpr::VK_TLSLDM;
+    return SystemZ::S_TLSLDM;
   case SystemZCP::DTPOFF:
-    return SystemZMCExpr::VK_DTPOFF;
+    return SystemZ::S_DTPOFF;
   case SystemZCP::NTPOFF:
-    return SystemZMCExpr::VK_NTPOFF;
+    return SystemZ::S_NTPOFF;
   }
   llvm_unreachable("Invalid SystemCPModifier!");
 }
@@ -1145,12 +1139,12 @@ void SystemZAsmPrinter::emitADASection() {
       // imported functions, that are placed in the ADA to be 8 byte aligned.
       EMIT_COMMENT("function descriptor of");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZMCExpr::VK_SystemZ_RCon,
+          SystemZMCExpr::create(SystemZ::S_RCon,
                                 MCSymbolRefExpr::create(Sym, OutContext),
                                 OutContext),
           PointerSize);
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZMCExpr::VK_SystemZ_VCon,
+          SystemZMCExpr::create(SystemZ::S_VCon,
                                 MCSymbolRefExpr::create(Sym, OutContext),
                                 OutContext),
           PointerSize);
@@ -1159,7 +1153,7 @@ void SystemZAsmPrinter::emitADASection() {
     case SystemZII::MO_ADA_DATA_SYMBOL_ADDR:
       EMIT_COMMENT("pointer to data symbol");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZMCExpr::VK_None,
+          SystemZMCExpr::create(SystemZ::S_None,
                                 MCSymbolRefExpr::create(Sym, OutContext),
                                 OutContext),
           PointerSize);
@@ -1174,7 +1168,7 @@ void SystemZAsmPrinter::emitADASection() {
 
       EMIT_COMMENT("pointer to function descriptor");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZMCExpr::VK_SystemZ_VCon,
+          SystemZMCExpr::create(SystemZ::S_VCon,
                                 MCSymbolRefExpr::create(Alias, OutContext),
                                 OutContext),
           PointerSize);
diff --git a/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
index 1aa71618082fb..4a68c5d6462d7 100644
--- a/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMCInstLower.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "SystemZAsmPrinter.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/MC/MCExpr.h"
@@ -20,11 +20,11 @@ using namespace llvm;
 static SystemZMCExpr::Specifier getSpecifierForTFlags(unsigned Flags) {
   switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
     case 0:
-      return SystemZMCExpr::VK_None;
+      return SystemZ::S_None;
     case SystemZII::MO_GOT:
-      return SystemZMCExpr::VK_GOT;
+      return SystemZ::S_GOT;
     case SystemZII::MO_INDNTPOFF:
-      return SystemZMCExpr::VK_INDNTPOFF;
+      return SystemZ::S_INDNTPOFF;
   }
   llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.cpp b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.cpp
index 7d22c26ff9a86..ae90c51432fe9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetObjectFile.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZTargetObjectFile.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Target/TargetMachine.h"
 
@@ -15,5 +15,5 @@ using namespace llvm;
 
 const MCExpr *SystemZELFTargetObjectFile::getDebugThreadLocalSymbol(
     const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::create(Sym, SystemZMCExpr::VK_DTPOFF, getContext());
+  return MCSymbolRefExpr::create(Sym, SystemZ::S_DTPOFF, getContext());
 }

From d64ee2cd4fe488b6dc21e7a8173fbb9cf3610ba0 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Mon, 16 Jun 2025 10:12:51 +0800
Subject: [PATCH 541/851] [RISCV] Add GetVTypeMinimalPredicates for the
 operation supported by zvfhmin. NFC. (#143847)

This patch adds a new `GetVTypeMinimalPredicates` for `f16` operation
supported by `Zvfhmin`. Split the type predicates for minimal support
and full compute support. This is a refactor patch for implementing
vector compute support for bf16 (Zvfbfa), that we can check `bf16` type
whether with `Zvfbfa` extension in `GetVTypePredicates`.
---
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    | 68 ++++++++++---------
 .../Target/RISCV/RISCVInstrInfoVSDPatterns.td |  8 +--
 .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 13 ++--
 3 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index f9fc6f0be3804..22b5b52541d61 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -783,6 +783,15 @@ class GetVTypePredicates<VTypeInfo vti> {
                                      true : [HasVInstructions]);
 }
 
+class GetVTypeMinimalPredicates<VTypeInfo vti> {
+  list<Predicate> Predicates = !cond(!eq(vti.Scalar, f16) : [HasVInstructionsF16Minimal],
+                                     !eq(vti.Scalar, bf16) : [HasVInstructionsBF16Minimal],
+                                     !eq(vti.Scalar, f32) : [HasVInstructionsAnyF],
+                                     !eq(vti.Scalar, f64) : [HasVInstructionsF64],
+                                     !eq(vti.SEW, 64) : [HasVInstructionsI64],
+                                     true : [HasVInstructions]);
+}
+
 class VPseudoUSLoadNoMask<VReg RetClass,
                           int EEW,
                           DAGOperand sewop = sew> :
@@ -4568,7 +4577,7 @@ multiclass VPatUnaryS_M<string intrinsic_name,
 multiclass VPatUnaryV_V_AnyMask<string intrinsic, string instruction,
                                 list<VTypeInfo> vtilist> {
   foreach vti = vtilist in {
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     def : VPatUnaryAnyMask<intrinsic, instruction, "VM",
                            vti.Vector, vti.Vector, vti.Mask,
                            vti.Log2SEW, vti.LMul, vti.RegClass, vti.RegClass>;
@@ -4887,7 +4896,7 @@ multiclass VPatBinaryV_VV_INT<string intrinsic, string instruction,
                               list<VTypeInfo> vtilist> {
   foreach vti = vtilist in {
     defvar ivti = GetIntVTypeInfo<vti>.Vti;
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatBinary<intrinsic,
                       instruction # "_VV_" # vti.LMul.MX # "_E" # vti.SEW,
                       vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
@@ -4950,7 +4959,7 @@ multiclass VPatBinaryV_VX_RM<string intrinsic, string instruction,
 multiclass VPatBinaryV_VX_INT<string intrinsic, string instruction,
                           list<VTypeInfo> vtilist> {
   foreach vti = vtilist in
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatBinary<intrinsic, instruction # "_VX_" # vti.LMul.MX,
                       vti.Vector, vti.Vector, XLenVT, vti.Mask,
                       vti.Log2SEW, vti.RegClass,
@@ -4979,6 +4988,16 @@ multiclass VPatBinaryV_VI_RM<string intrinsic, string instruction,
                                   vti.RegClass, imm_type>;
 }
 
+multiclass VPatBinaryV_VI_INT<string intrinsic, string instruction,
+                              list<VTypeInfo> vtilist, Operand imm_type> {
+  foreach vti = vtilist in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
+    defm : VPatBinary<intrinsic, instruction # "_VI_" # vti.LMul.MX,
+                      vti.Vector, vti.Vector, XLenVT, vti.Mask,
+                      vti.Log2SEW, vti.RegClass,
+                      vti.RegClass, imm_type>;
+}
+
 multiclass VPatBinaryM_MM<string intrinsic, string instruction> {
   foreach mti = AllMasks in
     let Predicates = [HasVInstructions] in
@@ -5709,7 +5728,7 @@ multiclass VPatBinaryV_VV_VX_VI_INT<string intrinsic, string instruction,
                                     list<VTypeInfo> vtilist, Operand ImmType>
     : VPatBinaryV_VV_INT<intrinsic#"_vv", instruction, vtilist>,
       VPatBinaryV_VX_INT<intrinsic#"_vx", instruction, vtilist>,
-      VPatBinaryV_VI<intrinsic#"_vx", instruction, vtilist, ImmType>;
+      VPatBinaryV_VI_INT<intrinsic#"_vx", instruction, vtilist, ImmType>;
 
 multiclass VPatReductionV_VS<string intrinsic, string instruction, bit IsFloat = 0> {
   foreach vti = !if(IsFloat, NoGroupFloatVectors, NoGroupIntegerVectors) in {
@@ -5887,12 +5906,11 @@ multiclass VPatConversionWF_VF<string intrinsic, string instruction,
     defvar fvti = fvtiToFWti.Vti;
     defvar fwti = fvtiToFWti.Wti;
     // Define vfwcvt.f.f.v for f16 when Zvfhmin is enable.
-    let Predicates = !if(!eq(fvti.Scalar, f16), [HasVInstructionsF16Minimal],
-                         !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                     GetVTypePredicates<fwti>.Predicates)) in
-      defm : VPatConversion<intrinsic, instruction, "V",
-                            fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
-                            fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
+    let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                                 GetVTypeMinimalPredicates<fwti>.Predicates) in
+    defm : VPatConversion<intrinsic, instruction, "V",
+                          fwti.Vector, fvti.Vector, fwti.Mask, fvti.Log2SEW,
+                          fvti.LMul, fwti.RegClass, fvti.RegClass, isSEWAware>;
   }
 }
 
@@ -5979,8 +5997,9 @@ multiclass VPatConversionVF_WF_RM<string intrinsic, string instruction,
   foreach fvtiToFWti = wlist in {
     defvar fvti = fvtiToFWti.Vti;
     defvar fwti = fvtiToFWti.Wti;
-    let Predicates = !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                 GetVTypePredicates<fwti>.Predicates) in
+    // Define vfncvt.f.f.w for f16 when Zvfhmin is enable.
+    let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                                 GetVTypeMinimalPredicates<fwti>.Predicates) in
     defm : VPatConversionRoundingMode<intrinsic, instruction, "W",
                                       fvti.Vector, fwti.Vector, fvti.Mask, fvti.Log2SEW,
                                       fvti.LMul, fvti.RegClass, fwti.RegClass,
@@ -6999,8 +7018,7 @@ defm : VPatBinaryV_VM_XM_IM<"int_riscv_vmerge", "PseudoVMERGE">;
 // 11.16. Vector Integer Move Instructions
 //===----------------------------------------------------------------------===//
 foreach vti = AllVectors in {
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in {
+  let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in {
     def : Pat<(vti.Vector (int_riscv_vmv_v_v (vti.Vector vti.RegClass:$passthru),
                                              (vti.Vector vti.RegClass:$rs1),
                                              VLOpFrag)),
@@ -7195,8 +7213,7 @@ defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">;
 // NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses
 // int_riscv_vmerge. Support both for compatibility.
 foreach vti = AllFloatVectors in {
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in
+  let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM",
                                  vti.Vector,
                                  vti.Vector, vti.Vector, vti.Mask,
@@ -7275,16 +7292,8 @@ defm : VPatConversionVF_WI_RM<"int_riscv_vfncvt_f_xu_w", "PseudoVFNCVT_F_XU",
                               isSEWAware=1>;
 defm : VPatConversionVF_WI_RM<"int_riscv_vfncvt_f_x_w", "PseudoVFNCVT_F_X",
                               isSEWAware=1>;
-defvar WidenableFloatVectorsExceptF16 = !filter(fvtiToFWti, AllWidenableFloatVectors,
-                                                !ne(fvtiToFWti.Vti.Scalar, f16));
-defm : VPatConversionVF_WF_RM<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F",
-                           WidenableFloatVectorsExceptF16, isSEWAware=1>;
-// Define vfncvt.f.f.w for f16 when Zvfhmin is enable.
-defvar F16WidenableFloatVectors = !filter(fvtiToFWti, AllWidenableFloatVectors,
-                                          !eq(fvtiToFWti.Vti.Scalar, f16));
-let Predicates = [HasVInstructionsF16Minimal] in
 defm : VPatConversionVF_WF_RM<"int_riscv_vfncvt_f_f_w", "PseudoVFNCVT_F_F",
-                           F16WidenableFloatVectors, isSEWAware=1>;
+                              AllWidenableFloatVectors, isSEWAware=1>;
 defm : VPatConversionVF_WF_BF_RM<"int_riscv_vfncvtbf16_f_f_w", 
                                  "PseudoVFNCVTBF16_F_F", isSEWAware=1>;
 defm : VPatConversionVF_WF<"int_riscv_vfncvt_rod_f_f_w", "PseudoVFNCVT_ROD_F_F",
@@ -7419,10 +7428,7 @@ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
                               eew=16, vtilist=AllIntegerVectors>;
 
 defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
-                                AllFloatVectorsExceptFP16, uimm5>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
-                                  AllFP16Vectors, uimm5>;
+                                AllFloatVectors, uimm5>;
 defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
                                 AllBFloatVectors, uimm5>;
 defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
@@ -7431,9 +7437,7 @@ defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
 // 16.5. Vector Compress Instruction
 //===----------------------------------------------------------------------===//
 defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>;
-defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectorsExceptFP16>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFP16Vectors>;
+defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>;
 defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllBFloatVectors>;
 
 // Include the non-intrinsic ISel patterns
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index e318a78285a2e..520959b0896f7 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -864,8 +864,7 @@ multiclass VPatAVGADD_VV_VX_RM<SDNode vop, int vxrm, string suffix = ""> {
 
 // 7.4. Vector Unit-Stride Instructions
 foreach vti = AllVectors in
-  let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       GetVTypePredicates<vti>.Predicates) in 
+  let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
   defm : VPatUSLoadStoreSDNode<vti.Vector, vti.RegClass, vti.Log2SEW, vti.LMul,
                                vti.AVL, vti.RegClass>;
 foreach mti = AllMasks in
@@ -1449,9 +1448,8 @@ defm : VPatNConvertI2FPSDNode_W_RM<any_uint_to_fp, "PseudoVFNCVT_F_XU_W">;
 foreach fvtiToFWti = AllWidenableFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  let Predicates = !if(!eq(fvti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                   GetVTypePredicates<fwti>.Predicates)) in
+  let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                               GetVTypeMinimalPredicates<fwti>.Predicates) in
   def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
             (!cast<Instruction>("PseudoVFNCVT_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
                 (fvti.Vector (IMPLICIT_DEF)),
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index b54c2b042b4dd..6328e6c860f71 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2683,9 +2683,9 @@ defm : VPatWConvertI2FPVL_V<any_riscv_sint_to_fp_vl, "PseudoVFWCVT_F_X_V">;
 foreach fvtiToFWti = AllWidenableFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  let Predicates = !if(!eq(fvti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                   GetVTypePredicates<fwti>.Predicates)) in
+  // Define vfwcvt.f.f.v for f16 when Zvfhmin is enable.
+  let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                               GetVTypeMinimalPredicates<fwti>.Predicates) in
   def : Pat<(fwti.Vector (any_riscv_fpextend_vl
                              (fvti.Vector fvti.RegClass:$rs1),
                              (fvti.Mask VMV0:$vm),
@@ -2726,10 +2726,9 @@ defm : VPatNConvertI2FP_RM_VL_W<riscv_vfcvt_rm_f_x_vl, "PseudoVFNCVT_F_X_W">;
 foreach fvtiToFWti = AllWidenableFloatVectors in {
   defvar fvti = fvtiToFWti.Vti;
   defvar fwti = fvtiToFWti.Wti;
-  // Define vfwcvt.f.f.v for f16 when Zvfhmin is enable.
-  let Predicates = !if(!eq(fvti.Scalar, f16), [HasVInstructionsF16Minimal],
-                       !listconcat(GetVTypePredicates<fvti>.Predicates,
-                                   GetVTypePredicates<fwti>.Predicates)) in {
+  // Define vfncvt.f.f.w for f16 when Zvfhmin is enable.
+  let Predicates = !listconcat(GetVTypeMinimalPredicates<fvti>.Predicates,
+                               GetVTypeMinimalPredicates<fwti>.Predicates) in {
     def : Pat<(fvti.Vector (any_riscv_fpround_vl
                                (fwti.Vector fwti.RegClass:$rs1),
                                (fwti.Mask VMV0:$vm), VLOpFrag)),

From b591f6dad4079401fadc4a516b32d3900b7946de Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 19:26:40 -0700
Subject: [PATCH 542/851] SystemZ: Migrate to newer relocation specifier
 representation

z/OS creates SystemZMCExpr objects (https://reviews.llvm.org/D153788)
while ELF doesn't. Define the SystemZMCAsmInfoGOFF hooks
instead of the legacy MCSpecifierExpr:: hooks.
---
 .../SystemZ/AsmParser/SystemZAsmParser.cpp    |  3 +-
 .../SystemZ/MCTargetDesc/CMakeLists.txt       |  1 -
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 30 +++++++++++-
 .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.h   |  4 ++
 .../SystemZ/MCTargetDesc/SystemZMCExpr.cpp    | 47 -------------------
 .../SystemZ/MCTargetDesc/SystemZMCExpr.h      | 38 ---------------
 llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp | 20 ++++----
 .../lib/Target/SystemZ/SystemZMCInstLower.cpp |  8 ++--
 llvm/lib/Target/SystemZ/SystemZMCInstLower.h  |  5 +-
 9 files changed, 48 insertions(+), 108 deletions(-)
 delete mode 100644 llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
 delete mode 100644 llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h

diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 74a8822a12ac7..6ee2a87565baa 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -8,7 +8,6 @@
 
 #include "MCTargetDesc/SystemZGNUInstPrinter.h"
 #include "MCTargetDesc/SystemZMCAsmInfo.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "MCTargetDesc/SystemZTargetStreamer.h"
 #include "TargetInfo/SystemZTargetInfo.h"
@@ -1707,7 +1706,7 @@ ParseStatus SystemZAsmParser::parsePCRel(OperandVector &Operands,
     if (Parser.getTok().isNot(AsmToken::Identifier))
       return Error(Parser.getTok().getLoc(), "unexpected token");
 
-    SystemZMCExpr::Specifier Kind = SystemZ::S_None;
+    auto Kind = SystemZ::S_None;
     StringRef Name = Parser.getTok().getString();
     if (Name == "tls_gdcall")
       Kind = SystemZ::S_TLSGD;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
index c95445637d0b2..28f7ced8d7ce7 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
@@ -8,7 +8,6 @@ add_llvm_component_library(LLVMSystemZDesc
   SystemZMCAsmBackend.cpp
   SystemZMCAsmInfo.cpp
   SystemZMCCodeEmitter.cpp
-  SystemZMCExpr.cpp
   SystemZMCTargetDesc.cpp
   SystemZTargetStreamer.cpp
 
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index 0f7341e6d03bb..052875bf0d3f6 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -7,9 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZMCAsmInfo.h"
-#include "MCTargetDesc/SystemZMCExpr.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
 
 using namespace llvm;
 
@@ -58,3 +58,31 @@ SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) {
 bool SystemZMCAsmInfoGOFF::isAcceptableChar(char C) const {
   return MCAsmInfo::isAcceptableChar(C) || C == '#';
 }
+
+void SystemZMCAsmInfoGOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  switch (Expr.getSpecifier()) {
+  case SystemZ::S_None:
+    OS << "A";
+    break;
+  case SystemZ::S_RCon:
+    OS << "R";
+    break;
+  case SystemZ::S_VCon:
+    OS << "V";
+    break;
+  default:
+    llvm_unreachable("Invalid kind");
+  }
+  OS << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  OS << ')';
+}
+
+bool SystemZMCAsmInfoGOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(Expr.getSpecifier());
+  return true;
+}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index 6d7d669fa8e12..11c2833b8ada8 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -26,6 +26,10 @@ class SystemZMCAsmInfoGOFF : public MCAsmInfoGOFF {
 public:
   explicit SystemZMCAsmInfoGOFF(const Triple &TT);
   bool isAcceptableChar(char C) const override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 namespace SystemZ {
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
deleted file mode 100644
index 7b82c0cb6609c..0000000000000
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===-- SystemZMCExpr.cpp - SystemZ specific MC expression classes --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "SystemZMCExpr.h"
-#include "SystemZMCAsmInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "systemzmcexpr"
-
-const SystemZMCExpr *SystemZMCExpr::create(MCSpecifierExpr::Spec S,
-                                           const MCExpr *Expr, MCContext &Ctx) {
-  return new (Ctx) SystemZMCExpr(Expr, S);
-}
-
-StringRef SystemZMCExpr::getVariantKindName() const {
-  switch (getSpecifier()) {
-  case SystemZ::S_None:
-    return "A";
-  case SystemZ::S_RCon:
-    return "R";
-  case SystemZ::S_VCon:
-    return "V";
-  default:
-    llvm_unreachable("Invalid kind");
-  }
-}
-
-void SystemZMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  OS << getVariantKindName() << '(';
-  MAI->printExpr(OS, *Expr);
-  OS << ')';
-}
-
-bool SystemZMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                              const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(specifier);
-  return true;
-}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
deleted file mode 100644
index 8e730e50ae9dd..0000000000000
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCExpr.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- SystemZMCExpr.h - SystemZ specific MC expression classes -*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCEXPR_H
-#define LLVM_LIB_TARGET_SYSTEMZ_MCTARGETDESC_SYSTEMZMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-
-namespace llvm {
-
-class SystemZMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = Spec;
-
-private:
-  explicit SystemZMCExpr(const MCExpr *Expr, Spec S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const SystemZMCExpr *create(Spec Kind, const MCExpr *Expr,
-                                     MCContext &Ctx);
-
-  StringRef getVariantKindName() const;
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-};
-} // end namespace llvm
-
-#endif
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index d5e034b5a0096..aaf12b88de132 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -1139,23 +1139,20 @@ void SystemZAsmPrinter::emitADASection() {
       // imported functions, that are placed in the ADA to be 8 byte aligned.
       EMIT_COMMENT("function descriptor of");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZ::S_RCon,
-                                MCSymbolRefExpr::create(Sym, OutContext),
-                                OutContext),
+          MCSpecifierExpr::create(MCSymbolRefExpr::create(Sym, OutContext),
+                                  SystemZ::S_RCon, OutContext),
           PointerSize);
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZ::S_VCon,
-                                MCSymbolRefExpr::create(Sym, OutContext),
-                                OutContext),
+          MCSpecifierExpr::create(MCSymbolRefExpr::create(Sym, OutContext),
+                                  SystemZ::S_VCon, OutContext),
           PointerSize);
       EmittedBytes += PointerSize * 2;
       break;
     case SystemZII::MO_ADA_DATA_SYMBOL_ADDR:
       EMIT_COMMENT("pointer to data symbol");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZ::S_None,
-                                MCSymbolRefExpr::create(Sym, OutContext),
-                                OutContext),
+          MCSpecifierExpr::create(MCSymbolRefExpr::create(Sym, OutContext),
+                                  SystemZ::S_None, OutContext),
           PointerSize);
       EmittedBytes += PointerSize;
       break;
@@ -1168,9 +1165,8 @@ void SystemZAsmPrinter::emitADASection() {
 
       EMIT_COMMENT("pointer to function descriptor");
       OutStreamer->emitValue(
-          SystemZMCExpr::create(SystemZ::S_VCon,
-                                MCSymbolRefExpr::create(Alias, OutContext),
-                                OutContext),
+          MCSpecifierExpr::create(MCSymbolRefExpr::create(Alias, OutContext),
+                                  SystemZ::S_VCon, OutContext),
           PointerSize);
       EmittedBytes += PointerSize;
       break;
diff --git a/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp b/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
index 4a68c5d6462d7..c1d0994a9e17e 100644
--- a/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -16,8 +16,8 @@
 
 using namespace llvm;
 
-// Return the VK_* enumeration for MachineOperand target flags Flags.
-static SystemZMCExpr::Specifier getSpecifierForTFlags(unsigned Flags) {
+// Return the S_* enumeration for MachineOperand target flags Flags.
+static SystemZ::Specifier getSpecifierForTFlags(unsigned Flags) {
   switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
     case 0:
       return SystemZ::S_None;
@@ -34,7 +34,7 @@ SystemZMCInstLower::SystemZMCInstLower(MCContext &ctx,
   : Ctx(ctx), AsmPrinter(asmprinter) {}
 
 const MCExpr *SystemZMCInstLower::getExpr(const MachineOperand &MO,
-                                          SystemZMCExpr::Specifier Spec) const {
+                                          SystemZ::Specifier Spec) const {
   const MCSymbol *Symbol;
   bool HasOffset = true;
   switch (MO.getType()) {
@@ -85,7 +85,7 @@ MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const {
     return MCOperand::createImm(MO.getImm());
 
   default: {
-    SystemZMCExpr::Specifier Kind = getSpecifierForTFlags(MO.getTargetFlags());
+    auto Kind = getSpecifierForTFlags(MO.getTargetFlags());
     return MCOperand::createExpr(getExpr(MO, Kind));
   }
   }
diff --git a/llvm/lib/Target/SystemZ/SystemZMCInstLower.h b/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
index 90526882c8535..3187d7726c31e 100644
--- a/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMCINSTLOWER_H
 
-#include "MCTargetDesc/SystemZMCExpr.h"
+#include "MCTargetDesc/SystemZMCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
@@ -35,8 +35,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower {
   MCOperand lowerOperand(const MachineOperand& MO) const;
 
   // Return an MCExpr for symbolic operand MO with variant kind Kind.
-  const MCExpr *getExpr(const MachineOperand &MO,
-                        SystemZMCExpr::Specifier) const;
+  const MCExpr *getExpr(const MachineOperand &MO, SystemZ::Specifier) const;
 };
 } // end namespace llvm
 

From 167223f8c2c2350a3de9478355885c63b35ca6a9 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Mon, 16 Jun 2025 02:26:58 +0000
Subject: [PATCH 543/851] [gn build] Port b591f6dad407

---
 .../gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn
index 4182b4911538d..360cdc5f10e60 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/SystemZ/MCTargetDesc/BUILD.gn
@@ -70,7 +70,6 @@ static_library("MCTargetDesc") {
     "SystemZMCAsmBackend.cpp",
     "SystemZMCAsmInfo.cpp",
     "SystemZMCCodeEmitter.cpp",
-    "SystemZMCExpr.cpp",
     "SystemZMCTargetDesc.cpp",
     "SystemZTargetStreamer.cpp",
   ]

From 9adde28df784f5c0cc960bdabd413ac131a5852e Mon Sep 17 00:00:00 2001
From: Ming-Yi Lai <ming-yi.lai@mediatek.com>
Date: Mon, 16 Jun 2025 11:18:41 +0800
Subject: [PATCH 544/851] [LLD][ELF][RISCV][Zicfilp][Zicfiss] Support `-z
 zicfilp=` and `-z zicfiss=` to force enable/disable features (#143114)

+ If `-z zicfilp=implicit` or option not specified, the output would
have the ZICFILP feature enabled/disabled based on input objects
+ If `-z zicfilp=<never|unlabeled|func-sig>`, the output would have
ZICFILP feature forced <off|on to the "unlabeled" scheme|on to the
"func-sig" scheme>
+ If `-z zicfiss=implicit` or option not specified, the output would
have the ZICFISS feature enabled/disabled based on input objects
+ If `-z zicfiss=<never|always>`, the output would have the ZICFISS
feature forced <off|on>
---
 lld/ELF/Config.h                              |  8 ++
 lld/ELF/Driver.cpp                            | 77 +++++++++++++++++++
 lld/test/ELF/riscv-feature-zicfilp-func-sig.s | 47 ++++++++++-
 .../ELF/riscv-feature-zicfilp-unlabeled.s     | 48 +++++++++++-
 lld/test/ELF/riscv-feature-zicfiss.s          | 20 ++++-
 5 files changed, 191 insertions(+), 9 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index a2f7759fb7d37..2b72d54ba410d 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -136,6 +136,12 @@ enum LtoKind : uint8_t {UnifiedThin, UnifiedRegular, Default};
 // For -z gcs=
 enum class GcsPolicy { Implicit, Never, Always };
 
+// For -z zicfilp=
+enum class ZicfilpPolicy { Implicit, Never, Unlabeled, FuncSig };
+
+// For -z zicfiss=
+enum class ZicfissPolicy { Implicit, Never, Always };
+
 // For some options that resemble -z bti-report={none,warning,error}
 enum class ReportPolicy { None, Warning, Error };
 
@@ -411,6 +417,8 @@ struct Config {
   bool zText;
   bool zRetpolineplt;
   bool zWxneeded;
+  ZicfilpPolicy zZicfilp;
+  ZicfissPolicy zZicfiss;
   DiscardPolicy discard;
   GnuStackKind zGnustack;
   ICFLevel icf;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index c9ac71f7236f8..7e132a387a04d 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -430,6 +430,10 @@ static void checkOptions(Ctx &ctx) {
                         "RISC-V targets";
     if (ctx.arg.zZicfissReport != ReportPolicy::None)
       ErrAlways(ctx) << "-z zicfiss-report is only supported on RISC-V targets";
+    if (ctx.arg.zZicfilp != ZicfilpPolicy::Implicit)
+      ErrAlways(ctx) << "-z zicfilp is only supported on RISC-V targets";
+    if (ctx.arg.zZicfiss != ZicfissPolicy::Implicit)
+      ErrAlways(ctx) << "-z zicfiss is only supported on RISC-V targets";
   }
 
   if (ctx.arg.emachine != EM_386 && ctx.arg.emachine != EM_X86_64 &&
@@ -584,6 +588,46 @@ static GcsPolicy getZGcs(Ctx &ctx, opt::InputArgList &args) {
   return ret;
 }
 
+static ZicfilpPolicy getZZicfilp(Ctx &ctx, opt::InputArgList &args) {
+  auto ret = ZicfilpPolicy::Implicit;
+  for (auto *arg : args.filtered(OPT_z)) {
+    std::pair<StringRef, StringRef> kv = StringRef(arg->getValue()).split('=');
+    if (kv.first == "zicfilp") {
+      arg->claim();
+      if (kv.second == "unlabeled")
+        ret = ZicfilpPolicy::Unlabeled;
+      else if (kv.second == "func-sig")
+        ret = ZicfilpPolicy::FuncSig;
+      else if (kv.second == "never")
+        ret = ZicfilpPolicy::Never;
+      else if (kv.second == "implicit")
+        ret = ZicfilpPolicy::Implicit;
+      else
+        ErrAlways(ctx) << "unknown -z zicfilp= value: " << kv.second;
+    }
+  }
+  return ret;
+}
+
+static ZicfissPolicy getZZicfiss(Ctx &ctx, opt::InputArgList &args) {
+  auto ret = ZicfissPolicy::Implicit;
+  for (auto *arg : args.filtered(OPT_z)) {
+    std::pair<StringRef, StringRef> kv = StringRef(arg->getValue()).split('=');
+    if (kv.first == "zicfiss") {
+      arg->claim();
+      if (kv.second == "always")
+        ret = ZicfissPolicy::Always;
+      else if (kv.second == "never")
+        ret = ZicfissPolicy::Never;
+      else if (kv.second == "implicit")
+        ret = ZicfissPolicy::Implicit;
+      else
+        ErrAlways(ctx) << "unknown -z zicfiss= value: " << kv.second;
+    }
+  }
+  return ret;
+}
+
 // Report a warning for an unknown -z option.
 static void checkZOptions(Ctx &ctx, opt::InputArgList &args) {
   // This function is called before getTarget(), when certain options are not
@@ -1567,6 +1611,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) {
   ctx.arg.zCopyreloc = getZFlag(args, "copyreloc", "nocopyreloc", true);
   ctx.arg.zForceBti = hasZOption(args, "force-bti");
   ctx.arg.zForceIbt = hasZOption(args, "force-ibt");
+  ctx.arg.zZicfilp = getZZicfilp(ctx, args);
+  ctx.arg.zZicfiss = getZZicfiss(ctx, args);
   ctx.arg.zGcs = getZGcs(ctx, args);
   ctx.arg.zGlobal = hasZOption(args, "global");
   ctx.arg.zGnustack = getZGnuStack(args);
@@ -2926,6 +2972,18 @@ static void readSecurityNotes(Ctx &ctx) {
           << f
           << ": -z zicfiss-report: file does not have "
              "GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS property";
+
+      if (ctx.arg.zZicfilp == ZicfilpPolicy::Unlabeled &&
+          (features & GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG))
+        Warn(ctx) << f
+                  << ": -z zicfilp=unlabeled: file has conflicting property: "
+                     "GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG";
+
+      if (ctx.arg.zZicfilp == ZicfilpPolicy::FuncSig &&
+          (features & GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED))
+        Warn(ctx) << f
+                  << ": -z zicfilp=func-sig: file has conflicting property: "
+                     "GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED";
     }
 
     if (ctx.arg.zForceBti && !(features & GNU_PROPERTY_AARCH64_FEATURE_1_BTI)) {
@@ -2989,6 +3047,25 @@ static void readSecurityNotes(Ctx &ctx) {
   else if (ctx.arg.zGcs == GcsPolicy::Never)
     ctx.arg.andFeatures &= ~GNU_PROPERTY_AARCH64_FEATURE_1_GCS;
 
+  if (ctx.arg.emachine == EM_RISCV) {
+    // Force enable/disable Zicfilp.
+    if (ctx.arg.zZicfilp == ZicfilpPolicy::Unlabeled) {
+      ctx.arg.andFeatures |= GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED;
+      ctx.arg.andFeatures &= ~GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG;
+    } else if (ctx.arg.zZicfilp == ZicfilpPolicy::FuncSig) {
+      ctx.arg.andFeatures |= GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG;
+      ctx.arg.andFeatures &= ~GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED;
+    } else if (ctx.arg.zZicfilp == ZicfilpPolicy::Never)
+      ctx.arg.andFeatures &= ~(GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED |
+                               GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG);
+
+    // Force enable/disable Zicfiss.
+    if (ctx.arg.zZicfiss == ZicfissPolicy::Always)
+      ctx.arg.andFeatures |= GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS;
+    else if (ctx.arg.zZicfiss == ZicfissPolicy::Never)
+      ctx.arg.andFeatures &= ~GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS;
+  }
+
   // If we are utilising GCS at any stage, the sharedFiles should be checked to
   // ensure they also support this feature. The gcs-report-dynamic option is
   // used to indicate if the user wants information relating to this, and will
diff --git a/lld/test/ELF/riscv-feature-zicfilp-func-sig.s b/lld/test/ELF/riscv-feature-zicfilp-func-sig.s
index f68fbddfa6026..c5818dd33978f 100644
--- a/lld/test/ELF/riscv-feature-zicfilp-func-sig.s
+++ b/lld/test/ELF/riscv-feature-zicfilp-func-sig.s
@@ -2,6 +2,7 @@
 ## Test the ZICFILP func-sig feature.
 ## To lift maintenance burden, most tests are conducted only with 64-bit RISC-V
 ## Naming convention: *-s.s files enables ZICFILP func-sig.
+## Naming convention: *-u.s files enables ZICFILP unlabeled.
 # RUN: rm -rf %t && split-file %s %t && cd %t
 # RUN: llvm-mc --filetype=obj --triple=riscv32 rv32-f1-s.s -o rv32-f1-s.o
 # RUN: llvm-mc --filetype=obj --triple=riscv32 rv32-f2-s.s -o rv32-f2-s.o
@@ -12,14 +13,20 @@
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f2-s.s -o f2-s.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3.s   -o f3.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3-s.s -o f3-s.o
+# RUN: llvm-mc --filetype=obj --triple=riscv64 f3-u.s -o f3-u.o
 
-## ZICFILP-func-sig should be enabled when it's enabled in all inputs
+## ZICFILP-func-sig should be enabled when it's enabled in all inputs or when
+## it's forced on.
 # RUN: ld.lld rv32-f1-s.o rv32-f2-s.o rv32-f3-s.o -o out.rv32 --fatal-warnings
 # RUN: llvm-readelf -n out.rv32 | FileCheck --check-prefix=ZICFILP %s
 # RUN: ld.lld f1-s.o f2-s.o f3-s.o -o out --fatal-warnings
 # RUN: llvm-readelf -n out | FileCheck --check-prefix=ZICFILP %s
 # RUN: ld.lld f1-s.o f3-s.o --shared -o out.so --fatal-warnings
 # RUN: llvm-readelf -n out.so | FileCheck --check-prefix=ZICFILP %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -o out.force -z zicfilp=func-sig --fatal-warnings
+# RUN: llvm-readelf -n out.force | FileCheck --check-prefix=ZICFILP %s
+# RUN: ld.lld f2-s.o f3.o --shared -o out.force.so -z zicfilp=never -z zicfilp=func-sig --fatal-warnings
+# RUN: llvm-readelf -n out.force.so | FileCheck --check-prefix=ZICFILP %s
 # ZICFILP: Properties: RISC-V feature: ZICFILP-func-sig
 
 ## ZICFILP-func-sig should not be enabled if it's not enabled in at least one
@@ -29,11 +36,18 @@
 # RUN: ld.lld f2-s.o f3.o --shared -o out.no.so --fatal-warnings
 # RUN: llvm-readelf -n out.no.so | count 0
 
+## ZICFILP-func-sig should be disabled with zicfilp=never, even if
+## ZICFILP-func-sig is present in all inputs.
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp=func-sig -z zicfilp=never -o out.never --fatal-warnings
+# RUN: llvm-readelf -n out.never | count 0
+
 ## zicfilp-func-sig-report should report any input files that don't have the
 ## ZICFILP-func-sig property
 # RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-func-sig-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-func-sig-report=warning -z zicfilp=func-sig 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-func-sig-report=warning -z zicfilp=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
 # RUN: not ld.lld f2-s.o f3.o --shared -z zicfilp-func-sig-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
-# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp-func-sig-report=warning 2>&1 | count 0
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp-func-sig-report=warning -z zicfilp=func-sig 2>&1 | count 0
 # REPORT-WARN: warning: f2.o: -z zicfilp-func-sig-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG property
 # REPORT-ERROR: error: f3.o: -z zicfilp-func-sig-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG property
 
@@ -41,6 +55,14 @@
 # RUN: not ld.lld f2-s.o -z zicfilp-func-sig-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
 # INVALID: error: unknown -z zicfilp-func-sig-report= value: x
 
+## ZICFILP-unlabeled and ZICFILP-func-sig should conflict with each other.
+# RUN: ld.lld f3-u.o -o out.override -z zicfilp=func-sig 2>&1 | FileCheck --check-prefix=FORCE-CONFLICT %s
+# FORCE-CONFLICT: warning: f3-u.o: -z zicfilp=func-sig: file has conflicting property: GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED
+
+## -z zicfilp=func-sig should override and disable ZICFILP-unlabeled.
+# RUN: llvm-readelf -n out.override | FileCheck --check-prefixes=ZICFILP,OVERRIDE %s
+# OVERRIDE-NOT: ZICFILP-unlabeled
+
 #--- rv32-f1-s.s
 .section ".note.gnu.property", "a"
 .balign 4
@@ -191,3 +213,24 @@ ndesc_end:
 .type f3,@function
 f3:
   ret
+
+#--- f3-u.s
+.section ".note.gnu.property", "a"
+.balign 8
+.4byte 4
+.4byte (ndesc_end - ndesc_begin)
+.4byte 0x5        // NT_GNU_PROPERTY_TYPE_0
+.asciz "GNU"
+ndesc_begin:
+.balign 8
+.4byte 0xc0000000 // GNU_PROPERTY_RISCV_FEATURE_1_AND
+.4byte 4
+.4byte 1          // GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED
+.balign 8
+ndesc_end:
+
+.text
+.globl f3
+.type f3,@function
+f3:
+  ret
diff --git a/lld/test/ELF/riscv-feature-zicfilp-unlabeled.s b/lld/test/ELF/riscv-feature-zicfilp-unlabeled.s
index 0fcd8538d24a1..20491f057c8ed 100644
--- a/lld/test/ELF/riscv-feature-zicfilp-unlabeled.s
+++ b/lld/test/ELF/riscv-feature-zicfilp-unlabeled.s
@@ -2,6 +2,7 @@
 ## Test the ZICFILP unlabeled feature.
 ## To lift maintenance burden, most tests are conducted only with 64-bit RISC-V
 ## Naming convention: *-s.s files enables ZICFILP unlabeled.
+## Naming convention: *-f.s files enables ZICFILP func-sig.
 ## Naming convention: *-c.s files enables both of the conflicting ZICFILP unlabeled and ZICFILP func-sig features.
 # RUN: rm -rf %t && split-file %s %t && cd %t
 # RUN: llvm-mc --filetype=obj --triple=riscv32 rv32-f1-s.s -o rv32-f1-s.o
@@ -14,14 +15,20 @@
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f2-s.s -o f2-s.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3.s   -o f3.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3-s.s -o f3-s.o
+# RUN: llvm-mc --filetype=obj --triple=riscv64 f3-f.s -o f3-f.o
 
-## ZICFILP-unlabeled should be enabled when it's enabled in all inputs
+## ZICFILP-unlabeled should be enabled when it's enabled in all inputs or when
+## it's forced on.
 # RUN: ld.lld rv32-f1-s.o rv32-f2-s.o rv32-f3-s.o -o out.rv32 --fatal-warnings
 # RUN: llvm-readelf -n out.rv32 | FileCheck --check-prefix=ZICFILP %s
 # RUN: ld.lld f1-s.o f2-s.o f3-s.o -o out --fatal-warnings
 # RUN: llvm-readelf -n out | FileCheck --check-prefix=ZICFILP %s
 # RUN: ld.lld f1-s.o f3-s.o --shared -o out.so --fatal-warnings
 # RUN: llvm-readelf -n out.so | FileCheck --check-prefix=ZICFILP %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -o out.force -z zicfilp=unlabeled --fatal-warnings
+# RUN: llvm-readelf -n out.force | FileCheck --check-prefix=ZICFILP %s
+# RUN: ld.lld f2-s.o f3.o --shared -o out.force.so -z zicfilp=never -z zicfilp=unlabeled --fatal-warnings
+# RUN: llvm-readelf -n out.force.so | FileCheck --check-prefix=ZICFILP %s
 # ZICFILP: Properties: RISC-V feature: ZICFILP-unlabeled
 
 ## ZICFILP-unlabeled should not be enabled if it's not enabled in at least one
@@ -31,21 +38,35 @@
 # RUN: ld.lld f2-s.o f3.o --shared -o out.no.so --fatal-warnings
 # RUN: llvm-readelf -n out.no.so | count 0
 
+## ZICFILP-unlabeled should be disabled with zicfilp=never, even if
+## ZICFILP-unlabeled is present in all inputs.
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp=unlabeled -z zicfilp=never -o out.never --fatal-warnings
+# RUN: llvm-readelf -n out.never | count 0
+
 ## zicfilp-unlabeled-report should report any input files that don't have the
 ## ZICFILP-unlabeled property
 # RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-unlabeled-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-unlabeled-report=warning -z zicfilp=unlabeled 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfilp-unlabeled-report=warning -z zicfilp=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
 # RUN: not ld.lld f2-s.o f3.o --shared -z zicfilp-unlabeled-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
-# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp-unlabeled-report=warning 2>&1 | count 0
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfilp-unlabeled-report=warning -z zicfilp=never 2>&1 | count 0
 # REPORT-WARN: warning: f2.o: -z zicfilp-unlabeled-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED property
 # REPORT-ERROR: error: f3.o: -z zicfilp-unlabeled-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED property
 
 ## An invalid -z zicfilp-unlabeled-report option should give an error
-# RUN: not ld.lld f2-s.o -z zicfilp-unlabeled-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
+# RUN: not ld.lld f2-s.o -z zicfilp=x -z zicfilp-unlabeled-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
+# INVALID: error: unknown -z zicfilp= value: x
 # INVALID: error: unknown -z zicfilp-unlabeled-report= value: x
 
 ## ZICFILP-unlabeled and ZICFILP-func-sig should conflict with each other
 # RUN: not ld.lld f1-c.o 2>&1 | FileCheck --check-prefix=CONFLICT %s
+# RUN: ld.lld f3-f.o -o out.override -z zicfilp=unlabeled 2>&1 | FileCheck --check-prefix=FORCE-CONFLICT %s
 # CONFLICT: error: f1-c.o: file has conflicting properties: GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_UNLABELED and GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG
+# FORCE-CONFLICT: warning: f3-f.o: -z zicfilp=unlabeled: file has conflicting property: GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG
+
+## -z zicfilp=unlabeled should override and disable ZICFILP-func-sig.
+# RUN: llvm-readelf -n out.override | FileCheck --check-prefixes=ZICFILP,OVERRIDE %s
+# OVERRIDE-NOT: ZICFILP-func-sig
 
 #--- rv32-f1-s.s
 .section ".note.gnu.property", "a"
@@ -219,3 +240,24 @@ ndesc_end:
 .type f3,@function
 f3:
   ret
+
+#--- f3-f.s
+.section ".note.gnu.property", "a"
+.balign 8
+.4byte 4
+.4byte (ndesc_end - ndesc_begin)
+.4byte 0x5        // NT_GNU_PROPERTY_TYPE_0
+.asciz "GNU"
+ndesc_begin:
+.balign 8
+.4byte 0xc0000000 // GNU_PROPERTY_RISCV_FEATURE_1_AND
+.4byte 4
+.4byte 4          // GNU_PROPERTY_RISCV_FEATURE_1_CFI_LP_FUNC_SIG
+.balign 8
+ndesc_end:
+
+.text
+.globl f3
+.type f3,@function
+f3:
+  ret
diff --git a/lld/test/ELF/riscv-feature-zicfiss.s b/lld/test/ELF/riscv-feature-zicfiss.s
index 4623522f5ed79..7b208ddd9b8eb 100644
--- a/lld/test/ELF/riscv-feature-zicfiss.s
+++ b/lld/test/ELF/riscv-feature-zicfiss.s
@@ -13,13 +13,17 @@
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3.s   -o f3.o
 # RUN: llvm-mc --filetype=obj --triple=riscv64 f3-s.s -o f3-s.o
 
-## ZICFISS should be enabled when it's enabled in all inputs
+## ZICFISS should be enabled when it's enabled in all inputs or when it's forced on.
 # RUN: ld.lld rv32-f1-s.o rv32-f2-s.o rv32-f3-s.o -o out.rv32 --fatal-warnings
 # RUN: llvm-readelf -n out.rv32 | FileCheck --check-prefix=ZICFISS %s
 # RUN: ld.lld f1-s.o f2-s.o f3-s.o -o out --fatal-warnings
 # RUN: llvm-readelf -n out | FileCheck --check-prefix=ZICFISS %s
 # RUN: ld.lld f1-s.o f3-s.o --shared -o out.so --fatal-warnings
 # RUN: llvm-readelf -n out.so | FileCheck --check-prefix=ZICFISS %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -o out.force -z zicfiss=always --fatal-warnings
+# RUN: llvm-readelf -n out.force | FileCheck --check-prefix=ZICFISS %s
+# RUN: ld.lld f2-s.o f3.o --shared -o out.force.so -z zicfiss=never -z zicfiss=always --fatal-warnings
+# RUN: llvm-readelf -n out.force.so | FileCheck --check-prefix=ZICFISS %s
 # ZICFISS: Properties: RISC-V feature: ZICFISS
 
 ## ZICFISS should not be enabled if it's not enabled in at least one input
@@ -28,17 +32,25 @@
 # RUN: ld.lld f2-s.o f3.o --shared -o out.no.so --fatal-warnings
 # RUN: llvm-readelf -n out.no.so | count 0
 
+## ZICFISS should be disabled with zicfiss=never, even if ZICFISS is present in
+## all inputs.
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss=always -z zicfiss=never -o out.never --fatal-warnings
+# RUN: llvm-readelf -n out.never | count 0
+
 ## zicfiss-report should report any input files that don't have the zicfiss
 ## property
 # RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfiss-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfiss-report=warning -z zicfiss=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld f1-s.o f2.o f3-s.o -z zicfiss-report=warning -z zicfiss=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
 # RUN: not ld.lld f2-s.o f3.o --shared -z zicfiss-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
-# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss-report=warning 2>&1 | count 0
-# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss-report=error 2>&1 | count 0
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss-report=warning -z zicfiss=always 2>&1 | count 0
+# RUN: ld.lld f1-s.o f2-s.o f3-s.o -z zicfiss-report=error -z zicfiss=always 2>&1 | count 0
 # REPORT-WARN: warning: f2.o: -z zicfiss-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS property
 # REPORT-ERROR: error: f3.o: -z zicfiss-report: file does not have GNU_PROPERTY_RISCV_FEATURE_1_CFI_SS property
 
 ## An invalid -z zicfiss-report option should give an error
-# RUN: not ld.lld f2-s.o f3-s.o -z zicfiss-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
+# RUN: not ld.lld f2-s.o f3-s.o -z zicfiss=x -z zicfiss-report=x 2>&1 | FileCheck --check-prefix=INVALID %s
+# INVALID: error: unknown -z zicfiss= value: x
 # INVALID: error: unknown -z zicfiss-report= value: x
 
 #--- rv32-f1-s.s

From f71fb2dc01e117481f56e040c25391883d43c1c5 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 21:00:22 -0700
Subject: [PATCH 545/851] [clang] Use StringRef in range-based for loops (NFC)
 (#144242)

When we iterate over std::vector<std::string>, we can directly assign
each element to StringRef.  We do not need to go through separate
statements.
---
 clang/lib/Basic/TargetInfo.cpp           | 3 +--
 clang/lib/Sema/SemaDeclAttr.cpp          | 6 ++----
 clang/lib/Tooling/ArgumentsAdjusters.cpp | 3 +--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 7b577632fdf55..9429a316a9196 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -555,8 +555,7 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts) {
 bool TargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeatureVec) const {
-  for (const auto &F : FeatureVec) {
-    StringRef Name = F;
+  for (StringRef Name : FeatureVec) {
     if (Name.empty())
       continue;
     // Apply the feature via the target.
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 9c985e6bd5e03..2e826adf9229f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3232,8 +3232,7 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) {
     if (ParsedAttrs.Duplicate != "")
       return Diag(LiteralLoc, diag::err_duplicate_target_attribute)
              << Duplicate << None << ParsedAttrs.Duplicate << Target;
-    for (const auto &Feature : ParsedAttrs.Features) {
-      StringRef CurFeature = Feature;
+    for (StringRef CurFeature : ParsedAttrs.Features) {
       if (!CurFeature.starts_with('+') && !CurFeature.starts_with('-'))
         return Diag(LiteralLoc, diag::warn_unsupported_target_attribute)
                << Unsupported << None << AttrStr << Target;
@@ -3241,8 +3240,7 @@ bool Sema::checkTargetAttr(SourceLocation LiteralLoc, StringRef AttrStr) {
   }
 
   if (Context.getTargetInfo().getTriple().isLoongArch()) {
-    for (const auto &Feature : ParsedAttrs.Features) {
-      StringRef CurFeature = Feature;
+    for (StringRef CurFeature : ParsedAttrs.Features) {
       if (CurFeature.starts_with("!arch=")) {
         StringRef ArchValue = CurFeature.split("=").second.trim();
         return Diag(LiteralLoc, diag::err_attribute_unsupported)
diff --git a/clang/lib/Tooling/ArgumentsAdjusters.cpp b/clang/lib/Tooling/ArgumentsAdjusters.cpp
index d01c57ee69c00..999fa790124cb 100644
--- a/clang/lib/Tooling/ArgumentsAdjusters.cpp
+++ b/clang/lib/Tooling/ArgumentsAdjusters.cpp
@@ -22,8 +22,7 @@ namespace clang {
 namespace tooling {
 
 static StringRef getDriverMode(const CommandLineArguments &Args) {
-  for (const auto &Arg : Args) {
-    StringRef ArgRef = Arg;
+  for (StringRef ArgRef : Args) {
     if (ArgRef.consume_front("--driver-mode=")) {
       return ArgRef;
     }

From 7a4a83b551eaf159ce10b612def3be62d80706d4 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 21:00:29 -0700
Subject: [PATCH 546/851] [TableGen] Use range-based for loops (NFC) (#144283)

---
 .../TableGen/Common/CodeGenDAGPatterns.cpp    |  4 +--
 .../TableGen/Common/CodeGenDAGPatterns.h      | 14 +++++-----
 .../TableGen/Common/CodeGenInstruction.h      |  3 +--
 .../utils/TableGen/Common/CodeGenSchedule.cpp |  7 ++---
 llvm/utils/TableGen/Common/DAGISelMatcher.cpp |  8 +++---
 llvm/utils/TableGen/DAGISelMatcherGen.cpp     | 27 +++++++++----------
 llvm/utils/TableGen/RegisterInfoEmitter.cpp   | 12 ++++-----
 llvm/utils/TableGen/X86DisassemblerTables.cpp |  6 ++---
 8 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index d33c0dba91fd8..810b35e65b310 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -1824,8 +1824,8 @@ bool TreePatternNode::UpdateNodeTypeFromInst(unsigned ResNo,
 }
 
 bool TreePatternNode::ContainsUnresolvedType(TreePattern &TP) const {
-  for (unsigned i = 0, e = Types.size(); i != e; ++i)
-    if (!TP.getInfer().isConcrete(Types[i], true))
+  for (const TypeSetByHwMode &Type : Types)
+    if (!TP.getInfer().isConcrete(Type, true))
       return true;
   for (const TreePatternNode &Child : children())
     if (Child.ContainsUnresolvedType(TP))
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index a5aadf2ee1138..64fec275faa68 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -747,8 +747,8 @@ class TreePatternNode : public RefCountedBase<TreePatternNode> {
 
   /// hasChild - Return true if N is any of our children.
   bool hasChild(const TreePatternNode *N) const {
-    for (unsigned i = 0, e = Children.size(); i != e; ++i)
-      if (Children[i].get() == N)
+    for (const TreePatternNodePtr &Child : Children)
+      if (Child.get() == N)
         return true;
     return false;
   }
@@ -1171,9 +1171,9 @@ class CodeGenDAGPatterns {
   }
 
   const CodeGenIntrinsic &getIntrinsic(const Record *R) const {
-    for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i)
-      if (Intrinsics[i].TheDef == R)
-        return Intrinsics[i];
+    for (const CodeGenIntrinsic &Intrinsic : Intrinsics)
+      if (Intrinsic.TheDef == R)
+        return Intrinsic;
     llvm_unreachable("Unknown intrinsic!");
   }
 
@@ -1280,8 +1280,8 @@ class CodeGenDAGPatterns {
 inline bool SDNodeInfo::ApplyTypeConstraints(TreePatternNode &N,
                                              TreePattern &TP) const {
   bool MadeChange = false;
-  for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i)
-    MadeChange |= TypeConstraints[i].ApplyTypeConstraint(N, *this, TP);
+  for (const SDTypeConstraint &TypeConstraint : TypeConstraints)
+    MadeChange |= TypeConstraint.ApplyTypeConstraint(N, *this, TP);
   return MadeChange;
 }
 
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h
index e38979af3909d..3a5abc55319b1 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h
@@ -127,8 +127,7 @@ class CGIOperandList {
     /// getTiedOperand - If this operand is tied to another one, return the
     /// other operand number.  Otherwise, return -1.
     int getTiedRegister() const {
-      for (unsigned j = 0, e = Constraints.size(); j != e; ++j) {
-        const CGIOperandList::ConstraintInfo &CI = Constraints[j];
+      for (const CGIOperandList::ConstraintInfo &CI : Constraints) {
         if (CI.isTied())
           return CI.getTiedOperand();
       }
diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
index 72954804b66f8..af7e43929bcf0 100644
--- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
@@ -2232,13 +2232,10 @@ void PredTransitions::dump() const {
       dbgs() << LS << SchedModels.getSchedRW(PC.RWIdx, PC.IsRead).Name << ":"
              << PC.Predicate->getName();
     dbgs() << "},\n  => {";
-    for (SmallVectorImpl<SmallVector<unsigned, 4>>::const_iterator
-             WSI = TI.WriteSequences.begin(),
-             WSE = TI.WriteSequences.end();
-         WSI != WSE; ++WSI) {
+    for (const auto &WS : TI.WriteSequences) {
       dbgs() << "(";
       ListSeparator LS;
-      for (unsigned N : *WSI)
+      for (unsigned N : WS)
         dbgs() << LS << SchedModels.getSchedWrite(N).Name;
       dbgs() << "),";
     }
diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.cpp b/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
index 8780c4f5b61c2..3543bb5a55c64 100644
--- a/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
+++ b/llvm/utils/TableGen/Common/DAGISelMatcher.cpp
@@ -286,11 +286,11 @@ void EmitNodeMatcherCommon::printImpl(raw_ostream &OS, indent Indent) const {
   OS << (isa<MorphNodeToMatcher>(this) ? "MorphNodeTo: " : "EmitNode: ")
      << CGI.Namespace << "::" << CGI.TheDef->getName() << ": <todo flags> ";
 
-  for (unsigned i = 0, e = VTs.size(); i != e; ++i)
-    OS << ' ' << getEnumName(VTs[i]);
+  for (MVT::SimpleValueType VT : VTs)
+    OS << ' ' << getEnumName(VT);
   OS << '(';
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i)
-    OS << Operands[i] << ' ';
+  for (unsigned Operand : Operands)
+    OS << Operand << ' ';
   OS << ")\n";
 }
 
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index afdb6879eede4..0039ff4f3e2d7 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -526,23 +526,20 @@ void MatcherGen::EmitMatchCode(const TreePatternNode &N,
     EmitOperatorMatchCode(N, NodeNoTypes);
 
   // If there are node predicates for this node, generate their checks.
-  for (unsigned i = 0, e = N.getPredicateCalls().size(); i != e; ++i) {
-    const TreePredicateCall &Pred = N.getPredicateCalls()[i];
+  for (const TreePredicateCall &Pred : N.getPredicateCalls()) {
     SmallVector<unsigned, 4> Operands;
     if (Pred.Fn.usesOperands()) {
       TreePattern *TP = Pred.Fn.getOrigPatFragRecord();
-      for (unsigned i = 0; i < TP->getNumArgs(); ++i) {
-        std::string Name =
-            ("pred:" + Twine(Pred.Scope) + ":" + TP->getArgName(i)).str();
+      for (const std::string &Arg : TP->getArgList()) {
+        std::string Name = ("pred:" + Twine(Pred.Scope) + ":" + Arg).str();
         Operands.push_back(getNamedArgumentSlot(Name));
       }
     }
     AddMatcher(new CheckPredicateMatcher(Pred.Fn, Operands));
   }
 
-  for (unsigned i = 0, e = ResultsToTypeCheck.size(); i != e; ++i)
-    AddMatcher(new CheckTypeMatcher(N.getSimpleType(ResultsToTypeCheck[i]),
-                                    ResultsToTypeCheck[i]));
+  for (unsigned I : ResultsToTypeCheck)
+    AddMatcher(new CheckTypeMatcher(N.getSimpleType(I), I));
 }
 
 /// EmitMatcherCode - Generate the code that matches the predicate of this
@@ -836,8 +833,8 @@ void MatcherGen::EmitResultInstructionAsOperand(
       // overridden, or which we aren't letting it override; emit the 'default
       // ops' operands.
       const DAGDefaultOperand &DefaultOp = CGP.getDefaultOperand(OperandNode);
-      for (unsigned i = 0, e = DefaultOp.DefaultOps.size(); i != e; ++i)
-        EmitResultOperand(*DefaultOp.DefaultOps[i], InstOps);
+      for (const TreePatternNodePtr &Op : DefaultOp.DefaultOps)
+        EmitResultOperand(*Op, InstOps);
       continue;
     }
 
@@ -886,10 +883,10 @@ void MatcherGen::EmitResultInstructionAsOperand(
   if (isRoot && !PhysRegInputs.empty()) {
     // Emit all of the CopyToReg nodes for the input physical registers.  These
     // occur in patterns like (mul:i8 AL:i8, GR8:i8:$src).
-    for (unsigned i = 0, e = PhysRegInputs.size(); i != e; ++i) {
+    for (const auto &PhysRegInput : PhysRegInputs) {
       const CodeGenRegister *Reg =
-          CGP.getTargetInfo().getRegBank().getReg(PhysRegInputs[i].first);
-      AddMatcher(new EmitCopyToRegMatcher(PhysRegInputs[i].second, Reg));
+          CGP.getTargetInfo().getRegBank().getReg(PhysRegInput.first);
+      AddMatcher(new EmitCopyToRegMatcher(PhysRegInput.second, Reg));
     }
 
     // Even if the node has no other glue inputs, the resultant node must be
@@ -977,8 +974,8 @@ void MatcherGen::EmitResultInstructionAsOperand(
                                  NumFixedArityOperands, NextRecordedOperandNo));
 
   // The non-chain and non-glue results of the newly emitted node get recorded.
-  for (unsigned i = 0, e = ResultVTs.size(); i != e; ++i) {
-    if (ResultVTs[i] == MVT::Other || ResultVTs[i] == MVT::Glue)
+  for (MVT::SimpleValueType ResultVT : ResultVTs) {
+    if (ResultVT == MVT::Other || ResultVT == MVT::Glue)
       break;
     OutputOps.push_back(NextRecordedOperandNo++);
   }
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index bc1650a4acf0b..7d24c0f80cddb 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -726,11 +726,12 @@ void RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
   // Output the rows.
   OS << "  static const " << getMinimalTypeForRange(SubRegIndicesSize + 1, 32)
      << " Rows[" << Rows.size() << "][" << SubRegIndicesSize << "] = {\n";
-  for (unsigned r = 0, re = Rows.size(); r != re; ++r) {
+  for (const auto &Row : Rows) {
     OS << "    { ";
-    for (unsigned i = 0, e = SubRegIndicesSize; i != e; ++i)
-      if (Rows[r][i])
-        OS << Rows[r][i]->getQualifiedName() << ", ";
+    for (const llvm::CodeGenSubRegIndex *Elem :
+         ArrayRef(&Row[0], SubRegIndicesSize))
+      if (Elem)
+        OS << Elem->getQualifiedName() << ", ";
       else
         OS << "0, ";
     OS << "},\n";
@@ -830,8 +831,7 @@ void RegisterInfoEmitter::emitComposeSubRegIndexLaneMask(raw_ostream &OS,
   for (size_t s = 0, se = Sequences.size(); s != se; ++s) {
     OS << "    ";
     const SmallVectorImpl<MaskRolPair> &Sequence = Sequences[s];
-    for (size_t p = 0, pe = Sequence.size(); p != pe; ++p) {
-      const MaskRolPair &P = Sequence[p];
+    for (const MaskRolPair &P : Sequence) {
       printMask(OS << "{ ", P.Mask);
       OS << format(", %2u }, ", P.RotateLeft);
     }
diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp
index 36f752a1ebe63..3c422a32dcaf7 100644
--- a/llvm/utils/TableGen/X86DisassemblerTables.cpp
+++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp
@@ -882,9 +882,9 @@ void DisassemblerTables::emitInstructionInfo(raw_ostream &o,
     N = ++OperandSetNum;
 
     o << "  { /* " << (OperandSetNum - 1) << " */\n";
-    for (unsigned i = 0, e = OperandList.size(); i != e; ++i) {
-      const char *Encoding = stringForOperandEncoding(OperandList[i].first);
-      const char *Type = stringForOperandType(OperandList[i].second);
+    for (const auto &[Enc, Ty] : OperandList) {
+      const char *Encoding = stringForOperandEncoding(Enc);
+      const char *Type = stringForOperandType(Ty);
       o << "    { " << Encoding << ", " << Type << " },\n";
     }
     o << "  },\n";

From c01532177ff61a768d5dc1ea541f9a8d986497fa Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Sun, 15 Jun 2025 21:00:36 -0700
Subject: [PATCH 547/851] [clang] Remove unused includes (NFC) (#144285)

These are identified by misc-include-cleaner.  I've filtered out those
that break builds.  Also, I'm staying away from llvm-config.h,
config.h, and Compiler.h, which likely cause platform- or
compiler-specific build failures.
---
 clang/lib/Analysis/UnsafeBufferUsage.cpp                       | 1 -
 clang/lib/Basic/LangOptions.cpp                                | 1 -
 clang/lib/CodeGen/CGBuiltin.cpp                                | 1 -
 clang/lib/CodeGen/CGHLSLRuntime.cpp                            | 1 -
 clang/lib/Edit/EditedSource.cpp                                | 2 --
 clang/lib/ExtractAPI/ExtractAPIConsumer.cpp                    | 1 -
 clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp   | 2 --
 clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp             | 1 -
 clang/lib/InstallAPI/DirectoryScanner.cpp                      | 1 -
 clang/lib/InstallAPI/FileList.cpp                              | 2 --
 clang/lib/InstallAPI/Frontend.cpp                              | 1 -
 clang/lib/InstallAPI/Visitor.cpp                               | 1 -
 clang/lib/Interpreter/InterpreterValuePrinter.cpp              | 2 --
 clang/lib/Interpreter/Value.cpp                                | 3 ---
 clang/lib/Lex/HeaderMap.cpp                                    | 1 -
 clang/lib/Rewrite/HTMLRewrite.cpp                              | 2 --
 clang/lib/Sema/SemaDeclAttr.cpp                                | 1 -
 clang/lib/Sema/SemaExprCXX.cpp                                 | 1 -
 .../Checkers/RetainCountChecker/RetainCountChecker.cpp         | 1 -
 clang/lib/Support/RISCVVIntrinsicUtils.cpp                     | 3 ---
 20 files changed, 29 deletions(-)

diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index 8b1ca6b80971f..631a546b45ff4 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -30,7 +30,6 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Casting.h"
 #include <cstddef>
 #include <optional>
 #include <queue>
diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp
index 7e696620993f9..912b890569cf5 100644
--- a/clang/lib/Basic/LangOptions.cpp
+++ b/clang/lib/Basic/LangOptions.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/Path.h"
 
 using namespace clang;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c0b02a104d95e..1f69274351676 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17,7 +17,6 @@
 #include "CGDebugInfo.h"
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
-#include "CGPointerAuthInfo.h"
 #include "CGRecordLayout.h"
 #include "CGValue.h"
 #include "CodeGenFunction.h"
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 977ff792bae2c..571ff53b7d644 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -35,7 +35,6 @@
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
-#include <utility>
 
 using namespace clang;
 using namespace CodeGen;
diff --git a/clang/lib/Edit/EditedSource.cpp b/clang/lib/Edit/EditedSource.cpp
index a3386b2489b07..398cce71d5e27 100644
--- a/clang/lib/Edit/EditedSource.cpp
+++ b/clang/lib/Edit/EditedSource.cpp
@@ -16,10 +16,8 @@
 #include "clang/Edit/FileOffset.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
-#include <algorithm>
 #include <cassert>
 #include <tuple>
 #include <utility>
diff --git a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
index 764c345a9db99..1087eb3001856 100644
--- a/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
+++ b/clang/lib/ExtractAPI/ExtractAPIConsumer.cpp
@@ -43,7 +43,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Regex.h"
diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
index e881d56258e5e..139023f32e8d3 100644
--- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
+++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp
@@ -19,14 +19,12 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/VersionTuple.h"
 #include "llvm/Support/raw_ostream.h"
 #include <iterator>
 #include <optional>
-#include <type_traits>
 
 using namespace clang;
 using namespace clang::extractapi;
diff --git a/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp b/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
index fd9db8113a41e..37b428216c91e 100644
--- a/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
+++ b/clang/lib/InstallAPI/DiagnosticBuilderWrappers.cpp
@@ -8,7 +8,6 @@
 
 #include "DiagnosticBuilderWrappers.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TextAPI/Platform.h"
 
diff --git a/clang/lib/InstallAPI/DirectoryScanner.cpp b/clang/lib/InstallAPI/DirectoryScanner.cpp
index be43a96f3d97d..f8f708fda4ca4 100644
--- a/clang/lib/InstallAPI/DirectoryScanner.cpp
+++ b/clang/lib/InstallAPI/DirectoryScanner.cpp
@@ -9,7 +9,6 @@
 #include "clang/InstallAPI/DirectoryScanner.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/TextAPI/DylibReader.h"
 
 using namespace llvm;
 using namespace llvm::MachO;
diff --git a/clang/lib/InstallAPI/FileList.cpp b/clang/lib/InstallAPI/FileList.cpp
index 65610903840af..8f8ed6e8a5db6 100644
--- a/clang/lib/InstallAPI/FileList.cpp
+++ b/clang/lib/InstallAPI/FileList.cpp
@@ -6,8 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/InstallAPI/FileList.h"
-#include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/InstallAPI/FileList.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Error.h"
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp
index 9e8c60fbda3d0..cce0b19b50619 100644
--- a/clang/lib/InstallAPI/Frontend.cpp
+++ b/clang/lib/InstallAPI/Frontend.cpp
@@ -9,7 +9,6 @@
 #include "clang/InstallAPI/Frontend.h"
 #include "clang/AST/Availability.h"
 #include "clang/InstallAPI/FrontendRecords.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 
 using namespace llvm;
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp
index a73ea0b0d124c..487be2c300887 100644
--- a/clang/lib/InstallAPI/Visitor.cpp
+++ b/clang/lib/InstallAPI/Visitor.cpp
@@ -13,7 +13,6 @@
 #include "clang/Basic/Linkage.h"
 #include "clang/InstallAPI/DylibVerifier.h"
 #include "clang/InstallAPI/FrontendRecords.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Mangler.h"
diff --git a/clang/lib/Interpreter/InterpreterValuePrinter.cpp b/clang/lib/Interpreter/InterpreterValuePrinter.cpp
index 3e3fbfd172caa..3e7e32b2e8557 100644
--- a/clang/lib/Interpreter/InterpreterValuePrinter.cpp
+++ b/clang/lib/Interpreter/InterpreterValuePrinter.cpp
@@ -18,7 +18,6 @@
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Interpreter/Interpreter.h"
 #include "clang/Interpreter/Value.h"
-#include "clang/Lex/Preprocessor.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Sema/Sema.h"
 
@@ -26,7 +25,6 @@
 #include "llvm/Support/raw_ostream.h"
 
 #include <cassert>
-#include <string>
 
 #include <cstdarg>
 
diff --git a/clang/lib/Interpreter/Value.cpp b/clang/lib/Interpreter/Value.cpp
index eb2ce9c9fd330..afdf406b37253 100644
--- a/clang/lib/Interpreter/Value.cpp
+++ b/clang/lib/Interpreter/Value.cpp
@@ -16,10 +16,7 @@
 #include "clang/AST/Type.h"
 #include "clang/Interpreter/Interpreter.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_os_ostream.h"
 #include <cassert>
-#include <cstdint>
 #include <utility>
 
 namespace {
diff --git a/clang/lib/Lex/HeaderMap.cpp b/clang/lib/Lex/HeaderMap.cpp
index 588b32ee9ca8e..a7b670f00ac6e 100644
--- a/clang/lib/Lex/HeaderMap.cpp
+++ b/clang/lib/Lex/HeaderMap.cpp
@@ -18,7 +18,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/SystemZ/zOSSupport.h"
 #include <cstring>
 #include <memory>
 #include <optional>
diff --git a/clang/lib/Rewrite/HTMLRewrite.cpp b/clang/lib/Rewrite/HTMLRewrite.cpp
index 1829a4ff3504a..109cdf990543a 100644
--- a/clang/lib/Rewrite/HTMLRewrite.cpp
+++ b/clang/lib/Rewrite/HTMLRewrite.cpp
@@ -17,9 +17,7 @@
 #include "clang/Lex/TokenConcatenation.h"
 #include "clang/Rewrite/Core/Rewriter.h"
 #include "llvm/ADT/RewriteBuffer.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 2e826adf9229f..1c2fa80e782d4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -53,7 +53,6 @@
 #include "clang/Sema/SemaOpenCL.h"
 #include "clang/Sema/SemaOpenMP.h"
 #include "clang/Sema/SemaRISCV.h"
-#include "clang/Sema/SemaSPIRV.h"
 #include "clang/Sema/SemaSYCL.h"
 #include "clang/Sema/SemaSwift.h"
 #include "clang/Sema/SemaWasm.h"
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index ba52e8f8932d3..4a86cbd0633b6 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -40,7 +40,6 @@
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaHLSL.h"
-#include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/SemaLambda.h"
 #include "clang/Sema/SemaObjC.h"
 #include "clang/Sema/SemaPPC.h"
diff --git a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
index a00a09f60fd5d..62bc3218d9ced 100644
--- a/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/RetainCountChecker/RetainCountChecker.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "RetainCountChecker.h"
-#include "clang/StaticAnalyzer/Core/Checker.h"
 #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
 #include <optional>
 
diff --git a/clang/lib/Support/RISCVVIntrinsicUtils.cpp b/clang/lib/Support/RISCVVIntrinsicUtils.cpp
index daf09ac66f214..5a4e805d4a9d1 100644
--- a/clang/lib/Support/RISCVVIntrinsicUtils.cpp
+++ b/clang/lib/Support/RISCVVIntrinsicUtils.cpp
@@ -8,13 +8,10 @@
 
 #include "clang/Support/RISCVVIntrinsicUtils.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include <numeric>
 #include <optional>
 
 using namespace llvm;

From cab09e76e0c4c95f44cf90bf2bf7a6eaa15b14b2 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Mon, 16 Jun 2025 12:07:47 +0800
Subject: [PATCH 548/851] [InstCombine] Propagate FMF from fptrunc when folding
 `fptrunc fabs(X) -> fabs(fptrunc X)` (#143352)

Alive2: https://alive2.llvm.org/ce/z/DWV3G3
fptrunc yields infinity when the input cannot fit in the target type. So
ninf should be propagated from fptrunc. For other intrinsics, the
previous check ensures that the result is never an infinity:

https://github.com/llvm/llvm-project/blob/5d3899d293e902124c3602b466031b6b799fb123/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp#L1910-L1917

Closes https://github.com/llvm/llvm-project/issues/143122.
---
 .../InstCombine/InstCombineCasts.cpp          |  4 ++-
 .../InstCombine/double-float-shrink-2.ll      |  4 +--
 llvm/test/Transforms/InstCombine/fabs.ll      |  2 +-
 llvm/test/Transforms/InstCombine/fpcast.ll    | 35 ++++++++++++++++++-
 4 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 2db79228bf0e6..d4a2fe5e37ef5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1917,7 +1917,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
       II->getOperandBundlesAsDefs(OpBundles);
       CallInst *NewCI =
           CallInst::Create(Overload, {InnerTrunc}, OpBundles, II->getName());
-      NewCI->copyFastMathFlags(II);
+      // A normal value may be converted to an infinity. It means that we cannot
+      // propagate ninf from the intrinsic. So we propagate FMF from fptrunc.
+      NewCI->copyFastMathFlags(&FPT);
       return NewCI;
     }
     }
diff --git a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
index f2049e2813ebc..f884d2bd1ab5b 100644
--- a/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
+++ b/llvm/test/Transforms/InstCombine/double-float-shrink-2.ll
@@ -449,7 +449,7 @@ define float @test_shrink_intrin_fabs_fast_double_src(double %D) {
 ; CHECK-NEXT:    ret float [[F]]
 ;
   %E = call fast double @llvm.fabs.f64(double %D)
-  %F = fptrunc double %E to float
+  %F = fptrunc fast double %E to float
   ret float %F
 }
 
@@ -611,7 +611,7 @@ define half @test_mismatched_type_intrin_fabs_fast_double_src(double %D) {
 ; CHECK-NEXT:    ret half [[F]]
 ;
   %E = call fast double @llvm.fabs.f64(double %D)
-  %F = fptrunc double %E to half
+  %F = fptrunc fast double %E to half
   ret half %F
 }
 
diff --git a/llvm/test/Transforms/InstCombine/fabs.ll b/llvm/test/Transforms/InstCombine/fabs.ll
index 0a22d1431b5f1..0d9374410a1d8 100644
--- a/llvm/test/Transforms/InstCombine/fabs.ll
+++ b/llvm/test/Transforms/InstCombine/fabs.ll
@@ -1522,7 +1522,7 @@ define float @test_fabs_nsz_used_by_frem(float %x) {
 define half @test_fabs_nsz_used_by_fptrunc(float %x) {
 ; CHECK-LABEL: @test_fabs_nsz_used_by_fptrunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[X:%.*]] to half
-; CHECK-NEXT:    [[OP:%.*]] = call nsz half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    [[OP:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
 ; CHECK-NEXT:    ret half [[OP]]
 ;
   %cmp = fcmp oge float %x, 0.000000e+00
diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll
index d5290b572aefd..1a3faceebf244 100644
--- a/llvm/test/Transforms/InstCombine/fpcast.ll
+++ b/llvm/test/Transforms/InstCombine/fpcast.ll
@@ -32,7 +32,7 @@ define half @test3(float %a) {
 define half @test3_fast(float %a) {
 ; CHECK-LABEL: @test3_fast(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
-; CHECK-NEXT:    [[C:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    [[C:%.*]] = call fast half @llvm.fabs.f16(half [[TMP1]])
 ; CHECK-NEXT:    ret half [[C]]
 ;
   %b = call float @llvm.fabs.f32(float %a)
@@ -40,6 +40,39 @@ define half @test3_fast(float %a) {
   ret half %c
 }
 
+define half @test3_both_ninf(float %a) {
+; CHECK-LABEL: @test3_both_ninf(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT:    [[C:%.*]] = call ninf half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    ret half [[C]]
+;
+  %b = call ninf float @llvm.fabs.f32(float %a)
+  %c = fptrunc ninf float %b to half
+  ret half %c
+}
+
+define half @test3_fabs_ninf(float %a) {
+; CHECK-LABEL: @test3_fabs_ninf(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT:    [[C:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    ret half [[C]]
+;
+  %b = call ninf float @llvm.fabs.f32(float %a)
+  %c = fptrunc float %b to half
+  ret half %c
+}
+
+define half @test3_fptrunc_ninf(float %a) {
+; CHECK-LABEL: @test3_fptrunc_ninf(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT:    [[C:%.*]] = call ninf half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT:    ret half [[C]]
+;
+  %b = call float @llvm.fabs.f32(float %a)
+  %c = fptrunc ninf float %b to half
+  ret half %c
+}
+
 define half @fneg_fptrunc(float %a) {
 ; CHECK-LABEL: @fneg_fptrunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half

From e2afda6fc95ef63b54d449fc1a9eb13cd0ff3639 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Mon, 16 Jun 2025 12:15:30 +0800
Subject: [PATCH 549/851] [RISCV] Fix incorrect predicates for fp16 permutation
 intrinsics (#144063)

vrgatherei16, vslideup and vslidedown should be supported with fp16 type
for Zvfhmin.

Fixes https://github.com/llvm/llvm-project/issues/143975.
---
 clang/include/clang/Basic/riscv_vector.td           |  2 +-
 clang/include/clang/Basic/riscv_vector_common.td    |  4 ++--
 .../non-policy/non-overloaded/vrgatherei16.c        |  2 +-
 .../non-policy/non-overloaded/vslidedown.c          |  2 +-
 .../non-policy/non-overloaded/vslideup.c            |  2 +-
 .../non-policy/overloaded/vrgatherei16.c            |  2 +-
 .../non-policy/overloaded/vslidedown.c              |  2 +-
 .../non-policy/overloaded/vslideup.c                |  2 +-
 .../policy/non-overloaded/vrgatherei16.c            |  2 +-
 .../policy/non-overloaded/vslidedown.c              |  2 +-
 .../policy/non-overloaded/vslideup.c                |  2 +-
 .../policy/overloaded/vrgatherei16.c                |  2 +-
 .../policy/overloaded/vslidedown.c                  |  2 +-
 .../policy/overloaded/vslideup.c                    |  2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td     | 13 ++++++++++---
 llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll         |  4 ++--
 llvm/test/CodeGen/RISCV/rvv/vslidedown.ll           |  4 ++--
 llvm/test/CodeGen/RISCV/rvv/vslideup.ll             |  4 ++--
 18 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td
index 3e22bfb330af6..c6fd8a1a45fd1 100644
--- a/clang/include/clang/Basic/riscv_vector.td
+++ b/clang/include/clang/Basic/riscv_vector.td
@@ -2397,7 +2397,7 @@ let RequiredFeatures = ["zvfbfmin"] in {
 }
 defm vrgatherei16 : RVVOutBuiltinSet<"vrgatherei16_vv", "csilfd",
                                      [["vv", "v", "vv(Log2EEW:4)Uv"]]>;
-let RequiredFeatures = ["zvfh"] in
+let RequiredFeatures = ["zvfhmin"] in
 defm vrgatherei16 : RVVOutBuiltinSet<"vrgatherei16_vv", "x",
                                      [["vv", "v", "vv(Log2EEW:4)Uv"]]>;
 // unsigned type
diff --git a/clang/include/clang/Basic/riscv_vector_common.td b/clang/include/clang/Basic/riscv_vector_common.td
index c6753978274a0..e3d589699645b 100644
--- a/clang/include/clang/Basic/riscv_vector_common.td
+++ b/clang/include/clang/Basic/riscv_vector_common.td
@@ -593,7 +593,7 @@ let UnMaskedPolicyScheme = HasPolicyOperand,
   multiclass RVVSlideUpBuiltinSet {
     defm "" : RVVOutBuiltinSet<NAME, "csilfd",
                                [["vx","v", "vvvz"]]>;
-    let RequiredFeatures = ["zvfh"] in
+    let RequiredFeatures = ["zvfhmin"] in
       defm "" : RVVOutBuiltinSet<NAME, "x",
                                  [["vx","v", "vvvz"]]>;
     defm "" : RVVOutBuiltinSet<NAME, "csil",
@@ -618,7 +618,7 @@ let UnMaskedPolicyScheme = HasPassthruOperand,
   multiclass RVVSlideDownBuiltinSet {
     defm "" : RVVOutBuiltinSet<NAME, "csilfd",
                                [["vx","v", "vvz"]]>;
-    let RequiredFeatures = ["zvfh"] in
+    let RequiredFeatures = ["zvfhmin"] in
       defm "" : RVVOutBuiltinSet<NAME, "x",
                                  [["vx","v", "vvz"]]>;
     defm "" : RVVOutBuiltinSet<NAME, "csil",
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vrgatherei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vrgatherei16.c
index 32469731d1140..41214f7cdce23 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vrgatherei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vrgatherei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslidedown.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslidedown.c
index c4e7d86e7d536..8b97ce8f760cb 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslidedown.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslidedown.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslideup.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslideup.c
index 0ab387525f6a4..c302b2940bc67 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslideup.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vslideup.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vrgatherei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vrgatherei16.c
index f69613c4777f6..a63f0a59a34e7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vrgatherei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vrgatherei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslidedown.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslidedown.c
index 14b3a99a6f0f5..fb99a750a6707 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslidedown.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslidedown.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslideup.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslideup.c
index 1b3c3f6c0f858..77e8122890ab7 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslideup.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vslideup.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vrgatherei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vrgatherei16.c
index e22da32dbfa84..cf98549c41af2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vrgatherei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vrgatherei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslidedown.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslidedown.c
index 205866db3566c..4f1c00bef0760 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslidedown.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslidedown.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslideup.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslideup.c
index b32264fd88e75..c9fa994e51b3a 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslideup.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vslideup.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vrgatherei16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vrgatherei16.c
index 3d53e46b48859..c50f1f731ffb2 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vrgatherei16.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vrgatherei16.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslidedown.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslidedown.c
index c275ee9bb2f6f..476b9b59dc192 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslidedown.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslidedown.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslideup.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslideup.c
index 9bd602fa5d762..1e0228e17caf9 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslideup.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vslideup.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2
 // REQUIRES: riscv-registered-target
 // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +zfh \
-// RUN:   -target-feature +zvfh -disable-O0-optnone  \
+// RUN:   -target-feature +zvfhmin -disable-O0-optnone  \
 // RUN:   -emit-llvm %s -o - | opt -S -passes=mem2reg | \
 // RUN:   FileCheck --check-prefix=CHECK-RV64 %s
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 22b5b52541d61..5e554d2d03911 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -7414,8 +7414,12 @@ defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllInteger
 defm : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>;
 defm : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>;
 
-defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>;
-defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>;
+defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectorsExceptFP16, uimm5>;
+let Predicates = [HasVInstructionsF16Minimal] in
+  defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFP16Vectors, uimm5>;
+defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectorsExceptFP16, uimm5>;
+let Predicates = [HasVInstructionsF16Minimal] in
+  defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFP16Vectors, uimm5>;
 defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>;
 defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>;
 
@@ -7432,7 +7436,10 @@ defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
 defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
                                 AllBFloatVectors, uimm5>;
 defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
-                              eew=16, vtilist=AllFloatVectors>;
+                              eew=16, vtilist=AllFloatVectorsExceptFP16>;
+let Predicates = [HasVInstructionsF16Minimal] in
+  defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
+                                eew=16, vtilist=AllFP16Vectors>;
 //===----------------------------------------------------------------------===//
 // 16.5. Vector Compress Instruction
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
index f386fd9cd3aeb..75c00e406b4f1 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vrgatherei16.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vrgatherei16.vv.nxv1i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll b/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll
index 2be187c50af26..f0f78c211c4a9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslidedown.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vslidedown.nxv1i8(
diff --git a/llvm/test/CodeGen/RISCV/rvv/vslideup.ll b/llvm/test/CodeGen/RISCV/rvv/vslideup.ll
index 1e3ede7fee9cb..8e3c05611bc71 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vslideup.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vslideup.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
-; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfh \
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+d,+zvfhmin \
 ; RUN:   -verify-machineinstrs | FileCheck %s
 
 declare <vscale x 1 x i8> @llvm.riscv.vslideup.nxv1i8(

From 29fcad000ca63078d28dd231e0727b7811df43b0 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 21:27:12 -0700
Subject: [PATCH 550/851] AVR: Replace deprecated MCExpr::print with
 MCAsmInfo::printExpr

---
 llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
index bf512dd07c0a0..1e4b2e27a1837 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
@@ -14,6 +14,7 @@
 
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -133,7 +134,7 @@ void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     O << formatImm(Op.getImm());
   } else {
     assert(Op.isExpr() && "Unknown operand kind in printOperand");
-    O << *Op.getExpr();
+    MAI.printExpr(O, *Op.getExpr());
   }
 }
 

From f8e0518120cd2850a7f674322bf428bc7d7d3326 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 21:46:08 -0700
Subject: [PATCH 551/851] MC: Adjust -show-inst output for MCExpr

This dump feature does not pass MCAsmInfo to the printer function.
When we remove MCSpecifierExpr subclasses (and the printImpl overrides),
we will not be able to print target-specific specifier strings.
Just print a textual representation.
---
 llvm/lib/MC/MCExpr.cpp                   |   13 +-
 llvm/lib/MC/MCInst.cpp                   |    2 +-
 llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll |  496 ++---
 llvm/test/CodeGen/Mips/llvm-ir/load.ll   | 2316 +++++++++++-----------
 llvm/test/CodeGen/Mips/llvm-ir/store.ll  | 1308 ++++++------
 llvm/test/MC/Lanai/conditional_inst.s    |    6 +-
 llvm/test/MC/Lanai/memory.s              |    4 +-
 7 files changed, 2075 insertions(+), 2070 deletions(-)

diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 5ccad6d487973..89191294f3ed3 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -173,10 +173,15 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI,
     return;
   }
 
-  case MCExpr::Specifier:
-    // TODO: Remove after all targets that use MCSpecifierExpr migrate to
-    // MCAsmInfo::printSpecifierExpr.
-    return cast<MCSpecifierExpr>(this)->printImpl(OS, MAI);
+  case MCExpr::Specifier: {
+    auto &SE = cast<MCSpecifierExpr>(*this);
+    if (MAI)
+      return MAI->printSpecifierExpr(OS, SE);
+    // Used by dump features like -show-inst. Regular MCAsmStreamer output must
+    // set MAI.
+    OS << "specifier(" << SE.getSpecifier() << ',' << *SE.getSubExpr() << ')';
+    return;
+  }
   }
 
   llvm_unreachable("Invalid expression kind!");
diff --git a/llvm/lib/MC/MCInst.cpp b/llvm/lib/MC/MCInst.cpp
index 639619fe4e991..832d25060f880 100644
--- a/llvm/lib/MC/MCInst.cpp
+++ b/llvm/lib/MC/MCInst.cpp
@@ -35,7 +35,7 @@ void MCOperand::print(raw_ostream &OS, const MCRegisterInfo *RegInfo) const {
   else if (isDFPImm())
     OS << "DFPImm:" << bit_cast<double>(getDFPImm());
   else if (isExpr()) {
-    OS << "Expr:(" << *getExpr() << ")";
+    OS << "Expr:" << *getExpr();
   } else if (isInst()) {
     OS << "Inst:(";
     if (const auto *Inst = getInst())
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll b/llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll
index 3bf17abc7965e..79fe2fd26a6e2 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/fptosi.ll
@@ -38,189 +38,189 @@
 define i32 @test1(float %t) {
 ; M32-LABEL: test1:
 ; M32:       # %bb.0: # %entry
-; M32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M32-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; M32-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R2-FP64-LABEL: test1:
 ; M32R2-FP64:       # %bb.0: # %entry
-; M32R2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M32R2-FP64-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R2-SF-LABEL: test1:
 ; M32R2-SF:       # %bb.0: # %entry
-; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #[[#MCINST4:]] ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:-24>>
 ; M32R2-SF-NEXT:    .cfi_def_cfa_offset 24
 ; M32R2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    # <MCInst #[[#MCINST5:]] SW
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
 ; M32R2-SF-NEXT:    .cfi_offset 31, -4
-; M32R2-SF-NEXT:    jal __fixsfsi # <MCInst #{{[0-9]+}} JAL
-; M32R2-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
-; M32R2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    jal __fixsfsi # <MCInst #[[#MCINST6:]] JAL
+; M32R2-SF-NEXT:    # <MCOperand Expr:__fixsfsi>>
+; M32R2-SF-NEXT:    nop # <MCInst #[[#MCINST7:]] SLL
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:0>>
 ; M32R2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    # <MCInst #[[#MCINST8:]] LW
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
-; M32R2-SF-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #[[#MCINST4]] ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:24>>
 ;
 ; M32R3R5-LABEL: test1:
 ; M32R3R5:       # %bb.0: # %entry
-; M32R3R5-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R3R5-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R3R5-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M32R3R5-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R6-LABEL: test1:
 ; M32R6:       # %bb.0: # %entry
-; M32R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M32R6-NEXT:    jr $ra # <MCInst #[[#MCINST9:]] JALR
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M64-LABEL: test1:
 ; M64:       # %bb.0: # %entry
-; M64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M64-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>>
+; M64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M64R6-LABEL: test1:
 ; M64R6:       # %bb.0: # %entry
-; M64R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M64R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST1:]] TRUNC_W_S
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; M64R6-NEXT:    jr $ra # <MCInst #[[#MCINST10:]] JALR64
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>>
+; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3:]] MFC1
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-FP32-LABEL: test1:
 ; MMR2-FP32:       # %bb.0: # %entry
-; MMR2-FP32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-FP32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST11:]] TRUNC_W_S_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR2-FP32-NEXT:    jr $ra # <MCInst #[[#MCINST12:]] JR_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13:]] MFC1_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-FP64-LABEL: test1:
 ; MMR2-FP64:       # %bb.0: # %entry
-; MMR2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST11:]] TRUNC_W_S_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR2-FP64-NEXT:    jr $ra # <MCInst #[[#MCINST12:]] JR_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
+; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13:]] MFC1_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-SF-LABEL: test1:
 ; MMR2-SF:       # %bb.0: # %entry
-; MMR2-SF-NEXT:    addiusp -24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    addiusp -24 # <MCInst #[[#MCINST14:]] ADDIUSP_MM
 ; MMR2-SF-NEXT:    # <MCOperand Imm:-24>>
 ; MMR2-SF-NEXT:    .cfi_def_cfa_offset 24
 ; MMR2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} SWSP_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    # <MCInst #[[#MCINST15:]] SWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
 ; MMR2-SF-NEXT:    .cfi_offset 31, -4
-; MMR2-SF-NEXT:    jal __fixsfsi # <MCInst #{{[0-9]+}} JAL_MM
-; MMR2-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
-; MMR2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    jal __fixsfsi # <MCInst #[[#MCINST16:]] JAL_MM
+; MMR2-SF-NEXT:    # <MCOperand Expr:__fixsfsi>>
+; MMR2-SF-NEXT:    nop # <MCInst #[[#MCINST17:]] SLL_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:0>>
 ; MMR2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} LWSP_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    # <MCInst #[[#MCINST18:]] LWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
-; MMR2-SF-NEXT:    addiusp 24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    addiusp 24 # <MCInst #[[#MCINST14]] ADDIUSP_MM
 ; MMR2-SF-NEXT:    # <MCOperand Imm:24>>
-; MMR2-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-SF-NEXT:    jrc $ra # <MCInst #[[#MCINST19:]] JRC16_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 ;
 ; MMR6-LABEL: test1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MMR6
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #[[#MCINST20:]] TRUNC_W_S_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13:]] MFC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST19:]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>>
 ;
 ; MMR6-SF-LABEL: test1:
 ; MMR6-SF:       # %bb.0: # %entry
-; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #[[#MCINST4:]] ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:-24>>
 ; MMR6-SF-NEXT:    .cfi_def_cfa_offset 24
 ; MMR6-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    # <MCInst #[[#MCINST5:]] SW
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
 ; MMR6-SF-NEXT:    .cfi_offset 31, -4
-; MMR6-SF-NEXT:    balc __fixsfsi # <MCInst #{{[0-9]+}} BALC_MMR6
-; MMR6-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
+; MMR6-SF-NEXT:    balc __fixsfsi # <MCInst #[[#MCINST21:]] BALC_MMR6
+; MMR6-SF-NEXT:    # <MCOperand Expr:__fixsfsi>>
 ; MMR6-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    # <MCInst #[[#MCINST8:]] LW
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
-; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #[[#MCINST4]] ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:24>>
-; MMR6-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-SF-NEXT:    jrc $ra # <MCInst #[[#MCINST19:]] JRC16_MM
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 entry:
   %conv = fptosi float %t to i32
   ret i32 %conv
@@ -229,189 +229,189 @@ entry:
 define i32 @test2(double %t) {
 ; M32-LABEL: test2:
 ; M32:       # %bb.0: # %entry
-; M32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D32
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST22:]] TRUNC_W_D32
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>>
+; M32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R2-FP64-LABEL: test2:
 ; M32R2-FP64:       # %bb.0: # %entry
-; M32R2-FP64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R2-FP64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST23:]] TRUNC_W_D64
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; M32R2-FP64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R2-SF-LABEL: test2:
 ; M32R2-SF:       # %bb.0: # %entry
-; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #[[#MCINST4]] ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:-24>>
 ; M32R2-SF-NEXT:    .cfi_def_cfa_offset 24
 ; M32R2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    # <MCInst #[[#MCINST5]] SW
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
 ; M32R2-SF-NEXT:    .cfi_offset 31, -4
-; M32R2-SF-NEXT:    jal __fixdfsi # <MCInst #{{[0-9]+}} JAL
-; M32R2-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
-; M32R2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    jal __fixdfsi # <MCInst #[[#MCINST6]] JAL
+; M32R2-SF-NEXT:    # <MCOperand Expr:__fixdfsi>>
+; M32R2-SF-NEXT:    nop # <MCInst #[[#MCINST7]] SLL
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:0>>
 ; M32R2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    # <MCInst #[[#MCINST8]] LW
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
-; M32R2-SF-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; M32R2-SF-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #[[#MCINST4]] ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; M32R2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; M32R2-SF-NEXT:    # <MCOperand Imm:24>>
 ;
 ; M32R3R5-LABEL: test2:
 ; M32R3R5:       # %bb.0: # %entry
-; M32R3R5-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D32
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R3R5-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R3R5-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R3R5-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST22:]] TRUNC_W_D32
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>>
+; M32R3R5-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M32R3R5-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M32R6-LABEL: test2:
 ; M32R6:       # %bb.0: # %entry
-; M32R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M32R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST23:]] TRUNC_W_D64
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; M32R6-NEXT:    jr $ra # <MCInst #[[#MCINST9]] JALR
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M64-LABEL: test2:
 ; M64:       # %bb.0: # %entry
-; M64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST23:]] TRUNC_W_D64
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; M64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG7]]>>
+; M64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; M64R6-LABEL: test2:
 ; M64R6:       # %bb.0: # %entry
-; M64R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; M64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; M64R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST23:]] TRUNC_W_D64
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; M64R6-NEXT:    jr $ra # <MCInst #[[#MCINST10]] JALR64
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>>
+; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST3]] MFC1
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; M64R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-FP32-LABEL: test2:
 ; MMR2-FP32:       # %bb.0: # %entry
-; MMR2-FP32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-FP32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST24:]] TRUNC_W_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>>
+; MMR2-FP32-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JR_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13]] MFC1_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-FP64-LABEL: test2:
 ; MMR2-FP64:       # %bb.0: # %entry
-; MMR2-FP64-NEXT:    cvt.w.d $f0, $f12 # <MCInst #{{[0-9]+}} CVT_W_D64_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-FP64-NEXT:    cvt.w.d $f0, $f12 # <MCInst #[[#MCINST25:]] CVT_W_D64_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; MMR2-FP64-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JR_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
+; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13]] MFC1_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
 ;
 ; MMR2-SF-LABEL: test2:
 ; MMR2-SF:       # %bb.0: # %entry
-; MMR2-SF-NEXT:    addiusp -24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    addiusp -24 # <MCInst #[[#MCINST14]] ADDIUSP_MM
 ; MMR2-SF-NEXT:    # <MCOperand Imm:-24>>
 ; MMR2-SF-NEXT:    .cfi_def_cfa_offset 24
 ; MMR2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} SWSP_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    # <MCInst #[[#MCINST15]] SWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
 ; MMR2-SF-NEXT:    .cfi_offset 31, -4
-; MMR2-SF-NEXT:    jal __fixdfsi # <MCInst #{{[0-9]+}} JAL_MM
-; MMR2-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
-; MMR2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    jal __fixdfsi # <MCInst #[[#MCINST16]] JAL_MM
+; MMR2-SF-NEXT:    # <MCOperand Expr:__fixdfsi>>
+; MMR2-SF-NEXT:    nop # <MCInst #[[#MCINST17]] SLL_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG6]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:0>>
 ; MMR2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} LWSP_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR2-SF-NEXT:    # <MCInst #[[#MCINST18]] LWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
-; MMR2-SF-NEXT:    addiusp 24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    addiusp 24 # <MCInst #[[#MCINST14]] ADDIUSP_MM
 ; MMR2-SF-NEXT:    # <MCOperand Imm:24>>
-; MMR2-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR2-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR2-SF-NEXT:    jrc $ra # <MCInst #[[#MCINST19]] JRC16_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 ;
 ; MMR6-LABEL: test2:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D_MMR6
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #[[#MCINST26:]] TRUNC_W_D_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>>
+; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #[[#MCINST13]] MFC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST19]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 ;
 ; MMR6-SF-LABEL: test2:
 ; MMR6-SF:       # %bb.0: # %entry
-; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #[[#MCINST4]] ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:-24>>
 ; MMR6-SF-NEXT:    .cfi_def_cfa_offset 24
 ; MMR6-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
-; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    # <MCInst #[[#MCINST5]] SW
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
 ; MMR6-SF-NEXT:    .cfi_offset 31, -4
-; MMR6-SF-NEXT:    balc __fixdfsi # <MCInst #{{[0-9]+}} BALC_MMR6
-; MMR6-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
+; MMR6-SF-NEXT:    balc __fixdfsi # <MCInst #[[#MCINST21]] BALC_MMR6
+; MMR6-SF-NEXT:    # <MCOperand Expr:__fixdfsi>>
 ; MMR6-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
-; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    # <MCInst #[[#MCINST8]] LW
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
-; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #[[#MCINST4]] ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MMR6-SF-NEXT:    # <MCOperand Imm:24>>
-; MMR6-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-SF-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-SF-NEXT:    jrc $ra # <MCInst #[[#MCINST19]] JRC16_MM
+; MMR6-SF-NEXT:    # <MCOperand Reg:[[#MCREG3]]>>
 entry:
   %conv = fptosi double %t to i32
   ret i32 %conv
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/load.ll b/llvm/test/CodeGen/Mips/llvm-ir/load.ll
index b96bdff227cae..ee858ac94aed6 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/load.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/load.ll
@@ -25,161 +25,161 @@
 define i8 @f1() {
 ; MIPS32-LABEL: f1:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR3-LABEL: f1:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5:]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR3-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST6:]] LBu_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R6-LABEL: f1:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7:]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32R6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR6-LABEL: f1:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(a))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST6:]] LBu_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8:]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
 ;
 ; MIPS3-LABEL: f1:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS3-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64-LABEL: f1:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS64-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64R6-LABEL: f1:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12:]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS64R6-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR5FP64-LABEL: f1:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5:]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR5FP64-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST6:]] LBu_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R5FP643-LABEL: f1:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lbu $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LBu
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32R5FP643-NEXT:    lbu $2, %lo(a)($1) # <MCInst #[[#MCINST3:]] LBu
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 entry:
   %0 = load i8, ptr @a
   ret i8 %0
@@ -188,161 +188,161 @@ entry:
 define i32 @f2() {
 ; MIPS32-LABEL: f2:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR3-LABEL: f2:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST14:]] LB_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R6-LABEL: f2:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR6-LABEL: f2:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR6-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(a))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR6-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST14:]] LB_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f2:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64-LABEL: f2:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64R6-LABEL: f2:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR5FP64-LABEL: f2:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST14:]] LB_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R5FP643-LABEL: f2:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lb $2, %lo(a)($1) # <MCInst #{{[0-9]+}} LB
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lb $2, %lo(a)($1) # <MCInst #[[#MCINST13:]] LB
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 entry:
   %0 = load i8, ptr @a
   %1 = sext i8 %0 to i32
@@ -352,161 +352,161 @@ entry:
 define i16 @f3() {
 ; MIPS32-LABEL: f3:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR3-LABEL: f3:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST16:]] LHu_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R6-LABEL: f3:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR6-LABEL: f3:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(b))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST16:]] LHu_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f3:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64-LABEL: f3:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64R6-LABEL: f3:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR5FP64-LABEL: f3:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST16:]] LHu_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R5FP643-LABEL: f3:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lhu $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LHu
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lhu $2, %lo(b)($1) # <MCInst #[[#MCINST15:]] LHu
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 entry:
   %0 = load i16, ptr @b
   ret i16 %0
@@ -515,161 +515,161 @@ entry:
 define i32 @f4() {
 ; MIPS32-LABEL: f4:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR3-LABEL: f4:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST18:]] LH_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R6-LABEL: f4:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR6-LABEL: f4:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR6-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(b))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR6-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST18:]] LH_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f4:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64-LABEL: f4:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64R6-LABEL: f4:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR5FP64-LABEL: f4:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST18:]] LH_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R5FP643-LABEL: f4:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lh $2, %lo(b)($1) # <MCInst #{{[0-9]+}} LH
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lh $2, %lo(b)($1) # <MCInst #[[#MCINST17:]] LH
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 entry:
   %0 = load i16, ptr @b
   %1 = sext i16 %0 to i32
@@ -679,161 +679,161 @@ entry:
 define i32 @f5() {
 ; MIPS32-LABEL: f5:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR3-LABEL: f5:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST20:]] LW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R6-LABEL: f5:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR6-LABEL: f5:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR6-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR6-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST20:]] LW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f5:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64-LABEL: f5:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64R6-LABEL: f5:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR5FP64-LABEL: f5:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST20:]] LW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R5FP643-LABEL: f5:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST19:]] LW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 entry:
   %0 = load i32, ptr @c
   ret i32 %0
@@ -842,181 +842,181 @@ entry:
 define i64 @f6() {
 ; MIPS32-LABEL: f6:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    addiu $2, $zero, 0 # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    addiu $2, $zero, 0 # <MCInst #[[#MCINST21:]] ADDiu
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
 ; MIPS32-NEXT:    # <MCOperand Imm:0>>
 ;
 ; MMR3-LABEL: f6:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR3-NEXT:    li16 $2, 0 # <MCInst #{{[0-9]+}} LI16_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR3-NEXT:    li16 $2, 0 # <MCInst #[[#MCINST22:]] LI16_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
 ; MMR3-NEXT:    # <MCOperand Imm:0>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R6-LABEL: f6:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R6-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    addiu $2, $zero, 0 # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R6-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    addiu $2, $zero, 0 # <MCInst #[[#MCINST21:]] ADDiu
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
 ; MIPS32R6-NEXT:    # <MCOperand Imm:0>>
 ;
 ; MMR6-LABEL: f6:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR6-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR6-NEXT:    li16 $2, 0 # <MCInst #{{[0-9]+}} LI16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR6-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR6-NEXT:    li16 $2, 0 # <MCInst #[[#MCINST22:]] LI16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
 ; MMR6-NEXT:    # <MCOperand Imm:0>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f6:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lwu $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LWu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lwu $2, %lo(c)($1) # <MCInst #[[#MCINST23:]] LWu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64-LABEL: f6:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lwu $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LWu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lwu $2, %lo(c)($1) # <MCInst #[[#MCINST23:]] LWu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64R6-LABEL: f6:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lwu $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LWu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lwu $2, %lo(c)($1) # <MCInst #[[#MCINST23:]] LWu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR5FP64-LABEL: f6:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR5FP64-NEXT:    li16 $2, 0 # <MCInst #{{[0-9]+}} LI16_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR5FP64-NEXT:    li16 $2, 0 # <MCInst #[[#MCINST22:]] LI16_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
 ; MMR5FP64-NEXT:    # <MCOperand Imm:0>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R5FP643-LABEL: f6:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R5FP643-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    addiu $2, $zero, 0 # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R5FP643-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    addiu $2, $zero, 0 # <MCInst #[[#MCINST21:]] ADDiu
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
 ; MIPS32R5FP643-NEXT:    # <MCOperand Imm:0>>
 entry:
   %0 = load i32, ptr @c
@@ -1027,184 +1027,184 @@ entry:
 define i64 @f7() {
 ; MIPS32-LABEL: f7:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST24:]] SRA
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MIPS32-NEXT:    # <MCOperand Imm:31>>
 ;
 ; MMR3-LABEL: f7:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR3-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR3-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST25:]] SRA_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MMR3-NEXT:    # <MCOperand Imm:31>>
 ;
 ; MIPS32R6-LABEL: f7:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R6-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R6-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST24:]] SRA
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MIPS32R6-NEXT:    # <MCOperand Imm:31>>
 ;
 ; MMR6-LABEL: f7:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR6-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR6-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR6-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR6-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST25:]] SRA_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MMR6-NEXT:    # <MCOperand Imm:31>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f7:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST26:]] LW64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG9]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64-LABEL: f7:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST26:]] LW64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG9]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64R6-LABEL: f7:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #{{[0-9]+}} LW64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lw $2, %lo(c)($1) # <MCInst #[[#MCINST26:]] LW64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG9]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR5FP64-LABEL: f7:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR5FP64-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR5FP64-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST20]] LW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST25:]] SRA_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MMR5FP64-NEXT:    # <MCOperand Imm:31>>
 ;
 ; MIPS32R5FP643-LABEL: f7:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R5FP643-NEXT:    lw $3, %lo(c)($1) # <MCInst #{{[0-9]+}} LW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sra $2, $3, 31 # <MCInst #{{[0-9]+}} SRA
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R5FP643-NEXT:    lw $3, %lo(c)($1) # <MCInst #[[#MCINST19]] LW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sra $2, $3, 31 # <MCInst #[[#MCINST24:]] SRA
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
 ; MIPS32R5FP643-NEXT:    # <MCOperand Imm:31>>
 entry:
   %0 = load i32, ptr @c
@@ -1215,161 +1215,161 @@ entry:
 define float @f8() {
 ; MIPS32-LABEL: f8:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR3-LABEL: f8:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MMR3-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST28:]] LWC1_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS32R6-LABEL: f8:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32R6-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR6-LABEL: f8:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(e))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST28:]] LWC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f8:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS64-LABEL: f8:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS64R6-LABEL: f8:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR5FP64-LABEL: f8:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MMR5FP64-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST28:]] LWC1_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS32R5FP643-LABEL: f8:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #{{[0-9]+}} LWC1
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    lwc1 $f0, %lo(e)($1) # <MCInst #[[#MCINST27:]] LWC1
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 entry:
   %0 = load float, ptr @e
   ret float %0
@@ -1378,161 +1378,161 @@ entry:
 define double @f9() {
 ; MIPS32-LABEL: f9:
 ; MIPS32:       # %bb.0: # %entry
-; MIPS32-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC1
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST29:]] LDC1
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR3-LABEL: f9:
 ; MMR3:       # %bb.0: # %entry
-; MMR3-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC1_MM_D32
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MMR3-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST30:]] LDC1_MM_D32
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS32R6-LABEL: f9:
 ; MIPS32R6:       # %bb.0: # %entry
-; MIPS32R6-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32R6-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR6-LABEL: f9:
 ; MMR6:       # %bb.0: # %entry
-; MMR6-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC1_D64_MMR6
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(f))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST32:]] LDC1_D64_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS3-LABEL: f9:
 ; MIPS3:       # %bb.0: # %entry
-; MIPS3-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS3-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS3-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS3-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS3-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS3-NEXT:    # <MCOperand Imm:16>>
-; MIPS3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS3-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS3-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS3-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS3-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS3-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS3-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS64-LABEL: f9:
 ; MIPS64:       # %bb.0: # %entry
-; MIPS64-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS64-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS64-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS64-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64-NEXT:    # <MCOperand Imm:16>>
-; MIPS64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS64-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS64-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS64R6-LABEL: f9:
 ; MIPS64R6:       # %bb.0: # %entry
-; MIPS64R6-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST12]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR5FP64-LABEL: f9:
 ; MMR5FP64:       # %bb.0: # %entry
-; MMR5FP64-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC1_MM_D64
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MMR5FP64-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST33:]] LDC1_MM_D64
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS32R5FP643-LABEL: f9:
 ; MIPS32R5FP643:       # %bb.0: # %entry
-; MIPS32R5FP643-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #{{[0-9]+}} LDC164
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    ldc1 $f0, %lo(f)($1) # <MCInst #[[#MCINST31:]] LDC164
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 entry:
   %0 = load double, ptr @f
   ret double %0
diff --git a/llvm/test/CodeGen/Mips/llvm-ir/store.ll b/llvm/test/CodeGen/Mips/llvm-ir/store.ll
index 3922db72f2a7c..880a0f522574b 100644
--- a/llvm/test/CodeGen/Mips/llvm-ir/store.ll
+++ b/llvm/test/CodeGen/Mips/llvm-ir/store.ll
@@ -24,133 +24,133 @@
 define void @f1(i8 %a) {
 ; MIPS32-LABEL: f1:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST3:]] SB
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR3-LABEL: f1:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR3-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5:]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR3-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST6:]] SB_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R6-LABEL: f1:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7:]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32R6-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST3:]] SB
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR6-LABEL: f1:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR6-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(a))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR6-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST6:]] SB_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8:]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
 ;
 ; MIPS4-LABEL: f1:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS4-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST12:]] SB64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS64R6-LABEL: f1:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(a))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(a) # <MCInst #[[#MCINST9:]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5:]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,a)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(a) # <MCInst #[[#MCINST10:]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11:]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(a) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13:]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6:]]>>
+; MIPS64R6-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST12:]] SB64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MMR5FP64-LABEL: f1:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MMR5FP64-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST4:]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5:]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MMR5FP64-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST6:]] SB_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
 ;
 ; MIPS32R5FP643-LABEL: f1:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(a))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sb $4, %lo(a)($1) # <MCInst #{{[0-9]+}} SB
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(a))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(a) # <MCInst #[[#MCINST1:]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,a)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2:]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2:]]>>
+; MIPS32R5FP643-NEXT:    sb $4, %lo(a)($1) # <MCInst #[[#MCINST3:]] SB
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,a)>>
   store i8 %a, ptr @a
   ret void
 }
@@ -158,133 +158,133 @@ define void @f1(i8 %a) {
 define void @f2(i16 %a) {
 ; MIPS32-LABEL: f2:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST14:]] SH
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR3-LABEL: f2:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR3-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST15:]] SH_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R6-LABEL: f2:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST14:]] SH
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR6-LABEL: f2:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR6-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(b))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR6-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST15:]] SH_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f2:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST16:]] SH64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS64R6-LABEL: f2:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(b))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(b) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,b)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(b) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST16:]] SH64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MMR5FP64-LABEL: f2:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MMR5FP64-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST15:]] SH_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
 ;
 ; MIPS32R5FP643-LABEL: f2:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(b))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sh $4, %lo(b)($1) # <MCInst #{{[0-9]+}} SH
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(b))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(b) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,b)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sh $4, %lo(b)($1) # <MCInst #[[#MCINST14:]] SH
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,b)>>
   store i16 %a, ptr @b
   ret void
 }
@@ -292,133 +292,133 @@ define void @f2(i16 %a) {
 define void @f3(i32 %a) {
 ; MIPS32-LABEL: f3:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST17:]] SW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR3-LABEL: f3:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR3-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST18:]] SW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R6-LABEL: f3:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32R6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST17:]] SW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR6-LABEL: f3:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR6-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(c))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR6-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST18:]] SW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f3:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST19:]] SW64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS64R6-LABEL: f3:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(c))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(c) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,c)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(c) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST19:]] SW64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MMR5FP64-LABEL: f3:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MMR5FP64-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST18:]] SW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
 ;
 ; MIPS32R5FP643-LABEL: f3:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(c))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sw $4, %lo(c)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(c))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(c) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,c)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sw $4, %lo(c)($1) # <MCInst #[[#MCINST17:]] SW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,c)>>
   store i32 %a, ptr @c
   ret void
 }
@@ -426,180 +426,180 @@ define void @f3(i32 %a) {
 define void @f4(i64 %a) {
 ; MIPS32-LABEL: f4:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS32-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32-NEXT:    addiu $1, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sw $5, 4($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS32-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32-NEXT:    addiu $1, $1, %lo(d) # <MCInst #[[#MCINST20:]] ADDiu
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sw $5, 4($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
 ; MIPS32-NEXT:    # <MCOperand Imm:4>>
 ;
 ; MMR3-LABEL: f4:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MMR3-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR3-NEXT:    addiu $2, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR3-NEXT:    sw16 $5, 4($2) # <MCInst #{{[0-9]+}} SW16_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR3-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MMR3-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST18]] SW_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR3-NEXT:    addiu $2, $1, %lo(d) # <MCInst #[[#MCINST21:]] ADDiu_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR3-NEXT:    sw16 $5, 4($2) # <MCInst #[[#MCINST22:]] SW16_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG10]]>
 ; MMR3-NEXT:    # <MCOperand Imm:4>>
-; MMR3-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR3-NEXT:    jrc $ra # <MCInst #[[#MCINST8:]] JRC16_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS32R6-LABEL: f4:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS32R6-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32R6-NEXT:    addiu $1, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sw $5, 4($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R6-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS32R6-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32R6-NEXT:    addiu $1, $1, %lo(d) # <MCInst #[[#MCINST20:]] ADDiu
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sw $5, 4($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
 ; MIPS32R6-NEXT:    # <MCOperand Imm:4>>
 ;
 ; MMR6-LABEL: f4:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MMR6-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR6-NEXT:    addiu $2, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR6-NEXT:    sw16 $5, 4($2) # <MCInst #{{[0-9]+}} SW16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR6-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MMR6-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST18]] SW_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR6-NEXT:    addiu $2, $1, %lo(d) # <MCInst #[[#MCINST21:]] ADDiu_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR6-NEXT:    sw16 $5, 4($2) # <MCInst #[[#MCINST22:]] SW16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG10]]>
 ; MMR6-NEXT:    # <MCOperand Imm:4>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f4:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(d) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(d))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(d) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(d))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(d) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,d)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(d) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,d)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(d) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(d) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sd $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SD
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(d))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    sd $4, %lo(d)($1) # <MCInst #[[#MCINST23:]] SD
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
 ;
 ; MIPS64R6-LABEL: f4:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(d) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(d))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(d) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(d))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(d) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,d)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(d) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,d)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(d) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(d) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sd $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SD
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(d))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    sd $4, %lo(d)($1) # <MCInst #[[#MCINST23:]] SD
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG7]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
 ;
 ; MMR5FP64-LABEL: f4:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MMR5FP64-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR5FP64-NEXT:    addiu $2, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MMR5FP64-NEXT:    sw16 $5, 4($2) # <MCInst #{{[0-9]+}} SW16_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MMR5FP64-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MMR5FP64-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST18]] SW_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR5FP64-NEXT:    addiu $2, $1, %lo(d) # <MCInst #[[#MCINST21:]] ADDiu_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG10:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MMR5FP64-NEXT:    sw16 $5, 4($2) # <MCInst #[[#MCINST22:]] SW16_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG10]]>
 ; MMR5FP64-NEXT:    # <MCOperand Imm:4>>
-; MMR5FP64-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR5FP64-NEXT:    jrc $ra # <MCInst #[[#MCINST8:]] JRC16_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS32R5FP643-LABEL: f4:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(d) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(d))>>
-; MIPS32R5FP643-NEXT:    sw $4, %lo(d)($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32R5FP643-NEXT:    addiu $1, $1, %lo(d) # <MCInst #{{[0-9]+}} ADDiu
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(d))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sw $5, 4($1) # <MCInst #{{[0-9]+}} SW
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(d) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,d)>>
+; MIPS32R5FP643-NEXT:    sw $4, %lo(d)($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG3]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32R5FP643-NEXT:    addiu $1, $1, %lo(d) # <MCInst #[[#MCINST20:]] ADDiu
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,d)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sw $5, 4($1) # <MCInst #[[#MCINST17]] SW
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG9:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
 ; MIPS32R5FP643-NEXT:    # <MCOperand Imm:4>>
   store i64 %a, ptr @d
   ret void
@@ -608,133 +608,133 @@ define void @f4(i64 %a) {
 define void @f5(float %e) {
 ; MIPS32-LABEL: f5:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR3-LABEL: f5:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MMR3-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST25:]] SWC1_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS32R6-LABEL: f5:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32R6-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR6-LABEL: f5:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(e))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST25:]] SWC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f5:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS64R6-LABEL: f5:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(e) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(e))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(e))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(e) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,e)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,e)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(e) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MMR5FP64-LABEL: f5:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MMR5FP64-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST25:]] SWC1_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
 ;
 ; MIPS32R5FP643-LABEL: f5:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(e) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(e))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #{{[0-9]+}} SWC1
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(e))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(e) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,e)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    swc1 $f12, %lo(e)($1) # <MCInst #[[#MCINST24:]] SWC1
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG11:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,e)>>
   store float %e, ptr @e
   ret void
 }
@@ -742,133 +742,133 @@ define void @f5(float %e) {
 define void @f6(double %f) {
 ; MIPS32-LABEL: f6:
 ; MIPS32:       # %bb.0:
-; MIPS32-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC1
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST26:]] SDC1
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MIPS32-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR3-LABEL: f6:
 ; MMR3:       # %bb.0:
-; MMR3-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR3-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR3-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC1_MM_D32
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR3-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MMR3-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR3-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR3-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST27:]] SDC1_MM_D32
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG12:]]>
+; MMR3-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR3-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS32R6-LABEL: f6:
 ; MIPS32R6:       # %bb.0:
-; MIPS32R6-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC164
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R6-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32R6-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32R6-NEXT:    jr $ra # <MCInst #[[#MCINST7]] JALR
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG4]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST28:]] SDC164
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MIPS32R6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR6-LABEL: f6:
 ; MMR6:       # %bb.0:
-; MMR6-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC1_D64_MMR6
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR6-NEXT:    # <MCOperand Expr:(%lo(f))>>
-; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
-; MMR6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
+; MMR6-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST29:]] SDC1_D64_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
+; MMR6-NEXT:    jrc $ra # <MCInst #[[#MCINST8]] JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
 ;
 ; MIPS4-LABEL: f6:
 ; MIPS4:       # %bb.0:
-; MIPS4-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS4-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS4-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS4-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS4-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS4-NEXT:    # <MCOperand Imm:16>>
-; MIPS4-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS4-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC164
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS4-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS4-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS4-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST28:]] SDC164
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MIPS4-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS4-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS64R6-LABEL: f6:
 ; MIPS64R6:       # %bb.0:
-; MIPS64R6-NEXT:    lui $1, %highest(f) # <MCInst #{{[0-9]+}} LUi64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%highest(f))>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%higher(f))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    lui $1, %highest(f) # <MCInst #[[#MCINST9]] LUi64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4030,f)>>
+; MIPS64R6-NEXT:    daddiu $1, $1, %higher(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4029,f)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #{{[0-9]+}} DADDiu
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #{{[0-9]+}} DSLL
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
+; MIPS64R6-NEXT:    daddiu $1, $1, %hi(f) # <MCInst #[[#MCINST10]] DADDiu
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS64R6-NEXT:    dsll $1, $1, 16 # <MCInst #[[#MCINST11]] DSLL
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
 ; MIPS64R6-NEXT:    # <MCOperand Imm:16>>
-; MIPS64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS64R6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC164
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS64R6-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS64R6-NEXT:    jr $ra # <MCInst #[[#MCINST13]] JALR64
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG8]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG6]]>>
+; MIPS64R6-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST28:]] SDC164
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MIPS64R6-NEXT:    # <MCOperand Reg:[[#MCREG5]]>
+; MIPS64R6-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MMR5FP64-LABEL: f6:
 ; MMR5FP64:       # %bb.0:
-; MMR5FP64-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MMR5FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MMR5FP64-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC1_MM_D64
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MMR5FP64-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MMR5FP64-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST4]] LUi_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MMR5FP64-NEXT:    jr $ra # <MCInst #[[#MCINST5]] JR_MM
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MMR5FP64-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST30:]] SDC1_MM_D64
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MMR5FP64-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MMR5FP64-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
 ;
 ; MIPS32R5FP643-LABEL: f6:
 ; MIPS32R5FP643:       # %bb.0:
-; MIPS32R5FP643-NEXT:    lui $1, %hi(f) # <MCInst #{{[0-9]+}} LUi
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%hi(f))>>
-; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>>
-; MIPS32R5FP643-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #{{[0-9]+}} SDC164
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Reg:{{[0-9]+}}>
-; MIPS32R5FP643-NEXT:    # <MCOperand Expr:(%lo(f))>>
+; MIPS32R5FP643-NEXT:    lui $1, %hi(f) # <MCInst #[[#MCINST1]] LUi
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4028,f)>>
+; MIPS32R5FP643-NEXT:    jr $ra # <MCInst #[[#MCINST2]] JR
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG2]]>>
+; MIPS32R5FP643-NEXT:    sdc1 $f12, %lo(f)($1) # <MCInst #[[#MCINST28:]] SDC164
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG13:]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Reg:[[#MCREG1]]>
+; MIPS32R5FP643-NEXT:    # <MCOperand Expr:specifier(4031,f)>>
   store double %f, ptr @f
   ret void
 }
diff --git a/llvm/test/MC/Lanai/conditional_inst.s b/llvm/test/MC/Lanai/conditional_inst.s
index d167d1af00eb3..a0a8caf269fe8 100644
--- a/llvm/test/MC/Lanai/conditional_inst.s
+++ b/llvm/test/MC/Lanai/conditional_inst.s
@@ -27,14 +27,14 @@ jump2:
 ! CHECK: encoding: [0b1110110A,A,A,0x01'A']
 ! CHECK-NEXT: fixup A - offset: 0, value: jump1, kind: FIXUP_LANAI_25
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} BRCC{{$}}
-! CHECK-NEXT: <MCOperand Expr:(jump1)>
+! CHECK-NEXT: <MCOperand Expr:specifier(0,jump1)>
 ! CHECK-NEXT: <MCOperand Imm:13>
 
     bpl jump2
 ! CHECK: encoding: [0b1110101A,A,A,A]
 ! CHECK-NEXT: fixup A - offset: 0, value: jump2, kind: FIXUP_LANAI_25
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} BRCC{{$}}
-! CHECK-NEXT: <MCOperand Expr:(jump2)>
+! CHECK-NEXT: <MCOperand Expr:specifier(0,jump2)>
 ! CHECK-NEXT: <MCOperand Imm:10>
 
     bt .
@@ -43,7 +43,7 @@ jump2:
 ! CHECK:      encoding: [0b1110000A,A,A,A]
 ! CHECK-NEXT:   fixup A - offset: 0, value: .Ltmp0, kind: FIXUP_LANAI_25
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} BT{{$}}
-! CHECK-NEXT:   <MCOperand Expr:(.Ltmp0)>
+! CHECK-NEXT:   <MCOperand Expr:.Ltmp0>
 
 ! SCC
     spl %r19
diff --git a/llvm/test/MC/Lanai/memory.s b/llvm/test/MC/Lanai/memory.s
index 41dc8fba7bf29..0e6234645a80d 100644
--- a/llvm/test/MC/Lanai/memory.s
+++ b/llvm/test/MC/Lanai/memory.s
@@ -235,7 +235,7 @@
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} ADD_I_HI
 ! CHECK-NEXT: <MCOperand Reg:11>
 ! CHECK-NEXT: <MCOperand Reg:7>
-! CHECK-NEXT: <MCOperand Expr:(hi(x))>
+! CHECK-NEXT: <MCOperand Expr:specifier(1,x)>
 
     mov hi(l+4), %r7
 ! CHECK: encoding: [0x03,0x81,A,A]
@@ -243,5 +243,5 @@
 ! CHECK-NEXT: <MCInst #{{[0-9]+}} ADD_I_HI
 ! CHECK-NEXT: <MCOperand Reg:14>
 ! CHECK-NEXT: <MCOperand Reg:7>
-! CHECK-NEXT: <MCOperand Expr:(hi(l)+4)>
+! CHECK-NEXT: <MCOperand Expr:specifier(1,l)+4>
 

From 05a9ad977624c4f6def7c0f4cf7103e28d6c6541 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 21:50:48 -0700
Subject: [PATCH 552/851] Lanai: Migrate to the new relocation specifier
 representation

Use MCSpecifierExpr directly and remove the LanaiMCExpr subclass. Define
MCSpecifierExpr::printImpl to print the relocation specifier in decimal
for llvm-mc -show-inst. The output is not guaranteed to be stable.

Depends on f8e0518120cd2850a7f674322bf428bc7d7d3326
("MC: Adjust -show-inst output for MCExpr")
---
 .../Target/Lanai/AsmParser/LanaiAsmParser.cpp | 104 +++++++++---------
 llvm/lib/Target/Lanai/LanaiMCInstLower.cpp    |  13 +--
 .../Target/Lanai/MCTargetDesc/CMakeLists.txt  |   1 -
 .../Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp     |  26 ++++-
 .../Lanai/MCTargetDesc/LanaiMCAsmInfo.h       |   7 ++
 .../Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp |  14 +--
 .../Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp |  44 --------
 .../Target/Lanai/MCTargetDesc/LanaiMCExpr.h   |  33 ------
 8 files changed, 95 insertions(+), 147 deletions(-)
 delete mode 100644 llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
 delete mode 100644 llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h

diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index b2fcd7af07331..9cb7f71945d1d 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -9,7 +9,7 @@
 #include "LanaiAluCode.h"
 #include "LanaiCondCode.h"
 #include "LanaiInstrInfo.h"
-#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCAsmInfo.h"
 #include "TargetInfo/LanaiTargetInfo.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -231,14 +231,14 @@ struct LanaiOperand : public MCParsedAsmOperand {
     }
 
     // Symbolic reference expression
-    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
-      return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_HI;
+    if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Imm.Value))
+      return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_HI;
 
     // Binary expression
     if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
-      if (const LanaiMCExpr *SymbolRefExpr =
-              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
-        return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_HI;
+      if (const auto *SymbolRefExpr =
+              dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_HI;
 
     return false;
   }
@@ -268,14 +268,14 @@ struct LanaiOperand : public MCParsedAsmOperand {
     }
 
     // Symbolic reference expression
-    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
-      return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO;
+    if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Imm.Value))
+      return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO;
 
     // Binary expression
     if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
-      if (const LanaiMCExpr *SymbolRefExpr =
-              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
-        return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO;
+      if (const auto *SymbolRefExpr =
+              dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO;
 
     return false;
   }
@@ -292,14 +292,14 @@ struct LanaiOperand : public MCParsedAsmOperand {
     }
 
     // Symbolic reference expression
-    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
-      return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO;
+    if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Imm.Value))
+      return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO;
 
     // Binary expression
     if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value))
-      if (const LanaiMCExpr *SymbolRefExpr =
-              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
-        return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO;
+      if (const auto *SymbolRefExpr =
+              dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO;
 
     return false;
   }
@@ -339,8 +339,8 @@ struct LanaiOperand : public MCParsedAsmOperand {
     }
 
     // Symbolic reference expression
-    if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Imm.Value))
-      return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None;
+    if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Imm.Value))
+      return SymbolRefExpr->getSpecifier() == Lanai::S_None;
     if (const MCSymbolRefExpr *SymbolRefExpr =
             dyn_cast<MCSymbolRefExpr>(Imm.Value)) {
       return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
@@ -348,9 +348,9 @@ struct LanaiOperand : public MCParsedAsmOperand {
 
     // Binary expression
     if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Imm.Value)) {
-      if (const LanaiMCExpr *SymbolRefExpr =
-              dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS()))
-        return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None;
+      if (const auto *SymbolRefExpr =
+              dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS()))
+        return SymbolRefExpr->getSpecifier() == Lanai::S_None;
       if (const MCSymbolRefExpr *SymbolRefExpr =
               dyn_cast<MCSymbolRefExpr>(BinaryExpr->getLHS()))
         return SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None;
@@ -464,19 +464,18 @@ struct LanaiOperand : public MCParsedAsmOperand {
     if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
       Inst.addOperand(
           MCOperand::createImm(static_cast<int32_t>(ConstExpr->getValue())));
-    else if (isa<LanaiMCExpr>(getImm())) {
+    else if (isa<MCSpecifierExpr>(getImm())) {
 #ifndef NDEBUG
-      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
-      assert(SymbolRefExpr &&
-             SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_LO);
+      const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(getImm());
+      assert(SymbolRefExpr && SymbolRefExpr->getSpecifier() == Lanai::S_ABS_LO);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else if (isa<MCBinaryExpr>(getImm())) {
 #ifndef NDEBUG
       const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
-      assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
-             cast<LanaiMCExpr>(BinaryExpr->getLHS())->getSpecifier() ==
-                 LanaiMCExpr::VK_Lanai_ABS_LO);
+      assert(BinaryExpr && isa<MCSpecifierExpr>(BinaryExpr->getLHS()) &&
+             cast<MCSpecifierExpr>(BinaryExpr->getLHS())->getSpecifier() ==
+                 Lanai::S_ABS_LO);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else
@@ -495,19 +494,18 @@ struct LanaiOperand : public MCParsedAsmOperand {
     assert(N == 1 && "Invalid number of operands!");
     if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
       Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() >> 16));
-    else if (isa<LanaiMCExpr>(getImm())) {
+    else if (isa<MCSpecifierExpr>(getImm())) {
 #ifndef NDEBUG
-      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
-      assert(SymbolRefExpr &&
-             SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_ABS_HI);
+      const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(getImm());
+      assert(SymbolRefExpr && SymbolRefExpr->getSpecifier() == Lanai::S_ABS_HI);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else if (isa<MCBinaryExpr>(getImm())) {
 #ifndef NDEBUG
       const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
-      assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
-             cast<LanaiMCExpr>(BinaryExpr->getLHS())->getSpecifier() ==
-                 LanaiMCExpr::VK_Lanai_ABS_HI);
+      assert(BinaryExpr && isa<MCSpecifierExpr>(BinaryExpr->getLHS()) &&
+             cast<MCSpecifierExpr>(BinaryExpr->getLHS())->getSpecifier() ==
+                 Lanai::S_ABS_HI);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else
@@ -526,11 +524,10 @@ struct LanaiOperand : public MCParsedAsmOperand {
     assert(N == 1 && "Invalid number of operands!");
     if (const MCConstantExpr *ConstExpr = dyn_cast<MCConstantExpr>(getImm()))
       Inst.addOperand(MCOperand::createImm(ConstExpr->getValue() & 0x1fffff));
-    else if (isa<LanaiMCExpr>(getImm())) {
+    else if (isa<MCSpecifierExpr>(getImm())) {
 #ifndef NDEBUG
-      const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(getImm());
-      assert(SymbolRefExpr &&
-             SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None);
+      const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(getImm());
+      assert(SymbolRefExpr && SymbolRefExpr->getSpecifier() == Lanai::S_None);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else if (isa<MCSymbolRefExpr>(getImm())) {
@@ -544,9 +541,9 @@ struct LanaiOperand : public MCParsedAsmOperand {
     } else if (isa<MCBinaryExpr>(getImm())) {
 #ifndef NDEBUG
       const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(getImm());
-      assert(BinaryExpr && isa<LanaiMCExpr>(BinaryExpr->getLHS()) &&
-             cast<LanaiMCExpr>(BinaryExpr->getLHS())->getSpecifier() ==
-                 LanaiMCExpr::VK_Lanai_None);
+      assert(BinaryExpr && isa<MCSpecifierExpr>(BinaryExpr->getLHS()) &&
+             cast<MCSpecifierExpr>(BinaryExpr->getLHS())->getSpecifier() ==
+                 Lanai::S_None);
 #endif
       Inst.addOperand(MCOperand::createExpr(getImm()));
     } else
@@ -737,7 +734,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
   SMLoc Start = Parser.getTok().getLoc();
   SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   const MCExpr *Res, *RHS = nullptr;
-  LanaiMCExpr::Spec Kind = LanaiMCExpr::VK_Lanai_None;
+  auto Kind = Lanai::S_None;
 
   if (Lexer.getKind() != AsmToken::Identifier)
     return nullptr;
@@ -748,13 +745,13 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
 
   // Check if identifier has a modifier
   if (Identifier.equals_insensitive("hi"))
-    Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+    Kind = Lanai::S_ABS_HI;
   else if (Identifier.equals_insensitive("lo"))
-    Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+    Kind = Lanai::S_ABS_LO;
 
   // If the identifier corresponds to a variant then extract the real
   // identifier.
-  if (Kind != LanaiMCExpr::VK_Lanai_None) {
+  if (Kind != Lanai::S_None) {
     if (Lexer.getKind() != AsmToken::LParen) {
       Error(Lexer.getLoc(), "Expected '('");
       return nullptr;
@@ -771,7 +768,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
     return nullptr;
 
   // For variants parse the final ')'
-  if (Kind != LanaiMCExpr::VK_Lanai_None) {
+  if (Kind != Lanai::S_None) {
     if (Lexer.getKind() != AsmToken::RParen) {
       Error(Lexer.getLoc(), "Expected ')'");
       return nullptr;
@@ -781,8 +778,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
 
   End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
-  const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
-  Res = LanaiMCExpr::create(Kind, Expr, getContext());
+  Res = MCSpecifierExpr::create(Sym, Kind, getContext());
 
   // Nest if this was an addition
   if (RHS)
@@ -865,16 +861,16 @@ bool shouldBeSls(const LanaiOperand &Op) {
   }
   // The instruction should be encoded as an SLS if the operand is a symbolic
   // reference with no variant.
-  if (const LanaiMCExpr *SymbolRefExpr = dyn_cast<LanaiMCExpr>(Op.getImm()))
-    return SymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None;
+  if (const auto *SymbolRefExpr = dyn_cast<MCSpecifierExpr>(Op.getImm()))
+    return SymbolRefExpr->getSpecifier() == Lanai::S_None;
   // The instruction should be encoded as an SLS if the operand is a binary
   // expression with the left-hand side being a symbolic reference with no
   // variant.
   if (const MCBinaryExpr *BinaryExpr = dyn_cast<MCBinaryExpr>(Op.getImm())) {
-    const LanaiMCExpr *LHSSymbolRefExpr =
-        dyn_cast<LanaiMCExpr>(BinaryExpr->getLHS());
+    const auto *LHSSymbolRefExpr =
+        dyn_cast<MCSpecifierExpr>(BinaryExpr->getLHS());
     return (LHSSymbolRefExpr &&
-            LHSSymbolRefExpr->getSpecifier() == LanaiMCExpr::VK_Lanai_None);
+            LHSSymbolRefExpr->getSpecifier() == Lanai::S_None);
   }
   return false;
 }
diff --git a/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp b/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
index 21b327fd8f7cd..b0db8d0887689 100644
--- a/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
+++ b/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
@@ -14,7 +14,7 @@
 #include "LanaiMCInstLower.h"
 
 #include "MCTargetDesc/LanaiBaseInfo.h"
-#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCAsmInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -64,17 +64,16 @@ LanaiMCInstLower::GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
 
 MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                                MCSymbol *Sym) const {
-  LanaiMCExpr::Spec Kind;
-
+  Lanai::Specifier Kind;
   switch (MO.getTargetFlags()) {
   case LanaiII::MO_NO_FLAG:
-    Kind = LanaiMCExpr::VK_Lanai_None;
+    Kind = Lanai::S_None;
     break;
   case LanaiII::MO_ABS_HI:
-    Kind = LanaiMCExpr::VK_Lanai_ABS_HI;
+    Kind = Lanai::S_ABS_HI;
     break;
   case LanaiII::MO_ABS_LO:
-    Kind = LanaiMCExpr::VK_Lanai_ABS_LO;
+    Kind = Lanai::S_ABS_LO;
     break;
   default:
     llvm_unreachable("Unknown target flag on GV operand");
@@ -84,7 +83,7 @@ MCOperand LanaiMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   if (!MO.isJTI() && MO.getOffset())
     Expr = MCBinaryExpr::createAdd(
         Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
-  Expr = LanaiMCExpr::create(Kind, Expr, Ctx);
+  Expr = MCSpecifierExpr::create(Expr, Kind, Ctx);
   return MCOperand::createExpr(Expr);
 }
 
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
index 04fca878ca5af..ff3b6abc70ec3 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/CMakeLists.txt
@@ -4,7 +4,6 @@ add_llvm_component_library(LLVMLanaiDesc
   LanaiInstPrinter.cpp
   LanaiMCAsmInfo.cpp
   LanaiMCCodeEmitter.cpp
-  LanaiMCExpr.cpp
   LanaiMCTargetDesc.cpp
 
   LINK_COMPONENTS
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
index 7ae693130da57..6ad018c12a28b 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
@@ -11,7 +11,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "LanaiMCAsmInfo.h"
-
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -38,3 +39,26 @@ LanaiMCAsmInfo::LanaiMCAsmInfo(const Triple & /*TheTriple*/,
   // in dwarf generation.
   MinInstAlignment = 4;
 }
+
+void LanaiMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                        const MCSpecifierExpr &Expr) const {
+  if (Expr.getSpecifier() == 0) {
+    printExpr(OS, *Expr.getSubExpr());
+    return;
+  }
+
+  switch (Expr.getSpecifier()) {
+  default:
+    llvm_unreachable("Invalid kind!");
+  case Lanai::S_ABS_HI:
+    OS << "hi";
+    break;
+  case Lanai::S_ABS_LO:
+    OS << "lo";
+    break;
+  }
+
+  OS << '(';
+  printExpr(OS, *Expr.getSubExpr());
+  OS << ')';
+}
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
index f0352d0212910..2696975e71c03 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h
@@ -24,8 +24,15 @@ class LanaiMCAsmInfo : public MCAsmInfoELF {
 public:
   explicit LanaiMCAsmInfo(const Triple &TheTriple,
                           const MCTargetOptions &Options);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
 };
 
+namespace Lanai {
+using Specifier = uint8_t;
+enum { S_None, S_ABS_HI, S_ABS_LO };
+} // namespace Lanai
+
 } // namespace llvm
 
 #endif // LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCASMINFO_H
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
index 779c83e5b3f2f..d1b2da40446a6 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -13,7 +13,7 @@
 #include "LanaiAluCode.h"
 #include "MCTargetDesc/LanaiBaseInfo.h"
 #include "MCTargetDesc/LanaiFixupKinds.h"
-#include "MCTargetDesc/LanaiMCExpr.h"
+#include "MCTargetDesc/LanaiMCAsmInfo.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -89,14 +89,14 @@ class LanaiMCCodeEmitter : public MCCodeEmitter {
 static Lanai::Fixups FixupKind(const MCExpr *Expr) {
   if (isa<MCSymbolRefExpr>(Expr))
     return Lanai::FIXUP_LANAI_21;
-  if (const LanaiMCExpr *McExpr = dyn_cast<LanaiMCExpr>(Expr)) {
-    LanaiMCExpr::Spec ExprKind = McExpr->getSpecifier();
+  if (const MCSpecifierExpr *McExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
+    Lanai::Specifier ExprKind = McExpr->getSpecifier();
     switch (ExprKind) {
-    case LanaiMCExpr::VK_Lanai_None:
+    case Lanai::S_None:
       return Lanai::FIXUP_LANAI_21;
-    case LanaiMCExpr::VK_Lanai_ABS_HI:
+    case Lanai::S_ABS_HI:
       return Lanai::FIXUP_LANAI_HI16;
-    case LanaiMCExpr::VK_Lanai_ABS_LO:
+    case Lanai::S_ABS_LO:
       return Lanai::FIXUP_LANAI_LO16;
     }
   }
@@ -123,7 +123,7 @@ unsigned LanaiMCCodeEmitter::getMachineOpValue(
     Expr = BinaryExpr->getLHS();
   }
 
-  assert(isa<LanaiMCExpr>(Expr) || Expr->getKind() == MCExpr::SymbolRef);
+  assert(isa<MCSpecifierExpr>(Expr) || Expr->getKind() == MCExpr::SymbolRef);
   // Push fixup (all info is contained within)
   Fixups.push_back(
       MCFixup::create(0, MCOp.getExpr(), MCFixupKind(FixupKind(Expr))));
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
deleted file mode 100644
index b75a09915660c..0000000000000
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===-- LanaiMCExpr.cpp - Lanai specific MC expression classes ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "LanaiMCExpr.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "lanaimcexpr"
-
-const LanaiMCExpr *LanaiMCExpr::create(Spec S, const MCExpr *Expr,
-                                       MCContext &Ctx) {
-  return new (Ctx) LanaiMCExpr(Expr, S);
-}
-
-void LanaiMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  if (specifier == VK_Lanai_None) {
-    MAI->printExpr(OS, *Expr);
-    return;
-  }
-
-  switch (specifier) {
-  default:
-    llvm_unreachable("Invalid kind!");
-  case VK_Lanai_ABS_HI:
-    OS << "hi";
-    break;
-  case VK_Lanai_ABS_LO:
-    OS << "lo";
-    break;
-  }
-
-  OS << '(';
-  const MCExpr *Expr = getSubExpr();
-  MAI->printExpr(OS, *Expr);
-  OS << ')';
-}
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
deleted file mode 100644
index 90f8a3e5bbd59..0000000000000
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCExpr.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===-- LanaiMCExpr.h - Lanai specific MC expression classes ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
-#define LLVM_LIB_TARGET_LANAI_MCTARGETDESC_LANAIMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCValue.h"
-
-namespace llvm {
-
-class LanaiMCExpr : public MCSpecifierExpr {
-public:
-  using Spec = MCSpecifierExpr::Spec;
-  enum { VK_Lanai_None, VK_Lanai_ABS_HI, VK_Lanai_ABS_LO };
-
-private:
-  explicit LanaiMCExpr(const MCExpr *Expr, Spec S) : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const LanaiMCExpr *create(Spec Kind, const MCExpr *Expr,
-                                   MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-};
-} // end namespace llvm
-
-#endif

From 945b12f6c823c49336a878e7afe2a96e4d3382ea Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Mon, 16 Jun 2025 04:53:40 +0000
Subject: [PATCH 553/851] [gn build] Port 05a9ad977624

---
 .../gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn
index a52132e69cc32..874cdc1b7839c 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Lanai/MCTargetDesc/BUILD.gn
@@ -56,7 +56,6 @@ static_library("MCTargetDesc") {
     "LanaiInstPrinter.cpp",
     "LanaiMCAsmInfo.cpp",
     "LanaiMCCodeEmitter.cpp",
-    "LanaiMCExpr.cpp",
     "LanaiMCTargetDesc.cpp",
   ]
 }

From 4ea616d072d126a31149174ca2efdbdace9ce568 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 22:41:36 -0700
Subject: [PATCH 554/851] AArch64: Move AArch64MCExpr functions to
 AArch64MCAsmInfo

To migrate away from the legacy
XXXMCExpr::printImpl/evaluateAsRelocatableImpl overrides and align with
other targets.

While the AArch64MCAsmInfoXXX hooks introduce some duplication, they
enable better separation for object file formats.
---
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 138 ++++++++++++++++++
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.h   |  23 +++
 .../AArch64/MCTargetDesc/AArch64MCExpr.cpp    |  97 +-----------
 .../AArch64/MCTargetDesc/AArch64MCExpr.h      |  14 +-
 .../AArch64WinCOFFObjectWriter.cpp            |   6 +-
 5 files changed, 166 insertions(+), 112 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 31965d85d9eb4..a82896dbe0d6c 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"
 using namespace llvm;
@@ -53,6 +54,80 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
     {AArch64MCExpr::M_TLVPPAGEOFF, "TLVPPAGEOFF"},
 };
 
+StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) {
+  // clang-format off
+  switch (static_cast<uint32_t>(Expr.getSpecifier())) {
+  case AArch64MCExpr::VK_CALL:                return "";
+  case AArch64MCExpr::VK_LO12:                return ":lo12:";
+  case AArch64MCExpr::VK_ABS_G3:              return ":abs_g3:";
+  case AArch64MCExpr::VK_ABS_G2:              return ":abs_g2:";
+  case AArch64MCExpr::VK_ABS_G2_S:            return ":abs_g2_s:";
+  case AArch64MCExpr::VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case AArch64MCExpr::VK_ABS_G1:              return ":abs_g1:";
+  case AArch64MCExpr::VK_ABS_G1_S:            return ":abs_g1_s:";
+  case AArch64MCExpr::VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case AArch64MCExpr::VK_ABS_G0:              return ":abs_g0:";
+  case AArch64MCExpr::VK_ABS_G0_S:            return ":abs_g0_s:";
+  case AArch64MCExpr::VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case AArch64MCExpr::VK_PREL_G3:             return ":prel_g3:";
+  case AArch64MCExpr::VK_PREL_G2:             return ":prel_g2:";
+  case AArch64MCExpr::VK_PREL_G2_NC:          return ":prel_g2_nc:";
+  case AArch64MCExpr::VK_PREL_G1:             return ":prel_g1:";
+  case AArch64MCExpr::VK_PREL_G1_NC:          return ":prel_g1_nc:";
+  case AArch64MCExpr::VK_PREL_G0:             return ":prel_g0:";
+  case AArch64MCExpr::VK_PREL_G0_NC:          return ":prel_g0_nc:";
+  case AArch64MCExpr::VK_DTPREL_G2:           return ":dtprel_g2:";
+  case AArch64MCExpr::VK_DTPREL_G1:           return ":dtprel_g1:";
+  case AArch64MCExpr::VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case AArch64MCExpr::VK_DTPREL_G0:           return ":dtprel_g0:";
+  case AArch64MCExpr::VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case AArch64MCExpr::VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case AArch64MCExpr::VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case AArch64MCExpr::VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case AArch64MCExpr::VK_TPREL_G2:            return ":tprel_g2:";
+  case AArch64MCExpr::VK_TPREL_G1:            return ":tprel_g1:";
+  case AArch64MCExpr::VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case AArch64MCExpr::VK_TPREL_G0:            return ":tprel_g0:";
+  case AArch64MCExpr::VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case AArch64MCExpr::VK_TPREL_HI12:          return ":tprel_hi12:";
+  case AArch64MCExpr::VK_TPREL_LO12:          return ":tprel_lo12:";
+  case AArch64MCExpr::VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case AArch64MCExpr::VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case AArch64MCExpr::VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
+  case AArch64MCExpr::VK_ABS_PAGE:            return "";
+  case AArch64MCExpr::VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
+  case AArch64MCExpr::VK_GOT:                 return ":got:";
+  case AArch64MCExpr::VK_GOT_PAGE:            return ":got:";
+  case AArch64MCExpr::VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
+  case AArch64MCExpr::VK_GOT_LO12:            return ":got_lo12:";
+  case AArch64MCExpr::VK_GOTTPREL:            return ":gottprel:";
+  case AArch64MCExpr::VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case AArch64MCExpr::VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case AArch64MCExpr::VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case AArch64MCExpr::VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case AArch64MCExpr::VK_TLSDESC:             return "";
+  case AArch64MCExpr::VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  case AArch64MCExpr::VK_TLSDESC_AUTH:        return "";
+  case AArch64MCExpr::VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
+  case AArch64MCExpr::VK_SECREL_LO12:         return ":secrel_lo12:";
+  case AArch64MCExpr::VK_SECREL_HI12:         return ":secrel_hi12:";
+  case AArch64MCExpr::VK_GOT_AUTH:            return ":got_auth:";
+  case AArch64MCExpr::VK_GOT_AUTH_PAGE:       return ":got_auth:";
+  case AArch64MCExpr::VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
+  default:
+    llvm_unreachable("Invalid relocation specifier");
+  }
+  // clang-format on
+}
+
+static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
+                     const MCAssembler *Asm) {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(Expr.getSpecifier());
+  return true;
+}
+
 AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
   // We prefer NEON instructions to be printed in the short, Apple-specific
   // form when targeting Darwin.
@@ -91,6 +166,34 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
   return MCBinaryExpr::createSub(Res, PC, Context);
 }
 
+void AArch64AuthMCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
+  if (WrapSubExprInParens)
+    OS << '(';
+  getSubExpr()->print(OS, MAI);
+  if (WrapSubExprInParens)
+    OS << ')';
+
+  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
+  if (hasAddressDiversity())
+    OS << ",addr";
+  OS << ')';
+}
+
+void AArch64MCAsmInfoDarwin::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
+    return AE->print(OS, this);
+  // FIXME: tryParseAdrLabel should not use VK_ABS for Mach-O
+  assert(Expr.getSpecifier() == AArch64MCExpr::VK_ABS);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoDarwin::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   if (T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
@@ -127,6 +230,19 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   initializeVariantKinds(ELFAtSpecifiers);
 }
 
+void AArch64MCAsmInfoELF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
+    return AE->print(OS, this);
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoELF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -146,6 +262,17 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   initializeVariantKinds(COFFAtSpecifiers);
 }
 
+void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoMicrosoftCOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -164,3 +291,14 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
 
   initializeVariantKinds(COFFAtSpecifiers);
 }
+
+void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoGNUCOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 225e0c8e55fca..bc02586d73884 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -26,20 +27,42 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
   const MCExpr *
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
   explicit AArch64MCAsmInfoELF(const Triple &T);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoMicrosoftCOFF : public MCAsmInfoMicrosoft {
   explicit AArch64MCAsmInfoMicrosoftCOFF();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
   explicit AArch64MCAsmInfoGNUCOFF();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
+namespace AArch64 {
+/// Return the string representation of the ELF relocation specifier
+/// (e.g. ":got:", ":lo12:").
+StringRef getSpecifierName(const MCSpecifierExpr &Expr);
+} // namespace AArch64
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index d934af91b9ff5..7a7c6f7effd9f 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -12,100 +12,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MCExpr.h"
+#include "AArch64MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
-#define DEBUG_TYPE "aarch64symbolrefexpr"
-
 const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, Specifier S,
                                            MCContext &Ctx) {
   return new (Ctx) AArch64MCExpr(Expr, S);
 }
 
-StringRef AArch64MCExpr::getSpecifierName() const {
-  // clang-format off
-  switch (static_cast<uint32_t>(getSpecifier())) {
-  case VK_CALL:                return "";
-  case VK_LO12:                return ":lo12:";
-  case VK_ABS_G3:              return ":abs_g3:";
-  case VK_ABS_G2:              return ":abs_g2:";
-  case VK_ABS_G2_S:            return ":abs_g2_s:";
-  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
-  case VK_ABS_G1:              return ":abs_g1:";
-  case VK_ABS_G1_S:            return ":abs_g1_s:";
-  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
-  case VK_ABS_G0:              return ":abs_g0:";
-  case VK_ABS_G0_S:            return ":abs_g0_s:";
-  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
-  case VK_PREL_G3:             return ":prel_g3:";
-  case VK_PREL_G2:             return ":prel_g2:";
-  case VK_PREL_G2_NC:          return ":prel_g2_nc:";
-  case VK_PREL_G1:             return ":prel_g1:";
-  case VK_PREL_G1_NC:          return ":prel_g1_nc:";
-  case VK_PREL_G0:             return ":prel_g0:";
-  case VK_PREL_G0_NC:          return ":prel_g0_nc:";
-  case VK_DTPREL_G2:           return ":dtprel_g2:";
-  case VK_DTPREL_G1:           return ":dtprel_g1:";
-  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
-  case VK_DTPREL_G0:           return ":dtprel_g0:";
-  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
-  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
-  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
-  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
-  case VK_TPREL_G2:            return ":tprel_g2:";
-  case VK_TPREL_G1:            return ":tprel_g1:";
-  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
-  case VK_TPREL_G0:            return ":tprel_g0:";
-  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
-  case VK_TPREL_HI12:          return ":tprel_hi12:";
-  case VK_TPREL_LO12:          return ":tprel_lo12:";
-  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
-  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
-  case VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
-  case VK_ABS_PAGE:            return "";
-  case VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
-  case VK_GOT:                 return ":got:";
-  case VK_GOT_PAGE:            return ":got:";
-  case VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
-  case VK_GOT_LO12:            return ":got_lo12:";
-  case VK_GOTTPREL:            return ":gottprel:";
-  case VK_GOTTPREL_PAGE:       return ":gottprel:";
-  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
-  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
-  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
-  case VK_TLSDESC:             return "";
-  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
-  case VK_TLSDESC_AUTH:        return "";
-  case VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
-  case VK_SECREL_LO12:         return ":secrel_lo12:";
-  case VK_SECREL_HI12:         return ":secrel_hi12:";
-  case VK_GOT_AUTH:            return ":got_auth:";
-  case VK_GOT_AUTH_PAGE:       return ":got_auth:";
-  case VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
-  default:
-    llvm_unreachable("Invalid relocation specifier");
-  }
-  // clang-format on
-}
-
-void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  OS << getSpecifierName();
-  Expr->print(OS, MAI);
-}
-
-bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                              const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(getSpecifier());
-  return true;
-}
-
 const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
                                                    uint16_t Discriminator,
                                                    AArch64PACKey::ID Key,
@@ -114,17 +33,3 @@ const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
   return new (Ctx)
       AArch64AuthMCExpr(Expr, Discriminator, Key, HasAddressDiversity);
 }
-
-void AArch64AuthMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
-  if (WrapSubExprInParens)
-    OS << '(';
-  getSubExpr()->print(OS, MAI);
-  if (WrapSubExprInParens)
-    OS << ')';
-
-  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
-  if (hasAddressDiversity())
-    OS << ",addr";
-  OS << ')';
-}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 9c383894c7f54..541f24c943a15 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -147,8 +147,6 @@ class AArch64MCExpr : public MCSpecifierExpr {
 public:
   static const AArch64MCExpr *create(const MCExpr *Expr, Specifier,
                                      MCContext &Ctx);
-  /// @name VariantKind information extractors.
-  /// @{
 
   static Specifier getSymbolLoc(Specifier S) {
     return static_cast<Specifier>(S & VK_SymLocBits);
@@ -159,16 +157,6 @@ class AArch64MCExpr : public MCSpecifierExpr {
   }
 
   static bool isNotChecked(Specifier S) { return S & VK_NC; }
-
-  /// @}
-
-  /// Return the string representation of the ELF relocation specifier
-  /// (e.g. ":got:", ":lo12:").
-  StringRef getSpecifierName() const;
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 class AArch64AuthMCExpr final : public AArch64MCExpr {
@@ -189,7 +177,7 @@ class AArch64AuthMCExpr final : public AArch64MCExpr {
   uint16_t getDiscriminator() const { return Discriminator; }
   bool hasAddressDiversity() const { return getSpecifier() == VK_AUTHADDR; }
 
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
 
   static bool classof(const MCExpr *E) {
     return isa<AArch64MCExpr>(E) && classof(cast<AArch64MCExpr>(E));
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 3009bd2ca2758..2e997631655ed 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -8,7 +8,7 @@
 
 #include "AArch64MCTargetDesc.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCAsmInfo.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/MC/MCAsmBackend.h"
@@ -73,7 +73,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
       break;
     default:
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          A64E->getSpecifierName() +
+                                          AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
       return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
     }
@@ -83,7 +83,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   default: {
     if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          A64E->getSpecifierName() +
+                                          AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
     } else {
       MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind());

From 4f9e6bad8438f4440bfd68be2f0ebdca0d588d47 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Mon, 16 Jun 2025 08:28:52 +0200
Subject: [PATCH 555/851] [clang][bytecode] Fix calling operator new with
 nothrow/align parameter (#144271)

Discard all the parameters we don't care about.
---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 20 +++++++++++++++++++-
 clang/test/AST/ByteCode/new-delete.cpp   | 22 ++++++++++++++++------
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5fc5034569597..d01e3d042a8bf 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -1423,7 +1423,6 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC,
   // Walk up the call stack to find the appropriate caller and get the
   // element type from it.
   auto [NewCall, ElemType] = S.getStdAllocatorCaller("allocate");
-  APSInt Bytes = popToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(0)));
 
   if (ElemType.isNull()) {
     S.FFDiag(Call, S.getLangOpts().CPlusPlus20
@@ -1439,6 +1438,25 @@ static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC,
     return false;
   }
 
+  // We only care about the first parameter (the size), so discard all the
+  // others.
+  {
+    unsigned NumArgs = Call->getNumArgs();
+    assert(NumArgs >= 1);
+
+    // The std::nothrow_t arg never gets put on the stack.
+    if (Call->getArg(NumArgs - 1)->getType()->isNothrowT())
+      --NumArgs;
+    auto Args = llvm::ArrayRef(Call->getArgs(), Call->getNumArgs());
+    // First arg is needed.
+    Args = Args.drop_front();
+
+    // Discard the rest.
+    for (const Expr *Arg : Args)
+      discard(S.Stk, *S.getContext().classify(Arg));
+  }
+
+  APSInt Bytes = popToAPSInt(S.Stk, *S.getContext().classify(Call->getArg(0)));
   CharUnits ElemSize = S.getASTContext().getTypeSizeInChars(ElemType);
   assert(!ElemSize.isZero());
   // Divide the number of bytes by sizeof(ElemType), so we get the number of
diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp
index 1ee41a98e13bb..9c293e5d15fc8 100644
--- a/clang/test/AST/ByteCode/new-delete.cpp
+++ b/clang/test/AST/ByteCode/new-delete.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
-// RUN: %clang_cc1 -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s
-// RUN: %clang_cc1 -triple=i686-linux-gnu -std=c++20 -fexperimental-new-constant-interpreter -verify=expected,both %s
-// RUN: %clang_cc1 -verify=ref,both %s
-// RUN: %clang_cc1 -std=c++20 -verify=ref,both %s
-// RUN: %clang_cc1 -triple=i686-linux-gnu -std=c++20 -verify=ref,both %s
+// RUN: %clang_cc1            -verify=expected,both                        -fexperimental-new-constant-interpreter %s
+// RUN: %clang_cc1 -std=c++20 -verify=expected,both                        -fexperimental-new-constant-interpreter %s
+// RUN: %clang_cc1 -std=c++20 -verify=expected,both -triple=i686-linux-gnu -fexperimental-new-constant-interpreter %s
+// RUN: %clang_cc1            -verify=ref,both                                                                     %s
+// RUN: %clang_cc1 -std=c++20 -verify=ref,both                                                                     %s
+// RUN: %clang_cc1 -std=c++20 -verify=ref,both      -triple=i686-linux-gnu                                         %s
 
 #if __cplusplus >= 202002L
 
@@ -1012,6 +1012,16 @@ constexpr int no_deallocate_nonalloc = (std::allocator<int>().deallocate((int*)&
                                                                                                              // both-note {{in call}} \
                                                                                                              // both-note {{declared here}}
 
+namespace OpNewNothrow {
+  constexpr int f() {
+      int *v = (int*)operator new(sizeof(int), std::align_val_t(2), std::nothrow); // both-note {{cannot allocate untyped memory in a constant expression; use 'std::allocator<T>::allocate' to allocate memory of type 'T'}}
+      operator delete(v, std::align_val_t(2), std::nothrow);
+      return 1;
+  }
+  static_assert(f()); // both-error {{not an integral constant expression}} \
+                      // both-note {{in call to}}
+}
+
 #else
 /// Make sure we reject this prior to C++20
 constexpr int a() { // both-error {{never produces a constant expression}}

From f3021e79fd5a4cab5537f37df2e6010a325d0a7c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Sun, 15 Jun 2025 23:29:06 -0700
Subject: [PATCH 556/851] ARM: Rename ARMMCExpr::VK_ to ARM::S_

Prepare for removing ARMMCExpr. Adopt the new naming convention (S_
instead of VK_; the relocation specifier was previously named
`VariantKind`)) used by most other targets.

Make ARMMCAsmInfo.h include ARMMCExpr.h and change .cpp files to include
ARMMCAsmInfo.h. We will eventually remove ARMMCExpr.h.
---
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp         |  17 ++-
 llvm/lib/Target/ARM/ARMMCInstLower.cpp        |   6 +-
 llvm/lib/Target/ARM/ARMTargetObjectFile.cpp   |  12 +--
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp |  34 +++---
 .../Target/ARM/MCTargetDesc/ARMAsmBackend.cpp |   6 +-
 .../ARM/MCTargetDesc/ARMELFObjectWriter.cpp   | 102 +++++++++---------
 .../ARM/MCTargetDesc/ARMELFStreamer.cpp       |  14 +--
 .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp  |  44 ++++----
 .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.h    |  45 ++++++++
 .../ARM/MCTargetDesc/ARMMCCodeEmitter.cpp     |  26 ++---
 .../lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp |  39 +++++--
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h  |  69 ++----------
 12 files changed, 214 insertions(+), 200 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 13efd70c0f22b..fef7a17ae0b63 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -18,7 +18,7 @@
 #include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
 #include "MCTargetDesc/ARMInstPrinter.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "TargetInfo/ARMTargetInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/BinaryFormat/COFF.h"
@@ -92,8 +92,7 @@ void ARMAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) {
 
   const MCExpr *E = MCSymbolRefExpr::create(
       GetARMGVSymbol(GV, ARMII::MO_NO_FLAG),
-      (Subtarget->isTargetELF() ? ARMMCExpr::VK_TARGET1 : ARMMCExpr::VK_None),
-      OutContext);
+      (Subtarget->isTargetELF() ? ARM::S_TARGET1 : ARM::S_None), OutContext);
 
   OutStreamer->emitValue(E, Size);
 }
@@ -843,17 +842,17 @@ static MCSymbol *getPICLabel(StringRef Prefix, unsigned FunctionNumber,
 static uint8_t getModifierSpecifier(ARMCP::ARMCPModifier Modifier) {
   switch (Modifier) {
   case ARMCP::no_modifier:
-    return ARMMCExpr::VK_None;
+    return ARM::S_None;
   case ARMCP::TLSGD:
-    return ARMMCExpr::VK_TLSGD;
+    return ARM::S_TLSGD;
   case ARMCP::TPOFF:
-    return ARMMCExpr::VK_TPOFF;
+    return ARM::S_TPOFF;
   case ARMCP::GOTTPOFF:
-    return ARMMCExpr::VK_GOTTPOFF;
+    return ARM::S_GOTTPOFF;
   case ARMCP::SBREL:
-    return ARMMCExpr::VK_SBREL;
+    return ARM::S_SBREL;
   case ARMCP::GOT_PREL:
-    return ARMMCExpr::VK_GOT_PREL;
+    return ARM::S_GOT_PREL;
   case ARMCP::SECREL:
     return MCSymbolRefExpr::VK_SECREL;
   }
diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index 6892db6eb52c4..b32de6b66058b 100644
--- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -18,7 +18,7 @@
 #include "ARMSubtarget.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -37,9 +37,9 @@ using namespace llvm;
 
 MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
                                       const MCSymbol *Symbol) {
-  auto Specifier = ARMMCExpr::VK_None;
+  auto Specifier = ARM::S_None;
   if (MO.getTargetFlags() & ARMII::MO_SBREL)
-    Specifier = ARMMCExpr::VK_SBREL;
+    Specifier = ARM::S_SBREL;
 
   const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
   switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 131703ec082bf..a0a400f938482 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -9,7 +9,7 @@
 #include "ARMTargetObjectFile.h"
 #include "ARMSubtarget.h"
 #include "ARMTargetMachine.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -30,7 +30,7 @@ using namespace dwarf;
 //===----------------------------------------------------------------------===//
 
 ARMElfTargetObjectFile::ARMElfTargetObjectFile() {
-  PLTRelativeSpecifier = ARMMCExpr::VK_PREL31;
+  PLTRelativeSpecifier = ARM::S_PREL31;
   SupportIndirectSymViaGOTPCRel = true;
 }
 
@@ -68,14 +68,14 @@ const MCExpr *ARMElfTargetObjectFile::getIndirectSymViaGOTPCRel(
     int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
   int64_t FinalOffset = Offset + MV.getConstant();
   const MCExpr *Res =
-      MCSymbolRefExpr::create(Sym, ARMMCExpr::VK_GOT_PREL, getContext());
+      MCSymbolRefExpr::create(Sym, ARM::S_GOT_PREL, getContext());
   const MCExpr *Off = MCConstantExpr::create(FinalOffset, getContext());
   return MCBinaryExpr::createAdd(Res, Off, getContext());
 }
 
 const MCExpr *ARMElfTargetObjectFile::
 getIndirectSymViaRWPI(const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::create(Sym, ARMMCExpr::VK_SBREL, getContext());
+  return MCSymbolRefExpr::create(Sym, ARM::S_SBREL, getContext());
 }
 
 const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
@@ -87,13 +87,13 @@ const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
 
   assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
 
-  return MCSymbolRefExpr::create(TM.getSymbol(GV), ARMMCExpr::VK_TARGET2,
+  return MCSymbolRefExpr::create(TM.getSymbol(GV), ARM::S_TARGET2,
                                  getContext());
 }
 
 const MCExpr *ARMElfTargetObjectFile::
 getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
-  return MCSymbolRefExpr::create(Sym, ARMMCExpr::VK_TLSLDO, getContext());
+  return MCSymbolRefExpr::create(Sym, ARM::S_TLSLDO, getContext());
 }
 
 static bool isExecuteOnlyFunction(const GlobalObject *GO, SectionKind SK,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 19c417b2c6e9b..6e9efe40dc54c 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -11,7 +11,7 @@
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMInstPrinter.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "TargetInfo/ARMTargetInfo.h"
 #include "Utils/ARMBaseInfo.h"
@@ -1327,8 +1327,8 @@ class ARMOperand : public MCParsedAsmOperand {
       // We want to avoid matching :upper16: and :lower16: as we want these
       // expressions to match in isImm0_65535Expr()
       const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(getImm());
-      return (!ARM16Expr || (ARM16Expr->getSpecifier() != ARMMCExpr::VK_HI16 &&
-                             ARM16Expr->getSpecifier() != ARMMCExpr::VK_LO16));
+      return (!ARM16Expr || (ARM16Expr->getSpecifier() != ARM::S_HI16 &&
+                             ARM16Expr->getSpecifier() != ARM::S_LO16));
     }
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -6473,7 +6473,7 @@ bool ARMAsmParser::parseImmExpr(int64_t &Out) {
 // :upper8_15:, :upper0_7:, :lower8_15: and :lower0_7:
 bool ARMAsmParser::parsePrefix(ARMMCExpr::Specifier &Spec) {
   MCAsmParser &Parser = getParser();
-  Spec = ARMMCExpr::VK_None;
+  Spec = ARM::S_None;
 
   // consume an optional '#' (GNU compatibility)
   if (getLexer().is(AsmToken::Hash))
@@ -6498,12 +6498,12 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::Specifier &Spec) {
     ARMMCExpr::Specifier Spec;
     uint8_t SupportedFormats;
   } PrefixEntries[] = {
-      {"upper16", ARMMCExpr::VK_HI16, COFF | ELF | MACHO},
-      {"lower16", ARMMCExpr::VK_LO16, COFF | ELF | MACHO},
-      {"upper8_15", ARMMCExpr::VK_HI_8_15, ELF},
-      {"upper0_7", ARMMCExpr::VK_HI_0_7, ELF},
-      {"lower8_15", ARMMCExpr::VK_LO_8_15, ELF},
-      {"lower0_7", ARMMCExpr::VK_LO_0_7, ELF},
+      {"upper16", ARM::S_HI16, COFF | ELF | MACHO},
+      {"lower16", ARM::S_LO16, COFF | ELF | MACHO},
+      {"upper8_15", ARM::S_HI_8_15, ELF},
+      {"upper0_7", ARM::S_HI_0_7, ELF},
+      {"lower8_15", ARM::S_LO_8_15, ELF},
+      {"lower0_7", ARM::S_LO_0_7, ELF},
   };
 
   StringRef IDVal = Parser.getTok().getIdentifier();
@@ -6880,10 +6880,10 @@ static bool isThumbI8Relocation(MCParsedAsmOperand &MCOp) {
   if (!E)
     return false;
   const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
-  if (ARM16Expr && (ARM16Expr->getSpecifier() == ARMMCExpr::VK_HI_8_15 ||
-                    ARM16Expr->getSpecifier() == ARMMCExpr::VK_HI_0_7 ||
-                    ARM16Expr->getSpecifier() == ARMMCExpr::VK_LO_8_15 ||
-                    ARM16Expr->getSpecifier() == ARMMCExpr::VK_LO_0_7))
+  if (ARM16Expr && (ARM16Expr->getSpecifier() == ARM::S_HI_8_15 ||
+                    ARM16Expr->getSpecifier() == ARM::S_HI_0_7 ||
+                    ARM16Expr->getSpecifier() == ARM::S_LO_8_15 ||
+                    ARM16Expr->getSpecifier() == ARM::S_LO_0_7))
     return true;
   return false;
 }
@@ -8287,8 +8287,8 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
     if (!E) break;
     const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
-    if (!ARM16Expr || (ARM16Expr->getSpecifier() != ARMMCExpr::VK_HI16 &&
-                       ARM16Expr->getSpecifier() != ARMMCExpr::VK_LO16))
+    if (!ARM16Expr || (ARM16Expr->getSpecifier() != ARM::S_HI16 &&
+                       ARM16Expr->getSpecifier() != ARM::S_LO16))
       return Error(
           Op.getStartLoc(),
           "immediate expression for mov requires :lower16: or :upper16");
@@ -12437,7 +12437,7 @@ bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
 
   auto *Sym = getContext().getOrCreateSymbol(Parser.getTok().getIdentifier());
   const auto *SRE =
-      MCSymbolRefExpr::create(Sym, ARMMCExpr::VK_TLSDESCSEQ, getContext());
+      MCSymbolRefExpr::create(Sym, ARM::S_TLSDESCSEQ, getContext());
   Lex();
 
   if (parseEOL())
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index a7320eea80b0e..f43fdae554b8b 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -12,7 +12,7 @@
 #include "MCTargetDesc/ARMAsmBackendELF.h"
 #include "MCTargetDesc/ARMAsmBackendWinCOFF.h"
 #include "MCTargetDesc/ARMFixupKinds.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -619,7 +619,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
     // Offset by 8 just as above.
     if (const MCSymbolRefExpr *SRE =
             dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
-      if (SRE->getSpecifier() == ARMMCExpr::VK_TLSCALL)
+      if (SRE->getSpecifier() == ARM::S_TLSCALL)
         return 0;
     return 0xffffff & (Value >> 2);
   case ARM::fixup_t2_uncondbranch: {
@@ -746,7 +746,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
     uint32_t offset = (Value - 4) >> 2;
     if (const MCSymbolRefExpr *SRE =
             dyn_cast<MCSymbolRefExpr>(Fixup.getValue()))
-      if (SRE->getSpecifier() == ARMMCExpr::VK_TLSCALL)
+      if (SRE->getSpecifier() == ARM::S_TLSCALL)
         offset = 0;
     uint32_t signBit = (offset & 0x400000) >> 22;
     uint32_t I1Bit = (offset & 0x200000) >> 21;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index f5a6ee5c5a2e5..b0ebb74424c78 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/ARMFixupKinds.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
@@ -87,16 +87,16 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
   };
 
   switch (Specifier) {
-  case ARMMCExpr::VK_GOTTPOFF:
-  case ARMMCExpr::VK_GOTTPOFF_FDPIC:
-  case ARMMCExpr::VK_TLSCALL:
-  case ARMMCExpr::VK_TLSDESC:
-  case ARMMCExpr::VK_TLSGD:
-  case ARMMCExpr::VK_TLSGD_FDPIC:
-  case ARMMCExpr::VK_TLSLDM:
-  case ARMMCExpr::VK_TLSLDM_FDPIC:
-  case ARMMCExpr::VK_TLSLDO:
-  case ARMMCExpr::VK_TPOFF:
+  case ARM::S_GOTTPOFF:
+  case ARM::S_GOTTPOFF_FDPIC:
+  case ARM::S_TLSCALL:
+  case ARM::S_TLSDESC:
+  case ARM::S_TLSGD:
+  case ARM::S_TLSGD_FDPIC:
+  case ARM::S_TLSLDM:
+  case ARM::S_TLSLDM_FDPIC:
+  case ARM::S_TLSLDO:
+  case ARM::S_TPOFF:
     if (auto *SA = Target.getAddSym())
       cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
     break;
@@ -115,7 +115,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
         reportError(Fixup.getLoc(),
                     "invalid fixup for 4-byte pc-relative data relocation");
         return ELF::R_ARM_NONE;
-      case ARMMCExpr::VK_None: {
+      case ARM::S_None: {
         if (const auto *SA = Target.getAddSym()) {
           // For GNU AS compatibility expressions such as
           // _GLOBAL_OFFSET_TABLE_ - label emit a R_ARM_BASE_PREL relocation.
@@ -124,19 +124,19 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
         }
         return ELF::R_ARM_REL32;
       }
-      case ARMMCExpr::VK_GOTTPOFF:
+      case ARM::S_GOTTPOFF:
         return ELF::R_ARM_TLS_IE32;
-      case ARMMCExpr::VK_GOT_PREL:
+      case ARM::S_GOT_PREL:
         return ELF::R_ARM_GOT_PREL;
-      case ARMMCExpr::VK_PREL31:
+      case ARM::S_PREL31:
         return ELF::R_ARM_PREL31;
       }
     case ARM::fixup_arm_blx:
     case ARM::fixup_arm_uncondbl:
       switch (Specifier) {
-      case ARMMCExpr::VK_PLT:
+      case ARM::S_PLT:
         return ELF::R_ARM_CALL;
-      case ARMMCExpr::VK_TLSCALL:
+      case ARM::S_TLSCALL:
         return ELF::R_ARM_TLS_CALL;
       default:
         return ELF::R_ARM_CALL;
@@ -172,7 +172,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     case ARM::fixup_arm_thumb_bl:
     case ARM::fixup_arm_thumb_blx:
       switch (Specifier) {
-      case ARMMCExpr::VK_TLSCALL:
+      case ARM::S_TLSCALL:
         return ELF::R_ARM_THM_TLS_CALL;
       default:
         return ELF::R_ARM_THM_CALL;
@@ -206,7 +206,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for 1-byte data relocation");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_ABS8;
     }
   case FK_Data_2:
@@ -214,7 +214,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for 2-byte data relocation");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_ABS16;
     }
   case FK_Data_4:
@@ -222,51 +222,51 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for 4-byte data relocation");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_ARM_NONE:
+    case ARM::S_ARM_NONE:
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_GOT:
+    case ARM::S_GOT:
       return ELF::R_ARM_GOT_BREL;
-    case ARMMCExpr::VK_TLSGD:
+    case ARM::S_TLSGD:
       return ELF::R_ARM_TLS_GD32;
-    case ARMMCExpr::VK_TPOFF:
+    case ARM::S_TPOFF:
       return ELF::R_ARM_TLS_LE32;
-    case ARMMCExpr::VK_GOTTPOFF:
+    case ARM::S_GOTTPOFF:
       return ELF::R_ARM_TLS_IE32;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_ABS32;
-    case ARMMCExpr::VK_GOTOFF:
+    case ARM::S_GOTOFF:
       return ELF::R_ARM_GOTOFF32;
-    case ARMMCExpr::VK_GOT_PREL:
+    case ARM::S_GOT_PREL:
       return ELF::R_ARM_GOT_PREL;
-    case ARMMCExpr::VK_TARGET1:
+    case ARM::S_TARGET1:
       return ELF::R_ARM_TARGET1;
-    case ARMMCExpr::VK_TARGET2:
+    case ARM::S_TARGET2:
       return ELF::R_ARM_TARGET2;
-    case ARMMCExpr::VK_PREL31:
+    case ARM::S_PREL31:
       return ELF::R_ARM_PREL31;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_SBREL32;
-    case ARMMCExpr::VK_TLSLDO:
+    case ARM::S_TLSLDO:
       return ELF::R_ARM_TLS_LDO32;
-    case ARMMCExpr::VK_TLSCALL:
+    case ARM::S_TLSCALL:
       return ELF::R_ARM_TLS_CALL;
-    case ARMMCExpr::VK_TLSDESC:
+    case ARM::S_TLSDESC:
       return ELF::R_ARM_TLS_GOTDESC;
-    case ARMMCExpr::VK_TLSLDM:
+    case ARM::S_TLSLDM:
       return ELF::R_ARM_TLS_LDM32;
-    case ARMMCExpr::VK_TLSDESCSEQ:
+    case ARM::S_TLSDESCSEQ:
       return ELF::R_ARM_TLS_DESCSEQ;
-    case ARMMCExpr::VK_FUNCDESC:
+    case ARM::S_FUNCDESC:
       return CheckFDPIC(ELF::R_ARM_FUNCDESC);
-    case ARMMCExpr::VK_GOTFUNCDESC:
+    case ARM::S_GOTFUNCDESC:
       return CheckFDPIC(ELF::R_ARM_GOTFUNCDESC);
-    case ARMMCExpr::VK_GOTOFFFUNCDESC:
+    case ARM::S_GOTOFFFUNCDESC:
       return CheckFDPIC(ELF::R_ARM_GOTOFFFUNCDESC);
-    case ARMMCExpr::VK_TLSGD_FDPIC:
+    case ARM::S_TLSGD_FDPIC:
       return CheckFDPIC(ELF::R_ARM_TLS_GD32_FDPIC);
-    case ARMMCExpr::VK_TLSLDM_FDPIC:
+    case ARM::S_TLSLDM_FDPIC:
       return CheckFDPIC(ELF::R_ARM_TLS_LDM32_FDPIC);
-    case ARMMCExpr::VK_GOTTPOFF_FDPIC:
+    case ARM::S_GOTTPOFF_FDPIC:
       return CheckFDPIC(ELF::R_ARM_TLS_IE32_FDPIC);
     }
   case ARM::fixup_arm_condbranch:
@@ -277,9 +277,9 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for ARM MOVT instruction");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_MOVT_ABS;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_MOVT_BREL;
     }
   case ARM::fixup_arm_movw_lo16:
@@ -287,9 +287,9 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for ARM MOVW instruction");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_MOVW_ABS_NC;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_MOVW_BREL_NC;
     }
   case ARM::fixup_t2_movt_hi16:
@@ -297,9 +297,9 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for Thumb MOVT instruction");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_THM_MOVT_ABS;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_THM_MOVT_BREL;
     }
   case ARM::fixup_t2_movw_lo16:
@@ -307,9 +307,9 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup,
     default:
       reportError(Fixup.getLoc(), "invalid fixup for Thumb MOVW instruction");
       return ELF::R_ARM_NONE;
-    case ARMMCExpr::VK_None:
+    case ARM::S_None:
       return ELF::R_ARM_THM_MOVW_ABS_NC;
-    case ARMMCExpr::VK_SBREL:
+    case ARM::S_SBREL:
       return ELF::R_ARM_THM_MOVW_BREL_NC;
     }
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 2b959768d2135..73ad62ed79532 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -14,7 +14,7 @@
 
 #include "ARMMCTargetDesc.h"
 #include "ARMUnwindOpAsm.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "Utils/ARMBaseInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallString.h"
@@ -590,7 +590,7 @@ class ARMELFStreamer : public MCELFStreamer {
   /// necessary.
   void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
     if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) {
-      if (SRE->getSpecifier() == ARMMCExpr::VK_SBREL && !(Size == 4)) {
+      if (SRE->getSpecifier() == ARM::S_SBREL && !(Size == 4)) {
         getContext().reportError(Loc, "relocated expression must be 32-bit");
         return;
       }
@@ -1255,7 +1255,7 @@ void ARMELFStreamer::emitFnEnd() {
     EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
 
   const MCSymbolRefExpr *FnStartRef =
-      MCSymbolRefExpr::create(FnStart, ARMMCExpr::VK_PREL31, getContext());
+      MCSymbolRefExpr::create(FnStart, ARM::S_PREL31, getContext());
 
   emitValue(FnStartRef, 4);
 
@@ -1264,7 +1264,7 @@ void ARMELFStreamer::emitFnEnd() {
   } else if (ExTab) {
     // Emit a reference to the unwind opcodes in the ".ARM.extab" section.
     const MCSymbolRefExpr *ExTabEntryRef =
-        MCSymbolRefExpr::create(ExTab, ARMMCExpr::VK_PREL31, getContext());
+        MCSymbolRefExpr::create(ExTab, ARM::S_PREL31, getContext());
     emitValue(ExTabEntryRef, 4);
   } else {
     // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
@@ -1294,8 +1294,8 @@ void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
 void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
   const MCSymbol *PersonalitySym = getContext().getOrCreateSymbol(Name);
 
-  const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(
-      PersonalitySym, ARMMCExpr::VK_ARM_NONE, getContext());
+  const MCSymbolRefExpr *PersonalityRef =
+      MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext());
 
   visitUsedExpr(*PersonalityRef);
   MCDataFragment *DF = getOrCreateDataFragment();
@@ -1341,7 +1341,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
   // Emit personality
   if (Personality) {
     const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::create(
-        Personality, uint16_t(ARMMCExpr::VK_PREL31), getContext());
+        Personality, uint16_t(ARM::S_PREL31), getContext());
 
     emitValue(PersonalityRef, 4);
   }
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 92121dd5704d8..f8ec0237dcb59 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -18,30 +18,30 @@
 using namespace llvm;
 
 const MCAsmInfo::VariantKindDesc variantKindDescs[] = {
-    {ARMMCExpr::VK_GOT_PREL, "GOT_PREL"},
-    {ARMMCExpr::VK_ARM_NONE, "none"},
-    {ARMMCExpr::VK_PREL31, "prel31"},
-    {ARMMCExpr::VK_SBREL, "sbrel"},
-    {ARMMCExpr::VK_TARGET1, "target1"},
-    {ARMMCExpr::VK_TARGET2, "target2"},
-    {ARMMCExpr::VK_TLSLDO, "TLSLDO"},
+    {ARM::S_GOT_PREL, "GOT_PREL"},
+    {ARM::S_ARM_NONE, "none"},
+    {ARM::S_PREL31, "prel31"},
+    {ARM::S_SBREL, "sbrel"},
+    {ARM::S_TARGET1, "target1"},
+    {ARM::S_TARGET2, "target2"},
+    {ARM::S_TLSLDO, "TLSLDO"},
     {MCSymbolRefExpr::VK_COFF_IMGREL32, "imgrel"},
-    {ARMMCExpr::VK_FUNCDESC, "FUNCDESC"},
-    {ARMMCExpr::VK_GOT, "GOT"},
-    {ARMMCExpr::VK_GOTFUNCDESC, "GOTFUNCDESC"},
-    {ARMMCExpr::VK_GOTOFF, "GOTOFF"},
-    {ARMMCExpr::VK_GOTOFFFUNCDESC, "GOTOFFFUNCDESC"},
-    {ARMMCExpr::VK_GOTTPOFF, "GOTTPOFF"},
-    {ARMMCExpr::VK_GOTTPOFF_FDPIC, "gottpoff_fdpic"},
-    {ARMMCExpr::VK_PLT, "PLT"},
+    {ARM::S_FUNCDESC, "FUNCDESC"},
+    {ARM::S_GOT, "GOT"},
+    {ARM::S_GOTFUNCDESC, "GOTFUNCDESC"},
+    {ARM::S_GOTOFF, "GOTOFF"},
+    {ARM::S_GOTOFFFUNCDESC, "GOTOFFFUNCDESC"},
+    {ARM::S_GOTTPOFF, "GOTTPOFF"},
+    {ARM::S_GOTTPOFF_FDPIC, "gottpoff_fdpic"},
+    {ARM::S_PLT, "PLT"},
     {MCSymbolRefExpr::VK_SECREL, "SECREL32"},
-    {ARMMCExpr::VK_TLSCALL, "tlscall"},
-    {ARMMCExpr::VK_TLSDESC, "tlsdesc"},
-    {ARMMCExpr::VK_TLSGD, "TLSGD"},
-    {ARMMCExpr::VK_TLSGD_FDPIC, "tlsgd_fdpic"},
-    {ARMMCExpr::VK_TLSLDM, "TLSLDM"},
-    {ARMMCExpr::VK_TLSLDM_FDPIC, "tlsldm_fdpic"},
-    {ARMMCExpr::VK_TPOFF, "TPOFF"},
+    {ARM::S_TLSCALL, "tlscall"},
+    {ARM::S_TLSDESC, "tlsdesc"},
+    {ARM::S_TLSGD, "TLSGD"},
+    {ARM::S_TLSGD_FDPIC, "tlsgd_fdpic"},
+    {ARM::S_TLSLDM, "TLSLDM"},
+    {ARM::S_TLSLDM_FDPIC, "tlsldm_fdpic"},
+    {ARM::S_TPOFF, "TPOFF"},
 };
 
 void ARMMCAsmInfoDarwin::anchor() { }
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index 55d7b299674d3..baadf74e0d5a5 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -13,9 +13,11 @@
 #ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 #define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 
+#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCExpr.h"
 
 namespace llvm {
 class Triple;
@@ -50,6 +52,49 @@ class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
   explicit ARMCOFFMCAsmInfoGNU();
 };
 
+namespace ARM {
+enum {
+  S_None,
+  S_HI16 =
+      MCSymbolRefExpr::FirstTargetSpecifier, // The R_ARM_MOVT_ABS relocation
+                                             // (:upper16: in the .s file)
+  S_LO16, // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
+
+  S_HI_8_15, // The R_ARM_THM_ALU_ABS_G3    relocation (:upper8_15: in
+             // the .s file)
+  S_HI_0_7,  // The R_ARM_THM_ALU_ABS_G2_NC relocation (:upper0_8: in the
+             // .s file)
+  S_LO_8_15, // The R_ARM_THM_ALU_ABS_G1_NC relocation (:lower8_15: in
+             // the .s file)
+  S_LO_0_7,  // The R_ARM_THM_ALU_ABS_G0_NC relocation (:lower0_7: in the
+             // .s file)
+
+  S_ARM_NONE,
+  S_FUNCDESC,
+  S_GOT,
+  S_GOTFUNCDESC,
+  S_GOTOFF,
+  S_GOTOFFFUNCDESC,
+  S_GOTTPOFF,
+  S_GOTTPOFF_FDPIC,
+  S_GOT_PREL,
+  S_PLT,
+  S_PREL31,
+  S_SBREL,
+  S_TARGET1,
+  S_TARGET2,
+  S_TLSCALL,
+  S_TLSDESC,
+  S_TLSDESCSEQ,
+  S_TLSGD,
+  S_TLSGD_FDPIC,
+  S_TLSLDM,
+  S_TLSLDM_FDPIC,
+  S_TLSLDO,
+  S_TPOFF,
+};
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index e79cdbde62ca9..f006e00ada328 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -13,7 +13,7 @@
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMFixupKinds.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCAsmInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
@@ -1201,18 +1201,18 @@ uint32_t ARMMCCodeEmitter::getHiLoImmOpValue(const MCInst &MI, unsigned OpIdx,
         report_fatal_error("constant value truncated (limited to 32-bit)");
 
       switch (ARM16Expr->getSpecifier()) {
-      case ARMMCExpr::VK_HI16:
+      case ARM::S_HI16:
         return (int32_t(Value) & 0xffff0000) >> 16;
-      case ARMMCExpr::VK_LO16:
+      case ARM::S_LO16:
         return (int32_t(Value) & 0x0000ffff);
 
-      case ARMMCExpr::VK_HI_8_15:
+      case ARM::S_HI_8_15:
         return (int32_t(Value) & 0xff000000) >> 24;
-      case ARMMCExpr::VK_HI_0_7:
+      case ARM::S_HI_0_7:
         return (int32_t(Value) & 0x00ff0000) >> 16;
-      case ARMMCExpr::VK_LO_8_15:
+      case ARM::S_LO_8_15:
         return (int32_t(Value) & 0x0000ff00) >> 8;
-      case ARMMCExpr::VK_LO_0_7:
+      case ARM::S_LO_0_7:
         return (int32_t(Value) & 0x000000ff);
 
       default: llvm_unreachable("Unsupported ARMFixup");
@@ -1221,30 +1221,30 @@ uint32_t ARMMCCodeEmitter::getHiLoImmOpValue(const MCInst &MI, unsigned OpIdx,
 
     switch (ARM16Expr->getSpecifier()) {
     default: llvm_unreachable("Unsupported ARMFixup");
-    case ARMMCExpr::VK_HI16:
+    case ARM::S_HI16:
       Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movt_hi16
                                       : ARM::fixup_arm_movt_hi16);
       break;
-    case ARMMCExpr::VK_LO16:
+    case ARM::S_LO16:
       Kind = MCFixupKind(isThumb(STI) ? ARM::fixup_t2_movw_lo16
                                       : ARM::fixup_arm_movw_lo16);
       break;
-    case ARMMCExpr::VK_HI_8_15:
+    case ARM::S_HI_8_15:
       if (!isThumb(STI))
         llvm_unreachable(":upper_8_15: not supported in Arm state");
       Kind = MCFixupKind(ARM::fixup_arm_thumb_upper_8_15);
       break;
-    case ARMMCExpr::VK_HI_0_7:
+    case ARM::S_HI_0_7:
       if (!isThumb(STI))
         llvm_unreachable(":upper_0_7: not supported in Arm state");
       Kind = MCFixupKind(ARM::fixup_arm_thumb_upper_0_7);
       break;
-    case ARMMCExpr::VK_LO_8_15:
+    case ARM::S_LO_8_15:
       if (!isThumb(STI))
         llvm_unreachable(":lower_8_15: not supported in Arm state");
       Kind = MCFixupKind(ARM::fixup_arm_thumb_lower_8_15);
       break;
-    case ARMMCExpr::VK_LO_0_7:
+    case ARM::S_LO_0_7:
       if (!isThumb(STI))
         llvm_unreachable(":lower_0_7: not supported in Arm state");
       Kind = MCFixupKind(ARM::fixup_arm_thumb_lower_0_7);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index 1035a9e131c48..1e6760a57608a 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMMCExpr.h"
+#include "ARMMCAsmInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
@@ -22,22 +23,22 @@ const ARMMCExpr *ARMMCExpr::create(Specifier S, const MCExpr *Expr,
 void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   switch (specifier) {
   default: llvm_unreachable("Invalid kind!");
-  case VK_HI16:
+  case ARM::S_HI16:
     OS << ":upper16:";
     break;
-  case VK_LO16:
+  case ARM::S_LO16:
     OS << ":lower16:";
     break;
-  case VK_HI_8_15:
+  case ARM::S_HI_8_15:
     OS << ":upper8_15:";
     break;
-  case VK_HI_0_7:
+  case ARM::S_HI_0_7:
     OS << ":upper0_7:";
     break;
-  case VK_LO_8_15:
+  case ARM::S_LO_8_15:
     OS << ":lower8_15:";
     break;
-  case VK_LO_0_7:
+  case ARM::S_LO_0_7:
     OS << ":lower0_7:";
     break;
   }
@@ -49,3 +50,29 @@ void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   if (Expr->getKind() != MCExpr::SymbolRef)
     OS << ')';
 }
+
+const ARMMCExpr *ARMMCExpr::createUpper16(const MCExpr *Expr, MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_HI16, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createLower16(const MCExpr *Expr, MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_LO16, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createUpper8_15(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_HI_8_15, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createUpper0_7(const MCExpr *Expr, MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_HI_0_7, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createLower8_15(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_LO_8_15, Expr, Ctx);
+}
+
+const ARMMCExpr *ARMMCExpr::createLower0_7(const MCExpr *Expr, MCContext &Ctx) {
+  return ARMMCExpr::create(ARM::S_LO_0_7, Expr, Ctx);
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index bcd92de3434ab..f29d05ba2a88d 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -16,46 +16,6 @@ namespace llvm {
 class ARMMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = uint16_t;
-  enum {
-    VK_None,
-    VK_HI16 =
-        MCSymbolRefExpr::FirstTargetSpecifier, // The R_ARM_MOVT_ABS relocation
-                                               // (:upper16: in the .s file)
-    VK_LO16, // The R_ARM_MOVW_ABS_NC relocation (:lower16: in the .s file)
-
-    VK_HI_8_15, // The R_ARM_THM_ALU_ABS_G3    relocation (:upper8_15: in
-                // the .s file)
-    VK_HI_0_7,  // The R_ARM_THM_ALU_ABS_G2_NC relocation (:upper0_8: in the
-                // .s file)
-    VK_LO_8_15, // The R_ARM_THM_ALU_ABS_G1_NC relocation (:lower8_15: in
-                // the .s file)
-    VK_LO_0_7,  // The R_ARM_THM_ALU_ABS_G0_NC relocation (:lower0_7: in the
-                // .s file)
-
-    VK_ARM_NONE,
-    VK_FUNCDESC,
-    VK_GOT,
-    VK_GOTFUNCDESC,
-    VK_GOTOFF,
-    VK_GOTOFFFUNCDESC,
-    VK_GOTTPOFF,
-    VK_GOTTPOFF_FDPIC,
-    VK_GOT_PREL,
-    VK_PLT,
-    VK_PREL31,
-    VK_SBREL,
-    VK_TARGET1,
-    VK_TARGET2,
-    VK_TLSCALL,
-    VK_TLSDESC,
-    VK_TLSDESCSEQ,
-    VK_TLSGD,
-    VK_TLSGD_FDPIC,
-    VK_TLSLDM,
-    VK_TLSLDM_FDPIC,
-    VK_TLSLDO,
-    VK_TPOFF,
-  };
 
 private:
   explicit ARMMCExpr(Specifier S, const MCExpr *Expr)
@@ -65,29 +25,12 @@ class ARMMCExpr : public MCSpecifierExpr {
   static const ARMMCExpr *create(Specifier S, const MCExpr *Expr,
                                  MCContext &Ctx);
 
-  static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HI16, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_LO16, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createUpper8_15(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HI_8_15, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createUpper0_7(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_HI_0_7, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createLower8_15(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_LO_8_15, Expr, Ctx);
-  }
-
-  static const ARMMCExpr *createLower0_7(const MCExpr *Expr, MCContext &Ctx) {
-    return create(VK_LO_0_7, Expr, Ctx);
-  }
+  static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createUpper8_15(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createUpper0_7(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createLower8_15(const MCExpr *Expr, MCContext &Ctx);
+  static const ARMMCExpr *createLower0_7(const MCExpr *Expr, MCContext &Ctx);
 
   void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
   bool evaluateAsRelocatableImpl(MCValue &Res,

From 7efc861ec45e05be9dae59fc7483a98510066160 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 16 Jun 2025 07:29:18 +0100
Subject: [PATCH 557/851] [AArch64][GlobalISel] Add test coverage for
 fdiv-combine.ll. NFC

---
 llvm/test/CodeGen/AArch64/fdiv-combine.ll | 156 +++++++++++++++-------
 1 file changed, 105 insertions(+), 51 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll
index 0627250d07791..d8f7f0a306684 100644
--- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll
@@ -1,19 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI:       warning: Instruction selection used fallback path for splat_fdiv_nxv4f32
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for splat_three_fdiv_nxv4f32
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for splat_fdiv_nxv2f64
+; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for splat_two_fdiv_nxv2f64
 
 ; Following test cases check:
 ;   a / D; b / D; c / D;
 ;                =>
 ;   recip = 1.0 / D; a * recip; b * recip; c * recip;
 define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
-; CHECK-LABEL: three_fdiv_float:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s4, #1.00000000
-; CHECK-NEXT:    fdiv s4, s4, s0
-; CHECK-NEXT:    fmul s0, s1, s4
-; CHECK-NEXT:    fmul s1, s2, s4
-; CHECK-NEXT:    fmul s2, s3, s4
-; CHECK-NEXT:    b foo_3f
+; CHECK-SD-LABEL: three_fdiv_float:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov s4, #1.00000000
+; CHECK-SD-NEXT:    fdiv s4, s4, s0
+; CHECK-SD-NEXT:    fmul s0, s1, s4
+; CHECK-SD-NEXT:    fmul s1, s2, s4
+; CHECK-SD-NEXT:    fmul s2, s3, s4
+; CHECK-SD-NEXT:    b foo_3f
+;
+; CHECK-GI-LABEL: three_fdiv_float:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fdiv s4, s1, s0
+; CHECK-GI-NEXT:    fdiv s1, s2, s0
+; CHECK-GI-NEXT:    fdiv s2, s3, s0
+; CHECK-GI-NEXT:    fmov s0, s4
+; CHECK-GI-NEXT:    b foo_3f
   %div = fdiv float %a, %D
   %div1 = fdiv float %b, %D
   %div2 = fdiv float %c, %D
@@ -22,14 +36,22 @@ define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 {
 }
 
 define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
-; CHECK-LABEL: three_fdiv_double:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d4, #1.00000000
-; CHECK-NEXT:    fdiv d4, d4, d0
-; CHECK-NEXT:    fmul d0, d1, d4
-; CHECK-NEXT:    fmul d1, d2, d4
-; CHECK-NEXT:    fmul d2, d3, d4
-; CHECK-NEXT:    b foo_3d
+; CHECK-SD-LABEL: three_fdiv_double:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov d4, #1.00000000
+; CHECK-SD-NEXT:    fdiv d4, d4, d0
+; CHECK-SD-NEXT:    fmul d0, d1, d4
+; CHECK-SD-NEXT:    fmul d1, d2, d4
+; CHECK-SD-NEXT:    fmul d2, d3, d4
+; CHECK-SD-NEXT:    b foo_3d
+;
+; CHECK-GI-LABEL: three_fdiv_double:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fdiv d4, d1, d0
+; CHECK-GI-NEXT:    fdiv d1, d2, d0
+; CHECK-GI-NEXT:    fdiv d2, d3, d0
+; CHECK-GI-NEXT:    fmov d0, d4
+; CHECK-GI-NEXT:    b foo_3d
   %div = fdiv double %a, %D
   %div1 = fdiv double %b, %D
   %div2 = fdiv double %c, %D
@@ -38,14 +60,22 @@ define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 {
 }
 
 define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
-; CHECK-LABEL: three_fdiv_4xfloat:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov v4.4s, #1.00000000
-; CHECK-NEXT:    fdiv v4.4s, v4.4s, v0.4s
-; CHECK-NEXT:    fmul v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v4.4s
-; CHECK-NEXT:    fmul v2.4s, v3.4s, v4.4s
-; CHECK-NEXT:    b foo_3_4xf
+; CHECK-SD-LABEL: three_fdiv_4xfloat:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov v4.4s, #1.00000000
+; CHECK-SD-NEXT:    fdiv v4.4s, v4.4s, v0.4s
+; CHECK-SD-NEXT:    fmul v0.4s, v1.4s, v4.4s
+; CHECK-SD-NEXT:    fmul v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    fmul v2.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    b foo_3_4xf
+;
+; CHECK-GI-LABEL: three_fdiv_4xfloat:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fdiv v4.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    fdiv v1.4s, v2.4s, v0.4s
+; CHECK-GI-NEXT:    fdiv v2.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    b foo_3_4xf
   %div = fdiv <4 x float> %a, %D
   %div1 = fdiv <4 x float> %b, %D
   %div2 = fdiv <4 x float> %c, %D
@@ -54,14 +84,22 @@ define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b,
 }
 
 define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
-; CHECK-LABEL: three_fdiv_2xdouble:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov v4.2d, #1.00000000
-; CHECK-NEXT:    fdiv v4.2d, v4.2d, v0.2d
-; CHECK-NEXT:    fmul v0.2d, v1.2d, v4.2d
-; CHECK-NEXT:    fmul v1.2d, v2.2d, v4.2d
-; CHECK-NEXT:    fmul v2.2d, v3.2d, v4.2d
-; CHECK-NEXT:    b foo_3_2xd
+; CHECK-SD-LABEL: three_fdiv_2xdouble:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    fmov v4.2d, #1.00000000
+; CHECK-SD-NEXT:    fdiv v4.2d, v4.2d, v0.2d
+; CHECK-SD-NEXT:    fmul v0.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT:    fmul v1.2d, v2.2d, v4.2d
+; CHECK-SD-NEXT:    fmul v2.2d, v3.2d, v4.2d
+; CHECK-SD-NEXT:    b foo_3_2xd
+;
+; CHECK-GI-LABEL: three_fdiv_2xdouble:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fdiv v4.2d, v1.2d, v0.2d
+; CHECK-GI-NEXT:    fdiv v1.2d, v2.2d, v0.2d
+; CHECK-GI-NEXT:    fdiv v2.2d, v3.2d, v0.2d
+; CHECK-GI-NEXT:    mov v0.16b, v4.16b
+; CHECK-GI-NEXT:    b foo_3_2xd
   %div = fdiv <2 x double> %a, %D
   %div1 = fdiv <2 x double> %b, %D
   %div2 = fdiv <2 x double> %c, %D
@@ -98,16 +136,25 @@ define void @two_fdiv_double(double %D, double %a, double %b) #0 {
 }
 
 define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
-; CHECK-LABEL: splat_three_fdiv_4xfloat:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    fmov v4.4s, #1.00000000
-; CHECK-NEXT:    dup v0.4s, v0.s[0]
-; CHECK-NEXT:    fdiv v4.4s, v4.4s, v0.4s
-; CHECK-NEXT:    fmul v0.4s, v1.4s, v4.4s
-; CHECK-NEXT:    fmul v1.4s, v2.4s, v4.4s
-; CHECK-NEXT:    fmul v2.4s, v3.4s, v4.4s
-; CHECK-NEXT:    b foo_3_4xf
+; CHECK-SD-LABEL: splat_three_fdiv_4xfloat:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    fmov v4.4s, #1.00000000
+; CHECK-SD-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-SD-NEXT:    fdiv v4.4s, v4.4s, v0.4s
+; CHECK-SD-NEXT:    fmul v0.4s, v1.4s, v4.4s
+; CHECK-SD-NEXT:    fmul v1.4s, v2.4s, v4.4s
+; CHECK-SD-NEXT:    fmul v2.4s, v3.4s, v4.4s
+; CHECK-SD-NEXT:    b foo_3_4xf
+;
+; CHECK-GI-LABEL: splat_three_fdiv_4xfloat:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    dup v4.4s, v0.s[0]
+; CHECK-GI-NEXT:    fdiv v0.4s, v1.4s, v4.4s
+; CHECK-GI-NEXT:    fdiv v1.4s, v2.4s, v4.4s
+; CHECK-GI-NEXT:    fdiv v2.4s, v3.4s, v4.4s
+; CHECK-GI-NEXT:    b foo_3_4xf
   %D.ins = insertelement <4 x float> poison, float %D, i64 0
   %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer
   %div = fdiv <4 x float> %a, %splat
@@ -118,14 +165,21 @@ define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b,
 }
 
 define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 {
-; CHECK-LABEL: splat_fdiv_v4f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT:    fmov v2.4s, #1.00000000
-; CHECK-NEXT:    dup v0.4s, v0.s[0]
-; CHECK-NEXT:    fdiv v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    fmul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: splat_fdiv_v4f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT:    fmov v2.4s, #1.00000000
+; CHECK-SD-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-SD-NEXT:    fdiv v0.4s, v2.4s, v0.4s
+; CHECK-SD-NEXT:    fmul v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: splat_fdiv_v4f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-GI-NEXT:    fdiv v0.4s, v1.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %D.ins = insertelement <4 x float> poison, float %D, i64 0
   %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer

From f875efe1d82d920790e368f9ab2b31f173a523e1 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Mon, 16 Jun 2025 13:32:44 +0800
Subject: [PATCH 558/851] [RISCV] Use `GetVTypeMinimalPredicates` instead of
 `GetVTypePredicates` for vrgatherei16/vslideup/vslidedown. NFC.

---
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 5e554d2d03911..9c03c7c83af04 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -4916,8 +4916,8 @@ multiclass VPatBinaryV_VV_INT_EEW<string intrinsic, string instruction,
       defvar emul_str = octuple_to_str<octuple_emul>.ret;
       defvar ivti = !cast<VTypeInfo>("VI" # eew # emul_str);
       defvar inst = instruction # "_VV_" # vti.LMul.MX # "_E" # vti.SEW # "_" # emul_str;
-      let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
-                                   GetVTypePredicates<ivti>.Predicates) in
+      let Predicates = !listconcat(GetVTypeMinimalPredicates<vti>.Predicates,
+                                   GetVTypeMinimalPredicates<ivti>.Predicates) in
       defm : VPatBinary<intrinsic, inst,
                         vti.Vector, vti.Vector, ivti.Vector, vti.Mask,
                         vti.Log2SEW, vti.RegClass,
@@ -5584,7 +5584,7 @@ multiclass VPatTernaryV_VV_AAXA_RM<string intrinsic, string instruction,
 multiclass VPatTernaryV_VX<string intrinsic, string instruction,
                            list<VTypeInfo> vtilist> {
   foreach vti = vtilist in
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatTernaryWithPolicy<intrinsic, instruction, "VX",
                                  vti.Vector, vti.Vector, XLenVT, vti.Mask,
                                  vti.Log2SEW, vti.LMul, vti.RegClass,
@@ -5616,7 +5616,7 @@ multiclass VPatTernaryV_VX_AAXA_RM<string intrinsic, string instruction,
 multiclass VPatTernaryV_VI<string intrinsic, string instruction,
                            list<VTypeInfo> vtilist, Operand Imm_type> {
   foreach vti = vtilist in
-    let Predicates = GetVTypePredicates<vti>.Predicates in
+    let Predicates = GetVTypeMinimalPredicates<vti>.Predicates in
     defm : VPatTernaryWithPolicy<intrinsic, instruction, "VI",
                                  vti.Vector, vti.Vector, XLenVT, vti.Mask,
                                  vti.Log2SEW, vti.LMul, vti.RegClass,
@@ -7414,12 +7414,8 @@ defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllInteger
 defm : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>;
 defm : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>;
 
-defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectorsExceptFP16, uimm5>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFP16Vectors, uimm5>;
-defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectorsExceptFP16, uimm5>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFP16Vectors, uimm5>;
+defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>;
+defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>;
 defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>;
 defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>;
 
@@ -7436,10 +7432,7 @@ defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
 defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER",
                                 AllBFloatVectors, uimm5>;
 defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
-                              eew=16, vtilist=AllFloatVectorsExceptFP16>;
-let Predicates = [HasVInstructionsF16Minimal] in
-  defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16",
-                                eew=16, vtilist=AllFP16Vectors>;
+                              eew=16, vtilist=AllFloatVectors>;
 //===----------------------------------------------------------------------===//
 // 16.5. Vector Compress Instruction
 //===----------------------------------------------------------------------===//

From 7d9a451d875368baece310ca7226e3adbc00e1bf Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Mon, 16 Jun 2025 12:28:12 +0530
Subject: [PATCH 559/851] [RISCV] Change input register type for QC_SWM and
 QC_SWMI (#144294)

Version 0.13 of the `Xqci` spec changes the register type of input
operand `rs3` from `GPR` to `GPRNoX0` for these two instructions.

The spec can be found at
https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0
---
 llvm/docs/RISCVUsage.rst                    |  2 +-
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 17 +++++++++--------
 llvm/test/MC/RISCV/xqcilsm-invalid.s        |  8 ++++++++
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 7d0d0cc21a27d..64f17f59575ea 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -487,7 +487,7 @@ The current vendor extensions supported are:
   LLVM implements `version 0.3 of the Qualcomm uC Large Offset Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilsm``
-  LLVM implements `version 0.5 of the Qualcomm uC Load Store Multiple extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.6 of the Qualcomm uC Load Store Multiple extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcisim``
   LLVM implements `version 0.2 of the Qualcomm uC Simulation Hint extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 9f96a3ed80561..b94fee3c6e575 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -574,9 +574,10 @@ class QCILoadMultiple<bits<2> funct2, DAGOperand InTyRs2, string opcodestr>
 
 // rd corresponds to the source for the store 'rs3' described in the spec.
 let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
-class QCIStoreMultiple<bits<2> funct2, DAGOperand InTyRs2, string opcodestr>
+class QCIStoreMultiple<bits<2> funct2, DAGOperand InTyRd, DAGOperand InTyRs2,
+                       string opcodestr>
     : RVInstRBase<0b111, OPC_CUSTOM_1, (outs),
-                  (ins GPR:$rd, GPR:$rs1, InTyRs2:$rs2, uimm7_lsb00:$imm),
+                  (ins InTyRd:$rd, GPR:$rs1, InTyRs2:$rs2, uimm7_lsb00:$imm),
                   opcodestr, "$rd, $rs2, ${imm}(${rs1})"> {
   bits<7> imm;
   let Inst{31-25} = {funct2, imm{6-2}};
@@ -967,10 +968,10 @@ let Predicates = [HasVendorXqcics, IsRV32] in {
 } // Predicates = [HasVendorXqcics, IsRV32]
 
 let Predicates = [HasVendorXqcilsm, IsRV32] in {
-    def QC_SWM : QCIStoreMultiple<0b00, GPRNoX0, "qc.swm">;
-    def QC_SWMI : QCIStoreMultiple<0b01, uimm5nonzero, "qc.swmi">;
-    def QC_SETWM : QCIStoreMultiple<0b10, GPRNoX0, "qc.setwm">;
-    def QC_SETWMI : QCIStoreMultiple<0b11, uimm5nonzero, "qc.setwmi">;
+    def QC_SWM : QCIStoreMultiple<0b00, GPRNoX0, GPRNoX0, "qc.swm">;
+    def QC_SWMI : QCIStoreMultiple<0b01, GPRNoX0, uimm5nonzero, "qc.swmi">;
+    def QC_SETWM : QCIStoreMultiple<0b10, GPR, GPRNoX0, "qc.setwm">;
+    def QC_SETWMI : QCIStoreMultiple<0b11, GPR, uimm5nonzero, "qc.setwmi">;
 
     def QC_LWM : QCILoadMultiple<0b00, GPRNoX0, "qc.lwm">;
     def QC_LWMI : QCILoadMultiple<0b01, uimm5nonzero, "qc.lwmi">;
@@ -1211,9 +1212,9 @@ let EmitPriority = 0 in {
 let Predicates = [HasVendorXqcilsm, IsRV32] in {
 let EmitPriority = 0 in {
   def : InstAlias<"qc.swm $rs3, $rs2, (${rs1})",
-                  (QC_SWM GPR:$rs3, GPR:$rs1, GPRNoX0:$rs2, 0)>;
+                  (QC_SWM GPRNoX0:$rs3, GPR:$rs1, GPRNoX0:$rs2, 0)>;
   def : InstAlias<"qc.swmi $rs3, $length, (${rs1})",
-                  (QC_SWMI GPR:$rs3, GPR:$rs1, uimm5nonzero:$length, 0)>;
+                  (QC_SWMI GPRNoX0:$rs3, GPR:$rs1, uimm5nonzero:$length, 0)>;
   def : InstAlias<"qc.setwm $rs3, $rs2, (${rs1})",
                   (QC_SETWM GPR:$rs3, GPR:$rs1, GPRNoX0:$rs2, 0)>;
   def : InstAlias<"qc.setwmi $rs3, $length, (${rs1})",
diff --git a/llvm/test/MC/RISCV/xqcilsm-invalid.s b/llvm/test/MC/RISCV/xqcilsm-invalid.s
index 15d55021d64e4..a3421db0eff4f 100644
--- a/llvm/test/MC/RISCV/xqcilsm-invalid.s
+++ b/llvm/test/MC/RISCV/xqcilsm-invalid.s
@@ -7,6 +7,10 @@
 # CHECK: :[[@LINE+1]]:20: error: expected register
 qc.swm x5, x20, 12(20)
 
+# CHECK-PLUS: :[[@LINE+2]]:8: error: register must be a GPR excluding zero (x0)
+# CHECK-MINUS: :[[@LINE+1]]:8: error: invalid operand for instruction
+qc.swm x0, x20, 12(x3)
+
 # CHECK-PLUS: :[[@LINE+2]]:12: error: register must be a GPR excluding zero (x0)
 # CHECK-MINUS: :[[@LINE+1]]:12: error: invalid operand for instruction
 qc.swm x5, x0, 12(x3)
@@ -24,6 +28,10 @@ qc.swm x5, x20, 12(x3)
 # CHECK: :[[@LINE+1]]:20: error: expected register
 qc.swmi x10, 4, 20(4)
 
+# CHECK-PLUS: :[[@LINE+2]]:9: error: register must be a GPR excluding zero (x0)
+# CHECK-MINUS: :[[@LINE+1]]:9: error: invalid operand for instruction
+qc.swmi x0, 4, 20(x4)
+
 # CHECK: :[[@LINE+1]]:1: error: too few operands for instruction
 qc.swmi x10, 4, 20
 

From 222ab28a9240e03479341cba2f487b8350635fce Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Mon, 16 Jun 2025 00:15:06 -0700
Subject: [PATCH 560/851] [aarch64] Fix Arm64EC libcall lowering after recent
 refactoring. (#143977)

The refactored code accidentally tokenized a string instead of just
concatenating it.

Add a regression test and some assertions to ensure consistency.

Fixes #143890 .
---
 llvm/include/llvm/IR/RuntimeLibcalls.def      | 48 +++++++++----------
 llvm/lib/IR/RuntimeLibcalls.cpp               | 15 ++++--
 .../WebAssemblyRuntimeLibcallSignatures.cpp   |  2 +
 llvm/test/CodeGen/AArch64/arm64ec-builtins.ll | 45 +++++++++++++++++
 4 files changed, 82 insertions(+), 28 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64ec-builtins.ll

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def
index 4ddae8e48193f..247643525ff48 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.def
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.def
@@ -70,16 +70,16 @@ HANDLE_LIBCALL(UREM_I16, "__umodhi3")
 HANDLE_LIBCALL(UREM_I32, "__umodsi3")
 HANDLE_LIBCALL(UREM_I64, "__umoddi3")
 HANDLE_LIBCALL(UREM_I128, "__umodti3")
-HANDLE_LIBCALL(SDIVREM_I8, nullptr)
-HANDLE_LIBCALL(SDIVREM_I16, nullptr)
-HANDLE_LIBCALL(SDIVREM_I32, nullptr)
-HANDLE_LIBCALL(SDIVREM_I64, nullptr)
-HANDLE_LIBCALL(SDIVREM_I128, nullptr)
-HANDLE_LIBCALL(UDIVREM_I8, nullptr)
-HANDLE_LIBCALL(UDIVREM_I16, nullptr)
-HANDLE_LIBCALL(UDIVREM_I32, nullptr)
-HANDLE_LIBCALL(UDIVREM_I64, nullptr)
-HANDLE_LIBCALL(UDIVREM_I128, nullptr)
+HANDLE_LIBCALL(SDIVREM_I8, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SDIVREM_I16, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SDIVREM_I32, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SDIVREM_I64, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SDIVREM_I128, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I8, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I16, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I32, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I64, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(UDIVREM_I128, LIBCALL_NO_NAME)
 HANDLE_LIBCALL(NEG_I32, "__negsi2")
 HANDLE_LIBCALL(NEG_I64, "__negdi2")
 HANDLE_LIBCALL(CTLZ_I32, "__clzsi2")
@@ -240,13 +240,13 @@ HANDLE_LIBCALL(ATAN2_F64, "atan2")
 HANDLE_LIBCALL(ATAN2_F80, "atan2l")
 HANDLE_LIBCALL(ATAN2_F128,"atan2l")
 HANDLE_LIBCALL(ATAN2_PPCF128, "atan2l")
-HANDLE_LIBCALL(SINCOS_F32, nullptr)
-HANDLE_LIBCALL(SINCOS_F64, nullptr)
-HANDLE_LIBCALL(SINCOS_F80, nullptr)
-HANDLE_LIBCALL(SINCOS_F128, nullptr)
-HANDLE_LIBCALL(SINCOS_PPCF128, nullptr)
-HANDLE_LIBCALL(SINCOS_STRET_F32, nullptr)
-HANDLE_LIBCALL(SINCOS_STRET_F64, nullptr)
+HANDLE_LIBCALL(SINCOS_F32, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_F64, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_F80, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_F128, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_PPCF128, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_STRET_F32, LIBCALL_NO_NAME)
+HANDLE_LIBCALL(SINCOS_STRET_F64, LIBCALL_NO_NAME)
 HANDLE_LIBCALL(POW_F32, "powf")
 HANDLE_LIBCALL(POW_F64, "pow")
 HANDLE_LIBCALL(POW_F80, "powl")
@@ -518,7 +518,7 @@ HANDLE_LIBCALL(MEMMOVE, "memmove")
 HANDLE_LIBCALL(MEMSET, "memset")
 // DSEPass can emit calloc if it finds a pair of malloc/memset
 HANDLE_LIBCALL(CALLOC, "calloc")
-HANDLE_LIBCALL(BZERO, nullptr)
+HANDLE_LIBCALL(BZERO, LIBCALL_NO_NAME)
 
 // Element-wise unordered-atomic memory of different sizes
 HANDLE_LIBCALL(MEMCPY_ELEMENT_UNORDERED_ATOMIC_1, "__llvm_memcpy_element_unordered_atomic_1")
@@ -669,10 +669,10 @@ HANDLE_LIBCALL(ATOMIC_FETCH_NAND_16, "__atomic_fetch_nand_16")
 
 // Out-of-line atomics libcalls
 #define HLCALLS(A, N)                                                          \
-  HANDLE_LIBCALL(A##N##_RELAX, nullptr)                                        \
-  HANDLE_LIBCALL(A##N##_ACQ, nullptr)                                          \
-  HANDLE_LIBCALL(A##N##_REL, nullptr)                                          \
-  HANDLE_LIBCALL(A##N##_ACQ_REL, nullptr)
+  HANDLE_LIBCALL(A##N##_RELAX, LIBCALL_NO_NAME)                                \
+  HANDLE_LIBCALL(A##N##_ACQ, LIBCALL_NO_NAME)                                  \
+  HANDLE_LIBCALL(A##N##_REL, LIBCALL_NO_NAME)                                  \
+  HANDLE_LIBCALL(A##N##_ACQ_REL, LIBCALL_NO_NAME)
 #define HLCALL5(A)                                                             \
   HLCALLS(A, 1) HLCALLS(A, 2) HLCALLS(A, 4) HLCALLS(A, 8) HLCALLS(A, 16)
 HLCALL5(OUTLINE_ATOMIC_CAS)
@@ -691,11 +691,11 @@ HANDLE_LIBCALL(STACKPROTECTOR_CHECK_FAIL, "__stack_chk_fail")
 HANDLE_LIBCALL(DEOPTIMIZE, "__llvm_deoptimize")
 
 // Return address
-HANDLE_LIBCALL(RETURN_ADDRESS, nullptr)
+HANDLE_LIBCALL(RETURN_ADDRESS, LIBCALL_NO_NAME)
 
 // Clear cache
 HANDLE_LIBCALL(CLEAR_CACHE, "__clear_cache")
 HANDLE_LIBCALL(RISCV_FLUSH_ICACHE, "__riscv_flush_icache")
 
-HANDLE_LIBCALL(UNKNOWN_LIBCALL, nullptr)
+HANDLE_LIBCALL(UNKNOWN_LIBCALL, LIBCALL_NO_NAME)
 
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index d655f84b37c50..d63d398e243f9 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -21,13 +21,17 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
   if (TT.isWindowsArm64EC()) {
     // FIXME: are there calls we need to exclude from this?
 #define HANDLE_LIBCALL(code, name)                                             \
-  {                                                                            \
+  if (sizeof(name) != 1) {                                                     \
     const char *libcallName = Info.getLibcallName(RTLIB::code);                \
-    if (libcallName && libcallName[0] != '#')                                  \
-      Info.setLibcallName(RTLIB::code, "#" #name);                             \
+    if (libcallName && libcallName[0] != '#') {                                \
+      assert(strcmp(libcallName, name) == 0 && "Unexpected name");             \
+      Info.setLibcallName(RTLIB::code, "#" name);                              \
+    }                                                                          \
   }
+#define LIBCALL_NO_NAME ""
 #include "llvm/IR/RuntimeLibcalls.def"
 #undef HANDLE_LIBCALL
+#undef LIBCALL_NO_NAME
   }
 }
 
@@ -223,8 +227,10 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
             nullptr);
 
 #define HANDLE_LIBCALL(code, name) setLibcallName(RTLIB::code, name);
+#define LIBCALL_NO_NAME nullptr
 #include "llvm/IR/RuntimeLibcalls.def"
 #undef HANDLE_LIBCALL
+#undef LIBCALL_NO_NAME
 
   // Initialize calling conventions to their default.
   for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
@@ -462,7 +468,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
   }
 
   // Setup Windows compiler runtime calls.
-  if (TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()) {
+  if (TT.getArch() == Triple::x86 &&
+      (TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment())) {
     static const struct {
       const RTLIB::Libcall Op;
       const char *const Name;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index ce795d3dedc6a..d5c4532824c07 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -531,8 +531,10 @@ struct StaticLibcallNameMap {
   StaticLibcallNameMap() {
     static const std::pair<const char *, RTLIB::Libcall> NameLibcalls[] = {
 #define HANDLE_LIBCALL(code, name) {(const char *)name, RTLIB::code},
+#define LIBCALL_NO_NAME nullptr
 #include "llvm/IR/RuntimeLibcalls.def"
 #undef HANDLE_LIBCALL
+#undef LIBCALL_NO_NAME
     };
     for (const auto &NameLibcall : NameLibcalls) {
       if (NameLibcall.first != nullptr &&
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
new file mode 100644
index 0000000000000..92b95a90d89a0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s
+
+define void @f1(ptr %p, i64 %n) {
+; CHECK-LABEL: "#f1":
+; CHECK: bl "#memset"
+  call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 %n, i1 false)
+  ret void
+}
+
+define void @f2(ptr %p1, ptr %p2, i64 %n) {
+; CHECK-LABEL: "#f2":
+; CHECK: bl "#memcpy"
+  call void @llvm.memcpy.p0.i64(ptr %p1, ptr %p2, i64 %n, i1 false)
+  ret void
+}
+
+define double @f3(double %x, double %y) {
+; CHECK-LABEL: "#f3":
+; CHECK: b "#fmod"
+  %r = frem double %x, %y
+  ret double %r
+}
+
+define i128 @f4(i128 %x, i128 %y) {
+; CHECK-LABEL: "#f4":
+; CHECK: bl "#__divti3"
+  %r = sdiv i128 %x, %y
+  ret i128 %r
+}
+
+; FIXME: This is wrong; should be "#__aarch64_cas1_relax"
+define i8 @f5(i8 %expected, i8 %new, ptr %ptr) "target-features"="+outline-atomics" {
+; CHECK-LABEL: "#f5":
+; CHECK: bl __aarch64_cas1_relax
+    %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1
+   %r = extractvalue { i8, i1 } %pair, 0
+    ret i8 %r
+}
+
+define float @f6(float %val, i32 %a) {
+; CHECK-LABEL: "#f6":
+; CHECK: bl "#ldexp"
+  %call = tail call fast float @llvm.ldexp.f32(float %val, i32 %a)
+  ret float %call
+}

From 9fcd14d9b013d0c4b8ec245772b3be3d5c31b885 Mon Sep 17 00:00:00 2001
From: Henrich Lauko <xlauko@mail.muni.cz>
Date: Mon, 16 Jun 2025 09:21:05 +0200
Subject: [PATCH 561/851] [MLIR][ODS] Optionally generate public C++ functions
 for attribute constraints (#144275)

Add `gen-attr-constraint-decls` and `gen-attr-constraint-defs`, which
generate public C++ functions for attribute constraints. The name of the C++
function is specified in the `cppFunctionName` field.

This generalize `cppFunctionName` from `TypeConstraint` introduced in
 https://github.com/llvm/llvm-project/pull/104577 to be usable also in `AttrConstraint`.
---
 mlir/docs/DefiningDialects/Constraints.md   | 23 +++---
 mlir/include/mlir/IR/Constraints.td         | 19 +++--
 mlir/test/mlir-tblgen/attr-constraints.td   | 14 ++++
 mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp | 91 +++++++++++++++++----
 4 files changed, 114 insertions(+), 33 deletions(-)
 create mode 100644 mlir/test/mlir-tblgen/attr-constraints.td

diff --git a/mlir/docs/DefiningDialects/Constraints.md b/mlir/docs/DefiningDialects/Constraints.md
index 52a4283d6084c..40863e7aecf4a 100644
--- a/mlir/docs/DefiningDialects/Constraints.md
+++ b/mlir/docs/DefiningDialects/Constraints.md
@@ -24,8 +24,8 @@ code is generated for type/attribute constraints. Type constraints can not only
 be used when defining operation arguments, but also when defining type
 parameters.
 
-Optionally, C++ functions can be generated, so that type constraints can be
-checked from C++. The name of the C++ function must be specified in the
+Optionally, C++ functions can be generated, so that type/attribute constraints
+can be checked from C++. The name of the C++ function must be specified in the
 `cppFunctionName` field. If no function name is specified, no C++ function is
 emitted.
 
@@ -43,17 +43,20 @@ bool isValidVectorTypeElementType(::mlir::Type type) {
 }
 ```
 
-An extra TableGen rule is needed to emit C++ code for type constraints. This
-will generate only the declarations/definitions of the type constaraints that
-are defined in the specified `.td` file, but not those that are in included
-`.td` files.
+An extra TableGen rule is needed to emit C++ code for type/attribute
+constraints. This will generate only the declarations/definitions of the
+type/attribute constaraints that are defined in the specified `.td` file, but
+not those that are in included `.td` files.
 
 ```cmake
 mlir_tablegen(<Your Dialect>TypeConstraints.h.inc -gen-type-constraint-decls)
 mlir_tablegen(<Your Dialect>TypeConstraints.cpp.inc -gen-type-constraint-defs)
+mlir_tablegen(<Your Dialect>AttrConstraints.h.inc -gen-attr-constraint-decls)
+mlir_tablegen(<Your Dialect>AttrConstraints.cpp.inc -gen-attr-constraint-defs)
 ```
 
-The generated `<Your Dialect>TypeConstraints.h.inc` will need to be included
-whereever you are referencing the type constraint in C++. Note that no C++
-namespace will be emitted by the code generator. The `#include` statements of
-the `.h.inc`/`.cpp.inc` files should be wrapped in C++ namespaces by the user.
+The generated `<Your Dialect>TypeConstraints.h.inc` respectivelly
+`<Your Dialect>AttrConstraints.h.inc` will need to be included whereever you are
+referencing the type/attributes constraint in C++. Note that no C++ namespace
+will be emitted by the code generator. The `#include` statements of the
+`.h.inc`/`.cpp.inc` files should be wrapped in C++ namespaces by the user.
diff --git a/mlir/include/mlir/IR/Constraints.td b/mlir/include/mlir/IR/Constraints.td
index 33e8581ecd356..0d59fffce9df9 100644
--- a/mlir/include/mlir/IR/Constraints.td
+++ b/mlir/include/mlir/IR/Constraints.td
@@ -148,6 +148,15 @@ class Constraint<Pred pred, string desc = ""> {
   string summary = desc;
 }
 
+// Base class for constraints on types and attributes.
+class AttrTypeConstraint<Pred pred, string summary = "",
+                         string cppFunctionNameParam = ""> :
+    Constraint<pred, summary> {
+  // The name of the C++ function that is generated for this constraint.
+  // If empty, no C++ function is generated.
+  string cppFunctionName = cppFunctionNameParam;
+}
+
 // Subclasses used to differentiate different constraint kinds. These are used
 // as markers for the TableGen backend to handle different constraint kinds
 // differently if needed. Constraints not deriving from the following subclasses
@@ -157,17 +166,15 @@ class Constraint<Pred pred, string desc = ""> {
 class TypeConstraint<Pred predicate, string summary = "",
                      string cppTypeParam = "::mlir::Type",
                      string cppFunctionNameParam = ""> :
-    Constraint<predicate, summary> {
+    AttrTypeConstraint<predicate, summary, cppFunctionNameParam> {
   // The name of the C++ Type class if known, or Type if not.
   string cppType = cppTypeParam;
-  // The name of the C++ function that is generated for this type constraint.
-  // If empty, no C++ function is generated.
-  string cppFunctionName = cppFunctionNameParam;
 }
 
 // Subclass for constraints on an attribute.
-class AttrConstraint<Pred predicate, string summary = ""> :
-    Constraint<predicate, summary>;
+class AttrConstraint<Pred predicate, string summary = "",
+                     string cppFunctionNameParam = ""> :
+    AttrTypeConstraint<predicate, summary, cppFunctionNameParam>;
 
 // Subclass for constraints on a property.
 class PropConstraint<Pred predicate, string summary = "", string interfaceTypeParam = ""> :
diff --git a/mlir/test/mlir-tblgen/attr-constraints.td b/mlir/test/mlir-tblgen/attr-constraints.td
new file mode 100644
index 0000000000000..59bc5f2526603
--- /dev/null
+++ b/mlir/test/mlir-tblgen/attr-constraints.td
@@ -0,0 +1,14 @@
+// RUN: mlir-tblgen -gen-attr-constraint-decls -I %S/../../include %s | FileCheck %s --check-prefix=DECL
+// RUN: mlir-tblgen -gen-attr-constraint-defs -I %S/../../include %s | FileCheck %s --check-prefix=DEF
+
+include "mlir/IR/CommonAttrConstraints.td"
+
+def DummyConstraint : AnyAttrOf<[APIntAttr, ArrayAttr, UnitAttr]> {
+  let cppFunctionName = "isValidDummy";
+}
+
+// DECL: bool isValidDummy(::mlir::Attribute attr);
+
+// DEF: bool isValidDummy(::mlir::Attribute attr) {
+// DEF:   return (((::llvm::isa<::mlir::IntegerAttr>(attr))) || ((::llvm::isa<::mlir::ArrayAttr>(attr))) || ((::llvm::isa<::mlir::UnitAttr>(attr))));
+// DEF: }
diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
index 2a6071602fa49..defd1fa12ca1a 100644
--- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
+++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp
@@ -1083,15 +1083,15 @@ bool DefGenerator::emitDefs(StringRef selectedDialect) {
 }
 
 //===----------------------------------------------------------------------===//
-// Type Constraints
+// Constraints
 //===----------------------------------------------------------------------===//
 
 /// Find all type constraints for which a C++ function should be generated.
-static std::vector<Constraint>
-getAllTypeConstraints(const RecordKeeper &records) {
+static std::vector<Constraint> getAllCppConstraints(const RecordKeeper &records,
+                                                    StringRef constraintKind) {
   std::vector<Constraint> result;
   for (const Record *def :
-       records.getAllDerivedDefinitionsIfDefined("TypeConstraint")) {
+       records.getAllDerivedDefinitionsIfDefined(constraintKind)) {
     // Ignore constraints defined outside of the top-level file.
     if (llvm::SrcMgr.FindBufferContainingLoc(def->getLoc()[0]) !=
         llvm::SrcMgr.getMainFileID())
@@ -1105,32 +1105,74 @@ getAllTypeConstraints(const RecordKeeper &records) {
   return result;
 }
 
+static std::vector<Constraint>
+getAllCppTypeConstraints(const RecordKeeper &records) {
+  return getAllCppConstraints(records, "TypeConstraint");
+}
+
+static std::vector<Constraint>
+getAllCppAttrConstraints(const RecordKeeper &records) {
+  return getAllCppConstraints(records, "AttrConstraint");
+}
+
+/// Emit the declarations for the given constraints, of the form:
+/// `bool <constraintCppFunctionName>(<parameterTypeName> <parameterName>);`
+static void emitConstraintDecls(const std::vector<Constraint> &constraints,
+                                raw_ostream &os, StringRef parameterTypeName,
+                                StringRef parameterName) {
+  static const char *const constraintDecl = "bool {0}({1} {2});\n";
+  for (Constraint constr : constraints)
+    os << strfmt(constraintDecl, *constr.getCppFunctionName(),
+                 parameterTypeName, parameterName);
+}
+
 static void emitTypeConstraintDecls(const RecordKeeper &records,
                                     raw_ostream &os) {
-  static const char *const typeConstraintDecl = R"(
-bool {0}(::mlir::Type type);
-)";
+  emitConstraintDecls(getAllCppTypeConstraints(records), os, "::mlir::Type",
+                      "type");
+}
 
-  for (Constraint constr : getAllTypeConstraints(records))
-    os << strfmt(typeConstraintDecl, *constr.getCppFunctionName());
+static void emitAttrConstraintDecls(const RecordKeeper &records,
+                                    raw_ostream &os) {
+  emitConstraintDecls(getAllCppAttrConstraints(records), os,
+                      "::mlir::Attribute", "attr");
 }
 
-static void emitTypeConstraintDefs(const RecordKeeper &records,
-                                   raw_ostream &os) {
-  static const char *const typeConstraintDef = R"(
-bool {0}(::mlir::Type type) {
-  return ({1});
+/// Emit the definitions for the given constraints, of the form:
+/// `bool <constraintCppFunctionName>(<parameterTypeName> <parameterName>) {
+///   return (<condition>); }`
+/// where `<condition>` is the condition template with the `self` variable
+/// replaced with the `selfName` parameter.
+static void emitConstraintDefs(const std::vector<Constraint> &constraints,
+                               raw_ostream &os, StringRef parameterTypeName,
+                               StringRef selfName) {
+  static const char *const constraintDef = R"(
+bool {0}({1} {2}) {
+return ({3});
 }
 )";
 
-  for (Constraint constr : getAllTypeConstraints(records)) {
+  for (Constraint constr : constraints) {
     FmtContext ctx;
-    ctx.withSelf("type");
+    ctx.withSelf(selfName);
     std::string condition = tgfmt(constr.getConditionTemplate(), &ctx);
-    os << strfmt(typeConstraintDef, *constr.getCppFunctionName(), condition);
+    os << strfmt(constraintDef, *constr.getCppFunctionName(), parameterTypeName,
+                 selfName, condition);
   }
 }
 
+static void emitTypeConstraintDefs(const RecordKeeper &records,
+                                   raw_ostream &os) {
+  emitConstraintDefs(getAllCppTypeConstraints(records), os, "::mlir::Type",
+                     "type");
+}
+
+static void emitAttrConstraintDefs(const RecordKeeper &records,
+                                   raw_ostream &os) {
+  emitConstraintDefs(getAllCppAttrConstraints(records), os, "::mlir::Attribute",
+                     "attr");
+}
+
 //===----------------------------------------------------------------------===//
 // GEN: Registration hooks
 //===----------------------------------------------------------------------===//
@@ -1158,6 +1200,21 @@ static mlir::GenRegistration
                    return generator.emitDecls(attrDialect);
                  });
 
+static mlir::GenRegistration
+    genAttrConstrDefs("gen-attr-constraint-defs",
+                      "Generate attribute constraint definitions",
+                      [](const RecordKeeper &records, raw_ostream &os) {
+                        emitAttrConstraintDefs(records, os);
+                        return false;
+                      });
+static mlir::GenRegistration
+    genAttrConstrDecls("gen-attr-constraint-decls",
+                       "Generate attribute constraint declarations",
+                       [](const RecordKeeper &records, raw_ostream &os) {
+                         emitAttrConstraintDecls(records, os);
+                         return false;
+                       });
+
 //===----------------------------------------------------------------------===//
 // TypeDef
 //===----------------------------------------------------------------------===//

From 0bb4d9c30207c4a69731e6848ba7cb6ef52b5906 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 00:21:13 -0700
Subject: [PATCH 562/851] ARM: Migrate to the new relocation specifier
 representation

Use MCSpecifierExpr directly and remove the ARMMCExpr subclass. Define
printImpl and evaluateAsRelocationImpl within ARM*MCAsmInfo classes.
While there is some duplication, it enables better separation for
object file formats.
---
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp         | 30 ++++---
 llvm/lib/Target/ARM/ARMMCInstLower.cpp        | 12 +--
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 17 ++--
 .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp  | 61 ++++++++++++++-
 .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.h    | 46 ++++++++++-
 .../ARM/MCTargetDesc/ARMMCCodeEmitter.cpp     |  2 +-
 .../lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp | 78 -------------------
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h  | 43 ----------
 .../MCTargetDesc/ARMMachORelocationInfo.cpp   |  6 +-
 .../Target/ARM/MCTargetDesc/CMakeLists.txt    |  1 -
 .../llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn |  1 -
 11 files changed, 142 insertions(+), 155 deletions(-)
 delete mode 100644 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
 delete mode 100644 llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h

diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index fef7a17ae0b63..fa14370025515 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1619,12 +1619,15 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
                     MI->getOperand(2).getImm(), OutContext);
     const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
     unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4;
-    const MCExpr *PCRelExpr =
-      ARMMCExpr::createLower16(MCBinaryExpr::createSub(GVSymExpr,
-                                      MCBinaryExpr::createAdd(LabelSymExpr,
-                                      MCConstantExpr::create(PCAdj, OutContext),
-                                      OutContext), OutContext), OutContext);
-      TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
+    const MCExpr *PCRelExpr = ARM::createLower16(
+        MCBinaryExpr::createSub(
+            GVSymExpr,
+            MCBinaryExpr::createAdd(LabelSymExpr,
+                                    MCConstantExpr::create(PCAdj, OutContext),
+                                    OutContext),
+            OutContext),
+        OutContext);
+    TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
 
     // Add predicate operands.
     TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
@@ -1652,12 +1655,15 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
                     MI->getOperand(3).getImm(), OutContext);
     const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext);
     unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4;
-    const MCExpr *PCRelExpr =
-        ARMMCExpr::createUpper16(MCBinaryExpr::createSub(GVSymExpr,
-                                   MCBinaryExpr::createAdd(LabelSymExpr,
-                                      MCConstantExpr::create(PCAdj, OutContext),
-                                          OutContext), OutContext), OutContext);
-      TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
+    const MCExpr *PCRelExpr = ARM::createUpper16(
+        MCBinaryExpr::createSub(
+            GVSymExpr,
+            MCBinaryExpr::createAdd(LabelSymExpr,
+                                    MCConstantExpr::create(PCAdj, OutContext),
+                                    OutContext),
+            OutContext),
+        OutContext);
+    TmpInst.addOperand(MCOperand::createExpr(PCRelExpr));
     // Add predicate operands.
     TmpInst.addOperand(MCOperand::createImm(ARMCC::AL));
     TmpInst.addOperand(MCOperand::createReg(0));
diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index b32de6b66058b..f5d6597f214dd 100644
--- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -49,27 +49,27 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
     break;
   case ARMII::MO_LO16:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createLower16(Expr, OutContext);
+    Expr = ARM::createLower16(Expr, OutContext);
     break;
   case ARMII::MO_HI16:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createUpper16(Expr, OutContext);
+    Expr = ARM::createUpper16(Expr, OutContext);
     break;
   case ARMII::MO_LO_0_7:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createLower0_7(Expr, OutContext);
+    Expr = ARM::createLower0_7(Expr, OutContext);
     break;
   case ARMII::MO_LO_8_15:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createLower8_15(Expr, OutContext);
+    Expr = ARM::createLower8_15(Expr, OutContext);
     break;
   case ARMII::MO_HI_0_7:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createUpper0_7(Expr, OutContext);
+    Expr = ARM::createUpper0_7(Expr, OutContext);
     break;
   case ARMII::MO_HI_8_15:
     Expr = MCSymbolRefExpr::create(Symbol, Specifier, OutContext);
-    Expr = ARMMCExpr::createUpper8_15(Expr, OutContext);
+    Expr = ARM::createUpper8_15(Expr, OutContext);
     break;
   }
 
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 6e9efe40dc54c..f3bdcd64805d8 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -454,7 +454,7 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool parseMemory(OperandVector &);
   bool parseOperand(OperandVector &, StringRef Mnemonic);
   bool parseImmExpr(int64_t &Out);
-  bool parsePrefix(ARMMCExpr::Specifier &);
+  bool parsePrefix(ARM::Specifier &);
   bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
                               unsigned &ShiftAmount);
   bool parseLiteralValues(unsigned Size, SMLoc L);
@@ -1326,7 +1326,7 @@ class ARMOperand : public MCParsedAsmOperand {
     if (isImm() && !isa<MCConstantExpr>(getImm())) {
       // We want to avoid matching :upper16: and :lower16: as we want these
       // expressions to match in isImm0_65535Expr()
-      const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(getImm());
+      auto *ARM16Expr = dyn_cast<MCSpecifierExpr>(getImm());
       return (!ARM16Expr || (ARM16Expr->getSpecifier() != ARM::S_HI16 &&
                              ARM16Expr->getSpecifier() != ARM::S_LO16));
     }
@@ -6424,7 +6424,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
     // ":upper8_15:", expression prefixes
     // FIXME: Check it's an expression prefix,
     // e.g. (FOO - :lower16:BAR) isn't legal.
-    ARMMCExpr::Specifier Spec;
+    ARM::Specifier Spec;
     if (parsePrefix(Spec))
       return true;
 
@@ -6432,7 +6432,8 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
     if (getParser().parseExpression(SubExprVal))
       return true;
 
-    const MCExpr *ExprVal = ARMMCExpr::create(Spec, SubExprVal, getContext());
+    const auto *ExprVal =
+        MCSpecifierExpr::create(SubExprVal, Spec, getContext());
     E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
     Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E, *this));
     return false;
@@ -6471,7 +6472,7 @@ bool ARMAsmParser::parseImmExpr(int64_t &Out) {
 // parsePrefix - Parse ARM 16-bit relocations expression prefixes, i.e.
 // :lower16: and :upper16: and Thumb 8-bit relocation expression prefixes, i.e.
 // :upper8_15:, :upper0_7:, :lower8_15: and :lower0_7:
-bool ARMAsmParser::parsePrefix(ARMMCExpr::Specifier &Spec) {
+bool ARMAsmParser::parsePrefix(ARM::Specifier &Spec) {
   MCAsmParser &Parser = getParser();
   Spec = ARM::S_None;
 
@@ -6495,7 +6496,7 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::Specifier &Spec) {
   };
   static const struct PrefixEntry {
     const char *Spelling;
-    ARMMCExpr::Specifier Spec;
+    ARM::Specifier Spec;
     uint8_t SupportedFormats;
   } PrefixEntries[] = {
       {"upper16", ARM::S_HI16, COFF | ELF | MACHO},
@@ -6879,7 +6880,7 @@ static bool isThumbI8Relocation(MCParsedAsmOperand &MCOp) {
   const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
   if (!E)
     return false;
-  const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
+  auto *ARM16Expr = dyn_cast<MCSpecifierExpr>(E);
   if (ARM16Expr && (ARM16Expr->getSpecifier() == ARM::S_HI_8_15 ||
                     ARM16Expr->getSpecifier() == ARM::S_HI_0_7 ||
                     ARM16Expr->getSpecifier() == ARM::S_LO_8_15 ||
@@ -8286,7 +8287,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     if (CE) break;
     const MCExpr *E = dyn_cast<MCExpr>(Op.getImm());
     if (!E) break;
-    const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
+    auto *ARM16Expr = dyn_cast<MCSpecifierExpr>(E);
     if (!ARM16Expr || (ARM16Expr->getSpecifier() != ARM::S_HI16 &&
                        ARM16Expr->getSpecifier() != ARM::S_LO16))
       return Error(
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index f8ec0237dcb59..a3d86f690e4a8 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -11,8 +11,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMMCAsmInfo.h"
-#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -153,3 +153,62 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
 
   initializeVariantKinds(variantKindDescs);
 }
+
+void ARM::printSpecifierExpr(const MCAsmInfo &MAI, raw_ostream &OS,
+                             const MCSpecifierExpr &Expr) {
+  switch (Expr.getSpecifier()) {
+  default:
+    llvm_unreachable("Invalid kind!");
+  case ARM::S_HI16:
+    OS << ":upper16:";
+    break;
+  case ARM::S_LO16:
+    OS << ":lower16:";
+    break;
+  case ARM::S_HI_8_15:
+    OS << ":upper8_15:";
+    break;
+  case ARM::S_HI_0_7:
+    OS << ":upper0_7:";
+    break;
+  case ARM::S_LO_8_15:
+    OS << ":lower8_15:";
+    break;
+  case ARM::S_LO_0_7:
+    OS << ":lower0_7:";
+    break;
+  }
+
+  const MCExpr *Sub = Expr.getSubExpr();
+  if (Sub->getKind() != MCExpr::SymbolRef)
+    OS << '(';
+  MAI.printExpr(OS, *Sub);
+  if (Sub->getKind() != MCExpr::SymbolRef)
+    OS << ')';
+}
+
+const MCSpecifierExpr *ARM::createUpper16(const MCExpr *Expr, MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_HI16, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createLower16(const MCExpr *Expr, MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_LO16, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createUpper8_15(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_HI_8_15, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createUpper0_7(const MCExpr *Expr, MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_HI_0_7, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createLower8_15(const MCExpr *Expr,
+                                            MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_LO_8_15, Ctx);
+}
+
+const MCSpecifierExpr *ARM::createLower0_7(const MCExpr *Expr, MCContext &Ctx) {
+  return MCSpecifierExpr::create(Expr, ARM::S_LO_0_7, Ctx);
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index baadf74e0d5a5..f3f075e99d961 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 #define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCASMINFO_H
 
-#include "MCTargetDesc/ARMMCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -22,11 +21,24 @@
 namespace llvm {
 class Triple;
 
+namespace ARM {
+void printSpecifierExpr(const MCAsmInfo &MAI, raw_ostream &OS,
+                        const MCSpecifierExpr &Expr);
+}
+
 class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
   virtual void anchor();
 
 public:
   explicit ARMMCAsmInfoDarwin(const Triple &TheTriple);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override {
+    ARM::printSpecifierExpr(*this, OS, Expr);
+  }
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &,
+                                 const MCAssembler *) const override {
+    return false;
+  }
 };
 
 class ARMELFMCAsmInfo : public MCAsmInfoELF {
@@ -36,6 +48,14 @@ class ARMELFMCAsmInfo : public MCAsmInfoELF {
   explicit ARMELFMCAsmInfo(const Triple &TT);
 
   void setUseIntegratedAssembler(bool Value) override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override {
+    ARM::printSpecifierExpr(*this, OS, Expr);
+  }
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &,
+                                 const MCAssembler *) const override {
+    return false;
+  }
 };
 
 class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
@@ -43,6 +63,14 @@ class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
 
 public:
   explicit ARMCOFFMCAsmInfoMicrosoft();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override {
+    ARM::printSpecifierExpr(*this, OS, Expr);
+  }
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &,
+                                 const MCAssembler *) const override {
+    return false;
+  }
 };
 
 class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
@@ -50,9 +78,18 @@ class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
 
 public:
   explicit ARMCOFFMCAsmInfoGNU();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override {
+    ARM::printSpecifierExpr(*this, OS, Expr);
+  }
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &,
+                                 const MCAssembler *) const override {
+    return false;
+  }
 };
 
 namespace ARM {
+using Specifier = uint16_t;
 enum {
   S_None,
   S_HI16 =
@@ -93,6 +130,13 @@ enum {
   S_TLSLDO,
   S_TPOFF,
 };
+
+const MCSpecifierExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createLower16(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createUpper8_15(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createUpper0_7(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createLower8_15(const MCExpr *Expr, MCContext &Ctx);
+const MCSpecifierExpr *createLower0_7(const MCExpr *Expr, MCContext &Ctx);
 }
 
 } // namespace llvm
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index f006e00ada328..fba32eae4dfa8 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1192,7 +1192,7 @@ uint32_t ARMMCCodeEmitter::getHiLoImmOpValue(const MCInst &MI, unsigned OpIdx,
   const MCExpr *E = MO.getExpr();
   MCFixupKind Kind;
   if (E->getKind() == MCExpr::Specifier) {
-    const ARMMCExpr *ARM16Expr = cast<ARMMCExpr>(E);
+    auto *ARM16Expr = cast<MCSpecifierExpr>(E);
     E = ARM16Expr->getSubExpr();
 
     if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(E)) {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
deleted file mode 100644
index 1e6760a57608a..0000000000000
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-//===-- ARMMCExpr.cpp - ARM specific MC expression classes ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARMMCExpr.h"
-#include "ARMMCAsmInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "armmcexpr"
-
-const ARMMCExpr *ARMMCExpr::create(Specifier S, const MCExpr *Expr,
-                                   MCContext &Ctx) {
-  return new (Ctx) ARMMCExpr(S, Expr);
-}
-
-void ARMMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  switch (specifier) {
-  default: llvm_unreachable("Invalid kind!");
-  case ARM::S_HI16:
-    OS << ":upper16:";
-    break;
-  case ARM::S_LO16:
-    OS << ":lower16:";
-    break;
-  case ARM::S_HI_8_15:
-    OS << ":upper8_15:";
-    break;
-  case ARM::S_HI_0_7:
-    OS << ":upper0_7:";
-    break;
-  case ARM::S_LO_8_15:
-    OS << ":lower8_15:";
-    break;
-  case ARM::S_LO_0_7:
-    OS << ":lower0_7:";
-    break;
-  }
-
-  const MCExpr *Expr = getSubExpr();
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    OS << '(';
-  MAI->printExpr(OS, *Expr);
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    OS << ')';
-}
-
-const ARMMCExpr *ARMMCExpr::createUpper16(const MCExpr *Expr, MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_HI16, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createLower16(const MCExpr *Expr, MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_LO16, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createUpper8_15(const MCExpr *Expr,
-                                            MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_HI_8_15, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createUpper0_7(const MCExpr *Expr, MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_HI_0_7, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createLower8_15(const MCExpr *Expr,
-                                            MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_LO_8_15, Expr, Ctx);
-}
-
-const ARMMCExpr *ARMMCExpr::createLower0_7(const MCExpr *Expr, MCContext &Ctx) {
-  return ARMMCExpr::create(ARM::S_LO_0_7, Expr, Ctx);
-}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
deleted file mode 100644
index f29d05ba2a88d..0000000000000
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//===-- ARMMCExpr.h - ARM specific MC expression classes --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
-#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-
-namespace llvm {
-
-class ARMMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = uint16_t;
-
-private:
-  explicit ARMMCExpr(Specifier S, const MCExpr *Expr)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const ARMMCExpr *create(Specifier S, const MCExpr *Expr,
-                                 MCContext &Ctx);
-
-  static const ARMMCExpr *createUpper16(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createLower16(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createUpper8_15(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createUpper0_7(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createLower8_15(const MCExpr *Expr, MCContext &Ctx);
-  static const ARMMCExpr *createLower0_7(const MCExpr *Expr, MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override {
-    return false;
-  }
-};
-} // end namespace llvm
-
-#endif
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 886b7e7bc84e8..72d9379f50384 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ARMMCExpr.h"
+#include "ARMMCAsmInfo.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm-c/Disassembler.h"
 #include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
@@ -24,9 +24,9 @@ class ARMMachORelocationInfo : public MCRelocationInfo {
                                              unsigned VariantKind) override {
     switch(VariantKind) {
     case LLVMDisassembler_VariantKind_ARM_HI16:
-      return ARMMCExpr::createUpper16(SubExpr, Ctx);
+      return ARM::createUpper16(SubExpr, Ctx);
     case LLVMDisassembler_VariantKind_ARM_LO16:
-      return ARMMCExpr::createLower16(SubExpr, Ctx);
+      return ARM::createLower16(SubExpr, Ctx);
     default:
       return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr,
                                                             VariantKind);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index 8b3ef0ee651e5..977f8bf5548fd 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -7,7 +7,6 @@ add_llvm_component_library(LLVMARMDesc
   ARMMachORelocationInfo.cpp
   ARMMCAsmInfo.cpp
   ARMMCCodeEmitter.cpp
-  ARMMCExpr.cpp
   ARMMCTargetDesc.cpp
   ARMTargetStreamer.cpp
   ARMUnwindOpAsm.cpp
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn
index 981639faf71d3..698607f3a2267 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/ARM/MCTargetDesc/BUILD.gn
@@ -67,7 +67,6 @@ static_library("MCTargetDesc") {
     "ARMInstPrinter.cpp",
     "ARMMCAsmInfo.cpp",
     "ARMMCCodeEmitter.cpp",
-    "ARMMCExpr.cpp",
     "ARMMCTargetDesc.cpp",
     "ARMMachORelocationInfo.cpp",
     "ARMMachObjectWriter.cpp",

From ee2d7a6975f37c11bffbf3207879696aca7fcc65 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 00:35:18 -0700
Subject: [PATCH 563/851] MIPS: Remove unneeded printImpl

Follow-up to 05a9ad977624c4f6def7c0f4cf7103e28d6c6541
---
 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 7 -------
 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h   | 2 --
 2 files changed, 9 deletions(-)

diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 280d944f2fbb3..821f662f0cbfb 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -37,10 +37,3 @@ const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
   return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx),
                 Ctx);
 }
-
-void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  if (MAI)
-    MAI->printExpr(OS, *this);
-  else // llc -asm-show-inst
-    MipsELFMCAsmInfo(Triple(), MCTargetOptions()).printExpr(OS, *this);
-}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 91ec094821857..b78aeabb57992 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -29,8 +29,6 @@ class MipsMCExpr : public MCSpecifierExpr {
                                   MCContext &Ctx);
   static const MipsMCExpr *createGpOff(Specifier S, const MCExpr *Expr,
                                        MCContext &Ctx);
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
 };
 
 } // end namespace llvm

From 4e0dd007ac6a7b7e0a284062b61c6d22250337df Mon Sep 17 00:00:00 2001
From: mayanksolanki393 <mayanksolanki393@gmail.com>
Date: Mon, 16 Jun 2025 13:16:52 +0530
Subject: [PATCH 564/851] [InstCombine] Combine trunc (lshr X, BW-1) to i1 -->
 icmp slt X, 0 (#142593) (#143846)

Fixes #142593, the issue was fixed using the suggestion on the ticket
itself.

Godbolt: https://godbolt.org/z/oW5b74jc4
alive2 proof: https://alive2.llvm.org/ce/z/QHnD7e
---
 .../InstCombine/InstCombineCasts.cpp          |  6 ++
 .../Transforms/InstCombine/logical-select.ll  |  4 +-
 .../test/Transforms/InstCombine/trunc-lshr.ll | 95 +++++++++++++++++++
 3 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/trunc-lshr.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index d4a2fe5e37ef5..033ef8be700eb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -815,6 +815,12 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
       return new ICmpInst(ICmpInst::ICMP_EQ, X, CmpC);
     }
 
+    if (match(Src, m_Shr(m_Value(X), m_SpecificInt(SrcWidth - 1)))) {
+      // trunc (ashr X, BW-1) to i1 --> icmp slt X, 0
+      // trunc (lshr X, BW-1) to i1 --> icmp slt X, 0
+      return new ICmpInst(ICmpInst::ICMP_SLT, X, Zero);
+    }
+
     Constant *C;
     if (match(Src, m_OneUse(m_LShr(m_Value(X), m_ImmConstant(C))))) {
       // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll
index 050a53406a9c5..87e05002665ce 100644
--- a/llvm/test/Transforms/InstCombine/logical-select.ll
+++ b/llvm/test/Transforms/InstCombine/logical-select.ll
@@ -807,9 +807,9 @@ define <2 x i16> @bitcast_vec_cond_commute3(<4 x i8> %cond, <2 x i16> %pc, <2 x
 ; CHECK-LABEL: @bitcast_vec_cond_commute3(
 ; CHECK-NEXT:    [[C:%.*]] = mul <2 x i16> [[PC:%.*]], [[PC]]
 ; CHECK-NEXT:    [[D:%.*]] = mul <2 x i16> [[PD:%.*]], [[PD]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = icmp slt <4 x i8> [[COND:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[D]] to <4 x i8>
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i16> [[C]] to <4 x i8>
-; CHECK-NEXT:    [[DOTNOT2:%.*]] = icmp slt <4 x i8> [[COND:%.*]], zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = select <4 x i1> [[DOTNOT2]], <4 x i8> [[TMP1]], <4 x i8> [[TMP2]]
 ; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i8> [[TMP3]] to <2 x i16>
 ; CHECK-NEXT:    ret <2 x i16> [[R]]
@@ -1069,8 +1069,8 @@ define <2 x i1> @not_d_bools_vector_poison(<2 x i1> %c, <2 x i1> %x, <2 x i1> %y
 
 define i32 @not_d_allSignBits(i32 %cond, i32 %tval, i32 %fval) {
 ; CHECK-LABEL: @not_d_allSignBits(
-; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[FVAL:%.*]], -1
 ; CHECK-NEXT:    [[DOTNOT2:%.*]] = icmp slt i32 [[COND:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[FVAL:%.*]], -1
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[DOTNOT2]], i32 [[TVAL:%.*]], i32 [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/trunc-lshr.ll b/llvm/test/Transforms/InstCombine/trunc-lshr.ll
new file mode 100644
index 0000000000000..4364b09cfa709
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/trunc-lshr.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define i1 @test1(i32 %i, ptr %p) {
+; CHECK-LABEL: define i1 @test1(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = lshr i32 [[I]], 31
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    ret i1 false
+;
+  %lobit = lshr i32 %i, 31
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  ret i1 %op
+}
+
+define i1 @test2(i32 %i, ptr %p) {
+; CHECK-LABEL: define i1 @test2(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = ashr i32 [[I]], 31
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    ret i1 false
+;
+  %lobit = ashr i32 %i, 31
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  ret i1 %op
+}
+
+define i1 @test3(i32 %i, ptr %p, ptr %q) {
+; CHECK-LABEL: define i1 @test3(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]], ptr [[Q:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = lshr i32 [[I]], 31
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[Q]], align 1
+; CHECK-NEXT:    ret i1 false
+;
+  %lobit = lshr i32 %i, 31
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  store i32 %lobit, ptr %q, align 1
+  ret i1 %op
+}
+
+; Negative Test
+define i1 @test4(i32 %i, ptr %p) {
+; CHECK-LABEL: define i1 @test4(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = lshr i32 [[I]], 30
+; CHECK-NEXT:    [[T:%.*]] = trunc nuw i32 [[DOTLOBIT]] to i1
+; CHECK-NEXT:    [[B:%.*]] = icmp slt i32 [[I]], 0
+; CHECK-NEXT:    [[NOT_:%.*]] = xor i1 [[T]], true
+; CHECK-NEXT:    [[COMMON_RET1_OP:%.*]] = select i1 [[NOT_]], i1 [[B]], i1 false
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    ret i1 [[COMMON_RET1_OP]]
+;
+  %lobit = lshr i32 %i, 30 ; should not fold as no. of bits shifted < BitWidth - 1
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  ret i1 %op
+}
+
+; Negative Test
+define i1 @test5(i32 %i, ptr %p) {
+; CHECK-LABEL: define i1 @test5(
+; CHECK-SAME: i32 [[I:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[DOTLOBIT:%.*]] = ashr i32 [[I]], 30
+; CHECK-NEXT:    [[T:%.*]] = trunc nuw i32 [[DOTLOBIT]] to i1
+; CHECK-NEXT:    [[B:%.*]] = icmp slt i32 [[I]], 0
+; CHECK-NEXT:    [[NOT_:%.*]] = xor i1 [[T]], true
+; CHECK-NEXT:    [[COMMON_RET1_OP:%.*]] = select i1 [[NOT_]], i1 [[B]], i1 false
+; CHECK-NEXT:    store i32 [[DOTLOBIT]], ptr [[P]], align 1
+; CHECK-NEXT:    ret i1 [[COMMON_RET1_OP]]
+;
+  %lobit = ashr i32 %i, 30 ; should not fold as no. of bits shifted < BitWidth - 1
+  %t = trunc nuw i32 %lobit to i1
+  %b = icmp slt i32 %i, 0
+  %not = xor i1 %t, true
+  %op = select i1 %not, i1 %b, i1 false
+  store i32 %lobit, ptr %p, align 1
+  ret i1 %op
+}
+

From e61405033bbaec3604c79a0b323a3e21efc720bc Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva <mariya.podchishchaeva@intel.com>
Date: Mon, 16 Jun 2025 09:55:22 +0200
Subject: [PATCH 565/851] [clang] Fix -fclang-abi-compat for clang 20 (#144109)

The value was known already, but it was parsed as latest which is
incorrect because we are already doing clang 21.
---
 clang/lib/Frontend/CompilerInvocation.cpp | 2 ++
 clang/test/CodeGen/X86/avx-cxx-record.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index dd021ad2e441b..5c52dc33ddf6c 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -4475,6 +4475,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
         Opts.setClangABICompat(LangOptions::ClangABI::Ver18);
       else if (Major <= 19)
         Opts.setClangABICompat(LangOptions::ClangABI::Ver19);
+      else if (Major <= 20)
+        Opts.setClangABICompat(LangOptions::ClangABI::Ver20);
     } else if (Ver != "latest") {
       Diags.Report(diag::err_drv_invalid_value)
           << A->getAsString(Args) << A->getValue();
diff --git a/clang/test/CodeGen/X86/avx-cxx-record.cpp b/clang/test/CodeGen/X86/avx-cxx-record.cpp
index bcd9c361fda90..6ce6815a521a1 100644
--- a/clang/test/CodeGen/X86/avx-cxx-record.cpp
+++ b/clang/test/CodeGen/X86/avx-cxx-record.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -O2 -target-cpu x86-64-v3 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -O2 -target-cpu x86-64-v3 -fclang-abi-compat=20 -o - | FileCheck --check-prefix CLANG-20 %s
 
 using UInt64x2 = unsigned long long __attribute__((__vector_size__(16), may_alias));
 
@@ -11,6 +12,7 @@ struct XMM2 : XMM1<0>, XMM1<1> {
 };
 
 // CHECK: define{{.*}} @_Z3foov({{.*}} [[ARG:%.*]]){{.*}}
+// CLANG-20: define{{.*}} <4 x double> @_Z3foov()
 // CHECK: entry:
 // CHECK-NEXT: store {{.*}}, ptr [[ARG]]{{.*}}
 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr {{.*}}, ptr [[ARG]]{{.*}}

From fbade95ebf2bc959fada5206e47f792a2090d72e Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Mon, 16 Jun 2025 08:55:46 +0100
Subject: [PATCH 566/851] [LV] Strip unnecessary make_{pair,optional} (NFC)
 (#141924)

---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 62 ++++++++-----------
 1 file changed, 27 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index eb04e2d5ca7b4..34f49a7721a30 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1080,7 +1080,7 @@ class LoopVectorizationCostModel {
   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                            InstructionCost Cost) {
     assert(VF.isVector() && "Expected VF >=2");
-    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+    WideningDecisions[{I, VF}] = {W, Cost};
   }
 
   /// Save vectorization decision \p W and \p Cost taken by the cost model for
@@ -1102,11 +1102,9 @@ class LoopVectorizationCostModel {
     for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
       if (auto *I = Grp->getMember(Idx)) {
         if (Grp->getInsertPos() == I)
-          WideningDecisions[std::make_pair(I, VF)] =
-              std::make_pair(W, InsertPosCost);
+          WideningDecisions[{I, VF}] = {W, InsertPosCost};
         else
-          WideningDecisions[std::make_pair(I, VF)] =
-              std::make_pair(W, OtherMemberCost);
+          WideningDecisions[{I, VF}] = {W, OtherMemberCost};
       }
     }
   }
@@ -1120,7 +1118,7 @@ class LoopVectorizationCostModel {
         TheLoop->isInnermost() &&
         "cost-model should not be used for outer loops (in VPlan-native path)");
 
-    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
+    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
     auto Itr = WideningDecisions.find(InstOnVF);
     if (Itr == WideningDecisions.end())
       return CM_Unknown;
@@ -1131,7 +1129,7 @@ class LoopVectorizationCostModel {
   /// width \p VF.
   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
     assert(VF.isVector() && "Expected VF >=2");
-    std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
+    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
     assert(WideningDecisions.contains(InstOnVF) &&
            "The cost is not calculated");
     return WideningDecisions[InstOnVF].second;
@@ -1150,8 +1148,7 @@ class LoopVectorizationCostModel {
                                std::optional<unsigned> MaskPos,
                                InstructionCost Cost) {
     assert(!VF.isScalar() && "Expected vector VF");
-    CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
-                                                     MaskPos, Cost};
+    CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
   }
 
   CallWideningDecision getCallWideningDecision(CallInst *CI,
@@ -1348,21 +1345,20 @@ class LoopVectorizationCostModel {
   void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
     assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
     if (!Legal->canFoldTailByMasking()) {
-      ChosenTailFoldingStyle =
-          std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+      ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
       return;
     }
 
     if (!ForceTailFoldingStyle.getNumOccurrences()) {
-      ChosenTailFoldingStyle = std::make_pair(
+      ChosenTailFoldingStyle = {
           TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
-          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
+          TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
       return;
     }
 
     // Set styles when forced.
-    ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(),
-                                            ForceTailFoldingStyle.getValue());
+    ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
+                              ForceTailFoldingStyle.getValue()};
     if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
       return;
     // Override forced styles if needed.
@@ -1375,9 +1371,8 @@ class LoopVectorizationCostModel {
       // If for some reason EVL mode is unsupported, fallback to
       // DataWithoutLaneMask to try to vectorize the loop with folded tail
       // in a generic way.
-      ChosenTailFoldingStyle =
-          std::make_pair(TailFoldingStyle::DataWithoutLaneMask,
-                         TailFoldingStyle::DataWithoutLaneMask);
+      ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
+                                TailFoldingStyle::DataWithoutLaneMask};
       LLVM_DEBUG(
           dbgs()
           << "LV: Preference for VP intrinsics indicated. Will "
@@ -8138,7 +8133,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
     PartialReductionChain Chain = Pair.first;
     if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
         ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
-      ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
+      ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
   }
 }
 
@@ -8210,12 +8205,11 @@ bool VPRecipeBuilder::getScaledReductions(
           [&](ElementCount VF) {
             InstructionCost Cost = TTI->getPartialReductionCost(
                 Update->getOpcode(), A->getType(), B->getType(), PHI->getType(),
-                VF, OpAExtend, OpBExtend,
-                std::make_optional(BinOp->getOpcode()));
+                VF, OpAExtend, OpBExtend, BinOp->getOpcode());
             return Cost.isValid();
           },
           Range)) {
-    Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
+    Chains.emplace_back(Chain, TargetScaleFactor);
     return true;
   }
 
@@ -10108,9 +10102,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   bool VectorizeLoop = true, InterleaveLoop = true;
   if (VF.Width.isScalar()) {
     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-    VecDiagMsg = std::make_pair(
+    VecDiagMsg = {
         "VectorizationNotBeneficial",
-        "the cost-model indicates that vectorization is not beneficial");
+        "the cost-model indicates that vectorization is not beneficial"};
     VectorizeLoop = false;
   }
 
@@ -10119,16 +10113,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // requested.
     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                          "interleaving should be avoided up front\n");
-    IntDiagMsg = std::make_pair(
-        "InterleavingAvoided",
-        "Ignoring UserIC, because interleaving was avoided up front");
+    IntDiagMsg = {"InterleavingAvoided",
+                  "Ignoring UserIC, because interleaving was avoided up front"};
     InterleaveLoop = false;
   } else if (IC == 1 && UserIC <= 1) {
     // Tell the user interleaving is not beneficial.
     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
-    IntDiagMsg = std::make_pair(
+    IntDiagMsg = {
         "InterleavingNotBeneficial",
-        "the cost-model indicates that interleaving is not beneficial");
+        "the cost-model indicates that interleaving is not beneficial"};
     InterleaveLoop = false;
     if (UserIC == 1) {
       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
@@ -10139,10 +10132,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     // Tell the user interleaving is beneficial, but it explicitly disabled.
     LLVM_DEBUG(
         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
-    IntDiagMsg = std::make_pair(
-        "InterleavingBeneficialButDisabled",
-        "the cost-model indicates that interleaving is beneficial "
-        "but is explicitly disabled or interleave count is set to 1");
+    IntDiagMsg = {"InterleavingBeneficialButDisabled",
+                  "the cost-model indicates that interleaving is beneficial "
+                  "but is explicitly disabled or interleave count is set to 1"};
     InterleaveLoop = false;
   }
 
@@ -10152,10 +10144,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
     LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
                       << "to histogram operations.\n");
-    IntDiagMsg = std::make_pair(
+    IntDiagMsg = {
         "HistogramPreventsScalarInterleaving",
         "Unable to interleave without vectorization due to constraints on "
-        "the order of histogram operations");
+        "the order of histogram operations"};
     InterleaveLoop = false;
   }
 

From cca454b54c7d58930e261c7fa72f44a1a8976997 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj@nvidia.com>
Date: Mon, 16 Jun 2025 09:12:42 +0100
Subject: [PATCH 567/851] [ValueTracking] Remove opcode whitelist from
 matchSimpleRecurrence. (#144031)

This also patches HashRecognize to avoid it mishandling some opcodes.
---
 llvm/lib/Analysis/HashRecognize.cpp           | 11 +++--
 llvm/lib/Analysis/ValueTracking.cpp           | 40 +++++--------------
 .../HashRecognize/cyclic-redundancy-check.ll  | 30 +++++++++++++-
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp
index b245548dea6d5..1edb8b3bdc9a8 100644
--- a/llvm/lib/Analysis/HashRecognize.cpp
+++ b/llvm/lib/Analysis/HashRecognize.cpp
@@ -542,7 +542,11 @@ static bool arePHIsIntertwined(
 // doing this, we're immune to whether the IR expression is mul/udiv or
 // equivalently shl/lshr. Return false when it is a UDiv, true when it is a Mul,
 // and std::nullopt otherwise.
-static std::optional<bool> isBigEndianBitShift(const SCEV *E) {
+static std::optional<bool> isBigEndianBitShift(Value *V, ScalarEvolution &SE) {
+  if (!V->getType()->isIntegerTy())
+    return {};
+
+  const SCEV *E = SE.getSCEV(V);
   if (match(E, m_scev_UDiv(m_SCEV(), m_scev_SpecificInt(2))))
     return false;
   if (match(E, m_scev_Mul(m_scev_SpecificInt(2), m_SCEV())))
@@ -576,12 +580,11 @@ HashRecognize::recognizeCRC() const {
   // Make sure that all recurrences are either all SCEVMul with two or SCEVDiv
   // with two, or in other words, that they're single bit-shifts.
   std::optional<bool> ByteOrderSwapped =
-      isBigEndianBitShift(SE.getSCEV(ConditionalRecurrence.BO));
+      isBigEndianBitShift(ConditionalRecurrence.BO, SE);
   if (!ByteOrderSwapped)
     return "Loop with non-unit bitshifts";
   if (SimpleRecurrence) {
-    if (isBigEndianBitShift(SE.getSCEV(SimpleRecurrence.BO)) !=
-        ByteOrderSwapped)
+    if (isBigEndianBitShift(SimpleRecurrence.BO, SE) != ByteOrderSwapped)
       return "Loop with non-unit bitshifts";
     if (!arePHIsIntertwined(SimpleRecurrence.Phi, ConditionalRecurrence.Phi, L,
                             Instruction::BinaryOps::Xor))
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index e7a1f07c0270d..d39efb2859747 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9071,6 +9071,7 @@ bool llvm::matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO,
   // Handle the case of a simple two-predecessor recurrence PHI.
   // There's a lot more that could theoretically be done here, but
   // this is sufficient to catch some interesting cases.
+  // TODO: Expand list -- gep, uadd.sat etc.
   if (P->getNumIncomingValues() != 2)
     return false;
 
@@ -9081,35 +9082,16 @@ bool llvm::matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO,
     if (!LU)
       continue;
     unsigned Opcode = LU->getOpcode();
-
-    switch (Opcode) {
-    default:
-      continue;
-    // TODO: Expand list -- xor, gep, uadd.sat etc.
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::Shl:
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::UDiv:
-    case Instruction::URem:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Mul:
-    case Instruction::FMul: {
-      Value *LL = LU->getOperand(0);
-      Value *LR = LU->getOperand(1);
-      // Find a recurrence.
-      if (LL == P)
-        L = LR;
-      else if (LR == P)
-        L = LL;
-      else
-        continue; // Check for recurrence with L and R flipped.
-
-      break; // Match!
-    }
-    };
+    Value *LL = LU->getOperand(0);
+    Value *LR = LU->getOperand(1);
+
+    // Find a recurrence.
+    if (LL == P)
+      L = LR;
+    else if (LR == P)
+      L = LL;
+    else
+      continue; // Check for recurrence with L and R flipped.
 
     // We have matched a recurrence of the form:
     //   %iv = [R, %entry], [%iv.next, %backedge]
diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
index 3e05a9b5c8499..7a3082056ad29 100644
--- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
+++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll
@@ -873,7 +873,7 @@ exit:                                              ; preds = %loop
 define i16 @not.crc.float.simple.recurrence(float %msg, i16 %checksum) {
 ; CHECK-LABEL: 'not.crc.float.simple.recurrence'
 ; CHECK-NEXT:  Did not find a hash algorithm
-; CHECK-NEXT:  Reason: Found stray PHI
+; CHECK-NEXT:  Reason: Loop with non-unit bitshifts
 ;
 entry:
   br label %loop
@@ -897,3 +897,31 @@ loop:                                              ; preds = %loop, %entry
 exit:                                              ; preds = %loop
   ret i16 %crc.next
 }
+
+define i16 @not.crc.stray.phi(i8 %msg, i16 %checksum, i1 %c) {
+; CHECK-LABEL: 'not.crc.stray.phi'
+; CHECK-NEXT:  Did not find a hash algorithm
+; CHECK-NEXT:  Reason: Found stray PHI
+;
+entry:
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ]
+  %crc = phi i16 [ %checksum, %entry ], [ %crc.next, %loop ]
+  %data = phi i8 [ %msg, %entry ], [ %data.next, %loop ]
+  %crc.trunc = trunc i16 %crc to i8
+  %xor.data.crc = xor i8 %data, %crc.trunc
+  %and.data.crc = and i8 %xor.data.crc, 1
+  %data.next = select i1 %c, i8 %data, i8 1
+  %check.sb = icmp eq i8 %and.data.crc, 0
+  %crc.lshr = lshr i16 %crc, 1
+  %xor = xor i16 %crc.lshr, -24575
+  %crc.next = select i1 %check.sb, i16 %crc.lshr, i16 %xor
+  %iv.next = add nuw nsw i8 %iv, 1
+  %exit.cond = icmp samesign ult i8 %iv, 7
+  br i1 %exit.cond, label %loop, label %exit
+
+exit:                                              ; preds = %loop
+  ret i16 %crc.next
+}

From 0952992ac6e1470d9f776a99c5793745a6b58d98 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mpeis@arm.com>
Date: Mon, 16 Jun 2025 09:42:59 +0100
Subject: [PATCH 568/851] [BOLT] Fix LLVM_APPEND_VC_REV support (#142410)

The CMake flag LLVM_APPEND_VC_REV can be passed when building BOLT a
BOLT to prevent including a VC Revision. This patch enables this
functionality.

Usage: `-DLLVM_APPEND_VC_REV=OFF` when running CMake.
---
 bolt/lib/Utils/CMakeLists.txt | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/bolt/lib/Utils/CMakeLists.txt b/bolt/lib/Utils/CMakeLists.txt
index efba6d54449d3..94933644ef5ef 100644
--- a/bolt/lib/Utils/CMakeLists.txt
+++ b/bolt/lib/Utils/CMakeLists.txt
@@ -6,12 +6,25 @@ set(version_inc "${CMAKE_CURRENT_BINARY_DIR}/VCSVersion.inc")
 
 set(generate_vcs_version_script "${LLVM_CMAKE_DIR}/GenerateVersionFromVCS.cmake")
 
+if(llvm_vc AND LLVM_APPEND_VC_REV)
+  set(llvm_source_dir ${LLVM_MAIN_SRC_DIR})
+endif()
+if(LLVM_VC_REPOSITORY AND LLVM_VC_REVISION)
+  set(llvm_source_dir ${LLVM_SOURCE_DIR})
+  set(llvm_vc_repository ${LLVM_VC_REPOSITORY})
+  set(llvm_vc_revision ${LLVM_VC_REVISION})
+endif()
+if(bolt_vc AND LLVM_APPEND_VC_REV)
+  set(bolt_source_dir ${BOLT_SOURCE_DIR})
+endif()
+
 # Create custom target to generate the VC revision include.
 add_custom_command(OUTPUT "${version_inc}"
   DEPENDS "${llvm_vc}" "${bolt_vc}" "${generate_vcs_version_script}"
   COMMAND ${CMAKE_COMMAND} "-DNAMES=BOLT"
+                           "-DLLVM_SOURCE_DIR=${llvm_source_dir}"
+                           "-DBOLT_SOURCE_DIR=${bolt_source_dir}"
                            "-DHEADER_FILE=${version_inc}"
-                           "-DBOLT_SOURCE_DIR=${BOLT_SOURCE_DIR}"
                            "-DLLVM_VC_REPOSITORY=${llvm_vc_repository}"
                            "-DLLVM_VC_REVISION=${llvm_vc_revision}"
                            "-DLLVM_FORCE_VC_REVISION=${LLVM_FORCE_VC_REVISION}"

From 383b3268794da1ca763deb91cec777742e6e54a8 Mon Sep 17 00:00:00 2001
From: Javier Lopez-Gomez <javier.lopez.gomez@proton.me>
Date: Mon, 16 Jun 2025 10:47:00 +0200
Subject: [PATCH 569/851] [llvm-debuginfo-analyzer] Fix ODR violation in
 llvm::logicalview::LVObject (#140265)

Some data members are only part of a class definition in a Debug build,
e.g. `LVObject::ID`. If `debuginfologicalview` is used as a library,
`NDEBUG` cannot be used for this purpose, as this PP macro may have a
different definition in a downstream project, which in turn triggers an
ODR violation. Fix it by
- Making `LVObject::ID` an unconditional data member.
- Making `LVObject::dump()` non-virtual. Rationale: `virtual` is not
needed (and it calls `print()`, which is virtual anyway).

Fixes #139098.
---
 .../CommandGuide/llvm-debuginfo-analyzer.rst  |  3 +-
 .../llvm/DebugInfo/LogicalView/Core/LVLine.h  |  4 --
 .../DebugInfo/LogicalView/Core/LVLocation.h   |  6 +--
 .../DebugInfo/LogicalView/Core/LVObject.h     | 39 ++++++-------------
 .../llvm/DebugInfo/LogicalView/Core/LVRange.h |  4 --
 .../llvm/DebugInfo/LogicalView/Core/LVScope.h |  4 --
 .../DebugInfo/LogicalView/Core/LVSymbol.h     |  4 --
 .../llvm/DebugInfo/LogicalView/Core/LVType.h  |  4 --
 .../DebugInfo/LogicalView/Core/LVObject.cpp   |  6 +--
 .../DebugInfo/LogicalView/Core/LVOptions.cpp  |  2 -
 10 files changed, 15 insertions(+), 61 deletions(-)

diff --git a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
index 453af0751e2a1..1264f80206618 100644
--- a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
+++ b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst
@@ -676,8 +676,7 @@ INTERNAL
  Typically these kind of options are available only in *debug* builds.
 
  :program:`llvm-debuginfo-analyzer` supports these advanced options in
- both *release* and *debug* builds, with the exception of the unique ID
- that is generated only in *debug* builds.
+ both *release* and *debug* builds.
 
 .. option:: --internal=<value[,value,...]>
 
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLine.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLine.h
index c979dc4a6be2e..3618ce7b0ecda 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLine.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLine.h
@@ -105,10 +105,6 @@ class LLVM_ABI LVLine : public LVElement {
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 // Class to represent a DWARF line record object.
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLocation.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLocation.h
index 7b466ae206e4e..0718e33f5645b 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLocation.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVLocation.h
@@ -51,7 +51,7 @@ class LVOperation final {
   LLVM_ABI void print(raw_ostream &OS, bool Full = true) const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() { print(dbgs()); }
+  void dump() const { print(dbgs()); }
 #endif
 };
 
@@ -159,10 +159,6 @@ class LLVM_ABI LVLocation : public LVObject {
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 class LLVM_ABI LVLocationSymbol final : public LVLocation {
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h
index ec02120e69b73..be64cdaea3d78 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h
@@ -36,7 +36,7 @@ namespace logicalview {
 using LVSectionIndex = uint64_t;
 using LVAddress = uint64_t;
 using LVHalf = uint16_t;
-using LVLevel = uint32_t;
+using LVLevel = uint16_t;
 using LVOffset = uint64_t;
 using LVSigned = int64_t;
 using LVUnsigned = uint64_t;
@@ -129,8 +129,6 @@ class LLVM_ABI LVObject {
     HasCodeViewLocation, // CodeView object with debug location.
     LastEntry
   };
-  // Typed bitvector with properties for this object.
-  LVProperties<Property> Properties;
 
   LVOffset Offset = 0;
   uint32_t LineNumber = 0;
@@ -140,6 +138,14 @@ class LLVM_ABI LVObject {
     dwarf::Attribute Attr;
     LVSmall Opcode;
   } TagAttrOpcode = {dwarf::DW_TAG_null};
+  // Typed bitvector with properties for this object.
+  LVProperties<Property> Properties;
+
+  // This is an internal ID used for debugging logical elements. It is used
+  // for cases where an unique offset within the binary input file is not
+  // available.
+  static uint32_t GID;
+  uint32_t ID = 0;
 
   // The parent of this object (nullptr if the root scope). For locations,
   // the parent is a symbol object; otherwise it is a scope object.
@@ -155,9 +161,7 @@ class LLVM_ABI LVObject {
   // copy constructor to create that object; it is used to print a reference
   // to another object and in the case of templates, to print its encoded args.
   LVObject(const LVObject &Object) {
-#ifndef NDEBUG
     incID();
-#endif
     Properties = Object.Properties;
     Offset = Object.Offset;
     LineNumber = Object.LineNumber;
@@ -166,18 +170,10 @@ class LLVM_ABI LVObject {
     Parent = Object.Parent;
   }
 
-#ifndef NDEBUG
-  // This is an internal ID used for debugging logical elements. It is used
-  // for cases where an unique offset within the binary input file is not
-  // available.
-  static uint64_t GID;
-  uint64_t ID = 0;
-
   void incID() {
     ++GID;
     ID = GID;
   }
-#endif
 
 protected:
   // Get a string representation for the given number and discriminator.
@@ -193,11 +189,7 @@ class LLVM_ABI LVObject {
   virtual void printFileIndex(raw_ostream &OS, bool Full = true) const {}
 
 public:
-  LVObject() {
-#ifndef NDEBUG
-    incID();
-#endif
-  };
+  LVObject() { incID(); };
   LVObject &operator=(const LVObject &) = delete;
   virtual ~LVObject() = default;
 
@@ -313,17 +305,10 @@ class LLVM_ABI LVObject {
   virtual void printExtra(raw_ostream &OS, bool Full = true) const {}
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  virtual void dump() const { print(dbgs()); }
+  void dump() const { print(dbgs()); }
 #endif
 
-  uint64_t getID() const {
-    return
-#ifndef NDEBUG
-        ID;
-#else
-        0;
-#endif
-  }
+  uint32_t getID() const { return ID; }
 };
 
 } // end namespace logicalview
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVRange.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVRange.h
index 07d5813e5b19b..b5c833330e59e 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVRange.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVRange.h
@@ -87,10 +87,6 @@ class LLVM_ABI LVRange final : public LVObject {
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 } // end namespace logicalview
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
index 0f536b5c16b96..5715a37185b2b 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h
@@ -325,10 +325,6 @@ class LLVM_ABI LVScope : public LVElement {
   void printExtra(raw_ostream &OS, bool Full = true) const override;
   virtual void printWarnings(raw_ostream &OS, bool Full = true) const {}
   virtual void printMatchedElements(raw_ostream &OS, bool UseMatchedElements) {}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 // Class to represent a DWARF Union/Structure/Class.
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSymbol.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSymbol.h
index 93ca2a73d64dd..ec9017e16b659 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSymbol.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVSymbol.h
@@ -183,10 +183,6 @@ class LLVM_ABI LVSymbol final : public LVElement {
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 } // end namespace logicalview
diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVType.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVType.h
index cbce9cb65c920..59e6a92be8ce6 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVType.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVType.h
@@ -146,10 +146,6 @@ class LLVM_ABI LVType : public LVElement {
 
   void print(raw_ostream &OS, bool Full = true) const override;
   void printExtra(raw_ostream &OS, bool Full = true) const override;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  void dump() const override { print(dbgs()); }
-#endif
 };
 
 // Class to represent DW_TAG_typedef_type.
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVObject.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVObject.cpp
index 75acbf3225e08..5ccbcbfa4f0aa 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVObject.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVObject.cpp
@@ -21,9 +21,7 @@ using namespace llvm::logicalview;
 
 #define DEBUG_TYPE "Object"
 
-#ifndef NDEBUG
-uint64_t LVObject::GID = 0;
-#endif
+uint32_t LVObject::GID = 0;
 
 StringRef llvm::logicalview::typeNone() { return StringRef(); }
 StringRef llvm::logicalview::typeVoid() { return "void"; }
@@ -137,10 +135,8 @@ void LVObject::printAttributes(raw_ostream &OS, bool Full, StringRef Name,
 }
 
 void LVObject::printAttributes(raw_ostream &OS, bool Full) const {
-#ifndef NDEBUG
   if (options().getInternalID())
     OS << hexSquareString(getID());
-#endif
   if (options().getCompareExecute() &&
       (options().getAttributeAdded() || options().getAttributeMissing()))
     OS << (getIsAdded() ? '+' : getIsMissing() ? '-' : ' ');
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
index 467bb98670b40..af35e58ac0dd6 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
@@ -259,12 +259,10 @@ void LVOptions::resolveDependencies() {
 }
 
 void LVOptions::calculateIndentationSize() {
-#ifndef NDEBUG
   if (getInternalID()) {
     std::string String = hexSquareString(0);
     IndentationSize += String.length();
   }
-#endif
   if (getCompareExecute() && (getAttributeAdded() || getAttributeMissing()))
     ++IndentationSize;
   if (getAttributeOffset()) {

From f12dd8f86a2911f69349807359d3bc792e6b773d Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj@nvidia.com>
Date: Mon, 16 Jun 2025 09:57:21 +0100
Subject: [PATCH 570/851] [ValueTracking] Remove unused variable in
 matchSimpleRecurrence (NFC). (#144316)

---
 llvm/lib/Analysis/ValueTracking.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index d39efb2859747..9df667926faf0 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -9081,7 +9081,6 @@ bool llvm::matchSimpleRecurrence(const PHINode *P, BinaryOperator *&BO,
     auto *LU = dyn_cast<BinaryOperator>(L);
     if (!LU)
       continue;
-    unsigned Opcode = LU->getOpcode();
     Value *LL = LU->getOperand(0);
     Value *LR = LU->getOperand(1);
 

From 3dd61c1876446fb9db7c87b89006ad6d81f72f0d Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs@arm.com>
Date: Mon, 16 Jun 2025 09:58:03 +0100
Subject: [PATCH 571/851] [LV] Fix MVE regression from #132190 (#141736)

Register pressure was only considered if the vector bandwidth was being
maximised (chosen either by the target or user options), but #132190
inadvertently caused high pressure VFs to be pruned even when max
bandwidth wasn't enabled. This PR returns to the previous behaviour.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  51 +++-
 .../ARM/mve-reg-pressure-vmla.ll              | 136 +++++++++
 .../RISCV/interleaved-masked-access.ll        | 284 +++++++++---------
 .../LoopVectorize/RISCV/reg-usage-bf16.ll     |   3 +-
 .../LoopVectorize/RISCV/reg-usage-f16.ll      |   6 +-
 .../LoopVectorize/RISCV/reg-usage.ll          |  40 +--
 6 files changed, 338 insertions(+), 182 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 34f49a7721a30..bdbfecd962443 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -953,6 +953,14 @@ class LoopVectorizationCostModel {
     return expectedCost(UserVF).isValid();
   }
 
+  /// \return True if maximizing vector bandwidth is enabled by the target or
+  /// user options, for the given register kind.
+  bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
+
+  /// \return True if maximizing vector bandwidth is enabled by the target or
+  /// user options, for the given vector factor.
+  bool useMaxBandwidth(ElementCount VF);
+
   /// \return The size (in bits) of the smallest and widest types in the code
   /// that needs to be vectorized. We ignore values that remain scalar such as
   /// 64 bit loop indices.
@@ -3921,6 +3929,20 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   return FixedScalableVFPair::getNone();
 }
 
+bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
+  return useMaxBandwidth(VF.isScalable()
+                             ? TargetTransformInfo::RGK_ScalableVector
+                             : TargetTransformInfo::RGK_FixedWidthVector);
+}
+
+bool LoopVectorizationCostModel::useMaxBandwidth(
+    TargetTransformInfo::RegisterKind RegKind) {
+  return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
+                               (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+                                (UseWiderVFIfCallVariantsPresent &&
+                                 Legal->hasVectorCallVariants())));
+}
+
 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, bool FoldTailByMasking) {
@@ -3986,10 +4008,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
-  if (MaximizeBandwidth ||
-      (MaximizeBandwidth.getNumOccurrences() == 0 &&
-       (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
-        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
+  if (useMaxBandwidth(RegKind)) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
@@ -4344,15 +4363,21 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
   for (auto &P : VPlans) {
     ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                                P->vectorFactors().end());
-    auto RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
-    for (auto [VF, RU] : zip_equal(VFs, RUs)) {
+
+    SmallVector<VPRegisterUsage, 8> RUs;
+    if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
+        CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
+
+    for (unsigned I = 0; I < VFs.size(); I++) {
+      ElementCount VF = VFs[I];
       // The cost for scalar VF=1 is already calculated, so ignore it.
       if (VF.isScalar())
         continue;
 
       /// Don't consider the VF if it exceeds the number of registers for the
       /// target.
-      if (RU.exceedsMaxNumRegs(TTI))
+      if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
         continue;
 
       InstructionCost C = CM.expectedCost(VF);
@@ -7106,8 +7131,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   for (auto &P : VPlans) {
     ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                                P->vectorFactors().end());
-    auto RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
-    for (auto [VF, RU] : zip_equal(VFs, RUs)) {
+
+    SmallVector<VPRegisterUsage, 8> RUs;
+    if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
+        CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
+      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
+
+    for (unsigned I = 0; I < VFs.size(); I++) {
+      ElementCount VF = VFs[I];
       if (VF.isScalar())
         continue;
       if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -7129,7 +7160,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
       InstructionCost Cost = cost(*P, VF);
       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
 
-      if (RU.exceedsMaxNumRegs(TTI)) {
+      if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
         LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                           << VF << " because it uses too many registers\n");
         continue;
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll
new file mode 100644
index 0000000000000..4c29a3a0d1d01
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 5
+; RUN: opt -mattr=+mve -passes=loop-vectorize < %s -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-unknown-none-eabihf"
+
+; Even though it has high register pressure, this example should still vectorise since the mul+add chains become VMLAs.
+
+define void @fn(i32 noundef %n, ptr %in, ptr %out) #0 {
+; CHECK-LABEL: define void @fn(
+; CHECK-SAME: i32 noundef [[N:%.*]], ptr [[IN:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP46_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP46_NOT]], [[EXIT:label %.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK:       [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = mul i32 [[N]], 3
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[OUT]], i32 [[TMP0]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[IN]], i32 [[TMP0]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[OUT]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[IN]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[IN]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI2:%.*]] = phi ptr [ [[OUT]], %[[VECTOR_PH]] ], [ [[PTR_IND3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[VECTOR_GEP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI2]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 1
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[VECTOR_GEP]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 2
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER]] to <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[TMP3]], splat (i32 19595)
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER5]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], splat (i32 38470)
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER6]] to <4 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw <4 x i32> [[TMP7]], splat (i32 7471)
+; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw <4 x i32> [[TMP4]], splat (i32 32768)
+; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw <4 x i32> [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw <4 x i32> [[TMP10]], [[TMP8]]
+; CHECK-NEXT:    [[TMP12:%.*]] = lshr <4 x i32> [[TMP11]], splat (i32 16)
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nuw nsw <4 x i32> [[TMP3]], splat (i32 32767)
+; CHECK-NEXT:    [[TMP15:%.*]] = mul nuw <4 x i32> [[TMP5]], splat (i32 16762097)
+; CHECK-NEXT:    [[TMP16:%.*]] = mul nuw <4 x i32> [[TMP7]], splat (i32 16759568)
+; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw <4 x i32> [[TMP14]], splat (i32 32768)
+; CHECK-NEXT:    [[TMP18:%.*]] = add nuw <4 x i32> [[TMP17]], [[TMP15]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add <4 x i32> [[TMP18]], [[TMP16]]
+; CHECK-NEXT:    [[TMP20:%.*]] = lshr <4 x i32> [[TMP19]], splat (i32 16)
+; CHECK-NEXT:    [[TMP21:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i8>
+; CHECK-NEXT:    [[TMP22:%.*]] = mul nuw nsw <4 x i32> [[TMP3]], splat (i32 13282)
+; CHECK-NEXT:    [[TMP23:%.*]] = mul nuw <4 x i32> [[TMP5]], splat (i32 16744449)
+; CHECK-NEXT:    [[TMP24:%.*]] = mul nuw nsw <4 x i32> [[TMP7]], splat (i32 19485)
+; CHECK-NEXT:    [[TMP25:%.*]] = add nuw nsw <4 x i32> [[TMP22]], splat (i32 32768)
+; CHECK-NEXT:    [[TMP26:%.*]] = add nuw <4 x i32> [[TMP25]], [[TMP23]]
+; CHECK-NEXT:    [[TMP27:%.*]] = add nuw <4 x i32> [[TMP26]], [[TMP24]]
+; CHECK-NEXT:    [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], splat (i32 16)
+; CHECK-NEXT:    [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i8>
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 1
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP13]], <4 x ptr> [[VECTOR_GEP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 2
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP21]], <4 x ptr> [[TMP30]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP29]], <4 x ptr> [[TMP31]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12
+; CHECK-NEXT:    [[PTR_IND3]] = getelementptr i8, ptr [[POINTER_PHI2]], i32 12
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br [[EXIT_LOOPEXIT:label %.*]]
+; CHECK:       [[SCALAR_PH]]:
+;
+entry:
+  %cmp46.not = icmp eq i32 %n, 0
+  br i1 %cmp46.not, label %exit, label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %ptr.iv.1 = phi ptr [ %in, %entry ], [ %ptr.iv.1.next, %for.body ]
+  %ptr.iv.2 = phi ptr [ %out, %entry ], [ %ptr.iv.2.next, %for.body ]
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %ptr.iv.1, i32 1
+  %0 = load i8, ptr %ptr.iv.1, align 1
+  %incdec.ptr1 = getelementptr inbounds nuw i8, ptr %ptr.iv.1, i32 2
+  %1 = load i8, ptr %incdec.ptr, align 1
+  %ptr.iv.1.next = getelementptr inbounds nuw i8, ptr %ptr.iv.1, i32 3
+  %2 = load i8, ptr %incdec.ptr1, align 1
+  %conv = zext i8 %0 to i32
+  %mul = mul nuw nsw i32 %conv, 19595
+  %conv3 = zext i8 %1 to i32
+  %mul4 = mul nuw nsw i32 %conv3, 38470
+  %conv5 = zext i8 %2 to i32
+  %mul6 = mul nuw nsw i32 %conv5, 7471
+  %add = add nuw nsw i32 %mul, 32768
+  %add7 = add nuw nsw i32 %add, %mul4
+  %add8 = add nuw nsw i32 %add7, %mul6
+  %shr = lshr i32 %add8, 16
+  %conv9 = trunc nuw i32 %shr to i8
+  %mul11 = mul nuw nsw i32 %conv, 32767
+  %mul13 = mul nuw i32 %conv3, 16762097
+  %mul16 = mul nuw i32 %conv5, 16759568
+  %add14 = add nuw nsw i32 %mul11, 32768
+  %add17 = add nuw i32 %add14, %mul13
+  %add18 = add i32 %add17, %mul16
+  %shr19 = lshr i32 %add18, 16
+  %conv20 = trunc i32 %shr19 to i8
+  %mul22 = mul nuw nsw i32 %conv, 13282
+  %mul24 = mul nuw i32 %conv3, 16744449
+  %mul27 = mul nuw nsw i32 %conv5, 19485
+  %add25 = add nuw nsw i32 %mul22, 32768
+  %add28 = add nuw i32 %add25, %mul24
+  %add29 = add nuw i32 %add28, %mul27
+  %shr30 = lshr i32 %add29, 16
+  %conv31 = trunc i32 %shr30 to i8
+  %incdec.ptr32 = getelementptr inbounds nuw i8, ptr %ptr.iv.2, i32 1
+  store i8 %conv9, ptr %ptr.iv.2, align 1
+  %incdec.ptr33 = getelementptr inbounds nuw i8, ptr %ptr.iv.2, i32 2
+  store i8 %conv20, ptr %incdec.ptr32, align 1
+  %ptr.iv.2.next = getelementptr inbounds nuw i8, ptr %ptr.iv.2, i32 3
+  store i8 %conv31, ptr %incdec.ptr33, align 1
+  %iv.next = add nuw i32 %iv, 1
+  %exitcond.not = icmp eq i32 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 1b0feef3e6664..b7c9612e57aec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -11,44 +11,44 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:  entry:
 ; SCALAR_EPILOGUE-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_EPILOGUE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_EPILOGUE:       vector.ph:
 ; SCALAR_EPILOGUE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 3
-; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_EPILOGUE:       vector.body:
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP9]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP10]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP12]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP13]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP15]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP14]], <vscale x 8 x ptr> [[TMP16]], i32 1, <vscale x 8 x i1> [[TMP7]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP18]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP17]], <vscale x 8 x ptr> [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP9]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP12]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP14]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
@@ -63,42 +63,42 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 3
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 [[INDEX]], i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i1> [[TMP5]], <vscale x 8 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP8]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP9]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP12]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP13]], <vscale x 8 x ptr> [[TMP15]], i32 1, <vscale x 8 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP16]], <vscale x 8 x ptr> [[TMP18]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -180,60 +180,60 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:  entry:
 ; SCALAR_EPILOGUE-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; SCALAR_EPILOGUE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_EPILOGUE:       vector.ph:
 ; SCALAR_EPILOGUE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 3
+; SCALAR_EPILOGUE-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; SCALAR_EPILOGUE-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 3
-; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_EPILOGUE:       vector.body:
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 1)
-; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 2)
-; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 8 x i32> [[TMP8]], splat (i32 3)
-; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP12]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP13]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP15]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP16]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP17]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP18]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP19]], i32 1, <vscale x 8 x i1> [[TMP7]], <vscale x 8 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP20]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER3]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP22]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP24]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP20]], <vscale x 8 x ptr> [[TMP25]], i32 1, <vscale x 8 x i1> [[TMP7]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP26:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP26]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP21]], <vscale x 8 x ptr> [[TMP27]], i32 1, <vscale x 8 x i1> [[TMP7]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP28:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP28]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP22]], <vscale x 8 x ptr> [[TMP29]], i32 1, <vscale x 8 x i1> [[TMP7]])
-; SCALAR_EPILOGUE-NEXT:    [[TMP30:%.*]] = zext nneg <vscale x 8 x i32> [[TMP11]] to <vscale x 8 x i64>
-; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP30]]
-; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP23]], <vscale x 8 x ptr> [[TMP31]], i32 1, <vscale x 8 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_EPILOGUE-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
+; SCALAR_EPILOGUE-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1)
+; SCALAR_EPILOGUE-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 2)
+; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 3)
+; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP12]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP14]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP16]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP18]]
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP20]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP22]]
+; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP24]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP25]], i32 1, <vscale x 16 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP26:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP26]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP27]], i32 1, <vscale x 16 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP28:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP28]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP29]], i32 1, <vscale x 16 x i1> [[TMP7]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP30:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
+; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP30]]
+; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP23]], <vscale x 16 x ptr> [[TMP31]], i32 1, <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
@@ -248,58 +248,58 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING:       vector.ph:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[CONV:%.*]] = zext i8 [[GUARD]] to i32
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 3
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.stepvector.nxv8i32()
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[CONV]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[DOTSPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 8 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i32(i32 [[INDEX]], i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i1> [[TMP5]], <vscale x 8 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 8 x i32> [[VEC_IND]], splat (i32 2)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 2)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 8 x i32> [[TMP7]], splat (i32 3)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP12]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP14]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP16]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 8 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[TMP18]], i32 1, <vscale x 8 x i1> [[TMP6]], <vscale x 8 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER1]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 8 x i8> @llvm.smax.nxv8i8(<vscale x 8 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 8 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = sub <vscale x 8 x i8> zeroinitializer, [[TMP21]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 8 x i32> [[TMP7]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP23]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP19]], <vscale x 8 x ptr> [[TMP24]], i32 1, <vscale x 8 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 8 x i32> [[TMP8]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP25]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP20]], <vscale x 8 x ptr> [[TMP26]], i32 1, <vscale x 8 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 8 x i32> [[TMP9]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP27]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP21]], <vscale x 8 x ptr> [[TMP28]], i32 1, <vscale x 8 x i1> [[TMP6]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 8 x i32> [[TMP10]] to <vscale x 8 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 8 x i64> [[TMP29]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP22]], <vscale x 8 x ptr> [[TMP30]], i32 1, <vscale x 8 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 1024)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 2)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 2)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 3)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP19]], <vscale x 16 x ptr> [[TMP24]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP25:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP25]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP20]], <vscale x 16 x ptr> [[TMP26]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP27:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP27]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP21]], <vscale x 16 x ptr> [[TMP28]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP29:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
index cb071f989dafa..5a67b54c7a3d5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
@@ -3,8 +3,7 @@
 
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
-; CHECK:       LV(REG): VF = 8
-; CHECK-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK:  LV(REG): Found max usage: 2 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
index 15facfc48137b..d4909fa61b4f5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
@@ -4,14 +4,12 @@
 
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
-; ZVFH:       LV(REG): VF = 8
-; ZVFH-NEXT:  LV(REG): Found max usage: 2 item
+; ZVFH:  LV(REG): Found max usage: 2 item
 ; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; ZVFH-NEXT:  LV(REG): Found invariant usage: 1 item
 ; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; ZVFHMIN:       LV(REG): VF = 8
-; ZVFHMIN-NEXT:  LV(REG): Found max usage: 2 item
+; ZVFHMIN:  LV(REG): Found max usage: 2 item
 ; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; ZVFHMIN-NEXT:  LV(REG): Found invariant usage: 1 item
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
index 870f52876c5a9..cee0b1222b6be 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
@@ -28,28 +28,24 @@ define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture rea
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::FPRRC, 2 registers
 ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL1:       LV(REG): VF = 2
-; CHECK-LMUL1-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK-LMUL1:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-LMUL1-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL2:       LV(REG): VF = 4
-; CHECK-LMUL2-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK-LMUL2:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
+; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-LMUL2-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL4:       LV(REG): VF = 8
-; CHECK-LMUL4-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK-LMUL4:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
+; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 ; CHECK-LMUL4-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
-; CHECK-LMUL8:       LV(REG): VF = 16
-; CHECK-LMUL8-NEXT:  LV(REG): Found max usage: 2 item
+; CHECK-LMUL8:       LV(REG): Found max usage: 2 item
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
+; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 16 registers
 ; CHECK-LMUL8-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 
@@ -80,21 +76,17 @@ define void @goo(ptr nocapture noundef %a, i32 noundef signext %n) {
 ; CHECK-SCALAR:      LV(REG): VF = 1
 ; CHECK-SCALAR-NEXT: LV(REG): Found max usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL1:       LV(REG): VF = 2
-; CHECK-LMUL1-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
-; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
-; CHECK-LMUL2:       LV(REG): VF = 4
-; CHECK-LMUL2-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-LMUL1:       LV(REG): Found max usage: 2 item
+; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 1 registers
+; CHECK-LMUL2:       LV(REG): Found max usage: 2 item
+; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
-; CHECK-LMUL4:       LV(REG): VF = 8
-; CHECK-LMUL4-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-LMUL4:       LV(REG): Found max usage: 2 item
+; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
-; CHECK-LMUL8:       LV(REG): VF = 16
-; CHECK-LMUL8-NEXT:  LV(REG): Found max usage: 2 item
-; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
+; CHECK-LMUL8:       LV(REG): Found max usage: 2 item
+; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 entry:
   %cmp3 = icmp sgt i32 %n, 0

From a75e0627f97ccc36ec222a53c6a1106157a380ac Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Mon, 16 Jun 2025 10:02:38 +0100
Subject: [PATCH 572/851] [LV] Use vscale for tuning when updating profile
 information (#143690)

In fixVectorizedLoop we call setProfileInfoAfterUnrolling to update the
profile information after vectorising, however for scalable VFs we
pessimistically assume vscale=1. We can improve upon this by using the
value of vscale used for tuning, i.e. when targeting neoverse-v1 the
expected value is 2.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  35 +++---
 .../LoopVectorize/AArch64/check-prof-info.ll  | 119 ++++++++++++++++++
 .../LoopVectorize/check-prof-info.ll          |  32 +++++
 3 files changed, 169 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bdbfecd962443..bd0a2ec3986d3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2691,6 +2691,20 @@ static void cse(BasicBlock *BB) {
   }
 }
 
+/// This function attempts to return a value that represents the vectorization
+/// factor at runtime. For fixed-width VFs we know this precisely at compile
+/// time, but for scalable VFs we calculate it based on an estimate of the
+/// vscale value.
+static unsigned getEstimatedRuntimeVF(ElementCount VF,
+                                      std::optional<unsigned> VScale) {
+  unsigned EstimatedVF = VF.getKnownMinValue();
+  if (VF.isScalable())
+    if (VScale)
+      EstimatedVF *= *VScale;
+  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
+  return EstimatedVF;
+}
+
 InstructionCost
 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                               ElementCount VF) const {
@@ -2790,10 +2804,11 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   //
   // For scalable vectorization we can't know at compile time how many
   // iterations of the loop are handled in one vector iteration, so instead
-  // assume a pessimistic vscale of '1'.
+  // use the value of vscale used for tuning.
   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
-  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
-                               VF.getKnownMinValue() * UF);
+  unsigned EstimatedVFxUF =
+      getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning());
+  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
 }
 
 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -4031,20 +4046,6 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   return MaxVF;
 }
 
-/// This function attempts to return a value that represents the vectorization
-/// factor at runtime. For fixed-width VFs we know this precisely at compile
-/// time, but for scalable VFs we calculate it based on an estimate of the
-/// vscale value.
-static unsigned getEstimatedRuntimeVF(ElementCount VF,
-                                      std::optional<unsigned> VScale) {
-  unsigned EstimatedVF = VF.getKnownMinValue();
-  if (VF.isScalable())
-    if (VScale)
-      EstimatedVF *= *VScale;
-  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
-  return EstimatedVF;
-}
-
 bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                 const VectorizationFactor &B,
                                                 const unsigned MaxTripCount,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
new file mode 100644
index 0000000000000..9435c544fc812
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:" --version 5
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s |  FileCheck %s -check-prefix=CHECK-V1-IC1
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=1 -S < %s |  FileCheck %s -check-prefix=CHECK-V2-IC1
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -mcpu=neoverse-v2 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-V2-IC4
+
+target triple = "aarch64-unknown-linux-gnu"
+
+@a = global [1024 x i32] zeroinitializer, align 16
+@b = global [1024 x i32] zeroinitializer, align 16
+
+; We expect the branch weight computations after vectorisation to use
+; vscale=2 for neoverse-v1 and vscale=1 for neoverse-v2.
+define void @_Z3foov() {
+; CHECK-V1-IC1-LABEL: define void @_Z3foov(
+; CHECK-V1-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V1-IC1:  [[ENTRY:.*:]]
+; CHECK-V1-IC1:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V1-IC1:  [[VECTOR_PH]]:
+; CHECK-V1-IC1:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V1-IC1:  [[VECTOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF0]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-V1-IC1:  [[MIDDLE_BLOCK]]:
+; CHECK-V1-IC1:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF4:![0-9]+]]
+; CHECK-V1-IC1:  [[SCALAR_PH]]:
+; CHECK-V1-IC1:    br label %[[FOR_BODY:.*]]
+; CHECK-V1-IC1:  [[FOR_BODY]]:
+; CHECK-V1-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF5:![0-9]+]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-V1-IC1:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC1-LABEL: define void @_Z3foov(
+; CHECK-V2-IC1-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V2-IC1:  [[ENTRY:.*:]]
+; CHECK-V2-IC1:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC1:  [[VECTOR_PH]]:
+; CHECK-V2-IC1:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC1:  [[VECTOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[TMP4:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC1:  [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC1:    br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC1:  [[SCALAR_PH]]:
+; CHECK-V2-IC1:    br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC1:  [[FOR_BODY]]:
+; CHECK-V2-IC1:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC1:  [[FOR_COND_CLEANUP]]:
+;
+; CHECK-V2-IC4-LABEL: define void @_Z3foov(
+; CHECK-V2-IC4-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-V2-IC4:  [[VEC_EPILOG_VECTOR_BODY1:.*:]]
+; CHECK-V2-IC4:    br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-V2-IC4:  [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-V2-IC4:    br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]]
+; CHECK-V2-IC4:  [[VECTOR_PH]]:
+; CHECK-V2-IC4:    br label %[[VECTOR_BODY:.*]]
+; CHECK-V2-IC4:  [[VECTOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[TMP12:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-V2-IC4:  [[MIDDLE_BLOCK]]:
+; CHECK-V2-IC4:    br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-V2-IC4:    br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_PH]]:
+; CHECK-V2-IC4:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[TMP23:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-V2-IC4:    br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]]
+; CHECK-V2-IC4:  [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-V2-IC4:    br label %[[FOR_BODY:.*]]
+; CHECK-V2-IC4:  [[FOR_BODY]]:
+; CHECK-V2-IC4:    br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-V2-IC4:  [[FOR_COND_CLEANUP]]:
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
+  %load = load i32, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
+  store i32 %load, ptr %arrayidx2, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !0
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 1023}
+;.
+; CHECK-V1-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V1-IC1: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK-V1-IC1: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V1-IC1: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V1-IC1: [[PROF4]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V1-IC1: [[PROF5]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V1-IC1: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META2]]}
+;.
+; CHECK-V2-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V2-IC1: [[PROF1]] = !{!"branch_weights", i32 1, i32 255}
+; CHECK-V2-IC1: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-V2-IC1: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V2-IC1: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V2-IC1: [[PROF5]] = !{!"branch_weights", i32 1, i32 3}
+; CHECK-V2-IC1: [[PROF6]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]}
+;.
+; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
+; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-V2-IC4: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
+; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15}
+; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 2, i32 0}
+; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]}
+; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 1}
+; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
index 87c1ccb702277..40741941d4b02 100644
--- a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "br" --filter "^.*:"
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s |  FileCheck %s
 ; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-MASKED
+; RUN: opt -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 \
+; RUN:   -scalable-vectorization=on -force-target-supports-scalable-vectors -S < %s |  FileCheck %s -check-prefix=CHECK-SCALABLE
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -39,6 +41,21 @@ define void @_Z3foov() {
 ; CHECK-MASKED:  for.body:
 ; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
 ;
+; CHECK-SCALABLE-LABEL: @_Z3foov(
+; CHECK-SCALABLE:  entry:
+; CHECK-SCALABLE:    br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK-SCALABLE:  vector.ph:
+; CHECK-SCALABLE:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SCALABLE:  vector.body:
+; CHECK-SCALABLE:    br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-SCALABLE:  middle.block:
+; CHECK-SCALABLE:    br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
+; CHECK-SCALABLE:  scalar.ph:
+; CHECK-SCALABLE:    br label [[FOR_BODY:%.*]]
+; CHECK-SCALABLE:  for.cond.cleanup:
+; CHECK-SCALABLE:  for.body:
+; CHECK-SCALABLE:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
+;
 entry:
   br label %for.body
 
@@ -92,6 +109,21 @@ define void @_Z3foo2v() {
 ; CHECK-MASKED:  for.body:
 ; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
 ;
+; CHECK-SCALABLE-LABEL: @_Z3foo2v(
+; CHECK-SCALABLE:  entry:
+; CHECK-SCALABLE:    br i1 [[MIN_ITERS_CHECK:%.*]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF0]]
+; CHECK-SCALABLE:  vector.ph:
+; CHECK-SCALABLE:    br label [[VECTOR_BODY:%.*]]
+; CHECK-SCALABLE:  vector.body:
+; CHECK-SCALABLE:    br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-SCALABLE:  middle.block:
+; CHECK-SCALABLE:    br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
+; CHECK-SCALABLE:  scalar.ph:
+; CHECK-SCALABLE:    br label [[FOR_BODY:%.*]]
+; CHECK-SCALABLE:  for.cond.cleanup:
+; CHECK-SCALABLE:  for.body:
+; CHECK-SCALABLE:    br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+;
 entry:
   br label %for.body
 

From 79a2b15a4c2d63784fe2a92a72828a14b72412df Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Mon, 16 Jun 2025 11:12:15 +0200
Subject: [PATCH 573/851] [libc++] Remove a few workarounds for old Clang
 versions (#143858)

---
 libcxx/include/__config         |  9 +--------
 libcxx/include/__utility/pair.h | 15 +++------------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/libcxx/include/__config b/libcxx/include/__config
index 38c47e8d45c81..af8a297fdf3fd 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -505,13 +505,6 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_HIDE_FROM_ABI_AFTER_V1 _LIBCPP_HIDE_FROM_ABI
 #  endif
 
-// TODO: Remove this workaround once we drop support for Clang 16
-#  if __has_warning("-Wc++23-extensions")
-#    define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++23-extensions")
-#  else
-#    define _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++2b-extensions")
-#  endif
-
 // Clang modules take a significant compile time hit when pushing and popping diagnostics.
 // Since all the headers are marked as system headers unless _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER is defined, we can
 // simply disable this pushing and popping when _LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER isn't defined.
@@ -522,7 +515,7 @@ typedef __char32_t char32_t;
       _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++14-extensions")                                                           \
       _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++17-extensions")                                                           \
       _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++20-extensions")                                                           \
-      _LIBCPP_CLANG_DIAGNOSTIC_IGNORED_CXX23_EXTENSION                                                                 \
+      _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wc++23-extensions")                                                           \
       _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++14-extensions")                                                             \
       _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++17-extensions")                                                             \
       _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wc++20-extensions")                                                             \
diff --git a/libcxx/include/__utility/pair.h b/libcxx/include/__utility/pair.h
index ab390aafa0d9d..dbacbce044766 100644
--- a/libcxx/include/__utility/pair.h
+++ b/libcxx/include/__utility/pair.h
@@ -209,21 +209,12 @@ struct pair
 #  endif
 
 #  if _LIBCPP_STD_VER >= 23
-  // TODO: Remove this workaround in LLVM 20. The bug got fixed in Clang 18.
-  // This is a workaround for http://llvm.org/PR60710. We should be able to remove it once Clang is fixed.
-  template <class _PairLike>
-  _LIBCPP_HIDE_FROM_ABI static constexpr bool __pair_like_explicit_wknd() {
-    if constexpr (__pair_like_no_subrange<_PairLike>) {
-      return !is_convertible_v<decltype(std::get<0>(std::declval<_PairLike&&>())), first_type> ||
-             !is_convertible_v<decltype(std::get<1>(std::declval<_PairLike&&>())), second_type>;
-    }
-    return false;
-  }
-
   template <__pair_like_no_subrange _PairLike>
     requires(is_constructible_v<first_type, decltype(std::get<0>(std::declval<_PairLike &&>()))> &&
              is_constructible_v<second_type, decltype(std::get<1>(std::declval<_PairLike &&>()))>)
-  _LIBCPP_HIDE_FROM_ABI constexpr explicit(__pair_like_explicit_wknd<_PairLike>()) pair(_PairLike&& __p)
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit(
+      !is_convertible_v<decltype(std::get<0>(std::declval<_PairLike&&>())), first_type> ||
+      !is_convertible_v<decltype(std::get<1>(std::declval<_PairLike&&>())), second_type>) pair(_PairLike&& __p)
       : first(std::get<0>(std::forward<_PairLike>(__p))), second(std::get<1>(std::forward<_PairLike>(__p))) {}
 #  endif
 

From eddab9b757722da7b908723a5a61d280540b48cf Mon Sep 17 00:00:00 2001
From: Oliver Hunt <oliver@apple.com>
Date: Mon, 16 Jun 2025 12:12:22 +0300
Subject: [PATCH 574/851] [clang] Fix PointerAuth semantics of
 cpp_trivially_relocatable (#143969)

This adds a number of functions to ASTContext to query whether a
type contains data protected with address discriminated pointer
authentication, and whether the protected values are just vtable
pointers, or if there are other address discriminated types included.

For the standardized version, __builtin_is_cpp_trivially_relocatable
this means accepting types where the only address discriminated
values are vtable pointers. Other address discriminated types are
not considered relocatable. In addition to that any union containing
any address discriminated data, including vtable pointers, is not
relocatable.

For the old deprecated __builtin_is_trivially_relocatable we reject
any type containing any address discriminated value, as it is
semantically intended as being a "is this memcopyable" which is
not true for anything with address discrimination.

This PR does not update the codegen for __builtin_trivially_relocate,
that will be in a follow on PR that is much more complex.
---
 clang/include/clang/AST/ASTContext.h          |  39 +++++++
 clang/lib/AST/ASTContext.cpp                  |  67 +++++++++++
 clang/lib/Sema/SemaTypeTraits.cpp             |  17 ++-
 .../SemaCXX/cxx2c-trivially-relocatable.cpp   |   1 +
 clang/test/SemaCXX/ptrauth-triviality.cpp     |  44 ++++++-
 .../SemaCXX/trivially-relocatable-ptrauth.cpp | 109 ++++++++++++++++++
 6 files changed, 268 insertions(+), 9 deletions(-)
 create mode 100644 clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 3abb49312255a..e01361e2466b5 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -629,10 +629,48 @@ class ASTContext : public RefCountedBase<ASTContext> {
   void setRelocationInfoForCXXRecord(const CXXRecordDecl *,
                                      CXXRecordDeclRelocationInfo);
 
+  /// Examines a given type, and returns whether the type itself
+  /// is address discriminated, or any transitively embedded types
+  /// contain data that is address discriminated. This includes
+  /// implicitly authenticated values like vtable pointers, as well as
+  /// explicitly qualified fields.
+  bool containsAddressDiscriminatedPointerAuth(QualType T) {
+    if (!isPointerAuthenticationAvailable())
+      return false;
+    return findPointerAuthContent(T) != PointerAuthContent::None;
+  }
+
+  /// Examines a given type, and returns whether the type itself
+  /// or any data it transitively contains has a pointer authentication
+  /// schema that is not safely relocatable. e.g. any data or fields
+  /// with address discrimination other than any otherwise similar
+  /// vtable pointers.
+  bool containsNonRelocatablePointerAuth(QualType T) {
+    if (!isPointerAuthenticationAvailable())
+      return false;
+    return findPointerAuthContent(T) ==
+           PointerAuthContent::AddressDiscriminatedData;
+  }
+
 private:
   llvm::DenseMap<const CXXRecordDecl *, CXXRecordDeclRelocationInfo>
       RelocatableClasses;
 
+  // FIXME: store in RecordDeclBitfields in future?
+  enum class PointerAuthContent : uint8_t {
+    None,
+    AddressDiscriminatedVTable,
+    AddressDiscriminatedData
+  };
+
+  // A simple helper function to short circuit pointer auth checks.
+  bool isPointerAuthenticationAvailable() const {
+    return LangOpts.PointerAuthCalls || LangOpts.PointerAuthIntrinsics;
+  }
+  PointerAuthContent findPointerAuthContent(QualType T);
+  llvm::DenseMap<const RecordDecl *, PointerAuthContent>
+      RecordContainsAddressDiscriminatedPointerAuth;
+
   ImportDecl *FirstLocalImport = nullptr;
   ImportDecl *LastLocalImport = nullptr;
 
@@ -3668,6 +3706,7 @@ OPT_LIST(V)
   /// authentication policy for the specified record.
   const CXXRecordDecl *
   baseForVTableAuthentication(const CXXRecordDecl *ThisClass);
+
   bool useAbbreviatedThunkName(GlobalDecl VirtualMethodDecl,
                                StringRef MangledName);
 
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 4d44f23c0f503..189e67e4eed0d 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1705,6 +1705,73 @@ void ASTContext::setRelocationInfoForCXXRecord(
   RelocatableClasses.insert({D, Info});
 }
 
+static bool primaryBaseHaseAddressDiscriminatedVTableAuthentication(
+    ASTContext &Context, const CXXRecordDecl *Class) {
+  if (!Class->isPolymorphic())
+    return false;
+  const CXXRecordDecl *BaseType = Context.baseForVTableAuthentication(Class);
+  using AuthAttr = VTablePointerAuthenticationAttr;
+  const AuthAttr *ExplicitAuth = BaseType->getAttr<AuthAttr>();
+  if (!ExplicitAuth)
+    return Context.getLangOpts().PointerAuthVTPtrAddressDiscrimination;
+  AuthAttr::AddressDiscriminationMode AddressDiscrimination =
+      ExplicitAuth->getAddressDiscrimination();
+  if (AddressDiscrimination == AuthAttr::DefaultAddressDiscrimination)
+    return Context.getLangOpts().PointerAuthVTPtrAddressDiscrimination;
+  return AddressDiscrimination == AuthAttr::AddressDiscrimination;
+}
+
+ASTContext::PointerAuthContent ASTContext::findPointerAuthContent(QualType T) {
+  assert(isPointerAuthenticationAvailable());
+
+  T = T.getCanonicalType();
+  if (T.hasAddressDiscriminatedPointerAuth())
+    return PointerAuthContent::AddressDiscriminatedData;
+  const RecordDecl *RD = T->getAsRecordDecl();
+  if (!RD)
+    return PointerAuthContent::None;
+
+  if (auto Existing = RecordContainsAddressDiscriminatedPointerAuth.find(RD);
+      Existing != RecordContainsAddressDiscriminatedPointerAuth.end())
+    return Existing->second;
+
+  PointerAuthContent Result = PointerAuthContent::None;
+
+  auto SaveResultAndReturn = [&]() -> PointerAuthContent {
+    auto [ResultIter, DidAdd] =
+        RecordContainsAddressDiscriminatedPointerAuth.try_emplace(RD, Result);
+    (void)ResultIter;
+    (void)DidAdd;
+    assert(DidAdd);
+    return Result;
+  };
+  auto ShouldContinueAfterUpdate = [&](PointerAuthContent NewResult) {
+    static_assert(PointerAuthContent::None <
+                  PointerAuthContent::AddressDiscriminatedVTable);
+    static_assert(PointerAuthContent::AddressDiscriminatedVTable <
+                  PointerAuthContent::AddressDiscriminatedData);
+    if (NewResult > Result)
+      Result = NewResult;
+    return Result != PointerAuthContent::AddressDiscriminatedData;
+  };
+  if (const CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(RD)) {
+    if (primaryBaseHaseAddressDiscriminatedVTableAuthentication(*this, CXXRD) &&
+        !ShouldContinueAfterUpdate(
+            PointerAuthContent::AddressDiscriminatedVTable))
+      return SaveResultAndReturn();
+    for (auto Base : CXXRD->bases()) {
+      if (!ShouldContinueAfterUpdate(findPointerAuthContent(Base.getType())))
+        return SaveResultAndReturn();
+    }
+  }
+  for (auto *FieldDecl : RD->fields()) {
+    if (!ShouldContinueAfterUpdate(
+            findPointerAuthContent(FieldDecl->getType())))
+      return SaveResultAndReturn();
+  }
+  return SaveResultAndReturn();
+}
+
 void ASTContext::addedLocalImportDecl(ImportDecl *Import) {
   assert(!Import->getNextLocalImport() &&
          "Import declaration already in the chain");
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index 1738ab4466001..4dbb2450857e0 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -188,6 +188,7 @@ static bool IsEligibleForTrivialRelocation(Sema &SemaRef,
       return false;
   }
 
+  bool IsUnion = D->isUnion();
   for (const FieldDecl *Field : D->fields()) {
     if (Field->getType()->isDependentType())
       continue;
@@ -197,6 +198,12 @@ static bool IsEligibleForTrivialRelocation(Sema &SemaRef,
     // of a trivially relocatable type
     if (!SemaRef.IsCXXTriviallyRelocatableType(Field->getType()))
       return false;
+
+    // A union contains values with address discriminated pointer auth
+    // cannot be relocated.
+    if (IsUnion && SemaRef.Context.containsAddressDiscriminatedPointerAuth(
+                       Field->getType()))
+      return false;
   }
   return !D->hasDeletedDestructor();
 }
@@ -313,7 +320,6 @@ bool Sema::IsCXXTriviallyRelocatableType(const CXXRecordDecl &RD) {
 }
 
 bool Sema::IsCXXTriviallyRelocatableType(QualType Type) {
-
   QualType BaseElementType = getASTContext().getBaseElementType(Type);
 
   if (Type->isVariableArrayType())
@@ -322,10 +328,10 @@ bool Sema::IsCXXTriviallyRelocatableType(QualType Type) {
   if (BaseElementType.hasNonTrivialObjCLifetime())
     return false;
 
-  if (BaseElementType.hasAddressDiscriminatedPointerAuth())
+  if (BaseElementType->isIncompleteType())
     return false;
 
-  if (BaseElementType->isIncompleteType())
+  if (Context.containsNonRelocatablePointerAuth(Type))
     return false;
 
   if (BaseElementType->isScalarType() || BaseElementType->isVectorType())
@@ -670,7 +676,10 @@ static bool IsTriviallyRelocatableType(Sema &SemaRef, QualType T) {
   if (!BaseElementType->isObjectType())
     return false;
 
-  if (T.hasAddressDiscriminatedPointerAuth())
+  // The deprecated __builtin_is_trivially_relocatable does not have
+  // an equivalent to __builtin_trivially_relocate, so there is no
+  // safe way to use it if there are any address discriminated values.
+  if (SemaRef.getASTContext().containsAddressDiscriminatedPointerAuth(T))
     return false;
 
   if (const auto *RD = BaseElementType->getAsCXXRecordDecl();
diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
index 9d43994ee7661..7152a5937d9b7 100644
--- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
+++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -std=c++2c -verify %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-intrinsics -fptrauth-calls -std=c++2c -verify %s
 
 class Trivial {};
 static_assert(__builtin_is_cpp_trivially_relocatable(Trivial));
diff --git a/clang/test/SemaCXX/ptrauth-triviality.cpp b/clang/test/SemaCXX/ptrauth-triviality.cpp
index 60d1b57230f18..ba8a8273d5c05 100644
--- a/clang/test/SemaCXX/ptrauth-triviality.cpp
+++ b/clang/test/SemaCXX/ptrauth-triviality.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
-// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++20 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -std=c++26 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c++26 -fptrauth-calls -fptrauth-intrinsics -verify -fsyntax-only %s
 
 #define AQ __ptrauth(1,1,50)
 #define IQ __ptrauth(1,0,50)
@@ -83,7 +83,7 @@ static_assert(!__is_trivially_constructible(Holder<S3>, const Holder<S3>&));
 static_assert(!__is_trivially_assignable(Holder<S3>, const Holder<S3>&));
 static_assert(__is_trivially_destructible(Holder<S3>));
 static_assert(!__is_trivially_copyable(Holder<S3>));
-static_assert(__is_trivially_relocatable(Holder<S3>)); // expected-warning{{deprecated}}
+static_assert(!__is_trivially_relocatable(Holder<S3>)); // expected-warning{{deprecated}}
 static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S3>));
 static_assert(!__is_trivially_equality_comparable(Holder<S3>));
 
@@ -99,7 +99,6 @@ static_assert(!__is_trivially_assignable(S4, const S4&));
 static_assert(__is_trivially_destructible(S4));
 static_assert(!__is_trivially_copyable(S4));
 static_assert(!__is_trivially_relocatable(S4)); // expected-warning{{deprecated}}
-//FIXME
 static_assert(__builtin_is_cpp_trivially_relocatable(S4));
 static_assert(!__is_trivially_equality_comparable(S4));
 
@@ -124,7 +123,6 @@ static_assert(!__is_trivially_assignable(S5, const S5&));
 static_assert(__is_trivially_destructible(S5));
 static_assert(!__is_trivially_copyable(S5));
 static_assert(!__is_trivially_relocatable(S5)); // expected-warning{{deprecated}}
-//FIXME
 static_assert(__builtin_is_cpp_trivially_relocatable(S5));
 static_assert(!__is_trivially_equality_comparable(S5));
 
@@ -182,3 +180,39 @@ static_assert(__is_trivially_copyable(Holder<S7>));
 static_assert(__is_trivially_relocatable(Holder<S7>)); // expected-warning{{deprecated}}
 static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S7>));
 static_assert(__is_trivially_equality_comparable(Holder<S7>));
+
+template <class... Bases> struct MultipleInheriter : Bases... {
+};
+
+template <class T> static const bool test_is_trivially_relocatable_v = __builtin_is_cpp_trivially_relocatable(T);
+template <class... Types> static const bool multiple_inheritance_is_relocatable = test_is_trivially_relocatable_v<MultipleInheriter<Types...>>;
+template <class... Types> static const bool inheritance_relocatability_matches_bases_v =
+  (test_is_trivially_relocatable_v<Types> && ...) == multiple_inheritance_is_relocatable<Types...>;
+
+static_assert(multiple_inheritance_is_relocatable<S4, S5> == multiple_inheritance_is_relocatable<S5, S4>);
+static_assert(inheritance_relocatability_matches_bases_v<S4, S5>);
+static_assert(inheritance_relocatability_matches_bases_v<S5, S4>);
+
+struct AA AddressDiscriminatedPolymorphicBase trivially_relocatable_if_eligible {
+  virtual void foo();
+};
+
+struct IA NoAddressDiscriminatedPolymorphicBase trivially_relocatable_if_eligible {
+  virtual void bar();
+};
+
+template <class T> struct UnionWrapper trivially_relocatable_if_eligible {
+  union U {
+    T field1;
+  } u;
+};
+
+static_assert(test_is_trivially_relocatable_v<AddressDiscriminatedPolymorphicBase>);
+static_assert(test_is_trivially_relocatable_v<NoAddressDiscriminatedPolymorphicBase>);
+static_assert(inheritance_relocatability_matches_bases_v<AddressDiscriminatedPolymorphicBase, NoAddressDiscriminatedPolymorphicBase>);
+static_assert(inheritance_relocatability_matches_bases_v<NoAddressDiscriminatedPolymorphicBase, AddressDiscriminatedPolymorphicBase>);
+
+static_assert(!test_is_trivially_relocatable_v<UnionWrapper<AddressDiscriminatedPolymorphicBase>>);
+static_assert(test_is_trivially_relocatable_v<UnionWrapper<NoAddressDiscriminatedPolymorphicBase>>);
+static_assert(!test_is_trivially_relocatable_v<UnionWrapper<MultipleInheriter<NoAddressDiscriminatedPolymorphicBase, AddressDiscriminatedPolymorphicBase>>>);
+static_assert(!test_is_trivially_relocatable_v<UnionWrapper<MultipleInheriter<AddressDiscriminatedPolymorphicBase, NoAddressDiscriminatedPolymorphicBase>>>);
diff --git a/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp b/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp
new file mode 100644
index 0000000000000..b38499a634fcf
--- /dev/null
+++ b/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp
@@ -0,0 +1,109 @@
+// RUN: %clang_cc1 -triple arm64 -fptrauth-calls -fptrauth-intrinsics -std=c++26 -verify %s
+
+// This test intentionally does not enable the global address discrimination
+// of vtable pointers. This lets us configure them with different schemas
+// and verify that we're correctly tracking the existence of address discrimination
+
+// expected-no-diagnostics
+
+struct NonAddressDiscPtrauth {
+  void * __ptrauth(1, 0, 1234) p;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(NonAddressDiscPtrauth));
+
+struct AddressDiscPtrauth {
+  void * __ptrauth(1, 1, 1234) p;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(AddressDiscPtrauth));
+
+struct MultipleBaseClasses : NonAddressDiscPtrauth, AddressDiscPtrauth {
+
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(MultipleBaseClasses));
+
+struct MultipleMembers1 {
+   NonAddressDiscPtrauth field0;
+   AddressDiscPtrauth field1;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(MultipleMembers1));
+
+struct MultipleMembers2 {
+   NonAddressDiscPtrauth field0;
+   NonAddressDiscPtrauth field1;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(MultipleMembers2));
+
+struct UnionOfPtrauth {
+    union {
+        NonAddressDiscPtrauth field0;
+        AddressDiscPtrauth field1;
+    } u;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfPtrauth));
+
+struct [[clang::ptrauth_vtable_pointer(process_independent,address_discrimination,no_extra_discrimination)]] Polymorphic trivially_relocatable_if_eligible {
+  virtual ~Polymorphic();
+};
+
+struct Foo : Polymorphic {
+  Foo(const Foo&);
+  ~Foo();
+};
+
+
+static_assert(__builtin_is_cpp_trivially_relocatable(Polymorphic));
+
+struct [[clang::ptrauth_vtable_pointer(process_independent,no_address_discrimination,no_extra_discrimination)]] NonAddressDiscriminatedPolymorphic trivially_relocatable_if_eligible {
+  virtual ~NonAddressDiscriminatedPolymorphic();
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(NonAddressDiscriminatedPolymorphic));
+
+
+struct PolymorphicMembers {
+    Polymorphic field;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(PolymorphicMembers));
+
+struct UnionOfPolymorphic {
+  union trivially_relocatable_if_eligible {
+    Polymorphic p;
+    int i;
+  } u;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfPolymorphic));
+
+
+struct UnionOfNonAddressDiscriminatedPolymorphic {
+  union trivially_relocatable_if_eligible {
+    NonAddressDiscriminatedPolymorphic p;
+    int i;
+  } u;
+};
+static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfNonAddressDiscriminatedPolymorphic));
+
+struct UnionOfNonAddressDiscriminatedPtrauth {
+  union {
+    NonAddressDiscPtrauth p;
+    int i;
+  } u;
+};
+
+static_assert(__builtin_is_cpp_trivially_relocatable(UnionOfNonAddressDiscriminatedPtrauth));
+
+struct UnionOfAddressDisriminatedPtrauth {
+  union {
+    AddressDiscPtrauth p;
+    int i;
+  } u;
+};
+
+static_assert(!__builtin_is_cpp_trivially_relocatable(UnionOfAddressDisriminatedPtrauth));

From b2bf017acd0369fff89b933cf7c653f62b49f8d3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 16 Jun 2025 10:31:24 +0100
Subject: [PATCH 575/851] [X86] X86FixupInstTuning - prefer VPBLENDD to
 VPBLENDW shuffles on AVX2+ targets (#144269)

On many Intel AVX2 targets (Haswell+), VPBLENDD has notably better throughput than VPBLENDW - and the remaining Intel/AMD targets have no preference.

This patch replaces VPBLENDW shuffles if the shuffle mask can be safely widened from vXi16 to vXi32 and that the scheduler model doesn't consider it a regression (I haven't found any target where this is true, but we should retain the model check).

Noticed while working on #142972 where VMOVSS nodes were regressing to VPBLENDW nodes during domain switching.
---
 llvm/lib/Target/X86/X86FixupInstTuning.cpp    | 26 ++++++++++++
 llvm/test/CodeGen/X86/combine-or-shuffle.ll   |  2 +-
 llvm/test/CodeGen/X86/dpbusd.ll               | 12 +++---
 llvm/test/CodeGen/X86/dpbusd_const.ll         | 16 +++----
 llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll |  6 +--
 .../CodeGen/X86/vector-reduce-add-mask.ll     |  2 +-
 .../CodeGen/X86/vector-reduce-add-zext.ll     |  4 +-
 llvm/test/CodeGen/X86/vector-reduce-add.ll    | 27 ++++++++----
 .../CodeGen/X86/zero_extend_vector_inreg.ll   | 42 +++++++++----------
 .../zero_extend_vector_inreg_of_broadcast.ll  |  6 +--
 10 files changed, 89 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
index 89093b2e1a3f5..33dc0a232815c 100644
--- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp
+++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp
@@ -242,6 +242,26 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessUNPCKToIntDomain(NewOpc);
   };
 
+  auto ProcessBLENDWToBLENDD = [&](unsigned MovOpc, unsigned NumElts) -> bool {
+    if (!ST->hasAVX2() || !NewOpcPreferable(MovOpc))
+      return false;
+    // Convert to VPBLENDD if scaling the VPBLENDW mask down/up loses no bits.
+    APInt MaskW =
+        APInt(8, MI.getOperand(NumOperands - 1).getImm(), /*IsSigned=*/false);
+    APInt MaskD = APIntOps::ScaleBitMask(MaskW, 4, /*MatchAllBits=*/true);
+    if (MaskW != APIntOps::ScaleBitMask(MaskD, 8, /*MatchAllBits=*/true))
+      return false;
+    APInt NewMaskD = APInt::getSplat(NumElts, MaskD);
+    LLVM_DEBUG(dbgs() << "Replacing: " << MI);
+    {
+      MI.setDesc(TII->get(MovOpc));
+      MI.removeOperand(NumOperands - 1);
+      MI.addOperand(MachineOperand::CreateImm(NewMaskD.getZExtValue()));
+    }
+    LLVM_DEBUG(dbgs() << "     With: " << MI);
+    return true;
+  };
+
   auto ProcessBLENDToMOV = [&](unsigned MovOpc, unsigned Mask,
                                unsigned MovImm) -> bool {
     if ((MI.getOperand(NumOperands - 1).getImm() & Mask) != MovImm)
@@ -270,6 +290,12 @@ bool X86FixupInstTuningPass::processInstruction(
     return ProcessBLENDToMOV(X86::VMOVSSrr, 0xF, 0x1) ||
            ProcessBLENDToMOV(X86::VMOVSDrr, 0xF, 0x3);
 
+  case X86::VPBLENDWrri:
+    // TODO: Add X86::VPBLENDWrmi handling
+    // TODO: Add X86::VPBLENDWYrri handling
+    // TODO: Add X86::VPBLENDWYrmi handling
+    return ProcessBLENDWToBLENDD(X86::VPBLENDDrri, 4);
+
   case X86::VPERMILPDri:
     return ProcessVPERMILPDri(X86::VSHUFPDrri);
   case X86::VPERMILPDYri:
diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
index 14e3767f65564..38ea796c0fcb0 100644
--- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll
+++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll
@@ -424,7 +424,7 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 04d7a9691b645..3aa77c3955c63 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -317,8 +317,8 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
 ; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVXVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVXVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm1, %xmm0, %xmm2
 ; AVXVNNI-NEXT:    vmovd %xmm2, %eax
 ; AVXVNNI-NEXT:    addl %edx, %eax
@@ -328,9 +328,9 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
 ; AVX512VNNI:       # %bb.0: # %entry
 ; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
 ; AVX512VNNI-NEXT:    vmovd %xmm2, %eax
@@ -343,8 +343,8 @@ define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) {
 ; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX512VLVNNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
-; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3]
+; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVNNI-NEXT:    vpdpbusd %xmm1, %xmm0, %xmm2
 ; AVX512VLVNNI-NEXT:    vmovd %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index dfae853f9961e..456e6e8f263aa 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -27,7 +27,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
 ; AVXVNNI-LABEL: mul_4xi8_zc:
 ; AVXVNNI:       # %bb.0: # %entry
 ; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVXVNNI-NEXT:    {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVXVNNI-NEXT:    vmovd %xmm1, %eax
 ; AVXVNNI-NEXT:    addl %edi, %eax
@@ -36,7 +36,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
 ; AVX512VNNI-LABEL: mul_4xi8_zc:
 ; AVX512VNNI:       # %bb.0: # %entry
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VNNI-NEXT:    vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
 ; AVX512VNNI-NEXT:    vmovd %xmm1, %eax
@@ -47,7 +47,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
 ; AVX512VLVNNI-LABEL: mul_4xi8_zc:
 ; AVX512VLVNNI:       # %bb.0: # %entry
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLVNNI-NEXT:    vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVX512VLVNNI-NEXT:    vmovd %xmm1, %eax
@@ -67,7 +67,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
 ; AVXVNNI-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVXVNNI-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVXVNNI-NEXT:    {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; AVXVNNI-NEXT:    vmovd %xmm1, %eax
 ; AVXVNNI-NEXT:    addl %edi, %eax
@@ -78,7 +78,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
 ; AVX512VNNI-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512VNNI-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VNNI-NEXT:    vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
 ; AVX512VNNI-NEXT:    vmovd %xmm1, %eax
@@ -107,7 +107,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
 ; AVXVNNI-LABEL: mul_4xi8_cs:
 ; AVXVNNI:       # %bb.0: # %entry
 ; AVXVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVXVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVXVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVXVNNI-NEXT:    vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVXVNNI-NEXT:    {vex} vpdpbusd %xmm0, %xmm2, %xmm1
 ; AVXVNNI-NEXT:    vmovd %xmm1, %eax
@@ -117,7 +117,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
 ; AVX512VNNI-LABEL: mul_4xi8_cs:
 ; AVX512VNNI:       # %bb.0: # %entry
 ; AVX512VNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VNNI-NEXT:    vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512VNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VNNI-NEXT:    vpdpbusd %zmm0, %zmm1, %zmm2
@@ -129,7 +129,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
 ; AVX512VLVNNI-LABEL: mul_4xi8_cs:
 ; AVX512VLVNNI:       # %bb.0: # %entry
 ; AVX512VLVNNI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVNNI-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512VLVNNI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512VLVNNI-NEXT:    vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
 ; AVX512VLVNNI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX512VLVNNI-NEXT:    vpdpbusd %xmm0, %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 84ae818d91832..05c855ed90b3f 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -1014,7 +1014,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -1023,7 +1023,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1038,7 +1038,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
 ; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
index 4898ae98faea2..983ae594e3ab1 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -112,7 +112,7 @@ define i64 @test_v4i64_v4i16(<4 x i64> %a0) {
 ; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
index 937ac3d2db885..d99b200385585 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll
@@ -231,7 +231,7 @@ define i32 @test_v4i32(<4 x i8> %a0) {
 ; AVX2-LABEL: test_v4i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %eax
 ; AVX2-NEXT:    retq
@@ -239,7 +239,7 @@ define i32 @test_v4i32(<4 x i8> %a0) {
 ; AVX512-LABEL: test_v4i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll
index 6cc0e1e73fcdb..aed4e023e340c 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll
@@ -1025,19 +1025,28 @@ define i8 @test_v4i8(<4 x i8> %a0) {
 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: test_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
+; AVX1-LABEL: test_v4i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    # kill: def $al killed $al killed $eax
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    # kill: def $al killed $al killed $eax
+; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: test_v4i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index ddd7f10168936..cacc43e96b6ea 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -1329,7 +1329,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-NEXT:    vzeroupper
@@ -1340,7 +1340,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -1351,7 +1351,7 @@ define void @vec128_v4i32_to_v1i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -2428,7 +2428,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX2-NEXT:    vzeroupper
@@ -2439,7 +2439,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -2450,7 +2450,7 @@ define void @vec256_v8i32_to_v1i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4996,7 +4996,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX2-SLOW-NEXT:    vpaddb (%rsi), %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
 ; AVX2-SLOW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3]
 ; AVX2-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
@@ -5063,7 +5063,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX512BW-SLOW-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} {z}
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX512BW-SLOW-NEXT:    vinserti32x4 $2, %xmm0, %zmm1, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -5282,7 +5282,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
@@ -5295,7 +5295,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
@@ -5308,7 +5308,7 @@ define void @vec384_v12i32_to_v1i384_factor12(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -7347,9 +7347,9 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm2, %ymm1
 ; AVX2-SLOW-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -7362,7 +7362,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -7376,7 +7376,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-FAST-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
@@ -7405,9 +7405,9 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512BW-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-SLOW-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
@@ -7419,7 +7419,7 @@ define void @vec512_v16i32_to_v2i256_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ; AVX512BW-FAST-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512BW-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-FAST-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-FAST-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
@@ -7491,7 +7491,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovaps %ymm1, 32(%rcx)
@@ -7504,7 +7504,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovaps 32(%rdx), %ymm1
 ; AVX512F-NEXT:    vmovaps %ymm1, 32(%rcx)
@@ -7517,7 +7517,7 @@ define void @vec512_v16i32_to_v1i512_factor16(ptr %in.vec.base.ptr, ptr %in.vec.
 ; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index ed53c3693c9dc..572ed314ab31d 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -4875,7 +4875,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
 ; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
 ; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX2-SLOW-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-SLOW-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
@@ -5068,7 +5068,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm1, %ymm1
 ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm1, (%rcx)
@@ -6847,7 +6847,7 @@ define void @vec512_i32_widen_to_i256_factor8_broadcast_to_v2i256_factor2(ptr %i
 ; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; AVX2-NEXT:    vpaddb 32(%rdx), %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-NEXT:    vmovdqa %ymm0, (%rcx)

From 3824a2dbcefe266849b9f8b3eaa1dd23354b15de Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Mon, 16 Jun 2025 11:48:55 +0200
Subject: [PATCH 576/851] [MemoryBuiltins] Support allocas in
 getInitialValueOfAllocation (NFC)

---
 llvm/lib/Analysis/MemoryBuiltins.cpp   | 3 +++
 llvm/lib/Transforms/IPO/Attributor.cpp | 2 --
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index 6b7a3e1ffe347..e0b7f65d18a30 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -428,6 +428,9 @@ llvm::getAllocSize(const CallBase *CB, const TargetLibraryInfo *TLI,
 Constant *llvm::getInitialValueOfAllocation(const Value *V,
                                             const TargetLibraryInfo *TLI,
                                             Type *Ty) {
+  if (isa<AllocaInst>(V))
+    return UndefValue::get(Ty);
+
   auto *Alloc = dyn_cast<CallBase>(V);
   if (!Alloc)
     return nullptr;
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 050eed376ed3f..dac1f7a30c370 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -242,8 +242,6 @@ Constant *
 AA::getInitialValueForObj(Attributor &A, const AbstractAttribute &QueryingAA,
                           Value &Obj, Type &Ty, const TargetLibraryInfo *TLI,
                           const DataLayout &DL, AA::RangeTy *RangePtr) {
-  if (isa<AllocaInst>(Obj))
-    return UndefValue::get(&Ty);
   if (Constant *Init = getInitialValueOfAllocation(&Obj, TLI, &Ty))
     return Init;
   auto *GV = dyn_cast<GlobalVariable>(&Obj);

From 299a55a88fae4fc423c440436b2632d2a6bd800a Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333@gmail.com>
Date: Mon, 16 Jun 2025 18:07:27 +0800
Subject: [PATCH 577/851] [InstCombine][Docs] Update InstCombine contributor
 guide (#144228)

Update the guideline to reduce the chance of miscompilation/performance
regression.

---------

Co-authored-by: Nikita Popov <github@npopov.com>
Co-authored-by: Antonio Frighetto <me@antoniofrighetto.com>
---
 llvm/docs/InstCombineContributorGuide.md | 39 ++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/llvm/docs/InstCombineContributorGuide.md b/llvm/docs/InstCombineContributorGuide.md
index b4041f8a5b93f..cee0a7ce446a6 100644
--- a/llvm/docs/InstCombineContributorGuide.md
+++ b/llvm/docs/InstCombineContributorGuide.md
@@ -404,11 +404,32 @@ The use of TargetTransformInfo is only allowed for hooks for target-specific
 intrinsics, such as `TargetTransformInfo::instCombineIntrinsic()`. These are
 already inherently target-dependent anyway.
 
+If some canonicalization narrow/widen the integer width of expressions, please
+check `shouldChangeType()` first. Otherwise, we may evaluate the expression 
+in illegal/inefficient types.
+
 For vector-specific transforms that require cost-modelling, the VectorCombine
 pass can be used instead. In very rare circumstances, if there are no other
 alternatives, target-dependent transforms may be accepted into
 AggressiveInstCombine.
 
+Generally, we prefer unsigned operations over signed operations in the middle-end, even
+if signed operations are more efficient on some targets. The following is an incomplete
+list of canonicalizations that are implemented in InstCombine:
+
+| Original Pattern             | Canonical Form             | Condition                     |
+|------------------------------|----------------------------|-------------------------------|
+| `icmp spred X, Y`            | `icmp samesign upred X, Y` | `sign(X) == sign(Y)`          |
+| `smin/smax X, Y`             | `umin/umax X, Y`           | `sign(X) == sign(Y)`          |
+| `sext X`                     | `zext nneg X`              | `X >=s 0`                     |
+| `sitofp X`                   | `uitofp nneg X`            | `X >=s 0`                     |
+| `ashr X, Y`                  | `lshr X, Y`                | `X >=s 0`                     |
+| `sdiv/srem X, Y`             | `udiv/urem X, Y`           | `X >=s 0 && Y >=s 0`          |
+| `add X, Y`                   | `or disjoint X, Y`         | `(X & Y) != 0`                |
+| `mul X, C`                   | `shl X, Log2(C)`           | `isPowerOf2(C)`               |
+| `select Cond1, Cond2, false` | `and Cond1, Cond2`         | `impliesPoison(Cond2, Cond1)` |
+| `select Cond1, true, Cond2`  | `or Cond1, Cond2`          | `impliesPoison(Cond2, Cond1)` |
+
 ### PatternMatch
 
 Many transforms make use of the matching infrastructure defined in
@@ -531,6 +552,19 @@ need to add a one-use check for the inner instruction.
 One-use checks can be performed using the `m_OneUse()` matcher, or the
 `V->hasOneUse()` method.
 
+### Flag handling
+
+When possible, favour propagation of poison-generating flags like `nuw` and `nsw` since they may be
+hard to salvage later. Avoid doing so if it introduces additional complexity (e.g. requires querying `willNotOverflow`
+or KnownBits).
+
+Be careful with in-place operand/predicate changes, as poison-generating flags may not be valid for new
+operands. It is recommended to create a new instruction with careful handling of flags. If not
+applicable, call `Instruction::dropPoisonGeneratingFlags()` to clear flags in a conservative manner.
+
+Do not rely on fcmp's `nsz` flag to perform optimizations. It is meaningless for fcmp so it should not affect
+the optimization.
+
 ### Generalization
 
 Transforms can both be too specific (only handling some odd subset of patterns,
@@ -558,6 +592,11 @@ guidelines.
    use of ValueTracking queries. Whether this makes sense depends on the case,
    but it's usually a good idea to only handle the constant pattern first, and
    then generalize later if it seems useful.
+ * When possible, handle more canonical patterns as well. It is encouraged to avoid
+   potential phase-ordering issues. For example, if the motivating transform holds for
+   `add`, it also holds for `or disjoint`. See the canonicalization list above for details.
+   In most cases, it can be easily implemented with matchers like
+   `m_AddLike/m_SExtLike/m_LogicalAnd/m_LogicalOr`.
 
 ## Guidelines for reviewers
 

From e00853859e89114d8db24aa0b863b618175f79c7 Mon Sep 17 00:00:00 2001
From: Rolf Morel <rolf.morel@intel.com>
Date: Mon, 16 Jun 2025 13:40:50 +0200
Subject: [PATCH 578/851] [MLIR][Transform] apply_registered_pass: support
 ListOptions (#144026)

Interpret an option value with multiple values, either in the form of an
`ArrayAttr` (either static or passed through a param) or as the multiple
attrs associated to a param, as a comma-separated list, i.e. as a
ListOption on a pass.
---
 .../mlir/Dialect/Transform/IR/TransformOps.td |   5 +-
 .../lib/Dialect/Transform/IR/TransformOps.cpp | 166 ++++++++++++------
 .../mlir/dialects/transform/__init__.py       |  41 ++---
 .../Transform/test-pass-application.mlir      | 145 ++++++++++++---
 mlir/test/python/dialects/transform.py        |  58 ++++--
 5 files changed, 301 insertions(+), 114 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index 0aa750e625436..62e66b3dabee8 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -418,11 +418,14 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass",
         with options = { "top-down" = false,
                          "max-iterations" = %max_iter,
                          "test-convergence" = true,
-                         "max-num-rewrites" =  %max_rewrites }
+                         "max-num-rewrites" = %max_rewrites }
         to %module
     : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op
     ```
 
+    Options' values which are `ArrayAttr`s are converted to comma-separated
+    lists of options. Likewise for params which associate multiple values.
+
     This op first looks for a pass pipeline with the specified name. If no such
     pipeline exists, it looks for a pass with the specified name. If no such
     pass exists either, this op fails definitely.
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index 582d082153bef..bb9bdd70625e4 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -788,46 +788,47 @@ transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter,
   // Obtain a single options-string to pass to the pass(-pipeline) from options
   // passed in as a dictionary of keys mapping to values which are either
   // attributes or param-operands pointing to attributes.
+  OperandRange dynamicOptions = getDynamicOptions();
 
   std::string options;
   llvm::raw_string_ostream optionsStream(options); // For "printing" attrs.
 
-  OperandRange dynamicOptions = getDynamicOptions();
-  for (auto [idx, namedAttribute] : llvm::enumerate(getOptions())) {
-    if (idx > 0)
-      optionsStream << " "; // Interleave options separator.
-    optionsStream << namedAttribute.getName().str(); // Append the key.
-    optionsStream << "="; // And the key-value separator.
-
-    Attribute valueAttrToAppend;
-    if (auto paramOperandIndex =
-            dyn_cast<transform::ParamOperandAttr>(namedAttribute.getValue())) {
-      // The corresponding value attribute is passed in via a param.
+  // A helper to convert an option's attribute value into a corresponding
+  // string representation, with the ability to obtain the attr(s) from a param.
+  std::function<void(Attribute)> appendValueAttr = [&](Attribute valueAttr) {
+    if (auto paramOperand = dyn_cast<transform::ParamOperandAttr>(valueAttr)) {
+      // The corresponding value attribute(s) is/are passed in via a param.
       // Obtain the param-operand via its specified index.
-      size_t dynamicOptionIdx = paramOperandIndex.getIndex().getInt();
+      size_t dynamicOptionIdx = paramOperand.getIndex().getInt();
       assert(dynamicOptionIdx < dynamicOptions.size() &&
-             "number of dynamic option markers (UnitAttr) in options ArrayAttr "
+             "the number of ParamOperandAttrs in the options DictionaryAttr"
              "should be the same as the number of options passed as params");
-      ArrayRef<Attribute> dynamicOption =
+      ArrayRef<Attribute> attrsAssociatedToParam =
           state.getParams(dynamicOptions[dynamicOptionIdx]);
-      if (dynamicOption.size() != 1)
-        return emitSilenceableError()
-               << "options passed as a param must have "
-                  "a single value associated, param "
-               << dynamicOptionIdx << " associates " << dynamicOption.size();
-      valueAttrToAppend = dynamicOption[0];
-    } else {
-      // Value is a static attribute.
-      valueAttrToAppend = namedAttribute.getValue();
-    }
-
-    // Append string representation of value attribute.
-    if (auto strAttr = dyn_cast<StringAttr>(valueAttrToAppend)) {
+      // Recursive so as to append all attrs associated to the param.
+      llvm::interleave(attrsAssociatedToParam, optionsStream, appendValueAttr,
+                       ",");
+    } else if (auto arrayAttr = dyn_cast<ArrayAttr>(valueAttr)) {
+      // Recursive so as to append all nested attrs of the array.
+      llvm::interleave(arrayAttr, optionsStream, appendValueAttr, ",");
+    } else if (auto strAttr = dyn_cast<StringAttr>(valueAttr)) {
+      // Convert to unquoted string.
       optionsStream << strAttr.getValue().str();
     } else {
-      valueAttrToAppend.print(optionsStream, /*elideType=*/true);
+      // For all other attributes, ask the attr to print itself (without type).
+      valueAttr.print(optionsStream, /*elideType=*/true);
     }
-  }
+  };
+
+  // Convert the options DictionaryAttr into a single string.
+  llvm::interleave(
+      getOptions(), optionsStream,
+      [&](auto namedAttribute) {
+        optionsStream << namedAttribute.getName().str(); // Append the key.
+        optionsStream << "="; // And the key-value separator.
+        appendValueAttr(namedAttribute.getValue()); // And the attr's str repr.
+      },
+      " ");
   optionsStream.flush();
 
   // Get pass or pass pipeline from registry.
@@ -878,23 +879,30 @@ static ParseResult parseApplyRegisteredPassOptions(
     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &dynamicOptions) {
   // Construct the options DictionaryAttr per a `{ key = value, ... }` syntax.
   SmallVector<NamedAttribute> keyValuePairs;
-
   size_t dynamicOptionsIdx = 0;
-  auto parseKeyValuePair = [&]() -> ParseResult {
-    // Parse items of the form `key = value` where `key` is a bare identifier or
-    // a string and `value` is either an attribute or an operand.
 
-    std::string key;
-    Attribute valueAttr;
-    if (parser.parseOptionalKeywordOrString(&key))
-      return parser.emitError(parser.getCurrentLocation())
-             << "expected key to either be an identifier or a string";
-    if (key.empty())
-      return failure();
+  // Helper for allowing parsing of option values which can be of the form:
+  // - a normal attribute
+  // - an operand (which would be converted to an attr referring to the operand)
+  // - ArrayAttrs containing the foregoing (in correspondence with ListOptions)
+  std::function<ParseResult(Attribute &)> parseValue =
+      [&](Attribute &valueAttr) -> ParseResult {
+    // Allow for array syntax, e.g. `[0 : i64, %param, true, %other_param]`:
+    if (succeeded(parser.parseOptionalLSquare())) {
+      SmallVector<Attribute> attrs;
 
-    if (parser.parseEqual())
-      return parser.emitError(parser.getCurrentLocation())
-             << "expected '=' after key in key-value pair";
+      // Recursively parse the array's elements, which might be operands.
+      if (parser.parseCommaSeparatedList(
+              AsmParser::Delimiter::None,
+              [&]() -> ParseResult { return parseValue(attrs.emplace_back()); },
+              " in options dictionary") ||
+          parser.parseRSquare())
+        return failure(); // NB: Attempted parse should've output error message.
+
+      valueAttr = ArrayAttr::get(parser.getContext(), attrs);
+
+      return success();
+    }
 
     // Parse the value, which can be either an attribute or an operand.
     OptionalParseResult parsedValueAttr =
@@ -903,9 +911,7 @@ static ParseResult parseApplyRegisteredPassOptions(
       OpAsmParser::UnresolvedOperand operand;
       ParseResult parsedOperand = parser.parseOperand(operand);
       if (failed(parsedOperand))
-        return parser.emitError(parser.getCurrentLocation())
-               << "expected a valid attribute or operand as value associated "
-               << "to key '" << key << "'";
+        return failure(); // NB: Attempted parse should've output error message.
       // To make use of the operand, we need to store it in the options dict.
       // As SSA-values cannot occur in attributes, what we do instead is store
       // an attribute in its place that contains the index of the param-operand,
@@ -924,7 +930,30 @@ static ParseResult parseApplyRegisteredPassOptions(
              << "in the generic print format";
     }
 
+    return success();
+  };
+
+  // Helper for `key = value`-pair parsing where `key` is a bare identifier or a
+  // string and `value` looks like either an attribute or an operand-in-an-attr.
+  std::function<ParseResult()> parseKeyValuePair = [&]() -> ParseResult {
+    std::string key;
+    Attribute valueAttr;
+
+    if (failed(parser.parseOptionalKeywordOrString(&key)) || key.empty())
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected key to either be an identifier or a string";
+
+    if (failed(parser.parseEqual()))
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected '=' after key in key-value pair";
+
+    if (failed(parseValue(valueAttr)))
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected a valid attribute or operand as value associated "
+             << "to key '" << key << "'";
+
     keyValuePairs.push_back(NamedAttribute(key, valueAttr));
+
     return success();
   };
 
@@ -951,16 +980,27 @@ static void printApplyRegisteredPassOptions(OpAsmPrinter &printer,
   if (options.empty())
     return;
 
-  printer << "{";
-  llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) {
-    printer << namedAttribute.getName() << " = ";
-    Attribute value = namedAttribute.getValue();
-    if (auto indexAttr = dyn_cast<transform::ParamOperandAttr>(value)) {
+  std::function<void(Attribute)> printOptionValue = [&](Attribute valueAttr) {
+    if (auto paramOperandAttr =
+            dyn_cast<transform::ParamOperandAttr>(valueAttr)) {
       // Resolve index of param-operand to its actual SSA-value and print that.
-      printer.printOperand(dynamicOptions[indexAttr.getIndex().getInt()]);
+      printer.printOperand(
+          dynamicOptions[paramOperandAttr.getIndex().getInt()]);
+    } else if (auto arrayAttr = dyn_cast<ArrayAttr>(valueAttr)) {
+      // This case is so that ArrayAttr-contained operands are pretty-printed.
+      printer << "[";
+      llvm::interleaveComma(arrayAttr, printer, printOptionValue);
+      printer << "]";
     } else {
-      printer.printAttribute(value);
+      printer.printAttribute(valueAttr);
     }
+  };
+
+  printer << "{";
+  llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) {
+    printer << namedAttribute.getName();
+    printer << " = ";
+    printOptionValue(namedAttribute.getValue());
   });
   printer << "}";
 }
@@ -970,9 +1010,11 @@ LogicalResult transform::ApplyRegisteredPassOp::verify() {
   // and references to dynamic options in the options dictionary.
 
   auto dynamicOptions = SmallVector<Value>(getDynamicOptions());
-  for (NamedAttribute namedAttr : getOptions())
-    if (auto paramOperand =
-            dyn_cast<transform::ParamOperandAttr>(namedAttr.getValue())) {
+
+  // Helper for option values to mark seen operands as having been seen (once).
+  std::function<LogicalResult(Attribute)> checkOptionValue =
+      [&](Attribute valueAttr) -> LogicalResult {
+    if (auto paramOperand = dyn_cast<transform::ParamOperandAttr>(valueAttr)) {
       size_t dynamicOptionIdx = paramOperand.getIndex().getInt();
       if (dynamicOptionIdx < 0 || dynamicOptionIdx >= dynamicOptions.size())
         return emitOpError()
@@ -983,8 +1025,20 @@ LogicalResult transform::ApplyRegisteredPassOp::verify() {
         return emitOpError() << "dynamic option index " << dynamicOptionIdx
                              << " is already used in options";
       dynamicOptions[dynamicOptionIdx] = nullptr; // Mark this option as used.
+    } else if (auto arrayAttr = dyn_cast<ArrayAttr>(valueAttr)) {
+      // Recurse into ArrayAttrs as they may contain references to operands.
+      for (auto eltAttr : arrayAttr)
+        if (failed(checkOptionValue(eltAttr)))
+          return failure();
     }
+    return success();
+  };
+
+  for (NamedAttribute namedAttr : getOptions())
+    if (failed(checkOptionValue(namedAttr.getValue())))
+      return failure();
 
+  // All dynamicOptions-params seen in the dict will have been set to null.
   for (Value dynamicOption : dynamicOptions)
     if (dynamicOption)
       return emitOpError() << "a param operand does not have a corresponding "
diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py
index bfe96b1b3e5d4..b075919d1ef0f 100644
--- a/mlir/python/mlir/dialects/transform/__init__.py
+++ b/mlir/python/mlir/dialects/transform/__init__.py
@@ -219,6 +219,11 @@ def __init__(
         super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip)
 
 
+OptionValueTypes = Union[
+    Sequence["OptionValueTypes"], Attribute, Value, Operation, OpView, str, int, bool
+]
+
+
 @_ods_cext.register_operation(_Dialect, replace=True)
 class ApplyRegisteredPassOp(ApplyRegisteredPassOp):
     def __init__(
@@ -227,12 +232,7 @@ def __init__(
         target: Union[Operation, Value, OpView],
         pass_name: Union[str, StringAttr],
         *,
-        options: Optional[
-            Dict[
-                Union[str, StringAttr],
-                Union[Attribute, Value, Operation, OpView, str, int, bool],
-            ]
-        ] = None,
+        options: Optional[Dict[Union[str, StringAttr], OptionValueTypes]] = None,
         loc=None,
         ip=None,
     ):
@@ -243,26 +243,32 @@ def __init__(
         context = (loc and loc.context) or Context.current
 
         cur_param_operand_idx = 0
-        for key, value in options.items() if options is not None else {}:
-            if isinstance(key, StringAttr):
-                key = key.value
 
+        def option_value_to_attr(value):
+            nonlocal cur_param_operand_idx
             if isinstance(value, (Value, Operation, OpView)):
                 dynamic_options.append(_get_op_result_or_value(value))
-                options_dict[key] = ParamOperandAttr(cur_param_operand_idx, context)
                 cur_param_operand_idx += 1
+                return ParamOperandAttr(cur_param_operand_idx - 1, context)
             elif isinstance(value, Attribute):
-                options_dict[key] = value
+                return value
             # The following cases auto-convert Python values to attributes.
             elif isinstance(value, bool):
-                options_dict[key] = BoolAttr.get(value)
+                return BoolAttr.get(value)
             elif isinstance(value, int):
                 default_int_type = IntegerType.get_signless(64, context)
-                options_dict[key] = IntegerAttr.get(default_int_type, value)
+                return IntegerAttr.get(default_int_type, value)
             elif isinstance(value, str):
-                options_dict[key] = StringAttr.get(value)
+                return StringAttr.get(value)
+            elif isinstance(value, Sequence):
+                return ArrayAttr.get([option_value_to_attr(elt) for elt in value])
             else:
                 raise TypeError(f"Unsupported option type: {type(value)}")
+
+        for key, value in options.items() if options is not None else {}:
+            if isinstance(key, StringAttr):
+                key = key.value
+            options_dict[key] = option_value_to_attr(value)
         super().__init__(
             result,
             _get_op_result_or_value(target),
@@ -279,12 +285,7 @@ def apply_registered_pass(
     target: Union[Operation, Value, OpView],
     pass_name: Union[str, StringAttr],
     *,
-    options: Optional[
-        Dict[
-            Union[str, StringAttr],
-            Union[Attribute, Value, Operation, OpView, str, int, bool],
-        ]
-    ] = None,
+    options: Optional[Dict[Union[str, StringAttr], OptionValueTypes]] = None,
     loc=None,
     ip=None,
 ) -> Value:
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 1d1be9eda3496..ce8f69c58701d 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -164,6 +164,128 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// CHECK-LABEL: func private @valid_multiple_values_as_list_option_single_param()
+module {
+  func.func @valid_multiple_values_as_list_option_single_param() {
+    return
+  }
+
+  // CHECK: func @a()
+  func.func @a() {
+    return
+  }
+  // CHECK: func @b()
+  func.func @b() {
+    return
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op
+    %symbol_a = transform.param.constant "a" -> !transform.any_param
+    %symbol_b = transform.param.constant "b" -> !transform.any_param
+    %multiple_symbol_names = transform.merge_handles %symbol_a, %symbol_b : !transform.any_param
+    transform.apply_registered_pass "symbol-privatize"
+        with options = { exclude = %multiple_symbol_names } to %2
+        : (!transform.any_op, !transform.any_param) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func private @valid_array_attr_as_list_option()
+module {
+  func.func @valid_array_attr_as_list_option() {
+    return
+  }
+
+  // CHECK: func @a()
+  func.func @a() {
+    return
+  }
+  // CHECK: func @b()
+  func.func @b() {
+    return
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "symbol-privatize"
+        with options = { exclude = ["a", "b"] } to %2
+        : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func private @valid_array_attr_param_as_list_option()
+module {
+  func.func @valid_array_attr_param_as_list_option() {
+    return
+  }
+
+  // CHECK: func @a()
+  func.func @a() {
+    return
+  }
+  // CHECK: func @b()
+  func.func @b() {
+    return
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op
+    %multiple_symbol_names = transform.param.constant ["a","b"] -> !transform.any_param
+    transform.apply_registered_pass "symbol-privatize"
+        with options = { exclude = %multiple_symbol_names } to %2
+        : (!transform.any_op, !transform.any_param) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func private @valid_multiple_params_as_single_list_option()
+module {
+  func.func @valid_multiple_params_as_single_list_option() {
+    return
+  }
+
+  // CHECK: func @a()
+  func.func @a() {
+    return
+  }
+  // CHECK: func @b()
+  func.func @b() {
+    return
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.get_parent_op %1 { deduplicate } : (!transform.any_op) -> !transform.any_op
+    %symbol_a = transform.param.constant "a" -> !transform.any_param
+    %symbol_b = transform.param.constant "b" -> !transform.any_param
+    transform.apply_registered_pass "symbol-privatize"
+        with options = { exclude = [%symbol_a, %symbol_b] } to %2
+        : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 func.func @invalid_options_as_str() {
   return
 }
@@ -203,7 +325,8 @@ func.func @invalid_options_due_to_reserved_attr() {
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    // expected-error @+2 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}}
+    // expected-error @+3 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}}
+    // expected-error @+2 {{expected a valid attribute or operand as value associated to key 'top-down'}}
     %2 = transform.apply_registered_pass "canonicalize"
         with options = { "top-down" = #transform.param_operand<index=0> } to %1 : (!transform.any_op) -> !transform.any_op
     transform.yield
@@ -262,26 +385,6 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
-func.func @too_many_pass_option_params() {
-  return
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
-    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %x = transform.param.constant true -> !transform.any_param
-    %y = transform.param.constant false -> !transform.any_param
-    %topdown_options = transform.merge_handles %x, %y : !transform.any_param
-    // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}}
-    transform.apply_registered_pass "canonicalize"
-        with options = { "top-down" = %topdown_options } to %1
-        : (!transform.any_op, !transform.any_param) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
 module attributes {transform.with_named_sequence} {
   // expected-error @below {{trying to schedule a pass on an unsupported operation}}
   // expected-note @below {{target op}}
diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py
index eeb95605d7a9a..6c5e4e5505b1c 100644
--- a/mlir/test/python/dialects/transform.py
+++ b/mlir/test/python/dialects/transform.py
@@ -256,30 +256,45 @@ def testReplicateOp(module: Module):
     # CHECK: %{{.*}} = replicate num(%[[FIRST]]) %[[SECOND]]
 
 
+# CHECK-LABEL: TEST: testApplyRegisteredPassOp
 @run
 def testApplyRegisteredPassOp(module: Module):
+    # CHECK: transform.sequence
     sequence = transform.SequenceOp(
         transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get()
     )
     with InsertionPoint(sequence.body):
+        # CHECK:   %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op
         mod = transform.ApplyRegisteredPassOp(
             transform.AnyOpType.get(), sequence.bodyTarget, "canonicalize"
         )
+        # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
+        # CHECK-SAME:    with options = {"top-down" = false}
+        # CHECK-SAME:    to {{.*}} : (!transform.any_op) -> !transform.any_op
         mod = transform.ApplyRegisteredPassOp(
             transform.AnyOpType.get(),
             mod.result,
             "canonicalize",
             options={"top-down": BoolAttr.get(False)},
         )
+        # CHECK:   %[[MAX_ITER:.+]] = transform.param.constant
         max_iter = transform.param_constant(
             transform.AnyParamType.get(),
             IntegerAttr.get(IntegerType.get_signless(64), 10),
         )
+        # CHECK:   %[[MAX_REWRITE:.+]] = transform.param.constant
         max_rewrites = transform.param_constant(
             transform.AnyParamType.get(),
             IntegerAttr.get(IntegerType.get_signless(64), 1),
         )
-        transform.apply_registered_pass(
+        # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
+        # NB: MLIR has sorted the dict lexicographically by key:
+        # CHECK-SAME:    with options = {"max-iterations" = %[[MAX_ITER]],
+        # CHECK-SAME:                    "max-rewrites" =  %[[MAX_REWRITE]],
+        # CHECK-SAME:                    "test-convergence" = true,
+        # CHECK-SAME:                    "top-down" = false}
+        # CHECK-SAME:    to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
+        mod = transform.apply_registered_pass(
             transform.AnyOpType.get(),
             mod,
             "canonicalize",
@@ -290,19 +305,30 @@ def testApplyRegisteredPassOp(module: Module):
                 "max-rewrites": max_rewrites,
             },
         )
+        # CHECK:   %{{.*}} = apply_registered_pass "symbol-privatize"
+        # CHECK-SAME:    with options = {"exclude" = ["a", "b"]}
+        # CHECK-SAME:    to %{{.*}} : (!transform.any_op) -> !transform.any_op
+        mod = transform.apply_registered_pass(
+            transform.AnyOpType.get(),
+            mod,
+            "symbol-privatize",
+            options={"exclude": ("a", "b")},
+        )
+        # CHECK:   %[[SYMBOL_A:.+]] = transform.param.constant
+        symbol_a = transform.param_constant(
+            transform.AnyParamType.get(), StringAttr.get("a")
+        )
+        # CHECK:   %[[SYMBOL_B:.+]] = transform.param.constant
+        symbol_b = transform.param_constant(
+            transform.AnyParamType.get(), StringAttr.get("b")
+        )
+        # CHECK:   %{{.*}} = apply_registered_pass "symbol-privatize"
+        # CHECK-SAME:    with options = {"exclude" = [%[[SYMBOL_A]], %[[SYMBOL_B]]]}
+        # CHECK-SAME:    to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op
+        mod = transform.apply_registered_pass(
+            transform.AnyOpType.get(),
+            mod,
+            "symbol-privatize",
+            options={"exclude": (symbol_a, symbol_b)},
+        )
         transform.YieldOp()
-    # CHECK-LABEL: TEST: testApplyRegisteredPassOp
-    # CHECK: transform.sequence
-    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op
-    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
-    # CHECK-SAME:    with options = {"top-down" = false}
-    # CHECK-SAME:    to {{.*}} : (!transform.any_op) -> !transform.any_op
-    # CHECK:   %[[MAX_ITER:.+]] = transform.param.constant
-    # CHECK:   %[[MAX_REWRITE:.+]] = transform.param.constant
-    # CHECK:   %{{.*}} = apply_registered_pass "canonicalize"
-    # NB: MLIR has sorted the dict lexicographically by key:
-    # CHECK-SAME:    with options = {"max-iterations" = %[[MAX_ITER]],
-    # CHECK-SAME:                    "max-rewrites" =  %[[MAX_REWRITE]],
-    # CHECK-SAME:                    "test-convergence" = true,
-    # CHECK-SAME:                    "top-down" = false}
-    # CHECK-SAME:    to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op

From ddea4fe85a01f645a1c5e2c4a8ea607a85cf986f Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 07:48:50 -0400
Subject: [PATCH 579/851] Fix some "not all control paths return" warnings; NFC

---
 clang-tools-extra/clang-doc/Representation.cpp | 1 +
 clang-tools-extra/clang-doc/Serialize.cpp      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/clang-tools-extra/clang-doc/Representation.cpp b/clang-tools-extra/clang-doc/Representation.cpp
index 820d644ef8b83..71a926f1c73e0 100644
--- a/clang-tools-extra/clang-doc/Representation.cpp
+++ b/clang-tools-extra/clang-doc/Representation.cpp
@@ -147,6 +147,7 @@ mergeInfos(std::vector<std::unique_ptr<Info>> &Values) {
     return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                    "unexpected info type");
   }
+  llvm_unreachable("unhandled enumerator");
 }
 
 bool CommentInfo::operator==(const CommentInfo &Other) const {
diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp
index e8f1a9cee2675..820e8bfd8e644 100644
--- a/clang-tools-extra/clang-doc/Serialize.cpp
+++ b/clang-tools-extra/clang-doc/Serialize.cpp
@@ -392,6 +392,7 @@ std::string serialize(std::unique_ptr<Info> &I) {
   case InfoType::IT_default:
     return "";
   }
+  llvm_unreachable("unhandled enumerator");
 }
 
 static void parseFullComment(const FullComment *C, CommentInfo &CI) {

From 4f7b5e6d8327f8cea41ba31fdbbb0ee9c1f754c3 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 07:49:20 -0400
Subject: [PATCH 580/851] Fix a tablegen pattern that results in a warning; NFC

We were generating `1 || 1` which caused some issues for -Werror builds
---
 clang/utils/TableGen/ClangAttrEmitter.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index 42627f02cf356..f892626a447e5 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -3739,7 +3739,8 @@ static void GenerateHasAttrSpellingStringSwitch(
                       : '(' + itostr(Version) + ')';
 
     if (Scope.empty() || Scope == Spelling.nameSpace()) {
-      if (TestStringMap.contains(Spelling.name()))
+      if (TestStringMap.contains(Spelling.name()) &&
+          TestStringMap[Spelling.name()] != TestStr)
         TestStringMap[Spelling.name()] += " || " + TestStr;
       else
         TestStringMap[Spelling.name()] = TestStr;

From 01f9dff61fb028f69493a44616014256dee5fb2a Mon Sep 17 00:00:00 2001
From: Kiran Chandramohan <kiran.chandramohan@arm.com>
Date: Mon, 16 Jun 2025 13:10:45 +0100
Subject: [PATCH 581/851] [Flang] Add llvm-profdata to list of tools to be
 built (#144325)

Fixes #144179
---
 flang/test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt
index a658f6f984faf..8520bec646971 100644
--- a/flang/test/CMakeLists.txt
+++ b/flang/test/CMakeLists.txt
@@ -73,6 +73,7 @@ if (NOT FLANG_STANDALONE_BUILD)
     not
     llvm-dis
     llvm-objdump
+    llvm-profdata
     llvm-readobj
     split-file
   )

From 329dfa16564da74451d26b601cab2d8af0e5f4d3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 16 Jun 2025 14:37:34 +0100
Subject: [PATCH 582/851] [X86] fixup-blend.ll - add commuted load test
 coverage

---
 llvm/test/CodeGen/X86/fixup-blend.ll | 208 ++++++++++++++++++++++++---
 1 file changed, 187 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/X86/fixup-blend.ll b/llvm/test/CodeGen/X86/fixup-blend.ll
index 3126e4823bee6..d64dd6d3114a6 100644
--- a/llvm/test/CodeGen/X86/fixup-blend.ll
+++ b/llvm/test/CodeGen/X86/fixup-blend.ll
@@ -59,21 +59,45 @@ define <2 x double> @test_v2f64_blend_movsd_optsize(<2 x double> %a0, <2 x doubl
   ret <2 x double> %r
 }
 
-define <2 x double> @test_v2f64_blend_movsd_load(<2 x double> %a0, ptr %p1, <2 x double> %a2) {
+define <2 x double> @test_v2f64_blend_movsd_load(ptr %p0, <2 x double> %a1, <2 x double> %a2) {
 ; SSE2-LABEL: test_v2f64_blend_movsd_load:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE2-NEXT:    addpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v2f64_blend_movsd_load:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE4-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE4-NEXT:    addpd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: test_v2f64_blend_movsd_load:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a0 = load <2 x double>, ptr %p0
+  %s = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 1>
+  %r = fadd <2 x double> %s, %a2
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_v2f64_blend_movsd_load_commute(<2 x double> %a0, ptr %p1, <2 x double> %a2) {
+; SSE2-LABEL: test_v2f64_blend_movsd_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT:    addpd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2f64_blend_movsd_load_commute:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE4-NEXT:    addpd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v2f64_blend_movsd_load_commute:
+; AVX:       # %bb.0:
 ; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = mem[0],xmm0[1]
 ; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
@@ -178,27 +202,57 @@ define <2 x i64> @test_v2i64_blend_movsd_optsize(<2 x i64> %a0, <2 x i64> %a1, <
   ret <2 x i64> %r
 }
 
-define <2 x i64> @test_v2i64_blend_movsd_load(<2 x i64> %a0, ptr %p1, <2 x i64> %a2) {
+define <2 x i64> @test_v2i64_blend_movsd_load(ptr %p0, <2 x i64> %a1, <2 x i64> %a2) {
 ; SSE2-LABEL: test_v2i64_blend_movsd_load:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v2i64_blend_movsd_load:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; SSE4-NEXT:    paddq %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v2i64_blend_movsd_load:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_v2i64_blend_movsd_load:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a0 = load <2 x i64>, ptr %p0
+  %s = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 2, i32 1>
+  %r = add <2 x i64> %s, %a2
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @test_v2i64_blend_movsd_load_commute(<2 x i64> %a0, ptr %p1, <2 x i64> %a2) {
+; SSE2-LABEL: test_v2i64_blend_movsd_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    paddq %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v2i64_blend_movsd_load_commute:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddq %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v2i64_blend_movsd_load_commute:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v2i64_blend_movsd_load_commute:
+; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
@@ -338,21 +392,47 @@ define <4 x float> @test_v4f32_blend_movsd_optsize(<4 x float> %a0, <4 x float>
   ret <4 x float> %r
 }
 
-define <4 x float> @test_v4f32_blend_movss_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+define <4 x float> @test_v4f32_blend_movss_load(ptr %p0, <4 x float> %a1, <4 x float> %a2) {
 ; SSE2-LABEL: test_v4f32_blend_movss_load:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm2
+; SSE2-NEXT:    movaps %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4f32_blend_movss_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; SSE4-NEXT:    addps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movss_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a0 = load <4 x float>, ptr %p0
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movss_load_commute(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movss_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps (%rdi), %xmm2
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
-; SSE4-LABEL: test_v4f32_blend_movss_load:
+; SSE4-LABEL: test_v4f32_blend_movss_load_commute:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; SSE4-NEXT:    addps %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f32_blend_movss_load:
+; AVX-LABEL: test_v4f32_blend_movss_load_commute:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
@@ -363,21 +443,45 @@ define <4 x float> @test_v4f32_blend_movss_load(<4 x float> %a0, ptr %p1, <4 x f
   ret <4 x float> %r
 }
 
-define <4 x float> @test_v4f32_blend_movsd_load(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+define <4 x float> @test_v4f32_blend_movsd_load(ptr %p0, <4 x float> %a1, <4 x float> %a2) {
 ; SSE2-LABEL: test_v4f32_blend_movsd_load:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE2-NEXT:    addps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v4f32_blend_movsd_load:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
 ; SSE4-NEXT:    addps %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX-LABEL: test_v4f32_blend_movsd_load:
 ; AVX:       # %bb.0:
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a0 = load <4 x float>, ptr %p0
+  %s = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = fadd <4 x float> %s, %a2
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_v4f32_blend_movsd_load_commute(<4 x float> %a0, ptr %p1, <4 x float> %a2) {
+; SSE2-LABEL: test_v4f32_blend_movsd_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    addps %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4f32_blend_movsd_load_commute:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    blendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE4-NEXT:    addps %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: test_v4f32_blend_movsd_load_commute:
+; AVX:       # %bb.0:
 ; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
@@ -580,27 +684,59 @@ define <4 x i32> @test_v4i32_blend_movsd_optsize(<4 x i32> %a0, <4 x i32> %a1, <
   ret <4 x i32> %r
 }
 
-define <4 x i32> @test_v4i32_blend_movss_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+define <4 x i32> @test_v4i32_blend_movss_load(ptr %p0, <4 x i32> %a1, <4 x i32> %a2) {
 ; SSE2-LABEL: test_v4i32_blend_movss_load:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movss_load:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; SSE4-NEXT:    paddd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movss_load:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movss_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a0 = load <4 x i32>, ptr %p0
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movss_load_commute(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movss_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movaps (%rdi), %xmm2
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
-; SSE4-LABEL: test_v4i32_blend_movss_load:
+; SSE4-LABEL: test_v4i32_blend_movss_load_commute:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
 ; SSE4-NEXT:    paddd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
-; AVX1-LABEL: test_v4i32_blend_movss_load:
+; AVX1-LABEL: test_v4i32_blend_movss_load_commute:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v4i32_blend_movss_load:
+; AVX2-LABEL: test_v4i32_blend_movss_load_commute:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
@@ -611,27 +747,57 @@ define <4 x i32> @test_v4i32_blend_movss_load(<4 x i32> %a0, ptr %p1, <4 x i32>
   ret <4 x i32> %r
 }
 
-define <4 x i32> @test_v4i32_blend_movsd_load(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+define <4 x i32> @test_v4i32_blend_movsd_load(ptr %p0, <4 x i32> %a1, <4 x i32> %a2) {
 ; SSE2-LABEL: test_v4i32_blend_movsd_load:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: test_v4i32_blend_movsd_load:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; SSE4-NEXT:    paddd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test_v4i32_blend_movsd_load:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_v4i32_blend_movsd_load:
 ; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
+; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+  %a0 = load <4 x i32>, ptr %p0
+  %s = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  %r = add <4 x i32> %s, %a2
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_v4i32_blend_movsd_load_commute(<4 x i32> %a0, ptr %p1, <4 x i32> %a2) {
+; SSE2-LABEL: test_v4i32_blend_movsd_load_commute:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: test_v4i32_blend_movsd_load_commute:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; SSE4-NEXT:    paddd %xmm1, %xmm0
+; SSE4-NEXT:    retq
+;
+; AVX1-LABEL: test_v4i32_blend_movsd_load_commute:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_v4i32_blend_movsd_load_commute:
+; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq

From d57b86701a7b5bf7d98fea032f33e726b2abb424 Mon Sep 17 00:00:00 2001
From: Douglas Yung <douglas.yung@sony.com>
Date: Mon, 16 Jun 2025 14:14:56 +0000
Subject: [PATCH 583/851] Revert "AArch64: Move AArch64MCExpr functions to
 AArch64MCAsmInfo"

This reverts commit 4ea616d072d126a31149174ca2efdbdace9ce568.

This change is causing buildbot failures on MacOS:
 - https://lab.llvm.org/buildbot/#/builders/190/builds/21510
 - http://45.33.8.238/macm1/108620/step_10.txt
---
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 138 ------------------
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.h   |  23 ---
 .../AArch64/MCTargetDesc/AArch64MCExpr.cpp    |  97 +++++++++++-
 .../AArch64/MCTargetDesc/AArch64MCExpr.h      |  14 +-
 .../AArch64WinCOFFObjectWriter.cpp            |   6 +-
 5 files changed, 112 insertions(+), 166 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index a82896dbe0d6c..31965d85d9eb4 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -15,7 +15,6 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"
 using namespace llvm;
@@ -54,80 +53,6 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
     {AArch64MCExpr::M_TLVPPAGEOFF, "TLVPPAGEOFF"},
 };
 
-StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) {
-  // clang-format off
-  switch (static_cast<uint32_t>(Expr.getSpecifier())) {
-  case AArch64MCExpr::VK_CALL:                return "";
-  case AArch64MCExpr::VK_LO12:                return ":lo12:";
-  case AArch64MCExpr::VK_ABS_G3:              return ":abs_g3:";
-  case AArch64MCExpr::VK_ABS_G2:              return ":abs_g2:";
-  case AArch64MCExpr::VK_ABS_G2_S:            return ":abs_g2_s:";
-  case AArch64MCExpr::VK_ABS_G2_NC:           return ":abs_g2_nc:";
-  case AArch64MCExpr::VK_ABS_G1:              return ":abs_g1:";
-  case AArch64MCExpr::VK_ABS_G1_S:            return ":abs_g1_s:";
-  case AArch64MCExpr::VK_ABS_G1_NC:           return ":abs_g1_nc:";
-  case AArch64MCExpr::VK_ABS_G0:              return ":abs_g0:";
-  case AArch64MCExpr::VK_ABS_G0_S:            return ":abs_g0_s:";
-  case AArch64MCExpr::VK_ABS_G0_NC:           return ":abs_g0_nc:";
-  case AArch64MCExpr::VK_PREL_G3:             return ":prel_g3:";
-  case AArch64MCExpr::VK_PREL_G2:             return ":prel_g2:";
-  case AArch64MCExpr::VK_PREL_G2_NC:          return ":prel_g2_nc:";
-  case AArch64MCExpr::VK_PREL_G1:             return ":prel_g1:";
-  case AArch64MCExpr::VK_PREL_G1_NC:          return ":prel_g1_nc:";
-  case AArch64MCExpr::VK_PREL_G0:             return ":prel_g0:";
-  case AArch64MCExpr::VK_PREL_G0_NC:          return ":prel_g0_nc:";
-  case AArch64MCExpr::VK_DTPREL_G2:           return ":dtprel_g2:";
-  case AArch64MCExpr::VK_DTPREL_G1:           return ":dtprel_g1:";
-  case AArch64MCExpr::VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
-  case AArch64MCExpr::VK_DTPREL_G0:           return ":dtprel_g0:";
-  case AArch64MCExpr::VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
-  case AArch64MCExpr::VK_DTPREL_HI12:         return ":dtprel_hi12:";
-  case AArch64MCExpr::VK_DTPREL_LO12:         return ":dtprel_lo12:";
-  case AArch64MCExpr::VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
-  case AArch64MCExpr::VK_TPREL_G2:            return ":tprel_g2:";
-  case AArch64MCExpr::VK_TPREL_G1:            return ":tprel_g1:";
-  case AArch64MCExpr::VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
-  case AArch64MCExpr::VK_TPREL_G0:            return ":tprel_g0:";
-  case AArch64MCExpr::VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
-  case AArch64MCExpr::VK_TPREL_HI12:          return ":tprel_hi12:";
-  case AArch64MCExpr::VK_TPREL_LO12:          return ":tprel_lo12:";
-  case AArch64MCExpr::VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
-  case AArch64MCExpr::VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
-  case AArch64MCExpr::VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
-  case AArch64MCExpr::VK_ABS_PAGE:            return "";
-  case AArch64MCExpr::VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
-  case AArch64MCExpr::VK_GOT:                 return ":got:";
-  case AArch64MCExpr::VK_GOT_PAGE:            return ":got:";
-  case AArch64MCExpr::VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
-  case AArch64MCExpr::VK_GOT_LO12:            return ":got_lo12:";
-  case AArch64MCExpr::VK_GOTTPREL:            return ":gottprel:";
-  case AArch64MCExpr::VK_GOTTPREL_PAGE:       return ":gottprel:";
-  case AArch64MCExpr::VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
-  case AArch64MCExpr::VK_GOTTPREL_G1:         return ":gottprel_g1:";
-  case AArch64MCExpr::VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
-  case AArch64MCExpr::VK_TLSDESC:             return "";
-  case AArch64MCExpr::VK_TLSDESC_PAGE:        return ":tlsdesc:";
-  case AArch64MCExpr::VK_TLSDESC_AUTH:        return "";
-  case AArch64MCExpr::VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
-  case AArch64MCExpr::VK_SECREL_LO12:         return ":secrel_lo12:";
-  case AArch64MCExpr::VK_SECREL_HI12:         return ":secrel_hi12:";
-  case AArch64MCExpr::VK_GOT_AUTH:            return ":got_auth:";
-  case AArch64MCExpr::VK_GOT_AUTH_PAGE:       return ":got_auth:";
-  case AArch64MCExpr::VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
-  default:
-    llvm_unreachable("Invalid relocation specifier");
-  }
-  // clang-format on
-}
-
-static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
-                     const MCAssembler *Asm) {
-  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(Expr.getSpecifier());
-  return true;
-}
-
 AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
   // We prefer NEON instructions to be printed in the short, Apple-specific
   // form when targeting Darwin.
@@ -166,34 +91,6 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
   return MCBinaryExpr::createSub(Res, PC, Context);
 }
 
-void AArch64AuthMCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
-  if (WrapSubExprInParens)
-    OS << '(';
-  getSubExpr()->print(OS, MAI);
-  if (WrapSubExprInParens)
-    OS << ')';
-
-  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
-  if (hasAddressDiversity())
-    OS << ",addr";
-  OS << ')';
-}
-
-void AArch64MCAsmInfoDarwin::printSpecifierExpr(
-    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
-    return AE->print(OS, this);
-  // FIXME: tryParseAdrLabel should not use VK_ABS for Mach-O
-  assert(Expr.getSpecifier() == AArch64MCExpr::VK_ABS);
-  printExpr(OS, *Expr.getSubExpr());
-}
-
-bool AArch64MCAsmInfoDarwin::evaluateAsRelocatableImpl(
-    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
-  return evaluate(Expr, Res, Asm);
-}
-
 AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   if (T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
@@ -230,19 +127,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   initializeVariantKinds(ELFAtSpecifiers);
 }
 
-void AArch64MCAsmInfoELF::printSpecifierExpr(
-    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
-    return AE->print(OS, this);
-  OS << AArch64::getSpecifierName(Expr);
-  printExpr(OS, *Expr.getSubExpr());
-}
-
-bool AArch64MCAsmInfoELF::evaluateAsRelocatableImpl(
-    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
-  return evaluate(Expr, Res, Asm);
-}
-
 AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -262,17 +146,6 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   initializeVariantKinds(COFFAtSpecifiers);
 }
 
-void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr(
-    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  OS << AArch64::getSpecifierName(Expr);
-  printExpr(OS, *Expr.getSubExpr());
-}
-
-bool AArch64MCAsmInfoMicrosoftCOFF::evaluateAsRelocatableImpl(
-    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
-  return evaluate(Expr, Res, Asm);
-}
-
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -291,14 +164,3 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
 
   initializeVariantKinds(COFFAtSpecifiers);
 }
-
-void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr(
-    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
-  OS << AArch64::getSpecifierName(Expr);
-  printExpr(OS, *Expr.getSubExpr());
-}
-
-bool AArch64MCAsmInfoGNUCOFF::evaluateAsRelocatableImpl(
-    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
-  return evaluate(Expr, Res, Asm);
-}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index bc02586d73884..225e0c8e55fca 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 
-#include "MCTargetDesc/AArch64MCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -27,42 +26,20 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
   const MCExpr *
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
-  void printSpecifierExpr(raw_ostream &OS,
-                          const MCSpecifierExpr &Expr) const override;
-  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
   explicit AArch64MCAsmInfoELF(const Triple &T);
-  void printSpecifierExpr(raw_ostream &OS,
-                          const MCSpecifierExpr &Expr) const override;
-  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoMicrosoftCOFF : public MCAsmInfoMicrosoft {
   explicit AArch64MCAsmInfoMicrosoftCOFF();
-  void printSpecifierExpr(raw_ostream &OS,
-                          const MCSpecifierExpr &Expr) const override;
-  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
   explicit AArch64MCAsmInfoGNUCOFF();
-  void printSpecifierExpr(raw_ostream &OS,
-                          const MCSpecifierExpr &Expr) const override;
-  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
-namespace AArch64 {
-/// Return the string representation of the ELF relocation specifier
-/// (e.g. ":got:", ":lo12:").
-StringRef getSpecifierName(const MCSpecifierExpr &Expr);
-} // namespace AArch64
-
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 7a7c6f7effd9f..d934af91b9ff5 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -12,19 +12,100 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MCExpr.h"
-#include "AArch64MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+
 const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, Specifier S,
                                            MCContext &Ctx) {
   return new (Ctx) AArch64MCExpr(Expr, S);
 }
 
+StringRef AArch64MCExpr::getSpecifierName() const {
+  // clang-format off
+  switch (static_cast<uint32_t>(getSpecifier())) {
+  case VK_CALL:                return "";
+  case VK_LO12:                return ":lo12:";
+  case VK_ABS_G3:              return ":abs_g3:";
+  case VK_ABS_G2:              return ":abs_g2:";
+  case VK_ABS_G2_S:            return ":abs_g2_s:";
+  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case VK_ABS_G1:              return ":abs_g1:";
+  case VK_ABS_G1_S:            return ":abs_g1_s:";
+  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case VK_ABS_G0:              return ":abs_g0:";
+  case VK_ABS_G0_S:            return ":abs_g0_s:";
+  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case VK_PREL_G3:             return ":prel_g3:";
+  case VK_PREL_G2:             return ":prel_g2:";
+  case VK_PREL_G2_NC:          return ":prel_g2_nc:";
+  case VK_PREL_G1:             return ":prel_g1:";
+  case VK_PREL_G1_NC:          return ":prel_g1_nc:";
+  case VK_PREL_G0:             return ":prel_g0:";
+  case VK_PREL_G0_NC:          return ":prel_g0_nc:";
+  case VK_DTPREL_G2:           return ":dtprel_g2:";
+  case VK_DTPREL_G1:           return ":dtprel_g1:";
+  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case VK_DTPREL_G0:           return ":dtprel_g0:";
+  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case VK_TPREL_G2:            return ":tprel_g2:";
+  case VK_TPREL_G1:            return ":tprel_g1:";
+  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case VK_TPREL_G0:            return ":tprel_g0:";
+  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case VK_TPREL_HI12:          return ":tprel_hi12:";
+  case VK_TPREL_LO12:          return ":tprel_lo12:";
+  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
+  case VK_ABS_PAGE:            return "";
+  case VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
+  case VK_GOT:                 return ":got:";
+  case VK_GOT_PAGE:            return ":got:";
+  case VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
+  case VK_GOT_LO12:            return ":got_lo12:";
+  case VK_GOTTPREL:            return ":gottprel:";
+  case VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case VK_TLSDESC:             return "";
+  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  case VK_TLSDESC_AUTH:        return "";
+  case VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
+  case VK_SECREL_LO12:         return ":secrel_lo12:";
+  case VK_SECREL_HI12:         return ":secrel_hi12:";
+  case VK_GOT_AUTH:            return ":got_auth:";
+  case VK_GOT_AUTH_PAGE:       return ":got_auth:";
+  case VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
+  default:
+    llvm_unreachable("Invalid relocation specifier");
+  }
+  // clang-format on
+}
+
+void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  OS << getSpecifierName();
+  Expr->print(OS, MAI);
+}
+
+bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+                                              const MCAssembler *Asm) const {
+  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(getSpecifier());
+  return true;
+}
+
 const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
                                                    uint16_t Discriminator,
                                                    AArch64PACKey::ID Key,
@@ -33,3 +114,17 @@ const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
   return new (Ctx)
       AArch64AuthMCExpr(Expr, Discriminator, Key, HasAddressDiversity);
 }
+
+void AArch64AuthMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
+  if (WrapSubExprInParens)
+    OS << '(';
+  getSubExpr()->print(OS, MAI);
+  if (WrapSubExprInParens)
+    OS << ')';
+
+  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
+  if (hasAddressDiversity())
+    OS << ",addr";
+  OS << ')';
+}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 541f24c943a15..9c383894c7f54 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -147,6 +147,8 @@ class AArch64MCExpr : public MCSpecifierExpr {
 public:
   static const AArch64MCExpr *create(const MCExpr *Expr, Specifier,
                                      MCContext &Ctx);
+  /// @name VariantKind information extractors.
+  /// @{
 
   static Specifier getSymbolLoc(Specifier S) {
     return static_cast<Specifier>(S & VK_SymLocBits);
@@ -157,6 +159,16 @@ class AArch64MCExpr : public MCSpecifierExpr {
   }
 
   static bool isNotChecked(Specifier S) { return S & VK_NC; }
+
+  /// @}
+
+  /// Return the string representation of the ELF relocation specifier
+  /// (e.g. ":got:", ":lo12:").
+  StringRef getSpecifierName() const;
+
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  bool evaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 class AArch64AuthMCExpr final : public AArch64MCExpr {
@@ -177,7 +189,7 @@ class AArch64AuthMCExpr final : public AArch64MCExpr {
   uint16_t getDiscriminator() const { return Discriminator; }
   bool hasAddressDiversity() const { return getSpecifier() == VK_AUTHADDR; }
 
-  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
+  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
 
   static bool classof(const MCExpr *E) {
     return isa<AArch64MCExpr>(E) && classof(cast<AArch64MCExpr>(E));
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 2e997631655ed..3009bd2ca2758 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -8,7 +8,7 @@
 
 #include "AArch64MCTargetDesc.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCAsmInfo.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/MC/MCAsmBackend.h"
@@ -73,7 +73,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
       break;
     default:
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          AArch64::getSpecifierName(*A64E) +
+                                          A64E->getSpecifierName() +
                                           " unsupported on COFF targets");
       return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
     }
@@ -83,7 +83,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   default: {
     if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          AArch64::getSpecifierName(*A64E) +
+                                          A64E->getSpecifierName() +
                                           " unsupported on COFF targets");
     } else {
       MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind());

From a54712c8ec25a94ab55a4783bfd9d5467d2ec968 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn@arm.com>
Date: Mon, 16 Jun 2025 15:23:40 +0100
Subject: [PATCH 584/851] [LSR] Make canHoistIVInc allow non-integer types
 (#143707)

canHoistIVInc was made to only allow integer types to avoid a crash in
isIndexedLoadLegal/isIndexedStoreLegal due to them failing an assertion
in getValueType (or rather in MVT::getVT which gets called from that)
when passed a struct type. Adjusting these functions to pass
AllowUnknown=true to getValueType means we don't get an assertion
failure (MVT::Other is returned which TLI->isIndexedLoadLegal should
then return false for), meaning we can remove this check for integer
type.
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |   4 +-
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |   5 +-
 .../AArch64/postidx-load.ll                   | 189 ++++++++++++++++++
 3 files changed, 193 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 574152e254f15..3b87978fe3fab 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -478,12 +478,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   }
 
   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty) const override {
-    EVT VT = getTLI()->getValueType(DL, Ty);
+    EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
   }
 
   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty) const override {
-    EVT VT = getTLI()->getValueType(DL, Ty);
+    EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
   }
 
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 242e571c072af..e4f35e4b2108b 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -6008,9 +6008,8 @@ static bool canHoistIVInc(const TargetTransformInfo &TTI, const LSRFixup &Fixup,
 
   Instruction *I = Fixup.UserInst;
   Type *Ty = I->getType();
-  return Ty->isIntegerTy() &&
-         ((isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
-          (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty)));
+  return (isa<LoadInst>(I) && TTI.isIndexedLoadLegal(TTI.MIM_PostInc, Ty)) ||
+         (isa<StoreInst>(I) && TTI.isIndexedStoreLegal(TTI.MIM_PostInc, Ty));
 }
 
 /// Rewrite all the fixup locations with new values, following the chosen
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
new file mode 100644
index 0000000000000..5976658ccdf86
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/postidx-load.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-none-elf | FileCheck %s
+
+; Check that the load in the loop has postindex addressing, regardless of the
+; type or whether the input uses postindex or offset addressing.
+
+define i32 @i32_initially_postidx(ptr %p, i64 %n) {
+; CHECK-LABEL: i32_initially_postidx:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp x1, #1
+; CHECK-NEXT:    b.lt .LBB0_5
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:  .LBB0_2: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr w9, [x0], #4
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    b.lo .LBB0_5
+; CHECK-NEXT:  // %bb.3: // %for.inc
+; CHECK-NEXT:    // in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    subs x1, x1, #1
+; CHECK-NEXT:    b.ne .LBB0_2
+; CHECK-NEXT:  // %bb.4: // %cleanup
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_5:
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  br i1 %cmp1, label %for.body, label %cleanup
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %accum = phi i32 [ %add, %for.inc ], [ 0, %entry ]
+  %ptr = phi ptr [ %ptr.next, %for.inc ], [ %p, %entry ]
+  %val = load i32, ptr %ptr, align 4
+  %ptr.next = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %add = add i32 %accum, %val
+  %cmp2 = icmp ult i32 %add, 0
+  br i1 %cmp2, label %cleanup, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %cleanup, label %for.body
+
+cleanup:
+  %ret = phi i32 [ 0, %entry ], [ 0, %for.body ], [ %add, %for.inc ]
+  ret i32 %ret
+}
+
+define i32 @i32_initially_offset(ptr %p, i64 %n) {
+; CHECK-LABEL: i32_initially_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp x1, #1
+; CHECK-NEXT:    b.lt .LBB1_5
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:  .LBB1_2: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr w9, [x0], #4
+; CHECK-NEXT:    add w8, w8, w9
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    b.lo .LBB1_5
+; CHECK-NEXT:  // %bb.3: // %for.cond
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    subs x1, x1, #1
+; CHECK-NEXT:    b.ne .LBB1_2
+; CHECK-NEXT:  // %bb.4: // %cleanup
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_5:
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  br i1 %cmp1, label %for.body, label %cleanup
+
+for.cond:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %cleanup, label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.cond ], [ 0, %entry ]
+  %accum = phi i32 [ %add, %for.cond ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %iv
+  %val = load i32, ptr %arrayidx, align 4
+  %add = add i32 %accum, %val
+  %cmp2 = icmp ult i32 %add, 0
+  br i1 %cmp2, label %cleanup, label %for.cond
+
+cleanup:
+  %ret = phi i32 [ 0, %entry ], [ 0, %for.body ], [ %add, %for.cond ]
+  ret i32 %ret
+}
+
+define float @float_initially_postidx(ptr %p, i64 %n) {
+; CHECK-LABEL: float_initially_postidx:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    cmp x1, #1
+; CHECK-NEXT:    b.lt .LBB2_3
+; CHECK-NEXT:  .LBB2_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr s1, [x0], #4
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fcmp s0, #0.0
+; CHECK-NEXT:    b.mi .LBB2_4
+; CHECK-NEXT:  // %bb.2: // %for.inc
+; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    subs x1, x1, #1
+; CHECK-NEXT:    b.ne .LBB2_1
+; CHECK-NEXT:  .LBB2_3: // %cleanup
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ret
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  br i1 %cmp1, label %for.body, label %cleanup
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.inc ], [ 0, %entry ]
+  %accum = phi float [ %add, %for.inc ], [ 0.000000e+00, %entry ]
+  %ptr = phi ptr [ %ptr.next, %for.inc ], [ %p, %entry ]
+  %val = load float, ptr %ptr, align 4
+  %ptr.next = getelementptr inbounds nuw i8, ptr %ptr, i64 4
+  %add = fadd float %accum, %val
+  %cmp2 = fcmp olt float %add, 0.000000e+00
+  br i1 %cmp2, label %cleanup, label %for.inc
+
+for.inc:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %cleanup, label %for.body
+
+cleanup:
+  %ret = phi float [ 0.000000e+00, %entry ], [ 0.000000e+00, %for.body ], [ %add, %for.inc ]
+  ret float %ret
+}
+
+define float @float_initially_offset(ptr %p, i64 %n) {
+; CHECK-LABEL: float_initially_offset:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    cmp x1, #1
+; CHECK-NEXT:    b.lt .LBB3_3
+; CHECK-NEXT:  .LBB3_1: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr s1, [x0], #4
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fcmp s0, #0.0
+; CHECK-NEXT:    b.mi .LBB3_4
+; CHECK-NEXT:  // %bb.2: // %for.cond
+; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    subs x1, x1, #1
+; CHECK-NEXT:    b.ne .LBB3_1
+; CHECK-NEXT:  .LBB3_3: // %cleanup
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB3_4:
+; CHECK-NEXT:    movi d0, #0000000000000000
+; CHECK-NEXT:    ret
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  br i1 %cmp1, label %for.body, label %cleanup
+
+for.cond:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %n
+  br i1 %exitcond, label %cleanup, label %for.body
+
+for.body:
+  %iv = phi i64 [ %iv.next, %for.cond ], [ 0, %entry ]
+  %accum = phi float [ %add, %for.cond ], [ 0.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds nuw float, ptr %p, i64 %iv
+  %val = load float, ptr %arrayidx, align 4
+  %add = fadd float %accum, %val
+  %cmp2 = fcmp olt float %add, 0.000000e+00
+  br i1 %cmp2, label %cleanup, label %for.cond
+
+cleanup:
+  %ret = phi float [ 0.000000e+00, %entry ], [ 0.000000e+00, %for.body ], [ %add, %for.cond ]
+  ret float %ret
+}

From 39ad3151e073e9f721d1e2e2849fb4bdc9443ae3 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Mon, 16 Jun 2025 15:26:47 +0100
Subject: [PATCH 585/851] [TableGen] Use default member initializers. NFC.
 (#144349)

Automated with clang-tidy -fix -checks=-*,modernize-use-default-member-init
---
 llvm/utils/TableGen/AsmMatcherEmitter.cpp      |  9 ++++-----
 .../TableGen/Common/CodeGenDAGPatterns.cpp     | 18 ++++++++----------
 .../utils/TableGen/Common/CodeGenRegisters.cpp |  4 ++--
 llvm/utils/TableGen/Common/CodeGenRegisters.h  | 10 ++++------
 llvm/utils/TableGen/Common/CodeGenSchedule.h   | 14 +++++++-------
 .../Common/GlobalISel/GlobalISelMatchTable.h   | 18 ++++++++----------
 llvm/utils/TableGen/Common/PredicateExpander.h |  9 ++++-----
 llvm/utils/TableGen/DAGISelMatcherGen.cpp      |  9 ++++-----
 llvm/utils/TableGen/DecoderEmitter.cpp         |  5 ++---
 llvm/utils/TableGen/FastISelEmitter.cpp        |  4 ++--
 10 files changed, 45 insertions(+), 55 deletions(-)

diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index 32098e96ce721..b6d9c9f3a1584 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -388,7 +388,7 @@ struct MatchableInfo {
     StringRef Token;
 
     /// The unique class instance this operand should match.
-    ClassInfo *Class;
+    ClassInfo *Class = nullptr;
 
     /// The operand name this is, if anything.
     StringRef SrcOpName;
@@ -397,18 +397,17 @@ struct MatchableInfo {
     StringRef OrigSrcOpName;
 
     /// The suboperand index within SrcOpName, or -1 for the entire operand.
-    int SubOpIdx;
+    int SubOpIdx = -1;
 
     /// Whether the token is "isolated", i.e., it is preceded and followed
     /// by separators.
     bool IsIsolatedToken;
 
     /// Register record if this token is singleton register.
-    const Record *SingletonReg;
+    const Record *SingletonReg = nullptr;
 
     explicit AsmOperand(bool IsIsolatedToken, StringRef T)
-        : Token(T), Class(nullptr), SubOpIdx(-1),
-          IsIsolatedToken(IsIsolatedToken), SingletonReg(nullptr) {}
+        : Token(T), IsIsolatedToken(IsIsolatedToken) {}
   };
 
   /// ResOperand - This represents a single operand in the result instruction
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index 810b35e65b310..3a4ca1b451567 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -3604,16 +3604,14 @@ class InstAnalyzer {
   const CodeGenDAGPatterns &CDP;
 
 public:
-  bool hasSideEffects;
-  bool mayStore;
-  bool mayLoad;
-  bool isBitcast;
-  bool isVariadic;
-  bool hasChain;
-
-  InstAnalyzer(const CodeGenDAGPatterns &cdp)
-      : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false),
-        isBitcast(false), isVariadic(false), hasChain(false) {}
+  bool hasSideEffects = false;
+  bool mayStore = false;
+  bool mayLoad = false;
+  bool isBitcast = false;
+  bool isVariadic = false;
+  bool hasChain = false;
+
+  InstAnalyzer(const CodeGenDAGPatterns &cdp) : CDP(cdp) {}
 
   void Analyze(const PatternToMatch &Pat) {
     const TreePatternNode &N = Pat.getSrcPattern();
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index f52c21e97f9c8..57a243158692b 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -164,8 +164,8 @@ CodeGenRegister::CodeGenRegister(const Record *R, unsigned Enum)
     : TheDef(R), EnumValue(Enum),
       CostPerUse(R->getValueAsListOfInts("CostPerUse")),
       CoveredBySubRegs(R->getValueAsBit("CoveredBySubRegs")),
-      HasDisjunctSubRegs(false), Constant(R->getValueAsBit("isConstant")),
-      SubRegsComplete(false), SuperRegsComplete(false), TopoSig(~0u) {
+      Constant(R->getValueAsBit("isConstant")), SubRegsComplete(false),
+      SuperRegsComplete(false), TopoSig(~0u) {
   Artificial = R->getValueAsBit("isArtificial");
 }
 
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h
index 3f4c157fab69a..bbcd44ce2cc5b 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.h
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h
@@ -564,7 +564,7 @@ struct RegUnit {
   // Weight assigned to this RegUnit for estimating register pressure.
   // This is useful when equalizing weights in register classes with mixed
   // register topologies.
-  unsigned Weight;
+  unsigned Weight = 0;
 
   // Each native RegUnit corresponds to one or two root registers. The full
   // set of registers containing this unit can be computed as the union of
@@ -573,14 +573,12 @@ struct RegUnit {
 
   // Index into RegClassUnitSets where we can find the list of UnitSets that
   // contain this unit.
-  unsigned RegClassUnitSetsIdx;
+  unsigned RegClassUnitSetsIdx = 0;
   // A register unit is artificial if at least one of its roots is
   // artificial.
-  bool Artificial;
+  bool Artificial = false;
 
-  RegUnit() : Weight(0), RegClassUnitSetsIdx(0), Artificial(false) {
-    Roots[0] = Roots[1] = nullptr;
-  }
+  RegUnit() { Roots[0] = Roots[1] = nullptr; }
 
   ArrayRef<const CodeGenRegister *> getRoots() const {
     assert(!(Roots[1] && !Roots[0]) && "Invalid roots array");
diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.h b/llvm/utils/TableGen/Common/CodeGenSchedule.h
index 697a1ce8f75ac..1d5e953cf70c7 100644
--- a/llvm/utils/TableGen/Common/CodeGenSchedule.h
+++ b/llvm/utils/TableGen/Common/CodeGenSchedule.h
@@ -193,7 +193,7 @@ struct CodeGenRegisterFile {
   unsigned MaxMovesEliminatedPerCycle;
   bool AllowZeroMoveEliminationOnly;
 
-  unsigned NumPhysRegs;
+  unsigned NumPhysRegs = 0;
   std::vector<CodeGenRegisterCost> Costs;
 
   CodeGenRegisterFile(StringRef name, const Record *def,
@@ -201,7 +201,7 @@ struct CodeGenRegisterFile {
                       bool AllowZeroMoveElimOnly = false)
       : Name(name), RegisterFileDef(def),
         MaxMovesEliminatedPerCycle(MaxMoveElimPerCy),
-        AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly), NumPhysRegs(0) {}
+        AllowZeroMoveEliminationOnly(AllowZeroMoveElimOnly) {}
 
   bool hasDefaultCosts() const { return Costs.empty(); }
 };
@@ -261,16 +261,16 @@ struct CodeGenProcModel {
   std::vector<CodeGenRegisterFile> RegisterFiles;
 
   // Optional Retire Control Unit definition.
-  const Record *RetireControlUnit;
+  const Record *RetireControlUnit = nullptr;
 
   // Load/Store queue descriptors.
-  const Record *LoadQueue;
-  const Record *StoreQueue;
+  const Record *LoadQueue = nullptr;
+  const Record *StoreQueue = nullptr;
 
   CodeGenProcModel(unsigned Idx, std::string Name, const Record *MDef,
                    const Record *IDef)
-      : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef),
-        RetireControlUnit(nullptr), LoadQueue(nullptr), StoreQueue(nullptr) {}
+      : Index(Idx), ModelName(std::move(Name)), ModelDef(MDef), ItinsDef(IDef) {
+  }
 
   bool hasItineraries() const {
     return !ItinsDef->getValueAsListOfDefs("IID").empty();
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index 66472576eea8f..620f88db66109 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -501,13 +501,13 @@ class RuleMatcher : public Matcher {
 
   /// ID for the next instruction variable defined with
   /// implicitlyDefineInsnVar()
-  unsigned NextInsnVarID;
+  unsigned NextInsnVarID = 0;
 
   /// ID for the next output instruction allocated with allocateOutputInsnID()
-  unsigned NextOutputInsnID;
+  unsigned NextOutputInsnID = 0;
 
   /// ID for the next temporary register ID allocated with allocateTempRegID()
-  unsigned NextTempRegID;
+  unsigned NextTempRegID = 0;
 
   /// ID for the next recorded type. Starts at -1 and counts down.
   TempTypeIdx NextTempTypeIdx = -1;
@@ -545,9 +545,7 @@ class RuleMatcher : public Matcher {
                              StringRef FlagName, GISelFlags FlagBit);
 
 public:
-  RuleMatcher(ArrayRef<SMLoc> SrcLoc)
-      : NextInsnVarID(0), NextOutputInsnID(0), NextTempRegID(0), SrcLoc(SrcLoc),
-        RuleID(NextRuleID++) {}
+  RuleMatcher(ArrayRef<SMLoc> SrcLoc) : SrcLoc(SrcLoc), RuleID(NextRuleID++) {}
   RuleMatcher(RuleMatcher &&Other) = default;
   RuleMatcher &operator=(RuleMatcher &&Other) = default;
 
@@ -2039,12 +2037,12 @@ class CopyConstantAsImmRenderer : public OperandRenderer {
   unsigned NewInsnID;
   /// The name of the operand.
   const std::string SymbolicName;
-  bool Signed;
+  bool Signed = true;
 
 public:
   CopyConstantAsImmRenderer(unsigned NewInsnID, StringRef SymbolicName)
       : OperandRenderer(OR_CopyConstantAsImm), NewInsnID(NewInsnID),
-        SymbolicName(SymbolicName), Signed(true) {}
+        SymbolicName(SymbolicName) {}
 
   static bool classof(const OperandRenderer *R) {
     return R->getKind() == OR_CopyConstantAsImm;
@@ -2359,7 +2357,7 @@ class BuildMIAction : public MatchAction {
 private:
   unsigned InsnID;
   const CodeGenInstruction *I;
-  InstructionMatcher *Matched;
+  InstructionMatcher *Matched = nullptr;
   std::vector<std::unique_ptr<OperandRenderer>> OperandRenderers;
   SmallPtrSet<const Record *, 4> DeadImplicitDefs;
 
@@ -2372,7 +2370,7 @@ class BuildMIAction : public MatchAction {
 
 public:
   BuildMIAction(unsigned InsnID, const CodeGenInstruction *I)
-      : MatchAction(AK_BuildMI), InsnID(InsnID), I(I), Matched(nullptr) {}
+      : MatchAction(AK_BuildMI), InsnID(InsnID), I(I) {}
 
   static bool classof(const MatchAction *A) {
     return A->getKind() == AK_BuildMI;
diff --git a/llvm/utils/TableGen/Common/PredicateExpander.h b/llvm/utils/TableGen/Common/PredicateExpander.h
index 0c3a8718a473f..4439327af2b03 100644
--- a/llvm/utils/TableGen/Common/PredicateExpander.h
+++ b/llvm/utils/TableGen/Common/PredicateExpander.h
@@ -25,9 +25,9 @@ namespace llvm {
 class Record;
 
 class PredicateExpander {
-  bool EmitCallsByRef;
-  bool NegatePredicate;
-  bool ExpandForMC;
+  bool EmitCallsByRef = true;
+  bool NegatePredicate = false;
+  bool ExpandForMC = false;
   StringRef TargetName;
 
   PredicateExpander(const PredicateExpander &) = delete;
@@ -38,8 +38,7 @@ class PredicateExpander {
 
 public:
   explicit PredicateExpander(StringRef Target, unsigned Indent = 1)
-      : EmitCallsByRef(true), NegatePredicate(false), ExpandForMC(false),
-        TargetName(Target), Indent(Indent, 2) {}
+      : TargetName(Target), Indent(Indent, 2) {}
   bool isByRef() const { return EmitCallsByRef; }
   bool shouldNegate() const { return NegatePredicate; }
   bool shouldExpandForMC() const { return ExpandForMC; }
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 0039ff4f3e2d7..227311b0a3bc8 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -76,7 +76,7 @@ class MatcherGen {
   /// NextRecordedOperandNo - As we emit opcodes to record matched values in
   /// the RecordedNodes array, this keeps track of which slot will be next to
   /// record into.
-  unsigned NextRecordedOperandNo;
+  unsigned NextRecordedOperandNo = 0;
 
   /// MatchedChainNodes - This maintains the position in the recorded nodes
   /// array of all of the recorded input nodes that have chains.
@@ -94,11 +94,11 @@ class MatcherGen {
   SmallVector<std::pair<const Record *, unsigned>, 2> PhysRegInputs;
 
   /// Matcher - This is the top level of the generated matcher, the result.
-  Matcher *TheMatcher;
+  Matcher *TheMatcher = nullptr;
 
   /// CurPredicate - As we emit matcher nodes, this points to the latest check
   /// which should have future checks stuck into its Next position.
-  Matcher *CurPredicate;
+  Matcher *CurPredicate = nullptr;
 
 public:
   MatcherGen(const PatternToMatch &pattern, const CodeGenDAGPatterns &cgp);
@@ -147,8 +147,7 @@ class MatcherGen {
 
 MatcherGen::MatcherGen(const PatternToMatch &pattern,
                        const CodeGenDAGPatterns &cgp)
-    : Pattern(pattern), CGP(cgp), NextRecordedOperandNo(0), TheMatcher(nullptr),
-      CurPredicate(nullptr) {
+    : Pattern(pattern), CGP(cgp) {
   // We need to produce the matcher tree for the patterns source pattern.  To
   // do this we need to match the structure as well as the types.  To do the
   // type matching, we want to figure out the fewest number of type checks we
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 56c3644c134f1..7489d369c9932 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -104,10 +104,9 @@ struct OperandInfo {
   std::vector<EncodingField> Fields;
   std::string Decoder;
   bool HasCompleteDecoder;
-  uint64_t InitValue;
+  uint64_t InitValue = 0;
 
-  OperandInfo(std::string D, bool HCD)
-      : Decoder(D), HasCompleteDecoder(HCD), InitValue(0) {}
+  OperandInfo(std::string D, bool HCD) : Decoder(D), HasCompleteDecoder(HCD) {}
 
   void addField(unsigned Base, unsigned Width, unsigned Offset) {
     Fields.push_back(EncodingField(Base, Width, Offset));
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp
index a8b6f79c176a7..694d89a5ada3c 100644
--- a/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -86,10 +86,10 @@ namespace {
 struct OperandsSignature {
   class OpKind {
     enum { OK_Reg, OK_FP, OK_Imm, OK_Invalid = -1 };
-    char Repr;
+    char Repr = OK_Invalid;
 
   public:
-    OpKind() : Repr(OK_Invalid) {}
+    OpKind() {}
 
     bool operator<(OpKind RHS) const { return Repr < RHS.Repr; }
     bool operator==(OpKind RHS) const { return Repr == RHS.Repr; }

From 595a273d9232a7378c583fb109212370d6d2f4e4 Mon Sep 17 00:00:00 2001
From: Andrey Timonin <timonina1909@gmail.com>
Date: Mon, 16 Jun 2025 19:37:39 +0500
Subject: [PATCH 586/851] [mlir][emitc] Support 'emitc::LValueType' in
 'emitc::VerbatimOp' (#144151)

This PR introduces support for `emitc::LvalueType` in
`emitc::VerbatimOp`, providing a mechanism to reduce the number of
operations required when working with verbatim operations whose
arguments are of type `emitc::LvalueType`.

Before:
```mlir
emitc.func @foo() {
  %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue<i32>
  %loaded_a = load %a : !emitc.lvalue<i32>
  emitc.verbatim "{} + {};" args %loaded_a, %loaded_a : i32, i32

  return
}
```

After:
```mlir
emitc.func @bar() {
  %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue<i32>
  emitc.verbatim "{} + {};" args %a, %a : !emitc.lvalue<i32>, !emitc.lvalue<i32>

  return
}
```

You can now write something like this:
```mlir
emitc.func @baz() {
  %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue<i32>
  emitc.verbatim "++{};" args %a : !emitc.lvalue<i32>

  return
}
```
---
 mlir/include/mlir/Dialect/EmitC/IR/EmitC.td |  2 +-
 mlir/test/Dialect/EmitC/ops.mlir            | 16 ++++++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index d4aea52a0d485..e53d3e45875d5 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -1304,7 +1304,7 @@ def EmitC_VerbatimOp : EmitC_Op<"verbatim"> {
     FailureOr<SmallVector<::mlir::emitc::ReplacementItem>> parseFormatString();
   }];
 
-  let arguments = (ins StrAttr:$value, Variadic<EmitCType>:$fmtArgs);
+  let arguments = (ins StrAttr:$value, Variadic<AnyTypeOf<[EmitCType, EmitC_LValueType]>>:$fmtArgs);
 
   let builders = [OpBuilder<(ins "::mlir::StringAttr":$value),
                             [{ build($_builder, $_state, value, {}); }]>];
diff --git a/mlir/test/Dialect/EmitC/ops.mlir b/mlir/test/Dialect/EmitC/ops.mlir
index 36d12e763afc7..ad40313f95df9 100644
--- a/mlir/test/Dialect/EmitC/ops.mlir
+++ b/mlir/test/Dialect/EmitC/ops.mlir
@@ -246,12 +246,20 @@ emitc.verbatim "typedef float f32;"
 // The value is not interpreted as format string if there are no operands.
 emitc.verbatim "{} {  }"
 
-func.func @test_verbatim(%arg0 : !emitc.ptr<i32>, %arg1 : i32) {
+func.func @test_verbatim(%arg0 : !emitc.ptr<i32>, %arg1 : i32, %arg2: !emitc.array<3x!emitc.ptr<i32>>) {
+  %a = "emitc.variable"() <{value = #emitc.opaque<"1">}> : () -> !emitc.lvalue<i32>
+
+  // Check that the lvalue type can be used by verbatim.
+  emitc.verbatim "++{};" args %a : !emitc.lvalue<i32>
+
+  // Check that the array type can be used by verbatim.
+  emitc.verbatim "*{}[0] = 1;" args %arg2 : !emitc.array<3x!emitc.ptr<i32>>
+
   emitc.verbatim "{} + {};" args %arg0, %arg1 : !emitc.ptr<i32>, i32
 
-  // Check there is no ambiguity whether %a is the argument to the emitc.verbatim op.
-  emitc.verbatim "a"
-  %a = "emitc.constant"(){value = 42 : i32} : () -> i32
+  // Check there is no ambiguity whether %b is the argument to the emitc.verbatim op.
+  emitc.verbatim "b"
+  %b = "emitc.constant"(){value = 42 : i32} : () -> i32
 
   return
 }

From 8e333e3cedec69f9e538ed79ed9f577956215edb Mon Sep 17 00:00:00 2001
From: Max191 <44243577+Max191@users.noreply.github.com>
Date: Mon, 16 Jun 2025 10:50:13 -0400
Subject: [PATCH 587/851] [mlir] Expose linearize/delinearize lowering
 transforms (#144156)

Moves the transformation logic from the AffineLinearizeOp and
AffineDelinearizeOp lowerings into separate transform functions that can
now be called separately. This provides a more controlled way to apply
the op lowerings.

---------

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
---
 .../Dialect/Affine/Transforms/Transforms.h    |  14 ++
 .../Transforms/AffineExpandIndexOps.cpp       | 218 +++++++++---------
 2 files changed, 125 insertions(+), 107 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
index bf830a29613fd..5c538d28c1835 100644
--- a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h
@@ -32,6 +32,20 @@ enum class BoundType;
 
 namespace affine {
 class AffineApplyOp;
+class AffineDelinearizeIndexOp;
+class AffineLinearizeIndexOp;
+
+/// Lowers `affine.delinearize_index` into a sequence of division and remainder
+/// operations.
+LogicalResult lowerAffineDelinearizeIndexOp(RewriterBase &rewriter,
+                                            AffineDelinearizeIndexOp op);
+
+/// Lowers `affine.linearize_index` into a sequence of multiplications and
+/// additions. Make a best effort to sort the input indices so that
+/// the most loop-invariant terms are at the left of the additions
+/// to enable loop-invariant code motion.
+LogicalResult lowerAffineLinearizeIndexOp(RewriterBase &rewriter,
+                                          AffineLinearizeIndexOp op);
 
 /// Populate patterns that expand affine index operations into more fundamental
 /// operations (not necessarily restricted to Affine dialect).
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
index 35205a6ca2eee..c0ef28c648ac5 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineExpandIndexOps.cpp
@@ -84,126 +84,130 @@ static SmallVector<Value> computeStrides(Location loc, RewriterBase &rewriter,
   return result;
 }
 
+LogicalResult
+affine::lowerAffineDelinearizeIndexOp(RewriterBase &rewriter,
+                                      AffineDelinearizeIndexOp op) {
+  Location loc = op.getLoc();
+  Value linearIdx = op.getLinearIndex();
+  unsigned numResults = op.getNumResults();
+  ArrayRef<int64_t> staticBasis = op.getStaticBasis();
+  if (numResults == staticBasis.size())
+    staticBasis = staticBasis.drop_front();
+
+  if (numResults == 1) {
+    rewriter.replaceOp(op, linearIdx);
+    return success();
+  }
+
+  SmallVector<Value> results;
+  results.reserve(numResults);
+  SmallVector<Value> strides =
+      computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis,
+                     /*knownNonNegative=*/true);
+
+  Value zero = rewriter.createOrFold<arith::ConstantIndexOp>(loc, 0);
+
+  Value initialPart =
+      rewriter.create<arith::FloorDivSIOp>(loc, linearIdx, strides.front());
+  results.push_back(initialPart);
+
+  auto emitModTerm = [&](Value stride) -> Value {
+    Value remainder = rewriter.create<arith::RemSIOp>(loc, linearIdx, stride);
+    Value remainderNegative = rewriter.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::slt, remainder, zero);
+    // If the correction is relevant, this term is <= stride, which is known
+    // to be positive in `index`. Otherwise, while 2 * stride might overflow,
+    // this branch won't be taken, so the risk of `poison` is fine.
+    Value corrected = rewriter.create<arith::AddIOp>(
+        loc, remainder, stride, arith::IntegerOverflowFlags::nsw);
+    Value mod = rewriter.create<arith::SelectOp>(loc, remainderNegative,
+                                                 corrected, remainder);
+    return mod;
+  };
+
+  // Generate all the intermediate parts
+  for (size_t i = 0, e = strides.size() - 1; i < e; ++i) {
+    Value thisStride = strides[i];
+    Value nextStride = strides[i + 1];
+    Value modulus = emitModTerm(thisStride);
+    // We know both inputs are positive, so floorDiv == div.
+    // This could potentially be a divui, but it's not clear if that would
+    // cause issues.
+    Value divided = rewriter.create<arith::DivSIOp>(loc, modulus, nextStride);
+    results.push_back(divided);
+  }
+
+  results.push_back(emitModTerm(strides.back()));
+
+  rewriter.replaceOp(op, results);
+  return success();
+}
+
+LogicalResult affine::lowerAffineLinearizeIndexOp(RewriterBase &rewriter,
+                                                  AffineLinearizeIndexOp op) {
+  // Should be folded away, included here for safety.
+  if (op.getMultiIndex().empty()) {
+    rewriter.replaceOpWithNewOp<arith::ConstantIndexOp>(op, 0);
+    return success();
+  }
+
+  Location loc = op.getLoc();
+  ValueRange multiIndex = op.getMultiIndex();
+  size_t numIndexes = multiIndex.size();
+  ArrayRef<int64_t> staticBasis = op.getStaticBasis();
+  if (numIndexes == staticBasis.size())
+    staticBasis = staticBasis.drop_front();
+
+  SmallVector<Value> strides =
+      computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis,
+                     /*knownNonNegative=*/op.getDisjoint());
+  SmallVector<std::pair<Value, int64_t>> scaledValues;
+  scaledValues.reserve(numIndexes);
+
+  // Note: strides doesn't contain a value for the final element (stride 1)
+  // and everything else lines up. We use the "mutable" accessor so we can get
+  // our hands on an `OpOperand&` for the loop invariant counting function.
+  for (auto [stride, idxOp] :
+       llvm::zip_equal(strides, llvm::drop_end(op.getMultiIndexMutable()))) {
+    Value scaledIdx = rewriter.create<arith::MulIOp>(
+        loc, idxOp.get(), stride, arith::IntegerOverflowFlags::nsw);
+    int64_t numHoistableLoops = numEnclosingInvariantLoops(idxOp);
+    scaledValues.emplace_back(scaledIdx, numHoistableLoops);
+  }
+  scaledValues.emplace_back(
+      multiIndex.back(),
+      numEnclosingInvariantLoops(op.getMultiIndexMutable()[numIndexes - 1]));
+
+  // Sort by how many enclosing loops there are, ties implicitly broken by
+  // size of the stride.
+  llvm::stable_sort(scaledValues,
+                    [&](auto l, auto r) { return l.second > r.second; });
+
+  Value result = scaledValues.front().first;
+  for (auto [scaledValue, numHoistableLoops] : llvm::drop_begin(scaledValues)) {
+    std::ignore = numHoistableLoops;
+    result = rewriter.create<arith::AddIOp>(loc, result, scaledValue,
+                                            arith::IntegerOverflowFlags::nsw);
+  }
+  rewriter.replaceOp(op, result);
+  return success();
+}
+
 namespace {
-/// Lowers `affine.delinearize_index` into a sequence of division and remainder
-/// operations.
 struct LowerDelinearizeIndexOps
     : public OpRewritePattern<AffineDelinearizeIndexOp> {
   using OpRewritePattern<AffineDelinearizeIndexOp>::OpRewritePattern;
   LogicalResult matchAndRewrite(AffineDelinearizeIndexOp op,
                                 PatternRewriter &rewriter) const override {
-    Location loc = op.getLoc();
-    Value linearIdx = op.getLinearIndex();
-    unsigned numResults = op.getNumResults();
-    ArrayRef<int64_t> staticBasis = op.getStaticBasis();
-    if (numResults == staticBasis.size())
-      staticBasis = staticBasis.drop_front();
-
-    if (numResults == 1) {
-      rewriter.replaceOp(op, linearIdx);
-      return success();
-    }
-
-    SmallVector<Value> results;
-    results.reserve(numResults);
-    SmallVector<Value> strides =
-        computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis,
-                       /*knownNonNegative=*/true);
-
-    Value zero = rewriter.createOrFold<arith::ConstantIndexOp>(loc, 0);
-
-    Value initialPart =
-        rewriter.create<arith::FloorDivSIOp>(loc, linearIdx, strides.front());
-    results.push_back(initialPart);
-
-    auto emitModTerm = [&](Value stride) -> Value {
-      Value remainder = rewriter.create<arith::RemSIOp>(loc, linearIdx, stride);
-      Value remainderNegative = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::slt, remainder, zero);
-      // If the correction is relevant, this term is <= stride, which is known
-      // to be positive in `index`. Otherwise, while 2 * stride might overflow,
-      // this branch won't be taken, so the risk of `poison` is fine.
-      Value corrected = rewriter.create<arith::AddIOp>(
-          loc, remainder, stride, arith::IntegerOverflowFlags::nsw);
-      Value mod = rewriter.create<arith::SelectOp>(loc, remainderNegative,
-                                                   corrected, remainder);
-      return mod;
-    };
-
-    // Generate all the intermediate parts
-    for (size_t i = 0, e = strides.size() - 1; i < e; ++i) {
-      Value thisStride = strides[i];
-      Value nextStride = strides[i + 1];
-      Value modulus = emitModTerm(thisStride);
-      // We know both inputs are positive, so floorDiv == div.
-      // This could potentially be a divui, but it's not clear if that would
-      // cause issues.
-      Value divided = rewriter.create<arith::DivSIOp>(loc, modulus, nextStride);
-      results.push_back(divided);
-    }
-
-    results.push_back(emitModTerm(strides.back()));
-
-    rewriter.replaceOp(op, results);
-    return success();
+    return affine::lowerAffineDelinearizeIndexOp(rewriter, op);
   }
 };
 
-/// Lowers `affine.linearize_index` into a sequence of multiplications and
-/// additions. Make a best effort to sort the input indices so that
-/// the most loop-invariant terms are at the left of the additions
-/// to enable loop-invariant code motion.
 struct LowerLinearizeIndexOps final : OpRewritePattern<AffineLinearizeIndexOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(AffineLinearizeIndexOp op,
                                 PatternRewriter &rewriter) const override {
-    // Should be folded away, included here for safety.
-    if (op.getMultiIndex().empty()) {
-      rewriter.replaceOpWithNewOp<arith::ConstantIndexOp>(op, 0);
-      return success();
-    }
-
-    Location loc = op.getLoc();
-    ValueRange multiIndex = op.getMultiIndex();
-    size_t numIndexes = multiIndex.size();
-    ArrayRef<int64_t> staticBasis = op.getStaticBasis();
-    if (numIndexes == staticBasis.size())
-      staticBasis = staticBasis.drop_front();
-
-    SmallVector<Value> strides =
-        computeStrides(loc, rewriter, op.getDynamicBasis(), staticBasis,
-                       /*knownNonNegative=*/op.getDisjoint());
-    SmallVector<std::pair<Value, int64_t>> scaledValues;
-    scaledValues.reserve(numIndexes);
-
-    // Note: strides doesn't contain a value for the final element (stride 1)
-    // and everything else lines up. We use the "mutable" accessor so we can get
-    // our hands on an `OpOperand&` for the loop invariant counting function.
-    for (auto [stride, idxOp] :
-         llvm::zip_equal(strides, llvm::drop_end(op.getMultiIndexMutable()))) {
-      Value scaledIdx = rewriter.create<arith::MulIOp>(
-          loc, idxOp.get(), stride, arith::IntegerOverflowFlags::nsw);
-      int64_t numHoistableLoops = numEnclosingInvariantLoops(idxOp);
-      scaledValues.emplace_back(scaledIdx, numHoistableLoops);
-    }
-    scaledValues.emplace_back(
-        multiIndex.back(),
-        numEnclosingInvariantLoops(op.getMultiIndexMutable()[numIndexes - 1]));
-
-    // Sort by how many enclosing loops there are, ties implicitly broken by
-    // size of the stride.
-    llvm::stable_sort(scaledValues,
-                      [&](auto l, auto r) { return l.second > r.second; });
-
-    Value result = scaledValues.front().first;
-    for (auto [scaledValue, numHoistableLoops] :
-         llvm::drop_begin(scaledValues)) {
-      std::ignore = numHoistableLoops;
-      result = rewriter.create<arith::AddIOp>(loc, result, scaledValue,
-                                              arith::IntegerOverflowFlags::nsw);
-    }
-    rewriter.replaceOp(op, result);
-    return success();
+    return affine::lowerAffineLinearizeIndexOp(rewriter, op);
   }
 };
 

From 7c25db3fbfc63f76b270940e341f267e497e95d9 Mon Sep 17 00:00:00 2001
From: Xu Zhang <simonzgx@gmail.com>
Date: Mon, 16 Jun 2025 22:55:26 +0800
Subject: [PATCH 588/851] [DAG] Fold (and X, (add (not Y), Z)) -> (and X, (not
 (sub Y, Z))). (#141476)

Fixes #140639

---------

Co-authored-by: Simon Pilgrim <llvm-dev@redking.me.uk>
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +++++
 .../AArch64/aarch64-bitwisenot-fold.ll        | 98 +++++++++++++++++++
 .../test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll | 10 +-
 3 files changed, 126 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5d62ded171f4f..f6d811ddba8ab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -396,6 +396,8 @@ namespace {
     bool PromoteLoad(SDValue Op);
 
     SDValue foldShiftToAvg(SDNode *N);
+    // Fold `a bitwiseop (~b +/- c)` -> `a bitwiseop ~(b -/+ c)`
+    SDValue foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT);
 
     SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                 SDValue RHS, SDValue True, SDValue False,
@@ -7541,6 +7543,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       return DAG.getNode(ISD::AND, DL, VT, X,
                          DAG.getNOT(DL, DAG.getNode(Opc, DL, VT, Y, Z), VT));
 
+  // Fold (and X, (add (not Y), Z)) -> (and X, (not (sub Y, Z)))
+  // Fold (and X, (sub (not Y), Z)) -> (and X, (not (add Y, Z)))
+  if (TLI.hasAndNot(SDValue(N, 0)))
+    if (SDValue Folded = foldBitwiseOpWithNeg(N, DL, VT))
+      return Folded;
+
   // Fold (and (srl X, C), 1) -> (srl X, BW-1) for signbit extraction
   // If we are shifting down an extended sign bit, see if we can simplify
   // this to shifting the MSB directly to expose further simplifications.
@@ -11652,6 +11660,22 @@ SDValue DAGCombiner::foldShiftToAvg(SDNode *N) {
   return DAG.getNode(FloorISD, SDLoc(N), N->getValueType(0), {A, B});
 }
 
+SDValue DAGCombiner::foldBitwiseOpWithNeg(SDNode *N, const SDLoc &DL, EVT VT) {
+  unsigned Opc = N->getOpcode();
+  SDValue X, Y, Z;
+  if (sd_match(
+          N, m_BitwiseLogic(m_Value(X), m_Add(m_Not(m_Value(Y)), m_Value(Z)))))
+    return DAG.getNode(Opc, DL, VT, X,
+                       DAG.getNOT(DL, DAG.getNode(ISD::SUB, DL, VT, Y, Z), VT));
+
+  if (sd_match(N, m_BitwiseLogic(m_Value(X), m_Sub(m_OneUse(m_Not(m_Value(Y))),
+                                                   m_Value(Z)))))
+    return DAG.getNode(Opc, DL, VT, X,
+                       DAG.getNOT(DL, DAG.getNode(ISD::ADD, DL, VT, Y, Z), VT));
+
+  return SDValue();
+}
+
 /// Generate Min/Max node
 SDValue DAGCombiner::combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                          SDValue RHS, SDValue True,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll b/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll
new file mode 100644
index 0000000000000..5fbf38b2560d4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-bitwisenot-fold.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64-linux | FileCheck %s
+
+define i8 @andnot_add_with_neg_i8(i8 %a0, i8 %a1) {
+; CHECK-LABEL: andnot_add_with_neg_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i8 %a0, -1
+  %sum = add i8 %not, %a1
+  %and = and i8 %sum, %a0
+  ret i8 %and
+}
+
+define i8 @andnot_sub_with_neg_i8(i8 %a0, i8 %a1) {
+; CHECK-LABEL: andnot_sub_with_neg_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i8 %a0, -1
+  %diff = sub i8 %not, %a1
+  %and = and i8 %diff, %a0
+  ret i8 %and
+}
+
+define i16 @andnot_add_with_neg_i16(i16 %a0, i16 %a1) {
+; CHECK-LABEL: andnot_add_with_neg_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i16 %a0, -1
+  %sum = add i16 %not, %a1
+  %and = and i16 %sum, %a0
+  ret i16 %and
+}
+
+define i16 @andnot_sub_with_neg_i16(i16 %a0, i16 %a1) {
+; CHECK-LABEL: andnot_sub_with_neg_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i16 %a0, -1
+  %diff = sub i16 %not, %a1
+  %and = and i16 %diff, %a0
+  ret i16 %and
+}
+
+define i32 @andnot_add_with_neg_i32(i32 %a0, i32 %a1) {
+; CHECK-LABEL: andnot_add_with_neg_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i32 %a0, -1
+  %sum = add i32 %not, %a1
+  %and = and i32 %sum, %a0
+  ret i32 %and
+}
+
+define i32 @andnot_sub_with_neg_i32(i32 %a0, i32 %a1) {
+; CHECK-LABEL: andnot_sub_with_neg_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add w8, w0, w1
+; CHECK-NEXT:    bic w0, w0, w8
+; CHECK-NEXT:    ret
+  %not = xor i32 %a0, -1
+  %diff = sub i32 %not, %a1
+  %and = and i32 %diff, %a0
+  ret i32 %and
+}
+
+define i64 @andnot_add_with_neg_i64(i64 %a0, i64 %a1) {
+; CHECK-LABEL: andnot_add_with_neg_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, x1
+; CHECK-NEXT:    bic x0, x0, x8
+; CHECK-NEXT:    ret
+  %not = xor i64 %a0, -1
+  %sum = add i64 %not, %a1
+  %and = and i64 %sum, %a0
+  ret i64 %and
+}
+
+define i64 @andnot_sub_with_neg_i64(i64 %a0, i64 %a1) {
+; CHECK-LABEL: andnot_sub_with_neg_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, x1
+; CHECK-NEXT:    bic x0, x0, x8
+; CHECK-NEXT:    ret
+  %not = xor i64 %a0, -1
+  %diff = sub i64 %not, %a1
+  %and = and i64 %diff, %a0
+  ret i64 %and
+}
diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
index e564d7bddea6f..27be02c50f1c7 100644
--- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
+++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll
@@ -885,9 +885,8 @@ define i64 @test_cttz_i64(i64 %a) nounwind {
 define i8 @test_not_cttz_i8(i8 %a) nounwind {
 ; LA32R-LABEL: test_not_cttz_i8:
 ; LA32R:       # %bb.0:
-; LA32R-NEXT:    nor $a1, $a0, $zero
-; LA32R-NEXT:    addi.w $a1, $a1, -1
-; LA32R-NEXT:    and $a0, $a0, $a1
+; LA32R-NEXT:    addi.w $a1, $a0, 1
+; LA32R-NEXT:    andn $a0, $a0, $a1
 ; LA32R-NEXT:    srli.w $a1, $a0, 1
 ; LA32R-NEXT:    andi $a1, $a1, 85
 ; LA32R-NEXT:    sub.w $a0, $a0, $a1
@@ -921,9 +920,8 @@ define i8 @test_not_cttz_i8(i8 %a) nounwind {
 define i16 @test_not_cttz_i16(i16 %a) nounwind {
 ; LA32R-LABEL: test_not_cttz_i16:
 ; LA32R:       # %bb.0:
-; LA32R-NEXT:    nor $a1, $a0, $zero
-; LA32R-NEXT:    addi.w $a1, $a1, -1
-; LA32R-NEXT:    and $a0, $a0, $a1
+; LA32R-NEXT:    addi.w $a1, $a0, 1
+; LA32R-NEXT:    andn $a0, $a0, $a1
 ; LA32R-NEXT:    srli.w $a1, $a0, 1
 ; LA32R-NEXT:    lu12i.w $a2, 5
 ; LA32R-NEXT:    ori $a2, $a2, 1365

From f2734aa25e808e8c1967f7125fdea6c8b2dab9e1 Mon Sep 17 00:00:00 2001
From: Acthinks Yang <yangzhh@mail.ustc.edu.cn>
Date: Mon, 16 Jun 2025 23:05:30 +0800
Subject: [PATCH 589/851] [InstCombine] fold icmp with add/sub instructions
 having the same operands (#143241)

Closes #143211.
---
 .../InstCombine/InstCombineCompares.cpp       |  24 ++++
 .../Transforms/InstCombine/icmp-subadd.ll     | 111 ++++++++++++++++++
 2 files changed, 135 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/icmp-subadd.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c112fae351817..084e7fbaa268a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -7728,6 +7728,30 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
     }
   }
 
+  // icmp slt (sub nsw x, y), (add nsw x, y)  -->  icmp sgt y, 0
+  // icmp ult (sub nuw x, y), (add nuw x, y)  -->  icmp ugt y, 0
+  // icmp eq (sub nsw/nuw x, y), (add nsw/nuw x, y)   -->  icmp eq y, 0
+  {
+    Value *A, *B;
+    CmpPredicate CmpPred;
+    if (match(&I, m_c_ICmp(CmpPred, m_Sub(m_Value(A), m_Value(B)),
+                           m_c_Add(m_Deferred(A), m_Deferred(B))))) {
+      auto *I0 = cast<OverflowingBinaryOperator>(Op0);
+      auto *I1 = cast<OverflowingBinaryOperator>(Op1);
+      bool I0NUW = I0->hasNoUnsignedWrap();
+      bool I1NUW = I1->hasNoUnsignedWrap();
+      bool I0NSW = I0->hasNoSignedWrap();
+      bool I1NSW = I1->hasNoSignedWrap();
+      if ((ICmpInst::isUnsigned(Pred) && I0NUW && I1NUW) ||
+          (ICmpInst::isSigned(Pred) && I0NSW && I1NSW) ||
+          (ICmpInst::isEquality(Pred) &&
+           ((I0NUW || I0NSW) && (I1NUW || I1NSW)))) {
+        return new ICmpInst(CmpPredicate::getSwapped(CmpPred), B,
+                            ConstantInt::get(Op0->getType(), 0));
+      }
+    }
+  }
+
   // Try to optimize equality comparisons against alloca-based pointers.
   if (Op0->getType()->isPointerTy() && I.isEquality()) {
     assert(Op1->getType()->isPointerTy() &&
diff --git a/llvm/test/Transforms/InstCombine/icmp-subadd.ll b/llvm/test/Transforms/InstCombine/icmp-subadd.ll
new file mode 100644
index 0000000000000..fd7e1250d893f
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/icmp-subadd.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i1 @test-same-operands-sub-add-nsw-icmp-sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-sgt(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp sgt i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-same-operands-sub-add-nsw-icmp-slt(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-slt(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp slt i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-same-operands-sub-add-nsw-icmp-sle(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-sle(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[B]], -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp sle i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-same-operands-sub-add-nsw-nuw-icmp-eq(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-nuw-icmp-eq(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nuw i8 %a, %b
+  %cmp = icmp eq i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-same-operands-sub-add-nsw-icmp-eq(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-same-operands-sub-add-nsw-icmp-eq(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp eq i8 %sub, %add
+  ret i1 %cmp
+}
+
+define i1 @test-add-sub-nsw-icmp-sgt(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-add-sub-nsw-icmp-sgt(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 [[B]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nsw i8 %a, %b
+  %add = add nsw i8 %a, %b
+  %cmp = icmp sgt i8 %add, %sub
+  ret i1 %cmp
+}
+
+define i1 @test-add-sub-nuw-icmp-uge(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-add-sub-nuw-icmp-uge(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
+  %sub = sub nuw i8 %a, %b
+  %add = add nuw i8 %a, %b
+  %cmp = icmp uge i8 %add, %sub
+  ret i1 %cmp
+}
+
+; Check not folded
+define i1 @test-add-sub-nuw-icmp-sge(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-add-sub-nuw-icmp-sge(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    [[SUB:%.*]] = sub nuw i8 [[A]], [[B]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i8 [[A]], [[B]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i8 [[ADD]], [[SUB]]
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %sub = sub nuw i8 %a, %b
+  %add = add nuw i8 %a, %b
+  %cmp = icmp sge i8 %add, %sub
+  ret i1 %cmp
+}
+
+define i1 @test-add-swap-sub-nuw-icmp-uge(i8 %a, i8 %b) {
+; CHECK-LABEL: define i1 @test-add-swap-sub-nuw-icmp-uge(
+; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
+  %sub = sub nuw i8 %a, %b
+  %add = add nuw i8 %b, %a
+  %cmp = icmp uge i8 %add, %sub
+  ret i1 %cmp
+}

From 4692f0d3448e32381a2b21c7359c7daed07a8850 Mon Sep 17 00:00:00 2001
From: zGoldthorpe <Zach.Goldthorpe@amd.com>
Date: Mon, 16 Jun 2025 09:06:18 -0600
Subject: [PATCH 590/851] Revert "[AMDGPU] Extended vector promotion to
 aggregate types." (#144366)

Reverts llvm/llvm-project#143784

Patch fails some internal tests. Will investigate more thoroughly before
attempting to remerge.
---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 106 +++----
 .../CodeGen/AMDGPU/promote-alloca-structs.ll  | 286 ------------------
 2 files changed, 41 insertions(+), 351 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e90a3a275f67c..700dc87d2f821 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -818,39 +818,6 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
   return I;
 }
 
-/// Get the underlying type of a homogeneous aggregate type, or nullptr if the
-/// type is non-homogeneous.
-static Type *getHomogeneousType(Type *Ty) {
-  Type *ElemTy = nullptr;
-  SmallVector<Type *> WorkList;
-  WorkList.push_back(Ty);
-  while (!WorkList.empty()) {
-    Type *CurTy = WorkList.pop_back_val();
-
-    // Check if the current type is an aggregate type.
-    if (auto *VectorTy = dyn_cast<FixedVectorType>(CurTy)) {
-      WorkList.push_back(VectorTy->getElementType());
-      continue;
-    }
-    if (auto *ArrayTy = dyn_cast<ArrayType>(CurTy)) {
-      WorkList.push_back(ArrayTy->getElementType());
-      continue;
-    }
-    if (auto *StructTy = dyn_cast<StructType>(CurTy)) {
-      WorkList.append(StructTy->element_begin(), StructTy->element_end());
-      continue;
-    }
-
-    // If not, it must be the same as all other non-aggregate types.
-    if (!ElemTy)
-      ElemTy = CurTy;
-    else if (ElemTy != CurTy)
-      return nullptr;
-  }
-
-  return ElemTy;
-}
-
 // FIXME: Should try to pick the most likely to be profitable allocas first.
 bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
@@ -861,42 +828,42 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   }
 
   Type *AllocaTy = Alloca.getAllocatedType();
-  Type *ElemTy = getHomogeneousType(AllocaTy);
-
-  if (!ElemTy || !VectorType::isValidElementType(ElemTy)) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
-    return false;
-  }
+  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
+  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
+    uint64_t NumElems = 1;
+    Type *ElemTy;
+    do {
+      NumElems *= ArrayTy->getNumElements();
+      ElemTy = ArrayTy->getElementType();
+    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
+
+    // Check for array of vectors
+    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
+    if (InnerVectorTy) {
+      NumElems *= InnerVectorTy->getNumElements();
+      ElemTy = InnerVectorTy->getElementType();
+    }
 
-  unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy);
-  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
-                         "does not match the type's size\n");
-    return false;
-  }
-  unsigned ElementSize = ElementSizeInBits / 8;
-  if (ElementSize == 0) {
-    LLVM_DEBUG(dbgs() << "  Cannot create vector of zero-sized elements\n");
-    return false;
+    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
+      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
+      if (ElementSize > 0) {
+        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+        // Expand vector if required to match padding of inner type,
+        // i.e. odd size subvectors.
+        // Storage size of new vector must match that of alloca for correct
+        // behaviour of byte offsets and GEP computation.
+        if (NumElems * ElementSize != AllocaSize)
+          NumElems = AllocaSize / ElementSize;
+        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+          VectorTy = FixedVectorType::get(ElemTy, NumElems);
+      }
+    }
   }
 
-  // Calculate the size of the corresponding vector, accounting for padding of
-  // inner types, e.g., odd-sized subvectors. Storage size of new vector must
-  // match that of alloca for correct behaviour of byte offsets and GEP
-  // computation.
-  unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
-  unsigned NumElems = AllocaSize / ElementSize;
-  if (NumElems == 0) {
-    LLVM_DEBUG(dbgs() << "  Cannot vectorize an empty aggregate type\n");
-    return false;
-  }
-  if (NumElems * ElementSize != AllocaSize) {
-    LLVM_DEBUG(
-        dbgs() << "  Cannot convert type into vector of the same size\n");
+  if (!VectorTy) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
     return false;
   }
-  auto *VectorTy = FixedVectorType::get(ElemTy, NumElems);
-  assert(VectorTy && "Failed to create vector type.");
 
   const unsigned MaxElements =
       (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
@@ -928,6 +895,15 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
+  Type *VecEltTy = VectorTy->getElementType();
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return false;
+  }
+  unsigned ElementSize = ElementSizeInBits / 8;
+  assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
 
@@ -967,7 +943,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
deleted file mode 100644
index 1cdd027fef89d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll
+++ /dev/null
@@ -1,286 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 %s | FileCheck %s
-
-define i8 @test_v4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_v4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca <4 x i8>, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [4 x i8], align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a2v4i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a2v4i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [2 x <4 x i8>], align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a2v3i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a2v3i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [2 x <3 x i8>], align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a2a4i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a2a4i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [2 x [4 x i8]], align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_a2a3i8(i48 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_a2a3i8(
-; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <6 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca [2 x [3 x i8]], align 4, addrspace(5)
-  store i48 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s1v4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s1v4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {<4 x i8>}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s1a4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s1a4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {[4 x i8]}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s4i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s4i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2v4i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2v4i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2v2i8v4i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2v2i8v4i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2v2i8v3i8(i64 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2v2i8v3i8(
-; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <8 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5)
-  store i64 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2s2i8s4i8(i48 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2s2i8s4i8(
-; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <6 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5)
-  store i48 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s2s2i8s3i8(i40 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s2s2i8s3i8(
-; CHECK-SAME: i40 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <5 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i40 [[BITS]] to <5 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <5 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5)
-  store i40 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s3i8s1i8v2i8(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s3i8s1i8v2i8(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <4 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-define i8 @test_s3i8i8s0(i16 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_s3i8i8s0(
-; CHECK-SAME: i16 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = freeze <2 x i8> poison
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16 [[BITS]] to <2 x i8>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 [[IDX]]
-; CHECK-NEXT:    ret i8 [[TMP2]]
-;
-  %stack = alloca {i8, i8, {}}, align 4, addrspace(5)
-  store i16 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-; heterogeneous element types are not supported
-define i8 @test_heterogeneous(i32 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_heterogeneous(
-; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5)
-; CHECK-NEXT:    store i32 [[BITS]], ptr addrspace(5) [[STACK]], align 4
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
-; CHECK-NEXT:    ret i8 [[VAL]]
-;
-  %stack = alloca {i8, i8, i16}, align 4, addrspace(5)
-  store i32 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}
-
-; empty types are not supported
-define void @test_empty() {
-; CHECK-LABEL: define void @test_empty() {
-; CHECK-NEXT:    [[STACK:%.*]] = alloca {}, align 4, addrspace(5)
-; CHECK-NEXT:    ret void
-;
-  %stack = alloca {}, align 4, addrspace(5)
-  ret void
-}
-
-; singleton types are not supported
-define i8 @test_singleton(i8 %bits, i64 %idx) {
-; CHECK-LABEL: define i8 @test_singleton(
-; CHECK-SAME: i8 [[BITS:%.*]], i64 [[IDX:%.*]]) {
-; CHECK-NEXT:    [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5)
-; CHECK-NEXT:    store i8 [[BITS]], ptr addrspace(5) [[STACK]], align 1
-; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1
-; CHECK-NEXT:    ret i8 [[VAL]]
-;
-  %stack = alloca {i8, {}}, align 4, addrspace(5)
-  store i8 %bits, ptr addrspace(5) %stack
-  %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx
-  %val = load i8, ptr addrspace(5) %ptr, align 1
-  ret i8 %val
-}

From 38fa7533fbac525198206200cf2caf04071fcdb1 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 11:12:59 -0400
Subject: [PATCH 591/851] Fix diagnostic documentation build errors

---
 clang/include/clang/Basic/DiagnosticGroups.td | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 38b4f581fa5c9..36fa3227fd6a6 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -815,19 +815,22 @@ changes to one object won't affect the others, the object's initializer will run
 once per copy, etc.
 
 Specifically, this warning fires when it detects an object which:
-1. Is defined as ``inline`` in a header file (so it might get compiled into multiple libaries), and
-2. Has external linkage (otherwise it's supposed to be duplicated), and
-3. Has hidden visibility (posix) or lacks a dllimport/dllexport attribute (windows).
+
+#. Is defined as ``inline`` in a header file (so it might get compiled into multiple libaries), and
+#. Has external linkage (otherwise it's supposed to be duplicated), and
+#. Has hidden visibility (posix) or lacks a dllimport/dllexport attribute (windows).
 
 As well as one of the following:
-1. The object is mutable, or
-2. The object's initializer definitely has side effects.
+
+#. The object is mutable, or
+#. The object's initializer definitely has side effects.
 
 The warning can be resolved by removing one of the conditions above. In rough
 order of preference, this may be done by:
-1. Marking the object ``const`` (if possible)
-2. Moving the object's definition to a source file
-3. Making the object visible using ``__attribute((visibility("default")))``,
+
+#. Marking the object ``const`` (if possible)
+#. Moving the object's definition to a source file
+#. Making the object visible using ``__attribute((visibility("default")))``,
    ``__declspec(dllimport)``, or ``__declspec(dllexport)``.
 
 When annotating an object with ``__declspec(dllimport)`` or ``__declspec(dllexport)``,

From f0373295e82315f95a97ce1b34c78ff46f475863 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0@owo.li>
Date: Mon, 16 Jun 2025 23:17:47 +0800
Subject: [PATCH 592/851] [clang][Parser] Fix crash on malformed using
 declaration in constexpr function (#144286)

---
 clang/docs/ReleaseNotes.rst                               | 1 +
 clang/lib/Parse/ParseDeclCXX.cpp                          | 4 ++++
 .../Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp  | 8 ++++++++
 3 files changed, 13 insertions(+)
 create mode 100644 clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 33ee8a53b5f37..59d9612268d30 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -704,6 +704,7 @@ Bug Fixes in This Version
 - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168)
 - Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216)
 - Fixed an infinite recursion when checking constexpr destructors. (#GH141789)
+- Fixed a crash when a malformed using declaration appears in a ``constexpr`` function. (#GH144264)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index f31c9265a0074..a5c76501c7c18 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -760,6 +760,10 @@ Parser::DeclGroupPtrTy Parser::ParseUsingDeclaration(
 
     Decl *AD = ParseAliasDeclarationAfterDeclarator(
         TemplateInfo, UsingLoc, D, DeclEnd, AS, Attrs, &DeclFromDeclSpec);
+
+    if (!AD)
+      return nullptr;
+
     return Actions.ConvertDeclToDeclGroup(AD, DeclFromDeclSpec);
   }
 
diff --git a/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp b/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp
new file mode 100644
index 0000000000000..94fa8c8c820a5
--- /dev/null
+++ b/clang/test/Parser/cxx-invalid-using-decl-in-constexpr-crash.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// issue144264
+constexpr void test() 
+{ 
+    using TT = struct T[; 
+    // expected-error@-1 {{expected expression}}
+}

From 6f1b5ed7e127b7806ae36783c6b9406434416c95 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 16 Jun 2025 16:00:17 +0100
Subject: [PATCH 593/851] [X86] LowerCONCAT_VECTORS - pull out repeated
 SDLoc(). NFC.

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 26 +++++++++++--------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b4670e270141f..290fad07be4f9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9614,13 +9614,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 // 256-bit AVX can use the vinsertf128 instruction
 // to create 256-bit vectors from two other 128-bit ones.
 // TODO: Detect subvector broadcast here instead of DAG combine?
-static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, const SDLoc &dl,
+                                      SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
-  SDLoc dl(Op);
   MVT ResVT = Op.getSimpleValueType();
-
-  assert((ResVT.is256BitVector() ||
-          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
+  assert((ResVT.is256BitVector() || ResVT.is512BitVector()) &&
+         "Value type must be 256-/512-bit wide");
 
   unsigned NumOperands = Op.getNumOperands();
   unsigned NumFreezeUndef = 0;
@@ -9688,13 +9687,11 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
 // zeros) of the result of a node that already zeros all upper bits of
 // k-register.
 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
-static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
+static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const SDLoc &dl,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG & DAG) {
-  SDLoc dl(Op);
   MVT ResVT = Op.getSimpleValueType();
   unsigned NumOperands = Op.getNumOperands();
-
   assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
          "Unexpected number of operands in CONCAT_VECTORS");
 
@@ -9766,19 +9763,18 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
 static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
+  SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
   if (VT.getVectorElementType() == MVT::i1)
-    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
-
-  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
-         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
-          Op.getNumOperands() == 4)));
+    return LowerCONCAT_VECTORSvXi1(Op, DL, Subtarget, DAG);
 
   // AVX can use the vinsertf128 instruction to create 256-bit vectors
   // from two other 128-bit ones.
-
   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
-  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
+  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
+         (VT.is512BitVector() &&
+          (Op.getNumOperands() == 2 || Op.getNumOperands() == 4)));
+  return LowerAVXCONCAT_VECTORS(Op, DL, DAG, Subtarget);
 }
 
 //===----------------------------------------------------------------------===//

From 404597061f974470e8bd1198e44d024fac8319a1 Mon Sep 17 00:00:00 2001
From: Pranav Bhandarkar <pranav.bhandarkar@amd.com>
Date: Mon, 16 Jun 2025 10:27:48 -0500
Subject: [PATCH 594/851] [OMPIRBuilder] - Make offloading input data persist
 for deferred target tasks (#133499)

When we offload to the target, the pointers to data used by the kernel
are passed in arrays created by `OMPIRBuilder`. These arrays of pointers
are allocated on the stack on the host. This is fine for the most part
because absent the `nowait` clause, the default behavior is that target
tasks are included tasks. That is, the host is blocked until the
offloaded target kernel is done. In turn, this means that the host's
stack frame is intact and accessing the array of pointers when
offloading is safe. However, when `nowait` is used on the `!$ omp
target` instance, then the target task is a deferred task meaning, the
generating task on the host does not have to wait for the target task
to finish. In such cases, it is very likely that the stack frame of the
function invoking the target call is wound up thereby leading to memory
access errors as shown below.
```
AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested allocation is not valid.
AMDGPU error: Error in hsa_amd_memory_pool_allocate: HSA_STATUS_ERROR_INVALID_ALLOCATION: The requested allocation is not valid. "PluginInterface" error: Failure to allocate device memory: Failed to allocate from memory manager
fort.cod.out: /llvm/llvm-project/offload/plugins-nextgen/common/src/PluginInterface.cpp:1434: Error llvm::omp::target::plugin::PinnedAllocationMapTy::lockMappedHostBuffer(void *, size_t): Assertion `HstPtr && "Invalid pointer"' failed.
Aborted (core dumped)
```
This PR implements support in `OMPIRBuilder` to store these arrays of
pointers in the task structure that is passed to the target task thereby
ensuring it is available to the target task when the target task is
eventually scheduled.

---------

Co-authored-by: Sergio Afonso <safonsof@amd.com>
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   2 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 335 ++++++++++++++----
 mlir/test/Target/LLVMIR/omptarget-depend.mlir |   3 +-
 .../Target/LLVMIR/omptarget-nowait-llvm.mlir  |  45 ++-
 mlir/test/Target/LLVMIR/omptarget-nowait.mlir |  70 ++++
 .../LLVMIR/omptargetdata-nowait-llvm.mlir     |  45 +--
 6 files changed, 387 insertions(+), 113 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-nowait.mlir

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e4b1241151e9d..93fb0d8e8d078 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -2507,7 +2507,7 @@ class OpenMPIRBuilder {
       TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
       OpenMPIRBuilder::InsertPointTy AllocaIP,
       const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
-      bool HasNoWait);
+      const TargetDataRTArgs &RTArgs, bool HasNoWait);
 
   /// Emit the arguments to be passed to the runtime library based on the
   /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ca3d8438654dc..c1f02b2b240de 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -6703,7 +6703,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
                             /*TargetTaskAllocaIP=*/{}));
       else
         cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
-                                /*Dependencies=*/{}, Info.HasNoWait));
+                                /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
     } else {
       Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
           omp::OMPRTL___tgt_target_data_begin_mapper);
@@ -7150,15 +7150,55 @@ static Expected<Function *> createOutlinedFunction(
                                     ValueReplacementMap);
   return Func;
 }
+/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
+/// of pointers containing shared data between the parent task and the created
+/// task.
+static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
+                                                  IRBuilderBase &Builder,
+                                                  Value *TaskWithPrivates,
+                                                  Type *TaskWithPrivatesTy) {
 
+  Type *TaskTy = OMPIRBuilder.Task;
+  LLVMContext &Ctx = Builder.getContext();
+  Value *TaskT =
+      Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
+  Value *Shareds = TaskT;
+  // TaskWithPrivatesTy can be one of the following
+  // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
+  //                                        %struct.privates }
+  // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
+  //
+  // In the former case, that is when  TaskWithPrivatesTy != TaskTy,
+  // its first member has to be the task descriptor. TaskTy is the type of the
+  // task descriptor. TaskT is the pointer to the task descriptor. Loading the
+  // first member of TaskT, gives us the pointer to shared data.
+  if (TaskWithPrivatesTy != TaskTy)
+    Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
+  return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+}
 /// Create an entry point for a target task with the following.
 /// It'll have the following signature
 /// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
 /// This function is called from emitTargetTask once the
 /// code to launch the target kernel has been outlined already.
-static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
-                                             IRBuilderBase &Builder,
-                                             CallInst *StaleCI) {
+/// NumOffloadingArrays is the number of offloading arrays that we need to copy
+/// into the task structure so that the deferred target task can access this
+/// data even after the stack frame of the generating task has been rolled
+/// back. Offloading arrays contain base pointers, pointers, sizes etc
+/// of the data that the target kernel will access. These in effect are the
+/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
+static Function *emitTargetTaskProxyFunction(
+    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
+    StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
+    const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
+
+  // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
+  // This is because PrivatesTy is the type of the structure in which
+  // we pass the offloading arrays to the deferred target task.
+  assert((!NumOffloadingArrays || PrivatesTy) &&
+         "PrivatesTy cannot be nullptr when there are offloadingArrays"
+         "to privatize");
+
   Module &M = OMPBuilder.M;
   // KernelLaunchFunction is the target launch function, i.e.
   // the function that sets up kernel arguments and calls
@@ -7185,34 +7225,48 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
   // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
   OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
                                     StaleCI->getIterator());
+
   LLVMContext &Ctx = StaleCI->getParent()->getContext();
+
   Type *ThreadIDTy = Type::getInt32Ty(Ctx);
   Type *TaskPtrTy = OMPBuilder.TaskPtr;
   Type *TaskTy = OMPBuilder.Task;
+
   auto ProxyFnTy =
       FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
                         /* isVarArg */ false);
   auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
                                   ".omp_target_task_proxy_func",
                                   Builder.GetInsertBlock()->getModule());
-  ProxyFn->getArg(0)->setName("thread.id");
-  ProxyFn->getArg(1)->setName("task");
+  Value *ThreadId = ProxyFn->getArg(0);
+  Value *TaskWithPrivates = ProxyFn->getArg(1);
+  ThreadId->setName("thread.id");
+  TaskWithPrivates->setName("task");
 
+  bool HasShareds = SharedArgsOperandNo > 0;
+  bool HasOffloadingArrays = NumOffloadingArrays > 0;
   BasicBlock *EntryBB =
       BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
   Builder.SetInsertPoint(EntryBB);
 
-  bool HasShareds = StaleCI->arg_size() > 1;
-  // TODO: This is a temporary assert to prove to ourselves that
-  // the outlined target launch function is always going to have
-  // atmost two arguments if there is any data shared between
-  // host and device.
-  assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
-         "StaleCI with shareds should have exactly two arguments.");
+  SmallVector<Value *> KernelLaunchArgs;
+  KernelLaunchArgs.reserve(StaleCI->arg_size());
+  KernelLaunchArgs.push_back(ThreadId);
+
+  if (HasOffloadingArrays) {
+    assert(TaskTy != TaskWithPrivatesTy &&
+           "If there are offloading arrays to pass to the target"
+           "TaskTy cannot be the same as TaskWithPrivatesTy");
+    Value *Privates =
+        Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
+    for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
+      KernelLaunchArgs.push_back(
+          Builder.CreateStructGEP(PrivatesTy, Privates, i));
+  }
 
-  Value *ThreadId = ProxyFn->getArg(0);
   if (HasShareds) {
-    auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+    auto *ArgStructAlloca =
+        dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
     assert(ArgStructAlloca &&
            "Unable to find the alloca instruction corresponding to arguments "
            "for extracted function");
@@ -7220,27 +7274,67 @@ static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
 
     AllocaInst *NewArgStructAlloca =
         Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
-    Value *TaskT = ProxyFn->getArg(1);
+
     Value *SharedsSize =
         Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
 
-    Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
-    LoadInst *LoadShared =
-        Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
+    LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
+        OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
 
     Builder.CreateMemCpy(
         NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
         LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
-
-    Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
-  } else {
-    Builder.CreateCall(KernelLaunchFunction, {ThreadId});
+    KernelLaunchArgs.push_back(NewArgStructAlloca);
   }
-
+  Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
   Builder.CreateRetVoid();
   return ProxyFn;
 }
+static Type *getOffloadingArrayType(Value *V) {
 
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
+    return GEP->getSourceElementType();
+  if (auto *Alloca = dyn_cast<AllocaInst>(V))
+    return Alloca->getAllocatedType();
+
+  llvm_unreachable("Unhandled Instruction type");
+  return nullptr;
+}
+// This function returns a struct that has at most two members.
+// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
+// descriptor. The second member, if needed, is a struct containing arrays
+// that need to be passed to the offloaded target kernel. For example,
+// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
+// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
+// respectively, then the types created  by this function are
+//
+// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
+// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
+//                                     %struct.privates }
+// %struct.task_with_privates is returned by this function.
+// If there aren't any offloading arrays to pass to the target kernel,
+// %struct.kmp_task_ompbuilder_t is returned.
+static StructType *
+createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
+                         ArrayRef<Value *> OffloadingArraysToPrivatize) {
+
+  if (OffloadingArraysToPrivatize.empty())
+    return OMPIRBuilder.Task;
+
+  SmallVector<Type *, 4> StructFieldTypes;
+  for (Value *V : OffloadingArraysToPrivatize) {
+    assert(V->getType()->isPointerTy() &&
+           "Expected pointer to array to privatize. Got a non-pointer value "
+           "instead");
+    Type *ArrayTy = getOffloadingArrayType(V);
+    assert(ArrayTy && "ArrayType cannot be nullptr");
+    StructFieldTypes.push_back(ArrayTy);
+  }
+  StructType *PrivatesStructTy =
+      StructType::create(StructFieldTypes, "struct.privates");
+  return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
+                            "struct.task_with_privates");
+}
 static Error emitTargetOutlinedFunction(
     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
     TargetRegionEntryInfo &EntryInfo,
@@ -7266,7 +7360,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
     TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
     OpenMPIRBuilder::InsertPointTy AllocaIP,
     const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
-    bool HasNoWait) {
+    const TargetDataRTArgs &RTArgs, bool HasNoWait) {
 
   // The following explains the code-gen scenario for the `target` directive. A
   // similar scneario is followed for other device-related directives (e.g.
@@ -7276,27 +7370,30 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   // When we arrive at this function, the target region itself has been
   // outlined into the function OutlinedFn.
   // So at ths point, for
-  // --------------------------------------------------
+  // --------------------------------------------------------------
   //   void user_code_that_offloads(...) {
-  //     omp target depend(..) map(from:a) map(to:b, c)
-  //        a = b + c
+  //     omp target depend(..) map(from:a) map(to:b) private(i)
+  //     do i = 1, 10
+  //        a(i) = b(i) + n
   //   }
   //
-  // --------------------------------------------------
+  // --------------------------------------------------------------
   //
   // we have
   //
-  // --------------------------------------------------
+  // --------------------------------------------------------------
   //
   //   void user_code_that_offloads(...) {
-  //     %.offload_baseptrs = alloca [3 x ptr], align 8
-  //     %.offload_ptrs = alloca [3 x ptr], align 8
-  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //     %.offload_baseptrs = alloca [2 x ptr], align 8
+  //     %.offload_ptrs = alloca [2 x ptr], align 8
+  //     %.offload_mappers = alloca [2 x ptr], align 8
   //     ;; target region has been outlined and now we need to
   //     ;; offload to it via a target task.
   //   }
-  //   void outlined_device_function(ptr a, ptr b, ptr c) {
-  //     *a = *b + *c
+  //   void outlined_device_function(ptr a, ptr b, ptr n) {
+  //     n = *n_ptr;
+  //     do i = 1, 10
+  //       a(i) = b(i) +  n
   //   }
   //
   // We have to now do the following
@@ -7309,33 +7406,59 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   // (iii) Create a task with the task entry point created in (ii)
   //
   // That is we create the following
-  //
+  //   struct task_with_privates {
+  //      struct kmp_task_ompbuilder_t task_struct;
+  //      struct privates {
+  //         [2 x ptr] ; baseptrs
+  //         [2 x ptr] ; ptrs
+  //         [2 x i64] ; sizes
+  //      }
+  //   }
   //   void user_code_that_offloads(...) {
-  //     %.offload_baseptrs = alloca [3 x ptr], align 8
-  //     %.offload_ptrs = alloca [3 x ptr], align 8
-  //     %.offload_mappers = alloca [3 x ptr], align 8
+  //     %.offload_baseptrs = alloca [2 x ptr], align 8
+  //     %.offload_ptrs = alloca [2 x ptr], align 8
+  //     %.offload_sizes = alloca [2 x i64], align 8
   //
   //     %structArg = alloca { ptr, ptr, ptr }, align 8
-  //     %strucArg[0] = %.offload_baseptrs
-  //     %strucArg[1] = %.offload_ptrs
-  //     %strucArg[2] = %.offload_mappers
-  //     proxy_target_task = @__kmpc_omp_task_alloc(...,
-  //                                               @.omp_target_task_proxy_func)
-  //     memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
+  //     %strucArg[0] = a
+  //     %strucArg[1] = b
+  //     %strucArg[2] = &n
+  //
+  //     target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
+  //                                               sizeof(kmp_task_ompbuilder_t),
+  //                                               sizeof(structArg),
+  //                                               @.omp_target_task_proxy_func,
+  //                                               ...)
+  //     memcpy(target_task_with_privates->task_struct->shareds, %structArg,
+  //            sizeof(structArg))
+  //     memcpy(target_task_with_privates->privates->baseptrs,
+  //            offload_baseptrs, sizeof(offload_baseptrs)
+  //     memcpy(target_task_with_privates->privates->ptrs,
+  //            offload_ptrs, sizeof(offload_ptrs)
+  //     memcpy(target_task_with_privates->privates->sizes,
+  //            offload_sizes, sizeof(offload_sizes)
   //     dependencies_array = ...
   //     ;; if nowait not present
   //     call @__kmpc_omp_wait_deps(..., dependencies_array)
   //     call @__kmpc_omp_task_begin_if0(...)
   //     call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
-  //     %proxy_target_task) call @__kmpc_omp_task_complete_if0(...)
+  //     %target_task_with_privates)
+  //     call @__kmpc_omp_task_complete_if0(...)
   //   }
   //
   //   define internal void @.omp_target_task_proxy_func(i32 %thread.id,
   //                                                     ptr %task) {
   //       %structArg = alloca {ptr, ptr, ptr}
-  //       %shared_data = load (getelementptr %task, 0, 0)
-  //       mempcy(%structArg, %shared_data, sizeof(structArg))
-  //       kernel_launch_function(%thread.id, %structArg)
+  //       %task_ptr = getelementptr(%task, 0, 0)
+  //       %shared_data = load (getelementptr %task_ptr, 0, 0)
+  //       mempcy(%structArg, %shared_data, sizeof(%structArg))
+  //
+  //       %offloading_arrays = getelementptr(%task, 0, 1)
+  //       %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
+  //       %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
+  //       %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
+  //       kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
+  //                              %offload_sizes, %structArg)
   //   }
   //
   //   We need the proxy function because the signature of the task entry point
@@ -7343,21 +7466,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   //   that of the kernel_launch function.
   //
   //   kernel_launch_function is generated by emitKernelLaunch and has the
-  //   always_inline attribute.
-  //   void kernel_launch_function(thread_id,
-  //                               structArg) alwaysinline {
+  //   always_inline attribute. For this example, it'll look like so:
+  //   void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
+  //                               %offload_sizes,  %structArg) alwaysinline {
   //       %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
-  //       offload_baseptrs = load(getelementptr structArg, 0, 0)
-  //       offload_ptrs = load(getelementptr structArg, 0, 1)
-  //       offload_mappers = load(getelementptr structArg, 0, 2)
+  //       ; load aggregated data from %structArg
   //       ; setup kernel_args using offload_baseptrs, offload_ptrs and
-  //       ; offload_mappers
+  //       ; offload_sizes
   //       call i32 @__tgt_target_kernel(...,
   //                                     outlined_device_function,
   //                                     ptr %kernel_args)
   //   }
-  //   void outlined_device_function(ptr a, ptr b, ptr c) {
-  //      *a = *b + *c
+  //   void outlined_device_function(ptr a, ptr b, ptr n) {
+  //     n = *n_ptr;
+  //     do i = 1, 10
+  //       a(i) = b(i) +  n
   //   }
   //
   BasicBlock *TargetTaskBodyBB =
@@ -7378,6 +7501,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
       Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
 
+  // Generate the task body which will subsequently be outlined.
   Builder.restoreIP(TargetTaskBodyIP);
   if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
     return Err;
@@ -7396,15 +7520,57 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
   emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
             /*IsFinished=*/true);
 
-  OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, HasNoWait,
-                      DeviceID](Function &OutlinedFn) mutable {
+  SmallVector<Value *, 2> OffloadingArraysToPrivatize;
+  bool NeedsTargetTask = HasNoWait && DeviceID;
+  if (NeedsTargetTask) {
+    for (auto *V :
+         {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
+          RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
+          RTArgs.SizesArray}) {
+      if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
+        OffloadingArraysToPrivatize.push_back(V);
+        OI.ExcludeArgsFromAggregate.push_back(V);
+      }
+    }
+  }
+  OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
+                      DeviceID, OffloadingArraysToPrivatize](
+                         Function &OutlinedFn) mutable {
     assert(OutlinedFn.hasOneUse() &&
            "there must be a single user for the outlined function");
 
     CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
-    bool HasShareds = StaleCI->arg_size() > 1;
 
-    Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
+    // The first argument of StaleCI is always the thread id.
+    // The next few arguments are the pointers to offloading arrays
+    // if any. (see OffloadingArraysToPrivatize)
+    // Finally, all other local values that are live-in into the outlined region
+    // end up in a structure whose pointer is passed as the last argument. This
+    // piece of data is passed in the "shared" field of the task structure. So,
+    // we know we have to pass shareds to the task if the number of arguments is
+    // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
+    // thread id. Further, for safety, we assert that the number of arguments of
+    // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
+    const unsigned int NumStaleCIArgs = StaleCI->arg_size();
+    bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
+    assert(
+        !HasShareds ||
+        NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2) &&
+            "Wrong number of arguments for StaleCI when shareds are present");
+    int SharedArgOperandNo =
+        HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
+
+    StructType *TaskWithPrivatesTy =
+        createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
+    StructType *PrivatesTy = nullptr;
+
+    if (!OffloadingArraysToPrivatize.empty())
+      PrivatesTy =
+          static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
+
+    Function *ProxyFn = emitTargetTaskProxyFunction(
+        *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
+        OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
 
     LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
                       << "\n");
@@ -7422,7 +7588,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
     // If `HasNoWait == true`, we call  @__kmpc_omp_target_task_alloc to provide
     // the DeviceID to the deferred task and also since
     // @__kmpc_omp_target_task_alloc creates an untied/async task.
-    bool NeedsTargetTask = HasNoWait && DeviceID;
     Function *TaskAllocFn =
         !NeedsTargetTask
             ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
@@ -7435,17 +7600,19 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
 
     // Argument - `sizeof_kmp_task_t` (TaskSize)
     // Tasksize refers to the size in bytes of kmp_task_t data structure
-    // including private vars accessed in task.
-    // TODO: add kmp_task_t_with_privates (privates)
-    Value *TaskSize =
-        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
+    // plus any other data to be passed to the target task, if any, which
+    // is packed into a struct. kmp_task_t and the struct so created are
+    // packed into a wrapper struct whose type is TaskWithPrivatesTy.
+    Value *TaskSize = Builder.getInt64(
+        M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
 
     // Argument - `sizeof_shareds` (SharedsSize)
     // SharedsSize refers to the shareds array size in the kmp_task_t data
     // structure.
     Value *SharedsSize = Builder.getInt64(0);
     if (HasShareds) {
-      auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+      auto *ArgStructAlloca =
+          dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
       assert(ArgStructAlloca &&
              "Unable to find the alloca instruction corresponding to arguments "
              "for extracted function");
@@ -7483,13 +7650,32 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
 
     TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
 
+    Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
     if (HasShareds) {
-      Value *Shareds = StaleCI->getArgOperand(1);
-      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
-      Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+      Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
+      Value *TaskShareds = loadSharedDataFromTaskDescriptor(
+          *this, Builder, TaskData, TaskWithPrivatesTy);
       Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
                            SharedsSize);
     }
+    if (!OffloadingArraysToPrivatize.empty()) {
+      Value *Privates =
+          Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
+      for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
+        Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
+        Type *ArrayType = getOffloadingArrayType(PtrToPrivatize);
+        assert(ArrayType && "ArrayType cannot be nullptr");
+
+        Type *ElementType = PrivatesTy->getElementType(i);
+        assert(ElementType == ArrayType &&
+               "ElementType should match ArrayType");
+
+        Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
+        Builder.CreateMemCpy(
+            Dst, Alignment, PtrToPrivatize, Alignment,
+            Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
+      }
+    }
 
     Value *DepArray = emitTaskDependencies(*this, Dependencies);
 
@@ -7635,9 +7821,10 @@ static void emitTargetCall(
         // Arguments that are intended to be directly forwarded to an
         // emitKernelLaunch call are pased as nullptr, since
         // OutlinedFnID=nullptr results in that call not being done.
+        OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
         return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
                                          /*RTLoc=*/nullptr, AllocaIP,
-                                         Dependencies, HasNoWait);
+                                         Dependencies, EmptyRTArgs, HasNoWait);
       }
       return EmitTargetCallFallbackCB(Builder.saveIP());
     }());
@@ -7649,6 +7836,7 @@ static void emitTargetCall(
   auto &&EmitTargetCallThen =
       [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
           OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
+    Info.HasNoWait = HasNoWait;
     OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
     OpenMPIRBuilder::TargetDataRTArgs RTArgs;
     if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
@@ -7726,7 +7914,8 @@ static void emitTargetCall(
       // explicit generation of the target task.
       if (RequiresOuterTargetTask)
         return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
-                                         Dependencies, HasNoWait);
+                                         Dependencies, KArgs.RTArgs,
+                                         Info.HasNoWait);
 
       return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
                                          EmitTargetCallFallbackCB, KArgs,
diff --git a/mlir/test/Target/LLVMIR/omptarget-depend.mlir b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
index f2948c6510138..0f2437639319a 100644
--- a/mlir/test/Target/LLVMIR/omptarget-depend.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-depend.mlir
@@ -126,7 +126,8 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
 // CHECK-DAG:  %[[DEP_ARRAY:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 
 // CHECK: %[[TASKDATA:.+]] = call ptr @__kmpc_omp_task_alloc({{.+}}, ptr @.omp_target_task_proxy_func)
-// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[TASKDATA]], align 8
+// CHECK: %[[SHARED_PTR:.+]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASKDATA]], i32 0, i32 0
+// CHECK: %[[SHARED_DATA:.+]] = load ptr, ptr %[[SHARED_PTR]], align 8
 // CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHARED_DATA]], ptr align 1 %[[STRUCTARG]], i64 24, i1 false)
 
 // CHECK: %[[DEP_INFO:.+]]  = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARRAY]], i64 0, i64 0
diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
index b487b31d54477..5eee7b7d7d976 100644
--- a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir
@@ -13,19 +13,48 @@ module attributes {omp.target_triples = ["dummy-target-triple"]} {
     }
     llvm.return
   }
+}
 
+// CHECK: %struct.[[TSK_WTH_PRVTS:.*]] = type { %struct.kmp_task_ompbuilder_t, %struct.[[PRVTS:.*]] }
+// CHECK: %struct.kmp_task_ompbuilder_t = type { ptr, ptr, i32, ptr, ptr }
+// CHECK: %struct.[[PRVTS]] = type { [1 x ptr], [1 x ptr] }
 
 // CHECK: define void @_QPfoo() {
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[BASEPTRS:.*]] = alloca [1 x ptr], align 8
+// CHECK: %[[PTRS:.*]] = alloca [1 x ptr], align 8
+// CHECK: %[[MAPPERS:.*]] = alloca [1 x ptr], align 8
+
+// CHECK: getelementptr inbounds [1 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: getelementptr inbounds [1 x ptr], ptr %[[PTRS]], i32 0, i32 0
+// CHECK: %[[BASEPTRS_GEP:.*]] = getelementptr inbounds [1 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[PTRS_GEP:.*]] = getelementptr inbounds [1 x ptr], ptr %[[PTRS]], i32 0, i32 0
 
-// CHECK:   %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
-// CHECK-SAME:     (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME:     @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
 
-// CHECK:   call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
+// CHECK: %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
+// CHECK-SAME: (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
+// CHECK-SAME: @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK: %[[TSK_PTR:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TSK_PTR]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHAREDS_PTR]], ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
+// CHECK: %[[VAL_50:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK]], i32 0, i32 1
+// CHECK: %[[VAL_51:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 0
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_51]], ptr align 1 %[[BASEPTRS_GEP]], i64 8, i1 false)
+// CHECK: %[[VAL_53:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 1
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_53]], ptr align 1 %[[PTRS_GEP]], i64 8, i1 false)
+// CHECK: call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
 // CHECK: }
 
+// CHECK: define internal void @[[WORKER:.*]](i32 {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}) {
 
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK:   call void @_QPfoo..omp_par(i32 %{{.*}}, ptr %{{.*}})
-// CHECK: }
-}
+// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %[[THREAD_ID_PARAM:.*]], ptr %[[TASK_DESC_PARAM:.*]]) {
+// CHECK: %[[PRIVATE_DATA:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 1
+// CHECK: %[[BASEPTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 0
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 1
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[TASK:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[STRUCTARG]], ptr align 1 %[[SHAREDS_PTR]], i64 8, i1 false)
+// CHECK:   call void @[[WORKER]](i32 %{{.*}}, ptr %{{.*}})
diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait.mlir
new file mode 100644
index 0000000000000..19333c44322f1
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-nowait.mlir
@@ -0,0 +1,70 @@
+// RUN: mlir-translate -mlir-to-llvmir %s 2>&1 | FileCheck %s
+
+module attributes {omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @launch_(%arg0: !llvm.ptr {fir.bindc_name = "a", llvm.nocapture}) {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x f64 {bindc_name = "n"} : (i64) -> !llvm.ptr
+    %2 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+    %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%2 : !llvm.ptr)  -> !llvm.ptr {name = ""}
+    %4 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>) map_clauses(to) capture(ByRef) members(%3 : [0] : !llvm.ptr) -> !llvm.ptr {name = "a"}
+    %5 = omp.map.info var_ptr(%1 : !llvm.ptr, f64) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "n"}
+    omp.target nowait map_entries(%4 -> %arg1, %5 -> %arg2, %3 -> %arg3 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+      %two_f = llvm.mlir.constant(2.000000e+00 : f64) : f64
+      %one_i = llvm.mlir.constant(1 : index) : i64
+      %6 = llvm.getelementptr %arg1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)>
+      %7 = llvm.load %6 : !llvm.ptr -> !llvm.ptr
+      %8 = llvm.getelementptr %7[%one_i] : (!llvm.ptr, i64) -> !llvm.ptr, i8
+      %9 = llvm.load %8 : !llvm.ptr -> f64
+      %10 = llvm.fmul %9, %two_f {fastmathFlags = #llvm.fastmath<contract>} : f64
+      llvm.store %10, %8 : f64, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: %struct.[[TSK_WTH_PRVTS:.*]] = type { %struct.kmp_task_ompbuilder_t, %struct.[[PRVTS:.*]] }
+// CHECK: %struct.kmp_task_ompbuilder_t = type { ptr, ptr, i32, ptr, ptr }
+// CHECK: %struct.[[PRVTS]] = type { [5 x ptr], [5 x ptr], [5 x i64] }
+
+// CHECK: define void @launch_(ptr captures(none) %0)
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[BASEPTRS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[PTRS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[MAPPERS:.*]] = alloca [5 x ptr], align 8
+// CHECK: %[[SIZES:.*]] = alloca [5 x i64], align 4
+
+
+// CHECK: %[[VAL_20:.*]] = getelementptr inbounds [5 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[BASEPTRS_GEP:.*]] = getelementptr inbounds [5 x ptr], ptr %[[BASEPTRS]], i32 0, i32 0
+// CHECK: %[[PTRS_GEP:.*]] = getelementptr inbounds [5 x ptr], ptr %[[PTRS]], i32 0, i32 0
+// CHECK: %[[SIZES_GEP:.*]] = getelementptr inbounds [5 x i64], ptr %[[SIZES]], i32 0, i32 0
+
+// CHECK: %[[GL_THRD_NUM:.*]] = call i32 @__kmpc_global_thread_num
+// CHECK: %[[TASK_DESC:.*]] = call ptr @__kmpc_omp_target_task_alloc(ptr @4, i32 {{.*}}, i32 0, i64 160, i64 16, ptr [[TGT_TSK_PRXY_FNC:.*]], i64 -1)
+// CHECK: %[[TSK_PTR:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TSK_PTR]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[SHAREDS_PTR]], ptr align 1 %[[STRUCTARG]], i64 16, i1 false)
+// CHECK: %[[VAL_50:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC]], i32 0, i32 1
+// CHECK: %[[VAL_51:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 0
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_51]], ptr align 1 %[[BASEPTRS_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_53:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 1
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_53]], ptr align 1 %[[PTRS_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_54:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[VAL_50]], i32 0, i32 2
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_54]], ptr align 1 %[[SIZES_GEP]], i64 40, i1 false)
+// CHECK: %[[VAL_55:.*]] = call i32 @__kmpc_omp_task(ptr @4, i32 %[[GL_THRD_NUM]], ptr %[[TASK_DESC]])
+
+// CHECK: define internal void @[[WORKER:.*]](i32 {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}) {
+
+// CHECK: define internal void [[TGT_TSK_PRXY_FNC]](i32 %[[THREAD_ID_PARAM:.*]], ptr %[[TASK_DESC_PARAM:.*]]) {
+// CHECK: %[[PRIVATE_DATA:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 1
+// CHECK: %[[BASEPTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 0
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 1
+// CHECK: %[[SIZES:.*]] = getelementptr inbounds nuw %struct.[[PRVTS]], ptr %[[PRIVATE_DATA]], i32 0, i32 2
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[TASK:.*]] = getelementptr inbounds nuw %struct.[[TSK_WTH_PRVTS]], ptr %[[TASK_DESC_PARAM]], i32 0, i32 0
+// CHECK: %[[SHAREDS:.*]] = getelementptr inbounds nuw %struct.kmp_task_ompbuilder_t, ptr %[[TASK]], i32 0, i32 0
+// CHECK: %[[SHAREDS_PTR:.*]] = load ptr, ptr %[[SHAREDS]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[STRUCTARG]], ptr align 1 %[[SHAREDS_PTR]], i64 16, i1 false)
+// CHECK: call void @[[WORKER]](i32 %[[THREAD_ID_PARAM]], ptr %[[BASEPTRS]], ptr %[[PTRS]], ptr %[[SIZES]], ptr %[[STRUCTARG]])
diff --git a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
index 8124d02ef2174..dba8c553aaca5 100644
--- a/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptargetdata-nowait-llvm.mlir
@@ -14,25 +14,20 @@ llvm.func @_QPopenmp_target_data_enter() {
 
 // CHECK:   %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
 // CHECK-SAME:     (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME:     @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME:     @[[TASK_PROXY_FUNC_ENTER:.*]], i64 {{.*}})
 
 // CHECK:   call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK:   %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK:   %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK:   %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK:   %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_ENTER:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
 // CHECK:  call void @__tgt_target_data_begin_nowait_mapper(
 // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
 // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK:   call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK:   call void @[[TASK_BODY_FUNC_ENTER]](i32 %{{.*}}, ptr %{{.*}}, ptr %{{.*}})
 // CHECK: }
 
 // -----
@@ -51,25 +46,20 @@ llvm.func @_QPopenmp_target_data_update() {
 
 // CHECK:   %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
 // CHECK-SAME:     (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME:     @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME:     @[[TASK_PROXY_FUNC_UPDATE:.*]], i64 {{.*}})
 
 // CHECK:   call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK:   %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK:   %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK:   %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK:   %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_UPDATE:.*]](i32 {{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
 // CHECK:  call void @__tgt_target_data_update_nowait_mapper(
 // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
 // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK:   call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK:   call void @[[TASK_BODY_FUNC_UPDATE]](i32 %{{.*}}, ptr %{{.*}})
 // CHECK: }
 
 // -----
@@ -88,23 +78,18 @@ llvm.func @_QPopenmp_target_data_exit() {
 
 // CHECK:   %[[TASK:.*]] = call ptr @__kmpc_omp_target_task_alloc
 // CHECK-SAME:     (ptr @{{.*}}, i32 %{{.*}}, i32 {{.*}}, i64 {{.*}}, i64 {{.*}}, ptr
-// CHECK-SAME:     @[[TASK_PROXY_FUNC:.*]], i64 {{.*}})
+// CHECK-SAME:     @[[TASK_PROXY_FUNC_EXIT:.*]], i64 {{.*}})
 
 // CHECK:   call i32 @__kmpc_omp_task(ptr {{.*}}, i32 %{{.*}}, ptr %[[TASK]])
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_BODY_FUNC:.*]](i32 %[[TID:.*]], ptr %[[TASK_ARG:.*]]) {
-// CHECK:   %[[OFFLOAD_BASE_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 0
-// CHECK:   %[[OFFLOAD_BASE_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_BASE_PTRS]], align 8
-// CHECK:   %[[OFFLOAD_PTRS:.*]] = getelementptr { ptr, ptr }, ptr %[[TASK_ARG]], i32 0, i32 1
-// CHECK:   %[[OFFLOAD_PTRS_VAL:.*]] = load ptr, ptr %[[OFFLOAD_PTRS]], align 8
-
+// CHECK: define internal void @[[TASK_BODY_FUNC_EXIT:.*]](i32 %{{.*}}, ptr %[[OFFLOAD_BASE_PTRS:.*]], ptr %[[OFFLOAD_PTRS:.*]]) {
 // CHECK:  call void @__tgt_target_data_end_nowait_mapper(
 // CHECK-SAME: ptr @{{.*}}, i64 -1, i32 1,
-// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS_VAL]], ptr %[[OFFLOAD_PTRS_VAL]],
+// CHECK-SAME: ptr %[[OFFLOAD_BASE_PTRS]], ptr %[[OFFLOAD_PTRS]],
 // CHECK-SAME: ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null, i32 0, ptr null, i32 0, ptr null)
 // CHECK: }
 
-// CHECK: define internal void @[[TASK_PROXY_FUNC]](i32 %{{.*}}, ptr %{{.*}}) {
-// CHECK:   call void @[[TASK_BODY_FUNC]](i32 %{{.*}}, ptr %{{.*}})
+// CHECK: define internal void @[[TASK_PROXY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}}) {
+// CHECK:   call void @[[TASK_BODY_FUNC_EXIT]](i32 %{{.*}}, ptr %{{.*}})
 // CHECK: }

From c7d85813fda88329979ae6c091d59a60833a9765 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 16 Jun 2025 16:31:20 +0100
Subject: [PATCH 595/851] [IndVars] Add tests showing missed simplifications.

---
 .../simplify-icmp-operands-order.ll           | 193 ++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll

diff --git a/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
new file mode 100644
index 0000000000000..b0dbbd5eaedf4
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p indvars -S %s | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+
+declare void @use(ptr)
+declare void @use.i64(i64)
+
+define i64 @test_simplifycompare_rhs_constant(i64 %num_bytes, ptr %src) {
+; CHECK-LABEL: define i64 @test_simplifycompare_rhs_constant(
+; CHECK-SAME: i64 [[NUM_BYTES:%.*]], ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP_NOT_I:%.*]] = icmp ne i64 [[NUM_BYTES]], 0
+; CHECK-NEXT:    [[COND_I:%.*]] = zext i1 [[CMP_NOT_I]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[C_0:%.*]] = icmp ule i64 [[IV]], [[COND_I]]
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[C_0]])
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[C_1:%.*]] = icmp eq i32 [[L]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    call void @use(ptr [[SRC]])
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    br label %[[LOOP]]
+;
+entry:
+  %cmp.not.i = icmp ne i64 %num_bytes, 0
+  %cond.i = zext i1 %cmp.not.i to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
+  %c.0 = icmp ule i64 %iv, %cond.i
+  tail call void @llvm.assume(i1 %c.0)
+  %gep.src = getelementptr i32, ptr %src, i64 %iv
+  %l = load i32, ptr %gep.src, align 4
+  %c.1 = icmp eq i32 %l, 0
+  br i1 %c.1, label %then, label %loop.latch
+
+then:
+  call void @use(ptr %src)
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  br label %loop
+}
+
+define void @test_simplifycompare_rhs_not_constant1() {
+; CHECK-LABEL: define void @test_simplifycompare_rhs_not_constant1() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[P:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[P]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 -8
+; CHECK-NEXT:    call void @use(ptr [[PTR_IV]])
+; CHECK-NEXT:    [[EC:%.*]] = icmp ult ptr [[PTR_IV_NEXT]], [[P]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p = alloca i64, align 8
+  br label %loop
+
+loop:
+  %ptr.iv = phi ptr [ %p, %entry ], [ %ptr.iv.next, %loop ]
+  %ptr.iv.next = getelementptr i8, ptr %ptr.iv, i64 -8
+  call void @use(ptr %ptr.iv)
+  %ec = icmp ult ptr %ptr.iv.next, %p
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_simplifycompare_rhs_not_constant2(i32 %x) {
+; CHECK-LABEL: define void @test_simplifycompare_rhs_not_constant2(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw i64 [[INDVARS_IV:%.*]], 2
+; CHECK-NEXT:    br label %[[OUTER_HEADER]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV]] = phi i64 [ [[INDVARS_IV_NEXT]], %[[OUTER_HEADER_LOOPEXIT]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[OUTER_HEADER_LOOPEXIT]] ]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[EXIT_LOOP_PREHEADER:.*]], label %[[OUTER_LATCH_PREHEADER:.*]]
+; CHECK:       [[EXIT_LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV_LCSSA:%.*]] = phi i64 [ [[INDVARS_IV]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[EXIT_LOOP:.*]]
+; CHECK:       [[OUTER_LATCH_PREHEADER]]:
+; CHECK-NEXT:    [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 2
+; CHECK-NEXT:    br label %[[OUTER_LATCH:.*]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ 0, %[[OUTER_LATCH_PREHEADER]] ], [ [[X]], %[[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i32 [[P]], [[IV_1_NEXT]]
+; CHECK-NEXT:    br i1 [[C_2]], label %[[OUTER_LATCH]], label %[[OUTER_HEADER_LOOPEXIT]]
+; CHECK:       [[EXIT_LOOP]]:
+; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_LCSSA]], %[[EXIT_LOOP_PREHEADER]] ], [ [[INDVARS_IV_NEXT2:%.*]], %[[EXIT_LOOP]] ]
+; CHECK-NEXT:    call void @use.i64(i64 [[INDVARS_IV1]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], 1
+; CHECK-NEXT:    br label %[[EXIT_LOOP]]
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %iv.1 = phi i32 [ 0, %entry ], [ %iv.1.next, %outer.latch ]
+  %c.1 = icmp sgt i32 %x, 0
+  br i1 %c.1, label %exit.loop, label %outer.latch.preheader
+
+outer.latch.preheader:
+  %iv.1.next = add nsw i32 %iv.1, 2
+  br label %outer.latch
+
+outer.latch:
+  %p = phi i32 [ 0, %outer.latch.preheader ], [ %x, %outer.latch ]
+  %c.2 = icmp ult i32 %p, %iv.1.next
+  br i1 %c.2, label %outer.latch, label %outer.header
+
+exit.loop:
+  %iv.2 = phi i32 [ %iv.1, %outer.header ], [ %iv.2.next, %exit.loop ]
+  %iv.2.ext = zext i32 %iv.2 to i64
+  call void @use.i64(i64 %iv.2.ext)
+  %iv.2.next = add nsw i32 %iv.2, 1
+  br label %exit.loop
+}
+
+define void @test_simplifycompare_rhs_addrec(i32 %x) {
+; CHECK-LABEL: define void @test_simplifycompare_rhs_addrec(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
+; CHECK:       [[OUTER_HEADER_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV:%.*]], 2
+; CHECK-NEXT:    br label %[[OUTER_HEADER]]
+; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV]] = phi i64 [ [[INDVARS_IV_NEXT]], %[[OUTER_HEADER_LOOPEXIT]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[IV_1:%.*]] = phi i32 [ 2, %[[ENTRY]] ], [ [[IV_1_NEXT:%.*]], %[[OUTER_HEADER_LOOPEXIT]] ]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp sgt i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[C_1]], label %[[OUTER_EXIT:.*]], label %[[OUTER_LATCH_PREHEADER:.*]]
+; CHECK:       [[OUTER_LATCH_PREHEADER]]:
+; CHECK-NEXT:    [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 2
+; CHECK-NEXT:    br label %[[OUTER_LATCH:.*]]
+; CHECK:       [[OUTER_LATCH]]:
+; CHECK-NEXT:    [[P:%.*]] = phi i32 [ [[X]], %[[OUTER_LATCH]] ], [ 0, %[[OUTER_LATCH_PREHEADER]] ]
+; CHECK-NEXT:    [[C_2:%.*]] = icmp ult i32 [[P]], [[IV_1_NEXT]]
+; CHECK-NEXT:    br i1 [[C_2]], label %[[OUTER_LATCH]], label %[[OUTER_HEADER_LOOPEXIT]]
+; CHECK:       [[OUTER_EXIT]]:
+; CHECK-NEXT:    [[INDVARS_IV_LCSSA:%.*]] = phi i64 [ [[INDVARS_IV]], %[[OUTER_HEADER]] ]
+; CHECK-NEXT:    br label %[[EXIT_LOOP:.*]]
+; CHECK:       [[EXIT_LOOP]]:
+; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], %[[EXIT_LOOP]] ], [ [[INDVARS_IV_LCSSA]], %[[OUTER_EXIT]] ]
+; CHECK-NEXT:    call void @use.i64(i64 [[INDVARS_IV1]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], 1
+; CHECK-NEXT:    br label %[[EXIT_LOOP]]
+;
+entry:
+  br label %outer.header
+
+outer.header:
+  %iv.1 = phi i32 [ 2, %entry ], [ %iv.1.next, %outer.latch ]
+  %c.1 = icmp sgt i32 %x, 0
+  br i1 %c.1, label %outer.exit, label %outer.latch.preheader
+
+outer.latch.preheader:
+  %iv.1.next = add nuw nsw i32 %iv.1, 2
+  br label %outer.latch
+
+outer.latch:
+  %p = phi i32 [ %x, %outer.latch ], [ 0, %outer.latch.preheader ]
+  %c.2 = icmp ult i32 %p, %iv.1.next
+  br i1 %c.2, label %outer.latch, label %outer.header
+
+outer.exit:
+  %sub = add nsw i32 %iv.1, -2
+  br label %exit.loop
+
+exit.loop:
+  %iv.2 = phi i32 [ %sub, %outer.exit ], [ %iv.2.next, %exit.loop ]
+  %iv.2.ext = sext i32 %iv.2 to i64
+  call void @use.i64(i64 %iv.2.ext)
+  %iv.2.next = add nsw i32 %iv.2, 1
+  br label %exit.loop
+}

From a5f0525d4b3edba50706cb0e4b9a48f0691e2b4c Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy@arm.com>
Date: Mon, 16 Jun 2025 16:47:55 +0100
Subject: [PATCH 596/851] [AArch64][SelectionDAG] Enable new partial reduction
 lowering by default (#143565)

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   96 +-
 .../neon-partial-reduce-dot-product.ll        | 1303 ++++++-----
 .../sve-fixed-length-partial-reduce.ll        |    6 +-
 .../AArch64/sve-partial-reduce-dot-product.ll | 1926 +++++++++--------
 .../AArch64/sve-partial-reduce-wide-add.ll    |  290 +--
 5 files changed, 1804 insertions(+), 1817 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7519ac5260a64..c86aed7b38c8c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -153,13 +153,6 @@ cl::opt<bool> EnableSVEGISel(
     cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
     cl::init(false));
 
-// FIXME : This is a temporary flag, and is used to help transition to
-// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD
-// nodes.
-static cl::opt<bool> EnablePartialReduceNodes(
-    "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden,
-    cl::desc("Use the new method of lowering partial reductions."));
-
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -1457,7 +1450,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
       setOperationAction(ISD::FADD, VT, Custom);
 
-    if (EnablePartialReduceNodes && Subtarget->hasDotProd()) {
+    if (Subtarget->hasDotProd()) {
       static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
                                         ISD::PARTIAL_REDUCE_UMLA};
 
@@ -1895,7 +1888,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   }
 
   // Handle partial reduction operations
-  if (EnablePartialReduceNodes && Subtarget->isSVEorStreamingSVEAvailable()) {
+  if (Subtarget->isSVEorStreamingSVEAvailable()) {
     // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
     // Other pairs will default to 'Expand'.
     static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
@@ -1957,17 +1950,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
                          Custom);
 
-      if (EnablePartialReduceNodes) {
-        static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
-                                          ISD::PARTIAL_REDUCE_UMLA};
-        // Must be lowered to SVE instructions.
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
-        setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
-      }
+      static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                        ISD::PARTIAL_REDUCE_UMLA};
+      // Must be lowered to SVE instructions.
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
+      setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
     }
   }
 
@@ -2165,16 +2156,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
   assert(I->getIntrinsicID() ==
              Intrinsic::experimental_vector_partial_reduce_add &&
          "Unexpected intrinsic!");
-  if (EnablePartialReduceNodes)
-    return true;
-
-  EVT VT = EVT::getEVT(I->getType());
-  auto Op1 = I->getOperand(1);
-  EVT Op1VT = EVT::getEVT(Op1->getType());
-  if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
-      (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
-       VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
-    return false;
   return true;
 }
 
@@ -2252,37 +2233,32 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
   bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
   bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
 
-  if (EnablePartialReduceNodes) {
-    static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
-                                      ISD::PARTIAL_REDUCE_UMLA};
-    unsigned NumElts = VT.getVectorNumElements();
-    if (VT.getVectorElementType() == MVT::i64) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
-    } else if (VT.getVectorElementType() == MVT::i32) {
-      setPartialReduceMLAAction(MLAOps, VT,
+  static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                    ISD::PARTIAL_REDUCE_UMLA};
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.getVectorElementType() == MVT::i64) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
+  } else if (VT.getVectorElementType() == MVT::i32) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
+  } else if (VT.getVectorElementType() == MVT::i16) {
+    setPartialReduceMLAAction(MLAOps, VT,
+                              MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
+  }
+  if (Subtarget->hasMatMulInt8()) {
+    if (VT.getVectorElementType() == MVT::i32)
+      setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
                                 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
-      setPartialReduceMLAAction(
-          MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
-    } else if (VT.getVectorElementType() == MVT::i16) {
-      setPartialReduceMLAAction(MLAOps, VT,
-                                MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
-    }
-
-    if (Subtarget->hasMatMulInt8()) {
-      if (VT.getVectorElementType() == MVT::i32)
-        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
-                                  MVT::getVectorVT(MVT::i8, NumElts * 4),
-                                  Custom);
-      else if (VT.getVectorElementType() == MVT::i64)
-        setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
-                                  MVT::getVectorVT(MVT::i8, NumElts * 8),
-                                  Custom);
-    }
+    else if (VT.getVectorElementType() == MVT::i64)
+      setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
+                                MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
   }
 
   // Lower fixed length vector operations to scalable equivalents.
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 0c7b3c7d3c138..0ea80a075fae9 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -1,15 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NEWLOWERING-I8MM
+; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NODOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-DOT-I8MM
 
 define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: udot:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v3.8h, v2.8b, v1.8b
@@ -19,6 +13,16 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = zext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -27,22 +31,6 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
-; CHECK-DOT-LABEL: udot_in_loop:
-; CHECK-DOT:       // %bb.0: // %entry
-; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT:    mov x8, xzr
-; CHECK-DOT-NEXT:  .LBB1_1: // %vector.body
-; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
-; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
-; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-DOT-NEXT:    add x8, x8, #16
-; CHECK-DOT-NEXT:    udot v1.4s, v2.16b, v3.16b
-; CHECK-DOT-NEXT:    cmp x8, #16
-; CHECK-DOT-NEXT:    b.ne .LBB1_1
-; CHECK-DOT-NEXT:  // %bb.2: // %end
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_in_loop:
 ; CHECK-NODOT:       // %bb.0: // %entry
 ; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
@@ -63,6 +51,38 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
 ; CHECK-NODOT-NEXT:    b.ne .LBB1_1
 ; CHECK-NODOT-NEXT:  // %bb.2: // %end
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB1_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    b.ne .LBB1_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB1_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB1_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -86,11 +106,6 @@ end:
 }
 
 define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: udot_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v1.8h, v2.8b, v1.8b
@@ -105,6 +120,16 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -113,11 +138,6 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 }
 
 define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: sdot:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v3.8h, v2.8b, v1.8b
@@ -127,6 +147,16 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -135,11 +165,6 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: sdot_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v1.8h, v2.8b, v1.8b
@@ -154,6 +179,16 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -162,27 +197,34 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 }
 
 define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-NOI8MM-LABEL: usdot:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -191,60 +233,67 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 }
 
 define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
-; CHECK-NOI8MM-LABEL: usdot_in_loop:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB6_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_in_loop:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB6_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #16
+; CHECK-NODOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB6_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    b.ne .LBB6_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB6_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-DOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-DOT-NEXT:    b.ne .LBB6_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB6_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB6_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB6_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v3.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB6_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -268,32 +317,44 @@ end:
 }
 
 define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: usdot_narrow:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; CHECK-NOI8MM-NEXT:    smull2 v1.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_narrow:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-DOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-DOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-DOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-DOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_narrow:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -302,27 +363,34 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 }
 
 define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: sudot:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %s.wide = sext <16 x i8> %u to <16 x i32>
   %u.wide = zext <16 x i8> %s to <16 x i32>
   %mult = mul nuw nsw <16 x i32> %u.wide, %s.wide
@@ -331,60 +399,67 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
 }
 
 define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){
-; CHECK-NOI8MM-LABEL: sudot_in_loop:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB9_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_in_loop:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB9_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #16
+; CHECK-NODOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB9_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    b.ne .LBB9_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_in_loop:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB9_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #16
+; CHECK-DOT-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-DOT-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-DOT-NEXT:    b.ne .LBB9_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB9_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB9_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_in_loop:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB9_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #16
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB9_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -408,32 +483,44 @@ end:
 }
 
 define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-NOI8MM-LABEL: sudot_narrow:
-; CHECK-NOI8MM:       // %bb.0:
-; CHECK-NOI8MM-NEXT:    sshll v1.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
-; CHECK-NOI8MM-NEXT:    smull2 v1.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_narrow:
+; CHECK-NODOT:       // %bb.0:
+; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NODOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_narrow:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-DOT-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-DOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-DOT-NEXT:    smull2 v1.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-DOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-DOT-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-DOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_narrow:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -460,21 +547,21 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: udot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: udot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: udot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %b.wide = zext <16 x i8> %b to <16 x i64>
@@ -503,21 +590,21 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
 ; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sdot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sdot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %b.wide = sext <16 x i8> %b to <16 x i64>
@@ -528,45 +615,61 @@ entry:
 }
 
 define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
-; CHECK-NOI8MM-LABEL: usdot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v7.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v16.4s, v5.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v17.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v4.4s, v4.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v16.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v7.2s, v17.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_8to64:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    ushll v6.4s, v4.4h, #0
+; CHECK-NODOT-NEXT:    ushll v7.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-NODOT-NEXT:    sshll v17.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    ushll2 v4.4s, v4.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NODOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    ushll v6.4s, v4.4h, #0
+; CHECK-DOT-NEXT:    ushll v7.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-DOT-NEXT:    sshll v17.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    ushll2 v4.4s, v4.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-DOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %b.wide = sext <16 x i8> %b to <16 x i64>
@@ -577,45 +680,61 @@ entry:
 }
 
 define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
-; CHECK-NOI8MM-LABEL: sudot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v7.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v16.4s, v5.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v17.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v4.4s, v4.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v3.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v16.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v7.2s, v17.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT:    smlal v1.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: sudot_8to64:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    sshll v6.4s, v4.4h, #0
+; CHECK-NODOT-NEXT:    sshll v7.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    ushll v16.4s, v5.4h, #0
+; CHECK-NODOT-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v5.4s, v5.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NODOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-NODOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sudot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sudot_8to64:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    sshll v6.4s, v4.4h, #0
+; CHECK-DOT-NEXT:    sshll v7.4s, v2.4h, #0
+; CHECK-DOT-NEXT:    ushll v16.4s, v5.4h, #0
+; CHECK-DOT-NEXT:    ushll v17.4s, v3.4h, #0
+; CHECK-DOT-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-DOT-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v5.4s, v5.8h, #0
+; CHECK-DOT-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-DOT-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-DOT-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-DOT-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-DOT-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-DOT-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sudot_8to64:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %b.wide = zext <16 x i8> %b to <16 x i64>
@@ -626,12 +745,6 @@ entry:
 }
 
 define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
-; CHECK-DOT-LABEL: udot_no_bin_op:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.16b, #1
-; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_no_bin_op:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    ushll v2.8h, v1.8b, #0
@@ -641,77 +754,53 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.16b, #1
+; CHECK-DOT-I8MM-NEXT:    udot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <16 x i8> %a to <16 x i32>
   %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
   ret <4 x i32> %partial.reduce
 }
 
 define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
-; CHECK-NODOT-LABEL: udot_no_bin_op_in_loop:
-; CHECK-NODOT:       // %bb.0: // %entry
-; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NODOT-NEXT:    mov x8, xzr
-; CHECK-NODOT-NEXT:  .LBB16_1: // %vector.body
-; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
-; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-NODOT-NEXT:    add x8, x8, #16
-; CHECK-NODOT-NEXT:    cmp x8, #16
-; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
-; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v3.4h
-; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v3.8h
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v2.4h
-; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
-; CHECK-NODOT-NEXT:    b.ne .LBB16_1
-; CHECK-NODOT-NEXT:  // %bb.2: // %end
-; CHECK-NODOT-NEXT:    ret
-;
-; CHECK-I8MM-LABEL: udot_no_bin_op_in_loop:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    movi v2.16b, #1
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB16_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q3, [x0, x8]
-; CHECK-I8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    cmp x8, #16
-; CHECK-I8MM-NEXT:    udot v1.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT:    b.ne .LBB16_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    ret
-;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_in_loop:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x8, .LCPI16_0
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v2.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x9, .LCPI16_2
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x8, .LCPI16_1
-; CHECK-NEWLOWERING-I8MM-NEXT:    adrp x10, .LCPI16_3
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x8, :lo12:.LCPI16_1]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q4, [x9, :lo12:.LCPI16_2]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q5, [x10, :lo12:.LCPI16_3]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB16_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q6, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v7.16b, { v6.16b }, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v16.16b, { v6.16b }, v4.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v17.16b, { v6.16b }, v5.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    tbl v6.16b, { v6.16b }, v1.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v17.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v7.4s, v16.4s, v7.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v7.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v2.4s, v2.4s, v6.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB16_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_no_bin_op_in_loop:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    adrp x8, .LCPI16_0
+; CHECK-COMMON-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-COMMON-NEXT:    adrp x9, .LCPI16_2
+; CHECK-COMMON-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-COMMON-NEXT:    adrp x8, .LCPI16_1
+; CHECK-COMMON-NEXT:    adrp x10, .LCPI16_3
+; CHECK-COMMON-NEXT:    ldr q3, [x8, :lo12:.LCPI16_1]
+; CHECK-COMMON-NEXT:    ldr q4, [x9, :lo12:.LCPI16_2]
+; CHECK-COMMON-NEXT:    ldr q5, [x10, :lo12:.LCPI16_3]
+; CHECK-COMMON-NEXT:    mov x8, xzr
+; CHECK-COMMON-NEXT:  .LBB16_1: // %vector.body
+; CHECK-COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-COMMON-NEXT:    ldr q6, [x0, x8]
+; CHECK-COMMON-NEXT:    mov v0.16b, v2.16b
+; CHECK-COMMON-NEXT:    add x8, x8, #16
+; CHECK-COMMON-NEXT:    cmp x8, #16
+; CHECK-COMMON-NEXT:    tbl v7.16b, { v6.16b }, v3.16b
+; CHECK-COMMON-NEXT:    tbl v16.16b, { v6.16b }, v4.16b
+; CHECK-COMMON-NEXT:    tbl v17.16b, { v6.16b }, v5.16b
+; CHECK-COMMON-NEXT:    tbl v6.16b, { v6.16b }, v1.16b
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v17.4s
+; CHECK-COMMON-NEXT:    add v7.4s, v16.4s, v7.4s
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v7.4s
+; CHECK-COMMON-NEXT:    add v2.4s, v2.4s, v6.4s
+; CHECK-COMMON-NEXT:    b.ne .LBB16_1
+; CHECK-COMMON-NEXT:  // %bb.2: // %end
+; CHECK-COMMON-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -731,12 +820,6 @@ end:
 }
 
 define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
-; CHECK-DOT-LABEL: sdot_no_bin_op:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.16b, #1
-; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_no_bin_op:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    sshll v2.8h, v1.8b, #0
@@ -746,18 +829,24 @@ define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_no_bin_op:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.16b, #1
+; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.16b, #1
+; CHECK-DOT-I8MM-NEXT:    sdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <16 x i8> %a to <16 x i32>
   %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
   ret <4 x i32> %partial.reduce
 }
 
 define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
-; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
@@ -772,18 +861,24 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.8b, #1
+; CHECK-DOT-I8MM-NEXT:    udot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <8 x i8> %a to <8 x i32>
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
   ret <2 x i32> %partial.reduce
 }
 
 define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
-; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
-; CHECK-DOT:       // %bb.0:
-; CHECK-DOT-NEXT:    movi v2.8b, #1
-; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
-; CHECK-DOT-NEXT:    ret
-;
 ; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
@@ -798,6 +893,18 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
+;
+; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v2.8b, #1
+; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-NEXT:    ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_narrow:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v2.8b, #1
+; CHECK-DOT-I8MM-NEXT:    sdot v0.2s, v1.8b, v2.8b
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <8 x i8> %a to <8 x i32>
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
   ret <2 x i32> %partial.reduce
@@ -822,23 +929,23 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v3.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: udot_no_bin_op_8to64:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: udot_no_bin_op_8to64:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v3.16b, #1
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    udot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    uaddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
   ret <4 x i64> %partial.reduce
@@ -863,35 +970,35 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v3.4s
 ; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: sdot_no_bin_op_8to64:
-; CHECK-I8MM:       // %bb.0:
-; CHECK-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
-; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
+; CHECK-DOT:       // %bb.0:
+; CHECK-DOT-NEXT:    movi v3.16b, #1
+; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v3.16b, #1
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NEWLOWERING-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: sdot_no_bin_op_8to64:
+; CHECK-DOT-I8MM:       // %bb.0:
+; CHECK-DOT-I8MM-NEXT:    movi v3.16b, #1
+; CHECK-DOT-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    sdot v4.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
+; CHECK-DOT-I8MM-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-DOT-I8MM-NEXT:    ret
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
   ret <4 x i64> %partial.reduce
 }
 
 define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
-; CHECK-LABEL: not_udot:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    umull v1.8h, v2.8b, v1.8b
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: not_udot:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    umull v1.8h, v2.8b, v1.8b
+; CHECK-COMMON-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-COMMON-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-COMMON-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
   %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -900,16 +1007,16 @@ define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 }
 
 define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
-; CHECK-LABEL: not_udot_narrow:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    bic v1.4h, #255, lsl #8
-; CHECK-NEXT:    bic v2.4h, #255, lsl #8
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umull v3.4s, v2.4h, v1.4h
-; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
-; CHECK-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT:    add v0.2s, v1.2s, v0.2s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: not_udot_narrow:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    bic v1.4h, #255, lsl #8
+; CHECK-COMMON-NEXT:    bic v2.4h, #255, lsl #8
+; CHECK-COMMON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-COMMON-NEXT:    umull v3.4s, v2.4h, v1.4h
+; CHECK-COMMON-NEXT:    umlal v0.4s, v2.4h, v1.4h
+; CHECK-COMMON-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
+; CHECK-COMMON-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-COMMON-NEXT:    ret
   %u.wide = zext <4 x i8> %u to <4 x i32>
   %s.wide = zext <4 x i8> %s to <4 x i32>
   %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide
@@ -918,18 +1025,18 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
 }
 
 define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: udot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    umlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    umlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    umlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    umlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    umlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = zext <8 x i16> %a to <8 x i64>
   %b.wide = zext <8 x i8> %b to <8 x i64>
@@ -939,18 +1046,18 @@ entry:
 }
 
 define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: sdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: sdot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = sext <8 x i16> %a to <8 x i64>
   %b.wide = sext <8 x i8> %b to <8 x i64>
@@ -960,18 +1067,18 @@ entry:
 }
 
 define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: usdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
-; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
-; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: usdot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    sshll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    ushll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    sshll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = zext <8 x i16> %a to <8 x i64>
   %b.wide = sext <8 x i8> %b to <8 x i64>
@@ -981,18 +1088,18 @@ entry:
 }
 
 define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b){
-; CHECK-LABEL: sudot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
-; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
-; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
-; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
-; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
-; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: sudot_different_types:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-COMMON-NEXT:    sshll v3.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    sshll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    ushll v4.4s, v2.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v2.4s, v2.8h, #0
+; CHECK-COMMON-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-COMMON-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-COMMON-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
   %a.wide = sext <8 x i16> %a to <8 x i64>
   %b.wide = zext <8 x i8> %b to <8 x i64>
@@ -1002,74 +1109,86 @@ entry:
 }
 
 define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) {
-; CHECK-NOI8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT:    mov x8, xzr
-; CHECK-NOI8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-NOI8MM-NEXT:    add x8, x8, #16
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v6.8h, v4.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v7.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v4.8h, v4.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    cmp x8, #1024
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v6.4h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v7.4h, v6.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v3.4h, v4.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
-; CHECK-NOI8MM-NEXT:    b.ne .LBB28_1
-; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
-; CHECK-NOI8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-NODOT-LABEL: usdot_multiple_zext_users:
+; CHECK-NODOT:       // %bb.0: // %entry
+; CHECK-NODOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT:    mov x8, xzr
+; CHECK-NODOT-NEXT:  .LBB28_1: // %vector.body
+; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    ldr q4, [x2, x8]
+; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    sshll v5.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll v6.8h, v4.8b, #0
+; CHECK-NODOT-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    cmp x8, #1024
+; CHECK-NODOT-NEXT:    smlal v0.4s, v5.4h, v6.4h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v7.4h, v6.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v4.4h
+; CHECK-NODOT-NEXT:    smlal v1.4s, v3.4h, v4.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
+; CHECK-NODOT-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
+; CHECK-NODOT-NEXT:    b.ne .LBB28_1
+; CHECK-NODOT-NEXT:  // %bb.2: // %end
+; CHECK-NODOT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NODOT-NEXT:    ret
 ;
-; CHECK-I8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    mov x8, xzr
-; CHECK-I8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-I8MM-NEXT:    add x8, x8, #16
-; CHECK-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
-; CHECK-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
-; CHECK-I8MM-NEXT:    cmp x8, #1024
-; CHECK-I8MM-NEXT:    b.ne .LBB28_1
-; CHECK-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-I8MM-NEXT:    ret
+; CHECK-DOT-LABEL: usdot_multiple_zext_users:
+; CHECK-DOT:       // %bb.0: // %entry
+; CHECK-DOT-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-DOT-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT:    mov x8, xzr
+; CHECK-DOT-NEXT:  .LBB28_1: // %vector.body
+; CHECK-DOT-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-NEXT:    ldr q4, [x2, x8]
+; CHECK-DOT-NEXT:    add x8, x8, #16
+; CHECK-DOT-NEXT:    sshll v5.8h, v2.8b, #0
+; CHECK-DOT-NEXT:    ushll v6.8h, v4.8b, #0
+; CHECK-DOT-NEXT:    sshll v7.8h, v3.8b, #0
+; CHECK-DOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-DOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT:    cmp x8, #1024
+; CHECK-DOT-NEXT:    smlal v0.4s, v5.4h, v6.4h
+; CHECK-DOT-NEXT:    smlal v1.4s, v7.4h, v6.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
+; CHECK-DOT-NEXT:    smlal v0.4s, v2.4h, v4.4h
+; CHECK-DOT-NEXT:    smlal v1.4s, v3.4h, v4.4h
+; CHECK-DOT-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
+; CHECK-DOT-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
+; CHECK-DOT-NEXT:    b.ne .LBB28_1
+; CHECK-DOT-NEXT:  // %bb.2: // %end
+; CHECK-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DOT-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_multiple_zext_users:
-; CHECK-NEWLOWERING-I8MM:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEWLOWERING-I8MM-NEXT:    mov x8, xzr
-; CHECK-NEWLOWERING-I8MM-NEXT:  .LBB28_1: // %vector.body
-; CHECK-NEWLOWERING-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q3, [x1, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    ldr q4, [x2, x8]
-; CHECK-NEWLOWERING-I8MM-NEXT:    add x8, x8, #16
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
-; CHECK-NEWLOWERING-I8MM-NEXT:    cmp x8, #1024
-; CHECK-NEWLOWERING-I8MM-NEXT:    b.ne .LBB28_1
-; CHECK-NEWLOWERING-I8MM-NEXT:  // %bb.2: // %end
-; CHECK-NEWLOWERING-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-NEWLOWERING-I8MM-NEXT:    ret
+; CHECK-DOT-I8MM-LABEL: usdot_multiple_zext_users:
+; CHECK-DOT-I8MM:       // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT:    mov x8, xzr
+; CHECK-DOT-I8MM-NEXT:  .LBB28_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT:    ldr q4, [x2, x8]
+; CHECK-DOT-I8MM-NEXT:    add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT:    usdot v0.4s, v4.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT:    usdot v1.4s, v4.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT:    cmp x8, #1024
+; CHECK-DOT-I8MM-NEXT:    b.ne .LBB28_1
+; CHECK-DOT-I8MM-NEXT:  // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DOT-I8MM-NEXT:    ret
 entry:
   br label %vector.body
 
@@ -1100,15 +1219,15 @@ end:
 }
 
 define <2 x i64> @udot_16to64(<2 x i64> %acc, <8 x i16> %input){
-; CHECK-LABEL: udot_16to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
-; CHECK-NEXT:    uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NEXT:    uaddw2 v0.2d, v0.2d, v2.4s
-; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
-; CHECK-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: udot_16to64:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    ushll v2.4s, v1.4h, #0
+; CHECK-COMMON-NEXT:    ushll2 v1.4s, v1.8h, #0
+; CHECK-COMMON-NEXT:    uaddw v0.2d, v0.2d, v2.2s
+; CHECK-COMMON-NEXT:    uaddw2 v0.2d, v0.2d, v2.4s
+; CHECK-COMMON-NEXT:    uaddw v0.2d, v0.2d, v1.2s
+; CHECK-COMMON-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
+; CHECK-COMMON-NEXT:    ret
 entry:
     %input.wide = zext <8 x i16> %input to <8 x i64>
     %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
index af813ff16a202..33d5ac4cd299e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,NEON
-; RUN: llc -mattr=+sve,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes=true < %s | FileCheck %s --check-prefixes=COMMON,SVE
-; RUN: llc -mattr=+sme,+i8mm -aarch64-enable-partial-reduce-nodes=true -force-streaming < %s | FileCheck %s --check-prefix=SME
+; RUN: llc -mattr=+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,NEON
+; RUN: llc -mattr=+sve,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=COMMON,SVE
+; RUN: llc -mattr=+sme,+i8mm -force-streaming < %s | FileCheck %s --check-prefix=SME
 
 target triple = "aarch64"
 
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 221a15e5c8fe6..b2cde51e99619 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -1,20 +1,23 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM
-; RUN: llc -mtriple=aarch64 -mattr=+sve,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SVE2
-; RUN: llc -mtriple=aarch64 -mattr=+sve,+sme,+i8mm -force-streaming -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING,CHECK-NEWLOWERING-SME
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefix=CHECK-SVE2-I8MM
+; RUN: llc -mtriple=aarch64 -mattr=+sve2,+sme,+i8mm -force-streaming %s -o - | FileCheck %s --check-prefix=CHECK-SME
 
 define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: udot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -24,15 +27,20 @@ entry:
 }
 
 define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: udot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: udot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: udot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -42,15 +50,20 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %accc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: sdot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -60,15 +73,20 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: sdot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -78,36 +96,36 @@ entry:
 }
 
 define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: usdot:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    usdot z0.s, z1.b, z2.b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z3.h, z1.b
+; CHECK-SVE2-NEXT:    sunpklo z4.h, z2.b
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    sunpkhi z2.h, z2.b
+; CHECK-SVE2-NEXT:    uunpklo z5.s, z3.h
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    uunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    sunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    uunpklo z5.s, z1.h
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: usdot:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z1.h
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    usdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: usdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    usdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: usdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    usdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -117,36 +135,36 @@ entry:
 }
 
 define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: sudot:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    usdot z0.s, z2.b, z1.b
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z3.h, z1.b
+; CHECK-SVE2-NEXT:    uunpklo z4.h, z2.b
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    uunpkhi z2.h, z2.b
+; CHECK-SVE2-NEXT:    sunpklo z5.s, z3.h
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    sunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    uunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    sunpklo z5.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-SVE2-NEXT:    mla z0.s, p0/m, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: sudot:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z1.h
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sudot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    usdot z0.s, z2.b, z1.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sudot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    usdot z0.s, z2.b, z1.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sudot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    usdot z0.s, z2.b, z1.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
@@ -156,41 +174,29 @@ entry:
 }
 
 define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-LABEL: udot_8to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: udot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: udot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -201,41 +207,29 @@ entry:
 }
 
 define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
-; CHECK-LABEL: sdot_8to64:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SME-LABEL: sdot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sdot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -246,82 +240,62 @@ entry:
 }
 
 define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
-; CHECK-I8MM-LABEL: usdot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
-; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-I8MM-NEXT:    ret
-;
-; CHECK-NOI8MM-LABEL: usdot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpklo z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.h, z3.b
-; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z3.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.d
-; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z5.h
-; CHECK-NOI8MM-NEXT:    sunpklo z25.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpklo z26.d, z6.s
-; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z7.s
-; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z24.s
-; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z25.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z24.d, z24.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z25.d, z25.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    uunpklo z26.d, z4.s
-; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z5.s
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z2.s
-; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z3.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NOI8MM-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpkhi z4.h, z2.b
+; CHECK-SVE2-NEXT:    uunpklo z2.h, z2.b
+; CHECK-SVE2-NEXT:    sunpkhi z5.h, z3.b
+; CHECK-SVE2-NEXT:    sunpklo z3.h, z3.b
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    uunpklo z7.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z24.s, z5.h
+; CHECK-SVE2-NEXT:    sunpklo z25.s, z3.h
+; CHECK-SVE2-NEXT:    uunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z5.s, z5.h
+; CHECK-SVE2-NEXT:    sunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    uunpklo z26.d, z6.s
+; CHECK-SVE2-NEXT:    uunpklo z27.d, z7.s
+; CHECK-SVE2-NEXT:    sunpklo z28.d, z24.s
+; CHECK-SVE2-NEXT:    sunpklo z29.d, z25.s
+; CHECK-SVE2-NEXT:    uunpkhi z6.d, z6.s
+; CHECK-SVE2-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-SVE2-NEXT:    sunpkhi z24.d, z24.s
+; CHECK-SVE2-NEXT:    sunpkhi z25.d, z25.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    uunpklo z26.d, z4.s
+; CHECK-SVE2-NEXT:    sunpklo z28.d, z5.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    uunpklo z27.d, z2.s
+; CHECK-SVE2-NEXT:    sunpklo z29.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z5.d, z5.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z2.d, z3.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    usdot z4.s, z2.b, z3.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SME-LABEL: usdot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    usdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: usdot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    usdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -332,82 +306,62 @@ entry:
 }
 
 define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
-; CHECK-I8MM-LABEL: sudot_8to64:
-; CHECK-I8MM:       // %bb.0: // %entry
-; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-I8MM-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
-; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-I8MM-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot_8to64:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpkhi z4.h, z2.b
+; CHECK-SVE2-NEXT:    sunpklo z2.h, z2.b
+; CHECK-SVE2-NEXT:    uunpkhi z5.h, z3.b
+; CHECK-SVE2-NEXT:    uunpklo z3.h, z3.b
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpklo z6.s, z4.h
+; CHECK-SVE2-NEXT:    sunpklo z7.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z24.s, z5.h
+; CHECK-SVE2-NEXT:    uunpklo z25.s, z3.h
+; CHECK-SVE2-NEXT:    sunpkhi z4.s, z4.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z5.s, z5.h
+; CHECK-SVE2-NEXT:    uunpkhi z3.s, z3.h
+; CHECK-SVE2-NEXT:    sunpklo z26.d, z6.s
+; CHECK-SVE2-NEXT:    sunpklo z27.d, z7.s
+; CHECK-SVE2-NEXT:    uunpklo z28.d, z24.s
+; CHECK-SVE2-NEXT:    uunpklo z29.d, z25.s
+; CHECK-SVE2-NEXT:    sunpkhi z6.d, z6.s
+; CHECK-SVE2-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-SVE2-NEXT:    uunpkhi z24.d, z24.s
+; CHECK-SVE2-NEXT:    uunpkhi z25.d, z25.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    sunpklo z26.d, z4.s
+; CHECK-SVE2-NEXT:    uunpklo z28.d, z5.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    sunpklo z27.d, z2.s
+; CHECK-SVE2-NEXT:    uunpklo z29.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z5.d, z5.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-SVE2-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z2.d, z3.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NOI8MM-LABEL: sudot_8to64:
-; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpklo z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.h, z3.b
-; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z3.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.d
-; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z5.h
-; CHECK-NOI8MM-NEXT:    uunpklo z25.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpklo z26.d, z6.s
-; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z7.s
-; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z24.s
-; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z25.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z24.d, z24.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z25.d, z25.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    sunpklo z26.d, z4.s
-; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z5.s
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z2.s
-; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z3.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NOI8MM-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sudot_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    usdot z4.s, z3.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: sudot_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    usdot z4.s, z3.b, z2.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sudot_8to64:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    usdot z4.s, z3.b, z2.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
@@ -418,51 +372,69 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: udot_no_bin_op:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_no_bin_op:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_no_bin_op:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
   %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
 define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: sdot_no_bin_op:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_no_bin_op:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
   %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
   ret <vscale x 4 x i32> %partial.reduce
 }
 
 define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
-; CHECK-LABEL: udot_no_bin_op_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_no_bin_op_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
@@ -470,17 +442,23 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
-; CHECK-LABEL: sdot_no_bin_op_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_wide:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.h, #1 // =0x1
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_no_bin_op_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
@@ -488,137 +466,93 @@ entry:
 }
 
 define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: udot_no_bin_op_8to64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_no_bin_op_8to64:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    udot z3.s, z2.b, z4.b
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0:
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    udot z3.s, z2.b, z4.b
+; CHECK-SVE2-I8MM-NEXT:    uaddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uaddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0:
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE2-NEXT:    udot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: udot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0:
-; CHECK-NEWLOWERING-SME-NEXT:    mov z3.b, #1 // =0x1
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    udot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: udot_no_bin_op_8to64:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    uaddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    uaddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
   %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
   ret <vscale x 4 x i64> %partial.reduce
 }
 
 define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
-; CHECK-LABEL: sdot_no_bin_op_8to64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE:       // %bb.0:
-; CHECK-NEWLOWERING-SVE-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SVE2:       // %bb.0:
+; CHECK-SVE2-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-NEXT:    sdot z3.s, z2.b, z4.b
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0:
-; CHECK-NEWLOWERING-SVE2-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEWLOWERING-SVE2-NEXT:    mov z4.b, #1 // =0x1
-; CHECK-NEWLOWERING-SVE2-NEXT:    sdot z3.s, z2.b, z4.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SVE2-I8MM:       // %bb.0:
+; CHECK-SVE2-I8MM-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-SVE2-I8MM-NEXT:    mov z4.b, #1 // =0x1
+; CHECK-SVE2-I8MM-NEXT:    sdot z3.s, z2.b, z4.b
+; CHECK-SVE2-I8MM-NEXT:    saddwb z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    saddwt z0.d, z0.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SME-LABEL: sdot_no_bin_op_8to64:
-; CHECK-NEWLOWERING-SME:       // %bb.0:
-; CHECK-NEWLOWERING-SME-NEXT:    mov z3.b, #1 // =0x1
-; CHECK-NEWLOWERING-SME-NEXT:    mov z4.s, #0 // =0x0
-; CHECK-NEWLOWERING-SME-NEXT:    sdot z4.s, z2.b, z3.b
-; CHECK-NEWLOWERING-SME-NEXT:    saddwb z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    saddwt z0.d, z0.d, z4.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: sdot_no_bin_op_8to64:
+; CHECK-SME:       // %bb.0:
+; CHECK-SME-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-SME-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-SME-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-SME-NEXT:    saddwb z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    saddwt z0.d, z0.d, z4.s
+; CHECK-SME-NEXT:    ret
   %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
   ret <vscale x 4 x i64> %partial.reduce
 }
 
 define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
-; CHECK-LABEL: not_udot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_udot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SVE2-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: not_udot:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SVE-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.s, p0/m, z3.s, z4.s
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_udot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: not_udot:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalb z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalt z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SME-LABEL: not_udot:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-SME-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-SME-NEXT:    umlalb z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SME-NEXT:    umlalt z0.s, z1.h, z2.h
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: not_udot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SME-NEXT:    umlalb z0.s, z1.h, z2.h
+; CHECK-SME-NEXT:    umlalt z0.s, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
@@ -628,47 +562,29 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
-; CHECK-LABEL: not_udot_wide:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
+; CHECK-SVE2-LABEL: not_udot_wide:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SVE2-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE2-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalb z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    umlalt z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_udot_wide:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SVE2-I8MM-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-I8MM-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SME-LABEL: not_udot_wide:
-; CHECK-NEWLOWERING-SME:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SME-NEXT:    and z2.s, z2.s, #0xffff
-; CHECK-NEWLOWERING-SME-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SME-NEXT:    umlalb z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SME-NEXT:    umlalt z0.d, z1.s, z2.s
-; CHECK-NEWLOWERING-SME-NEXT:    ret
+; CHECK-SME-LABEL: not_udot_wide:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.s, z2.s, #0xffff
+; CHECK-SME-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SME-NEXT:    umlalb z0.d, z1.s, z2.s
+; CHECK-SME-NEXT:    umlalt z0.d, z1.s, z2.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
   %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
@@ -678,47 +594,68 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: not_usdot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_usdot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: not_usdot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: not_usdot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: not_usdot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -728,47 +665,68 @@ entry:
 }
 
 define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
-; CHECK-LABEL: not_sudot:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: not_sudot:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: not_sudot:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: not_sudot:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: not_sudot:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
@@ -778,49 +736,71 @@ entry:
 }
 
 define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: udot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -830,51 +810,74 @@ entry:
 }
 
 define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -884,51 +887,74 @@ entry:
 }
 
 define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: usdot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: usdot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: usdot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: usdot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: usdot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    uunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    sunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    uunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    sunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -938,49 +964,71 @@ entry:
 }
 
 define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sudot_different_types:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sudot_different_types:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-NEXT:    ptrue p0.d
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sudot_different_types:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.d
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SVE2-I8MM-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sudot_different_types:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sudot_different_types:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    sunpklo z3.s, z1.h
+; CHECK-SME-NEXT:    ptrue p0.d
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z5.d, z3.s
+; CHECK-SME-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z4.s
+; CHECK-SME-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    sunpklo z5.d, z1.s
+; CHECK-SME-NEXT:    uunpklo z6.d, z2.s
+; CHECK-SME-NEXT:    sunpkhi z1.d, z1.s
+; CHECK-SME-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-SME-NEXT:    mla z0.d, p0/m, z1.d, z2.d
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -990,29 +1038,26 @@ entry:
 }
 
 define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: udot_nxv8i8_promote:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEXT:    add z2.d, z2.d, z4.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_nxv8i8_promote:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: udot_nxv8i8_promote:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: udot_nxv8i8_promote:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    and z1.h, z1.h, #0xff
+; CHECK-SME-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -1022,31 +1067,29 @@ entry:
 }
 
 define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b){
-; CHECK-LABEL: sdot_nxv8i8_promote:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ptrue p0.h
-; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
-; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEXT:    add z2.d, z2.d, z4.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_nxv8i8_promote:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.h
+; CHECK-SVE2-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SVE2-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_nxv8i8_promote:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    ptrue p0.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    sxtb z1.h, p0/m, z1.h
-; CHECK-NEWLOWERING-NEXT:    sdot z0.d, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_nxv8i8_promote:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    ptrue p0.h
+; CHECK-SME-NEXT:    sxtb z2.h, p0/m, z2.h
+; CHECK-SME-NEXT:    sxtb z1.h, p0/m, z1.h
+; CHECK-SME-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>
   %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i16>
@@ -1056,35 +1099,26 @@ entry:
 }
 
 define <vscale x 4 x i64> @partial_reduce_only_split_acc(<vscale x 4 x i64> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
-; CHECK-LABEL: partial_reduce_only_split_acc:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    and z3.h, z3.h, #0xff
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpkhi z4.s, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEXT:    uunpkhi z5.s, z3.h
-; CHECK-NEXT:    uunpklo z3.s, z3.h
-; CHECK-NEXT:    uunpklo z6.d, z4.s
-; CHECK-NEXT:    uunpklo z7.d, z2.s
-; CHECK-NEXT:    uunpklo z24.d, z5.s
-; CHECK-NEXT:    uunpklo z25.d, z3.s
-; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NEXT:    mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NEXT:    mla z1.d, p0/m, z4.d, z5.d
-; CHECK-NEXT:    mla z0.d, p0/m, z2.d, z3.d
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: partial_reduce_only_split_acc:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SVE2-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: partial_reduce_only_split_acc:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    and z3.h, z3.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    udot z0.d, z2.h, z3.h
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: partial_reduce_only_split_acc:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: partial_reduce_only_split_acc:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-SME-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-SME-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i64>
   %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
@@ -1095,25 +1129,23 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: sdot_imm:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sub z0.s, z0.s, z3.s
-; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    sub z0.s, z0.s, z2.s
-; CHECK-NEXT:    sub z0.s, z0.s, z3.s
-; CHECK-NEXT:    sub z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_imm:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: sdot_imm:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-I8MM-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_imm:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
-; CHECK-NEWLOWERING-NEXT:    sdot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: sdot_imm:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SME-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 -1)
@@ -1122,41 +1154,59 @@ entry:
 }
 
 define <vscale x 4 x i32> @sdot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: sdot_imm_does_not_fit:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpklo z4.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: sdot_imm_does_not_fit:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SVE2-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SVE2-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SVE2-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: sdot_imm_does_not_fit:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SVE2-I8MM-LABEL: sdot_imm_does_not_fit:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-I8MM-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-I8MM-NEXT:    ret
+;
+; CHECK-SME-LABEL: sdot_imm_does_not_fit:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    sunpklo z2.h, z1.b
+; CHECK-SME-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-SME-NEXT:    sunpklo z3.s, z2.h
+; CHECK-SME-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    sunpklo z4.s, z1.h
+; CHECK-SME-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SME-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SME-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SME-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SME-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SME-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256)
@@ -1165,27 +1215,23 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: udot_imm:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEXT:    mov z2.s, #255 // =0xff
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpklo z4.s, z3.h
-; CHECK-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NEXT:    mla z0.s, p0/m, z4.s, z2.s
-; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z2.s
-; CHECK-NEXT:    mla z0.s, p0/m, z4.s, z2.s
-; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_imm:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: udot_imm:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SVE2-I8MM-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_imm:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
-; CHECK-NEWLOWERING-NEXT:    udot z0.s, z1.b, z2.b
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: udot_imm:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    mov z2.b, #-1 // =0xffffffffffffffff
+; CHECK-SME-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 255)
@@ -1194,41 +1240,59 @@ entry:
 }
 
 define <vscale x 4 x i32> @udot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
-; CHECK-LABEL: udot_imm_does_not_fit:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEXT:    uunpklo z3.s, z2.h
-; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    ret
+; CHECK-SVE2-LABEL: udot_imm_does_not_fit:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SVE2-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SVE2-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SVE2-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-NEXT:    ret
+;
+; CHECK-SVE2-I8MM-LABEL: udot_imm_does_not_fit:
+; CHECK-SVE2-I8MM:       // %bb.0: // %entry
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SVE2-I8MM-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SVE2-I8MM-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SVE2-I8MM-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SVE2-I8MM-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SVE2-I8MM-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SVE2-I8MM-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-LABEL: udot_imm_does_not_fit:
-; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    lsl z4.s, z4.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z2.s, z2.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z3.s, z3.s, #8
-; CHECK-NEWLOWERING-NEXT:    lsl z1.s, z1.s, #8
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT:    add z2.s, z2.s, z4.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-SME-LABEL: udot_imm_does_not_fit:
+; CHECK-SME:       // %bb.0: // %entry
+; CHECK-SME-NEXT:    uunpklo z2.h, z1.b
+; CHECK-SME-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-SME-NEXT:    uunpklo z3.s, z2.h
+; CHECK-SME-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-SME-NEXT:    uunpklo z4.s, z1.h
+; CHECK-SME-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-SME-NEXT:    lsl z4.s, z4.s, #8
+; CHECK-SME-NEXT:    lsl z2.s, z2.s, #8
+; CHECK-SME-NEXT:    lsl z3.s, z3.s, #8
+; CHECK-SME-NEXT:    lsl z1.s, z1.s, #8
+; CHECK-SME-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-SME-NEXT:    add z2.s, z2.s, z4.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-SME-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-SME-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256)
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
index 428dd4c3a0154..e62979d077fd2 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
@@ -1,16 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK-SVE2
-; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING-SVE
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 -aarch64-enable-partial-reduce-nodes %s -o - | FileCheck %s --check-prefixes=CHECK-NEWLOWERING-SVE2
+; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2
 
 define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv4i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.d, z1.s
@@ -19,19 +11,11 @@ define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
     %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
@@ -39,12 +23,6 @@ entry:
 }
 
 define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.d, z1.s
@@ -53,19 +31,11 @@ define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64>
     %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
@@ -73,12 +43,6 @@ entry:
 }
 
 define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv8i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.s, z1.h
@@ -87,19 +51,11 @@ define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    saddwt z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
     %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
@@ -107,12 +63,6 @@ entry:
 }
 
 define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.s, z1.h
@@ -121,19 +71,11 @@ define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    uaddwt z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32>
     %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
@@ -141,12 +83,6 @@ entry:
 }
 
 define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv16i8:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z2.h, z1.b
@@ -155,19 +91,11 @@ define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    saddwt z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16>
     %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
@@ -175,12 +103,6 @@ entry:
 }
 
 define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv16i8:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z2.h, z1.b
@@ -189,19 +111,11 @@ define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    uaddwt z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16>
     %partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
@@ -209,16 +123,6 @@ entry:
 }
 
 define <vscale x 2 x i32> @signed_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv4i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    ptrue p0.s
-; CHECK-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-SVE2-NEXT:    uunpklo z2.d, z1.s
-; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv4i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    ptrue p0.s
@@ -229,23 +133,13 @@ define <vscale x 2 x i32> @signed_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv4i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    ptrue p0.s
+; CHECK-SVE2-NEXT:    sxth z1.s, p0/m, z1.s
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 4 x i16> %input to <vscale x 4 x i32>
     %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
@@ -253,15 +147,6 @@ entry:
 }
 
 define <vscale x 2 x i32> @unsigned_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <vscale x 4 x i16> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-SVE2-NEXT:    uunpklo z2.d, z1.s
-; CHECK-SVE2-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i16:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    and z1.s, z1.s, #0xffff
@@ -271,21 +156,12 @@ define <vscale x 2 x i32> @unsigned_wide_add_nxv4i16(<vscale x 2 x i32> %acc, <v
 ; CHECK-SVE-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z2.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv4i16:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i16:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    and z1.s, z1.s, #0xffff
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 4 x i16> %input to <vscale x 4 x i32>
     %partial.reduce = tail call <vscale x 2 x i32> @llvm.experimental.vector.partial.reduce.add.nxv2i32.nxv4i32(<vscale x 2 x i32> %acc, <vscale x 4 x i32> %input.wide)
@@ -293,18 +169,6 @@ entry:
 }
 
 define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    sunpklo z4.d, z3.s
-; CHECK-SVE2-NEXT:    sunpklo z5.d, z2.s
-; CHECK-SVE2-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-SVE2-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: signed_wide_add_nxv8i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    sunpklo z4.d, z3.s
@@ -317,25 +181,13 @@ define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vsc
 ; CHECK-SVE-NEXT:    add z1.d, z1.d, z3.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: signed_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z4.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpklo z5.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: signed_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwb z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    saddwt z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    saddwb z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    saddwb z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    saddwt z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    saddwt z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
     %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
@@ -343,18 +195,6 @@ entry:
 }
 
 define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-SVE2:       // %bb.0: // %entry
-; CHECK-SVE2-NEXT:    uunpklo z4.d, z3.s
-; CHECK-SVE2-NEXT:    uunpklo z5.d, z2.s
-; CHECK-SVE2-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-SVE2-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-SVE2-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-SVE2-NEXT:    ret
-;
 ; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i32:
 ; CHECK-SVE:       // %bb.0: // %entry
 ; CHECK-SVE-NEXT:    uunpklo z4.d, z3.s
@@ -367,25 +207,13 @@ define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <v
 ; CHECK-SVE-NEXT:    add z1.d, z1.d, z3.d
 ; CHECK-SVE-NEXT:    ret
 ;
-; CHECK-NEWLOWERING-SVE-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z4.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpklo z5.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-SVE-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-SVE-NEXT:    add z1.d, z1.d, z3.d
-; CHECK-NEWLOWERING-SVE-NEXT:    ret
-;
-; CHECK-NEWLOWERING-SVE2-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-NEWLOWERING-SVE2:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwb z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z1.d, z1.d, z3.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    uaddwt z0.d, z0.d, z2.s
-; CHECK-NEWLOWERING-SVE2-NEXT:    ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32:
+; CHECK-SVE2:       // %bb.0: // %entry
+; CHECK-SVE2-NEXT:    uaddwb z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwb z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    uaddwt z1.d, z1.d, z3.s
+; CHECK-SVE2-NEXT:    uaddwt z0.d, z0.d, z2.s
+; CHECK-SVE2-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>
     %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)

From 58d23476f0ce76c847497a880f975550a645c796 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li@intel.com>
Date: Mon, 16 Jun 2025 08:48:41 -0700
Subject: [PATCH 597/851] [MLIR][XeGPU] Add unroll patterns for scatter ops 
 (#143602)

Add unrolling support for create_tdesc, load, store, prefetch, and update_offset.

---------

Co-authored-by: Adam Siemieniuk <adam.siemieniuk@intel.com>
Co-authored-by: Chao Chen <chao.chen@intel.com>
---
 .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp  | 207 +++++++++++++++++-
 .../Dialect/XeGPU/xegpu-unroll-patterns.mlir  | 141 ++++++++++++
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp |  23 ++
 3 files changed, 369 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 885477fe4cbd5..9c234c1e866b9 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -396,11 +396,214 @@ struct UnrollDpasOp : public UnrollPattern<xegpu::DpasOp> {
   }
 };
 
+struct UnrollCreateDescOp : public UnrollPattern<xegpu::CreateDescOp> {
+  using UnrollPattern<xegpu::CreateDescOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::CreateDescOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    xegpu::TensorDescType tdescTy = op.getType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
+
+    TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
+    VectorType indiceVecTy = indiceVec.getType();
+
+    SmallVector<Type> convertedIndiceTypes =
+        getUnrolledTypes(indiceVecTy, *targetShape);
+    SmallVector<Value> convertedIndiceVec =
+        pack(indiceVec, convertedIndiceTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Value> newOps;
+    for (auto indice : convertedIndiceVec) {
+      auto newOp = rewriter.create<xegpu::CreateDescOp>(loc, newTdescTy,
+                                                        op.getSource(), indice);
+      newOps.push_back(newOp);
+    }
+
+    Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter);
+    rewriter.replaceOp(op, castOp);
+
+    return success();
+  }
+};
+
+struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
+  using UnrollPattern<xegpu::LoadGatherOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::LoadGatherOp op,
+                                PatternRewriter &rewriter) const override {
+
+    Location loc = op.getLoc();
+    VectorType valueTy = llvm::dyn_cast<VectorType>(op.getValue().getType());
+    xegpu::TensorDescType tdescTy = op.getTensorDescType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    Type elemTy = tdescTy.getElementType();
+    VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
+
+    SmallVector<Type> convertedTdescTypes =
+        getUnrolledTypes(tdescTy, *targetShape);
+    SmallVector<Value> convertedTdescs = pack(
+        op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Type> convertedMaskTypes =
+        getUnrolledTypes(maskTy, *targetShape);
+    SmallVector<Value> convertedMasks =
+        pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Value> newOps;
+    for (auto [t, m] : llvm::zip(convertedTdescs, convertedMasks)) {
+      auto newOp = rewriter.create<xegpu::LoadGatherOp>(
+          loc, newValueTy, t, m, op.getTransposeAttr(), op.getL1HintAttr(),
+          op.getL2HintAttr(), op.getL3HintAttr());
+      newOps.push_back(newOp);
+    }
+
+    Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
+
+    rewriter.replaceOp(op, castOp);
+    return success();
+  }
+};
+
+struct UnrollPrefetchOp : public UnrollPattern<xegpu::PrefetchOp> {
+  using UnrollPattern<xegpu::PrefetchOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::PrefetchOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    xegpu::TensorDescType tdescTy = op.getTensorDescType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    SmallVector<Type> convertedTdescTypes =
+        getUnrolledTypes(tdescTy, *targetShape);
+    SmallVector<Value> convertedTdesc = pack(
+        op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
+
+    for (auto t : convertedTdesc)
+      rewriter.create<xegpu::PrefetchOp>(loc, TypeRange(), t, op->getAttrs());
+
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+struct UnrollStoreScatterOp : public UnrollPattern<xegpu::StoreScatterOp> {
+  using UnrollPattern<xegpu::StoreScatterOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::StoreScatterOp op,
+                                PatternRewriter &rewriter) const override {
+
+    Location loc = op.getLoc();
+    VectorType valueTy = llvm::dyn_cast<VectorType>(op.getValue().getType());
+    xegpu::TensorDescType tdescTy = op.getTensorDescType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    SmallVector<Type> convertedValTypes =
+        getUnrolledTypes(valueTy, *targetShape);
+    SmallVector<Type> convertedTdescTypes =
+        getUnrolledTypes(tdescTy, *targetShape);
+
+    SmallVector<Value> convertedValues =
+        pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
+    SmallVector<Value> convertedTdescs = pack(
+        op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Type> convertedMaskTypes =
+        getUnrolledTypes(maskTy, *targetShape);
+    SmallVector<Value> convertedMasks =
+        pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+
+    for (size_t i = 0; i < convertedValues.size(); ++i) {
+      Value v = convertedValues[i];
+      Value t = convertedTdescs[i];
+      Value m = op.getMask() ? convertedMasks[i] : nullptr;
+      rewriter.create<xegpu::StoreScatterOp>(
+          loc, v, t, m, op.getTransposeAttr(), op.getL1HintAttr(),
+          op.getL2HintAttr(), op.getL3HintAttr());
+    }
+
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
+struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
+  using UnrollPattern<xegpu::UpdateOffsetOp>::UnrollPattern;
+  LogicalResult matchAndRewrite(xegpu::UpdateOffsetOp op,
+                                PatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    xegpu::TensorDescType tdescTy = op.getTensorDescType();
+
+    // check if the tensor descriptor type is a 1d vector type
+    if (tdescTy.getRank() > 1)
+      return failure();
+
+    std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
+    if (!targetShape)
+      return failure();
+
+    SmallVector<Type> convertedTdescTypes =
+        getUnrolledTypes(tdescTy, *targetShape);
+    SmallVector<Value> convertedTdesc = pack(
+        op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
+
+    TypedValue<::mlir::VectorType> offsetVec = op.getOffsets();
+    VectorType offsetVecTy = offsetVec.getType();
+    SmallVector<Type> convertedOffsetTypes =
+        getUnrolledTypes(offsetVecTy, *targetShape);
+    SmallVector<Value> convertedOffsetVec =
+        pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter);
+
+    SmallVector<Value> newOps;
+    for (auto [t, o] : llvm::zip(convertedTdesc, convertedOffsetVec)) {
+      auto newOp =
+          rewriter.create<xegpu::UpdateOffsetOp>(loc, t.getType(), t, o);
+      newOps.push_back(newOp);
+    }
+    Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
+    rewriter.replaceOp(op, castOp);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::xegpu::populateXeGPUUnrollPatterns(
     RewritePatternSet &patterns, const xegpu::UnrollOptions &options) {
   patterns.add<UnrollCreateNdOp, UnrollUpdateNdOffsetOp, UnrollPrefetchNdOp,
-               UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp>(
-      patterns.getContext(), options);
+               UnrollLoadNdOp, UnrollStoreNdOp, UnrollDpasOp,
+               UnrollCreateDescOp, UnrollLoadGatherOp, UnrollStoreScatterOp,
+               UnrollPrefetchOp, UnrollUpdateOffsetOp>(patterns.getContext(),
+                                                       options);
 }
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index b911bb3bbdc1c..52ec3b856da49 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -158,4 +158,145 @@ gpu.module @test {
     %c = xegpu.dpas %a, %b : vector<32x32xf16>, vector<32x32xf16> -> vector<32x32xf32>
     gpu.return %c : vector<32x32xf32>
   }
+
+//-----
+
+  // CHECK-LABEL: test_create_tdesc_vec
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  gpu.func @test_create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>,  #xegpu.layout<inst_data = [16]>>
+  }
+
+//-----
+
+  // CHECK-LABEL: test_create_tdesc_step
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  gpu.func @test_create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
+    %step = arith.constant dense<8> : vector<32xindex>
+    %seq = vector.step  : vector<32xindex>
+    %cst = arith.muli %seq, %step : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+  }
+
+//-----
+
+  // CHECK-LABEL: test_load
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
+  gpu.func @test_load(%src: ui64) -> vector<32xf32> {
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+      
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    %ld = xegpu.load %tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+      
+    gpu.return %ld : vector<32xf32> 
+  }
+
+//-----
+
+  // CHECK-LABEL: test_prefetch
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  gpu.func @test_prefetch(%src: ui64)  {
+
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+
+    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    gpu.return
+  }
+
+//-----
+
+  // CHECK-LABEL: test_store
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+  gpu.func @test_store(%src: ui64) {
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+    
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %st_vec = arith.constant dense<1023.0>: vector<32xf32>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    xegpu.store %st_vec, %tdesc, %mask: vector<32xf32>, !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1>
+    
+    gpu.return
+  }
+
+//-----
+
+  // CHECK-LABEL: test_prefetch_load_store_update
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+   // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
+   // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
+  // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+
+  gpu.func @test_prefetch_load_store_update(%src: ui64)  {
+
+    %cst = arith.constant dense<[
+    0,   8,  16,  24,  32,  40,  48,  56,
+    64,  72,  80,  88,  96, 104, 112, 120,
+    128, 136, 144, 152, 160, 168, 176, 184,
+    192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
+   
+    %delta = arith.constant dense<[
+    32,   32,  32,  32,  32,  32,  32,  32,
+    32,   32,  32,  32,  32,  32,  32,  64,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 256 
+    ]> : vector<32xindex>
+    %new_tdesc = xegpu.update_offset %tdesc, %delta
+              : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xindex>     
+ 
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
+
+    %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+
+    %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
+    xegpu.store %st_vec, %tdesc, %mask: 
+                 vector<32xf32>, 
+                 !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, 
+                 vector<32xi1>
+  
+    gpu.return
+  }
 }
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 3f3461e92bc08..57aaecbd7962f 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -71,6 +71,29 @@ struct TestXeGPUUnrollingPatterns
             }
           }
 
+          if (isa<xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp,
+                  xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) {
+            xegpu::TensorDescType tdescTy;
+            if (auto createOp = dyn_cast<xegpu::CreateDescOp>(op)) {
+              tdescTy = createOp.getType();
+            } else if (auto updateOp = dyn_cast<xegpu::UpdateOffsetOp>(op)) {
+              tdescTy = updateOp.getTensorDescType();
+            } else if (auto prefetchOp = dyn_cast<xegpu::PrefetchOp>(op)) {
+              tdescTy = prefetchOp.getTensorDescType();
+            } else if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
+              tdescTy = loadOp.getTensorDescType();
+            } else if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(op)) {
+              tdescTy = storeOp.getTensorDescType();
+            }
+
+            if (auto layout = tdescTy.getLayoutAttr()) {
+              auto inst_data = layout.getInstData();
+              if (inst_data && layout.isSgLayout())
+                return SmallVector<int64_t>(inst_data.asArrayRef().begin(),
+                                            inst_data.asArrayRef().end());
+            }
+          }
+
           if (isa<xegpu::DpasOp>(op))
             return SmallVector<int64_t>{8, 16, 16};
 

From fc6aac72cc2c9a7a9dab443bca52f813a18461ef Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Mon, 16 Jun 2025 11:53:55 -0400
Subject: [PATCH 598/851] [DirectX] Fix bug where Flatten arrays was only using
 last index (#144146)

fixes #142836

We added a function called `collectIndicesAndDimsFromGEP` which builds
the Indicies and Dims up for the recursive case and the base case.
really to solve #142836 we didn't need to add it to the recursive case.
The recursive cases exists for gep chains which are ussually two
indicies per gep ie ptr index and array index. adding
collectIndicesAndDimsFromGEP to the recursive cases means we can now do
some mixed mode indexing say we get a case where its not the ussual 2
indicies but instead 3 we can now treat those last two indicies as part
of the computation for the flat array index.
---
 llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 44 +++++++++---
 llvm/test/CodeGen/DirectX/flatten-array.ll    | 70 +++++++++++++++++++
 .../DirectX/llc-vector-load-scalarize.ll      |  8 +--
 3 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
index b1f3f41a28e8b..0b7cf2f970172 100644
--- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
+++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp
@@ -86,6 +86,13 @@ class DXILFlattenArraysVisitor
   Value *genInstructionFlattenIndices(ArrayRef<Value *> Indices,
                                       ArrayRef<uint64_t> Dims,
                                       IRBuilder<> &Builder);
+
+  // Helper function to collect indices and dimensions from a GEP instruction
+  void collectIndicesAndDimsFromGEP(GetElementPtrInst &GEP,
+                                    SmallVectorImpl<Value *> &Indices,
+                                    SmallVectorImpl<uint64_t> &Dims,
+                                    bool &AllIndicesAreConstInt);
+
   void
   recursivelyCollectGEPs(GetElementPtrInst &CurrGEP,
                          ArrayType *FlattenedArrayType, Value *PtrOperand,
@@ -218,6 +225,26 @@ bool DXILFlattenArraysVisitor::visitAllocaInst(AllocaInst &AI) {
   return true;
 }
 
+void DXILFlattenArraysVisitor::collectIndicesAndDimsFromGEP(
+    GetElementPtrInst &GEP, SmallVectorImpl<Value *> &Indices,
+    SmallVectorImpl<uint64_t> &Dims, bool &AllIndicesAreConstInt) {
+
+  Type *CurrentType = GEP.getSourceElementType();
+
+  // Note index 0 is the ptr index.
+  for (Value *Index : llvm::drop_begin(GEP.indices(), 1)) {
+    Indices.push_back(Index);
+    AllIndicesAreConstInt &= isa<ConstantInt>(Index);
+
+    if (auto *ArrayTy = dyn_cast<ArrayType>(CurrentType)) {
+      Dims.push_back(ArrayTy->getNumElements());
+      CurrentType = ArrayTy->getElementType();
+    } else {
+      assert(false && "Expected array type in GEP chain");
+    }
+  }
+}
+
 void DXILFlattenArraysVisitor::recursivelyCollectGEPs(
     GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType,
     Value *PtrOperand, unsigned &GEPChainUseCount, SmallVector<Value *> Indices,
@@ -226,12 +253,8 @@ void DXILFlattenArraysVisitor::recursivelyCollectGEPs(
   if (GEPChainMap.count(&CurrGEP) > 0)
     return;
 
-  Value *LastIndex = CurrGEP.getOperand(CurrGEP.getNumOperands() - 1);
-  AllIndicesAreConstInt &= isa<ConstantInt>(LastIndex);
-  Indices.push_back(LastIndex);
-  assert(isa<ArrayType>(CurrGEP.getSourceElementType()));
-  Dims.push_back(
-      cast<ArrayType>(CurrGEP.getSourceElementType())->getNumElements());
+  // Collect indices and dimensions from the current GEP
+  collectIndicesAndDimsFromGEP(CurrGEP, Indices, Dims, AllIndicesAreConstInt);
   bool IsMultiDimArr = isMultiDimensionalArray(CurrGEP.getSourceElementType());
   if (!IsMultiDimArr) {
     assert(GEPChainUseCount < FlattenedArrayType->getNumElements());
@@ -316,9 +339,12 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   // Handle zero uses here because there won't be an update via
   // a child in the chain later.
   if (GEPChainUseCount == 0) {
-    SmallVector<Value *> Indices({GEP.getOperand(GEP.getNumOperands() - 1)});
-    SmallVector<uint64_t> Dims({ArrType->getNumElements()});
-    bool AllIndicesAreConstInt = isa<ConstantInt>(Indices[0]);
+    SmallVector<Value *> Indices;
+    SmallVector<uint64_t> Dims;
+    bool AllIndicesAreConstInt = true;
+
+    // Collect indices and dimensions from the GEP
+    collectIndicesAndDimsFromGEP(GEP, Indices, Dims, AllIndicesAreConstInt);
     GEPData GEPInfo{std::move(FlattenedArrayType), PtrOperand,
                     std::move(Indices), std::move(Dims), AllIndicesAreConstInt};
     return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP);
diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll
index 5c761014d471f..dc8c5f8421bfe 100644
--- a/llvm/test/CodeGen/DirectX/flatten-array.ll
+++ b/llvm/test/CodeGen/DirectX/flatten-array.ll
@@ -187,5 +187,75 @@ define void @global_gep_store() {
   ret void
 }
 
+@g = local_unnamed_addr addrspace(3) global [2 x [2 x float]] zeroinitializer, align 4
+define void @two_index_gep() {
+  ; CHECK-LABEL: define void @two_index_gep(
+  ; CHECK: [[THREAD_ID:%.*]] =  tail call i32 @llvm.dx.thread.id(i32 0)
+  ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[THREAD_ID]], 2
+  ; CHECK-NEXT: [[ADD:%.*]] = add i32 1, [[MUL]]
+  ; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 [[ADD]]
+  ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP_PTR]], align 4
+  ; CHECK-NEXT: ret void
+  %1 = tail call i32 @llvm.dx.thread.id(i32 0)
+  %2 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 %1, i32 1
+  %3 = load float, ptr addrspace(3) %2, align 4
+  ret void
+}
+
+define void @two_index_gep_const() {
+  ; CHECK-LABEL: define void @two_index_gep_const(
+  ; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 3
+  ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP_PTR]], align 4
+  ; CHECK-NEXT: ret void
+  %1 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 1, i32 1
+  %3 = load float, ptr addrspace(3) %1, align 4
+  ret void
+}
+
+define void @gep_4d_index_test()  {
+    ; CHECK-LABEL: gep_4d_index_test
+    ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4
+    ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 1
+    ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 3
+    ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 7
+    ; CHECK-NEXT: getelementptr inbounds [16 x i32], ptr %.1dim, i32 0, i32 15
+    ; CHECK-NEXT:    ret void
+    %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4
+    %2 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0, i32 0, i32 1
+    %3 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0, i32 1, i32 1
+    %4 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 1, i32 1, i32 1
+    %5 = getelementptr inbounds [2 x [2 x[2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 1, i32 1, i32 1
+    ret void
+}
+
+define void @gep_4d_index_and_gep_chain_mixed() {
+  ; CHECK-LABEL: gep_4d_index_and_gep_chain_mixed
+  ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [16 x i32], align 4
+  ; CHECK-COUNT-16: getelementptr inbounds [16 x i32], ptr [[ALLOCA]], i32 0, i32 {{[0-9]|1[0-5]}}
+  ; CHECK-NEXT: ret void
+  %1 = alloca [2x[2 x[2 x [2 x i32]]]], align 4
+  %a4d0_0 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x[2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 0
+  %a2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 0, i32 0
+  %a2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 0, i32 1
+  %a2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 1, i32 0
+  %a2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %a4d0_0, i32 0, i32 1, i32 1
+  %b4d0_1 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 0, i32 1
+  %b2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 0, i32 0
+  %b2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 0, i32 1
+  %b2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 1, i32 0
+  %b2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %b4d0_1, i32 0, i32 1, i32 1
+  %c4d1_0 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 0
+  %c2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 0, i32 0
+  %c2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 0, i32 1
+  %c2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 1, i32 0
+  %c2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %c4d1_0, i32 0, i32 1, i32 1
+  %g4d1_1 = getelementptr inbounds [2 x [2 x [2 x [2 x i32]]]], [2 x [2 x [2 x [2 x i32]]]]* %1, i32 0, i32 1, i32 1
+  %g2d0_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 0, i32 0
+  %g2d0_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 0, i32 1
+  %g2d1_0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 1, i32 0
+  %g2d1_1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %g4d1_1, i32 0, i32 1, i32 1
+  ret void
+}
+
 ; Make sure we don't try to walk the body of a function declaration.
 declare void @opaque_function()
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
index c960aad3d2627..778113bd3160f 100644
--- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
+++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
@@ -111,13 +111,13 @@ define <4 x i32> @multid_load_test() #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 1) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 1) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 2) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 2) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), i32 3) to ptr addrspace(3)
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 3) to ptr addrspace(3)
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4
 ; CHECK-NEXT:    [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]]
 ; CHECK-NEXT:    [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]]

From 2dd50bf79edefa28beffdbba4edfc2c753adae61 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 16 Jun 2025 08:54:04 -0700
Subject: [PATCH 599/851] [OpenMP] Fix warnings

This patch fixes:

  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp:7233:9: error: unused
  variable 'TaskTy' [-Werror,-Wunused-variable]

  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp:7666:15: error: unused
  variable 'ArrayType' [-Werror,-Wunused-variable]
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index c1f02b2b240de..828205776f3fe 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -7257,6 +7257,7 @@ static Function *emitTargetTaskProxyFunction(
     assert(TaskTy != TaskWithPrivatesTy &&
            "If there are offloading arrays to pass to the target"
            "TaskTy cannot be the same as TaskWithPrivatesTy");
+    (void)TaskTy;
     Value *Privates =
         Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
     for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
@@ -7669,6 +7670,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
         Type *ElementType = PrivatesTy->getElementType(i);
         assert(ElementType == ArrayType &&
                "ElementType should match ArrayType");
+        (void)ArrayType;
 
         Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
         Builder.CreateMemCpy(

From dfb14b65bc0a277f920c797b4557e79685055b4f Mon Sep 17 00:00:00 2001
From: Kiran Chandramohan <kiran.chandramohan@arm.com>
Date: Mon, 16 Jun 2025 16:55:40 +0100
Subject: [PATCH 600/851] [Flang] NFC: Update test to work on Mac (#144253)

`%flang` expands to `flang -isysroot <SDK location>` in Mac and probably
other OS as well. `fc1` is only accepted as the first argument and hence
in this case it fails.

Use the `%flang_fc1` option to correctly expand to `flang -fc1 -isysroot
<SDK location>`.
---
 flang/test/Preprocessing/bug518.F | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Preprocessing/bug518.F b/flang/test/Preprocessing/bug518.F
index 346e04cc56d38..0b680dd5751b9 100644
--- a/flang/test/Preprocessing/bug518.F
+++ b/flang/test/Preprocessing/bug518.F
@@ -1,4 +1,4 @@
-! RUN: %flang -fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
 ! CHECK: k=1_4
                         k=                                            1_99999999
      &4

From 711f6a8603717a6dc7e6202c614433ea2f9c0967 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Mon, 16 Jun 2025 16:58:00 +0100
Subject: [PATCH 601/851] [llvm][DebugInfo] Encode DW_AT_object_pointer on
 method declarations with DW_FORM_implicit_const (#124790)

We started attaching `DW_AT_object_pointer`s on method declarations in
https://github.com/llvm/llvm-project/pull/122742. However, that caused
the `.debug_info` section size to increase significantly (by around ~10%
on some projects). This was mainly due to the large number of new
`DW_FORM_ref4` values. This patch tries to address that regression by
changing the `DW_FORM_ref4` to a `DW_FORM_implicit_const` for
declarations. The value of `DW_FORM_implicit_const` will be the *index*
of the object parameter in the list of formal parameters of the
subprogram (i.e., if the first `DW_TAG_formal_parameter` is the object
pointer, the `DW_FORM_implicit_const` would be `0`). The DWARFv5 spec
only mentions the use of the `reference` attribute class to for
`DW_AT_object_pointer`. So using a `DW_FORM_impilicit_const` would be an
extension to (and not something mandated/specified by) the standard.
Though it'd make sense to extend the wording in the spec to allow for
this optimization.

That way we don't pay for the 4 byte references on every attribute
occurrence. In a local build of clang this barely affected the
`.debug_info` section size (but did increase `.debug_abbrev` by up to
10%, which doesn't impact the total debug-info size much however).

We guarded this on LLDB tuning (since using `DW_FORM_implicit_const` for
this purpose may surprise consumers) and DWARFv5 (since that's where
`DW_FORM_implicit_const` was first standardized).
---
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp     | 28 ++++++-
 llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h       |  6 +-
 ...DW_AT_object_pointer-non-standard-index.ll | 79 +++++++++++++++++++
 .../DebugInfo/X86/DW_AT_object_pointer.ll     | 24 +++++-
 .../tools/llvm-dwarfdump/X86/statistics.ll    |  4 +-
 5 files changed, 132 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 2481a9bd3ce74..bfe6e7d6a802a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -895,7 +895,10 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
   }
 }
 
-void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
+std::optional<unsigned>
+DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
+  // Args[0] is the return type.
+  std::optional<unsigned> ObjectPointerIndex;
   for (unsigned i = 1, N = Args.size(); i < N; ++i) {
     const DIType *Ty = Args[i];
     if (!Ty) {
@@ -906,8 +909,16 @@ void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
       addType(Arg, Ty);
       if (Ty->isArtificial())
         addFlag(Arg, dwarf::DW_AT_artificial);
+
+      if (Ty->isObjectPointer()) {
+        assert(!ObjectPointerIndex &&
+               "Can't have more than one object pointer");
+        ObjectPointerIndex = i;
+      }
     }
   }
+
+  return ObjectPointerIndex;
 }
 
 void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
@@ -1458,7 +1469,20 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
 
     // Add arguments. Do not add arguments for subprogram definition. They will
     // be handled while processing variables.
-    constructSubprogramArguments(SPDie, Args);
+    //
+    // Encode the object pointer as an index instead of a DIE reference in order
+    // to minimize the affect on the .debug_info size.
+    if (std::optional<unsigned> ObjectPointerIndex =
+            constructSubprogramArguments(SPDie, Args)) {
+      if (getDwarfDebug().tuneForLLDB() &&
+          getDwarfDebug().getDwarfVersion() >= 5) {
+        // 0th index in Args is the return type, hence adjust by 1. In DWARF
+        // we want the first parameter to be at index 0.
+        assert(*ObjectPointerIndex > 0);
+        addSInt(SPDie, dwarf::DW_AT_object_pointer,
+                dwarf::DW_FORM_implicit_const, *ObjectPointerIndex - 1);
+      }
+    }
   }
 
   addThrownTypes(SPDie, SP->getThrownTypes());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index e1156bccfb1ab..43bf197563867 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -273,7 +273,11 @@ class DwarfUnit : public DIEUnit {
   void constructContainingTypeDIEs();
 
   /// Construct function argument DIEs.
-  void constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args);
+  ///
+  /// \returns The index of the object parameter in \c Args if one exists.
+  /// Returns std::nullopt otherwise.
+  std::optional<unsigned> constructSubprogramArguments(DIE &Buffer,
+                                                       DITypeRefArray Args);
 
   /// Create a DIE with the given Tag, add the DIE to its parent, and
   /// call insertDIE if MD is not null.
diff --git a/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll b/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll
new file mode 100644
index 0000000000000..40b791fd27e32
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/DW_AT_object_pointer-non-standard-index.ll
@@ -0,0 +1,79 @@
+; Similar to DW_AT_object_pointer.ll but tests that we correctly
+; encode the object pointer index even if it's not the first argument
+; of the subprogram (which isn't something the major compilers do,
+; but is not mandated by DWARF).
+
+; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=5 -filetype=obj < %s | \
+; RUN:      llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK
+
+; CHECK: DW_TAG_class_type
+; CHECK: [[DECL:0x[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK:                         DW_AT_name {{.*}} "A"
+; CHECK: DW_AT_object_pointer [DW_FORM_implicit_const] (2)
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:   DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]})
+; CHECK:   DW_AT_specification [DW_FORM_ref4] (cu + {{.*}} => {[[DECL]]}
+; CHECK:   DW_TAG_formal_parameter
+; CHECK:   DW_TAG_formal_parameter
+; CHECK-NOT: "this"
+; CHECK: [[PARAM]]: DW_TAG_formal_parameter
+; CHECK: DW_AT_name
+; CHECK-SAME: = "this")
+; CHECK:   DW_TAG_formal_parameter
+
+%class.A = type { i8 }
+
+define linkonce_odr noundef ptr @_ZN1AC1Eii(ptr noundef nonnull returned align 1 dereferenceable(1) %this, i32 noundef %x, i32 noundef %y, i32 noundef %z) !dbg !24 {
+entry:
+  %this.addr = alloca ptr, align 8
+  %x.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  %z.addr = alloca i32, align 4
+  store ptr %this, ptr %this.addr, align 8
+    #dbg_declare(ptr %this.addr, !26, !DIExpression(), !28)
+  store i32 %x, ptr %x.addr, align 4
+    #dbg_declare(ptr %x.addr, !29, !DIExpression(), !30)
+  store i32 %y, ptr %y.addr, align 4
+    #dbg_declare(ptr %y.addr, !31, !DIExpression(), !32)
+  store i32 %z, ptr %y.addr, align 4
+    #dbg_declare(ptr %z.addr, !36, !DIExpression(), !37)
+  %this1 = load ptr, ptr %this.addr, align 8
+  %0 = load i32, ptr %x.addr, align 4, !dbg !33
+  %1 = load i32, ptr %y.addr, align 4, !dbg !33
+  %2 = load i32, ptr %z.addr, align 4, !dbg !33
+  ret ptr %this1, !dbg !34
+}
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!12, !13}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 3, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 20.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/")
+!3 = !DIFile(filename: "object_ptr.cpp", directory: "/tmp")
+!4 = !{!0}
+!5 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !3, line: 1, size: 8, flags: DIFlagTypePassByValue | DIFlagNonTrivial, elements: !6, identifier: "_ZTS1A")
+!6 = !{!7}
+!7 = !DISubprogram(name: "A", scope: !5, file: !3, line: 2, type: !8, scopeLine: 2, flags: DIFlagPublic | DIFlagPrototyped, spFlags: 0)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !11, !11, !10, !35}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{i32 7, !"Dwarf Version", i32 5}
+!13 = !{i32 2, !"Debug Info Version", i32 3}
+!18 = !{!"clang version 20.0.0git"}
+!24 = distinct !DISubprogram(name: "A", linkageName: "_ZN1AC1Eii", scope: !5, file: !3, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, declaration: !7, retainedNodes: !25)
+!25 = !{}
+!26 = !DILocalVariable(name: "this", arg: 3, scope: !24, type: !27, flags: DIFlagArtificial | DIFlagObjectPointer)
+!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64)
+!28 = !DILocation(line: 0, scope: !24)
+!29 = !DILocalVariable(name: "x", arg: 2, scope: !24, file: !3, line: 2, type: !11)
+!30 = !DILocation(line: 2, column: 19, scope: !24)
+!31 = !DILocalVariable(name: "y", arg: 1, scope: !24, file: !3, line: 2, type: !11)
+!32 = !DILocation(line: 2, column: 26, scope: !24)
+!33 = !DILocation(line: 2, column: 29, scope: !24)
+!34 = !DILocation(line: 2, column: 30, scope: !24)
+!35 = !DIBasicType(name: "short", size: 16, encoding: DW_ATE_signed)
+!36 = !DILocalVariable(name: "z", arg: 4, scope: !24, file: !3, line: 2, type: !35)
+!37 = !DILocation(line: 2, column: 35, scope: !24)
diff --git a/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll b/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll
index d9988ac31451e..596727dce0433 100644
--- a/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll
+++ b/llvm/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -1,14 +1,30 @@
-; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
-; RUN: llvm-dwarfdump -v -debug-info %t | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=gdb -dwarf-version=5 -filetype=obj < %s | \
+; RUN:      llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-GDB
+
+; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=4 -filetype=obj < %s | \
+; RUN:      llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-LLDB-DWARF4
+
+; RUN: llc -mtriple=x86_64-apple-darwin -debugger-tune=lldb -dwarf-version=5 -filetype=obj < %s | \
+; RUN:      llvm-dwarfdump -v -debug-info - | FileCheck %s --check-prefixes=CHECK,CHECK-LLDB-DWARF5
 
 ; CHECK: DW_TAG_formal_parameter [
 ; CHECK-NOT: ""
 ; CHECK: DW_TAG
 ; CHECK: DW_TAG_class_type
-; CHECK: DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]})
+; CHECK: [[DECL:0x[0-9a-f]+]]: DW_TAG_subprogram
+; CHECK:                         DW_AT_name {{.*}} "A"
+; CHECK-LLDB-DWARF5:             DW_AT_object_pointer [DW_FORM_implicit_const] (0)
+; CHECK-GDB-NOT:                 DW_AT_object_pointer
+; CHECK-LLDB-DWARF4-NOT:         DW_AT_object_pointer
+; CHECK: DW_TAG_formal_parameter
+;
+; CHECK: DW_TAG_subprogram
+; CHECK:   DW_AT_object_pointer [DW_FORM_ref4]     (cu + 0x{{[0-9a-f]*}} => {[[PARAM:0x[0-9a-f]*]]})
+; CHECK:   DW_AT_specification [DW_FORM_ref4] (cu + {{.*}} => {[[DECL]]}
 ; CHECK: [[PARAM]]:     DW_TAG_formal_parameter
 ; CHECK-NOT: DW_TAG
-; CHECK: DW_AT_name [DW_FORM_strp]     ( .debug_str[0x{{[0-9a-f]*}}] = "this")
+; CHECK: DW_AT_name
+; CHECK-SAME        = "this")
 
 %class.A = type { i32 }
 
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll
index a454bf14c3353..77de0241daeab 100644
--- a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll
+++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 %s -o - -filetype=obj \
+; RUN: llc -O0 %s -o - -filetype=obj -debugger-tune=gdb -accel-tables=Apple \
 ; RUN:   | llvm-dwarfdump -statistics - | FileCheck %s
 ; CHECK: "version": 9,
 
@@ -55,7 +55,7 @@
 ; CHECK:      "#bytes within functions": [[FUNCSIZE:[0-9]+]]
 ; CHECK:      "#bytes within inlined functions": [[INLINESIZE:[0-9]+]]
 ; CHECK:      "#bytes in __debug_loc": 35,
-; CHECK-NEXT: "#bytes in __debug_abbrev": 384,
+; CHECK-NEXT: "#bytes in __debug_abbrev": 375,
 ; CHECK-NEXT: "#bytes in __debug_info": 459,
 ; CHECK-NEXT: "#bytes in __debug_str": 231,
 ; CHECK-NEXT: "#bytes in __apple_names": 348,

From c9ac1679b5d3a3839640486dd4bd931a19f4725a Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 16 Jun 2025 08:59:10 -0700
Subject: [PATCH 602/851] [lldb] Remove a redundant control flow statement
 (NFC) (#144284)

---
 lldb/tools/debugserver/source/RNBRemote.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp
index 391d1c50168ea..8be384c6d24af 100644
--- a/lldb/tools/debugserver/source/RNBRemote.cpp
+++ b/lldb/tools/debugserver/source/RNBRemote.cpp
@@ -1476,7 +1476,6 @@ bool RNBRemote::InitializeRegisters(bool force) {
 
 void RNBRemote::NotifyThatProcessStopped(void) {
   RNBRemote::HandlePacket_last_signal(NULL);
-  return;
 }
 
 /* 'A arglen,argnum,arg,...'

From 05cd32adb7ce2354563814ab6e0b818f2ed6fa26 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 16 Jun 2025 08:59:18 -0700
Subject: [PATCH 603/851] [llvm] Remove unused includes (NFC) (#144293)

These are identified by misc-include-cleaner.  I've filtered out those
that break builds.  Also, I'm staying away from llvm-config.h,
config.h, and Compiler.h, which likely cause platform- or
compiler-specific build failures.
---
 llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp                  | 1 -
 llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp                   | 1 -
 llvm/lib/CodeGen/MachineBasicBlock.cpp                         | 1 -
 llvm/lib/IR/BasicBlock.cpp                                     | 1 -
 llvm/lib/IR/IRBuilder.cpp                                      | 1 -
 llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 1 -
 llvm/lib/Transforms/Scalar/ConstantHoisting.cpp                | 1 -
 llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp                 | 1 -
 llvm/lib/Transforms/Utils/BasicBlockUtils.cpp                  | 3 +--
 llvm/lib/Transforms/Utils/Local.cpp                            | 1 -
 10 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index e3e6c72165ebb..0f2c580c759cf 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -16,7 +16,6 @@
 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 
 using namespace llvm;
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index f68420ed66e4b..1c4150127a908 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -30,7 +30,6 @@
 #include "llvm/CodeGen/Register.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Casting.h"
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 48b406e016c05..c3c5a0f5102d7 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -30,7 +30,6 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 8b3e91750f86c..3642e935397cb 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -21,7 +21,6 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 
 #include "LLVMContextImpl.h"
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index a33ef9c7d4a17..0a8b26b5f3d83 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -15,7 +15,6 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 1d208de75db3b..a9751ab03e20e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -18,7 +18,6 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/PatternMatch.h"
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 839f5933e09b0..db594e033e21e 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -44,7 +44,6 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstrTypes.h"
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index d20378ece4eea..a09303bb4469f 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -31,7 +31,6 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/ValueHandle.h"
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 1feed14b4fed8..98c65ae11b1c3 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -28,11 +28,10 @@
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index a3252a69874d3..33143700f5604 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -49,7 +49,6 @@
 #include "llvm/IR/EHPersonalities.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalObject.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"

From ec32d8858559e4e6b5e520dfd36bfb64056fbdbb Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Mon, 16 Jun 2025 17:02:24 +0100
Subject: [PATCH 604/851] Annotate potentially unused variables introduced in
 #133499 (#144379)

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 828205776f3fe..cf17a84242c70 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -7230,7 +7230,7 @@ static Function *emitTargetTaskProxyFunction(
 
   Type *ThreadIDTy = Type::getInt32Ty(Ctx);
   Type *TaskPtrTy = OMPBuilder.TaskPtr;
-  Type *TaskTy = OMPBuilder.Task;
+  [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
 
   auto ProxyFnTy =
       FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
@@ -7664,7 +7664,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
           Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
       for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
         Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
-        Type *ArrayType = getOffloadingArrayType(PtrToPrivatize);
+        [[maybe_unused]] Type *ArrayType =
+            getOffloadingArrayType(PtrToPrivatize);
         assert(ArrayType && "ArrayType cannot be nullptr");
 
         Type *ElementType = PrivatesTy->getElementType(i);

From 5acdd8d0cf785595b06c1a28326b560f720b4f16 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 09:15:59 -0700
Subject: [PATCH 605/851] AVR: Rename AVRMCExpr::VK_ to AVR::S_

Prepare for removing AVRMCExpr. Adopt the new naming convention (S_
instead of VK_; the relocation specifier was previously named
`VariantKind`)) used by most other targets.

Make AVRMCAsmInfo.h include AVRMCExpr.h and change .cpp files to include
AVRMCAsmInfo.h. We will eventually remove AVRMCExpr.h.
---
 llvm/lib/Target/AVR/AVRAsmPrinter.cpp         |   4 +-
 llvm/lib/Target/AVR/AVRMCInstLower.cpp        |  14 +-
 .../lib/Target/AVR/AsmParser/AVRAsmParser.cpp |  14 +-
 .../AVR/MCTargetDesc/AVRELFObjectWriter.cpp   |  32 +--
 .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp  | 179 +++++++++++++++++
 .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.h    |  29 +++
 .../AVR/MCTargetDesc/AVRMCCodeEmitter.cpp     |   2 +-
 .../AVR/MCTargetDesc/AVRMCELFStreamer.cpp     |  22 +--
 .../AVR/MCTargetDesc/AVRMCELFStreamer.h       |   9 +-
 .../lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 185 +-----------------
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h  |  23 ---
 11 files changed, 260 insertions(+), 253 deletions(-)

diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index ed537f8cc7178..1a1e5155979e6 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -16,7 +16,7 @@
 #include "AVRSubtarget.h"
 #include "AVRTargetMachine.h"
 #include "MCTargetDesc/AVRInstPrinter.h"
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "TargetInfo/AVRTargetInfo.h"
 
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -215,7 +215,7 @@ const MCExpr *AVRAsmPrinter::lowerConstant(const Constant *CV,
     bool IsProgMem = GV->getAddressSpace() == AVR::ProgramMemory;
     if (IsProgMem) {
       const MCExpr *Expr = MCSymbolRefExpr::create(getSymbol(GV), Ctx);
-      return AVRMCExpr::create(AVRMCExpr::VK_PM, Expr, false, Ctx);
+      return AVRMCExpr::create(AVR::S_PM, Expr, false, Ctx);
     }
   }
 
diff --git a/llvm/lib/Target/AVR/AVRMCInstLower.cpp b/llvm/lib/Target/AVR/AVRMCInstLower.cpp
index 47d9073f6eb84..f4bddfdac3461 100644
--- a/llvm/lib/Target/AVR/AVRMCInstLower.cpp
+++ b/llvm/lib/Target/AVR/AVRMCInstLower.cpp
@@ -13,7 +13,7 @@
 
 #include "AVRMCInstLower.h"
 #include "AVRInstrInfo.h"
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/IR/Mangler.h"
@@ -42,19 +42,19 @@ AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,
 
   if (TF & AVRII::MO_LO) {
     if (IsFunction) {
-      Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVRMCExpr::VK_LO8_GS
-                                                        : AVRMCExpr::VK_PM_LO8,
+      Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVR::S_LO8_GS
+                                                        : AVR::S_PM_LO8,
                                Expr, IsNegated, Ctx);
     } else {
-      Expr = AVRMCExpr::create(AVRMCExpr::VK_LO8, Expr, IsNegated, Ctx);
+      Expr = AVRMCExpr::create(AVR::S_LO8, Expr, IsNegated, Ctx);
     }
   } else if (TF & AVRII::MO_HI) {
     if (IsFunction) {
-      Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVRMCExpr::VK_HI8_GS
-                                                        : AVRMCExpr::VK_PM_HI8,
+      Expr = AVRMCExpr::create(Subtarget.hasEIJMPCALL() ? AVR::S_HI8_GS
+                                                        : AVR::S_PM_HI8,
                                Expr, IsNegated, Ctx);
     } else {
-      Expr = AVRMCExpr::create(AVRMCExpr::VK_HI8, Expr, IsNegated, Ctx);
+      Expr = AVRMCExpr::create(AVR::S_HI8, Expr, IsNegated, Ctx);
     }
   } else if (TF != 0) {
     llvm_unreachable("Unknown target flag on symbol operand");
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index cab5caffdcba8..e82bd761eeb39 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "AVRRegisterInfo.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "MCTargetDesc/AVRMCELFStreamer.h"
-#include "MCTargetDesc/AVRMCExpr.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 #include "TargetInfo/AVRTargetInfo.h"
 
@@ -447,7 +447,7 @@ bool AVRAsmParser::tryParseExpression(OperandVector &Operands, int64_t offset) {
 
 bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
   bool isNegated = false;
-  AVRMCExpr::Specifier ModifierKind = AVRMCExpr::VK_AVR_NONE;
+  AVR::Specifier ModifierKind = AVR::S_AVR_NONE;
 
   SMLoc S = Parser.getTok().getLoc();
 
@@ -473,14 +473,14 @@ bool AVRAsmParser::tryParseRelocExpression(OperandVector &Operands) {
   StringRef ModifierName = Parser.getTok().getString();
   ModifierKind = AVRMCExpr::parseSpecifier(ModifierName);
 
-  if (ModifierKind != AVRMCExpr::VK_AVR_NONE) {
+  if (ModifierKind != AVR::S_AVR_NONE) {
     Parser.Lex();
     Parser.Lex(); // Eat modifier name and parenthesis
     if (Parser.getTok().getString() == GENERATE_STUBS &&
         Parser.getTok().getKind() == AsmToken::Identifier) {
       std::string GSModName = ModifierName.str() + "_" + GENERATE_STUBS;
       ModifierKind = AVRMCExpr::parseSpecifier(GSModName);
-      if (ModifierKind != AVRMCExpr::VK_AVR_NONE)
+      if (ModifierKind != AVR::S_AVR_NONE)
         Parser.Lex(); // Eat gs modifier name
     }
   } else {
@@ -698,15 +698,15 @@ ParseStatus AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
       Tokens[1].getKind() == AsmToken::Identifier) {
     MCSymbol *Symbol = getContext().getOrCreateSymbol(".text");
     AVRStreamer.emitValueForModiferKind(Symbol, SizeInBytes, L,
-                                        AVRMCExpr::VK_AVR_NONE);
+                                        AVR::S_AVR_NONE);
     return ParseStatus::NoMatch;
   }
 
   if (Parser.getTok().getKind() == AsmToken::Identifier &&
       Parser.getLexer().peekTok().getKind() == AsmToken::LParen) {
     StringRef ModifierName = Parser.getTok().getString();
-    AVRMCExpr::Specifier Spec = AVRMCExpr::parseSpecifier(ModifierName);
-    if (Spec != AVRMCExpr::VK_AVR_NONE) {
+    AVR::Specifier Spec = AVRMCExpr::parseSpecifier(ModifierName);
+    if (Spec != AVR::S_AVR_NONE) {
       Parser.Lex();
       Parser.Lex(); // Eat the modifier and parenthesis
     } else {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
index e79ba29e0cbec..619efb376c613 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/AVRFixupKinds.h"
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 
 #include "llvm/MC/MCAssembler.h"
@@ -36,42 +36,42 @@ AVRELFObjectWriter::AVRELFObjectWriter(uint8_t OSABI)
 unsigned AVRELFObjectWriter::getRelocType(const MCFixup &Fixup,
                                           const MCValue &Target,
                                           bool IsPCRel) const {
-  auto Modifier = AVRMCExpr::Specifier(Target.getSpecifier());
+  auto Spec = Target.getSpecifier();
   switch ((unsigned)Fixup.getKind()) {
   case FK_Data_1:
-    switch (Modifier) {
+    switch (Spec) {
     default:
       llvm_unreachable("Unsupported Modifier");
-    case AVRMCExpr::VK_None:
+    case AVR::S_None:
       return ELF::R_AVR_8;
-    case AVRMCExpr::VK_DIFF8:
+    case AVR::S_DIFF8:
       return ELF::R_AVR_DIFF8;
-    case AVRMCExpr::VK_LO8:
+    case AVR::S_LO8:
       return ELF::R_AVR_8_LO8;
-    case AVRMCExpr::VK_HI8:
+    case AVR::S_HI8:
       return ELF::R_AVR_8_HI8;
-    case AVRMCExpr::VK_HH8:
+    case AVR::S_HH8:
       return ELF::R_AVR_8_HLO8;
     }
   case FK_Data_4:
-    switch (Modifier) {
+    switch (Spec) {
     default:
       llvm_unreachable("Unsupported Modifier");
-    case AVRMCExpr::VK_None:
+    case AVR::S_None:
       return ELF::R_AVR_32;
-    case AVRMCExpr::VK_DIFF32:
+    case AVR::S_DIFF32:
       return ELF::R_AVR_DIFF32;
     }
   case FK_Data_2:
-    switch (Modifier) {
+    switch (Spec) {
     default:
       llvm_unreachable("Unsupported Modifier");
-    case AVRMCExpr::VK_None:
+    case AVR::S_None:
       return ELF::R_AVR_16;
-    case AVRMCExpr::VK_AVR_NONE:
-    case AVRMCExpr::VK_PM:
+    case AVR::S_AVR_NONE:
+    case AVR::S_PM:
       return ELF::R_AVR_16_PM;
-    case AVRMCExpr::VK_DIFF16:
+    case AVR::S_DIFF16:
       return ELF::R_AVR_DIFF16;
     }
   case AVR::fixup_32:
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index d37e39c51e159..68db5227d073c 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -11,7 +11,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AVRMCAsmInfo.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/TargetParser/Triple.h"
 
 using namespace llvm;
@@ -26,3 +29,179 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
   UsesELFSectionDirectiveForBSS = true;
   SupportsDebugInformation = true;
 }
+
+namespace {
+const struct ModifierEntry {
+  const char *const Spelling;
+  AVRMCExpr::Specifier specifier;
+} ModifierNames[] = {
+    {"lo8", AVR::S_LO8},       {"hi8", AVR::S_HI8},
+    {"hh8", AVR::S_HH8}, // synonym with hlo8
+    {"hlo8", AVR::S_HH8},      {"hhi8", AVR::S_HHI8},
+
+    {"pm", AVR::S_PM},         {"pm_lo8", AVR::S_PM_LO8},
+    {"pm_hi8", AVR::S_PM_HI8}, {"pm_hh8", AVR::S_PM_HH8},
+
+    {"lo8_gs", AVR::S_LO8_GS}, {"hi8_gs", AVR::S_HI8_GS},
+    {"gs", AVR::S_GS},
+};
+
+} // end of anonymous namespace
+
+AVRMCExpr::Specifier AVRMCExpr::parseSpecifier(StringRef Name) {
+  const auto &Modifier =
+      llvm::find_if(ModifierNames, [&Name](ModifierEntry const &Mod) {
+        return Mod.Spelling == Name;
+      });
+
+  if (Modifier != std::end(ModifierNames)) {
+    return Modifier->specifier;
+  }
+  return AVR::S_AVR_NONE;
+}
+
+const char *AVRMCExpr::getName() const {
+  const auto &Modifier =
+      llvm::find_if(ModifierNames, [this](ModifierEntry const &Mod) {
+        return Mod.specifier == specifier;
+      });
+
+  if (Modifier != std::end(ModifierNames)) {
+    return Modifier->Spelling;
+  }
+  return nullptr;
+}
+
+AVR::Fixups AVRMCExpr::getFixupKind() const {
+  AVR::Fixups Kind = AVR::Fixups::LastTargetFixupKind;
+
+  switch (specifier) {
+  case AVR::S_LO8:
+    Kind = isNegated() ? AVR::fixup_lo8_ldi_neg : AVR::fixup_lo8_ldi;
+    break;
+  case AVR::S_HI8:
+    Kind = isNegated() ? AVR::fixup_hi8_ldi_neg : AVR::fixup_hi8_ldi;
+    break;
+  case AVR::S_HH8:
+    Kind = isNegated() ? AVR::fixup_hh8_ldi_neg : AVR::fixup_hh8_ldi;
+    break;
+  case AVR::S_HHI8:
+    Kind = isNegated() ? AVR::fixup_ms8_ldi_neg : AVR::fixup_ms8_ldi;
+    break;
+
+  case AVR::S_PM_LO8:
+    Kind = isNegated() ? AVR::fixup_lo8_ldi_pm_neg : AVR::fixup_lo8_ldi_pm;
+    break;
+  case AVR::S_PM_HI8:
+    Kind = isNegated() ? AVR::fixup_hi8_ldi_pm_neg : AVR::fixup_hi8_ldi_pm;
+    break;
+  case AVR::S_PM_HH8:
+    Kind = isNegated() ? AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm;
+    break;
+  case AVR::S_PM:
+  case AVR::S_GS:
+    Kind = AVR::fixup_16_pm;
+    break;
+  case AVR::S_LO8_GS:
+    Kind = AVR::fixup_lo8_ldi_gs;
+    break;
+  case AVR::S_HI8_GS:
+    Kind = AVR::fixup_hi8_ldi_gs;
+    break;
+
+  default:
+    llvm_unreachable("Uninitialized expression");
+  }
+
+  return Kind;
+}
+
+int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
+  if (Negated)
+    Value *= -1;
+
+  switch (specifier) {
+  case AVR::S_LO8:
+    Value &= 0xff;
+    break;
+  case AVR::S_HI8:
+    Value &= 0xff00;
+    Value >>= 8;
+    break;
+  case AVR::S_HH8:
+    Value &= 0xff0000;
+    Value >>= 16;
+    break;
+  case AVR::S_HHI8:
+    Value &= 0xff000000;
+    Value >>= 24;
+    break;
+  case AVR::S_PM_LO8:
+  case AVR::S_LO8_GS:
+    Value >>= 1; // Program memory addresses must always be shifted by one.
+    Value &= 0xff;
+    break;
+  case AVR::S_PM_HI8:
+  case AVR::S_HI8_GS:
+    Value >>= 1; // Program memory addresses must always be shifted by one.
+    Value &= 0xff00;
+    Value >>= 8;
+    break;
+  case AVR::S_PM_HH8:
+    Value >>= 1; // Program memory addresses must always be shifted by one.
+    Value &= 0xff0000;
+    Value >>= 16;
+    break;
+  case AVR::S_PM:
+  case AVR::S_GS:
+    Value >>= 1; // Program memory addresses must always be shifted by one.
+    break;
+
+  case AVR::S_AVR_NONE:
+  default:
+    llvm_unreachable("Uninitialized expression.");
+  }
+  return static_cast<uint64_t>(Value) & 0xff;
+}
+
+bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
+                                          const MCAssembler *Asm) const {
+  MCValue Value;
+  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, Asm);
+  if (!isRelocatable)
+    return false;
+
+  if (Value.isAbsolute()) {
+    Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
+  } else {
+    if (!Asm || !Asm->hasLayout())
+      return false;
+
+    auto Spec = AVR::S_None;
+    if (Value.getSpecifier() != MCSymbolRefExpr::VK_None)
+      return false;
+    assert(!Value.getSubSym());
+    if (specifier == AVR::S_PM)
+      Spec = AVR::S_PM;
+
+    // TODO: don't attach specifier to MCSymbolRefExpr.
+    Result =
+        MCValue::get(Value.getAddSym(), nullptr, Value.getConstant(), Spec);
+  }
+
+  return true;
+}
+
+bool AVRMCExpr::evaluateAsConstant(int64_t &Result) const {
+  MCValue Value;
+  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, nullptr);
+  if (!isRelocatable)
+    return false;
+
+  if (Value.isAbsolute()) {
+    Result = evaluateAsInt64(Value.getConstant());
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
index 17dd77f6266a1..649e247adab0f 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_AVR_ASM_INFO_H
 #define LLVM_AVR_ASM_INFO_H
 
+#include "MCTargetDesc/AVRMCExpr.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
 
 namespace llvm {
 
@@ -25,6 +27,33 @@ class AVRMCAsmInfo : public MCAsmInfo {
   explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options);
 };
 
+namespace AVR {
+using Specifier = uint16_t;
+enum {
+  S_None,
+
+  S_AVR_NONE = MCSymbolRefExpr::FirstTargetSpecifier,
+
+  S_HI8,  ///< Corresponds to `hi8()`.
+  S_LO8,  ///< Corresponds to `lo8()`.
+  S_HH8,  ///< Corresponds to `hlo8() and hh8()`.
+  S_HHI8, ///< Corresponds to `hhi8()`.
+
+  S_PM,     ///< Corresponds to `pm()`, reference to program memory.
+  S_PM_LO8, ///< Corresponds to `pm_lo8()`.
+  S_PM_HI8, ///< Corresponds to `pm_hi8()`.
+  S_PM_HH8, ///< Corresponds to `pm_hh8()`.
+
+  S_LO8_GS, ///< Corresponds to `lo8(gs())`.
+  S_HI8_GS, ///< Corresponds to `hi8(gs())`.
+  S_GS,     ///< Corresponds to `gs()`.
+
+  S_DIFF8,
+  S_DIFF16,
+  S_DIFF32,
+};
+} // namespace AVR
+
 } // end namespace llvm
 
 #endif // LLVM_AVR_ASM_INFO_H
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
index fa01dad5ec128..4934e1c71bc03 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -12,7 +12,7 @@
 
 #include "AVRMCCodeEmitter.h"
 
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 
 #include "llvm/ADT/APFloat.h"
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
index 88393fb9928a4..0644f422b328e 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
@@ -23,19 +23,19 @@ using namespace llvm;
 void AVRMCELFStreamer::emitValueForModiferKind(
     const MCSymbol *Sym, unsigned SizeInBytes, SMLoc Loc,
     AVRMCExpr::Specifier ModifierKind) {
-  AVRMCExpr::Specifier Kind = AVRMCExpr::VK_AVR_NONE;
-  if (ModifierKind == AVRMCExpr::VK_AVR_NONE) {
-    Kind = AVRMCExpr::VK_DIFF8;
+  AVRMCExpr::Specifier Kind = AVR::S_AVR_NONE;
+  if (ModifierKind == AVR::S_AVR_NONE) {
+    Kind = AVR::S_DIFF8;
     if (SizeInBytes == SIZE_LONG)
-      Kind = AVRMCExpr::VK_DIFF32;
+      Kind = AVR::S_DIFF32;
     else if (SizeInBytes == SIZE_WORD)
-      Kind = AVRMCExpr::VK_DIFF16;
-  } else if (ModifierKind == AVRMCExpr::VK_LO8)
-    Kind = AVRMCExpr::VK_LO8;
-  else if (ModifierKind == AVRMCExpr::VK_HI8)
-    Kind = AVRMCExpr::VK_HI8;
-  else if (ModifierKind == AVRMCExpr::VK_HH8)
-    Kind = AVRMCExpr::VK_HH8;
+      Kind = AVR::S_DIFF16;
+  } else if (ModifierKind == AVR::S_LO8)
+    Kind = AVR::S_LO8;
+  else if (ModifierKind == AVR::S_HI8)
+    Kind = AVR::S_HI8;
+  else if (ModifierKind == AVR::S_HH8)
+    Kind = AVR::S_HH8;
   MCELFStreamer::emitValue(
       MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VariantKind(Kind),
                               getContext()),
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
index 2d45de083583c..88352337524ad 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIB_TARGET_AVR_MCTARGETDESC_AVRMCELFSTREAMER_H
 #define LLVM_LIB_TARGET_AVR_MCTARGETDESC_AVRMCELFSTREAMER_H
 
-#include "MCTargetDesc/AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -41,9 +41,10 @@ class AVRMCELFStreamer : public MCELFStreamer {
                       std::move(Emitter)),
         MCII(createAVRMCInstrInfo()) {}
 
-  void emitValueForModiferKind(
-      const MCSymbol *Sym, unsigned SizeInBytes, SMLoc Loc = SMLoc(),
-      AVRMCExpr::Specifier ModifierKind = AVRMCExpr::VK_AVR_NONE);
+  void
+  emitValueForModiferKind(const MCSymbol *Sym, unsigned SizeInBytes,
+                          SMLoc Loc = SMLoc(),
+                          AVRMCExpr::Specifier ModifierKind = AVR::S_AVR_NONE);
 };
 
 MCStreamer *createAVRELFStreamer(Triple const &TT, MCContext &Context,
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 5be799093d2c1..3067e854d8dc8 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -7,41 +7,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "AVRMCExpr.h"
+#include "MCTargetDesc/AVRMCAsmInfo.h"
 
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCValue.h"
 
 namespace llvm {
 
-namespace {
-
-const struct ModifierEntry {
-  const char *const Spelling;
-  AVRMCExpr::Specifier specifier;
-} ModifierNames[] = {
-    {"lo8", AVRMCExpr::VK_LO8},       {"hi8", AVRMCExpr::VK_HI8},
-    {"hh8", AVRMCExpr::VK_HH8}, // synonym with hlo8
-    {"hlo8", AVRMCExpr::VK_HH8},      {"hhi8", AVRMCExpr::VK_HHI8},
-
-    {"pm", AVRMCExpr::VK_PM},         {"pm_lo8", AVRMCExpr::VK_PM_LO8},
-    {"pm_hi8", AVRMCExpr::VK_PM_HI8}, {"pm_hh8", AVRMCExpr::VK_PM_HH8},
-
-    {"lo8_gs", AVRMCExpr::VK_LO8_GS}, {"hi8_gs", AVRMCExpr::VK_HI8_GS},
-    {"gs", AVRMCExpr::VK_GS},
-};
-
-} // end of anonymous namespace
-
 const AVRMCExpr *AVRMCExpr::create(Specifier Kind, const MCExpr *Expr,
                                    bool Negated, MCContext &Ctx) {
   return new (Ctx) AVRMCExpr(Kind, Expr, Negated);
 }
 
 void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  assert(specifier != VK_AVR_NONE);
+  assert(specifier != AVR::S_AVR_NONE);
   OS << getName() << '(';
   if (isNegated())
     OS << '-' << '(';
@@ -51,164 +32,4 @@ void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
   OS << ')';
 }
 
-bool AVRMCExpr::evaluateAsConstant(int64_t &Result) const {
-  MCValue Value;
-
-  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, nullptr);
-
-  if (!isRelocatable)
-    return false;
-
-  if (Value.isAbsolute()) {
-    Result = evaluateAsInt64(Value.getConstant());
-    return true;
-  }
-
-  return false;
-}
-
-bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
-                                          const MCAssembler *Asm) const {
-  MCValue Value;
-  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, Asm);
-  if (!isRelocatable)
-    return false;
-
-  if (Value.isAbsolute()) {
-    Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
-  } else {
-    if (!Asm || !Asm->hasLayout())
-      return false;
-
-    auto Spec = AVRMCExpr::VK_None;
-    if (Value.getSpecifier() != MCSymbolRefExpr::VK_None)
-      return false;
-    assert(!Value.getSubSym());
-    if (specifier == VK_PM)
-      Spec = AVRMCExpr::VK_PM;
-
-    // TODO: don't attach specifier to MCSymbolRefExpr.
-    Result =
-        MCValue::get(Value.getAddSym(), nullptr, Value.getConstant(), Spec);
-  }
-
-  return true;
-}
-
-int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
-  if (Negated)
-    Value *= -1;
-
-  switch (specifier) {
-  case AVRMCExpr::VK_LO8:
-    Value &= 0xff;
-    break;
-  case AVRMCExpr::VK_HI8:
-    Value &= 0xff00;
-    Value >>= 8;
-    break;
-  case AVRMCExpr::VK_HH8:
-    Value &= 0xff0000;
-    Value >>= 16;
-    break;
-  case AVRMCExpr::VK_HHI8:
-    Value &= 0xff000000;
-    Value >>= 24;
-    break;
-  case AVRMCExpr::VK_PM_LO8:
-  case AVRMCExpr::VK_LO8_GS:
-    Value >>= 1; // Program memory addresses must always be shifted by one.
-    Value &= 0xff;
-    break;
-  case AVRMCExpr::VK_PM_HI8:
-  case AVRMCExpr::VK_HI8_GS:
-    Value >>= 1; // Program memory addresses must always be shifted by one.
-    Value &= 0xff00;
-    Value >>= 8;
-    break;
-  case AVRMCExpr::VK_PM_HH8:
-    Value >>= 1; // Program memory addresses must always be shifted by one.
-    Value &= 0xff0000;
-    Value >>= 16;
-    break;
-  case AVRMCExpr::VK_PM:
-  case AVRMCExpr::VK_GS:
-    Value >>= 1; // Program memory addresses must always be shifted by one.
-    break;
-
-  case AVRMCExpr::VK_AVR_NONE:
-  default:
-    llvm_unreachable("Uninitialized expression.");
-  }
-  return static_cast<uint64_t>(Value) & 0xff;
-}
-
-AVR::Fixups AVRMCExpr::getFixupKind() const {
-  AVR::Fixups Kind = AVR::Fixups::LastTargetFixupKind;
-
-  switch (specifier) {
-  case VK_LO8:
-    Kind = isNegated() ? AVR::fixup_lo8_ldi_neg : AVR::fixup_lo8_ldi;
-    break;
-  case VK_HI8:
-    Kind = isNegated() ? AVR::fixup_hi8_ldi_neg : AVR::fixup_hi8_ldi;
-    break;
-  case VK_HH8:
-    Kind = isNegated() ? AVR::fixup_hh8_ldi_neg : AVR::fixup_hh8_ldi;
-    break;
-  case VK_HHI8:
-    Kind = isNegated() ? AVR::fixup_ms8_ldi_neg : AVR::fixup_ms8_ldi;
-    break;
-
-  case VK_PM_LO8:
-    Kind = isNegated() ? AVR::fixup_lo8_ldi_pm_neg : AVR::fixup_lo8_ldi_pm;
-    break;
-  case VK_PM_HI8:
-    Kind = isNegated() ? AVR::fixup_hi8_ldi_pm_neg : AVR::fixup_hi8_ldi_pm;
-    break;
-  case VK_PM_HH8:
-    Kind = isNegated() ? AVR::fixup_hh8_ldi_pm_neg : AVR::fixup_hh8_ldi_pm;
-    break;
-  case VK_PM:
-  case VK_GS:
-    Kind = AVR::fixup_16_pm;
-    break;
-  case VK_LO8_GS:
-    Kind = AVR::fixup_lo8_ldi_gs;
-    break;
-  case VK_HI8_GS:
-    Kind = AVR::fixup_hi8_ldi_gs;
-    break;
-
-  default:
-    llvm_unreachable("Uninitialized expression");
-  }
-
-  return Kind;
-}
-
-const char *AVRMCExpr::getName() const {
-  const auto &Modifier =
-      llvm::find_if(ModifierNames, [this](ModifierEntry const &Mod) {
-        return Mod.specifier == specifier;
-      });
-
-  if (Modifier != std::end(ModifierNames)) {
-    return Modifier->Spelling;
-  }
-  return nullptr;
-}
-
-AVRMCExpr::Specifier AVRMCExpr::parseSpecifier(StringRef Name) {
-  const auto &Modifier =
-      llvm::find_if(ModifierNames, [&Name](ModifierEntry const &Mod) {
-        return Mod.Spelling == Name;
-      });
-
-  if (Modifier != std::end(ModifierNames)) {
-    return Modifier->specifier;
-  }
-  return VK_AVR_NONE;
-}
-
-} // end of namespace llvm
+} // namespace llvm
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
index 69c60cde1f746..d72d36f108580 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -20,29 +20,6 @@ class AVRMCExpr : public MCSpecifierExpr {
 public:
   using Specifier = Spec;
   /// Specifies the type of an expression.
-  enum {
-    VK_None,
-
-    VK_AVR_NONE = MCSymbolRefExpr::FirstTargetSpecifier,
-
-    VK_HI8,  ///< Corresponds to `hi8()`.
-    VK_LO8,  ///< Corresponds to `lo8()`.
-    VK_HH8,  ///< Corresponds to `hlo8() and hh8()`.
-    VK_HHI8, ///< Corresponds to `hhi8()`.
-
-    VK_PM,     ///< Corresponds to `pm()`, reference to program memory.
-    VK_PM_LO8, ///< Corresponds to `pm_lo8()`.
-    VK_PM_HI8, ///< Corresponds to `pm_hi8()`.
-    VK_PM_HH8, ///< Corresponds to `pm_hh8()`.
-
-    VK_LO8_GS, ///< Corresponds to `lo8(gs())`.
-    VK_HI8_GS, ///< Corresponds to `hi8(gs())`.
-    VK_GS,     ///< Corresponds to `gs()`.
-
-    VK_DIFF8,
-    VK_DIFF16,
-    VK_DIFF32,
-  };
 
 public:
   /// Creates an AVR machine code expression.

From 25dcd231bfee1120c21b102e074542c54fb7c5c2 Mon Sep 17 00:00:00 2001
From: zGoldthorpe <Zach.Goldthorpe@amd.com>
Date: Mon, 16 Jun 2025 10:16:47 -0600
Subject: [PATCH 606/851] [IPO] Added attributor for identifying invariant
 loads (#141800)

The attributor conservatively marks pointers whose loads are eligible to
be marked as `!invariant.load`.
It does so by identifying:
1. Pointers marked `noalias` and `readonly`
2. Pointers whose underlying objects are all eligible for invariant
loads.

The attributor then manifests this attribute at non-atomic non-volatile
load instructions.
---
 llvm/include/llvm/Transforms/IPO/Attributor.h |  41 ++
 llvm/lib/Transforms/IPO/Attributor.cpp        |   2 +
 .../Transforms/IPO/AttributorAttributes.cpp   | 339 ++++++++++++++++
 .../Attributor/AMDGPU/tag-invariant-loads.ll  | 382 ++++++++++++++++++
 .../Attributor/dereferenceable-1.ll           |   1 -
 .../Attributor/value-simplify-local-remote.ll |  22 +-
 6 files changed, 772 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll

diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index e6eb756df987d..f19f3292c4798 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -6335,6 +6335,47 @@ struct AAUnderlyingObjects : AbstractAttribute {
                           AA::ValueScope Scope = AA::Interprocedural) const = 0;
 };
 
+/// An abstract interface for identifying pointers from which loads can be
+/// marked invariant.
+struct AAInvariantLoadPointer : public AbstractAttribute {
+  AAInvariantLoadPointer(const IRPosition &IRP) : AbstractAttribute(IRP) {}
+
+  /// See AbstractAttribute::isValidIRPositionForInit
+  static bool isValidIRPositionForInit(Attributor &A, const IRPosition &IRP) {
+    if (!IRP.getAssociatedType()->isPointerTy())
+      return false;
+
+    return AbstractAttribute::isValidIRPositionForInit(A, IRP);
+  }
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAInvariantLoadPointer &createForPosition(const IRPosition &IRP,
+                                                   Attributor &A);
+
+  /// Return true if the pointer's contents are known to remain invariant.
+  virtual bool isKnownInvariant() const = 0;
+  virtual bool isKnownLocallyInvariant() const = 0;
+
+  /// Return true if the pointer's contents are assumed to remain invariant.
+  virtual bool isAssumedInvariant() const = 0;
+  virtual bool isAssumedLocallyInvariant() const = 0;
+
+  /// See AbstractAttribute::getName().
+  StringRef getName() const override { return "AAInvariantLoadPointer"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAInvariantLoadPointer
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address).
+  static const char ID;
+};
+
 /// An abstract interface for address space information.
 struct AAAddressSpace : public StateWrapper<BooleanState, AbstractAttribute> {
   AAAddressSpace(const IRPosition &IRP, Attributor &A)
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index dac1f7a30c370..a2548258ddaf0 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -3612,6 +3612,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
       if (SimplifyAllLoads)
         getAssumedSimplified(IRPosition::value(I), nullptr,
                              UsedAssumedInformation, AA::Intraprocedural);
+      getOrCreateAAFor<AAInvariantLoadPointer>(
+          IRPosition::value(*LI->getPointerOperand()));
       getOrCreateAAFor<AAAddressSpace>(
           IRPosition::value(*LI->getPointerOperand()));
     } else {
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 3799a696f67af..5cb8f888354bd 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -191,6 +191,7 @@ PIPE_OPERATOR(AAInterFnReachability)
 PIPE_OPERATOR(AAPointerInfo)
 PIPE_OPERATOR(AAAssumptionInfo)
 PIPE_OPERATOR(AAUnderlyingObjects)
+PIPE_OPERATOR(AAInvariantLoadPointer)
 PIPE_OPERATOR(AAAddressSpace)
 PIPE_OPERATOR(AAAllocationInfo)
 PIPE_OPERATOR(AAIndirectCallInfo)
@@ -12533,6 +12534,342 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
 };
 } // namespace
 
+/// --------------------- Invariant Load Pointer -------------------------------
+namespace {
+
+struct AAInvariantLoadPointerImpl
+    : public StateWrapper<BitIntegerState<uint8_t, 15>,
+                          AAInvariantLoadPointer> {
+
+  enum {
+    // pointer does not alias within the bounds of the function
+    IS_NOALIAS = 1 << 0,
+    // pointer is not involved in any effectful instructions within the bounds
+    // of the function
+    IS_NOEFFECT = 1 << 1,
+    // loads are invariant within the bounds of the function
+    IS_LOCALLY_INVARIANT = 1 << 2,
+    // memory lifetime is constrained within the bounds of the function
+    IS_LOCALLY_CONSTRAINED = 1 << 3,
+
+    IS_BEST_STATE = IS_NOALIAS | IS_NOEFFECT | IS_LOCALLY_INVARIANT |
+                    IS_LOCALLY_CONSTRAINED,
+  };
+  static_assert(getBestState() == IS_BEST_STATE, "Unexpected best state");
+
+  using Base =
+      StateWrapper<BitIntegerState<uint8_t, 15>, AAInvariantLoadPointer>;
+
+  // the BitIntegerState is optimistic about IS_NOALIAS and IS_NOEFFECT, but
+  // pessimistic about IS_KNOWN_INVARIANT
+  AAInvariantLoadPointerImpl(const IRPosition &IRP, Attributor &A)
+      : Base(IRP) {}
+
+  bool isKnownInvariant() const final {
+    return isKnownLocallyInvariant() && isKnown(IS_LOCALLY_CONSTRAINED);
+  }
+
+  bool isKnownLocallyInvariant() const final {
+    if (isKnown(IS_LOCALLY_INVARIANT))
+      return true;
+    return isKnown(IS_NOALIAS | IS_NOEFFECT);
+  }
+
+  bool isAssumedInvariant() const final {
+    return isAssumedLocallyInvariant() && isAssumed(IS_LOCALLY_CONSTRAINED);
+  }
+
+  bool isAssumedLocallyInvariant() const final {
+    if (isAssumed(IS_LOCALLY_INVARIANT))
+      return true;
+    return isAssumed(IS_NOALIAS | IS_NOEFFECT);
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+    Changed |= updateNoAlias(A);
+    if (requiresNoAlias() && !isAssumed(IS_NOALIAS))
+      return indicatePessimisticFixpoint();
+
+    Changed |= updateNoEffect(A);
+
+    Changed |= updateLocalInvariance(A);
+
+    return Changed;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    if (!isKnownInvariant())
+      return ChangeStatus::UNCHANGED;
+
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+    const Value *Ptr = &getAssociatedValue();
+    const auto TagInvariantLoads = [&](const Use &U, bool &) {
+      if (U.get() != Ptr)
+        return true;
+      auto *I = dyn_cast<Instruction>(U.getUser());
+      if (!I)
+        return true;
+
+      // Ensure that we are only changing uses from the corresponding callgraph
+      // SSC in the case that the AA isn't run on the entire module
+      if (!A.isRunOn(I->getFunction()))
+        return true;
+
+      if (I->hasMetadata(LLVMContext::MD_invariant_load))
+        return true;
+
+      if (auto *LI = dyn_cast<LoadInst>(I)) {
+        LI->setMetadata(LLVMContext::MD_invariant_load,
+                        MDNode::get(LI->getContext(), {}));
+        Changed = ChangeStatus::CHANGED;
+      }
+      return true;
+    };
+
+    (void)A.checkForAllUses(TagInvariantLoads, *this, *Ptr);
+    return Changed;
+  }
+
+  /// See AbstractAttribute::getAsStr().
+  const std::string getAsStr(Attributor *) const override {
+    if (isKnownInvariant())
+      return "load-invariant pointer";
+    return "non-invariant pointer";
+  }
+
+  /// See AbstractAttribute::trackStatistics().
+  void trackStatistics() const override {}
+
+private:
+  /// Indicate that noalias is required for the pointer to be invariant.
+  bool requiresNoAlias() const {
+    switch (getPositionKind()) {
+    default:
+      // Conservatively default to require noalias.
+      return true;
+    case IRP_FLOAT:
+    case IRP_RETURNED:
+    case IRP_CALL_SITE:
+      return false;
+    case IRP_CALL_SITE_RETURNED: {
+      const auto &CB = cast<CallBase>(getAnchorValue());
+      return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+          &CB, /*MustPreserveNullness=*/false);
+    }
+    case IRP_ARGUMENT: {
+      const Function *F = getAssociatedFunction();
+      assert(F && "no associated function for argument");
+      return !isCallableCC(F->getCallingConv());
+    }
+    }
+  }
+
+  bool isExternal() const {
+    const Function *F = getAssociatedFunction();
+    if (!F)
+      return true;
+    return isCallableCC(F->getCallingConv()) &&
+           getPositionKind() != IRP_CALL_SITE_RETURNED;
+  }
+
+  ChangeStatus updateNoAlias(Attributor &A) {
+    if (isKnown(IS_NOALIAS) || !isAssumed(IS_NOALIAS))
+      return ChangeStatus::UNCHANGED;
+
+    // Try to use AANoAlias.
+    if (const auto *ANoAlias = A.getOrCreateAAFor<AANoAlias>(
+            getIRPosition(), this, DepClassTy::REQUIRED)) {
+      if (ANoAlias->isKnownNoAlias()) {
+        addKnownBits(IS_NOALIAS);
+        return ChangeStatus::CHANGED;
+      }
+
+      if (!ANoAlias->isAssumedNoAlias()) {
+        removeAssumedBits(IS_NOALIAS);
+        return ChangeStatus::CHANGED;
+      }
+
+      return ChangeStatus::UNCHANGED;
+    }
+
+    // Try to infer noalias from argument attribute, since it is applicable for
+    // the duration of the function.
+    if (const Argument *Arg = getAssociatedArgument()) {
+      if (Arg->hasNoAliasAttr()) {
+        addKnownBits(IS_NOALIAS);
+        return ChangeStatus::UNCHANGED;
+      }
+
+      // Noalias information is not provided, and cannot be inferred,
+      // so we conservatively assume the pointer aliases.
+      removeAssumedBits(IS_NOALIAS);
+      return ChangeStatus::CHANGED;
+    }
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus updateNoEffect(Attributor &A) {
+    if (isKnown(IS_NOEFFECT) || !isAssumed(IS_NOEFFECT))
+      return ChangeStatus::UNCHANGED;
+
+    if (!getAssociatedFunction())
+      return indicatePessimisticFixpoint();
+
+    const auto HasNoEffectLoads = [&](const Use &U, bool &) {
+      const auto *LI = dyn_cast<LoadInst>(U.getUser());
+      return !LI || !LI->mayHaveSideEffects();
+    };
+    if (!A.checkForAllUses(HasNoEffectLoads, *this, getAssociatedValue()))
+      return indicatePessimisticFixpoint();
+
+    // Try to use AAMemoryBehavior to infer readonly attribute.
+    if (const auto *AMemoryBehavior = A.getOrCreateAAFor<AAMemoryBehavior>(
+            getIRPosition(), this, DepClassTy::REQUIRED)) {
+      if (!AMemoryBehavior->isAssumedReadOnly())
+        return indicatePessimisticFixpoint();
+
+      if (AMemoryBehavior->isKnownReadOnly()) {
+        addKnownBits(IS_NOEFFECT);
+        return ChangeStatus::UNCHANGED;
+      }
+
+      return ChangeStatus::UNCHANGED;
+    }
+
+    if (const Argument *Arg = getAssociatedArgument()) {
+      if (Arg->onlyReadsMemory()) {
+        addKnownBits(IS_NOEFFECT);
+        return ChangeStatus::UNCHANGED;
+      }
+
+      // Readonly information is not provided, and cannot be inferred from
+      // AAMemoryBehavior.
+      return indicatePessimisticFixpoint();
+    }
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus updateLocalInvariance(Attributor &A) {
+    if (isKnown(IS_LOCALLY_INVARIANT) || !isAssumed(IS_LOCALLY_INVARIANT))
+      return ChangeStatus::UNCHANGED;
+
+    // try to infer invariance from underlying objects
+    const auto *AUO = A.getOrCreateAAFor<AAUnderlyingObjects>(
+        getIRPosition(), this, DepClassTy::REQUIRED);
+    if (!AUO)
+      return ChangeStatus::UNCHANGED;
+
+    bool UsedAssumedInformation = false;
+    const auto IsLocallyInvariantLoadIfPointer = [&](const Value &V) {
+      if (!V.getType()->isPointerTy())
+        return true;
+      const auto *IsInvariantLoadPointer =
+          A.getOrCreateAAFor<AAInvariantLoadPointer>(IRPosition::value(V), this,
+                                                     DepClassTy::REQUIRED);
+      // Conservatively fail if invariance cannot be inferred.
+      if (!IsInvariantLoadPointer)
+        return false;
+
+      if (IsInvariantLoadPointer->isKnownLocallyInvariant())
+        return true;
+      if (!IsInvariantLoadPointer->isAssumedLocallyInvariant())
+        return false;
+
+      UsedAssumedInformation = true;
+      return true;
+    };
+    if (!AUO->forallUnderlyingObjects(IsLocallyInvariantLoadIfPointer))
+      return indicatePessimisticFixpoint();
+
+    if (const auto *CB = dyn_cast<CallBase>(&getAnchorValue())) {
+      if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+              CB, /*MustPreserveNullness=*/false)) {
+        for (const Value *Arg : CB->args()) {
+          if (!IsLocallyInvariantLoadIfPointer(*Arg))
+            return indicatePessimisticFixpoint();
+        }
+      }
+    }
+
+    if (!UsedAssumedInformation) {
+      // Pointer is known and not just assumed to be locally invariant.
+      addKnownBits(IS_LOCALLY_INVARIANT);
+      return ChangeStatus::CHANGED;
+    }
+
+    return ChangeStatus::UNCHANGED;
+  }
+};
+
+struct AAInvariantLoadPointerFloating final : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerFloating(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+};
+
+struct AAInvariantLoadPointerReturned final : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerReturned(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+
+  void initialize(Attributor &) override {
+    removeAssumedBits(IS_LOCALLY_CONSTRAINED);
+  }
+};
+
+struct AAInvariantLoadPointerCallSiteReturned final
+    : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerCallSiteReturned(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+
+  void initialize(Attributor &A) override {
+    const Function *F = getAssociatedFunction();
+    assert(F && "no associated function for return from call");
+
+    if (!F->isDeclaration() && !F->isIntrinsic())
+      return AAInvariantLoadPointerImpl::initialize(A);
+
+    const auto &CB = cast<CallBase>(getAnchorValue());
+    if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
+            &CB, /*MustPreserveNullness=*/false))
+      return AAInvariantLoadPointerImpl::initialize(A);
+
+    if (F->onlyReadsMemory() && F->hasNoSync())
+      return AAInvariantLoadPointerImpl::initialize(A);
+
+    // At this point, the function is opaque, so we conservatively assume
+    // non-invariance.
+    indicatePessimisticFixpoint();
+  }
+};
+
+struct AAInvariantLoadPointerArgument final : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerArgument(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+
+  void initialize(Attributor &) override {
+    const Function *F = getAssociatedFunction();
+    assert(F && "no associated function for argument");
+
+    if (!isCallableCC(F->getCallingConv())) {
+      addKnownBits(IS_LOCALLY_CONSTRAINED);
+      return;
+    }
+
+    if (!F->hasLocalLinkage())
+      removeAssumedBits(IS_LOCALLY_CONSTRAINED);
+  }
+};
+
+struct AAInvariantLoadPointerCallSiteArgument final
+    : AAInvariantLoadPointerImpl {
+  AAInvariantLoadPointerCallSiteArgument(const IRPosition &IRP, Attributor &A)
+      : AAInvariantLoadPointerImpl(IRP, A) {}
+};
+} // namespace
+
 /// ------------------------ Address Space  ------------------------------------
 namespace {
 
@@ -13038,6 +13375,7 @@ const char AAInterFnReachability::ID = 0;
 const char AAPointerInfo::ID = 0;
 const char AAAssumptionInfo::ID = 0;
 const char AAUnderlyingObjects::ID = 0;
+const char AAInvariantLoadPointer::ID = 0;
 const char AAAddressSpace::ID = 0;
 const char AAAllocationInfo::ID = 0;
 const char AAIndirectCallInfo::ID = 0;
@@ -13172,6 +13510,7 @@ CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFPClass)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPointerInfo)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAInvariantLoadPointer)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAddressSpace)
 CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAllocationInfo)
 
diff --git a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
new file mode 100644
index 0000000000000..ace68a19bf41f
--- /dev/null
+++ b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll
@@ -0,0 +1,382 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=attributor %s -S | FileCheck %s --check-prefix=AMDGCN
+
+@G = addrspace(1) global i32 zeroinitializer, align 4
+declare void @clobber(i32) #0
+declare ptr addrspace(1) @get_ptr() #0
+declare noalias ptr addrspace(1) @get_noalias_ptr() #0
+declare noalias ptr addrspace(1) @get_untouched_ptr() #1
+
+define void @test_nonkernel(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define void @test_nonkernel(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6:[0-9]+]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be !invariant.load, as the caller may modify %ptr
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_plain(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_plain(
+; AMDGCN-SAME: ptr addrspace(1) nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be !invariant.load, as %ptr may alias a pointer in @clobber
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_noalias_ptr(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_noalias_ptr(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0:![0-9]+]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_gep(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_gep(
+; AMDGCN-SAME: ptr addrspace(1) nofree readonly align 4 captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+  %val = load i32, ptr addrspace(1) %gep, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_noalias_gep(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_noalias_gep(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly align 4 captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[GEP]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+  %val = load i32, ptr addrspace(1) %gep, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_swap(ptr addrspace(1) noalias %ptr, i32 inreg %swap) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_swap(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]], i32 inreg [[SWAP:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    store i32 [[SWAP]], ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; cannot be !invariant.load due to the write to %ptr
+  store i32 %swap, ptr addrspace(1) %ptr, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_volatile(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_volatile(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef align 4 [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load volatile i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load volatile i32, ptr addrspace(1) %ptr, align 4
+  ;; volatiles loads cannot be !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_unordered(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_unordered(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] unordered, align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load atomic i32, ptr addrspace(1) %ptr unordered, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_monotonic(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_monotonic(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[PTR]] monotonic, align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load atomic i32, ptr addrspace(1) %ptr monotonic, align 4
+  ;; atomic loads with ordering guarantees may have side effects
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_global() {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_global(
+; AMDGCN-SAME: ) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) @G, align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = load i32, ptr addrspace(1) @G, align 4
+  ;; is not an !invariant.load as global variables may change
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define internal i32 @test_internal_noalias_load(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define internal i32 @test_internal_noalias_load(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    ret i32 [[VAL]]
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; is an !invariant.load due to its only caller @test_call_internal_noalias
+  ret i32 %val
+}
+
+define amdgpu_kernel void @test_call_internal_noalias(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal_noalias(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_noalias_load(ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7:[0-9]+]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = call i32 @test_internal_noalias_load(ptr addrspace(1) %ptr)
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define internal i32 @test_internal_load(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define internal i32 @test_internal_load(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    ret i32 [[VAL]]
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be an !invariant.load since the pointer in @test_call_internal may alias
+  ret i32 %val
+}
+
+define amdgpu_kernel void @test_call_internal(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal(
+; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_load(ptr addrspace(1) nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = call i32 @test_internal_load(ptr addrspace(1) %ptr)
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define internal i32 @test_internal_written(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define internal i32 @test_internal_written(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) dereferenceable_or_null(4) [[PTR:%.*]]) #[[ATTR4]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    ret i32 [[VAL]]
+;
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; cannot be an !invariant.load because of the write in caller @test_call_internal_written
+  ret i32 %val
+}
+
+define amdgpu_kernel void @test_call_internal_written(ptr addrspace(1) noalias %ptr, i32 inreg %x) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_internal_written(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree captures(none) [[PTR:%.*]], i32 inreg [[X:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[VAL:%.*]] = call i32 @test_internal_written(ptr addrspace(1) noalias nofree noundef readonly align 4 captures(none) [[PTR]]) #[[ATTR7]]
+; AMDGCN-NEXT:    store i32 [[X]], ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %val = call i32 @test_internal_written(ptr addrspace(1) %ptr)
+  store i32 %x, ptr addrspace(1) %ptr
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_call_ptr() {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_ptr(
+; AMDGCN-SAME: ) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = call align 4 ptr addrspace(1) @get_ptr() #[[ATTR6]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(1) @get_ptr()
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be an !invariant.load since %ptr may alias
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_call_noalias_ptr() {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_noalias_ptr(
+; AMDGCN-SAME: ) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = call align 4 ptr addrspace(1) @get_noalias_ptr() #[[ATTR6]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(1) @get_noalias_ptr()
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; may not be an !invariant.load since %ptr may have been written to before returning
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_call_untouched_ptr() {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_call_untouched_ptr(
+; AMDGCN-SAME: ) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = call noalias align 4 ptr addrspace(1) @get_untouched_ptr() #[[ATTR8:[0-9]+]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = call ptr addrspace(1) @get_untouched_ptr()
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_make_buffer(ptr addrspace(1) %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer(
+; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR9:[0-9]+]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0)
+  %val = load i32, ptr addrspace(7) %rsrc, align 4
+  ;; original %ptr may alias
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_make_buffer_noalias(ptr addrspace(1) noalias %ptr) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer_noalias(
+; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR9]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0)
+  %val = load i32, ptr addrspace(7) %rsrc, align 4
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_selected_load(i1 inreg %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) noalias %ptr.false) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_selected_load(
+; AMDGCN-SAME: i1 inreg [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PTR_TRUE]], ptr addrspace(1) [[PTR_FALSE]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = select i1 %cond, ptr addrspace(1) %ptr.true, ptr addrspace(1) %ptr.false
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; either pointer yields an !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_selected_load_partial_noalias(i1 inreg %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) %ptr.false) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_selected_load_partial_noalias(
+; AMDGCN-SAME: i1 inreg [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:    [[PTR:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PTR_TRUE]], ptr addrspace(1) [[PTR_FALSE]]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+  %ptr = select i1 %cond, ptr addrspace(1) %ptr.true, ptr addrspace(1) %ptr.false
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; %ptr.false may alias, so no !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_branch_load(i1 %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) noalias %ptr.false) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_branch_load(
+; AMDGCN-SAME: i1 noundef [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:  [[ENTRY:.*:]]
+; AMDGCN-NEXT:    br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; AMDGCN:       [[TRUE]]:
+; AMDGCN-NEXT:    call void @clobber(i32 noundef 1) #[[ATTR6]]
+; AMDGCN-NEXT:    br label %[[FINISH:.*]]
+; AMDGCN:       [[FALSE]]:
+; AMDGCN-NEXT:    br label %[[FINISH]]
+; AMDGCN:       [[FINISH]]:
+; AMDGCN-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ [[PTR_TRUE]], %[[TRUE]] ], [ [[PTR_FALSE]], %[[FALSE]] ]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4, !invariant.load [[META0]]
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %true, label %false
+true:
+  call void @clobber(i32 1)
+  br label %finish
+false:
+  br label %finish
+finish:
+  %ptr = phi ptr addrspace(1) [ %ptr.true, %true ], [ %ptr.false, %false ]
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; either pointer yields an !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+define amdgpu_kernel void @test_branch_load_partial_noalias(i1 %cond, ptr addrspace(1) noalias %ptr.true, ptr addrspace(1) %ptr.false) {
+; AMDGCN-LABEL: define amdgpu_kernel void @test_branch_load_partial_noalias(
+; AMDGCN-SAME: i1 noundef [[COND:%.*]], ptr addrspace(1) noalias nofree readonly captures(none) [[PTR_TRUE:%.*]], ptr addrspace(1) nofree readonly captures(none) [[PTR_FALSE:%.*]]) #[[ATTR2]] {
+; AMDGCN-NEXT:  [[ENTRY:.*:]]
+; AMDGCN-NEXT:    br i1 [[COND]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; AMDGCN:       [[TRUE]]:
+; AMDGCN-NEXT:    call void @clobber(i32 noundef 1) #[[ATTR6]]
+; AMDGCN-NEXT:    br label %[[FINISH:.*]]
+; AMDGCN:       [[FALSE]]:
+; AMDGCN-NEXT:    br label %[[FINISH]]
+; AMDGCN:       [[FINISH]]:
+; AMDGCN-NEXT:    [[PTR:%.*]] = phi ptr addrspace(1) [ [[PTR_TRUE]], %[[TRUE]] ], [ [[PTR_FALSE]], %[[FALSE]] ]
+; AMDGCN-NEXT:    [[VAL:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4
+; AMDGCN-NEXT:    call void @clobber(i32 [[VAL]]) #[[ATTR6]]
+; AMDGCN-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %true, label %false
+true:
+  call void @clobber(i32 1)
+  br label %finish
+false:
+  br label %finish
+finish:
+  %ptr = phi ptr addrspace(1) [ %ptr.true, %true ], [ %ptr.false, %false ]
+  %val = load i32, ptr addrspace(1) %ptr, align 4
+  ;; ptr.false may alias, so no !invariant.load
+  call void @clobber(i32 %val)
+  ret void
+}
+
+attributes #0 = { nofree norecurse nosync nounwind willreturn }
+attributes #1 = { nofree norecurse nosync nounwind willreturn readonly }
+;.
+; AMDGCN: [[META0]] = !{}
+;.
diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
index 07e2d5ea15752..5bff2a2e6b208 100644
--- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll
+++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
@@ -207,7 +207,6 @@ define void @f7_1(ptr %ptr, i1 %cnd) {
 ; CHECK-LABEL: define {{[^@]+}}@f7_1
 ; CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], i1 noundef [[CND:%.*]]) #[[ATTR2]] {
 ; CHECK-NEXT:    [[A:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
-; CHECK-NEXT:    [[PTR_0:%.*]] = load i32, ptr [[PTR]], align 4
 ; CHECK-NEXT:    [[B:%.*]] = tail call i32 @unkown_f(ptr noundef nonnull align 4 dereferenceable(4) [[PTR]]) #[[ATTR1]]
 ; CHECK-NEXT:    br i1 [[CND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       if.true:
diff --git a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
index 374d5ba7ff52b..4767244800d21 100644
--- a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
+++ b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll
@@ -135,7 +135,7 @@ define internal %S @foo.1(ptr %foo.this) {
 ; TUNIT-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[FOO_THIS]], ptr [[FOO_THIS]], align 8
 ; TUNIT-NEXT:    call void @bar.2(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[FOO_THIS]]) #[[ATTR5:[0-9]+]]
-; TUNIT-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; TUNIT-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8:![0-9]+]]
 ; TUNIT-NEXT:    ret [[S]] [[FOO_RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
@@ -145,7 +145,7 @@ define internal %S @foo.1(ptr %foo.this) {
 ; CGSCC-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[FOO_THIS]], ptr [[FOO_THIS]], align 8
 ; CGSCC-NEXT:    call void @bar.2(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[FOO_THIS]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; CGSCC-NEXT:    [[FOO_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8:![0-9]+]]
 ; CGSCC-NEXT:    ret [[S]] [[FOO_RET]]
 ;
 entry:
@@ -234,7 +234,7 @@ define internal %S @bar.5(ptr %this) {
 ; TUNIT-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; TUNIT-NEXT:    call void @baz.6(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) [[RETVAL]], ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR4]]
-; TUNIT-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; TUNIT-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
 ; TUNIT-NEXT:    ret [[S]] [[BAR_RET]]
 ;
 ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite)
@@ -244,7 +244,7 @@ define internal %S @bar.5(ptr %this) {
 ; CGSCC-NEXT:    [[RETVAL:%.*]] = alloca [[S:%.*]], i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; CGSCC-NEXT:    call void @baz.6(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull align 8 dereferenceable(8) [[THIS]]) #[[ATTR9:[0-9]+]]
-; CGSCC-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; CGSCC-NEXT:    [[BAR_RET:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
 ; CGSCC-NEXT:    ret [[S]] [[BAR_RET]]
 ;
 entry:
@@ -286,7 +286,7 @@ define internal void @boom(ptr %this, ptr %data) {
 ; TUNIT-NEXT:  entry:
 ; TUNIT-NEXT:    [[DATA_ADDR:%.*]] = alloca ptr, i32 0, align 8
 ; TUNIT-NEXT:    store ptr [[DATA]], ptr [[DATA_ADDR]], align 8
-; TUNIT-NEXT:    [[V:%.*]] = load ptr, ptr [[DATA_ADDR]], align 8
+; TUNIT-NEXT:    [[V:%.*]] = load ptr, ptr [[DATA_ADDR]], align 8, !invariant.load [[META8]]
 ; TUNIT-NEXT:    store ptr [[V]], ptr [[THIS]], align 8
 ; TUNIT-NEXT:    ret void
 ;
@@ -342,14 +342,6 @@ define %S.2 @t3.helper() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[S_2:%.*]], align 8
 ; CHECK-NEXT:    call void @ext1(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]])
-; CHECK-NEXT:    [[DOTFCA_0_LOAD:%.*]] = load ptr, ptr [[RETVAL]], align 8
-; CHECK-NEXT:    [[DOTFCA_0_INSERT:%.*]] = insertvalue [[S_2]] poison, ptr [[DOTFCA_0_LOAD]], 0
-; CHECK-NEXT:    [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [[S_2]], ptr [[RETVAL]], i32 0, i32 1
-; CHECK-NEXT:    [[DOTFCA_1_LOAD:%.*]] = load i64, ptr [[DOTFCA_1_GEP]], align 8
-; CHECK-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [[S_2]] [[DOTFCA_0_INSERT]], i64 [[DOTFCA_1_LOAD]], 1
-; CHECK-NEXT:    [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [[S_2]], ptr [[RETVAL]], i32 0, i32 2
-; CHECK-NEXT:    [[DOTFCA_2_LOAD:%.*]] = load i64, ptr [[DOTFCA_2_GEP]], align 8
-; CHECK-NEXT:    [[DOTFCA_2_INSERT:%.*]] = insertvalue [[S_2]] [[DOTFCA_1_INSERT]], i64 [[DOTFCA_2_LOAD]], 2
 ; CHECK-NEXT:    ret [[S_2]] zeroinitializer
 ;
 entry:
@@ -508,7 +500,7 @@ define internal %S @t4a(ptr %this) {
 ; CGSCC-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, i32 0, align 8
 ; CGSCC-NEXT:    store ptr [[THIS]], ptr [[THIS]], align 8
 ; CGSCC-NEXT:    call void @t4b(ptr noalias nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[RETVAL]], ptr nofree noundef nonnull writeonly align 8 dereferenceable(8) [[THIS]]) #[[ATTR6]]
-; CGSCC-NEXT:    [[TMP0:%.*]] = load [[S]], ptr [[RETVAL]], align 8
+; CGSCC-NEXT:    [[TMP0:%.*]] = load [[S]], ptr [[RETVAL]], align 8, !invariant.load [[META8]]
 ; CGSCC-NEXT:    ret [[S]] [[TMP0]]
 ;
 entry:
@@ -623,6 +615,7 @@ entry:
 ; TUNIT: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
 ; TUNIT: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2}
 ; TUNIT: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; TUNIT: [[META8]] = !{}
 ;.
 ; CGSCC: [[META0:![0-9]+]] = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 5]}
 ; CGSCC: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
@@ -632,4 +625,5 @@ entry:
 ; CGSCC: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
 ; CGSCC: [[META6:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 2}
 ; CGSCC: [[META7:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3}
+; CGSCC: [[META8]] = !{}
 ;.

From 1bd4f9719faac77f368a7bdfdb47ead56a808375 Mon Sep 17 00:00:00 2001
From: Leandro Lupori <leandro.lupori@linaro.org>
Date: Mon, 16 Jun 2025 13:20:30 -0300
Subject: [PATCH 607/851] [flang][OpenMP] Put taskgroup in a new scope
 (#144122)

Although taskgroup is a privatizing construct, because of
task_reduction clause, a new scope was not being created for it.
This could cause an extra privatization of variables when
taskgroup was lowered, because its scope would be the same as of
the parent privatizing construct.

This fixes regressions in tests 1052_0201 and 1052_0205, from
Fujitsu testsuite.

This issue didn't happen before because implicit symbols were
being created in a different way before #142154.
---
 flang/lib/Semantics/resolve-names.cpp        |  1 -
 flang/test/Lower/OpenMP/implicit-dsa.f90     | 23 ++++++++------
 flang/test/Lower/OpenMP/taskgroup02.f90      | 32 ++++++++++++++++++++
 flang/test/Semantics/OpenMP/implicit-dsa.f90 |  6 ++--
 4 files changed, 49 insertions(+), 13 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/taskgroup02.f90

diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index e23e91b674a73..f66918e5c140e 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -1729,7 +1729,6 @@ bool OmpVisitor::NeedsScope(const parser::OpenMPBlockConstruct &x) {
   switch (beginDir.v) {
   case llvm::omp::Directive::OMPD_master:
   case llvm::omp::Directive::OMPD_ordered:
-  case llvm::omp::Directive::OMPD_taskgroup:
     return false;
   default:
     return true;
diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90
index f0f149bb415b0..0d2db63edfe79 100644
--- a/flang/test/Lower/OpenMP/implicit-dsa.f90
+++ b/flang/test/Lower/OpenMP/implicit-dsa.f90
@@ -5,6 +5,14 @@
 
 ! Privatizers
 
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = firstprivate} @[[TEST7_Y_FIRSTPRIV:.*]] : i32
+! CHECK-SAME:  copy {
+
+! CHECK-LABEL: omp.private
+! CHECK-SAME:      {type = firstprivate} @[[TEST7_X_FIRSTPRIV:.*]] : i32
+! CHECK-SAME:  copy {
+
 ! CHECK-LABEL: omp.private
 ! CHECK-SAME:      {type = private} @[[TEST6_Y_PRIV:.*]] : i32
 ! CHECK-NOT:   copy {
@@ -277,22 +285,19 @@ subroutine implicit_dsa_test6
   !$omp end task
 end subroutine
 
-! Test taskgroup - it uses the same scope as task.
+! Test taskgroup.
 !CHECK-LABEL: func @_QPimplicit_dsa_test7
 !CHECK:       %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test7Ex"}
 !CHECK:       %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:       %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFimplicit_dsa_test7Ey"}
 !CHECK:       %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK:       omp.task {
+!CHECK:       omp.task private(@[[TEST7_X_FIRSTPRIV]] %[[X_DECL]]#0 -> %[[PRIV_X:[^,]*]],
+!CHECK-SAME:      @[[TEST7_Y_FIRSTPRIV]] %[[Y_DECL]]#0 -> %[[PRIV_Y:.*]] : !fir.ref<i32>, !fir.ref<i32>) {
+!CHECK:         %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test7Ex"}
+!CHECK:         %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"}
 !CHECK:         omp.taskgroup {
-!CHECK-NEXT:      %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test7Ex"}
-!CHECK-NEXT:      %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test7Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK-NEXT:      %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK-NEXT:      %[[TEMP:.*]] = fir.load %[[PRIV_Y_DECL]]#0 : !fir.ref<i32>
 !CHECK-NEXT:      hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref<i32>
-!CHECK-NEXT:      %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test7Ey"}
-!CHECK-NEXT:      %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test7Ey"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-!CHECK-NEXT:      %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
-!CHECK-NEXT:      hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 : i32, !fir.ref<i32>
 !CHECK:         }
 !CHECK:       }
 subroutine implicit_dsa_test7
diff --git a/flang/test/Lower/OpenMP/taskgroup02.f90 b/flang/test/Lower/OpenMP/taskgroup02.f90
new file mode 100644
index 0000000000000..1e996a030c23a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/taskgroup02.f90
@@ -0,0 +1,32 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! Check that variables are not privatized twice when TASKGROUP is used.
+
+!CHECK-LABEL: func.func @_QPsub() {
+!CHECK:         omp.parallel {
+!CHECK:           %[[PAR_I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsubEi"}
+!CHECK:           omp.master {
+!CHECK:             omp.taskgroup {
+!CHECK-NEXT:          omp.task private(@_QFsubEi_firstprivate_i32 %[[PAR_I]]#0 -> %[[TASK_I:.*]] : !fir.ref<i32>) {
+!CHECK:                 %[[TASK_I_DECL:.*]]:2 = hlfir.declare %[[TASK_I]] {uniq_name = "_QFsubEi"}
+!CHECK:               }
+!CHECK:             }
+!CHECK:           }
+!CHECK:         }
+
+subroutine sub()
+  integer, dimension(10) :: a
+  integer :: i
+
+  !$omp parallel
+    !$omp master
+      do i=1,10
+       !$omp taskgroup
+         !$omp task shared(a)
+           a(i) = 1
+         !$omp end task
+       !$omp end taskgroup
+      end do
+    !$omp end master
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90
index 4a07e256e2bb6..1ee777d6b9723 100644
--- a/flang/test/Semantics/OpenMP/implicit-dsa.f90
+++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90
@@ -141,7 +141,7 @@ subroutine implicit_dsa_test6
   !$omp end task
 end subroutine
 
-! Test taskgroup - it uses the same scope as task.
+! Test taskgroup.
 !DEF: /implicit_dsa_test7 (Subroutine) Subprogram
 subroutine implicit_dsa_test7
   !DEF: /implicit_dsa_test7/x ObjectEntity INTEGER(4)
@@ -150,8 +150,8 @@ subroutine implicit_dsa_test7
 
   !$omp task
     !$omp taskgroup
-      !DEF: /implicit_dsa_test7/OtherConstruct1/x (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
-      !DEF: /implicit_dsa_test7/OtherConstruct1/y (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4)
+      !DEF: /implicit_dsa_test7/OtherConstruct1/OtherConstruct1/x HostAssoc INTEGER(4)
+      !DEF: /implicit_dsa_test7/OtherConstruct1/OtherConstruct1/y HostAssoc INTEGER(4)
       x = y
     !$omp end taskgroup
   !$omp end task

From 22d9ea1b636d2c72a24fb0a8ce5216d609164635 Mon Sep 17 00:00:00 2001
From: Igor Wodiany <igor.wodiany@imgtec.com>
Date: Mon, 16 Jun 2025 17:41:52 +0100
Subject: [PATCH 608/851] [mlir][spirv] Add definition for GL Length (#144041)

A canonicalization pattern from `spirv.GL.Length` to `spirv.GL.FAbs` for scalar operands is also added.
---
 .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td       | 40 +++++++++++
 .../Dialect/SPIRV/IR/SPIRVCanonicalization.td |  8 +++
 .../SPIRV/IR/SPIRVGLCanonicalization.cpp      |  4 +-
 mlir/test/Dialect/SPIRV/IR/gl-ops.mlir        | 66 +++++++++++++++++++
 .../SPIRV/Transforms/gl-canonicalize.mlir     | 22 +++++++
 mlir/test/Target/SPIRV/gl-ops.mlir            |  4 ++
 6 files changed, 142 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
index 2ec61758ba8ef..8c4da9b2dce18 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td
@@ -1160,6 +1160,46 @@ def SPIRV_GLFMixOp :
 
 // -----
 
+def SPIRV_GLLengthOp : SPIRV_GLOp<"Length", 66, [
+    Pure,
+    TypesMatchWith<"result type must match operand element type",
+                  "operand", "result",
+                  "::mlir::getElementTypeOrSelf($_self)">
+  ]> {
+  let summary = "Return the length of a vector x";
+
+  let description = [{
+    Result is the length of vector x, i.e., sqrt(x[0]**2 + x[1]**2 + ...).
+
+    The operand x must be a scalar or vector whose component type is floating-point.
+
+    Result Type must be a scalar of the same type as the component type of x.
+
+    #### Example:
+
+    ```mlir
+    %2 = spirv.GL.Length %0 : vector<3xf32> -> f32
+    %3 = spirv.GL.Length %1 : f32 -> f32
+    ```
+  }];
+
+  let arguments = (ins
+    SPIRV_ScalarOrVectorOf<SPIRV_Float>:$operand
+  );
+
+  let results = (outs
+    SPIRV_Float:$result
+  );
+
+  let assemblyFormat = [{
+    $operand attr-dict `:` type($operand) `->` type($result)
+  }];
+
+  let hasVerifier = 0;
+}
+
+// -----
+
 def SPIRV_GLDistanceOp : SPIRV_GLOp<"Distance", 67, [
     Pure,
     AllTypesMatch<["p0", "p1"]>,
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td
index e8d2274d29aa0..39fbab8f37a2e 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td
@@ -75,3 +75,11 @@ def ConvertComparisonIntoClamp2_#CmpClampPair[0] : Pat<
         )),
     (CmpClampPair[1] $input, $min, $max)>;
 }
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Length -> spirv.GL.FAbs
+//===----------------------------------------------------------------------===//
+
+def ConvertGLLengthToGLFAbs : Pat<
+    (SPIRV_GLLengthOp SPIRV_Float:$operand),
+    (SPIRV_GLFAbsOp $operand)>;
diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp
index 3ad8057a58dc9..46acb8c156fc6 100644
--- a/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVGLCanonicalization.cpp
@@ -34,8 +34,8 @@ void populateSPIRVGLCanonicalizationPatterns(RewritePatternSet &results) {
               ConvertComparisonIntoClamp2_SPIRV_SLessThanOp,
               ConvertComparisonIntoClamp2_SPIRV_SLessThanEqualOp,
               ConvertComparisonIntoClamp2_SPIRV_ULessThanOp,
-              ConvertComparisonIntoClamp2_SPIRV_ULessThanEqualOp>(
-      results.getContext());
+              ConvertComparisonIntoClamp2_SPIRV_ULessThanEqualOp,
+              ConvertGLLengthToGLFAbs>(results.getContext());
 }
 } // namespace spirv
 } // namespace mlir
diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
index 642346cc40b0d..5c5d94c40e573 100644
--- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir
@@ -1000,3 +1000,69 @@ func.func @unpack_half_2x16_scalar_out(%arg0 : i32) -> () {
   %0 = spirv.GL.UnpackHalf2x16 %arg0 : i32 -> f32
   return
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Length
+//===----------------------------------------------------------------------===//
+
+func.func @length(%arg0 : f32) -> () {
+  // CHECK: spirv.GL.Length {{%.*}} : f32 -> f32
+  %0 = spirv.GL.Length %arg0 : f32 -> f32
+  return
+}
+
+func.func @lengthvec(%arg0 : vector<3xf32>) -> () {
+  // CHECK: spirv.GL.Length {{%.*}} : vector<3xf32> -> f32
+  %0 = spirv.GL.Length %arg0 : vector<3xf32> -> f32
+  return
+}
+
+// -----
+
+func.func @length_i32_in(%arg0 : i32) -> () {
+  // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'i32'}}
+  %0 = spirv.GL.Length %arg0 : i32 -> f32
+  return
+}
+
+// -----
+
+func.func @length_f16_in(%arg0 : f16) -> () {
+  // expected-error @+1 {{op failed to verify that result type must match operand element type}}
+  %0 = spirv.GL.Length %arg0 : f16 -> f32
+  return
+}
+
+// -----
+
+func.func @length_i32vec_in(%arg0 : vector<3xi32>) -> () {
+  // expected-error @+1 {{op operand #0 must be 16/32/64-bit float or vector of 16/32/64-bit float values of length 2/3/4/8/16, but got 'vector<3xi32>'}}
+  %0 = spirv.GL.Length %arg0 : vector<3xi32> -> f32
+  return
+}
+
+// -----
+
+func.func @length_f16vec_in(%arg0 : vector<3xf16>) -> () {
+  // expected-error @+1 {{op failed to verify that result type must match operand element type}}
+  %0 = spirv.GL.Length %arg0 : vector<3xf16> -> f32
+  return
+}
+
+// -----
+
+func.func @length_i32_out(%arg0 : vector<3xf32>) -> () {
+  // expected-error @+1 {{op result #0 must be 16/32/64-bit float, but got 'i32'}}
+  %0 = spirv.GL.Length %arg0 : vector<3xf32> -> i32
+  return
+}
+
+// -----
+
+func.func @length_vec_out(%arg0 : vector<3xf32>) -> () {
+  // expected-error @+1 {{op result #0 must be 16/32/64-bit float, but got 'vector<3xf32>'}}
+  %0 = spirv.GL.Length %arg0 : vector<3xf32> -> vector<3xf32>
+  return
+}
diff --git a/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir
index c1447b38f0a48..33b877667512e 100644
--- a/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir
+++ b/mlir/test/Dialect/SPIRV/Transforms/gl-canonicalize.mlir
@@ -177,3 +177,25 @@ func.func @clamp_ulessthanequal(%input: i32, %min: i32, %max: i32) -> i32 {
   // CHECK-NEXT: spirv.ReturnValue [[RES]]
   spirv.ReturnValue %2 : i32
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// spirv.GL.Length
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: @convert_length_into_fabs_scalar
+func.func @convert_length_into_fabs_scalar(%arg0 : f32) -> f32 {
+  //CHECK: spirv.GL.FAbs {{%.*}} : f32
+  //CHECK-NOT: spirv.GL.Length
+  %0 = spirv.GL.Length %arg0 : f32 -> f32
+  spirv.ReturnValue %0 : f32
+}
+
+// CHECK-LABEL: @dont_convert_length_into_fabs_vec
+func.func @dont_convert_length_into_fabs_vec(%arg0 : vector<3xf32>) -> f32 {
+  //CHECK: spirv.GL.Length {{%.*}} : vector<3xf32> -> f32
+  //CHECK-NOT: spirv.GL.FAbs
+  %0 = spirv.GL.Length %arg0 : vector<3xf32> -> f32
+  spirv.ReturnValue %0 : f32
+}
diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir
index eacf36bfba9ce..832f7ea2fe314 100644
--- a/mlir/test/Target/SPIRV/gl-ops.mlir
+++ b/mlir/test/Target/SPIRV/gl-ops.mlir
@@ -128,6 +128,10 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %8 = spirv.GL.FindSMsb %arg3 : vector<3xi32>
     // CHECK: {{%.*}} = spirv.GL.FindUMsb {{%.*}} : vector<3xi32>
     %9 = spirv.GL.FindUMsb %arg3 : vector<3xi32>
+    // CHECK: {{%.*}} = spirv.GL.Length {{%.*}} : f32 -> f32
+    %10 = spirv.GL.Length %arg0 : f32 -> f32
+    // CHECK: {{%.*}} = spirv.GL.Length {{%.*}} : vector<3xf32> -> f32
+    %11 = spirv.GL.Length %arg1 : vector<3xf32> -> f32
     spirv.Return
   }
 

From 8bbef3d1c9115b3c64365e9b8e4ee84275a4d001 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com>
Date: Mon, 16 Jun 2025 12:46:54 -0400
Subject: [PATCH 609/851] [InstCombine] Iterative replacement in PtrReplacer
 (#137215)

This patch enhances the PtrReplacer as follows:
1. Users are now collected iteratively to be generous on the stack. In
the case of PHIs with incoming values which have not yet been visited,
they are pushed back into the stack for reconsideration.
2. Replace users of the pointer root in a reverse-postorder traversal,
instead of a simple traversal over the collected users. This reordering
ensures that the operands of an instruction are replaced before
replacing the instruction itself.
3. During the replacement of PHI, use the same incoming value if it does
not have a replacement.

This patch specifically fixes the case when an incoming value of a PHI
is addrspacecasted.
---
 .../InstCombineLoadStoreAlloca.cpp            | 165 ++++++++++--------
 .../InstCombine/AMDGPU/ptr-replace-alloca.ll  |  79 +++++++++
 2 files changed, 175 insertions(+), 69 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index a9751ab03e20e..9aec90120d8b0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -243,11 +243,10 @@ class PointerReplacer {
   void replacePointer(Value *V);
 
 private:
-  bool collectUsersRecursive(Instruction &I);
   void replace(Instruction *I);
-  Value *getReplacement(Value *I);
+  Value *getReplacement(Value *V) const { return WorkMap.lookup(V); }
   bool isAvailable(Instruction *I) const {
-    return I == &Root || Worklist.contains(I);
+    return I == &Root || UsersToReplace.contains(I);
   }
 
   bool isEqualOrValidAddrSpaceCast(const Instruction *I,
@@ -259,8 +258,7 @@ class PointerReplacer {
     return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS);
   }
 
-  SmallPtrSet<Instruction *, 32> ValuesToRevisit;
-  SmallSetVector<Instruction *, 4> Worklist;
+  SmallSetVector<Instruction *, 32> UsersToReplace;
   MapVector<Value *, Value *> WorkMap;
   InstCombinerImpl &IC;
   Instruction &Root;
@@ -269,72 +267,79 @@ class PointerReplacer {
 } // end anonymous namespace
 
 bool PointerReplacer::collectUsers() {
-  if (!collectUsersRecursive(Root))
-    return false;
-
-  // Ensure that all outstanding (indirect) users of I
-  // are inserted into the Worklist. Return false
-  // otherwise.
-  return llvm::set_is_subset(ValuesToRevisit, Worklist);
-}
+  SmallVector<Instruction *> Worklist;
+  SmallSetVector<Instruction *, 32> ValuesToRevisit;
+
+  auto PushUsersToWorklist = [&](Instruction *Inst) {
+    for (auto *U : Inst->users())
+      if (auto *I = dyn_cast<Instruction>(U))
+        if (!isAvailable(I) && !ValuesToRevisit.contains(I))
+          Worklist.emplace_back(I);
+  };
 
-bool PointerReplacer::collectUsersRecursive(Instruction &I) {
-  for (auto *U : I.users()) {
-    auto *Inst = cast<Instruction>(&*U);
+  PushUsersToWorklist(&Root);
+  while (!Worklist.empty()) {
+    Instruction *Inst = Worklist.pop_back_val();
     if (auto *Load = dyn_cast<LoadInst>(Inst)) {
       if (Load->isVolatile())
         return false;
-      Worklist.insert(Load);
+      UsersToReplace.insert(Load);
     } else if (auto *PHI = dyn_cast<PHINode>(Inst)) {
-      // All incoming values must be instructions for replacability
-      if (any_of(PHI->incoming_values(),
-                 [](Value *V) { return !isa<Instruction>(V); }))
-        return false;
-
-      // If at least one incoming value of the PHI is not in Worklist,
-      // store the PHI for revisiting and skip this iteration of the
-      // loop.
-      if (any_of(PHI->incoming_values(), [this](Value *V) {
-            return !isAvailable(cast<Instruction>(V));
+      /// TODO: Handle poison and null pointers for PHI and select.
+      // If all incoming values are available, mark this PHI as
+      // replacable and push it's users into the worklist.
+      bool IsReplacable = true;
+      if (all_of(PHI->incoming_values(), [&](Value *V) {
+            if (!isa<Instruction>(V))
+              return IsReplacable = false;
+            return isAvailable(cast<Instruction>(V));
           })) {
-        ValuesToRevisit.insert(Inst);
+        UsersToReplace.insert(PHI);
+        PushUsersToWorklist(PHI);
         continue;
       }
 
-      Worklist.insert(PHI);
-      if (!collectUsersRecursive(*PHI))
-        return false;
-    } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
-      if (!isa<Instruction>(SI->getTrueValue()) ||
-          !isa<Instruction>(SI->getFalseValue()))
+      // Either an incoming value is not an instruction or not all
+      // incoming values are available. If this PHI was already
+      // visited prior to this iteration, return false.
+      if (!IsReplacable || !ValuesToRevisit.insert(PHI))
         return false;
 
-      if (!isAvailable(cast<Instruction>(SI->getTrueValue())) ||
-          !isAvailable(cast<Instruction>(SI->getFalseValue()))) {
-        ValuesToRevisit.insert(Inst);
-        continue;
+      // Push PHI back into the stack, followed by unavailable
+      // incoming values.
+      Worklist.emplace_back(PHI);
+      for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) {
+        auto *IncomingValue = cast<Instruction>(PHI->getIncomingValue(Idx));
+        if (UsersToReplace.contains(IncomingValue))
+          continue;
+        if (!ValuesToRevisit.insert(IncomingValue))
+          return false;
+        Worklist.emplace_back(IncomingValue);
       }
-      Worklist.insert(SI);
-      if (!collectUsersRecursive(*SI))
-        return false;
-    } else if (isa<GetElementPtrInst>(Inst)) {
-      Worklist.insert(Inst);
-      if (!collectUsersRecursive(*Inst))
+    } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
+      auto *TrueInst = dyn_cast<Instruction>(SI->getTrueValue());
+      auto *FalseInst = dyn_cast<Instruction>(SI->getFalseValue());
+      if (!TrueInst || !FalseInst)
         return false;
+
+      UsersToReplace.insert(SI);
+      PushUsersToWorklist(SI);
+    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+      UsersToReplace.insert(GEP);
+      PushUsersToWorklist(GEP);
     } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
       if (MI->isVolatile())
         return false;
-      Worklist.insert(Inst);
+      UsersToReplace.insert(Inst);
     } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) {
-      Worklist.insert(Inst);
-      if (!collectUsersRecursive(*Inst))
-        return false;
+      UsersToReplace.insert(Inst);
+      PushUsersToWorklist(Inst);
     } else if (Inst->isLifetimeStartOrEnd()) {
       continue;
     } else {
       // TODO: For arbitrary uses with address space mismatches, should we check
       // if we can introduce a valid addrspacecast?
-      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
+      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *Inst << '\n');
       return false;
     }
   }
@@ -342,7 +347,39 @@ bool PointerReplacer::collectUsersRecursive(Instruction &I) {
   return true;
 }
 
-Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
+void PointerReplacer::replacePointer(Value *V) {
+  assert(cast<PointerType>(Root.getType()) != cast<PointerType>(V->getType()) &&
+         "Invalid usage");
+  WorkMap[&Root] = V;
+  SmallVector<Instruction *> Worklist;
+  SetVector<Instruction *> PostOrderWorklist;
+  SmallPtrSet<Instruction *, 32> Visited;
+
+  // Perform a postorder traversal of the users of Root.
+  Worklist.push_back(&Root);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.back();
+
+    // If I has not been processed before, push each of its
+    // replacable users into the worklist.
+    if (Visited.insert(I).second) {
+      for (auto *U : I->users()) {
+        auto *UserInst = cast<Instruction>(U);
+        if (UsersToReplace.contains(UserInst))
+          Worklist.push_back(UserInst);
+      }
+      // Otherwise, users of I have already been pushed into
+      // the PostOrderWorklist. Push I as well.
+    } else {
+      PostOrderWorklist.insert(I);
+      Worklist.pop_back();
+    }
+  }
+
+  // Replace pointers in reverse-postorder.
+  for (Instruction *I : reverse(PostOrderWorklist))
+    replace(I);
+}
 
 void PointerReplacer::replace(Instruction *I) {
   if (getReplacement(I))
@@ -364,13 +401,15 @@ void PointerReplacer::replace(Instruction *I) {
     // replacement (new value).
     WorkMap[NewI] = NewI;
   } else if (auto *PHI = dyn_cast<PHINode>(I)) {
-    Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType();
-    auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(),
-                                   PHI->getName(), PHI->getIterator());
-    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I)
-      NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)),
-                          PHI->getIncomingBlock(I));
-    WorkMap[PHI] = NewPHI;
+    // Create a new PHI by replacing any incoming value that is a user of the
+    // root pointer and has a replacement.
+    Value *V = WorkMap.lookup(PHI->getIncomingValue(0));
+    PHI->mutateType(V ? V->getType() : PHI->getIncomingValue(0)->getType());
+    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I) {
+      Value *V = WorkMap.lookup(PHI->getIncomingValue(I));
+      PHI->setIncomingValue(I, V ? V : PHI->getIncomingValue(I));
+    }
+    WorkMap[PHI] = PHI;
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
     auto *V = getReplacement(GEP->getPointerOperand());
     assert(V && "Operand not replaced");
@@ -434,18 +473,6 @@ void PointerReplacer::replace(Instruction *I) {
   }
 }
 
-void PointerReplacer::replacePointer(Value *V) {
-#ifndef NDEBUG
-  auto *PT = cast<PointerType>(Root.getType());
-  auto *NT = cast<PointerType>(V->getType());
-  assert(PT != NT && "Invalid usage");
-#endif
-  WorkMap[&Root] = V;
-
-  for (Instruction *Workitem : Worklist)
-    replace(Workitem);
-}
-
 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
   if (auto *I = simplifyAllocaArraySize(*this, AI, DT))
     return I;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
new file mode 100644
index 0000000000000..538cc19f9722e
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S < %s | FileCheck %s
+
+%struct.type = type { [256 x <2 x i64>] }
+@g1 = external hidden addrspace(3) global %struct.type, align 16
+
+; This test requires the PtrReplacer to replace users in an RPO traversal.
+; Furthermore, %ptr.else need not to be replaced so it must be retained in
+; %ptr.sink.
+define <2 x i64> @func(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
+; CHECK-LABEL: define <2 x i64> @func(
+; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 [[CMP_0]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    [[VAL_THEN:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+; CHECK-NEXT:    br label %[[SINK:.*]]
+; CHECK:       [[IF_ELSE]]:
+; CHECK-NEXT:    [[PTR_ELSE:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+; CHECK-NEXT:    br label %[[SINK]]
+; CHECK:       [[SINK]]:
+; CHECK-NEXT:    [[PTR_SINK:%.*]] = phi ptr [ [[PTR_ELSE]], %[[IF_ELSE]] ], [ [[VAL_THEN]], %[[IF_THEN]] ]
+; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_SINK]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
+;
+entry:
+  %coerce = alloca %struct.type, align 16, addrspace(5)
+  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
+  br i1 %cmp.0, label %if.then, label %if.else
+
+if.then:                                    ; preds = %entry
+  %ptr.then = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
+  %val.then = addrspacecast ptr addrspace(5) %ptr.then to ptr
+  br label %sink
+
+if.else:                                      ; preds = %entry
+  %ptr.else = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+  %val.else = getelementptr inbounds nuw i8, ptr %ptr.else, i64 0
+  br label %sink
+
+sink:
+  %ptr.sink = phi ptr [ %val.else, %if.else ], [ %val.then, %if.then ]
+  %val.sink = load <2 x i64>, ptr %ptr.sink, align 16
+  ret <2 x i64> %val.sink
+}
+
+define <2 x i64> @func_phi_loop(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
+; CHECK-LABEL: define <2 x i64> @func_phi_loop(
+; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[VAL_0:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[PTR_PHI_R:%.*]] = phi ptr [ [[PTR_1:%.*]], %[[LOOP]] ], [ [[VAL_0]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[PTR_1]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+; CHECK-NEXT:    br i1 [[CMP_0]], label %[[LOOP]], label %[[SINK:.*]]
+; CHECK:       [[SINK]]:
+; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_PHI_R]], align 16
+; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
+;
+entry:
+  %coerce = alloca %struct.type, align 16, addrspace(5)
+  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
+  %ptr.0 = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
+  %val.0 = addrspacecast ptr addrspace(5) %ptr.0 to ptr
+  br label %loop
+
+loop:
+  %ptr.phi = phi ptr [ %val.1, %loop ], [ %val.0, %entry ]
+  %ptr.1 = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
+  %val.1 = getelementptr inbounds nuw i8, ptr %ptr.1, i64 0
+  br i1 %cmp.0, label %loop, label %sink
+
+sink:
+  %val.sink = load <2 x i64>, ptr %ptr.phi, align 16
+  ret <2 x i64> %val.sink
+}
+
+declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0

From d1dc080a858ca47c314334fb14f1ecb605fb4371 Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Mon, 16 Jun 2025 17:53:34 +0100
Subject: [PATCH 610/851] [lldb-dap] show function name in the instruction
 comment. (#144070)

putting the function name is the dissassembly instruction messes up the
alignment making it less readable. put it instead with the comment.

This also aligns the opcodes and instruction to the left matching the
cli
---
 .../Handler/DisassembleRequestHandler.cpp     | 34 ++++++++-----------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp
index d5878d18289d6..85214b84b5c9c 100644
--- a/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/DisassembleRequestHandler.cpp
@@ -100,7 +100,7 @@ static DisassembledInstruction ConvertSBInstructionToDisassembledInstruction(
 
   const char *m = inst.GetMnemonic(target);
   const char *o = inst.GetOperands(target);
-  const char *c = inst.GetComment(target);
+  std::string c = inst.GetComment(target);
   auto d = inst.GetData(target);
 
   std::string bytes;
@@ -114,34 +114,30 @@ static DisassembledInstruction ConvertSBInstructionToDisassembledInstruction(
 
   DisassembledInstruction disassembled_inst;
   disassembled_inst.address = inst_addr;
-  disassembled_inst.instructionBytes =
-      bytes.size() > 0 ? bytes.substr(0, bytes.size() - 1) : "";
 
-  std::string instruction;
-  llvm::raw_string_ostream si(instruction);
+  if (!bytes.empty()) // remove last whitespace
+    bytes.pop_back();
+  disassembled_inst.instructionBytes = std::move(bytes);
+
+  llvm::raw_string_ostream si(disassembled_inst.instruction);
+  si << llvm::formatv("{0,-7} {1,-25}", m, o);
 
-  lldb::SBSymbol symbol = addr.GetSymbol();
   // Only add the symbol on the first line of the function.
-  if (symbol.IsValid() && symbol.GetStartAddress() == addr) {
-    // If we have a valid symbol, append it as a label prefix for the first
-    // instruction. This is so you can see the start of a function/callsite
-    // in the assembly, at the moment VS Code (1.80) does not visualize the
-    // symbol associated with the assembly instruction.
-    si << (symbol.GetMangledName() != nullptr ? symbol.GetMangledName()
-                                              : symbol.GetName())
-       << ": ";
+  // in the comment section
+  if (lldb::SBSymbol symbol = addr.GetSymbol();
+      symbol.GetStartAddress() == addr) {
+    const llvm::StringRef sym_display_name = symbol.GetDisplayName();
+    c.append(" ");
+    c.append(sym_display_name);
 
     if (resolve_symbols)
-      disassembled_inst.symbol = symbol.GetDisplayName();
+      disassembled_inst.symbol = sym_display_name;
   }
 
-  si << llvm::formatv("{0,7} {1,12}", m, o);
-  if (c && c[0]) {
+  if (!c.empty()) {
     si << " ; " << c;
   }
 
-  disassembled_inst.instruction = std::move(instruction);
-
   protocol::Source source = CreateSource(addr, target);
   lldb::SBLineEntry line_entry = GetLineEntryForAddress(target, addr);
 

From 20a1b357c0ff3c3f71de45bae42cb2dead7b66c9 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 09:57:10 -0700
Subject: [PATCH 611/851] AArch64: Move AArch64MCExpr functions to
 AArch64MCAsmInfo

To migrate away from the legacy
XXXMCExpr::printImpl/evaluateAsRelocatableImpl overrides and align with
other targets.

While the AArch64MCAsmInfoXXX hooks introduce some duplication, they
enable better separation for object file formats.

Note: While AArch64MCAsmInfoDarwin uses the `@specifier` notation, it
might use AArch64MCExpr with specifier VK_ABS.
test/tools/llvm-mca/AArch64/Exynos/zero-latency-move.s abuses a parser
behavior that :lo12: is also parsed for Mach-O (though it will fail for
-filetype=obj).
---
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 137 ++++++++++++++++++
 .../AArch64/MCTargetDesc/AArch64MCAsmInfo.h   |  23 +++
 .../AArch64/MCTargetDesc/AArch64MCExpr.cpp    |  97 +------------
 .../AArch64/MCTargetDesc/AArch64MCExpr.h      |  14 +-
 .../AArch64WinCOFFObjectWriter.cpp            |   6 +-
 5 files changed, 165 insertions(+), 112 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 31965d85d9eb4..b2cd1d0f4156e 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -15,6 +15,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"
 using namespace llvm;
@@ -53,6 +54,80 @@ const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = {
     {AArch64MCExpr::M_TLVPPAGEOFF, "TLVPPAGEOFF"},
 };
 
+StringRef AArch64::getSpecifierName(const MCSpecifierExpr &Expr) {
+  // clang-format off
+  switch (static_cast<uint32_t>(Expr.getSpecifier())) {
+  case AArch64MCExpr::VK_CALL:                return "";
+  case AArch64MCExpr::VK_LO12:                return ":lo12:";
+  case AArch64MCExpr::VK_ABS_G3:              return ":abs_g3:";
+  case AArch64MCExpr::VK_ABS_G2:              return ":abs_g2:";
+  case AArch64MCExpr::VK_ABS_G2_S:            return ":abs_g2_s:";
+  case AArch64MCExpr::VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case AArch64MCExpr::VK_ABS_G1:              return ":abs_g1:";
+  case AArch64MCExpr::VK_ABS_G1_S:            return ":abs_g1_s:";
+  case AArch64MCExpr::VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case AArch64MCExpr::VK_ABS_G0:              return ":abs_g0:";
+  case AArch64MCExpr::VK_ABS_G0_S:            return ":abs_g0_s:";
+  case AArch64MCExpr::VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case AArch64MCExpr::VK_PREL_G3:             return ":prel_g3:";
+  case AArch64MCExpr::VK_PREL_G2:             return ":prel_g2:";
+  case AArch64MCExpr::VK_PREL_G2_NC:          return ":prel_g2_nc:";
+  case AArch64MCExpr::VK_PREL_G1:             return ":prel_g1:";
+  case AArch64MCExpr::VK_PREL_G1_NC:          return ":prel_g1_nc:";
+  case AArch64MCExpr::VK_PREL_G0:             return ":prel_g0:";
+  case AArch64MCExpr::VK_PREL_G0_NC:          return ":prel_g0_nc:";
+  case AArch64MCExpr::VK_DTPREL_G2:           return ":dtprel_g2:";
+  case AArch64MCExpr::VK_DTPREL_G1:           return ":dtprel_g1:";
+  case AArch64MCExpr::VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case AArch64MCExpr::VK_DTPREL_G0:           return ":dtprel_g0:";
+  case AArch64MCExpr::VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case AArch64MCExpr::VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case AArch64MCExpr::VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case AArch64MCExpr::VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case AArch64MCExpr::VK_TPREL_G2:            return ":tprel_g2:";
+  case AArch64MCExpr::VK_TPREL_G1:            return ":tprel_g1:";
+  case AArch64MCExpr::VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case AArch64MCExpr::VK_TPREL_G0:            return ":tprel_g0:";
+  case AArch64MCExpr::VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case AArch64MCExpr::VK_TPREL_HI12:          return ":tprel_hi12:";
+  case AArch64MCExpr::VK_TPREL_LO12:          return ":tprel_lo12:";
+  case AArch64MCExpr::VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case AArch64MCExpr::VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case AArch64MCExpr::VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
+  case AArch64MCExpr::VK_ABS_PAGE:            return "";
+  case AArch64MCExpr::VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
+  case AArch64MCExpr::VK_GOT:                 return ":got:";
+  case AArch64MCExpr::VK_GOT_PAGE:            return ":got:";
+  case AArch64MCExpr::VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
+  case AArch64MCExpr::VK_GOT_LO12:            return ":got_lo12:";
+  case AArch64MCExpr::VK_GOTTPREL:            return ":gottprel:";
+  case AArch64MCExpr::VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case AArch64MCExpr::VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case AArch64MCExpr::VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case AArch64MCExpr::VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case AArch64MCExpr::VK_TLSDESC:             return "";
+  case AArch64MCExpr::VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  case AArch64MCExpr::VK_TLSDESC_AUTH:        return "";
+  case AArch64MCExpr::VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
+  case AArch64MCExpr::VK_SECREL_LO12:         return ":secrel_lo12:";
+  case AArch64MCExpr::VK_SECREL_HI12:         return ":secrel_hi12:";
+  case AArch64MCExpr::VK_GOT_AUTH:            return ":got_auth:";
+  case AArch64MCExpr::VK_GOT_AUTH_PAGE:       return ":got_auth:";
+  case AArch64MCExpr::VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
+  default:
+    llvm_unreachable("Invalid relocation specifier");
+  }
+  // clang-format on
+}
+
+static bool evaluate(const MCSpecifierExpr &Expr, MCValue &Res,
+                     const MCAssembler *Asm) {
+  if (!Expr.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+  Res.setSpecifier(Expr.getSpecifier());
+  return true;
+}
+
 AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) {
   // We prefer NEON instructions to be printed in the short, Apple-specific
   // form when targeting Darwin.
@@ -91,6 +166,33 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
   return MCBinaryExpr::createSub(Res, PC, Context);
 }
 
+void AArch64AuthMCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
+  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
+  if (WrapSubExprInParens)
+    OS << '(';
+  getSubExpr()->print(OS, MAI);
+  if (WrapSubExprInParens)
+    OS << ')';
+
+  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
+  if (hasAddressDiversity())
+    OS << ",addr";
+  OS << ')';
+}
+
+void AArch64MCAsmInfoDarwin::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
+    return AE->print(OS, this);
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoDarwin::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   if (T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
@@ -127,6 +229,19 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
   initializeVariantKinds(ELFAtSpecifiers);
 }
 
+void AArch64MCAsmInfoELF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  if (auto *AE = dyn_cast<AArch64AuthMCExpr>(&Expr))
+    return AE->print(OS, this);
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoELF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -146,6 +261,17 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() {
   initializeVariantKinds(COFFAtSpecifiers);
 }
 
+void AArch64MCAsmInfoMicrosoftCOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoMicrosoftCOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
+
 AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
@@ -164,3 +290,14 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() {
 
   initializeVariantKinds(COFFAtSpecifiers);
 }
+
+void AArch64MCAsmInfoGNUCOFF::printSpecifierExpr(
+    raw_ostream &OS, const MCSpecifierExpr &Expr) const {
+  OS << AArch64::getSpecifierName(Expr);
+  printExpr(OS, *Expr.getSubExpr());
+}
+
+bool AArch64MCAsmInfoGNUCOFF::evaluateAsRelocatableImpl(
+    const MCSpecifierExpr &Expr, MCValue &Res, const MCAssembler *Asm) const {
+  return evaluate(Expr, Res, Asm);
+}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 225e0c8e55fca..bc02586d73884 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
 
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
 #include "llvm/MC/MCAsmInfoELF.h"
@@ -26,20 +27,42 @@ struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
   const MCExpr *
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
   explicit AArch64MCAsmInfoELF(const Triple &T);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoMicrosoftCOFF : public MCAsmInfoMicrosoft {
   explicit AArch64MCAsmInfoMicrosoftCOFF();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 struct AArch64MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
   explicit AArch64MCAsmInfoGNUCOFF();
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
+namespace AArch64 {
+/// Return the string representation of the ELF relocation specifier
+/// (e.g. ":got:", ":lo12:").
+StringRef getSpecifierName(const MCSpecifierExpr &Expr);
+} // namespace AArch64
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index d934af91b9ff5..7a7c6f7effd9f 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -12,100 +12,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MCExpr.h"
+#include "AArch64MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
-#define DEBUG_TYPE "aarch64symbolrefexpr"
-
 const AArch64MCExpr *AArch64MCExpr::create(const MCExpr *Expr, Specifier S,
                                            MCContext &Ctx) {
   return new (Ctx) AArch64MCExpr(Expr, S);
 }
 
-StringRef AArch64MCExpr::getSpecifierName() const {
-  // clang-format off
-  switch (static_cast<uint32_t>(getSpecifier())) {
-  case VK_CALL:                return "";
-  case VK_LO12:                return ":lo12:";
-  case VK_ABS_G3:              return ":abs_g3:";
-  case VK_ABS_G2:              return ":abs_g2:";
-  case VK_ABS_G2_S:            return ":abs_g2_s:";
-  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
-  case VK_ABS_G1:              return ":abs_g1:";
-  case VK_ABS_G1_S:            return ":abs_g1_s:";
-  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
-  case VK_ABS_G0:              return ":abs_g0:";
-  case VK_ABS_G0_S:            return ":abs_g0_s:";
-  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
-  case VK_PREL_G3:             return ":prel_g3:";
-  case VK_PREL_G2:             return ":prel_g2:";
-  case VK_PREL_G2_NC:          return ":prel_g2_nc:";
-  case VK_PREL_G1:             return ":prel_g1:";
-  case VK_PREL_G1_NC:          return ":prel_g1_nc:";
-  case VK_PREL_G0:             return ":prel_g0:";
-  case VK_PREL_G0_NC:          return ":prel_g0_nc:";
-  case VK_DTPREL_G2:           return ":dtprel_g2:";
-  case VK_DTPREL_G1:           return ":dtprel_g1:";
-  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
-  case VK_DTPREL_G0:           return ":dtprel_g0:";
-  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
-  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
-  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
-  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
-  case VK_TPREL_G2:            return ":tprel_g2:";
-  case VK_TPREL_G1:            return ":tprel_g1:";
-  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
-  case VK_TPREL_G0:            return ":tprel_g0:";
-  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
-  case VK_TPREL_HI12:          return ":tprel_hi12:";
-  case VK_TPREL_LO12:          return ":tprel_lo12:";
-  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
-  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
-  case VK_TLSDESC_AUTH_LO12:   return ":tlsdesc_auth_lo12:";
-  case VK_ABS_PAGE:            return "";
-  case VK_ABS_PAGE_NC:         return ":pg_hi21_nc:";
-  case VK_GOT:                 return ":got:";
-  case VK_GOT_PAGE:            return ":got:";
-  case VK_GOT_PAGE_LO15:       return ":gotpage_lo15:";
-  case VK_GOT_LO12:            return ":got_lo12:";
-  case VK_GOTTPREL:            return ":gottprel:";
-  case VK_GOTTPREL_PAGE:       return ":gottprel:";
-  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
-  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
-  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
-  case VK_TLSDESC:             return "";
-  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
-  case VK_TLSDESC_AUTH:        return "";
-  case VK_TLSDESC_AUTH_PAGE:   return ":tlsdesc_auth:";
-  case VK_SECREL_LO12:         return ":secrel_lo12:";
-  case VK_SECREL_HI12:         return ":secrel_hi12:";
-  case VK_GOT_AUTH:            return ":got_auth:";
-  case VK_GOT_AUTH_PAGE:       return ":got_auth:";
-  case VK_GOT_AUTH_LO12:       return ":got_auth_lo12:";
-  default:
-    llvm_unreachable("Invalid relocation specifier");
-  }
-  // clang-format on
-}
-
-void AArch64MCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  OS << getSpecifierName();
-  Expr->print(OS, MAI);
-}
-
-bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                              const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-  Res.setSpecifier(getSpecifier());
-  return true;
-}
-
 const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
                                                    uint16_t Discriminator,
                                                    AArch64PACKey::ID Key,
@@ -114,17 +33,3 @@ const AArch64AuthMCExpr *AArch64AuthMCExpr::create(const MCExpr *Expr,
   return new (Ctx)
       AArch64AuthMCExpr(Expr, Discriminator, Key, HasAddressDiversity);
 }
-
-void AArch64AuthMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  bool WrapSubExprInParens = !isa<MCSymbolRefExpr>(getSubExpr());
-  if (WrapSubExprInParens)
-    OS << '(';
-  getSubExpr()->print(OS, MAI);
-  if (WrapSubExprInParens)
-    OS << ')';
-
-  OS << "@AUTH(" << AArch64PACKeyIDToString(Key) << ',' << Discriminator;
-  if (hasAddressDiversity())
-    OS << ",addr";
-  OS << ')';
-}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 9c383894c7f54..541f24c943a15 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -147,8 +147,6 @@ class AArch64MCExpr : public MCSpecifierExpr {
 public:
   static const AArch64MCExpr *create(const MCExpr *Expr, Specifier,
                                      MCContext &Ctx);
-  /// @name VariantKind information extractors.
-  /// @{
 
   static Specifier getSymbolLoc(Specifier S) {
     return static_cast<Specifier>(S & VK_SymLocBits);
@@ -159,16 +157,6 @@ class AArch64MCExpr : public MCSpecifierExpr {
   }
 
   static bool isNotChecked(Specifier S) { return S & VK_NC; }
-
-  /// @}
-
-  /// Return the string representation of the ELF relocation specifier
-  /// (e.g. ":got:", ":lo12:").
-  StringRef getSpecifierName() const;
-
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
 };
 
 class AArch64AuthMCExpr final : public AArch64MCExpr {
@@ -189,7 +177,7 @@ class AArch64AuthMCExpr final : public AArch64MCExpr {
   uint16_t getDiscriminator() const { return Discriminator; }
   bool hasAddressDiversity() const { return getSpecifier() == VK_AUTHADDR; }
 
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+  void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
 
   static bool classof(const MCExpr *E) {
     return isa<AArch64MCExpr>(E) && classof(cast<AArch64MCExpr>(E));
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
index 3009bd2ca2758..2e997631655ed 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -8,7 +8,7 @@
 
 #include "AArch64MCTargetDesc.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCExpr.h"
+#include "MCTargetDesc/AArch64MCAsmInfo.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/BinaryFormat/COFF.h"
 #include "llvm/MC/MCAsmBackend.h"
@@ -73,7 +73,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
       break;
     default:
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          A64E->getSpecifierName() +
+                                          AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
       return COFF::IMAGE_REL_ARM64_ABSOLUTE; // Dummy return value
     }
@@ -83,7 +83,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType(
   default: {
     if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(Expr)) {
       Ctx.reportError(Fixup.getLoc(), "relocation specifier " +
-                                          A64E->getSpecifierName() +
+                                          AArch64::getSpecifierName(*A64E) +
                                           " unsupported on COFF targets");
     } else {
       MCFixupKindInfo Info = MAB.getFixupKindInfo(Fixup.getKind());

From a733c6c7bb1c533ec28c96c49d3c5de7babd8b7f Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 16 Jun 2025 10:04:28 -0700
Subject: [PATCH 612/851] [TargetLowering][RISCV] Allow scalable non-simple
 EVTs to be split even if the element type isn't a legal scalar type.
 (#144007)

This fixes an inconsistency in i64 vector handling between RV32 and
RV64. Even if i64 isn't legal as a scalar, we should still be able
to split a large i64 vector to get down to a legal vector type. We only
need to give up if we need to split a vscale x 1 vector.
---
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |    2 +-
 .../Analysis/CostModel/RISCV/cast-half.ll     |   16 +-
 llvm/test/Analysis/CostModel/RISCV/cast.ll    | 6262 ++++++-----------
 llvm/test/Analysis/CostModel/RISCV/cmp.ll     |  490 +-
 llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll    |  798 +++
 5 files changed, 3059 insertions(+), 4509 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 935afaf9dd550..b1afdc2a3ac39 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1030,7 +1030,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
     // If type is to be expanded, split the vector.
     //  <4 x i140> -> <2 x i140>
     if (LK.first == TypeExpandInteger) {
-      if (VT.getVectorElementCount().isScalable())
+      if (NumElts.isScalable() && NumElts.getKnownMinValue() == 1)
         return LegalizeKind(TypeScalarizeScalableVector, EltVT);
       return LegalizeKind(TypeSplitVector,
                           VT.getHalfNumVectorElementsVT(Context));
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll
index 244c42cc94ba0..971b14467c0f8 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast-half.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast-half.ll
@@ -74,7 +74,7 @@ define void @fptosi() {
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
-; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -147,7 +147,7 @@ define void @fptosi() {
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %nxv64f16_nxv64i8 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %nxv64f16_nxv64i32 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i32>
-; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %nxv64f16_nxv64i1 = fptosi <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -440,7 +440,7 @@ define void @fptoui() {
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
-; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -513,7 +513,7 @@ define void @fptoui() {
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %nxv64f16_nxv64i8 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i8>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64f16_nxv64i16 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i16>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %nxv64f16_nxv64i32 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i32>
-; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
+; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64i64 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i64>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %nxv64f16_nxv64i1 = fptoui <vscale x 64 x half> undef to <vscale x 64 x i1>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -806,7 +806,7 @@ define void @sitofp() {
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
-; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -879,7 +879,7 @@ define void @sitofp() {
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %nxv64i8_nxv64f16 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %nxv64i32_nxv64f16 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
-; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %nxv64i1_nxv64f16 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -1172,7 +1172,7 @@ define void @uitofp() {
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64f16 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64f16 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
-; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i1_nxv64f16 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -1245,7 +1245,7 @@ define void @uitofp() {
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %nxv64i8_nxv64f16 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv64i16_nxv64f16 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %nxv64i32_nxv64f16 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x half>
-; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
+; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64f16 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 99 for instruction: %nxv64i1_nxv64f16 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x half>
 ; RV32ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
index e498ccc733040..bdd8540a2c475 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -3,651 +3,328 @@
 ; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64
 
 define void @sext() {
-; RV32-LABEL: 'sext'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = sext <vscale x 1 x i16> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext <vscale x 1 x i16> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext <vscale x 1 x i32> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext <vscale x 1 x i1> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext <vscale x 1 x i1> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext <vscale x 1 x i1> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'sext'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = sext <vscale x 1 x i16> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext <vscale x 1 x i16> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext <vscale x 1 x i32> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext <vscale x 1 x i1> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext <vscale x 1 x i1> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext <vscale x 1 x i1> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'sext'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = sext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = sext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = sext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = sext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = sext <2 x i1> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = sext <2 x i1> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = sext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = sext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = sext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = sext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = sext <4 x i1> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = sext <4 x i1> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = sext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = sext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = sext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = sext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = sext <8 x i1> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = sext <8 x i1> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = sext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = sext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = sext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = sext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = sext <16 x i1> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = sext <16 x i1> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = sext <32 x i8> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = sext <32 x i16> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = sext <32 x i16> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = sext <32 x i32> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = sext <32 x i1> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = sext <32 x i1> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = sext <64 x i8> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = sext <64 x i16> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = sext <64 x i16> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = sext <64 x i32> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = sext <64 x i1> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = sext <64 x i1> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = sext <128 x i8> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = sext <128 x i16> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = sext <128 x i16> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = sext <128 x i32> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = sext <128 x i1> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = sext <128 x i1> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = sext <256 x i8> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = sext <256 x i16> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = sext <256 x i16> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = sext <256 x i32> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = sext <256 x i1> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = sext <256 x i1> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = sext <vscale x 1 x i8> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = sext <vscale x 1 x i16> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = sext <vscale x 1 x i16> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = sext <vscale x 1 x i32> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = sext <vscale x 1 x i1> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = sext <vscale x 1 x i1> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = sext <vscale x 1 x i1> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = sext <vscale x 2 x i8> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = sext <vscale x 2 x i16> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = sext <vscale x 2 x i16> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = sext <vscale x 2 x i32> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = sext <vscale x 2 x i1> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = sext <vscale x 2 x i1> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = sext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = sext <vscale x 4 x i16> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = sext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = sext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = sext <vscale x 4 x i1> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = sext <vscale x 4 x i1> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = sext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = sext <vscale x 8 x i16> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = sext <vscale x 8 x i16> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = sext <vscale x 8 x i32> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = sext <vscale x 8 x i1> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = sext <vscale x 8 x i1> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = sext <vscale x 16 x i8> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = sext <vscale x 16 x i16> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = sext <vscale x 16 x i16> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = sext <vscale x 16 x i32> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = sext <vscale x 16 x i1> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = sext <vscale x 16 x i1> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = sext <vscale x 32 x i8> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = sext <vscale x 32 x i16> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = sext <vscale x 32 x i16> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = sext <vscale x 32 x i32> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = sext <vscale x 32 x i1> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = sext <vscale x 32 x i1> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = sext <vscale x 64 x i8> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = sext <vscale x 64 x i16> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = sext <vscale x 64 x i16> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = sext <vscale x 64 x i32> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = sext <vscale x 64 x i1> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = sext <vscale x 64 x i1> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = sext <vscale x 128 x i8> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = sext <vscale x 128 x i16> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = sext <vscale x 128 x i16> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = sext <vscale x 128 x i32> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = sext <vscale x 128 x i1> undef to <vscale x 128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = sext <vscale x 128 x i1> undef to <vscale x 128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2i16 = sext <2 x i8> undef to <2 x i16>
   %v2i8_v2i32 = sext <2 x i8> undef to <2 x i32>
@@ -1005,651 +682,328 @@ define void @sext() {
 }
 
 define void @zext() {
-; RV32-LABEL: 'zext'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext <vscale x 1 x i16> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext <vscale x 1 x i16> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext <vscale x 1 x i32> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext <vscale x 1 x i1> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext <vscale x 1 x i1> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext <vscale x 1 x i1> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.zext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.zext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'zext'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext <vscale x 1 x i16> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext <vscale x 1 x i16> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext <vscale x 1 x i32> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext <vscale x 1 x i1> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext <vscale x 1 x i1> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext <vscale x 1 x i1> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.zext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.zext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'zext'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_v2i64 = zext <2 x i8> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i32 = zext <2 x i16> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i64 = zext <2 x i16> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i64 = zext <2 x i32> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i8 = zext <2 x i1> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i16 = zext <2 x i1> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i64 = zext <4 x i8> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i32 = zext <4 x i16> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i64 = zext <4 x i16> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i64 = zext <4 x i32> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i8 = zext <4 x i1> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i16 = zext <4 x i1> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_v8i64 = zext <8 x i8> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i32 = zext <8 x i16> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i16_v8i64 = zext <8 x i16> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i64 = zext <8 x i32> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i8 = zext <8 x i1> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i1_v8i16 = zext <8 x i1> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i8_v16i64 = zext <16 x i8> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16i32 = zext <16 x i16> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16_v16i64 = zext <16 x i16> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16i64 = zext <16 x i32> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i1_v16i8 = zext <16 x i1> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_v16i16 = zext <16 x i1> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i8_v32i64 = zext <32 x i8> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32i32 = zext <32 x i16> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i16_v32i64 = zext <32 x i16> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32i64 = zext <32 x i32> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_v32i8 = zext <32 x i1> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i1_v32i16 = zext <32 x i1> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i8_v64i64 = zext <64 x i8> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64i32 = zext <64 x i16> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v64i16_v64i64 = zext <64 x i16> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64i64 = zext <64 x i32> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i1_v64i8 = zext <64 x i1> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i1_v64i16 = zext <64 x i1> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %v128i8_v128i64 = zext <128 x i8> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128i32 = zext <128 x i16> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v128i16_v128i64 = zext <128 x i16> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128i64 = zext <128 x i32> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i1_v128i8 = zext <128 x i1> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v128i1_v128i16 = zext <128 x i1> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %v256i8_v256i64 = zext <256 x i8> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v256i16_v256i32 = zext <256 x i16> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %v256i16_v256i64 = zext <256 x i16> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %v256i32_v256i64 = zext <256 x i32> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v256i1_v256i8 = zext <256 x i1> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v256i1_v256i16 = zext <256 x i1> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 142 for instruction: %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 140 for instruction: %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 136 for instruction: %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 151 for instruction: %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8_nxv1i64 = zext <vscale x 1 x i8> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i32 = zext <vscale x 1 x i16> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i64 = zext <vscale x 1 x i16> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i64 = zext <vscale x 1 x i32> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i8 = zext <vscale x 1 x i1> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i16 = zext <vscale x 1 x i1> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i32 = zext <vscale x 1 x i1> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i64 = zext <vscale x 2 x i8> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i32 = zext <vscale x 2 x i16> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i64 = zext <vscale x 2 x i16> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i64 = zext <vscale x 2 x i32> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i8 = zext <vscale x 2 x i1> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i16 = zext <vscale x 2 x i1> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i8_nxv4i64 = zext <vscale x 4 x i8> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i32 = zext <vscale x 4 x i16> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i16_nxv4i64 = zext <vscale x 4 x i16> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i64 = zext <vscale x 4 x i32> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i8 = zext <vscale x 4 x i1> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i1_nxv4i16 = zext <vscale x 4 x i1> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i8_nxv8i64 = zext <vscale x 8 x i8> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i32 = zext <vscale x 8 x i16> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i16_nxv8i64 = zext <vscale x 8 x i16> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i64 = zext <vscale x 8 x i32> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i1_nxv8i8 = zext <vscale x 8 x i1> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i1_nxv8i16 = zext <vscale x 8 x i1> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i8_nxv16i64 = zext <vscale x 16 x i8> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i32 = zext <vscale x 16 x i16> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i16_nxv16i64 = zext <vscale x 16 x i16> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16i64 = zext <vscale x 16 x i32> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i1_nxv16i8 = zext <vscale x 16 x i1> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i1_nxv16i16 = zext <vscale x 16 x i1> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i8_nxv32i64 = zext <vscale x 32 x i8> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32i32 = zext <vscale x 32 x i16> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv32i16_nxv32i64 = zext <vscale x 32 x i16> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32i64 = zext <vscale x 32 x i32> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i1_nxv32i8 = zext <vscale x 32 x i1> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i1_nxv32i16 = zext <vscale x 32 x i1> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %nxv64i8_nxv64i64 = zext <vscale x 64 x i8> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64i32 = zext <vscale x 64 x i16> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv64i16_nxv64i64 = zext <vscale x 64 x i16> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64i64 = zext <vscale x 64 x i32> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i1_nxv64i8 = zext <vscale x 64 x i1> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv64i1_nxv64i16 = zext <vscale x 64 x i1> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 35 for instruction: %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.zext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i8_nxv128i128 = zext <vscale x 128 x i8> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv128i16_nxv128i32 = zext <vscale x 128 x i16> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i16_nxv128i128 = zext <vscale x 128 x i16> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i32_nxv128i128 = zext <vscale x 128 x i32> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv128i1_nxv128i8 = zext <vscale x 128 x i1> undef to <vscale x 128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %nxv128i1_nxv128i16 = zext <vscale x 128 x i1> undef to <vscale x 128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %nxv128i1_nxv128i32 = zext <vscale x 128 x i1> undef to <vscale x 128 x i32>
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %nxv128i1_nxv128i128 = zext <vscale x 128 x i1> undef to <vscale x 128 x i128>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 70 for instruction: %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.zext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.zext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.zext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.zext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2i16 = zext <2 x i8> undef to <2 x i16>
   %v2i8_v2i32 = zext <2 x i8> undef to <2 x i32>
@@ -2007,631 +1361,318 @@ define void @zext() {
 }
 
 define void @trunc() {
-; RV32-LABEL: 'trunc'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 x i64> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x i32> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc <vscale x 1 x i8> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc <vscale x 2 x i8> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc <vscale x 4 x i8> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.trunc.nxv4i32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i64_nxv8i16 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc <vscale x 8 x i8> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.trunc.nxv8i32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc <vscale x 16 x i8> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.trunc.nxv16i32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc <vscale x 32 x i8> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.trunc.nxv32i32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i8 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %nxv64i64_nxv64i16 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64i32 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc <vscale x 64 x i8> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i64_nxv64i1 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 55 for instruction: %vp_nxv64i64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64i64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.trunc.nxv64i32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64i64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'trunc'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 x i64> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x i32> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc <vscale x 1 x i8> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc <vscale x 2 x i8> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc <vscale x 4 x i8> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.trunc.nxv4i32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i64_nxv8i16 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc <vscale x 8 x i8> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.trunc.nxv8i32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc <vscale x 16 x i8> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.trunc.nxv16i32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc <vscale x 32 x i8> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.trunc.nxv32i32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64i64_nxv64i8 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64i16 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64i32 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc <vscale x 64 x i8> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64i64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64i64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64i64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.trunc.nxv64i32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'trunc'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i4 = trunc <2 x i16> undef to <2 x i4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i6 = trunc <2 x i16> undef to <2 x i6>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2i8 = trunc <2 x i64> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2i16 = trunc <2 x i32> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i16 = trunc <2 x i64> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2i32 = trunc <2 x i64> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2i1 = trunc <2 x i8> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2i1 = trunc <2 x i16> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_v4i8 = trunc <4 x i64> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4i16 = trunc <4 x i32> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4i16 = trunc <4 x i64> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4i32 = trunc <4 x i64> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4i1 = trunc <4 x i8> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i16_v4i1 = trunc <4 x i16> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8i8 = trunc <8 x i64> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_v8i16 = trunc <8 x i32> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_v8i16 = trunc <8 x i64> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8i32 = trunc <8 x i64> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i8_v8i1 = trunc <8 x i8> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8i1 = trunc <8 x i16> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64_v16i8 = trunc <2 x i64> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_v16i16 = trunc <2 x i32> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i16 = trunc <2 x i64> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i64_v16i32 = trunc <2 x i64> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i8_v16i1 = trunc <2 x i8> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i16_v16i1 = trunc <2 x i16> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v32i64_v32i8 = trunc <16 x i64> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i32_v32i16 = trunc <16 x i32> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32i64_v32i16 = trunc <16 x i64> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i64_v32i32 = trunc <16 x i64> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32i8_v32i1 = trunc <16 x i8> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i16_v32i1 = trunc <16 x i16> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64i64_v64i8 = trunc <64 x i64> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i32_v64i16 = trunc <64 x i32> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64i64_v64i16 = trunc <64 x i64> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64i32 = trunc <64 x i64> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v64i8_v64i1 = trunc <64 x i8> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i16_v64i1 = trunc <64 x i16> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128i64_v128i8 = trunc <128 x i64> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128i32_v128i16 = trunc <128 x i32> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128i64_v128i16 = trunc <128 x i64> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128i32 = trunc <128 x i64> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v128i8_v128i1 = trunc <128 x i8> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v128i16_v128i1 = trunc <128 x i16> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v256i64_v256i8 = trunc <256 x i64> undef to <256 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v256i32_v256i16 = trunc <256 x i32> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %v256i64_v256i16 = trunc <256 x i64> undef to <256 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %v256i64_v256i32 = trunc <256 x i64> undef to <256 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v256i8_v256i1 = trunc <256 x i8> undef to <256 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v256i16_v256i1 = trunc <256 x i16> undef to <256 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 108 for instruction: %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 72 for instruction: %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 271 for instruction: %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i64_nxv1i8 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1i16 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i16 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1i32 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1i1 = trunc <vscale x 1 x i8> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1i1 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i32_nxv1i1 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i64_nxv1i1 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i64_nxv2i8 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2i16 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2i16 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2i32 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2i1 = trunc <vscale x 2 x i8> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i16_nxv2i1 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i64_nxv2i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16_nxv4i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4i8 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32_nxv4i16 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i64_nxv4i16 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4i32 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i8_nxv4i1 = trunc <vscale x 4 x i8> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4i1 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i64_nxv4i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i16_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4i32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4i64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.trunc.nxv4i32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i8_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4i16_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4i32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4i64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16_nxv8i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8i32_nxv8i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8i64_nxv8i8 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i32_nxv8i16 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i64_nxv8i16 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8i32 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8i8_nxv8i1 = trunc <vscale x 8 x i8> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8i1 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i64_nxv8i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8i16_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8i32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8i64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8i64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.trunc.nxv8i32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8i8_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8i16_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8i32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8i64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i16_nxv16i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16i32_nxv16i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16i64_nxv16i8 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i32_nxv16i16 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16i64_nxv16i16 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16i32 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16i8_nxv16i1 = trunc <vscale x 16 x i8> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16i1 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i32_nxv16i1 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i64_nxv16i1 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16i16_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16i32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16i64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16i64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16i64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.trunc.nxv16i32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16i8_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16i16_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16i32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16i64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i16_nxv32i8 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32i32_nxv32i8 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32i64_nxv32i8 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32i32_nxv32i16 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32i64_nxv32i16 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32i32 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv32i8_nxv32i1 = trunc <vscale x 32 x i8> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv32i16_nxv32i1 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i32_nxv32i1 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i64_nxv32i1 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32i16_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32i32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32i64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32i32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32i64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32i64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.trunc.nxv32i32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv32i8_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv32i16_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32i32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32i64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i16_nxv64i8 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64i32_nxv64i8 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64i64_nxv64i8 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64i32_nxv64i16 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64i64_nxv64i16 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64i32 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv64i8_nxv64i1 = trunc <vscale x 64 x i8> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv64i16_nxv64i1 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i32_nxv64i1 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i64_nxv64i1 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64i16_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64i32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64i64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64i32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64i64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64i64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.trunc.nxv64i32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv64i8_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv64i16_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64i32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64i64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 
   %v2i16_v2i2 = trunc <2 x i16> undef to <2 x i2>
@@ -3386,571 +2427,288 @@ define void @fptrunc() {
 }
 
 define void @fptosi() {
-; RV32-LABEL: 'fptosi'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'fptosi'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'fptosi'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
   %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
@@ -4264,571 +3022,288 @@ define void @fptosi() {
 }
 
 define void @fptoui() {
-; RV32-LABEL: 'fptoui'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 69 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'fptoui'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'fptoui'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
   %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
@@ -5142,571 +3617,288 @@ define void @fptoui() {
 }
 
 define void @sitofp() {
-; RV32-LABEL: 'sitofp'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'sitofp'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'sitofp'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
   %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
@@ -6020,571 +4212,288 @@ define void @sitofp() {
 }
 
 define void @uitofp() {
-; RV32-LABEL: 'uitofp'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; RV32-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'uitofp'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; RV64-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'uitofp'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
   %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
@@ -6985,3 +4894,6 @@ define void @legalization_crash() {
   fptoui <192 x float> undef to <192 x i1>
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/Analysis/CostModel/RISCV/cmp.ll b/llvm/test/Analysis/CostModel/RISCV/cmp.ll
index 69d4f27ac41be..793f0dd2fe049 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cmp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cmp.ll
@@ -3,331 +3,168 @@
 ; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+f -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64
 
 define void @icmp() {
-; RV32-LABEL: 'icmp'
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt <vscale x 1 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt <vscale x 1 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt <vscale x 1 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt <vscale x 1 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt <vscale x 1 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %86 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %87 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %88 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %89 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %90 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt <vscale x 2 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt <vscale x 2 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt <vscale x 2 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt <vscale x 2 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt <vscale x 2 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %96 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %97 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %98 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %99 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %100 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt <vscale x 4 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt <vscale x 4 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt <vscale x 4 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt <vscale x 4 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt <vscale x 4 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %106 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %107 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %108 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %109 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %110 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt <vscale x 8 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt <vscale x 8 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt <vscale x 8 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt <vscale x 8 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt <vscale x 8 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %116 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %117 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %118 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %119 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %120 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt <vscale x 16 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt <vscale x 16 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt <vscale x 16 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt <vscale x 16 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt <vscale x 16 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %126 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %127 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %128 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %129 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %130 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt <vscale x 32 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt <vscale x 32 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt <vscale x 32 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt <vscale x 32 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt <vscale x 32 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %136 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %137 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %138 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %139 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i32> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %140 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i64> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt <vscale x 64 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt <vscale x 64 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt <vscale x 64 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt <vscale x 64 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %145 = icmp slt <vscale x 64 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %146 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %147 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %148 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i16> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %149 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i32> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %150 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i64> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt <vscale x 128 x i1> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt <vscale x 128 x i8> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt <vscale x 128 x i16> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt <vscale x 128 x i32> undef, undef
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %155 = icmp slt <vscale x 128 x i64> undef, undef
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %156 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %157 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i8> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %158 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i16> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %159 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i32> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Invalid cost for instruction: %160 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i64(<vscale x 128 x i64> undef, <vscale x 128 x i64> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV32-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; RV64-LABEL: 'icmp'
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt <vscale x 1 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt <vscale x 1 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt <vscale x 1 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt <vscale x 1 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt <vscale x 1 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %86 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %87 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %88 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %89 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %90 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt <vscale x 2 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt <vscale x 2 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt <vscale x 2 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt <vscale x 2 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt <vscale x 2 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %96 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %97 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %98 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %99 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %100 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt <vscale x 4 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt <vscale x 4 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt <vscale x 4 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt <vscale x 4 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt <vscale x 4 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %106 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %107 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %108 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %109 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %110 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt <vscale x 8 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt <vscale x 8 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt <vscale x 8 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt <vscale x 8 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt <vscale x 8 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %116 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %117 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %118 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %119 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %120 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt <vscale x 16 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt <vscale x 16 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt <vscale x 16 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt <vscale x 16 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt <vscale x 16 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %126 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %127 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %128 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %129 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %130 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt <vscale x 32 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt <vscale x 32 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt <vscale x 32 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt <vscale x 32 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt <vscale x 32 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %136 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %137 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %138 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %139 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i32> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %140 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i64> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt <vscale x 64 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt <vscale x 64 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt <vscale x 64 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt <vscale x 64 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %145 = icmp slt <vscale x 64 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %146 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %147 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %148 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i16> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %149 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i32> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %150 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i64> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt <vscale x 128 x i1> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt <vscale x 128 x i8> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt <vscale x 128 x i16> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt <vscale x 128 x i32> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %155 = icmp slt <vscale x 128 x i64> undef, undef
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %156 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %157 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i8> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %158 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i16> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %159 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i32> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %160 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i64(<vscale x 128 x i64> undef, <vscale x 128 x i64> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
-; RV64-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-LABEL: 'icmp'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt <vscale x 1 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt <vscale x 1 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt <vscale x 1 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt <vscale x 1 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt <vscale x 1 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %86 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %87 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i8> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %88 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i16> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %89 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i32> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %90 = call <vscale x 1 x i1> @llvm.vp.icmp.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i64> undef, metadata !"slt", <vscale x 1 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt <vscale x 2 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt <vscale x 2 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt <vscale x 2 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt <vscale x 2 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt <vscale x 2 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %96 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %97 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i8> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %98 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i16> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %99 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i32> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %100 = call <vscale x 2 x i1> @llvm.vp.icmp.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i64> undef, metadata !"slt", <vscale x 2 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt <vscale x 4 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt <vscale x 4 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt <vscale x 4 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt <vscale x 4 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt <vscale x 4 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %106 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %107 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i8> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %108 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i16> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %109 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i32> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %110 = call <vscale x 4 x i1> @llvm.vp.icmp.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i64> undef, metadata !"slt", <vscale x 4 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt <vscale x 8 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt <vscale x 8 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt <vscale x 8 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt <vscale x 8 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt <vscale x 8 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %116 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %117 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i8> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %118 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i16> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %119 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i32> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %120 = call <vscale x 8 x i1> @llvm.vp.icmp.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i64> undef, metadata !"slt", <vscale x 8 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt <vscale x 16 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt <vscale x 16 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt <vscale x 16 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt <vscale x 16 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt <vscale x 16 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %126 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %127 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i8> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %128 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i16> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %129 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i32> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %130 = call <vscale x 16 x i1> @llvm.vp.icmp.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i64> undef, metadata !"slt", <vscale x 16 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt <vscale x 32 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt <vscale x 32 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt <vscale x 32 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt <vscale x 32 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt <vscale x 32 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %136 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %137 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i8> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %138 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i16> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %139 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i32> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %140 = call <vscale x 32 x i1> @llvm.vp.icmp.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i64> undef, metadata !"slt", <vscale x 32 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt <vscale x 64 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt <vscale x 64 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt <vscale x 64 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt <vscale x 64 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %145 = icmp slt <vscale x 64 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %146 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %147 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i8> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %148 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i16> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %149 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i32> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %150 = call <vscale x 64 x i1> @llvm.vp.icmp.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i64> undef, metadata !"slt", <vscale x 64 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt <vscale x 128 x i1> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt <vscale x 128 x i8> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt <vscale x 128 x i16> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt <vscale x 128 x i32> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %155 = icmp slt <vscale x 128 x i64> undef, undef
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %156 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %157 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i8> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %158 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i16> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %159 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i32> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %160 = call <vscale x 128 x i1> @llvm.vp.icmp.nxv128i64(<vscale x 128 x i64> undef, <vscale x 128 x i64> undef, metadata !"slt", <vscale x 128 x i1> undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   icmp slt <2 x i1> undef, undef
   icmp slt <2 x i8> undef, undef
@@ -658,3 +495,6 @@ define void @fcmp() {
 
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll
index 7442be92fffcd..83192930f5cdf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsub-sdnode.ll
@@ -921,3 +921,801 @@ define <vscale x 8 x i32> @vsub_vi_mask_nxv8i32(<vscale x 8 x i32> %va, <vscale
   %vc = sub <vscale x 8 x i32> %va, %vs
   ret <vscale x 8 x i32> %vc
 }
+
+; Make sure we are able to split a type that isn't an MVT even if the scalar
+; element type isn't legal on RV32. This used to crash.
+define <vscale x 64 x i64> @vsub_vv_nxv64i64(<vscale x 64 x i64> %va, <vscale x 64 x i64> %vb) {
+; RV32-LABEL: vsub_vv_nxv64i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    .cfi_def_cfa_offset 80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 68(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 64(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 60(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 56(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s5, 52(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s6, 48(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s7, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s8, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s9, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s1, -12
+; RV32-NEXT:    .cfi_offset s2, -16
+; RV32-NEXT:    .cfi_offset s3, -20
+; RV32-NEXT:    .cfi_offset s4, -24
+; RV32-NEXT:    .cfi_offset s5, -28
+; RV32-NEXT:    .cfi_offset s6, -32
+; RV32-NEXT:    .cfi_offset s7, -36
+; RV32-NEXT:    .cfi_offset s8, -40
+; RV32-NEXT:    .cfi_offset s9, -44
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    mv a3, a2
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    add a3, a3, a2
+; RV32-NEXT:    slli a2, a2, 2
+; RV32-NEXT:    add a2, a2, a3
+; RV32-NEXT:    sub sp, sp, a2
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0f, 0x72, 0x00, 0x11, 0xd0, 0x00, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 80 + 88 * vlenb
+; RV32-NEXT:    mv s2, a7
+; RV32-NEXT:    mv s3, a1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr s6, vlenb
+; RV32-NEXT:    slli s7, s6, 4
+; RV32-NEXT:    slli s8, s6, 3
+; RV32-NEXT:    add a1, a7, s7
+; RV32-NEXT:    vl8re64.v v8, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    add a1, a7, s8
+; RV32-NEXT:    vl8re64.v v8, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    mv s0, a0
+; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    mv s1, a0
+; RV32-NEXT:    add a0, s2, a0
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    mv s4, a0
+; RV32-NEXT:    add a0, s3, a0
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    li a1, 48
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    mv s5, a0
+; RV32-NEXT:    add a0, s2, a0
+; RV32-NEXT:    slli s9, s6, 5
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    add a0, s3, s9
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    add a0, s2, s4
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a1, a1, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    li a1, 24
+; RV32-NEXT:    mv a0, s6
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (s3)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v8, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    add a1, s3, a0
+; RV32-NEXT:    add a2, s2, s9
+; RV32-NEXT:    add a3, s3, s7
+; RV32-NEXT:    add a4, s2, a0
+; RV32-NEXT:    add s3, s3, s8
+; RV32-NEXT:    vl8re64.v v8, (s2)
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 3
+; RV32-NEXT:    mv a6, a5
+; RV32-NEXT:    slli a5, a5, 1
+; RV32-NEXT:    add a5, a5, a6
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 32
+; RV32-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (a2)
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    mv a5, a2
+; RV32-NEXT:    slli a2, a2, 1
+; RV32-NEXT:    add a2, a2, a5
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 32
+; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (a4)
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 4
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 32
+; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (s3)
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 32
+; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v8, (a3)
+; RV32-NEXT:    addi a2, sp, 32
+; RV32-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    vl8re64.v v24, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v0, v0, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    addi a1, sp, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v8, v16, v8
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a2, a2, a1
+; RV32-NEXT:    slli a1, a1, 1
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vsub.vv v16, v24, v16
+; RV32-NEXT:    vs8r.v v0, (s0)
+; RV32-NEXT:    add s1, s0, s1
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 5
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v24, (s1)
+; RV32-NEXT:    add s5, s0, s5
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v24, (s5)
+; RV32-NEXT:    add s4, s0, s4
+; RV32-NEXT:    vs8r.v v16, (s4)
+; RV32-NEXT:    add s9, s0, s9
+; RV32-NEXT:    add a0, s0, a0
+; RV32-NEXT:    add s7, s0, s7
+; RV32-NEXT:    add s0, s0, s8
+; RV32-NEXT:    vs8r.v v8, (s9)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    mv a2, a1
+; RV32-NEXT:    slli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 32
+; RV32-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v8, (s7)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 6
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 32
+; RV32-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT:    vs8r.v v8, (s0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    mv a1, a0
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a1, a1, a0
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    add a0, a0, a1
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 68(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 64(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 60(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 56(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s5, 52(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s6, 48(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s7, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s8, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s9, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    .cfi_restore s1
+; RV32-NEXT:    .cfi_restore s2
+; RV32-NEXT:    .cfi_restore s3
+; RV32-NEXT:    .cfi_restore s4
+; RV32-NEXT:    .cfi_restore s5
+; RV32-NEXT:    .cfi_restore s6
+; RV32-NEXT:    .cfi_restore s7
+; RV32-NEXT:    .cfi_restore s8
+; RV32-NEXT:    .cfi_restore s9
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vsub_vv_nxv64i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -128
+; RV64-NEXT:    .cfi_def_cfa_offset 128
+; RV64-NEXT:    sd ra, 120(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 112(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 104(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 96(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 88(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 80(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s5, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s6, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s7, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s8, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s9, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s1, -24
+; RV64-NEXT:    .cfi_offset s2, -32
+; RV64-NEXT:    .cfi_offset s3, -40
+; RV64-NEXT:    .cfi_offset s4, -48
+; RV64-NEXT:    .cfi_offset s5, -56
+; RV64-NEXT:    .cfi_offset s6, -64
+; RV64-NEXT:    .cfi_offset s7, -72
+; RV64-NEXT:    .cfi_offset s8, -80
+; RV64-NEXT:    .cfi_offset s9, -88
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    mv a3, a2
+; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    add a3, a3, a2
+; RV64-NEXT:    slli a2, a2, 2
+; RV64-NEXT:    add a2, a2, a3
+; RV64-NEXT:    sub sp, sp, a2
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0f, 0x72, 0x00, 0x11, 0x80, 0x01, 0x22, 0x11, 0xd8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 128 + 88 * vlenb
+; RV64-NEXT:    mv s2, a7
+; RV64-NEXT:    mv s3, a1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr s6, vlenb
+; RV64-NEXT:    slli s7, s6, 4
+; RV64-NEXT:    slli s8, s6, 3
+; RV64-NEXT:    add a1, a7, s7
+; RV64-NEXT:    vl8re64.v v8, (a1)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    add a1, a7, s8
+; RV64-NEXT:    vl8re64.v v8, (a1)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    mv s0, a0
+; RV64-NEXT:    li a1, 56
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    mv s1, a0
+; RV64-NEXT:    add a0, s2, a0
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    li a1, 40
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    mv s4, a0
+; RV64-NEXT:    add a0, s3, a0
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    li a1, 48
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    mv s5, a0
+; RV64-NEXT:    add a0, s2, a0
+; RV64-NEXT:    slli s9, s6, 5
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    add a0, s3, s9
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 4
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    add a0, s2, s4
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a1, a1, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vs8r.v v8, (a0) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    li a1, 24
+; RV64-NEXT:    mv a0, s6
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 6
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (s3)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v8, v16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    add a1, s3, a0
+; RV64-NEXT:    add a2, s2, s9
+; RV64-NEXT:    add a3, s3, s7
+; RV64-NEXT:    add a4, s2, a0
+; RV64-NEXT:    add s3, s3, s8
+; RV64-NEXT:    vl8re64.v v8, (s2)
+; RV64-NEXT:    csrr a5, vlenb
+; RV64-NEXT:    slli a5, a5, 3
+; RV64-NEXT:    mv a6, a5
+; RV64-NEXT:    slli a5, a5, 1
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    add a5, sp, a5
+; RV64-NEXT:    addi a5, a5, 32
+; RV64-NEXT:    vs8r.v v8, (a5) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (a2)
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    mv a5, a2
+; RV64-NEXT:    slli a2, a2, 1
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 32
+; RV64-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (a4)
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 4
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 32
+; RV64-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (s3)
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 32
+; RV64-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v8, (a3)
+; RV64-NEXT:    addi a2, sp, 32
+; RV64-NEXT:    vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    vl8re64.v v24, (a1)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v0, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v0, v0, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    addi a1, sp, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v8, v16, v8
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a2, a2, a1
+; RV64-NEXT:    slli a1, a1, 1
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v16, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vsub.vv v16, v24, v16
+; RV64-NEXT:    vs8r.v v0, (s0)
+; RV64-NEXT:    add s1, s0, s1
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 5
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v24, (s1)
+; RV64-NEXT:    add s5, s0, s5
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 3
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v24, (s5)
+; RV64-NEXT:    add s4, s0, s4
+; RV64-NEXT:    vs8r.v v16, (s4)
+; RV64-NEXT:    add s9, s0, s9
+; RV64-NEXT:    add a0, s0, a0
+; RV64-NEXT:    add s7, s0, s7
+; RV64-NEXT:    add s0, s0, s8
+; RV64-NEXT:    vs8r.v v8, (s9)
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a1, a1, 4
+; RV64-NEXT:    mv a2, a1
+; RV64-NEXT:    slli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 32
+; RV64-NEXT:    vl8r.v v8, (a1) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v8, (s7)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 6
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 32
+; RV64-NEXT:    vl8r.v v8, (a0) # vscale x 64-byte Folded Reload
+; RV64-NEXT:    vs8r.v v8, (s0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 3
+; RV64-NEXT:    mv a1, a0
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a1, a1, a0
+; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    add a0, a0, a1
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 128
+; RV64-NEXT:    ld ra, 120(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 112(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 104(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 96(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 88(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 80(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s5, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s6, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s7, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s8, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s9, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    .cfi_restore s1
+; RV64-NEXT:    .cfi_restore s2
+; RV64-NEXT:    .cfi_restore s3
+; RV64-NEXT:    .cfi_restore s4
+; RV64-NEXT:    .cfi_restore s5
+; RV64-NEXT:    .cfi_restore s6
+; RV64-NEXT:    .cfi_restore s7
+; RV64-NEXT:    .cfi_restore s8
+; RV64-NEXT:    .cfi_restore s9
+; RV64-NEXT:    addi sp, sp, 128
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
+  %vc = sub <vscale x 64 x i64> %va, %vb
+  ret <vscale x 64 x i64> %vc
+}

From c62a6138d9d02bcc0fb6660bbed78b4e979fc3dc Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com>
Date: Mon, 16 Jun 2025 13:05:31 -0400
Subject: [PATCH 613/851] Revert "[InstCombine] Iterative replacement in
 PtrReplacer" (#144394)

Reverts llvm/llvm-project#137215

This commit caused a failure in the LLVM CI:
https://lab.llvm.org/buildbot/#/builders/10/builds/7442
---
 .../InstCombineLoadStoreAlloca.cpp            | 165 ++++++++----------
 .../InstCombine/AMDGPU/ptr-replace-alloca.ll  |  79 ---------
 2 files changed, 69 insertions(+), 175 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 9aec90120d8b0..a9751ab03e20e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -243,10 +243,11 @@ class PointerReplacer {
   void replacePointer(Value *V);
 
 private:
+  bool collectUsersRecursive(Instruction &I);
   void replace(Instruction *I);
-  Value *getReplacement(Value *V) const { return WorkMap.lookup(V); }
+  Value *getReplacement(Value *I);
   bool isAvailable(Instruction *I) const {
-    return I == &Root || UsersToReplace.contains(I);
+    return I == &Root || Worklist.contains(I);
   }
 
   bool isEqualOrValidAddrSpaceCast(const Instruction *I,
@@ -258,7 +259,8 @@ class PointerReplacer {
     return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS);
   }
 
-  SmallSetVector<Instruction *, 32> UsersToReplace;
+  SmallPtrSet<Instruction *, 32> ValuesToRevisit;
+  SmallSetVector<Instruction *, 4> Worklist;
   MapVector<Value *, Value *> WorkMap;
   InstCombinerImpl &IC;
   Instruction &Root;
@@ -267,79 +269,72 @@ class PointerReplacer {
 } // end anonymous namespace
 
 bool PointerReplacer::collectUsers() {
-  SmallVector<Instruction *> Worklist;
-  SmallSetVector<Instruction *, 32> ValuesToRevisit;
-
-  auto PushUsersToWorklist = [&](Instruction *Inst) {
-    for (auto *U : Inst->users())
-      if (auto *I = dyn_cast<Instruction>(U))
-        if (!isAvailable(I) && !ValuesToRevisit.contains(I))
-          Worklist.emplace_back(I);
-  };
+  if (!collectUsersRecursive(Root))
+    return false;
 
-  PushUsersToWorklist(&Root);
-  while (!Worklist.empty()) {
-    Instruction *Inst = Worklist.pop_back_val();
+  // Ensure that all outstanding (indirect) users of I
+  // are inserted into the Worklist. Return false
+  // otherwise.
+  return llvm::set_is_subset(ValuesToRevisit, Worklist);
+}
+
+bool PointerReplacer::collectUsersRecursive(Instruction &I) {
+  for (auto *U : I.users()) {
+    auto *Inst = cast<Instruction>(&*U);
     if (auto *Load = dyn_cast<LoadInst>(Inst)) {
       if (Load->isVolatile())
         return false;
-      UsersToReplace.insert(Load);
+      Worklist.insert(Load);
     } else if (auto *PHI = dyn_cast<PHINode>(Inst)) {
-      /// TODO: Handle poison and null pointers for PHI and select.
-      // If all incoming values are available, mark this PHI as
-      // replacable and push it's users into the worklist.
-      bool IsReplacable = true;
-      if (all_of(PHI->incoming_values(), [&](Value *V) {
-            if (!isa<Instruction>(V))
-              return IsReplacable = false;
-            return isAvailable(cast<Instruction>(V));
+      // All incoming values must be instructions for replacability
+      if (any_of(PHI->incoming_values(),
+                 [](Value *V) { return !isa<Instruction>(V); }))
+        return false;
+
+      // If at least one incoming value of the PHI is not in Worklist,
+      // store the PHI for revisiting and skip this iteration of the
+      // loop.
+      if (any_of(PHI->incoming_values(), [this](Value *V) {
+            return !isAvailable(cast<Instruction>(V));
           })) {
-        UsersToReplace.insert(PHI);
-        PushUsersToWorklist(PHI);
+        ValuesToRevisit.insert(Inst);
         continue;
       }
 
-      // Either an incoming value is not an instruction or not all
-      // incoming values are available. If this PHI was already
-      // visited prior to this iteration, return false.
-      if (!IsReplacable || !ValuesToRevisit.insert(PHI))
+      Worklist.insert(PHI);
+      if (!collectUsersRecursive(*PHI))
         return false;
-
-      // Push PHI back into the stack, followed by unavailable
-      // incoming values.
-      Worklist.emplace_back(PHI);
-      for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) {
-        auto *IncomingValue = cast<Instruction>(PHI->getIncomingValue(Idx));
-        if (UsersToReplace.contains(IncomingValue))
-          continue;
-        if (!ValuesToRevisit.insert(IncomingValue))
-          return false;
-        Worklist.emplace_back(IncomingValue);
-      }
     } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
-      auto *TrueInst = dyn_cast<Instruction>(SI->getTrueValue());
-      auto *FalseInst = dyn_cast<Instruction>(SI->getFalseValue());
-      if (!TrueInst || !FalseInst)
+      if (!isa<Instruction>(SI->getTrueValue()) ||
+          !isa<Instruction>(SI->getFalseValue()))
         return false;
 
-      UsersToReplace.insert(SI);
-      PushUsersToWorklist(SI);
-    } else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
-      UsersToReplace.insert(GEP);
-      PushUsersToWorklist(GEP);
+      if (!isAvailable(cast<Instruction>(SI->getTrueValue())) ||
+          !isAvailable(cast<Instruction>(SI->getFalseValue()))) {
+        ValuesToRevisit.insert(Inst);
+        continue;
+      }
+      Worklist.insert(SI);
+      if (!collectUsersRecursive(*SI))
+        return false;
+    } else if (isa<GetElementPtrInst>(Inst)) {
+      Worklist.insert(Inst);
+      if (!collectUsersRecursive(*Inst))
+        return false;
     } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
       if (MI->isVolatile())
         return false;
-      UsersToReplace.insert(Inst);
+      Worklist.insert(Inst);
     } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) {
-      UsersToReplace.insert(Inst);
-      PushUsersToWorklist(Inst);
+      Worklist.insert(Inst);
+      if (!collectUsersRecursive(*Inst))
+        return false;
     } else if (Inst->isLifetimeStartOrEnd()) {
       continue;
     } else {
       // TODO: For arbitrary uses with address space mismatches, should we check
       // if we can introduce a valid addrspacecast?
-      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *Inst << '\n');
+      LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
       return false;
     }
   }
@@ -347,39 +342,7 @@ bool PointerReplacer::collectUsers() {
   return true;
 }
 
-void PointerReplacer::replacePointer(Value *V) {
-  assert(cast<PointerType>(Root.getType()) != cast<PointerType>(V->getType()) &&
-         "Invalid usage");
-  WorkMap[&Root] = V;
-  SmallVector<Instruction *> Worklist;
-  SetVector<Instruction *> PostOrderWorklist;
-  SmallPtrSet<Instruction *, 32> Visited;
-
-  // Perform a postorder traversal of the users of Root.
-  Worklist.push_back(&Root);
-  while (!Worklist.empty()) {
-    Instruction *I = Worklist.back();
-
-    // If I has not been processed before, push each of its
-    // replacable users into the worklist.
-    if (Visited.insert(I).second) {
-      for (auto *U : I->users()) {
-        auto *UserInst = cast<Instruction>(U);
-        if (UsersToReplace.contains(UserInst))
-          Worklist.push_back(UserInst);
-      }
-      // Otherwise, users of I have already been pushed into
-      // the PostOrderWorklist. Push I as well.
-    } else {
-      PostOrderWorklist.insert(I);
-      Worklist.pop_back();
-    }
-  }
-
-  // Replace pointers in reverse-postorder.
-  for (Instruction *I : reverse(PostOrderWorklist))
-    replace(I);
-}
+Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
 
 void PointerReplacer::replace(Instruction *I) {
   if (getReplacement(I))
@@ -401,15 +364,13 @@ void PointerReplacer::replace(Instruction *I) {
     // replacement (new value).
     WorkMap[NewI] = NewI;
   } else if (auto *PHI = dyn_cast<PHINode>(I)) {
-    // Create a new PHI by replacing any incoming value that is a user of the
-    // root pointer and has a replacement.
-    Value *V = WorkMap.lookup(PHI->getIncomingValue(0));
-    PHI->mutateType(V ? V->getType() : PHI->getIncomingValue(0)->getType());
-    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I) {
-      Value *V = WorkMap.lookup(PHI->getIncomingValue(I));
-      PHI->setIncomingValue(I, V ? V : PHI->getIncomingValue(I));
-    }
-    WorkMap[PHI] = PHI;
+    Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType();
+    auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(),
+                                   PHI->getName(), PHI->getIterator());
+    for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I)
+      NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)),
+                          PHI->getIncomingBlock(I));
+    WorkMap[PHI] = NewPHI;
   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
     auto *V = getReplacement(GEP->getPointerOperand());
     assert(V && "Operand not replaced");
@@ -473,6 +434,18 @@ void PointerReplacer::replace(Instruction *I) {
   }
 }
 
+void PointerReplacer::replacePointer(Value *V) {
+#ifndef NDEBUG
+  auto *PT = cast<PointerType>(Root.getType());
+  auto *NT = cast<PointerType>(V->getType());
+  assert(PT != NT && "Invalid usage");
+#endif
+  WorkMap[&Root] = V;
+
+  for (Instruction *Workitem : Worklist)
+    replace(Workitem);
+}
+
 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
   if (auto *I = simplifyAllocaArraySize(*this, AI, DT))
     return I;
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
deleted file mode 100644
index 538cc19f9722e..0000000000000
--- a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll
+++ /dev/null
@@ -1,79 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=instcombine -S < %s | FileCheck %s
-
-%struct.type = type { [256 x <2 x i64>] }
-@g1 = external hidden addrspace(3) global %struct.type, align 16
-
-; This test requires the PtrReplacer to replace users in an RPO traversal.
-; Furthermore, %ptr.else need not to be replaced so it must be retained in
-; %ptr.sink.
-define <2 x i64> @func(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
-; CHECK-LABEL: define <2 x i64> @func(
-; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    br i1 [[CMP_0]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]]
-; CHECK:       [[IF_THEN]]:
-; CHECK-NEXT:    [[VAL_THEN:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
-; CHECK-NEXT:    br label %[[SINK:.*]]
-; CHECK:       [[IF_ELSE]]:
-; CHECK-NEXT:    [[PTR_ELSE:%.*]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-; CHECK-NEXT:    br label %[[SINK]]
-; CHECK:       [[SINK]]:
-; CHECK-NEXT:    [[PTR_SINK:%.*]] = phi ptr [ [[PTR_ELSE]], %[[IF_ELSE]] ], [ [[VAL_THEN]], %[[IF_THEN]] ]
-; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_SINK]], align 16
-; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
-;
-entry:
-  %coerce = alloca %struct.type, align 16, addrspace(5)
-  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
-  br i1 %cmp.0, label %if.then, label %if.else
-
-if.then:                                    ; preds = %entry
-  %ptr.then = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
-  %val.then = addrspacecast ptr addrspace(5) %ptr.then to ptr
-  br label %sink
-
-if.else:                                      ; preds = %entry
-  %ptr.else = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-  %val.else = getelementptr inbounds nuw i8, ptr %ptr.else, i64 0
-  br label %sink
-
-sink:
-  %ptr.sink = phi ptr [ %val.else, %if.else ], [ %val.then, %if.then ]
-  %val.sink = load <2 x i64>, ptr %ptr.sink, align 16
-  ret <2 x i64> %val.sink
-}
-
-define <2 x i64> @func_phi_loop(ptr addrspace(4) byref(%struct.type) align 16 %0, i1 %cmp.0) {
-; CHECK-LABEL: define <2 x i64> @func_phi_loop(
-; CHECK-SAME: ptr addrspace(4) byref([[STRUCT_TYPE:%.*]]) align 16 [[TMP0:%.*]], i1 [[CMP_0:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[VAL_0:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr
-; CHECK-NEXT:    br label %[[LOOP:.*]]
-; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[PTR_PHI_R:%.*]] = phi ptr [ [[PTR_1:%.*]], %[[LOOP]] ], [ [[VAL_0]], %[[ENTRY]] ]
-; CHECK-NEXT:    [[PTR_1]] = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-; CHECK-NEXT:    br i1 [[CMP_0]], label %[[LOOP]], label %[[SINK:.*]]
-; CHECK:       [[SINK]]:
-; CHECK-NEXT:    [[VAL_SINK:%.*]] = load <2 x i64>, ptr [[PTR_PHI_R]], align 16
-; CHECK-NEXT:    ret <2 x i64> [[VAL_SINK]]
-;
-entry:
-  %coerce = alloca %struct.type, align 16, addrspace(5)
-  call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 16 %coerce, ptr addrspace(4) align 16 %0, i64 4096, i1 false)
-  %ptr.0 = getelementptr inbounds i8, ptr addrspace(5) %coerce, i64 0
-  %val.0 = addrspacecast ptr addrspace(5) %ptr.0 to ptr
-  br label %loop
-
-loop:
-  %ptr.phi = phi ptr [ %val.1, %loop ], [ %val.0, %entry ]
-  %ptr.1 = load ptr, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @g1, i32 32), align 16
-  %val.1 = getelementptr inbounds nuw i8, ptr %ptr.1, i64 0
-  br i1 %cmp.0, label %loop, label %sink
-
-sink:
-  %val.sink = load <2 x i64>, ptr %ptr.phi, align 16
-  ret <2 x i64> %val.sink
-}
-
-declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0

From 6f9cd79fa2f43b8128be3e4386ee182ad5a843cc Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Mon, 16 Jun 2025 10:07:56 -0700
Subject: [PATCH 614/851] [InstSimplify] Add basic simplifications for
 vp.reverse (#144112)

Directly modeled after what we do for vector.reverse, but with
restrictions on EVL and mask added.
---
 llvm/lib/Analysis/InstructionSimplify.cpp     | 17 +++++++++++++++++
 .../Transforms/InstSimplify/vp-reverse.ll     | 19 ++++++-------------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index e397a228afee0..d1ac8d9fbdfd1 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6969,6 +6969,23 @@ static Value *simplifyIntrinsic(CallBase *Call, Value *Callee,
     }
     return nullptr;
   }
+  case Intrinsic::experimental_vp_reverse: {
+    Value *Vec = Call->getArgOperand(0);
+    Value *Mask = Call->getArgOperand(1);
+    Value *EVL = Call->getArgOperand(2);
+
+    Value *X;
+    // vp.reverse(vp.reverse(X)) == X (with all ones mask and matching EVL)
+    if (match(Mask, m_AllOnes()) &&
+        match(Vec, m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+                       m_Value(X), m_AllOnes(), m_Specific(EVL))))
+      return X;
+
+    // vp.reverse(splat(X)) -> splat(X) (regardless of mask and EVL)
+    if (isSplatValue(Vec))
+      return Vec;
+    return nullptr;
+  }
   default:
     return nullptr;
   }
diff --git a/llvm/test/Transforms/InstSimplify/vp-reverse.ll b/llvm/test/Transforms/InstSimplify/vp-reverse.ll
index 3c3bb871dc610..f19a2ac8ca9e1 100644
--- a/llvm/test/Transforms/InstSimplify/vp-reverse.ll
+++ b/llvm/test/Transforms/InstSimplify/vp-reverse.ll
@@ -3,9 +3,7 @@
 
 define <vscale x 4 x i32> @rev_of_rev(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @rev_of_rev(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[RES:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_REV]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[RES]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A:%.*]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %res = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a.rev, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -25,8 +23,7 @@ define <vscale x 4 x i32> @rev_of_rev_diffevl(<vscale x 4 x i32> %a, i32 %evl) {
 
 define <vscale x 4 x i32> @rev_of_poison(i32 %evl) {
 ; CHECK-LABEL: @rev_of_poison(
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> poison
 ;
   %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> poison, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x i32> %rev
@@ -34,8 +31,7 @@ define <vscale x 4 x i32> @rev_of_poison(i32 %evl) {
 
 define <vscale x 4 x i32> @rev_of_undef(i32 %evl) {
 ; CHECK-LABEL: @rev_of_undef(
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> undef
 ;
   %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> undef, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x i32> %rev
@@ -43,8 +39,7 @@ define <vscale x 4 x i32> @rev_of_undef(i32 %evl) {
 
 define <vscale x 4 x i32> @rev_of_zero(i32 %evl) {
 ; CHECK-LABEL: @rev_of_zero(
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
 ;
   %rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 4 x i32> %rev
@@ -54,8 +49,7 @@ define <vscale x 4 x i32> @rev_of_splat(i32 %a, i32 %evl) {
 ; CHECK-LABEL: @rev_of_splat(
 ; CHECK-NEXT:    [[A_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[A_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[A_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_VEC]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A_VEC]]
 ;
   %a.ins = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
   %a.vec = shufflevector <vscale x 4 x i32> %a.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -67,8 +61,7 @@ define <vscale x 4 x i32> @rev_of_splat2(i32 %a, <vscale x 4 x i1> %m, i32 %evl)
 ; CHECK-LABEL: @rev_of_splat2(
 ; CHECK-NEXT:    [[A_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
 ; CHECK-NEXT:    [[A_VEC:%.*]] = shufflevector <vscale x 4 x i32> [[A_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A_VEC]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[REV]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[A_VEC]]
 ;
   %a.ins = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
   %a.vec = shufflevector <vscale x 4 x i32> %a.ins, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer

From 90d62e0ae352e67d808f94ffb6d215d033f4ec22 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Mon, 16 Jun 2025 10:20:09 -0700
Subject: [PATCH 615/851] [RISCV][TTI] Refine reverse shuffle costing for high
 LMUL (#144155)

This contains two closely related changes:
1) Explicitly recurse on the i1 case - "3" happens to be the right
   magic constant at m1, but is not otherwise correct, and we're
   better off deferring this to existing logic.
2) Match the lowering for high LMUL shuffles - we've switched to using
   a linear number of m1 vrgather instead of a single big vrgather.
   This results in substantially faster (but also larger) code for
   reverse shuffles larger than m1.  Note that fixed vectors need
   a slide at the end, but scalable ones don't.

This will have the effect of biasing the vectorizer towards larger
(particularly scalable larger) vector factors. This increases VF for the
s112 and s1112 loops from TSVC_2 (in all configurations).

We could refine the high LMUL estimates a bit more, but I think getting
the linear scaling right is probably close enough for the moment.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 84 ++++++++++++++-----
 .../Analysis/CostModel/RISCV/rvv-shuffle.ll   | 68 +++++++--------
 .../CostModel/RISCV/shuffle-reverse.ll        | 52 ++++++------
 .../RISCV/riscv-vector-reverse.ll             | 20 ++---
 4 files changed, 132 insertions(+), 92 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bee47527cf428..fcc9d3977e5cd 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -602,6 +602,15 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
   return FirstSlideCost + SecondSlideCost + MaskCost;
 }
 
+// Consolidate!
+static MVT getLMUL1VT(MVT VT) {
+  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
+         "Unexpected vector MVT");
+  return MVT::getScalableVectorVT(
+      VT.getVectorElementType(),
+      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
+}
+
 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                              VectorType *Tp, ArrayRef<int> Mask,
                                              TTI::TargetCostKind CostKind,
@@ -840,33 +849,64 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
   }
   case TTI::SK_Reverse: {
+
+    if (!LT.second.isVector())
+      return InstructionCost::getInvalid();
+
     // TODO: Cases to improve here:
     // * Illegal vector types
     // * i64 on RV32
-    // * i1 vector
-    // At low LMUL, most of the cost is producing the vrgather index register.
-    // At high LMUL, the cost of the vrgather itself will dominate.
-    // Example sequence:
-    //   csrr a0, vlenb
-    //   srli a0, a0, 3
-    //   addi a0, a0, -1
-    //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
-    //   vid.v v9
-    //   vrsub.vx v10, v9, a0
-    //   vrgather.vv v9, v8, v10
-    InstructionCost LenCost = 3;
+    if (Tp->getElementType()->isIntegerTy(1)) {
+      VectorType *WideTy =
+          VectorType::get(IntegerType::get(Tp->getContext(), 8),
+                          cast<VectorType>(Tp)->getElementCount());
+      return getCastInstrCost(Instruction::ZExt, WideTy, Tp,
+                              TTI::CastContextHint::None, CostKind) +
+             getShuffleCost(TTI::SK_Reverse, WideTy, {}, CostKind, 0, nullptr) +
+             getCastInstrCost(Instruction::Trunc, Tp, WideTy,
+                              TTI::CastContextHint::None, CostKind);
+    }
+
+    MVT ContainerVT = LT.second;
     if (LT.second.isFixedLengthVector())
-      // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
-      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
-    unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
-    if (LT.second.isFixedLengthVector() &&
-        isInt<5>(LT.second.getVectorNumElements() - 1))
-      Opcodes[1] = RISCV::VRSUB_VI;
+      ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
+    MVT M1VT = getLMUL1VT(ContainerVT);
+    if (ContainerVT.bitsLE(M1VT)) {
+      // Example sequence:
+      //   csrr a0, vlenb
+      //   srli a0, a0, 3
+      //   addi a0, a0, -1
+      //   vsetvli a1, zero, e8, mf8, ta, mu (ignored)
+      //   vid.v v9
+      //   vrsub.vx v10, v9, a0
+      //   vrgather.vv v9, v8, v10
+      InstructionCost LenCost = 3;
+      if (LT.second.isFixedLengthVector())
+        // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
+        LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
+      unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
+      if (LT.second.isFixedLengthVector() &&
+          isInt<5>(LT.second.getVectorNumElements() - 1))
+        Opcodes[1] = RISCV::VRSUB_VI;
+      InstructionCost GatherCost =
+          getRISCVInstructionCost(Opcodes, LT.second, CostKind);
+      return LT.first * (LenCost + GatherCost);
+    }
+
+    // At high LMUL, we split into a series of M1 reverses (see
+    // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
+    // the resulting gap at the bottom (for fixed vectors only).  The important
+    // bit is that the cost scales linearly, not quadratically with LMUL.
+    unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
+    InstructionCost FixedCost =
+        getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
+    unsigned Ratio =
+        ContainerVT.getVectorMinNumElements() / M1VT.getVectorMinNumElements();
     InstructionCost GatherCost =
-        getRISCVInstructionCost(Opcodes, LT.second, CostKind);
-    // Mask operation additionally required extend and truncate
-    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
-    return LT.first * (LenCost + GatherCost + ExtendCost);
+        getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
+    InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
+      getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
+    return FixedCost + LT.first * (GatherCost + SlideCost);
   }
   }
   return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
index e1bca71614125..437a9af8fcc83 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll
@@ -78,47 +78,47 @@ declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x
 
 define void @vector_reverse() {
 ; CHECK-LABEL: 'vector_reverse'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 83 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 166 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 332 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'vector_reverse'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv32i8 = call <vscale x 32 x i8> @llvm.vector.reverse.nxv32i8(<vscale x 32 x i8> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i16 = call <vscale x 2 x i16> @llvm.vector.reverse.nxv2i16(<vscale x 2 x i16> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i16 = call <vscale x 4 x i16> @llvm.vector.reverse.nxv4i16(<vscale x 4 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv8i16 = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv16i16 = call <vscale x 16 x i16> @llvm.vector.reverse.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv4i32 = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv8i32 = call <vscale x 8 x i32> @llvm.vector.reverse.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv16i32 = call <vscale x 16 x i32> @llvm.vector.reverse.nxv16i32(<vscale x 16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv32i32 = call <vscale x 32 x i32> @llvm.vector.reverse.nxv32i32(<vscale x 32 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %reverse_nxv2i64 = call <vscale x 2 x i64> @llvm.vector.reverse.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %reverse_nxv4i64 = call <vscale x 4 x i64> @llvm.vector.reverse.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %reverse_nxv8i64 = call <vscale x 8 x i64> @llvm.vector.reverse.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %reverse_nxv16i64 = call <vscale x 16 x i64> @llvm.vector.reverse.nxv16i64(<vscale x 16 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 37 for instruction: %reverse_nxv32i64 = call <vscale x 32 x i64> @llvm.vector.reverse.nxv32i64(<vscale x 32 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %reverse_nxv16i1 = call <vscale x 16 x i1> @llvm.vector.reverse.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv8i1 = call <vscale x 8 x i1> @llvm.vector.reverse.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv4i1 = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %reverse_nxv2i1 = call <vscale x 2 x i1> @llvm.vector.reverse.nxv2i1(<vscale x 2 x i1> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %reverse_nxv16i8 = call <vscale x 16 x i8> @llvm.vector.reverse.nxv16i8(<vscale x 16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
index 8f3219861f2fd..d97d70e99ccbf 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
@@ -11,10 +11,10 @@
 define void @reverse() {
 ;
 ; CHECK-LABEL: 'reverse'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -22,31 +22,31 @@ define void @reverse() {
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reverse'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -54,24 +54,24 @@ define void @reverse() {
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16i32 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8i64 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i64 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index b026e68685812..ba4c4b6d58add 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -38,10 +38,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -147,10 +147,10 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -178,7 +178,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 32
+; CHECK-NEXT:  LV: Loop cost is 24
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.
@@ -447,10 +447,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -556,10 +556,10 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
-; CHECK-NEXT:  LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
+; CHECK-NEXT:  LV: Found an estimated cost of 9 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
 ; CHECK-NEXT:  LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
 ; CHECK-NEXT:  LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
@@ -587,7 +587,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT:  LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT:  LV: Loop does not require scalar epilogue
-; CHECK-NEXT:  LV: Loop cost is 34
+; CHECK-NEXT:  LV: Loop cost is 26
 ; CHECK-NEXT:  LV: IC is 1
 ; CHECK-NEXT:  LV: VF is vscale x 4
 ; CHECK-NEXT:  LV: Not Interleaving.

From 267b859fc60acda510027bd6139c54d660c6fb21 Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Mon, 16 Jun 2025 19:35:34 +0200
Subject: [PATCH 616/851] [CIR] Implement folder for VecCmpOp (#143322)

This change adds a folder for the VecCmpOp

Issue https://github.com/llvm/llvm-project/issues/136487
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |   2 +
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  98 ++++++++
 .../Dialect/Transforms/CIRCanonicalize.cpp    |   2 +-
 clang/test/CIR/Transforms/vector-cmp-fold.cir | 227 ++++++++++++++++++
 4 files changed, 328 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CIR/Transforms/vector-cmp-fold.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index bd36d228578b7..8dd1f0ce361d7 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2155,6 +2155,8 @@ def VecCmpOp : CIR_Op<"vec.cmp", [Pure, SameTypeOperands]> {
     `(` $kind `,` $lhs `,` $rhs `)` `:` qualified(type($lhs)) `,`
     qualified(type($result)) attr-dict
   }];
+
+  let hasFolder = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 5578d4f5825a9..3fcb0213b219a 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1589,6 +1589,104 @@ OpFoldResult cir::VecExtractOp::fold(FoldAdaptor adaptor) {
   return elements[index];
 }
 
+//===----------------------------------------------------------------------===//
+// VecCmpOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult cir::VecCmpOp::fold(FoldAdaptor adaptor) {
+  auto lhsVecAttr =
+      mlir::dyn_cast_if_present<cir::ConstVectorAttr>(adaptor.getLhs());
+  auto rhsVecAttr =
+      mlir::dyn_cast_if_present<cir::ConstVectorAttr>(adaptor.getRhs());
+  if (!lhsVecAttr || !rhsVecAttr)
+    return {};
+
+  mlir::Type inputElemTy =
+      mlir::cast<cir::VectorType>(lhsVecAttr.getType()).getElementType();
+  if (!isAnyIntegerOrFloatingPointType(inputElemTy))
+    return {};
+
+  cir::CmpOpKind opKind = adaptor.getKind();
+  mlir::ArrayAttr lhsVecElhs = lhsVecAttr.getElts();
+  mlir::ArrayAttr rhsVecElhs = rhsVecAttr.getElts();
+  uint64_t vecSize = lhsVecElhs.size();
+
+  SmallVector<mlir::Attribute, 16> elements(vecSize);
+  bool isIntAttr = vecSize && mlir::isa<cir::IntAttr>(lhsVecElhs[0]);
+  for (uint64_t i = 0; i < vecSize; i++) {
+    mlir::Attribute lhsAttr = lhsVecElhs[i];
+    mlir::Attribute rhsAttr = rhsVecElhs[i];
+    int cmpResult = 0;
+    switch (opKind) {
+    case cir::CmpOpKind::lt: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() <
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() <
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::le: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() <=
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() <=
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::gt: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() >
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() >
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::ge: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() >=
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() >=
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::eq: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() ==
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() ==
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    case cir::CmpOpKind::ne: {
+      if (isIntAttr) {
+        cmpResult = mlir::cast<cir::IntAttr>(lhsAttr).getSInt() !=
+                    mlir::cast<cir::IntAttr>(rhsAttr).getSInt();
+      } else {
+        cmpResult = mlir::cast<cir::FPAttr>(lhsAttr).getValue() !=
+                    mlir::cast<cir::FPAttr>(rhsAttr).getValue();
+      }
+      break;
+    }
+    }
+
+    elements[i] = cir::IntAttr::get(getType().getElementType(), cmpResult);
+  }
+
+  return cir::ConstVectorAttr::get(
+      getType(), mlir::ArrayAttr::get(getContext(), elements));
+}
+
 //===----------------------------------------------------------------------===//
 // VecShuffleOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index 20c634d6c66f6..f07e234e5e84c 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -141,7 +141,7 @@ void CIRCanonicalizePass::runOnOperation() {
     // Many operations are here to perform a manual `fold` in
     // applyOpPatternsGreedily.
     if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SwitchOp, SelectOp, UnaryOp,
-            ComplexCreateOp, VecCreateOp, VecExtractOp, VecShuffleOp,
+            ComplexCreateOp, VecCmpOp, VecCreateOp, VecExtractOp, VecShuffleOp,
             VecShuffleDynamicOp, VecTernaryOp>(op))
       ops.push_back(op);
   });
diff --git a/clang/test/CIR/Transforms/vector-cmp-fold.cir b/clang/test/CIR/Transforms/vector-cmp-fold.cir
new file mode 100644
index 0000000000000..b207fc08748e2
--- /dev/null
+++ b/clang/test/CIR/Transforms/vector-cmp-fold.cir
@@ -0,0 +1,227 @@
+// RUN: cir-opt %s -cir-canonicalize -o - -split-input-file | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(eq, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(ne, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(lt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(le, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<3> : !s32i, #cir.int<5> : !s32i, #cir.int<7> : !s32i]> : !cir.vector<4 x !s32i>
+    %vec_2 = cir.const #cir.const_vector<[#cir.int<2> : !s32i, #cir.int<4> : !s32i, #cir.int<6> : !s32i, #cir.int<8> : !s32i]> : !cir.vector<4 x !s32i>
+    %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(eq, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(ne, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(lt, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(le, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<1> : !s32i, #cir.int<1> : !s32i,
+  // CHECK-SAME: #cir.int<1> : !s32i, #cir.int<1> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(gt, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}
+
+// -----
+
+!s32i = !cir.int<s, 32>
+
+module  {
+  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+    %vec_1 = cir.const #cir.const_vector<[#cir.fp<1.000000e+00> : !cir.float, #cir.fp<2.000000e+00>
+      : !cir.float, #cir.fp<3.000000e+00> : !cir.float, #cir.fp<4.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %vec_2 = cir.const #cir.const_vector<[#cir.fp<5.000000e+00> : !cir.float, #cir.fp<6.000000e+00>
+      : !cir.float, #cir.fp<7.000000e+00> : !cir.float, #cir.fp<8.000000e+00> : !cir.float]> : !cir.vector<4 x !cir.float>
+    %new_vec = cir.vec.cmp(ge, %vec_1, %vec_2) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
+    cir.return %new_vec : !cir.vector<4 x !s32i>
+  }
+
+  // CHECK:  cir.func @fold_cmp_vector_op_test() -> !cir.vector<4 x !s32i> {
+  // CHECK-NEXT: %[[RES:.*]] = cir.const #cir.const_vector<[#cir.int<0> : !s32i, #cir.int<0> : !s32i,
+  // CHECK-SAME: #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.vector<4 x !s32i>
+  // CHECK-NEXT: cir.return %[[RES]] : !cir.vector<4 x !s32i>
+}

From 4cd3e41bce449a10f431a3112b6cb8d7bc1b09cf Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Mon, 16 Jun 2025 11:03:21 -0700
Subject: [PATCH 617/851] [libc] Removed public function calls in table.h
 (#144168)

Removed strcmp, strlen, and memset calls from table.h and replaced them
with internal functions.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 libc/src/__support/HashTable/CMakeLists.txt      |  5 ++---
 libc/src/__support/HashTable/table.h             | 15 ++++++++-------
 libc/test/src/__support/HashTable/table_test.cpp |  4 +++-
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/libc/src/__support/HashTable/CMakeLists.txt b/libc/src/__support/HashTable/CMakeLists.txt
index 3c487e4f29264..a1de0680cc7d5 100644
--- a/libc/src/__support/HashTable/CMakeLists.txt
+++ b/libc/src/__support/HashTable/CMakeLists.txt
@@ -32,9 +32,8 @@ add_header_library(
     libc.src.__support.macros.attributes
     libc.src.__support.macros.optimization
     libc.src.__support.memory_size
-    libc.src.string.memset
-    libc.src.string.strcmp
-    libc.src.string.strlen
+    libc.src.string.memory_utils.inline_strcmp
+    libc.src.string.string_utils
 )
 
 add_header_library(
diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h
index 13badb90dbfde..10dd9711afbf6 100644
--- a/libc/src/__support/HashTable/table.h
+++ b/libc/src/__support/HashTable/table.h
@@ -18,9 +18,8 @@
 #include "src/__support/macros/config.h"
 #include "src/__support/macros/optimization.h"
 #include "src/__support/memory_size.h"
-#include "src/string/memset.h"
-#include "src/string/strcmp.h"
-#include "src/string/strlen.h"
+#include "src/string/memory_utils/inline_strcmp.h"
+#include "src/string/string_utils.h"
 #include <stddef.h>
 #include <stdint.h>
 
@@ -158,7 +157,9 @@ struct HashTable {
       for (size_t i : masks) {
         size_t index = (pos + i) & entries_mask;
         ENTRY &entry = this->entry(index);
-        if (LIBC_LIKELY(entry.key != nullptr && strcmp(entry.key, key) == 0))
+        auto comp = [](char l, char r) -> int { return l - r; };
+        if (LIBC_LIKELY(entry.key != nullptr &&
+                        inline_strcmp(entry.key, key, comp) == 0))
           return index;
       }
       BitMask available = ctrls.mask_available();
@@ -176,7 +177,7 @@ struct HashTable {
 
   LIBC_INLINE uint64_t oneshot_hash(const char *key) const {
     LIBC_NAMESPACE::internal::HashState hasher = state;
-    hasher.update(key, strlen(key));
+    hasher.update(key, internal::string_length(key));
     return hasher.finish();
   }
 
@@ -282,8 +283,8 @@ struct HashTable {
       table->entries_mask = entries - 1u;
       table->available_slots = entries / 8 * 7;
       table->state = HashState{randomness};
-      memset(&table->control(0), 0x80, ctrl_sizes);
-      memset(mem, 0, table->offset_from_entries());
+      __builtin_memset(&table->control(0), 0x80, ctrl_sizes);
+      __builtin_memset(mem, 0, table->offset_from_entries());
     }
     return table;
   }
diff --git a/libc/test/src/__support/HashTable/table_test.cpp b/libc/test/src/__support/HashTable/table_test.cpp
index a579bfabb2d7b..ba9849b6b5af9 100644
--- a/libc/test/src/__support/HashTable/table_test.cpp
+++ b/libc/test/src/__support/HashTable/table_test.cpp
@@ -108,7 +108,9 @@ TEST(LlvmLibcTableTest, Insertion) {
             static_cast<void *>(keys[CAP].bytes));
 
   for (size_t i = 0; i <= CAP; ++i) {
-    ASSERT_EQ(strcmp(table->find(keys[i].bytes)->key, keys[i].bytes), 0);
+    auto comp = [](char l, char r) -> int { return l - r; };
+    ASSERT_EQ(
+        inline_strcmp(table->find(keys[i].bytes)->key, keys[i].bytes, comp), 0);
   }
   for (size_t i = CAP + 1; i < 256; ++i) {
     ASSERT_EQ(table->find(keys[i].bytes), static_cast<ENTRY *>(nullptr));

From ffc4d87f9b2b57f7020fa5fd0f1d3003370c2d80 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Mon, 16 Jun 2025 11:03:48 -0700
Subject: [PATCH 618/851] [llvm] annotate interfaces in Passes for DLL export
 (#143794)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/Passes` library and
other pass-related headers. These annotations currently have no
meaningful impact on the LLVM build; however, they are a prerequisite to
support an LLVM Windows DLL (shared library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

The bulk of these changes were generated automatically using the
[Interface Definition Scanner (IDS)](https://github.com/compnerd/ids)
tool, followed formatting with `git clang-format`.

The following manual adjustments were also applied after running IDS on
Linux:
- Remove the redundant declaration of the `initializeKCFIPass` function
from llvm/include/llvm/InitializePasses.h because IDS only
auto-annotates the first declaration it encounters, and the second
un-annotated declaration results in an MSVC warning
- Add `LLVM_ABI` to a number of private `AnalysisKey` fields in classes
that extend the `AnalysisInfoMixin` template class.
- Add `LLVM_ABI` to the `ChangeReporter` and `TextChangeReporter`
template class definitions in
llvm/include/llvm/Passes/StandardInstrumentations.h and remove the
extern template instantiations. This is the only way I've found to get
everything compiling warning-free when building a DLL because both
template classes have methods implemented out-of-line.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/InitializePasses.h          | 589 +++++++++---------
 llvm/include/llvm/Pass.h                      |  13 +-
 llvm/include/llvm/PassAnalysisSupport.h       |  16 +-
 llvm/include/llvm/PassRegistry.h              |  17 +-
 llvm/include/llvm/PassSupport.h               |   3 +-
 llvm/include/llvm/Passes/OptimizationLevel.h  |  13 +-
 llvm/include/llvm/Passes/PassBuilder.h        | 191 +++---
 llvm/include/llvm/Passes/PassPlugin.h         |   2 +-
 .../llvm/Passes/StandardInstrumentations.h    |  68 +-
 9 files changed, 469 insertions(+), 443 deletions(-)

diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 42610d505c2bd..1b5b1d5888824 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -14,318 +14,331 @@
 #ifndef LLVM_INITIALIZEPASSES_H
 #define LLVM_INITIALIZEPASSES_H
 
+#include "llvm/Support/Compiler.h"
+
 namespace llvm {
 
 class PassRegistry;
 
 /// Initialize all passes linked into the Core library.
-void initializeCore(PassRegistry &);
+LLVM_ABI void initializeCore(PassRegistry &);
 
 /// Initialize all passes linked into the TransformUtils library.
-void initializeTransformUtils(PassRegistry &);
+LLVM_ABI void initializeTransformUtils(PassRegistry &);
 
 /// Initialize all passes linked into the ScalarOpts library.
-void initializeScalarOpts(PassRegistry &);
+LLVM_ABI void initializeScalarOpts(PassRegistry &);
 
 /// Initialize all passes linked into the Vectorize library.
-void initializeVectorization(PassRegistry &);
+LLVM_ABI void initializeVectorization(PassRegistry &);
 
 /// Initialize all passes linked into the InstCombine library.
-void initializeInstCombine(PassRegistry &);
+LLVM_ABI void initializeInstCombine(PassRegistry &);
 
 /// Initialize all passes linked into the IPO library.
-void initializeIPO(PassRegistry &);
+LLVM_ABI void initializeIPO(PassRegistry &);
 
 /// Initialize all passes linked into the Analysis library.
-void initializeAnalysis(PassRegistry &);
+LLVM_ABI void initializeAnalysis(PassRegistry &);
 
 /// Initialize all passes linked into the CodeGen library.
-void initializeCodeGen(PassRegistry &);
+LLVM_ABI void initializeCodeGen(PassRegistry &);
 
 /// Initialize all passes linked into the GlobalISel library.
-void initializeGlobalISel(PassRegistry &);
+LLVM_ABI void initializeGlobalISel(PassRegistry &);
 
 /// Initialize all passes linked into the CodeGen library.
-void initializeTarget(PassRegistry &);
+LLVM_ABI void initializeTarget(PassRegistry &);
 
-void initializeAAResultsWrapperPassPass(PassRegistry &);
-void initializeAlwaysInlinerLegacyPassPass(PassRegistry &);
-void initializeAssignmentTrackingAnalysisPass(PassRegistry &);
-void initializeAssumptionCacheTrackerPass(PassRegistry &);
-void initializeAtomicExpandLegacyPass(PassRegistry &);
-void initializeBasicBlockPathCloningPass(PassRegistry &);
-void initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &);
-void initializeBasicBlockSectionsPass(PassRegistry &);
-void initializeBarrierNoopPass(PassRegistry &);
-void initializeBasicAAWrapperPassPass(PassRegistry &);
-void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry &);
-void initializeBranchFolderLegacyPass(PassRegistry &);
-void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry &);
-void initializeBranchRelaxationLegacyPass(PassRegistry &);
-void initializeBreakCriticalEdgesPass(PassRegistry &);
-void initializeBreakFalseDepsPass(PassRegistry &);
-void initializeCanonicalizeFreezeInLoopsPass(PassRegistry &);
-void initializeCFGSimplifyPassPass(PassRegistry &);
-void initializeCFGuardPass(PassRegistry &);
-void initializeCFGuardLongjmpPass(PassRegistry &);
-void initializeCFIFixupPass(PassRegistry &);
-void initializeCFIInstrInserterPass(PassRegistry &);
-void initializeCallBrPreparePass(PassRegistry &);
-void initializeCallGraphDOTPrinterPass(PassRegistry &);
-void initializeCallGraphViewerPass(PassRegistry &);
-void initializeCallGraphWrapperPassPass(PassRegistry &);
-void initializeCheckDebugMachineModulePass(PassRegistry &);
-void initializeCodeGenPrepareLegacyPassPass(PassRegistry &);
-void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &);
-void initializeConstantHoistingLegacyPassPass(PassRegistry &);
-void initializeCycleInfoWrapperPassPass(PassRegistry &);
-void initializeDAEPass(PassRegistry &);
-void initializeDAHPass(PassRegistry &);
-void initializeDCELegacyPassPass(PassRegistry &);
-void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &);
-void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &);
-void initializeDXILResourceBindingWrapperPassPass(PassRegistry &);
-void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &);
-void initializeDXILResourceTypeWrapperPassPass(PassRegistry &);
-void initializeDXILResourceWrapperPassPass(PassRegistry &);
-void initializeDeadMachineInstructionElimPass(PassRegistry &);
-void initializeDebugifyMachineModulePass(PassRegistry &);
-void initializeDependenceAnalysisWrapperPassPass(PassRegistry &);
-void initializeDetectDeadLanesLegacyPass(PassRegistry &);
-void initializeDomOnlyPrinterWrapperPassPass(PassRegistry &);
-void initializeDomOnlyViewerWrapperPassPass(PassRegistry &);
-void initializeDomPrinterWrapperPassPass(PassRegistry &);
-void initializeDomViewerWrapperPassPass(PassRegistry &);
-void initializeDominanceFrontierWrapperPassPass(PassRegistry &);
-void initializeDominatorTreeWrapperPassPass(PassRegistry &);
-void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
-void initializeEarlyCSELegacyPassPass(PassRegistry &);
-void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
-void initializeEarlyIfConverterLegacyPass(PassRegistry &);
-void initializeEarlyIfPredicatorPass(PassRegistry &);
-void initializeEarlyMachineLICMPass(PassRegistry &);
-void initializeEarlyTailDuplicateLegacyPass(PassRegistry &);
-void initializeEdgeBundlesWrapperLegacyPass(PassRegistry &);
-void initializeEHContGuardTargetsPass(PassRegistry &);
-void initializeExpandFpLegacyPassPass(PassRegistry &);
-void initializeExpandLargeDivRemLegacyPassPass(PassRegistry &);
-void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
-void initializeExpandPostRALegacyPass(PassRegistry &);
-void initializeExpandReductionsPass(PassRegistry &);
-void initializeExpandVariadicsPass(PassRegistry &);
-void initializeExternalAAWrapperPassPass(PassRegistry &);
-void initializeFEntryInserterLegacyPass(PassRegistry &);
-void initializeFinalizeISelPass(PassRegistry &);
-void initializeFinalizeMachineBundlesPass(PassRegistry &);
-void initializeFixIrreduciblePass(PassRegistry &);
-void initializeFixupStatepointCallerSavedLegacyPass(PassRegistry &);
-void initializeFlattenCFGLegacyPassPass(PassRegistry &);
-void initializeFuncletLayoutPass(PassRegistry &);
-void initializeGCEmptyBasicBlocksPass(PassRegistry &);
-void initializeGCMachineCodeAnalysisPass(PassRegistry &);
-void initializeGCModuleInfoPass(PassRegistry &);
-void initializeGVNLegacyPassPass(PassRegistry &);
-void initializeGlobalMergeFuncPassWrapperPass(PassRegistry &);
-void initializeGlobalMergePass(PassRegistry &);
-void initializeGlobalsAAWrapperPassPass(PassRegistry &);
-void initializeHardwareLoopsLegacyPass(PassRegistry &);
-void initializeMIRProfileLoaderPassPass(PassRegistry &);
-void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &);
-void initializeIRTranslatorPass(PassRegistry &);
-void initializeIVUsersWrapperPassPass(PassRegistry &);
-void initializeIfConverterPass(PassRegistry &);
-void initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &);
-void initializeImplicitNullChecksPass(PassRegistry &);
-void initializeIndirectBrExpandLegacyPassPass(PassRegistry &);
-void initializeInferAddressSpacesPass(PassRegistry &);
-void initializeInstSimplifyLegacyPassPass(PassRegistry &);
-void initializeInstructionCombiningPassPass(PassRegistry &);
-void initializeInstructionSelectPass(PassRegistry &);
-void initializeInterleavedAccessPass(PassRegistry &);
-void initializeInterleavedLoadCombinePass(PassRegistry &);
-void initializeJMCInstrumenterPass(PassRegistry &);
-void initializeKCFIPass(PassRegistry &);
-void initializeLCSSAVerificationPassPass(PassRegistry &);
-void initializeLCSSAWrapperPassPass(PassRegistry &);
-void initializeLazyBFIPassPass(PassRegistry &);
-void initializeLazyBlockFrequencyInfoPassPass(PassRegistry &);
-void initializeLazyBranchProbabilityInfoPassPass(PassRegistry &);
-void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry &);
-void initializeLazyValueInfoWrapperPassPass(PassRegistry &);
-void initializeLegacyLICMPassPass(PassRegistry &);
-void initializeLegalizerPass(PassRegistry &);
-void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &);
-void initializeGISelValueTrackingAnalysisLegacyPass(PassRegistry &);
-void initializeLiveDebugValuesLegacyPass(PassRegistry &);
-void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &);
-void initializeLiveIntervalsWrapperPassPass(PassRegistry &);
-void initializeLiveRangeShrinkPass(PassRegistry &);
-void initializeLiveRegMatrixWrapperLegacyPass(PassRegistry &);
-void initializeLiveStacksWrapperLegacyPass(PassRegistry &);
-void initializeLiveVariablesWrapperPassPass(PassRegistry &);
-void initializeLoadStoreOptPass(PassRegistry &);
-void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &);
-void initializeLocalStackSlotPassPass(PassRegistry &);
-void initializeLocalizerPass(PassRegistry &);
-void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &);
-void initializeLoopExtractorLegacyPassPass(PassRegistry &);
-void initializeLoopInfoWrapperPassPass(PassRegistry &);
-void initializeLoopPassPass(PassRegistry &);
-void initializeLoopSimplifyPass(PassRegistry &);
-void initializeLoopStrengthReducePass(PassRegistry &);
-void initializeLoopTermFoldPass(PassRegistry &);
-void initializeLoopUnrollPass(PassRegistry &);
-void initializeLowerAtomicLegacyPassPass(PassRegistry &);
-void initializeLowerEmuTLSPass(PassRegistry &);
-void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &);
-void initializeLowerIntrinsicsPass(PassRegistry &);
-void initializeLowerInvokeLegacyPassPass(PassRegistry &);
-void initializeLowerSwitchLegacyPassPass(PassRegistry &);
-void initializeKCFIPass(PassRegistry &);
-void initializeMIRAddFSDiscriminatorsPass(PassRegistry &);
-void initializeMIRCanonicalizerPass(PassRegistry &);
-void initializeMIRNamerPass(PassRegistry &);
-void initializeMIRPrintingPassPass(PassRegistry &);
-void initializeMachineBlockFrequencyInfoWrapperPassPass(PassRegistry &);
-void initializeMachineBlockPlacementLegacyPass(PassRegistry &);
-void initializeMachineBlockPlacementStatsLegacyPass(PassRegistry &);
-void initializeMachineBranchProbabilityInfoWrapperPassPass(PassRegistry &);
-void initializeMachineCFGPrinterPass(PassRegistry &);
-void initializeMachineCSELegacyPass(PassRegistry &);
-void initializeMachineCombinerPass(PassRegistry &);
-void initializeMachineCopyPropagationLegacyPass(PassRegistry &);
-void initializeMachineCycleInfoPrinterLegacyPass(PassRegistry &);
-void initializeMachineCycleInfoWrapperPassPass(PassRegistry &);
-void initializeMachineDominanceFrontierPass(PassRegistry &);
-void initializeMachineDominatorTreeWrapperPassPass(PassRegistry &);
-void initializeMachineFunctionPrinterPassPass(PassRegistry &);
-void initializeMachineFunctionSplitterPass(PassRegistry &);
-void initializeMachineLateInstrsCleanupLegacyPass(PassRegistry &);
-void initializeMachineLICMPass(PassRegistry &);
-void initializeMachineLoopInfoWrapperPassPass(PassRegistry &);
-void initializeMachineModuleInfoWrapperPassPass(PassRegistry &);
-void initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry &);
-void initializeMachineOutlinerPass(PassRegistry &);
-void initializeStaticDataProfileInfoWrapperPassPass(PassRegistry &);
-void initializeStaticDataAnnotatorPass(PassRegistry &);
-void initializeMachinePipelinerPass(PassRegistry &);
-void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &);
-void initializeMachineRegionInfoPassPass(PassRegistry &);
-void initializeMachineSanitizerBinaryMetadataLegacyPass(PassRegistry &);
-void initializeMachineSchedulerLegacyPass(PassRegistry &);
-void initializeMachineSinkingLegacyPass(PassRegistry &);
-void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &);
-void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &);
-void initializeMachineUniformityAnalysisPassPass(PassRegistry &);
-void initializeMachineVerifierLegacyPassPass(PassRegistry &);
-void initializeMemoryDependenceWrapperPassPass(PassRegistry &);
-void initializeMemorySSAWrapperPassPass(PassRegistry &);
-void initializeMergeICmpsLegacyPassPass(PassRegistry &);
-void initializeModuleSummaryIndexWrapperPassPass(PassRegistry &);
-void initializeModuloScheduleTestPass(PassRegistry &);
-void initializeNaryReassociateLegacyPassPass(PassRegistry &);
-void initializeObjCARCContractLegacyPassPass(PassRegistry &);
-void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &);
-void initializeOptimizePHIsLegacyPass(PassRegistry &);
-void initializePEILegacyPass(PassRegistry &);
-void initializePHIEliminationPass(PassRegistry &);
-void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &);
-void initializePatchableFunctionLegacyPass(PassRegistry &);
-void initializePeepholeOptimizerLegacyPass(PassRegistry &);
-void initializePhiValuesWrapperPassPass(PassRegistry &);
-void initializePhysicalRegisterUsageInfoWrapperLegacyPass(PassRegistry &);
-void initializePlaceBackedgeSafepointsLegacyPassPass(PassRegistry &);
-void initializePostDomOnlyPrinterWrapperPassPass(PassRegistry &);
-void initializePostDomOnlyViewerWrapperPassPass(PassRegistry &);
-void initializePostDomPrinterWrapperPassPass(PassRegistry &);
-void initializePostDomViewerWrapperPassPass(PassRegistry &);
-void initializePostDominatorTreeWrapperPassPass(PassRegistry &);
-void initializePostInlineEntryExitInstrumenterPass(PassRegistry &);
-void initializePostMachineSchedulerLegacyPass(PassRegistry &);
-void initializePostRAHazardRecognizerLegacyPass(PassRegistry &);
-void initializePostRAMachineSinkingPass(PassRegistry &);
-void initializePostRASchedulerLegacyPass(PassRegistry &);
-void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &);
-void initializePrintFunctionPassWrapperPass(PassRegistry &);
-void initializePrintModulePassWrapperPass(PassRegistry &);
-void initializeProcessImplicitDefsPass(PassRegistry &);
-void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &);
-void initializePromoteLegacyPassPass(PassRegistry &);
-void initializeRABasicPass(PassRegistry &);
-void initializePseudoProbeInserterPass(PassRegistry &);
-void initializeRAGreedyLegacyPass(PassRegistry &);
-void initializeReachingDefAnalysisPass(PassRegistry &);
-void initializeReassociateLegacyPassPass(PassRegistry &);
-void initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &);
-void initializeRegAllocFastPass(PassRegistry &);
-void initializeRegAllocPriorityAdvisorAnalysisLegacyPass(PassRegistry &);
-void initializeRegAllocScoringPass(PassRegistry &);
-void initializeRegBankSelectPass(PassRegistry &);
-void initializeRegToMemWrapperPassPass(PassRegistry &);
-void initializeRegUsageInfoCollectorLegacyPass(PassRegistry &);
-void initializeRegUsageInfoPropagationLegacyPass(PassRegistry &);
-void initializeRegionInfoPassPass(PassRegistry &);
-void initializeRegionOnlyPrinterPass(PassRegistry &);
-void initializeRegionOnlyViewerPass(PassRegistry &);
-void initializeRegionPrinterPass(PassRegistry &);
-void initializeRegionViewerPass(PassRegistry &);
-void initializeRegisterCoalescerLegacyPass(PassRegistry &);
-void initializeRemoveLoadsIntoFakeUsesLegacyPass(PassRegistry &);
-void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &);
-void initializeRenameIndependentSubregsLegacyPass(PassRegistry &);
-void initializeReplaceWithVeclibLegacyPass(PassRegistry &);
-void initializeResetMachineFunctionPass(PassRegistry &);
-void initializeSCEVAAWrapperPassPass(PassRegistry &);
-void initializeSROALegacyPassPass(PassRegistry &);
-void initializeSafeStackLegacyPassPass(PassRegistry &);
-void initializeSafepointIRVerifierPass(PassRegistry &);
-void initializeSelectOptimizePass(PassRegistry &);
-void initializeScalarEvolutionWrapperPassPass(PassRegistry &);
-void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &);
-void initializeScalarizerLegacyPassPass(PassRegistry &);
-void initializeScavengerTestPass(PassRegistry &);
-void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &);
-void initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &);
-void initializeShadowStackGCLoweringPass(PassRegistry &);
-void initializeShrinkWrapLegacyPass(PassRegistry &);
-void initializeSingleLoopExtractorPass(PassRegistry &);
-void initializeSinkingLegacyPassPass(PassRegistry &);
-void initializeSjLjEHPreparePass(PassRegistry &);
-void initializeSlotIndexesWrapperPassPass(PassRegistry &);
-void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &);
-void initializeSpillPlacementWrapperLegacyPass(PassRegistry &);
-void initializeStackColoringLegacyPass(PassRegistry &);
-void initializeStackFrameLayoutAnalysisLegacyPass(PassRegistry &);
-void initializeStaticDataSplitterPass(PassRegistry &);
-void initializeStackMapLivenessPass(PassRegistry &);
-void initializeStackProtectorPass(PassRegistry &);
-void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &);
-void initializeStackSafetyInfoWrapperPassPass(PassRegistry &);
-void initializeStackSlotColoringLegacyPass(PassRegistry &);
-void initializeStraightLineStrengthReduceLegacyPassPass(PassRegistry &);
-void initializeStripDebugMachineModulePass(PassRegistry &);
-void initializeStructurizeCFGLegacyPassPass(PassRegistry &);
-void initializeTailCallElimPass(PassRegistry &);
-void initializeTailDuplicateLegacyPass(PassRegistry &);
-void initializeTargetLibraryInfoWrapperPassPass(PassRegistry &);
-void initializeTargetPassConfigPass(PassRegistry &);
-void initializeTargetTransformInfoWrapperPassPass(PassRegistry &);
-void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &);
-void initializeTypeBasedAAWrapperPassPass(PassRegistry &);
-void initializeTypePromotionLegacyPass(PassRegistry &);
-void initializeInitUndefPass(PassRegistry &);
-void initializeUniformityInfoWrapperPassPass(PassRegistry &);
-void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &);
-void initializeUnpackMachineBundlesPass(PassRegistry &);
-void initializeUnreachableBlockElimLegacyPassPass(PassRegistry &);
-void initializeUnreachableMachineBlockElimLegacyPass(PassRegistry &);
-void initializeVerifierLegacyPassPass(PassRegistry &);
-void initializeVirtRegMapWrapperLegacyPass(PassRegistry &);
-void initializeVirtRegRewriterLegacyPass(PassRegistry &);
-void initializeWasmEHPreparePass(PassRegistry &);
-void initializeWinEHPreparePass(PassRegistry &);
-void initializeWriteBitcodePassPass(PassRegistry &);
-void initializeXRayInstrumentationLegacyPass(PassRegistry &);
+LLVM_ABI void initializeAAResultsWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeAlwaysInlinerLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeAssignmentTrackingAnalysisPass(PassRegistry &);
+LLVM_ABI void initializeAssumptionCacheTrackerPass(PassRegistry &);
+LLVM_ABI void initializeAtomicExpandLegacyPass(PassRegistry &);
+LLVM_ABI void initializeBasicBlockPathCloningPass(PassRegistry &);
+LLVM_ABI void
+initializeBasicBlockSectionsProfileReaderWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeBasicBlockSectionsPass(PassRegistry &);
+LLVM_ABI void initializeBarrierNoopPass(PassRegistry &);
+LLVM_ABI void initializeBasicAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeBranchFolderLegacyPass(PassRegistry &);
+LLVM_ABI void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeBranchRelaxationLegacyPass(PassRegistry &);
+LLVM_ABI void initializeBreakCriticalEdgesPass(PassRegistry &);
+LLVM_ABI void initializeBreakFalseDepsPass(PassRegistry &);
+LLVM_ABI void initializeCanonicalizeFreezeInLoopsPass(PassRegistry &);
+LLVM_ABI void initializeCFGSimplifyPassPass(PassRegistry &);
+LLVM_ABI void initializeCFGuardPass(PassRegistry &);
+LLVM_ABI void initializeCFGuardLongjmpPass(PassRegistry &);
+LLVM_ABI void initializeCFIFixupPass(PassRegistry &);
+LLVM_ABI void initializeCFIInstrInserterPass(PassRegistry &);
+LLVM_ABI void initializeCallBrPreparePass(PassRegistry &);
+LLVM_ABI void initializeCallGraphDOTPrinterPass(PassRegistry &);
+LLVM_ABI void initializeCallGraphViewerPass(PassRegistry &);
+LLVM_ABI void initializeCallGraphWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeCheckDebugMachineModulePass(PassRegistry &);
+LLVM_ABI void initializeCodeGenPrepareLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeConstantHoistingLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeCycleInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDAEPass(PassRegistry &);
+LLVM_ABI void initializeDAHPass(PassRegistry &);
+LLVM_ABI void initializeDCELegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &);
+LLVM_ABI void initializeDXILResourceBindingWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDXILResourceImplicitBindingLegacyPass(PassRegistry &);
+LLVM_ABI void initializeDXILResourceTypeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDXILResourceWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDeadMachineInstructionElimPass(PassRegistry &);
+LLVM_ABI void initializeDebugifyMachineModulePass(PassRegistry &);
+LLVM_ABI void initializeDependenceAnalysisWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDetectDeadLanesLegacyPass(PassRegistry &);
+LLVM_ABI void initializeDomOnlyPrinterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDomOnlyViewerWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDomPrinterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDomViewerWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDominanceFrontierWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDominatorTreeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeEarlyCSELegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeEarlyIfConverterLegacyPass(PassRegistry &);
+LLVM_ABI void initializeEarlyIfPredicatorPass(PassRegistry &);
+LLVM_ABI void initializeEarlyMachineLICMPass(PassRegistry &);
+LLVM_ABI void initializeEarlyTailDuplicateLegacyPass(PassRegistry &);
+LLVM_ABI void initializeEdgeBundlesWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeEHContGuardTargetsPass(PassRegistry &);
+LLVM_ABI void initializeExpandFpLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeExpandLargeDivRemLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeExpandPostRALegacyPass(PassRegistry &);
+LLVM_ABI void initializeExpandReductionsPass(PassRegistry &);
+LLVM_ABI void initializeExpandVariadicsPass(PassRegistry &);
+LLVM_ABI void initializeExternalAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeFEntryInserterLegacyPass(PassRegistry &);
+LLVM_ABI void initializeFinalizeISelPass(PassRegistry &);
+LLVM_ABI void initializeFinalizeMachineBundlesPass(PassRegistry &);
+LLVM_ABI void initializeFixIrreduciblePass(PassRegistry &);
+LLVM_ABI void initializeFixupStatepointCallerSavedLegacyPass(PassRegistry &);
+LLVM_ABI void initializeFlattenCFGLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeFuncletLayoutPass(PassRegistry &);
+LLVM_ABI void initializeGCEmptyBasicBlocksPass(PassRegistry &);
+LLVM_ABI void initializeGCMachineCodeAnalysisPass(PassRegistry &);
+LLVM_ABI void initializeGCModuleInfoPass(PassRegistry &);
+LLVM_ABI void initializeGVNLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeGlobalMergeFuncPassWrapperPass(PassRegistry &);
+LLVM_ABI void initializeGlobalMergePass(PassRegistry &);
+LLVM_ABI void initializeGlobalsAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeHardwareLoopsLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMIRProfileLoaderPassPass(PassRegistry &);
+LLVM_ABI void initializeIRSimilarityIdentifierWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeIRTranslatorPass(PassRegistry &);
+LLVM_ABI void initializeIVUsersWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeIfConverterPass(PassRegistry &);
+LLVM_ABI void
+initializeImmutableModuleSummaryIndexWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeImplicitNullChecksPass(PassRegistry &);
+LLVM_ABI void initializeIndirectBrExpandLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeInferAddressSpacesPass(PassRegistry &);
+LLVM_ABI void initializeInstSimplifyLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeInstructionCombiningPassPass(PassRegistry &);
+LLVM_ABI void initializeInstructionSelectPass(PassRegistry &);
+LLVM_ABI void initializeInterleavedAccessPass(PassRegistry &);
+LLVM_ABI void initializeInterleavedLoadCombinePass(PassRegistry &);
+LLVM_ABI void initializeJMCInstrumenterPass(PassRegistry &);
+LLVM_ABI void initializeKCFIPass(PassRegistry &);
+LLVM_ABI void initializeLCSSAVerificationPassPass(PassRegistry &);
+LLVM_ABI void initializeLCSSAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyBFIPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyBlockFrequencyInfoPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyBranchProbabilityInfoPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry &);
+LLVM_ABI void initializeLazyValueInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLegacyLICMPassPass(PassRegistry &);
+LLVM_ABI void initializeLegalizerPass(PassRegistry &);
+LLVM_ABI void initializeGISelCSEAnalysisWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeGISelValueTrackingAnalysisLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveDebugValuesLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveDebugVariablesWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveIntervalsWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLiveRangeShrinkPass(PassRegistry &);
+LLVM_ABI void initializeLiveRegMatrixWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveStacksWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeLiveVariablesWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLoadStoreOptPass(PassRegistry &);
+LLVM_ABI void initializeLoadStoreVectorizerLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLocalStackSlotPassPass(PassRegistry &);
+LLVM_ABI void initializeLocalizerPass(PassRegistry &);
+LLVM_ABI void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLoopExtractorLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLoopInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeLoopPassPass(PassRegistry &);
+LLVM_ABI void initializeLoopSimplifyPass(PassRegistry &);
+LLVM_ABI void initializeLoopStrengthReducePass(PassRegistry &);
+LLVM_ABI void initializeLoopTermFoldPass(PassRegistry &);
+LLVM_ABI void initializeLoopUnrollPass(PassRegistry &);
+LLVM_ABI void initializeLowerAtomicLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLowerEmuTLSPass(PassRegistry &);
+LLVM_ABI void initializeLowerGlobalDtorsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLowerIntrinsicsPass(PassRegistry &);
+LLVM_ABI void initializeLowerInvokeLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeLowerSwitchLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeMIRAddFSDiscriminatorsPass(PassRegistry &);
+LLVM_ABI void initializeMIRCanonicalizerPass(PassRegistry &);
+LLVM_ABI void initializeMIRNamerPass(PassRegistry &);
+LLVM_ABI void initializeMIRPrintingPassPass(PassRegistry &);
+LLVM_ABI void
+initializeMachineBlockFrequencyInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineBlockPlacementLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineBlockPlacementStatsLegacyPass(PassRegistry &);
+LLVM_ABI void
+initializeMachineBranchProbabilityInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineCFGPrinterPass(PassRegistry &);
+LLVM_ABI void initializeMachineCSELegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineCombinerPass(PassRegistry &);
+LLVM_ABI void initializeMachineCopyPropagationLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineCycleInfoPrinterLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineCycleInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineDominanceFrontierPass(PassRegistry &);
+LLVM_ABI void initializeMachineDominatorTreeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineFunctionPrinterPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineFunctionSplitterPass(PassRegistry &);
+LLVM_ABI void initializeMachineLateInstrsCleanupLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineLICMPass(PassRegistry &);
+LLVM_ABI void initializeMachineLoopInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineModuleInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void
+initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineOutlinerPass(PassRegistry &);
+LLVM_ABI void initializeStaticDataProfileInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeStaticDataAnnotatorPass(PassRegistry &);
+LLVM_ABI void initializeMachinePipelinerPass(PassRegistry &);
+LLVM_ABI void initializeMachinePostDominatorTreeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineRegionInfoPassPass(PassRegistry &);
+LLVM_ABI void
+initializeMachineSanitizerBinaryMetadataLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineSchedulerLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineSinkingLegacyPass(PassRegistry &);
+LLVM_ABI void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineUniformityAnalysisPassPass(PassRegistry &);
+LLVM_ABI void initializeMachineVerifierLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeMemoryDependenceWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMemorySSAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeMergeICmpsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeModuleSummaryIndexWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeModuloScheduleTestPass(PassRegistry &);
+LLVM_ABI void initializeNaryReassociateLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeObjCARCContractLegacyPassPass(PassRegistry &);
+LLVM_ABI void
+initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeOptimizePHIsLegacyPass(PassRegistry &);
+LLVM_ABI void initializePEILegacyPass(PassRegistry &);
+LLVM_ABI void initializePHIEliminationPass(PassRegistry &);
+LLVM_ABI void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializePatchableFunctionLegacyPass(PassRegistry &);
+LLVM_ABI void initializePeepholeOptimizerLegacyPass(PassRegistry &);
+LLVM_ABI void initializePhiValuesWrapperPassPass(PassRegistry &);
+LLVM_ABI void
+initializePhysicalRegisterUsageInfoWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializePlaceBackedgeSafepointsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializePostDomOnlyPrinterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostDomOnlyViewerWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostDomPrinterWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostDomViewerWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostDominatorTreeWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePostInlineEntryExitInstrumenterPass(PassRegistry &);
+LLVM_ABI void initializePostMachineSchedulerLegacyPass(PassRegistry &);
+LLVM_ABI void initializePostRAHazardRecognizerLegacyPass(PassRegistry &);
+LLVM_ABI void initializePostRAMachineSinkingPass(PassRegistry &);
+LLVM_ABI void initializePostRASchedulerLegacyPass(PassRegistry &);
+LLVM_ABI void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializePrintFunctionPassWrapperPass(PassRegistry &);
+LLVM_ABI void initializePrintModulePassWrapperPass(PassRegistry &);
+LLVM_ABI void initializeProcessImplicitDefsPass(PassRegistry &);
+LLVM_ABI void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializePromoteLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeRABasicPass(PassRegistry &);
+LLVM_ABI void initializePseudoProbeInserterPass(PassRegistry &);
+LLVM_ABI void initializeRAGreedyLegacyPass(PassRegistry &);
+LLVM_ABI void initializeReachingDefAnalysisPass(PassRegistry &);
+LLVM_ABI void initializeReassociateLegacyPassPass(PassRegistry &);
+LLVM_ABI void
+initializeRegAllocEvictionAdvisorAnalysisLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRegAllocFastPass(PassRegistry &);
+LLVM_ABI void
+initializeRegAllocPriorityAdvisorAnalysisLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRegAllocScoringPass(PassRegistry &);
+LLVM_ABI void initializeRegBankSelectPass(PassRegistry &);
+LLVM_ABI void initializeRegToMemWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeRegUsageInfoCollectorLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRegUsageInfoPropagationLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRegionInfoPassPass(PassRegistry &);
+LLVM_ABI void initializeRegionOnlyPrinterPass(PassRegistry &);
+LLVM_ABI void initializeRegionOnlyViewerPass(PassRegistry &);
+LLVM_ABI void initializeRegionPrinterPass(PassRegistry &);
+LLVM_ABI void initializeRegionViewerPass(PassRegistry &);
+LLVM_ABI void initializeRegisterCoalescerLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRemoveLoadsIntoFakeUsesLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRemoveRedundantDebugValuesLegacyPass(PassRegistry &);
+LLVM_ABI void initializeRenameIndependentSubregsLegacyPass(PassRegistry &);
+LLVM_ABI void initializeReplaceWithVeclibLegacyPass(PassRegistry &);
+LLVM_ABI void initializeResetMachineFunctionPass(PassRegistry &);
+LLVM_ABI void initializeSCEVAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeSROALegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSafeStackLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSafepointIRVerifierPass(PassRegistry &);
+LLVM_ABI void initializeSelectOptimizePass(PassRegistry &);
+LLVM_ABI void initializeScalarEvolutionWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeScalarizeMaskedMemIntrinLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeScalarizerLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeScavengerTestPass(PassRegistry &);
+LLVM_ABI void initializeScopedNoAliasAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void
+initializeSeparateConstOffsetFromGEPLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeShadowStackGCLoweringPass(PassRegistry &);
+LLVM_ABI void initializeShrinkWrapLegacyPass(PassRegistry &);
+LLVM_ABI void initializeSingleLoopExtractorPass(PassRegistry &);
+LLVM_ABI void initializeSinkingLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSjLjEHPreparePass(PassRegistry &);
+LLVM_ABI void initializeSlotIndexesWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeSpeculativeExecutionLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeSpillPlacementWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeStackColoringLegacyPass(PassRegistry &);
+LLVM_ABI void initializeStackFrameLayoutAnalysisLegacyPass(PassRegistry &);
+LLVM_ABI void initializeStaticDataSplitterPass(PassRegistry &);
+LLVM_ABI void initializeStackMapLivenessPass(PassRegistry &);
+LLVM_ABI void initializeStackProtectorPass(PassRegistry &);
+LLVM_ABI void initializeStackSafetyGlobalInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeStackSafetyInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeStackSlotColoringLegacyPass(PassRegistry &);
+LLVM_ABI void
+initializeStraightLineStrengthReduceLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeStripDebugMachineModulePass(PassRegistry &);
+LLVM_ABI void initializeStructurizeCFGLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeTailCallElimPass(PassRegistry &);
+LLVM_ABI void initializeTailDuplicateLegacyPass(PassRegistry &);
+LLVM_ABI void initializeTargetLibraryInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeTargetPassConfigPass(PassRegistry &);
+LLVM_ABI void initializeTargetTransformInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeTwoAddressInstructionLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeTypeBasedAAWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeTypePromotionLegacyPass(PassRegistry &);
+LLVM_ABI void initializeInitUndefPass(PassRegistry &);
+LLVM_ABI void initializeUniformityInfoWrapperPassPass(PassRegistry &);
+LLVM_ABI void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeUnpackMachineBundlesPass(PassRegistry &);
+LLVM_ABI void initializeUnreachableBlockElimLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeUnreachableMachineBlockElimLegacyPass(PassRegistry &);
+LLVM_ABI void initializeVerifierLegacyPassPass(PassRegistry &);
+LLVM_ABI void initializeVirtRegMapWrapperLegacyPass(PassRegistry &);
+LLVM_ABI void initializeVirtRegRewriterLegacyPass(PassRegistry &);
+LLVM_ABI void initializeWasmEHPreparePass(PassRegistry &);
+LLVM_ABI void initializeWinEHPreparePass(PassRegistry &);
+LLVM_ABI void initializeWriteBitcodePassPass(PassRegistry &);
+LLVM_ABI void initializeXRayInstrumentationLegacyPass(PassRegistry &);
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/Pass.h b/llvm/include/llvm/Pass.h
index 921db0b5f7aec..58c45e75b3f0a 100644
--- a/llvm/include/llvm/Pass.h
+++ b/llvm/include/llvm/Pass.h
@@ -31,6 +31,7 @@
 #ifdef EXPENSIVE_CHECKS
 #include <cstdint>
 #endif
+#include "llvm/Support/Compiler.h"
 #include <string>
 
 namespace llvm {
@@ -95,7 +96,7 @@ const char *to_string(ThinOrFullLTOPhase Phase);
 /// interprocedural optimization or you do not fit into any of the more
 /// constrained passes described below.
 ///
-class Pass {
+class LLVM_ABI Pass {
   AnalysisResolver *Resolver = nullptr;  // Used to resolve analysis
   const void *PassID;
   PassKind Kind;
@@ -252,7 +253,7 @@ class Pass {
 /// interprocedural optimizations and analyses.  ModulePasses may do anything
 /// they want to the program.
 ///
-class ModulePass : public Pass {
+class LLVM_ABI ModulePass : public Pass {
 public:
   explicit ModulePass(char &pid) : Pass(PT_Module, pid) {}
 
@@ -282,7 +283,7 @@ class ModulePass : public Pass {
 /// ImmutablePass class - This class is used to provide information that does
 /// not need to be run.  This is useful for things like target information.
 ///
-class ImmutablePass : public ModulePass {
+class LLVM_ABI ImmutablePass : public ModulePass {
 public:
   explicit ImmutablePass(char &pid) : ModulePass(pid) {}
 
@@ -311,7 +312,7 @@ class ImmutablePass : public ModulePass {
 ///  2. Optimizing a function does not cause the addition or removal of any
 ///     functions in the module
 ///
-class FunctionPass : public Pass {
+class LLVM_ABI FunctionPass : public Pass {
 public:
   explicit FunctionPass(char &pid) : Pass(PT_Function, pid) {}
 
@@ -338,13 +339,13 @@ class FunctionPass : public Pass {
 /// If the user specifies the -time-passes argument on an LLVM tool command line
 /// then the value of this boolean will be true, otherwise false.
 /// This is the storage for the -time-passes option.
-extern bool TimePassesIsEnabled;
+LLVM_ABI extern bool TimePassesIsEnabled;
 /// If TimePassesPerRun is true, there would be one line of report for
 /// each pass invocation.
 /// If TimePassesPerRun is false, there would be only one line of
 /// report for each pass (even there are more than one pass objects).
 /// (For new pass manager only)
-extern bool TimePassesPerRun;
+LLVM_ABI extern bool TimePassesPerRun;
 
 } // end namespace llvm
 
diff --git a/llvm/include/llvm/PassAnalysisSupport.h b/llvm/include/llvm/PassAnalysisSupport.h
index 4bed3cb55a901..02abb00b66b52 100644
--- a/llvm/include/llvm/PassAnalysisSupport.h
+++ b/llvm/include/llvm/PassAnalysisSupport.h
@@ -24,6 +24,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
 #include <cassert>
 #include <tuple>
 #include <utility>
@@ -69,14 +70,14 @@ class AnalysisUsage {
 
   ///@{
   /// Add the specified ID to the required set of the usage info for a pass.
-  AnalysisUsage &addRequiredID(const void *ID);
-  AnalysisUsage &addRequiredID(char &ID);
+  LLVM_ABI AnalysisUsage &addRequiredID(const void *ID);
+  LLVM_ABI AnalysisUsage &addRequiredID(char &ID);
   template<class PassClass>
   AnalysisUsage &addRequired() {
     return addRequiredID(PassClass::ID);
   }
 
-  AnalysisUsage &addRequiredTransitiveID(char &ID);
+  LLVM_ABI AnalysisUsage &addRequiredTransitiveID(char &ID);
   template<class PassClass>
   AnalysisUsage &addRequiredTransitive() {
     return addRequiredTransitiveID(PassClass::ID);
@@ -124,7 +125,7 @@ class AnalysisUsage {
   /// preserved by this pass. If no such Pass exists, do nothing. This can be
   /// useful when a pass is trivially preserved, but may not be linked in. Be
   /// careful about spelling!
-  AnalysisUsage &addPreserved(StringRef Arg);
+  LLVM_ABI AnalysisUsage &addPreserved(StringRef Arg);
 
   /// Set by analyses that do not transform their input at all
   void setPreservesAll() { PreservesAll = true; }
@@ -139,7 +140,7 @@ class AnalysisUsage {
   ///
   /// This function annotates the AnalysisUsage info object to say that analyses
   /// that only depend on the CFG are preserved by this pass.
-  void setPreservesCFG();
+  LLVM_ABI void setPreservesCFG();
 
   const VectorType &getRequiredSet() const { return Required; }
   const VectorType &getRequiredTransitiveSet() const {
@@ -174,7 +175,8 @@ class AnalysisResolver {
   }
 
   /// Find pass that is implementing PI. Initialize pass for Function F.
-  std::tuple<Pass *, bool> findImplPass(Pass *P, AnalysisID PI, Function &F);
+  LLVM_ABI std::tuple<Pass *, bool> findImplPass(Pass *P, AnalysisID PI,
+                                                 Function &F);
 
   void addAnalysisImplsPair(AnalysisID PI, Pass *P) {
     if (findImplPass(PI) == P)
@@ -189,7 +191,7 @@ class AnalysisResolver {
   }
 
   /// Return analysis result or null if it doesn't exist.
-  Pass *getAnalysisIfAvailable(AnalysisID ID) const;
+  LLVM_ABI Pass *getAnalysisIfAvailable(AnalysisID ID) const;
 
 private:
   /// This keeps track of which passes implements the interfaces that are
diff --git a/llvm/include/llvm/PassRegistry.h b/llvm/include/llvm/PassRegistry.h
index 003c0ac4c374b..f3dada0c0ba6c 100644
--- a/llvm/include/llvm/PassRegistry.h
+++ b/llvm/include/llvm/PassRegistry.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/RWMutex.h"
 #include <memory>
 #include <vector>
@@ -49,36 +50,36 @@ class PassRegistry {
 
 public:
   PassRegistry() = default;
-  ~PassRegistry();
+  LLVM_ABI ~PassRegistry();
 
   /// getPassRegistry - Access the global registry object, which is
   /// automatically initialized at application launch and destroyed by
   /// llvm_shutdown.
-  static PassRegistry *getPassRegistry();
+  LLVM_ABI static PassRegistry *getPassRegistry();
 
   /// getPassInfo - Look up a pass' corresponding PassInfo, indexed by the pass'
   /// type identifier (&MyPass::ID).
-  const PassInfo *getPassInfo(const void *TI) const;
+  LLVM_ABI const PassInfo *getPassInfo(const void *TI) const;
 
   /// getPassInfo - Look up a pass' corresponding PassInfo, indexed by the pass'
   /// argument string.
-  const PassInfo *getPassInfo(StringRef Arg) const;
+  LLVM_ABI const PassInfo *getPassInfo(StringRef Arg) const;
 
   /// registerPass - Register a pass (by means of its PassInfo) with the
   /// registry.  Required in order to use the pass with a PassManager.
-  void registerPass(const PassInfo &PI, bool ShouldFree = false);
+  LLVM_ABI void registerPass(const PassInfo &PI, bool ShouldFree = false);
 
   /// enumerateWith - Enumerate the registered passes, calling the provided
   /// PassRegistrationListener's passEnumerate() callback on each of them.
-  void enumerateWith(PassRegistrationListener *L);
+  LLVM_ABI void enumerateWith(PassRegistrationListener *L);
 
   /// addRegistrationListener - Register the given PassRegistrationListener
   /// to receive passRegistered() callbacks whenever a new pass is registered.
-  void addRegistrationListener(PassRegistrationListener *L);
+  LLVM_ABI void addRegistrationListener(PassRegistrationListener *L);
 
   /// removeRegistrationListener - Unregister a PassRegistrationListener so that
   /// it no longer receives passRegistered() callbacks.
-  void removeRegistrationListener(PassRegistrationListener *L);
+  LLVM_ABI void removeRegistrationListener(PassRegistrationListener *L);
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/PassSupport.h b/llvm/include/llvm/PassSupport.h
index b0897a6be37d1..7f0306e33e832 100644
--- a/llvm/include/llvm/PassSupport.h
+++ b/llvm/include/llvm/PassSupport.h
@@ -27,6 +27,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/PassInfo.h"
 #include "llvm/PassRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/Threading.h"
 #include <functional>
@@ -112,7 +113,7 @@ struct PassRegistrationListener {
 
   /// enumeratePasses - Iterate over the registered passes, calling the
   /// passEnumerate callback on each PassInfo object.
-  void enumeratePasses();
+  LLVM_ABI void enumeratePasses();
 
   /// passEnumerate - Callback function invoked when someone calls
   /// enumeratePasses on this PassRegistrationListener object.
diff --git a/llvm/include/llvm/Passes/OptimizationLevel.h b/llvm/include/llvm/Passes/OptimizationLevel.h
index d2c3fde4935fb..1cf258f1ffd0d 100644
--- a/llvm/include/llvm/Passes/OptimizationLevel.h
+++ b/llvm/include/llvm/Passes/OptimizationLevel.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_PASSES_OPTIMIZATIONLEVEL_H
 #define LLVM_PASSES_OPTIMIZATIONLEVEL_H
 
+#include "llvm/Support/Compiler.h"
 #include <assert.h>
 
 namespace llvm {
@@ -38,7 +39,7 @@ class OptimizationLevel final {
   /// Disable as many optimizations as possible. This doesn't completely
   /// disable the optimizer in all cases, for example always_inline functions
   /// can be required to be inlined for correctness.
-  static const OptimizationLevel O0;
+  LLVM_ABI static const OptimizationLevel O0;
 
   /// Optimize quickly without destroying debuggability.
   ///
@@ -54,7 +55,7 @@ class OptimizationLevel final {
   /// vectorization, or fusion don't make sense here due to the degree to
   /// which the executed code differs from the source code, and the compile
   /// time cost.
-  static const OptimizationLevel O1;
+  LLVM_ABI static const OptimizationLevel O1;
   /// Optimize for fast execution as much as possible without triggering
   /// significant incremental compile time or code size growth.
   ///
@@ -71,7 +72,7 @@ class OptimizationLevel final {
   ///
   /// This is expected to be a good default optimization level for the vast
   /// majority of users.
-  static const OptimizationLevel O2;
+  LLVM_ABI static const OptimizationLevel O2;
   /// Optimize for fast execution as much as possible.
   ///
   /// This mode is significantly more aggressive in trading off compile time
@@ -86,7 +87,7 @@ class OptimizationLevel final {
   /// order to make even significantly slower compile times at least scale
   /// reasonably. This does not preclude very substantial constant factor
   /// costs though.
-  static const OptimizationLevel O3;
+  LLVM_ABI static const OptimizationLevel O3;
   /// Similar to \c O2 but tries to optimize for small code size instead of
   /// fast execution without triggering significant incremental execution
   /// time slowdowns.
@@ -97,7 +98,7 @@ class OptimizationLevel final {
   /// A consequence of the different core goal is that this should in general
   /// produce substantially smaller executables that still run in
   /// a reasonable amount of time.
-  static const OptimizationLevel Os;
+  LLVM_ABI static const OptimizationLevel Os;
   /// A very specialized mode that will optimize for code size at any and all
   /// costs.
   ///
@@ -105,7 +106,7 @@ class OptimizationLevel final {
   /// any effort taken to reduce the size is worth it regardless of the
   /// execution time impact. You should expect this level to produce rather
   /// slow, but very small, code.
-  static const OptimizationLevel Oz;
+  LLVM_ABI static const OptimizationLevel Oz;
 
   bool isOptimizingForSpeed() const { return SizeLevel == 0 && SpeedLevel > 0; }
 
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 51ccaa53447d7..f13b5c678a894 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/RegAllocCommon.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/PGOOptions.h"
 #include "llvm/Support/raw_ostream.h"
@@ -44,7 +45,7 @@ class PipelineTuningOptions {
 public:
   /// Constructor sets pipeline tuning defaults based on cl::opts. Each option
   /// can be set in the PassBuilder when using a LLVM as a library.
-  PipelineTuningOptions();
+  LLVM_ABI PipelineTuningOptions();
 
   /// Tuning option to set loop interleaving on/off, set based on opt level.
   bool LoopInterleaving;
@@ -126,20 +127,20 @@ class PassBuilder {
     std::vector<PipelineElement> InnerPipeline;
   };
 
-  explicit PassBuilder(TargetMachine *TM = nullptr,
-                       PipelineTuningOptions PTO = PipelineTuningOptions(),
-                       std::optional<PGOOptions> PGOOpt = std::nullopt,
-                       PassInstrumentationCallbacks *PIC = nullptr);
+  LLVM_ABI explicit PassBuilder(
+      TargetMachine *TM = nullptr,
+      PipelineTuningOptions PTO = PipelineTuningOptions(),
+      std::optional<PGOOptions> PGOOpt = std::nullopt,
+      PassInstrumentationCallbacks *PIC = nullptr);
 
   /// Cross register the analysis managers through their proxies.
   ///
   /// This is an interface that can be used to cross register each
   /// AnalysisManager with all the others analysis managers.
-  void crossRegisterProxies(LoopAnalysisManager &LAM,
-                            FunctionAnalysisManager &FAM,
-                            CGSCCAnalysisManager &CGAM,
-                            ModuleAnalysisManager &MAM,
-                            MachineFunctionAnalysisManager *MFAM = nullptr);
+  LLVM_ABI void
+  crossRegisterProxies(LoopAnalysisManager &LAM, FunctionAnalysisManager &FAM,
+                       CGSCCAnalysisManager &CGAM, ModuleAnalysisManager &MAM,
+                       MachineFunctionAnalysisManager *MFAM = nullptr);
 
   /// Registers all available module analysis passes.
   ///
@@ -147,7 +148,7 @@ class PassBuilder {
   /// ModuleAnalysisManager with all registered module analyses. Callers can
   /// still manually register any additional analyses. Callers can also
   /// pre-register analyses and this will not override those.
-  void registerModuleAnalyses(ModuleAnalysisManager &MAM);
+  LLVM_ABI void registerModuleAnalyses(ModuleAnalysisManager &MAM);
 
   /// Registers all available CGSCC analysis passes.
   ///
@@ -155,7 +156,7 @@ class PassBuilder {
   /// with all registered CGSCC analyses. Callers can still manually register any
   /// additional analyses. Callers can also pre-register analyses and this will
   /// not override those.
-  void registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM);
+  LLVM_ABI void registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM);
 
   /// Registers all available function analysis passes.
   ///
@@ -163,14 +164,14 @@ class PassBuilder {
   /// FunctionAnalysisManager with all registered function analyses. Callers can
   /// still manually register any additional analyses. Callers can also
   /// pre-register analyses and this will not override those.
-  void registerFunctionAnalyses(FunctionAnalysisManager &FAM);
+  LLVM_ABI void registerFunctionAnalyses(FunctionAnalysisManager &FAM);
 
   /// Registers all available loop analysis passes.
   ///
   /// This is an interface that can be used to populate a \c LoopAnalysisManager
   /// with all registered loop analyses. Callers can still manually register any
   /// additional analyses.
-  void registerLoopAnalyses(LoopAnalysisManager &LAM);
+  LLVM_ABI void registerLoopAnalyses(LoopAnalysisManager &LAM);
 
   /// Registers all available machine function analysis passes.
   ///
@@ -178,7 +179,8 @@ class PassBuilder {
   /// MachineFunctionAnalysisManager with all registered function analyses.
   /// Callers can still manually register any additional analyses. Callers can
   /// also pre-register analyses and this will not override those.
-  void registerMachineFunctionAnalyses(MachineFunctionAnalysisManager &MFAM);
+  LLVM_ABI void
+  registerMachineFunctionAnalyses(MachineFunctionAnalysisManager &MFAM);
 
   /// Construct the core LLVM function canonicalization and simplification
   /// pipeline.
@@ -194,9 +196,8 @@ class PassBuilder {
   /// build them.
   ///
   /// \p Phase indicates the current ThinLTO phase.
-  FunctionPassManager
-  buildFunctionSimplificationPipeline(OptimizationLevel Level,
-                                      ThinOrFullLTOPhase Phase);
+  LLVM_ABI FunctionPassManager buildFunctionSimplificationPipeline(
+      OptimizationLevel Level, ThinOrFullLTOPhase Phase);
 
   /// Construct the core LLVM module canonicalization and simplification
   /// pipeline.
@@ -213,18 +214,18 @@ class PassBuilder {
   /// build them.
   ///
   /// \p Phase indicates the current ThinLTO phase.
-  ModulePassManager buildModuleSimplificationPipeline(OptimizationLevel Level,
-                                                      ThinOrFullLTOPhase Phase);
+  LLVM_ABI ModulePassManager buildModuleSimplificationPipeline(
+      OptimizationLevel Level, ThinOrFullLTOPhase Phase);
 
   /// Construct the module pipeline that performs inlining as well as
   /// the inlining-driven cleanups.
-  ModuleInlinerWrapperPass buildInlinerPipeline(OptimizationLevel Level,
-                                                ThinOrFullLTOPhase Phase);
+  LLVM_ABI ModuleInlinerWrapperPass
+  buildInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase);
 
   /// Construct the module pipeline that performs inlining with
   /// module inliner pass.
-  ModulePassManager buildModuleInlinerPipeline(OptimizationLevel Level,
-                                               ThinOrFullLTOPhase Phase);
+  LLVM_ABI ModulePassManager
+  buildModuleInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase);
 
   /// Construct the core LLVM module optimization pipeline.
   ///
@@ -239,9 +240,8 @@ class PassBuilder {
   /// only intended for use when attempting to optimize code. If frontends
   /// require some transformations for semantic reasons, they should explicitly
   /// build them.
-  ModulePassManager
-  buildModuleOptimizationPipeline(OptimizationLevel Level,
-                                  ThinOrFullLTOPhase LTOPhase);
+  LLVM_ABI ModulePassManager buildModuleOptimizationPipeline(
+      OptimizationLevel Level, ThinOrFullLTOPhase LTOPhase);
 
   /// Build a per-module default optimization pipeline.
   ///
@@ -249,7 +249,7 @@ class PassBuilder {
   /// optimization and code generation without any link-time optimization. It
   /// typically correspond to frontend "-O[123]" options for optimization
   /// levels \c O1, \c O2 and \c O3 resp.
-  ModulePassManager buildPerModuleDefaultPipeline(
+  LLVM_ABI ModulePassManager buildPerModuleDefaultPipeline(
       OptimizationLevel Level,
       ThinOrFullLTOPhase Phase = ThinOrFullLTOPhase::None);
 
@@ -258,8 +258,9 @@ class PassBuilder {
   /// This builds a pipeline that runs the LTO/ThinLTO  pre-link pipeline, and
   /// emits a section containing the pre-link bitcode along side the object code
   /// generated in non-LTO compilation.
-  ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level,
-                                               bool ThinLTO, bool EmitSummary);
+  LLVM_ABI ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level,
+                                                        bool ThinLTO,
+                                                        bool EmitSummary);
 
   /// Build a pre-link, ThinLTO-targeting default optimization pipeline to
   /// a pass manager.
@@ -268,7 +269,8 @@ class PassBuilder {
   /// a ThinLTO run. It works to minimize the IR which needs to be analyzed
   /// without making irreversible decisions which could be made better during
   /// the LTO run.
-  ModulePassManager buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level);
+  LLVM_ABI ModulePassManager
+  buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level);
 
   /// Build a ThinLTO default optimization pipeline to a pass manager.
   ///
@@ -276,9 +278,8 @@ class PassBuilder {
   /// optimization and code generation. It is particularly tuned to fit well
   /// when IR coming into the LTO phase was first run through \c
   /// buildThinLTOPreLinkDefaultPipeline, and the two coordinate closely.
-  ModulePassManager
-  buildThinLTODefaultPipeline(OptimizationLevel Level,
-                              const ModuleSummaryIndex *ImportSummary);
+  LLVM_ABI ModulePassManager buildThinLTODefaultPipeline(
+      OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary);
 
   /// Build a pre-link, LTO-targeting default optimization pipeline to a pass
   /// manager.
@@ -287,7 +288,8 @@ class PassBuilder {
   /// run. It works to minimize the IR which needs to be analyzed without
   /// making irreversible decisions which could be made better during the LTO
   /// run.
-  ModulePassManager buildLTOPreLinkDefaultPipeline(OptimizationLevel Level);
+  LLVM_ABI ModulePassManager
+  buildLTOPreLinkDefaultPipeline(OptimizationLevel Level);
 
   /// Build an LTO default optimization pipeline to a pass manager.
   ///
@@ -295,13 +297,13 @@ class PassBuilder {
   /// optimization and code generation. It is particularly tuned to fit well
   /// when IR coming into the LTO phase was first run through \c
   /// buildLTOPreLinkDefaultPipeline, and the two coordinate closely.
-  ModulePassManager buildLTODefaultPipeline(OptimizationLevel Level,
-                                            ModuleSummaryIndex *ExportSummary);
+  LLVM_ABI ModulePassManager buildLTODefaultPipeline(
+      OptimizationLevel Level, ModuleSummaryIndex *ExportSummary);
 
   /// Build an O0 pipeline with the minimal semantically required passes.
   ///
   /// This should only be used for non-LTO and LTO pre-link pipelines.
-  ModulePassManager
+  LLVM_ABI ModulePassManager
   buildO0DefaultPipeline(OptimizationLevel Level,
                          ThinOrFullLTOPhase Phase = ThinOrFullLTOPhase::None);
 
@@ -310,7 +312,7 @@ class PassBuilder {
   ///
   /// This also adds target-specific alias analyses registered via
   /// TargetMachine::registerDefaultAliasAnalyses().
-  AAManager buildDefaultAAPipeline();
+  LLVM_ABI AAManager buildDefaultAAPipeline();
 
   /// Parse a textual pass pipeline description into a \c
   /// ModulePassManager.
@@ -352,7 +354,8 @@ class PassBuilder {
   /// specifically want the pass to run under a adaptor directly. This is
   /// preferred when a pipeline is largely of one type, but one or just a few
   /// passes are of different types(See PassBuilder.cpp for examples).
-  Error parsePassPipeline(ModulePassManager &MPM, StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(ModulePassManager &MPM,
+                                   StringRef PipelineText);
 
   /// {{@ Parse a textual pass pipeline description into a specific PassManager
   ///
@@ -361,9 +364,12 @@ class PassBuilder {
   /// this is the valid pipeline text:
   ///
   ///   function(lpass)
-  Error parsePassPipeline(CGSCCPassManager &CGPM, StringRef PipelineText);
-  Error parsePassPipeline(FunctionPassManager &FPM, StringRef PipelineText);
-  Error parsePassPipeline(LoopPassManager &LPM, StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(CGSCCPassManager &CGPM,
+                                   StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(FunctionPassManager &FPM,
+                                   StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(LoopPassManager &LPM,
+                                   StringRef PipelineText);
   /// @}}
 
   /// Parse a textual MIR pipeline into the provided \c MachineFunctionPass
@@ -375,8 +381,8 @@ class PassBuilder {
   ///
   /// There is no need to specify the pass nesting, and this function
   /// currently cannot handle the pass nesting.
-  Error parsePassPipeline(MachineFunctionPassManager &MFPM,
-                          StringRef PipelineText);
+  LLVM_ABI Error parsePassPipeline(MachineFunctionPassManager &MFPM,
+                                   StringRef PipelineText);
 
   /// Parse a textual alias analysis pipeline into the provided AA manager.
   ///
@@ -393,14 +399,14 @@ class PassBuilder {
   /// Returns false if the text cannot be parsed cleanly. The specific state of
   /// the \p AA manager is unspecified if such an error is encountered and this
   /// returns false.
-  Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
+  LLVM_ABI Error parseAAPipeline(AAManager &AA, StringRef PipelineText);
 
   /// Parse RegAllocFilterName to get RegAllocFilterFunc.
-  std::optional<RegAllocFilterFunc>
+  LLVM_ABI std::optional<RegAllocFilterFunc>
   parseRegAllocFilter(StringRef RegAllocFilterName);
 
   /// Print pass names.
-  void printPassNames(raw_ostream &OS);
+  LLVM_ABI void printPassNames(raw_ostream &OS);
 
   /// Register a callback for a default optimizer pipeline extension
   /// point
@@ -614,16 +620,17 @@ class PassBuilder {
   /// If the PassManager type is not given at the top level of the pipeline
   /// text, this Callback should be used to determine the appropriate stack of
   /// PassManagers and populate the passed ModulePassManager.
-  void registerParseTopLevelPipelineCallback(
+  LLVM_ABI void registerParseTopLevelPipelineCallback(
       const std::function<bool(ModulePassManager &, ArrayRef<PipelineElement>)>
           &C);
 
   /// Add PGOInstrumenation passes for O0 only.
-  void addPGOInstrPassesForO0(ModulePassManager &MPM, bool RunProfileGen,
-                              bool IsCS, bool AtomicCounterUpdate,
-                              std::string ProfileFile,
-                              std::string ProfileRemappingFile,
-                              IntrusiveRefCntPtr<vfs::FileSystem> FS);
+  LLVM_ABI void addPGOInstrPassesForO0(ModulePassManager &MPM,
+                                       bool RunProfileGen, bool IsCS,
+                                       bool AtomicCounterUpdate,
+                                       std::string ProfileFile,
+                                       std::string ProfileRemappingFile,
+                                       IntrusiveRefCntPtr<vfs::FileSystem> FS);
 
   /// Returns PIC. External libraries can use this to register pass
   /// instrumentation callbacks.
@@ -634,35 +641,38 @@ class PassBuilder {
   // Invoke the callbacks registered for the various extension points.
   // Custom pipelines should use these to invoke the callbacks registered
   // by TargetMachines and other clients.
-  void invokePeepholeEPCallbacks(FunctionPassManager &FPM,
-                                 OptimizationLevel Level);
-  void invokeLateLoopOptimizationsEPCallbacks(LoopPassManager &LPM,
-                                              OptimizationLevel Level);
-  void invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM,
-                                         OptimizationLevel Level);
-  void invokeScalarOptimizerLateEPCallbacks(FunctionPassManager &FPM,
-                                            OptimizationLevel Level);
-  void invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM,
-                                           OptimizationLevel Level);
-  void invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM,
-                                        OptimizationLevel Level);
-  void invokeVectorizerEndEPCallbacks(FunctionPassManager &FPM,
-                                      OptimizationLevel Level);
-  void invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM,
-                                       OptimizationLevel Level,
-                                       ThinOrFullLTOPhase Phase);
-  void invokeOptimizerLastEPCallbacks(ModulePassManager &MPM,
-                                      OptimizationLevel Level,
-                                      ThinOrFullLTOPhase Phase);
-  void invokeFullLinkTimeOptimizationEarlyEPCallbacks(ModulePassManager &MPM,
-                                                      OptimizationLevel Level);
-  void invokeFullLinkTimeOptimizationLastEPCallbacks(ModulePassManager &MPM,
+  LLVM_ABI void invokePeepholeEPCallbacks(FunctionPassManager &FPM,
+                                          OptimizationLevel Level);
+  LLVM_ABI void invokeLateLoopOptimizationsEPCallbacks(LoopPassManager &LPM,
+                                                       OptimizationLevel Level);
+  LLVM_ABI void invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM,
+                                                  OptimizationLevel Level);
+  LLVM_ABI void invokeScalarOptimizerLateEPCallbacks(FunctionPassManager &FPM,
                                                      OptimizationLevel Level);
-  void invokePipelineStartEPCallbacks(ModulePassManager &MPM,
-                                      OptimizationLevel Level);
-  void invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM,
-                                                    OptimizationLevel Level,
-                                                    ThinOrFullLTOPhase Phase);
+  LLVM_ABI void invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM,
+                                                    OptimizationLevel Level);
+  LLVM_ABI void invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM,
+                                                 OptimizationLevel Level);
+  LLVM_ABI void invokeVectorizerEndEPCallbacks(FunctionPassManager &FPM,
+                                               OptimizationLevel Level);
+  LLVM_ABI void invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM,
+                                                OptimizationLevel Level,
+                                                ThinOrFullLTOPhase Phase);
+  LLVM_ABI void invokeOptimizerLastEPCallbacks(ModulePassManager &MPM,
+                                               OptimizationLevel Level,
+                                               ThinOrFullLTOPhase Phase);
+  LLVM_ABI void
+  invokeFullLinkTimeOptimizationEarlyEPCallbacks(ModulePassManager &MPM,
+                                                 OptimizationLevel Level);
+  LLVM_ABI void
+  invokeFullLinkTimeOptimizationLastEPCallbacks(ModulePassManager &MPM,
+                                                OptimizationLevel Level);
+  LLVM_ABI void invokePipelineStartEPCallbacks(ModulePassManager &MPM,
+                                               OptimizationLevel Level);
+  LLVM_ABI void
+  invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM,
+                                               OptimizationLevel Level,
+                                               ThinOrFullLTOPhase Phase);
 
   static bool checkParametrizedPassName(StringRef Name, StringRef PassName) {
     if (!Name.consume_front(PassName))
@@ -713,9 +723,9 @@ class PassBuilder {
   /// Handle passes only accept one bool-valued parameter.
   ///
   /// \return false when Params is empty.
-  static Expected<bool> parseSinglePassOption(StringRef Params,
-                                              StringRef OptionName,
-                                              StringRef PassName);
+  LLVM_ABI static Expected<bool> parseSinglePassOption(StringRef Params,
+                                                       StringRef OptionName,
+                                                       StringRef PassName);
 
 private:
   // O1 pass pipeline
@@ -898,7 +908,7 @@ struct NoOpModulePass : PassInfoMixin<NoOpModulePass> {
 /// No-op module analysis.
 class NoOpModuleAnalysis : public AnalysisInfoMixin<NoOpModuleAnalysis> {
   friend AnalysisInfoMixin<NoOpModuleAnalysis>;
-  static AnalysisKey Key;
+  LLVM_ABI static AnalysisKey Key;
 
 public:
   struct Result {};
@@ -916,7 +926,7 @@ struct NoOpCGSCCPass : PassInfoMixin<NoOpCGSCCPass> {
 /// No-op CGSCC analysis.
 class NoOpCGSCCAnalysis : public AnalysisInfoMixin<NoOpCGSCCAnalysis> {
   friend AnalysisInfoMixin<NoOpCGSCCAnalysis>;
-  static AnalysisKey Key;
+  LLVM_ABI static AnalysisKey Key;
 
 public:
   struct Result {};
@@ -935,7 +945,7 @@ struct NoOpFunctionPass : PassInfoMixin<NoOpFunctionPass> {
 /// No-op function analysis.
 class NoOpFunctionAnalysis : public AnalysisInfoMixin<NoOpFunctionAnalysis> {
   friend AnalysisInfoMixin<NoOpFunctionAnalysis>;
-  static AnalysisKey Key;
+  LLVM_ABI static AnalysisKey Key;
 
 public:
   struct Result {};
@@ -968,7 +978,7 @@ struct NoOpMachineFunctionPass : public PassInfoMixin<NoOpMachineFunctionPass> {
 /// No-op loop analysis.
 class NoOpLoopAnalysis : public AnalysisInfoMixin<NoOpLoopAnalysis> {
   friend AnalysisInfoMixin<NoOpLoopAnalysis>;
-  static AnalysisKey Key;
+  LLVM_ABI static AnalysisKey Key;
 
 public:
   struct Result {};
@@ -978,8 +988,7 @@ class NoOpLoopAnalysis : public AnalysisInfoMixin<NoOpLoopAnalysis> {
 };
 
 /// Common option used by multiple tools to print pipeline passes
-extern cl::opt<bool> PrintPipelinePasses;
-
+LLVM_ABI extern cl::opt<bool> PrintPipelinePasses;
 }
 
 #endif
diff --git a/llvm/include/llvm/Passes/PassPlugin.h b/llvm/include/llvm/Passes/PassPlugin.h
index 013b7a827c47d..947504bc207a7 100644
--- a/llvm/include/llvm/Passes/PassPlugin.h
+++ b/llvm/include/llvm/Passes/PassPlugin.h
@@ -64,7 +64,7 @@ class PassPlugin {
   /// \returns Returns an error if either the library cannot be found or loaded,
   /// there is no public entry point, or the plugin implements the wrong API
   /// version.
-  static Expected<PassPlugin> Load(const std::string &Filename);
+  LLVM_ABI static Expected<PassPlugin> Load(const std::string &Filename);
 
   /// Get the filename of the loaded plugin.
   StringRef getFilename() const { return Filename; }
diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h
index f7a65a88ecf5b..4ee5ab2554868 100644
--- a/llvm/include/llvm/Passes/StandardInstrumentations.h
+++ b/llvm/include/llvm/Passes/StandardInstrumentations.h
@@ -27,6 +27,7 @@
 #include "llvm/IR/PassTimingInfo.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/Transforms/IPO/SampleProfileProbe.h"
 
@@ -46,9 +47,9 @@ class PassInstrumentationCallbacks;
 /// (typically Loop or SCC).
 class PrintIRInstrumentation {
 public:
-  ~PrintIRInstrumentation();
+  LLVM_ABI ~PrintIRInstrumentation();
 
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 
 private:
   struct PassRunDescriptor {
@@ -104,7 +105,7 @@ class PrintIRInstrumentation {
 class OptNoneInstrumentation {
 public:
   OptNoneInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {}
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 
 private:
   bool DebugLogging;
@@ -116,8 +117,8 @@ class OptPassGateInstrumentation {
   bool HasWrittenIR = false;
 public:
   OptPassGateInstrumentation(LLVMContext &Context) : Context(Context) {}
-  bool shouldRun(StringRef PassName, Any IR);
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI bool shouldRun(StringRef PassName, Any IR);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 };
 
 struct PrintPassOptions {
@@ -136,7 +137,7 @@ class PrintPassInstrumentation {
 public:
   PrintPassInstrumentation(bool Enabled, PrintPassOptions Opts)
       : Enabled(Enabled), Opts(Opts) {}
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 
 private:
   bool Enabled;
@@ -167,7 +168,7 @@ class PreservedCFGCheckerInstrumentation {
     std::optional<DenseMap<intptr_t, BBGuard>> BBGuards;
     DenseMap<const BasicBlock *, DenseMap<const BasicBlock *, unsigned>> Graph;
 
-    CFG(const Function *F, bool TrackBBLifetime);
+    LLVM_ABI CFG(const Function *F, bool TrackBBLifetime);
 
     bool operator==(const CFG &G) const {
       return !isPoisoned() && !G.isPoisoned() && Graph == G.Graph;
@@ -179,18 +180,18 @@ class PreservedCFGCheckerInstrumentation {
              });
     }
 
-    static void printDiff(raw_ostream &out, const CFG &Before,
-                          const CFG &After);
-    bool invalidate(Function &F, const PreservedAnalyses &PA,
-                    FunctionAnalysisManager::Invalidator &);
+    LLVM_ABI static void printDiff(raw_ostream &out, const CFG &Before,
+                                   const CFG &After);
+    LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA,
+                             FunctionAnalysisManager::Invalidator &);
   };
 
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
   SmallVector<StringRef, 8> PassStack;
 #endif
 
-  void registerCallbacks(PassInstrumentationCallbacks &PIC,
-                         ModuleAnalysisManager &MAM);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC,
+                                  ModuleAnalysisManager &MAM);
 };
 
 // Base class for classes that report changes to the IR.
@@ -208,7 +209,7 @@ class PreservedCFGCheckerInstrumentation {
 // 6.  When a pass is run on an IR that is not interesting (based on options).
 // 7.  When a pass is ignored (pass manager or adapter pass).
 // 8.  To compare two IR representations (of type \p T).
-template <typename IRUnitT> class ChangeReporter {
+template <typename IRUnitT> class LLVM_ABI ChangeReporter {
 protected:
   ChangeReporter(bool RunInVerboseMode) : VerboseMode(RunInVerboseMode) {}
 
@@ -257,7 +258,7 @@ template <typename IRUnitT> class ChangeReporter {
 // An abstract template base class that handles printing banners and
 // reporting when things have not changed or are filtered out.
 template <typename IRUnitT>
-class TextChangeReporter : public ChangeReporter<IRUnitT> {
+class LLVM_ABI TextChangeReporter : public ChangeReporter<IRUnitT> {
 protected:
   TextChangeReporter(bool Verbose);
 
@@ -281,7 +282,7 @@ class TextChangeReporter : public ChangeReporter<IRUnitT> {
 // by unwrapAndPrint.  The string representation is stored in a std::string
 // to preserve it as the IR changes in each pass.  Note that the banner is
 // included in this representation but it is massaged before reporting.
-class IRChangedPrinter : public TextChangeReporter<std::string> {
+class LLVM_ABI IRChangedPrinter : public TextChangeReporter<std::string> {
 public:
   IRChangedPrinter(bool VerboseMode)
       : TextChangeReporter<std::string>(VerboseMode) {}
@@ -298,7 +299,7 @@ class IRChangedPrinter : public TextChangeReporter<std::string> {
                    Any) override;
 };
 
-class IRChangedTester : public IRChangedPrinter {
+class LLVM_ABI IRChangedTester : public IRChangedPrinter {
 public:
   IRChangedTester() : IRChangedPrinter(true) {}
   ~IRChangedTester() override;
@@ -444,7 +445,8 @@ template <typename T> class IRComparer {
 // and added, respectively.  Changes to the IR that do not affect basic
 // blocks are not reported as having changed the IR.  The option
 // -print-module-scope does not affect this change reporter.
-class InLineChangePrinter : public TextChangeReporter<IRDataT<EmptyData>> {
+class LLVM_ABI InLineChangePrinter
+    : public TextChangeReporter<IRDataT<EmptyData>> {
 public:
   InLineChangePrinter(bool VerboseMode, bool ColourMode)
       : TextChangeReporter<IRDataT<EmptyData>>(VerboseMode),
@@ -475,8 +477,8 @@ class VerifyInstrumentation {
 
 public:
   VerifyInstrumentation(bool DebugLogging) : DebugLogging(DebugLogging) {}
-  void registerCallbacks(PassInstrumentationCallbacks &PIC,
-                         ModuleAnalysisManager *MAM);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC,
+                                  ModuleAnalysisManager *MAM);
 };
 
 /// This class implements --time-trace functionality for new pass manager.
@@ -484,12 +486,12 @@ class VerifyInstrumentation {
 /// execution time. They collect time tracing info by TimeProfiler.
 class TimeProfilingPassesHandler {
 public:
-  TimeProfilingPassesHandler();
+  LLVM_ABI TimeProfilingPassesHandler();
   // We intend this to be unique per-compilation, thus no copies.
   TimeProfilingPassesHandler(const TimeProfilingPassesHandler &) = delete;
   void operator=(const TimeProfilingPassesHandler &) = delete;
 
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
 
 private:
   // Implementation of pass instrumentation callbacks.
@@ -502,8 +504,8 @@ class TimeProfilingPassesHandler {
 class DCData {
 public:
   // Fill the map with the transitions from basic block \p B.
-  DCData(const BasicBlock &B);
-  DCData(const MachineBasicBlock &B);
+  LLVM_ABI DCData(const BasicBlock &B);
+  LLVM_ABI DCData(const MachineBasicBlock &B);
 
   // Return an iterator to the names of the successor blocks.
   StringMap<std::string>::const_iterator begin() const {
@@ -531,7 +533,7 @@ class DCData {
 
 // A change reporter that builds a website with links to pdf files showing
 // dot control flow graphs with changed instructions shown in colour.
-class DotCfgChangeReporter : public ChangeReporter<IRDataT<DCData>> {
+class LLVM_ABI DotCfgChangeReporter : public ChangeReporter<IRDataT<DCData>> {
 public:
   DotCfgChangeReporter(bool Verbose);
   ~DotCfgChangeReporter() override;
@@ -578,9 +580,9 @@ class PrintCrashIRInstrumentation {
 public:
   PrintCrashIRInstrumentation()
       : SavedIR("*** Dump of IR Before Last Pass Unknown ***") {}
-  ~PrintCrashIRInstrumentation();
-  void registerCallbacks(PassInstrumentationCallbacks &PIC);
-  void reportCrashIR();
+  LLVM_ABI ~PrintCrashIRInstrumentation();
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC);
+  LLVM_ABI void reportCrashIR();
 
 protected:
   std::string SavedIR;
@@ -614,26 +616,22 @@ class StandardInstrumentations {
   bool VerifyEach;
 
 public:
+  LLVM_ABI
   StandardInstrumentations(LLVMContext &Context, bool DebugLogging,
                            bool VerifyEach = false,
                            PrintPassOptions PrintPassOpts = PrintPassOptions());
 
   // Register all the standard instrumentation callbacks. If \p FAM is nullptr
   // then PreservedCFGChecker is not enabled.
-  void registerCallbacks(PassInstrumentationCallbacks &PIC,
-                         ModuleAnalysisManager *MAM = nullptr);
+  LLVM_ABI void registerCallbacks(PassInstrumentationCallbacks &PIC,
+                                  ModuleAnalysisManager *MAM = nullptr);
 
   TimePassesHandler &getTimePasses() { return TimePasses; }
 };
 
-extern template class ChangeReporter<std::string>;
-extern template class TextChangeReporter<std::string>;
-
 extern template class BlockDataT<EmptyData>;
 extern template class FuncDataT<EmptyData>;
 extern template class IRDataT<EmptyData>;
-extern template class ChangeReporter<IRDataT<EmptyData>>;
-extern template class TextChangeReporter<IRDataT<EmptyData>>;
 extern template class IRComparer<EmptyData>;
 
 } // namespace llvm

From febb7e8443c4e8ff55e6b21bec4e2233b62d832b Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Mon, 16 Jun 2025 11:04:17 -0700
Subject: [PATCH 619/851] [llvm] annotate interfaces in XRay for DLL export
 (#143765)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/XRay` library.
These annotations currently have no meaningful impact on the LLVM build;
however, they are a prerequisite to support an LLVM Windows DLL (shared
library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

The bulk of these changes were generated automatically using the
[Interface Definition Scanner (IDS)](https://github.com/compnerd/ids)
tool, followed formatting with `git clang-format`.

Additionally, I manually added `LLVM_ABI_FRIEND` to friend member
functions declared with `LLVM_ABI`.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/XRay/BlockIndexer.h       |  3 ++-
 llvm/include/llvm/XRay/BlockPrinter.h       |  3 ++-
 llvm/include/llvm/XRay/BlockVerifier.h      |  3 ++-
 llvm/include/llvm/XRay/FDRRecordConsumer.h  |  5 ++--
 llvm/include/llvm/XRay/FDRRecordProducer.h  |  3 ++-
 llvm/include/llvm/XRay/FDRRecords.h         | 29 +++++++++++----------
 llvm/include/llvm/XRay/FDRTraceWriter.h     |  5 ++--
 llvm/include/llvm/XRay/FileHeaderReader.h   |  5 ++--
 llvm/include/llvm/XRay/InstrumentationMap.h | 11 +++++---
 llvm/include/llvm/XRay/Profile.h            | 19 +++++++-------
 llvm/include/llvm/XRay/RecordPrinter.h      |  3 ++-
 llvm/include/llvm/XRay/Trace.h              |  8 +++---
 12 files changed, 56 insertions(+), 41 deletions(-)

diff --git a/llvm/include/llvm/XRay/BlockIndexer.h b/llvm/include/llvm/XRay/BlockIndexer.h
index 77af77e5ec269..e9782dafed618 100644
--- a/llvm/include/llvm/XRay/BlockIndexer.h
+++ b/llvm/include/llvm/XRay/BlockIndexer.h
@@ -14,6 +14,7 @@
 #define LLVM_XRAY_BLOCKINDEXER_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/XRay/FDRRecords.h"
 #include <cstdint>
 #include <vector>
@@ -23,7 +24,7 @@ namespace xray {
 
 // The BlockIndexer will gather all related records associated with a
 // process+thread and group them by 'Block'.
-class BlockIndexer : public RecordVisitor {
+class LLVM_ABI BlockIndexer : public RecordVisitor {
 public:
   struct Block {
     uint64_t ProcessID;
diff --git a/llvm/include/llvm/XRay/BlockPrinter.h b/llvm/include/llvm/XRay/BlockPrinter.h
index 2f9fed668069c..caf78c5c4a5a6 100644
--- a/llvm/include/llvm/XRay/BlockPrinter.h
+++ b/llvm/include/llvm/XRay/BlockPrinter.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_XRAY_BLOCKPRINTER_H
 #define LLVM_XRAY_BLOCKPRINTER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/RecordPrinter.h"
@@ -20,7 +21,7 @@
 namespace llvm {
 namespace xray {
 
-class BlockPrinter : public RecordVisitor {
+class LLVM_ABI BlockPrinter : public RecordVisitor {
   enum class State {
     Start,
     Preamble,
diff --git a/llvm/include/llvm/XRay/BlockVerifier.h b/llvm/include/llvm/XRay/BlockVerifier.h
index 2450ad89ffe3d..b88785c393e37 100644
--- a/llvm/include/llvm/XRay/BlockVerifier.h
+++ b/llvm/include/llvm/XRay/BlockVerifier.h
@@ -13,12 +13,13 @@
 #ifndef LLVM_XRAY_BLOCKVERIFIER_H
 #define LLVM_XRAY_BLOCKVERIFIER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/XRay/FDRRecords.h"
 
 namespace llvm {
 namespace xray {
 
-class BlockVerifier : public RecordVisitor {
+class LLVM_ABI BlockVerifier : public RecordVisitor {
 public:
   // We force State elements to be size_t, to be used as indices for containers.
   enum class State : std::size_t {
diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h
index 8fff9fb861582..473777f0e04f2 100644
--- a/llvm/include/llvm/XRay/FDRRecordConsumer.h
+++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h
@@ -8,6 +8,7 @@
 #ifndef LLVM_XRAY_FDRRECORDCONSUMER_H
 #define LLVM_XRAY_FDRRECORDCONSUMER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/FDRRecords.h"
 #include <algorithm>
@@ -25,7 +26,7 @@ class RecordConsumer {
 
 // This consumer will collect all the records into a vector of records, in
 // arrival order.
-class LogBuilderConsumer : public RecordConsumer {
+class LLVM_ABI LogBuilderConsumer : public RecordConsumer {
   std::vector<std::unique_ptr<Record>> &Records;
 
 public:
@@ -38,7 +39,7 @@ class LogBuilderConsumer : public RecordConsumer {
 // A PipelineConsumer applies a set of visitors to every consumed Record, in the
 // order by which the visitors are added to the pipeline in the order of
 // appearance.
-class PipelineConsumer : public RecordConsumer {
+class LLVM_ABI PipelineConsumer : public RecordConsumer {
   std::vector<RecordVisitor *> Visitors;
 
 public:
diff --git a/llvm/include/llvm/XRay/FDRRecordProducer.h b/llvm/include/llvm/XRay/FDRRecordProducer.h
index 25c123aec1b29..083b57139d397 100644
--- a/llvm/include/llvm/XRay/FDRRecordProducer.h
+++ b/llvm/include/llvm/XRay/FDRRecordProducer.h
@@ -8,6 +8,7 @@
 #ifndef LLVM_XRAY_FDRRECORDPRODUCER_H
 #define LLVM_XRAY_FDRRECORDPRODUCER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/XRayRecord.h"
@@ -24,7 +25,7 @@ class RecordProducer {
   virtual ~RecordProducer() = default;
 };
 
-class FileBasedRecordProducer : public RecordProducer {
+class LLVM_ABI FileBasedRecordProducer : public RecordProducer {
   const XRayFileHeader &Header;
   DataExtractor &E;
   uint64_t &OffsetPtr;
diff --git a/llvm/include/llvm/XRay/FDRRecords.h b/llvm/include/llvm/XRay/FDRRecords.h
index 8af88f5b0e132..7ee8db61b2106 100644
--- a/llvm/include/llvm/XRay/FDRRecords.h
+++ b/llvm/include/llvm/XRay/FDRRecords.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_XRAY_FDRRECORDS_H
 #define LLVM_XRAY_FDRRECORDS_H
 
+#include "llvm/Support/Compiler.h"
 #include <cstdint>
 #include <string>
 
@@ -47,7 +48,7 @@ class Record {
     RK_Function,
   };
 
-  static StringRef kindToString(RecordKind K);
+  LLVM_ABI static StringRef kindToString(RecordKind K);
 
 private:
   const RecordKind T;
@@ -107,7 +108,7 @@ class MetadataRecord : public Record {
 // What follows are specific Metadata record types which encapsulate the
 // information associated with specific metadata record types in an FDR mode
 // log.
-class BufferExtents : public MetadataRecord {
+class LLVM_ABI BufferExtents : public MetadataRecord {
   uint64_t Size = 0;
   friend class RecordInitializer;
 
@@ -130,7 +131,7 @@ class BufferExtents : public MetadataRecord {
   }
 };
 
-class WallclockRecord : public MetadataRecord {
+class LLVM_ABI WallclockRecord : public MetadataRecord {
   uint64_t Seconds = 0;
   uint32_t Nanos = 0;
   friend class RecordInitializer;
@@ -155,7 +156,7 @@ class WallclockRecord : public MetadataRecord {
   }
 };
 
-class NewCPUIDRecord : public MetadataRecord {
+class LLVM_ABI NewCPUIDRecord : public MetadataRecord {
   uint16_t CPUId = 0;
   uint64_t TSC = 0;
   friend class RecordInitializer;
@@ -181,7 +182,7 @@ class NewCPUIDRecord : public MetadataRecord {
   }
 };
 
-class TSCWrapRecord : public MetadataRecord {
+class LLVM_ABI TSCWrapRecord : public MetadataRecord {
   uint64_t BaseTSC = 0;
   friend class RecordInitializer;
 
@@ -203,7 +204,7 @@ class TSCWrapRecord : public MetadataRecord {
   }
 };
 
-class CustomEventRecord : public MetadataRecord {
+class LLVM_ABI CustomEventRecord : public MetadataRecord {
   int32_t Size = 0;
   uint64_t TSC = 0;
   uint16_t CPU = 0;
@@ -232,7 +233,7 @@ class CustomEventRecord : public MetadataRecord {
   }
 };
 
-class CustomEventRecordV5 : public MetadataRecord {
+class LLVM_ABI CustomEventRecordV5 : public MetadataRecord {
   int32_t Size = 0;
   int32_t Delta = 0;
   std::string Data{};
@@ -259,7 +260,7 @@ class CustomEventRecordV5 : public MetadataRecord {
   }
 };
 
-class TypedEventRecord : public MetadataRecord {
+class LLVM_ABI TypedEventRecord : public MetadataRecord {
   int32_t Size = 0;
   int32_t Delta = 0;
   uint16_t EventType = 0;
@@ -288,7 +289,7 @@ class TypedEventRecord : public MetadataRecord {
   }
 };
 
-class CallArgRecord : public MetadataRecord {
+class LLVM_ABI CallArgRecord : public MetadataRecord {
   uint64_t Arg = 0;
   friend class RecordInitializer;
 
@@ -310,7 +311,7 @@ class CallArgRecord : public MetadataRecord {
   }
 };
 
-class PIDRecord : public MetadataRecord {
+class LLVM_ABI PIDRecord : public MetadataRecord {
   int32_t PID = 0;
   friend class RecordInitializer;
 
@@ -333,7 +334,7 @@ class PIDRecord : public MetadataRecord {
   }
 };
 
-class NewBufferRecord : public MetadataRecord {
+class LLVM_ABI NewBufferRecord : public MetadataRecord {
   int32_t TID = 0;
   friend class RecordInitializer;
 
@@ -356,7 +357,7 @@ class NewBufferRecord : public MetadataRecord {
   }
 };
 
-class EndBufferRecord : public MetadataRecord {
+class LLVM_ABI EndBufferRecord : public MetadataRecord {
 public:
   EndBufferRecord()
       : MetadataRecord(RecordKind::RK_Metadata_EndOfBuffer,
@@ -369,7 +370,7 @@ class EndBufferRecord : public MetadataRecord {
   }
 };
 
-class FunctionRecord : public Record {
+class LLVM_ABI FunctionRecord : public Record {
   RecordTypes Kind;
   int32_t FuncId = 0;
   uint32_t Delta = 0;
@@ -415,7 +416,7 @@ class RecordVisitor {
   virtual Error visit(TypedEventRecord &) = 0;
 };
 
-class RecordInitializer : public RecordVisitor {
+class LLVM_ABI RecordInitializer : public RecordVisitor {
   DataExtractor &E;
   uint64_t &OffsetPtr;
   uint16_t Version;
diff --git a/llvm/include/llvm/XRay/FDRTraceWriter.h b/llvm/include/llvm/XRay/FDRTraceWriter.h
index 40d5f5af91c92..a3dc58e03333e 100644
--- a/llvm/include/llvm/XRay/FDRTraceWriter.h
+++ b/llvm/include/llvm/XRay/FDRTraceWriter.h
@@ -12,8 +12,9 @@
 #ifndef LLVM_XRAY_FDRTRACEWRITER_H
 #define LLVM_XRAY_FDRTRACEWRITER_H
 
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/EndianStream.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/XRay/FDRRecords.h"
 #include "llvm/XRay/XRayRecord.h"
 
@@ -26,7 +27,7 @@ namespace xray {
 /// generate various kinds of execution traces without using the XRay runtime.
 /// Note that this writer does not do any validation, but uses the types of
 /// records defined in the FDRRecords.h file.
-class FDRTraceWriter : public RecordVisitor {
+class LLVM_ABI FDRTraceWriter : public RecordVisitor {
 public:
   // Construct an FDRTraceWriter associated with an output stream.
   explicit FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H);
diff --git a/llvm/include/llvm/XRay/FileHeaderReader.h b/llvm/include/llvm/XRay/FileHeaderReader.h
index 485d26d71456b..ecdb975a30661 100644
--- a/llvm/include/llvm/XRay/FileHeaderReader.h
+++ b/llvm/include/llvm/XRay/FileHeaderReader.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_XRAY_FILEHEADERREADER_H
 #define LLVM_XRAY_FILEHEADERREADER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/XRayRecord.h"
@@ -23,8 +24,8 @@ namespace xray {
 
 /// Convenience function for loading the file header given a data extractor at a
 /// specified offset.
-Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
-                                                uint64_t &OffsetPtr);
+LLVM_ABI Expected<XRayFileHeader>
+readBinaryFormatHeader(DataExtractor &HeaderExtractor, uint64_t &OffsetPtr);
 
 } // namespace xray
 } // namespace llvm
diff --git a/llvm/include/llvm/XRay/InstrumentationMap.h b/llvm/include/llvm/XRay/InstrumentationMap.h
index 1979108ff4133..54737e226df89 100644
--- a/llvm/include/llvm/XRay/InstrumentationMap.h
+++ b/llvm/include/llvm/XRay/InstrumentationMap.h
@@ -15,6 +15,7 @@
 #define LLVM_XRAY_INSTRUMENTATIONMAP_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
@@ -31,7 +32,8 @@ class InstrumentationMap;
 
 /// Loads the instrumentation map from |Filename|. This auto-deduces the type of
 /// the instrumentation map.
-Expected<InstrumentationMap> loadInstrumentationMap(StringRef Filename);
+LLVM_ABI Expected<InstrumentationMap>
+loadInstrumentationMap(StringRef Filename);
 
 /// Represents an XRay instrumentation sled entry from an object file.
 struct SledEntry {
@@ -83,17 +85,18 @@ class InstrumentationMap {
   FunctionAddressMap FunctionAddresses;
   FunctionAddressReverseMap FunctionIds;
 
-  friend Expected<InstrumentationMap> loadInstrumentationMap(StringRef);
+  LLVM_ABI_FRIEND friend Expected<InstrumentationMap>
+      loadInstrumentationMap(StringRef);
 
 public:
   /// Provides a raw accessor to the unordered map of function addresses.
   const FunctionAddressMap &getFunctionAddresses() { return FunctionAddresses; }
 
   /// Returns an XRay computed function id, provided a function address.
-  std::optional<int32_t> getFunctionId(uint64_t Addr) const;
+  LLVM_ABI std::optional<int32_t> getFunctionId(uint64_t Addr) const;
 
   /// Returns the function address for a function id.
-  std::optional<uint64_t> getFunctionAddr(int32_t FuncId) const;
+  LLVM_ABI std::optional<uint64_t> getFunctionAddr(int32_t FuncId) const;
 
   /// Provide read-only access to the entries of the instrumentation map.
   const SledContainer &sleds() const { return Sleds; };
diff --git a/llvm/include/llvm/XRay/Profile.h b/llvm/include/llvm/XRay/Profile.h
index 79d9b53387f39..e30c01e489d33 100644
--- a/llvm/include/llvm/XRay/Profile.h
+++ b/llvm/include/llvm/XRay/Profile.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include <list>
 #include <utility>
@@ -34,18 +35,18 @@ class Trace;
 ///
 /// For any errors encountered in the loading of the profile data from
 /// |Filename|, this function will return an Error condition appropriately.
-Expected<Profile> loadProfile(StringRef Filename);
+LLVM_ABI Expected<Profile> loadProfile(StringRef Filename);
 
 /// This algorithm will merge two Profile instances into a single Profile
 /// instance, aggregating blocks by Thread ID.
-Profile mergeProfilesByThread(const Profile &L, const Profile &R);
+LLVM_ABI Profile mergeProfilesByThread(const Profile &L, const Profile &R);
 
 /// This algorithm will merge two Profile instances into a single Profile
 /// instance, aggregating blocks by function call stack.
-Profile mergeProfilesByStack(const Profile &L, const Profile &R);
+LLVM_ABI Profile mergeProfilesByStack(const Profile &L, const Profile &R);
 
 /// This function takes a Trace and creates a Profile instance from it.
-Expected<Profile> profileFromTrace(const Trace &T);
+LLVM_ABI Expected<Profile> profileFromTrace(const Trace &T);
 
 /// Profile instances are thread-compatible.
 class Profile {
@@ -68,11 +69,11 @@ class Profile {
   ///
   /// Returns an error if |P| had not been interned before into the Profile.
   ///
-  Expected<std::vector<FuncID>> expandPath(PathID P) const;
+  LLVM_ABI Expected<std::vector<FuncID>> expandPath(PathID P) const;
 
   /// The stack represented in |P| must be in stack order (leaf to root). This
   /// will always return the same PathID for |P| that has the same sequence.
-  PathID internPath(ArrayRef<FuncID> P);
+  LLVM_ABI PathID internPath(ArrayRef<FuncID> P);
 
   /// Appends a fully-formed Block instance into the Profile.
   ///
@@ -80,7 +81,7 @@ class Profile {
   ///
   ///    - The PathData component of the Block is empty
   ///
-  Error addBlock(Block &&B);
+  LLVM_ABI Error addBlock(Block &&B);
 
   Profile() = default;
   ~Profile() = default;
@@ -99,8 +100,8 @@ class Profile {
     return *this;
   }
 
-  Profile(const Profile &);
-  Profile &operator=(const Profile &);
+  LLVM_ABI Profile(const Profile &);
+  LLVM_ABI Profile &operator=(const Profile &);
 
   friend void swap(Profile &L, Profile &R) {
     using std::swap;
diff --git a/llvm/include/llvm/XRay/RecordPrinter.h b/llvm/include/llvm/XRay/RecordPrinter.h
index 8ca4794dce5e2..5d2c27757255a 100644
--- a/llvm/include/llvm/XRay/RecordPrinter.h
+++ b/llvm/include/llvm/XRay/RecordPrinter.h
@@ -13,13 +13,14 @@
 #ifndef LLVM_XRAY_RECORDPRINTER_H
 #define LLVM_XRAY_RECORDPRINTER_H
 
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/XRay/FDRRecords.h"
 
 namespace llvm {
 namespace xray {
 
-class RecordPrinter : public RecordVisitor {
+class LLVM_ABI RecordPrinter : public RecordVisitor {
   raw_ostream &OS;
   std::string Delim;
 
diff --git a/llvm/include/llvm/XRay/Trace.h b/llvm/include/llvm/XRay/Trace.h
index eb1f03b2a0d4a..af1d35c67817b 100644
--- a/llvm/include/llvm/XRay/Trace.h
+++ b/llvm/include/llvm/XRay/Trace.h
@@ -16,6 +16,7 @@
 #include <vector>
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Error.h"
 #include "llvm/XRay/XRayRecord.h"
@@ -50,7 +51,7 @@ class Trace {
 
   typedef std::vector<XRayRecord>::const_iterator citerator;
 
-  friend Expected<Trace> loadTrace(const DataExtractor &, bool);
+  LLVM_ABI_FRIEND friend Expected<Trace> loadTrace(const DataExtractor &, bool);
 
 public:
   using size_type = RecordVector::size_type;
@@ -68,11 +69,12 @@ class Trace {
 
 /// This function will attempt to load XRay trace records from the provided
 /// |Filename|.
-Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
+LLVM_ABI Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
 
 /// This function will attempt to load XRay trace records from the provided
 /// DataExtractor.
-Expected<Trace> loadTrace(const DataExtractor &Extractor, bool Sort = false);
+LLVM_ABI Expected<Trace> loadTrace(const DataExtractor &Extractor,
+                                   bool Sort = false);
 
 } // namespace xray
 } // namespace llvm

From 695c4f2309718c441bc2e5b7dd3e3267737a12e6 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev@gmail.com>
Date: Mon, 16 Jun 2025 14:04:30 -0400
Subject: [PATCH 620/851] [NFC][mlir][tensor] Use `ValueRange` instead of
 `SmallVector` in `tensor::createPadHighOp` (#144397)

Use `ValueRange` instead of `SmallVector` in `tensor::createPadHighOp`
for the `dynOutDims` arg.
---
 mlir/include/mlir/Dialect/Tensor/Utils/Utils.h | 2 +-
 mlir/lib/Dialect/Tensor/Utils/Utils.cpp        | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
index 1a4733df3f187..a1ce4e252c2f4 100644
--- a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h
@@ -30,7 +30,7 @@ namespace tensor {
 // for _static_ dimensions.
 PadOp createPadHighOp(RankedTensorType resType, Value source, Value pad,
                       bool nofold, Location loc, OpBuilder &builder,
-                      SmallVector<Value> dynOutDims = {});
+                      ValueRange dynOutDims = std::nullopt);
 
 // Creates dim ops for each dynamic dimension of the ranked tensor argument and
 // returns these as values.
diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
index 11ae0108594dd..289296a07d9d3 100644
--- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp
@@ -24,8 +24,7 @@ using namespace mlir::tensor;
 
 PadOp mlir::tensor::createPadHighOp(RankedTensorType resType, Value source,
                                     Value pad, bool nofold, Location loc,
-                                    OpBuilder &b,
-                                    SmallVector<Value> dynOutDims) {
+                                    OpBuilder &b, ValueRange dynOutDims) {
 
   // This assumption simplifies the following logic without limiting what's
   // required _today_. If needed, we can relax it in the future.

From 492d25bbe12af7702a392fa7ad41eb9e09a48cf2 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Mon, 16 Jun 2025 11:04:56 -0700
Subject: [PATCH 621/851] [llvm] annotate interfaces in llvm/ObjectYAML for DLL
 export (#143763)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/ObjectYAML`
library. These annotations currently have no meaningful impact on the
LLVM build; however, they are a prerequisite to support an LLVM Windows
DLL (shared library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

These were generated automatically using the [Interface Definition
Scanner (IDS)](https://github.com/compnerd/ids) tool, followed
formatting with `git clang-format`.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 .../ObjectYAML/CodeViewYAMLDebugSections.h    | 12 +--
 .../llvm/ObjectYAML/CodeViewYAMLSymbols.h     |  6 +-
 .../llvm/ObjectYAML/CodeViewYAMLTypes.h       | 14 ++--
 llvm/include/llvm/ObjectYAML/DWARFEmitter.h   | 39 ++++-----
 llvm/include/llvm/ObjectYAML/DWARFYAML.h      | 64 ++++++++-------
 .../include/llvm/ObjectYAML/DXContainerYAML.h | 81 ++++++++++---------
 llvm/include/llvm/ObjectYAML/YAML.h           |  9 ++-
 llvm/include/llvm/ObjectYAML/yaml2obj.h       | 42 ++++++----
 8 files changed, 148 insertions(+), 119 deletions(-)

diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
index 6c712956dfb5d..4e7984c54a72a 100644
--- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
+++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLDebugSections.h
@@ -19,6 +19,7 @@
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
@@ -108,23 +109,24 @@ struct InlineeInfo {
 };
 
 struct YAMLDebugSubsection {
-  static Expected<YAMLDebugSubsection>
+  LLVM_ABI static Expected<YAMLDebugSubsection>
   fromCodeViewSubection(const codeview::StringsAndChecksumsRef &SC,
                         const codeview::DebugSubsectionRecord &SS);
 
   std::shared_ptr<detail::YAMLSubsectionBase> Subsection;
 };
 
-Expected<std::vector<std::shared_ptr<codeview::DebugSubsection>>>
+LLVM_ABI Expected<std::vector<std::shared_ptr<codeview::DebugSubsection>>>
 toCodeViewSubsectionList(BumpPtrAllocator &Allocator,
                          ArrayRef<YAMLDebugSubsection> Subsections,
                          const codeview::StringsAndChecksums &SC);
 
-std::vector<YAMLDebugSubsection>
+LLVM_ABI std::vector<YAMLDebugSubsection>
 fromDebugS(ArrayRef<uint8_t> Data, const codeview::StringsAndChecksumsRef &SC);
 
-void initializeStringsAndChecksums(ArrayRef<YAMLDebugSubsection> Sections,
-                                   codeview::StringsAndChecksums &SC);
+LLVM_ABI void
+initializeStringsAndChecksums(ArrayRef<YAMLDebugSubsection> Sections,
+                              codeview::StringsAndChecksums &SC);
 
 } // end namespace CodeViewYAML
 
diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
index 7c05c9eea05ed..dccc77dc1a0c5 100644
--- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
+++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLSymbols.h
@@ -16,6 +16,7 @@
 
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <memory>
@@ -32,11 +33,12 @@ struct SymbolRecordBase;
 struct SymbolRecord {
   std::shared_ptr<detail::SymbolRecordBase> Symbol;
 
-  codeview::CVSymbol
+  LLVM_ABI codeview::CVSymbol
   toCodeViewSymbol(BumpPtrAllocator &Allocator,
                    codeview::CodeViewContainer Container) const;
 
-  static Expected<SymbolRecord> fromCodeViewSymbol(codeview::CVSymbol Symbol);
+  LLVM_ABI static Expected<SymbolRecord>
+  fromCodeViewSymbol(codeview::CVSymbol Symbol);
 };
 
 } // end namespace CodeViewYAML
diff --git a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
index 04b5e0ba3aa1a..3c239ce507dfc 100644
--- a/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
+++ b/llvm/include/llvm/ObjectYAML/CodeViewYAMLTypes.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
@@ -45,15 +46,16 @@ struct MemberRecord {
 struct LeafRecord {
   std::shared_ptr<detail::LeafRecordBase> Leaf;
 
-  codeview::CVType
+  LLVM_ABI codeview::CVType
   toCodeViewRecord(codeview::AppendingTypeTableBuilder &Serializer) const;
-  static Expected<LeafRecord> fromCodeViewRecord(codeview::CVType Type);
+  LLVM_ABI static Expected<LeafRecord>
+  fromCodeViewRecord(codeview::CVType Type);
 };
 
-std::vector<LeafRecord> fromDebugT(ArrayRef<uint8_t> DebugTorP,
-                                   StringRef SectionName);
-ArrayRef<uint8_t> toDebugT(ArrayRef<LeafRecord>, BumpPtrAllocator &Alloc,
-                           StringRef SectionName);
+LLVM_ABI std::vector<LeafRecord> fromDebugT(ArrayRef<uint8_t> DebugTorP,
+                                            StringRef SectionName);
+LLVM_ABI ArrayRef<uint8_t>
+toDebugT(ArrayRef<LeafRecord>, BumpPtrAllocator &Alloc, StringRef SectionName);
 
 } // end namespace CodeViewYAML
 
diff --git a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
index 5e1b88f4fef64..050ff60bcd408 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFEmitter.h
@@ -14,6 +14,7 @@
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/TargetParser/Host.h"
@@ -27,26 +28,26 @@ namespace DWARFYAML {
 
 struct Data;
 
-Error emitDebugAbbrev(raw_ostream &OS, const Data &DI);
-Error emitDebugStr(raw_ostream &OS, const Data &DI);
-
-Error emitDebugAranges(raw_ostream &OS, const Data &DI);
-Error emitDebugRanges(raw_ostream &OS, const Data &DI);
-Error emitDebugPubnames(raw_ostream &OS, const Data &DI);
-Error emitDebugPubtypes(raw_ostream &OS, const Data &DI);
-Error emitDebugGNUPubnames(raw_ostream &OS, const Data &DI);
-Error emitDebugGNUPubtypes(raw_ostream &OS, const Data &DI);
-Error emitDebugInfo(raw_ostream &OS, const Data &DI);
-Error emitDebugLine(raw_ostream &OS, const Data &DI);
-Error emitDebugAddr(raw_ostream &OS, const Data &DI);
-Error emitDebugStrOffsets(raw_ostream &OS, const Data &DI);
-Error emitDebugRnglists(raw_ostream &OS, const Data &DI);
-Error emitDebugLoclists(raw_ostream &OS, const Data &DI);
-Error emitDebugNames(raw_ostream &OS, const Data &DI);
-
-std::function<Error(raw_ostream &, const Data &)>
+LLVM_ABI Error emitDebugAbbrev(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugStr(raw_ostream &OS, const Data &DI);
+
+LLVM_ABI Error emitDebugAranges(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugRanges(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugPubnames(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugPubtypes(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugGNUPubnames(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugGNUPubtypes(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugInfo(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugLine(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugAddr(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugStrOffsets(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugRnglists(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugLoclists(raw_ostream &OS, const Data &DI);
+LLVM_ABI Error emitDebugNames(raw_ostream &OS, const Data &DI);
+
+LLVM_ABI std::function<Error(raw_ostream &, const Data &)>
 getDWARFEmitterByName(StringRef SecName);
-Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
+LLVM_ABI Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
 emitDebugSections(StringRef YAMLString,
                   bool IsLittleEndian = sys::IsLittleEndianHost,
                   bool Is64BitAddrSize = true);
diff --git a/llvm/include/llvm/ObjectYAML/DWARFYAML.h b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
index 69f8c4f27d7a3..c8528686592ab 100644
--- a/llvm/include/llvm/ObjectYAML/DWARFYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DWARFYAML.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
 #include <optional>
@@ -255,16 +256,16 @@ struct Data {
   std::optional<std::vector<ListTable<LoclistEntry>>> DebugLoclists;
   std::optional<DebugNamesSection> DebugNames;
 
-  bool isEmpty() const;
+  LLVM_ABI bool isEmpty() const;
 
-  SetVector<StringRef> getNonEmptySectionNames() const;
+  LLVM_ABI SetVector<StringRef> getNonEmptySectionNames() const;
 
   struct AbbrevTableInfo {
     uint64_t Index;
     uint64_t Offset;
   };
-  Expected<AbbrevTableInfo> getAbbrevTableInfoByID(uint64_t ID) const;
-  StringRef getAbbrevTableContentByIndex(uint64_t Index) const;
+  LLVM_ABI Expected<AbbrevTableInfo> getAbbrevTableInfoByID(uint64_t ID) const;
+  LLVM_ABI StringRef getAbbrevTableContentByIndex(uint64_t Index) const;
 
 private:
   mutable std::unordered_map<uint64_t, AbbrevTableInfo> AbbrevTableInfoMap;
@@ -310,88 +311,90 @@ namespace llvm {
 namespace yaml {
 
 template <> struct MappingTraits<DWARFYAML::Data> {
-  static void mapping(IO &IO, DWARFYAML::Data &DWARF);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Data &DWARF);
 };
 
 template <> struct MappingTraits<DWARFYAML::AbbrevTable> {
-  static void mapping(IO &IO, DWARFYAML::AbbrevTable &AbbrevTable);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::AbbrevTable &AbbrevTable);
 };
 
 template <> struct MappingTraits<DWARFYAML::Abbrev> {
-  static void mapping(IO &IO, DWARFYAML::Abbrev &Abbrev);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Abbrev &Abbrev);
 };
 
 template <> struct MappingTraits<DWARFYAML::AttributeAbbrev> {
-  static void mapping(IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev);
 };
 
 template <> struct MappingTraits<DWARFYAML::ARangeDescriptor> {
-  static void mapping(IO &IO, DWARFYAML::ARangeDescriptor &Descriptor);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::ARangeDescriptor &Descriptor);
 };
 
 template <> struct MappingTraits<DWARFYAML::ARange> {
-  static void mapping(IO &IO, DWARFYAML::ARange &ARange);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::ARange &ARange);
 };
 
 template <> struct MappingTraits<DWARFYAML::RangeEntry> {
-  static void mapping(IO &IO, DWARFYAML::RangeEntry &Entry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::RangeEntry &Entry);
 };
 
 template <> struct MappingTraits<DWARFYAML::Ranges> {
-  static void mapping(IO &IO, DWARFYAML::Ranges &Ranges);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Ranges &Ranges);
 };
 
 template <> struct MappingTraits<DWARFYAML::PubEntry> {
-  static void mapping(IO &IO, DWARFYAML::PubEntry &Entry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::PubEntry &Entry);
 };
 
 template <> struct MappingTraits<DWARFYAML::PubSection> {
-  static void mapping(IO &IO, DWARFYAML::PubSection &Section);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::PubSection &Section);
 };
 
 template <> struct MappingTraits<DWARFYAML::Unit> {
-  static void mapping(IO &IO, DWARFYAML::Unit &Unit);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Unit &Unit);
 };
 
 template <> struct MappingTraits<DWARFYAML::DebugNamesSection> {
-  static void mapping(IO &IO, DWARFYAML::DebugNamesSection &);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNamesSection &);
 };
 template <> struct MappingTraits<DWARFYAML::DebugNameEntry> {
-  static void mapping(IO &IO, DWARFYAML::DebugNameEntry &);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNameEntry &);
 };
 template <> struct MappingTraits<DWARFYAML::DebugNameAbbreviation> {
-  static void mapping(IO &IO, DWARFYAML::DebugNameAbbreviation &);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::DebugNameAbbreviation &);
 };
 template <> struct MappingTraits<DWARFYAML::IdxForm> {
-  static void mapping(IO &IO, DWARFYAML::IdxForm &);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::IdxForm &);
 };
 
 template <> struct MappingTraits<DWARFYAML::Entry> {
-  static void mapping(IO &IO, DWARFYAML::Entry &Entry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::Entry &Entry);
 };
 
 template <> struct MappingTraits<DWARFYAML::FormValue> {
-  static void mapping(IO &IO, DWARFYAML::FormValue &FormValue);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::FormValue &FormValue);
 };
 
 template <> struct MappingTraits<DWARFYAML::File> {
-  static void mapping(IO &IO, DWARFYAML::File &File);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::File &File);
 };
 
 template <> struct MappingTraits<DWARFYAML::LineTableOpcode> {
-  static void mapping(IO &IO, DWARFYAML::LineTableOpcode &LineTableOpcode);
+  LLVM_ABI static void mapping(IO &IO,
+                               DWARFYAML::LineTableOpcode &LineTableOpcode);
 };
 
 template <> struct MappingTraits<DWARFYAML::LineTable> {
-  static void mapping(IO &IO, DWARFYAML::LineTable &LineTable);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::LineTable &LineTable);
 };
 
 template <> struct MappingTraits<DWARFYAML::SegAddrPair> {
-  static void mapping(IO &IO, DWARFYAML::SegAddrPair &SegAddrPair);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::SegAddrPair &SegAddrPair);
 };
 
 template <> struct MappingTraits<DWARFYAML::DWARFOperation> {
-  static void mapping(IO &IO, DWARFYAML::DWARFOperation &DWARFOperation);
+  LLVM_ABI static void mapping(IO &IO,
+                               DWARFYAML::DWARFOperation &DWARFOperation);
 };
 
 template <typename EntryType>
@@ -407,19 +410,20 @@ struct MappingTraits<DWARFYAML::ListEntries<EntryType>> {
 };
 
 template <> struct MappingTraits<DWARFYAML::RnglistEntry> {
-  static void mapping(IO &IO, DWARFYAML::RnglistEntry &RnglistEntry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::RnglistEntry &RnglistEntry);
 };
 
 template <> struct MappingTraits<DWARFYAML::LoclistEntry> {
-  static void mapping(IO &IO, DWARFYAML::LoclistEntry &LoclistEntry);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::LoclistEntry &LoclistEntry);
 };
 
 template <> struct MappingTraits<DWARFYAML::AddrTableEntry> {
-  static void mapping(IO &IO, DWARFYAML::AddrTableEntry &AddrTable);
+  LLVM_ABI static void mapping(IO &IO, DWARFYAML::AddrTableEntry &AddrTable);
 };
 
 template <> struct MappingTraits<DWARFYAML::StringOffsetsTable> {
-  static void mapping(IO &IO, DWARFYAML::StringOffsetsTable &StrOffsetsTable);
+  LLVM_ABI static void mapping(IO &IO,
+                               DWARFYAML::StringOffsetsTable &StrOffsetsTable);
 };
 
 template <> struct ScalarEnumerationTraits<dwarf::DwarfFormat> {
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index 8a0dfd8718796..c235112dacf7c 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -19,6 +19,7 @@
 #include "llvm/BinaryFormat/DXContainer.h"
 #include "llvm/Object/DXContainer.h"
 #include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <array>
 #include <optional>
@@ -59,14 +60,14 @@ struct DXILProgram {
 #define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) bool Val = false;
 struct ShaderFeatureFlags {
   ShaderFeatureFlags() = default;
-  ShaderFeatureFlags(uint64_t FlagData);
-  uint64_t getEncodedFlags();
+  LLVM_ABI ShaderFeatureFlags(uint64_t FlagData);
+  LLVM_ABI uint64_t getEncodedFlags();
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 };
 
 struct ShaderHash {
   ShaderHash() = default;
-  ShaderHash(const dxbc::ShaderHash &Data);
+  LLVM_ABI ShaderHash(const dxbc::ShaderHash &Data);
 
   bool IncludesSource;
   std::vector<llvm::yaml::Hex8> Digest;
@@ -84,7 +85,7 @@ struct RootDescriptorYaml {
   uint32_t ShaderRegister;
   uint32_t RegisterSpace;
 
-  uint32_t getEncodedFlags() const;
+  LLVM_ABI uint32_t getEncodedFlags() const;
 
 #define ROOT_DESCRIPTOR_FLAG(Num, Val) bool Val = false;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
@@ -97,7 +98,7 @@ struct DescriptorRangeYaml {
   uint32_t RegisterSpace;
   uint32_t OffsetInDescriptorsFromTableStart;
 
-  uint32_t getEncodedFlags() const;
+  LLVM_ABI uint32_t getEncodedFlags() const;
 
 #define DESCRIPTOR_RANGE_FLAG(Num, Val) bool Val = false;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
@@ -193,13 +194,13 @@ struct RootSignatureYamlDesc {
   RootParameterYamlDesc Parameters;
   SmallVector<StaticSamplerYamlDesc> StaticSamplers;
 
-  uint32_t getEncodedFlags();
+  LLVM_ABI uint32_t getEncodedFlags();
 
   iterator_range<StaticSamplerYamlDesc *> samplers() {
     return make_range(StaticSamplers.begin(), StaticSamplers.end());
   }
 
-  static llvm::Expected<DXContainerYAML::RootSignatureYamlDesc>
+  LLVM_ABI static llvm::Expected<DXContainerYAML::RootSignatureYamlDesc>
   create(const object::DirectX::RootSignature &Data);
 
 #define ROOT_ELEMENT_FLAG(Num, Val) bool Val = false;
@@ -258,13 +259,13 @@ struct PSVInfo {
 
   StringRef EntryName;
 
-  void mapInfoForVersion(yaml::IO &IO);
+  LLVM_ABI void mapInfoForVersion(yaml::IO &IO);
 
-  PSVInfo();
-  PSVInfo(const dxbc::PSV::v0::RuntimeInfo *P, uint16_t Stage);
-  PSVInfo(const dxbc::PSV::v1::RuntimeInfo *P);
-  PSVInfo(const dxbc::PSV::v2::RuntimeInfo *P);
-  PSVInfo(const dxbc::PSV::v3::RuntimeInfo *P, StringRef StringTable);
+  LLVM_ABI PSVInfo();
+  LLVM_ABI PSVInfo(const dxbc::PSV::v0::RuntimeInfo *P, uint16_t Stage);
+  LLVM_ABI PSVInfo(const dxbc::PSV::v1::RuntimeInfo *P);
+  LLVM_ABI PSVInfo(const dxbc::PSV::v2::RuntimeInfo *P);
+  LLVM_ABI PSVInfo(const dxbc::PSV::v3::RuntimeInfo *P, StringRef StringTable);
 };
 
 struct SignatureParameter {
@@ -328,88 +329,96 @@ class raw_ostream;
 namespace yaml {
 
 template <> struct MappingTraits<DXContainerYAML::VersionTuple> {
-  static void mapping(IO &IO, DXContainerYAML::VersionTuple &Version);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::VersionTuple &Version);
 };
 
 template <> struct MappingTraits<DXContainerYAML::FileHeader> {
-  static void mapping(IO &IO, DXContainerYAML::FileHeader &Header);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::FileHeader &Header);
 };
 
 template <> struct MappingTraits<DXContainerYAML::DXILProgram> {
-  static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ShaderFeatureFlags> {
-  static void mapping(IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags);
+  LLVM_ABI static void mapping(IO &IO,
+                               DXContainerYAML::ShaderFeatureFlags &Flags);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ShaderHash> {
-  static void mapping(IO &IO, DXContainerYAML::ShaderHash &Hash);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ShaderHash &Hash);
 };
 
 template <> struct MappingTraits<DXContainerYAML::PSVInfo> {
-  static void mapping(IO &IO, DXContainerYAML::PSVInfo &PSV);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::PSVInfo &PSV);
 };
 
 template <> struct MappingTraits<DXContainerYAML::Part> {
-  static void mapping(IO &IO, DXContainerYAML::Part &Version);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::Part &Version);
 };
 
 template <> struct MappingTraits<DXContainerYAML::Object> {
-  static void mapping(IO &IO, DXContainerYAML::Object &Obj);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::Object &Obj);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ResourceFlags> {
-  static void mapping(IO &IO, DXContainerYAML::ResourceFlags &Flags);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ResourceFlags &Flags);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ResourceBindInfo> {
-  static void mapping(IO &IO, DXContainerYAML::ResourceBindInfo &Res);
+  LLVM_ABI static void mapping(IO &IO, DXContainerYAML::ResourceBindInfo &Res);
 };
 
 template <> struct MappingTraits<DXContainerYAML::SignatureElement> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::SignatureElement &El);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::SignatureElement &El);
 };
 
 template <> struct MappingTraits<DXContainerYAML::SignatureParameter> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::SignatureParameter &El);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::SignatureParameter &El);
 };
 
 template <> struct MappingTraits<DXContainerYAML::Signature> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El);
+  LLVM_ABI static void mapping(IO &IO, llvm::DXContainerYAML::Signature &El);
 };
 
 template <> struct MappingTraits<DXContainerYAML::RootSignatureYamlDesc> {
-  static void mapping(IO &IO,
-                      DXContainerYAML::RootSignatureYamlDesc &RootSignature);
+  LLVM_ABI static void
+  mapping(IO &IO, DXContainerYAML::RootSignatureYamlDesc &RootSignature);
 };
 
 template <>
 struct MappingContextTraits<DXContainerYAML::RootParameterLocationYaml,
                             DXContainerYAML::RootSignatureYamlDesc> {
-  static void mapping(IO &IO,
-                      llvm::DXContainerYAML::RootParameterLocationYaml &L,
-                      DXContainerYAML::RootSignatureYamlDesc &S);
+  LLVM_ABI static void
+  mapping(IO &IO, llvm::DXContainerYAML::RootParameterLocationYaml &L,
+          DXContainerYAML::RootSignatureYamlDesc &S);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::RootConstantsYaml> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::RootConstantsYaml &C);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::RootConstantsYaml &C);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::RootDescriptorYaml> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::RootDescriptorYaml &D);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::RootDescriptorYaml &D);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::DescriptorTableYaml> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::DescriptorTableYaml &D);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::DescriptorTableYaml &D);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::DescriptorRangeYaml> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::DescriptorRangeYaml &D);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::DescriptorRangeYaml &D);
 };
 
 template <> struct MappingTraits<llvm::DXContainerYAML::StaticSamplerYamlDesc> {
-  static void mapping(IO &IO, llvm::DXContainerYAML::StaticSamplerYamlDesc &S);
+  LLVM_ABI static void mapping(IO &IO,
+                               llvm::DXContainerYAML::StaticSamplerYamlDesc &S);
 };
 
 } // namespace yaml
diff --git a/llvm/include/llvm/ObjectYAML/YAML.h b/llvm/include/llvm/ObjectYAML/YAML.h
index 3bf6527a7e2da..709520c934d7d 100644
--- a/llvm/include/llvm/ObjectYAML/YAML.h
+++ b/llvm/include/llvm/ObjectYAML/YAML.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/YAMLTraits.h"
 #include <cstdint>
 
@@ -86,13 +87,13 @@ class BinaryRef {
   /// Write the contents (regardless of whether it is binary or a
   /// hex string) as binary to the given raw_ostream.
   /// N can be used to specify the maximum number of bytes.
-  void writeAsBinary(raw_ostream &OS, uint64_t N = UINT64_MAX) const;
+  LLVM_ABI void writeAsBinary(raw_ostream &OS, uint64_t N = UINT64_MAX) const;
 
   /// Write the contents (regardless of whether it is binary or a
   /// hex string) as hex to the given raw_ostream.
   ///
   /// For example, a possible output could be `DEADBEEFCAFEBABE`.
-  void writeAsHex(raw_ostream &OS) const;
+  LLVM_ABI void writeAsHex(raw_ostream &OS) const;
 };
 
 inline bool operator==(const BinaryRef &LHS, const BinaryRef &RHS) {
@@ -104,8 +105,8 @@ inline bool operator==(const BinaryRef &LHS, const BinaryRef &RHS) {
 }
 
 template <> struct ScalarTraits<BinaryRef> {
-  static void output(const BinaryRef &, void *, raw_ostream &);
-  static StringRef input(StringRef, void *, BinaryRef &);
+  LLVM_ABI static void output(const BinaryRef &, void *, raw_ostream &);
+  LLVM_ABI static StringRef input(StringRef, void *, BinaryRef &);
   static QuotingType mustQuote(StringRef S) { return needsQuotes(S); }
 };
 
diff --git a/llvm/include/llvm/ObjectYAML/yaml2obj.h b/llvm/include/llvm/ObjectYAML/yaml2obj.h
index 3b458c3cd890b..4c9084b790507 100644
--- a/llvm/include/llvm/ObjectYAML/yaml2obj.h
+++ b/llvm/include/llvm/ObjectYAML/yaml2obj.h
@@ -12,6 +12,7 @@
 #define LLVM_OBJECTYAML_YAML2OBJ_H
 
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
 #include <memory>
 
 namespace llvm {
@@ -66,25 +67,32 @@ struct YamlObjectFile;
 
 using ErrorHandler = llvm::function_ref<void(const Twine &Msg)>;
 
-bool yaml2archive(ArchYAML::Archive &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2coff(COFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2goff(GOFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2elf(ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH,
-              uint64_t MaxSize);
-bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2minidump(MinidumpYAML::Object &Doc, raw_ostream &Out,
-                   ErrorHandler EH);
-bool yaml2offload(OffloadYAML::Binary &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2xcoff(XCOFFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH);
-bool yaml2dxcontainer(DXContainerYAML::Object &Doc, raw_ostream &Out,
-                      ErrorHandler EH);
-
-bool convertYAML(Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler,
-                 unsigned DocNum = 1, uint64_t MaxSize = UINT64_MAX);
+LLVM_ABI bool yaml2archive(ArchYAML::Archive &Doc, raw_ostream &Out,
+                           ErrorHandler EH);
+LLVM_ABI bool yaml2coff(COFFYAML::Object &Doc, raw_ostream &Out,
+                        ErrorHandler EH);
+LLVM_ABI bool yaml2goff(GOFFYAML::Object &Doc, raw_ostream &Out,
+                        ErrorHandler EH);
+LLVM_ABI bool yaml2elf(ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH,
+                       uint64_t MaxSize);
+LLVM_ABI bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out,
+                         ErrorHandler EH);
+LLVM_ABI bool yaml2minidump(MinidumpYAML::Object &Doc, raw_ostream &Out,
+                            ErrorHandler EH);
+LLVM_ABI bool yaml2offload(OffloadYAML::Binary &Doc, raw_ostream &Out,
+                           ErrorHandler EH);
+LLVM_ABI bool yaml2wasm(WasmYAML::Object &Doc, raw_ostream &Out,
+                        ErrorHandler EH);
+LLVM_ABI bool yaml2xcoff(XCOFFYAML::Object &Doc, raw_ostream &Out,
+                         ErrorHandler EH);
+LLVM_ABI bool yaml2dxcontainer(DXContainerYAML::Object &Doc, raw_ostream &Out,
+                               ErrorHandler EH);
+
+LLVM_ABI bool convertYAML(Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler,
+                          unsigned DocNum = 1, uint64_t MaxSize = UINT64_MAX);
 
 /// Convenience function for tests.
-std::unique_ptr<object::ObjectFile>
+LLVM_ABI std::unique_ptr<object::ObjectFile>
 yaml2ObjectFile(SmallVectorImpl<char> &Storage, StringRef Yaml,
                 ErrorHandler ErrHandler);
 

From fccab5d757778204666d70e2f1592952fc8b336d Mon Sep 17 00:00:00 2001
From: Amr Hesham <amr96@programmer.net>
Date: Mon, 16 Jun 2025 20:10:40 +0200
Subject: [PATCH 622/851] [CIR] Upstream ComplexType ImaginaryLiteral (#144223)

This change adds support for ComplexType ImaginaryLiteral

https://github.com/llvm/llvm-project/issues/141365
---
 clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 30 +++++++++++++++++++++
 clang/test/CIR/CodeGen/complex.cpp          | 29 ++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
index 2ffe75a388e98..26070a6ca307a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -21,6 +21,8 @@ class ComplexExprEmitter : public StmtVisitor<ComplexExprEmitter, mlir::Value> {
                           bool isInit);
 
   mlir::Value VisitInitListExpr(InitListExpr *e);
+
+  mlir::Value VisitImaginaryLiteral(const ImaginaryLiteral *il);
 };
 
 } // namespace
@@ -66,6 +68,34 @@ mlir::Value ComplexExprEmitter::VisitInitListExpr(InitListExpr *e) {
   return builder.create<cir::ConstantOp>(loc, complexAttr);
 }
 
+mlir::Value
+ComplexExprEmitter::VisitImaginaryLiteral(const ImaginaryLiteral *il) {
+  auto ty = mlir::cast<cir::ComplexType>(cgf.convertType(il->getType()));
+  mlir::Type elementTy = ty.getElementType();
+  mlir::Location loc = cgf.getLoc(il->getExprLoc());
+
+  mlir::TypedAttr realValueAttr;
+  mlir::TypedAttr imagValueAttr;
+
+  if (mlir::isa<cir::IntType>(elementTy)) {
+    llvm::APInt imagValue = cast<IntegerLiteral>(il->getSubExpr())->getValue();
+    realValueAttr = cir::IntAttr::get(elementTy, 0);
+    imagValueAttr = cir::IntAttr::get(elementTy, imagValue);
+  } else {
+    assert(mlir::isa<cir::CIRFPTypeInterface>(elementTy) &&
+           "Expected complex element type to be floating-point");
+
+    llvm::APFloat imagValue =
+        cast<FloatingLiteral>(il->getSubExpr())->getValue();
+    realValueAttr = cir::FPAttr::get(
+        elementTy, llvm::APFloat::getZero(imagValue.getSemantics()));
+    imagValueAttr = cir::FPAttr::get(elementTy, imagValue);
+  }
+
+  auto complexAttr = cir::ConstComplexAttr::get(realValueAttr, imagValueAttr);
+  return builder.create<cir::ConstantOp>(loc, complexAttr);
+}
+
 mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) {
   assert(e && getComplexType(e->getType()) &&
          "Invalid complex expression to emit");
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index d193b9f32efbc..db0b9111ab4fb 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -176,3 +176,32 @@ void foo7() {
 // OGCG: store float %[[TMP_A]], ptr %[[C_REAL_PTR]], align 4
 // OGCG: store float 2.000000e+00, ptr %[[C_IMAG_PTR]], align 4
 
+void foo8() {
+  double _Complex c = 2.00i;
+}
+
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.fp<0.000000e+00> : !cir.double, #cir.fp<2.000000e+00> : !cir.double> : !cir.complex<!cir.double>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { double, double }, i64 1, align 8
+// LLVM: store { double, double } { double 0.000000e+00, double 2.000000e+00 }, ptr %[[COMPLEX]], align 8
+
+// OGCG: %[[COMPLEX:.*]] = alloca { double, double }, align 8
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store double 0.000000e+00, ptr %[[C_REAL_PTR]], align 8
+// OGCG: store double 2.000000e+00, ptr %[[C_IMAG_PTR]], align 8
+
+void foo14() {
+  int _Complex c = 2i;
+}
+
+// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<0> : !s32i, #cir.int<2> : !s32i> : !cir.complex<!s32i>
+
+// LLVM: %[[COMPLEX:.*]] = alloca { i32, i32 }, i64 1, align 4
+// LLVM: store { i32, i32 } { i32 0, i32 2 }, ptr %[[COMPLEX]], align 4
+
+// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4
+// OGCG: %[[C_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0
+// OGCG: %[[C_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1
+// OGCG: store i32 0, ptr %[[C_REAL_PTR]], align 4
+// OGCG: store i32 2, ptr %[[C_IMAG_PTR]], align 4

From 3f794759f4f2c0ba248a21fb3ec9eb4ff7e35724 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Mon, 16 Jun 2025 11:24:22 -0700
Subject: [PATCH 623/851] [build] Fixed LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 handling. (#144391)

Change in #107278 modified the CMake CACHE variable with values
that are not supported for it as documented. This patch renames the
derived vars so that they do not conflict with the CACHE variable.
---
 llvm/cmake/modules/HandleLLVMOptions.cmake         | 10 +++++-----
 llvm/include/llvm/Config/llvm-config.h.cmake       |  4 ++--
 llvm/include/llvm/IR/DebugLoc.h                    | 14 +++++++-------
 llvm/lib/IR/DebugLoc.cpp                           |  4 ++--
 llvm/lib/Transforms/Utils/Debugify.cpp             |  2 +-
 .../gn/secondary/llvm/include/llvm/Config/BUILD.gn |  4 ++--
 6 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 743eb6f5529f2..8004d3571fc8a 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -199,17 +199,17 @@ endif()
 string(TOUPPER "${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}" uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING)
 
 if( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE" )
-  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
+  set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE_AND_ORIGIN" )
-  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
-  set( LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 1 )
+  set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 1 )
+  set( LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "DISABLED" OR NOT DEFINED LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING )
   # The DISABLED setting is default.
-  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0 )
+  set( LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 0 )
 else()
   message(FATAL_ERROR "Unknown value for LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING: \"${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}\"!")
 endif()
-# LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING (non-cached) is expected to be
+# LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE (non-cached) is expected to be
 # 1 or 0 here, assuming referenced in #cmakedefine01.
 
 if(LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS)
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 6d3c37cc8b194..a0ad517a6ecf4 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -131,10 +131,10 @@
 
 /* Define to 1 to enable expensive checks for debug location coverage checking,
    and to 0 otherwise. */
-#cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
 /* Define to 1 to enable expensive tracking of the origin of debug location
    coverage bugs, and to 0 otherwise. */
-#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN
 
 #endif
diff --git a/llvm/include/llvm/IR/DebugLoc.h b/llvm/include/llvm/IR/DebugLoc.h
index 2fabae9bfc66e..999e03b6374a5 100644
--- a/llvm/include/llvm/IR/DebugLoc.h
+++ b/llvm/include/llvm/IR/DebugLoc.h
@@ -26,7 +26,7 @@ namespace llvm {
   class DILocation;
   class Function;
 
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
   // Used to represent different "kinds" of DebugLoc, expressing that the
   // instruction it is part of is either normal and should contain a valid
   // DILocation, or otherwise describing the reason why the instruction does
@@ -90,7 +90,7 @@ namespace llvm {
   using DebugLocTrackingRef = DILocAndCoverageTracking;
 #else
   using DebugLocTrackingRef = TrackingMDNodeRef;
-#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
   /// A debug info location.
   ///
@@ -117,12 +117,12 @@ namespace llvm {
     /// IR.
     LLVM_ABI explicit DebugLoc(const MDNode *N);
 
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
     DebugLoc(DebugLocKind Kind) : Loc(Kind) {}
     DebugLocKind getKind() const { return Loc.Kind; }
 #endif
 
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
     static inline DebugLoc getTemporary() {
       return DebugLoc(DebugLocKind::Temporary);
     }
@@ -140,7 +140,7 @@ namespace llvm {
     static inline DebugLoc getUnknown() { return DebugLoc(); }
     static inline DebugLoc getCompilerGenerated() { return DebugLoc(); }
     static inline DebugLoc getDropped() { return DebugLoc(); }
-#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
     /// When two instructions are combined into a single instruction we also
     /// need to combine the original locations into a single location.
@@ -174,7 +174,7 @@ namespace llvm {
     DebugLoc orElse(DebugLoc Other) const {
       if (*this)
         return *this;
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
       if (Other)
         return Other;
       if (getKind() != DebugLocKind::Normal)
@@ -184,7 +184,7 @@ namespace llvm {
       return *this;
 #else
       return Other;
-#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
     }
 
     /// Get the underlying \a DILocation.
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 0be6d55d724e0..ffeeeb6f1e4b0 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -11,11 +11,11 @@
 #include "llvm/IR/DebugInfo.h"
 using namespace llvm;
 
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L)
     : TrackingMDNodeRef(const_cast<DILocation *>(L)),
       Kind(DebugLocKind::Normal) {}
-#endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#endif // LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
 //===----------------------------------------------------------------------===//
 // DebugLoc Implementation
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index 729813a92f516..ff8a91bc7e7d4 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -299,7 +299,7 @@ bool llvm::stripDebugifyMetadata(Module &M) {
 
 bool hasLoc(const Instruction &I) {
   const DILocation *Loc = I.getDebugLoc().get();
-#if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
   DebugLocKind Kind = I.getDebugLoc().getKind();
   return Loc || Kind != DebugLocKind::Normal;
 #else
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index ca05ac1b24647..c1d107eefdf9b 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -298,8 +298,8 @@ write_cmake_config("llvm-config") {
     "LLVM_BUILD_SHARED_LIBS=",
     "LLVM_ENABLE_TELEMETRY=",
     "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple",
-    "LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING=",
-    "LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING=",
+    "LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE=",
+    "LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN=",
     "LLVM_ENABLE_DUMP=",
     "LLVM_ENABLE_HTTPLIB=",
     "LLVM_FORCE_USE_OLD_TOOLCHAIN=",

From a3d35b87eacece8cdbb4615ff6c65003773f5cbf Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Mon, 16 Jun 2025 11:24:33 -0700
Subject: [PATCH 624/851] [RISCV] Use RISCV::RVVBitsPerBlock instead of 64 in
 getLMUL1VT. NFC (#144401)

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp        | 2 +-
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7cfada6c0601c..779786fa400fc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3499,7 +3499,7 @@ getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL,
 }
 
 static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
+  assert(VT.getVectorElementType().getSizeInBits() <= RISCV::RVVBitsPerBlock &&
          "Unexpected vector MVT");
   return MVT::getScalableVectorVT(
       VT.getVectorElementType(),
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index fcc9d3977e5cd..0093c92ea5ef0 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -604,7 +604,7 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
 
 // Consolidate!
 static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= 64 &&
+  assert(VT.getVectorElementType().getSizeInBits() <= RISCV::RVVBitsPerBlock &&
          "Unexpected vector MVT");
   return MVT::getScalableVectorVT(
       VT.getVectorElementType(),

From 539cf824259cbb23ccc68b83ef3cde575ca50842 Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Mon, 16 Jun 2025 19:24:59 +0100
Subject: [PATCH 625/851] [lldb-dap] Use structured types for stepInTargets
 request (#144072)

uses the `SendTargetCapabilities` from #142831
---
 .../test/tools/lldb-dap/dap_server.py         |   2 +-
 .../stepInTargets/TestDAP_stepInTargets.py    |  44 ++++
 lldb/tools/lldb-dap/EventHelper.cpp           |   5 +
 lldb/tools/lldb-dap/Handler/RequestHandler.h  |  13 +-
 .../Handler/StepInTargetsRequestHandler.cpp   | 200 +++++++-----------
 .../lldb-dap/Protocol/ProtocolRequests.cpp    |  10 +
 .../lldb-dap/Protocol/ProtocolRequests.h      |  15 ++
 .../tools/lldb-dap/Protocol/ProtocolTypes.cpp |  24 +++
 lldb/tools/lldb-dap/Protocol/ProtocolTypes.h  |  28 +++
 lldb/unittests/DAP/ProtocolTypesTest.cpp      |  20 ++
 10 files changed, 225 insertions(+), 136 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 9786678aa53f9..baf2d4ae542ba 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -494,7 +494,7 @@ def wait_for_terminated(self, timeout: Optional[float] = None):
             raise ValueError("didn't get terminated event")
         return event_dict
 
-    def get_capability(self, key):
+    def get_capability(self, key: str):
         """Get a value for the given key if it there is a key/value pair in
         the capabilities reported by the adapter.
         """
diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
index 07acfe07c9ffc..51ccf2ccbdcad 100644
--- a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
+++ b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
@@ -78,3 +78,47 @@ def test_basic(self):
         leaf_frame = self.dap_server.get_stackFrame()
         self.assertIsNotNone(leaf_frame, "expect a leaf frame")
         self.assertEqual(step_in_targets[1]["label"], leaf_frame["name"])
+
+    @skipIf(archs=no_match(["x86", "x86_64"]))
+    def test_supported_capability_x86_arch(self):
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program)
+        source = "main.cpp"
+        bp_lines = [line_number(source, "// set breakpoint here")]
+        breakpoint_ids = self.set_source_breakpoints(source, bp_lines)
+        self.assertEqual(
+            len(breakpoint_ids), len(bp_lines), "expect correct number of breakpoints"
+        )
+        self.continue_to_breakpoints(breakpoint_ids)
+        is_supported = self.dap_server.get_capability("supportsStepInTargetsRequest")
+
+        self.assertEqual(
+            is_supported,
+            True,
+            f"expect capability `stepInTarget` is supported with architecture {self.getArchitecture()}",
+        )
+        # clear breakpoints.
+        self.set_source_breakpoints(source, [])
+        self.continue_to_exit()
+
+    @skipIf(archs=["x86", "x86_64"])
+    def test_supported_capability_other_archs(self):
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program)
+        source = "main.cpp"
+        bp_lines = [line_number(source, "// set breakpoint here")]
+        breakpoint_ids = self.set_source_breakpoints(source, bp_lines)
+        self.assertEqual(
+            len(breakpoint_ids), len(bp_lines), "expect correct number of breakpoints"
+        )
+        self.continue_to_breakpoints(breakpoint_ids)
+        is_supported = self.dap_server.get_capability("supportsStepInTargetsRequest")
+
+        self.assertEqual(
+            is_supported,
+            False,
+            f"expect capability `stepInTarget` is not supported with architecture {self.getArchitecture()}",
+        )
+        # clear breakpoints.
+        self.set_source_breakpoints(source, [])
+        self.continue_to_exit()
diff --git a/lldb/tools/lldb-dap/EventHelper.cpp b/lldb/tools/lldb-dap/EventHelper.cpp
index 9641f29698b10..364cc7ab4ef8c 100644
--- a/lldb/tools/lldb-dap/EventHelper.cpp
+++ b/lldb/tools/lldb-dap/EventHelper.cpp
@@ -44,6 +44,11 @@ void SendTargetBasedCapabilities(DAP &dap) {
 
   protocol::CapabilitiesEventBody body;
 
+  const llvm::StringRef target_triple = dap.target.GetTriple();
+  if (target_triple.starts_with("x86"))
+    body.capabilities.supportedFeatures.insert(
+        protocol::eAdapterFeatureStepInTargetsRequest);
+
   // We only support restarting launch requests not attach requests.
   if (dap.last_launch_request)
     body.capabilities.supportedFeatures.insert(
diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h
index d3f231589b54c..0ac8ca7c9a49e 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.h
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h
@@ -353,14 +353,15 @@ class StepInRequestHandler : public RequestHandler<protocol::StepInArguments,
   llvm::Error Run(const protocol::StepInArguments &args) const override;
 };
 
-class StepInTargetsRequestHandler : public LegacyRequestHandler {
+class StepInTargetsRequestHandler
+    : public RequestHandler<
+          protocol::StepInTargetsArguments,
+          llvm::Expected<protocol::StepInTargetsResponseBody>> {
 public:
-  using LegacyRequestHandler::LegacyRequestHandler;
+  using RequestHandler::RequestHandler;
   static llvm::StringLiteral GetCommand() { return "stepInTargets"; }
-  FeatureSet GetSupportedFeatures() const override {
-    return {protocol::eAdapterFeatureStepInTargetsRequest};
-  }
-  void operator()(const llvm::json::Object &request) const override;
+  llvm::Expected<protocol::StepInTargetsResponseBody>
+  Run(const protocol::StepInTargetsArguments &args) const override;
 };
 
 class StepOutRequestHandler : public RequestHandler<protocol::StepOutArguments,
diff --git a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
index 9b99791599f82..1a76371be2d58 100644
--- a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
@@ -7,143 +7,85 @@
 //===----------------------------------------------------------------------===//
 
 #include "DAP.h"
-#include "EventHelper.h"
-#include "JSONUtils.h"
+#include "Protocol/ProtocolRequests.h"
 #include "RequestHandler.h"
 #include "lldb/API/SBInstruction.h"
+#include "lldb/lldb-defines.h"
 
+using namespace lldb_dap::protocol;
 namespace lldb_dap {
 
-// "StepInTargetsRequest": {
-//   "allOf": [ { "$ref": "#/definitions/Request" }, {
-//     "type": "object",
-//     "description": "This request retrieves the possible step-in targets for
-//     the specified stack frame.\nThese targets can be used in the `stepIn`
-//     request.\nClients should only call this request if the corresponding
-//     capability `supportsStepInTargetsRequest` is true.", "properties": {
-//       "command": {
-//         "type": "string",
-//         "enum": [ "stepInTargets" ]
-//       },
-//       "arguments": {
-//         "$ref": "#/definitions/StepInTargetsArguments"
-//       }
-//     },
-//     "required": [ "command", "arguments"  ]
-//   }]
-// },
-// "StepInTargetsArguments": {
-//   "type": "object",
-//   "description": "Arguments for `stepInTargets` request.",
-//   "properties": {
-//     "frameId": {
-//       "type": "integer",
-//       "description": "The stack frame for which to retrieve the possible
-//       step-in targets."
-//     }
-//   },
-//   "required": [ "frameId" ]
-// },
-// "StepInTargetsResponse": {
-//   "allOf": [ { "$ref": "#/definitions/Response" }, {
-//     "type": "object",
-//     "description": "Response to `stepInTargets` request.",
-//     "properties": {
-//       "body": {
-//         "type": "object",
-//         "properties": {
-//           "targets": {
-//             "type": "array",
-//             "items": {
-//               "$ref": "#/definitions/StepInTarget"
-//             },
-//             "description": "The possible step-in targets of the specified
-//             source location."
-//           }
-//         },
-//         "required": [ "targets" ]
-//       }
-//     },
-//     "required": [ "body" ]
-//   }]
-// }
-void StepInTargetsRequestHandler::operator()(
-    const llvm::json::Object &request) const {
-  llvm::json::Object response;
-  FillResponse(request, response);
-  const auto *arguments = request.getObject("arguments");
-
+// This request retrieves the possible step-in targets for the specified stack
+// frame.
+// These targets can be used in the `stepIn` request.
+// Clients should only call this request if the corresponding capability
+// `supportsStepInTargetsRequest` is true.
+llvm::Expected<StepInTargetsResponseBody>
+StepInTargetsRequestHandler::Run(const StepInTargetsArguments &args) const {
   dap.step_in_targets.clear();
-  lldb::SBFrame frame = dap.GetLLDBFrame(*arguments);
-  if (frame.IsValid()) {
-    lldb::SBAddress pc_addr = frame.GetPCAddress();
-    lldb::SBAddress line_end_addr =
-        pc_addr.GetLineEntry().GetSameLineContiguousAddressRangeEnd(true);
-    lldb::SBInstructionList insts = dap.target.ReadInstructions(
-        pc_addr, line_end_addr, /*flavor_string=*/nullptr);
-
-    if (!insts.IsValid()) {
-      response["success"] = false;
-      response["message"] = "Failed to get instructions for frame.";
-      dap.SendJSON(llvm::json::Value(std::move(response)));
-      return;
-    }
+  const lldb::SBFrame frame = dap.GetLLDBFrame(args.frameId);
+  if (!frame.IsValid())
+    return llvm::make_error<DAPError>("Failed to get frame for input frameId.");
+
+  lldb::SBAddress pc_addr = frame.GetPCAddress();
+  lldb::SBAddress line_end_addr =
+      pc_addr.GetLineEntry().GetSameLineContiguousAddressRangeEnd(true);
+  lldb::SBInstructionList insts = dap.target.ReadInstructions(
+      pc_addr, line_end_addr, /*flavor_string=*/nullptr);
+
+  if (!insts.IsValid())
+    return llvm::make_error<DAPError>("Failed to get instructions for frame.");
+
+  StepInTargetsResponseBody body;
+  const size_t num_insts = insts.GetSize();
+  for (size_t i = 0; i < num_insts; ++i) {
+    lldb::SBInstruction inst = insts.GetInstructionAtIndex(i);
+    if (!inst.IsValid())
+      break;
+
+    const lldb::addr_t inst_addr = inst.GetAddress().GetLoadAddress(dap.target);
+    if (inst_addr == LLDB_INVALID_ADDRESS)
+      break;
+
+    // Note: currently only x86/x64 supports flow kind.
+    const lldb::InstructionControlFlowKind flow_kind =
+        inst.GetControlFlowKind(dap.target);
+
+    if (flow_kind == lldb::eInstructionControlFlowKindCall) {
+
+      const llvm::StringRef call_operand_name = inst.GetOperands(dap.target);
+      lldb::addr_t call_target_addr = LLDB_INVALID_ADDRESS;
+      if (call_operand_name.getAsInteger(0, call_target_addr))
+        continue;
+
+      const lldb::SBAddress call_target_load_addr =
+          dap.target.ResolveLoadAddress(call_target_addr);
+      if (!call_target_load_addr.IsValid())
+        continue;
+
+      // The existing ThreadPlanStepInRange only accept step in target
+      // function with debug info.
+      lldb::SBSymbolContext sc = dap.target.ResolveSymbolContextForAddress(
+          call_target_load_addr, lldb::eSymbolContextFunction);
+
+      // The existing ThreadPlanStepInRange only accept step in target
+      // function with debug info.
+      llvm::StringRef step_in_target_name;
+      if (sc.IsValid() && sc.GetFunction().IsValid())
+        step_in_target_name = sc.GetFunction().GetDisplayName();
+
+      // Skip call sites if we fail to resolve its symbol name.
+      if (step_in_target_name.empty())
+        continue;
 
-    llvm::json::Array step_in_targets;
-    const auto num_insts = insts.GetSize();
-    for (size_t i = 0; i < num_insts; ++i) {
-      lldb::SBInstruction inst = insts.GetInstructionAtIndex(i);
-      if (!inst.IsValid())
-        break;
-
-      lldb::addr_t inst_addr = inst.GetAddress().GetLoadAddress(dap.target);
-
-      // Note: currently only x86/x64 supports flow kind.
-      lldb::InstructionControlFlowKind flow_kind =
-          inst.GetControlFlowKind(dap.target);
-      if (flow_kind == lldb::eInstructionControlFlowKindCall) {
-        // Use call site instruction address as id which is easy to debug.
-        llvm::json::Object step_in_target;
-        step_in_target["id"] = inst_addr;
-
-        llvm::StringRef call_operand_name = inst.GetOperands(dap.target);
-        lldb::addr_t call_target_addr;
-        if (call_operand_name.getAsInteger(0, call_target_addr))
-          continue;
-
-        lldb::SBAddress call_target_load_addr =
-            dap.target.ResolveLoadAddress(call_target_addr);
-        if (!call_target_load_addr.IsValid())
-          continue;
-
-        // The existing ThreadPlanStepInRange only accept step in target
-        // function with debug info.
-        lldb::SBSymbolContext sc = dap.target.ResolveSymbolContextForAddress(
-            call_target_load_addr, lldb::eSymbolContextFunction);
-
-        // The existing ThreadPlanStepInRange only accept step in target
-        // function with debug info.
-        std::string step_in_target_name;
-        if (sc.IsValid() && sc.GetFunction().IsValid())
-          step_in_target_name = sc.GetFunction().GetDisplayName();
-
-        // Skip call sites if we fail to resolve its symbol name.
-        if (step_in_target_name.empty())
-          continue;
-
-        dap.step_in_targets.try_emplace(inst_addr, step_in_target_name);
-        step_in_target.try_emplace("label", step_in_target_name);
-        step_in_targets.emplace_back(std::move(step_in_target));
-      }
+      StepInTarget target;
+      target.id = inst_addr;
+      target.label = step_in_target_name;
+      dap.step_in_targets.try_emplace(inst_addr, step_in_target_name);
+      body.targets.emplace_back(std::move(target));
     }
-    llvm::json::Object body;
-    body.try_emplace("targets", std::move(step_in_targets));
-    response.try_emplace("body", std::move(body));
-  } else {
-    response["success"] = llvm::json::Value(false);
-    response["message"] = "Failed to get frame for input frameId.";
   }
-  dap.SendJSON(llvm::json::Value(std::move(response)));
-}
+  return body;
+};
 
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
index 2cb7c47d60203..1b1891ba59e61 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
@@ -368,6 +368,16 @@ bool fromJSON(const json::Value &Params, StepInArguments &SIA, json::Path P) {
          OM.mapOptional("granularity", SIA.granularity);
 }
 
+bool fromJSON(const llvm::json::Value &Params, StepInTargetsArguments &SITA,
+              llvm::json::Path P) {
+  json::ObjectMapper OM(Params, P);
+  return OM && OM.map("frameId", SITA.frameId);
+}
+
+llvm::json::Value toJSON(const StepInTargetsResponseBody &SITR) {
+  return llvm::json::Object{{"targets", SITR.targets}};
+}
+
 bool fromJSON(const json::Value &Params, StepOutArguments &SOA, json::Path P) {
   json::ObjectMapper OM(Params, P);
   return OM && OM.map("threadId", SOA.threadId) &&
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index d199cc886b11c..583c203be8e1a 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -533,6 +533,21 @@ bool fromJSON(const llvm::json::Value &, StepInArguments &, llvm::json::Path);
 /// body field is required.
 using StepInResponse = VoidResponse;
 
+/// Arguments for `stepInTargets` request.
+struct StepInTargetsArguments {
+  /// The stack frame for which to retrieve the possible step-in targets.
+  uint64_t frameId = LLDB_INVALID_FRAME_ID;
+};
+bool fromJSON(const llvm::json::Value &, StepInTargetsArguments &,
+              llvm::json::Path);
+
+/// Response to `stepInTargets` request.
+struct StepInTargetsResponseBody {
+  /// The possible step-in targets of the specified source location.
+  std::vector<StepInTarget> targets;
+};
+llvm::json::Value toJSON(const StepInTargetsResponseBody &);
+
 /// Arguments for `stepOut` request.
 struct StepOutArguments {
   /// Specifies the thread for which to resume execution for one step-out (of
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
index 085d53bb006ef..c21f8382320a5 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
@@ -582,6 +582,30 @@ llvm::json::Value toJSON(const SteppingGranularity &SG) {
   llvm_unreachable("unhandled stepping granularity.");
 }
 
+bool fromJSON(const json::Value &Params, StepInTarget &SIT, json::Path P) {
+  json::ObjectMapper O(Params, P);
+  return O && O.map("id", SIT.id) && O.map("label", SIT.label) &&
+         O.mapOptional("line", SIT.line) &&
+         O.mapOptional("column", SIT.column) &&
+         O.mapOptional("endLine", SIT.endLine) &&
+         O.mapOptional("endColumn", SIT.endColumn);
+}
+
+llvm::json::Value toJSON(const StepInTarget &SIT) {
+  json::Object target{{"id", SIT.id}, {"label", SIT.label}};
+
+  if (SIT.line != LLDB_INVALID_LINE_NUMBER)
+    target.insert({"line", SIT.line});
+  if (SIT.column != LLDB_INVALID_COLUMN_NUMBER)
+    target.insert({"column", SIT.column});
+  if (SIT.endLine != LLDB_INVALID_LINE_NUMBER)
+    target.insert({"endLine", SIT.endLine});
+  if (SIT.endLine != LLDB_INVALID_COLUMN_NUMBER)
+    target.insert({"endColumn", SIT.endColumn});
+
+  return target;
+}
+
 bool fromJSON(const json::Value &Params, Thread &T, json::Path P) {
   json::ObjectMapper O(Params, P);
   return O && O.map("id", T.id) && O.map("name", T.name);
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
index c7acfc482987b..d7094fbab9e59 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
@@ -414,6 +414,34 @@ bool fromJSON(const llvm::json::Value &, SteppingGranularity &,
               llvm::json::Path);
 llvm::json::Value toJSON(const SteppingGranularity &);
 
+/// A `StepInTarget` can be used in the `stepIn` request and determines into
+/// which single target the `stepIn` request should step.
+struct StepInTarget {
+  /// Unique identifier for a step-in target.
+  lldb::addr_t id = LLDB_INVALID_ADDRESS;
+
+  /// The name of the step-in target (shown in the UI).
+  std::string label;
+
+  /// The line of the step-in target.
+  uint32_t line = LLDB_INVALID_LINE_NUMBER;
+
+  /// Start position of the range covered by the step in target. It is measured
+  /// in UTF-16 code units and the client capability `columnsStartAt1`
+  /// determines whether it is 0- or 1-based.
+  uint32_t column = LLDB_INVALID_COLUMN_NUMBER;
+
+  /// The end line of the range covered by the step-in target.
+  uint32_t endLine = LLDB_INVALID_LINE_NUMBER;
+
+  /// End position of the range covered by the step in target. It is measured in
+  /// UTF-16 code units and the client capability `columnsStartAt1` determines
+  /// whether it is 0- or 1-based.
+  uint32_t endColumn = LLDB_INVALID_COLUMN_NUMBER;
+};
+bool fromJSON(const llvm::json::Value &, StepInTarget &, llvm::json::Path);
+llvm::json::Value toJSON(const StepInTarget &);
+
 /// A Thread.
 struct Thread {
   /// Unique identifier for the thread.
diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp
index adf43c9ac2046..f2a23db346565 100644
--- a/lldb/unittests/DAP/ProtocolTypesTest.cpp
+++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp
@@ -686,3 +686,23 @@ TEST(ProtocolTypesTest, CapabilitiesEventBody) {
   // Validate toJSON
   EXPECT_EQ(json, pp(body));
 }
+
+TEST(ProtocolTypesTest, StepInTarget) {
+  StepInTarget target;
+  target.id = 230;
+  target.label = "the_function_name";
+  target.line = 2;
+  target.column = 320;
+  target.endLine = 32;
+  target.endColumn = 23;
+
+  llvm::Expected<StepInTarget> deserialized_target = roundtrip(target);
+  ASSERT_THAT_EXPECTED(deserialized_target, llvm::Succeeded());
+
+  EXPECT_EQ(target.id, deserialized_target->id);
+  EXPECT_EQ(target.label, deserialized_target->label);
+  EXPECT_EQ(target.line, deserialized_target->line);
+  EXPECT_EQ(target.column, deserialized_target->column);
+  EXPECT_EQ(target.endLine, deserialized_target->endLine);
+  EXPECT_EQ(target.endColumn, deserialized_target->endColumn);
+}
\ No newline at end of file

From b5c6245cb46354923940b95a89213fa0924e5c5f Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Mon, 16 Jun 2025 11:26:23 -0700
Subject: [PATCH 626/851] [CIR][NFC] Refactor constant pointer l-value handling
 (#144165)

This change introduces a ConstantLValueEmitter class, which will be
needed for emitting CIR for non-trivial constant pointers. This change
introduces the class with most branches reaching an NYI diagnostic. The
only path that is currently implemented is the case where an absolute
pointer (usually a null pointer) is emitted. This corresponds to the
existing handler for emitting l-value constants.
---
 clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 235 +++++++++++++++++--
 1 file changed, 218 insertions(+), 17 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index c41ab54be09ca..1976742d4039e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -329,6 +329,222 @@ emitArrayConstant(CIRGenModule &cgm, mlir::Type desiredType,
   return {};
 }
 
+//===----------------------------------------------------------------------===//
+//                          ConstantLValueEmitter
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A struct which can be used to peephole certain kinds of finalization
+/// that normally happen during l-value emission.
+struct ConstantLValue {
+  llvm::PointerUnion<mlir::Value, mlir::Attribute> value;
+  bool hasOffsetApplied;
+
+  ConstantLValue(std::nullptr_t) : value(nullptr), hasOffsetApplied(false) {}
+  ConstantLValue() : value(nullptr), hasOffsetApplied(false) {}
+};
+
+/// A helper class for emitting constant l-values.
+class ConstantLValueEmitter
+    : public ConstStmtVisitor<ConstantLValueEmitter, ConstantLValue> {
+  CIRGenModule &cgm;
+  ConstantEmitter &emitter;
+  const APValue &value;
+  QualType destType;
+
+  // Befriend StmtVisitorBase so that we don't have to expose Visit*.
+  friend StmtVisitorBase;
+
+public:
+  ConstantLValueEmitter(ConstantEmitter &emitter, const APValue &value,
+                        QualType destType)
+      : cgm(emitter.cgm), emitter(emitter), value(value), destType(destType) {}
+
+  mlir::Attribute tryEmit();
+
+private:
+  mlir::Attribute tryEmitAbsolute(mlir::Type destTy);
+  ConstantLValue tryEmitBase(const APValue::LValueBase &base);
+
+  ConstantLValue VisitStmt(const Stmt *s) { return nullptr; }
+  ConstantLValue VisitConstantExpr(const ConstantExpr *e);
+  ConstantLValue VisitCompoundLiteralExpr(const CompoundLiteralExpr *e);
+  ConstantLValue VisitStringLiteral(const StringLiteral *e);
+  ConstantLValue VisitObjCBoxedExpr(const ObjCBoxedExpr *e);
+  ConstantLValue VisitObjCEncodeExpr(const ObjCEncodeExpr *e);
+  ConstantLValue VisitObjCStringLiteral(const ObjCStringLiteral *e);
+  ConstantLValue VisitPredefinedExpr(const PredefinedExpr *e);
+  ConstantLValue VisitAddrLabelExpr(const AddrLabelExpr *e);
+  ConstantLValue VisitCallExpr(const CallExpr *e);
+  ConstantLValue VisitBlockExpr(const BlockExpr *e);
+  ConstantLValue VisitCXXTypeidExpr(const CXXTypeidExpr *e);
+  ConstantLValue
+  VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *e);
+};
+
+} // namespace
+
+mlir::Attribute ConstantLValueEmitter::tryEmit() {
+  const APValue::LValueBase &base = value.getLValueBase();
+
+  // The destination type should be a pointer or reference
+  // type, but it might also be a cast thereof.
+  //
+  // FIXME: the chain of casts required should be reflected in the APValue.
+  // We need this in order to correctly handle things like a ptrtoint of a
+  // non-zero null pointer and addrspace casts that aren't trivially
+  // represented in LLVM IR.
+  mlir::Type destTy = cgm.getTypes().convertTypeForMem(destType);
+  assert(mlir::isa<cir::PointerType>(destTy));
+
+  // If there's no base at all, this is a null or absolute pointer,
+  // possibly cast back to an integer type.
+  if (!base)
+    return tryEmitAbsolute(destTy);
+
+  // Otherwise, try to emit the base.
+  ConstantLValue result = tryEmitBase(base);
+
+  // If that failed, we're done.
+  llvm::PointerUnion<mlir::Value, mlir::Attribute> &value = result.value;
+  if (!value)
+    return {};
+
+  // Apply the offset if necessary and not already done.
+  if (!result.hasOffsetApplied) {
+    cgm.errorNYI("ConstantLValueEmitter: apply offset");
+    return {};
+  }
+
+  // Convert to the appropriate type; this could be an lvalue for
+  // an integer. FIXME: performAddrSpaceCast
+  if (mlir::isa<cir::PointerType>(destTy)) {
+    if (auto attr = mlir::dyn_cast<mlir::Attribute>(value))
+      return attr;
+    cgm.errorNYI("ConstantLValueEmitter: non-attribute pointer");
+    return {};
+  }
+
+  cgm.errorNYI("ConstantLValueEmitter: other?");
+  return {};
+}
+
+/// Try to emit an absolute l-value, such as a null pointer or an integer
+/// bitcast to pointer type.
+mlir::Attribute ConstantLValueEmitter::tryEmitAbsolute(mlir::Type destTy) {
+  // If we're producing a pointer, this is easy.
+  auto destPtrTy = mlir::cast<cir::PointerType>(destTy);
+  return cgm.getBuilder().getConstPtrAttr(
+      destPtrTy, value.getLValueOffset().getQuantity());
+}
+
+ConstantLValue
+ConstantLValueEmitter::tryEmitBase(const APValue::LValueBase &base) {
+  // Handle values.
+  if (const ValueDecl *d = base.dyn_cast<const ValueDecl *>()) {
+    // The constant always points to the canonical declaration. We want to look
+    // at properties of the most recent declaration at the point of emission.
+    d = cast<ValueDecl>(d->getMostRecentDecl());
+
+    if (d->hasAttr<WeakRefAttr>()) {
+      cgm.errorNYI(d->getSourceRange(),
+                   "ConstantLValueEmitter: emit pointer base for weakref");
+      return {};
+    }
+
+    if (auto *fd = dyn_cast<FunctionDecl>(d)) {
+      cgm.errorNYI(fd->getSourceRange(),
+                   "ConstantLValueEmitter: function decl");
+      return {};
+    }
+
+    if (auto *vd = dyn_cast<VarDecl>(d)) {
+      cgm.errorNYI(vd->getSourceRange(), "ConstantLValueEmitter: var decl");
+      return {};
+    }
+  }
+
+  // Handle typeid(T).
+  if (base.dyn_cast<TypeInfoLValue>()) {
+    cgm.errorNYI("ConstantLValueEmitter: typeid");
+    return {};
+  }
+
+  // Otherwise, it must be an expression.
+  return Visit(base.get<const Expr *>());
+}
+
+ConstantLValue ConstantLValueEmitter::VisitConstantExpr(const ConstantExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: constant expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitCompoundLiteralExpr(const CompoundLiteralExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: compound literal");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitStringLiteral(const StringLiteral *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: string literal");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitObjCEncodeExpr(const ObjCEncodeExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: objc encode expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitObjCStringLiteral(const ObjCStringLiteral *e) {
+  cgm.errorNYI(e->getSourceRange(),
+               "ConstantLValueEmitter: objc string literal");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitObjCBoxedExpr(const ObjCBoxedExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: objc boxed expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitPredefinedExpr(const PredefinedExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: predefined expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitAddrLabelExpr(const AddrLabelExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: addr label expr");
+  return {};
+}
+
+ConstantLValue ConstantLValueEmitter::VisitCallExpr(const CallExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: call expr");
+  return {};
+}
+
+ConstantLValue ConstantLValueEmitter::VisitBlockExpr(const BlockExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: block expr");
+  return {};
+}
+
+ConstantLValue
+ConstantLValueEmitter::VisitCXXTypeidExpr(const CXXTypeidExpr *e) {
+  cgm.errorNYI(e->getSourceRange(), "ConstantLValueEmitter: cxx typeid expr");
+  return {};
+}
+
+ConstantLValue ConstantLValueEmitter::VisitMaterializeTemporaryExpr(
+    const MaterializeTemporaryExpr *e) {
+  cgm.errorNYI(e->getSourceRange(),
+               "ConstantLValueEmitter: materialize temporary expr");
+  return {};
+}
+
 //===----------------------------------------------------------------------===//
 //                             ConstantEmitter
 //===----------------------------------------------------------------------===//
@@ -556,23 +772,8 @@ mlir::Attribute ConstantEmitter::tryEmitPrivate(const APValue &value,
     cgm.errorNYI("ConstExprEmitter::tryEmitPrivate member pointer");
     return {};
   }
-  case APValue::LValue: {
-
-    if (value.getLValueBase()) {
-      cgm.errorNYI("non-null pointer initialization");
-    } else {
-
-      mlir::Type desiredType = cgm.convertType(destType);
-      if (const cir::PointerType ptrType =
-              mlir::dyn_cast<cir::PointerType>(desiredType)) {
-        return builder.getConstPtrAttr(ptrType,
-                                       value.getLValueOffset().getQuantity());
-      } else {
-        llvm_unreachable("non-pointer variable initialized with a pointer");
-      }
-    }
-    return {};
-  }
+  case APValue::LValue:
+    return ConstantLValueEmitter(*this, value, destType).tryEmit();
   case APValue::Struct:
   case APValue::Union:
     cgm.errorNYI("ConstExprEmitter::tryEmitPrivate struct or union");

From 00582728767599bb0e88beb96e8264dbe676da53 Mon Sep 17 00:00:00 2001
From: Rahman Lavaee <rahmanl@google.com>
Date: Mon, 16 Jun 2025 11:27:25 -0700
Subject: [PATCH 627/851] [NFC] Remove unused test code from
 ELFObjectFileTest.cpp

---
 llvm/unittests/Object/ELFObjectFileTest.cpp | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp
index 493e673d6a07d..1073df95c379a 100644
--- a/llvm/unittests/Object/ELFObjectFileTest.cpp
+++ b/llvm/unittests/Object/ELFObjectFileTest.cpp
@@ -896,17 +896,6 @@ TEST(ELFObjectFileTest, InvalidDecodePGOAnalysisMap) {
             "are enabled: version = 1 feature = 4");
   }
 
-  SmallString<128> CommonVersionedYamlString(CommonYamlString);
-  CommonVersionedYamlString += R"(
-      - Version: 2
-        BBRanges:
-          - BBEntries:
-              - ID:            1
-                AddressOffset: 0x0
-                Size:          0x1
-                Metadata:      0x2
-)";
-
   // Check that we fail when function entry count is enabled but not provided.
   SmallString<128> MissingFuncEntryCount(CommonYamlString);
   MissingFuncEntryCount += R"(

From 8ed43c47dec36bc38bbae4c6f024cdb824555a76 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Mon, 16 Jun 2025 14:38:27 -0400
Subject: [PATCH 628/851] [Matrix] Hoist IRBuilder<> out of Visit* functions.
 NFC (#144369)

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 48 ++++++++-----------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 1e37f40fa9d52..ece0bb56fff01 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1146,24 +1146,24 @@ class LowerMatrixIntrinsics {
       Value *Op1;
       Value *Op2;
       MatrixTy Result;
+      IRBuilder<> Builder(Inst);
       if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
-        Result = VisitBinaryOperator(BinOp, SI);
+        Result = VisitBinaryOperator(BinOp, SI, Builder);
       else if (auto *Cast = dyn_cast<CastInst>(Inst))
-        Result = VisitCastInstruction(Cast, SI);
+        Result = VisitCastInstruction(Cast, SI, Builder);
       else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
-        Result = VisitUnaryOperator(UnOp, SI);
+        Result = VisitUnaryOperator(UnOp, SI, Builder);
       else if (auto *Intr = dyn_cast<IntrinsicInst>(Inst))
-        Result = VisitIntrinsicInst(Intr, SI);
+        Result = VisitIntrinsicInst(Intr, SI, Builder);
       else if (auto *Select = dyn_cast<SelectInst>(Inst))
-        Result = VisitSelectInst(Select, SI);
+        Result = VisitSelectInst(Select, SI, Builder);
       else if (match(Inst, m_Load(m_Value(Op1))))
-        Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1);
+        Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1, Builder);
       else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
-        Result = VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2);
+        Result = VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2, Builder);
       else
         continue;
 
-      IRBuilder<> Builder(Inst);
       finalizeLowering(Inst, Result, Builder);
       Changed = true;
     }
@@ -1204,7 +1204,8 @@ class LowerMatrixIntrinsics {
   }
 
   /// Replace intrinsic calls.
-  MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI) {
+  MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI,
+                              IRBuilder<> &Builder) {
     assert(Inst->getCalledFunction() &&
            Inst->getCalledFunction()->isIntrinsic());
 
@@ -1219,7 +1220,6 @@ class LowerMatrixIntrinsics {
       return LowerColumnMajorStore(Inst);
     case Intrinsic::abs:
     case Intrinsic::fabs: {
-      IRBuilder<> Builder(Inst);
       MatrixTy Result;
       MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder);
       Builder.setFastMathFlags(getFastMathFlags(Inst));
@@ -1298,7 +1298,6 @@ class LowerMatrixIntrinsics {
                       ShapeInfo MatrixShape, Value *I, Value *J,
                       ShapeInfo ResultShape, Type *EltTy,
                       IRBuilder<> &Builder) {
-
     Value *Offset = Builder.CreateAdd(
         Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
 
@@ -2228,26 +2227,24 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lower load instructions.
-  MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr) {
-    IRBuilder<> Builder(Inst);
+  MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
+                     IRBuilder<> &Builder) {
     return LowerLoad(Inst, Ptr, Inst->getAlign(),
                      Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
   }
 
   MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
-                      Value *Ptr) {
-    IRBuilder<> Builder(Inst);
+                      Value *Ptr, IRBuilder<> &Builder) {
     return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
                       Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
   }
 
   /// Lower binary operators.
-  MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI) {
+  MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI,
+                               IRBuilder<> &Builder) {
     Value *Lhs = Inst->getOperand(0);
     Value *Rhs = Inst->getOperand(1);
 
-    IRBuilder<> Builder(Inst);
-
     MatrixTy Result;
     MatrixTy A = getMatrix(Lhs, SI, Builder);
     MatrixTy B = getMatrix(Rhs, SI, Builder);
@@ -2265,11 +2262,10 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lower unary operators.
-  MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI) {
+  MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI,
+                              IRBuilder<> &Builder) {
     Value *Op = Inst->getOperand(0);
 
-    IRBuilder<> Builder(Inst);
-
     MatrixTy Result;
     MatrixTy M = getMatrix(Op, SI, Builder);
 
@@ -2293,11 +2289,10 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lower cast instructions.
-  MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape) {
+  MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape,
+                                IRBuilder<> &Builder) {
     Value *Op = Inst->getOperand(0);
 
-    IRBuilder<> Builder(Inst);
-
     MatrixTy Result;
     MatrixTy M = getMatrix(Op, Shape, Builder);
 
@@ -2315,13 +2310,12 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lower selects.
-  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape) {
+  MatrixTy VisitSelectInst(SelectInst *Inst, const ShapeInfo &Shape,
+                           IRBuilder<> &Builder) {
     Value *Cond = Inst->getOperand(0);
     Value *OpA = Inst->getOperand(1);
     Value *OpB = Inst->getOperand(2);
 
-    IRBuilder<> Builder(Inst);
-
     MatrixTy Result;
     MatrixTy A = getMatrix(OpA, Shape, Builder);
     MatrixTy B = getMatrix(OpB, Shape, Builder);

From 63b80dd01dafc92104ee43e4f0f5296d644c25ec Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Mon, 16 Jun 2025 11:45:19 -0700
Subject: [PATCH 629/851] [NFC][RootSignature] Use `llvm::EnumEntry` for
 serialization of Root Signature Elements (#144106)

It has pointed out
[here](https://github.com/llvm/llvm-project/pull/143198#discussion_r2132877388)
that we may be able to use `llvm::EnumEntry` so that we can re-use the
printing logic across enumerations.

- Enables re-use of `printEnum` and `printFlags` methods via templates
- Allows easy definition of `getEnumName` function for enum-to-string
conversion, eliminating the need to use a string stream for constructing
the Name SmallString

- Also, does a small fix-up of the operands for descriptor table clause
to be consistent with other `Build*` methods

For reference, the
[test-cases](https://github.com/llvm/llvm-project/blob/main/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp)
that must not change expected output.
---
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  | 209 +++++++++---------
 1 file changed, 104 insertions(+), 105 deletions(-)

diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index 765a3bcbed7e2..7d744781da04f 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -15,111 +15,46 @@
 #include "llvm/ADT/bit.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/Support/ScopedPrinter.h"
 
 namespace llvm {
 namespace hlsl {
 namespace rootsig {
 
-static raw_ostream &operator<<(raw_ostream &OS, const Register &Reg) {
-  switch (Reg.ViewType) {
-  case RegisterType::BReg:
-    OS << "b";
-    break;
-  case RegisterType::TReg:
-    OS << "t";
-    break;
-  case RegisterType::UReg:
-    OS << "u";
-    break;
-  case RegisterType::SReg:
-    OS << "s";
-    break;
-  }
-  OS << Reg.Number;
-  return OS;
+template <typename T>
+static std::optional<StringRef> getEnumName(const T Value,
+                                            ArrayRef<EnumEntry<T>> Enums) {
+  for (const auto &EnumItem : Enums)
+    if (EnumItem.Value == Value)
+      return EnumItem.Name;
+  return std::nullopt;
 }
 
-static raw_ostream &operator<<(raw_ostream &OS,
-                               const ShaderVisibility &Visibility) {
-  switch (Visibility) {
-  case ShaderVisibility::All:
-    OS << "All";
-    break;
-  case ShaderVisibility::Vertex:
-    OS << "Vertex";
-    break;
-  case ShaderVisibility::Hull:
-    OS << "Hull";
-    break;
-  case ShaderVisibility::Domain:
-    OS << "Domain";
-    break;
-  case ShaderVisibility::Geometry:
-    OS << "Geometry";
-    break;
-  case ShaderVisibility::Pixel:
-    OS << "Pixel";
-    break;
-  case ShaderVisibility::Amplification:
-    OS << "Amplification";
-    break;
-  case ShaderVisibility::Mesh:
-    OS << "Mesh";
-    break;
-  }
-
-  return OS;
-}
-
-static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
-  switch (Type) {
-  case ClauseType::CBuffer:
-    OS << "CBV";
-    break;
-  case ClauseType::SRV:
-    OS << "SRV";
-    break;
-  case ClauseType::UAV:
-    OS << "UAV";
-    break;
-  case ClauseType::Sampler:
-    OS << "Sampler";
-    break;
-  }
-
+template <typename T>
+static raw_ostream &printEnum(raw_ostream &OS, const T Value,
+                              ArrayRef<EnumEntry<T>> Enums) {
+  auto MaybeName = getEnumName(Value, Enums);
+  if (MaybeName)
+    OS << *MaybeName;
   return OS;
 }
 
-static raw_ostream &operator<<(raw_ostream &OS,
-                               const DescriptorRangeFlags &Flags) {
+template <typename T>
+static raw_ostream &printFlags(raw_ostream &OS, const T Value,
+                               ArrayRef<EnumEntry<T>> Flags) {
   bool FlagSet = false;
-  unsigned Remaining = llvm::to_underlying(Flags);
+  unsigned Remaining = llvm::to_underlying(Value);
   while (Remaining) {
     unsigned Bit = 1u << llvm::countr_zero(Remaining);
     if (Remaining & Bit) {
       if (FlagSet)
         OS << " | ";
 
-      switch (static_cast<DescriptorRangeFlags>(Bit)) {
-      case DescriptorRangeFlags::DescriptorsVolatile:
-        OS << "DescriptorsVolatile";
-        break;
-      case DescriptorRangeFlags::DataVolatile:
-        OS << "DataVolatile";
-        break;
-      case DescriptorRangeFlags::DataStaticWhileSetAtExecute:
-        OS << "DataStaticWhileSetAtExecute";
-        break;
-      case DescriptorRangeFlags::DataStatic:
-        OS << "DataStatic";
-        break;
-      case DescriptorRangeFlags::DescriptorsStaticKeepingBufferBoundsChecks:
-        OS << "DescriptorsStaticKeepingBufferBoundsChecks";
-        break;
-      default:
+      auto MaybeFlag = getEnumName(T(Bit), Flags);
+      if (MaybeFlag)
+        OS << *MaybeFlag;
+      else
         OS << "invalid: " << Bit;
-        break;
-      }
 
       FlagSet = true;
     }
@@ -128,6 +63,68 @@ static raw_ostream &operator<<(raw_ostream &OS,
 
   if (!FlagSet)
     OS << "None";
+  return OS;
+}
+
+static const EnumEntry<RegisterType> RegisterNames[] = {
+    {"b", RegisterType::BReg},
+    {"t", RegisterType::TReg},
+    {"u", RegisterType::UReg},
+    {"s", RegisterType::SReg},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const Register &Reg) {
+  printEnum(OS, Reg.ViewType, ArrayRef(RegisterNames));
+  OS << Reg.Number;
+
+  return OS;
+}
+
+static const EnumEntry<ShaderVisibility> VisibilityNames[] = {
+    {"All", ShaderVisibility::All},
+    {"Vertex", ShaderVisibility::Vertex},
+    {"Hull", ShaderVisibility::Hull},
+    {"Domain", ShaderVisibility::Domain},
+    {"Geometry", ShaderVisibility::Geometry},
+    {"Pixel", ShaderVisibility::Pixel},
+    {"Amplification", ShaderVisibility::Amplification},
+    {"Mesh", ShaderVisibility::Mesh},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const ShaderVisibility &Visibility) {
+  printEnum(OS, Visibility, ArrayRef(VisibilityNames));
+
+  return OS;
+}
+
+static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
+    {"CBV", dxil::ResourceClass::CBuffer},
+    {"SRV", dxil::ResourceClass::SRV},
+    {"UAV", dxil::ResourceClass::UAV},
+    {"Sampler", dxil::ResourceClass::Sampler},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
+  printEnum(OS, dxil::ResourceClass(llvm::to_underlying(Type)),
+            ArrayRef(ResourceClassNames));
+
+  return OS;
+}
+
+static const EnumEntry<DescriptorRangeFlags> DescriptorRangeFlagNames[] = {
+    {"DescriptorsVolatile", DescriptorRangeFlags::DescriptorsVolatile},
+    {"DataVolatile", DescriptorRangeFlags::DataVolatile},
+    {"DataStaticWhileSetAtExecute",
+     DescriptorRangeFlags::DataStaticWhileSetAtExecute},
+    {"DataStatic", DescriptorRangeFlags::DataStatic},
+    {"DescriptorsStaticKeepingBufferBoundsChecks",
+     DescriptorRangeFlags::DescriptorsStaticKeepingBufferBoundsChecks},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const DescriptorRangeFlags &Flags) {
+  printFlags(OS, Flags, ArrayRef(DescriptorRangeFlagNames));
 
   return OS;
 }
@@ -236,12 +233,13 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) {
 
 MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) {
   IRBuilder<> Builder(Ctx);
-  llvm::SmallString<7> Name;
-  llvm::raw_svector_ostream OS(Name);
-  OS << "Root" << ClauseType(llvm::to_underlying(Descriptor.Type));
-
+  std::optional<StringRef> TypeName =
+      getEnumName(dxil::ResourceClass(llvm::to_underlying(Descriptor.Type)),
+                  ArrayRef(ResourceClassNames));
+  assert(TypeName && "Provided an invalid Resource Class");
+  llvm::SmallString<7> Name({"Root", *TypeName});
   Metadata *Operands[] = {
-      MDString::get(Ctx, OS.str()),
+      MDString::get(Ctx, Name),
       ConstantAsMetadata::get(
           Builder.getInt32(llvm::to_underlying(Descriptor.Visibility))),
       ConstantAsMetadata::get(Builder.getInt32(Descriptor.Reg.Number)),
@@ -277,19 +275,20 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) {
 MDNode *MetadataBuilder::BuildDescriptorTableClause(
     const DescriptorTableClause &Clause) {
   IRBuilder<> Builder(Ctx);
-  std::string Name;
-  llvm::raw_string_ostream OS(Name);
-  OS << Clause.Type;
-  return MDNode::get(
-      Ctx, {
-               MDString::get(Ctx, OS.str()),
-               ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)),
-               ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)),
-               ConstantAsMetadata::get(Builder.getInt32(Clause.Space)),
-               ConstantAsMetadata::get(Builder.getInt32(Clause.Offset)),
-               ConstantAsMetadata::get(
-                   Builder.getInt32(llvm::to_underlying(Clause.Flags))),
-           });
+  std::optional<StringRef> Name =
+      getEnumName(dxil::ResourceClass(llvm::to_underlying(Clause.Type)),
+                  ArrayRef(ResourceClassNames));
+  assert(Name && "Provided an invalid Resource Class");
+  Metadata *Operands[] = {
+      MDString::get(Ctx, *Name),
+      ConstantAsMetadata::get(Builder.getInt32(Clause.NumDescriptors)),
+      ConstantAsMetadata::get(Builder.getInt32(Clause.Reg.Number)),
+      ConstantAsMetadata::get(Builder.getInt32(Clause.Space)),
+      ConstantAsMetadata::get(Builder.getInt32(Clause.Offset)),
+      ConstantAsMetadata::get(
+          Builder.getInt32(llvm::to_underlying(Clause.Flags))),
+  };
+  return MDNode::get(Ctx, Operands);
 }
 
 MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) {

From fcc10e55cabb90f3097a8da4c114e827a1d746eb Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 14:51:48 -0400
Subject: [PATCH 630/851] Remove unnecessary BOM from file; NFC

Fixes #144373
---
 llvm/include/llvm/ADT/GenericUniformityImpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 715df7ab9a7aa..c2460c497b40b 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -1,4 +1,4 @@
-﻿//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
+//===- GenericUniformityImpl.h -----------------------*- C++ -*------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.

From f83d09a1f60aee28a8ed9020cd72971ec2885f24 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Mon, 16 Jun 2025 14:53:15 -0400
Subject: [PATCH 631/851] Revert "[RISCV] Remove B and Zbc extension from Andes
 series cpus." (#144402)

Reverts llvm/llvm-project#144022

This has been failing postcommit CI for two days:
https://lab.llvm.org/buildbot/#/builders/63
---
 .../Driver/print-enabled-extensions/riscv-andes-a25.c     | 7 ++++++-
 .../Driver/print-enabled-extensions/riscv-andes-a45.c     | 6 +++++-
 .../Driver/print-enabled-extensions/riscv-andes-ax25.c    | 7 ++++++-
 .../Driver/print-enabled-extensions/riscv-andes-ax45.c    | 6 +++++-
 .../Driver/print-enabled-extensions/riscv-andes-n45.c     | 6 +++++-
 .../Driver/print-enabled-extensions/riscv-andes-nx45.c    | 6 +++++-
 llvm/lib/Target/RISCV/RISCVProcessors.td                  | 8 ++++++++
 llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s              | 2 +-
 8 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
index cfb4d0ed58d11..d8b3848d84520 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a25.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,8 +19,12 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
index 3c3c554dffc57..a0a1c35911409 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-a45.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,8 +19,11 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
index 70100a0a8df13..3f933ecd8ac83 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax25.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -17,8 +18,12 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbc                  1.0       'Zbc' (Carry-Less Multiplication)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbc1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
index d2b1a32e321e5..6460d701411bc 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-ax45.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -17,8 +18,11 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
index 1a2c30bfc7a2e..4d9c514b756e6 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-n45.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -18,8 +19,11 @@
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
 // CHECK-NEXT:     zcf                  1.0       'Zcf' (Compressed Single-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zcf1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
index 50c38da3bd034..5eaada3f9e164 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-andes-nx45.c
@@ -10,6 +10,7 @@
 // CHECK-NEXT:     f                    2.2       'F' (Single-Precision Floating-Point)
 // CHECK-NEXT:     d                    2.2       'D' (Double-Precision Floating-Point)
 // CHECK-NEXT:     c                    2.0       'C' (Compressed Instructions)
+// CHECK-NEXT:     b                    1.0       'B' (the collection of the Zba, Zbb, Zbs extensions)
 // CHECK-NEXT:     zicsr                2.0       'Zicsr' (CSRs)
 // CHECK-NEXT:     zifencei             2.0       'Zifencei' (fence.i)
 // CHECK-NEXT:     zmmul                1.0       'Zmmul' (Integer Multiplication)
@@ -17,8 +18,11 @@
 // CHECK-NEXT:     zalrsc               1.0       'Zalrsc' (Load-Reserved/Store-Conditional)
 // CHECK-NEXT:     zca                  1.0       'Zca' (part of the C extension, excluding compressed floating point loads/stores)
 // CHECK-NEXT:     zcd                  1.0       'Zcd' (Compressed Double-Precision Floating-Point Instructions)
+// CHECK-NEXT:     zba                  1.0       'Zba' (Address Generation Instructions)
+// CHECK-NEXT:     zbb                  1.0       'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT:     zbs                  1.0       'Zbs' (Single-Bit Instructions)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
 // CHECK-EMPTY:
 // CHECK-NEXT: Experimental extensions
 // CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_xandesperf5p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zicsr2p0_zifencei2p0_zmmul1p0_zaamo1p0_zalrsc1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbs1p0_xandesperf5p0
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index d7e6c71ea062e..32f4ab607a34c 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -703,6 +703,8 @@ def ANDES_A25 : RISCVProcessorModel<"andes-a25",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
+                                     FeatureStdExtB,
+                                     FeatureStdExtZbc,
                                      FeatureVendorXAndesPerf]>;
 
 def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
@@ -716,6 +718,8 @@ def ANDES_AX25 : RISCVProcessorModel<"andes-ax25",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
+                                      FeatureStdExtB,
+                                      FeatureStdExtZbc,
                                       FeatureVendorXAndesPerf]>;
 
 defvar Andes45TuneFeatures = [TuneAndes45,
@@ -737,6 +741,7 @@ def ANDES_N45 : RISCVProcessorModel<"andes-n45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
+                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -751,6 +756,7 @@ def ANDES_NX45 : RISCVProcessorModel<"andes-nx45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
+                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
 
@@ -765,6 +771,7 @@ def ANDES_A45 : RISCVProcessorModel<"andes-a45",
                                      FeatureStdExtF,
                                      FeatureStdExtD,
                                      FeatureStdExtC,
+                                     FeatureStdExtB,
                                      FeatureVendorXAndesPerf],
                                     Andes45TuneFeatures>;
 
@@ -779,5 +786,6 @@ def ANDES_AX45 : RISCVProcessorModel<"andes-ax45",
                                       FeatureStdExtF,
                                       FeatureStdExtD,
                                       FeatureStdExtC,
+                                      FeatureStdExtB,
                                       FeatureVendorXAndesPerf],
                                      Andes45TuneFeatures>;
diff --git a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
index d90dce8c5c3fc..f6dc6eef3f0ff 100644
--- a/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
+++ b/llvm/test/tools/llvm-mca/RISCV/Andes45/gpr.s
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+b,+zbc -timeline -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=andes-nx45 -mattr=+zbc -timeline -iterations=1 < %s | FileCheck %s
 
 # Two ALUs without dependency can be dispatched in the same cycle.
 add a0, a0, a0

From e8362234f60612a250d832cc8d0f68fe7fa9ea17 Mon Sep 17 00:00:00 2001
From: Scott Linder <scott.linder@amd.com>
Date: Mon, 16 Jun 2025 15:03:02 -0400
Subject: [PATCH 632/851] [Object][AMDGPU] Support REL relocations (#143966)

Shaders compiled with DXC/LLPC generate these relocations, and even if
that changes in the future we want to handle existing binaries. The
friction to support this and the maintenance cost long term both seem
incredibly low, considering other targets like ARM support both REL/RELA
static relocations behind the same interface.
---
 llvm/docs/AMDGPUUsage.rst                     |  3 +-
 llvm/lib/Object/RelocationResolver.cpp        |  6 +-
 llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml | 86 +++++++++++++++++++
 3 files changed, 92 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 39f04f8e01b85..c052b076c21c3 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -2709,7 +2709,8 @@ The following relocation types are supported:
 the ``mesa3d`` OS, which does not support ``R_AMDGPU_ABS64``.
 
 There is no current OS loader support for 32-bit programs and so
-``R_AMDGPU_ABS32`` is not used.
+``R_AMDGPU_ABS32`` is only generated for static relocations, for example to
+implement some DWARF32 forms.
 
 .. _amdgpu-loaded-code-object-path-uniform-resource-identifier:
 
diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp
index 8cf748aa5681c..b6318bbe3ab74 100644
--- a/llvm/lib/Object/RelocationResolver.cpp
+++ b/llvm/lib/Object/RelocationResolver.cpp
@@ -274,11 +274,13 @@ static bool supportsAmdgpu(uint64_t Type) {
 }
 
 static uint64_t resolveAmdgpu(uint64_t Type, uint64_t Offset, uint64_t S,
-                              uint64_t /*LocData*/, int64_t Addend) {
+                              uint64_t LocData, int64_t Addend) {
+  assert((LocData == 0 || Addend == 0) &&
+         "one of LocData and Addend must be 0");
   switch (Type) {
   case ELF::R_AMDGPU_ABS32:
   case ELF::R_AMDGPU_ABS64:
-    return S + Addend;
+    return S + LocData + Addend;
   default:
     llvm_unreachable("Invalid relocation type");
   }
diff --git a/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml b/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml
new file mode 100644
index 0000000000000..23b7f087e9570
--- /dev/null
+++ b/llvm/test/DebugInfo/AMDGPU/dwarfdump-rel.yaml
@@ -0,0 +1,86 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-dwarfdump -i %t | FileCheck %s
+
+# Test REL relocation handling for AMDGPU
+
+# CHECK: DW_TAG_compile_unit
+# CHECK: DW_AT_producer ("dxc")
+# CHECK: DW_AT_name (".\\example.hlsl")
+# CHECK: DW_AT_str_offsets_base (0x00000008)
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  OSABI:           ELFOSABI_AMDGPU_PAL
+  Type:            ET_REL
+  Machine:         EM_AMDGPU
+  Flags:           [ EF_AMDGPU_MACH_AMDGCN_GFX1201 ]
+  SectionHeaderStringTable: .strtab
+Sections:
+  - Name:            .debug_abbrev
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         01110125251305032572171017110B120673178C0117000000
+  - Name:            .debug_info
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         23000000050001080000000001000400010800000000000000005C000000080000000C00000000
+  - Name:            .debug_str_offsets
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         0C000000050000000000000004000000
+  - Name:            .rel.debug_info
+    Type:            SHT_REL
+    Flags:           [ SHF_INFO_LINK ]
+    Link:            .symtab
+    AddressAlign:    0x8
+    Info:            .debug_info
+    Relocations:
+      - Offset:          0x8
+        Symbol:          .debug_abbrev
+        Type:            R_AMDGPU_ABS32
+      - Offset:          0x11
+        Symbol:          .debug_str_offsets
+        Type:            R_AMDGPU_ABS32
+  - Name:            .rel.debug_str_offsets
+    Type:            SHT_REL
+    Flags:           [ SHF_INFO_LINK ]
+    Link:            .symtab
+    AddressAlign:    0x8
+    Info:            .debug_str_offsets
+    Relocations:
+      - Offset:          0x8
+        Symbol:          .debug_str
+        Type:            R_AMDGPU_ABS32
+      - Offset:          0xC
+        Symbol:          .debug_str
+        Type:            R_AMDGPU_ABS32
+  - Type:            SectionHeaderTable
+    Sections:
+      - Name:            .strtab
+      - Name:            .debug_abbrev
+      - Name:            .debug_info
+      - Name:            .rel.debug_info
+      - Name:            .debug_str_offsets
+      - Name:            .rel.debug_str_offsets
+      - Name:            .debug_str
+      - Name:            .symtab
+Symbols:
+  - Name:            .debug_abbrev
+    Type:            STT_SECTION
+    Section:         .debug_abbrev
+  - Name:            .debug_info
+    Type:            STT_SECTION
+    Section:         .debug_info
+  - Name:            .debug_str_offsets
+    Type:            STT_SECTION
+    Section:         .debug_str_offsets
+  - Name:            .debug_str
+    Type:            STT_SECTION
+    Section:         .debug_str
+DWARF:
+  debug_str:
+    - 'dxc'
+    - '.\example.hlsl'
+...

From a00b736a797d252d9e26cc13fb45993d7b02ede2 Mon Sep 17 00:00:00 2001
From: Diego Caballero <dieg0ca6aller0@gmail.com>
Date: Mon, 16 Jun 2025 12:05:20 -0700
Subject: [PATCH 633/851] [mlir][Vector] Support `vector.extract(xfer_read)`
 folding with dynamic indices (#143269)

This PR is part of the last step to remove `vector.extractelement` and `vector.insertelement` ops.
RFC: https://discourse.llvm.org/t/rfc-psa-remove-vector-extractelement-and-vector-insertelement-ops-in-favor-of-vector-extract-and-vector-insert-ops

It adds support for folding `vector.transfer_read(vector.extract) ->
memref.load` with dynamic indices, which is currently supported by
`vector.extractelement`.
---
 .../Transforms/VectorTransferOpTransforms.cpp | 28 ++++++++++++-----
 .../scalar-vector-transfer-to-memref.mlir     | 30 +++++++++++++++++++
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
index 7dbb7a334fe62..384717aeca665 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp
@@ -886,17 +886,31 @@ class RewriteScalarExtractOfTransferRead
     SmallVector<Value> newIndices(xferOp.getIndices().begin(),
                                   xferOp.getIndices().end());
     for (auto [i, pos] : llvm::enumerate(extractOp.getMixedPosition())) {
-      assert(isa<Attribute>(pos) && "Unexpected non-constant index");
-      int64_t offset = cast<IntegerAttr>(cast<Attribute>(pos)).getInt();
       int64_t idx = newIndices.size() - extractOp.getNumIndices() + i;
-      OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
-          rewriter, extractOp.getLoc(),
-          rewriter.getAffineSymbolExpr(0) + offset, {newIndices[idx]});
-      if (auto value = dyn_cast<Value>(ofr)) {
+
+      // Compute affine expression `newIndices[idx] + pos` where `pos` can be
+      // either a constant or a value.
+      OpFoldResult composedIdx;
+      if (auto attr = dyn_cast<Attribute>(pos)) {
+        int64_t offset = cast<IntegerAttr>(attr).getInt();
+        composedIdx = affine::makeComposedFoldedAffineApply(
+            rewriter, extractOp.getLoc(),
+            rewriter.getAffineSymbolExpr(0) + offset, {newIndices[idx]});
+      } else {
+        Value dynamicOffset = cast<Value>(pos);
+        AffineExpr sym0, sym1;
+        bindSymbols(rewriter.getContext(), sym0, sym1);
+        composedIdx = affine::makeComposedFoldedAffineApply(
+            rewriter, extractOp.getLoc(), sym0 + sym1,
+            {newIndices[idx], dynamicOffset});
+      }
+
+      // Update the corresponding index with the folded result.
+      if (auto value = dyn_cast<Value>(composedIdx)) {
         newIndices[idx] = value;
       } else {
         newIndices[idx] = rewriter.create<arith::ConstantIndexOp>(
-            extractOp.getLoc(), *getConstantIntValue(ofr));
+            extractOp.getLoc(), *getConstantIntValue(composedIdx));
       }
     }
     if (isa<MemRefType>(xferOp.getBase().getType())) {
diff --git a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir
index 52b0fdee184f6..7a1d6b3a8344a 100644
--- a/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir
+++ b/mlir/test/Dialect/Vector/scalar-vector-transfer-to-memref.mlir
@@ -148,3 +148,33 @@ func.func @subvector_extract(%m: memref<?x?xf32>, %idx: index) -> vector<16xf32>
   return %1 : vector<16xf32>
 }
 
+// -----
+
+//       CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK-LABEL: func @transfer_read_1d_extract_dynamic(
+//  CHECK-SAME:     %[[MEMREF:.*]]: memref<?xf32>, %[[M_IDX:.*]]: index, %[[E_IDX:.*]]: index
+//       CHECK:   %[[APPLY:.*]] = affine.apply #[[$MAP]]()[%[[M_IDX]], %[[E_IDX]]]
+//       CHECK:   %[[RES:.*]] = memref.load %[[MEMREF]][%[[APPLY]]]
+func.func @transfer_read_1d_extract_dynamic(%m: memref<?xf32>, %idx: index,
+                                            %offset: index) -> f32 {
+  %cst = arith.constant 0.0 : f32
+  %vec = vector.transfer_read %m[%idx], %cst {in_bounds = [true]} : memref<?xf32>, vector<5xf32>
+  %elem = vector.extract %vec[%offset] : f32 from vector<5xf32>
+  return %elem : f32
+}
+
+// -----
+
+//       CHECK: #[[$MAP:.*]] = affine_map<()[s0, s1] -> (s0 + s1)>
+// CHECK-LABEL: func @transfer_read_2d_extract_dynamic(
+//  CHECK-SAME:     %[[MEMREF:.*]]: memref<?x?xf32>, %[[ROW_IDX:.*]]: index, %[[COL_IDX:.*]]: index, %[[ROW_OFFSET:.*]]: index, %[[COL_OFFSET:.*]]: index
+//       CHECK:   %[[ROW_APPLY:.*]] = affine.apply #[[$MAP]]()[%[[ROW_IDX]], %[[ROW_OFFSET]]]
+//       CHECK:   %[[COL_APPLY:.*]] = affine.apply #[[$MAP]]()[%[[COL_IDX]], %[[COL_OFFSET]]]
+//       CHECK:   %[[RES:.*]] = memref.load %[[MEMREF]][%[[ROW_APPLY]], %[[COL_APPLY]]]
+func.func @transfer_read_2d_extract_dynamic(%m: memref<?x?xf32>, %row_idx: index, %col_idx: index,
+                                            %row_offset: index, %col_offset: index) -> f32 {
+  %cst = arith.constant 0.0 : f32
+  %vec = vector.transfer_read %m[%row_idx, %col_idx], %cst {in_bounds = [true, true]} : memref<?x?xf32>, vector<10x5xf32>
+  %elem = vector.extract %vec[%row_offset, %col_offset] : f32 from vector<10x5xf32>
+  return %elem : f32
+}

From a0662ceba83cf8782da4047b8ee6d175591f168f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Rodr=C3=ADguez=20Troiti=C3=B1o?=
 <danielrodriguez@meta.com>
Date: Mon, 16 Jun 2025 12:06:25 -0700
Subject: [PATCH 634/851] [objcopy][MachO] Revert special handling of
 encryptable binaries (#144058)

Code originally added in #120995 and later corrected in #130517 but
apparently still not correct according to #141494 and
rust-lang/rust#141913.

Revert the special handling because the test written in #120995 and
#130517 still passes without those changes. Kept the test and improved
it with a `__DATA` section to keep the current behaviour checked in case
other changes modify the behaviour and break this edge case.
---
 llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp |   8 -
 llvm/lib/ObjCopy/MachO/MachOObject.cpp        |   4 -
 llvm/lib/ObjCopy/MachO/MachOObject.h          |   3 -
 llvm/lib/ObjCopy/MachO/MachOReader.cpp        |   4 -
 .../MachO/strip-with-encryption-info.test     | 156 ++++++++++++------
 5 files changed, 106 insertions(+), 69 deletions(-)

diff --git a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
index 8ecd669e67178..93bc6631e64c8 100644
--- a/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOLayoutBuilder.cpp
@@ -116,10 +116,6 @@ uint64_t MachOLayoutBuilder::layoutSegments() {
   const bool IsObjectFile =
       O.Header.FileType == MachO::HeaderFileType::MH_OBJECT;
   uint64_t Offset = IsObjectFile ? (HeaderSize + O.Header.SizeOfCmds) : 0;
-  // If we are emitting an encryptable binary, our load commands must have a
-  // separate (non-encrypted) page to themselves.
-  bool RequiresFirstSectionOutsideFirstPage =
-      O.EncryptionInfoCommandIndex.has_value();
   for (LoadCommand &LC : O.LoadCommands) {
     auto &MLC = LC.MachOLoadCommand;
     StringRef Segname;
@@ -173,10 +169,6 @@ uint64_t MachOLayoutBuilder::layoutSegments() {
         if (!Sec->hasValidOffset()) {
           Sec->Offset = 0;
         } else {
-          if (RequiresFirstSectionOutsideFirstPage) {
-            SectOffset = alignToPowerOf2(SectOffset, PageSize);
-            RequiresFirstSectionOutsideFirstPage = false;
-          }
           Sec->Offset = SegOffset + SectOffset;
           Sec->Size = Sec->Content.size();
           SegFileSize = std::max(SegFileSize, SectOffset + Sec->Size);
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.cpp b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
index e0819d89d24ff..8d2c02dc37c99 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.cpp
@@ -98,10 +98,6 @@ void Object::updateLoadCommandIndexes() {
     case MachO::LC_DYLD_EXPORTS_TRIE:
       ExportsTrieCommandIndex = Index;
       break;
-    case MachO::LC_ENCRYPTION_INFO:
-    case MachO::LC_ENCRYPTION_INFO_64:
-      EncryptionInfoCommandIndex = Index;
-      break;
     }
   }
 }
diff --git a/llvm/lib/ObjCopy/MachO/MachOObject.h b/llvm/lib/ObjCopy/MachO/MachOObject.h
index 13ac87ed3ed06..8f9444f5fb025 100644
--- a/llvm/lib/ObjCopy/MachO/MachOObject.h
+++ b/llvm/lib/ObjCopy/MachO/MachOObject.h
@@ -341,9 +341,6 @@ struct Object {
   /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
   /// corresponding to the __TEXT segment.
   std::optional<size_t> TextSegmentCommandIndex;
-  /// The index of the LC_ENCRYPTION_INFO or LC_ENCRYPTION_INFO_64 load command
-  /// if present.
-  std::optional<size_t> EncryptionInfoCommandIndex;
 
   BumpPtrAllocator Alloc;
   StringSaver NewSectionsContents;
diff --git a/llvm/lib/ObjCopy/MachO/MachOReader.cpp b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
index ef0e0262f9395..2b344f36d8e78 100644
--- a/llvm/lib/ObjCopy/MachO/MachOReader.cpp
+++ b/llvm/lib/ObjCopy/MachO/MachOReader.cpp
@@ -184,10 +184,6 @@ Error MachOReader::readLoadCommands(Object &O) const {
     case MachO::LC_DYLD_CHAINED_FIXUPS:
       O.ChainedFixupsCommandIndex = O.LoadCommands.size();
       break;
-    case MachO::LC_ENCRYPTION_INFO:
-    case MachO::LC_ENCRYPTION_INFO_64:
-      O.EncryptionInfoCommandIndex = O.LoadCommands.size();
-      break;
     }
 #define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
   case MachO::LCName:                                                          \
diff --git a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
index 2b2bd670613de..d6f6fe10d88c2 100644
--- a/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
+++ b/llvm/test/tools/llvm-objcopy/MachO/strip-with-encryption-info.test
@@ -16,7 +16,11 @@
 # CHECK:       fileoff: 0
 
 # The YAML below is the following code
+# ```
+# static int foo = 12345;
+# int bar = 4567;
 # int main(int argc, char **argv) { return 0; }
+# ```
 # Compiled on macOS against the macOS SDK and passing `-Wl,-encryptable`
 # Contents are removed, since they are not important for the test. We need a
 # small text segment (smaller than a page).
@@ -26,8 +30,8 @@ FileHeader:
   cputype:         0x100000C
   cpusubtype:      0x0
   filetype:        0x2
-  ncmds:           15
-  sizeofcmds:      696
+  ncmds:           18
+  sizeofcmds:      920
   flags:           0x200085
   reserved:        0x0
 LoadCommands:
@@ -69,7 +73,7 @@ LoadCommands:
       - sectname:        __unwind_info
         segname:         __TEXT
         addr:            0x100004020
-        size:            4152
+        size:            88
         offset:          0x4020
         align:           2
         reloff:          0x0
@@ -79,37 +83,61 @@ LoadCommands:
         reserved2:       0x0
         reserved3:       0x0
   - cmd:             LC_SEGMENT_64
-    cmdsize:         72
-    segname:         __LINKEDIT
+    cmdsize:         152
+    segname:         __DATA
     vmaddr:          4295000064
-    vmsize:          592
+    vmsize:          16384
     fileoff:         32768
-    filesize:        592
+    filesize:        16384
+    maxprot:         3
+    initprot:        3
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __data
+        segname:         __DATA
+        addr:            0x100008000
+        size:            4
+        offset:          0x8000
+        align:           2
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x0
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4295016448
+    vmsize:          16384
+    fileoff:         49152
+    filesize:        768
     maxprot:         1
     initprot:        1
     nsects:          0
     flags:           0
   - cmd:             LC_DYLD_CHAINED_FIXUPS
     cmdsize:         16
-    dataoff:         32768
-    datasize:        48
+    dataoff:         49152
+    datasize:        56
   - cmd:             LC_DYLD_EXPORTS_TRIE
     cmdsize:         16
-    dataoff:         32816
-    datasize:        48
+    dataoff:         49208
+    datasize:        64
   - cmd:             LC_SYMTAB
     cmdsize:         24
-    symoff:          32872
-    nsyms:           2
-    stroff:          32904
-    strsize:         32
+    symoff:          49280
+    nsyms:           3
+    stroff:          49328
+    strsize:         40
   - cmd:             LC_DYSYMTAB
     cmdsize:         80
     ilocalsym:       0
     nlocalsym:       0
     iextdefsym:      0
-    nextdefsym:      2
-    iundefsym:       2
+    nextdefsym:      3
+    iundefsym:       3
     nundefsym:       0
     tocoff:          0
     ntoc:            0
@@ -123,12 +151,6 @@ LoadCommands:
     nextrel:         0
     locreloff:       0
     nlocrel:         0
-  - cmd:             LC_ENCRYPTION_INFO_64
-    cmdsize:         24
-    cryptoff:        16384
-    cryptsize:       16384
-    cryptid:         0
-    pad:             0
   - cmd:             LC_LOAD_DYLINKER
     cmdsize:         32
     name:            12
@@ -136,32 +158,50 @@ LoadCommands:
     ZeroPadBytes:    7
   - cmd:             LC_UUID
     cmdsize:         24
-    uuid:            4C4C4447-5555-3144-A18A-01E9EB7E7D92
+    uuid:            ADDA943C-657A-3A49-9580-168E17A40FFB
   - cmd:             LC_BUILD_VERSION
     cmdsize:         32
     platform:        1
     minos:           983040
-    sdk:             983552
+    sdk:             984320
     ntools:          1
     Tools:
-      - tool:            4
-        version:         1310720
+      - tool:            3
+        version:         76481537
+  - cmd:             LC_SOURCE_VERSION
+    cmdsize:         16
+    version:         0
   - cmd:             LC_MAIN
     cmdsize:         24
     entryoff:        16384
     stacksize:       0
+  - cmd:             LC_ENCRYPTION_INFO_64
+    cmdsize:         24
+    cryptoff:        16384
+    cryptsize:       16384
+    cryptid:         0
+    pad:             0
+  - cmd:             LC_LOAD_DYLIB
+    cmdsize:         56
+    dylib:
+      name:            24
+      timestamp:       2
+      current_version: 88539136
+      compatibility_version: 65536
+    Content:         '/usr/lib/libSystem.B.dylib'
+    ZeroPadBytes:    6
   - cmd:             LC_FUNCTION_STARTS
     cmdsize:         16
-    dataoff:         32864
+    dataoff:         49272
     datasize:        8
   - cmd:             LC_DATA_IN_CODE
     cmdsize:         16
-    dataoff:         32872
+    dataoff:         49280
     datasize:        0
   - cmd:             LC_CODE_SIGNATURE
     cmdsize:         16
-    dataoff:         32944
-    datasize:        416
+    dataoff:         49376
+    datasize:        544
 LinkEditData:
   ExportTrie:
     TerminalSize:    0
@@ -173,51 +213,67 @@ LinkEditData:
     ImportName:      ''
     Children:
       - TerminalSize:    0
-        NodeOffset:      5
+        NodeOffset:      25
         Name:            _
         Flags:           0x0
         Address:         0x0
         Other:           0x0
         ImportName:      ''
         Children:
+          - TerminalSize:    2
+            NodeOffset:      9
+            Name:            _mh_execute_header
+            Flags:           0x0
+            Address:         0x0
+            Other:           0x0
+            ImportName:      ''
           - TerminalSize:    4
-            NodeOffset:      33
-            Name:            main
+            NodeOffset:      13
+            Name:            bar
             Flags:           0x0
-            Address:         0x4000
+            Address:         0x8000
             Other:           0x0
             ImportName:      ''
-          - TerminalSize:    2
-            NodeOffset:      39
-            Name:            _mh_execute_header
+          - TerminalSize:    4
+            NodeOffset:      19
+            Name:            main
             Flags:           0x0
-            Address:         0x0
+            Address:         0x4000
             Other:           0x0
             ImportName:      ''
   NameList:
     - n_strx:          2
       n_type:          0xF
       n_sect:          1
+      n_desc:          16
+      n_value:         4294967296
+    - n_strx:          22
+      n_type:          0xF
+      n_sect:          3
       n_desc:          0
-      n_value:         4294983680
-    - n_strx:          8
+      n_value:         4295000064
+    - n_strx:          27
       n_type:          0xF
       n_sect:          1
-      n_desc:          16
-      n_value:         4294967296
+      n_desc:          0
+      n_value:         4294983680
   StringTable:
     - ' '
-    - _main
     - __mh_execute_header
+    - _bar
+    - _main
+    - ''
+    - ''
+    - ''
     - ''
     - ''
     - ''
     - ''
   FunctionStarts:  [ 0x4000 ]
-  ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x30, 0x0, 
-                     0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
-                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
-                     0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
-                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
+  ChainedFixups:   [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x34, 0x0,
+                     0x0, 0x0, 0x34, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                     0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ]
 ...
-

From 402c376daa659c0c3a477ad038a415079ffa0a48 Mon Sep 17 00:00:00 2001
From: William Huynh <William.Huynh@arm.com>
Date: Mon, 16 Jun 2025 20:22:58 +0100
Subject: [PATCH 635/851] [libc] Change default behaviour of baremetal/printf
 to use stdout (#143703)

In #94078, `write_to_stdout` had not been fully implemented. However,
now that it has been implemented, to conform with the C standard
(7.23.6.3. The printf function, specifically point 2), we use `stdout`.
This issue is tracked in #94685.

- Also prefer `static constexpr`
- Made it explicit that we are writing to `stdout`
---
 libc/src/stdio/baremetal/printf.cpp  | 8 ++++----
 libc/src/stdio/baremetal/putchar.cpp | 2 +-
 libc/src/stdio/baremetal/puts.cpp    | 4 ++--
 libc/src/stdio/baremetal/vprintf.cpp | 8 ++++----
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/libc/src/stdio/baremetal/printf.cpp b/libc/src/stdio/baremetal/printf.cpp
index c94698ec02953..7253c6549a4e4 100644
--- a/libc/src/stdio/baremetal/printf.cpp
+++ b/libc/src/stdio/baremetal/printf.cpp
@@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace {
 
-LIBC_INLINE int raw_write_hook(cpp::string_view new_str, void *) {
-  write_to_stderr(new_str);
+LIBC_INLINE int stdout_write_hook(cpp::string_view new_str, void *) {
+  write_to_stdout(new_str);
   return printf_core::WRITE_OK;
 }
 
@@ -35,11 +35,11 @@ LLVM_LIBC_FUNCTION(int, printf, (const char *__restrict format, ...)) {
                                  // and pointer semantics, as well as handling
                                  // destruction automatically.
   va_end(vlist);
-  constexpr size_t BUFF_SIZE = 1024;
+  static constexpr size_t BUFF_SIZE = 1024;
   char buffer[BUFF_SIZE];
 
   printf_core::WriteBuffer<printf_core::WriteMode::FLUSH_TO_STREAM> wb(
-      buffer, BUFF_SIZE, &raw_write_hook, nullptr);
+      buffer, BUFF_SIZE, &stdout_write_hook, nullptr);
   printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb);
 
   int retval = printf_core::printf_main(&writer, format, args);
diff --git a/libc/src/stdio/baremetal/putchar.cpp b/libc/src/stdio/baremetal/putchar.cpp
index 0ba46a5ade6c9..ac21e6e783b01 100644
--- a/libc/src/stdio/baremetal/putchar.cpp
+++ b/libc/src/stdio/baremetal/putchar.cpp
@@ -16,7 +16,7 @@ namespace LIBC_NAMESPACE_DECL {
 LLVM_LIBC_FUNCTION(int, putchar, (int c)) {
   char uc = static_cast<char>(c);
 
-  write_to_stderr(cpp::string_view(&uc, 1));
+  write_to_stdout(cpp::string_view(&uc, 1));
 
   return 0;
 }
diff --git a/libc/src/stdio/baremetal/puts.cpp b/libc/src/stdio/baremetal/puts.cpp
index 5062efda1c0dc..fcd3aa086b2bf 100644
--- a/libc/src/stdio/baremetal/puts.cpp
+++ b/libc/src/stdio/baremetal/puts.cpp
@@ -17,8 +17,8 @@ LLVM_LIBC_FUNCTION(int, puts, (const char *__restrict str)) {
   cpp::string_view str_view(str);
 
   // TODO: Can we combine these to avoid needing two writes?
-  write_to_stderr(str_view);
-  write_to_stderr("\n");
+  write_to_stdout(str_view);
+  write_to_stdout("\n");
 
   return 0;
 }
diff --git a/libc/src/stdio/baremetal/vprintf.cpp b/libc/src/stdio/baremetal/vprintf.cpp
index 3e8631abd90d9..ab02533f14911 100644
--- a/libc/src/stdio/baremetal/vprintf.cpp
+++ b/libc/src/stdio/baremetal/vprintf.cpp
@@ -21,8 +21,8 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace {
 
-LIBC_INLINE int raw_write_hook(cpp::string_view new_str, void *) {
-  write_to_stderr(new_str);
+LIBC_INLINE int stdout_write_hook(cpp::string_view new_str, void *) {
+  write_to_stdout(new_str);
   return printf_core::WRITE_OK;
 }
 
@@ -33,11 +33,11 @@ LLVM_LIBC_FUNCTION(int, vprintf,
   internal::ArgList args(vlist); // This holder class allows for easier copying
                                  // and pointer semantics, as well as handling
                                  // destruction automatically.
-  constexpr size_t BUFF_SIZE = 1024;
+  static constexpr size_t BUFF_SIZE = 1024;
   char buffer[BUFF_SIZE];
 
   printf_core::WriteBuffer<printf_core::WriteMode::FLUSH_TO_STREAM> wb(
-      buffer, BUFF_SIZE, &raw_write_hook, nullptr);
+      buffer, BUFF_SIZE, &stdout_write_hook, nullptr);
   printf_core::Writer<printf_core::WriteMode::FLUSH_TO_STREAM> writer(wb);
 
   int retval = printf_core::printf_main(&writer, format, args);

From 25781221d68a700eae679a19f701d4ad67e91dc9 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Mon, 16 Jun 2025 12:43:13 -0700
Subject: [PATCH 636/851] [instcombine] Delete dead transform for reverse of
 binop (#143967)

We canonicalize reverse to after a binop in foldVectorBinop, and
simplify reverse pairs in InstSimplify, so these elimination transforms
are redundant.
---
 .../InstCombine/InstCombineCalls.cpp          | 29 ++++---------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8c8cc0859e4af..03897117861f6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3555,32 +3555,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     break;
   }
   case Intrinsic::vector_reverse: {
-    Value *BO0, *BO1, *X, *Y;
     Value *Vec = II->getArgOperand(0);
-    if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
-      auto *OldBinOp = cast<BinaryOperator>(Vec);
-      if (match(BO0, m_VecReverse(m_Value(X)))) {
-        // rev(binop rev(X), rev(Y)) --> binop X, Y
-        if (match(BO1, m_VecReverse(m_Value(Y))))
-          return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
-                                             OldBinOp->getOpcode(), X, Y,
-                                             OldBinOp, OldBinOp->getName(),
-                                             II->getIterator()));
-        // rev(binop rev(X), BO1Splat) --> binop X, BO1Splat
-        if (isSplatValue(BO1))
-          return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
-                                             OldBinOp->getOpcode(), X, BO1,
-                                             OldBinOp, OldBinOp->getName(),
-                                             II->getIterator()));
-      }
-      // rev(binop BO0Splat, rev(Y)) --> binop BO0Splat, Y
-      if (match(BO1, m_VecReverse(m_Value(Y))) && isSplatValue(BO0))
-        return replaceInstUsesWith(CI,
-                                   BinaryOperator::CreateWithCopiedFlags(
-                                       OldBinOp->getOpcode(), BO0, Y, OldBinOp,
-                                       OldBinOp->getName(), II->getIterator()));
-    }
+    // Note: We canonicalize reverse after binops, so we don't need a
+    // corresponding binop case here. TODO: Consider canonicalizing
+    // reverse after fneg?
+
     // rev(unop rev(X)) --> unop X
+    Value *X;
     if (match(Vec, m_OneUse(m_UnOp(m_VecReverse(m_Value(X)))))) {
       auto *OldUnOp = cast<UnaryOperator>(Vec);
       auto *NewUnOp = UnaryOperator::CreateWithCopiedFlags(

From b0378e7ca953c2390168f352c5a88fd325cde894 Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze@braunis.de>
Date: Mon, 16 Jun 2025 12:55:12 -0700
Subject: [PATCH 637/851] [AArch64TargetParser]Fix
 reconstructFromParsedFeatures ignoring negative features (#142236)

The `targetFeatureToExtension` function used by
reconstructFromParsedFeatures only found positive `+FEATURE` strings,
but not negative `-FEATURE` strings. Extend the function to handle both
to fix `reconstructFromParsedFeatures`.
---
 .../CodeGen/aarch64-always-inline-feature-bug.c  |  8 ++++++++
 llvm/lib/TargetParser/AArch64TargetParser.cpp    |  5 +++--
 llvm/unittests/TargetParser/TargetParserTest.cpp | 16 ++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/aarch64-always-inline-feature-bug.c

diff --git a/clang/test/CodeGen/aarch64-always-inline-feature-bug.c b/clang/test/CodeGen/aarch64-always-inline-feature-bug.c
new file mode 100644
index 0000000000000..27c3983c66d2b
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-always-inline-feature-bug.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple aarch64-- -target-feature +neon -target-feature +sve\
+// RUN:   -target-feature -sve -emit-llvm %s -o - | FileCheck %s
+
+// Reproducer for bug where clang would reject always_inline for unrelated
+// target features if they were disable with `-feature` on the command line.
+// CHECK: @bar
+__attribute__((always_inline)) __attribute__((target("neon"))) void foo() {}
+void bar() { foo(); }
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index e13c6e6d28c2b..4a2523440f0f0 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -60,7 +60,7 @@ uint64_t AArch64::getFMVPriority(ArrayRef<StringRef> Features) {
   ExtensionSet FeatureBits;
   for (const StringRef Feature : Features) {
     std::optional<FMVInfo> FMV = parseFMVExtension(Feature);
-    if (!FMV) {
+    if (!FMV && Feature.starts_with('+')) {
       if (std::optional<ExtensionInfo> Info = targetFeatureToExtension(Feature))
         FMV = lookupFMVByID(Info->ID);
     }
@@ -181,7 +181,8 @@ std::optional<AArch64::FMVInfo> AArch64::parseFMVExtension(StringRef FMVExt) {
 std::optional<AArch64::ExtensionInfo>
 AArch64::targetFeatureToExtension(StringRef TargetFeature) {
   for (const auto &E : Extensions)
-    if (TargetFeature == E.PosTargetFeature)
+    if (TargetFeature == E.PosTargetFeature ||
+        TargetFeature == E.NegTargetFeature)
       return E;
   return {};
 }
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index f4c93334ac682..c4efb991ab6fd 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1831,6 +1831,22 @@ TEST_P(AArch64ExtensionDependenciesBaseCPUTestFixture,
   }
 }
 
+TEST(TargetParserTest, testAArch64ReconstructFromParsedFeatures) {
+  AArch64::ExtensionSet Extensions;
+  std::vector<std::string> FeatureOptions = {
+      "-sve2", "-Baz", "+sve", "+FooBar", "+sve2", "+neon", "-sve",
+  };
+  std::vector<std::string> NonExtensions;
+  Extensions.reconstructFromParsedFeatures(FeatureOptions, NonExtensions);
+
+  std::vector<std::string> NonExtensionsExpected = {"-Baz", "+FooBar"};
+  ASSERT_THAT(NonExtensions, testing::ContainerEq(NonExtensionsExpected));
+  std::vector<StringRef> Features;
+  Extensions.toLLVMFeatureList(Features);
+  std::vector<StringRef> FeaturesExpected = {"+neon", "-sve", "+sve2"};
+  ASSERT_THAT(Features, testing::ContainerEq(FeaturesExpected));
+}
+
 AArch64ExtensionDependenciesBaseArchTestParams
     AArch64ExtensionDependenciesArchData[] = {
         // Base architecture features

From a637584fadb1f0b9a4fc526a2952345b14147634 Mon Sep 17 00:00:00 2001
From: DrSergei <serzhdruzhok@gmail.com>
Date: Mon, 16 Jun 2025 22:56:02 +0300
Subject: [PATCH 638/851] [lldb-dap] Add supported languages in package.json
 (#144414)

This patch fixes the [problem]. It was caused by missing supported
languages list in `package.json`. VSCode uses `guessDebugger` [function]
to find supported debuggers based on supported languages in case of
opened file. It uses `interestedInLanguage` [function][1] to do that, so
we should provide list of supported languages. Also, fixed typo in
`fortran`.

[problem]: https://github.com/llvm/llvm-project/issues/144239
[function]: https://github.com/microsoft/vscode/blob/main/src/vs/workbench/contrib/debug/browser/debugAdapterManager.ts#L344
[1]: https://github.com/microsoft/vscode/blob/main/src/vs/workbench/contrib/debug/common/debugger.ts#L171
---
 lldb/tools/lldb-dap/package.json | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json
index 0f51c4f935e33..b150dee792c34 100644
--- a/lldb/tools/lldb-dap/package.json
+++ b/lldb/tools/lldb-dap/package.json
@@ -290,7 +290,7 @@
         "language": "d"
       },
       {
-        "language": "fortan"
+        "language": "fortran"
       },
       {
         "language": "fortran-modern"
@@ -318,6 +318,22 @@
       {
         "type": "lldb-dap",
         "label": "LLDB DAP Debugger",
+        "languages": [
+          "ada",
+          "arm",
+          "c",
+          "cpp",
+          "crystal",
+          "d",
+          "fortran",
+          "fortran-modern",
+          "nim",
+          "objective-c",
+          "objectpascal",
+          "pascal",
+          "rust",
+          "swift"
+        ],
         "configurationAttributes": {
           "launch": {
             "required": [

From 8adccaee2a9e2d967ac54a783ffb71ac6ff79e85 Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Mon, 16 Jun 2025 20:06:46 +0000
Subject: [PATCH 639/851] [libc] Implemented CharacterConverter push/pop for
 utf32->utf8 conversions (#143971)

Implemented CharacterConverter methods for conversion between utf32 ->
utf8
Added tests

---------

Co-authored-by: Michael Jones <michaelrj@google.com>
---
 libc/src/__support/wchar/CMakeLists.txt       |   9 +-
 .../__support/wchar/character_converter.cpp   |  70 ++++++-
 .../src/__support/wchar/character_converter.h |   8 +-
 libc/src/__support/wchar/mbstate.h            |   9 +
 libc/src/__support/wchar/utf_ret.h            |  24 ---
 libc/test/src/__support/CMakeLists.txt        |   6 +
 libc/test/src/__support/wchar/CMakeLists.txt  |  11 ++
 .../src/__support/wchar/utf32_to_8_test.cpp   | 180 ++++++++++++++++++
 8 files changed, 278 insertions(+), 39 deletions(-)
 delete mode 100644 libc/src/__support/wchar/utf_ret.h
 create mode 100644 libc/test/src/__support/wchar/CMakeLists.txt
 create mode 100644 libc/test/src/__support/wchar/utf32_to_8_test.cpp

diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
index 5cca58400ff45..6715e354e23e5 100644
--- a/libc/src/__support/wchar/CMakeLists.txt
+++ b/libc/src/__support/wchar/CMakeLists.txt
@@ -15,12 +15,7 @@ add_object_library(
   DEPENDS
     libc.hdr.types.char8_t
     libc.hdr.types.char32_t
+    libc.src.__support.error_or
+    libc.src.__support.math_extras
     .mbstate
-    .utf_ret
-)
-
-add_header_library(
-  utf_ret
-  HDRS
-    utf_ret.h
 )
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index f09c7815a6cc4..bac2f6d827e13 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,8 +8,10 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
+#include "src/__support/math_extras.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
 
 #include "character_converter.h"
 
@@ -18,17 +20,75 @@ namespace internal {
 
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
+void CharacterConverter::clear() {
+  state->partial = 0;
+  state->bytes_processed = 0;
+  state->total_bytes = 0;
+}
+
 bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char32_t utf32) {
+  // we can't be partially through a conversion when pushing a utf32 value
+  if (!isComplete())
+    return -1;
+
+  state->partial = utf32;
+  state->bytes_processed = 0;
+
+  // determine number of utf-8 bytes needed to represent this utf32 value
+  constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
+  constexpr int NUM_RANGES = 4;
+  for (uint8_t i = 0; i < NUM_RANGES; i++) {
+    if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
+      state->total_bytes = i + 1;
+      return 0;
+    }
+  }
+
+  // `utf32` contains a value that is too large to actually represent a valid
+  // unicode character
+  clear();
+  return -1;
+}
+
+ErrorOr<char8_t> CharacterConverter::pop_utf8() {
+  if (isComplete())
+    return Error(-1);
+
+  constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
+  constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
 
-int CharacterConverter::push(char32_t utf32) {}
+  // the number of bits per utf-8 byte that actually encode character
+  // information not metadata (# of bits excluding the byte headers)
+  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+  constexpr int MASK_ENCODED_BITS =
+      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();
 
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+  char32_t output;
 
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+  // Shift to get the next 6 bits from the utf32 encoding
+  const char32_t shift_amount =
+      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
+  if (state->bytes_processed == 0) {
+    /*
+      Choose the correct set of most significant bits to encode the length
+      of the utf8 sequence. The remaining bits contain the most significant
+      bits of the unicode value of the character.
+    */
+    output = FIRST_BYTE_HEADERS[state->total_bytes - 1] |
+             (state->partial >> shift_amount);
+  } else {
+    // Get the next 6 bits and format it like so: 10xxxxxx
+    output = CONTINUING_BYTE_HEADER |
+             ((state->partial >> shift_amount) & MASK_ENCODED_BITS);
+  }
+
+  state->bytes_processed++;
+  return static_cast<char8_t>(output);
+}
 
 } // namespace internal
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index d0602d2defe22..c4ba7cf6b689f 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -11,8 +11,9 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/common.h"
+#include "src/__support/error_or.h"
 #include "src/__support/wchar/mbstate.h"
-#include "src/__support/wchar/utf_ret.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
@@ -24,13 +25,14 @@ class CharacterConverter {
 public:
   CharacterConverter(mbstate *mbstate);
 
+  void clear();
   bool isComplete();
 
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
 
-  utf_ret<char8_t> pop_utf8();
-  utf_ret<char32_t> pop_utf32();
+  ErrorOr<char8_t> pop_utf8();
+  ErrorOr<char32_t> pop_utf32();
 };
 
 } // namespace internal
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index d33ee354a5443..fb08fb4eaa188 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -17,8 +17,17 @@ namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
 struct mbstate {
+  // store a partial codepoint (in UTF-32)
   char32_t partial;
+
+  /*
+  Progress towards a conversion
+    For utf8  -> utf32, increases with each CharacterConverter::push(utf8_byte)
+    For utf32 ->  utf8, increases with each CharacterConverter::pop_utf8()
+  */
   uint8_t bytes_processed;
+
+  // Total number of bytes that will be needed to represent this character
   uint8_t total_bytes;
 };
 
diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h
deleted file mode 100644
index fa99b76159bd8..0000000000000
--- a/libc/src/__support/wchar/utf_ret.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
-
-#include "src/__support/common.h"
-
-namespace LIBC_NAMESPACE_DECL {
-namespace internal {
-template <typename T> struct utf_ret {
-  T out;
-  int error;
-};
-
-} // namespace internal
-} // namespace LIBC_NAMESPACE_DECL
-
-#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5ca..76218a16e0cf7 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,9 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
+
+# Requires access to uchar header which is not on macos
+# Therefore, cannot currently build this on macos in overlay mode
+if(NOT(LIBC_TARGET_OS_IS_DARWIN))
+  add_subdirectory(wchar)
+endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..5dff6e9115f7d
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+  utf32_to_8_test
+  SUITE
+    libc-support-tests
+  SRCS
+    utf32_to_8_test.cpp
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
new file mode 100644
index 0000000000000..f4c5cb863ff38
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -0,0 +1,180 @@
+//===-- Unittests for the CharacterConverter class (utf32 -> 8) -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/common.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // utf8 1-byte encodings are identical to their utf32 representations
+  char32_t utf32_A = 0x41; // 'A'
+  cr.push(utf32_A);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<char>(popped.value()), 'A');
+  ASSERT_TRUE(cr.isComplete());
+
+  char32_t utf32_B = 0x42; // 'B'
+  cr.push(utf32_B);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<char>(popped.value()), 'B');
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0xff -> utf8: 0xc3 0xbf
+  char32_t utf32 = 0xff;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x58e -> utf8: 0xd6 0x8e
+  utf32 = 0x58e;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
+  char32_t utf32 = 0xac15;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
+  utf32 = 0x267b;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
+  char32_t utf32 = 0x1f921;
+  cr.push(utf32);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
+  ASSERT_TRUE(cr.isComplete());
+
+  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+  utf32 = 0x12121;
+  cr.push(utf32);
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
+  ASSERT_TRUE(!cr.isComplete());
+  popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+  ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
+  ASSERT_TRUE(cr.isComplete());
+
+  // should error if we try to pop another utf8 byte out
+  popped = cr.pop_utf8();
+  ASSERT_FALSE(popped.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF32To8Test, CantPushMidConversion) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  LIBC_NAMESPACE::internal::CharacterConverter cr(&state);
+  cr.clear();
+
+  // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
+  char32_t utf32 = 0x12121;
+  ASSERT_EQ(cr.push(utf32), 0);
+  auto popped = cr.pop_utf8();
+  ASSERT_TRUE(popped.has_value());
+
+  // can't push a utf32 without finishing popping the utf8 bytes out
+  int err = cr.push(utf32);
+  ASSERT_EQ(err, -1);
+}

From 1e60dd4f236dcca0215decc0e4885fb2dcdc1528 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu@google.com>
Date: Mon, 16 Jun 2025 13:09:57 -0700
Subject: [PATCH 640/851] [lldb] Fix a warning

This patch fixes:

  lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp:89:2:
  error: extra ';' outside of a function is incompatible with C++98
  [-Werror,-Wc++98-compat-extra-semi]
---
 lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
index 1a76371be2d58..9295b6ceae36d 100644
--- a/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/StepInTargetsRequestHandler.cpp
@@ -86,6 +86,6 @@ StepInTargetsRequestHandler::Run(const StepInTargetsArguments &args) const {
     }
   }
   return body;
-};
+}
 
 } // namespace lldb_dap

From d3bc834ece48cb993fcabcf20311bdcc9e591a21 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 16 Jun 2025 21:10:11 +0100
Subject: [PATCH 641/851] [LV] Update check to find epilogue resume value to
 check all incoming.

This fixes a crash where all incoming values for the epilogue resume
value are zero, because there are no remaining iterations to execute for
the epilogue loop.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   5 +-
 ...sve-epilog-vect-no-remaining-iterations.ll | 146 ++++++++++++++++++
 2 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bd0a2ec3986d3..f1470fd1f7314 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9765,7 +9765,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
                 match(
                     P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
                     m_SpecificInt(0)) &&
-                is_contained(P.incoming_values(), EPI.VectorTripCount))
+                all_of(P.incoming_values(), [&EPI](Value *Inc) {
+                  return Inc == EPI.VectorTripCount ||
+                         match(Inc, m_SpecificInt(0));
+                }))
               return &P;
             return nullptr;
           });
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
new file mode 100644
index 0000000000000..f8551d774de49
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 {
+; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations(
+; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ITER_CHECK:.*]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP3]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:    br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <64 x i8> [[WIDE_VEC2]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+; CHECK-NEXT:    [[TMP6:%.*]] = zext <16 x i8> [[STRIDED_VEC3]] to <16 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP0]], <16 x i32> [[TMP6]])
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
+; CHECK-NEXT:    store <16 x i8> zeroinitializer, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = zext <16 x i32> [[TMP10]] to <16 x i64>
+; CHECK-NEXT:    [[TMP17]] = or <16 x i64> [[VEC_PHI1]], [[TMP15]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP17]])
+; CHECK-NEXT:    br label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 2
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP14]]
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK:       [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP31]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 16, [[TMP16]]
+; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP32]], i64 [[TMP16]], i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 16, [[TMP36]]
+; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
+; CHECK-NEXT:    [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
+; CHECK-NEXT:    [[TMP24:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP25:%.*]] = mul <vscale x 2 x i64> [[TMP24]], splat (i64 1)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP25]]
+; CHECK-NEXT:    [[TMP37:%.*]] = mul i64 1, [[TMP20]]
+; CHECK-NEXT:    [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP37]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT5:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <vscale x 2 x i64> [ [[TMP21]], %[[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP38:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP38]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
+; CHECK-NEXT:    [[TMP28:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
+; CHECK-NEXT:    [[TMP29:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP22]], <vscale x 2 x i32> [[TMP28]])
+; CHECK-NEXT:    [[TMP39:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX6]]
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
+; CHECK-NEXT:    store <vscale x 2 x i8> zeroinitializer, ptr [[TMP27]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = zext <vscale x 2 x i32> [[TMP39]] to <vscale x 2 x i64>
+; CHECK-NEXT:    [[TMP34]] = or <vscale x 2 x i64> [[VEC_PHI6]], [[TMP33]]
+; CHECK-NEXT:    [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], [[TMP20]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT5]]
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP35]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP30:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP34]])
+; CHECK-NEXT:    br label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL8:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP30]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL8]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3
+; CHECK-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1
+; CHECK-NEXT:    [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; CHECK-NEXT:    [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
+; CHECK-NEXT:    [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]])
+; CHECK-NEXT:    [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
+; CHECK-NEXT:    [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]])
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i8 0, ptr [[GEP_DST]], align 1
+; CHECK-NEXT:    [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64
+; CHECK-NEXT:    [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i64 [[RED_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
+  %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3
+  %l = load i8, ptr %gep.src.i.i, align 1
+  %l.ext = zext i8 %l to i32
+  %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false)
+  %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext)
+  %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false)
+  %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0)
+  %gep.dst = getelementptr inbounds i8, ptr  %dst, i64 %iv
+  store i8 0, ptr %gep.dst, align 1
+  %min.ext = zext i32 %min.1 to i64
+  %red.next = or i64 %red, %min.ext
+  %iv.next = add i64 %iv, 1
+  %exitcond.not.i.i = icmp eq i64 %iv.next, 16
+  br i1 %exitcond.not.i.i, label %exit, label %loop
+
+exit:
+  ret i64 %red.next
+}
+
+declare i32 @llvm.umin.i32(i32, i32)
+
+declare i32 @llvm.abs.i32(i32, i1 immarg)
+
+attributes #0 = { "target-cpu"="neoverse-512tvb" }

From 34be09ad731d631d7b950a334cfe25673ebe5519 Mon Sep 17 00:00:00 2001
From: Ebuka Ezike <yerimyah1@gmail.com>
Date: Mon, 16 Jun 2025 21:18:21 +0100
Subject: [PATCH 642/851] [lldb-dap][test] fix not supported error. (#144419)

Fixes #144072

buildbot error.
---
 .../tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
index 51ccf2ccbdcad..03b79a805d341 100644
--- a/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
+++ b/lldb/test/API/tools/lldb-dap/stepInTargets/TestDAP_stepInTargets.py
@@ -112,7 +112,13 @@ def test_supported_capability_other_archs(self):
             len(breakpoint_ids), len(bp_lines), "expect correct number of breakpoints"
         )
         self.continue_to_breakpoints(breakpoint_ids)
-        is_supported = self.dap_server.get_capability("supportsStepInTargetsRequest")
+
+        try:
+            is_supported = self.dap_server.get_capability(
+                "supportsStepInTargetsRequest"
+            )
+        except dap_server.NotSupportedError:
+            is_supported = False
 
         self.assertEqual(
             is_supported,

From a027eb4472ee8fa504c98bef655cac8c8bfe333a Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Mon, 16 Jun 2025 16:44:55 -0400
Subject: [PATCH 643/851] [HLSL] Use hidden visibility for external linkage.
 (#140292)

Implements

https://github.com/llvm/wg-hlsl/blob/main/proposals/0026-symbol-visibility.md.

The change is to stop using the `hlsl.export` attribute. Instead,
symbols with "program linkage" in HLSL will have export linkage with
default visibility, and symbols with "external linkage" in HLSL will
have export linkage with hidden visibility.
---
 clang/lib/CodeGen/CGHLSLRuntime.cpp           |  8 --
 clang/lib/CodeGen/CodeGenFunction.cpp         |  1 -
 clang/lib/CodeGen/CodeGenModule.cpp           |  5 +
 clang/test/CodeGenHLSL/ArrayAssignable.hlsl   | 30 +++---
 clang/test/CodeGenHLSL/ArrayTemporary.hlsl    | 12 +--
 .../BasicFeatures/ArrayOutputArguments.hlsl   | 14 +--
 .../CodeGenHLSL/BasicFeatures/InitLists.hlsl  | 36 ++++----
 .../BasicFeatures/OutputArguments.hlsl        | 16 ++--
 clang/test/CodeGenHLSL/Bool.hlsl              |  2 +-
 clang/test/CodeGenHLSL/BoolVector.hlsl        | 14 +--
 .../CodeGenHLSL/GlobalConstructorLib.hlsl     |  2 +-
 clang/test/CodeGenHLSL/basic_types.hlsl       | 64 ++++++-------
 .../test/CodeGenHLSL/builtins/AddUint64.hlsl  |  4 +-
 .../ByteAddressBuffers-constructors.hlsl      | 14 +--
 .../GroupMemoryBarrierWithGroupSync.hlsl      |  8 +-
 .../builtins/RWBuffer-constructor.hlsl        | 12 +--
 .../CodeGenHLSL/builtins/ScalarSwizzles.hlsl  |  2 +-
 .../StructuredBuffers-constructors.hlsl       | 12 +--
 clang/test/CodeGenHLSL/builtins/abs.hlsl      | 56 +++++------
 clang/test/CodeGenHLSL/builtins/all.hlsl      |  8 +-
 clang/test/CodeGenHLSL/builtins/and.hlsl      | 12 +--
 clang/test/CodeGenHLSL/builtins/any.hlsl      |  8 +-
 .../CodeGenHLSL/builtins/ceil-overloads.hlsl  | 40 ++++----
 clang/test/CodeGenHLSL/builtins/ceil.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/clamp-overloads.hlsl |  8 +-
 clang/test/CodeGenHLSL/builtins/clamp.hlsl    |  8 +-
 .../CodeGenHLSL/builtins/clip-builtin.hlsl    |  2 +-
 clang/test/CodeGenHLSL/builtins/clip.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/cos-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/cos.hlsl      | 24 ++---
 clang/test/CodeGenHLSL/builtins/cross.hlsl    |  8 +-
 .../builtins/degrees-overloads.hlsl           |  4 +-
 clang/test/CodeGenHLSL/builtins/degrees.hlsl  |  8 +-
 clang/test/CodeGenHLSL/builtins/distance.hlsl | 32 +++----
 .../CodeGenHLSL/builtins/exp-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/exp.hlsl      | 24 ++---
 .../CodeGenHLSL/builtins/exp2-overloads.hlsl  | 40 ++++----
 clang/test/CodeGenHLSL/builtins/exp2.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/floor-overloads.hlsl | 40 ++++----
 clang/test/CodeGenHLSL/builtins/floor.hlsl    | 24 ++---
 clang/test/CodeGenHLSL/builtins/fmod.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/frac-overloads.hlsl  |  4 +-
 clang/test/CodeGenHLSL/builtins/frac.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/hlsl_resource_t.hlsl | 14 +--
 .../CodeGenHLSL/builtins/isinf-overloads.hlsl |  8 +-
 clang/test/CodeGenHLSL/builtins/isinf.hlsl    | 16 ++--
 clang/test/CodeGenHLSL/builtins/ldexp.hlsl    | 16 ++--
 clang/test/CodeGenHLSL/builtins/length.hlsl   | 45 ++++-----
 .../CodeGenHLSL/builtins/lerp-overloads.hlsl  |  8 +-
 .../CodeGenHLSL/builtins/log-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/log.hlsl      | 24 ++---
 .../CodeGenHLSL/builtins/log10-overloads.hlsl | 40 ++++----
 clang/test/CodeGenHLSL/builtins/log10.hlsl    | 24 ++---
 .../CodeGenHLSL/builtins/log2-overloads.hlsl  | 40 ++++----
 clang/test/CodeGenHLSL/builtins/log2.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/max-overloads.hlsl   | 22 ++---
 clang/test/CodeGenHLSL/builtins/max.hlsl      | 80 ++++++++--------
 .../CodeGenHLSL/builtins/min-overloads.hlsl   | 22 ++---
 clang/test/CodeGenHLSL/builtins/min.hlsl      | 82 ++++++++---------
 .../builtins/normalize-overloads.hlsl         |  4 +-
 .../test/CodeGenHLSL/builtins/normalize.hlsl  |  8 +-
 clang/test/CodeGenHLSL/builtins/or.hlsl       | 14 +--
 .../CodeGenHLSL/builtins/pow-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/pow.hlsl      | 24 ++---
 .../builtins/radians-overloads.hlsl           |  4 +-
 clang/test/CodeGenHLSL/builtins/radians.hlsl  |  8 +-
 clang/test/CodeGenHLSL/builtins/rcp.hlsl      | 64 ++++++-------
 clang/test/CodeGenHLSL/builtins/reflect.hlsl  | 32 +++----
 .../CodeGenHLSL/builtins/reversebits.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/round-overloads.hlsl | 40 ++++----
 clang/test/CodeGenHLSL/builtins/round.hlsl    | 24 ++---
 .../CodeGenHLSL/builtins/rsqrt-overloads.hlsl |  4 +-
 clang/test/CodeGenHLSL/builtins/rsqrt.hlsl    |  8 +-
 clang/test/CodeGenHLSL/builtins/sign.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/sin-overloads.hlsl   | 40 ++++----
 clang/test/CodeGenHLSL/builtins/sin.hlsl      | 24 ++---
 .../test/CodeGenHLSL/builtins/smoothstep.hlsl | 32 +++----
 .../CodeGenHLSL/builtins/splitdouble.hlsl     | 10 +-
 .../CodeGenHLSL/builtins/sqrt-overloads.hlsl  | 40 ++++----
 clang/test/CodeGenHLSL/builtins/sqrt.hlsl     | 24 ++---
 .../CodeGenHLSL/builtins/step-overloads.hlsl  |  4 +-
 clang/test/CodeGenHLSL/builtins/step.hlsl     |  8 +-
 .../CodeGenHLSL/builtins/trunc-overloads.hlsl | 40 ++++----
 clang/test/CodeGenHLSL/builtins/trunc.hlsl    | 24 ++---
 .../wave_get_lane_index_do_while.hlsl         |  2 +-
 .../builtins/wave_get_lane_index_simple.hlsl  |  4 +-
 .../builtins/wave_get_lane_index_subcall.hlsl |  4 +-
 clang/test/CodeGenHLSL/cbuffer.hlsl           | 92 +++++++++----------
 .../CodeGenHLSL/cbuffer_and_namespaces.hlsl   |  8 +-
 .../CodeGenHLSL/cbuffer_with_packoffset.hlsl  | 10 +-
 ...uffer_with_static_global_and_function.hlsl |  2 +-
 .../CodeGenHLSL/convergence/do.while.hlsl     | 10 +-
 clang/test/CodeGenHLSL/convergence/for.hlsl   | 14 +--
 clang/test/CodeGenHLSL/convergence/while.hlsl | 12 +--
 clang/test/CodeGenHLSL/default_cbuffer.hlsl   | 12 +--
 .../default_cbuffer_with_layout.hlsl          | 12 +--
 clang/test/CodeGenHLSL/export.hlsl            | 10 +-
 clang/test/CodeGenHLSL/group_shared.hlsl      |  2 +-
 .../implicit-norecurse-attrib.hlsl            | 11 +--
 clang/test/CodeGenHLSL/inline-functions.hlsl  | 17 ++--
 .../CodeGenHLSL/inline-spirv/SpirvType.hlsl   |  4 +-
 clang/test/CodeGenHLSL/no_int_promotion.hlsl  | 14 +--
 .../test/CodeGenHLSL/out-of-line-static.hlsl  |  4 +-
 clang/test/CodeGenHLSL/shift-mask.hlsl        | 16 ++--
 .../CodeGenHLSL/this-assignment-overload.hlsl |  4 +-
 clang/test/CodeGenHLSL/vk-input-builtin.hlsl  |  2 +-
 .../enable_16bit_types_validation_spirv.hlsl  |  2 +-
 .../Target/DirectX/DXILFinalizeLinkage.cpp    |  4 +-
 llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp   |  3 +-
 .../Target/SPIRV/SPIRVInstructionSelector.cpp |  3 +-
 .../finalize-linkage-remove-dead-lib.ll       | 77 ++++++----------
 .../DirectX/finalize-linkage-remove-dead.ll   | 46 +++++-----
 llvm/test/CodeGen/DirectX/finalize_linkage.ll | 25 +++--
 113 files changed, 1101 insertions(+), 1140 deletions(-)

diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 571ff53b7d644..585411bc59e16 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -471,14 +471,6 @@ void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD,
   }
 }
 
-void CGHLSLRuntime::setHLSLFunctionAttributes(const FunctionDecl *FD,
-                                              llvm::Function *Fn) {
-  if (FD->isInExportDeclContext()) {
-    const StringRef ExportAttrKindStr = "hlsl.export";
-    Fn->addFnAttr(ExportAttrKindStr);
-  }
-}
-
 static void gatherFunctions(SmallVectorImpl<Function *> &Fns, llvm::Module &M,
                             bool CtorOrDtor) {
   const auto *GV =
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 13d0633e9b1c0..70a09795d02fe 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1267,7 +1267,6 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
     if (FD->hasAttr<HLSLShaderAttr>()) {
       CGM.getHLSLRuntime().emitEntryFunction(FD, Fn);
     }
-    CGM.getHLSLRuntime().setHLSLFunctionAttributes(FD, Fn);
   }
 
   EmitFunctionProlog(*CurFnInfo, CurFn, Args);
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index c036902b0b130..06c0e1f8afe1b 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1666,6 +1666,11 @@ void CodeGenModule::setGlobalVisibility(llvm::GlobalValue *GV,
     return;
   }
 
+  if (Context.getLangOpts().HLSL && !D->isInExportDeclContext()) {
+    GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
+    return;
+  }
+
   if (GV->hasDLLExportStorageClass() || GV->hasDLLImportStorageClass()) {
     // Reject incompatible dlllstorage and visibility annotations.
     if (!LV.isVisibilityExplicit())
diff --git a/clang/test/CodeGenHLSL/ArrayAssignable.hlsl b/clang/test/CodeGenHLSL/ArrayAssignable.hlsl
index c3204570d6ef3..aaa486eff10b7 100644
--- a/clang/test/CodeGenHLSL/ArrayAssignable.hlsl
+++ b/clang/test/CodeGenHLSL/ArrayAssignable.hlsl
@@ -7,10 +7,10 @@ struct S {
 
 // CHECK: [[CBLayout:%.*]] = type <{ [2 x float], [2 x <4 x i32>], [2 x [2 x i32]], [1 x target("dx.Layout", %S, 8, 0, 4)] }>
 // CHECK: @CBArrays.cb = global target("dx.CBuffer", target("dx.Layout", [[CBLayout]], 136, 0, 32, 64, 128))
-// CHECK: @c1 = external addrspace(2) global [2 x float], align 4
-// CHECK: @c2 = external addrspace(2) global [2 x <4 x i32>], align 16
-// CHECK: @c3 = external addrspace(2) global [2 x [2 x i32]], align 4
-// CHECK: @c4 = external addrspace(2) global [1 x target("dx.Layout", %S, 8, 0, 4)], align 1
+// CHECK: @c1 = external hidden addrspace(2) global [2 x float], align 4
+// CHECK: @c2 = external hidden addrspace(2) global [2 x <4 x i32>], align 16
+// CHECK: @c3 = external hidden addrspace(2) global [2 x [2 x i32]], align 4
+// CHECK: @c4 = external hidden addrspace(2) global [1 x target("dx.Layout", %S, 8, 0, 4)], align 1
 
 cbuffer CBArrays : register(b0) {
   float c1[2];
@@ -19,7 +19,7 @@ cbuffer CBArrays : register(b0) {
   S c4[1];
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign1
+// CHECK-LABEL: define hidden void {{.*}}arr_assign1
 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4
 // CHECK-NOT: alloca
@@ -33,7 +33,7 @@ void arr_assign1() {
   Arr = Arr2;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign2
+// CHECK-LABEL: define hidden void {{.*}}arr_assign2
 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr3:%.*]] = alloca [2 x i32], align 4
@@ -51,7 +51,7 @@ void arr_assign2() {
   Arr = Arr2 = Arr3;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign3
+// CHECK-LABEL: define hidden void {{.*}}arr_assign3
 // CHECK: [[Arr3:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NEXT: [[Arr4:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NOT: alloca
@@ -65,7 +65,7 @@ void arr_assign3() {
   Arr2 = Arr3;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign4
+// CHECK-LABEL: define hidden void {{.*}}arr_assign4
 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4
 // CHECK-NOT: alloca
@@ -81,7 +81,7 @@ void arr_assign4() {
   (Arr = Arr2)[0] = 6;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign5
+// CHECK-LABEL: define hidden void {{.*}}arr_assign5
 // CHECK: [[Arr:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Arr3:%.*]] = alloca [2 x i32], align 4
@@ -101,7 +101,7 @@ void arr_assign5() {
   (Arr = Arr2 = Arr3)[0] = 6;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign6
+// CHECK-LABEL: define hidden void {{.*}}arr_assign6
 // CHECK: [[Arr3:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NEXT: [[Arr4:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NOT: alloca
@@ -118,7 +118,7 @@ void arr_assign6() {
   (Arr = Arr2)[0][0] = 6;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign7
+// CHECK-LABEL: define hidden void {{.*}}arr_assign7
 // CHECK: [[Arr:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NEXT: [[Arr2:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NOT: alloca
@@ -138,7 +138,7 @@ void arr_assign7() {
 
 // Verify you can assign from a cbuffer array
 
-// CHECK-LABEL: define void {{.*}}arr_assign8
+// CHECK-LABEL: define hidden void {{.*}}arr_assign8
 // CHECK: [[C:%.*]] = alloca [2 x float], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[C]], ptr align 4 {{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 4 [[C]], ptr addrspace(2) align 4 @c1, i32 8, i1 false)
@@ -148,7 +148,7 @@ void arr_assign8() {
   C = c1;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign9
+// CHECK-LABEL: define hidden void {{.*}}arr_assign9
 // CHECK: [[C:%.*]] = alloca [2 x <4 x i32>], align 16
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[C]], ptr align 16 {{.*}}, i32 32, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 16 [[C]], ptr addrspace(2) align 16 @c2, i32 32, i1 false)
@@ -158,7 +158,7 @@ void arr_assign9() {
   C = c2;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign10
+// CHECK-LABEL: define hidden void {{.*}}arr_assign10
 // CHECK: [[C:%.*]] = alloca [2 x [2 x i32]], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[C]], ptr align 4 {{.*}}, i32 16, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p2.i32(ptr align 4 [[C]], ptr addrspace(2) align 4 @c3, i32 16, i1 false)
@@ -168,7 +168,7 @@ void arr_assign10() {
   C = c3;
 }
 
-// CHECK-LABEL: define void {{.*}}arr_assign11
+// CHECK-LABEL: define hidden void {{.*}}arr_assign11
 // CHECK: [[C:%.*]] = alloca [1 x %struct.S], align 1
 // CHECK: call void @llvm.memcpy.p0.p2.i32(ptr align 1 [[C]], ptr addrspace(2) align 1 @c4, i32 8, i1 false)
 // CHECK-NEXT: ret void
diff --git a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
index 29ea896045bb1..42a469ae87957 100644
--- a/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
+++ b/clang/test/CodeGenHLSL/ArrayTemporary.hlsl
@@ -3,7 +3,7 @@
 
 void fn(float x[2]) { }
 
-// CHECK-LABEL: define void {{.*}}call{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}call{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [2 x float]
 // CHECK: [[Tmp:%.*]] = alloca [2 x float]
 // CHECK: call void @llvm.memset.p0.i32(ptr align 4 [[Arr]], i8 0, i32 8, i1 false)
@@ -21,7 +21,7 @@ struct Obj {
 
 void fn2(Obj O[4]) { }
 
-// CHECK-LABEL: define void {{.*}}call2{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}call2{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [4 x %struct.Obj]
 // CHECK: [[Tmp:%.*]] = alloca [4 x %struct.Obj]
 // CHECK: call void @llvm.memset.p0.i32(ptr align 1 [[Arr]], i8 0, i32 32, i1 false)
@@ -35,7 +35,7 @@ void call2() {
 
 void fn3(float x[2][2]) { }
 
-// CHECK-LABEL: define void {{.*}}call3{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}call3{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [2 x [2 x float]]
 // CHECK: [[Tmp:%.*]] = alloca [2 x [2 x float]]
 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Arr]], ptr align 4 {{.*}}, i32 16, i1 false)
@@ -46,7 +46,7 @@ void call3() {
   fn3(Arr);
 }
 
-// CHECK-LABEL: define void {{.*}}call4{{.*}}(ptr
+// CHECK-LABEL: define hidden void {{.*}}call4{{.*}}(ptr
 // CHECK-SAME: noundef byval([2 x [2 x float]]) align 4 [[Arr:%.*]])
 // CHECK: [[Tmp:%.*]] = alloca [2 x [2 x float]]
 // CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[Arr]], i32 16, i1 false)
@@ -59,7 +59,7 @@ void call4(float Arr[2][2]) {
 // Verify that each template instantiation codegens to a unique and correctly
 // mangled function name.
 
-// CHECK-LABEL: define void {{.*}}template_call{{.*}}(ptr
+// CHECK-LABEL: define hidden void {{.*}}template_call{{.*}}(ptr
 
 // CHECK-SAME: noundef byval([2 x float]) align 4 [[FA2:%[0-9A-Z]+]],
 // CHECK-SAME: ptr noundef byval([4 x float]) align 4 [[FA4:%[0-9A-Z]+]],
@@ -86,7 +86,7 @@ void template_call(float FA2[2], float FA4[4], int IA3[3]) {
 
 
 // Verify that Array parameter element access correctly codegens.
-// CHECK-LABEL: define void {{.*}}element_access{{.*}}(ptr
+// CHECK-LABEL: define hidden void {{.*}}element_access{{.*}}(ptr
 // CHECK-SAME: noundef byval([2 x float]) align 4 [[FA2:%[0-9A-Z]+]]
 
 // CHECK: [[Addr:%.*]] = getelementptr inbounds [2 x float], ptr [[FA2]], i32 0, i32 0
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl
index eb7d755bca61d..bccfaf597f0ed 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/ArrayOutputArguments.hlsl
@@ -11,7 +11,7 @@ void increment(inout int Arr[2]) {
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @{{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
@@ -32,7 +32,7 @@ void fn2(out int Arr[2]) {
 // CHECK: [[A:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}fn2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @{{.*}}fn2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
@@ -56,7 +56,7 @@ void nestedCall(inout int Arr[2], uint index) {
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}nestedCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]], i32 noundef 0) #3
+// CHECK-NEXT: call void @{{.*}}nestedCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]], i32 noundef 0)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 1
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
@@ -70,7 +70,7 @@ export int arrayCall3() {
 // CHECK-LABEL: outerCall
 // CHECK: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 %{{.*}}, i32 8, i1 false)
-// CHECK-NEXT: call void {{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void {{.*}}increment{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 {{.*}}, ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: ret void
 void outerCall(inout int Arr[2]) {
@@ -82,7 +82,7 @@ void outerCall(inout int Arr[2]) {
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}outerCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @{{.*}}outerCall{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
@@ -99,7 +99,7 @@ void fn3(int Arr[2]) {}
 // CHECK-LABEL: outerCall2
 // CHECK: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 {{.*}}, i32 8, i1 false)
-// CHECK-NEXT: call void {{.*}}fn3{{.*}}(ptr noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void {{.*}}fn3{{.*}}(ptr noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: ret void
 void outerCall2(inout int Arr[2]) {
   fn3(Arr);
@@ -110,7 +110,7 @@ void outerCall2(inout int Arr[2]) {
 // CHECK-NEXT: [[Tmp:%.*]] = alloca [2 x i32], align 4
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 @{{.*}}, i32 8, i1 false)
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[Tmp]], ptr align 4 [[A]], i32 8, i1 false)
-// CHECK-NEXT: call void @{{.*}}outerCall2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]]) #3
+// CHECK-NEXT: call void @{{.*}}outerCall2{{.*}}(ptr noalias noundef byval([2 x i32]) align 4 [[Tmp]])
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr align 4 [[Tmp]], i32 8, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x i32], ptr [[A]], i32 0, i32 0
 // CHECK-NEXT: [[B:%.*]] = load i32, ptr [[Idx]], align 4
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
index 371f31c9e4afc..c30c640519cda 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/InitLists.hlsl
@@ -46,7 +46,7 @@ struct SlicyBits {
 };
 
 // Case 1: Extraneous braces get ignored in literal instantiation.
-// CHECK-LABEL: define void @_Z5case1v(
+// CHECK-LABEL: define hidden void @_Z5case1v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_RESULT]], ptr align 1 @__const._Z5case1v.TF1, i32 8, i1 false)
@@ -58,7 +58,7 @@ TwoFloats case1() {
 }
 
 // Case 2: Valid C/C++ initializer is handled appropriately.
-// CHECK-LABEL: define void @_Z5case2v(
+// CHECK-LABEL: define hidden void @_Z5case2v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[AGG_RESULT]], ptr align 1 @__const._Z5case2v.TF2, i32 8, i1 false)
@@ -70,7 +70,7 @@ TwoFloats case2() {
 }
 
 // Case 3: Simple initialization with conversion of an argument.
-// CHECK-LABEL: define void @_Z5case3i(
+// CHECK-LABEL: define hidden void @_Z5case3i(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], i32 noundef [[VAL:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca i32, align 4
@@ -90,7 +90,7 @@ TwoFloats case3(int Val) {
 
 // Case 4: Initialization from a scalarized vector into a structure with element
 // conversions.
-// CHECK-LABEL: define void @_Z5case4Dv2_i(
+// CHECK-LABEL: define hidden void @_Z5case4Dv2_i(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], <2 x i32> noundef [[TWOVALS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TWOVALS_ADDR:%.*]] = alloca <2 x i32>, align 8
@@ -113,7 +113,7 @@ TwoFloats case4(int2 TwoVals) {
 }
 
 // Case 5: Initialization from a scalarized vector of matching type.
-// CHECK-LABEL: define void @_Z5case5Dv2_i(
+// CHECK-LABEL: define hidden void @_Z5case5Dv2_i(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], <2 x i32> noundef [[TWOVALS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TWOVALS_ADDR:%.*]] = alloca <2 x i32>, align 8
@@ -135,7 +135,7 @@ TwoInts case5(int2 TwoVals) {
 
 // Case 6: Initialization from a scalarized structure of different type with
 // different element types.
-// CHECK-LABEL: define void @_Z5case69TwoFloats(
+// CHECK-LABEL: define hidden void @_Z5case69TwoFloats(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF4:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -157,7 +157,7 @@ TwoInts case6(TwoFloats TF4) {
 
 // Case 7: Initialization of a complex structure, with bogus braces and element
 // conversions from a collection of scalar values, and structures.
-// CHECK-LABEL: define void @_Z5case77TwoIntsS_i9TwoFloatsS0_S0_S0_(
+// CHECK-LABEL: define hidden void @_Z5case77TwoIntsS_i9TwoFloatsS0_S0_S0_(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_DOGGO:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOINTS:%.*]]) align 1 [[TI1:%.*]], ptr noundef byval([[STRUCT_TWOINTS]]) align 1 [[TI2:%.*]], i32 noundef [[VAL:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF1:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF2:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF3:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF4:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[VAL_ADDR:%.*]] = alloca i32, align 4
@@ -221,7 +221,7 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2,
 
 // Case 8: Initialization of a structure from a different structure with
 // significantly different element types and grouping.
-// CHECK-LABEL: define void @_Z5case85Doggo(
+// CHECK-LABEL: define hidden void @_Z5case85Doggo(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_ANIMALBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_DOGGO:%.*]]) align 1 [[D1:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[LEGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ANIMALBITS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -307,7 +307,7 @@ AnimalBits case8(Doggo D1) {
 // Case 9: Everything everywhere all at once... Initializing mismatched
 // structures from different layouts, different component groupings, with no
 // top-level bracing separation.
-// CHECK-LABEL: define void @_Z5case95Doggo10AnimalBits(
+// CHECK-LABEL: define hidden void @_Z5case95Doggo10AnimalBits(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_ZOO:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_DOGGO:%.*]]) align 1 [[D1:%.*]], ptr noundef byval([[STRUCT_ANIMALBITS:%.*]]) align 1 [[A1:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[DOGS:%.*]] = getelementptr inbounds nuw [[STRUCT_ZOO]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -723,7 +723,7 @@ Zoo case9(Doggo D1, AnimalBits A1) {
 }
 
 // Case 10: Initialize an object with a base class from two objects.
-// CHECK-LABEL: define void @_Z6case109TwoFloatsS_(
+// CHECK-LABEL: define hidden void @_Z6case109TwoFloatsS_(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS:%.*]]) align 1 [[TF1:%.*]], ptr noundef byval([[STRUCT_TWOFLOATS]]) align 1 [[TF2:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -750,7 +750,7 @@ FourFloats case10(TwoFloats TF1, TwoFloats TF2) {
 }
 
 // Case 11: Initialize an object with a base class from a vector splat.
-// CHECK-LABEL: define void @_Z6case11f(
+// CHECK-LABEL: define hidden void @_Z6case11f(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], float noundef nofpclass(nan inf) [[F:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[F_ADDR:%.*]] = alloca float, align 4
@@ -799,7 +799,7 @@ FourFloats case11(float F) {
 }
 
 // Case 12: Initialize bitfield from two integers.
-// CHECK-LABEL: define void @_Z6case12ii(
+// CHECK-LABEL: define hidden void @_Z6case12ii(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_SLICYBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4
@@ -821,7 +821,7 @@ SlicyBits case12(int I, int J) {
 }
 
 // Case 13: Initialize bitfield from a struct of two ints.
-// CHECK-LABEL: define void @_Z6case137TwoInts(
+// CHECK-LABEL: define hidden void @_Z6case137TwoInts(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_SLICYBITS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_TWOINTS:%.*]]) align 1 [[TI:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[TI]], i32 0, i32 0
@@ -841,7 +841,7 @@ SlicyBits case13(TwoInts TI) {
 }
 
 // Case 14: Initialize struct of ints from struct with bitfields.
-// CHECK-LABEL: define void @_Z6case149SlicyBits(
+// CHECK-LABEL: define hidden void @_Z6case149SlicyBits(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOINTS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_SLICYBITS:%.*]]) align 1 [[SB:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOINTS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -861,7 +861,7 @@ TwoInts case14(SlicyBits SB) {
 }
 
 // Case 15: Initialize struct of floats from struct with bitfields.
-// CHECK-LABEL: define void @_Z6case159SlicyBits(
+// CHECK-LABEL: define hidden void @_Z6case159SlicyBits(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_SLICYBITS:%.*]]) align 1 [[SB:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_TWOFLOATS]], ptr [[AGG_RESULT]], i32 0, i32 0
@@ -884,7 +884,7 @@ TwoFloats case15(SlicyBits SB) {
 
 // Case 16: Side-effecting initialization list arguments. The important thing
 // here is that case16 only has _one_ call to makeTwo.
-// CHECK-LABEL: define void @_Z7makeTwoRf(
+// CHECK-LABEL: define hidden void @_Z7makeTwoRf(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_TWOFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]], ptr noalias noundef nonnull align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 4
@@ -910,7 +910,7 @@ TwoFloats makeTwo(inout float X) {
     return TF;
 }
 
-// CHECK-LABEL: define void @_Z6case16v(
+// CHECK-LABEL: define hidden void @_Z6case16v(
 // CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FOURFLOATS:%.*]]) align 1 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[X:%.*]] = alloca float, align 4
@@ -948,7 +948,7 @@ int case17Helper(int x) {
 }
 
 // InitList with OpaqueValueExpr
-// CHECK-LABEL: define void {{.*}}case17
+// CHECK-LABEL: define hidden void {{.*}}case17
 // CHECK: [[X:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: [[C:%.*]] = call noundef i32 {{.*}}case17Helper{{.*}}(i32 noundef 0)
 // CHECK-NEXT: [[C1:%.*]] = call noundef i32 {{.*}}case17Helper{{.*}}(i32 noundef 1)
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl
index 1f45a7f9b46d3..d0ba8f447b732 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/OutputArguments.hlsl
@@ -6,7 +6,7 @@
 // integer. It is converted to an integer on call and converted back after the
 // function.
 
-// CHECK: define void {{.*}}trunc_Param{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
+// CHECK: define hidden void {{.*}}trunc_Param{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
 void trunc_Param(inout int X) {}
 
 // ALL-LABEL: define noundef nofpclass(nan inf) float {{.*}}case1
@@ -32,7 +32,7 @@ export float case1(float F) {
 // uninitialized in the function. If they are not initialized before the
 // function returns the value is undefined.
 
-// CHECK: define void {{.*}}undef{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
+// CHECK: define hidden void {{.*}}undef{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
 void undef(out int Z) { }
 
 // ALL-LABEL: define noundef i32 {{.*}}case2
@@ -54,7 +54,7 @@ export int case2() {
 // This test should verify that an out parameter value is written to as
 // expected.
 
-// CHECK: define void {{.*}}zero{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
+// CHECK: define hidden void {{.*}}zero{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
 void zero(out int Z) { Z = 0; }
 
 // ALL-LABEL: define noundef i32 {{.*}}case3
@@ -76,7 +76,7 @@ export int case3() {
 // Vector swizzles in HLSL produce lvalues, so they can be used as arguments to
 // inout parameters and the swizzle is reversed on writeback.
 
-// CHECK: define void {{.*}}funky{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}})
+// CHECK: define hidden void {{.*}}funky{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}})
 void funky(inout int3 X) {
   X.x += 1;
   X.y += 2;
@@ -116,7 +116,7 @@ export int3 case4() {
 
 // Case 5: Straightforward inout of a scalar value.
 
-// CHECK: define void {{.*}}increment{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
+// CHECK: define hidden void {{.*}}increment{{.*}}(ptr noalias noundef nonnull align 4 dereferenceable(4) {{%.*}})
 void increment(inout int I) {
   I += 1;
 }
@@ -144,7 +144,7 @@ struct S {
   float Y;
 };
 
-// CHECK: define void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}})
+// CHECK: define hidden void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}})
 void init(out S s) {
   s.X = 3;
   s.Y = 4;
@@ -170,7 +170,7 @@ struct R {
   float Y;
 };
 
-// CHECK: define void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}})
+// CHECK: define hidden void {{.*}}init{{.*}}(ptr noalias noundef nonnull align 1 dereferenceable(8) {{%.*}})
 void init(inout R s) {
   s.X = 3;
   s.Y = 4;
@@ -194,7 +194,7 @@ export int case7() {
 
 // Case 8: Non-scalars with a cast expression.
 
-// CHECK: define void {{.*}}trunc_vec{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}})
+// CHECK: define hidden void {{.*}}trunc_vec{{.*}}(ptr noalias noundef nonnull align 16 dereferenceable(16) {{%.*}})
 void trunc_vec(inout int3 V) {}
 
 // ALL-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}case8
diff --git a/clang/test/CodeGenHLSL/Bool.hlsl b/clang/test/CodeGenHLSL/Bool.hlsl
index fb0f32b11241d..21328c1f9d4df 100644
--- a/clang/test/CodeGenHLSL/Bool.hlsl
+++ b/clang/test/CodeGenHLSL/Bool.hlsl
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
 
-// CHECK-LABEL: define noundef i1 {{.*}}fn{{.*}}(i1 noundef %x)
+// CHECK-LABEL: define hidden noundef i1 {{.*}}fn{{.*}}(i1 noundef %x)
 // CHECK: [[X:%.*]] = alloca i32, align 4
 // CHECK-NEXT: [[Y:%.*]] = zext i1 {{%.*}} to i32
 // CHECK-NEXT: store i32 [[Y]], ptr [[X]], align 4
diff --git a/clang/test/CodeGenHLSL/BoolVector.hlsl b/clang/test/CodeGenHLSL/BoolVector.hlsl
index 35d8b9dac801d..d5054a5a92b5d 100644
--- a/clang/test/CodeGenHLSL/BoolVector.hlsl
+++ b/clang/test/CodeGenHLSL/BoolVector.hlsl
@@ -9,7 +9,7 @@ struct S {
     float f;
 };
 
-// CHECK-LABEL: define noundef i1 {{.*}}fn1{{.*}}
+// CHECK-LABEL: define hidden noundef i1 {{.*}}fn1{{.*}}
 // CHECK: [[B:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[B]], align 8
 // CHECK-NEXT: [[BoolVec:%.*]] = load <2 x i32>, ptr [[B]], align 8
@@ -21,7 +21,7 @@ bool fn1() {
   return B[0];
 }
 
-// CHECK-LABEL: define noundef <2 x i1> {{.*}}fn2{{.*}}
+// CHECK-LABEL: define hidden noundef <2 x i1> {{.*}}fn2{{.*}}
 // CHECK: [[VAddr:%.*]] = alloca i32, align 4
 // CHECK-NEXT: [[A:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: [[StoreV:%.*]] = zext i1 {{.*}} to i32
@@ -40,7 +40,7 @@ bool2 fn2(bool V) {
   return A;
 }
 
-// CHECK-LABEL: define noundef i1 {{.*}}fn3{{.*}}
+// CHECK-LABEL: define hidden noundef i1 {{.*}}fn3{{.*}}
 // CHECK: [[s:%.*]] = alloca %struct.S, align 1
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[s]], ptr align 1 [[ConstS]], i32 12, i1 false)
 // CHECK-NEXT: [[BV:%.*]] = getelementptr inbounds nuw %struct.S, ptr [[s]], i32 0, i32 0
@@ -53,7 +53,7 @@ bool fn3() {
   return s.bv[0];
 }
 
-// CHECK-LABEL: define noundef i1 {{.*}}fn4{{.*}}
+// CHECK-LABEL: define hidden noundef i1 {{.*}}fn4{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [2 x <2 x i32>], align 8
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Arr]], ptr align 8 [[ConstArr]], i32 16, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[Arr]], i32 0, i32 0
@@ -66,7 +66,7 @@ bool fn4() {
   return Arr[0][1];
 }
 
-// CHECK-LABEL: define void {{.*}}fn5{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}fn5{{.*}}
 // CHECK: [[Arr:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[Arr]], align 8
 // CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[Arr]], align 8
@@ -78,7 +78,7 @@ void fn5() {
   Arr[1] = false;
 }
 
-// CHECK-LABEL: define void {{.*}}fn6{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}fn6{{.*}}
 // CHECK: [[V:%.*]] = alloca i32, align 4
 // CHECK-NEXT: [[S:%.*]] = alloca %struct.S, align 1
 // CHECK-NEXT: store i32 0, ptr [[V]], align 4
@@ -97,7 +97,7 @@ void fn6() {
   s.bv[1] = V;
 }
 
-// CHECK-LABEL: define void {{.*}}fn7{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}fn7{{.*}}
 // CHECK: [[Arr:%.*]] = alloca [2 x <2 x i32>], align 8
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Arr]], ptr align 8 {{.*}}, i32 16, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[Arr]], i32 0, i32 0
diff --git a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
index 9090e9e85ed98..afda714106fac 100644
--- a/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
+++ b/clang/test/CodeGenHLSL/GlobalConstructorLib.hlsl
@@ -33,7 +33,7 @@ void SecondEntry() {}
 
 // Verify the constructor is alwaysinline
 // NOINLINE: ; Function Attrs: {{.*}}alwaysinline
-// NOINLINE-NEXT: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2EjijjPKc({{.*}} [[CtorAttr:\#[0-9]+]]
+// NOINLINE-NEXT: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC2EjijjPKc({{.*}} [[CtorAttr:\#[0-9]+]]
 
 // NOINLINE: ; Function Attrs: {{.*}}alwaysinline
 // NOINLINE-NEXT: define internal void @_GLOBAL__sub_I_GlobalConstructorLib.hlsl() [[InitAttr:\#[0-9]+]]
diff --git a/clang/test/CodeGenHLSL/basic_types.hlsl b/clang/test/CodeGenHLSL/basic_types.hlsl
index 362042654ea8c..37fb5195e9768 100644
--- a/clang/test/CodeGenHLSL/basic_types.hlsl
+++ b/clang/test/CodeGenHLSL/basic_types.hlsl
@@ -6,38 +6,38 @@
 // RUN:   -emit-llvm -disable-llvm-passes -o - -DNAMESPACED| FileCheck %s
 
 
-// CHECK: @uint16_t_Val = external addrspace(2) global i16, align 2
-// CHECK: @int16_t_Val = external addrspace(2) global i16, align 2
-// CHECK: @uint_Val = external addrspace(2) global i32, align 4
-// CHECK: @uint64_t_Val = external addrspace(2) global i64, align 8
-// CHECK: @int64_t_Val = external addrspace(2) global i64, align 8
-// CHECK: @int16_t2_Val = external addrspace(2) global <2 x i16>, align 4
-// CHECK: @int16_t3_Val = external addrspace(2) global <3 x i16>, align 8
-// CHECK: @int16_t4_Val = external addrspace(2) global <4 x i16>, align 8
-// CHECK: @uint16_t2_Val = external addrspace(2) global <2 x i16>, align 4
-// CHECK: @uint16_t3_Val = external addrspace(2) global <3 x i16>, align 8
-// CHECK: @uint16_t4_Val = external addrspace(2) global <4 x i16>, align 8
-// CHECK: @int2_Val = external addrspace(2) global <2 x i32>, align 8
-// CHECK: @int3_Val = external addrspace(2) global <3 x i32>, align 16
-// CHECK: @int4_Val = external addrspace(2) global <4 x i32>, align 16
-// CHECK: @uint2_Val = external addrspace(2) global <2 x i32>, align 8
-// CHECK: @uint3_Val = external addrspace(2) global <3 x i32>, align 16
-// CHECK: @uint4_Val = external addrspace(2) global <4 x i32>, align 16
-// CHECK: @int64_t2_Val = external addrspace(2) global <2 x i64>, align 16
-// CHECK: @int64_t3_Val = external addrspace(2) global <3 x i64>, align 32
-// CHECK: @int64_t4_Val = external addrspace(2) global <4 x i64>, align 32
-// CHECK: @uint64_t2_Val = external addrspace(2) global <2 x i64>, align 16
-// CHECK: @uint64_t3_Val = external addrspace(2) global <3 x i64>, align 32
-// CHECK: @uint64_t4_Val = external addrspace(2) global <4 x i64>, align 32
-// CHECK: @half2_Val = external addrspace(2) global <2 x half>, align 4
-// CHECK: @half3_Val = external addrspace(2) global <3 x half>, align 8
-// CHECK: @half4_Val = external addrspace(2) global <4 x half>, align 8
-// CHECK: @float2_Val = external addrspace(2) global <2 x float>, align 8
-// CHECK: @float3_Val = external addrspace(2) global <3 x float>, align 16
-// CHECK: @float4_Val = external addrspace(2) global <4 x float>, align 16
-// CHECK: @double2_Val = external addrspace(2) global <2 x double>, align 16
-// CHECK: @double3_Val = external addrspace(2) global <3 x double>, align 32
-// CHECK: @double4_Val = external addrspace(2) global <4 x double>, align 32
+// CHECK: @uint16_t_Val = external hidden addrspace(2) global i16, align 2
+// CHECK: @int16_t_Val = external hidden addrspace(2) global i16, align 2
+// CHECK: @uint_Val = external hidden addrspace(2) global i32, align 4
+// CHECK: @uint64_t_Val = external hidden addrspace(2) global i64, align 8
+// CHECK: @int64_t_Val = external hidden addrspace(2) global i64, align 8
+// CHECK: @int16_t2_Val = external hidden addrspace(2) global <2 x i16>, align 4
+// CHECK: @int16_t3_Val = external hidden addrspace(2) global <3 x i16>, align 8
+// CHECK: @int16_t4_Val = external hidden addrspace(2) global <4 x i16>, align 8
+// CHECK: @uint16_t2_Val = external hidden addrspace(2) global <2 x i16>, align 4
+// CHECK: @uint16_t3_Val = external hidden addrspace(2) global <3 x i16>, align 8
+// CHECK: @uint16_t4_Val = external hidden addrspace(2) global <4 x i16>, align 8
+// CHECK: @int2_Val = external hidden addrspace(2) global <2 x i32>, align 8
+// CHECK: @int3_Val = external hidden addrspace(2) global <3 x i32>, align 16
+// CHECK: @int4_Val = external hidden addrspace(2) global <4 x i32>, align 16
+// CHECK: @uint2_Val = external hidden addrspace(2) global <2 x i32>, align 8
+// CHECK: @uint3_Val = external hidden addrspace(2) global <3 x i32>, align 16
+// CHECK: @uint4_Val = external hidden addrspace(2) global <4 x i32>, align 16
+// CHECK: @int64_t2_Val = external hidden addrspace(2) global <2 x i64>, align 16
+// CHECK: @int64_t3_Val = external hidden addrspace(2) global <3 x i64>, align 32
+// CHECK: @int64_t4_Val = external hidden addrspace(2) global <4 x i64>, align 32
+// CHECK: @uint64_t2_Val = external hidden addrspace(2) global <2 x i64>, align 16
+// CHECK: @uint64_t3_Val = external hidden addrspace(2) global <3 x i64>, align 32
+// CHECK: @uint64_t4_Val = external hidden addrspace(2) global <4 x i64>, align 32
+// CHECK: @half2_Val = external hidden addrspace(2) global <2 x half>, align 4
+// CHECK: @half3_Val = external hidden addrspace(2) global <3 x half>, align 8
+// CHECK: @half4_Val = external hidden addrspace(2) global <4 x half>, align 8
+// CHECK: @float2_Val = external hidden addrspace(2) global <2 x float>, align 8
+// CHECK: @float3_Val = external hidden addrspace(2) global <3 x float>, align 16
+// CHECK: @float4_Val = external hidden addrspace(2) global <4 x float>, align 16
+// CHECK: @double2_Val = external hidden addrspace(2) global <2 x double>, align 16
+// CHECK: @double3_Val = external hidden addrspace(2) global <3 x double>, align 32
+// CHECK: @double4_Val = external hidden addrspace(2) global <4 x double>, align 32
 
 #ifdef NAMESPACED
 #define TYPE_DECL(T)  hlsl::T T##_Val
diff --git a/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl b/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl
index e1832bdbbf33f..8457ad6da293f 100644
--- a/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/AddUint64.hlsl
@@ -4,7 +4,7 @@
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
 
-// CHECK-LABEL: define noundef <2 x i32> @_Z20test_AddUint64_uint2Dv2_jS_(
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z20test_AddUint64_uint2Dv2_jS_(
 // CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i32>, align 8
@@ -31,7 +31,7 @@ uint2 test_AddUint64_uint2(uint2 a, uint2 b) {
   return AddUint64(a, b);
 }
 
-// CHECK-LABEL: define noundef <4 x i32> @_Z20test_AddUint64_uint4Dv4_jS_(
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z20test_AddUint64_uint4Dv4_jS_(
 // CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i32>, align 16
diff --git a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl
index 403d473ce9680..3a8d2c03e173c 100644
--- a/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ByteAddressBuffers-constructors.hlsl
@@ -35,7 +35,7 @@ export void foo() {
 // CHECK-SAME: i32 noundef 1, i32 noundef 2, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]])
 
 // Buf1 initialization part 2 - body of ByteAddressBuffer C1 constructor with explicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl17ByteAddressBufferC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl17ByteAddressBufferC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME:  %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -47,27 +47,27 @@ export void foo() {
 // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]])
 
 // Buf2 initialization part 2 - body of RWByteAddressBuffer C1 constructor with implicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl19RWByteAddressBufferC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl19RWByteAddressBufferC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this1,
 // CHECK-SAME: i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
 
 // Buf3 initialization part 1 - local variable declared in function foo() is initialized by 
 // RasterizerOrderedByteAddressBuffer C1 default constructor
-// CHECK: define void @_Z3foov() #2 {
+// CHECK: define void @_Z3foov()
 // CHECK-NEXT: entry:
 // CHECK-NEXT: %Buf3 = alloca %"class.hlsl::RasterizerOrderedByteAddressBuffer", align 4
 // CHECK-NEXT: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3)
 
 // Buf3 initialization part 2 - body of RasterizerOrderedByteAddressBuffer default C1 constructor that
 // calls the default C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: call void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}})
 // CHECK-NEXT: ret void
 
 // Buf1 initialization part 3 - ByteAddressBuffer C2 constructor with explicit binding that initializes
 // handle with @llvm.dx.resource.handlefrombinding
-// CHECK: define linkonce_odr void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl17ByteAddressBufferC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_i8_0_0t(
 // CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -76,7 +76,7 @@ export void foo() {
 
 // Buf2 initialization part 3 - body of RWByteAddressBuffer C2 constructor with implicit binding that initializes
 // handle with @llvm.dx.resource.handlefromimplicitbinding
-// CHECK: define linkonce_odr void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl19RWByteAddressBufferC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", i8, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_i8_1_0t
 // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -85,7 +85,7 @@ export void foo() {
 
 // Buf3 initialization part 3 - body of RasterizerOrderedByteAddressBuffer default C2 constructor that
 // initializes handle to poison
-// CHECK: define linkonce_odr void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl34RasterizerOrderedByteAddressBufferC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RasterizerOrderedByteAddressBuffer", ptr %{{.*}}, i32 0, i32 0
 // CHECK: store target("dx.RawBuffer", i8, 1, 1) poison, ptr %__handle, align 4
 
diff --git a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl
index 9d95d54852c0b..114230d38ba54 100644
--- a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl
@@ -1,14 +1,14 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN:   -DTARGET=dx -DFNATTRS=noundef -check-prefixes=CHECK,CHECK-DXIL
+// RUN:   -DTARGET=dx -check-prefixes=CHECK,CHECK-DXIL
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef" -check-prefixes=CHECK,CHECK-SPIRV
+// RUN:   -DTARGET=spv -check-prefixes=CHECK,CHECK-SPIRV
 
-// CHECK-DXIL: define void @
-// CHECK-SPIRV: define spir_func void @
+// CHECK-DXIL: define hidden void @
+// CHECK-SPIRV: define hidden spir_func void @
 void test_GroupMemoryBarrierWithGroupSync() {
 // CHECK-DXIL: call void @llvm.[[TARGET]].group.memory.barrier.with.group.sync()
 // CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].group.memory.barrier.with.group.sync()
diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
index e74a7ed270b01..114468914e2ea 100644
--- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl
@@ -35,7 +35,7 @@ export void foo() {
 // CHECK-SAME: i32 noundef 5, i32 noundef 3, i32 noundef 1, i32 noundef 0, ptr noundef @[[Buf1Str]])
 
 // Buf1 initialization part 2 - body of RWBuffer<float> C1 constructor with explicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -47,7 +47,7 @@ export void foo() {
 // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]])
 
 // Buf2 initialization part 2 - body of RWBuffer<float> C1 constructor with implicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIdEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -59,12 +59,12 @@ export void foo() {
 // CHECK-NEXT: call void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %Buf3)
 
 // Buf3 initialization part 2 - body of RWBuffer<int> default C1 constructor that calls the default C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIiEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: call void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}})
 
 // Buf1 initialization part 3 - body of RWBuffer<float> C2 constructor with explicit binding that initializes
 // handle with @llvm.dx.resource.handlefrombinding
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_f32_1_0_0t(
 // CHECK-DXIL-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -73,7 +73,7 @@ export void foo() {
 
 // Buf2 initialization part 3 - body of RWBuffer<float> C2 constructor with implicit binding that initializes
 // handle with @llvm.dx.resource.handlefromimplicitbinding
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIdEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: %[[HANDLE:.*]] = call target("dx.TypedBuffer", double, 1, 0, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.TypedBuffer_f64_1_0_0t
 // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -81,7 +81,7 @@ export void foo() {
 // CHECK-NEXT: store target("dx.TypedBuffer", double, 1, 0, 0) %[[HANDLE]], ptr %__handle, align 4
 
 // Buf3 initialization part 3 - body of RWBuffer<int> default C2 constructor that initializes handle to poison
-// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl8RWBufferIiEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::RWBuffer.1", ptr %{{.*}}, i32 0, i32 0
 // CHECK-NEXT: store target("dx.TypedBuffer", i32, 1, 0, 1) poison, ptr %__handle, align 4
 
diff --git a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
index 8a3958ad8fd04..7804239edccae 100644
--- a/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ScalarSwizzles.hlsl
@@ -304,7 +304,7 @@ bool2 AccessBools() {
   return X.zw;
 }
 
-// CHECK-LABEL: define void {{.*}}BoolSizeMismatch{{.*}}
+// CHECK-LABEL: define hidden void {{.*}}BoolSizeMismatch{{.*}}
 // CHECK: [[B:%.*]] = alloca <4 x i32>, align 16
 // CHECK-NEXT: [[Tmp:%.*]] = alloca <1 x i32>, align 4
 // CHECK-NEXT: store <4 x i32> splat (i32 1), ptr [[B]], align 16
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl
index fc7b6be5c9000..28841732df99e 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl
@@ -36,7 +36,7 @@ export void foo() {
 
 // Buf1 initialization part 2 - body of StructuredBuffer<float> C1 constructor with explicit binding 
 // that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, 
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl16StructuredBufferIfEC1EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this, 
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -49,7 +49,7 @@ export void foo() {
 // CHECK-SAME: i32 noundef 0, i32 noundef 1, i32 noundef 0, i32 noundef 0, ptr noundef @[[Buf2Str]])
 
 // Buf2 initialization part 2 - body of RWStructuredBuffer<float> C1 constructor with implicit binding that calls the C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl18RWStructuredBufferIfEC1EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: call void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4)
 // CHECK-SAME: %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, i32 noundef %{{.*}}, ptr noundef %{{.*}})
@@ -63,12 +63,12 @@ export void foo() {
 
 // Buf3 initialization part 2 - body of AppendStructuredBuffer<float> default C1 constructor that calls
 // the default C2 constructor
-// CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl22AppendStructuredBufferIfEC1Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: call void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %{{.*}})
 
 // Buf1 initialization part 3 - body of AppendStructuredBuffer<float> C2 constructor with explicit binding 
 // that initializes handle with @llvm.dx.resource.handlefrombinding
-// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl16StructuredBufferIfEC2EjjijPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %registerNo, i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, ptr noundef %name)
 // CHECK-DXIL: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t(
 // CHECK-SAME: i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -77,7 +77,7 @@ export void foo() {
 
 // Buf2 initialization part 3 - body of RWStructuredBuffer<float> C2 constructor with implicit binding that initializes
 // handle with @llvm.dx.resource.handlefromimplicitbinding
-// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl18RWStructuredBufferIfEC2EjijjPKc(ptr noundef nonnull align 4 dereferenceable(4) %this,
 // CHECK-SAME: i32 noundef %spaceNo, i32 noundef %range, i32 noundef %index, i32 noundef %orderId, ptr noundef %name)
 // CHECK: %[[HANDLE:.*]] = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefromimplicitbinding.tdx.RawBuffer_f32_1_0t
 // CHECK-SAME: (i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 false, ptr %{{.*}})
@@ -86,7 +86,7 @@ export void foo() {
 
 // Buf3 initialization part 3 - body of AppendStructuredBuffer<float> default C2 constructor that
 // initializes handle to poison
-// CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
+// CHECK: define linkonce_odr hidden void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this)
 // CHECK: %__handle = getelementptr inbounds nuw %"class.hlsl::AppendStructuredBuffer", ptr %{{.*}}, i32 0, i32 0
 // CHECK: store target("dx.RawBuffer", float, 1, 0) poison, ptr %__handle, align 4
 
diff --git a/clang/test/CodeGenHLSL/builtins/abs.hlsl b/clang/test/CodeGenHLSL/builtins/abs.hlsl
index e8a6ee0449571..6abe2f816c844 100644
--- a/clang/test/CodeGenHLSL/builtins/abs.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/abs.hlsl
@@ -8,16 +8,16 @@
 using hlsl::abs;
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef i16 @_Z16test_abs_int16_t
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z16test_abs_int16_t
 // NATIVE_HALF: call i16 @llvm.abs.i16(
 int16_t test_abs_int16_t(int16_t p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z17test_abs_int16_t2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z17test_abs_int16_t2
 // NATIVE_HALF: call <2 x i16> @llvm.abs.v2i16(
 int16_t2 test_abs_int16_t2(int16_t2 p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z17test_abs_int16_t3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z17test_abs_int16_t3
 // NATIVE_HALF: call <3 x i16> @llvm.abs.v3i16(
 int16_t3 test_abs_int16_t3(int16_t3 p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z17test_abs_int16_t4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z17test_abs_int16_t4
 // NATIVE_HALF: call <4 x i16> @llvm.abs.v4i16(
 int16_t4 test_abs_int16_t4(int16_t4 p0) { return abs(p0); }
 
@@ -50,76 +50,76 @@ uint16_t3 test_abs_uint64_t3(uint16_t3 p0) { return abs(p0); }
 uint16_t4 test_abs_uint64_t4(uint16_t4 p0) { return abs(p0); }
 #endif // __HLSL_ENABLE_16_BIT
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_abs_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_abs_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.fabs.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_abs_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_abs_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.fabs.f32(float %0)
 half test_abs_half(half p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_abs_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_abs_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.fabs.v2f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_abs_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_abs_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.fabs.v2f32(
 half2 test_abs_half2(half2 p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_abs_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_abs_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.fabs.v3f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_abs_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_abs_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.fabs.v3f32(
 half3 test_abs_half3(half3 p0) { return abs(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_abs_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_abs_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.fabs.v4f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_abs_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_abs_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.fabs.v4f32(
 half4 test_abs_half4(half4 p0) { return abs(p0); }
 
-// CHECK-LABEL: define noundef i32 @_Z12test_abs_int
+// CHECK-LABEL: define hidden noundef i32 @_Z12test_abs_int
 // CHECK: call i32 @llvm.abs.i32(
 int test_abs_int(int p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z13test_abs_int2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_abs_int2
 // CHECK: call <2 x i32> @llvm.abs.v2i32(
 int2 test_abs_int2(int2 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z13test_abs_int3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z13test_abs_int3
 // CHECK: call <3 x i32> @llvm.abs.v3i32(
 int3 test_abs_int3(int3 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z13test_abs_int4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_abs_int4
 // CHECK: call <4 x i32> @llvm.abs.v4i32(
 int4 test_abs_int4(int4 p0) { return abs(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_abs_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_abs_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.fabs.f32(
 float test_abs_float(float p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_abs_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_abs_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.fabs.v2f32(
 float2 test_abs_float2(float2 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_abs_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_abs_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.fabs.v3f32(
 float3 test_abs_float3(float3 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_abs_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_abs_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.fabs.v4f32(
 float4 test_abs_float4(float4 p0) { return abs(p0); }
 
-// CHECK-LABEL: define noundef i64 @_Z16test_abs_int64_t
+// CHECK-LABEL: define hidden noundef i64 @_Z16test_abs_int64_t
 // CHECK: call i64 @llvm.abs.i64(
 int64_t test_abs_int64_t(int64_t p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z17test_abs_int64_t2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z17test_abs_int64_t2
 // CHECK: call <2 x i64> @llvm.abs.v2i64(
 int64_t2 test_abs_int64_t2(int64_t2 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z17test_abs_int64_t3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z17test_abs_int64_t3
 // CHECK: call <3 x i64> @llvm.abs.v3i64(
 int64_t3 test_abs_int64_t3(int64_t3 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z17test_abs_int64_t4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z17test_abs_int64_t4
 // CHECK: call <4 x i64> @llvm.abs.v4i64(
 int64_t4 test_abs_int64_t4(int64_t4 p0) { return abs(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_abs_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_abs_double
 // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.fabs.f64(
 double test_abs_double(double p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_abs_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_abs_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.fabs.v2f64(
 double2 test_abs_double2(double2 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_abs_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_abs_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.fabs.v3f64(
 double3 test_abs_double3(double3 p0) { return abs(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_abs_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_abs_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.fabs.v4f64(
 double4 test_abs_double4(double4 p0) { return abs(p0); }
 
diff --git a/clang/test/CodeGenHLSL/builtins/all.hlsl b/clang/test/CodeGenHLSL/builtins/all.hlsl
index 39f364c5953d6..391fad0ef33f5 100644
--- a/clang/test/CodeGenHLSL/builtins/all.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/all.hlsl
@@ -2,20 +2,20 @@
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: define [[FNATTRS]] i1 @
diff --git a/clang/test/CodeGenHLSL/builtins/and.hlsl b/clang/test/CodeGenHLSL/builtins/and.hlsl
index b77889cd9ae70..d2ca7cf4163ed 100644
--- a/clang/test/CodeGenHLSL/builtins/and.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/and.hlsl
@@ -3,7 +3,7 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s
 
-// CHECK-LABEL: define noundef i1 @_Z15test_and_scalarbb(
+// CHECK-LABEL: define hidden noundef i1 @_Z15test_and_scalarbb(
 // CHECK-SAME: i1 noundef [[X:%.*]], i1 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_AND:%.*]] = and i1 [[X]], [[Y]]
@@ -13,7 +13,7 @@ bool test_and_scalar(bool x, bool y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <2 x i1> @_Z14test_and_bool2Dv2_bS_(
+// CHECK-LABEL: define hidden noundef <2 x i1> @_Z14test_and_bool2Dv2_bS_(
 // CHECK-SAME: <2 x i1> noundef [[X:%.*]], <2 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_AND:%.*]] = and <2 x i1> [[X]], [[Y]]
@@ -23,7 +23,7 @@ bool2 test_and_bool2(bool2 x, bool2 y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <3 x i1> @_Z14test_and_bool3Dv3_bS_(
+// CHECK-LABEL: define hidden noundef <3 x i1> @_Z14test_and_bool3Dv3_bS_(
 // CHECK-SAME: <3 x i1> noundef [[X:%.*]], <3 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_AND:%.*]] = and <3 x i1> [[X]], [[Y]]
@@ -33,7 +33,7 @@ bool3 test_and_bool3(bool3 x, bool3 y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <4 x i1> @_Z14test_and_bool4Dv4_bS_(
+// CHECK-LABEL: define hidden noundef <4 x i1> @_Z14test_and_bool4Dv4_bS_(
 // CHECK-SAME: <4 x i1> noundef [[X:%.*]], <4 x i1> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_AND:%.*]] = and <4 x i1> [[X]], [[Y]]
@@ -43,7 +43,7 @@ bool4 test_and_bool4(bool4 x, bool4 y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <4 x i1> @_Z13test_and_int4Dv4_iS_(
+// CHECK-LABEL: define hidden noundef <4 x i1> @_Z13test_and_int4Dv4_iS_(
 // CHECK-SAME: <4 x i32> noundef [[X:%.*]], <4 x i32> noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne <4 x i32> [[X]], zeroinitializer
@@ -55,7 +55,7 @@ bool4 test_and_int4(int4 x, int4 y) {
   return and(x, y);
 }
 
-// CHECK-LABEL: define noundef <4 x i1> @_Z15test_and_float4Dv4_fS_(
+// CHECK-LABEL: define hidden noundef <4 x i1> @_Z15test_and_float4Dv4_fS_(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[X]], zeroinitializer
diff --git a/clang/test/CodeGenHLSL/builtins/any.hlsl b/clang/test/CodeGenHLSL/builtins/any.hlsl
index 3d9d8e9e689ed..e4837876e2693 100644
--- a/clang/test/CodeGenHLSL/builtins/any.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/any.hlsl
@@ -2,20 +2,20 @@
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS=noundef -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef" -DTARGET=dx
 
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: define [[FNATTRS]] i1 @
diff --git a/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl
index b313c99e89a53..bdefe46b802e7 100644
--- a/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ceil-overloads.hlsl
@@ -4,67 +4,67 @@
 
 using hlsl::ceil;
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_double(double p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_double2(double2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_double3(double3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_double4(double4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_int(int p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_int2(int2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_int3(int3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_int4(int4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_uint(uint p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_uint2(uint2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_uint3(uint3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_uint4(uint4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_int64_t(int64_t p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_int64_t2(int64_t2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_int64_t3(int64_t3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_int64_t4(int64_t4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_ceil_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_ceil_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_uint64_t(uint64_t p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_ceil_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_uint64_t2(uint64_t2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_ceil_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_uint64_t3(uint64_t3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_ceil_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_uint64_t4(uint64_t4 p0) { return ceil(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/ceil.hlsl b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
index fe0b8f8983838..1a9c630b60e57 100644
--- a/clang/test/CodeGenHLSL/builtins/ceil.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ceil.hlsl
@@ -7,36 +7,36 @@
 
 using hlsl::ceil;
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_ceil_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_ceil_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.ceil.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_ceil_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_ceil_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(float %0)
 half test_ceil_half(half p0) { return ceil(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_ceil_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_ceil_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.ceil.v2f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_ceil_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_ceil_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 half2 test_ceil_half2(half2 p0) { return ceil(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_ceil_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_ceil_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.ceil.v3f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_ceil_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_ceil_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 half3 test_ceil_half3(half3 p0) { return ceil(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_ceil_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_ceil_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.ceil.v4f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_ceil_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_ceil_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 half4 test_ceil_half4(half4 p0) { return ceil(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_ceil_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_ceil_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.ceil.f32(
 float test_ceil_float(float p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_ceil_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_ceil_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.ceil.v2f32(
 float2 test_ceil_float2(float2 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_ceil_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_ceil_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.ceil.v3f32(
 float3 test_ceil_float3(float3 p0) { return ceil(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_ceil_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_ceil_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.ceil.v4f32(
 float4 test_ceil_float4(float4 p0) { return ceil(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
index c0e1e914831aa..eaedfb419c195 100644
--- a/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clamp-overloads.hlsl
@@ -1,18 +1,18 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:  -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:  -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
 // RUN:  -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:  -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:  -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: define [[FNATTRS]] <4 x i16> {{.*}}test_clamp_short4_mismatch
diff --git a/clang/test/CodeGenHLSL/builtins/clamp.hlsl b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
index d01c2a45c43c8..58db4423799be 100644
--- a/clang/test/CodeGenHLSL/builtins/clamp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clamp.hlsl
@@ -1,19 +1,19 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:  -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:  -DTARGET=dx -DFNATTRS=noundef -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=dx -DFNATTRS="hidden noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
 // RUN:  -fnative-half-type -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:  -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan-compute %s \
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:  -DTARGET=spv -DFNATTRS="spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
+// RUN:  -DTARGET=spv -DFNATTRS="hidden spir_func noundef" -DFFNATTRS="nofpclass(nan inf)"
 
 #ifdef __HLSL_ENABLE_16_BIT
 // NATIVE_HALF: define [[FNATTRS]] i16 @_Z16test_clamp_short
diff --git a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
index c864f93af472b..aaeb2f026449b 100644
--- a/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clip-builtin.hlsl
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK:      define void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]])
+// CHECK:      define hidden void @{{.*}}builtin_clip_float{{.*}}(float {{.*}} [[P0:%.*]])
 // CHECK:      [[LOAD:%.*]] = load float, ptr [[P0]].addr, align 4
 // CHECK-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00
 // CHECK-NO:   call i1 @llvm.dx.any
diff --git a/clang/test/CodeGenHLSL/builtins/clip.hlsl b/clang/test/CodeGenHLSL/builtins/clip.hlsl
index 5a1753766a8a1..e067828c38bf6 100644
--- a/clang/test/CodeGenHLSL/builtins/clip.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/clip.hlsl
@@ -3,13 +3,13 @@
 
 
 void test_scalar(float Buf) {
-  // CHECK:      define void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]])
+  // CHECK:      define hidden void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]])
   // CHECK:      [[LOAD:%.*]] = load float, ptr [[VALP]].addr, align 4
   // CHECK-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00
   // CHECK-NO:   call i1 @llvm.dx.any
   // CHECK-NEXT: call void @llvm.dx.discard(i1 [[FCMP]])
   //
-  // SPIRV:      define spir_func void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]])
+  // SPIRV:      define hidden spir_func void @{{.*}}test_scalar{{.*}}(float {{.*}} [[VALP:%.*]])
   // SPIRV:      [[LOAD:%.*]] = load float, ptr [[VALP]].addr, align 4
   // SPIRV-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float [[LOAD]], 0.000000e+00
   // SPIRV-NO:   call i1 @llvm.spv.any
@@ -21,13 +21,13 @@ void test_scalar(float Buf) {
 }
 
 void test_vector4(float4 Buf) {
-  // CHECK:      define void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]])
+  // CHECK:      define hidden void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]])
   // CHECK:      [[LOAD:%.*]] = load <4 x float>, ptr [[VALP]].addr, align 16
   // CHECK-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> [[LOAD]], zeroinitializer
   // CHECK-NEXT: [[ANYC:%.*]] = call i1 @llvm.dx.any.v4i1(<4 x i1> [[FCMP]])
   // CHECK-NEXT: call void @llvm.dx.discard(i1 [[ANYC]])
   //
-  // SPIRV:      define spir_func void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]])
+  // SPIRV:      define hidden spir_func void @{{.*}}test_vector{{.*}}(<4 x float> {{.*}} [[VALP:%.*]])
   // SPIRV:      [[LOAD:%.*]] = load <4 x float>, ptr [[VALP]].addr, align 16
   // SPIRV-NEXT: [[FCMP:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> [[LOAD]], zeroinitializer
   // SPIRV-NEXT: [[ANYC:%.*]] = call i1 @llvm.spv.any.v4i1(<4 x i1> [[FCMP]]) 
diff --git a/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl
index b7b11b1c3bd6d..70926cc8ba743 100644
--- a/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cos-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_double(double p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_double2(double2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_double3(double3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_double4(double4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_int(int p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_int2(int2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_int3(int3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_int4(int4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_uint(uint p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_uint2(uint2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_uint3(uint3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_uint4(uint4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_int64_t(int64_t p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_int64_t2(int64_t2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_int64_t3(int64_t3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_int64_t4(int64_t4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_cos_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_cos_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_uint64_t(uint64_t p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_cos_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_uint64_t2(uint64_t2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_cos_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_uint64_t3(uint64_t3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_cos_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_uint64_t4(uint64_t4 p0) { return cos(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/cos.hlsl b/clang/test/CodeGenHLSL/builtins/cos.hlsl
index 5f993d50498bf..79f9e1e6fbec2 100644
--- a/clang/test/CodeGenHLSL/builtins/cos.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cos.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_cos_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_cos_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.cos.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_cos_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_cos_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 half test_cos_half(half p0) { return cos(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_cos_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_cos_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.cos.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_cos_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_cos_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32(
 half2 test_cos_half2(half2 p0) { return cos(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_cos_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_cos_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.cos.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_cos_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_cos_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32(
 half3 test_cos_half3(half3 p0) { return cos(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_cos_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_cos_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.cos.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_cos_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_cos_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32(
 half4 test_cos_half4(half4 p0) { return cos(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_cos_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_cos_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.cos.f32(
 float test_cos_float(float p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_cos_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_cos_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.cos.v2f32
 float2 test_cos_float2(float2 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_cos_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_cos_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.cos.v3f32
 float3 test_cos_float3(float3 p0) { return cos(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_cos_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_cos_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.cos.v4f32
 float4 test_cos_float4(float4 p0) { return cos(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl
index b2a1d6316787d..89ac383e2517f 100644
--- a/clang/test/CodeGenHLSL/builtins/cross.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] <3 x half> @
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half>
diff --git a/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl
index bafd2368c9961..a1abf435ea10c 100644
--- a/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/degrees-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: %hlsl.degrees = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].degrees.f32(
diff --git a/clang/test/CodeGenHLSL/builtins/degrees.hlsl b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
index 64531dd2785eb..f0fb12855e5f6 100644
--- a/clang/test/CodeGenHLSL/builtins/degrees.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/degrees.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: %hlsl.degrees = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].degrees.f16(
diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl
index ac38cf1853799..0c24fbb9f1859 100644
--- a/clang/test/CodeGenHLSL/builtins/distance.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl
@@ -6,14 +6,14 @@
 // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
 // CHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[SUB_I]])
 // CHECK-NEXT:    ret half [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh(
 // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]]
@@ -22,7 +22,7 @@
 //
 half test_distance_half(half X, half Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_(
 // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]]
@@ -30,7 +30,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half2Dv2_DhS_(
 // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]]
@@ -39,7 +39,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); }
 //
 half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_(
 // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]]
@@ -47,7 +47,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half3Dv3_DhS_(
 // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]]
@@ -56,7 +56,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); }
 //
 half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_(
 // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]]
@@ -64,7 +64,7 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z19test_distance_half4Dv4_DhS_(
 // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]]
@@ -73,14 +73,14 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); }
 //
 half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z19test_distance_floatff(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z19test_distance_floatff(
 // CHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[SUB_I]])
 // CHECK-NEXT:    ret float [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff(
 // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]]
@@ -89,7 +89,7 @@ half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); }
 //
 float test_distance_float(float X, float Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_(
 // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]]
@@ -97,7 +97,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float2Dv2_fS_(
 // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]]
@@ -106,7 +106,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); }
 //
 float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_(
 // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]]
@@ -114,7 +114,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float3Dv3_fS_(
 // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]]
@@ -123,7 +123,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); }
 //
 float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]]
@@ -131,7 +131,7 @@ float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); }
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z20test_distance_float4Dv4_fS_(
 // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]]
diff --git a/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl
index 858a1210169d2..df34beeba7a8c 100644
--- a/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp-overloads.hlsl
@@ -2,87 +2,87 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_double
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_double(double p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_double2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_double2(double2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_double3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_double3(double3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_double4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_double4(double4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_int
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_int(int p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_int2(int2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_int3(int3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_int4(int4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_uint
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_uint(uint p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_uint2(uint2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_uint3(uint3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_uint4(uint4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_int64_t
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_int64_t(int64_t p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_int64_t2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_int64_t2(int64_t2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_int64_t3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_int64_t3(int64_t3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_int64_t4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_int64_t4(int64_t4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp_uint64_t
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_uint64_t(uint64_t p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp_uint64_t2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_uint64_t2(uint64_t2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp_uint64_t3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_uint64_t3(uint64_t3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp_uint64_t4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_uint64_t4(uint64_t4 p0) { return exp(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/exp.hlsl b/clang/test/CodeGenHLSL/builtins/exp.hlsl
index 6ed40ed8f433c..5a8f60528a84c 100644
--- a/clang/test/CodeGenHLSL/builtins/exp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp.hlsl
@@ -5,48 +5,48 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_exp_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_exp_half
 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn half @llvm.exp.f16(
 // NATIVE_HALF: ret half %elt.exp
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_exp_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_exp_half
 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // NO_HALF: ret float %elt.exp
 half test_exp_half(half p0) { return exp(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_exp_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_exp_half2
 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp.v2f16
 // NATIVE_HALF: ret <2 x half> %elt.exp
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_exp_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_exp_half2
 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32(
 // NO_HALF: ret <2 x float> %elt.exp
 half2 test_exp_half2(half2 p0) { return exp(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_exp_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_exp_half3
 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp.v3f16
 // NATIVE_HALF: ret <3 x half> %elt.exp
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_exp_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_exp_half3
 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32(
 // NO_HALF: ret <3 x float> %elt.exp
 half3 test_exp_half3(half3 p0) { return exp(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_exp_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_exp_half4
 // NATIVE_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp.v4f16
 // NATIVE_HALF: ret <4 x half> %elt.exp
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_exp_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_exp_half4
 // NO_HALF: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32(
 // NO_HALF: ret <4 x float> %elt.exp
 half4 test_exp_half4(half4 p0) { return exp(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_exp_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_exp_float
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn float @llvm.exp.f32(
 // CHECK: ret float %elt.exp
 float test_exp_float(float p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_exp_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_exp_float2
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp.v2f32
 // CHECK: ret <2 x float> %elt.exp
 float2 test_exp_float2(float2 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_exp_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_exp_float3
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp.v3f32
 // CHECK: ret <3 x float> %elt.exp
 float3 test_exp_float3(float3 p0) { return exp(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_exp_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_exp_float4
 // CHECK: %elt.exp = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp.v4f32
 // CHECK: ret <4 x float> %elt.exp
 float4 test_exp_float4(float4 p0) { return exp(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl
index ef522afc244a8..20482777a18de 100644
--- a/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp2-overloads.hlsl
@@ -2,87 +2,87 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_double
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_double(double p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_double2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_double2(double2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_double3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_double3(double3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_double4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_double4(double4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_int
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_int(int p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_int2(int2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_int3(int3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_int4(int4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_uint
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_uint(uint p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_uint2(uint2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_uint3(uint3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_uint4(uint4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_int64_t
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_int64_t(int64_t p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_int64_t2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_int64_t2(int64_t2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_int64_t3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_int64_t3(int64_t3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_int64_t4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_int64_t4(int64_t4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_exp2_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_exp2_uint64_t
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_uint64_t(uint64_t p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_exp2_uint64_t2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_uint64_t2(uint64_t2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_exp2_uint64_t3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_uint64_t3(uint64_t3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_exp2_uint64_t4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_uint64_t4(uint64_t4 p0) { return exp2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/exp2.hlsl b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
index b067427e46368..a9bbcb0d9bff9 100644
--- a/clang/test/CodeGenHLSL/builtins/exp2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/exp2.hlsl
@@ -5,48 +5,48 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_exp2_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_exp2_half
 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(
 // NATIVE_HALF: ret half %elt.exp2
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_exp2_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_exp2_half
 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // NO_HALF: ret float %elt.exp2
 half test_exp2_half(half p0) { return exp2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_exp2_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_exp2_half2
 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16
 // NATIVE_HALF: ret <2 x half> %elt.exp2
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_exp2_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_exp2_half2
 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32(
 // NO_HALF: ret <2 x float> %elt.exp2
 half2 test_exp2_half2(half2 p0) { return exp2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_exp2_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_exp2_half3
 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16
 // NATIVE_HALF: ret <3 x half> %elt.exp2
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_exp2_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_exp2_half3
 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32(
 // NO_HALF: ret <3 x float> %elt.exp2
 half3 test_exp2_half3(half3 p0) { return exp2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_exp2_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_exp2_half4
 // NATIVE_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16
 // NATIVE_HALF: ret <4 x half> %elt.exp2
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_exp2_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_exp2_half4
 // NO_HALF: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32(
 // NO_HALF: ret <4 x float> %elt.exp2
 half4 test_exp2_half4(half4 p0) { return exp2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_exp2_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_exp2_float
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(
 // CHECK: ret float %elt.exp2
 float test_exp2_float(float p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_exp2_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_exp2_float2
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32
 // CHECK: ret <2 x float> %elt.exp2
 float2 test_exp2_float2(float2 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_exp2_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_exp2_float3
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32
 // CHECK: ret <3 x float> %elt.exp2
 float3 test_exp2_float3(float3 p0) { return exp2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_exp2_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_exp2_float4
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32
 // CHECK: ret <4 x float> %elt.exp2
 float4 test_exp2_float4(float4 p0) { return exp2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl
index 26d83443ea489..1e413e53f333e 100644
--- a/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/floor-overloads.hlsl
@@ -4,67 +4,67 @@
 
 using hlsl::floor;
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_double(double p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_double2(double2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_double3(double3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_double4(double4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_int(int p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_int2(int2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_int3(int3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_int4(int4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_uint(uint p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_uint2(uint2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_uint3(uint3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_uint4(uint4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_int64_t(int64_t p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_int64_t2(int64_t2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_int64_t3(int64_t3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_int64_t4(int64_t4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_floor_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_floor_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_uint64_t(uint64_t p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_floor_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_uint64_t2(uint64_t2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_floor_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_uint64_t3(uint64_t3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_floor_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_uint64_t4(uint64_t4 p0) { return floor(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/floor.hlsl b/clang/test/CodeGenHLSL/builtins/floor.hlsl
index f610baeeefd48..b3ff58317981a 100644
--- a/clang/test/CodeGenHLSL/builtins/floor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/floor.hlsl
@@ -7,36 +7,36 @@
 
 using hlsl::floor;
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_floor_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_floor_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.floor.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_floor_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_floor_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(float %0)
 half test_floor_half(half p0) { return floor(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_floor_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_floor_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.floor.v2f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_floor_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_floor_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 half2 test_floor_half2(half2 p0) { return floor(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_floor_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_floor_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.floor.v3f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_floor_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_floor_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 half3 test_floor_half3(half3 p0) { return floor(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_floor_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_floor_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.floor.v4f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_floor_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_floor_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 half4 test_floor_half4(half4 p0) { return floor(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_floor_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_floor_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.floor.f32(
 float test_floor_float(float p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_floor_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_floor_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.floor.v2f32(
 float2 test_floor_float2(float2 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_floor_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_floor_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.floor.v3f32(
 float3 test_floor_float3(float3 p0) { return floor(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_floor_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_floor_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.floor.v4f32(
 float4 test_floor_float4(float4 p0) { return floor(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/fmod.hlsl b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
index 7ecc5854b3988..cc91c0b67f6cc 100644
--- a/clang/test/CodeGenHLSL/builtins/fmod.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/fmod.hlsl
@@ -4,7 +4,7 @@
 //
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN:   -emit-llvm -o - | FileCheck %s -DFNATTRS="noundef nofpclass(nan inf)" \
+// RUN:   -emit-llvm -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \
 // RUN:   -DTYPE=half -DINT_TYPE=f16 --check-prefixes=DXCHECK
 
 //
@@ -12,7 +12,7 @@
 //
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm \
-// RUN:   -o - | FileCheck %s -DFNATTRS="noundef nofpclass(nan inf)" \
+// RUN:   -o - | FileCheck %s -DFNATTRS="hidden noundef nofpclass(nan inf)" \
 // RUN:   -DTYPE=float -DINT_TYPE=f32 --check-prefixes=DXCHECK
 
 
@@ -23,7 +23,7 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -o - | FileCheck %s \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTYPE=half
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=half
 
 //
 // ---------- No Native Half support test -----------
@@ -31,7 +31,7 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm \
 // RUN:   -o - | FileCheck %s \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTYPE=float
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTYPE=float
 
 
diff --git a/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl
index b0e844bd8a8d8..7a3f7b0069480 100644
--- a/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/frac-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: %hlsl.frac = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].frac.f32(
diff --git a/clang/test/CodeGenHLSL/builtins/frac.hlsl b/clang/test/CodeGenHLSL/builtins/frac.hlsl
index 7b105ce84359f..d8397407cd013 100644
--- a/clang/test/CodeGenHLSL/builtins/frac.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/frac.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: %hlsl.frac = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].frac.f16(
diff --git a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl
index 6d2ae6535ecb3..24114b11c7602 100644
--- a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl
@@ -6,9 +6,9 @@ using handle_float_t = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::c
 // CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", %struct.MyStruct, 0, 0)
 // CHECK: %struct.MyStruct = type <{ <4 x float>, <2 x i32> }>
 
-// CHECK: define void @_Z2faU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
+// CHECK: define hidden void @_Z2faU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
 // CHECK: call void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %0)
-// CHECK: declare void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0))
+// CHECK: declare hidden void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0))
 
 void foo1(handle_float_t res);
 
@@ -16,14 +16,14 @@ void fa(handle_float_t a) {
     foo1(a);
 }
 
-// CHECK: define void @_Z2fbU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
+// CHECK: define hidden void @_Z2fbU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a)
 void fb(handle_float_t a) {
     handle_float_t b = a;
 }
 
-// CHECK: define void @_Z2fcN4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %a)
+// CHECK: define hidden void @_Z2fcN4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %a)
 // CHECK: call void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %agg.tmp)
-// CHECK: declare void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4)
+// CHECK: declare hidden void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 4)
 void foo2(RWBuffer<float4> buf);
 
 void fc(RWBuffer<float4> a) {
@@ -39,9 +39,9 @@ struct MyStruct {
   int2 i;
 };
 
-// CHECK: define void @_Z2feN4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %a)
+// CHECK: define hidden void @_Z2feN4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %a)
 // CHECK: call void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4 %agg.tmp)
-// CHECK: declare void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4)
+// CHECK: declare hidden void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 4)
 void foo3(StructuredBuffer<MyStruct> buf);
 
 void fe(StructuredBuffer<MyStruct> a) {
diff --git a/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl
index ace209003ce43..f39cba9ace6e3 100644
--- a/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/isinf-overloads.hlsl
@@ -2,19 +2,19 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s
 
-// CHECK: define noundef i1 @
+// CHECK: define hidden noundef i1 @
 // CHECK: %dx.isinf = call i1 @llvm.dx.isinf.f32(
 // CHECK: ret i1 %dx.isinf
 bool test_isinf_double(double p0) { return isinf(p0); }
-// CHECK: define noundef <2 x i1> @
+// CHECK: define hidden noundef <2 x i1> @
 // CHECK: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32
 // CHECK: ret <2 x i1> %dx.isinf
 bool2 test_isinf_double2(double2 p0) { return isinf(p0); }
-// CHECK: define noundef <3 x i1> @
+// CHECK: define hidden noundef <3 x i1> @
 // CHECK: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32
 // CHECK: ret <3 x i1> %dx.isinf
 bool3 test_isinf_double3(double3 p0) { return isinf(p0); }
-// CHECK: define noundef <4 x i1> @
+// CHECK: define hidden noundef <4 x i1> @
 // CHECK: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32
 // CHECK: ret <4 x i1> %dx.isinf
 bool4 test_isinf_double4(double4 p0) { return isinf(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/isinf.hlsl b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
index df44fc4a91dfd..4d53daaafb692 100644
--- a/clang/test/CodeGenHLSL/builtins/isinf.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/isinf.hlsl
@@ -6,40 +6,40 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// CHECK: define noundef i1 @
+// CHECK: define hidden noundef i1 @
 // NATIVE_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f16(
 // NO_HALF: %dx.isinf = call i1 @llvm.dx.isinf.f32(
 // CHECK: ret i1 %dx.isinf
 bool test_isinf_half(half p0) { return isinf(p0); }
-// CHECK: define noundef <2 x i1> @
+// CHECK: define hidden noundef <2 x i1> @
 // NATIVE_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f16
 // NO_HALF: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32(
 // CHECK: ret <2 x i1> %dx.isinf
 bool2 test_isinf_half2(half2 p0) { return isinf(p0); }
-// NATIVE_HALF: define noundef <3 x i1> @
+// NATIVE_HALF: define hidden noundef <3 x i1> @
 // NATIVE_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f16
 // NO_HALF: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32(
 // CHECK: ret <3 x i1> %dx.isinf
 bool3 test_isinf_half3(half3 p0) { return isinf(p0); }
-// NATIVE_HALF: define noundef <4 x i1> @
+// NATIVE_HALF: define hidden noundef <4 x i1> @
 // NATIVE_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f16
 // NO_HALF: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32(
 // CHECK: ret <4 x i1> %dx.isinf
 bool4 test_isinf_half4(half4 p0) { return isinf(p0); }
 
-// CHECK: define noundef i1 @
+// CHECK: define hidden noundef i1 @
 // CHECK: %dx.isinf = call i1 @llvm.dx.isinf.f32(
 // CHECK: ret i1 %dx.isinf
 bool test_isinf_float(float p0) { return isinf(p0); }
-// CHECK: define noundef <2 x i1> @
+// CHECK: define hidden noundef <2 x i1> @
 // CHECK: %dx.isinf = call <2 x i1> @llvm.dx.isinf.v2f32
 // CHECK: ret <2 x i1> %dx.isinf
 bool2 test_isinf_float2(float2 p0) { return isinf(p0); }
-// CHECK: define noundef <3 x i1> @
+// CHECK: define hidden noundef <3 x i1> @
 // CHECK: %dx.isinf = call <3 x i1> @llvm.dx.isinf.v3f32
 // CHECK: ret <3 x i1> %dx.isinf
 bool3 test_isinf_float3(float3 p0) { return isinf(p0); }
-// CHECK: define noundef <4 x i1> @
+// CHECK: define hidden noundef <4 x i1> @
 // CHECK: %dx.isinf = call <4 x i1> @llvm.dx.isinf.v4f32
 // CHECK: ret <4 x i1> %dx.isinf
 bool4 test_isinf_float4(float4 p0) { return isinf(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
index ea0d1348c6e4e..f8fa06c39f2a1 100644
--- a/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/ldexp.hlsl
@@ -1,48 +1,48 @@
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) half @_ZN4hlsl8__detail10ldexp_implIDhEET_S2_S2_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn half @llvm.exp2.f16(half %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn half %elt.exp2, %{{.*}}
 // CHECK: ret half %mul
 half test_ldexp_half(half X, half Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <2 x half> @_ZN4hlsl8__detail10ldexp_implIDv2_DhEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <2 x half> @_ZN4hlsl8__detail10ldexp_implIDv2_DhEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.exp2.v2f16(<2 x half> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x half> %elt.exp2, %{{.*}}
 // CHECK: ret <2 x half> %mul
 half2 test_ldexp_half2(half2 X, half2 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <3 x half> @_ZN4hlsl8__detail10ldexp_implIDv3_DhEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <3 x half> @_ZN4hlsl8__detail10ldexp_implIDv3_DhEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.exp2.v3f16(<3 x half> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x half> %elt.exp2, %{{.*}}
 // CHECK: ret <3 x half> %mul
 half3 test_ldexp_half3(half3 X, half3 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <4 x half> @_ZN4hlsl8__detail10ldexp_implIDv4_DhEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <4 x half> @_ZN4hlsl8__detail10ldexp_implIDv4_DhEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.exp2.v4f16(<4 x half> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x half> %elt.exp2, %{{.*}}
 // CHECK: ret <4 x half> %mul
 half4 test_ldexp_half4(half4 X, half4 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) float @_ZN4hlsl8__detail10ldexp_implIfEET_S2_S2_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) float @_ZN4hlsl8__detail10ldexp_implIfEET_S2_S2_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn float @llvm.exp2.f32(float %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn float %elt.exp2, %{{.*}}
 // CHECK: ret float %mul
 float test_ldexp_float(float X, float Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <2 x float> @_ZN4hlsl8__detail10ldexp_implIDv2_fEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <2 x float> @_ZN4hlsl8__detail10ldexp_implIDv2_fEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.exp2.v2f32(<2 x float> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <2 x float> %elt.exp2, %{{.*}}
 // CHECK: ret <2 x float> %mul
 float2 test_ldexp_float2(float2 X, float2 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <3 x float> @_ZN4hlsl8__detail10ldexp_implIDv3_fEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <3 x float> @_ZN4hlsl8__detail10ldexp_implIDv3_fEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.exp2.v3f32(<3 x float> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <3 x float> %elt.exp2, %{{.*}}
 // CHECK: ret <3 x float> %mul
 float3 test_ldexp_float3(float3 X, float3 Exp) { return ldexp(X, Exp); }
 
-// CHECK-LABEL: define linkonce_odr noundef nofpclass(nan inf) <4 x float> @_ZN4hlsl8__detail10ldexp_implIDv4_fEET_S3_S3_
+// CHECK-LABEL: define linkonce_odr hidden noundef nofpclass(nan inf) <4 x float> @_ZN4hlsl8__detail10ldexp_implIDv4_fEET_S3_S3_
 // CHECK: %elt.exp2 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.exp2.v4f32(<4 x float> %{{.*}})
 // CHECK: %mul = fmul reassoc nnan ninf nsz arcp afn <4 x float> %elt.exp2, %{{.*}}
 // CHECK: ret <4 x float> %mul
diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl
index 0b17d03d7097d..9297c35abfd16 100644
--- a/clang/test/CodeGenHLSL/builtins/length.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/length.hlsl
@@ -8,16 +8,13 @@
 // RUN: -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
-//
-
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
 // CHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    ret half [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh(
 // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[P0]])
@@ -28,18 +25,14 @@ half test_length_half(half p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
-//
-
-
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
 // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[P0]], <2 x half> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh(
 // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> nofpclass(nan inf) [[P0]])
@@ -50,15 +43,14 @@ half test_length_half2(half2 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
 // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[P0]], <3 x half> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh(
 // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> nofpclass(nan inf) [[P0]])
@@ -69,15 +61,14 @@ half test_length_half3(half3 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
 // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[P0]], <4 x half> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret half [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh(
 // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> nofpclass(nan inf) [[P0]])
@@ -88,14 +79,13 @@ half test_length_half4(half4 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf(
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z17test_length_floatf(
 // CHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    ret float [[ELT_ABS_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf(
 // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]])
@@ -106,15 +96,14 @@ float test_length_float(float p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
 // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[P0]], <2 x float> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f(
 // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> nofpclass(nan inf) [[P0]])
@@ -125,15 +114,14 @@ float test_length_float2(float2 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
 // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[P0]], <3 x float> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f(
 // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> nofpclass(nan inf) [[P0]])
@@ -144,15 +132,14 @@ float test_length_float3(float3 p0)
   return length(p0);
 }
 
-// DXCHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[P0]], <4 x float> nofpclass(nan inf) [[P0]])
 // CHECK-NEXT:    [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]])
 // CHECK-NEXT:    ret float [[TMP0]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f(
 // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> nofpclass(nan inf) [[P0]])
diff --git a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
index 3cb14f8555cab..3b13e43873c77 100644
--- a/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lerp-overloads.hlsl
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
-// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple  dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NATIVE_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple spirv-unknown-vulkan-compute %s -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @_Z16test_lerp_doubled(
 // CHECK:    [[CONV0:%.*]] = fptrunc {{.*}} double %{{.*}} to float
diff --git a/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl
index 5c63d630c3f3c..d7aacdc486ac6 100644
--- a/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_double(double p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_double2(double2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_double3(double3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_double4(double4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_int(int p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_int2(int2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_int3(int3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_int4(int4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_uint(uint p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_uint2(uint2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_uint3(uint3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_uint4(uint4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_int64_t(int64_t p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_int64_t2(int64_t2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_int64_t3(int64_t3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_int64_t4(int64_t4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_uint64_t(uint64_t p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_uint64_t2(uint64_t2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_uint64_t3(uint64_t3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_uint64_t4(uint64_t4 p0) { return log(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log.hlsl b/clang/test/CodeGenHLSL/builtins/log.hlsl
index e489939594a53..0136c1a052ed4 100644
--- a/clang/test/CodeGenHLSL/builtins/log.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_log_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_log_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_log_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_log_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 half test_log_half(half p0) { return log(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_log_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_log_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_log_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_log_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32(
 half2 test_log_half2(half2 p0) { return log(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_log_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_log_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_log_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_log_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32(
 half3 test_log_half3(half3 p0) { return log(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_log_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_log_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_log_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_log_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32(
 half4 test_log_half4(half4 p0) { return log(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_log_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_log_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log.f32(
 float test_log_float(float p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_log_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_log_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log.v2f32
 float2 test_log_float2(float2 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_log_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_log_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log.v3f32
 float3 test_log_float3(float3 p0) { return log(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_log_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_log_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log.v4f32
 float4 test_log_float4(float4 p0) { return log(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl
index 1a0539c3517d5..e408f4a5d45ce 100644
--- a/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log10-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_double(double p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_double2(double2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_double3(double3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_double4(double4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_int(int p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_int2(int2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_int3(int3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_int4(int4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_uint(uint p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_uint2(uint2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_uint3(uint3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_uint4(uint4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_int64_t(int64_t p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_int64_t2(int64_t2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_int64_t3(int64_t3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_int64_t4(int64_t4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log10_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log10_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_uint64_t(uint64_t p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log10_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_uint64_t2(uint64_t2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log10_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_uint64_t3(uint64_t3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log10_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_uint64_t4(uint64_t4 p0) { return log10(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log10.hlsl b/clang/test/CodeGenHLSL/builtins/log10.hlsl
index 37c8e837c45a3..6a75444143b18 100644
--- a/clang/test/CodeGenHLSL/builtins/log10.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log10.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_log10_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_log10_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log10.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_log10_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_log10_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 half test_log10_half(half p0) { return log10(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_log10_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_log10_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log10.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_log10_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_log10_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32(
 half2 test_log10_half2(half2 p0) { return log10(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_log10_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_log10_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log10.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_log10_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_log10_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32(
 half3 test_log10_half3(half3 p0) { return log10(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_log10_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_log10_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log10.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_log10_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_log10_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32(
 half4 test_log10_half4(half4 p0) { return log10(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_log10_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_log10_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log10.f32(
 float test_log10_float(float p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_log10_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_log10_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log10.v2f32
 float2 test_log10_float2(float2 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_log10_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_log10_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log10.v3f32
 float3 test_log10_float3(float3 p0) { return log10(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_log10_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_log10_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log10.v4f32
 float4 test_log10_float4(float4 p0) { return log10(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl
index c35b50d8e490a..f88d5ab849212 100644
--- a/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log2-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_double(double p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_double2(double2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_double3(double3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_double4(double4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_int(int p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_int2(int2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_int3(int3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_int4(int4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_uint(uint p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_uint2(uint2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_uint3(uint3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_uint4(uint4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_int64_t(int64_t p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_int64_t2(int64_t2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_int64_t3(int64_t3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_int64_t4(int64_t4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_log2_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_log2_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_uint64_t(uint64_t p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_log2_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_uint64_t2(uint64_t2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_log2_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_uint64_t3(uint64_t3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_log2_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_uint64_t4(uint64_t4 p0) { return log2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/log2.hlsl b/clang/test/CodeGenHLSL/builtins/log2.hlsl
index 5159d5bb0fa4e..84d73c1810890 100644
--- a/clang/test/CodeGenHLSL/builtins/log2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/log2.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_log2_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_log2_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.log2.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_log2_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_log2_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 half test_log2_half(half p0) { return log2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_log2_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_log2_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.log2.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_log2_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_log2_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32(
 half2 test_log2_half2(half2 p0) { return log2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_log2_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_log2_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.log2.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_log2_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_log2_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32(
 half3 test_log2_half3(half3 p0) { return log2(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_log2_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_log2_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.log2.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_log2_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_log2_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32(
 half4 test_log2_half4(half4 p0) { return log2(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_log2_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_log2_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.log2.f32(
 float test_log2_float(float p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_log2_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_log2_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.log2.v2f32
 float2 test_log2_float2(float2 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_log2_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_log2_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.log2.v3f32
 float3 test_log2_float3(float3 p0) { return log2(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_log2_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_log2_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.log2.v4f32
 float4 test_log2_float4(float4 p0) { return log2(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
index d952398a6a592..cd7013ba75825 100644
--- a/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/max-overloads.hlsl
@@ -4,14 +4,14 @@
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_max_short4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_max_short4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MAX:%.*]] = call noundef <4 x i16> @llvm.smax.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]])
 // NATIVE_HALF: ret <4 x i16> [[MAX]]
 int16_t4 test_max_short4_mismatch(int16_t4 p0, int16_t p1) { return max(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_max_ushort4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_max_ushort4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MAX:%.*]] = call noundef <4 x i16> @llvm.umax.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]])
@@ -19,61 +19,61 @@ int16_t4 test_max_short4_mismatch(int16_t4 p0, int16_t p1) { return max(p0, p1);
 uint16_t4 test_max_ushort4_mismatch(uint16_t4 p0, uint16_t p1) { return max(p0, p1); }
 #endif
 
-// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_max_int4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_max_int4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call noundef <4 x i32> @llvm.smax.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]])
 // CHECK: ret <4 x i32> [[MAX]]
 int4 test_max_int4_mismatch(int4 p0, int p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_max_uint4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_max_uint4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call noundef <4 x i32> @llvm.umax.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]])
 // CHECK: ret <4 x i32> [[MAX]]
 uint4 test_max_uint4_mismatch(uint4 p0, uint p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_max_long4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_max_long4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call noundef <4 x i64> @llvm.smax.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]])
 // CHECK: ret <4 x i64> [[MAX]]
 int64_t4 test_max_long4_mismatch(int64_t4 p0, int64_t p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_max_ulong4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_max_ulong4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call noundef <4 x i64> @llvm.umax.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]])
 // CHECK: ret <4 x i64> [[MAX]]
 uint64_t4 test_max_ulong4_mismatch(uint64_t4 p0, uint64_t p1) { return max(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> {{.*}}test_max_half4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> {{.*}}test_max_half4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x half> poison, half %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x half> [[CONV0]], <4 x half> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.maxnum.v4f16(<4 x half> %{{.*}}, <4 x half> [[CONV1]])
 // NATIVE_HALF: ret <4 x half> [[MAX]]
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_half4_mismatch
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_half4_mismatch
 // NO_HALF: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0
 // NO_HALF: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer
 // NO_HALF: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.maxnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]])
 // NO_HALF: ret <4 x float> [[MAX]]
 half4 test_max_half4_mismatch(half4 p0, half p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_float4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_max_float4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.maxnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[MAX]]
 float4 test_max_float4_mismatch(float4 p0, float p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.maxnum.v4f64(<4 x double> %{{.*}}, <4 x double> [[CONV1]])
 // CHECK: ret <4 x double> [[MAX]]
 double4 test_max_double4_mismatch(double4 p0, double p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_max_double4_mismatch2
 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer
 // CHECK: [[MAX:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.maxnum.v4f64(<4 x double> [[CONV1]], <4 x double> %{{.*}})
diff --git a/clang/test/CodeGenHLSL/builtins/max.hlsl b/clang/test/CodeGenHLSL/builtins/max.hlsl
index 0b767335556ee..fab53a160c856 100644
--- a/clang/test/CodeGenHLSL/builtins/max.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/max.hlsl
@@ -6,128 +6,128 @@
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_max_short
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z14test_max_short
 // NATIVE_HALF: call i16 @llvm.smax.i16(
 int16_t test_max_short(int16_t p0, int16_t p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_max_short2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z15test_max_short2
 // NATIVE_HALF: call <2 x i16> @llvm.smax.v2i16(
 int16_t2 test_max_short2(int16_t2 p0, int16_t2 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_max_short3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z15test_max_short3
 // NATIVE_HALF: call <3 x i16> @llvm.smax.v3i16
 int16_t3 test_max_short3(int16_t3 p0, int16_t3 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_max_short4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z15test_max_short4
 // NATIVE_HALF: call <4 x i16> @llvm.smax.v4i16
 int16_t4 test_max_short4(int16_t4 p0, int16_t4 p1) { return max(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_max_ushort
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z15test_max_ushort
 // NATIVE_HALF: call i16 @llvm.umax.i16(
 uint16_t test_max_ushort(uint16_t p0, uint16_t p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_max_ushort2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z16test_max_ushort2
 // NATIVE_HALF: call <2 x i16> @llvm.umax.v2i16
 uint16_t2 test_max_ushort2(uint16_t2 p0, uint16_t2 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_max_ushort3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z16test_max_ushort3
 // NATIVE_HALF: call <3 x i16> @llvm.umax.v3i16
 uint16_t3 test_max_ushort3(uint16_t3 p0, uint16_t3 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_max_ushort4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z16test_max_ushort4
 // NATIVE_HALF: call <4 x i16> @llvm.umax.v4i16
 uint16_t4 test_max_ushort4(uint16_t4 p0, uint16_t4 p1) { return max(p0, p1); }
 #endif
 
-// CHECK-LABEL: define noundef i32 @_Z12test_max_int
+// CHECK-LABEL: define hidden noundef i32 @_Z12test_max_int
 // CHECK: call i32 @llvm.smax.i32(
 int test_max_int(int p0, int p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z13test_max_int2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_max_int2
 // CHECK: call <2 x i32> @llvm.smax.v2i32
 int2 test_max_int2(int2 p0, int2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z13test_max_int3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z13test_max_int3
 // CHECK: call <3 x i32> @llvm.smax.v3i32
 int3 test_max_int3(int3 p0, int3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z13test_max_int4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_max_int4
 // CHECK: call <4 x i32> @llvm.smax.v4i32
 int4 test_max_int4(int4 p0, int4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef i32 @_Z13test_max_uint
+// CHECK-LABEL: define hidden noundef i32 @_Z13test_max_uint
 // CHECK: call i32 @llvm.umax.i32(
 int test_max_uint(uint p0, uint p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z14test_max_uint2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z14test_max_uint2
 // CHECK: call <2 x i32> @llvm.umax.v2i32
 uint2 test_max_uint2(uint2 p0, uint2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z14test_max_uint3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z14test_max_uint3
 // CHECK: call <3 x i32> @llvm.umax.v3i32
 uint3 test_max_uint3(uint3 p0, uint3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z14test_max_uint4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z14test_max_uint4
 // CHECK: call <4 x i32> @llvm.umax.v4i32
 uint4 test_max_uint4(uint4 p0, uint4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef i64 @_Z13test_max_long
+// CHECK-LABEL: define hidden noundef i64 @_Z13test_max_long
 // CHECK: call i64 @llvm.smax.i64(
 int64_t test_max_long(int64_t p0, int64_t p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z14test_max_long2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z14test_max_long2
 // CHECK: call <2 x i64> @llvm.smax.v2i64
 int64_t2 test_max_long2(int64_t2 p0, int64_t2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z14test_max_long3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z14test_max_long3
 // CHECK: call <3 x i64> @llvm.smax.v3i64
 int64_t3 test_max_long3(int64_t3 p0, int64_t3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z14test_max_long4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z14test_max_long4
 // CHECK: call <4 x i64> @llvm.smax.v4i64
 int64_t4 test_max_long4(int64_t4 p0, int64_t4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef i64 @_Z14test_max_ulong
+// CHECK-LABEL: define hidden noundef i64 @_Z14test_max_ulong
 // CHECK: call i64 @llvm.umax.i64(
 uint64_t test_max_ulong(uint64_t p0, uint64_t p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z15test_max_ulong2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z15test_max_ulong2
 // CHECK: call <2 x i64> @llvm.umax.v2i64
 uint64_t2 test_max_ulong2(uint64_t2 p0, uint64_t2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z15test_max_ulong3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z15test_max_ulong3
 // CHECK: call <3 x i64> @llvm.umax.v3i64
 uint64_t3 test_max_ulong3(uint64_t3 p0, uint64_t3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z15test_max_ulong4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z15test_max_ulong4
 // CHECK: call <4 x i64> @llvm.umax.v4i64
 uint64_t4 test_max_ulong4(uint64_t4 p0, uint64_t4 p1) { return max(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_max_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_max_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.maxnum.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_max_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_max_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.maxnum.f32(
 half test_max_half(half p0, half p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_max_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_max_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.maxnum.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_max_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_max_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.maxnum.v2f32(
 half2 test_max_half2(half2 p0, half2 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_max_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_max_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.maxnum.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_max_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_max_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.maxnum.v3f32(
 half3 test_max_half3(half3 p0, half3 p1) { return max(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_max_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_max_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.maxnum.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_max_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_max_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.maxnum.v4f32(
 half4 test_max_half4(half4 p0, half4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_max_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_max_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.maxnum.f32(
 float test_max_float(float p0, float p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_max_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_max_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.maxnum.v2f32
 float2 test_max_float2(float2 p0, float2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_max_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_max_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.maxnum.v3f32
 float3 test_max_float3(float3 p0, float3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_max_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_max_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.maxnum.v4f32
 float4 test_max_float4(float4 p0, float4 p1) { return max(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_max_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_max_double
 // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.maxnum.f64(
 double test_max_double(double p0, double p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_max_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_max_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.maxnum.v2f64
 double2 test_max_double2(double2 p0, double2 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_max_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_max_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.maxnum.v3f64
 double3 test_max_double3(double3 p0, double3 p1) { return max(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_max_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_max_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.maxnum.v4f64
 double4 test_max_double4(double4 p0, double4 p1) { return max(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
index 5c200f488c246..f81fa128ce9c7 100644
--- a/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/min-overloads.hlsl
@@ -4,14 +4,14 @@
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_min_short4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_min_short4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MIN:%.*]] = call noundef <4 x i16> @llvm.smin.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]])
 // NATIVE_HALF: ret <4 x i16> [[MIN]]
 int16_t4 test_min_short4_mismatch(int16_t4 p0, int16_t p1) { return min(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef <4 x i16> {{.*}}test_min_ushort4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> {{.*}}test_min_ushort4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x i16> poison, i16 %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x i16> [[CONV0]], <4 x i16> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MIN:%.*]] = call noundef <4 x i16> @llvm.umin.v4i16(<4 x i16> %{{.*}}, <4 x i16> [[CONV1]])
@@ -19,61 +19,61 @@ int16_t4 test_min_short4_mismatch(int16_t4 p0, int16_t p1) { return min(p0, p1);
 uint16_t4 test_min_ushort4_mismatch(uint16_t4 p0, uint16_t p1) { return min(p0, p1); }
 #endif
 
-// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_min_int4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_min_int4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call noundef <4 x i32> @llvm.smin.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]])
 // CHECK: ret <4 x i32> [[MIN]]
 int4 test_min_int4_mismatch(int4 p0, int p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i32> {{.*}}test_min_uint4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i32> {{.*}}test_min_uint4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i32> poison, i32 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i32> [[CONV0]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call noundef <4 x i32> @llvm.umin.v4i32(<4 x i32> %{{.*}}, <4 x i32> [[CONV1]])
 // CHECK: ret <4 x i32> [[MIN]]
 uint4 test_min_uint4_mismatch(uint4 p0, uint p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_min_long4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_min_long4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call noundef <4 x i64> @llvm.smin.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]])
 // CHECK: ret <4 x i64> [[MIN]]
 int64_t4 test_min_long4_mismatch(int64_t4 p0, int64_t p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef <4 x i64> {{.*}}test_min_ulong4_mismatch
+// CHECK-LABEL: define hidden noundef <4 x i64> {{.*}}test_min_ulong4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x i64> poison, i64 %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x i64> [[CONV0]], <4 x i64> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call noundef <4 x i64> @llvm.umin.v4i64(<4 x i64> %{{.*}}, <4 x i64> [[CONV1]])
 // CHECK: ret <4 x i64> [[MIN]]
 uint64_t4 test_min_ulong4_mismatch(uint64_t4 p0, uint64_t p1) { return min(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> {{.*}}test_min_half4_mismatch
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> {{.*}}test_min_half4_mismatch
 // NATIVE_HALF: [[CONV0:%.*]] = insertelement <4 x half> poison, half %{{.*}}, i64 0
 // NATIVE_HALF: [[CONV1:%.*]] = shufflevector <4 x half> [[CONV0]], <4 x half> poison, <4 x i32> zeroinitializer
 // NATIVE_HALF: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.minnum.v4f16(<4 x half> %{{.*}}, <4 x half> [[CONV1]])
 // NATIVE_HALF: ret <4 x half> [[MIN]]
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_half4_mismatch
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_half4_mismatch
 // NO_HALF: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0
 // NO_HALF: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer
 // NO_HALF: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.minnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]])
 // NO_HALF: ret <4 x float> [[MIN]]
 half4 test_min_half4_mismatch(half4 p0, half p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_float4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_min_float4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x float> poison, float %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x float> [[CONV0]], <4 x float> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.minnum.v4f32(<4 x float> %{{.*}}, <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[MIN]]
 float4 test_min_float4_mismatch(float4 p0, float p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch
 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.minnum.v4f64(<4 x double> %{{.*}}, <4 x double> [[CONV1]])
 // CHECK: ret <4 x double> [[MIN]]
 double4 test_min_double4_mismatch(double4 p0, double p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch2
 // CHECK: [[CONV0:%.*]] = insertelement <4 x double> poison, double %{{.*}}, i64 0
 // CHECK: [[CONV1:%.*]] = shufflevector <4 x double> [[CONV0]], <4 x double> poison, <4 x i32> zeroinitializer
 // CHECK: [[MIN:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x double> @llvm.minnum.v4f64(<4 x double> [[CONV1]], <4 x double> %{{.*}})
diff --git a/clang/test/CodeGenHLSL/builtins/min.hlsl b/clang/test/CodeGenHLSL/builtins/min.hlsl
index 508d8b68ea452..b3e8fedff9b1b 100644
--- a/clang/test/CodeGenHLSL/builtins/min.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/min.hlsl
@@ -6,131 +6,131 @@
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
 #ifdef __HLSL_ENABLE_16_BIT
-// NATIVE_HALF-LABEL: define noundef i16 @_Z14test_min_short
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z14test_min_short
 // NATIVE_HALF: call i16 @llvm.smin.i16(
 int16_t test_min_short(int16_t p0, int16_t p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z15test_min_short2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z15test_min_short2
 // NATIVE_HALF: call <2 x i16> @llvm.smin.v2i16(
 int16_t2 test_min_short2(int16_t2 p0, int16_t2 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z15test_min_short3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z15test_min_short3
 // NATIVE_HALF: call <3 x i16> @llvm.smin.v3i16
 int16_t3 test_min_short3(int16_t3 p0, int16_t3 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z15test_min_short4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z15test_min_short4
 // NATIVE_HALF: call <4 x i16> @llvm.smin.v4i16
 int16_t4 test_min_short4(int16_t4 p0, int16_t4 p1) { return min(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef i16 @_Z15test_min_ushort
+// NATIVE_HALF-LABEL: define hidden noundef i16 @_Z15test_min_ushort
 // NATIVE_HALF: call i16 @llvm.umin.i16(
 uint16_t test_min_ushort(uint16_t p0, uint16_t p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <2 x i16> @_Z16test_min_ushort2
+// NATIVE_HALF-LABEL: define hidden noundef <2 x i16> @_Z16test_min_ushort2
 // NATIVE_HALF: call <2 x i16> @llvm.umin.v2i16
 uint16_t2 test_min_ushort2(uint16_t2 p0, uint16_t2 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <3 x i16> @_Z16test_min_ushort3
+// NATIVE_HALF-LABEL: define hidden noundef <3 x i16> @_Z16test_min_ushort3
 // NATIVE_HALF: call <3 x i16> @llvm.umin.v3i16
 uint16_t3 test_min_ushort3(uint16_t3 p0, uint16_t3 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef <4 x i16> @_Z16test_min_ushort4
+// NATIVE_HALF-LABEL: define hidden noundef <4 x i16> @_Z16test_min_ushort4
 // NATIVE_HALF: call <4 x i16> @llvm.umin.v4i16
 uint16_t4 test_min_ushort4(uint16_t4 p0, uint16_t4 p1) { return min(p0, p1); }
 #endif
 
-// CHECK-LABEL: define noundef i32 @_Z12test_min_int
+// CHECK-LABEL: define hidden noundef i32 @_Z12test_min_int
 // CHECK: call i32 @llvm.smin.i32(
 int test_min_int(int p0, int p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z13test_min_int2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z13test_min_int2
 // CHECK: call <2 x i32> @llvm.smin.v2i32
 int2 test_min_int2(int2 p0, int2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z13test_min_int3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z13test_min_int3
 // CHECK: call <3 x i32> @llvm.smin.v3i32
 int3 test_min_int3(int3 p0, int3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z13test_min_int4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z13test_min_int4
 // CHECK: call <4 x i32> @llvm.smin.v4i32
 int4 test_min_int4(int4 p0, int4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef i32 @_Z13test_min_uint
+// CHECK-LABEL: define hidden noundef i32 @_Z13test_min_uint
 // CHECK: call i32 @llvm.umin.i32(
 int test_min_uint(uint p0, uint p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i32> @_Z14test_min_uint2
+// CHECK-LABEL: define hidden noundef <2 x i32> @_Z14test_min_uint2
 // CHECK: call <2 x i32> @llvm.umin.v2i32
 uint2 test_min_uint2(uint2 p0, uint2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i32> @_Z14test_min_uint3
+// CHECK-LABEL: define hidden noundef <3 x i32> @_Z14test_min_uint3
 // CHECK: call <3 x i32> @llvm.umin.v3i32
 uint3 test_min_uint3(uint3 p0, uint3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i32> @_Z14test_min_uint4
+// CHECK-LABEL: define hidden noundef <4 x i32> @_Z14test_min_uint4
 // CHECK: call <4 x i32> @llvm.umin.v4i32
 uint4 test_min_uint4(uint4 p0, uint4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef i64 @_Z13test_min_long
+// CHECK-LABEL: define hidden noundef i64 @_Z13test_min_long
 // CHECK: call i64 @llvm.smin.i64(
 int64_t test_min_long(int64_t p0, int64_t p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z14test_min_long2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z14test_min_long2
 // CHECK: call <2 x i64> @llvm.smin.v2i64
 int64_t2 test_min_long2(int64_t2 p0, int64_t2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z14test_min_long3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z14test_min_long3
 // CHECK: call <3 x i64> @llvm.smin.v3i64
 int64_t3 test_min_long3(int64_t3 p0, int64_t3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z14test_min_long4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z14test_min_long4
 // CHECK: call <4 x i64> @llvm.smin.v4i64
 int64_t4 test_min_long4(int64_t4 p0, int64_t4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef i64 @_Z14test_min_ulong
+// CHECK-LABEL: define hidden noundef i64 @_Z14test_min_ulong
 // CHECK: call i64 @llvm.umin.i64(
 uint64_t test_min_ulong(uint64_t p0, uint64_t p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <2 x i64> @_Z15test_min_ulong2
+// CHECK-LABEL: define hidden noundef <2 x i64> @_Z15test_min_ulong2
 // CHECK: call <2 x i64> @llvm.umin.v2i64
 uint64_t2 test_min_ulong2(uint64_t2 p0, uint64_t2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <3 x i64> @_Z15test_min_ulong3
+// CHECK-LABEL: define hidden noundef <3 x i64> @_Z15test_min_ulong3
 // CHECK: call <3 x i64> @llvm.umin.v3i64
 uint64_t3 test_min_ulong3(uint64_t3 p0, uint64_t3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef <4 x i64> @_Z15test_min_ulong4
+// CHECK-LABEL: define hidden noundef <4 x i64> @_Z15test_min_ulong4
 // CHECK: call <4 x i64> @llvm.umin.v4i64
 uint64_t4 test_min_ulong4(uint64_t4 p0, uint64_t4 p1) { return min(p0, p1); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_min_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_min_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.minnum.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_min_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_min_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.minnum.f32(
 half test_min_half(half p0, half p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_min_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_min_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.minnum.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_min_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_min_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.minnum.v2f32(
 half2 test_min_half2(half2 p0, half2 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_min_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_min_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.minnum.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_min_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_min_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.minnum.v3f32(
 half3 test_min_half3(half3 p0, half3 p1) { return min(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_min_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_min_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.minnum.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_min_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_min_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.minnum.v4f32(
 half4 test_min_half4(half4 p0, half4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_min_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_min_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.minnum.f32(
 float test_min_float(float p0, float p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_min_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_min_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.minnum.v2f32
 float2 test_min_float2(float2 p0, float2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_min_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_min_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.minnum.v3f32
 float3 test_min_float3(float3 p0, float3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_min_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_min_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.minnum.v4f32
 float4 test_min_float4(float4 p0, float4 p1) { return min(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) double @_Z15test_min_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) double @_Z15test_min_double
 // CHECK: call reassoc nnan ninf nsz arcp afn double @llvm.minnum.f64(
 double test_min_double(double p0, double p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x double> @_Z16test_min_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x double> @_Z16test_min_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.minnum.v2f64
 double2 test_min_double2(double2 p0, double2 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x double> @_Z16test_min_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x double> @_Z16test_min_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.minnum.v3f64
 double3 test_min_double3(double3 p0, double3 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> @_Z16test_min_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> @_Z16test_min_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.minnum.v4f64
 double4 test_min_double4(double4 p0, double4 p1) { return min(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x double> {{.*}}test_min_double4_mismatch
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.minnum.v4f64
 double4 test_min_double4_mismatch(double4 p0, double p1) { return min(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl
index e9baa25fc6409..52ff7da94c4f7 100644
--- a/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/normalize-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].normalize.f32(float
diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
index 830fc26b7acf0..cc2378756a50a 100644
--- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].normalize.f16(half
diff --git a/clang/test/CodeGenHLSL/builtins/or.hlsl b/clang/test/CodeGenHLSL/builtins/or.hlsl
index 69c57c5455f7d..66cc5572a75b5 100644
--- a/clang/test/CodeGenHLSL/builtins/or.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/or.hlsl
@@ -2,7 +2,7 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-//CHECK-LABEL: define noundef i1 @_Z14test_or_scalarbb(
+//CHECK-LABEL: define hidden noundef i1 @_Z14test_or_scalarbb(
 //CHECK-SAME: i1 noundef [[X:%.*]], i1 noundef [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[HLSL_OR:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
@@ -12,7 +12,7 @@ bool test_or_scalar(bool x, bool y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <2 x i1> @_Z13test_or_bool2Dv2_bS_(
+//CHECK-LABEL: define hidden noundef <2 x i1> @_Z13test_or_bool2Dv2_bS_(
 //CHECK-SAME: <2 x i1> noundef [[X:%.*]], <2 x i1> noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[HLSL_OR:%.*]] = or <2 x i1> [[A:%.*]], [[B:%.*]]
@@ -22,7 +22,7 @@ bool2 test_or_bool2(bool2 x, bool2 y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <3 x i1> @_Z13test_or_bool3Dv3_bS_(
+//CHECK-LABEL: define hidden noundef <3 x i1> @_Z13test_or_bool3Dv3_bS_(
 //CHECK-SAME: <3 x i1> noundef [[X:%.*]], <3 x i1> noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[HLSL_OR:%.*]] = or <3 x i1> [[A:%.*]], [[B:%.*]]
@@ -32,7 +32,7 @@ bool3 test_or_bool3(bool3 x, bool3 y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <4 x i1> @_Z13test_or_bool4Dv4_bS_(
+//CHECK-LABEL: define hidden noundef <4 x i1> @_Z13test_or_bool4Dv4_bS_(
 //CHECK-SAME: <4 x i1> noundef [[X:%.*]], <4 x i1> noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[HLSL_OR:%.*]] = or <4 x i1> [[A:%.*]], [[B:%.*]]
@@ -42,7 +42,7 @@ bool4 test_or_bool4(bool4 x, bool4 y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef i1 @_Z11test_or_intii(
+//CHECK-LABEL: define hidden noundef i1 @_Z11test_or_intii(
 //CHECK-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[TOBBOL:%.*]] = icmp ne i32 [[A:%.*]], 0
@@ -54,7 +54,7 @@ bool test_or_int(int x, int y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <4 x i1> @_Z12test_or_int4Dv4_iS_(
+//CHECK-LABEL: define hidden noundef <4 x i1> @_Z12test_or_int4Dv4_iS_(
 //CHECK-SAME: <4 x i32> noundef [[X:%.*]], <4 x i32> noundef [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[TOBOOL:%.*]] = icmp ne <4 x i32> [[A:%.*]], zeroinitializer
@@ -66,7 +66,7 @@ bool4 test_or_int4(int4 x, int4 y)
     return or(x, y);
 }
 
-//CHECK-LABEL: define noundef <4 x i1> @_Z14test_or_float4Dv4_fS_(
+//CHECK-LABEL: define hidden noundef <4 x i1> @_Z14test_or_float4Dv4_fS_(
 //CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) #[[ATTR0]] {
 //CHECK-NEXT:  entry:
 //CHECK:         [[TOBOOL:%.*]] =  fcmp reassoc nnan ninf nsz arcp afn une <4 x float> [[A:%.*]], zeroinitializer
diff --git a/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl
index 39003aef7b7b5..0d1f3d3546a33 100644
--- a/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/pow-overloads.hlsl
@@ -2,125 +2,125 @@
 // RUN:  -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK \
 // RUN:  -DFLOATATTRS="reassoc nnan ninf nsz arcp afn"
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_double
 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] double %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] double %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_double(double p0, double p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_double2
 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <2 x double> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <2 x double> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_double2(double2 p0, double2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_double3
 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <3 x double> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <3 x double> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_double3(double3 p0, double3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_double4
 // CHECK: [[CONV0:%.*]] = fptrunc [[FLOATATTRS]] <4 x double> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = fptrunc [[FLOATATTRS]] <4 x double> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[POW]]
 float4 test_pow_double4(double4 p0, double4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_int
 // CHECK: [[CONV0:%.*]] = sitofp i32 %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = sitofp i32 %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_int(int p0, int p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int2
 // CHECK: [[CONV0:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <2 x i32> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_int2(int2 p0, int2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int3
 // CHECK: [[CONV0:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <3 x i32> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_int3(int3 p0, int3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int4
 // CHECK: [[CONV0:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <4 x i32> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[POW]]
 float4 test_pow_int4(int4 p0, int4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_uint
 // CHECK: [[CONV0:%.*]] = uitofp i32 %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = uitofp i32 %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_uint(uint p0, uint p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint2
 // CHECK: [[CONV0:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <2 x i32> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_uint2(uint2 p0, uint2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint3
 // CHECK: [[CONV0:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <3 x i32> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_uint3(uint3 p0, uint3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint4
 // CHECK: [[CONV0:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <4 x i32> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[POW]]
 float4 test_pow_uint4(uint4 p0, uint4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_int64_t
 // CHECK: [[CONV0:%.*]] = sitofp i64 %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = sitofp i64 %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_int64_t(int64_t p0, int64_t p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_int64_t2
 // CHECK: [[CONV0:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <2 x i64> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_int64_t2(int64_t2 p0, int64_t2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_int64_t3
 // CHECK: [[CONV0:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <3 x i64> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_int64_t3(int64_t3 p0, int64_t3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_int64_t4
 // CHECK: [[CONV0:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = sitofp <4 x i64> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
 // CHECK: ret <4 x float> [[POW]]
 float4 test_pow_int64_t4(int64_t4 p0, int64_t4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_pow_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_pow_uint64_t
 // CHECK: [[CONV0:%.*]] = uitofp i64 %{{.*}} to float
 // CHECK: [[CONV1:%.*]] = uitofp i64 %{{.*}} to float
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef float @llvm.pow.f32(float [[CONV0]], float [[CONV1]])
 // CHECK: ret float [[POW]]
 float test_pow_uint64_t(uint64_t p0, uint64_t p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_pow_uint64_t2
 // CHECK: [[CONV0:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <2 x i64> %{{.*}} to <2 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <2 x float> @llvm.pow.v2f32(<2 x float> [[CONV0]], <2 x float> [[CONV1]])
 // CHECK: ret <2 x float> [[POW]]
 float2 test_pow_uint64_t2(uint64_t2 p0, uint64_t2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_pow_uint64_t3
 // CHECK: [[CONV0:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <3 x i64> %{{.*}} to <3 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <3 x float> @llvm.pow.v3f32(<3 x float> [[CONV0]], <3 x float> [[CONV1]])
 // CHECK: ret <3 x float> [[POW]]
 float3 test_pow_uint64_t3(uint64_t3 p0, uint64_t3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_pow_uint64_t4
 // CHECK: [[CONV0:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
 // CHECK: [[CONV1:%.*]] = uitofp <4 x i64> %{{.*}} to <4 x float>
 // CHECK: [[POW:%.*]] = call [[FLOATATTRS]] noundef <4 x float> @llvm.pow.v4f32(<4 x float> [[CONV0]], <4 x float> [[CONV1]])
diff --git a/clang/test/CodeGenHLSL/builtins/pow.hlsl b/clang/test/CodeGenHLSL/builtins/pow.hlsl
index fd21f1b94c57e..fcde755e15fcc 100644
--- a/clang/test/CodeGenHLSL/builtins/pow.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/pow.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_pow_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_pow_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.pow.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_pow_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_pow_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.pow.f32(
 half test_pow_half(half p0, half p1) { return pow(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_pow_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_pow_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.pow.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_pow_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_pow_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.pow.v2f32(
 half2 test_pow_half2(half2 p0, half2 p1) { return pow(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_pow_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_pow_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.pow.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_pow_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_pow_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.pow.v3f32(
 half3 test_pow_half3(half3 p0, half3 p1) { return pow(p0, p1); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_pow_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_pow_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.pow.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_pow_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_pow_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.pow.v4f32(
 half4 test_pow_half4(half4 p0, half4 p1) { return pow(p0, p1); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_pow_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_pow_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.pow.f32(
 float test_pow_float(float p0, float p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_pow_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_pow_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.pow.v2f32
 float2 test_pow_float2(float2 p0, float2 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_pow_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_pow_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.pow.v3f32
 float3 test_pow_float3(float3 p0, float3 p1) { return pow(p0, p1); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_pow_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_pow_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.pow.v4f32
 float4 test_pow_float4(float4 p0, float4 p1) { return pow(p0, p1); }
diff --git a/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl
index d0cfc7b60265b..4b12f590edcd6 100644
--- a/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/radians-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].radians.f32(
diff --git a/clang/test/CodeGenHLSL/builtins/radians.hlsl b/clang/test/CodeGenHLSL/builtins/radians.hlsl
index efdeb9f6e142a..f281747fbf298 100644
--- a/clang/test/CodeGenHLSL/builtins/radians.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/radians.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ 
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DTARGET=dx -DFNATTRS="noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef nofpclass(nan inf)"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)"
 
 
 // NATIVE_HALF: define [[FNATTRS]] half @
diff --git a/clang/test/CodeGenHLSL/builtins/rcp.hlsl b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
index 8f07f3a031531..cdfaa3c5f1ee3 100644
--- a/clang/test/CodeGenHLSL/builtins/rcp.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rcp.hlsl
@@ -13,90 +13,90 @@
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF,SPIR_NO_HALF,SPIR_CHECK
 
-// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) half @
-// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) half @
+// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) half @
+// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) half @
 // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn half 0xH3C00, %{{.*}} 
 // NATIVE_HALF: ret half %hlsl.rcp
-// DXIL_NO_HALF: define noundef nofpclass(nan inf) float @
-// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) float @
+// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) float @
+// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) float @
 // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn float 1.000000e+00, %{{.*}}
 // NO_HALF: ret float %hlsl.rcp
 half test_rcp_half(half p0) { return rcp(p0); }
 
-// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <2 x half> @
-// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <2 x half> @
+// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <2 x half> @
+// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @
 // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x half> splat (half  0xH3C00), %{{.*}} 
 // NATIVE_HALF: ret <2 x half> %hlsl.rcp
-// DXIL_NO_HALF: define noundef nofpclass(nan inf) <2 x float> @
-// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <2 x float> @
+// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <2 x float> @
+// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @
 // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), %{{.*}}
 // NO_HALF: ret <2 x float> %hlsl.rcp
 half2 test_rcp_half2(half2 p0) { return rcp(p0); }
 
-// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <3 x half> @
-// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <3 x half> @
+// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <3 x half> @
+// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @
 // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x half> splat (half  0xH3C00), %{{.*}} 
 // NATIVE_HALF: ret <3 x half> %hlsl.rcp
-// DXIL_NO_HALF: define noundef nofpclass(nan inf) <3 x float> @
-// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <3 x float> @
+// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <3 x float> @
+// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @
 // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), %{{.*}}
 // NO_HALF: ret <3 x float> %hlsl.rcp
 half3 test_rcp_half3(half3 p0) { return rcp(p0); }
 
-// DXIL_NATIVE_HALF: define noundef nofpclass(nan inf) <4 x half> @
-// SPIR_NATIVE_HALF: define spir_func noundef nofpclass(nan inf) <4 x half> @
+// DXIL_NATIVE_HALF: define hidden noundef nofpclass(nan inf) <4 x half> @
+// SPIR_NATIVE_HALF: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @
 // NATIVE_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x half> splat (half  0xH3C00), %{{.*}} 
 // NATIVE_HALF: ret <4 x half> %hlsl.rcp
-// DXIL_NO_HALF: define noundef nofpclass(nan inf) <4 x float> @
-// SPIR_NO_HALF: define spir_func noundef nofpclass(nan inf) <4 x float> @
+// DXIL_NO_HALF: define hidden noundef nofpclass(nan inf) <4 x float> @
+// SPIR_NO_HALF: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @
 // NO_HALF: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), %{{.*}}
 // NO_HALF: ret <4 x float> %hlsl.rcp
 half4 test_rcp_half4(half4 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) float @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) float @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) float @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) float @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn float 1.000000e+00, %{{.*}}
 // CHECK: ret float %hlsl.rcp
 float test_rcp_float(float p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <2 x float> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <2 x float> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <2 x float> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), %{{.*}}
 // CHECK: ret <2 x float> %hlsl.rcp
 float2 test_rcp_float2(float2 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <3 x float> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <3 x float> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), %{{.*}}
 // CHECK: ret <3 x float> %hlsl.rcp
 float3 test_rcp_float3(float3 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <4 x float> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <4 x float> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), %{{.*}}
 // CHECK: ret <4 x float> %hlsl.rcp
 float4 test_rcp_float4(float4 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) double @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) double @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) double @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) double @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn double 1.000000e+00, %{{.*}} 
 // CHECK: ret double %hlsl.rcp
 double test_rcp_double(double p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <2 x double> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <2 x double> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <2 x double> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <2 x double> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <2 x double> splat (double 1.000000e+00), %{{.*}}
 // CHECK: ret <2 x double> %hlsl.rcp
 double2 test_rcp_double2(double2 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <3 x double> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <3 x double> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <3 x double> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <3 x double> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <3 x double> splat (double 1.000000e+00), %{{.*}}
 // CHECK: ret <3 x double> %hlsl.rcp
 double3 test_rcp_double3(double3 p0) { return rcp(p0); }
 
-// DXIL_CHECK: define noundef nofpclass(nan inf) <4 x double> @
-// SPIR_CHECK: define spir_func noundef nofpclass(nan inf) <4 x double> @
+// DXIL_CHECK: define hidden noundef nofpclass(nan inf) <4 x double> @
+// SPIR_CHECK: define hidden spir_func noundef nofpclass(nan inf) <4 x double> @
 // CHECK: %hlsl.rcp = fdiv reassoc nnan ninf nsz arcp afn <4 x double> splat (double 1.000000e+00), %{{.*}}
 // CHECK: ret <4 x double> %hlsl.rcp
 double4 test_rcp_double4(double4 p0) { return rcp(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
index c082e63ac1da6..65fefd801ffed 100644
--- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl
@@ -6,7 +6,7 @@
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
 // CHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000
@@ -15,7 +15,7 @@
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[I]], [[MUL2_I]]
 // CHECK-NEXT:    ret half [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_reflect_halfDhDh(
 // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[I]], 0xH4000
@@ -28,7 +28,7 @@ half test_reflect_half(half I, half N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_(
 // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]])
@@ -39,7 +39,7 @@ half test_reflect_half(half I, half N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <2 x half> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_(
 // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]])
@@ -49,7 +49,7 @@ half2 test_reflect_half2(half2 I, half2 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_(
 // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]])
@@ -60,7 +60,7 @@ half2 test_reflect_half2(half2 I, half2 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <3 x half> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_(
 // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]])
@@ -70,7 +70,7 @@ half3 test_reflect_half3(half3 I, half3 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_(
 // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]])
@@ -81,7 +81,7 @@ half3 test_reflect_half3(half3 I, half3 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <4 x half> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_(
 // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]])
@@ -91,7 +91,7 @@ half4 test_reflect_half4(half4 I, half4 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_reflect_floatff(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_reflect_floatff(
 // CHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00
@@ -100,7 +100,7 @@ half4 test_reflect_half4(half4 I, half4 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[I]], [[MUL2_I]]
 // CHECK-NEXT:    ret float [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_reflect_floatff(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_reflect_floatff(
 // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[I]], 2.000000e+00
@@ -113,7 +113,7 @@ float test_reflect_float(float I, float N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_(
 // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]])
@@ -124,7 +124,7 @@ float test_reflect_float(float I, float N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <2 x float> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_(
 // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.reflect.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]])
@@ -134,7 +134,7 @@ float2 test_reflect_float2(float2 I, float2 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_(
 // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]])
@@ -145,7 +145,7 @@ float2 test_reflect_float2(float2 I, float2 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <3 x float> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_(
 // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]])
@@ -155,7 +155,7 @@ float3 test_reflect_float3(float3 I, float3 N) {
     return reflect(I, N);
 }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]])
@@ -166,7 +166,7 @@ float3 test_reflect_float3(float3 I, float3 N) {
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[I]], [[MUL1_I]]
 // CHECK-NEXT:    ret <4 x float> [[SUB_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_(
 // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]])
diff --git a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
index fe137b9cae4e9..91375c8f4eb8f 100644
--- a/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/reversebits.hlsl
@@ -3,25 +3,25 @@
 // RUN:   -emit-llvm -disable-llvm-passes -O3 -o - | FileCheck %s
 
 #ifdef __HLSL_ENABLE_16_BIT
-// CHECK: define noundef i16 @
+// CHECK: define hidden noundef i16 @
 // CHECK: call i16 @llvm.bitreverse.i16(
 uint16_t test_bitreverse_ushort(uint16_t p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <2 x i16> @
+// CHECK: define hidden noundef <2 x i16> @
 // CHECK: call <2 x i16> @llvm.bitreverse.v2i16
 uint16_t2 test_bitreverse_ushort2(uint16_t2 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <3 x i16> @
+// CHECK: define hidden noundef <3 x i16> @
 // CHECK: call <3 x i16> @llvm.bitreverse.v3i16
 uint16_t3 test_bitreverse_ushort3(uint16_t3 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <4 x i16> @
+// CHECK: define hidden noundef <4 x i16> @
 // CHECK: call <4 x i16> @llvm.bitreverse.v4i16
 uint16_t4 test_bitreverse_ushort4(uint16_t4 p0)
 {
@@ -29,50 +29,50 @@ uint16_t4 test_bitreverse_ushort4(uint16_t4 p0)
 }
 #endif
 
-// CHECK: define noundef i32 @
+// CHECK: define hidden noundef i32 @
 // CHECK: call i32 @llvm.bitreverse.i32(
 int test_bitreverse_uint(uint p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <2 x i32> @
+// CHECK: define hidden noundef <2 x i32> @
 // CHECK: call <2 x i32> @llvm.bitreverse.v2i32
 uint2 test_bitreverse_uint2(uint2 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <3 x i32> @
+// CHECK: define hidden noundef <3 x i32> @
 // CHECK: call <3 x i32> @llvm.bitreverse.v3i32
 uint3 test_bitreverse_uint3(uint3 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <4 x i32> @
+// CHECK: define hidden noundef <4 x i32> @
 // CHECK: call <4 x i32> @llvm.bitreverse.v4i32
 uint4 test_bitreverse_uint4(uint4 p0)
 {
 	return reversebits(p0);
 }
 
-// CHECK: define noundef i64 @
+// CHECK: define hidden noundef i64 @
 // CHECK: call i64 @llvm.bitreverse.i64(
 uint64_t test_bitreverse_long(uint64_t p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <2 x i64> @
+// CHECK: define hidden noundef <2 x i64> @
 // CHECK: call <2 x i64> @llvm.bitreverse.v2i64
 uint64_t2 test_bitreverse_long2(uint64_t2 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <3 x i64> @
+// CHECK: define hidden noundef <3 x i64> @
 // CHECK: call <3 x i64> @llvm.bitreverse.v3i64
 uint64_t3 test_bitreverse_long3(uint64_t3 p0)
 {
 	return reversebits(p0);
 }
-// CHECK: define noundef <4 x i64> @
+// CHECK: define hidden noundef <4 x i64> @
 // CHECK: call <4 x i64> @llvm.bitreverse.v4i64
 uint64_t4 test_bitreverse_long4(uint64_t4 p0)
 {
diff --git a/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl
index 109633a64d34e..3b07fcec064d8 100644
--- a/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/round-overloads.hlsl
@@ -2,87 +2,87 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_double
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_double(double p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_double2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_double2(double2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_double3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_double3(double3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_double4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_double4(double4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_int
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_int(int p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_int2(int2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_int3(int3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_int4(int4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_uint
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_uint(uint p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_uint2(uint2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_uint3(uint3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_uint4(uint4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_int64_t
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_int64_t(int64_t p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_int64_t2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_int64_t2(int64_t2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_int64_t3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_int64_t3(int64_t3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_int64_t4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_int64_t4(int64_t4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_round_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_round_uint64_t
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_uint64_t(uint64_t p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_round_uint64_t2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_uint64_t2(uint64_t2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_round_uint64_t3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_uint64_t3(uint64_t3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_round_uint64_t4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_uint64_t4(uint64_t4 p0) { return round(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/round.hlsl b/clang/test/CodeGenHLSL/builtins/round.hlsl
index a945a9677abbb..755f2e86fb116 100644
--- a/clang/test/CodeGenHLSL/builtins/round.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/round.hlsl
@@ -5,48 +5,48 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_round_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_round_half
 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn half @llvm.roundeven.f16(
 // NATIVE_HALF: ret half %elt.roundeven
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_round_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_round_half
 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // NO_HALF: ret float %elt.roundeven
 half test_round_half(half p0) { return round(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_round_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_round_half2
 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.roundeven.v2f16
 // NATIVE_HALF: ret <2 x half> %elt.roundeven
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_round_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_round_half2
 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32(
 // NO_HALF: ret <2 x float> %elt.roundeven
 half2 test_round_half2(half2 p0) { return round(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_round_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_round_half3
 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.roundeven.v3f16
 // NATIVE_HALF: ret <3 x half> %elt.roundeven
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_round_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_round_half3
 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32(
 // NO_HALF: ret <3 x float> %elt.roundeven
 half3 test_round_half3(half3 p0) { return round(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_round_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_round_half4
 // NATIVE_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.roundeven.v4f16
 // NATIVE_HALF: ret <4 x half> %elt.roundeven
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_round_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_round_half4
 // NO_HALF: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32(
 // NO_HALF: ret <4 x float> %elt.roundeven
 half4 test_round_half4(half4 p0) { return round(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_round_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_round_float
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn float @llvm.roundeven.f32(
 // CHECK: ret float %elt.roundeven
 float test_round_float(float p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_round_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_round_float2
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.roundeven.v2f32
 // CHECK: ret <2 x float> %elt.roundeven
 float2 test_round_float2(float2 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_round_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_round_float3
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.roundeven.v3f32
 // CHECK: ret <3 x float> %elt.roundeven
 float3 test_round_float3(float3 p0) { return round(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_round_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_round_float4
 // CHECK: %elt.roundeven = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.roundeven.v4f32
 // CHECK: ret <4 x float> %elt.roundeven
 float4 test_round_float4(float4 p0) { return round(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl
index 09f21f366b9d2..262f306b92572 100644
--- a/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rsqrt-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: %hlsl.rsqrt = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].rsqrt.f32(
diff --git a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
index 6c9b1f643713b..9c398fd6f06cb 100644
--- a/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/rsqrt.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: %hlsl.rsqrt = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].rsqrt.f16(
diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl
index 8cc910933f462..cbdb929388934 100644
--- a/clang/test/CodeGenHLSL/builtins/sign.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DTARGET=dx -DFNATTRS=noundef
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DTARGET=dx -DFNATTRS=noundef
+// RUN:   -DTARGET=dx -DFNATTRS="hidden noundef"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef"
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DTARGET=spv -DFNATTRS="spir_func noundef"
+// RUN:   -DTARGET=spv -DFNATTRS="hidden spir_func noundef"
 
 // NATIVE_HALF: define [[FNATTRS]] i32 @
 // NATIVE_HALF: %hlsl.sign = call i32 @llvm.[[TARGET]].sign.f16(
diff --git a/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl
index a5522e4f28b7f..e471cb3d42c5c 100644
--- a/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sin-overloads.hlsl
@@ -2,67 +2,67 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_double(double p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_double2(double2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_double3(double3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_double4(double4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_int(int p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_int2(int2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_int3(int3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_int4(int4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_uint(uint p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_uint2(uint2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_uint3(uint3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_uint4(uint4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_int64_t(int64_t p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_int64_t2(int64_t2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_int64_t3(int64_t3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_int64_t4(int64_t4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sin_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sin_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_uint64_t(uint64_t p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sin_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_uint64_t2(uint64_t2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sin_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_uint64_t3(uint64_t3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sin_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_uint64_t4(uint64_t4 p0) { return sin(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/sin.hlsl b/clang/test/CodeGenHLSL/builtins/sin.hlsl
index 69c657239ef95..9bbe97997aa33 100644
--- a/clang/test/CodeGenHLSL/builtins/sin.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sin.hlsl
@@ -5,36 +5,36 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z13test_sin_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z13test_sin_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.sin.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z13test_sin_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z13test_sin_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 half test_sin_half(half p0) { return sin(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z14test_sin_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z14test_sin_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sin.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z14test_sin_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z14test_sin_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32(
 half2 test_sin_half2(half2 p0) { return sin(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z14test_sin_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z14test_sin_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sin.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z14test_sin_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z14test_sin_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32(
 half3 test_sin_half3(half3 p0) { return sin(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z14test_sin_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z14test_sin_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sin.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z14test_sin_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z14test_sin_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32(
 half4 test_sin_half4(half4 p0) { return sin(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z14test_sin_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_sin_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.sin.f32(
 float test_sin_float(float p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_sin_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_sin_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sin.v2f32
 float2 test_sin_float2(float2 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_sin_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_sin_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sin.v3f32
 float3 test_sin_float3(float3 p0) { return sin(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_sin_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_sin_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sin.v4f32
 float4 test_sin_float4(float4 p0) { return sin(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
index d3e5c1059029c..bef64ce77d470 100644
--- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl
@@ -6,7 +6,7 @@
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -O1 -o - | FileCheck %s --check-prefix=SPVCHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
 // CHECK-SAME: half noundef nofpclass(nan inf) [[MIN:%.*]], half noundef nofpclass(nan inf) [[MAX:%.*]], half noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[MIN]]
@@ -19,7 +19,7 @@
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret half [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh(
 // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[MIN:%.*]], half noundef nofpclass(nan inf) [[MAX:%.*]], half noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.smoothstep.f16(half nofpclass(nan inf) [[MIN]], half nofpclass(nan inf) [[MAX]], half nofpclass(nan inf) [[X]])
@@ -27,7 +27,7 @@
 //
 half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_(
 // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[MIN]]
@@ -40,7 +40,7 @@ half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <2 x half> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_(
 // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.smoothstep.v2f16(<2 x half> nofpclass(nan inf) [[MIN]], <2 x half> nofpclass(nan inf) [[MAX]], <2 x half> nofpclass(nan inf) [[X]])
@@ -48,7 +48,7 @@ half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, M
 //
 half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_(
 // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[MIN]]
@@ -61,7 +61,7 @@ half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <3 x half> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_(
 // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.smoothstep.v3f16(<3 x half> nofpclass(nan inf) [[MIN]], <3 x half> nofpclass(nan inf) [[MAX]], <3 x half> nofpclass(nan inf) [[X]])
@@ -69,7 +69,7 @@ half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(M
 //
 half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_(
 // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[MIN]]
@@ -82,7 +82,7 @@ half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <4 x half> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_(
 // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.smoothstep.v4f16(<4 x half> nofpclass(nan inf) [[MIN]], <4 x half> nofpclass(nan inf) [[MAX]], <4 x half> nofpclass(nan inf) [[X]])
@@ -90,7 +90,7 @@ half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(M
 //
 half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff(
 // CHECK-SAME: float noundef nofpclass(nan inf) [[MIN:%.*]], float noundef nofpclass(nan inf) [[MAX:%.*]], float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[MIN]]
@@ -103,7 +103,7 @@ half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret float [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff(
 // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[MIN:%.*]], float noundef nofpclass(nan inf) [[MAX:%.*]], float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.smoothstep.f32(float nofpclass(nan inf) [[MIN]], float nofpclass(nan inf) [[MAX]], float nofpclass(nan inf) [[X]])
@@ -111,7 +111,7 @@ half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(M
 //
 float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_(
 // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[MIN]]
@@ -124,7 +124,7 @@ float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(M
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <2 x float> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_(
 // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.smoothstep.v2f32(<2 x float> nofpclass(nan inf) [[MIN]], <2 x float> nofpclass(nan inf) [[MAX]], <2 x float> nofpclass(nan inf) [[X]])
@@ -132,7 +132,7 @@ float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(M
 //
 float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_(
 // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[MIN]]
@@ -145,7 +145,7 @@ float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smooths
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <3 x float> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_(
 // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.smoothstep.v3f32(<3 x float> nofpclass(nan inf) [[MIN]], <3 x float> nofpclass(nan inf) [[MAX]], <3 x float> nofpclass(nan inf) [[X]])
@@ -153,7 +153,7 @@ float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smooths
 //
 float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smoothstep(Min, Max, X); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_(
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_(
 // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[MIN]]
@@ -166,7 +166,7 @@ float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smooths
 // CHECK-NEXT:    [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[TMP0]], [[SUB2_I]]
 // CHECK-NEXT:    ret <4 x float> [[MUL4_I]]
 //
-// SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_(
+// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_(
 // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 // SPVCHECK-NEXT:  [[ENTRY:.*:]]
 // SPVCHECK-NEXT:    [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.smoothstep.v4f32(<4 x float> nofpclass(nan inf) [[MIN]], <4 x float> nofpclass(nan inf) [[MAX]], <4 x float> nofpclass(nan inf) [[X]])
diff --git a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
index a883c9d5cc355..aeb2b79e90291 100644
--- a/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/splitdouble.hlsl
@@ -8,7 +8,7 @@
 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} i32 {{.*}}test_scalar{{.*}}(double {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} i32 {{.*}}test_scalar{{.*}}(double {{.*}} [[VALD:%.*]])
 // SPIRV-NOT:  @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load double, ptr [[VALD]].addr, align 8
 // SPIRV-NEXT: [[CAST:%.*]] = bitcast double [[LOAD]] to <2 x i32>
@@ -26,7 +26,7 @@ uint test_scalar(double D) {
 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { i32, i32 } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} <1 x i32> {{.*}}test_double1{{.*}}(<1 x double> {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} <1 x i32> {{.*}}test_double1{{.*}}(<1 x double> {{.*}} [[VALD:%.*]])
 // SPIRV-NOT:  @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load <1 x double>, ptr [[VALD]].addr, align 8
 // SPIRV-NEXT: [[TRUNC:%.*]] = extractelement <1 x double> [[LOAD]], i64 0
@@ -44,7 +44,7 @@ uint1 test_double1(double1 D) {
 // CHECK-NEXT: extractvalue { <2 x i32>, <2 x i32> } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { <2 x i32>, <2 x i32> } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} <2 x i32> {{.*}}test_vector2{{.*}}(<2 x double> {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} <2 x i32> {{.*}}test_vector2{{.*}}(<2 x double> {{.*}} [[VALD:%.*]])
 // SPIRV-NOT:  @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load <2 x double>, ptr [[VALD]].addr, align 16
 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <2 x double> [[LOAD]] to <4 x i32>
@@ -61,7 +61,7 @@ uint2 test_vector2(double2 D) {
 // CHECK-NEXT: extractvalue { <3 x i32>, <3 x i32> } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { <3 x i32>, <3 x i32> } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} <3 x i32> {{.*}}test_vector3{{.*}}(<3 x double> {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} <3 x i32> {{.*}}test_vector3{{.*}}(<3 x double> {{.*}} [[VALD:%.*]])
 // SPIRV-NOT:  @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load <3 x double>, ptr [[VALD]].addr, align 32
 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <3 x double> [[LOAD]] to <6 x i32>
@@ -78,7 +78,7 @@ uint3 test_vector3(double3 D) {
 // CHECK-NEXT: extractvalue { <4 x i32>, <4 x i32> } [[VALRET]], 0
 // CHECK-NEXT: extractvalue { <4 x i32>, <4 x i32> } [[VALRET]], 1
 //
-// SPIRV: define spir_func {{.*}} <4 x i32> {{.*}}test_vector4{{.*}}(<4 x double> {{.*}} [[VALD:%.*]])
+// SPIRV: define hidden spir_func {{.*}} <4 x i32> {{.*}}test_vector4{{.*}}(<4 x double> {{.*}} [[VALD:%.*]])
 // SPIRV-NOT: @llvm.dx.splitdouble.i32
 // SPIRV:      [[LOAD:%.*]] = load <4 x double>, ptr [[VALD]].addr, align 32
 // SPIRV-NEXT: [[CAST1:%.*]] = bitcast <4 x double> [[LOAD]] to <8 x i32>
diff --git a/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl
index 48b74c9db5c64..d4de244f38b3e 100644
--- a/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sqrt-overloads.hlsl
@@ -2,87 +2,87 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_double
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_double(double p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_double2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_double2(double2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_double3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_double3(double3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_double4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_double4(double4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_int
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_int(int p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_int2(int2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_int3(int3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_int4(int4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_uint(uint p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_uint2(uint2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_uint3(uint3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_uint4(uint4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_int64_t
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_int64_t(int64_t p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_int64_t2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_int64_t2(int64_t2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_int64_t3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_int64_t3(int64_t3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_int64_t4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_int64_t4(int64_t4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_sqrt_uint64_t
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_uint64_t(uint64_t p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_sqrt_uint64_t2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_uint64_t2(uint64_t2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_sqrt_uint64_t3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_uint64_t3(uint64_t3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_sqrt_uint64_t4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_uint64_t4(uint64_t4 p0) { return sqrt(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
index 94d966f0bef8a..31839f6bc177d 100644
--- a/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/sqrt.hlsl
@@ -5,48 +5,48 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z14test_sqrt_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z14test_sqrt_half
 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn half @llvm.sqrt.f16(
 // NATIVE_HALF: ret half %{{.*}}
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z14test_sqrt_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z14test_sqrt_half
 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // NO_HALF: ret float %{{.*}}
 half test_sqrt_half(half p0) { return sqrt(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z15test_sqrt_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z15test_sqrt_half2
 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sqrt.v2f16
 // NATIVE_HALF: ret <2 x half> %{{.*}}
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z15test_sqrt_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z15test_sqrt_half2
 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32(
 // NO_HALF: ret <2 x float> %{{.*}}
 half2 test_sqrt_half2(half2 p0) { return sqrt(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z15test_sqrt_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z15test_sqrt_half3
 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sqrt.v3f16
 // NATIVE_HALF: ret <3 x half> %{{.*}}
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z15test_sqrt_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z15test_sqrt_half3
 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32(
 // NO_HALF: ret <3 x float> %{{.*}}
 half3 test_sqrt_half3(half3 p0) { return sqrt(p0); }
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z15test_sqrt_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z15test_sqrt_half4
 // NATIVE_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sqrt.v4f16
 // NATIVE_HALF: ret <4 x half> %{{.*}}
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z15test_sqrt_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z15test_sqrt_half4
 // NO_HALF: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32(
 // NO_HALF: ret <4 x float> %{{.*}}
 half4 test_sqrt_half4(half4 p0) { return sqrt(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z15test_sqrt_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_sqrt_float
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(
 // CHECK: ret float %{{.*}}
 float test_sqrt_float(float p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_sqrt_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_sqrt_float2
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32
 // CHECK: ret <2 x float> %{{.*}}
 float2 test_sqrt_float2(float2 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_sqrt_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_sqrt_float3
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32
 // CHECK: ret <3 x float> %{{.*}}
 float3 test_sqrt_float3(float3 p0) { return sqrt(p0); }
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_sqrt_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_sqrt_float4
 // CHECK: %{{.*}} = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_sqrt_float4(float4 p0) { return sqrt(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl
index d3b979254391c..f55a8f8aff92d 100644
--- a/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/step-overloads.hlsl
@@ -1,11 +1,11 @@
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // CHECK: define [[FNATTRS]] float @
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].step.f32(float
diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl
index 49d09e5c6fe6f..be0ffbd794646 100644
--- a/clang/test/CodeGenHLSL/builtins/step.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/step.hlsl
@@ -2,20 +2,20 @@
 // RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="noundef nofpclass(nan inf)" -DTARGET=dx
+// RUN:   -DFNATTRS="hidden noundef nofpclass(nan inf)" -DTARGET=dx
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
 // RUN:   -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN:   -DFNATTRS="spir_func noundef nofpclass(nan inf)" -DTARGET=spv
+// RUN:   -DFNATTRS="hidden spir_func noundef nofpclass(nan inf)" -DTARGET=spv
 
 // NATIVE_HALF: define [[FNATTRS]] half @
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].step.f16(half
diff --git a/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl b/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl
index d913aabfb4066..51eb20c58e405 100644
--- a/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/trunc-overloads.hlsl
@@ -2,82 +2,82 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_double
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_double
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_double(double p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_double2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_double2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_double2(double2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_double3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_double3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_double3(double3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_double4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_double4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_double4(double4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_int
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_int
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_int(int p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_int2(int2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_int3(int3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_int4(int4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_uint
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_uint
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_uint(uint p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_uint2(uint2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_uint3(uint3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_uint4(uint4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_int64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_int64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_int64_t(int64_t p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_int64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_int64_t2(int64_t2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_int64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_int64_t3(int64_t3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_int64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_int64_t4(int64_t4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float {{.*}}test_trunc_uint64_t
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float {{.*}}test_trunc_uint64_t
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_uint64_t(uint64_t p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint64_t2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> {{.*}}test_trunc_uint64_t2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_uint64_t2(uint64_t2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint64_t3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> {{.*}}test_trunc_uint64_t3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_uint64_t3(uint64_t3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint64_t4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> {{.*}}test_trunc_uint64_t4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_uint64_t4(uint64_t4 p0) { return trunc(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/trunc.hlsl b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
index 26de5bf94c3cc..c1c6ee4119f0d 100644
--- a/clang/test/CodeGenHLSL/builtins/trunc.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/trunc.hlsl
@@ -5,42 +5,42 @@
 // RUN:  -emit-llvm -disable-llvm-passes -o - | \
 // RUN:  FileCheck %s --check-prefixes=CHECK,NO_HALF
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) half @_Z15test_trunc_half
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) half @_Z15test_trunc_half
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn half @llvm.trunc.f16(
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) float @_Z15test_trunc_half
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) float @_Z15test_trunc_half
 // NO_HALF: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 half test_trunc_half(half p0) { return trunc(p0); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z16test_trunc_half2
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z16test_trunc_half2
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.trunc.v2f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z16test_trunc_half2
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z16test_trunc_half2
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32(
 half2 test_trunc_half2(half2 p0) { return trunc(p0); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z16test_trunc_half3
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z16test_trunc_half3
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.trunc.v3f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z16test_trunc_half3
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z16test_trunc_half3
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32(
 half3 test_trunc_half3(half3 p0) { return trunc(p0); }
 
-// NATIVE_HALF-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z16test_trunc_half4
+// NATIVE_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z16test_trunc_half4
 // NATIVE_HALF: call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.trunc.v4f16
-// NO_HALF-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z16test_trunc_half4
+// NO_HALF-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z16test_trunc_half4
 // NO_HALF: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32(
 half4 test_trunc_half4(half4 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z16test_trunc_float
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z16test_trunc_float
 // CHECK: call reassoc nnan ninf nsz arcp afn float @llvm.trunc.f32(
 float test_trunc_float(float p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z17test_trunc_float2
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z17test_trunc_float2
 // CHECK: call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.trunc.v2f32
 float2 test_trunc_float2(float2 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z17test_trunc_float3
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z17test_trunc_float3
 // CHECK: call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.trunc.v3f32
 float3 test_trunc_float3(float3 p0) { return trunc(p0); }
 
-// CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z17test_trunc_float4
+// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z17test_trunc_float4
 // CHECK: call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.trunc.v4f32
 float4 test_trunc_float4(float4 p0) { return trunc(p0); }
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
index 3ab8048146ad3..0df3598a3cc3e 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK: define spir_func void @{{.*main.*}}() [[A0:#[0-9]+]] {
+// CHECK: define hidden spir_func void @{{.*main.*}}() [[A0:#[0-9]+]] {
 void main() {
 // CHECK: entry:
 // CHECK:   %[[CT_ENTRY:[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
index 8e1f2d69e7432..9034cae254036 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl
@@ -6,8 +6,8 @@
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
 // RUN:   --check-prefixes=CHECK,CHECK-DXIL
 
-// CHECK-SPIRV: define spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
-// CHECK-DXIL: define noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
+// CHECK-SPIRV: define hidden spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
+// CHECK-DXIL: define hidden noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] {
 // CHECK-SPIRV: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry()
 // CHECK-SPIRV: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ]
 // CHECK-DXIL: call i32 @llvm.dx.wave.getlaneindex()
diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
index 12b120d0c067d..a71b988417f09 100644
--- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
 // RUN:   spirv-pc-vulkan-library %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
-// CHECK: define spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] {
+// CHECK: define hidden spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] {
 // CHECK: %[[C1:[0-9]+]] = call token @llvm.experimental.convergence.entry()
 // CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ]
 uint test_1() {
@@ -10,7 +10,7 @@ uint test_1() {
 
 // CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]]
 
-// CHECK: define spir_func noundef i32 @_Z6test_2v() [[A0]] {
+// CHECK: define hidden spir_func noundef i32 @_Z6test_2v() [[A0]] {
 // CHECK: %[[C2:[0-9]+]] = call token @llvm.experimental.convergence.entry()
 // CHECK: call spir_func noundef i32 @_Z6test_1v() {{#[0-9]+}} [ "convergencectrl"(token %[[C2]]) ]
 uint test_2() {
diff --git a/clang/test/CodeGenHLSL/cbuffer.hlsl b/clang/test/CodeGenHLSL/cbuffer.hlsl
index eebf0f682d3de..b58a49b41eb98 100644
--- a/clang/test/CodeGenHLSL/cbuffer.hlsl
+++ b/clang/test/CodeGenHLSL/cbuffer.hlsl
@@ -46,14 +46,14 @@ cbuffer CBScalars : register(b1, space5) {
 
 // CHECK: @CBScalars.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars,
 // CHECK-SAME: 56, 0, 8, 16, 24, 32, 36, 40, 48))
-// CHECK: @a1 = external addrspace(2) global float, align 4
-// CHECK: @a2 = external addrspace(2) global double, align 8
-// CHECK: @a3 = external addrspace(2) global half, align 2
-// CHECK: @a4 = external addrspace(2) global i64, align 8
-// CHECK: @a5 = external addrspace(2) global i32, align 4
-// CHECK: @a6 = external addrspace(2) global i16, align 2
-// CHECK: @a7 = external addrspace(2) global i32, align 4
-// CHECK: @a8 = external addrspace(2) global i64, align 8
+// CHECK: @a1 = external hidden addrspace(2) global float, align 4
+// CHECK: @a2 = external hidden addrspace(2) global double, align 8
+// CHECK: @a3 = external hidden addrspace(2) global half, align 2
+// CHECK: @a4 = external hidden addrspace(2) global i64, align 8
+// CHECK: @a5 = external hidden addrspace(2) global i32, align 4
+// CHECK: @a6 = external hidden addrspace(2) global i16, align 2
+// CHECK: @a7 = external hidden addrspace(2) global i32, align 4
+// CHECK: @a8 = external hidden addrspace(2) global i64, align 8
 // CHECK: @CBScalars.str = private unnamed_addr constant [10 x i8] c"CBScalars\00", align 1
 
 cbuffer CBVectors {
@@ -69,13 +69,13 @@ cbuffer CBVectors {
 
 // CHECK: @CBVectors.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBVectors,
 // CHECK-SAME: 136, 0, 16, 40, 48, 80, 96, 112))
-// CHECK: @b1 = external addrspace(2) global <3 x float>, align 16
-// CHECK: @b2 = external addrspace(2) global <3 x double>, align 32
-// CHECK: @b3 = external addrspace(2) global <2 x half>, align 4
-// CHECK: @b4 = external addrspace(2) global <3 x i64>, align 32
-// CHECK: @b5 = external addrspace(2) global <4 x i32>, align 16
-// CHECK: @b6 = external addrspace(2) global <3 x i16>, align 8
-// CHECK: @b7 = external addrspace(2) global <3 x i64>, align 32
+// CHECK: @b1 = external hidden addrspace(2) global <3 x float>, align 16
+// CHECK: @b2 = external hidden addrspace(2) global <3 x double>, align 32
+// CHECK: @b3 = external hidden addrspace(2) global <2 x half>, align 4
+// CHECK: @b4 = external hidden addrspace(2) global <3 x i64>, align 32
+// CHECK: @b5 = external hidden addrspace(2) global <4 x i32>, align 16
+// CHECK: @b6 = external hidden addrspace(2) global <3 x i16>, align 8
+// CHECK: @b7 = external hidden addrspace(2) global <3 x i64>, align 32
 // CHECK: @CBVectors.str = private unnamed_addr constant [10 x i8] c"CBVectors\00", align 1
 
 cbuffer CBArrays : register(b2) {
@@ -91,14 +91,14 @@ cbuffer CBArrays : register(b2) {
 
 // CHECK: @CBArrays.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays,
 // CHECK-SAME: 708, 0, 48, 112, 176, 224, 608, 624, 656))
-// CHECK: @c1 = external addrspace(2) global [3 x float], align 4
-// CHECK: @c2 = external addrspace(2) global [2 x <3 x double>], align 32
-// CHECK: @c3 = external addrspace(2) global [2 x [2 x half]], align 2
-// CHECK: @c4 = external addrspace(2) global [3 x i64], align 8
-// CHECK: @c5 = external addrspace(2) global [2 x [3 x [4 x <4 x i32>]]], align 16
-// CHECK: @c6 = external addrspace(2) global [1 x i16], align 2
-// CHECK: @c7 = external addrspace(2) global [2 x i64], align 8
-// CHECK: @c8 = external addrspace(2) global [4 x i32], align 4
+// CHECK: @c1 = external hidden addrspace(2) global [3 x float], align 4
+// CHECK: @c2 = external hidden addrspace(2) global [2 x <3 x double>], align 32
+// CHECK: @c3 = external hidden addrspace(2) global [2 x [2 x half]], align 2
+// CHECK: @c4 = external hidden addrspace(2) global [3 x i64], align 8
+// CHECK: @c5 = external hidden addrspace(2) global [2 x [3 x [4 x <4 x i32>]]], align 16
+// CHECK: @c6 = external hidden addrspace(2) global [1 x i16], align 2
+// CHECK: @c7 = external hidden addrspace(2) global [2 x i64], align 8
+// CHECK: @c8 = external hidden addrspace(2) global [4 x i32], align 4
 // CHECK: @CBArrays.str = private unnamed_addr constant [9 x i8] c"CBArrays\00", align 1
 
 typedef uint32_t4 uint32_t8[2];
@@ -112,8 +112,8 @@ cbuffer CBTypedefArray : register(space2) {
 
 // CHECK: @CBTypedefArray.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBTypedefArray,
 // CHECK-SAME: 128, 0, 64))
-// CHECK: @t1 = external addrspace(2) global [2 x [2 x <4 x i32>]], align 16
-// CHECK: @t2 = external addrspace(2) global [2 x [2 x <4 x i32>]], align 16
+// CHECK: @t1 = external hidden addrspace(2) global [2 x [2 x <4 x i32>]], align 16
+// CHECK: @t2 = external hidden addrspace(2) global [2 x [2 x <4 x i32>]], align 16
 // CHECK: @CBTypedefArray.str = private unnamed_addr constant [15 x i8] c"CBTypedefArray\00", align 1
 struct Empty {};
 
@@ -137,13 +137,13 @@ struct D {
 
 // CHECK: @CBStructs.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBStructs,
 // CHECK-SAME: 246, 0, 16, 32, 64, 144, 238, 240))
-// CHECK: @a = external addrspace(2) global target("dx.Layout", %A, 8, 0), align 1
-// CHECK: @b = external addrspace(2) global target("dx.Layout", %B, 14, 0, 8), align 1
-// CHECK: @c = external addrspace(2) global target("dx.Layout", %C, 24, 0, 16), align 1
-// CHECK: @array_of_A = external addrspace(2) global [5 x target("dx.Layout", %A, 8, 0)], align 1
-// CHECK: @d = external addrspace(2) global target("dx.Layout", %__cblayout_D, 94, 0), align 1
-// CHECK: @e = external addrspace(2) global half, align 2
-// CHECK: @f = external addrspace(2) global <3 x i16>, align 8
+// CHECK: @a = external hidden addrspace(2) global target("dx.Layout", %A, 8, 0), align 1
+// CHECK: @b = external hidden addrspace(2) global target("dx.Layout", %B, 14, 0, 8), align 1
+// CHECK: @c = external hidden addrspace(2) global target("dx.Layout", %C, 24, 0, 16), align 1
+// CHECK: @array_of_A = external hidden addrspace(2) global [5 x target("dx.Layout", %A, 8, 0)], align 1
+// CHECK: @d = external hidden addrspace(2) global target("dx.Layout", %__cblayout_D, 94, 0), align 1
+// CHECK: @e = external hidden addrspace(2) global half, align 2
+// CHECK: @f = external hidden addrspace(2) global <3 x i16>, align 8
 // CHECK: @CBStructs.str = private unnamed_addr constant [10 x i8] c"CBStructs\00", align 1
 
 cbuffer CBStructs {
@@ -178,10 +178,10 @@ cbuffer CBClasses {
 
 // CHECK: @CBClasses.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBClasses,
 // CHECK-SAME: 260, 0, 16, 32, 112))
-// CHECK: @k = external addrspace(2) global target("dx.Layout", %K, 4, 0), align 1
-// CHECK: @l = external addrspace(2) global target("dx.Layout", %L, 8, 0, 4), align 1
-// CHECK: @m = external addrspace(2) global target("dx.Layout", %M, 68, 0), align 1
-// CHECK: @ka = external addrspace(2) global [10 x target("dx.Layout", %K, 4, 0)], align 1
+// CHECK: @k = external hidden addrspace(2) global target("dx.Layout", %K, 4, 0), align 1
+// CHECK: @l = external hidden addrspace(2) global target("dx.Layout", %L, 8, 0, 4), align 1
+// CHECK: @m = external hidden addrspace(2) global target("dx.Layout", %M, 68, 0), align 1
+// CHECK: @ka = external hidden addrspace(2) global [10 x target("dx.Layout", %K, 4, 0)], align 1
 // CHECK: @CBClasses.str = private unnamed_addr constant [10 x i8] c"CBClasses\00", align 1
 
 struct Test {
@@ -190,16 +190,16 @@ struct Test {
 
 // CHECK: @CBMix.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBMix,
 // CHECK-SAME: 170, 0, 24, 32, 120, 128, 136, 144, 152, 160, 168))
-// CHECK: @test = external addrspace(2) global [2 x target("dx.Layout", %Test, 8, 0, 4)], align 1
-// CHECK: @f1 = external addrspace(2) global float, align 4
-// CHECK: @f2 = external addrspace(2) global [3 x [2 x <2 x float>]], align 8
-// CHECK: @f3 = external addrspace(2) global float, align 4
-// CHECK: @f4 = external addrspace(2) global target("dx.Layout", %anon, 4, 0), align 1
-// CHECK: @f5 = external addrspace(2) global double, align 8
-// CHECK: @f6 = external addrspace(2) global target("dx.Layout", %anon.0, 8, 0), align 1
-// CHECK: @f7 = external addrspace(2) global float, align 4
-// CHECK: @f8 = external addrspace(2) global <1 x double>, align 8
-// CHECK: @f9 = external addrspace(2) global i16, align 2
+// CHECK: @test = external hidden addrspace(2) global [2 x target("dx.Layout", %Test, 8, 0, 4)], align 1
+// CHECK: @f1 = external hidden addrspace(2) global float, align 4
+// CHECK: @f2 = external hidden addrspace(2) global [3 x [2 x <2 x float>]], align 8
+// CHECK: @f3 = external hidden addrspace(2) global float, align 4
+// CHECK: @f4 = external hidden addrspace(2) global target("dx.Layout", %anon, 4, 0), align 1
+// CHECK: @f5 = external hidden addrspace(2) global double, align 8
+// CHECK: @f6 = external hidden addrspace(2) global target("dx.Layout", %anon.0, 8, 0), align 1
+// CHECK: @f7 = external hidden addrspace(2) global float, align 4
+// CHECK: @f8 = external hidden addrspace(2) global <1 x double>, align 8
+// CHECK: @f9 = external hidden addrspace(2) global i16, align 2
 // CHECK: @CBMix.str = private unnamed_addr constant [6 x i8] c"CBMix\00", align 1
 
 cbuffer CBMix {
diff --git a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl
index 4a7e2597dc0ff..33f480bf445e5 100644
--- a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl
+++ b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl
@@ -8,14 +8,14 @@
 // CHECK: %"n0::Foo" = type <{ float }>
 
 // CHECK: @A.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::n1::__cblayout_A", 4, 0))
-// CHECK: @_ZN2n02n11aE = external addrspace(2) global float, align 4
+// CHECK: @_ZN2n02n11aE = external hidden addrspace(2) global float, align 4
 
 // CHECK: @B.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::__cblayout_B", 4, 0))
-// CHECK: @_ZN2n01aE = external addrspace(2) global float, align 4
+// CHECK: @_ZN2n01aE = external hidden addrspace(2) global float, align 4
 
 // CHECK: @C.cb = global target("dx.CBuffer", target("dx.Layout", %"n0::n2::__cblayout_C", 20, 0, 16))
-// CHECK: @_ZN2n02n21aE = external addrspace(2) global float, align 4
-// CHECK: external addrspace(2) global target("dx.Layout", %"n0::Foo", 4, 0), align 1
+// CHECK: @_ZN2n02n21aE = external hidden addrspace(2) global float, align 4
+// CHECK: external hidden addrspace(2) global target("dx.Layout", %"n0::Foo", 4, 0), align 1
 
 namespace n0 {
   struct Foo {
diff --git a/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl
index 0d092f0c36c29..16d22a5b1fdd4 100644
--- a/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl
+++ b/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl
@@ -6,9 +6,9 @@
 // CHECK: %__cblayout_CB_1 = type <{ float, <2 x float> }>
 
 // CHECK: @CB.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 176, 16, 168, 88))
-// CHECK: @a = external addrspace(2) global float, align 4
-// CHECK: @b = external addrspace(2) global double, align 8
-// CHECK: @c = external addrspace(2) global <2 x i32>, align 8
+// CHECK: @a = external hidden addrspace(2) global float, align 4
+// CHECK: @b = external hidden addrspace(2) global double, align 8
+// CHECK: @c = external hidden addrspace(2) global <2 x i32>, align 8
 // CHECK: @CB.str = private unnamed_addr constant [3 x i8] c"CB\00", align 1
 
 cbuffer CB : register(b1, space3) {
@@ -18,8 +18,8 @@ cbuffer CB : register(b1, space3) {
 }
 
 // CHECK: @CB.cb.1 = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CB_1, 92, 88, 80))
-// CHECK: @x = external addrspace(2) global float, align 4
-// CHECK: @y = external addrspace(2) global <2 x float>, align 8
+// CHECK: @x = external hidden addrspace(2) global float, align 4
+// CHECK: @y = external hidden addrspace(2) global <2 x float>, align 8
 
 // Missing packoffset annotation will produce a warning.
 // Element x will be placed after the element y that has an explicit packoffset.
diff --git a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl
index a6034386ac450..cda231d8d2ebb 100644
--- a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl
+++ b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl
@@ -3,7 +3,7 @@
 // CHECK: %__cblayout_A = type <{ float }>
 
 // CHECK: @A.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_A, 4, 0))
-// CHECK: @a = external addrspace(2) global float, align 4
+// CHECK: @a = external hidden addrspace(2) global float, align 4
 // CHECK-DAG: @_ZL1b = internal global float 3.000000e+00, align 4
 // CHECK-NOT: @B.cb
 
diff --git a/clang/test/CodeGenHLSL/convergence/do.while.hlsl b/clang/test/CodeGenHLSL/convergence/do.while.hlsl
index 934fe3ea9eb7a..9aabbfd54e539 100644
--- a/clang/test/CodeGenHLSL/convergence/do.while.hlsl
+++ b/clang/test/CodeGenHLSL/convergence/do.while.hlsl
@@ -8,7 +8,7 @@ void test1() {
   do {
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test1v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test1v()
 // CHECK-SAME: [[A0:#[0-9]+]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -22,7 +22,7 @@ void test2() {
     foo();
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test2v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test2v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -38,7 +38,7 @@ void test3() {
       foo();
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test3v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test3v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -57,7 +57,7 @@ void test4() {
     }
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test4v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test4v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -78,7 +78,7 @@ void test5() {
     }
   } while (cond());
 }
-// CHECK-LABEL: define spir_func void @_Z5test5v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test5v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/convergence/for.hlsl b/clang/test/CodeGenHLSL/convergence/for.hlsl
index 363c6a48839b5..b7b11e9959ea8 100644
--- a/clang/test/CodeGenHLSL/convergence/for.hlsl
+++ b/clang/test/CodeGenHLSL/convergence/for.hlsl
@@ -10,7 +10,7 @@ void test1() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test1v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test1v()
 // CHECK-SAME: [[A0:#[0-9]+]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -23,7 +23,7 @@ void test2() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test2v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test2v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -38,7 +38,7 @@ void test3() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test3v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test3v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -52,7 +52,7 @@ void test4() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test4v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test4v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -67,7 +67,7 @@ void test5() {
   for (cond();cond2();foo()) {
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test5v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test5v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -86,7 +86,7 @@ void test6() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test6v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test6v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -112,7 +112,7 @@ void test7() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test7v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test7v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/convergence/while.hlsl b/clang/test/CodeGenHLSL/convergence/while.hlsl
index 570b4b1336717..32579e8631001 100644
--- a/clang/test/CodeGenHLSL/convergence/while.hlsl
+++ b/clang/test/CodeGenHLSL/convergence/while.hlsl
@@ -8,7 +8,7 @@ void test1() {
   while (cond()) {
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test1v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test1v()
 // CHECK-SAME: [[A0:#[0-9]+]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -21,7 +21,7 @@ void test2() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test2v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test2v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -38,7 +38,7 @@ void test3() {
     foo();
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test3v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test3v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -59,7 +59,7 @@ void test4() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test4v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test4v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -82,7 +82,7 @@ void test5() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test5v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test5v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
@@ -107,7 +107,7 @@ void test6() {
     }
   }
 }
-// CHECK-LABEL: define spir_func void @_Z5test6v()
+// CHECK-LABEL: define hidden spir_func void @_Z5test6v()
 // CHECK-SAME: [[A0]] {
 // CHECK: entry:
 // CHECK:   [[T0:%[0-9]+]] = call token @llvm.experimental.convergence.entry()
diff --git a/clang/test/CodeGenHLSL/default_cbuffer.hlsl b/clang/test/CodeGenHLSL/default_cbuffer.hlsl
index 557913042e884..ad4d92f8afc02 100644
--- a/clang/test/CodeGenHLSL/default_cbuffer.hlsl
+++ b/clang/test/CodeGenHLSL/default_cbuffer.hlsl
@@ -6,14 +6,14 @@
 // CHECK: %__cblayout_S = type <{ float }>
 
 // DXIL-DAG: @"$Globals.cb" = global target("dx.CBuffer", target("dx.Layout", %"__cblayout_$Globals", 20, 0, 4, 16))
-// DXIL-DAG: @a = external addrspace(2) global float
-// DXIL-DAG: @g = external addrspace(2) global float
-// DXIL-DAG: @h = external addrspace(2) global target("dx.Layout", %__cblayout_S, 4, 0), align 4
+// DXIL-DAG: @a = external hidden addrspace(2) global float
+// DXIL-DAG: @g = external hidden addrspace(2) global float
+// DXIL-DAG: @h = external hidden addrspace(2) global target("dx.Layout", %__cblayout_S, 4, 0), align 4
 
 // SPIRV-DAG: @"$Globals.cb" = global target("spirv.VulkanBuffer", target("spirv.Layout", %"__cblayout_$Globals", 20, 0, 4, 16), 2, 0)
-// SPIRV-DAG: @a = external addrspace(12) global float
-// SPIRV-DAG: @g = external addrspace(12) global float
-// SPIRV-DAG: @h = external addrspace(12) global target("spirv.Layout", %__cblayout_S, 4, 0), align 8
+// SPIRV-DAG: @a = external hidden addrspace(12) global float
+// SPIRV-DAG: @g = external hidden addrspace(12) global float
+// SPIRV-DAG: @h = external hidden addrspace(12) global target("spirv.Layout", %__cblayout_S, 4, 0), align 8
 
 struct EmptyStruct {
 };
diff --git a/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl b/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl
index 40e3196649a50..1b2cb0e99aa83 100644
--- a/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl
+++ b/clang/test/CodeGenHLSL/default_cbuffer_with_layout.hlsl
@@ -4,14 +4,14 @@
 // CHECK-SAME: target("dx.Layout", %S, 8, 0) }>
 // CHECK: %S = type <{ <2 x float> }>
 
-// CHECK-DAG: @b = external addrspace(2) global float, align 4
-// CHECK-DAG: @d = external addrspace(2) global <4 x i32>, align 16
+// CHECK-DAG: @b = external hidden addrspace(2) global float, align 4
+// CHECK-DAG: @d = external hidden addrspace(2) global <4 x i32>, align 16
 // CHECK-DAG: @"$Globals.cb" = global target("dx.CBuffer",
 // CHECK-DAG-SAME: target("dx.Layout", %"__cblayout_$Globals", 144, 120, 16, 32, 64, 128, 112))
-// CHECK-DAG: @a = external addrspace(2) global i32, align 4
-// CHECK-DAG: @c = external addrspace(2) global [4 x double], align 8
-// CHECK-DAG: @e = external addrspace(2) global <4 x float>, align 16
-// CHECK-DAG: @s = external addrspace(2) global target("dx.Layout", %S, 8, 0), align 1
+// CHECK-DAG: @a = external hidden addrspace(2) global i32, align 4
+// CHECK-DAG: @c = external hidden addrspace(2) global [4 x double], align 8
+// CHECK-DAG: @e = external hidden addrspace(2) global <4 x float>, align 16
+// CHECK-DAG: @s = external hidden addrspace(2) global target("dx.Layout", %S, 8, 0), align 1
 
 struct S {
   float2 v;
diff --git a/clang/test/CodeGenHLSL/export.hlsl b/clang/test/CodeGenHLSL/export.hlsl
index 770618ff2e070..e72dbde5188a9 100644
--- a/clang/test/CodeGenHLSL/export.hlsl
+++ b/clang/test/CodeGenHLSL/export.hlsl
@@ -5,17 +5,15 @@
 export void f1() {
 }
 
-// CHECK: define void @_ZN11MyNamespace2f2Ev() [[Attr]]
+// CHECK: define void @_ZN11MyNamespace2f2Ev()
 namespace MyNamespace {
   export void f2() {
   }
 }
 
 export {
-// CHECK: define void @_Z2f3v() [[Attr]]
-// CHECK: define void @_Z2f4v() [[Attr]]
+// CHECK: define void @_Z2f3v()
+// CHECK: define void @_Z2f4v()
     void f3() {}
     void f4() {}
-}
-
-// CHECK: attributes [[Attr]] = { {{.*}} "hlsl.export" {{.*}} }
+}
\ No newline at end of file
diff --git a/clang/test/CodeGenHLSL/group_shared.hlsl b/clang/test/CodeGenHLSL/group_shared.hlsl
index a562e75b34881..6498c53752d4e 100644
--- a/clang/test/CodeGenHLSL/group_shared.hlsl
+++ b/clang/test/CodeGenHLSL/group_shared.hlsl
@@ -8,7 +8,7 @@
 // RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
 
 // Make sure groupshared translated into address space 3.
-// CHECK:@a = addrspace(3) global [10 x float]
+// CHECK:@a = hidden addrspace(3) global [10 x float]
 
  groupshared float a[10];
 
diff --git a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
index 12d3eeedb590f..60238cbf8eff5 100644
--- a/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
+++ b/clang/test/CodeGenHLSL/implicit-norecurse-attrib.hlsl
@@ -12,7 +12,7 @@ struct Node {
 };
 
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define noundef i32 @_Z4FindA100_4Nodej(ptr noundef byval([100 x %struct.Node]) align 1 %SortedTree, i32 noundef %key) [[IntAttr:\#[0-9]+]]
+// CHECK: define hidden noundef i32 @_Z4FindA100_4Nodej(ptr noundef byval([100 x %struct.Node]) align 1 %SortedTree, i32 noundef %key) [[Attr:\#[0-9]+]]
 // CHECK: ret i32
 // Find and return value corresponding to key in the SortedTree
 uint Find(Node SortedTree[MAX], uint key) {
@@ -31,7 +31,7 @@ uint Find(Node SortedTree[MAX], uint key) {
 }
 
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define noundef i1 @_Z8InitTreeA100_4NodeN4hlsl8RWBufferIDv4_jEEj(ptr noundef byval([100 x %struct.Node]) align 1 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[ExtAttr:\#[0-9]+]]
+// CHECK: define noundef i1 @_Z8InitTreeA100_4NodeN4hlsl8RWBufferIDv4_jEEj(ptr noundef byval([100 x %struct.Node]) align 1 %tree, ptr noundef byval(%"class.hlsl::RWBuffer") align 4 %encodedTree, i32 noundef %maxDepth) [[Attr:\#[0-9]+]]
 // CHECK: ret i1
 // Initialize tree with given buffer
 // Imagine the inout works
@@ -52,7 +52,7 @@ RWBuffer<uint4> gTree;
 
 // Mangled entry points are internal
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]]
+// CHECK: define internal void @_Z4mainj(i32 noundef %GI) [[Attr]]
 // CHECK: ret void
 
 // Canonical entry points are external and shader attributed
@@ -71,7 +71,7 @@ void main(uint GI : SV_GroupIndex) {
 
 // Mangled entry points are internal
 // CHECK: Function Attrs:{{.*}}norecurse
-// CHECK: define internal void @_Z11defaultMainv() [[IntAttr]]
+// CHECK: define internal void @_Z11defaultMainv() [[Attr]]
 // CHECK: ret void
 
 // Canonical entry points are external and shader attributed
@@ -88,6 +88,5 @@ void defaultMain() {
     needle = Find(haystack, needle);
 }
 
-// CHECK: attributes [[IntAttr]] = {{.*}} norecurse
-// CHECK: attributes [[ExtAttr]] = {{.*}} norecurse
+// CHECK: attributes [[Attr]] = {{.*}} norecurse
 // CHECK: attributes [[EntryAttr]] = {{.*}} norecurse
diff --git a/clang/test/CodeGenHLSL/inline-functions.hlsl b/clang/test/CodeGenHLSL/inline-functions.hlsl
index 4748eeee7475f..0c7467e2f972e 100644
--- a/clang/test/CodeGenHLSL/inline-functions.hlsl
+++ b/clang/test/CodeGenHLSL/inline-functions.hlsl
@@ -15,7 +15,7 @@ float nums[MAX];
 
 // Verify that all functions have the alwaysinline attribute
 // NOINLINE: Function Attrs: alwaysinline
-// NOINLINE: define void @_Z4swapA100_jjj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[IntAttr:\#[0-9]+]]
+// NOINLINE: define hidden void @_Z4swapA100_jjj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %ix1, i32 noundef %ix2) [[Attr:\#[0-9]+]]
 // NOINLINE: ret void
 // Swap the values of Buf at indices ix1 and ix2
 void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) {
@@ -25,7 +25,7 @@ void swap(unsigned Buf[MAX], unsigned ix1, unsigned ix2) {
 }
 
 // NOINLINE: Function Attrs: alwaysinline
-// NOINLINE: define void @_Z10BubbleSortA100_jj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[IntAttr]]
+// NOINLINE: define hidden void @_Z10BubbleSortA100_jj(ptr noundef byval([100 x i32]) align 4 %Buf, i32 noundef %size) [[Attr]]
 // NOINLINE: ret void
 // Inefficiently sort Buf in place
 void BubbleSort(unsigned Buf[MAX], unsigned size) {
@@ -43,7 +43,7 @@ void BubbleSort(unsigned Buf[MAX], unsigned size) {
 
 // Note ExtAttr is the inlined export set of attribs
 // CHECK: Function Attrs: alwaysinline
-// CHECK: define noundef i32 @_Z11RemoveDupesA100_jj(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 {{.*}}%Buf, i32 noundef %size) {{[a-z_ ]*}}[[ExtAttr:\#[0-9]+]]
+// CHECK: define noundef i32 @_Z11RemoveDupesA100_jj(ptr {{[a-z_ ]*}}noundef byval([100 x i32]) align 4 {{.*}}%Buf, i32 noundef %size) {{[a-z_ ]*}}[[Attr:\#[0-9]+]]
 // CHECK: ret i32
 // Sort Buf and remove any duplicate values
 // returns the number of values left
@@ -65,9 +65,9 @@ RWBuffer<unsigned> Indices;
 
 // The mangled version of main only remains without inlining
 // because it has internal linkage from the start
-// Note main functions get the norecurse attrib, which IntAttr reflects
+// Note main functions get the alwaysinline attrib, which Attr reflects
 // NOINLINE: Function Attrs: alwaysinline
-// NOINLINE: define internal void @_Z4mainj(i32 noundef %GI) [[IntAttr]]
+// NOINLINE: define internal void @_Z4mainj(i32 noundef %GI) [[Attr]]
 // NOINLINE: ret void
 
 // The unmangled version is not inlined, EntryAttr reflects that
@@ -93,9 +93,9 @@ void main(unsigned int GI : SV_GroupIndex) {
 
 // The mangled version of main only remains without inlining
 // because it has internal linkage from the start
-// Note main functions get the norecurse attrib, which IntAttr reflects
+// Note main functions get the alwaysinline attrib, which Attr reflects
 // NOINLINE: Function Attrs: alwaysinline
-// NOINLINE: define internal void @_Z6main10v() [[IntAttr]]
+// NOINLINE: define internal void @_Z6main10v() [[Attr]]
 // NOINLINE: ret void
 
 // The unmangled version is not inlined, EntryAttr reflects that
@@ -113,6 +113,5 @@ void main10() {
   main(10);
 }
 
-// NOINLINE: attributes [[IntAttr]] = {{.*}} alwaysinline
-// CHECK: attributes [[ExtAttr]] = {{.*}} alwaysinline
+// CHECK: attributes [[Attr]] = {{.*}} alwaysinline
 // CHECK: attributes [[EntryAttr]] = {{.*}} noinline
diff --git a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl b/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
index 5b58e436bbedb..7149be0122f4d 100644
--- a/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
+++ b/clang/test/CodeGenHLSL/inline-spirv/SpirvType.hlsl
@@ -18,12 +18,12 @@ struct S {
     Int i;
 };
 
-// CHECK: define spir_func target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) @_Z14getArrayBufferu17spirv_type_28_0_0U5_TypeN4hlsl8RWBufferIfEEU6_ConstLm4E(target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) %v) #0
+// CHECK: define hidden spir_func target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) @_Z14getArrayBufferu17spirv_type_28_0_0U5_TypeN4hlsl8RWBufferIfEEU6_ConstLm4E(target("spirv.Type", target("spirv.Image", float, 5, 2, 0, 0, 2, 0), target("spirv.IntegralConstant", i64, 4), 28, 0, 0) %v) #0
 ArrayBuffer<4> getArrayBuffer(ArrayBuffer<4> v) {
     return v;
 }
 
-// CHECK: define spir_func target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) @_Z6getIntu18spirv_type_21_4_32U4_LitLi32EU4_LitLi0E(target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) %v) #0
+// CHECK: define hidden spir_func target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) @_Z6getIntu18spirv_type_21_4_32U4_LitLi32EU4_LitLi0E(target("spirv.Type", target("spirv.Literal", 32), target("spirv.Literal", 0), 21, 4, 32) %v) #0
 Int getInt(Int v) {
     return v;
 }
diff --git a/clang/test/CodeGenHLSL/no_int_promotion.hlsl b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
index 78bff3b13810d..b4ffcb477f1ba 100644
--- a/clang/test/CodeGenHLSL/no_int_promotion.hlsl
+++ b/clang/test/CodeGenHLSL/no_int_promotion.hlsl
@@ -10,37 +10,37 @@
 int16_t add(int16_t a, int16_t b) {
   return a + b;
 }
-// CHECK: define noundef <2 x i16> @
+// CHECK: define hidden noundef <2 x i16> @
 // CHECK: add <2 x i16>
 int16_t2 add(int16_t2 a, int16_t2 b) {
   return a + b;
 }
-// CHECK: define noundef <3 x i16> @
+// CHECK: define hidden noundef <3 x i16> @
 // CHECK: add <3 x i16>
 int16_t3 add(int16_t3 a, int16_t3 b) {
   return a + b;
 }
-// CHECK: define noundef <4 x i16> @
+// CHECK: define hidden noundef <4 x i16> @
 // CHECK: add <4 x i16>
 int16_t4 add(int16_t4 a, int16_t4 b) {
   return a + b;
 }
-// CHECK: define noundef i16 @
+// CHECK: define hidden noundef i16 @
 // CHECK: add i16 %
 uint16_t add(uint16_t a, uint16_t b) {
   return a + b;
 }
-// CHECK: define noundef <2 x i16> @
+// CHECK: define hidden noundef <2 x i16> @
 // CHECK: add <2 x i16>
 uint16_t2 add(uint16_t2 a, uint16_t2 b) {
   return a + b;
 }
-// CHECK: define noundef <3 x i16> @
+// CHECK: define hidden noundef <3 x i16> @
 // CHECK: add <3 x i16>
 uint16_t3 add(uint16_t3 a, uint16_t3 b) {
   return a + b;
 }
-// CHECK: define noundef <4 x i16> @
+// CHECK: define hidden noundef <4 x i16> @
 // CHECK: add <4 x i16>
 uint16_t4 add(uint16_t4 a, uint16_t4 b) {
   return a + b;
diff --git a/clang/test/CodeGenHLSL/out-of-line-static.hlsl b/clang/test/CodeGenHLSL/out-of-line-static.hlsl
index 8127a6c2ec1e4..57f6c123e50e5 100644
--- a/clang/test/CodeGenHLSL/out-of-line-static.hlsl
+++ b/clang/test/CodeGenHLSL/out-of-line-static.hlsl
@@ -6,8 +6,8 @@ struct S {
 };
 
 int S::Value = 1;
-// DXIL: @_ZN1S5ValueE = global i32 1, align 4
-// SPIRV: @_ZN1S5ValueE = addrspace(10) global i32 1, align 4
+// DXIL: @_ZN1S5ValueE = hidden global i32 1, align 4
+// SPIRV: @_ZN1S5ValueE = hidden addrspace(10) global i32 1, align 4
 
 [shader("compute")]
 [numthreads(1,1,1)]
diff --git a/clang/test/CodeGenHLSL/shift-mask.hlsl b/clang/test/CodeGenHLSL/shift-mask.hlsl
index 7b3890ae560d2..41e05330ed1a5 100644
--- a/clang/test/CodeGenHLSL/shift-mask.hlsl
+++ b/clang/test/CodeGenHLSL/shift-mask.hlsl
@@ -5,7 +5,7 @@ int shl32(int V, int S) {
   return V << S;
 }
 
-// CHECK-LABEL: define noundef i32 @_Z5shl32ii(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i32 @_Z5shl32ii(i32 noundef %V, i32 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i32 %{{.*}}, 31
 // CHECK-DAG:  %{{.*}} = shl i32 %{{.*}}, %[[Masked]]
 
@@ -13,7 +13,7 @@ int shr32(int V, int S) {
   return V >> S;
 }
 
-// CHECK-LABEL: define noundef i32 @_Z5shr32ii(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i32 @_Z5shr32ii(i32 noundef %V, i32 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i32 %{{.*}}, 31
 // CHECK-DAG:  %{{.*}} = ashr i32 %{{.*}}, %[[Masked]]
 
@@ -21,7 +21,7 @@ int64_t shl64(int64_t V, int64_t S) {
   return V << S;
 }
 
-// CHECK-LABEL: define noundef i64 @_Z5shl64ll(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i64 @_Z5shl64ll(i64 noundef %V, i64 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i64 %{{.*}}, 63
 // CHECK-DAG:  %{{.*}} = shl i64 %{{.*}}, %[[Masked]]
 
@@ -29,7 +29,7 @@ int64_t shr64(int64_t V, int64_t S) {
   return V >> S;
 }
 
-// CHECK-LABEL: define noundef i64 @_Z5shr64ll(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i64 @_Z5shr64ll(i64 noundef %V, i64 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i64 %{{.*}}, 63
 // CHECK-DAG:  %{{.*}} = ashr i64 %{{.*}}, %[[Masked]]
 
@@ -37,7 +37,7 @@ uint shlu32(uint V, uint S) {
   return V << S;
 }
 
-// CHECK-LABEL: define noundef i32 @_Z6shlu32jj(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i32 @_Z6shlu32jj(i32 noundef %V, i32 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i32 %{{.*}}, 31
 // CHECK-DAG:  %{{.*}} = shl i32 %{{.*}}, %[[Masked]]
 
@@ -45,7 +45,7 @@ uint shru32(uint V, uint S) {
   return V >> S;
 }
 
-// CHECK-LABEL: define noundef i32 @_Z6shru32jj(i32 noundef %V, i32 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i32 @_Z6shru32jj(i32 noundef %V, i32 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i32 %{{.*}}, 31
 // CHECK-DAG:  %{{.*}} = lshr i32 %{{.*}}, %[[Masked]]
 
@@ -53,7 +53,7 @@ uint64_t shlu64(uint64_t V, uint64_t S) {
   return V << S;
 }
 
-// CHECK-LABEL: define noundef i64 @_Z6shlu64mm(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i64 @_Z6shlu64mm(i64 noundef %V, i64 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i64 %{{.*}}, 63
 // CHECK-DAG:  %{{.*}} = shl i64 %{{.*}}, %[[Masked]]
 
@@ -61,6 +61,6 @@ uint64_t shru64(uint64_t V, uint64_t S) {
   return V >> S;
 }
 
-// CHECK-LABEL: define noundef i64 @_Z6shru64mm(i64 noundef %V, i64 noundef %S) #0 {
+// CHECK-LABEL: define hidden noundef i64 @_Z6shru64mm(i64 noundef %V, i64 noundef %S) #0 {
 // CHECK-DAG:  %[[Masked:.*]] = and i64 %{{.*}}, 63
 // CHECK-DAG:  %{{.*}} = lshr i64 %{{.*}}, %[[Masked]]
diff --git a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
index a87eb0b38f603..a2df307038774 100644
--- a/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
+++ b/clang/test/CodeGenHLSL/this-assignment-overload.hlsl
@@ -25,7 +25,7 @@ void main() {
 }
 
 // This test makes a probably safe assumption that HLSL 202x includes operator overloading for assignment operators.
-// CHECK:     define linkonce_odr noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 {
+// CHECK:     define linkonce_odr hidden noundef i32 @_ZN4Pair8getFirstEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 {
 // CHECK-NEXT:entry:
 // CHECK-NEXT:%this.addr = alloca ptr, align 4
 // CHECK-NEXT:%Another = alloca %struct.Pair, align 1
@@ -42,7 +42,7 @@ void main() {
 // CHECK-NEXT:%0 = load i32, ptr %First2, align 1
 // CHECK-NEXT:ret i32 %0
 
-// CHECK:     define linkonce_odr noundef i32 @_ZN4Pair9getSecondEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 {
+// CHECK:     define linkonce_odr hidden noundef i32 @_ZN4Pair9getSecondEv(ptr noundef nonnull align 1 dereferenceable(8) %this) #0 align 2 {
 // CHECK-NEXT:entry:
 // CHECK-NEXT:%this.addr = alloca ptr, align 4
 // CHECK-NEXT:%agg.tmp = alloca %struct.Pair, align 1
diff --git a/clang/test/CodeGenHLSL/vk-input-builtin.hlsl b/clang/test/CodeGenHLSL/vk-input-builtin.hlsl
index 1cc7963c0e289..157a1818c82ff 100644
--- a/clang/test/CodeGenHLSL/vk-input-builtin.hlsl
+++ b/clang/test/CodeGenHLSL/vk-input-builtin.hlsl
@@ -3,7 +3,7 @@
 
 [[vk::ext_builtin_input(/* WorkgroupId */ 26)]]
 static const uint3 groupid;
-// CHECK: @_ZL7groupid = external local_unnamed_addr addrspace(7) externally_initialized constant <3 x i32>, align 16, !spirv.Decorations [[META0:![0-9]+]]
+// CHECK: @_ZL7groupid = external hidden local_unnamed_addr addrspace(7) externally_initialized constant <3 x i32>, align 16, !spirv.Decorations [[META0:![0-9]+]]
 
 RWStructuredBuffer<int> output : register(u1, space0);
 
diff --git a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
index aad8836db1062..f37d00503fe57 100644
--- a/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
+++ b/clang/test/Options/enable_16bit_types_validation_spirv.hlsl
@@ -4,7 +4,7 @@
 // SPIRV: error: '-fnative-half-type' option requires target HLSL Version >= 2018, but HLSL Version is 'hlsl2016'
 
 // valid: "spirv-unknown-vulkan-library"
-// valid: define spir_func void @{{.*main.*}}() #0 {
+// valid: define hidden spir_func void @{{.*main.*}}() #0 {
 
 [numthreads(1,1,1)]
 void main()
diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
index 035899205bf8c..94b2dbe78c4f7 100644
--- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
+++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
@@ -24,7 +24,9 @@ static bool finalizeLinkage(Module &M) {
   for (Function &EF : M.functions()) {
     if (EF.isIntrinsic())
       continue;
-    if (EF.hasFnAttribute("hlsl.shader") || EF.hasFnAttribute("hlsl.export"))
+    if (EF.hasExternalLinkage() && EF.hasDefaultVisibility())
+      continue;
+    if (EF.hasFnAttribute("hlsl.shader"))
       continue;
     Funcs.push_back(&EF);
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 36cc5cbe655bc..a412887e51adb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -480,7 +480,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                    .addUse(FuncVReg);
     addStringImm(F.getName(), MIB);
   } else if (F.getLinkage() != GlobalValue::InternalLinkage &&
-             F.getLinkage() != GlobalValue::PrivateLinkage) {
+             F.getLinkage() != GlobalValue::PrivateLinkage &&
+             F.getVisibility() != GlobalValue::HiddenVisibility) {
     SPIRV::LinkageType::LinkageType LnkTy =
         F.isDeclaration()
             ? SPIRV::LinkageType::Import
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 911a6966aaef0..851e0c6b81fcf 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3898,7 +3898,8 @@ bool SPIRVInstructionSelector::selectGlobalValue(
   if (hasInitializer(GlobalVar) && !Init)
     return true;
 
-  bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage();
+  bool HasLnkTy = !GV->hasInternalLinkage() && !GV->hasPrivateLinkage() &&
+                  !GV->hasHiddenVisibility();
   SPIRV::LinkageType::LinkageType LnkType =
       GV->isDeclarationForLinker()
           ? SPIRV::LinkageType::Import
diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll
index 202609c8156a7..78045ddcd85aa 100644
--- a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll
+++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll
@@ -5,25 +5,25 @@ target triple = "dxilv1.5-pc-shadermodel6.5-compute"
 
 ; Confirm that DXILFinalizeLinkage will remove functions that have compatible
 ; linkage and are not called from anywhere. This should be any function that
-; is not explicitly marked export and is not an entry point.
+; is marked hidden or internal.
 
-; Has no specified inlining/linking behavior and is uncalled, this should be removed.
+; Is hidden, and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doNothingUncalled
-define void @"?doNothingUncalled@@YAXXZ"() #2 {
+define hidden void @"?doNothingUncalled@@YAXXZ"() #2 {
 entry:
   ret void
 }
 
-; Alwaysinline and uncalled, this should be removed.
+; Alwaysinline, hidden and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled
-define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 {
+define hidden void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline and uncalled, this should be removed.
+; Noinline, hidden and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doNoinlineUncalled
-define void @"?doNoinlineUncalled@@YAXXZ"() #4 {
+define hidden void @"?doNoinlineUncalled@@YAXXZ"() #4 {
 entry:
   ret void
 }
@@ -49,44 +49,44 @@ entry:
   ret void
 }
 
-; Marked external and uncalled, this should become internal and be removed.
+; Marked external, hidden and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doExternalUncalled
-define external void @"?doExternalUncalled@@YAXXZ"() #2 {
+define external hidden void @"?doExternalUncalled@@YAXXZ"() #2 {
 entry:
   ret void
 }
 
-; Alwaysinline, external and uncalled, this should become internal and be removed.
+; Alwaysinline, external, hidden and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled
-define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 {
+define external hidden void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline, external and uncalled, this should become internal and be removed.
+; Noinline, external, hidden and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled
-define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #4 {
+define external hidden void @"?doNoinlineExternalUncalled@@YAXXZ"() #4 {
 entry:
   ret void
 }
 
-; No inlining attribute and called, this should stay.
+; No inlining attribute, hidden and called, this should stay.
 ; CHECK: define {{.*}}doNothingCalled
-define void @"?doNothingCalled@@YAXXZ"() #2 {
+define hidden void @"?doNothingCalled@@YAXXZ"() #2 {
 entry:
   ret void
 }
 
-; Alwaysinline and called, this should stay.
+; Alwaysinline, hidden and called, this should stay.
 ; CHECK: define {{.*}}doAlwaysInlineCalled
-define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 {
+define hidden void @"?doAlwaysInlineCalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline and called, this should stay.
+; Noinline, hidden and called, this should stay.
 ; CHECK: define {{.*}}doNoinlineCalled
-define void @"?doNoinlineCalled@@YAXXZ"() #4 {
+define hidden void @"?doNoinlineCalled@@YAXXZ"() #4 {
 entry:
   ret void
 }
@@ -112,23 +112,23 @@ entry:
   ret void
 }
 
-; Marked external and called, this should become internal and stay.
+; Marked external, hidden and called, this should become internal and stay.
 ; CHECK: define {{.*}}doExternalCalled
-define external void @"?doExternalCalled@@YAXXZ"() #2 {
+define external hidden void @"?doExternalCalled@@YAXXZ"() #2 {
 entry:
   ret void
 }
 
-; Always inlined, external and called, this should become internal and stay.
+; Always inlined, external, hidden and called, this should become internal and stay.
 ; CHECK: define {{.*}}doAlwaysInlineExternalCalled
-define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 {
+define external hidden void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline, external and called, this should become internal and stay.
+; Noinline, external, hidden and called, this should become internal and stay.
 ; CHECK: define {{.*}}doNoinlineExternalCalled
-define external void @"?doNoinlineExternalCalled@@YAXXZ"() #4 {
+define external hidden void @"?doNoinlineExternalCalled@@YAXXZ"() #4 {
 entry:
   ret void
 }
@@ -154,27 +154,6 @@ entry:
   ret void
 }
 
-; No inlining attribute, internal, and exported; this should stay.
-; CHECK: define {{.*}}doInternalExported
-define internal void @"?doInternalExported@@YAXXZ"() #3 {
-entry:
-  ret void
-}
-
-; Alwaysinline, internal, and exported; this should stay.
-; CHECK: define {{.*}}doAlwaysInlineInternalExported
-define internal void @"?doAlwaysInlineInternalExported@@YAXXZ"() #1 {
-entry:
-  ret void
-}
-
-; Noinline, internal, and exported; this should stay.
-; CHECK: define {{.*}}doNoinlineInternalExported
-define internal void @"?doNoinlineInternalExported@@YAXXZ"() #5 {
-entry:
-  ret void
-}
-
 ; Marked external and exported, this should stay.
 ; CHECK: define {{.*}}doExternalExported
 define external void @"?doExternalExported@@YAXXZ"() #3 {
@@ -213,10 +192,10 @@ entry:
 }
 
 attributes #0 = { alwaysinline convergent norecurse nounwind }
-attributes #1 = { alwaysinline convergent norecurse nounwind "hlsl.export"}
+attributes #1 = { alwaysinline convergent norecurse nounwind }
 attributes #2 = { convergent norecurse nounwind }
-attributes #3 = { convergent norecurse nounwind "hlsl.export"}
+attributes #3 = { convergent norecurse nounwind }
 attributes #4 = { convergent noinline norecurse nounwind }
-attributes #5 = { convergent noinline norecurse nounwind "hlsl.export"}
+attributes #5 = { convergent noinline norecurse nounwind }
 attributes #6 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
 attributes #7 = { convergent }
diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll
index 49c3bda621d74..971451f981c99 100644
--- a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll
+++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll
@@ -7,23 +7,23 @@ target triple = "dxilv1.5-pc-shadermodel6.5-compute"
 ; linkage and are not called from anywhere. This should be any function that
 ; is not an entry point.
 
-; Has no specified inlining/linking behavior and is uncalled, this should be removed.
+; Is hidden and is uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doNothingUncalled
-define void @"?doNothingUncalled@@YAXXZ"() #1 {
+define hidden void @"?doNothingUncalled@@YAXXZ"() #1 {
 entry:
   ret void
 }
 
-; Alwaysinline and uncalled, this should be removed.
+; Alwaysinline, hidden and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled
-define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 {
+define hidden void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline and uncalled, this should be removed.
+; Noinline, hidden and uncalled, this should be removed.
 ; CHECK-NOT: define {{.*}}doNoinlineUncalled
-define void @"?doNoinlineUncalled@@YAXXZ"() #3 {
+define hidden void @"?doNoinlineUncalled@@YAXXZ"() #3 {
 entry:
   ret void
 }
@@ -49,44 +49,44 @@ entry:
   ret void
 }
 
-; Marked external and uncalled, this should become internal and be removed.
+; Marked external, hidden, and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doExternalUncalled
-define external void @"?doExternalUncalled@@YAXXZ"() #1 {
+define external hidden void @"?doExternalUncalled@@YAXXZ"() #1 {
 entry:
   ret void
 }
 
-; Alwaysinline, external and uncalled, this should become internal and be removed.
+; Alwaysinline, external, hidden, and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled
-define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 {
+define external hidden void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline, external and uncalled, this should become internal and be removed.
+; Noinline, external, hidden, and uncalled, this should become internal and be removed.
 ; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled
-define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #3 {
+define external hidden void @"?doNoinlineExternalUncalled@@YAXXZ"() #3 {
 entry:
   ret void
 }
 
 ; No inlining attribute and called, this should stay.
 ; CHECK: define {{.*}}doNothingCalled
-define void @"?doNothingCalled@@YAXXZ"() #1 {
+define hidden void @"?doNothingCalled@@YAXXZ"() #1 {
 entry:
   ret void
 }
 
-; Alwaysinline and called, this should stay.
+; Alwaysinline, hidden, and called, this should stay.
 ; CHECK: define {{.*}}doAlwaysInlineCalled
-define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 {
+define hidden void @"?doAlwaysInlineCalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline and called, this should stay.
+; Noinline, hidden, and called, this should stay.
 ; CHECK: define {{.*}}doNoinlineCalled
-define void @"?doNoinlineCalled@@YAXXZ"() #3 {
+define hidden void @"?doNoinlineCalled@@YAXXZ"() #3 {
 entry:
   ret void
 }
@@ -112,23 +112,23 @@ entry:
   ret void
 }
 
-; Marked external and called, this should become internal and stay.
+; Marked external, hidden, and called, this should become internal and stay.
 ; CHECK: define {{.*}}doExternalCalled
-define external void @"?doExternalCalled@@YAXXZ"() #1 {
+define external hidden void @"?doExternalCalled@@YAXXZ"() #1 {
 entry:
   ret void
 }
 
-; Always inlined, external and called, this should become internal and stay.
+; Always inlined, external, hidden, and called, this should become internal and stay.
 ; CHECK: define {{.*}}doAlwaysInlineExternalCalled
-define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 {
+define external hidden void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
-; Noinline, external and called, this should become internal and stay.
+; Noinline, external, hidden, and called, this should become internal and stay.
 ; CHECK: define {{.*}}doNoinlineExternalCalled
-define external void @"?doNoinlineExternalCalled@@YAXXZ"() #3 {
+define external hidden void @"?doNoinlineExternalCalled@@YAXXZ"() #3 {
 entry:
   ret void
 }
diff --git a/llvm/test/CodeGen/DirectX/finalize_linkage.ll b/llvm/test/CodeGen/DirectX/finalize_linkage.ll
index c761a79a5c28a..df691db5cff36 100644
--- a/llvm/test/CodeGen/DirectX/finalize_linkage.ll
+++ b/llvm/test/CodeGen/DirectX/finalize_linkage.ll
@@ -3,8 +3,8 @@
 
 target triple = "dxilv1.5-pc-shadermodel6.5-compute"
 
-; DXILFinalizeLinkage changes linkage of all functions that are not
-; entry points or exported function to internal.
+; DXILFinalizeLinkage changes linkage of all functions that are hidden to
+; internal.
 
 ; CHECK-NOT: define internal void @"?f1@@YAXXZ"()
 define void @"?f1@@YAXXZ"() #0 {
@@ -13,19 +13,19 @@ entry:
 }
 
 ; CHECK: define internal void @"?f2@@YAXXZ"()
-define void @"?f2@@YAXXZ"() #0 {
+define hidden void @"?f2@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
 ; CHECK: define internal void @"?f3@@YAXXZ"()
-define void @"?f3@@YAXXZ"() #0 {
+define hidden void @"?f3@@YAXXZ"() #0 {
 entry:
   ret void
 }
 
 ; CHECK: define internal void @"?foo@@YAXXZ"()
-define void @"?foo@@YAXXZ"() #0 {
+define hidden void @"?foo@@YAXXZ"() #0 {
 entry:
   call void @"?f2@@YAXXZ"() #3
   ret void
@@ -33,7 +33,7 @@ entry:
 
 ; Exported function - do not change linkage
 ; CHECK: define void @"?bar@@YAXXZ"()
-define void @"?bar@@YAXXZ"() #1 {
+define void @"?bar@@YAXXZ"() #0 {
 entry:
   call void @"?f3@@YAXXZ"() #3
   ret void
@@ -42,23 +42,22 @@ entry:
 ; CHECK: define internal void @"?main@@YAXXZ"() #0
 define internal void @"?main@@YAXXZ"() #0 {
 entry:
-  call void @"?foo@@YAXXZ"() #3
-  call void @"?bar@@YAXXZ"() #3
+  call void @"?foo@@YAXXZ"() #2
+  call void @"?bar@@YAXXZ"() #2
   ret void
 }
 
 ; Entry point function - do not change linkage
-; CHECK: define void @main() #2
-define void @main() #2 {
+; CHECK: define void @main() #1
+define void @main() #1 {
 entry:
   call void @"?main@@YAXXZ"()
   ret void
 }
 
 attributes #0 = { convergent noinline nounwind optnone}
-attributes #1 = { convergent noinline nounwind optnone "hlsl.export"}
-attributes #2 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"}
-attributes #3 = { convergent }
+attributes #1 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"}
+attributes #2 = { convergent }
 
 ; Make sure "hlsl.export" attribute is stripped by llc
 ; CHECK-LLC-NOT: "hlsl.export"

From 60a59e350bfa909d3caf5b5b0dba8b473746ea0f Mon Sep 17 00:00:00 2001
From: Yuta Saito <kateinoigakukun@gmail.com>
Date: Tue, 17 Jun 2025 06:23:50 +0900
Subject: [PATCH 644/851] [ASan] Recognize WASI platform in
 sanitizer_platform.h (#139017)

---
 compiler-rt/lib/sanitizer_common/sanitizer_platform.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
index 9f5f41cd85514..4c8d9a9b86bed 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
@@ -14,7 +14,8 @@
 
 #if !defined(__linux__) && !defined(__FreeBSD__) && !defined(__NetBSD__) && \
     !defined(__APPLE__) && !defined(_WIN32) && !defined(__Fuchsia__) &&     \
-    !(defined(__sun__) && defined(__svr4__)) && !defined(__HAIKU__)
+    !(defined(__sun__) && defined(__svr4__)) && !defined(__HAIKU__) &&      \
+    !defined(__wasi__)
 #  error "This operating system is not supported"
 #endif
 
@@ -61,6 +62,12 @@
 #  define SANITIZER_HAIKU 0
 #endif
 
+#if defined(__wasi__)
+#  define SANITIZER_WASI 1
+#else
+#  define SANITIZER_WASI 0
+#endif
+
 // - SANITIZER_APPLE: all Apple code
 //   - TARGET_OS_OSX: macOS
 //   - SANITIZER_IOS: devices (iOS and iOS-like)

From 38daa6d4ef1f3386cc50198199c5ec61dcb012af Mon Sep 17 00:00:00 2001
From: Uzair Nawaz <uzairnawaz@google.com>
Date: Mon, 16 Jun 2025 21:28:51 +0000
Subject: [PATCH 645/851] [libc] build fix: always use our char8_t headers even
 in overlay mode (#144433)

Build fix caused by certain platforms not providing char8_t when
expected
Temporary fix to just always use our own definition, even in overlay
mode.
---
 libc/hdr/types/char8_t.h                         | 8 --------
 libc/src/__support/wchar/character_converter.cpp | 2 +-
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/libc/hdr/types/char8_t.h b/libc/hdr/types/char8_t.h
index 31de764658f9e..4d71e3dd89098 100644
--- a/libc/hdr/types/char8_t.h
+++ b/libc/hdr/types/char8_t.h
@@ -9,14 +9,6 @@
 #ifndef LLVM_LIBC_HDR_TYPES_CHAR8_T_H
 #define LLVM_LIBC_HDR_TYPES_CHAR8_T_H
 
-#ifdef LIBC_FULL_BUILD
-
 #include "include/llvm-libc-types/char8_t.h"
 
-#else // overlay mode
-
-#include "hdr/uchar_overlay.h"
-
-#endif // LLVM_LIBC_FULL_BUILD
-
 #endif // LLVM_LIBC_HDR_TYPES_CHAR8_T_H
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index bac2f6d827e13..ca709769616c3 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -70,7 +70,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   char32_t output;
 
   // Shift to get the next 6 bits from the utf32 encoding
-  const char32_t shift_amount =
+  const size_t shift_amount =
       (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
   if (state->bytes_processed == 0) {
     /*

From 95418bc8a8fd765d5e60e0c8ac7f8b77d2c15ef2 Mon Sep 17 00:00:00 2001
From: Justin King <jcking@wulver.com>
Date: Mon, 16 Jun 2025 14:29:08 -0700
Subject: [PATCH 646/851] lsan: Support free_sized and free_aligned_sized from
 C23 (#144415)

Adds support to LSan for `free_sized` and `free_aligned_sized` from C23.

Other sanitizers will be handled with their own separate PRs.

For #144435

Signed-off-by: Justin King <jcking@google.com>
---
 compiler-rt/lib/lsan/lsan_allocator.cpp       |  4 ++++
 compiler-rt/lib/lsan/lsan_allocator.h         |  2 ++
 compiler-rt/lib/lsan/lsan_interceptors.cpp    | 18 +++++++++++++++
 compiler-rt/lib/lsan/lsan_malloc_mac.cpp      | 23 +++++++++++--------
 .../sanitizer_common/sanitizer_malloc_mac.inc | 15 ++++++++++++
 5 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/compiler-rt/lib/lsan/lsan_allocator.cpp b/compiler-rt/lib/lsan/lsan_allocator.cpp
index 493bf5f9efc57..a436d9c07ac6c 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.cpp
+++ b/compiler-rt/lib/lsan/lsan_allocator.cpp
@@ -220,6 +220,10 @@ void lsan_free(void *p) {
   Deallocate(p);
 }
 
+void lsan_free_sized(void *p, uptr) { Deallocate(p); }
+
+void lsan_free_aligned_sized(void *p, uptr, uptr) { Deallocate(p); }
+
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack) {
   return SetErrnoOnNull(Reallocate(stack, p, size, 1));
 }
diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h
index 5eed0cbdb309b..2342f11fb5d0d 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.h
+++ b/compiler-rt/lib/lsan/lsan_allocator.h
@@ -127,6 +127,8 @@ void *lsan_aligned_alloc(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_memalign(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_malloc(uptr size, const StackTrace &stack);
 void lsan_free(void *p);
+void lsan_free_sized(void *p, uptr size);
+void lsan_free_aligned_sized(void *p, uptr alignment, uptr size);
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack);
 void *lsan_reallocarray(void *p, uptr nmemb, uptr size,
                         const StackTrace &stack);
diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp
index a8252cddacf25..8e33130840e92 100644
--- a/compiler-rt/lib/lsan/lsan_interceptors.cpp
+++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp
@@ -84,6 +84,24 @@ INTERCEPTOR(void, free, void *p) {
   lsan_free(p);
 }
 
+INTERCEPTOR(void, free_sized, void *p, uptr size) {
+  if (UNLIKELY(!p))
+    return;
+  if (DlsymAlloc::PointerIsMine(p))
+    return DlsymAlloc::Free(p);
+  ENSURE_LSAN_INITED;
+  lsan_free_sized(p, size);
+}
+
+INTERCEPTOR(void, free_aligned_sized, void *p, uptr alignment, uptr size) {
+  if (UNLIKELY(!p))
+    return;
+  if (DlsymAlloc::PointerIsMine(p))
+    return DlsymAlloc::Free(p);
+  ENSURE_LSAN_INITED;
+  lsan_free_aligned_sized(p, alignment, size);
+}
+
 INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
   if (DlsymAlloc::Use())
     return DlsymAlloc::Callocate(nmemb, size);
diff --git a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
index 525c30272ccca..8a16c053da238 100644
--- a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
+++ b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
@@ -44,16 +44,19 @@ using namespace __lsan;
   void *p = lsan_valloc(size, stack)
 #define COMMON_MALLOC_FREE(ptr) \
   lsan_free(ptr)
-#define COMMON_MALLOC_SIZE(ptr) \
-  uptr size = lsan_mz_size(ptr)
-#define COMMON_MALLOC_FILL_STATS(zone, stats)
-#define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name) \
-  (void)zone_name; \
-  Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", ptr);
-#define COMMON_MALLOC_NAMESPACE __lsan
-#define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
-#define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
+#  define COMMON_MALLOC_FREE_SIZED(ptr, size) lsan_free_sized(ptr, size)
+#  define COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size) \
+    lsan_free_aligned_sized(ptr, alignment, size)
+#  define COMMON_MALLOC_SIZE(ptr) uptr size = lsan_mz_size(ptr)
+#  define COMMON_MALLOC_FILL_STATS(zone, stats)
+#  define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name)    \
+    (void)zone_name;                                                        \
+    Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", \
+           ptr);
+#  define COMMON_MALLOC_NAMESPACE __lsan
+#  define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
+#  define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
 
-#include "sanitizer_common/sanitizer_malloc_mac.inc"
+#  include "sanitizer_common/sanitizer_malloc_mac.inc"
 
 #endif // SANITIZER_APPLE
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
index 6343eb284afbf..72ad22999b5a4 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
@@ -144,6 +144,21 @@ INTERCEPTOR(void, free, void *ptr) {
   COMMON_MALLOC_FREE(ptr);
 }
 
+#ifdef COMMON_MALLOC_FREE_SIZED
+INTERCEPTOR(void, free_sized, void *ptr, size_t size) {
+  COMMON_MALLOC_ENTER();
+  COMMON_MALLOC_FREE_SIZED(ptr, size);
+}
+#endif
+
+#ifdef COMMON_MALLOC_FREE_ALIGNED_SIZED
+INTERCEPTOR(void, free_aligned_sized, void *ptr, size_t alignment,
+            size_t size) {
+  COMMON_MALLOC_ENTER();
+  COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size);
+}
+#endif
+
 INTERCEPTOR(void *, realloc, void *ptr, size_t size) {
   COMMON_MALLOC_ENTER();
   COMMON_MALLOC_REALLOC(ptr, size);

From 9c25ca78f9bdfe74e5dbaa60a864411bdbae4943 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Mon, 16 Jun 2025 14:36:13 -0700
Subject: [PATCH 647/851] [flang] Don't generate module file for hermetic USE'd
 dependency (#144143)

It's possible for the module file generation code to think that it needs
to (re)generate a module file for a dependent module read from a
hermetic module file, if it defines contains a procedure imported via
renaming due to a name clash. Adjust the logic that determines whether a
module file should be written to include a check for having originated
in a module file.
---
 flang/lib/Semantics/mod-file.cpp   | 44 ++++++++++++++++--------------
 flang/test/Semantics/modfile79.F90 | 33 ++++++++++++++++++++++
 2 files changed, 56 insertions(+), 21 deletions(-)
 create mode 100644 flang/test/Semantics/modfile79.F90

diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index 9f9e9f5840456..82c8536902eb2 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -109,15 +109,14 @@ bool ModFileWriter::WriteAll() {
 }
 
 void ModFileWriter::WriteAll(const Scope &scope) {
-  for (const auto &child : scope.children()) {
+  for (const Scope &child : scope.children()) {
     WriteOne(child);
   }
 }
 
 void ModFileWriter::WriteOne(const Scope &scope) {
   if (scope.kind() == Scope::Kind::Module) {
-    auto *symbol{scope.symbol()};
-    if (!symbol->test(Symbol::Flag::ModFile)) {
+    if (const auto *symbol{scope.symbol()}) {
       Write(*symbol);
     }
     WriteAll(scope); // write out submodules
@@ -134,7 +133,7 @@ static std::string ModFileName(const SourceName &name,
 // Write the module file for symbol, which must be a module or submodule.
 void ModFileWriter::Write(const Symbol &symbol) {
   const auto &module{symbol.get<ModuleDetails>()};
-  if (module.moduleFileHash()) {
+  if (symbol.test(Symbol::Flag::ModFile) || module.moduleFileHash()) {
     return; // already written
   }
   const auto *ancestor{module.ancestor()};
@@ -372,16 +371,19 @@ void ModFileWriter::PutSymbols(
   CollectSymbols(scope, sorted, uses, modules);
   // Write module files for dependencies first so that their
   // hashes are known.
-  for (auto ref : modules) {
+  for (const Symbol &mod : modules) {
     if (hermeticModules) {
-      hermeticModules->insert(*ref);
+      hermeticModules->insert(mod);
     } else {
-      Write(*ref);
-      needs_ << ModHeader::need
-             << CheckSumString(
-                    ref->get<ModuleDetails>().moduleFileHash().value())
-             << (ref->owner().IsIntrinsicModules() ? " i " : " n ")
-             << ref->name().ToString() << '\n';
+      Write(mod);
+      // It's possible that the module's file already existed and
+      // without its own hash due to being embedded in a hermetic
+      // module file.
+      if (auto hash{mod.get<ModuleDetails>().moduleFileHash()}) {
+        needs_ << ModHeader::need << CheckSumString(*hash)
+               << (mod.owner().IsIntrinsicModules() ? " i " : " n ")
+               << mod.name().ToString() << '\n';
+      }
     }
   }
   std::string buf; // stuff after CONTAINS in derived type
@@ -855,25 +857,25 @@ void CollectSymbols(const Scope &scope, SymbolVector &sorted,
   auto symbols{scope.GetSymbols()};
   std::size_t commonSize{scope.commonBlocks().size()};
   sorted.reserve(symbols.size() + commonSize);
-  for (SymbolRef symbol : symbols) {
-    const auto *generic{symbol->detailsIf<GenericDetails>()};
+  for (const Symbol &symbol : symbols) {
+    const auto *generic{symbol.detailsIf<GenericDetails>()};
     if (generic) {
       uses.insert(uses.end(), generic->uses().begin(), generic->uses().end());
-      for (auto ref : generic->uses()) {
-        modules.insert(GetUsedModule(ref->get<UseDetails>()));
+      for (const Symbol &used : generic->uses()) {
+        modules.insert(GetUsedModule(used.get<UseDetails>()));
       }
-    } else if (const auto *use{symbol->detailsIf<UseDetails>()}) {
+    } else if (const auto *use{symbol.detailsIf<UseDetails>()}) {
       modules.insert(GetUsedModule(*use));
     }
-    if (symbol->test(Symbol::Flag::ParentComp)) {
-    } else if (symbol->has<NamelistDetails>()) {
+    if (symbol.test(Symbol::Flag::ParentComp)) {
+    } else if (symbol.has<NamelistDetails>()) {
       namelist.push_back(symbol);
     } else if (generic) {
       if (generic->specific() &&
-          &generic->specific()->owner() == &symbol->owner()) {
+          &generic->specific()->owner() == &symbol.owner()) {
         sorted.push_back(*generic->specific());
       } else if (generic->derivedType() &&
-          &generic->derivedType()->owner() == &symbol->owner()) {
+          &generic->derivedType()->owner() == &symbol.owner()) {
         sorted.push_back(*generic->derivedType());
       }
       generics.push_back(symbol);
diff --git a/flang/test/Semantics/modfile79.F90 b/flang/test/Semantics/modfile79.F90
new file mode 100644
index 0000000000000..7d3b42166654e
--- /dev/null
+++ b/flang/test/Semantics/modfile79.F90
@@ -0,0 +1,33 @@
+!RUN: %flang -c -DWHICH=1 %s && FileCheck %s <modfile79a.mod && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c %s && FileCheck %s <modfile79a.mod
+
+!Ensure that writing modfile79c.mod doesn't cause a spurious
+!regeneration of modfile79a.mod from its copy in the hermetic
+!module file modfile79b.mod.
+!CHECK: !mod$ v1 sum:93ec75fe672c5b6c
+!CHECK-NEXT: module modfile79a
+
+#if WHICH == 1
+module modfile79a
+  interface foo
+    module procedure foo
+  end interface
+ contains
+  subroutine foo
+  end
+end
+#elif WHICH == 2
+module modfile79b
+  use modfile79a
+  interface bar
+    procedure foo
+  end interface
+end
+#else
+module modfile79c
+  use modfile79b
+ contains
+  subroutine test
+    call bar
+  end
+end
+#endif

From 65b06cd983e59c25f30b680167559a4db2b44609 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Mon, 16 Jun 2025 14:36:35 -0700
Subject: [PATCH 648/851] [flang][runtime] Check SOURCE= conformability on
 ALLOCATE (#144113)

The SOURCE= expression of an ALLOCATE statement, when present and not
scalar, must conform to the shape of the allocated objects. Check this
at runtime, and return a recoverable error, or crash, when appropriate.

Fixes https://github.com/llvm/llvm-project/issues/143900.
---
 flang-rt/lib/runtime/allocatable.cpp   | 20 ++++++++++
 flang/lib/Semantics/check-allocate.cpp | 51 ++++++++++++++++++++++++++
 flang/test/Semantics/allocate11.f90    |  1 +
 3 files changed, 72 insertions(+)

diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
index ef18da6ea0786..f724f0a20884b 100644
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -165,6 +165,26 @@ int RTDEF(AllocatableAllocateSource)(Descriptor &alloc,
       alloc, /*asyncObject=*/nullptr, hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
+    if (alloc.rank() != source.rank() && source.rank() != 0) {
+      terminator.Crash("ALLOCATE object has rank %d while SOURCE= has rank %d",
+          alloc.rank(), source.rank());
+    }
+    if (int rank{source.rank()}; rank > 0) {
+      SubscriptValue allocExtent[maxRank], sourceExtent[maxRank];
+      alloc.GetShape(allocExtent);
+      source.GetShape(sourceExtent);
+      for (int j{0}; j < rank; ++j) {
+        if (allocExtent[j] != sourceExtent[j]) {
+          if (!hasStat) {
+            terminator.Crash("ALLOCATE object has extent %jd on dimension %d, "
+                             "but SOURCE= has extent %jd",
+                static_cast<std::intmax_t>(allocExtent[j]), j + 1,
+                static_cast<std::intmax_t>(sourceExtent[j]));
+          }
+          return StatInvalidExtent;
+        }
+      }
+    }
     DoFromSourceAssign(alloc, source, terminator);
   }
   return stat;
diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp
index 2c215f45bf516..08053594c12e4 100644
--- a/flang/lib/Semantics/check-allocate.cpp
+++ b/flang/lib/Semantics/check-allocate.cpp
@@ -10,6 +10,7 @@
 #include "assignment.h"
 #include "definable.h"
 #include "flang/Evaluate/fold.h"
+#include "flang/Evaluate/shape.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Parser/tools.h"
@@ -33,6 +34,7 @@ struct AllocateCheckerInfo {
   bool gotMold{false};
   bool gotStream{false};
   bool gotPinned{false};
+  std::optional<evaluate::ConstantSubscripts> sourceExprShape;
 };
 
 class AllocationCheckerHelper {
@@ -259,6 +261,9 @@ static std::optional<AllocateCheckerInfo> CheckAllocateOptions(
           CheckCopyabilityInPureScope(messages, *expr, scope);
         }
       }
+      auto maybeShape{evaluate::GetShape(context.foldingContext(), *expr)};
+      info.sourceExprShape =
+          evaluate::AsConstantExtents(context.foldingContext(), maybeShape);
     } else {
       // Error already reported on source expression.
       // Do not continue allocate checks.
@@ -581,6 +586,52 @@ bool AllocationCheckerHelper::RunChecks(SemanticsContext &context) {
             .Attach(
                 ultimate_->name(), "Declared here with rank %d"_en_US, rank_);
         return false;
+      } else if (allocateInfo_.gotSource && allocateInfo_.sourceExprShape &&
+          allocateInfo_.sourceExprShape->size() ==
+              static_cast<std::size_t>(allocateShapeSpecRank_)) {
+        std::size_t j{0};
+        for (const auto &shapeSpec :
+            std::get<std::list<parser::AllocateShapeSpec>>(allocation_.t)) {
+          if (j >= allocateInfo_.sourceExprShape->size()) {
+            break;
+          }
+          std::optional<evaluate::ConstantSubscript> lbound;
+          if (const auto &lb{std::get<0>(shapeSpec.t)}) {
+            lbound.reset();
+            const auto &lbExpr{lb->thing.thing.value()};
+            if (const auto *expr{GetExpr(context, lbExpr)}) {
+              auto folded{
+                  evaluate::Fold(context.foldingContext(), SomeExpr(*expr))};
+              lbound = evaluate::ToInt64(folded);
+              evaluate::SetExpr(lbExpr, std::move(folded));
+            }
+          } else {
+            lbound = 1;
+          }
+          if (lbound) {
+            const auto &ubExpr{std::get<1>(shapeSpec.t).thing.thing.value()};
+            if (const auto *expr{GetExpr(context, ubExpr)}) {
+              auto folded{
+                  evaluate::Fold(context.foldingContext(), SomeExpr(*expr))};
+              auto ubound{evaluate::ToInt64(folded)};
+              evaluate::SetExpr(ubExpr, std::move(folded));
+              if (ubound) {
+                auto extent{*ubound - *lbound + 1};
+                if (extent < 0) {
+                  extent = 0;
+                }
+                if (extent != allocateInfo_.sourceExprShape->at(j)) {
+                  context.Say(name_.source,
+                      "Allocation has extent %jd on dimension %d, but SOURCE= has extent %jd"_err_en_US,
+                      static_cast<std::intmax_t>(extent), j + 1,
+                      static_cast<std::intmax_t>(
+                          allocateInfo_.sourceExprShape->at(j)));
+                }
+              }
+            }
+          }
+          ++j;
+        }
       }
     }
   } else { // allocating a scalar object
diff --git a/flang/test/Semantics/allocate11.f90 b/flang/test/Semantics/allocate11.f90
index 1b7495e9fc07d..8aeb069df09f2 100644
--- a/flang/test/Semantics/allocate11.f90
+++ b/flang/test/Semantics/allocate11.f90
@@ -163,6 +163,7 @@ subroutine C938_C947(var2, ptr, ptr2, fptr, my_team, srca)
   allocate(var2(2)[5:*], MOLD=my_team)
   !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray
   allocate(var2(2)[5:*], MOLD=ptr)
+  !ERROR: Allocation has extent 2 on dimension 1, but SOURCE= has extent 9
   !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray
   allocate(var2(2)[5:*], SOURCE=ptr2)
   !ERROR: SOURCE or MOLD expression type must not be C_PTR or C_FUNPTR from ISO_C_BINDING when an allocatable object is a coarray

From 2bf3ccabfa37ee1b2d74da7b370cdb16a5cc8ac0 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler@nvidia.com>
Date: Mon, 16 Jun 2025 14:37:01 -0700
Subject: [PATCH 649/851] [flang] Restructure runtime to avoid recursion
 (relanding) (#143993)

Recursion, both direct and indirect, prevents accurate stack size
calculation at link time for GPU device code. Restructure these
recursive (often mutually so) routines in the Fortran runtime with new
implementations based on an iterative work queue with
suspendable/resumable work tickets: Assign, Initialize, initializeClone,
Finalize, and Destroy.

Default derived type I/O is also recursive, but already disabled. It can
be added to this new framework later if the overall approach succeeds.

Note that derived type FINAL subroutine calls, defined assignments, and
defined I/O procedures all perform callbacks into user code, which may
well reenter the runtime library. This kind of recursion is not handled
by this change, although it may be possible to do so in the future using
thread-local work queues.

(Relanding this patch after reverting initial attempt due to some test
failures that needed some time to analyze and fix.)

Fixes https://github.com/llvm/llvm-project/issues/142481.
---
 .../include/flang-rt/runtime/environment.h    |   3 +
 flang-rt/include/flang-rt/runtime/stat.h      |  10 +-
 flang-rt/include/flang-rt/runtime/type-info.h |  15 +-
 .../include/flang-rt/runtime/work-queue.h     | 555 +++++++++++++++
 flang-rt/lib/runtime/CMakeLists.txt           |   2 +
 flang-rt/lib/runtime/assign.cpp               | 663 +++++++++++------
 flang-rt/lib/runtime/derived.cpp              | 516 +++++++-------
 flang-rt/lib/runtime/descriptor-io.cpp        | 668 +++++++++++++++++-
 flang-rt/lib/runtime/descriptor-io.h          | 620 +---------------
 flang-rt/lib/runtime/environment.cpp          |   4 +
 flang-rt/lib/runtime/namelist.cpp             |   1 +
 flang-rt/lib/runtime/tools.cpp                |   4 +-
 flang-rt/lib/runtime/type-info.cpp            |  12 +-
 flang-rt/lib/runtime/work-queue.cpp           | 161 +++++
 flang-rt/unittests/Runtime/ExternalIOTest.cpp |   2 +-
 flang/docs/Extensions.md                      |  10 +
 flang/include/flang/Runtime/assign.h          |   3 +-
 flang/include/flang/Semantics/tools.h         |   7 +-
 flang/lib/Semantics/runtime-type-info.cpp     |  88 ++-
 flang/lib/Semantics/tools.cpp                 |  32 +
 flang/module/__fortran_type_info.f90          |   5 +-
 flang/test/Lower/volatile-openmp.f90          |   8 +-
 flang/test/Semantics/typeinfo01.f90           |  34 +-
 flang/test/Semantics/typeinfo03.f90           |   2 +-
 flang/test/Semantics/typeinfo04.f90           |   8 +-
 flang/test/Semantics/typeinfo05.f90           |   4 +-
 flang/test/Semantics/typeinfo06.f90           |   4 +-
 flang/test/Semantics/typeinfo07.f90           |   8 +-
 flang/test/Semantics/typeinfo08.f90           |   2 +-
 flang/test/Semantics/typeinfo11.f90           |   2 +-
 flang/test/Semantics/typeinfo12.f90           |  67 ++
 flang/test/Semantics/typeinfo13.f90           |   2 +-
 32 files changed, 2350 insertions(+), 1172 deletions(-)
 create mode 100644 flang-rt/include/flang-rt/runtime/work-queue.h
 create mode 100644 flang-rt/lib/runtime/work-queue.cpp
 create mode 100644 flang/test/Semantics/typeinfo12.f90

diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h
index 16258b3bbba9b..e579f6012ce86 100644
--- a/flang-rt/include/flang-rt/runtime/environment.h
+++ b/flang-rt/include/flang-rt/runtime/environment.h
@@ -64,6 +64,9 @@ struct ExecutionEnvironment {
   bool defaultUTF8{false}; // DEFAULT_UTF8
   bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
 
+  enum InternalDebugging { WorkQueue = 1 };
+  int internalDebugging{0}; // FLANG_RT_DEBUG
+
   // CUDA related variables
   std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE
   bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED
diff --git a/flang-rt/include/flang-rt/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h
index 070d0bf8673fb..dc372de53506a 100644
--- a/flang-rt/include/flang-rt/runtime/stat.h
+++ b/flang-rt/include/flang-rt/runtime/stat.h
@@ -24,7 +24,7 @@ class Terminator;
 enum Stat {
   StatOk = 0, // required to be zero by Fortran
 
-  // Interoperable STAT= codes
+  // Interoperable STAT= codes (>= 11)
   StatBaseNull = CFI_ERROR_BASE_ADDR_NULL,
   StatBaseNotNull = CFI_ERROR_BASE_ADDR_NOT_NULL,
   StatInvalidElemLen = CFI_INVALID_ELEM_LEN,
@@ -36,7 +36,7 @@ enum Stat {
   StatMemAllocation = CFI_ERROR_MEM_ALLOCATION,
   StatOutOfBounds = CFI_ERROR_OUT_OF_BOUNDS,
 
-  // Standard STAT= values
+  // Standard STAT= values (>= 101)
   StatFailedImage = FORTRAN_RUNTIME_STAT_FAILED_IMAGE,
   StatLocked = FORTRAN_RUNTIME_STAT_LOCKED,
   StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE,
@@ -49,10 +49,14 @@ enum Stat {
   // Additional "processor-defined" STAT= values
   StatInvalidArgumentNumber = FORTRAN_RUNTIME_STAT_INVALID_ARG_NUMBER,
   StatMissingArgument = FORTRAN_RUNTIME_STAT_MISSING_ARG,
-  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT,
+  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, // -1
   StatMoveAllocSameAllocatable =
       FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE,
   StatBadPointerDeallocation = FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION,
+
+  // Dummy status for work queue continuation, declared here to perhaps
+  // avoid collisions
+  StatContinue = 201
 };
 
 RT_API_ATTRS const char *StatErrorString(int);
diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h
index 5e79efde164f2..80301a313282f 100644
--- a/flang-rt/include/flang-rt/runtime/type-info.h
+++ b/flang-rt/include/flang-rt/runtime/type-info.h
@@ -154,12 +154,17 @@ class SpecialBinding {
   RT_API_ATTRS bool IsArgDescriptor(int zeroBasedArg) const {
     return (isArgDescriptorSet_ >> zeroBasedArg) & 1;
   }
-  RT_API_ATTRS bool isTypeBound() const { return isTypeBound_; }
+  RT_API_ATTRS bool IsTypeBound() const { return isTypeBound_ != 0; }
   RT_API_ATTRS bool IsArgContiguous(int zeroBasedArg) const {
     return (isArgContiguousSet_ >> zeroBasedArg) & 1;
   }
-  template <typename PROC> RT_API_ATTRS PROC GetProc() const {
-    return reinterpret_cast<PROC>(proc_);
+  template <typename PROC>
+  RT_API_ATTRS PROC GetProc(const Binding *bindings = nullptr) const {
+    if (bindings && isTypeBound_ > 0) {
+      return reinterpret_cast<PROC>(bindings[isTypeBound_ - 1].proc);
+    } else {
+      return reinterpret_cast<PROC>(proc_);
+    }
   }
 
   FILE *Dump(FILE *) const;
@@ -193,6 +198,8 @@ class SpecialBinding {
   //     When false, the defined I/O subroutine must have been
   //     called via a generic interface, not a generic TBP.
   std::uint8_t isArgDescriptorSet_{0};
+  // When a special binding is type-bound, this is its binding's index (plus 1,
+  // so that 0 signifies that it's not type-bound).
   std::uint8_t isTypeBound_{0};
   // True when a FINAL subroutine has a dummy argument that is an array that
   // is CONTIGUOUS or neither assumed-rank nor assumed-shape.
@@ -240,6 +247,7 @@ class DerivedType {
   RT_API_ATTRS bool noFinalizationNeeded() const {
     return noFinalizationNeeded_;
   }
+  RT_API_ATTRS bool noDefinedAssignment() const { return noDefinedAssignment_; }
 
   RT_API_ATTRS std::size_t LenParameters() const {
     return lenParameterKind().Elements();
@@ -322,6 +330,7 @@ class DerivedType {
   bool noInitializationNeeded_{false};
   bool noDestructionNeeded_{false};
   bool noFinalizationNeeded_{false};
+  bool noDefinedAssignment_{false};
 };
 
 } // namespace Fortran::runtime::typeInfo
diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
new file mode 100644
index 0000000000000..0daa7bc4d3384
--- /dev/null
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -0,0 +1,555 @@
+//===-- include/flang-rt/runtime/work-queue.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Internal runtime utilities for work queues that replace the use of recursion
+// for better GPU device support.
+//
+// A work queue comprises a list of tickets.  Each ticket class has a Begin()
+// member function, which is called once, and a Continue() member function
+// that can be called zero or more times.  A ticket's execution terminates
+// when either of these member functions returns a status other than
+// StatContinue.  When that status is not StatOk, then the whole queue
+// is shut down.
+//
+// By returning StatContinue from its Continue() member function,
+// a ticket suspends its execution so that any nested tickets that it
+// may have created can be run to completion.  It is the reponsibility
+// of each ticket class to maintain resumption information in its state
+// and manage its own progress.  Most ticket classes inherit from
+// class ComponentsOverElements, which implements an outer loop over all
+// components of a derived type, and an inner loop over all elements
+// of a descriptor, possibly with multiple phases of execution per element.
+//
+// Tickets are created by WorkQueue::Begin...() member functions.
+// There is one of these for each "top level" recursive function in the
+// Fortran runtime support library that has been restructured into this
+// ticket framework.
+//
+// When the work queue is running tickets, it always selects the last ticket
+// on the list for execution -- "work stack" might have been a more accurate
+// name for this framework.  This ticket may, while doing its job, create
+// new tickets, and since those are pushed after the active one, the first
+// such nested ticket will be the next one executed to completion -- i.e.,
+// the order of nested WorkQueue::Begin...() calls is respected.
+// Note that a ticket's Continue() member function won't be called again
+// until all nested tickets have run to completion and it is once again
+// the last ticket on the queue.
+//
+// Example for an assignment to a derived type:
+// 1. Assign() is called, and its work queue is created.  It calls
+//    WorkQueue::BeginAssign() and then WorkQueue::Run().
+// 2. Run calls AssignTicket::Begin(), which pushes a tickets via
+//    BeginFinalize() and returns StatContinue.
+// 3. FinalizeTicket::Begin() and FinalizeTicket::Continue() are called
+//    until one of them returns StatOk, which ends the finalization ticket.
+// 4. AssignTicket::Continue() is then called; it creates a DerivedAssignTicket
+//    and then returns StatOk, which ends the ticket.
+// 5. At this point, only one ticket remains.  DerivedAssignTicket::Begin()
+//    and ::Continue() are called until they are done (not StatContinue).
+//    Along the way, it may create nested AssignTickets for components,
+//    and suspend itself so that they may each run to completion.
+
+#ifndef FLANG_RT_RUNTIME_WORK_QUEUE_H_
+#define FLANG_RT_RUNTIME_WORK_QUEUE_H_
+
+#include "flang-rt/runtime/connection.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/api-attrs.h"
+#include "flang/Runtime/freestanding-tools.h"
+#include <flang/Common/variant.h>
+
+namespace Fortran::runtime::io {
+class IoStatementState;
+struct NonTbpDefinedIoTable;
+} // namespace Fortran::runtime::io
+
+namespace Fortran::runtime {
+class Terminator;
+class WorkQueue;
+
+// Ticket worker base classes
+
+template <typename TICKET> class ImmediateTicketRunner {
+public:
+  RT_API_ATTRS explicit ImmediateTicketRunner(TICKET &ticket)
+      : ticket_{ticket} {}
+  RT_API_ATTRS int Run(WorkQueue &workQueue) {
+    int status{ticket_.Begin(workQueue)};
+    while (status == StatContinue) {
+      status = ticket_.Continue(workQueue);
+    }
+    return status;
+  }
+
+private:
+  TICKET &ticket_;
+};
+
+// Base class for ticket workers that operate elementwise over descriptors
+class Elementwise {
+public:
+  RT_API_ATTRS Elementwise(
+      const Descriptor &instance, const Descriptor *from = nullptr)
+      : instance_{instance}, from_{from} {
+    instance_.GetLowerBounds(subscripts_);
+    if (from_) {
+      from_->GetLowerBounds(fromSubscripts_);
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return elementAt_ >= elements_; }
+  RT_API_ATTRS void Advance() {
+    ++elementAt_;
+    instance_.IncrementSubscripts(subscripts_);
+    if (from_) {
+      from_->IncrementSubscripts(fromSubscripts_);
+    }
+  }
+  RT_API_ATTRS void SkipToEnd() { elementAt_ = elements_; }
+  RT_API_ATTRS void Reset() {
+    elementAt_ = 0;
+    instance_.GetLowerBounds(subscripts_);
+    if (from_) {
+      from_->GetLowerBounds(fromSubscripts_);
+    }
+  }
+
+protected:
+  const Descriptor &instance_, *from_{nullptr};
+  std::size_t elements_{instance_.Elements()};
+  std::size_t elementAt_{0};
+  SubscriptValue subscripts_[common::maxRank];
+  SubscriptValue fromSubscripts_[common::maxRank];
+};
+
+// Base class for ticket workers that operate over derived type components.
+class Componentwise {
+public:
+  RT_API_ATTRS Componentwise(const typeInfo::DerivedType &);
+  RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; }
+  RT_API_ATTRS void Advance() {
+    ++componentAt_;
+    GetComponent();
+  }
+  RT_API_ATTRS void SkipToEnd() {
+    component_ = nullptr;
+    componentAt_ = components_;
+  }
+  RT_API_ATTRS void Reset() {
+    component_ = nullptr;
+    componentAt_ = 0;
+    GetComponent();
+  }
+  RT_API_ATTRS void GetComponent();
+
+protected:
+  const typeInfo::DerivedType &derived_;
+  std::size_t components_{0}, componentAt_{0};
+  const typeInfo::Component *component_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> componentDescriptor_;
+};
+
+// Base class for ticket workers that operate over derived type components
+// in an outer loop, and elements in an inner loop.
+class ComponentsOverElements : public Componentwise, public Elementwise {
+public:
+  RT_API_ATTRS ComponentsOverElements(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
+      : Componentwise{derived}, Elementwise{instance, from} {
+    if (Elementwise::IsComplete()) {
+      Componentwise::SkipToEnd();
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return Componentwise::IsComplete(); }
+  RT_API_ATTRS void Advance() {
+    SkipToNextElement();
+    if (Elementwise::IsComplete()) {
+      Elementwise::Reset();
+      Componentwise::Advance();
+    }
+  }
+  RT_API_ATTRS void SkipToNextElement() {
+    phase_ = 0;
+    Elementwise::Advance();
+  }
+  RT_API_ATTRS void SkipToNextComponent() {
+    phase_ = 0;
+    Elementwise::Reset();
+    Componentwise::Advance();
+  }
+  RT_API_ATTRS void Reset() {
+    phase_ = 0;
+    Elementwise::Reset();
+    Componentwise::Reset();
+  }
+
+protected:
+  int phase_{0};
+};
+
+// Base class for ticket workers that operate over elements in an outer loop,
+// type components in an inner loop.
+class ElementsOverComponents : public Elementwise, public Componentwise {
+public:
+  RT_API_ATTRS ElementsOverComponents(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
+      : Elementwise{instance, from}, Componentwise{derived} {
+    if (Componentwise::IsComplete()) {
+      Elementwise::SkipToEnd();
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return Elementwise::IsComplete(); }
+  RT_API_ATTRS void Advance() {
+    SkipToNextComponent();
+    if (Componentwise::IsComplete()) {
+      Componentwise::Reset();
+      Elementwise::Advance();
+    }
+  }
+  RT_API_ATTRS void SkipToNextComponent() {
+    phase_ = 0;
+    Componentwise::Advance();
+  }
+  RT_API_ATTRS void SkipToNextElement() {
+    phase_ = 0;
+    Componentwise::Reset();
+    Elementwise::Advance();
+  }
+
+protected:
+  int phase_{0};
+};
+
+// Ticket worker classes
+
+// Implements derived type instance initialization
+class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
+                         private ComponentsOverElements {
+public:
+  RT_API_ATTRS InitializeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
+      : ImmediateTicketRunner<InitializeTicket>{*this},
+        ComponentsOverElements{instance, derived} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+};
+
+// Initializes one derived type instance from the value of another
+class InitializeCloneTicket
+    : public ImmediateTicketRunner<InitializeCloneTicket>,
+      private ComponentsOverElements {
+public:
+  RT_API_ATTRS InitializeCloneTicket(const Descriptor &clone,
+      const Descriptor &original, const typeInfo::DerivedType &derived,
+      bool hasStat, const Descriptor *errMsg)
+      : ImmediateTicketRunner<InitializeCloneTicket>{*this},
+        ComponentsOverElements{original, derived}, clone_{clone},
+        hasStat_{hasStat}, errMsg_{errMsg} {}
+  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  const Descriptor &clone_;
+  bool hasStat_{false};
+  const Descriptor *errMsg_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> cloneComponentDescriptor_;
+};
+
+// Implements derived type instance finalization
+class FinalizeTicket : public ImmediateTicketRunner<FinalizeTicket>,
+                       private ComponentsOverElements {
+public:
+  RT_API_ATTRS FinalizeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
+      : ImmediateTicketRunner<FinalizeTicket>{*this},
+        ComponentsOverElements{instance, derived} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  const typeInfo::DerivedType *finalizableParentType_{nullptr};
+};
+
+// Implements derived type instance destruction
+class DestroyTicket : public ImmediateTicketRunner<DestroyTicket>,
+                      private ComponentsOverElements {
+public:
+  RT_API_ATTRS DestroyTicket(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, bool finalize)
+      : ImmediateTicketRunner<DestroyTicket>{*this},
+        ComponentsOverElements{instance, derived}, finalize_{finalize} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  bool finalize_{false};
+};
+
+// Implements general intrinsic assignment
+class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
+public:
+  RT_API_ATTRS AssignTicket(Descriptor &to, const Descriptor &from, int flags,
+      MemmoveFct memmoveFct, const typeInfo::DerivedType *declaredType)
+      : ImmediateTicketRunner<AssignTicket>{*this}, to_{to}, from_{&from},
+        flags_{flags}, memmoveFct_{memmoveFct}, declaredType_{declaredType} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  RT_API_ATTRS bool IsSimpleMemmove() const {
+    return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() &&
+        from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes();
+  }
+  RT_API_ATTRS Descriptor &GetTempDescriptor();
+
+  Descriptor &to_;
+  const Descriptor *from_{nullptr};
+  int flags_{0}; // enum AssignFlags
+  MemmoveFct memmoveFct_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> tempDescriptor_;
+  const typeInfo::DerivedType *declaredType_{nullptr};
+  const typeInfo::DerivedType *toDerived_{nullptr};
+  Descriptor *toDeallocate_{nullptr};
+  bool persist_{false};
+  bool done_{false};
+};
+
+// Implements derived type intrinsic assignment.
+template <bool IS_COMPONENTWISE>
+class DerivedAssignTicket
+    : public ImmediateTicketRunner<DerivedAssignTicket<IS_COMPONENTWISE>>,
+      private std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+          ElementsOverComponents> {
+public:
+  using Base = std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+      ElementsOverComponents>;
+  RT_API_ATTRS DerivedAssignTicket(const Descriptor &to, const Descriptor &from,
+      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
+      Descriptor *deallocateAfter)
+      : ImmediateTicketRunner<DerivedAssignTicket>{*this},
+        Base{to, derived, &from}, flags_{flags}, memmoveFct_{memmoveFct},
+        deallocateAfter_{deallocateAfter} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  static constexpr bool isComponentwise_{IS_COMPONENTWISE};
+  bool toIsContiguous_{this->instance_.IsContiguous()};
+  bool fromIsContiguous_{this->from_->IsContiguous()};
+  int flags_{0};
+  MemmoveFct memmoveFct_{nullptr};
+  Descriptor *deallocateAfter_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> fromComponentDescriptor_;
+};
+
+namespace io::descr {
+
+template <io::Direction DIR>
+class DescriptorIoTicket
+    : public ImmediateTicketRunner<DescriptorIoTicket<DIR>>,
+      private Elementwise {
+public:
+  RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io,
+      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
+      bool &anyIoTookPlace)
+      : ImmediateTicketRunner<DescriptorIoTicket>(*this),
+        Elementwise{descriptor}, io_{io}, table_{table},
+        anyIoTookPlace_{anyIoTookPlace} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+  RT_API_ATTRS bool &anyIoTookPlace() { return anyIoTookPlace_; }
+
+private:
+  io::IoStatementState &io_;
+  const io::NonTbpDefinedIoTable *table_{nullptr};
+  bool &anyIoTookPlace_;
+  common::optional<typeInfo::SpecialBinding> nonTbpSpecial_;
+  const typeInfo::DerivedType *derived_{nullptr};
+  const typeInfo::SpecialBinding *special_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> elementDescriptor_;
+};
+
+template <io::Direction DIR>
+class DerivedIoTicket : public ImmediateTicketRunner<DerivedIoTicket<DIR>>,
+                        private ElementsOverComponents {
+public:
+  RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io,
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace)
+      : ImmediateTicketRunner<DerivedIoTicket>(*this),
+        ElementsOverComponents{descriptor, derived}, io_{io}, table_{table},
+        anyIoTookPlace_{anyIoTookPlace} {}
+  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  io::IoStatementState &io_;
+  const io::NonTbpDefinedIoTable *table_{nullptr};
+  bool &anyIoTookPlace_;
+};
+
+} // namespace io::descr
+
+struct NullTicket {
+  RT_API_ATTRS int Begin(WorkQueue &) const { return StatOk; }
+  RT_API_ATTRS int Continue(WorkQueue &) const { return StatOk; }
+};
+
+struct Ticket {
+  RT_API_ATTRS int Continue(WorkQueue &);
+  bool begun{false};
+  std::variant<NullTicket, InitializeTicket, InitializeCloneTicket,
+      FinalizeTicket, DestroyTicket, AssignTicket, DerivedAssignTicket<false>,
+      DerivedAssignTicket<true>,
+      io::descr::DescriptorIoTicket<io::Direction::Output>,
+      io::descr::DescriptorIoTicket<io::Direction::Input>,
+      io::descr::DerivedIoTicket<io::Direction::Output>,
+      io::descr::DerivedIoTicket<io::Direction::Input>>
+      u;
+};
+
+class WorkQueue {
+public:
+  RT_API_ATTRS explicit WorkQueue(Terminator &terminator)
+      : terminator_{terminator} {
+    for (int j{1}; j < numStatic_; ++j) {
+      static_[j].previous = &static_[j - 1];
+      static_[j - 1].next = &static_[j];
+    }
+  }
+  RT_API_ATTRS ~WorkQueue();
+  RT_API_ATTRS Terminator &terminator() { return terminator_; };
+
+  // APIs for particular tasks.  These can return StatOk if the work is
+  // completed immediately.
+  RT_API_ATTRS int BeginInitialize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+    if (runTicketsImmediately_) {
+      return InitializeTicket{descriptor, derived}.Run(*this);
+    } else {
+      StartTicket().u.emplace<InitializeTicket>(descriptor, derived);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginInitializeClone(const Descriptor &clone,
+      const Descriptor &original, const typeInfo::DerivedType &derived,
+      bool hasStat, const Descriptor *errMsg) {
+    if (runTicketsImmediately_) {
+      return InitializeCloneTicket{clone, original, derived, hasStat, errMsg}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<InitializeCloneTicket>(
+          clone, original, derived, hasStat, errMsg);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginFinalize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+    if (runTicketsImmediately_) {
+      return FinalizeTicket{descriptor, derived}.Run(*this);
+    } else {
+      StartTicket().u.emplace<FinalizeTicket>(descriptor, derived);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginDestroy(const Descriptor &descriptor,
+      const typeInfo::DerivedType &derived, bool finalize) {
+    if (runTicketsImmediately_) {
+      return DestroyTicket{descriptor, derived, finalize}.Run(*this);
+    } else {
+      StartTicket().u.emplace<DestroyTicket>(descriptor, derived, finalize);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginAssign(Descriptor &to, const Descriptor &from,
+      int flags, MemmoveFct memmoveFct,
+      const typeInfo::DerivedType *declaredType) {
+    if (runTicketsImmediately_) {
+      return AssignTicket{to, from, flags, memmoveFct, declaredType}.Run(*this);
+    } else {
+      StartTicket().u.emplace<AssignTicket>(
+          to, from, flags, memmoveFct, declaredType);
+      return StatContinue;
+    }
+  }
+  template <bool IS_COMPONENTWISE>
+  RT_API_ATTRS int BeginDerivedAssign(Descriptor &to, const Descriptor &from,
+      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
+      Descriptor *deallocateAfter) {
+    if (runTicketsImmediately_) {
+      return DerivedAssignTicket<IS_COMPONENTWISE>{
+          to, from, derived, flags, memmoveFct, deallocateAfter}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<DerivedAssignTicket<IS_COMPONENTWISE>>(
+          to, from, derived, flags, memmoveFct, deallocateAfter);
+      return StatContinue;
+    }
+  }
+  template <io::Direction DIR>
+  RT_API_ATTRS int BeginDescriptorIo(io::IoStatementState &io,
+      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
+      bool &anyIoTookPlace) {
+    if (runTicketsImmediately_) {
+      return io::descr::DescriptorIoTicket<DIR>{
+          io, descriptor, table, anyIoTookPlace}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<io::descr::DescriptorIoTicket<DIR>>(
+          io, descriptor, table, anyIoTookPlace);
+      return StatContinue;
+    }
+  }
+  template <io::Direction DIR>
+  RT_API_ATTRS int BeginDerivedIo(io::IoStatementState &io,
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) {
+    if (runTicketsImmediately_) {
+      return io::descr::DerivedIoTicket<DIR>{
+          io, descriptor, derived, table, anyIoTookPlace}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<io::descr::DerivedIoTicket<DIR>>(
+          io, descriptor, derived, table, anyIoTookPlace);
+      return StatContinue;
+    }
+  }
+
+  RT_API_ATTRS int Run();
+
+private:
+#if RT_DEVICE_COMPILATION
+  // Always use the work queue on a GPU device to avoid recursion.
+  static constexpr bool runTicketsImmediately_{false};
+#else
+  // Avoid the work queue overhead on the host, unless it needs
+  // debugging, which is so much easier there.
+  static constexpr bool runTicketsImmediately_{true};
+#endif
+
+  // Most uses of the work queue won't go very deep.
+  static constexpr int numStatic_{2};
+
+  struct TicketList {
+    bool isStatic{true};
+    Ticket ticket;
+    TicketList *previous{nullptr}, *next{nullptr};
+  };
+
+  RT_API_ATTRS Ticket &StartTicket();
+  RT_API_ATTRS void Stop();
+
+  Terminator &terminator_;
+  TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr};
+  TicketList static_[numStatic_];
+  TicketList *firstFree_{static_};
+};
+
+} // namespace Fortran::runtime
+#endif // FLANG_RT_RUNTIME_WORK_QUEUE_H_
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index a3f63b4315644..332c0872e065f 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -68,6 +68,7 @@ set(supported_sources
   type-info.cpp
   unit.cpp
   utf.cpp
+  work-queue.cpp
 )
 
 # List of source not used for GPU offloading.
@@ -131,6 +132,7 @@ set(gpu_sources
   type-code.cpp
   type-info.cpp
   utf.cpp
+  work-queue.cpp
   complex-powi.cpp
   reduce.cpp
   reduction.cpp
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index bf67b5dc8b645..f936a4192a33c 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -14,6 +14,7 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -62,11 +63,24 @@ static inline RT_API_ATTRS bool MustDeallocateLHS(
     // Distinct shape? Deallocate
     int rank{to.rank()};
     for (int j{0}; j < rank; ++j) {
-      if (to.GetDimension(j).Extent() != from.GetDimension(j).Extent()) {
+      const auto &toDim{to.GetDimension(j)};
+      const auto &fromDim{from.GetDimension(j)};
+      if (toDim.Extent() != fromDim.Extent()) {
+        return true;
+      }
+      if ((flags & UpdateLHSBounds) &&
+          toDim.LowerBound() != fromDim.LowerBound()) {
         return true;
       }
     }
   }
+  // Not reallocating; may have to update bounds
+  if (flags & UpdateLHSBounds) {
+    int rank{to.rank()};
+    for (int j{0}; j < rank; ++j) {
+      to.GetDimension(j).SetLowerBound(from.GetDimension(j).LowerBound());
+    }
+  }
   return false;
 }
 
@@ -102,11 +116,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
     toDim.SetByteStride(stride);
     stride *= toDim.Extent();
   }
-  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
-  if (result == StatOk && derived && !derived->noInitializationNeeded()) {
-    result = ReturnError(terminator, Initialize(to, *derived, terminator));
-  }
-  return result;
+  return ReturnError(terminator, to.Allocate(kNoAsyncObject));
 }
 
 // least <= 0, most >= 0
@@ -169,24 +179,27 @@ static RT_API_ATTRS bool MayAlias(const Descriptor &x, const Descriptor &y) {
 }
 
 static RT_API_ATTRS void DoScalarDefinedAssignment(const Descriptor &to,
-    const Descriptor &from, const typeInfo::SpecialBinding &special) {
+    const Descriptor &from, const typeInfo::DerivedType &derived,
+    const typeInfo::SpecialBinding &special) {
   bool toIsDesc{special.IsArgDescriptor(0)};
   bool fromIsDesc{special.IsArgDescriptor(1)};
+  const auto *bindings{
+      derived.binding().OffsetElement<const typeInfo::Binding>()};
   if (toIsDesc) {
     if (fromIsDesc) {
-      auto *p{
-          special.GetProc<void (*)(const Descriptor &, const Descriptor &)>()};
+      auto *p{special.GetProc<void (*)(const Descriptor &, const Descriptor &)>(
+          bindings)};
       p(to, from);
     } else {
-      auto *p{special.GetProc<void (*)(const Descriptor &, void *)>()};
+      auto *p{special.GetProc<void (*)(const Descriptor &, void *)>(bindings)};
       p(to, from.raw().base_addr);
     }
   } else {
     if (fromIsDesc) {
-      auto *p{special.GetProc<void (*)(void *, const Descriptor &)>()};
+      auto *p{special.GetProc<void (*)(void *, const Descriptor &)>(bindings)};
       p(to.raw().base_addr, from);
     } else {
-      auto *p{special.GetProc<void (*)(void *, void *)>()};
+      auto *p{special.GetProc<void (*)(void *, void *)>(bindings)};
       p(to.raw().base_addr, from.raw().base_addr);
     }
   }
@@ -208,7 +221,7 @@ static RT_API_ATTRS void DoElementalDefinedAssignment(const Descriptor &to,
        to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
     toElementDesc.set_base_addr(to.Element<char>(toAt));
     fromElementDesc.set_base_addr(from.Element<char>(fromAt));
-    DoScalarDefinedAssignment(toElementDesc, fromElementDesc, special);
+    DoScalarDefinedAssignment(toElementDesc, fromElementDesc, derived, special);
   }
 }
 
@@ -231,6 +244,8 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
   }
 }
 
+RT_OFFLOAD_API_GROUP_BEGIN
+
 // Common implementation of assignments, both intrinsic assignments and
 // those cases of polymorphic user-defined ASSIGNMENT(=) TBPs that could not
 // be resolved in semantics.  Most assignment statements do not need any
@@ -244,275 +259,461 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
 // dealing with array constructors.
 RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
     Terminator &terminator, int flags, MemmoveFct memmoveFct) {
-  bool mustDeallocateLHS{(flags & DeallocateLHS) ||
-      MustDeallocateLHS(to, from, terminator, flags)};
-  DescriptorAddendum *toAddendum{to.Addendum()};
-  const typeInfo::DerivedType *toDerived{
-      toAddendum ? toAddendum->derivedType() : nullptr};
-  if (toDerived && (flags & NeedFinalization) &&
-      toDerived->noFinalizationNeeded()) {
-    flags &= ~NeedFinalization;
-  }
-  std::size_t toElementBytes{to.ElementBytes()};
-  std::size_t fromElementBytes{from.ElementBytes()};
-  // The following lambda definition violates the conding style,
-  // but cuda-11.8 nvcc hits an internal error with the brace initialization.
-  auto isSimpleMemmove = [&]() {
-    return !toDerived && to.rank() == from.rank() && to.IsContiguous() &&
-        from.IsContiguous() && toElementBytes == fromElementBytes;
-  };
-  StaticDescriptor<maxRank, true, 10 /*?*/> deferredDeallocStatDesc;
-  Descriptor *deferDeallocation{nullptr};
-  if (MayAlias(to, from)) {
+  WorkQueue workQueue{terminator};
+  if (workQueue.BeginAssign(to, from, flags, memmoveFct, nullptr) ==
+      StatContinue) {
+    workQueue.Run();
+  }
+}
+
+RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) {
+  bool mustDeallocateLHS{(flags_ & DeallocateLHS) ||
+      MustDeallocateLHS(to_, *from_, workQueue.terminator(), flags_)};
+  DescriptorAddendum *toAddendum{to_.Addendum()};
+  toDerived_ = toAddendum ? toAddendum->derivedType() : nullptr;
+  if (toDerived_ && (flags_ & NeedFinalization) &&
+      toDerived_->noFinalizationNeeded()) {
+    flags_ &= ~NeedFinalization;
+  }
+  if (MayAlias(to_, *from_)) {
     if (mustDeallocateLHS) {
-      deferDeallocation = &deferredDeallocStatDesc.descriptor();
+      // Convert the LHS into a temporary, then make it look deallocated.
+      toDeallocate_ = &tempDescriptor_.descriptor();
+      persist_ = true; // tempDescriptor_ state must outlive child tickets
       std::memcpy(
-          reinterpret_cast<void *>(deferDeallocation), &to, to.SizeInBytes());
-      to.set_base_addr(nullptr);
-    } else if (!isSimpleMemmove()) {
+          reinterpret_cast<void *>(toDeallocate_), &to_, to_.SizeInBytes());
+      to_.set_base_addr(nullptr);
+      if (toDerived_ && (flags_ & NeedFinalization)) {
+        if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)};
+            status != StatOk && status != StatContinue) {
+          return status;
+        }
+        flags_ &= ~NeedFinalization;
+      }
+    } else if (!IsSimpleMemmove()) {
       // Handle LHS/RHS aliasing by copying RHS into a temp, then
       // recursively assigning from that temp.
-      auto descBytes{from.SizeInBytes()};
-      StaticDescriptor<maxRank, true, 16> staticDesc;
-      Descriptor &newFrom{staticDesc.descriptor()};
-      std::memcpy(reinterpret_cast<void *>(&newFrom), &from, descBytes);
+      auto descBytes{from_->SizeInBytes()};
+      Descriptor &newFrom{tempDescriptor_.descriptor()};
+      persist_ = true; // tempDescriptor_ state must outlive child tickets
+      std::memcpy(reinterpret_cast<void *>(&newFrom), from_, descBytes);
       // Pretend the temporary descriptor is for an ALLOCATABLE
       // entity, otherwise, the Deallocate() below will not
       // free the descriptor memory.
       newFrom.raw().attribute = CFI_attribute_allocatable;
-      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
-      if (stat == StatOk) {
-        if (HasDynamicComponent(from)) {
-          // If 'from' has allocatable/automatic component, we cannot
-          // just make a shallow copy of the descriptor member.
-          // This will still leave data overlap in 'to' and 'newFrom'.
-          // For example:
-          //   type t
-          //     character, allocatable :: c(:)
-          //   end type t
-          //   type(t) :: x(3)
-          //   x(2:3) = x(1:2)
-          // We have to make a deep copy into 'newFrom' in this case.
-          RTNAME(AssignTemporary)
-          (newFrom, from, terminator.sourceFileName(), terminator.sourceLine());
-        } else {
-          ShallowCopy(newFrom, from, true, from.IsContiguous());
+      if (int stat{ReturnError(
+              workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))};
+          stat != StatOk) {
+        return stat;
+      }
+      if (HasDynamicComponent(*from_)) {
+        // If 'from' has allocatable/automatic component, we cannot
+        // just make a shallow copy of the descriptor member.
+        // This will still leave data overlap in 'to' and 'newFrom'.
+        // For example:
+        //   type t
+        //     character, allocatable :: c(:)
+        //   end type t
+        //   type(t) :: x(3)
+        //   x(2:3) = x(1:2)
+        // We have to make a deep copy into 'newFrom' in this case.
+        if (const DescriptorAddendum *addendum{newFrom.Addendum()}) {
+          if (const auto *derived{addendum->derivedType()}) {
+            if (!derived->noInitializationNeeded()) {
+              if (int status{workQueue.BeginInitialize(newFrom, *derived)};
+                  status != StatOk && status != StatContinue) {
+                return status;
+              }
+            }
+          }
         }
-        Assign(to, newFrom, terminator,
-            flags &
-                (NeedFinalization | ComponentCanBeDefinedAssignment |
-                    ExplicitLengthCharacterLHS | CanBeDefinedAssignment));
-        newFrom.Deallocate();
+        static constexpr int nestedFlags{MaybeReallocate | PolymorphicLHS};
+        if (int status{workQueue.BeginAssign(
+                newFrom, *from_, nestedFlags, memmoveFct_, nullptr)};
+            status != StatOk && status != StatContinue) {
+          return status;
+        }
+      } else {
+        ShallowCopy(newFrom, *from_, true, from_->IsContiguous());
       }
-      return;
+      from_ = &newFrom; // this is why from_ has to be a pointer
+      flags_ &= NeedFinalization | ComponentCanBeDefinedAssignment |
+          ExplicitLengthCharacterLHS | CanBeDefinedAssignment;
+      toDeallocate_ = &newFrom;
     }
   }
-  if (to.IsAllocatable()) {
+  if (to_.IsAllocatable()) {
     if (mustDeallocateLHS) {
-      if (deferDeallocation) {
-        if ((flags & NeedFinalization) && toDerived) {
-          Finalize(*deferDeallocation, *toDerived, &terminator);
-          flags &= ~NeedFinalization;
-        }
-      } else {
-        to.Destroy((flags & NeedFinalization) != 0, /*destroyPointers=*/false,
-            &terminator);
-        flags &= ~NeedFinalization;
+      if (!toDeallocate_ && to_.IsAllocated()) {
+        toDeallocate_ = &to_;
+      }
+    } else if (to_.rank() != from_->rank() && !to_.IsAllocated()) {
+      workQueue.terminator().Crash("Assign: mismatched ranks (%d != %d) in "
+                                   "assignment to unallocated allocatable",
+          to_.rank(), from_->rank());
+    }
+  } else if (!to_.IsAllocated()) {
+    workQueue.terminator().Crash(
+        "Assign: left-hand side variable is neither allocated nor allocatable");
+  }
+  if (toDerived_ && to_.IsAllocated()) {
+    // Schedule finalization or destruction of the LHS.
+    if (flags_ & NeedFinalization) {
+      if (int status{workQueue.BeginFinalize(to_, *toDerived_)};
+          status != StatOk && status != StatContinue) {
+        return status;
+      }
+    } else if (!toDerived_->noDestructionNeeded()) {
+      if (int status{
+              workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
-    } else if (to.rank() != from.rank() && !to.IsAllocated()) {
-      terminator.Crash("Assign: mismatched ranks (%d != %d) in assignment to "
-                       "unallocated allocatable",
-          to.rank(), from.rank());
     }
-    if (!to.IsAllocated()) {
-      if (AllocateAssignmentLHS(to, from, terminator, flags) != StatOk) {
-        return;
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) {
+  if (done_) {
+    // All child tickets are complete; can release this ticket's state.
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+    }
+    return StatOk;
+  }
+  // All necessary finalization or destruction that was initiated by Begin()
+  // has been completed.  Deallocation may be pending, and if it's for the LHS,
+  // do it now so that the LHS gets reallocated.
+  if (toDeallocate_ == &to_) {
+    toDeallocate_ = nullptr;
+    to_.Deallocate();
+  }
+  // Allocate the LHS if needed
+  if (!to_.IsAllocated()) {
+    if (int stat{
+            AllocateAssignmentLHS(to_, *from_, workQueue.terminator(), flags_)};
+        stat != StatOk) {
+      return stat;
+    }
+    const auto *addendum{to_.Addendum()};
+    toDerived_ = addendum ? addendum->derivedType() : nullptr;
+    if (toDerived_) {
+      if (!toDerived_->noInitializationNeeded()) {
+        if (int status{workQueue.BeginInitialize(to_, *toDerived_)};
+            status != StatOk) {
+          return status;
+        }
       }
-      flags &= ~NeedFinalization;
-      toElementBytes = to.ElementBytes(); // may have changed
-      toDerived = toAddendum ? toAddendum->derivedType() : nullptr;
     }
   }
-  if (toDerived && (flags & CanBeDefinedAssignment)) {
-    // Check for a user-defined assignment type-bound procedure;
-    // see 10.2.1.4-5.  A user-defined assignment TBP defines all of
-    // the semantics, including allocatable (re)allocation and any
-    // finalization.
-    //
-    // Note that the aliasing and LHS (re)allocation handling above
-    // needs to run even with CanBeDefinedAssignment flag, when
-    // the Assign() is invoked recursively for component-per-component
-    // assignments.
-    if (to.rank() == 0) {
-      if (const auto *special{toDerived->FindSpecialBinding(
+  // Check for a user-defined assignment type-bound procedure;
+  // see 10.2.1.4-5.
+  // Note that the aliasing and LHS (re)allocation handling above
+  // needs to run even with CanBeDefinedAssignment flag, since
+  // Assign() can be invoked recursively for component-wise assignments.
+  // The declared type (if known) must be used for generic resolution
+  // of ASSIGNMENT(=) to a binding, but that binding can be overridden.
+  if (declaredType_ && (flags_ & CanBeDefinedAssignment)) {
+    if (to_.rank() == 0) {
+      if (const auto *special{declaredType_->FindSpecialBinding(
               typeInfo::SpecialBinding::Which::ScalarAssignment)}) {
-        return DoScalarDefinedAssignment(to, from, *special);
+        DoScalarDefinedAssignment(to_, *from_, *toDerived_, *special);
+        done_ = true;
+        return StatContinue;
       }
     }
-    if (const auto *special{toDerived->FindSpecialBinding(
+    if (const auto *special{declaredType_->FindSpecialBinding(
             typeInfo::SpecialBinding::Which::ElementalAssignment)}) {
-      return DoElementalDefinedAssignment(to, from, *toDerived, *special);
+      DoElementalDefinedAssignment(to_, *from_, *toDerived_, *special);
+      done_ = true;
+      return StatContinue;
     }
   }
-  SubscriptValue toAt[maxRank];
-  to.GetLowerBounds(toAt);
-  // Scalar expansion of the RHS is implied by using the same empty
-  // subscript values on each (seemingly) elemental reference into
-  // "from".
-  SubscriptValue fromAt[maxRank];
-  from.GetLowerBounds(fromAt);
-  std::size_t toElements{to.Elements()};
-  if (from.rank() > 0 && toElements != from.Elements()) {
-    terminator.Crash("Assign: mismatching element counts in array assignment "
-                     "(to %zd, from %zd)",
-        toElements, from.Elements());
+  // Intrinsic assignment
+  std::size_t toElements{to_.Elements()};
+  if (from_->rank() > 0 && toElements != from_->Elements()) {
+    workQueue.terminator().Crash("Assign: mismatching element counts in array "
+                                 "assignment (to %zd, from %zd)",
+        toElements, from_->Elements());
   }
-  if (to.type() != from.type()) {
-    terminator.Crash("Assign: mismatching types (to code %d != from code %d)",
-        to.type().raw(), from.type().raw());
+  if (to_.type() != from_->type()) {
+    workQueue.terminator().Crash(
+        "Assign: mismatching types (to code %d != from code %d)",
+        to_.type().raw(), from_->type().raw());
   }
-  if (toElementBytes > fromElementBytes && !to.type().IsCharacter()) {
-    terminator.Crash("Assign: mismatching non-character element sizes (to %zd "
-                     "bytes != from %zd bytes)",
+  std::size_t toElementBytes{to_.ElementBytes()};
+  std::size_t fromElementBytes{from_->ElementBytes()};
+  if (toElementBytes > fromElementBytes && !to_.type().IsCharacter()) {
+    workQueue.terminator().Crash("Assign: mismatching non-character element "
+                                 "sizes (to %zd bytes != from %zd bytes)",
         toElementBytes, fromElementBytes);
   }
-  if (const typeInfo::DerivedType *
-      updatedToDerived{toAddendum ? toAddendum->derivedType() : nullptr}) {
-    // Derived type intrinsic assignment, which is componentwise and elementwise
-    // for all components, including parent components (10.2.1.2-3).
-    // The target is first finalized if still necessary (7.5.6.3(1))
-    if (flags & NeedFinalization) {
-      Finalize(to, *updatedToDerived, &terminator);
-    } else if (updatedToDerived && !updatedToDerived->noDestructionNeeded()) {
-      Destroy(to, /*finalize=*/false, *updatedToDerived, &terminator);
-    }
-    // Copy the data components (incl. the parent) first.
-    const Descriptor &componentDesc{updatedToDerived->component()};
-    std::size_t numComponents{componentDesc.Elements()};
-    for (std::size_t j{0}; j < toElements;
-         ++j, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-      for (std::size_t k{0}; k < numComponents; ++k) {
-        const auto &comp{
-            *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
-                k)}; // TODO: exploit contiguity here
-        // Use PolymorphicLHS for components so that the right things happen
-        // when the components are polymorphic; when they're not, they're both
-        // not, and their declared types will match.
-        int nestedFlags{MaybeReallocate | PolymorphicLHS};
-        if (flags & ComponentCanBeDefinedAssignment) {
-          nestedFlags |=
-              CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
-        }
-        switch (comp.genre()) {
-        case typeInfo::Component::Genre::Data:
-          if (comp.category() == TypeCategory::Derived) {
-            StaticDescriptor<maxRank, true, 10 /*?*/> statDesc[2];
-            Descriptor &toCompDesc{statDesc[0].descriptor()};
-            Descriptor &fromCompDesc{statDesc[1].descriptor()};
-            comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt);
-            comp.CreatePointerDescriptor(
-                fromCompDesc, from, terminator, fromAt);
-            Assign(toCompDesc, fromCompDesc, terminator, nestedFlags);
-          } else { // Component has intrinsic type; simply copy raw bytes
-            std::size_t componentByteSize{comp.SizeInBytes(to)};
-            memmoveFct(to.Element<char>(toAt) + comp.offset(),
-                from.Element<const char>(fromAt) + comp.offset(),
-                componentByteSize);
-          }
-          break;
-        case typeInfo::Component::Genre::Pointer: {
-          std::size_t componentByteSize{comp.SizeInBytes(to)};
-          memmoveFct(to.Element<char>(toAt) + comp.offset(),
-              from.Element<const char>(fromAt) + comp.offset(),
-              componentByteSize);
-        } break;
-        case typeInfo::Component::Genre::Allocatable:
-        case typeInfo::Component::Genre::Automatic: {
-          auto *toDesc{reinterpret_cast<Descriptor *>(
-              to.Element<char>(toAt) + comp.offset())};
-          const auto *fromDesc{reinterpret_cast<const Descriptor *>(
-              from.Element<char>(fromAt) + comp.offset())};
-          // Allocatable components of the LHS are unconditionally
-          // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
-          // unlike a "top-level" assignment to a variable, where
-          // deallocation is optional.
-          //
-          // Be careful not to destroy/reallocate the LHS, if there is
-          // overlap between LHS and RHS (it seems that partial overlap
-          // is not possible, though).
-          // Invoke Assign() recursively to deal with potential aliasing.
-          if (toDesc->IsAllocatable()) {
-            if (!fromDesc->IsAllocated()) {
-              // No aliasing.
-              //
-              // If to is not allocated, the Destroy() call is a no-op.
-              // This is just a shortcut, because the recursive Assign()
-              // below would initiate the destruction for to.
-              // No finalization is required.
-              toDesc->Destroy(
-                  /*finalize=*/false, /*destroyPointers=*/false, &terminator);
-              continue; // F'2018 10.2.1.3(13)(2)
-            }
-          }
-          // Force LHS deallocation with DeallocateLHS flag.
-          // The actual deallocation may be avoided, if the existing
-          // location can be reoccupied.
-          Assign(*toDesc, *fromDesc, terminator, nestedFlags | DeallocateLHS);
-        } break;
-        }
+  if (toDerived_) {
+    if (toDerived_->noDefinedAssignment()) { // componentwise
+      if (int status{workQueue.BeginDerivedAssign<true>(
+              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
-      // Copy procedure pointer components
-      const Descriptor &procPtrDesc{updatedToDerived->procPtr()};
-      std::size_t numProcPtrs{procPtrDesc.Elements()};
-      for (std::size_t k{0}; k < numProcPtrs; ++k) {
-        const auto &procPtr{
-            *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(
-                k)};
-        memmoveFct(to.Element<char>(toAt) + procPtr.offset,
-            from.Element<const char>(fromAt) + procPtr.offset,
-            sizeof(typeInfo::ProcedurePointer));
+    } else { // elementwise
+      if (int status{workQueue.BeginDerivedAssign<false>(
+              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
     }
-  } else { // intrinsic type, intrinsic assignment
-    if (isSimpleMemmove()) {
-      memmoveFct(to.raw().base_addr, from.raw().base_addr,
-          toElements * toElementBytes);
-    } else if (toElementBytes > fromElementBytes) { // blank padding
-      switch (to.type().raw()) {
+    toDeallocate_ = nullptr;
+  } else if (IsSimpleMemmove()) {
+    memmoveFct_(to_.raw().base_addr, from_->raw().base_addr,
+        toElements * toElementBytes);
+  } else {
+    // Scalar expansion of the RHS is implied by using the same empty
+    // subscript values on each (seemingly) elemental reference into
+    // "from".
+    SubscriptValue toAt[maxRank];
+    to_.GetLowerBounds(toAt);
+    SubscriptValue fromAt[maxRank];
+    from_->GetLowerBounds(fromAt);
+    if (toElementBytes > fromElementBytes) { // blank padding
+      switch (to_.type().raw()) {
       case CFI_type_signed_char:
       case CFI_type_char:
-        BlankPadCharacterAssignment<char>(to, from, toAt, fromAt, toElements,
+        BlankPadCharacterAssignment<char>(to_, *from_, toAt, fromAt, toElements,
             toElementBytes, fromElementBytes);
         break;
       case CFI_type_char16_t:
-        BlankPadCharacterAssignment<char16_t>(to, from, toAt, fromAt,
+        BlankPadCharacterAssignment<char16_t>(to_, *from_, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       case CFI_type_char32_t:
-        BlankPadCharacterAssignment<char32_t>(to, from, toAt, fromAt,
+        BlankPadCharacterAssignment<char32_t>(to_, *from_, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       default:
-        terminator.Crash("unexpected type code %d in blank padded Assign()",
-            to.type().raw());
+        workQueue.terminator().Crash(
+            "unexpected type code %d in blank padded Assign()",
+            to_.type().raw());
       }
     } else { // elemental copies, possibly with character truncation
       for (std::size_t n{toElements}; n-- > 0;
-          to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-        memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
+          to_.IncrementSubscripts(toAt), from_->IncrementSubscripts(fromAt)) {
+        memmoveFct_(to_.Element<char>(toAt), from_->Element<const char>(fromAt),
             toElementBytes);
       }
     }
   }
-  if (deferDeallocation) {
-    // deferDeallocation is used only when LHS is an allocatable.
-    // The finalization has already been run for it.
-    deferDeallocation->Destroy(
-        /*finalize=*/false, /*destroyPointers=*/false, &terminator);
+  if (persist_) {
+    done_ = true;
+    return StatContinue;
+  } else {
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+      toDeallocate_ = nullptr;
+    }
+    return StatOk;
   }
 }
 
-RT_OFFLOAD_API_GROUP_BEGIN
+template <bool IS_COMPONENTWISE>
+RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
+    WorkQueue &workQueue) {
+  if (toIsContiguous_ && fromIsContiguous_ &&
+      this->derived_.noDestructionNeeded() &&
+      this->derived_.noDefinedAssignment() &&
+      this->instance_.rank() == this->from_->rank()) {
+    if (std::size_t elementBytes{this->instance_.ElementBytes()};
+        elementBytes == this->from_->ElementBytes()) {
+      // Fastest path.  Both LHS and RHS are contiguous, RHS is not a scalar
+      // to be expanded, the types have the same size, and there are no
+      // allocatable components or defined ASSIGNMENT(=) at any level.
+      memmoveFct_(this->instance_.template OffsetElement<char>(),
+          this->from_->template OffsetElement<const char *>(),
+          this->instance_.Elements() * elementBytes);
+      return StatOk;
+    }
+  }
+  // Use PolymorphicLHS for components so that the right things happen
+  // when the components are polymorphic; when they're not, they're both
+  // not, and their declared types will match.
+  int nestedFlags{MaybeReallocate | PolymorphicLHS};
+  if (flags_ & ComponentCanBeDefinedAssignment) {
+    nestedFlags |= CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
+  }
+  flags_ = nestedFlags;
+  // Copy procedure pointer components
+  const Descriptor &procPtrDesc{this->derived_.procPtr()};
+  bool noDataComponents{this->IsComplete()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &procPtr{
+          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+      // Loop only over elements
+      if (k > 0) {
+        Elementwise::Reset();
+      }
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        memmoveFct_(this->instance_.template ElementComponent<char>(
+                        this->subscripts_, procPtr.offset),
+            this->from_->template ElementComponent<const char>(
+                this->fromSubscripts_, procPtr.offset),
+            sizeof(typeInfo::ProcedurePointer));
+      }
+    }
+    if (noDataComponents) {
+      return StatOk;
+    }
+    Elementwise::Reset();
+  }
+  if (noDataComponents) {
+    return StatOk;
+  }
+  return StatContinue;
+}
+template RT_API_ATTRS int DerivedAssignTicket<false>::Begin(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket<true>::Begin(WorkQueue &);
+
+template <bool IS_COMPONENTWISE>
+RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
+    WorkQueue &workQueue) {
+  while (!this->IsComplete()) {
+    // Copy the data components (incl. the parent) first.
+    switch (this->component_->genre()) {
+    case typeInfo::Component::Genre::Data:
+      if (this->component_->category() == TypeCategory::Derived) {
+        Descriptor &toCompDesc{this->componentDescriptor_.descriptor()};
+        Descriptor &fromCompDesc{this->fromComponentDescriptor_.descriptor()};
+        this->component_->CreatePointerDescriptor(toCompDesc, this->instance_,
+            workQueue.terminator(), this->subscripts_);
+        this->component_->CreatePointerDescriptor(fromCompDesc, *this->from_,
+            workQueue.terminator(), this->fromSubscripts_);
+        const auto *componentDerived{this->component_->derivedType()};
+        this->Advance();
+        if (int status{workQueue.BeginAssign(toCompDesc, fromCompDesc, flags_,
+                memmoveFct_, componentDerived)};
+            status != StatOk) {
+          return status;
+        }
+      } else { // Component has intrinsic type; simply copy raw bytes
+        std::size_t componentByteSize{
+            this->component_->SizeInBytes(this->instance_)};
+        if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
+          std::size_t offset{this->component_->offset()};
+          char *to{this->instance_.template OffsetElement<char>(offset)};
+          const char *from{
+              this->from_->template OffsetElement<const char>(offset)};
+          std::size_t toElementStride{this->instance_.ElementBytes()};
+          std::size_t fromElementStride{
+              this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
+          if (toElementStride == fromElementStride &&
+              toElementStride == componentByteSize) {
+            memmoveFct_(to, from, this->elements_ * componentByteSize);
+          } else {
+            for (std::size_t n{this->elements_}; n--;
+                to += toElementStride, from += fromElementStride) {
+              memmoveFct_(to, from, componentByteSize);
+            }
+          }
+          this->Componentwise::Advance();
+        } else {
+          memmoveFct_(
+              this->instance_.template Element<char>(this->subscripts_) +
+                  this->component_->offset(),
+              this->from_->template Element<const char>(this->fromSubscripts_) +
+                  this->component_->offset(),
+              componentByteSize);
+          this->Advance();
+        }
+      }
+      break;
+    case typeInfo::Component::Genre::Pointer: {
+      std::size_t componentByteSize{
+          this->component_->SizeInBytes(this->instance_)};
+      if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
+        std::size_t offset{this->component_->offset()};
+        char *to{this->instance_.template OffsetElement<char>(offset)};
+        const char *from{
+            this->from_->template OffsetElement<const char>(offset)};
+        std::size_t toElementStride{this->instance_.ElementBytes()};
+        std::size_t fromElementStride{
+            this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
+        if (toElementStride == fromElementStride &&
+            toElementStride == componentByteSize) {
+          memmoveFct_(to, from, this->elements_ * componentByteSize);
+        } else {
+          for (std::size_t n{this->elements_}; n--;
+              to += toElementStride, from += fromElementStride) {
+            memmoveFct_(to, from, componentByteSize);
+          }
+        }
+        this->Componentwise::Advance();
+      } else {
+        memmoveFct_(this->instance_.template Element<char>(this->subscripts_) +
+                this->component_->offset(),
+            this->from_->template Element<const char>(this->fromSubscripts_) +
+                this->component_->offset(),
+            componentByteSize);
+        this->Advance();
+      }
+    } break;
+    case typeInfo::Component::Genre::Allocatable:
+    case typeInfo::Component::Genre::Automatic: {
+      auto *toDesc{reinterpret_cast<Descriptor *>(
+          this->instance_.template Element<char>(this->subscripts_) +
+          this->component_->offset())};
+      const auto *fromDesc{reinterpret_cast<const Descriptor *>(
+          this->from_->template Element<char>(this->fromSubscripts_) +
+          this->component_->offset())};
+      const auto *componentDerived{this->component_->derivedType()};
+      if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) {
+        if (toDesc->IsAllocated()) {
+          if (this->phase_ == 0) {
+            this->phase_++;
+            if (componentDerived && !componentDerived->noDestructionNeeded()) {
+              if (int status{workQueue.BeginDestroy(
+                      *toDesc, *componentDerived, /*finalize=*/false)};
+                  status != StatOk) {
+                return status;
+              }
+            }
+          }
+          toDesc->Deallocate();
+        }
+        this->Advance();
+      } else {
+        // Allocatable components of the LHS are unconditionally
+        // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
+        // unlike a "top-level" assignment to a variable, where
+        // deallocation is optional.
+        int nestedFlags{flags_};
+        if (!componentDerived ||
+            (componentDerived->noFinalizationNeeded() &&
+                componentDerived->noInitializationNeeded() &&
+                componentDerived->noDestructionNeeded())) {
+          // The actual deallocation might be avoidable when the existing
+          // location can be reoccupied.
+          nestedFlags |= MaybeReallocate | UpdateLHSBounds;
+        } else {
+          // Force LHS deallocation with DeallocateLHS flag.
+          nestedFlags |= DeallocateLHS;
+        }
+        this->Advance();
+        if (int status{workQueue.BeginAssign(*toDesc, *fromDesc, nestedFlags,
+                memmoveFct_, componentDerived)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    } break;
+    }
+  }
+  if (deallocateAfter_) {
+    deallocateAfter_->Deallocate();
+  }
+  return StatOk;
+}
+template RT_API_ATTRS int DerivedAssignTicket<false>::Continue(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket<true>::Continue(WorkQueue &);
 
 RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
     const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) {
@@ -582,7 +783,6 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from,
       }
     }
   }
-
   Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS);
 }
 
@@ -599,7 +799,6 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
 void RTDEF(CopyOutAssign)(
     Descriptor *var, Descriptor &temp, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
-
   // Copyout from the temporary must not cause any finalizations
   // for LHS. The variable must be properly initialized already.
   if (var) {
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 35037036f63e7..4e36b1e2edfc8 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -12,6 +12,7 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -30,180 +31,192 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
 }
 
 RT_API_ATTRS int Initialize(const Descriptor &instance,
-    const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat,
-    const Descriptor *errMsg) {
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t elements{instance.Elements()};
-  int stat{StatOk};
-  // Initialize data components in each element; the per-element iterations
-  // constitute the inner loops, not the outer ones
-  std::size_t myComponents{componentDesc.Elements()};
-  for (std::size_t k{0}; k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    SubscriptValue at[maxRank];
-    instance.GetLowerBounds(at);
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        Descriptor &allocDesc{
-            *instance.ElementComponent<Descriptor>(at, comp.offset())};
-        comp.EstablishDescriptor(allocDesc, instance, terminator);
+    const typeInfo::DerivedType &derived, Terminator &terminator, bool,
+    const Descriptor *) {
+  WorkQueue workQueue{terminator};
+  int status{workQueue.BeginInitialize(instance, derived)};
+  return status == StatContinue ? workQueue.Run() : status;
+}
+
+RT_API_ATTRS int InitializeTicket::Begin(WorkQueue &) {
+  // Initialize procedure pointer components in each element
+  const Descriptor &procPtrDesc{derived_.procPtr()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &comp{
+          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+      // Loop only over elements
+      if (k > 0) {
+        Elementwise::Reset();
+      }
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        auto &pptr{*instance_.ElementComponent<typeInfo::ProcedurePointer>(
+            subscripts_, comp.offset)};
+        pptr = comp.procInitialization;
+      }
+    }
+    if (IsComplete()) {
+      return StatOk;
+    }
+    Elementwise::Reset();
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
+      // Establish allocatable descriptors
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        Descriptor &allocDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        component_->EstablishDescriptor(
+            allocDesc, instance_, workQueue.terminator());
         allocDesc.raw().attribute = CFI_attribute_allocatable;
-        if (comp.genre() == typeInfo::Component::Genre::Automatic) {
-          stat = ReturnError(
-              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
-          if (stat == StatOk) {
-            if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
-              if (const auto *derived{addendum->derivedType()}) {
-                if (!derived->noInitializationNeeded()) {
-                  stat = Initialize(
-                      allocDesc, *derived, terminator, hasStat, errMsg);
-                }
-              }
-            }
-          }
-          if (stat != StatOk) {
-            break;
-          }
-        }
       }
-    } else if (const void *init{comp.initialization()}) {
+      SkipToNextComponent();
+    } else if (const void *init{component_->initialization()}) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
-      std::size_t bytes{comp.SizeInBytes(instance)};
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        char *ptr{instance.ElementComponent<char>(at, comp.offset())};
+      std::size_t bytes{component_->SizeInBytes(instance_)};
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        char *ptr{instance_.ElementComponent<char>(
+            subscripts_, component_->offset())};
         std::memcpy(ptr, init, bytes);
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Pointer) {
+      SkipToNextComponent();
+    } else if (component_->genre() == typeInfo::Component::Genre::Pointer) {
       // Data pointers without explicit initialization are established
       // so that they are valid right-hand side targets of pointer
       // assignment statements.
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        Descriptor &ptrDesc{
-            *instance.ElementComponent<Descriptor>(at, comp.offset())};
-        comp.EstablishDescriptor(ptrDesc, instance, terminator);
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        Descriptor &ptrDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        component_->EstablishDescriptor(
+            ptrDesc, instance_, workQueue.terminator());
         ptrDesc.raw().attribute = CFI_attribute_pointer;
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType() && !comp.derivedType()->noInitializationNeeded()) {
+      SkipToNextComponent();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
+        component_->derivedType() &&
+        !component_->derivedType()->noInitializationNeeded()) {
       // Default initialization of non-pointer non-allocatable/automatic
-      // data component.  Handles parent component's elements.  Recursive.
+      // data component.  Handles parent component's elements.
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, instance);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            instance.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        stat = Initialize(compDesc, compType, terminator, hasStat, errMsg);
-        if (stat != StatOk) {
-          break;
-        }
+      GetComponentExtents(extents, *component_, instance_);
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      const typeInfo::DerivedType &compType{*component_->derivedType()};
+      compDesc.Establish(compType,
+          instance_.ElementComponent<char>(subscripts_, component_->offset()),
+          component_->rank(), extents);
+      Advance();
+      if (int status{workQueue.BeginInitialize(compDesc, compType)};
+          status != StatOk) {
+        return status;
       }
+    } else {
+      SkipToNextComponent();
     }
   }
-  // Initialize procedure pointer components in each element
-  const Descriptor &procPtrDesc{derived.procPtr()};
-  std::size_t myProcPtrs{procPtrDesc.Elements()};
-  for (std::size_t k{0}; k < myProcPtrs; ++k) {
-    const auto &comp{
-        *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
-    SubscriptValue at[maxRank];
-    instance.GetLowerBounds(at);
-    for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-      auto &pptr{*instance.ElementComponent<typeInfo::ProcedurePointer>(
-          at, comp.offset)};
-      pptr = comp.procInitialization;
-    }
-  }
-  return stat;
+  return StatOk;
 }
 
 RT_API_ATTRS int InitializeClone(const Descriptor &clone,
-    const Descriptor &orig, const typeInfo::DerivedType &derived,
+    const Descriptor &original, const typeInfo::DerivedType &derived,
     Terminator &terminator, bool hasStat, const Descriptor *errMsg) {
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t elements{orig.Elements()};
-  int stat{StatOk};
-
-  // Skip pointers and unallocated variables.
-  if (orig.IsPointer() || !orig.IsAllocated()) {
-    return stat;
+  if (original.IsPointer() || !original.IsAllocated()) {
+    return StatOk; // nothing to do
+  } else {
+    WorkQueue workQueue{terminator};
+    int status{workQueue.BeginInitializeClone(
+        clone, original, derived, hasStat, errMsg)};
+    return status == StatContinue ? workQueue.Run() : status;
   }
-  // Initialize each data component.
-  std::size_t components{componentDesc.Elements()};
-  for (std::size_t i{0}; i < components; ++i) {
-    const typeInfo::Component &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(i)};
-    SubscriptValue at[maxRank];
-    orig.GetLowerBounds(at);
-    // Allocate allocatable components that are also allocated in the original
-    // object.
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable) {
-      // Initialize each element.
-      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
-        Descriptor &origDesc{
-            *orig.ElementComponent<Descriptor>(at, comp.offset())};
-        Descriptor &cloneDesc{
-            *clone.ElementComponent<Descriptor>(at, comp.offset())};
-        if (origDesc.IsAllocated()) {
+}
+
+RT_API_ATTRS int InitializeCloneTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
+      Descriptor &origDesc{*instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      if (origDesc.IsAllocated()) {
+        Descriptor &cloneDesc{*clone_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        if (phase_ == 0) {
+          ++phase_;
           cloneDesc.ApplyMold(origDesc, origDesc.rank());
-          stat = ReturnError(
-              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
-          if (stat == StatOk) {
-            if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
-              if (const typeInfo::DerivedType *
-                  derived{addendum->derivedType()}) {
-                if (!derived->noInitializationNeeded()) {
-                  // Perform default initialization for the allocated element.
-                  stat = Initialize(
-                      cloneDesc, *derived, terminator, hasStat, errMsg);
-                }
-                // Initialize derived type's allocatables.
-                if (stat == StatOk) {
-                  stat = InitializeClone(cloneDesc, origDesc, *derived,
-                      terminator, hasStat, errMsg);
+          if (int stat{ReturnError(workQueue.terminator(),
+                  cloneDesc.Allocate(kNoAsyncObject), errMsg_, hasStat_)};
+              stat != StatOk) {
+            return stat;
+          }
+          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
+              if (!derived->noInitializationNeeded()) {
+                // Perform default initialization for the allocated element.
+                if (int status{workQueue.BeginInitialize(cloneDesc, *derived)};
+                    status != StatOk) {
+                  return status;
                 }
               }
             }
           }
         }
-        if (stat != StatOk) {
-          break;
+        if (phase_ == 1) {
+          ++phase_;
+          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
+              // Initialize derived type's allocatables.
+              if (int status{workQueue.BeginInitializeClone(
+                      cloneDesc, origDesc, *derived, hasStat_, errMsg_)};
+                  status != StatOk) {
+                return status;
+              }
+            }
+          }
         }
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType()) {
-      // Handle nested derived types.
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, orig);
-      // Data components don't have descriptors, allocate them.
-      StaticDescriptor<maxRank, true, 0> origStaticDesc;
-      StaticDescriptor<maxRank, true, 0> cloneStaticDesc;
-      Descriptor &origDesc{origStaticDesc.descriptor()};
-      Descriptor &cloneDesc{cloneStaticDesc.descriptor()};
-      // Initialize each element.
-      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
+      Advance();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
+      if (component_->derivedType()) {
+        // Handle nested derived types.
+        const typeInfo::DerivedType &compType{*component_->derivedType()};
+        SubscriptValue extents[maxRank];
+        GetComponentExtents(extents, *component_, instance_);
+        Descriptor &origDesc{componentDescriptor_.descriptor()};
+        Descriptor &cloneDesc{cloneComponentDescriptor_.descriptor()};
         origDesc.Establish(compType,
-            orig.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
+            instance_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
         cloneDesc.Establish(compType,
-            clone.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        stat = InitializeClone(
-            cloneDesc, origDesc, compType, terminator, hasStat, errMsg);
-        if (stat != StatOk) {
-          break;
+            clone_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
+        Advance();
+        if (int status{workQueue.BeginInitializeClone(
+                cloneDesc, origDesc, compType, hasStat_, errMsg_)};
+            status != StatOk) {
+          return status;
         }
+      } else {
+        SkipToNextComponent();
       }
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatOk;
+}
+
+// Fortran 2018 subclause 7.5.6.2
+RT_API_ATTRS void Finalize(const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived, Terminator *terminator) {
+  if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) {
+    Terminator stubTerminator{"Finalize() in Fortran runtime", 0};
+    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
+    if (workQueue.BeginFinalize(descriptor, derived) == StatContinue) {
+      workQueue.Run();
     }
   }
-  return stat;
 }
 
 static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
@@ -221,7 +234,7 @@ static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
 }
 
 static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
+    const typeInfo::DerivedType &derived, Terminator &terminator) {
   if (const auto *special{FindFinal(derived, descriptor.rank())}) {
     if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) {
       std::size_t elements{descriptor.Elements()};
@@ -258,9 +271,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
         copy = descriptor;
         copy.set_base_addr(nullptr);
         copy.raw().attribute = CFI_attribute_allocatable;
-        Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
-        RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
-            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
+        RUNTIME_CHECK(terminator, copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
         ShallowCopyDiscontiguousToContiguous(copy, descriptor);
         argDescriptor = &copy;
       }
@@ -284,87 +295,94 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
   }
 }
 
-// Fortran 2018 subclause 7.5.6.2
-RT_API_ATTRS void Finalize(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) {
-    return;
-  }
-  CallFinalSubroutine(descriptor, derived, terminator);
-  const auto *parentType{derived.GetParentType()};
-  bool recurse{parentType && !parentType->noFinalizationNeeded()};
+RT_API_ATTRS int FinalizeTicket::Begin(WorkQueue &workQueue) {
+  CallFinalSubroutine(instance_, derived_, workQueue.terminator());
   // If there's a finalizable parent component, handle it last, as required
   // by the Fortran standard (7.5.6.2), and do so recursively with the same
   // descriptor so that the rank is preserved.
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t myComponents{componentDesc.Elements()};
-  std::size_t elements{descriptor.Elements()};
-  for (auto k{recurse ? std::size_t{1}
-                      /* skip first component, it's the parent */
-                      : 0};
-       k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    SubscriptValue at[maxRank];
-    descriptor.GetLowerBounds(at);
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable &&
-        comp.category() == TypeCategory::Derived) {
+  finalizableParentType_ = derived_.GetParentType();
+  if (finalizableParentType_) {
+    if (finalizableParentType_->noFinalizationNeeded()) {
+      finalizableParentType_ = nullptr;
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable &&
+        component_->category() == TypeCategory::Derived) {
       // Component may be polymorphic or unlimited polymorphic. Need to use the
       // dynamic type to check whether finalization is needed.
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        const Descriptor &compDesc{
-            *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-        if (compDesc.IsAllocated()) {
-          if (const DescriptorAddendum * addendum{compDesc.Addendum()}) {
-            if (const typeInfo::DerivedType *
-                compDynamicType{addendum->derivedType()}) {
-              if (!compDynamicType->noFinalizationNeeded()) {
-                Finalize(compDesc, *compDynamicType, terminator);
+      const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      Advance();
+      if (compDesc.IsAllocated()) {
+        if (const DescriptorAddendum *addendum{compDesc.Addendum()}) {
+          if (const typeInfo::DerivedType *compDynamicType{
+                  addendum->derivedType()}) {
+            if (!compDynamicType->noFinalizationNeeded()) {
+              if (int status{
+                      workQueue.BeginFinalize(compDesc, *compDynamicType)};
+                  status != StatOk) {
+                return status;
               }
             }
           }
         }
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      if (const typeInfo::DerivedType * compType{comp.derivedType()}) {
-        if (!compType->noFinalizationNeeded()) {
-          for (std::size_t j{0}; j++ < elements;
-               descriptor.IncrementSubscripts(at)) {
-            const Descriptor &compDesc{
-                *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-            if (compDesc.IsAllocated()) {
-              Finalize(compDesc, *compType, terminator);
-            }
+    } else if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
+        component_->genre() == typeInfo::Component::Genre::Automatic) {
+      if (const typeInfo::DerivedType *compType{component_->derivedType()};
+          compType && !compType->noFinalizationNeeded()) {
+        const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        Advance();
+        if (compDesc.IsAllocated()) {
+          if (int status{workQueue.BeginFinalize(compDesc, *compType)};
+              status != StatOk) {
+            return status;
           }
         }
+      } else {
+        SkipToNextComponent();
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType() && !comp.derivedType()->noFinalizationNeeded()) {
+    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
+        component_->derivedType() &&
+        !component_->derivedType()->noFinalizationNeeded()) {
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, descriptor);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        Finalize(compDesc, compType, terminator);
+      GetComponentExtents(extents, *component_, instance_);
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      const typeInfo::DerivedType &compType{*component_->derivedType()};
+      compDesc.Establish(compType,
+          instance_.ElementComponent<char>(subscripts_, component_->offset()),
+          component_->rank(), extents);
+      Advance();
+      if (int status{workQueue.BeginFinalize(compDesc, compType)};
+          status != StatOk) {
+        return status;
       }
+    } else {
+      SkipToNextComponent();
     }
   }
-  if (recurse) {
-    StaticDescriptor<maxRank, true, 8 /*?*/> statDesc;
-    Descriptor &tmpDesc{statDesc.descriptor()};
-    tmpDesc = descriptor;
+  // Last, do the parent component, if any and finalizable.
+  if (finalizableParentType_) {
+    Descriptor &tmpDesc{componentDescriptor_.descriptor()};
+    tmpDesc = instance_;
     tmpDesc.raw().attribute = CFI_attribute_pointer;
-    tmpDesc.Addendum()->set_derivedType(parentType);
-    tmpDesc.raw().elem_len = parentType->sizeInBytes();
-    Finalize(tmpDesc, *parentType, terminator);
+    tmpDesc.Addendum()->set_derivedType(finalizableParentType_);
+    tmpDesc.raw().elem_len = finalizableParentType_->sizeInBytes();
+    const auto &parentType{*finalizableParentType_};
+    finalizableParentType_ = nullptr;
+    // Don't return StatOk here if the nested FInalize is still running;
+    // it needs this->componentDescriptor_.
+    return workQueue.BeginFinalize(tmpDesc, parentType);
   }
+  return StatOk;
 }
 
 // The order of finalization follows Fortran 2018 7.5.6.2, with
@@ -373,51 +391,71 @@ RT_API_ATTRS void Finalize(const Descriptor &descriptor,
 // preceding any deallocation.
 RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize,
     const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) {
-    return;
+  if (descriptor.IsAllocated() && !derived.noDestructionNeeded()) {
+    Terminator stubTerminator{"Destroy() in Fortran runtime", 0};
+    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
+    if (workQueue.BeginDestroy(descriptor, derived, finalize) == StatContinue) {
+      workQueue.Run();
+    }
   }
-  if (finalize && !derived.noFinalizationNeeded()) {
-    Finalize(descriptor, derived, terminator);
+}
+
+RT_API_ATTRS int DestroyTicket::Begin(WorkQueue &workQueue) {
+  if (finalize_ && !derived_.noFinalizationNeeded()) {
+    if (int status{workQueue.BeginFinalize(instance_, derived_)};
+        status != StatOk && status != StatContinue) {
+      return status;
+    }
   }
+  return StatContinue;
+}
+
+RT_API_ATTRS int DestroyTicket::Continue(WorkQueue &workQueue) {
   // Deallocate all direct and indirect allocatable and automatic components.
   // Contrary to finalization, the order of deallocation does not matter.
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t myComponents{componentDesc.Elements()};
-  std::size_t elements{descriptor.Elements()};
-  SubscriptValue at[maxRank];
-  descriptor.GetLowerBounds(at);
-  for (std::size_t k{0}; k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    const bool destroyComp{
-        comp.derivedType() && !comp.derivedType()->noDestructionNeeded()};
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      for (std::size_t j{0}; j < elements; ++j) {
-        Descriptor *d{
-            descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-        if (destroyComp) {
-          Destroy(*d, /*finalize=*/false, *comp.derivedType(), terminator);
+  while (!IsComplete()) {
+    const auto *componentDerived{component_->derivedType()};
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
+        component_->genre() == typeInfo::Component::Genre::Automatic) {
+      Descriptor *d{instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      if (d->IsAllocated()) {
+        if (phase_ == 0) {
+          ++phase_;
+          if (componentDerived && !componentDerived->noDestructionNeeded()) {
+            if (int status{workQueue.BeginDestroy(
+                    *d, *componentDerived, /*finalize=*/false)};
+                status != StatOk) {
+              return status;
+            }
+          }
         }
         d->Deallocate();
-        descriptor.IncrementSubscripts(at);
       }
-    } else if (destroyComp &&
-        comp.genre() == typeInfo::Component::Genre::Data) {
-      SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, descriptor);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
+      Advance();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
+      if (!componentDerived || componentDerived->noDestructionNeeded()) {
+        SkipToNextComponent();
+      } else {
+        SubscriptValue extents[maxRank];
+        GetComponentExtents(extents, *component_, instance_);
+        Descriptor &compDesc{componentDescriptor_.descriptor()};
+        const typeInfo::DerivedType &compType{*componentDerived};
         compDesc.Establish(compType,
-            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        Destroy(compDesc, /*finalize=*/false, *comp.derivedType(), terminator);
+            instance_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
+        Advance();
+        if (int status{workQueue.BeginDestroy(
+                compDesc, *componentDerived, /*finalize=*/false)};
+            status != StatOk) {
+          return status;
+        }
       }
+    } else {
+      SkipToNextComponent();
     }
   }
+  return StatOk;
 }
 
 RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) {
diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp
index 3db1455af52fe..e7b99e6fc3a2b 100644
--- a/flang-rt/lib/runtime/descriptor-io.cpp
+++ b/flang-rt/lib/runtime/descriptor-io.cpp
@@ -7,15 +7,44 @@
 //===----------------------------------------------------------------------===//
 
 #include "descriptor-io.h"
+#include "edit-input.h"
+#include "edit-output.h"
+#include "unit.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/namelist.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
+#include "flang/Common/optional.h"
 #include "flang/Common/restorer.h"
+#include "flang/Common/uint128.h"
+#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/freestanding-tools.h"
 
+// Implementation of I/O data list item transfers based on descriptors.
+// (All I/O items come through here so that the code is exercised for test;
+// some scalar I/O data transfer APIs could be changed to bypass their use
+// of descriptors in the future for better efficiency.)
+
 namespace Fortran::runtime::io::descr {
 RT_OFFLOAD_API_GROUP_BEGIN
 
+template <typename A>
+inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
+    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
+  A *p{descriptor.Element<A>(subscripts)};
+  if (!p) {
+    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
+                                 "address or subscripts out of range");
+  }
+  return *p;
+}
+
 // Defined formatted I/O (maybe)
-Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+static RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
+    IoStatementState &io, const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special,
     const SubscriptValue subscripts[]) {
   Fortran::common::optional<DataEdit> peek{
@@ -65,10 +94,13 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
       // I/O subroutine reads counts towards READ(SIZE=).
       startPos = io.InquirePos();
     }
+    const auto *bindings{
+        derived.binding().OffsetElement<const typeInfo::Binding>()};
     if (special.IsArgDescriptor(0)) {
       // "dtv" argument is "class(t)", pass a descriptor
       auto *p{special.GetProc<void (*)(const Descriptor &, int &, char *,
-          const Descriptor &, int &, char *, std::size_t, std::size_t)>()};
+          const Descriptor &, int &, char *, std::size_t, std::size_t)>(
+          bindings)};
       StaticDescriptor<1, true, 10 /*?*/> elementStatDesc;
       Descriptor &elementDesc{elementStatDesc.descriptor()};
       elementDesc.Establish(
@@ -79,7 +111,8 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
     } else {
       // "dtv" argument is "type(t)", pass a raw pointer
       auto *p{special.GetProc<void (*)(const void *, int &, char *,
-          const Descriptor &, int &, char *, std::size_t, std::size_t)>()};
+          const Descriptor &, int &, char *, std::size_t, std::size_t)>(
+          bindings)};
       p(descriptor.Element<char>(subscripts), unit, ioType, vListDesc, ioStat,
           ioMsg, ioTypeLen, sizeof ioMsg);
     }
@@ -104,8 +137,8 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
 }
 
 // Defined unformatted I/O
-bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived,
+static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io,
+    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special) {
   // Unformatted I/O must have an external unit (or child thereof).
   IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -121,10 +154,12 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
   std::size_t numElements{descriptor.Elements()};
   SubscriptValue subscripts[maxRank];
   descriptor.GetLowerBounds(subscripts);
+  const auto *bindings{
+      derived.binding().OffsetElement<const typeInfo::Binding>()};
   if (special.IsArgDescriptor(0)) {
     // "dtv" argument is "class(t)", pass a descriptor
     auto *p{special.GetProc<void (*)(
-        const Descriptor &, int &, int &, char *, std::size_t)>()};
+        const Descriptor &, int &, int &, char *, std::size_t)>(bindings)};
     StaticDescriptor<1, true, 10 /*?*/> elementStatDesc;
     Descriptor &elementDesc{elementStatDesc.descriptor()};
     elementDesc.Establish(derived, nullptr, 0, nullptr, CFI_attribute_pointer);
@@ -137,8 +172,9 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
     }
   } else {
     // "dtv" argument is "type(t)", pass a raw pointer
-    auto *p{special.GetProc<void (*)(
-        const void *, int &, int &, char *, std::size_t)>()};
+    auto *p{special
+            .GetProc<void (*)(const void *, int &, int &, char *, std::size_t)>(
+                bindings)};
     for (; numElements-- > 0; descriptor.IncrementSubscripts(subscripts)) {
       p(descriptor.Element<char>(subscripts), unit, ioStat, ioMsg,
           sizeof ioMsg);
@@ -152,5 +188,619 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
   return handler.GetIoStat() == IostatOk;
 }
 
+// Per-category descriptor-based I/O templates
+
+// TODO (perhaps as a nontrivial but small starter project): implement
+// automatic repetition counts, like "10*3.14159", for list-directed and
+// NAMELIST array output.
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
+    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditIntegerInput(
+                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedIntegerIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedRealIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedRealIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedComplexIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  bool isListOutput{
+      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
+    if (isListOutput) {
+      DataEdit rEdit, iEdit;
+      rEdit.descriptor = DataEdit::ListDirectedRealPart;
+      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
+      rEdit.modes = iEdit.modes = io.mutableModes();
+      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
+          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
+        return false;
+      }
+    } else {
+      for (int k{0}; k < 2; ++k, ++x) {
+        auto edit{io.GetNextDataEdit()};
+        if (!edit) {
+          return false;
+        } else if constexpr (DIR == Direction::Output) {
+          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
+            return false;
+          }
+        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
+          break;
+        } else if (EditRealInput<KIND>(
+                       io, *edit, reinterpret_cast<void *>(x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedComplexIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <typename A, Direction DIR>
+inline RT_API_ATTRS bool FormattedCharacterIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditCharacterOutput(io, *edit, x, length)) {
+          return false;
+        }
+      } else { // input
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          if (EditCharacterInput(io, *edit, x, length)) {
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedCharacterIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedLogicalIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditLogicalOutput(io, *edit, x != 0)) {
+          return false;
+        }
+      } else {
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          bool truth{};
+          if (EditLogicalInput(io, *edit, truth)) {
+            x = truth;
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedLogicalIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <Direction DIR>
+RT_API_ATTRS int DerivedIoTicket<DIR>::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Data) {
+      // Create a descriptor for the component
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      component_->CreatePointerDescriptor(
+          compDesc, instance_, io_.GetIoErrorHandler(), subscripts_);
+      Advance();
+      if (int status{workQueue.BeginDescriptorIo<DIR>(
+              io_, compDesc, table_, anyIoTookPlace_)};
+          status != StatOk) {
+        return status;
+      }
+    } else {
+      // Component is itself a descriptor
+      char *pointer{
+          instance_.Element<char>(subscripts_) + component_->offset()};
+      const Descriptor &compDesc{
+          *reinterpret_cast<const Descriptor *>(pointer)};
+      Advance();
+      if (compDesc.IsAllocated()) {
+        if (int status{workQueue.BeginDescriptorIo<DIR>(
+                io_, compDesc, table_, anyIoTookPlace_)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    }
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DerivedIoTicket<Direction::Output>::Continue(
+    WorkQueue &);
+template RT_API_ATTRS int DerivedIoTicket<Direction::Input>::Continue(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS int DescriptorIoTicket<DIR>::Begin(WorkQueue &workQueue) {
+  IoErrorHandler &handler{io_.GetIoErrorHandler()};
+  if (handler.InError()) {
+    return handler.GetIoStat();
+  }
+  if (!io_.get_if<IoDirectionState<DIR>>()) {
+    handler.Crash("DescriptorIO() called for wrong I/O direction");
+    return handler.GetIoStat();
+  }
+  if constexpr (DIR == Direction::Input) {
+    if (!io_.BeginReadingRecord()) {
+      return StatOk;
+    }
+  }
+  if (!io_.get_if<FormattedIoStatementState<DIR>>()) {
+    // Unformatted I/O
+    IoErrorHandler &handler{io_.GetIoErrorHandler()};
+    const DescriptorAddendum *addendum{instance_.Addendum()};
+    if (const typeInfo::DerivedType *type{
+            addendum ? addendum->derivedType() : nullptr}) {
+      // derived type unformatted I/O
+      if (table_) {
+        if (const auto *definedIo{table_->Find(*type,
+                DIR == Direction::Input
+                    ? common::DefinedIo::ReadUnformatted
+                    : common::DefinedIo::WriteUnformatted)}) {
+          if (definedIo->subroutine) {
+            typeInfo::SpecialBinding special{DIR == Direction::Input
+                    ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                    : typeInfo::SpecialBinding::Which::WriteUnformatted,
+                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+                false};
+            if (DefinedUnformattedIo(io_, instance_, *type, special)) {
+              anyIoTookPlace_ = true;
+              return StatOk;
+            }
+          } else {
+            int status{workQueue.BeginDerivedIo<DIR>(
+                io_, instance_, *type, table_, anyIoTookPlace_)};
+            return status == StatContinue ? StatOk : status; // done here
+          }
+        }
+      }
+      if (const typeInfo::SpecialBinding *special{
+              type->FindSpecialBinding(DIR == Direction::Input
+                      ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                      : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
+        if (!table_ || !table_->ignoreNonTbpEntries || special->IsTypeBound()) {
+          // defined derived type unformatted I/O
+          if (DefinedUnformattedIo(io_, instance_, *type, *special)) {
+            anyIoTookPlace_ = true;
+            return StatOk;
+          } else {
+            return IostatEnd;
+          }
+        }
+      }
+      // Default derived type unformatted I/O
+      // TODO: If no component at any level has defined READ or WRITE
+      // (as appropriate), the elements are contiguous, and no byte swapping
+      // is active, do a block transfer via the code below.
+      int status{workQueue.BeginDerivedIo<DIR>(
+          io_, instance_, *type, table_, anyIoTookPlace_)};
+      return status == StatContinue ? StatOk : status; // done here
+    } else {
+      // intrinsic type unformatted I/O
+      auto *externalUnf{io_.get_if<ExternalUnformattedIoStatementState<DIR>>()};
+      ChildUnformattedIoStatementState<DIR> *childUnf{nullptr};
+      InquireIOLengthState *inq{nullptr};
+      bool swapEndianness{false};
+      if (externalUnf) {
+        swapEndianness = externalUnf->unit().swapEndianness();
+      } else {
+        childUnf = io_.get_if<ChildUnformattedIoStatementState<DIR>>();
+        if (!childUnf) {
+          inq = DIR == Direction::Output ? io_.get_if<InquireIOLengthState>()
+                                         : nullptr;
+          RUNTIME_CHECK(handler, inq != nullptr);
+        }
+      }
+      std::size_t elementBytes{instance_.ElementBytes()};
+      std::size_t swappingBytes{elementBytes};
+      if (auto maybeCatAndKind{instance_.type().GetCategoryAndKind()}) {
+        // Byte swapping units can be smaller than elements, namely
+        // for COMPLEX and CHARACTER.
+        if (maybeCatAndKind->first == TypeCategory::Character) {
+          // swap each character position independently
+          swappingBytes = maybeCatAndKind->second; // kind
+        } else if (maybeCatAndKind->first == TypeCategory::Complex) {
+          // swap real and imaginary components independently
+          swappingBytes /= 2;
+        }
+      }
+      using CharType =
+          std::conditional_t<DIR == Direction::Output, const char, char>;
+      auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
+        if constexpr (DIR == Direction::Output) {
+          return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
+              : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
+                             : inq->Emit(&x, totalBytes, swappingBytes);
+        } else {
+          return externalUnf
+              ? externalUnf->Receive(&x, totalBytes, swappingBytes)
+              : childUnf->Receive(&x, totalBytes, swappingBytes);
+        }
+      }};
+      if (!swapEndianness &&
+          instance_.IsContiguous()) { // contiguous unformatted I/O
+        char &x{ExtractElement<char>(io_, instance_, subscripts_)};
+        if (Transfer(x, elements_ * elementBytes)) {
+          anyIoTookPlace_ = true;
+        } else {
+          return IostatEnd;
+        }
+      } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
+        for (; !IsComplete(); Advance()) {
+          char &x{ExtractElement<char>(io_, instance_, subscripts_)};
+          if (Transfer(x, elementBytes)) {
+            anyIoTookPlace_ = true;
+          } else {
+            return IostatEnd;
+          }
+        }
+      }
+    }
+    // Unformatted I/O never needs to call Continue().
+    return StatOk;
+  }
+  // Formatted I/O
+  if (auto catAndKind{instance_.type().GetCategoryAndKind()}) {
+    TypeCategory cat{catAndKind->first};
+    int kind{catAndKind->second};
+    bool any{false};
+    switch (cat) {
+    case TypeCategory::Integer:
+      switch (kind) {
+      case 1:
+        any = FormattedIntegerIO<1, DIR>(io_, instance_, true);
+        break;
+      case 2:
+        any = FormattedIntegerIO<2, DIR>(io_, instance_, true);
+        break;
+      case 4:
+        any = FormattedIntegerIO<4, DIR>(io_, instance_, true);
+        break;
+      case 8:
+        any = FormattedIntegerIO<8, DIR>(io_, instance_, true);
+        break;
+      case 16:
+        any = FormattedIntegerIO<16, DIR>(io_, instance_, true);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Unsigned:
+      switch (kind) {
+      case 1:
+        any = FormattedIntegerIO<1, DIR>(io_, instance_, false);
+        break;
+      case 2:
+        any = FormattedIntegerIO<2, DIR>(io_, instance_, false);
+        break;
+      case 4:
+        any = FormattedIntegerIO<4, DIR>(io_, instance_, false);
+        break;
+      case 8:
+        any = FormattedIntegerIO<8, DIR>(io_, instance_, false);
+        break;
+      case 16:
+        any = FormattedIntegerIO<16, DIR>(io_, instance_, false);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Real:
+      switch (kind) {
+      case 2:
+        any = FormattedRealIO<2, DIR>(io_, instance_);
+        break;
+      case 3:
+        any = FormattedRealIO<3, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedRealIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedRealIO<8, DIR>(io_, instance_);
+        break;
+      case 10:
+        any = FormattedRealIO<10, DIR>(io_, instance_);
+        break;
+      // TODO: case double/double
+      case 16:
+        any = FormattedRealIO<16, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Complex:
+      switch (kind) {
+      case 2:
+        any = FormattedComplexIO<2, DIR>(io_, instance_);
+        break;
+      case 3:
+        any = FormattedComplexIO<3, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedComplexIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedComplexIO<8, DIR>(io_, instance_);
+        break;
+      case 10:
+        any = FormattedComplexIO<10, DIR>(io_, instance_);
+        break;
+      // TODO: case double/double
+      case 16:
+        any = FormattedComplexIO<16, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Character:
+      switch (kind) {
+      case 1:
+        any = FormattedCharacterIO<char, DIR>(io_, instance_);
+        break;
+      case 2:
+        any = FormattedCharacterIO<char16_t, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedCharacterIO<char32_t, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Logical:
+      switch (kind) {
+      case 1:
+        any = FormattedLogicalIO<1, DIR>(io_, instance_);
+        break;
+      case 2:
+        any = FormattedLogicalIO<2, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedLogicalIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedLogicalIO<8, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Derived: {
+      // Derived type information must be present for formatted I/O.
+      IoErrorHandler &handler{io_.GetIoErrorHandler()};
+      const DescriptorAddendum *addendum{instance_.Addendum()};
+      RUNTIME_CHECK(handler, addendum != nullptr);
+      derived_ = addendum->derivedType();
+      RUNTIME_CHECK(handler, derived_ != nullptr);
+      if (table_) {
+        if (const auto *definedIo{table_->Find(*derived_,
+                DIR == Direction::Input ? common::DefinedIo::ReadFormatted
+                                        : common::DefinedIo::WriteFormatted)}) {
+          if (definedIo->subroutine) {
+            nonTbpSpecial_.emplace(DIR == Direction::Input
+                    ? typeInfo::SpecialBinding::Which::ReadFormatted
+                    : typeInfo::SpecialBinding::Which::WriteFormatted,
+                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+                false);
+            special_ = &*nonTbpSpecial_;
+          }
+        }
+      }
+      if (!special_) {
+        if (const typeInfo::SpecialBinding *binding{
+                derived_->FindSpecialBinding(DIR == Direction::Input
+                        ? typeInfo::SpecialBinding::Which::ReadFormatted
+                        : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
+          if (!table_ || !table_->ignoreNonTbpEntries ||
+              binding->IsTypeBound()) {
+            special_ = binding;
+          }
+        }
+      }
+      return StatContinue;
+    }
+    }
+    if (any) {
+      anyIoTookPlace_ = true;
+    } else {
+      return IostatEnd;
+    }
+  } else {
+    handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
+        static_cast<int>(instance_.type().raw()));
+    return handler.GetIoStat();
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Begin(
+    WorkQueue &);
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Begin(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS int DescriptorIoTicket<DIR>::Continue(WorkQueue &workQueue) {
+  // Only derived type formatted I/O gets here.
+  while (!IsComplete()) {
+    if (special_) {
+      if (auto defined{DefinedFormattedIo(
+              io_, instance_, *derived_, *special_, subscripts_)}) {
+        anyIoTookPlace_ |= *defined;
+        Advance();
+        continue;
+      }
+    }
+    Descriptor &elementDesc{elementDescriptor_.descriptor()};
+    elementDesc.Establish(
+        *derived_, nullptr, 0, nullptr, CFI_attribute_pointer);
+    elementDesc.set_base_addr(instance_.Element<char>(subscripts_));
+    Advance();
+    if (int status{workQueue.BeginDerivedIo<DIR>(
+            io_, elementDesc, *derived_, table_, anyIoTookPlace_)};
+        status != StatOk) {
+      return status;
+    }
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Continue(
+    WorkQueue &);
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Continue(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
+    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
+  bool anyIoTookPlace{false};
+  WorkQueue workQueue{io.GetIoErrorHandler()};
+  if (workQueue.BeginDescriptorIo<DIR>(io, descriptor, table, anyIoTookPlace) ==
+      StatContinue) {
+    workQueue.Run();
+  }
+  return anyIoTookPlace;
+}
+
+template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+
 RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io::descr
diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h
index eb60f106c9203..88ad59bd24b53 100644
--- a/flang-rt/lib/runtime/descriptor-io.h
+++ b/flang-rt/lib/runtime/descriptor-io.h
@@ -9,619 +9,27 @@
 #ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 #define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 
-// Implementation of I/O data list item transfers based on descriptors.
-// (All I/O items come through here so that the code is exercised for test;
-// some scalar I/O data transfer APIs could be changed to bypass their use
-// of descriptors in the future for better efficiency.)
+#include "flang-rt/runtime/connection.h"
 
-#include "edit-input.h"
-#include "edit-output.h"
-#include "unit.h"
-#include "flang-rt/runtime/descriptor.h"
-#include "flang-rt/runtime/io-stmt.h"
-#include "flang-rt/runtime/namelist.h"
-#include "flang-rt/runtime/terminator.h"
-#include "flang-rt/runtime/type-info.h"
-#include "flang/Common/optional.h"
-#include "flang/Common/uint128.h"
-#include "flang/Runtime/cpp-type.h"
+namespace Fortran::runtime {
+class Descriptor;
+} // namespace Fortran::runtime
 
-namespace Fortran::runtime::io::descr {
-template <typename A>
-inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
-    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
-  A *p{descriptor.Element<A>(subscripts)};
-  if (!p) {
-    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
-                                 "address or subscripts out of range");
-  }
-  return *p;
-}
-
-// Per-category descriptor-based I/O templates
-
-// TODO (perhaps as a nontrivial but small starter project): implement
-// automatic repetition counts, like "10*3.14159", for list-directed and
-// NAMELIST array output.
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
-    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditIntegerInput(
-                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedIntegerIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedRealIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedRealIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
+namespace Fortran::runtime::io {
+class IoStatementState;
+struct NonTbpDefinedIoTable;
+} // namespace Fortran::runtime::io
 
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedComplexIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  bool isListOutput{
-      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
-    if (isListOutput) {
-      DataEdit rEdit, iEdit;
-      rEdit.descriptor = DataEdit::ListDirectedRealPart;
-      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
-      rEdit.modes = iEdit.modes = io.mutableModes();
-      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
-          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
-        return false;
-      }
-    } else {
-      for (int k{0}; k < 2; ++k, ++x) {
-        auto edit{io.GetNextDataEdit()};
-        if (!edit) {
-          return false;
-        } else if constexpr (DIR == Direction::Output) {
-          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
-            return false;
-          }
-        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
-          break;
-        } else if (EditRealInput<KIND>(
-                       io, *edit, reinterpret_cast<void *>(x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedComplexIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <typename A, Direction DIR>
-inline RT_API_ATTRS bool FormattedCharacterIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditCharacterOutput(io, *edit, x, length)) {
-          return false;
-        }
-      } else { // input
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          if (EditCharacterInput(io, *edit, x, length)) {
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedCharacterIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedLogicalIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditLogicalOutput(io, *edit, x != 0)) {
-          return false;
-        }
-      } else {
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          bool truth{};
-          if (EditLogicalInput(io, *edit, truth)) {
-            x = truth;
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedLogicalIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
+namespace Fortran::runtime::io::descr {
 
 template <Direction DIR>
-static RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
+RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
     const NonTbpDefinedIoTable * = nullptr);
 
-// For intrinsic (not defined) derived type I/O, formatted & unformatted
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io,
-    const typeInfo::Component &component, const Descriptor &origDescriptor,
-    const SubscriptValue origSubscripts[], Terminator &terminator,
-    const NonTbpDefinedIoTable *table) {
-#if !defined(RT_DEVICE_AVOID_RECURSION)
-  if (component.genre() == typeInfo::Component::Genre::Data) {
-    // Create a descriptor for the component
-    StaticDescriptor<maxRank, true, 16 /*?*/> statDesc;
-    Descriptor &desc{statDesc.descriptor()};
-    component.CreatePointerDescriptor(
-        desc, origDescriptor, terminator, origSubscripts);
-    return DescriptorIO<DIR>(io, desc, table);
-  } else {
-    // Component is itself a descriptor
-    char *pointer{
-        origDescriptor.Element<char>(origSubscripts) + component.offset()};
-    const Descriptor &compDesc{*reinterpret_cast<const Descriptor *>(pointer)};
-    return compDesc.IsAllocated() && DescriptorIO<DIR>(io, compDesc, table);
-  }
-#else
-  terminator.Crash("not yet implemented: component IO");
-#endif
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &type,
-    const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const Descriptor &compArray{type.component()};
-  RUNTIME_CHECK(handler, compArray.rank() == 1);
-  std::size_t numComponents{compArray.Elements()};
-  SubscriptValue at[maxRank];
-  compArray.GetLowerBounds(at);
-  for (std::size_t k{0}; k < numComponents;
-       ++k, compArray.IncrementSubscripts(at)) {
-    const typeInfo::Component &component{
-        *compArray.Element<typeInfo::Component>(at)};
-    if (!DefaultComponentIO<DIR>(
-            io, component, descriptor, subscripts, handler, table)) {
-      // Return true for NAMELIST input if any component appeared.
-      auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()};
-      return DIR == Direction::Input && k > 0 && listInput &&
-          listInput->inNamelistSequence();
-    }
-  }
-  return true;
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &type,
-    const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const Descriptor &compArray{type.component()};
-  RUNTIME_CHECK(handler, compArray.rank() == 1);
-  std::size_t numComponents{compArray.Elements()};
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  for (std::size_t j{0}; j < numElements;
-       ++j, descriptor.IncrementSubscripts(subscripts)) {
-    SubscriptValue at[maxRank];
-    compArray.GetLowerBounds(at);
-    for (std::size_t k{0}; k < numComponents;
-         ++k, compArray.IncrementSubscripts(at)) {
-      const typeInfo::Component &component{
-          *compArray.Element<typeInfo::Component>(at)};
-      if (!DefaultComponentIO<DIR>(
-              io, component, descriptor, subscripts, handler, table)) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
-    IoStatementState &, const Descriptor &, const typeInfo::DerivedType &,
-    const typeInfo::SpecialBinding &, const SubscriptValue[]);
-
-template <Direction DIR>
-static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  // Derived type information must be present for formatted I/O.
-  const DescriptorAddendum *addendum{descriptor.Addendum()};
-  RUNTIME_CHECK(handler, addendum != nullptr);
-  const typeInfo::DerivedType *type{addendum->derivedType()};
-  RUNTIME_CHECK(handler, type != nullptr);
-  Fortran::common::optional<typeInfo::SpecialBinding> nonTbpSpecial;
-  const typeInfo::SpecialBinding *special{nullptr};
-  if (table) {
-    if (const auto *definedIo{table->Find(*type,
-            DIR == Direction::Input ? common::DefinedIo::ReadFormatted
-                                    : common::DefinedIo::WriteFormatted)}) {
-      if (definedIo->subroutine) {
-        nonTbpSpecial.emplace(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadFormatted
-                : typeInfo::SpecialBinding::Which::WriteFormatted,
-            definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-            false);
-        special = &*nonTbpSpecial;
-      }
-    }
-  }
-  if (!special) {
-    if (const typeInfo::SpecialBinding *
-        binding{type->FindSpecialBinding(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadFormatted
-                : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
-      if (!table || !table->ignoreNonTbpEntries || binding->isTypeBound()) {
-        special = binding;
-      }
-    }
-  }
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t numElements{descriptor.Elements()};
-  for (std::size_t j{0}; j < numElements;
-       ++j, descriptor.IncrementSubscripts(subscripts)) {
-    Fortran::common::optional<bool> result;
-    if (special) {
-      result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts);
-    }
-    if (!result) {
-      result = DefaultComponentwiseFormattedIO<DIR>(
-          io, descriptor, *type, table, subscripts);
-    }
-    if (!result.value()) {
-      // Return true for NAMELIST input if we got anything.
-      auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()};
-      return DIR == Direction::Input && j > 0 && listInput &&
-          listInput->inNamelistSequence();
-    }
-  }
-  return true;
-}
-
-RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &,
-    const typeInfo::DerivedType &, const typeInfo::SpecialBinding &);
+extern template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+extern template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
 
-// Unformatted I/O
-template <Direction DIR>
-static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const DescriptorAddendum *addendum{descriptor.Addendum()};
-  if (const typeInfo::DerivedType *
-      type{addendum ? addendum->derivedType() : nullptr}) {
-    // derived type unformatted I/O
-    if (table) {
-      if (const auto *definedIo{table->Find(*type,
-              DIR == Direction::Input ? common::DefinedIo::ReadUnformatted
-                                      : common::DefinedIo::WriteUnformatted)}) {
-        if (definedIo->subroutine) {
-          typeInfo::SpecialBinding special{DIR == Direction::Input
-                  ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                  : typeInfo::SpecialBinding::Which::WriteUnformatted,
-              definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-              false};
-          if (Fortran::common::optional<bool> wasDefined{
-                  DefinedUnformattedIo(io, descriptor, *type, special)}) {
-            return *wasDefined;
-          }
-        } else {
-          return DefaultComponentwiseUnformattedIO<DIR>(
-              io, descriptor, *type, table);
-        }
-      }
-    }
-    if (const typeInfo::SpecialBinding *
-        special{type->FindSpecialBinding(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
-      if (!table || !table->ignoreNonTbpEntries || special->isTypeBound()) {
-        // defined derived type unformatted I/O
-        return DefinedUnformattedIo(io, descriptor, *type, *special);
-      }
-    }
-    // Default derived type unformatted I/O
-    // TODO: If no component at any level has defined READ or WRITE
-    // (as appropriate), the elements are contiguous, and no byte swapping
-    // is active, do a block transfer via the code below.
-    return DefaultComponentwiseUnformattedIO<DIR>(io, descriptor, *type, table);
-  } else {
-    // intrinsic type unformatted I/O
-    auto *externalUnf{io.get_if<ExternalUnformattedIoStatementState<DIR>>()};
-    auto *childUnf{io.get_if<ChildUnformattedIoStatementState<DIR>>()};
-    auto *inq{
-        DIR == Direction::Output ? io.get_if<InquireIOLengthState>() : nullptr};
-    RUNTIME_CHECK(handler, externalUnf || childUnf || inq);
-    std::size_t elementBytes{descriptor.ElementBytes()};
-    std::size_t numElements{descriptor.Elements()};
-    std::size_t swappingBytes{elementBytes};
-    if (auto maybeCatAndKind{descriptor.type().GetCategoryAndKind()}) {
-      // Byte swapping units can be smaller than elements, namely
-      // for COMPLEX and CHARACTER.
-      if (maybeCatAndKind->first == TypeCategory::Character) {
-        // swap each character position independently
-        swappingBytes = maybeCatAndKind->second; // kind
-      } else if (maybeCatAndKind->first == TypeCategory::Complex) {
-        // swap real and imaginary components independently
-        swappingBytes /= 2;
-      }
-    }
-    SubscriptValue subscripts[maxRank];
-    descriptor.GetLowerBounds(subscripts);
-    using CharType =
-        std::conditional_t<DIR == Direction::Output, const char, char>;
-    auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
-      if constexpr (DIR == Direction::Output) {
-        return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
-            : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
-                           : inq->Emit(&x, totalBytes, swappingBytes);
-      } else {
-        return externalUnf ? externalUnf->Receive(&x, totalBytes, swappingBytes)
-                           : childUnf->Receive(&x, totalBytes, swappingBytes);
-      }
-    }};
-    bool swapEndianness{externalUnf && externalUnf->unit().swapEndianness()};
-    if (!swapEndianness &&
-        descriptor.IsContiguous()) { // contiguous unformatted I/O
-      char &x{ExtractElement<char>(io, descriptor, subscripts)};
-      return Transfer(x, numElements * elementBytes);
-    } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
-      for (std::size_t j{0}; j < numElements; ++j) {
-        char &x{ExtractElement<char>(io, descriptor, subscripts)};
-        if (!Transfer(x, elementBytes)) {
-          return false;
-        }
-        if (!descriptor.IncrementSubscripts(subscripts) &&
-            j + 1 < numElements) {
-          handler.Crash("DescriptorIO: subscripts out of bounds");
-        }
-      }
-      return true;
-    }
-  }
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  if (handler.InError()) {
-    return false;
-  }
-  if (!io.get_if<IoDirectionState<DIR>>()) {
-    handler.Crash("DescriptorIO() called for wrong I/O direction");
-    return false;
-  }
-  if constexpr (DIR == Direction::Input) {
-    if (!io.BeginReadingRecord()) {
-      return false;
-    }
-  }
-  if (!io.get_if<FormattedIoStatementState<DIR>>()) {
-    return UnformattedDescriptorIO<DIR>(io, descriptor, table);
-  }
-  if (auto catAndKind{descriptor.type().GetCategoryAndKind()}) {
-    TypeCategory cat{catAndKind->first};
-    int kind{catAndKind->second};
-    switch (cat) {
-    case TypeCategory::Integer:
-      switch (kind) {
-      case 1:
-        return FormattedIntegerIO<1, DIR>(io, descriptor, true);
-      case 2:
-        return FormattedIntegerIO<2, DIR>(io, descriptor, true);
-      case 4:
-        return FormattedIntegerIO<4, DIR>(io, descriptor, true);
-      case 8:
-        return FormattedIntegerIO<8, DIR>(io, descriptor, true);
-      case 16:
-        return FormattedIntegerIO<16, DIR>(io, descriptor, true);
-      default:
-        handler.Crash(
-            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Unsigned:
-      switch (kind) {
-      case 1:
-        return FormattedIntegerIO<1, DIR>(io, descriptor, false);
-      case 2:
-        return FormattedIntegerIO<2, DIR>(io, descriptor, false);
-      case 4:
-        return FormattedIntegerIO<4, DIR>(io, descriptor, false);
-      case 8:
-        return FormattedIntegerIO<8, DIR>(io, descriptor, false);
-      case 16:
-        return FormattedIntegerIO<16, DIR>(io, descriptor, false);
-      default:
-        handler.Crash(
-            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Real:
-      switch (kind) {
-      case 2:
-        return FormattedRealIO<2, DIR>(io, descriptor);
-      case 3:
-        return FormattedRealIO<3, DIR>(io, descriptor);
-      case 4:
-        return FormattedRealIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedRealIO<8, DIR>(io, descriptor);
-      case 10:
-        return FormattedRealIO<10, DIR>(io, descriptor);
-      // TODO: case double/double
-      case 16:
-        return FormattedRealIO<16, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Complex:
-      switch (kind) {
-      case 2:
-        return FormattedComplexIO<2, DIR>(io, descriptor);
-      case 3:
-        return FormattedComplexIO<3, DIR>(io, descriptor);
-      case 4:
-        return FormattedComplexIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedComplexIO<8, DIR>(io, descriptor);
-      case 10:
-        return FormattedComplexIO<10, DIR>(io, descriptor);
-      // TODO: case double/double
-      case 16:
-        return FormattedComplexIO<16, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Character:
-      switch (kind) {
-      case 1:
-        return FormattedCharacterIO<char, DIR>(io, descriptor);
-      case 2:
-        return FormattedCharacterIO<char16_t, DIR>(io, descriptor);
-      case 4:
-        return FormattedCharacterIO<char32_t, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Logical:
-      switch (kind) {
-      case 1:
-        return FormattedLogicalIO<1, DIR>(io, descriptor);
-      case 2:
-        return FormattedLogicalIO<2, DIR>(io, descriptor);
-      case 4:
-        return FormattedLogicalIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedLogicalIO<8, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Derived:
-      return FormattedDerivedTypeIO<DIR>(io, descriptor, table);
-    }
-  }
-  handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
-      static_cast<int>(descriptor.type().raw()));
-  return false;
-}
 } // namespace Fortran::runtime::io::descr
 #endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index 1d5304254ed0e..0f0564403c0e2 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -143,6 +143,10 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
+  if (auto *x{std::getenv("FLANG_RT_DEBUG")}) {
+    internalDebugging = std::strtol(x, nullptr, 10);
+  }
+
   if (auto *x{std::getenv("ACC_OFFLOAD_STACK_SIZE")}) {
     char *end;
     auto n{std::strtoul(x, &end, 10)};
diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp
index b0cf2180fc6d4..1bef387a9771f 100644
--- a/flang-rt/lib/runtime/namelist.cpp
+++ b/flang-rt/lib/runtime/namelist.cpp
@@ -10,6 +10,7 @@
 #include "descriptor-io.h"
 #include "flang-rt/runtime/emit-encoded.h"
 #include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Runtime/io-api.h"
 #include <algorithm>
 #include <cstring>
diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp
index b08195cd31e05..24d05f369fcbe 100644
--- a/flang-rt/lib/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@@ -205,7 +205,7 @@ RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
 // Doing the recursion upwards instead of downwards puts the more common
 // cases earlier in the if-chain and has a tangible impact on performance.
 template <typename P, int RANK> struct ShallowCopyRankSpecialize {
-  static bool execute(const Descriptor &to, const Descriptor &from,
+  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     if (to.rank() == RANK && from.rank() == RANK) {
       ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
@@ -217,7 +217,7 @@ template <typename P, int RANK> struct ShallowCopyRankSpecialize {
 };
 
 template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
-  static bool execute(const Descriptor &to, const Descriptor &from,
+  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     return false;
   }
diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp
index 82182696d70c6..d023c3392d559 100644
--- a/flang-rt/lib/runtime/type-info.cpp
+++ b/flang-rt/lib/runtime/type-info.cpp
@@ -140,11 +140,11 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor,
     const SubscriptValue *subscripts) const {
   RUNTIME_CHECK(terminator, genre_ == Genre::Data);
   EstablishDescriptor(descriptor, container, terminator);
+  std::size_t offset{offset_};
   if (subscripts) {
-    descriptor.set_base_addr(container.Element<char>(subscripts) + offset_);
-  } else {
-    descriptor.set_base_addr(container.OffsetElement<char>() + offset_);
+    offset += container.SubscriptsToByteOffset(subscripts);
   }
+  descriptor.set_base_addr(container.OffsetElement<char>() + offset);
   descriptor.raw().attribute = CFI_attribute_pointer;
 }
 
@@ -279,6 +279,10 @@ FILE *Component::Dump(FILE *f) const {
   }
   std::fprintf(f, " category %d  kind %d  rank %d  offset 0x%zx\n", category_,
       kind_, rank_, static_cast<std::size_t>(offset_));
+  const auto &dtDesc{derivedType_.descriptor()};
+  if (dtDesc.raw().base_addr) {
+    std::fprintf(f, " derivedType_ %p\n", dtDesc.raw().base_addr);
+  }
   if (initialization_) {
     std::fprintf(f, " initialization @ %p:\n",
         reinterpret_cast<const void *>(initialization_));
@@ -325,7 +329,7 @@ FILE *SpecialBinding::Dump(FILE *f) const {
     break;
   }
   std::fprintf(f, "    isArgDescriptorSet: 0x%x\n", isArgDescriptorSet_);
-  std::fprintf(f, "    isTypeBound: 0x%x\n", isTypeBound_);
+  std::fprintf(f, "    isTypeBound: %d\n", isTypeBound_);
   std::fprintf(f, "    isArgContiguousSet: 0x%x\n", isArgContiguousSet_);
   std::fprintf(f, "    proc: %p\n", reinterpret_cast<void *>(proc_));
   return f;
diff --git a/flang-rt/lib/runtime/work-queue.cpp b/flang-rt/lib/runtime/work-queue.cpp
new file mode 100644
index 0000000000000..a508ecb637102
--- /dev/null
+++ b/flang-rt/lib/runtime/work-queue.cpp
@@ -0,0 +1,161 @@
+//===-- lib/runtime/work-queue.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang-rt/runtime/work-queue.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/visit.h"
+
+namespace Fortran::runtime {
+
+#if !defined(RT_DEVICE_COMPILATION)
+// FLANG_RT_DEBUG code is disabled when false.
+static constexpr bool enableDebugOutput{false};
+#endif
+
+RT_OFFLOAD_API_GROUP_BEGIN
+
+RT_API_ATTRS Componentwise::Componentwise(const typeInfo::DerivedType &derived)
+    : derived_{derived}, components_{derived_.component().Elements()} {
+  GetComponent();
+}
+
+RT_API_ATTRS void Componentwise::GetComponent() {
+  if (IsComplete()) {
+    component_ = nullptr;
+  } else {
+    const Descriptor &componentDesc{derived_.component()};
+    component_ = componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
+        componentAt_);
+  }
+}
+
+RT_API_ATTRS int Ticket::Continue(WorkQueue &workQueue) {
+  if (!begun) {
+    begun = true;
+    return common::visit(
+        [&workQueue](
+            auto &specificTicket) { return specificTicket.Begin(workQueue); },
+        u);
+  } else {
+    return common::visit(
+        [&workQueue](auto &specificTicket) {
+          return specificTicket.Continue(workQueue);
+        },
+        u);
+  }
+}
+
+RT_API_ATTRS WorkQueue::~WorkQueue() {
+  if (last_) {
+    if ((last_->next = firstFree_)) {
+      last_->next->previous = last_;
+    }
+    firstFree_ = first_;
+    first_ = last_ = nullptr;
+  }
+  while (firstFree_) {
+    TicketList *next{firstFree_->next};
+    if (!firstFree_->isStatic) {
+      FreeMemory(firstFree_);
+    }
+    firstFree_ = next;
+  }
+}
+
+RT_API_ATTRS Ticket &WorkQueue::StartTicket() {
+  if (!firstFree_) {
+    void *p{AllocateMemoryOrCrash(terminator_, sizeof(TicketList))};
+    firstFree_ = new (p) TicketList;
+    firstFree_->isStatic = false;
+  }
+  TicketList *newTicket{firstFree_};
+  if ((firstFree_ = newTicket->next)) {
+    firstFree_->previous = nullptr;
+  }
+  TicketList *after{insertAfter_ ? insertAfter_->next : nullptr};
+  if ((newTicket->previous = insertAfter_ ? insertAfter_ : last_)) {
+    newTicket->previous->next = newTicket;
+  } else {
+    first_ = newTicket;
+  }
+  if ((newTicket->next = after)) {
+    after->previous = newTicket;
+  } else {
+    last_ = newTicket;
+  }
+  newTicket->ticket.begun = false;
+#if !defined(RT_DEVICE_COMPILATION)
+  if (enableDebugOutput &&
+      (executionEnvironment.internalDebugging &
+          ExecutionEnvironment::WorkQueue)) {
+    std::fprintf(stderr, "WQ: new ticket\n");
+  }
+#endif
+  return newTicket->ticket;
+}
+
+RT_API_ATTRS int WorkQueue::Run() {
+  while (last_) {
+    TicketList *at{last_};
+    insertAfter_ = last_;
+#if !defined(RT_DEVICE_COMPILATION)
+    if (enableDebugOutput &&
+        (executionEnvironment.internalDebugging &
+            ExecutionEnvironment::WorkQueue)) {
+      std::fprintf(stderr, "WQ: %zd %s\n", at->ticket.u.index(),
+          at->ticket.begun ? "Continue" : "Begin");
+    }
+#endif
+    int stat{at->ticket.Continue(*this)};
+#if !defined(RT_DEVICE_COMPILATION)
+    if (enableDebugOutput &&
+        (executionEnvironment.internalDebugging &
+            ExecutionEnvironment::WorkQueue)) {
+      std::fprintf(stderr, "WQ: ... stat %d\n", stat);
+    }
+#endif
+    insertAfter_ = nullptr;
+    if (stat == StatOk) {
+      if (at->previous) {
+        at->previous->next = at->next;
+      } else {
+        first_ = at->next;
+      }
+      if (at->next) {
+        at->next->previous = at->previous;
+      } else {
+        last_ = at->previous;
+      }
+      if ((at->next = firstFree_)) {
+        at->next->previous = at;
+      }
+      at->previous = nullptr;
+      firstFree_ = at;
+    } else if (stat != StatContinue) {
+      Stop();
+      return stat;
+    }
+  }
+  return StatOk;
+}
+
+RT_API_ATTRS void WorkQueue::Stop() {
+  if (last_) {
+    if ((last_->next = firstFree_)) {
+      last_->next->previous = last_;
+    }
+    firstFree_ = first_;
+    first_ = last_ = nullptr;
+  }
+}
+
+RT_OFFLOAD_API_GROUP_END
+
+} // namespace Fortran::runtime
diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
index 3833e48be3dd6..6c148b1de6f82 100644
--- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp
+++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
@@ -184,7 +184,7 @@ TEST(ExternalIOTests, TestSequentialFixedUnformatted) {
   io = IONAME(BeginInquireIoLength)(__FILE__, __LINE__);
   for (int j{1}; j <= 3; ++j) {
     ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc))
-        << "OutputDescriptor() for InquireIoLength";
+        << "OutputDescriptor() for InquireIoLength " << j;
   }
   ASSERT_EQ(IONAME(GetIoLength)(io), 3 * recl) << "GetIoLength";
   ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 78d871c593e1d..871749934810c 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -858,6 +858,16 @@ print *, [(j,j=1,10)]
   warning since such values may have become defined by the time the nested
   expression's value is required.
 
+* Intrinsic assignment of arrays is defined elementally, and intrinsic
+  assignment of derived type components is defined componentwise.
+  However, when intrinsic assignment takes place for an array of derived
+  type, the order of the loop nesting is not defined.
+  Some compilers will loop over the elements, assigning all of the components
+  of each element before proceeding to the next element.
+  This compiler loops over all of the components, and assigns all of
+  the elements for each component before proceeding to the next component.
+  A program using defined assignment might be able to detect the difference.
+
 ## De Facto Standard Features
 
 * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the
diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h
index bc80997a1bec2..7d198bdcc9e89 100644
--- a/flang/include/flang/Runtime/assign.h
+++ b/flang/include/flang/Runtime/assign.h
@@ -38,7 +38,8 @@ enum AssignFlags {
   ComponentCanBeDefinedAssignment = 1 << 3,
   ExplicitLengthCharacterLHS = 1 << 4,
   PolymorphicLHS = 1 << 5,
-  DeallocateLHS = 1 << 6
+  DeallocateLHS = 1 << 6,
+  UpdateLHSBounds = 1 << 7,
 };
 
 #ifdef RT_DEVICE_COMPILATION
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index b13370512e5cc..69375a83dec25 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -182,9 +182,12 @@ const Symbol *HasImpureFinal(
     const Symbol &, std::optional<int> rank = std::nullopt);
 // Is this type finalizable or does it contain any polymorphic allocatable
 // ultimate components?
-bool MayRequireFinalization(const DerivedTypeSpec &derived);
+bool MayRequireFinalization(const DerivedTypeSpec &);
 // Does this type have an allocatable direct component?
-bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived);
+bool HasAllocatableDirectComponent(const DerivedTypeSpec &);
+// Does this type have any defined assignment at any level (or any polymorphic
+// allocatable)?
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &);
 
 bool IsInBlankCommon(const Symbol &);
 bool IsAssumedLengthCharacter(const Symbol &);
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 26ae81f97895a..51ba21a9e5edf 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -82,17 +82,17 @@ class RuntimeTableBuilder {
       const SomeExpr &genre, std::int64_t = 0) const;
   SomeExpr PackageIntValueExpr(const SomeExpr &genre, std::int64_t = 0) const;
   std::vector<evaluate::StructureConstructor> DescribeBindings(
-      const Scope &dtScope, Scope &);
+      const Scope &dtScope, Scope &, const SymbolVector &bindings);
   std::map<int, evaluate::StructureConstructor> DescribeSpecialGenerics(
-      const Scope &dtScope, const Scope &thisScope,
-      const DerivedTypeSpec *) const;
+      const Scope &dtScope, const Scope &thisScope, const DerivedTypeSpec *,
+      const SymbolVector &bindings) const;
   void DescribeSpecialGeneric(const GenericDetails &,
       std::map<int, evaluate::StructureConstructor> &, const Scope &,
-      const DerivedTypeSpec *) const;
+      const DerivedTypeSpec *, const SymbolVector &bindings) const;
   void DescribeSpecialProc(std::map<int, evaluate::StructureConstructor> &,
       const Symbol &specificOrBinding, bool isAssignment, bool isFinal,
       std::optional<common::DefinedIo>, const Scope *, const DerivedTypeSpec *,
-      bool isTypeBound) const;
+      const SymbolVector *bindings) const;
   void IncorporateDefinedIoGenericInterfaces(
       std::map<int, evaluate::StructureConstructor> &, common::DefinedIo,
       const Scope *, const DerivedTypeSpec *);
@@ -595,8 +595,9 @@ const Symbol *RuntimeTableBuilder::DescribeType(
     // Compile the "vtable" of type-bound procedure bindings
     std::uint32_t specialBitSet{0};
     if (!dtSymbol->attrs().test(Attr::ABSTRACT)) {
+      SymbolVector boundProcedures{CollectBindings(dtScope)};
       std::vector<evaluate::StructureConstructor> bindings{
-          DescribeBindings(dtScope, scope)};
+          DescribeBindings(dtScope, scope, boundProcedures)};
       AddValue(dtValues, derivedTypeSchema_, bindingDescCompName,
           SaveDerivedPointerTarget(scope,
               SaveObjectName(
@@ -609,12 +610,14 @@ const Symbol *RuntimeTableBuilder::DescribeType(
       // subroutines override any parent bindings, but FINAL subroutines do not
       // (the runtime will call all of them).
       std::map<int, evaluate::StructureConstructor> specials{
-          DescribeSpecialGenerics(dtScope, dtScope, derivedTypeSpec)};
+          DescribeSpecialGenerics(
+              dtScope, dtScope, derivedTypeSpec, boundProcedures)};
       if (derivedTypeSpec) {
-        for (auto &ref : FinalsForDerivedTypeInstantiation(*derivedTypeSpec)) {
-          DescribeSpecialProc(specials, *ref, /*isAssignment-*/ false,
+        for (const Symbol &symbol :
+            FinalsForDerivedTypeInstantiation(*derivedTypeSpec)) {
+          DescribeSpecialProc(specials, symbol, /*isAssignment-*/ false,
               /*isFinal=*/true, std::nullopt, nullptr, derivedTypeSpec,
-              /*isTypeBound=*/true);
+              &boundProcedures);
         }
         IncorporateDefinedIoGenericInterfaces(specials,
             common::DefinedIo::ReadFormatted, &scope, derivedTypeSpec);
@@ -661,6 +664,10 @@ const Symbol *RuntimeTableBuilder::DescribeType(
     AddValue(dtValues, derivedTypeSchema_, "nofinalizationneeded"s,
         IntExpr<1>(
             derivedTypeSpec && !MayRequireFinalization(*derivedTypeSpec)));
+    // Similarly, a flag to enable optimized runtime assignment.
+    AddValue(dtValues, derivedTypeSchema_, "nodefinedassignment"s,
+        IntExpr<1>(
+            derivedTypeSpec && !MayHaveDefinedAssignment(*derivedTypeSpec)));
   }
   dtObject.get<ObjectEntityDetails>().set_init(MaybeExpr{
       StructureExpr(Structure(derivedTypeSchema_, std::move(dtValues)))});
@@ -1041,15 +1048,16 @@ SymbolVector CollectBindings(const Scope &dtScope) {
 }
 
 std::vector<evaluate::StructureConstructor>
-RuntimeTableBuilder::DescribeBindings(const Scope &dtScope, Scope &scope) {
+RuntimeTableBuilder::DescribeBindings(
+    const Scope &dtScope, Scope &scope, const SymbolVector &bindings) {
   std::vector<evaluate::StructureConstructor> result;
-  for (const SymbolRef &ref : CollectBindings(dtScope)) {
+  for (const Symbol &symbol : bindings) {
     evaluate::StructureConstructorValues values;
     AddValue(values, bindingSchema_, procCompName,
         SomeExpr{evaluate::ProcedureDesignator{
-            ref.get().get<ProcBindingDetails>().symbol()}});
+            symbol.get<ProcBindingDetails>().symbol()}});
     AddValue(values, bindingSchema_, "name"s,
-        SaveNameAsPointerTarget(scope, ref.get().name().ToString()));
+        SaveNameAsPointerTarget(scope, symbol.name().ToString()));
     result.emplace_back(DEREF(bindingSchema_.AsDerived()), std::move(values));
   }
   return result;
@@ -1057,16 +1065,18 @@ RuntimeTableBuilder::DescribeBindings(const Scope &dtScope, Scope &scope) {
 
 std::map<int, evaluate::StructureConstructor>
 RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope,
-    const Scope &thisScope, const DerivedTypeSpec *derivedTypeSpec) const {
+    const Scope &thisScope, const DerivedTypeSpec *derivedTypeSpec,
+    const SymbolVector &bindings) const {
   std::map<int, evaluate::StructureConstructor> specials;
   if (const Scope * parentScope{dtScope.GetDerivedTypeParent()}) {
-    specials =
-        DescribeSpecialGenerics(*parentScope, thisScope, derivedTypeSpec);
+    specials = DescribeSpecialGenerics(
+        *parentScope, thisScope, derivedTypeSpec, bindings);
   }
   for (const auto &pair : dtScope) {
     const Symbol &symbol{*pair.second};
     if (const auto *generic{symbol.detailsIf<GenericDetails>()}) {
-      DescribeSpecialGeneric(*generic, specials, thisScope, derivedTypeSpec);
+      DescribeSpecialGeneric(
+          *generic, specials, thisScope, derivedTypeSpec, bindings);
     }
   }
   return specials;
@@ -1074,15 +1084,16 @@ RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope,
 
 void RuntimeTableBuilder::DescribeSpecialGeneric(const GenericDetails &generic,
     std::map<int, evaluate::StructureConstructor> &specials,
-    const Scope &dtScope, const DerivedTypeSpec *derivedTypeSpec) const {
+    const Scope &dtScope, const DerivedTypeSpec *derivedTypeSpec,
+    const SymbolVector &bindings) const {
   common::visit(
       common::visitors{
           [&](const GenericKind::OtherKind &k) {
             if (k == GenericKind::OtherKind::Assignment) {
-              for (auto ref : generic.specificProcs()) {
-                DescribeSpecialProc(specials, *ref, /*isAssignment=*/true,
+              for (const Symbol &specific : generic.specificProcs()) {
+                DescribeSpecialProc(specials, specific, /*isAssignment=*/true,
                     /*isFinal=*/false, std::nullopt, &dtScope, derivedTypeSpec,
-                    /*isTypeBound=*/true);
+                    &bindings);
               }
             }
           },
@@ -1092,10 +1103,10 @@ void RuntimeTableBuilder::DescribeSpecialGeneric(const GenericDetails &generic,
             case common::DefinedIo::ReadUnformatted:
             case common::DefinedIo::WriteFormatted:
             case common::DefinedIo::WriteUnformatted:
-              for (auto ref : generic.specificProcs()) {
-                DescribeSpecialProc(specials, *ref, /*isAssignment=*/false,
+              for (const Symbol &specific : generic.specificProcs()) {
+                DescribeSpecialProc(specials, specific, /*isAssignment=*/false,
                     /*isFinal=*/false, io, &dtScope, derivedTypeSpec,
-                    /*isTypeBound=*/true);
+                    &bindings);
               }
               break;
             }
@@ -1109,7 +1120,8 @@ void RuntimeTableBuilder::DescribeSpecialProc(
     std::map<int, evaluate::StructureConstructor> &specials,
     const Symbol &specificOrBinding, bool isAssignment, bool isFinal,
     std::optional<common::DefinedIo> io, const Scope *dtScope,
-    const DerivedTypeSpec *derivedTypeSpec, bool isTypeBound) const {
+    const DerivedTypeSpec *derivedTypeSpec,
+    const SymbolVector *bindings) const {
   const auto *binding{specificOrBinding.detailsIf<ProcBindingDetails>()};
   if (binding && dtScope) { // use most recent override
     binding = &DEREF(dtScope->FindComponent(specificOrBinding.name()))
@@ -1128,6 +1140,9 @@ void RuntimeTableBuilder::DescribeSpecialProc(
       // component assignment as part of intrinsic assignment.
       // Non-type-bound generic INTERFACEs and assignments from incompatible
       // types must not be used for component intrinsic assignment.
+      if (!binding) {
+        return;
+      }
       CHECK(proc->dummyArguments.size() == 2);
       const auto t1{
           DEREF(std::get_if<evaluate::characteristics::DummyDataObject>(
@@ -1137,7 +1152,7 @@ void RuntimeTableBuilder::DescribeSpecialProc(
           DEREF(std::get_if<evaluate::characteristics::DummyDataObject>(
                     &proc->dummyArguments[1].u))
               .type.type()};
-      if (!binding || t1.category() != TypeCategory::Derived ||
+      if (t1.category() != TypeCategory::Derived ||
           t2.category() != TypeCategory::Derived ||
           t1.IsUnlimitedPolymorphic() || t2.IsUnlimitedPolymorphic()) {
         return;
@@ -1149,7 +1164,7 @@ void RuntimeTableBuilder::DescribeSpecialProc(
       }
       which = proc->IsElemental() ? elementalAssignmentEnum_
                                   : scalarAssignmentEnum_;
-      if (binding && binding->passName() &&
+      if (binding->passName() &&
           *binding->passName() == proc->dummyArguments[1].name) {
         argThatMightBeDescriptor = 1;
         isArgDescriptorSet |= 2;
@@ -1234,8 +1249,19 @@ void RuntimeTableBuilder::DescribeSpecialProc(
         values, specialSchema_, "which"s, SomeExpr{std::move(which.value())});
     AddValue(values, specialSchema_, "isargdescriptorset"s,
         IntExpr<1>(isArgDescriptorSet));
-    AddValue(values, specialSchema_, "istypebound"s,
-        IntExpr<1>(isTypeBound ? 1 : 0));
+    int bindingIndex{0};
+    if (bindings) {
+      int j{0};
+      for (const Symbol &bind : DEREF(bindings)) {
+        ++j;
+        if (&bind.get<ProcBindingDetails>().symbol() == &specific) {
+          bindingIndex = j; // index offset by 1
+          break;
+        }
+      }
+    }
+    CHECK(bindingIndex <= 255);
+    AddValue(values, specialSchema_, "istypebound"s, IntExpr<1>(bindingIndex));
     AddValue(values, specialSchema_, "isargcontiguousset"s,
         IntExpr<1>(isArgContiguousSet));
     AddValue(values, specialSchema_, procCompName,
@@ -1260,7 +1286,7 @@ void RuntimeTableBuilder::IncorporateDefinedIoGenericInterfaces(
       CHECK(std::get<common::DefinedIo>(genericDetails.kind().u) == definedIo);
       for (auto ref : genericDetails.specificProcs()) {
         DescribeSpecialProc(specials, *ref, false, false, definedIo, nullptr,
-            derivedTypeSpec, false);
+            derivedTypeSpec, /*bindings=*/nullptr);
       }
     }
   }
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index a1445187b1e98..bf520d04a50cc 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -814,6 +814,38 @@ bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) {
   return std::any_of(directs.begin(), directs.end(), IsAllocatable);
 }
 
+static bool MayHaveDefinedAssignment(
+    const DerivedTypeSpec &derived, std::set<const Scope *> &checked) {
+  if (const Scope *scope{derived.GetScope()};
+      scope && checked.find(scope) == checked.end()) {
+    checked.insert(scope);
+    for (const auto &[_, symbolRef] : *scope) {
+      if (const auto *generic{symbolRef->detailsIf<GenericDetails>()}) {
+        if (generic->kind().IsAssignment()) {
+          return true;
+        }
+      } else if (symbolRef->has<ObjectEntityDetails>() &&
+          !IsPointer(*symbolRef)) {
+        if (const DeclTypeSpec *type{symbolRef->GetType()}) {
+          if (type->IsPolymorphic()) {
+            return true;
+          } else if (const DerivedTypeSpec *derived{type->AsDerived()}) {
+            if (MayHaveDefinedAssignment(*derived, checked)) {
+              return true;
+            }
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &derived) {
+  std::set<const Scope *> checked;
+  return MayHaveDefinedAssignment(derived, checked);
+}
+
 bool IsAssumedLengthCharacter(const Symbol &symbol) {
   if (const DeclTypeSpec * type{symbol.GetType()}) {
     return type->category() == DeclTypeSpec::Character &&
diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
index b30a6bf697563..8dd27d6e4c01b 100644
--- a/flang/module/__fortran_type_info.f90
+++ b/flang/module/__fortran_type_info.f90
@@ -52,7 +52,8 @@
     integer(1) :: noInitializationNeeded ! 1 if no component w/ init
     integer(1) :: noDestructionNeeded ! 1 if no component w/ dealloc/final
     integer(1) :: noFinalizationNeeded ! 1 if nothing finalizeable
-    integer(1) :: __padding0(4)
+    integer(1) :: noDefinedAssignment ! 1 if no defined ASSIGNMENT(=)
+    integer(1) :: __padding0(3)
   end type
 
   type :: Binding
@@ -116,7 +117,7 @@
   type, bind(c) :: SpecialBinding
     integer(1) :: which ! SpecialBinding::Which
     integer(1) :: isArgDescriptorSet
-    integer(1) :: isTypeBound
+    integer(1) :: isTypeBound ! binding index + 1, if any
     integer(1) :: isArgContiguousSet
     integer(1) :: __padding0(4)
     type(__builtin_c_funptr) :: proc
diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90
index 28f0bf78f33c9..2e05b652822b5 100644
--- a/flang/test/Lower/volatile-openmp.f90
+++ b/flang/test/Lower/volatile-openmp.f90
@@ -23,11 +23,11 @@
 ! CHECK:           %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>
 ! CHECK:           %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>
 ! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFEcontainer"} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>, !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>)
-! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
+! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
 ! CHECK:           %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
-! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>
-! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>)
+! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
+! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>
+! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>)
 ! CHECK:           %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"}   {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90
index d228cd2a84ca4..bb20c546e0261 100644
--- a/flang/test/Semantics/typeinfo01.f90
+++ b/flang/test/Semantics/typeinfo01.f90
@@ -8,7 +8,7 @@ module m01
   end type
 !CHECK: Module scope: m01
 !CHECK: .c.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.n,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .n.n, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"n"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
 !CHECK: DerivedType scope: t1
@@ -23,8 +23,8 @@ module m02
   end type
 !CHECK: .c.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:1_8 init:[component::component(name=.n.parent,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.parent,lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.cn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=4_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .c.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.pn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end module
 
 module m03
@@ -35,7 +35,7 @@ module m03
   type(kpdt(4)) :: x
 !CHECK: .c.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.a,genre=1_1,category=2_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .dt.kpdt, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.kpdt,uninstantiated=NULL(),kindparameter=.kp.kpdt,lenparameterkind=NULL())
-!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .kp.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(8) shape: 0_8:0_8 init:[INTEGER(8)::4_8]
 end module
 
@@ -49,7 +49,7 @@ module m04
   subroutine s1(x)
     class(tbps), intent(in) :: x
   end subroutine
-!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .v.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=s1,name=.n.b1),binding(proc=s1,name=.n.b2)]
 end module
 
@@ -61,7 +61,7 @@ module m05
   subroutine s1(x)
     class(t), intent(in) :: x
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .p.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(procptrcomponent) shape: 0_8:0_8 init:[procptrcomponent::procptrcomponent(name=.n.p1,offset=0_8,initialization=s1)]
 end module
 
@@ -85,8 +85,8 @@ subroutine s2(x, y)
     class(t), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -113,8 +113,8 @@ subroutine s2(x, y)
     class(t2), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -132,7 +132,7 @@ impure elemental subroutine s1(x, y)
     class(t), intent(out) :: x
     class(t), intent(in) :: y
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
 end module
@@ -155,8 +155,8 @@ impure elemental subroutine s3(x)
   subroutine s4(x)
     type(t), contiguous :: x(:,:,:)
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)]
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
+!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=1_1,proc=s4)]
 end module
 
 module m09
@@ -197,8 +197,8 @@ subroutine wu(x,u,iostat,iomsg)
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)]
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,isargcontiguousset=0_1,proc=wu)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)]
 end module
 
@@ -246,7 +246,7 @@ subroutine wu(x,u,iostat,iomsg)
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)]
 end module
 
@@ -263,7 +263,7 @@ module m11
 !CHECK: .c.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:3_8 init:[component::component(name=.n.allocatable,genre=3_1,category=2_1,kind=4_1,rank=1_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.pointer,genre=2_1,category=2_1,kind=4_1,rank=0_1,offset=48_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=.di.t.pointer),component(name=.n.chauto,genre=4_1,category=4_1,kind=1_1,rank=0_1,offset=72_8,characterlen=value(genre=3_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.automatic,genre=4_1,category=2_1,kind=4_1,rank=1_1,offset=96_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=.b.t.automatic,initialization=NULL())]
 !CHECK: .di.t.pointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(.dp.t.pointer) init:.dp.t.pointer(pointer=target)
 !CHECK: .dp.t.pointer (CompilerCreated): DerivedType components: pointer
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .lpk.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::8_1]
 !CHECK: DerivedType scope: .dp.t.pointer size=24 alignment=8 instantiation of .dp.t.pointer
 !CHECK: pointer, POINTER size=24 offset=0: ObjectEntity type: REAL(4)
diff --git a/flang/test/Semantics/typeinfo03.f90 b/flang/test/Semantics/typeinfo03.f90
index f0c0a817da4a4..e2552d0a21d6f 100644
--- a/flang/test/Semantics/typeinfo03.f90
+++ b/flang/test/Semantics/typeinfo03.f90
@@ -6,4 +6,4 @@ module m
     class(*), pointer :: sp, ap(:)
   end type
 end module
-!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90
index de8464321a409..94dd2199db35a 100644
--- a/flang/test/Semantics/typeinfo04.f90
+++ b/flang/test/Semantics/typeinfo04.f90
@@ -7,18 +7,18 @@ module m
    contains
     final :: final
   end type
-!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
   type, abstract :: t1
   end type
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type, abstract :: t2
     real, allocatable :: a(:)
   end type
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type, abstract :: t3
     type(finalizable) :: x
   end type
-!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
  contains
   impure elemental subroutine final(x)
     type(finalizable), intent(in out) :: x
diff --git a/flang/test/Semantics/typeinfo05.f90 b/flang/test/Semantics/typeinfo05.f90
index 2a7f12a153eb8..df1aecf3821de 100644
--- a/flang/test/Semantics/typeinfo05.f90
+++ b/flang/test/Semantics/typeinfo05.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), pointer :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo06.f90 b/flang/test/Semantics/typeinfo06.f90
index 2385709a8eb44..22f37b1a4369d 100644
--- a/flang/test/Semantics/typeinfo06.f90
+++ b/flang/test/Semantics/typeinfo06.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), allocatable :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo07.f90 b/flang/test/Semantics/typeinfo07.f90
index e8766d9811db8..ab20d6f601106 100644
--- a/flang/test/Semantics/typeinfo07.f90
+++ b/flang/test/Semantics/typeinfo07.f90
@@ -16,7 +16,7 @@
     type(t_container_extension) :: wrapper
   end type
 end
-! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
-! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
diff --git a/flang/test/Semantics/typeinfo08.f90 b/flang/test/Semantics/typeinfo08.f90
index 689cf469dee3b..391a66f3d6664 100644
--- a/flang/test/Semantics/typeinfo08.f90
+++ b/flang/test/Semantics/typeinfo08.f90
@@ -13,7 +13,7 @@ module m
 
 !CHECK: Module scope: m size=0 alignment=1 sourceRange=113 bytes
 !CHECK: .c.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t1,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .lpk.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::4_1]
 !CHECK: .n.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"s"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90
index 92efc8f9ea54b..08e0b95abb763 100644
--- a/flang/test/Semantics/typeinfo11.f90
+++ b/flang/test/Semantics/typeinfo11.f90
@@ -14,4 +14,4 @@
 type(t2) x
 end
 
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90
new file mode 100644
index 0000000000000..6b23b63d28b1d
--- /dev/null
+++ b/flang/test/Semantics/typeinfo12.f90
@@ -0,0 +1,67 @@
+!RUN: bbc --dump-symbols %s | FileCheck %s
+!Check "nodefinedassignment" settings.
+
+module m01
+
+  type hasAsst1
+   contains
+    procedure asst1
+    generic :: assignment(=) => asst1
+  end type
+!CHECK: .dt.hasasst1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.hasasst1,name=.n.hasasst1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.hasasst1,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type hasAsst2 ! no defined assignment relevant to the runtime
+  end type
+  interface assignment(=)
+    procedure asst2
+  end interface
+!CHECK: .dt.hasasst2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.hasasst2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test1
+    type(hasAsst1) c
+  end type
+!CHECK: .dt.test1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type test2
+    type(hasAsst2) c
+  end type
+!CHECK: .dt.test2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test3
+    type(hasAsst1), pointer :: p
+  end type
+!CHECK: .dt.test3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test3,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test3,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test4
+    type(hasAsst2), pointer :: p
+  end type
+!CHECK: .dt.test4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test4,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type, extends(hasAsst1) :: test5
+  end type
+!CHECK: .dt.test5, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.test5,name=.n.test5,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test5,procptr=NULL(),special=.s.test5,specialbitset=4_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type, extends(hasAsst2) :: test6
+  end type
+!CHECK: .dt.test6, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test6,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test6,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test7
+    type(test7), allocatable :: c
+  end type
+!CHECK: .dt.test7, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test7,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test7,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test8
+    class(test8), allocatable :: c
+  end type
+!CHECK: .dt.test8, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test8,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test8,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+
+ contains
+  impure elemental subroutine asst1(left, right)
+    class(hasAsst1), intent(out) :: left
+    class(hasAsst1), intent(in) :: right
+  end
+  impure elemental subroutine asst2(left, right)
+    class(hasAsst2), intent(out) :: left
+    class(hasAsst2), intent(in) :: right
+  end
+end
diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90
index cf4abf9e38181..ad824ad3590a2 100644
--- a/flang/test/Semantics/typeinfo13.f90
+++ b/flang/test/Semantics/typeinfo13.f90
@@ -22,5 +22,5 @@ impure elemental subroutine override(to, from)
   end
 end
 
-!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=override)]
+!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=override)]
 !CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)]

From 6cbb67f84c53d88e67b0d5a9f0ad2cf4782e6f66 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <lazar_2004@list.ru>
Date: Tue, 17 Jun 2025 00:51:49 +0300
Subject: [PATCH 650/851] [mlir][emitc] Fix the emitc::ExpressionOp (#143894)

Fix the lack of verification that the definingOp of the return value
belongs to emitc::ExpressionOp.
---
 mlir/lib/Dialect/EmitC/IR/EmitC.cpp      | 12 +++++++++---
 mlir/test/Dialect/EmitC/invalid_ops.mlir | 22 ++++++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index 1709654b90138..f82b20712b8c6 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -386,9 +386,7 @@ OpFoldResult emitc::ConstantOp::fold(FoldAdaptor adaptor) { return getValue(); }
 Operation *ExpressionOp::getRootOp() {
   auto yieldOp = cast<YieldOp>(getBody()->getTerminator());
   Value yieldedValue = yieldOp.getResult();
-  Operation *rootOp = yieldedValue.getDefiningOp();
-  assert(rootOp && "Yielded value not defined within expression");
-  return rootOp;
+  return yieldedValue.getDefiningOp();
 }
 
 LogicalResult ExpressionOp::verify() {
@@ -406,6 +404,14 @@ LogicalResult ExpressionOp::verify() {
   if (!yieldResult)
     return emitOpError("must yield a value at termination");
 
+  Operation *rootOp = yieldResult.getDefiningOp();
+
+  if (!rootOp)
+    return emitOpError("yielded value has no defining op");
+
+  if (rootOp->getParentOp() != getOperation())
+    return emitOpError("yielded value not defined within expression");
+
   Type yieldType = yieldResult.getType();
 
   if (resultType != yieldType)
diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir
index 3793dfe3f173b..3946a36a83c6f 100644
--- a/mlir/test/Dialect/EmitC/invalid_ops.mlir
+++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir
@@ -346,6 +346,28 @@ func.func @test_expression_multiple_results(%arg0: i32) -> i32 {
 
 // -----
 
+emitc.func @test_expression_no_defining_op(%a : i32) {
+  // expected-error @+1 {{'emitc.expression' op yielded value has no defining op}}
+  %res = emitc.expression : i32 {
+    emitc.yield %a : i32
+  }
+
+  return
+}
+
+// -----
+
+emitc.func @test_expression_op_outside_expression() {
+  %cond = literal "true" : i1
+  // expected-error @+1 {{'emitc.expression' op yielded value not defined within expression}}
+  %res = emitc.expression : i1 {
+    emitc.yield %cond : i1
+  }
+  return
+}
+
+// -----
+
 // expected-error @+1 {{'emitc.func' op requires zero or exactly one result, but has 2}}
 emitc.func @multiple_results(%0: i32) -> (i32, i32) {
   emitc.return %0 : i32

From a383b1a95b63cf120b3dea554c2d66ccfaee066b Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Mon, 16 Jun 2025 14:52:59 -0700
Subject: [PATCH 651/851] Reland "[HLSL][RootSignature] Implement serialization
 of RootConstants and RootFlags" (#143019)

This relands #141130.

The initial commit uncovered that we are missing the correct linking of
FrontendHLSL into clang/lib/Parse and clang/lib/unittests/Parse.

This change addreses this by linking them accordingly.

It was also checked and ensured that the LexHLSLRootSignature libraries
do not depend on FrontendHLSL and so we are not required to link there.

Resolves: #138190 and #138192
---
 clang/lib/Parse/CMakeLists.txt                |  1 +
 clang/unittests/Parse/CMakeLists.txt          |  1 +
 .../Frontend/HLSL/HLSLRootSignatureUtils.h    |  5 ++
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  | 33 +++++++++
 .../Frontend/HLSLRootSignatureDumpTest.cpp    | 69 +++++++++++++++++++
 5 files changed, 109 insertions(+)

diff --git a/clang/lib/Parse/CMakeLists.txt b/clang/lib/Parse/CMakeLists.txt
index 00fde537bb9c6..e6cbf3b868b7d 100644
--- a/clang/lib/Parse/CMakeLists.txt
+++ b/clang/lib/Parse/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  FrontendHLSL
   FrontendOpenMP
   MC
   MCParser
diff --git a/clang/unittests/Parse/CMakeLists.txt b/clang/unittests/Parse/CMakeLists.txt
index 6859efed294c8..2ed43a83b8782 100644
--- a/clang/unittests/Parse/CMakeLists.txt
+++ b/clang/unittests/Parse/CMakeLists.txt
@@ -11,5 +11,6 @@ add_clang_unittest(ParseTests
   LLVMTestingSupport
   clangTesting
   LLVM_COMPONENTS
+  FrontendHLSL
   Support
   )
diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
index 6d959ad5bdc7f..ca20e6719f3a4 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
@@ -27,6 +27,11 @@ class Metadata;
 namespace hlsl {
 namespace rootsig {
 
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const RootFlags &Flags);
+
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
+                                 const RootConstants &Constants);
+
 LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
                                  const DescriptorTableClause &Clause);
 
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index 7d744781da04f..5bae72a3986f8 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -129,6 +129,39 @@ static raw_ostream &operator<<(raw_ostream &OS,
   return OS;
 }
 
+static const EnumEntry<RootFlags> RootFlagNames[] = {
+    {"AllowInputAssemblerInputLayout",
+     RootFlags::AllowInputAssemblerInputLayout},
+    {"DenyVertexShaderRootAccess", RootFlags::DenyVertexShaderRootAccess},
+    {"DenyHullShaderRootAccess", RootFlags::DenyHullShaderRootAccess},
+    {"DenyDomainShaderRootAccess", RootFlags::DenyDomainShaderRootAccess},
+    {"DenyGeometryShaderRootAccess", RootFlags::DenyGeometryShaderRootAccess},
+    {"DenyPixelShaderRootAccess", RootFlags::DenyPixelShaderRootAccess},
+    {"AllowStreamOutput", RootFlags::AllowStreamOutput},
+    {"LocalRootSignature", RootFlags::LocalRootSignature},
+    {"DenyAmplificationShaderRootAccess",
+     RootFlags::DenyAmplificationShaderRootAccess},
+    {"DenyMeshShaderRootAccess", RootFlags::DenyMeshShaderRootAccess},
+    {"CBVSRVUAVHeapDirectlyIndexed", RootFlags::CBVSRVUAVHeapDirectlyIndexed},
+    {"SamplerHeapDirectlyIndexed", RootFlags::SamplerHeapDirectlyIndexed},
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const RootFlags &Flags) {
+  OS << "RootFlags(";
+  printFlags(OS, Flags, ArrayRef(RootFlagNames));
+  OS << ")";
+
+  return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const RootConstants &Constants) {
+  OS << "RootConstants(num32BitConstants = " << Constants.Num32BitConstants
+     << ", " << Constants.Reg << ", space = " << Constants.Space
+     << ", visibility = " << Constants.Visibility << ")";
+
+  return OS;
+}
+
 raw_ostream &operator<<(raw_ostream &OS, const DescriptorTable &Table) {
   OS << "DescriptorTable(numClauses = " << Table.NumClauses
      << ", visibility = " << Table.Visibility << ")";
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
index 90e6cd0a80d6b..1a0c8e2a16396 100644
--- a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
+++ b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
@@ -108,4 +108,73 @@ TEST(HLSLRootSignatureTest, DescriptorTableDump) {
   EXPECT_EQ(Out, Expected);
 }
 
+TEST(HLSLRootSignatureTest, DefaultRootConstantsDump) {
+  RootConstants Constants;
+  Constants.Num32BitConstants = 1;
+  Constants.Reg = {RegisterType::BReg, 3};
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Constants;
+  OS.flush();
+
+  std::string Expected = "RootConstants(num32BitConstants = 1, b3, space = 0, "
+                         "visibility = All)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, SetRootConstantsDump) {
+  RootConstants Constants;
+  Constants.Num32BitConstants = 983;
+  Constants.Reg = {RegisterType::BReg, 34593};
+  Constants.Space = 7;
+  Constants.Visibility = ShaderVisibility::Pixel;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Constants;
+  OS.flush();
+
+  std::string Expected = "RootConstants(num32BitConstants = 983, b34593, "
+                         "space = 7, visibility = Pixel)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, NoneRootFlagsDump) {
+  RootFlags Flags = RootFlags::None;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Flags;
+  OS.flush();
+
+  std::string Expected = "RootFlags(None)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, AllRootFlagsDump) {
+  RootFlags Flags = RootFlags::ValidFlags;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Flags;
+  OS.flush();
+
+  std::string Expected = "RootFlags("
+                         "AllowInputAssemblerInputLayout | "
+                         "DenyVertexShaderRootAccess | "
+                         "DenyHullShaderRootAccess | "
+                         "DenyDomainShaderRootAccess | "
+                         "DenyGeometryShaderRootAccess | "
+                         "DenyPixelShaderRootAccess | "
+                         "AllowStreamOutput | "
+                         "LocalRootSignature | "
+                         "DenyAmplificationShaderRootAccess | "
+                         "DenyMeshShaderRootAccess | "
+                         "CBVSRVUAVHeapDirectlyIndexed | "
+                         "SamplerHeapDirectlyIndexed)";
+
+  EXPECT_EQ(Out, Expected);
+}
+
 } // namespace

From 30b16ec3415e7ddb597d096f818d011b1b4e6a63 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Mon, 16 Jun 2025 22:54:25 +0100
Subject: [PATCH 652/851] [VPlan] Simplify trivial VPFirstOrderRecurrencePHI
 recipes.

VPFirstOrderRecurrencePHIRecipes where the incoming values are the same
can be simplified and removed.

Fixes https://github.com/llvm/llvm-project/issues/144212.

The new test is added together with other related tests from
first-order-recurrence.ll
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  |   6 +
 ...irst-order-recurrence-dead-instructions.ll | 270 +++++++++++
 .../LoopVectorize/first-order-recurrence.ll   | 452 ------------------
 3 files changed, 276 insertions(+), 452 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 44a72755b9cf8..05a0e15f9a199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1141,6 +1141,12 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
     return;
   }
 
+  if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
+    if (Phi->getOperand(0) == Phi->getOperand(1))
+      Def->replaceAllUsesWith(Phi->getOperand(0));
+    return;
+  }
+
   // Some simplifications can only be applied after unrolling. Perform them
   // below.
   if (!Plan->isUnrolled())
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
new file mode 100644
index 0000000000000..d98cd45cb634e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll
@@ -0,0 +1,270 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s
+
+; Test case for https://github.com/llvm/llvm-project/issues/144212.
+define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(i8 %for.start, ptr %dst) {
+; CHECK-LABEL: define i8 @recurrence_phi_with_same_incoming_values_after_simplifications(
+; CHECK-SAME: i8 [[FOR_START:%.*]], ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    br i1 true, label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[FOR_START]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLAT]], <4 x i8> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 4
+; CHECK-NEXT:    store <4 x i8> [[TMP0]], ptr [[TMP2]], align 1
+; CHECK-NEXT:    store <4 x i8> [[TMP0]], ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], -8
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i8> [[BROADCAST_SPLAT]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ -7, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ], [ 1, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i8 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ [[FOR_START]], %[[ENTRY]] ], [ [[FOR_START]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i8 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR_NEXT]] = and i8 [[FOR_START]], -1
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT:    store i8 [[FOR]], ptr [[GEP_DST]], align 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[FOR_NEXT_LCSSA:%.*]] = phi i8 [ [[FOR_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i8 [[FOR_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
+  %for = phi i8 [ %for.start, %entry ], [ %for.next, %loop ]
+  %for.next = and i8 %for.start, -1
+  %iv.next = add i32 %iv, 1
+  %gep.dst = getelementptr inbounds i8, ptr %dst, i32 %iv
+  store i8 %for, ptr %gep.dst
+  %ec = icmp eq i32 %iv.next, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i8 %for.next
+}
+
+; %vec.dead will be marked as dead instruction in the vector loop and no recipe
+; will be created for it. Make sure a valid sink target is used.
+define i32 @sink_after_dead_inst(ptr %A.ptr) {
+; CHECK-LABEL: define i32 @sink_after_dead_inst(
+; CHECK-SAME: ptr [[A_PTR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i16> [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP3]], i32 4
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP4]], align 4
+; CHECK-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
+; CHECK-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
+; CHECK-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
+; CHECK-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
+; CHECK-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
+; CHECK-NEXT:    store i32 0, ptr [[A_GEP]], align 4
+; CHECK-NEXT:    br i1 [[VEC_DEAD]], label %[[FOR_END]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], %[[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+  %for = phi i32 [ 0, %entry ], [ %for.prev, %loop ]
+  %cmp = icmp eq i32 %for, 15
+  %C = icmp eq i1 %cmp, true
+  %vec.dead = and i1 %C, 1
+  %iv.next = add i16 %iv, 1
+  %B1 = or i16 %iv.next, %iv.next
+  %B3 = and i1 %cmp, %C
+  %for.prev = zext i16 %B1 to i32
+
+  %ext = zext i1 %B3 to i32
+  %A.gep = getelementptr i32, ptr %A.ptr, i16 %iv
+  store i32 0, ptr %A.gep
+  br i1 %vec.dead, label %for.end, label %loop
+
+for.end:
+  ret i32 %for
+}
+
+; Dead instructions, like the exit condition are not part of the actual VPlan
+; and do not need to be sunk. PR44634.
+define void @sink_dead_inst(ptr %a) {
+; CHECK-LABEL: define void @sink_dead_inst(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; CHECK-NEXT:    [[TMP4]] = add <4 x i16> [[TMP1]], splat (i16 5)
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP7:%.*]] = sub <4 x i16> [[TMP5]], splat (i16 10)
+; CHECK-NEXT:    [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[A]], i16 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP9]], i32 4
+; CHECK-NEXT:    store <4 x i16> [[TMP7]], ptr [[TMP10]], align 2
+; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT2:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT1]], %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_COND:.*]]
+; CHECK:       [[FOR_COND]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT2]], %[[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
+; CHECK-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
+; CHECK-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
+  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
+  %rec.2 = phi i32 [ -27, %entry ], [ %rec.2.prev, %for.cond ]
+  %use.rec.1 = sub i16 %rec.1, 10
+  %cmp = icmp eq i32 %rec.2, 15
+  %iv.next = add i16 %iv, 1
+  %rec.2.prev = zext i16 %iv.next to i32
+  %rec.1.prev = add i16 %iv.next, 5
+  %gep = getelementptr i16, ptr %a, i16 %iv
+  store i16 %use.rec.1, ptr %gep
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.end:
+  ret void
+}
+
+; %rec.1 only has %use.rec.1 as use, which can be removed. This enables %rec.1
+; to be removed also.
+define void @unused_recurrence(ptr %a) {
+; CHECK-LABEL: define void @unused_recurrence(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, %[[MIDDLE_BLOCK]] ], [ -27, %[[ENTRY]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[FOR_COND:.*]]
+; CHECK:       [[FOR_COND]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], %[[FOR_COND]] ]
+; CHECK-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; CHECK-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; CHECK-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; CHECK-NEXT:    br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
+  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
+  %use.rec.1 = sub i16 %rec.1, 10
+  %iv.next= add i16 %iv, 1
+  %rec.1.prev = add i16 %iv.next, 5
+  %cmp = icmp eq i16 %iv, 1000
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.end:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 13dc53559d283..9be26d4247a36 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -2473,177 +2473,6 @@ for.end12.loopexit:                               ; preds = %cond.end
   ret void
 }
 
-; Dead instructions, like the exit condition are not part of the actual VPlan
-; and do not need to be sunk. PR44634.
-define void @sink_dead_inst(ptr %a) {
-; UNROLL-NO-IC-LABEL: @sink_dead_inst(
-; UNROLL-NO-IC-NEXT:  entry:
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-IC:       vector.ph:
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP1]], splat (i16 5)
-; UNROLL-NO-IC-NEXT:    [[TMP5]] = add <4 x i16> [[TMP2]], splat (i16 5)
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10)
-; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = sub <4 x i16> [[TMP7]], splat (i16 10)
-; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[TMP10]], i32 4
-; UNROLL-NO-IC-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2
-; UNROLL-NO-IC-NEXT:    store <4 x i16> [[TMP9]], ptr [[TMP12]], align 2
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
-; UNROLL-NO-IC:       for.cond:
-; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
-; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-IC-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
-; UNROLL-NO-IC-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-IC-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
-; UNROLL-NO-IC-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; UNROLL-NO-IC-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
-; UNROLL-NO-IC:       for.end:
-; UNROLL-NO-IC-NEXT:    ret void
-;
-; UNROLL-NO-VF-LABEL: @sink_dead_inst(
-; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-VF:       vector.ph:
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[TMP2]], 5
-; UNROLL-NO-VF-NEXT:    [[TMP6]] = add i16 [[TMP3]], 5
-; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sub i16 [[VECTOR_RECUR]], 10
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sub i16 [[TMP5]], 10
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[A]], i16 [[TMP1]]
-; UNROLL-NO-VF-NEXT:    store i16 [[TMP7]], ptr [[TMP9]], align 2
-; UNROLL-NO-VF-NEXT:    store i16 [[TMP8]], ptr [[TMP10]], align 2
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; UNROLL-NO-VF:       middle.block:
-; UNROLL-NO-VF-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 15, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT2:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND:%.*]]
-; UNROLL-NO-VF:       for.cond:
-; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT2]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
-; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-VF-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
-; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-VF-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
-; UNROLL-NO-VF-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
-; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    ret void
-;
-; SINK-AFTER-LABEL: @sink_dead_inst(
-; SINK-AFTER-NEXT:  entry:
-; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SINK-AFTER:       vector.ph:
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-; SINK-AFTER-NEXT:    [[TMP3]] = add <4 x i16> [[TMP1]], splat (i16 5)
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[TMP4]], splat (i16 10)
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
-; SINK-AFTER-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0
-; SINK-AFTER-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; SINK-AFTER-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; SINK-AFTER-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; SINK-AFTER:       middle.block:
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    br label [[FOR_COND:%.*]]
-; SINK-AFTER:       for.cond:
-; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
-; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; SINK-AFTER-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
-; SINK-AFTER-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; SINK-AFTER-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
-; SINK-AFTER-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
-; SINK-AFTER-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
-; SINK-AFTER:       for.end:
-; SINK-AFTER-NEXT:    ret void
-;
-entry:
-  br label %for.cond
-
-for.cond:
-  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
-  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
-  %rec.2 = phi i32 [ -27, %entry ], [ %rec.2.prev, %for.cond ]
-  %use.rec.1 = sub i16 %rec.1, 10
-  %cmp = icmp eq i32 %rec.2, 15
-  %iv.next = add i16 %iv, 1
-  %rec.2.prev = zext i16 %iv.next to i32
-  %rec.1.prev = add i16 %iv.next, 5
-  %gep = getelementptr i16, ptr %a, i16 %iv
-  store i16 %use.rec.1, ptr %gep
-  br i1 %cmp, label %for.end, label %for.cond
-
-for.end:
-  ret void
-}
-
 define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-IC-LABEL: @sink_into_replication_region(
 ; UNROLL-NO-IC-NEXT:  bb:
@@ -3464,287 +3293,6 @@ bb:
   br i1 %var9, label %bb1, label %bb2, !prof !2
 }
 
-; %vec.dead will be marked as dead instruction in the vector loop and no recipe
-; will be created for it. Make sure a valid sink target is used.
-define i32 @sink_after_dead_inst(ptr %A.ptr) {
-; UNROLL-NO-IC-LABEL: @sink_after_dead_inst(
-; UNROLL-NO-IC-NEXT:  entry:
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-IC:       vector.ph:
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 4
-; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
-; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP6]], align 4
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
-; UNROLL-NO-IC:       loop:
-; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; UNROLL-NO-IC-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
-; UNROLL-NO-IC-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-IC-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; UNROLL-NO-IC-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; UNROLL-NO-IC-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
-; UNROLL-NO-IC-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
-; UNROLL-NO-IC-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; UNROLL-NO-IC-NEXT:    store i32 0, ptr [[A_GEP]], align 4
-; UNROLL-NO-IC-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
-; UNROLL-NO-IC:       for.end:
-; UNROLL-NO-IC-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-IC-NEXT:    ret i32 [[FOR_LCSSA]]
-;
-; UNROLL-NO-VF-LABEL: @sink_after_dead_inst(
-; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-VF:       vector.ph:
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = or i16 [[TMP5]], [[TMP5]]
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
-; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP3]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP4]] to i32
-; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
-; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP8]], align 4
-; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP9]], align 4
-; UNROLL-NO-VF-NEXT:    [[TMP7]] = add nuw i32 [[VECTOR_RECUR]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; UNROLL-NO-VF:       middle.block:
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-VF-NEXT:    br label [[LOOP:%.*]]
-; UNROLL-NO-VF:       loop:
-; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UNROLL-NO-VF-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; UNROLL-NO-VF-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
-; UNROLL-NO-VF-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-VF-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; UNROLL-NO-VF-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; UNROLL-NO-VF-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
-; UNROLL-NO-VF-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
-; UNROLL-NO-VF-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[A_GEP]], align 4
-; UNROLL-NO-VF-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
-; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; UNROLL-NO-VF-NEXT:    ret i32 [[FOR_LCSSA]]
-;
-; SINK-AFTER-LABEL: @sink_after_dead_inst(
-; SINK-AFTER-NEXT:  entry:
-; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SINK-AFTER:       vector.ph:
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; SINK-AFTER-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
-; SINK-AFTER-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; SINK-AFTER-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; SINK-AFTER:       middle.block:
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SINK-AFTER-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    br label [[LOOP:%.*]]
-; SINK-AFTER:       loop:
-; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; SINK-AFTER-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
-; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
-; SINK-AFTER-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
-; SINK-AFTER-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
-; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; SINK-AFTER-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
-; SINK-AFTER-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
-; SINK-AFTER-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
-; SINK-AFTER-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
-; SINK-AFTER-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
-; SINK-AFTER-NEXT:    store i32 0, ptr [[A_GEP]], align 4
-; SINK-AFTER-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
-; SINK-AFTER:       for.end:
-; SINK-AFTER-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
-; SINK-AFTER-NEXT:    ret i32 [[FOR_LCSSA]]
-;
-entry:
-  br label %loop
-
-loop:
-  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
-  %for = phi i32 [ 0, %entry ], [ %for.prev, %loop ]
-  %cmp = icmp eq i32 %for, 15
-  %C = icmp eq i1 %cmp, true
-  %vec.dead = and i1 %C, 1
-  %iv.next = add i16 %iv, 1
-  %B1 = or i16 %iv.next, %iv.next
-  %B3 = and i1 %cmp, %C
-  %for.prev = zext i16 %B1 to i32
-
-  %ext = zext i1 %B3 to i32
-  %A.gep = getelementptr i32, ptr %A.ptr, i16 %iv
-  store i32 0, ptr %A.gep
-  br i1 %vec.dead, label %for.end, label %loop
-
-for.end:
-  ret i32 %for
-}
-
-; %rec.1 only has %use.rec.1 as use, which can be removed. This enables %rec.1
-; to be removed also.
-define void @unused_recurrence(ptr %a) {
-; UNROLL-NO-IC-LABEL: @unused_recurrence(
-; UNROLL-NO-IC-NEXT:  entry:
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-IC:       vector.ph:
-; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-IC:       vector.body:
-; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
-; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
-; UNROLL-NO-IC:       middle.block:
-; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-IC:       scalar.ph:
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
-; UNROLL-NO-IC:       for.cond:
-; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-IC-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; UNROLL-NO-IC-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
-; UNROLL-NO-IC:       for.end:
-; UNROLL-NO-IC-NEXT:    ret void
-;
-; UNROLL-NO-VF-LABEL: @unused_recurrence(
-; UNROLL-NO-VF-NEXT:  entry:
-; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL-NO-VF:       vector.ph:
-; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
-; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
-; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 5
-; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
-; UNROLL-NO-VF:       middle.block:
-; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND:%.*]]
-; UNROLL-NO-VF:       for.cond:
-; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; UNROLL-NO-VF-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
-; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    ret void
-;
-; SINK-AFTER-LABEL: @unused_recurrence(
-; SINK-AFTER-NEXT:  entry:
-; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SINK-AFTER:       vector.ph:
-; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
-; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
-; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
-; SINK-AFTER-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
-; SINK-AFTER:       middle.block:
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
-; SINK-AFTER-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; SINK-AFTER:       scalar.ph:
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
-; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; SINK-AFTER-NEXT:    br label [[FOR_COND:%.*]]
-; SINK-AFTER:       for.cond:
-; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
-; SINK-AFTER-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
-; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
-; SINK-AFTER-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
-; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
-; SINK-AFTER-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
-; SINK-AFTER:       for.end:
-; SINK-AFTER-NEXT:    ret void
-;
-entry:
-  br label %for.cond
-
-for.cond:
-  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
-  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
-  %use.rec.1 = sub i16 %rec.1, 10
-  %iv.next= add i16 %iv, 1
-  %rec.1.prev = add i16 %iv.next, 5
-  %cmp = icmp eq i16 %iv, 1000
-  br i1 %cmp, label %for.end, label %for.cond
-
-for.end:
-  ret void
-}
-
 ; Test case for https://github.com/llvm/llvm-project/issues/95520.
 define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-LABEL: @recurence_uniform_load(

From 4bcf9732c7361b3ea5208ced592245e0302fc7a2 Mon Sep 17 00:00:00 2001
From: Morris Hafner <mmha@users.noreply.github.com>
Date: Mon, 16 Jun 2025 23:03:49 +0100
Subject: [PATCH 653/851] [CIR] Add Support For Library Builtins (#143984)

This patch upstreams support for builtins that map to a standard library
function. Examples would be abort() and printf().

It also fixes a minor issue with the errorNYI for all remaining
unimplemented builtins using the mlir::Location instead of the clang AST
SourceLocation.
---
 clang/include/clang/CIR/MissingFeatures.h |  1 +
 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp   | 39 +++++++++++++-
 clang/lib/CIR/CodeGen/CIRGenModule.h      |  4 ++
 clang/test/CIR/CodeGen/builtin_call.cpp   | 18 +++++++
 clang/test/CIR/CodeGen/builtin_printf.cpp | 65 +++++++++++++++++++++++
 5 files changed, 125 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/builtin_printf.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 13ddc77835fbc..3dc28e6f2e5bf 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -236,6 +236,7 @@ struct MissingFeatures {
   static bool runCleanupsScope() { return false; }
   static bool lowerAggregateLoadStore() { return false; }
   static bool dataLayoutTypeAllocSize() { return false; }
+  static bool asmLabelAttr() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index c59ac78210f81..19fac00ab8736 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -20,10 +20,18 @@
 #include "mlir/Support/LLVM.h"
 #include "clang/AST/Expr.h"
 #include "clang/AST/GlobalDecl.h"
+#include "clang/CIR/MissingFeatures.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace clang;
 using namespace clang::CIRGen;
+using namespace llvm;
+
+static RValue emitLibraryCall(CIRGenFunction &cgf, const FunctionDecl *fd,
+                              const CallExpr *e, mlir::Operation *calleeValue) {
+  CIRGenCallee callee = CIRGenCallee::forDirect(calleeValue, GlobalDecl(fd));
+  return cgf.emitCall(e->getCallee()->getType(), callee, e, ReturnValueSlot());
+}
 
 RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
                                        const CallExpr *e,
@@ -49,7 +57,34 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
     }
   }
 
-  mlir::Location loc = getLoc(e->getExprLoc());
-  cgm.errorNYI(loc, "non constant foldable builtin calls");
+  const FunctionDecl *fd = gd.getDecl()->getAsFunction();
+
+  // If this is an alias for a lib function (e.g. __builtin_sin), emit
+  // the call using the normal call path, but using the unmangled
+  // version of the function name.
+  if (getContext().BuiltinInfo.isLibFunction(builtinID))
+    return emitLibraryCall(*this, fd, e,
+                           cgm.getBuiltinLibFunction(fd, builtinID));
+
+  cgm.errorNYI(e->getSourceRange(), "unimplemented builtin call");
   return getUndefRValue(e->getType());
 }
+
+/// Given a builtin id for a function like "__builtin_fabsf", return a Function*
+/// for "fabsf".
+cir::FuncOp CIRGenModule::getBuiltinLibFunction(const FunctionDecl *fd,
+                                                unsigned builtinID) {
+  assert(astContext.BuiltinInfo.isLibFunction(builtinID));
+
+  // Get the name, skip over the __builtin_ prefix (if necessary). We may have
+  // to build this up so provide a small stack buffer to handle the vast
+  // majority of names.
+  llvm::SmallString<64> name;
+
+  assert(!cir::MissingFeatures::asmLabelAttr());
+  name = astContext.BuiltinInfo.getName(builtinID).substr(10);
+
+  GlobalDecl d(fd);
+  mlir::Type type = convertType(fd->getType());
+  return getOrCreateCIRFunction(name, type, d, /*forVTable=*/false);
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 03606dba200fd..0ea2d9f9c8229 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -301,6 +301,10 @@ class CIRGenModule : public CIRGenTypeCache {
                                 cir::FuncType funcType,
                                 const clang::FunctionDecl *funcDecl);
 
+  /// Given a builtin id for a function like "__builtin_fabsf", return a
+  /// Function* for "fabsf".
+  cir::FuncOp getBuiltinLibFunction(const FunctionDecl *fd, unsigned builtinID);
+
   mlir::IntegerAttr getSize(CharUnits size) {
     return builder.getSizeFromCharUnits(size);
   }
diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp
index 2706ea7f8f857..322c13c8f081a 100644
--- a/clang/test/CIR/CodeGen/builtin_call.cpp
+++ b/clang/test/CIR/CodeGen/builtin_call.cpp
@@ -76,3 +76,21 @@ float constant_fp_builtin_single() {
 // OGCG: define {{.*}}float @_Z26constant_fp_builtin_singlev()
 // OGCG: ret float 0x3FB99999A0000000
 // OGCG: }
+
+void library_builtins() {
+  __builtin_printf(nullptr);
+  __builtin_abort();
+}
+
+// CIR: cir.func @_Z16library_builtinsv() {
+// CIR: %[[NULL:.+]] = cir.const #cir.ptr<null> : !cir.ptr<!s8i>
+// CIR: cir.call @printf(%[[NULL]]) : (!cir.ptr<!s8i>) -> !s32i
+// CIR: cir.call @abort() : () -> ()
+
+// LLVM: define void @_Z16library_builtinsv()
+// LLVM: call i32 (ptr, ...) @printf(ptr null)
+// LLVM: call void @abort()
+
+// OGCG: define dso_local void @_Z16library_builtinsv()
+// OGCG: call i32 (ptr, ...) @printf(ptr noundef null)
+// OGCG: call void @abort()
diff --git a/clang/test/CIR/CodeGen/builtin_printf.cpp b/clang/test/CIR/CodeGen/builtin_printf.cpp
new file mode 100644
index 0000000000000..366e474c2b09a
--- /dev/null
+++ b/clang/test/CIR/CodeGen/builtin_printf.cpp
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+// CIR: cir.global "private" cir_private dsolocal @".str" = #cir.const_array<"%s\00" : !cir.array<!s8i x 3>> : !cir.array<!s8i x 3> 
+// CIR: cir.global "private" cir_private dsolocal @".str.1" = #cir.const_array<"%s %d\0A\00" : !cir.array<!s8i x 7>> : !cir.array<!s8i x 7>
+// LLVM: @.str = private global [3 x i8] c"%s\00"
+// LLVM: @.str.1 = private global [7 x i8] c"%s %d\0A\00"
+// OGCG: @.str = private unnamed_addr constant [3 x i8] c"%s\00"
+// OGCG: @.str.1 = private unnamed_addr constant [7 x i8] c"%s %d\0A\00"
+
+void func(char const * const str, int i) {
+  __builtin_printf(nullptr);
+  __builtin_printf("%s", str);
+  __builtin_printf("%s %d\n", str, i);
+}
+
+// CIR: cir.func @printf(!cir.ptr<!s8i>, ...) -> !s32i
+
+// CIR: cir.func @_Z4funcPKci(%[[arg0:.+]]: !cir.ptr<!s8i>{{.*}}, %[[arg1:.+]]: !s32i{{.*}}) {
+// CIR:   %[[str_ptr:.+]] = cir.alloca !cir.ptr<!s8i>, !cir.ptr<!cir.ptr<!s8i>>, ["str", init, const]
+// CIR:   %[[i_ptr:.+]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+// CIR:   cir.store %[[arg0]], %[[str_ptr]] : !cir.ptr<!s8i>, !cir.ptr<!cir.ptr<!s8i>>
+// CIR:   cir.store %[[arg1]], %[[i_ptr]] : !s32i, !cir.ptr<!s32i>
+// CIR:   %[[null_ptr:.+]] = cir.const #cir.ptr<null> : !cir.ptr<!s8i>
+// CIR:   %[[printf_result1:.+]] = cir.call @printf(%[[null_ptr]]) : (!cir.ptr<!s8i>) -> !s32i
+// CIR:   %[[str_fmt_global:.+]] = cir.get_global @".str" : !cir.ptr<!cir.array<!s8i x 3>>
+// CIR:   %[[str_fmt_ptr:.+]] = cir.cast(array_to_ptrdecay, %[[str_fmt_global]] : !cir.ptr<!cir.array<!s8i x 3>>), !cir.ptr<!s8i>
+// CIR:   %[[str_val:.+]] = cir.load{{.*}} %[[str_ptr]] : !cir.ptr<!cir.ptr<!s8i>>, !cir.ptr<!s8i>
+// CIR:   %[[printf_result2:.+]] = cir.call @printf(%[[str_fmt_ptr]], %[[str_val]]) : (!cir.ptr<!s8i>, !cir.ptr<!s8i>) -> !s32i
+// CIR:   %[[full_fmt_global:.+]] = cir.get_global @".str.1" : !cir.ptr<!cir.array<!s8i x 7>>
+// CIR:   %[[full_fmt_ptr:.+]] = cir.cast(array_to_ptrdecay, %[[full_fmt_global]] : !cir.ptr<!cir.array<!s8i x 7>>), !cir.ptr<!s8i>
+// CIR:   %[[str_val2:.+]] = cir.load{{.*}} %[[str_ptr]] : !cir.ptr<!cir.ptr<!s8i>>, !cir.ptr<!s8i>
+// CIR:   %[[i_val:.+]] = cir.load{{.*}} %[[i_ptr]] : !cir.ptr<!s32i>, !s32i
+// CIR:   %[[printf_result3:.+]] = cir.call @printf(%[[full_fmt_ptr]], %[[str_val2]], %[[i_val]]) : (!cir.ptr<!s8i>, !cir.ptr<!s8i>, !s32i) -> !s32i
+// CIR:   cir.return
+
+// LLVM: define void @_Z4funcPKci(ptr %[[arg0:.+]], i32 %[[arg1:.+]])
+// LLVM:   %[[str_ptr:.+]] = alloca ptr
+// LLVM:   %[[i_ptr:.+]] = alloca i32
+// LLVM:   store ptr %[[arg0]], ptr %[[str_ptr]]{{.*}}
+// LLVM:   store i32 %[[arg1]], ptr %[[i_ptr]]{{.*}}
+// LLVM:   %[[printf_result1:.+]] = call i32 (ptr, ...) @printf(ptr null)
+// LLVM:   %[[str_val:.+]] = load ptr, ptr %[[str_ptr]]{{.*}}
+// LLVM:   %[[printf_result2:.+]] = call i32 (ptr, ...) @printf(ptr @.str, ptr %[[str_val]])
+// LLVM:   %[[str_val2:.+]] = load ptr, ptr %[[str_ptr]]{{.*}}
+// LLVM:   %[[i_val:.+]] = load i32, ptr %[[i_ptr]]{{.*}}
+// LLVM:   %[[printf_result3:.+]] = call i32 (ptr, ...) @printf(ptr @.str.1, ptr %[[str_val2]], i32 %[[i_val]])
+// LLVM:   ret void
+
+// OGCG: define dso_local void @_Z4funcPKci(ptr noundef %[[arg0:.+]], i32 noundef %[[arg1:.+]])
+// OGCG:   %[[str_ptr:.+]] = alloca ptr
+// OGCG:   %[[i_ptr:.+]] = alloca i32
+// OGCG:   store ptr %[[arg0]], ptr %[[str_ptr]]{{.*}}
+// OGCG:   store i32 %[[arg1]], ptr %[[i_ptr]]{{.*}}
+// OGCG:   %[[printf_result1:.+]] = call i32 (ptr, ...) @printf(ptr noundef null)
+// OGCG:   %[[str_val:.+]] = load ptr, ptr %[[str_ptr]]{{.*}}
+// OGCG:   %[[printf_result2:.+]] = call i32 (ptr, ...) @printf(ptr noundef @.str, ptr noundef %[[str_val]])
+// OGCG:   %[[str_val2:.+]] = load ptr, ptr %[[str_ptr]]{{.*}}
+// OGCG:   %[[i_val:.+]] = load i32, ptr %[[i_ptr]]{{.*}}
+// OGCG:   %[[printf_result3:.+]] = call i32 (ptr, ...) @printf(ptr noundef @.str.1, ptr noundef %[[str_val2]], i32 noundef %[[i_val]])
+// OGCG:   ret void

From 2488f26d15e7e12aef9ead3fcb2d1b6da51812fb Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao@microsoft.com>
Date: Mon, 16 Jun 2025 15:06:41 -0700
Subject: [PATCH 654/851] [win][x64] Unwind v2 3/n: Add support for requiring
 unwind v2 to be used (equivalent to MSVC's /d2epilogunwindrequirev2)
 (#143577)

#129142 added support for emitting Windows x64 unwind v2 information,
but it was "best effort". If any function didn't follow the requirements
for v2 it was silently downgraded to v1.

There are some parts of Windows (specifically kernel-mode code running
on Xbox) that require v2, hence we need the ability to fail the
compilation if v2 can't be used.

This change also adds a heuristic to check if there might be too many
unwind codes, it's currently conservative (i.e., assumes that certain
prolog instructions will use the maximum number of unwind codes).

Future work: attempting to chain unwind info across multiple tables if
there are too many unwind codes due to epilogs and adding a heuristic to
detect if an epilog will be too far from the end of the function.
---
 clang/include/clang/Basic/CodeGenOptions.def  |   6 +-
 clang/include/clang/Driver/Options.td         |  17 +-
 clang/lib/CodeGen/CodeGenModule.cpp           |   6 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |   9 +-
 clang/test/CodeGen/epilog-unwind.c            |  10 +-
 clang/test/Driver/cl-options.c                |   6 +-
 llvm/include/llvm/IR/Module.h                 |   4 +
 llvm/include/llvm/Support/CodeGen.h           |   9 +
 llvm/lib/IR/Module.cpp                        |   7 +
 llvm/lib/Target/X86/X86WinEHUnwindV2.cpp      | 152 +++++++--
 .../CodeGen/X86/win64-eh-unwindv2-errors.mir  | 318 ++++++++++++++++++
 .../win64-eh-unwindv2-too-many-epilogs.mir    |  94 ++++++
 12 files changed, 595 insertions(+), 43 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
 create mode 100644 llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir

diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 2a30ff11464dd..e5566a540dc65 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -483,8 +483,10 @@ CODEGENOPT(StaticClosure, 1, 0)
 /// Assume that UAVs/SRVs may alias
 CODEGENOPT(ResMayAlias, 1, 0)
 
-/// Enables unwind v2 (epilog) information for x64 Windows.
-CODEGENOPT(WinX64EHUnwindV2, 1, 0)
+/// Controls how unwind v2 (epilog) information should be generated for x64
+/// Windows.
+ENUM_CODEGENOPT(WinX64EHUnwindV2, llvm::WinX64EHUnwindV2Mode,
+                2, llvm::WinX64EHUnwindV2Mode::Disabled)
 
 /// FIXME: Make DebugOptions its own top-level .def file.
 #include "DebugOptions.def"
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1b07deb4a8482..72d564e1ba0be 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -2167,11 +2167,14 @@ defm assume_nothrow_exception_dtor: BoolFOption<"assume-nothrow-exception-dtor",
   LangOpts<"AssumeNothrowExceptionDtor">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option], "Assume that exception objects' destructors are non-throwing">,
   NegFlag<SetFalse>>;
-defm winx64_eh_unwindv2 : BoolFOption<"winx64-eh-unwindv2",
-  CodeGenOpts<"WinX64EHUnwindV2">, DefaultFalse,
-  PosFlag<SetTrue, [], [ClangOption, CC1Option], "Enable">,
-  NegFlag<SetFalse, [], [ClangOption], "Disable">,
-  BothFlags<[], [ClangOption], " unwind v2 (epilog) information for x64 Windows">>;
+def winx64_eh_unwindv2
+    : Joined<["-"], "fwinx64-eh-unwindv2=">, Group<f_Group>,
+    Visibility<[ClangOption, CC1Option]>,
+      HelpText<"Generate unwind v2 (epilog) information for x64 Windows">,
+      Values<"disabled,best-effort,required">,
+      NormalizedValues<["Disabled", "BestEffort", "Required"]>,
+      NormalizedValuesScope<"llvm::WinX64EHUnwindV2Mode">,
+      MarshallingInfoEnum<CodeGenOpts<"WinX64EHUnwindV2">, "Disabled">;
 def fexcess_precision_EQ : Joined<["-"], "fexcess-precision=">, Group<f_Group>,
   Visibility<[ClangOption, CLOption]>,
   HelpText<"Allows control over excess precision on targets where native "
@@ -8972,7 +8975,9 @@ def _SLASH_volatile_Group : OptionGroup<"</volatile group>">,
   Group<cl_compile_Group>;
 
 def _SLASH_d2epilogunwind : CLFlag<"d2epilogunwind">,
-  HelpText<"Enable unwind v2 (epilog) information for x64 Windows">;
+  HelpText<"Best effort generate unwind v2 (epilog) information for x64 Windows">;
+def _SLASH_d2epilogunwindrequirev2 : CLFlag<"d2epilogunwindrequirev2">,
+  HelpText<"Require generation of unwind v2 (epilog) information for x64 Windows">;
 def _SLASH_EH : CLJoined<"EH">, HelpText<"Set exception handling model">;
 def _SLASH_EP : CLFlag<"EP">,
   HelpText<"Disable linemarker output and preprocess to stdout">;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 06c0e1f8afe1b..c27168e4c4bfe 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1319,8 +1319,10 @@ void CodeGenModule::Release() {
                               1);
 
   // Enable unwind v2 (epilog).
-  if (CodeGenOpts.WinX64EHUnwindV2)
-    getModule().addModuleFlag(llvm::Module::Warning, "winx64-eh-unwindv2", 1);
+  if (CodeGenOpts.getWinX64EHUnwindV2() != llvm::WinX64EHUnwindV2Mode::Disabled)
+    getModule().addModuleFlag(
+        llvm::Module::Warning, "winx64-eh-unwindv2",
+        static_cast<unsigned>(CodeGenOpts.getWinX64EHUnwindV2()));
 
   // Indicate whether this Module was compiled with -fopenmp
   if (getLangOpts().OpenMP && !getLangOpts().OpenMPSimd)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 8556bcadf0915..7dfed3a3356bb 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7360,8 +7360,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   // Unwind v2 (epilog) information for x64 Windows.
-  Args.addOptInFlag(CmdArgs, options::OPT_fwinx64_eh_unwindv2,
-                    options::OPT_fno_winx64_eh_unwindv2);
+  Args.AddLastArg(CmdArgs, options::OPT_winx64_eh_unwindv2);
 
   // C++ "sane" operator new.
   Args.addOptOutFlag(CmdArgs, options::OPT_fassume_sane_operator_new,
@@ -8418,8 +8417,10 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
     CmdArgs.push_back("-fms-kernel");
 
   // Unwind v2 (epilog) information for x64 Windows.
-  if (Args.hasArg(options::OPT__SLASH_d2epilogunwind))
-    CmdArgs.push_back("-fwinx64-eh-unwindv2");
+  if (Args.hasArg(options::OPT__SLASH_d2epilogunwindrequirev2))
+    CmdArgs.push_back("-fwinx64-eh-unwindv2=required");
+  else if (Args.hasArg(options::OPT__SLASH_d2epilogunwind))
+    CmdArgs.push_back("-fwinx64-eh-unwindv2=best-effort");
 
   for (const Arg *A : Args.filtered(options::OPT__SLASH_guard)) {
     StringRef GuardArgs = A->getValue();
diff --git a/clang/test/CodeGen/epilog-unwind.c b/clang/test/CodeGen/epilog-unwind.c
index 991ff09fb37cf..b2f7497b455b6 100644
--- a/clang/test/CodeGen/epilog-unwind.c
+++ b/clang/test/CodeGen/epilog-unwind.c
@@ -1,9 +1,11 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED
-// RUN: %clang_cc1 -fwinx64-eh-unwindv2 -emit-llvm %s -o - | FileCheck %s -check-prefix=ENABLED
-// RUN: %clang -fwinx64-eh-unwindv2 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=ENABLED
-// RUN: %clang -fno-winx64-eh-unwindv2 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED
+// RUN: %clang_cc1 -fwinx64-eh-unwindv2=disabled -emit-llvm %s -o - | FileCheck %s -check-prefix=DISABLED
+// RUN: %clang_cc1 -fwinx64-eh-unwindv2=best-effort -emit-llvm %s -o - | FileCheck %s -check-prefix=BESTEFFORT
+// RUN: %clang_cc1 -fwinx64-eh-unwindv2=required -emit-llvm %s -o - | FileCheck %s -check-prefix=REQUIRED
+// RUN: %clang -fwinx64-eh-unwindv2=best-effort -S -emit-llvm %s -o - | FileCheck %s -check-prefix=BESTEFFORT
 
 void f(void) {}
 
-// ENABLED: !"winx64-eh-unwindv2", i32 1}
+// BESTEFFORT: !"winx64-eh-unwindv2", i32 1}
+// REQUIRED: !"winx64-eh-unwindv2", i32 2}
 // DISABLED-NOT: "winx64-eh-unwindv2"
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 0535285862b9f..eb079895a0a88 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -821,7 +821,11 @@
 // ARM64EC_OVERRIDE: warning: /arm64EC has been overridden by specified target: x86_64-pc-windows-msvc; option ignored
 
 // RUN: %clang_cl /d2epilogunwind /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWIND
-// EPILOGUNWIND: -fwinx64-eh-unwindv2
+// EPILOGUNWIND: -fwinx64-eh-unwindv2=best-effort
+
+// RUN: %clang_cl /d2epilogunwindrequirev2 /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWINDREQUIREV2
+// RUN: %clang_cl /d2epilogunwindrequirev2 /d2epilogunwind /c -### -- %s 2>&1 | FileCheck %s --check-prefix=EPILOGUNWINDREQUIREV2
+// EPILOGUNWINDREQUIREV2: -fwinx64-eh-unwindv2=require
 
 // RUN: %clang_cl /funcoverride:override_me1 /funcoverride:override_me2 /c -### -- %s 2>&1 | FileCheck %s --check-prefix=FUNCOVERRIDE
 // FUNCOVERRIDE: -loader-replaceable-function=override_me1
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index f4420f460741b..a99937a90cbb7 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -1041,6 +1041,10 @@ class LLVM_ABI Module {
 
   /// Returns target-abi from MDString, null if target-abi is absent.
   StringRef getTargetABIFromMD();
+
+  /// Get how unwind v2 (epilog) information should be generated for x64
+  /// Windows.
+  WinX64EHUnwindV2Mode getWinX64EHUnwindV2Mode() const;
 };
 
 /// Given "llvm.used" or "llvm.compiler.used" as a global name, collect the
diff --git a/llvm/include/llvm/Support/CodeGen.h b/llvm/include/llvm/Support/CodeGen.h
index 0e42789ba932e..48745f7f4d2a6 100644
--- a/llvm/include/llvm/Support/CodeGen.h
+++ b/llvm/include/llvm/Support/CodeGen.h
@@ -130,6 +130,15 @@ namespace llvm {
     Invalid = 2, ///< Not used.
   };
 
+  enum class WinX64EHUnwindV2Mode {
+    // Don't use unwind v2 (i.e., use v1).
+    Disabled = 0,
+    // Use unwind v2 here possible, otherwise fallback to v1.
+    BestEffort = 1,
+    // Use unwind v2 everywhere, otherwise raise an error.
+    Required = 2,
+  };
+
   } // namespace llvm
 
 #endif
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 37f4a72d8c20b..2d31481f62c67 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -917,3 +917,10 @@ StringRef Module::getTargetABIFromMD() {
     TargetABI = TargetABIMD->getString();
   return TargetABI;
 }
+
+WinX64EHUnwindV2Mode Module::getWinX64EHUnwindV2Mode() const {
+  Metadata *MD = getModuleFlag("winx64-eh-unwindv2");
+  if (auto *CI = mdconst::dyn_extract_or_null<ConstantInt>(MD))
+    return static_cast<WinX64EHUnwindV2Mode>(CI->getZExtValue());
+  return WinX64EHUnwindV2Mode::Disabled;
+}
diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
index 2c1f9a5746e38..e9081a4ae4e72 100644
--- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
+++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Module.h"
 
 using namespace llvm;
@@ -31,6 +32,15 @@ STATISTIC(MeetsUnwindV2Criteria,
 STATISTIC(FailsUnwindV2Criteria,
           "Number of functions that fail Unwind v2 criteria");
 
+static cl::opt<unsigned> MaximumUnwindCodes(
+    "x86-wineh-unwindv2-max-unwind-codes", cl::Hidden,
+    cl::desc("Maximum number of unwind codes permitted in each unwind info."),
+    cl::init(UINT8_MAX));
+
+static cl::opt<unsigned>
+    ForceMode("x86-wineh-unwindv2-force-mode", cl::Hidden,
+              cl::desc("Overwrites the Unwind v2 mode for testing purposes."));
+
 namespace {
 
 class X86WinEHUnwindV2 : public MachineFunctionPass {
@@ -44,10 +54,12 @@ class X86WinEHUnwindV2 : public MachineFunctionPass {
   StringRef getPassName() const override { return "WinEH Unwind V2"; }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
-  bool rejectCurrentFunction() const {
-    FailsUnwindV2Criteria++;
-    return false;
-  }
+
+private:
+  /// Rejects the current function due to an internal error within LLVM.
+  static bool rejectCurrentFunctionInternalError(const MachineFunction &MF,
+                                                 WinX64EHUnwindV2Mode Mode,
+                                                 StringRef Reason);
 };
 
 enum class FunctionState {
@@ -69,8 +81,21 @@ FunctionPass *llvm::createX86WinEHUnwindV2Pass() {
   return new X86WinEHUnwindV2();
 }
 
+DebugLoc findDebugLoc(const MachineBasicBlock &MBB) {
+  for (const MachineInstr &MI : MBB)
+    if (MI.getDebugLoc())
+      return MI.getDebugLoc();
+
+  return DebugLoc::getUnknown();
+}
+
 bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
-  if (!MF.getFunction().getParent()->getModuleFlag("winx64-eh-unwindv2"))
+  WinX64EHUnwindV2Mode Mode =
+      ForceMode.getNumOccurrences()
+          ? static_cast<WinX64EHUnwindV2Mode>(ForceMode.getValue())
+          : MF.getFunction().getParent()->getWinX64EHUnwindV2Mode();
+
+  if (Mode == WinX64EHUnwindV2Mode::Disabled)
     return false;
 
   // Current state of processing the function. We'll assume that all functions
@@ -80,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
   // Prolog information.
   SmallVector<int64_t> PushedRegs;
   bool HasStackAlloc = false;
+  unsigned ApproximatePrologCodeCount = 0;
 
   // Requested changes.
   SmallVector<MachineInstr *> UnwindV2StartLocations;
@@ -99,6 +125,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
       case X86::SEH_PushReg:
         if (State != FunctionState::InProlog)
           llvm_unreachable("SEH_PushReg outside of prolog");
+        ApproximatePrologCodeCount++;
         PushedRegs.push_back(MI.getOperand(0).getImm());
         break;
 
@@ -106,9 +133,26 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
       case X86::SEH_SetFrame:
         if (State != FunctionState::InProlog)
           llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog");
+        // Assume a large alloc...
+        ApproximatePrologCodeCount +=
+            (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1;
         HasStackAlloc = true;
         break;
 
+      case X86::SEH_SaveReg:
+      case X86::SEH_SaveXMM:
+        if (State != FunctionState::InProlog)
+          llvm_unreachable("SEH_SaveXMM or SEH_SaveReg outside of prolog");
+        // Assume a big reg...
+        ApproximatePrologCodeCount += 3;
+        break;
+
+      case X86::SEH_PushFrame:
+        if (State != FunctionState::InProlog)
+          llvm_unreachable("SEH_PushFrame outside of prolog");
+        ApproximatePrologCodeCount++;
+        break;
+
       case X86::SEH_EndPrologue:
         if (State != FunctionState::InProlog)
           llvm_unreachable("SEH_EndPrologue outside of prolog");
@@ -127,10 +171,16 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
       case X86::SEH_EndEpilogue:
         if (State != FunctionState::InEpilog)
           llvm_unreachable("SEH_EndEpilogue outside of epilog");
-        if ((HasStackAlloc != HasStackDealloc) ||
-            (PoppedRegCount != PushedRegs.size()))
-          // Non-canonical epilog, reject the function.
-          return rejectCurrentFunction();
+        if (HasStackAlloc != HasStackDealloc)
+          return rejectCurrentFunctionInternalError(
+              MF, Mode,
+              "The prolog made a stack allocation, "
+              "but the epilog did not deallocate it");
+        if (PoppedRegCount != PushedRegs.size())
+          return rejectCurrentFunctionInternalError(
+              MF, Mode,
+              "The prolog pushed more registers than "
+              "the epilog popped");
 
         // If we didn't find the start location, then use the end of the
         // epilog.
@@ -145,13 +195,26 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
         if (State == FunctionState::InEpilog) {
           // If the prolog contains a stack allocation, then the first
           // instruction in the epilog must be to adjust the stack pointer.
-          if (!HasStackAlloc || HasStackDealloc || (PoppedRegCount > 0)) {
-            return rejectCurrentFunction();
-          }
+          if (!HasStackAlloc)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is deallocating a stack "
+                "allocation, but the prolog did "
+                "not allocate one");
+          if (HasStackDealloc)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is deallocating the stack "
+                "allocation more than once");
+          if (PoppedRegCount > 0)
+            llvm_unreachable(
+                "Should have raised an error: either popping before "
+                "deallocating or deallocating without an allocation");
+
           HasStackDealloc = true;
         } else if (State == FunctionState::FinishedEpilog)
-          // Unexpected instruction after the epilog.
-          return rejectCurrentFunction();
+          return rejectCurrentFunctionInternalError(
+              MF, Mode, "Unexpected mov or add instruction after the epilog");
         break;
 
       case X86::POP64r:
@@ -159,12 +222,22 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
           // After the stack pointer has been adjusted, the epilog must
           // POP each register in reverse order of the PUSHes in the prolog.
           PoppedRegCount++;
-          if ((HasStackAlloc != HasStackDealloc) ||
-              (PoppedRegCount > PushedRegs.size()) ||
-              (PushedRegs[PushedRegs.size() - PoppedRegCount] !=
-               MI.getOperand(0).getReg())) {
-            return rejectCurrentFunction();
-          }
+          if (HasStackAlloc != HasStackDealloc)
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "Cannot pop registers before the stack "
+                "allocation has been deallocated");
+          if (PoppedRegCount > PushedRegs.size())
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is popping more registers than the prolog pushed");
+          if (PushedRegs[PushedRegs.size() - PoppedRegCount] !=
+              MI.getOperand(0).getReg())
+            return rejectCurrentFunctionInternalError(
+                MF, Mode,
+                "The epilog is popping a registers in "
+                "a different order than the "
+                "prolog pushed them");
 
           // Unwind v2 records the size of the epilog not from where we place
           // SEH_BeginEpilogue (as that contains the instruction to adjust the
@@ -176,7 +249,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
           }
         } else if (State == FunctionState::FinishedEpilog)
           // Unexpected instruction after the epilog.
-          return rejectCurrentFunction();
+          return rejectCurrentFunctionInternalError(
+              MF, Mode, "Registers are being popped after the epilog");
         break;
 
       default:
@@ -191,7 +265,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
           if ((State == FunctionState::FinishedEpilog) ||
               (State == FunctionState::InEpilog))
             // Unknown instruction in or after the epilog.
-            return rejectCurrentFunction();
+            return rejectCurrentFunctionInternalError(
+                MF, Mode, "Unexpected instruction in or after the epilog");
         }
       }
     }
@@ -203,6 +278,25 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
     return false;
   }
 
+  MachineBasicBlock &FirstMBB = MF.front();
+  // Assume +1 for the "header" UOP_Epilog that contains the epilog size, and
+  // that we won't be able to use the "last epilog at the end of function"
+  // optimization.
+  if (ApproximatePrologCodeCount + UnwindV2StartLocations.size() + 1 >
+      static_cast<unsigned>(MaximumUnwindCodes)) {
+    if (Mode == WinX64EHUnwindV2Mode::Required)
+      MF.getFunction().getContext().diagnose(DiagnosticInfoGenericWithLoc(
+          "Windows x64 Unwind v2 is required, but the function '" +
+              MF.getName() +
+              "' has too many unwind codes. Try splitting the function or "
+              "reducing the number of places where it exits early with a tail "
+              "call.",
+          MF.getFunction(), findDebugLoc(FirstMBB)));
+
+    FailsUnwindV2Criteria++;
+    return false;
+  }
+
   MeetsUnwindV2Criteria++;
 
   // Emit the pseudo instruction that marks the start of each epilog.
@@ -212,10 +306,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
             TII->get(X86::SEH_UnwindV2Start));
   }
   // Note that the function is using Unwind v2.
-  MachineBasicBlock &FirstMBB = MF.front();
-  BuildMI(FirstMBB, FirstMBB.front(), FirstMBB.front().getDebugLoc(),
+  BuildMI(FirstMBB, FirstMBB.front(), findDebugLoc(FirstMBB),
           TII->get(X86::SEH_UnwindVersion))
       .addImm(2);
 
   return true;
 }
+
+bool X86WinEHUnwindV2::rejectCurrentFunctionInternalError(
+    const MachineFunction &MF, WinX64EHUnwindV2Mode Mode, StringRef Reason) {
+  if (Mode == WinX64EHUnwindV2Mode::Required)
+    reportFatalInternalError("Windows x64 Unwind v2 is required, but LLVM has "
+                             "generated incompatible code in function '" +
+                             MF.getName() + "': " + Reason);
+
+  FailsUnwindV2Criteria++;
+  return false;
+}
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
new file mode 100644
index 0000000000000..f099d4fddcb33
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-errors.mir
@@ -0,0 +1,318 @@
+# RUN: split-file %s %t
+
+# If we force "best effort" mode, then we won't see any errors, but we won't use
+# v2.
+# BESTEFFORT-NOT: SEH_UnwindVersion
+# BESTEFFORT-NOT: SEH_UnwindV2Start
+
+;--- alloc_no_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/alloc_no_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=ALLOC-NO-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/alloc_no_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# ALLOC-NO-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'alloc_no_dealloc':
+# ALLOC-NO-DEALLOC-SAME: The prolog made a stack allocation, but the epilog did not deallocate it
+
+--- |
+  define dso_local void @alloc_no_dealloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            alloc_no_dealloc
+body:             |
+  bb.0.entry:
+    $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
+    frame-setup SEH_StackAlloc 40
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- missed_push.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/missed_push.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \
+# RUN:    --check-prefix=MISSED-PUSH
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/missed_push.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# MISSED-PUSH: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'missed_push':
+# MISSED-PUSH-SAME: The prolog pushed more registers than the epilog popped
+
+--- |
+  define dso_local void @missed_push() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            missed_push
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rsi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 60
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- dealloc_no_alloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/dealloc_no_alloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=DEALLOC-NO-ALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/dealloc_no_alloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# DEALLOC-NO-ALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_no_alloc':
+# DEALLOC-NO-ALLOC-SAME: The epilog is deallocating a stack allocation, but the prolog did not allocate one
+
+--- |
+  define dso_local void @dealloc_no_alloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            dealloc_no_alloc
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- double_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/double_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \
+# RUN:    --check-prefix=DOUBLE-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/double_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# DOUBLE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'double_dealloc':
+# DOUBLE-DEALLOC-SAME: The epilog is deallocating the stack allocation more than once
+
+--- |
+  define dso_local void @double_dealloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            double_dealloc
+body:             |
+  bb.0.entry:
+    $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
+    frame-setup SEH_StackAlloc 40
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- dealloc_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=DEALLOC-AFTER-EPILOG
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/dealloc_after_epilog.mir -run-pass=x86-wineh-unwindv2 \
+# RUN:    -x86-wineh-unwindv2-force-mode=1 |  FileCheck %s \
+# RUN:    --check-prefix=BESTEFFORT
+# DEALLOC-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'dealloc_after_epilog':
+# DEALLOC-AFTER-EPILOG-SAME: Unexpected mov or add instruction after the epilog
+
+--- |
+  define dso_local void @dealloc_after_epilog() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            dealloc_after_epilog
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    RET64
+...
+
+;--- pop_before_dealloc.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/pop_before_dealloc.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=POP-BEFORE-DEALLOC
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_before_dealloc.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# POP-BEFORE-DEALLOC: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_before_dealloc':
+# POP-BEFORE-DEALLOC-SAME: Cannot pop registers before the stack allocation has been deallocated
+
+--- |
+  define dso_local void @pop_before_dealloc() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            pop_before_dealloc
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    $rsp = frame-setup SUB64ri32 $rsp, 40, implicit-def dead $eflags
+    frame-setup SEH_StackAlloc 40
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    $rsp = frame-destroy ADD64ri32 $rsp, 40, implicit-def dead $eflags
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- too_many_pops.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - %t/too_many_pops.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 2>&1 | FileCheck %s \
+# RUN:    --check-prefix=TOO-MANY-POPS
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/too_many_pops.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# TOO-MANY-POPS: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'too_many_pops':
+# TOO-MANY-POPS-SAME: The epilog is popping more registers than the prolog pushed
+
+--- |
+  define dso_local void @too_many_pops() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            too_many_pops
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    $rsi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- pop_in_wrong_order.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/pop_in_wrong_order.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=POP-WRONG-ORDER
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_in_wrong_order.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# POP-WRONG-ORDER: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_in_wrong_order':
+# POP-WRONG-ORDER-SAME: The epilog is popping a registers in a different order than the prolog pushed them
+
+--- |
+  define dso_local void @pop_in_wrong_order() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            pop_in_wrong_order
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rdi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 55
+    frame-setup PUSH64r killed $rsi, implicit-def $rsp, implicit $rsp
+    frame-setup SEH_PushReg 60
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    $rsi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    SEH_EndEpilogue
+    RET64
+...
+
+;--- pop_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/pop_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=POP-AFTER-EPILOG
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/pop_after_epilog.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# POP-AFTER-EPILOG: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'pop_after_epilog':
+# POP-AFTER-EPILOG-SAME: Registers are being popped after the epilog
+
+--- |
+  define dso_local void @pop_after_epilog() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            pop_after_epilog
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    $rdi = frame-destroy POP64r implicit-def $rsp, implicit $rsp
+    RET64
+...
+
+;--- instr_after_epilog.mir
+# RUN: not --crash llc -mtriple=x86_64-pc-windows-msvc -o - \
+# RUN:    %t/instr_after_epilog.mir -run-pass=x86-wineh-unwindv2 2>&1 | \
+# RUN:    FileCheck %s --check-prefix=INSTR-AFTER-END
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %t/instr_after_epilog.mir \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s --check-prefix=BESTEFFORT
+# INSTR-AFTER-END: LLVM ERROR: Windows x64 Unwind v2 is required, but LLVM has generated incompatible code in function 'instr_after_epilog':
+# INSTR-AFTER-END-SAME: Unexpected instruction in or after the epilog
+
+--- |
+  define dso_local void @instr_after_epilog() local_unnamed_addr {
+  entry:
+    ret void
+  }
+  !llvm.module.flags = !{!0}
+  !0 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+...
+---
+name:            instr_after_epilog
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    $ecx = MOV32rr killed $eax
+    RET64
+...
diff --git a/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir b/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir
new file mode 100644
index 0000000000000..70c87ad87f792
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win64-eh-unwindv2-too-many-epilogs.mir
@@ -0,0 +1,94 @@
+# Require V2 and restrict the number of unwind codes to 8
+# RUN: not llc -mtriple=x86_64-pc-windows-msvc -o - %s \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-max-unwind-codes=8 \
+# RUN:    2>&1 | FileCheck %s -check-prefix=REQUIREV2
+
+# Force best-effort and restrict the number of unwind codes to 8
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s \
+# RUN:    -run-pass=x86-wineh-unwindv2 -x86-wineh-unwindv2-max-unwind-codes=8 \
+# RUN:    -x86-wineh-unwindv2-force-mode=1 | \
+# RUN:    FileCheck %s -check-prefix=BESTEFFORT
+
+# Require V2, but allow the default number of unwind codes (255)
+# RUN: llc -mtriple=x86_64-pc-windows-msvc -o - %s \
+# RUN:    -run-pass=x86-wineh-unwindv2 | FileCheck %s -check-prefix=ALLOWMORE
+
+# Usually 255 unwind codes are permitted, but we passed an arg to llc to limit
+# it to 8.
+# REQUIREV2: error: example.c:2:1: Windows x64 Unwind v2 is required, but the function 'too_many_epilogs' has too many unwind codes.
+# REQUIREV2-SAME: Try splitting the function or reducing the number of places where it exits early with a tail call.
+
+# If we force "best effort" mode, then we won't see any errors, but we won't use
+# v2.
+# BESTEFFORT-NOT: SEH_UnwindVersion
+# BESTEFFORT-NOT: SEH_UnwindV2Start
+
+# If we allow more epilogs then too_many_epilogs will compile with v2.
+# ALLOWMORE-LABEL: too_many_epilogs
+# ALLOWMORE: SEH_UnwindVersion 2
+# ALLOWMORE: SEH_UnwindV2Start
+
+--- |
+  define dso_local void @too_many_epilogs() local_unnamed_addr !dbg !9 {
+  entry:
+    ret void, !dbg !10
+  }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!2, !3, !4, !5}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None)
+  !1 = !DIFile(filename: "/app/example.c", directory: "/app")
+  !2 = !{i32 1, !"winx64-eh-unwindv2", i32 2}
+  !3 = !{i32 7, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"CodeView", i32 1}
+  !5 = !{i32 2, !"Debug Info Version", i32 3}
+  !6 = !DIFile(filename: "example.c", directory: "/app")
+  !7 = !DISubroutineType(types: !8)
+  !8 = !{null}
+  !9 = distinct !DISubprogram(name: "too_many_epilogs", scope: !6, file: !6, line: 1, type: !7, scopeLine: 2, spFlags: DISPFlagDefinition, unit: !0)
+  !10 = !DILocation(line: 2, column: 1, scope: !9)
+  !11 = !DILocation(line: 3, column: 1, scope: !9)
+...
+---
+name:            too_many_epilogs
+body:             |
+  bb.0.entry:
+    frame-setup SEH_EndPrologue
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !10
+  bb.1:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.2:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.3:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.4:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.5:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.6:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.7:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+  bb.8:
+    SEH_BeginEpilogue
+    SEH_EndEpilogue
+    RET64 debug-location !11
+
+...

From 98eee4b554be18f734088455cb4cd9dc634e7602 Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:20:08 -0700
Subject: [PATCH 655/851] [libc] utf8 to 32 CharacterConverter (#143973)

Implemented push and pop for utf8 to 32 conversion and tests.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 .../__support/wchar/character_converter.cpp   |  55 +++++
 libc/test/src/__support/CMakeLists.txt        |   5 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 +
 .../src/__support/wchar/utf8_to_32_test.cpp   | 196 ++++++++++++++++++
 4 files changed, 263 insertions(+), 3 deletions(-)
 create mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index ca709769616c3..7f147ac26d3d1 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/math_extras.h"
@@ -30,6 +31,49 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
+int CharacterConverter::push(char8_t utf8_byte) {
+  uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
+  // Checking the first byte if first push
+  if (state->bytes_processed == 0) {
+    // UTF-8 char has 1 byte total
+    if (num_ones == 0) {
+      state->total_bytes = 1;
+    }
+    // UTF-8 char has 2 through 4 bytes total
+    else if (num_ones >= 2 && num_ones <= 4) {
+      /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
+      we will make the base mask with 7 ones and right shift it as necessary. */
+      constexpr size_t SIGNIFICANT_BITS = 7;
+      uint32_t base_mask = mask_trailing_ones<uint32_t, SIGNIFICANT_BITS>();
+      state->total_bytes = num_ones;
+      utf8_byte &= (base_mask >> num_ones);
+    }
+    // Invalid first byte
+    else {
+      // bytes_processed and total_bytes will always be 0 here
+      state->partial = static_cast<char32_t>(0);
+      return -1;
+    }
+    state->partial = static_cast<char32_t>(utf8_byte);
+    state->bytes_processed++;
+    return 0;
+  }
+  // Any subsequent push
+  // Adding 6 more bits so need to left shift
+  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+  if (num_ones == 1 && !isComplete()) {
+    char32_t byte =
+        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+    state->partial = state->partial << ENCODED_BITS_PER_UTF8;
+    state->partial |= byte;
+    state->bytes_processed++;
+    return 0;
+  }
+  // Invalid byte -> reset the state
+  clear();
+  return -1;
+}
+
 int CharacterConverter::push(char32_t utf32) {
   // we can't be partially through a conversion when pushing a utf32 value
   if (!isComplete())
@@ -54,6 +98,17 @@ int CharacterConverter::push(char32_t utf32) {
   return -1;
 }
 
+ErrorOr<char32_t> CharacterConverter::pop_utf32() {
+  // If pop is called too early, do not reset the state, use error to determine
+  // whether enough bytes have been pushed
+  if (!isComplete() || state->bytes_processed == 0)
+    return Error(-1);
+  char32_t utf32 = state->partial;
+  // reset if successful pop
+  clear();
+  return utf32;
+}
+
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isComplete())
     return Error(-1);
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 76218a16e0cf7..9f626ed31cc07 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-
-# Requires access to uchar header which is not on macos
-# Therefore, cannot currently build this on macos in overlay mode
+# Requires access to uchar header which is not on MacOS
+# Cannot currently build this on MacOS in overlay mode
 if(NOT(LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5dff6e9115f7d..5176bfd4b024b 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,5 +1,15 @@
 add_custom_target(libc-support-wchar-tests)
 
+add_libc_test(
+  utf8_to_32_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf8_to_32_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
+
 add_libc_test(
   utf32_to_8_test
   SUITE
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 0000000000000..9cb059faa9374
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,196 @@
+//===-- Unittests for character_converter utf8->utf32 ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  char ch = 'A';
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_EQ(err, 0);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; //  car symbol
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ sigma symbol
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  char_conv.push(static_cast<char8_t>(ch[3]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {
+      static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
+      static_cast<char>(0x00)}; // first and third bytes are invalid
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Prev byte was single byte so trying to push another should error.
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  // Last byte is invalid since it does not have correct starting sequence.
+  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Should produce an error on 3rd byte
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+
+  // Should produce an error since mbstate was reset
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(wch.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+
+  // Second two byte character
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 460);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(
+      wch.has_value()); // Should fail since we have not read enough bytes
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}

From 6e124423546e5d22b4b6dc64d6cedfe93e627d58 Mon Sep 17 00:00:00 2001
From: sribee8 <145801438+sribee8@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:33:32 -0700
Subject: [PATCH 656/851] Revert "[libc] utf8 to 32 CharacterConverter"
 (#144446)

Reverts llvm/llvm-project#143973
This merge broke the build and I'm currently looking into the issue to
fix it.
---
 .../__support/wchar/character_converter.cpp   |  55 -----
 libc/test/src/__support/CMakeLists.txt        |   5 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 -
 .../src/__support/wchar/utf8_to_32_test.cpp   | 196 ------------------
 4 files changed, 3 insertions(+), 263 deletions(-)
 delete mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 7f147ac26d3d1..ca709769616c3 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,7 +8,6 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
-#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/math_extras.h"
@@ -31,49 +30,6 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
-int CharacterConverter::push(char8_t utf8_byte) {
-  uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
-  // Checking the first byte if first push
-  if (state->bytes_processed == 0) {
-    // UTF-8 char has 1 byte total
-    if (num_ones == 0) {
-      state->total_bytes = 1;
-    }
-    // UTF-8 char has 2 through 4 bytes total
-    else if (num_ones >= 2 && num_ones <= 4) {
-      /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
-      we will make the base mask with 7 ones and right shift it as necessary. */
-      constexpr size_t SIGNIFICANT_BITS = 7;
-      uint32_t base_mask = mask_trailing_ones<uint32_t, SIGNIFICANT_BITS>();
-      state->total_bytes = num_ones;
-      utf8_byte &= (base_mask >> num_ones);
-    }
-    // Invalid first byte
-    else {
-      // bytes_processed and total_bytes will always be 0 here
-      state->partial = static_cast<char32_t>(0);
-      return -1;
-    }
-    state->partial = static_cast<char32_t>(utf8_byte);
-    state->bytes_processed++;
-    return 0;
-  }
-  // Any subsequent push
-  // Adding 6 more bits so need to left shift
-  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
-  if (num_ones == 1 && !isComplete()) {
-    char32_t byte =
-        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
-    state->partial = state->partial << ENCODED_BITS_PER_UTF8;
-    state->partial |= byte;
-    state->bytes_processed++;
-    return 0;
-  }
-  // Invalid byte -> reset the state
-  clear();
-  return -1;
-}
-
 int CharacterConverter::push(char32_t utf32) {
   // we can't be partially through a conversion when pushing a utf32 value
   if (!isComplete())
@@ -98,17 +54,6 @@ int CharacterConverter::push(char32_t utf32) {
   return -1;
 }
 
-ErrorOr<char32_t> CharacterConverter::pop_utf32() {
-  // If pop is called too early, do not reset the state, use error to determine
-  // whether enough bytes have been pushed
-  if (!isComplete() || state->bytes_processed == 0)
-    return Error(-1);
-  char32_t utf32 = state->partial;
-  // reset if successful pop
-  clear();
-  return utf32;
-}
-
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isComplete())
     return Error(-1);
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 9f626ed31cc07..76218a16e0cf7 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,8 +275,9 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-# Requires access to uchar header which is not on MacOS
-# Cannot currently build this on MacOS in overlay mode
+
+# Requires access to uchar header which is not on macos
+# Therefore, cannot currently build this on macos in overlay mode
 if(NOT(LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5176bfd4b024b..5dff6e9115f7d 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,15 +1,5 @@
 add_custom_target(libc-support-wchar-tests)
 
-add_libc_test(
-  utf8_to_32_test 
-  SUITE
-    libc-support-tests
-  SRCS
-    utf8_to_32_test.cpp 
-  DEPENDS
-    libc.src.__support.wchar.character_converter
-)
-
 add_libc_test(
   utf32_to_8_test
   SUITE
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
deleted file mode 100644
index 9cb059faa9374..0000000000000
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//===-- Unittests for character_converter utf8->utf32 ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/__support/error_or.h"
-#include "src/__support/wchar/character_converter.h"
-#include "src/__support/wchar/mbstate.h"
-#include "test/UnitTest/Test.h"
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  char ch = 'A';
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch));
-  auto wch = char_conv.pop_utf32();
-
-  ASSERT_EQ(err, 0);
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 65);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[2] = {static_cast<char>(0xC2),
-                      static_cast<char>(0x8E)}; //  car symbol
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  char_conv.push(static_cast<char8_t>(ch[0]));
-  char_conv.push(static_cast<char8_t>(ch[1]));
-  auto wch = char_conv.pop_utf32();
-
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 142);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
-                      static_cast<char>(0x91)}; // ∑ sigma symbol
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  char_conv.push(static_cast<char8_t>(ch[0]));
-  char_conv.push(static_cast<char8_t>(ch[1]));
-  char_conv.push(static_cast<char8_t>(ch[2]));
-  auto wch = char_conv.pop_utf32();
-
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 8721);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
-                      static_cast<char>(0xA4),
-                      static_cast<char>(0xA1)}; // 🤡 clown emoji
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  char_conv.push(static_cast<char8_t>(ch[0]));
-  char_conv.push(static_cast<char8_t>(ch[1]));
-  char_conv.push(static_cast<char8_t>(ch[2]));
-  char_conv.push(static_cast<char8_t>(ch[3]));
-  auto wch = char_conv.pop_utf32();
-
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 129313);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch = static_cast<char>(0x80); // invalid starting bit sequence
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch));
-
-  ASSERT_EQ(err, -1);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[4] = {
-      static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
-      static_cast<char>(0x00)}; // first and third bytes are invalid
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, -1);
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  // Prev byte was single byte so trying to push another should error.
-  err = char_conv.push(static_cast<char8_t>(ch[2]));
-  ASSERT_EQ(err, -1);
-  err = char_conv.push(static_cast<char8_t>(ch[3]));
-  ASSERT_EQ(err, 0);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  // Last byte is invalid since it does not have correct starting sequence.
-  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
-  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
-                      static_cast<char>(0x80), static_cast<char>(0xC0)};
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[2]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[3]));
-  ASSERT_EQ(err, -1);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
-                      static_cast<char>(0x80)};
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  // Should produce an error on 3rd byte
-  err = char_conv.push(static_cast<char8_t>(ch[2]));
-  ASSERT_EQ(err, -1);
-
-  // Should produce an error since mbstate was reset
-  auto wch = char_conv.pop_utf32();
-  ASSERT_FALSE(wch.has_value());
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
-                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
-
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  auto wch = char_conv.pop_utf32();
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 142);
-
-  // Second two byte character
-  err = char_conv.push(static_cast<char8_t>(ch[2]));
-  ASSERT_EQ(err, 0);
-  err = char_conv.push(static_cast<char8_t>(ch[3]));
-  ASSERT_EQ(err, 0);
-  wch = char_conv.pop_utf32();
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 460);
-}
-
-TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
-  LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
-  state.total_bytes = 0;
-  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
-  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
-  int err = char_conv.push(static_cast<char8_t>(ch[0]));
-  ASSERT_EQ(err, 0);
-  auto wch = char_conv.pop_utf32();
-  ASSERT_FALSE(
-      wch.has_value()); // Should fail since we have not read enough bytes
-  err = char_conv.push(static_cast<char8_t>(ch[1]));
-  ASSERT_EQ(err, 0);
-  wch = char_conv.pop_utf32();
-  ASSERT_TRUE(wch.has_value());
-  ASSERT_EQ(static_cast<int>(wch.value()), 142);
-}

From 99e53cb4139eda491f97cb33ee42ea424d352200 Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:47:43 -0700
Subject: [PATCH 657/851] [llvm][StackProtector] Add noreturn to
 __stack_chk_fail call (#143976)

It's possible for __stack_chk_fail to be an alias when using CrossDSOCFI
since it will make a jump table entry for this function and replace it
with an alias. StackProtector can crash since it always expects this to
be a regular function. Instead add the noreturn attribute to the call.
---
 llvm/lib/CodeGen/StackProtector.cpp           |  4 +--
 .../cross-dso-cfi-stack-chk-fail.ll           | 33 +++++++++++++++++++
 .../StackProtector/stack-chk-fail-alias.ll    | 21 ++++++++++++
 3 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
 create mode 100644 llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll

diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index 5f866eea7d4e7..dda392d38b27a 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -725,8 +725,8 @@ BasicBlock *CreateFailBB(Function *F, const Triple &Trip) {
     StackChkFail =
         M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context));
   }
-  cast<Function>(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn);
-  B.CreateCall(StackChkFail, Args);
+  CallInst *Call = B.CreateCall(StackChkFail, Args);
+  Call->addFnAttr(Attribute::NoReturn);
   B.CreateUnreachable();
   return FailBB;
 }
diff --git a/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
new file mode 100644
index 0000000000000..af03039813a2e
--- /dev/null
+++ b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
@@ -0,0 +1,33 @@
+;; This is a minimal reproducer that caused StackProtector to crash with a bad cast when
+;; CrossDSOCFI is used. This test just needs to not crash.
+; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=lowertypetests,cross-dso-cfi,stack-protector
+
+define hidden void @__stack_chk_fail() !type !1{
+  unreachable
+}
+
+define void @store_captures() sspstrong {
+entry:
+  %a = alloca i32, align 4
+  %j = alloca ptr, align 8
+  store ptr %a, ptr %j, align 8
+  ret void
+}
+
+define void @func(ptr %0) {
+entry:
+  %1 = call i1 @llvm.type.test(ptr %0, metadata !"typeid")
+  br i1 %1, label %cont, label %trap
+
+trap:                                             ; preds = %entry
+  call void @llvm.trap()
+  unreachable
+
+cont:                                             ; preds = %entry
+  call void %0()
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 4, !"Cross-DSO CFI", i32 1}
+!1 = !{i64 0, !"typeid"}
diff --git a/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll
new file mode 100644
index 0000000000000..ab0a6e3f455e7
--- /dev/null
+++ b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll
@@ -0,0 +1,21 @@
+;; __stack_chk_fail should have the noreturn attr even if it is an alias
+; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=stack-protector -S | FileCheck %s
+
+define hidden void @__stack_chk_fail_impl() {
+  unreachable
+}
+
+@__stack_chk_fail = hidden alias void (), ptr @__stack_chk_fail_impl
+
+; CHECK-LABEL: @store_captures(
+; CHECK:       CallStackCheckFailBlk:
+; CHECK-NEXT:      call void @__stack_chk_fail() [[ATTRS:#.*]]
+define void @store_captures() sspstrong {
+entry:
+  %a = alloca i32, align 4
+  %j = alloca ptr, align 8
+  store ptr %a, ptr %j, align 8
+  ret void
+}
+
+; CHECK: attributes [[ATTRS]] = { noreturn }

From 964888d01f0b0f81540f8548370f00c315952042 Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Mon, 16 Jun 2025 16:24:45 -0700
Subject: [PATCH 658/851] [llvm][CFI] Ensure COFF comdat renaming applies for
 imported functions (#143421)

I ran into the same issue as
https://github.com/llvm/llvm-project/pull/139962 regarding the comdat
corresponding to a renamed key function but for thinlto. My last patch
had not considered the thinlto case, so this applies the same fix for
imported functions.
---
 llvm/lib/Transforms/IPO/LowerTypeTests.cpp    | 34 ++++++++++++-------
 .../Inputs/import-thinlto-funcs.yaml          |  5 +++
 .../LowerTypeTests/cfi-coff-comdat-rename.ll  |  2 ++
 3 files changed, 28 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml

diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index ab67a0980e0c2..20b54c056cc2d 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -561,6 +561,8 @@ class LowerTypeTestsModule {
     return FunctionAnnotations.contains(V);
   }
 
+  void maybeReplaceComdat(Function *F, StringRef OriginalName);
+
 public:
   LowerTypeTestsModule(Module &M, ModuleAnalysisManager &AM,
                        ModuleSummaryIndex *ExportSummary,
@@ -1082,6 +1084,23 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
   }
 }
 
+void LowerTypeTestsModule::maybeReplaceComdat(Function *F,
+                                              StringRef OriginalName) {
+  // For COFF we should also rename the comdat if this function also
+  // happens to be the key function. Even if the comdat name changes, this
+  // should still be fine since comdat and symbol resolution happens
+  // before LTO, so all symbols which would prevail have been selected.
+  if (F->hasComdat() && ObjectFormat == Triple::COFF &&
+      F->getComdat()->getName() == OriginalName) {
+    Comdat *OldComdat = F->getComdat();
+    Comdat *NewComdat = M.getOrInsertComdat(F->getName());
+    for (GlobalObject &GO : M.global_objects()) {
+      if (GO.getComdat() == OldComdat)
+        GO.setComdat(NewComdat);
+    }
+  }
+}
+
 // ThinLTO backend: the function F has a jump table entry; update this module
 // accordingly. isJumpTableCanonical describes the type of the jump table entry.
 void LowerTypeTestsModule::importFunction(
@@ -1115,6 +1134,7 @@ void LowerTypeTestsModule::importFunction(
     FDecl->setVisibility(GlobalValue::HiddenVisibility);
   } else {
     F->setName(Name + ".cfi");
+    maybeReplaceComdat(F, Name);
     F->setLinkage(GlobalValue::ExternalLinkage);
     FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
                              F->getAddressSpace(), Name, &M);
@@ -1734,19 +1754,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
       FAlias->takeName(F);
       if (FAlias->hasName()) {
         F->setName(FAlias->getName() + ".cfi");
-        // For COFF we should also rename the comdat if this function also
-        // happens to be the key function. Even if the comdat name changes, this
-        // should still be fine since comdat and symbol resolution happens
-        // before LTO, so all symbols which would prevail have been selected.
-        if (F->hasComdat() && ObjectFormat == Triple::COFF &&
-            F->getComdat()->getName() == FAlias->getName()) {
-          Comdat *OldComdat = F->getComdat();
-          Comdat *NewComdat = M.getOrInsertComdat(F->getName());
-          for (GlobalObject &GO : M.global_objects()) {
-            if (GO.getComdat() == OldComdat)
-              GO.setComdat(NewComdat);
-          }
-        }
+        maybeReplaceComdat(F, FAlias->getName());
       }
       replaceCfiUses(F, FAlias, IsJumpTableCanonical);
       if (!F->hasLocalLinkage())
diff --git a/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml b/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml
new file mode 100644
index 0000000000000..459d45032b0c4
--- /dev/null
+++ b/llvm/test/Transforms/LowerTypeTests/Inputs/import-thinlto-funcs.yaml
@@ -0,0 +1,5 @@
+---
+CfiFunctionDefs:
+  - f1
+  - f2
+...
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll b/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll
index 7dda7f6df10c3..7eede8b7322f8 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-coff-comdat-rename.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: x86-registered-target
 ; RUN: opt -S -passes=lowertypetests %s | FileCheck %s
+; RUN: opt -S -passes=lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%p/Inputs/import-thinlto-funcs.yaml %s | FileCheck %s
 
 ;; This is a check to assert we don't crash with:
 ;;
@@ -7,6 +8,7 @@
 ;;
 ;; So this just needs to exit normally.
 ; RUN: opt -S -passes=lowertypetests %s | llc -asm-verbose=false
+; RUN: opt -S -passes=lowertypetests -lowertypetests-summary-action=import -lowertypetests-read-summary=%p/Inputs/import-thinlto-funcs.yaml %s | llc -asm-verbose=false
 
 target datalayout = "e-p:64:64"
 target triple = "x86_64-pc-windows-msvc"

From ac7af53d05b94849fd590b1875db7b85957fb0f6 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Mon, 16 Jun 2025 16:26:03 -0700
Subject: [PATCH 659/851] [flang] Fixed LIT tests to create modfiles in a temp
 dir. (#144448)

---
 flang/test/Semantics/modfile71.F90 |  7 ++++---
 flang/test/Semantics/modfile75.F90 |  3 ++-
 flang/test/Semantics/modfile76.F90 | 15 ++++++++-------
 flang/test/Semantics/modfile77.F90 |  3 ++-
 flang/test/Semantics/modfile78.F90 |  3 ++-
 flang/test/Semantics/modfile79.F90 |  3 ++-
 6 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/flang/test/Semantics/modfile71.F90 b/flang/test/Semantics/modfile71.F90
index 7c3c7f5b48958..7f32eb18c6f8f 100644
--- a/flang/test/Semantics/modfile71.F90
+++ b/flang/test/Semantics/modfile71.F90
@@ -1,6 +1,7 @@
-!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s
-!RUN: %flang_fc1 -fsyntax-only -DSTEP=2 %s
-!RUN: not %flang_fc1 -fsyntax-only -pedantic %s 2>&1 | FileCheck %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 -J%t %s
+!RUN: %flang_fc1 -fsyntax-only -DSTEP=2 -J%t %s
+!RUN: not %flang_fc1 -fsyntax-only -pedantic -J%t %s 2>&1 | FileCheck %s
 
 ! Tests that a module captured in a hermetic module file is compatible when
 ! USE'd with a module of the same name USE'd directly.
diff --git a/flang/test/Semantics/modfile75.F90 b/flang/test/Semantics/modfile75.F90
index aba00ffac848a..8f7adafe7204d 100644
--- a/flang/test/Semantics/modfile75.F90
+++ b/flang/test/Semantics/modfile75.F90
@@ -1,4 +1,5 @@
-!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang_fc1 -fdebug-unparse %s | FileCheck %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang_fc1 -fdebug-unparse -J%t %s | FileCheck %s
 
 #if WHICH == 1
 module modfile75a
diff --git a/flang/test/Semantics/modfile76.F90 b/flang/test/Semantics/modfile76.F90
index 50ee9a088e119..c7ae91bd42bed 100644
--- a/flang/test/Semantics/modfile76.F90
+++ b/flang/test/Semantics/modfile76.F90
@@ -1,23 +1,24 @@
-!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s
-!RUN: %flang_fc1 -fsyntax-only %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 -J%t %s
+!RUN: %flang_fc1 -fsyntax-only -J%t %s
 
 ! Tests that a BIND(C) variable in a module A captured in a hermetic module
 ! file USE'd in a module B is not creating bogus complaints about BIND(C) name
 ! conflict when both module A and B are later accessed.
 
 #if STEP == 1
-module modfile75a
+module modfile76a
   integer, bind(c) :: x
 end
 
-module modfile75b
-  use modfile75a ! capture hermetically
+module modfile76b
+  use modfile76a ! capture hermetically
 end
 
 #else
 subroutine test
-  use modfile75a
-  use modfile75b
+  use modfile76a
+  use modfile76b
   implicit none
   print *, x
 end subroutine
diff --git a/flang/test/Semantics/modfile77.F90 b/flang/test/Semantics/modfile77.F90
index a82904ebbcc22..9ad615c16c43c 100644
--- a/flang/test/Semantics/modfile77.F90
+++ b/flang/test/Semantics/modfile77.F90
@@ -1,4 +1,5 @@
-!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile77c.mod | FileCheck %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -fhermetic-module-files -J%t %s && cat %t/modfile77c.mod | FileCheck %s
 
 #if WHICH == 1
 module modfile77a
diff --git a/flang/test/Semantics/modfile78.F90 b/flang/test/Semantics/modfile78.F90
index cb3eccd9a4108..19b9ac39de934 100644
--- a/flang/test/Semantics/modfile78.F90
+++ b/flang/test/Semantics/modfile78.F90
@@ -1,4 +1,5 @@
-!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile78c.mod | FileCheck %s
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -fhermetic-module-files -J%t %s && cat %t/modfile78c.mod | FileCheck %s
 
 #if WHICH == 1
 module modfile78a
diff --git a/flang/test/Semantics/modfile79.F90 b/flang/test/Semantics/modfile79.F90
index 7d3b42166654e..ae156527b3bf3 100644
--- a/flang/test/Semantics/modfile79.F90
+++ b/flang/test/Semantics/modfile79.F90
@@ -1,4 +1,5 @@
-!RUN: %flang -c -DWHICH=1 %s && FileCheck %s <modfile79a.mod && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c %s && FileCheck %s <modfile79a.mod
+!RUN: rm -rf %t && mkdir -p %t
+!RUN: %flang -c -DWHICH=1 -J%t %s && FileCheck %s <%t/modfile79a.mod && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang -c -J%t %s && FileCheck %s <%t/modfile79a.mod
 
 !Ensure that writing modfile79c.mod doesn't cause a spurious
 !regeneration of modfile79a.mod from its copy in the hermetic

From d882670d498a29f4e02f357ef9fe07c43de034c8 Mon Sep 17 00:00:00 2001
From: PiJoules <6019989+PiJoules@users.noreply.github.com>
Date: Mon, 16 Jun 2025 16:34:40 -0700
Subject: [PATCH 660/851] Revert "[llvm][StackProtector] Add noreturn to
 __stack_chk_fail call" (#144452)

Reverts llvm/llvm-project#143976

Reverting since this broke a builder:
https://lab.llvm.org/buildbot/#/builders/190/builds/21563
---
 llvm/lib/CodeGen/StackProtector.cpp           |  4 +--
 .../cross-dso-cfi-stack-chk-fail.ll           | 33 -------------------
 .../StackProtector/stack-chk-fail-alias.ll    | 21 ------------
 3 files changed, 2 insertions(+), 56 deletions(-)
 delete mode 100644 llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
 delete mode 100644 llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll

diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index dda392d38b27a..5f866eea7d4e7 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -725,8 +725,8 @@ BasicBlock *CreateFailBB(Function *F, const Triple &Trip) {
     StackChkFail =
         M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context));
   }
-  CallInst *Call = B.CreateCall(StackChkFail, Args);
-  Call->addFnAttr(Attribute::NoReturn);
+  cast<Function>(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn);
+  B.CreateCall(StackChkFail, Args);
   B.CreateUnreachable();
   return FailBB;
 }
diff --git a/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
deleted file mode 100644
index af03039813a2e..0000000000000
--- a/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-;; This is a minimal reproducer that caused StackProtector to crash with a bad cast when
-;; CrossDSOCFI is used. This test just needs to not crash.
-; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=lowertypetests,cross-dso-cfi,stack-protector
-
-define hidden void @__stack_chk_fail() !type !1{
-  unreachable
-}
-
-define void @store_captures() sspstrong {
-entry:
-  %a = alloca i32, align 4
-  %j = alloca ptr, align 8
-  store ptr %a, ptr %j, align 8
-  ret void
-}
-
-define void @func(ptr %0) {
-entry:
-  %1 = call i1 @llvm.type.test(ptr %0, metadata !"typeid")
-  br i1 %1, label %cont, label %trap
-
-trap:                                             ; preds = %entry
-  call void @llvm.trap()
-  unreachable
-
-cont:                                             ; preds = %entry
-  call void %0()
-  ret void
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 4, !"Cross-DSO CFI", i32 1}
-!1 = !{i64 0, !"typeid"}
diff --git a/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll
deleted file mode 100644
index ab0a6e3f455e7..0000000000000
--- a/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-;; __stack_chk_fail should have the noreturn attr even if it is an alias
-; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=stack-protector -S | FileCheck %s
-
-define hidden void @__stack_chk_fail_impl() {
-  unreachable
-}
-
-@__stack_chk_fail = hidden alias void (), ptr @__stack_chk_fail_impl
-
-; CHECK-LABEL: @store_captures(
-; CHECK:       CallStackCheckFailBlk:
-; CHECK-NEXT:      call void @__stack_chk_fail() [[ATTRS:#.*]]
-define void @store_captures() sspstrong {
-entry:
-  %a = alloca i32, align 4
-  %j = alloca ptr, align 8
-  store ptr %a, ptr %j, align 8
-  ret void
-}
-
-; CHECK: attributes [[ATTRS]] = { noreturn }

From 6421bd94eabdb71975c75e2c1621a095b3d8b6ad Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Mon, 16 Jun 2025 17:24:48 -0700
Subject: [PATCH 661/851] [lldb-dap] Creating protocol types for
 setExceptionBreakpoints. (#144153)

This adds new types for setExceptionBreakpoints and adds support for
`supportsExceptionFilterOptions`, which allows exception breakpoints to
set a condition.

While testing this, I noticed that obj-c exception catch breakpoints may
not be working correctly in lldb-dap.
---
 .../test/tools/lldb-dap/dap_server.py         |   6 +-
 .../TestDAP_setExceptionBreakpoints.py        |  11 +-
 .../tools/lldb-dap/exception/objc/Makefile    |   2 +-
 .../exception/objc/TestDAP_exception_objc.py  |  39 ++++-
 .../API/tools/lldb-dap/exception/objc/main.m  |  12 +-
 lldb/tools/lldb-dap/DAP.cpp                   | 158 ++++++++----------
 lldb/tools/lldb-dap/DAP.h                     |   4 +-
 lldb/tools/lldb-dap/ExceptionBreakpoint.cpp   |  26 ++-
 lldb/tools/lldb-dap/ExceptionBreakpoint.h     |  14 +-
 .../Handler/InitializeRequestHandler.cpp      |   1 -
 lldb/tools/lldb-dap/Handler/RequestHandler.h  |  15 +-
 .../SetExceptionBreakpointsRequestHandler.cpp | 107 +++++-------
 lldb/tools/lldb-dap/JSONUtils.cpp             |   9 -
 lldb/tools/lldb-dap/JSONUtils.h               |  12 --
 .../lldb-dap/Protocol/ProtocolRequests.cpp    |  14 ++
 .../lldb-dap/Protocol/ProtocolRequests.h      |  50 ++++++
 .../tools/lldb-dap/Protocol/ProtocolTypes.cpp |  56 ++++---
 lldb/tools/lldb-dap/Protocol/ProtocolTypes.h  |  41 +++--
 lldb/tools/lldb-dap/ProtocolUtils.cpp         |  11 ++
 lldb/tools/lldb-dap/ProtocolUtils.h           |  13 ++
 lldb/unittests/DAP/ProtocolTypesTest.cpp      | 111 +++++++++---
 21 files changed, 453 insertions(+), 259 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index baf2d4ae542ba..6d32491eaa5e9 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -1050,8 +1050,12 @@ def request_setBreakpoints(self, source: Source, line_array, data=None):
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
 
-    def request_setExceptionBreakpoints(self, filters):
+    def request_setExceptionBreakpoints(
+        self, *, filters: list[str] = [], filter_options: list[dict] = []
+    ):
         args_dict = {"filters": filters}
+        if filter_options:
+            args_dict["filterOptions"] = filter_options
         command_dict = {
             "command": "setExceptionBreakpoints",
             "type": "request",
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
index 4dc8c5b3c7ded..4ca733a9a59ca 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
@@ -1,16 +1,12 @@
 """
-Test lldb-dap setBreakpoints request
+Test lldb-dap setExceptionBreakpoints request
 """
 
-
-import dap_server
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
-from lldbsuite.test import lldbutil
 import lldbdap_testcase
 
 
-@skip("Temporarily disable the breakpoint tests")
 class TestDAP_setExceptionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
     @skipIfWindows
     def test_functionality(self):
@@ -33,8 +29,9 @@ def test_functionality(self):
         program = self.getBuildArtifact("a.out")
         self.build_and_launch(program)
 
-        filters = ["cpp_throw", "cpp_catch"]
-        response = self.dap_server.request_setExceptionBreakpoints(filters)
+        response = self.dap_server.request_setExceptionBreakpoints(
+            filters=["cpp_throw", "cpp_catch"],
+        )
         if response:
             self.assertTrue(response["success"])
 
diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/Makefile b/lldb/test/API/tools/lldb-dap/exception/objc/Makefile
index 9b6528337cb9d..17e6dc76699ab 100644
--- a/lldb/test/API/tools/lldb-dap/exception/objc/Makefile
+++ b/lldb/test/API/tools/lldb-dap/exception/objc/Makefile
@@ -1,6 +1,6 @@
 OBJC_SOURCES := main.m
 
-CFLAGS_EXTRAS := -w
+CFLAGS_EXTRAS := -w -fobjc-exceptions
 
 USE_SYSTEM_STDLIB := 1
 
diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py b/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py
index 777d55f48e850..ddedf7a6de8c6 100644
--- a/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py
+++ b/lldb/test/API/tools/lldb-dap/exception/objc/TestDAP_exception_objc.py
@@ -2,7 +2,6 @@
 Test exception behavior in DAP with obj-c throw.
 """
 
-
 from lldbsuite.test.decorators import *
 from lldbsuite.test.lldbtest import *
 import lldbdap_testcase
@@ -25,3 +24,41 @@ def test_stopped_description(self):
         exception_details = exception_info["details"]
         self.assertRegex(exception_details["message"], "SomeReason")
         self.assertRegex(exception_details["stackTrace"], "main.m")
+
+    @skipUnlessDarwin
+    def test_break_on_throw_and_catch(self):
+        """
+        Test that breakpoints on exceptions work as expected.
+        """
+        program = self.getBuildArtifact("a.out")
+        self.build_and_launch(program)
+
+        response = self.dap_server.request_setExceptionBreakpoints(
+            filter_options=[
+                {
+                    "filterId": "objc_throw",
+                    "condition": '[[((NSException *)$arg1) name] isEqual:@"ThrownException"]',
+                },
+            ]
+        )
+        if response:
+            self.assertTrue(response["success"])
+
+        self.continue_to_exception_breakpoint("Objective-C Throw")
+
+        # FIXME: Catching objc exceptions do not appear to be working.
+        # Xcode appears to set a breakpoint on '__cxa_begin_catch' for objc
+        # catch, which is different than
+        # SBTarget::BreakpointCreateForException(eLanguageObjectiveC, /*catch_bp=*/true, /*throw_bp=*/false);
+        # self.continue_to_exception_breakpoint("Objective-C Catch")
+
+        self.do_continue()
+
+        self.assertTrue(self.verify_stop_exception_info("signal SIGABRT"))
+        exception_info = self.get_exceptionInfo()
+        self.assertEqual(exception_info["breakMode"], "always")
+        self.assertEqual(exception_info["description"], "signal SIGABRT")
+        self.assertEqual(exception_info["exceptionId"], "signal")
+        exception_details = exception_info["details"]
+        self.assertRegex(exception_details["message"], "SomeReason")
+        self.assertRegex(exception_details["stackTrace"], "main.m")
diff --git a/lldb/test/API/tools/lldb-dap/exception/objc/main.m b/lldb/test/API/tools/lldb-dap/exception/objc/main.m
index e8db04fb40de1..bbfa621992799 100644
--- a/lldb/test/API/tools/lldb-dap/exception/objc/main.m
+++ b/lldb/test/API/tools/lldb-dap/exception/objc/main.m
@@ -1,8 +1,14 @@
 #import <Foundation/Foundation.h>
 
 int main(int argc, char const *argv[]) {
-  @throw [[NSException alloc] initWithName:@"ThrownException"
-                                    reason:@"SomeReason"
-                                  userInfo:nil];
+  @try {
+    NSException *e = [[NSException alloc] initWithName:@"ThrownException"
+                                      reason:@"SomeReason"
+                                    userInfo:nil];
+    @throw e;
+  } @catch (NSException *e) {
+    NSLog(@"Caught %@", e);
+    @throw; // let the process crash...
+  }
   return 0;
 }
diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp
index 9fe8227cd2d6f..c171b55951cb5 100644
--- a/lldb/tools/lldb-dap/DAP.cpp
+++ b/lldb/tools/lldb-dap/DAP.cpp
@@ -9,6 +9,7 @@
 #include "DAP.h"
 #include "DAPLog.h"
 #include "EventHelper.h"
+#include "ExceptionBreakpoint.h"
 #include "Handler/RequestHandler.h"
 #include "Handler/ResponseHandler.h"
 #include "JSONUtils.h"
@@ -17,6 +18,7 @@
 #include "Protocol/ProtocolBase.h"
 #include "Protocol/ProtocolRequests.h"
 #include "Protocol/ProtocolTypes.h"
+#include "ProtocolUtils.h"
 #include "Transport.h"
 #include "lldb/API/SBBreakpoint.h"
 #include "lldb/API/SBCommandInterpreter.h"
@@ -129,93 +131,81 @@ DAP::DAP(Log *log, const ReplMode default_repl_mode,
 DAP::~DAP() = default;
 
 void DAP::PopulateExceptionBreakpoints() {
-  llvm::call_once(init_exception_breakpoints_flag, [this]() {
-    exception_breakpoints = std::vector<ExceptionBreakpoint>{};
-
-    if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) {
-      exception_breakpoints->emplace_back(*this, "cpp_catch", "C++ Catch",
-                                          lldb::eLanguageTypeC_plus_plus);
-      exception_breakpoints->emplace_back(*this, "cpp_throw", "C++ Throw",
-                                          lldb::eLanguageTypeC_plus_plus);
-    }
-    if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeObjC)) {
-      exception_breakpoints->emplace_back(
-          *this, "objc_catch", "Objective-C Catch", lldb::eLanguageTypeObjC);
-      exception_breakpoints->emplace_back(
-          *this, "objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC);
-    }
-    if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeSwift)) {
-      exception_breakpoints->emplace_back(*this, "swift_catch", "Swift Catch",
-                                          lldb::eLanguageTypeSwift);
-      exception_breakpoints->emplace_back(*this, "swift_throw", "Swift Throw",
-                                          lldb::eLanguageTypeSwift);
+  if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) {
+    exception_breakpoints.emplace_back(*this, "cpp_catch", "C++ Catch",
+                                       lldb::eLanguageTypeC_plus_plus,
+                                       eExceptionKindCatch);
+    exception_breakpoints.emplace_back(*this, "cpp_throw", "C++ Throw",
+                                       lldb::eLanguageTypeC_plus_plus,
+                                       eExceptionKindThrow);
+  }
+
+  if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeObjC)) {
+    exception_breakpoints.emplace_back(*this, "objc_catch", "Objective-C Catch",
+                                       lldb::eLanguageTypeObjC,
+                                       eExceptionKindCatch);
+    exception_breakpoints.emplace_back(*this, "objc_throw", "Objective-C Throw",
+                                       lldb::eLanguageTypeObjC,
+                                       eExceptionKindThrow);
+  }
+
+  if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeSwift)) {
+    exception_breakpoints.emplace_back(*this, "swift_catch", "Swift Catch",
+                                       lldb::eLanguageTypeSwift,
+                                       eExceptionKindCatch);
+    exception_breakpoints.emplace_back(*this, "swift_throw", "Swift Throw",
+                                       lldb::eLanguageTypeSwift,
+                                       eExceptionKindThrow);
+  }
+
+  // Besides handling the hardcoded list of languages from above, we try to find
+  // any other languages that support exception breakpoints using the SB API.
+  for (int raw_lang = lldb::eLanguageTypeUnknown;
+       raw_lang < lldb::eNumLanguageTypes; ++raw_lang) {
+    lldb::LanguageType lang = static_cast<lldb::LanguageType>(raw_lang);
+
+    // We first discard any languages already handled above.
+    if (lldb::SBLanguageRuntime::LanguageIsCFamily(lang) ||
+        lang == lldb::eLanguageTypeSwift)
+      continue;
+
+    if (!lldb::SBDebugger::SupportsLanguage(lang))
+      continue;
+
+    const char *name = lldb::SBLanguageRuntime::GetNameForLanguageType(lang);
+    if (!name)
+      continue;
+    std::string raw_lang_name = name;
+    std::string capitalized_lang_name = capitalize(name);
+
+    if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnThrow(lang)) {
+      const char *raw_throw_keyword =
+          lldb::SBLanguageRuntime::GetThrowKeywordForLanguage(lang);
+      std::string throw_keyword =
+          raw_throw_keyword ? raw_throw_keyword : "throw";
+
+      exception_breakpoints.emplace_back(
+          *this, raw_lang_name + "_" + throw_keyword,
+          capitalized_lang_name + " " + capitalize(throw_keyword), lang,
+          eExceptionKindThrow);
     }
-    // Besides handling the hardcoded list of languages from above, we try to
-    // find any other languages that support exception breakpoints using the
-    // SB API.
-    for (int raw_lang = lldb::eLanguageTypeUnknown;
-         raw_lang < lldb::eNumLanguageTypes; ++raw_lang) {
-      lldb::LanguageType lang = static_cast<lldb::LanguageType>(raw_lang);
-
-      // We first discard any languages already handled above.
-      if (lldb::SBLanguageRuntime::LanguageIsCFamily(lang) ||
-          lang == lldb::eLanguageTypeSwift)
-        continue;
-
-      if (!lldb::SBDebugger::SupportsLanguage(lang))
-        continue;
-
-      const char *name = lldb::SBLanguageRuntime::GetNameForLanguageType(lang);
-      if (!name)
-        continue;
-      std::string raw_lang_name = name;
-      std::string capitalized_lang_name = capitalize(name);
-
-      if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnThrow(lang)) {
-        const char *raw_throw_keyword =
-            lldb::SBLanguageRuntime::GetThrowKeywordForLanguage(lang);
-        std::string throw_keyword =
-            raw_throw_keyword ? raw_throw_keyword : "throw";
-
-        exception_breakpoints->emplace_back(
-            *this, raw_lang_name + "_" + throw_keyword,
-            capitalized_lang_name + " " + capitalize(throw_keyword), lang);
-      }
 
-      if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnCatch(lang)) {
-        const char *raw_catch_keyword =
-            lldb::SBLanguageRuntime::GetCatchKeywordForLanguage(lang);
-        std::string catch_keyword =
-            raw_catch_keyword ? raw_catch_keyword : "catch";
+    if (lldb::SBLanguageRuntime::SupportsExceptionBreakpointsOnCatch(lang)) {
+      const char *raw_catch_keyword =
+          lldb::SBLanguageRuntime::GetCatchKeywordForLanguage(lang);
+      std::string catch_keyword =
+          raw_catch_keyword ? raw_catch_keyword : "catch";
 
-        exception_breakpoints->emplace_back(
-            *this, raw_lang_name + "_" + catch_keyword,
-            capitalized_lang_name + " " + capitalize(catch_keyword), lang);
-      }
+      exception_breakpoints.emplace_back(
+          *this, raw_lang_name + "_" + catch_keyword,
+          capitalized_lang_name + " " + capitalize(catch_keyword), lang,
+          eExceptionKindCatch);
     }
-    assert(!exception_breakpoints->empty() && "should not be empty");
-  });
+  }
 }
 
 ExceptionBreakpoint *DAP::GetExceptionBreakpoint(llvm::StringRef filter) {
-  // PopulateExceptionBreakpoints() is called after g_dap.debugger is created
-  // in a request-initialize.
-  //
-  // But this GetExceptionBreakpoint() method may be called before attaching, in
-  // which case, we may not have populated the filter yet.
-  //
-  // We also cannot call PopulateExceptionBreakpoints() in DAP::DAP() because
-  // we need SBDebugger::Initialize() to have been called before this.
-  //
-  // So just calling PopulateExceptionBreakoints(),which does lazy-populating
-  // seems easiest. Two other options include:
-  //  + call g_dap.PopulateExceptionBreakpoints() in lldb-dap.cpp::main()
-  //    right after the call to SBDebugger::Initialize()
-  //  + Just call PopulateExceptionBreakpoints() to get a fresh list  everytime
-  //    we query (a bit overkill since it's not likely to change?)
-  PopulateExceptionBreakpoints();
-
-  for (auto &bp : *exception_breakpoints) {
+  for (auto &bp : exception_breakpoints) {
     if (bp.GetFilter() == filter)
       return &bp;
   }
@@ -223,10 +213,7 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(llvm::StringRef filter) {
 }
 
 ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const lldb::break_id_t bp_id) {
-  // See comment in the other GetExceptionBreakpoint().
-  PopulateExceptionBreakpoints();
-
-  for (auto &bp : *exception_breakpoints) {
+  for (auto &bp : exception_breakpoints) {
     if (bp.GetID() == bp_id)
       return &bp;
   }
@@ -1118,8 +1105,9 @@ protocol::Capabilities DAP::GetCapabilities() {
   }
 
   // Available filters or options for the setExceptionBreakpoints request.
+  PopulateExceptionBreakpoints();
   std::vector<protocol::ExceptionBreakpointsFilter> filters;
-  for (const auto &exc_bp : *exception_breakpoints)
+  for (const auto &exc_bp : exception_breakpoints)
     filters.emplace_back(CreateExceptionBreakpointFilter(exc_bp));
   capabilities.exceptionBreakpointFilters = std::move(filters);
 
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index 89bc827c1141f..5ca5822f9bced 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -99,7 +99,7 @@ struct DAP {
   lldb::SBBroadcaster broadcaster;
   FunctionBreakpointMap function_breakpoints;
   InstructionBreakpointMap instruction_breakpoints;
-  std::optional<std::vector<ExceptionBreakpoint>> exception_breakpoints;
+  std::vector<ExceptionBreakpoint> exception_breakpoints;
   llvm::once_flag init_exception_breakpoints_flag;
 
   /// Map step in target id to list of function targets that user can choose.
@@ -320,7 +320,7 @@ struct DAP {
     });
   }
 
-  /// The set of capablities supported by this adapter.
+  /// The set of capabilities supported by this adapter.
   protocol::Capabilities GetCapabilities();
 
   /// Debuggee will continue from stopped state.
diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
index 9772e7344ced6..5bf06268a5af2 100644
--- a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
@@ -9,23 +9,33 @@
 #include "ExceptionBreakpoint.h"
 #include "BreakpointBase.h"
 #include "DAP.h"
+#include "Protocol/ProtocolTypes.h"
 #include "lldb/API/SBMutex.h"
 #include "lldb/API/SBTarget.h"
 #include <mutex>
 
+using namespace llvm;
+using namespace lldb_dap::protocol;
+
 namespace lldb_dap {
 
-void ExceptionBreakpoint::SetBreakpoint() {
+protocol::Breakpoint ExceptionBreakpoint::SetBreakpoint(StringRef condition) {
   lldb::SBMutex lock = m_dap.GetAPIMutex();
   std::lock_guard<lldb::SBMutex> guard(lock);
 
-  if (m_bp.IsValid())
-    return;
-  bool catch_value = m_filter.find("_catch") != std::string::npos;
-  bool throw_value = m_filter.find("_throw") != std::string::npos;
-  m_bp = m_dap.target.BreakpointCreateForException(m_language, catch_value,
-                                                   throw_value);
-  m_bp.AddName(BreakpointBase::kDAPBreakpointLabel);
+  if (!m_bp.IsValid()) {
+    m_bp = m_dap.target.BreakpointCreateForException(
+        m_language, m_kind == eExceptionKindCatch,
+        m_kind == eExceptionKindThrow);
+    m_bp.AddName(BreakpointBase::kDAPBreakpointLabel);
+  }
+
+  m_bp.SetCondition(condition.data());
+
+  protocol::Breakpoint breakpoint;
+  breakpoint.id = m_bp.GetID();
+  breakpoint.verified = m_bp.IsValid();
+  return breakpoint;
 }
 
 void ExceptionBreakpoint::ClearBreakpoint() {
diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.h b/lldb/tools/lldb-dap/ExceptionBreakpoint.h
index 319b472a89a34..802ec71ce6ad3 100644
--- a/lldb/tools/lldb-dap/ExceptionBreakpoint.h
+++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.h
@@ -10,6 +10,7 @@
 #define LLDB_TOOLS_LLDB_DAP_EXCEPTIONBREAKPOINT_H
 
 #include "DAPForward.h"
+#include "Protocol/ProtocolTypes.h"
 #include "lldb/API/SBBreakpoint.h"
 #include "lldb/lldb-enumerations.h"
 #include "llvm/ADT/StringRef.h"
@@ -18,14 +19,20 @@
 
 namespace lldb_dap {
 
+enum ExceptionKind : unsigned {
+  eExceptionKindCatch,
+  eExceptionKindThrow,
+};
+
 class ExceptionBreakpoint {
 public:
   ExceptionBreakpoint(DAP &d, std::string f, std::string l,
-                      lldb::LanguageType lang)
+                      lldb::LanguageType lang, ExceptionKind kind)
       : m_dap(d), m_filter(std::move(f)), m_label(std::move(l)),
-        m_language(lang), m_bp() {}
+        m_language(lang), m_kind(kind), m_bp() {}
 
-  void SetBreakpoint();
+  protocol::Breakpoint SetBreakpoint() { return SetBreakpoint(""); };
+  protocol::Breakpoint SetBreakpoint(llvm::StringRef condition);
   void ClearBreakpoint();
 
   lldb::break_id_t GetID() const { return m_bp.GetID(); }
@@ -39,6 +46,7 @@ class ExceptionBreakpoint {
   std::string m_filter;
   std::string m_label;
   lldb::LanguageType m_language;
+  ExceptionKind m_kind;
   lldb::SBBreakpoint m_bp;
 };
 
diff --git a/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
index dcd02d61ca4f4..b499a69876e2c 100644
--- a/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
@@ -54,7 +54,6 @@ llvm::Expected<InitializeResponse> InitializeRequestHandler::Run(
   if (llvm::Error err = dap.RunPreInitCommands())
     return err;
 
-  dap.PopulateExceptionBreakpoints();
   auto cmd = dap.debugger.GetCommandInterpreter().AddMultiwordCommand(
       "lldb-dap", "Commands for managing lldb-dap.");
   if (arguments.supportedFeatures.contains(
diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h
index 0ac8ca7c9a49e..054cc7a321316 100644
--- a/lldb/tools/lldb-dap/Handler/RequestHandler.h
+++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h
@@ -387,14 +387,21 @@ class SetBreakpointsRequestHandler
   Run(const protocol::SetBreakpointsArguments &args) const override;
 };
 
-class SetExceptionBreakpointsRequestHandler : public LegacyRequestHandler {
+class SetExceptionBreakpointsRequestHandler
+    : public RequestHandler<
+          protocol::SetExceptionBreakpointsArguments,
+          llvm::Expected<protocol::SetExceptionBreakpointsResponseBody>> {
 public:
-  using LegacyRequestHandler::LegacyRequestHandler;
+  using RequestHandler::RequestHandler;
   static llvm::StringLiteral GetCommand() { return "setExceptionBreakpoints"; }
   FeatureSet GetSupportedFeatures() const override {
-    return {protocol::eAdapterFeatureExceptionOptions};
+    /// Prefer the `filterOptions` feature over the `exceptionOptions`.
+    /// exceptionOptions is not supported in VSCode, while `filterOptions` is
+    /// supported.
+    return {protocol::eAdapterFeatureExceptionFilterOptions};
   }
-  void operator()(const llvm::json::Object &request) const override;
+  llvm::Expected<protocol::SetExceptionBreakpointsResponseBody>
+  Run(const protocol::SetExceptionBreakpointsArguments &args) const override;
 };
 
 class SetFunctionBreakpointsRequestHandler
diff --git a/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp
index 2214833f8a770..6a271fb825137 100644
--- a/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp
@@ -8,86 +8,61 @@
 
 #include "DAP.h"
 #include "EventHelper.h"
-#include "JSONUtils.h"
+#include "Protocol/ProtocolRequests.h"
 #include "RequestHandler.h"
 #include <set>
 
+using namespace llvm;
+using namespace lldb_dap::protocol;
+
 namespace lldb_dap {
 
-// "SetExceptionBreakpointsRequest": {
-//   "allOf": [ { "$ref": "#/definitions/Request" }, {
-//     "type": "object",
-//     "description": "SetExceptionBreakpoints request; value of command field
-//     is 'setExceptionBreakpoints'. The request configures the debuggers
-//     response to thrown exceptions. If an exception is configured to break, a
-//     StoppedEvent is fired (event type 'exception').", "properties": {
-//       "command": {
-//         "type": "string",
-//         "enum": [ "setExceptionBreakpoints" ]
-//       },
-//       "arguments": {
-//         "$ref": "#/definitions/SetExceptionBreakpointsArguments"
-//       }
-//     },
-//     "required": [ "command", "arguments"  ]
-//   }]
-// },
-// "SetExceptionBreakpointsArguments": {
-//   "type": "object",
-//   "description": "Arguments for 'setExceptionBreakpoints' request.",
-//   "properties": {
-//     "filters": {
-//       "type": "array",
-//       "items": {
-//         "type": "string"
-//       },
-//       "description": "IDs of checked exception options. The set of IDs is
-//       returned via the 'exceptionBreakpointFilters' capability."
-//     },
-//     "exceptionOptions": {
-//       "type": "array",
-//       "items": {
-//         "$ref": "#/definitions/ExceptionOptions"
-//       },
-//       "description": "Configuration options for selected exceptions."
-//     }
-//   },
-//   "required": [ "filters" ]
-// },
-// "SetExceptionBreakpointsResponse": {
-//   "allOf": [ { "$ref": "#/definitions/Response" }, {
-//     "type": "object",
-//     "description": "Response to 'setExceptionBreakpoints' request. This is
-//     just an acknowledgement, so no body field is required."
-//   }]
-// }
-void SetExceptionBreakpointsRequestHandler::operator()(
-    const llvm::json::Object &request) const {
-  llvm::json::Object response;
-  lldb::SBError error;
-  FillResponse(request, response);
-  const auto *arguments = request.getObject("arguments");
-  const auto *filters = arguments->getArray("filters");
+/// The request configures the debugger’s response to thrown exceptions. Each of
+/// the `filters`, `filterOptions`, and `exceptionOptions` in the request are
+/// independent configurations to a debug adapter indicating a kind of exception
+/// to catch. An exception thrown in a program should result in a `stopped`
+/// event from the debug adapter (with reason `exception`) if any of the
+/// configured filters match.
+///
+/// Clients should only call this request if the corresponding capability
+/// `exceptionBreakpointFilters` returns one or more filters.
+Expected<SetExceptionBreakpointsResponseBody>
+SetExceptionBreakpointsRequestHandler::Run(
+    const SetExceptionBreakpointsArguments &arguments) const {
   // Keep a list of any exception breakpoint filter names that weren't set
   // so we can clear any exception breakpoints if needed.
-  std::set<llvm::StringRef> unset_filters;
-  for (const auto &bp : *dap.exception_breakpoints)
+  std::set<StringRef> unset_filters;
+  for (const auto &bp : dap.exception_breakpoints)
     unset_filters.insert(bp.GetFilter());
 
-  for (const auto &value : *filters) {
-    const auto filter = GetAsString(value);
+  SetExceptionBreakpointsResponseBody body;
+  for (const auto &filter : arguments.filters) {
     auto *exc_bp = dap.GetExceptionBreakpoint(filter);
-    if (exc_bp) {
-      exc_bp->SetBreakpoint();
-      unset_filters.erase(std::string(filter));
-    }
+    if (!exc_bp)
+      continue;
+
+    body.breakpoints.push_back(exc_bp->SetBreakpoint());
+    unset_filters.erase(filter);
+  }
+  for (const auto &filterOptions : arguments.filterOptions) {
+    auto *exc_bp = dap.GetExceptionBreakpoint(filterOptions.filterId);
+    if (!exc_bp)
+      continue;
+
+    body.breakpoints.push_back(exc_bp->SetBreakpoint(filterOptions.condition));
+    unset_filters.erase(filterOptions.filterId);
   }
+
+  // Clear any unset filters.
   for (const auto &filter : unset_filters) {
     auto *exc_bp = dap.GetExceptionBreakpoint(filter);
-    if (exc_bp)
-      exc_bp->ClearBreakpoint();
+    if (!exc_bp)
+      continue;
+
+    exc_bp->ClearBreakpoint();
   }
-  dap.SendJSON(llvm::json::Value(std::move(response)));
+
+  return body;
 }
 
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp
index 6cdde63e9796e..cf7db41559b8d 100644
--- a/lldb/tools/lldb-dap/JSONUtils.cpp
+++ b/lldb/tools/lldb-dap/JSONUtils.cpp
@@ -482,15 +482,6 @@ llvm::json::Object CreateEventObject(const llvm::StringRef event_name) {
   return event;
 }
 
-protocol::ExceptionBreakpointsFilter
-CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) {
-  protocol::ExceptionBreakpointsFilter filter;
-  filter.filter = bp.GetFilter();
-  filter.label = bp.GetLabel();
-  filter.defaultState = ExceptionBreakpoint::kDefaultValue;
-  return filter;
-}
-
 // "StackFrame": {
 //   "type": "object",
 //   "description": "A Stackframe contains the source location.",
diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h
index 10dc46b94184f..69da0725bd05c 100644
--- a/lldb/tools/lldb-dap/JSONUtils.h
+++ b/lldb/tools/lldb-dap/JSONUtils.h
@@ -224,18 +224,6 @@ llvm::json::Value CreateModule(lldb::SBTarget &target, lldb::SBModule &module,
 ///     definition outlined by Microsoft.
 llvm::json::Object CreateEventObject(const llvm::StringRef event_name);
 
-/// Create a "ExceptionBreakpointsFilter" JSON object as described in
-/// the debug adapter definition.
-///
-/// \param[in] bp
-///     The exception breakpoint object to use
-///
-/// \return
-///     A "ExceptionBreakpointsFilter" JSON object with that follows
-///     the formal JSON definition outlined by Microsoft.
-protocol::ExceptionBreakpointsFilter
-CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp);
-
 /// Create a "StackFrame" object for a LLDB frame object.
 ///
 /// This function will fill in the following keys in the returned
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
index 1b1891ba59e61..e6ba54ed4dcd6 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp
@@ -448,6 +448,20 @@ json::Value toJSON(const SetDataBreakpointsResponseBody &SDBR) {
   return json::Object{{"breakpoints", SDBR.breakpoints}};
 }
 
+bool fromJSON(const json::Value &Params, SetExceptionBreakpointsArguments &Args,
+              json::Path P) {
+  json::ObjectMapper O(Params, P);
+  return O && O.map("filters", Args.filters) &&
+         O.mapOptional("filterOptions", Args.filterOptions);
+}
+
+json::Value toJSON(const SetExceptionBreakpointsResponseBody &B) {
+  json::Object result;
+  if (!B.breakpoints.empty())
+    result.insert({"breakpoints", B.breakpoints});
+  return result;
+}
+
 json::Value toJSON(const ThreadsResponseBody &TR) {
   return json::Object{{"threads", TR.threads}};
 }
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
index 583c203be8e1a..01b8f2445c9fa 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h
@@ -751,6 +751,56 @@ struct SetDataBreakpointsResponseBody {
 };
 llvm::json::Value toJSON(const SetDataBreakpointsResponseBody &);
 
+/// Arguments for `setExceptionBreakpoints` request.
+struct SetExceptionBreakpointsArguments {
+  /// Set of exception filters specified by their ID. The set of all possible
+  /// exception filters is defined by the `exceptionBreakpointFilters`
+  /// capability. The `filter` and `filterOptions` sets are additive.
+  std::vector<std::string> filters;
+
+  /// Set of exception filters and their options. The set of all possible
+  /// exception filters is defined by the `exceptionBreakpointFilters`
+  /// capability. This attribute is only honored by a debug adapter if the
+  /// corresponding capability `supportsExceptionFilterOptions` is true. The
+  /// `filter` and `filterOptions` sets are additive.
+  std::vector<ExceptionFilterOptions> filterOptions;
+
+  // unsupported keys: exceptionOptions
+};
+bool fromJSON(const llvm::json::Value &, SetExceptionBreakpointsArguments &,
+              llvm::json::Path);
+
+/// Response to `setExceptionBreakpoints` request.
+///
+/// The response contains an array of `Breakpoint` objects with information
+/// about each exception breakpoint or filter. The `Breakpoint` objects are in
+/// the same order as the elements of the `filters`, `filterOptions`,
+/// `exceptionOptions` arrays given as arguments. If both `filters` and
+/// `filterOptions` are given, the returned array must start with `filters`
+/// information first, followed by `filterOptions` information.
+///
+/// The `verified` property of a `Breakpoint` object signals whether the
+/// exception breakpoint or filter could be successfully created and whether the
+/// condition is valid. In case of an error the `message` property explains the
+/// problem. The `id` property can be used to introduce a unique ID for the
+/// exception breakpoint or filter so that it can be updated subsequently by
+/// sending breakpoint events.
+///
+/// For backward compatibility both the `breakpoints` array and the enclosing
+/// `body` are optional. If these elements are missing a client is not able to
+/// show problems for individual exception breakpoints or filters.
+struct SetExceptionBreakpointsResponseBody {
+  /// Information about the exception breakpoints or filters.
+  ///
+  /// The breakpoints returned are in the same order as the elements of the
+  /// `filters`, `filterOptions`, `exceptionOptions` arrays in the arguments. If
+  /// both `filters` and `filterOptions` are given, the returned array must
+  /// start with `filters` information first, followed by `filterOptions`
+  /// information.
+  std::vector<Breakpoint> breakpoints;
+};
+llvm::json::Value toJSON(const SetExceptionBreakpointsResponseBody &);
+
 /// Arguments to `disassemble` request.
 struct DisassembleArguments {
   /// Memory reference to the base location containing the instructions to
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
index c21f8382320a5..7f96c07faae10 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.cpp
@@ -86,14 +86,14 @@ bool fromJSON(const llvm::json::Value &Params, ExceptionBreakpointsFilter &EBF,
 json::Value toJSON(const ExceptionBreakpointsFilter &EBF) {
   json::Object result{{"filter", EBF.filter}, {"label", EBF.label}};
 
-  if (EBF.description)
-    result.insert({"description", *EBF.description});
+  if (!EBF.description.empty())
+    result.insert({"description", EBF.description});
   if (EBF.defaultState)
-    result.insert({"default", *EBF.defaultState});
+    result.insert({"default", EBF.defaultState});
   if (EBF.supportsCondition)
-    result.insert({"supportsCondition", *EBF.supportsCondition});
-  if (EBF.conditionDescription)
-    result.insert({"conditionDescription", *EBF.conditionDescription});
+    result.insert({"supportsCondition", EBF.supportsCondition});
+  if (!EBF.conditionDescription.empty())
+    result.insert({"conditionDescription", EBF.conditionDescription});
 
   return result;
 }
@@ -418,23 +418,41 @@ json::Value toJSON(const Capabilities &C) {
   for (const auto &feature : C.supportedFeatures)
     result.insert({ToString(feature), true});
 
-  if (C.exceptionBreakpointFilters && !C.exceptionBreakpointFilters->empty())
+  if (!C.exceptionBreakpointFilters.empty())
+    result.insert({"exceptionBreakpointFilters", C.exceptionBreakpointFilters});
+  if (!C.completionTriggerCharacters.empty())
     result.insert(
-        {"exceptionBreakpointFilters", *C.exceptionBreakpointFilters});
-  if (C.completionTriggerCharacters && !C.completionTriggerCharacters->empty())
+        {"completionTriggerCharacters", C.completionTriggerCharacters});
+  if (!C.additionalModuleColumns.empty())
+    result.insert({"additionalModuleColumns", C.additionalModuleColumns});
+  if (!C.supportedChecksumAlgorithms.empty())
     result.insert(
-        {"completionTriggerCharacters", *C.completionTriggerCharacters});
-  if (C.additionalModuleColumns && !C.additionalModuleColumns->empty())
-    result.insert({"additionalModuleColumns", *C.additionalModuleColumns});
-  if (C.supportedChecksumAlgorithms && !C.supportedChecksumAlgorithms->empty())
-    result.insert(
-        {"supportedChecksumAlgorithms", *C.supportedChecksumAlgorithms});
-  if (C.breakpointModes && !C.breakpointModes->empty())
-    result.insert({"breakpointModes", *C.breakpointModes});
+        {"supportedChecksumAlgorithms", C.supportedChecksumAlgorithms});
+  if (!C.breakpointModes.empty())
+    result.insert({"breakpointModes", C.breakpointModes});
 
   // lldb-dap extensions
-  if (C.lldbExtVersion && !C.lldbExtVersion->empty())
-    result.insert({"$__lldb_version", *C.lldbExtVersion});
+  if (!C.lldbExtVersion.empty())
+    result.insert({"$__lldb_version", C.lldbExtVersion});
+
+  return result;
+}
+
+bool fromJSON(const json::Value &Params, ExceptionFilterOptions &EFO,
+              json::Path P) {
+  json::ObjectMapper O(Params, P);
+  return O && O.map("filterId", EFO.filterId) &&
+         O.mapOptional("condition", EFO.condition) &&
+         O.mapOptional("mode", EFO.mode);
+}
+
+json::Value toJSON(const ExceptionFilterOptions &EFO) {
+  json::Object result{{"filterId", EFO.filterId}};
+
+  if (!EFO.condition.empty())
+    result.insert({"condition", EFO.condition});
+  if (!EFO.mode.empty())
+    result.insert({"mode", EFO.mode});
 
   return result;
 }
diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
index d7094fbab9e59..7fe7454113994 100644
--- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
+++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h
@@ -43,19 +43,19 @@ struct ExceptionBreakpointsFilter {
 
   /// A help text providing additional information about the exception filter.
   /// This string is typically shown as a hover and can be translated.
-  std::optional<std::string> description;
+  std::string description;
 
   /// Initial value of the filter option. If not specified a value false is
   /// assumed.
-  std::optional<bool> defaultState;
+  bool defaultState = false;
 
   /// Controls whether a condition can be specified for this filter option. If
   /// false or missing, a condition can not be set.
-  std::optional<bool> supportsCondition;
+  bool supportsCondition = false;
 
   /// A help text providing information about the condition. This string is
   /// shown as the placeholder text for a text box and can be translated.
-  std::optional<std::string> conditionDescription;
+  std::string conditionDescription;
 };
 bool fromJSON(const llvm::json::Value &, ExceptionBreakpointsFilter &,
               llvm::json::Path);
@@ -253,18 +253,17 @@ struct Capabilities {
 
   /// Available exception filter options for the `setExceptionBreakpoints`
   /// request.
-  std::optional<std::vector<ExceptionBreakpointsFilter>>
-      exceptionBreakpointFilters;
+  std::vector<ExceptionBreakpointsFilter> exceptionBreakpointFilters;
 
   /// The set of characters that should trigger completion in a REPL. If not
   /// specified, the UI should assume the `.` character.
-  std::optional<std::vector<std::string>> completionTriggerCharacters;
+  std::vector<std::string> completionTriggerCharacters;
 
   /// The set of additional module information exposed by the debug adapter.
-  std::optional<std::vector<ColumnDescriptor>> additionalModuleColumns;
+  std::vector<ColumnDescriptor> additionalModuleColumns;
 
   /// Checksum algorithms supported by the debug adapter.
-  std::optional<std::vector<ChecksumAlgorithm>> supportedChecksumAlgorithms;
+  std::vector<ChecksumAlgorithm> supportedChecksumAlgorithms;
 
   /// Modes of breakpoints supported by the debug adapter, such as 'hardware' or
   /// 'software'. If present, the client may allow the user to select a mode and
@@ -272,19 +271,39 @@ struct Capabilities {
   ///
   /// Clients may present the first applicable mode in this array as the
   /// 'default' mode in gestures that set breakpoints.
-  std::optional<std::vector<BreakpointMode>> breakpointModes;
+  std::vector<BreakpointMode> breakpointModes;
 
   /// lldb-dap Extensions
   /// @{
 
   /// The version of the adapter.
-  std::optional<std::string> lldbExtVersion;
+  std::string lldbExtVersion;
 
   /// @}
 };
 bool fromJSON(const llvm::json::Value &, Capabilities &, llvm::json::Path);
 llvm::json::Value toJSON(const Capabilities &);
 
+/// An `ExceptionFilterOptions` is used to specify an exception filter together
+/// with a condition for the `setExceptionBreakpoints` request.
+struct ExceptionFilterOptions {
+  /// ID of an exception filter returned by the `exceptionBreakpointFilters`
+  /// capability.
+  std::string filterId;
+
+  /// An expression for conditional exceptions.
+  /// The exception breaks into the debugger if the result of the condition is
+  /// true.
+  std::string condition;
+
+  /// The mode of this exception breakpoint. If defined, this must be one of the
+  /// `breakpointModes` the debug adapter advertised in its `Capabilities`.
+  std::string mode;
+};
+bool fromJSON(const llvm::json::Value &, ExceptionFilterOptions &,
+              llvm::json::Path);
+llvm::json::Value toJSON(const ExceptionFilterOptions &);
+
 /// A `Source` is a descriptor for source code. It is returned from the debug
 /// adapter as part of a `StackFrame` and it is used by clients when specifying
 /// breakpoints.
diff --git a/lldb/tools/lldb-dap/ProtocolUtils.cpp b/lldb/tools/lldb-dap/ProtocolUtils.cpp
index 6e0adf5bc8b59..cb1ee6a424003 100644
--- a/lldb/tools/lldb-dap/ProtocolUtils.cpp
+++ b/lldb/tools/lldb-dap/ProtocolUtils.cpp
@@ -161,4 +161,15 @@ std::vector<protocol::Thread> GetThreads(lldb::SBProcess process,
   return threads;
 }
 
+protocol::ExceptionBreakpointsFilter
+CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) {
+  protocol::ExceptionBreakpointsFilter filter;
+  filter.filter = bp.GetFilter();
+  filter.label = bp.GetLabel();
+  filter.description = bp.GetLabel();
+  filter.defaultState = ExceptionBreakpoint::kDefaultValue;
+  filter.supportsCondition = true;
+  return filter;
+}
+
 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/ProtocolUtils.h b/lldb/tools/lldb-dap/ProtocolUtils.h
index 2b2ac9e8e35fd..788d2fd054e2d 100644
--- a/lldb/tools/lldb-dap/ProtocolUtils.h
+++ b/lldb/tools/lldb-dap/ProtocolUtils.h
@@ -13,6 +13,7 @@
 #ifndef LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_UTILS_H
 #define LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_UTILS_H
 
+#include "ExceptionBreakpoint.h"
 #include "Protocol/ProtocolTypes.h"
 
 #include "lldb/API/SBAddress.h"
@@ -74,6 +75,18 @@ protocol::Thread CreateThread(lldb::SBThread &thread, lldb::SBFormat &format);
 std::vector<protocol::Thread> GetThreads(lldb::SBProcess process,
                                          lldb::SBFormat &format);
 
+/// Create a "ExceptionBreakpointsFilter" JSON object as described in
+/// the debug adapter definition.
+///
+/// \param[in] bp
+///     The exception breakpoint object to use
+///
+/// \return
+///     A "ExceptionBreakpointsFilter" JSON object with that follows
+///     the formal JSON definition outlined by Microsoft.
+protocol::ExceptionBreakpointsFilter
+CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp);
+
 } // namespace lldb_dap
 
 #endif
diff --git a/lldb/unittests/DAP/ProtocolTypesTest.cpp b/lldb/unittests/DAP/ProtocolTypesTest.cpp
index f2a23db346565..46a09f090fea2 100644
--- a/lldb/unittests/DAP/ProtocolTypesTest.cpp
+++ b/lldb/unittests/DAP/ProtocolTypesTest.cpp
@@ -243,14 +243,12 @@ TEST(ProtocolTypesTest, Capabilities) {
             deserialized_capabilities->supportedFeatures);
 
   // Verify exception breakpoint filters.
-  ASSERT_TRUE(
-      deserialized_capabilities->exceptionBreakpointFilters.has_value());
-  EXPECT_EQ(capabilities.exceptionBreakpointFilters->size(),
-            deserialized_capabilities->exceptionBreakpointFilters->size());
-  for (size_t i = 0; i < capabilities.exceptionBreakpointFilters->size(); ++i) {
-    const auto &original = capabilities.exceptionBreakpointFilters->at(i);
+  EXPECT_EQ(capabilities.exceptionBreakpointFilters.size(),
+            deserialized_capabilities->exceptionBreakpointFilters.size());
+  for (size_t i = 0; i < capabilities.exceptionBreakpointFilters.size(); ++i) {
+    const auto &original = capabilities.exceptionBreakpointFilters.at(i);
     const auto &deserialized =
-        deserialized_capabilities->exceptionBreakpointFilters->at(i);
+        deserialized_capabilities->exceptionBreakpointFilters.at(i);
     EXPECT_EQ(original.filter, deserialized.filter);
     EXPECT_EQ(original.label, deserialized.label);
     EXPECT_EQ(original.description, deserialized.description);
@@ -260,19 +258,16 @@ TEST(ProtocolTypesTest, Capabilities) {
   }
 
   // Verify completion trigger characters.
-  ASSERT_TRUE(
-      deserialized_capabilities->completionTriggerCharacters.has_value());
   EXPECT_EQ(capabilities.completionTriggerCharacters,
             deserialized_capabilities->completionTriggerCharacters);
 
   // Verify additional module columns.
-  ASSERT_TRUE(deserialized_capabilities->additionalModuleColumns.has_value());
-  EXPECT_EQ(capabilities.additionalModuleColumns->size(),
-            deserialized_capabilities->additionalModuleColumns->size());
-  for (size_t i = 0; i < capabilities.additionalModuleColumns->size(); ++i) {
-    const auto &original = capabilities.additionalModuleColumns->at(i);
+  EXPECT_EQ(capabilities.additionalModuleColumns.size(),
+            deserialized_capabilities->additionalModuleColumns.size());
+  for (size_t i = 0; i < capabilities.additionalModuleColumns.size(); ++i) {
+    const auto &original = capabilities.additionalModuleColumns.at(i);
     const auto &deserialized =
-        deserialized_capabilities->additionalModuleColumns->at(i);
+        deserialized_capabilities->additionalModuleColumns.at(i);
     EXPECT_EQ(original.attributeName, deserialized.attributeName);
     EXPECT_EQ(original.label, deserialized.label);
     EXPECT_EQ(original.format, deserialized.format);
@@ -281,19 +276,15 @@ TEST(ProtocolTypesTest, Capabilities) {
   }
 
   // Verify supported checksum algorithms.
-  ASSERT_TRUE(
-      deserialized_capabilities->supportedChecksumAlgorithms.has_value());
   EXPECT_EQ(capabilities.supportedChecksumAlgorithms,
             deserialized_capabilities->supportedChecksumAlgorithms);
 
   // Verify breakpoint modes.
-  ASSERT_TRUE(deserialized_capabilities->breakpointModes.has_value());
-  EXPECT_EQ(capabilities.breakpointModes->size(),
-            deserialized_capabilities->breakpointModes->size());
-  for (size_t i = 0; i < capabilities.breakpointModes->size(); ++i) {
-    const auto &original = capabilities.breakpointModes->at(i);
-    const auto &deserialized =
-        deserialized_capabilities->breakpointModes->at(i);
+  EXPECT_EQ(capabilities.breakpointModes.size(),
+            deserialized_capabilities->breakpointModes.size());
+  for (size_t i = 0; i < capabilities.breakpointModes.size(); ++i) {
+    const auto &original = capabilities.breakpointModes.at(i);
+    const auto &deserialized = deserialized_capabilities->breakpointModes.at(i);
     EXPECT_EQ(original.mode, deserialized.mode);
     EXPECT_EQ(original.label, deserialized.label);
     EXPECT_EQ(original.description, deserialized.description);
@@ -301,7 +292,6 @@ TEST(ProtocolTypesTest, Capabilities) {
   }
 
   // Verify lldb extension version.
-  ASSERT_TRUE(deserialized_capabilities->lldbExtVersion.has_value());
   EXPECT_EQ(capabilities.lldbExtVersion,
             deserialized_capabilities->lldbExtVersion);
 }
@@ -687,6 +677,75 @@ TEST(ProtocolTypesTest, CapabilitiesEventBody) {
   EXPECT_EQ(json, pp(body));
 }
 
+TEST(ProtocolTypesTest, ExceptionFilterOptions) {
+  EXPECT_THAT_EXPECTED(parse<ExceptionFilterOptions>(R"({"filterId":"id"})"),
+                       HasValue(Value(ExceptionFilterOptions{
+                           /*filterId=*/"id", /*condition=*/"", /*mode*/ ""})));
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(R"({"filterId":"id","condition":"1+2"})"),
+      HasValue(Value(ExceptionFilterOptions{
+          /*filterId=*/"id", /*condition=*/"1+2", /*mode*/ ""})));
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(
+          R"({"filterId":"id","condition":"1+2","mode":"m"})"),
+      HasValue(Value(ExceptionFilterOptions{
+          /*filterId=*/"id", /*condition=*/"1+2", /*mode*/ "m"})));
+
+  // Validate parsing errors
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(R"({})", "exceptionFilterOptions"),
+      FailedWithMessage("missing value at exceptionFilterOptions.filterId"));
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(R"({"filterId":"id","condition":42})",
+                                    "exceptionFilterOptions"),
+      FailedWithMessage("expected string at exceptionFilterOptions.condition"));
+  EXPECT_THAT_EXPECTED(
+      parse<ExceptionFilterOptions>(R"({"filterId":"id","mode":42})",
+                                    "exceptionFilterOptions"),
+      FailedWithMessage("expected string at exceptionFilterOptions.mode"));
+}
+
+TEST(ProtocolTypesTest, SetExceptionBreakpointsArguments) {
+  EXPECT_THAT_EXPECTED(
+      parse<SetExceptionBreakpointsArguments>(R"({"filters":[]})"),
+      HasValue(testing::FieldsAre(/*filters=*/testing::IsEmpty(),
+                                  /*filterOptions=*/testing::IsEmpty())));
+  EXPECT_THAT_EXPECTED(
+      parse<SetExceptionBreakpointsArguments>(R"({"filters":["abc"]})"),
+      HasValue(testing::FieldsAre(/*filters=*/std::vector<std::string>{"abc"},
+                                  /*filterOptions=*/testing::IsEmpty())));
+  EXPECT_THAT_EXPECTED(
+      parse<SetExceptionBreakpointsArguments>(
+          R"({"filters":[],"filterOptions":[{"filterId":"abc"}]})"),
+      HasValue(testing::FieldsAre(
+          /*filters=*/testing::IsEmpty(),
+          /*filterOptions=*/testing::Contains(testing::FieldsAre(
+              /*filterId=*/"abc", /*condition=*/"", /*mode=*/"")))));
+
+  // Validate parse errors
+  EXPECT_THAT_EXPECTED(parse<SetExceptionBreakpointsArguments>(R"({})"),
+                       FailedWithMessage("missing value at (root).filters"));
+  EXPECT_THAT_EXPECTED(
+      parse<SetExceptionBreakpointsArguments>(R"({"filters":false})"),
+      FailedWithMessage("expected array at (root).filters"));
+}
+
+TEST(ProtocolTypesTest, SetExceptionBreakpointsResponseBody) {
+  SetExceptionBreakpointsResponseBody body;
+  Breakpoint bp;
+  bp.id = 12, bp.verified = true;
+  body.breakpoints = {bp};
+  EXPECT_EQ(R"({
+  "breakpoints": [
+    {
+      "id": 12,
+      "verified": true
+    }
+  ]
+})",
+            pp(body));
+}
+
 TEST(ProtocolTypesTest, StepInTarget) {
   StepInTarget target;
   target.id = 230;
@@ -705,4 +764,4 @@ TEST(ProtocolTypesTest, StepInTarget) {
   EXPECT_EQ(target.column, deserialized_target->column);
   EXPECT_EQ(target.endLine, deserialized_target->endLine);
   EXPECT_EQ(target.endColumn, deserialized_target->endColumn);
-}
\ No newline at end of file
+}

From 97bfb936af4077e8cb6c75664231f27a9989d563 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:42:53 +0900
Subject: [PATCH 662/851] DAG: Move soft float predicate management into
 RuntimeLibcalls (#142905)

Work towards making RuntimeLibcalls the centralized location for
all libcall information. This requires changing the encoding from
tracking the ISD::CondCode to using CmpInst::Predicate.
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  14 +-
 llvm/include/llvm/IR/RuntimeLibcalls.h        |  25 +++
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   5 +-
 llvm/lib/IR/RuntimeLibcalls.cpp               |  33 ++++
 llvm/lib/Target/ARM/ARMISelLowering.cpp       | 182 +++++++++---------
 5 files changed, 160 insertions(+), 99 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4ed81d25e8e22..dd44afd0855a5 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3574,20 +3574,18 @@ class LLVM_ABI TargetLoweringBase {
 
   /// Override the default CondCode to be used to test the result of the
   /// comparison libcall against zero.
-  /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD.
-  void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
-    CmpLibcallCCs[Call] = CC;
+  /// FIXME: This should be removed
+  void setCmpLibcallCC(RTLIB::Libcall Call, CmpInst::Predicate Pred) {
+    Libcalls.setSoftFloatCmpLibcallPredicate(Call, Pred);
   }
 
-
   /// Get the CondCode that's to be used to test the result of the comparison
   /// libcall against zero.
-  /// FIXME: This can't be merged with 'RuntimeLibcallsInfo' because of the ISD.
-  ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
-    return CmpLibcallCCs[Call];
+  CmpInst::Predicate
+  getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const {
+    return Libcalls.getSoftFloatCmpLibcallPredicate(Call);
   }
 
-
   /// Set the CallingConv that should be used for the specified libcall.
   void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
     Libcalls.setLibcallCallingConv(Call, CC);
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 051bcc147cb71..45826fcd19f38 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
@@ -86,6 +87,20 @@ struct RuntimeLibcallsInfo {
     return ArrayRef(LibcallRoutineNames).drop_back();
   }
 
+  /// Get the comparison predicate that's to be used to test the result of the
+  /// comparison libcall against zero. This should only be used with
+  /// floating-point compare libcalls.
+  CmpInst::Predicate
+  getSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call) const {
+    return SoftFloatCompareLibcallPredicates[Call];
+  }
+
+  // FIXME: This should be removed. This should be private constant.
+  void setSoftFloatCmpLibcallPredicate(RTLIB::Libcall Call,
+                                       CmpInst::Predicate Pred) {
+    SoftFloatCompareLibcallPredicates[Call] = Pred;
+  }
+
 private:
   /// Stores the name each libcall.
   const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
@@ -93,6 +108,14 @@ struct RuntimeLibcallsInfo {
   /// Stores the CallingConv that should be used for each libcall.
   CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
 
+  /// The condition type that should be used to test the result of each of the
+  /// soft floating-point comparison libcall against integer zero.
+  ///
+  // FIXME: This is only relevant for the handful of floating-point comparison
+  // runtime calls; it's excessive to have a table entry for every single
+  // opcode.
+  CmpInst::Predicate SoftFloatCompareLibcallPredicates[RTLIB::UNKNOWN_LIBCALL];
+
   static bool darwinHasSinCos(const Triple &TT) {
     assert(TT.isOSDarwin() && "should be called with darwin triple");
     // Don't bother with 32 bit x86.
@@ -108,6 +131,8 @@ struct RuntimeLibcallsInfo {
     return true;
   }
 
+  void initSoftFloatCmpLibcallPredicates();
+
   /// Set default libcall names. If a target wants to opt-out of a libcall it
   /// should be placed here.
   LLVM_ABI void initLibcalls(const Triple &TT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a0ffb4b6d5a4c..52f19cc6e1ab0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -14,6 +14,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/CodeGenCommonISel.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -419,7 +420,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
   NewLHS = Call.first;
   NewRHS = DAG.getConstant(0, dl, RetVT);
 
-  CCCode = getCmpLibcallCC(LC1);
+  CCCode = getICmpCondCode(getSoftFloatCmpLibcallPredicate(LC1));
   if (ShouldInvertCC) {
     assert(RetVT.isInteger());
     CCCode = getSetCCInverse(CCCode, RetVT);
@@ -441,7 +442,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
 
     SDValue Tmp = DAG.getSetCC(dl, SetCCVT, NewLHS, NewRHS, CCCode);
     auto Call2 = makeLibCall(DAG, LC2, RetVT, Ops, CallOptions, dl, Chain);
-    CCCode = getCmpLibcallCC(LC2);
+    CCCode = getICmpCondCode(getSoftFloatCmpLibcallPredicate(LC2));
     if (ShouldInvertCC)
       CCCode = getSetCCInverse(CCCode, RetVT);
     NewLHS = DAG.getSetCC(dl, SetCCVT, Call2.first, NewRHS, CCCode);
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index d63d398e243f9..8506a0c03d33c 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -220,12 +220,45 @@ static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) {
   // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
 }
 
+void RuntimeLibcallsInfo::initSoftFloatCmpLibcallPredicates() {
+  SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F32] = CmpInst::ICMP_EQ;
+  SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F64] = CmpInst::ICMP_EQ;
+  SoftFloatCompareLibcallPredicates[RTLIB::OEQ_F128] = CmpInst::ICMP_EQ;
+  SoftFloatCompareLibcallPredicates[RTLIB::OEQ_PPCF128] = CmpInst::ICMP_EQ;
+  SoftFloatCompareLibcallPredicates[RTLIB::UNE_F32] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UNE_F64] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UNE_F128] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UNE_PPCF128] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGE_F32] = CmpInst::ICMP_SGE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGE_F64] = CmpInst::ICMP_SGE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGE_F128] = CmpInst::ICMP_SGE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGE_PPCF128] = CmpInst::ICMP_SGE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLT_F32] = CmpInst::ICMP_SLT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLT_F64] = CmpInst::ICMP_SLT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLT_F128] = CmpInst::ICMP_SLT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLT_PPCF128] = CmpInst::ICMP_SLT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLE_F32] = CmpInst::ICMP_SLE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLE_F64] = CmpInst::ICMP_SLE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLE_F128] = CmpInst::ICMP_SLE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OLE_PPCF128] = CmpInst::ICMP_SLE;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGT_F32] = CmpInst::ICMP_SGT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGT_F64] = CmpInst::ICMP_SGT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGT_F128] = CmpInst::ICMP_SGT;
+  SoftFloatCompareLibcallPredicates[RTLIB::OGT_PPCF128] = CmpInst::ICMP_SGT;
+  SoftFloatCompareLibcallPredicates[RTLIB::UO_F32] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UO_F64] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UO_F128] = CmpInst::ICMP_NE;
+  SoftFloatCompareLibcallPredicates[RTLIB::UO_PPCF128] = CmpInst::ICMP_NE;
+}
+
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
   std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames),
             nullptr);
 
+  initSoftFloatCmpLibcallPredicates();
+
 #define HANDLE_LIBCALL(code, name) setLibcallName(RTLIB::code, name);
 #define LIBCALL_NO_NAME nullptr
 #include "llvm/IR/RuntimeLibcalls.def"
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 5b3664c4e961f..05d8a1190ada8 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -522,67 +522,69 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
+      // clang-format off
       static const struct {
         const RTLIB::Libcall Op;
         const char * const Name;
-        const ISD::CondCode Cond;
+        const CmpInst::Predicate Cond;
       } LibraryCalls[] = {
         // Single-precision floating-point arithmetic.
-        { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::ADD_F32, "__addsf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::SUB_F32, "__subsf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::MUL_F32, "__mulsf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::DIV_F32, "__divsf3vfp", CmpInst::BAD_ICMP_PREDICATE },
 
         // Double-precision floating-point arithmetic.
-        { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
-        { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
+        { RTLIB::ADD_F64, "__adddf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::SUB_F64, "__subdf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::MUL_F64, "__muldf3vfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::DIV_F64, "__divdf3vfp", CmpInst::BAD_ICMP_PREDICATE },
 
         // Single-precision comparisons.
-        { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
-        { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
-        { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
-        { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
-        { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
-        { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
-        { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
+        { RTLIB::OEQ_F32, "__eqsf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::UNE_F32, "__nesf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OLT_F32, "__ltsf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OLE_F32, "__lesf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OGE_F32, "__gesf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OGT_F32, "__gtsf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::UO_F32,  "__unordsf2vfp", CmpInst::ICMP_NE },
 
         // Double-precision comparisons.
-        { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
-        { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
-        { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
-        { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
-        { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
-        { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
-        { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
+        { RTLIB::OEQ_F64, "__eqdf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::UNE_F64, "__nedf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OLT_F64, "__ltdf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OLE_F64, "__ledf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OGE_F64, "__gedf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::OGT_F64, "__gtdf2vfp",    CmpInst::ICMP_NE },
+        { RTLIB::UO_F64,  "__unorddf2vfp", CmpInst::ICMP_NE },
 
         // Floating-point to integer conversions.
         // i64 conversions are done via library routines even when generating VFP
         // instructions, so use the same ones.
-        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
-        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
-        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
-        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
+        { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", CmpInst::BAD_ICMP_PREDICATE },
 
         // Conversions between floating types.
-        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
-        { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
+        { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", CmpInst::BAD_ICMP_PREDICATE },
 
         // Integer to floating-point conversions.
         // i64 conversions are done via library routines even when generating VFP
         // instructions, so use the same ones.
         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
         // e.g., __floatunsidf vs. __floatunssidfvfp.
-        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
-        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
-        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
-        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
+        { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    CmpInst::BAD_ICMP_PREDICATE },
+        { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", CmpInst::BAD_ICMP_PREDICATE },
       };
+      // clang-format on
 
       for (const auto &LC : LibraryCalls) {
         setLibcallName(LC.Op, LC.Name);
-        if (LC.Cond != ISD::SETCC_INVALID)
+        if (LC.Cond != CmpInst::BAD_ICMP_PREDICATE)
           setCmpLibcallCC(LC.Op, LC.Cond);
       }
     }
@@ -592,97 +594,99 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   if (Subtarget->isAAPCS_ABI() &&
       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
+    // clang-format off
     static const struct {
       const RTLIB::Libcall Op;
       const char * const Name;
       const CallingConv::ID CC;
-      const ISD::CondCode Cond;
+      const CmpInst::Predicate Cond;
     } LibraryCalls[] = {
       // Double-precision floating-point arithmetic helper functions
       // RTABI chapter 4.1.2, Table 2
-      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Double-precision floating-point comparison helper functions
       // RTABI chapter 4.1.2, Table 3
-      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
-      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_EQ },
+      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Single-precision floating-point arithmetic helper functions
       // RTABI chapter 4.1.2, Table 4
-      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Single-precision floating-point comparison helper functions
       // RTABI chapter 4.1.2, Table 5
-      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
-      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
-      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, CmpInst::ICMP_EQ },
+      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, CmpInst::ICMP_NE },
+      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Floating-point to integer conversions.
       // RTABI chapter 4.1.2, Table 6
-      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Conversions between floating types.
       // RTABI chapter 4.1.2, Table 7
-      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Integer to floating-point conversions.
       // RTABI chapter 4.1.2, Table 8
-      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Long long helper functions
       // RTABI chapter 4.2, Table 9
-      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
 
       // Integer division functions
       // RTABI chapter 4.3.1
-      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
-      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
+      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, CmpInst::BAD_ICMP_PREDICATE },
     };
+    // clang-format on
 
     for (const auto &LC : LibraryCalls) {
       setLibcallName(LC.Op, LC.Name);
       setLibcallCallingConv(LC.Op, LC.CC);
-      if (LC.Cond != ISD::SETCC_INVALID)
+      if (LC.Cond != CmpInst::BAD_ICMP_PREDICATE)
         setCmpLibcallCC(LC.Op, LC.Cond);
     }
 

From 1ffd9f553ccba27c0def5f38e7928af8f3976bac Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:46:17 +0900
Subject: [PATCH 663/851] RuntimeLibcalls: Cleanup sincos predicate functions
 (#143081)

The darwinHasSinCos wasn't actually used for sincos, only the stret
variant. Rename this to reflect that, and introduce a new one for
enabling sincos.
---
 llvm/include/llvm/IR/RuntimeLibcalls.h | 8 +++++++-
 llvm/lib/IR/RuntimeLibcalls.cpp        | 5 ++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 45826fcd19f38..d8f467e30fa6a 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -116,7 +116,7 @@ struct RuntimeLibcallsInfo {
   // opcode.
   CmpInst::Predicate SoftFloatCompareLibcallPredicates[RTLIB::UNKNOWN_LIBCALL];
 
-  static bool darwinHasSinCos(const Triple &TT) {
+  static bool darwinHasSinCosStret(const Triple &TT) {
     assert(TT.isOSDarwin() && "should be called with darwin triple");
     // Don't bother with 32 bit x86.
     if (TT.getArch() == Triple::x86)
@@ -131,6 +131,12 @@ struct RuntimeLibcallsInfo {
     return true;
   }
 
+  /// Return true if the target has sincosf/sincos/sincosl functions
+  static bool hasSinCos(const Triple &TT) {
+    return TT.isGNUEnvironment() || TT.isOSFuchsia() ||
+           (TT.isAndroid() && !TT.isAndroidVersionLT(9));
+  }
+
   void initSoftFloatCmpLibcallPredicates();
 
   /// Set default libcall names. If a target wants to opt-out of a libcall it
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 8506a0c03d33c..882f0db193b5a 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -412,7 +412,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
       break;
     }
 
-    if (darwinHasSinCos(TT)) {
+    if (darwinHasSinCosStret(TT)) {
       setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret");
       setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret");
       if (TT.isWatchABI()) {
@@ -456,8 +456,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     setLibcallName(RTLIB::EXP10_F64, "__exp10");
   }
 
-  if (TT.isGNUEnvironment() || TT.isOSFuchsia() ||
-      (TT.isAndroid() && !TT.isAndroidVersionLT(9))) {
+  if (hasSinCos(TT)) {
     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
     setLibcallName(RTLIB::SINCOS_F64, "sincos");
     setLibcallName(RTLIB::SINCOS_F80, "sincosl");

From 9bd234a4330c6882f23ebf1f7861c5ec97e74d95 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:49:55 +0900
Subject: [PATCH 664/851] AArch64: Move outline atomic libcalls configuration
 (#144374)

This de-conditionalizes the setting of the libcall names
on outlineAtomics() && !hasLSE(). The existence of the
libcall is a module level property, which cannot depend on the
subtarget so this is fine. It's better if the initial list of
calls has more entries than will be used than to have missing
ones. There aren't any alternative names set, so this is also
fine.

Currently RuntimeLibcallsInfo conflates the existence of the calls
with the lowering usage decision, so this suboptimally will report
the libcall name on subtargets that should not use the calls. This
doesn't matter in this case though, as the atomic lowering actions
are already separately controlled and aren't based on decisions on
libcall availability. We could be paranoid and clear the names in
TargetLowering.

Also fixes not catching all aarch64 triples in the RuntimeLibcallsInfo
construction; the previous check missed aarch64_be.
---
 llvm/lib/IR/RuntimeLibcalls.cpp               | 24 ++++++++++++++++++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 21 ----------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 882f0db193b5a..cb64426a111de 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -18,6 +18,28 @@ static cl::opt<bool>
 
 static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
                                    const Triple &TT) {
+#define LCALLNAMES(A, B, N)                                                    \
+  Info.setLibcallName(A##N##_RELAX, #B #N "_relax");                           \
+  Info.setLibcallName(A##N##_ACQ, #B #N "_acq");                               \
+  Info.setLibcallName(A##N##_REL, #B #N "_rel");                               \
+  Info.setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
+#define LCALLNAME4(A, B)                                                       \
+  LCALLNAMES(A, B, 1)                                                          \
+  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
+#define LCALLNAME5(A, B)                                                       \
+  LCALLNAMES(A, B, 1)                                                          \
+  LCALLNAMES(A, B, 2)                                                          \
+  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
+  LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
+  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
+#undef LCALLNAMES
+#undef LCALLNAME4
+#undef LCALLNAME5
+
   if (TT.isWindowsArm64EC()) {
     // FIXME: are there calls we need to exclude from this?
 #define HANDLE_LIBCALL(code, name)                                             \
@@ -520,7 +542,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     }
   }
 
-  if (TT.getArch() == Triple::ArchType::aarch64)
+  if (TT.isAArch64())
     setAArch64LibcallNames(*this, TT);
   else if (TT.isARM() || TT.isThumb())
     setARMLibcallNames(*this, TT);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c86aed7b38c8c..c7ffc39b5b162 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -959,27 +959,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
     setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
-#define LCALLNAMES(A, B, N)                                                    \
-  setLibcallName(A##N##_RELAX, #B #N "_relax");                                \
-  setLibcallName(A##N##_ACQ, #B #N "_acq");                                    \
-  setLibcallName(A##N##_REL, #B #N "_rel");                                    \
-  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
-#define LCALLNAME4(A, B)                                                       \
-  LCALLNAMES(A, B, 1)                                                          \
-  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
-#define LCALLNAME5(A, B)                                                       \
-  LCALLNAMES(A, B, 1)                                                          \
-  LCALLNAMES(A, B, 2)                                                          \
-  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
-    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
-    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
-#undef LCALLNAMES
-#undef LCALLNAME4
-#undef LCALLNAME5
   }
 
   if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {

From 24631e5440eed3093dfb52e7a631504b71845923 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:52:50 +0900
Subject: [PATCH 665/851] AArch64: Fix outline atomic libcall names for arm64ec
 (#144378)

Add a missing # prefix to each libcall name
---
 llvm/lib/IR/RuntimeLibcalls.cpp               | 14 +++++++++++---
 llvm/test/CodeGen/AArch64/arm64ec-builtins.ll |  3 +--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index cb64426a111de..5d22b41e28aad 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -36,9 +36,6 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
   LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
   LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
   LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
-#undef LCALLNAMES
-#undef LCALLNAME4
-#undef LCALLNAME5
 
   if (TT.isWindowsArm64EC()) {
     // FIXME: are there calls we need to exclude from this?
@@ -54,7 +51,18 @@ static void setAArch64LibcallNames(RuntimeLibcallsInfo &Info,
 #include "llvm/IR/RuntimeLibcalls.def"
 #undef HANDLE_LIBCALL
 #undef LIBCALL_NO_NAME
+
+    LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, #__aarch64_cas)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, #__aarch64_swp)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, #__aarch64_ldadd)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, #__aarch64_ldset)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, #__aarch64_ldclr)
+    LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, #__aarch64_ldeor)
   }
+
+#undef LCALLNAMES
+#undef LCALLNAME4
+#undef LCALLNAME5
 }
 
 static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) {
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
index 92b95a90d89a0..cc4ec9c2eebd6 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
@@ -28,10 +28,9 @@ define i128 @f4(i128 %x, i128 %y) {
   ret i128 %r
 }
 
-; FIXME: This is wrong; should be "#__aarch64_cas1_relax"
 define i8 @f5(i8 %expected, i8 %new, ptr %ptr) "target-features"="+outline-atomics" {
 ; CHECK-LABEL: "#f5":
-; CHECK: bl __aarch64_cas1_relax
+; CHECK: bl "#__aarch64_cas1_relax"
     %pair = cmpxchg ptr %ptr, i8 %expected, i8 %new monotonic monotonic, align 1
    %r = extractvalue { i8, i1 } %pair, 0
     ret i8 %r

From 6e8cf9c63f643768a1d54a9ce2a73a570429c4bc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 09:55:41 +0900
Subject: [PATCH 666/851] AArch64: Add arm64ec libcall tests for __arm_sc_*
 functions (#144356)

---
 llvm/test/CodeGen/AArch64/arm64ec-builtins.ll | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
index cc4ec9c2eebd6..38416310b3536 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-builtins.ll
@@ -42,3 +42,33 @@ define float @f6(float %val, i32 %a) {
   %call = tail call fast float @llvm.ldexp.f32(float %val, i32 %a)
   ret float %call
 }
+
+@dst = global [512 x i8] zeroinitializer, align 1
+@src = global [512 x i8] zeroinitializer, align 1
+
+; FIXME: Wrong and probably needs a # prefix
+define void @call__arm_sc_memcpy(i64 noundef %n) #0 {
+; CHECK-LABEL: "#call__arm_sc_memcpy":
+; CHECK: bl __arm_sc_memcpy
+
+  tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+; FIXME: Wrong and probably needs a # prefix
+define void @call__arm_sc_memmove(i64 noundef %n) #0 {
+; CHECK-LABEL: "#call__arm_sc_memmove":
+; CHECK: bl __arm_sc_memmove
+  tail call void @llvm.memmove.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
+  ret void
+}
+
+; FIXME: Wrong and probably needs a # prefix
+define void @call__arm_sc_memset(i64 noundef %n) #0 {
+; CHECK-LABEL: "#call__arm_sc_memset":
+; CHECK: bl __arm_sc_memset
+  tail call void @llvm.memset.p0.i64(ptr align 1 @dst, i8 2, i64 %n, i1 false)
+  ret void
+}
+
+attributes #0 = { nounwind "aarch64_pstate_sm_enabled" "target-features"="+sme2" }

From d4e2c0b359ea90236fd1b62791a04fb845f5d9f3 Mon Sep 17 00:00:00 2001
From: Bryan Chan <bryan.chan@huawei.com>
Date: Mon, 16 Jun 2025 20:59:18 -0400
Subject: [PATCH 667/851] [Driver] Add options to control workaround for
 Cortex-A53 Erratum 843419 (#143915)

Implement the -mfix-cortex-a53-843419 and -mno-fix-cortex-a53-843419 options,
which have been introduced to GCC to allow the user to control the workaround
for the erratum. If the option is enabled (which is the default, unchanged by
this patch), Clang passes --fix-cortex-a53-843419 to the linker when it cannot
ensure that the target is not a Cortex A53, otherwise it doesn't.

See https://gcc.gnu.org/onlinedocs/gcc/AArch64-Options.html#index-mfix-cortex-a53-843419
for information on the GCC options.
---
 clang/include/clang/Driver/Options.td   | 10 ++++++++--
 clang/lib/Driver/ToolChains/Fuchsia.cpp |  4 +++-
 clang/lib/Driver/ToolChains/Gnu.cpp     |  4 +++-
 clang/test/Driver/android-link.cpp      | 12 ++++++++++++
 4 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 72d564e1ba0be..8b7708e530b14 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5150,10 +5150,16 @@ def mno_fix_cortex_a72_aes_1655431 : Flag<["-"], "mno-fix-cortex-a72-aes-1655431
   Alias<mno_fix_cortex_a57_aes_1742098>;
 def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">,
   Group<m_aarch64_Features_Group>,
-  HelpText<"Workaround Cortex-A53 erratum 835769 (AArch64 only)">;
+  HelpText<"Work around Cortex-A53 erratum 835769 (AArch64 only)">;
 def mno_fix_cortex_a53_835769 : Flag<["-"], "mno-fix-cortex-a53-835769">,
   Group<m_aarch64_Features_Group>,
-  HelpText<"Don't workaround Cortex-A53 erratum 835769 (AArch64 only)">;
+  HelpText<"Don't work around Cortex-A53 erratum 835769 (AArch64 only)">;
+def mfix_cortex_a53_843419 : Flag<["-"], "mfix-cortex-a53-843419">,
+  Group<m_aarch64_Features_Group>,
+  HelpText<"Work around Cortex-A53 erratum 843419 (AArch64 only)">;
+def mno_fix_cortex_a53_843419 : Flag<["-"], "mno-fix-cortex-a53-843419">,
+  Group<m_aarch64_Features_Group>,
+  HelpText<"Don't work around Cortex-A53 erratum 843419 (AArch64 only)">;
 def mmark_bti_property : Flag<["-"], "mmark-bti-property">,
   Group<m_aarch64_Features_Group>,
   HelpText<"Add .note.gnu.property with BTI to assembly files (AArch64 only)">;
diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp
index 1c165bbfe84f5..146dc8bbd5313 100644
--- a/clang/lib/Driver/ToolChains/Fuchsia.cpp
+++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp
@@ -91,7 +91,9 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("--execute-only");
 
     std::string CPU = getCPUName(D, Args, Triple);
-    if (CPU.empty() || CPU == "generic" || CPU == "cortex-a53")
+    if (Args.hasFlag(options::OPT_mfix_cortex_a53_843419,
+                     options::OPT_mno_fix_cortex_a53_843419, true) &&
+        (CPU.empty() || CPU == "generic" || CPU == "cortex-a53"))
       CmdArgs.push_back("--fix-cortex-a53-843419");
   }
 
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 9c68c5c6de2b2..9203bbc91b0bb 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -402,7 +402,9 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   // Most Android ARM64 targets should enable the linker fix for erratum
   // 843419. Only non-Cortex-A53 devices are allowed to skip this flag.
-  if (Arch == llvm::Triple::aarch64 && (isAndroid || isOHOSFamily)) {
+  if (Arch == llvm::Triple::aarch64 && (isAndroid || isOHOSFamily) &&
+      Args.hasFlag(options::OPT_mfix_cortex_a53_843419,
+                   options::OPT_mno_fix_cortex_a53_843419, true)) {
     std::string CPU = getCPUName(D, Args, Triple);
     if (CPU.empty() || CPU == "generic" || CPU == "cortex-a53")
       CmdArgs.push_back("--fix-cortex-a53-843419");
diff --git a/clang/test/Driver/android-link.cpp b/clang/test/Driver/android-link.cpp
index ab7dae5405587..b103263cdd3f0 100644
--- a/clang/test/Driver/android-link.cpp
+++ b/clang/test/Driver/android-link.cpp
@@ -16,6 +16,16 @@
 // RUN: FileCheck -check-prefix=CORTEX-A57 < %t %s
 
 // RUN: %clang --target=aarch64-none-linux-android \
+// RUN:   -mno-fix-cortex-a53-843419 \
+// RUN:   -### -v %s 2> %t
+// RUN: FileCheck -check-prefix=OVERRIDDEN < %t %s
+//
+// RUN: %clang -target aarch64-none-linux-android \
+// RUN:   -mno-fix-cortex-a53-843419 -mfix-cortex-a53-843419 \
+// RUN:   -### -v %s 2> %t
+// RUN: FileCheck -check-prefix=OVERRIDDEN2 < %t %s
+//
+// RUN: %clang -target aarch64-none-linux-android \
 // RUN:   -### -v %s 2> %t
 // RUN: FileCheck -check-prefix=MAX-PAGE-SIZE-16KB < %t %s
 
@@ -31,6 +41,8 @@
 // GENERIC-ARM: --fix-cortex-a53-843419
 // CORTEX-A53: --fix-cortex-a53-843419
 // CORTEX-A57-NOT: --fix-cortex-a53-843419
+// OVERRIDDEN-NOT: --fix-cortex-a53-843419
+// OVERRIDDEN2: --fix-cortex-a53-843419
 // MAX-PAGE-SIZE-4KB: "-z" "max-page-size=4096"
 // MAX-PAGE-SIZE-16KB: "-z" "max-page-size=16384"
 // NO-MAX-PAGE-SIZE-16KB-NOT: "-z" "max-page-size=16384"

From 8b1528fad99a18d2e094968f1341efb3048a23da Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 10:00:41 +0900
Subject: [PATCH 668/851] RuntimeLibcalls: Use array initializers for default
 values (#143082)

---
 llvm/include/llvm/IR/RuntimeLibcalls.h | 7 +++++--
 llvm/lib/IR/RuntimeLibcalls.cpp        | 7 +------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index d8f467e30fa6a..3e1531ebfd9d6 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -103,10 +103,13 @@ struct RuntimeLibcallsInfo {
 
 private:
   /// Stores the name each libcall.
-  const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
+  const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1] = {nullptr};
+
+  static_assert(static_cast<int>(CallingConv::C) == 0,
+                "default calling conv should be encoded as 0");
 
   /// Stores the CallingConv that should be used for each libcall.
-  CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
+  CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL] = {};
 
   /// The condition type that should be used to test the result of each of the
   /// soft floating-point comparison libcall against integer zero.
diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 5d22b41e28aad..7396626a03d41 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -284,8 +284,7 @@ void RuntimeLibcallsInfo::initSoftFloatCmpLibcallPredicates() {
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
-  std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames),
-            nullptr);
+  initSoftFloatCmpLibcallPredicates();
 
   initSoftFloatCmpLibcallPredicates();
 
@@ -295,10 +294,6 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
 #undef HANDLE_LIBCALL
 #undef LIBCALL_NO_NAME
 
-  // Initialize calling conventions to their default.
-  for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC)
-    setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C);
-
   // Use the f128 variants of math functions on x86
   if (TT.isX86() && TT.isGNUEnvironment()) {
     setLibcallName(RTLIB::REM_F128, "fmodf128");

From f626620e33ba2c76ba226ecaeb09c320b60aa4d9 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Mon, 16 Jun 2025 18:19:17 -0700
Subject: [PATCH 669/851] [LLVM][TableGen] Use `StringRef` for
 CodeGenInstruction::AsmString (#144440)

---
 llvm/utils/TableGen/Common/CodeGenInstruction.cpp | 2 +-
 llvm/utils/TableGen/Common/CodeGenInstruction.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
index 0dfcf200d7e4b..2ec3683e116e9 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
@@ -435,7 +435,7 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) {
 CodeGenInstruction::CodeGenInstruction(const Record *R)
     : TheDef(R), Operands(R), InferredFrom(nullptr) {
   Namespace = R->getValueAsString("Namespace");
-  AsmString = R->getValueAsString("AsmString").str();
+  AsmString = R->getValueAsString("AsmString");
 
   isPreISelOpcode = R->getValueAsBit("isPreISelOpcode");
   isReturn = R->getValueAsBit("isReturn");
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h
index 3a5abc55319b1..0db12b551b437 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h
@@ -226,7 +226,7 @@ class CodeGenInstruction {
 
   /// AsmString - The format string used to emit a .s file for the
   /// instruction.
-  std::string AsmString;
+  StringRef AsmString;
 
   /// Operands - This is information about the (ins) and (outs) list specified
   /// to the instruction.

From 2e3d212e40bc6fca9fbe53978a87c901eb19a01d Mon Sep 17 00:00:00 2001
From: Jinyang He <hejinyang@loongson.cn>
Date: Tue, 17 Jun 2025 10:41:08 +0800
Subject: [PATCH 670/851] [LoongArch] Allow difference across sections
 (#141722)

For SecA != SecB but SecB is current section, fallback for pcrel{64,32}
relocations. For linker relaxation being disabled and SecA == SecB,
return directly for avoid record relocations. In other cases, record
relocations which also allows across sections.
---
 .../MCTargetDesc/LoongArchAsmBackend.cpp      | 47 +++++++---
 .../MCTargetDesc/LoongArchAsmBackend.h        |  4 +
 llvm/test/MC/LoongArch/Misc/cfi-advance.s     | 12 +++
 .../test/MC/LoongArch/Relocations/fde-reloc.s |  7 +-
 llvm/test/MC/LoongArch/Relocations/sub-expr.s | 91 ++++++++++++++++---
 5 files changed, 137 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index d7569ab0ea597..b1491b75ac5bc 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -429,6 +429,26 @@ bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
   return true;
 }
 
+bool LoongArchAsmBackend::isPCRelFixupResolved(const MCSymbol *SymA,
+                                               const MCFragment &F) {
+  // If the section does not contain linker-relaxable fragments, PC-relative
+  // fixups can be resolved.
+  if (!F.getParent()->isLinkerRelaxable())
+    return true;
+
+  // Otherwise, check if the offset between the symbol and fragment is fully
+  // resolved, unaffected by linker-relaxable fragments (e.g. instructions or
+  // offset-affected MCAlignFragment). Complements the generic
+  // isSymbolRefDifferenceFullyResolvedImpl.
+  if (!PCRelTemp)
+    PCRelTemp = getContext().createTempSymbol();
+  PCRelTemp->setFragment(const_cast<MCFragment *>(&F));
+  MCValue Res;
+  MCExpr::evaluateSymbolicAdd(Asm, false, MCValue::get(SymA),
+                              MCValue::get(nullptr, PCRelTemp), Res);
+  return !Res.getSubSym();
+}
+
 bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
                                    const MCValue &Target, uint64_t &FixedValue,
                                    bool IsResolved) {
@@ -447,19 +467,24 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
     if (!force) {
       const MCSection &SecA = SA.getSection();
       const MCSection &SecB = SB.getSection();
-
-      // We need record relocation if SecA != SecB. Usually SecB is same as the
-      // section of Fixup, which will be record the relocation as PCRel. If SecB
-      // is not same as the section of Fixup, it will report error. Just return
-      // false and then this work can be finished by handleFixup.
-      if (&SecA != &SecB)
+      const MCSection &SecCur = *F.getParent();
+
+      // To handle the case of A - B which B is same section with the current,
+      // generate PCRel relocations is better than ADD/SUB relocation pair.
+      // We can resolve it as A - PC + PC - B. The A - PC will be resolved
+      // as a PCRel relocation, while PC - B will serve as the addend.
+      // If the linker relaxation is disabled, it can be done directly since
+      // PC - B is constant. Otherwise, we should evaluate whether PC - B
+      // is constant. If it can be resolved as PCRel, use Fallback which
+      // generates R_LARCH_{32,64}_PCREL relocation later.
+      if (&SecA != &SecB && &SecB == &SecCur &&
+          isPCRelFixupResolved(Target.getSubSym(), F))
         return Fallback();
 
-      // In SecA == SecB case. If the linker relaxation is enabled, we need
-      // record the ADD, SUB relocations. Otherwise the FixedValue has already
-      // been calc- ulated out in evaluateFixup, return true and avoid record
-      // relocations.
-      if (!STI.hasFeature(LoongArch::FeatureRelax))
+      // In SecA == SecB case. If the linker relaxation is disabled, the
+      // FixedValue has already been calculated out in evaluateFixup,
+      // return true and avoid record relocations.
+      if (&SecA == &SecB && !STI.hasFeature(LoongArch::FeatureRelax))
         return true;
     }
 
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
index aeedafe2b44b4..56554c5c664eb 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h
@@ -30,6 +30,10 @@ class LoongArchAsmBackend : public MCAsmBackend {
   bool Is64Bit;
   const MCTargetOptions &TargetOptions;
   DenseMap<MCSection *, const MCSymbolRefExpr *> SecToAlignSym;
+  // Temporary symbol used to check whether a PC-relative fixup is resolved.
+  MCSymbol *PCRelTemp = nullptr;
+
+  bool isPCRelFixupResolved(const MCSymbol *SymA, const MCFragment &F);
 
 public:
   LoongArchAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit,
diff --git a/llvm/test/MC/LoongArch/Misc/cfi-advance.s b/llvm/test/MC/LoongArch/Misc/cfi-advance.s
index 662c43e6bceaf..38eba7caf6106 100644
--- a/llvm/test/MC/LoongArch/Misc/cfi-advance.s
+++ b/llvm/test/MC/LoongArch/Misc/cfi-advance.s
@@ -1,6 +1,8 @@
 # RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=-relax %s -o %t.o
 # RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=RELOC %s
 # RUN: llvm-dwarfdump --debug-frame %t.o | FileCheck --check-prefix=DWARFDUMP %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s \
+# RUN:     | llvm-readobj -r - | FileCheck --check-prefix=RELAX %s
 
 # RELOC:       Relocations [
 # RELOC-NEXT:    .rela.eh_frame {
@@ -12,6 +14,16 @@
 # DWARFDUMP-NEXT:  DW_CFA_advance_loc: 8
 # DWARFDUMP-NEXT:  DW_CFA_def_cfa_offset: +8
 
+# RELAX:       Relocations [
+# RELAX:         .rela.eh_frame {
+# RELAX-NEXT:       0x1C R_LARCH_32_PCREL .L{{.*}} 0x0
+# RELAX-NEXT:       0x20 R_LARCH_ADD32 .L{{.*}} 0x0
+# RELAX-NEXT:       0x20 R_LARCH_SUB32 .L{{.*}} 0x0
+# RELAX-NEXT:       0x28 R_LARCH_ADD6 .L{{.*}} 0x0
+# RELAX-NEXT:       0x28 R_LARCH_SUB6 .L{{.*}} 0x0
+# RELAX-NEXT:    }
+# RELAX-NEXT:  ]
+
         .text
         .globl test
         .p2align 2
diff --git a/llvm/test/MC/LoongArch/Relocations/fde-reloc.s b/llvm/test/MC/LoongArch/Relocations/fde-reloc.s
index 990e07c7f00bd..ab911d1853a87 100644
--- a/llvm/test/MC/LoongArch/Relocations/fde-reloc.s
+++ b/llvm/test/MC/LoongArch/Relocations/fde-reloc.s
@@ -1,5 +1,7 @@
-# RUN: llvm-mc --filetype=obj --triple=loongarch64 < %s \
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax < %s \
 # RUN:     | llvm-readobj -r - | FileCheck %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax < %s \
+# RUN:     | llvm-readobj -r - | FileCheck %s --check-prefix=RELAX
 
 ## Ensure that the eh_frame records the symbolic difference with
 ## the R_LARCH_32_PCREL relocation.
@@ -12,3 +14,6 @@ func:
 # CHECK:   Section (4) .rela.eh_frame {
 # CHECK-NEXT:   0x1C R_LARCH_32_PCREL .text 0x0
 # CHECK-NEXT: }
+# RELAX:   Section ({{.*}}) .rela.eh_frame {
+# RELAX-NEXT:   0x1C R_LARCH_32_PCREL .L{{.*}} 0x0
+# RELAX-NEXT: }
diff --git a/llvm/test/MC/LoongArch/Relocations/sub-expr.s b/llvm/test/MC/LoongArch/Relocations/sub-expr.s
index 0179e1027af8f..8bf046acc6975 100644
--- a/llvm/test/MC/LoongArch/Relocations/sub-expr.s
+++ b/llvm/test/MC/LoongArch/Relocations/sub-expr.s
@@ -1,28 +1,95 @@
-# RUN: llvm-mc --filetype=obj --triple=loongarch64 %s -o %t
-# RUN: llvm-readobj -r %t | FileCheck %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s \
+# RUN:     | llvm-readobj -r - | FileCheck %s
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s \
+# RUN:     | llvm-readobj -r - | FileCheck %s --check-prefix=RELAX
 
 ## Check that subtraction expressions emit R_LARCH_32_PCREL and R_LARCH_64_PCREL relocations.
 
 ## TODO: 1- or 2-byte data relocations are not supported for now.
 
 # CHECK:      Relocations [
-# CHECK-NEXT:   Section ({{.*}}) .rela.data {
-# CHECK-NEXT:     0x0 R_LARCH_64_PCREL sx 0x0
-# CHECK-NEXT:     0x8 R_LARCH_64_PCREL sy 0x0
-# CHECK-NEXT:     0x10 R_LARCH_32_PCREL sx 0x0
-# CHECK-NEXT:     0x14 R_LARCH_32_PCREL sy 0x0
-# CHECK-NEXT:   }
+# CHECK-NEXT:     Section ({{.*}}) .rela.sx {
+# CHECK-NEXT:       0x4 R_LARCH_PCALA_HI20 z 0x0
+# CHECK-NEXT:       0x8 R_LARCH_PCALA_LO12 z 0x0
+# CHECK-NEXT:       0xC R_LARCH_32_PCREL .sy 0xC
+# CHECK-NEXT:     }
+# CHECK-NEXT:     Section ({{.*}}) .rela.data {
+# CHECK-NEXT:       0x0 R_LARCH_64_PCREL .sx 0x4
+# CHECK-NEXT:       0x8 R_LARCH_64_PCREL .sy 0x4
+# CHECK-NEXT:       0x10 R_LARCH_32_PCREL .sx 0x4
+# CHECK-NEXT:       0x14 R_LARCH_32_PCREL .sy 0x4
+# CHECK-NEXT:       0x18 R_LARCH_ADD64 .sx 0x4
+# CHECK-NEXT:       0x18 R_LARCH_SUB64 .sy 0x4
+# CHECK-NEXT:       0x20 R_LARCH_ADD64 .sy 0x4
+# CHECK-NEXT:       0x20 R_LARCH_SUB64 .sx 0x4
+# CHECK-NEXT:       0x28 R_LARCH_ADD32 .sx 0x4
+# CHECK-NEXT:       0x28 R_LARCH_SUB32 .sy 0x4
+# CHECK-NEXT:       0x2C R_LARCH_ADD32 .sy 0x4
+# CHECK-NEXT:       0x2C R_LARCH_SUB32 .sx 0x4
+# CHECK-NEXT:       0x30 R_LARCH_ADD64 .data 0x30
+# CHECK-NEXT:       0x30 R_LARCH_SUB64 .sx 0x4
+# CHECK-NEXT:       0x38 R_LARCH_ADD32 .data 0x38
+# CHECK-NEXT:       0x38 R_LARCH_SUB32 .sy 0x4
+# CHECK-NEXT:     }
+# CHECK-NEXT:     Section ({{.*}}) .rela.sy {
+# CHECK-NEXT:       0x10 R_LARCH_32_PCREL .sx 0x10
+# CHECK-NEXT:     }
+# CHECK-NEXT:   ]
 
-.section sx,"a"
-x:
+# RELAX:      Relocations [
+# RELAX-NEXT:   Section ({{.*}}) .rela.sx {
+# RELAX-NEXT:     0x4 R_LARCH_PCALA_HI20 z 0x0
+# RELAX-NEXT:     0x4 R_LARCH_RELAX - 0x0
+# RELAX-NEXT:     0x8 R_LARCH_PCALA_LO12 z 0x0
+# RELAX-NEXT:     0x8 R_LARCH_RELAX - 0x0
+# RELAX-NEXT:     0xC R_LARCH_ADD32 y 0x0
+# RELAX-NEXT:     0xC R_LARCH_SUB32 x 0x0
+# RELAX-NEXT:   }
+# RELAX-NEXT:   Section ({{.*}}) .rela.data {
+# RELAX-NEXT:     0x0 R_LARCH_64_PCREL x 0x0
+# RELAX-NEXT:     0x8 R_LARCH_64_PCREL y 0x0
+# RELAX-NEXT:     0x10 R_LARCH_32_PCREL x 0x0
+# RELAX-NEXT:     0x14 R_LARCH_32_PCREL y 0x0
+# RELAX-NEXT:     0x18 R_LARCH_ADD64 x 0x0
+# RELAX-NEXT:     0x18 R_LARCH_SUB64 y 0x0
+# RELAX-NEXT:     0x20 R_LARCH_ADD64 y 0x0
+# RELAX-NEXT:     0x20 R_LARCH_SUB64 x 0x0
+# RELAX-NEXT:     0x28 R_LARCH_ADD32 x 0x0
+# RELAX-NEXT:     0x28 R_LARCH_SUB32 y 0x0
+# RELAX-NEXT:     0x2C R_LARCH_ADD32 y 0x0
+# RELAX-NEXT:     0x2C R_LARCH_SUB32 x 0x0
+# RELAX-NEXT:     0x30 R_LARCH_ADD64 {{.*}} 0x0
+# RELAX-NEXT:     0x30 R_LARCH_SUB64 x 0x0
+# RELAX-NEXT:     0x38 R_LARCH_ADD32 {{.*}} 0x0
+# RELAX-NEXT:     0x38 R_LARCH_SUB32 y 0x0
+# RELAX-NEXT:   }
+# RELAX-NEXT:   Section ({{.*}}) .rela.sy {
+# RELAX-NEXT:     0x4 R_LARCH_ALIGN - 0xC
+# RELAX-NEXT:     0x10 R_LARCH_ADD32 x 0x0
+# RELAX-NEXT:     0x10 R_LARCH_SUB32 y 0x0
+# RELAX-NEXT:   }
+# RELAX-NEXT: ]
+
+.section .sx,"ax"
 nop
+x:
+la.pcrel $a0, z
+.4byte y-x
 
 .data
 .8byte x-.
 .8byte y-.
 .4byte x-.
 .4byte y-.
+.8byte x-y
+.8byte y-x
+.4byte x-y
+.4byte y-x
+.8byte .-x
+.4byte .-y
 
-.section sy,"a"
-y:
+.section .sy,"ax"
 nop
+y:
+.p2align 4
+.4byte x-y

From ab7aaaca93a0670e96a454136bb9cf13bb1ae372 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Mon, 16 Jun 2025 19:50:43 -0700
Subject: [PATCH 671/851] [flang][tests] Remove stale module files to fix
 buildbots.

---
 flang/test/Semantics/modfile75.F90 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/flang/test/Semantics/modfile75.F90 b/flang/test/Semantics/modfile75.F90
index 8f7adafe7204d..a61c59bbb31b8 100644
--- a/flang/test/Semantics/modfile75.F90
+++ b/flang/test/Semantics/modfile75.F90
@@ -1,4 +1,6 @@
 !RUN: rm -rf %t && mkdir -p %t
+! The next line is a temporary clean-up for the buildbots to pass.
+!RUN: rm -f modfile75a.mod modfile75b.mod
 !RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang_fc1 -fdebug-unparse -J%t %s | FileCheck %s
 
 #if WHICH == 1

From 9093bc7eff33b002d7f16d4d62ff1af2a5a993f8 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Tue, 17 Jun 2025 10:57:36 +0800
Subject: [PATCH 672/851] [llvm-exegesis] Ignore the instructions for which
 InstrDesc.getSchedClass() == 0 (#143840)

This allows llvm-exegesis to skip instructions that lack scheduling
information, avoiding invalid benchmarking. e.g. `InstB` in RISC-V.
---
 llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test | 3 +++
 llvm/tools/llvm-exegesis/lib/Target.cpp                     | 2 ++
 2 files changed, 5 insertions(+)
 create mode 100644 llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test

diff --git a/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test b/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test
new file mode 100644
index 0000000000000..fcf3b8f5463d4
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/RISCV/unsupported-opcode.test
@@ -0,0 +1,3 @@
+# RUN: llvm-exegesis -mtriple=riscv64-unknown-linux-gnu -mcpu=generic --benchmark-phase=assemble-measured-code -mode=inverse_throughput -opcode-name=InsnB 2>&1 | FileCheck %s
+
+CHECK: Unsupported opcode: No Sched Class
diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp
index 68d19514bedb2..fc5f82f288ae4 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Target.cpp
@@ -45,6 +45,8 @@ ExegesisTarget::getIgnoredOpcodeReasonOrNull(const LLVMState &State,
     return "Unsupported opcode: isBranch/isIndirectBranch";
   if (InstrDesc.isCall() || InstrDesc.isReturn())
     return "Unsupported opcode: isCall/isReturn";
+  if (InstrDesc.getSchedClass() == 0)
+    return "Unsupported opcode: No Sched Class";
   return nullptr;
 }
 

From 602c3089f749ec3b61b93652ea9eb5947a61bcf2 Mon Sep 17 00:00:00 2001
From: Pengcheng Wang <wangpengcheng.pp@bytedance.com>
Date: Tue, 17 Jun 2025 11:17:37 +0800
Subject: [PATCH 673/851] [TargetParser] Increase MAX_SUBTARGET_FEATURES to 384
 (#144326)

There are 314 features in RISC-V backend, which is about to exceed
the maxinum 320 as there are some ongoing new extensions.

We increase the `MAX_SUBTARGET_FEATURES` to 384 so that we won't
surprise anyone.
---
 llvm/include/llvm/TargetParser/SubtargetFeature.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/TargetParser/SubtargetFeature.h b/llvm/include/llvm/TargetParser/SubtargetFeature.h
index 6f1723dec5d04..cdcfcdd0e802e 100644
--- a/llvm/include/llvm/TargetParser/SubtargetFeature.h
+++ b/llvm/include/llvm/TargetParser/SubtargetFeature.h
@@ -32,7 +32,7 @@ namespace llvm {
 class raw_ostream;
 class Triple;
 
-const unsigned MAX_SUBTARGET_WORDS = 5;
+const unsigned MAX_SUBTARGET_WORDS = 6;
 const unsigned MAX_SUBTARGET_FEATURES = MAX_SUBTARGET_WORDS * 64;
 
 /// Container class for subtarget features.

From a02afb0def589ec28f8240ff15760e5f241b833c Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 20:40:17 -0700
Subject: [PATCH 674/851] AVR: Migrate to the new relocation specifier
 representation

Define printImpl and evaluateAsRelocationImpl within AVRMCAsmInfo.
---
 .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp  | 27 +++++++++++++++----
 .../Target/AVR/MCTargetDesc/AVRMCAsmInfo.h    |  4 +++
 .../lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 11 --------
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h  |  5 +---
 4 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index 68db5227d073c..cfd7dc5822627 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -116,6 +116,19 @@ AVR::Fixups AVRMCExpr::getFixupKind() const {
   return Kind;
 }
 
+void AVRMCAsmInfo::printSpecifierExpr(raw_ostream &OS,
+                                      const MCSpecifierExpr &Expr) const {
+  auto &E = static_cast<const AVRMCExpr &>(Expr);
+  assert(E.getSpecifier() != AVR::S_AVR_NONE);
+  OS << E.getName() << '(';
+  if (E.isNegated())
+    OS << '-' << '(';
+  printExpr(OS, *E.getSubExpr());
+  if (E.isNegated())
+    OS << ')';
+  OS << ')';
+}
+
 int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
   if (Negated)
     Value *= -1;
@@ -164,15 +177,19 @@ int64_t AVRMCExpr::evaluateAsInt64(int64_t Value) const {
   return static_cast<uint64_t>(Value) & 0xff;
 }
 
-bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
-                                          const MCAssembler *Asm) const {
+// bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
+//                                           const MCAssembler *Asm) const {
+bool AVRMCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+                                             MCValue &Result,
+                                             const MCAssembler *Asm) const {
+  auto &E = static_cast<const AVRMCExpr &>(Expr);
   MCValue Value;
-  bool isRelocatable = getSubExpr()->evaluateAsRelocatable(Value, Asm);
+  bool isRelocatable = E.getSubExpr()->evaluateAsRelocatable(Value, Asm);
   if (!isRelocatable)
     return false;
 
   if (Value.isAbsolute()) {
-    Result = MCValue::get(evaluateAsInt64(Value.getConstant()));
+    Result = MCValue::get(E.evaluateAsInt64(Value.getConstant()));
   } else {
     if (!Asm || !Asm->hasLayout())
       return false;
@@ -181,7 +198,7 @@ bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result,
     if (Value.getSpecifier() != MCSymbolRefExpr::VK_None)
       return false;
     assert(!Value.getSubSym());
-    if (specifier == AVR::S_PM)
+    if (E.getSpecifier() == AVR::S_PM)
       Spec = AVR::S_PM;
 
     // TODO: don't attach specifier to MCSymbolRefExpr.
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
index 649e247adab0f..fab271304e27d 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h
@@ -25,6 +25,10 @@ class Triple;
 class AVRMCAsmInfo : public MCAsmInfo {
 public:
   explicit AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options);
+  void printSpecifierExpr(raw_ostream &OS,
+                          const MCSpecifierExpr &Expr) const override;
+  bool evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr, MCValue &Res,
+                                 const MCAssembler *Asm) const override;
 };
 
 namespace AVR {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 3067e854d8dc8..5963976d0dc7f 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -21,15 +21,4 @@ const AVRMCExpr *AVRMCExpr::create(Specifier Kind, const MCExpr *Expr,
   return new (Ctx) AVRMCExpr(Kind, Expr, Negated);
 }
 
-void AVRMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-  assert(specifier != AVR::S_AVR_NONE);
-  OS << getName() << '(';
-  if (isNegated())
-    OS << '-' << '(';
-  MAI->printExpr(OS, *getSubExpr());
-  if (isNegated())
-    OS << ')';
-  OS << ')';
-}
-
 } // namespace llvm
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
index d72d36f108580..5592e24be5378 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.h
@@ -18,6 +18,7 @@ namespace llvm {
 /// A expression in AVR machine code.
 class AVRMCExpr : public MCSpecifierExpr {
 public:
+  friend class AVRMCAsmInfo;
   using Specifier = Spec;
   /// Specifies the type of an expression.
 
@@ -36,10 +37,6 @@ class AVRMCExpr : public MCSpecifierExpr {
   bool isNegated() const { return Negated; }
   void setNegated(bool negated = true) { Negated = negated; }
 
-  void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
-  bool evaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAssembler *Asm) const override;
-
 public:
   static Specifier parseSpecifier(StringRef Name);
 

From 199428e0472c80d9b742d0a3e492ab902005fb6a Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya <jgorbe@google.com>
Date: Mon, 16 Jun 2025 20:41:40 -0700
Subject: [PATCH 675/851] [bazel][lld] Remove unneeded dependencies. (#144455)

As far as I can tell these are not used in any includes in their
respective targets, and building all of LLD with
```
bazel build --config=generic_clang @llvm-project//lld/...
```
still works.
---
 utils/bazel/llvm-project-overlay/lld/BUILD.bazel | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
index 2c9f3e56e3113..450157758d75b 100644
--- a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
@@ -49,7 +49,6 @@ cc_library(
         "//llvm:CodeGen",
         "//llvm:Core",
         "//llvm:DebugInfoDWARF",
-        "//llvm:IRPrinter",
         "//llvm:Option",
         "//llvm:ProfileData",
         "//llvm:Support",
@@ -153,7 +152,6 @@ cc_library(
         "//llvm:Option",
         "//llvm:Support",
         "//llvm:Symbolize",
-        "//llvm:Target",
         "//llvm:TargetParser",
         "//llvm:TransformUtils",
         "//llvm:WindowsDriver",
@@ -210,15 +208,11 @@ cc_library(
         "//llvm:BitReader",
         "//llvm:BitWriter",
         "//llvm:CGData",
-        "//llvm:Core",
         "//llvm:DebugInfoDWARF",
         "//llvm:Demangle",
         "//llvm:LTO",
-        "//llvm:MC",
-        "//llvm:ObjCARC",
         "//llvm:Object",
         "//llvm:Option",
-        "//llvm:ProfileData",
         "//llvm:Support",
         "//llvm:TargetParser",
         "//llvm:TextAPI",

From 30350afd023c4e9583d5a8bbfd56af7c354923fa Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Mon, 16 Jun 2025 20:58:10 -0700
Subject: [PATCH 676/851] MCSpecifierExpr: Remove unused virtual functions

... now that all targets using MCSpecifierExpr have migrated to
XXXMCAsmInfo::printExpr/evaluateAsRelocatableImpl.
---
 llvm/include/llvm/MC/MCAsmInfo.h |  5 ++++-
 llvm/include/llvm/MC/MCExpr.h    |  7 -------
 llvm/lib/MC/MCAsmInfo.cpp        | 18 +++++++-----------
 llvm/lib/MC/MCExpr.cpp           |  9 ---------
 4 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h
index a7bf1b965bf2d..93ce3cc444213 100644
--- a/llvm/include/llvm/MC/MCAsmInfo.h
+++ b/llvm/include/llvm/MC/MCAsmInfo.h
@@ -715,7 +715,10 @@ class LLVM_ABI MCAsmInfo {
   std::optional<uint32_t> getSpecifierForName(StringRef Name) const;
 
   void printExpr(raw_ostream &, const MCExpr &) const;
-  virtual void printSpecifierExpr(raw_ostream &, const MCSpecifierExpr &) const;
+  virtual void printSpecifierExpr(raw_ostream &,
+                                  const MCSpecifierExpr &) const {
+    llvm_unreachable("Need to implement hook if target uses MCSpecifierExpr");
+  }
   virtual bool evaluateAsRelocatableImpl(const MCSpecifierExpr &, MCValue &Res,
                                          const MCAssembler *Asm) const;
 };
diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index cd57fafc50b56..4ec780d8ff94f 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -512,7 +512,6 @@ class LLVM_ABI MCSpecifierExpr : public MCExpr {
 
   explicit MCSpecifierExpr(const MCExpr *Expr, Spec S, SMLoc Loc = SMLoc())
       : MCExpr(Specifier, Loc), Expr(Expr), specifier(S) {}
-  virtual ~MCSpecifierExpr() = default;
 
 public:
   LLVM_ABI static const MCSpecifierExpr *
@@ -523,12 +522,6 @@ class LLVM_ABI MCSpecifierExpr : public MCExpr {
   Spec getSpecifier() const { return specifier; }
   const MCExpr *getSubExpr() const { return Expr; }
 
-  virtual void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
-    llvm_unreachable("Replace MCExpr::print calls with MCAsmInfo::printExpr");
-  }
-  virtual bool evaluateAsRelocatableImpl(MCValue &Res,
-                                         const MCAssembler *Asm) const;
-
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Specifier;
   }
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index e8eaf4619df51..ba672d2fc2ec0 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 
@@ -157,17 +158,12 @@ void MCAsmInfo::printExpr(raw_ostream &OS, const MCExpr &Expr) const {
     Expr.print(OS, this);
 }
 
-void MCAsmInfo::printSpecifierExpr(raw_ostream &OS,
-                                   const MCSpecifierExpr &Expr) const {
-  // TODO: Switch to unreachable after all targets that use MCSpecifierExpr
-  // migrate to MCAsmInfo::printSpecifierExpr.
-  Expr.printImpl(OS, this);
-}
-
-bool MCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &Expr,
+bool MCAsmInfo::evaluateAsRelocatableImpl(const MCSpecifierExpr &E,
                                           MCValue &Res,
                                           const MCAssembler *Asm) const {
-  // TODO: Remove after all targets that use MCSpecifierExpr migrate to
-  // MCAsmInfo::evaluateAsRelocatableImpl.
-  return Expr.evaluateAsRelocatableImpl(Res, Asm);
+  if (!E.getSubExpr()->evaluateAsRelocatable(Res, Asm))
+    return false;
+
+  Res.setSpecifier(E.getSpecifier());
+  return !Res.getSubSym();
 }
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 89191294f3ed3..8919a2627cf6a 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -754,12 +754,3 @@ const MCSpecifierExpr *MCSpecifierExpr::create(const MCSymbol *Sym, Spec S,
                                                MCContext &Ctx, SMLoc Loc) {
   return new (Ctx) MCSpecifierExpr(MCSymbolRefExpr::create(Sym, Ctx), S, Loc);
 }
-
-bool MCSpecifierExpr::evaluateAsRelocatableImpl(MCValue &Res,
-                                                const MCAssembler *Asm) const {
-  if (!getSubExpr()->evaluateAsRelocatable(Res, Asm))
-    return false;
-
-  Res.setSpecifier(specifier);
-  return !Res.getSubSym();
-}

From 7caeec599998bd8aa01d498574e148e4e9c982db Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Tue, 17 Jun 2025 06:08:15 +0200
Subject: [PATCH 677/851] [NFC][flang][OpenMP] Unify `genSectionsOp`'s
 prototype to match other `genXXXOp` functions (#144013)

Unifies the prototype of `genSectionsOp` to match other ops generators.
Doing so, we are able to call `genSectionsOp` directtly from
`genOMPDispatch` instead of the special handling needed now to pass the
section blocks. This is useful because now we can handle symbol mapping
scopes easier for nested OpenMP directives. See

https://github.com/llvm/llvm-project/pull/143706#issuecomment-2965344723
and the following discussion for more info.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp | 35 +++++++++++--------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 82673f0948a5b..060eba1b906e3 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -201,6 +201,8 @@ class HostEvalInfo {
 /// structures, but it will probably still require some further work to support
 /// reverse offloading.
 static llvm::SmallVector<HostEvalInfo, 0> hostEvalInfo;
+static llvm::SmallVector<const parser::OpenMPSectionsConstruct *, 0>
+    sectionsStack;
 
 /// Bind symbols to their corresponding entry block arguments.
 ///
@@ -2220,8 +2222,12 @@ static mlir::omp::SectionsOp
 genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
               semantics::SemanticsContext &semaCtx,
               lower::pft::Evaluation &eval, mlir::Location loc,
-              const ConstructQueue &queue, ConstructQueue::const_iterator item,
-              const parser::OmpSectionBlocks &sectionBlocks) {
+              const ConstructQueue &queue,
+              ConstructQueue::const_iterator item) {
+  assert(!sectionsStack.empty());
+  const auto &sectionBlocks =
+      std::get<parser::OmpSectionBlocks>(sectionsStack.back()->t);
+  sectionsStack.pop_back();
   mlir::omp::SectionsOperands clauseOps;
   llvm::SmallVector<const semantics::Symbol *> reductionSyms;
   genSectionsClauses(converter, semaCtx, item->clauses, loc, clauseOps,
@@ -3458,10 +3464,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
     // Lowered in the enclosing genSectionsOp.
     break;
   case llvm::omp::Directive::OMPD_sections:
-    // Called directly from genOMP([...], OpenMPSectionsConstruct) because it
-    // has a different prototype.
-    // This code path is still taken when iterating through the construct queue
-    // in genBodyOfOp
+    genSectionsOp(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_simd:
     newOp =
@@ -4137,8 +4140,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
       std::get<parser::OmpClauseList>(beginSectionsDirective.t), semaCtx);
   const auto &endSectionsDirective =
       std::get<parser::OmpEndSectionsDirective>(sectionsConstruct.t);
-  const auto &sectionBlocks =
-      std::get<parser::OmpSectionBlocks>(sectionsConstruct.t);
   clauses.append(makeClauses(
       std::get<parser::OmpClauseList>(endSectionsDirective.t), semaCtx));
   mlir::Location currentLocation = converter.getCurrentLocation();
@@ -4150,22 +4151,10 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
   ConstructQueue queue{
       buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx,
                           eval, source, directive, clauses)};
-  ConstructQueue::iterator next = queue.begin();
-  // Generate constructs that come first e.g. Parallel
-  while (next != queue.end() &&
-         next->id != llvm::omp::Directive::OMPD_sections) {
-    genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
-                   next);
-    next = std::next(next);
-  }
 
-  // call genSectionsOp directly (not via genOMPDispatch) so that we can add the
-  // sectionBlocks argument
-  assert(next != queue.end());
-  assert(next->id == llvm::omp::Directive::OMPD_sections);
-  genSectionsOp(converter, symTable, semaCtx, eval, currentLocation, queue,
-                next, sectionBlocks);
-  assert(std::next(next) == queue.end());
+  sectionsStack.push_back(&sectionsConstruct);
+  genOMPDispatch(converter, symTable, semaCtx, eval, currentLocation, queue,
+                 queue.begin());
 }
 
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,

From b5dbf8210a57b986b9802304745f4c5c108cf37b Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Tue, 17 Jun 2025 06:08:38 +0200
Subject: [PATCH 678/851] [flang] Enable delayed localization by default for
 `do concurrent` (#144074)

Reintroduces changes from
https://github.com/llvm/llvm-project/issues/143897. A fix for the
reported problem in https://github.com/llvm/llvm-project/issues/143897
is hopefully resolved in
https://github.com/llvm/llvm-project/pull/144027.

This PR aims to make it easier and more self-contained to revert the
switch/flag if we discover any problems with enabling it by default.
---
 flang/lib/Lower/Bridge.cpp                            | 6 +-----
 flang/test/Lower/do_concurrent_delayed_locality.f90   | 2 +-
 flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +-
 flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +-
 flang/test/Lower/loops.f90                            | 2 +-
 flang/test/Lower/loops3.f90                           | 2 +-
 6 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 64b16b3abe991..5ff8101dba097 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2033,11 +2033,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     fir::LocalitySpecifierOperands privateClauseOps;
     auto doConcurrentLoopOp =
         mlir::dyn_cast_if_present<fir::DoConcurrentLoopOp>(info.loopOp);
-    // TODO Promote to using `enableDelayedPrivatization` (which is enabled by
-    // default unlike the staging flag) once the implementation of this is more
-    // complete.
-    bool useDelayedPriv =
-        enableDelayedPrivatizationStaging && doConcurrentLoopOp;
+    bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp;
     llvm::SetVector<const Fortran::semantics::Symbol *> allPrivatizedSymbols;
     llvm::SmallSet<const Fortran::semantics::Symbol *, 16> mightHaveReadHostSym;
 
diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90
index 6cae0eb46db13..039b17808d19e 100644
--- a/flang/test/Lower/do_concurrent_delayed_locality.f90
+++ b/flang/test/Lower/do_concurrent_delayed_locality.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
 
 subroutine do_concurrent_with_locality_specs
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
index a3d0c34ed8569..67f080eb2c1c5 100644
--- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90
+++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
 
 subroutine local_assoc
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index d643213854744..798cbb335c8c0 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of DO CONCURRENT LOCAL() entities.
-! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
 
 subroutine test_ptr(p)
   interface
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 60df27a591dc3..64f14ff972272 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 84db1972cca16..34d7bcfb7d7ad 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -1,5 +1,5 @@
 ! Test do concurrent reduction
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test

From 2dc58e02cbce83784a38b4cc33f83529ad1a7c7e Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Tue, 17 Jun 2025 07:01:53 +0200
Subject: [PATCH 679/851] [flang][OpenMP] Add symbol table scopes for `teams`
 and `parallel` (#144015)

Adds symbol map scopes for standalone `teams` and `parallel` constructs.
This is required to properly bind the privatized symbols in both
constructs so that nested constructs can find them.

Resolves https://github.com/llvm/llvm-project/issues/116428.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  9 ++------
 .../OpenMP/Todo/target-parallel-private.f90   | 13 ------------
 .../OpenMP/Todo/target-teams-private.f90      | 13 ------------
 .../Lower/OpenMP/target-parallel-private.f90  | 21 +++++++++++++++++++
 .../Lower/OpenMP/target-teams-private.f90     | 20 ++++++++++++++++++
 5 files changed, 43 insertions(+), 33 deletions(-)
 delete mode 100644 flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
 delete mode 100644 flang/test/Lower/OpenMP/Todo/target-teams-private.f90
 create mode 100644 flang/test/Lower/OpenMP/target-parallel-private.f90
 create mode 100644 flang/test/Lower/OpenMP/target-teams-private.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 060eba1b906e3..3e865a1ee7185 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2674,6 +2674,7 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
            semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
            mlir::Location loc, const ConstructQueue &queue,
            ConstructQueue::const_iterator item) {
+  lower::SymMapScope scope(symTable);
   mlir::omp::TeamsOperands clauseOps;
   llvm::SmallVector<const semantics::Symbol *> reductionSyms;
   genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps,
@@ -2981,6 +2982,7 @@ static mlir::omp::ParallelOp genStandaloneParallel(
     lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx,
     lower::pft::Evaluation &eval, mlir::Location loc,
     const ConstructQueue &queue, ConstructQueue::const_iterator item) {
+  lower::SymMapScope scope(symTable);
   mlir::omp::ParallelOperands parallelClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
   genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
@@ -4027,13 +4029,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
           parser::ToUpperCaseLetters(llvm::omp::getOpenMPClauseName(clause.id));
       TODO(clauseLocation, name + " clause is not implemented yet");
     }
-
-    if (std::holds_alternative<clause::Private>(clause.u) &&
-        origDirective == llvm::omp::Directive::OMPD_target_teams)
-      TODO(clauseLocation, "TARGET TEAMS PRIVATE is not implemented yet");
-    if (std::holds_alternative<clause::Private>(clause.u) &&
-        origDirective == llvm::omp::Directive::OMPD_target_parallel)
-      TODO(clauseLocation, "TARGET PARALLEL PRIVATE is not implemented yet");
   }
 
   llvm::omp::Directive directive =
diff --git a/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90 b/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
deleted file mode 100644
index e820143021f9a..0000000000000
--- a/flang/test/Lower/OpenMP/Todo/target-parallel-private.f90
+++ /dev/null
@@ -1,13 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-
-!===============================================================================
-! `private` clause on `target parallel`
-!===============================================================================
-
-! CHECK: not yet implemented: TARGET PARALLEL PRIVATE is not implemented yet
-subroutine target_teams_private()
-integer, dimension(3) :: i
-!$omp target parallel private(i)
-!$omp end target parallel
-end subroutine
diff --git a/flang/test/Lower/OpenMP/Todo/target-teams-private.f90 b/flang/test/Lower/OpenMP/Todo/target-teams-private.f90
deleted file mode 100644
index c8d998a5cbf94..0000000000000
--- a/flang/test/Lower/OpenMP/Todo/target-teams-private.f90
+++ /dev/null
@@ -1,13 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-
-!===============================================================================
-! `private` clause on `target teams`
-!===============================================================================
-
-! CHECK: not yet implemented: TARGET TEAMS PRIVATE is not implemented yet
-subroutine target_teams_private()
-integer, dimension(3) :: i
-!$omp target teams private(i)
-!$omp end target teams
-end subroutine
diff --git a/flang/test/Lower/OpenMP/target-parallel-private.f90 b/flang/test/Lower/OpenMP/target-parallel-private.f90
new file mode 100644
index 0000000000000..cc04b77e4a527
--- /dev/null
+++ b/flang/test/Lower/OpenMP/target-parallel-private.f90
@@ -0,0 +1,21 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+!===============================================================================
+! `private` clause on `target parallel`
+!===============================================================================
+
+subroutine target_parallel_private()
+integer, dimension(3) :: i
+!$omp target parallel private(i)
+!$omp end target parallel
+end subroutine
+
+! CHECK: omp.private {type = private} @[[PRIVATIZER:.*]] : {{.*}}
+
+! CHECK: omp.target {{.*}} {
+! CHECK:   omp.parallel private(@[[PRIVATIZER]] %{{.*}} -> %{{.*}} : {{.*}}) {
+! CHECK:   }
+! CHECK: }
diff --git a/flang/test/Lower/OpenMP/target-teams-private.f90 b/flang/test/Lower/OpenMP/target-teams-private.f90
new file mode 100644
index 0000000000000..65d97649b5cf3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/target-teams-private.f90
@@ -0,0 +1,20 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization \
+! RUN:   -o - %s 2>&1 | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp --enable-delayed-privatization -o - %s 2>&1 |\
+! RUN:   FileCheck %s
+
+!===============================================================================
+! `private` clause on `target teams`
+!===============================================================================
+
+subroutine target_teams_private()
+integer, dimension(3) :: i
+!$omp target teams private(i)
+!$omp end target teams
+end subroutine
+
+! CHECK: omp.target {{.*}} {
+! CHECK:   omp.teams {
+! CHECK:     %{{.*}} = fir.alloca !fir.array<3xi32> {bindc_name = "i", {{.*}}}
+! CHECK:   }
+! CHECK: }

From 84d879d6999b61cea3f9f200df57653f5a51ee41 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Mon, 16 Jun 2025 22:11:16 -0700
Subject: [PATCH 680/851] [RISCV] Rename Relocation QC_E_JUMP_PLT to
 QC_E_CALL_PLT (#143998)

The semantics and definition of this relocation are unchanged. The new
name reflects that instructions with the relocation should be assumed to
clobber non-callee-saved registers, as with the R_RISCV_CALL_PLT
relocation.

The name was changed in v0.2 of the ABI extensions:
https://github.com/quic/riscv-elf-psabi-quic-extensions/releases/tag/v0.2
---
 .../llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def       | 2 +-
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp      | 6 +++---
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp | 4 ++--
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h        | 2 +-
 llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp   | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
index 7ae3d3f205772..b02462ca89fdd 100644
--- a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
+++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV_nonstandard.def
@@ -25,4 +25,4 @@
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_ABS20_U,    192)
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_BRANCH,   193)
 ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_32,       194)
-ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_JUMP_PLT, 195)
+ELF_RISCV_NONSTANDARD_RELOC(QUALCOMM, R_RISCV_QC_E_CALL_PLT, 195)
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 2f37c351baf9f..9161f23c8a954 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -85,7 +85,7 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
       {"fixup_riscv_qc_e_branch", 0, 48, MCFixupKindInfo::FKF_IsPCRel},
       {"fixup_riscv_qc_e_32", 16, 32, 0},
       {"fixup_riscv_qc_abs20_u", 12, 20, 0},
-      {"fixup_riscv_qc_e_jump_plt", 0, 48, MCFixupKindInfo::FKF_IsPCRel},
+      {"fixup_riscv_qc_e_call_plt", 0, 48, MCFixupKindInfo::FKF_IsPCRel},
   };
   static_assert((std::size(Infos)) == RISCV::NumTargetFixupKinds,
                 "Not all fixup kinds added to Infos array");
@@ -552,7 +552,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     Value = (Bit19 << 31) | (Bit14_0 << 16) | (Bit18_15 << 12);
     return Value;
   }
-  case RISCV::fixup_riscv_qc_e_jump_plt: {
+  case RISCV::fixup_riscv_qc_e_call_plt: {
     if (!isInt<32>(Value))
       Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     if (Value & 0x1)
@@ -699,7 +699,7 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F,
   case RISCV::fixup_riscv_qc_e_branch:
   case RISCV::fixup_riscv_qc_abs20_u:
   case RISCV::fixup_riscv_qc_e_32:
-  case RISCV::fixup_riscv_qc_e_jump_plt:
+  case RISCV::fixup_riscv_qc_e_call_plt:
     VendorIdentifier = "QUALCOMM";
     break;
   }
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 1d81096d6b600..3c1f9450a0991 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -101,8 +101,8 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup,
       return ELF::R_RISCV_CALL_PLT;
     case RISCV::fixup_riscv_qc_e_branch:
       return ELF::R_RISCV_QC_E_BRANCH;
-    case RISCV::fixup_riscv_qc_e_jump_plt:
-      return ELF::R_RISCV_QC_E_JUMP_PLT;
+    case RISCV::fixup_riscv_qc_e_call_plt:
+      return ELF::R_RISCV_QC_E_CALL_PLT;
     }
   }
 
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
index 80fbed8d10f99..8d869a64cde47 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h
@@ -54,7 +54,7 @@ enum Fixups {
   // 20-bit fixup for symbol references in the 32-bit qc.li instruction
   fixup_riscv_qc_abs20_u,
   // 32-bit fixup for symbol references in the 48-bit qc.j/qc.jal instructions
-  fixup_riscv_qc_e_jump_plt,
+  fixup_riscv_qc_e_call_plt,
 
   // Used as a sentinel, must be the last
   fixup_riscv_invalid,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 1185e3558b002..2a90552037f91 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -645,7 +645,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
       FixupKind = RISCV::fixup_riscv_qc_e_32;
       RelaxCandidate = true;
     } else if (MIFrm == RISCVII::InstFormatQC_EJ) {
-      FixupKind = RISCV::fixup_riscv_qc_e_jump_plt;
+      FixupKind = RISCV::fixup_riscv_qc_e_call_plt;
       RelaxCandidate = true;
     }
   }

From c0ac95181eededc85027d63fe9f97bc742b7a552 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Mon, 16 Jun 2025 22:12:12 -0700
Subject: [PATCH 681/851] [RISCV] Update Xqci to v0.13.0 (#144398)

---
 clang/include/clang/Basic/AttrDocs.td         |  2 +-
 .../Driver/print-supported-extensions-riscv.c |  6 ++--
 llvm/docs/RISCVUsage.rst                      | 34 +++++++++----------
 llvm/lib/Target/RISCV/RISCVFeatures.td        |  6 ++--
 llvm/test/CodeGen/RISCV/attributes.ll         |  6 ++--
 .../TargetParser/RISCVISAInfoTest.cpp         | 10 +++---
 6 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 047f51ffa59ed..6051e1fc45111 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2934,7 +2934,7 @@ https://gcc.gnu.org/onlinedocs/gcc/RISC-V-Function-Attributes.html
 https://riscv.org/specifications/privileged-isa/
 The RISC-V Instruction Set Manual Volume II: Privileged Architecture
 Version 1.10.
-https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.7
+https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0
 https://sifive.cdn.prismic.io/sifive/d1984d2b-c9b9-4c91-8de0-d68a5e64fa0f_sifive-interrupt-cookbook-v1p2.pdf
   }];
 }
diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 95464f06378e2..33d8738d5a9bb 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -221,14 +221,14 @@
 // CHECK-NEXT:     xqcicli              0.3       'Xqcicli' (Qualcomm uC Conditional Load Immediate Extension)
 // CHECK-NEXT:     xqcicm               0.2       'Xqcicm' (Qualcomm uC Conditional Move Extension)
 // CHECK-NEXT:     xqcics               0.2       'Xqcics' (Qualcomm uC Conditional Select Extension)
-// CHECK-NEXT:     xqcicsr              0.3       'Xqcicsr' (Qualcomm uC CSR Extension)
-// CHECK-NEXT:     xqciint              0.7       'Xqciint' (Qualcomm uC Interrupts Extension)
+// CHECK-NEXT:     xqcicsr              0.4       'Xqcicsr' (Qualcomm uC CSR Extension)
+// CHECK-NEXT:     xqciint              0.10      'Xqciint' (Qualcomm uC Interrupts Extension)
 // CHECK-NEXT:     xqciio               0.1       'Xqciio' (Qualcomm uC External Input Output Extension)
 // CHECK-NEXT:     xqcilb               0.2       'Xqcilb' (Qualcomm uC Long Branch Extension)
 // CHECK-NEXT:     xqcili               0.2       'Xqcili' (Qualcomm uC Load Large Immediate Extension)
 // CHECK-NEXT:     xqcilia              0.2       'Xqcilia' (Qualcomm uC Large Immediate Arithmetic Extension)
 // CHECK-NEXT:     xqcilo               0.3       'Xqcilo' (Qualcomm uC Large Offset Load Store Extension)
-// CHECK-NEXT:     xqcilsm              0.5       'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
+// CHECK-NEXT:     xqcilsm              0.6       'Xqcilsm' (Qualcomm uC Load Store Multiple Extension)
 // CHECK-NEXT:     xqcisim              0.2       'Xqcisim' (Qualcomm uC Simulation Hint Extension)
 // CHECK-NEXT:     xqcisls              0.2       'Xqcisls' (Qualcomm uC Scaled Load Store Extension)
 // CHECK-NEXT:     xqcisync             0.3       'Xqcisync' (Qualcomm uC Sync Delay Extension)
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 64f17f59575ea..78890b605d83c 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -445,58 +445,58 @@ The current vendor extensions supported are:
   LLVM implements `version 0.1 of the 16-bit Push/Pop instructions and double-moves extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqccmp_extension-0.1.0>`__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification.
 
 ``experimental-Xqcia``
-  LLVM implements `version 0.7 of the Qualcomm uC Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.7 of the Qualcomm uC Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqciac``
-  LLVM implements `version 0.3 of the Qualcomm uC Load-Store Address Calculation extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.3 of the Qualcomm uC Load-Store Address Calculation extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcibi``
-  LLVM implements `version 0.2 of the Qualcomm uC Branch Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Branch Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcibm``
-  LLVM implements `version 0.8 of the Qualcomm uC Bit Manipulation extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.8 of the Qualcomm uC Bit Manipulation extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcicli``
-  LLVM implements `version 0.3 of the Qualcomm uC Conditional Load Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.3 of the Qualcomm uC Conditional Load Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcicm``
-  LLVM implements `version 0.2 of the Qualcomm uC Conditional Move extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Conditional Move extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcics``
-  LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Conditional Select extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcicsr``
-  LLVM implements `version 0.3 of the Qualcomm uC CSR extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.4 of the Qualcomm uC CSR extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqciint``
-  LLVM implements `version 0.7 of the Qualcomm uC Interrupts extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.10 of the Qualcomm uC Interrupts extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqciio``
-  LLVM implements `version 0.1 of the Qualcomm uC External Input Output extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.1 of the Qualcomm uC External Input Output extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilb``
-  LLVM implements `version 0.2 of the Qualcomm uC Long Branch extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Long Branch extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcili``
-  LLVM implements `version 0.2 of the Qualcomm uC Load Large Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Load Large Immediate extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilia``
-  LLVM implements `version 0.2 of the Qualcomm uC Large Immediate Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Large Immediate Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilo``
-  LLVM implements `version 0.3 of the Qualcomm uC Large Offset Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.3 of the Qualcomm uC Large Offset Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcilsm``
   LLVM implements `version 0.6 of the Qualcomm uC Load Store Multiple extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcisim``
-  LLVM implements `version 0.2 of the Qualcomm uC Simulation Hint extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Simulation Hint extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcisls``
-  LLVM implements `version 0.2 of the Qualcomm uC Scaled Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.2 of the Qualcomm uC Scaled Load Store extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``experimental-Xqcisync``
-  LLVM implements `version 0.3 of the Qualcomm uC Sync Delay extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.11.0>`__ by Qualcomm. These instructions are only available for riscv32.
+  LLVM implements `version 0.3 of the Qualcomm uC Sync Delay extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
 
 ``Xmipscmov``
   LLVM implements conditional move for the `p8700 processor <https://mips.com/products/hardware/p8700/>`__ by MIPS.
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 940caa4f40444..0f26c6f1e0a5e 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1488,14 +1488,14 @@ def HasVendorXqcics
                          "'Xqcics' (Qualcomm uC Conditional Select Extension)">;
 
 def FeatureVendorXqcicsr
-    : RISCVExperimentalExtension<0, 3, "Qualcomm uC CSR Extension">;
+    : RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">;
 def HasVendorXqcicsr
     : Predicate<"Subtarget->hasVendorXqcicsr()">,
       AssemblerPredicate<(all_of FeatureVendorXqcicsr),
                          "'Xqcicsr' (Qualcomm uC CSR Extension)">;
 
 def FeatureVendorXqciint
-    : RISCVExperimentalExtension<0, 7, "Qualcomm uC Interrupts Extension",
+    : RISCVExperimentalExtension<0, 10, "Qualcomm uC Interrupts Extension",
                                  [FeatureStdExtZca]>;
 def HasVendorXqciint
     : Predicate<"Subtarget->hasVendorXqciint()">,
@@ -1542,7 +1542,7 @@ def HasVendorXqcilo
                          "'Xqcilo' (Qualcomm uC Large Offset Load Store Extension)">;
 
 def FeatureVendorXqcilsm
-    : RISCVExperimentalExtension<0, 5,
+    : RISCVExperimentalExtension<0, 6,
                                  "Qualcomm uC Load Store Multiple Extension">;
 def HasVendorXqcilsm
     : Predicate<"Subtarget->hasVendorXqcilsm()">,
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index ba8969b5a5382..c9cfb2fb20b11 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -445,14 +445,14 @@
 ; RV32XQCICLI: .attribute 5, "rv32i2p1_xqcicli0p3"
 ; RV32XQCICM: .attribute 5, "rv32i2p1_zca1p0_xqcicm0p2"
 ; RV32XQCICS: .attribute 5, "rv32i2p1_xqcics0p2"
-; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p3"
-; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p7"
+; RV32XQCICSR: .attribute 5, "rv32i2p1_xqcicsr0p4"
+; RV32XQCIINT: .attribute 5, "rv32i2p1_zca1p0_xqciint0p10"
 ; RV32XQCIIO: .attribute 5, "rv32i2p1_xqciio0p1"
 ; RV32XQCILB: .attribute 5, "rv32i2p1_zca1p0_xqcilb0p2"
 ; RV32XQCILI: .attribute 5, "rv32i2p1_zca1p0_xqcili0p2"
 ; RV32XQCILIA: .attribute 5, "rv32i2p1_zca1p0_xqcilia0p2"
 ; RV32XQCILO: .attribute 5, "rv32i2p1_zca1p0_xqcilo0p3"
-; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p5"
+; RV32XQCILSM: .attribute 5, "rv32i2p1_xqcilsm0p6"
 ; RV32XQCISIM: attribute 5, "rv32i2p1_zca1p0_xqcisim0p2"
 ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2"
 ; RV32XQCISYNC: attribute 5, "rv32i2p1_zca1p0_xqcisync0p3"
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 29bfa30848ec9..0316e6470422e 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -684,9 +684,9 @@ TEST(ParseArchString, RejectsConflictingExtensions) {
   for (StringRef Input :
        {"rv64i_xqcia0p7", "rv64i_xqciac0p3", "rv64i_xqcibi0p2",
         "rv64i_xqcibm0p8", "rv64i_xqcicli0p3", "rv64i_xqcicm0p2",
-        "rv64i_xqcics0p2", "rv64i_xqcicsr0p3", "rv64i_xqciint0p7",
+        "rv64i_xqcics0p2", "rv64i_xqcicsr0p4", "rv64i_xqciint0p10",
         "rv64i_xqciio0p1", "rv64i_xqcilb0p2", "rv64i_xqcili0p2",
-        "rv64i_xqcilia0p2", "rv64i_xqcilo0p3", "rv64i_xqcilsm0p5",
+        "rv64i_xqcilia0p2", "rv64i_xqcilo0p3", "rv64i_xqcilsm0p6",
         "rv64i_xqcisim0p2", "rv64i_xqcisls0p2", "rv64i_xqcisync0p3"}) {
     EXPECT_THAT(
         toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
@@ -1192,14 +1192,14 @@ Experimental extensions
     xqcicli              0.3
     xqcicm               0.2
     xqcics               0.2
-    xqcicsr              0.3
-    xqciint              0.7
+    xqcicsr              0.4
+    xqciint              0.10
     xqciio               0.1
     xqcilb               0.2
     xqcili               0.2
     xqcilia              0.2
     xqcilo               0.3
-    xqcilsm              0.5
+    xqcilsm              0.6
     xqcisim              0.2
     xqcisls              0.2
     xqcisync             0.3

From 98c6c371d6dc09454d541474ef65a0e47c4baae6 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Mon, 16 Jun 2025 22:13:45 -0700
Subject: [PATCH 682/851] [RISCV] Xqccmp v0.3 (#137854)

All the changes for v0.2 and v0.3 are either already implemented, or
irrelevant to the compiler implementation.
---
 clang/test/Driver/print-supported-extensions-riscv.c | 2 +-
 llvm/docs/RISCVUsage.rst                             | 2 +-
 llvm/lib/Target/RISCV/RISCVFeatures.td               | 3 ++-
 llvm/test/CodeGen/RISCV/attributes.ll                | 4 ++--
 llvm/unittests/TargetParser/RISCVISAInfoTest.cpp     | 6 +++---
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index 33d8738d5a9bb..e1f5a7a0105d7 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -213,7 +213,7 @@
 // CHECK-NEXT:     smctr                1.0       'Smctr' (Control Transfer Records Machine Level)
 // CHECK-NEXT:     ssctr                1.0       'Ssctr' (Control Transfer Records Supervisor Level)
 // CHECK-NEXT:     svukte               0.3       'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses)
-// CHECK-NEXT:     xqccmp               0.1       'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves)
+// CHECK-NEXT:     xqccmp               0.3       'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves)
 // CHECK-NEXT:     xqcia                0.7       'Xqcia' (Qualcomm uC Arithmetic Extension)
 // CHECK-NEXT:     xqciac               0.3       'Xqciac' (Qualcomm uC Load-Store Address Calculation Extension)
 // CHECK-NEXT:     xqcibi               0.2       'Xqcibi' (Qualcomm uC Branch Immediate Extension)
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 78890b605d83c..aadda309feab0 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -442,7 +442,7 @@ The current vendor extensions supported are:
   LLVM implements `the custom compressed opcodes present in some QingKe cores` by WCH / Nanjing Qinheng Microelectronics. The vendor refers to these opcodes by the name "XW".
 
 ``experimental-Xqccmp``
-  LLVM implements `version 0.1 of the 16-bit Push/Pop instructions and double-moves extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqccmp_extension-0.1.0>`__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification.
+  LLVM implements `version 0.3 of the 16-bit Push/Pop instructions and double-moves extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqccmp_extension-0.3.0>`__ by Qualcomm. All instructions are prefixed with `qc.` as described in the specification.
 
 ``experimental-Xqcia``
   LLVM implements `version 0.7 of the Qualcomm uC Arithmetic extension specification <https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.13.0>`__ by Qualcomm. These instructions are only available for riscv32.
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 0f26c6f1e0a5e..0b35084267324 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1426,7 +1426,8 @@ def HasVendorXwchc
 // Qualcomm Extensions
 
 def FeatureVendorXqccmp
-    : RISCVExperimentalExtension<0, 1, "Qualcomm 16-bit Push/Pop and Double Moves",
+    : RISCVExperimentalExtension<0, 3,
+                                 "Qualcomm 16-bit Push/Pop and Double Moves",
                                  [FeatureStdExtZca]>;
 def HasVendorXqccmp
     : Predicate<"Subtarget->hasVendorXqccmp()">,
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index c9cfb2fb20b11..cdbf1caff5d80 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -437,7 +437,7 @@
 ; RV32XTHEADMEMPAIR: .attribute 5, "rv32i2p1_xtheadmempair1p0"
 ; RV32XTHEADSYNC: .attribute 5, "rv32i2p1_xtheadsync1p0"
 ; RV32XWCHC: .attribute 5, "rv32i2p1_zca1p0_xwchc2p2"
-; RV32XQCCMP: .attribute 5, "rv32i2p1_zca1p0_xqccmp0p1"
+; RV32XQCCMP: .attribute 5, "rv32i2p1_zca1p0_xqccmp0p3"
 ; RV32XQCIA: .attribute 5, "rv32i2p1_xqcia0p7"
 ; RV32XQCIAC: .attribute 5, "rv32i2p1_zca1p0_xqciac0p3"
 ; RV32XQCIBI: .attribute 5, "rv32i2p1_zca1p0_xqcibi0p2"
@@ -683,7 +683,7 @@
 ; RV64SSCTR: .attribute 5, "rv64i2p1_sscsrind1p0_ssctr1p0"
 ; RV64SDEXT: .attribute 5, "rv64i2p1_sdext1p0"
 ; RV64SDTRIG: .attribute 5, "rv64i2p1_sdtrig1p0"
-; RV64XQCCMP: .attribute 5, "rv64i2p1_zca1p0_xqccmp0p1"
+; RV64XQCCMP: .attribute 5, "rv64i2p1_zca1p0_xqccmp0p3"
 
 ; RVI20U32: .attribute 5, "rv32i2p1"
 ; RVI20U64: .attribute 5, "rv64i2p1"
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 0316e6470422e..a0910a164ea08 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -695,13 +695,13 @@ TEST(ParseArchString, RejectsConflictingExtensions) {
 
   for (StringRef Input :
        {"rv32idc_xqciac0p3", "rv32i_zcd_xqciac0p3", "rv32idc_xqcicm0p2",
-        "rv32i_zcd_xqcicm0p2", "rv32idc_xqccmp0p1", "rv32i_zcd_xqccmp0p1"}) {
+        "rv32i_zcd_xqcicm0p2", "rv32idc_xqccmp0p3", "rv32i_zcd_xqccmp0p3"}) {
     EXPECT_THAT(
         toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
         ::testing::EndsWith("extension when 'd' extension is enabled"));
   }
 
-  for (StringRef Input : {"rv32i_zcmp_xqccmp0p1", "rv64i_zcmp_xqccmp0p1"}) {
+  for (StringRef Input : {"rv32i_zcmp_xqccmp0p3", "rv64i_zcmp_xqccmp0p3"}) {
     EXPECT_EQ(toString(RISCVISAInfo::parseArchString(Input, true).takeError()),
               "'zcmp' and 'xqccmp' extensions are incompatible");
   }
@@ -1184,7 +1184,7 @@ Experimental extensions
     smctr                1.0
     ssctr                1.0
     svukte               0.3
-    xqccmp               0.1
+    xqccmp               0.3
     xqcia                0.7
     xqciac               0.3
     xqcibi               0.2

From e86740e6003739a41139d94e1643a3207f8fd8f8 Mon Sep 17 00:00:00 2001
From: no92 <no92@users.noreply.github.com>
Date: Tue, 17 Jun 2025 07:51:46 +0200
Subject: [PATCH 683/851] [clang] Add managarm support (#139271)

This PR is part of a series to upstream managarm support, as laid out in
the
[RFC](https://discourse.llvm.org/t/rfc-new-proposed-managarm-support-for-llvm-and-clang-87845/85884/1).
This PR is a follow-up to #87845 and #138854.
---
 clang/lib/Basic/Targets.cpp                   |   9 +
 clang/lib/Basic/Targets/OSTargets.h           |  30 ++
 clang/lib/Driver/CMakeLists.txt               |   1 +
 clang/lib/Driver/Driver.cpp                   |   4 +
 clang/lib/Driver/ToolChains/Gnu.cpp           |   2 +
 clang/lib/Driver/ToolChains/Managarm.cpp      | 218 ++++++++++++++
 clang/lib/Driver/ToolChains/Managarm.h        |  55 ++++
 clang/lib/Lex/InitHeaderSearch.cpp            |   1 +
 .../lib/aarch64-managarm-mlibc/.keep          |   0
 .../lib/riscv64-managarm-mlibc/.keep          |   0
 .../lib/x86_64-managarm-mlibc/.keep           |   0
 .../lib64/aarch64-managarm-mlibc/.keep        |   0
 .../lib64/riscv64-managarm-mlibc/.keep        |   0
 .../lib64/x86_64-managarm-mlibc/.keep         |   0
 .../aarch64-managarm-mlibc/c++/10/.keep       |   0
 .../usr/include/c++/10/.keep                  |   0
 .../usr/include/c++/v1/.keep                  |   0
 .../riscv64-managarm-mlibc/c++/10/.keep       |   0
 .../x86_64-managarm-mlibc/c++/10/.keep        |   0
 .../usr/lib/aarch64-managarm-mlibc/.keep      |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbegin.o   |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginS.o  |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginT.o  |   0
 .../usr/lib/riscv64-managarm-mlibc/.keep      |   0
 .../usr/lib/x86_64-managarm-mlibc/.keep       |   0
 .../basic_managarm_tree/usr/lib64/.keep       |   0
 clang/test/Driver/managarm.cpp                | 267 ++++++++++++++++++
 clang/test/Preprocessor/init.c                |   5 +
 .../predefined-macros-no-warnings.c           |   3 +
 35 files changed, 595 insertions(+)
 create mode 100644 clang/lib/Driver/ToolChains/Managarm.cpp
 create mode 100644 clang/lib/Driver/ToolChains/Managarm.h
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
 create mode 100644 clang/test/Driver/managarm.cpp

diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index 9889141ad2085..af1111a863308 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -164,6 +164,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
       }
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<AArch64leTargetInfo>>(Triple,
+                                                                       Opts);
     case llvm::Triple::NetBSD:
       return std::make_unique<NetBSDTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
@@ -466,6 +469,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<RISCV64TargetInfo>>(Triple,
                                                                    Opts);
       }
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<RISCV64TargetInfo>>(Triple,
+                                                                     Opts);
     default:
       return std::make_unique<RISCV64TargetInfo>(Triple, Opts);
     }
@@ -654,6 +660,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
       return std::make_unique<PS5OSTargetInfo<X86_64TargetInfo>>(Triple, Opts);
     case llvm::Triple::Hurd:
       return std::make_unique<HurdTargetInfo<X86_64TargetInfo>>(Triple, Opts);
+    case llvm::Triple::Managarm:
+      return std::make_unique<ManagarmTargetInfo<X86_64TargetInfo>>(Triple,
+                                                                    Opts);
     default:
       return std::make_unique<X86_64TargetInfo>(Triple, Opts);
     }
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
index d148b38d03c7c..5dac699c2bb45 100644
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -395,6 +395,36 @@ class LLVM_LIBRARY_VISIBILITY LinuxTargetInfo : public OSTargetInfo<Target> {
   }
 };
 
+// Managarm Target
+template <typename Target>
+class LLVM_LIBRARY_VISIBILITY ManagarmTargetInfo : public OSTargetInfo<Target> {
+protected:
+  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
+                    MacroBuilder &Builder) const override {
+    DefineStd(Builder, "unix", Opts);
+    Builder.defineMacro("__managarm__");
+    if (Opts.POSIXThreads)
+      Builder.defineMacro("_REENTRANT");
+    if (Opts.CPlusPlus)
+      Builder.defineMacro("_GNU_SOURCE");
+    if (this->HasFloat128)
+      Builder.defineMacro("__FLOAT128__");
+  }
+
+public:
+  ManagarmTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
+      : OSTargetInfo<Target>(Triple, Opts) {
+    switch (Triple.getArch()) {
+    default:
+      break;
+    case llvm::Triple::x86:
+    case llvm::Triple::x86_64:
+      this->HasFloat128 = true;
+      break;
+    }
+  }
+};
+
 // NetBSD Target
 template <typename Target>
 class LLVM_LIBRARY_VISIBILITY NetBSDTargetInfo : public OSTargetInfo<Target> {
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 44e16edfb1ccf..3cfd671e9d8f2 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -66,6 +66,7 @@ add_clang_library(clangDriver
   ToolChains/HLSL.cpp
   ToolChains/Hurd.cpp
   ToolChains/Linux.cpp
+  ToolChains/Managarm.cpp
   ToolChains/MipsLinux.cpp
   ToolChains/MinGW.cpp
   ToolChains/MSP430.cpp
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 060f76fb653c9..6c27d8c670728 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -33,6 +33,7 @@
 #include "ToolChains/Linux.h"
 #include "ToolChains/MSP430.h"
 #include "ToolChains/MSVC.h"
+#include "ToolChains/Managarm.h"
 #include "ToolChains/MinGW.h"
 #include "ToolChains/MipsLinux.h"
 #include "ToolChains/NaCl.h"
@@ -6842,6 +6843,9 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
     case llvm::Triple::Fuchsia:
       TC = std::make_unique<toolchains::Fuchsia>(*this, Target, Args);
       break;
+    case llvm::Triple::Managarm:
+      TC = std::make_unique<toolchains::Managarm>(*this, Target, Args);
+      break;
     case llvm::Triple::Solaris:
       TC = std::make_unique<toolchains::Solaris>(*this, Target, Args);
       break;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 9203bbc91b0bb..afce4fffe1d5f 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -226,6 +226,8 @@ static const char *getLDMOption(const llvm::Triple &T, const ArgList &Args) {
       return "elf_iamcu";
     return "elf_i386";
   case llvm::Triple::aarch64:
+    if (T.isOSManagarm())
+      return "aarch64managarm";
     return "aarch64linux";
   case llvm::Triple::aarch64_be:
     return "aarch64linuxb";
diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
new file mode 100644
index 0000000000000..ff455f2c6ec70
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/Managarm.cpp
@@ -0,0 +1,218 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Managarm.h"
+#include "Arch/ARM.h"
+#include "Arch/RISCV.h"
+#include "clang/Config/config.h"
+#include "clang/Driver/CommonArgs.h"
+#include "clang/Driver/Driver.h"
+#include "clang/Driver/Options.h"
+#include "clang/Driver/SanitizerArgs.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Support/Path.h"
+
+using namespace clang::driver;
+using namespace clang::driver::toolchains;
+using namespace clang;
+using namespace llvm::opt;
+
+using tools::addPathIfExists;
+
+std::string Managarm::getMultiarchTriple(const Driver &D,
+                                         const llvm::Triple &TargetTriple,
+                                         StringRef SysRoot) const {
+  switch (TargetTriple.getArch()) {
+  default:
+    return TargetTriple.str();
+  case llvm::Triple::x86_64:
+    return "x86_64-managarm-" + TargetTriple.getEnvironmentName().str();
+  case llvm::Triple::aarch64:
+    return "aarch64-managarm-" + TargetTriple.getEnvironmentName().str();
+  case llvm::Triple::riscv64:
+    return "riscv64-managarm-" + TargetTriple.getEnvironmentName().str();
+  }
+}
+
+static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) {
+  // It happens that only x86, PPC and SPARC use the 'lib32' variant of
+  // oslibdir, and using that variant while targeting other architectures causes
+  // problems because the libraries are laid out in shared system roots that
+  // can't cope with a 'lib32' library search path being considered. So we only
+  // enable them when we know we may need it.
+  //
+  // FIXME: This is a bit of a hack. We should really unify this code for
+  // reasoning about oslibdir spellings with the lib dir spellings in the
+  // GCCInstallationDetector, but that is a more significant refactoring.
+  if (Triple.getArch() == llvm::Triple::x86 || Triple.isPPC32() ||
+      Triple.getArch() == llvm::Triple::sparc)
+    return "lib32";
+
+  if (Triple.getArch() == llvm::Triple::x86_64 && Triple.isX32())
+    return "libx32";
+
+  if (Triple.getArch() == llvm::Triple::riscv32)
+    return "lib32";
+
+  return Triple.isArch32Bit() ? "lib" : "lib64";
+}
+
+Managarm::Managarm(const Driver &D, const llvm::Triple &Triple,
+                   const ArgList &Args)
+    : Generic_ELF(D, Triple, Args) {
+  GCCInstallation.init(Triple, Args);
+  Multilibs = GCCInstallation.getMultilibs();
+  SelectedMultilibs.assign({GCCInstallation.getMultilib()});
+  std::string SysRoot = computeSysRoot();
+
+  ToolChain::path_list &PPaths = getProgramPaths();
+
+  Generic_GCC::PushPPaths(PPaths);
+
+#ifdef ENABLE_LINKER_BUILD_ID
+  ExtraOpts.push_back("--build-id");
+#endif
+
+  // The selection of paths to try here is designed to match the patterns which
+  // the GCC driver itself uses, as this is part of the GCC-compatible driver.
+  // This was determined by running GCC in a fake filesystem, creating all
+  // possible permutations of these directories, and seeing which ones it added
+  // to the link paths.
+  path_list &Paths = getFilePaths();
+
+  const std::string OSLibDir = std::string(getOSLibDir(Triple, Args));
+  const std::string MultiarchTriple = getMultiarchTriple(D, Triple, SysRoot);
+
+  Generic_GCC::AddMultilibPaths(D, SysRoot, OSLibDir, MultiarchTriple, Paths);
+
+  addPathIfExists(D, concat(SysRoot, "/lib", MultiarchTriple), Paths);
+  addPathIfExists(D, concat(SysRoot, "/lib/..", OSLibDir), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr/lib", MultiarchTriple), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr", OSLibDir), Paths);
+
+  Generic_GCC::AddMultiarchPaths(D, SysRoot, OSLibDir, Paths);
+
+  addPathIfExists(D, concat(SysRoot, "/lib"), Paths);
+  addPathIfExists(D, concat(SysRoot, "/usr/lib"), Paths);
+}
+
+bool Managarm::HasNativeLLVMSupport() const { return true; }
+
+Tool *Managarm::buildLinker() const {
+  return new tools::gnutools::Linker(*this);
+}
+
+Tool *Managarm::buildAssembler() const {
+  return new tools::gnutools::Assembler(*this);
+}
+
+std::string Managarm::computeSysRoot() const {
+  if (!getDriver().SysRoot.empty())
+    return getDriver().SysRoot;
+  return std::string();
+}
+
+std::string Managarm::getDynamicLinker(const ArgList &Args) const {
+  switch (getTriple().getArch()) {
+  case llvm::Triple::aarch64:
+    return "/lib/aarch64-managarm/ld.so";
+  case llvm::Triple::riscv64: {
+    StringRef ABIName = tools::riscv::getRISCVABI(Args, getTriple());
+    return ("/lib/riscv64-managarm/ld-riscv64-" + ABIName + ".so").str();
+  }
+  case llvm::Triple::x86_64:
+    return "/lib/x86_64-managarm/ld.so";
+  default:
+    llvm_unreachable("unsupported architecture");
+  }
+}
+
+void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
+                                         ArgStringList &CC1Args) const {
+  const Driver &D = getDriver();
+  std::string SysRoot = computeSysRoot();
+
+  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
+    return;
+
+  if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
+    addSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/local/include");
+
+  // Add 'include' in the resource directory, which is similar to
+  // GCC_INCLUDE_DIR (private headers) in GCC.
+  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
+    SmallString<128> ResourceDirInclude(D.ResourceDir);
+    llvm::sys::path::append(ResourceDirInclude, "include");
+    addSystemInclude(DriverArgs, CC1Args, ResourceDirInclude);
+  }
+
+  if (DriverArgs.hasArg(options::OPT_nostdlibinc))
+    return;
+
+  // TOOL_INCLUDE_DIR
+  AddMultilibIncludeArgs(DriverArgs, CC1Args);
+
+  // Check for configure-time C include directories.
+  StringRef CIncludeDirs(C_INCLUDE_DIRS);
+  if (CIncludeDirs != "") {
+    SmallVector<StringRef, 5> dirs;
+    CIncludeDirs.split(dirs, ":");
+    for (StringRef dir : dirs) {
+      StringRef Prefix =
+          llvm::sys::path::is_absolute(dir) ? StringRef(SysRoot) : "";
+      addExternCSystemInclude(DriverArgs, CC1Args, Prefix + dir);
+    }
+    return;
+  }
+
+  // On systems using multiarch, add /usr/include/$triple before
+  // /usr/include.
+  std::string MultiarchIncludeDir = getMultiarchTriple(D, getTriple(), SysRoot);
+  if (!MultiarchIncludeDir.empty())
+    addExternCSystemInclude(
+        DriverArgs, CC1Args,
+        concat(SysRoot, "/usr/include", MultiarchIncludeDir));
+
+  // Add an include of '/include' directly. This isn't provided by default by
+  // system GCCs, but is often used with cross-compiling GCCs, and harmless to
+  // add even when Clang is acting as-if it were a system compiler.
+  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/include"));
+
+  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/usr/include"));
+}
+
+void Managarm::addLibStdCxxIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  // We need a detected GCC installation on Managarm to provide libstdc++'s
+  // headers.
+  if (!GCCInstallation.isValid())
+    return;
+
+  StringRef TripleStr = GCCInstallation.getTriple().str();
+
+  // Try generic GCC detection.
+  Generic_GCC::addGCCLibStdCxxIncludePaths(DriverArgs, CC1Args, TripleStr);
+}
+
+SanitizerMask Managarm::getSupportedSanitizers() const {
+  const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64;
+  SanitizerMask Res = ToolChain::getSupportedSanitizers();
+  Res |= SanitizerKind::PointerCompare;
+  Res |= SanitizerKind::PointerSubtract;
+  Res |= SanitizerKind::KernelAddress;
+  Res |= SanitizerKind::Vptr;
+  if (IsX86_64)
+    Res |= SanitizerKind::KernelMemory;
+  return Res;
+}
+
+void Managarm::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
+  for (const auto &Opt : ExtraOpts)
+    CmdArgs.push_back(Opt.c_str());
+}
diff --git a/clang/lib/Driver/ToolChains/Managarm.h b/clang/lib/Driver/ToolChains/Managarm.h
new file mode 100644
index 0000000000000..2082e2c615f28
--- /dev/null
+++ b/clang/lib/Driver/ToolChains/Managarm.h
@@ -0,0 +1,55 @@
+//===--- Managarm.h - Managarm ToolChain Implementations --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
+#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
+
+#include "Gnu.h"
+#include "clang/Driver/ToolChain.h"
+
+namespace clang {
+namespace driver {
+namespace toolchains {
+
+class LLVM_LIBRARY_VISIBILITY Managarm : public Generic_ELF {
+public:
+  Managarm(const Driver &D, const llvm::Triple &Triple,
+           const llvm::opt::ArgList &Args);
+
+  bool HasNativeLLVMSupport() const override;
+
+  std::string getMultiarchTriple(const Driver &D,
+                                 const llvm::Triple &TargetTriple,
+                                 StringRef SysRoot) const override;
+
+  void
+  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
+                            llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
+  SanitizerMask getSupportedSanitizers() const override;
+  std::string computeSysRoot() const override;
+
+  std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override;
+
+  void addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const override;
+
+  std::vector<std::string> ExtraOpts;
+
+protected:
+  Tool *buildAssembler() const override;
+  Tool *buildLinker() const override;
+};
+
+} // end namespace toolchains
+} // end namespace driver
+} // end namespace clang
+
+#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp
index 641e3beebc081..3e22b4001bde7 100644
--- a/clang/lib/Lex/InitHeaderSearch.cpp
+++ b/clang/lib/Lex/InitHeaderSearch.cpp
@@ -221,6 +221,7 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths(
   case llvm::Triple::Hurd:
   case llvm::Triple::Linux:
   case llvm::Triple::LiteOS:
+  case llvm::Triple::Managarm:
   case llvm::Triple::NaCl:
   case llvm::Triple::NetBSD:
   case llvm::Triple::OpenBSD:
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/managarm.cpp b/clang/test/Driver/managarm.cpp
new file mode 100644
index 0000000000000..5afa17aadb6d2
--- /dev/null
+++ b/clang/test/Driver/managarm.cpp
@@ -0,0 +1,267 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-X86-64 %s
+// CHECK-X86-64:      "-cc1"
+// CHECK-X86-64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-X86-64-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
+// CHECK-X86-64-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
+// CHECK-X86-64-SAME: "-L
+// CHECK-X86-64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-X86-64-LIBS %s
+// CHECK-X86-64-LIBS:      "-cc1"
+// CHECK-X86-64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-X86-64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-LIBS-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
+// CHECK-X86-64-LIBS-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
+// CHECK-X86-64-LIBS-SAME: "-L
+// CHECK-X86-64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-X86-64-STATIC %s
+// CHECK-X86-64-STATIC:      "-cc1"
+// CHECK-X86-64-STATIC-SAME: "-static-define"
+// CHECK-X86-64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-X86-64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-X86-64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-X86-64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-STATIC-SAME: "-static"
+// CHECK-X86-64-STATIC-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-X86-64-STATIC-SAME: "-L
+// CHECK-X86-64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-X86-64-SHARED %s
+// CHECK-X86-64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-X86-64-SHARED-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-X86-64-SHARED-SAME: "-L
+// CHECK-X86-64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-AARCH64 %s
+// CHECK-AARCH64:      "-cc1"
+// CHECK-AARCH64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-AARCH64-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
+// CHECK-AARCH64-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
+// CHECK-AARCH64-SAME: {{^}} "-L
+// CHECK-AARCH64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-LIBS %s
+// CHECK-AARCH64-LIBS:      "-cc1"
+// CHECK-AARCH64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-AARCH64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-LIBS-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
+// CHECK-AARCH64-LIBS-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L
+// CHECK-AARCH64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-STATIC %s
+// CHECK-AARCH64-STATIC:      "-cc1"
+// CHECK-AARCH64-STATIC-SAME: "-static-define"
+// CHECK-AARCH64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-AARCH64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-AARCH64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-STATIC-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-static"
+// CHECK-AARCH64-STATIC-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L
+// CHECK-AARCH64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SHARED %s
+// CHECK-AARCH64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64-SHARED-SAME: "-m" "aarch64managarm"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L
+// CHECK-AARCH64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-RISCV64 %s
+// CHECK-RISCV64:      "-cc1"
+// CHECK-RISCV64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-RISCV64-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
+// CHECK-RISCV64-SAME: "-L
+// CHECK-RISCV64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-LIBS %s
+// CHECK-RISCV64-LIBS:      "-cc1"
+// CHECK-RISCV64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-RISCV64-LIBS-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-LIBS-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
+// CHECK-RISCV64-LIBS-SAME: "-L
+// CHECK-RISCV64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-STATIC %s
+// CHECK-RISCV64-STATIC:      "-cc1"
+// CHECK-RISCV64-STATIC-SAME: "-static-define"
+// CHECK-RISCV64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-RISCV64-STATIC-SAME: "-internal-externc-isystem"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// CHECK-RISCV64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-STATIC-SAME: "-static"
+// CHECK-RISCV64-STATIC-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o"
+// CHECK-RISCV64-STATIC-SAME: "-L
+// CHECK-RISCV64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
+
+// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
+// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-SHARED %s
+// CHECK-RISCV64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-RISCV64-SHARED-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o"
+// CHECK-RISCV64-SHARED-SAME: "-L
+// CHECK-RISCV64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
+// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 031a6c1a755bd..bed39dc3e34dc 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1622,6 +1622,11 @@
 // RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=amd64-unknown-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD-STDC-N %s
 // OPENBSD-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
 //
+// RUN: %clang_cc1 -triple=aarch64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// RUN: %clang_cc1 -triple=riscv64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// RUN: %clang_cc1 -triple=x86_64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
+// MANAGARM: #define __managarm__ 1
+
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -match-full-lines -check-prefix XCORE %s
 // XCORE:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
 // XCORE:#define __LITTLE_ENDIAN__ 1
diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c
index 4e3e29ccfa8a8..fe27ed8814eec 100644
--- a/clang/test/Preprocessor/predefined-macros-no-warnings.c
+++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c
@@ -14,6 +14,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux-openhos
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-netbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-openbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-win32-gnu
@@ -108,6 +109,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux-openhos
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-solaris
@@ -167,6 +169,7 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-nacl
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps4
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps5
+// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir64
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv32

From 41b9d28327bf20befe63a683b2a2f90670837b2f Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mpeis@arm.com>
Date: Tue, 17 Jun 2025 07:42:57 +0100
Subject: [PATCH 684/851] [BOLT][NFC] Using target_triple in lit config
 (#144078)

---
 bolt/test/lit.local.cfg | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/bolt/test/lit.local.cfg b/bolt/test/lit.local.cfg
index d5a6849b27a77..8a61d11f5825f 100644
--- a/bolt/test/lit.local.cfg
+++ b/bolt/test/lit.local.cfg
@@ -1,6 +1,11 @@
-host_linux_triple = config.target_triple.split("-")[0] + "-unknown-linux-gnu"
+host_triple = config.target_triple
+
+# Force triple on non-linux hosts to get ELF binaries on all platforms.
+if not "linux" in host_triple:
+  host_triple = host_triple.split("-")[0] + "-unknown-linux-gnu"
+
 common_linker_flags = "-fuse-ld=lld -Wl,--unresolved-symbols=ignore-all -Wl,--build-id=none -pie"
-flags = f"--target={host_linux_triple} -fPIE {common_linker_flags}"
+flags = f"--target={host_triple} -fPIE {common_linker_flags}"
 
 config.substitutions.insert(0, ("%cflags", f"%cflags {flags}"))
 config.substitutions.insert(0, ("%cxxflags", f"%cxxflags {flags}"))

From 7e6c1bd3edf4fc19be70587a4ac33a76bab78c02 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Mon, 16 Jun 2025 23:54:40 -0700
Subject: [PATCH 685/851] [BOLT][NFCI] Simplify DataAggregator using traces
 (#143289)

Consistently apply traces as defined in #127125 for branch profile
aggregation. This combines branches and fall-through records into one.

With large input binaries/profiles, the speed up in aggregation time
(`-time-aggr`, wall time):
- perf.data, pre-BOLT input: 154.5528s -> 144.0767s
- pre-aggregated data, pre-BOLT input: 15.1026s -> 9.0711s
- pre-aggregated data, BOLTed input: 15.4871s -> 10.0077s

Test Plan: NFC
---
 bolt/include/bolt/Profile/DataAggregator.h |  54 ++++--
 bolt/lib/Profile/DataAggregator.cpp        | 182 ++++++++-------------
 2 files changed, 104 insertions(+), 132 deletions(-)

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 3f07a6dc03a4f..10d96fbeca3e2 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -99,24 +99,28 @@ class DataAggregator : public DataReader {
     uint64_t Addr;
   };
 
+  /// Container for the unit of branch data.
+  /// Backwards compatible with legacy use for branches and fall-throughs:
+  /// - if \p Branch is FT_ONLY or FT_EXTERNAL_ORIGIN, the trace only
+  ///   contains fall-through data,
+  /// - if \p To is BR_ONLY, the trace only contains branch data.
   struct Trace {
+    static constexpr const uint64_t EXTERNAL = 0ULL;
+    static constexpr const uint64_t BR_ONLY = -1ULL;
+    static constexpr const uint64_t FT_ONLY = -1ULL;
+    static constexpr const uint64_t FT_EXTERNAL_ORIGIN = -2ULL;
+
+    uint64_t Branch;
     uint64_t From;
     uint64_t To;
-    Trace(uint64_t From, uint64_t To) : From(From), To(To) {}
-    bool operator==(const Trace &Other) const {
-      return From == Other.From && To == Other.To;
-    }
+    auto tie() const { return std::tie(Branch, From, To); }
+    bool operator==(const Trace &Other) const { return tie() == Other.tie(); }
+    bool operator<(const Trace &Other) const { return tie() < Other.tie(); }
   };
+  friend raw_ostream &operator<<(raw_ostream &OS, const Trace &);
 
   struct TraceHash {
-    size_t operator()(const Trace &L) const {
-      return std::hash<uint64_t>()(L.From << 32 | L.To);
-    }
-  };
-
-  struct FTInfo {
-    uint64_t InternCount{0};
-    uint64_t ExternCount{0};
+    size_t operator()(const Trace &L) const { return hash_combine(L.tie()); }
   };
 
   struct TakenBranchInfo {
@@ -126,8 +130,8 @@ class DataAggregator : public DataReader {
 
   /// Intermediate storage for profile data. We save the results of parsing
   /// and use them later for processing and assigning profile.
-  std::unordered_map<Trace, TakenBranchInfo, TraceHash> BranchLBRs;
-  std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
+  std::unordered_map<Trace, TakenBranchInfo, TraceHash> TraceMap;
+  std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
   std::unordered_map<uint64_t, uint64_t> BasicSamples;
   std::vector<PerfMemSample> MemSamples;
 
@@ -200,8 +204,8 @@ class DataAggregator : public DataReader {
   /// Return a vector of offsets corresponding to a trace in a function
   /// if the trace is valid, std::nullopt otherwise.
   std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
-  getFallthroughsInTrace(BinaryFunction &BF, const LBREntry &First,
-                         const LBREntry &Second, uint64_t Count = 1) const;
+  getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
+                         uint64_t Count) const;
 
   /// Record external entry into the function \p BF.
   ///
@@ -265,8 +269,7 @@ class DataAggregator : public DataReader {
   bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
 
   /// Register a trace between two LBR entries supplied in execution order.
-  bool doTrace(const LBREntry &First, const LBREntry &Second,
-               uint64_t Count = 1);
+  bool doTrace(const Trace &Trace, uint64_t Count);
 
   /// Parser helpers
   /// Return false if we exhausted our parser buffer and finished parsing
@@ -516,6 +519,21 @@ inline raw_ostream &operator<<(raw_ostream &OS,
   OS << formatv("{0:x} -> {1:x}/{2}", L.From, L.To, L.Mispred ? 'M' : 'P');
   return OS;
 }
+
+inline raw_ostream &operator<<(raw_ostream &OS,
+                               const DataAggregator::Trace &T) {
+  switch (T.Branch) {
+  case DataAggregator::Trace::FT_ONLY:
+  case DataAggregator::Trace::FT_EXTERNAL_ORIGIN:
+    break;
+  default:
+    OS << Twine::utohexstr(T.Branch) << " -> ";
+  }
+  OS << Twine::utohexstr(T.From);
+  if (T.To != DataAggregator::Trace::BR_ONLY)
+    OS << " ... " << Twine::utohexstr(T.To);
+  return OS;
+}
 } // namespace bolt
 } // namespace llvm
 
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index ade8478f556e9..118629b04f6fc 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -523,6 +523,10 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
   deleteTempFiles();
 
 heatmap:
+  // Sort parsed traces for faster processing.
+  if (!opts::BasicAggregation)
+    llvm::sort(Traces, llvm::less_first());
+
   if (!opts::HeatmapMode)
     return Error::success();
 
@@ -598,8 +602,7 @@ void DataAggregator::processProfile(BinaryContext &BC) {
     llvm::stable_sort(MemEvents.second.Data);
 
   // Release intermediate storage.
-  clear(BranchLBRs);
-  clear(FallthroughLBRs);
+  clear(Traces);
   clear(BasicSamples);
   clear(MemSamples);
 }
@@ -780,37 +783,19 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
   return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds);
 }
 
-bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
-                             uint64_t Count) {
-  BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(First.To);
-  BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From);
+bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
+  const uint64_t From = Trace.From, To = Trace.To;
+  BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From);
+  BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To);
+  NumTraces += Count;
   if (!FromFunc || !ToFunc) {
-    LLVM_DEBUG({
-      dbgs() << "Out of range trace starting in ";
-      if (FromFunc)
-        dbgs() << formatv("{0} @ {1:x}", *FromFunc,
-                          First.To - FromFunc->getAddress());
-      else
-        dbgs() << Twine::utohexstr(First.To);
-      dbgs() << " and ending in ";
-      if (ToFunc)
-        dbgs() << formatv("{0} @ {1:x}", *ToFunc,
-                          Second.From - ToFunc->getAddress());
-      else
-        dbgs() << Twine::utohexstr(Second.From);
-      dbgs() << '\n';
-    });
+    LLVM_DEBUG(dbgs() << "Out of range trace " << Trace << '\n');
     NumLongRangeTraces += Count;
     return false;
   }
   if (FromFunc != ToFunc) {
+    LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
     NumInvalidTraces += Count;
-    LLVM_DEBUG({
-      dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
-             << formatv(" @ {0:x}", First.To - FromFunc->getAddress())
-             << " and ending in " << ToFunc->getPrintName()
-             << formatv(" @ {0:x}\n", Second.From - ToFunc->getAddress());
-    });
     return false;
   }
 
@@ -818,28 +803,21 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
   BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
   if (!ParentFunc)
     ParentFunc = FromFunc;
-  ParentFunc->SampleCountInBytes += Count * (Second.From - First.To);
+  ParentFunc->SampleCountInBytes += Count * (To - From);
 
   const uint64_t FuncAddress = FromFunc->getAddress();
   std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
       BAT && BAT->isBATFunction(FuncAddress)
-          ? BAT->getFallthroughsInTrace(FuncAddress, First.To, Second.From)
-          : getFallthroughsInTrace(*FromFunc, First, Second, Count);
+          ? BAT->getFallthroughsInTrace(FuncAddress, From, To)
+          : getFallthroughsInTrace(*FromFunc, Trace, Count);
   if (!FTs) {
-    LLVM_DEBUG(
-        dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
-               << " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
-               << " and ending in " << ToFunc->getPrintName() << " @ "
-               << ToFunc->getPrintName() << " @ "
-               << Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n');
+    LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
     NumInvalidTraces += Count;
     return false;
   }
 
   LLVM_DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
-                    << FromFunc->getPrintName() << ":"
-                    << Twine::utohexstr(First.To) << " to "
-                    << Twine::utohexstr(Second.From) << ".\n");
+                    << FromFunc->getPrintName() << ":" << Trace << '\n');
   for (auto [From, To] : *FTs) {
     if (BAT) {
       From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true);
@@ -852,17 +830,15 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
 }
 
 std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
-DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
-                                       const LBREntry &FirstLBR,
-                                       const LBREntry &SecondLBR,
+DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
                                        uint64_t Count) const {
   SmallVector<std::pair<uint64_t, uint64_t>, 16> Branches;
 
   BinaryContext &BC = BF.getBinaryContext();
 
   // Offsets of the trace within this function.
-  const uint64_t From = FirstLBR.To - BF.getAddress();
-  const uint64_t To = SecondLBR.From - BF.getAddress();
+  const uint64_t From = Trace.From - BF.getAddress();
+  const uint64_t To = Trace.To - BF.getAddress();
 
   if (From > To)
     return std::nullopt;
@@ -889,8 +865,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
 
   // Adjust FromBB if the first LBR is a return from the last instruction in
   // the previous block (that instruction should be a call).
-  if (From == FromBB->getOffset() && !BF.containsAddress(FirstLBR.From) &&
-      !FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
+  if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
+      From == FromBB->getOffset() && !FromBB->isEntryPoint() &&
+      !FromBB->isLandingPad()) {
     const BinaryBasicBlock *PrevBB =
         BF.getLayout().getBlock(FromBB->getIndex() - 1);
     if (PrevBB->getSuccessor(FromBB->getLabel())) {
@@ -898,10 +875,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
       if (Instr && BC.MIB->isCall(*Instr))
         FromBB = PrevBB;
       else
-        LLVM_DEBUG(dbgs() << "invalid incoming LBR (no call): " << FirstLBR
-                          << '\n');
+        LLVM_DEBUG(dbgs() << "invalid trace (no call): " << Trace << '\n');
     } else {
-      LLVM_DEBUG(dbgs() << "invalid incoming LBR: " << FirstLBR << '\n');
+      LLVM_DEBUG(dbgs() << "invalid trace: " << Trace << '\n');
     }
   }
 
@@ -920,9 +896,7 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF,
 
     // Check for bad LBRs.
     if (!BB->getSuccessor(NextBB->getLabel())) {
-      LLVM_DEBUG(dbgs() << "no fall-through for the trace:\n"
-                        << "  " << FirstLBR << '\n'
-                        << "  " << SecondLBR << '\n');
+      LLVM_DEBUG(dbgs() << "no fall-through for the trace: " << Trace << '\n');
       return std::nullopt;
     }
 
@@ -1227,14 +1201,15 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     FT_EXTERNAL_ORIGIN // f
   } Type = INVALID;
 
-  // The number of fields to parse, set based on Type.
+  /// The number of fields to parse, set based on \p Type.
   int AddrNum = 0;
   int CounterNum = 0;
-  // Storage for parsed fields.
+  /// Storage for parsed fields.
   StringRef EventName;
   std::optional<Location> Addr[3];
   int64_t Counters[2] = {0};
 
+  /// Parse strings: record type and optionally an event name.
   while (Type == INVALID || Type == EVENT_NAME) {
     while (checkAndConsumeFS()) {
     }
@@ -1268,6 +1243,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     CounterNum = SSI(Str).Case("B", 2).Case("E", 0).Default(1);
   }
 
+  /// Parse locations depending on entry type, recording them in \p Addr array.
   for (int I = 0; I < AddrNum; ++I) {
     while (checkAndConsumeFS()) {
     }
@@ -1277,6 +1253,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     Addr[I] = AddrOrErr.get();
   }
 
+  /// Parse counters depending on entry type.
   for (int I = 0; I < CounterNum; ++I) {
     while (checkAndConsumeFS()) {
     }
@@ -1287,11 +1264,13 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
     Counters[I] = CountOrErr.get();
   }
 
+  /// Expect end of line here.
   if (!checkAndConsumeNewLine()) {
     reportError("expected end of line");
     return make_error_code(llvm::errc::io_error);
   }
 
+  /// Record event name into \p EventNames and return.
   if (Type == EVENT_NAME) {
     EventNames.insert(EventName);
     return std::error_code();
@@ -1305,6 +1284,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
   int64_t Count = Counters[0];
   int64_t Mispreds = Counters[1];
 
+  /// Record basic IP sample into \p BasicSamples and return.
   if (Type == SAMPLE) {
     BasicSamples[FromOffset] += Count;
     NumTotalSamples += Count;
@@ -1316,30 +1296,26 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
   if (ToFunc)
     ToFunc->setHasProfileAvailable();
 
-  Trace Trace(FromOffset, ToOffset);
-  // Taken trace
-  if (Type == TRACE || Type == BRANCH) {
-    TakenBranchInfo &Info = BranchLBRs[Trace];
-    Info.TakenCount += Count;
-    Info.MispredCount += Mispreds;
-
-    NumTotalSamples += Count;
-  }
-  // Construct fallthrough part of the trace
-  if (Type == TRACE) {
-    const uint64_t TraceFtEndOffset = Addr[2]->Offset;
-    Trace.From = ToOffset;
-    Trace.To = TraceFtEndOffset;
-    Type = FromFunc == ToFunc ? FT : FT_EXTERNAL_ORIGIN;
+  /// For legacy fall-through types, adjust locations to match Trace container.
+  if (Type == FT || Type == FT_EXTERNAL_ORIGIN) {
+    Addr[2] = Location(Addr[1]->Offset); // Trace To
+    Addr[1] = Location(Addr[0]->Offset); // Trace From
+    // Put a magic value into Trace Branch to differentiate from a full trace.
+    Addr[0] = Location(Type == FT ? Trace::FT_ONLY : Trace::FT_EXTERNAL_ORIGIN);
   }
-  // Add fallthrough trace
-  if (Type != BRANCH) {
-    FTInfo &Info = FallthroughLBRs[Trace];
-    (Type == FT ? Info.InternCount : Info.ExternCount) += Count;
 
-    NumTraces += Count;
+  /// For legacy branch type, mark Trace To to differentite from a full trace.
+  if (Type == BRANCH) {
+    Addr[2] = Location(Trace::BR_ONLY);
   }
 
+  /// Record a trace.
+  Trace T{Addr[0]->Offset, Addr[1]->Offset, Addr[2]->Offset};
+  TakenBranchInfo TI{(uint64_t)Count, (uint64_t)Mispreds};
+  Traces.emplace_back(T, TI);
+
+  NumTotalSamples += Count;
+
   return std::error_code();
 }
 
@@ -1350,7 +1326,7 @@ bool DataAggregator::ignoreKernelInterrupt(LBREntry &LBR) const {
 
 std::error_code DataAggregator::printLBRHeatMap() {
   outs() << "PERF2BOLT: parse branch events...\n";
-  NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
+  NamedRegionTimer T("buildHeatmap", "Building heatmap", TimerGroupName,
                      TimerGroupDesc, opts::TimeAggregator);
 
   if (BC->IsLinuxKernel) {
@@ -1386,12 +1362,9 @@ std::error_code DataAggregator::printLBRHeatMap() {
   // Register basic samples and perf LBR addresses not covered by fallthroughs.
   for (const auto &[PC, Hits] : BasicSamples)
     HM.registerAddress(PC, Hits);
-  for (const auto &LBR : FallthroughLBRs) {
-    const Trace &Trace = LBR.first;
-    const FTInfo &Info = LBR.second;
-    HM.registerAddressRange(Trace.From, Trace.To,
-                            Info.InternCount + Info.ExternCount);
-  }
+  for (const auto &[Trace, Info] : Traces)
+    if (Trace.To != Trace::BR_ONLY)
+      HM.registerAddressRange(Trace.From, Trace.To, Info.TakenCount);
 
   if (HM.getNumInvalidRanges())
     outs() << "HEATMAP: invalid traces: " << HM.getNumInvalidRanges() << '\n';
@@ -1437,22 +1410,10 @@ void DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
     // chronological order)
     if (NeedsSkylakeFix && NumEntry <= 2)
       continue;
-    if (NextLBR) {
-      // Record fall-through trace.
-      const uint64_t TraceFrom = LBR.To;
-      const uint64_t TraceTo = NextLBR->From;
-      const BinaryFunction *TraceBF =
-          getBinaryFunctionContainingAddress(TraceFrom);
-      FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
-      if (TraceBF && TraceBF->containsAddress(LBR.From))
-        ++Info.InternCount;
-      else
-        ++Info.ExternCount;
-      ++NumTraces;
-    }
+    uint64_t TraceTo = NextLBR ? NextLBR->From : Trace::BR_ONLY;
     NextLBR = &LBR;
 
-    TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)];
+    TakenBranchInfo &Info = TraceMap[Trace{LBR.From, LBR.To, TraceTo}];
     ++Info.TakenCount;
     Info.MispredCount += LBR.Mispred;
   }
@@ -1563,10 +1524,14 @@ std::error_code DataAggregator::parseBranchEvents() {
     parseLBRSample(Sample, NeedsSkylakeFix);
   }
 
-  for (const Trace &Trace : llvm::make_first_range(BranchLBRs))
-    for (const uint64_t Addr : {Trace.From, Trace.To})
+  Traces.reserve(TraceMap.size());
+  for (const auto &[Trace, Info] : TraceMap) {
+    Traces.emplace_back(Trace, Info);
+    for (const uint64_t Addr : {Trace.Branch, Trace.From})
       if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Addr))
         BF->setHasProfileAvailable();
+  }
+  clear(TraceMap);
 
   outs() << "PERF2BOLT: read " << NumSamples << " samples and " << NumEntries
          << " LBR entries\n";
@@ -1591,23 +1556,12 @@ void DataAggregator::processBranchEvents() {
   NamedRegionTimer T("processBranch", "Processing branch events",
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
-  for (const auto &AggrLBR : FallthroughLBRs) {
-    const Trace &Loc = AggrLBR.first;
-    const FTInfo &Info = AggrLBR.second;
-    LBREntry First{Loc.From, Loc.From, false};
-    LBREntry Second{Loc.To, Loc.To, false};
-    if (Info.InternCount)
-      doTrace(First, Second, Info.InternCount);
-    if (Info.ExternCount) {
-      First.From = 0;
-      doTrace(First, Second, Info.ExternCount);
-    }
-  }
-
-  for (const auto &AggrLBR : BranchLBRs) {
-    const Trace &Loc = AggrLBR.first;
-    const TakenBranchInfo &Info = AggrLBR.second;
-    doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
+  for (const auto &[Trace, Info] : Traces) {
+    if (Trace.Branch != Trace::FT_ONLY &&
+        Trace.Branch != Trace::FT_EXTERNAL_ORIGIN)
+      doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount);
+    if (Trace.To != Trace::BR_ONLY)
+      doTrace(Trace, Info.TakenCount);
   }
   printBranchSamplesDiagnostics();
 }

From 80b79ce432bbe12701fd9fe495ff9feeb5e4b9ca Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 09:30:28 +0200
Subject: [PATCH 686/851] [ConstantFolding] Handle reading from type padding
 (#144330)

ReadDataFromGlobal() did not handle reads from the padding of types (in
the sense of type store size != type alloc size, rather than struct
padding).

Return zero in that case.

Fixes https://github.com/llvm/llvm-project/issues/144279.
---
 llvm/lib/Analysis/ConstantFolding.cpp         |  4 +++
 .../InstSimplify/ConstProp/loads.ll           | 36 +++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 2b7a438a9ef01..b58f9b26a8651 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -432,6 +432,10 @@ bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, unsigned char *CurPtr,
   assert(ByteOffset <= DL.getTypeAllocSize(C->getType()) &&
          "Out of range access");
 
+  // Reading type padding, return zero.
+  if (ByteOffset >= DL.getTypeStoreSize(C->getType()))
+    return true;
+
   // If this element is zero or undefined, we can just return since *CurPtr is
   // zero initialized.
   if (isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
index dd75560e25ced..061c6834eb97d 100644
--- a/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
+++ b/llvm/test/Transforms/InstSimplify/ConstProp/loads.ll
@@ -441,3 +441,39 @@ define i128 @load-128bit(){
   %1 = load i128, ptr @global128, align 4
   ret i128 %1
 }
+
+
+@i40_struct = constant { i40, i8 } { i40 0, i8 1 }
+@i40_array = constant [2 x i40] [i40 0, i40 1]
+
+define i8 @load_i40_struct_padding() {
+; CHECK-LABEL: @load_i40_struct_padding(
+; CHECK-NEXT:    ret i8 0
+;
+  %v = load i8, ptr getelementptr (i8, ptr @i40_struct, i64 6)
+  ret i8 %v
+}
+
+define i16 @load_i40_struct_partial_padding() {
+; CHECK-LABEL: @load_i40_struct_partial_padding(
+; CHECK-NEXT:    ret i16 0
+;
+  %v = load i16, ptr getelementptr (i8, ptr @i40_struct, i64 4)
+  ret i16 %v
+}
+
+define i8 @load_i40_array_padding() {
+; CHECK-LABEL: @load_i40_array_padding(
+; CHECK-NEXT:    ret i8 0
+;
+  %v = load i8, ptr getelementptr (i8, ptr @i40_array, i64 6)
+  ret i8 %v
+}
+
+define i16 @load_i40_array_partial_padding() {
+; CHECK-LABEL: @load_i40_array_partial_padding(
+; CHECK-NEXT:    ret i16 0
+;
+  %v = load i16, ptr getelementptr (i8, ptr @i40_array, i64 4)
+  ret i16 %v
+}

From bb70023cbfecf7880e4cc89966947ef475e070e9 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 09:49:18 +0200
Subject: [PATCH 687/851] [MemoryLocation][DSE] Allow other read effects in
 MemoryLocation::getForDest() (#144343)

MemoryLocation::getForDest() returns a (potentially) written location,
while still allowing other reads. Currently, this is limited to
argmemonly functions. However, we can ignore other (non-argmem) read
effects here for the same reason we can ignore argument reads.

Fixes https://github.com/llvm/llvm-project/issues/144300.

Proof: https://alive2.llvm.org/ce/z/LKq_dc
---
 llvm/lib/Analysis/MemoryLocation.cpp          |  4 ++-
 .../DeadStoreElimination/trivial-dse-calls.ll | 32 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 3b42bb412b9ba..c8daab7abde18 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -111,7 +111,9 @@ MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) {
 
 std::optional<MemoryLocation>
 MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) {
-  if (!CB->onlyAccessesArgMemory())
+  // Check that the only possible writes are to arguments.
+  MemoryEffects WriteME = CB->getMemoryEffects() & MemoryEffects::writeOnly();
+  if (!WriteME.onlyAccessesArgPointees())
     return std::nullopt;
 
   if (CB->hasOperandBundles())
diff --git a/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll b/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll
index 030d315bfd925..df2feb087e397 100644
--- a/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/trivial-dse-calls.ll
@@ -286,3 +286,35 @@ define void @test_dse_non_alloca() {
   ret void
 }
 
+define void @test_other_read_effects() {
+; CHECK-LABEL: @test_other_read_effects(
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  call void @f(ptr %a) memory(read, argmem: readwrite) nounwind willreturn
+  ret void
+}
+
+define i32 @test_other_read_effects_read_after() {
+; CHECK-LABEL: @test_other_read_effects_read_after(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @f(ptr [[A]]) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %a = alloca i32, align 4
+  call void @f(ptr %a) memory(read, argmem: readwrite) nounwind willreturn
+  %v = load i32, ptr %a
+  ret i32 %v
+}
+
+define void @test_other_write_effects() {
+; CHECK-LABEL: @test_other_write_effects(
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @f(ptr [[A]]) #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4
+  call void @f(ptr %a) memory(write, argmem: readwrite) nounwind willreturn
+  ret void
+}

From 632151fbeea972f4aa3c14921eca1e45c07646f3 Mon Sep 17 00:00:00 2001
From: gaynor-anthropic <gaynor@anthropic.com>
Date: Tue, 17 Jun 2025 00:52:18 -0700
Subject: [PATCH 688/851] InstCombine: improve optimizations for ceiling
 division with no overflow (#142869)

Fixes #142497.

Alive2: https://alive2.llvm.org/ce/z/CeaHaH

The contents of this pull request were substantially written using
claude-code. I've reviewed to the best of my ability (it's been years
since I did any compilers work).

---------

Co-authored-by: Yingwei Zheng <dtcxzyw@qq.com>
Co-authored-by: Nikita Popov <github@npopov.com>
---
 .../InstCombine/InstCombineAddSub.cpp         |  28 ++
 llvm/test/Transforms/InstCombine/add.ll       | 261 ++++++++++++++++++
 2 files changed, 289 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index c1ce364eb1794..0a3837f2c0ce3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1787,6 +1787,34 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Instruction *Ashr = foldAddToAshr(I))
     return Ashr;
 
+  // Ceiling division by power-of-2:
+  // (X >> log2(N)) + zext(X & (N-1) != 0) --> (X + (N-1)) >> log2(N)
+  // This is valid when adding (N-1) to X doesn't overflow.
+  {
+    Value *X;
+    const APInt *ShiftAmt, *Mask;
+    CmpPredicate Pred;
+
+    // Match: (X >> C) + zext((X & Mask) != 0)
+    // or:    zext((X & Mask) != 0) + (X >> C)
+    if (match(&I, m_c_Add(m_OneUse(m_LShr(m_Value(X), m_APInt(ShiftAmt))),
+                          m_ZExt(m_SpecificICmp(
+                              ICmpInst::ICMP_NE,
+                              m_And(m_Deferred(X), m_LowBitMask(Mask)),
+                              m_ZeroInt())))) &&
+        Mask->popcount() == *ShiftAmt) {
+
+      // Check if X + Mask doesn't overflow
+      Constant *MaskC = ConstantInt::get(X->getType(), *Mask);
+      if (willNotOverflowUnsignedAdd(X, MaskC, I)) {
+        // (X + Mask) >> ShiftAmt
+        Value *Add = Builder.CreateNUWAdd(X, MaskC);
+        return BinaryOperator::CreateLShr(
+            Add, ConstantInt::get(X->getType(), *ShiftAmt));
+      }
+    }
+  }
+
   // (~X) + (~Y) --> -2 - (X + Y)
   {
     // To ensure we can save instructions we need to ensure that we consume both
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 495f99824652d..a16e30bb49452 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -4273,4 +4273,265 @@ define i32 @fold_zext_nneg_add_const_fail2(i8 %x) {
 }
 
 declare void @llvm.assume(i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+; Ceiling division by power-of-2: (x >> log2(N)) + ((x & (N-1)) != 0) -> (x + (N-1)) >> log2(N)
+; This is only valid when x + (N-1) doesn't overflow
+
+; Test with known range that prevents overflow
+define i32 @ceil_div_by_8_known_range(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_by_8_known_range(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Test with the exact IR from the original testcase
+define i32 @ceil_div_from_clz(i32 %v) {
+; CHECK-LABEL: @ceil_div_from_clz(
+; CHECK-NEXT:    [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub nuw nsw i32 39, [[CTLZ]]
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %ctlz = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %v, i1 false)
+  %sub = sub nuw nsw i32 32, %ctlz
+  %shr = lshr i32 %sub, 3
+  %and = and i32 %sub, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add nuw nsw i32 %shr, %ext
+  ret i32 %r
+}
+
+; Vector version with known range
+define <2 x i32> @ceil_div_by_8_vec_range(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_8_vec_range(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw <2 x i32> [[X:%.*]], splat (i32 7)
+; CHECK-NEXT:    [[R:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3)
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+  %and = and <2 x i32> %x, <i32 7, i32 7>
+  %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+  %ext = zext <2 x i1> %cmp to <2 x i32>
+  %r = add <2 x i32> %shr, %ext
+  ret <2 x i32> %r
+}
+
+; Ceiling division by 16 with known range
+define i16 @ceil_div_by_16_i16(i16 range(i16 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_16_i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i16 [[X:%.*]], 15
+; CHECK-NEXT:    [[R:%.*]] = lshr i16 [[TMP1]], 4
+; CHECK-NEXT:    ret i16 [[R]]
+;
+  %shr = lshr i16 %x, 4
+  %and = and i16 %x, 15
+  %cmp = icmp ne i16 %and, 0
+  %ext = zext i1 %cmp to i16
+  %r = add i16 %shr, %ext
+  ret i16 %r
+}
+
+; Negative test: no overflow guarantee - should NOT optimize
+define i32 @ceil_div_by_8_no_overflow_info(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_no_overflow_info(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: nuw on final add doesn't help
+define i32 @ceil_div_by_8_only_nuw_on_add(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_only_nuw_on_add(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add nuw i32 %shr, %ext  ; nuw here doesn't prove x+7 won't overflow
+  ret i32 %r
+}
+
+; Negative test: wrong mask
+define i32 @ceil_div_wrong_mask(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_mask(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 6
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 6  ; Wrong mask: should be 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: wrong shift amount
+define i32 @ceil_div_wrong_shift(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_shift(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 4
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 4  ; Shift by 4, but mask is 7 (should be 15)
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Negative test: wrong comparison
+define i32 @ceil_div_wrong_cmp(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_cmp(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp eq i32 %and, 0  ; Wrong: should be ne
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Multi-use test: all intermediate values have uses
+define i32 @ceil_div_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    call void @use_i32(i32 [[AND]])
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  call void @use_i32(i32 %shr)
+  %and = and i32 %x, 7
+  call void @use_i32(i32 %and)
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Commuted test: add operands are swapped  
+define i32 @ceil_div_commuted(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  %r = add i32 %ext, %shr  ; Operands swapped
+  ret i32 %r
+}
+
+; Commuted with multi-use
+define i32 @ceil_div_commuted_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT:    call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  call void @use_i32(i32 %shr)
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %ext, %shr  ; Operands swapped
+  ret i32 %r
+}
+
+; Multi-use test where only zext has multiple uses - should still optimize
+define i32 @ceil_div_zext_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_zext_multi_use(
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X:%.*]], 7
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i32 [[X]], 7
+; CHECK-NEXT:    [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %shr = lshr i32 %x, 3
+  %and = and i32 %x, 7
+  %cmp = icmp ne i32 %and, 0
+  %ext = zext i1 %cmp to i32
+  call void @use_i32(i32 %ext)
+  %r = add i32 %shr, %ext
+  ret i32 %r
+}
+
+; Multi-use with vector type
+define <2 x i32> @ceil_div_vec_multi_use(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_vec_multi_use(
+; CHECK-NEXT:    [[SHR:%.*]] = lshr <2 x i32> [[X:%.*]], splat (i32 3)
+; CHECK-NEXT:    call void @use_vec(<2 x i32> [[SHR]])
+; CHECK-NEXT:    [[AND:%.*]] = and <2 x i32> [[X]], splat (i32 7)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT:    [[EXT:%.*]] = zext <2 x i1> [[CMP]] to <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = add nuw nsw <2 x i32> [[SHR]], [[EXT]]
+; CHECK-NEXT:    ret <2 x i32> [[R]]
+;
+  %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+  call void @use_vec(<2 x i32> %shr)
+  %and = and <2 x i32> %x, <i32 7, i32 7>
+  %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+  %ext = zext <2 x i1> %cmp to <2 x i32>
+  %r = add <2 x i32> %shr, %ext
+  ret <2 x i32> %r
+}
+
+declare void @use_i32(i32)
+declare void @use_vec(<2 x i32>)
 declare void @fake_func(i32)

From c564ebba22ae9af315e08789c628810a3bbcf3df Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 16 Jun 2025 18:09:02 +0100
Subject: [PATCH 689/851] [X86] combineEXTRACT_SUBVECTOR - move AVX1 ANDNP
 comment and fold back together. NFC.

These appear to have been split by a merge at some point.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 290fad07be4f9..820b9c53a5089 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59683,16 +59683,6 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget &Subtarget) {
-  // For AVX1 only, if we are extracting from a 256-bit and+not (which will
-  // eventually get combined/lowered into ANDNP) with a concatenated operand,
-  // split the 'and' into 128-bit ops to avoid the concatenate and extract.
-  // We let generic combining take over from there to simplify the
-  // insert/extract and 'not'.
-  // This pattern emerges during AVX1 legalization. We handle it before lowering
-  // to avoid complications like splitting constant vector loads.
-
-  // Capture the original wide type in the likely case that we need to bitcast
-  // back to this type.
   if (!N->getValueType(0).isSimple())
     return SDValue();
 
@@ -59708,8 +59698,14 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc DL(N);
 
-  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
-      TLI.isTypeLegal(InVecVT) &&
+  // For AVX1 only, if we are extracting from a 256-bit and+not (which will
+  // eventually get combined/lowered into ANDNP) with a concatenated operand,
+  // split the 'and' into 128-bit ops to avoid the concatenate and extract.
+  // We let generic combining take over from there to simplify the
+  // insert/extract and 'not'.
+  // This pattern emerges during AVX1 legalization. We handle it before lowering
+  // to avoid complications like splitting constant vector loads.
+  if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(InVecVT) &&
       InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
     auto isConcatenatedNot = [](SDValue V) {
       V = peekThroughBitcasts(V);

From cb355def9561e2d1d4b363f44dcedf5522f0f8a1 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles@arm.com>
Date: Tue, 17 Jun 2025 09:05:36 +0100
Subject: [PATCH 690/851] [Flang][OpenMP] Add Parsing support for Indirect
 Clause (#143505)

As part of OpenMP Version 5.1, support for the `indirect` clause was
added for the `declare target` directive. This clause should follow an
`enter` clause, and allows procedure calls to be done indirectly through
OpenMP.

This adds Parsing support for the clause, along with semantics checks.
Currently, lowering for the clause is not supported so a TODO message
will be outputted to the user. It also performs version checking as
`indirect` is only support in OpenMP 5.1 or greater.

See also: #110008
---
 flang/include/flang/Parser/dump-parse-tree.h  |  1 +
 flang/include/flang/Parser/parse-tree.h       |  6 +++
 flang/lib/Lower/OpenMP/Clauses.cpp            |  4 +-
 flang/lib/Parser/openmp-parsers.cpp           |  2 +
 flang/lib/Semantics/check-omp-structure.cpp   |  9 ++++
 .../Lower/OpenMP/Todo/omp-clause-indirect.f90 | 34 ++++++++++++
 .../OpenMP/declare-target-indirect-tree.f90   | 53 +++++++++++++++++++
 flang/test/Semantics/indirect01.f90           | 34 ++++++++++++
 flang/test/Semantics/indirect02.f90           | 36 +++++++++++++
 llvm/include/llvm/Frontend/OpenMP/ClauseT.h   |  2 +-
 llvm/include/llvm/Frontend/OpenMP/OMP.td      |  5 +-
 11 files changed, 181 insertions(+), 5 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90
 create mode 100644 flang/test/Parser/OpenMP/declare-target-indirect-tree.f90
 create mode 100644 flang/test/Semantics/indirect01.f90
 create mode 100644 flang/test/Semantics/indirect02.f90

diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index c6a5150a85a4c..e3eed6aed8079 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -565,6 +565,7 @@ class ParseTreeDumper {
   NODE_ENUM(OmpDependenceType, Value)
   NODE(parser, OmpTaskDependenceType)
   NODE_ENUM(OmpTaskDependenceType, Value)
+  NODE(parser, OmpIndirectClause)
   NODE(parser, OmpIterationOffset)
   NODE(parser, OmpIteration)
   NODE(parser, OmpIterationVector)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 67405f88e09f2..61f97b855b0e5 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4300,6 +4300,12 @@ struct OmpHoldsClause {
   WRAPPER_CLASS_BOILERPLATE(OmpHoldsClause, common::Indirection<Expr>);
 };
 
+// Ref: [5.2: 209]
+struct OmpIndirectClause {
+  WRAPPER_CLASS_BOILERPLATE(
+      OmpIndirectClause, std::optional<ScalarLogicalExpr>);
+};
+
 // Ref: [5.2:72-73], in 4.5-5.1 it's scattered over individual directives
 // that allow the IF clause.
 //
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 4d0f5c3a127e1..c0c57d1832d4e 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -905,8 +905,8 @@ Inclusive make(const parser::OmpClause::Inclusive &inp,
 
 Indirect make(const parser::OmpClause::Indirect &inp,
               semantics::SemanticsContext &semaCtx) {
-  // inp -> empty
-  llvm_unreachable("Empty: indirect");
+  // inp.v.v -> std::optional<parser::ScalarLogicalExpr>
+  return Indirect{maybeApply(makeExprFn(semaCtx), inp.v.v)};
 }
 
 Init make(const parser::OmpClause::Init &inp,
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 9b112a2133918..c55642d969503 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -1004,6 +1004,8 @@ TYPE_PARSER( //
     "IF" >> construct<OmpClause>(construct<OmpClause::If>(
                 parenthesized(Parser<OmpIfClause>{}))) ||
     "INBRANCH" >> construct<OmpClause>(construct<OmpClause::Inbranch>()) ||
+    "INDIRECT" >> construct<OmpClause>(construct<OmpClause::Indirect>(
+                      maybe(parenthesized(scalarLogicalExpr)))) ||
     "INIT" >> construct<OmpClause>(construct<OmpClause::Init>(
                   parenthesized(Parser<OmpInitClause>{}))) ||
     "INCLUSIVE" >> construct<OmpClause>(construct<OmpClause::Inclusive>(
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 58d28dce7094a..83f4d1edf3c4f 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1820,15 +1820,24 @@ void OmpStructureChecker::Leave(const parser::OmpDeclareTargetWithClause &x) {
     const parser::OmpClause *toClause = FindClause(llvm::omp::Clause::OMPC_to);
     const parser::OmpClause *linkClause =
         FindClause(llvm::omp::Clause::OMPC_link);
+    const parser::OmpClause *indirectClause =
+        FindClause(llvm::omp::Clause::OMPC_indirect);
     if (!enterClause && !toClause && !linkClause) {
       context_.Say(x.source,
           "If the DECLARE TARGET directive has a clause, it must contain at least one ENTER clause or LINK clause"_err_en_US);
     }
+    if (indirectClause && !enterClause) {
+      context_.Say(x.source,
+          "The INDIRECT clause cannot be used without the ENTER clause with the DECLARE TARGET directive."_err_en_US);
+    }
     unsigned version{context_.langOptions().OpenMPVersion};
     if (toClause && version >= 52) {
       context_.Warn(common::UsageWarning::OpenMPUsage, toClause->source,
           "The usage of TO clause on DECLARE TARGET directive has been deprecated. Use ENTER clause instead."_warn_en_US);
     }
+    if (indirectClause) {
+      CheckAllowedClause(llvm::omp::Clause::OMPC_indirect);
+    }
   }
 }
 
diff --git a/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90 b/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90
new file mode 100644
index 0000000000000..d441cac47f5da
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/omp-clause-indirect.f90
@@ -0,0 +1,34 @@
+! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive
+
+! RUN: not flang -fc1 -emit-fir -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s
+
+module functions
+  implicit none
+
+  interface
+    function func() result(i)
+      character(1) :: i
+    end function
+  end interface
+
+contains
+  function func1() result(i)
+    !CHECK: not yet implemented: Unhandled clause INDIRECT in DECLARE TARGET construct
+    !$omp declare target enter(func1) indirect(.true.)
+    character(1) :: i
+    i = 'a'
+    return
+  end function
+end module
+
+program main
+  use functions
+  implicit none
+  procedure (func), pointer :: ptr1=>func1
+  character(1) :: val1
+
+  !$omp target map(from: val1)
+  val1 = ptr1()
+  !$omp end target
+
+end program
diff --git a/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90 b/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90
new file mode 100644
index 0000000000000..df85942ec15a5
--- /dev/null
+++ b/flang/test/Parser/OpenMP/declare-target-indirect-tree.f90
@@ -0,0 +1,53 @@
+! REQUIRES: openmp_runtime
+
+! RUN: %flang_fc1 %openmp_flags -fopenmp-version=52 -fdebug-dump-parse-tree %s | FileCheck %s
+! RUN: %flang_fc1 %openmp_flags -fdebug-unparse -fopenmp-version=52 %s | FileCheck %s --check-prefix="UNPARSE"
+
+module functions
+  implicit none
+
+  interface
+  function func() result(i)
+    character(1) :: i
+  end function
+  end interface
+
+contains
+  function func1() result(i)
+    !$omp declare target enter(func1) indirect(.true.)
+    !CHECK: | | | | | OmpDeclareTargetSpecifier -> OmpDeclareTargetWithClause -> OmpClauseList -> OmpClause -> Enter -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'func1'
+    !CHECK-NEXT: | | | | | OmpClause -> Indirect -> OmpIndirectClause -> Scalar -> Logical -> Expr = '.true._4'
+    !CHECK-NEXT: | | | | | | LiteralConstant -> LogicalLiteralConstant
+    !CHECK-NEXT: | | | | | | | bool = 'true'
+    character(1) :: i
+    i = 'a'
+    return
+  end function
+
+  function func2() result(i)
+    !$omp declare target enter(func2) indirect
+    !CHECK: | | | | | OmpDeclareTargetSpecifier -> OmpDeclareTargetWithClause -> OmpClauseList -> OmpClause -> Enter -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'func2'
+    !CHECK-NEXT: | | | | | OmpClause -> Indirect -> OmpIndirectClause ->
+    character(1) :: i
+    i = 'b'
+    return
+  end function
+end module
+
+program main
+  use functions
+  implicit none
+  procedure (func), pointer :: ptr1=>func1, ptr2=>func2
+  character(1) :: val1, val2
+
+  !$omp target map(from: val1)
+  val1 = ptr1()
+  !$omp end target
+  !$omp target map(from: val2)
+  val2 = ptr2()
+  !$omp end target
+
+end program
+
+!UNPARSE: !$OMP DECLARE TARGET  ENTER(func1) INDIRECT(.true._4)
+!UNPARSE: !$OMP DECLARE TARGET  ENTER(func2) INDIRECT()
diff --git a/flang/test/Semantics/indirect01.f90 b/flang/test/Semantics/indirect01.f90
new file mode 100644
index 0000000000000..59850662275d9
--- /dev/null
+++ b/flang/test/Semantics/indirect01.f90
@@ -0,0 +1,34 @@
+! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive
+
+! RUN: not flang -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s
+
+module functions
+  implicit none
+
+  interface
+    function func() result(i)
+      character(1) :: i
+    end function
+  end interface
+
+contains
+  function func1() result(i)
+    !CHECK: The INDIRECT clause cannot be used without the ENTER clause with the DECLARE TARGET directive.
+    !$omp declare target indirect(.true.)
+    character(1) :: i
+    i = 'a'
+    return
+  end function
+end module
+
+program main
+  use functions
+  implicit none
+  procedure (func), pointer :: ptr1=>func1
+  character(1) :: val1
+
+  !$omp target map(from: val1)
+  val1 = ptr1()
+  !$omp end target
+
+end program
diff --git a/flang/test/Semantics/indirect02.f90 b/flang/test/Semantics/indirect02.f90
new file mode 100644
index 0000000000000..273f8856626b7
--- /dev/null
+++ b/flang/test/Semantics/indirect02.f90
@@ -0,0 +1,36 @@
+! This test checks the lowering of OpenMP Indirect Clause when used with the Declare Target directive
+
+! RUN: not flang -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s --check-prefix="CHECK-50"
+! RUN: not flang -fopenmp -fopenmp-version=52 %s 2>&1 | FileCheck %s --check-prefix="CHECK-52"
+
+module functions
+  implicit none
+
+  interface
+    function func() result(i)
+      character(1) :: i
+    end function
+  end interface
+
+contains
+  function func1() result(i)
+    !CHECK-50: INDIRECT clause is not allowed on directive DECLARE TARGET in OpenMP v5.0, try -fopenmp-version=51
+    !CHECK-52: not yet implemented: Unhandled clause INDIRECT in DECLARE TARGET construct
+    !$omp declare target enter(func1) indirect(.true.)
+    character(1) :: i
+    i = 'a'
+    return
+  end function
+end module
+
+program main
+  use functions
+  implicit none
+  procedure (func), pointer :: ptr1=>func1
+  character(1) :: val1
+
+  !$omp target map(from: val1)
+  val1 = ptr1()
+  !$omp end target
+
+end program
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index e0714e812e5cd..de888ff86fe91 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -701,7 +701,7 @@ template <typename T, typename I, typename E> //
 struct IndirectT {
   using InvokedByFptr = E;
   using WrapperTrait = std::true_type;
-  InvokedByFptr v;
+  OPT(InvokedByFptr) v;
 };
 
 // V5.2: [14.1.2] `init` clause
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 027692275b63b..a87111cb5a11d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -246,6 +246,7 @@ def OMPC_Inclusive : Clause<[Spelling<"inclusive">]> {
   let flangClass = "OmpObjectList";
 }
 def OMPC_Indirect : Clause<[Spelling<"indirect">]> {
+  let flangClass = "OmpIndirectClause";
 }
 def OMPC_Init : Clause<[Spelling<"init">]> {
   let clangClass = "OMPInitClause";
@@ -646,7 +647,7 @@ def OMP_EndAssumes : Directive<[Spelling<"end assumes">]> {
 def OMP_BeginDeclareTarget : Directive<[Spelling<"begin declare target">]> {
   let allowedClauses = [
     VersionedClause<OMPC_DeviceType>,
-    VersionedClause<OMPC_Indirect>,
+    VersionedClause<OMPC_Indirect, 51>,
     VersionedClause<OMPC_Link>,
     VersionedClause<OMPC_To>,
   ];
@@ -724,7 +725,7 @@ def OMP_DeclareSimd : Directive<[Spelling<"declare simd">]> {
 def OMP_DeclareTarget : Directive<[Spelling<"declare target">]> {
   let allowedClauses = [
     VersionedClause<OMPC_Enter, 52>,
-    VersionedClause<OMPC_Indirect>,
+    VersionedClause<OMPC_Indirect, 51>,
     VersionedClause<OMPC_Link>,
     VersionedClause<OMPC_To>,
   ];

From 90905a638e483dd9040c153785148fcea7c3e412 Mon Sep 17 00:00:00 2001
From: Dhruv Srivastava <dhruv.srivastava@ibm.com>
Date: Tue, 17 Jun 2025 13:49:48 +0530
Subject: [PATCH 691/851] [lldb][AIX] Added XCOFF ParseSymtab handling
 (#141577)

This PR is in reference to porting LLDB on AIX.

Link to discussions on llvm discourse and github:

1. https://discourse.llvm.org/t/port-lldb-to-ibm-aix/80640
2. https://github.com/llvm/llvm-project/issues/101657
The complete changes for porting are present in this draft PR:
https://github.com/llvm/llvm-project/pull/102601

**Description:**
Adding ParseSymtab logic after creating sections. It is able to handle
both 32 and 64 bit symbols,
without the need to add template logic.

This is an incremental PR on top of my previous couple of XCOFF support
commits.
---
 .../ObjectFile/XCOFF/ObjectFileXCOFF.cpp      | 102 +++++++++++++-
 .../Shell/ObjectFile/XCOFF/symbol-info.yaml   | 121 +++++++++++++++++
 .../Shell/ObjectFile/XCOFF/symbol-info32.yaml | 124 ++++++++++++++++++
 3 files changed, 346 insertions(+), 1 deletion(-)
 create mode 100644 lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml
 create mode 100644 lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml

diff --git a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp
index 84d05e173f83f..d2c46edaf28cb 100644
--- a/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/XCOFF/ObjectFileXCOFF.cpp
@@ -188,7 +188,107 @@ AddressClass ObjectFileXCOFF::GetAddressClass(addr_t file_addr) {
   return AddressClass::eUnknown;
 }
 
-void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) {}
+static lldb::SymbolType MapSymbolType(llvm::object::SymbolRef::Type sym_type) {
+  switch (sym_type) {
+  case llvm::object::SymbolRef::ST_Function:
+    return lldb::eSymbolTypeCode;
+  case llvm::object::SymbolRef::ST_Data:
+    return lldb::eSymbolTypeData;
+  case llvm::object::SymbolRef::ST_File:
+    return lldb::eSymbolTypeSourceFile;
+  default:
+    return lldb::eSymbolTypeInvalid;
+  }
+}
+
+void ObjectFileXCOFF::ParseSymtab(Symtab &lldb_symtab) {
+  Log *log = GetLog(LLDBLog::Object);
+  SectionList *sectionList = GetSectionList();
+
+  for (const auto &symbol_ref : m_binary->symbols()) {
+    llvm::object::XCOFFSymbolRef xcoff_sym_ref(symbol_ref);
+
+    llvm::Expected<llvm::StringRef> name_or_err = xcoff_sym_ref.getName();
+    if (!name_or_err) {
+      LLDB_LOG_ERROR(log, name_or_err.takeError(),
+                     "Unable to extract name from the xcoff symbol ref object");
+      continue;
+    }
+
+    llvm::StringRef symbolName = name_or_err.get();
+    // Remove the . prefix added during compilation. This prefix is usually
+    // added to differentiate between reference to the code and function
+    // descriptor. For instance, Adding .func will only allow user to put bp on
+    // .func, which is not known to the user, instead of func.
+    llvm::StringRef name_no_dot =
+        symbolName.starts_with(".") ? symbolName.drop_front() : symbolName;
+    auto storageClass = xcoff_sym_ref.getStorageClass();
+    // C_HIDEXT symbols are not needed to be exposed, with the exception of TOC
+    // which is responsible for storing references to global data
+    if (storageClass == XCOFF::C_HIDEXT && symbolName != "TOC") {
+
+      // Zero or muliple aux entries may suggest ambiguous data
+      if (xcoff_sym_ref.getNumberOfAuxEntries() != 1)
+        continue;
+
+      auto aux_csect_or_err = xcoff_sym_ref.getXCOFFCsectAuxRef();
+      if (!aux_csect_or_err) {
+        LLDB_LOG_ERROR(log, aux_csect_or_err.takeError(),
+                       "Unable to access xcoff csect aux ref object");
+        continue;
+      }
+
+      const llvm::object::XCOFFCsectAuxRef csect_aux = aux_csect_or_err.get();
+
+      // Only add hidden ext entries which come under Program Code, skip others
+      // as they are not useful as debugging data.
+      if (csect_aux.getStorageMappingClass() != XCOFF::XMC_PR)
+        continue;
+
+      // This does not apply to 32-bit,
+      // Only add csect symbols identified by the aux entry, as they are
+      // needed to reference section information. Skip others
+      if (m_binary->is64Bit())
+        if (csect_aux.getAuxType64() != XCOFF::AUX_CSECT)
+          continue;
+    }
+
+    Symbol symbol;
+    symbol.GetMangled().SetValue(ConstString(name_no_dot));
+
+    int16_t sectionNumber = xcoff_sym_ref.getSectionNumber();
+    // Note that XCOFF section headers are numbered from 1 and not 0.
+    size_t sectionIndex = static_cast<size_t>(sectionNumber - 1);
+    if (sectionNumber > 0) {
+      if (sectionIndex < sectionList->GetSize()) {
+
+        lldb::SectionSP section_sp =
+            sectionList->GetSectionAtIndex(sectionIndex);
+        if (!section_sp || section_sp->GetFileAddress() == LLDB_INVALID_ADDRESS)
+          continue;
+
+        lldb::addr_t file_addr = section_sp->GetFileAddress();
+        lldb::addr_t symbolValue = xcoff_sym_ref.getValue();
+        if (symbolValue < file_addr)
+          continue;
+
+        symbol.GetAddressRef() = Address(section_sp, symbolValue - file_addr);
+      }
+    }
+
+    Expected<llvm::object::SymbolRef::Type> sym_type_or_err =
+        symbol_ref.getType();
+    if (!sym_type_or_err) {
+      LLDB_LOG_ERROR(log, sym_type_or_err.takeError(),
+                     "Unable to access xcoff symbol type");
+      continue;
+    }
+
+    symbol.SetType(MapSymbolType(sym_type_or_err.get()));
+
+    lldb_symtab.AddSymbol(symbol);
+  }
+}
 
 bool ObjectFileXCOFF::IsStripped() { return false; }
 
diff --git a/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml
new file mode 100644
index 0000000000000..6b1a40a283445
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info.yaml
@@ -0,0 +1,121 @@
+# RUN: yaml2obj %s -o %t
+# RUN: %lldb %t -o "image dump symtab" -o exit | FileCheck %s
+# CHECK: Index   UserID     DSX Type            File Address/Value Load Address       Size               Flags      Name
+# CHECK: [    0] 4294967295     Invalid         0xffffffffffffffff                    0x0000000000000000 0x00000000 errno
+# CHECK: [    1] 4294967295     Code            0x0000000100000500                    0x0000000000000398 0x00000000 __threads_init
+# CHECK: [    2] 4294967295     Data            0x0000000110000a70                    0x0000000000000060 0x00000000 __threads_init
+# CHECK: [    3] 4294967295     Invalid         0x0000000110000ad0                    0x00000000000000b0 0x00000000 TOC
+# CHECK: [    4] 4294967295     Invalid         0x0000000100000898                    0x00000000100001d8 0x00000000 text
+# CHECK: [    5] 4294967295     Code            0x0000000100000898                    0x00000000100001d8 0x00000000 main
+
+--- !XCOFF
+FileHeader:
+  MagicNumber:     0x1F7
+  NumberOfSections: 2
+  CreationTime:    000000000
+  Flags:           0x0002
+Sections:
+  - Name:            .text
+    Address:         0x100000438
+    Size:            0x38
+    FileOffsetToData: 0x0
+    FileOffsetToLineNumbers: 0x0
+    NumberOfLineNumbers: 0x0
+    Flags:           [ STYP_TEXT ]
+    SectionData:     E8C20000
+  - Name:            .data
+    Address:         0x1100008D2
+    Size:            0x2AE
+    FileOffsetToData: 0x8D2
+    FileOffsetToRelocations: 0x132E
+    FileOffsetToLineNumbers: 0x0
+    NumberOfRelocations: 0x22
+    NumberOfLineNumbers: 0x0
+    Flags:           [ STYP_DATA ]
+    SectionData:     '' 
+Symbols:
+  - Name:            errno
+    Value:           0x0
+    Section:         N_UNDEF
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 0
+        StorageMappingClass: XMC_RW
+        SectionOrLengthLo: 0
+        SectionOrLengthHi: 0
+  - Name:            .__threads_init
+    Value:           0x100000500
+    Section:         .text
+    Type:            0x20
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 2
+        StorageMappingClass: XMC_PR
+        SectionOrLengthLo: 80
+        SectionOrLengthHi: 0
+  - Name:            __threads_init
+    Value:           0x110000A70
+    Section:         .data
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 25
+        StorageMappingClass: XMC_DS
+        SectionOrLengthLo: 24
+        SectionOrLengthHi: 0
+  - Name:            TOC
+    Value:           0x110000AD0
+    Section:         .data
+    Type:            0x0
+    StorageClass:    C_HIDEXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 25
+        StorageMappingClass: XMC_TC0
+        SectionOrLengthLo: 0
+        SectionOrLengthHi: 0
+  - Name:            .text
+    Value:           0x100000898
+    Section:         .text
+    Type:            0x0
+    StorageClass:    C_HIDEXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 17
+        StorageMappingClass: XMC_PR
+        SectionOrLengthLo: 58
+        SectionOrLengthHi: 0
+  - Name:            .main
+    Value:           0x100000898
+    Section:         .text
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        SymbolAlignmentAndType: 2
+        StorageMappingClass: XMC_PR
+        SectionOrLengthLo: 135
+        SectionOrLengthHi: 0
+...
diff --git a/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml
new file mode 100644
index 0000000000000..59c018ba0e426
--- /dev/null
+++ b/lldb/test/Shell/ObjectFile/XCOFF/symbol-info32.yaml
@@ -0,0 +1,124 @@
+# RUN: yaml2obj %s -o %t
+# RUN: %lldb %t -o "image dump symtab" -o exit | FileCheck %s
+# CHECK: Index   UserID     DSX Type            File Address/Value Load Address       Size               Flags      Name
+# CHECK: [    0] 4294967295     Invalid         0xffffffffffffffff                    0x0000000000000000 0x00000000 errno
+# CHECK: [    1] 4294967295     Code            0x0000000010000320                    0x0000000000000420 0x00000000 __threads_init
+# CHECK: [    2] 4294967295     Data            0x0000000020000920                    0x000000000000003c 0x00000000 __threads_init
+# CHECK: [    3] 4294967295     Invalid         0x000000002000095c                    0x0000000000000060 0x00000000 TOC
+# CHECK: [    4] 4294967295     Invalid         0x0000000010000740                    0x000000000000003a 0x00000000 text
+# CHECK: [    5] 4294967295     Invalid         0x0000000010000740                    0x000000000000003a 0x00000000 main
+
+--- !XCOFF
+FileHeader:
+  MagicNumber:     0x1DF
+  NumberOfSections: 2
+  CreationTime:    000000000
+  Flags:           0x1002
+Sections:
+  - Name:            .text
+    Address:         0x10000268
+    Size:            0x512
+    FileOffsetToData: 0x268
+    FileOffsetToRelocations: 0xECC
+    FileOffsetToLineNumbers: 0x0
+    NumberOfRelocations: 0x24
+    NumberOfLineNumbers: 0x0
+    Flags:           [ STYP_TEXT ]
+    SectionData:     80C20000
+  - Name:            .data
+    Address:         0x2000077A
+    Size:            0x242
+    FileOffsetToData: 0x77A
+    FileOffsetToRelocations: 0x1034
+    FileOffsetToLineNumbers: 0x0
+    NumberOfRelocations: 0x25
+    NumberOfLineNumbers: 0x0
+    Flags:           [ STYP_DATA ]
+    SectionData:     ''
+Symbols:
+  - Name:            errno
+    Value:           0x0
+    Section:         N_UNDEF
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_RW
+        SectionOrLength: 0
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            .__threads_init
+    Value:           0x10000320
+    Section:         .text
+    Type:            0x20
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_PR
+        SectionOrLength: 84
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            __threads_init
+    Value:           0x20000920
+    Section:         .data
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_DS
+        SectionOrLength: 12
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            TOC
+    Value:           0x2000095C
+    Section:         .data
+    Type:            0x0
+    StorageClass:    C_HIDEXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_TC0
+        SectionOrLength: 0
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            .text
+    Value:           0x10000740
+    Section:         .text
+    Type:            0x0
+    StorageClass:    C_HIDEXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_PR
+        SectionOrLength: 58
+        StabInfoIndex:   0
+        StabSectNum:     0
+  - Name:            .main
+    Value:           0x10000740
+    Section:         .text
+    Type:            0x0
+    StorageClass:    C_EXT
+    NumberOfAuxEntries: 1
+    AuxEntries:
+      - Type:            AUX_CSECT
+        ParameterHashIndex: 0
+        TypeChkSectNum:  0
+        StorageMappingClass: XMC_PR
+        SectionOrLength: 137
+        StabInfoIndex:   0
+        StabSectNum:     0
+
+...

From 437945b28838c71fb32a76f6433cef8807967f71 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Tue, 17 Jun 2025 09:26:52 +0100
Subject: [PATCH 692/851] [AArch64][SVE] Move incorrectly placed assert
 (#144318)

This assert is only valid if FPAfterSVECalleeSaves is true, for the
default layout resolving CSR works correctly.
---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 2650c621e19f6..7ffe779f2408d 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2896,8 +2896,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
       isTargetWindows(MF) && AFI->getSVECalleeSavedStackSize();
 
   if (isSVE) {
-    assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() &&
-           "Math isn't correct for CSRs with FPAfterSVECalleeSaves");
     StackOffset FPOffset =
         StackOffset::get(-AFI->getCalleeSaveBaseToFrameRecordOffset(), ObjectOffset);
     StackOffset SPOffset =
@@ -2905,6 +2903,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
         StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
                          ObjectOffset);
     if (FPAfterSVECalleeSaves) {
+      assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() &&
+             "Math isn't correct for CSRs with FPAfterSVECalleeSaves");
       FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize());
     }
     // Always use the FP for SVE spills if available and beneficial.

From 85b110e0419af4b1b9a238b6978029e20010e794 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Tue, 17 Jun 2025 09:30:35 +0100
Subject: [PATCH 693/851] [mlir][vector] Add documentation note on adding new
 ops (#144308)

This adds a note requesting that additions of new ops to the Vector
dialect go through an RFC process. The goal is to clarify expectations
for contributors.

Note: this documents an existing (though previously unwritten)
convention. See, e.g.:
* https://discourse.llvm.org/t/rfc-adding-vector-to-elements-op-to-the-vector-dialect
* https://discourse.llvm.org/t/rfc-improving-gather-codegen-for-vector-dialect
---
 mlir/docs/Dialects/Vector.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/docs/Dialects/Vector.md b/mlir/docs/Dialects/Vector.md
index ade0068c56fb6..ebeb0a2de0ff1 100644
--- a/mlir/docs/Dialects/Vector.md
+++ b/mlir/docs/Dialects/Vector.md
@@ -1,5 +1,8 @@
 # 'vector' Dialect
 
+**Please post an RFC on the [forum](https://llvm.discourse.group/c/mlir/31)
+before adding  any operation in this dialect.**
+
 [TOC]
 
 MLIR supports multi-dimensional `vector` types and custom operations on those

From e2551c14d0d9180ccaef9d33c524d83e7813a361 Mon Sep 17 00:00:00 2001
From: Arseniy Zaostrovnykh <necto.ne@gmail.com>
Date: Tue, 17 Jun 2025 10:31:38 +0200
Subject: [PATCH 694/851] [analyzer] Fix a false memory leak reports involving
 placement new (#144341)

Placement new does not allocate memory, so it should not be reported as
a memory leak. A recent MallocChecker refactor changed inlining of
placement-new calls with manual evaluation by MallocChecker.
https://github.com/llvm/llvm-project/commit/339282d49f5310a2837da45c0ccc19da15675554

This change avoids marking the value returned by placement new as
allocated and hence avoids the false leak reports.

Note that the there are two syntaxes to invoke placement new:
`new (p) int` and an explicit operator call `operator new(sizeof(int), p)`.
The first syntax was already properly handled by the engine.
This change corrects handling of the second syntax.

CPP-6375
---
 .../StaticAnalyzer/Checkers/MallocChecker.cpp | 22 +++++++++++
 .../test/Analysis/NewDelete-checker-test.cpp  | 38 ++++++++++++++++++-
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
index fef33509c0b6e..35e98a5e2719a 100644
--- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp
@@ -1371,6 +1371,20 @@ void MallocChecker::checkIfFreeNameIndex(ProgramStateRef State,
   C.addTransition(State);
 }
 
+const Expr *getPlacementNewBufferArg(const CallExpr *CE,
+                                     const FunctionDecl *FD) {
+  // Checking for signature:
+  // void* operator new  ( std::size_t count, void* ptr );
+  // void* operator new[]( std::size_t count, void* ptr );
+  if (CE->getNumArgs() != 2 || (FD->getOverloadedOperator() != OO_New &&
+                                FD->getOverloadedOperator() != OO_Array_New))
+    return nullptr;
+  auto BuffType = FD->getParamDecl(1)->getType();
+  if (BuffType.isNull() || !BuffType->isVoidPointerType())
+    return nullptr;
+  return CE->getArg(1);
+}
+
 void MallocChecker::checkCXXNewOrCXXDelete(ProgramStateRef State,
                                            const CallEvent &Call,
                                            CheckerContext &C) const {
@@ -1386,6 +1400,14 @@ void MallocChecker::checkCXXNewOrCXXDelete(ProgramStateRef State,
   // processed by the checkPostStmt callbacks for CXXNewExpr and
   // CXXDeleteExpr.
   const FunctionDecl *FD = C.getCalleeDecl(CE);
+  if (const auto *BufArg = getPlacementNewBufferArg(CE, FD)) {
+    // Placement new does not allocate memory
+    auto RetVal = State->getSVal(BufArg, Call.getLocationContext());
+    State = State->BindExpr(CE, C.getLocationContext(), RetVal);
+    C.addTransition(State);
+    return;
+  }
+
   switch (FD->getOverloadedOperator()) {
   case OO_New:
     State = MallocMemAux(C, Call, CE->getArg(0), UndefinedVal(), State,
diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp
index 06754f669b1e6..da0eef7c52bd8 100644
--- a/clang/test/Analysis/NewDelete-checker-test.cpp
+++ b/clang/test/Analysis/NewDelete-checker-test.cpp
@@ -26,9 +26,10 @@
 // RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
 //
 // RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \
-// RUN:   -verify=expected,leak \
+// RUN:   -verify=expected,leak,inspection \
 // RUN:   -analyzer-checker=core \
-// RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks
+// RUN:   -analyzer-checker=cplusplus.NewDeleteLeaks \
+// RUN:   -analyzer-checker=debug.ExprInspection
 
 #include "Inputs/system-header-simulator-cxx.h"
 
@@ -63,6 +64,39 @@ void testGlobalNoThrowPlacementExprNewBeforeOverload() {
   int *p = new(std::nothrow) int;
 } // leak-warning{{Potential leak of memory pointed to by 'p'}}
 
+//----- Standard pointer placement operators
+void testGlobalPointerPlacementNew() {
+  int i;
+  void *p1 = operator new(0, &i); // no leak: placement new never allocates
+  void *p2 = operator new[](0, &i); // no leak
+  int *p3 = new(&i) int; // no leak
+  int *p4 = new(&i) int[0]; // no leak
+}
+
+template<typename T>
+void clang_analyzer_dump(T x);
+
+void testPlacementNewBufValue() {
+  int i = 10;
+  int *p = new(&i) int;
+  clang_analyzer_dump(p); // inspection-warning{{&i}}
+  clang_analyzer_dump(*p); // inspection-warning{{10}}
+}
+
+void testPlacementNewBufValueExplicitOp() {
+  int i = 10;
+  int *p = (int*)operator new(sizeof(int), &i);
+  clang_analyzer_dump(p); // inspection-warning{{&i}}
+  clang_analyzer_dump(*p); // inspection-warning{{10}}
+}
+
+void testPlacementArrNewBufValueExplicitArrOp() {
+  int i = 10;
+  int *p = (int*)operator new[](sizeof(int), &i);
+  clang_analyzer_dump(p); // inspection-warning{{&i}}
+  clang_analyzer_dump(*p); // inspection-warning{{10}}
+}
+
 //----- Other cases
 void testNewMemoryIsInHeap() {
   int *p = new int;

From 308b97a5d48583680f56b888165295c62744b9e5 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 10:33:04 +0200
Subject: [PATCH 695/851] [LICM] Regenerate test checks (NFC)

---
 llvm/test/Transforms/LICM/funclet.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/LICM/funclet.ll b/llvm/test/Transforms/LICM/funclet.ll
index cacb0c90d3702..1cdd12ddc98e7 100644
--- a/llvm/test/Transforms/LICM/funclet.ll
+++ b/llvm/test/Transforms/LICM/funclet.ll
@@ -14,7 +14,7 @@ define void @test1(ptr %s, i1 %b) personality ptr @__CxxFrameHandler3 {
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[TRY_CONT_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    invoke void @may_throw()
-; CHECK-NEXT:    to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK-NEXT:            to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
 ; CHECK-NEXT:    [[DOTLCSSA1:%.*]] = phi i32 [ [[TMP0]], [[WHILE_BODY]] ]
 ; CHECK-NEXT:    [[CS:%.*]] = catchswitch within none [label %catch] unwind to caller
@@ -59,7 +59,7 @@ define void @test2(ptr %s, i1 %b) personality ptr @__CxxFrameHandler3 {
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[TRY_CONT:%.*]], label [[WHILE_BODY:%.*]]
 ; CHECK:       while.body:
 ; CHECK-NEXT:    invoke void @may_throw()
-; CHECK-NEXT:    to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK-NEXT:            to label [[WHILE_COND]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       catch.dispatch:
 ; CHECK-NEXT:    [[CP:%.*]] = cleanuppad within none []
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @pure_computation() [ "funclet"(token [[CP]]) ]
@@ -114,10 +114,10 @@ define void @test3(i1 %a, i1 %b, i1 %c) personality ptr @__CxxFrameHandler3 {
 ; CHECK-NEXT:    [[CS]] = catchswitch within none [label %catch.object.Throwable] unwind to caller
 ; CHECK:       forbody:
 ; CHECK-NEXT:    invoke void @may_throw()
-; CHECK-NEXT:    to label [[POSTINVOKE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK-NEXT:            to label [[POSTINVOKE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
 ; CHECK:       else:
 ; CHECK-NEXT:    invoke void @may_throw()
-; CHECK-NEXT:    to label [[FORCOND_BACKEDGE]] unwind label [[CATCH_DISPATCH]]
+; CHECK-NEXT:            to label [[FORCOND_BACKEDGE]] unwind label [[CATCH_DISPATCH]]
 ;
 entry:
   %.frame = alloca i8, align 4

From 2c90ebf3a79e25db3e6bcd9b3a66590b5996de4d Mon Sep 17 00:00:00 2001
From: Abid Qadeer <haqadeer@amd.com>
Date: Tue, 17 Jun 2025 09:34:47 +0100
Subject: [PATCH 696/851] [OMPIRBuilder][debug] Don't drop debug info for loop
 constructs. (#144393)

In OMPIRBuilder, we have many cases where we don't handle the debug
location correctly while changing the location or insertion point. This
is one of those cases.

Please see the following test program.
```
program main
  implicit none
  integer i, j
  integer array(16384)

!$omp target teams distribute
  DO i=1,16384
    !$omp parallel do
      DO j=1,16384
        array(j) = i
      ENDDO
    !$omp end parallel do
  ENDDO
!$omp end target teams distribute

print *, array
end program main
```

When tried to compile with the follownig command
`flang -g -O2 -fopenmp  test.f90 -o test  --offload-arch=gfx90a`

will fail in the verification with the following errors: `!dbg
attachment points at wrong subprogram for function`

This happens because we were dropping the debug location in the
createCanonicalLoop and the call to the functions like
`__kmpc_distribute_static_4u` get generated without a debug location.
When it gets inlined, the locations inside it are not adjusted as the
call instruction does not have the debug locations
(`llvm/lib/Transforms/Utils/InlineFunction.cpp:fixupLineNumbers`). Later
Verifier finds that the caller have instructions with debug locations
that point to another function and fails.

The fix is simple to not drop the debug location.
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  6 +-
 .../LLVMIR/omptarget-debug-loop-loc.mlir      | 66 +++++++++++++++++++
 2 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index cf17a84242c70..7cbbbff511c88 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4184,7 +4184,11 @@ Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
     Value *IndVar = Builder.CreateAdd(Span, Start);
     return BodyGenCB(Builder.saveIP(), IndVar);
   };
-  LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
+  LocationDescription LoopLoc =
+      ComputeIP.isSet()
+          ? Loc
+          : LocationDescription(Builder.saveIP(),
+                                Builder.getCurrentDebugLocation());
   return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
 }
 
diff --git a/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir b/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir
new file mode 100644
index 0000000000000..a755cef98d7c4
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-debug-loop-loc.mlir
@@ -0,0 +1,66 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
+  omp.private {type = private} @_QFEj_private_i32 : i32 loc(#loc1)
+  omp.private {type = private} @_QFEi_private_i32 : i32 loc(#loc1)
+  llvm.func @test() {
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x i32 {bindc_name = "j"} : (i64) -> !llvm.ptr<5> loc(#loc4)
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr loc(#loc4)
+    %6 = llvm.mlir.constant(1 : i64) : i64
+    %7 = llvm.alloca %6 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5> loc(#loc4)
+    %8 = llvm.addrspacecast %7 : !llvm.ptr<5> to !llvm.ptr
+    %9 = llvm.mlir.constant(16383 : index) : i64
+    %10 = llvm.mlir.constant(0 : index) : i64
+    %11 = llvm.mlir.constant(1 : index) : i64
+    %12 = llvm.mlir.constant(16384 : i32) : i32
+    %14 = llvm.mlir.addressof @_QFEarray : !llvm.ptr
+    %18 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc3)
+    %20 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "j"} loc(#loc3)
+    %22 = omp.map.bounds lower_bound(%10 : i64) upper_bound(%9 : i64) extent(%9 : i64) stride(%11 : i64) start_idx(%11 : i64) loc(#loc3)
+    %23 = omp.map.info var_ptr(%14 : !llvm.ptr, !llvm.array<16384 x i32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%22) -> !llvm.ptr {name = "array"} loc(#loc3)
+    %24 = omp.map.info var_ptr(%8 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc3)
+    omp.target map_entries(%18 -> %arg0, %20 -> %arg2, %23 -> %arg4, %24 -> %arg5 : !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+      %25 = llvm.mlir.constant(1 : i32) : i32
+      %27 = llvm.mlir.constant(16384 : i32) : i32
+      omp.teams {
+        omp.distribute private(@_QFEi_private_i32 %arg5 -> %arg6 : !llvm.ptr) {
+          omp.loop_nest (%arg7) : i32 = (%25) to (%27) inclusive step (%25) {
+            omp.parallel {
+              omp.wsloop private(@_QFEj_private_i32 %arg2 -> %arg8 : !llvm.ptr) {
+                omp.loop_nest (%arg9) : i32 = (%25) to (%27) inclusive step (%25) {
+                  llvm.store %arg9, %arg8 : i32, !llvm.ptr loc(#loc9)
+                  omp.yield
+                } loc(#loc9)
+              } loc(#loc9)
+              omp.terminator loc(#loc9)
+            } loc(#loc9)
+            omp.yield loc(#loc9)
+          } loc(#loc9)
+        } loc(#loc9)
+        omp.terminator loc(#loc9)
+      } loc(#loc9)
+      omp.terminator loc(#loc9)
+    } loc(#loc9)
+    llvm.return loc(#loc9)
+  } loc(#loc14)
+  llvm.mlir.global internal @_QFEarray() {addr_space = 0 : i32} : !llvm.array<16384 x i32> {
+    %0 = llvm.mlir.zero : !llvm.array<16384 x i32>
+    llvm.return %0 : !llvm.array<16384 x i32>
+  } loc(#loc2)
+}
+#di_file = #llvm.di_file<"test.f90" in "">
+#di_null_type = #llvm.di_null_type
+#loc1 = loc("test.f90":4:23)
+#loc2 = loc("test.f90":4:15)
+#loc3 = loc("test.f90":1:7)
+#loc4 = loc("test.f90":4:18)
+#loc9 = loc("test.f90":13:11)
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[0]<>, sourceLanguage = DW_LANG_Fortran95, file = #di_file, producer = "flang", isOptimized = true, emissionKind = LineTablesOnly>
+#di_subroutine_type = #llvm.di_subroutine_type<callingConvention = DW_CC_program, types = #di_null_type>
+#di_subprogram = #llvm.di_subprogram<id = distinct[1]<>, compileUnit = #di_compile_unit, scope = #di_file, name = "main", file = #di_file, subprogramFlags = "Definition|Optimized|MainSubprogram", type = #di_subroutine_type>
+#loc14 = loc(fused<#di_subprogram>[#loc3])
+
+
+// CHECK: call void @__kmpc_distribute_static{{.*}}!dbg
+

From 0f8c72160ec001599ecb29f0fa182c5550f5dd0a Mon Sep 17 00:00:00 2001
From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com>
Date: Tue, 17 Jun 2025 09:45:18 +0100
Subject: [PATCH 697/851] [C++20][Modules] Disable preferred_name when writing
 a C++20 header unit (#144377)

https://reviews.llvm.org/D130331 added workaround for named modules
only. But the same issue happens for headees units. Link issue #56490
---
 clang/include/clang/Serialization/ASTWriter.h |  4 ++
 clang/lib/Serialization/ASTWriter.cpp         |  5 +-
 .../Modules/preferred_name_header_unit.cpp    | 64 +++++++++++++++++++
 3 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Modules/preferred_name_header_unit.cpp

diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h
index cf4ae610ea51f..0f49646f3f022 100644
--- a/clang/include/clang/Serialization/ASTWriter.h
+++ b/clang/include/clang/Serialization/ASTWriter.h
@@ -899,6 +899,10 @@ class ASTWriter : public ASTDeserializationListener,
     return WritingModule && WritingModule->isNamedModule();
   }
 
+  bool isWritingStdCXXHeaderUnit() const {
+    return WritingModule && WritingModule->isHeaderUnit();
+  }
+
   bool isGeneratingReducedBMI() const { return GeneratingReducedBMI; }
 
   bool getDoneWritingDeclsAndTypes() const { return DoneWritingDeclsAndTypes; }
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index ab1b5b333e06a..be22ee5221911 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5167,8 +5167,9 @@ void ASTRecordWriter::AddAttr(const Attr *A) {
   // FIXME: Clang can't handle the serialization/deserialization of
   // preferred_name properly now. See
   // https://github.com/llvm/llvm-project/issues/56490 for example.
-  if (!A || (isa<PreferredNameAttr>(A) &&
-             Writer->isWritingStdCXXNamedModules()))
+  if (!A ||
+      (isa<PreferredNameAttr>(A) && (Writer->isWritingStdCXXNamedModules() ||
+                                     Writer->isWritingStdCXXHeaderUnit())))
     return Record.push_back(0);
 
   Record.push_back(A->getKind() + 1); // FIXME: stable encoding, target attrs
diff --git a/clang/test/Modules/preferred_name_header_unit.cpp b/clang/test/Modules/preferred_name_header_unit.cpp
new file mode 100644
index 0000000000000..b1f1e3579f31e
--- /dev/null
+++ b/clang/test/Modules/preferred_name_header_unit.cpp
@@ -0,0 +1,64 @@
+// RUN: rm -fR %t
+// RUN: split-file %s %t
+// RUN: cd %t
+// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-name=h1.h -emit-header-unit -xc++-user-header h1.h -o h1.pcm
+// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-map-file=module.modulemap -fmodule-file=h1.h=h1.pcm main.cpp -o main.o
+
+//--- module.modulemap
+module "h1.h" {
+  header "h1.h"
+  export *
+}
+
+//--- h0.h
+// expected-no-diagnostics
+#pragma once
+namespace std {
+
+template <class _CharT, class = _CharT, class = _CharT> class basic_string;
+
+namespace pmr {
+using string = basic_string<char>;
+}
+
+template <class, class, class>
+class __attribute__((__preferred_name__(pmr::string))) basic_string;
+
+template <class> class basic_string_view {};
+
+template <class _CharT, class _Traits, class _Allocator> class basic_string {
+  typedef _CharT value_type;
+  typedef _Allocator allocator_type;
+  struct __rep;
+public:
+  template <class _Tp>
+  basic_string(_Tp) {}
+  basic_string operator+=(value_type);
+};
+
+namespace filesystem {
+class path {
+  typedef char value_type;
+  value_type preferred_separator;
+  typedef basic_string<value_type> string_type;
+  typedef basic_string_view<value_type> __string_view;
+  template <class _Source> void append(_Source) {
+    __pn_ += preferred_separator;
+  }
+  void __root_directory() { append(string_type(__string_view{})); }
+  string_type __pn_;
+};
+} // namespace filesystem
+} // namespace std
+
+//--- h1.h
+// expected-no-diagnostics
+#pragma once
+
+#include "h0.h"
+
+//--- main.cpp
+// expected-no-diagnostics
+#include "h0.h"
+
+import "h1.h";

From 26d082d330e4d8d1fc3194b4b87ede9332a297f5 Mon Sep 17 00:00:00 2001
From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com>
Date: Tue, 17 Jun 2025 09:47:15 +0100
Subject: [PATCH 698/851] [clang-tidy][performance-unnecessary-value-param]
 Avoid in coroutines (#140912)

Summary:
Replacing by-value parameters with passing by-reference is not safe for
coroutines because the caller may be executed in parallel with the
callee, which increases the chances of resulting in dangling references
and hard-to-find crashes. See for the reference
[cppcoreguidelines-avoid-reference-coroutine-parameters](https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-reference-coroutine-parameters.html).

Test Plan: check-clang-tools
---
 .../UnnecessaryValueParamCheck.cpp            | 18 +++--
 .../performance/UnnecessaryValueParamCheck.h  |  1 +
 clang-tools-extra/docs/ReleaseNotes.rst       |  2 +
 .../performance/unnecessary-value-param.rst   |  9 ++-
 .../unnecessary-value-param-coroutine.cpp     | 65 +++++++++++++++++++
 5 files changed, 87 insertions(+), 8 deletions(-)
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp

diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
index a877f9a7ee912..d89c3a69fc841 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp
@@ -50,7 +50,8 @@ UnnecessaryValueParamCheck::UnnecessaryValueParamCheck(
                                         utils::IncludeSorter::IS_LLVM),
                areDiagsSelfContained()),
       AllowedTypes(
-          utils::options::parseStringList(Options.get("AllowedTypes", ""))) {}
+          utils::options::parseStringList(Options.get("AllowedTypes", ""))),
+      IgnoreCoroutines(Options.get("IgnoreCoroutines", true)) {}
 
 void UnnecessaryValueParamCheck::registerMatchers(MatchFinder *Finder) {
   const auto ExpensiveValueParamDecl = parmVarDecl(
@@ -61,12 +62,14 @@ void UnnecessaryValueParamCheck::registerMatchers(MatchFinder *Finder) {
                            matchers::matchesAnyListedName(AllowedTypes))))))),
       decl().bind("param"));
   Finder->addMatcher(
-      traverse(
-          TK_AsIs,
-          functionDecl(hasBody(stmt()), isDefinition(), unless(isImplicit()),
-                       unless(cxxMethodDecl(anyOf(isOverride(), isFinal()))),
-                       has(typeLoc(forEach(ExpensiveValueParamDecl))),
-                       decl().bind("functionDecl"))),
+      traverse(TK_AsIs,
+               functionDecl(
+                   hasBody(IgnoreCoroutines ? stmt(unless(coroutineBodyStmt()))
+                                            : stmt()),
+                   isDefinition(), unless(isImplicit()),
+                   unless(cxxMethodDecl(anyOf(isOverride(), isFinal()))),
+                   has(typeLoc(forEach(ExpensiveValueParamDecl))),
+                   decl().bind("functionDecl"))),
       this);
 }
 
@@ -123,6 +126,7 @@ void UnnecessaryValueParamCheck::storeOptions(
   Options.store(Opts, "IncludeStyle", Inserter.getStyle());
   Options.store(Opts, "AllowedTypes",
                 utils::options::serializeStringList(AllowedTypes));
+  Options.store(Opts, "IgnoreCoroutines", IgnoreCoroutines);
 }
 
 void UnnecessaryValueParamCheck::onEndOfTranslationUnit() {
diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
index 8bfd814d16357..b52043416e769 100644
--- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
+++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h
@@ -46,6 +46,7 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck {
   ExprMutationAnalyzer::Memoized MutationAnalyzerCache;
   utils::IncludeInserter Inserter;
   const std::vector<StringRef> AllowedTypes;
+  bool IgnoreCoroutines;
 };
 
 } // namespace clang::tidy::performance
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 19ccd1790e757..3c1ca2f929044 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -265,6 +265,8 @@ Changes in existing checks
   <clang-tidy/checks/performance/unnecessary-value-param>` check performance by
   tolerating fix-it breaking compilation when functions is used as pointers
   to avoid matching usage of functions within the current compilation unit.
+  Added an option `IgnoreCoroutines` with the default value `true` to
+  suppress this check for coroutines where passing by reference may be unsafe.
 
 - Improved :doc:`readability-convert-member-functions-to-static
   <clang-tidy/checks/readability/convert-member-functions-to-static>` check by
diff --git a/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst b/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst
index dc86530b95f13..cd25d7d94d99b 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/performance/unnecessary-value-param.rst
@@ -56,7 +56,7 @@ Will become:
 
 Because the fix-it needs to change the signature of the function, it may break
 builds if the function is used in multiple translation units or some codes
-depends on funcion signatures.
+depends on function signatures.
 
 Options
 -------
@@ -74,3 +74,10 @@ Options
    default is empty. If a name in the list contains the sequence `::`, it is
    matched against the qualified type name (i.e. ``namespace::Type``),
    otherwise it is matched against only the type name (i.e. ``Type``).
+
+.. option:: IgnoreCoroutines
+
+   A boolean specifying whether the check should suggest passing parameters by
+   reference in coroutines. Passing parameters by reference in coroutines may
+   not be safe, please see :doc:`cppcoreguidelines-avoid-reference-coroutine-parameters <../cppcoreguidelines/avoid-reference-coroutine-parameters>`
+   for more information. Default is `true`.
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp
new file mode 100644
index 0000000000000..0a84dc4676470
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-coroutine.cpp
@@ -0,0 +1,65 @@
+// RUN: %check_clang_tidy -std=c++20-or-later %s performance-unnecessary-value-param %t -- -fix-errors
+// RUN: %check_clang_tidy -std=c++20-or-later %s performance-unnecessary-value-param %t -- \
+// RUN:   -config='{CheckOptions: {performance-unnecessary-value-param.IgnoreCoroutines: true}}' -fix-errors
+// RUN: %check_clang_tidy -check-suffix=ALLOWED -std=c++20-or-later %s performance-unnecessary-value-param %t -- \
+// RUN:   -config='{CheckOptions: {performance-unnecessary-value-param.IgnoreCoroutines: false}}' -fix-errors
+
+namespace std {
+
+template <class Ret, typename... T> struct coroutine_traits {
+  using promise_type = typename Ret::promise_type;
+};
+
+template <class Promise = void> struct coroutine_handle {
+  static coroutine_handle from_address(void *) noexcept;
+  static coroutine_handle from_promise(Promise &promise);
+  constexpr void *address() const noexcept;
+};
+
+template <> struct coroutine_handle<void> {
+  template <class PromiseType>
+  coroutine_handle(coroutine_handle<PromiseType>) noexcept;
+  static coroutine_handle from_address(void *);
+  constexpr void *address() const noexcept;
+};
+
+struct suspend_always {
+  bool await_ready() noexcept { return false; }
+  void await_suspend(coroutine_handle<>) noexcept {}
+  void await_resume() noexcept {}
+};
+
+struct suspend_never {
+  bool await_ready() noexcept { return true; }
+  void await_suspend(coroutine_handle<>) noexcept {}
+  void await_resume() noexcept {}
+};
+
+} // namespace std
+
+struct ReturnObject {
+    struct promise_type {
+        ReturnObject get_return_object() { return {}; }
+        ReturnObject return_void() { return {}; }
+        std::suspend_always initial_suspend() { return {}; }
+        std::suspend_always final_suspend() noexcept { return {}; }
+        void unhandled_exception() {}
+        std::suspend_always yield_value(int value) { return {}; }
+    };
+};
+
+struct A {
+  A(const A&);
+};
+
+ReturnObject foo_coroutine(const A a) {
+// CHECK-MESSAGES-ALLOWED: [[@LINE-1]]:36: warning: the const qualified parameter 'a'
+// CHECK-FIXES: ReturnObject foo_coroutine(const A a) {
+  co_return;
+}
+
+ReturnObject foo_not_coroutine(const A a) {
+// CHECK-MESSAGES: [[@LINE-1]]:40: warning: the const qualified parameter 'a'
+// CHECK-MESSAGES-ALLOWED: [[@LINE-2]]:40: warning: the const qualified parameter 'a'
+  return ReturnObject{};
+}

From 5dc632dd56c61fb768424cc8027760490683d00d Mon Sep 17 00:00:00 2001
From: Rolf Morel <rolf.morel@intel.com>
Date: Tue, 17 Jun 2025 10:53:11 +0200
Subject: [PATCH 699/851] [MLIR][VSCode] update packages to fix CVE-2022-25883
 and CVE-2022-3517 (#144479)

Fixes issue #140869.
---
 mlir/utils/vscode/package-lock.json | 127 +++++++++++++---------------
 mlir/utils/vscode/package.json      |   2 +
 2 files changed, 62 insertions(+), 67 deletions(-)

diff --git a/mlir/utils/vscode/package-lock.json b/mlir/utils/vscode/package-lock.json
index 1efd5779f5cb2..28454c680177b 100644
--- a/mlir/utils/vscode/package-lock.json
+++ b/mlir/utils/vscode/package-lock.json
@@ -10,6 +10,8 @@
       "dependencies": {
         "base64-js": "^1.5.1",
         "chokidar": "3.5.2",
+        "minimatch": "^3.0.5",
+        "semver": "^7.5.2",
         "vscode-languageclient": "^8.0.2-next.5"
       },
       "devDependencies": {
@@ -89,6 +91,16 @@
         "keytar": "^7.7.0"
       }
     },
+    "node_modules/@vscode/vsce/node_modules/semver": {
+      "version": "5.7.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+      "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
+      "dev": true,
+      "license": "ISC",
+      "bin": {
+        "semver": "bin/semver"
+      }
+    },
     "node_modules/@vscode/vsce/node_modules/xml2js": {
       "version": "0.5.0",
       "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
@@ -1195,9 +1207,10 @@
       }
     },
     "node_modules/minimatch": {
-      "version": "3.0.4",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
-      "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==",
+      "version": "3.0.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.5.tgz",
+      "integrity": "sha512-tUpxzX0VAzJHjLu0xUfFv1gwVp9ba3IOuRAVH2EGuRW8a5emA2FlACLqiT/lDVtS1W+TGNwqz3sWaNyLgDJWuw==",
+      "license": "ISC",
       "dependencies": {
         "brace-expansion": "^1.1.7"
       },
@@ -1262,22 +1275,6 @@
         "node": ">=10"
       }
     },
-    "node_modules/node-abi/node_modules/semver": {
-      "version": "7.3.7",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz",
-      "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==",
-      "dev": true,
-      "optional": true,
-      "dependencies": {
-        "lru-cache": "^6.0.0"
-      },
-      "bin": {
-        "semver": "bin/semver.js"
-      },
-      "engines": {
-        "node": ">=10"
-      }
-    },
     "node_modules/node-addon-api": {
       "version": "4.3.0",
       "resolved": "https://registry.npmjs.org/node-addon-api/-/node-addon-api-4.3.0.tgz",
@@ -1365,6 +1362,16 @@
         "semver": "^5.1.0"
       }
     },
+    "node_modules/parse-semver/node_modules/semver": {
+      "version": "5.7.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+      "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
+      "dev": true,
+      "license": "ISC",
+      "bin": {
+        "semver": "bin/semver"
+      }
+    },
     "node_modules/parse5": {
       "version": "6.0.1",
       "resolved": "https://registry.npmjs.org/parse5/-/parse5-6.0.1.tgz",
@@ -1567,12 +1574,18 @@
       "dev": true
     },
     "node_modules/semver": {
-      "version": "5.7.1",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-      "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==",
-      "dev": true,
+      "version": "7.5.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.2.tgz",
+      "integrity": "sha512-SoftuTROv/cRjCze/scjGyiDtcUyxw1rgYQSZY7XTmtR5hX+dm76iDbTH8TkLPHCQmlbQVSSbNZCPM2hb0knnQ==",
+      "license": "ISC",
+      "dependencies": {
+        "lru-cache": "^6.0.0"
+      },
       "bin": {
-        "semver": "bin/semver"
+        "semver": "bin/semver.js"
+      },
+      "engines": {
+        "node": ">=10"
       }
     },
     "node_modules/set-blocking": {
@@ -1901,20 +1914,6 @@
         "vscode": "^1.67.0"
       }
     },
-    "node_modules/vscode-languageclient/node_modules/semver": {
-      "version": "7.3.7",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz",
-      "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==",
-      "dependencies": {
-        "lru-cache": "^6.0.0"
-      },
-      "bin": {
-        "semver": "bin/semver.js"
-      },
-      "engines": {
-        "node": ">=10"
-      }
-    },
     "node_modules/vscode-languageserver-protocol": {
       "version": "3.17.2-next.6",
       "resolved": "https://registry.npmjs.org/vscode-languageserver-protocol/-/vscode-languageserver-protocol-3.17.2-next.6.tgz",
@@ -2049,6 +2048,12 @@
         "yazl": "^2.2.2"
       },
       "dependencies": {
+        "semver": {
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
+          "dev": true
+        },
         "xml2js": {
           "version": "0.5.0",
           "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz",
@@ -2895,9 +2900,9 @@
       "optional": true
     },
     "minimatch": {
-      "version": "3.0.4",
-      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
-      "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==",
+      "version": "3.0.5",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.5.tgz",
+      "integrity": "sha512-tUpxzX0VAzJHjLu0xUfFv1gwVp9ba3IOuRAVH2EGuRW8a5emA2FlACLqiT/lDVtS1W+TGNwqz3sWaNyLgDJWuw==",
       "requires": {
         "brace-expansion": "^1.1.7"
       }
@@ -2951,18 +2956,6 @@
       "optional": true,
       "requires": {
         "semver": "^7.3.5"
-      },
-      "dependencies": {
-        "semver": {
-          "version": "7.3.7",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz",
-          "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==",
-          "dev": true,
-          "optional": true,
-          "requires": {
-            "lru-cache": "^6.0.0"
-          }
-        }
       }
     },
     "node-addon-api": {
@@ -3035,6 +3028,14 @@
       "dev": true,
       "requires": {
         "semver": "^5.1.0"
+      },
+      "dependencies": {
+        "semver": {
+          "version": "5.7.2",
+          "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.2.tgz",
+          "integrity": "sha512-cBznnQ9KjJqU67B52RMC65CMarK2600WFnbkcaiwWq3xy/5haFJlshgnpjovMVJ+Hff49d8GEn0b87C5pDQ10g==",
+          "dev": true
+        }
       }
     },
     "parse5": {
@@ -3200,10 +3201,12 @@
       "dev": true
     },
     "semver": {
-      "version": "5.7.1",
-      "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
-      "integrity": "sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==",
-      "dev": true
+      "version": "7.5.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.2.tgz",
+      "integrity": "sha512-SoftuTROv/cRjCze/scjGyiDtcUyxw1rgYQSZY7XTmtR5hX+dm76iDbTH8TkLPHCQmlbQVSSbNZCPM2hb0knnQ==",
+      "requires": {
+        "lru-cache": "^6.0.0"
+      }
     },
     "set-blocking": {
       "version": "2.0.0",
@@ -3454,16 +3457,6 @@
         "minimatch": "^3.0.4",
         "semver": "^7.3.5",
         "vscode-languageserver-protocol": "3.17.2-next.6"
-      },
-      "dependencies": {
-        "semver": {
-          "version": "7.3.7",
-          "resolved": "https://registry.npmjs.org/semver/-/semver-7.3.7.tgz",
-          "integrity": "sha512-QlYTucUYOews+WeEujDoEGziz4K6c47V/Bd+LjSSYcA94p+DmINdf7ncaUinThfvZyu13lN9OY1XDxt8C0Tw0g==",
-          "requires": {
-            "lru-cache": "^6.0.0"
-          }
-        }
       }
     },
     "vscode-languageserver-protocol": {
diff --git a/mlir/utils/vscode/package.json b/mlir/utils/vscode/package.json
index 6d0f6f5c88adb..74f9ba37c7f16 100644
--- a/mlir/utils/vscode/package.json
+++ b/mlir/utils/vscode/package.json
@@ -39,6 +39,8 @@
   "dependencies": {
     "base64-js": "^1.5.1",
     "chokidar": "3.5.2",
+    "minimatch": "^3.0.5",
+    "semver": "^7.5.2",
     "vscode-languageclient": "^8.0.2-next.5"
   },
   "devDependencies": {

From 64bd4858dc2d64311622e793b66094b07ca7bdc5 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Tue, 17 Jun 2025 10:09:10 +0100
Subject: [PATCH 700/851] Amend enviroment variables in bazel - change from
 #144391 (#144484)

---
 .../llvm/include/llvm/Config/llvm-config.h                    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
index 5dd53cffb7bd7..8a9c74d67b124 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h
@@ -132,10 +132,10 @@
 
 /* Define to 1 to enable expensive checks for debug location coverage checking,
    and to 0 otherwise. */
-#define LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 0
+#define LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE 0
 
 /* Define to 1 to enable expensive tracking of the origin of debug location
    coverage bugs, and to 0 otherwise. */
-#define LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 0
+#define LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN 0
 
 #endif

From e5ad7f4556ba4f31380153f70a8c6186926764e2 Mon Sep 17 00:00:00 2001
From: Jesse Huang <jesse.huang@sifive.com>
Date: Tue, 17 Jun 2025 17:21:24 +0800
Subject: [PATCH 701/851] [RISCV] Move RISCVIndirectBranchTracking before
 Branch Relaxation (#139993)

The `RISCVIndirectBranchTracking` pass inserts `lpad` instruction and
could change the basic block alignment, so this should not happen after
the branch relaxation as the adjusted offset is possible to exceed the
branch range.
---
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 5 ++++-
 llvm/test/CodeGen/RISCV/O0-pipeline.ll       | 2 +-
 llvm/test/CodeGen/RISCV/O3-pipeline.ll       | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 8a47453cedcd3..0bea3bc432b66 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -570,6 +570,10 @@ void RISCVPassConfig::addPreEmitPass() {
     addPass(createMachineCopyPropagationPass(true));
   if (TM->getOptLevel() >= CodeGenOptLevel::Default)
     addPass(createRISCVLateBranchOptPass());
+  // The IndirectBranchTrackingPass inserts lpad and could have changed the
+  // basic block alignment. It must be done before Branch Relaxation to
+  // prevent the adjusted offset exceeding the branch range.
+  addPass(createRISCVIndirectBranchTrackingPass());
   addPass(&BranchRelaxationPassID);
   addPass(createRISCVMakeCompressibleOptPass());
 }
@@ -581,7 +585,6 @@ void RISCVPassConfig::addPreEmitPass2() {
     // ensuring return instruction is detected correctly.
     addPass(createRISCVPushPopOptimizationPass());
   }
-  addPass(createRISCVIndirectBranchTrackingPass());
   addPass(createRISCVExpandPseudoPass());
 
   // Schedule the expansion of AMOs at the last possible moment, avoiding the
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
index 694662eab1681..8714b286374a5 100644
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -62,6 +62,7 @@
 ; CHECK-NEXT:       Insert fentry calls
 ; CHECK-NEXT:       Insert XRay ops
 ; CHECK-NEXT:       Implement the 'patchable-function' attribute
+; CHECK-NEXT:       RISC-V Indirect Branch Tracking
 ; CHECK-NEXT:       Branch relaxation pass
 ; CHECK-NEXT:       RISC-V Make Compressible
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
@@ -73,7 +74,6 @@
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Stack Frame Layout Analysis
-; CHECK-NEXT:       RISC-V Indirect Branch Tracking
 ; CHECK-NEXT:       RISC-V pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Unpack machine instruction bundles
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 19de864422bc5..c7f70a9d266c2 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -195,6 +195,7 @@
 ; CHECK-NEXT:       Implement the 'patchable-function' attribute
 ; CHECK-NEXT:       Machine Copy Propagation Pass
 ; CHECK-NEXT:       RISC-V Late Branch Optimisation Pass
+; CHECK-NEXT:       RISC-V Indirect Branch Tracking
 ; CHECK-NEXT:       Branch relaxation pass
 ; CHECK-NEXT:       RISC-V Make Compressible
 ; CHECK-NEXT:       Contiguously Lay Out Funclets
@@ -210,7 +211,6 @@
 ; CHECK-NEXT:       Stack Frame Layout Analysis
 ; CHECK-NEXT:       RISC-V Zcmp move merging pass
 ; CHECK-NEXT:       RISC-V Zcmp Push/Pop optimization pass
-; CHECK-NEXT:       RISC-V Indirect Branch Tracking
 ; CHECK-NEXT:       RISC-V pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Unpack machine instruction bundles

From 97e17e15957bf6f03923ca46301b32cad507f34b Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy@amd.com>
Date: Tue, 17 Jun 2025 11:34:05 +0200
Subject: [PATCH 702/851] Revert "[flang] Enable delayed localization by
 default for `do concurrent` (#144074)" (#144476)

This reverts commit b5dbf8210a57b986b9802304745f4c5c108cf37b.

Reverting again due to gfortran failure:
https://lab.llvm.org/buildbot/#/builders/17/builds/8868
---
 flang/lib/Lower/Bridge.cpp                            | 6 +++++-
 flang/test/Lower/do_concurrent_delayed_locality.f90   | 2 +-
 flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +-
 flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +-
 flang/test/Lower/loops.f90                            | 2 +-
 flang/test/Lower/loops3.f90                           | 2 +-
 6 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 5ff8101dba097..64b16b3abe991 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2033,7 +2033,11 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     fir::LocalitySpecifierOperands privateClauseOps;
     auto doConcurrentLoopOp =
         mlir::dyn_cast_if_present<fir::DoConcurrentLoopOp>(info.loopOp);
-    bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp;
+    // TODO Promote to using `enableDelayedPrivatization` (which is enabled by
+    // default unlike the staging flag) once the implementation of this is more
+    // complete.
+    bool useDelayedPriv =
+        enableDelayedPrivatizationStaging && doConcurrentLoopOp;
     llvm::SetVector<const Fortran::semantics::Symbol *> allPrivatizedSymbols;
     llvm::SmallSet<const Fortran::semantics::Symbol *, 16> mightHaveReadHostSym;
 
diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90
index 039b17808d19e..6cae0eb46db13 100644
--- a/flang/test/Lower/do_concurrent_delayed_locality.f90
+++ b/flang/test/Lower/do_concurrent_delayed_locality.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
 
 subroutine do_concurrent_with_locality_specs
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
index 67f080eb2c1c5..a3d0c34ed8569 100644
--- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90
+++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
 
 subroutine local_assoc
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index 798cbb335c8c0..d643213854744 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of DO CONCURRENT LOCAL() entities.
-! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s
 
 subroutine test_ptr(p)
   interface
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 64f14ff972272..60df27a591dc3 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 34d7bcfb7d7ad..84db1972cca16 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -1,5 +1,5 @@
 ! Test do concurrent reduction
-! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test

From dfd00edbabef8094bec663cca9314a950ec56e0d Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Tue, 17 Jun 2025 10:37:18 +0100
Subject: [PATCH 703/851] Fix for #144391 not fully addressed by #144484
 (#144488)

---
 utils/bazel/llvm_configs/llvm-config.h.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake
index 6d3c37cc8b194..a0ad517a6ecf4 100644
--- a/utils/bazel/llvm_configs/llvm-config.h.cmake
+++ b/utils/bazel/llvm_configs/llvm-config.h.cmake
@@ -131,10 +131,10 @@
 
 /* Define to 1 to enable expensive checks for debug location coverage checking,
    and to 0 otherwise. */
-#cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_COVERAGE
 
 /* Define to 1 to enable expensive tracking of the origin of debug location
    coverage bugs, and to 0 otherwise. */
-#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_TRACKING_ORIGIN
 
 #endif

From 277b2b6da70b488e08b0f0eecba2a4cd1dd01129 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 10:39:54 +0100
Subject: [PATCH 704/851] [X86] combineCastedMaskArithmetic - convert to
 SDPatternMatch matching. NFC. (#144472)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 28 +++++++++----------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 820b9c53a5089..2eadcc5416c28 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45513,6 +45513,7 @@ static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                            TargetLowering::DAGCombinerInfo &DCI,
                                            const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
   assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
 
   if (!DCI.isBeforeLegalizeOps())
@@ -45526,15 +45527,6 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
   SDValue Op = N->getOperand(0);
   EVT SrcVT = Op.getValueType();
 
-  if (!Op.hasOneUse())
-    return SDValue();
-
-  // Look for logic ops.
-  if (Op.getOpcode() != ISD::AND &&
-      Op.getOpcode() != ISD::OR &&
-      Op.getOpcode() != ISD::XOR)
-    return SDValue();
-
   // Make sure we have a bitcast between mask registers and a scalar type.
   if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
         DstVT.isScalarInteger()) &&
@@ -45542,18 +45534,18 @@ static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
         SrcVT.isScalarInteger()))
     return SDValue();
 
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
+  SDValue LHS, RHS;
 
-  if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
-      LHS.getOperand(0).getValueType() == DstVT)
-    return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
-                       DAG.getBitcast(DstVT, RHS));
+  // Look for logic ops.
+  if (!sd_match(Op, m_OneUse(m_BitwiseLogic(m_Value(LHS), m_Value(RHS)))))
+    return SDValue();
 
-  if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
-      RHS.getOperand(0).getValueType() == DstVT)
+  // If either operand was bitcast from DstVT, then perform logic with DstVT (at
+  // least one of the getBitcast() will fold away).
+  if (sd_match(LHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))) ||
+      sd_match(RHS, m_OneUse(m_BitCast(m_SpecificVT(DstVT)))))
     return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
-                       DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
+                       DAG.getBitcast(DstVT, LHS), DAG.getBitcast(DstVT, RHS));
 
   // If the RHS is a vXi1 build vector, this is a good reason to flip too.
   // Most of these have to move a constant from the scalar domain anyway.

From aa01e8e9cff9e754b47be57b2f85b962cf1ec9fb Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Tue, 17 Jun 2025 10:42:42 +0100
Subject: [PATCH 705/851] [mlir][OpenMP] Fix broken insertion point for charbox
 with omp task (#143112)

Fixes #142365
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  3 +-
 .../Target/LLVMIR/openmp-task-charbox.mlir    | 87 +++++++++++++++++++
 2 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-task-charbox.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6bccc1d6f5d30..90ce06a0345c0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2294,8 +2294,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
     if (!privateVarOrErr)
       return handleError(privateVarOrErr, *taskOp.getOperation());
 
-    llvm::IRBuilderBase::InsertPointGuard guard(builder);
-    builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+    setInsertPointForPossiblyEmptyBlock(builder);
 
     // TODO: this is a bit of a hack for Fortran character boxes.
     // Character boxes are passed by value into the init region and then the
diff --git a/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir b/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir
new file mode 100644
index 0000000000000..7a448f74ed648
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-task-charbox.mlir
@@ -0,0 +1,87 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// Regression test for a compiler crash. Ensure that the insertion point is set
+// correctly when triggering the charbox hack multiple times.
+// Nonsense test code to minimally reproduce the issue.
+
+module {
+  llvm.func @free(!llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  omp.private {type = private} @_QFEc2_private_box_heap_c8xU : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(24 : i32) : i32
+    %1 = llvm.mlir.constant(0 : i64) : i64
+    %2 = llvm.mlir.constant(1 : i32) : i32
+    %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    "llvm.intr.memcpy"(%3, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    %6 = llvm.ptrtoint %arg0 : !llvm.ptr to i64
+    %7 = llvm.icmp "eq" %6, %1 : i64
+    llvm.cond_br %7, ^bb1, ^bb2
+  ^bb1:  // pred: ^bb0
+    llvm.br ^bb3
+  ^bb2:  // pred: ^bb0
+    llvm.br ^bb3
+  ^bb3:  // 2 preds: ^bb1, ^bb2
+    omp.yield(%arg1 : !llvm.ptr)
+  } dealloc {
+  ^bb0(%arg0: !llvm.ptr):
+    omp.yield
+  }
+  omp.private {type = private} @_QFEc1_private_box_ptr_c8xU : !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(24 : i32) : i32
+    %1 = llvm.mlir.constant(1 : i32) : i32
+    %2 = llvm.alloca %1 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    "llvm.intr.memcpy"(%2, %arg0, %0) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    omp.yield(%arg1 : !llvm.ptr)
+  }
+  llvm.func @_QQmain() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "c2"} : (i64) -> !llvm.ptr
+    %2 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {bindc_name = "c1"} : (i64) -> !llvm.ptr
+    omp.task private(@_QFEc1_private_box_ptr_c8xU %2 -> %arg0, @_QFEc2_private_box_heap_c8xU %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK-LABEL: @_QQmain() {
+// CHECK:         %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK:         %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
+// CHECK:         br label %[[VAL_2:.*]]
+// CHECK:       entry:                                            ; preds = %[[VAL_3:.*]]
+// CHECK:         br label %[[VAL_4:.*]]
+// CHECK:       omp.private.init:                                 ; preds = %[[VAL_2]]
+// CHECK:         %[[VAL_5:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr null, i32 1) to i64))
+// CHECK:         %[[VAL_6:.*]] = getelementptr { { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr %[[VAL_5]], i32 0, i32 0
+// CHECK:         %[[VAL_7:.*]] = getelementptr { { ptr, i64, i32, i8, i8, i8, i8 }, { ptr, i64, i32, i8, i8, i8, i8 } }, ptr %[[VAL_5]], i32 0, i32 1
+// ...
+// CHECK:         br label %[[VAL_9:.*]]
+// CHECK:       omp.private.init4:                                ; preds = %[[VAL_10:.*]], %[[VAL_11:.*]]
+// CHECK:         br label %[[VAL_12:.*]]
+// CHECK:       omp.private.init3:                                ; preds = %[[VAL_9]]
+// CHECK:         br label %[[VAL_13:.*]]
+// CHECK:       omp.private.init2:                                ; preds = %[[VAL_9]]
+// CHECK:         br label %[[VAL_13]]
+// CHECK:       omp.private.init1:                                ; preds = %[[VAL_4]]
+// CHECK:         %[[VAL_14:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8
+// CHECK:         call void @llvm.memcpy.p0.p0.i32(ptr %[[VAL_14]], ptr %[[VAL_0]], i32 24, i1 false)
+// CHECK:         %[[VAL_15:.*]] = ptrtoint ptr %[[VAL_0]] to i64
+// CHECK:         %[[VAL_16:.*]] = icmp eq i64 %[[VAL_15]], 0
+// CHECK:         br i1 %[[VAL_16]], label %[[VAL_10]], label %[[VAL_11]]
+// CHECK:       omp.region.cont:                                  ; preds = %[[VAL_13]]
+// CHECK:         %[[VAL_17:.*]] = phi ptr [ %[[VAL_7]], %[[VAL_13]] ]
+// CHECK:         br label %[[VAL_18:.*]]
+// CHECK:       omp.private.copy:                                 ; preds = %[[VAL_12]]
+// CHECK:         br label %[[VAL_19:.*]]
+// CHECK:       omp.task.start:                                   ; preds = %[[VAL_18]]
+// CHECK:         br label %[[VAL_20:.*]]
+// CHECK:       codeRepl:                                         ; preds = %[[VAL_19]]
+// CHECK:         %[[VAL_21:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
+// CHECK:         store ptr %[[VAL_5]], ptr %[[VAL_21]], align 8
+// CHECK:         %[[VAL_22:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK:         %[[VAL_23:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[VAL_22]], i32 1, i64 40, i64 8, ptr @_QQmain..omp_par)
+// CHECK:         %[[VAL_24:.*]] = load ptr, ptr %[[VAL_23]], align 8
+// CHECK:         call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_24]], ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
+// CHECK:         %[[VAL_25:.*]] = call i32 @__kmpc_omp_task(ptr @1, i32 %[[VAL_22]], ptr %[[VAL_23]])

From 00709c306d0a0f60d169ab25f612ed6715e16743 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 18:44:32 +0900
Subject: [PATCH 706/851] AArch64: Fix hardcoding calling convention of
 sincos_stret (NFC) (#144336)

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c7ffc39b5b162..1169efce3123f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5061,9 +5061,10 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
 
   StructType *RetTy = StructType::get(ArgTy, ArgTy);
   TargetLowering::CallLoweringInfo CLI(DAG);
+  CallingConv::ID CC = getLibcallCallingConv(LC);
   CLI.setDebugLoc(dl)
       .setChain(DAG.getEntryNode())
-      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
+      .setLibCallee(CC, RetTy, Callee, std::move(Args));
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;

From 4c8f43440955c93a54b9547421513867bc81788a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Don=C3=A1t=20Nagy?= <donat.nagy@ericsson.com>
Date: Tue, 17 Jun 2025 11:51:09 +0200
Subject: [PATCH 707/851] [analyzer] Conversion to CheckerFamily:
 NullabilityChecker (#143735)

This commit converts NullabilityChecker to the new checker family
framework that was introduced in the recent commit
6833076a5d9f5719539a24e900037da5a3979289

This commit removes the dummy checker `nullability.NullabilityBase`
because it was hidden from the users and didn't have any useful role
except for helping the registration of the checker parts in the old
ad-hoc system (which is replaced by the new standardized framework).

Except for the removal of this dummy checker, no functional changes
intended.
---
 .../clang/StaticAnalyzer/Checkers/Checkers.td |  53 +++---
 .../Checkers/NullabilityChecker.cpp           | 177 +++++++++---------
 .../test/Analysis/analyzer-enabled-checkers.c |   1 -
 clang/test/Analysis/bugfix-124477.m           |   2 +-
 ...c-library-functions-arg-enabled-checkers.c |   1 -
 5 files changed, 109 insertions(+), 125 deletions(-)

diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 2a96df80d1001..211ce585fbac8 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -326,39 +326,34 @@ def StdVariantChecker : Checker<"StdVariant">,
 
 let ParentPackage = Nullability in {
 
-def NullabilityBase : Checker<"NullabilityBase">,
-  HelpText<"Stores information during the analysis about nullability.">,
-  Documentation<NotDocumented>,
-  Hidden;
-
-def NullPassedToNonnullChecker : Checker<"NullPassedToNonnull">,
-  HelpText<"Warns when a null pointer is passed to a pointer which has a "
-           "_Nonnull type.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<HasDocumentation>;
+  def NullPassedToNonnullChecker
+      : Checker<"NullPassedToNonnull">,
+        HelpText<"Warns when a null pointer is passed to a pointer which has a "
+                 "_Nonnull type.">,
+        Documentation<HasDocumentation>;
 
-def NullReturnedFromNonnullChecker : Checker<"NullReturnedFromNonnull">,
-  HelpText<"Warns when a null pointer is returned from a function that has "
-           "_Nonnull return type.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<HasDocumentation>;
+  def NullReturnedFromNonnullChecker
+      : Checker<"NullReturnedFromNonnull">,
+        HelpText<"Warns when a null pointer is returned from a function that "
+                 "has _Nonnull return type.">,
+        Documentation<HasDocumentation>;
 
-def NullableDereferencedChecker : Checker<"NullableDereferenced">,
-  HelpText<"Warns when a nullable pointer is dereferenced.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<HasDocumentation>;
+  def NullableDereferencedChecker
+      : Checker<"NullableDereferenced">,
+        HelpText<"Warns when a nullable pointer is dereferenced.">,
+        Documentation<HasDocumentation>;
 
-def NullablePassedToNonnullChecker : Checker<"NullablePassedToNonnull">,
-  HelpText<"Warns when a nullable pointer is passed to a pointer which has a "
-           "_Nonnull type.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<HasDocumentation>;
+  def NullablePassedToNonnullChecker
+      : Checker<"NullablePassedToNonnull">,
+        HelpText<"Warns when a nullable pointer is passed to a pointer which "
+                 "has a _Nonnull type.">,
+        Documentation<HasDocumentation>;
 
-def NullableReturnedFromNonnullChecker : Checker<"NullableReturnedFromNonnull">,
-  HelpText<"Warns when a nullable pointer is returned from a function that has "
-           "_Nonnull return type.">,
-  Dependencies<[NullabilityBase]>,
-  Documentation<NotDocumented>;
+  def NullableReturnedFromNonnullChecker
+      : Checker<"NullableReturnedFromNonnull">,
+        HelpText<"Warns when a nullable pointer is returned from a function "
+                 "that has _Nonnull return type.">,
+        Documentation<NotDocumented>;
 
 } // end "nullability"
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
index 461d01b452fd0..9744d1abf7790 100644
--- a/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/NullabilityChecker.cpp
@@ -81,11 +81,12 @@ enum class ErrorKind : int {
 };
 
 class NullabilityChecker
-    : public Checker<check::Bind, check::PreCall, check::PreStmt<ReturnStmt>,
-                     check::PostCall, check::PostStmt<ExplicitCastExpr>,
-                     check::PostObjCMessage, check::DeadSymbols, eval::Assume,
-                     check::Location, check::Event<ImplicitNullDerefEvent>,
-                     check::BeginFunction> {
+    : public CheckerFamily<
+          check::Bind, check::PreCall, check::PreStmt<ReturnStmt>,
+          check::PostCall, check::PostStmt<ExplicitCastExpr>,
+          check::PostObjCMessage, check::DeadSymbols, eval::Assume,
+          check::Location, check::Event<ImplicitNullDerefEvent>,
+          check::BeginFunction> {
 
 public:
   // If true, the checker will not diagnose nullabilility issues for calls
@@ -113,25 +114,21 @@ class NullabilityChecker
   void printState(raw_ostream &Out, ProgramStateRef State, const char *NL,
                   const char *Sep) const override;
 
-  enum CheckKind {
-    CK_NullPassedToNonnull,
-    CK_NullReturnedFromNonnull,
-    CK_NullableDereferenced,
-    CK_NullablePassedToNonnull,
-    CK_NullableReturnedFromNonnull,
-    CK_NumCheckKinds
-  };
-
-  bool ChecksEnabled[CK_NumCheckKinds] = {false};
-  CheckerNameRef CheckNames[CK_NumCheckKinds];
-  mutable std::unique_ptr<BugType> BTs[CK_NumCheckKinds];
-
-  const std::unique_ptr<BugType> &getBugType(CheckKind Kind) const {
-    if (!BTs[Kind])
-      BTs[Kind].reset(new BugType(CheckNames[Kind], "Nullability",
-                                  categories::MemoryError));
-    return BTs[Kind];
-  }
+  StringRef getDebugTag() const override { return "NullabilityChecker"; }
+
+  // FIXME: All bug types share the same Description ("Nullability") since the
+  // creation of this checker. We should write more descriptive descriptions...
+  // or just eliminate the Description field if it is meaningless?
+  CheckerFrontendWithBugType NullPassedToNonnull{"Nullability",
+                                                 categories::MemoryError};
+  CheckerFrontendWithBugType NullReturnedFromNonnull{"Nullability",
+                                                     categories::MemoryError};
+  CheckerFrontendWithBugType NullableDereferenced{"Nullability",
+                                                  categories::MemoryError};
+  CheckerFrontendWithBugType NullablePassedToNonnull{"Nullability",
+                                                     categories::MemoryError};
+  CheckerFrontendWithBugType NullableReturnedFromNonnull{
+      "Nullability", categories::MemoryError};
 
   // When set to false no nullability information will be tracked in
   // NullabilityMap. It is possible to catch errors like passing a null pointer
@@ -164,17 +161,16 @@ class NullabilityChecker
   ///
   /// When \p SuppressPath is set to true, no more bugs will be reported on this
   /// path by this checker.
-  void reportBugIfInvariantHolds(StringRef Msg, ErrorKind Error, CheckKind CK,
-                                 ExplodedNode *N, const MemRegion *Region,
-                                 CheckerContext &C,
+  void reportBugIfInvariantHolds(StringRef Msg, ErrorKind Error,
+                                 const BugType &BT, ExplodedNode *N,
+                                 const MemRegion *Region, CheckerContext &C,
                                  const Stmt *ValueExpr = nullptr,
                                  bool SuppressPath = false) const;
 
-  void reportBug(StringRef Msg, ErrorKind Error, CheckKind CK, ExplodedNode *N,
-                 const MemRegion *Region, BugReporter &BR,
+  void reportBug(StringRef Msg, ErrorKind Error, const BugType &BT,
+                 ExplodedNode *N, const MemRegion *Region, BugReporter &BR,
                  const Stmt *ValueExpr = nullptr) const {
-    const std::unique_ptr<BugType> &BT = getBugType(CK);
-    auto R = std::make_unique<PathSensitiveBugReport>(*BT, Msg, N);
+    auto R = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
     if (Region) {
       R->markInteresting(Region);
       R->addVisitor<NullabilityBugVisitor>(Region);
@@ -480,7 +476,7 @@ static bool checkInvariantViolation(ProgramStateRef State, ExplodedNode *N,
 }
 
 void NullabilityChecker::reportBugIfInvariantHolds(
-    StringRef Msg, ErrorKind Error, CheckKind CK, ExplodedNode *N,
+    StringRef Msg, ErrorKind Error, const BugType &BT, ExplodedNode *N,
     const MemRegion *Region, CheckerContext &C, const Stmt *ValueExpr,
     bool SuppressPath) const {
   ProgramStateRef OriginalState = N->getState();
@@ -492,7 +488,7 @@ void NullabilityChecker::reportBugIfInvariantHolds(
     N = C.addTransition(OriginalState, N);
   }
 
-  reportBug(Msg, Error, CK, N, Region, C.getBugReporter(), ValueExpr);
+  reportBug(Msg, Error, BT, N, Region, C.getBugReporter(), ValueExpr);
 }
 
 /// Cleaning up the program state.
@@ -546,19 +542,19 @@ void NullabilityChecker::checkEvent(ImplicitNullDerefEvent Event) const {
   if (!TrackedNullability)
     return;
 
-  if (ChecksEnabled[CK_NullableDereferenced] &&
+  if (NullableDereferenced.isEnabled() &&
       TrackedNullability->getValue() == Nullability::Nullable) {
     BugReporter &BR = *Event.BR;
     // Do not suppress errors on defensive code paths, because dereferencing
     // a nullable pointer is always an error.
     if (Event.IsDirectDereference)
       reportBug("Nullable pointer is dereferenced",
-                ErrorKind::NullableDereferenced, CK_NullableDereferenced,
+                ErrorKind::NullableDereferenced, NullableDereferenced,
                 Event.SinkNode, Region, BR);
     else {
       reportBug("Nullable pointer is passed to a callee that requires a "
                 "non-null",
-                ErrorKind::NullablePassedToNonnull, CK_NullableDereferenced,
+                ErrorKind::NullablePassedToNonnull, NullableDereferenced,
                 Event.SinkNode, Region, BR);
     }
   }
@@ -710,29 +706,28 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S,
   Nullability RetExprTypeLevelNullability =
         getNullabilityAnnotation(lookThroughImplicitCasts(RetExpr)->getType());
 
-  bool NullReturnedFromNonNull = (RequiredNullability == Nullability::Nonnull &&
-                                  Nullness == NullConstraint::IsNull);
-  if (ChecksEnabled[CK_NullReturnedFromNonnull] && NullReturnedFromNonNull &&
-      RetExprTypeLevelNullability != Nullability::Nonnull &&
-      !InSuppressedMethodFamily) {
-    ExplodedNode *N = C.generateErrorNode(State);
-    if (!N)
-      return;
+  if (RequiredNullability == Nullability::Nonnull &&
+      Nullness == NullConstraint::IsNull) {
+    if (NullReturnedFromNonnull.isEnabled() &&
+        RetExprTypeLevelNullability != Nullability::Nonnull &&
+        !InSuppressedMethodFamily) {
+      ExplodedNode *N = C.generateErrorNode(State);
+      if (!N)
+        return;
 
-    SmallString<256> SBuf;
-    llvm::raw_svector_ostream OS(SBuf);
-    OS << (RetExpr->getType()->isObjCObjectPointerType() ? "nil" : "Null");
-    OS << " returned from a " << C.getDeclDescription(D) <<
-          " that is expected to return a non-null value";
-    reportBugIfInvariantHolds(OS.str(), ErrorKind::NilReturnedToNonnull,
-                              CK_NullReturnedFromNonnull, N, nullptr, C,
-                              RetExpr);
-    return;
-  }
+      SmallString<256> SBuf;
+      llvm::raw_svector_ostream OS(SBuf);
+      OS << (RetExpr->getType()->isObjCObjectPointerType() ? "nil" : "Null");
+      OS << " returned from a " << C.getDeclDescription(D)
+         << " that is expected to return a non-null value";
+      reportBugIfInvariantHolds(OS.str(), ErrorKind::NilReturnedToNonnull,
+                                NullReturnedFromNonnull, N, nullptr, C,
+                                RetExpr);
+      return;
+    }
 
-  // If null was returned from a non-null function, mark the nullability
-  // invariant as violated even if the diagnostic was suppressed.
-  if (NullReturnedFromNonNull) {
+    // If null was returned from a non-null function, mark the nullability
+    // invariant as violated even if the diagnostic was suppressed.
     State = State->set<InvariantViolated>(true);
     C.addTransition(State);
     return;
@@ -746,7 +741,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S,
       State->get<NullabilityMap>(Region);
   if (TrackedNullability) {
     Nullability TrackedNullabValue = TrackedNullability->getValue();
-    if (ChecksEnabled[CK_NullableReturnedFromNonnull] &&
+    if (NullableReturnedFromNonnull.isEnabled() &&
         Nullness != NullConstraint::IsNotNull &&
         TrackedNullabValue == Nullability::Nullable &&
         RequiredNullability == Nullability::Nonnull) {
@@ -758,7 +753,7 @@ void NullabilityChecker::checkPreStmt(const ReturnStmt *S,
             " that is expected to return a non-null value";
 
       reportBugIfInvariantHolds(OS.str(), ErrorKind::NullableReturnedToNonnull,
-                                CK_NullableReturnedFromNonnull, N, Region, C);
+                                NullableReturnedFromNonnull, N, Region, C);
     }
     return;
   }
@@ -809,8 +804,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call,
 
     unsigned ParamIdx = Param->getFunctionScopeIndex() + 1;
 
-    if (ChecksEnabled[CK_NullPassedToNonnull] &&
-        Nullness == NullConstraint::IsNull &&
+    if (NullPassedToNonnull.isEnabled() && Nullness == NullConstraint::IsNull &&
         ArgExprTypeLevelNullability != Nullability::Nonnull &&
         RequiredNullability == Nullability::Nonnull &&
         isDiagnosableCall(Call)) {
@@ -824,7 +818,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call,
       OS << " passed to a callee that requires a non-null " << ParamIdx
          << llvm::getOrdinalSuffix(ParamIdx) << " parameter";
       reportBugIfInvariantHolds(OS.str(), ErrorKind::NilPassedToNonnull,
-                                CK_NullPassedToNonnull, N, nullptr, C, ArgExpr,
+                                NullPassedToNonnull, N, nullptr, C, ArgExpr,
                                 /*SuppressPath=*/false);
       return;
     }
@@ -841,7 +835,7 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call,
           TrackedNullability->getValue() != Nullability::Nullable)
         continue;
 
-      if (ChecksEnabled[CK_NullablePassedToNonnull] &&
+      if (NullablePassedToNonnull.isEnabled() &&
           RequiredNullability == Nullability::Nonnull &&
           isDiagnosableCall(Call)) {
         ExplodedNode *N = C.addTransition(State);
@@ -850,17 +844,16 @@ void NullabilityChecker::checkPreCall(const CallEvent &Call,
         OS << "Nullable pointer is passed to a callee that requires a non-null "
            << ParamIdx << llvm::getOrdinalSuffix(ParamIdx) << " parameter";
         reportBugIfInvariantHolds(OS.str(), ErrorKind::NullablePassedToNonnull,
-                                  CK_NullablePassedToNonnull, N, Region, C,
+                                  NullablePassedToNonnull, N, Region, C,
                                   ArgExpr, /*SuppressPath=*/true);
         return;
       }
-      if (ChecksEnabled[CK_NullableDereferenced] &&
+      if (NullableDereferenced.isEnabled() &&
           Param->getType()->isReferenceType()) {
         ExplodedNode *N = C.addTransition(State);
-        reportBugIfInvariantHolds("Nullable pointer is dereferenced",
-                                  ErrorKind::NullableDereferenced,
-                                  CK_NullableDereferenced, N, Region, C,
-                                  ArgExpr, /*SuppressPath=*/true);
+        reportBugIfInvariantHolds(
+            "Nullable pointer is dereferenced", ErrorKind::NullableDereferenced,
+            NullableDereferenced, N, Region, C, ArgExpr, /*SuppressPath=*/true);
         return;
       }
       continue;
@@ -1294,7 +1287,7 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S,
 
   bool NullAssignedToNonNull = (LocNullability == Nullability::Nonnull &&
                                 RhsNullness == NullConstraint::IsNull);
-  if (ChecksEnabled[CK_NullPassedToNonnull] && NullAssignedToNonNull &&
+  if (NullPassedToNonnull.isEnabled() && NullAssignedToNonNull &&
       ValNullability != Nullability::Nonnull &&
       ValueExprTypeLevelNullability != Nullability::Nonnull &&
       !isARCNilInitializedLocal(C, S)) {
@@ -1312,7 +1305,7 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S,
     OS << (LocType->isObjCObjectPointerType() ? "nil" : "Null");
     OS << " assigned to a pointer which is expected to have non-null value";
     reportBugIfInvariantHolds(OS.str(), ErrorKind::NilAssignedToNonnull,
-                              CK_NullPassedToNonnull, N, nullptr, C, ValueStmt);
+                              NullPassedToNonnull, N, nullptr, C, ValueStmt);
     return;
   }
 
@@ -1338,13 +1331,13 @@ void NullabilityChecker::checkBind(SVal L, SVal V, const Stmt *S,
     if (RhsNullness == NullConstraint::IsNotNull ||
         TrackedNullability->getValue() != Nullability::Nullable)
       return;
-    if (ChecksEnabled[CK_NullablePassedToNonnull] &&
+    if (NullablePassedToNonnull.isEnabled() &&
         LocNullability == Nullability::Nonnull) {
       ExplodedNode *N = C.addTransition(State, C.getPredecessor());
       reportBugIfInvariantHolds("Nullable pointer is assigned to a pointer "
                                 "which is expected to have non-null value",
                                 ErrorKind::NullableAssignedToNonnull,
-                                CK_NullablePassedToNonnull, N, ValueRegion, C);
+                                NullablePassedToNonnull, N, ValueRegion, C);
     }
     return;
   }
@@ -1391,28 +1384,26 @@ void NullabilityChecker::printState(raw_ostream &Out, ProgramStateRef State,
   }
 }
 
-void ento::registerNullabilityBase(CheckerManager &mgr) {
-  mgr.registerChecker<NullabilityChecker>();
-}
-
-bool ento::shouldRegisterNullabilityBase(const CheckerManager &mgr) {
-  return true;
-}
-
-#define REGISTER_CHECKER(name, trackingRequired)                               \
-  void ento::register##name##Checker(CheckerManager &mgr) {                    \
-    NullabilityChecker *checker = mgr.getChecker<NullabilityChecker>();        \
-    checker->ChecksEnabled[NullabilityChecker::CK_##name] = true;              \
-    checker->CheckNames[NullabilityChecker::CK_##name] =                       \
-        mgr.getCurrentCheckerName();                                           \
-    checker->NeedTracking = checker->NeedTracking || trackingRequired;         \
-    checker->NoDiagnoseCallsToSystemHeaders =                                  \
-        checker->NoDiagnoseCallsToSystemHeaders ||                             \
-        mgr.getAnalyzerOptions().getCheckerBooleanOption(                      \
-            checker, "NoDiagnoseCallsToSystemHeaders", true);                  \
+// The checker group "nullability" (which consists of the checkers that are
+// implemented in this file) has a group-level configuration option which
+// affects all the checkers in the group. As this is a completely unique
+// remnant of old design (this is the only group option in the analyzer), there
+// is no machinery to inject the group name from `Checkers.td`, so it is simply
+// hardcoded here:
+constexpr llvm::StringLiteral GroupName = "nullability";
+constexpr llvm::StringLiteral GroupOptName = "NoDiagnoseCallsToSystemHeaders";
+
+#define REGISTER_CHECKER(NAME, TRACKING_REQUIRED)                              \
+  void ento::register##NAME##Checker(CheckerManager &Mgr) {                    \
+    NullabilityChecker *Chk = Mgr.getChecker<NullabilityChecker>();            \
+    Chk->NAME.enable(Mgr);                                                     \
+    Chk->NeedTracking = Chk->NeedTracking || TRACKING_REQUIRED;                \
+    Chk->NoDiagnoseCallsToSystemHeaders =                                      \
+        Mgr.getAnalyzerOptions().getCheckerBooleanOption(GroupName,            \
+                                                         GroupOptName, true);  \
   }                                                                            \
                                                                                \
-  bool ento::shouldRegister##name##Checker(const CheckerManager &mgr) {        \
+  bool ento::shouldRegister##NAME##Checker(const CheckerManager &) {           \
     return true;                                                               \
   }
 
diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c
index 66b9be9795f12..78ee00deea18d 100644
--- a/clang/test/Analysis/analyzer-enabled-checkers.c
+++ b/clang/test/Analysis/analyzer-enabled-checkers.c
@@ -34,7 +34,6 @@
 // CHECK-NEXT: core.uninitialized.CapturedBlockVariable
 // CHECK-NEXT: core.uninitialized.UndefReturn
 // CHECK-NEXT: deadcode.DeadStores
-// CHECK-NEXT: nullability.NullabilityBase
 // CHECK-NEXT: nullability.NullPassedToNonnull
 // CHECK-NEXT: nullability.NullReturnedFromNonnull
 // CHECK-NEXT: security.insecureAPI.SecuritySyntaxChecker
diff --git a/clang/test/Analysis/bugfix-124477.m b/clang/test/Analysis/bugfix-124477.m
index 80820f4c93444..8bb0196b2f9b8 100644
--- a/clang/test/Analysis/bugfix-124477.m
+++ b/clang/test/Analysis/bugfix-124477.m
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,apiModeling,nullability.NullableDereferenced,nullability.NullabilityBase -x objective-c %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,apiModeling,nullability.NullableDereferenced -x objective-c %s
 /*
   This test is reduced from a static analyzer crash. The bug causing
   the crash is explained in #124477.  It can only be triggered in some
diff --git a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
index 8c6078a49c231..7f9c9ff4c9fd7 100644
--- a/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
+++ b/clang/test/Analysis/std-c-library-functions-arg-enabled-checkers.c
@@ -42,7 +42,6 @@
 // CHECK-NEXT: core.uninitialized.CapturedBlockVariable
 // CHECK-NEXT: core.uninitialized.UndefReturn
 // CHECK-NEXT: deadcode.DeadStores
-// CHECK-NEXT: nullability.NullabilityBase
 // CHECK-NEXT: nullability.NullPassedToNonnull
 // CHECK-NEXT: nullability.NullReturnedFromNonnull
 // CHECK-NEXT: security.insecureAPI.SecuritySyntaxChecker

From 6f2983765983b9403ae40430da8034d2d1b6e8a4 Mon Sep 17 00:00:00 2001
From: Ying Yi <ying.yi@sony.com>
Date: Tue, 17 Jun 2025 10:54:22 +0100
Subject: [PATCH 708/851] Reland: "[Frontend][PCH]-Add support for ignoring PCH
 options (-ignore-pch). (#142409)" (#143614)

Visual Studio has an argument to ignore all PCH related switches.
clang-cl has also support option /Y-. Having the same option in clang
would be helpful. This commit is to add support for ignoring PCH options
(-ignore-pch).

The commit includes:
  1. Implement -ignore-pch as a Driver option.
  2. Add a Driver test and a PCH test.
  3. Add a section of -ignore-pch to user manual.
  4. Add a release note for the new option '-ignore-pch'.

The change since the original landing:
  1. preprocessing-only mode doesn't imply that -include-pch is disabled.

Co-authored-by: Matheus Izvekov <mizvekov@gmail.com>
---
 clang/docs/ReleaseNotes.rst           |   2 +
 clang/docs/UsersManual.rst            |  13 +++
 clang/include/clang/Driver/Options.td |   3 +
 clang/lib/Driver/Driver.cpp           |   8 ++
 clang/lib/Driver/ToolChains/Clang.cpp |   5 +-
 clang/test/Driver/ignored-pch.cpp     |  19 +++++
 clang/test/PCH/Inputs/ignored-pch.h   |   6 ++
 clang/test/PCH/ignored-pch.c          | 113 ++++++++++++++++++++++++++
 8 files changed, 167 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/Driver/ignored-pch.cpp
 create mode 100644 clang/test/PCH/Inputs/ignored-pch.h
 create mode 100644 clang/test/PCH/ignored-pch.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 59d9612268d30..d32d3921b74fc 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -339,6 +339,8 @@ New Compiler Flags
 
 - New option ``-Wnrvo`` added and disabled by default to warn about missed NRVO opportunities.
 
+- New option ``-ignore-pch`` added to disable precompiled headers. It overrides ``-emit-pch`` and ``-include-pch``. (#GH142409, `PCHDocs <https://clang.llvm.org/docs/UsersManual.html#ignoring-a-pch-file>`_).
+
 Deprecated Compiler Flags
 -------------------------
 
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index 62844f7e6a2fa..284a404026dfe 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -1458,6 +1458,19 @@ will be processed from the PCH file. Otherwise, Clang will report an error.
   ``test.h`` since ``test.h`` was included directly in the source file and not
   specified on the command line using ``-include-pch``.
 
+Ignoring a PCH File
+^^^^^^^^^^^^^^^^^^^
+
+To ignore PCH options, a `-ignore-pch` option is passed to ``clang``:
+
+.. code-block:: console
+
+  $ clang -x c-header test.h -Xclang -ignore-pch -o test.h.pch
+  $ clang -include-pch test.h.pch -Xclang -ignore-pch test.c -o test
+
+This option disables precompiled headers, overrides -emit-pch and -include-pch.
+test.h.pch is not generated and not used as a prefix header.
+
 Relocatable PCH Files
 ^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 8b7708e530b14..1ba52d50056e7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3351,6 +3351,9 @@ defm pch_codegen: OptInCC1FFlag<"pch-codegen", "Generate ", "Do not generate ",
   "code for uses of this PCH that assumes an explicit object file will be built for the PCH">;
 defm pch_debuginfo: OptInCC1FFlag<"pch-debuginfo", "Generate ", "Do not generate ",
   "debug info for types in an object file built from this PCH and do not generate them elsewhere">;
+def ignore_pch : Flag<["-"], "ignore-pch">, Group<f_Group>,
+  Visibility<[ClangOption]>,
+  HelpText<"Disable precompiled headers, overrides -emit-pch and -include-pch">;
 
 def fimplicit_module_maps : Flag <["-"], "fimplicit-module-maps">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option, CLOption]>,
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 6c27d8c670728..780bfc83dc623 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4331,6 +4331,14 @@ void Driver::handleArguments(Compilation &C, DerivedArgList &Args,
     YcArg = YuArg = nullptr;
   }
 
+  if (Args.hasArg(options::OPT_include_pch) &&
+      Args.hasArg(options::OPT_ignore_pch)) {
+    // If -ignore-pch is used, -include-pch is disabled. Since -emit-pch is
+    // CC1option, it will not be added to command argments if -ignore-pch is
+    // used.
+    Args.eraseArg(options::OPT_include_pch);
+  }
+
   bool LinkOnly = phases::Link == FinalPhase && Inputs.size() > 0;
   for (auto &I : Inputs) {
     types::ID InputType = I.first;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 7dfed3a3356bb..bb7e5f424337b 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5202,7 +5202,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       CmdArgs.push_back("-emit-module-interface");
     else if (JA.getType() == types::TY_HeaderUnit)
       CmdArgs.push_back("-emit-header-unit");
-    else
+    else if (!Args.hasArg(options::OPT_ignore_pch))
       CmdArgs.push_back("-emit-pch");
   } else if (isa<VerifyPCHJobAction>(JA)) {
     CmdArgs.push_back("-verify-pch");
@@ -5259,7 +5259,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     } else if (JA.getType() == types::TY_PP_Asm) {
       CmdArgs.push_back("-S");
     } else if (JA.getType() == types::TY_AST) {
-      CmdArgs.push_back("-emit-pch");
+      if (!Args.hasArg(options::OPT_ignore_pch))
+        CmdArgs.push_back("-emit-pch");
     } else if (JA.getType() == types::TY_ModuleFile) {
       CmdArgs.push_back("-module-file-info");
     } else if (JA.getType() == types::TY_RewrittenObjC) {
diff --git a/clang/test/Driver/ignored-pch.cpp b/clang/test/Driver/ignored-pch.cpp
new file mode 100644
index 0000000000000..a3597dc0fe0d4
--- /dev/null
+++ b/clang/test/Driver/ignored-pch.cpp
@@ -0,0 +1,19 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+
+// Create PCH without -ignore-pch.
+// RUN: %clang -x c++-header %S/Inputs/pchfile.h -### 2>&1 | FileCheck %s -check-prefix=CHECK-EMIT-PCH
+// RUN: %clang -x c++-header %S/Inputs/pchfile.h -o %t/pchfile.h.pch
+// RUN: %clang %s -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-INCLUDE-PCH
+// RUN: %clang %s -emit-ast -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefixes=CHECK-EMIT-PCH,CHECK-INCLUDE-PCH
+
+
+// Create PCH with -ignore-pch.
+// RUN: %clang -x c++-header -ignore-pch %S/Inputs/pchfile.h -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH
+// RUN: %clang %s -ignore-pch -include-pch  %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH
+// RUN: %clang %s -ignore-pch -emit-ast -include-pch %t/pchfile.h.pch -### 2>&1 | FileCheck %s -check-prefix=CHECK-IGNORE-PCH
+
+// CHECK-EMIT-PCH: -emit-pch
+// CHECK-INCLUDE-PCH: -include-pch
+// CHECK-IGNORE-PCH-NOT: -emit-pch
+// CHECK-IGNORE-PCH-NOT: -include-pch
diff --git a/clang/test/PCH/Inputs/ignored-pch.h b/clang/test/PCH/Inputs/ignored-pch.h
new file mode 100644
index 0000000000000..56047037c331f
--- /dev/null
+++ b/clang/test/PCH/Inputs/ignored-pch.h
@@ -0,0 +1,6 @@
+#ifndef IGNORED_PCH_H
+#define IGNORED_PCH_H
+inline int f() {
+  return 42;
+}
+#endif // IGNORED_PCH_H
diff --git a/clang/test/PCH/ignored-pch.c b/clang/test/PCH/ignored-pch.c
new file mode 100644
index 0000000000000..5b64582cba618
--- /dev/null
+++ b/clang/test/PCH/ignored-pch.c
@@ -0,0 +1,113 @@
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -o %t.ll
+// RUN: ls %t.pch | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch
+// RUN: %clang %s -emit-ast -include-pch %t.pch -o %t.ll
+// RUN: ls %t.pch | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Check that -ignore-pch causes -emit-pch and -include-pch options to be ignored.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll 2>&1 | FileCheck --check-prefix=CHECK-OBJ %s
+
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -emit-ast %s -include-pch %t.pch -ignore-pch -o %t.ll
+// RUN: not ls %t.ll 2>&1 | FileCheck --check-prefix=CHECK-OBJ-ERROR %s
+
+// Check that -ignore-pch works for multiple PCH related options.
+// Test with -building-pch-with-obj.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -building-pch-with-obj -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -building-pch-with-obj -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fallow-pch-with-compiler-errors.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fallow-pch-with-compiler-errors -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fallow-pch-with-compiler-errors -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fallow-pch-with-different-modules-cache-path.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fallow-pch-with-different-modules-cache-path -o %t.pch
+// RUN: %clang -S -emit-llvm %s -ignore-pch -include-pch %t.pch -Xclang -fallow-pch-with-different-modules-cache-path -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fpch-codegen.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-codegen -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-codegen -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH-ERROR %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fpch-debuginfo.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-debuginfo -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-debuginfo -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fpch-instantiate-templates.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -fpch-instantiate-templates -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -fpch-instantiate-templates -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fno-pch-timestamp.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fno-pch-timestamp -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fno-pch-timestamp -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -fno-validate-pch.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -fno-validate-pch -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -fno-validate-pch -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -relocatable-pch.
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -relocatable-pch -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -relocatable-pch -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+// Test with -pch-through-hdrstop-create/-pch-through-hdrstop-use
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -ignore-pch -Xclang -pch-through-hdrstop-create -o %t.pch
+// RUN: %clang -S -emit-llvm %s -include-pch %t.pch -ignore-pch -Xclang -pch-through-hdrstop-use -o %t.ll
+// RUN: not ls %t.pch 2>&1 | FileCheck --check-prefix=CHECK-PCH %s
+// RUN: ls %t.ll | FileCheck --check-prefix=CHECK-OBJ %s
+
+
+// Test with AST dump output:
+// RUN: rm -rf %t.pch %t.ll
+// RUN: %clang -x c-header %S/Inputs/ignored-pch.h -o %t.pch
+// RUN: %clang %s -include-pch %t.pch -Xclang -ast-dump-all -c | FileCheck --check-prefix=CHECK-AST-PCH %s
+// RUN: %clang %s -include-pch %t.pch -ignore-pch -Xclang -ast-dump-all -c | FileCheck --check-prefix=CHECK-AST %s
+
+// CHECK-PCH: ignored-pch.c.{{.*}}.pch
+// CHECK-OBJ: ignored-pch.c.{{.*}}.ll
+// CHECK-PCH-ERROR: ignored-pch.c.{{.*}}.pch{{'?}}: No such file or directory
+// CHECK-OBJ-ERROR: ignored-pch.c.{{.*}}.ll{{'?}}: No such file or directory
+// CHECK-AST-PCH: <undeserialized declarations>
+// CHECK-AST-NOT: <undeserialized declarations>
+
+#pragma hdrstop
+#include "Inputs/ignored-pch.h"
+int main() {
+  return f();
+}

From 7eda8274fed9a87f25a54616f5009bb68e511b77 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Tue, 17 Jun 2025 11:03:14 +0100
Subject: [PATCH 709/851] [MLIR] Integration tests for lowering vector.contract
 to SVE FEAT_I8MM (#140573)

---
 .../CPU/ArmSVE/vector-contract-i8mm.mlir      | 463 ++++++++++++++++++
 1 file changed, 463 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir

diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir
new file mode 100644
index 0000000000000..5f6e8e4c30892
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/vector-contract-i8mm.mlir
@@ -0,0 +1,463 @@
+// REQUIRES: arm-emulator
+
+// DEFINE: %{compile} = mlir-opt %s \
+// DEFINE:   --convert-vector-to-scf --convert-scf-to-cf  --convert-vector-to-llvm='enable-arm-sve enable-arm-i8mm' \
+// DEFINE:   --expand-strided-metadata --convert-to-llvm --finalize-memref-to-llvm  \
+// DEFINE:   --lower-affine --convert-arith-to-llvm --reconcile-unrealized-casts \
+// DEFINE: -o %t
+
+// DEFINE: %{entry_point} = main
+
+// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void  --march=aarch64 --mattr="+sve,+i8mm" \
+// DEFINE:    -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%native_mlir_arm_runner_utils
+
+// RUN: rm -f %t && %{compile} && FileCheck %s --input-file=%t -check-prefix CHECK-IR && %{run} | FileCheck %s
+
+#packed_maps = [
+  affine_map<(m, n, k) -> (m, k)>,
+  affine_map<(m, n, k) -> (n, k)>,
+  affine_map<(m, n, k) -> (m, n)>
+]
+
+//
+// Test the lowering of `vector.contract` using the `LowerContractionToSVEI8MMPattern`
+//
+// The operation that the `vector.contract` in this test performs is matrix
+// multiplication with accumulate
+//     OUT = ACC + LHS * RHS
+// of two 8-bit integer matrices LHS and RHS, and a 32-bit integer matrix ACC
+// into a 32-bit integer matrix OUT. The LHS and RHS can be sign- or zero- extended,
+// this test covers all the possible variants.
+//
+// Tested are calculations as well as that the relevant `ArmSVE` dialect
+// operations ('arm_sve.smmla`, arm_sve.ummla`, etc) are emitted.
+//
+// That pattern above handles (therefore this test prepares) input/output vectors with
+// specific shapes:
+//   * LHS:      vector<Mx8xi8>
+//   * RHS:      vector<[N]x8xi8>
+//   * ACC, OUT: vector<Mx[N]xi32>
+// Note that the RHS is transposed.
+// This data layout makes it efficient to load data into SVE
+// registers in the layout expected by FEAT_I8MM instructions.
+// Such a `vector.contract` is representative of the code we aim to generate
+// by scalable vectorisation of `linalg.mmt4d`.
+// See mlir/lib/Dialect/ArmSVE/Transforms/LowerContractionToSVEI8MMPattern.cpp
+// for more information and rationale about these shapes.
+//
+// In this specific test we use M == 4 and N == 4
+//
+
+// Allocate and initialise a memref containing test data for use as the ACC
+// operand. The memref has one dynamic dimension whose extent depends on the
+// runtime value of VSCALE.
+//
+// The input parameter `%in` is a vector that is replicated VSCALE times
+// across the columns of the memref.
+func.func private @prepareAccTestData(%in: vector<4x4xi32>) -> memref<4x?xi32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0_i32 = arith.constant 0 : i32
+
+  %vs = vector.vscale
+  %d = arith.muli %c4, %vs : index
+  %mem = memref.alloc(%d) : memref<4x?xi32>
+
+  scf.for %j = %c0 to %d step %c4 {
+    vector.transfer_write %in, %mem[%c0, %j] {in_bounds = [true, true]} : vector<4x4xi32>, memref<4x?xi32>
+  }
+
+  return %mem : memref<4x?xi32>
+}
+
+// Allocate and initialise a memref containing test data for use as the LHS
+// operand. This function just writes the parameter `%in` into the memref.
+// The size of the LHS does not depends on VSCALE.
+func.func private @prepareLHSTestData(%in: vector<4x8xi8>) -> memref<4x8xi8> {
+  %c0 = arith.constant 0 : index
+  %c0_i8 = arith.constant 0 : i8
+
+  %mem = memref.alloc() : memref<4x8xi8>
+  vector.transfer_write %in, %mem[%c0, %c0] {in_bounds = [true, true]} : vector<4x8xi8>, memref<4x8xi8>
+
+  return %mem : memref<4x8xi8>
+}
+
+// Allocate and initialise a memref containing test data for use as the RHS
+// operand. The memref has one dynamic dimension whose extent depends on the
+// runtime value of VSCALE.
+//
+// The input parameter `%in` is a vector that is replicated VSCALE times
+// across the rows of the memref.
+//
+// For convenience, flatten the memref, since the RHS vector is read first as a
+// single-dimensional scalable vector and then cast into [N]x8 shape.
+func.func private @prepareRHSTestData(%in: vector<4x8xi8>) -> memref<?xi8> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c0_i8 = arith.constant 0 : i8
+
+  %vs = vector.vscale
+  %d = arith.muli %c4, %vs : index
+  %mem = memref.alloc(%d) : memref<?x8xi8>
+
+  scf.for %i = %c0 to %d step %c4 {
+    vector.transfer_write %in, %mem[%i, %c0] {in_bounds = [true, true]} : vector<4x8xi8>, memref<?x8xi8>
+  }
+
+  %mem_out = memref.collapse_shape %mem [[0, 1]] : memref<?x8xi8> into memref<?xi8>
+  return %mem_out : memref<?xi8>
+}
+
+// Test the operation where both LHS and RHS are interpreted as signed, hence
+// we ultimately emit and execute the `smmla` instruction.
+
+// CHECK-IR-LABEL: llvm.func @test_smmla
+// CHECK-IR-COUNT-4: arm_sve.intr.smmla
+func.func @test_smmla() {
+
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i8 = arith.constant 0 : i8
+
+  // Accumulator test data
+  %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
+                                   [ -8,  25, -34,  26],
+                                   [-20, -36,  -3,  39],
+                                   [-48, -31, -25, -21]]> : vector<4x4xi32>
+
+  %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
+  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
+
+  // FIXME: Workaround for a crash, see https://github.com/llvm/llvm-project/issues/143670
+  %acc_cast = memref.cast %acc_mem : memref<4x?xi32> to memref<*xi32>
+  call @printMemrefI32(%acc_cast) : (memref<*xi32>) -> ()
+
+  // LHS test data
+  %lhs_cst = arith.constant dense<[[-35, -27, -36, -31,  23, -34,  -8, -33],
+                                   [-20,  17, -32, -47,  37,  22,  -7, -21],
+                                   [ -7, -35,  20,  -4,  39,  46, -23,  40],
+                                   [ 40,  27,  37,  43,  38,  -6,  37,  49]]> : vector<4x8xi8>
+
+  %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8>
+  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8>
+
+  // RHS test data
+  %rhs_cst = arith.constant dense<[[-17, -50,  -1,  48, -13,  22,  39,  33],
+                                   [-35, -24,  37, -32,  33,  30, -11, -17],
+                                   [-28,  31,   3, -44, -15, -27,  22,  35],
+                                   [-23,  39,  48,  26, -23,  32, -39, -38]]> : vector<4x8xi8>
+
+  %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref<?xi8>
+  %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} :  memref<?xi8>, vector<[32]xi8>
+  %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
+
+  // Matrix multiplication and accumulate with transposed RHS.
+  %0 = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
+  %2 = vector.contract {indexing_maps = #packed_maps,
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %acc
+    : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32>
+
+  // Display the result of the multiplication
+  vector.print str "Result(SMMLA):\n"
+  %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32>
+  vector.print %u0 : vector<[4]xi32>
+  vector.print %u1 : vector<[4]xi32>
+  vector.print %u2 : vector<[4]xi32>
+  vector.print %u3 : vector<[4]xi32>
+
+  // Deallocate the buffers.
+  memref.dealloc %acc_mem : memref<4x?xi32>
+  memref.dealloc %lhs_mem : memref<4x8xi8>
+  memref.dealloc %rhs_mem : memref<?xi8>
+
+  return
+}
+
+// Test the operation where both LHS and RHS are interpreted as unsigned, hence
+// we ultimately emit and execute the `ummla` instruction.
+
+// CHECK-IR-LABEL: llvm.func @test_ummla
+// CHECK-IR-COUNT-4: arm_sve.intr.ummla
+func.func @test_ummla() {
+
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i8 = arith.constant 0 : i8
+
+  // Accumulator test data
+  %acc_cst = arith.constant dense<[[16, 16, 48, 40],
+                                   [40, 24, 35, 12],
+                                   [33, 24, 29, 19],
+                                   [28, 13, 33, 18]]> : vector<4x4xi32>
+
+  %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
+  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
+
+  // LHS test data
+  %lhs_cst = arith.constant dense<[[35, 42, 37, 49, 36, 36, 23, 33],
+                                   [39, 34, 33, 45, 43, 10, 44, 47],
+                                   [18, 35, 29, 25, 36, 33, 28, 29],
+                                   [26, 49, 43, 32, 27, 16, 45, 33]]> : vector<4x8xi8>
+
+  %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8>
+  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8>
+
+  // RHS test data
+  %rhs_cst = arith.constant dense<[[18, 31, 37, 35, 44, 22, 37, 28],
+                                   [21, 22, 49, 39, 30, 28, 35, 37],
+                                   [21, 47, 39, 35, 23, 43, 24, 49],
+                                   [49, 49, 40, 32, 37, 20, 47, 40]]> : vector<4x8xi8>
+
+  %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref<?xi8>
+  %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} :  memref<?xi8>, vector<[32]xi8>
+  %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
+
+  // Matrix multiplication and accumulate with transposed RHS.
+  %0 = arith.extui %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %1 = arith.extui %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
+  %2 = vector.contract {indexing_maps = #packed_maps,
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %acc
+    : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32>
+
+  // Display the result of the multiplication
+  vector.print str "Result(UMMLA):\n"
+  %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32>
+  vector.print %u0 : vector<[4]xi32>
+  vector.print %u1 : vector<[4]xi32>
+  vector.print %u2 : vector<[4]xi32>
+  vector.print %u3 : vector<[4]xi32>
+
+  // Deallocate the buffers.
+  memref.dealloc %acc_mem : memref<4x?xi32>
+  memref.dealloc %lhs_mem : memref<4x8xi8>
+  memref.dealloc %rhs_mem : memref<?xi8>
+
+  return
+}
+
+// Test the operation where LHS is interpreted as unsigned and RHS is
+// interpreted as signed, hence we ultimately emit and execute the `usmmla`
+// instruction.
+
+// CHECK-IR-LABEL: llvm.func @test_usmmla
+// CHECK-IR-COUNT-4: arm_sve.intr.usmmla
+func.func @test_usmmla() {
+
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i8 = arith.constant 0 : i8
+
+  // Accumulator test data
+  %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
+                                   [ -8,  25, -34,  26],
+                                   [-20, -36,  -3,  39],
+                                   [-48, -31, -25, -21]]> : vector<4x4xi32>
+
+  %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
+  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
+
+  // LHS test data
+  %lhs_cst = arith.constant dense<[[153, 161,  24, 157, 211, 154,  52,  27],
+                                   [168,  77, 136, 124, 249,  28,  13, 122],
+                                   [ 97,  82, 181,  39,  53,  25,  80, 240],
+                                   [184, 227, 106, 165, 126, 113, 121, 228]]> : vector<4x8xi8>
+
+  %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8>
+  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8>
+
+  // RHS test data
+  %rhs_cst = arith.constant dense<[[ 40,  27,  37,  43,  38,  -6,  37,  49],
+                                   [-17, -50,  -1,  48, -13,  22,  39,  33],
+                                   [-35, -24,  37, -32,  33,  30, -11, -17],
+                                   [-28,  31,   3, -44, -15, -27,  22,  35]]> : vector<4x8xi8>
+
+  %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref<?xi8>
+  %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} :  memref<?xi8>, vector<[32]xi8>
+  %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
+
+  // Matrix multiplication and accumulate with transposed RHS.
+  %0 = arith.extui %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %1 = arith.extsi %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
+  %2 = vector.contract {indexing_maps = #packed_maps,
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %acc
+    : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32>
+
+  // Display the result of the multiplication
+  vector.print str "Result(USMMLA):\n"
+  %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32>
+  vector.print %u0 : vector<[4]xi32>
+  vector.print %u1 : vector<[4]xi32>
+  vector.print %u2 : vector<[4]xi32>
+  vector.print %u3 : vector<[4]xi32>
+
+  // Deallocate the buffers.
+  memref.dealloc %acc_mem : memref<4x?xi32>
+  memref.dealloc %lhs_mem : memref<4x8xi8>
+  memref.dealloc %rhs_mem : memref<?xi8>
+
+  return
+}
+
+// Test the operation where LHS is interpreted as signed and RHS is interpreted
+// as unsigned. In this test we ultimately emit end execute the `usmmla`
+// instruction with reversed operands, see `LowerContractionToSVEI8MMPattern.cpp`
+// for more details.
+
+// CHECK-IR-LABEL: llvm.func @test_summla
+// CHECK-IR-COUNT-4: arm_sve.intr.usmmla
+func.func @test_summla() {
+
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  %c0_i8 = arith.constant 0 : i8
+
+  // Accumulator test data
+  %acc_cst = arith.constant dense<[[-44,  20,  44, -46],
+                                   [ -8,  25, -34,  26],
+                                   [-20, -36,  -3,  39],
+                                   [-48, -31, -25, -21]]> : vector<4x4xi32>
+
+  %acc_mem = func.call @prepareAccTestData(%acc_cst) : (vector<4x4xi32>) -> memref<4x?xi32>
+  %acc = vector.transfer_read %acc_mem[%c0, %c0], %c0_i32 {in_bounds = [true, true]} : memref<4x?xi32>, vector<4x[4]xi32>
+
+  // LHS test data
+  %lhs_cst = arith.constant dense<[[-35, -27, -36, -31,  23, -34,  -8, -33],
+                                   [-20,  17, -32, -47,  37,  22,  -7, -21],
+                                   [ -7, -35,  20,  -4,  39,  46, -23,  40],
+                                   [ 40,  27,  37,  43,  38,  -6,  37,  49]]> : vector<4x8xi8>
+
+  %lhs_mem = func.call @prepareLHSTestData(%lhs_cst) : (vector<4x8xi8>) -> memref<4x8xi8>
+  %lhs = vector.transfer_read %lhs_mem[%c0, %c0], %c0_i8 {in_bounds = [true, true]} : memref<4x8xi8>, vector<4x8xi8>
+
+  // RHS test data
+  %rhs_cst = arith.constant dense<[[125, 171, 138, 187, 108, 175,  82,  99],
+                                   [221,  25, 164,  97, 156, 221, 218, 177],
+                                   [171, 160, 219, 191, 144,  45, 161, 210],
+                                   [223, 165, 123,  99, 108,  86,  37,  92]]> : vector<4x8xi8>
+
+  %rhs_mem = func.call @prepareRHSTestData(%rhs_cst) : (vector<4x8xi8>) -> memref<?xi8>
+  %rhs_flat = vector.transfer_read %rhs_mem[%c0], %c0_i8 {in_bounds = [true]} :  memref<?xi8>, vector<[32]xi8>
+  %rhs = vector.shape_cast %rhs_flat : vector<[32]xi8> to vector<[4]x8xi8>
+
+  // Matrix multiplication and accumulate with transposed RHS.
+  %0 = arith.extsi %lhs : vector<4x8xi8> to vector<4x8xi32>
+  %1 = arith.extui %rhs : vector<[4]x8xi8> to vector<[4]x8xi32>
+  %2 = vector.contract {indexing_maps = #packed_maps,
+                        iterator_types = ["parallel", "parallel", "reduction"],
+                        kind = #vector.kind<add>} %0, %1, %acc
+    : vector<4x8xi32>, vector<[4]x8xi32> into vector<4x[4]xi32>
+
+  // Display the result of the multiplication
+  vector.print str "Result(SUMMLA (i.e. USMMLA transposed)):\n"
+  %u0 = vector.extract %2[0] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u1 = vector.extract %2[1] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u2 = vector.extract %2[2] : vector<[4]xi32> from vector<4x[4]xi32>
+  %u3 = vector.extract %2[3] : vector<[4]xi32> from vector<4x[4]xi32>
+  vector.print %u0 : vector<[4]xi32>
+  vector.print %u1 : vector<[4]xi32>
+  vector.print %u2 : vector<[4]xi32>
+  vector.print %u3 : vector<[4]xi32>
+
+  // Deallocate the buffers.
+  memref.dealloc %acc_mem : memref<4x?xi32>
+  memref.dealloc %lhs_mem : memref<4x8xi8>
+  memref.dealloc %rhs_mem : memref<?xi8>
+
+  return
+}
+
+// Perform each test with SVE vector lengths 128 bits and 256 bits (i.e. VSCALEs
+// 1 and 2, respectively). The vector length is set via the `setArmVLBits`
+// function. The effect of setting a different vector length is that the tests
+// allocate and operate on different sized buffers (see `prepare<X>TestData`
+// functions).
+
+func.func @main() {
+  %c128 = arith.constant 128 : i32
+  %c256 = arith.constant 256 : i32
+
+// CHECK-LABEL: Result(SMMLA):
+// CHECK: ( -1999,  1941,   685, -2879 )
+// CHECK: ( -3705,  2952,   987,  -685 )
+// CHECK: (  2565,  4157, -1589,  -357 )
+// CHECK: (  2383, -2252,    32, -1365 )
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @test_smmla() : () -> ()
+
+// CHECK: Result(SMMLA):
+// CHECK: ( -1999,  1941,   685, -2879, -1999,  1941,   685, -2879 )
+// CHECK: ( -3705,  2952,   987,  -685, -3705,  2952,   987,  -685 )
+// CHECK: (  2565,  4157, -1589,  -357,  2565,  4157, -1589,  -357 )
+// CHECK: (  2383, -2252,    32, -1365,  2383, -2252,    32, -1365 )
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_smmla() : () -> ()
+
+// CHECK-LABEL: Result(UMMLA):
+// CHECK: ( 9183, 9513, 10460, 11314 )
+// CHECK: ( 9648, 9812, 10092, 12088 )
+// CHECK: ( 7548, 7625,  8398,  9044 )
+// CHECK: ( 8855, 9046,  9685, 11191 )
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @test_ummla() : () -> ()
+
+// CHECK: Result(UMMLA):
+// CHECK: ( 9183, 9513, 10460, 11314, 9183, 9513, 10460, 11314 )
+// CHECK: ( 9648, 9812, 10092, 12088, 9648, 9812, 10092, 12088 )
+// CHECK: ( 7548, 7625,  8398,  9044, 7548, 7625,  8398,  9044 )
+// CHECK: ( 8855, 9046,  9685, 11191, 8855, 9046,  9685, 11191 )
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_ummla() : () -> ()
+
+// CHECK-LABEL: Result(USMMLA):
+// CHECK: ( 28403,  445,  -2759, -11409 )
+// CHECK: ( 34908, 1047,    142,  -7274 )
+// CHECK: ( 31032, 6807,  -2378,   7382 )
+// CHECK: ( 44217, 6396, -10930,    623 )
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @test_usmmla() : () -> ()
+
+// CHECK: Result(USMMLA):
+// CHECK: ( 28403,  445,  -2759, -11409, 28403,  445,  -2759, -11409 )
+// CHECK: ( 34908, 1047,    142,  -7274, 34908, 1047,    142,  -7274 )
+// CHECK: ( 31032, 6807,  -2378,   7382, 31032, 6807,  -2378,   7382 )
+// CHECK: ( 44217, 6396, -10930,    623, 44217, 6396, -10930,    623 )
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_usmmla() : () -> ()
+
+// CHECK-LABEL: Result(SUMMLA (i.e. USMMLA transposed)):
+// CHECK: ( -27190, -28812, -30502, -23575 )
+// CHECK: (  -7613,  -8386, -15938,  -6521 )
+// CHECK: (   9468,  18750,   9199,   5764 )
+// CHECK: (  33655,  41064,  48900,  31627 )
+  func.call @setArmVLBits(%c128) : (i32) -> ()
+  func.call @test_summla() : () -> ()
+
+// CHECK: Result(SUMMLA (i.e. USMMLA transposed)):
+// CHECK: ( -27190, -28812, -30502, -23575, -27190, -28812, -30502, -23575 )
+// CHECK: (  -7613,  -8386, -15938,  -6521,  -7613,  -8386, -15938,  -6521 )
+// CHECK: (   9468,  18750,   9199,   5764,   9468,  18750,   9199,   5764 )
+// CHECK: (  33655,  41064,  48900,  31627,  33655,  41064,  48900,  31627 )
+  func.call @setArmVLBits(%c256) : (i32) -> ()
+  func.call @test_summla() : () -> ()
+
+  return
+}
+
+func.func private @setArmVLBits(%bits : i32)
+func.func private @printMemrefI32(%ptr : memref<*xi32>)

From c377ce1216a8ce73c940d2366a7bf223790f43b4 Mon Sep 17 00:00:00 2001
From: Mary Kassayova <mary.kassayova@arm.com>
Date: Tue, 17 Jun 2025 11:07:43 +0100
Subject: [PATCH 710/851] [AArch64][VecLib] Add libmvec support for AArch64
 targets (#143696)

This patch adds support for the `libmvec` vector library on AArch64
targets. Currently, all `libmvec` functions in GLIBC version 2.40 are
supported. The full list of math functions enabled can be found
[here](https://github.com/bminor/glibc/blob/96abd59bf2a11ddd4e7ccaac840ec13c0b62d3ba/sysdeps/aarch64/fpu/Versions)
(up to GLIBC 2.40).

Previously, `libmvec` was only supported on x86_64 targets. Attempts to
use it on AArch64 resulted in the following error from Clang:
`unsupported option 'libmvec' for target 'aarch64'`.
---
 clang/docs/ReleaseNotes.rst                   |    2 +
 clang/include/clang/Driver/Options.td         |    5 +-
 clang/lib/Driver/ToolChains/Clang.cpp         |    9 +-
 clang/test/Driver/fveclib.c                   |   10 +-
 llvm/include/llvm/Analysis/VecFuncs.def       |  260 +++++
 llvm/lib/Analysis/TargetLibraryInfo.cpp       |   12 +
 .../replace-with-veclib-libmvec-scalable.ll   |  579 +++++++++
 .../AArch64/replace-with-veclib-libmvec.ll    |  577 +++++++++
 .../AArch64/veclib-function-calls.ll          | 1035 +++++++++++++++++
 .../AArch64/veclib-intrinsic-calls.ll         |  735 ++++++++++++
 llvm/test/Transforms/Util/add-TLI-mappings.ll |   28 +-
 11 files changed, 3243 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll
 create mode 100644 llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index d32d3921b74fc..03641f5d0ea0d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -359,6 +359,8 @@ Modified Compiler Flags
 
 - The ``-fchar8_t`` flag is no longer considered in non-C++ languages modes. (#GH55373)
 
+- The ``-fveclib=libmvec`` option now supports AArch64 targets (requires GLIBC 2.40 or newer).
+
 Removed Compiler Flags
 -------------------------
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 1ba52d50056e7..0ffd8c40da7da 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3479,8 +3479,9 @@ def fveclib : Joined<["-"], "fveclib=">, Group<f_Group>,
   Visibility<[ClangOption, CC1Option, FlangOption, FC1Option]>,
     HelpText<"Use the given vector functions library">,
     HelpTextForVariants<[ClangOption, CC1Option],
-      "Use the given vector functions library. "
-      "Note: -fveclib={ArmPL,SLEEF} implies -fno-math-errno">,
+      "Use the given vector functions library.\n"
+      "  Note: -fveclib={ArmPL,SLEEF,libmvec} implies -fno-math-errno.\n"
+      "  Note: -fveclib=libmvec on AArch64 requires GLIBC 2.40 or newer.">,
     Values<"Accelerate,libmvec,MASSV,SVML,SLEEF,Darwin_libsystem_m,ArmPL,AMDLIBM,none">,
     NormalizedValuesScope<"llvm::driver::VectorLibrary">,
     NormalizedValues<["Accelerate", "LIBMVEC", "MASSV", "SVML", "SLEEF",
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index bb7e5f424337b..a78a1c8978183 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5702,11 +5702,18 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
           Triple.getArch() != llvm::Triple::x86_64)
         D.Diag(diag::err_drv_unsupported_opt_for_target)
             << Name << Triple.getArchName();
-    } else if (Name == "libmvec" || Name == "AMDLIBM") {
+    } else if (Name == "AMDLIBM") {
       if (Triple.getArch() != llvm::Triple::x86 &&
           Triple.getArch() != llvm::Triple::x86_64)
         D.Diag(diag::err_drv_unsupported_opt_for_target)
             << Name << Triple.getArchName();
+    } else if (Name == "libmvec") {
+      if (Triple.getArch() != llvm::Triple::x86 &&
+          Triple.getArch() != llvm::Triple::x86_64 &&
+          Triple.getArch() != llvm::Triple::aarch64 &&
+          Triple.getArch() != llvm::Triple::aarch64_be)
+        D.Diag(diag::err_drv_unsupported_opt_for_target)
+            << Name << Triple.getArchName();
     } else if (Name == "SLEEF" || Name == "ArmPL") {
       if (Triple.getArch() != llvm::Triple::aarch64 &&
           Triple.getArch() != llvm::Triple::aarch64_be &&
diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c
index 5420555c36a2a..c57e9aa7a3cc2 100644
--- a/clang/test/Driver/fveclib.c
+++ b/clang/test/Driver/fveclib.c
@@ -1,6 +1,7 @@
 // RUN: %clang -### -c -fveclib=none %s 2>&1 | FileCheck --check-prefix=CHECK-NOLIB %s
 // RUN: %clang -### -c -fveclib=Accelerate %s 2>&1 | FileCheck --check-prefix=CHECK-ACCELERATE %s
 // RUN: %clang -### -c --target=x86_64-unknown-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-libmvec %s
+// RUN: %clang -### -c --target=aarch64-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-LIBMVEC-AARCH64 %s
 // RUN: %clang -### -c --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-AMDLIBM %s
 // RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck --check-prefix=CHECK-MASSV %s
 // RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN_LIBSYSTEM_M %s
@@ -12,6 +13,7 @@
 // CHECK-NOLIB: "-fveclib=none"
 // CHECK-ACCELERATE: "-fveclib=Accelerate"
 // CHECK-libmvec: "-fveclib=libmvec"
+// CHECK-LIBMVEC-AARCH64: "-fveclib=libmvec"
 // CHECK-AMDLIBM: "-fveclib=AMDLIBM"
 // CHECK-MASSV: "-fveclib=MASSV"
 // CHECK-DARWIN_LIBSYSTEM_M: "-fveclib=Darwin_libsystem_m"
@@ -23,7 +25,6 @@
 
 // RUN: not %clang --target=x86 -c -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 // RUN: not %clang --target=x86 -c -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
-// RUN: not %clang --target=aarch64 -c -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 // RUN: not %clang --target=aarch64 -c -fveclib=SVML %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 // RUN: not %clang --target=aarch64 -c -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
 // CHECK-ERROR: unsupported option {{.*}} for target
@@ -43,6 +44,9 @@
 // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=libmvec -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC %s
 // CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC"
 
+// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=libmvec -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC-AARCH64 %s
+// CHECK-LTO-LIBMVEC-AARCH64: "-plugin-opt=-vector-library=LIBMVEC"
+
 // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-AMDLIBM %s
 // CHECK-LTO-AMDLIBM: "-plugin-opt=-vector-library=AMDLIBM"
 
@@ -68,6 +72,10 @@
 // CHECK-ERRNO-LIBMVEC: "-fveclib=libmvec"
 // CHECK-ERRNO-LIBMVEC-SAME: "-fmath-errno"
 
+// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-ERRNO-LIBMVEC-AARCH64 %s
+// CHECK-ERRNO-LIBMVEC-AARCH64: "-fveclib=libmvec"
+// CHECK-ERRNO-LIBMVEC-AARCH64-SAME: "-fmath-errno"
+
 // RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=AMDLIBM %s 2>&1 | FileCheck --check-prefix=CHECK-ERRNO-AMDLIBM %s
 // CHECK-ERRNO-AMDLIBM: "-fveclib=AMDLIBM"
 // CHECK-ERRNO-AMDLIBM-SAME: "-fmath-errno"
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 68753a2497db2..4015df990729f 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -237,6 +237,266 @@ TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVdN4v_log", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVbN4v_logf", FIXED(4), "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVdN8v_logf", FIXED(8), "_ZGV_LLVM_N8v")
 
+#elif defined(TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS)
+
+TLI_DEFINE_VECFUNC("acos", "_ZGVnN2v_acos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acosf", "_ZGVnN2v_acosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acosf", "_ZGVnN4v_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acos", "_ZGVsMxv_acos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("acosf", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "_ZGVnN2v_acos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVnN2v_acosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVnN4v_acosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.acos.f64", "_ZGVsMxv_acos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.acos.f32", "_ZGVsMxv_acosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("acosh", "_ZGVnN2v_acosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acoshf", "_ZGVnN2v_acoshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acoshf", "_ZGVnN4v_acoshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("acosh", "_ZGVsMxv_acosh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("acoshf", "_ZGVsMxv_acoshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("asin", "_ZGVnN2v_asin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinf", "_ZGVnN2v_asinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinf", "_ZGVnN4v_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asin", "_ZGVsMxv_asin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("asinf", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "_ZGVnN2v_asin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVnN2v_asinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVnN4v_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.asin.f64", "_ZGVsMxv_asin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.asin.f32", "_ZGVsMxv_asinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("asinh", "_ZGVnN2v_asinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinhf", "_ZGVnN2v_asinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinhf", "_ZGVnN4v_asinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("asinh", "_ZGVsMxv_asinh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("asinhf", "_ZGVsMxv_asinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("atan", "_ZGVnN2v_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanf", "_ZGVnN2v_atanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanf", "_ZGVnN4v_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atan", "_ZGVsMxv_atan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("atanf", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "_ZGVnN2v_atan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVnN2v_atanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVnN4v_atanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan.f64", "_ZGVsMxv_atan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.atan.f32", "_ZGVsMxv_atanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("atan2", "_ZGVnN2vv_atan2", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVnN2vv_atan2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVnN4vv_atan2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atan2", "_ZGVsMxvv_atan2", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("atan2f", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.atan2.f64", "_ZGVnN2vv_atan2", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVnN2vv_atan2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVnN4vv_atan2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.atan2.f64", "_ZGVsMxvv_atan2", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.atan2.f32", "_ZGVsMxvv_atan2f", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("atanh", "_ZGVnN2v_atanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVnN2v_atanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVnN4v_atanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("atanh", "_ZGVsMxv_atanh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("atanhf", "_ZGVsMxv_atanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("cbrt", "_ZGVnN2v_cbrt", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cbrtf", "_ZGVnN2v_cbrtf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cbrtf", "_ZGVnN4v_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cbrt", "_ZGVsMxv_cbrt",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("cbrtf", "_ZGVsMxv_cbrtf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("cos", "_ZGVnN2v_cos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cosf", "_ZGVnN2v_cosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cosf", "_ZGVnN4v_cosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cos", "_ZGVsMxv_cos",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("cosf", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVnN2v_cos", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVnN2v_cosf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVnN4v_cosf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVsMxv_cos", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVsMxv_cosf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("cosh", "_ZGVnN2v_cosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("coshf", "_ZGVnN2v_coshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("coshf", "_ZGVnN4v_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("cosh", "_ZGVsMxv_cosh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("coshf", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_ZGVnN2v_cosh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVnN2v_coshf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVnN4v_coshf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.cosh.f64", "_ZGVsMxv_cosh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.cosh.f32", "_ZGVsMxv_coshf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("erf", "_ZGVnN2v_erf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erff", "_ZGVnN2v_erff", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erff", "_ZGVnN4v_erff", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erf", "_ZGVsMxv_erf",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("erff", "_ZGVsMxv_erff", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("erfc", "_ZGVnN2v_erfc", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erfcf", "_ZGVnN2v_erfcf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erfcf", "_ZGVnN4v_erfcf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("erfc", "_ZGVsMxv_erfc",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("erfcf", "_ZGVsMxv_erfcf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("exp", "_ZGVnN2v_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expf", "_ZGVnN2v_expf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expf", "_ZGVnN4v_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp", "_ZGVsMxv_exp",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("expf", "_ZGVsMxv_expf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVnN2v_expf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVnN4v_expf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp.f64", "_ZGVsMxv_exp", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.exp.f32", "_ZGVsMxv_expf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("exp10", "_ZGVnN2v_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVnN2v_exp10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVnN4v_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp10", "_ZGVsMxv_exp10",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("exp10f", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.exp10.f64", "_ZGVnN2v_exp10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVnN2v_exp10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVnN4v_exp10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp10.f64", "_ZGVsMxv_exp10", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.exp10.f32", "_ZGVsMxv_exp10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("exp2", "_ZGVnN2v_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVnN2v_exp2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVnN4v_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("exp2", "_ZGVsMxv_exp2",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("exp2f", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVnN2v_exp2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVnN2v_exp2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVnN4v_exp2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.exp2.f64", "_ZGVsMxv_exp2", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.exp2.f32", "_ZGVsMxv_exp2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("expm1", "_ZGVnN2v_expm1", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expm1f", "_ZGVnN2v_expm1f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expm1f", "_ZGVnN4v_expm1f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("expm1", "_ZGVsMxv_expm1",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("expm1f", "_ZGVsMxv_expm1f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("hypot", "_ZGVnN2vv_hypot", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("hypotf", "_ZGVnN2vv_hypotf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("hypotf", "_ZGVnN4vv_hypotf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("hypot", "_ZGVsMxvv_hypot", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("hypotf", "_ZGVsMxvv_hypotf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("log", "_ZGVnN2v_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("logf", "_ZGVnN2v_logf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("logf", "_ZGVnN4v_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log", "_ZGVsMxv_log",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("logf", "_ZGVsMxv_logf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVnN2v_log", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVnN2v_logf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVnN4v_logf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log.f64", "_ZGVsMxv_log", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.log.f32", "_ZGVsMxv_logf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("log10", "_ZGVnN2v_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log10f", "_ZGVnN2v_log10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log10f", "_ZGVnN4v_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log10", "_ZGVsMxv_log10",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("log10f", "_ZGVsMxv_log10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVnN2v_log10", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVnN2v_log10f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVnN4v_log10f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log10.f64", "_ZGVsMxv_log10", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.log10.f32", "_ZGVsMxv_log10f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("log1p", "_ZGVnN2v_log1p", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log1pf", "_ZGVnN2v_log1pf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log1pf", "_ZGVnN4v_log1pf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log1p", "_ZGVsMxv_log1p",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("log1pf", "_ZGVsMxv_log1pf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("log2", "_ZGVnN2v_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log2f", "_ZGVnN2v_log2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log2f", "_ZGVnN4v_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("log2", "_ZGVsMxv_log2",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("log2f", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVnN2v_log2", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVnN2v_log2f", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVnN4v_log2f", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("pow", "_ZGVnN2vv_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("powf", "_ZGVnN2vv_powf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("powf", "_ZGVnN4vv_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("pow", "_ZGVsMxvv_pow", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("powf", "_ZGVsMxvv_powf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVnN2vv_powf", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.pow.f64", "_ZGVsMxvv_pow", SCALABLE(2), MASKED, "_ZGVsMxvv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.pow.f32", "_ZGVsMxvv_powf", SCALABLE(4), MASKED, "_ZGVsMxvv", NOCC)
+
+TLI_DEFINE_VECFUNC("sin", "_ZGVnN2v_sin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinf", "_ZGVnN2v_sinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinf", "_ZGVnN4v_sinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sin", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVnN2v_sin", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVnN2v_sinf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVnN4v_sinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("sinh", "_ZGVnN2v_sinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVnN2v_sinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVnN4v_sinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_ZGVnN2v_sinh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVnN2v_sinhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVnN4v_sinhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.sinh.f64", "_ZGVsMxv_sinh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.sinh.f32", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("tan", "_ZGVnN2v_tan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanf", "_ZGVnN2v_tanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanf", "_ZGVnN4v_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tan", "_ZGVsMxv_tan",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("tanf", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVnN2v_tan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVnN2v_tanf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVnN4v_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVsMxv_tan", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVsMxv_tanf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("tanh", "_ZGVnN2v_tanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVnN2v_tanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVnN4v_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("tanh", "_ZGVsMxv_tanh",  SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("tanhf", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_ZGVnN2v_tanh", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVnN2v_tanhf", FIXED(2), NOMASK, "_ZGV_LLVM_N2v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVnN4v_tanhf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v", CallingConv::AArch64_VectorCall)
+TLI_DEFINE_VECFUNC("llvm.tanh.f64", "_ZGVsMxv_tanh", SCALABLE(2), MASKED, "_ZGVsMxv", NOCC)
+TLI_DEFINE_VECFUNC("llvm.tanh.f32", "_ZGVsMxv_tanhf", SCALABLE(4), MASKED, "_ZGVsMxv", NOCC)
+
 #elif defined(TLI_DEFINE_MASSV_VECFUNCS)
 // IBM MASS library's vector Functions
 
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index c8b568354965d..a3ed093134390 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1299,6 +1299,14 @@ static const VecDesc VecFuncs_LIBMVEC_X86[] = {
 #undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS
 };
 
+static const VecDesc VecFuncs_LIBMVEC_AARCH64[] = {
+#define TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX, CC)               \
+  {SCAL, VEC, VF, MASK, VABI_PREFIX, CC},
+#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_LIBMVEC_AARCH64_VECFUNCS
+};
+
 static const VecDesc VecFuncs_MASSV[] = {
 #define TLI_DEFINE_MASSV_VECFUNCS
 #include "llvm/Analysis/VecFuncs.def"
@@ -1376,6 +1384,10 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
     case llvm::Triple::x86_64:
       addVectorizableFunctions(VecFuncs_LIBMVEC_X86);
       break;
+    case llvm::Triple::aarch64:
+    case llvm::Triple::aarch64_be:
+      addVectorizableFunctions(VecFuncs_LIBMVEC_AARCH64);
+      break;
     }
     break;
   }
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll
new file mode 100644
index 0000000000000..1b541d1330aae
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec-scalable.ll
@@ -0,0 +1,579 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -mattr=+sve -vector-library=LIBMVEC -replace-with-veclib -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;.
+; CHECK: @llvm.compiler.used = appending global [34 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxvv_pow, ptr @_ZGVsMxvv_powf, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf, ptr @_ZGVsMxv_tan, ptr @_ZGVsMxv_tanf, ptr @_ZGVsMxv_acos, ptr @_ZGVsMxv_acosf, ptr @_ZGVsMxv_asin, ptr @_ZGVsMxv_asinf, ptr @_ZGVsMxv_atan, ptr @_ZGVsMxv_atanf, ptr @_ZGVsMxvv_atan2, ptr @_ZGVsMxvv_atan2f, ptr @_ZGVsMxv_cosh, ptr @_ZGVsMxv_coshf, ptr @_ZGVsMxv_sinh, ptr @_ZGVsMxv_sinhf, ptr @_ZGVsMxv_tanh, ptr @_ZGVsMxv_tanhf], section "llvm.metadata"
+;.
+define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_ceil_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_ceil_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_ceil_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_copysign_vscale_f64(<vscale x 2 x double> %mag, <vscale x 2 x double> %sgn) {
+; CHECK-LABEL: @llvm_copysign_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[MAG:%.*]], <vscale x 2 x double> [[SGN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> %mag, <vscale x 2 x double> %sgn)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_copysign_vscale_f32(<vscale x 4 x float> %mag, <vscale x 4 x float> %sgn) {
+; CHECK-LABEL: @llvm_copysign_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[MAG:%.*]], <vscale x 4 x float> [[SGN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> %mag, <vscale x 4 x float> %sgn)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_cos_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_cos_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_exp_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_exp_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_exp10_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_exp10_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_exp2_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_exp2_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_fabs_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_fabs_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_fabs_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_fabs_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_floor_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_floor_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_floor_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_floor_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_fma_vscale_f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c ) {
+; CHECK-LABEL: @llvm_fma_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[A:%.*]], <vscale x 2 x double> [[B:%.*]], <vscale x 2 x double> [[C:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_fma_vscale_f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
+; CHECK-LABEL: @llvm_fma_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_log_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_log_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_log10_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_log10_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_log2_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_log2_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_maxnum_vscale_f64(<vscale x 2 x double> %in0, <vscale x 2 x double> %in1) {
+; CHECK-LABEL: @llvm_maxnum_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[IN0:%.*]], <vscale x 2 x double> [[IN1:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> %in0, <vscale x 2 x double> %in1)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_maxnum_vscale_f32(<vscale x 4 x float> %in0, <vscale x 4 x float> %in1) {
+; CHECK-LABEL: @llvm_maxnum_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[IN0:%.*]], <vscale x 4 x float> [[IN1:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %in0, <vscale x 4 x float> %in1)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_minnum_vscale_f64(<vscale x 2 x double> %in0, <vscale x 2 x double> %in1) {
+; CHECK-LABEL: @llvm_minnum_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[IN0:%.*]], <vscale x 2 x double> [[IN1:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> %in0, <vscale x 2 x double> %in1)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_minnum_vscale_f32(<vscale x 4 x float> %in0, <vscale x 4 x float> %in1) {
+; CHECK-LABEL: @llvm_minnum_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[IN0:%.*]], <vscale x 4 x float> [[IN1:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %in0, <vscale x 4 x float> %in1)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_nearbyint_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_nearbyint_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_nearbyint_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_nearbyint_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_pow_vscale_f64(<vscale x 2 x double> %in, <vscale x 2 x double> %pow) {
+; CHECK-LABEL: @llvm_pow_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x double> [[POW:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double> %in, <vscale x 2 x double> %pow)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_pow_vscale_f32(<vscale x 4 x float> %in, <vscale x 4 x float> %pow) {
+; CHECK-LABEL: @llvm_pow_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x float> [[POW:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float> %in, <vscale x 4 x float> %pow)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_rint_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_rint_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_rint_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_rint_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_round_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_round_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_round_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_round_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_sin_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_sin_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_sqrt_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_sqrt_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_sqrt_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_sqrt_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_tan_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_tan_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.tan.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_tan_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_tan_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.tan.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_acos_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_acos_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.acos.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_acos_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_acos_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.acos.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_asin_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_asin_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.asin.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_asin_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_asin_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.asin.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_atan_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_atan_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.atan.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_atan_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_atan_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.atan.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_atan2_vscale_f64(<vscale x 2 x double> %x, <vscale x 2 x double> %y) {
+; CHECK-LABEL: @llvm_atan2_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.atan2.nxv2f64(<vscale x 2 x double> %x, <vscale x 2 x double> %y)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_atan2_vscale_f32(<vscale x 4 x float> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: @llvm_atan2_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.atan2.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x float> %y)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_cosh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_cosh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.cosh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_cosh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_cosh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.cosh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_sinh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_sinh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.sinh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_sinh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_sinh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.sinh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+define <vscale x 2 x double> @llvm_tanh_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_tanh_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.tanh.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_tanh_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> splat (i1 true))
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.tanh.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+
+define <vscale x 2 x double> @llvm_trunc_vscale_f64(<vscale x 2 x double> %in) {
+; CHECK-LABEL: @llvm_trunc_vscale_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = call fast <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> %in)
+  ret <vscale x 2 x double> %1
+}
+
+define <vscale x 4 x float> @llvm_trunc_vscale_f32(<vscale x 4 x float> %in) {
+; CHECK-LABEL: @llvm_trunc_vscale_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP1]]
+;
+  %1 = call fast <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> %in)
+  ret <vscale x 4 x float> %1
+}
+
+declare <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.pow.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.tan.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.tan.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>)
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+sve" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+sve" }
+;.
diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll
new file mode 100644
index 0000000000000..6323d942a08e7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-libmvec.ll
@@ -0,0 +1,577 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -vector-library=LIBMVEC -replace-with-veclib -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+;.
+; CHECK: @llvm.compiler.used = appending global [34 x ptr] [ptr @_ZGVnN2v_cos, ptr @_ZGVnN4v_cosf, ptr @_ZGVnN2v_exp, ptr @_ZGVnN4v_expf, ptr @_ZGVnN2v_exp10, ptr @_ZGVnN4v_exp10f, ptr @_ZGVnN2v_exp2, ptr @_ZGVnN4v_exp2f, ptr @_ZGVnN2v_log, ptr @_ZGVnN4v_logf, ptr @_ZGVnN2v_log10, ptr @_ZGVnN4v_log10f, ptr @_ZGVnN2v_log2, ptr @_ZGVnN4v_log2f, ptr @_ZGVnN2vv_pow, ptr @_ZGVnN4vv_powf, ptr @_ZGVnN2v_sin, ptr @_ZGVnN4v_sinf, ptr @_ZGVnN2v_tan, ptr @_ZGVnN4v_tanf, ptr @_ZGVnN2v_acos, ptr @_ZGVnN4v_acosf, ptr @_ZGVnN2v_asin, ptr @_ZGVnN4v_asinf, ptr @_ZGVnN2v_atan, ptr @_ZGVnN4v_atanf, ptr @_ZGVnN2vv_atan2, ptr @_ZGVnN4vv_atan2f, ptr @_ZGVnN2v_cosh, ptr @_ZGVnN4v_coshf, ptr @_ZGVnN2v_sinh, ptr @_ZGVnN4v_sinhf, ptr @_ZGVnN2v_tanh, ptr @_ZGVnN4v_tanhf], section "llvm.metadata"
+;.
+define <2 x double> @llvm_ceil_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_ceil_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.ceil.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.ceil.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_ceil_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_ceil_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_copysign_f64(<2 x double> %mag, <2 x double> %sgn) {
+; CHECK-LABEL: @llvm_copysign_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> [[MAG:%.*]], <2 x double> [[SGN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sgn)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_copysign_f32(<4 x float> %mag, <4 x float> %sgn) {
+; CHECK-LABEL: @llvm_copysign_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> [[MAG:%.*]], <4 x float> [[SGN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sgn)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_cos_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_cos_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_cos(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.cos.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_cos_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_cos_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_cosf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.cos.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_exp_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_exp_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.exp.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_exp_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_exp_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_expf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.exp.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_exp10_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_exp10_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp10(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.exp10.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_exp10_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_exp10_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.exp10.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_exp2_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_exp2_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_exp2(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.exp2.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_exp2_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_exp2_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_fabs_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_fabs_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.fabs.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_fabs_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_fabs_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_floor_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_floor_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.floor.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.floor.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_floor_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_floor_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.floor.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_fma_f64(<2 x double> %a, <2 x double> %b, <2 x double> %c ) {
+; CHECK-LABEL: @llvm_fma_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.fma.v2f64(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_fma_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: @llvm_fma_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.fma.v4f32(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_log_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_log_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.log.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_log_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_log_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_logf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.log.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_log10_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_log10_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log10(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.log10.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_log10_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_log10_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_log10f(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.log10.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_log2_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_log2_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_log2(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.log2.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_log2_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_log2_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_log2f(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.log2.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_maxnum_f64(<2 x double> %in0, <2 x double> %in1) {
+; CHECK-LABEL: @llvm_maxnum_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> [[IN0:%.*]], <2 x double> [[IN1:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.maxnum.v2f64(<2 x double> %in0, <2 x double> %in1)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_maxnum_f32(<4 x float> %in0, <4 x float> %in1) {
+; CHECK-LABEL: @llvm_maxnum_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> [[IN0:%.*]], <4 x float> [[IN1:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %in0, <4 x float> %in1)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_minnum_f64(<2 x double> %in0, <2 x double> %in1) {
+; CHECK-LABEL: @llvm_minnum_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.minnum.v2f64(<2 x double> [[IN0:%.*]], <2 x double> [[IN1:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.minnum.v2f64(<2 x double> %in0, <2 x double> %in1)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_minnum_f32(<4 x float> %in0, <4 x float> %in1) {
+; CHECK-LABEL: @llvm_minnum_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.minnum.v4f32(<4 x float> [[IN0:%.*]], <4 x float> [[IN1:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %in0, <4 x float> %in1)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_nearbyint_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_nearbyint_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_nearbyint_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_nearbyint_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_pow_f64(<2 x double> %in, <2 x double> %pow) {
+; CHECK-LABEL: @llvm_pow_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2vv_pow(<2 x double> [[IN:%.*]], <2 x double> [[POW:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.pow.v2f64(<2 x double> %in, <2 x double> %pow)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_pow_f32(<4 x float> %in, <4 x float> %pow) {
+; CHECK-LABEL: @llvm_pow_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4vv_powf(<4 x float> [[IN:%.*]], <4 x float> [[POW:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.pow.v4f32(<4 x float> %in, <4 x float> %pow)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_rint_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_rint_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.rint.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.rint.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_rint_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_rint_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.rint.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_round_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_round_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.round.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.round.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_round_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_round_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.round.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_sin_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_sin_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_sin(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.sin.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_sin_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_sin_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_sinf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.sin.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_sqrt_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_sqrt_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_sqrt_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_sqrt_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_tan_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_tan_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_tan(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.tan.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_tan_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_tan_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_tanf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.tan.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_acos_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_acos_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_acos(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.acos.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_acos_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_acos_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_acosf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.acos.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_asin_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_asin_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_asin(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.asin.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_asin_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_asin_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_asinf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.asin.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_atan_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_atan_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_atan(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.atan.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_atan_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_atan_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_atanf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.atan.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_atan2_f64(<2 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @llvm_atan2_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[INX:%.*]], <2 x double> [[INY:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.atan2.v2f64(<2 x double> %x, <2 x double> %y)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_atan2_f32(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @llvm_atan2_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[INX:%.*]], <4 x float> [[INY:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.atan2.v4f32(<4 x float> %x, <4 x float> %y)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_cosh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_cosh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_cosh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.cosh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_cosh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_cosh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_coshf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.cosh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_sinh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_sinh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_sinh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.sinh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_sinh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_sinh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.sinh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_tanh_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_tanh_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @_ZGVnN2v_tanh(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.tanh.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_tanh_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_tanh_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.tanh.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+define <2 x double> @llvm_trunc_f64(<2 x double> %in) {
+; CHECK-LABEL: @llvm_trunc_f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x double> @llvm.trunc.v2f64(<2 x double> [[IN:%.*]])
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
+;
+  %1 = call fast <2 x double> @llvm.trunc.v2f64(<2 x double> %in)
+  ret <2 x double> %1
+}
+
+define <4 x float> @llvm_trunc_f32(<4 x float> %in) {
+; CHECK-LABEL: @llvm_trunc_f32(
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> [[IN:%.*]])
+; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+;
+  %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %in)
+  ret <4 x float> %1
+}
+
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>)
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>)
+declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.cos.v2f64(<2 x double>)
+declare <4 x float> @llvm.cos.v4f32(<4 x float>)
+declare <2 x double> @llvm.exp.v2f64(<2 x double>)
+declare <4 x float> @llvm.exp.v4f32(<4 x float>)
+declare <2 x double> @llvm.exp2.v2f64(<2 x double>)
+declare <4 x float> @llvm.exp2.v4f32(<4 x float>)
+declare <2 x double> @llvm.exp10.v2f64(<2 x double>)
+declare <4 x float> @llvm.exp10.v4f32(<4 x float>)
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare <2 x double> @llvm.floor.v2f64(<2 x double>)
+declare <4 x float> @llvm.floor.v4f32(<4 x float>)
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.log.v2f64(<2 x double>)
+declare <4 x float> @llvm.log.v4f32(<4 x float>)
+declare <2 x double> @llvm.log10.v2f64(<2 x double>)
+declare <4 x float> @llvm.log10.v4f32(<4 x float>)
+declare <2 x double> @llvm.log2.v2f64(<2 x double>)
+declare <4 x float> @llvm.log2.v4f32(<4 x float>)
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>)
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)
+declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.rint.v2f64(<2 x double>)
+declare <4 x float> @llvm.rint.v4f32(<4 x float>)
+declare <2 x double> @llvm.round.v2f64(<2 x double>)
+declare <4 x float> @llvm.round.v4f32(<4 x float>)
+declare <2 x double> @llvm.sin.v2f64(<2 x double>)
+declare <4 x float> @llvm.sin.v4f32(<4 x float>)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+declare <2 x double> @llvm.tan.v2f64(<2 x double>)
+declare <4 x float> @llvm.tan.v4f32(<4 x float>)
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>)
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>)
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
index c6ea44bb85f11..670b08987c81e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
@@ -1,4 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --filter "call.*(cos|sin|tan|cbrt|erf|exp[^e]|gamma|log|sqrt|copysign|dim|min|mod|hypot|nextafter|pow|fma)" --version 2
+; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=LIBMVEC-NEON
+; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s -check-prefix=LIBMVEC-NEON-WIDTH-2
+; RUN: opt -mattr=+sve -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=LIBMVEC-SVE
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefix=SLEEF-NEON
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s -check-prefix=SLEEF-SVE
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -mcpu=neoverse-v1 -force-vector-interleave=1 -S < %s | FileCheck %s -check-prefixes=SLEEF-SVE-NOPRED
@@ -19,6 +22,18 @@ declare double @acos(double)
 declare float @acosf(float)
 
 define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
@@ -64,6 +79,18 @@ define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acosf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -112,6 +139,18 @@ declare double @acosh(double)
 declare float @acoshf(float)
 
 define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acosh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acosh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acosh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acosh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acosh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acosh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -157,6 +196,18 @@ define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acosh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acoshf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acosh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acoshf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acosh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acoshf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acosh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acoshf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -205,6 +256,18 @@ declare double @asin(double)
 declare float @asinf(float)
 
 define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
@@ -250,6 +313,18 @@ define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -298,6 +373,18 @@ declare double @asinh(double)
 declare float @asinhf(float)
 
 define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asinh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asinh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asinh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asinh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asinh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asinh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -343,6 +430,18 @@ define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asinh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_asinhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asinh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asinh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asinh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -391,6 +490,18 @@ declare double @atan(double)
 declare float @atanf(float)
 
 define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
@@ -436,6 +547,18 @@ define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -484,6 +607,18 @@ declare double @atan2(double, double)
 declare float @atan2f(float, float)
 
 define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -529,6 +664,18 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_atan2f(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -577,6 +724,18 @@ declare double @atanh(double)
 declare float @atanhf(float)
 
 define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atanh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atanh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atanh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atanh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -622,6 +781,18 @@ define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atanh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atanh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atanh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atanh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -670,6 +841,18 @@ declare double @cbrt(double)
 declare float @cbrtf(float)
 
 define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cbrt_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cbrt_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cbrt_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cbrt(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cbrt_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cbrt(<2 x double> [[WIDE_LOAD:%.*]])
@@ -715,6 +898,18 @@ define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cbrt_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cbrtf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cbrt_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cbrtf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cbrt_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cbrtf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cbrt_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cbrtf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -763,6 +958,18 @@ declare double @copysign(double, double)
 declare float @copysignf(float, float)
 
 define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @copysign_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @copysign_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @copysign_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_copysign(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -808,6 +1015,18 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @copysign_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @copysign_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @copysign_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_copysignf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -856,6 +1075,18 @@ declare double @cos(double)
 declare float @cosf(float)
 
 define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
@@ -901,6 +1132,18 @@ define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cosf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -949,6 +1192,18 @@ declare double @cosh(double)
 declare float @coshf(float)
 
 define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cosh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cosh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cosh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -994,6 +1249,18 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @cosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cosh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_coshf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cosh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cosh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1042,6 +1309,18 @@ declare double @cospi(double)
 declare float @cospif(float)
 
 define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cospi_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cospi_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cospi_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cospi_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cospi(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1087,6 +1366,18 @@ define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @cospi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cospi_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cospi_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cospi_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cospi_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cospif(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1135,6 +1426,18 @@ declare double @erf(double)
 declare float @erff(float)
 
 define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @erf_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erf_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @erf_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_erf(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @erf_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_erf(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1180,6 +1483,18 @@ define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @erf_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_erff(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erf_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_erff(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @erf_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_erff(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @erf_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_erff(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1228,6 +1543,18 @@ declare double @erfc(double)
 declare float @erfcf(float)
 
 define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @erfc_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erfc_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @erfc_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_erfc(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @erfc_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_erfc(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1273,6 +1600,18 @@ define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @erfc_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_erfcf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @erfc_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_erfcf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @erfc_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_erfcf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @erfc_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_erfcf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1321,6 +1660,18 @@ declare double @exp(double)
 declare float @expf(float)
 
 define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1366,6 +1717,18 @@ define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1414,6 +1777,18 @@ declare double @exp10(double)
 declare float @exp10f(float)
 
 define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp10_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp10_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp10_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1459,6 +1834,18 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @exp10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp10_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp10f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp10_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp10_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1507,6 +1894,18 @@ declare double @exp2(double)
 declare float @exp2f(float)
 
 define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1552,6 +1951,18 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp2f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1600,6 +2011,18 @@ declare double @expm1(double)
 declare float @expm1f(float)
 
 define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @expm1_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @expm1_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @expm1_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_expm1(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @expm1_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_expm1(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1645,6 +2068,18 @@ define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @expm1_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expm1f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @expm1_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expm1f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @expm1_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expm1f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @expm1_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expm1f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1693,6 +2128,18 @@ declare double @fdim(double, double)
 declare float @fdimf(float, float)
 
 define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fdim_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fdim_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fdim_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fdim_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fdim(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1738,6 +2185,18 @@ define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fdim_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fdim_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fdim_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fdim_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fdimf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1786,6 +2245,18 @@ declare double @fma(double, double, double)
 declare float @fmaf(float, float, float)
 
 define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fma_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fma_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fma_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vvv_fma(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
@@ -1831,6 +2302,18 @@ define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fma_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fma_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fma_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vvv_fmaf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
@@ -1879,6 +2362,18 @@ declare double @fmax(double, double)
 declare float @fmaxf(float, float)
 
 define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmax_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmax_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmax_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmax_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmax(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1924,6 +2419,18 @@ define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fmax_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmax_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmax_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmax_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmax_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fmaxf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1972,6 +2479,18 @@ declare double @fmin(double, double)
 declare float @fminf(float, float)
 
 define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmin(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -2017,6 +2536,18 @@ define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fminf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -2065,6 +2596,18 @@ declare double @fmod(double, double)
 declare float @fmodf(float, float)
 
 define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmod_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmod_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmod_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @fmod(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmod_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmod(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -2110,6 +2653,18 @@ define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fmod_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fmod_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @fmod_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @fmodf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @fmod_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fmodf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -2158,6 +2713,18 @@ declare double @hypot(double, double)
 declare float @hypotf(float, float)
 
 define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @hypot_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @hypot_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @hypot_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_hypot(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @hypot_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_hypot(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -2203,6 +2770,18 @@ define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @hypot_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_hypotf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @hypot_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_hypotf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @hypot_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_hypotf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @hypot_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_hypotf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -2251,6 +2830,18 @@ declare i32 @ilogb(double)
 declare i32 @ilogbf(float)
 
 define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ilogb_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ilogb_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ilogb_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ilogb_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x i32> @_ZGVnN2v_ilogb(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2296,6 +2887,18 @@ define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @ilogb_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ilogb_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ilogb_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ilogb_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ilogb_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x i32> @_ZGVnN4v_ilogbf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2344,6 +2947,18 @@ declare double @ldexp(double, i32)
 declare float @ldexpf(float, i32)
 
 define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ldexp_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ldexp_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ldexp_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ldexp_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP4:%.*]] = call <2 x double> @_ZGVnN2vv_ldexp(<2 x double> [[WIDE_LOAD:%.*]], <2 x i32> [[WIDE_LOAD1:%.*]])
@@ -2391,6 +3006,18 @@ define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias %
 }
 
 define void @ldexp_f32(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ldexp_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ldexp_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ldexp_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ldexp_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP4:%.*]] = call <4 x float> @_ZGVnN4vv_ldexpf(<4 x float> [[WIDE_LOAD:%.*]], <4 x i32> [[WIDE_LOAD1:%.*]])
@@ -2441,6 +3068,18 @@ declare double @lgamma(double)
 declare float @lgammaf(float)
 
 define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @lgamma_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @lgamma_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @lgamma_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @lgamma(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @lgamma_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2486,6 +3125,18 @@ define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @lgamma_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @lgamma_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @lgamma_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @lgammaf(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @lgamma_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2534,6 +3185,18 @@ declare double @log(double)
 declare float @logf(float)
 
 define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2579,6 +3242,18 @@ define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_logf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2627,6 +3302,18 @@ declare double @log10(double)
 declare float @log10f(float)
 
 define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log10_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log10_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log10_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2672,6 +3359,18 @@ define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log10_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log10_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log10_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2720,6 +3419,18 @@ declare double @log1p(double)
 declare float @log1pf(float)
 
 define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log1p_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log1p_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log1p_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log1p(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log1p_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log1p(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2765,6 +3476,18 @@ define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log1p_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log1pf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log1p_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log1pf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log1p_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log1pf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log1p_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log1pf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2813,6 +3536,18 @@ declare double @log2(double)
 declare float @log2f(float)
 
 define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2858,6 +3593,18 @@ define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log2f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2909,6 +3656,18 @@ declare double @modf(double, ptr)
 declare float @modff(float, ptr)
 
 define void @modf_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @modf_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @modf_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @modf_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @modf_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[DATA:%.*]] = call double @modf(double [[NUM:%.*]], ptr [[GEPB:%.*]])
@@ -2953,6 +3712,18 @@ for.cond.cleanup:
 }
 
 define void @modf_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @modf_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @modf_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @modf_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @modf_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[DATA:%.*]] = call float @modff(float [[NUM:%.*]], ptr [[GEPB:%.*]])
@@ -3000,6 +3771,18 @@ declare double @nextafter(double, double)
 declare float @nextafterf(float, float)
 
 define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @nextafter_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nextafter_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @nextafter_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @nextafter_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_nextafter(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -3045,6 +3828,18 @@ define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @nextafter_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nextafter_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]])
+;
+; LIBMVEC-SVE-LABEL: define void @nextafter_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @nextafter_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_nextafterf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -3093,6 +3888,18 @@ declare double @pow(double, double)
 declare float @powf(float, float)
 
 define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @pow_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @pow_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @pow_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -3138,6 +3945,18 @@ define void @pow_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @pow_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_powf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @pow_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @pow_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -3186,6 +4005,18 @@ declare double @sin(double)
 declare float @sinf(float)
 
 define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3231,6 +4062,18 @@ define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @sin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3282,6 +4125,18 @@ declare void @sincos(double, ptr, ptr)
 declare void @sincosf(float, ptr, ptr)
 
 define void @sincos_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @sincos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sincos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sincos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    call void @sincos(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
@@ -3325,6 +4180,18 @@ for.cond.cleanup:
 }
 
 define void @sincos_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @sincos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sincos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sincos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    call void @sincosf(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
@@ -3374,6 +4241,18 @@ declare void @sincospi(double, ptr, ptr)
 declare void @sincospif(float, ptr, ptr)
 
 define void @sincospi_f64(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @sincospi_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincospi_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sincospi_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sincospi_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    call void @sincospi(double [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
@@ -3417,6 +4296,18 @@ for.cond.cleanup:
 }
 
 define void @sincospi_f32(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
+; LIBMVEC-NEON-LABEL: define void @sincospi_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sincospi_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sincospi_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sincospi_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    call void @sincospif(float [[NUM:%.*]], ptr [[GEPB:%.*]], ptr [[GEPC:%.*]])
@@ -3463,6 +4354,18 @@ declare double @sinh(double)
 declare float @sinhf(float)
 
 define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3508,6 +4411,18 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3556,6 +4471,18 @@ declare double @sinpi(double)
 declare float @sinpif(float)
 
 define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinpi_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinpi_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinpi_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinpi_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinpi(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3601,6 +4528,18 @@ define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinpi_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinpi_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinpi_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinpi_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinpif(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3649,6 +4588,18 @@ declare double @sqrt(double)
 declare float @sqrtf(float)
 
 define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sqrt_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sqrt_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @sqrt(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sqrt_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3694,6 +4645,18 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sqrt_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sqrt_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @sqrtf(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sqrt_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3742,6 +4705,18 @@ declare double @tan(double)
 declare float @tanf(float)
 
 define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tan_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tan_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tan_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3787,6 +4762,18 @@ define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tan_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tan_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tan_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3835,6 +4822,18 @@ declare double @tanh(double)
 declare float @tanhf(float)
 
 define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tanh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tanh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tanh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3880,6 +4879,18 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @tanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tanh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tanh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[TMP12:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tanh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -3928,6 +4939,18 @@ declare double @tgamma(double)
 declare float @tgammaf(float)
 
 define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tgamma_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tgamma_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tgamma_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @tgamma(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tgamma_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[WIDE_LOAD:%.*]])
@@ -3973,6 +4996,18 @@ define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) {
 }
 
 define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tgamma_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON:    [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tgamma_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-NEON-WIDTH-2:    [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tgamma_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @tgammaf(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tgamma_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[WIDE_LOAD:%.*]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
index f753df32d9ebc..f6f2e39594dd8 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll
@@ -1,5 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --filter "call.*(acos|asin|atan|atan2|cos|cosh|exp|log|sin|sinh|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|tan|tanh|trunc)" --version 2
 
+; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=LIBMVEC-NEON
+; RUN: opt -mattr=+neon -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=LIBMVEC-NEON-WIDTH-2
+; RUN: opt -mattr=+sve -vector-library=LIBMVEC -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=LIBMVEC-SVE
 ; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON
 ; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE
 ; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON
@@ -16,6 +19,19 @@ declare double @llvm.acos.f64(double)
 declare float @llvm.acos.f32(float)
 
 define void @acos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.acos.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]])
@@ -51,6 +67,19 @@ define void @acos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @acos_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @acos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @acos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_acosf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @acos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.acos.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @acos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -89,6 +118,19 @@ declare double @llvm.asin.f64(double)
 declare float @llvm.asin.f32(float)
 
 define void @asin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.asin.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]])
@@ -124,6 +166,19 @@ define void @asin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @asin_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @asin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @asin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_asinf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @asin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.asin.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @asin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -162,6 +217,19 @@ declare double @llvm.atan.f64(double)
 declare float @llvm.atan.f32(float)
 
 define void @atan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.atan.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]])
@@ -197,6 +265,19 @@ define void @atan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @atan_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_atanf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.atan.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @atan_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -235,6 +316,19 @@ declare double @llvm.atan2.f64(double, double)
 declare float @llvm.atan2.f32(float, float)
 
 define void @atan2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.atan2.f64(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @atan2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -270,6 +364,19 @@ define void @atan2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @atan2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @atan2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @atan2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_atan2f(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @atan2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.atan2.f32(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @atan2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -308,6 +415,18 @@ declare double @llvm.ceil.f64(double)
 declare float @llvm.ceil.f32(float)
 
 define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ceil_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ceil_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ceil_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ceil_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -343,6 +462,18 @@ define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @ceil_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @ceil_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @ceil_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @ceil_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @ceil_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -381,6 +512,19 @@ declare double @llvm.copysign.f64(double, double)
 declare float @llvm.copysign.f32(float, float)
 
 define void @copysign_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @copysign_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @copysign_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x double> [[WIDE_LOAD]])
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.copysign.f64(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @copysign_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -416,6 +560,19 @@ define void @copysign_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @copysign_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @copysign_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @copysign_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @copysign_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x float> [[WIDE_LOAD]])
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.copysign.f32(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @copysign_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -454,6 +611,19 @@ declare double @llvm.cos.f64(double)
 declare float @llvm.cos.f32(float)
 
 define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cos_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cos_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cos_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]])
@@ -489,6 +659,19 @@ define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @cos_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cos_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cos_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_cosf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cos_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cos_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -527,6 +710,19 @@ declare double @llvm.cosh.f64(double)
 declare float @llvm.cosh.f32(float)
 
 define void @cosh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cosh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cosh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.cosh.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cosh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -562,6 +758,19 @@ define void @cosh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @cosh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @cosh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @cosh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_coshf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @cosh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.cosh.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @cosh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -600,6 +809,19 @@ declare double @llvm.exp.f64(double)
 declare float @llvm.exp.f32(float)
 
 define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]])
@@ -635,6 +857,19 @@ define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @exp_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_expf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -673,6 +908,19 @@ declare double @llvm.exp10.f64(double)
 declare float @llvm.exp10.f32(float)
 
 define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp10_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp10_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.exp10.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp10_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]])
@@ -708,6 +956,19 @@ define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @exp10_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp10_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp10_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp10f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp10_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.exp10.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp10_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -746,6 +1007,19 @@ declare double @llvm.exp2.f64(double)
 declare float @llvm.exp2.f32(float)
 
 define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]])
@@ -781,6 +1055,19 @@ define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @exp2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @exp2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @exp2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_exp2f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @exp2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @exp2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -819,6 +1106,18 @@ declare double @llvm.fabs.f64(double)
 declare float @llvm.fabs.f32(float)
 
 define void @fabs_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fabs_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fabs_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @fabs_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @fabs_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -854,6 +1153,18 @@ define void @fabs_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @fabs_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fabs_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fabs_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @fabs_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @fabs_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -892,6 +1203,18 @@ declare double @llvm.floor.f64(double)
 declare float @llvm.floor.f32(float)
 
 define void @floor_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @floor_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @floor_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @floor_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @floor_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -927,6 +1250,18 @@ define void @floor_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @floor_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @floor_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @floor_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @floor_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @floor_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -965,6 +1300,18 @@ declare double @llvm.fma.f64(double, double, double)
 declare float @llvm.fma.f32(float, float, float)
 
 define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fma_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @fma_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @fma_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]])
@@ -1000,6 +1347,18 @@ define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @fma_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @fma_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @fma_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @fma_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @fma_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]])
@@ -1038,6 +1397,19 @@ declare double @llvm.log.f64(double)
 declare float @llvm.log.f32(float)
 
 define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.log.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1073,6 +1445,19 @@ define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @log_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_logf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.log.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1111,6 +1496,19 @@ declare double @llvm.log10.f64(double)
 declare float @llvm.log10.f32(float)
 
 define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log10_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log10_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log10_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1146,6 +1544,19 @@ define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @log10_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log10_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log10_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log10_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log10_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1184,6 +1595,19 @@ declare double @llvm.log2.f64(double)
 declare float @llvm.log2.f32(float)
 
 define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log2_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log2_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log2_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1219,6 +1643,19 @@ define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @log2_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @log2_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @log2_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_log2f(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @log2_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @log2_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1257,6 +1694,18 @@ declare double @llvm.maxnum.f64(double, double)
 declare float @llvm.maxnum.f32(float, float)
 
 define void @maxnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @maxnum_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @maxnum_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @maxnum_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @maxnum_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1292,6 +1741,18 @@ define void @maxnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @maxnum_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @maxnum_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @maxnum_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @maxnum_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @maxnum_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1330,6 +1791,18 @@ declare double @llvm.minnum.f64(double, double)
 declare float @llvm.minnum.f32(float, float)
 
 define void @minnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @minnum_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @minnum_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @minnum_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @minnum_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1365,6 +1838,18 @@ define void @minnum_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @minnum_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @minnum_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @minnum_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @minnum_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
 ; SLEEF-NEON-LABEL: define void @minnum_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1403,6 +1888,18 @@ declare double @llvm.nearbyint.f64(double)
 declare float @llvm.nearbyint.f32(float)
 
 define void @nearbyint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @nearbyint_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nearbyint_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @nearbyint_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @nearbyint_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1438,6 +1935,18 @@ define void @nearbyint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @nearbyint_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @nearbyint_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @nearbyint_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @nearbyint_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @nearbyint_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1476,6 +1985,19 @@ declare double @llvm.pow.f64(double, double)
 declare float @llvm.pow.f32(float, float)
 
 define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @pow_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @pow_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x double> [[WIDE_LOAD]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.pow.f64(double [[IN:%.*]], double [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @pow_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]])
@@ -1511,6 +2033,19 @@ define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @pow_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @pow_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @pow_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2vv_powf(<2 x float> [[WIDE_LOAD:%.*]], <2 x float> [[WIDE_LOAD]])
+;
+; LIBMVEC-SVE-LABEL: define void @pow_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x float> [[WIDE_LOAD]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.pow.f32(float [[IN:%.*]], float [[IN]])
+;
 ; SLEEF-NEON-LABEL: define void @pow_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]])
@@ -1549,6 +2084,18 @@ declare double @llvm.rint.f64(double)
 declare float @llvm.rint.f32(float)
 
 define void @rint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @rint_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @rint_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @rint_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @rint_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1584,6 +2131,18 @@ define void @rint_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @rint_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @rint_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @rint_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @rint_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @rint_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1622,6 +2181,18 @@ declare double @llvm.round.f64(double)
 declare float @llvm.round.f32(float)
 
 define void @round_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @round_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @round_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @round_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @round_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1657,6 +2228,18 @@ define void @round_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @round_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @round_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @round_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @round_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @round_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1695,6 +2278,19 @@ declare double @llvm.sin.f64(double)
 declare float @llvm.sin.f32(float)
 
 define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sin_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sin_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sin_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1730,6 +2326,19 @@ define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @sin_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sin_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sin_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sin_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sin_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1768,6 +2377,19 @@ declare double @llvm.sinh.f64(double)
 declare float @llvm.sinh.f32(float)
 
 define void @sinh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.sinh.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1803,6 +2425,19 @@ define void @sinh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @sinh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sinh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sinh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_sinhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sinh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.sinh.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sinh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1841,6 +2476,18 @@ declare double @llvm.sqrt.f64(double)
 declare float @llvm.sqrt.f32(float)
 
 define void @sqrt_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sqrt_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sqrt_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sqrt_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1876,6 +2523,18 @@ define void @sqrt_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @sqrt_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @sqrt_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @sqrt_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @sqrt_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @sqrt_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1914,6 +2573,19 @@ declare double @llvm.tan.f64(double)
 declare float @llvm.tan.f32(float)
 
 define void @tan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tan_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tan_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.tan.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tan_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]])
@@ -1949,6 +2621,19 @@ define void @tan_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @tan_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tan_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tan_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tan_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.tan.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tan_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -1987,6 +2672,19 @@ declare double @llvm.tanh.f64(double)
 declare float @llvm.tanh.f32(float)
 
 define void @tanh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tanh_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tanh_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[WIDE_LOAD:%.*]], <vscale x 2 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call double @llvm.tanh.f64(double [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tanh_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2022,6 +2720,19 @@ define void @tanh_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @tanh_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @tanh_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call aarch64_vector_pcs <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @tanh_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call aarch64_vector_pcs <2 x float> @_ZGVnN2v_tanhf(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @tanh_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP8:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[WIDE_LOAD:%.*]], <vscale x 4 x i1> splat (i1 true))
+; LIBMVEC-SVE:    [[CALL:%.*]] = tail call float @llvm.tanh.f32(float [[IN:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @tanh_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]])
@@ -2060,6 +2771,18 @@ declare double @llvm.trunc.f64(double)
 declare float @llvm.trunc.f32(float)
 
 define void @trunc_f64(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @trunc_f64
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @trunc_f64
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @trunc_f64
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @trunc_f64
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]])
@@ -2095,6 +2818,18 @@ define void @trunc_f64(ptr noalias %in.ptr, ptr %out.ptr) {
 }
 
 define void @trunc_f32(ptr noalias %in.ptr, ptr %out.ptr) {
+; LIBMVEC-NEON-LABEL: define void @trunc_f32
+; LIBMVEC-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-NEON-WIDTH-2-LABEL: define void @trunc_f32
+; LIBMVEC-NEON-WIDTH-2-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-NEON-WIDTH-2:    [[TMP2:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[WIDE_LOAD:%.*]])
+;
+; LIBMVEC-SVE-LABEL: define void @trunc_f32
+; LIBMVEC-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
+; LIBMVEC-SVE:    [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
+;
 ; SLEEF-NEON-LABEL: define void @trunc_f32
 ; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] {
 ; SLEEF-NEON:    [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]])
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index a1f660d31668e..5459512239b64 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -1,15 +1,13 @@
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=SVML -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SVML
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=AMDLIBM -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,AMDLIBM
 ; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -vector-library=MASSV -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,MASSV
-; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=LIBMVEC-AARCH64
+; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,LIBMVEC-AARCH64
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=LIBMVEC -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,LIBMVEC-X86
 ; RUN: opt -mtriple=x86_64-unknown-linux-gnu -vector-library=Accelerate -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,ACCELERATE
 ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SLEEFGNUABI
 ; RUN: opt -mtriple=riscv64-unknown-linux-gnu -vector-library=sleefgnuabi -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,SLEEFGNUABI_RISCV
 ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -vector-library=ArmPL -passes=inject-tli-mappings -S < %s | FileCheck %s  --check-prefixes=COMMON,ARMPL
 
-; LIBMVEC-AARCH64-NOT: llvm.compiler.used
-
 ; COMMON-LABEL: @llvm.compiler.used = appending global
 ; SVML-SAME:        [6 x ptr] [
 ; SVML-SAME:          ptr @__svml_sin2,
@@ -35,6 +33,12 @@
 ; MASSV-SAME:         ptr @__log10f4
 ; ACCELERATE-SAME:  [1 x ptr] [
 ; ACCELERATE-SAME:    ptr @vlog10f
+; LIBMVEC-AARCH64-SAME: [5 x ptr] [
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVnN2v_sin,
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVsMxv_sin,
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVnN2v_log10f,
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVnN4v_log10f,
+; LIBMVEC-AARCH64-SAME:   ptr @_ZGVsMxv_log10f
 ; LIBMVEC-X86-SAME: [2 x ptr] [
 ; LIBMVEC-X86-SAME:   ptr @_ZGVbN2v_sin,
 ; LIBMVEC-X86-SAME:   ptr @_ZGVdN4v_sin
@@ -100,6 +104,7 @@ define double @sin_f64(double %in) {
 ; AMDLIBM:            call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; MASSV:              call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; ACCELERATE:         call double @sin(double %{{.*}})
+; LIBMVEC-AARCH64:    call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; LIBMVEC-X86:        call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; SLEEFGNUABI:        call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
 ; SLEEFGNUABI_RISCV:  call double @sin(double %{{.*}}) #[[SIN:[0-9]+]]
@@ -158,6 +163,7 @@ define float @call_llvm.log10.f32(float %in) {
 ; COMMON-LABEL:       @call_llvm.log10.f32(
 ; SVML:               call float @llvm.log10.f32(float %{{.*}})
 ; AMDLIBM:            call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
+; LIBMVEC-AARCH64:    call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
 ; LIBMVEC-X86:        call float @llvm.log10.f32(float %{{.*}})
 ; MASSV:              call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
 ; ACCELERATE:         call float @llvm.log10.f32(float %{{.*}}) #[[LOG10:[0-9]+]]
@@ -167,6 +173,7 @@ define float @call_llvm.log10.f32(float %in) {
 ; No mapping of "llvm.log10.f32" to a vector function for SVML.
 ; SVML-NOT:        _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
 ; AMDLIBM-NOT:        _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
+; LIBMVEC-AARCH64-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
 ; LIBMVEC-X86-NOT: _ZGV_LLVM_{{.*}}_llvm.log10.f32({{.*}})
   %call = tail call float @llvm.log10.f32(float %in)
   ret float %call
@@ -196,8 +203,11 @@ declare float @llvm.log10.f32(float) #0
 ; MASSV: declare <2 x double> @__sind2(<2 x double>)
 ; MASSV: declare <4 x float> @__log10f4(<4 x float>)
 
-; LIBMVEC-AARCH64-NOT: declare <2 x double> @_ZGVbN2v_sin(<2 x double>)
-; LIBMVEC-AARCH64-NOT: declare <4 x double> @_ZGVdN4v_sin(<4 x double>)
+; LIBMVEC-AARCH64: declare aarch64_vector_pcs <2 x double> @_ZGVnN2v_sin(<2 x double>)
+; LIBMVEC-AARCH64: declare <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double>, <vscale x 2 x i1>)
+; LIBMVEC-AARCH64: declare aarch64_vector_pcs <2 x float> @_ZGVnN2v_log10f(<2 x float>)
+; LIBMVEC-AARCH64: declare aarch64_vector_pcs <4 x float> @_ZGVnN4v_log10f(<4 x float>)
+; LIBMVEC-AARCH64: declare <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float>, <vscale x 4 x i1>)
 
 ; LIBMVEC-X86: declare <2 x double> @_ZGVbN2v_sin(<2 x double>)
 ; LIBMVEC-X86: declare <4 x double> @_ZGVdN4v_sin(<4 x double>)
@@ -272,6 +282,14 @@ attributes #0 = { nounwind readnone }
 ; ACCELERATE:      attributes #[[LOG10]] = { "vector-function-abi-variant"=
 ; ACCELERATE-SAME:   "_ZGV_LLVM_N4v_llvm.log10.f32(vlog10f)" }
 
+; LIBMVEC-AARCH64:      attributes #[[SIN]] = { "vector-function-abi-variant"=
+; LIBMVEC-AARCH64-SAME:   "_ZGV_LLVM_N2v_sin(_ZGVnN2v_sin),
+; LIBMVEC-AARCH64-SAME:   _ZGVsMxv_sin(_ZGVsMxv_sin)" }
+; LIBMVEC-AARCH64:      attributes #[[LOG10]] = { "vector-function-abi-variant"=
+; LIBMVEC-AARCH64-SAME:   "_ZGV_LLVM_N2v_llvm.log10.f32(_ZGVnN2v_log10f),
+; LIBMVEC-AARCH64-SAME:   _ZGV_LLVM_N4v_llvm.log10.f32(_ZGVnN4v_log10f),
+; LIBMVEC-AARCH64-SAME:   _ZGVsMxv_llvm.log10.f32(_ZGVsMxv_log10f)" }
+
 ; LIBMVEC-X86:      attributes #[[SIN]] = { "vector-function-abi-variant"=
 ; LIBMVEC-X86-SAME:   "_ZGV_LLVM_N2v_sin(_ZGVbN2v_sin),
 ; LIBMVEC-X86-SAME:   _ZGV_LLVM_N4v_sin(_ZGVdN4v_sin)" }

From 465e3ce9f10019db071dc7794ae9ab22f9fc76f7 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker@arm.com>
Date: Tue, 17 Jun 2025 11:09:22 +0100
Subject: [PATCH 711/851] [LLVM][CodeGen] Lower ConstantInt vectors like
 shufflevector base splats. (#144395)

ConstantInt vectors utilise DAG.getConstant() when constructing the
initial DAG. This can have the effect of legalising the constant before
the DAG combiner is run, significant altering the generated code. To
mitigate this (hopefully as a temporary measure) we instead try to
construct the DAG in the same way as shufflevector based splats.
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 22 +++++++++++++++++--
 llvm/test/CodeGen/AArch64/sve-expand-div.ll   |  1 +
 .../AArch64/sve-fixed-length-sdiv-pow2.ll     |  1 +
 llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll    |  1 +
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c63eb7fc6b374..4f548cbad5c30 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1791,8 +1791,26 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
   if (const Constant *C = dyn_cast<Constant>(V)) {
     EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);
 
-    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
-      return DAG.getConstant(*CI, getCurSDLoc(), VT);
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+      SDLoc DL = getCurSDLoc();
+
+      // DAG.getConstant() may attempt to legalise the vector constant which can
+      // significantly change the combines applied to the DAG. To reduce the
+      // divergence when enabling ConstantInt based vectors we try to construct
+      // the DAG in the same way as shufflevector based splats. TODO: The
+      // divergence sometimes leads to better optimisations. Ideally we should
+      // prevent DAG.getConstant() from legalising too early but there are some
+      // degradations preventing this.
+      if (VT.isScalableVector())
+        return DAG.getNode(
+            ISD::SPLAT_VECTOR, DL, VT,
+            DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType()));
+      if (VT.isFixedLengthVector())
+        return DAG.getSplatBuildVector(
+            VT, DL,
+            DAG.getConstant(CI->getValue(), DL, VT.getVectorElementType()));
+      return DAG.getConstant(*CI, DL, VT);
+    }
 
     if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
       return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
index 180c64e0a7de1..bd6c72a3946c1 100644
--- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -use-constant-int-for-scalable-splat < %s | FileCheck %s
 
 ; Check that expensive divides are expanded into a more performant sequence
 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
index 8b4386e2c2216..45781fa47c6de 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=128 -use-constant-int-for-fixed-length-splat < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128
 
 target triple = "aarch64-unknown-linux-gnu"
 
diff --git a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
index 4607f225f81ea..a799b51f15cb1 100644
--- a/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-sdiv-pow2.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s | FileCheck %s
+; RUN: llc -use-constant-int-for-scalable-splat < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 

From 71f72f4d5d1b820a3e6147289547821332eaf115 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 11:21:56 +0100
Subject: [PATCH 712/851] [DAG] Move foldMaskedMerge before visitAND. NFC.

Reduces diff in #144342
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f6d811ddba8ab..d14615dcbc5ee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7206,6 +7206,32 @@ static SDValue foldLogicTreeOfShifts(SDNode *N, SDValue LeftHand,
   return DAG.getNode(LogicOpcode, DL, VT, CombinedShifts, W);
 }
 
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y)` pattern.
+/// This is typically a better representation for targets without a fused
+/// "and-not" operation.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
+                               const TargetLowering &TLI, const SDLoc &DL) {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+
+  // If the target supports and-not, don't fold this.
+  if (TLI.hasAndNot(SDValue(Node, 0)))
+    return SDValue();
+
+  SDValue M, X, Y;
+  if (sd_match(Node,
+               m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))),
+                    m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) {
+    EVT VT = M.getValueType();
+    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y);
+    SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M);
+    return DAG.getNode(ISD::XOR, DL, VT, And, Y);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8136,32 +8162,6 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for targets without a fused
-/// "and-not" operation.
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
-                               const TargetLowering &TLI, const SDLoc &DL) {
-  // Note that masked-merge variants using XOR or ADD expressions are
-  // normalized to OR by InstCombine so we only check for OR.
-  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-
-  // If the target supports and-not, don't fold this.
-  if (TLI.hasAndNot(SDValue(Node, 0)))
-    return SDValue();
-
-  SDValue M, X, Y;
-  if (sd_match(Node,
-               m_Or(m_OneUse(m_And(m_OneUse(m_Not(m_Value(M))), m_Value(Y))),
-                    m_OneUse(m_And(m_Deferred(M), m_Value(X)))))) {
-    EVT VT = M.getValueType();
-    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, Y);
-    SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor, M);
-    return DAG.getNode(ISD::XOR, DL, VT, And, Y);
-  }
-  return SDValue();
-}
-
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);

From d3f13a0732c2d937a4c12cb8b1a61992ee5b0d9c Mon Sep 17 00:00:00 2001
From: Antonio Frighetto <me@antoniofrighetto.com>
Date: Tue, 17 Jun 2025 12:30:47 +0200
Subject: [PATCH 713/851] [GVN] MemorySSA for GVN: embed the memory state in
 symbolic expressions (#123218)

While migrating towards MemorySSA, account for the memory state modeled
by MemorySSA by hashing it, when computing the symbolic expressions for
the memory operations. Likewise, when phi-translating while walking the
CFG for PRE possibilities, see if the value number of an operand may be
refined with one of the value from the incoming edges of the MemoryPhi
associated to the current phi.

Co-authored-by: Momchil Velikov <momchil.velikov@arm.com>
---
 llvm/include/llvm/Transforms/Scalar/GVN.h | 21 +++++-
 llvm/lib/Transforms/Scalar/GVN.cpp        | 89 +++++++++++++++++++++--
 2 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index e156ec469a14f..245414935bc0f 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -46,7 +46,9 @@ class ImplicitControlFlowTracking;
 class LoadInst;
 class LoopInfo;
 class MemDepResult;
+class MemoryAccess;
 class MemoryDependenceResults;
+class MemoryLocation;
 class MemorySSA;
 class MemorySSAUpdater;
 class NonLocalDepResult;
@@ -170,6 +172,10 @@ class GVNPass : public PassInfoMixin<GVNPass> {
     // Value number to PHINode mapping. Used for phi-translate in scalarpre.
     DenseMap<uint32_t, PHINode *> NumberingPhi;
 
+    // Value number to BasicBlock mapping. Used for phi-translate across
+    // MemoryPhis.
+    DenseMap<uint32_t, BasicBlock *> NumberingBB;
+
     // Cache for phi-translate in scalarpre.
     using PhiTranslateMap =
         DenseMap<std::pair<uint32_t, const BasicBlock *>, uint32_t>;
@@ -177,6 +183,9 @@ class GVNPass : public PassInfoMixin<GVNPass> {
 
     AAResults *AA = nullptr;
     MemoryDependenceResults *MD = nullptr;
+    bool IsMDEnabled = false;
+    MemorySSA *MSSA = nullptr;
+    bool IsMSSAEnabled = false;
     DominatorTree *DT = nullptr;
 
     uint32_t NextValueNumber = 1;
@@ -187,12 +196,14 @@ class GVNPass : public PassInfoMixin<GVNPass> {
     Expression createExtractvalueExpr(ExtractValueInst *EI);
     Expression createGEPExpr(GetElementPtrInst *GEP);
     uint32_t lookupOrAddCall(CallInst *C);
+    uint32_t computeLoadStoreVN(Instruction *I);
     uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock,
                               uint32_t Num, GVNPass &GVN);
     bool areCallValsEqual(uint32_t Num, uint32_t NewNum, const BasicBlock *Pred,
                           const BasicBlock *PhiBlock, GVNPass &GVN);
     std::pair<uint32_t, bool> assignExpNewValueNum(Expression &Exp);
     bool areAllValsInBB(uint32_t Num, const BasicBlock *BB, GVNPass &GVN);
+    void addMemoryStateToExp(Instruction *I, Expression &Exp);
 
   public:
     LLVM_ABI ValueTable();
@@ -201,6 +212,7 @@ class GVNPass : public PassInfoMixin<GVNPass> {
     LLVM_ABI ~ValueTable();
     LLVM_ABI ValueTable &operator=(const ValueTable &Arg);
 
+    LLVM_ABI uint32_t lookupOrAdd(MemoryAccess *MA);
     LLVM_ABI uint32_t lookupOrAdd(Value *V);
     LLVM_ABI uint32_t lookup(Value *V, bool Verify = true) const;
     LLVM_ABI uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred,
@@ -216,7 +228,14 @@ class GVNPass : public PassInfoMixin<GVNPass> {
     LLVM_ABI void erase(Value *V);
     void setAliasAnalysis(AAResults *A) { AA = A; }
     AAResults *getAliasAnalysis() const { return AA; }
-    void setMemDep(MemoryDependenceResults *M) { MD = M; }
+    void setMemDep(MemoryDependenceResults *M, bool MDEnabled = true) {
+      MD = M;
+      IsMDEnabled = MDEnabled;
+    }
+    void setMemorySSA(MemorySSA *M, bool MSSAEnabled = false) {
+      MSSA = M;
+      IsMSSAEnabled = MSSAEnabled;
+    }
     void setDomTree(DominatorTree *D) { DT = D; }
     uint32_t getNextUnusedValueNumber() { return NextValueNumber; }
     LLVM_ABI void verifyRemoved(const Value *) const;
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index c8a0479358eab..c580dd4ff230a 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -474,6 +474,19 @@ void GVNPass::ValueTable::add(Value *V, uint32_t Num) {
     NumberingPhi[Num] = PN;
 }
 
+/// Include the incoming memory state into the hash of the expression for the
+/// given instruction. If the incoming memory state is:
+/// * LiveOnEntry, add the value number of the entry block,
+/// * a MemoryPhi, add the value number of the basic block corresponding to that
+/// MemoryPhi,
+/// * a MemoryDef, add the value number of the memory setting instruction.
+void GVNPass::ValueTable::addMemoryStateToExp(Instruction *I, Expression &Exp) {
+  assert(MSSA && "addMemoryStateToExp should not be called without MemorySSA");
+  assert(MSSA->getMemoryAccess(I) && "Instruction does not access memory");
+  MemoryAccess *MA = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(I);
+  Exp.VarArgs.push_back(lookupOrAdd(MA));
+}
+
 uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) {
   // FIXME: Currently the calls which may access the thread id may
   // be considered as not accessing the memory. But this is
@@ -594,15 +607,48 @@ uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) {
     return V;
   }
 
+  if (MSSA && IsMSSAEnabled && AA->onlyReadsMemory(C)) {
+    Expression Exp = createExpr(C);
+    addMemoryStateToExp(C, Exp);
+    auto [V, _] = assignExpNewValueNum(Exp);
+    ValueNumbering[C] = V;
+    return V;
+  }
+
   ValueNumbering[C] = NextValueNumber;
   return NextValueNumber++;
 }
 
+/// Returns the value number for the specified load or store instruction.
+uint32_t GVNPass::ValueTable::computeLoadStoreVN(Instruction *I) {
+  if (!MSSA || !IsMSSAEnabled) {
+    ValueNumbering[I] = NextValueNumber;
+    return NextValueNumber++;
+  }
+
+  Expression Exp;
+  Exp.Ty = I->getType();
+  Exp.Opcode = I->getOpcode();
+  for (Use &Op : I->operands())
+    Exp.VarArgs.push_back(lookupOrAdd(Op));
+  addMemoryStateToExp(I, Exp);
+
+  auto [V, _] = assignExpNewValueNum(Exp);
+  ValueNumbering[I] = V;
+  return V;
+}
+
 /// Returns true if a value number exists for the specified value.
 bool GVNPass::ValueTable::exists(Value *V) const {
   return ValueNumbering.contains(V);
 }
 
+uint32_t GVNPass::ValueTable::lookupOrAdd(MemoryAccess *MA) {
+  return MSSA->isLiveOnEntryDef(MA) || isa<MemoryPhi>(MA)
+             ? lookupOrAdd(MA->getBlock())
+             : lookupOrAdd(cast<MemoryUseOrDef>(MA)->getMemoryInst());
+}
+
 /// lookupOrAdd - Returns the value number for the specified value, assigning
 /// it a new number if it did not have one before.
 uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
@@ -613,6 +659,8 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
   auto *I = dyn_cast<Instruction>(V);
   if (!I) {
     ValueNumbering[V] = NextValueNumber;
+    if (isa<BasicBlock>(V))
+      NumberingBB[NextValueNumber] = cast<BasicBlock>(V);
     return NextValueNumber++;
   }
 
@@ -672,6 +720,9 @@ uint32_t GVNPass::ValueTable::lookupOrAdd(Value *V) {
       ValueNumbering[V] = NextValueNumber;
       NumberingPhi[NextValueNumber] = cast<PHINode>(V);
       return NextValueNumber++;
+    case Instruction::Load:
+    case Instruction::Store:
+      return computeLoadStoreVN(I);
     default:
       ValueNumbering[V] = NextValueNumber;
       return NextValueNumber++;
@@ -709,6 +760,7 @@ void GVNPass::ValueTable::clear() {
   ValueNumbering.clear();
   ExpressionNumbering.clear();
   NumberingPhi.clear();
+  NumberingBB.clear();
   PhiTranslateTable.clear();
   NextValueNumber = 1;
   Expressions.clear();
@@ -723,6 +775,8 @@ void GVNPass::ValueTable::erase(Value *V) {
   // If V is PHINode, V <--> value number is an one-to-one mapping.
   if (isa<PHINode>(V))
     NumberingPhi.erase(Num);
+  else if (isa<BasicBlock>(V))
+    NumberingBB.erase(Num);
 }
 
 /// verifyRemoved - Verify that the value is removed from all internal data
@@ -2310,15 +2364,39 @@ bool GVNPass::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
 uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
                                                const BasicBlock *PhiBlock,
                                                uint32_t Num, GVNPass &GVN) {
+  // See if we can refine the value number by looking at the PN incoming value
+  // for the given predecessor.
   if (PHINode *PN = NumberingPhi[Num]) {
-    for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) {
-      if (PN->getParent() == PhiBlock && PN->getIncomingBlock(I) == Pred)
-        if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false))
-          return TransVal;
-    }
+    if (PN->getParent() == PhiBlock)
+      for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I)
+        if (PN->getIncomingBlock(I) == Pred)
+          if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false))
+            return TransVal;
     return Num;
   }
 
+  if (BasicBlock *BB = NumberingBB[Num]) {
+    assert(MSSA && "NumberingBB is non-empty only when using MemorySSA");
+    // Value numbers of basic blocks are used to represent memory state in
+    // load/store instructions and read-only function calls when said state is
+    // set by a MemoryPhi.
+    if (BB != PhiBlock)
+      return Num;
+    MemoryPhi *MPhi = MSSA->getMemoryAccess(BB);
+    for (unsigned i = 0, N = MPhi->getNumIncomingValues(); i != N; ++i) {
+      if (MPhi->getIncomingBlock(i) != Pred)
+        continue;
+      MemoryAccess *MA = MPhi->getIncomingValue(i);
+      if (auto *PredPhi = dyn_cast<MemoryPhi>(MA))
+        return lookupOrAdd(PredPhi->getBlock());
+      if (MSSA->isLiveOnEntryDef(MA))
+        return lookupOrAdd(&BB->getParent()->getEntryBlock());
+      return lookupOrAdd(cast<MemoryUseOrDef>(MA)->getMemoryInst());
+    }
+    llvm_unreachable(
+        "CFG/MemorySSA mismatch: predecessor not found among incoming blocks");
+  }
+
   // If there is any value related with Num is defined in a BB other than
   // PhiBlock, it cannot depend on a phi in PhiBlock without going through
   // a backedge. We can do an early exit in that case to save compile time.
@@ -2761,6 +2839,7 @@ bool GVNPass::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
   ICF = &ImplicitCFT;
   this->LI = &LI;
   VN.setMemDep(MD);
+  VN.setMemorySSA(MSSA);
   ORE = RunORE;
   InvalidBlockRPONumbers = true;
   MemorySSAUpdater Updater(MSSA);

From ce96fdde54c379fa3893f3f07d8233df9e16b9e2 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Tue, 17 Jun 2025 12:38:02 +0200
Subject: [PATCH 714/851] [clang][bytecode] Keep the last chunk in
 InterpStack::clear() (#144487)

We call clear when checking for potential constant expressions, but that
used to free all the chunks. Keep the last one so we don't have to
re-allocate it.
---
 clang/lib/AST/ByteCode/InterpStack.cpp | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpStack.cpp b/clang/lib/AST/ByteCode/InterpStack.cpp
index b183335dd5884..6b748d62b83bd 100644
--- a/clang/lib/AST/ByteCode/InterpStack.cpp
+++ b/clang/lib/AST/ByteCode/InterpStack.cpp
@@ -19,9 +19,7 @@
 using namespace clang;
 using namespace clang::interp;
 
-InterpStack::~InterpStack() { clear(); }
-
-void InterpStack::clear() {
+InterpStack::~InterpStack() {
   if (Chunk && Chunk->Next)
     std::free(Chunk->Next);
   if (Chunk)
@@ -33,6 +31,21 @@ void InterpStack::clear() {
 #endif
 }
 
+// We keep the last chunk around to reuse.
+void InterpStack::clear() {
+  if (!Chunk)
+    return;
+
+  if (Chunk->Next)
+    std::free(Chunk->Next);
+
+  assert(Chunk);
+  StackSize = 0;
+#ifndef NDEBUG
+  ItemTypes.clear();
+#endif
+}
+
 void InterpStack::clearTo(size_t NewSize) {
   assert(NewSize <= size());
   size_t ToShrink = size() - NewSize;

From 576ced56d78b48e658b0a170603388e4802f6311 Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Tue, 17 Jun 2025 12:43:39 +0200
Subject: [PATCH 715/851] [clang][bytecode] Simplify Block::replacePointer()
 (#144490)

Try to do less work here instead of a full remove + add.
---
 clang/lib/AST/ByteCode/InterpBlock.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBlock.cpp b/clang/lib/AST/ByteCode/InterpBlock.cpp
index 9ef44cd29ff87..f60307870ffcc 100644
--- a/clang/lib/AST/ByteCode/InterpBlock.cpp
+++ b/clang/lib/AST/ByteCode/InterpBlock.cpp
@@ -69,20 +69,26 @@ void Block::cleanup() {
 void Block::replacePointer(Pointer *Old, Pointer *New) {
   assert(Old);
   assert(New);
+  assert(Old != New);
   if (IsStatic) {
     assert(!Pointers);
     return;
   }
-
 #ifndef NDEBUG
   assert(hasPointer(Old));
 #endif
 
-  removePointer(Old);
-  addPointer(New);
+  if (Old->Prev)
+    Old->Prev->Next = New;
+  if (Old->Next)
+    Old->Next->Prev = New;
+  New->Prev = Old->Prev;
+  New->Next = Old->Next;
+  if (Pointers == Old)
+    Pointers = New;
 
   Old->PointeeStorage.BS.Pointee = nullptr;
-
+  New->PointeeStorage.BS.Pointee = this;
 #ifndef NDEBUG
   assert(!hasPointer(Old));
   assert(hasPointer(New));

From 49c6235d1fb3bcecfe37a8e41bec69d6c7dc86ff Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 12:51:37 +0200
Subject: [PATCH 716/851] [PowerPC] Regenerate MIR test checks (NFC)

---
 .../PowerPC/aix-vector-vararg-fixed-caller.ll | 137 +++++++++---------
 1 file changed, 69 insertions(+), 68 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
index f3e58b7897948..fad275f58cd01 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-fixed-caller.ll
@@ -12,76 +12,77 @@ define void @caller() {
 
   ; 32BIT-LABEL: name: caller
   ; 32BIT: bb.0.entry:
-  ; 32BIT:   ADJCALLSTACKDOWN 88, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI:%[0-9]+]]:gprc = LI 64
-  ; 32BIT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
-  ; 32BIT:   [[LIS:%[0-9]+]]:gprc = LIS 38314
-  ; 32BIT:   [[ORI:%[0-9]+]]:gprc = ORI killed [[LIS]], 63376
-  ; 32BIT:   STW killed [[ORI]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
-  ; 32BIT:   [[LIS1:%[0-9]+]]:gprc = LIS 16389
-  ; 32BIT:   [[ORI1:%[0-9]+]]:gprc = ORI killed [[LIS1]], 48905
-  ; 32BIT:   STW killed [[ORI1]], 80, $r1 :: (store (s32), align 8)
-  ; 32BIT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LWZtoc2:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.2, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc2]] :: (load (s64) from constant-pool)
-  ; 32BIT:   [[LIS2:%[0-9]+]]:gprc = LIS 16393
-  ; 32BIT:   [[ORI2:%[0-9]+]]:gprc = ORI killed [[LIS2]], 8697
-  ; 32BIT:   [[LIS3:%[0-9]+]]:gprc = LIS 61467
-  ; 32BIT:   [[ORI3:%[0-9]+]]:gprc = ORI killed [[LIS3]], 34414
-  ; 32BIT:   [[LWZtoc3:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.3, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc3]] :: (load (s64) from constant-pool)
-  ; 32BIT:   [[LI1:%[0-9]+]]:gprc = LI 55
-  ; 32BIT:   $r3 = COPY [[LI1]]
-  ; 32BIT:   $v2 = COPY [[LXVW4X1]]
-  ; 32BIT:   $f1 = COPY [[LFD]]
-  ; 32BIT:   $r9 = COPY [[ORI2]]
-  ; 32BIT:   $r10 = COPY [[ORI3]]
-  ; 32BIT:   $f2 = COPY [[LFD1]]
-  ; 32BIT:   BL_NOP <mcsymbol .callee[PR]>, csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $v2, implicit $f1, implicit $r9, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1, implicit-def $v2
-  ; 32BIT:   ADJCALLSTACKUP 88, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
-  ; 32BIT:   BLR implicit $lr, implicit $rm
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 88, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI:%[0-9]+]]:gprc = LI 64
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LIS:%[0-9]+]]:gprc = LIS 38314
+  ; 32BIT-NEXT:   [[ORI:%[0-9]+]]:gprc = ORI killed [[LIS]], 63376
+  ; 32BIT-NEXT:   STW killed [[ORI]], 84, $r1 :: (store (s32) into unknown-address + 4, basealign 8)
+  ; 32BIT-NEXT:   [[LIS1:%[0-9]+]]:gprc = LIS 16389
+  ; 32BIT-NEXT:   [[ORI1:%[0-9]+]]:gprc = ORI killed [[LIS1]], 48905
+  ; 32BIT-NEXT:   STW killed [[ORI1]], 80, $r1 :: (store (s32), align 8)
+  ; 32BIT-NEXT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LWZtoc2:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc2]] :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   [[LIS2:%[0-9]+]]:gprc = LIS 16393
+  ; 32BIT-NEXT:   [[ORI2:%[0-9]+]]:gprc = ORI killed [[LIS2]], 8697
+  ; 32BIT-NEXT:   [[LIS3:%[0-9]+]]:gprc = LIS 61467
+  ; 32BIT-NEXT:   [[ORI3:%[0-9]+]]:gprc = ORI killed [[LIS3]], 34414
+  ; 32BIT-NEXT:   [[LWZtoc3:%[0-9]+]]:gprc_and_gprc_nor0 = LWZtoc %const.3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LWZtoc3]] :: (load (s64) from constant-pool)
+  ; 32BIT-NEXT:   [[LI1:%[0-9]+]]:gprc = LI 55
+  ; 32BIT-NEXT:   $r3 = COPY [[LI1]]
+  ; 32BIT-NEXT:   $v2 = COPY [[LXVW4X1]]
+  ; 32BIT-NEXT:   $f1 = COPY [[LFD]]
+  ; 32BIT-NEXT:   $r9 = COPY [[ORI2]]
+  ; 32BIT-NEXT:   $r10 = COPY [[ORI3]]
+  ; 32BIT-NEXT:   $f2 = COPY [[LFD1]]
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .callee[PR]>, csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $v2, implicit $f1, implicit $r9, implicit $r10, implicit $f2, implicit $r2, implicit-def $r1, implicit-def $v2
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 88, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
   ; 64BIT-LABEL: name: caller
   ; 64BIT: bb.0.entry:
-  ; 64BIT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT:   [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_:%[0-9]+]]:g8rc = LI8 96
-  ; 64BIT:   STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128))
-  ; 64BIT:   [[LIS8_:%[0-9]+]]:g8rc = LIS8 16389
-  ; 64BIT:   [[ORI8_:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_]], 48905
-  ; 64BIT:   [[RLDIC:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_]], 32, 1
-  ; 64BIT:   [[ORIS8_:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC]], 38314
-  ; 64BIT:   [[ORI8_1:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_]], 63376
-  ; 64BIT:   STD killed [[ORI8_1]], 112, $x1 :: (store (s64))
-  ; 64BIT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
-  ; 64BIT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
-  ; 64BIT:   [[LDtocCPT2:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.2, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT2]] :: (load (s64) from constant-pool)
-  ; 64BIT:   [[LDtocCPT3:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.3, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT3]] :: (load (s64) from constant-pool)
-  ; 64BIT:   [[LIS8_1:%[0-9]+]]:g8rc = LIS8 16393
-  ; 64BIT:   [[ORI8_2:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_1]], 8697
-  ; 64BIT:   [[RLDIC1:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_2]], 32, 1
-  ; 64BIT:   [[ORIS8_1:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC1]], 61467
-  ; 64BIT:   [[ORI8_3:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_1]], 34414
-  ; 64BIT:   [[LI8_1:%[0-9]+]]:g8rc = LI8 55
-  ; 64BIT:   $x3 = COPY [[LI8_1]]
-  ; 64BIT:   $v2 = COPY [[LXVW4X1]]
-  ; 64BIT:   $f1 = COPY [[LFD]]
-  ; 64BIT:   $x7 = COPY [[ORI8_3]]
-  ; 64BIT:   $x9 = COPY [[LD1]]
-  ; 64BIT:   $x10 = COPY [[LD]]
-  ; 64BIT:   $f2 = COPY [[LFD1]]
-  ; 64BIT:   BL8_NOP <mcsymbol .callee[PR]>, csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $v2, implicit $f1, implicit $x7, implicit $x9, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1, implicit-def $v2
-  ; 64BIT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
-  ; 64BIT:   BLR8 implicit $lr8, implicit $rm
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_:%[0-9]+]]:g8rc = LI8 96
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LIS8_:%[0-9]+]]:g8rc = LIS8 16389
+  ; 64BIT-NEXT:   [[ORI8_:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_]], 48905
+  ; 64BIT-NEXT:   [[RLDIC:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_]], 32, 1
+  ; 64BIT-NEXT:   [[ORIS8_:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC]], 38314
+  ; 64BIT-NEXT:   [[ORI8_1:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_]], 63376
+  ; 64BIT-NEXT:   STD killed [[ORI8_1]], 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LDtocCPT2:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LFD:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT2]] :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   [[LDtocCPT3:%[0-9]+]]:g8rc_and_g8rc_nox0 = LDtocCPT %const.3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LFD1:%[0-9]+]]:f8rc = LFD 0, killed [[LDtocCPT3]] :: (load (s64) from constant-pool)
+  ; 64BIT-NEXT:   [[LIS8_1:%[0-9]+]]:g8rc = LIS8 16393
+  ; 64BIT-NEXT:   [[ORI8_2:%[0-9]+]]:g8rc = ORI8 killed [[LIS8_1]], 8697
+  ; 64BIT-NEXT:   [[RLDIC1:%[0-9]+]]:g8rc = RLDIC killed [[ORI8_2]], 32, 1
+  ; 64BIT-NEXT:   [[ORIS8_1:%[0-9]+]]:g8rc = ORIS8 killed [[RLDIC1]], 61467
+  ; 64BIT-NEXT:   [[ORI8_3:%[0-9]+]]:g8rc = ORI8 killed [[ORIS8_1]], 34414
+  ; 64BIT-NEXT:   [[LI8_1:%[0-9]+]]:g8rc = LI8 55
+  ; 64BIT-NEXT:   $x3 = COPY [[LI8_1]]
+  ; 64BIT-NEXT:   $v2 = COPY [[LXVW4X1]]
+  ; 64BIT-NEXT:   $f1 = COPY [[LFD]]
+  ; 64BIT-NEXT:   $x7 = COPY [[ORI8_3]]
+  ; 64BIT-NEXT:   $x9 = COPY [[LD1]]
+  ; 64BIT-NEXT:   $x10 = COPY [[LD]]
+  ; 64BIT-NEXT:   $f2 = COPY [[LFD1]]
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .callee[PR]>, csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $v2, implicit $f1, implicit $x7, implicit $x9, implicit $x10, implicit $f2, implicit $x2, implicit-def $r1, implicit-def $v2
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
 entry:
   %call = tail call <4 x i32> (i32, <4 x i32>, double, ...) @callee(i32 signext 55, <4 x i32> <i32 170, i32 187, i32 204, i32 221>, double 3.141590e+00, <4 x i32> <i32 10, i32 20, i32 30, i32 40>, double 2.718280e+00)
   ret void

From 2d336e7c5e821383816a9dca080f713747cc9e1e Mon Sep 17 00:00:00 2001
From: Arseniy Zaostrovnykh <necto.ne@gmail.com>
Date: Tue, 17 Jun 2025 13:07:44 +0200
Subject: [PATCH 717/851] [analyzer] Avoid contradicting assumption in tainted
 div-by-0 error node (#144491)

This patch corrects the state of the error node generated by the
core.DivideZero checker when it detects potential division by zero
involving a tainted denominator.

The checker split in

https://github.com/llvm/llvm-project/pull/106389/commits/91ac5ed10a154410c246d985752c1bbfcf23b105
started to introduce a conflicting assumption about the denominator into
the error node:
Node with the Bug Report "Division by a tainted value, possibly zero"
has an assumption "denominator != 0".

This has been done as a shortcut to continue analysis with the correct
assumption *after* the division - if we proceed, we can only assume the
denominator was not zero. However, this assumption is introduced
one-node too soon, leading to a self-contradictory error node.

In this patch, I make the error node with assumption of zero denominator
fatal, but allow analysis to continue on the second half of the state
split with the assumption of non-zero denominator.

---

CPP-6376
---
 .../lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp  |  8 ++++----
 clang/test/Analysis/taint-generic.c                 | 13 +++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
index 15d73fb9ca39a..ab90615f63182 100644
--- a/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/DivZeroChecker.cpp
@@ -69,7 +69,7 @@ void DivZeroChecker::reportTaintBug(
     llvm::ArrayRef<SymbolRef> TaintedSyms) const {
   if (!TaintedDivChecker.isEnabled())
     return;
-  if (ExplodedNode *N = C.generateNonFatalErrorNode(StateZero)) {
+  if (ExplodedNode *N = C.generateErrorNode(StateZero)) {
     auto R =
         std::make_unique<PathSensitiveBugReport>(TaintedDivChecker, Msg, N);
     bugreporter::trackExpressionValue(N, getDenomExpr(N), *R);
@@ -113,9 +113,9 @@ void DivZeroChecker::checkPreStmt(const BinaryOperator *B,
   if ((stateNotZero && stateZero)) {
     std::vector<SymbolRef> taintedSyms = getTaintedSymbols(C.getState(), *DV);
     if (!taintedSyms.empty()) {
-      reportTaintBug("Division by a tainted value, possibly zero", stateNotZero,
-                     C, taintedSyms);
-      return;
+      reportTaintBug("Division by a tainted value, possibly zero", stateZero, C,
+                     taintedSyms);
+      // Fallthrough to continue analysis in case of non-zero denominator.
     }
   }
 
diff --git a/clang/test/Analysis/taint-generic.c b/clang/test/Analysis/taint-generic.c
index 3c520612c5d9b..9d6d2942df4a9 100644
--- a/clang/test/Analysis/taint-generic.c
+++ b/clang/test/Analysis/taint-generic.c
@@ -412,6 +412,19 @@ int testTaintedDivFP(void) {
   return 5/x; // x cannot be 0, so no tainted warning either
 }
 
+void clang_analyzer_warnIfReached();
+
+int testTaintDivZeroNonfatal() {
+  int x;
+  scanf("%d", &x);
+  int y = 5/x; // expected-warning {{Division by a tainted value, possibly zero}}
+  if (x == 0)
+    clang_analyzer_warnIfReached();
+  else
+    clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+  return y;
+}
+
 // Zero-sized VLAs.
 void testTaintedVLASize(void) {
   int x;

From 990d2540bf0545cc4024c3718069f6d0b42c461b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 12:12:46 +0100
Subject: [PATCH 718/851] [X86] isAddSubOrSubAdd - convert to SDPatternMatch
 matching. NFC. (#144486)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2eadcc5416c28..a2e3873fe31ab 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8268,6 +8268,7 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
                              SDValue &Opnd0, SDValue &Opnd1,
                              unsigned &NumExtracts,
                              bool &IsSubAdd) {
+  using namespace SDPatternMatch;
 
   MVT VT = BV->getSimpleValueType(0);
   if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
@@ -8302,14 +8303,8 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
     // Try to match the following pattern:
     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
     // Early exit if we cannot match that sequence.
-    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
-        Op0.getOperand(1) != Op1.getOperand(1))
-      return false;
-
-    unsigned I0 = Op0.getConstantOperandVal(1);
-    if (I0 != i)
+    if (!sd_match(Op0, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))) ||
+        !sd_match(Op1, m_ExtractElt(m_SpecificVT(VT), m_SpecificInt(i))))
       return false;
 
     // We found a valid add/sub node, make sure its the same opcode as previous
@@ -8319,16 +8314,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
     Opc[i % 2] = Opcode;
 
     // Update InVec0 and InVec1.
-    if (InVec0.isUndef()) {
+    if (InVec0.isUndef())
       InVec0 = Op0.getOperand(0);
-      if (InVec0.getSimpleValueType() != VT)
-        return false;
-    }
-    if (InVec1.isUndef()) {
+    if (InVec1.isUndef())
       InVec1 = Op1.getOperand(0);
-      if (InVec1.getSimpleValueType() != VT)
-        return false;
-    }
 
     // Make sure that operands in input to each add/sub node always
     // come from a same pair of vectors.

From 875b36a8742437b95f623bab1e0332562c7b4b3f Mon Sep 17 00:00:00 2001
From: "Oleksandr \"Alex\" Zinenko" <git@ozinenko.com>
Date: Tue, 17 Jun 2025 13:40:57 +0200
Subject: [PATCH 719/851] [mlir] fix MemRefToLLVM lowering of atomic operations
 (#139045)

We have been confusingly, and arguably incorrectly, lowering `m**imumf`
atomic RMW operations in the MemRef dialect to `fm**` atomic RMW
operations in the LLVM dialect, which have different NaN-propagation
semantics: `m**imumf` propagates NaNs from either operand whereas
`fm**`, which lowers to the `fm**num` intrinsic returns the non-NaN
operand. This also contradicts the lowering of `arith.m**imumf` and
`arith.m**numf` operations.

Change the lowering to match the terminology in arith.

Add tests for these lowerings.

Keep a debug message in case of surprising behavior downstream (the code
may be producing more NaNs now).
---
 mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp   | 13 +++++++++++++
 .../Conversion/MemRefToLLVM/memref-to-llvm.mlir     | 10 +++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
index ade4e4d3de8ec..8ccf1bfc292d5 100644
--- a/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
+++ b/mlir/lib/Conversion/MemRefToLLVM/MemRefToLLVM.cpp
@@ -28,6 +28,9 @@
 #include "llvm/Support/MathExtras.h"
 #include <optional>
 
+#define DEBUG_TYPE "memref-to-llvm"
+#define DBGS() llvm::dbgs() << "[" DEBUG_TYPE "] "
+
 namespace mlir {
 #define GEN_PASS_DEF_FINALIZEMEMREFTOLLVMCONVERSIONPASS
 #include "mlir/Conversion/Passes.h.inc"
@@ -1782,12 +1785,22 @@ matchSimpleAtomicOp(memref::AtomicRMWOp atomicOp) {
   case arith::AtomicRMWKind::assign:
     return LLVM::AtomicBinOp::xchg;
   case arith::AtomicRMWKind::maximumf:
+    // TODO: remove this by end of 2025.
+    LLVM_DEBUG(DBGS() << "the lowering of memref.atomicrmw maximumf changed "
+                         "from fmax to fmaximum, expect more NaNs");
+    return LLVM::AtomicBinOp::fmaximum;
+  case arith::AtomicRMWKind::maxnumf:
     return LLVM::AtomicBinOp::fmax;
   case arith::AtomicRMWKind::maxs:
     return LLVM::AtomicBinOp::max;
   case arith::AtomicRMWKind::maxu:
     return LLVM::AtomicBinOp::umax;
   case arith::AtomicRMWKind::minimumf:
+    // TODO: remove this by end of 2025.
+    LLVM_DEBUG(DBGS() << "the lowering of memref.atomicrmw minimum changed "
+                         "from fmin to fminimum, expect more NaNs");
+    return LLVM::AtomicBinOp::fminimum;
+  case arith::AtomicRMWKind::minnumf:
     return LLVM::AtomicBinOp::fmin;
   case arith::AtomicRMWKind::mins:
     return LLVM::AtomicBinOp::min;
diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
index acfc188574255..51d56389dac9e 100644
--- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir
@@ -452,11 +452,19 @@ func.func @atomic_rmw(%I : memref<10xi32>, %ival : i32, %F : memref<10xf32>, %fv
   // CHECK: llvm.atomicrmw umin %{{.*}}, %{{.*}} acq_rel
   memref.atomic_rmw addf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
   // CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} acq_rel
+  memref.atomic_rmw maximumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
+  // CHECK: llvm.atomicrmw fmaximum %{{.*}}, %{{.*}} acq_rel
+  memref.atomic_rmw maxnumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
+  // CHECK: llvm.atomicrmw fmax %{{.*}}, %{{.*}} acq_rel
+  memref.atomic_rmw minimumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
+  // CHECK: llvm.atomicrmw fminimum %{{.*}}, %{{.*}} acq_rel
+  memref.atomic_rmw minnumf %fval, %F[%i] : (f32, memref<10xf32>) -> f32
+  // CHECK: llvm.atomicrmw fmin %{{.*}}, %{{.*}} acq_rel
   memref.atomic_rmw ori %ival, %I[%i] : (i32, memref<10xi32>) -> i32
   // CHECK: llvm.atomicrmw _or %{{.*}}, %{{.*}} acq_rel
   memref.atomic_rmw andi %ival, %I[%i] : (i32, memref<10xi32>) -> i32
   // CHECK: llvm.atomicrmw _and %{{.*}}, %{{.*}} acq_rel
-  // CHECK-INTERFACE-COUNT-9: llvm.atomicrmw
+  // CHECK-INTERFACE-COUNT-13: llvm.atomicrmw
   return
 }
 

From 9700930bd90a099f702332cf86dd898f00840f99 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 13:06:10 +0100
Subject: [PATCH 720/851] [X86] detectZextAbsDiff - convert to SDPatternMatch
 matching. NFC. (#144498)

Match the entire ABS(SUB(ZEXT(vXi8),ZEXT(vXi8))) pattern and simplify the logic in combineBasicSADPattern accordingly
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 37 ++++++++++---------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a2e3873fe31ab..cd02d275d6b57 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46058,22 +46058,18 @@ static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
 // Given a ABS node, detect the following pattern:
 // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
 // This is useful as it is the input into a SAD pattern.
-static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
-  SDValue AbsOp1 = Abs->getOperand(0);
-  if (AbsOp1.getOpcode() != ISD::SUB)
-    return false;
-
-  Op0 = AbsOp1.getOperand(0);
-  Op1 = AbsOp1.getOperand(1);
+static bool detectZextAbsDiff(SDValue Abs, SDValue &Op0, SDValue &Op1) {
+  using namespace SDPatternMatch;
 
   // Check if the operands of the sub are zero-extended from vectors of i8.
-  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
-      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
-      Op1.getOpcode() != ISD::ZERO_EXTEND ||
-      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
-    return false;
-
-  return true;
+  EVT SrcVT0, SrcVT1;
+  return sd_match(
+             Abs,
+             m_UnaryOp(ISD::ABS,
+                       m_Sub(m_AllOf(m_Value(Op0), m_ZExt(m_VT(SrcVT0))),
+                             m_AllOf(m_Value(Op1), m_ZExt(m_VT(SrcVT1)))))) &&
+         SrcVT0.getVectorElementType() == MVT::i8 &&
+         SrcVT1.getVectorElementType() == MVT::i8;
 }
 
 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
@@ -46455,6 +46451,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   // Match shuffle + add pyramid.
   ISD::NodeType BinOp;
   SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
+  if (!Root)
+    return SDValue();
 
   // The operand is expected to be zero extended from i8
   // (verified in detectZextAbsDiff).
@@ -46464,16 +46462,11 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   // Also the sign extend is basically zero extend
   // (extends the sign bit which is zero).
   // So it is correct to skip the sign/zero extend instruction.
-  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
-               Root.getOpcode() == ISD::ZERO_EXTEND ||
-               Root.getOpcode() == ISD::ANY_EXTEND))
+  if (Root.getOpcode() == ISD::SIGN_EXTEND ||
+      Root.getOpcode() == ISD::ZERO_EXTEND ||
+      Root.getOpcode() == ISD::ANY_EXTEND)
     Root = Root.getOperand(0);
 
-  // If there was a match, we want Root to be a select that is the root of an
-  // abs-diff pattern.
-  if (!Root || Root.getOpcode() != ISD::ABS)
-    return SDValue();
-
   // Check whether we have an abs-diff pattern feeding into the select.
   SDValue Zext0, Zext1;
   if (!detectZextAbsDiff(Root, Zext0, Zext1))

From 12611a7fc71376e88aa01e3f0bbc74517f1a1703 Mon Sep 17 00:00:00 2001
From: Denzel-Brian Budii <73462654+chios202@users.noreply.github.com>
Date: Tue, 17 Jun 2025 15:07:20 +0300
Subject: [PATCH 721/851] [mlir] Improve mlir-query by adding matcher
 combinators (#141423)

Whereas backward-slice matching provides support to limit traversal by
specifying the desired depth level, this pull request introduces support
for limiting traversal with a nested matcher (adding forward-slice
also). It also adds support for variadic operators, including `anyOf`
and `allOf`. Rather than simply stopping traversal when an operation
named foo is encountered, one can now define a matcher that specifies
different exit conditions. Variadic support implementation within
mlir-query is very similar to clang-query.
---
 mlir/include/mlir/Query/Matcher/Marshallers.h |  61 +++++++++
 mlir/include/mlir/Query/Matcher/MatchFinder.h |   4 +-
 .../mlir/Query/Matcher/MatchersInternal.h     | 116 +++++++++++++++++-
 .../mlir/Query/Matcher/SliceMatchers.h        | 104 +++++++++++++++-
 .../include/mlir/Query/Matcher/VariantValue.h |  11 +-
 mlir/lib/Query/Matcher/CMakeLists.txt         |   1 +
 mlir/lib/Query/Matcher/MatchersInternal.cpp   |  33 +++++
 mlir/lib/Query/Matcher/RegistryManager.cpp    |   7 +-
 mlir/lib/Query/Matcher/VariantValue.cpp       |  52 ++++++++
 mlir/lib/Query/Query.cpp                      |   5 +
 ...ex-test.mlir => backward-slice-union.mlir} |  13 +-
 .../forward-slice-by-predicate.mlir           |  27 ++++
 .../mlir-query/logical-operator-test.mlir     |  11 ++
 .../mlir-query/slice-function-extraction.mlir |  29 +++++
 mlir/tools/mlir-query/mlir-query.cpp          |  14 ++-
 15 files changed, 471 insertions(+), 17 deletions(-)
 create mode 100644 mlir/lib/Query/Matcher/MatchersInternal.cpp
 rename mlir/test/mlir-query/{complex-test.mlir => backward-slice-union.mlir} (71%)
 create mode 100644 mlir/test/mlir-query/forward-slice-by-predicate.mlir
 create mode 100644 mlir/test/mlir-query/logical-operator-test.mlir
 create mode 100644 mlir/test/mlir-query/slice-function-extraction.mlir

diff --git a/mlir/include/mlir/Query/Matcher/Marshallers.h b/mlir/include/mlir/Query/Matcher/Marshallers.h
index 012bf7b9ec4a9..5fe6965f32efb 100644
--- a/mlir/include/mlir/Query/Matcher/Marshallers.h
+++ b/mlir/include/mlir/Query/Matcher/Marshallers.h
@@ -108,6 +108,9 @@ class MatcherDescriptor {
                                 const llvm::ArrayRef<ParserValue> args,
                                 Diagnostics *error) const = 0;
 
+  // If the matcher is variadic, it can take any number of arguments.
+  virtual bool isVariadic() const = 0;
+
   // Returns the number of arguments accepted by the matcher.
   virtual unsigned getNumArgs() const = 0;
 
@@ -140,6 +143,8 @@ class FixedArgCountMatcherDescriptor : public MatcherDescriptor {
     return marshaller(matcherFunc, matcherName, nameRange, args, error);
   }
 
+  bool isVariadic() const override { return false; }
+
   unsigned getNumArgs() const override { return argKinds.size(); }
 
   void getArgKinds(unsigned argNo, std::vector<ArgKind> &kinds) const override {
@@ -153,6 +158,54 @@ class FixedArgCountMatcherDescriptor : public MatcherDescriptor {
   const std::vector<ArgKind> argKinds;
 };
 
+class VariadicOperatorMatcherDescriptor : public MatcherDescriptor {
+public:
+  using VarOp = DynMatcher::VariadicOperator;
+  VariadicOperatorMatcherDescriptor(unsigned minCount, unsigned maxCount,
+                                    VarOp varOp, StringRef matcherName)
+      : minCount(minCount), maxCount(maxCount), varOp(varOp),
+        matcherName(matcherName) {}
+
+  VariantMatcher create(SourceRange nameRange, ArrayRef<ParserValue> args,
+                        Diagnostics *error) const override {
+    if (args.size() < minCount || maxCount < args.size()) {
+      addError(error, nameRange, ErrorType::RegistryWrongArgCount,
+               {llvm::Twine("requires between "), llvm::Twine(minCount),
+                llvm::Twine(" and "), llvm::Twine(maxCount),
+                llvm::Twine(" args, got "), llvm::Twine(args.size())});
+      return VariantMatcher();
+    }
+
+    std::vector<VariantMatcher> innerArgs;
+    for (int64_t i = 0, e = args.size(); i != e; ++i) {
+      const ParserValue &arg = args[i];
+      const VariantValue &value = arg.value;
+      if (!value.isMatcher()) {
+        addError(error, arg.range, ErrorType::RegistryWrongArgType,
+                 {llvm::Twine(i + 1), llvm::Twine("matcher: "),
+                  llvm::Twine(value.getTypeAsString())});
+        return VariantMatcher();
+      }
+      innerArgs.push_back(value.getMatcher());
+    }
+    return VariantMatcher::VariadicOperatorMatcher(varOp, std::move(innerArgs));
+  }
+
+  bool isVariadic() const override { return true; }
+
+  unsigned getNumArgs() const override { return 0; }
+
+  void getArgKinds(unsigned argNo, std::vector<ArgKind> &kinds) const override {
+    kinds.push_back(ArgKind(ArgKind::Matcher));
+  }
+
+private:
+  const unsigned minCount;
+  const unsigned maxCount;
+  const VarOp varOp;
+  const StringRef matcherName;
+};
+
 // Helper function to check if argument count matches expected count
 inline bool checkArgCount(SourceRange nameRange, size_t expectedArgCount,
                           llvm::ArrayRef<ParserValue> args,
@@ -224,6 +277,14 @@ makeMatcherAutoMarshall(ReturnType (*matcherFunc)(ArgTypes...),
       reinterpret_cast<void (*)()>(matcherFunc), matcherName, argKinds);
 }
 
+// Variadic operator overload.
+template <unsigned MinCount, unsigned MaxCount>
+std::unique_ptr<MatcherDescriptor>
+makeMatcherAutoMarshall(VariadicOperatorMatcherFunc<MinCount, MaxCount> func,
+                        StringRef matcherName) {
+  return std::make_unique<VariadicOperatorMatcherDescriptor>(
+      MinCount, MaxCount, func.varOp, matcherName);
+}
 } // namespace mlir::query::matcher::internal
 
 #endif // MLIR_TOOLS_MLIRQUERY_MATCHER_MARSHALLERS_H
diff --git a/mlir/include/mlir/Query/Matcher/MatchFinder.h b/mlir/include/mlir/Query/Matcher/MatchFinder.h
index f8abf20ef60bb..6d06ca13d1344 100644
--- a/mlir/include/mlir/Query/Matcher/MatchFinder.h
+++ b/mlir/include/mlir/Query/Matcher/MatchFinder.h
@@ -21,7 +21,9 @@
 
 namespace mlir::query::matcher {
 
-/// A class that provides utilities to find operations in the IR.
+/// Finds and collects matches from the IR. After construction
+/// `collectMatches` can be used to traverse the IR and apply
+/// matchers.
 class MatchFinder {
 
 public:
diff --git a/mlir/include/mlir/Query/Matcher/MatchersInternal.h b/mlir/include/mlir/Query/Matcher/MatchersInternal.h
index 183b2514e109f..88109430b6feb 100644
--- a/mlir/include/mlir/Query/Matcher/MatchersInternal.h
+++ b/mlir/include/mlir/Query/Matcher/MatchersInternal.h
@@ -8,11 +8,11 @@
 //
 // Implements the base layer of the matcher framework.
 //
-// Matchers are methods that return a Matcher which provides a method one of the
-// following methods: match(Operation *op), match(Operation *op,
-// SetVector<Operation *> &matchedOps)
+// Matchers are methods that return a Matcher which provides a
+// `match(...)` method whose parameters define the context of the match.
+// Support includes simple (unary) matchers as well as matcher combinators
+// (anyOf, allOf, etc.)
 //
-// The matcher functions are defined in include/mlir/IR/Matchers.h.
 // This file contains the wrapper classes needed to construct matchers for
 // mlir-query.
 //
@@ -25,6 +25,15 @@
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 
 namespace mlir::query::matcher {
+class DynMatcher;
+namespace internal {
+
+bool allOfVariadicOperator(Operation *op, SetVector<Operation *> *matchedOps,
+                           ArrayRef<DynMatcher> innerMatchers);
+bool anyOfVariadicOperator(Operation *op, SetVector<Operation *> *matchedOps,
+                           ArrayRef<DynMatcher> innerMatchers);
+
+} // namespace internal
 
 // Defaults to false if T has no match() method with the signature:
 // match(Operation* op).
@@ -84,6 +93,27 @@ class MatcherFnImpl : public MatcherInterface {
   MatcherFn matcherFn;
 };
 
+// VariadicMatcher takes a vector of Matchers and returns true if any Matchers
+// match the given operation.
+using VariadicOperatorFunction = bool (*)(Operation *op,
+                                          SetVector<Operation *> *matchedOps,
+                                          ArrayRef<DynMatcher> innerMatchers);
+
+template <VariadicOperatorFunction Func>
+class VariadicMatcher : public MatcherInterface {
+public:
+  VariadicMatcher(std::vector<DynMatcher> matchers)
+      : matchers(std::move(matchers)) {}
+
+  bool match(Operation *op) override { return Func(op, nullptr, matchers); }
+  bool match(Operation *op, SetVector<Operation *> &matchedOps) override {
+    return Func(op, &matchedOps, matchers);
+  }
+
+private:
+  std::vector<DynMatcher> matchers;
+};
+
 // Matcher wraps a MatcherInterface implementation and provides match()
 // methods that redirect calls to the underlying implementation.
 class DynMatcher {
@@ -92,6 +122,31 @@ class DynMatcher {
   DynMatcher(MatcherInterface *implementation)
       : implementation(implementation) {}
 
+  // Construct from a variadic function.
+  enum VariadicOperator {
+    // Matches operations for which all provided matchers match.
+    AllOf,
+    // Matches operations for which at least one of the provided matchers
+    // matches.
+    AnyOf
+  };
+
+  static std::unique_ptr<DynMatcher>
+  constructVariadic(VariadicOperator Op,
+                    std::vector<DynMatcher> innerMatchers) {
+    switch (Op) {
+    case AllOf:
+      return std::make_unique<DynMatcher>(
+          new VariadicMatcher<internal::allOfVariadicOperator>(
+              std::move(innerMatchers)));
+    case AnyOf:
+      return std::make_unique<DynMatcher>(
+          new VariadicMatcher<internal::anyOfVariadicOperator>(
+              std::move(innerMatchers)));
+    }
+    llvm_unreachable("Invalid Op value.");
+  }
+
   template <typename MatcherFn>
   static std::unique_ptr<DynMatcher>
   constructDynMatcherFromMatcherFn(MatcherFn &matcherFn) {
@@ -113,6 +168,59 @@ class DynMatcher {
   std::string functionName;
 };
 
+// VariadicOperatorMatcher related types.
+template <typename... Ps>
+class VariadicOperatorMatcher {
+public:
+  VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp, Ps &&...params)
+      : varOp(varOp), params(std::forward<Ps>(params)...) {}
+
+  operator std::unique_ptr<DynMatcher>() const & {
+    return DynMatcher::constructVariadic(
+        varOp, getMatchers(std::index_sequence_for<Ps...>()));
+  }
+
+  operator std::unique_ptr<DynMatcher>() && {
+    return DynMatcher::constructVariadic(
+        varOp, std::move(*this).getMatchers(std::index_sequence_for<Ps...>()));
+  }
+
+private:
+  // Helper method to unpack the tuple into a vector.
+  template <std::size_t... Is>
+  std::vector<DynMatcher> getMatchers(std::index_sequence<Is...>) const & {
+    return {DynMatcher(std::get<Is>(params))...};
+  }
+
+  template <std::size_t... Is>
+  std::vector<DynMatcher> getMatchers(std::index_sequence<Is...>) && {
+    return {DynMatcher(std::get<Is>(std::move(params)))...};
+  }
+
+  const DynMatcher::VariadicOperator varOp;
+  std::tuple<Ps...> params;
+};
+
+// Overloaded function object to generate VariadicOperatorMatcher objects from
+// arbitrary matchers.
+template <unsigned MinCount, unsigned MaxCount>
+struct VariadicOperatorMatcherFunc {
+  DynMatcher::VariadicOperator varOp;
+
+  template <typename... Ms>
+  VariadicOperatorMatcher<Ms...> operator()(Ms &&...Ps) const {
+    static_assert(MinCount <= sizeof...(Ms) && sizeof...(Ms) <= MaxCount,
+                  "invalid number of parameters for variadic matcher");
+    return VariadicOperatorMatcher<Ms...>(varOp, std::forward<Ms>(Ps)...);
+  }
+};
+
+namespace internal {
+const VariadicOperatorMatcherFunc<1, std::numeric_limits<unsigned>::max()>
+    anyOf = {DynMatcher::AnyOf};
+const VariadicOperatorMatcherFunc<1, std::numeric_limits<unsigned>::max()>
+    allOf = {DynMatcher::AllOf};
+} // namespace internal
 } // namespace mlir::query::matcher
 
 #endif // MLIR_TOOLS_MLIRQUERY_MATCHER_MATCHERSINTERNAL_H
diff --git a/mlir/include/mlir/Query/Matcher/SliceMatchers.h b/mlir/include/mlir/Query/Matcher/SliceMatchers.h
index 441205b3a9615..7181648f06f89 100644
--- a/mlir/include/mlir/Query/Matcher/SliceMatchers.h
+++ b/mlir/include/mlir/Query/Matcher/SliceMatchers.h
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file provides matchers for MLIRQuery that peform slicing analysis
+// This file defines slicing-analysis matchers that extend and abstract the
+// core implementations from `SliceAnalysis.h`.
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,9 +17,9 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/IR/Operation.h"
 
-/// A matcher encapsulating `getBackwardSlice` method from SliceAnalysis.h.
-/// Additionally, it limits the slice computation to a certain depth level using
-/// a custom filter.
+/// Computes the backward-slice of all transitive defs reachable from `rootOp`,
+/// if `innerMatcher` matches. The traversal stops once the desired depth level
+/// is reached.
 ///
 /// Example: starting from node 9, assuming the matcher
 /// computes the slice for the first two depth levels:
@@ -119,6 +120,77 @@ bool BackwardSliceMatcher<Matcher>::matches(
                            : backwardSlice.size() >= 1;
 }
 
+/// Computes the backward-slice of all transitive defs reachable from `rootOp`,
+/// if `innerMatcher` matches. Traversal stops where `filterMatcher` matches.
+template <typename BaseMatcher, typename Filter>
+class PredicateBackwardSliceMatcher {
+public:
+  PredicateBackwardSliceMatcher(BaseMatcher innerMatcher, Filter filterMatcher,
+                                bool inclusive, bool omitBlockArguments,
+                                bool omitUsesFromAbove)
+      : innerMatcher(std::move(innerMatcher)),
+        filterMatcher(std::move(filterMatcher)), inclusive(inclusive),
+        omitBlockArguments(omitBlockArguments),
+        omitUsesFromAbove(omitUsesFromAbove) {}
+
+  bool match(Operation *rootOp, SetVector<Operation *> &backwardSlice) {
+    backwardSlice.clear();
+    BackwardSliceOptions options;
+    options.inclusive = inclusive;
+    options.omitUsesFromAbove = omitUsesFromAbove;
+    options.omitBlockArguments = omitBlockArguments;
+    if (innerMatcher.match(rootOp)) {
+      options.filter = [&](Operation *subOp) {
+        return !filterMatcher.match(subOp);
+      };
+      LogicalResult result = getBackwardSlice(rootOp, &backwardSlice, options);
+      assert(result.succeeded() && "expected backward slice to succeed");
+      (void)result;
+      return options.inclusive ? backwardSlice.size() > 1
+                               : backwardSlice.size() >= 1;
+    }
+    return false;
+  }
+
+private:
+  BaseMatcher innerMatcher;
+  Filter filterMatcher;
+  bool inclusive;
+  bool omitBlockArguments;
+  bool omitUsesFromAbove;
+};
+
+/// Computes the forward-slice of all users reachable from `rootOp`,
+/// if `innerMatcher` matches. Traversal stops where `filterMatcher` matches.
+template <typename BaseMatcher, typename Filter>
+class PredicateForwardSliceMatcher {
+public:
+  PredicateForwardSliceMatcher(BaseMatcher innerMatcher, Filter filterMatcher,
+                               bool inclusive)
+      : innerMatcher(std::move(innerMatcher)),
+        filterMatcher(std::move(filterMatcher)), inclusive(inclusive) {}
+
+  bool match(Operation *rootOp, SetVector<Operation *> &forwardSlice) {
+    forwardSlice.clear();
+    ForwardSliceOptions options;
+    options.inclusive = inclusive;
+    if (innerMatcher.match(rootOp)) {
+      options.filter = [&](Operation *subOp) {
+        return !filterMatcher.match(subOp);
+      };
+      getForwardSlice(rootOp, &forwardSlice, options);
+      return options.inclusive ? forwardSlice.size() > 1
+                               : forwardSlice.size() >= 1;
+    }
+    return false;
+  }
+
+private:
+  BaseMatcher innerMatcher;
+  Filter filterMatcher;
+  bool inclusive;
+};
+
 /// Matches transitive defs of a top-level operation up to N levels.
 template <typename Matcher>
 inline BackwardSliceMatcher<Matcher>
@@ -130,7 +202,7 @@ m_GetDefinitions(Matcher innerMatcher, int64_t maxDepth, bool inclusive,
                                        omitUsesFromAbove);
 }
 
-/// Matches all transitive defs of a top-level operation up to N levels
+/// Matches all transitive defs of a top-level operation up to N levels.
 template <typename Matcher>
 inline BackwardSliceMatcher<Matcher> m_GetAllDefinitions(Matcher innerMatcher,
                                                          int64_t maxDepth) {
@@ -139,6 +211,28 @@ inline BackwardSliceMatcher<Matcher> m_GetAllDefinitions(Matcher innerMatcher,
                                        false, false);
 }
 
+/// Matches all transitive defs of a top-level operation and stops where
+/// `filterMatcher` rejects.
+template <typename BaseMatcher, typename Filter>
+inline PredicateBackwardSliceMatcher<BaseMatcher, Filter>
+m_GetDefinitionsByPredicate(BaseMatcher innerMatcher, Filter filterMatcher,
+                            bool inclusive, bool omitBlockArguments,
+                            bool omitUsesFromAbove) {
+  return PredicateBackwardSliceMatcher<BaseMatcher, Filter>(
+      std::move(innerMatcher), std::move(filterMatcher), inclusive,
+      omitBlockArguments, omitUsesFromAbove);
+}
+
+/// Matches all users of a top-level operation and stops where
+/// `filterMatcher` rejects.
+template <typename BaseMatcher, typename Filter>
+inline PredicateForwardSliceMatcher<BaseMatcher, Filter>
+m_GetUsersByPredicate(BaseMatcher innerMatcher, Filter filterMatcher,
+                      bool inclusive) {
+  return PredicateForwardSliceMatcher<BaseMatcher, Filter>(
+      std::move(innerMatcher), std::move(filterMatcher), inclusive);
+}
+
 } // namespace mlir::query::matcher
 
 #endif // MLIR_TOOLS_MLIRQUERY_MATCHERS_SLICEMATCHERS_H
diff --git a/mlir/include/mlir/Query/Matcher/VariantValue.h b/mlir/include/mlir/Query/Matcher/VariantValue.h
index 98c0a18e25101..1a47576de1841 100644
--- a/mlir/include/mlir/Query/Matcher/VariantValue.h
+++ b/mlir/include/mlir/Query/Matcher/VariantValue.h
@@ -26,7 +26,12 @@ enum class ArgKind { Boolean, Matcher, Signed, String };
 // A variant matcher object to abstract simple and complex matchers into a
 // single object type.
 class VariantMatcher {
-  class MatcherOps;
+  class MatcherOps {
+  public:
+    std::optional<DynMatcher>
+    constructVariadicOperator(DynMatcher::VariadicOperator varOp,
+                              ArrayRef<VariantMatcher> innerMatchers) const;
+  };
 
   // Payload interface to be specialized by each matcher type. It follows a
   // similar interface as VariantMatcher itself.
@@ -43,6 +48,9 @@ class VariantMatcher {
 
   // Clones the provided matcher.
   static VariantMatcher SingleMatcher(DynMatcher matcher);
+  static VariantMatcher
+  VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp,
+                          ArrayRef<VariantMatcher> args);
 
   // Makes the matcher the "null" matcher.
   void reset();
@@ -61,6 +69,7 @@ class VariantMatcher {
       : value(std::move(value)) {}
 
   class SinglePayload;
+  class VariadicOpPayload;
 
   std::shared_ptr<const Payload> value;
 };
diff --git a/mlir/lib/Query/Matcher/CMakeLists.txt b/mlir/lib/Query/Matcher/CMakeLists.txt
index 629479bf7adc1..ba202762fdfbb 100644
--- a/mlir/lib/Query/Matcher/CMakeLists.txt
+++ b/mlir/lib/Query/Matcher/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_library(MLIRQueryMatcher
   MatchFinder.cpp
+  MatchersInternal.cpp
   Parser.cpp
   RegistryManager.cpp
   VariantValue.cpp
diff --git a/mlir/lib/Query/Matcher/MatchersInternal.cpp b/mlir/lib/Query/Matcher/MatchersInternal.cpp
new file mode 100644
index 0000000000000..01f412ade846b
--- /dev/null
+++ b/mlir/lib/Query/Matcher/MatchersInternal.cpp
@@ -0,0 +1,33 @@
+//===--- MatchersInternal.cpp----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Query/Matcher/MatchersInternal.h"
+#include "llvm/ADT/SetVector.h"
+
+namespace mlir::query::matcher {
+
+namespace internal {
+
+bool allOfVariadicOperator(Operation *op, SetVector<Operation *> *matchedOps,
+                           ArrayRef<DynMatcher> innerMatchers) {
+  return llvm::all_of(innerMatchers, [&](const DynMatcher &matcher) {
+    if (matchedOps)
+      return matcher.match(op, *matchedOps);
+    return matcher.match(op);
+  });
+}
+bool anyOfVariadicOperator(Operation *op, SetVector<Operation *> *matchedOps,
+                           ArrayRef<DynMatcher> innerMatchers) {
+  return llvm::any_of(innerMatchers, [&](const DynMatcher &matcher) {
+    if (matchedOps)
+      return matcher.match(op, *matchedOps);
+    return matcher.match(op);
+  });
+}
+} // namespace internal
+} // namespace mlir::query::matcher
diff --git a/mlir/lib/Query/Matcher/RegistryManager.cpp b/mlir/lib/Query/Matcher/RegistryManager.cpp
index 4b511c5f009e7..08b610453b11a 100644
--- a/mlir/lib/Query/Matcher/RegistryManager.cpp
+++ b/mlir/lib/Query/Matcher/RegistryManager.cpp
@@ -64,7 +64,7 @@ std::vector<ArgKind> RegistryManager::getAcceptedCompletionTypes(
     unsigned argNumber = ctxEntry.second;
     std::vector<ArgKind> nextTypeSet;
 
-    if (argNumber < ctor->getNumArgs())
+    if (ctor->isVariadic() || argNumber < ctor->getNumArgs())
       ctor->getArgKinds(argNumber, nextTypeSet);
 
     typeSet.insert(nextTypeSet.begin(), nextTypeSet.end());
@@ -83,7 +83,7 @@ RegistryManager::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes,
     const internal::MatcherDescriptor &matcher = *m.getValue();
     llvm::StringRef name = m.getKey();
 
-    unsigned numArgs = matcher.getNumArgs();
+    unsigned numArgs = matcher.isVariadic() ? 1 : matcher.getNumArgs();
     std::vector<std::vector<ArgKind>> argKinds(numArgs);
 
     for (const ArgKind &kind : acceptedTypes) {
@@ -115,6 +115,9 @@ RegistryManager::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes,
       }
     }
 
+    if (matcher.isVariadic())
+      os << ",...";
+
     os << ")";
     typedText += "(";
 
diff --git a/mlir/lib/Query/Matcher/VariantValue.cpp b/mlir/lib/Query/Matcher/VariantValue.cpp
index 1cb2d48f9d56f..7bf4774dba830 100644
--- a/mlir/lib/Query/Matcher/VariantValue.cpp
+++ b/mlir/lib/Query/Matcher/VariantValue.cpp
@@ -27,12 +27,64 @@ class VariantMatcher::SinglePayload : public VariantMatcher::Payload {
   DynMatcher matcher;
 };
 
+class VariantMatcher::VariadicOpPayload : public VariantMatcher::Payload {
+public:
+  VariadicOpPayload(DynMatcher::VariadicOperator varOp,
+                    std::vector<VariantMatcher> args)
+      : varOp(varOp), args(std::move(args)) {}
+
+  std::optional<DynMatcher> getDynMatcher() const override {
+    std::vector<DynMatcher> dynMatchers;
+    for (auto variantMatcher : args) {
+      std::optional<DynMatcher> dynMatcher = variantMatcher.getDynMatcher();
+      if (dynMatcher)
+        dynMatchers.push_back(dynMatcher.value());
+    }
+    auto result = DynMatcher::constructVariadic(varOp, dynMatchers);
+    return *result;
+  }
+
+  std::string getTypeAsString() const override {
+    std::string inner;
+    llvm::interleave(
+        args, [&](auto const &arg) { inner += arg.getTypeAsString(); },
+        [&] { inner += " & "; });
+    return inner;
+  }
+
+private:
+  const DynMatcher::VariadicOperator varOp;
+  const std::vector<VariantMatcher> args;
+};
+
 VariantMatcher::VariantMatcher() = default;
 
 VariantMatcher VariantMatcher::SingleMatcher(DynMatcher matcher) {
   return VariantMatcher(std::make_shared<SinglePayload>(std::move(matcher)));
 }
 
+VariantMatcher
+VariantMatcher::VariadicOperatorMatcher(DynMatcher::VariadicOperator varOp,
+                                        ArrayRef<VariantMatcher> args) {
+  return VariantMatcher(
+      std::make_shared<VariadicOpPayload>(varOp, std::move(args)));
+}
+
+std::optional<DynMatcher> VariantMatcher::MatcherOps::constructVariadicOperator(
+    DynMatcher::VariadicOperator varOp,
+    ArrayRef<VariantMatcher> innerMatchers) const {
+  std::vector<DynMatcher> dynMatchers;
+  for (const auto &innerMatcher : innerMatchers) {
+    if (!innerMatcher.value)
+      return std::nullopt;
+    std::optional<DynMatcher> inner = innerMatcher.value->getDynMatcher();
+    if (!inner)
+      return std::nullopt;
+    dynMatchers.push_back(*inner);
+  }
+  return *DynMatcher::constructVariadic(varOp, dynMatchers);
+}
+
 std::optional<DynMatcher> VariantMatcher::getDynMatcher() const {
   return value ? value->getDynMatcher() : std::nullopt;
 }
diff --git a/mlir/lib/Query/Query.cpp b/mlir/lib/Query/Query.cpp
index 803284d6df86a..637e1f3cdef87 100644
--- a/mlir/lib/Query/Query.cpp
+++ b/mlir/lib/Query/Query.cpp
@@ -10,6 +10,7 @@
 #include "QueryParser.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Verifier.h"
 #include "mlir/Query/Matcher/MatchFinder.h"
 #include "mlir/Query/QuerySession.h"
 #include "llvm/ADT/SetVector.h"
@@ -68,6 +69,8 @@ static Operation *extractFunction(std::vector<Operation *> &ops,
   // Clone operations and build function body
   std::vector<Operation *> clonedOps;
   std::vector<Value> clonedVals;
+  // TODO: Handle extraction of operations with compute payloads defined via
+  // regions.
   for (Operation *slicedOp : slice) {
     Operation *clonedOp =
         clonedOps.emplace_back(builder.clone(*slicedOp, mapper));
@@ -129,6 +132,8 @@ LogicalResult MatchQuery::run(llvm::raw_ostream &os, QuerySession &qs) const {
         finder.flattenMatchedOps(matches);
     Operation *function =
         extractFunction(flattenedMatches, rootOp->getContext(), functionName);
+    if (failed(verify(function)))
+      return mlir::failure();
     os << "\n" << *function << "\n\n";
     function->erase();
     return mlir::success();
diff --git a/mlir/test/mlir-query/complex-test.mlir b/mlir/test/mlir-query/backward-slice-union.mlir
similarity index 71%
rename from mlir/test/mlir-query/complex-test.mlir
rename to mlir/test/mlir-query/backward-slice-union.mlir
index ad96f03747a43..f8f88c2043749 100644
--- a/mlir/test/mlir-query/complex-test.mlir
+++ b/mlir/test/mlir-query/backward-slice-union.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-query %s -c "m getAllDefinitions(hasOpName(\"arith.addf\"),2)" | FileCheck %s
+// RUN: mlir-query %s -c "m anyOf(getAllDefinitions(hasOpName(\"arith.addf\"),2),getAllDefinitions(hasOpName(\"tensor.extract\"),1))" | FileCheck %s
 
 #map = affine_map<(d0, d1) -> (d0, d1)>
 func.func @slice_use_from_above(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>) {
@@ -19,14 +19,23 @@ func.func @slice_use_from_above(%arg0: tensor<5x5xf32>, %arg1: tensor<5x5xf32>)
 }
 
 // CHECK: Match #1:
-
 // CHECK: %[[LINALG:.*]] = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} 
 // CHECK-SAME: ins(%arg0 : tensor<5x5xf32>) outs(%arg1 : tensor<5x5xf32>)
+
+// CHECK: {{.*}}.mlir:7:10: note: "root" binds here
 // CHECK: %[[ADDF1:.*]] = arith.addf %in, %in : f32
 
 // CHECK: Match #2:
+// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[LINALG]] {{\[\[.*\]\]}} : tensor<5x5xf32> into tensor<25xf32>
+// CHECK: %[[C2:.*]] = arith.constant {{.*}} : index
 
+// CHECK: {{.*}}.mlir:14:18: note: "root" binds here
+// CHECK: %[[EXTRACTED:.*]] = tensor.extract %[[COLLAPSED]][%[[C2]]] : tensor<25xf32>
+
+// CHECK: Match #3:
 // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[LINALG]] {{\[\[.*\]\]}} : tensor<5x5xf32> into tensor<25xf32>
 // CHECK: %[[C2:.*]] = arith.constant {{.*}} : index
 // CHECK: %[[EXTRACTED:.*]] = tensor.extract %[[COLLAPSED]][%[[C2]]] : tensor<25xf32>
+
+// CHECK: {{.*}}.mlir:15:10: note: "root" binds here
 // CHECK: %[[ADDF2:.*]] = arith.addf %[[EXTRACTED]], %[[EXTRACTED]] : f32  
diff --git a/mlir/test/mlir-query/forward-slice-by-predicate.mlir b/mlir/test/mlir-query/forward-slice-by-predicate.mlir
new file mode 100644
index 0000000000000..e11378da89d9f
--- /dev/null
+++ b/mlir/test/mlir-query/forward-slice-by-predicate.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-query %s -c "m getUsersByPredicate(anyOf(hasOpName(\"memref.alloc\"),isConstantOp()),anyOf(hasOpName(\"affine.load\"), hasOpName(\"memref.dealloc\")),true)" | FileCheck %s
+
+func.func @slice_depth1_loop_nest_with_offsets() {
+  %0 = memref.alloc() : memref<100xf32>
+  %cst = arith.constant 7.000000e+00 : f32
+  affine.for %i0 = 0 to 16 {
+    %a0 = affine.apply affine_map<(d0) -> (d0 + 2)>(%i0)
+    affine.store %cst, %0[%a0] : memref<100xf32>
+  }
+  affine.for %i1 = 4 to 8 {
+    %a1 = affine.apply affine_map<(d0) -> (d0 - 1)>(%i1)
+    %1 = affine.load %0[%a1] : memref<100xf32>
+  }
+  return
+}
+
+// CHECK: Match #1:
+// CHECK: {{.*}}.mlir:4:8: note: "root" binds here
+// CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<100xf32>
+
+// CHECK: affine.store %cst, %0[%a0] : memref<100xf32>
+
+// CHECK: Match #2:
+// CHECK: {{.*}}.mlir:5:10: note: "root" binds here
+// CHECK: %[[CST:.*]] = arith.constant 7.000000e+00 : f32
+
+// CHECK: affine.store %[[CST]], %0[%a0] : memref<100xf32>
diff --git a/mlir/test/mlir-query/logical-operator-test.mlir b/mlir/test/mlir-query/logical-operator-test.mlir
new file mode 100644
index 0000000000000..ac05428287abd
--- /dev/null
+++ b/mlir/test/mlir-query/logical-operator-test.mlir
@@ -0,0 +1,11 @@
+// RUN: mlir-query %s -c "m allOf(hasOpName(\"memref.alloca\"), hasOpAttrName(\"alignment\"))" | FileCheck %s
+
+func.func @dynamic_alloca(%arg0: index, %arg1: index) -> memref<?x?xf32> {
+  %0 = memref.alloca(%arg0, %arg1) : memref<?x?xf32>
+  memref.alloca(%arg0, %arg1) {alignment = 32} : memref<?x?xf32>
+  return %0 : memref<?x?xf32>
+}
+
+// CHECK: Match #1:
+// CHECK: {{.*}}.mlir:5:3: note: "root" binds here
+// CHECK: memref.alloca(%arg0, %arg1) {alignment = 32} : memref<?x?xf32>
diff --git a/mlir/test/mlir-query/slice-function-extraction.mlir b/mlir/test/mlir-query/slice-function-extraction.mlir
new file mode 100644
index 0000000000000..e55d5e77c5736
--- /dev/null
+++ b/mlir/test/mlir-query/slice-function-extraction.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-query %s -c "m getDefinitionsByPredicate(hasOpName(\"memref.store\"),hasOpName(\"memref.alloc\"),true,false,false).extract(\"backward_slice\")" | FileCheck %s
+
+// CHECK:       func.func @backward_slice(%{{.*}}: memref<10xf32>) -> (f32, index, index, f32, index, index, f32) {
+// CHECK:         %[[CST0:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-NEXT:    %[[C0:.*]] = arith.constant 0 : index
+// CHECK-NEXT:    %[[I0:.*]] = affine.apply affine_map<()[s0] -> (s0)>()[%[[C0]]]
+// CHECK-NEXT:    memref.store %[[CST0]], %{{.*}}[%[[I0]]] : memref<10xf32>
+// CHECK-NEXT:    %[[CST2:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK-NEXT:    %[[I1:.*]] = affine.apply affine_map<() -> (0)>()
+// CHECK-NEXT:    memref.store %[[CST2]], %{{.*}}[%[[I1]]] : memref<10xf32>
+// CHECK-NEXT:    %[[C1:.*]] = arith.constant 0 : index
+// CHECK-NEXT:    %[[LOAD:.*]] = memref.load %{{.*}}[%[[C1]]] : memref<10xf32>
+// CHECK-NEXT:    memref.store %[[LOAD]], %{{.*}}[%[[C1]]] : memref<10xf32>
+// CHECK-NEXT:    return %[[CST0]], %[[C0]], %[[I0]], %[[CST2]], %[[I1]], %[[C1]], %[[LOAD]] : f32, index, index, f32, index, index, f32
+
+func.func @slicing_memref_store_trivial() {
+  %0 = memref.alloc() : memref<10xf32>
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  affine.for %i1 = 0 to 10 {
+    %1 = affine.apply affine_map<()[s0] -> (s0)>()[%c0]
+    memref.store %cst, %0[%1] : memref<10xf32>
+    %2 = memref.load %0[%c0] : memref<10xf32>
+    %3 = affine.apply affine_map<()[] -> (0)>()[]
+    memref.store %cst, %0[%3] : memref<10xf32>
+    memref.store %2, %0[%c0] : memref<10xf32>
+  }
+  return
+}
diff --git a/mlir/tools/mlir-query/mlir-query.cpp b/mlir/tools/mlir-query/mlir-query.cpp
index 78c0ec97c0cdf..8a17a33c61838 100644
--- a/mlir/tools/mlir-query/mlir-query.cpp
+++ b/mlir/tools/mlir-query/mlir-query.cpp
@@ -40,12 +40,22 @@ int main(int argc, char **argv) {
   query::matcher::Registry matcherRegistry;
 
   // Matchers registered in alphabetical order for consistency:
+  matcherRegistry.registerMatcher("allOf", query::matcher::internal::allOf);
+  matcherRegistry.registerMatcher("anyOf", query::matcher::internal::anyOf);
+  matcherRegistry.registerMatcher(
+      "getAllDefinitions",
+      query::matcher::m_GetAllDefinitions<query::matcher::DynMatcher>);
   matcherRegistry.registerMatcher(
       "getDefinitions",
       query::matcher::m_GetDefinitions<query::matcher::DynMatcher>);
   matcherRegistry.registerMatcher(
-      "getAllDefinitions",
-      query::matcher::m_GetAllDefinitions<query::matcher::DynMatcher>);
+      "getDefinitionsByPredicate",
+      query::matcher::m_GetDefinitionsByPredicate<query::matcher::DynMatcher,
+                                                  query::matcher::DynMatcher>);
+  matcherRegistry.registerMatcher(
+      "getUsersByPredicate",
+      query::matcher::m_GetUsersByPredicate<query::matcher::DynMatcher,
+                                            query::matcher::DynMatcher>);
   matcherRegistry.registerMatcher("hasOpAttrName",
                                   static_cast<HasOpAttrName *>(m_Attr));
   matcherRegistry.registerMatcher("hasOpName", static_cast<HasOpName *>(m_Op));

From 087d83e0c6d94c1ad6a68b089950d05185d0e043 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= <gaetan.bossu@arm.com>
Date: Tue, 17 Jun 2025 13:20:52 +0100
Subject: [PATCH 722/851] [SLP] vectorizeStores: Name things a bit more clearly
 (NFC) (#144511)

I believe the new variable names better convey their purpose. However, I
also believe that function is more complex than it needs to be, and this
tiny patch should be seen as a first step towards (maybe) further
refactoring.

The previous names were very generic (Size, Sz, Cnt, StartIdx). This
made it easy to get confused given that the vecotrizeStores() function
is already complex enough.

My hope would be to eventually have a function concise enough to clearly
see what are the different strategies being attempted to vectorise a
group of related store instructions.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 113 ++++++++++--------
 1 file changed, 63 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c3ca22dce0cc4..9a7e9b75da517 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21191,25 +21191,30 @@ bool SLPVectorizerPass::vectorizeStores(
         ++Repeat;
         bool RepeatChanged = false;
         bool AnyProfitableGraph = false;
-        for (unsigned Size : CandidateVFs) {
+        for (unsigned VF : CandidateVFs) {
           AnyProfitableGraph = false;
-          unsigned StartIdx = std::distance(
-              RangeSizes.begin(),
-              find_if(RangeSizes,
-                      std::bind(IsNotVectorized, Size >= MaxRegVF, _1)));
-          while (StartIdx < End) {
-            unsigned EndIdx = std::distance(
+          unsigned FirstUnvecStore =
+              std::distance(RangeSizes.begin(),
+                            find_if(RangeSizes, std::bind(IsNotVectorized,
+                                                          VF >= MaxRegVF, _1)));
+
+          // Form slices of size VF starting from FirstUnvecStore and try to
+          // vectorize them.
+          while (FirstUnvecStore < End) {
+            unsigned FirstVecStore = std::distance(
                 RangeSizes.begin(),
-                find_if(RangeSizes.drop_front(StartIdx),
-                        std::bind(IsVectorized, Size >= MaxRegVF, _1)));
-            unsigned Sz = EndIdx >= End ? End : EndIdx;
-            for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
-              if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
-                                  Size >= MaxRegVF)) {
-                ++Cnt;
+                find_if(RangeSizes.drop_front(FirstUnvecStore),
+                        std::bind(IsVectorized, VF >= MaxRegVF, _1)));
+            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
+            for (unsigned SliceStartIdx = FirstUnvecStore;
+                 SliceStartIdx + VF <= MaxSliceEnd;) {
+              if (!checkTreeSizes(RangeSizes.slice(SliceStartIdx, VF),
+                                  VF >= MaxRegVF)) {
+                ++SliceStartIdx;
                 continue;
               }
-              ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
+              ArrayRef<Value *> Slice =
+                  ArrayRef(Operands).slice(SliceStartIdx, VF);
               assert(all_of(Slice,
                             [&](Value *V) {
                               return cast<StoreInst>(V)
@@ -21223,19 +21228,23 @@ bool SLPVectorizerPass::vectorizeStores(
               if (!NonSchedulable.empty()) {
                 auto [NonSchedSizeMax, NonSchedSizeMin] =
                     NonSchedulable.lookup(Slice.front());
-                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
-                  Cnt += NonSchedSizeMax;
+                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
+                  // VF is too ambitious. Try to vectorize another slice before
+                  // trying a smaller VF.
+                  SliceStartIdx += NonSchedSizeMax;
                   continue;
                 }
               }
               unsigned TreeSize;
               std::optional<bool> Res =
-                  vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
+                  vectorizeStoreChain(Slice, R, SliceStartIdx, MinVF, TreeSize);
               if (!Res) {
+                // Update the range of non schedulable VFs for slices starting
+                // at SliceStartIdx.
                 NonSchedulable
-                    .try_emplace(Slice.front(), std::make_pair(Size, Size))
+                    .try_emplace(Slice.front(), std::make_pair(VF, VF))
                     .first->getSecond()
-                    .second = Size;
+                    .second = VF;
               } else if (*Res) {
                 // Mark the vectorized stores so that we don't vectorize them
                 // again.
@@ -21246,63 +21255,67 @@ bool SLPVectorizerPass::vectorizeStores(
                 // If we vectorized initial block, no need to try to vectorize
                 // it again.
                 for (std::pair<unsigned, unsigned> &P :
-                     RangeSizes.slice(Cnt, Size))
+                     RangeSizes.slice(SliceStartIdx, VF))
                   P.first = P.second = 0;
-                if (Cnt < StartIdx + MinVF) {
-                  for (std::pair<unsigned, unsigned> &P :
-                       RangeSizes.slice(StartIdx, Cnt - StartIdx))
+                if (SliceStartIdx < FirstUnvecStore + MinVF) {
+                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
+                           FirstUnvecStore, SliceStartIdx - FirstUnvecStore))
                     P.first = P.second = 0;
-                  StartIdx = Cnt + Size;
+                  FirstUnvecStore = SliceStartIdx + VF;
                 }
-                if (Cnt > Sz - Size - MinVF) {
+                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                   for (std::pair<unsigned, unsigned> &P :
-                       RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)))
+                       RangeSizes.slice(SliceStartIdx + VF,
+                                        MaxSliceEnd - (SliceStartIdx + VF)))
                     P.first = P.second = 0;
-                  if (Sz == End)
-                    End = Cnt;
-                  Sz = Cnt;
+                  if (MaxSliceEnd == End)
+                    End = SliceStartIdx;
+                  MaxSliceEnd = SliceStartIdx;
                 }
-                Cnt += Size;
+                SliceStartIdx += VF;
                 continue;
               }
-              if (Size > 2 && Res &&
-                  !all_of(RangeSizes.slice(Cnt, Size),
-                          std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
+              if (VF > 2 && Res &&
+                  !all_of(RangeSizes.slice(SliceStartIdx, VF),
+                          std::bind(VFIsProfitable, VF >= MaxRegVF, TreeSize,
                                     _1))) {
-                Cnt += Size;
+                SliceStartIdx += VF;
                 continue;
               }
               // Check for the very big VFs that we're not rebuilding same
               // trees, just with larger number of elements.
-              if (Size > MaxRegVF && TreeSize > 1 &&
-                  all_of(RangeSizes.slice(Cnt, Size),
+              if (VF > MaxRegVF && TreeSize > 1 &&
+                  all_of(RangeSizes.slice(SliceStartIdx, VF),
                          std::bind(FirstSizeSame, TreeSize, _1))) {
-                Cnt += Size;
-                while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
-                  ++Cnt;
+                SliceStartIdx += VF;
+                while (SliceStartIdx != MaxSliceEnd &&
+                       RangeSizes[SliceStartIdx].first == TreeSize)
+                  ++SliceStartIdx;
                 continue;
               }
-              if (TreeSize > 1)
+              if (TreeSize > 1) {
                 for (std::pair<unsigned, unsigned> &P :
-                     RangeSizes.slice(Cnt, Size)) {
-                  if (Size >= MaxRegVF)
+                     RangeSizes.slice(SliceStartIdx, VF)) {
+                  if (VF >= MaxRegVF)
                     P.second = std::max(P.second, TreeSize);
                   else
                     P.first = std::max(P.first, TreeSize);
                 }
-              ++Cnt;
+              }
+              ++SliceStartIdx;
               AnyProfitableGraph = true;
             }
-            if (StartIdx >= End)
+            if (FirstUnvecStore >= End)
               break;
-            if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
+            if (MaxSliceEnd - FirstUnvecStore < VF &&
+                MaxSliceEnd - FirstUnvecStore >= MinVF)
               AnyProfitableGraph = true;
-            StartIdx = std::distance(
+            FirstUnvecStore = std::distance(
                 RangeSizes.begin(),
-                find_if(RangeSizes.drop_front(Sz),
-                        std::bind(IsNotVectorized, Size >= MaxRegVF, _1)));
+                find_if(RangeSizes.drop_front(MaxSliceEnd),
+                        std::bind(IsNotVectorized, VF >= MaxRegVF, _1)));
           }
-          if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
+          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(VF))
             break;
         }
         // All values vectorized - exit.

From cb011d3199e1160ad2706cb5b1d43692fa4784d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?=
 <jmartinezcaamao@gmail.com>
Date: Tue, 17 Jun 2025 14:32:05 +0200
Subject: [PATCH 723/851] [CUDA][HIP] Add a __device__ version of
 std::__glibcxx_assert_fail() (#136133)

libstdc++ 15 uses the non-constexpr function
std::__glibcxx_assert_fail() to trigger compilation errors when the
__glibcxx_assert(cond) macro is used in a constantly evaluated context.

Compilation fails when using code from the libstdc++ (such as
std::array) on device code, since these assertions invoke a
non-constexpr host function from device code.

This patch proposes a cuda wrapper header "bits/c++config.h" which adds
a __device__ version of std::__glibcxx_assert_fail().

Solves SWDEV-518041
---
 clang/lib/Headers/CMakeLists.txt              |  1 +
 .../Headers/cuda_wrappers/bits/c++config.h    | 51 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 clang/lib/Headers/cuda_wrappers/bits/c++config.h

diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c1c9d2e8c7b79..c96d209c1fc0c 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -341,6 +341,7 @@ set(cuda_wrapper_files
 )
 
 set(cuda_wrapper_bits_files
+  cuda_wrappers/bits/c++config.h
   cuda_wrappers/bits/shared_ptr_base.h
   cuda_wrappers/bits/basic_string.h
   cuda_wrappers/bits/basic_string.tcc
diff --git a/clang/lib/Headers/cuda_wrappers/bits/c++config.h b/clang/lib/Headers/cuda_wrappers/bits/c++config.h
new file mode 100644
index 0000000000000..eafa13a9cc640
--- /dev/null
+++ b/clang/lib/Headers/cuda_wrappers/bits/c++config.h
@@ -0,0 +1,51 @@
+// libstdc++ uses the non-constexpr function std::__glibcxx_assert_fail()
+// to trigger compilation errors when the __glibcxx_assert(cond) macro
+// is used in a constexpr context.
+// Compilation fails when using code from the libstdc++ (such as std::array) on
+// device code, since these assertions invoke a non-constexpr host function from
+// device code.
+//
+// To work around this issue, we declare our own device version of the function
+
+#ifndef __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG
+#define __CLANG_CUDA_WRAPPERS_BITS_CPP_CONFIG
+
+#include_next <bits/c++config.h>
+
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+
+#ifdef _GLIBCXX_VERBOSE_ASSERT
+__attribute__((device, noreturn)) inline void
+__glibcxx_assert_fail(const char *file, int line, const char *function,
+                      const char *condition) noexcept {
+  if (file && function && condition)
+    __builtin_printf("%s:%d: %s: Assertion '%s' failed.\n", file, line,
+                     function, condition);
+  else if (function)
+    __builtin_printf("%s: Undefined behavior detected.\n", function);
+  __builtin_abort();
+}
+#endif
+
+#endif
+__attribute__((device, noreturn, __always_inline__,
+               __visibility__("default"))) inline void
+__glibcxx_assert_fail(...) noexcept {
+  __builtin_abort();
+}
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif

From 3377b56338d93760507e1707ebde48536e28ee1c Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Tue, 17 Jun 2025 08:39:15 -0400
Subject: [PATCH 724/851] Revert "[clang] Add managarm support" (#144514)

Reverts llvm/llvm-project#139271

There are multiple failing build bots:
https://lab.llvm.org/buildbot/#/builders/10/builds/7482
https://lab.llvm.org/buildbot/#/builders/11/builds/17473
---
 clang/lib/Basic/Targets.cpp                   |   9 -
 clang/lib/Basic/Targets/OSTargets.h           |  30 --
 clang/lib/Driver/CMakeLists.txt               |   1 -
 clang/lib/Driver/Driver.cpp                   |   4 -
 clang/lib/Driver/ToolChains/Gnu.cpp           |   2 -
 clang/lib/Driver/ToolChains/Managarm.cpp      | 218 --------------
 clang/lib/Driver/ToolChains/Managarm.h        |  55 ----
 clang/lib/Lex/InitHeaderSearch.cpp            |   1 -
 .../lib/aarch64-managarm-mlibc/.keep          |   0
 .../lib/riscv64-managarm-mlibc/.keep          |   0
 .../lib/x86_64-managarm-mlibc/.keep           |   0
 .../lib64/aarch64-managarm-mlibc/.keep        |   0
 .../lib64/riscv64-managarm-mlibc/.keep        |   0
 .../lib64/x86_64-managarm-mlibc/.keep         |   0
 .../aarch64-managarm-mlibc/c++/10/.keep       |   0
 .../usr/include/c++/10/.keep                  |   0
 .../usr/include/c++/v1/.keep                  |   0
 .../riscv64-managarm-mlibc/c++/10/.keep       |   0
 .../x86_64-managarm-mlibc/c++/10/.keep        |   0
 .../usr/lib/aarch64-managarm-mlibc/.keep      |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/aarch64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbegin.o  |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginS.o |   0
 .../gcc/riscv64-managarm-mlibc/10/crtbeginT.o |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbegin.o   |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginS.o  |   0
 .../gcc/x86_64-managarm-mlibc/10/crtbeginT.o  |   0
 .../usr/lib/riscv64-managarm-mlibc/.keep      |   0
 .../usr/lib/x86_64-managarm-mlibc/.keep       |   0
 .../basic_managarm_tree/usr/lib64/.keep       |   0
 clang/test/Driver/managarm.cpp                | 267 ------------------
 clang/test/Preprocessor/init.c                |   5 -
 .../predefined-macros-no-warnings.c           |   3 -
 35 files changed, 595 deletions(-)
 delete mode 100644 clang/lib/Driver/ToolChains/Managarm.cpp
 delete mode 100644 clang/lib/Driver/ToolChains/Managarm.h
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
 delete mode 100644 clang/test/Driver/managarm.cpp

diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index af1111a863308..9889141ad2085 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -164,9 +164,6 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
       }
-    case llvm::Triple::Managarm:
-      return std::make_unique<ManagarmTargetInfo<AArch64leTargetInfo>>(Triple,
-                                                                       Opts);
     case llvm::Triple::NetBSD:
       return std::make_unique<NetBSDTargetInfo<AArch64leTargetInfo>>(Triple,
                                                                      Opts);
@@ -469,9 +466,6 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
         return std::make_unique<OHOSTargetInfo<RISCV64TargetInfo>>(Triple,
                                                                    Opts);
       }
-    case llvm::Triple::Managarm:
-      return std::make_unique<ManagarmTargetInfo<RISCV64TargetInfo>>(Triple,
-                                                                     Opts);
     default:
       return std::make_unique<RISCV64TargetInfo>(Triple, Opts);
     }
@@ -660,9 +654,6 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
       return std::make_unique<PS5OSTargetInfo<X86_64TargetInfo>>(Triple, Opts);
     case llvm::Triple::Hurd:
       return std::make_unique<HurdTargetInfo<X86_64TargetInfo>>(Triple, Opts);
-    case llvm::Triple::Managarm:
-      return std::make_unique<ManagarmTargetInfo<X86_64TargetInfo>>(Triple,
-                                                                    Opts);
     default:
       return std::make_unique<X86_64TargetInfo>(Triple, Opts);
     }
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
index 5dac699c2bb45..d148b38d03c7c 100644
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -395,36 +395,6 @@ class LLVM_LIBRARY_VISIBILITY LinuxTargetInfo : public OSTargetInfo<Target> {
   }
 };
 
-// Managarm Target
-template <typename Target>
-class LLVM_LIBRARY_VISIBILITY ManagarmTargetInfo : public OSTargetInfo<Target> {
-protected:
-  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
-                    MacroBuilder &Builder) const override {
-    DefineStd(Builder, "unix", Opts);
-    Builder.defineMacro("__managarm__");
-    if (Opts.POSIXThreads)
-      Builder.defineMacro("_REENTRANT");
-    if (Opts.CPlusPlus)
-      Builder.defineMacro("_GNU_SOURCE");
-    if (this->HasFloat128)
-      Builder.defineMacro("__FLOAT128__");
-  }
-
-public:
-  ManagarmTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
-      : OSTargetInfo<Target>(Triple, Opts) {
-    switch (Triple.getArch()) {
-    default:
-      break;
-    case llvm::Triple::x86:
-    case llvm::Triple::x86_64:
-      this->HasFloat128 = true;
-      break;
-    }
-  }
-};
-
 // NetBSD Target
 template <typename Target>
 class LLVM_LIBRARY_VISIBILITY NetBSDTargetInfo : public OSTargetInfo<Target> {
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 3cfd671e9d8f2..44e16edfb1ccf 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -66,7 +66,6 @@ add_clang_library(clangDriver
   ToolChains/HLSL.cpp
   ToolChains/Hurd.cpp
   ToolChains/Linux.cpp
-  ToolChains/Managarm.cpp
   ToolChains/MipsLinux.cpp
   ToolChains/MinGW.cpp
   ToolChains/MSP430.cpp
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 780bfc83dc623..2f86b6633df1c 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -33,7 +33,6 @@
 #include "ToolChains/Linux.h"
 #include "ToolChains/MSP430.h"
 #include "ToolChains/MSVC.h"
-#include "ToolChains/Managarm.h"
 #include "ToolChains/MinGW.h"
 #include "ToolChains/MipsLinux.h"
 #include "ToolChains/NaCl.h"
@@ -6851,9 +6850,6 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
     case llvm::Triple::Fuchsia:
       TC = std::make_unique<toolchains::Fuchsia>(*this, Target, Args);
       break;
-    case llvm::Triple::Managarm:
-      TC = std::make_unique<toolchains::Managarm>(*this, Target, Args);
-      break;
     case llvm::Triple::Solaris:
       TC = std::make_unique<toolchains::Solaris>(*this, Target, Args);
       break;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index afce4fffe1d5f..9203bbc91b0bb 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -226,8 +226,6 @@ static const char *getLDMOption(const llvm::Triple &T, const ArgList &Args) {
       return "elf_iamcu";
     return "elf_i386";
   case llvm::Triple::aarch64:
-    if (T.isOSManagarm())
-      return "aarch64managarm";
     return "aarch64linux";
   case llvm::Triple::aarch64_be:
     return "aarch64linuxb";
diff --git a/clang/lib/Driver/ToolChains/Managarm.cpp b/clang/lib/Driver/ToolChains/Managarm.cpp
deleted file mode 100644
index ff455f2c6ec70..0000000000000
--- a/clang/lib/Driver/ToolChains/Managarm.cpp
+++ /dev/null
@@ -1,218 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Managarm.h"
-#include "Arch/ARM.h"
-#include "Arch/RISCV.h"
-#include "clang/Config/config.h"
-#include "clang/Driver/CommonArgs.h"
-#include "clang/Driver/Driver.h"
-#include "clang/Driver/Options.h"
-#include "clang/Driver/SanitizerArgs.h"
-#include "llvm/Option/ArgList.h"
-#include "llvm/Support/Path.h"
-
-using namespace clang::driver;
-using namespace clang::driver::toolchains;
-using namespace clang;
-using namespace llvm::opt;
-
-using tools::addPathIfExists;
-
-std::string Managarm::getMultiarchTriple(const Driver &D,
-                                         const llvm::Triple &TargetTriple,
-                                         StringRef SysRoot) const {
-  switch (TargetTriple.getArch()) {
-  default:
-    return TargetTriple.str();
-  case llvm::Triple::x86_64:
-    return "x86_64-managarm-" + TargetTriple.getEnvironmentName().str();
-  case llvm::Triple::aarch64:
-    return "aarch64-managarm-" + TargetTriple.getEnvironmentName().str();
-  case llvm::Triple::riscv64:
-    return "riscv64-managarm-" + TargetTriple.getEnvironmentName().str();
-  }
-}
-
-static StringRef getOSLibDir(const llvm::Triple &Triple, const ArgList &Args) {
-  // It happens that only x86, PPC and SPARC use the 'lib32' variant of
-  // oslibdir, and using that variant while targeting other architectures causes
-  // problems because the libraries are laid out in shared system roots that
-  // can't cope with a 'lib32' library search path being considered. So we only
-  // enable them when we know we may need it.
-  //
-  // FIXME: This is a bit of a hack. We should really unify this code for
-  // reasoning about oslibdir spellings with the lib dir spellings in the
-  // GCCInstallationDetector, but that is a more significant refactoring.
-  if (Triple.getArch() == llvm::Triple::x86 || Triple.isPPC32() ||
-      Triple.getArch() == llvm::Triple::sparc)
-    return "lib32";
-
-  if (Triple.getArch() == llvm::Triple::x86_64 && Triple.isX32())
-    return "libx32";
-
-  if (Triple.getArch() == llvm::Triple::riscv32)
-    return "lib32";
-
-  return Triple.isArch32Bit() ? "lib" : "lib64";
-}
-
-Managarm::Managarm(const Driver &D, const llvm::Triple &Triple,
-                   const ArgList &Args)
-    : Generic_ELF(D, Triple, Args) {
-  GCCInstallation.init(Triple, Args);
-  Multilibs = GCCInstallation.getMultilibs();
-  SelectedMultilibs.assign({GCCInstallation.getMultilib()});
-  std::string SysRoot = computeSysRoot();
-
-  ToolChain::path_list &PPaths = getProgramPaths();
-
-  Generic_GCC::PushPPaths(PPaths);
-
-#ifdef ENABLE_LINKER_BUILD_ID
-  ExtraOpts.push_back("--build-id");
-#endif
-
-  // The selection of paths to try here is designed to match the patterns which
-  // the GCC driver itself uses, as this is part of the GCC-compatible driver.
-  // This was determined by running GCC in a fake filesystem, creating all
-  // possible permutations of these directories, and seeing which ones it added
-  // to the link paths.
-  path_list &Paths = getFilePaths();
-
-  const std::string OSLibDir = std::string(getOSLibDir(Triple, Args));
-  const std::string MultiarchTriple = getMultiarchTriple(D, Triple, SysRoot);
-
-  Generic_GCC::AddMultilibPaths(D, SysRoot, OSLibDir, MultiarchTriple, Paths);
-
-  addPathIfExists(D, concat(SysRoot, "/lib", MultiarchTriple), Paths);
-  addPathIfExists(D, concat(SysRoot, "/lib/..", OSLibDir), Paths);
-  addPathIfExists(D, concat(SysRoot, "/usr/lib", MultiarchTriple), Paths);
-  addPathIfExists(D, concat(SysRoot, "/usr", OSLibDir), Paths);
-
-  Generic_GCC::AddMultiarchPaths(D, SysRoot, OSLibDir, Paths);
-
-  addPathIfExists(D, concat(SysRoot, "/lib"), Paths);
-  addPathIfExists(D, concat(SysRoot, "/usr/lib"), Paths);
-}
-
-bool Managarm::HasNativeLLVMSupport() const { return true; }
-
-Tool *Managarm::buildLinker() const {
-  return new tools::gnutools::Linker(*this);
-}
-
-Tool *Managarm::buildAssembler() const {
-  return new tools::gnutools::Assembler(*this);
-}
-
-std::string Managarm::computeSysRoot() const {
-  if (!getDriver().SysRoot.empty())
-    return getDriver().SysRoot;
-  return std::string();
-}
-
-std::string Managarm::getDynamicLinker(const ArgList &Args) const {
-  switch (getTriple().getArch()) {
-  case llvm::Triple::aarch64:
-    return "/lib/aarch64-managarm/ld.so";
-  case llvm::Triple::riscv64: {
-    StringRef ABIName = tools::riscv::getRISCVABI(Args, getTriple());
-    return ("/lib/riscv64-managarm/ld-riscv64-" + ABIName + ".so").str();
-  }
-  case llvm::Triple::x86_64:
-    return "/lib/x86_64-managarm/ld.so";
-  default:
-    llvm_unreachable("unsupported architecture");
-  }
-}
-
-void Managarm::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
-                                         ArgStringList &CC1Args) const {
-  const Driver &D = getDriver();
-  std::string SysRoot = computeSysRoot();
-
-  if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc))
-    return;
-
-  if (!DriverArgs.hasArg(options::OPT_nostdlibinc))
-    addSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/local/include");
-
-  // Add 'include' in the resource directory, which is similar to
-  // GCC_INCLUDE_DIR (private headers) in GCC.
-  if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
-    SmallString<128> ResourceDirInclude(D.ResourceDir);
-    llvm::sys::path::append(ResourceDirInclude, "include");
-    addSystemInclude(DriverArgs, CC1Args, ResourceDirInclude);
-  }
-
-  if (DriverArgs.hasArg(options::OPT_nostdlibinc))
-    return;
-
-  // TOOL_INCLUDE_DIR
-  AddMultilibIncludeArgs(DriverArgs, CC1Args);
-
-  // Check for configure-time C include directories.
-  StringRef CIncludeDirs(C_INCLUDE_DIRS);
-  if (CIncludeDirs != "") {
-    SmallVector<StringRef, 5> dirs;
-    CIncludeDirs.split(dirs, ":");
-    for (StringRef dir : dirs) {
-      StringRef Prefix =
-          llvm::sys::path::is_absolute(dir) ? StringRef(SysRoot) : "";
-      addExternCSystemInclude(DriverArgs, CC1Args, Prefix + dir);
-    }
-    return;
-  }
-
-  // On systems using multiarch, add /usr/include/$triple before
-  // /usr/include.
-  std::string MultiarchIncludeDir = getMultiarchTriple(D, getTriple(), SysRoot);
-  if (!MultiarchIncludeDir.empty())
-    addExternCSystemInclude(
-        DriverArgs, CC1Args,
-        concat(SysRoot, "/usr/include", MultiarchIncludeDir));
-
-  // Add an include of '/include' directly. This isn't provided by default by
-  // system GCCs, but is often used with cross-compiling GCCs, and harmless to
-  // add even when Clang is acting as-if it were a system compiler.
-  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/include"));
-
-  addExternCSystemInclude(DriverArgs, CC1Args, concat(SysRoot, "/usr/include"));
-}
-
-void Managarm::addLibStdCxxIncludePaths(
-    const llvm::opt::ArgList &DriverArgs,
-    llvm::opt::ArgStringList &CC1Args) const {
-  // We need a detected GCC installation on Managarm to provide libstdc++'s
-  // headers.
-  if (!GCCInstallation.isValid())
-    return;
-
-  StringRef TripleStr = GCCInstallation.getTriple().str();
-
-  // Try generic GCC detection.
-  Generic_GCC::addGCCLibStdCxxIncludePaths(DriverArgs, CC1Args, TripleStr);
-}
-
-SanitizerMask Managarm::getSupportedSanitizers() const {
-  const bool IsX86_64 = getTriple().getArch() == llvm::Triple::x86_64;
-  SanitizerMask Res = ToolChain::getSupportedSanitizers();
-  Res |= SanitizerKind::PointerCompare;
-  Res |= SanitizerKind::PointerSubtract;
-  Res |= SanitizerKind::KernelAddress;
-  Res |= SanitizerKind::Vptr;
-  if (IsX86_64)
-    Res |= SanitizerKind::KernelMemory;
-  return Res;
-}
-
-void Managarm::addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const {
-  for (const auto &Opt : ExtraOpts)
-    CmdArgs.push_back(Opt.c_str());
-}
diff --git a/clang/lib/Driver/ToolChains/Managarm.h b/clang/lib/Driver/ToolChains/Managarm.h
deleted file mode 100644
index 2082e2c615f28..0000000000000
--- a/clang/lib/Driver/ToolChains/Managarm.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===--- Managarm.h - Managarm ToolChain Implementations --------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
-#define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
-
-#include "Gnu.h"
-#include "clang/Driver/ToolChain.h"
-
-namespace clang {
-namespace driver {
-namespace toolchains {
-
-class LLVM_LIBRARY_VISIBILITY Managarm : public Generic_ELF {
-public:
-  Managarm(const Driver &D, const llvm::Triple &Triple,
-           const llvm::opt::ArgList &Args);
-
-  bool HasNativeLLVMSupport() const override;
-
-  std::string getMultiarchTriple(const Driver &D,
-                                 const llvm::Triple &TargetTriple,
-                                 StringRef SysRoot) const override;
-
-  void
-  AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
-                            llvm::opt::ArgStringList &CC1Args) const override;
-  void
-  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
-                           llvm::opt::ArgStringList &CC1Args) const override;
-  SanitizerMask getSupportedSanitizers() const override;
-  std::string computeSysRoot() const override;
-
-  std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override;
-
-  void addExtraOpts(llvm::opt::ArgStringList &CmdArgs) const override;
-
-  std::vector<std::string> ExtraOpts;
-
-protected:
-  Tool *buildAssembler() const override;
-  Tool *buildLinker() const override;
-};
-
-} // end namespace toolchains
-} // end namespace driver
-} // end namespace clang
-
-#endif // LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_MANAGARM_H
diff --git a/clang/lib/Lex/InitHeaderSearch.cpp b/clang/lib/Lex/InitHeaderSearch.cpp
index 3e22b4001bde7..641e3beebc081 100644
--- a/clang/lib/Lex/InitHeaderSearch.cpp
+++ b/clang/lib/Lex/InitHeaderSearch.cpp
@@ -221,7 +221,6 @@ bool InitHeaderSearch::ShouldAddDefaultIncludePaths(
   case llvm::Triple::Hurd:
   case llvm::Triple::Linux:
   case llvm::Triple::LiteOS:
-  case llvm::Triple::Managarm:
   case llvm::Triple::NaCl:
   case llvm::Triple::NetBSD:
   case llvm::Triple::OpenBSD:
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/aarch64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/riscv64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib/x86_64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/aarch64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/riscv64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/lib64/x86_64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/aarch64-managarm-mlibc/c++/10/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/10/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/c++/v1/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/riscv64-managarm-mlibc/c++/10/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/include/x86_64-managarm-mlibc/c++/10/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/aarch64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/riscv64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib/x86_64-managarm-mlibc/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep b/clang/test/Driver/Inputs/basic_managarm_tree/usr/lib64/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/managarm.cpp b/clang/test/Driver/managarm.cpp
deleted file mode 100644
index 5afa17aadb6d2..0000000000000
--- a/clang/test/Driver/managarm.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-// UNSUPPORTED: system-windows
-
-// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-X86-64 %s
-// CHECK-X86-64:      "-cc1"
-// CHECK-X86-64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
-// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-X86-64-SAME: "-internal-externc-isystem"
-// CHECK-X86-64-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
-// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-X86-64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-X86-64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
-// CHECK-X86-64-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
-// CHECK-X86-64-SAME: "-L
-// CHECK-X86-64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-X86-64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-X86-64-LIBS %s
-// CHECK-X86-64-LIBS:      "-cc1"
-// CHECK-X86-64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-X86-64-LIBS-SAME: "-internal-externc-isystem"
-// CHECK-X86-64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-X86-64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-LIBS-SAME: "-dynamic-linker" "/lib/x86_64-managarm/ld.so"
-// CHECK-X86-64-LIBS-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbegin.o"
-// CHECK-X86-64-LIBS-SAME: "-L
-// CHECK-X86-64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-X86-64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-X86-64-STATIC %s
-// CHECK-X86-64-STATIC:      "-cc1"
-// CHECK-X86-64-STATIC-SAME: "-static-define"
-// CHECK-X86-64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/x86_64-managarm-mlibc/c++/10"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-X86-64-STATIC-SAME: "-internal-externc-isystem"
-// CHECK-X86-64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/x86_64-managarm-mlibc"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-X86-64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-STATIC-SAME: "-static"
-// CHECK-X86-64-STATIC-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginT.o"
-// CHECK-X86-64-STATIC-SAME: "-L
-// CHECK-X86-64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-X86-64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=x86_64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-X86-64-SHARED %s
-// CHECK-X86-64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-SHARED-SAME: "{{.*}}/usr/lib/gcc/x86_64-managarm-mlibc/10/crtbeginS.o"
-// CHECK-X86-64-SHARED-SAME: "-L
-// CHECK-X86-64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-managarm-mlibc/10/../../../../lib64"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/x86_64-managarm-mlibc"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-X86-64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-AARCH64 %s
-// CHECK-AARCH64:      "-cc1"
-// CHECK-AARCH64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
-// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-AARCH64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-AARCH64-SAME: "-internal-externc-isystem"
-// CHECK-AARCH64-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-AARCH64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-AARCH64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-SAME: "-m" "aarch64managarm"
-// CHECK-AARCH64-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
-// CHECK-AARCH64-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
-// CHECK-AARCH64-SAME: {{^}} "-L
-// CHECK-AARCH64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-AARCH64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-LIBS %s
-// CHECK-AARCH64-LIBS:      "-cc1"
-// CHECK-AARCH64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-AARCH64-LIBS-SAME: "-internal-externc-isystem"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-AARCH64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-LIBS-SAME: "-m" "aarch64managarm"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-dynamic-linker" "/lib/aarch64-managarm/ld.so"
-// CHECK-AARCH64-LIBS-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbegin.o"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L
-// CHECK-AARCH64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-AARCH64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-STATIC %s
-// CHECK-AARCH64-STATIC:      "-cc1"
-// CHECK-AARCH64-STATIC-SAME: "-static-define"
-// CHECK-AARCH64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/aarch64-managarm-mlibc/c++/10"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-AARCH64-STATIC-SAME: "-internal-externc-isystem"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/aarch64-managarm-mlibc"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-AARCH64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-STATIC-SAME: "-m" "aarch64managarm"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-static"
-// CHECK-AARCH64-STATIC-SAME: "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginT.o"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L
-// CHECK-AARCH64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-AARCH64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=aarch64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SHARED %s
-// CHECK-AARCH64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-AARCH64-SHARED-SAME: "-m" "aarch64managarm"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "{{.*}}/usr/lib/gcc/aarch64-managarm-mlibc/10/crtbeginS.o"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L
-// CHECK-AARCH64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/aarch64-managarm-mlibc/10/../../../../lib64"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/aarch64-managarm-mlibc"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-AARCH64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform 2>&1 | FileCheck --check-prefix=CHECK-RISCV64 %s
-// CHECK-RISCV64:      "-cc1"
-// CHECK-RISCV64-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
-// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-RISCV64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-RISCV64-SAME: "-internal-externc-isystem"
-// CHECK-RISCV64-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-RISCV64-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-RISCV64:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
-// CHECK-RISCV64-SAME: "-L
-// CHECK-RISCV64-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-RISCV64-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=libc++ --rtlib=compiler-rt --unwindlib=libunwind 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-LIBS %s
-// CHECK-RISCV64-LIBS:      "-cc1"
-// CHECK-RISCV64-LIBS-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v1"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-RISCV64-LIBS-SAME: "-internal-externc-isystem"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-RISCV64-LIBS:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-LIBS-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbegin.o"
-// CHECK-RISCV64-LIBS-SAME: "-L
-// CHECK-RISCV64-LIBS-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-RISCV64-LIBS-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   --stdlib=platform -static 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-STATIC %s
-// CHECK-RISCV64-STATIC:      "-cc1"
-// CHECK-RISCV64-STATIC-SAME: "-static-define"
-// CHECK-RISCV64-STATIC-SAME: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/riscv64-managarm-mlibc/c++/10"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../include/c++/10/backward"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-RISCV64-STATIC-SAME: "-internal-externc-isystem"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "[[SYSROOT]]/usr/include/riscv64-managarm-mlibc"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-// CHECK-RISCV64-STATIC:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-STATIC-SAME: "-static"
-// CHECK-RISCV64-STATIC-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginT.o"
-// CHECK-RISCV64-STATIC-SAME: "-L
-// CHECK-RISCV64-STATIC-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-RISCV64-STATIC-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
-
-// RUN: %clang -### %s --target=riscv64-unknown-managarm-mlibc --sysroot=%S/Inputs/basic_managarm_tree \
-// RUN:   -shared 2>&1 | FileCheck --check-prefix=CHECK-RISCV64-SHARED %s
-// CHECK-RISCV64-SHARED:      "{{.*}}ld" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-RISCV64-SHARED-SAME: "{{.*}}/usr/lib/gcc/riscv64-managarm-mlibc/10/crtbeginS.o"
-// CHECK-RISCV64-SHARED-SAME: "-L
-// CHECK-RISCV64-SHARED-SAME: {{^}}[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/riscv64-managarm-mlibc/10/../../../../lib64"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib/../lib64"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/riscv64-managarm-mlibc"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib64"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/lib"
-// CHECK-RISCV64-SHARED-SAME: {{^}} "-L[[SYSROOT]]/usr/lib"
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index bed39dc3e34dc..031a6c1a755bd 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1622,11 +1622,6 @@
 // RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=amd64-unknown-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD-STDC-N %s
 // OPENBSD-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
 //
-// RUN: %clang_cc1 -triple=aarch64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
-// RUN: %clang_cc1 -triple=riscv64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
-// RUN: %clang_cc1 -triple=x86_64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
-// MANAGARM: #define __managarm__ 1
-
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -match-full-lines -check-prefix XCORE %s
 // XCORE:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
 // XCORE:#define __LITTLE_ENDIAN__ 1
diff --git a/clang/test/Preprocessor/predefined-macros-no-warnings.c b/clang/test/Preprocessor/predefined-macros-no-warnings.c
index fe27ed8814eec..4e3e29ccfa8a8 100644
--- a/clang/test/Preprocessor/predefined-macros-no-warnings.c
+++ b/clang/test/Preprocessor/predefined-macros-no-warnings.c
@@ -14,7 +14,6 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-linux-openhos
-// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-netbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-openbsd
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple aarch64-win32-gnu
@@ -109,7 +108,6 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-fuchsia
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-linux-openhos
-// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple riscv64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-linux
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple sparc-solaris
@@ -169,7 +167,6 @@
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-nacl
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps4
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-ps5
-// RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple x86_64-managarm
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spir64
 // RUN: %clang_cc1 %s -Eonly -Wsystem-headers -Werror -triple spirv32

From 5f841a6284900026929edcbe8d2b98ce813e0bbc Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Tue, 17 Jun 2025 07:41:20 -0500
Subject: [PATCH 725/851] [flang][OpenMP] Set _OPENMP macro for version 6.0
 (#144410)

---
 flang/include/flang/Support/OpenMP-features.h    | 3 +++
 flang/test/Driver/flang-openmp-version-macro.f90 | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/flang/include/flang/Support/OpenMP-features.h b/flang/include/flang/Support/OpenMP-features.h
index 1dd7ea560cc96..349cd19c1224f 100644
--- a/flang/include/flang/Support/OpenMP-features.h
+++ b/flang/include/flang/Support/OpenMP-features.h
@@ -42,6 +42,9 @@ void setOpenMPMacro(int version, FortranPredefinitions &predefinitions) {
   case 52:
     predefinitions.emplace_back("_OPENMP", "202111");
     break;
+  case 60:
+    predefinitions.emplace_back("_OPENMP", "202411");
+    break;
   case 11:
   default:
     predefinitions.emplace_back("_OPENMP", "199911");
diff --git a/flang/test/Driver/flang-openmp-version-macro.f90 b/flang/test/Driver/flang-openmp-version-macro.f90
index 95b3071544d06..f690ab3819482 100644
--- a/flang/test/Driver/flang-openmp-version-macro.f90
+++ b/flang/test/Driver/flang-openmp-version-macro.f90
@@ -2,7 +2,6 @@
 
 ! RUN: %flang_fc1 -fopenmp -cpp -E %s | FileCheck %s --check-prefix=DEFAULT-OPENMP-VERSION
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=11 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-11
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=11 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-11
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=20 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-20
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=25 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-25
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=30 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-30
@@ -12,6 +11,7 @@
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=50 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-50
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=51 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-51
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-52
+! RUN: %flang_fc1 -fopenmp -fopenmp-version=60 -cpp -E %s | FileCheck %s --check-prefix=OPENMP-VERSION-60
 
 ! DEFAULT-OPENMP-VERSION: integer :: var1 = 201107
 ! OPENMP-VERSION-11: integer :: var1 = 199911
@@ -24,6 +24,7 @@
 ! OPENMP-VERSION-50: integer :: var1 = 201811
 ! OPENMP-VERSION-51: integer :: var1 = 202011
 ! OPENMP-VERSION-52: integer :: var1 = 202111
+! OPENMP-VERSION-60: integer :: var1 = 202411
 
 #if _OPENMP
   integer :: var1 = _OPENMP

From b91936aeffb798b7deb67aff7bc5c84acea5452e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 17 Jun 2025 21:55:57 +0900
Subject: [PATCH 726/851] AMDGPU: Combine nnan fminimum/fmaximum to
 fminnum_ieee/fmaxnum_ieee (#142217)

This improves codegen for gfx950, where fminimum/fmaximum are
legal through fminimum3/fmaximum3, so may have an additional
encoding cost.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |  11 ++
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 141 +++++-----------
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll | 165 ++++++-------------
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 141 +++++-----------
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll | 165 ++++++-------------
 5 files changed, 203 insertions(+), 420 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 30535ae88f7ba..0ced3a6ba9bc0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13880,6 +13880,17 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
       return Res;
   }
 
+  // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
+  // for some types, but at a higher cost since it's implemented with a 3
+  // operand form.
+  const SDNodeFlags Flags = N->getFlags();
+  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
+      !Subtarget->hasIEEEMinMax() && Flags.hasNoNaNs()) {
+    unsigned NewOpc =
+        Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+    return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index a56c92785d487..92a2f54841eed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -113,17 +113,11 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -270,17 +264,11 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
 ; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -771,17 +759,11 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v2f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -939,17 +921,11 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v2f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1296,19 +1272,12 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v3f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -1501,19 +1470,12 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v3f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1741,19 +1703,12 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v4f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -1981,19 +1936,12 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v4f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_maximum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_maximum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -2788,4 +2736,3 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index 826bf427503ab..6c4f13a4eab8f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -85,17 +85,11 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -199,17 +193,11 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -568,19 +556,12 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v2f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -704,19 +685,12 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v2f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v2f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -971,21 +945,13 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v3f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX900-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
-; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1131,21 +1097,13 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v3f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX900-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v3f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
-; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1310,23 +1268,14 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v4f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX900-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX900-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
-; GFX950-NEXT:    v_maximum3_f32 v2, v2, v6, v6
-; GFX950-NEXT:    v_maximum3_f32 v3, v3, v7, v7
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1493,23 +1442,14 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_maximum_v4f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX900-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX900-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX900-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_maximum_v4f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
-; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
-; GFX950-NEXT:    v_maximum3_f32 v2, v2, v6, v6
-; GFX950-NEXT:    v_maximum3_f32 v3, v3, v7, v7
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -2051,4 +1991,3 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index 3dcc70b0ea3b6..9e82b41bb9585 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -91,17 +91,11 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -225,17 +219,11 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) {
 ; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -646,17 +634,11 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v2f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v2f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -779,17 +761,11 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v2f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v2f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1062,19 +1038,12 @@ define <3 x half> @v_minimum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v3f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v3f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v3f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -1220,19 +1189,12 @@ define <3 x half> @v_minimum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v3f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v3f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v3f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1401,19 +1363,12 @@ define <4 x half> @v_minimum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v4f16__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v4f16__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v4f16__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f16__nnan:
 ; GFX10:       ; %bb.0:
@@ -1582,19 +1537,12 @@ define <4 x half> @v_minimum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1)
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v4f16__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v4f16__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_pk_minimum3_f16 v0, v0, v2, v2
-; GFX950-NEXT:    v_pk_minimum3_f16 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v4f16__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f16__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -2207,4 +2155,3 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
-; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 0215795467323..8adbe861fe6f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -85,17 +85,11 @@ define float @v_minimum_f32__nnan(float %src0, float %src1) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -199,17 +193,11 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -568,19 +556,12 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v2f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v2f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -704,19 +685,12 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v2f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v2f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -971,21 +945,13 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v3f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX900-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v3f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
-; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v3f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1131,21 +1097,13 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v3f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX900-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v3f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
-; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1310,23 +1268,14 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v4f32__nnan:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX900-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX900-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v4f32__nnan:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
-; GFX950-NEXT:    v_minimum3_f32 v2, v2, v6, v6
-; GFX950-NEXT:    v_minimum3_f32 v3, v3, v7, v7
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v4f32__nnan:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1493,23 +1442,14 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_minimum_v4f32__nnan_nsz:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX900-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX900-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX900-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-LABEL: v_minimum_v4f32__nnan_nsz:
-; GFX950:       ; %bb.0:
-; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
-; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
-; GFX950-NEXT:    v_minimum3_f32 v2, v2, v6, v6
-; GFX950-NEXT:    v_minimum3_f32 v3, v3, v7, v7
-; GFX950-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -2051,4 +1991,3 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
-; GFX9: {{.*}}

From b4e39e4ff923334a8a1fdcc6d92b01d3885a01f2 Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail@gmail.com>
Date: Tue, 17 Jun 2025 15:03:37 +0200
Subject: [PATCH 727/851] [LLVM] [Support] Query the terminal width using
 `ioctl()` (#143514)

On unix systems, we were trying to determine the terminal width using
the `COULMNS` environment variable. Unfortunately, `COLUMNS` is not
exported by all shells and thus not available on some systems.

We were previously using `ioctl()` for this; fall back to doing so if `COLUMNS`
does not exist or does not store a positive integer.

This essentially reverts a3eb3d3d92d037fe3c9deaad87f6fc42fe9ea766 and
parts of https://reviews.llvm.org/D61326.

For more information, see #139499.

Fixes #139499.
---
 llvm/cmake/config-ix.cmake              |  5 +++++
 llvm/include/llvm/Config/config.h.cmake |  3 +++
 llvm/lib/Support/Unix/Process.inc       | 24 ++++++++++++++++++------
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 9895469973e47..0fcd73e752311 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -19,6 +19,7 @@ if (ANDROID OR CYGWIN OR CMAKE_SYSTEM_NAME MATCHES "AIX|DragonFly|FreeBSD|Haiku|
   set(HAVE_SYS_MMAN_H 1)
   set(HAVE_SYSEXITS_H 1)
   set(HAVE_UNISTD_H 1)
+  set(HAVE_SYS_IOCTL_H 1)
 elseif (APPLE)
   set(HAVE_MACH_MACH_H 1)
   set(HAVE_MALLOC_MALLOC_H 1)
@@ -26,6 +27,7 @@ elseif (APPLE)
   set(HAVE_SYS_MMAN_H 1)
   set(HAVE_SYSEXITS_H 1)
   set(HAVE_UNISTD_H 1)
+  set(HAVE_SYS_IOCTL_H 1)
 elseif (WIN32)
   set(HAVE_MACH_MACH_H 0)
   set(HAVE_MALLOC_MALLOC_H 0)
@@ -33,6 +35,7 @@ elseif (WIN32)
   set(HAVE_SYS_MMAN_H 0)
   set(HAVE_SYSEXITS_H 0)
   set(HAVE_UNISTD_H 0)
+  set(HAVE_SYS_IOCTL_H 0)
 elseif (ZOS)
   # Confirmed in
   # https://github.com/llvm/llvm-project/pull/104706#issuecomment-2297109613
@@ -42,6 +45,7 @@ elseif (ZOS)
   set(HAVE_SYS_MMAN_H 1)
   set(HAVE_SYSEXITS_H 0)
   set(HAVE_UNISTD_H 1)
+  set(HAVE_SYS_IOCTL_H 1)
 else()
   # Other platforms that we don't promise support for.
   check_include_file(mach/mach.h HAVE_MACH_MACH_H)
@@ -50,6 +54,7 @@ else()
   check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
   check_include_file(sysexits.h HAVE_SYSEXITS_H)
   check_include_file(unistd.h HAVE_UNISTD_H)
+  check_include_file(sys/ioctl.h HAVE_SYS_IOCTL_H)
 endif()
 
 if( UNIX AND NOT (APPLE OR BEOS OR HAIKU) )
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 06d4756397911..ce83de8e4cba9 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -164,6 +164,9 @@
 /* Define to 1 if you have the <sys/mman.h> header file. */
 #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H}
 
+/* Define to 1 if you have the <sys/ioctl.h> header file. */
+#cmakedefine HAVE_SYS_IOCTL_H ${HAVE_SYS_IOCTL_H}
+
 /* Define to 1 if stat struct has st_mtimespec member .*/
 #cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC}
 
diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index b5c3719f57963..db735b7484ad8 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -34,6 +34,9 @@
 #ifdef HAVE_GETAUXVAL
 #include <sys/auxv.h>
 #endif
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
 
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only generic UNIX code that
@@ -304,31 +307,40 @@ bool Process::FileDescriptorIsDisplayed(int fd) {
 #endif
 }
 
-static unsigned getColumns() {
+static unsigned getColumns(int FileID) {
   // If COLUMNS is defined in the environment, wrap to that many columns.
+  // This matches GCC.
   if (const char *ColumnsStr = std::getenv("COLUMNS")) {
     int Columns = std::atoi(ColumnsStr);
     if (Columns > 0)
       return Columns;
   }
 
-  // We used to call ioctl TIOCGWINSZ to determine the width. It is considered
-  // unuseful.
-  return 0;
+  // Some shells do not export COLUMNS; query the column count via ioctl()
+  // instead if it isn't available.
+  unsigned Columns = 0;
+
+#ifdef HAVE_SYS_IOCTL_H
+  struct winsize ws;
+  if (ioctl(FileID, TIOCGWINSZ, &ws) == 0)
+    Columns = ws.ws_col;
+#endif
+
+  return Columns;
 }
 
 unsigned Process::StandardOutColumns() {
   if (!StandardOutIsDisplayed())
     return 0;
 
-  return getColumns();
+  return getColumns(0);
 }
 
 unsigned Process::StandardErrColumns() {
   if (!StandardErrIsDisplayed())
     return 0;
 
-  return getColumns();
+  return getColumns(1);
 }
 
 static bool terminalHasColors() {

From 3451cd5d206f29df5b6ab5c200b7b8b17f3f2e3f Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 15:03:48 +0200
Subject: [PATCH 728/851] [PowerPC] Regenerate MIR test checks (NFC)

---
 .../PowerPC/aix-vector-vararg-caller.ll       | 227 +++++++++---------
 1 file changed, 114 insertions(+), 113 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
index 472be4fa63643..4697a093e5d6d 100644
--- a/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-vararg-caller.ll
@@ -8,123 +8,124 @@
 ; RUN: FileCheck --check-prefix=64BIT %s
 
 define <4 x i32> @caller() {
+
   ; 32BIT-LABEL: name: caller
   ; 32BIT: bb.0.entry:
-  ; 32BIT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI:%[0-9]+]]:gprc = LI 48
-  ; 32BIT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI1:%[0-9]+]]:gprc = LI 32
-  ; 32BIT:   STXVW4X killed [[LXVW4X1]], $r1, killed [[LI1]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc2:%[0-9]+]]:gprc = LWZtoc %const.2, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc2]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI2:%[0-9]+]]:gprc = LI 160
-  ; 32BIT:   STXVW4X killed [[LXVW4X2]], $r1, killed [[LI2]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc3:%[0-9]+]]:gprc = LWZtoc %const.3, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc3]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI3:%[0-9]+]]:gprc = LI 144
-  ; 32BIT:   STXVW4X killed [[LXVW4X3]], $r1, killed [[LI3]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc4:%[0-9]+]]:gprc = LWZtoc %const.4, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc4]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI4:%[0-9]+]]:gprc = LI 128
-  ; 32BIT:   STXVW4X killed [[LXVW4X4]], $r1, killed [[LI4]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc5:%[0-9]+]]:gprc = LWZtoc %const.5, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc5]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI5:%[0-9]+]]:gprc = LI 112
-  ; 32BIT:   STXVW4X killed [[LXVW4X5]], $r1, killed [[LI5]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc6:%[0-9]+]]:gprc = LWZtoc %const.6, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc6]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI6:%[0-9]+]]:gprc = LI 96
-  ; 32BIT:   STXVW4X killed [[LXVW4X6]], $r1, killed [[LI6]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc7:%[0-9]+]]:gprc = LWZtoc %const.7, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc7]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI7:%[0-9]+]]:gprc = LI 80
-  ; 32BIT:   STXVW4X killed [[LXVW4X7]], $r1, killed [[LI7]] :: (store (s128))
-  ; 32BIT:   [[LWZtoc8:%[0-9]+]]:gprc = LWZtoc %const.8, $r2 :: (load (s32) from got)
-  ; 32BIT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc8]] :: (load (s128) from constant-pool)
-  ; 32BIT:   [[LI8:%[0-9]+]]:gprc = LI 64
-  ; 32BIT:   STXVW4X killed [[LXVW4X8]], $r1, killed [[LI8]] :: (store (s128))
-  ; 32BIT:   [[LWZ:%[0-9]+]]:gprc = LWZ 52, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ1:%[0-9]+]]:gprc = LWZ 48, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ2:%[0-9]+]]:gprc = LWZ 44, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ3:%[0-9]+]]:gprc = LWZ 40, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ4:%[0-9]+]]:gprc = LWZ 36, $r1 :: (load (s32))
-  ; 32BIT:   [[LWZ5:%[0-9]+]]:gprc = LWZ 32, $r1 :: (load (s32))
-  ; 32BIT:   [[LI9:%[0-9]+]]:gprc = LI 9
-  ; 32BIT:   $r3 = COPY [[LI9]]
-  ; 32BIT:   $r5 = COPY [[LWZ5]]
-  ; 32BIT:   $r6 = COPY [[LWZ4]]
-  ; 32BIT:   $r7 = COPY [[LWZ3]]
-  ; 32BIT:   $r8 = COPY [[LWZ2]]
-  ; 32BIT:   $r9 = COPY [[LWZ1]]
-  ; 32BIT:   $r10 = COPY [[LWZ]]
-  ; 32BIT:   BL_NOP <mcsymbol .callee[PR]>, csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def $v2
-  ; 32BIT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
-  ; 32BIT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
-  ; 32BIT:   $v2 = COPY [[COPY]]
-  ; 32BIT:   BLR implicit $lr, implicit $rm, implicit $v2
-
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 176, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   [[LWZtoc:%[0-9]+]]:gprc = LWZtoc %const.0, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI:%[0-9]+]]:gprc = LI 48
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X]], $r1, killed [[LI]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc1:%[0-9]+]]:gprc = LWZtoc %const.1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc1]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI1:%[0-9]+]]:gprc = LI 32
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X1]], $r1, killed [[LI1]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc2:%[0-9]+]]:gprc = LWZtoc %const.2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc2]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI2:%[0-9]+]]:gprc = LI 160
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X2]], $r1, killed [[LI2]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc3:%[0-9]+]]:gprc = LWZtoc %const.3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc3]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI3:%[0-9]+]]:gprc = LI 144
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X3]], $r1, killed [[LI3]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc4:%[0-9]+]]:gprc = LWZtoc %const.4, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc4]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI4:%[0-9]+]]:gprc = LI 128
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X4]], $r1, killed [[LI4]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc5:%[0-9]+]]:gprc = LWZtoc %const.5, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc5]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI5:%[0-9]+]]:gprc = LI 112
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X5]], $r1, killed [[LI5]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc6:%[0-9]+]]:gprc = LWZtoc %const.6, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc6]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI6:%[0-9]+]]:gprc = LI 96
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X6]], $r1, killed [[LI6]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc7:%[0-9]+]]:gprc = LWZtoc %const.7, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc7]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI7:%[0-9]+]]:gprc = LI 80
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X7]], $r1, killed [[LI7]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZtoc8:%[0-9]+]]:gprc = LWZtoc %const.8, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero, killed [[LWZtoc8]] :: (load (s128) from constant-pool)
+  ; 32BIT-NEXT:   [[LI8:%[0-9]+]]:gprc = LI 64
+  ; 32BIT-NEXT:   STXVW4X killed [[LXVW4X8]], $r1, killed [[LI8]] :: (store (s128))
+  ; 32BIT-NEXT:   [[LWZ:%[0-9]+]]:gprc = LWZ 52, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ1:%[0-9]+]]:gprc = LWZ 48, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ2:%[0-9]+]]:gprc = LWZ 44, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ3:%[0-9]+]]:gprc = LWZ 40, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ4:%[0-9]+]]:gprc = LWZ 36, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LWZ5:%[0-9]+]]:gprc = LWZ 32, $r1 :: (load (s32))
+  ; 32BIT-NEXT:   [[LI9:%[0-9]+]]:gprc = LI 9
+  ; 32BIT-NEXT:   $r3 = COPY [[LI9]]
+  ; 32BIT-NEXT:   $r5 = COPY [[LWZ5]]
+  ; 32BIT-NEXT:   $r6 = COPY [[LWZ4]]
+  ; 32BIT-NEXT:   $r7 = COPY [[LWZ3]]
+  ; 32BIT-NEXT:   $r8 = COPY [[LWZ2]]
+  ; 32BIT-NEXT:   $r9 = COPY [[LWZ1]]
+  ; 32BIT-NEXT:   $r10 = COPY [[LWZ]]
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .callee[PR]>, csr_aix32_altivec, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def $v2
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 176, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
+  ; 32BIT-NEXT:   $v2 = COPY [[COPY]]
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $v2
+  ;
   ; 64BIT-LABEL: name: caller
   ; 64BIT: bb.0.entry:
-  ; 64BIT:   ADJCALLSTACKDOWN 208, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT:   [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_:%[0-9]+]]:g8rc = LI8 96
-  ; 64BIT:   STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_1:%[0-9]+]]:g8rc = LI8 80
-  ; 64BIT:   STXVW4X killed [[LXVW4X1]], $x1, killed [[LI8_1]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT2:%[0-9]+]]:g8rc = LDtocCPT %const.2, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT2]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_2:%[0-9]+]]:g8rc = LI8 64
-  ; 64BIT:   STXVW4X killed [[LXVW4X2]], $x1, killed [[LI8_2]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT3:%[0-9]+]]:g8rc = LDtocCPT %const.3, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT3]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_3:%[0-9]+]]:g8rc = LI8 192
-  ; 64BIT:   STXVW4X killed [[LXVW4X3]], $x1, killed [[LI8_3]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT4:%[0-9]+]]:g8rc = LDtocCPT %const.4, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT4]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_4:%[0-9]+]]:g8rc = LI8 176
-  ; 64BIT:   STXVW4X killed [[LXVW4X4]], $x1, killed [[LI8_4]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT5:%[0-9]+]]:g8rc = LDtocCPT %const.5, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT5]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_5:%[0-9]+]]:g8rc = LI8 160
-  ; 64BIT:   STXVW4X killed [[LXVW4X5]], $x1, killed [[LI8_5]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT6:%[0-9]+]]:g8rc = LDtocCPT %const.6, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT6]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_6:%[0-9]+]]:g8rc = LI8 144
-  ; 64BIT:   STXVW4X killed [[LXVW4X6]], $x1, killed [[LI8_6]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT7:%[0-9]+]]:g8rc = LDtocCPT %const.7, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT7]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_7:%[0-9]+]]:g8rc = LI8 128
-  ; 64BIT:   STXVW4X killed [[LXVW4X7]], $x1, killed [[LI8_7]] :: (store (s128))
-  ; 64BIT:   [[LDtocCPT8:%[0-9]+]]:g8rc = LDtocCPT %const.8, $x2 :: (load (s64) from got)
-  ; 64BIT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT8]] :: (load (s128) from constant-pool)
-  ; 64BIT:   [[LI8_8:%[0-9]+]]:g8rc = LI8 112
-  ; 64BIT:   STXVW4X killed [[LXVW4X8]], $x1, killed [[LI8_8]] :: (store (s128))
-  ; 64BIT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
-  ; 64BIT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
-  ; 64BIT:   [[LD2:%[0-9]+]]:g8rc = LD 88, $x1 :: (load (s64))
-  ; 64BIT:   [[LD3:%[0-9]+]]:g8rc = LD 80, $x1 :: (load (s64))
-  ; 64BIT:   [[LD4:%[0-9]+]]:g8rc = LD 72, $x1 :: (load (s64))
-  ; 64BIT:   [[LD5:%[0-9]+]]:g8rc = LD 64, $x1 :: (load (s64))
-  ; 64BIT:   [[LI8_9:%[0-9]+]]:g8rc = LI8 9
-  ; 64BIT:   $x3 = COPY [[LI8_9]]
-  ; 64BIT:   $x5 = COPY [[LD5]]
-  ; 64BIT:   $x6 = COPY [[LD4]]
-  ; 64BIT:   $x7 = COPY [[LD3]]
-  ; 64BIT:   $x8 = COPY [[LD2]]
-  ; 64BIT:   $x9 = COPY [[LD1]]
-  ; 64BIT:   $x10 = COPY [[LD]]
-  ; 64BIT:   BL8_NOP <mcsymbol .callee[PR]>, csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def $v2
-  ; 64BIT:   ADJCALLSTACKUP 208, 0, implicit-def dead $r1, implicit $r1
-  ; 64BIT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
-  ; 64BIT:   $v2 = COPY [[COPY]]
-  ; 64BIT:   BLR8 implicit $lr8, implicit $rm, implicit $v2
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 208, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   [[LDtocCPT:%[0-9]+]]:g8rc = LDtocCPT %const.0, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_:%[0-9]+]]:g8rc = LI8 96
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X]], $x1, killed [[LI8_]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT1:%[0-9]+]]:g8rc = LDtocCPT %const.1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X1:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT1]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_1:%[0-9]+]]:g8rc = LI8 80
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X1]], $x1, killed [[LI8_1]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT2:%[0-9]+]]:g8rc = LDtocCPT %const.2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X2:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT2]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_2:%[0-9]+]]:g8rc = LI8 64
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X2]], $x1, killed [[LI8_2]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT3:%[0-9]+]]:g8rc = LDtocCPT %const.3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X3:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT3]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_3:%[0-9]+]]:g8rc = LI8 192
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X3]], $x1, killed [[LI8_3]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT4:%[0-9]+]]:g8rc = LDtocCPT %const.4, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X4:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT4]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_4:%[0-9]+]]:g8rc = LI8 176
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X4]], $x1, killed [[LI8_4]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT5:%[0-9]+]]:g8rc = LDtocCPT %const.5, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X5:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT5]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_5:%[0-9]+]]:g8rc = LI8 160
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X5]], $x1, killed [[LI8_5]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT6:%[0-9]+]]:g8rc = LDtocCPT %const.6, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X6:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT6]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_6:%[0-9]+]]:g8rc = LI8 144
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X6]], $x1, killed [[LI8_6]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT7:%[0-9]+]]:g8rc = LDtocCPT %const.7, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X7:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT7]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_7:%[0-9]+]]:g8rc = LI8 128
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X7]], $x1, killed [[LI8_7]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LDtocCPT8:%[0-9]+]]:g8rc = LDtocCPT %const.8, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   [[LXVW4X8:%[0-9]+]]:vsrc = LXVW4X $zero8, killed [[LDtocCPT8]] :: (load (s128) from constant-pool)
+  ; 64BIT-NEXT:   [[LI8_8:%[0-9]+]]:g8rc = LI8 112
+  ; 64BIT-NEXT:   STXVW4X killed [[LXVW4X8]], $x1, killed [[LI8_8]] :: (store (s128))
+  ; 64BIT-NEXT:   [[LD:%[0-9]+]]:g8rc = LD 104, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD1:%[0-9]+]]:g8rc = LD 96, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD2:%[0-9]+]]:g8rc = LD 88, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD3:%[0-9]+]]:g8rc = LD 80, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD4:%[0-9]+]]:g8rc = LD 72, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LD5:%[0-9]+]]:g8rc = LD 64, $x1 :: (load (s64))
+  ; 64BIT-NEXT:   [[LI8_9:%[0-9]+]]:g8rc = LI8 9
+  ; 64BIT-NEXT:   $x3 = COPY [[LI8_9]]
+  ; 64BIT-NEXT:   $x5 = COPY [[LD5]]
+  ; 64BIT-NEXT:   $x6 = COPY [[LD4]]
+  ; 64BIT-NEXT:   $x7 = COPY [[LD3]]
+  ; 64BIT-NEXT:   $x8 = COPY [[LD2]]
+  ; 64BIT-NEXT:   $x9 = COPY [[LD1]]
+  ; 64BIT-NEXT:   $x10 = COPY [[LD]]
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .callee[PR]>, csr_ppc64_altivec, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def $v2
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 208, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   [[COPY:%[0-9]+]]:vsrc = COPY $v2
+  ; 64BIT-NEXT:   $v2 = COPY [[COPY]]
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $v2
   entry:
     %call = tail call <4 x i32> (i32, ...) @callee(i32 9, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> <i32 8, i32 9, i32 10, i32 11>, <4 x i32> <i32 12, i32 13, i32 14, i32 15>, <4 x i32> <i32 16, i32 17, i32 18, i32 19>, <4 x i32> <i32 20, i32 21, i32 22, i32 23>, <4 x i32> <i32 24, i32 25, i32 26, i32 27>, <4 x i32> <i32 28, i32 29, i32 30, i32 31>, <4 x i32> <i32 32, i32 33, i32 34, i32 35>)
       ret <4 x i32> %call

From 76ea1db1746db254716aafbc992b637cd10c6ea3 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 17 Jun 2025 15:16:24 +0200
Subject: [PATCH 729/851] [PowerPC] Split test into assembly and MIR variants
 (NFC)

So that both can be generated.
---
 llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll |  588 ++++++++
 llvm/test/CodeGen/PowerPC/aix-cc-byval.ll     | 1301 +++++++----------
 2 files changed, 1080 insertions(+), 809 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll

diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll
new file mode 100644
index 0000000000000..67800df6ed4b5
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-mir.ll
@@ -0,0 +1,588 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \
+; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefixes=32BIT %s
+
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \
+; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefixes=64BIT %s
+
+%struct.S0 = type {}
+
+%struct.S1 = type { [1 x i8] }
+@gS1 = external global %struct.S1, align 1
+
+define void @call_test_byval_1Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_1Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS1, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 0, killed renamable $r3 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM killed renamable $r3, 24, 0, 7
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_1Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_1Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS1, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x3 = LBZ8 0, killed renamable $x3 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x3 = RLDICR killed renamable $x3, 56, 7
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_1Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %s0 = alloca %struct.S0, align 8
+  %call = call zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 @gS1)
+  ret void
+}
+
+define zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 %s) {
+  ; 32BIT-LABEL: name: test_byval_1Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   renamable $r4 = COPY $r3
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM $r3, 8, 24, 31
+  ; 32BIT-NEXT:   STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_1Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   renamable $x4 = COPY $x3
+  ; 64BIT-NEXT:   renamable $x3 = RLDICL $x3, 8, 56
+  ; 64BIT-NEXT:   STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %0 = load i8, ptr %s, align 1
+  ret i8 %0
+}
+
+@f = common global float 0.000000e+00, align 4
+
+%struct.S2 = type { [2 x i8] }
+
+@gS2 = external global %struct.S2, align 1
+
+define void @call_test_byval_2Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_2Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @f, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load (s32) from @f)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS2, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = LHZ 0, killed renamable $r3 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r5 = RLWINM killed renamable $r3, 16, 0, 15
+  ; 32BIT-NEXT:   $r3 = LI 42
+  ; 32BIT-NEXT:   $f2 = COPY renamable $f1
+  ; 32BIT-NEXT:   $r7 = LI 43
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_2Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r5, implicit killed $f2, implicit killed $r7, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_2Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @f, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load (s32) from @f)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS2, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x3 = LHZ8 0, killed renamable $x3 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x5 = RLDICR killed renamable $x3, 48, 15
+  ; 64BIT-NEXT:   $x3 = LI8 42
+  ; 64BIT-NEXT:   $f2 = COPY renamable $f1
+  ; 64BIT-NEXT:   $x7 = LI8 43
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_2Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x5, implicit killed $f2, implicit killed $x7, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %0 = load float, ptr @f, align 4
+  %call = call zeroext i8 @test_byval_2Byte(i32 signext 42, float %0, ptr byval(%struct.S2) align 1 @gS2, float %0, i32 signext 43)
+  ret void
+}
+
+define zeroext i8 @test_byval_2Byte(i32, float, ptr byval(%struct.S2) align 1 %s, float, i32) {
+  ; 32BIT-LABEL: name: test_byval_2Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r5
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 1, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_2Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x5
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x5, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT-NEXT:   renamable $x3 = LBZ8 1, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %arrayidx = getelementptr inbounds %struct.S2, ptr %s, i32 0, i32 0, i32 1
+  %4 = load i8, ptr %arrayidx, align 1
+  ret i8 %4
+}
+
+%struct.S3 = type <{ i8, i16 }>
+@gS3 = external global %struct.S3, align 1
+
+define void @call_test_byval_3Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_3Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LI 42
+  ; 32BIT-NEXT:   renamable $r4 = LWZtoc @gS3, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 56, $r1 :: (store (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 2, renamable $r4 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r4 = LHZ 0, killed renamable $r4 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r10 = RLWINM killed renamable $r3, 8, 16, 23
+  ; 32BIT-NEXT:   renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r4, 16, 0, 15
+  ; 32BIT-NEXT:   $r3 = LI 1
+  ; 32BIT-NEXT:   $r4 = LI 2
+  ; 32BIT-NEXT:   $r5 = LI 3
+  ; 32BIT-NEXT:   $r6 = LI 4
+  ; 32BIT-NEXT:   $r7 = LI 5
+  ; 32BIT-NEXT:   $r8 = LI 6
+  ; 32BIT-NEXT:   $r9 = LI 7
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_3Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_3Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LI8 42
+  ; 64BIT-NEXT:   renamable $x4 = LDtoc @gS3, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   STD killed renamable $x3, 112, $x1 :: (store (s64))
+  ; 64BIT-NEXT:   renamable $x3 = LBZ8 2, renamable $x4 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x4 = LHZ8 0, killed renamable $x4 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x10 = RLDIC killed renamable $x3, 40, 16
+  ; 64BIT-NEXT:   renamable $x10 = RLDIMI killed renamable $x10, killed renamable $x4, 48, 0
+  ; 64BIT-NEXT:   $x3 = LI8 1
+  ; 64BIT-NEXT:   $x4 = LI8 2
+  ; 64BIT-NEXT:   $x5 = LI8 3
+  ; 64BIT-NEXT:   $x6 = LI8 4
+  ; 64BIT-NEXT:   $x7 = LI8 5
+  ; 64BIT-NEXT:   $x8 = LI8 6
+  ; 64BIT-NEXT:   $x9 = LI8 7
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_3Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i16 @test_byval_3Byte(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, ptr byval(%struct.S3) align 1 @gS3, i32 42)
+  ret void
+}
+
+define zeroext i16 @test_byval_3Byte(i32, i32, i32, i32, i32, i32, i32, ptr byval(%struct.S3) align 1 %s, i32) {
+  ; 32BIT-LABEL: name: test_byval_3Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r10, 0, %fixed-stack.1 :: (store (s32) into %fixed-stack.1)
+  ; 32BIT-NEXT:   renamable $r3 = LHZ 1, %fixed-stack.1 :: (dereferenceable load (s16) from %ir.gep, align 1)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_3Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x10
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x10, 0, %fixed-stack.1 :: (store (s64) into %fixed-stack.1)
+  ; 64BIT-NEXT:   renamable $x3 = LHZ8 1, %fixed-stack.1 :: (dereferenceable load (s16) from %ir.gep, align 1)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %gep = getelementptr inbounds %struct.S3, ptr %s, i32 0, i32 1
+  %8 = load i16, ptr %gep, align 1
+  ret i16 %8
+}
+
+%struct.S4 = type { [4 x i8] }
+%struct.S4A = type { i32 }
+
+@gS4 = external global %struct.S4, align 1
+
+define void @call_test_byval_4Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_4Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS4, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8)
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_4Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_4Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS4, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x3 = LWZ8 0, killed renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x4 = LWZ8 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8)
+  ; 64BIT-NEXT:   renamable $x3 = RLDICR killed renamable $x3, 32, 31
+  ; 64BIT-NEXT:   renamable $x4 = RLDICR killed renamable $x4, 32, 31
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_4Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %s0 = alloca %struct.S0, align 8
+  %s4a = alloca %struct.S4A, align 8
+  %call = call signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 @gS4, ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S4A) align 4 %s4a)
+  ret void
+}
+
+define signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 %s, ptr byval(%struct.S0) align 1, ptr byval(%struct.S4A) align 4 %s4a) {
+  ; 32BIT-LABEL: name: test_byval_4Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW renamable $r3, 0, %fixed-stack.2 :: (store (s32) into %fixed-stack.2, align 8)
+  ; 32BIT-NEXT:   renamable $r3 = RLWINM killed renamable $r3, 0, 24, 31
+  ; 32BIT-NEXT:   renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r3
+  ; 32BIT-NEXT:   STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_4Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %fixed-stack.2 :: (store (s64) into %fixed-stack.2, align 16)
+  ; 64BIT-NEXT:   STD renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0)
+  ; 64BIT-NEXT:   renamable $r3 = LBZ 3, %fixed-stack.2 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 64BIT-NEXT:   renamable $x4 = RLDICL killed renamable $x4, 32, 32
+  ; 64BIT-NEXT:   renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r3, implicit killed $x4
+  ; 64BIT-NEXT:   renamable $x3 = EXTSW_32_64 killed renamable $r3
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %arrayidx = getelementptr inbounds %struct.S4, ptr %s, i32 0, i32 0, i32 3
+  %1 = load i8, ptr %arrayidx, align 1
+  %2 = load i32, ptr %s4a, align 4
+  %conv = zext i8 %1 to i32
+  %add = add nsw i32 %2, %conv
+  ret i32 %add
+}
+
+%struct.S5 = type { [5 x i8] }
+
+@gS5 = external global %struct.S5, align 1
+
+define void @call_test_byval_5Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_5Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS5, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LBZ 4, renamable $r3 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 24, 0, 7
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_5Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_5Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS5, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LBZ8 4, renamable $x3 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x5 = LWZ8 0, killed renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x3 = RLWINM8 killed renamable $x4, 24, 0, 7
+  ; 64BIT-NEXT:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x5, 32, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_5Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1 @gS5)
+  ret void
+}
+
+declare zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1)
+
+%struct.S6 = type { [6 x i8] }
+
+@gS6 = external global %struct.S6, align 1
+
+define void @call_test_byval_6Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_6Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS6, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LHZ 4, renamable $r3 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 16, 0, 15
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_6Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_6Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS6, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LHZ8 4, renamable $x3 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x5 = LWZ8 0, killed renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x3 = RLWINM8 killed renamable $x4, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x5, 32, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_6Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1 @gS6)
+  ret void
+}
+
+declare zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1)
+
+%struct.S7 = type { [7 x i8] }
+
+@gS7 = external global %struct.S7, align 1
+
+define void @call_test_byval_7Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_7Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS7, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r5 = LHZ 4, renamable $r3 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r4 = LBZ 6, renamable $r3 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = RLWINM killed renamable $r4, 8, 16, 23
+  ; 32BIT-NEXT:   renamable $r4 = RLWIMI killed renamable $r4, killed renamable $r5, 16, 0, 15
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_7Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_7Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS7, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x4 = LHZ8 4, renamable $x3 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x5 = LBZ8 6, renamable $x3 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x6 = LWZ8 0, killed renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x3 = RLWINM8 killed renamable $x5, 8, 16, 23
+  ; 64BIT-NEXT:   renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x4, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x6, 32, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_7Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1 @gS7)
+  ret void
+}
+
+declare zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1)
+
+%struct.S8 = type { [8 x i8] }
+
+@gS8 = external global %struct.S8, align 1
+
+define void @call_test_byval_8Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_8Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS8, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 4, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_8Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_8Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS8, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x3 = LD 0, killed renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_8Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1 @gS8)
+  ret void
+}
+
+declare zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1)
+
+%struct.S32 = type { [32 x i8] }
+
+@gS32 = external global %struct.S32, align 1
+
+define void @call_test_byval_32Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_32Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS32, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r10 = LWZ 28, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r9 = LWZ 24, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r8 = LWZ 20, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r7 = LWZ 16, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r6 = LWZ 12, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 8, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 4, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_32Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_32Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS32, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x6 = LD 24, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x5 = LD 16, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x4 = LD 8, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x3 = LD 0, killed renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_32Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1, implicit-def dead $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 @gS32)
+  ret void
+}
+
+define zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 %s) {
+  ; 32BIT-LABEL: name: test_byval_32Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 8)
+  ; 32BIT-NEXT:   renamable $r3 = LBZ 21, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 32BIT-NEXT:   STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: test_byval_32Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT-NEXT:   renamable $x3 = LBZ8 21, %fixed-stack.0 :: (dereferenceable load (s8) from %ir.arrayidx)
+  ; 64BIT-NEXT:   STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
+  ; 64BIT-NEXT:   STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %arrayidx = getelementptr inbounds %struct.S32, ptr %s, i32 0, i32 0, i32 21
+  %0 = load i8, ptr %arrayidx, align 1
+  ret i8 %0
+}
+
+; The ByVal handling produces dead stores. See `LowerFormalArguments_AIX` for
+; details on why.
+
+%struct.S31 = type <{ float, i32, i64, double, i32, i16, i8 }>
+
+@gS31 = external global %struct.S31, align 1
+
+define void @call_test_byval_31Byte() {
+  ; 32BIT-LABEL: name: call_test_byval_31Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r3 = LWZtoc @gS31, $r2 :: (load (s32) from got)
+  ; 32BIT-NEXT:   renamable $r11 = LHZ 28, renamable $r3 :: (load (s16))
+  ; 32BIT-NEXT:   renamable $r10 = LBZ 30, renamable $r3 :: (load (s8))
+  ; 32BIT-NEXT:   renamable $r9 = LWZ 24, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r8 = LWZ 20, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r7 = LWZ 16, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r6 = LWZ 12, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 8, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 4, renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r3 = LWZ 0, killed renamable $r3 :: (load (s32))
+  ; 32BIT-NEXT:   renamable $r10 = RLWINM killed renamable $r10, 8, 16, 23
+  ; 32BIT-NEXT:   renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r11, 16, 0, 15
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_31Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def dead $f1
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_31Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LDtoc @gS31, $x2 :: (load (s64) from got)
+  ; 64BIT-NEXT:   renamable $x7 = LHZ8 28, renamable $x3 :: (load (s16))
+  ; 64BIT-NEXT:   renamable $x6 = LBZ8 30, renamable $x3 :: (load (s8))
+  ; 64BIT-NEXT:   renamable $x8 = LWZ8 24, renamable $x3 :: (load (s32))
+  ; 64BIT-NEXT:   renamable $x5 = LD 16, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x4 = LD 8, renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x6 = RLWINM8 killed renamable $x6, 8, 16, 23
+  ; 64BIT-NEXT:   renamable $x3 = LD 0, killed renamable $x3 :: (load (s64))
+  ; 64BIT-NEXT:   renamable $x6 = RLWIMI8 killed renamable $x6, killed renamable $x7, 16, 0, 15
+  ; 64BIT-NEXT:   renamable $x6 = RLDIMI killed renamable $x6, killed renamable $x8, 32, 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_31Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1, implicit-def dead $f1
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm
+entry:
+  %call = call double @test_byval_31Byte(ptr byval(%struct.S31) align 1 @gS31)
+  ret void
+}
+
+define double @test_byval_31Byte(ptr byval(%struct.S31) align 1 %s) {
+  ; 32BIT-LABEL: name: test_byval_31Byte
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
+  ; 32BIT-NEXT: {{  $}}
+  ; 32BIT-NEXT:   STW killed renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20)
+  ; 32BIT-NEXT:   STW killed renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+  ; 32BIT-NEXT:   STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12)
+  ; 32BIT-NEXT:   renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64) from %ir.gep)
+  ; 32BIT-NEXT:   STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24, align 8)
+  ; 32BIT-NEXT:   STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28)
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $f1
+  ;
+  ; 64BIT-LABEL: name: test_byval_31Byte
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
+  ; 64BIT-NEXT: {{  $}}
+  ; 64BIT-NEXT:   STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+  ; 64BIT-NEXT:   renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64) from %ir.gep, align 16)
+  ; 64BIT-NEXT:   STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
+  ; 64BIT-NEXT:   STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24)
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $f1
+entry:
+  %gep = getelementptr inbounds %struct.S31, ptr %s, i32 0, i32 3
+  %load = load double, ptr %gep, align 1
+  ret double %load
+}
+
+%struct.F = type { float, float, float }
+
+define i32 @call_test_byval_homogeneous_float_struct() {
+  ; 32BIT-LABEL: name: call_test_byval_homogeneous_float_struct
+  ; 32BIT: bb.0.entry:
+  ; 32BIT-NEXT:   renamable $r3 = LI 0
+  ; 32BIT-NEXT:   STW renamable $r3, 8, %stack.0.s :: (store (s32) into %ir.s + 8, align 8)
+  ; 32BIT-NEXT:   STW renamable $r3, 4, %stack.0.s :: (store (s32) into %ir.s + 4, basealign 8)
+  ; 32BIT-NEXT:   STW killed renamable $r3, 0, %stack.0.s :: (store (s32) into %ir.s, align 8)
+  ; 32BIT-NEXT:   ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
+  ; 32BIT-NEXT:   renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4)
+  ; 32BIT-NEXT:   $r3 = LI 0
+  ; 32BIT-NEXT:   BL_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3
+  ; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
+  ; 32BIT-NEXT:   BLR implicit $lr, implicit $rm, implicit $r3
+  ;
+  ; 64BIT-LABEL: name: call_test_byval_homogeneous_float_struct
+  ; 64BIT: bb.0.entry:
+  ; 64BIT-NEXT:   renamable $x3 = LI8 0
+  ; 64BIT-NEXT:   STW8 renamable $x3, 8, %stack.0.s :: (store (s32) into %ir.s + 8, align 8)
+  ; 64BIT-NEXT:   STD killed renamable $x3, 0, %stack.0.s :: (store (s64) into %ir.s)
+  ; 64BIT-NEXT:   ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
+  ; 64BIT-NEXT:   renamable $x4 = RLDICR killed renamable $x3, 32, 31
+  ; 64BIT-NEXT:   $x3 = LI8 0
+  ; 64BIT-NEXT:   BL8_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3
+  ; 64BIT-NEXT:   ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+  ; 64BIT-NEXT:   BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+  %s = alloca %struct.F, align 8
+  call void @llvm.memset.p0.i32(ptr align 4 %s, i8 0, i32 12, i1 false)
+  %call = call i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4 %s)
+  ret i32 %call
+}
+
+declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg)
+
+declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4)
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll
index 5e7a1bc81916e..a06b61fc45334 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll
@@ -1,18 +1,11 @@
-; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \
-; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
-; RUN: FileCheck --check-prefixes=CHECK,32BIT %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \
 ; RUN:  -mtriple powerpc-ibm-aix-xcoff < %s | \
-; RUN: FileCheck --check-prefixes=CHECKASM,ASM32 %s
-
-; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \
-; RUN:  -mattr=-altivec -verify-machineinstrs < %s | \
-; RUN: FileCheck --check-prefixes=CHECK,64BIT %s
+; RUN: FileCheck --check-prefixes=32BIT %s
 
 ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \
 ; RUN:  -mtriple powerpc64-ibm-aix-xcoff < %s | \
-; RUN: FileCheck --check-prefixes=CHECKASM,ASM64 %s
+; RUN: FileCheck --check-prefixes=64BIT %s
 
 %struct.S0 = type {}
 
@@ -20,96 +13,60 @@
 @gS1 = external global %struct.S1, align 1
 
 define void @call_test_byval_1Byte() {
+; 32BIT-LABEL: call_test_byval_1Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C0(2) # @gS1
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lbz 3, 0(3)
+; 32BIT-NEXT:    slwi 3, 3, 24
+; 32BIT-NEXT:    bl .test_byval_1Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_1Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -128(1)
+; 64BIT-NEXT:    ld 3, L..C0(2) # @gS1
+; 64BIT-NEXT:    std 0, 144(1)
+; 64BIT-NEXT:    lbz 3, 0(3)
+; 64BIT-NEXT:    sldi 3, 3, 56
+; 64BIT-NEXT:    bl .test_byval_1Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 128
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %s0 = alloca %struct.S0, align 8
   %call = call zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 @gS1)
   ret void
 }
 
-
-; CHECK-LABEL: name: call_test_byval_1Byte{{.*}}
-
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REG:[0-9]+]] = LWZtoc @gS1, $r2 :: (load (s32) from got)
-; 32BIT-NEXT:  renamable $r3 = LBZ 0, killed renamable $r[[REG]] :: (load (s8))
-; 32BIT-NEXT:  renamable $r3 = RLWINM killed renamable $r3, 24, 0, 7
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_1Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_1Byte:
-
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-NEXT:  stw 0, 72(1)
-; ASM32-NEXT:  lbz 3, 0([[REG]])
-; ASM32-NEXT:  slwi 3, 3, 24
-; ASM32-NEXT:  bl .test_byval_1Byte
-; ASM32-NEXT:  nop
-; ASM32-NEXT:  addi 1, 1, 64
-
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REG:[0-9]+]] = LDtoc @gS1, $x2 :: (load (s64) from got)
-; 64BIT-NEXT:  renamable $x3 = LBZ8 0, killed renamable $x[[REG]] :: (load (s8))
-; 64BIT-NEXT:  renamable $x3 = RLDICR killed renamable $x3, 56, 7
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_1Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -128(1)
-; ASM64-NEXT:  ld [[REG:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-NEXT:  std 0, 144(1)
-; ASM64-NEXT:  lbz 3, 0([[REG]])
-; ASM64-NEXT:  sldi 3, 3, 56
-; ASM64-NEXT:  bl .test_byval_1Byte
-; ASM64-NEXT:  nop
-; ASM64-NEXT:  addi 1, 1, 128
-
-
 define zeroext i8 @test_byval_1Byte(ptr byval(%struct.S0) align 1 %s0, ptr byval(%struct.S1) align 1 %s) {
+; 32BIT-LABEL: test_byval_1Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mr 4, 3
+; 32BIT-NEXT:    srwi 3, 3, 24
+; 32BIT-NEXT:    stw 4, 24(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_1Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mr 4, 3
+; 64BIT-NEXT:    rldicl 3, 3, 8, 56
+; 64BIT-NEXT:    std 4, 48(1)
+; 64BIT-NEXT:    blr
 entry:
   %0 = load i8, ptr %s, align 1
   ret i8 %0
 }
 
-; CHECK-LABEL: name:            test_byval_1Byte
-
-; 32BIT:       fixedStack:
-; 32BIT-NEXT:    - { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
-; 32BIT-NEXT:        isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 32BIT:         - { id: 1, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
-; 32BIT-NEXT:        isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 32BIT:       bb.0.entry:
-; 32BIT-NEXT:    liveins: $r3
-; 32BIT:         renamable $r4 = COPY $r3
-; 32BIT:         renamable $r3 = RLWINM $r3, 8, 24, 31
-; 32BIT:         STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 8)
-; 32BIT-NEXT:    BLR
-
-; 64BIT:       fixedStack:
-; 64BIT-NEXT:    - { id: 0, type: default, offset: 48, size: 8, alignment: 16, stack-id: default,
-; 64BIT-NEXT:        isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 64BIT:         - { id: 1, type: default, offset: 48, size: 8, alignment: 16, stack-id: default,
-; 64BIT-NEXT:        isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x3
-; 64BIT:        renamable $x4 = COPY $x3
-; 64BIT:        renamable $x3 = RLDICL $x3, 8, 56
-; 64BIT:        STD killed renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
-
-; CHECKASM-LABEL: .test_byval_1Byte:
-
-; ASM32:      mr 4, 3
-; ASM32-NEXT: srwi 3, 3, 24
-; ASM32-NEXT: stw 4, 24(1)
-; ASM32-NEXT: blr
-
-; ASM64:      mr 4, 3
-; ASM64-NEXT: rldicl 3, 3, 8, 56
-; ASM64-NEXT: std 4, 48(1)
-; ASM64-NEXT: blr
-
-
 @f = common global float 0.000000e+00, align 4
 
 %struct.S2 = type { [2 x i8] }
@@ -117,240 +74,184 @@ entry:
 @gS2 = external global %struct.S2, align 1
 
 define void @call_test_byval_2Byte() {
+; 32BIT-LABEL: call_test_byval_2Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C1(2) # @f
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    li 7, 43
+; 32BIT-NEXT:    lfs 1, 0(3)
+; 32BIT-NEXT:    lwz 3, L..C2(2) # @gS2
+; 32BIT-NEXT:    lhz 3, 0(3)
+; 32BIT-NEXT:    fmr 2, 1
+; 32BIT-NEXT:    slwi 5, 3, 16
+; 32BIT-NEXT:    li 3, 42
+; 32BIT-NEXT:    bl .test_byval_2Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_2Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C1(2) # @f
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    li 7, 43
+; 64BIT-NEXT:    lfs 1, 0(3)
+; 64BIT-NEXT:    ld 3, L..C2(2) # @gS2
+; 64BIT-NEXT:    lhz 3, 0(3)
+; 64BIT-NEXT:    fmr 2, 1
+; 64BIT-NEXT:    sldi 5, 3, 48
+; 64BIT-NEXT:    li 3, 42
+; 64BIT-NEXT:    bl .test_byval_2Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %0 = load float, ptr @f, align 4
   %call = call zeroext i8 @test_byval_2Byte(i32 signext 42, float %0, ptr byval(%struct.S2) align 1 @gS2, float %0, i32 signext 43)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_byval_2Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       renamable $r[[REG1:[0-9]+]] = LWZtoc @f, $r2 :: (load (s32) from got)
-; 32BIT-NEXT:  renamable $f1 = LFS 0, killed renamable $r[[REG1]] :: (dereferenceable load (s32) from @f)
-; 32BIT-NEXT:  ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 42
-; 32BIT-DAG:   renamable $r[[REG2:[0-9]+]] = LWZtoc @gS2, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REG3:[0-9]+]] = LHZ 0, killed renamable $r[[REG2]] :: (load (s16))
-; 32BIT-DAG:   renamable $r5 = RLWINM killed renamable $r[[REG3]], 16, 0, 15
-; 32BIT-DAG:   $f2 = COPY renamable $f1
-; 32BIT-DAG:   $r7 = LI 43
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_2Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $f1, implicit $r5, implicit killed $f2, implicit killed $r7, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_2Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-DAG:   li 3, 42
-; ASM32-DAG:   lwz [[REG1:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lfs 1, 0([[REG1]])
-; ASM32-DAG:   lwz [[REG2:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lhz [[REG3:[0-9]+]], 0([[REG2]])
-; ASM32-DAG:   slwi 5, [[REG3]], 16
-; ASM32-DAG:   fmr 2, 1
-; ASM32-DAG:   li 7, 43
-; ASM32-NEXT:  bl .test_byval_2Byte
-; ASM32-NEXT:  nop
-; ASM32-NEXT:  addi 1, 1, 64
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       renamable $x[[REG1:[0-9]+]] = LDtoc @f, $x2 :: (load (s64) from got)
-; 64BIT-NEXT:  renamable $f1 = LFS 0, killed renamable $x[[REG1]] :: (dereferenceable load (s32) from @f)
-; 64BIT-NEXT:  ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 42
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LDtoc @gS2, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG3:[0-9]+]] = LHZ8 0, killed renamable $x[[REG2]] :: (load (s16))
-; 64BIT-DAG:   renamable $x5 = RLDICR killed renamable $x[[REG3]], 48, 15
-; 64BIT-DAG:   $f2 = COPY renamable $f1
-; 64BIT-DAG:   $x7 = LI8 43
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_2Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $f1, implicit $x5, implicit killed $f2, implicit killed $x7, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -112(1)
-; ASM64-DAG:   std 0, 128(1)
-; ASM64-DAG:   li 3, 42
-; ASM64-DAG:   ld [[REG1:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lfs 1, 0([[REG1]])
-; ASM64-DAG:   ld [[REG2:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lhz [[REG3:[0-9]+]], 0([[REG2]])
-; ASM64-DAG:   sldi 5, [[REG3]], 48
-; ASM64-DAG:   fmr 2, 1
-; ASM64-DAG:   li 7, 43
-; ASM64-NEXT:  bl .test_byval_2Byte
-; ASM64-NEXT:  nop
-; ASM64-NEXT:  addi 1, 1, 112
-
 define zeroext i8 @test_byval_2Byte(i32, float, ptr byval(%struct.S2) align 1 %s, float, i32) {
+; 32BIT-LABEL: test_byval_2Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 5, 32(1)
+; 32BIT-NEXT:    lbz 3, 33(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_2Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 5, 64(1)
+; 64BIT-NEXT:    lbz 3, 65(1)
+; 64BIT-NEXT:    blr
 entry:
   %arrayidx = getelementptr inbounds %struct.S2, ptr %s, i32 0, i32 0, i32 1
   %4 = load i8, ptr %arrayidx, align 1
   ret i8 %4
 }
 
-; CHECK-LABEL: name:            test_byval_2Byte
-; 32BIT:      fixedStack:
-; 32BIT-NEXT:   - { id: 0, type: default, offset: 32, size: 4, alignment: 16, stack-id: default,
-
-; 32BIT:      bb.0.entry:
-; 32BIT-NEXT:   liveins: $r5
-; 32BIT:        STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
-; 32BIT-NEXT:   renamable $r3 = LBZ 1, %fixed-stack.0 :: (dereferenceable load (s8)
-
-; 64BIT:      fixedStack:
-; 64BIT-NEXT:   - { id: 0, type: default, offset: 64, size: 8, alignment: 16, stack-id: default,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x5
-; 64BIT:        STD killed renamable $x5, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
-; 64BIT-NEXT:   renamable $x3 = LBZ8 1, %fixed-stack.0 :: (dereferenceable load (s8)
-
-; CHECKASM-LABEL: .test_byval_2Byte:
-
-; ASM32:        stw 5, 32(1)
-; ASM32-NEXT:   lbz 3, 33(1)
-; ASM32-NEXT:   blr
-
-; ASM64:        std 5, 64(1)
-; ASM64-NEXT:   lbz 3, 65(1)
-; ASM64-NEXT:   blr
-
-
 %struct.S3 = type <{ i8, i16 }>
 @gS3 = external global %struct.S3, align 1
 
 define void @call_test_byval_3Byte() {
+; 32BIT-LABEL: call_test_byval_3Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 4, L..C3(2) # @gS3
+; 32BIT-NEXT:    li 3, 42
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    li 5, 3
+; 32BIT-NEXT:    li 6, 4
+; 32BIT-NEXT:    li 7, 5
+; 32BIT-NEXT:    stw 3, 56(1)
+; 32BIT-NEXT:    li 8, 6
+; 32BIT-NEXT:    li 9, 7
+; 32BIT-NEXT:    lbz 3, 2(4)
+; 32BIT-NEXT:    lhz 4, 0(4)
+; 32BIT-NEXT:    rlwinm 10, 3, 8, 16, 23
+; 32BIT-NEXT:    li 3, 1
+; 32BIT-NEXT:    rlwimi 10, 4, 16, 0, 15
+; 32BIT-NEXT:    li 4, 2
+; 32BIT-NEXT:    bl .test_byval_3Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_3Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -128(1)
+; 64BIT-NEXT:    ld 4, L..C3(2) # @gS3
+; 64BIT-NEXT:    li 3, 42
+; 64BIT-NEXT:    std 0, 144(1)
+; 64BIT-NEXT:    li 5, 3
+; 64BIT-NEXT:    li 6, 4
+; 64BIT-NEXT:    li 7, 5
+; 64BIT-NEXT:    std 3, 112(1)
+; 64BIT-NEXT:    li 8, 6
+; 64BIT-NEXT:    li 9, 7
+; 64BIT-NEXT:    lbz 3, 2(4)
+; 64BIT-NEXT:    lhz 4, 0(4)
+; 64BIT-NEXT:    rldic 10, 3, 40, 16
+; 64BIT-NEXT:    li 3, 1
+; 64BIT-NEXT:    rldimi 10, 4, 48, 0
+; 64BIT-NEXT:    li 4, 2
+; 64BIT-NEXT:    bl .test_byval_3Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 128
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i16 @test_byval_3Byte(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, ptr byval(%struct.S3) align 1 @gS3, i32 42)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_byval_3Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 60, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   $r3 = LI 1
-; 32BIT-DAG:   $r4 = LI 2
-; 32BIT-DAG:   $r5 = LI 3
-; 32BIT-DAG:   $r6 = LI 4
-; 32BIT-DAG:   $r7 = LI 5
-; 32BIT-DAG:   $r8 = LI 6
-; 32BIT-DAG:   $r9 = LI 7
-; 32BIT-DAG:   renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS3, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REG1:[0-9]+]] = LHZ 0, killed renamable $r[[REGADDR]] :: (load (s16))
-; 32BIT-DAG:   renamable $r[[REG2:[0-9]+]] = LBZ 2, renamable $r[[REGADDR]] :: (load (s8))
-; 32BIT-DAG:   renamable $r10 = RLWINM killed renamable $r[[REG2]], 8, 16, 23
-; 32BIT-DAG:   renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r[[REG1]], 16, 0, 15
-; 32BIT-DAG:   renamable $r[[REG3:[0-9]+]] = LI 42
-; 32BIT-DAG:   STW killed renamable $r[[REG3]], 56, $r1 :: (store (s32))
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_3Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 60, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_3Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-DAG:   li 3, 1
-; ASM32-DAG:   li 4, 2
-; ASM32-DAG:   li 5, 3
-; ASM32-DAG:   li 6, 4
-; ASM32-DAG:   li 7, 5
-; ASM32-DAG:   li 8, 6
-; ASM32-DAG:   li 9, 7
-; ASM32-DAG:   lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lhz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM32-DAG:   lbz [[REG2:[0-9]+]], 2([[REGADDR]])
-; ASM32-DAG:   rlwinm 10, [[REG2]], 8, 16, 23
-; ASM32-DAG:   rlwimi 10, [[REG1]], 16, 0, 15
-; ASM32-DAG:   li [[REG3:[0-9]+]], 42
-; ASM32-DAG:   stw [[REG3]], 56(1)
-; ASM32-NEXT:  bl .test_byval_3Byte
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 120, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-DAG:   $x3 = LI8 1
-; 64BIT-DAG:   $x4 = LI8 2
-; 64BIT-DAG:   $x5 = LI8 3
-; 64BIT-DAG:   $x6 = LI8 4
-; 64BIT-DAG:   $x7 = LI8 5
-; 64BIT-DAG:   $x8 = LI8 6
-; 64BIT-DAG:   $x9 = LI8 7
-; 64BIT-DAG:   renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS3, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LHZ8 0, killed renamable $x[[REGADDR]] :: (load (s16))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LBZ8 2, renamable $x[[REGADDR]] :: (load (s8))
-; 64BIT-DAG:   renamable $x10 = RLDIC killed renamable $x[[REG2]], 40, 16
-; 64BIT-DAG:   renamable $x10 = RLDIMI killed renamable $x10, killed renamable $x[[REG1]], 48, 0
-; 64BIT-DAG:   $x[[REG3:[0-9]+]] = LI8 42
-; 64BIT-DAG:   STD killed renamable $x[[REG3]], 112, $x1 :: (store (s64))
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_3Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit $x10, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 120, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -128(1)
-; ASM64-DAG:   li 3, 1
-; ASM64-DAG:   li 4, 2
-; ASM64-DAG:   li 5, 3
-; ASM64-DAG:   li 6, 4
-; ASM64-DAG:   li 7, 5
-; ASM64-DAG:   li 8, 6
-; ASM64-DAG:   li 9, 7
-; ASM64-DAG:   ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lhz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lbz [[REG2:[0-9]+]], 2([[REGADDR]])
-; ASM64-DAG:   rldic 10, [[REG2]], 40, 16
-; ASM64-DAG:   rldimi 10, [[REG1]], 48, 0
-; ASM64-DAG:   li [[REG3:[0-9]+]], 42
-; ASM64-DAG:   std [[REG3]], 112(1)
-; ASM64-NEXT:  bl .test_byval_3Byte
-; ASM64-NEXT:  nop
-
-
 define zeroext i16 @test_byval_3Byte(i32, i32, i32, i32, i32, i32, i32, ptr byval(%struct.S3) align 1 %s, i32) {
+; 32BIT-LABEL: test_byval_3Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 10, 52(1)
+; 32BIT-NEXT:    lhz 3, 53(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_3Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 10, 104(1)
+; 64BIT-NEXT:    lhz 3, 105(1)
+; 64BIT-NEXT:    blr
 entry:
   %gep = getelementptr inbounds %struct.S3, ptr %s, i32 0, i32 1
   %8 = load i16, ptr %gep, align 1
   ret i16 %8
 }
 
-; CHECK-LABEL: name:            test_byval_3Byte
-
-; 32BIT:       fixedStack:
-; 32BIT-NEXT:    - { id: 0, type: default, offset: 56, size: 4, alignment: 8, stack-id: default,
-; 32BIT:         - { id: 1, type: default, offset: 52, size: 4, alignment: 4, stack-id: default,
-
-; 32BIT-LABEL: bb.0.entry:
-; 32BIT-NEXT:    liveins: $r10
-; 32BIT:         STW killed renamable $r10, 0, %fixed-stack.1 :: (store (s32) into %fixed-stack.1)
-; 32BIT-NEXT:    renamable $r3 = LHZ 1, %fixed-stack.1 :: (dereferenceable load (s16)
-
-; 64BIT:       fixedStack:
-; 64BIT-NEXT:     - { id: 0, type: default, offset: 116, size: 4, alignment: 4, stack-id: default,
-; 64BIT:          - { id: 1, type: default, offset: 104, size: 8, alignment: 8, stack-id: default,
-
-; 64BIT-LABEL: bb.0.entry:
-; 64BIT-NEXT:    liveins: $x10
-; 64BIT:         STD killed renamable $x10, 0, %fixed-stack.1 :: (store (s64) into %fixed-stack.1)
-; 64BIT-NEXT:    renamable $x3 = LHZ8 1, %fixed-stack.1 :: (dereferenceable load (s16)
-
-; CHECKASM-LABEL: .test_byval_3Byte:
-
-; ASM32:        stw 10, 52(1)
-; ASM32-NEXT:   lhz 3, 53(1)
-; ASM32-NEXT:   blr
-
-; ASM64:        std 10, 104(1)
-; ASM64-NEXT:   lhz 3, 105(1)
-; ASM64-NEXT:   blr
-
-
 %struct.S4 = type { [4 x i8] }
 %struct.S4A = type { i32 }
 
 @gS4 = external global %struct.S4, align 1
 
 define void @call_test_byval_4Byte() {
+; 32BIT-LABEL: call_test_byval_4Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -80(1)
+; 32BIT-NEXT:    lwz 3, L..C4(2) # @gS4
+; 32BIT-NEXT:    stw 0, 88(1)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    lwz 4, 64(1)
+; 32BIT-NEXT:    bl .test_byval_4Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 80
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_4Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -128(1)
+; 64BIT-NEXT:    ld 3, L..C4(2) # @gS4
+; 64BIT-NEXT:    std 0, 144(1)
+; 64BIT-NEXT:    lwz 3, 0(3)
+; 64BIT-NEXT:    lwz 4, 112(1)
+; 64BIT-NEXT:    sldi 3, 3, 32
+; 64BIT-NEXT:    sldi 4, 4, 32
+; 64BIT-NEXT:    bl .test_byval_4Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 128
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %s0 = alloca %struct.S0, align 8
   %s4a = alloca %struct.S4A, align 8
@@ -358,46 +259,24 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: name: call_test_byval_4Byte{{.*}}
-
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REG:[0-9]+]] = LWZtoc @gS4, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REG]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = LWZ 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8)
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_4Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3,  implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_4Byte:
-
-; ASM32:       stwu 1, -80(1)
-; ASM32-NEXT:  lwz [[REG:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REG]])
-; ASM32-DAG:   lwz 4, 64(1)
-; ASM32-NEXT:  bl .test_byval_4Byte
-; ASM32-NEXT:  nop
-; ASM32-NEXT:  addi 1, 1, 80
-
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS4, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[LD1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[LD2:[0-9]+]] = LWZ8 0, %stack.1.s4a :: (load (s32) from %stack.1.s4a, align 8)
-; 64BIT-DAG:   renamable $x3 = RLDICR killed renamable $x[[LD1]], 32, 31
-; 64BIT-DAG:   renamable $x4 = RLDICR killed renamable $x[[LD2]], 32, 31
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_4Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3,  implicit $x4, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -128(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lwz [[LD1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lwz [[LD2:[0-9]+]], 112(1)
-; ASM64-DAG:   sldi 3, [[LD1]], 32
-; ASM64-DAG:   sldi 4, [[LD2]], 32
-; ASM64-NEXT:  bl .test_byval_4Byte
-; ASM64-NEXT:  nop
-; ASM64-NEXT:  addi 1, 1, 128
-
-
 define signext i32 @test_byval_4Byte(ptr byval(%struct.S4) align 1 %s, ptr byval(%struct.S0) align 1, ptr byval(%struct.S4A) align 4 %s4a) {
+; 32BIT-LABEL: test_byval_4Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 3, 24(1)
+; 32BIT-NEXT:    clrlwi 3, 3, 24
+; 32BIT-NEXT:    add 3, 4, 3
+; 32BIT-NEXT:    stw 4, 28(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_4Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 3, 48(1)
+; 64BIT-NEXT:    lbz 3, 51(1)
+; 64BIT-NEXT:    std 4, 56(1)
+; 64BIT-NEXT:    rldicl 4, 4, 32, 32
+; 64BIT-NEXT:    add 3, 4, 3
+; 64BIT-NEXT:    extsw 3, 3
+; 64BIT-NEXT:    blr
 entry:
   %arrayidx = getelementptr inbounds %struct.S4, ptr %s, i32 0, i32 0, i32 3
   %1 = load i8, ptr %arrayidx, align 1
@@ -407,64 +286,43 @@ entry:
   ret i32 %add
 }
 
-; CHECK-LABEL: name:            test_byval_4Byte
-
-; 32BIT:      fixedStack:
-; 32BIT-NEXT:   - { id: 0, type: default, offset: 28, size: 4, alignment: 4, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 32BIT:        - { id: 1, type: default, offset: 28, size: 4, alignment: 4, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 32BIT:        - { id: 2, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 32BIT:      bb.0.entry:
-; 32BIT-NEXT:   liveins: $r3
-; 32BIT:        STW renamable $r3, 0, %fixed-stack.2 :: (store (s32) into %fixed-stack.2, align 8)
-; 32BIT-DAG:    STW killed renamable $r4, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0)
-; 32BIT-DAG:    renamable $r[[SCRATCH:[0-9]+]] = RLWINM killed renamable $r3, 0, 24, 31
-; 32BIT-DAG:    renamable $r3 = nsw ADD4 renamable $r4, killed renamable $r[[SCRATCH]]
-; 32BIT:        BLR
-
-; 64BIT:      fixedStack:
-; 64BIT-NEXT: - { id: 0, type: default, offset: 56, size: 8, alignment: 8, stack-id: default,
-; 64BIT-NEXT:     isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 64BIT:      - { id: 1, type: default, offset: 56, size: 8, alignment: 8, stack-id: default,
-; 64BIT-NEXT:     isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-; 64BIT:      - { id: 2, type: default, offset: 48, size: 8, alignment: 16, stack-id: default,
-; 64BIT-NEXT:     isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x3
-; 64BIT:        STD killed renamable $x3, 0, %fixed-stack.2 :: (store (s64) into %fixed-stack.2, align 16)
-; 64BIT:        STD renamable $x4, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0)
-; 64BIT-DAG:    renamable $r[[SCRATCH1:[0-9]+]] = LBZ 3, %fixed-stack.2 :: (dereferenceable load (s8)
-; 64BIT-DAG:    renamable $x[[SCRATCH2:[0-9]+]] = RLDICL killed renamable $x4, 32, 32
-; 64BIT-NEXT:   renamable $r[[SCRATCH3:[0-9]+]] = nsw ADD4 renamable $r[[SCRATCH2]], killed renamable $r[[SCRATCH1]], implicit killed $x[[SCRATCH2]]
-; 64BIT-NEXT:   renamable $x3 = EXTSW_32_64 killed renamable $r[[SCRATCH3]]
-; 64BIT-NEXT:   BLR8
-
-; CHECKASM-LABEL: .test_byval_4Byte:
-
-; ASM32:        stw 3, 24(1)
-; ASM32-DAG:    stw 4, 28(1)
-; ASM32-DAG:    clrlwi  [[SCRATCH:[0-9]+]], 3, 24
-; ASM32-DAG:    add 3, 4, [[SCRATCH]]
-; ASM32-NEXT:   blr
-
-; ASM64:        std 3, 48(1)
-; ASM64-NEXT:   lbz [[SCRATCH1:[0-9]+]], 51(1)
-; ASM64-NEXT:   std 4, 56(1)
-; ASM64-NEXT:   rldicl [[SCRATCH2:[0-9]+]], 4, 32, 32
-; ASM64-NEXT:   add [[SCRATCH3:[0-9]+]], [[SCRATCH2]], [[SCRATCH1]]
-; ASM64-NEXT:   extsw 3, [[SCRATCH3]]
-; ASM64-NEXT:   blr
-
-
 %struct.S5 = type { [5 x i8] }
 
 @gS5 = external global %struct.S5, align 1
 
 define void @call_test_byval_5Byte() {
+; 32BIT-LABEL: call_test_byval_5Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C5(2) # @gS5
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lbz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    slwi 4, 4, 24
+; 32BIT-NEXT:    bl .test_byval_5Byte[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_5Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C5(2) # @gS5
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    lbz 4, 4(3)
+; 64BIT-NEXT:    lwz 5, 0(3)
+; 64BIT-NEXT:    rlwinm 3, 4, 24, 0, 7
+; 64BIT-NEXT:    rldimi 3, 5, 32, 0
+; 64BIT-NEXT:    bl .test_byval_5Byte[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1 @gS5)
   ret void
@@ -472,54 +330,43 @@ entry:
 
 declare zeroext i8 @test_byval_5Byte(ptr byval(%struct.S5) align 1)
 
-; CHECK-LABEL: name: call_test_byval_5Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS5, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REG1:[0-9]+]] = LBZ 4, renamable $r[[REGADDR]] :: (load (s8))
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = RLWINM killed renamable $r[[REG1]], 24, 0, 7
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_5Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:   ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_5Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lbz [[REG1:[0-9]+]], 4([[REGADDR]])
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   slwi 4, [[REG1]], 24
-; ASM32-NEXT:  bl .test_byval_5Byte[PR]
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS5, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LBZ8 4, renamable $x[[REGADDR]] :: (load (s8))
-; 64BIT-DAG:   renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 24, 0, 7
-; 64BIT-DAG:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_5Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lwz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lbz [[REG2:[0-9]+]], 4([[REGADDR]])
-; ASM64-DAG:   rlwinm 3, [[REG2]], 24, 0, 7
-; ASM64-DAG:   rldimi 3, [[REG1]], 32, 0
-; ASM64-NEXT:  bl .test_byval_5Byte[PR]
-; ASM64-NEXT:  nop
-
-
 %struct.S6 = type { [6 x i8] }
 
 @gS6 = external global %struct.S6, align 1
 
 define void @call_test_byval_6Byte() {
+; 32BIT-LABEL: call_test_byval_6Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C6(2) # @gS6
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lhz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    slwi 4, 4, 16
+; 32BIT-NEXT:    bl .test_byval_6Byte[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_6Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C6(2) # @gS6
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    lhz 4, 4(3)
+; 64BIT-NEXT:    lwz 5, 0(3)
+; 64BIT-NEXT:    rlwinm 3, 4, 16, 0, 15
+; 64BIT-NEXT:    rldimi 3, 5, 32, 0
+; 64BIT-NEXT:    bl .test_byval_6Byte[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1 @gS6)
   ret void
@@ -527,54 +374,47 @@ entry:
 
 declare zeroext i8 @test_byval_6Byte(ptr byval(%struct.S6) align 1)
 
-; CHECK-LABEL: name: call_test_byval_6Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS6, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r[[REG1:[0-9]+]] = LHZ 4, renamable $r[[REGADDR]] :: (load (s16))
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = RLWINM killed renamable $r[[REG1]], 16, 0, 15
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_6Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_6Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lhz [[REG1:[0-9]+]], 4([[REGADDR]])
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   slwi 4, [[REG1]], 16
-; ASM32-NEXT:  bl .test_byval_6Byte[PR]
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS6, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load (s16))
-; 64BIT-DAG:   renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 16, 0, 15
-; 64BIT-DAG:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_6Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lwz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lhz [[REG2:[0-9]+]], 4([[REGADDR]])
-; ASM64-DAG:   rlwinm 3, [[REG2]], 16, 0, 15
-; ASM64-DAG:   rldimi 3, [[REG1]], 32, 0
-; ASM64-NEXT:  bl .test_byval_6Byte[PR]
-; ASM64-NEXT:  nop
-
-
 %struct.S7 = type { [7 x i8] }
 
 @gS7 = external global %struct.S7, align 1
 
 define void @call_test_byval_7Byte() {
+; 32BIT-LABEL: call_test_byval_7Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C7(2) # @gS7
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lbz 4, 6(3)
+; 32BIT-NEXT:    lhz 5, 4(3)
+; 32BIT-NEXT:    rlwinm 4, 4, 8, 16, 23
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    rlwimi 4, 5, 16, 0, 15
+; 32BIT-NEXT:    bl .test_byval_7Byte[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_7Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C7(2) # @gS7
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    lbz 5, 6(3)
+; 64BIT-NEXT:    lhz 4, 4(3)
+; 64BIT-NEXT:    lwz 6, 0(3)
+; 64BIT-NEXT:    rlwinm 3, 5, 8, 16, 23
+; 64BIT-NEXT:    rlwimi 3, 4, 16, 0, 15
+; 64BIT-NEXT:    rldimi 3, 6, 32, 0
+; 64BIT-NEXT:    bl .test_byval_7Byte[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1 @gS7)
   ret void
@@ -582,62 +422,39 @@ entry:
 
 declare zeroext i8 @test_byval_7Byte(ptr byval(%struct.S7) align 1)
 
-; CHECK-LABEL: name: call_test_byval_7Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS7, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r[[REG1:[0-9]+]] = LHZ 4, renamable $r[[REGADDR]] :: (load (s16))
-; 32BIT-DAG:   renamable $r[[REG2:[0-9]+]] = LBZ 6, renamable $r[[REGADDR]] :: (load (s8))
-; 32BIT-DAG:   renamable $r4 = RLWINM killed renamable $r[[REG2]], 8, 16, 23
-; 32BIT-DAG:   renamable $r4 = RLWIMI killed renamable $r4, killed renamable $r[[REG1]], 16, 0, 15
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_7Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_7Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   lhz [[REG1:[0-9]+]], 4([[REGADDR]])
-; ASM32-DAG:   lbz [[REG2:[0-9]+]], 6([[REGADDR]])
-; ASM32-DAG:   rlwinm 4, [[REG2]], 8, 16, 23
-; ASM32-DAG:   rlwimi 4, [[REG1]], 16, 0, 15
-; ASM32-NEXT:  bl .test_byval_7Byte[PR]
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS7, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load (s16))
-; 64BIT-DAG:   renamable $x[[REG3:[0-9]+]] = LBZ8 6, renamable $x[[REGADDR]] :: (load (s8))
-; 64BIT-DAG:   renamable $x3 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23
-; 64BIT-DAG:   renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x[[REG2]], 16, 0, 15
-; 64BIT-DAG:   renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_7Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   lwz [[REG1:[0-9]+]], 0([[REGADDR]])
-; ASM64-DAG:   lhz [[REG2:[0-9]+]], 4([[REGADDR]])
-; ASM64-DAG:   lbz [[REG3:[0-9]+]], 6([[REGADDR]])
-; ASM64-DAG:   rlwinm 3, [[REG3]], 8, 16, 23
-; ASM64-DAG:   rlwimi 3, [[REG2]], 16, 0, 15
-; ASM64-DAG:   rldimi 3, [[REG1]], 32, 0
-; ASM64-NEXT:  bl .test_byval_7Byte[PR]
-; ASM64-NEXT:  nop
-
-
 %struct.S8 = type { [8 x i8] }
 
 @gS8 = external global %struct.S8, align 1
 
 define void @call_test_byval_8Byte() {
+; 32BIT-LABEL: call_test_byval_8Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C8(2) # @gS8
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lwz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    bl .test_byval_8Byte[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_8Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C8(2) # @gS8
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    ld 3, 0(3)
+; 64BIT-NEXT:    bl .test_byval_8Byte[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1 @gS8)
   ret void
@@ -645,102 +462,75 @@ entry:
 
 declare zeroext i8 @test_byval_8Byte(ptr byval(%struct.S8) align 1)
 
-; CHECK-LABEL: name: call_test_byval_8Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS8, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_8Byte[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_8Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   lwz 4, 4([[REGADDR]])
-; ASM32-NEXT:  bl .test_byval_8Byte[PR]
-; ASM32-NEXT:  nop
-
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS8, $x2 :: (load (s64) from got)
-; 64BIT-NEXT:  renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_8Byte[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-NEXT:  std 0, 128(1)
-; ASM64-NEXT:  ld 3, 0([[REGADDR]])
-; ASM64-NEXT:  bl .test_byval_8Byte[PR]
-; ASM64-NEXT:  nop
-
-
 %struct.S32 = type { [32 x i8] }
 
 @gS32 = external global %struct.S32, align 1
 
 define void @call_test_byval_32Byte() {
+; 32BIT-LABEL: call_test_byval_32Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C9(2) # @gS32
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lwz 10, 28(3)
+; 32BIT-NEXT:    lwz 9, 24(3)
+; 32BIT-NEXT:    lwz 8, 20(3)
+; 32BIT-NEXT:    lwz 7, 16(3)
+; 32BIT-NEXT:    lwz 6, 12(3)
+; 32BIT-NEXT:    lwz 5, 8(3)
+; 32BIT-NEXT:    lwz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    bl .test_byval_32Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_32Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C9(2) # @gS32
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    ld 6, 24(3)
+; 64BIT-NEXT:    ld 5, 16(3)
+; 64BIT-NEXT:    ld 4, 8(3)
+; 64BIT-NEXT:    ld 3, 0(3)
+; 64BIT-NEXT:    bl .test_byval_32Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 @gS32)
   ret void
 }
 
-; CHECK-LABEL: name: call_test_byval_32Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS32, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r5 = LWZ 8, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r6 = LWZ 12, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r7 = LWZ 16, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r8 = LWZ 20, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r9 = LWZ 24, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r10 = LWZ 28, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_32Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_32Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   lwz 4, 4([[REGADDR]])
-; ASM32-DAG:   lwz 5, 8([[REGADDR]])
-; ASM32-DAG:   lwz 6, 12([[REGADDR]])
-; ASM32-DAG:   lwz 7, 16([[REGADDR]])
-; ASM32-DAG:   lwz 8, 20([[REGADDR]])
-; ASM32-DAG:   lwz 9, 24([[REGADDR]])
-; ASM32-DAG:   lwz 10, 28([[REGADDR]])
-; ASM32-NEXT:  bl .test_byval_32Byte
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS32, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x4 = LD 8, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x5 = LD 16, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x6 = LD 24, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_32Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   ld 3, 0([[REGADDR]])
-; ASM64-DAG:   ld 4, 8([[REGADDR]])
-; ASM64-DAG:   ld 5, 16([[REGADDR]])
-; ASM64-DAG:   ld 6, 24([[REGADDR]])
-; ASM64-NEXT:  bl .test_byval_32Byte
-; ASM64-NEXT:  nop
-
 define zeroext i8 @test_byval_32Byte(ptr byval(%struct.S32) align 1 %s) {
+; 32BIT-LABEL: test_byval_32Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 8, 44(1)
+; 32BIT-NEXT:    stw 3, 24(1)
+; 32BIT-NEXT:    lbz 3, 45(1)
+; 32BIT-NEXT:    stw 4, 28(1)
+; 32BIT-NEXT:    stw 5, 32(1)
+; 32BIT-NEXT:    stw 6, 36(1)
+; 32BIT-NEXT:    stw 7, 40(1)
+; 32BIT-NEXT:    stw 9, 48(1)
+; 32BIT-NEXT:    stw 10, 52(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_32Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 5, 64(1)
+; 64BIT-NEXT:    std 3, 48(1)
+; 64BIT-NEXT:    lbz 3, 69(1)
+; 64BIT-NEXT:    std 4, 56(1)
+; 64BIT-NEXT:    std 6, 72(1)
+; 64BIT-NEXT:    blr
 entry:
   %arrayidx = getelementptr inbounds %struct.S32, ptr %s, i32 0, i32 0, i32 21
   %0 = load i8, ptr %arrayidx, align 1
@@ -750,200 +540,127 @@ entry:
 ; The ByVal handling produces dead stores. See `LowerFormalArguments_AIX` for
 ; details on why.
 
-; CHECK-LABEL: name:            test_byval_32Byte
-
-; 32BIT:      fixedStack:
-; 32BIT-NEXT:   - { id: 0, type: default, offset: 24, size: 32, alignment: 8, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 32BIT:      bb.0.entry:
-; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-; 32BIT:        STW killed renamable $r8,  20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20
-; 32BIT-DAG:    STW killed renamable $r3,   0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0
-; 32BIT-DAG:    STW killed renamable $r4,   4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4
-; 32BIT-DAG:    STW killed renamable $r5,   8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8
-; 32BIT-DAG:    STW killed renamable $r6,  12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12
-; 32BIT-DAG:    STW killed renamable $r7,  16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16
-; 32BIT:        renamable $r3 = LBZ 21, %fixed-stack.0 :: (dereferenceable load (s8)
-; 32BIT-DAG:    STW killed renamable $r9,  24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24
-; 32BIT-DAG:    STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28
-; 32BIT:        BLR
-
-; 64BIT:      fixedStack:
-; 64BIT-NEXT:   - { id: 0, type: default, offset: 48, size: 32, alignment: 16, stack-id: default,
-; 64BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
-; 64BIT:        STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16
-; 64BIT-DAG:    STD killed renamable $x3, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0
-; 64BIT-NEXT:   renamable $x3 = LBZ8 21, %fixed-stack.0 :: (dereferenceable load (s8)
-; 64BIT-DAG:    STD killed renamable $x4, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8
-; 64BIT-DAG:    STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24
-; 64BIT-NEXT:   BLR8
-
-; CHECKASM-LABEL: .test_byval_32Byte:
-
-; ASM32:       stw 8, 44(1)
-; ASM32:       stw 3, 24(1)
-; ASM32-DAG:   lbz 3, 45(1)
-; ASM32-DAG:   stw 4, 28(1)
-; ASM32-DAG:   stw 5, 32(1)
-; ASM32-DAG:   stw 6, 36(1)
-; ASM32-DAG:   stw 7, 40(1)
-; ASM32-DAG:   stw 9, 48(1)
-; ASM32-DAG:   stw 10, 52(1)
-; ASM32-NEXT:  blr
-
-; ASM64:       std 5, 64(1)
-; ASM64:       std 3, 48(1)
-; ASM64-DAG:   lbz 3, 69(1)
-; ASM64-DAG:   std 4, 56(1)
-; ASM64-DAG:   std 6, 72(1)
-; ASM64-NEXT:  blr
-
 %struct.S31 = type <{ float, i32, i64, double, i32, i16, i8 }>
 
 @gS31 = external global %struct.S31, align 1
 
 define void @call_test_byval_31Byte() {
+; 32BIT-LABEL: call_test_byval_31Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -64(1)
+; 32BIT-NEXT:    lwz 3, L..C10(2) # @gS31
+; 32BIT-NEXT:    stw 0, 72(1)
+; 32BIT-NEXT:    lbz 10, 30(3)
+; 32BIT-NEXT:    lhz 11, 28(3)
+; 32BIT-NEXT:    rlwinm 10, 10, 8, 16, 23
+; 32BIT-NEXT:    lwz 9, 24(3)
+; 32BIT-NEXT:    rlwimi 10, 11, 16, 0, 15
+; 32BIT-NEXT:    lwz 8, 20(3)
+; 32BIT-NEXT:    lwz 7, 16(3)
+; 32BIT-NEXT:    lwz 6, 12(3)
+; 32BIT-NEXT:    lwz 5, 8(3)
+; 32BIT-NEXT:    lwz 4, 4(3)
+; 32BIT-NEXT:    lwz 3, 0(3)
+; 32BIT-NEXT:    bl .test_byval_31Byte
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 64
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_31Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -112(1)
+; 64BIT-NEXT:    ld 3, L..C10(2) # @gS31
+; 64BIT-NEXT:    std 0, 128(1)
+; 64BIT-NEXT:    lbz 6, 30(3)
+; 64BIT-NEXT:    lhz 7, 28(3)
+; 64BIT-NEXT:    rlwinm 6, 6, 8, 16, 23
+; 64BIT-NEXT:    lwz 8, 24(3)
+; 64BIT-NEXT:    rlwimi 6, 7, 16, 0, 15
+; 64BIT-NEXT:    ld 5, 16(3)
+; 64BIT-NEXT:    rldimi 6, 8, 32, 0
+; 64BIT-NEXT:    ld 4, 8(3)
+; 64BIT-NEXT:    ld 3, 0(3)
+; 64BIT-NEXT:    bl .test_byval_31Byte
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 112
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %call = call double @test_byval_31Byte(ptr byval(%struct.S31) align 1 @gS31)
   ret void
 }
 
-
-; CHECK-LABEL: name: call_test_byval_31Byte{{.*}}
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-NEXT:  renamable $r[[REGADDR:[0-9]+]] = LWZtoc @gS31, $r2 :: (load (s32) from got)
-; 32BIT-DAG:   renamable $r3 = LWZ 0, killed renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r4 = LWZ 4, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r5 = LWZ 8, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r6 = LWZ 12, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r7 = LWZ 16, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r8 = LWZ 20, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r9 = LWZ 24, renamable $r[[REGADDR]] :: (load (s32))
-; 32BIT-DAG:   renamable $r[[REG:[0-9]+]] = LHZ 28, renamable $r[[REGADDR]] :: (load (s16))
-; 32BIT-DAG:   renamable $r10 = LBZ 30, renamable $r[[REGADDR]] :: (load (s8))
-; 32BIT-DAG:   renamable $r10 = RLWINM killed renamable $r10, 8, 16, 23
-; 32BIT-DAG:   renamable $r10 = RLWIMI killed renamable $r10, killed renamable $r[[REG]], 16, 0, 15
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_31Byte>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_31Byte:
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM32:       stwu 1, -64(1)
-; ASM32-NEXT:  lwz [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM32-DAG:   lwz 3, 0([[REGADDR]])
-; ASM32-DAG:   lwz 4, 4([[REGADDR]])
-; ASM32-DAG:   lwz 5, 8([[REGADDR]])
-; ASM32-DAG:   lwz 6, 12([[REGADDR]])
-; ASM32-DAG:   lwz 7, 16([[REGADDR]])
-; ASM32-DAG:   lwz 8, 20([[REGADDR]])
-; ASM32-DAG:   lwz 9, 24([[REGADDR]])
-; ASM32-DAG:   lbz 10, 30([[REGADDR]])
-; ASM32-DAG:   lhz [[REG:[0-9]+]], 28([[REGADDR]])
-; ASM32-DAG:   rlwinm 10, 10, 8, 16, 23
-; ASM32-DAG:   rlwimi 10, [[REG]], 16, 0, 15
-; ASM32-NEXT:  bl .test_byval_31Byte
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT-NEXT:  renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS31, $x2 :: (load (s64) from got)
-; 64BIT-DAG:   renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x4 = LD 8, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x5 = LD 16, renamable $x[[REGADDR]] :: (load (s64))
-; 64BIT-DAG:   renamable $x[[REG1:[0-9]+]] = LWZ8 24, renamable $x[[REGADDR]] :: (load (s32))
-; 64BIT-DAG:   renamable $x[[REG2:[0-9]+]] = LHZ8 28, renamable $x[[REGADDR]] :: (load (s16))
-; 64BIT-DAG:   renamable $x[[REG3:[0-9]+]] = LBZ8 30, renamable $x[[REGADDR]] :: (load (s8))
-; 64BIT-DAG:   renamable $x6 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23
-; 64BIT-DAG:   renamable $x6 = RLWIMI8 killed renamable $x6, killed renamable $x[[REG2]], 16, 0, 15
-; 64BIT-DAG:   renamable $x6 = RLDIMI killed renamable $x6, killed renamable $x[[REG1]], 32, 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_31Byte>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x2, implicit-def $r1
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; ASM64:       stdu 1, -112(1)
-; ASM64-NEXT:  ld [[REGADDR:[0-9]+]], L..C{{[0-9]+}}(2)
-; ASM64-DAG:   ld 3, 0([[REGADDR]])
-; ASM64-DAG:   ld 4, 8([[REGADDR]])
-; ASM64-DAG:   ld 5, 16([[REGADDR]])
-; ASM64-DAG:   lwz [[REG1:[0-9]+]], 24([[REGADDR]])
-; ASM64-DAG:   lhz [[REG2:[0-9]+]], 28([[REGADDR]])
-; ASM64-DAG:   lbz [[REG3:[0-9]+]], 30([[REGADDR]])
-; ASM64-DAG:   rlwinm 6, [[REG3]], 8, 16, 23
-; ASM64-DAG:   rlwimi 6, [[REG2]], 16, 0, 15
-; ASM64-DAG:   rldimi 6, [[REG1]], 32, 0
-; ASM64-NEXT:  bl .test_byval_31Byte
-; ASM64-NEXT:  nop
-
-
-
 define double @test_byval_31Byte(ptr byval(%struct.S31) align 1 %s) {
+; 32BIT-LABEL: test_byval_31Byte:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    stw 8, 44(1)
+; 32BIT-NEXT:    stw 7, 40(1)
+; 32BIT-NEXT:    lfd 1, 40(1)
+; 32BIT-NEXT:    stw 3, 24(1)
+; 32BIT-NEXT:    stw 4, 28(1)
+; 32BIT-NEXT:    stw 5, 32(1)
+; 32BIT-NEXT:    stw 6, 36(1)
+; 32BIT-NEXT:    stw 9, 48(1)
+; 32BIT-NEXT:    stw 10, 52(1)
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: test_byval_31Byte:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    std 5, 64(1)
+; 64BIT-NEXT:    lfd 1, 64(1)
+; 64BIT-NEXT:    std 3, 48(1)
+; 64BIT-NEXT:    std 4, 56(1)
+; 64BIT-NEXT:    std 6, 72(1)
+; 64BIT-NEXT:    blr
 entry:
   %gep = getelementptr inbounds %struct.S31, ptr %s, i32 0, i32 3
   %load = load double, ptr %gep, align 1
   ret double %load
 }
 
-; CHECK-LABEL: name:            test_byval_31Byte
-
-; 32BIT:      fixedStack:
-; 32BIT-NEXT:   - { id: 0, type: default, offset: 24, size: 32, alignment: 8, stack-id: default,
-; 32BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 32BIT:      bb.0.entry:
-; 32BIT-NEXT:   liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10
-; 32BIT-DAG:    STW killed renamable $r3,   0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0
-; 32BIT-DAG:    STW killed renamable $r4,   4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4
-; 32BIT-DAG:    STW killed renamable $r5,   8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8
-; 32BIT-DAG:    STW killed renamable $r6,  12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12
-; 32BIT-DAG:    STW killed renamable $r7,  16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16
-; 32BIT-DAG:    STW killed renamable $r8,  20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20
-; 32BIT-NEXT:   renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64)
-; 32BIT-DAG:    STW killed renamable $r9,  24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24
-; 32BIT-DAG:    STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28
-; 32BIT-NEXT:   BLR
-
-; 64BIT:      fixedStack:
-; 64BIT-NEXT:   - { id: 0, type: default, offset: 48, size: 32, alignment: 16, stack-id: default,
-; 64BIT-NEXT:       isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
-
-; 64BIT:      bb.0.entry:
-; 64BIT-NEXT:   liveins: $x3, $x4, $x5, $x6
-; 64BIT-DAG:    STD killed renamable $x3,  0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0
-; 64BIT-DAG:    STD killed renamable $x5, 16, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 16
-; 64BIT-NEXT:   renamable $f1 = LFD 16, %fixed-stack.0 :: (dereferenceable load (s64)
-; 64BIT-DAG:    STD killed renamable $x4,  8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8
-; 64BIT-DAG:    STD killed renamable $x6, 24, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 24
-; 64BIT-NEXT:   BLR8
-
-; ASM32-LABEL: .test_byval_31Byte:
-
-; ASM32-DAG:      stw 8, 44(1)
-; ASM32:          stw 7, 40(1)
-; ASM32-DAG:      lfd 1, 40(1)
-; ASM32-DAG:      stw 3, 24(1)
-; ASM32-DAG:      stw 4, 28(1)
-; ASM32-DAG:      stw 5, 32(1)
-; ASM32-DAG:      stw 6, 36(1)
-; ASM32-DAG:      stw 9, 48(1)
-; ASM32-DAG:      stw 10, 52(1)
-; ASM32-NEXT:     blr
-
-; ASM64:          std 5, 64(1)
-; ASM64:          lfd 1, 64(1)
-; ASM64-DAG:      std 3, 48(1)
-; ASM64-DAG:      std 4, 56(1)
-; ASM64-DAG:      std 6, 72(1)
-; ASM64-NEXT:     blr
-
 %struct.F = type { float, float, float }
 
 define i32 @call_test_byval_homogeneous_float_struct() {
+; 32BIT-LABEL: call_test_byval_homogeneous_float_struct:
+; 32BIT:       # %bb.0: # %entry
+; 32BIT-NEXT:    mflr 0
+; 32BIT-NEXT:    stwu 1, -80(1)
+; 32BIT-NEXT:    li 3, 0
+; 32BIT-NEXT:    stw 0, 88(1)
+; 32BIT-NEXT:    stw 3, 72(1)
+; 32BIT-NEXT:    stw 3, 68(1)
+; 32BIT-NEXT:    lwz 5, 72(1)
+; 32BIT-NEXT:    lwz 4, 68(1)
+; 32BIT-NEXT:    stw 3, 64(1)
+; 32BIT-NEXT:    bl .test_byval_homogeneous_float_struct[PR]
+; 32BIT-NEXT:    nop
+; 32BIT-NEXT:    addi 1, 1, 80
+; 32BIT-NEXT:    lwz 0, 8(1)
+; 32BIT-NEXT:    mtlr 0
+; 32BIT-NEXT:    blr
+;
+; 64BIT-LABEL: call_test_byval_homogeneous_float_struct:
+; 64BIT:       # %bb.0: # %entry
+; 64BIT-NEXT:    mflr 0
+; 64BIT-NEXT:    stdu 1, -128(1)
+; 64BIT-NEXT:    li 3, 0
+; 64BIT-NEXT:    std 0, 144(1)
+; 64BIT-NEXT:    stw 3, 120(1)
+; 64BIT-NEXT:    std 3, 112(1)
+; 64BIT-NEXT:    lwz 3, 120(1)
+; 64BIT-NEXT:    sldi 4, 3, 32
+; 64BIT-NEXT:    li 3, 0
+; 64BIT-NEXT:    bl .test_byval_homogeneous_float_struct[PR]
+; 64BIT-NEXT:    nop
+; 64BIT-NEXT:    addi 1, 1, 128
+; 64BIT-NEXT:    ld 0, 16(1)
+; 64BIT-NEXT:    mtlr 0
+; 64BIT-NEXT:    blr
 entry:
   %s = alloca %struct.F, align 8
   call void @llvm.memset.p0.i32(ptr align 4 %s, i8 0, i32 12, i1 false)
@@ -954,37 +671,3 @@ entry:
 declare void @llvm.memset.p0.i32(ptr nocapture writeonly, i8, i32, i1 immarg)
 
 declare i32 @test_byval_homogeneous_float_struct(ptr byval(%struct.F) align 4)
-
-; CHECK-LABEL: name: call_test_byval_homogeneous_float_struct{{.*}}
-
-; 32BIT:       ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1
-; 32BIT-DAG:   renamable $r4 = LWZ 4, %stack.0.s :: (load (s32) from %stack.0.s + 4)
-; 32BIT-DAG:   renamable $r5 = LWZ 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
-; 32BIT-DAG:   $r3 = LI 0
-; 32BIT-NEXT:  BL_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r5, implicit $r2, implicit-def $r1, implicit-def $r3
-; 32BIT-NEXT:  ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1
-
-; CHECKASM-LABEL: .call_test_byval_homogeneous_float_struct:
-
-; ASM32:       stwu 1, -80(1)
-; ASM32-DAG:   lwz 4, 68(1)
-; ASM32-DAG:   lwz 5, 72(1)
-; ASM32-DAG:   stw 3, 64(1)
-; ASM32-NEXT:  bl .test_byval_homogeneous_float_struct[PR]
-; ASM32-NEXT:  nop
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; 64BIT:       ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
-; 64BIT:       renamable $x3 = LWZ8 8, %stack.0.s :: (load (s32) from %stack.0.s + 8, align 8)
-; 64BIT-NEXT:  renamable $x4 = RLDICR killed renamable $x3, 32, 31
-; 64BIT-NEXT:  $x3 = LI8 0
-; 64BIT-NEXT:  BL8_NOP <mcsymbol .test_byval_homogeneous_float_struct[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1, implicit-def $x3
-; 64BIT-NEXT:  ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
-
-; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings.
-; ASM64:       stdu 1, -128(1)
-; ASM64:       lwz 3, 120(1)
-; ASM64-NEXT:  sldi 4, 3, 32
-; ASM64-NEXT:  li 3, 0
-; ASM64-NEXT:  bl .test_byval_homogeneous_float_struct[PR]
-; ASM64-NEXT:  nop

From 977d8a4bcd83797217433709201922b9deb97ae2 Mon Sep 17 00:00:00 2001
From: Vincent <llvm@viceroygroup.ca>
Date: Tue, 17 Jun 2025 06:20:41 -0700
Subject: [PATCH 730/851] [clang][Sema] Fixed Compound Literal is not Constant
 Expression (#143852)

Added a check for a compound literal hiding inside a function.

fixes #87867
---
 clang/docs/ReleaseNotes.rst      |  2 ++
 clang/include/clang/Sema/Scope.h | 11 +++++++++++
 clang/lib/Sema/SemaExpr.cpp      |  1 +
 clang/test/Sema/gh87867.c        | 33 ++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+)
 create mode 100644 clang/test/Sema/gh87867.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 03641f5d0ea0d..6f28dbd03ca2a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -692,6 +692,8 @@ Bug Fixes in This Version
   ``#include`` directive. (#GH138094)
 - Fixed a crash during constant evaluation involving invalid lambda captures
   (#GH138832)
+- Fixed compound literal is not constant expression inside initializer list
+  (#GH87867)
 - Fixed a crash when instantiating an invalid dependent friend template specialization.
   (#GH139052)
 - Fixed a crash with an invalid member function parameter list with a default
diff --git a/clang/include/clang/Sema/Scope.h b/clang/include/clang/Sema/Scope.h
index ad12a3d73413b..07b9e1bc10f5a 100644
--- a/clang/include/clang/Sema/Scope.h
+++ b/clang/include/clang/Sema/Scope.h
@@ -427,6 +427,17 @@ class Scope {
     return false;
   }
 
+  /// isInObjcMethodScope - Return true if this scope is, or is contained, in an
+  /// C function body.
+  bool isInCFunctionScope() const {
+    for (const Scope *S = this; S; S = S->getParent()) {
+      if (S->isFunctionScope())
+        return true;
+    }
+
+    return false;
+  }
+
   /// isInObjcMethodScope - Return true if this scope is, or is contained in, an
   /// Objective-C method body.  Note that this method is not constant time.
   bool isInObjcMethodScope() const {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 413eff4aa294a..ebc43157d4c2b 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -7176,6 +7176,7 @@ Sema::BuildCompoundLiteralExpr(SourceLocation LParenLoc, TypeSourceInfo *TInfo,
   //   void func(char *para[(int [1]){ 0 }[0]);
   const Scope *S = getCurScope();
   bool IsFileScope = !CurContext->isFunctionOrMethod() &&
+                     !S->isInCFunctionScope() &&
                      (!S || !S->isFunctionPrototypeScope());
 
   // In C, compound literals are l-values for some reason.
diff --git a/clang/test/Sema/gh87867.c b/clang/test/Sema/gh87867.c
new file mode 100644
index 0000000000000..0568c734424ca
--- /dev/null
+++ b/clang/test/Sema/gh87867.c
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c23 %s
+
+// Compound literal doesn't need a constant expression inside a initializer-list if it is already inside a function 
+// see: https://github.com/llvm/llvm-project/issues/87867
+int foo(int *a, int b) {
+    return 0;
+}
+
+int x;
+struct{int t;} a = (struct {
+    typeof(foo(&(struct { int t; }){.t = x}.t, 0)) t; // expected-error {{initializer element is not a compile-time constant}}
+}){0};
+
+void inside_a_func(){
+    int x;
+    (void)(struct {
+        typeof(foo(&(struct { int t; }){.t = x}.t, 0)) t;
+    }){0};
+}
+
+// see: https://github.com/llvm/llvm-project/issues/143613
+#define bitcast(type, value) \
+    (((union{ typeof(value) src; type dst; }){ (value) }).dst)
+
+double placeholder = 10.0;
+double bar = bitcast(double, placeholder);  // expected-error {{initializer element is not a compile-time constant}}
+
+int main(void)
+{
+    int foo = 4;
+    foo = bitcast(int, bitcast(double, foo));
+    return 0;
+}

From 816ab1af0da1dc833f487933e7d6fb470d844001 Mon Sep 17 00:00:00 2001
From: Rahul Joshi <rjoshi@nvidia.com>
Date: Tue, 17 Jun 2025 06:21:21 -0700
Subject: [PATCH 731/851] [NFCI][TableGen][DecoderEmitter] Cull Op handling
 when possible (#142974)

TryDecode/CheckPredicate/SoftFail MCD ops are not used by many targets.
Track the set of opcodes that were emitted and emit code for handling
TryDecode/CheckPredicate/SoftFail ops when decoding only if there were
emitted. This is purely eliminating dead code in the generated
`decodeInstruction` function.

This results in the following reduction in the size of the Disassembler
.so files with a release x86_64 release build on Linux:

```
Target                                                   Old Size        New Size  %  reduction
build/lib/libLLVMAArch64Disassembler.so.21.0git             256656          256656          0.00
build/lib/libLLVMAMDGPUDisassembler.so.21.0git              813000          808168          0.59
build/lib/libLLVMARCDisassembler.so.21.0git                  44816           43536          2.86
build/lib/libLLVMARMDisassembler.so.21.0git                 281744          278808          1.04
build/lib/libLLVMAVRDisassembler.so.21.0git                  36040           34496          4.28
build/lib/libLLVMBPFDisassembler.so.21.0git                  26248           23168         11.73
build/lib/libLLVMCSKYDisassembler.so.21.0git                 55960           53632          4.16
build/lib/libLLVMHexagonDisassembler.so.21.0git             115952          113416          2.19
build/lib/libLLVMLanaiDisassembler.so.21.0git                24360           21008         13.76
build/lib/libLLVMLoongArchDisassembler.so.21.0git            58584           56168          4.12
build/lib/libLLVMM68kDisassembler.so.21.0git                 57264           53880          5.91
build/lib/libLLVMMSP430Disassembler.so.21.0git               28896           28440          1.58
build/lib/libLLVMMipsDisassembler.so.21.0git                123128          120568          2.08
build/lib/libLLVMPowerPCDisassembler.so.21.0git              80656           78096          3.17
build/lib/libLLVMRISCVDisassembler.so.21.0git               154080          150200          2.52
build/lib/libLLVMSparcDisassembler.so.21.0git                42040           39568          5.88
build/lib/libLLVMSystemZDisassembler.so.21.0git              97056           94552          2.58
build/lib/libLLVMVEDisassembler.so.21.0git                   83944           81352          3.09
build/lib/libLLVMWebAssemblyDisassembler.so.21.0git          25280           25280          0.00
build/lib/libLLVMX86Disassembler.so.21.0git                2920624         2920624          0.00
build/lib/libLLVMXCoreDisassembler.so.21.0git                48320           44288          8.34
build/lib/libLLVMXtensaDisassembler.so.21.0git               42248           35840         15.17
```
---
 llvm/utils/TableGen/DecoderEmitter.cpp | 114 +++++++++++++++----------
 1 file changed, 71 insertions(+), 43 deletions(-)

diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 7489d369c9932..37814113b467a 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -222,10 +222,11 @@ class DecoderEmitter {
   DecoderEmitter(const RecordKeeper &R, StringRef PredicateNamespace)
       : RK(R), Target(R), PredicateNamespace(PredicateNamespace) {}
 
-  // Emit the decoder state machine table.
-  void emitTable(formatted_raw_ostream &OS, DecoderTable &Table, indent Indent,
-                 unsigned BitWidth, StringRef Namespace,
-                 const EncodingIDsVec &EncodingIDs) const;
+  // Emit the decoder state machine table. Returns a mask of MCD decoder ops
+  // that were emitted.
+  unsigned emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
+                     indent Indent, unsigned BitWidth, StringRef Namespace,
+                     const EncodingIDsVec &EncodingIDs) const;
   void emitInstrLenTable(formatted_raw_ostream &OS,
                          ArrayRef<unsigned> InstrLen) const;
   void emitPredicateFunction(formatted_raw_ostream &OS,
@@ -826,11 +827,12 @@ unsigned Filter::usefulness() const {
 //                              //
 //////////////////////////////////
 
-// Emit the decoder state machine table.
-void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
-                               indent Indent, unsigned BitWidth,
-                               StringRef Namespace,
-                               const EncodingIDsVec &EncodingIDs) const {
+// Emit the decoder state machine table. Returns a mask of MCD decoder ops
+// that were emitted.
+unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS,
+                                   DecoderTable &Table, indent Indent,
+                                   unsigned BitWidth, StringRef Namespace,
+                                   const EncodingIDsVec &EncodingIDs) const {
   // We'll need to be able to map from a decoded opcode into the corresponding
   // EncodingID for this specific combination of BitWidth and Namespace. This
   // is used below to index into NumberedEncodings.
@@ -884,6 +886,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
       OS << " (Fail)";
   };
 
+  unsigned OpcodeMask = 0;
+
   while (I != E) {
     assert(I < E && "incomplete decode table entry!");
 
@@ -892,6 +896,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
     OS.PadToColumn(12);
 
     const uint8_t DecoderOp = *I++;
+    OpcodeMask |= (1 << DecoderOp);
     switch (DecoderOp) {
     default:
       PrintFatalError("Invalid decode table opcode: " + Twine((int)DecoderOp) +
@@ -1027,6 +1032,8 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
   Indent -= 2;
 
   OS << Indent << "};\n\n";
+
+  return OpcodeMask;
 }
 
 void DecoderEmitter::emitInstrLenTable(formatted_raw_ostream &OS,
@@ -1045,19 +1052,13 @@ void DecoderEmitter::emitPredicateFunction(formatted_raw_ostream &OS,
   OS << Indent << "static bool checkDecoderPredicate(unsigned Idx, "
      << "const FeatureBitset &Bits) {\n";
   Indent += 2;
-  if (!Predicates.empty()) {
-    OS << Indent << "switch (Idx) {\n";
-    OS << Indent << "default: llvm_unreachable(\"Invalid index!\");\n";
-    unsigned Index = 0;
-    for (const auto &Predicate : Predicates) {
-      OS << Indent << "case " << Index++ << ":\n";
-      OS << Indent + 2 << "return (" << Predicate << ");\n";
-    }
-    OS << Indent << "}\n";
-  } else {
-    // No case statement to emit
-    OS << Indent << "llvm_unreachable(\"Invalid index!\");\n";
+  OS << Indent << "switch (Idx) {\n";
+  OS << Indent << "default: llvm_unreachable(\"Invalid index!\");\n";
+  for (const auto &[Index, Predicate] : enumerate(Predicates)) {
+    OS << Indent << "case " << Index << ":\n";
+    OS << Indent + 2 << "return (" << Predicate << ");\n";
   }
+  OS << Indent << "}\n";
   Indent -= 2;
   OS << Indent << "}\n\n";
 }
@@ -2217,8 +2218,15 @@ static void insertBits(InsnType &field, InsnType bits, unsigned startBit,
 
 // emitDecodeInstruction - Emit the templated helper function
 // decodeInstruction().
-static void emitDecodeInstruction(formatted_raw_ostream &OS,
-                                  bool IsVarLenInst) {
+static void emitDecodeInstruction(formatted_raw_ostream &OS, bool IsVarLenInst,
+                                  unsigned OpcodeMask) {
+  const bool HasTryDecode = OpcodeMask & ((1 << MCD::OPC_TryDecode) |
+                                          (1 << MCD::OPC_TryDecodeOrFail));
+  const bool HasCheckPredicate =
+      OpcodeMask &
+      ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail));
+  const bool HasSoftFail = OpcodeMask & (1 << MCD::OPC_SoftFail);
+
   OS << R"(
 static unsigned decodeNumToSkip(const uint8_t *&Ptr) {
   unsigned NumToSkip = *Ptr++;
@@ -2238,9 +2246,11 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
     OS << ",\n                                      "
           "llvm::function_ref<void(APInt &, uint64_t)> makeUp";
   }
-  OS << R"() {
-  const FeatureBitset &Bits = STI.getFeatureBits();
+  OS << ") {\n";
+  if (HasCheckPredicate)
+    OS << "  const FeatureBitset &Bits = STI.getFeatureBits();\n";
 
+  OS << R"(
   const uint8_t *Ptr = DecodeTable;
   uint64_t CurFieldValue = 0;
   DecodeStatus S = MCDisassembler::Success;
@@ -2321,7 +2331,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
         Ptr += NumToSkip;
       }
       break;
-    }
+    })";
+  if (HasCheckPredicate) {
+    OS << R"(
     case MCD::OPC_CheckPredicate:
     case MCD::OPC_CheckPredicateOrFail: {
       bool IsFail = DecoderOp == MCD::OPC_CheckPredicateOrFail;
@@ -2343,7 +2355,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
         Ptr += NumToSkip;
       }
       break;
-    }
+    })";
+  }
+  OS << R"(
     case MCD::OPC_Decode: {
       // Decode the Opcode value.
       unsigned Opc = decodeULEB128AndIncUnsafe(Ptr);
@@ -2364,7 +2378,9 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
                    << ", using decoder " << DecodeIdx << ": "
                    << (S != MCDisassembler::Fail ? "PASS\n" : "FAIL\n"));
       return S;
-    }
+    })";
+  if (HasTryDecode) {
+    OS << R"(
     case MCD::OPC_TryDecode:
     case MCD::OPC_TryDecodeOrFail: {
       bool IsFail = DecoderOp == MCD::OPC_TryDecodeOrFail;
@@ -2399,17 +2415,22 @@ static DecodeStatus decodeInstruction(const uint8_t DecodeTable[], MCInst &MI,
       // set before the decode attempt.
       S = MCDisassembler::Success;
       break;
-    }
-    case MCD::OPC_SoftFail: {
-      // Decode the mask values.
-      uint64_t PositiveMask = decodeULEB128AndIncUnsafe(Ptr);
-      uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr);
-      bool Failed = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
-      if (Failed)
-        S = MCDisassembler::SoftFail;
-      LLVM_DEBUG(dbgs() << Loc << ": OPC_SoftFail: " << (Failed ? "FAIL\n" : "PASS\n"));
-      break;
-    }
+    })";
+  }
+  if (HasSoftFail) {
+    OS << R"(
+      case MCD::OPC_SoftFail: {
+        // Decode the mask values.
+        uint64_t PositiveMask = decodeULEB128AndIncUnsafe(Ptr);
+        uint64_t NegativeMask = decodeULEB128AndIncUnsafe(Ptr);
+        bool Failed = (insn & PositiveMask) != 0 || (~insn & NegativeMask) != 0;
+        if (Failed)
+          S = MCDisassembler::SoftFail;
+        LLVM_DEBUG(dbgs() << Loc << ": OPC_SoftFail: " << (Failed ? "FAIL\n" : "PASS\n"));
+        break;
+  })";
+  }
+  OS << R"(
     case MCD::OPC_Fail: {
       LLVM_DEBUG(dbgs() << Loc << ": OPC_Fail\n");
       return MCDisassembler::Fail;
@@ -2609,6 +2630,7 @@ namespace {
   }
 
   DecoderTableInfo TableInfo;
+  unsigned OpcodeMask = 0;
   for (const auto &Opc : OpcMap) {
     // Emit the decoder for this namespace+width combination.
     ArrayRef<EncodingAndInst> NumberedEncodingsRef(NumberedEncodings.data(),
@@ -2634,8 +2656,8 @@ namespace {
     TableInfo.Table.push_back(MCD::OPC_Fail);
 
     // Print the table to the output stream.
-    emitTable(OS, TableInfo.Table, indent(0), FC.getBitWidth(), Opc.first.first,
-              Opc.second);
+    OpcodeMask |= emitTable(OS, TableInfo.Table, indent(0), FC.getBitWidth(),
+                            Opc.first.first, Opc.second);
   }
 
   // For variable instruction, we emit a instruction length table
@@ -2643,14 +2665,20 @@ namespace {
   // You can see example usage in M68k's disassembler.
   if (IsVarLenInst)
     emitInstrLenTable(OS, InstrLen);
+
+  const bool HasCheckPredicate =
+      OpcodeMask &
+      ((1 << MCD::OPC_CheckPredicate) | (1 << MCD::OPC_CheckPredicateOrFail));
+
   // Emit the predicate function.
-  emitPredicateFunction(OS, TableInfo.Predicates, indent(0));
+  if (HasCheckPredicate)
+    emitPredicateFunction(OS, TableInfo.Predicates, indent(0));
 
   // Emit the decoder function.
   emitDecoderFunction(OS, TableInfo.Decoders, indent(0));
 
   // Emit the main entry point for the decoder, decodeInstruction().
-  emitDecodeInstruction(OS, IsVarLenInst);
+  emitDecodeInstruction(OS, IsVarLenInst, OpcodeMask);
 
   OS << "\n} // namespace\n";
 }

From 9fed480f183d9cfa784228cd77b2c0a642fca697 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Tue, 17 Jun 2025 06:28:27 -0700
Subject: [PATCH 732/851] [BOLT] Explicitly check for returns when extending
 call continuation profile (#143295)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Call continuation logic relies on assumptions about fall-through origin:
- the branch is external to the function,
- fall-through start is at the beginning of the block,
- the block is not an entry point or a landing pad.

Leverage trace information to explicitly check whether the origin is a
return instruction, and defer to checks above only in case of
DSO-external branch source.

This covers both regular and BAT cases, addressing call continuation
fall-through undercounting in the latter mode, which improves BAT
profile quality metrics. For example, for one large binary:
- CFG discontinuity 21.83% -> 0.00%,
- CFG flow imbalance 10.77%/100.00% -> 3.40%/13.82% (weighted/worst)
- CG flow imbalance 8.49% —> 8.49%.

Depends on #143289.

Test Plan: updated callcont-fallthru.s
---
 bolt/include/bolt/Profile/DataAggregator.h | 12 +++-
 bolt/lib/Profile/DataAggregator.cpp        | 71 ++++++++++++---------
 bolt/test/X86/callcont-fallthru.s          | 72 ++++++++++++----------
 3 files changed, 90 insertions(+), 65 deletions(-)

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 10d96fbeca3e2..96969cf53baca 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -132,6 +132,9 @@ class DataAggregator : public DataReader {
   /// and use them later for processing and assigning profile.
   std::unordered_map<Trace, TakenBranchInfo, TraceHash> TraceMap;
   std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
+  /// Pre-populated addresses of returns, coming from pre-aggregated data or
+  /// disassembly. Used to disambiguate call-continuation fall-throughs.
+  std::unordered_set<uint64_t> Returns;
   std::unordered_map<uint64_t, uint64_t> BasicSamples;
   std::vector<PerfMemSample> MemSamples;
 
@@ -204,8 +207,8 @@ class DataAggregator : public DataReader {
   /// Return a vector of offsets corresponding to a trace in a function
   /// if the trace is valid, std::nullopt otherwise.
   std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
-  getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
-                         uint64_t Count) const;
+  getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, uint64_t Count,
+                         bool IsReturn) const;
 
   /// Record external entry into the function \p BF.
   ///
@@ -265,11 +268,14 @@ class DataAggregator : public DataReader {
                      uint64_t From, uint64_t To, uint64_t Count,
                      uint64_t Mispreds);
 
+  /// Checks if \p Addr corresponds to a return instruction.
+  bool checkReturn(uint64_t Addr);
+
   /// Register a \p Branch.
   bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
 
   /// Register a trace between two LBR entries supplied in execution order.
-  bool doTrace(const Trace &Trace, uint64_t Count);
+  bool doTrace(const Trace &Trace, uint64_t Count, bool IsReturn);
 
   /// Parser helpers
   /// Return false if we exhausted our parser buffer and finished parsing
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 118629b04f6fc..178c9d3a63730 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -730,50 +730,54 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
   return true;
 }
 
+bool DataAggregator::checkReturn(uint64_t Addr) {
+  auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
+  if (llvm::is_contained(Returns, Addr))
+    return true;
+
+  BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
+  if (!Func)
+    return false;
+
+  const uint64_t Offset = Addr - Func->getAddress();
+  if (Func->hasInstructions()
+          ? isReturn(Func->getInstructionAtOffset(Offset))
+          : isReturn(Func->disassembleInstructionAtOffset(Offset))) {
+    Returns.emplace(Addr);
+    return true;
+  }
+  return false;
+}
+
 bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
                               uint64_t Mispreds) {
-  // Returns whether \p Offset in \p Func contains a return instruction.
-  auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
-    auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
-    return Func.hasInstructions()
-               ? isReturn(Func.getInstructionAtOffset(Offset))
-               : isReturn(Func.disassembleInstructionAtOffset(Offset));
-  };
-
   // Mutates \p Addr to an offset into the containing function, performing BAT
   // offset translation and parent lookup.
   //
-  // Returns the containing function (or BAT parent) and whether the address
-  // corresponds to a return (if \p IsFrom) or a call continuation (otherwise).
+  // Returns the containing function (or BAT parent).
   auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
     BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
     if (!Func) {
       Addr = 0;
-      return std::pair{Func, false};
+      return Func;
     }
 
     Addr -= Func->getAddress();
 
-    bool IsRet = IsFrom && checkReturn(*Func, Addr);
-
     if (BAT)
       Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
 
     if (BinaryFunction *ParentFunc = getBATParentFunction(*Func))
-      Func = ParentFunc;
+      return ParentFunc;
 
-    return std::pair{Func, IsRet};
+    return Func;
   };
 
-  auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
-  auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false);
+  BinaryFunction *FromFunc = handleAddress(From, /*IsFrom*/ true);
+  BinaryFunction *ToFunc = handleAddress(To, /*IsFrom*/ false);
   if (!FromFunc && !ToFunc)
     return false;
 
-  // Ignore returns.
-  if (IsReturn)
-    return true;
-
   // Treat recursive control transfers as inter-branches.
   if (FromFunc == ToFunc && To != 0) {
     recordBranch(*FromFunc, From, To, Count, Mispreds);
@@ -783,7 +787,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
   return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds);
 }
 
-bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
+bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count,
+                             bool IsReturn) {
   const uint64_t From = Trace.From, To = Trace.To;
   BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From);
   BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To);
@@ -808,8 +813,8 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
   const uint64_t FuncAddress = FromFunc->getAddress();
   std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
       BAT && BAT->isBATFunction(FuncAddress)
-          ? BAT->getFallthroughsInTrace(FuncAddress, From, To)
-          : getFallthroughsInTrace(*FromFunc, Trace, Count);
+          ? BAT->getFallthroughsInTrace(FuncAddress, From - IsReturn, To)
+          : getFallthroughsInTrace(*FromFunc, Trace, Count, IsReturn);
   if (!FTs) {
     LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
     NumInvalidTraces += Count;
@@ -831,7 +836,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
 
 std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
 DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
-                                       uint64_t Count) const {
+                                       uint64_t Count, bool IsReturn) const {
   SmallVector<std::pair<uint64_t, uint64_t>, 16> Branches;
 
   BinaryContext &BC = BF.getBinaryContext();
@@ -865,9 +870,13 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
 
   // Adjust FromBB if the first LBR is a return from the last instruction in
   // the previous block (that instruction should be a call).
-  if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
-      From == FromBB->getOffset() && !FromBB->isEntryPoint() &&
-      !FromBB->isLandingPad()) {
+  if (IsReturn) {
+    if (From)
+      FromBB = BF.getBasicBlockContainingOffset(From - 1);
+    else
+      LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n');
+  } else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() &&
+             !FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
     const BinaryBasicBlock *PrevBB =
         BF.getLayout().getBlock(FromBB->getIndex() - 1);
     if (PrevBB->getSuccessor(FromBB->getLabel())) {
@@ -1557,11 +1566,13 @@ void DataAggregator::processBranchEvents() {
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
   for (const auto &[Trace, Info] : Traces) {
-    if (Trace.Branch != Trace::FT_ONLY &&
+    bool IsReturn = checkReturn(Trace.Branch);
+    // Ignore returns.
+    if (!IsReturn && Trace.Branch != Trace::FT_ONLY &&
         Trace.Branch != Trace::FT_EXTERNAL_ORIGIN)
       doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount);
     if (Trace.To != Trace::BR_ONLY)
-      doTrace(Trace, Info.TakenCount);
+      doTrace(Trace, Info.TakenCount, IsReturn);
   }
   printBranchSamplesDiagnostics();
 }
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index 4994cfb541eef..c2ef024db9475 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -4,29 +4,43 @@
 # RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
 ## Link against a DSO to ensure PLT entries.
 # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
-# RUN: link_fdata %s %t %t.pat PREAGGT1
-# RUN: link_fdata %s %t %t.pat2 PREAGGT2
-# RUN-DISABLED: link_fdata %s %t %t.patplt PREAGGPLT
+# Trace to a call continuation, not a landing pad/entry point
+# RUN: link_fdata %s %t %t.pa-base PREAGG-BASE
+# Trace from a return to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
+# Trace from an external location to a landing pad/entry point call continuation
+# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
+# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
 
 # RUN: llvm-strip --strip-unneeded %t -o %t.strip
 # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
 
 ## Check pre-aggregated traces attach call continuation fallthrough count
-# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s
-
-## Check pre-aggregated traces don't attach call continuation fallthrough count
-## to secondary entry point (unstripped)
-# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
-## Check pre-aggregated traces don't attach call continuation fallthrough count
-## to landing pad (stripped, LP)
-# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
+## in the basic case (not an entry point, not a landing pad).
+# RUN: llvm-bolt %t.noeh --pa -p %t.pa-base -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-BASE
+
+## Check pre-aggregated traces from a return attach call continuation
+## fallthrough count to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pa-ret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+## Check pre-aggregated traces from a return attach call continuation
+## fallthrough count to landing pad (stripped, landing pad)
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-ret -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
+
+## Check pre-aggregated traces from external location don't attach call
+## continuation fallthrough count to secondary entry point (unstripped)
+# RUN: llvm-bolt %t --pa -p %t.pa-ext -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
+## Check pre-aggregated traces from external location don't attach call
+## continuation fallthrough count to landing pad (stripped, landing pad)
+# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
+# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
 
 ## Check pre-aggregated traces don't report zero-sized PLT fall-through as
 ## invalid trace
-# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.patplt -o %t.out | FileCheck %s \
+# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
 # RUN-DISABLED:   --check-prefix=CHECK-PLT
 # CHECK-PLT: traces mismatching disassembled function contents: 0
 
@@ -56,11 +70,11 @@ main:
 Ltmp0_br:
 	callq	puts@PLT
 ## Check PLT traces are accepted
-# PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
+# PREAGG-PLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
 ## Target is an external-origin call continuation
-# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
-# CHECK:      callq puts@PLT
-# CHECK-NEXT: count: 2
+# PREAGG-BASE: T X:0 #Ltmp1# #Ltmp4_br# 2
+# CHECK-BASE:      callq puts@PLT
+# CHECK-BASE-NEXT: count: 2
 
 Ltmp1:
 	movq	-0x10(%rbp), %rax
@@ -71,24 +85,18 @@ Ltmp4:
 	cmpl	$0x0, -0x14(%rbp)
 Ltmp4_br:
 	je	Ltmp0
-# CHECK2:      je .Ltmp0
-# CHECK2-NEXT: count: 3
 
 	movl	$0xa, -0x18(%rbp)
 	callq	foo
 ## Target is a binary-local call continuation
-# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
-# CHECK:      callq foo
-# CHECK-NEXT: count: 1
-
-## PLT call continuation fallthrough spanning the call
-# CHECK2:      callq foo
-# CHECK2-NEXT: count: 3
-
+# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
 ## Target is a secondary entry point (unstripped) or a landing pad (stripped)
-# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
-# CHECK3:      callq foo
-# CHECK3-NEXT: count: 0
+# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
+
+# CHECK-ATTACH:      callq foo
+# CHECK-ATTACH-NEXT: count: 1
+# CHECK-SKIP:        callq foo
+# CHECK-SKIP-NEXT:   count: 0
 
 Ltmp3:
 	cmpl	$0x0, -0x18(%rbp)

From 917bc909673a491fe070fe41c4ad112bcffd4c06 Mon Sep 17 00:00:00 2001
From: William Moses <gh@wsmoses.com>
Date: Tue, 17 Jun 2025 06:41:15 -0700
Subject: [PATCH 733/851] [MLIR][LLVMIR] Mark Funcop as affinescope (#144456)

All functions are conceptually an affine scope.
---
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index eda1d544cd81c..68fa620d239b9 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -1829,7 +1829,7 @@ def LLVM_ComdatOp : LLVM_Op<"comdat", [NoTerminator, NoRegionArguments, SymbolTa
 }
 
 def LLVM_LLVMFuncOp : LLVM_Op<"func", [
-    AutomaticAllocationScope, IsolatedFromAbove, FunctionOpInterface
+    AffineScope, AutomaticAllocationScope, IsolatedFromAbove, FunctionOpInterface
   ]> {
   let summary = "LLVM dialect function.";
 

From de3339063ae5a926ab2ed17651a0e628b9c34fb0 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Tue, 17 Jun 2025 15:44:16 +0200
Subject: [PATCH 734/851] [bazel] Port b4e39e4ff923334a8a1fdcc6d92b01d3885a01f2

---
 utils/bazel/llvm-project-overlay/llvm/config.bzl               | 1 +
 .../llvm-project-overlay/llvm/include/llvm/Config/config.h     | 3 +++
 utils/bazel/llvm_configs/config.h.cmake                        | 3 +++
 3 files changed, 7 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/config.bzl b/utils/bazel/llvm-project-overlay/llvm/config.bzl
index d9d3666a3ecce..7cb4b7e9ffe75 100644
--- a/utils/bazel/llvm-project-overlay/llvm/config.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/config.bzl
@@ -43,6 +43,7 @@ posix_defines = [
     "HAVE_SETENV_R=1",
     "HAVE_STRERROR_R=1",
     "HAVE_SYSEXITS_H=1",
+    "HAVE_SYS_IOCTL_H=1",
     "HAVE_UNISTD_H=1",
 ]
 
diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
index 3ef1d0c4b1651..feac6a9d3308f 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
@@ -171,6 +171,9 @@
 /* Define to 1 if you have the <sys/mman.h> header file. */
 #define HAVE_SYS_MMAN_H 1
 
+/* Define to 1 if you have the <sys/ioctl.h> header file. */
+/* HAVE_SYS_IOCTL_H defined in Bazel */
+
 /* Define to 1 if stat struct has st_mtimespec member .*/
 /* #undef HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC */
 
diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake
index 06d4756397911..ce83de8e4cba9 100644
--- a/utils/bazel/llvm_configs/config.h.cmake
+++ b/utils/bazel/llvm_configs/config.h.cmake
@@ -164,6 +164,9 @@
 /* Define to 1 if you have the <sys/mman.h> header file. */
 #cmakedefine HAVE_SYS_MMAN_H ${HAVE_SYS_MMAN_H}
 
+/* Define to 1 if you have the <sys/ioctl.h> header file. */
+#cmakedefine HAVE_SYS_IOCTL_H ${HAVE_SYS_IOCTL_H}
+
 /* Define to 1 if stat struct has st_mtimespec member .*/
 #cmakedefine HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC ${HAVE_STRUCT_STAT_ST_MTIMESPEC_TV_NSEC}
 

From dc72b91ffedf791a44a1af19b00064a2a3c59ab9 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234@gmail.com>
Date: Tue, 17 Jun 2025 09:59:38 -0400
Subject: [PATCH 735/851] [AArch64] Report icmp as free if it can be folded
 into ands (#143286)

Since changing the backend to fold x >= 1 / x < 1 -> x > 0 / x <= 0 and
x <= -1 / x > -1 -> x > 0 / x <= 0, this should be reflected in the
cost.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 25 +++++++++++++------
 llvm/test/Analysis/CostModel/AArch64/cmp.ll   | 21 +++++++++++++++-
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0232ac421aeda..ed051f295752e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4353,15 +4353,26 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
     }
   }
 
-  // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
-  // FIXME: This can apply to more conditions and add/sub if it can be shown to
-  // be profitable.
+  // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
+  // icmp(and, 0) as free, as we can make use of ands, but only if the
+  // comparison is not unsigned.
   if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
-      ICmpInst::isEquality(VecPred) &&
+      !CmpInst::isUnsigned(VecPred) &&
       TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
-      match(I->getOperand(1), m_Zero()) &&
-      match(I->getOperand(0), m_And(m_Value(), m_Value())))
-    return 0;
+      match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
+    if (match(I->getOperand(1), m_Zero()))
+      return 0;
+
+    // x >= 1 / x < 1 -> x > 0 / x <= 0
+    if (match(I->getOperand(1), m_One()) &&
+        (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
+      return 0;
+
+    // x <= -1 / x > -1 -> x > 0 / x <= 0
+    if (match(I->getOperand(1), m_AllOnes()) &&
+        (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
+      return 0;
+  }
 
   // The base case handles scalable vectors fine for now, since it treats the
   // cost as 1 * legalization cost.
diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index aba113865af10..16b3913f52028 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -53,6 +53,14 @@ define void @andcmp() {
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32 = icmp eq i32 %a32, 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %a64 = and i64 undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64 = icmp ne i64 %a64, 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32ge = icmp sge i32 %a32, 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32le = icmp slt i32 %a32, 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32leneg = icmp sle i32 %a32, -1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c32gtneg = icmp sgt i32 %a32, -1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64ge = icmp sge i64 %a64, 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64le = icmp slt i64 %a64, 1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64leneg = icmp sle i64 %a64, -1
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64gtneg = icmp sgt i64 %a64, -1
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %a128 = and i128 undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %c128 = icmp eq i128 %a128, 0
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %av16i8 = and <16 x i8> undef, undef
@@ -62,7 +70,7 @@ define void @andcmp() {
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %av4i32 = and <4 x i32> undef, undef
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %cv4i32 = icmp ne <4 x i32> %av4i32, zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %c32not0 = icmp eq i32 %a32, 1
-; CHECK-NEXT:  Cost Model: Found costs of 1 for: %c64sle = icmp sle i64 %a64, 0
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: %c64sle = icmp sle i64 %a64, 0
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %a8 = and i8 undef, undef
@@ -73,6 +81,17 @@ define void @andcmp() {
   %c32 = icmp eq i32 %a32, 0
   %a64 = and i64 undef, undef
   %c64 = icmp ne i64 %a64, 0
+
+  %c32ge = icmp sge i32 %a32, 1
+  %c32le = icmp slt i32 %a32, 1
+  %c32leneg = icmp sle i32 %a32, -1
+  %c32gtneg  = icmp sgt i32 %a32, -1
+
+  %c64ge = icmp sge i64 %a64, 1
+  %c64le = icmp slt i64 %a64, 1
+  %c64leneg = icmp sle i64 %a64, -1
+  %c64gtneg  = icmp sgt i64 %a64, -1
+
   %a128 = and i128 undef, undef
   %c128 = icmp eq i128 %a128, zeroinitializer
   %av16i8 = and <16 x i8> undef, undef

From 414710c753d87d314529857e15d1ad01a76c6605 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen@sifive.com>
Date: Tue, 17 Jun 2025 23:03:14 +0900
Subject: [PATCH 736/851] [SLP] Fix isCommutative to check uses of the original
 instruction instead of the converted instruction. (#143094)

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 36 +++++++++++++++----
 .../Transforms/SLPVectorizer/isCommutative.ll | 34 ++++++++++++++++++
 2 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/isCommutative.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9a7e9b75da517..8bff3c018714d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -511,15 +511,25 @@ static bool isSplat(ArrayRef<Value *> VL) {
 }
 
 /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
-static bool isCommutative(Instruction *I) {
+/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
+/// patterns that make it effectively commutative (like equality comparisons
+/// with zero).
+/// In most cases, users should not call this function directly (since \p I and
+/// \p InstWithUses are the same). However, when analyzing interchangeable
+/// instructions, we need to use the converted opcode along with the original
+/// uses.
+/// \param I The instruction to check for commutativity
+/// \param InstWithUses The instruction whose uses are analyzed for special
+/// patterns
+static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
   if (auto *Cmp = dyn_cast<CmpInst>(I))
     return Cmp->isCommutative();
   if (auto *BO = dyn_cast<BinaryOperator>(I))
     return BO->isCommutative() ||
            (BO->getOpcode() == Instruction::Sub &&
-            !BO->hasNUsesOrMore(UsesLimit) &&
+            !InstWithUses->hasNUsesOrMore(UsesLimit) &&
             all_of(
-                BO->uses(),
+                InstWithUses->uses(),
                 [](const Use &U) {
                   // Commutative, if icmp eq/ne sub, 0
                   CmpPredicate Pred;
@@ -536,14 +546,24 @@ static bool isCommutative(Instruction *I) {
                           Flag->isOne());
                 })) ||
            (BO->getOpcode() == Instruction::FSub &&
-            !BO->hasNUsesOrMore(UsesLimit) &&
-            all_of(BO->uses(), [](const Use &U) {
+            !InstWithUses->hasNUsesOrMore(UsesLimit) &&
+            all_of(InstWithUses->uses(), [](const Use &U) {
               return match(U.getUser(),
                            m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
             }));
   return I->isCommutative();
 }
 
+/// This is a helper function to check whether \p I is commutative.
+/// This is a convenience wrapper that calls the two-parameter version of
+/// isCommutative with the same instruction for both parameters. This is
+/// the common case where the instruction being checked for commutativity
+/// is the same as the instruction whose uses are analyzed for special
+/// patterns (see the two-parameter version above for details).
+/// \param I The instruction to check for commutativity
+/// \returns true if the instruction is commutative, false otherwise
+static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
+
 template <typename T>
 static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                      unsigned Offset) {
@@ -2898,7 +2918,11 @@ class BoUpSLP {
           continue;
         }
         auto [SelectedOp, Ops] = convertTo(cast<Instruction>(V), S);
-        bool IsInverseOperation = !isCommutative(SelectedOp);
+        // We cannot check commutativity by the converted instruction
+        // (SelectedOp) because isCommutative also examines def-use
+        // relationships.
+        bool IsInverseOperation =
+            !isCommutative(SelectedOp, cast<Instruction>(V));
         for (unsigned OpIdx : seq<unsigned>(ArgSize)) {
           bool APO = (OpIdx == 0) ? false : IsInverseOperation;
           OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
diff --git a/llvm/test/Transforms/SLPVectorizer/isCommutative.ll b/llvm/test/Transforms/SLPVectorizer/isCommutative.ll
new file mode 100644
index 0000000000000..704ac8295f55b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/isCommutative.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer -S %s | FileCheck %s
+
+define i16 @check_isCommutative_with_the_original_source() {
+; CHECK-LABEL: @check_isCommutative_with_the_original_source(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND3:%.*]] = select i1 true, i16 1, i16 0
+; CHECK-NEXT:    ret i16 [[COND3]]
+;
+entry:
+  %sub = sub i16 0, -1
+  %cmp = icmp eq i16 %sub, 1
+
+  %sub1 = sub i16 0, -1
+  %cmp2 = icmp eq i16 %sub1, 1
+  %cond3 = select i1 %cmp2, i16 1, i16 0
+
+  %sub5 = sub nsw i16 0, 0
+  %cmp6 = icmp eq i16 %sub5, 0
+  %cmp9 = icmp eq i16 %sub5, 0
+
+  %sub12 = sub nsw i16 0, 0
+  %cmp13 = icmp eq i16 %sub12, 0
+
+  %sub16 = sub nsw i16 0, 0
+  %cmp17 = icmp eq i16 %sub16, 0
+
+  %sub20 = sub nsw i16 0, 0
+  %cmp21 = icmp eq i16 %sub20, 0
+  %cmp24 = icmp eq i16 %sub20, 0
+
+  ret i16 %cond3
+}
+

From 35f6d917206d79ab0e3d382a36ca05ccc13983d5 Mon Sep 17 00:00:00 2001
From: Richard Howell <rmaz@users.noreply.github.com>
Date: Tue, 17 Jun 2025 07:18:50 -0700
Subject: [PATCH 737/851] [lld] check cache in loadDylib before real_path
 (#143595)

---
 lld/MachO/DriverUtils.cpp              | 40 +++++++++++---
 lld/test/MachO/reexport-with-symlink.s | 74 ++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 6 deletions(-)
 create mode 100644 lld/test/MachO/reexport-with-symlink.s

diff --git a/lld/MachO/DriverUtils.cpp b/lld/MachO/DriverUtils.cpp
index f7f6be049f0e1..a3b722f13daca 100644
--- a/lld/MachO/DriverUtils.cpp
+++ b/lld/MachO/DriverUtils.cpp
@@ -225,14 +225,21 @@ std::optional<StringRef> macho::resolveDylibPath(StringRef dylibPath) {
 // especially if it's a commonly re-exported core library.
 static DenseMap<CachedHashStringRef, DylibFile *> loadedDylibs;
 
+static StringRef realPathIfDifferent(StringRef path) {
+  SmallString<128> realPathBuf;
+  if (fs::real_path(path, realPathBuf))
+    return StringRef();
+
+  SmallString<128> absPathBuf = path;
+  if (!fs::make_absolute(absPathBuf) && realPathBuf == absPathBuf)
+    return StringRef();
+
+  return uniqueSaver().save(StringRef(realPathBuf));
+}
+
 DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella,
                             bool isBundleLoader, bool explicitlyLinked) {
-  // Frameworks can be found from different symlink paths, so resolve
-  // symlinks before looking up in the dylib cache.
-  SmallString<128> realPath;
-  std::error_code err = fs::real_path(mbref.getBufferIdentifier(), realPath);
-  CachedHashStringRef path(!err ? uniqueSaver().save(StringRef(realPath))
-                                : mbref.getBufferIdentifier());
+  CachedHashStringRef path(mbref.getBufferIdentifier());
   DylibFile *&file = loadedDylibs[path];
   if (file) {
     if (explicitlyLinked)
@@ -240,6 +247,22 @@ DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella,
     return file;
   }
 
+  // Frameworks can be found from different symlink paths, so resolve
+  // symlinks and look up in the dylib cache.
+  CachedHashStringRef realPath(
+      realPathIfDifferent(mbref.getBufferIdentifier()));
+  if (!realPath.val().empty()) {
+    // Avoid map insertions here so that we do not invalidate the "file"
+    // reference.
+    auto it = loadedDylibs.find(realPath);
+    if (it != loadedDylibs.end()) {
+      DylibFile *realfile = it->second;
+      if (explicitlyLinked)
+        realfile->setExplicitlyLinked();
+      return realfile;
+    }
+  }
+
   DylibFile *newFile;
   file_magic magic = identify_magic(mbref.getBuffer());
   if (magic == file_magic::tapi_file) {
@@ -292,6 +315,11 @@ DylibFile *macho::loadDylib(MemoryBufferRef mbref, DylibFile *umbrella,
             sys::path::filename(newFile->installName) + "' because " +
             config->clientName + " is not an allowed client");
   }
+
+  // If the load path was a symlink, cache the real path too.
+  if (!realPath.val().empty())
+    loadedDylibs[realPath] = newFile;
+
   return newFile;
 }
 
diff --git a/lld/test/MachO/reexport-with-symlink.s b/lld/test/MachO/reexport-with-symlink.s
new file mode 100644
index 0000000000000..a6b5992713f39
--- /dev/null
+++ b/lld/test/MachO/reexport-with-symlink.s
@@ -0,0 +1,74 @@
+# REQUIRES: aarch64, shell
+# RUN: rm -rf %t; split-file %s %t
+# RUN: ln -s Versions/A/Developer %t/Developer/Library/Frameworks/Developer.framework/
+# RUN: llvm-mc -filetype obj -triple arm64-apple-macos11.0 %t/test.s -o %t/test.o
+# RUN: %lld -arch arm64 -platform_version macos 11.0 11.0 -o %t/test -framework Developer -F %t/Developer/Library/Frameworks -L %t/Developer/usr/lib %t/test.o -t | FileCheck %s
+
+# CHECK: {{.*}}/Developer/Library/Frameworks/Developer.framework/Developer
+# CHECK: {{.*}}/Developer/usr/lib/libDeveloperSupport.tbd(@rpath/libDeveloperSupport.dylib)
+# CHECK-NOT: {{.*}}/Developer/Library/Frameworks/Developer.framework/Versions/A/Developer
+
+#--- Developer/Library/Frameworks/Developer.framework/Versions/A/Developer
+{
+  "tapi_tbd_version": 5,
+  "main_library": {
+    "target_info": [
+      {
+        "target": "arm64-macos"
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/Developer.framework/Developer"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "text": {
+          "global": ["_funcPublic"]
+        }
+      }
+    ]
+  }
+}
+#--- Developer/usr/lib/libDeveloperSupport.tbd
+{
+  "tapi_tbd_version": 5,
+  "main_library": {
+    "target_info": [
+      {
+        "target": "arm64-macos"
+      }
+    ],
+    "install_names": [
+      {
+        "name": "@rpath/libDeveloperSupport.dylib"
+      }
+    ],
+    "reexported_libraries": [
+      {
+        "names": [
+          "@rpath/Developer.framework/Versions/A/Developer"
+        ]
+      }
+    ],
+    "exported_symbols": [
+      {
+        "text": {
+          "global": ["_funcSupport"]
+        }
+      }
+    ]
+  }
+}
+#--- test.s
+.text
+.globl _main
+.linker_option "-lDeveloperSupport"
+
+_main:
+  ret
+
+.data
+  .quad _funcPublic
+  .quad _funcSupport

From 0a7b0c844c59189ad4f5072b73d7dfdfd78e76b7 Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Tue, 17 Jun 2025 15:24:26 +0100
Subject: [PATCH 738/851] [lldb][Expression] Remove IR pointer checker
 (#144483)

Currently when jitting expressions, LLDB scans the IR instructions of
the `$__lldb_expr` and will insert a call to a utility function for each
load/store instruction. The purpose of the utility funciton is to
dereference the load/store operand. If that operand was an invalid
pointer the utility function would trap and LLDB asks the IR checker
whether it was responsible for the trap, in which case it prints out an
error message saying the expression dereferenced an invalid pointer.

This is a lot of setup for not much gain. In fact, creating/running this
utility expression shows up as ~2% of the expression evaluation time
(though we cache them for subsequent expressions). And the error message
we get out of it is arguably less useful than if we hadn't instrumented
the IR. It was also untested.

Before:
```
(lldb) expr int a = *returns_invalid_ptr()

error: Execution was interrupted, reason: Attempted to dereference an invalid pointer..
The process has been returned to the state before expression evaluation.
```

After:
```
(lldb) expr int a = *returns_invalid_ptr()

error: Expression execution was interrupted: EXC_BAD_ACCESS (code=1, address=0x5).
The process has been returned to the state before expression evaluation.
```

This patch removes this IR checker.
---
 .../Clang/IRDynamicChecks.cpp                 | 107 +-----------------
 .../ExpressionParser/Clang/IRDynamicChecks.h  |   1 -
 .../lldb-dap/save-core/TestDAP_save_core.py   |   6 -
 3 files changed, 4 insertions(+), 110 deletions(-)

diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
index c7c292a8a7e42..6ef5d3f5be6d9 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
@@ -32,31 +32,16 @@ using namespace lldb_private;
 
 static char ID;
 
-#define VALID_POINTER_CHECK_NAME "_$__lldb_valid_pointer_check"
 #define VALID_OBJC_OBJECT_CHECK_NAME "$__lldb_objc_object_check"
 
-static const char g_valid_pointer_check_text[] =
-    "extern \"C\" void\n"
-    "_$__lldb_valid_pointer_check (unsigned char *$__lldb_arg_ptr)\n"
-    "{\n"
-    "    unsigned char $__lldb_local_val = *$__lldb_arg_ptr;\n"
-    "}";
-
 ClangDynamicCheckerFunctions::ClangDynamicCheckerFunctions()
     : DynamicCheckerFunctions(DCF_Clang) {}
 
 ClangDynamicCheckerFunctions::~ClangDynamicCheckerFunctions() = default;
 
-llvm::Error ClangDynamicCheckerFunctions::Install(
-    DiagnosticManager &diagnostic_manager, ExecutionContext &exe_ctx) {
-  Expected<std::unique_ptr<UtilityFunction>> utility_fn =
-      exe_ctx.GetTargetRef().CreateUtilityFunction(
-          g_valid_pointer_check_text, VALID_POINTER_CHECK_NAME,
-          lldb::eLanguageTypeC, exe_ctx);
-  if (!utility_fn)
-    return utility_fn.takeError();
-  m_valid_pointer_check = std::move(*utility_fn);
-
+llvm::Error
+ClangDynamicCheckerFunctions::Install(DiagnosticManager &diagnostic_manager,
+                                      ExecutionContext &exe_ctx) {
   if (Process *process = exe_ctx.GetProcessPtr()) {
     ObjCLanguageRuntime *objc_language_runtime =
         ObjCLanguageRuntime::Get(*process);
@@ -78,11 +63,7 @@ bool ClangDynamicCheckerFunctions::DoCheckersExplainStop(lldb::addr_t addr,
   // FIXME: We have to get the checkers to know why they scotched the call in
   // more detail,
   // so we can print a better message here.
-  if (m_valid_pointer_check && m_valid_pointer_check->ContainsAddress(addr)) {
-    message.Printf("Attempted to dereference an invalid pointer.");
-    return true;
-  } else if (m_objc_object_check &&
-             m_objc_object_check->ContainsAddress(addr)) {
+  if (m_objc_object_check && m_objc_object_check->ContainsAddress(addr)) {
     message.Printf("Attempted to dereference an invalid ObjC Object or send it "
                    "an unrecognized selector");
     return true;
@@ -223,29 +204,6 @@ class Instrumenter {
     return true;
   }
 
-  /// Build a function pointer for a function with signature void
-  /// (*)(uint8_t*) with a given address
-  ///
-  /// \param[in] start_address
-  ///     The address of the function.
-  ///
-  /// \return
-  ///     The function pointer, for use in a CallInst.
-  llvm::FunctionCallee BuildPointerValidatorFunc(lldb::addr_t start_address) {
-    llvm::Type *param_array[1];
-
-    param_array[0] = const_cast<llvm::PointerType *>(GetI8PtrTy());
-
-    ArrayRef<llvm::Type *> params(param_array, 1);
-
-    FunctionType *fun_ty = FunctionType::get(
-        llvm::Type::getVoidTy(m_module.getContext()), params, true);
-    PointerType *fun_ptr_ty = PointerType::getUnqual(m_module.getContext());
-    Constant *fun_addr_int =
-        ConstantInt::get(GetIntptrTy(), start_address, false);
-    return {fun_ty, ConstantExpr::getIntToPtr(fun_addr_int, fun_ptr_ty)};
-  }
-
   /// Build a function pointer for a function with signature void
   /// (*)(uint8_t*, uint8_t*) with a given address
   ///
@@ -300,53 +258,6 @@ class Instrumenter {
   IntegerType *m_intptr_ty = nullptr;
 };
 
-class ValidPointerChecker : public Instrumenter {
-public:
-  ValidPointerChecker(llvm::Module &module,
-                      std::shared_ptr<UtilityFunction> checker_function)
-      : Instrumenter(module, checker_function),
-        m_valid_pointer_check_func(nullptr) {}
-
-  ~ValidPointerChecker() override = default;
-
-protected:
-  bool InstrumentInstruction(llvm::Instruction *inst) override {
-    Log *log = GetLog(LLDBLog::Expressions);
-
-    LLDB_LOGF(log, "Instrumenting load/store instruction: %s\n",
-              PrintValue(inst).c_str());
-
-    if (!m_valid_pointer_check_func)
-      m_valid_pointer_check_func =
-          BuildPointerValidatorFunc(m_checker_function->StartAddress());
-
-    llvm::Value *dereferenced_ptr = nullptr;
-
-    if (llvm::LoadInst *li = dyn_cast<llvm::LoadInst>(inst))
-      dereferenced_ptr = li->getPointerOperand();
-    else if (llvm::StoreInst *si = dyn_cast<llvm::StoreInst>(inst))
-      dereferenced_ptr = si->getPointerOperand();
-    else
-      return false;
-
-    // Insert an instruction to call the helper with the result
-    CallInst::Create(m_valid_pointer_check_func, dereferenced_ptr, "",
-                     inst->getIterator());
-
-    return true;
-  }
-
-  bool InspectInstruction(llvm::Instruction &i) override {
-    if (isa<llvm::LoadInst>(&i) || isa<llvm::StoreInst>(&i))
-      RegisterInstruction(i);
-
-    return true;
-  }
-
-private:
-  llvm::FunctionCallee m_valid_pointer_check_func;
-};
-
 class ObjcObjectChecker : public Instrumenter {
 public:
   ObjcObjectChecker(llvm::Module &module,
@@ -527,16 +438,6 @@ bool IRDynamicChecks::runOnModule(llvm::Module &M) {
     return false;
   }
 
-  if (m_checker_functions.m_valid_pointer_check) {
-    ValidPointerChecker vpc(M, m_checker_functions.m_valid_pointer_check);
-
-    if (!vpc.Inspect(*function))
-      return false;
-
-    if (!vpc.Instrument())
-      return false;
-  }
-
   if (m_checker_functions.m_objc_object_check) {
     ObjcObjectChecker ooc(M, m_checker_functions.m_objc_object_check);
 
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h
index ff20c1f08be0c..f67229afc2152 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h
+++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.h
@@ -53,7 +53,6 @@ class ClangDynamicCheckerFunctions
 
   bool DoCheckersExplainStop(lldb::addr_t addr, Stream &message) override;
 
-  std::shared_ptr<UtilityFunction> m_valid_pointer_check;
   std::shared_ptr<UtilityFunction> m_objc_object_check;
 };
 
diff --git a/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py b/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py
index 4045dd8fb6569..77c1e47914a39 100644
--- a/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py
+++ b/lldb/test/API/tools/lldb-dap/save-core/TestDAP_save_core.py
@@ -32,13 +32,7 @@ def test_save_core(self):
         # Getting dap stack trace may trigger __lldb_caller_function JIT module to be created.
         self.get_stackFrames(startFrame=0)
 
-        # Evaluating an expression that cause "_$__lldb_valid_pointer_check" JIT module to be created.
-        expression = 'printf("this is a test")'
-        self.dap_server.request_evaluate(expression, context="watch")
-
-        # Verify "_$__lldb_valid_pointer_check" JIT module is created.
         modules = self.dap_server.get_modules()
-        self.assertTrue(modules["_$__lldb_valid_pointer_check"])
         thread_count = len(self.dap_server.get_threads())
 
         core_stack = self.getBuildArtifact("core.stack.dmp")

From 8f797542258f6e682eb251d0851922a1ac08fb44 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 17 Jun 2025 15:30:08 +0100
Subject: [PATCH 739/851] [SCEV] Better preserve wrapping info in
 SimplifyICmpOperands for UGE. (#144404)

Update SimplifyICmpOperands to only try subtracting 1 from RHS first, if
RHS is an op we can fold the subtract directly into. Otherwise try
adding to LHS first, as we can preserve NUW flags.

This improves results in a few cases, including the modified test case
from berkeley-abc and new code to be added in
https://github.com/llvm/llvm-project/pull/128061.

Note that there are more cases where the results can be improved by
better ordering here which I'll try to investigate as follow-up.

PR: https://github.com/llvm/llvm-project/pull/144404
---
 llvm/lib/Analysis/ScalarEvolution.cpp                 | 11 ++++++++++-
 .../IndVarSimplify/simplify-icmp-operands-order.ll    |  9 +++------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 2dfe625eb0dcc..dd309bc2c54a8 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10892,7 +10892,12 @@ bool ScalarEvolution::SimplifyICmpOperands(CmpPredicate &Pred, const SCEV *&LHS,
     }
     break;
   case ICmpInst::ICMP_UGE:
-    if (!getUnsignedRangeMin(RHS).isMinValue()) {
+    // If RHS is an op we can fold the -1, try that first.
+    // Otherwise prefer LHS to preserve the nuw flag.
+    if ((isa<SCEVConstant>(RHS) ||
+         (isa<SCEVAddExpr, SCEVAddRecExpr>(RHS) &&
+          isa<SCEVConstant>(cast<SCEVNAryExpr>(RHS)->getOperand(0)))) &&
+        !getUnsignedRangeMin(RHS).isMinValue()) {
       RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS);
       Pred = ICmpInst::ICMP_UGT;
       Changed = true;
@@ -10901,6 +10906,10 @@ bool ScalarEvolution::SimplifyICmpOperands(CmpPredicate &Pred, const SCEV *&LHS,
                        SCEV::FlagNUW);
       Pred = ICmpInst::ICMP_UGT;
       Changed = true;
+    } else if (!getUnsignedRangeMin(RHS).isMinValue()) {
+      RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS);
+      Pred = ICmpInst::ICMP_UGT;
+      Changed = true;
     }
     break;
   default:
diff --git a/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
index b0dbbd5eaedf4..fb2fdb116f904 100644
--- a/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
+++ b/llvm/test/Transforms/IndVarSimplify/simplify-icmp-operands-order.ll
@@ -53,15 +53,12 @@ loop.latch:
 
 define void @test_simplifycompare_rhs_not_constant1() {
 ; CHECK-LABEL: define void @test_simplifycompare_rhs_not_constant1() {
-; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[P:%.*]] = alloca i64, align 8
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[P]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 -8
-; CHECK-NEXT:    call void @use(ptr [[PTR_IV]])
-; CHECK-NEXT:    [[EC:%.*]] = icmp ult ptr [[PTR_IV_NEXT]], [[P]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    call void @use(ptr [[P]])
+; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;

From 0fb198e132eff36281a20698588d815c3c30f991 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 15:30:49 +0100
Subject: [PATCH 740/851] [X86] Remove combineShuffleOfConcatUndef fold
 (#144524)

We can now let a mixture of combineConcatVectorOps and target shuffle combining handle this instead of creating ISD::CONCAT_VECTORS nodes and hoping they will merge properly.

In the horizontal-sum.ll test changes we were creating a ISD::CONCAT_VECTORS node that was being split shortly after, but not before causing issues with HADD folding due to additional uses.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 54 -------------------------
 llvm/test/CodeGen/X86/horizontal-sum.ll | 28 ++++++-------
 2 files changed, 14 insertions(+), 68 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd02d275d6b57..12fcc614ab254 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43301,51 +43301,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
-// We are looking for a shuffle where both sources are concatenated with undef
-// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
-// if we can express this as a single-source shuffle, that's preferable.
-static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
-                                           SelectionDAG &DAG,
-                                           const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
-  if (!VT.is128BitVector() && !VT.is256BitVector())
-    return SDValue();
-
-  if (VT.getVectorElementType() != MVT::i32 &&
-      VT.getVectorElementType() != MVT::i64 &&
-      VT.getVectorElementType() != MVT::f32 &&
-      VT.getVectorElementType() != MVT::f64)
-    return SDValue();
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // Check that both sources are concats with undef.
-  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
-      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
-      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
-      !N1.getOperand(1).isUndef())
-    return SDValue();
-
-  // Construct the new shuffle mask. Elements from the first source retain their
-  // index, but elements from the second source no longer need to skip an undef.
-  SmallVector<int, 8> Mask;
-  int NumElts = VT.getVectorNumElements();
-
-  auto *SVOp = cast<ShuffleVectorSDNode>(N);
-  for (int Elt : SVOp->getMask())
-    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
-
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
-                               N1.getOperand(0));
-  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
-}
-
 /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
 /// low half of each source vector and does not set any high half elements in
 /// the destination vector, narrow the shuffle to half its original size.
@@ -43401,15 +43356,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
           VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
     return LD;
 
-  // For AVX2, we sometimes want to combine
-  // (vector_shuffle <mask> (concat_vectors t1, undef)
-  //                        (concat_vectors t2, undef))
-  // Into:
-  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
-  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
-  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
-    return ShufConcat;
-
   if (isTargetShuffle(N->getOpcode())) {
     SDValue Op(N, 0);
     if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 0afc4f784bc5e..568150cfa3971 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -247,13 +247,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-SLOW-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-SLOW-NEXT:    vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -268,13 +268,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; AVX2-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-FAST-NEXT:    vhaddps %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT:    vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-FAST-NEXT:    vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-FAST-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-FAST-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1,0]
 ; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -424,7 +424,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-SLOW-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -447,7 +447,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT:    vpbroadcastd %xmm4, %xmm5
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]

From 4cfe0d7f4c2c39dd90e27258aa448789f2ba4278 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski@arm.com>
Date: Tue, 17 Jun 2025 15:32:23 +0100
Subject: [PATCH 741/851] [flang][OpenMP] Support using copyprivate with
 fir.boxchar arguments (#144092)

Implement the lowering for passing a fir.boxchar argument to the
copyprivate clause.

Resolves https://github.com/llvm/llvm-project/issues/142123.

---------

Signed-off-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
---
 flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 50 ++++++++++++++++------
 flang/test/Lower/OpenMP/copyprivate5.f90   | 36 ++++++++++++++++
 2 files changed, 74 insertions(+), 12 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/copyprivate5.f90

diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index b5c8de8c2ce8b..bc8fc14bcaeb2 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -727,12 +727,15 @@ class TypeInfo {
   // Is the type inside a box?
   bool isBox() const { return inBox; }
 
+  bool isBoxChar() const { return inBoxChar; }
+
 private:
   void typeScan(mlir::Type type);
 
   std::optional<fir::CharacterType::LenType> charLen;
   llvm::SmallVector<int64_t> shape;
   bool inBox = false;
+  bool inBoxChar = false;
 };
 
 void TypeInfo::typeScan(mlir::Type ty) {
@@ -748,6 +751,9 @@ void TypeInfo::typeScan(mlir::Type ty) {
     typeScan(cty.getEleTy());
   } else if (auto cty = mlir::dyn_cast<fir::CharacterType>(ty)) {
     charLen = cty.getLen();
+  } else if (auto cty = mlir::dyn_cast<fir::BoxCharType>(ty)) {
+    inBoxChar = true;
+    typeScan(cty.getEleTy());
   } else if (auto hty = mlir::dyn_cast<fir::HeapType>(ty)) {
     typeScan(hty.getEleTy());
   } else if (auto pty = mlir::dyn_cast<fir::PointerType>(ty)) {
@@ -791,12 +797,6 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter,
   fir::FortranVariableFlagsAttr attrs;
   if (varAttrs != fir::FortranVariableFlagsEnum::None)
     attrs = fir::FortranVariableFlagsAttr::get(builder.getContext(), varAttrs);
-  llvm::SmallVector<mlir::Value> typeparams;
-  if (typeInfo.getCharLength().has_value()) {
-    mlir::Value charLen = builder.createIntegerConstant(
-        loc, builder.getCharacterLengthType(), *typeInfo.getCharLength());
-    typeparams.push_back(charLen);
-  }
   mlir::Value shape;
   if (!typeInfo.isBox() && !typeInfo.getShape().empty()) {
     llvm::SmallVector<mlir::Value> extents;
@@ -805,11 +805,34 @@ createCopyFunc(mlir::Location loc, lower::AbstractConverter &converter,
           builder.createIntegerConstant(loc, builder.getIndexType(), extent));
     shape = builder.create<fir::ShapeOp>(loc, extents);
   }
+  mlir::Value dst = funcOp.getArgument(0);
+  mlir::Value src = funcOp.getArgument(1);
+  llvm::SmallVector<mlir::Value> typeparams;
+  if (typeInfo.isBoxChar()) {
+    // fir.boxchar will be passed here as fir.ref<fir.boxchar>
+    auto loadDst = builder.create<fir::LoadOp>(loc, dst);
+    auto loadSrc = builder.create<fir::LoadOp>(loc, src);
+    // get the actual fir.ref<fir.char> type
+    mlir::Type refType =
+        fir::ReferenceType::get(mlir::cast<fir::BoxCharType>(eleTy).getEleTy());
+    auto unboxedDst = builder.create<fir::UnboxCharOp>(
+        loc, refType, builder.getIndexType(), loadDst);
+    auto unboxedSrc = builder.create<fir::UnboxCharOp>(
+        loc, refType, builder.getIndexType(), loadSrc);
+    // Add length to type parameters
+    typeparams.push_back(unboxedDst.getResult(1));
+    dst = unboxedDst.getResult(0);
+    src = unboxedSrc.getResult(0);
+  } else if (typeInfo.getCharLength().has_value()) {
+    mlir::Value charLen = builder.createIntegerConstant(
+        loc, builder.getCharacterLengthType(), *typeInfo.getCharLength());
+    typeparams.push_back(charLen);
+  }
   auto declDst = builder.create<hlfir::DeclareOp>(
-      loc, funcOp.getArgument(0), copyFuncName + "_dst", shape, typeparams,
+      loc, dst, copyFuncName + "_dst", shape, typeparams,
       /*dummy_scope=*/nullptr, attrs);
   auto declSrc = builder.create<hlfir::DeclareOp>(
-      loc, funcOp.getArgument(1), copyFuncName + "_src", shape, typeparams,
+      loc, src, copyFuncName + "_src", shape, typeparams,
       /*dummy_scope=*/nullptr, attrs);
   converter.copyVar(loc, declDst.getBase(), declSrc.getBase(), varAttrs);
   builder.create<mlir::func::ReturnOp>(loc);
@@ -835,10 +858,13 @@ bool ClauseProcessor::processCopyprivate(
 
     // CopyPrivate variables must be passed by reference. However, in the case
     // of assumed shapes/vla the type is not a !fir.ref, but a !fir.box.
-    // In these cases to retrieve the appropriate !fir.ref<!fir.box<...>> to
-    // access the data we need we must perform an alloca and then store to it
-    // and retrieve the data from the new alloca.
-    if (mlir::isa<fir::BaseBoxType>(symType)) {
+    // In the case of character types, the passed in type can also be
+    // !fir.boxchar. In these cases to retrieve the appropriate
+    // !fir.ref<!fir.box<...>> or !fir.ref<!fir.boxchar<..>> to access the data
+    // we need we must perform an alloca and then store to it and retrieve the
+    // data from the new alloca.
+    if (mlir::isa<fir::BaseBoxType>(symType) ||
+        mlir::isa<fir::BoxCharType>(symType)) {
       fir::FirOpBuilder &builder = converter.getFirOpBuilder();
       auto alloca = builder.create<fir::AllocaOp>(currentLocation, symType);
       builder.create<fir::StoreOp>(currentLocation, symVal, alloca);
diff --git a/flang/test/Lower/OpenMP/copyprivate5.f90 b/flang/test/Lower/OpenMP/copyprivate5.f90
new file mode 100644
index 0000000000000..c75eb82a45e9f
--- /dev/null
+++ b/flang/test/Lower/OpenMP/copyprivate5.f90
@@ -0,0 +1,36 @@
+! Test lowering of COPYPRIVATE with character arguments
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+! Testcase from: https://github.com/llvm/llvm-project/issues/142123
+
+! CHECK-LABEL:  func.func private @_copy_boxchar_c8xU(
+! CHECK-SAME:     %arg0: [[TYPE:!fir.ref<!fir.boxchar<1>>]],
+! CHECK-SAME:     %arg1: [[TYPE]]) attributes {llvm.linkage = #llvm.linkage<internal>} {
+! CHECK:    %[[RDST:.*]] = fir.load %arg0 : [[TYPE]]
+! CHECK:    %[[RSRC:.*]] = fir.load %arg1 : [[TYPE]]
+! CHECK:    %[[UDST:.*]]:2 = fir.unboxchar %[[RDST:.*]] : ([[UTYPE:!fir.boxchar<1>]]) -> ([[RTYPE:!fir.ref<!fir.char<1,\?>>]], [[ITYPE:index]])
+! CHECK:    %[[USRC:.*]]:2 = fir.unboxchar %[[RSRC:.*]] : ([[UTYPE]]) -> ([[RTYPE]], [[ITYPE]])
+! CHECK:    %[[DST:.*]]:2 = hlfir.declare %[[UDST:.*]]#0 typeparams %[[UDST:.*]]#1 {uniq_name = "[[NAME1:.*]]"} : ([[RTYPE]], [[ITYPE]]) -> ([[UTYPE]], [[RTYPE]])
+! CHECK:    %[[SRC:.*]]:2 = hlfir.declare %[[USRC:.*]]#0 typeparams %[[UDST:.*]]#1 {uniq_name = "[[NAME2:.*]]"} : ([[RTYPE]], [[ITYPE]]) -> ([[UTYPE]], [[RTYPE]])
+! CHECK:    hlfir.assign %[[SRC:.*]]#0 to %[[DST:.*]]#0 : [[UTYPE]], [[UTYPE]]
+! CHECK:    return
+! CHECK:  }
+
+! CHECK-LABEL: func.func @_QPs(%arg0: !fir.boxchar<1> {fir.bindc_name = "c"}) {
+! CHECK: %[[ALLOC:.*]] = fir.alloca !fir.boxchar<1>
+! CHECK: fir.store %[[SRC:.*]] to %[[ALLOC:.*]] : !fir.ref<!fir.boxchar<1>>
+! CHECK: omp.single copyprivate([[ALLOC:.*]] -> @_copy_boxchar_c8xU : !fir.ref<!fir.boxchar<1>>) {
+! CHECK:   hlfir.assign %[[NEW_VAL:.*]] to %[[SRC:.*]] : !fir.ref<!fir.char<1,3>>, !fir.boxchar<1>
+! CHECK:   omp.terminator
+! CHECK: }
+
+subroutine s(c)
+character(*) :: c
+!$omp single copyprivate(c)
+c = "bar"
+!$omp end single
+end subroutine
+
+character(len=3) :: c
+call s(c)
+end

From 549bc55cc39bb9fb22df464bcf3b7d4d4a5ff507 Mon Sep 17 00:00:00 2001
From: Davide Grohmann <davide.grohmann@arm.com>
Date: Tue, 17 Jun 2025 16:35:14 +0200
Subject: [PATCH 742/851] [mlir][spirv] Fix int type declaration duplication
 when serializing (#143108)

At the MLIR level unsigned integer and signless integers are different
types. Indeed when looking up the two types in type definition cache
they do not match.

Hence when translating a SPIR-V module which contains both usign and
signless integers will contain the same type declaration twice
(something like OpTypeInt 32 0) which is not permitted in SPIR-V and
such generated modules fail validation.

This patch solves the problem by mapping unisgned integer types to
singless integer types before looking up in the type definition cache.

---------

Signed-off-by: Davide Grohmann <davide.grohmann@arm.com>
---
 mlir/lib/Target/SPIRV/Serialization/Serializer.cpp | 13 +++++++++++++
 mlir/test/CMakeLists.txt                           |  6 ++++++
 mlir/test/Target/SPIRV/constant.mlir               |  5 ++++-
 mlir/test/lit.cfg.py                               |  1 +
 mlir/test/lit.local.cfg                            |  7 +++++++
 mlir/test/lit.site.cfg.py.in                       |  4 +++-
 6 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 mlir/test/lit.local.cfg

diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
index d258bfd852961..56c64f38fe29a 100644
--- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp
@@ -446,6 +446,19 @@ LogicalResult Serializer::processType(Location loc, Type type,
 LogicalResult
 Serializer::processTypeImpl(Location loc, Type type, uint32_t &typeID,
                             SetVector<StringRef> &serializationCtx) {
+
+  // Map unsigned integer types to singless integer types.
+  // This is needed otherwise the generated spirv assembly will contain
+  // twice a type declaration (like OpTypeInt 32 0) which is no permitted and
+  // such module fails validation. Indeed at MLIR level the two types are
+  // different and lookup in the cache below misses.
+  // Note: This conversion needs to happen here before the type is looked up in
+  // the cache.
+  if (type.isUnsignedInteger()) {
+    type = IntegerType::get(loc->getContext(), type.getIntOrFloatBitWidth(),
+                            IntegerType::SignednessSemantics::Signless);
+  }
+
   typeID = getTypeID(type);
   if (typeID)
     return success();
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index ac8b44f53aebf..89568e7766ae5 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -68,6 +68,7 @@ endif()
 llvm_canonicalize_cmake_booleans(
   LLVM_BUILD_EXAMPLES
   LLVM_HAS_NVPTX_TARGET
+  LLVM_INCLUDE_SPIRV_TOOLS_TESTS
   MLIR_ENABLE_BINDINGS_PYTHON
   MLIR_ENABLE_CUDA_RUNNER
   MLIR_ENABLE_ROCM_CONVERSIONS
@@ -217,6 +218,11 @@ if(MLIR_ENABLE_BINDINGS_PYTHON)
   )
 endif()
 
+if (LLVM_INCLUDE_SPIRV_TOOLS_TESTS)
+  list(APPEND MLIR_TEST_DEPENDS spirv-as)
+  list(APPEND MLIR_TEST_DEPENDS spirv-val)
+endif()
+
 # This target can be used to just build the dependencies
 # for the check-mlir target without executing the tests.
 # This is useful for bots when splitting the build step
diff --git a/mlir/test/Target/SPIRV/constant.mlir b/mlir/test/Target/SPIRV/constant.mlir
index 8d4e53418b70f..50d9b09ee0042 100644
--- a/mlir/test/Target/SPIRV/constant.mlir
+++ b/mlir/test/Target/SPIRV/constant.mlir
@@ -1,6 +1,7 @@
 // RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s
+// RUN: %if spirv-tools %{ mlir-translate -no-implicit-module -serialize-spirv %s | spirv-val %}
 
-spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
+spirv.module Logical Vulkan requires #spirv.vce<v1.3, [VulkanMemoryModel, Shader, Int64, Int16, Int8, Float64, Float16, CooperativeMatrixKHR], [SPV_KHR_vulkan_memory_model, SPV_KHR_cooperative_matrix]> {
   // CHECK-LABEL: @bool_const
   spirv.func @bool_const() -> () "None" {
     // CHECK: spirv.Constant true
@@ -305,4 +306,6 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Shader], []> {
     %coop = spirv.Constant dense<4> : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc>
     spirv.ReturnValue %coop : !spirv.coopmatrix<16x16xi8, Subgroup, MatrixAcc>
   }
+
+  spirv.EntryPoint "GLCompute" @bool_const
 }
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 9b5cadd62befc..a6f1ac0d568f4 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -332,6 +332,7 @@ def find_real_python_interpreter():
 else:
     config.available_features.add("noasserts")
 
+config.targets = frozenset(config.targets_to_build.split())
 
 def have_host_jit_feature_support(feature_name):
     mlir_runner_exe = lit.util.which("mlir-runner", config.mlir_tools_dir)
diff --git a/mlir/test/lit.local.cfg b/mlir/test/lit.local.cfg
new file mode 100644
index 0000000000000..167c454db5184
--- /dev/null
+++ b/mlir/test/lit.local.cfg
@@ -0,0 +1,7 @@
+if not "SPIRV" in config.root.targets:
+    config.unsupported = True
+
+if config.spirv_tools_tests:
+    config.available_features.add("spirv-tools")
+    config.substitutions.append(("spirv-as", os.path.join(config.llvm_tools_dir, "spirv-as")))
+    config.substitutions.append(("spirv-val", os.path.join(config.llvm_tools_dir, "spirv-val")))
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 132aabe135940..77f24e0f29b09 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -5,6 +5,8 @@ import sys
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
+config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
+config.targets_to_build = "@TARGETS_TO_BUILD@"
 config.llvm_shlib_ext = "@SHLIBEXT@"
 config.llvm_shlib_dir = lit_config.substitute(path(r"@SHLIBDIR@"))
 config.python_executable = "@Python3_EXECUTABLE@"
@@ -41,7 +43,7 @@ config.mlir_run_amx_tests = @MLIR_RUN_AMX_TESTS@
 config.mlir_run_arm_sve_tests = @MLIR_RUN_ARM_SVE_TESTS@
 # This is a workaround for the fact that LIT's:
 #   %if <cond>
-# requires <cond> to be in the set of available features. 
+# requires <cond> to be in the set of available features.
 # TODO: Update LIT's TestRunner so that this is not required.
 if config.mlir_run_arm_sve_tests:
     config.available_features.add("mlir_arm_sve_tests")

From 7ec103a984ff114d24f26d935fe2292379269b53 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <k.basioli@gmail.com>
Date: Tue, 17 Jun 2025 15:52:33 +0100
Subject: [PATCH 743/851] Port #143108 to bazel (#144538)

---
 utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 53405a0dea24a..a2fb5ade73247 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -37,6 +37,7 @@ expand_template(
         # All disabled, but required to substituted because they are not in quotes.
         "@LLVM_BUILD_EXAMPLES@": "0",
         "@LLVM_HAS_NVPTX_TARGET@": "0",
+        "@LLVM_INCLUDE_SPIRV_TOOLS_TESTS@": "0",
         "@MLIR_ENABLE_CUDA_RUNNER@": "0",
         "@MLIR_ENABLE_ROCM_CONVERSIONS@": "0",
         "@MLIR_ENABLE_ROCM_RUNNER@": "0",

From 9eb0020555fc643582b2802abb8c1bc92059c248 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Tue, 17 Jun 2025 15:55:14 +0100
Subject: [PATCH 744/851] [DebugInfo][RemoveDIs] Remove a swathe of
 debug-intrinsic code (#144389)

Seeing how we can't generate any debug intrinsics any more: delete a
variety of codepaths where they're handled. For the most part these are
plain deletions, in others I've tweaked comments to remain coherent, or
added a type to (what was) type-generic-lambdas.

This isn't all the DbgInfoIntrinsic call sites but it's most of the
simple scenarios.

Co-authored-by: Nikita Popov <github@npopov.com>
---
 .../llvm/Analysis/IRSimilarityIdentifier.h    |   4 -
 llvm/include/llvm/Analysis/PtrUseVisitor.h    |   1 -
 llvm/include/llvm/IR/InstVisitor.h            |  10 --
 llvm/include/llvm/Transforms/Utils/Local.h    |   7 +-
 llvm/lib/Analysis/AliasSetTracker.cpp         |   3 -
 llvm/lib/Analysis/CallGraph.cpp               |   5 +-
 llvm/lib/Analysis/DemandedBits.cpp            |   3 +-
 llvm/lib/Analysis/Loads.cpp                   |   2 +-
 .../lib/Analysis/MemoryDependenceAnalysis.cpp |   8 -
 llvm/lib/Analysis/ValueTracking.cpp           |   6 -
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  14 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |   5 +-
 .../CodeGen/SelectionDAG/SelectionDAGISel.cpp |   1 -
 llvm/lib/IR/DebugInfo.cpp                     |   5 -
 .../Target/AArch64/AArch64StackTagging.cpp    |   3 +-
 .../Hexagon/HexagonLoopIdiomRecognition.cpp   |   2 +-
 llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp   |   5 +-
 .../AggressiveInstCombine.cpp                 |   4 +-
 llvm/lib/Transforms/IPO/IROutliner.cpp        |  11 --
 .../lib/Transforms/IPO/SampleProfileProbe.cpp |   3 +-
 .../InstCombine/InstructionCombining.cpp      |  12 +-
 .../Instrumentation/GCOVProfiling.cpp         |   8 -
 .../Instrumentation/ThreadSanitizer.cpp       |   3 +-
 llvm/lib/Transforms/Scalar/ADCE.cpp           |  15 +-
 llvm/lib/Transforms/Scalar/GVN.cpp            |   7 +-
 llvm/lib/Transforms/Scalar/GVNHoist.cpp       |   3 +-
 llvm/lib/Transforms/Scalar/LICM.cpp           |   4 -
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |   6 +-
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp |   2 +-
 .../Scalar/SpeculativeExecution.cpp           |   8 +-
 .../Scalar/TailRecursionElimination.cpp       |  17 +-
 llvm/lib/Transforms/Utils/CodeExtractor.cpp   |  51 +-----
 llvm/lib/Transforms/Utils/Debugify.cpp        |  16 +-
 llvm/lib/Transforms/Utils/Evaluator.cpp       |   7 -
 llvm/lib/Transforms/Utils/InlineFunction.cpp  |   9 +-
 llvm/lib/Transforms/Utils/Local.cpp           |  11 +-
 .../Transforms/Utils/LoopRotationUtils.cpp    |  44 ++---
 .../Utils/ScalarEvolutionExpander.cpp         |  15 +-
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 159 ++++++------------
 .../Vectorize/LoopVectorizationLegality.cpp   |   2 -
 .../Transforms/Vectorize/SLPVectorizer.cpp    |   3 -
 41 files changed, 104 insertions(+), 400 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
index dfda2dcee0db1..09a8875e1e28c 100644
--- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
+++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h
@@ -545,10 +545,6 @@ struct IRInstructionMapper {
     // dependent.
     InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
     InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
-    // DebugInfo should be included in the regions, but should not be
-    // analyzed for similarity as it has no bearing on the outcome of the
-    // program.
-    InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
     InstrType visitIntrinsicInst(IntrinsicInst &II) {
       // These are disabled due to complications in the CodeExtractor when
       // outlining these instructions.  For instance, It is unclear what we
diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index c9d3874e7dd96..0858d8aee2186 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -285,7 +285,6 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
 
   // No-op intrinsics which we know don't escape the pointer to logic in
   // some other function.
-  void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) {}
   void visitMemIntrinsic(MemIntrinsic &I) {}
   void visitIntrinsicInst(IntrinsicInst &II) {
     switch (II.getIntrinsicID()) {
diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h
index b4eb729c7ce38..6d5398bb7a4cd 100644
--- a/llvm/include/llvm/IR/InstVisitor.h
+++ b/llvm/include/llvm/IR/InstVisitor.h
@@ -199,13 +199,6 @@ class InstVisitor {
   RetTy visitCatchPadInst(CatchPadInst &I)     { DELEGATE(FuncletPadInst); }
   RetTy visitFreezeInst(FreezeInst &I)         { DELEGATE(Instruction); }
 
-  // Handle the special intrinsic instruction classes.
-  RetTy visitDbgDeclareInst(DbgDeclareInst &I)    { DELEGATE(DbgVariableIntrinsic);}
-  RetTy visitDbgValueInst(DbgValueInst &I)        { DELEGATE(DbgVariableIntrinsic);}
-  RetTy visitDbgVariableIntrinsic(DbgVariableIntrinsic &I)
-                                                  { DELEGATE(DbgInfoIntrinsic);}
-  RetTy visitDbgLabelInst(DbgLabelInst &I)        { DELEGATE(DbgInfoIntrinsic);}
-  RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); }
   RetTy visitMemSetInst(MemSetInst &I)            { DELEGATE(MemIntrinsic); }
   RetTy visitMemSetPatternInst(MemSetPatternInst &I) {
     DELEGATE(IntrinsicInst);
@@ -286,9 +279,6 @@ class InstVisitor {
     if (const Function *F = I.getCalledFunction()) {
       switch (F->getIntrinsicID()) {
       default:                     DELEGATE(IntrinsicInst);
-      case Intrinsic::dbg_declare: DELEGATE(DbgDeclareInst);
-      case Intrinsic::dbg_value:   DELEGATE(DbgValueInst);
-      case Intrinsic::dbg_label:   DELEGATE(DbgLabelInst);
       case Intrinsic::memcpy:
       case Intrinsic::memcpy_inline:
         DELEGATE(MemCpyInst);
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 55e153f289590..df146458b4e6f 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -394,12 +394,9 @@ handleUnreachableTerminator(Instruction *I,
                             SmallVectorImpl<Value *> &PoisonedValues);
 
 /// Remove all instructions from a basic block other than its terminator
-/// and any present EH pad instructions. Returns a pair where the first element
-/// is the number of instructions (excluding debug info intrinsics) that have
-/// been removed, and the second element is the number of debug info intrinsics
+/// and any present EH pad instructions. Returns the number of instructions
 /// that have been removed.
-LLVM_ABI std::pair<unsigned, unsigned>
-removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB);
+LLVM_ABI unsigned removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB);
 
 /// Insert an unreachable instruction before the specified
 /// instruction, making it and the rest of the code in the block dead.
diff --git a/llvm/lib/Analysis/AliasSetTracker.cpp b/llvm/lib/Analysis/AliasSetTracker.cpp
index 6d1dafbae60b9..1e2f05b60a9a3 100644
--- a/llvm/lib/Analysis/AliasSetTracker.cpp
+++ b/llvm/lib/Analysis/AliasSetTracker.cpp
@@ -343,9 +343,6 @@ void AliasSetTracker::add(AnyMemTransferInst *MTI) {
 }
 
 void AliasSetTracker::addUnknown(Instruction *Inst) {
-  if (isa<DbgInfoIntrinsic>(Inst))
-    return; // Ignore DbgInfo Intrinsics.
-
   if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
     // These intrinsics will show up as affecting memory, but they are just
     // markers.
diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp
index 5d1af52e8ab58..d7695e5cfc0d3 100644
--- a/llvm/lib/Analysis/CallGraph.cpp
+++ b/llvm/lib/Analysis/CallGraph.cpp
@@ -34,8 +34,7 @@ CallGraph::CallGraph(Module &M)
       CallsExternalNode(std::make_unique<CallGraphNode>(this, nullptr)) {
   // Add every interesting function to the call graph.
   for (Function &F : M)
-    if (!isDbgInfoIntrinsic(F.getIntrinsicID()))
-      addToCallGraph(&F);
+    addToCallGraph(&F);
 }
 
 CallGraph::CallGraph(CallGraph &&Arg)
@@ -101,7 +100,7 @@ void CallGraph::populateCallGraphNode(CallGraphNode *Node) {
         const Function *Callee = Call->getCalledFunction();
         if (!Callee)
           Node->addCalledFunction(Call, CallsExternalNode.get());
-        else if (!isDbgInfoIntrinsic(Callee->getIntrinsicID()))
+        else
           Node->addCalledFunction(Call, getOrInsertFunction(Callee));
 
         // Add reference to callback functions.
diff --git a/llvm/lib/Analysis/DemandedBits.cpp b/llvm/lib/Analysis/DemandedBits.cpp
index d7e2a3fa4fc59..6694d5cc06c8c 100644
--- a/llvm/lib/Analysis/DemandedBits.cpp
+++ b/llvm/lib/Analysis/DemandedBits.cpp
@@ -46,8 +46,7 @@ using namespace llvm::PatternMatch;
 #define DEBUG_TYPE "demanded-bits"
 
 static bool isAlwaysLive(Instruction *I) {
-  return I->isTerminator() || isa<DbgInfoIntrinsic>(I) || I->isEHPad() ||
-         I->mayHaveSideEffects();
+  return I->isTerminator() || I->isEHPad() || I->mayHaveSideEffects();
 }
 
 void DemandedBits::determineLiveOperandBits(
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 425f3682122cd..71a75b496455a 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -434,7 +434,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &S
     // If we see a free or a call which may write to memory (i.e. which might do
     // a free) the pointer could be marked invalid.
     if (isa<CallInst>(BBI) && BBI->mayWriteToMemory() &&
-        !isa<LifetimeIntrinsic>(BBI) && !isa<DbgInfoIntrinsic>(BBI))
+        !isa<LifetimeIntrinsic>(BBI))
       return false;
 
     Value *AccessedPtr;
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index f062189bac6a0..d6f490cb69a52 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -188,9 +188,6 @@ MemDepResult MemoryDependenceResults::getCallDependencyFrom(
   // Walk backwards through the block, looking for dependencies.
   while (ScanIt != BB->begin()) {
     Instruction *Inst = &*--ScanIt;
-    // Debug intrinsics don't cause dependences and should not affect Limit
-    if (isa<DbgInfoIntrinsic>(Inst))
-      continue;
 
     // Limit the amount of scanning we do so we don't end up with quadratic
     // running time on extreme testcases.
@@ -432,11 +429,6 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
   while (ScanIt != BB->begin()) {
     Instruction *Inst = &*--ScanIt;
 
-    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
-      // Debug intrinsics don't (and can't) cause dependencies.
-      if (isa<DbgInfoIntrinsic>(II))
-        continue;
-
     // Limit the amount of scanning we do so we don't end up with quadratic
     // running time on extreme testcases.
     --*Limit;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 9df667926faf0..a17417cb5189c 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -7846,8 +7846,6 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(
    iterator_range<BasicBlock::const_iterator> Range, unsigned ScanLimit) {
   assert(ScanLimit && "scan limit must be non-zero");
   for (const Instruction &I : Range) {
-    if (isa<DbgInfoIntrinsic>(I))
-        continue;
     if (--ScanLimit == 0)
       return false;
     if (!isGuaranteedToTransferExecutionToSuccessor(&I))
@@ -8050,8 +8048,6 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
     // well-defined operands.
 
     for (const auto &I : make_range(Begin, End)) {
-      if (isa<DbgInfoIntrinsic>(I))
-        continue;
       if (--ScanLimit == 0)
         break;
 
@@ -8076,8 +8072,6 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
 
   while (true) {
     for (const auto &I : make_range(Begin, End)) {
-      if (isa<DbgInfoIntrinsic>(I))
-        continue;
       if (--ScanLimit == 0)
         return false;
       if (mustTriggerUB(&I, YieldsPoison))
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 3792b456c836e..43574a54c37dd 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -896,12 +896,7 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
   BasicBlock::iterator BBI = BI->getIterator();
   if (BBI != BB->begin()) {
     --BBI;
-    while (isa<DbgInfoIntrinsic>(BBI)) {
-      if (BBI == BB->begin())
-        break;
-      --BBI;
-    }
-    if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+    if (!isa<PHINode>(BBI))
       return nullptr;
   }
 
@@ -2981,10 +2976,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB,
   // Make sure there are no instructions between the first instruction
   // and return.
   BasicBlock::const_iterator BI = BB->getFirstNonPHIIt();
-  // Skip over debug and the bitcast.
-  while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI ||
-         isa<PseudoProbeInst>(BI) || isLifetimeEndOrBitCastFor(&*BI) ||
-         isFakeUse(&*BI))
+  // Skip over pseudo-probes and the bitcast.
+  while (&*BI == BCI || &*BI == EVI || isa<PseudoProbeInst>(BI) ||
+         isLifetimeEndOrBitCastFor(&*BI) || isFakeUse(&*BI))
     BI = std::next(BI);
   if (&*BI != RetI)
     return false;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 4f548cbad5c30..ec0c5473b0db0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1320,10 +1320,7 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
     HandlePHINodesInSuccessorBlocks(I.getParent());
   }
 
-  // Increase the SDNodeOrder if dealing with a non-debug instruction.
-  if (!isa<DbgInfoIntrinsic>(I))
-    ++SDNodeOrder;
-
+  ++SDNodeOrder;
   CurInst = &I;
 
   // Set inserted listener only if required.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index b02a03c0b0cb2..ac6d25f141ec6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1507,7 +1507,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I,
                                       const FunctionLoweringInfo &FuncInfo) {
   return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded.
          !I->isTerminator() &&     // Terminators aren't folded.
-         !isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded.
          !I->isEHPad() &&             // EH pad instructions aren't folded.
          !FuncInfo.isExportedInst(I); // Exported instrs must be computed.
 }
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index e6b1f76dfacf6..196fe294a274b 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -586,11 +586,6 @@ bool llvm::stripDebugInfo(Function &F) {
   DenseMap<MDNode *, MDNode *> LoopIDsMap;
   for (BasicBlock &BB : F) {
     for (Instruction &I : llvm::make_early_inc_range(BB)) {
-      if (isa<DbgInfoIntrinsic>(&I)) {
-        I.eraseFromParent();
-        Changed = true;
-        continue;
-      }
       if (I.getDebugLoc()) {
         Changed = true;
         I.setDebugLoc(DebugLoc());
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 0c0b512e3b6ce..75c7dd944b467 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -369,8 +369,7 @@ Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst,
 
   unsigned Count = 0;
   for (; Count < ClScanLimit && !BI->isTerminator(); ++BI) {
-    if (!isa<DbgInfoIntrinsic>(*BI))
-      ++Count;
+    ++Count;
 
     if (isNoModRef(AA->getModRefInfo(&*BI, AllocaLoc)))
       continue;
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 9604f252dd3df..c2eb24b482d44 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -2318,7 +2318,7 @@ bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
   // instructions in it that are not involved in the original set Insts.
   for (auto *B : L->blocks()) {
     for (auto &In : *B) {
-      if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
+      if (isa<BranchInst>(In))
         continue;
       if (!Worklist.count(&In) && In.mayHaveSideEffects())
         return false;
diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index 47bb20f4aa073..d0a5be8b2e23a 100644
--- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -117,7 +117,7 @@ class PPCBoolRetToInt : public FunctionPass {
 
   // A PHINode is Promotable if:
   // 1. Its type is i1 AND
-  // 2. All of its uses are ReturnInt, CallInst, PHINode, or DbgInfoIntrinsic
+  // 2. All of its uses are ReturnInt, CallInst, or PHINode
   // AND
   // 3. All of its operands are Constant or Argument or
   //    CallInst or PHINode AND
@@ -136,8 +136,7 @@ class PPCBoolRetToInt : public FunctionPass {
     for (const PHINode *P : Promotable) {
       // Condition 2 and 3
       auto IsValidUser = [] (const Value *V) -> bool {
-        return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) ||
-        isa<DbgInfoIntrinsic>(V);
+        return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V);
       };
       auto IsValidOperand = [] (const Value *V) -> bool {
         return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) ||
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index f62361d334704..8c156c93ba8d1 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -719,9 +719,7 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
     if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc)))
       return false;
 
-    // Ignore debug info so that's not counted against MaxInstrsToScan.
-    // Otherwise debug info could affect codegen.
-    if (!isa<DbgInfoIntrinsic>(Inst) && ++NumScanned > MaxInstrsToScan)
+    if (++NumScanned > MaxInstrsToScan)
       return false;
   }
 
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index cb18b55ae2183..2c17863266a97 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -717,8 +717,6 @@ static void moveFunctionData(Function &Old, Function &New,
     if (ReturnInst *RI = dyn_cast<ReturnInst>(I))
       NewEnds.insert(std::make_pair(RI->getReturnValue(), &CurrBB));
 
-    std::vector<Instruction *> DebugInsts;
-
     for (Instruction &Val : CurrBB) {
       // Since debug-info originates from many different locations in the
       // program, it will cause incorrect reporting from a debugger if we keep
@@ -749,21 +747,12 @@ static void moveFunctionData(Function &Old, Function &New,
       // From this point we are only handling call instructions.
       CallInst *CI = cast<CallInst>(&Val);
 
-      // Collect debug intrinsics for later removal.
-      if (isa<DbgInfoIntrinsic>(CI)) {
-        DebugInsts.push_back(&Val);
-        continue;
-      }
-
       // Edit the scope of called functions inside of outlined functions.
       if (DISubprogram *SP = New.getSubprogram()) {
         DILocation *DI = DILocation::get(New.getContext(), 0, 0, SP);
         Val.setDebugLoc(DI);
       }
     }
-
-    for (Instruction *I : DebugInsts)
-      I->eraseFromParent();
   }
 }
 
diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
index dda3d5a788157..7fd7d4d4f750b 100644
--- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp
@@ -385,8 +385,7 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) {
     // line number. Real instructions generated by optimizations may not come
     // with a line number either.
     auto HasValidDbgLine = [](Instruction *J) {
-      return !isa<PHINode>(J) && !isa<DbgInfoIntrinsic>(J) &&
-             !J->isLifetimeStartOrEnd() && J->getDebugLoc();
+      return !isa<PHINode>(J) && !J->isLifetimeStartOrEnd() && J->getDebugLoc();
     };
 
     Instruction *J = &*BB->getFirstInsertionPt();
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 4fe900e9421f8..e2cd2a59fab91 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -4787,11 +4787,7 @@ bool InstCombinerImpl::freezeOtherUses(FreezeInst &FI) {
     MoveBefore = *MoveBeforeOpt;
   }
 
-  // Don't move to the position of a debug intrinsic.
-  if (isa<DbgInfoIntrinsic>(MoveBefore))
-    MoveBefore = MoveBefore->getNextNonDebugInstruction()->getIterator();
-  // Re-point iterator to come after any debug-info records, if we're
-  // running in "RemoveDIs" mode
+  // Re-point iterator to come after any debug-info records.
   MoveBefore.setHeadBit(false);
 
   bool Changed = false;
@@ -5582,11 +5578,9 @@ bool InstCombinerImpl::prepareWorklist(Function &F) {
       continue;
 
     unsigned NumDeadInstInBB;
-    unsigned NumDeadDbgInstInBB;
-    std::tie(NumDeadInstInBB, NumDeadDbgInstInBB) =
-        removeAllNonTerminatorAndEHPadInstructions(&BB);
+    NumDeadInstInBB = removeAllNonTerminatorAndEHPadInstructions(&BB);
 
-    MadeIRChange |= NumDeadInstInBB + NumDeadDbgInstInBB > 0;
+    MadeIRChange |= NumDeadInstInBB != 0;
     NumDeadInst += NumDeadInstInBB;
   }
 
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 9351a42581ba0..3dfb36f4f1815 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -583,10 +583,6 @@ static bool functionHasLines(const Function &F, unsigned &EndLine) {
   EndLine = 0;
   for (const auto &BB : F) {
     for (const auto &I : BB) {
-      // Debug intrinsic locations correspond to the location of the
-      // declaration, not necessarily any statements or expressions.
-      if (isa<DbgInfoIntrinsic>(&I)) continue;
-
       const DebugLoc &Loc = I.getDebugLoc();
       if (!Loc)
         continue;
@@ -874,10 +870,6 @@ bool GCOVProfiler::emitProfileNotes(
         }
 
         for (const auto &I : BB) {
-          // Debug intrinsic locations correspond to the location of the
-          // declaration, not necessarily any statements or expressions.
-          if (isa<DbgInfoIntrinsic>(&I)) continue;
-
           const DebugLoc &Loc = I.getDebugLoc();
           if (!Loc)
             continue;
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index ec9f78edfeb1c..8ae6f7745a9e7 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -527,8 +527,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
         AtomicAccesses.push_back(&Inst);
       else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
         LocalLoadsAndStores.push_back(&Inst);
-      else if ((isa<CallInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) ||
-               isa<InvokeInst>(Inst)) {
+      else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
         if (CallInst *CI = dyn_cast<CallInst>(&Inst))
           maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
         if (isa<MemIntrinsic>(Inst))
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index ea907af96edd9..985b9c0e53125 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -562,20 +562,7 @@ ADCEChanged AggressiveDeadCodeElimination::removeDeadInstructions() {
     if (isLive(&I))
       continue;
 
-    if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
-      // Avoid removing a dbg.assign that is linked to instructions because it
-      // holds information about an existing store.
-      if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DII))
-        if (!at::getAssignmentInsts(DAI).empty())
-          continue;
-      // Check if the scope of this variable location is alive.
-      if (AliveScopes.count(DII->getDebugLoc()->getScope()))
-        continue;
-
-      // Fallthrough and drop the intrinsic.
-    } else {
-      Changed.ChangedNonDebugInstr = true;
-    }
+    Changed.ChangedNonDebugInstr = true;
 
     // Prepare to delete.
     Worklist.push_back(&I);
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index c580dd4ff230a..d9d05c3e8cc49 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2684,10 +2684,6 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
 /// When calculating availability, handle an instruction
 /// by inserting it into the appropriate sets.
 bool GVNPass::processInstruction(Instruction *I) {
-  // Ignore dbg info intrinsics.
-  if (isa<DbgInfoIntrinsic>(I))
-    return false;
-
   // If the instruction can be easily simplified then do so now in preference
   // to value numbering it.  Value numbering often exposes redundancies, for
   // example if it determines that %y is equal to %x then the instruction
@@ -2974,8 +2970,7 @@ bool GVNPass::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
 bool GVNPass::performScalarPRE(Instruction *CurInst) {
   if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
       isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
-      CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
-      isa<DbgInfoIntrinsic>(CurInst))
+      CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects())
     return false;
 
   // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index 1c2e1531e47d8..0acbaf58a8f74 100644
--- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -1166,8 +1166,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
         SI.insert(Store, VN);
       else if (auto *Call = dyn_cast<CallInst>(&I1)) {
         if (auto *Intr = dyn_cast<IntrinsicInst>(Call)) {
-          if (isa<DbgInfoIntrinsic>(Intr) ||
-              Intr->getIntrinsicID() == Intrinsic::assume ||
+          if (Intr->getIntrinsicID() == Intrinsic::assume ||
               Intr->getIntrinsicID() == Intrinsic::sideeffect)
             continue;
         }
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index bd59caa6a959a..abb6ff1dcfe6c 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1204,10 +1204,6 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
 
     return !Invalidated;
   } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
-    // Don't sink or hoist dbg info; it's legal, but not useful.
-    if (isa<DbgInfoIntrinsic>(I))
-      return false;
-
     // Don't sink calls which can throw.
     if (CI->mayThrow())
       return false;
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index e4f35e4b2108b..4ba69034d6448 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -5613,8 +5613,7 @@ BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
     }
   }
 
-  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
-         && !isa<DbgInfoIntrinsic>(LowestIP) &&
+  assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad() &&
          "Insertion point must be a normal instruction");
 
   // Then, climb up the immediate dominator tree as far as we can go while
@@ -5627,9 +5626,6 @@ BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
   // Ignore landingpad instructions.
   while (IP->isEHPad()) ++IP;
 
-  // Ignore debug intrinsics.
-  while (isa<DbgInfoIntrinsic>(IP)) ++IP;
-
   // Set IP below instructions recently inserted by SCEVExpander. This keeps the
   // IP consistent across expansions and allows the previously inserted
   // instructions to be reused by subsequent expansion.
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index afa7abfea419e..a22d84dcf014d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -551,7 +551,7 @@ static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
       for (Instruction &I : *BB) {
         // These won't get into the final code - don't even try calculating the
         // cost for them.
-        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
+        if (EphValues.count(&I))
           continue;
 
         // Track this instruction's expected baseline cost when executing the
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index cb202f5f71b91..f053e202655be 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -296,10 +296,6 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
   };
   auto AllPrecedingUsesFromBlockHoisted =
       [&HasNoUnhoistedInstr](const User *U) {
-        // Do not hoist any debug info intrinsics.
-        if (isa<DbgInfoIntrinsic>(U))
-          return false;
-
         return HasNoUnhoistedInstr(U->operand_values());
       };
 
@@ -313,9 +309,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
       if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
         return false;  // too much to hoist
     } else {
-      // Debug info intrinsics should not be counted for threshold.
-      if (!isa<DbgInfoIntrinsic>(I))
-        NotHoistedInstCount++;
+      NotHoistedInstCount++;
       if (NotHoistedInstCount > SpecExecMaxNotHoisted)
         return false; // too much left behind
       NotHoisted.insert(&I);
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index c71c5a70a12fd..e7d989a43840d 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -239,8 +239,7 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
       // A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
       // considered accessing memory and will be marked as a tail call if we
       // don't bail out here.
-      if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
-          isa<PseudoProbeInst>(&I))
+      if (!CI || CI->isTailCall() || isa<PseudoProbeInst>(&I))
         continue;
 
       // Bail out for intrinsic stackrestore call because it can modify
@@ -335,9 +334,6 @@ static bool markTails(Function &F, OptimizationRemarkEmitter *ORE) {
 /// instructions between the call and this instruction are movable.
 ///
 static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
-  if (isa<DbgInfoIntrinsic>(I))
-    return true;
-
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
     if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
         llvm::findAllocaForValue(II->getArgOperand(1)))
@@ -396,12 +392,6 @@ static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
   return true;
 }
 
-static Instruction *firstNonDbg(BasicBlock::iterator I) {
-  while (isa<DbgInfoIntrinsic>(I))
-    ++I;
-  return &*I;
-}
-
 namespace {
 class TailRecursionEliminator {
   Function &F;
@@ -493,9 +483,8 @@ CallInst *TailRecursionEliminator::findTRECandidate(BasicBlock *BB) {
   //   double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
   // and disable this xform in this case, because the code generator will
   // lower the call to fabs into inline code.
-  if (BB == &F.getEntryBlock() &&
-      firstNonDbg(BB->front().getIterator()) == CI &&
-      firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+  if (BB == &F.getEntryBlock() && &BB->front() == CI &&
+      &*std::next(BB->begin()) == TI && CI->getCalledFunction() &&
       !TTI->isLoweredToCall(CI->getCalledFunction())) {
     // A single-block function with just a call and a return. Check that
     // the arguments match.
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 1210bdf4a1c98..9883974c55e3b 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -471,10 +471,6 @@ CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC,
         Info.LifeEnd = IntrInst;
         continue;
       }
-      // At this point, permit debug uses outside of the region.
-      // This is fixed in a later call to fixupDebugInfoPostExtraction().
-      if (isa<DbgInfoIntrinsic>(IntrInst))
-        continue;
     }
     // Find untracked uses of the address, bail.
     if (!definedInRegion(Blocks, U))
@@ -1077,10 +1073,6 @@ static void applyFirstDebugLoc(Function *oldFunction,
       return any_of(*BB, [&BranchI](const Instruction &I) {
         if (!I.getDebugLoc())
           return false;
-        // Don't use source locations attached to debug-intrinsics: they could
-        // be from completely unrelated scopes.
-        if (isa<DbgInfoIntrinsic>(I))
-          return false;
         BranchI->setDebugLoc(I.getDebugLoc());
         return true;
       });
@@ -1329,7 +1321,6 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
   //  2) They need to point to fresh metadata, e.g. because they currently
   //     point to a variable in the wrong scope.
   SmallDenseMap<DINode *, DINode *> RemappedMetadata;
-  SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
   SmallVector<DbgVariableRecord *, 4> DVRsToDelete;
   DenseMap<const MDNode *, MDNode *> Cache;
 
@@ -1370,55 +1361,29 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
       }
 
       DbgVariableRecord &DVR = cast<DbgVariableRecord>(DR);
-      // Apply the two updates that dbg.values get: invalid operands, and
-      // variable metadata fixup.
+      // If any of the used locations are invalid, delete the record.
       if (any_of(DVR.location_ops(), IsInvalidLocation)) {
         DVRsToDelete.push_back(&DVR);
         continue;
       }
+
+      // DbgAssign intrinsics have an extra Value argument:
       if (DVR.isDbgAssign() && IsInvalidLocation(DVR.getAddress())) {
         DVRsToDelete.push_back(&DVR);
         continue;
       }
+
+      // If the variable was in the scope of the old function, i.e. it was not
+      // inlined, point the intrinsic to a fresh variable within the new
+      // function.
       if (!DVR.getDebugLoc().getInlinedAt())
         DVR.setVariable(GetUpdatedDIVariable(DVR.getVariable()));
     }
   };
 
-  for (Instruction &I : instructions(NewFunc)) {
+  for (Instruction &I : instructions(NewFunc))
     UpdateDbgRecordsOnInst(I);
 
-    auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
-    if (!DII)
-      continue;
-
-    // Point the intrinsic to a fresh label within the new function if the
-    // intrinsic was not inlined from some other function.
-    if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
-      UpdateDbgLabel(DLI);
-      continue;
-    }
-
-    auto *DVI = cast<DbgVariableIntrinsic>(DII);
-    // If any of the used locations are invalid, delete the intrinsic.
-    if (any_of(DVI->location_ops(), IsInvalidLocation)) {
-      DebugIntrinsicsToDelete.push_back(DVI);
-      continue;
-    }
-    // DbgAssign intrinsics have an extra Value argument:
-    if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI);
-        DAI && IsInvalidLocation(DAI->getAddress())) {
-      DebugIntrinsicsToDelete.push_back(DVI);
-      continue;
-    }
-    // If the variable was in the scope of the old function, i.e. it was not
-    // inlined, point the intrinsic to a fresh variable within the new function.
-    if (!DVI->getDebugLoc().getInlinedAt())
-      DVI->setVariable(GetUpdatedDIVariable(DVI->getVariable()));
-  }
-
-  for (auto *DII : DebugIntrinsicsToDelete)
-    DII->eraseFromParent();
   for (auto *DVR : DVRsToDelete)
     DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
   DIB.finalizeSubprogram(NewSP);
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index ff8a91bc7e7d4..c2dbdc57eb3b5 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -353,7 +353,7 @@ bool llvm::collectDebugInfoMetadata(Module &M,
 
         // Cllect dbg.values and dbg.declare.
         if (DebugifyLevel > Level::Locations) {
-          auto HandleDbgVariable = [&](auto *DbgVar) {
+          auto HandleDbgVariable = [&](DbgVariableRecord *DbgVar) {
             if (!SP)
               return;
             // Skip inlined variables.
@@ -368,14 +368,8 @@ bool llvm::collectDebugInfoMetadata(Module &M,
           };
           for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
             HandleDbgVariable(&DVR);
-          if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
-            HandleDbgVariable(DVI);
         }
 
-        // Skip debug instructions other than dbg.value and dbg.declare.
-        if (isa<DbgInfoIntrinsic>(&I))
-          continue;
-
         LLVM_DEBUG(dbgs() << "  Collecting info for inst: " << I << '\n');
         DebugInfoBeforePass.InstToDelete.insert({&I, &I});
 
@@ -597,7 +591,7 @@ bool llvm::checkDebugInfoMetadata(Module &M,
 
         // Collect dbg.values and dbg.declares.
         if (DebugifyLevel > Level::Locations) {
-          auto HandleDbgVariable = [&](auto *DbgVar) {
+          auto HandleDbgVariable = [&](DbgVariableRecord *DbgVar) {
             if (!SP)
               return;
             // Skip inlined variables.
@@ -612,14 +606,8 @@ bool llvm::checkDebugInfoMetadata(Module &M,
           };
           for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
             HandleDbgVariable(&DVR);
-          if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
-            HandleDbgVariable(DVI);
         }
 
-        // Skip debug instructions other than dbg.value and dbg.declare.
-        if (isa<DbgInfoIntrinsic>(&I))
-          continue;
-
         LLVM_DEBUG(dbgs() << "  Collecting info for inst: " << I << '\n');
 
         DebugInfoAfterPass.DILocations.insert({&I, hasLoc(I)});
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index d1db2ee29f3a2..3a5c7a3b1738e 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -353,13 +353,6 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
     } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
       CallBase &CB = *cast<CallBase>(&*CurInst);
 
-      // Debug info can safely be ignored here.
-      if (isa<DbgInfoIntrinsic>(CB)) {
-        LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
-        ++CurInst;
-        continue;
-      }
-
       // Cannot handle inline asm.
       if (CB.isInlineAsm()) {
         LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index f47c467d15140..7df5e9958182c 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1927,16 +1927,11 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
       }
     }
 
-    // Remove debug info intrinsics if we're not keeping inline info.
+    // Remove debug info records if we're not keeping inline info.
     if (NoInlineLineTables) {
       BasicBlock::iterator BI = FI->begin();
       while (BI != FI->end()) {
-        if (isa<DbgInfoIntrinsic>(BI)) {
-          BI = BI->eraseFromParent();
-          continue;
-        } else {
-          BI->dropDbgRecords();
-        }
+        BI->dropDbgRecords();
         ++BI;
       }
     }
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 33143700f5604..f5208d50c6aae 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2848,10 +2848,8 @@ bool llvm::handleUnreachableTerminator(
   return Changed;
 }
 
-std::pair<unsigned, unsigned>
-llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
+unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
   unsigned NumDeadInst = 0;
-  unsigned NumDeadDbgInst = 0;
   // Delete the instructions backwards, as it has a reduced likelihood of
   // having to update as many def-use and use-def chains.
   Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
@@ -2870,15 +2868,12 @@ llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
       EndInst = Inst;
       continue;
     }
-    if (isa<DbgInfoIntrinsic>(Inst))
-      ++NumDeadDbgInst;
-    else
-      ++NumDeadInst;
+    ++NumDeadInst;
     // RemoveDIs: erasing debug-info must be done manually.
     Inst->dropDbgRecords();
     Inst->eraseFromParent();
   }
-  return {NumDeadInst, NumDeadDbgInst};
+  return NumDeadInst;
 }
 
 unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 6b42503b2e015..66d0573e83f65 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -547,36 +547,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // possible or create a clone in the OldPreHeader if not.
     Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
 
-    // Record all debug intrinsics preceding LoopEntryBranch to avoid
+    // Record all debug records preceding LoopEntryBranch to avoid
     // duplication.
-    using DbgIntrinsicHash =
+    using DbgHash =
         std::pair<std::pair<hash_code, DILocalVariable *>, DIExpression *>;
-    auto makeHash = [](auto *D) -> DbgIntrinsicHash {
+    auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
       auto VarLocOps = D->location_ops();
       return {{hash_combine_range(VarLocOps), D->getVariable()},
               D->getExpression()};
     };
 
-    SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
-    for (Instruction &I : llvm::drop_begin(llvm::reverse(*OrigPreheader))) {
-      if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
-        DbgIntrinsics.insert(makeHash(DII));
-        // Until RemoveDIs supports dbg.declares in DbgVariableRecord format,
-        // we'll need to collect DbgVariableRecords attached to any other debug
-        // intrinsics.
-        for (const DbgVariableRecord &DVR :
-             filterDbgVars(DII->getDbgRecordRange()))
-          DbgIntrinsics.insert(makeHash(&DVR));
-      } else {
-        break;
-      }
-    }
-
+    SmallDenseSet<DbgHash, 8> DbgRecords;
     // Build DbgVariableRecord hashes for DbgVariableRecords attached to the
-    // terminator, which isn't considered in the loop above.
+    // terminator.
     for (const DbgVariableRecord &DVR :
          filterDbgVars(OrigPreheader->getTerminator()->getDbgRecordRange()))
-      DbgIntrinsics.insert(makeHash(&DVR));
+      DbgRecords.insert(makeHash(&DVR));
 
     // Remember the local noalias scope declarations in the header. After the
     // rotation, they must be duplicated and the scope must be cloned. This
@@ -623,7 +609,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       // memory (without proving that the loop doesn't write).
       if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
           !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
-          !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst) &&
+          !isa<AllocaInst>(Inst) &&
           // It is not safe to hoist the value of these instructions in
           // coroutines, as the addresses of otherwise eligible variables (e.g.
           // thread-local variables and errno) may change if the coroutine is
@@ -642,7 +628,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
           // Erase anything we've seen before.
           for (DbgVariableRecord &DVR :
                make_early_inc_range(filterDbgVars(DbgValueRange)))
-            if (DbgIntrinsics.count(makeHash(&DVR)))
+            if (DbgRecords.count(makeHash(&DVR)))
               DVR.eraseFromParent();
         }
 
@@ -671,7 +657,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
         // Erase anything we've seen before.
         for (DbgVariableRecord &DVR :
              make_early_inc_range(filterDbgVars(Range)))
-          if (DbgIntrinsics.count(makeHash(&DVR)))
+          if (DbgRecords.count(makeHash(&DVR)))
             DVR.eraseFromParent();
       }
 
@@ -679,13 +665,6 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
       RemapInstruction(C, ValueMap,
                        RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
 
-      // Avoid inserting the same intrinsic twice.
-      if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
-        if (DbgIntrinsics.count(makeHash(DII))) {
-          C->eraseFromParent();
-          continue;
-        }
-
       // With the operands remapped, see if the instruction constant folds or is
       // otherwise simplifyable.  This commonly occurs because the entry from PHI
       // nodes allows icmps and other instructions to fold.
@@ -806,7 +785,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
                                     &InsertedPHIs);
 
-    // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+    // Attach debug records to the new phis if that phi uses a value that
     // previously had debug metadata attached. This keeps the debug info
     // up-to-date in the loop body.
     if (!InsertedPHIs.empty())
@@ -952,9 +931,6 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
     if (!isSafeToSpeculativelyExecute(&*I))
       return false;
 
-    if (isa<DbgInfoIntrinsic>(I))
-      continue;
-
     switch (I->getOpcode()) {
     default:
       return false;
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 70afd4133df7c..24fe08d6c3e4e 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -182,8 +182,7 @@ SCEVExpander::GetOptimalInsertionPointForCastOf(Value *V) const {
     BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin();
     while ((isa<BitCastInst>(IP) &&
             isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) &&
-            cast<BitCastInst>(IP)->getOperand(0) != A) ||
-           isa<DbgInfoIntrinsic>(IP))
+            cast<BitCastInst>(IP)->getOperand(0) != A))
       ++IP;
     return IP;
   }
@@ -278,11 +277,6 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
   if (IP != BlockBegin) {
     --IP;
     for (; ScanLimit; --IP, --ScanLimit) {
-      // Don't count dbg.value against the ScanLimit, to avoid perturbing the
-      // generated code.
-      if (isa<DbgInfoIntrinsic>(IP))
-        ScanLimit++;
-
       auto canGenerateIncompatiblePoison = [&Flags](Instruction *I) {
         // Ensure that no-wrap flags match.
         if (isa<OverflowingBinaryOperator>(I)) {
@@ -382,10 +376,6 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *Offset, Value *V,
   if (IP != BlockBegin) {
     --IP;
     for (; ScanLimit; --IP, --ScanLimit) {
-      // Don't count dbg.value against the ScanLimit, to avoid perturbing the
-      // generated code.
-      if (isa<DbgInfoIntrinsic>(IP))
-        ScanLimit++;
       if (auto *GEP = dyn_cast<GetElementPtrInst>(IP)) {
         if (GEP->getPointerOperand() == V &&
             GEP->getSourceElementType() == Builder.getInt8Ty() &&
@@ -1545,8 +1535,7 @@ Value *SCEVExpander::expand(const SCEV *S) {
           InsertPt = L->getHeader()->getFirstInsertionPt();
 
         while (InsertPt != Builder.GetInsertPoint() &&
-               (isInsertedInstruction(&*InsertPt) ||
-                isa<DbgInfoIntrinsic>(&*InsertPt))) {
+               (isInsertedInstruction(&*InsertPt))) {
           InsertPt = std::next(InsertPt);
         }
         break;
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0980f0e57aa6d..eb52c1b7e6fba 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1130,17 +1130,14 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
 
     Instruction *NewBonusInst = BonusInst.clone();
 
-    if (!isa<DbgInfoIntrinsic>(BonusInst)) {
-      if (!NewBonusInst->getDebugLoc().isSameSourceLocation(
-              PTI->getDebugLoc())) {
-        // Unless the instruction has the same !dbg location as the original
-        // branch, drop it. When we fold the bonus instructions we want to make
-        // sure we reset their debug locations in order to avoid stepping on
-        // dead code caused by folding dead branches.
-        NewBonusInst->setDebugLoc(DebugLoc::getDropped());
-      } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) {
-        mapAtomInstance(DL, VMap);
-      }
+    if (!NewBonusInst->getDebugLoc().isSameSourceLocation(PTI->getDebugLoc())) {
+      // Unless the instruction has the same !dbg location as the original
+      // branch, drop it. When we fold the bonus instructions we want to make
+      // sure we reset their debug locations in order to avoid stepping on
+      // dead code caused by folding dead branches.
+      NewBonusInst->setDebugLoc(DebugLoc::getDropped());
+    } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) {
+      mapAtomInstance(DL, VMap);
     }
 
     RemapInstruction(NewBonusInst, VMap,
@@ -1158,9 +1155,6 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
     RemapDbgRecordRange(NewBonusInst->getModule(), Range, VMap,
                         RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
 
-    if (isa<DbgInfoIntrinsic>(BonusInst))
-      continue;
-
     NewBonusInst->takeName(&BonusInst);
     BonusInst.setName(NewBonusInst->getName() + ".old");
     VMap[&BonusInst] = NewBonusInst;
@@ -1903,21 +1897,6 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
 
     Instruction *I1 = &*BB1ItrPair.first;
 
-    // Skip debug info if it is not identical.
-    bool AllDbgInstsAreIdentical = all_of(OtherSuccIterRange, [I1](auto &Iter) {
-      Instruction *I2 = &*Iter;
-      return I1->isIdenticalToWhenDefined(I2);
-    });
-    if (!AllDbgInstsAreIdentical) {
-      while (isa<DbgInfoIntrinsic>(I1))
-        I1 = &*++BB1ItrPair.first;
-      for (auto &SuccIter : OtherSuccIterRange) {
-        Instruction *I2 = &*SuccIter;
-        while (isa<DbgInfoIntrinsic>(I2))
-          I2 = &*++SuccIter;
-      }
-    }
-
     bool AllInstsAreIdentical = true;
     bool HasTerminator = I1->isTerminator();
     for (auto &SuccIter : OtherSuccIterRange) {
@@ -1965,49 +1944,33 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
 
     if (AllInstsAreIdentical) {
       BB1ItrPair.first++;
-      if (isa<DbgInfoIntrinsic>(I1)) {
-        // The debug location is an integral part of a debug info intrinsic
-        // and can't be separated from it or replaced.  Instead of attempting
-        // to merge locations, simply hoist both copies of the intrinsic.
-        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
-        // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
-        // and leave any that were not hoisted behind (by calling moveBefore
-        // rather than moveBeforePreserving).
-        I1->moveBefore(TI->getIterator());
-        for (auto &SuccIter : OtherSuccIterRange) {
-          auto *I2 = &*SuccIter++;
-          assert(isa<DbgInfoIntrinsic>(I2));
-          I2->moveBefore(TI->getIterator());
+      // For a normal instruction, we just move one to right before the
+      // branch, then replace all uses of the other with the first.  Finally,
+      // we remove the now redundant second instruction.
+      hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
+      // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
+      // and leave any that were not hoisted behind (by calling moveBefore
+      // rather than moveBeforePreserving).
+      I1->moveBefore(TI->getIterator());
+      for (auto &SuccIter : OtherSuccIterRange) {
+        Instruction *I2 = &*SuccIter++;
+        assert(I2 != I1);
+        if (!I2->use_empty())
+          I2->replaceAllUsesWith(I1);
+        I1->andIRFlags(I2);
+        if (auto *CB = dyn_cast<CallBase>(I1)) {
+          bool Success = CB->tryIntersectAttributes(cast<CallBase>(I2));
+          assert(Success && "We should not be trying to hoist callbases "
+                            "with non-intersectable attributes");
+          // For NDEBUG Compile.
+          (void)Success;
         }
-      } else {
-        // For a normal instruction, we just move one to right before the
-        // branch, then replace all uses of the other with the first.  Finally,
-        // we remove the now redundant second instruction.
-        hoistLockstepIdenticalDbgVariableRecords(TI, I1, OtherInsts);
-        // We've just hoisted DbgVariableRecords; move I1 after them (before TI)
-        // and leave any that were not hoisted behind (by calling moveBefore
-        // rather than moveBeforePreserving).
-        I1->moveBefore(TI->getIterator());
-        for (auto &SuccIter : OtherSuccIterRange) {
-          Instruction *I2 = &*SuccIter++;
-          assert(I2 != I1);
-          if (!I2->use_empty())
-            I2->replaceAllUsesWith(I1);
-          I1->andIRFlags(I2);
-          if (auto *CB = dyn_cast<CallBase>(I1)) {
-            bool Success = CB->tryIntersectAttributes(cast<CallBase>(I2));
-            assert(Success && "We should not be trying to hoist callbases "
-                              "with non-intersectable attributes");
-            // For NDEBUG Compile.
-            (void)Success;
-          }
 
-          combineMetadataForCSE(I1, I2, true);
-          // I1 and I2 are being combined into a single instruction.  Its debug
-          // location is the merged locations of the original instructions.
-          I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-          I2->eraseFromParent();
-        }
+        combineMetadataForCSE(I1, I2, true);
+        // I1 and I2 are being combined into a single instruction.  Its debug
+        // location is the merged locations of the original instructions.
+        I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+        I2->eraseFromParent();
       }
       if (!Changed)
         NumHoistCommonCode += SuccIterPairs.size();
@@ -2297,11 +2260,8 @@ static void sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
   SmallVector<Instruction*,4> Insts;
   for (auto *BB : Blocks) {
     Instruction *I = BB->getTerminator();
-    do {
-      I = I->getPrevNode();
-    } while (isa<DbgInfoIntrinsic>(I) && I != &BB->front());
-    if (!isa<DbgInfoIntrinsic>(I))
-      Insts.push_back(I);
+    I = I->getPrevNode();
+    Insts.push_back(I);
   }
 
   // We don't need to do any more checking here; canSinkInstructions should
@@ -3234,7 +3194,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   // - All of their uses are in ThenBB.
   SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
 
-  SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;
+  SmallVector<Instruction *, 4> SpeculatedPseudoProbes;
 
   unsigned SpeculatedInstructions = 0;
   bool HoistLoadsStores = Options.HoistLoadsStoresWithCondFaulting;
@@ -3243,12 +3203,6 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   StoreInst *SpeculatedStore = nullptr;
   EphemeralValueTracker EphTracker;
   for (Instruction &I : reverse(drop_end(*ThenBB))) {
-    // Skip debug info.
-    if (isa<DbgInfoIntrinsic>(I)) {
-      SpeculatedDbgIntrinsics.push_back(&I);
-      continue;
-    }
-
     // Skip pseudo probes. The consequence is we lose track of the branch
     // probability for ThenBB, which is fine since the optimization here takes
     // place regardless of the branch probability.
@@ -3257,7 +3211,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
       // the samples collected on the non-conditional path are counted towards
       // the conditional path. We leave it for the counts inference algorithm to
       // figure out a proper count for an unknown probe.
-      SpeculatedDbgIntrinsics.push_back(&I);
+      SpeculatedPseudoProbes.push_back(&I);
       continue;
     }
 
@@ -3388,9 +3342,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   // hoisting above.
   for (auto &I : make_early_inc_range(*ThenBB)) {
     if (!SpeculatedStoreValue || &I != SpeculatedStore) {
-      // Don't update the DILocation of dbg.assign intrinsics.
-      if (!isa<DbgAssignIntrinsic>(&I))
-        I.setDebugLoc(DebugLoc::getDropped());
+      I.setDebugLoc(DebugLoc::getDropped());
     }
     I.dropUBImplyingAttrsAndMetadata();
 
@@ -3402,9 +3354,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   }
 
   // Hoist the instructions.
-  // In "RemoveDIs" non-instr debug-info mode, drop DbgVariableRecords attached
-  // to these instructions, in the same way that dbg.value intrinsics are
-  // dropped at the end of this block.
+  // Drop DbgVariableRecords attached to these instructions.
   for (auto &It : *ThenBB)
     for (DbgRecord &DR : make_early_inc_range(It.getDbgRecordRange()))
       // Drop all records except assign-kind DbgVariableRecords (dbg.assign
@@ -3442,15 +3392,9 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
     PN.setIncomingValue(ThenI, V);
   }
 
-  // Remove speculated dbg intrinsics.
-  // FIXME: Is it possible to do this in a more elegant way? Moving/merging the
-  // dbg value for the different flows and inserting it after the select.
-  for (Instruction *I : SpeculatedDbgIntrinsics) {
-    // We still want to know that an assignment took place so don't remove
-    // dbg.assign intrinsics.
-    if (!isa<DbgAssignIntrinsic>(I))
-      I->eraseFromParent();
-  }
+  // Remove speculated pseudo probes.
+  for (Instruction *I : SpeculatedPseudoProbes)
+    I->eraseFromParent();
 
   ++NumSpeculations;
   return true;
@@ -4162,8 +4106,8 @@ bool llvm::foldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
     // Don't check the branch condition comparison itself.
     if (&I == Cond)
       continue;
-    // Ignore dbg intrinsics, and the terminator.
-    if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
+    // Ignore the terminator.
+    if (isa<BranchInst>(I))
       continue;
     // I must be safe to execute unconditionally.
     if (!isSafeToSpeculativelyExecute(&I))
@@ -7762,8 +7706,7 @@ static bool tryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
     LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
     if (!LPad2 || !LPad2->isIdenticalTo(LPad))
       continue;
-    for (++I; isa<DbgInfoIntrinsic>(I); ++I)
-      ;
+    ++I;
     BranchInst *BI2 = dyn_cast<BranchInst>(I);
     if (!BI2 || !BI2->isIdenticalTo(BI))
       continue;
@@ -7784,12 +7727,6 @@ static bool tryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
       }
     }
 
-    // The debug info in OtherPred doesn't cover the merged control flow that
-    // used to go through BB.  We need to delete it or update it.
-    for (Instruction &Inst : llvm::make_early_inc_range(*OtherPred))
-      if (isa<DbgInfoIntrinsic>(Inst))
-        Inst.eraseFromParent();
-
     SmallSetVector<BasicBlock *, 16> UniqueSuccs(succ_begin(BB), succ_end(BB));
     for (BasicBlock *Succ : UniqueSuccs) {
       Succ->removePredecessor(BB);
@@ -7837,8 +7774,7 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
   // constant, try to simplify the block.
   if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
     if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
-      for (++I; isa<DbgInfoIntrinsic>(I); ++I)
-        ;
+      ++I;
       if (I->isTerminator() &&
           tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder))
         return true;
@@ -7847,8 +7783,7 @@ bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
   // See if we can merge an empty landing pad block with another which is
   // equivalent.
   if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
-    for (++I; isa<DbgInfoIntrinsic>(I); ++I)
-      ;
+    ++I;
     if (I->isTerminator() && tryToMergeLandingPad(LPad, BI, BB, DTU))
       return true;
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 8e09e6f8d4935..0c4e5bb3d4721 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -896,13 +896,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       } // end of PHI handling
 
       // We handle calls that:
-      //   * Are debug info intrinsics.
       //   * Have a mapping to an IR intrinsic.
       //   * Have a vector version available.
       auto *CI = dyn_cast<CallInst>(&I);
 
       if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
-          !isa<DbgInfoIntrinsic>(CI) &&
           !(CI->getCalledFunction() && TLI &&
             (!VFDatabase::getMappings(*CI).empty() ||
              isTLIScalarize(*TLI, *CI)))) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 8bff3c018714d..d0bf637b70aba 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -24334,9 +24334,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       continue;
     }
 
-    if (isa<DbgInfoIntrinsic>(It))
-      continue;
-
     // Try to vectorize reductions that use PHINodes.
     if (PHINode *P = dyn_cast<PHINode>(It)) {
       // Check that the PHI is a reduction PHI.

From c9a87a50aee3c91f36d33c170d5131bcc370c289 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes@amd.com>
Date: Tue, 17 Jun 2025 08:14:05 -0700
Subject: [PATCH 745/851] [SLPVectorizer] Use accurate cost for external users
 of resize shuffles (#137419)

When implementing the vectorization, we potentially need to add shuffles
for external users. In such cases, we may be shuffling a smaller vector
into a larger vector. When this happens `ResizeToVF` will just build a
poison padded identity vector. Then the to build the final shuffle, we
just use the `SK_InsertSubvector` mask.

This is possibly clearer by looking at the included test in
SLPVectorizer/AMDGPU/external-shuffle.ll

In the exit block we have a bunch of shuffles to glue the vectorized
tree match the `InsertElement` users. `TMP25` holds the result of
resizing the v2i16 vectorized sequence to match the `InsertElement` size
v16i16. Then `TMP26` is the final shuffle which replaces the
`InsertElement` sequence. This is just an insertsubvector.

However, when calculating the cost for these shuffles, we aren't
modelling this correctly. `ResizeToVF` will indicate to
`performExtractsShuffleAction` that we cannot use the original mask due
to the resize shuffle. The consequence is that the cost calculation uses
a different shuffle mask than what is ultimately used.

Going back to the included test, we can consider again `TMP26`. Clearly
we can see the shuffle uses a mask {0, 1, 2, 3, 16, 17, poison ..}.
However, we will currently calculate the cost with a mask {0, 1, 2, 3,
20, 21, ...} we have replaced 16 and 17 with 20 and 21 (Index + Vector
Size). Queries like BasicTTImpl::improveShuffleKindFromMask will not
recognize this as an `SK_InsertSubvector` mask, and targets which have
reduced costs for `SK_InsertSubvector` will not accurately calculate the
cost.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  54 +++++---
 .../SLPVectorizer/AMDGPU/external-shuffle.ll  | 128 ++++++------------
 .../extractelement-single-use-many-nodes.ll   |  11 +-
 .../X86/vec_list_bias-inseltpoison.ll         |  25 ++--
 4 files changed, 99 insertions(+), 119 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d0bf637b70aba..d811e9d77d183 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -14910,25 +14910,47 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
 
   Cost += ExtractCost;
   auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
-                                    bool) {
+                                    bool ForSingleMask) {
     InstructionCost C = 0;
     unsigned VF = Mask.size();
     unsigned VecVF = TE->getVectorFactor();
-    if (VF != VecVF &&
-        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
-         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
-      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
-      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
-                OrigMask.begin());
-      C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
-                           getWidenedType(TE->getMainOp()->getType(), VecVF),
-                           OrigMask);
-      LLVM_DEBUG(
-          dbgs() << "SLP: Adding cost " << C
-                 << " for final shuffle of insertelement external users.\n";
-          TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
-      Cost += C;
-      return std::make_pair(TE, true);
+    bool HasLargeIndex =
+        any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
+    if ((VF != VecVF && HasLargeIndex) ||
+        !ShuffleVectorInst::isIdentityMask(Mask, VF)) {
+
+      if (HasLargeIndex) {
+        SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
+        std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
+                  OrigMask.begin());
+        C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+                             getWidenedType(TE->getMainOp()->getType(), VecVF),
+                             OrigMask);
+        LLVM_DEBUG(
+            dbgs() << "SLP: Adding cost " << C
+                   << " for final shuffle of insertelement external users.\n";
+            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
+        Cost += C;
+        return std::make_pair(TE, true);
+      }
+
+      if (!ForSingleMask) {
+        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
+        for (unsigned I = 0; I < VF; ++I) {
+          if (Mask[I] != PoisonMaskElem)
+            ResizeMask[Mask[I]] = Mask[I];
+        }
+        if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
+          C = ::getShuffleCost(
+              *TTI, TTI::SK_PermuteSingleSrc,
+              getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
+        LLVM_DEBUG(
+            dbgs() << "SLP: Adding cost " << C
+                   << " for final shuffle of insertelement external users.\n";
+            TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
+
+        Cost += C;
+      }
     }
     return std::make_pair(TE, false);
   };
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
index ce9e47a03dee3..f3e89b60b8045 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll
@@ -10,124 +10,84 @@ define void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out,
 ; GCN-NEXT:  [[ENTRY:.*]]:
 ; GCN-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8
 ; GCN-NEXT:    [[GEP2:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 2
-; GCN-NEXT:    [[GEP3:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 3
 ; GCN-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
 ; GCN-NEXT:    [[GEP4:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 4
-; GCN-NEXT:    [[GEP5:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 5
 ; GCN-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
 ; GCN-NEXT:    [[GEP6:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 6
-; GCN-NEXT:    [[GEP7:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 7
 ; GCN-NEXT:    [[TMP3:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
 ; GCN-NEXT:    [[GEP8:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 8
-; GCN-NEXT:    [[GEP9:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 9
 ; GCN-NEXT:    [[TMP4:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
 ; GCN-NEXT:    [[GEP10:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 10
-; GCN-NEXT:    [[GEP11:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 11
 ; GCN-NEXT:    [[TMP5:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
 ; GCN-NEXT:    [[GEP12:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 12
-; GCN-NEXT:    [[GEP13:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 13
 ; GCN-NEXT:    [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
 ; GCN-NEXT:    [[GEP14:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 14
 ; GCN-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2
-; GCN-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
-; GCN-NEXT:    [[TMP9:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
-; GCN-NEXT:    [[TMP10:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
-; GCN-NEXT:    [[TMP11:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
-; GCN-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
-; GCN-NEXT:    [[TMP13:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
-; GCN-NEXT:    [[TMP14:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0
-; GCN-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1
-; GCN-NEXT:    [[TMP24:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
-; GCN-NEXT:    [[TMP26:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
-; GCN-NEXT:    [[TMP28:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
-; GCN-NEXT:    [[TMP38:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
 ; GCN-NEXT:    br label %[[DO_BODY:.*]]
 ; GCN:       [[DO_BODY]]:
-; GCN-NEXT:    [[PHI2:%.*]] = phi i16 [ [[TMP8]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI3:%.*]] = phi i16 [ [[TMP9]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI4:%.*]] = phi i16 [ [[TMP10]], %[[ENTRY]] ], [ [[TMP39:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI5:%.*]] = phi i16 [ [[TMP11]], %[[ENTRY]] ], [ [[OTHERELE5:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI6:%.*]] = phi i16 [ [[TMP12]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI7:%.*]] = phi i16 [ [[TMP13]], %[[ENTRY]] ], [ [[OTHERELE7:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI8:%.*]] = phi i16 [ [[TMP14]], %[[ENTRY]] ], [ [[TMP40:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI9:%.*]] = phi i16 [ [[TMP15]], %[[ENTRY]] ], [ [[OTHERELE9:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI10:%.*]] = phi i16 [ [[TMP24]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI11:%.*]] = phi i16 [ [[TMP26]], %[[ENTRY]] ], [ [[OTHERELE11:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI12:%.*]] = phi i16 [ [[TMP28]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[PHI13:%.*]] = phi i16 [ [[TMP38]], %[[ENTRY]] ], [ [[OTHERELE13:%.*]], %[[DO_BODY]] ]
-; GCN-NEXT:    [[TMP41:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP17:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP18:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP19:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP20:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP21:%.*]], %[[DO_BODY]] ]
+; GCN-NEXT:    [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP22:%.*]], %[[DO_BODY]] ]
 ; GCN-NEXT:    [[TMP42:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DO_BODY]] ]
 ; GCN-NEXT:    [[TMP16]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8
-; GCN-NEXT:    [[OTHERELE3]] = load i16, ptr addrspace(3) [[GEP3]], align 1
-; GCN-NEXT:    [[TMP17:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
-; GCN-NEXT:    [[OTHERELE5]] = load i16, ptr addrspace(3) [[GEP5]], align 1
-; GCN-NEXT:    [[TMP18:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
-; GCN-NEXT:    [[OTHERELE7]] = load i16, ptr addrspace(3) [[GEP7]], align 1
-; GCN-NEXT:    [[TMP19:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
-; GCN-NEXT:    [[OTHERELE9]] = load i16, ptr addrspace(3) [[GEP9]], align 1
-; GCN-NEXT:    [[TMP20:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
-; GCN-NEXT:    [[OTHERELE11]] = load i16, ptr addrspace(3) [[GEP11]], align 1
-; GCN-NEXT:    [[TMP21:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
-; GCN-NEXT:    [[OTHERELE13]] = load i16, ptr addrspace(3) [[GEP13]], align 1
-; GCN-NEXT:    [[TMP22:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
+; GCN-NEXT:    [[TMP17]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2
+; GCN-NEXT:    [[TMP18]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8
+; GCN-NEXT:    [[TMP19]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2
+; GCN-NEXT:    [[TMP20]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8
+; GCN-NEXT:    [[TMP21]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2
+; GCN-NEXT:    [[TMP22]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8
 ; GCN-NEXT:    [[TMP23]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2
 ; GCN-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
-; GCN-NEXT:    [[TMP30]] = extractelement <2 x i16> [[TMP17]], i32 0
-; GCN-NEXT:    [[TMP39]] = extractelement <2 x i16> [[TMP18]], i32 0
-; GCN-NEXT:    [[TMP32]] = extractelement <2 x i16> [[TMP19]], i32 0
-; GCN-NEXT:    [[TMP40]] = extractelement <2 x i16> [[TMP20]], i32 0
-; GCN-NEXT:    [[TMP34]] = extractelement <2 x i16> [[TMP21]], i32 0
-; GCN-NEXT:    [[TMP35]] = extractelement <2 x i16> [[TMP22]], i32 0
 ; GCN-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
 ; GCN:       [[EXIT]]:
-; GCN-NEXT:    [[TMP36:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP17]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC038:%.*]] = shufflevector <16 x i16> [[TMP36]], <16 x i16> [[TMP37]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC059:%.*]] = shufflevector <16 x i16> [[VEC038]], <16 x i16> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP24]], <16 x i16> [[TMP25]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i16> [[TMP19]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC0710:%.*]] = shufflevector <16 x i16> [[VEC059]], <16 x i16> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i16> [[TMP26]], <16 x i16> [[TMP27]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP29:%.*]] = shufflevector <2 x i16> [[TMP20]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC0911:%.*]] = shufflevector <16 x i16> [[VEC0710]], <16 x i16> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP28]], <16 x i16> [[TMP29]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i16> [[TMP21]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC01112:%.*]] = shufflevector <16 x i16> [[VEC0911]], <16 x i16> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
+; GCN-NEXT:    [[TMP32:%.*]] = shufflevector <16 x i16> [[TMP30]], <16 x i16> [[TMP31]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[TMP33:%.*]] = shufflevector <2 x i16> [[TMP22]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i16> [[VEC01112]], <16 x i16> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; GCN-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP47:%.*]] = shufflevector <16 x i16> [[TMP32]], <16 x i16> [[TMP33]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP48:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP49:%.*]] = shufflevector <16 x i16> [[TMP47]], <16 x i16> [[TMP48]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+; GCN-NEXT:    [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP38:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP40:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP41:%.*]] = shufflevector <16 x i16> [[TMP39]], <16 x i16> [[TMP40]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP43:%.*]] = shufflevector <16 x i16> [[TMP41]], <16 x i16> [[TMP57]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP44:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP43]], <16 x i16> [[TMP44]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP46:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP58:%.*]] = shufflevector <16 x i16> [[TMP45]], <16 x i16> [[TMP46]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC2157:%.*]] = shufflevector <16 x i16> [[TMP58]], <16 x i16> [[TMP60]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; GCN-NEXT:    [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC231:%.*]] = shufflevector <16 x i16> [[TMP50]], <16 x i16> [[TMP51]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC252:%.*]] = shufflevector <16 x i16> [[VEC231]], <16 x i16> [[TMP52]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC273:%.*]] = shufflevector <16 x i16> [[VEC252]], <16 x i16> [[TMP53]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC294:%.*]] = shufflevector <16 x i16> [[VEC273]], <16 x i16> [[TMP54]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC2115:%.*]] = shufflevector <16 x i16> [[VEC294]], <16 x i16> [[TMP55]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 12, i32 13, i32 14, i32 15>
-; GCN-NEXT:    [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC2136:%.*]] = shufflevector <16 x i16> [[VEC2115]], <16 x i16> [[TMP56]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 14, i32 15>
-; GCN-NEXT:    [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; GCN-NEXT:    [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; GCN-NEXT:    [[VEC2151:%.*]] = shufflevector <16 x i16> [[VEC2136]], <16 x i16> [[TMP59]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; GCN-NEXT:    [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP41]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC22:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[PHI2]], i64 2
-; GCN-NEXT:    [[VEC23:%.*]] = insertelement <16 x i16> [[VEC22]], i16 [[PHI3]], i64 3
-; GCN-NEXT:    [[VEC24:%.*]] = insertelement <16 x i16> [[VEC23]], i16 [[PHI4]], i64 4
-; GCN-NEXT:    [[VEC25:%.*]] = insertelement <16 x i16> [[VEC24]], i16 [[PHI5]], i64 5
-; GCN-NEXT:    [[VEC26:%.*]] = insertelement <16 x i16> [[VEC25]], i16 [[PHI6]], i64 6
-; GCN-NEXT:    [[VEC27:%.*]] = insertelement <16 x i16> [[VEC26]], i16 [[PHI7]], i64 7
-; GCN-NEXT:    [[VEC28:%.*]] = insertelement <16 x i16> [[VEC27]], i16 [[PHI8]], i64 8
-; GCN-NEXT:    [[VEC29:%.*]] = insertelement <16 x i16> [[VEC28]], i16 [[PHI9]], i64 9
-; GCN-NEXT:    [[VEC210:%.*]] = insertelement <16 x i16> [[VEC29]], i16 [[PHI10]], i64 10
-; GCN-NEXT:    [[VEC211:%.*]] = insertelement <16 x i16> [[VEC210]], i16 [[PHI11]], i64 11
-; GCN-NEXT:    [[VEC212:%.*]] = insertelement <16 x i16> [[VEC211]], i16 [[PHI12]], i64 12
-; GCN-NEXT:    [[VEC213:%.*]] = insertelement <16 x i16> [[VEC212]], i16 [[PHI13]], i64 13
-; GCN-NEXT:    [[TMP61:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; GCN-NEXT:    [[VEC2152:%.*]] = shufflevector <16 x i16> [[VEC213]], <16 x i16> [[TMP61]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-; GCN-NEXT:    store <16 x i16> [[VEC2151]], ptr [[OUT]], align 32
-; GCN-NEXT:    store <16 x i16> [[VEC2157]], ptr [[OUT1]], align 32
-; GCN-NEXT:    store <16 x i16> [[VEC2152]], ptr [[OUT2]], align 32
+; GCN-NEXT:    store <16 x i16> [[VEC2157]], ptr [[OUT]], align 32
+; GCN-NEXT:    store <16 x i16> [[TMP49]], ptr [[OUT1]], align 32
+; GCN-NEXT:    store <16 x i16> [[VEC2151]], ptr [[OUT2]], align 32
 ; GCN-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
index 28bab3276c47d..6942df532ae29 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
@@ -7,9 +7,8 @@ define void @foo(double %i) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double poison, double 0.000000e+00>, double [[I]], i32 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]]
 ; CHECK-NEXT:    [[I82:%.*]] = fsub double 0.000000e+00, poison
+; CHECK-NEXT:    [[I103:%.*]] = fsub double 0.000000e+00, [[I]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 0, i32 poison, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> <double 0.000000e+00, double poison, double poison, double poison, double 0.000000e+00, double poison, double poison, double poison>, <8 x i32> <i32 8, i32 1, i32 poison, i32 3, i32 12, i32 5, i32 poison, i32 7>
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2
@@ -22,13 +21,11 @@ define void @foo(double %i) {
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP16]])
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[BB58:%.*]], label [[BB115:%.*]]
 ; CHECK:       bb115:
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP4]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[TMP18]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = fmul double 0.000000e+00, [[I103]]
+; CHECK-NEXT:    [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]]
 ; CHECK-NEXT:    [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, <4 x double> [[TMP22]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> <double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double poison>, double [[I82]], i32 3
 ; CHECK-NEXT:    [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer
 ; CHECK-NEXT:    [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
index e3a6020a542fb..2cc2f28ccf6d5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll
@@ -25,7 +25,6 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T24:%.*]] = add nsw i32 [[T23]], [[T21]]
 ; CHECK-NEXT:    [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]]
 ; CHECK-NEXT:    [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]]
-; CHECK-NEXT:    [[T32:%.*]] = mul nsw i32 [[T27]], 6270
 ; CHECK-NEXT:    [[T37:%.*]] = add nsw i32 [[T25]], [[T11]]
 ; CHECK-NEXT:    [[T38:%.*]] = add nsw i32 [[T17]], [[T5]]
 ; CHECK-NEXT:    [[T39:%.*]] = add nsw i32 [[T37]], [[T38]]
@@ -34,7 +33,6 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T42:%.*]] = mul nsw i32 [[T17]], 16819
 ; CHECK-NEXT:    [[T47:%.*]] = mul nsw i32 [[T37]], -16069
 ; CHECK-NEXT:    [[T48:%.*]] = mul nsw i32 [[T38]], -3196
-; CHECK-NEXT:    [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4
 ; CHECK-NEXT:    [[T15:%.*]] = load i32, ptr [[T14]], align 4
 ; CHECK-NEXT:    [[T9:%.*]] = load i32, ptr [[T8]], align 4
@@ -42,17 +40,20 @@ define void @test(ptr nocapture %t2) {
 ; CHECK-NEXT:    [[T30:%.*]] = add nsw i32 [[T27]], [[T29]]
 ; CHECK-NEXT:    [[T31:%.*]] = mul nsw i32 [[T30]], 4433
 ; CHECK-NEXT:    [[T34:%.*]] = mul nsw i32 [[T29]], -15137
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2
-; CHECK-NEXT:    [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
-; CHECK-NEXT:    [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> <i32 poison, i32 poison, i32 6270, i32 poison>, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 poison, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT:    [[T701:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 poison, i32 3>
 ; CHECK-NEXT:    [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
-; CHECK-NEXT:    [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
-; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T72]], splat (i32 3)
+; CHECK-NEXT:    [[T76:%.*]] = shl <8 x i32> [[T71]], splat (i32 3)
 ; CHECK-NEXT:    store <8 x i32> [[T76]], ptr [[T2]], align 4
 ; CHECK-NEXT:    ret void
 ;

From 02b78ff9c639993356ccc72b847128fd1ff7f2ba Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Tue, 17 Jun 2025 08:21:24 -0700
Subject: [PATCH 746/851] [llvm] include Compiler.h in a few headers where it
 was missed (#144464)

Add missing `#include "llvm/Support/Compiler.h"` in a few LLVM headers
that use the `LLVM_ABI` macro.
---
 llvm/include/llvm/Option/OptSpecifier.h              | 2 ++
 llvm/include/llvm/Transforms/IPO.h                   | 2 ++
 llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/llvm/include/llvm/Option/OptSpecifier.h b/llvm/include/llvm/Option/OptSpecifier.h
index dc6acae7fc002..cb87fbd17ec1c 100644
--- a/llvm/include/llvm/Option/OptSpecifier.h
+++ b/llvm/include/llvm/Option/OptSpecifier.h
@@ -9,6 +9,8 @@
 #ifndef LLVM_OPTION_OPTSPECIFIER_H
 #define LLVM_OPTION_OPTSPECIFIER_H
 
+#include "llvm/Support/Compiler.h"
+
 namespace llvm {
 namespace opt {
 
diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
index 56b30968ffd77..7523ae66429ac 100644
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TRANSFORMS_IPO_H
 #define LLVM_TRANSFORMS_IPO_H
 
+#include "llvm/Support/Compiler.h"
+
 namespace llvm {
 
 class ModulePass;
diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
index ab0bd3a5a9962..f20ae1809aa56 100644
--- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_TRANSFORMS_UTILS_PROMOTEMEMTOREG_H
 #define LLVM_TRANSFORMS_UTILS_PROMOTEMEMTOREG_H
 
+#include "llvm/Support/Compiler.h"
+
 namespace llvm {
 
 template <typename T> class ArrayRef;

From 14286244f1dca9300ead8bf83f049df2ffa97180 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Tue, 17 Jun 2025 16:12:14 +0100
Subject: [PATCH 747/851] Follow up to 9eb0020555, squelch unused variable
 warning

It turns out that this now-deleted debug-intrinsic code was the only use of
CI.
---
 llvm/lib/Transforms/IPO/IROutliner.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index 2c17863266a97..8d6ff72fa6061 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -744,9 +744,6 @@ static void moveFunctionData(Function &Old, Function &New,
         continue;
       }
 
-      // From this point we are only handling call instructions.
-      CallInst *CI = cast<CallInst>(&Val);
-
       // Edit the scope of called functions inside of outlined functions.
       if (DISubprogram *SP = New.getSubprogram()) {
         DILocation *DI = DILocation::get(New.getContext(), 0, 0, SP);

From 1410e69b641182e942470a90d4a0bb5a2910805f Mon Sep 17 00:00:00 2001
From: Steven Perron <stevenperron@google.com>
Date: Tue, 17 Jun 2025 11:26:47 -0400
Subject: [PATCH 748/851] [SPIRV] Allow __spirv_SpecConstant in Vulkan shaders
 (#143543)

There is a builtin __spirv_SpecConstant that the SPIR-V backend expands
into a specialization constant. However, it is currently only enable for
OpenCL shaders, and not the graphic shaders.

We want to use it for specialization constants coming from HLSL, so we
are enabling it for graphic shaders as well.

Implements https://github.com/llvm/wg-hlsl/pull/287

Fixes https://github.com/llvm/llvm-project/issues/142991
---
 llvm/lib/Target/SPIRV/SPIRVBuiltins.td        |  2 +
 .../CodeGen/SPIRV/constant/spec-constant.ll   | 73 +++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 llvm/test/CodeGen/SPIRV/constant/spec-constant.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 6842e5ff067cf..401a762cd62a3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -674,6 +674,8 @@ defm : DemangledNativeBuiltin<"ndrange_3D", OpenCL_std, Enqueue, 1, 3, OpBuildND
 
 // Spec constant builtin records:
 defm : DemangledNativeBuiltin<"__spirv_SpecConstant", OpenCL_std, SpecConstant, 2, 2, OpSpecConstant>;
+defm : DemangledNativeBuiltin<"__spirv_SpecConstant", GLSL_std_450,
+                              SpecConstant, 2, 2, OpSpecConstant>;
 defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecConstant, 1, 0, OpSpecConstantComposite>;
 
 // Async Copy and Prefetch builtin records:
diff --git a/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll b/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll
new file mode 100644
index 0000000000000..299d61d3bffde
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/constant/spec-constant.ll
@@ -0,0 +1,73 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
+
+; CHECK-DAG: OpDecorate [[bool_const:%[0-9]+]] SpecId 1
+; CHECK-DAG: OpDecorate [[short_const:%[0-9]+]] SpecId 2
+; CHECK-DAG: OpDecorate [[int_const:%[0-9]+]] SpecId 3
+; CHECK-DAG: OpDecorate [[long_const:%[0-9]+]] SpecId 4
+; CHECK-DAG: OpDecorate [[float_const:%[0-9]+]] SpecId 8
+; CHECK-DAG: OpDecorate [[double_const:%[0-9]+]] SpecId 9
+; CHECK-DAG: OpDecorate [[enum_const:%[0-9]+]] SpecId 10
+
+; CHECK-DAG: [[bool_const]] = OpSpecConstantTrue {{%[0-9]+}}
+; CHECK-DAG: [[short_const]] = OpSpecConstant {{%[0-9]+}} 4
+; CHECK-DAG: [[int_const]] = OpSpecConstant {{%[0-9]+}} 5
+; CHECK-DAG: [[long_const]] = OpSpecConstant {{%[0-9]+}} 8
+; CHECK-DAG: [[float_const]] = OpSpecConstant {{%[0-9]+}} 1112014848
+; CHECK-DAG: [[double_const]] = OpSpecConstant {{%[0-9]+}} 0 1079574528
+; CHECK-DAG: [[enum_const]] = OpSpecConstant {{%[0-9]+}} 30
+
+@_ZL10bool_const = internal addrspace(10) global i32 0, align 4
+@_ZL11short_const = internal addrspace(10) global i16 0, align 2
+@_ZL9int_const = internal addrspace(10) global i32 0, align 4
+@_ZL10long_const = internal addrspace(10) global i64 0, align 8
+@_ZL11float_const = internal addrspace(10) global float 0.000000e+00, align 4
+@_ZL12double_const = internal addrspace(10) global double 0.000000e+00, align 8
+@_ZL10enum_const = internal addrspace(10) global i32 0, align 4
+
+; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @main() local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[b:%[0-9]+]] = OpSelect {{%[0-9]+}} [[bool_const]]
+  ; CHECK: OpStore {{%[0-9]+}} [[b]]
+  %0 = tail call spir_func i1 @_Z20__spirv_SpecConstantib(i32 1, i1 true)
+  %storedv.i.i = zext i1 %0 to i32
+  store i32 %storedv.i.i, ptr addrspace(10) @_ZL10bool_const, align 4
+
+  ; CHECK: OpStore {{%[0-9]+}} [[short_const]]
+  %2 = tail call spir_func i16 @_Z20__spirv_SpecConstantis(i32 2, i16 4)
+  store i16 %2, ptr addrspace(10) @_ZL11short_const, align 2
+
+  ; CHECK: OpStore {{%[0-9]+}} [[int_const]]
+  %4 = tail call spir_func i32 @_Z20__spirv_SpecConstantii(i32 3, i32 5)
+  store i32 %4, ptr addrspace(10) @_ZL9int_const, align 4
+
+
+  ; CHECK: OpStore {{%[0-9]+}} [[long_const]]
+  %6 = tail call spir_func i64 @_Z20__spirv_SpecConstantix(i32 4, i64 8)
+  store i64 %6, ptr addrspace(10) @_ZL10long_const, align 8
+
+  ; CHECK: OpStore {{%[0-9]+}} [[float_const]]
+  %14 = tail call reassoc nnan ninf nsz arcp afn spir_func float @_Z20__spirv_SpecConstantif(i32 8, float 5.000000e+01)
+  store float %14, ptr addrspace(10) @_ZL11float_const, align 4
+
+  ; CHECK: OpStore {{%[0-9]+}} [[double_const]]
+  %16 = tail call reassoc nnan ninf nsz arcp afn spir_func double @_Z20__spirv_SpecConstantid(i32 9, double 1.000000e+02)
+  store double %16, ptr addrspace(10) @_ZL12double_const, align 8
+
+  ; CHECK: OpStore {{%[0-9]+}} [[enum_const]]
+  %18 = tail call spir_func i32 @_Z20__spirv_SpecConstantii(i32 10, i32 30)
+  store i32 %18, ptr addrspace(10) @_ZL10enum_const, align 4
+  ret void
+}
+
+
+declare i1 @_Z20__spirv_SpecConstantib(i32, i1)
+declare i8 @_Z20__spirv_SpecConstantia(i32, i8)
+declare i16 @_Z20__spirv_SpecConstantis(i32, i16)
+declare i32 @_Z20__spirv_SpecConstantii(i32, i32)
+declare i64 @_Z20__spirv_SpecConstantix(i32, i64)
+declare float @_Z20__spirv_SpecConstantif(i32, float)
+declare double @_Z20__spirv_SpecConstantid(i32, double)
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
\ No newline at end of file

From c80282d333d7248c8a34694ce1bec9a40681c1c5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 00:27:41 +0900
Subject: [PATCH 749/851] AMDGPU: Directly select minimumnum/maximumnum with
 ieee_mode=0 (#141903)

The hardware min/max follow the IR rules with IEEE mode disabled,
so we can avoid the canonicalizes of the input. We lose the quieting
of a signaling nan if both inputs are nans, but we only require that
with strictfp.
---
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |    2 +
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   25 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |    2 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   36 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   49 +
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |   42 +-
 llvm/test/CodeGen/AMDGPU/maximumnum.ll        | 1442 +++++++----------
 llvm/test/CodeGen/AMDGPU/minimumnum.ll        | 1442 +++++++----------
 9 files changed, 1368 insertions(+), 1673 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 18a948d68e97b..7a50923ffedc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -92,6 +92,8 @@ def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().F
 def NoFP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
 def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals == DenormalMode::getPreserveSign()">;
 def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
+def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
+def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
 def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e8dff85064383..f82e6df9bcbfc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -957,12 +957,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
   }
 
-  auto &MinNumMaxNum = getActionDefinitionsBuilder({
-      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
-
-  // TODO: These should be custom lowered and are directly legal with IEEE=0
-  auto &MinimumNumMaximumNum =
-      getActionDefinitionsBuilder({G_FMINIMUMNUM, G_FMAXIMUMNUM});
+  auto &MinNumMaxNum = getActionDefinitionsBuilder(
+      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
+       G_FMAXNUM_IEEE});
 
   if (ST.hasVOP3PInsts()) {
     MinNumMaxNum.customFor(FPTypesPK16)
@@ -980,8 +977,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0);
   }
 
-  MinimumNumMaximumNum.lower();
-
   if (ST.hasVOP3PInsts())
     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
 
@@ -2162,6 +2157,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
     return legalizeFPTOI(MI, MRI, B, false);
   case TargetOpcode::G_FMINNUM:
   case TargetOpcode::G_FMAXNUM:
+  case TargetOpcode::G_FMINIMUMNUM:
+  case TargetOpcode::G_FMAXIMUMNUM:
   case TargetOpcode::G_FMINNUM_IEEE:
   case TargetOpcode::G_FMAXNUM_IEEE:
     return legalizeMinNumMaxNum(Helper, MI);
@@ -2741,9 +2738,17 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
 
   // With ieee_mode disabled, the instructions have the correct behavior
-  // already for G_FMINNUM/G_FMAXNUM
-  if (!MFI->getMode().IEEE)
+  // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
+  //
+  // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
+  // enabled.
+  if (!MFI->getMode().IEEE) {
+    if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
+        MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
+      return true;
+
     return !IsIEEEOp;
+  }
 
   if (IsIEEEOp)
     return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index dd7aef8f0c583..4391a48ff2b68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4009,6 +4009,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_FMAXNUM:
   case AMDGPU::G_FMINIMUM:
   case AMDGPU::G_FMAXIMUM:
+  case AMDGPU::G_FMINIMUMNUM:
+  case AMDGPU::G_FMAXIMUMNUM:
   case AMDGPU::G_INTRINSIC_TRUNC:
   case AMDGPU::G_STRICT_FADD:
   case AMDGPU::G_STRICT_FSUB:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ced3a6ba9bc0..586de433ea28a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -531,8 +531,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
                        Legal);
 
-  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
-                     Custom);
+  setOperationAction(
+      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+      {MVT::f32, MVT::f64}, Custom);
 
   // These are really only legal for ieee_mode functions. We should be avoiding
   // them for functions that don't have ieee_mode enabled, so just say they are
@@ -771,7 +772,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                         MVT::v32f16, MVT::v32bf16},
                        Custom);
 
-    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
+    setOperationAction(
+        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+        MVT::f16, Custom);
     setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
 
     setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
@@ -825,8 +828,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
                          VT, Custom);
 
-    setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
-                       Custom);
+    setOperationAction(
+        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
+        {MVT::v2f16, MVT::v4f16}, Custom);
 
     setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
     setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
@@ -6062,6 +6066,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMINNUM:
   case ISD::FMAXNUM:
     return lowerFMINNUM_FMAXNUM(Op, DAG);
+  case ISD::FMINIMUMNUM:
+  case ISD::FMAXIMUMNUM:
+    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
   case ISD::FMINIMUM:
   case ISD::FMAXIMUM:
     return lowerFMINIMUM_FMAXIMUM(Op, DAG);
@@ -6086,8 +6093,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FMUL:
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
-  case ISD::FMINIMUMNUM:
-  case ISD::FMAXIMUMNUM:
   case ISD::UADDSAT:
   case ISD::USUBSAT:
   case ISD::SADDSAT:
@@ -6995,6 +7000,23 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
   return Op;
 }
 
+SDValue
+SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  bool IsIEEEMode = Info->getMode().IEEE;
+
+  if (IsIEEEMode)
+    return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
+
+  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
+      VT == MVT::v16bf16)
+    return splitBinaryVectorOp(Op, DAG);
+  return Op;
+}
+
 SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d71a22722129e..89fb12b52c3e6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -147,6 +147,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitFP_ROUNDVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1419f63202a7c..897c30948cf06 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1390,6 +1390,55 @@ def : GCNPat<
   (S_ADD_U64_PSEUDO $src0, $src1)>;
 }
 
+//===----------------------------------------------------------------------===//
+// FP min/max patterns
+//===----------------------------------------------------------------------===//
+
+
+class FPBinOpPat <SDPatternOperator node, ValueType vt, Instruction inst>
+  : GCNPat <(vt (node (vt (VOP3Mods vt:$src0, i32:$src0_mods)),
+                      (vt (VOP3Mods vt:$src1, i32:$src1_mods)))),
+    (inst $src0_mods, $src0, $src1_mods, $src1, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+class FPPkBinOpPat <SDPatternOperator node, ValueType vt, Instruction inst>
+  : GCNPat <(vt (node (VOP3PMods v2f16:$src0, i32:$src0_mods),
+                      (VOP3PMods v2f16:$src1, i32:$src1_mods))),
+  (inst $src0_mods, $src0, $src1_mods, $src1, DSTCLAMP.NONE)
+>;
+
+/// With IEEE=0, signalingness is ignored and the non-nan input will
+/// be directly returned.
+let OtherPredicates = [IEEEModeDisabled] in {
+  def : FPBinOpPat<fminimumnum, f32, V_MIN_F32_e64>;
+  def : FPBinOpPat<fmaximumnum, f32, V_MAX_F32_e64>;
+  def : FPBinOpPat<fminimumnum, f64, V_MIN_F64_e64>;
+  def : FPBinOpPat<fmaximumnum, f64, V_MAX_F64_e64>;
+
+  let SubtargetPredicate = Has16BitInsts,
+      True16Predicate = NotHasTrue16BitInsts in {
+    def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_e64>;
+    def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_e64>;
+  }
+
+  let SubtargetPredicate = Has16BitInsts,
+      True16Predicate = UseRealTrue16Insts in {
+    def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_t16_e64>;
+    def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_t16_e64>;
+  }
+
+  let SubtargetPredicate = Has16BitInsts,
+      True16Predicate = UseFakeTrue16Insts in {
+    def : FPBinOpPat<fminimumnum, f16, V_MIN_F16_fake16_e64>;
+    def : FPBinOpPat<fmaximumnum, f16, V_MAX_F16_fake16_e64>;
+  }
+
+  let SubtargetPredicate = HasVOP3PInsts in {
+    def : FPPkBinOpPat<fminimumnum, v2f16, V_PK_MIN_F16>;
+    def : FPPkBinOpPat<fmaximumnum, v2f16, V_PK_MAX_F16>;
+  }
+}
+
 /********** ============================================ **********/
 /********** Extraction, Insertion, Building and Casting  **********/
 /********** ============================================ **********/
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 46da9d33639b6..86e73ed03f187 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2019,9 +2019,7 @@ define float @v_fneg_minimumnum_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float %a, float %b)
   %fneg = fneg float %min
@@ -2044,8 +2042,7 @@ define float @v_fneg_self_minimumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_self_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v0
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, -v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float %a, float %a)
   %min.fneg = fneg float %min
@@ -2068,8 +2065,7 @@ define float @v_fneg_posk_minimumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_posk_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, -4.0, v0
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, -4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float 4.0, float %a)
   %fneg = fneg float %min
@@ -2092,8 +2088,7 @@ define float @v_fneg_negk_minimumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_negk_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float -4.0, float %a)
   %fneg = fneg float %min
@@ -2251,8 +2246,7 @@ define float @v_fneg_neg0_minimumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_neg0_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, 0, v0
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float -0.0, float %a)
   %fneg = fneg float %min
@@ -2299,7 +2293,6 @@ define float @v_fneg_0_minimumnum_foldable_use_f32_no_ieee(float %a, float %b) #
 ; GCN-LABEL: v_fneg_0_minimumnum_foldable_use_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -2330,9 +2323,7 @@ define <2 x float> @v_fneg_minimumnum_multi_use_minimumnum_f32_no_ieee(float %a,
 ; GCN-LABEL: v_fneg_minimumnum_multi_use_minimumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_max_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %min = call float @llvm.minimumnum.f32(float %a, float %b)
@@ -2364,9 +2355,7 @@ define float @v_fneg_maximumnum_f32_no_ieee(float %a, float %b) #4 {
 ; GCN-LABEL: v_fneg_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float %a, float %b)
   %fneg = fneg float %max
@@ -2389,8 +2378,7 @@ define float @v_fneg_self_maximumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_self_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v0
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, -v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float %a, float %a)
   %max.fneg = fneg float %max
@@ -2413,8 +2401,7 @@ define float @v_fneg_posk_maximumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_posk_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, -4.0, v0
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, -4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float 4.0, float %a)
   %fneg = fneg float %max
@@ -2437,8 +2424,7 @@ define float @v_fneg_negk_maximumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_negk_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float -4.0, float %a)
   %fneg = fneg float %max
@@ -2473,8 +2459,7 @@ define float @v_fneg_neg0_maximumnum_f32_no_ieee(float %a) #4 {
 ; GCN-LABEL: v_fneg_neg0_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, 0, v0
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float -0.0, float %a)
   %fneg = fneg float %max
@@ -2499,7 +2484,6 @@ define float @v_fneg_0_maximumnum_foldable_use_f32_no_ieee(float %a, float %b) #
 ; GCN-LABEL: v_fneg_0_maximumnum_foldable_use_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, 0, v0
 ; GCN-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -2530,9 +2514,7 @@ define <2 x float> @v_fneg_maximumnum_multi_use_maximumnum_f32_no_ieee(float %a,
 ; GCN-LABEL: v_fneg_maximumnum_multi_use_maximumnum_f32_no_ieee:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v1, -1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
-; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
+; GCN-NEXT:    v_min_f32_e64 v0, -v0, -v1
 ; GCN-NEXT:    v_mul_f32_e32 v1, -4.0, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %max = call float @llvm.maximumnum.f32(float %a, float %b)
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index c45d86ce306e7..4f73e8e9c1883 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -3414,8 +3414,8 @@ define <2 x half> @v_maximumnum_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3652,57 +3652,57 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v5
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_maximumnum_v3f16:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v3, v3
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v2
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_maximumnum_v3f16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v3f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX950-SDAG-LABEL: v_maximumnum_v3f16:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v3, v3
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v2
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_maximumnum_v3f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v3f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
@@ -3712,8 +3712,8 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
@@ -3722,11 +3722,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX11-SDAG-LABEL: v_maximumnum_v3f16:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3735,10 +3735,10 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3750,11 +3750,11 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v2
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v3
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3767,10 +3767,10 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3834,12 +3834,19 @@ define <3 x half> @v_maximumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v3f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3939,16 +3946,16 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v2, v2
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v6
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v5
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v5, v7
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v2, v5
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -3965,16 +3972,16 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v4f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v4f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_maximumnum_v4f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -3988,6 +3995,18 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_maximumnum_v4f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_maximumnum_v4f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4003,8 +4022,8 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
@@ -4026,10 +4045,10 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4058,10 +4077,10 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4268,22 +4287,22 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v4, v4
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v6, v9
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v6, v7
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v7, v10
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v4, v4
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v3, v7
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v8, v11
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v7
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
@@ -4304,19 +4323,19 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v3
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v6f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v5
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v6f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_maximumnum_v6f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4333,6 +4352,21 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v3
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_maximumnum_v6f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_maximumnum_v6f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4351,10 +4385,10 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
@@ -4380,15 +4414,14 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4419,15 +4452,14 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v5
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x half> @llvm.maximumnum.v6f16(<6 x half> %x, <6 x half> %y)
@@ -4554,28 +4586,28 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v4, v4
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v4, v4
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v6, v6
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v15, v7, v7
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v8, v12
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v8, v9
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v9, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v4, v9
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v10, v14
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v5, v9
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v11, v15
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v7, v7
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v6, v9
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -4600,22 +4632,22 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_max_f16 v3, v3, v4
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v8f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v7
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v8f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v6, v6
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v7, v7
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v4
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_maximumnum_v8f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4635,6 +4667,24 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_max_f16 v3, v3, v4
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_maximumnum_v8f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v5, v5
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v6, v6
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v7, v7
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_maximumnum_v8f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4656,12 +4706,12 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v5
@@ -4691,18 +4741,17 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4736,18 +4785,17 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v6
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> %x, <8 x half> %y)
@@ -4978,52 +5026,52 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v16, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v8, v8
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v8, v8
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v8, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v16, v16, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v9, v9
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v9, v9
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v9, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v8, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v10, v10
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v10, v10
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v10, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v16, v16, v19
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v9, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v11, v11
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v11, v11
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v11, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v17, v8
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v4, v4
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v10, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v12, v12
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v4, v4
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v12, v12
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v12, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v18, v9
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v11, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v13, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v13, v13
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v13, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v19, v10
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v12, v17
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v14, v14
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v17, v11
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v7, v7
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v18, v12
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v14, v14
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v15, v15
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v19, v18
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v13, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v17, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v7, v7
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v15, v15
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v14, v17
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v16, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v8, v1
@@ -5031,8 +5079,8 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v11, v4
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v13, v7
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v13, v6
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v14, v7
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_maximumnum_v16f16:
@@ -5064,34 +5112,34 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_max_f16 v7, v7, v8
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_maximumnum_v16f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v9
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v10
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v11
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v12
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v13
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v14
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v15
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_maximumnum_v16f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v9, v9
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v10, v10
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v12, v12
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v13, v13
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v5, v5, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v14, v14
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v6, v6, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v15, v15
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v7, v7, v8
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_maximumnum_v16f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -5123,15 +5171,45 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_max_f16 v7, v7, v8
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_maximumnum_v16f16:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-LABEL: v_maximumnum_v16f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v9, v9
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v10, v10
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v12, v12
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v13, v13
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v14, v14
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v15, v15
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v8
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_maximumnum_v16f16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v9, v9, v9
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v10, v10, v10
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v8
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v8, v11, v11
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v9
@@ -5156,29 +5234,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v10
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v12, v12
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v13, v13
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v14, v14
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v9
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v15
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v15, v15
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v8
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v9
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v10
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v11
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v12
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_maximumnum_v16f16:
@@ -5214,29 +5292,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v10
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v12, v12
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v13, v13
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v14, v14
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v8
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v9
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v15
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v15, v15
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v8
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v9
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v10
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v11
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v12
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_maximumnum_v16f16:
@@ -5280,29 +5358,29 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v8
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v11, v11
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v10
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v12, v12
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v13, v13
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v14, v14
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v8
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v9
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v15
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v15, v15
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v8
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v9
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v10
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v11
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v12
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> %x, <16 x half> %y)
   ret <16 x half> %result
@@ -6174,34 +6252,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v16
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v17
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v18
@@ -6285,34 +6363,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v16
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v17
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v18
@@ -6396,34 +6474,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v16
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v17
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v18
@@ -6516,34 +6594,34 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v16, v16, v16
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v17, v17, v17
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v18, v18, v18
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v19, v19, v19
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v20, v20, v20
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v21, v21, v21
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v22, v22, v22
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v23, v23, v23
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v24, v24, v24
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v25, v25, v25
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v26, v26, v26
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v27, v27, v27
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v28, v28, v28
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v29, v29, v29
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v30, v30, v30
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v16
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v17
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v18
@@ -6584,11 +6662,11 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_maximumnum_v2f32:
@@ -6606,11 +6684,11 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_maximumnum_v2f32:
@@ -6624,29 +6702,16 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_maximumnum_v2f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_maximumnum_v2f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_maximumnum_v2f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6663,8 +6728,8 @@ define <2 x float> @v_maximumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
@@ -6784,14 +6849,14 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_maximumnum_v3f32:
@@ -6812,14 +6877,14 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_maximumnum_v3f32:
@@ -6836,40 +6901,19 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v2, v2, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_maximumnum_v3f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_maximumnum_v3f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, v3
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, v4
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v5
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v6, v6
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v7, v7
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_maximumnum_v3f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v3f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6889,10 +6933,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
@@ -6913,10 +6957,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX11-GISEL-LABEL: v_maximumnum_v3f32:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v5, v5, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v3 :: v_dual_max_f32 v1, v1, v4
 ; GFX11-GISEL-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -6943,10 +6987,10 @@ define <3 x float> @v_maximumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v5, v5, v5
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v3 :: v_dual_max_num_f32 v1, v1, v4
 ; GFX12-GISEL-NEXT:    v_max_num_f32_e32 v2, v2, v5
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7030,17 +7074,17 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v5
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v7
+; GFX7-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_maximumnum_v4f32:
@@ -7064,17 +7108,17 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v5
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v7
+; GFX8-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_maximumnum_v4f32:
@@ -7094,43 +7138,22 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_max_f32_e32 v3, v3, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_maximumnum_v4f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_maximumnum_v4f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_maximumnum_v4f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v4f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -7153,12 +7176,12 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v5
@@ -7182,10 +7205,10 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v1, v1, v5
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v6 :: v_dual_max_f32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7214,10 +7237,10 @@ define <4 x float> @v_maximumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v4 :: v_dual_max_num_f32 v1, v1, v5
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v6 :: v_dual_max_num_f32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7365,11 +7388,11 @@ define <2 x double> @v_maximumnum_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v2f64:
@@ -7606,14 +7629,14 @@ define <3 x double> @v_maximumnum_v3f64(<3 x double> %x, <3 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[6:7]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[8:9], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[10:11], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v3f64:
@@ -7895,17 +7918,17 @@ define <4 x double> @v_maximumnum_v4f64(<4 x double> %x, <4 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[8:9]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[12:13], v[12:13]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[8:9]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX950-GISEL-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX950-GISEL-NEXT:    v_max_f64 v[14:15], v[14:15], v[14:15]
-; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[10:11]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[12:13]
-; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[14:15], v[14:15]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[8:9]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_maximumnum_v4f64:
@@ -8091,10 +8114,10 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX7-SDAG-LABEL: v_maximumnum_f16_no_ieee:
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8107,89 +8130,35 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_maximumnum_f16_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX11-TRUE16-SDAG:       ; %bb.0:
-; GFX11-TRUE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximumnum_f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX11-TRUE16-GISEL:       ; %bb.0:
-; GFX11-TRUE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_maximumnum_f16_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-FAKE16-SDAG-LABEL: v_maximumnum_f16_no_ieee:
-; GFX11-FAKE16-SDAG:       ; %bb.0:
-; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-FAKE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_maximumnum_f16_no_ieee:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-FAKE16-GISEL-LABEL: v_maximumnum_f16_no_ieee:
-; GFX11-FAKE16-GISEL:       ; %bb.0:
-; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v0, v0, v1
-; GFX11-FAKE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_maximumnum_f16_no_ieee:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-SDAG-LABEL: v_maximumnum_f16_no_ieee:
 ; GFX12-TRUE16-SDAG:       ; %bb.0:
@@ -8320,85 +8289,35 @@ define half @v_maximumnum_f16_nan_no_ieee(half %x, half %y) #0 {
 }
 
 define float @v_maximumnum_f32_no_ieee(float %x, float %y) #0 {
-; GFX7-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f32_no_ieee:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_maximumnum_f32_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximumnum_f32_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_maximumnum_f32_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_maximumnum_f32_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_maximumnum_f32_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_maximumnum_f32_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_maximumnum_f32_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8472,87 +8391,35 @@ define float @v_maximumnum_f32_nnan_no_ieee(float %x, float %y) #0 {
 }
 
 define double @v_maximumnum_f64_no_ieee(double %x, double %y) #0 {
-; GFX7-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX7-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX7-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f64_no_ieee:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX10-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_maximumnum_f64_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX10-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximumnum_f64_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_maximumnum_f64_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_maximumnum_f64_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_maximumnum_f64_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_maximumnum_f64_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_maximumnum_f64_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8631,14 +8498,14 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX7-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7-SDAG-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -8659,11 +8526,7 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX8-SDAG:       ; %bb.0:
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT:    v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -8671,82 +8534,28 @@ define <2 x half> @v_maximumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_maximumnum_v2f16_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_maximumnum_v2f16_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_max_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_maximumnum_v2f16_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8909,12 +8718,19 @@ define <3 x half> @v_maximumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_max_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_max_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximumnum_v3f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 5cb051d2ab857..558006d2b6957 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -3239,8 +3239,8 @@ define <2 x half> @v_minimumnum_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3477,57 +3477,57 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v3, v3
 ; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v5
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v3, v3
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_minimumnum_v3f16:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v2, v3, v3
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v2
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_minimumnum_v3f16:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX9-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v3f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX950-SDAG-LABEL: v_minimumnum_v3f16:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v2, v3, v3
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v2
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-GISEL-LABEL: v_minimumnum_v3f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v3f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
@@ -3537,8 +3537,8 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
@@ -3547,11 +3547,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX11-SDAG-LABEL: v_minimumnum_v3f16:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX11-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3560,10 +3560,10 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3575,11 +3575,11 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-SDAG-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v0, v0, v2
 ; GFX12-SDAG-NEXT:    v_pk_min_num_f16 v1, v1, v3
 ; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -3592,10 +3592,10 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3659,12 +3659,19 @@ define <3 x half> @v_minimumnum_v3f16_nnan(<3 x half> %x, <3 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v3f16_nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f16_nnan:
 ; GFX10:       ; %bb.0:
@@ -3764,16 +3771,16 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v2, v2
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v6
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v5
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v5, v7
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v2, v5
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -3790,16 +3797,16 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v4f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v4f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_minimumnum_v4f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -3813,6 +3820,18 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_minimumnum_v4f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v3, v3
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v2
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_minimumnum_v4f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3828,8 +3847,8 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
@@ -3851,10 +3870,10 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3883,10 +3902,10 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v2
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v3
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4093,22 +4112,22 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v3, v3
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v4, v4
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v6, v6, v9
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v6, v6, v7
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v7, v10
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v4, v4
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v3, v3, v7
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v8, v11
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v7, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v7
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v6, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
@@ -4129,19 +4148,19 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_min_f16 v2, v2, v3
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v6f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v2, v2, v5
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v6f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v2, v2, v3
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_minimumnum_v6f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4158,6 +4177,21 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_min_f16 v2, v2, v3
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_minimumnum_v6f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v5, v5
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v2, v2, v3
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_minimumnum_v6f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4176,10 +4210,10 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
@@ -4205,15 +4239,14 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4244,15 +4277,14 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v3
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v5
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <6 x half> @llvm.minimumnum.v6f16(<6 x half> %x, <6 x half> %y)
@@ -4379,28 +4411,28 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v4, v4
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v4, v4
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v6, v6
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v15, v7, v7
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v8, v8, v12
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v8, v8, v9
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v9, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v4, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v4, v4, v9
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v5, v10, v14
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v5, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v5, v5, v9
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v2, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v6, v11, v15
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v6, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v7, v7
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v6, v6, v9
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v8, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -4425,22 +4457,22 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_min_f16 v3, v3, v4
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v8f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v5
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v2, v2, v6
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v3, v3, v7
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v8f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v6, v6
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v7, v7
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v3, v3, v4
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_minimumnum_v8f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4460,6 +4492,24 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_min_f16 v3, v3, v4
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX950-GISEL-LABEL: v_minimumnum_v8f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v5, v5
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v6, v6
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v2, v2, v4
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v7, v7
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v3, v3, v4
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: v_minimumnum_v8f16:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4481,12 +4531,12 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v5
@@ -4516,18 +4566,17 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v4
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v6
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4561,18 +4610,17 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v4
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v6
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> %x, <8 x half> %y)
@@ -4803,52 +4851,52 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_max_f16_e32 v16, v0, v0
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v8, v8
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v8, v8
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v8, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v16, v16, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v9, v9
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v8, v1, v1
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v9, v9
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v9, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v2, v2
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v8, v8, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v10, v10
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v9, v2, v2
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v10, v10
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v10, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v16, v16, v19
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v3, v3
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v9, v9, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v2, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v11, v11
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v10, v3, v3
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v11, v11
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v11, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v8, v17, v8
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v4, v4
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v10, v10, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v3, v3, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v12, v12
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v11, v4, v4
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v12, v12
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v12, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v9, v18, v9
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v5, v5
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v11, v11, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v4, v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v13, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v12, v5, v5
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v13, v13
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v5, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v13, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v10, v19, v10
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v19, v6, v6
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v12, v12, v17
+; GFX8-GISEL-NEXT:    v_min_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v6, v6
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v14, v14
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v6, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v11, v17, v11
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v7, v7
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v12, v18, v12
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v18, v14, v14
 ; GFX8-GISEL-NEXT:    v_max_f16_sdwa v14, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_sdwa v5, v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v13, v15, v15
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v18, v19, v18
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v13, v13, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v6, v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v13, v17, v13
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v14, v7, v7
+; GFX8-GISEL-NEXT:    v_max_f16_e32 v17, v15, v15
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v7, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_max_f16_sdwa v15, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v14, v14, v17
 ; GFX8-GISEL-NEXT:    v_min_f16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v16, v0
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v1, v8, v1
@@ -4856,8 +4904,8 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v3, v10, v3
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v4, v11, v4
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v18, v6
-; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v13, v7
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v6, v13, v6
+; GFX8-GISEL-NEXT:    v_or_b32_e32 v7, v14, v7
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_minimumnum_v16f16:
@@ -4889,34 +4937,34 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX900-SDAG-NEXT:    v_pk_min_f16 v7, v7, v8
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: v_minimumnum_v16f16:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX9-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v9
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v2, v2, v10
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v3, v3, v11
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v4, v4, v12
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v5, v5, v13
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v6, v6, v14
-; GFX9-GISEL-NEXT:    v_pk_min_f16 v7, v7, v15
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-GISEL-LABEL: v_minimumnum_v16f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v9, v9
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v1, v1, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v10, v10
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v2, v2, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v3, v3, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v12, v12
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v4, v4, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v13, v13
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v5, v5, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v14, v14
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v6, v6, v8
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX900-GISEL-NEXT:    v_pk_max_f16 v8, v15, v15
+; GFX900-GISEL-NEXT:    v_pk_min_f16 v7, v7, v8
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-SDAG-LABEL: v_minimumnum_v16f16:
 ; GFX950-SDAG:       ; %bb.0:
@@ -4948,15 +4996,45 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX950-SDAG-NEXT:    v_pk_min_f16 v7, v7, v8
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_minimumnum_v16f16:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-LABEL: v_minimumnum_v16f16:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v9, v9
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v10, v10
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v2, v2, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v3, v3, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v12, v12
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v4, v4, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v13, v13
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v5, v5, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v14, v14
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v6, v6, v8
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v15, v15
+; GFX950-GISEL-NEXT:    s_nop 0
+; GFX950-GISEL-NEXT:    v_pk_min_f16 v7, v7, v8
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_minimumnum_v16f16:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v8, v8, v8
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v9, v9, v9
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v10, v10, v10
+; GFX10-SDAG-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-SDAG-NEXT:    v_pk_min_f16 v0, v0, v8
 ; GFX10-SDAG-NEXT:    v_pk_max_f16 v8, v11, v11
 ; GFX10-SDAG-NEXT:    v_pk_min_f16 v1, v1, v9
@@ -4981,29 +5059,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v9
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v2, v2, v10
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v12, v12
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v13, v13
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v14, v14
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v9
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v2, v2, v10
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v3, v3, v11
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v4, v4, v12
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v5, v5, v13
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v6, v6, v14
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v7, v7, v15
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v15, v15
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v3, v3, v8
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v4, v4, v9
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v5, v5, v10
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v6, v6, v11
+; GFX10-GISEL-NEXT:    v_pk_min_f16 v7, v7, v12
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: v_minimumnum_v16f16:
@@ -5039,29 +5117,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v9
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v11, v11
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v10
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v12, v12
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v13, v13
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v14, v14
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v8
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v9
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v10
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v3, v3, v11
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v4, v4, v12
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v5, v5, v13
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v6, v6, v14
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v7, v7, v15
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v15, v15
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v3, v3, v8
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v4, v4, v9
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v5, v5, v10
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v6, v6, v11
+; GFX11-GISEL-NEXT:    v_pk_min_f16 v7, v7, v12
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_minimumnum_v16f16:
@@ -5105,29 +5183,29 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v8
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v9
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v11, v11
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v10
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v12, v12
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v13, v13
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v14, v14
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v8
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v9
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v10
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v3, v3, v11
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v4, v4, v12
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v5, v5, v13
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v6, v6, v14
-; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v7, v7, v15
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v15, v15
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v3, v3, v8
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v4, v4, v9
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v5, v5, v10
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v6, v6, v11
+; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v7, v7, v12
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> %x, <16 x half> %y)
   ret <16 x half> %result
@@ -5999,34 +6077,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX950-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX950-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v16
 ; GFX950-GISEL-NEXT:    v_pk_min_f16 v1, v1, v17
 ; GFX950-GISEL-NEXT:    v_pk_min_f16 v2, v2, v18
@@ -6110,34 +6188,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX10-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX10-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v16
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v1, v1, v17
 ; GFX10-GISEL-NEXT:    v_pk_min_f16 v2, v2, v18
@@ -6221,34 +6299,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v16, v16, v16
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v17, v17, v17
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v2, v2, v2
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v18, v18, v18
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v19, v19, v19
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v4, v4, v4
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v20, v20, v20
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v5, v5, v5
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v21, v21, v21
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v6, v6, v6
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v22, v22, v22
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v7, v7, v7
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v23, v23, v23
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v8, v8, v8
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v24, v24, v24
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v9, v9, v9
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v25, v25, v25
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v10, v10, v10
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v26, v26, v26
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v11, v11, v11
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v27, v27, v27
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v12, v12, v12
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v28, v28, v28
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v13, v13, v13
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v29, v29, v29
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v14, v14, v14
 ; GFX11-GISEL-NEXT:    v_pk_max_f16 v30, v30, v30
+; GFX11-GISEL-NEXT:    v_pk_max_f16 v15, v15, v15
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v16
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v1, v1, v17
 ; GFX11-GISEL-NEXT:    v_pk_min_f16 v2, v2, v18
@@ -6341,34 +6419,34 @@ define <32 x half> @v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) {
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v0, v0, v0
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v16, v16, v16
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
-; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v17, v17, v17
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v2, v2, v2
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v18, v18, v18
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v19, v19, v19
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v4, v4, v4
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v20, v20, v20
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v5, v5, v5
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v21, v21, v21
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v6, v6, v6
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v22, v22, v22
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v7, v7, v7
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v23, v23, v23
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v8, v8, v8
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v24, v24, v24
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v9, v9, v9
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v25, v25, v25
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v10, v10, v10
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v26, v26, v26
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v11, v11, v11
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v27, v27, v27
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v12, v12, v12
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v28, v28, v28
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v13, v13, v13
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v29, v29, v29
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v14, v14, v14
 ; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v30, v30, v30
+; GFX12-GISEL-NEXT:    v_pk_max_num_f16 v15, v15, v15
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v0, v0, v16
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v1, v1, v17
 ; GFX12-GISEL-NEXT:    v_pk_min_num_f16 v2, v2, v18
@@ -6409,11 +6487,11 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_minimumnum_v2f32:
@@ -6431,11 +6509,11 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v3
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_minimumnum_v2f32:
@@ -6449,29 +6527,16 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_minimumnum_v2f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_minimumnum_v2f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_minimumnum_v2f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v3, v3
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v2f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6488,8 +6553,8 @@ define <2 x float> @v_minimumnum_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
@@ -6609,14 +6674,14 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_minimumnum_v3f32:
@@ -6637,14 +6702,14 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v4
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v5
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_minimumnum_v3f32:
@@ -6661,40 +6726,19 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v2, v2, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_minimumnum_v3f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_minimumnum_v3f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, v3
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, v4
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, v5
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v6, v6
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v7, v7
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_minimumnum_v3f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v4, v4
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v5, v5
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v3f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6714,10 +6758,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v0, v0, v3
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
@@ -6738,10 +6782,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX11-GISEL-LABEL: v_minimumnum_v3f32:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v4, v4, v4
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v5, v5, v5
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_dual_min_f32 v0, v0, v3 :: v_dual_min_f32 v1, v1, v4
 ; GFX11-GISEL-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -6768,10 +6812,10 @@ define <3 x float> @v_minimumnum_v3f32(<3 x float> %x, <3 x float> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v3, v3, v3
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v4, v4, v4
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v5, v5, v5
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_dual_min_num_f32 v0, v0, v3 :: v_dual_min_num_f32 v1, v1, v4
 ; GFX12-GISEL-NEXT:    v_min_num_f32_e32 v2, v2, v5
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -6855,17 +6899,17 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX7-GISEL:       ; %bb.0:
 ; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v5
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX7-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX7-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v7
+; GFX7-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-SDAG-LABEL: v_minimumnum_v4f32:
@@ -6889,17 +6933,17 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v5
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v6
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
 ; GFX8-GISEL-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX8-GISEL-NEXT:    v_mul_f32_e32 v4, 1.0, v7
+; GFX8-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SDAG-LABEL: v_minimumnum_v4f32:
@@ -6919,43 +6963,22 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX9-SDAG-NEXT:    v_min_f32_e32 v3, v3, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-GISEL-LABEL: v_minimumnum_v4f32:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v6, v6, v6
-; GFX900-GISEL-NEXT:    v_max_f32_e32 v7, v7, v7
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX900-GISEL-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_minimumnum_v4f32:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_pk_mul_f32 v[6:7], 1.0, v[6:7] op_sel_hi:[0,1]
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX950-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
-; GFX950-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-GISEL-LABEL: v_minimumnum_v4f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v6, v6
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v2, v2, v4
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX9-GISEL-NEXT:    v_max_f32_e32 v4, v7, v7
+; GFX9-GISEL-NEXT:    v_min_f32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v4f32:
 ; GFX10-SDAG:       ; %bb.0:
@@ -6978,12 +7001,12 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX10-GISEL:       ; %bb.0:
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX10-GISEL-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX10-GISEL-NEXT:    v_max_f32_e32 v7, v7, v7
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v0, v0, v4
 ; GFX10-GISEL-NEXT:    v_min_f32_e32 v1, v1, v5
@@ -7007,10 +7030,10 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5
+; GFX11-GISEL-NEXT:    v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3
 ; GFX11-GISEL-NEXT:    v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v1, v1, v5
 ; GFX11-GISEL-NEXT:    v_dual_min_f32 v2, v2, v6 :: v_dual_min_f32 v3, v3, v7
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7039,10 +7062,10 @@ define <4 x float> @v_minimumnum_v4f32(<4 x float> %x, <4 x float> %y) {
 ; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1
-; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5
+; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3
 ; GFX12-GISEL-NEXT:    v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_dual_min_num_f32 v0, v0, v4 :: v_dual_min_num_f32 v1, v1, v5
 ; GFX12-GISEL-NEXT:    v_dual_min_num_f32 v2, v2, v6 :: v_dual_min_num_f32 v3, v3, v7
 ; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -7190,11 +7213,11 @@ define <2 x double> @v_minimumnum_v2f64(<2 x double> %x, <2 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[4:5]
-; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[6:7], v[6:7]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v2f64:
@@ -7431,14 +7454,14 @@ define <3 x double> @v_minimumnum_v3f64(<3 x double> %x, <3 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
 ; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[6:7]
-; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
-; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[10:11]
+; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[8:9], v[8:9]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[6:7]
+; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[10:11], v[10:11]
+; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v3f64:
@@ -7720,17 +7743,17 @@ define <4 x double> @v_minimumnum_v4f64(<4 x double> %x, <4 x double> %y) {
 ; GFX950-GISEL:       ; %bb.0:
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[10:11], v[10:11]
+; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[8:9]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[12:13], v[12:13]
+; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[8:9]
 ; GFX950-GISEL-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
-; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX950-GISEL-NEXT:    v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX950-GISEL-NEXT:    v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX950-GISEL-NEXT:    v_max_f64 v[14:15], v[14:15], v[14:15]
-; GFX950-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[8:9]
-; GFX950-GISEL-NEXT:    v_min_f64 v[2:3], v[2:3], v[10:11]
-; GFX950-GISEL-NEXT:    v_min_f64 v[4:5], v[4:5], v[12:13]
-; GFX950-GISEL-NEXT:    v_min_f64 v[6:7], v[6:7], v[14:15]
+; GFX950-GISEL-NEXT:    v_max_f64 v[8:9], v[14:15], v[14:15]
+; GFX950-GISEL-NEXT:    v_min_f64 v[6:7], v[6:7], v[8:9]
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: v_minimumnum_v4f64:
@@ -7916,10 +7939,10 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX7-SDAG-LABEL: v_minimumnum_f16_no_ieee:
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX7-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7932,89 +7955,35 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 {
 ; GFX7-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_minimumnum_f16_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX11-TRUE16-SDAG:       ; %bb.0:
-; GFX11-TRUE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-SDAG-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-SDAG-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimumnum_f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX11-TRUE16-GISEL:       ; %bb.0:
-; GFX11-TRUE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-GISEL-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-GISEL-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_minimumnum_f16_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-FAKE16-SDAG-LABEL: v_minimumnum_f16_no_ieee:
-; GFX11-FAKE16-SDAG:       ; %bb.0:
-; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-FAKE16-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_minimumnum_f16_no_ieee:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-FAKE16-GISEL-LABEL: v_minimumnum_f16_no_ieee:
-; GFX11-FAKE16-GISEL:       ; %bb.0:
-; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-GISEL-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-GISEL-NEXT:    v_min_f16_e32 v0, v0, v1
-; GFX11-FAKE16-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_minimumnum_f16_no_ieee:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-TRUE16-SDAG-LABEL: v_minimumnum_f16_no_ieee:
 ; GFX12-TRUE16-SDAG:       ; %bb.0:
@@ -8145,85 +8114,35 @@ define half @v_minimumnum_f16_nan_no_ieee(half %x, half %y) #0 {
 }
 
 define float @v_minimumnum_f32_no_ieee(float %x, float %y) #0 {
-; GFX7-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-SDAG-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX8-GISEL-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX8-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX9-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f32_no_ieee:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_minimumnum_f32_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimumnum_f32_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_minimumnum_f32_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_minimumnum_f32_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_minimumnum_f32_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_minimumnum_f32_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_minimumnum_f32_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8297,87 +8216,35 @@ define float @v_minimumnum_f32_nnan_no_ieee(float %x, float %y) #0 {
 }
 
 define double @v_minimumnum_f64_no_ieee(double %x, double %y) #0 {
-; GFX7-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX7-SDAG:       ; %bb.0:
-; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX7-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX7-GISEL:       ; %bb.0:
-; GFX7-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX7-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX7-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX7-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX8-SDAG:       ; %bb.0:
-; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX8-GISEL:       ; %bb.0:
-; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX8-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX8-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX9-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX9-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f64_no_ieee:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX10-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-LABEL: v_minimumnum_f64_no_ieee:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX10-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX10-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimumnum_f64_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_minimumnum_f64_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-SDAG-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_minimumnum_f64_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_minimumnum_f64_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
-; GFX11-GISEL-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_minimumnum_f64_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_minimumnum_f64_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8456,14 +8323,14 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX7-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX7-SDAG:       ; %bb.0:
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX7-SDAG-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7-SDAG-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -8484,11 +8351,7 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX8-SDAG:       ; %bb.0:
 ; GFX8-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT:    v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v1, v1, v1
-; GFX8-SDAG-NEXT:    v_max_f16_e32 v0, v0, v0
-; GFX8-SDAG-NEXT:    v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-SDAG-NEXT:    v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-SDAG-NEXT:    v_min_f16_e32 v0, v0, v1
 ; GFX8-SDAG-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-SDAG-NEXT:    s_setpc_b64 s[30:31]
@@ -8496,82 +8359,28 @@ define <2 x half> @v_minimumnum_v2f16_no_ieee(<2 x half> %x, <2 x half> %y) #0 {
 ; GFX8-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX8-GISEL:       ; %bb.0:
 ; GFX8-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v2, v0, v0
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_max_f16_e32 v3, v1, v1
-; GFX8-GISEL-NEXT:    v_max_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v2, v3
-; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-GISEL-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX8-GISEL-NEXT:    v_min_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX900-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX900-GISEL-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX950-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX10-SDAG:       ; %bb.0:
-; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX10-GISEL:       ; %bb.0:
-; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX10-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX10-GISEL-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-SDAG-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: v_minimumnum_v2f16_no_ieee:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-GISEL-NEXT:    v_pk_max_f16 v1, v1, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_minimumnum_v2f16_no_ieee:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-LABEL: v_minimumnum_v2f16_no_ieee:
 ; GFX12-SDAG:       ; %bb.0:
@@ -8734,12 +8543,19 @@ define <3 x half> @v_minimumnum_v3f16_nnan_no_ieee(<3 x half> %x, <3 x half> %y)
 ; GFX8-GISEL-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX8-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_min_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_min_f16 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-SDAG-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX9-GISEL-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimumnum_v3f16_nnan_no_ieee:
 ; GFX10:       ; %bb.0:

From 72fb8ae541dcb6d4ab24283bd91a1fc64a9b7e3b Mon Sep 17 00:00:00 2001
From: Daniil Kovalev <dkovalev@accesssoftek.com>
Date: Tue, 17 Jun 2025 15:29:37 +0000
Subject: [PATCH 750/851] [lld][test][PAC] Do not rely on concrete offsets in
 LTO tests (#143358)

When changing codegen (e.g. in #130809), offsets in binaries produced by
LTO tests might change. We do not need to match concrete offset values,
it's enough to ensure that hex values in particular places are
identical.

---------

Co-authored-by: Anatoly Trosinenko <atrosinenko@accesssoftek.com>
---
 lld/test/ELF/lto/aarch64-pac-got-func.ll | 50 ++++++++++++------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/lld/test/ELF/lto/aarch64-pac-got-func.ll b/lld/test/ELF/lto/aarch64-pac-got-func.ll
index a37c67a2f3ba8..0baa3559a6f90 100644
--- a/lld/test/ELF/lto/aarch64-pac-got-func.ll
+++ b/lld/test/ELF/lto/aarch64-pac-got-func.ll
@@ -5,29 +5,29 @@
 ; RUN: llvm-readelf -r -x.got %t | FileCheck %s
 
 ; CHECK:      Relocation section '.rela.dyn' at offset 0x3d0 contains 8 entries:
-; CHECK-NEXT:     Offset             Info             Type               Symbol's Value  Symbol's Name + Addend
-; CHECK-NEXT: 00000000000206a0  0000000100000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 func_undef + 0
-; CHECK-NEXT: 00000000000206a8  0000000200000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g1 + 0
-; CHECK-NEXT: 00000000000206b0  0000000300000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g2 + 0
-; CHECK-NEXT: 00000000000206b8  0000000400000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g3 + 0
-; CHECK-NEXT: 00000000000206c0  0000000500000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g4 + 0
-; CHECK-NEXT: 00000000000206c8  0000000600000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 var_undef + 0
-; CHECK-NEXT: 0000000000020690  0000000700000412 R_AARCH64_AUTH_GLOB_DAT 0000000000010490 func + 0
-; CHECK-NEXT: 0000000000020698  0000000a00000412 R_AARCH64_AUTH_GLOB_DAT 00000000000306d0 var + 0
+; CHECK-NEXT:     Offset                Info             Type               Symbol's Value  Symbol's Name + Addend
+; CHECK-NEXT: [[#%x,ADDR:]]        0000000100000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 func_undef + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x8]]  0000000200000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g1 + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x10]] 0000000300000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g2 + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x18]] 0000000400000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g3 + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x20]] 0000000500000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 g4 + 0
+; CHECK-NEXT: {{0*}}[[#ADDR+0x28]] 0000000600000412 R_AARCH64_AUTH_GLOB_DAT 0000000000000000 var_undef + 0
+; CHECK-NEXT: {{0*}}[[#ADDR-0x10]] 0000000700000412 R_AARCH64_AUTH_GLOB_DAT 0000000000010800 func + 0
+; CHECK-NEXT: {{0*}}[[#ADDR-0x8]]  0000000a00000412 R_AARCH64_AUTH_GLOB_DAT 0000000000031400 var + 0
 
 ; CHECK:      Hex dump of section '.got':
-; CHECK-NEXT: 0x00020690 00000000 00000080 00000000 000000a0
-;;                                      ^^ func: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA
-;;                                                        ^^ var: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-; CHECK-NEXT: 0x000206a0 00000000 00000080 00000000 000000a0
-;;                                      ^^ func_undef: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA
-;;                                                        ^^ g1: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-; CHECK-NEXT: 0x000206b0 00000000 000000a0 00000000 000000a0
-;;                                      ^^ g2: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-;;                                                        ^^ g3: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-; CHECK-NEXT: 0x000206c0 00000000 000000a0 00000000 000000a0
-;;                                      ^^ g4: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
-;;                                                        ^^ var_undef: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+; CHECK-NEXT: 0x{{0*}}[[#ADDR-0x10]] 00000000 00000080 00000000 000000a0
+;;                                                  ^^ func: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA
+;;                                                                    ^^ var: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+; CHECK-NEXT: 0x{{0*}}[[#ADDR]]      00000000 00000080 00000000 000000a0
+;;                                                  ^^ func_undef: 0b10000000 bit 63 address diversity = true, bits 61..60 key = IA
+;;                                                                    ^^ g1: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+; CHECK-NEXT: 0x{{0*}}[[#ADDR+0x10]] 00000000 000000a0 00000000 000000a0
+;;                                                  ^^ g2: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+;;                                                                    ^^ g3: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+; CHECK-NEXT: 0x{{0*}}[[#ADDR+0x20]] 00000000 000000a0 00000000 000000a0
+;;                                                  ^^ g4: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
+;;                                                                    ^^ var_undef: 0b10100000 bit 63 address diversity = true, bits 61..60 key = DA
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
@@ -37,16 +37,16 @@ target triple = "aarch64-unknown-linux-gnu"
 @g3 = external global ptr
 @g4 = external global ptr
 
-define void @func() {
+define void @func() align 1024 {
 entry:
   ret void
 }
 declare void @func_undef()
 
-@var = global i32 42
+@var = global i32 42, align 1024
 @var_undef = external global i32
 
-define void @bar() #0 {
+define void @bar() #0 align 1024 {
 entry:
   store ptr ptrauth (ptr @func, i32 0), ptr @g1
   store ptr ptrauth (ptr @func_undef, i32 0), ptr @g2
@@ -55,7 +55,7 @@ entry:
   ret void
 }
 
-define void @_start() {
+define void @_start() align 1024 {
 entry:
   ret void
 }

From 4ced29b8482e3537da7d27d410bf7947b0666b4c Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Tue, 17 Jun 2025 16:33:24 +0100
Subject: [PATCH 751/851] [lldb][Expression] Don't create Objective-C IR
 checker for pure-C++ targets/frames (#144503)

There's no need to create this utility function (and run it) for
targets/frames that aren't Objective-C/Objective-C++.
---
 .../Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp   | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
index 6ef5d3f5be6d9..be17c5421fc51 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp
@@ -18,6 +18,7 @@
 
 #include "lldb/Expression/UtilityFunction.h"
 #include "lldb/Target/ExecutionContext.h"
+#include "lldb/Target/Language.h"
 #include "lldb/Target/Process.h"
 #include "lldb/Target/StackFrame.h"
 #include "lldb/Target/Target.h"
@@ -46,7 +47,13 @@ ClangDynamicCheckerFunctions::Install(DiagnosticManager &diagnostic_manager,
     ObjCLanguageRuntime *objc_language_runtime =
         ObjCLanguageRuntime::Get(*process);
 
-    if (objc_language_runtime) {
+    SourceLanguage lang = process->GetTarget().GetLanguage();
+    if (!lang)
+      if (auto *frame = exe_ctx.GetFramePtr())
+        lang = frame->GetLanguage();
+
+    if (objc_language_runtime &&
+        Language::LanguageIsObjC(lang.AsLanguageType())) {
       Expected<std::unique_ptr<UtilityFunction>> checker_fn =
           objc_language_runtime->CreateObjectChecker(VALID_OBJC_OBJECT_CHECK_NAME, exe_ctx);
       if (!checker_fn)

From a5f5f1209aa122ee295ae0dc0f1ee594ad988ecd Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@amd.com>
Date: Tue, 17 Jun 2025 16:46:44 +0100
Subject: [PATCH 752/851] [AMDGPU] Use subtarget feature for v_lshl_add_u64
 pattern. NFC. (#144544)

Following on from #133723, use the new subtarget feature for the
selection pattern as well as for the instruction definition.
---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a005e0245b8ff..f372101cb7b77 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -806,7 +806,7 @@ def : GCNPat<
  (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
  (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
 
-let SubtargetPredicate = isGFX940Plus in
+let SubtargetPredicate = HasLshlAddU64Inst in
 def : GCNPat<
   (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)

From cd4e3843395329538feb1c29cd582471b482caf7 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Tue, 17 Jun 2025 08:37:15 -0700
Subject: [PATCH 753/851] [flang][test] Removed temporary workaround for
 buildbots.

---
 flang/test/Semantics/modfile75.F90 | 2 --
 1 file changed, 2 deletions(-)

diff --git a/flang/test/Semantics/modfile75.F90 b/flang/test/Semantics/modfile75.F90
index a61c59bbb31b8..8f7adafe7204d 100644
--- a/flang/test/Semantics/modfile75.F90
+++ b/flang/test/Semantics/modfile75.F90
@@ -1,6 +1,4 @@
 !RUN: rm -rf %t && mkdir -p %t
-! The next line is a temporary clean-up for the buildbots to pass.
-!RUN: rm -f modfile75a.mod modfile75b.mod
 !RUN: %flang -c -fhermetic-module-files -DWHICH=1 -J%t %s && %flang -c -fhermetic-module-files -DWHICH=2 -J%t %s && %flang_fc1 -fdebug-unparse -J%t %s | FileCheck %s
 
 #if WHICH == 1

From cf637b7e3554976419a0d672ad4c252137dc34f3 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles@arm.com>
Date: Tue, 17 Jun 2025 16:57:32 +0100
Subject: [PATCH 754/851] [flang][OpenMP] Fix goto within SECTION (#144502)

Previously we didn't push any context for SECTION and they are not
modelled with differing scopes and so goto detection couldn't tell that
GOTOs between two SECTIONs were between constructs rather than just
staying inside of the parent SECTIONS construct.

Fixes #143231
---
 flang/lib/Semantics/resolve-directives.cpp     | 18 ++++++++++++++++--
 .../Semantics/OpenMP/parallel-sections01.f90   |  2 ++
 flang/test/Semantics/OpenMP/sections-goto.f90  | 11 +++++++++++
 flang/test/Semantics/OpenMP/sections02.f90     |  2 ++
 4 files changed, 31 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Semantics/OpenMP/sections-goto.f90

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index b5f8667fe36f2..282660684e78a 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -384,6 +384,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   bool Pre(const parser::OpenMPSectionsConstruct &);
   void Post(const parser::OpenMPSectionsConstruct &) { PopContext(); }
 
+  bool Pre(const parser::OpenMPSectionConstruct &);
+  void Post(const parser::OpenMPSectionConstruct &) { PopContext(); }
+
   bool Pre(const parser::OpenMPCriticalConstruct &critical);
   void Post(const parser::OpenMPCriticalConstruct &) { PopContext(); }
 
@@ -2003,6 +2006,12 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionsConstruct &x) {
   return true;
 }
 
+bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionConstruct &x) {
+  PushContext(x.source, llvm::omp::Directive::OMPD_section);
+  GetContext().withinConstruct = true;
+  return true;
+}
+
 bool OmpAttributeVisitor::Pre(const parser::OpenMPCriticalConstruct &x) {
   const auto &beginCriticalDir{std::get<parser::OmpCriticalDirective>(x.t)};
   const auto &endCriticalDir{std::get<parser::OmpEndCriticalDirective>(x.t)};
@@ -3024,8 +3033,13 @@ void OmpAttributeVisitor::CheckLabelContext(const parser::CharBlock source,
     const parser::CharBlock target, std::optional<DirContext> sourceContext,
     std::optional<DirContext> targetContext) {
   auto dirContextsSame = [](DirContext &lhs, DirContext &rhs) -> bool {
-    // Sometimes nested constructs share a scope but are different contexts
-    return (lhs.scope == rhs.scope) && (lhs.directive == rhs.directive);
+    // Sometimes nested constructs share a scope but are different contexts.
+    // The directiveSource comparison is for OmpSection. Sections do not have
+    // their own scopes and two different sections both have the same directive.
+    // Their source however is different. This string comparison is unfortunate
+    // but should only happen for GOTOs inside of SECTION.
+    return (lhs.scope == rhs.scope) && (lhs.directive == rhs.directive) &&
+        (lhs.directiveSource == rhs.directiveSource);
   };
   unsigned version{context_.langOptions().OpenMPVersion};
   if (targetContext &&
diff --git a/flang/test/Semantics/OpenMP/parallel-sections01.f90 b/flang/test/Semantics/OpenMP/parallel-sections01.f90
index 6c5a053bf49c9..19448258af766 100644
--- a/flang/test/Semantics/OpenMP/parallel-sections01.f90
+++ b/flang/test/Semantics/OpenMP/parallel-sections01.f90
@@ -35,6 +35,8 @@ program OmpConstructSections01
    !$omp section
    print *, "This is a single statement structured block"
    !$omp section
+   !ERROR: invalid branch into an OpenMP structured block
+   !ERROR: invalid branch leaving an OpenMP structured block
    open (10, file="random-file-name.txt", err=30)
    !ERROR: invalid branch into an OpenMP structured block
    !ERROR: invalid branch leaving an OpenMP structured block
diff --git a/flang/test/Semantics/OpenMP/sections-goto.f90 b/flang/test/Semantics/OpenMP/sections-goto.f90
new file mode 100644
index 0000000000000..9fa9df9f50b9c
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/sections-goto.f90
@@ -0,0 +1,11 @@
+! RUN: %python %S/../test_errors.py %s %flang -fopenmp
+! Regression test for #143231
+
+!$omp sections
+! ERROR: invalid branch into an OpenMP structured block
+! ERROR: invalid branch leaving an OpenMP structured block
+goto 10
+!$omp section
+10 print *, "Invalid jump"
+!$omp end sections
+end
diff --git a/flang/test/Semantics/OpenMP/sections02.f90 b/flang/test/Semantics/OpenMP/sections02.f90
index ee29922a72c08..8144b491071d8 100644
--- a/flang/test/Semantics/OpenMP/sections02.f90
+++ b/flang/test/Semantics/OpenMP/sections02.f90
@@ -19,6 +19,8 @@ program OmpConstructSections01
    !$omp section
    print *, "This is a single statement structured block"
    !$omp section
+   !ERROR: invalid branch into an OpenMP structured block
+   !ERROR: invalid branch leaving an OpenMP structured block
    open (10, file="random-file-name.txt", err=30)
    !ERROR: invalid branch into an OpenMP structured block
    !ERROR: invalid branch leaving an OpenMP structured block

From 0108a5908cab5e418c683ef9b6e1810755344b5e Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Tue, 17 Jun 2025 07:55:52 -0700
Subject: [PATCH 755/851] [SLP]Fix a crash on an subvector size calculation for
 non-power-of-2 vector

Patch fixes cost estimation for the extractelements from non-power-of-2
vectors, defined as subvector extracts. In this case the subvector size
might be not adjusted to a whole register size, need to get the minimum
between whole vector size and the actual difference to prevent compiler
crash.

Fixes #143513
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 16 ++--
 .../test/Transforms/PhaseOrdering/X86/hadd.ll | 25 +++---
 .../test/Transforms/PhaseOrdering/X86/hsub.ll | 25 +++---
 .../SystemZ/non-power-2-subvector-extract.ll  | 87 +++++++++++++++++++
 4 files changed, 119 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d811e9d77d183..4551a365a6967 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12085,7 +12085,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     // FIXME: this must be moved to TTI for better estimation.
     unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
     auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
-                                        SmallVectorImpl<unsigned> &Indices)
+                                        SmallVectorImpl<unsigned> &Indices,
+                                        SmallVectorImpl<unsigned> &SubVecSizes)
         -> std::optional<TTI::ShuffleKind> {
       if (NumElts <= EltsPerVector)
         return std::nullopt;
@@ -12130,7 +12131,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                       return std::min(S, I);
                     }),
                 EltsPerVector);
-            Indices.push_back(OffsetReg1 % NumElts);
+            unsigned Index = OffsetReg1 % NumElts;
+            Indices.push_back(Index);
+            SubVecSizes.push_back(std::min(NumElts - Index, EltsPerVector));
           }
           Idx = I - OffsetReg1;
         }
@@ -12152,8 +12155,9 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
       copy(MaskSlice, SubMask.begin());
       SmallVector<unsigned, 2> Indices;
+      SmallVector<unsigned, 2> SubVecSizes;
       std::optional<TTI::ShuffleKind> RegShuffleKind =
-          CheckPerRegistersShuffle(SubMask, Indices);
+          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
       if (!RegShuffleKind) {
         if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
             !ShuffleVectorInst::isIdentityMask(
@@ -12171,12 +12175,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       }
       const unsigned BaseVF = getFullVectorNumberOfElements(
           *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
-      for (unsigned Idx : Indices) {
-        assert((Idx + EltsPerVector) <= BaseVF &&
+      for (const auto [Idx, SubVecSize] : zip(Indices, SubVecSizes)) {
+        assert((Idx + SubVecSize) <= BaseVF &&
                "SK_ExtractSubvector index out of range");
         Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                  getWidenedType(ScalarTy, BaseVF), {}, CostKind,
-                                 Idx, getWidenedType(ScalarTy, EltsPerVector));
+                                 Idx, getWidenedType(ScalarTy, SubVecSize));
       }
       // Second attempt to check, if just a permute is better estimated than
       // subvector extract.
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 28b48bd3ce6d9..9bfd92ef35a46 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -272,24 +272,21 @@ define <16 x i16> @add_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
 
 define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @add_v16i16_0123u56789uBCDEF(
-; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
-; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE2-NEXT:    [[BCD:%.*]] = add i16 [[BC]], [[BD]]
 ; SSE2-NEXT:    [[BEF:%.*]] = add i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HADD8:%.*]] = add <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
-; SSE2-NEXT:    [[HADDD1:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HADDE:%.*]] = insertelement <16 x i16> [[HADDD1]], i16 [[BCD]], i64 14
-; SSE2-NEXT:    [[HADDF:%.*]] = insertelement <16 x i16> [[HADDE]], i16 [[BEF]], i64 15
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDF]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:    [[HADD92:%.*]] = shufflevector <16 x i16> [[HADD8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HADDB:%.*]] = insertelement <16 x i16> [[HADD92]], i16 [[BEF]], i64 11
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP10]], [[TMP8]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HADDB]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @add_v16i16_0123u56789uBCDEF(
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index 0062527b678c9..13b4d7da97c9d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -272,24 +272,21 @@ define <16 x i16> @sub_v16i16_0123456789ABCDEF(<16 x i16> %a, <16 x i16> %b) {
 
 define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
 ; SSE2-LABEL: @sub_v16i16_0123u56789uBCDEF(
-; SSE2-NEXT:    [[BC:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 12
-; SSE2-NEXT:    [[BD:%.*]] = extractelement <16 x i16> [[B]], i64 13
-; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B]], i64 14
+; SSE2-NEXT:    [[BE:%.*]] = extractelement <16 x i16> [[B:%.*]], i64 14
 ; SSE2-NEXT:    [[BF:%.*]] = extractelement <16 x i16> [[B]], i64 15
-; SSE2-NEXT:    [[BCD:%.*]] = sub i16 [[BC]], [[BD]]
 ; SSE2-NEXT:    [[BEF:%.*]] = sub i16 [[BE]], [[BF]]
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> [[B1]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[HSUB8:%.*]] = sub <16 x i16> [[TMP3]], [[TMP7]]
-; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 10, i32 poison, i32 14, i32 24, i32 26, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 11, i32 poison, i32 15, i32 25, i32 27, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[B]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; SSE2-NEXT:    [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
-; SSE2-NEXT:    [[HSUBD1:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 poison, i32 poison>
-; SSE2-NEXT:    [[HSUBE:%.*]] = insertelement <16 x i16> [[HSUBD1]], i16 [[BCD]], i64 14
-; SSE2-NEXT:    [[HSUBF:%.*]] = insertelement <16 x i16> [[HSUBE]], i16 [[BEF]], i64 15
-; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBF]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE2-NEXT:    [[HSUB92:%.*]] = shufflevector <16 x i16> [[HSUB8]], <16 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[HSUBB:%.*]] = insertelement <16 x i16> [[HSUB92]], i16 [[BEF]], i64 11
+; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP8:%.*]] = shufflevector <16 x i16> [[B1]], <16 x i16> poison, <16 x i32> <i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT:    [[TMP9:%.*]] = sub <16 x i16> [[TMP10]], [[TMP8]]
+; SSE2-NEXT:    [[RESULT:%.*]] = shufflevector <16 x i16> [[HSUBB]], <16 x i16> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; SSE2-NEXT:    ret <16 x i16> [[RESULT]]
 ;
 ; SSE4-LABEL: @sub_v16i16_0123u56789uBCDEF(
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll
new file mode 100644
index 0000000000000..6006bf9cb262d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/non-power-2-subvector-extract.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=s390x-ibm-linux -mcpu=z13 -slp-max-reg-size=256 -slp-vectorize-hor-store -slp-vectorize-non-power-of-2 < %s | FileCheck %s
+
+@c = external global [1 x [10 x i32]]
+@j.0 = external global i32
+
+define void @p() {
+; CHECK-LABEL: define void @p(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[TMP0]], splat (i32 1)
+; CHECK-NEXT:    store <4 x i32> [[TMP3]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load <7 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <7 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[OR_1_5_I_3:%.*]] = or i32 [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    store i32 [[OR_1_5_I_3]], ptr @j.0, align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <7 x i32> [[TMP4]], i32 5
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <7 x i32> [[TMP4]], splat (i32 1)
+; CHECK-NEXT:    store <7 x i32> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP2]]
+; CHECK-NEXT:    [[OR_1_5_I_5:%.*]] = or i32 [[TMP10]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
+; CHECK-NEXT:    [[OR_1_6_I_5:%.*]] = or i32 [[OR_1_5_I_5]], [[TMP11]]
+; CHECK-NEXT:    store i32 [[OR_1_6_I_5]], ptr @j.0, align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i32> [[TMP8]], splat (i32 1)
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx12.promoted.5.i = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+  %conv14.5.i = xor i32 %arrayidx12.promoted.5.i, 1
+  store i32 %conv14.5.i, ptr getelementptr inbounds nuw (i8, ptr @c, i64 200), align 4
+  %arrayidx12.promoted.5.i.1 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 204), align 4
+  %conv14.5.i.1 = xor i32 %arrayidx12.promoted.5.i.1, 1
+  store i32 %conv14.5.i.1, ptr getelementptr inbounds nuw (i8, ptr @c, i64 204), align 4
+  %arrayidx12.promoted.5.i.2 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 208), align 4
+  %conv14.5.i.2 = xor i32 %arrayidx12.promoted.5.i.2, 1
+  store i32 %conv14.5.i.2, ptr getelementptr inbounds nuw (i8, ptr @c, i64 208), align 4
+  %arrayidx12.promoted.1.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+  %conv14.1.i.3 = xor i32 %arrayidx12.promoted.1.i.3, 1
+  store i32 %conv14.1.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 52), align 4
+  %arrayidx12.promoted.5.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 212), align 4
+  %conv14.5.i.3 = xor i32 %arrayidx12.promoted.5.i.3, 1
+  store i32 %conv14.5.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 212), align 4
+  %arrayidx12.promoted.6.i.3 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+  %conv14.6.i.3 = xor i32 %arrayidx12.promoted.6.i.3, 1
+  %or.1.5.i.3 = or i32 %arrayidx12.promoted.1.i.3, %arrayidx12.promoted.5.i.3
+  store i32 %conv14.6.i.3, ptr getelementptr inbounds nuw (i8, ptr @c, i64 252), align 4
+  store i32 %or.1.5.i.3, ptr @j.0, align 4
+  %arrayidx12.promoted.1.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 56), align 4
+  %conv14.1.i.4 = xor i32 %arrayidx12.promoted.1.i.4, 1
+  store i32 %conv14.1.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 56), align 4
+  %arrayidx12.promoted.5.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 216), align 4
+  %conv14.5.i.4 = xor i32 %arrayidx12.promoted.5.i.4, 1
+  store i32 %conv14.5.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 216), align 4
+  %arrayidx12.promoted.6.i.4 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 256), align 4
+  %conv14.6.i.4 = xor i32 %arrayidx12.promoted.6.i.4, 1
+  store i32 %conv14.6.i.4, ptr getelementptr inbounds nuw (i8, ptr @c, i64 256), align 4
+  %arrayidx12.promoted.1.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 60), align 4
+  %conv14.1.i.5 = xor i32 %arrayidx12.promoted.1.i.5, 1
+  store i32 %conv14.1.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 60), align 4
+  %arrayidx12.promoted.5.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 220), align 4
+  %conv14.5.i.5 = xor i32 %arrayidx12.promoted.5.i.5, 1
+  store i32 %conv14.5.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 220), align 4
+  %arrayidx12.promoted.6.i.5 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 260), align 4
+  %conv14.6.i.5 = xor i32 %arrayidx12.promoted.6.i.5, 1
+  %0 = or i32 %arrayidx12.promoted.6.i.4, %arrayidx12.promoted.1.i.5
+  %or.1.5.i.5 = or i32 %0, %arrayidx12.promoted.5.i.5
+  %or.1.6.i.5 = or i32 %or.1.5.i.5, %arrayidx12.promoted.6.i.5
+  store i32 %conv14.6.i.5, ptr getelementptr inbounds nuw (i8, ptr @c, i64 260), align 4
+  store i32 %or.1.6.i.5, ptr @j.0, align 4
+  %arrayidx12.promoted.1.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 64), align 4
+  %conv14.1.i.6 = xor i32 %arrayidx12.promoted.1.i.6, 1
+  store i32 %conv14.1.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 64), align 4
+  %arrayidx12.promoted.5.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 224), align 4
+  %conv14.5.i.6 = xor i32 %arrayidx12.promoted.5.i.6, 1
+  store i32 %conv14.5.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 224), align 4
+  %arrayidx12.promoted.6.i.6 = load i32, ptr getelementptr inbounds nuw (i8, ptr @c, i64 264), align 4
+  %conv14.6.i.6 = xor i32 %arrayidx12.promoted.6.i.6, 1
+  store i32 %conv14.6.i.6, ptr getelementptr inbounds nuw (i8, ptr @c, i64 264), align 4
+  ret void
+}

From 00139f10c3cd4118de7148635c820bb42843287a Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean@nvidia.com>
Date: Tue, 17 Jun 2025 09:00:18 -0700
Subject: [PATCH 756/851] [NVPTX] Cleanup ld/st lowering (#143936)

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp  | 450 +++++++------------
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h    |   3 +-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td      |   4 -
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td     |  94 ++--
 llvm/test/CodeGen/NVPTX/bug26185-2.ll        |  22 +-
 llvm/test/CodeGen/NVPTX/bug26185.ll          |  73 ++-
 llvm/test/CodeGen/NVPTX/i1-ext-load.ll       |   4 +-
 llvm/test/CodeGen/NVPTX/ldu-ldg.ll           |   8 +-
 llvm/test/CodeGen/NVPTX/variadics-backend.ll |  19 +-
 9 files changed, 306 insertions(+), 371 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 79b1bfbc8072b..ff10eea371049 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -136,7 +136,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     break;
   case NVPTXISD::LDUV2:
   case NVPTXISD::LDUV4:
-    if (tryLDGLDU(N))
+    if (tryLDU(N))
       return;
     break;
   case NVPTXISD::StoreV2:
@@ -324,7 +324,7 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
   case Intrinsic::nvvm_ldu_global_f:
   case Intrinsic::nvvm_ldu_global_i:
   case Intrinsic::nvvm_ldu_global_p:
-    return tryLDGLDU(N);
+    return tryLDU(N);
 
   case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
   case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
@@ -1048,35 +1048,28 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   assert(LD->readMem() && "Expected load");
 
   // do not support pre/post inc/dec
-  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
+  const LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(LD);
   if (PlainLoad && PlainLoad->isIndexed())
     return false;
 
-  EVT LoadedVT = LD->getMemoryVT();
-  if (!LoadedVT.isSimple())
+  const EVT LoadedEVT = LD->getMemoryVT();
+  if (!LoadedEVT.isSimple())
     return false;
+  const MVT LoadedVT = LoadedEVT.getSimpleVT();
 
   // Address Space Setting
   const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
   if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
-    return tryLDGLDU(N);
+    return tryLDG(LD);
 
-  SDLoc DL(N);
+  SDLoc DL(LD);
   SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
 
-  // Type Setting: fromType + fromTypeWidth
-  //
-  // Sign   : ISD::SEXTLOAD
-  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
-  //          type is integer
-  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
-  MVT SimpleVT = LoadedVT.getSimpleVT();
-  // Read at least 8 bits (predicates are stored as 8-bit values)
-  unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits());
+  const unsigned FromTypeWidth = LoadedVT.getSizeInBits();
 
   // Vector Setting
-  unsigned int FromType =
+  const unsigned FromType =
       (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
           ? NVPTX::PTXLdStInstCode::Signed
           : NVPTX::PTXLdStInstCode::Untyped;
@@ -1102,29 +1095,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   if (!Opcode)
     return false;
 
-  SDNode *NVPTXLD =
-      CurDAG->getMachineNode(*Opcode, DL, TargetVT, MVT::Other, Ops);
+  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
   if (!NVPTXLD)
     return false;
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
+  MachineMemOperand *MemRef = LD->getMemOperand();
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
 
-  ReplaceNode(N, NVPTXLD);
+  ReplaceNode(LD, NVPTXLD);
   return true;
 }
 
-static bool isSubVectorPackedInI32(EVT EltVT) {
-  // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
-  // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
-  // vectorized loads/stores with the actual element type for i8/i16 as that
-  // would require v8/v16 variants that do not exist.
-  // In order to load/store such vectors efficiently, in Type Legalization
-  // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
-  // lower to PTX as vectors of b32.
-  return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
-}
-
 static unsigned getLoadStoreVectorNumElts(SDNode *N) {
   switch (N->getOpcode()) {
   case NVPTXISD::LoadV2:
@@ -1142,21 +1123,21 @@ static unsigned getLoadStoreVectorNumElts(SDNode *N) {
 }
 
 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
-  MemSDNode *MemSD = cast<MemSDNode>(N);
-  const EVT MemEVT = MemSD->getMemoryVT();
+  MemSDNode *LD = cast<MemSDNode>(N);
+  const EVT MemEVT = LD->getMemoryVT();
   if (!MemEVT.isSimple())
     return false;
   const MVT MemVT = MemEVT.getSimpleVT();
 
   // Address Space Setting
-  const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
-  if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace))
-    return tryLDGLDU(N);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+  if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
+    return tryLDG(LD);
 
-  EVT EltVT = N->getValueType(0);
-  SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
+  const MVT EltVT = LD->getSimpleValueType(0);
+  SDLoc DL(LD);
+  SDValue Chain = LD->getChain();
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
 
   // Type Setting: fromType + fromTypeWidth
   //
@@ -1167,18 +1148,15 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   // Read at least 8 bits (predicates are stored as 8-bit values)
   // The last operand holds the original LoadSDNode::getExtensionType() value
   const unsigned TotalWidth = MemVT.getSizeInBits();
-  unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1);
-  unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
-                          ? NVPTX::PTXLdStInstCode::Signed
-                          : NVPTX::PTXLdStInstCode::Untyped;
+  const unsigned ExtensionType =
+      N->getConstantOperandVal(N->getNumOperands() - 1);
+  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
+                                ? NVPTX::PTXLdStInstCode::Signed
+                                : NVPTX::PTXLdStInstCode::Untyped;
 
-  unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
-
-  if (isSubVectorPackedInI32(EltVT)) {
-    assert(ExtensionType == ISD::NON_EXTLOAD);
-    EltVT = MVT::i32;
-  }
+  const unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);
 
+  assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
   assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
          FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
 
@@ -1196,192 +1174,183 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   std::optional<unsigned> Opcode;
   switch (N->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable("Unexpected opcode");
   case NVPTXISD::LoadV2:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2,
-                             NVPTX::LDV_i16_v2, NVPTX::LDV_i32_v2,
-                             NVPTX::LDV_i64_v2);
+    Opcode =
+        pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v2, NVPTX::LDV_i16_v2,
+                        NVPTX::LDV_i32_v2, NVPTX::LDV_i64_v2);
     break;
   case NVPTXISD::LoadV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4,
-                             NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4,
-                             NVPTX::LDV_i64_v4);
+    Opcode =
+        pickOpcodeForVT(EltVT.SimpleTy, NVPTX::LDV_i8_v4, NVPTX::LDV_i16_v4,
+                        NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4);
     break;
   case NVPTXISD::LoadV8:
-    Opcode =
-        pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                        {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(EltVT.SimpleTy, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::LDV_i32_v8, {/* no v8i64 */});
     break;
   }
   if (!Opcode)
     return false;
 
-  SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
+  SDNode *NVPTXLD = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
-  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});
+  MachineMemOperand *MemRef = LD->getMemOperand();
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
 
-  ReplaceNode(N, LD);
+  ReplaceNode(LD, NVPTXLD);
   return true;
 }
 
-bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
-  auto *Mem = cast<MemSDNode>(N);
-
-  // If this is an LDG intrinsic, the address is the third operand. If its an
-  // LDG/LDU SD node (from custom vector handling), then its the second operand
-  SDValue Op1 = N->getOperand(N->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
+bool NVPTXDAGToDAGISel::tryLDG(MemSDNode *LD) {
+  const EVT LoadedEVT = LD->getMemoryVT();
+  if (!LoadedEVT.isSimple())
+    return false;
+  const MVT LoadedVT = LoadedEVT.getSimpleVT();
 
-  const EVT OrigType = N->getValueType(0);
-  EVT EltVT = Mem->getMemoryVT();
-  unsigned NumElts = 1;
+  SDLoc DL(LD);
 
-  if (EltVT == MVT::i128 || EltVT == MVT::f128) {
-    EltVT = MVT::i64;
-    NumElts = 2;
-  }
-  if (EltVT.isVector()) {
-    NumElts = EltVT.getVectorNumElements();
-    EltVT = EltVT.getVectorElementType();
-    // vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16
-    // elements.
-    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
-        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
-        (EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
-        (EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
-      assert(NumElts % OrigType.getVectorNumElements() == 0 &&
-             "NumElts must be divisible by the number of elts in subvectors");
-      EltVT = OrigType;
-      NumElts /= OrigType.getVectorNumElements();
-    }
+  const unsigned TotalWidth = LoadedVT.getSizeInBits();
+  unsigned ExtensionType;
+  unsigned NumElts;
+  if (const auto *Load = dyn_cast<LoadSDNode>(LD)) {
+    ExtensionType = Load->getExtensionType();
+    NumElts = 1;
+  } else {
+    ExtensionType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
+    NumElts = getLoadStoreVectorNumElts(LD);
   }
+  const unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
+                                ? NVPTX::PTXLdStInstCode::Signed
+                                : NVPTX::PTXLdStInstCode::Untyped;
 
-  // Build the "promoted" result VTList for the load. If we are really loading
-  // i8s, then the return type will be promoted to i16 since we do not expose
-  // 8-bit registers in NVPTX.
-  const EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
-  SmallVector<EVT, 5> InstVTs;
-  InstVTs.append(NumElts, NodeVT);
-  InstVTs.push_back(MVT::Other);
-  SDVTList InstVTList = CurDAG->getVTList(InstVTs);
-  SDValue Chain = N->getOperand(0);
+  const unsigned FromTypeWidth = TotalWidth / NumElts;
+
+  assert(!(LD->getSimpleValueType(0).isVector() &&
+           ExtensionType != ISD::NON_EXTLOAD));
+  assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
+         FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");
 
   SDValue Base, Offset;
-  SelectADDR(Op1, Base, Offset);
-  SDValue Ops[] = {Base, Offset, Chain};
+  SelectADDR(LD->getOperand(1), Base, Offset);
+  SDValue Ops[] = {getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base,
+                   Offset, LD->getChain()};
 
+  const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
   std::optional<unsigned> Opcode;
-  switch (N->getOpcode()) {
+  switch (LD->getOpcode()) {
   default:
-    return false;
+    llvm_unreachable("Unexpected opcode");
   case ISD::LOAD:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8,
-        NVPTX::INT_PTX_LDG_GLOBAL_i16, NVPTX::INT_PTX_LDG_GLOBAL_i32,
-        NVPTX::INT_PTX_LDG_GLOBAL_i64);
-    break;
-  case ISD::INTRINSIC_W_CHAIN:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8,
-        NVPTX::INT_PTX_LDU_GLOBAL_i16, NVPTX::INT_PTX_LDU_GLOBAL_i32,
-        NVPTX::INT_PTX_LDU_GLOBAL_i64);
+    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_GLOBAL_NC_i8,
+                             NVPTX::LD_GLOBAL_NC_i16, NVPTX::LD_GLOBAL_NC_i32,
+                             NVPTX::LD_GLOBAL_NC_i64);
     break;
   case NVPTXISD::LoadV2:
     Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v2i8_ELE,
-        NVPTX::INT_PTX_LDG_G_v2i16_ELE, NVPTX::INT_PTX_LDG_G_v2i32_ELE,
-        NVPTX::INT_PTX_LDG_G_v2i64_ELE);
-    break;
-  case NVPTXISD::LDUV2:
-    Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v2i8_ELE,
-        NVPTX::INT_PTX_LDU_G_v2i16_ELE, NVPTX::INT_PTX_LDU_G_v2i32_ELE,
-        NVPTX::INT_PTX_LDU_G_v2i64_ELE);
+        TargetVT, NVPTX::LD_GLOBAL_NC_v2i8, NVPTX::LD_GLOBAL_NC_v2i16,
+        NVPTX::LD_GLOBAL_NC_v2i32, NVPTX::LD_GLOBAL_NC_v2i64);
     break;
   case NVPTXISD::LoadV4:
     Opcode = pickOpcodeForVT(
-        EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE,
-        NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE,
-        NVPTX::INT_PTX_LDG_G_v4i64_ELE);
-    break;
-  case NVPTXISD::LDUV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                             NVPTX::INT_PTX_LDU_G_v4i8_ELE,
-                             NVPTX::INT_PTX_LDU_G_v4i16_ELE,
-                             NVPTX::INT_PTX_LDU_G_v4i32_ELE, {/* no v4i64 */});
+        TargetVT, NVPTX::LD_GLOBAL_NC_v4i8, NVPTX::LD_GLOBAL_NC_v4i16,
+        NVPTX::LD_GLOBAL_NC_v4i32, NVPTX::LD_GLOBAL_NC_v4i64);
     break;
   case NVPTXISD::LoadV8:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                             {/* no v8i16 */}, NVPTX::INT_PTX_LDG_G_v8i32_ELE,
-                             {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(TargetVT, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::LD_GLOBAL_NC_v8i32, {/* no v8i64 */});
     break;
   }
   if (!Opcode)
     return false;
 
-  SDLoc DL(N);
-  SDNode *LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
+  SDNode *NVPTXLDG = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
 
-  // For automatic generation of LDG (through SelectLoad[Vector], not the
-  // intrinsics), we may have an extending load like:
-  //
-  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
-  //
-  // In this case, the matching logic above will select a load for the original
-  // memory type (in this case, i8) and our types will not match (the node needs
-  // to return an i32 in this case). Our LDG/LDU nodes do not support the
-  // concept of sign-/zero-extension, so emulate it here by adding an explicit
-  // CVT instruction. Ptxas should clean up any redundancies here.
-
-  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);
-
-  if (OrigType != EltVT &&
-      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
-    // We have an extending-load. The instruction we selected operates on the
-    // smaller type, but the SDNode we are replacing has the larger type. We
-    // need to emit a CVT to make the types match.
-    unsigned CvtOpc =
-        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);
-
-    // For each output value, apply the manual sign/zero-extension and make sure
-    // all users of the load go through that CVT.
-    for (unsigned i = 0; i != NumElts; ++i) {
-      SDValue Res(LD, i);
-      SDValue OrigVal(N, i);
-
-      SDNode *CvtNode =
-        CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
-                               CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
-                                                         DL, MVT::i32));
-      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
-    }
+  ReplaceNode(LD, NVPTXLDG);
+  return true;
+}
+
+bool NVPTXDAGToDAGISel::tryLDU(SDNode *N) {
+  auto *LD = cast<MemSDNode>(N);
+
+  unsigned NumElts;
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case ISD::INTRINSIC_W_CHAIN:
+    NumElts = 1;
+    break;
+  case NVPTXISD::LDUV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::LDUV4:
+    NumElts = 4;
+    break;
   }
 
-  ReplaceNode(N, LD);
+  const MVT::SimpleValueType SelectVT =
+      MVT::getIntegerVT(LD->getMemoryVT().getSizeInBits() / NumElts).SimpleTy;
+
+  // If this is an LDU intrinsic, the address is the third operand. If its an
+  // LDU SD node (from custom vector handling), then its the second operand
+  SDValue Addr =
+      LD->getOperand(LD->getOpcode() == ISD::INTRINSIC_W_CHAIN ? 2 : 1);
+
+  SDValue Base, Offset;
+  SelectADDR(Addr, Base, Offset);
+  SDValue Ops[] = {Base, Offset, LD->getChain()};
+
+  std::optional<unsigned> Opcode;
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case ISD::INTRINSIC_W_CHAIN:
+    Opcode =
+        pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_i8, NVPTX::LDU_GLOBAL_i16,
+                        NVPTX::LDU_GLOBAL_i32, NVPTX::LDU_GLOBAL_i64);
+    break;
+  case NVPTXISD::LDUV2:
+    Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v2i8,
+                             NVPTX::LDU_GLOBAL_v2i16, NVPTX::LDU_GLOBAL_v2i32,
+                             NVPTX::LDU_GLOBAL_v2i64);
+    break;
+  case NVPTXISD::LDUV4:
+    Opcode = pickOpcodeForVT(SelectVT, NVPTX::LDU_GLOBAL_v4i8,
+                             NVPTX::LDU_GLOBAL_v4i16, NVPTX::LDU_GLOBAL_v4i32,
+                             {/* no v4i64 */});
+    break;
+  }
+  if (!Opcode)
+    return false;
+
+  SDLoc DL(N);
+  SDNode *NVPTXLDU = CurDAG->getMachineNode(*Opcode, DL, LD->getVTList(), Ops);
+
+  ReplaceNode(LD, NVPTXLDU);
   return true;
 }
 
 bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   MemSDNode *ST = cast<MemSDNode>(N);
   assert(ST->writeMem() && "Expected store");
-  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
-  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
+  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(ST);
+  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(ST);
   assert((PlainStore || AtomicStore) && "Expected store");
 
   // do not support pre/post inc/dec
   if (PlainStore && PlainStore->isIndexed())
     return false;
 
-  EVT StoreVT = ST->getMemoryVT();
+  const EVT StoreVT = ST->getMemoryVT();
   if (!StoreVT.isSimple())
     return false;
 
   // Address Space Setting
-  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
 
-  SDLoc DL(N);
+  SDLoc DL(ST);
   SDValue Chain = ST->getChain();
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Vector Setting
   const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
@@ -1417,85 +1386,78 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   if (!NVPTXST)
     return false;
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
+  MachineMemOperand *MemRef = ST->getMemOperand();
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
-  ReplaceNode(N, NVPTXST);
+  ReplaceNode(ST, NVPTXST);
   return true;
 }
 
 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
-  SDValue Op1 = N->getOperand(1);
-  EVT EltVT = Op1.getValueType();
-  MemSDNode *MemSD = cast<MemSDNode>(N);
-  EVT StoreVT = MemSD->getMemoryVT();
+  MemSDNode *ST = cast<MemSDNode>(N);
+  const EVT StoreVT = ST->getMemoryVT();
   assert(StoreVT.isSimple() && "Store value is not simple");
 
   // Address Space Setting
-  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+  const unsigned CodeAddrSpace = getCodeAddrSpace(ST);
   if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
     report_fatal_error("Cannot store to pointer that points to constant "
                        "memory space");
   }
 
-  SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
+  SDLoc DL(ST);
+  SDValue Chain = ST->getChain();
+  const auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
 
   // Type Setting: toType + toTypeWidth
   // - for integer type, always use 'u'
   const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
 
-  unsigned NumElts = getLoadStoreVectorNumElts(N);
-
-  SmallVector<SDValue, 16> Ops(N->ops().slice(1, NumElts));
-  SDValue N2 = N->getOperand(NumElts + 1);
-  unsigned ToTypeWidth = TotalWidth / NumElts;
+  const unsigned NumElts = getLoadStoreVectorNumElts(ST);
 
-  if (isSubVectorPackedInI32(EltVT)) {
-    EltVT = MVT::i32;
-  }
+  SmallVector<SDValue, 16> Ops(ST->ops().slice(1, NumElts));
+  SDValue Addr = N->getOperand(NumElts + 1);
+  const unsigned ToTypeWidth = TotalWidth / NumElts;
 
   assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
          TotalWidth <= 256 && "Invalid width for store");
 
   SDValue Offset, Base;
-  SelectADDR(N2, Base, Offset);
+  SelectADDR(Addr, Base, Offset);
 
   Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
               getI32Imm(CodeAddrSpace, DL),
               getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
               getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});
 
+  const MVT::SimpleValueType EltVT =
+      ST->getOperand(1).getSimpleValueType().SimpleTy;
   std::optional<unsigned> Opcode;
-  switch (N->getOpcode()) {
+  switch (ST->getOpcode()) {
   default:
     return false;
   case NVPTXISD::StoreV2:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2,
-                             NVPTX::STV_i16_v2, NVPTX::STV_i32_v2,
-                             NVPTX::STV_i64_v2);
+    Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v2, NVPTX::STV_i16_v2,
+                             NVPTX::STV_i32_v2, NVPTX::STV_i64_v2);
     break;
   case NVPTXISD::StoreV4:
-    Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4,
-                             NVPTX::STV_i16_v4, NVPTX::STV_i32_v4,
-                             NVPTX::STV_i64_v4);
+    Opcode = pickOpcodeForVT(EltVT, NVPTX::STV_i8_v4, NVPTX::STV_i16_v4,
+                             NVPTX::STV_i32_v4, NVPTX::STV_i64_v4);
     break;
   case NVPTXISD::StoreV8:
-    Opcode =
-        pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
-                        {/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */});
+    Opcode = pickOpcodeForVT(EltVT, {/* no v8i8 */}, {/* no v8i16 */},
+                             NVPTX::STV_i32_v8, {/* no v8i64 */});
     break;
   }
 
   if (!Opcode)
     return false;
 
-  SDNode *ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
+  SDNode *NVPTXST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
 
-  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
-  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
+  MachineMemOperand *MemRef = ST->getMemOperand();
+  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
 
-  ReplaceNode(N, ST);
+  ReplaceNode(ST, NVPTXST);
   return true;
 }
 
@@ -2285,70 +2247,6 @@ void NVPTXDAGToDAGISel::SelectI128toV2I64(SDNode *N) {
   ReplaceNode(N, Mov);
 }
 
-/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
-/// conversion from \p SrcTy to \p DestTy.
-unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
-                                             LoadSDNode *LdNode) {
-  bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
-  switch (SrcTy.SimpleTy) {
-  default:
-    llvm_unreachable("Unhandled source type");
-  case MVT::i8:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
-    }
-  case MVT::i16:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
-    }
-  case MVT::i32:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
-    case MVT::i64:
-      return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
-    }
-  case MVT::i64:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::i8:
-      return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
-    case MVT::i16:
-      return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
-    case MVT::i32:
-      return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
-    }
-  case MVT::f16:
-    switch (DestTy.SimpleTy) {
-    default:
-      llvm_unreachable("Unhandled dest type");
-    case MVT::f32:
-      return NVPTX::CVT_f32_f16;
-    case MVT::f64:
-      return NVPTX::CVT_f64_f16;
-    }
-  }
-}
-
 bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
   SDLoc DL(N);
   assert(N->getOpcode() == ISD::ATOMIC_FENCE);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 473f4781a6c38..ff58e4486a222 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -75,7 +75,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   void SelectTexSurfHandle(SDNode *N);
   bool tryLoad(SDNode *N);
   bool tryLoadVector(SDNode *N);
-  bool tryLDGLDU(SDNode *N);
+  bool tryLDU(SDNode *N);
+  bool tryLDG(MemSDNode *N);
   bool tryStore(SDNode *N);
   bool tryStoreVector(SDNode *N);
   bool tryLoadParam(SDNode *N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 4c3501df57f84..5dbdce52f0553 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -135,11 +135,7 @@ def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
 def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
 def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
 def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
-def hasVote : Predicate<"Subtarget->hasVote()">;
-def hasDouble : Predicate<"Subtarget->hasDouble()">;
 def hasClusters : Predicate<"Subtarget->hasClusters()">;
-def hasLDG : Predicate<"Subtarget->hasLDG()">;
-def hasLDU : Predicate<"Subtarget->hasLDU()">;
 def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
 def noPTXASUnreachableBug : Predicate<"!Subtarget->hasPTXASUnreachableBug()">;
 def hasOptEnabled : Predicate<"TM.getOptLevel() != CodeGenOptLevel::None">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b3c1296cf0ca6..5de3dee1fb344 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2143,15 +2143,12 @@ defm INT_PTX_SATOM_XOR  : ATOM2_bitwise_impl<"xor">;
 
 class LDU_G<string TyStr, NVPTXRegClass regclass>
   :  NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
-               "ldu.global." # TyStr # " \t$result, [$src];",
-                      []>, Requires<[hasLDU]>;
+               "ldu.global." # TyStr # " \t$result, [$src];", []>;
 
-def INT_PTX_LDU_GLOBAL_i8  : LDU_G<"b8", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
-def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
-def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
-def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
-def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
+def LDU_GLOBAL_i8  : LDU_G<"b8",  Int16Regs>;
+def LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
+def LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
+def LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
 
 // vector
 
@@ -2168,19 +2165,14 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
                "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
 
 
-def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
-def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
-def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
+def LDU_GLOBAL_v2i8  : VLDU_G_ELE_V2<"b8",  Int16Regs>;
+def LDU_GLOBAL_v2i16 : VLDU_G_ELE_V2<"b16", Int16Regs>;
+def LDU_GLOBAL_v2i32 : VLDU_G_ELE_V2<"b32", Int32Regs>;
+def LDU_GLOBAL_v2i64 : VLDU_G_ELE_V2<"b64", Int64Regs>;
 
-def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
-def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v4i32_ELE  : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f16_ELE   : VLDU_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDU_G_v4f16x2_ELE  : VLDU_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDU_G_v4f32_ELE  : VLDU_G_ELE_V4<"b32", Float32Regs>;
+def LDU_GLOBAL_v4i8  : VLDU_G_ELE_V4<"b8",  Int16Regs>;
+def LDU_GLOBAL_v4i16 : VLDU_G_ELE_V4<"b16", Int16Regs>;
+def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", Int32Regs>;
 
 
 //-----------------------------------
@@ -2191,55 +2183,47 @@ def INT_PTX_LDU_G_v4f32_ELE  : VLDU_G_ELE_V4<"b32", Float32Regs>;
 // non-coherent texture cache, and therefore the values read must be read-only
 // during the lifetime of the kernel.
 
-class LDG_G<string TyStr, NVPTXRegClass regclass>
-  : NVPTXInst<(outs regclass:$result), (ins ADDR:$src),
-               "ld.global.nc." # TyStr # " \t$result, [$src];",
-                        []>, Requires<[hasLDG]>;
+class LDG_G<NVPTXRegClass regclass>
+  : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+               "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>;
 
-def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
-def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
-def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
-def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
-def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
+def LD_GLOBAL_NC_i8  : LDG_G<Int16Regs>;
+def LD_GLOBAL_NC_i16 : LDG_G<Int16Regs>;
+def LD_GLOBAL_NC_i32 : LDG_G<Int32Regs>;
+def LD_GLOBAL_NC_i64 : LDG_G<Int64Regs>;
 
 // vector
 
 // Elementized vector ldg
-class VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V2<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
-            (ins ADDR:$src),
-            "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+            (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+            "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>;
 
 
-class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V4<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), 
-            (ins ADDR:$src),
-            "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+            (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+            "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
 
-class VLDG_G_ELE_V8<string TyStr, NVPTXRegClass regclass> :
+class VLDG_G_ELE_V8<NVPTXRegClass regclass> :
   NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
                   regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
-             (ins ADDR:$src),
-             "ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
+             (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src),
+             "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;
 
 // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
-def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
-def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
-def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
-def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
-
-def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
-def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
-def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
-
-def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>;
-def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>;
-def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>;
-def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;
+def LD_GLOBAL_NC_v2i8  : VLDG_G_ELE_V2<Int16Regs>;
+def LD_GLOBAL_NC_v2i16 : VLDG_G_ELE_V2<Int16Regs>;
+def LD_GLOBAL_NC_v2i32 : VLDG_G_ELE_V2<Int32Regs>;
+def LD_GLOBAL_NC_v2i64 : VLDG_G_ELE_V2<Int64Regs>;
+
+def LD_GLOBAL_NC_v4i8  : VLDG_G_ELE_V4<Int16Regs>;
+def LD_GLOBAL_NC_v4i16 : VLDG_G_ELE_V4<Int16Regs>;
+def LD_GLOBAL_NC_v4i32 : VLDG_G_ELE_V4<Int32Regs>;
+
+def LD_GLOBAL_NC_v4i64 : VLDG_G_ELE_V4<Int64Regs>;
+def LD_GLOBAL_NC_v8i32 : VLDG_G_ELE_V8<Int32Regs>;
 
 multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
   if Supports32 then
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
index c4d1537557cad..4e11f58f85ee0 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
@@ -10,14 +11,29 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"
 
-; CHECK-LABEL: spam
 define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1, i64 %arg2, i64 %arg3) #0 {
+; CHECK-LABEL: spam(
+; CHECK:       .maxntid 1, 1, 1
+; CHECK-NEXT:  {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<9>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %bb
+; CHECK-NEXT:    ld.param.b64 %rd1, [spam_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [spam_param_3];
+; CHECK-NEXT:    shl.b64 %rd3, %rd2, 1;
+; CHECK-NEXT:    add.s64 %rd4, %rd1, %rd3;
+; CHECK-NEXT:    ld.param.b64 %rd5, [spam_param_1];
+; CHECK-NEXT:    ld.global.nc.s16 %r1, [%rd4+16];
+; CHECK-NEXT:    mul.wide.s32 %rd6, %r1, %r1;
+; CHECK-NEXT:    ld.global.b64 %rd7, [%rd5];
+; CHECK-NEXT:    add.s64 %rd8, %rd6, %rd7;
+; CHECK-NEXT:    st.global.b64 [%rd5], %rd8;
+; CHECK-NEXT:    ret;
 bb:
   %tmp5 = add nsw i64 %arg3, 8
   %tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5
-; CHECK: ld.global.nc.b16
   %tmp7 = load i16, ptr addrspace(1) %tmp6, align 2
-; CHECK: cvt.s32.s16
   %tmp8 = sext i16 %tmp7 to i64
   %tmp9 = mul nsw i64 %tmp8, %tmp8
   %tmp10 = load i64, ptr addrspace(1) %arg1, align 8
diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll
index 3b30ce560edbc..6148c0756e393 100644
--- a/llvm/test/CodeGen/NVPTX/bug26185.ll
+++ b/llvm/test/CodeGen/NVPTX/bug26185.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
@@ -7,45 +8,93 @@
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-unknown-unknown"
 
-; CHECK-LABEL: ex_zext
 define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_zext(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b64 %rd1, [ex_zext_param_0];
+; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [ex_zext_param_1];
+; CHECK-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT:    ld.global.nc.b8 %r1, [%rd2];
+; CHECK-NEXT:    st.global.b32 [%rd4], %r1;
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ld.global.nc.b8
   %val = load i8, ptr %data
-; CHECK: cvt.u32.u8
   %valext = zext i8 %val to i32
   store i32 %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_sext
 define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_sext(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b64 %rd1, [ex_sext_param_0];
+; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [ex_sext_param_1];
+; CHECK-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT:    ld.global.nc.s8 %r1, [%rd2];
+; CHECK-NEXT:    st.global.b32 [%rd4], %r1;
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ld.global.nc.b8
   %val = load i8, ptr %data
-; CHECK: cvt.s32.s8
   %valext = sext i8 %val to i32
   store i32 %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_zext_v2
 define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_zext_v2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b64 %rd1, [ex_zext_v2_param_0];
+; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [ex_zext_v2_param_1];
+; CHECK-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT:    ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r2, %rs1;
+; CHECK-NEXT:    st.global.v2.b32 [%rd4], {%r2, %r1};
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ld.global.nc.v2.b8
   %val = load <2 x i8>, ptr %data
-; CHECK: cvt.u32.u16
   %valext = zext <2 x i8> %val to <2 x i32>
   store <2 x i32> %valext, ptr %res
   ret void
 }
 
-; CHECK-LABEL: ex_sext_v2
 define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
+; CHECK-LABEL: ex_sext_v2(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    ld.param.b64 %rd1, [ex_sext_v2_param_0];
+; CHECK-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; CHECK-NEXT:    ld.param.b64 %rd3, [ex_sext_v2_param_1];
+; CHECK-NEXT:    cvta.to.global.u64 %rd4, %rd3;
+; CHECK-NEXT:    ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs2;
+; CHECK-NEXT:    cvt.s32.s8 %r2, %r1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs1;
+; CHECK-NEXT:    cvt.s32.s8 %r4, %r3;
+; CHECK-NEXT:    st.global.v2.b32 [%rd4], {%r4, %r2};
+; CHECK-NEXT:    ret;
 entry:
-; CHECK: ld.global.nc.v2.b8
   %val = load <2 x i8>, ptr %data
-; CHECK: cvt.s32.s8
   %valext = sext <2 x i8> %val to <2 x i32>
   store <2 x i32> %valext, ptr %res
   ret void
diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
index bb88d1f2755ca..3dceefb93a47d 100644
--- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll
@@ -7,7 +7,6 @@ target triple = "nvptx-nvidia-cuda"
 
 define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK-LABEL: foo(
-; CHECK:    .reg .b16 %rs<2>;
 ; CHECK:    .reg .b32 %r<4>;
 ; CHECK:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
@@ -15,8 +14,7 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
 ; CHECK:    cvta.to.global.u64 %rd2, %rd1;
 ; CHECK:    ld.param.b64 %rd3, [foo_param_1];
 ; CHECK:    cvta.to.global.u64 %rd4, %rd3;
-; CHECK:    ld.global.nc.b8 %rs1, [%rd2];
-; CHECK:    cvt.u32.u8 %r1, %rs1;
+; CHECK:    ld.global.nc.b8 %r1, [%rd2];
 ; CHECK:    add.s32 %r2, %r1, 1;
 ; CHECK:    and.b32 %r3, %r2, 1;
 ; CHECK:    st.global.b32 [%rd4], %r3;
diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
index 7ac697c4ce203..7f4b049af84fb 100644
--- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
+++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll
@@ -163,14 +163,12 @@ define <2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) {
 define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldg_i8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_i8_param_0];
-; CHECK-NEXT:    ld.global.nc.b8 %rs1, [%rd1];
-; CHECK-NEXT:    cvt.u32.u8 %r1, %rs1;
+; CHECK-NEXT:    ld.global.nc.b8 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call i8 @llvm.nvvm.ldg.global.i.i8.p1(ptr addrspace(1) %ptr, i32 4)
@@ -180,14 +178,12 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) {
 define i16 @test_ldg_i16(ptr addrspace(1) %ptr) {
 ; CHECK-LABEL: test_ldg_i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_ldg_i16_param_0];
-; CHECK-NEXT:    ld.global.nc.b16 %rs1, [%rd1];
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT:    ld.global.nc.b16 %r1, [%rd1];
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
 ; CHECK-NEXT:    ret;
   %val = tail call i16 @llvm.nvvm.ldg.global.i.i16.p1(ptr addrspace(1) %ptr, i32 2)
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 3bbdf641ade26..ddaa9fd831af7 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -211,7 +211,7 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .local .align 8 .b8 __local_depot3[24];
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
-; CHECK-PTX-NEXT:    .reg .b16 %rs<8>;
+; CHECK-PTX-NEXT:    .reg .b16 %rs<5>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
 ; CHECK-PTX-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX-EMPTY:
@@ -220,18 +220,15 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
 ; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
-; CHECK-PTX-NEXT:    cvt.u16.u8 %rs2, %rs1;
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6];
-; CHECK-PTX-NEXT:    cvt.u16.u8 %rs4, %rs3;
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2+1], %rs4;
-; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5];
-; CHECK-PTX-NEXT:    cvt.u16.u8 %rs6, %rs5;
-; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs6;
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2+2], %rs1;
+; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2+1], %rs2;
+; CHECK-PTX-NEXT:    ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
+; CHECK-PTX-NEXT:    st.local.b8 [%rd2], %rs3;
 ; CHECK-PTX-NEXT:    mov.b32 %r1, 1;
 ; CHECK-PTX-NEXT:    st.b32 [%SP+8], %r1;
-; CHECK-PTX-NEXT:    mov.b16 %rs7, 1;
-; CHECK-PTX-NEXT:    st.b8 [%SP+12], %rs7;
+; CHECK-PTX-NEXT:    mov.b16 %rs4, 1;
+; CHECK-PTX-NEXT:    st.b8 [%SP+12], %rs4;
 ; CHECK-PTX-NEXT:    mov.b64 %rd3, 1;
 ; CHECK-PTX-NEXT:    st.b64 [%SP+16], %rd3;
 ; CHECK-PTX-NEXT:    add.u64 %rd4, %SP, 8;

From eb31c422d0dc816bf285a81bf92690d4d16273ed Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 17 Jun 2025 21:43:17 +0530
Subject: [PATCH 757/851] [Driver] Add support for GCC installation detection
 in Baremetal toolchain (#121829)

This patch introduces enhancements to the Baremetal toolchain to support
GCC toolchain detection.
- If the --gcc-install-dir or --gcc-toolchain options are provided and
point to valid paths, the sysroot is derived from those locations.
- If not, the logic falls back to the existing sysroot inference
mechanism already present in the Baremetal toolchain.
- Support for adding include paths for the libstdc++ library has also
been added.

Additionally, the restriction to always use the integrated assembler has
been removed. With a valid GCC installation, the GNU assembler can now
be used as well.

This patch currently updates and adds tests for the ARM target only.
RISC-V-specific tests will be introduced in a later patch, once the
RISCVToolChain is fully merged into the Baremetal toolchain. At this
stage, there is no way to test the RISC-V target within this PR.

RFC:
https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/docs/Toolchain.rst                      |   5 +
 .../clang/Basic/DiagnosticDriverKinds.td      |   3 +
 clang/lib/Driver/ToolChains/BareMetal.cpp     | 235 +++++++++++++-----
 clang/lib/Driver/ToolChains/BareMetal.h       |  19 +-
 .../aarch64-none-elf/include/c++/8.2.1/.keep  |   0
 .../aarch64-none-elf/lib/.keep                |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../bin/aarch64-none-elf-ld                   |   1 +
 .../lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o |   0
 .../lib/gcc/aarch64-none-elf/8.2.1/crtend.o   |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../aarch64-none-elf/lib/crtbegin.o           |   0
 .../aarch64-none-elf/lib/crtend.o             |   0
 .../bin/aarch64-none-elf-ld                   |   1 +
 .../armv6m-none-eabi/include/c++/8.2.1/.keep  |   0
 .../armv6m-none-eabi/lib/.keep                |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../bin/armv6m-none-eabi-ld                   |   1 +
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o |   0
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtend.o   |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../armv6m-none-eabi/lib/crtbegin.o           |   0
 .../armv6m-none-eabi/lib/crtend.o             |   0
 .../bin/armv6m-none-eabi-ld                   |   1 +
 clang/test/Driver/aarch64-gnutools.c          |   4 +
 clang/test/Driver/aarch64-toolchain-extra.c   |  28 +++
 clang/test/Driver/aarch64-toolchain.c         |  61 +++++
 clang/test/Driver/arm-gnutools.c              |   6 +
 clang/test/Driver/arm-toolchain-extra.c       |  29 +++
 clang/test/Driver/arm-toolchain.c             |  62 +++++
 clang/test/Driver/baremetal.cpp               |  16 ++
 clang/test/Driver/check-no-multlib-warning.c  |  10 +
 32 files changed, 418 insertions(+), 64 deletions(-)
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
 create mode 100755 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
 create mode 100755 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
 create mode 100755 clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
 create mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
 create mode 100755 clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
 create mode 100644 clang/test/Driver/aarch64-gnutools.c
 create mode 100644 clang/test/Driver/aarch64-toolchain-extra.c
 create mode 100644 clang/test/Driver/aarch64-toolchain.c
 create mode 100644 clang/test/Driver/arm-gnutools.c
 create mode 100644 clang/test/Driver/arm-toolchain-extra.c
 create mode 100644 clang/test/Driver/arm-toolchain.c
 create mode 100644 clang/test/Driver/check-no-multlib-warning.c

diff --git a/clang/docs/Toolchain.rst b/clang/docs/Toolchain.rst
index 958199eb7a2e2..d56b21d74c7e3 100644
--- a/clang/docs/Toolchain.rst
+++ b/clang/docs/Toolchain.rst
@@ -347,3 +347,8 @@ workarounds for issues discovered in libstdc++, and these are removed
 as fixed libstdc++ becomes sufficiently old.
 
 You can instruct Clang to use libstdc++ with the ``-stdlib=libstdc++`` flag.
+
+GCC Installation
+=================
+Users can point to their GCC installation by using the ``-gcc-toolchain`` or by
+using ``-gcc-install-dir`` flag.
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 29f6480ba935c..94224e1038758 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -847,6 +847,9 @@ def note_drv_available_multilibs : Note<
   "available multilibs are:%0">;
 def err_drv_multilib_custom_error : Error<
   "multilib configuration error: %0">;
+def warn_drv_multilib_not_available_for_target: Warning<
+  "no multilib structure encoded for Arm, Aarch64 and PPC targets">,
+  InGroup<DiagGroup<"multilib-not-found">>;
 
 def err_drv_experimental_crel : Error<
   "-Wa,--allow-experimental-crel must be specified to use -Wa,--crel. "
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index d8168ed15febd..0fbfe6c77f342 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -31,6 +31,40 @@ using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
 
+/// Is the triple {aarch64.aarch64_be}-none-elf?
+static bool isAArch64BareMetal(const llvm::Triple &Triple) {
+  if (Triple.getArch() != llvm::Triple::aarch64 &&
+      Triple.getArch() != llvm::Triple::aarch64_be)
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+static bool isRISCVBareMetal(const llvm::Triple &Triple) {
+  if (!Triple.isRISCV())
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
+
+/// Is the triple powerpc[64][le]-*-none-eabi?
+static bool isPPCBareMetal(const llvm::Triple &Triple) {
+  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
+         Triple.getEnvironment() == llvm::Triple::EABI;
+}
+
 static bool findRISCVMultilibs(const Driver &D,
                                const llvm::Triple &TargetTriple,
                                const ArgList &Args, DetectedMultilibs &Result) {
@@ -95,7 +129,8 @@ static bool findRISCVMultilibs(const Driver &D,
   return false;
 }
 
-static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
+static std::string computeClangRuntimesSysRoot(const Driver &D,
+                                               bool IncludeTriple) {
   if (!D.SysRoot.empty())
     return D.SysRoot;
 
@@ -108,56 +143,123 @@ static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
   return std::string(SysRootDir);
 }
 
-BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple,
-                     const ArgList &Args)
-    : ToolChain(D, Triple, Args),
-      SysRoot(computeBaseSysRoot(D, /*IncludeTriple=*/true)) {
-  getProgramPaths().push_back(getDriver().Dir);
-
-  findMultilibs(D, Triple, Args);
-  SmallString<128> SysRoot(computeSysRoot());
-  if (!SysRoot.empty()) {
-    for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRoot);
-      llvm::sys::path::append(Dir, M.osSuffix(), "lib");
-      getFilePaths().push_back(std::string(Dir));
-      getLibraryPaths().push_back(std::string(Dir));
-    }
+// Only consider the GCC toolchain based on the values provided through the
+// `--gcc-toolchain` and `--gcc-install-dir` flags. The function below returns
+// whether the GCC toolchain was initialized successfully.
+bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
+                                    const llvm::opt::ArgList &Args) {
+  if (Args.getLastArg(options::OPT_gcc_toolchain) ||
+      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
+    GCCInstallation.init(Triple, Args);
+    return GCCInstallation.isValid();
   }
+  return false;
 }
 
-/// Is the triple {aarch64.aarch64_be}-none-elf?
-static bool isAArch64BareMetal(const llvm::Triple &Triple) {
-  if (Triple.getArch() != llvm::Triple::aarch64 &&
-      Triple.getArch() != llvm::Triple::aarch64_be)
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
+// This logic is adapted from RISCVToolChain.cpp as part of the ongoing effort
+// to merge RISCVToolChain into the Baremetal toolchain. It infers the presence
+// of a valid GCC toolchain by checking whether the `crt0.o` file exists in the
+// `bin/../<target-triple>/lib` directory.
+static bool detectGCCToolchainAdjacent(const Driver &D) {
+  SmallString<128> GCCDir;
+  llvm::sys::path::append(GCCDir, D.Dir, "..", D.getTargetTriple(),
+                          "lib/crt0.o");
+  return llvm::sys::fs::exists(GCCDir);
 }
 
-static bool isRISCVBareMetal(const llvm::Triple &Triple) {
-  if (!Triple.isRISCV())
-    return false;
+// If no sysroot is provided the driver will first attempt to infer it from the
+// values of `--gcc-install-dir` or `--gcc-toolchain`, which specify the
+// location of a GCC toolchain.
+// If neither flag is used, the sysroot defaults to either:
+//    - `bin/../<target-triple>`
+//    - `bin/../lib/clang-runtimes/<target-triple>`
+//
+// To use the `clang-runtimes` path, ensure that `../<target-triple>/lib/crt0.o`
+// does not exist relative to the driver.
+std::string BareMetal::computeSysRoot() const {
+  // Use Baremetal::sysroot if it has already been set.
+  if (!SysRoot.empty())
+    return SysRoot;
+
+  // Use the sysroot specified via the `--sysroot` command-line flag, if
+  // provided.
+  const Driver &D = getDriver();
+  if (!D.SysRoot.empty())
+    return D.SysRoot;
 
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
+  // Attempt to infer sysroot from a valid GCC installation.
+  // If no valid GCC installation, check for a GCC toolchain alongside Clang.
+  SmallString<128> inferredSysRoot;
+  if (IsGCCInstallationValid) {
+    llvm::sys::path::append(inferredSysRoot, GCCInstallation.getParentLibPath(),
+                            "..", GCCInstallation.getTriple().str());
+  } else if (detectGCCToolchainAdjacent(D)) {
+    // Use the triple as provided to the driver. Unlike the parsed triple
+    // this has not been normalized to always contain every field.
+    llvm::sys::path::append(inferredSysRoot, D.Dir, "..", D.getTargetTriple());
+  }
+  // If a valid sysroot was inferred and exists, use it
+  if (!inferredSysRoot.empty() && llvm::sys::fs::exists(inferredSysRoot))
+    return std::string(inferredSysRoot);
 
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
+  // Use the clang-runtimes path.
+  return computeClangRuntimesSysRoot(D, /*IncludeTriple*/ true);
+}
 
-  return Triple.getEnvironmentName() == "elf";
+static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs,
+                                  const Multilib &Multilib,
+                                  StringRef InstallPath,
+                                  ToolChain::path_list &Paths) {
+  if (const auto &PathsCallback = Multilibs.filePathsCallback())
+    for (const auto &Path : PathsCallback(Multilib))
+      addPathIfExists(D, InstallPath + Path, Paths);
 }
 
-/// Is the triple powerpc[64][le]-*-none-eabi?
-static bool isPPCBareMetal(const llvm::Triple &Triple) {
-  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
-         Triple.getEnvironment() == llvm::Triple::EABI;
+// GCC mutltilibs will only work for those targets that have their multlib
+// structure encoded into GCCInstallation. Baremetal toolchain supports ARM,
+// AArch64, RISCV and PPC and of these only RISCV have GCC multilibs hardcoded
+// in GCCInstallation.
+BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple,
+                     const ArgList &Args)
+    : Generic_ELF(D, Triple, Args) {
+  IsGCCInstallationValid = initGCCInstallation(Triple, Args);
+  std::string ComputedSysRoot = computeSysRoot();
+  if (IsGCCInstallationValid) {
+    if (!isRISCVBareMetal(Triple))
+      D.Diag(clang::diag::warn_drv_multilib_not_available_for_target);
+
+    Multilibs = GCCInstallation.getMultilibs();
+    SelectedMultilibs.assign({GCCInstallation.getMultilib()});
+
+    path_list &Paths = getFilePaths();
+    // Add toolchain/multilib specific file paths.
+    addMultilibsFilePaths(D, Multilibs, SelectedMultilibs.back(),
+                          GCCInstallation.getInstallPath(), Paths);
+    // Adding filepath for locating crt{begin,end}.o files.
+    Paths.push_back(GCCInstallation.getInstallPath().str());
+    // Adding filepath for locating crt0.o file.
+    Paths.push_back(ComputedSysRoot + "/lib");
+
+    ToolChain::path_list &PPaths = getProgramPaths();
+    // Multilib cross-compiler GCC installations put ld in a triple-prefixed
+    // directory off of the parent of the GCC installation.
+    PPaths.push_back(Twine(GCCInstallation.getParentLibPath() + "/../" +
+                           GCCInstallation.getTriple().str() + "/bin")
+                         .str());
+    PPaths.push_back((GCCInstallation.getParentLibPath() + "/../bin").str());
+  } else {
+    getProgramPaths().push_back(getDriver().Dir);
+    findMultilibs(D, Triple, Args);
+    const SmallString<128> SysRootDir(computeSysRoot());
+    if (!SysRootDir.empty()) {
+      for (const Multilib &M : getOrderedMultilibs()) {
+        SmallString<128> Dir(SysRootDir);
+        llvm::sys::path::append(Dir, M.osSuffix(), "lib");
+        getFilePaths().push_back(std::string(Dir));
+        getLibraryPaths().push_back(std::string(Dir));
+      }
+    }
+  }
 }
 
 static void
@@ -216,7 +318,7 @@ getMultilibConfigPath(const Driver &D, const llvm::Triple &Triple,
       return {};
     }
   } else {
-    MultilibPath = computeBaseSysRoot(D, /*IncludeTriple=*/false);
+    MultilibPath = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
     llvm::sys::path::append(MultilibPath, MultilibFilename);
   }
   return MultilibPath;
@@ -234,7 +336,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
   if (D.getVFS().exists(*MultilibPath)) {
     // If multilib.yaml is found, update sysroot so it doesn't use a target
     // specific suffix
-    SysRoot = computeBaseSysRoot(D, /*IncludeTriple=*/false);
+    SysRoot = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
     SmallVector<StringRef> CustomFlagMacroDefines;
     findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result,
                           CustomFlagMacroDefines);
@@ -242,7 +344,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
     Multilibs = Result.Multilibs;
     MultilibMacroDefines.append(CustomFlagMacroDefines.begin(),
                                 CustomFlagMacroDefines.end());
-  } else if (isRISCVBareMetal(Triple)) {
+  } else if (isRISCVBareMetal(Triple) && !detectGCCToolchainAdjacent(D)) {
     if (findRISCVMultilibs(D, Triple, Args, Result)) {
       SelectedMultilibs = Result.SelectedMultilibs;
       Multilibs = Result.Multilibs;
@@ -263,8 +365,6 @@ Tool *BareMetal::buildStaticLibTool() const {
   return new tools::baremetal::StaticLibTool(*this);
 }
 
-std::string BareMetal::computeSysRoot() const { return SysRoot; }
-
 BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const {
   // Get multilibs in reverse order because they're ordered most-specific last.
   if (!SelectedMultilibs.empty())
@@ -292,10 +392,10 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   if (std::optional<std::string> Path = getStdlibIncludePath())
     addSystemInclude(DriverArgs, CC1Args, *Path);
 
-  const SmallString<128> SysRoot(computeSysRoot());
-  if (!SysRoot.empty()) {
+  const SmallString<128> SysRootDir(computeSysRoot());
+  if (!SysRootDir.empty()) {
     for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRoot);
+      SmallString<128> Dir(SysRootDir);
       llvm::sys::path::append(Dir, M.includeSuffix());
       llvm::sys::path::append(Dir, "include");
       addSystemInclude(DriverArgs, CC1Args, Dir.str());
@@ -309,6 +409,19 @@ void BareMetal::addClangTargetOptions(const ArgList &DriverArgs,
   CC1Args.push_back("-nostdsysteminc");
 }
 
+void BareMetal::addLibStdCxxIncludePaths(
+    const llvm::opt::ArgList &DriverArgs,
+    llvm::opt::ArgStringList &CC1Args) const {
+  if (!IsGCCInstallationValid)
+    return;
+  const GCCVersion &Version = GCCInstallation.getVersion();
+  StringRef TripleStr = GCCInstallation.getTriple().str();
+  const Multilib &Multilib = GCCInstallation.getMultilib();
+  addLibStdCXXIncludePaths(computeSysRoot() + "/include/c++/" + Version.Text,
+                           TripleStr, Multilib.includeSuffix(), DriverArgs,
+                           CC1Args);
+}
+
 void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                              ArgStringList &CC1Args) const {
   if (DriverArgs.hasArg(options::OPT_nostdinc, options::OPT_nostdlibinc,
@@ -339,23 +452,23 @@ void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
   };
 
   switch (GetCXXStdlibType(DriverArgs)) {
-    case ToolChain::CST_Libcxx: {
-      SmallString<128> P(D.Dir);
-      llvm::sys::path::append(P, "..", "include");
-      AddCXXIncludePath(P);
-      break;
-    }
-    case ToolChain::CST_Libstdcxx:
-      // We only support libc++ toolchain installation.
-      break;
+  case ToolChain::CST_Libcxx: {
+    SmallString<128> P(D.Dir);
+    llvm::sys::path::append(P, "..", "include");
+    AddCXXIncludePath(P);
+    break;
+  }
+  case ToolChain::CST_Libstdcxx:
+    addLibStdCxxIncludePaths(DriverArgs, CC1Args);
+    break;
   }
 
-  std::string SysRoot(computeSysRoot());
-  if (SysRoot.empty())
+  std::string SysRootDir(computeSysRoot());
+  if (SysRootDir.empty())
     return;
 
   for (const Multilib &M : getOrderedMultilibs()) {
-    SmallString<128> Dir(SysRoot);
+    SmallString<128> Dir(SysRootDir);
     llvm::sys::path::append(Dir, M.gccSuffix());
     switch (GetCXXStdlibType(DriverArgs)) {
     case ToolChain::CST_Libcxx: {
diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h
index f6295bda0a6a2..930f8584e6435 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.h
+++ b/clang/lib/Driver/ToolChains/BareMetal.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 
+#include "ToolChains/Gnu.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 
@@ -19,7 +20,7 @@ namespace driver {
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
+class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
 public:
   BareMetal(const Driver &D, const llvm::Triple &Triple,
             const llvm::opt::ArgList &Args);
@@ -35,7 +36,8 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
   Tool *buildStaticLibTool() const override;
 
 public:
-  bool useIntegratedAs() const override { return true; }
+  bool initGCCInstallation(const llvm::Triple &Triple,
+                           const llvm::opt::ArgList &Args);
   bool isBareMetal() const override { return true; }
   bool isCrossCompiling() const override { return true; }
   bool HasNativeLLVMSupport() const override { return true; }
@@ -48,9 +50,15 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
 
   StringRef getOSLibName() const override { return "baremetal"; }
 
+  UnwindTableLevel
+  getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override {
+    return UnwindTableLevel::None;
+  }
+
   RuntimeLibType GetDefaultRuntimeLibType() const override {
     return ToolChain::RLT_CompilerRT;
   }
+
   CXXStdlibType GetDefaultCXXStdlibType() const override {
     return ToolChain::CST_Libcxx;
   }
@@ -67,6 +75,9 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
+  void
+  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
+                           llvm::opt::ArgStringList &CC1Args) const override;
   std::string computeSysRoot() const override;
   SanitizerMask getSupportedSanitizers() const override;
 
@@ -80,6 +91,8 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
 
   std::string SysRoot;
 
+  bool IsGCCInstallationValid;
+
   SmallVector<std::string> MultilibMacroDefines;
 };
 
@@ -104,7 +117,7 @@ class LLVM_LIBRARY_VISIBILITY StaticLibTool : public Tool {
 
 class LLVM_LIBRARY_VISIBILITY Linker final : public Tool {
 public:
-  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "ld.lld", TC) {}
+  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "linker", TC) {}
   bool isLinkJob() const override { return true; }
   bool hasIntegratedCPP() const override { return false; }
   void ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
new file mode 100755
index 0000000000000..b23e55619b2ff
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
new file mode 100755
index 0000000000000..b23e55619b2ff
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
new file mode 100755
index 0000000000000..b23e55619b2ff
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
new file mode 100755
index 0000000000000..b23e55619b2ff
--- /dev/null
+++ b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
@@ -0,0 +1 @@
+#!/bin/true
diff --git a/clang/test/Driver/aarch64-gnutools.c b/clang/test/Driver/aarch64-gnutools.c
new file mode 100644
index 0000000000000..0214639ed3804
--- /dev/null
+++ b/clang/test/Driver/aarch64-gnutools.c
@@ -0,0 +1,4 @@
+// RUN: %clang --target=aarch64-none-elf  --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -fno-integrated-as %s -### -c \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
new file mode 100644
index 0000000000000..2610e962bd690
--- /dev/null
+++ b/clang/test/Driver/aarch64-toolchain-extra.c
@@ -0,0 +1,28 @@
+// A basic clang -cc1 command-line, and simple environment check.
+
+// The tests here are similar to those in aarch64-toolchain.c, however
+// these tests need to create symlinks to test directory trees in order to
+// set up the environment and therefore shell support is required.
+// REQUIRES: shell
+// UNSUPPORTED: system-windows
+
+// If there is no GCC install detected then the driver searches for executables
+// and runtime starting from the directory tree above the driver itself.
+// The test below checks that the driver correctly finds the linker and
+// runtime if and only if they exist.
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/aarch64-nogcc/bin
+// RUN: ln -s %clang %t/aarch64-nogcc/bin/clang
+// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf
+// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --gcc-toolchain=%t/aarch64-nogcc/invalid \
+// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \
+// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
new file mode 100644
index 0000000000000..7f2c01d928e43
--- /dev/null
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -0,0 +1,61 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL %s
+
+// C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
+// C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
+// C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOSYSROOT %s
+
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL %s
+
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
+// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT %s
+
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-LIBCXX %s
+
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX %s
+
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
diff --git a/clang/test/Driver/arm-gnutools.c b/clang/test/Driver/arm-gnutools.c
new file mode 100644
index 0000000000000..6e107f19dabc5
--- /dev/null
+++ b/clang/test/Driver/arm-gnutools.c
@@ -0,0 +1,6 @@
+// check that gnu assembler is invoked with arm baremetal as well
+
+// RUN: %clang --target=armv6m-none-eabi  --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -fno-integrated-as %s -### -c \
+// RUN: 2>&1 | FileCheck %s
+
+// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
new file mode 100644
index 0000000000000..114de0a8154ab
--- /dev/null
+++ b/clang/test/Driver/arm-toolchain-extra.c
@@ -0,0 +1,29 @@
+// A basic clang -cc1 command-line, and simple environment check.
+
+// The tests here are similar to those in arm-toolchain.c, however
+// these tests need to create symlinks to test directory trees in order to
+// set up the environment and therefore shell support is required.
+// REQUIRES: shell
+// UNSUPPORTED: system-windows
+
+// If there is no GCC install detected then the driver searches for executables
+// and runtime starting from the directory tree above the driver itself.
+// The test below checks that the driver correctly finds the linker and
+// runtime if and only if they exist.
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/arm-nogcc/bin
+// RUN: ln -s %clang %t/arm-nogcc/bin/clang
+// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi
+// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --gcc-toolchain=%t/arm-nogcc/invalid \
+// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
+// RUN:    --sysroot=%t/arm-nogcc/bin/../armv6m-none-eabi \
+// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
+// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+
+// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include"
+
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
new file mode 100644
index 0000000000000..2e38461fb7a3e
--- /dev/null
+++ b/clang/test/Driver/arm-toolchain.c
@@ -0,0 +1,62 @@
+// UNSUPPORTED: system-windows
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL %s
+
+// C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
+// C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL-NOSYSROOT %s
+
+// C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
+// C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL %s
+
+// CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward"
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1" 
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT %s
+
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-LIBCXX %s
+
+// CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
+// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+
+// RUN: %clangxx -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=  2>&1 \
+// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s
+
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index a80aa9b437117..2ac83402dda30 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -196,6 +196,22 @@
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 
+// RUN: %clang -no-canonical-prefixes %s -### --target=riscv32-unknown-elf 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-RISCV32-NO-HOST-INC %s
+// CHECK-RISCV32-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
+// CHECK-RISCV32-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
+// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
+
+// RUN: %clang -no-canonical-prefixes %s -### --target=riscv64-unknown-elf 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-RISCV64-NO-HOST-INC %s
+// CHECK-RISCV64-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
+// CHECK-RISCV64-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
+// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
+
 // RUN: %clang %s -### --target=riscv64-unknown-elf -o %t.out -L some/directory/user/asked/for \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64 %s
diff --git a/clang/test/Driver/check-no-multlib-warning.c b/clang/test/Driver/check-no-multlib-warning.c
new file mode 100644
index 0000000000000..9a0d7cee450a3
--- /dev/null
+++ b/clang/test/Driver/check-no-multlib-warning.c
@@ -0,0 +1,10 @@
+// UNSUPPORTED: system-windows
+
+
+// RUN: %clang --target=armv6m-none-eabi --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64-none-elf --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv32_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
+// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv64_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
+
+// CHECK: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets
+// NOCHECK-NOT: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets

From e6a41399cb8796e5d18940d49b0151704568321a Mon Sep 17 00:00:00 2001
From: sribee8 <sriya.pratipati@gmail.com>
Date: Tue, 17 Jun 2025 09:24:01 -0700
Subject: [PATCH 758/851] Reland "[libc] utf8 to 32 CharacterConverter"
 (#144450)

Reverts llvm/llvm-project#144446
Figured out the issue, so creating a new pull request.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 .../__support/wchar/character_converter.cpp   |  56 +++++
 libc/test/src/__support/CMakeLists.txt        |   5 +-
 libc/test/src/__support/wchar/CMakeLists.txt  |  10 +
 .../src/__support/wchar/utf8_to_32_test.cpp   | 196 ++++++++++++++++++
 4 files changed, 264 insertions(+), 3 deletions(-)
 create mode 100644 libc/test/src/__support/wchar/utf8_to_32_test.cpp

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index ca709769616c3..3b9046dfb9a76 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -8,6 +8,7 @@
 
 #include "hdr/types/char32_t.h"
 #include "hdr/types/char8_t.h"
+#include "src/__support/CPP/bit.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/math_extras.h"
@@ -30,6 +31,50 @@ bool CharacterConverter::isComplete() {
   return state->bytes_processed == state->total_bytes;
 }
 
+int CharacterConverter::push(char8_t utf8_byte) {
+  uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
+  // Checking the first byte if first push
+  if (state->bytes_processed == 0) {
+    // UTF-8 char has 1 byte total
+    if (num_ones == 0) {
+      state->total_bytes = 1;
+    }
+    // UTF-8 char has 2 through 4 bytes total
+    else if (num_ones >= 2 && num_ones <= 4) {
+      /* Since the format is 110xxxxx, 1110xxxx, and 11110xxx for 2, 3, and 4,
+      we will make the base mask with 7 ones and right shift it as necessary. */
+      constexpr size_t SIGNIFICANT_BITS = 7;
+      char8_t base_mask =
+          static_cast<char8_t>(mask_trailing_ones<uint8_t, SIGNIFICANT_BITS>());
+      state->total_bytes = num_ones;
+      utf8_byte &= (base_mask >> num_ones);
+    }
+    // Invalid first byte
+    else {
+      // bytes_processed and total_bytes will always be 0 here
+      state->partial = static_cast<char32_t>(0);
+      return -1;
+    }
+    state->partial = static_cast<char32_t>(utf8_byte);
+    state->bytes_processed++;
+    return 0;
+  }
+  // Any subsequent push
+  // Adding 6 more bits so need to left shift
+  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+  if (num_ones == 1 && !isComplete()) {
+    char32_t byte =
+        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+    state->partial = state->partial << ENCODED_BITS_PER_UTF8;
+    state->partial |= byte;
+    state->bytes_processed++;
+    return 0;
+  }
+  // Invalid byte -> reset the state
+  clear();
+  return -1;
+}
+
 int CharacterConverter::push(char32_t utf32) {
   // we can't be partially through a conversion when pushing a utf32 value
   if (!isComplete())
@@ -54,6 +99,17 @@ int CharacterConverter::push(char32_t utf32) {
   return -1;
 }
 
+ErrorOr<char32_t> CharacterConverter::pop_utf32() {
+  // If pop is called too early, do not reset the state, use error to determine
+  // whether enough bytes have been pushed
+  if (!isComplete() || state->bytes_processed == 0)
+    return Error(-1);
+  char32_t utf32 = state->partial;
+  // reset if successful pop
+  clear();
+  return utf32;
+}
+
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   if (isComplete())
     return Error(-1);
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 76218a16e0cf7..9f626ed31cc07 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,9 +275,8 @@ add_subdirectory(fixed_point)
 add_subdirectory(HashTable)
 add_subdirectory(time)
 add_subdirectory(threads)
-
-# Requires access to uchar header which is not on macos
-# Therefore, cannot currently build this on macos in overlay mode
+# Requires access to uchar header which is not on MacOS
+# Cannot currently build this on MacOS in overlay mode
 if(NOT(LIBC_TARGET_OS_IS_DARWIN))
   add_subdirectory(wchar)
 endif()
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
index 5dff6e9115f7d..5176bfd4b024b 100644
--- a/libc/test/src/__support/wchar/CMakeLists.txt
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -1,5 +1,15 @@
 add_custom_target(libc-support-wchar-tests)
 
+add_libc_test(
+  utf8_to_32_test 
+  SUITE
+    libc-support-tests
+  SRCS
+    utf8_to_32_test.cpp 
+  DEPENDS
+    libc.src.__support.wchar.character_converter
+)
+
 add_libc_test(
   utf32_to_8_test
   SUITE
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 0000000000000..9cb059faa9374
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,196 @@
+//===-- Unittests for character_converter utf8->utf32 ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/error_or.h"
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  char ch = 'A';
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_EQ(err, 0);
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[2] = {static_cast<char>(0xC2),
+                      static_cast<char>(0x8E)}; //  car symbol
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
+                      static_cast<char>(0x91)}; // ∑ sigma symbol
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
+                      static_cast<char>(0xA4),
+                      static_cast<char>(0xA1)}; // 🤡 clown emoji
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  char_conv.push(static_cast<char8_t>(ch[0]));
+  char_conv.push(static_cast<char8_t>(ch[1]));
+  char_conv.push(static_cast<char8_t>(ch[2]));
+  char_conv.push(static_cast<char8_t>(ch[3]));
+  auto wch = char_conv.pop_utf32();
+
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch));
+
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {
+      static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
+      static_cast<char>(0x00)}; // first and third bytes are invalid
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Prev byte was single byte so trying to push another should error.
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  // Last byte is invalid since it does not have correct starting sequence.
+  // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
+  const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0x80),
+                      static_cast<char>(0x80), static_cast<char>(0xC0)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0x80)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  // Should produce an error on 3rd byte
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, -1);
+
+  // Should produce an error since mbstate was reset
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(wch.has_value());
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
+                      static_cast<char>(0xC7), static_cast<char>(0x8C)};
+
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+
+  // Second two byte character
+  err = char_conv.push(static_cast<char8_t>(ch[2]));
+  ASSERT_EQ(err, 0);
+  err = char_conv.push(static_cast<char8_t>(ch[3]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 460);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
+  LIBC_NAMESPACE::internal::mbstate state;
+  state.bytes_processed = 0;
+  state.total_bytes = 0;
+  LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+  const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
+  int err = char_conv.push(static_cast<char8_t>(ch[0]));
+  ASSERT_EQ(err, 0);
+  auto wch = char_conv.pop_utf32();
+  ASSERT_FALSE(
+      wch.has_value()); // Should fail since we have not read enough bytes
+  err = char_conv.push(static_cast<char8_t>(ch[1]));
+  ASSERT_EQ(err, 0);
+  wch = char_conv.pop_utf32();
+  ASSERT_TRUE(wch.has_value());
+  ASSERT_EQ(static_cast<int>(wch.value()), 142);
+}

From 65d590e8d012df9dabbf8b3ec929fd1543c7398a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 17:25:09 +0100
Subject: [PATCH 759/851] [X86] combineLogicBlendIntoConditionalNegate -
 convert to SDPatternMatch matching. NFC. (#144536)

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 12fcc614ab254..4cff42c2ac464 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47591,27 +47591,19 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
 static SDValue combineLogicBlendIntoConditionalNegate(
     EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
     SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+  using namespace SDPatternMatch;
   EVT MaskVT = Mask.getValueType();
   assert(MaskVT.isInteger() &&
          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
          "Mask must be zero/all-bits");
 
-  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
-    return SDValue();
-  if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
+  if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT ||
+      !DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
     return SDValue();
 
-  auto IsNegV = [](SDNode *N, SDValue V) {
-    return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
-           ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
-  };
-
   SDValue V;
-  if (IsNegV(Y.getNode(), X))
-    V = X;
-  else if (IsNegV(X.getNode(), Y))
-    V = Y;
-  else
+  if (!sd_match(Y, m_Neg(m_AllOf(m_Specific(X), m_Value(V)))) &&
+      !sd_match(X, m_Neg(m_AllOf(m_Specific(Y), m_Value(V)))))
     return SDValue();
 
   SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);

From c66be289901b3f035187d391e80e3610d7d6232e Mon Sep 17 00:00:00 2001
From: Timm Baeder <tbaeder@redhat.com>
Date: Tue, 17 Jun 2025 18:31:06 +0200
Subject: [PATCH 760/851] [clang][bytecode] Allocate IntegralAP and Floating
 types using an allocator (#144246)

Both `APInt` and `APFloat` will heap-allocate memory themselves using
the system allocator when the size of their data exceeds 64 bits.

This is why clang has `APNumericStorage`, which allocates its memory
using an allocator (via `ASTContext`) instead. Calling `getValue()` on
an ast node like that will then create a new `APInt`/`APFloat` , which
will copy the data (in the `APFloat` case, we even copy it twice).
That's sad but whatever.

In the bytecode interpreter, we have a similar problem. Large integers
and floating-point values are placement-new allocated into the
`InterpStack` (or into the bytecode, which is a `vector<std::byte>`).
When we then later interrupt interpretation, we don't run the destructor
for all items on the stack, which means we leak the memory the
`APInt`/`APFloat` (which backs the `IntegralAP`/`Floating` the
interpreter uses).

Fix this by using an approach similar to the one used in the AST. Add an
allocator to `InterpState`, which is used for temporaries and local
values. Those values will be freed at the end of interpretation. For
global variables, we need to promote the values to global lifetime,
which we do via `InitGlobal` and `FinishInitGlobal` ops.

Interestingly, this results in a slight _improvement_ in compile times:
https://llvm-compile-time-tracker.com/compare.php?from=6bfcdda9b1ddf0900f82f7e30cb5e3253a791d50&to=88d1d899127b408f0fb0f385c2c58e6283195049&stat=instructions:u
(but don't ask me why).

Fixes https://github.com/llvm/llvm-project/issues/139012
---
 clang/lib/AST/ByteCode/Compiler.cpp           | 112 +++---
 clang/lib/AST/ByteCode/Compiler.h             |   1 +
 clang/lib/AST/ByteCode/Descriptor.cpp         |   2 +-
 clang/lib/AST/ByteCode/Disasm.cpp             |  60 +++-
 clang/lib/AST/ByteCode/Floating.h             | 252 ++++++++-----
 clang/lib/AST/ByteCode/Integral.h             |   3 +
 clang/lib/AST/ByteCode/IntegralAP.h           | 231 +++++++-----
 clang/lib/AST/ByteCode/Interp.cpp             | 106 +++++-
 clang/lib/AST/ByteCode/Interp.h               | 337 ++++++++++++++----
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  55 ++-
 .../lib/AST/ByteCode/InterpBuiltinBitCast.cpp |   4 +-
 clang/lib/AST/ByteCode/InterpState.h          |  30 ++
 clang/lib/AST/ByteCode/Opcodes.td             |  14 +-
 clang/lib/AST/ByteCode/PrimType.h             |  17 +
 clang/lib/AST/ByteCode/Program.h              |  24 +-
 .../ByteCode/builtin-bit-cast-long-double.cpp |  10 +-
 clang/test/AST/ByteCode/builtin-functions.cpp |  12 +-
 17 files changed, 929 insertions(+), 341 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 9fe4803ce98ec..3f884ed8d094a 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -748,7 +748,8 @@ bool Compiler<Emitter>::VisitFloatingLiteral(const FloatingLiteral *E) {
   if (DiscardResult)
     return true;
 
-  return this->emitConstFloat(E->getValue(), E);
+  APFloat F = E->getValue();
+  return this->emitFloat(F, E);
 }
 
 template <class Emitter>
@@ -4185,8 +4186,10 @@ bool Compiler<Emitter>::visitZeroInitializer(PrimType T, QualType QT,
                              nullptr, E);
   case PT_MemberPtr:
     return this->emitNullMemberPtr(0, nullptr, E);
-  case PT_Float:
-    return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E);
+  case PT_Float: {
+    APFloat F = APFloat::getZero(Ctx.getFloatSemantics(QT));
+    return this->emitFloat(F, E);
+  }
   case PT_FixedPoint: {
     auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType());
     return this->emitConstFixedPoint(FixedPoint::zero(Sem), E);
@@ -4674,10 +4677,7 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       if (!visitInitializer(Init))
         return false;
 
-      if (!this->emitFinishInit(Init))
-        return false;
-
-      return this->emitPopPtr(Init);
+      return this->emitFinishInitGlobal(Init);
     };
 
     DeclScope<Emitter> LocalScope(this, VD);
@@ -4698,51 +4698,45 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       return false;
 
     return !Init || (checkDecl() && initGlobal(*GlobalIndex));
-  } else {
-    InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
-
-    if (VarT) {
-      unsigned Offset = this->allocateLocalPrimitive(
-          VD, *VarT, VD->getType().isConstQualified(), nullptr,
-          ScopeKind::Block, IsConstexprUnknown);
-      if (Init) {
-        // If this is a toplevel declaration, create a scope for the
-        // initializer.
-        if (Toplevel) {
-          LocalScope<Emitter> Scope(this);
-          if (!this->visit(Init))
-            return false;
-          return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
-        } else {
-          if (!this->visit(Init))
-            return false;
-          return this->emitSetLocal(*VarT, Offset, VD);
-        }
-      }
-    } else {
-      if (std::optional<unsigned> Offset =
-              this->allocateLocal(VD, VD->getType(), nullptr, ScopeKind::Block,
-                                  IsConstexprUnknown)) {
-        if (!Init)
-          return true;
+  }
+  // Local variables.
+  InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
 
-        if (!this->emitGetPtrLocal(*Offset, Init))
+  if (VarT) {
+    unsigned Offset = this->allocateLocalPrimitive(
+        VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block,
+        IsConstexprUnknown);
+    if (Init) {
+      // If this is a toplevel declaration, create a scope for the
+      // initializer.
+      if (Toplevel) {
+        LocalScope<Emitter> Scope(this);
+        if (!this->visit(Init))
           return false;
-
-        if (!visitInitializer(Init))
+        return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
+      } else {
+        if (!this->visit(Init))
           return false;
+        return this->emitSetLocal(*VarT, Offset, VD);
+      }
+    }
+  } else {
+    if (std::optional<unsigned> Offset = this->allocateLocal(
+            VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) {
+      if (!Init)
+        return true;
 
-        if (!this->emitFinishInit(Init))
-          return false;
+      if (!this->emitGetPtrLocal(*Offset, Init))
+        return false;
 
-        return this->emitPopPtr(Init);
-      }
-      return false;
+      if (!visitInitializer(Init))
+        return false;
+
+      return this->emitFinishInitPop(Init);
     }
-    return true;
+    return false;
   }
-
-  return false;
+  return true;
 }
 
 template <class Emitter>
@@ -4751,8 +4745,10 @@ bool Compiler<Emitter>::visitAPValue(const APValue &Val, PrimType ValType,
   assert(!DiscardResult);
   if (Val.isInt())
     return this->emitConst(Val.getInt(), ValType, E);
-  else if (Val.isFloat())
-    return this->emitConstFloat(Val.getFloat(), E);
+  else if (Val.isFloat()) {
+    APFloat F = Val.getFloat();
+    return this->emitFloat(F, E);
+  }
 
   if (Val.isLValue()) {
     if (Val.isNullPointer())
@@ -6133,8 +6129,10 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
+      APFloat F(TargetSemantics, 1);
+      if (!this->emitFloat(F, E))
         return false;
+
       if (!this->emitAddf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6176,8 +6174,10 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
+      APFloat F(TargetSemantics, 1);
+      if (!this->emitFloat(F, E))
         return false;
+
       if (!this->emitSubf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6953,6 +6953,20 @@ bool Compiler<Emitter>::emitDummyPtr(const DeclTy &D, const Expr *E) {
   return true;
 }
 
+template <class Emitter>
+bool Compiler<Emitter>::emitFloat(const APFloat &F, const Expr *E) {
+  assert(!DiscardResult && "Should've been checked before");
+
+  if (Floating::singleWord(F.getSemantics()))
+    return this->emitConstFloat(Floating(F), E);
+
+  APInt I = F.bitcastToAPInt();
+  return this->emitConstFloat(
+      Floating(const_cast<uint64_t *>(I.getRawData()),
+               llvm::APFloatBase::SemanticsToEnum(F.getSemantics())),
+      E);
+}
+
 //  This function is constexpr if and only if To, From, and the types of
 //  all subobjects of To and From are types T such that...
 //  (3.1) - is_union_v<T> is false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index ac3ad84766dc6..a1d068cc7e0ae 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -391,6 +391,7 @@ class Compiler : public ConstStmtVisitor<Compiler<Emitter>, bool>,
   bool emitRecordDestruction(const Record *R, SourceInfo Loc);
   bool emitDestruction(const Descriptor *Desc, SourceInfo Loc);
   bool emitDummyPtr(const DeclTy &D, const Expr *E);
+  bool emitFloat(const APFloat &F, const Expr *E);
   unsigned collectBaseOffset(const QualType BaseType,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 5531295dfa2f8..46e4d0d940b3e 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -368,7 +368,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsTemporary, bool IsConst, UnknownSize)
     : Source(D), ElemSize(primSize(Type)), Size(UnknownSizeMark),
       MDSize(MD.value_or(0)),
-      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)),
+      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)), PrimT(Type),
       IsConst(IsConst), IsMutable(false), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 846dc2fe92a70..7c6b78386b14f 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -50,34 +50,56 @@ inline static std::string printArg(Program &P, CodePtr &OpPC) {
 }
 
 template <> inline std::string printArg<Floating>(Program &P, CodePtr &OpPC) {
-  auto F = Floating::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  auto Sem = Floating::deserializeSemantics(*OpPC);
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  unsigned BitWidth = llvm::APFloatBase::semanticsSizeInBits(
+      llvm::APFloatBase::EnumToSemantics(Sem));
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+  Floating Result(Memory.get(), Sem);
+  Floating::deserialize(*OpPC, &Result);
+
+  OpPC += align(Result.bytesToSerialize());
+
+  std::string S;
+  llvm::raw_string_ostream SS(S);
+  SS << Result;
+  return S;
 }
 
 template <>
 inline std::string printArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
-  auto F = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
-
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  using T = IntegralAP<false>;
+  unsigned BitWidth = T::deserializeSize(*OpPC);
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
+
+  T Result(Memory.get(), BitWidth);
+  T::deserialize(*OpPC, &Result);
+
+  OpPC += Result.bytesToSerialize();
+  std::string Str;
+  llvm::raw_string_ostream SS(Str);
+  SS << Result;
+  return Str;
 }
+
 template <>
 inline std::string printArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
-  auto F = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(F.bytesToSerialize());
+  using T = IntegralAP<true>;
+  unsigned BitWidth = T::deserializeSize(*OpPC);
+  auto Memory =
+      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
 
-  std::string Result;
-  llvm::raw_string_ostream SS(Result);
-  SS << F;
-  return Result;
+  T Result(Memory.get(), BitWidth);
+  T::deserialize(*OpPC, &Result);
+
+  std::string Str;
+  llvm::raw_string_ostream SS(Str);
+  SS << Result;
+
+  OpPC += Result.bytesToSerialize();
+  return Str;
 }
 
 template <> inline std::string printArg<FixedPoint>(Program &P, CodePtr &OpPC) {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 3750568fc23c7..659892e720abf 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -17,63 +17,79 @@
 #include "clang/AST/APValue.h"
 #include "llvm/ADT/APFloat.h"
 
+// XXX This is just a debugging help. Setting this to 1 will heap-allocate ALL
+// floating values.
+#define ALLOCATE_ALL 0
+
 namespace clang {
 namespace interp {
 
 using APFloat = llvm::APFloat;
 using APSInt = llvm::APSInt;
+using APInt = llvm::APInt;
 
+/// If a Floating is constructed from Memory, it DOES NOT OWN THAT MEMORY.
+/// It will NOT copy the memory (unless, of course, copy() is called) and it
+/// won't alllocate anything. The allocation should happen via InterpState or
+/// Program.
 class Floating final {
 private:
-  // The underlying value storage.
-  APFloat F;
+  union {
+    uint64_t Val = 0;
+    uint64_t *Memory;
+  };
+  llvm::APFloatBase::Semantics Semantics;
+
+  APFloat getValue() const {
+    unsigned BitWidth = bitWidth();
+    if (singleWord())
+      return APFloat(getSemantics(), APInt(BitWidth, Val));
+    unsigned NumWords = numWords();
+    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
+  }
 
 public:
-  /// Zero-initializes a Floating.
-  Floating() : F(0.0f) {}
-  Floating(const APFloat &F) : F(F) {}
+  Floating() = default;
+  Floating(llvm::APFloatBase::Semantics Semantics)
+      : Val(0), Semantics(Semantics) {}
+  Floating(const APFloat &F) {
 
-  // Static constructors for special floating point values.
-  static Floating getInf(const llvm::fltSemantics &Sem) {
-    return Floating(APFloat::getInf(Sem));
+    Semantics = llvm::APFloatBase::SemanticsToEnum(F.getSemantics());
+    this->copy(F);
   }
-  const APFloat &getAPFloat() const { return F; }
+  Floating(uint64_t *Memory, llvm::APFloatBase::Semantics Semantics)
+      : Memory(Memory), Semantics(Semantics) {}
+
+  APFloat getAPFloat() const { return getValue(); }
 
-  bool operator<(Floating RHS) const { return F < RHS.F; }
-  bool operator>(Floating RHS) const { return F > RHS.F; }
-  bool operator<=(Floating RHS) const { return F <= RHS.F; }
-  bool operator>=(Floating RHS) const { return F >= RHS.F; }
-  bool operator==(Floating RHS) const { return F == RHS.F; }
-  bool operator!=(Floating RHS) const { return F != RHS.F; }
-  Floating operator-() const { return Floating(-F); }
+  bool operator<(Floating RHS) const { return getValue() < RHS.getValue(); }
+  bool operator>(Floating RHS) const { return getValue() > RHS.getValue(); }
+  bool operator<=(Floating RHS) const { return getValue() <= RHS.getValue(); }
+  bool operator>=(Floating RHS) const { return getValue() >= RHS.getValue(); }
 
   APFloat::opStatus convertToInteger(APSInt &Result) const {
     bool IsExact;
-    return F.convertToInteger(Result, llvm::APFloat::rmTowardZero, &IsExact);
+    return getValue().convertToInteger(Result, llvm::APFloat::rmTowardZero,
+                                       &IsExact);
   }
 
-  Floating toSemantics(const llvm::fltSemantics *Sem,
-                       llvm::RoundingMode RM) const {
-    APFloat Copy = F;
+  void toSemantics(const llvm::fltSemantics *Sem, llvm::RoundingMode RM,
+                   Floating *Result) const {
+    APFloat Copy = getValue();
     bool LosesInfo;
     Copy.convert(*Sem, RM, &LosesInfo);
     (void)LosesInfo;
-    return Floating(Copy);
-  }
-
-  /// Convert this Floating to one with the same semantics as \Other.
-  Floating toSemantics(const Floating &Other, llvm::RoundingMode RM) const {
-    return toSemantics(&Other.F.getSemantics(), RM);
+    Result->copy(Copy);
   }
 
   APSInt toAPSInt(unsigned NumBits = 0) const {
-    return APSInt(F.bitcastToAPInt());
+    return APSInt(getValue().bitcastToAPInt());
   }
-  APValue toAPValue(const ASTContext &) const { return APValue(F); }
+  APValue toAPValue(const ASTContext &) const { return APValue(getValue()); }
   void print(llvm::raw_ostream &OS) const {
     // Can't use APFloat::print() since it appends a newline.
     SmallVector<char, 16> Buffer;
-    F.toString(Buffer);
+    getValue().toString(Buffer);
     OS << Buffer;
   }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
@@ -83,25 +99,62 @@ class Floating final {
     return NameStr;
   }
 
-  unsigned bitWidth() const { return F.semanticsSizeInBits(F.getSemantics()); }
+  unsigned bitWidth() const {
+    return llvm::APFloatBase::semanticsSizeInBits(getSemantics());
+  }
+  unsigned numWords() const { return llvm::APInt::getNumWords(bitWidth()); }
+  bool singleWord() const {
+#if ALLOCATE_ALL
+    return false;
+#endif
+    return numWords() == 1;
+  }
+  static bool singleWord(const llvm::fltSemantics &Sem) {
+#if ALLOCATE_ALL
+    return false;
+#endif
+    return APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem)) == 1;
+  }
+  const llvm::fltSemantics &getSemantics() const {
+    return llvm::APFloatBase::EnumToSemantics(Semantics);
+  }
+
+  void copy(const APFloat &F) {
+    if (singleWord()) {
+      Val = F.bitcastToAPInt().getZExtValue();
+    } else {
+      assert(Memory);
+      std::memcpy(Memory, F.bitcastToAPInt().getRawData(),
+                  numWords() * sizeof(uint64_t));
+    }
+  }
+
+  void take(uint64_t *NewMemory) {
+    if (singleWord())
+      return;
+
+    if (Memory)
+      std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
+    Memory = NewMemory;
+  }
 
   bool isSigned() const { return true; }
-  bool isNegative() const { return F.isNegative(); }
-  bool isZero() const { return F.isZero(); }
-  bool isNonZero() const { return F.isNonZero(); }
-  bool isMin() const { return F.isSmallest(); }
-  bool isMinusOne() const { return F.isExactlyValue(-1.0); }
-  bool isNan() const { return F.isNaN(); }
-  bool isSignaling() const { return F.isSignaling(); }
-  bool isInf() const { return F.isInfinity(); }
-  bool isFinite() const { return F.isFinite(); }
-  bool isNormal() const { return F.isNormal(); }
-  bool isDenormal() const { return F.isDenormal(); }
-  llvm::FPClassTest classify() const { return F.classify(); }
-  APFloat::fltCategory getCategory() const { return F.getCategory(); }
+  bool isNegative() const { return getValue().isNegative(); }
+  bool isZero() const { return getValue().isZero(); }
+  bool isNonZero() const { return getValue().isNonZero(); }
+  bool isMin() const { return getValue().isSmallest(); }
+  bool isMinusOne() const { return getValue().isExactlyValue(-1.0); }
+  bool isNan() const { return getValue().isNaN(); }
+  bool isSignaling() const { return getValue().isSignaling(); }
+  bool isInf() const { return getValue().isInfinity(); }
+  bool isFinite() const { return getValue().isFinite(); }
+  bool isNormal() const { return getValue().isNormal(); }
+  bool isDenormal() const { return getValue().isDenormal(); }
+  llvm::FPClassTest classify() const { return getValue().classify(); }
+  APFloat::fltCategory getCategory() const { return getValue().getCategory(); }
 
   ComparisonCategoryResult compare(const Floating &RHS) const {
-    llvm::APFloatBase::cmpResult CmpRes = F.compare(RHS.F);
+    llvm::APFloatBase::cmpResult CmpRes = getValue().compare(RHS.getValue());
     switch (CmpRes) {
     case llvm::APFloatBase::cmpLessThan:
       return ComparisonCategoryResult::Less;
@@ -118,97 +171,130 @@ class Floating final {
   static APFloat::opStatus fromIntegral(APSInt Val,
                                         const llvm::fltSemantics &Sem,
                                         llvm::RoundingMode RM,
-                                        Floating &Result) {
+                                        Floating *Result) {
     APFloat F = APFloat(Sem);
     APFloat::opStatus Status = F.convertFromAPInt(Val, Val.isSigned(), RM);
-    Result = Floating(F);
+    Result->copy(F);
     return Status;
   }
 
-  static Floating bitcastFromMemory(const std::byte *Buff,
-                                    const llvm::fltSemantics &Sem) {
+  static void bitcastFromMemory(const std::byte *Buff,
+                                const llvm::fltSemantics &Sem,
+                                Floating *Result) {
     size_t Size = APFloat::semanticsSizeInBits(Sem);
     llvm::APInt API(Size, true);
     llvm::LoadIntFromMemory(API, (const uint8_t *)Buff, Size / 8);
-
-    return Floating(APFloat(Sem, API));
+    Result->copy(APFloat(Sem, API));
   }
 
   void bitcastToMemory(std::byte *Buff) const {
-    llvm::APInt API = F.bitcastToAPInt();
+    llvm::APInt API = getValue().bitcastToAPInt();
     llvm::StoreIntToMemory(API, (uint8_t *)Buff, bitWidth() / 8);
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    return sizeof(llvm::fltSemantics *) +
-           (APFloat::semanticsSizeInBits(F.getSemantics()) / 8);
+    return sizeof(Semantics) + (numWords() * sizeof(uint64_t));
   }
 
   void serialize(std::byte *Buff) const {
-    // Semantics followed by an APInt.
-    *reinterpret_cast<const llvm::fltSemantics **>(Buff) = &F.getSemantics();
-
-    llvm::APInt API = F.bitcastToAPInt();
-    llvm::StoreIntToMemory(API, (uint8_t *)(Buff + sizeof(void *)),
-                           bitWidth() / 8);
+    std::memcpy(Buff, &Semantics, sizeof(Semantics));
+    if (singleWord()) {
+      std::memcpy(Buff + sizeof(Semantics), &Val, sizeof(uint64_t));
+    } else {
+      std::memcpy(Buff + sizeof(Semantics), Memory,
+                  numWords() * sizeof(uint64_t));
+    }
   }
 
-  static Floating deserialize(const std::byte *Buff) {
-    const llvm::fltSemantics *Sem;
-    std::memcpy((void *)&Sem, Buff, sizeof(void *));
-    return bitcastFromMemory(Buff + sizeof(void *), *Sem);
+  static llvm::APFloatBase::Semantics
+  deserializeSemantics(const std::byte *Buff) {
+    return *reinterpret_cast<const llvm::APFloatBase::Semantics *>(Buff);
   }
 
-  static Floating abs(const Floating &F) {
-    APFloat V = F.F;
-    if (V.isNegative())
-      V.changeSign();
-    return Floating(V);
+  static void deserialize(const std::byte *Buff, Floating *Result) {
+    llvm::APFloatBase::Semantics Semantics;
+    std::memcpy(&Semantics, Buff, sizeof(Semantics));
+
+    unsigned BitWidth = llvm::APFloat::semanticsSizeInBits(
+        llvm::APFloatBase::EnumToSemantics(Semantics));
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+
+    Result->Semantics = Semantics;
+    if (NumWords == 1 && !ALLOCATE_ALL) {
+      std::memcpy(&Result->Val, Buff + sizeof(Semantics), sizeof(uint64_t));
+    } else {
+      assert(Result->Memory);
+      std::memcpy(Result->Memory, Buff + sizeof(Semantics),
+                  NumWords * sizeof(uint64_t));
+    }
   }
 
   // -------
 
   static APFloat::opStatus add(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.add(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.add(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus increment(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.F.getSemantics(), 1);
-    *R = Floating(A.F);
-    return R->F.add(One, RM);
+    APFloat One(A.getSemantics(), 1);
+    APFloat LHS = A.getValue();
+
+    auto Status = LHS.add(One, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus sub(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.subtract(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.subtract(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus decrement(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.F.getSemantics(), 1);
-    *R = Floating(A.F);
-    return R->F.subtract(One, RM);
+    APFloat One(A.getSemantics(), 1);
+    APFloat LHS = A.getValue();
+
+    auto Status = LHS.subtract(One, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus mul(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.multiply(B.F, RM);
+
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.multiply(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static APFloat::opStatus div(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    *R = Floating(A.F);
-    return R->F.divide(B.F, RM);
+    APFloat LHS = A.getValue();
+    APFloat RHS = B.getValue();
+
+    auto Status = LHS.divide(RHS, RM);
+    R->copy(LHS);
+    return Status;
   }
 
   static bool neg(const Floating &A, Floating *R) {
-    *R = -A;
+    R->copy(-A.getValue());
     return false;
   }
 };
diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h
index 13fdb5369f2b7..af5cd2d13ecca 100644
--- a/clang/lib/AST/ByteCode/Integral.h
+++ b/clang/lib/AST/ByteCode/Integral.h
@@ -99,6 +99,9 @@ template <unsigned Bits, bool Signed> class Integral final {
   bool operator>=(Integral RHS) const { return V >= RHS.V; }
   bool operator==(Integral RHS) const { return V == RHS.V; }
   bool operator!=(Integral RHS) const { return V != RHS.V; }
+  bool operator>=(unsigned RHS) const {
+    return static_cast<unsigned>(V) >= RHS;
+  }
 
   bool operator>(unsigned RHS) const {
     return V >= 0 && static_cast<unsigned>(V) > RHS;
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 8ee08dfb5cfe7..259262bdc5243 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -28,12 +28,19 @@ namespace interp {
 
 using APInt = llvm::APInt;
 using APSInt = llvm::APSInt;
-template <unsigned Bits, bool Signed> class Integral;
 
+/// If an IntegralAP is constructed from Memory, it DOES NOT OWN THAT MEMORY.
+/// It will NOT copy the memory (unless, of course, copy() is called) and it
+/// won't alllocate anything. The allocation should happen via InterpState or
+/// Program.
 template <bool Signed> class IntegralAP final {
-private:
+public:
+  union {
+    uint64_t *Memory = nullptr;
+    uint64_t Val;
+  };
+  unsigned BitWidth = 0;
   friend IntegralAP<!Signed>;
-  APInt V;
 
   template <typename T, bool InputSigned>
   static T truncateCast(const APInt &V) {
@@ -52,106 +59,129 @@ template <bool Signed> class IntegralAP final {
                                : V.trunc(BitSize).getZExtValue();
   }
 
+  APInt getValue() const {
+    if (singleWord())
+      return APInt(BitWidth, Val, Signed);
+    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
+    return llvm::APInt(BitWidth, NumWords, Memory);
+  }
+
 public:
   using AsUnsigned = IntegralAP<false>;
 
-  template <typename T>
-  IntegralAP(T Value, unsigned BitWidth)
-      : V(APInt(BitWidth, static_cast<uint64_t>(Value), Signed)) {}
+  void take(uint64_t *NewMemory) {
+    assert(!singleWord());
+    std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
+    Memory = NewMemory;
+  }
+
+  void copy(const APInt &V) {
+    assert(BitWidth == V.getBitWidth());
+    assert(numWords() == V.getNumWords());
+
+    if (V.isSingleWord()) {
+      if constexpr (Signed)
+        Val = V.getSExtValue();
+      else
+        Val = V.getZExtValue();
+      return;
+    }
+    assert(Memory);
+    std::memcpy(Memory, V.getRawData(), V.getNumWords() * sizeof(uint64_t));
+  }
 
-  IntegralAP(APInt V) : V(V) {}
-  /// Arbitrary value for uninitialized variables.
-  IntegralAP() : IntegralAP(Signed ? -1 : 7, 3) {}
+  // Constructors.
+  IntegralAP() = default;
+  IntegralAP(unsigned BitWidth) : BitWidth(BitWidth) {}
+  IntegralAP(uint64_t *Memory, unsigned BitWidth)
+      : Memory(Memory), BitWidth(BitWidth) {}
+  IntegralAP(const APInt &V)
+      : IntegralAP(const_cast<uint64_t *>((const uint64_t *)V.getRawData()),
+                   V.getBitWidth()) {}
 
-  IntegralAP operator-() const { return IntegralAP(-V); }
+  IntegralAP operator-() const { return IntegralAP(-getValue()); }
   IntegralAP operator-(const IntegralAP &Other) const {
-    return IntegralAP(V - Other.V);
+    return IntegralAP(getValue() - Other.getValue());
   }
   bool operator>(const IntegralAP &RHS) const {
     if constexpr (Signed)
-      return V.ugt(RHS.V);
-    return V.sgt(RHS.V);
+      return getValue().sgt(RHS.getValue());
+    return getValue().ugt(RHS.getValue());
   }
-  bool operator>=(IntegralAP RHS) const {
+  bool operator>=(unsigned RHS) const {
     if constexpr (Signed)
-      return V.uge(RHS.V);
-    return V.sge(RHS.V);
+      return getValue().sge(RHS);
+    return getValue().uge(RHS);
   }
   bool operator<(IntegralAP RHS) const {
     if constexpr (Signed)
-      return V.slt(RHS.V);
-    return V.slt(RHS.V);
-  }
-  bool operator<=(IntegralAP RHS) const {
-    if constexpr (Signed)
-      return V.ult(RHS.V);
-    return V.ult(RHS.V);
+      return getValue().slt(RHS.getValue());
+    return getValue().ult(RHS.getValue());
   }
 
   template <typename Ty, typename = std::enable_if_t<std::is_integral_v<Ty>>>
   explicit operator Ty() const {
-    return truncateCast<Ty, Signed>(V);
+    return truncateCast<Ty, Signed>(getValue());
   }
 
   template <typename T> static IntegralAP from(T Value, unsigned NumBits = 0) {
+    if (NumBits == 0)
+      NumBits = sizeof(T) * 8;
     assert(NumBits > 0);
     APInt Copy = APInt(NumBits, static_cast<uint64_t>(Value), Signed);
-
+    assert(false);
     return IntegralAP<Signed>(Copy);
   }
 
+  static IntegralAP from(const APInt &Value) {
+    return IntegralAP<Signed>(Value);
+  }
+
   template <bool InputSigned>
   static IntegralAP from(IntegralAP<InputSigned> V, unsigned NumBits = 0) {
     if (NumBits == 0)
       NumBits = V.bitWidth();
 
     if constexpr (InputSigned)
-      return IntegralAP<Signed>(V.V.sextOrTrunc(NumBits));
-    return IntegralAP<Signed>(V.V.zextOrTrunc(NumBits));
-  }
-
-  template <unsigned Bits, bool InputSigned>
-  static IntegralAP from(Integral<Bits, InputSigned> I, unsigned BitWidth) {
-    return IntegralAP<Signed>(I.toAPInt(BitWidth));
-  }
-
-  static IntegralAP zero(int32_t BitWidth) {
-    APInt V = APInt(BitWidth, 0LL, Signed);
-    return IntegralAP(V);
+      return IntegralAP<Signed>(V.getValue().sextOrTrunc(NumBits));
+    return IntegralAP<Signed>(V.getValue().zextOrTrunc(NumBits));
   }
 
-  constexpr unsigned bitWidth() const { return V.getBitWidth(); }
+  constexpr unsigned bitWidth() const { return BitWidth; }
+  constexpr unsigned numWords() const { return APInt::getNumWords(BitWidth); }
+  constexpr bool singleWord() const { return numWords() == 1; }
 
   APSInt toAPSInt(unsigned Bits = 0) const {
     if (Bits == 0)
       Bits = bitWidth();
 
+    APInt V = getValue();
     if constexpr (Signed)
-      return APSInt(V.sext(Bits), !Signed);
+      return APSInt(getValue().sext(Bits), !Signed);
     else
-      return APSInt(V.zext(Bits), !Signed);
+      return APSInt(getValue().zext(Bits), !Signed);
   }
   APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
-  bool isZero() const { return V.isZero(); }
+  bool isZero() const { return getValue().isZero(); }
   bool isPositive() const {
     if constexpr (Signed)
-      return V.isNonNegative();
+      return getValue().isNonNegative();
     return true;
   }
   bool isNegative() const {
     if constexpr (Signed)
-      return !V.isNonNegative();
+      return !getValue().isNonNegative();
     return false;
   }
-  bool isMin() const { return V.isMinValue(); }
-  bool isMax() const { return V.isMaxValue(); }
+  bool isMin() const { return getValue().isMinValue(); }
+  bool isMax() const { return getValue().isMaxValue(); }
   static constexpr bool isSigned() { return Signed; }
-  bool isMinusOne() const { return Signed && V == -1; }
+  bool isMinusOne() const { return Signed && getValue().isAllOnes(); }
 
-  unsigned countLeadingZeros() const { return V.countl_zero(); }
+  unsigned countLeadingZeros() const { return getValue().countl_zero(); }
 
-  void print(llvm::raw_ostream &OS) const { V.print(OS, Signed);}
+  void print(llvm::raw_ostream &OS) const { getValue().print(OS, Signed); }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
     std::string NameStr;
     llvm::raw_string_ostream OS(NameStr);
@@ -161,53 +191,64 @@ template <bool Signed> class IntegralAP final {
 
   IntegralAP truncate(unsigned BitWidth) const {
     if constexpr (Signed)
-      return IntegralAP(V.trunc(BitWidth).sextOrTrunc(this->bitWidth()));
+      return IntegralAP(
+          getValue().trunc(BitWidth).sextOrTrunc(this->bitWidth()));
     else
-      return IntegralAP(V.trunc(BitWidth).zextOrTrunc(this->bitWidth()));
+      return IntegralAP(
+          getValue().trunc(BitWidth).zextOrTrunc(this->bitWidth()));
   }
 
   IntegralAP<false> toUnsigned() const {
-    APInt Copy = V;
-    return IntegralAP<false>(Copy);
+    return IntegralAP<false>(Memory, BitWidth);
   }
 
   void bitcastToMemory(std::byte *Dest) const {
-    llvm::StoreIntToMemory(V, (uint8_t *)Dest, bitWidth() / 8);
+    llvm::StoreIntToMemory(getValue(), (uint8_t *)Dest, bitWidth() / 8);
   }
 
   static IntegralAP bitcastFromMemory(const std::byte *Src, unsigned BitWidth) {
+    // FIXME: Remove this.
     APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
     llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
     return IntegralAP(V);
   }
 
+  static void bitcastFromMemory(const std::byte *Src, unsigned BitWidth,
+                                IntegralAP *Result) {
+    APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
+    llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
+    Result->copy(V);
+  }
+
   ComparisonCategoryResult compare(const IntegralAP &RHS) const {
     assert(Signed == RHS.isSigned());
     assert(bitWidth() == RHS.bitWidth());
+    APInt V1 = getValue();
+    APInt V2 = RHS.getValue();
     if constexpr (Signed) {
-      if (V.slt(RHS.V))
+      if (V1.slt(V2))
         return ComparisonCategoryResult::Less;
-      if (V.sgt(RHS.V))
+      if (V1.sgt(V2))
         return ComparisonCategoryResult::Greater;
       return ComparisonCategoryResult::Equal;
     }
 
     assert(!Signed);
-    if (V.ult(RHS.V))
+    if (V1.ult(V2))
       return ComparisonCategoryResult::Less;
-    if (V.ugt(RHS.V))
+    if (V1.ugt(V2))
       return ComparisonCategoryResult::Greater;
     return ComparisonCategoryResult::Equal;
   }
 
   static bool increment(IntegralAP A, IntegralAP *R) {
-    IntegralAP<Signed> One(1, A.bitWidth());
-    return add(A, One, A.bitWidth() + 1, R);
+    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
+    return add(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
   }
 
   static bool decrement(IntegralAP A, IntegralAP *R) {
-    IntegralAP<Signed> One(1, A.bitWidth());
-    return sub(A, One, A.bitWidth() + 1, R);
+    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
+    return sub(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
   }
 
   static bool add(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
@@ -224,87 +265,95 @@ template <bool Signed> class IntegralAP final {
 
   static bool rem(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      *R = IntegralAP(A.V.srem(B.V));
+      R->copy(A.getValue().srem(B.getValue()));
     else
-      *R = IntegralAP(A.V.urem(B.V));
+      R->copy(A.getValue().urem(B.getValue()));
     return false;
   }
 
   static bool div(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      *R = IntegralAP(A.V.sdiv(B.V));
+      R->copy(A.getValue().sdiv(B.getValue()));
     else
-      *R = IntegralAP(A.V.udiv(B.V));
+      R->copy(A.getValue().udiv(B.getValue()));
     return false;
   }
 
   static bool bitAnd(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    *R = IntegralAP(A.V & B.V);
+    R->copy(A.getValue() & B.getValue());
     return false;
   }
 
   static bool bitOr(IntegralAP A, IntegralAP B, unsigned OpBits,
                     IntegralAP *R) {
-    *R = IntegralAP(A.V | B.V);
+    R->copy(A.getValue() | B.getValue());
     return false;
   }
 
   static bool bitXor(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    *R = IntegralAP(A.V ^ B.V);
+    R->copy(A.getValue() ^ B.getValue());
     return false;
   }
 
   static bool neg(const IntegralAP &A, IntegralAP *R) {
-    APInt AI = A.V;
+    APInt AI = A.getValue();
     AI.negate();
-    *R = IntegralAP(AI);
+    R->copy(AI);
     return false;
   }
 
   static bool comp(IntegralAP A, IntegralAP *R) {
-    *R = IntegralAP(~A.V);
+    R->copy(~A.getValue());
     return false;
   }
 
   static void shiftLeft(const IntegralAP A, const IntegralAP B, unsigned OpBits,
                         IntegralAP *R) {
-    *R = IntegralAP(A.V.shl(B.V.getZExtValue()));
+    *R = IntegralAP(A.getValue().shl(B.getValue().getZExtValue()));
   }
 
   static void shiftRight(const IntegralAP A, const IntegralAP B,
                          unsigned OpBits, IntegralAP *R) {
-    unsigned ShiftAmount = B.V.getZExtValue();
+    unsigned ShiftAmount = B.getValue().getZExtValue();
     if constexpr (Signed)
-      *R = IntegralAP(A.V.ashr(ShiftAmount));
+      R->copy(A.getValue().ashr(ShiftAmount));
     else
-      *R = IntegralAP(A.V.lshr(ShiftAmount));
+      R->copy(A.getValue().lshr(ShiftAmount));
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    // 4 bytes for the BitWidth followed by N bytes for the actual APInt.
-    return sizeof(uint32_t) + (V.getBitWidth() / CHAR_BIT);
+    assert(BitWidth != 0);
+    uint32_t NumWords = llvm::APInt::getNumWords(bitWidth());
+    return sizeof(uint64_t) + (NumWords * sizeof(uint64_t));
   }
 
   void serialize(std::byte *Buff) const {
-    assert(V.getBitWidth() < std::numeric_limits<uint8_t>::max());
-    uint32_t BitWidth = V.getBitWidth();
+    uint64_t NumWords = llvm::APInt::getNumWords(bitWidth());
+    std::memcpy(Buff, &BitWidth, sizeof(uint64_t));
+    if (singleWord())
+      std::memcpy(Buff + sizeof(uint64_t), &Val, NumWords * sizeof(uint64_t));
+    else
+      std::memcpy(Buff + sizeof(uint64_t), Memory, NumWords * sizeof(uint64_t));
+  }
 
-    std::memcpy(Buff, &BitWidth, sizeof(uint32_t));
-    llvm::StoreIntToMemory(V, (uint8_t *)(Buff + sizeof(uint32_t)),
-                           BitWidth / CHAR_BIT);
+  static uint32_t deserializeSize(const std::byte *Buff) {
+    return *reinterpret_cast<const uint64_t *>(Buff);
   }
 
-  static IntegralAP<Signed> deserialize(const std::byte *Buff) {
-    uint32_t BitWidth;
-    std::memcpy(&BitWidth, Buff, sizeof(uint32_t));
-    IntegralAP<Signed> Val(APInt(BitWidth, 0ull, !Signed));
+  static void deserialize(const std::byte *Buff, IntegralAP<Signed> *Result) {
+    uint32_t BitWidth = Result->BitWidth;
+    uint32_t NumWords = llvm::APInt::getNumWords(BitWidth);
+    assert(BitWidth == Result->BitWidth);
+    assert(Result->Memory);
 
-    llvm::LoadIntFromMemory(Val.V, (const uint8_t *)Buff + sizeof(uint32_t),
-                            BitWidth / CHAR_BIT);
-    return Val;
+    if (NumWords == 1)
+      std::memcpy(&Result->Val, Buff + sizeof(uint64_t), sizeof(uint64_t));
+    else
+      std::memcpy(Result->Memory, Buff + sizeof(uint64_t),
+                  NumWords * sizeof(uint64_t));
   }
 
 private:
@@ -312,7 +361,7 @@ template <bool Signed> class IntegralAP final {
   static bool CheckAddSubMulUB(const IntegralAP &A, const IntegralAP &B,
                                unsigned BitWidth, IntegralAP *R) {
     if constexpr (!Signed) {
-      R->V = Op<APInt>{}(A.V, B.V);
+      R->copy(Op<APInt>{}(A.getValue(), B.getValue()));
       return false;
     }
 
@@ -320,7 +369,7 @@ template <bool Signed> class IntegralAP final {
     const APSInt &RHS = B.toAPSInt();
     APSInt Value = Op<APSInt>{}(LHS.extend(BitWidth), RHS.extend(BitWidth));
     APSInt Result = Value.trunc(LHS.getBitWidth());
-    R->V = Result;
+    R->copy(Result);
 
     return Result.extend(BitWidth) != Value;
   }
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 5c8abffb3a99d..1e2032feabb64 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1935,8 +1935,10 @@ bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  S.Stk.push<IntegralAP<false>>(
-      IntegralAP<false>::from(Ptr.getIntegerRepresentation(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
+
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
@@ -1946,8 +1948,10 @@ bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  S.Stk.push<IntegralAP<true>>(
-      IntegralAP<true>::from(Ptr.getIntegerRepresentation(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
+
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2053,6 +2057,100 @@ bool arePotentiallyOverlappingStringLiterals(const Pointer &LHS,
   return Shorter == Longer.take_front(Shorter.size());
 }
 
+static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr,
+                                PrimType T) {
+
+  if (T == PT_IntAPS) {
+    auto &Val = Ptr.deref<IntegralAP<true>>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  } else if (T == PT_IntAP) {
+    auto &Val = Ptr.deref<IntegralAP<false>>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  } else if (T == PT_Float) {
+    auto &Val = Ptr.deref<Floating>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  }
+}
+
+template <typename T>
+static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr) {
+  assert(needsAlloc<T>());
+  auto &Val = Ptr.deref<T>();
+  if (!Val.singleWord()) {
+    uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+    Val.take(NewMemory);
+  }
+}
+
+static void finishGlobalRecurse(InterpState &S, const Pointer &Ptr) {
+  if (const Record *R = Ptr.getRecord()) {
+    for (const Record::Field &Fi : R->fields()) {
+      if (Fi.Desc->isPrimitive()) {
+        TYPE_SWITCH_ALLOC(Fi.Desc->getPrimType(), {
+          copyPrimitiveMemory<T>(S, Ptr.atField(Fi.Offset));
+        });
+        copyPrimitiveMemory(S, Ptr.atField(Fi.Offset), Fi.Desc->getPrimType());
+      } else
+        finishGlobalRecurse(S, Ptr.atField(Fi.Offset));
+    }
+    return;
+  }
+
+  if (const Descriptor *D = Ptr.getFieldDesc(); D && D->isArray()) {
+    unsigned NumElems = D->getNumElems();
+    if (NumElems == 0)
+      return;
+
+    if (D->isPrimitiveArray()) {
+      PrimType PT = D->getPrimType();
+      if (!needsAlloc(PT))
+        return;
+      assert(NumElems >= 1);
+      const Pointer EP = Ptr.atIndex(0);
+      bool AllSingleWord = true;
+      TYPE_SWITCH_ALLOC(PT, {
+        if (!EP.deref<T>().singleWord()) {
+          copyPrimitiveMemory<T>(S, EP);
+          AllSingleWord = false;
+        }
+      });
+      if (AllSingleWord)
+        return;
+      for (unsigned I = 1; I != D->getNumElems(); ++I) {
+        const Pointer EP = Ptr.atIndex(I);
+        copyPrimitiveMemory(S, EP, PT);
+      }
+    } else {
+      assert(D->isCompositeArray());
+      for (unsigned I = 0; I != D->getNumElems(); ++I) {
+        const Pointer EP = Ptr.atIndex(I).narrow();
+        finishGlobalRecurse(S, EP);
+      }
+    }
+  }
+}
+
+bool FinishInitGlobal(InterpState &S, CodePtr OpPC) {
+  const Pointer &Ptr = S.Stk.pop<Pointer>();
+
+  finishGlobalRecurse(S, Ptr);
+  if (Ptr.canBeInitialized()) {
+    Ptr.initialize();
+    Ptr.activate();
+  }
+
+  return true;
+}
+
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index ae3d4a441a799..66d3e6d79e8b2 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -189,7 +189,7 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
   // the shifted type.
-  if (Bits > 1 && RHS >= RT::from(Bits, RHS.bitWidth())) {
+  if (Bits > 1 && RHS >= Bits) {
     const Expr *E = S.Current->getExpr(OpPC);
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
@@ -370,6 +370,9 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS,
                      const T &RHS) {
   // Fast path - add the numbers with fixed width.
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!OpFW(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -408,6 +411,7 @@ bool Add(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
+
   return AddSubMulHelper<T, T::add, std::plus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -423,7 +427,7 @@ inline bool Addf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::add(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -434,6 +438,7 @@ bool Sub(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
+
   return AddSubMulHelper<T, T::sub, std::minus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -442,7 +447,7 @@ inline bool Subf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::sub(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -453,6 +458,7 @@ bool Mul(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() * 2;
+
   return AddSubMulHelper<T, T::mul, std::multiplies>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -461,8 +467,10 @@ inline bool Mulf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+  Floating Result = S.allocFloat(LHS.getSemantics());
+
   auto Status = Floating::mul(LHS, RHS, getRoundingMode(FPO), &Result);
+
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -484,9 +492,14 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexMul(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    Floating RA = S.allocFloat(A.getSemantics());
+    RA.copy(ResR);
+    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
     Result.atIndex(0).initialize();
-    Result.atIndex(1).deref<Floating>() = Floating(ResI);
+
+    Floating RI = S.allocFloat(A.getSemantics());
+    RI.copy(ResI);
+    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
     Result.atIndex(1).initialize();
     Result.initialize();
   } else {
@@ -539,10 +552,20 @@ inline bool Divc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexDiv(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    // Result.atIndex(0).deref<Floating>() = Floating(ResR);
+    // Result.atIndex(0).initialize();
+    // Result.atIndex(1).deref<Floating>() = Floating(ResI);
+    // Result.atIndex(1).initialize();
+
+    Floating RA = S.allocFloat(A.getSemantics());
+    RA.copy(ResR);
+    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
     Result.atIndex(0).initialize();
-    Result.atIndex(1).deref<Floating>() = Floating(ResI);
-    Result.atIndex(1).initialize();
+
+    Floating RI = S.allocFloat(A.getSemantics());
+    RI.copy(ResI);
+    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
+
     Result.initialize();
   } else {
     // Integer element type.
@@ -608,9 +631,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitAnd(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitAnd(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -625,9 +651,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitOr(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitOr(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -644,7 +673,11 @@ bool BitXor(InterpState &S, CodePtr OpPC) {
   const T &LHS = S.Stk.pop<T>();
 
   unsigned Bits = RHS.bitWidth();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Bits);
+
   if (!T::bitXor(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -659,12 +692,15 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Rem(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
-  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!T::rem(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -679,12 +715,15 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Div(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
+  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
-  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(LHS.bitWidth());
+
   if (!T::div(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -707,8 +746,10 @@ inline bool Divf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
     return false;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result;
+
+  Floating Result = S.allocFloat(LHS.getSemantics());
   auto Status = Floating::div(LHS, RHS, getRoundingMode(FPO), &Result);
+
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -730,31 +771,44 @@ inline bool Inv(InterpState &S, CodePtr OpPC) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Neg(InterpState &S, CodePtr OpPC) {
   const T &Value = S.Stk.pop<T>();
-  T Result;
 
-  if (!T::neg(Value, &Result)) {
+  if constexpr (std::is_same_v<T, Floating>) {
+    T Result = S.allocFloat(Value.getSemantics());
+
+    if (!T::neg(Value, &Result)) {
+      S.Stk.push<T>(Result);
+      return true;
+    }
+    return false;
+  } else {
+    T Result;
+    if constexpr (needsAlloc<T>())
+      Result = S.allocAP<T>(Value.bitWidth());
+
+    if (!T::neg(Value, &Result)) {
+      S.Stk.push<T>(Result);
+      return true;
+    }
+
+    assert(isIntegralType(Name) &&
+           "don't expect other types to fail at constexpr negation");
     S.Stk.push<T>(Result);
-    return true;
-  }
 
-  assert(isIntegralType(Name) &&
-         "don't expect other types to fail at constexpr negation");
-  S.Stk.push<T>(Result);
+    APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
+    if (S.checkingForUndefinedBehavior()) {
+      const Expr *E = S.Current->getExpr(OpPC);
+      QualType Type = E->getType();
+      SmallString<32> Trunc;
+      NegatedValue.trunc(Result.bitWidth())
+          .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
+                    /*UpperCase=*/true, /*InsertSeparators=*/true);
+      S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
+          << Trunc << Type << E->getSourceRange();
+      return true;
+    }
 
-  APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
-  if (S.checkingForUndefinedBehavior()) {
-    const Expr *E = S.Current->getExpr(OpPC);
-    QualType Type = E->getType();
-    SmallString<32> Trunc;
-    NegatedValue.trunc(Result.bitWidth())
-        .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
-                  /*UpperCase=*/true, /*InsertSeparators=*/true);
-    S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
-        << Trunc << Type << E->getSourceRange();
-    return true;
+    return handleOverflow(S, OpPC, NegatedValue);
   }
-
-  return handleOverflow(S, OpPC, NegatedValue);
 }
 
 enum class PushVal : bool {
@@ -783,6 +837,8 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
   const T &Value = Ptr.deref<T>();
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Value.bitWidth());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<T>(Value);
@@ -890,7 +946,6 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
     return false;
-
   return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
@@ -898,7 +953,7 @@ template <IncDecOp Op, PushVal DoPush>
 bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                        uint32_t FPOI) {
   Floating Value = Ptr.deref<Floating>();
-  Floating Result;
+  Floating Result = S.allocFloat(Value.getSemantics());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<Floating>(Value);
@@ -952,12 +1007,15 @@ inline bool DecfPop(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Comp(InterpState &S, CodePtr OpPC) {
   const T &Val = S.Stk.pop<T>();
+
   T Result;
+  if constexpr (needsAlloc<T>())
+    Result = S.allocAP<T>(Val.bitWidth());
+
   if (!T::comp(Val, &Result)) {
     S.Stk.push<T>(Result);
     return true;
   }
-
   return false;
 }
 
@@ -1325,10 +1383,23 @@ bool Flip(InterpState &S, CodePtr OpPC) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Const(InterpState &S, CodePtr OpPC, const T &Arg) {
+  if constexpr (needsAlloc<T>()) {
+    T Result = S.allocAP<T>(Arg.bitWidth());
+    Result.copy(Arg.toAPSInt());
+    S.Stk.push<T>(Result);
+    return true;
+  }
   S.Stk.push<T>(Arg);
   return true;
 }
 
+inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) {
+  Floating Result = S.allocFloat(F.getSemantics());
+  Result.copy(F.getAPFloat());
+  S.Stk.push<Floating>(Result);
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Get/Set Local/Param/Global/This
 //===----------------------------------------------------------------------===//
@@ -1483,7 +1554,24 @@ bool SetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
   const Pointer &P = S.P.getGlobal(I);
+
   P.deref<T>() = S.Stk.pop<T>();
+
+  if constexpr (std::is_same_v<T, Floating>) {
+    auto &Val = P.deref<Floating>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+
+  } else if constexpr (needsAlloc<T>()) {
+    auto &Val = P.deref<T>();
+    if (!Val.singleWord()) {
+      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
+      Val.take(NewMemory);
+    }
+  }
+
   P.initialize();
   return true;
 }
@@ -1585,7 +1673,22 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) {
   assert(F->isBitField());
   const T &Value = S.Stk.pop<T>();
   const Pointer &Field = S.Stk.peek<Pointer>().atField(F->Offset);
-  Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
+
+  if constexpr (needsAlloc<T>()) {
+    T Result = S.allocAP<T>(Value.bitWidth());
+    if (T::isSigned())
+      Result.copy(Value.toAPSInt()
+                      .trunc(F->Decl->getBitWidthValue())
+                      .sextOrTrunc(Value.bitWidth()));
+    else
+      Result.copy(Value.toAPSInt()
+                      .trunc(F->Decl->getBitWidthValue())
+                      .zextOrTrunc(Value.bitWidth()));
+
+    Field.deref<T>() = Result;
+  } else {
+    Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
+  }
   Field.activate();
   Field.initialize();
   return true;
@@ -1765,6 +1868,8 @@ inline bool FinishInit(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
+bool FinishInitGlobal(InterpState &S, CodePtr OpPC);
+
 inline bool Dump(InterpState &S, CodePtr OpPC) {
   S.Stk.dump();
   return true;
@@ -2271,7 +2376,8 @@ template <PrimType TIn, PrimType TOut> bool Cast(InterpState &S, CodePtr OpPC) {
 inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem,
                    llvm::RoundingMode RM) {
   Floating F = S.Stk.pop<Floating>();
-  Floating Result = F.toSemantics(Sem, RM);
+  Floating Result = S.allocFloat(*Sem);
+  F.toSemantics(Sem, RM, &Result);
   S.Stk.push<Floating>(Result);
   return true;
 }
@@ -2295,15 +2401,25 @@ inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) {
 /// to know what bitwidth the result should be.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<false>>(
-      IntegralAP<false>::from(S.Stk.pop<T>(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  // Copy data.
+  {
+    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
+    Result.copy(Source);
+  }
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<true>>(
-      IntegralAP<true>::from(S.Stk.pop<T>(), BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  // Copy data.
+  {
+    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
+    Result.copy(Source);
+  }
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2312,11 +2428,11 @@ bool CastIntegralFloating(InterpState &S, CodePtr OpPC,
                           const llvm::fltSemantics *Sem, uint32_t FPOI) {
   const T &From = S.Stk.pop<T>();
   APSInt FromAP = From.toAPSInt();
-  Floating Result;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
+  Floating Result = S.allocFloat(*Sem);
   auto Status =
-      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), Result);
+      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
 
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -2365,7 +2481,12 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
+
+  auto ResultAP = S.allocAP<IntegralAP<false>>(BitWidth);
+  ResultAP.copy(Result);
+
+  S.Stk.push<IntegralAP<false>>(ResultAP);
+
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2381,7 +2502,12 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
+
+  auto ResultAP = S.allocAP<IntegralAP<true>>(BitWidth);
+  ResultAP.copy(Result);
+
+  S.Stk.push<IntegralAP<true>>(ResultAP);
+
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2441,8 +2567,9 @@ static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC,
 static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC,
                                           const llvm::fltSemantics *Sem) {
   const auto &Fixed = S.Stk.pop<FixedPoint>();
-
-  S.Stk.push<Floating>(Fixed.toFloat(Sem));
+  Floating Result = S.allocFloat(*Sem);
+  Result.copy(Fixed.toFloat(Sem));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -2506,12 +2633,18 @@ bool Zero(InterpState &S, CodePtr OpPC) {
 }
 
 static inline bool ZeroIntAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<false>>(IntegralAP<false>::zero(BitWidth));
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  if (!Result.singleWord())
+    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
+  S.Stk.push<IntegralAP<false>>(Result);
   return true;
 }
 
 static inline bool ZeroIntAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  S.Stk.push<IntegralAP<true>>(IntegralAP<true>::zero(BitWidth));
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  if (!Result.singleWord())
+    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
+  S.Stk.push<IntegralAP<true>>(Result);
   return true;
 }
 
@@ -2578,7 +2711,9 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) {
 //===----------------------------------------------------------------------===//
 
 template <class LT, class RT, ShiftDir Dir>
-inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
+inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
+                    LT *Result) {
+
   const unsigned Bits = LHS.bitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
@@ -2596,7 +2731,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
     RHS = -RHS;
     return DoShift<LT, RT,
                    Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS);
+        S, OpPC, LHS, RHS, Result);
   }
 
   if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
@@ -2644,6 +2779,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
       // Do the shift on potentially signed LT, then convert to unsigned type.
       LT A;
       LT::shiftRight(LHS, LT::from(RHS, Bits), Bits, &A);
+      // LT::shiftRight(LHS, LT(RHSTemp), Bits, &A);
       R = LT::AsUnsigned::from(A);
     }
   }
@@ -2652,6 +2788,48 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
   return true;
 }
 
+/// A version of DoShift that works on IntegralAP.
+template <class LT, class RT, ShiftDir Dir>
+inline bool DoShiftAP(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
+                      LT *Result) {
+  const unsigned Bits = LHS.bitWidth();
+  const APSInt &LHSAP = LHS.toAPSInt();
+  APSInt RHSAP = RHS.toAPSInt();
+
+  // OpenCL 6.3j: shift values are effectively % word size of LHS.
+  if (S.getLangOpts().OpenCL)
+    RHSAP &= APSInt(llvm::APInt(RHSAP.getBitWidth(),
+                                static_cast<uint64_t>(LHSAP.getBitWidth() - 1)),
+                    RHSAP.isUnsigned());
+
+  if (RHS.isNegative()) {
+    // During constant-folding, a negative shift is an opposite shift. Such a
+    // shift is not a constant expression.
+    const SourceInfo &Loc = S.Current->getSource(OpPC);
+    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
+    if (!S.noteUndefinedBehavior())
+      return false;
+    RHS = -RHS;
+    return DoShiftAP<LT, RT,
+                     Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
+        S, OpPC, LHS, RHS, Result);
+  }
+
+  if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
+    return false;
+
+  if constexpr (Dir == ShiftDir::Left) {
+    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
+    Result->copy(LHSAP << SA);
+  } else {
+    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
+    Result->copy(LHSAP >> SA);
+  }
+
+  S.Stk.push<LT>(*Result);
+  return true;
+}
+
 template <PrimType NameL, PrimType NameR>
 inline bool Shr(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
@@ -2659,7 +2837,13 @@ inline bool Shr(InterpState &S, CodePtr OpPC) {
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
 
-  return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS);
+  if constexpr (needsAlloc<LT>()) {
+    LT Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  } else {
+    LT Result;
+    return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
+  }
 }
 
 template <PrimType NameL, PrimType NameR>
@@ -2668,8 +2852,13 @@ inline bool Shl(InterpState &S, CodePtr OpPC) {
   using RT = typename PrimConv<NameR>::T;
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
-
-  return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS);
+  if constexpr (needsAlloc<LT>()) {
+    LT Result = S.allocAP<LT>(LHS.bitWidth());
+    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+  } else {
+    LT Result;
+    return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
+  }
 }
 
 static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) {
@@ -3252,7 +3441,15 @@ inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
 
     if constexpr (std::is_same_v<T, Floating>) {
       assert(Sem);
-      S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
+      Floating Result = S.allocFloat(*Sem);
+      Floating::bitcastFromMemory(Buff.data(), *Sem, &Result);
+      S.Stk.push<Floating>(Result);
+
+      // S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
+    } else if constexpr (needsAlloc<T>()) {
+      T Result = S.allocAP<T>(ResultBitWidth);
+      T::bitcastFromMemory(Buff.data(), ResultBitWidth, &Result);
+      S.Stk.push<T>(Result);
     } else {
       assert(!Sem);
       S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
@@ -3310,7 +3507,11 @@ template <typename T> inline T ReadArg(InterpState &S, CodePtr &OpPC) {
 }
 
 template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
-  Floating F = Floating::deserialize(*OpPC);
+  auto &Semantics =
+      llvm::APFloatBase::EnumToSemantics(Floating::deserializeSemantics(*OpPC));
+
+  auto F = S.allocFloat(Semantics);
+  Floating::deserialize(*OpPC, &F);
   OpPC += align(F.bytesToSerialize());
   return F;
 }
@@ -3318,17 +3519,25 @@ template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
 template <>
 inline IntegralAP<false> ReadArg<IntegralAP<false>>(InterpState &S,
                                                     CodePtr &OpPC) {
-  IntegralAP<false> I = IntegralAP<false>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+  uint32_t BitWidth = IntegralAP<false>::deserializeSize(*OpPC);
+  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+  assert(Result.bitWidth() == BitWidth);
+
+  IntegralAP<false>::deserialize(*OpPC, &Result);
+  OpPC += align(Result.bytesToSerialize());
+  return Result;
 }
 
 template <>
 inline IntegralAP<true> ReadArg<IntegralAP<true>>(InterpState &S,
                                                   CodePtr &OpPC) {
-  IntegralAP<true> I = IntegralAP<true>::deserialize(*OpPC);
-  OpPC += align(I.bytesToSerialize());
-  return I;
+  uint32_t BitWidth = IntegralAP<true>::deserializeSize(*OpPC);
+  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+  assert(Result.bitWidth() == BitWidth);
+
+  IntegralAP<true>::deserialize(*OpPC, &Result);
+  OpPC += align(Result.bytesToSerialize());
+  return Result;
 }
 
 template <>
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d01e3d042a8bf..5304bd77f2c06 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -57,6 +57,21 @@ static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) {
   assert(T);
 
   unsigned BitWidth = S.getASTContext().getTypeSize(QT);
+
+  if (T == PT_IntAPS) {
+    auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
+    Result.copy(Val);
+    S.Stk.push<IntegralAP<true>>(Result);
+    return;
+  }
+
+  if (T == PT_IntAP) {
+    auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
+    Result.copy(Val);
+    S.Stk.push<IntegralAP<false>>(Result);
+    return;
+  }
+
   if (QT->isSignedIntegerOrEnumerationType()) {
     int64_t V = Val.getSExtValue();
     INT_TYPE_SWITCH(*T, { S.Stk.push<T>(T::from(V, BitWidth)); });
@@ -327,13 +342,13 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result;
+  Floating Result = S.allocFloat(TargetSemantics);
   if (S.getASTContext().getTargetInfo().isNan2008()) {
     if (Signaling)
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
   } else {
     // Prior to IEEE 754-2008, architectures were allowed to choose whether
@@ -342,10 +357,10 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
     // 2008 revisions, MIPS interpreted sNaN-2008 as qNan and qNaN-2008 as
     // sNaN. This is now known as "legacy NaN" encoding.
     if (Signaling)
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result = Floating(
+      Result.copy(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
   }
 
@@ -360,7 +375,9 @@ static bool interp__builtin_inf(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  S.Stk.push<Floating>(Floating::getInf(TargetSemantics));
+  Floating Result = S.allocFloat(TargetSemantics);
+  Result.copy(APFloat::getInf(TargetSemantics));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -368,10 +385,12 @@ static bool interp__builtin_copysign(InterpState &S, CodePtr OpPC,
                                      const InterpFrame *Frame) {
   const Floating &Arg2 = S.Stk.pop<Floating>();
   const Floating &Arg1 = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(Arg1.getSemantics());
 
   APFloat Copy = Arg1.getAPFloat();
   Copy.copySign(Arg2.getAPFloat());
-  S.Stk.push<Floating>(Floating(Copy));
+  Result.copy(Copy);
+  S.Stk.push<Floating>(Result);
 
   return true;
 }
@@ -380,11 +399,13 @@ static bool interp__builtin_fmin(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    S.Stk.push<Floating>(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    S.Stk.push<Floating>(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -392,11 +413,13 @@ static bool interp__builtin_fmax(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
+  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    S.Stk.push<Floating>(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    S.Stk.push<Floating>(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    Result.copy(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
@@ -571,8 +594,16 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame) {
   const Floating &Val = S.Stk.pop<Floating>();
+  APFloat F = Val.getAPFloat();
+  if (!F.isNegative()) {
+    S.Stk.push<Floating>(Val);
+    return true;
+  }
 
-  S.Stk.push<Floating>(Floating::abs(Val));
+  Floating Result = S.allocFloat(Val.getSemantics());
+  F.changeSign();
+  Result.copy(F);
+  S.Stk.push<Floating>(Result);
   return true;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 239b3104e89f1..2569cac018b31 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -402,7 +402,9 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC,
           if (llvm::sys::IsBigEndianHost)
             swapBytes(M.get(), NumBits.roundToBytes());
 
-          P.deref<Floating>() = Floating::bitcastFromMemory(M.get(), Semantics);
+          Floating R = S.allocFloat(Semantics);
+          Floating::bitcastFromMemory(M.get(), Semantics, &R);
+          P.deref<Floating>() = R;
           P.initialize();
           return true;
         }
diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h
index e8dc6f0483d60..08765561985e2 100644
--- a/clang/lib/AST/ByteCode/InterpState.h
+++ b/clang/lib/AST/ByteCode/InterpState.h
@@ -15,6 +15,7 @@
 
 #include "Context.h"
 #include "DynamicAllocator.h"
+#include "Floating.h"
 #include "Function.h"
 #include "InterpFrame.h"
 #include "InterpStack.h"
@@ -126,6 +127,33 @@ class InterpState final : public State, public SourceMapper {
 
   StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const;
 
+  void *allocate(size_t Size, unsigned Align = 8) const {
+    return Allocator.Allocate(Size, Align);
+  }
+  template <typename T> T *allocate(size_t Num = 1) const {
+    return static_cast<T *>(allocate(Num * sizeof(T), alignof(T)));
+  }
+
+  template <typename T> T allocAP(unsigned BitWidth) {
+    unsigned NumWords = APInt::getNumWords(BitWidth);
+    if (NumWords == 1)
+      return T(BitWidth);
+    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
+    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
+    return T(Mem, BitWidth);
+  }
+
+  Floating allocFloat(const llvm::fltSemantics &Sem) {
+    if (Floating::singleWord(Sem))
+      return Floating(llvm::APFloatBase::SemanticsToEnum(Sem));
+
+    unsigned NumWords =
+        APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem));
+    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
+    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
+    return Floating(Mem, llvm::APFloatBase::SemanticsToEnum(Sem));
+  }
+
 private:
   friend class EvaluationResult;
   friend class InterpStateCCOverride;
@@ -161,6 +189,8 @@ class InterpState final : public State, public SourceMapper {
   llvm::SmallVector<
       std::pair<const Expr *, const LifetimeExtendedTemporaryDecl *>>
       SeenGlobalTemporaries;
+
+  mutable llvm::BumpPtrAllocator Allocator;
 };
 
 class InterpStateCCOverride final {
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index c76ac5f8ae868..57e01f7bd9da0 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -48,6 +48,7 @@ def ArgUint64 : ArgType { let Name = "uint64_t"; }
 def ArgIntAP : ArgType { let Name = "IntegralAP<false>"; let AsRef = true; }
 def ArgIntAPS : ArgType { let Name = "IntegralAP<true>"; let AsRef = true; }
 def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; }
+
 def ArgBool : ArgType { let Name = "bool"; }
 def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
 
@@ -88,6 +89,9 @@ def IntegerAndFixedTypeClass : TypeClass {
                Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint];
 }
 
+def IntegralTypeClass : TypeClass {
+  let Types = !listconcat(IntegerTypeClass.Types, [Bool]);
+}
 def FixedSizeIntegralTypeClass : TypeClass {
   let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
                Uint32, Sint64, Uint64, Bool];
@@ -265,12 +269,13 @@ def ConstSint32 : ConstOpcode<Sint32, ArgSint32>;
 def ConstUint32 : ConstOpcode<Uint32, ArgUint32>;
 def ConstSint64 : ConstOpcode<Sint64, ArgSint64>;
 def ConstUint64 : ConstOpcode<Uint64, ArgUint64>;
-def ConstFloat : ConstOpcode<Float, ArgFloat>;
-def constIntAP : ConstOpcode<IntAP, ArgIntAP>;
-def constIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
+def ConstIntAP : ConstOpcode<IntAP, ArgIntAP>;
+def ConstIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
 def ConstBool : ConstOpcode<Bool, ArgBool>;
 def ConstFixedPoint : ConstOpcode<FixedPoint, ArgFixedPoint>;
 
+def ConstFloat : Opcode { let Args = [ArgFloat]; }
+
 // [] -> [Integer]
 def Zero : Opcode {
   let Types = [FixedSizeIntegralTypeClass];
@@ -328,6 +333,7 @@ def GetMemberPtrBasePop : Opcode {
 
 def FinishInitPop : Opcode;
 def FinishInit    : Opcode;
+def FinishInitGlobal : Opcode;
 
 def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool, ArgTypePtr]; }
 
@@ -389,7 +395,7 @@ class AccessOpcode : Opcode {
 }
 
 class BitFieldOpcode : Opcode {
-  let Types = [AluTypeClass];
+  let Types = [IntegralTypeClass];
   let Args = [ArgRecordField];
   let HasGroup = 1;
 }
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index 6152fbfbe3a74..a156cccbb3c1b 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -76,6 +76,13 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
 }
 
 constexpr bool isIntegralType(PrimType T) { return T <= PT_FixedPoint; }
+template <typename T> constexpr bool needsAlloc() {
+  return std::is_same_v<T, IntegralAP<false>> ||
+         std::is_same_v<T, IntegralAP<true>> || std::is_same_v<T, Floating>;
+}
+constexpr bool needsAlloc(PrimType T) {
+  return T == PT_IntAP || T == PT_IntAPS || T == PT_Float;
+}
 
 /// Mapping from primitive types to their representation.
 template <PrimType T> struct PrimConv;
@@ -209,6 +216,16 @@ static inline bool aligned(const void *P) {
     }                                                                          \
   } while (0)
 
+#define TYPE_SWITCH_ALLOC(Expr, B)                                             \
+  do {                                                                         \
+    switch (Expr) {                                                            \
+      TYPE_SWITCH_CASE(PT_Float, B)                                            \
+      TYPE_SWITCH_CASE(PT_IntAP, B)                                            \
+      TYPE_SWITCH_CASE(PT_IntAPS, B)                                           \
+    default:;                                                                  \
+    }                                                                          \
+  } while (0)
+
 #define COMPOSITE_TYPE_SWITCH(Expr, B, D)                                      \
   do {                                                                         \
     switch (Expr) {                                                            \
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 23ba1bbd193b1..5d9c422447493 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -132,6 +132,14 @@ class Program final {
                                bool IsMutable = false, bool IsVolatile = false,
                                const Expr *Init = nullptr);
 
+  void *Allocate(size_t Size, unsigned Align = 8) const {
+    return Allocator.Allocate(Size, Align);
+  }
+  template <typename T> T *Allocate(size_t Num = 1) const {
+    return static_cast<T *>(Allocate(Num * sizeof(T), alignof(T)));
+  }
+  void Deallocate(void *Ptr) const {}
+
   /// Context to manage declaration lifetimes.
   class DeclScope {
   public:
@@ -204,7 +212,7 @@ class Program final {
   };
 
   /// Allocator for globals.
-  PoolAllocTy Allocator;
+  mutable PoolAllocTy Allocator;
 
   /// Global objects.
   std::vector<Global *> Globals;
@@ -238,4 +246,18 @@ class Program final {
 } // namespace interp
 } // namespace clang
 
+inline void *operator new(size_t Bytes, const clang::interp::Program &C,
+                          size_t Alignment = 8) {
+  return C.Allocate(Bytes, Alignment);
+}
+
+inline void operator delete(void *Ptr, const clang::interp::Program &C,
+                            size_t) {
+  C.Deallocate(Ptr);
+}
+inline void *operator new[](size_t Bytes, const clang::interp::Program &C,
+                            size_t Alignment = 8) {
+  return C.Allocate(Bytes, Alignment);
+}
+
 #endif
diff --git a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
index 710612bef8fd0..1013a771d13b4 100644
--- a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
+++ b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
@@ -21,6 +21,9 @@ template <class To, class From>
 constexpr To bit_cast(const From &from) {
   static_assert(sizeof(To) == sizeof(From));
   return __builtin_bit_cast(To, from);
+#if __x86_64
+  // both-note@-2 {{indeterminate value can only initialize an object of type}}
+#endif
 }
 
 template <class Intermediate, class Init>
@@ -38,11 +41,8 @@ constexpr Init round_trip(const Init &init) {
 
 namespace test_long_double {
 #if __x86_64
-/// FIXME: We could enable this, but since it aborts, it causes the usual mempory leak.
-#if 0
-constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // expected-error{{must be initialized by a constant expression}}\
-                                                                                 // expected-note{{in call}}
-#endif
+constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // both-error{{must be initialized by a constant expression}}\
+                                                                                 // both-note{{in call}}
 constexpr long double ld = 3.1425926539;
 
 struct bytes {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 21dca15a45775..174c1ffa79a43 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -208,7 +208,7 @@ namespace nan {
 
   constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}}
   constexpr float NaN4 = __builtin_nanf("");
-  //constexpr long double NaN5 = __builtin_nanf128("");
+  constexpr long double NaN5 = __builtin_nanf128("");
 
   /// FIXME: This should be accepted by the current interpreter as well.
   constexpr char f[] = {'0', 'x', 'A', 'E', '\0'};
@@ -655,8 +655,6 @@ void test_noexcept(int *i) {
 } // end namespace test_launder
 
 
-/// FIXME: The commented out tests here use a IntAP value and fail.
-/// This currently means we will leak the IntAP value since nothing cleans it up.
 namespace clz {
   char clz1[__builtin_clz(1) == BITSIZE(int) - 1 ? 1 : -1];
   char clz2[__builtin_clz(7) == BITSIZE(int) - 3 ? 1 : -1];
@@ -709,7 +707,7 @@ namespace clz {
   char clz48[__builtin_clzg(1ULL << (BITSIZE(long long) - 1)) == 0 ? 1 : -1];
   char clz49[__builtin_clzg(1ULL << (BITSIZE(long long) - 1), 42) == 0 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  // int clz50 = __builtin_clzg((unsigned __int128)0);
+  int clz50 = __builtin_clzg((unsigned __int128)0);
   char clz51[__builtin_clzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 : -1];
   char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
@@ -717,7 +715,7 @@ namespace clz {
   char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  // int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
+  int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
   char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
   char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -775,7 +773,7 @@ namespace ctz {
   char ctz46[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1)) == BITSIZE(long long) - 1 ? 1 : -1];
   char ctz47[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1), 42) == BITSIZE(long long) - 1 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  // int ctz48 = __builtin_ctzg((unsigned __int128)0);
+  int ctz48 = __builtin_ctzg((unsigned __int128)0);
   char ctz49[__builtin_ctzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char ctz50[__builtin_ctzg((unsigned __int128)0x1) == 0 ? 1 : -1];
   char ctz51[__builtin_ctzg((unsigned __int128)0x1, 42) == 0 ? 1 : -1];
@@ -785,7 +783,7 @@ namespace ctz {
   char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  // int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
+  int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
   char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
   char ctz59[__builtin_ctzg((unsigned _BitInt(128))0x1, 42) == 0 ? 1 : -1];

From 9ec75a50bc48c84c68430f113332769d23481ef5 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 17 Jun 2025 09:36:52 -0700
Subject: [PATCH 761/851] MIPS: Replace MipsMCExpr with MCSpecifierExpr

---
 .../Target/Mips/AsmParser/MipsAsmParser.cpp   | 58 ++++++++++---------
 .../Target/Mips/MCTargetDesc/CMakeLists.txt   |  1 -
 .../Mips/MCTargetDesc/MipsELFObjectWriter.cpp |  1 +
 .../Mips/MCTargetDesc/MipsMCAsmInfo.cpp       |  8 +++
 .../Target/Mips/MCTargetDesc/MipsMCAsmInfo.h  |  3 +-
 .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp   |  2 +-
 .../Target/Mips/MCTargetDesc/MipsMCExpr.cpp   | 39 -------------
 .../lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 36 ------------
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp  | 21 +++----
 llvm/lib/Target/Mips/MipsAsmPrinter.cpp       |  2 +-
 llvm/lib/Target/Mips/MipsMCInstLower.cpp      | 10 ++--
 llvm/lib/Target/Mips/MipsTargetObjectFile.cpp |  2 +-
 12 files changed, 58 insertions(+), 125 deletions(-)
 delete mode 100644 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
 delete mode 100644 llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h

diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 7ea7c58f1a512..071c016b92e7f 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -2965,9 +2965,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
         Res.getConstant() == 0 && !IsLocalSym) {
       if (UseXGOT) {
         const MCExpr *CallHiExpr =
-            MipsMCExpr::create(Mips::S_CALL_HI16, SymExpr, getContext());
+            MCSpecifierExpr::create(SymExpr, Mips::S_CALL_HI16, getContext());
         const MCExpr *CallLoExpr =
-            MipsMCExpr::create(Mips::S_CALL_LO16, SymExpr, getContext());
+            MCSpecifierExpr::create(SymExpr, Mips::S_CALL_LO16, getContext());
         TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(CallHiExpr), IDLoc,
                     STI);
         TOut.emitRRR(IsPtr64 ? Mips::DADDu : Mips::ADDu, DstReg, DstReg, GPReg,
@@ -2976,7 +2976,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
                      MCOperand::createExpr(CallLoExpr), IDLoc, STI);
       } else {
         const MCExpr *CallExpr =
-            MipsMCExpr::create(Mips::S_GOT_CALL, SymExpr, getContext());
+            MCSpecifierExpr::create(SymExpr, Mips::S_GOT_CALL, getContext());
         TOut.emitRRX(IsPtr64 ? Mips::LD : Mips::LW, DstReg, GPReg,
                      MCOperand::createExpr(CallExpr), IDLoc, STI);
       }
@@ -3009,9 +3009,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // this happens then the last instruction must use $rd as the result
       // register.
       const MCExpr *CallHiExpr =
-          MipsMCExpr::create(Mips::S_GOT_HI16, SymExpr, getContext());
-      const MCExpr *CallLoExpr =
-          MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_LO16, getContext());
+          MCSpecifierExpr::create(SymExpr, Mips::S_GOT_HI16, getContext());
+      const MCExpr *CallLoExpr = MCSpecifierExpr::create(
+          Res.getAddSym(), Mips::S_GOT_LO16, getContext());
 
       TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc,
                   STI);
@@ -3042,8 +3042,8 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // The daddiu's marked with a '>' may be omitted if they are redundant. If
       // this happens then the last instruction must use $rd as the result
       // register.
-      GotExpr =
-          MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT_DISP, getContext());
+      GotExpr = MCSpecifierExpr::create(Res.getAddSym(), Mips::S_GOT_DISP,
+                                        getContext());
       if (Res.getConstant() != 0) {
         // Symbols fully resolve with just the %got_disp(symbol) but we
         // must still account for any offset to the symbol for
@@ -3070,14 +3070,14 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       // this happens then the last instruction must use $rd as the result
       // register.
       if (IsLocalSym) {
-        GotExpr = MipsMCExpr::create(Mips::S_GOT, SymExpr, getContext());
-        LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
+        GotExpr = MCSpecifierExpr::create(SymExpr, Mips::S_GOT, getContext());
+        LoExpr = MCSpecifierExpr::create(SymExpr, Mips::S_LO, getContext());
       } else {
         // External symbols fully resolve the symbol with just the %got(symbol)
         // but we must still account for any offset to the symbol for
         // expressions like symbol+8.
         GotExpr =
-            MipsMCExpr::create(Res.getAddSym(), Mips::S_GOT, getContext());
+            MCSpecifierExpr::create(Res.getAddSym(), Mips::S_GOT, getContext());
         if (Res.getConstant() != 0)
           LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
       }
@@ -3097,8 +3097,10 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     return false;
   }
 
-  const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, SymExpr, getContext());
-  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, SymExpr, getContext());
+  const auto *HiExpr =
+      MCSpecifierExpr::create(SymExpr, Mips::S_HI, getContext());
+  const auto *LoExpr =
+      MCSpecifierExpr::create(SymExpr, Mips::S_LO, getContext());
 
   // This is the 64-bit symbol address expansion.
   if (ABI.ArePtrs64bit() && isGP64bit()) {
@@ -3110,9 +3112,9 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
     // source register.
 
     const auto *HighestExpr =
-        MipsMCExpr::create(Mips::S_HIGHEST, SymExpr, getContext());
+        MCSpecifierExpr::create(SymExpr, Mips::S_HIGHEST, getContext());
     const auto *HigherExpr =
-        MipsMCExpr::create(Mips::S_HIGHER, SymExpr, getContext());
+        MCSpecifierExpr::create(SymExpr, Mips::S_HIGHER, getContext());
 
     bool RdRegIsRsReg =
         UseSrcReg &&
@@ -3310,7 +3312,8 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
 
   if(IsPicEnabled) {
     const MCExpr *GotSym = MCSymbolRefExpr::create(Sym, getContext());
-    const auto *GotExpr = MipsMCExpr::create(Mips::S_GOT, GotSym, getContext());
+    const auto *GotExpr =
+        MCSpecifierExpr::create(GotSym, Mips::S_GOT, getContext());
 
     if(isABI_O32() || isABI_N32()) {
       TOut.emitRRX(Mips::LW, ATReg, GPReg, MCOperand::createExpr(GotExpr),
@@ -3321,7 +3324,8 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
     }
   } else { //!IsPicEnabled
     const MCExpr *HiSym = MCSymbolRefExpr::create(Sym, getContext());
-    const auto *HiExpr = MipsMCExpr::create(Mips::S_HI, HiSym, getContext());
+    const auto *HiExpr =
+        MCSpecifierExpr::create(HiSym, Mips::S_HI, getContext());
 
     // FIXME: This is technically correct but gives a different result to gas,
     // but gas is incomplete there (it has a fixme noting it doesn't work with
@@ -3334,10 +3338,10 @@ bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
     } else { //isABI_N64()
       const MCExpr *HighestSym = MCSymbolRefExpr::create(Sym, getContext());
       const auto *HighestExpr =
-          MipsMCExpr::create(Mips::S_HIGHEST, HighestSym, getContext());
+          MCSpecifierExpr::create(HighestSym, Mips::S_HIGHEST, getContext());
       const MCExpr *HigherSym = MCSymbolRefExpr::create(Sym, getContext());
       const auto *HigherExpr =
-          MipsMCExpr::create(Mips::S_HIGHER, HigherSym, getContext());
+          MCSpecifierExpr::create(HigherSym, Mips::S_HIGHER, getContext());
 
       TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
                   STI);
@@ -3424,7 +3428,7 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3474,7 +3478,7 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3554,7 +3558,7 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
 
   MCSymbol *Sym = getContext().createTempSymbol();
   const MCExpr *LoSym = MCSymbolRefExpr::create(Sym, getContext());
-  const auto *LoExpr = MipsMCExpr::create(Mips::S_LO, LoSym, getContext());
+  const auto *LoExpr = MCSpecifierExpr::create(LoSym, Mips::S_LO, getContext());
 
   getStreamer().switchSection(ReadOnlySection);
   getStreamer().emitLabel(Sym, IDLoc);
@@ -3777,15 +3781,15 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
       //                  sw  $8,  %lo(sym)($at)
       const MCExpr *OffExpr = OffsetOp.getExpr();
       MCOperand LoOperand = MCOperand::createExpr(
-          MipsMCExpr::create(Mips::S_LO, OffExpr, getContext()));
+          MCSpecifierExpr::create(OffExpr, Mips::S_LO, getContext()));
       MCOperand HiOperand = MCOperand::createExpr(
-          MipsMCExpr::create(Mips::S_HI, OffExpr, getContext()));
+          MCSpecifierExpr::create(OffExpr, Mips::S_HI, getContext()));
 
       if (ABI.IsN64()) {
         MCOperand HighestOperand = MCOperand::createExpr(
-            MipsMCExpr::create(Mips::S_HIGHEST, OffExpr, getContext()));
+            MCSpecifierExpr::create(OffExpr, Mips::S_HIGHEST, getContext()));
         MCOperand HigherOperand = MCOperand::createExpr(
-            MipsMCExpr::create(Mips::S_HIGHER, OffExpr, getContext()));
+            MCSpecifierExpr::create(OffExpr, Mips::S_HIGHER, getContext()));
 
         TOut.emitRX(Mips::LUi, TmpReg, HighestOperand, IDLoc, STI);
         TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, HigherOperand, IDLoc, STI);
@@ -6394,7 +6398,7 @@ const MCExpr *MipsAsmParser::parseRelocExpr() {
   while (Ops.size()) {
     if (Parser.parseToken(AsmToken::RParen, "expected ')'"))
       return nullptr;
-    Res = MipsMCExpr::create(Ops.pop_back_val(), Res, getContext());
+    Res = MCSpecifierExpr::create(Res, Ops.pop_back_val(), getContext());
   }
   return Res;
 }
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index d3f16e5042c3a..8b73a7bdd4bc1 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -7,7 +7,6 @@ add_llvm_component_library(LLVMMipsDesc
   MipsInstPrinter.cpp
   MipsMCAsmInfo.cpp
   MipsMCCodeEmitter.cpp
-  MipsMCExpr.cpp
   MipsMCTargetDesc.cpp
   MipsNaClELFStreamer.cpp
   MipsOptionRecord.cpp
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 58aa374e5302d..25e31941bbb45 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -16,6 +16,7 @@
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index b64f86f382974..0941d93fe0eb6 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "MipsMCAsmInfo.h"
 #include "MipsABIInfo.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/TargetParser/Triple.h"
 
@@ -59,6 +60,13 @@ MipsCOFFMCAsmInfo::MipsCOFFMCAsmInfo() {
   AllowAtInName = true;
 }
 
+const MCSpecifierExpr *Mips::createGpOff(const MCExpr *Expr, Mips::Specifier S,
+                                         MCContext &Ctx) {
+  Expr = MCSpecifierExpr::create(Expr, Mips::S_GPREL, Ctx);
+  Expr = MCSpecifierExpr::create(Expr, Mips::S_NEG, Ctx);
+  return MCSpecifierExpr::create(Expr, S, Ctx);
+}
+
 static void printImpl(const MCAsmInfo &MAI, raw_ostream &OS,
                       const MCSpecifierExpr &Expr) {
   int64_t AbsVal;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 0975116328fc1..6ba90a5c20257 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -13,7 +13,6 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCASMINFO_H
 
-#include "MCTargetDesc/MipsMCExpr.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoELF.h"
 #include "llvm/MC/MCFixup.h"
@@ -77,6 +76,8 @@ enum {
 };
 
 bool isGpOff(const MCSpecifierExpr &E);
+const MCSpecifierExpr *createGpOff(const MCExpr *Expr, Specifier S,
+                                   MCContext &Ctx);
 }
 
 } // namespace llvm
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index d2981c4ad4d20..35d4e0db35c31 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -581,7 +581,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
                const MCSubtargetInfo &STI) const {
   MCExpr::ExprKind Kind = Expr->getKind();
   if (Kind == MCExpr::Specifier) {
-    const MipsMCExpr *MipsExpr = cast<MipsMCExpr>(Expr);
+    const auto *MipsExpr = cast<MCSpecifierExpr>(Expr);
 
     Mips::Fixups FixupKind = Mips::Fixups(0);
     switch (MipsExpr->getSpecifier()) {
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
deleted file mode 100644
index 821f662f0cbfb..0000000000000
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===-- MipsMCExpr.cpp - Mips specific MC expression classes --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "MipsMCExpr.h"
-#include "MCTargetDesc/MipsMCAsmInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mipsmcexpr"
-
-const MipsMCExpr *MipsMCExpr::create(MipsMCExpr::Specifier S,
-                                     const MCExpr *Expr, MCContext &Ctx) {
-  return new (Ctx) MipsMCExpr(Expr, S);
-}
-
-const MipsMCExpr *MipsMCExpr::create(const MCSymbol *Sym, Specifier S,
-                                     MCContext &Ctx) {
-  return new (Ctx) MipsMCExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
-}
-
-const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
-                                          const MCExpr *Expr, MCContext &Ctx) {
-  return create(S, create(Mips::S_NEG, create(Mips::S_GPREL, Expr, Ctx), Ctx),
-                Ctx);
-}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
deleted file mode 100644
index b78aeabb57992..0000000000000
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===- MipsMCExpr.h - Mips specific MC expression classes -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
-#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCValue.h"
-
-namespace llvm {
-
-class MipsMCExpr : public MCSpecifierExpr {
-public:
-  using Specifier = Spec;
-
-private:
-  explicit MipsMCExpr(const MCExpr *Expr, Specifier S)
-      : MCSpecifierExpr(Expr, S) {}
-
-public:
-  static const MipsMCExpr *create(Specifier S, const MCExpr *Expr,
-                                  MCContext &Ctx);
-  static const MipsMCExpr *create(const MCSymbol *Sym, Specifier S,
-                                  MCContext &Ctx);
-  static const MipsMCExpr *createGpOff(Specifier S, const MCExpr *Expr,
-                                       MCContext &Ctx);
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 80a854c799014..6097ad8017846 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -16,7 +16,6 @@
 #include "MipsBaseInfo.h"
 #include "MipsELFStreamer.h"
 #include "MipsInstPrinter.h"
-#include "MipsMCExpr.h"
 #include "MipsMCTargetDesc.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -1266,9 +1265,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   MCInst TmpInst;
   TmpInst.setOpcode(Mips::LUi);
   TmpInst.addOperand(MCOperand::createReg(GPReg));
-  const MCExpr *HiSym = MipsMCExpr::create(
-      Mips::S_HI, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
-      MCA.getContext());
+  auto *HiSym = MCSpecifierExpr::create(GP_Disp, Mips::S_HI, MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(HiSym));
   getStreamer().emitInstruction(TmpInst, STI);
 
@@ -1277,9 +1274,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
   TmpInst.setOpcode(Mips::ADDiu);
   TmpInst.addOperand(MCOperand::createReg(GPReg));
   TmpInst.addOperand(MCOperand::createReg(GPReg));
-  const MCExpr *LoSym = MipsMCExpr::create(
-      Mips::S_LO, MCSymbolRefExpr::create(GP_Disp, MCA.getContext()),
-      MCA.getContext());
+  auto *LoSym = MCSpecifierExpr::create(GP_Disp, Mips::S_LO, MCA.getContext());
   TmpInst.addOperand(MCOperand::createExpr(LoSym));
   getStreamer().emitInstruction(TmpInst, STI);
 
@@ -1342,12 +1337,12 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
     emitRRI(Mips::SD, GPReg, Mips::SP, RegOrOffset, SMLoc(), &STI);
   }
 
-  const MipsMCExpr *HiExpr = MipsMCExpr::createGpOff(
-      Mips::S_HI, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
-      MCA.getContext());
-  const MipsMCExpr *LoExpr = MipsMCExpr::createGpOff(
-      Mips::S_LO, MCSymbolRefExpr::create(&Sym, MCA.getContext()),
-      MCA.getContext());
+  auto *HiExpr =
+      Mips::createGpOff(MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+                        Mips::S_HI, MCA.getContext());
+  auto *LoExpr =
+      Mips::createGpOff(MCSymbolRefExpr::create(&Sym, MCA.getContext()),
+                        Mips::S_LO, MCA.getContext());
 
   // lui $gp, %hi(%neg(%gp_rel(funcSym)))
   emitRX(Mips::LUi, GPReg, MCOperand::createExpr(HiExpr), SMLoc(), &STI);
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index da3f7cb55b301..a6300a9c11d49 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -1244,7 +1244,7 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
 // Emit .dtprelword or .dtpreldword directive
 // and value for debug thread local expression.
 void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const {
-  if (auto *MipsExpr = dyn_cast<MipsMCExpr>(Value)) {
+  if (auto *MipsExpr = dyn_cast<MCSpecifierExpr>(Value)) {
     if (MipsExpr && MipsExpr->getSpecifier() == Mips::S_DTPREL) {
       switch (Size) {
       case 4:
diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
index 935fcd8fa7154..cdf58384427f2 100644
--- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp
@@ -175,9 +175,9 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   }
 
   if (IsGpOff)
-    Expr = MipsMCExpr::createGpOff(TargetKind, Expr, *Ctx);
+    Expr = Mips::createGpOff(Expr, TargetKind, *Ctx);
   else if (TargetKind != Mips::S_None)
-    Expr = MipsMCExpr::create(TargetKind, Expr, *Ctx);
+    Expr = MCSpecifierExpr::create(Expr, TargetKind, *Ctx);
 
   return MCOperand::createExpr(Expr);
 }
@@ -216,7 +216,7 @@ MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
   const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::create(BB2->getSymbol(), *Ctx);
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Sym1, Sym2, *Ctx);
 
-  return MCOperand::createExpr(MipsMCExpr::create(Kind, Sub, *Ctx));
+  return MCOperand::createExpr(MCSpecifierExpr::create(Sub, Kind, *Ctx));
 }
 
 void MipsMCInstLower::
@@ -248,7 +248,7 @@ lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
   if (MI->getNumOperands() == 2) {
     const MCExpr *Expr =
         MCSymbolRefExpr::create(MI->getOperand(1).getMBB()->getSymbol(), *Ctx);
-    const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
+    const auto *MipsExpr = MCSpecifierExpr::create(Expr, Spec, *Ctx);
     OutMI.addOperand(MCOperand::createExpr(MipsExpr));
   } else if (MI->getNumOperands() == 3) {
     // Create %hi($tgt-$baltgt).
@@ -290,7 +290,7 @@ void MipsMCInstLower::lowerLongBranchADDiu(const MachineInstr *MI,
     // Lower register operand.
     const MCExpr *Expr =
         MCSymbolRefExpr::create(MI->getOperand(2).getMBB()->getSymbol(), *Ctx);
-    const auto *MipsExpr = MipsMCExpr::create(Spec, Expr, *Ctx);
+    const auto *MipsExpr = MCSpecifierExpr::create(Expr, Spec, *Ctx);
     OutMI.addOperand(MCOperand::createExpr(MipsExpr));
   } else if (MI->getNumOperands() == 4) {
     // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
diff --git a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
index 23aa699318a2e..78a9f3b7cc71b 100644
--- a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -189,5 +189,5 @@ MipsTargetObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
   const MCExpr *Expr = MCSymbolRefExpr::create(Sym, getContext());
   Expr = MCBinaryExpr::createAdd(
       Expr, MCConstantExpr::create(0x8000, getContext()), getContext());
-  return MipsMCExpr::create(Mips::S_DTPREL, Expr, getContext());
+  return MCSpecifierExpr::create(Expr, Mips::S_DTPREL, getContext());
 }

From 382e3fdbb476a5d5771b315daedcd05a15883fbc Mon Sep 17 00:00:00 2001
From: Michael Buch <michaelbuch12@gmail.com>
Date: Tue, 17 Jun 2025 17:37:27 +0100
Subject: [PATCH 762/851] [lldb][Formatter] Get element type for unordered_maps
 from __hash_table::value_type (#144517)

https://github.com/llvm/llvm-project/pull/143501 changes usage of
`__hash_value_type` in libcxx to an empty tag type. This type will no
longer have a definition in DWARF. Currently the LLDB unordered_map
formatter deduces the map's `element_type` by looking at the `__cc_`
member of `__hash_value_type`. But that will no longer work because we
only have its forward declaration. Since what we're really after is the
type that `__hash_value_type` is wrapping, we can just look at the
`__hash_table::value_type` typedef. With
https://github.com/llvm/llvm-project/pull/143501 that will now point to
the `std::pair` element type (which used to be what we got from
`__cc_`).

TBD: need to double-check this works for older layouts. Quick glance at
the code makes me suspicious of cases like `unordered_map<std::pair<int,
int>, int>`
---
 .../Language/CPlusPlus/LibCxxUnorderedMap.cpp  | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
index 642723dd91132..ffc33395830bb 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
@@ -99,14 +99,20 @@ static bool isUnorderedMap(ConstString type_name) {
 
 CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
     GetElementType(CompilerType table_type) {
-  auto element_type = table_type.GetTypedefedType().GetTypeTemplateArgument(0);
+  auto element_type =
+      table_type.GetDirectNestedTypeWithName("value_type").GetTypedefedType();
+
+  // In newer unordered_map layouts, the std::pair element type isn't wrapped
+  // in any helper types. So return it directly.
+  if (isStdTemplate(element_type.GetTypeName(), "pair"))
+    return element_type;
 
   // This synthetic provider is used for both unordered_(multi)map and
-  // unordered_(multi)set. For unordered_map, the element type has an
-  // additional type layer, an internal struct (`__hash_value_type`)
-  // that wraps a std::pair. Peel away the internal wrapper type - whose
-  // structure is of no value to users, to expose the std::pair. This
-  // matches the structure returned by the std::map synthetic provider.
+  // unordered_(multi)set. For older unordered_map layouts, the element type has
+  // an additional type layer, an internal struct (`__hash_value_type`) that
+  // wraps a std::pair. Peel away the internal wrapper type - whose structure is
+  // of no value to users, to expose the std::pair. This matches the structure
+  // returned by the std::map synthetic provider.
   if (isUnorderedMap(
           m_backend.GetCompilerType().GetCanonicalType().GetTypeName())) {
     std::string name;

From 4e884dd993e040f7ccd83ecdc3c4570d23a42ee6 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i@maskray.me>
Date: Tue, 17 Jun 2025 09:42:53 -0700
Subject: [PATCH 763/851] SPARC: Remove SparcMCExpr.h

---
 .../Target/Sparc/AsmParser/SparcAsmParser.cpp | 11 ++++---
 .../MCTargetDesc/SparcELFObjectWriter.cpp     |  4 +--
 .../Sparc/MCTargetDesc/SparcMCAsmInfo.cpp     |  1 -
 .../Sparc/MCTargetDesc/SparcMCAsmInfo.h       |  5 +++
 .../Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp |  8 ++---
 .../Target/Sparc/MCTargetDesc/SparcMCExpr.cpp |  2 +-
 .../Target/Sparc/MCTargetDesc/SparcMCExpr.h   | 32 -------------------
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp     |  6 ++--
 llvm/lib/Target/Sparc/SparcISelLowering.cpp   |  2 +-
 .../Target/Sparc/SparcTargetObjectFile.cpp    |  2 +-
 10 files changed, 23 insertions(+), 50 deletions(-)
 delete mode 100644 llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h

diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 90aacacd8ed2d..28ae349031669 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "TargetInfo/SparcTargetInfo.h"
 #include "llvm/ADT/SmallVector.h"
@@ -109,7 +109,8 @@ class SparcAsmParser : public MCTargetAsmParser {
   ParseStatus parseExpression(int64_t &Val);
 
   // Helper function for dealing with %lo / %hi in PIC mode.
-  const SparcMCExpr *adjustPICRelocation(uint16_t VK, const MCExpr *subExpr);
+  const MCSpecifierExpr *adjustPICRelocation(uint16_t VK,
+                                             const MCExpr *subExpr);
 
   // Helper function to see if current token can start an expression.
   bool isPossibleExpression(const AsmToken &Token);
@@ -1642,7 +1643,7 @@ MCRegister SparcAsmParser::matchRegisterName(const AsmToken &Tok,
 static bool hasGOTReference(const MCExpr *Expr) {
   switch (Expr->getKind()) {
   case MCExpr::Target:
-    if (const SparcMCExpr *SE = dyn_cast<SparcMCExpr>(Expr))
+    if (const MCSpecifierExpr *SE = dyn_cast<MCSpecifierExpr>(Expr))
       return hasGOTReference(SE->getSubExpr());
     break;
 
@@ -1668,8 +1669,8 @@ static bool hasGOTReference(const MCExpr *Expr) {
   return false;
 }
 
-const SparcMCExpr *SparcAsmParser::adjustPICRelocation(uint16_t RelType,
-                                                       const MCExpr *subExpr) {
+const MCSpecifierExpr *
+SparcAsmParser::adjustPICRelocation(uint16_t RelType, const MCExpr *subExpr) {
   // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently.
   // If the expression refers contains _GLOBAL_OFFSET_TABLE, it is
   // actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
index bef7f3c02dae3..2a581d381d4ab 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SparcFixupKinds.h"
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -72,7 +72,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup,
   if (mc::isRelocation(Fixup.getKind()))
     return Kind;
 
-  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Fixup.getValue())) {
+  if (const auto *SExpr = dyn_cast<MCSpecifierExpr>(Fixup.getValue())) {
     if (SExpr->getSpecifier() == ELF::R_SPARC_DISP32)
       return ELF::R_SPARC_DISP32;
   }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 800567bf58ffa..36365593e2460 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "SparcMCAsmInfo.h"
-#include "SparcMCExpr.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCExpr.h"
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index 7ea800f119174..a4a2fa3f9933e 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -36,6 +36,11 @@ class SparcELFMCAsmInfo : public MCAsmInfoELF {
                           const MCSpecifierExpr &Expr) const override;
 };
 
+namespace Sparc {
+uint16_t parseSpecifier(StringRef name);
+StringRef getSpecifierName(uint16_t S);
+} // namespace Sparc
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 4ce9bea5d7958..8ba99719946a2 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -11,7 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SparcFixupKinds.h"
-#include "SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "SparcMCTargetDesc.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
@@ -134,7 +134,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
 
   assert(MO.isExpr());
   const MCExpr *Expr = MO.getExpr();
-  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+  if (auto *SExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
     Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
@@ -164,7 +164,7 @@ unsigned SparcMCCodeEmitter::getSImm5OpValue(const MCInst &MI, unsigned OpNo,
   if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
     return CE->getValue();
 
-  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+  if (auto *SExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
     Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
@@ -190,7 +190,7 @@ SparcMCCodeEmitter::getSImm13OpValue(const MCInst &MI, unsigned OpNo,
   if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
     return CE->getValue();
 
-  if (const SparcMCExpr *SExpr = dyn_cast<SparcMCExpr>(Expr)) {
+  if (auto *SExpr = dyn_cast<MCSpecifierExpr>(Expr)) {
     Fixups.push_back(MCFixup::create(0, Expr, SExpr->getSpecifier()));
     return 0;
   }
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index 6d43b93713906..1ee6e80985605 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
deleted file mode 100644
index 8e7c173c70ccb..0000000000000
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//====- SparcMCExpr.h - Sparc specific MC expression classes --*- C++ -*-=====//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes Sparc-specific MCExprs, used for modifiers like
-// "%hi" or "%lo" etc.,
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
-#define LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCEXPR_H
-
-#include "SparcFixupKinds.h"
-#include "llvm/MC/MCExpr.h"
-
-namespace llvm {
-
-class StringRef;
-using SparcMCExpr = MCSpecifierExpr;
-
-namespace Sparc {
-uint16_t parseSpecifier(StringRef name);
-StringRef getSpecifierName(uint16_t S);
-} // namespace Sparc
-
-} // end namespace llvm.
-
-#endif
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index f4201f9a8dc1a..5366e905d6df0 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SparcInstPrinter.h"
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "MCTargetDesc/SparcTargetStreamer.h"
 #include "Sparc.h"
@@ -82,7 +82,7 @@ class SparcAsmPrinter : public AsmPrinter {
 static MCOperand createSparcMCOperand(uint16_t Kind, MCSymbol *Sym,
                                       MCContext &OutContext) {
   const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
-  const SparcMCExpr *expr = MCSpecifierExpr::create(MCSym, Kind, OutContext);
+  auto *expr = MCSpecifierExpr::create(MCSym, Kind, OutContext);
   return MCOperand::createExpr(expr);
 }
 static MCOperand createPCXCallOP(MCSymbol *Label,
@@ -101,7 +101,7 @@ static MCOperand createPCXRelExprOp(uint16_t Spec, MCSymbol *GOTLabel,
 
   const MCBinaryExpr *Sub = MCBinaryExpr::createSub(Cur, Start, OutContext);
   const MCBinaryExpr *Add = MCBinaryExpr::createAdd(GOT, Sub, OutContext);
-  const SparcMCExpr *expr = MCSpecifierExpr::create(Add, Spec, OutContext);
+  auto *expr = MCSpecifierExpr::create(Add, Spec, OutContext);
   return MCOperand::createExpr(expr);
 }
 
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index a6ea079746095..21ecf3d5ed70e 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SparcISelLowering.h"
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
 #include "SparcMachineFunctionInfo.h"
 #include "SparcRegisterInfo.h"
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index a42a67d91d848..711bf9b31a377 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SparcTargetObjectFile.h"
-#include "MCTargetDesc/SparcMCExpr.h"
+#include "MCTargetDesc/SparcMCAsmInfo.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"

From 0c608175c11cf0ce797be7575a7c8d8ebcdecbd8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 17 Jun 2025 17:43:04 +0100
Subject: [PATCH 764/851] [X86] matchLogicBlend - convert to SDPatternMatch
 matching. NFC. (#144546)

Removes a LOT of commutative matching.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 36 +++++--------------------
 1 file changed, 7 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4cff42c2ac464..7f425b3d479dd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52084,36 +52084,14 @@ static SDValue canonicalizeBitSelect(SDNode *N, const SDLoc &DL,
   return DAG.getNode(ISD::OR, DL, VT, X, Y);
 }
 
-// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
+// Try to match OR(ANDNP(MASK,X),AND(MASK,Y)) logic pattern.
+// TODO: Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
+// Waiting for ANDNP combine allows other combines to happen that prevent
+// matching.
 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
-  if (N->getOpcode() != ISD::OR)
-    return false;
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // Canonicalize AND to LHS.
-  if (N1.getOpcode() == ISD::AND)
-    std::swap(N0, N1);
-
-  // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
-  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
-    return false;
-
-  Mask = N1.getOperand(0);
-  X = N1.getOperand(1);
-
-  // Check to see if the mask appeared in both the AND and ANDNP.
-  if (N0.getOperand(0) == Mask)
-    Y = N0.getOperand(1);
-  else if (N0.getOperand(1) == Mask)
-    Y = N0.getOperand(0);
-  else
-    return false;
-
-  // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
-  // ANDNP combine allows other combines to happen that prevent matching.
-  return true;
+  using namespace SDPatternMatch;
+  return sd_match(N, m_Or(m_BinOp(X86ISD::ANDNP, m_Value(Mask), m_Value(X)),
+                          m_And(m_Deferred(Mask), m_Value(Y))));
 }
 
 // Try to fold:

From b14e03d8555043bc35e9c75fff7f52d28950b3ab Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Tue, 17 Jun 2025 18:44:37 +0200
Subject: [PATCH 765/851] [LLDB] Consolidate C++ string buffer summaries
 (#144258)

As part of https://github.com/llvm/llvm-project/pull/143177, I moved the
non-libc++ specific formatting of `std::string`s out to `CxxStringTypes`
as MSVC's STL `std::string` can also be thought of a pointer+size pair.
I named this kind of string "string buffer".

This PR picks that change, so the MSVC PR can be smaller.
Unfortunately, libstdc++'s `std::string` does not fit this (it also uses
a different string printer function).

This resolves two FIXMEs in the libc++ tests, where empty u16 and u32
strings didn't have any prefix (u/U).
---
 .../Language/CPlusPlus/CxxStringTypes.cpp     | 102 +++++++++---
 .../Language/CPlusPlus/CxxStringTypes.h       |  29 ++++
 .../Plugins/Language/CPlusPlus/LibCxx.cpp     | 147 ++++--------------
 .../string/TestDataFormatterLibcxxString.py   |   8 +-
 .../TestDataFormatterLibcxxStringView.py      |   8 +-
 5 files changed, 148 insertions(+), 146 deletions(-)

diff --git a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp
index fc17b76804d9f..bf8c393445908 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.cpp
@@ -116,15 +116,7 @@ bool lldb_private::formatters::WCharStringSummaryProvider(
     return false;
 
   // Get a wchar_t basic type from the current type system
-  CompilerType wchar_compiler_type =
-      valobj.GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeWChar);
-
-  if (!wchar_compiler_type)
-    return false;
-
-  // Safe to pass nullptr for exe_scope here.
-  std::optional<uint64_t> size =
-      llvm::expectedToOptional(wchar_compiler_type.GetBitSize(nullptr));
+  std::optional<uint64_t> size = GetWCharByteSize(valobj);
   if (!size)
     return false;
   const uint32_t wchar_size = *size;
@@ -136,13 +128,13 @@ bool lldb_private::formatters::WCharStringSummaryProvider(
   options.SetPrefixToken("L");
 
   switch (wchar_size) {
-  case 8:
+  case 1:
     return StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF8>(
         options);
-  case 16:
+  case 2:
     return StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF16>(
         options);
-  case 32:
+  case 4:
     return StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF32>(
         options);
   default:
@@ -177,15 +169,7 @@ bool lldb_private::formatters::WCharSummaryProvider(
     return false;
 
   // Get a wchar_t basic type from the current type system
-  CompilerType wchar_compiler_type =
-      valobj.GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeWChar);
-
-  if (!wchar_compiler_type)
-    return false;
-
-    // Safe to pass nullptr for exe_scope here.
-  std::optional<uint64_t> size =
-      llvm::expectedToOptional(wchar_compiler_type.GetBitSize(nullptr));
+  std::optional<uint64_t> size = GetWCharByteSize(valobj);
   if (!size)
     return false;
   const uint32_t wchar_size = *size;
@@ -199,13 +183,13 @@ bool lldb_private::formatters::WCharSummaryProvider(
   options.SetBinaryZeroIsTerminator(false);
 
   switch (wchar_size) {
-  case 8:
+  case 1:
     return StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF8>(
         options);
-  case 16:
+  case 2:
     return StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF16>(
         options);
-  case 32:
+  case 4:
     return StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF32>(
         options);
   default:
@@ -214,3 +198,73 @@ bool lldb_private::formatters::WCharSummaryProvider(
   }
   return true;
 }
+
+std::optional<uint64_t>
+lldb_private::formatters::GetWCharByteSize(ValueObject &valobj) {
+  return llvm::expectedToOptional(
+      valobj.GetCompilerType()
+          .GetBasicTypeFromAST(lldb::eBasicTypeWChar)
+          .GetByteSize(nullptr));
+}
+
+template <StringPrinter::StringElementType element_type>
+bool lldb_private::formatters::StringBufferSummaryProvider(
+    Stream &stream, const TypeSummaryOptions &summary_options,
+    lldb::ValueObjectSP location_sp, uint64_t size, std::string prefix_token) {
+
+  if (size == 0) {
+    stream.PutCString(prefix_token);
+    stream.PutCString("\"\"");
+    return true;
+  }
+
+  if (!location_sp)
+    return false;
+
+  StringPrinter::ReadBufferAndDumpToStreamOptions options(*location_sp);
+
+  if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) {
+    const auto max_size =
+        location_sp->GetTargetSP()->GetMaximumSizeOfStringSummary();
+    if (size > max_size) {
+      size = max_size;
+      options.SetIsTruncated(true);
+    }
+  }
+
+  {
+    DataExtractor extractor;
+    const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size);
+    if (bytes_read < size)
+      return false;
+
+    options.SetData(std::move(extractor));
+  }
+  options.SetStream(&stream);
+  if (prefix_token.empty())
+    options.SetPrefixToken(nullptr);
+  else
+    options.SetPrefixToken(prefix_token);
+  options.SetQuote('"');
+  options.SetSourceSize(size);
+  options.SetBinaryZeroIsTerminator(false);
+  return StringPrinter::ReadBufferAndDumpToStream<element_type>(options);
+}
+
+// explicit instantiations for all string element types
+template bool
+lldb_private::formatters::StringBufferSummaryProvider<StringElementType::ASCII>(
+    Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t,
+    std::string);
+template bool
+lldb_private::formatters::StringBufferSummaryProvider<StringElementType::UTF8>(
+    Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t,
+    std::string);
+template bool
+lldb_private::formatters::StringBufferSummaryProvider<StringElementType::UTF16>(
+    Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t,
+    std::string);
+template bool
+lldb_private::formatters::StringBufferSummaryProvider<StringElementType::UTF32>(
+    Stream &, const TypeSummaryOptions &, lldb::ValueObjectSP, uint64_t,
+    std::string);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h
index a2b606d28cac1..337dcf2fefdcf 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/CxxStringTypes.h
@@ -10,6 +10,7 @@
 #ifndef LLDB_SOURCE_PLUGINS_LANGUAGE_CPLUSPLUS_CXXSTRINGTYPES_H
 #define LLDB_SOURCE_PLUGINS_LANGUAGE_CPLUSPLUS_CXXSTRINGTYPES_H
 
+#include "lldb/DataFormatters/StringPrinter.h"
 #include "lldb/DataFormatters/TypeSummary.h"
 #include "lldb/Utility/Stream.h"
 #include "lldb/ValueObject/ValueObject.h"
@@ -43,6 +44,34 @@ bool Char32SummaryProvider(ValueObject &valobj, Stream &stream,
 bool WCharSummaryProvider(ValueObject &valobj, Stream &stream,
                           const TypeSummaryOptions &options); // wchar_t
 
+std::optional<uint64_t> GetWCharByteSize(ValueObject &valobj);
+
+/// Print a summary for a string buffer to \a stream.
+///
+/// \param[in] stream
+///     The output stream to print the summary to.
+///
+/// \param[in] summary_options
+///     Options for printing the string contents. This function respects the
+///     capping.
+///
+/// \param[in] location_sp
+///     ValueObject of a pointer to the string being printed.
+///
+/// \param[in] size
+///     The size of the buffer pointed to by \a location_sp.
+///
+/// \param[in] prefix_token
+///     A prefix before the double quotes (e.g. 'u' results in u"...").
+///
+/// \return
+///     Returns whether the string buffer was successfully printed.
+template <StringPrinter::StringElementType element_type>
+bool StringBufferSummaryProvider(Stream &stream,
+                                 const TypeSummaryOptions &summary_options,
+                                 lldb::ValueObjectSP location_sp, uint64_t size,
+                                 std::string prefix_token);
+
 } // namespace formatters
 } // namespace lldb_private
 
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
index 358cf7d78fa21..7143089209dd3 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
@@ -24,6 +24,7 @@
 #include "lldb/ValueObject/ValueObject.h"
 #include "lldb/ValueObject/ValueObjectConstResult.h"
 
+#include "Plugins/Language/CPlusPlus/CxxStringTypes.h"
 #include "Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.h"
 #include "Plugins/TypeSystem/Clang/TypeSystemClang.h"
 #include "lldb/lldb-enumerations.h"
@@ -535,70 +536,6 @@ ExtractLibcxxStringInfo(ValueObject &valobj) {
   return std::make_pair(size, location_sp);
 }
 
-static bool
-LibcxxWStringSummaryProvider(ValueObject &valobj, Stream &stream,
-                             const TypeSummaryOptions &summary_options,
-                             ValueObjectSP location_sp, size_t size) {
-  if (size == 0) {
-    stream.Printf("L\"\"");
-    return true;
-  }
-  if (!location_sp)
-    return false;
-
-  StringPrinter::ReadBufferAndDumpToStreamOptions options(valobj);
-  if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) {
-    const auto max_size = valobj.GetTargetSP()->GetMaximumSizeOfStringSummary();
-    if (size > max_size) {
-      size = max_size;
-      options.SetIsTruncated(true);
-    }
-  }
-
-  DataExtractor extractor;
-  const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size);
-  if (bytes_read < size)
-    return false;
-
-  // std::wstring::size() is measured in 'characters', not bytes
-  TypeSystemClangSP scratch_ts_sp =
-      ScratchTypeSystemClang::GetForTarget(*valobj.GetTargetSP());
-  if (!scratch_ts_sp)
-    return false;
-
-  auto wchar_t_size =
-      scratch_ts_sp->GetBasicType(lldb::eBasicTypeWChar).GetByteSize(nullptr);
-  if (!wchar_t_size)
-    return false;
-
-  options.SetData(std::move(extractor));
-  options.SetStream(&stream);
-  options.SetPrefixToken("L");
-  options.SetQuote('"');
-  options.SetSourceSize(size);
-  options.SetBinaryZeroIsTerminator(false);
-
-  switch (*wchar_t_size) {
-  case 1:
-    return StringPrinter::ReadBufferAndDumpToStream<
-        lldb_private::formatters::StringPrinter::StringElementType::UTF8>(
-        options);
-    break;
-
-  case 2:
-    return StringPrinter::ReadBufferAndDumpToStream<
-        lldb_private::formatters::StringPrinter::StringElementType::UTF16>(
-        options);
-    break;
-
-  case 4:
-    return StringPrinter::ReadBufferAndDumpToStream<
-        lldb_private::formatters::StringPrinter::StringElementType::UTF32>(
-        options);
-  }
-  return false;
-}
-
 bool lldb_private::formatters::LibcxxWStringSummaryProvider(
     ValueObject &valobj, Stream &stream,
     const TypeSummaryOptions &summary_options) {
@@ -609,52 +546,22 @@ bool lldb_private::formatters::LibcxxWStringSummaryProvider(
   ValueObjectSP location_sp;
   std::tie(size, location_sp) = *string_info;
 
-  return ::LibcxxWStringSummaryProvider(valobj, stream, summary_options,
-                                        location_sp, size);
-}
-
-template <StringPrinter::StringElementType element_type>
-static bool
-LibcxxStringSummaryProvider(ValueObject &valobj, Stream &stream,
-                            const TypeSummaryOptions &summary_options,
-                            std::string prefix_token, ValueObjectSP location_sp,
-                            uint64_t size) {
-
-  if (size == 0) {
-    stream.Printf("\"\"");
-    return true;
-  }
-
-  if (!location_sp)
+  auto wchar_t_size = GetWCharByteSize(valobj);
+  if (!wchar_t_size)
     return false;
 
-  StringPrinter::ReadBufferAndDumpToStreamOptions options(valobj);
-
-  if (summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryCapped) {
-    const auto max_size = valobj.GetTargetSP()->GetMaximumSizeOfStringSummary();
-    if (size > max_size) {
-      size = max_size;
-      options.SetIsTruncated(true);
-    }
-  }
-
-  {
-    DataExtractor extractor;
-    const size_t bytes_read = location_sp->GetPointeeData(extractor, 0, size);
-    if (bytes_read < size)
-      return false;
-
-    options.SetData(std::move(extractor));
+  switch (*wchar_t_size) {
+  case 1:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF8>(
+        stream, summary_options, location_sp, size, "L");
+  case 2:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF16>(
+        stream, summary_options, location_sp, size, "L");
+  case 4:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF32>(
+        stream, summary_options, location_sp, size, "L");
   }
-  options.SetStream(&stream);
-  if (prefix_token.empty())
-    options.SetPrefixToken(nullptr);
-  else
-    options.SetPrefixToken(prefix_token);
-  options.SetQuote('"');
-  options.SetSourceSize(size);
-  options.SetBinaryZeroIsTerminator(false);
-  return StringPrinter::ReadBufferAndDumpToStream<element_type>(options);
+  return false;
 }
 
 template <StringPrinter::StringElementType element_type>
@@ -669,8 +576,8 @@ LibcxxStringSummaryProvider(ValueObject &valobj, Stream &stream,
   ValueObjectSP location_sp;
   std::tie(size, location_sp) = *string_info;
 
-  return LibcxxStringSummaryProvider<element_type>(
-      valobj, stream, summary_options, prefix_token, location_sp, size);
+  return StringBufferSummaryProvider<element_type>(
+      stream, summary_options, location_sp, size, prefix_token);
 }
 template <StringPrinter::StringElementType element_type>
 static bool formatStringImpl(ValueObject &valobj, Stream &stream,
@@ -742,8 +649,8 @@ static bool formatStringViewImpl(ValueObject &valobj, Stream &stream,
     return true;
   }
 
-  return LibcxxStringSummaryProvider<element_type>(
-      valobj, stream, summary_options, prefix_token, dataobj, size);
+  return StringBufferSummaryProvider<element_type>(stream, summary_options,
+                                                   dataobj, size, prefix_token);
 }
 
 bool lldb_private::formatters::LibcxxStringViewSummaryProviderASCII(
@@ -781,8 +688,22 @@ bool lldb_private::formatters::LibcxxWStringViewSummaryProvider(
     return true;
   }
 
-  return ::LibcxxWStringSummaryProvider(valobj, stream, summary_options,
-                                        dataobj, size);
+  auto wchar_t_size = GetWCharByteSize(valobj);
+  if (!wchar_t_size)
+    return false;
+
+  switch (*wchar_t_size) {
+  case 1:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF8>(
+        stream, summary_options, dataobj, size, "L");
+  case 2:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF16>(
+        stream, summary_options, dataobj, size, "L");
+  case 4:
+    return StringBufferSummaryProvider<StringPrinter::StringElementType::UTF32>(
+        stream, summary_options, dataobj, size, "L");
+  }
+  return false;
 }
 
 static bool
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py
index 5c5cf4ca16b98..32764629d65a7 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py
@@ -65,11 +65,9 @@ def cleanup():
                 '(%s::wstring) IHaveEmbeddedZerosToo = L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"'
                 % ns,
                 '(%s::u16string) u16_string = u"ß水氶"' % ns,
-                # FIXME: This should have a 'u' prefix.
-                '(%s::u16string) u16_empty = ""' % ns,
+                '(%s::u16string) u16_empty = u""' % ns,
                 '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns,
-                # FIXME: This should have a 'U' prefix.
-                '(%s::u32string) u32_empty = ""' % ns,
+                '(%s::u32string) u32_empty = U""' % ns,
                 "(%s::string *) null_str = nullptr" % ns,
             ],
         )
@@ -123,7 +121,7 @@ def cleanup():
                 % ns,
                 '(%s::u16string) u16_string = u"ß水氶"' % ns,
                 '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns,
-                '(%s::u32string) u32_empty = ""' % ns,
+                '(%s::u32string) u32_empty = U""' % ns,
                 "(%s::string *) null_str = nullptr" % ns,
             ],
         )
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py
index f8fc8ae66405b..3883395f23924 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string_view/TestDataFormatterLibcxxStringView.py
@@ -81,11 +81,11 @@ def cleanup():
             summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"',
         )
         self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"')
-        self.expect_var_path("u16_empty", type="std::u16string_view", summary='""')
+        self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""')
         self.expect_var_path(
             "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"'
         )
-        self.expect_var_path("u32_empty", type="std::u32string_view", summary='""')
+        self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""')
         self.expect_var_path(
             "oops", type="std::string_view", summary='"Hellooo World\\n"'
         )
@@ -145,11 +145,11 @@ def cleanup():
             summary='L"hello world!\\0てざ ル゜䋨ミ㠧槊 きゅへ狦穤襩 じゃ馩リョ 䤦監"',
         )
         self.expect_var_path("u16_string", type="std::u16string_view", summary='u"ß水氶"')
-        self.expect_var_path("u16_empty", type="std::u16string_view", summary='""')
+        self.expect_var_path("u16_empty", type="std::u16string_view", summary='u""')
         self.expect_var_path(
             "u32_string", type="std::u32string_view", summary='U"🍄🍅🍆🍌"'
         )
-        self.expect_var_path("u32_empty", type="std::u32string_view", summary='""')
+        self.expect_var_path("u32_empty", type="std::u32string_view", summary='U""')
 
         self.runCmd("cont")
         self.expect(

From 5baf351ba819e1e6bae0250492e85a2862ef406b Mon Sep 17 00:00:00 2001
From: yonghong-song <yhs@fb.com>
Date: Tue, 17 Jun 2025 09:51:40 -0700
Subject: [PATCH 766/851] [BPF] Do not allow gotol in the middle of asm insn
 (#144545)

Previously I accidentally allowed 'gotol' insn in the middle of asm insn
([1]). But actually 'gotol' is not allowed in the middle of any asm
insn, so remove it from isValidIdInMiddle().

[1] https://github.com/yonghong-song/llvm-project/commit/6c412b6c6faa2dabd8602d35d3f5e796fb1daf80
---
 llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 139ac429dd135..7d1819134d162 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -261,7 +261,6 @@ struct BPFOperand : public MCParsedAsmOperand {
         .Case("bswap32", true)
         .Case("bswap64", true)
         .Case("goto", true)
-        .Case("gotol", true)
         .Case("ll", true)
         .Case("skb", true)
         .Case("s", true)

From 556e69b7f4328a0d7c36c9d7ca0dd8f52f82ad71 Mon Sep 17 00:00:00 2001
From: Charles Zablit <c_zablit@apple.com>
Date: Tue, 17 Jun 2025 17:52:03 +0100
Subject: [PATCH 767/851] [lldb] make lit use the same Python executable for
 building and testing (#143756)

When testing LLDB, we want to make sure to use the same Python as the
one we used to build it.

This patch uses the CMake variable `Python3_ROOT_DIR` to add the correct
Python to the `PATH` in LLDB lit tests, in order to ensure of this.

Please see https://github.com/swiftlang/swift/pull/82063 for the
original issue.

This is a continuation of https://github.com/swiftlang/swift/pull/82063.
---
 lldb/cmake/modules/FindPythonAndSwig.cmake | 4 +++-
 lldb/test/API/lit.cfg.py                   | 7 +++++++
 lldb/test/API/lit.site.cfg.py.in           | 1 +
 lldb/test/Shell/lit.cfg.py                 | 3 +++
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/lldb/cmake/modules/FindPythonAndSwig.cmake b/lldb/cmake/modules/FindPythonAndSwig.cmake
index 1f6f553e86048..b478038f144d9 100644
--- a/lldb/cmake/modules/FindPythonAndSwig.cmake
+++ b/lldb/cmake/modules/FindPythonAndSwig.cmake
@@ -6,7 +6,9 @@
 
 macro(FindPython3)
   # Use PYTHON_HOME as a hint to find Python 3.
-  set(Python3_ROOT_DIR "${PYTHON_HOME}")
+  if(NOT Python3_ROOT_DIR)
+    set(Python3_ROOT_DIR "${PYTHON_HOME}")
+  endif()
   find_package(Python3 COMPONENTS Interpreter Development)
   if(Python3_FOUND AND Python3_Interpreter_FOUND)
 
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 646a446c86fdb..83713213ce1fe 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -349,3 +349,10 @@ def delete_module_cache(path):
     for v in ["SystemDrive"]:
         if v in os.environ:
             config.environment[v] = os.environ[v]
+
+# Some steps required to initialize the tests dynamically link with python.dll
+# and need to know the location of the Python libraries. This ensures that we
+# use the same version of Python that was used to build lldb to run our tests.
+config.environment["PATH"] = os.path.pathsep.join(
+    (config.python_root_dir, config.environment.get("PATH", ""))
+)
diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in
index 8552d17d66631..86d58889cc4ad 100644
--- a/lldb/test/API/lit.site.cfg.py.in
+++ b/lldb/test/API/lit.site.cfg.py.in
@@ -20,6 +20,7 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.lldb_build_directory = "@LLDB_TEST_BUILD_DIRECTORY@"
 config.python_executable = "@LLDB_PYTHON_API_TEST_EXECUTABLE@"
+config.python_root_dir = "@Python3_ROOT_DIR@"
 config.lua_executable = "@LUA_EXECUTABLE@"
 config.lldb_lua_cpath = "@LLDB_LUA_CPATH@"
 config.lua_test_entry = "TestLuaAPI.py"
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index ab6113767187a..8c9448b23c56b 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -203,3 +203,6 @@ def calculate_arch_features(arch_string):
 # location of the Python libraries. This ensures that we use the same
 # version of Python that was used to build lldb to run our tests.
 config.environment["PYTHONHOME"] = config.python_root_dir
+config.environment["PATH"] = os.path.pathsep.join(
+    (config.python_root_dir, config.environment.get("PATH", ""))
+)

From 8063bd153c6aca43869d96aee64aeceb9be98ca5 Mon Sep 17 00:00:00 2001
From: Nishant Patel <nishant.b.patel@intel.com>
Date: Tue, 17 Jun 2025 09:55:02 -0700
Subject: [PATCH 768/851] [MLIR][XeGPU] Add support for elementwise ops in Wg
 to Sg distribute pass [1/N] (#142797)

This PR adds support for Elementwise operations' (unary & binary)
lowering from Workgroup to Subgroup.
---
 .../Transforms/XeGPUWgToSgDistribute.cpp      |  89 +++++++++-
 .../XeGPU/xegpu-wg-to-sg-elemwise.mlir        | 164 ++++++++++++++++++
 2 files changed, 252 insertions(+), 1 deletion(-)
 create mode 100644 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index a26c6b52f0ddc..e3563d10bc6f1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -8,10 +8,12 @@
 #include "mlir/Dialect/XeGPU/Transforms/Passes.h"
 
 #include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -19,6 +21,7 @@
 #include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
 #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include <optional>
 
 namespace mlir {
 namespace xegpu {
@@ -328,6 +331,65 @@ struct WgToSgPrefetchNdOp : public OpConversionPattern<xegpu::PrefetchNdOp> {
   }
 };
 
+// This pattern transforms elementwise ops to work at subgroup level.
+struct WgToSgElementwiseOp : public ConversionPattern {
+  WgToSgElementwiseOp(MLIRContext *ctx)
+      : ConversionPattern(MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Only match ops with elementwise trait and single result.
+    if (!OpTrait::hasElementwiseMappableTraits(op) || op->getNumResults() != 1)
+      return failure();
+
+    auto resultType = dyn_cast<VectorType>(op->getResult(0).getType());
+    assert(resultType && "Expected result to be a VectorType");
+
+    ArrayRef<int64_t> wgShape = resultType.getShape();
+
+    xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getResult(0));
+    if (!layout || !layout.getSgLayout())
+      return failure();
+
+    SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first;
+
+    size_t numVariants = operands.empty() ? 0 : operands.front().size();
+
+    if (llvm::any_of(operands, [&](const ValueRange &operandVec) {
+          return operandVec.size() != numVariants;
+        }))
+      return failure();
+
+    SmallVector<Value> newResults;
+    VectorType newResultType =
+        VectorType::get(sgShape, resultType.getElementType());
+
+    for (size_t i = 0; i < numVariants; ++i) {
+      SmallVector<Value> opOperands;
+      for (auto &operandVec : operands)
+        opOperands.push_back(operandVec[i]);
+
+      OperationState state(op->getLoc(), op->getName());
+      state.addOperands(opOperands);
+      state.addTypes(newResultType);
+      // Copy all attributes, but update "layout_result_0" to drop
+      // sgLayout/sgData
+      for (auto attr : op->getAttrs()) {
+        if (auto layout = dyn_cast<xegpu::LayoutAttr>(attr.getValue()))
+          state.addAttribute(attr.getName(), layout.dropSgLayoutAndData());
+        else
+          state.addAttribute(attr.getName(), attr.getValue());
+      }
+      Operation *newOp = rewriter.create(state);
+      newResults.push_back(newOp->getResult(0));
+    }
+
+    rewriter.replaceOpWithMultiple(op, {newResults});
+    return success();
+  }
+};
+
 // Handles UnrealizedConversionCastOp generated during
 // SCFStructuralTypeConversions (step 1). This op may appear as either a
 // target or source materialization for Vector values, e.g.:
@@ -411,7 +473,8 @@ namespace xegpu {
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
   patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
                WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
-               UnrealizedConversionCastOpPattern>(patterns.getContext());
+               UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>(
+      patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -518,6 +581,30 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
     return isLegal(layout);
   });
 
+  target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
+      [=](Operation *op) -> std::optional<bool> {
+        // Only handle elementwise mappable ops
+        if (!OpTrait::hasElementwiseMappableTraits(op))
+          return true;
+
+        VectorType resultType =
+            dyn_cast<VectorType>(op->getResult(0).getType());
+        if (!resultType)
+          return true;
+
+        // Check if all operands are vectors of the same shape
+        // TODO: Support other types.
+        for (Value operand : op->getOperands()) {
+          VectorType operandType = dyn_cast<VectorType>(operand.getType());
+          if (!operandType || operandType.getShape() != resultType.getShape()) {
+            return true;
+          }
+        }
+
+        xegpu::LayoutAttr layout = xegpu::getLayoutAttr(op->getResult(0));
+        return isLegal(layout);
+      });
+
   target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
       [=](UnrealizedConversionCastOp op) {
         return llvm::is_contained(existingCastOps, op.getOperation());
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
new file mode 100644
index 0000000000000..64f01d61d6e80
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir
@@ -0,0 +1,164 @@
+// RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
+
+gpu.module @test_elementwise_ops {
+  // CHECK-LABEL: unary_ops
+  gpu.func @unary_ops(%a: memref<24x32xf32>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    // CHECK: math.exp {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
+    %exp = math.exp %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    // CHECK: arith.negf {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>} : vector<12x8xf32>
+    %negf = arith.negf %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    gpu.return
+  }
+
+  // CHECK-LABEL: binary_ops
+  gpu.func @binary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    // CHECK: arith.addf {{.*}}, {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>
+    %addf = arith.addf %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    // CHECK: math.powf {{.*}}, {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>
+    %powf = math.powf %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    gpu.return
+  }
+
+  // CHECK-LABEL: ternary_ops
+  gpu.func @ternary_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi1>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi1>
+      -> !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_c = xegpu.load_nd %tdesc_c
+      : !xegpu.tensor_desc<24x32xi1, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xi1>
+    // CHECK: arith.select {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xi1>, vector<12x8xf32>
+    %select = arith.select %load_c, %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xi1>, vector<24x32xf32>
+    // CHECK: math.fma  {{.*}}, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>
+    %fma = math.fma %load_a, %load_b, %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    gpu.return
+  }
+
+  // CHECK-LABEL: type_conversion_ops
+  gpu.func @type_conversion_ops(%a: memref<24x32xf32>, %b: memref<24x32xi32>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xi32>
+      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xi32>
+    // CHECK: arith.truncf {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32> to vector<12x8xf16>
+    %truncf = arith.truncf %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32> to vector<24x32xf16>
+    // CHECK: arith.bitcast {{.*}} {layout_result_0 =  #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xi32> to vector<12x8xf32>
+    %bitcast = arith.bitcast %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xi32> to vector<24x32xf32>
+    gpu.return
+  }
+
+  // CHECK-LABEL: comparison_ops
+  gpu.func @comparison_ops(%a: memref<24x32xf32>, %b: memref<24x32xf32>, %c: memref<24x32xi32>, %d: memref<24x32xi32>) {
+    %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_c = xegpu.create_nd_tdesc %c[0, 0] : memref<24x32xi32>
+      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %tdesc_d = xegpu.create_nd_tdesc %d[0, 0] : memref<24x32xi32>
+      -> !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_c = xegpu.load_nd %tdesc_c
+      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xi32>
+    %load_d = xegpu.load_nd %tdesc_d
+      : !xegpu.tensor_desc<24x32xi32, #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>>
+      -> vector<24x32xi32>
+    // CHECK: arith.cmpf ult, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xf32>
+    %cmpf = arith.cmpf ult, %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    // CHECK: arith.cmpi eq, {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}
+    // CHECK-SAME: : vector<12x8xi32>
+    %cmpi = arith.cmpi eq, %load_c, %load_d
+      {layout_result_0 = #xegpu.layout<sg_layout = [2, 4], sg_data = [12, 8], lane_layout = [2, 8], lane_data = [1, 1]>}
+      : vector<24x32xi32>
+    gpu.return
+  }
+
+  // 1 to N decomposition of elementwise operations
+  // CHECK-LABEL: elementwise_ops_rr_assignment
+  gpu.func @elementwise_ops_rr_assignment(%a: memref<24x32xf32>, %b: memref<24x32xf32>) {
+     %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<24x32xf32>
+      -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+    %load_a = xegpu.load_nd %tdesc_a
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    %load_b = xegpu.load_nd %tdesc_b
+      : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
+      -> vector<24x32xf32>
+    // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
+    // CHECK-SAME-COUNT-12: : vector<2x2xf32>
+    // CHECK-NOT: arith.negf
+    %negf = arith.negf %load_a
+      {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>}
+    // CHECK-SAME-COUNT-12: : vector<2x2xf32>
+    // CHECK-NOT: math.powf
+    %powf = math.powf %load_a, %load_b
+      {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>}
+      : vector<24x32xf32>
+    gpu.return
+  }
+}

From 01a7a21a4b8070a88e5dcc9753066e38d26faf85 Mon Sep 17 00:00:00 2001
From: Chris B <chris.bieneman@me.com>
Date: Tue, 17 Jun 2025 12:04:04 -0500
Subject: [PATCH 769/851] [CMake] Add BINARY_DIR argument for
 add_lit_testsuites (#144431)

We're doing some slightly odd things with LIT in the offload-test-suite.
Specifically we generate multiple binary directories to configure and
run tests with different configurations from the same source root.

In this configuration the subdirectory targets need to instead point to
the correct generated binary directory and use test filtering to get a
subset of tests.
---
 llvm/cmake/modules/AddLLVM.cmake | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 7a7340ff8a456..8d8a94d1cddc4 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -2192,7 +2192,7 @@ endfunction()
 
 function(add_lit_testsuites project directory)
   if (NOT LLVM_ENABLE_IDE)
-    cmake_parse_arguments(ARG "EXCLUDE_FROM_CHECK_ALL" "FOLDER" "PARAMS;DEPENDS;ARGS" ${ARGN})
+    cmake_parse_arguments(ARG "EXCLUDE_FROM_CHECK_ALL" "FOLDER;BINARY_DIR" "PARAMS;DEPENDS;ARGS" ${ARGN})
 
     if (NOT ARG_FOLDER)
       get_subproject_title(subproject_title)
@@ -2213,13 +2213,18 @@ function(add_lit_testsuites project directory)
       endif()
 
       # Create a check- target for the directory.
-      string(REPLACE ${directory} "" name_slash ${lit_suite})
+      string(REPLACE "${directory}/" "" name_slash ${lit_suite})
       if (name_slash)
+        set(filter ${name_slash})
         string(REPLACE "/" "-" name_slash ${name_slash})
         string(REPLACE "\\" "-" name_dashes ${name_slash})
-        string(TOLOWER "${project}${name_dashes}" name_var)
+        string(TOLOWER "${project}-${name_dashes}" name_var)
+        set(lit_args ${lit_suite})
+        if (ARG_BINARY_DIR)
+          set(lit_args ${ARG_BINARY_DIR} --filter=${filter})
+        endif()
         add_lit_target("check-${name_var}" "Running lit suite ${lit_suite}"
-          ${lit_suite}
+          ${lit_args}
           ${EXCLUDE_FROM_CHECK_ALL}
           PARAMS ${ARG_PARAMS}
           DEPENDS ${ARG_DEPENDS}

From 526310e916af2073e30b57b678307ce94df803f3 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Tue, 17 Jun 2025 13:10:51 -0400
Subject: [PATCH 770/851] [Remarks] Elaborate on called intrinsics (#143985)

---
 llvm/lib/IR/DiagnosticInfo.cpp          |  4 ++++
 llvm/test/Transforms/GVN/opt-remarks.ll | 22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index 0f1291b8bd8be..b94dcace5e3c7 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
@@ -211,6 +212,9 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
   else if (isa<Constant>(V)) {
     raw_string_ostream OS(Val);
     V->printAsOperand(OS, /*PrintType=*/false);
+  } else if (auto *II = dyn_cast<IntrinsicInst>(V)) {
+    raw_string_ostream OS(Val);
+    OS << "call " << II->getCalledFunction()->getName();
   } else if (auto *I = dyn_cast<Instruction>(V)) {
     Val = I->getOpcodeName();
   } else if (auto *MD = dyn_cast<MetadataAsValue>(V)) {
diff --git a/llvm/test/Transforms/GVN/opt-remarks.ll b/llvm/test/Transforms/GVN/opt-remarks.ll
index 7c3f16917bc97..8fb2d5756f95d 100644
--- a/llvm/test/Transforms/GVN/opt-remarks.ll
+++ b/llvm/test/Transforms/GVN/opt-remarks.ll
@@ -62,6 +62,19 @@
 ; YAML-NEXT:   - ClobberedBy:     store
 ; YAML-NEXT:     DebugLoc:        { File: '/tmp/s.c', Line: 2, Column: 10 }
 ; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            gvn
+; YAML-NEXT: Name:            LoadClobbered
+; YAML-NEXT: Function:        lifetime_end
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'load of type '
+; YAML-NEXT:   - Type:            i8
+; YAML-NEXT:   - String:          ' not eliminated'
+; YAML-NEXT:   - String:          ' in favor of '
+; YAML-NEXT:   - OtherAccess:     store
+; YAML-NEXT:   - String:          ' because it is clobbered by '
+; YAML-NEXT:   - ClobberedBy:     call llvm.lifetime.end.p0
+; YAML-NEXT: ...
 
 define i32 @arg(ptr %p, i32 %i) {
 entry:
@@ -93,6 +106,15 @@ entry:
   %add = add i32 %load1, %load
   ret i32 %add
 }
+
+define i8 @lifetime_end(ptr %p, i8 %val) {
+  call void @llvm.lifetime.start.p0(i64 32, ptr %p)
+  store i8 %val, ptr %p
+  call void @llvm.lifetime.end.p0(i64 32, ptr %p)
+  %1 = load i8, ptr %p
+  ret i8 %1
+}
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}

From ec230aa7a7d13c222c0b34b87c3c16937383b4a0 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 17 Jun 2025 22:49:55 +0530
Subject: [PATCH 771/851] [Driver] Add support for crtbegin.o, crtend.o and
 libgloss lib to BareMetal toolchain object (#121830)

This patch conditionalise the addition of crt{begin,end}.o object files
along
with addition of -lgloss lib based on whether libc selected is newlib or
llvm
libc. Since there is no way a user can specify which libc it wants to
link
against, currently passing valid GCCInstallation to driver will select
newlib
otherwise it will default to llvm libc.

Moreover, this patch makes gnuld the default linker for baremetal
toolchain
object. User need to pass `-fuse-ld=lld` explicitly to driver to select
lld

This is the 2nd patch in the series of patches of merging RISCVToolchain
into
BareMetal toolchain object.

RFC:

https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/lib/Driver/ToolChains/BareMetal.cpp   | 37 +++++++-
 clang/lib/Driver/ToolChains/BareMetal.h     |  3 +-
 clang/test/Driver/aarch64-toolchain-extra.c | 13 ++-
 clang/test/Driver/aarch64-toolchain.c       | 95 ++++++++++++++++++++
 clang/test/Driver/arm-toolchain-extra.c     |  7 ++
 clang/test/Driver/arm-toolchain.c           | 99 ++++++++++++++++++++-
 clang/test/Driver/baremetal.cpp             |  3 +-
 clang/test/Driver/sanitizer-ld.c            |  2 +-
 8 files changed, 246 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index 0fbfe6c77f342..a08bb588dd764 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -584,9 +584,31 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Arch == llvm::Triple::aarch64_be ? "-EB" : "-EL");
   }
 
-  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
-                   options::OPT_r)) {
-    CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
+  bool NeedCRTs =
+      !Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles);
+
+  const char *CRTBegin, *CRTEnd;
+  if (NeedCRTs) {
+    if (!Args.hasArg(options::OPT_r))
+      CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
+    if (TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) {
+      auto RuntimeLib = TC.GetRuntimeLibType(Args);
+      switch (RuntimeLib) {
+      case (ToolChain::RLT_Libgcc): {
+        CRTBegin = "crtbegin.o";
+        CRTEnd = "crtend.o";
+        break;
+      }
+      case (ToolChain::RLT_CompilerRT): {
+        CRTBegin =
+            TC.getCompilerRTArgString(Args, "crtbegin", ToolChain::FT_Object);
+        CRTEnd =
+            TC.getCompilerRTArgString(Args, "crtend", ToolChain::FT_Object);
+        break;
+      }
+      }
+      CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTBegin)));
+    }
   }
 
   Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
@@ -609,15 +631,22 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
+    CmdArgs.push_back("--start-group");
     AddRunTimeLibs(TC, D, CmdArgs, Args);
-
     CmdArgs.push_back("-lc");
+    if (TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D))
+      CmdArgs.push_back("-lgloss");
+    CmdArgs.push_back("--end-group");
   }
 
   if (D.isUsingLTO())
     addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
                   D.getLTOMode() == LTOK_Thin);
 
+  if ((TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) &&
+      NeedCRTs)
+    CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd)));
+
   if (TC.getTriple().isRISCV())
     CmdArgs.push_back("-X");
 
diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h
index 930f8584e6435..54805530bae82 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.h
+++ b/clang/lib/Driver/ToolChains/BareMetal.h
@@ -38,6 +38,7 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
 public:
   bool initGCCInstallation(const llvm::Triple &Triple,
                            const llvm::opt::ArgList &Args);
+  bool hasValidGCCInstallation() const { return IsGCCInstallationValid; }
   bool isBareMetal() const override { return true; }
   bool isCrossCompiling() const override { return true; }
   bool HasNativeLLVMSupport() const override { return true; }
@@ -63,8 +64,6 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
     return ToolChain::CST_Libcxx;
   }
 
-  const char *getDefaultLinker() const override { return "ld.lld"; }
-
   void
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const override;
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
index 2610e962bd690..2a930e35acd45 100644
--- a/clang/test/Driver/aarch64-toolchain-extra.c
+++ b/clang/test/Driver/aarch64-toolchain-extra.c
@@ -15,14 +15,21 @@
 // RUN: mkdir -p %t/aarch64-nogcc/bin
 // RUN: ln -s %clang %t/aarch64-nogcc/bin/clang
 // RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf
+// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld %t/aarch64-nogcc/bin/aarch64-none-elf-ld
 // RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
 // RUN:    --gcc-toolchain=%t/aarch64-nogcc/invalid \
 // RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+// RUN:    | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOGCC %s
 
 // RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
 // RUN:    --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \
 // RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
+// RUN:    | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOGCC %s
 
-// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
+// C-AARCH64-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/aarch64-none-elf-ld"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib/crt0.o"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtbegin.o"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib"
+// C-AARCH64-BAREMETAL-NOGCC: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtend.o"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index 7f2c01d928e43..83cd95136b158 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -1,5 +1,24 @@
 // UNSUPPORTED: system-windows
 
+// Test interaction with -fuse-ld=lld
+// RUN: %clang -### %s -fuse-ld=lld -B%S/Inputs/lld \
+// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=LLD-AARCH64-BAREMETAL %s
+
+// LLD-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
+// LLD-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
+// LLD-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
+// LLD-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
+// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
+// LLD-AARCH64-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
+
 // RUN: %clang -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf --rtlib=libgcc \
 // RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
@@ -9,6 +28,14 @@
 // C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
 // C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// C-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
+// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
+// C-AARCH64-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf --rtlib=libgcc \
@@ -18,6 +45,14 @@
 
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
@@ -29,6 +64,14 @@
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward"
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// CXX-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
+// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
+// CXX-AARCH64-BAREMETAL: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
@@ -40,6 +83,14 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
@@ -50,6 +101,14 @@
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
@@ -59,3 +118,39 @@
 
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=compiler-rt \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-COMPILER-RT %s
+
+// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
+// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
+// AARCH64-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a" "-lc" "-lgloss" "--end-group"
+// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --unwindlib=libunwind \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-UNWINDLIB %s
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=aarch64-none-elf --rtlib=compiler-rt --unwindlib=libunwind \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
+// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-UNWINDLIB %s
+
+// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}crt0.o"
+// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtbegin.o"
+// AARCH64-BAREMETAL-UNWINDLIB: "--start-group" "{{.*}}libclang_rt.builtins.a" "--as-needed" "-lunwind" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtend.o"
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
index 114de0a8154ab..2adf4ab698ba0 100644
--- a/clang/test/Driver/arm-toolchain-extra.c
+++ b/clang/test/Driver/arm-toolchain-extra.c
@@ -15,6 +15,7 @@
 // RUN: mkdir -p %t/arm-nogcc/bin
 // RUN: ln -s %clang %t/arm-nogcc/bin/clang
 // RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi
+// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld %t/arm-nogcc/bin/armv6m-none-eabi-ld
 // RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
 // RUN:    --gcc-toolchain=%t/arm-nogcc/invalid \
 // RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
@@ -26,4 +27,10 @@
 // RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
 
 // C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/armv6m-none-eabi-ld"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib/crt0.o"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtbegin.o"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib"
+// C-ARM-BAREMETAL-NOGCC: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtend.o"
 
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index 2e38461fb7a3e..66bed1b0c4d87 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -1,5 +1,23 @@
 // UNSUPPORTED: system-windows
 
+// RUN: %clang -### %s -fuse-ld=lld -B%S/Inputs/lld \
+// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=LLD-ARM-BAREMETAL %s
+
+// LLD-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
+// LLD-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
+// LLD-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
+// LLD-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
+// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
+// LLD-ARM-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
+
 // RUN: %clang -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
 // RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
@@ -9,6 +27,14 @@
 // C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// C-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
+// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
+// C-ARM-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
@@ -18,6 +44,14 @@
 
 // C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
+// C-ARM-BAREMETAL-NOSYSROOT: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
@@ -28,8 +62,17 @@
 // CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1" 
+// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// CXX-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
+// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
+// CXX-ARM-BAREMETAL: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
+
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
@@ -41,6 +84,14 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
@@ -51,6 +102,14 @@
 // CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
+// CXX-ARM-BAREMETAL-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
 // RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
@@ -59,4 +118,40 @@
 // RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s
 
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=compiler-rt \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-COMPILER-RT %s
+
+// ARM-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
+// ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
+// ARM-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a" "-lc" "-lgloss" "--end-group"
+// ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --unwindlib=libunwind \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-UNWINDLIB %s
+
+// RUN: %clang -### %s -fuse-ld= \
+// RUN:   --target=armv6m-none-eabi --rtlib=compiler-rt --unwindlib=libunwind \
+// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
+// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-UNWINDLIB %s
+
+// ARM-BAREMETAL-UNWINDLIB: "{{.*}}crt0.o"
+// ARM-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtbegin.o"
+// ARM-BAREMETAL-UNWINDLIB: "--start-group" "{{.*}}libclang_rt.builtins.a" "--as-needed" "-lunwind" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// ARM-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtend.o"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index 2ac83402dda30..eff8f775a9c1e 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -319,7 +319,8 @@
 // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
-// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-X" "-o" "a.out"
+// CHECK-RV32-LIBSTDCXX-SAME: "-lc"
+// CHECK-RV32-LIBSTDCXX-SAME: "-X" "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     -nostdlibinc -nobuiltininc \
diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index befd322d027c9..d2e4877e89d78 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -1033,7 +1033,7 @@
 // RUN:     --target=riscv32-unknown-elf -fuse-ld=ld \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32
 // CHECK-SHADOWCALLSTACK-ELF-RISCV32-NOT: error:
-// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
+// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=riscv64-unknown-linux -fuse-ld=ld \

From 8513066f2c49457f5d1f63e275403330f854041c Mon Sep 17 00:00:00 2001
From: someoneinjd <someoneinjd@outlook.com>
Date: Wed, 18 Jun 2025 01:23:14 +0800
Subject: [PATCH 772/851] [clangd] Implement LSP 3.17 positionEncoding
 (#142903)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds support for the `positionEncoding` client capability
introduced in LSP 3.17. Clangd can now negotiate the position encoding
with the client during initialization.

Fix https://github.com/clangd/clangd/issues/1746

Co-authored-by: kadir çetinkaya <kadircetinkaya.06.tr@gmail.com>
---
 clang-tools-extra/clangd/ClangdLSPServer.cpp  | 10 ++++--
 clang-tools-extra/clangd/Protocol.cpp         | 20 +++++++++---
 clang-tools-extra/clangd/Protocol.h           |  5 +--
 .../clangd/test/positionencoding.test         | 32 +++++++++++++++++++
 4 files changed, 59 insertions(+), 8 deletions(-)
 create mode 100644 clang-tools-extra/clangd/test/positionencoding.test

diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
index 29321f7cd3fa2..a703009e2b467 100644
--- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
+++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
@@ -494,9 +494,9 @@ static std::vector<llvm::StringRef> semanticTokenModifiers() {
 void ClangdLSPServer::onInitialize(const InitializeParams &Params,
                                    Callback<llvm::json::Value> Reply) {
   // Determine character encoding first as it affects constructed ClangdServer.
-  if (Params.capabilities.offsetEncoding && !Opts.Encoding) {
+  if (Params.capabilities.PositionEncodings && !Opts.Encoding) {
     Opts.Encoding = OffsetEncoding::UTF16; // fallback
-    for (OffsetEncoding Supported : *Params.capabilities.offsetEncoding)
+    for (OffsetEncoding Supported : *Params.capabilities.PositionEncodings)
       if (Supported != OffsetEncoding::UnsupportedEncoding) {
         Opts.Encoding = Supported;
         break;
@@ -686,6 +686,9 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params,
   ServerCaps["executeCommandProvider"] =
       llvm::json::Object{{"commands", Commands}};
 
+  if (Opts.Encoding)
+    ServerCaps["positionEncoding"] = *Opts.Encoding;
+
   llvm::json::Object Result{
       {{"serverInfo",
         llvm::json::Object{
@@ -693,6 +696,9 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params,
             {"version", llvm::formatv("{0} {1} {2}", versionString(),
                                       featureString(), platformString())}}},
        {"capabilities", std::move(ServerCaps)}}};
+
+  // TODO: offsetEncoding capability is a deprecated clangd extension and should
+  // be deleted.
   if (Opts.Encoding)
     Result["offsetEncoding"] = *Opts.Encoding;
   Reply(std::move(Result));
diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp
index c9e8a175b5d76..2c858e28fa243 100644
--- a/clang-tools-extra/clangd/Protocol.cpp
+++ b/clang-tools-extra/clangd/Protocol.cpp
@@ -497,10 +497,19 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R,
       if (auto Cancel = StaleRequestSupport->getBoolean("cancel"))
         R.CancelsStaleRequests = *Cancel;
     }
+    if (auto *PositionEncodings = General->get("positionEncodings")) {
+      R.PositionEncodings.emplace();
+      if (!fromJSON(*PositionEncodings, *R.PositionEncodings,
+                    P.field("general").field("positionEncodings")))
+        return false;
+    }
   }
   if (auto *OffsetEncoding = O->get("offsetEncoding")) {
-    R.offsetEncoding.emplace();
-    if (!fromJSON(*OffsetEncoding, *R.offsetEncoding,
+    R.PositionEncodings.emplace();
+    elog("offsetEncoding capability is a deprecated clangd extension that'll "
+         "go away with clangd 23. Migrate to standard positionEncodings "
+         "capability introduced by LSP 3.17");
+    if (!fromJSON(*OffsetEncoding, *R.PositionEncodings,
                   P.field("offsetEncoding")))
       return false;
   }
@@ -536,8 +545,11 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R,
       }
     }
     if (auto *OffsetEncoding = Experimental->get("offsetEncoding")) {
-      R.offsetEncoding.emplace();
-      if (!fromJSON(*OffsetEncoding, *R.offsetEncoding,
+      R.PositionEncodings.emplace();
+      elog("offsetEncoding capability is a deprecated clangd extension that'll "
+           "go away with clangd 23. Migrate to standard positionEncodings "
+           "capability introduced by LSP 3.17");
+      if (!fromJSON(*OffsetEncoding, *R.PositionEncodings,
                     P.field("offsetEncoding")))
         return false;
     }
diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h
index 8a7809d6677ee..3a6bf155ee153 100644
--- a/clang-tools-extra/clangd/Protocol.h
+++ b/clang-tools-extra/clangd/Protocol.h
@@ -528,8 +528,9 @@ struct ClientCapabilities {
   /// textDocument.semanticHighlightingCapabilities.semanticHighlighting
   bool TheiaSemanticHighlighting = false;
 
-  /// Supported encodings for LSP character offsets. (clangd extension).
-  std::optional<std::vector<OffsetEncoding>> offsetEncoding;
+  /// Supported encodings for LSP character offsets.
+  /// general.positionEncodings
+  std::optional<std::vector<OffsetEncoding>> PositionEncodings;
 
   /// The content format that should be used for Hover requests.
   /// textDocument.hover.contentEncoding
diff --git a/clang-tools-extra/clangd/test/positionencoding.test b/clang-tools-extra/clangd/test/positionencoding.test
new file mode 100644
index 0000000000000..eea7a1a596e9a
--- /dev/null
+++ b/clang-tools-extra/clangd/test/positionencoding.test
@@ -0,0 +1,32 @@
+# RUN: clangd -lit-test < %s | FileCheck -strict-whitespace %s
+# This test verifies that we can negotiate UTF-8 offsets via the positionEncodings capability introduced in LSP 3.17.
+{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{"general":{"positionEncodings":["utf-8","utf-16"]}},"trace":"off"}}
+# CHECK: "positionEncoding": "utf-8"
+---
+{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"/*ö*/int x;\nint y=x;"}}}
+---
+{"jsonrpc":"2.0","id":1,"method":"textDocument/definition","params":{"textDocument":{"uri":"test:///main.cpp"},"position":{"line":1,"character":6}}}
+# /*ö*/int x;
+# 01234567890
+# x is character (and utf-16) range [9,10) but byte range [10,11).
+#      CHECK:  "id": 1,
+# CHECK-NEXT:  "jsonrpc": "2.0",
+# CHECK-NEXT:  "result": [
+# CHECK-NEXT:    {
+# CHECK-NEXT:      "range": {
+# CHECK-NEXT:        "end": {
+# CHECK-NEXT:          "character": 11,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        },
+# CHECK-NEXT:        "start": {
+# CHECK-NEXT:          "character": 10,
+# CHECK-NEXT:          "line": 0
+# CHECK-NEXT:        }
+# CHECK-NEXT:      },
+# CHECK-NEXT:      "uri": "file://{{.*}}/main.cpp"
+# CHECK-NEXT:    }
+# CHECK-NEXT:  ]
+---
+{"jsonrpc":"2.0","id":10000,"method":"shutdown"}
+---
+{"jsonrpc":"2.0","method":"exit"}

From 9dd1c66e8ffba73fead13aaf359e290f6e1d4899 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke@igalia.com>
Date: Tue, 17 Jun 2025 18:24:07 +0100
Subject: [PATCH 773/851] [VPlan] Expand VPWidenIntOrFpInductionRecipe into
 separate recipes (#118638)

The motivation of this PR is to make #115274 easier to implement, and
should allow us to add EVL support by just passing EVL to the VF
operand.

The current difficulty with widening IVs with EVL is that
VPWidenIntOrFpInductionRecipe generates its own backedge value. Since
it's a VPHeaderPHIRecipe the VF operand must be in the preheader, which
means we can't use the EVL since it's defined in the loop body.

The gist in this PR is to take the approach in #114305 and expand
VPWidenIntOrFpInductionRecipe into several recipes for the initial
value, phi and backedge value just before execution. I.e. this example:

```
  vector.ph:
  Successor(s): vector loop

  <x1> vector loop: {
    vector.body:
      WIDEN-INDUCTION %i = phi %start, %step, %vf
      ...
      EMIT branch-on-count ...
    No successors
  }
```

gets expanded to:

```
vector.ph:
  ...
  vp<%induction.start> = ...
  vp<%induction.increment> = ...

Successor(s): vector loop

<x1> vector loop: {
  vector.body:
    ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
    ...
    vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
    EMIT branch-on-count ...
  No successors
}
```

This allows us to a value defined in the loop in the backedge value, and
also means we can just reuse the existing backedge fixups in
VPlan::execute without having to specially handle it ourselves.

After this #115274 should just become a matter of setting the VF operand
to EVL (and building the increment step in the loop body, not the
preheader).
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |   4 +-
 llvm/lib/Transforms/Vectorize/VPlan.cpp       |  20 +-
 llvm/lib/Transforms/Vectorize/VPlan.h         |  22 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 154 +---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 159 +++-
 .../Transforms/Vectorize/VPlanTransforms.h    |   5 -
 .../AArch64/clamped-trip-count.ll             |   8 +-
 .../AArch64/conditional-branches-cost.ll      |   4 +-
 .../AArch64/divs-with-scalable-vfs.ll         |   4 +-
 .../AArch64/epilog-iv-select-cmp.ll           |  28 +-
 .../LoopVectorize/AArch64/optsize_minsize.ll  |   8 +-
 .../AArch64/simple_early_exit.ll              |   2 +-
 .../AArch64/sve-inductions-unusual-types.ll   |   8 +-
 .../AArch64/sve-interleaved-accesses.ll       |  22 +-
 .../sve-interleaved-masked-accesses.ll        |  80 +-
 ...eave-to-widen-memory-remove-loop-region.ll |   2 +-
 .../LoopVectorize/ARM/optsize_minsize.ll      |   2 +-
 .../RISCV/interleaved-masked-access.ll        |  60 +-
 .../LoopVectorize/RISCV/mask-index-type.ll    |   2 +-
 ...ruction-or-drop-poison-generating-flags.ll |   2 +-
 .../Transforms/LoopVectorize/RISCV/pr88802.ll |   2 +-
 .../LoopVectorize/RISCV/strided-accesses.ll   |   2 +-
 .../LoopVectorize/RISCV/uniform-load-store.ll |  10 +-
 ...ectorize-force-tail-with-evl-interleave.ll |   6 +-
 .../LoopVectorize/X86/constant-fold.ll        |   4 +-
 .../X86/drop-poison-generating-flags.ll       |  24 +-
 .../LoopVectorize/X86/induction-costs.ll      |   2 +-
 .../LoopVectorize/X86/interleave-cost.ll      |   4 +-
 ...outer_loop_test1_no_explicit_vect_width.ll |   2 +-
 .../LoopVectorize/X86/scatter_crash.ll        |  16 +-
 .../epilog-vectorization-any-of-reductions.ll |   4 +-
 .../LoopVectorize/first-order-recurrence.ll   | 719 +++++++++++++-----
 ...eref-pred-poison-ub-ops-feeding-pointer.ll |  10 +-
 .../LoopVectorize/pointer-induction.ll        | 138 ++++
 .../LoopVectorize/reduction-inloop-pred.ll    |  36 +-
 .../scalable-first-order-recurrence.ll        |   8 +-
 .../LoopVectorize/scalable-inductions.ll      |   8 +-
 .../LoopVectorize/scalable-iv-outside-user.ll |   6 +-
 .../single_early_exit_live_outs.ll            |   2 +-
 .../Transforms/LoopVectorize/uniform-blend.ll |   2 +-
 .../LoopVectorize/vplan-iv-transforms.ll      |  65 ++
 41 files changed, 1069 insertions(+), 597 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f1470fd1f7314..f887b34e76422 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2764,8 +2764,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
 
 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
   // Fix widened non-induction PHIs by setting up the PHI operands.
-  if (EnableVPlanNativePath)
-    fixNonInductionPHIs(State);
+  fixNonInductionPHIs(State);
 
   // After vectorization, the exit blocks of the original loop will have
   // additional predecessors. Invalidate SCEVs for the exit phis in case SE
@@ -7324,7 +7323,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
          "Trying to execute plan with unsupported VF");
   assert(BestVPlan.hasUF(BestUF) &&
          "Trying to execute plan with unsupported UF");
-  VPlanTransforms::runPass(VPlanTransforms::materializeStepVectors, BestVPlan);
   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
   // cost model is complete for better cost estimates.
   VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index cca3d32c0783e..4332332ef5cc3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1029,17 +1029,11 @@ void VPlan::execute(VPTransformState *State) {
     if (isa<VPWidenPHIRecipe>(&R))
       continue;
 
-    if (isa<VPWidenInductionRecipe>(&R)) {
-      PHINode *Phi = nullptr;
-      if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-        Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
-      } else {
-        auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
-        assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-               "recipe generating only scalars should have been replaced");
-        auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-        Phi = cast<PHINode>(GEP->getPointerOperand());
-      }
+    if (auto *WidenPhi = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+      assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
+             "recipe generating only scalars should have been replaced");
+      auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
+      PHINode *Phi = cast<PHINode>(GEP->getPointerOperand());
 
       Phi->setIncomingBlock(1, VectorLatchBB);
 
@@ -1047,10 +1041,6 @@ void VPlan::execute(VPTransformState *State) {
       // consistent placement of all induction updates.
       Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
       Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator()));
-
-      // Use the steps for the last part as backedge value for the induction.
-      if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
-        Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
       continue;
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5a3c4a514a5dd..f3306ad7cb8ec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1951,12 +1951,13 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
 };
 
 /// A recipe for handling phi nodes of integer and floating-point inductions,
-/// producing their vector values.
+/// producing their vector values. This is an abstract recipe and must be
+/// converted to concrete recipes before executing.
 class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
   TruncInst *Trunc;
 
   // If this recipe is unrolled it will have 2 additional operands.
-  bool isUnrolled() const { return getNumOperands() == 6; }
+  bool isUnrolled() const { return getNumOperands() == 5; }
 
 public:
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
@@ -1992,9 +1993,10 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
 
   VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
 
-  /// Generate the vectorized and scalarized versions of the phi node as
-  /// needed by their users.
-  void execute(VPTransformState &State) override;
+  void execute(VPTransformState &State) override {
+    llvm_unreachable("cannot execute this recipe, should be expanded via "
+                     "expandVPWidenIntOrFpInductionRecipe");
+  }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -2005,16 +2007,6 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
   VPValue *getVFValue() { return getOperand(2); }
   const VPValue *getVFValue() const { return getOperand(2); }
 
-  // TODO: Remove once VPWidenIntOrFpInduction is fully expanded in
-  // convertToConcreteRecipes.
-  VPInstructionWithType *getStepVector() {
-    auto *StepVector =
-        cast<VPInstructionWithType>(getOperand(3)->getDefiningRecipe());
-    assert(StepVector->getOpcode() == VPInstruction::StepVector &&
-           "step vector operand must be a VPInstruction::StepVector");
-    return StepVector;
-  }
-
   VPValue *getSplatVFValue() {
     // If the recipe has been unrolled return the VPValue for the induction
     // increment.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 048286d7a97bc..1ed0b97849a8d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -952,6 +952,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
   case VPInstruction::CanonicalIVIncrementForPart:
   case VPInstruction::BranchOnCount:
   case VPInstruction::BranchOnCond:
+  case VPInstruction::Broadcast:
   case VPInstruction::ReductionStartVector:
     return true;
   case VPInstruction::PtrAdd:
@@ -1077,15 +1078,14 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
 
 void VPInstructionWithType::execute(VPTransformState &State) {
   State.setDebugLocFrom(getDebugLoc());
-  switch (getOpcode()) {
-  case Instruction::ZExt:
-  case Instruction::Trunc: {
+  if (isScalarCast()) {
     Value *Op = State.get(getOperand(0), VPLane(0));
     Value *Cast = State.Builder.CreateCast(Instruction::CastOps(getOpcode()),
                                            Op, ResultTy);
     State.set(this, Cast, VPLane(0));
-    break;
+    return;
   }
+  switch (getOpcode()) {
   case VPInstruction::StepVector: {
     Value *StepVector =
         State.Builder.CreateStepVector(VectorType::get(ResultTy, State.VF));
@@ -1965,44 +1965,6 @@ InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
   return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
 }
 
-/// This function adds
-/// (0 * Step, 1 * Step, 2 * Step, ...)
-/// to each vector element of Val.
-/// \p Opcode is relevant for FP induction variable.
-/// \p InitVec is an integer step vector from 0 with a step of 1.
-static Value *getStepVector(Value *Val, Value *Step, Value *InitVec,
-                            Instruction::BinaryOps BinOp, ElementCount VF,
-                            IRBuilderBase &Builder) {
-  assert(VF.isVector() && "only vector VFs are supported");
-
-  // Create and check the types.
-  auto *ValVTy = cast<VectorType>(Val->getType());
-  ElementCount VLen = ValVTy->getElementCount();
-
-  Type *STy = Val->getType()->getScalarType();
-  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
-         "Induction Step must be an integer or FP");
-  assert(Step->getType() == STy && "Step has wrong type");
-
-  if (STy->isIntegerTy()) {
-    Step = Builder.CreateVectorSplat(VLen, Step);
-    assert(Step->getType() == Val->getType() && "Invalid step vec");
-    // FIXME: The newly created binary instructions should contain nsw/nuw
-    // flags, which can be found from the original scalar operations.
-    Step = Builder.CreateMul(InitVec, Step);
-    return Builder.CreateAdd(Val, Step, "induction");
-  }
-
-  // Floating point induction.
-  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
-         "Binary Opcode should be specified for FP induction");
-  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
-
-  Step = Builder.CreateVectorSplat(VLen, Step);
-  Value *MulOp = Builder.CreateFMul(InitVec, Step);
-  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
-}
-
 /// A helper function that returns an integer or floating-point constant with
 /// value C.
 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
@@ -2010,104 +1972,6 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
                            : ConstantFP::get(Ty, C);
 }
 
-void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
-  assert(!State.Lane && "Int or FP induction being replicated.");
-
-  Value *Start = getStartValue()->getLiveInIRValue();
-  const InductionDescriptor &ID = getInductionDescriptor();
-  TruncInst *Trunc = getTruncInst();
-  IRBuilderBase &Builder = State.Builder;
-  assert(getPHINode()->getType() == ID.getStartValue()->getType() &&
-         "Types must match");
-  assert(State.VF.isVector() && "must have vector VF");
-
-  // The value from the original loop to which we are mapping the new induction
-  // variable.
-  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : getPHINode();
-
-  // Fast-math-flags propagate from the original induction instruction.
-  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
-    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
-
-  // Now do the actual transformations, and start with fetching the step value.
-  Value *Step = State.get(getStepValue(), VPLane(0));
-
-  assert((isa<PHINode, TruncInst>(EntryVal)) &&
-         "Expected either an induction phi-node or a truncate of it!");
-
-  // Construct the initial value of the vector IV in the vector loop preheader
-  auto CurrIP = Builder.saveIP();
-  BasicBlock *VectorPH =
-      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
-  Builder.SetInsertPoint(VectorPH->getTerminator());
-  if (isa<TruncInst>(EntryVal)) {
-    assert(Start->getType()->isIntegerTy() &&
-           "Truncation requires an integer type");
-    auto *TruncType = cast<IntegerType>(EntryVal->getType());
-    Step = Builder.CreateTrunc(Step, TruncType);
-    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
-  }
-
-  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
-  Value *SteppedStart =
-      ::getStepVector(SplatStart, Step, State.get(getStepVector()),
-                      ID.getInductionOpcode(), State.VF, State.Builder);
-
-  // We create vector phi nodes for both integer and floating-point induction
-  // variables. Here, we determine the kind of arithmetic we will perform.
-  Instruction::BinaryOps AddOp;
-  Instruction::BinaryOps MulOp;
-  if (Step->getType()->isIntegerTy()) {
-    AddOp = Instruction::Add;
-    MulOp = Instruction::Mul;
-  } else {
-    AddOp = ID.getInductionOpcode();
-    MulOp = Instruction::FMul;
-  }
-
-  Value *SplatVF;
-  if (VPValue *SplatVFOperand = getSplatVFValue()) {
-    // The recipe has been unrolled. In that case, fetch the splat value for the
-    // induction increment.
-    SplatVF = State.get(SplatVFOperand);
-  } else {
-    // Multiply the vectorization factor by the step using integer or
-    // floating-point arithmetic as appropriate.
-    Type *StepType = Step->getType();
-    Value *RuntimeVF = State.get(getVFValue(), VPLane(0));
-    if (Step->getType()->isFloatingPointTy())
-      RuntimeVF = Builder.CreateUIToFP(RuntimeVF, StepType);
-    else
-      RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
-    Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
-
-    // Create a vector splat to use in the induction update.
-    SplatVF = Builder.CreateVectorSplat(State.VF, Mul);
-  }
-
-  Builder.restoreIP(CurrIP);
-
-  // We may need to add the step a number of times, depending on the unroll
-  // factor. The last of those goes into the PHI.
-  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
-  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
-  VecInd->setDebugLoc(getDebugLoc());
-  State.set(this, VecInd);
-
-  Instruction *LastInduction = cast<Instruction>(
-      Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next"));
-  LastInduction->setDebugLoc(getDebugLoc());
-
-  VecInd->addIncoming(SteppedStart, VectorPH);
-  // Add induction update using an incorrect block temporarily. The phi node
-  // will be fixed after VPlan execution. Note that at this point the latch
-  // block cannot be used, as it does not exist yet.
-  // TODO: Model increment value in VPlan, by turning the recipe into a
-  // multi-def and a subclass of VPHeaderPHIRecipe.
-  VecInd->addIncoming(LastInduction, VectorPH);
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
@@ -3871,12 +3735,14 @@ void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
 #endif
 
 void VPWidenPHIRecipe::execute(VPTransformState &State) {
-  assert(EnableVPlanNativePath &&
-         "Non-native vplans are not expected to have VPWidenPHIRecipes.");
-
   Value *Op0 = State.get(getOperand(0));
   Type *VecTy = Op0->getType();
-  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
+  Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
+  // Manually move it with the other PHIs in case PHI recipes above this one
+  // also inserted non-phi instructions.
+  // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in
+  // convertToConcreteRecipes.
+  VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt());
   State.set(this, VecPhi);
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 05a0e15f9a199..11f0f2a930329 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1358,17 +1358,6 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
     WideIV->setStartValue(NewStart);
     auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
     WideIV->setStepValue(NewStep);
-    // TODO: Remove once VPWidenIntOrFpInductionRecipe is fully expanded.
-    VPInstructionWithType *OldStepVector = WideIV->getStepVector();
-    assert(OldStepVector->getNumUsers() == 1 &&
-           "step vector should only be used by single "
-           "VPWidenIntOrFpInductionRecipe");
-    auto *NewStepVector =
-        new VPInstructionWithType(VPInstruction::StepVector, {}, NewIVTy, {},
-                                  OldStepVector->getDebugLoc());
-    NewStepVector->insertAfter(OldStepVector->getDefiningRecipe());
-    OldStepVector->replaceAllUsesWith(NewStepVector);
-    OldStepVector->eraseFromParent();
 
     auto *NewBTC = new VPWidenCastRecipe(
         Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
@@ -2518,6 +2507,127 @@ void VPlanTransforms::createInterleaveGroups(
   }
 }
 
+/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
+/// value, phi and backedge value. In the following example:
+///
+///  vector.ph:
+///  Successor(s): vector loop
+///
+///  <x1> vector loop: {
+///    vector.body:
+///      WIDEN-INDUCTION %i = phi %start, %step, %vf
+///      ...
+///      EMIT branch-on-count ...
+///    No successors
+///  }
+///
+/// WIDEN-INDUCTION will get expanded to:
+///
+///  vector.ph:
+///    ...
+///    vp<%induction.start> = ...
+///    vp<%induction.increment> = ...
+///
+///  Successor(s): vector loop
+///
+///  <x1> vector loop: {
+///    vector.body:
+///      ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
+///      ...
+///      vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
+///      EMIT branch-on-count ...
+///    No successors
+///  }
+static void
+expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
+                              VPTypeAnalysis &TypeInfo) {
+  VPlan *Plan = WidenIVR->getParent()->getPlan();
+  VPValue *Start = WidenIVR->getStartValue();
+  VPValue *Step = WidenIVR->getStepValue();
+  VPValue *VF = WidenIVR->getVFValue();
+  DebugLoc DL = WidenIVR->getDebugLoc();
+
+  // The value from the original loop to which we are mapping the new induction
+  // variable.
+  Type *Ty = TypeInfo.inferScalarType(WidenIVR);
+
+  const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
+  Instruction::BinaryOps AddOp;
+  Instruction::BinaryOps MulOp;
+  // FIXME: The newly created binary instructions should contain nsw/nuw
+  // flags, which can be found from the original scalar operations.
+  VPIRFlags Flags;
+  if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
+    AddOp = Instruction::Add;
+    MulOp = Instruction::Mul;
+  } else {
+    AddOp = ID.getInductionOpcode();
+    MulOp = Instruction::FMul;
+    Flags = ID.getInductionBinOp()->getFastMathFlags();
+  }
+
+  // If the phi is truncated, truncate the start and step values.
+  VPBuilder Builder(Plan->getVectorPreheader());
+  Type *StepTy = TypeInfo.inferScalarType(Step);
+  if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
+    assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
+    Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
+    Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
+    StepTy = Ty;
+  }
+
+  // Construct the initial value of the vector IV in the vector loop preheader.
+  Type *IVIntTy =
+      IntegerType::get(StepTy->getContext(), StepTy->getScalarSizeInBits());
+  VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
+  if (StepTy->isFloatingPointTy())
+    Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
+
+  VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
+  VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
+
+  Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
+  Init =
+      Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags, {}, "induction");
+
+  // Create the widened phi of the vector IV.
+  auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), nullptr,
+                                       WidenIVR->getDebugLoc(), "vec.ind");
+  WidePHI->addOperand(Init);
+  WidePHI->insertBefore(WidenIVR);
+
+  // Create the backedge value for the vector IV.
+  VPValue *Inc;
+  VPValue *Prev;
+  // If unrolled, use the increment and prev value from the operands.
+  if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
+    Inc = SplatVF;
+    Prev = WidenIVR->getLastUnrolledPartOperand();
+  } else {
+    // Multiply the vectorization factor by the step using integer or
+    // floating-point arithmetic as appropriate.
+    if (StepTy->isFloatingPointTy())
+      VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
+                                    DL);
+    else
+      VF =
+          Builder.createScalarCast(Instruction::CastOps::Trunc, VF, StepTy, DL);
+
+    Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
+    Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
+    Prev = WidePHI;
+  }
+
+  VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
+  Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
+  auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
+                                    WidenIVR->getDebugLoc(), "vec.ind.next");
+
+  WidePHI->addOperand(Next);
+
+  WidenIVR->replaceAllUsesWith(WidePHI);
+}
+
 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
   // Replace loop regions with explicity CFG.
   SmallVector<VPRegionBlock *> LoopRegions;
@@ -2625,6 +2735,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
         continue;
       }
 
+      if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+        expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
+        ToRemove.push_back(WidenIVR);
+        continue;
+      }
+
       VPValue *VectorStep;
       VPValue *ScalarStep;
       if (!match(&R, m_VPInstruction<VPInstruction::WideIVStep>(
@@ -2935,27 +3051,6 @@ void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
   }
 }
 
-void VPlanTransforms::materializeStepVectors(VPlan &Plan) {
-  for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
-    auto *IVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
-    if (!IVR)
-      continue;
-
-    Type *Ty = IVR->getPHINode()->getType();
-    if (TruncInst *Trunc = IVR->getTruncInst())
-      Ty = Trunc->getType();
-    if (Ty->isFloatingPointTy())
-      Ty = IntegerType::get(Ty->getContext(), Ty->getScalarSizeInBits());
-
-    VPBuilder Builder(Plan.getVectorPreheader());
-    VPInstruction *StepVector = Builder.createNaryOp(
-        VPInstruction::StepVector, {}, Ty, {}, IVR->getDebugLoc());
-    assert(IVR->getNumOperands() == 3 &&
-           "can only add step vector before unrolling");
-    IVR->addOperand(StepVector);
-  }
-}
-
 void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
   if (Plan.hasScalarVFOnly())
     return;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5a03bdb7c6882..7e51c05d1b5b5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -209,11 +209,6 @@ struct VPlanTransforms {
   optimizeInductionExitUsers(VPlan &Plan,
                              DenseMap<VPValue *, VPValue *> &EndValues);
 
-  /// Materialize VPInstruction::StepVectors for VPWidenIntOrFpInductionRecipes.
-  /// TODO: Remove once all of VPWidenIntOrFpInductionRecipe is expanded in
-  /// convertToConcreteRecipes.
-  static void materializeStepVectors(VPlan &Plan);
-
   /// Add explicit broadcasts for live-ins and VPValues defined in \p Plan's entry block if they are used as vectors.
   static void materializeBroadcasts(VPlan &Plan);
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index ae7719757dc30..24c703ae42f0a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -16,9 +16,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 1, [[TMP6]]
@@ -36,8 +36,8 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP:%.*]]
@@ -100,9 +100,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 8
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 1, [[TMP6]]
@@ -120,8 +120,8 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
 ; CHECK-NEXT:    call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP14]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index f36161703dba5..976f95ff4f0ba 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -862,8 +862,8 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; DEFAULT-NEXT:    store i8 [[TMP33]], ptr [[TMP32]], align 1
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE14]]:
-; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; DEFAULT-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[EXIT:.*]]
@@ -964,8 +964,8 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
 ; PRED-NEXT:    store i8 [[TMP33]], ptr [[TMP32]], align 1
 ; PRED-NEXT:    br label %[[PRED_STORE_CONTINUE14]]
 ; PRED:       [[PRED_STORE_CONTINUE14]]:
-; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; PRED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; PRED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
 ; PRED-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; PRED:       [[MIDDLE_BLOCK]]:
 ; PRED-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index 4775a6ec3f917..d59607711b5bf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -123,9 +123,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[M]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul <vscale x 2 x i64> [[TMP15]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP17]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 1, [[TMP9]]
@@ -246,9 +246,9 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]])
-; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; CHECK-NEXT:    [[TMP17:%.*]] = mul <vscale x 2 x i64> [[TMP15]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP17]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 1, [[TMP9]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index 5508a65744c6b..895781de31f33 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -87,12 +87,12 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
 ; CHECK-NEXT:    [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]]
 ; CHECK-NEXT:    br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX18:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX18]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV1]]
 ; CHECK-NEXT:    [[L:%.*]] = load i8, ptr [[GEP1]], align 8
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[L]], 3
@@ -184,16 +184,16 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT9]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT10]], zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32
-; CHECK-NEXT:    [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT14:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT13]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT14]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
 ; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT14]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP14]] = select <4 x i1> [[TMP11]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]]
 ; CHECK-NEXT:    [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4
@@ -207,12 +207,12 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
 ; CHECK-NEXT:    [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]]
 ; CHECK-NEXT:    br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL21:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX22:%.*]] = phi i32 [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL23:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX24:%.*]] = phi i32 [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL21]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX22]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL23]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX24]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[START]], 0
 ; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
 ; CHECK-NEXT:    [[RED_NEXT]] = select i1 [[C]], i32 [[IV_TRUNC]], i32 [[RED]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
index 4d3afd71921d6..e4718dc216358 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll
@@ -394,9 +394,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; DEFAULT-NEXT:    store i8 [[TMP71]], ptr [[TMP70]], align 1
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE36]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE36]]:
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
@@ -514,13 +514,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; DEFAULT-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; DEFAULT-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; DEFAULT-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15)
-; DEFAULT-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[B]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; DEFAULT-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
 ; DEFAULT-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; DEFAULT-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; DEFAULT-NEXT:    [[TMP11:%.*]] = mul <vscale x 16 x i8> [[TMP10]], splat (i8 1)
 ; DEFAULT-NEXT:    [[INDUCTION:%.*]] = add <vscale x 16 x i8> zeroinitializer, [[TMP11]]
 ; DEFAULT-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8
@@ -590,13 +590,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; OPTSIZE-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; OPTSIZE-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; OPTSIZE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15)
-; OPTSIZE-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A]], i64 0
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[B]], i64 0
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
 ; OPTSIZE-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; OPTSIZE-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; OPTSIZE-NEXT:    [[TMP11:%.*]] = mul <vscale x 16 x i8> [[TMP10]], splat (i8 1)
 ; OPTSIZE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 16 x i8> zeroinitializer, [[TMP11]]
 ; OPTSIZE-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8
@@ -666,13 +666,13 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32
 ; MINSIZE-NEXT:    [[TMP8:%.*]] = icmp ugt i64 15, [[TMP6]]
 ; MINSIZE-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; MINSIZE-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 15)
-; MINSIZE-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; MINSIZE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[A]], i64 0
 ; MINSIZE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; MINSIZE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[B]], i64 0
 ; MINSIZE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
 ; MINSIZE-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[C]], i64 0
 ; MINSIZE-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+; MINSIZE-NEXT:    [[TMP10:%.*]] = call <vscale x 16 x i8> @llvm.stepvector.nxv16i8()
 ; MINSIZE-NEXT:    [[TMP11:%.*]] = mul <vscale x 16 x i8> [[TMP10]], splat (i8 1)
 ; MINSIZE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 16 x i8> zeroinitializer, [[TMP11]]
 ; MINSIZE-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP4]] to i8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 74b0c2c0e033a..d02d03b4b437d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -120,8 +120,8 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 49584bd47353d..f44744071ae58 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -20,11 +20,11 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i7> 
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i7>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i7> 
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i7> [[TMP7]], splat (i7 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i7> zeroinitializer, [[TMP9]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
@@ -85,11 +85,11 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
-; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP40]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT_:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = trunc <vscale x 2 x i64> [[DOTSPLAT_]] to <vscale x 2 x i3>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = mul <vscale x 2 x i3> [[TMP7]], splat (i3 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i3> zeroinitializer, [[TMP9]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 77e713256d247..7e4edf739695a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -101,11 +101,11 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
@@ -185,11 +185,11 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
@@ -579,9 +579,9 @@ define void @load_gap_reverse(ptr noalias nocapture readonly %P1, ptr noalias no
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = sub <vscale x 4 x i64> splat (i64 1023), [[TMP2]]
 ; CHECK-NEXT:    [[DOTNEG:%.*]] = sub nsw i64 0, [[TMP1]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[DOTNEG]], i64 0
@@ -809,9 +809,9 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -958,9 +958,9 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1113,13 +1113,13 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl <vscale x 4 x i64> [[TMP10]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
@@ -1191,13 +1191,13 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[IND_END:%.*]] = or disjoint i64 [[TMP11]], 3
-; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Y:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP21:%.*]] = shl <vscale x 4 x i64> [[TMP10]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[TMP21]], splat (i64 3)
 ; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i64 [[TMP7]], 3
@@ -1284,14 +1284,14 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
 ; CHECK-NEXT:    [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
 ; CHECK-NEXT:    [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2
-; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1
-; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = shl <vscale x 4 x i64> [[TMP14]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP17:%.*]] = shl nuw nsw i64 [[TMP9]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP17]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP16:%.*]] = shl nuw nsw i32 [[TMP33]], 2
+; CHECK-NEXT:    [[TMP34:%.*]] = add nsw i32 [[TMP16]], -1
+; CHECK-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP34]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index 3567aff0ace4e..bd2bd5aa27952 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -37,11 +37,11 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -60,10 +60,10 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP16]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
@@ -83,11 +83,11 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -108,11 +108,11 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -182,11 +182,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -201,7 +201,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
 ; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
@@ -221,11 +221,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -243,7 +243,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -309,13 +309,13 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -331,7 +331,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
 ; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP11]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
@@ -352,13 +352,13 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -378,7 +378,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -456,11 +456,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -483,10 +483,10 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
@@ -506,11 +506,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -535,11 +535,11 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK1:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index 022789ad9de70..fea57fa8b6b68 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -107,8 +107,8 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
 ; VF4-NEXT:    store i64 [[TMP32]], ptr [[TMP31]], align 8
 ; VF4-NEXT:    br label %[[PRED_STORE_CONTINUE6]]
 ; VF4:       [[PRED_STORE_CONTINUE6]]:
-; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; VF4-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; VF4-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF4:       [[MIDDLE_BLOCK]]:
 ; VF4-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
index a7a7b1af5953b..1d898fbaaed36 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
@@ -390,9 +390,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; DEFAULT-NEXT:    store i8 [[TMP71]], ptr [[TMP70]], align 1
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE36]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE36]]:
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
 ; DEFAULT-NEXT:    [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT:       [[MIDDLE_BLOCK]]:
 ; DEFAULT-NEXT:    br label %[[FOR_COND_CLEANUP:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index b7c9612e57aec..79425ae3a67ec 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -21,11 +21,11 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_EPILOGUE:       vector.body:
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -38,8 +38,8 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1)
 ; SCALAR_EPILOGUE-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP12]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
 ; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP15]]
 ; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP14]], <vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP7]])
@@ -48,7 +48,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP18]]
 ; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP17]], <vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
@@ -69,11 +69,11 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -88,8 +88,8 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP6]])
@@ -98,7 +98,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
@@ -190,11 +190,11 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_EPILOGUE-NEXT:    [[TMP5:%.*]] = shl i32 [[TMP4]], 4
-; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
-; SCALAR_EPILOGUE-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; SCALAR_EPILOGUE-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_EPILOGUE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_EPILOGUE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_EPILOGUE:       vector.body:
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -209,16 +209,16 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
 ; SCALAR_EPILOGUE-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP14]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
 ; SCALAR_EPILOGUE-NEXT:    [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP16]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
 ; SCALAR_EPILOGUE-NEXT:    [[TMP18:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP18]]
-; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
-; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; SCALAR_EPILOGUE-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP19]], i32 1, <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i8> poison)
+; SCALAR_EPILOGUE-NEXT:    [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
 ; SCALAR_EPILOGUE-NEXT:    [[TMP21:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP20]]
-; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; SCALAR_EPILOGUE-NEXT:    [[TMP22:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
 ; SCALAR_EPILOGUE-NEXT:    [[TMP23:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP22]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP24:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
 ; SCALAR_EPILOGUE-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP24]]
@@ -233,7 +233,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; SCALAR_EPILOGUE-NEXT:    [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP30]]
 ; SCALAR_EPILOGUE-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP23]], <vscale x 16 x ptr> [[TMP31]], i32 1, <vscale x 16 x i1> [[TMP7]])
 ; SCALAR_EPILOGUE-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
-; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; SCALAR_EPILOGUE-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; SCALAR_EPILOGUE-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALAR_EPILOGUE-NEXT:    br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_EPILOGUE:       middle.block:
@@ -254,11 +254,11 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -275,16 +275,16 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP15]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER4:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP16]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER1]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[TMP18]], i32 1, <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP19]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER2]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER4]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER5]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP22:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP21]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP23:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP23]]
@@ -299,7 +299,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP29]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP22]], <vscale x 16 x ptr> [[TMP30]], i32 1, <vscale x 16 x i1> [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
index c64f6df075a04..3e4d337c0706c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -22,9 +22,9 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) {
 ; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; VLENUNK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; VLENUNK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
 ; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; VLENUNK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; VLENUNK-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 1)
 ; VLENUNK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP8]]
 ; VLENUNK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP5]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
index 88d9ed2ce201e..2f9ff20bf0f98 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll
@@ -20,13 +20,13 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1001, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 8
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[B]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[C]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP6]], splat (i64 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP5]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index 3dc17e615048e..51a8b451dffd9 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -137,8 +137,8 @@ define void @test(ptr %p, i64 %a, i8 %b) {
 ; CHECK-NEXT:    store i8 [[TMP40]], ptr [[P]], align 1
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue32:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i32> [[VEC_IND]], splat (i32 16)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[EXIT1:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index f89a863d1e5f5..79590f5060ad4 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -553,9 +553,9 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; STRIDED-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; STRIDED-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; STRIDED-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; STRIDED-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[STRIDE]], i64 0
 ; STRIDED-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; STRIDED-NEXT:    [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; STRIDED-NEXT:    [[TMP14:%.*]] = mul <vscale x 4 x i64> [[TMP12]], splat (i64 1)
 ; STRIDED-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
 ; STRIDED-NEXT:    [[TMP17:%.*]] = mul i64 1, [[TMP11]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 70c04ded5cf57..827612cfe36d5 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -325,9 +325,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i64> [[TMP6]], splat (i64 1)
 ; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP7]]
 ; SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
@@ -432,9 +432,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[B]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; TF-SCALABLE-NEXT:    [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], splat (i64 1)
 ; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP4]]
@@ -996,11 +996,11 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
 ; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP4]]
@@ -1127,11 +1127,11 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
 ; SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; SCALABLE-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; SCALABLE-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i64> [[TMP6]], splat (i64 1)
 ; SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP8]]
 ; SCALABLE-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP5]]
@@ -1233,11 +1233,11 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; TF-SCALABLE-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 2
-; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT1:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[B]], i64 0
 ; TF-SCALABLE-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
+; TF-SCALABLE-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
 ; TF-SCALABLE-NEXT:    [[TMP7:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 1)
 ; TF-SCALABLE-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP7]]
 ; TF-SCALABLE-NEXT:    [[TMP8:%.*]] = mul i64 1, [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
index 427123cfca6d4..cd246053bcb30 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll
@@ -29,17 +29,17 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
 ; IF-EVL-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; IF-EVL-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
-; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
 ; IF-EVL-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1)
 ; IF-EVL-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
-; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
-; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; IF-EVL:       vector.body:
 ; IF-EVL-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; IF-EVL-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IF-EVL-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
+; IF-EVL-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; IF-EVL-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = icmp ule <vscale x 4 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
index 7d9ed7d6215c5..05a495d51c458 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -110,8 +110,8 @@ define void @redundant_or_1(ptr %dst, i1 %c.0, i1 %c.1) {
 ; CHECK-NEXT:    store i32 0, ptr [[TMP17]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
@@ -214,8 +214,8 @@ define void @redundant_or_2(ptr %dst, i1 %c.0, i1 %c.1) {
 ; CHECK-NEXT:    store i32 0, ptr [[TMP16]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
 ; CHECK:       pred.store.continue8:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
index 38b58fbfd1021..53fd2ed43972c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll
@@ -39,8 +39,8 @@ define void @drop_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -91,8 +91,8 @@ define void @drop_scalar_gep_nusw(ptr noalias nocapture readonly %input, ptr %ou
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr nusw float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr nusw float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -143,8 +143,8 @@ define void @drop_scalar_gep_nuw(ptr noalias nocapture readonly %input, ptr %out
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr nuw float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr nuw float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -196,8 +196,8 @@ define void @drop_nonpred_scalar_nuw_nsw(ptr noalias nocapture readonly %input,
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -248,8 +248,8 @@ define void @preserve_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -305,8 +305,8 @@ define void @drop_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -356,8 +356,8 @@ define void @preserve_nuw_nsw_no_addr(ptr %output) local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -409,8 +409,8 @@ define void @drop_scalar_exact(ptr noalias nocapture readonly %input, ptr %outpu
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP9]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -518,8 +518,8 @@ define void @preserve_vector_exact_no_addr(ptr noalias nocapture readonly %input
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
 ; CHECK-NEXT:    store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -570,8 +570,8 @@ define void @preserve_exact_no_addr(ptr %output) local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[OUTPUT]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
 ; CHECK-NEXT:    store <4 x i64> [[PREDPHI]], ptr [[TMP3]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -698,8 +698,8 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
 ; CHECK-NEXT:    store <4 x i8> [[PREDPHI]], ptr [[TMP16]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
@@ -778,8 +778,8 @@ define void @recipe_without_underlying_instr_lanes_used(i64 %n, ptr noalias %dst
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0
 ; CHECK-NEXT:    store <4 x i8> [[PREDPHI]], ptr [[TMP11]], align 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 7aeb32afe43be..0a85548f8750b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -411,9 +411,9 @@ define i16 @iv_and_step_trunc() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i16>
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i16> [[VEC_IND1]], [[TMP1]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], splat (i16 2)
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index 4fbee321b6a48..7f2544ddf149d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -519,8 +519,8 @@ define void @interleave_store_double_i64(ptr %dst) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> zeroinitializer, <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -640,8 +640,8 @@ define void @interleave_store_i64_double_2(ptr %dst) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
 ; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
index 6480c0ab1099d..02d48cbda1aab 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -71,8 +71,8 @@
 ; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
 
 ; AVX: [[ForInc]]:
-; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], splat (i64 8)
 ; AVX: %[[IndNext]] = add nuw i64 %[[Ind]], 8
+; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], splat (i64 8)
 ; AVX: br i1 true, label %middle.block, label %vector.body
 
 @arr2 = external global [8 x i32], align 16
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 4038ace617c17..99650592d2dea 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -131,7 +131,7 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[IND_END43:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[TMP34:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
 ; CHECK-NEXT:    br label [[VECTOR_BODY29:%.*]]
-; CHECK:       vector.body28:
+; CHECK:       vector.body30:
 ; CHECK-NEXT:    [[INDEX34:%.*]] = phi i64 [ 0, [[VECTOR_PH25]] ], [ [[INDEX_NEXT39:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[VEC_IND35:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH25]] ], [ [[VEC_IND_NEXT36:%.*]], [[VECTOR_BODY29]] ]
 ; CHECK-NEXT:    [[VEC_IND37:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH25]] ], [ [[VEC_IND_NEXT38:%.*]], [[VECTOR_BODY29]] ]
@@ -153,18 +153,18 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT36]] = add <16 x i64> [[VEC_IND35]], splat (i64 32)
 ; CHECK-NEXT:    [[VEC_IND_NEXT38]] = add <16 x i64> [[VEC_IND37]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT39]], [[N_VEC32]]
-; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK35:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK:       middle.block35:
+; CHECK-NEXT:    br i1 [[TMP41]], label [[MIDDLE_BLOCK37:%.*]], label [[VECTOR_BODY29]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block37:
 ; CHECK-NEXT:    [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    br i1 [[CMP_N40]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK43:%.*]]
-; CHECK:       vec.epilog.iter.check42:
+; CHECK:       vec.epilog.iter.check44:
 ; CHECK-NEXT:    [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[IND_END55:%.*]] = add i64 8, [[TMP42]]
 ; CHECK-NEXT:    [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
 ; CHECK-NEXT:    [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]]
 ; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8
 ; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK50]], label [[VEC_EPILOG_SCALAR_PH40]], label [[VEC_EPILOG_PH42]]
-; CHECK:       vec.epilog.ph41:
+; CHECK:       vec.epilog.ph43:
 ; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL42:%.*]] = phi i64 [ [[IND_END41]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL44:%.*]] = phi i64 [ [[IND_END43]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK24]] ]
@@ -183,7 +183,7 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
 ; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY49:%.*]]
-; CHECK:       vec.epilog.vector.body49:
+; CHECK:       vec.epilog.vector.body57:
 ; CHECK-NEXT:    [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
 ; CHECK-NEXT:    [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
 ; CHECK-NEXT:    [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
@@ -206,10 +206,10 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
 ; CHECK-NEXT:    [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
 ; CHECK-NEXT:    br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY49]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK:       vec.epilog.middle.block62:
+; CHECK:       vec.epilog.middle.block64:
 ; CHECK-NEXT:    [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
 ; CHECK-NEXT:    br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH40]]
-; CHECK:       vec.epilog.scalar.ph40:
+; CHECK:       vec.epilog.scalar.ph42:
 ; CHECK-NEXT:    [[BC_RESUME_VAL56:%.*]] = phi i64 [ [[IND_END54]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END55]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 8, [[ITER_CHECK22]] ]
 ; CHECK-NEXT:    [[BC_RESUME_VAL67:%.*]] = phi i64 [ [[IND_END57]], [[VEC_EPILOG_MIDDLE_BLOCK40]] ], [ [[IND_END58]], [[VEC_EPILOG_ITER_CHECK43]] ], [ 0, [[ITER_CHECK22]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index 1365e9f73d854..6e62ff842c6d1 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -245,8 +245,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT:    [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT14:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
-; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0
+; CHECK-NEXT:    [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT6]], <4 x i1> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 9be26d4247a36..7684d274a75cf 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -870,7 +870,7 @@ for.end:
 ; }
 ;
 ;
-define i32 @PR27246(ptr %dst) {
+define i32 @PR27246() {
 ; UNROLL-NO-IC-LABEL: @PR27246(
 ; UNROLL-NO-IC-NEXT:  entry:
 ; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
@@ -882,8 +882,7 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC:       vector.ph:
 ; UNROLL-NO-IC-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 8
 ; UNROLL-NO-IC-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
+; UNROLL-NO-IC-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; UNROLL-NO-IC-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; UNROLL-NO-IC-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
@@ -892,13 +891,10 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
-; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 -4)
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-IC:       middle.block:
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
 ; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
@@ -906,21 +902,19 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-IC-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-IC:       scalar.ph:
 ; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-IC-NEXT:    br label [[FOR_COND2:%.*]]
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-IC:       for.cond.cleanup:
 ; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; UNROLL-NO-IC-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; UNROLL-NO-IC:       for.cond1:
-; UNROLL-NO-IC-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-IC-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
+; UNROLL-NO-IC-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; UNROLL-NO-IC-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-IC-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; UNROLL-NO-IC-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; UNROLL-NO-IC-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
-; UNROLL-NO-IC-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-IC-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-IC:       for.cond.cleanup3:
-; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-IC-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; UNROLL-NO-IC-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; UNROLL-NO-IC-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -936,38 +930,33 @@ define i32 @PR27246(ptr %dst) {
 ; UNROLL-NO-VF:       vector.ph:
 ; UNROLL-NO-VF-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 2
 ; UNROLL-NO-VF-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
+; UNROLL-NO-VF-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; UNROLL-NO-VF:       vector.body:
-; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]]
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
-; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP2]], ptr [[TMP1]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[I_016]], [[N_VEC]]
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; UNROLL-NO-VF-NEXT:    br label [[FOR_COND2:%.*]]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP1]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND1:%.*]]
 ; UNROLL-NO-VF:       for.cond.cleanup:
 ; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; UNROLL-NO-VF:       for.cond1:
-; UNROLL-NO-VF-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; UNROLL-NO-VF-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
+; UNROLL-NO-VF-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; UNROLL-NO-VF-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; UNROLL-NO-VF-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; UNROLL-NO-VF-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; UNROLL-NO-VF-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
-; UNROLL-NO-VF-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; UNROLL-NO-VF:       for.cond.cleanup3:
-; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[OFFSET_IDX]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; UNROLL-NO-VF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; UNROLL-NO-VF-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -983,24 +972,18 @@ define i32 @PR27246(ptr %dst) {
 ; SINK-AFTER:       vector.ph:
 ; SINK-AFTER-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[I_016]], 4
 ; SINK-AFTER-NEXT:    [[N_VEC:%.*]] = sub i32 [[I_016]], [[N_MOD_VF]]
-; SINK-AFTER-NEXT:    [[TMP0:%.*]] = sub i32 [[I_016]], [[N_VEC]]
-; SINK-AFTER-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[I_016]]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[E_015]], i32 3
+; SINK-AFTER-NEXT:    [[IND_END:%.*]] = sub i32 [[I_016]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[I_016]], i64 0
 ; SINK-AFTER-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; SINK-AFTER-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
-; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SINK-AFTER:       vector.body:
-; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND1]] ]
-; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[VEC_IND:%.*]], [[FOR_COND1]] ]
-; SINK-AFTER-NEXT:    [[VEC_IND]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND1]] ]
-; SINK-AFTER-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; SINK-AFTER-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    store i32 [[TMP3]], ptr [[TMP1]], align 4
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 -4)
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SINK-AFTER-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND1]], !llvm.loop [[LOOP8:![0-9]+]]
+; SINK-AFTER-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SINK-AFTER-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SINK-AFTER:       middle.block:
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
 ; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
@@ -1008,21 +991,19 @@ define i32 @PR27246(ptr %dst) {
 ; SINK-AFTER-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]]
 ; SINK-AFTER:       scalar.ph:
 ; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[E_015]], [[FOR_COND1_PREHEADER]] ]
-; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
-; SINK-AFTER-NEXT:    br label [[FOR_COND2:%.*]]
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ]
+; SINK-AFTER-NEXT:    br label [[FOR_COND1:%.*]]
 ; SINK-AFTER:       for.cond.cleanup:
 ; SINK-AFTER-NEXT:    [[E_1_LCSSA_LCSSA:%.*]] = phi i32 [ [[E_1_LCSSA]], [[FOR_COND_CLEANUP3]] ]
 ; SINK-AFTER-NEXT:    ret i32 [[E_1_LCSSA_LCSSA]]
 ; SINK-AFTER:       for.cond1:
-; SINK-AFTER-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND2]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND2]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; SINK-AFTER-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_016]]
+; SINK-AFTER-NEXT:    [[E_1:%.*]] = phi i32 [ [[K_0:%.*]], [[FOR_COND1]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
+; SINK-AFTER-NEXT:    [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; SINK-AFTER-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1
 ; SINK-AFTER-NEXT:    [[DEC]] = add nsw i32 [[K_0]], -1
-; SINK-AFTER-NEXT:    store i32 [[E_1]], ptr [[GEP_DST]], align 4
-; SINK-AFTER-NEXT:    br i1 [[CMP2]], label [[FOR_COND2]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
+; SINK-AFTER-NEXT:    br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]]
 ; SINK-AFTER:       for.cond.cleanup3:
-; SINK-AFTER-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND2]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT:    [[E_1_LCSSA]] = phi i32 [ [[E_1]], [[FOR_COND1]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
 ; SINK-AFTER-NEXT:    [[INC]] = add nuw nsw i32 [[I_016]], 1
 ; SINK-AFTER-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49
 ; SINK-AFTER-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
@@ -1042,10 +1023,8 @@ for.cond.cleanup:
 for.cond1:
   %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
   %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
-  %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %i.016
   %cmp2 = icmp sgt i32 %k.0, 1
   %dec = add nsw i32 %k.0, -1
-  store i32 %e.1, ptr %gep.dst
   br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
 
 for.cond.cleanup3:
@@ -1072,22 +1051,22 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; UNROLL-NO-IC:       vector.body:
 ; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 8
-; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 10
-; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 12
-; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 14
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 8
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 10
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 12
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 14
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2
-; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; UNROLL-NO-IC-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; UNROLL-NO-IC-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
 ; UNROLL-NO-IC-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4
-; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; UNROLL-NO-IC-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; UNROLL-NO-IC-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1169,22 +1148,22 @@ define i32 @PR30183(i32 %pre_load, ptr %a, ptr %b, i64 %n) {
 ; SINK-AFTER:       vector.body:
 ; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0
-; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 2
-; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 4
-; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 6
+; SINK-AFTER-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 2
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 4
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 6
+; SINK-AFTER-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP3]], 2
 ; SINK-AFTER-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP4]], 2
 ; SINK-AFTER-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP5]], 2
 ; SINK-AFTER-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 2
-; SINK-AFTER-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP7]], 2
-; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP8]]
+; SINK-AFTER-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP7]]
+; SINK-AFTER-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
 ; SINK-AFTER-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
 ; SINK-AFTER-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; SINK-AFTER-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; SINK-AFTER-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 4
 ; SINK-AFTER-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP12]], align 4
-; SINK-AFTER-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP13]], align 4
-; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP14]], align 4
-; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP15]], align 4
+; SINK-AFTER-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP13]], align 4
+; SINK-AFTER-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP14]], align 4
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SINK-AFTER-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -1372,27 +1351,27 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; UNROLL-NO-VF:       vector.body:
 ; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i32 [[VECTOR_RECUR]], 1
-; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i32 [[VECTOR_RECUR]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], [[X]]
 ; UNROLL-NO-VF-NEXT:    [[TMP3]] = add nuw i32 [[VECTOR_RECUR]], 2
-; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 96
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP3]], 96
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; UNROLL-NO-VF:       middle.block:
 ; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; UNROLL-NO-VF:       scalar.ph:
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT1:%.*]] = phi i32 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; UNROLL-NO-VF-NEXT:    br label [[FOR_BODY:%.*]]
 ; UNROLL-NO-VF:       for.body:
+; UNROLL-NO-VF-NEXT:    [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; UNROLL-NO-VF-NEXT:    [[VAL_PHI:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[ADDX:%.*]], [[FOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[VAL_PHI1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT1]], [[SCALAR_PH]] ], [ [[ADDX1:%.*]], [[FOR_BODY]] ]
-; UNROLL-NO-VF-NEXT:    [[ADDX]] = add i32 [[VAL_PHI]], 1
-; UNROLL-NO-VF-NEXT:    [[BC:%.*]] = zext i32 [[VAL_PHI]] to i64
-; UNROLL-NO-VF-NEXT:    [[ADDX1]] = add i32 [[VAL_PHI]], [[X]]
-; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[VAL_PHI]], 95
+; UNROLL-NO-VF-NEXT:    [[INC]] = add i32 [[INC_PHI]], 1
+; UNROLL-NO-VF-NEXT:    [[BC:%.*]] = zext i32 [[INC_PHI]] to i64
+; UNROLL-NO-VF-NEXT:    [[ADDX]] = add i32 [[INC_PHI]], [[X]]
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
 ; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; UNROLL-NO-VF:       for.end:
-; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI1]], [[FOR_BODY]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[VAL_PHI]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
 ; UNROLL-NO-VF-NEXT:    ret i32 [[VAL_PHI_LCSSA]]
 ;
 ; SINK-AFTER-LABEL: @extract_second_last_iteration(
@@ -2473,7 +2452,178 @@ for.end12.loopexit:                               ; preds = %cond.end
   ret void
 }
 
-define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
+; Dead instructions, like the exit condition are not part of the actual VPlan
+; and do not need to be sunk. PR44634.
+define void @sink_dead_inst(ptr %a) {
+; UNROLL-NO-IC-LABEL: @sink_dead_inst(
+; UNROLL-NO-IC-NEXT:  entry:
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = add <4 x i16> [[TMP1]], splat (i16 5)
+; UNROLL-NO-IC-NEXT:    [[TMP5]] = add <4 x i16> [[TMP2]], splat (i16 5)
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP8:%.*]] = sub <4 x i16> [[TMP6]], splat (i16 10)
+; UNROLL-NO-IC-NEXT:    [[TMP9:%.*]] = sub <4 x i16> [[TMP7]], splat (i16 10)
+; UNROLL-NO-IC-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
+; UNROLL-NO-IC-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[TMP10]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[TMP10]], i32 4
+; UNROLL-NO-IC-NEXT:    store <4 x i16> [[TMP8]], ptr [[TMP11]], align 2
+; UNROLL-NO-IC-NEXT:    store <4 x i16> [[TMP9]], ptr [[TMP12]], align 2
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-IC:       middle.block:
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP5]], i32 3
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC:       scalar.ph:
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
+; UNROLL-NO-IC:       for.cond:
+; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
+; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-IC-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
+; UNROLL-NO-IC-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; UNROLL-NO-IC-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
+; UNROLL-NO-IC-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
+; UNROLL-NO-IC-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; UNROLL-NO-IC:       for.end:
+; UNROLL-NO-IC-NEXT:    ret void
+;
+; UNROLL-NO-VF-LABEL: @sink_dead_inst(
+; UNROLL-NO-VF-NEXT:  entry:
+; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF:       vector.ph:
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF:       vector.body:
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = add i16 [[TMP2]], 5
+; UNROLL-NO-VF-NEXT:    [[TMP6]] = add i16 [[TMP3]], 5
+; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = sub i16 [[VECTOR_RECUR]], 10
+; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = sub i16 [[TMP5]], 10
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[A]], i16 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    store i16 [[TMP7]], ptr [[TMP9]], align 2
+; UNROLL-NO-VF-NEXT:    store i16 [[TMP8]], ptr [[TMP10]], align 2
+; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-VF:       middle.block:
+; UNROLL-NO-VF-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF:       scalar.ph:
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 15, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT2:%.*]] = phi i32 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND:%.*]]
+; UNROLL-NO-VF:       for.cond:
+; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT2]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
+; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-VF-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
+; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; UNROLL-NO-VF-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
+; UNROLL-NO-VF-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; UNROLL-NO-VF:       for.end:
+; UNROLL-NO-VF-NEXT:    ret void
+;
+; SINK-AFTER-LABEL: @sink_dead_inst(
+; SINK-AFTER-NEXT:  entry:
+; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER:       vector.ph:
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER:       vector.body:
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP3]] = add <4 x i16> [[TMP1]], splat (i16 5)
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = sub <4 x i16> [[TMP4]], splat (i16 10)
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[OFFSET_IDX]]
+; SINK-AFTER-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0
+; SINK-AFTER-NEXT:    store <4 x i16> [[TMP5]], ptr [[TMP7]], align 2
+; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; SINK-AFTER-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; SINK-AFTER-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; SINK-AFTER:       middle.block:
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; SINK-AFTER-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER:       scalar.ph:
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 13, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT2]], [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY]] ]
+; SINK-AFTER-NEXT:    br label [[FOR_COND:%.*]]
+; SINK-AFTER:       for.cond:
+; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[REC_2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], [[SCALAR_PH]] ], [ [[REC_2_PREV:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REC_2]], 15
+; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; SINK-AFTER-NEXT:    [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32
+; SINK-AFTER-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; SINK-AFTER-NEXT:    [[GEP:%.*]] = getelementptr i16, ptr [[A]], i16 [[IV]]
+; SINK-AFTER-NEXT:    store i16 [[USE_REC_1]], ptr [[GEP]], align 2
+; SINK-AFTER-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]]
+; SINK-AFTER:       for.end:
+; SINK-AFTER-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
+  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
+  %rec.2 = phi i32 [ -27, %entry ], [ %rec.2.prev, %for.cond ]
+  %use.rec.1 = sub i16 %rec.1, 10
+  %cmp = icmp eq i32 %rec.2, 15
+  %iv.next = add i16 %iv, 1
+  %rec.2.prev = zext i16 %iv.next to i32
+  %rec.1.prev = add i16 %iv.next, 5
+  %gep = getelementptr i16, ptr %a, i16 %iv
+  store i16 %use.rec.1, ptr %gep
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.end:
+  ret void
+}
+
+define i32 @sink_into_replication_region(i32 %y) {
 ; UNROLL-NO-IC-LABEL: @sink_into_replication_region(
 ; UNROLL-NO-IC-NEXT:  bb:
 ; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
@@ -2564,74 +2714,18 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-IC:       pred.udiv.continue18:
 ; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP37]], [[PRED_UDIV_IF17]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]]
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP39]], label [[PRED_UDIV_IF19:%.*]], label [[PRED_UDIV_CONTINUE20]]
 ; UNROLL-NO-IC:       pred.udiv.if19:
 ; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
 ; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
 ; UNROLL-NO-IC-NEXT:    [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE21]]
+; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
 ; UNROLL-NO-IC:       pred.udiv.continue20:
 ; UNROLL-NO-IC-NEXT:    [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE18]] ], [ [[TMP42]], [[PRED_UDIV_IF19]] ]
 ; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; UNROLL-NO-IC-NEXT:    [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
 ; UNROLL-NO-IC-NEXT:    [[TMP47]] = add <4 x i32> [[VEC_PHI1]], [[TMP45]]
-; UNROLL-NO-IC-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP64]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; UNROLL-NO-IC:       pred.store.if:
-; UNROLL-NO-IC-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[TMP44]], i32 0
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP65]], ptr [[DST:%.*]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; UNROLL-NO-IC:       pred.store.continue:
-; UNROLL-NO-IC-NEXT:    [[TMP66:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP66]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
-; UNROLL-NO-IC:       pred.store.if21:
-; UNROLL-NO-IC-NEXT:    [[TMP67:%.*]] = extractelement <4 x i32> [[TMP44]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP67]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE22]]
-; UNROLL-NO-IC:       pred.store.continue22:
-; UNROLL-NO-IC-NEXT:    [[TMP52:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP52]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
-; UNROLL-NO-IC:       pred.store.if23:
-; UNROLL-NO-IC-NEXT:    [[TMP53:%.*]] = extractelement <4 x i32> [[TMP44]], i32 2
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP53]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE24]]
-; UNROLL-NO-IC:       pred.store.continue24:
-; UNROLL-NO-IC-NEXT:    [[TMP54:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP54]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
-; UNROLL-NO-IC:       pred.store.if25:
-; UNROLL-NO-IC-NEXT:    [[TMP55:%.*]] = extractelement <4 x i32> [[TMP44]], i32 3
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP55]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE26]]
-; UNROLL-NO-IC:       pred.store.continue26:
-; UNROLL-NO-IC-NEXT:    [[TMP56:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP56]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
-; UNROLL-NO-IC:       pred.store.if27:
-; UNROLL-NO-IC-NEXT:    [[TMP57:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP57]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE28]]
-; UNROLL-NO-IC:       pred.store.continue28:
-; UNROLL-NO-IC-NEXT:    [[TMP58:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP58]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
-; UNROLL-NO-IC:       pred.store.if29:
-; UNROLL-NO-IC-NEXT:    [[TMP59:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP59]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE30]]
-; UNROLL-NO-IC:       pred.store.continue30:
-; UNROLL-NO-IC-NEXT:    [[TMP60:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP60]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
-; UNROLL-NO-IC:       pred.store.if31:
-; UNROLL-NO-IC-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP61]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_STORE_CONTINUE32]]
-; UNROLL-NO-IC:       pred.store.continue32:
-; UNROLL-NO-IC-NEXT:    [[TMP62:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT:    br i1 [[TMP62]], label [[PRED_STORE_IF33:%.*]], label [[PRED_UDIV_CONTINUE20]]
-; UNROLL-NO-IC:       pred.store.if33:
-; UNROLL-NO-IC-NEXT:    [[TMP63:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
-; UNROLL-NO-IC-NEXT:    store i32 [[TMP63]], ptr [[DST]], align 4
-; UNROLL-NO-IC-NEXT:    br label [[PRED_UDIV_CONTINUE20]]
-; UNROLL-NO-IC:       pred.store.continue34:
 ; UNROLL-NO-IC-NEXT:    [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
 ; UNROLL-NO-IC-NEXT:    [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]]
 ; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
@@ -2656,7 +2750,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-IC-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; UNROLL-NO-IC-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-IC-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
-; UNROLL-NO-IC-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; UNROLL-NO-IC-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; UNROLL-NO-IC-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2688,25 +2781,15 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; UNROLL-NO-VF:       pred.udiv.continue:
 ; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]]
 ; UNROLL-NO-VF:       pred.udiv.if3:
 ; UNROLL-NO-VF-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -1
 ; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = udiv i32 219220132, [[TMP7]]
-; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE5]]
+; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
 ; UNROLL-NO-VF:       pred.udiv.continue4:
 ; UNROLL-NO-VF-NEXT:    [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ]
 ; UNROLL-NO-VF-NEXT:    [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]]
 ; UNROLL-NO-VF-NEXT:    [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP6]]
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; UNROLL-NO-VF:       pred.store.if:
-; UNROLL-NO-VF-NEXT:    store i32 [[VECTOR_RECUR]], ptr [[DST:%.*]], align 4
-; UNROLL-NO-VF-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; UNROLL-NO-VF:       pred.store.continue:
-; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF5:%.*]], label [[PRED_UDIV_CONTINUE4]]
-; UNROLL-NO-VF:       pred.store.if5:
-; UNROLL-NO-VF-NEXT:    store i32 [[TMP6]], ptr [[DST]], align 4
-; UNROLL-NO-VF-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
-; UNROLL-NO-VF:       pred.store.continue6:
 ; UNROLL-NO-VF-NEXT:    [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]]
 ; UNROLL-NO-VF-NEXT:    [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]]
 ; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
@@ -2730,7 +2813,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; UNROLL-NO-VF-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; UNROLL-NO-VF-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; UNROLL-NO-VF-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
-; UNROLL-NO-VF-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; UNROLL-NO-VF-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; UNROLL-NO-VF-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2785,44 +2867,16 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; SINK-AFTER:       pred.udiv.continue6:
 ; SINK-AFTER-NEXT:    [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP16]], [[PRED_UDIV_IF5]] ]
 ; SINK-AFTER-NEXT:    [[TMP18:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
+; SINK-AFTER-NEXT:    br i1 [[TMP18]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8]]
 ; SINK-AFTER:       pred.udiv.if7:
 ; SINK-AFTER-NEXT:    [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], -3
 ; SINK-AFTER-NEXT:    [[TMP20:%.*]] = udiv i32 219220132, [[TMP19]]
 ; SINK-AFTER-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i32 3
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE9]]
+; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
 ; SINK-AFTER:       pred.udiv.continue8:
 ; SINK-AFTER-NEXT:    [[TMP22]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP21]], [[PRED_UDIV_IF7]] ]
 ; SINK-AFTER-NEXT:    [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
 ; SINK-AFTER-NEXT:    [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]]
-; SINK-AFTER-NEXT:    [[TMP33:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
-; SINK-AFTER-NEXT:    br i1 [[TMP33]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; SINK-AFTER:       pred.store.if:
-; SINK-AFTER-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
-; SINK-AFTER-NEXT:    store i32 [[TMP34]], ptr [[DST:%.*]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE]]
-; SINK-AFTER:       pred.store.continue:
-; SINK-AFTER-NEXT:    [[TMP35:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; SINK-AFTER-NEXT:    br i1 [[TMP35]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; SINK-AFTER:       pred.store.if9:
-; SINK-AFTER-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
-; SINK-AFTER-NEXT:    store i32 [[TMP28]], ptr [[DST]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE10]]
-; SINK-AFTER:       pred.store.continue10:
-; SINK-AFTER-NEXT:    [[TMP29:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; SINK-AFTER-NEXT:    br i1 [[TMP29]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
-; SINK-AFTER:       pred.store.if11:
-; SINK-AFTER-NEXT:    [[TMP30:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
-; SINK-AFTER-NEXT:    store i32 [[TMP30]], ptr [[DST]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_STORE_CONTINUE12]]
-; SINK-AFTER:       pred.store.continue12:
-; SINK-AFTER-NEXT:    [[TMP31:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; SINK-AFTER-NEXT:    br i1 [[TMP31]], label [[PRED_STORE_IF13:%.*]], label [[PRED_UDIV_CONTINUE8]]
-; SINK-AFTER:       pred.store.if13:
-; SINK-AFTER-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
-; SINK-AFTER-NEXT:    store i32 [[TMP32]], ptr [[DST]], align 4
-; SINK-AFTER-NEXT:    br label [[PRED_UDIV_CONTINUE8]]
-; SINK-AFTER:       pred.store.continue14:
 ; SINK-AFTER-NEXT:    [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
 ; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; SINK-AFTER-NEXT:    [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2845,7 +2899,6 @@ define i32 @sink_into_replication_region(i32 %y, ptr %dst) {
 ; SINK-AFTER-NEXT:    [[VAR6]] = add i32 [[VAR5]], [[VAR4]]
 ; SINK-AFTER-NEXT:    [[VAR7]] = udiv i32 219220132, [[VAR3]]
 ; SINK-AFTER-NEXT:    [[VAR8]] = add nsw i32 [[VAR3]], -1
-; SINK-AFTER-NEXT:    store i32 [[VAR4]], ptr [[DST]], align 4
 ; SINK-AFTER-NEXT:    [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2
 ; SINK-AFTER-NEXT:    br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
 ;
@@ -2863,7 +2916,6 @@ bb:
   %var6 = add i32 %var5, %var4
   %var7 = udiv i32 219220132, %var3
   %var8 = add nsw i32 %var3, -1
-  store i32 %var4, ptr %dst
   %var9 = icmp slt i32 %var3, 2
   br i1 %var9, label %bb1, label %bb2, !prof !2
 }
@@ -3293,6 +3345,287 @@ bb:
   br i1 %var9, label %bb1, label %bb2, !prof !2
 }
 
+; %vec.dead will be marked as dead instruction in the vector loop and no recipe
+; will be created for it. Make sure a valid sink target is used.
+define i32 @sink_after_dead_inst(ptr %A.ptr) {
+; UNROLL-NO-IC-LABEL: @sink_after_dead_inst(
+; UNROLL-NO-IC-NEXT:  entry:
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
+; UNROLL-NO-IC-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; UNROLL-NO-IC-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
+; UNROLL-NO-IC-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[TMP4]], i32 4
+; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; UNROLL-NO-IC-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP6]], align 4
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; UNROLL-NO-IC:       middle.block:
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC:       scalar.ph:
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    br label [[LOOP:%.*]]
+; UNROLL-NO-IC:       loop:
+; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; UNROLL-NO-IC-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
+; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
+; UNROLL-NO-IC-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
+; UNROLL-NO-IC-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
+; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-IC-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
+; UNROLL-NO-IC-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
+; UNROLL-NO-IC-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
+; UNROLL-NO-IC-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
+; UNROLL-NO-IC-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
+; UNROLL-NO-IC-NEXT:    store i32 0, ptr [[A_GEP]], align 4
+; UNROLL-NO-IC-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
+; UNROLL-NO-IC:       for.end:
+; UNROLL-NO-IC-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+; UNROLL-NO-VF-LABEL: @sink_after_dead_inst(
+; UNROLL-NO-VF-NEXT:  entry:
+; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF:       vector.ph:
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF:       vector.body:
+; UNROLL-NO-VF-NEXT:    [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[VECTOR_RECUR]] to i16
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = add i16 [[TMP1]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP4:%.*]] = or i16 [[TMP2]], [[TMP2]]
+; UNROLL-NO-VF-NEXT:    [[TMP5:%.*]] = or i16 [[TMP3]], [[TMP3]]
+; UNROLL-NO-VF-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP10:%.*]] = zext i16 [[TMP5]] to i32
+; UNROLL-NO-VF-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
+; UNROLL-NO-VF-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]]
+; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP8]], align 4
+; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[TMP9]], align 4
+; UNROLL-NO-VF-NEXT:    [[TMP7]] = add nuw i32 [[VECTOR_RECUR]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP7]], 16
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; UNROLL-NO-VF:       middle.block:
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF:       scalar.ph:
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    br label [[LOOP:%.*]]
+; UNROLL-NO-VF:       loop:
+; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; UNROLL-NO-VF-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
+; UNROLL-NO-VF-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
+; UNROLL-NO-VF-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
+; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-VF-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
+; UNROLL-NO-VF-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
+; UNROLL-NO-VF-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
+; UNROLL-NO-VF-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
+; UNROLL-NO-VF-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
+; UNROLL-NO-VF-NEXT:    store i32 0, ptr [[A_GEP]], align 4
+; UNROLL-NO-VF-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
+; UNROLL-NO-VF:       for.end:
+; UNROLL-NO-VF-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+; SINK-AFTER-LABEL: @sink_after_dead_inst(
+; SINK-AFTER-NEXT:  entry:
+; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER:       vector.ph:
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER:       vector.body:
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
+; SINK-AFTER-NEXT:    [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; SINK-AFTER-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[OFFSET_IDX]]
+; SINK-AFTER-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
+; SINK-AFTER-NEXT:    store <4 x i32> zeroinitializer, ptr [[TMP5]], align 4
+; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; SINK-AFTER-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; SINK-AFTER-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; SINK-AFTER:       middle.block:
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SINK-AFTER-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER:       scalar.ph:
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT:    br label [[LOOP:%.*]]
+; SINK-AFTER:       loop:
+; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; SINK-AFTER-NEXT:    [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ]
+; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i32 [[FOR]], 15
+; SINK-AFTER-NEXT:    [[C:%.*]] = icmp eq i1 [[CMP]], true
+; SINK-AFTER-NEXT:    [[VEC_DEAD:%.*]] = and i1 [[C]], true
+; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; SINK-AFTER-NEXT:    [[B1:%.*]] = or i16 [[IV_NEXT]], [[IV_NEXT]]
+; SINK-AFTER-NEXT:    [[B3:%.*]] = and i1 [[CMP]], [[C]]
+; SINK-AFTER-NEXT:    [[FOR_PREV]] = zext i16 [[B1]] to i32
+; SINK-AFTER-NEXT:    [[EXT:%.*]] = zext i1 [[B3]] to i32
+; SINK-AFTER-NEXT:    [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]]
+; SINK-AFTER-NEXT:    store i32 0, ptr [[A_GEP]], align 4
+; SINK-AFTER-NEXT:    br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]]
+; SINK-AFTER:       for.end:
+; SINK-AFTER-NEXT:    [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT:    ret i32 [[FOR_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i16 [ 0, %entry ], [ %iv.next, %loop ]
+  %for = phi i32 [ 0, %entry ], [ %for.prev, %loop ]
+  %cmp = icmp eq i32 %for, 15
+  %C = icmp eq i1 %cmp, true
+  %vec.dead = and i1 %C, 1
+  %iv.next = add i16 %iv, 1
+  %B1 = or i16 %iv.next, %iv.next
+  %B3 = and i1 %cmp, %C
+  %for.prev = zext i16 %B1 to i32
+
+  %ext = zext i1 %B3 to i32
+  %A.gep = getelementptr i32, ptr %A.ptr, i16 %iv
+  store i32 0, ptr %A.gep
+  br i1 %vec.dead, label %for.end, label %loop
+
+for.end:
+  ret i32 %for
+}
+
+; %rec.1 only has %use.rec.1 as use, which can be removed. This enables %rec.1
+; to be removed also.
+define void @unused_recurrence(ptr %a) {
+; UNROLL-NO-IC-LABEL: @unused_recurrence(
+; UNROLL-NO-IC-NEXT:  entry:
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT:    [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[STEP_ADD]], splat (i16 1)
+; UNROLL-NO-IC-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; UNROLL-NO-IC-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4)
+; UNROLL-NO-IC-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; UNROLL-NO-IC-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-IC:       middle.block:
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; UNROLL-NO-IC-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-IC:       scalar.ph:
+; UNROLL-NO-IC-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 997, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-IC-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-IC-NEXT:    br label [[FOR_COND:%.*]]
+; UNROLL-NO-IC:       for.cond:
+; UNROLL-NO-IC-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-IC-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; UNROLL-NO-IC-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-IC-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; UNROLL-NO-IC-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; UNROLL-NO-IC-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NO-IC:       for.end:
+; UNROLL-NO-IC-NEXT:    ret void
+;
+; UNROLL-NO-VF-LABEL: @unused_recurrence(
+; UNROLL-NO-VF-NEXT:  entry:
+; UNROLL-NO-VF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL-NO-VF:       vector.ph:
+; UNROLL-NO-VF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; UNROLL-NO-VF:       vector.body:
+; UNROLL-NO-VF-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT:    [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i16
+; UNROLL-NO-VF-NEXT:    [[OFFSET_IDX:%.*]] = add i16 -27, [[DOTCAST]]
+; UNROLL-NO-VF-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP1:%.*]] = add i16 [[TMP0]], 1
+; UNROLL-NO-VF-NEXT:    [[TMP2:%.*]] = add i16 [[TMP1]], 5
+; UNROLL-NO-VF-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; UNROLL-NO-VF-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
+; UNROLL-NO-VF-NEXT:    br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; UNROLL-NO-VF:       middle.block:
+; UNROLL-NO-VF-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; UNROLL-NO-VF:       scalar.ph:
+; UNROLL-NO-VF-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; UNROLL-NO-VF-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[TMP2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; UNROLL-NO-VF-NEXT:    br label [[FOR_COND:%.*]]
+; UNROLL-NO-VF:       for.cond:
+; UNROLL-NO-VF-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; UNROLL-NO-VF-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; UNROLL-NO-VF-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; UNROLL-NO-VF-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; UNROLL-NO-VF-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; UNROLL-NO-VF-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
+; UNROLL-NO-VF:       for.end:
+; UNROLL-NO-VF-NEXT:    ret void
+;
+; SINK-AFTER-LABEL: @unused_recurrence(
+; SINK-AFTER-NEXT:  entry:
+; SINK-AFTER-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SINK-AFTER:       vector.ph:
+; SINK-AFTER-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SINK-AFTER:       vector.body:
+; SINK-AFTER-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 -27, i16 -26, i16 -25, i16 -24>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT:    [[TMP0:%.*]] = add <4 x i16> [[VEC_IND]], splat (i16 1)
+; SINK-AFTER-NEXT:    [[TMP1:%.*]] = add <4 x i16> [[TMP0]], splat (i16 5)
+; SINK-AFTER-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; SINK-AFTER-NEXT:    [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4)
+; SINK-AFTER-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028
+; SINK-AFTER-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
+; SINK-AFTER:       middle.block:
+; SINK-AFTER-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+; SINK-AFTER-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SINK-AFTER:       scalar.ph:
+; SINK-AFTER-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i16 [ 1001, [[MIDDLE_BLOCK]] ], [ -27, [[ENTRY:%.*]] ]
+; SINK-AFTER-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; SINK-AFTER-NEXT:    br label [[FOR_COND:%.*]]
+; SINK-AFTER:       for.cond:
+; SINK-AFTER-NEXT:    [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[REC_1:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_1_PREV:%.*]], [[FOR_COND]] ]
+; SINK-AFTER-NEXT:    [[USE_REC_1:%.*]] = sub i16 [[REC_1]], 10
+; SINK-AFTER-NEXT:    [[IV_NEXT]] = add i16 [[IV]], 1
+; SINK-AFTER-NEXT:    [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5
+; SINK-AFTER-NEXT:    [[CMP:%.*]] = icmp eq i16 [[IV]], 1000
+; SINK-AFTER-NEXT:    br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]]
+; SINK-AFTER:       for.end:
+; SINK-AFTER-NEXT:    ret void
+;
+entry:
+  br label %for.cond
+
+for.cond:
+  %iv = phi i16 [ -27, %entry ], [ %iv.next, %for.cond ]
+  %rec.1 = phi i16 [ 0, %entry ], [ %rec.1.prev, %for.cond ]
+  %use.rec.1 = sub i16 %rec.1, 10
+  %iv.next= add i16 %iv, 1
+  %rec.1.prev = add i16 %iv.next, 5
+  %cmp = icmp eq i16 %iv, 1000
+  br i1 %cmp, label %for.end, label %for.cond
+
+for.end:
+  ret void
+}
+
 ; Test case for https://github.com/llvm/llvm-project/issues/95520.
 define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) {
 ; UNROLL-NO-IC-LABEL: @recurence_uniform_load(
diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
index a622193290c8f..e27734755dfb2 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll
@@ -59,8 +59,8 @@ define void @ptr_depends_on_sdiv(ptr noalias %dst, i16 noundef %off) {
 ; CHECK-NEXT:    store i64 [[TMP15]], ptr [[TMP17]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE4]]
 ; CHECK:       [[PRED_STORE_CONTINUE4]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -132,8 +132,8 @@ define void @ptr_depends_on_possibly_poison_value(ptr noalias %dst, i16 %off) {
 ; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -203,8 +203,8 @@ define void @ptr_doesnt_depend_on_poison_or_ub(ptr noalias %dst, i16 noundef %of
 ; CHECK-NEXT:    store i64 [[TMP13]], ptr [[TMP12]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -276,8 +276,8 @@ define void @ptr_depends_on_possibly_poison_value_from_load(ptr noalias %dst) {
 ; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP13]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
@@ -350,8 +350,8 @@ define void @ptr_depends_on_noundef_load(ptr noalias %dst) {
 ; CHECK-NEXT:    store i64 [[TMP14]], ptr [[TMP13]], align 1
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE2]]
 ; CHECK:       [[PRED_STORE_CONTINUE2]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, [[EXIT:label %.*]], label %[[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index d973e451d887d..a4f2b077cb066 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -282,3 +282,141 @@ for.cond:                                         ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.cond
   ret void
 }
+
+; Test that when WidenPointerInductionRecipes are ordered before other
+; WidenIntOrFpInductionRecipes that their PHIs are emitted in the right place.
+define void @outside_lattice(ptr noalias %p, ptr noalias %q, i32 %n) {
+; DEFAULT-LABEL: @outside_lattice(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = zext i32 [[N:%.*]] to i64
+; DEFAULT-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1)
+; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 4
+; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; DEFAULT:       vector.scevcheck:
+; DEFAULT-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; DEFAULT-NEXT:    [[TMP1:%.*]] = add i32 [[UMAX]], -1
+; DEFAULT-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0
+; DEFAULT-NEXT:    br i1 [[TMP2]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT:       vector.ph:
+; DEFAULT-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4
+; DEFAULT-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP3]]
+; DEFAULT-NEXT:    [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32
+; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
+; DEFAULT:       vector.body:
+; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; DEFAULT-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[OFFSET_IDX]]
+; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
+; DEFAULT-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
+; DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i32 [[OFFSET_IDX]]
+; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; DEFAULT-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; DEFAULT-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; DEFAULT-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; DEFAULT-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
+; DEFAULT-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; DEFAULT:       middle.block:
+; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; DEFAULT:       scalar.ph:
+; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ], [ [[P]], [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[IV_PTR:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[IV_INT:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[FOR_BODY]] ]
+; DEFAULT-NEXT:    [[P_GEP:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[IV_INT]]
+; DEFAULT-NEXT:    store ptr [[IV_PTR]], ptr [[P_GEP]], align 8
+; DEFAULT-NEXT:    [[Q_GEP:%.*]] = getelementptr inbounds i32, ptr [[Q]], i32 [[IV_INT]]
+; DEFAULT-NEXT:    store i32 [[IV_INT]], ptr [[Q_GEP]], align 4
+; DEFAULT-NEXT:    [[IV_INT_NEXT]] = add i32 [[IV_INT]], 1
+; DEFAULT-NEXT:    [[IV_PTR_NEXT]] = getelementptr inbounds i32, ptr [[IV_PTR]], i32 1
+; DEFAULT-NEXT:    [[DONE:%.*]] = icmp ult i32 [[IV_INT_NEXT]], [[N]]
+; DEFAULT-NEXT:    br i1 [[DONE]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]]
+; DEFAULT:       for.end:
+; DEFAULT-NEXT:    ret void
+;
+; STRIDED-LABEL: @outside_lattice(
+; STRIDED-NEXT:  entry:
+; STRIDED-NEXT:    [[TMP0:%.*]] = zext i32 [[N:%.*]] to i64
+; STRIDED-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1)
+; STRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 4
+; STRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; STRIDED:       vector.scevcheck:
+; STRIDED-NEXT:    [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; STRIDED-NEXT:    [[TMP1:%.*]] = add i32 [[UMAX]], -1
+; STRIDED-NEXT:    [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0
+; STRIDED-NEXT:    br i1 [[TMP2]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; STRIDED:       vector.ph:
+; STRIDED-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[UMAX1]], 4
+; STRIDED-NEXT:    [[N_VEC:%.*]] = sub i64 [[UMAX1]], [[N_MOD_VF]]
+; STRIDED-NEXT:    [[TMP3:%.*]] = mul i64 [[N_VEC]], 4
+; STRIDED-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP3]]
+; STRIDED-NEXT:    [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32
+; STRIDED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; STRIDED:       vector.body:
+; STRIDED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; STRIDED-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; STRIDED-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; STRIDED-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i64> <i64 0, i64 4, i64 8, i64 12>
+; STRIDED-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
+; STRIDED-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[OFFSET_IDX]]
+; STRIDED-NEXT:    [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
+; STRIDED-NEXT:    store <4 x ptr> [[VECTOR_GEP]], ptr [[TMP6]], align 8
+; STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i32 [[OFFSET_IDX]]
+; STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
+; STRIDED-NEXT:    store <4 x i32> [[VEC_IND]], ptr [[TMP8]], align 4
+; STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
+; STRIDED-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 16
+; STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; STRIDED-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; STRIDED:       middle.block:
+; STRIDED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX1]], [[N_VEC]]
+; STRIDED-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; STRIDED:       scalar.ph:
+; STRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ], [ [[P]], [[VECTOR_SCEVCHECK]] ]
+; STRIDED-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; STRIDED-NEXT:    br label [[FOR_BODY:%.*]]
+; STRIDED:       for.body:
+; STRIDED-NEXT:    [[IV_PTR:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[FOR_BODY]] ]
+; STRIDED-NEXT:    [[IV_INT:%.*]] = phi i32 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[FOR_BODY]] ]
+; STRIDED-NEXT:    [[P_GEP:%.*]] = getelementptr inbounds ptr, ptr [[P]], i32 [[IV_INT]]
+; STRIDED-NEXT:    store ptr [[IV_PTR]], ptr [[P_GEP]], align 8
+; STRIDED-NEXT:    [[Q_GEP:%.*]] = getelementptr inbounds i32, ptr [[Q]], i32 [[IV_INT]]
+; STRIDED-NEXT:    store i32 [[IV_INT]], ptr [[Q_GEP]], align 4
+; STRIDED-NEXT:    [[IV_INT_NEXT]] = add i32 [[IV_INT]], 1
+; STRIDED-NEXT:    [[IV_PTR_NEXT]] = getelementptr inbounds i32, ptr [[IV_PTR]], i32 1
+; STRIDED-NEXT:    [[DONE:%.*]] = icmp ult i32 [[IV_INT_NEXT]], [[N]]
+; STRIDED-NEXT:    br i1 [[DONE]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP9:![0-9]+]]
+; STRIDED:       for.end:
+; STRIDED-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %iv.ptr = phi ptr [ %p, %entry ], [ %iv.ptr.next, %for.body ]
+  %iv.int = phi i32 [ 0, %entry ], [ %iv.int.next, %for.body ]
+
+  %p.gep = getelementptr inbounds ptr, ptr %p, i32 %iv.int
+  store ptr %iv.ptr, ptr %p.gep
+
+  %q.gep = getelementptr inbounds i32, ptr %q, i32 %iv.int
+  store i32 %iv.int, ptr %q.gep
+
+  %iv.int.next = add i32 %iv.int, 1
+  %iv.ptr.next = getelementptr inbounds i32, ptr %iv.ptr, i32 1
+
+  %done = icmp ult i32 %iv.int.next, %n
+  br i1 %done, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
index d497f0c22dbbc..fbe3a7a470e86 100644
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -116,7 +116,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
+; CHECK:       pred.load.if2:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
@@ -125,12 +125,12 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
+; CHECK:       pred.load.continue3:
 ; CHECK-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
+; CHECK:       pred.load.if4:
 ; CHECK-NEXT:    [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
@@ -139,12 +139,12 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
+; CHECK:       pred.load.continue5:
 ; CHECK-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.if7:
+; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
@@ -153,7 +153,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
+; CHECK:       pred.load.continue7:
 ; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer
@@ -321,7 +321,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
+; CHECK:       pred.load.if2:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4
@@ -330,12 +330,12 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
+; CHECK:       pred.load.continue3:
 ; CHECK-NEXT:    [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
+; CHECK:       pred.load.if4:
 ; CHECK-NEXT:    [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
@@ -344,12 +344,12 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
+; CHECK:       pred.load.continue5:
 ; CHECK-NEXT:    [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.if7:
+; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
@@ -358,7 +358,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
 ; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
+; CHECK:       pred.load.continue7:
 ; CHECK-NEXT:    [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> splat (i32 1)
@@ -436,7 +436,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK:       pred.load.if3:
+; CHECK:       pred.load.if2:
 ; CHECK-NEXT:    [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
@@ -445,12 +445,12 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP18]], align 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP22]], i64 1
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE4]]
-; CHECK:       pred.load.continue4:
+; CHECK:       pred.load.continue3:
 ; CHECK-NEXT:    [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP24:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF3]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK:       pred.load.if5:
+; CHECK:       pred.load.if4:
 ; CHECK-NEXT:    [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -459,12 +459,12 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP28]], align 4
 ; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP32]], i64 2
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
-; CHECK:       pred.load.continue6:
+; CHECK:       pred.load.continue5:
 ; CHECK-NEXT:    [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP34:%.*]] = phi <4 x i32> [ [[TMP24]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP33]], [[PRED_LOAD_IF5]] ]
 ; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3
 ; CHECK-NEXT:    br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.if7:
+; CHECK:       pred.load.if6:
 ; CHECK-NEXT:    [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP31]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4
@@ -473,7 +473,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT:    [[TMP48:%.*]] = load i32, ptr [[TMP38]], align 4
 ; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP48]], i64 3
 ; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE8]]
-; CHECK:       pred.load.continue8:
+; CHECK:       pred.load.continue7:
 ; CHECK-NEXT:    [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP50:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP49]], [[PRED_LOAD_IF7]] ]
 ; CHECK-NEXT:    [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index a70d8f72c8a33..bb84dbf8ed236 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -658,9 +658,9 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF1-NEXT:    [[N_VEC:%.*]] = sub i32 96, [[N_MOD_VF]]
 ; CHECK-VF4UF1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-VF4UF1-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
-; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; CHECK-VF4UF1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF1-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-VF4UF1-NEXT:    [[TMP7:%.*]] = mul <vscale x 4 x i32> [[TMP6]], splat (i32 1)
 ; CHECK-VF4UF1-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP7]]
 ; CHECK-VF4UF1-NEXT:    [[TMP8:%.*]] = mul i32 1, [[TMP5]]
@@ -707,17 +707,17 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x)  {
 ; CHECK-VF4UF2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-VF4UF2-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 4
 ; CHECK-VF4UF2-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
-; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
 ; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4UF2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
 ; CHECK-VF4UF2-NEXT:    [[TMP8:%.*]] = mul <vscale x 4 x i32> [[TMP7]], splat (i32 1)
 ; CHECK-VF4UF2-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
-; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP5]], i64 0
-; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VF4UF2-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK-VF4UF2:       [[VECTOR_BODY]]:
 ; CHECK-VF4UF2-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-VF4UF2-NEXT:    [[STEP_ADD:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP5]], i64 0
+; CHECK-VF4UF2-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-VF4UF2-NEXT:    [[VEC_IND_NEXT:%.*]] = add <vscale x 4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT2]]
 ; CHECK-VF4UF2-NEXT:    [[TMP9:%.*]] = add <vscale x 4 x i32> [[VEC_IND_NEXT]], [[BROADCAST_SPLAT]]
 ; CHECK-VF4UF2-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index f136b0e2e0b31..10f96284c0184 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -23,12 +23,12 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP8]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
@@ -103,12 +103,12 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 1 x i64> @llvm.stepvector.nxv1i64()
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP4]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[DOTSPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 1 x i64> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 1 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 1 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 1 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
index 2a48e0a5e5310..15db687ba64ff 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
@@ -19,17 +19,17 @@ define i32 @iv_live_out_wide(ptr %dst) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
-; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[STEP_2]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
 ; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i32> [[TMP7]], splat (i32 1)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP8]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP5]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
index 5ff43dcf42bcf..ec1e8fa1e1b33 100644
--- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll
@@ -918,8 +918,8 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_SPLIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.split:
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
index 130db548ca8cb..fe533672f2ca2 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll
@@ -291,8 +291,8 @@ define void @redundant_branch_and_blends_without_mask(ptr %A) {
 ; CHECK-NEXT:    store i32 [[TMP34]], ptr [[TMP8]], align 4
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE12]]
 ; CHECK:       [[PRED_STORE_CONTINUE12]]:
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
 ; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 59277186195fc..7654bc9a141e0 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -65,3 +65,68 @@ loop.latch:
 exit:
   ret void
 }
+
+; Check that VPWidenIntOrFPInductionRecipe is expanded into smaller recipes in
+; the final VPlan.
+define void @iv_expand(ptr %p, i64 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'iv_expand'
+; CHECK:      VPlan 'Initial VPlan for VF={8},UF>=1' {
+; CHECK:      <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%3> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT:     ir<%iv> = WIDEN-INDUCTION  ir<0>, ir<1>, vp<%0>
+; CHECK-NEXT:     vp<%4> = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT:     CLONE ir<%q> = getelementptr ir<%p>, vp<%4>
+; CHECK-NEXT:     vp<%5> = vector-pointer ir<%q>
+; CHECK-NEXT:     WIDEN ir<%x> = load vp<%5>
+; CHECK-NEXT:     WIDEN ir<%y> = add ir<%x>, ir<%iv>
+; CHECK-NEXT:     vp<%6> = vector-pointer ir<%q>
+; CHECK-NEXT:     WIDEN store vp<%6>, ir<%y>
+; CHECK-NEXT:     EMIT vp<%index.next> = add nuw vp<%3>, vp<%1>
+; CHECK-NEXT:     EMIT branch-on-count vp<%index.next>, vp<%2>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK:      VPlan 'Final VPlan for VF={8},UF={1}'
+; CHECK:      ir-bb<vector.ph>:
+; CHECK-NEXT:     IR   %n.mod.vf = urem i64 %n, 8
+; CHECK-NEXT:     IR   %n.vec = sub i64 %n, %n.mod.vf
+; CHECK-NEXT:     EMIT vp<[[STEP_VECTOR:%.+]]> = step-vector
+; CHECK-NEXT:     EMIT vp<[[BROADCAST_0:%.+]]> = broadcast ir<0>
+; CHECK-NEXT:     EMIT vp<[[BROADCAST_1:%.+]]> = broadcast ir<1>
+; CHECK-NEXT:     EMIT vp<[[MUL:%.+]]> = mul vp<[[STEP_VECTOR]]>, vp<[[BROADCAST_1]]>
+; CHECK-NEXT:     EMIT vp<[[INDUCTION:%.+]]> = add vp<[[BROADCAST_0]]>, vp<[[MUL]]>
+; CHECK-NEXT:     EMIT vp<[[TRUNC:%.+]]> = trunc ir<8> to i64
+; CHECK-NEXT:     EMIT vp<[[INC:%.+]]> = mul ir<1>, vp<[[TRUNC]]>
+; CHECK-NEXT:     EMIT vp<[[BROADCAST_INC:%.+]]> = broadcast vp<[[INC]]>
+; CHECK-NEXT: Successor(s): vector.body
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT:   EMIT-SCALAR vp<[[SCALAR_PHI:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT:   WIDEN-PHI ir<%iv> = phi [ vp<[[INDUCTION]]>, ir-bb<vector.ph> ], [ vp<%vec.ind.next>, vector.body ]
+; CHECK-NEXT:   CLONE ir<%q> = getelementptr ir<%p>, vp<[[SCALAR_PHI]]>
+; CHECK-NEXT:   vp<[[VEC_PTR_1:%.+]]> = vector-pointer ir<%q>
+; CHECK-NEXT:   WIDEN ir<%x> = load vp<[[VEC_PTR_1]]>
+; CHECK-NEXT:   WIDEN ir<%y> = add ir<%x>, ir<%iv>
+; CHECK-NEXT:   vp<[[VEC_PTR_2:%.+]]> = vector-pointer ir<%q>
+; CHECK-NEXT:   WIDEN store vp<[[VEC_PTR_2]]>, ir<%y>
+; CHECK-NEXT:   EMIT vp<%index.next> = add nuw vp<[[SCALAR_PHI]]>, ir<8>
+; CHECK-NEXT:   EMIT vp<%vec.ind.next> = add ir<%iv>, vp<[[BROADCAST_INC]]>
+; CHECK-NEXT:   EMIT branch-on-count vp<%index.next>, ir<%n.vec>
+; CHECK-NEXT: Successor(s): middle.block, vector.body
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [0, %entry], [%iv.next, %loop]
+  %q = getelementptr i64, ptr %p, i64 %iv
+  %x = load i64, ptr %q
+  %y = add i64 %x, %iv
+  store i64 %y, ptr %q
+  %iv.next = add i64 %iv, 1
+  %done = icmp eq i64 %iv.next, %n
+  br i1 %done, label %exit, label %loop
+
+exit:
+  ret void
+}

From 9e0186d925f0c375a627866c59394f25c22eb3ff Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Tue, 17 Jun 2025 10:24:57 -0700
Subject: [PATCH 774/851] [HLSL][RootSignature] Implement `ResourceRange` as an
 `IntervalMap` (#140957)

A resource range consists of a closed interval, `[a;b]`, denoting which
shader registers it is bound to.

For instance:
 - `CBV(b1)`  corresponds to the resource range of `[1;1]`
 - `CBV(b0, numDescriptors = 3)` likewise to `[0;2]`

We want to provide an error diagnostic when there is an overlap in the
required registers (an overlap in the resource ranges).

The goal of this pr is to implement a structure to model a set of
resource ranges and provide an api to detect any overlap over a set of
resource ranges.

`ResourceRange` models this by implementing an `IntervalMap` to denote a
mapping from an interval of registers back to a resource range. It
allows for a new `ResourceRange` to be added to the mapping and it will
report if and what the first overlap is.

For the context of how this will be used in validation of a
`RootSignatureDecl` please see the proceeding pull request here:
https://github.com/llvm/llvm-project/pull/140962.

- Implements `ResourceRange` as an `IntervalMap`
- Adds unit testing of the various `insert` scenarios

Note: it was also considered to implement this as an `IntervalTree`,
this would allow reporting of a diagnostic for each overlap that is
encountered, as opposed to just the first. However, error generation of
just reporting the first error is already rather verbose, and adding the
additional diagnostics only made this worse.

Part 1 of https://github.com/llvm/llvm-project/issues/129942
---
 .../llvm/Frontend/HLSL/HLSLRootSignature.h    |   1 +
 .../Frontend/HLSL/HLSLRootSignatureUtils.h    |  57 ++++++
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  |  61 ++++++
 llvm/unittests/Frontend/CMakeLists.txt        |   1 +
 .../Frontend/HLSLRootSignatureRangesTest.cpp  | 177 ++++++++++++++++++
 5 files changed, 297 insertions(+)
 create mode 100644 llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp

diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
index 2f028817b45b6..9dfbd3cb68928 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h
@@ -16,6 +16,7 @@
 
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DXILABI.h"
+#include <limits>
 #include <variant>
 
 namespace llvm {
diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
index ca20e6719f3a4..4d2cd183ebcbc 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
@@ -15,6 +15,7 @@
 #define LLVM_FRONTEND_HLSL_HLSLROOTSIGNATUREUTILS_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/IntervalMap.h"
 #include "llvm/Frontend/HLSL/HLSLRootSignature.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
@@ -64,6 +65,62 @@ class MetadataBuilder {
   SmallVector<Metadata *> GeneratedMetadata;
 };
 
+// RangeInfo holds the information to correctly construct a ResourceRange
+// and retains this information to be used for displaying a better diagnostic
+struct RangeInfo {
+  const static uint32_t Unbounded = ~0u;
+
+  uint32_t LowerBound;
+  uint32_t UpperBound;
+};
+
+class ResourceRange {
+public:
+  using MapT = llvm::IntervalMap<uint32_t, const RangeInfo *, 16,
+                                 llvm::IntervalMapInfo<uint32_t>>;
+
+private:
+  MapT Intervals;
+
+public:
+  ResourceRange(MapT::Allocator &Allocator) : Intervals(MapT(Allocator)) {}
+
+  // Returns a reference to the first RangeInfo that overlaps with
+  // [Info.LowerBound;Info.UpperBound], or, std::nullopt if there is no overlap
+  std::optional<const RangeInfo *> getOverlapping(const RangeInfo &Info) const;
+
+  // Return the mapped RangeInfo at X or nullptr if no mapping exists
+  const RangeInfo *lookup(uint32_t X) const;
+
+  // Insert the required (sub-)intervals such that the interval of [a;b] =
+  // [Info.LowerBound, Info.UpperBound] is covered and points to a valid
+  // RangeInfo &.
+  //
+  // For instance consider the following chain of inserting RangeInfos with the
+  // intervals denoting the Lower/Upper-bounds:
+  //
+  // A = [0;2]
+  //   insert(A) -> false
+  //   intervals: [0;2] -> &A
+  // B = [5;7]
+  //   insert(B) -> false
+  //   intervals: [0;2] -> &A, [5;7] -> &B
+  // C = [4;7]
+  //   insert(C) -> true
+  //   intervals: [0;2] -> &A, [4;7] -> &C
+  // D = [1;5]
+  //   insert(D) -> true
+  //   intervals: [0;2] -> &A, [3;3] -> &D, [4;7] -> &C
+  // E = [0;unbounded]
+  //   insert(E) -> true
+  //   intervals: [0;unbounded] -> E
+  //
+  // Returns a reference to the first RangeInfo that overlaps with
+  // [Info.LowerBound;Info.UpperBound], or, std::nullopt if there is no overlap
+  // (equivalent to getOverlapping)
+  std::optional<const RangeInfo *> insert(const RangeInfo &Info);
+};
+
 } // namespace rootsig
 } // namespace hlsl
 } // namespace llvm
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index 5bae72a3986f8..1e198b639cfdc 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -355,6 +355,67 @@ MDNode *MetadataBuilder::BuildStaticSampler(const StaticSampler &Sampler) {
   return MDNode::get(Ctx, Operands);
 }
 
+std::optional<const RangeInfo *>
+ResourceRange::getOverlapping(const RangeInfo &Info) const {
+  MapT::const_iterator Interval = Intervals.find(Info.LowerBound);
+  if (!Interval.valid() || Info.UpperBound < Interval.start())
+    return std::nullopt;
+  return Interval.value();
+}
+
+const RangeInfo *ResourceRange::lookup(uint32_t X) const {
+  return Intervals.lookup(X, nullptr);
+}
+
+std::optional<const RangeInfo *> ResourceRange::insert(const RangeInfo &Info) {
+  uint32_t LowerBound = Info.LowerBound;
+  uint32_t UpperBound = Info.UpperBound;
+
+  std::optional<const RangeInfo *> Res = std::nullopt;
+  MapT::iterator Interval = Intervals.begin();
+
+  while (true) {
+    if (UpperBound < LowerBound)
+      break;
+
+    Interval.advanceTo(LowerBound);
+    if (!Interval.valid()) // No interval found
+      break;
+
+    // Let Interval = [x;y] and [LowerBound;UpperBound] = [a;b] and note that
+    // a <= y implicitly from Intervals.find(LowerBound)
+    if (UpperBound < Interval.start())
+      break; // found interval does not overlap with inserted one
+
+    if (!Res.has_value()) // Update to be the first found intersection
+      Res = Interval.value();
+
+    if (Interval.start() <= LowerBound && UpperBound <= Interval.stop()) {
+      // x <= a <= b <= y implies that [a;b] is covered by [x;y]
+      //  -> so we don't need to insert this, report an overlap
+      return Res;
+    } else if (LowerBound <= Interval.start() &&
+               Interval.stop() <= UpperBound) {
+      // a <= x <= y <= b implies that [x;y] is covered by [a;b]
+      //  -> so remove the existing interval that we will cover with the
+      //  overwrite
+      Interval.erase();
+    } else if (LowerBound < Interval.start() && UpperBound <= Interval.stop()) {
+      // a < x <= b <= y implies that [a; x] is not covered but [x;b] is
+      //  -> so set b = x - 1 such that [a;x-1] is now the interval to insert
+      UpperBound = Interval.start() - 1;
+    } else if (Interval.start() <= LowerBound && Interval.stop() < UpperBound) {
+      // a < x <= b <= y implies that [y; b] is not covered but [a;y] is
+      //  -> so set a = y + 1 such that [y+1;b] is now the interval to insert
+      LowerBound = Interval.stop() + 1;
+    }
+  }
+
+  assert(LowerBound <= UpperBound && "Attempting to insert an empty interval");
+  Intervals.insert(LowerBound, UpperBound, &Info);
+  return Res;
+}
+
 } // namespace rootsig
 } // namespace hlsl
 } // namespace llvm
diff --git a/llvm/unittests/Frontend/CMakeLists.txt b/llvm/unittests/Frontend/CMakeLists.txt
index 2119642769e3d..4048143b36819 100644
--- a/llvm/unittests/Frontend/CMakeLists.txt
+++ b/llvm/unittests/Frontend/CMakeLists.txt
@@ -12,6 +12,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(LLVMFrontendTests
   HLSLRootSignatureDumpTest.cpp
+  HLSLRootSignatureRangesTest.cpp
   OpenACCTest.cpp
   OpenMPContextTest.cpp
   OpenMPIRBuilderTest.cpp
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp
new file mode 100644
index 0000000000000..0ef6fe84f0ec9
--- /dev/null
+++ b/llvm/unittests/Frontend/HLSLRootSignatureRangesTest.cpp
@@ -0,0 +1,177 @@
+//===------ HLSLRootSignatureRangeTest.cpp - RootSignature Range tests ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Frontend/HLSL/HLSLRootSignatureUtils.h"
+#include "gtest/gtest.h"
+
+using namespace llvm::hlsl::rootsig;
+
+namespace {
+
+TEST(HLSLRootSignatureTest, NoOverlappingInsertTests) {
+  // Ensures that there is never a reported overlap
+  ResourceRange::MapT::Allocator Allocator;
+  ResourceRange Range(Allocator);
+
+  RangeInfo A;
+  A.LowerBound = 0;
+  A.UpperBound = 3;
+  EXPECT_EQ(Range.insert(A), std::nullopt);
+
+  RangeInfo B;
+  B.LowerBound = 4;
+  B.UpperBound = 7;
+  EXPECT_EQ(Range.insert(B), std::nullopt);
+
+  RangeInfo C;
+  C.LowerBound = 10;
+  C.UpperBound = RangeInfo::Unbounded;
+  EXPECT_EQ(Range.insert(C), std::nullopt);
+
+  // A = [0;3]
+  EXPECT_EQ(Range.lookup(0), &A);
+  EXPECT_EQ(Range.lookup(2), &A);
+  EXPECT_EQ(Range.lookup(3), &A);
+
+  // B = [4;7]
+  EXPECT_EQ(Range.lookup(4), &B);
+  EXPECT_EQ(Range.lookup(5), &B);
+  EXPECT_EQ(Range.lookup(7), &B);
+
+  EXPECT_EQ(Range.lookup(8), nullptr);
+  EXPECT_EQ(Range.lookup(9), nullptr);
+
+  // C = [10;unbounded]
+  EXPECT_EQ(Range.lookup(10), &C);
+  EXPECT_EQ(Range.lookup(42), &C);
+  EXPECT_EQ(Range.lookup(98237423), &C);
+  EXPECT_EQ(Range.lookup(RangeInfo::Unbounded), &C);
+}
+
+TEST(HLSLRootSignatureTest, SingleOverlappingInsertTests) {
+  // Ensures that we correctly report an overlap when we insert a range that
+  // overlaps with one other range but does not cover (replace) it
+  ResourceRange::MapT::Allocator Allocator;
+  ResourceRange Range(Allocator);
+
+  RangeInfo A;
+  A.LowerBound = 1;
+  A.UpperBound = 5;
+  EXPECT_EQ(Range.insert(A), std::nullopt);
+
+  RangeInfo B;
+  B.LowerBound = 0;
+  B.UpperBound = 2;
+  EXPECT_EQ(Range.insert(B).value(), &A);
+
+  RangeInfo C;
+  C.LowerBound = 4;
+  C.UpperBound = RangeInfo::Unbounded;
+  EXPECT_EQ(Range.insert(C).value(), &A);
+
+  // A = [1;5]
+  EXPECT_EQ(Range.lookup(1), &A);
+  EXPECT_EQ(Range.lookup(2), &A);
+  EXPECT_EQ(Range.lookup(3), &A);
+  EXPECT_EQ(Range.lookup(4), &A);
+  EXPECT_EQ(Range.lookup(5), &A);
+
+  // B = [0;0]
+  EXPECT_EQ(Range.lookup(0), &B);
+
+  // C = [6; unbounded]
+  EXPECT_EQ(Range.lookup(6), &C);
+  EXPECT_EQ(Range.lookup(RangeInfo::Unbounded), &C);
+}
+
+TEST(HLSLRootSignatureTest, MultipleOverlappingInsertTests) {
+  // Ensures that we correctly report an overlap when inserted range
+  // overlaps more than one range and it does not cover (replace) either
+  // range. In this case it will just fill in the interval between the two
+  ResourceRange::MapT::Allocator Allocator;
+  ResourceRange Range(Allocator);
+
+  RangeInfo A;
+  A.LowerBound = 0;
+  A.UpperBound = 2;
+  EXPECT_EQ(Range.insert(A), std::nullopt);
+
+  RangeInfo B;
+  B.LowerBound = 4;
+  B.UpperBound = 6;
+  EXPECT_EQ(Range.insert(B), std::nullopt);
+
+  RangeInfo C;
+  C.LowerBound = 1;
+  C.UpperBound = 5;
+  EXPECT_EQ(Range.insert(C).value(), &A);
+
+  // A = [0;2]
+  EXPECT_EQ(Range.lookup(0), &A);
+  EXPECT_EQ(Range.lookup(1), &A);
+  EXPECT_EQ(Range.lookup(2), &A);
+
+  // B = [4;6]
+  EXPECT_EQ(Range.lookup(4), &B);
+  EXPECT_EQ(Range.lookup(5), &B);
+  EXPECT_EQ(Range.lookup(6), &B);
+
+  // C = [3;3]
+  EXPECT_EQ(Range.lookup(3), &C);
+}
+
+TEST(HLSLRootSignatureTest, CoverInsertTests) {
+  // Ensures that we correctly report an overlap when inserted range
+  // covers one or more ranges
+  ResourceRange::MapT::Allocator Allocator;
+  ResourceRange Range(Allocator);
+
+  RangeInfo A;
+  A.LowerBound = 0;
+  A.UpperBound = 2;
+  EXPECT_EQ(Range.insert(A), std::nullopt);
+
+  RangeInfo B;
+  B.LowerBound = 4;
+  B.UpperBound = 5;
+  EXPECT_EQ(Range.insert(B), std::nullopt);
+
+  // Covers B
+  RangeInfo C;
+  C.LowerBound = 4;
+  C.UpperBound = 6;
+  EXPECT_EQ(Range.insert(C).value(), &B);
+
+  // A = [0;2]
+  // C = [4;6] <- covers reference to B
+  EXPECT_EQ(Range.lookup(0), &A);
+  EXPECT_EQ(Range.lookup(1), &A);
+  EXPECT_EQ(Range.lookup(2), &A);
+  EXPECT_EQ(Range.lookup(3), nullptr);
+  EXPECT_EQ(Range.lookup(4), &C);
+  EXPECT_EQ(Range.lookup(5), &C);
+  EXPECT_EQ(Range.lookup(6), &C);
+
+  // Covers all other ranges
+  RangeInfo D;
+  D.LowerBound = 0;
+  D.UpperBound = 7;
+  EXPECT_EQ(Range.insert(D).value(), &A);
+
+  // D = [0;7] <- Covers reference to A and C
+  EXPECT_EQ(Range.lookup(0), &D);
+  EXPECT_EQ(Range.lookup(1), &D);
+  EXPECT_EQ(Range.lookup(2), &D);
+  EXPECT_EQ(Range.lookup(3), &D);
+  EXPECT_EQ(Range.lookup(4), &D);
+  EXPECT_EQ(Range.lookup(5), &D);
+  EXPECT_EQ(Range.lookup(6), &D);
+  EXPECT_EQ(Range.lookup(7), &D);
+}
+
+} // namespace

From ed07b54b38c675235b4ce1bfd49e1fff372f6520 Mon Sep 17 00:00:00 2001
From: Morris Hafner <mmha@users.noreply.github.com>
Date: Tue, 17 Jun 2025 18:35:49 +0100
Subject: [PATCH 775/851] [CIR][NFCI] Represent Complex RValues As Single Value
 (#144519)

This patch removes one mlir::Value in the RValue class that has been
used to represent complex values in classic CG. In CIR we plan on
representing complex as a single value. It also removes some now
unnecessary member functions related to complex handling.
---
 clang/lib/CIR/CodeGen/CIRGenCall.cpp       |  2 +-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp       |  6 ++---
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 10 ++++----
 clang/lib/CIR/CodeGen/CIRGenStmt.cpp       |  3 +--
 clang/lib/CIR/CodeGen/CIRGenValue.h        | 27 +++++++---------------
 5 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index 0d9064425fa95..af0e6ca822b8f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -443,7 +443,7 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo,
       mlir::Value v;
       if (arg.isAggregate())
         cgm.errorNYI(loc, "emitCall: aggregate call argument");
-      v = arg.getKnownRValue().getScalarVal();
+      v = arg.getKnownRValue().getValue();
 
       // We might have to widen integers, but we should never truncate.
       if (argType != v.getType() && mlir::isa<cir::IntType>(v.getType()))
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 2e43f10be132c..4f2046ad26d72 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -219,7 +219,7 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst,
       const mlir::Value vector =
           builder.createLoad(loc, dst.getVectorAddress());
       const mlir::Value newVector = builder.create<cir::VecInsertOp>(
-          loc, vector, src.getScalarVal(), dst.getVectorIdx());
+          loc, vector, src.getValue(), dst.getVectorIdx());
       builder.createStore(loc, newVector, dst.getVectorAddress());
       return;
     }
@@ -232,7 +232,7 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst,
   assert(!cir::MissingFeatures::opLoadStoreObjC());
 
   assert(src.isScalar() && "Can't emit an aggregate store with this method");
-  emitStoreOfScalar(src.getScalarVal(), dst, isInit);
+  emitStoreOfScalar(src.getValue(), dst, isInit);
 }
 
 static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e,
@@ -949,7 +949,7 @@ LValue CIRGenFunction::emitCallExprLValue(const CallExpr *e) {
          "Can't have a scalar return unless the return type is a "
          "reference type!");
 
-  return makeNaturalAlignPointeeAddrLValue(rv.getScalarVal(), e->getType());
+  return makeNaturalAlignPointeeAddrLValue(rv.getValue(), e->getType());
 }
 
 LValue CIRGenFunction::emitBinaryOperatorLValue(const BinaryOperator *e) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 75b4d2a637e6e..8d0db5cd0a1e5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -131,11 +131,11 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
   mlir::Value emitLoadOfLValue(const Expr *e) {
     LValue lv = cgf.emitLValue(e);
     // FIXME: add some akin to EmitLValueAlignmentAssumption(E, V);
-    return cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal();
+    return cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue();
   }
 
   mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc) {
-    return cgf.emitLoadOfLValue(lv, loc).getScalarVal();
+    return cgf.emitLoadOfLValue(lv, loc).getValue();
   }
 
   // l-values
@@ -400,10 +400,10 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
       cgf.cgm.errorNYI(e->getSourceRange(), "Atomic inc/dec");
       // TODO(cir): This is not correct, but it will produce reasonable code
       // until atomic operations are implemented.
-      value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal();
+      value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue();
       input = value;
     } else {
-      value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getScalarVal();
+      value = cgf.emitLoadOfLValue(lv, e->getExprLoc()).getValue();
       input = value;
     }
 
@@ -1805,7 +1805,7 @@ mlir::Value ScalarExprEmitter::VisitCallExpr(const CallExpr *e) {
   if (e->getCallReturnType(cgf.getContext())->isReferenceType())
     return emitLoadOfLValue(e);
 
-  auto v = cgf.emitCallExpr(e).getScalarVal();
+  auto v = cgf.emitCallExpr(e).getValue();
   assert(!cir::MissingFeatures::emitLValueAlignmentAssumption());
   return v;
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index 019a44636ce3c..9193f6f1cd996 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -391,8 +391,7 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) {
     // If this function returns a reference, take the address of the
     // expression rather than the value.
     RValue result = emitReferenceBindingToExpr(rv);
-    builder.CIRBaseBuilderTy::createStore(loc, result.getScalarVal(),
-                                          *fnRetAlloca);
+    builder.CIRBaseBuilderTy::createStore(loc, result.getValue(), *fnRetAlloca);
   } else {
     mlir::Value value = nullptr;
     switch (CIRGenFunction::getEvaluationKind(rv->getType())) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index c1e08ba1e9b67..84972fc7f9118 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -33,11 +33,7 @@ class RValue {
   enum Flavor { Scalar, Complex, Aggregate };
 
   union {
-    // Stores first and second value.
-    struct {
-      mlir::Value first;
-      mlir::Value second;
-    } vals;
+    mlir::Value value;
 
     // Stores aggregate address.
     Address aggregateAddr;
@@ -47,7 +43,7 @@ class RValue {
   unsigned flavor : 2;
 
 public:
-  RValue() : vals{nullptr, nullptr}, flavor(Scalar) {}
+  RValue() : value(nullptr), flavor(Scalar) {}
 
   bool isScalar() const { return flavor == Scalar; }
   bool isComplex() const { return flavor == Complex; }
@@ -56,14 +52,9 @@ class RValue {
   bool isVolatileQualified() const { return isVolatile; }
 
   /// Return the value of this scalar value.
-  mlir::Value getScalarVal() const {
+  mlir::Value getValue() const {
     assert(isScalar() && "Not a scalar!");
-    return vals.first;
-  }
-
-  /// Return the real/imag components of this complex value.
-  std::pair<mlir::Value, mlir::Value> getComplexVal() const {
-    return std::make_pair(vals.first, vals.second);
+    return value;
   }
 
   /// Return the value of the address of the aggregate.
@@ -83,22 +74,20 @@ class RValue {
 
   static RValue get(mlir::Value v) {
     RValue er;
-    er.vals.first = v;
+    er.value = v;
     er.flavor = Scalar;
     er.isVolatile = false;
     return er;
   }
 
-  static RValue getComplex(mlir::Value v1, mlir::Value v2) {
+  static RValue getComplex(mlir::Value v) {
     RValue er;
-    er.vals = {v1, v2};
+    er.value = v;
     er.flavor = Complex;
     er.isVolatile = false;
     return er;
   }
-  static RValue getComplex(const std::pair<mlir::Value, mlir::Value> &c) {
-    return getComplex(c.first, c.second);
-  }
+
   // FIXME: Aggregate rvalues need to retain information about whether they are
   // volatile or not.  Remove default to find all places that probably get this
   // wrong.

From 3a06e9a710b7cfdbf1c002acc46fa76617e8baf8 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 17 Jun 2025 23:09:38 +0530
Subject: [PATCH 776/851] Conditionalise the addition of Aarch64 function Multi
 versioning support on aarch64 target (#143749)

Currently, `ENABLE_BAREMETAL_AARCH64_FMV` is added to builtin defines
for all baremetal targets though it is only needed for aarch64. This
patch fixes this by adding it only for aarch64 target.
---
 compiler-rt/lib/builtins/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 075c4647abf69..5e832315f3666 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -884,7 +884,11 @@ else ()
   if(COMPILER_RT_DISABLE_AARCH64_FMV)
     list(APPEND BUILTIN_DEFS DISABLE_AARCH64_FMV)
   elseif(COMPILER_RT_BAREMETAL_BUILD)
-    list(APPEND BUILTIN_DEFS ENABLE_BAREMETAL_AARCH64_FMV)
+    foreach (arch ${BUILTIN_SUPPORTED_ARCH})
+      if("${arch}" MATCHES "arm64|aarch64")
+        list(APPEND BUILTIN_DEFS ENABLE_BAREMETAL_AARCH64_FMV)
+      endif()
+    endforeach ()
   endif()
 
   append_list_if(COMPILER_RT_HAS_ASM_LSE HAS_ASM_LSE BUILTIN_DEFS)

From 7ea710fafa5782a274ded2ab6933c63c5c71f2ee Mon Sep 17 00:00:00 2001
From: Alexey Samsonov <vonosmas@gmail.com>
Date: Tue, 17 Jun 2025 10:44:21 -0700
Subject: [PATCH 777/851] Fix/reapply "[libc] Migrate stdio tests to
 ErrnoCheckingTest. (#144134)

This reverts commit 92a116c4ef822950f8c57eaa5164c844c73a1f7e with a fix
for fgets test - convert nullptr to fgets return type (char*), since the
matcher is pedantic.
---
 libc/test/src/stdio/CMakeLists.txt           | 13 ++++++++++++
 libc/test/src/stdio/fdopen_test.cpp          | 14 ++++++-------
 libc/test/src/stdio/fgetc_test.cpp           | 22 +++++++++++---------
 libc/test/src/stdio/fgetc_unlocked_test.cpp  | 22 +++++++++++---------
 libc/test/src/stdio/fgets_test.cpp           | 19 ++++++++++-------
 libc/test/src/stdio/fileop_test.cpp          | 20 +++++-------------
 libc/test/src/stdio/fopencookie_test.cpp     | 15 +++++++------
 libc/test/src/stdio/remove_test.cpp          | 10 ++++-----
 libc/test/src/stdio/rename_test.cpp          |  9 ++++----
 libc/test/src/stdio/setvbuf_test.cpp         |  9 ++++----
 libc/test/src/stdio/unlocked_fileop_test.cpp |  7 +++----
 libc/test/src/stdlib/StrtolTest.h            |  1 -
 libc/test/src/stdlib/strtold_test.cpp        |  1 -
 13 files changed, 85 insertions(+), 77 deletions(-)

diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index ce2171f19597b..4aa8b95880018 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -20,6 +20,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -68,6 +69,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fwrite
     libc.src.stdio.setvbuf
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -88,6 +90,7 @@ add_libc_test(
     libc.src.stdio.fread_unlocked
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
 )
 
 add_libc_test(
@@ -109,6 +112,7 @@ add_libc_test(
     libc.src.stdio.fread
     libc.src.stdio.fseek
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
   LINK_LIBRARIES
     LibcMemoryHelpers
 )
@@ -438,6 +442,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.sys.stat.mkdirat
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
   )
 
   add_libc_test(
@@ -452,6 +457,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.rename
       libc.src.unistd.access
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 
@@ -468,6 +474,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
       libc.src.stdio.fgets
       libc.src.stdio.fputs
       libc.src.unistd.close
+      libc.test.UnitTest.ErrnoCheckingTest
       libc.test.UnitTest.ErrnoSetterMatcher
   )
 endif()
@@ -488,6 +495,8 @@ add_libc_test(
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
     libc.src.stdio.getc
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -510,6 +519,8 @@ add_libc_test(
     libc.src.stdio.funlockfile
     libc.src.stdio.fwrite
     libc.src.stdio.getc_unlocked
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
@@ -527,6 +538,8 @@ add_libc_test(
     libc.src.stdio.fgets
     libc.src.stdio.fopen
     libc.src.stdio.fwrite
+    libc.test.UnitTest.ErrnoCheckingTest
+    libc.test.UnitTest.ErrnoSetterMatcher
 )
 
 add_libc_test(
diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp
index 104fc478b100e..b53184c30be36 100644
--- a/libc/test/src/stdio/fdopen_test.cpp
+++ b/libc/test/src/stdio/fdopen_test.cpp
@@ -9,20 +9,21 @@
 #include "src/stdio/fdopen.h"
 
 #include "hdr/fcntl_macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/fclose.h"
 #include "src/stdio/fgets.h"
 #include "src/stdio/fputs.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include <sys/stat.h> // For S_IRWXU
 
-TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
+using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
-  libc_errno = 0;
   constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU);
@@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) {
   ASSERT_ERRNO_SUCCESS();
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC);
@@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) {
   ASSERT_TRUE(nullptr == fp);
 }
 
-TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
-  libc_errno = 0;
+TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) {
   constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test";
   auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME);
   int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU);
@@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) {
   auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w");
   ASSERT_ERRNO_EQ(EINVAL);
   ASSERT_TRUE(nullptr == fp2);
-  libc_errno = 0;
   LIBC_NAMESPACE::close(fd);
   ASSERT_ERRNO_SUCCESS();
 }
diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp
index 56bde5f0099a8..be2e50271b510 100644
--- a/libc/test/src/stdio/fgetc_test.cpp
+++ b/libc/test/src/stdio/fgetc_test.cpp
@@ -14,12 +14,15 @@
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -27,29 +30,28 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+                Succeeds(WRITE_SIZE));
     // This is a write-only file so reads should fail.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Fails(EBADF, EOF));
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      int c = func(file);
-      ASSERT_EQ(c, int('1' + i));
+      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Succeeds(EOF));
     ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
   }
 };
 
diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp
index 90429ecf4e82b..bef9dafd3d87c 100644
--- a/libc/test/src/stdio/fgetc_unlocked_test.cpp
+++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp
@@ -17,12 +17,15 @@
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/getc_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
 public:
   using GetcFunc = int(FILE *);
   void test_with_func(GetcFunc *func, const char *filename) {
@@ -30,31 +33,30 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test {
     ASSERT_FALSE(file == nullptr);
     constexpr char CONTENT[] = "123456789";
     constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1;
-    ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+    ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+                Succeeds(WRITE_SIZE));
     // This is a write-only file so reads should fail.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Fails(EBADF, EOF));
     // This is an error and not a real EOF.
     ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
     ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-    libc_errno = 0;
 
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
     file = LIBC_NAMESPACE::fopen(filename, "r");
     ASSERT_FALSE(file == nullptr);
 
     LIBC_NAMESPACE::flockfile(file);
     for (size_t i = 0; i < WRITE_SIZE; ++i) {
-      int c = func(file);
-      ASSERT_EQ(c, int('1' + i));
+      ASSERT_THAT(func(file), Succeeds(int('1' + i)));
     }
     // Reading more should return EOF but not set error.
-    ASSERT_EQ(func(file), EOF);
+    ASSERT_THAT(func(file), Succeeds(EOF));
     ASSERT_NE(LIBC_NAMESPACE::feof_unlocked(file), 0);
     ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(file), 0);
 
     LIBC_NAMESPACE::funlockfile(file);
-    ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+    ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
   }
 };
 
diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp
index abed3d4052939..8fc38b0659181 100644
--- a/libc/test/src/stdio/fgets_test.cpp
+++ b/libc/test/src/stdio/fgets_test.cpp
@@ -12,11 +12,14 @@
 #include "src/stdio/fgets.h"
 #include "src/stdio/fopen.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
 
-TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
+TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   constexpr char FILENAME[] = "testdata/fgets.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -29,15 +32,16 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   char buff[8];
   char *output;
 
-  ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file));
+  ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file),
+              Succeeds(WRITE_SIZE));
   // This is a write-only file so reads should fail.
-  ASSERT_TRUE(LIBC_NAMESPACE::fgets(buff, 8, file) == nullptr);
+  ASSERT_THAT(LIBC_NAMESPACE::fgets(buff, 8, file),
+              Fails(EBADF, static_cast<char *>(nullptr)));
   // This is an error and not a real EOF.
   ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
-  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 
   file = LIBC_NAMESPACE::fopen(FILENAME, "r");
   ASSERT_FALSE(file == nullptr);
@@ -55,6 +59,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   // This is also implementation defined.
   output = LIBC_NAMESPACE::fgets(buff, 0, file);
   ASSERT_TRUE(output == nullptr);
+  ASSERT_ERRNO_SUCCESS();
 #endif
 
   const char *output_arr[] = {
@@ -86,5 +91,5 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) {
   ASSERT_NE(LIBC_NAMESPACE::feof(file), 0);
   ASSERT_ERRNO_SUCCESS();
 
-  ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
+  ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds());
 }
diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp
index e624181c795b8..e097785832d56 100644
--- a/libc/test/src/stdio/fileop_test.cpp
+++ b/libc/test/src/stdio/fileop_test.cpp
@@ -17,17 +17,18 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE;
 using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns;
 
-TEST(LlvmLibcFILETest, SimpleFileOperations) {
+TEST_F(LlvmLibcFILETest, SimpleFileOperations) {
   constexpr char FILENAME[] = "testdata/simple_operations.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
@@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
@@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
 
@@ -80,15 +79,12 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file),
               returns(EQ(EOF)).with_errno(NE(0)));
   ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(file);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file),
               returns(EQ(size_t(0))).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 
@@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
   ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0);
 
   // This is not a readable file.
-  libc_errno = 0;
   ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file),
               returns(EQ(0)).with_errno(NE(0)));
-  libc_errno = 0;
 
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file));
 
@@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) {
 
   // Check that the other functions correctly set libc_errno.
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0);
   // ASSERT_ERRNO_FAILURE();
 
-  // libc_errno = 0;
   // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"),
   //           static_cast<FILE *>(nullptr));
   // ASSERT_ERRNO_FAILURE();
 }
 
-TEST(LlvmLibcFILETest, FFlush) {
+TEST_F(LlvmLibcFILETest, FFlush) {
   constexpr char FILENAME[] = "testdata/fflush.test";
   ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+");
   ASSERT_FALSE(file == nullptr);
@@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) {
   ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0);
 }
 
-TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
+TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   using MyStruct = struct {
     char c;
     unsigned long long i;
@@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) {
   constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct);
   constexpr char FILENAME[] = "testdata/fread_fwrite.test";
 
-  libc_errno = 0;
   FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w");
   ASSERT_FALSE(file == nullptr);
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file));
diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp
index 03e1ac286b646..bcf5e674141a7 100644
--- a/libc/test/src/stdio/fopencookie_test.cpp
+++ b/libc/test/src/stdio/fopencookie_test.cpp
@@ -15,6 +15,7 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fseek.h"
 #include "src/stdio/fwrite.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
@@ -22,6 +23,7 @@
 #include "hdr/types/size_t.h"
 #include "src/__support/libc_errno.h"
 
+using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 using MemoryView = LIBC_NAMESPACE::testing::MemoryView;
 
 struct StringStream {
@@ -88,7 +90,7 @@ int close_ss(void *cookie) {
 constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss,
                                                        &seek_ss, &close_ss};
 
-TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) {
   constexpr char CONTENT[] = "Hello,readonly!";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(CONTENT)));
@@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) {
   size_t INIT_BUFSIZE = 32;
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(INIT_BUFSIZE));
@@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
             LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_EQ(EBADF);
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) {
   constexpr char INITIAL_CONTENT[] = "1234567890987654321";
   constexpr char WRITE_DATA[] = "append";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
@@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0));
   ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0);
@@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) {
   const char INITIAL_CONTENT[] = "1234567890987654321";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(INITIAL_CONTENT)));
@@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) {
   free(ss);
 }
 
-TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) {
+TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) {
   constexpr char WRITE_DATA[] = "hello, file";
   auto *ss = reinterpret_cast<StringStream *>(malloc(sizeof(StringStream)));
   ss->buf = reinterpret_cast<char *>(malloc(sizeof(WRITE_DATA)));
diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp
index 84984e26398c0..296bff1f5dc15 100644
--- a/libc/test/src/stdio/remove_test.cpp
+++ b/libc/test/src/stdio/remove_test.cpp
@@ -11,16 +11,17 @@
 #include "src/sys/stat/mkdirat.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
 #include <unistd.h>
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
+using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   // The test strategy is to create a file and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) {
+TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) {
   // The test strategy is to create a dir and remove it, and also verify that
   // it was removed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
   constexpr const char *FILENAME = "remove.test.dir";
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
index ac494a4ecaf8e..135fb98c07fbb 100644
--- a/libc/test/src/stdio/rename_test.cpp
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -8,18 +8,19 @@
 
 #include "include/llvm-libc-macros/linux/sys-stat-macros.h"
 #include "include/llvm-libc-macros/linux/unistd-macros.h"
-#include "src/__support/libc_errno.h"
 #include "src/fcntl/open.h"
 #include "src/stdio/rename.h"
 #include "src/unistd/access.h"
 #include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/ErrnoSetterMatcher.h"
 #include "test/UnitTest/Test.h"
 
-TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
+using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) {
   // The test strategy is to create a file and rename it, and also verify that
   // it was renamed.
-  libc_errno = 0;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
 
@@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
   ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
 }
 
-TEST(LlvmLibcRenameTest, RenameNonExistent) {
+TEST_F(LlvmLibcRenameTest, RenameNonExistent) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
 
   constexpr const char *FILENAME1 = "rename.test.file1";
diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp
index 5872943c1bb41..a0936ba79ef73 100644
--- a/libc/test/src/stdio/setvbuf_test.cpp
+++ b/libc/test/src/stdio/setvbuf_test.cpp
@@ -11,12 +11,14 @@
 #include "src/stdio/fread.h"
 #include "src/stdio/fwrite.h"
 #include "src/stdio/setvbuf.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
 #include "hdr/stdio_macros.h"
-#include "src/__support/libc_errno.h"
 
-TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
+using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
+
+TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a NBF buffer to the write handle. Since it is NBF, the data
   // written using the write handle should be immediately readable by the read
@@ -52,7 +54,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) {
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr));
 }
 
-TEST(LlvmLibcSetvbufTest, SetLBFBuffer) {
+TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) {
   // The idea in this test is that we open a file for writing and reading, and
   // then set a LBF buffer to the write handle. Since it is LBF, the data
   // written using the write handle should be available right after a '\n' is
@@ -102,6 +104,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) {
             0);
   ASSERT_ERRNO_EQ(EINVAL);
 
-  libc_errno = 0;
   ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f));
 }
diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp
index 5d482b70064bd..e99b382d12112 100644
--- a/libc/test/src/stdio/unlocked_fileop_test.cpp
+++ b/libc/test/src/stdio/unlocked_fileop_test.cpp
@@ -15,11 +15,12 @@
 #include "src/stdio/fread_unlocked.h"
 #include "src/stdio/funlockfile.h"
 #include "src/stdio/fwrite_unlocked.h"
+#include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
 
-#include "src/__support/libc_errno.h"
+using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest;
 
-TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
+TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) {
   constexpr char fNAME[] = "testdata/unlocked_read_and_write.test";
   ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w");
   ASSERT_FALSE(f == nullptr);
@@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
@@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) {
             LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f));
   ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0);
   ASSERT_ERRNO_FAILURE();
-  libc_errno = 0;
 
   LIBC_NAMESPACE::clearerr_unlocked(f);
   ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0);
diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h
index 3eeccc5727e77..03f0a6539c785 100644
--- a/libc/test/src/stdlib/StrtolTest.h
+++ b/libc/test/src/stdlib/StrtolTest.h
@@ -9,7 +9,6 @@
 #include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/ctype_utils.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "test/UnitTest/ErrnoCheckingTest.h"
 #include "test/UnitTest/Test.h"
diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp
index c2f2b9c9a11c3..eb4056dc7ba64 100644
--- a/libc/test/src/stdlib/strtold_test.cpp
+++ b/libc/test/src/stdlib/strtold_test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/libc_errno.h"
 #include "src/__support/uint128.h"
 #include "src/stdlib/strtold.h"
 

From 4943e746909ddbf8845e7fa397a97b918bf777df Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Tue, 17 Jun 2025 10:51:07 -0700
Subject: [PATCH 778/851] fixup! [Remarks] Elaborate on called intrinsics
 (#143985)

---
 llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll
index 2ad068eb7dc3d..49276c9416234 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/irtranslator/fallback.ll
@@ -9,7 +9,7 @@ declare <vscale x 1 x i8> @llvm.riscv.vadd.nxv1i8.nxv1i8(
   <vscale x 1 x i8>,
   i64)
 
-; FALLBACK_WITH_REPORT_ERR:  <unknown>:0:0: unable to translate instruction: call:
+; FALLBACK_WITH_REPORT_ERR:  <unknown>:0:0: unable to translate instruction: call
 ; FALLBACK-WITH-REPORT-OUT-LABEL: scalable_arg
 define <vscale x 1 x i8> @scalable_arg(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, i64 %2) nounwind {
 entry:
@@ -22,7 +22,7 @@ entry:
   ret <vscale x 1 x i8> %a
 }
 
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: call:
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: call
 ; FALLBACK-WITH-REPORT-OUT-LABEL: scalable_inst
 define <vscale x 1 x i8> @scalable_inst(i64 %0) nounwind {
 entry:

From 030b5519ec139757c13a6d6f337e69750ec24d6e Mon Sep 17 00:00:00 2001
From: Yijia Gu <yijiagu@google.com>
Date: Tue, 17 Jun 2025 10:52:34 -0700
Subject: [PATCH 779/851] [mlir][bazel] add missing deps for XeGPUTransforms

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index e7398a696beaa..55ee49444dc1c 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3512,6 +3512,7 @@ cc_library(
     deps = [
         ":AffineUtils",
         ":Analysis",
+        ":ArithDialect",	
         ":ArithUtils",
         ":DialectUtils",
         ":FunctionInterfaces",
@@ -3521,6 +3522,7 @@ cc_library(
         ":IndexDialect",
         ":InliningUtils",
         ":LoopLikeInterface",
+        ":MathDialect",
         ":MemRefDialect",
         ":Pass",
         ":SCFTransforms",

From b876b3fa98cffd5b8755398f9a8218f667464d76 Mon Sep 17 00:00:00 2001
From: vitor1001 <56533861+vitor1001@users.noreply.github.com>
Date: Tue, 17 Jun 2025 19:52:56 +0200
Subject: [PATCH 780/851] Add missing intrinsics to cuda headers (#143664)

LLVM prevents the sm_32_intrinsics.hpp header from being included with a
#define __SM_32_INTRINSICS_HPP__. It also provides drop-in replacements
of the functions defined in the CUDA header.

One issue is that some intrinsics were added after the replacement was
written, and thus have no replacement, breaking code that calls them
(Raft is one example).

This patch adds the missing intrinsics.
---
 clang/lib/Headers/__clang_cuda_intrinsics.h | 284 ++++++++++++++++++++
 1 file changed, 284 insertions(+)

diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 8b230af6f6647..5e13f3f78df70 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -479,6 +479,290 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
   return ret;
 }
 
+#pragma push_macro("__INTRINSIC_LOAD")
+#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType,  \
+                         __Clobber)                                            \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
+    __TmpType __ret;                                                           \
+    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber);        \
+    return (__DeclType)__ret;                                                  \
+  }
+
+#pragma push_macro("__INTRINSIC_LOAD2")
+#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
+                          __Clobber)                                           \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
+    __DeclType __ret;                                                          \
+    __TmpType __tmp;                                                           \
+    asm(__AsmOp " {%0,%1}, [%2];"                                              \
+        : __AsmType(__tmp.x), __AsmType(__tmp.y)                               \
+        : "l"(__ptr)__Clobber);                                                \
+    using __ElementType = decltype(__ret.x);                                   \
+    __ret.x = (__ElementType)(__tmp.x);                                        \
+    __ret.y = (__ElementType)__tmp.y;                                          \
+    return __ret;                                                              \
+  }
+
+#pragma push_macro("__INTRINSIC_LOAD4")
+#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
+                          __Clobber)                                           \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
+    __DeclType __ret;                                                          \
+    __TmpType __tmp;                                                           \
+    asm(__AsmOp " {%0,%1,%2,%3}, [%4];"                                        \
+        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z),          \
+          __AsmType(__tmp.w)                                                   \
+        : "l"(__ptr)__Clobber);                                                \
+    using __ElementType = decltype(__ret.x);                                   \
+    __ret.x = (__ElementType)__tmp.x;                                          \
+    __ret.y = (__ElementType)__tmp.y;                                          \
+    __ret.z = (__ElementType)__tmp.z;                                          \
+    __ret.w = (__ElementType)__tmp.w;                                          \
+    return __ret;                                                              \
+  }
+
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
+                 "=l", );
+
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64 ", longlong2, longlong2, "=l", );
+
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
+                 "=h", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
+                 unsigned long long, "=l", );
+
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
+                  "=l", );
+
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
+
+inline __device__ long __ldcg(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
+                 "=h", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
+                 unsigned long long, "=l", : "memory");
+
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
+                 "=h", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
+                 "=l", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
+                  "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
+                  "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
+                  "=h", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
+                  "=h", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
+                  "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
+                  "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
+                  "=l", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
+                  "=h", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
+                  "=h", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
+                  "=l", : "memory");
+
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
+                  "=f", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
+                  "=f", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
+                  "=d", : "memory");
+
+inline __device__ long __ldcv(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, signed int, "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
+                 "=l", );
+
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );
+
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
+                 "=h", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
+                 unsigned long long, "=l", );
+
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
+                  "=l", );
+
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
+
+#pragma pop_macro("__INTRINSIC_LOAD")
+#pragma pop_macro("__INTRINSIC_LOAD2")
+#pragma pop_macro("__INTRINSIC_LOAD4")
+
+inline __device__ long __ldcs(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+#pragma push_macro("__INTRINSIC_STORE")
+#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp = (__TmpType)__value;                                      \
+    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory");       \
+  }
+
+#pragma push_macro("__INTRINSIC_STORE2")
+#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType,           \
+                           __AsmType)                                          \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp;                                                           \
+    using __ElementType = decltype(__tmp.x);                                   \
+    __tmp.x = (__ElementType)(__value.x);                                      \
+    __tmp.y = (__ElementType)(__value.y);                                      \
+    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x),            \
+        __AsmType(__tmp.y)                                                     \
+        : "memory");                                                           \
+  }
+
+#pragma push_macro("__INTRINSIC_STORE4")
+#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType,           \
+                           __AsmType)                                          \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp;                                                           \
+    using __ElementType = decltype(__tmp.x);                                   \
+    __tmp.x = (__ElementType)(__value.x);                                      \
+    __tmp.y = (__ElementType)(__value.y);                                      \
+    __tmp.z = (__ElementType)(__value.z);                                      \
+    __tmp.w = (__ElementType)(__value.w);                                      \
+    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x),      \
+        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w)             \
+        : "memory");                                                           \
+  }
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
+
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
+                  "h");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
+                  unsigned long long, "l");
+
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uchar2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
+__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
+
+#pragma pop_macro("__INTRINSIC_STORE")
+#pragma pop_macro("__INTRINSIC_STORE2")
+#pragma pop_macro("__INTRINSIC_STORE4")
+
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
 
 #if CUDA_VERSION >= 11000

From 0cfc59ff51720ee60a71dd34077fc161886a3701 Mon Sep 17 00:00:00 2001
From: Yijia Gu <yijiagu@google.com>
Date: Tue, 17 Jun 2025 10:56:31 -0700
Subject: [PATCH 781/851] [mlir][bazel] remove extra empty space for
 XeGPUTransforms

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 55ee49444dc1c..cb0f9d8c7413c 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3512,7 +3512,7 @@ cc_library(
     deps = [
         ":AffineUtils",
         ":Analysis",
-        ":ArithDialect",	
+        ":ArithDialect",
         ":ArithUtils",
         ":DialectUtils",
         ":FunctionInterfaces",

From e29bb9a038245320164c5890d1a75843e4a664ef Mon Sep 17 00:00:00 2001
From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com>
Date: Tue, 17 Jun 2025 10:57:52 -0700
Subject: [PATCH 782/851] [IR2Vec] Consider only reachable BBs and non-debug
 instructions (#143476)

Changes to consider BBs that are reachable from the entry block. Similarly we skip debug instruction while computing the embeddings.

(Tracking issue - #141817)
---
 llvm/lib/Analysis/IR2Vec.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 0f7303c1b0917..fa38c35796a0e 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -13,7 +13,9 @@
 
 #include "llvm/Analysis/IR2Vec.h"
 
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Debug.h"
@@ -190,7 +192,8 @@ Embedding SymbolicEmbedder::getOperandEmbedding(const Value *Op) const {
 void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
   Embedding BBVector(Dimension, 0);
 
-  for (const auto &I : BB) {
+  // We consider only the non-debug and non-pseudo instructions
+  for (const auto &I : BB.instructionsWithoutDebug()) {
     Embedding InstVector(Dimension, 0);
 
     const auto OpcVec = lookupVocab(I.getOpcodeName());
@@ -215,9 +218,11 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
 void SymbolicEmbedder::computeEmbeddings() const {
   if (F.isDeclaration())
     return;
-  for (const auto &BB : F) {
-    computeEmbeddings(BB);
-    FuncVector += BBVecMap[&BB];
+
+  // Consider only the basic blocks that are reachable from entry
+  for (const BasicBlock *BB : depth_first(&F)) {
+    computeEmbeddings(*BB);
+    FuncVector += BBVecMap[BB];
   }
 }
 

From 31523de4b000ca254259ae3167d28922e1302648 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Tue, 17 Jun 2025 23:43:07 +0530
Subject: [PATCH 783/851] [Driver] Fix link order of BareMetal toolchain object
 (#132806)

The linker job in BareMetal toolchain object will be used by GNU ld and
lld both.
However, gnuld process the arguments in the order in which they appear
on command
line, whereas there is no such restriction with lld.

The previous order was:
LibraryPaths -> Libraries -> LTOOptions -> LinkerInputs
The new order is:
LibraryPaths -> LTOOptions -> LinkerInputs -> Libraries

LTO options need to be added before adding any linker inputs because
file format
after compile stage during LTO is bitcode which gnuld natively cannot
process.
Hence will need to pass appropriate plugins before adding any bitcode
file on the
command line.

Object files that are getting linked need to be passed before processing
any
libraries so that gnuld can appropriately do symbol resolution for the
symbols
for which no definition is provided through user code.

Similar link order is also followed by other linker jobs for gnuld such
as in
gnutools::Linker in Gnu.cpp

This is the 3rd patch in the series of patches of merging RISCVToolchain
into
BareMetal toolchain object.

RFC:

https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/lib/Driver/ToolChains/BareMetal.cpp   | 12 ++--
 clang/test/Driver/aarch64-toolchain-extra.c |  2 +-
 clang/test/Driver/aarch64-toolchain.c       | 28 ++++----
 clang/test/Driver/arm-toolchain-extra.c     |  2 +-
 clang/test/Driver/arm-toolchain.c           | 28 ++++----
 clang/test/Driver/baremetal-multilib.yaml   |  3 +-
 clang/test/Driver/baremetal-sysroot.cpp     |  8 ++-
 clang/test/Driver/baremetal.cpp             | 79 +++++++++++++--------
 8 files changed, 92 insertions(+), 70 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index a08bb588dd764..a665040662a3f 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -568,8 +568,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   const llvm::Triple::ArchType Arch = TC.getArch();
   const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
 
-  AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA);
-
   CmdArgs.push_back("-Bstatic");
 
   if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax))
@@ -619,6 +617,12 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   for (const auto &LibPath : TC.getLibraryPaths())
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-L", LibPath)));
 
+  if (D.isUsingLTO())
+    addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
+                  D.getLTOMode() == LTOK_Thin);
+
+  AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA);
+
   if (TC.ShouldLinkCXXStdlib(Args)) {
     bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
                                !Args.hasArg(options::OPT_static);
@@ -639,10 +643,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("--end-group");
   }
 
-  if (D.isUsingLTO())
-    addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
-                  D.getLTOMode() == LTOK_Thin);
-
   if ((TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) &&
       NeedCRTs)
     CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd)));
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
index 2a930e35acd45..a0b5f2902962f 100644
--- a/clang/test/Driver/aarch64-toolchain-extra.c
+++ b/clang/test/Driver/aarch64-toolchain-extra.c
@@ -31,5 +31,5 @@
 // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtbegin.o"
 // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL-NOGCC: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtend.o"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index 83cd95136b158..e12107fa2c506 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -11,12 +11,12 @@
 // LLD-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // LLD-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// LLD-AARCH64-BAREMETAL: "-Bstatic" "-EL"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// LLD-AARCH64-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// LLD-AARCH64-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
@@ -29,12 +29,12 @@
 // C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// C-AARCH64-BAREMETAL: "-Bstatic" "-EL"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
@@ -46,12 +46,12 @@
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -65,12 +65,12 @@
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL: "-Bstatic" "-EL"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -84,12 +84,12 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -102,12 +102,12 @@
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-Bstatic" "-EL"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -119,12 +119,12 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-EL"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
index 2adf4ab698ba0..a04b41c13e95e 100644
--- a/clang/test/Driver/arm-toolchain-extra.c
+++ b/clang/test/Driver/arm-toolchain-extra.c
@@ -31,6 +31,6 @@
 // C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtbegin.o"
 // C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL-NOGCC: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL-NOGCC: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtend.o"
 
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index 66bed1b0c4d87..d4f9bf2aaf3d5 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -10,12 +10,12 @@
 // LLD-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // LLD-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// LLD-ARM-BAREMETAL: "-Bstatic" "-EL"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// LLD-ARM-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// LLD-ARM-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
@@ -28,12 +28,12 @@
 // C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// C-ARM-BAREMETAL: "-Bstatic" "-EL"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
@@ -45,12 +45,12 @@
 // C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// C-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL-NOSYSROOT: "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
+// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -65,12 +65,12 @@
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL: "-Bstatic" "-EL"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 
@@ -85,12 +85,12 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -103,12 +103,12 @@
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-LIBCXX: "-Bstatic" "-EL"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clangxx -### %s -fuse-ld= \
@@ -120,12 +120,12 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-Bstatic" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-EL"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
diff --git a/clang/test/Driver/baremetal-multilib.yaml b/clang/test/Driver/baremetal-multilib.yaml
index 853a4e9e36e43..1a80c3b4ccfc8 100644
--- a/clang/test/Driver/baremetal-multilib.yaml
+++ b/clang/test/Driver/baremetal-multilib.yaml
@@ -8,8 +8,9 @@
 # CHECK-SAME: "-internal-isystem" "[[SYSROOT:[^"]*]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/include/c++/v1"
 # CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/include"
 # CHECK-SAME: "-x" "c++" "{{.*}}baremetal-multilib.yaml"
-# CHECK-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+# CHECK-NEXT: ld{{(.exe)?}}" "-Bstatic"
 # CHECK-SAME: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/lib"
+# CHECK-SAME: "{{.*}}.o"
 # CHECK-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 # CHECK-SAME: "-lc"
 # CHECK-SAME: "-o" "{{.*}}.tmp.out"
diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp
index 5d5b336a01b0b..47f0616df8501 100644
--- a/clang/test/Driver/baremetal-sysroot.cpp
+++ b/clang/test/Driver/baremetal-sysroot.cpp
@@ -9,15 +9,17 @@
 // RUN: mkdir -p %T/baremetal_default_sysroot/lib/clang-runtimes/armv6m-none-eabi
 // RUN: ln -s %clang %T/baremetal_default_sysroot/bin/clang
 
-// RUN: %T/baremetal_default_sysroot/bin/clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN: %T/baremetal_default_sysroot/bin/clang -no-canonical-prefixes %s -### -o %t.out 2>&1 \
 // RUN:     -target armv6m-none-eabi --sysroot= \
 // RUN:   | FileCheck --check-prefix=CHECK-V6M-C %s
 // CHECK-V6M-C: "{{.*}}clang{{.*}}" "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // CHECK-V6M-C-SAME: "-internal-isystem" "{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-C-SAME: "-internal-isystem" "{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}include"
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal-sysroot.cpp"
-// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "-Bstatic"
+// CHECK-V6M-C-SAME: "crt0.o"
 // CHECK-V6M-C-SAME: "-L{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}lib"
+// CHECK-V6M-C-SAME: "{{.*}}.o"
 // CHECK-V6M-C-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-C-SAME: "-lc"
-// CHECK-V6M-C-SAME: "-o" "{{.*}}.o"
+// CHECK-V6M-C-SAME: "-o" "{{.*}}.tmp.out"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index eff8f775a9c1e..b75f1a9280d12 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -15,11 +15,12 @@
 // CHECK-V6M-C-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-C-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-C-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "{{[^"]*}}libclang_rt.builtins.a"
+// CHECK-V6M-C-SAME: "{{.*}}.o"
+// CHECK-V6M-C-SAME: {{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-C-SAME: "-lc"
 // CHECK-V6M-C-SAME: "--target2=rel" "-o" "{{.*}}.tmp.out"
 
@@ -39,9 +40,10 @@
 // CHECK-V6M-TREE-SAME: {{^}} "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-TREE-SAME: "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}armv6m-unknown-none-eabi"
 // CHECK-V6M-TREE-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-TREE-SAME: "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi{{[/\\]+}}crt0.o"
 // CHECK-V6M-TREE-SAME: "-L[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi"
+// CHECK-V6M-TREE-SAME "{{.*}}.o"
 // CHECK-V6M-TREE-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-TREE-SAME: "-lc"
 // CHECK-V6M-TREE-SAME: "--target2=rel" "-o" "{{.*}}.tmp.out"
@@ -53,19 +55,21 @@
 // CHECK-ARMV7M-PER-TARGET: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-ARMV7M-PER-TARGET: "-isysroot" "[[SYSROOT:[^"]*]]"
 // CHECK-ARMV7M-PER-TARGET: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-ARMV7M-PER_TARGET: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-ARMV7M-PER-TARGET: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
 // CHECK-ARMV7M-PER-TARGET: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}armv7m-vendor-none-eabi
+// CHECK-ARMV7M-PER-TARGET: "{{.*}}.o"
 // CHECK-ARMV7M-PER-TARGET: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-ARMV7M-PER-TARGET: "-lc"
 
 // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-DEFAULTCXX %s
 // CHECK-V6M-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-DEFAULTCXX-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
+// CHECK-V6M-DEFAULTCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lc++"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lm"
 // CHECK-V6M-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
@@ -77,8 +81,9 @@
 // CHECK-V6M-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-V6M-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
+// CHECK-V6M-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-SAME: "-lc++"
 // CHECK-V6M-LIBCXX-SAME: "-lm"
 // CHECK-V6M-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
@@ -92,8 +97,9 @@
 // CHECK-V6M-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}6.0.0"
-// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
+// CHECK-V6M-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-V6M-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lc"
@@ -104,7 +110,7 @@
 // RUN:     -nodefaultlibs \
 // RUN:   | FileCheck --check-prefix=CHECK-V6M-NDL %s
 // CHECK-V6M-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-V6M-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-V6M-NDL: ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-V6M-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 
 // RUN: rm -rf %T/baremetal_cxx_sysroot
@@ -119,6 +125,7 @@
 // CHECK-V6M-LIBCXX-USR-SAME: "-internal-isystem" "{{[^"]+}}baremetal_cxx_sysroot{{[/\\]+}}usr{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBCXX-USR: "{{[^"]*}}-Bstatic"
 // CHECK-V6M-LIBCXX-USR-SAME: "-L{{[^"]*}}{{[/\\]+}}baremetal_cxx_sysroot{{[/\\]+}}lib"
+// CHECK-V6M-LIBCXX-USR: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc++" "-lm"
 // CHECK-V6M-LIBCXX-USR-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc"
@@ -149,7 +156,7 @@
 
 // RUN: %clang -### %s --target=armebv7-none-eabi --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
-// CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "--be8" "-EB"
+// CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "--be8" "-EB"
 
 // RUN: %clang -### %s --target=armv7-none-eabi -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
@@ -159,7 +166,7 @@
 
 // RUN: %clang -### %s --target=armv7-none-eabi --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EL %s
-// CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-ARMV7EL-NOT: "--be8"
 
 // RUN: %clang -### %s --target=armebv7-none-eabi -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -170,7 +177,7 @@
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64BE %s
-// CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EB"
+// CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EB"
 // CHECK-AARCH64BE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64-none-elf -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -181,7 +188,7 @@
 
 // RUN: %clang -### %s --target=aarch64-none-elf --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64LE %s
-// CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
+// CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EL"
 // CHECK-AARCH64LE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -221,9 +228,10 @@
 // CHECK-RV64-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-RV64-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV64-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV64-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
+// CHECK-RV64-SAME:"{{.*}}.o"
 // CHECK-RV64-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-SAME: "-lc"
 // CHECK-RV64-SAME: "-X" "-o" "{{.*}}.tmp.out"
@@ -232,8 +240,9 @@
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-DEFAULTCXX %s
 // CHECK-RV64-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV64-DEFAULTCXX-SAME:"{{.*}}.o"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc"
@@ -246,8 +255,9 @@
 // CHECK-RV64-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-RV64-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV64-LIBCXX-SAME:"{{.*}}.o"
 // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBCXX-SAME: "-lc"
@@ -260,8 +270,9 @@
 // CHECK-RV64-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
-// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV64-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV64-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lc"
@@ -277,9 +288,10 @@
 // CHECK-RV32-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-RV32-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV32-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
+// CHECK-RV32-SAME: "{{.*}}.o"
 // CHECK-RV32-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-SAME: "-lc"
 // CHECK-RV32-SAME: "-X" "-o" "a.out"
@@ -288,8 +300,9 @@
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV32-DEFAULTCXX %s
 // CHECK-RV32-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV32-DEFAULTCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc"
@@ -302,8 +315,9 @@
 // CHECK-RV32-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-RV32-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV32-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-LIBCXX-SAME: "-X" "-o" "a.out"
@@ -315,8 +329,9 @@
 // CHECK-RV32-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
-// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
+// CHECK-RV32-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lc"
@@ -337,7 +352,7 @@
 // RUN:     -nodefaultlibs \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-NDL %s
 // CHECK-RV64-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV64-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64-NDL: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
@@ -356,7 +371,7 @@
 // CHECK-RV64FD-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64FD-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}include"
 // CHECK-RV64FD-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV64FD-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -375,7 +390,7 @@
 // CHECK-RV32I-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32I-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32I-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32I-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32I-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32I-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -394,7 +409,7 @@
 // CHECK-RV32IM-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IM-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32IM-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32IM-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -408,7 +423,7 @@
 // CHECK-RV32IAC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IAC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32IAC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32IAC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf -march=rv32imafc -mabi=ilp32f \
@@ -429,7 +444,7 @@
 // CHECK-RV32IMAFC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IMAFC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}include"
 // CHECK-RV32IMAFC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-RV32IMAFC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}lib"
 
 // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc-unknown-eabi 2>&1 \
@@ -440,8 +455,9 @@
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-PPCEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
+// CHECK-PPCEABI-SAME:"{{.*}}.o"
 // CHECK-PPCEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPCEABI-SAME: "-lc"
 // CHECK-PPCEABI-SAME: "-o" "a.out"
@@ -454,8 +470,9 @@
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-PPC64EABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
+// CHECK-PPC64EABI-SAME:"{{.*}}.o"
 // CHECK-PPC64EABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPC64EABI-SAME: "-lc"
 // CHECK-PPC64EABI-SAME: "-o" "a.out"
@@ -468,8 +485,9 @@
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-PPCLEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
+// CHECK-PPCLEEABI-SAME:"{{.*}}.o"
 // CHECK-PPCLEEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPCLEEABI-SAME: "-lc"
 // CHECK-PPCLEEABI-SAME: "-o" "a.out"
@@ -482,8 +500,9 @@
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
+// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
 // CHECK-PPC64LEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
+// CHECK-PPC64LEEABI-SAME:"{{.*}}.o"
 // CHECK-PPC64LEEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPC64LEEABI-SAME: "-lc"
 // CHECK-PPC64LEEABI-SAME: "-o" "a.out"

From 2ab9c35ea93f8557827d4cadcceb05e4eed2d30a Mon Sep 17 00:00:00 2001
From: joaosaffran <126493771+joaosaffran@users.noreply.github.com>
Date: Tue, 17 Jun 2025 11:16:09 -0700
Subject: [PATCH 784/851] [DXContainer] Update DXContainer to match D3D12 spec
 (#143201)

Update the descriptor range flag values in DXContainerConstants.def to
match
the Direct3D12 specification. This changes two aspects:

1. Modify the DESCRIPTOR_RANGE_FLAG macro to use direct values instead
of
   bit shifts
2. Update the flag values to use hex notation and match D3D12's
   D3D12_DESCRIPTOR_RANGE_FLAGS enumeration:
   - DESCRIPTORS_VOLATILE: 0x1
   - DATA_VOLATILE: 0x2
   - DATA_STATIC_WHILE_SET_AT_EXECUTE: 0x4
   - DATA_STATIC: 0x8
   - DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: 0x10000
3. Removed NONE value from ROOT_DESCRIPTOR_FLAG

This ensures better compatibility with the D3D12 API and makes the
values
more explicit in the code.

Requested here:
https://github.com/llvm/llvm-project/pull/138315#discussion_r2132818269

---------

Co-authored-by: joaosaffran <joao.saffran@microsoft.com>
---
 llvm/include/llvm/BinaryFormat/DXContainer.h  |  6 +--
 .../BinaryFormat/DXContainerConstants.def     | 41 ++++++++++---------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index 08949e39716d5..6d625dad5853f 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -154,17 +154,17 @@ enum class FeatureFlags : uint64_t {
 static_assert((uint64_t)FeatureFlags::NextUnusedBit <= 1ull << 63,
               "Shader flag bits exceed enum size.");
 
-#define ROOT_ELEMENT_FLAG(Num, Val) Val = 1ull << Num,
+#define ROOT_ELEMENT_FLAG(Num, Val) Val = Num,
 enum class RootElementFlag : uint32_t {
 #include "DXContainerConstants.def"
 };
 
-#define ROOT_DESCRIPTOR_FLAG(Num, Val) Val = 1ull << Num,
+#define ROOT_DESCRIPTOR_FLAG(Num, Val) Val = Num,
 enum class RootDescriptorFlag : uint32_t {
 #include "DXContainerConstants.def"
 };
 
-#define DESCRIPTOR_RANGE_FLAG(Num, Val) Val = 1ull << Num,
+#define DESCRIPTOR_RANGE_FLAG(Num, Val) Val = Num,
 enum class DescriptorRangeFlag : uint32_t {
 #include "DXContainerConstants.def"
 };
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index 501ef0c31cdd0..18e79e6fa65a5 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -59,18 +59,19 @@ SHADER_FEATURE_FLAG(33, 39, NextUnusedBit, "Next reserved shader flag bit (not a
 // ROOT_ELEMENT_FLAG(bit offset for the flag, name).
 #ifdef ROOT_ELEMENT_FLAG
 
-ROOT_ELEMENT_FLAG(0, AllowInputAssemblerInputLayout)
-ROOT_ELEMENT_FLAG(1, DenyVertexShaderRootAccess)
-ROOT_ELEMENT_FLAG(2, DenyHullShaderRootAccess)
-ROOT_ELEMENT_FLAG(3, DenyDomainShaderRootAccess)
-ROOT_ELEMENT_FLAG(4, DenyGeometryShaderRootAccess)
-ROOT_ELEMENT_FLAG(5, DenyPixelShaderRootAccess)
-ROOT_ELEMENT_FLAG(6, AllowStreamOutput)
-ROOT_ELEMENT_FLAG(7, LocalRootSignature)
-ROOT_ELEMENT_FLAG(8, DenyAmplificationShaderRootAccess)
-ROOT_ELEMENT_FLAG(9, DenyMeshShaderRootAccess)
-ROOT_ELEMENT_FLAG(10, CBVSRVUAVHeapDirectlyIndexed)
-ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed)
+ROOT_ELEMENT_FLAG(0, NONE)
+ROOT_ELEMENT_FLAG(0x1, AllowInputAssemblerInputLayout)
+ROOT_ELEMENT_FLAG(0x2, DenyVertexShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x4, DenyHullShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x8, DenyDomainShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x10, DenyGeometryShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x20, DenyPixelShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x40, AllowStreamOutput)
+ROOT_ELEMENT_FLAG(0x80, LocalRootSignature)
+ROOT_ELEMENT_FLAG(0x100, DenyAmplificationShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x200, DenyMeshShaderRootAccess)
+ROOT_ELEMENT_FLAG(0x400, CBVSRVUAVHeapDirectlyIndexed)
+ROOT_ELEMENT_FLAG(0x800, SamplerHeapDirectlyIndexed)
 #undef ROOT_ELEMENT_FLAG
 #endif // ROOT_ELEMENT_FLAG
 
@@ -79,9 +80,9 @@ ROOT_ELEMENT_FLAG(11, SamplerHeapDirectlyIndexed)
 #ifdef ROOT_DESCRIPTOR_FLAG
 
 ROOT_DESCRIPTOR_FLAG(0, NONE)
-ROOT_DESCRIPTOR_FLAG(1, DATA_VOLATILE)
-ROOT_DESCRIPTOR_FLAG(2, DATA_STATIC_WHILE_SET_AT_EXECUTE)
-ROOT_DESCRIPTOR_FLAG(3, DATA_STATIC)
+ROOT_DESCRIPTOR_FLAG(0x2, DATA_VOLATILE)
+ROOT_DESCRIPTOR_FLAG(0x4, DATA_STATIC_WHILE_SET_AT_EXECUTE)
+ROOT_DESCRIPTOR_FLAG(0x8, DATA_STATIC)
 #undef ROOT_DESCRIPTOR_FLAG
 #endif // ROOT_DESCRIPTOR_FLAG
 
@@ -90,11 +91,11 @@ ROOT_DESCRIPTOR_FLAG(3, DATA_STATIC)
 #ifdef DESCRIPTOR_RANGE_FLAG
 
 DESCRIPTOR_RANGE_FLAG(0, NONE)
-DESCRIPTOR_RANGE_FLAG(1, DESCRIPTORS_VOLATILE)
-DESCRIPTOR_RANGE_FLAG(2, DATA_VOLATILE)
-DESCRIPTOR_RANGE_FLAG(3, DATA_STATIC_WHILE_SET_AT_EXECUTE)
-DESCRIPTOR_RANGE_FLAG(4, DATA_STATIC)
-DESCRIPTOR_RANGE_FLAG(16, DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS)
+DESCRIPTOR_RANGE_FLAG(0x1, DESCRIPTORS_VOLATILE)
+DESCRIPTOR_RANGE_FLAG(0x2, DATA_VOLATILE)
+DESCRIPTOR_RANGE_FLAG(0x4, DATA_STATIC_WHILE_SET_AT_EXECUTE)
+DESCRIPTOR_RANGE_FLAG(0x8, DATA_STATIC)
+DESCRIPTOR_RANGE_FLAG(0x10000, DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS)
 #undef DESCRIPTOR_RANGE_FLAG
 #endif // DESCRIPTOR_RANGE_FLAG
 

From bb288de4e0e74f235402ff41be60dabcd57e379f Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Tue, 17 Jun 2025 11:22:23 -0700
Subject: [PATCH 785/851] [LoopPeel] Support last iteration peeling of min/max
 intrinsics (#143598)

This isn't terribly useful at the moment because of the step=1
restriction but it should be functionally sound. This is mostly just
making sure the codepaths don't diverge as we make other changes.
---
 llvm/lib/Transforms/Utils/LoopPeel.cpp        |  5 +-
 .../LoopUnroll/peel-last-iteration-minmax.ll  | 48 +++++++++++++++----
 2 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index f343962548259..27e70c5ddc0fc 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -545,8 +545,11 @@ countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE,
     const SCEV *IterVal = AddRec->evaluateAtIteration(
         SE.getConstant(AddRec->getType(), NewPeelCount), SE);
     if (!PeelWhilePredicateIsKnown(NewPeelCount, IterVal, BoundSCEV, Step,
-                                   Pred))
+                                   Pred)) {
+      if (shouldPeelLastIteration(L, Pred, AddRec, BoundSCEV, SE, TTI))
+        DesiredPeelCountLast = 1;
       return;
+    }
     DesiredPeelCount = NewPeelCount;
   };
 
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll
index cd098e123b5f6..5e8540814fff2 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll
@@ -41,16 +41,27 @@ define i32 @smin_unit_step() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @foo(i32 1)
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT1]], 1023
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1024, [[IV]]
 ; CHECK-NEXT:    [[MINMAX:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 1)
 ; CHECK-NEXT:    call void @foo(i32 [[MINMAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC_PEEL:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ]
-; CHECK-NEXT:    ret i32 [[MINMAX_LCSSA]]
+; CHECK-NEXT:    ret i32 [[MINMAX]]
 ;
 entry:
   br label %loop
@@ -74,16 +85,28 @@ define i32 @smax_unit_step() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB1:%.*]] = sub nuw nsw i32 1024, [[IV1]]
+; CHECK-NEXT:    call void @foo(i32 [[SUB1]])
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i32 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT1]], 1023
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1024, [[IV]]
 ; CHECK-NEXT:    [[MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[SUB]], i32 1)
 ; CHECK-NEXT:    call void @foo(i32 [[MINMAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC_PEEL:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ]
-; CHECK-NEXT:    ret i32 [[MINMAX_LCSSA]]
+; CHECK-NEXT:    ret i32 [[MINMAX]]
 ;
 entry:
   br label %loop
@@ -135,3 +158,8 @@ exit:
   ret i32 %minmax.lcssa
 }
 
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+;.

From 8cd05b88ec623018ca2c68cf2418d2beed026d27 Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Tue, 17 Jun 2025 11:27:35 -0700
Subject: [PATCH 786/851] [NFC][HLSL] Move Sema work from
 `ParseMicrosoftRootSignatureAttributeArgs` (#143184)

This separates semantic analysis from parsing by moving `RootSignatureDecl` creation, scope storage, and lookup logic into
`SemaHLSL`.

For more context see:
https://github.com/llvm/llvm-project/issues/142834.

- Define `ActOnStartRootSignatureDecl` and `ActOnFinishRootSignatureDecl` on `SemaHLSL`
- NFC so no test changes.

Resolves: https://github.com/llvm/llvm-project/issues/142834

---------

Co-authored-by: Aaron Ballman <aaron@aaronballman.com>
---
 clang/include/clang/Parse/Parser.h  |  2 +-
 clang/include/clang/Sema/SemaHLSL.h | 13 ++++++++++++
 clang/lib/Parse/ParseDeclCXX.cpp    | 32 ++++++++++++-----------------
 clang/lib/Sema/SemaDecl.cpp         |  1 +
 clang/lib/Sema/SemaHLSL.cpp         | 25 ++++++++++++++++++++++
 5 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 3243b94c5e5e6..a47e23ffbd357 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -3598,7 +3598,7 @@ class Parser : public CodeCompletionHandler {
   /// keyword.
   bool isClassCompatibleKeyword(Token Tok) const;
 
-  void ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs);
+  void ParseHLSLRootSignatureAttributeArgs(ParsedAttributes &Attrs);
 
   ///@}
 
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index ba5f06f93dc30..33c4b8d1568bf 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -119,6 +119,19 @@ class SemaHLSL : public SemaBase {
                                        bool IsCompAssign);
   void emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, BinaryOperatorKind Opc);
 
+  /// Computes the unique Root Signature identifier from the given signature,
+  /// then lookup if there is a previousy created Root Signature decl.
+  ///
+  /// Returns the identifier and if it was found
+  std::pair<IdentifierInfo *, bool>
+  ActOnStartRootSignatureDecl(StringRef Signature);
+
+  /// Creates the Root Signature decl of the parsed Root Signature elements
+  /// onto the AST and push it onto current Scope
+  void ActOnFinishRootSignatureDecl(
+      SourceLocation Loc, IdentifierInfo *DeclIdent,
+      SmallVector<llvm::hlsl::rootsig::RootElement> &Elements);
+
   void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL);
   void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL);
   void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL);
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index a5c76501c7c18..c1493a5bfd3b3 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -29,6 +29,7 @@
 #include "clang/Sema/ParsedTemplate.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/SemaCodeCompletion.h"
+#include "clang/Sema/SemaHLSL.h"
 #include "llvm/Support/TimeProfiler.h"
 #include <optional>
 
@@ -4903,7 +4904,7 @@ void Parser::ParseMicrosoftUuidAttributeArgs(ParsedAttributes &Attrs) {
   }
 }
 
-void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
+void Parser::ParseHLSLRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
   assert(Tok.is(tok::identifier) &&
          "Expected an identifier to denote which MS attribute to consider");
   IdentifierInfo *RootSignatureIdent = Tok.getIdentifierInfo();
@@ -4945,18 +4946,14 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
 
   // Construct our identifier
   StringRef Signature = StrLiteral.value()->getString();
-  auto Hash = llvm::hash_value(Signature);
-  std::string IdStr = "__hlsl_rootsig_decl_" + std::to_string(Hash);
-  IdentifierInfo *DeclIdent = &(Actions.getASTContext().Idents.get(IdStr));
-
-  LookupResult R(Actions, DeclIdent, SourceLocation(),
-                 Sema::LookupOrdinaryName);
-  // Check if we have already found a decl of the same name, if we haven't
-  // then parse the root signature string and construct the in-memory elements
-  if (!Actions.LookupQualifiedName(R, Actions.CurContext)) {
+  auto [DeclIdent, Found] =
+      Actions.HLSL().ActOnStartRootSignatureDecl(Signature);
+  // If we haven't found an already defined DeclIdent then parse the root
+  // signature string and construct the in-memory elements
+  if (!Found) {
+    // Offset location 1 to account for '"'
     SourceLocation SignatureLoc =
-        StrLiteral.value()->getExprLoc().getLocWithOffset(
-            1); // offset 1 for '"'
+        StrLiteral.value()->getExprLoc().getLocWithOffset(1);
     // Invoke the root signature parser to construct the in-memory constructs
     hlsl::RootSignatureLexer Lexer(Signature, SignatureLoc);
     SmallVector<llvm::hlsl::rootsig::RootElement> RootElements;
@@ -4966,12 +4963,9 @@ void Parser::ParseMicrosoftRootSignatureAttributeArgs(ParsedAttributes &Attrs) {
       return;
     }
 
-    // Create the Root Signature
-    auto *SignatureDecl = HLSLRootSignatureDecl::Create(
-        Actions.getASTContext(), /*DeclContext=*/Actions.CurContext,
-        RootSignatureLoc, DeclIdent, RootElements);
-    SignatureDecl->setImplicit();
-    Actions.PushOnScopeChains(SignatureDecl, getCurScope());
+    // Construct the declaration.
+    Actions.HLSL().ActOnFinishRootSignatureDecl(RootSignatureLoc, DeclIdent,
+                                                RootElements);
   }
 
   // Create the arg for the ParsedAttr
@@ -5014,7 +5008,7 @@ void Parser::ParseMicrosoftAttributes(ParsedAttributes &Attrs) {
       if (Tok.getIdentifierInfo()->getName() == "uuid")
         ParseMicrosoftUuidAttributeArgs(Attrs);
       else if (Tok.getIdentifierInfo()->getName() == "RootSignature")
-        ParseMicrosoftRootSignatureAttributeArgs(Attrs);
+        ParseHLSLRootSignatureAttributeArgs(Attrs);
       else {
         IdentifierInfo *II = Tok.getIdentifierInfo();
         SourceLocation NameLoc = Tok.getLocation();
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 5cffd82e3372e..02ac898a2b702 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -62,6 +62,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Frontend/HLSL/HLSLRootSignature.h"
 #include "llvm/Support/SaveAndRestore.h"
 #include "llvm/TargetParser/Triple.h"
 #include <algorithm>
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index ba491b6134293..4a8479a00e0e7 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -978,6 +978,31 @@ void SemaHLSL::emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS,
       << NewFnName << FixItHint::CreateReplacement(FullRange, OS.str());
 }
 
+std::pair<IdentifierInfo *, bool>
+SemaHLSL::ActOnStartRootSignatureDecl(StringRef Signature) {
+  llvm::hash_code Hash = llvm::hash_value(Signature);
+  std::string IdStr = "__hlsl_rootsig_decl_" + std::to_string(Hash);
+  IdentifierInfo *DeclIdent = &(getASTContext().Idents.get(IdStr));
+
+  // Check if we have already found a decl of the same name.
+  LookupResult R(SemaRef, DeclIdent, SourceLocation(),
+                 Sema::LookupOrdinaryName);
+  bool Found = SemaRef.LookupQualifiedName(R, SemaRef.CurContext);
+  return {DeclIdent, Found};
+}
+
+void SemaHLSL::ActOnFinishRootSignatureDecl(
+    SourceLocation Loc, IdentifierInfo *DeclIdent,
+    SmallVector<llvm::hlsl::rootsig::RootElement> &Elements) {
+
+  auto *SignatureDecl = HLSLRootSignatureDecl::Create(
+      SemaRef.getASTContext(), /*DeclContext=*/SemaRef.CurContext, Loc,
+      DeclIdent, Elements);
+
+  SignatureDecl->setImplicit();
+  SemaRef.PushOnScopeChains(SignatureDecl, SemaRef.getCurScope());
+}
+
 void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) {
   if (AL.getNumArgs() != 1) {
     Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1;

From 80f3a28bbe7c2e17fb4b60e974c4157ec7e1eefc Mon Sep 17 00:00:00 2001
From: Justin King <jcking@wulver.com>
Date: Tue, 17 Jun 2025 11:28:14 -0700
Subject: [PATCH 787/851] Revert "lsan: Support free_sized and
 free_aligned_sized from C23" (#144575)

Reverts llvm/llvm-project#144415

Need to update approach to handle Apple platforms gracefully.
---
 compiler-rt/lib/lsan/lsan_allocator.cpp       |  4 ----
 compiler-rt/lib/lsan/lsan_allocator.h         |  2 --
 compiler-rt/lib/lsan/lsan_interceptors.cpp    | 18 ---------------
 compiler-rt/lib/lsan/lsan_malloc_mac.cpp      | 23 ++++++++-----------
 .../sanitizer_common/sanitizer_malloc_mac.inc | 15 ------------
 5 files changed, 10 insertions(+), 52 deletions(-)

diff --git a/compiler-rt/lib/lsan/lsan_allocator.cpp b/compiler-rt/lib/lsan/lsan_allocator.cpp
index a436d9c07ac6c..493bf5f9efc57 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.cpp
+++ b/compiler-rt/lib/lsan/lsan_allocator.cpp
@@ -220,10 +220,6 @@ void lsan_free(void *p) {
   Deallocate(p);
 }
 
-void lsan_free_sized(void *p, uptr) { Deallocate(p); }
-
-void lsan_free_aligned_sized(void *p, uptr, uptr) { Deallocate(p); }
-
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack) {
   return SetErrnoOnNull(Reallocate(stack, p, size, 1));
 }
diff --git a/compiler-rt/lib/lsan/lsan_allocator.h b/compiler-rt/lib/lsan/lsan_allocator.h
index 2342f11fb5d0d..5eed0cbdb309b 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.h
+++ b/compiler-rt/lib/lsan/lsan_allocator.h
@@ -127,8 +127,6 @@ void *lsan_aligned_alloc(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_memalign(uptr alignment, uptr size, const StackTrace &stack);
 void *lsan_malloc(uptr size, const StackTrace &stack);
 void lsan_free(void *p);
-void lsan_free_sized(void *p, uptr size);
-void lsan_free_aligned_sized(void *p, uptr alignment, uptr size);
 void *lsan_realloc(void *p, uptr size, const StackTrace &stack);
 void *lsan_reallocarray(void *p, uptr nmemb, uptr size,
                         const StackTrace &stack);
diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp
index 8e33130840e92..a8252cddacf25 100644
--- a/compiler-rt/lib/lsan/lsan_interceptors.cpp
+++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp
@@ -84,24 +84,6 @@ INTERCEPTOR(void, free, void *p) {
   lsan_free(p);
 }
 
-INTERCEPTOR(void, free_sized, void *p, uptr size) {
-  if (UNLIKELY(!p))
-    return;
-  if (DlsymAlloc::PointerIsMine(p))
-    return DlsymAlloc::Free(p);
-  ENSURE_LSAN_INITED;
-  lsan_free_sized(p, size);
-}
-
-INTERCEPTOR(void, free_aligned_sized, void *p, uptr alignment, uptr size) {
-  if (UNLIKELY(!p))
-    return;
-  if (DlsymAlloc::PointerIsMine(p))
-    return DlsymAlloc::Free(p);
-  ENSURE_LSAN_INITED;
-  lsan_free_aligned_sized(p, alignment, size);
-}
-
 INTERCEPTOR(void*, calloc, uptr nmemb, uptr size) {
   if (DlsymAlloc::Use())
     return DlsymAlloc::Callocate(nmemb, size);
diff --git a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
index 8a16c053da238..525c30272ccca 100644
--- a/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
+++ b/compiler-rt/lib/lsan/lsan_malloc_mac.cpp
@@ -44,19 +44,16 @@ using namespace __lsan;
   void *p = lsan_valloc(size, stack)
 #define COMMON_MALLOC_FREE(ptr) \
   lsan_free(ptr)
-#  define COMMON_MALLOC_FREE_SIZED(ptr, size) lsan_free_sized(ptr, size)
-#  define COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size) \
-    lsan_free_aligned_sized(ptr, alignment, size)
-#  define COMMON_MALLOC_SIZE(ptr) uptr size = lsan_mz_size(ptr)
-#  define COMMON_MALLOC_FILL_STATS(zone, stats)
-#  define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name)    \
-    (void)zone_name;                                                        \
-    Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", \
-           ptr);
-#  define COMMON_MALLOC_NAMESPACE __lsan
-#  define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
-#  define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
+#define COMMON_MALLOC_SIZE(ptr) \
+  uptr size = lsan_mz_size(ptr)
+#define COMMON_MALLOC_FILL_STATS(zone, stats)
+#define COMMON_MALLOC_REPORT_UNKNOWN_REALLOC(ptr, zone_ptr, zone_name) \
+  (void)zone_name; \
+  Report("mz_realloc(%p) -- attempting to realloc unallocated memory.\n", ptr);
+#define COMMON_MALLOC_NAMESPACE __lsan
+#define COMMON_MALLOC_HAS_ZONE_ENUMERATOR 0
+#define COMMON_MALLOC_HAS_EXTRA_INTROSPECTION_INIT 0
 
-#  include "sanitizer_common/sanitizer_malloc_mac.inc"
+#include "sanitizer_common/sanitizer_malloc_mac.inc"
 
 #endif // SANITIZER_APPLE
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
index 72ad22999b5a4..6343eb284afbf 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_malloc_mac.inc
@@ -144,21 +144,6 @@ INTERCEPTOR(void, free, void *ptr) {
   COMMON_MALLOC_FREE(ptr);
 }
 
-#ifdef COMMON_MALLOC_FREE_SIZED
-INTERCEPTOR(void, free_sized, void *ptr, size_t size) {
-  COMMON_MALLOC_ENTER();
-  COMMON_MALLOC_FREE_SIZED(ptr, size);
-}
-#endif
-
-#ifdef COMMON_MALLOC_FREE_ALIGNED_SIZED
-INTERCEPTOR(void, free_aligned_sized, void *ptr, size_t alignment,
-            size_t size) {
-  COMMON_MALLOC_ENTER();
-  COMMON_MALLOC_FREE_ALIGNED_SIZED(ptr, alignment, size);
-}
-#endif
-
 INTERCEPTOR(void *, realloc, void *ptr, size_t size) {
   COMMON_MALLOC_ENTER();
   COMMON_MALLOC_REALLOC(ptr, size);

From 391dafd8af9c0309f2ca75621dae1dbae307b428 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Tue, 17 Jun 2025 11:28:43 -0700
Subject: [PATCH 788/851] [RISCV] Consolidate both copies of getLMUL1VT [nfc]
 (#144568)

Put one copy on RISCVTargetLowering as a static function so that both
locations can use it, and rename the method to getM1VT for slightly
improved readability.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 64 +++++++++----------
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |  9 +++
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 11 +---
 3 files changed, 39 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 779786fa400fc..33aae7ab16cca 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3498,14 +3498,6 @@ getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL,
   return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VT, Ops);
 }
 
-static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= RISCV::RVVBitsPerBlock &&
-         "Unexpected vector MVT");
-  return MVT::getScalableVectorVT(
-      VT.getVectorElementType(),
-      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
-}
-
 struct VIDSequence {
   int64_t StepNumerator;
   unsigned StepDenominator;
@@ -4316,7 +4308,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
     MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
     MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
-    assert(M1VT == getLMUL1VT(M1VT));
+    assert(M1VT == RISCVTargetLowering::getM1VT(M1VT));
 
     // The following semantically builds up a fixed length concat_vector
     // of the component build_vectors.  We eagerly lower to scalable and
@@ -4356,7 +4348,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       count_if(Op->op_values(), [](const SDValue &V) { return V.isUndef(); });
   unsigned NumDefElts = NumElts - NumUndefElts;
   if (NumDefElts >= 8 && NumDefElts > NumElts / 2 &&
-      ContainerVT.bitsLE(getLMUL1VT(ContainerVT))) {
+      ContainerVT.bitsLE(RISCVTargetLowering::getM1VT(ContainerVT))) {
     SmallVector<SDValue> SubVecAOps, SubVecBOps;
     SmallVector<SDValue> MaskVals;
     SDValue UndefElem = DAG.getUNDEF(Op->getOperand(0)->getValueType(0));
@@ -5114,7 +5106,8 @@ static SDValue lowerVZIP(unsigned Opc, SDValue Op0, SDValue Op1,
 
   MVT InnerVT = ContainerVT;
   auto [Mask, VL] = getDefaultVLOps(IntVT, InnerVT, DL, DAG, Subtarget);
-  if (Op1.isUndef() && ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
+  if (Op1.isUndef() &&
+      ContainerVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVT)) &&
       (RISCVISD::RI_VUNZIP2A_VL == Opc || RISCVISD::RI_VUNZIP2B_VL == Opc)) {
     InnerVT = ContainerVT.getHalfNumVectorElementsVT();
     VL = DAG.getConstant(VT.getVectorNumElements() / 2, DL,
@@ -5382,7 +5375,7 @@ static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
   EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
   MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
   MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
-  assert(M1VT == getLMUL1VT(M1VT));
+  assert(M1VT == RISCVTargetLowering::getM1VT(M1VT));
   unsigned NumOpElts = M1VT.getVectorMinNumElements();
   unsigned NumElts = ContainerVT.getVectorMinNumElements();
   unsigned NumOfSrcRegs = NumElts / NumOpElts;
@@ -6152,7 +6145,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
       return convertFromScalableVector(VT, Gather, DAG, Subtarget);
     }
 
-    const MVT M1VT = getLMUL1VT(ContainerVT);
+    const MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
     EVT SubIndexVT = M1VT.changeVectorElementType(IndexVT.getScalarType());
     auto [InnerTrueMask, InnerVL] =
         getDefaultScalableVLOps(M1VT, DL, DAG, Subtarget);
@@ -7801,7 +7794,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     // This reduces the length of the chain of vslideups and allows us to
     // perform the vslideups at a smaller LMUL, limited to MF2.
     if (Op.getNumOperands() > 2 &&
-        ContainerVT.bitsGE(getLMUL1VT(ContainerVT))) {
+        ContainerVT.bitsGE(RISCVTargetLowering::getM1VT(ContainerVT))) {
       MVT HalfVT = VT.getHalfNumVectorElementsVT();
       assert(isPowerOf2_32(Op.getNumOperands()));
       size_t HalfNumOps = Op.getNumOperands() / 2;
@@ -9821,11 +9814,12 @@ getSmallestVTForIndex(MVT VecVT, unsigned MaxIdx, SDLoc DL, SelectionDAG &DAG,
   const unsigned MinVLMAX = VectorBitsMin / EltSize;
   MVT SmallerVT;
   if (MaxIdx < MinVLMAX)
-    SmallerVT = getLMUL1VT(VecVT);
+    SmallerVT = RISCVTargetLowering::getM1VT(VecVT);
   else if (MaxIdx < MinVLMAX * 2)
-    SmallerVT = getLMUL1VT(VecVT).getDoubleNumVectorElementsVT();
+    SmallerVT =
+        RISCVTargetLowering::getM1VT(VecVT).getDoubleNumVectorElementsVT();
   else if (MaxIdx < MinVLMAX * 4)
-    SmallerVT = getLMUL1VT(VecVT)
+    SmallerVT = RISCVTargetLowering::getM1VT(VecVT)
                     .getDoubleNumVectorElementsVT()
                     .getDoubleNumVectorElementsVT();
   if (!SmallerVT.isValid() || !VecVT.bitsGT(SmallerVT))
@@ -9898,9 +9892,8 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
     // If we're compiling for an exact VLEN value, we can always perform
     // the insert in m1 as we can determine the register corresponding to
     // the index in the register group.
-    const MVT M1VT = getLMUL1VT(ContainerVT);
-    if (auto VLEN = Subtarget.getRealVLen();
-        VLEN && ContainerVT.bitsGT(M1VT)) {
+    const MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
+    if (auto VLEN = Subtarget.getRealVLen(); VLEN && ContainerVT.bitsGT(M1VT)) {
       EVT ElemVT = VecVT.getVectorElementType();
       unsigned ElemsPerVReg = *VLEN / ElemVT.getFixedSizeInBits();
       unsigned RemIdx = OrigIdx % ElemsPerVReg;
@@ -10127,7 +10120,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   const auto VLen = Subtarget.getRealVLen();
   if (auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
       IdxC && VLen && VecVT.getSizeInBits().getKnownMinValue() > *VLen) {
-    MVT M1VT = getLMUL1VT(ContainerVT);
+    MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
     unsigned OrigIdx = IdxC->getZExtValue();
     EVT ElemVT = VecVT.getVectorElementType();
     unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
@@ -10175,7 +10168,8 @@ SDValue RISCVTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   // TODO: We don't have the same code for insert_vector_elt because we
   // have BUILD_VECTOR and handle the degenerate case there.  Should we
   // consider adding an inverse BUILD_VECTOR node?
-  MVT LMUL2VT = getLMUL1VT(ContainerVT).getDoubleNumVectorElementsVT();
+  MVT LMUL2VT =
+      RISCVTargetLowering::getM1VT(ContainerVT).getDoubleNumVectorElementsVT();
   if (ContainerVT.bitsGT(LMUL2VT) && VecVT.isFixedLengthVector())
     return SDValue();
 
@@ -11107,7 +11101,7 @@ static SDValue lowerReductionSeq(unsigned RVVOpcode, MVT ResVT,
                                  SDValue VL, const SDLoc &DL, SelectionDAG &DAG,
                                  const RISCVSubtarget &Subtarget) {
   const MVT VecVT = Vec.getSimpleValueType();
-  const MVT M1VT = getLMUL1VT(VecVT);
+  const MVT M1VT = RISCVTargetLowering::getM1VT(VecVT);
   const MVT XLenVT = Subtarget.getXLenVT();
   const bool NonZeroAVL = isNonZeroAVL(VL);
 
@@ -11485,8 +11479,8 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     assert(VLen);
     AlignedIdx /= *VLen / RISCV::RVVBitsPerBlock;
   }
-  if (ContainerVecVT.bitsGT(getLMUL1VT(ContainerVecVT))) {
-    InterSubVT = getLMUL1VT(ContainerVecVT);
+  if (ContainerVecVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVecVT))) {
+    InterSubVT = RISCVTargetLowering::getM1VT(ContainerVecVT);
     // Extract a subvector equal to the nearest full vector register type. This
     // should resolve to a EXTRACT_SUBREG instruction.
     AlignedExtract = DAG.getExtractSubvector(DL, InterSubVT, Vec, AlignedIdx);
@@ -11677,7 +11671,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
   // If the vector type is an LMUL-group type, extract a subvector equal to the
   // nearest full vector register type.
   MVT InterSubVT = VecVT;
-  if (VecVT.bitsGT(getLMUL1VT(VecVT))) {
+  if (VecVT.bitsGT(RISCVTargetLowering::getM1VT(VecVT))) {
     // If VecVT has an LMUL > 1, then SubVecVT should have a smaller LMUL, and
     // we should have successfully decomposed the extract into a subregister.
     // We use an extract_subvector that will resolve to a subreg extract.
@@ -11688,7 +11682,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
       assert(VLen);
       Idx /= *VLen / RISCV::RVVBitsPerBlock;
     }
-    InterSubVT = getLMUL1VT(VecVT);
+    InterSubVT = RISCVTargetLowering::getM1VT(VecVT);
     Vec = DAG.getExtractSubvector(DL, InterSubVT, Vec, Idx);
   }
 
@@ -11805,7 +11799,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
     // For fractional LMUL, check if we can use a higher LMUL
     // instruction to avoid a vslidedown.
     if (SDValue Src = foldConcatVector(V1, V2);
-        Src && getLMUL1VT(VT).bitsGT(VT)) {
+        Src && RISCVTargetLowering::getM1VT(VT).bitsGT(VT)) {
       EVT NewVT = VT.getDoubleNumVectorElementsVT();
       Src = DAG.getExtractSubvector(DL, NewVT, Src, 0);
       // Freeze the source so we can increase its use count.
@@ -12187,7 +12181,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
   // vrgather.vv v14, v9, v16
   // vrgather.vv v13, v10, v16
   // vrgather.vv v12, v11, v16
-  if (ContainerVT.bitsGT(getLMUL1VT(ContainerVT)) &&
+  if (ContainerVT.bitsGT(RISCVTargetLowering::getM1VT(ContainerVT)) &&
       ContainerVT.getVectorElementCount().isKnownMultipleOf(2)) {
     auto [Lo, Hi] = DAG.SplitVector(Vec, DL);
     Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, Lo.getSimpleValueType(), Lo);
@@ -12252,7 +12246,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_REVERSE(SDValue Op,
   // At LMUL > 1, do the index computation in 16 bits to reduce register
   // pressure.
   if (IntVT.getScalarType().bitsGT(MVT::i16) &&
-      IntVT.bitsGT(getLMUL1VT(IntVT))) {
+      IntVT.bitsGT(RISCVTargetLowering::getM1VT(IntVT))) {
     assert(isUInt<16>(MaxVLMAX - 1)); // Largest VLMAX is 65536 @ zvl65536b
     GatherOpc = RISCVISD::VRGATHEREI16_VV_VL;
     IntVT = IntVT.changeVectorElementType(MVT::i16);
@@ -12339,7 +12333,7 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op,
   const auto [MinVLMAX, MaxVLMAX] =
       RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget);
   if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() &&
-      getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) {
+      RISCVTargetLowering::getM1VT(ContainerVT).bitsLE(ContainerVT)) {
     MachineMemOperand *MMO = Load->getMemOperand();
     SDValue NewLoad =
         DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(),
@@ -12400,7 +12394,7 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
   const auto [MinVLMAX, MaxVLMAX] =
       RISCVTargetLowering::computeVLMAXBounds(ContainerVT, Subtarget);
   if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() &&
-      getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) {
+      RISCVTargetLowering::getM1VT(ContainerVT).bitsLE(ContainerVT)) {
     MachineMemOperand *MMO = Store->getMemOperand();
     return DAG.getStore(Store->getChain(), DL, NewValue, Store->getBasePtr(),
                         MMO->getPointerInfo(), MMO->getBaseAlign(),
@@ -20368,7 +20362,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return Scalar.getOperand(0);
 
     // Use M1 or smaller to avoid over constraining register allocation
-    const MVT M1VT = getLMUL1VT(VT);
+    const MVT M1VT = RISCVTargetLowering::getM1VT(VT);
     if (M1VT.bitsLT(VT)) {
       SDValue M1Passthru = DAG.getExtractSubvector(DL, M1VT, Passthru, 0);
       SDValue Result =
@@ -20382,7 +20376,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     // no purpose.
     if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Scalar);
         Const && !Const->isZero() && isInt<5>(Const->getSExtValue()) &&
-        VT.bitsLE(getLMUL1VT(VT)) && Passthru.isUndef())
+        VT.bitsLE(RISCVTargetLowering::getM1VT(VT)) && Passthru.isUndef())
       return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Passthru, Scalar, VL);
 
     break;
@@ -20390,7 +20384,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
   case RISCVISD::VMV_X_S: {
     SDValue Vec = N->getOperand(0);
     MVT VecVT = N->getOperand(0).getSimpleValueType();
-    const MVT M1VT = getLMUL1VT(VecVT);
+    const MVT M1VT = RISCVTargetLowering::getM1VT(VecVT);
     if (M1VT.bitsLT(VecVT)) {
       Vec = DAG.getExtractSubvector(DL, M1VT, Vec, 0);
       return DAG.getNode(RISCVISD::VMV_X_S, DL, N->getSimpleValueType(0), Vec);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 417d684a62382..f67d7f155c9d0 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -363,6 +363,15 @@ class RISCVTargetLowering : public TargetLowering {
   static std::pair<unsigned, unsigned>
   computeVLMAXBounds(MVT ContainerVT, const RISCVSubtarget &Subtarget);
 
+  /// Given a vector (either fixed or scalable), return the scalable vector
+  /// corresponding to a vector register (i.e. an m1 register group).
+  static MVT getM1VT(MVT VT) {
+    unsigned EltSizeInBits = VT.getVectorElementType().getSizeInBits();
+    assert(EltSizeInBits <= RISCV::RVVBitsPerBlock && "Unexpected vector MVT");
+    return MVT::getScalableVectorVT(VT.getVectorElementType(),
+                                    RISCV::RVVBitsPerBlock / EltSizeInBits);
+  }
+
   static unsigned getRegClassIDForLMUL(RISCVVType::VLMUL LMul);
   static unsigned getSubregIndexByMVT(MVT VT, unsigned Index);
   static unsigned getRegClassIDForVecVT(MVT VT);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 0093c92ea5ef0..aadda2ce85529 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -602,15 +602,6 @@ InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
   return FirstSlideCost + SecondSlideCost + MaskCost;
 }
 
-// Consolidate!
-static MVT getLMUL1VT(MVT VT) {
-  assert(VT.getVectorElementType().getSizeInBits() <= RISCV::RVVBitsPerBlock &&
-         "Unexpected vector MVT");
-  return MVT::getScalableVectorVT(
-      VT.getVectorElementType(),
-      RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
-}
-
 InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                              VectorType *Tp, ArrayRef<int> Mask,
                                              TTI::TargetCostKind CostKind,
@@ -870,7 +861,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     MVT ContainerVT = LT.second;
     if (LT.second.isFixedLengthVector())
       ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
-    MVT M1VT = getLMUL1VT(ContainerVT);
+    MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
     if (ContainerVT.bitsLE(M1VT)) {
       // Example sequence:
       //   csrr a0, vlenb

From 1f10c6a277fbc1b1c6ceb7546b001af39feb92ce Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs@apple.com>
Date: Tue, 17 Jun 2025 10:46:08 -0700
Subject: [PATCH 789/851] [Matrix] Hoist more IRBuilder<>'s. NFC

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index ece0bb56fff01..96b156494fd91 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1211,13 +1211,13 @@ class LowerMatrixIntrinsics {
 
     switch (Inst->getCalledFunction()->getIntrinsicID()) {
     case Intrinsic::matrix_multiply:
-      return LowerMultiply(Inst);
+      return LowerMultiply(Inst, Builder);
     case Intrinsic::matrix_transpose:
-      return LowerTranspose(Inst);
+      return LowerTranspose(Inst, Builder);
     case Intrinsic::matrix_column_major_load:
-      return LowerColumnMajorLoad(Inst);
+      return LowerColumnMajorLoad(Inst, Builder);
     case Intrinsic::matrix_column_major_store:
-      return LowerColumnMajorStore(Inst);
+      return LowerColumnMajorStore(Inst, Builder);
     case Intrinsic::abs:
     case Intrinsic::fabs: {
       MatrixTy Result;
@@ -1312,8 +1312,8 @@ class LowerMatrixIntrinsics {
 
   /// Lower a load instruction with shape information.
   MatrixTy LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align,
-                     Value *Stride, bool IsVolatile, ShapeInfo Shape) {
-    IRBuilder<> Builder(Inst);
+                     Value *Stride, bool IsVolatile, ShapeInfo Shape,
+                     IRBuilder<> &Builder) {
     return loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile, Shape,
                       Builder);
   }
@@ -1321,14 +1321,14 @@ class LowerMatrixIntrinsics {
   /// Lowers llvm.matrix.column.major.load.
   ///
   /// The intrinsic loads a matrix from memory using a stride between columns.
-  MatrixTy LowerColumnMajorLoad(CallInst *Inst) {
+  MatrixTy LowerColumnMajorLoad(CallInst *Inst, IRBuilder<> &Builder) {
     assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
            "Intrinsic only supports column-major layout!");
     Value *Ptr = Inst->getArgOperand(0);
     Value *Stride = Inst->getArgOperand(1);
     return LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
                      cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
-                     {Inst->getArgOperand(3), Inst->getArgOperand(4)});
+                     {Inst->getArgOperand(3), Inst->getArgOperand(4)}, Builder);
   }
 
   /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
@@ -1373,8 +1373,7 @@ class LowerMatrixIntrinsics {
   /// Lower a store instruction with shape information.
   MatrixTy LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr,
                       MaybeAlign A, Value *Stride, bool IsVolatile,
-                      ShapeInfo Shape) {
-    IRBuilder<> Builder(Inst);
+                      ShapeInfo Shape, IRBuilder<> &Builder) {
     auto StoreVal = getMatrix(Matrix, Shape, Builder);
     return storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride, IsVolatile,
                        Builder);
@@ -1383,7 +1382,7 @@ class LowerMatrixIntrinsics {
   /// Lowers llvm.matrix.column.major.store.
   ///
   /// The intrinsic store a matrix back memory using a stride between columns.
-  MatrixTy LowerColumnMajorStore(CallInst *Inst) {
+  MatrixTy LowerColumnMajorStore(CallInst *Inst, IRBuilder<> &Builder) {
     assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
            "Intrinsic only supports column-major layout!");
     Value *Matrix = Inst->getArgOperand(0);
@@ -1391,7 +1390,8 @@ class LowerMatrixIntrinsics {
     Value *Stride = Inst->getArgOperand(2);
     return LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
                       cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
-                      {Inst->getArgOperand(4), Inst->getArgOperand(5)});
+                      {Inst->getArgOperand(4), Inst->getArgOperand(5)},
+                      Builder);
   }
 
   // Set elements I..I+NumElts-1 to Block
@@ -2166,8 +2166,7 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lowers llvm.matrix.multiply.
-  MatrixTy LowerMultiply(CallInst *MatMul) {
-    IRBuilder<> Builder(MatMul);
+  MatrixTy LowerMultiply(CallInst *MatMul, IRBuilder<> &Builder) {
     auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();
     ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
     ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
@@ -2192,9 +2191,8 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lowers llvm.matrix.transpose.
-  MatrixTy LowerTranspose(CallInst *Inst) {
+  MatrixTy LowerTranspose(CallInst *Inst, IRBuilder<> &Builder) {
     MatrixTy Result;
-    IRBuilder<> Builder(Inst);
     Value *InputVal = Inst->getArgOperand(0);
     FixedVectorType *VectorTy = cast<FixedVectorType>(InputVal->getType());
     ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
@@ -2230,13 +2228,15 @@ class LowerMatrixIntrinsics {
   MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
                      IRBuilder<> &Builder) {
     return LowerLoad(Inst, Ptr, Inst->getAlign(),
-                     Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
+                     Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
+                     Builder);
   }
 
   MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
                       Value *Ptr, IRBuilder<> &Builder) {
     return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
-                      Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
+                      Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
+                      Builder);
   }
 
   /// Lower binary operators.

From b59d4cf05447fdaf3d3c859e10db0b3c892f6ec6 Mon Sep 17 00:00:00 2001
From: Joshua Batista <jbatista@microsoft.com>
Date: Tue, 17 Jun 2025 11:46:12 -0700
Subject: [PATCH 790/851]  [Reland] Adjust bit cast instruction filter for DXIL
 Prepare pass (#143783)

Relands https://github.com/llvm/llvm-project/pull/142678, with a new
change to remove an unnecessary gep argument, after a revert was needed
due to unforeseen bugs.
Fixes https://github.com/llvm/llvm-project/issues/139013
---
 llvm/lib/Target/DirectX/DXILPrepare.cpp       | 44 ++++++++++++-
 .../DirectX/llc-vector-load-scalarize.ll      | 64 ++++++++-----------
 .../DirectX/noop_bitcast_global_array_type.ll | 53 +++++++++++++++
 3 files changed, 121 insertions(+), 40 deletions(-)
 create mode 100644 llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll

diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp
index e0068787f5e5a..cb58f4833631d 100644
--- a/llvm/lib/Target/DirectX/DXILPrepare.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp
@@ -148,9 +148,49 @@ class DXILPrepareModule : public ModulePass {
                                      Type *Ty) {
     // Omit bitcasts if the incoming value matches the instruction type.
     auto It = PointerTypes.find(Operand);
-    if (It != PointerTypes.end())
-      if (cast<TypedPointerType>(It->second)->getElementType() == Ty)
+    if (It != PointerTypes.end()) {
+      auto *OpTy = cast<TypedPointerType>(It->second)->getElementType();
+      if (OpTy == Ty)
         return nullptr;
+    }
+
+    Type *ValTy = Operand->getType();
+    // Also omit the bitcast for matching global array types
+    if (auto *GlobalVar = dyn_cast<GlobalVariable>(Operand))
+      ValTy = GlobalVar->getValueType();
+
+    if (auto *AI = dyn_cast<AllocaInst>(Operand))
+      ValTy = AI->getAllocatedType();
+
+    if (auto *ArrTy = dyn_cast<ArrayType>(ValTy)) {
+      Type *ElTy = ArrTy->getElementType();
+      if (ElTy == Ty)
+        return nullptr;
+    }
+
+    // finally, drill down GEP instructions until we get the array
+    // that is being accessed, and compare element types
+    if (ConstantExpr *GEPInstr = dyn_cast<ConstantExpr>(Operand)) {
+      while (GEPInstr->getOpcode() == Instruction::GetElementPtr) {
+        Value *OpArg = GEPInstr->getOperand(0);
+        if (ConstantExpr *NewGEPInstr = dyn_cast<ConstantExpr>(OpArg)) {
+          GEPInstr = NewGEPInstr;
+          continue;
+        }
+
+        if (auto *GlobalVar = dyn_cast<GlobalVariable>(OpArg))
+          ValTy = GlobalVar->getValueType();
+        if (auto *AI = dyn_cast<AllocaInst>(Operand))
+          ValTy = AI->getAllocatedType();
+        if (auto *ArrTy = dyn_cast<ArrayType>(ValTy)) {
+          Type *ElTy = ArrTy->getElementType();
+          if (ElTy == Ty)
+            return nullptr;
+        }
+        break;
+      }
+    }
+
     // Insert bitcasts where we are removing the instruction.
     Builder.SetInsertPoint(&Inst);
     // This code only gets hit in opaque-pointer mode, so the type of the
diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
index 778113bd3160f..d5797f6b51348 100644
--- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
+++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll
@@ -60,19 +60,15 @@ define <4 x i32> @load_array_vec_test() #0 {
 define <4 x i32> @load_vec_test() #0 {
 ; CHECK-LABEL: define <4 x i32> @load_vec_test(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast ptr addrspace(3) @vecData.scalarized to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[TMP4]], i32 1
-; CHECK-NEXT:    [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[DOTUPTO2]], i32 [[TMP8]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) @vecData.scalarized, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 1), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 2), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @vecData.scalarized, i32 3), align 4
+; CHECK-NEXT:    [[DOTUPTO0:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[DOTUPTO1:%.*]] = insertelement <4 x i32> [[DOTUPTO0]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[DOTUPTO2:%.*]] = insertelement <4 x i32> [[DOTUPTO1]], i32 [[TMP3]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[DOTUPTO2]], i32 [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP5]]
 ;
   %1 = load <4 x i32>, <4 x i32> addrspace(3)* @"vecData", align 4
   ret <4 x i32> %1
@@ -103,31 +99,23 @@ define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 {
 define <4 x i32> @multid_load_test() #0 {
 ; CHECK-LABEL: define <4 x i32> @multid_load_test(
 ; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(3) [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(3) [[TMP7]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(3) [[TMP9]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 1) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(3) [[TMP11]], align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 2) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(3) [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 3) to ptr addrspace(3)
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(3) [[TMP15]], align 4
-; CHECK-NEXT:    [[DOTI05:%.*]] = add i32 [[TMP2]], [[TMP10]]
-; CHECK-NEXT:    [[DOTI16:%.*]] = add i32 [[TMP4]], [[TMP12]]
-; CHECK-NEXT:    [[DOTI27:%.*]] = add i32 [[TMP6]], [[TMP14]]
-; CHECK-NEXT:    [[DOTI38:%.*]] = add i32 [[TMP8]], [[TMP16]]
-; CHECK-NEXT:    [[DOTUPTO01215:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI05]], i32 0
-; CHECK-NEXT:    [[DOTUPTO11316:%.*]] = insertelement <4 x i32> [[DOTUPTO01215]], i32 [[DOTI16]], i32 1
-; CHECK-NEXT:    [[DOTUPTO21417:%.*]] = insertelement <4 x i32> [[DOTUPTO11316]], i32 [[DOTI27]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> [[DOTUPTO21417]], i32 [[DOTI38]], i32 3
-; CHECK-NEXT:    ret <4 x i32> [[TMP17]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 1), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 2), align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 3), align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), align 4
+; CHECK-NEXT:    [[DOTI13:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 1), align 4
+; CHECK-NEXT:    [[DOTI25:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 2), align 4
+; CHECK-NEXT:    [[DOTI37:%.*]] = load i32, ptr addrspace(3) getelementptr (i32, ptr addrspace(3) getelementptr inbounds ([36 x i32], ptr addrspace(3) @groushared2dArrayofVectors.scalarized.1dim, i32 0, i32 4), i32 3), align 4
+; CHECK-NEXT:    [[DOTI08:%.*]] = add i32 [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[DOTI19:%.*]] = add i32 [[TMP2]], [[DOTI13]]
+; CHECK-NEXT:    [[DOTI210:%.*]] = add i32 [[TMP3]], [[DOTI25]]
+; CHECK-NEXT:    [[DOTI311:%.*]] = add i32 [[TMP4]], [[DOTI37]]
+; CHECK-NEXT:    [[DOTUPTO015:%.*]] = insertelement <4 x i32> poison, i32 [[DOTI08]], i32 0
+; CHECK-NEXT:    [[DOTUPTO116:%.*]] = insertelement <4 x i32> [[DOTUPTO015]], i32 [[DOTI19]], i32 1
+; CHECK-NEXT:    [[DOTUPTO217:%.*]] = insertelement <4 x i32> [[DOTUPTO116]], i32 [[DOTI210]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[DOTUPTO217]], i32 [[DOTI311]], i32 3
+; CHECK-NEXT:    ret <4 x i32> [[TMP6]]
 ;
   %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 0, i32 0), align 4
   %2 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 1, i32 1), align 4
diff --git a/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll b/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll
new file mode 100644
index 0000000000000..1f33700e014c7
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/noop_bitcast_global_array_type.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S --dxil-prepare %s | FileCheck %s
+
+; Test that global arrays do not get a bitcast instruction
+; after the dxil-prepare pass.
+
+target triple = "dxilv1.2-unknown-shadermodel6.2-compute"
+
+@inputTile.1dim = local_unnamed_addr addrspace(3) global [3 x float] zeroinitializer, align 2
+
+; CHECK-LABEL: testload
+define float @testload() local_unnamed_addr {
+  ; NOTE: this would be "bitcast ptr addrspace(3)..." before the change that introduced this test,
+  ; after the dxil-prepare pass is run
+  ; CHECK-NEXT: load float, ptr addrspace(3) @inputTile.1dim, align 2
+  %v = load float, ptr addrspace(3) @inputTile.1dim, align 2  
+  
+  ret float %v
+}
+
+; CHECK-LABEL: teststore
+define void @teststore() local_unnamed_addr {  
+  ; CHECK-next: store float 2.000000e+00, ptr addrspace(3) @inputTile.1dim, align 2
+  store float 2.000000e+00, ptr addrspace(3) @inputTile.1dim, align 2  
+  
+  ret void
+}
+
+; CHECK-LABEL: testGEPConst
+define float @testGEPConst() local_unnamed_addr {  
+  ; CHECK-NEXT: load float, ptr addrspace(3) getelementptr (float, ptr addrspace(3) @inputTile.1dim, i32 1), align 4
+  %v = load float, ptr addrspace(3) getelementptr (float, ptr addrspace(3) @inputTile.1dim, i32 1), align 4
+  
+  ret float %v
+}
+
+; CHECK-LABEL: testGEPNonConst
+define float @testGEPNonConst(i32 %i) local_unnamed_addr {  
+  ; CHECK-NEXT: getelementptr float, ptr addrspace(3) @inputTile.1dim, i32 %i
+  %gep = getelementptr float, ptr addrspace(3) @inputTile.1dim, i32 %i
+  %v = load float, ptr addrspace(3) %gep
+  
+  ret float %v
+}
+
+; CHECK-LABEL: testAlloca
+define float @testAlloca(i32 %i) local_unnamed_addr {  
+  ; CHECK-NEXT: alloca [3 x float], align 4
+  %arr = alloca [3 x float], align 4
+  ; CHECK-NEXT: getelementptr [3 x float], ptr %arr, i32 1
+  %gep = getelementptr [3 x float], ptr %arr, i32 1
+  %v = load float, ptr %gep
+  ret float %v
+}

From dd65e6e0608c3390752750a0f19bca4409603db9 Mon Sep 17 00:00:00 2001
From: Jan Patrick Lehr <JanPatrick.Lehr@amd.com>
Date: Tue, 17 Jun 2025 20:51:40 +0200
Subject: [PATCH 791/851] [Offload][libc] Add cmake cache AMDGPU buildbot
 (#144500)

An upcoming libc4GPU buildbot will be using this CMake cache file for
its build configuration.
---
 offload/cmake/caches/AMDGPULibcBot.cmake | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 offload/cmake/caches/AMDGPULibcBot.cmake

diff --git a/offload/cmake/caches/AMDGPULibcBot.cmake b/offload/cmake/caches/AMDGPULibcBot.cmake
new file mode 100644
index 0000000000000..728dfe3f0a3f1
--- /dev/null
+++ b/offload/cmake/caches/AMDGPULibcBot.cmake
@@ -0,0 +1,20 @@
+set(CMAKE_INSTALL_PREFIX /tmp/llvm.install.test CACHE STRING "")
+
+set(CMAKE_BUILD_TYPE Release CACHE STRING "")
+set(BUILD_SHARED_LIBS ON CACHE BOOL "")
+set(CMAKE_C_COMPILER_LAUNCHER ccache CACHE STRING "")
+set(CMAKE_CXX_COMPILER_LAUNCHER ccache CACHE STRING "")
+
+set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "")
+set(LLVM_ENABLE_RUNTIMES "compiler-rt;libunwind;openmp;offload" CACHE STRING "")
+set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
+
+set(LLVM_TARGETS_TO_BUILD "host;AMDGPU;SPIRV" CACHE STRING "")
+set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
+set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
+
+set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
+set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
+
+set(LLVM_RUNTIME_TARGETS default;amdgcn-amd-amdhsa CACHE STRING "")
+set(RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES "compiler-rt;libc" CACHE STRING "")

From 9cb754509608b9d9143fa17f775631bbfcce0848 Mon Sep 17 00:00:00 2001
From: Garvit Gupta <quic_garvgupt@quicinc.com>
Date: Wed, 18 Jun 2025 00:28:23 +0530
Subject: [PATCH 792/851] [Driver] Add option to force undefined symbols during
 linking in BareMetal toolchain object. (#132807)

Add support for `-u` option to force defined symbols. This option is
supported by both lld and gnuld.

This is done as a part of the effort to merge RISCVToolchain object into
BareMetal toolchain object.

This is the 4th patch in the series of patches for merging
RISCVToolchain object into BareMetal toolchain object.

RFC:
https://discourse.llvm.org/t/merging-riscvtoolchain-and-baremetal-toolchains/75524
---
 clang/lib/Driver/ToolChains/BareMetal.cpp       |  5 +++--
 clang/test/Driver/baremetal-undefined-symbols.c | 14 ++++++++++++++
 clang/test/Driver/riscv-args.c                  |  6 ------
 3 files changed, 17 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/Driver/baremetal-undefined-symbols.c
 delete mode 100644 clang/test/Driver/riscv-args.c

diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index a665040662a3f..d4e4e6d04b417 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -609,8 +609,9 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
-  Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
-                            options::OPT_s, options::OPT_t, options::OPT_r});
+  Args.addAllArgs(CmdArgs,
+                  {options::OPT_L, options::OPT_u, options::OPT_T_Group,
+                   options::OPT_s, options::OPT_t, options::OPT_r});
 
   TC.AddFilePathLibArgs(Args, CmdArgs);
 
diff --git a/clang/test/Driver/baremetal-undefined-symbols.c b/clang/test/Driver/baremetal-undefined-symbols.c
new file mode 100644
index 0000000000000..bff58c7c54c33
--- /dev/null
+++ b/clang/test/Driver/baremetal-undefined-symbols.c
@@ -0,0 +1,14 @@
+// Check the arguments are correctly passed
+
+// Make sure -T is the last with gcc-toolchain option
+// RUN: %clang -### --target=riscv32 --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-LD %s
+// CHECK-LD: {{.*}} "--defsym=FOO=10" {{.*}} "-u" "foo" {{.*}} "-T" "a.lds"
+
+// TODO: Merge this test with the above in the last patch when finally integrating riscv
+// Make sure -T is the last with gcc-toolchain option
+// RUN: %clang -### --target=aarch64-none-elf --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-ARM-LD %s
+// RUN: %clang -### --target=armv6m-none-eabi --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-ARM-LD %s
+// CHECK-ARM-LD: {{.*}} "-T" "a.lds" "-u" "foo" {{.*}} "--defsym=FOO=10"
diff --git a/clang/test/Driver/riscv-args.c b/clang/test/Driver/riscv-args.c
deleted file mode 100644
index cab08e5b0f811..0000000000000
--- a/clang/test/Driver/riscv-args.c
+++ /dev/null
@@ -1,6 +0,0 @@
-// Check the arguments are correctly passed
-
-// Make sure -T is the last with gcc-toolchain option
-// RUN: %clang -### --target=riscv32 --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-LD %s
-// CHECK-LD: {{.*}} "--defsym=FOO=10" {{.*}} "-u" "foo" {{.*}} "-T" "a.lds"

From 57828fec760f086b334ce0cb1c465fc559dcaea4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder@redhat.com>
Date: Tue, 17 Jun 2025 21:08:23 +0200
Subject: [PATCH 793/851] Revert "[clang][bytecode] Allocate IntegralAP and
 Floating types using an allocator (#144246)"

This reverts commit c66be289901b3f035187d391e80e3610d7d6232e.

This breaks the armv8-quick builder:
https://lab.llvm.org/buildbot/#/builders/154/builds/17549
---
 clang/lib/AST/ByteCode/Compiler.cpp           | 112 +++---
 clang/lib/AST/ByteCode/Compiler.h             |   1 -
 clang/lib/AST/ByteCode/Descriptor.cpp         |   2 +-
 clang/lib/AST/ByteCode/Disasm.cpp             |  60 +---
 clang/lib/AST/ByteCode/Floating.h             | 252 +++++--------
 clang/lib/AST/ByteCode/Integral.h             |   3 -
 clang/lib/AST/ByteCode/IntegralAP.h           | 231 +++++-------
 clang/lib/AST/ByteCode/Interp.cpp             | 106 +-----
 clang/lib/AST/ByteCode/Interp.h               | 337 ++++--------------
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  55 +--
 .../lib/AST/ByteCode/InterpBuiltinBitCast.cpp |   4 +-
 clang/lib/AST/ByteCode/InterpState.h          |  30 --
 clang/lib/AST/ByteCode/Opcodes.td             |  14 +-
 clang/lib/AST/ByteCode/PrimType.h             |  17 -
 clang/lib/AST/ByteCode/Program.h              |  24 +-
 .../ByteCode/builtin-bit-cast-long-double.cpp |  10 +-
 clang/test/AST/ByteCode/builtin-functions.cpp |  12 +-
 17 files changed, 341 insertions(+), 929 deletions(-)

diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 3f884ed8d094a..9fe4803ce98ec 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -748,8 +748,7 @@ bool Compiler<Emitter>::VisitFloatingLiteral(const FloatingLiteral *E) {
   if (DiscardResult)
     return true;
 
-  APFloat F = E->getValue();
-  return this->emitFloat(F, E);
+  return this->emitConstFloat(E->getValue(), E);
 }
 
 template <class Emitter>
@@ -4186,10 +4185,8 @@ bool Compiler<Emitter>::visitZeroInitializer(PrimType T, QualType QT,
                              nullptr, E);
   case PT_MemberPtr:
     return this->emitNullMemberPtr(0, nullptr, E);
-  case PT_Float: {
-    APFloat F = APFloat::getZero(Ctx.getFloatSemantics(QT));
-    return this->emitFloat(F, E);
-  }
+  case PT_Float:
+    return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E);
   case PT_FixedPoint: {
     auto Sem = Ctx.getASTContext().getFixedPointSemantics(E->getType());
     return this->emitConstFixedPoint(FixedPoint::zero(Sem), E);
@@ -4677,7 +4674,10 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       if (!visitInitializer(Init))
         return false;
 
-      return this->emitFinishInitGlobal(Init);
+      if (!this->emitFinishInit(Init))
+        return false;
+
+      return this->emitPopPtr(Init);
     };
 
     DeclScope<Emitter> LocalScope(this, VD);
@@ -4698,45 +4698,51 @@ VarCreationState Compiler<Emitter>::visitVarDecl(const VarDecl *VD,
       return false;
 
     return !Init || (checkDecl() && initGlobal(*GlobalIndex));
-  }
-  // Local variables.
-  InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
+  } else {
+    InitLinkScope<Emitter> ILS(this, InitLink::Decl(VD));
 
-  if (VarT) {
-    unsigned Offset = this->allocateLocalPrimitive(
-        VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block,
-        IsConstexprUnknown);
-    if (Init) {
-      // If this is a toplevel declaration, create a scope for the
-      // initializer.
-      if (Toplevel) {
-        LocalScope<Emitter> Scope(this);
-        if (!this->visit(Init))
-          return false;
-        return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
-      } else {
-        if (!this->visit(Init))
-          return false;
-        return this->emitSetLocal(*VarT, Offset, VD);
+    if (VarT) {
+      unsigned Offset = this->allocateLocalPrimitive(
+          VD, *VarT, VD->getType().isConstQualified(), nullptr,
+          ScopeKind::Block, IsConstexprUnknown);
+      if (Init) {
+        // If this is a toplevel declaration, create a scope for the
+        // initializer.
+        if (Toplevel) {
+          LocalScope<Emitter> Scope(this);
+          if (!this->visit(Init))
+            return false;
+          return this->emitSetLocal(*VarT, Offset, VD) && Scope.destroyLocals();
+        } else {
+          if (!this->visit(Init))
+            return false;
+          return this->emitSetLocal(*VarT, Offset, VD);
+        }
       }
-    }
-  } else {
-    if (std::optional<unsigned> Offset = this->allocateLocal(
-            VD, VD->getType(), nullptr, ScopeKind::Block, IsConstexprUnknown)) {
-      if (!Init)
-        return true;
+    } else {
+      if (std::optional<unsigned> Offset =
+              this->allocateLocal(VD, VD->getType(), nullptr, ScopeKind::Block,
+                                  IsConstexprUnknown)) {
+        if (!Init)
+          return true;
 
-      if (!this->emitGetPtrLocal(*Offset, Init))
-        return false;
+        if (!this->emitGetPtrLocal(*Offset, Init))
+          return false;
 
-      if (!visitInitializer(Init))
-        return false;
+        if (!visitInitializer(Init))
+          return false;
+
+        if (!this->emitFinishInit(Init))
+          return false;
 
-      return this->emitFinishInitPop(Init);
+        return this->emitPopPtr(Init);
+      }
+      return false;
     }
-    return false;
+    return true;
   }
-  return true;
+
+  return false;
 }
 
 template <class Emitter>
@@ -4745,10 +4751,8 @@ bool Compiler<Emitter>::visitAPValue(const APValue &Val, PrimType ValType,
   assert(!DiscardResult);
   if (Val.isInt())
     return this->emitConst(Val.getInt(), ValType, E);
-  else if (Val.isFloat()) {
-    APFloat F = Val.getFloat();
-    return this->emitFloat(F, E);
-  }
+  else if (Val.isFloat())
+    return this->emitConstFloat(Val.getFloat(), E);
 
   if (Val.isLValue()) {
     if (Val.isNullPointer())
@@ -6129,10 +6133,8 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      APFloat F(TargetSemantics, 1);
-      if (!this->emitFloat(F, E))
+      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
         return false;
-
       if (!this->emitAddf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6174,10 +6176,8 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
       const auto &TargetSemantics = Ctx.getFloatSemantics(E->getType());
       if (!this->emitLoadFloat(E))
         return false;
-      APFloat F(TargetSemantics, 1);
-      if (!this->emitFloat(F, E))
+      if (!this->emitConstFloat(llvm::APFloat(TargetSemantics, 1), E))
         return false;
-
       if (!this->emitSubf(getFPOptions(E), E))
         return false;
       if (!this->emitStoreFloat(E))
@@ -6953,20 +6953,6 @@ bool Compiler<Emitter>::emitDummyPtr(const DeclTy &D, const Expr *E) {
   return true;
 }
 
-template <class Emitter>
-bool Compiler<Emitter>::emitFloat(const APFloat &F, const Expr *E) {
-  assert(!DiscardResult && "Should've been checked before");
-
-  if (Floating::singleWord(F.getSemantics()))
-    return this->emitConstFloat(Floating(F), E);
-
-  APInt I = F.bitcastToAPInt();
-  return this->emitConstFloat(
-      Floating(const_cast<uint64_t *>(I.getRawData()),
-               llvm::APFloatBase::SemanticsToEnum(F.getSemantics())),
-      E);
-}
-
 //  This function is constexpr if and only if To, From, and the types of
 //  all subobjects of To and From are types T such that...
 //  (3.1) - is_union_v<T> is false;
diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h
index a1d068cc7e0ae..ac3ad84766dc6 100644
--- a/clang/lib/AST/ByteCode/Compiler.h
+++ b/clang/lib/AST/ByteCode/Compiler.h
@@ -391,7 +391,6 @@ class Compiler : public ConstStmtVisitor<Compiler<Emitter>, bool>,
   bool emitRecordDestruction(const Record *R, SourceInfo Loc);
   bool emitDestruction(const Descriptor *Desc, SourceInfo Loc);
   bool emitDummyPtr(const DeclTy &D, const Expr *E);
-  bool emitFloat(const APFloat &F, const Expr *E);
   unsigned collectBaseOffset(const QualType BaseType,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp
index 46e4d0d940b3e..5531295dfa2f8 100644
--- a/clang/lib/AST/ByteCode/Descriptor.cpp
+++ b/clang/lib/AST/ByteCode/Descriptor.cpp
@@ -368,7 +368,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, MetadataSize MD,
                        bool IsTemporary, bool IsConst, UnknownSize)
     : Source(D), ElemSize(primSize(Type)), Size(UnknownSizeMark),
       MDSize(MD.value_or(0)),
-      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)), PrimT(Type),
+      AllocSize(MDSize + sizeof(InitMapPtr) + alignof(void *)),
       IsConst(IsConst), IsMutable(false), IsTemporary(IsTemporary),
       IsArray(true), CtorFn(getCtorArrayPrim(Type)),
       DtorFn(getDtorArrayPrim(Type)), MoveFn(getMoveArrayPrim(Type)) {
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 7c6b78386b14f..846dc2fe92a70 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -50,56 +50,34 @@ inline static std::string printArg(Program &P, CodePtr &OpPC) {
 }
 
 template <> inline std::string printArg<Floating>(Program &P, CodePtr &OpPC) {
-  auto Sem = Floating::deserializeSemantics(*OpPC);
-
-  unsigned BitWidth = llvm::APFloatBase::semanticsSizeInBits(
-      llvm::APFloatBase::EnumToSemantics(Sem));
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
-  Floating Result(Memory.get(), Sem);
-  Floating::deserialize(*OpPC, &Result);
-
-  OpPC += align(Result.bytesToSerialize());
+  auto F = Floating::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
-  std::string S;
-  llvm::raw_string_ostream SS(S);
-  SS << Result;
-  return S;
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
 
 template <>
 inline std::string printArg<IntegralAP<false>>(Program &P, CodePtr &OpPC) {
-  using T = IntegralAP<false>;
-  unsigned BitWidth = T::deserializeSize(*OpPC);
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
-
-  T Result(Memory.get(), BitWidth);
-  T::deserialize(*OpPC, &Result);
-
-  OpPC += Result.bytesToSerialize();
-  std::string Str;
-  llvm::raw_string_ostream SS(Str);
-  SS << Result;
-  return Str;
-}
+  auto F = IntegralAP<false>::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
+}
 template <>
 inline std::string printArg<IntegralAP<true>>(Program &P, CodePtr &OpPC) {
-  using T = IntegralAP<true>;
-  unsigned BitWidth = T::deserializeSize(*OpPC);
-  auto Memory =
-      std::make_unique<uint64_t[]>(llvm::APInt::getNumWords(BitWidth));
-
-  T Result(Memory.get(), BitWidth);
-  T::deserialize(*OpPC, &Result);
-
-  std::string Str;
-  llvm::raw_string_ostream SS(Str);
-  SS << Result;
+  auto F = IntegralAP<true>::deserialize(*OpPC);
+  OpPC += align(F.bytesToSerialize());
 
-  OpPC += Result.bytesToSerialize();
-  return Str;
+  std::string Result;
+  llvm::raw_string_ostream SS(Result);
+  SS << F;
+  return Result;
 }
 
 template <> inline std::string printArg<FixedPoint>(Program &P, CodePtr &OpPC) {
diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h
index 659892e720abf..3750568fc23c7 100644
--- a/clang/lib/AST/ByteCode/Floating.h
+++ b/clang/lib/AST/ByteCode/Floating.h
@@ -17,79 +17,63 @@
 #include "clang/AST/APValue.h"
 #include "llvm/ADT/APFloat.h"
 
-// XXX This is just a debugging help. Setting this to 1 will heap-allocate ALL
-// floating values.
-#define ALLOCATE_ALL 0
-
 namespace clang {
 namespace interp {
 
 using APFloat = llvm::APFloat;
 using APSInt = llvm::APSInt;
-using APInt = llvm::APInt;
 
-/// If a Floating is constructed from Memory, it DOES NOT OWN THAT MEMORY.
-/// It will NOT copy the memory (unless, of course, copy() is called) and it
-/// won't alllocate anything. The allocation should happen via InterpState or
-/// Program.
 class Floating final {
 private:
-  union {
-    uint64_t Val = 0;
-    uint64_t *Memory;
-  };
-  llvm::APFloatBase::Semantics Semantics;
-
-  APFloat getValue() const {
-    unsigned BitWidth = bitWidth();
-    if (singleWord())
-      return APFloat(getSemantics(), APInt(BitWidth, Val));
-    unsigned NumWords = numWords();
-    return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory));
-  }
+  // The underlying value storage.
+  APFloat F;
 
 public:
-  Floating() = default;
-  Floating(llvm::APFloatBase::Semantics Semantics)
-      : Val(0), Semantics(Semantics) {}
-  Floating(const APFloat &F) {
+  /// Zero-initializes a Floating.
+  Floating() : F(0.0f) {}
+  Floating(const APFloat &F) : F(F) {}
 
-    Semantics = llvm::APFloatBase::SemanticsToEnum(F.getSemantics());
-    this->copy(F);
+  // Static constructors for special floating point values.
+  static Floating getInf(const llvm::fltSemantics &Sem) {
+    return Floating(APFloat::getInf(Sem));
   }
-  Floating(uint64_t *Memory, llvm::APFloatBase::Semantics Semantics)
-      : Memory(Memory), Semantics(Semantics) {}
-
-  APFloat getAPFloat() const { return getValue(); }
+  const APFloat &getAPFloat() const { return F; }
 
-  bool operator<(Floating RHS) const { return getValue() < RHS.getValue(); }
-  bool operator>(Floating RHS) const { return getValue() > RHS.getValue(); }
-  bool operator<=(Floating RHS) const { return getValue() <= RHS.getValue(); }
-  bool operator>=(Floating RHS) const { return getValue() >= RHS.getValue(); }
+  bool operator<(Floating RHS) const { return F < RHS.F; }
+  bool operator>(Floating RHS) const { return F > RHS.F; }
+  bool operator<=(Floating RHS) const { return F <= RHS.F; }
+  bool operator>=(Floating RHS) const { return F >= RHS.F; }
+  bool operator==(Floating RHS) const { return F == RHS.F; }
+  bool operator!=(Floating RHS) const { return F != RHS.F; }
+  Floating operator-() const { return Floating(-F); }
 
   APFloat::opStatus convertToInteger(APSInt &Result) const {
     bool IsExact;
-    return getValue().convertToInteger(Result, llvm::APFloat::rmTowardZero,
-                                       &IsExact);
+    return F.convertToInteger(Result, llvm::APFloat::rmTowardZero, &IsExact);
   }
 
-  void toSemantics(const llvm::fltSemantics *Sem, llvm::RoundingMode RM,
-                   Floating *Result) const {
-    APFloat Copy = getValue();
+  Floating toSemantics(const llvm::fltSemantics *Sem,
+                       llvm::RoundingMode RM) const {
+    APFloat Copy = F;
     bool LosesInfo;
     Copy.convert(*Sem, RM, &LosesInfo);
     (void)LosesInfo;
-    Result->copy(Copy);
+    return Floating(Copy);
+  }
+
+  /// Convert this Floating to one with the same semantics as \Other.
+  Floating toSemantics(const Floating &Other, llvm::RoundingMode RM) const {
+    return toSemantics(&Other.F.getSemantics(), RM);
   }
 
   APSInt toAPSInt(unsigned NumBits = 0) const {
-    return APSInt(getValue().bitcastToAPInt());
+    return APSInt(F.bitcastToAPInt());
   }
-  APValue toAPValue(const ASTContext &) const { return APValue(getValue()); }
+  APValue toAPValue(const ASTContext &) const { return APValue(F); }
   void print(llvm::raw_ostream &OS) const {
     // Can't use APFloat::print() since it appends a newline.
     SmallVector<char, 16> Buffer;
-    getValue().toString(Buffer);
+    F.toString(Buffer);
     OS << Buffer;
   }
   std::string toDiagnosticString(const ASTContext &Ctx) const {
@@ -99,62 +83,25 @@ class Floating final {
     return NameStr;
   }
 
-  unsigned bitWidth() const {
-    return llvm::APFloatBase::semanticsSizeInBits(getSemantics());
-  }
-  unsigned numWords() const { return llvm::APInt::getNumWords(bitWidth()); }
-  bool singleWord() const {
-#if ALLOCATE_ALL
-    return false;
-#endif
-    return numWords() == 1;
-  }
-  static bool singleWord(const llvm::fltSemantics &Sem) {
-#if ALLOCATE_ALL
-    return false;
-#endif
-    return APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem)) == 1;
-  }
-  const llvm::fltSemantics &getSemantics() const {
-    return llvm::APFloatBase::EnumToSemantics(Semantics);
-  }
-
-  void copy(const APFloat &F) {
-    if (singleWord()) {
-      Val = F.bitcastToAPInt().getZExtValue();
-    } else {
-      assert(Memory);
-      std::memcpy(Memory, F.bitcastToAPInt().getRawData(),
-                  numWords() * sizeof(uint64_t));
-    }
-  }
-
-  void take(uint64_t *NewMemory) {
-    if (singleWord())
-      return;
-
-    if (Memory)
-      std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
-    Memory = NewMemory;
-  }
+  unsigned bitWidth() const { return F.semanticsSizeInBits(F.getSemantics()); }
 
   bool isSigned() const { return true; }
-  bool isNegative() const { return getValue().isNegative(); }
-  bool isZero() const { return getValue().isZero(); }
-  bool isNonZero() const { return getValue().isNonZero(); }
-  bool isMin() const { return getValue().isSmallest(); }
-  bool isMinusOne() const { return getValue().isExactlyValue(-1.0); }
-  bool isNan() const { return getValue().isNaN(); }
-  bool isSignaling() const { return getValue().isSignaling(); }
-  bool isInf() const { return getValue().isInfinity(); }
-  bool isFinite() const { return getValue().isFinite(); }
-  bool isNormal() const { return getValue().isNormal(); }
-  bool isDenormal() const { return getValue().isDenormal(); }
-  llvm::FPClassTest classify() const { return getValue().classify(); }
-  APFloat::fltCategory getCategory() const { return getValue().getCategory(); }
+  bool isNegative() const { return F.isNegative(); }
+  bool isZero() const { return F.isZero(); }
+  bool isNonZero() const { return F.isNonZero(); }
+  bool isMin() const { return F.isSmallest(); }
+  bool isMinusOne() const { return F.isExactlyValue(-1.0); }
+  bool isNan() const { return F.isNaN(); }
+  bool isSignaling() const { return F.isSignaling(); }
+  bool isInf() const { return F.isInfinity(); }
+  bool isFinite() const { return F.isFinite(); }
+  bool isNormal() const { return F.isNormal(); }
+  bool isDenormal() const { return F.isDenormal(); }
+  llvm::FPClassTest classify() const { return F.classify(); }
+  APFloat::fltCategory getCategory() const { return F.getCategory(); }
 
   ComparisonCategoryResult compare(const Floating &RHS) const {
-    llvm::APFloatBase::cmpResult CmpRes = getValue().compare(RHS.getValue());
+    llvm::APFloatBase::cmpResult CmpRes = F.compare(RHS.F);
     switch (CmpRes) {
     case llvm::APFloatBase::cmpLessThan:
       return ComparisonCategoryResult::Less;
@@ -171,130 +118,97 @@ class Floating final {
   static APFloat::opStatus fromIntegral(APSInt Val,
                                         const llvm::fltSemantics &Sem,
                                         llvm::RoundingMode RM,
-                                        Floating *Result) {
+                                        Floating &Result) {
     APFloat F = APFloat(Sem);
     APFloat::opStatus Status = F.convertFromAPInt(Val, Val.isSigned(), RM);
-    Result->copy(F);
+    Result = Floating(F);
     return Status;
   }
 
-  static void bitcastFromMemory(const std::byte *Buff,
-                                const llvm::fltSemantics &Sem,
-                                Floating *Result) {
+  static Floating bitcastFromMemory(const std::byte *Buff,
+                                    const llvm::fltSemantics &Sem) {
     size_t Size = APFloat::semanticsSizeInBits(Sem);
     llvm::APInt API(Size, true);
     llvm::LoadIntFromMemory(API, (const uint8_t *)Buff, Size / 8);
-    Result->copy(APFloat(Sem, API));
+
+    return Floating(APFloat(Sem, API));
   }
 
   void bitcastToMemory(std::byte *Buff) const {
-    llvm::APInt API = getValue().bitcastToAPInt();
+    llvm::APInt API = F.bitcastToAPInt();
     llvm::StoreIntToMemory(API, (uint8_t *)Buff, bitWidth() / 8);
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    return sizeof(Semantics) + (numWords() * sizeof(uint64_t));
+    return sizeof(llvm::fltSemantics *) +
+           (APFloat::semanticsSizeInBits(F.getSemantics()) / 8);
   }
 
   void serialize(std::byte *Buff) const {
-    std::memcpy(Buff, &Semantics, sizeof(Semantics));
-    if (singleWord()) {
-      std::memcpy(Buff + sizeof(Semantics), &Val, sizeof(uint64_t));
-    } else {
-      std::memcpy(Buff + sizeof(Semantics), Memory,
-                  numWords() * sizeof(uint64_t));
-    }
+    // Semantics followed by an APInt.
+    *reinterpret_cast<const llvm::fltSemantics **>(Buff) = &F.getSemantics();
+
+    llvm::APInt API = F.bitcastToAPInt();
+    llvm::StoreIntToMemory(API, (uint8_t *)(Buff + sizeof(void *)),
+                           bitWidth() / 8);
   }
 
-  static llvm::APFloatBase::Semantics
-  deserializeSemantics(const std::byte *Buff) {
-    return *reinterpret_cast<const llvm::APFloatBase::Semantics *>(Buff);
+  static Floating deserialize(const std::byte *Buff) {
+    const llvm::fltSemantics *Sem;
+    std::memcpy((void *)&Sem, Buff, sizeof(void *));
+    return bitcastFromMemory(Buff + sizeof(void *), *Sem);
   }
 
-  static void deserialize(const std::byte *Buff, Floating *Result) {
-    llvm::APFloatBase::Semantics Semantics;
-    std::memcpy(&Semantics, Buff, sizeof(Semantics));
-
-    unsigned BitWidth = llvm::APFloat::semanticsSizeInBits(
-        llvm::APFloatBase::EnumToSemantics(Semantics));
-    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
-
-    Result->Semantics = Semantics;
-    if (NumWords == 1 && !ALLOCATE_ALL) {
-      std::memcpy(&Result->Val, Buff + sizeof(Semantics), sizeof(uint64_t));
-    } else {
-      assert(Result->Memory);
-      std::memcpy(Result->Memory, Buff + sizeof(Semantics),
-                  NumWords * sizeof(uint64_t));
-    }
+  static Floating abs(const Floating &F) {
+    APFloat V = F.F;
+    if (V.isNegative())
+      V.changeSign();
+    return Floating(V);
   }
 
   // -------
 
   static APFloat::opStatus add(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.add(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.add(B.F, RM);
   }
 
   static APFloat::opStatus increment(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.getSemantics(), 1);
-    APFloat LHS = A.getValue();
-
-    auto Status = LHS.add(One, RM);
-    R->copy(LHS);
-    return Status;
+    APFloat One(A.F.getSemantics(), 1);
+    *R = Floating(A.F);
+    return R->F.add(One, RM);
   }
 
   static APFloat::opStatus sub(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.subtract(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.subtract(B.F, RM);
   }
 
   static APFloat::opStatus decrement(const Floating &A, llvm::RoundingMode RM,
                                      Floating *R) {
-    APFloat One(A.getSemantics(), 1);
-    APFloat LHS = A.getValue();
-
-    auto Status = LHS.subtract(One, RM);
-    R->copy(LHS);
-    return Status;
+    APFloat One(A.F.getSemantics(), 1);
+    *R = Floating(A.F);
+    return R->F.subtract(One, RM);
   }
 
   static APFloat::opStatus mul(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.multiply(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.multiply(B.F, RM);
   }
 
   static APFloat::opStatus div(const Floating &A, const Floating &B,
                                llvm::RoundingMode RM, Floating *R) {
-    APFloat LHS = A.getValue();
-    APFloat RHS = B.getValue();
-
-    auto Status = LHS.divide(RHS, RM);
-    R->copy(LHS);
-    return Status;
+    *R = Floating(A.F);
+    return R->F.divide(B.F, RM);
   }
 
   static bool neg(const Floating &A, Floating *R) {
-    R->copy(-A.getValue());
+    *R = -A;
     return false;
   }
 };
diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h
index af5cd2d13ecca..13fdb5369f2b7 100644
--- a/clang/lib/AST/ByteCode/Integral.h
+++ b/clang/lib/AST/ByteCode/Integral.h
@@ -99,9 +99,6 @@ template <unsigned Bits, bool Signed> class Integral final {
   bool operator>=(Integral RHS) const { return V >= RHS.V; }
   bool operator==(Integral RHS) const { return V == RHS.V; }
   bool operator!=(Integral RHS) const { return V != RHS.V; }
-  bool operator>=(unsigned RHS) const {
-    return static_cast<unsigned>(V) >= RHS;
-  }
 
   bool operator>(unsigned RHS) const {
     return V >= 0 && static_cast<unsigned>(V) > RHS;
diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h
index 259262bdc5243..8ee08dfb5cfe7 100644
--- a/clang/lib/AST/ByteCode/IntegralAP.h
+++ b/clang/lib/AST/ByteCode/IntegralAP.h
@@ -28,19 +28,12 @@ namespace interp {
 
 using APInt = llvm::APInt;
 using APSInt = llvm::APSInt;
+template <unsigned Bits, bool Signed> class Integral;
 
-/// If an IntegralAP is constructed from Memory, it DOES NOT OWN THAT MEMORY.
-/// It will NOT copy the memory (unless, of course, copy() is called) and it
-/// won't alllocate anything. The allocation should happen via InterpState or
-/// Program.
 template <bool Signed> class IntegralAP final {
-public:
-  union {
-    uint64_t *Memory = nullptr;
-    uint64_t Val;
-  };
-  unsigned BitWidth = 0;
+private:
   friend IntegralAP<!Signed>;
+  APInt V;
 
   template <typename T, bool InputSigned>
   static T truncateCast(const APInt &V) {
@@ -59,82 +52,52 @@ template <bool Signed> class IntegralAP final {
                                : V.trunc(BitSize).getZExtValue();
   }
 
-  APInt getValue() const {
-    if (singleWord())
-      return APInt(BitWidth, Val, Signed);
-    unsigned NumWords = llvm::APInt::getNumWords(BitWidth);
-    return llvm::APInt(BitWidth, NumWords, Memory);
-  }
-
 public:
   using AsUnsigned = IntegralAP<false>;
 
-  void take(uint64_t *NewMemory) {
-    assert(!singleWord());
-    std::memcpy(NewMemory, Memory, numWords() * sizeof(uint64_t));
-    Memory = NewMemory;
-  }
-
-  void copy(const APInt &V) {
-    assert(BitWidth == V.getBitWidth());
-    assert(numWords() == V.getNumWords());
-
-    if (V.isSingleWord()) {
-      if constexpr (Signed)
-        Val = V.getSExtValue();
-      else
-        Val = V.getZExtValue();
-      return;
-    }
-    assert(Memory);
-    std::memcpy(Memory, V.getRawData(), V.getNumWords() * sizeof(uint64_t));
-  }
+  template <typename T>
+  IntegralAP(T Value, unsigned BitWidth)
+      : V(APInt(BitWidth, static_cast<uint64_t>(Value), Signed)) {}
 
-  // Constructors.
-  IntegralAP() = default;
-  IntegralAP(unsigned BitWidth) : BitWidth(BitWidth) {}
-  IntegralAP(uint64_t *Memory, unsigned BitWidth)
-      : Memory(Memory), BitWidth(BitWidth) {}
-  IntegralAP(const APInt &V)
-      : IntegralAP(const_cast<uint64_t *>((const uint64_t *)V.getRawData()),
-                   V.getBitWidth()) {}
+  IntegralAP(APInt V) : V(V) {}
+  /// Arbitrary value for uninitialized variables.
+  IntegralAP() : IntegralAP(Signed ? -1 : 7, 3) {}
 
-  IntegralAP operator-() const { return IntegralAP(-getValue()); }
+  IntegralAP operator-() const { return IntegralAP(-V); }
   IntegralAP operator-(const IntegralAP &Other) const {
-    return IntegralAP(getValue() - Other.getValue());
+    return IntegralAP(V - Other.V);
   }
   bool operator>(const IntegralAP &RHS) const {
     if constexpr (Signed)
-      return getValue().sgt(RHS.getValue());
-    return getValue().ugt(RHS.getValue());
+      return V.ugt(RHS.V);
+    return V.sgt(RHS.V);
   }
-  bool operator>=(unsigned RHS) const {
+  bool operator>=(IntegralAP RHS) const {
     if constexpr (Signed)
-      return getValue().sge(RHS);
-    return getValue().uge(RHS);
+      return V.uge(RHS.V);
+    return V.sge(RHS.V);
   }
   bool operator<(IntegralAP RHS) const {
     if constexpr (Signed)
-      return getValue().slt(RHS.getValue());
-    return getValue().ult(RHS.getValue());
+      return V.slt(RHS.V);
+    return V.slt(RHS.V);
+  }
+  bool operator<=(IntegralAP RHS) const {
+    if constexpr (Signed)
+      return V.ult(RHS.V);
+    return V.ult(RHS.V);
   }
 
   template <typename Ty, typename = std::enable_if_t<std::is_integral_v<Ty>>>
   explicit operator Ty() const {
-    return truncateCast<Ty, Signed>(getValue());
+    return truncateCast<Ty, Signed>(V);
   }
 
   template <typename T> static IntegralAP from(T Value, unsigned NumBits = 0) {
-    if (NumBits == 0)
-      NumBits = sizeof(T) * 8;
     assert(NumBits > 0);
     APInt Copy = APInt(NumBits, static_cast<uint64_t>(Value), Signed);
-    assert(false);
-    return IntegralAP<Signed>(Copy);
-  }
 
-  static IntegralAP from(const APInt &Value) {
-    return IntegralAP<Signed>(Value);
+    return IntegralAP<Signed>(Copy);
   }
 
   template <bool InputSigned>
@@ -143,45 +106,52 @@ template <bool Signed> class IntegralAP final {
       NumBits = V.bitWidth();
 
     if constexpr (InputSigned)
-      return IntegralAP<Signed>(V.getValue().sextOrTrunc(NumBits));
-    return IntegralAP<Signed>(V.getValue().zextOrTrunc(NumBits));
+      return IntegralAP<Signed>(V.V.sextOrTrunc(NumBits));
+    return IntegralAP<Signed>(V.V.zextOrTrunc(NumBits));
+  }
+
+  template <unsigned Bits, bool InputSigned>
+  static IntegralAP from(Integral<Bits, InputSigned> I, unsigned BitWidth) {
+    return IntegralAP<Signed>(I.toAPInt(BitWidth));
+  }
+
+  static IntegralAP zero(int32_t BitWidth) {
+    APInt V = APInt(BitWidth, 0LL, Signed);
+    return IntegralAP(V);
   }
 
-  constexpr unsigned bitWidth() const { return BitWidth; }
-  constexpr unsigned numWords() const { return APInt::getNumWords(BitWidth); }
-  constexpr bool singleWord() const { return numWords() == 1; }
+  constexpr unsigned bitWidth() const { return V.getBitWidth(); }
 
   APSInt toAPSInt(unsigned Bits = 0) const {
     if (Bits == 0)
       Bits = bitWidth();
 
-    APInt V = getValue();
     if constexpr (Signed)
-      return APSInt(getValue().sext(Bits), !Signed);
+      return APSInt(V.sext(Bits), !Signed);
     else
-      return APSInt(getValue().zext(Bits), !Signed);
+      return APSInt(V.zext(Bits), !Signed);
   }
   APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); }
 
-  bool isZero() const { return getValue().isZero(); }
+  bool isZero() const { return V.isZero(); }
   bool isPositive() const {
     if constexpr (Signed)
-      return getValue().isNonNegative();
+      return V.isNonNegative();
     return true;
   }
   bool isNegative() const {
     if constexpr (Signed)
-      return !getValue().isNonNegative();
+      return !V.isNonNegative();
     return false;
   }
-  bool isMin() const { return getValue().isMinValue(); }
-  bool isMax() const { return getValue().isMaxValue(); }
+  bool isMin() const { return V.isMinValue(); }
+  bool isMax() const { return V.isMaxValue(); }
   static constexpr bool isSigned() { return Signed; }
-  bool isMinusOne() const { return Signed && getValue().isAllOnes(); }
+  bool isMinusOne() const { return Signed && V == -1; }
 
-  unsigned countLeadingZeros() const { return getValue().countl_zero(); }
+  unsigned countLeadingZeros() const { return V.countl_zero(); }
 
-  void print(llvm::raw_ostream &OS) const { getValue().print(OS, Signed); }
+  void print(llvm::raw_ostream &OS) const { V.print(OS, Signed);}
   std::string toDiagnosticString(const ASTContext &Ctx) const {
     std::string NameStr;
     llvm::raw_string_ostream OS(NameStr);
@@ -191,64 +161,53 @@ template <bool Signed> class IntegralAP final {
 
   IntegralAP truncate(unsigned BitWidth) const {
     if constexpr (Signed)
-      return IntegralAP(
-          getValue().trunc(BitWidth).sextOrTrunc(this->bitWidth()));
+      return IntegralAP(V.trunc(BitWidth).sextOrTrunc(this->bitWidth()));
     else
-      return IntegralAP(
-          getValue().trunc(BitWidth).zextOrTrunc(this->bitWidth()));
+      return IntegralAP(V.trunc(BitWidth).zextOrTrunc(this->bitWidth()));
   }
 
   IntegralAP<false> toUnsigned() const {
-    return IntegralAP<false>(Memory, BitWidth);
+    APInt Copy = V;
+    return IntegralAP<false>(Copy);
   }
 
   void bitcastToMemory(std::byte *Dest) const {
-    llvm::StoreIntToMemory(getValue(), (uint8_t *)Dest, bitWidth() / 8);
+    llvm::StoreIntToMemory(V, (uint8_t *)Dest, bitWidth() / 8);
   }
 
   static IntegralAP bitcastFromMemory(const std::byte *Src, unsigned BitWidth) {
-    // FIXME: Remove this.
     APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
     llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
     return IntegralAP(V);
   }
 
-  static void bitcastFromMemory(const std::byte *Src, unsigned BitWidth,
-                                IntegralAP *Result) {
-    APInt V(BitWidth, static_cast<uint64_t>(0), Signed);
-    llvm::LoadIntFromMemory(V, (const uint8_t *)Src, BitWidth / 8);
-    Result->copy(V);
-  }
-
   ComparisonCategoryResult compare(const IntegralAP &RHS) const {
     assert(Signed == RHS.isSigned());
     assert(bitWidth() == RHS.bitWidth());
-    APInt V1 = getValue();
-    APInt V2 = RHS.getValue();
     if constexpr (Signed) {
-      if (V1.slt(V2))
+      if (V.slt(RHS.V))
         return ComparisonCategoryResult::Less;
-      if (V1.sgt(V2))
+      if (V.sgt(RHS.V))
         return ComparisonCategoryResult::Greater;
       return ComparisonCategoryResult::Equal;
     }
 
     assert(!Signed);
-    if (V1.ult(V2))
+    if (V.ult(RHS.V))
       return ComparisonCategoryResult::Less;
-    if (V1.ugt(V2))
+    if (V.ugt(RHS.V))
       return ComparisonCategoryResult::Greater;
     return ComparisonCategoryResult::Equal;
   }
 
   static bool increment(IntegralAP A, IntegralAP *R) {
-    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
-    return add(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
+    IntegralAP<Signed> One(1, A.bitWidth());
+    return add(A, One, A.bitWidth() + 1, R);
   }
 
   static bool decrement(IntegralAP A, IntegralAP *R) {
-    APSInt One(APInt(A.bitWidth(), 1ull, Signed), !Signed);
-    return sub(A, IntegralAP<Signed>(One), A.bitWidth() + 1, R);
+    IntegralAP<Signed> One(1, A.bitWidth());
+    return sub(A, One, A.bitWidth() + 1, R);
   }
 
   static bool add(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
@@ -265,95 +224,87 @@ template <bool Signed> class IntegralAP final {
 
   static bool rem(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      R->copy(A.getValue().srem(B.getValue()));
+      *R = IntegralAP(A.V.srem(B.V));
     else
-      R->copy(A.getValue().urem(B.getValue()));
+      *R = IntegralAP(A.V.urem(B.V));
     return false;
   }
 
   static bool div(IntegralAP A, IntegralAP B, unsigned OpBits, IntegralAP *R) {
     if constexpr (Signed)
-      R->copy(A.getValue().sdiv(B.getValue()));
+      *R = IntegralAP(A.V.sdiv(B.V));
     else
-      R->copy(A.getValue().udiv(B.getValue()));
+      *R = IntegralAP(A.V.udiv(B.V));
     return false;
   }
 
   static bool bitAnd(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    R->copy(A.getValue() & B.getValue());
+    *R = IntegralAP(A.V & B.V);
     return false;
   }
 
   static bool bitOr(IntegralAP A, IntegralAP B, unsigned OpBits,
                     IntegralAP *R) {
-    R->copy(A.getValue() | B.getValue());
+    *R = IntegralAP(A.V | B.V);
     return false;
   }
 
   static bool bitXor(IntegralAP A, IntegralAP B, unsigned OpBits,
                      IntegralAP *R) {
-    R->copy(A.getValue() ^ B.getValue());
+    *R = IntegralAP(A.V ^ B.V);
     return false;
   }
 
   static bool neg(const IntegralAP &A, IntegralAP *R) {
-    APInt AI = A.getValue();
+    APInt AI = A.V;
     AI.negate();
-    R->copy(AI);
+    *R = IntegralAP(AI);
     return false;
   }
 
   static bool comp(IntegralAP A, IntegralAP *R) {
-    R->copy(~A.getValue());
+    *R = IntegralAP(~A.V);
     return false;
   }
 
   static void shiftLeft(const IntegralAP A, const IntegralAP B, unsigned OpBits,
                         IntegralAP *R) {
-    *R = IntegralAP(A.getValue().shl(B.getValue().getZExtValue()));
+    *R = IntegralAP(A.V.shl(B.V.getZExtValue()));
   }
 
   static void shiftRight(const IntegralAP A, const IntegralAP B,
                          unsigned OpBits, IntegralAP *R) {
-    unsigned ShiftAmount = B.getValue().getZExtValue();
+    unsigned ShiftAmount = B.V.getZExtValue();
     if constexpr (Signed)
-      R->copy(A.getValue().ashr(ShiftAmount));
+      *R = IntegralAP(A.V.ashr(ShiftAmount));
     else
-      R->copy(A.getValue().lshr(ShiftAmount));
+      *R = IntegralAP(A.V.lshr(ShiftAmount));
   }
 
   // === Serialization support ===
   size_t bytesToSerialize() const {
-    assert(BitWidth != 0);
-    uint32_t NumWords = llvm::APInt::getNumWords(bitWidth());
-    return sizeof(uint64_t) + (NumWords * sizeof(uint64_t));
+    // 4 bytes for the BitWidth followed by N bytes for the actual APInt.
+    return sizeof(uint32_t) + (V.getBitWidth() / CHAR_BIT);
   }
 
   void serialize(std::byte *Buff) const {
-    uint64_t NumWords = llvm::APInt::getNumWords(bitWidth());
-    std::memcpy(Buff, &BitWidth, sizeof(uint64_t));
-    if (singleWord())
-      std::memcpy(Buff + sizeof(uint64_t), &Val, NumWords * sizeof(uint64_t));
-    else
-      std::memcpy(Buff + sizeof(uint64_t), Memory, NumWords * sizeof(uint64_t));
-  }
+    assert(V.getBitWidth() < std::numeric_limits<uint8_t>::max());
+    uint32_t BitWidth = V.getBitWidth();
 
-  static uint32_t deserializeSize(const std::byte *Buff) {
-    return *reinterpret_cast<const uint64_t *>(Buff);
+    std::memcpy(Buff, &BitWidth, sizeof(uint32_t));
+    llvm::StoreIntToMemory(V, (uint8_t *)(Buff + sizeof(uint32_t)),
+                           BitWidth / CHAR_BIT);
   }
 
-  static void deserialize(const std::byte *Buff, IntegralAP<Signed> *Result) {
-    uint32_t BitWidth = Result->BitWidth;
-    uint32_t NumWords = llvm::APInt::getNumWords(BitWidth);
-    assert(BitWidth == Result->BitWidth);
-    assert(Result->Memory);
+  static IntegralAP<Signed> deserialize(const std::byte *Buff) {
+    uint32_t BitWidth;
+    std::memcpy(&BitWidth, Buff, sizeof(uint32_t));
+    IntegralAP<Signed> Val(APInt(BitWidth, 0ull, !Signed));
 
-    if (NumWords == 1)
-      std::memcpy(&Result->Val, Buff + sizeof(uint64_t), sizeof(uint64_t));
-    else
-      std::memcpy(Result->Memory, Buff + sizeof(uint64_t),
-                  NumWords * sizeof(uint64_t));
+    llvm::LoadIntFromMemory(Val.V, (const uint8_t *)Buff + sizeof(uint32_t),
+                            BitWidth / CHAR_BIT);
+    return Val;
   }
 
 private:
@@ -361,7 +312,7 @@ template <bool Signed> class IntegralAP final {
   static bool CheckAddSubMulUB(const IntegralAP &A, const IntegralAP &B,
                                unsigned BitWidth, IntegralAP *R) {
     if constexpr (!Signed) {
-      R->copy(Op<APInt>{}(A.getValue(), B.getValue()));
+      R->V = Op<APInt>{}(A.V, B.V);
       return false;
     }
 
@@ -369,7 +320,7 @@ template <bool Signed> class IntegralAP final {
     const APSInt &RHS = B.toAPSInt();
     APSInt Value = Op<APSInt>{}(LHS.extend(BitWidth), RHS.extend(BitWidth));
     APSInt Result = Value.trunc(LHS.getBitWidth());
-    R->copy(Result);
+    R->V = Result;
 
     return Result.extend(BitWidth) != Value;
   }
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index 1e2032feabb64..5c8abffb3a99d 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1935,10 +1935,8 @@ bool CastPointerIntegralAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
-
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(
+      IntegralAP<false>::from(Ptr.getIntegerRepresentation(), BitWidth));
   return true;
 }
 
@@ -1948,10 +1946,8 @@ bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
   if (!CheckPointerToIntegralCast(S, OpPC, Ptr, BitWidth))
     return false;
 
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  Result.copy(APInt(BitWidth, Ptr.getIntegerRepresentation()));
-
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(
+      IntegralAP<true>::from(Ptr.getIntegerRepresentation(), BitWidth));
   return true;
 }
 
@@ -2057,100 +2053,6 @@ bool arePotentiallyOverlappingStringLiterals(const Pointer &LHS,
   return Shorter == Longer.take_front(Shorter.size());
 }
 
-static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr,
-                                PrimType T) {
-
-  if (T == PT_IntAPS) {
-    auto &Val = Ptr.deref<IntegralAP<true>>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  } else if (T == PT_IntAP) {
-    auto &Val = Ptr.deref<IntegralAP<false>>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  } else if (T == PT_Float) {
-    auto &Val = Ptr.deref<Floating>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  }
-}
-
-template <typename T>
-static void copyPrimitiveMemory(InterpState &S, const Pointer &Ptr) {
-  assert(needsAlloc<T>());
-  auto &Val = Ptr.deref<T>();
-  if (!Val.singleWord()) {
-    uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-    Val.take(NewMemory);
-  }
-}
-
-static void finishGlobalRecurse(InterpState &S, const Pointer &Ptr) {
-  if (const Record *R = Ptr.getRecord()) {
-    for (const Record::Field &Fi : R->fields()) {
-      if (Fi.Desc->isPrimitive()) {
-        TYPE_SWITCH_ALLOC(Fi.Desc->getPrimType(), {
-          copyPrimitiveMemory<T>(S, Ptr.atField(Fi.Offset));
-        });
-        copyPrimitiveMemory(S, Ptr.atField(Fi.Offset), Fi.Desc->getPrimType());
-      } else
-        finishGlobalRecurse(S, Ptr.atField(Fi.Offset));
-    }
-    return;
-  }
-
-  if (const Descriptor *D = Ptr.getFieldDesc(); D && D->isArray()) {
-    unsigned NumElems = D->getNumElems();
-    if (NumElems == 0)
-      return;
-
-    if (D->isPrimitiveArray()) {
-      PrimType PT = D->getPrimType();
-      if (!needsAlloc(PT))
-        return;
-      assert(NumElems >= 1);
-      const Pointer EP = Ptr.atIndex(0);
-      bool AllSingleWord = true;
-      TYPE_SWITCH_ALLOC(PT, {
-        if (!EP.deref<T>().singleWord()) {
-          copyPrimitiveMemory<T>(S, EP);
-          AllSingleWord = false;
-        }
-      });
-      if (AllSingleWord)
-        return;
-      for (unsigned I = 1; I != D->getNumElems(); ++I) {
-        const Pointer EP = Ptr.atIndex(I);
-        copyPrimitiveMemory(S, EP, PT);
-      }
-    } else {
-      assert(D->isCompositeArray());
-      for (unsigned I = 0; I != D->getNumElems(); ++I) {
-        const Pointer EP = Ptr.atIndex(I).narrow();
-        finishGlobalRecurse(S, EP);
-      }
-    }
-  }
-}
-
-bool FinishInitGlobal(InterpState &S, CodePtr OpPC) {
-  const Pointer &Ptr = S.Stk.pop<Pointer>();
-
-  finishGlobalRecurse(S, Ptr);
-  if (Ptr.canBeInitialized()) {
-    Ptr.initialize();
-    Ptr.activate();
-  }
-
-  return true;
-}
-
 // https://github.com/llvm/llvm-project/issues/102513
 #if defined(_MSC_VER) && !defined(__clang__) && !defined(NDEBUG)
 #pragma optimize("", off)
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 66d3e6d79e8b2..ae3d4a441a799 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -189,7 +189,7 @@ bool CheckShift(InterpState &S, CodePtr OpPC, const LT &LHS, const RT &RHS,
 
   // C++11 [expr.shift]p1: Shift width must be less than the bit width of
   // the shifted type.
-  if (Bits > 1 && RHS >= Bits) {
+  if (Bits > 1 && RHS >= RT::from(Bits, RHS.bitWidth())) {
     const Expr *E = S.Current->getExpr(OpPC);
     const APSInt Val = RHS.toAPSInt();
     QualType Ty = E->getType();
@@ -370,9 +370,6 @@ bool AddSubMulHelper(InterpState &S, CodePtr OpPC, unsigned Bits, const T &LHS,
                      const T &RHS) {
   // Fast path - add the numbers with fixed width.
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!OpFW(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -411,7 +408,6 @@ bool Add(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
-
   return AddSubMulHelper<T, T::add, std::plus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -427,7 +423,7 @@ inline bool Addf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::add(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -438,7 +434,6 @@ bool Sub(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() + 1;
-
   return AddSubMulHelper<T, T::sub, std::minus>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -447,7 +442,7 @@ inline bool Subf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::sub(LHS, RHS, getRoundingMode(FPO), &Result);
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -458,7 +453,6 @@ bool Mul(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
   const unsigned Bits = RHS.bitWidth() * 2;
-
   return AddSubMulHelper<T, T::mul, std::multiplies>(S, OpPC, Bits, LHS, RHS);
 }
 
@@ -467,10 +461,8 @@ inline bool Mulf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
   const Floating &LHS = S.Stk.pop<Floating>();
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(LHS.getSemantics());
-
+  Floating Result;
   auto Status = Floating::mul(LHS, RHS, getRoundingMode(FPO), &Result);
-
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -492,14 +484,9 @@ inline bool Mulc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexMul(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    Floating RA = S.allocFloat(A.getSemantics());
-    RA.copy(ResR);
-    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
+    Result.atIndex(0).deref<Floating>() = Floating(ResR);
     Result.atIndex(0).initialize();
-
-    Floating RI = S.allocFloat(A.getSemantics());
-    RI.copy(ResI);
-    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
+    Result.atIndex(1).deref<Floating>() = Floating(ResI);
     Result.atIndex(1).initialize();
     Result.initialize();
   } else {
@@ -552,20 +539,10 @@ inline bool Divc(InterpState &S, CodePtr OpPC) {
     HandleComplexComplexDiv(A, B, C, D, ResR, ResI);
 
     // Copy into the result.
-    // Result.atIndex(0).deref<Floating>() = Floating(ResR);
-    // Result.atIndex(0).initialize();
-    // Result.atIndex(1).deref<Floating>() = Floating(ResI);
-    // Result.atIndex(1).initialize();
-
-    Floating RA = S.allocFloat(A.getSemantics());
-    RA.copy(ResR);
-    Result.atIndex(0).deref<Floating>() = RA; // Floating(ResR);
+    Result.atIndex(0).deref<Floating>() = Floating(ResR);
     Result.atIndex(0).initialize();
-
-    Floating RI = S.allocFloat(A.getSemantics());
-    RI.copy(ResI);
-    Result.atIndex(1).deref<Floating>() = RI; // Floating(ResI);
-
+    Result.atIndex(1).deref<Floating>() = Floating(ResI);
+    Result.atIndex(1).initialize();
     Result.initialize();
   } else {
     // Integer element type.
@@ -631,12 +608,9 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitAnd(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-  unsigned Bits = RHS.bitWidth();
 
+  unsigned Bits = RHS.bitWidth();
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitAnd(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -651,12 +625,9 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool BitOr(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-  unsigned Bits = RHS.bitWidth();
 
+  unsigned Bits = RHS.bitWidth();
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitOr(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -673,11 +644,7 @@ bool BitXor(InterpState &S, CodePtr OpPC) {
   const T &LHS = S.Stk.pop<T>();
 
   unsigned Bits = RHS.bitWidth();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Bits);
-
   if (!T::bitXor(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -692,15 +659,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Rem(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
+  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!T::rem(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -715,15 +679,12 @@ template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Div(InterpState &S, CodePtr OpPC) {
   const T &RHS = S.Stk.pop<T>();
   const T &LHS = S.Stk.pop<T>();
-  const unsigned Bits = RHS.bitWidth() * 2;
 
   if (!CheckDivRem(S, OpPC, LHS, RHS))
     return false;
 
+  const unsigned Bits = RHS.bitWidth() * 2;
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(LHS.bitWidth());
-
   if (!T::div(LHS, RHS, Bits, &Result)) {
     S.Stk.push<T>(Result);
     return true;
@@ -746,10 +707,8 @@ inline bool Divf(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
     return false;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  Floating Result = S.allocFloat(LHS.getSemantics());
+  Floating Result;
   auto Status = Floating::div(LHS, RHS, getRoundingMode(FPO), &Result);
-
   S.Stk.push<Floating>(Result);
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
 }
@@ -771,44 +730,31 @@ inline bool Inv(InterpState &S, CodePtr OpPC) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Neg(InterpState &S, CodePtr OpPC) {
   const T &Value = S.Stk.pop<T>();
+  T Result;
 
-  if constexpr (std::is_same_v<T, Floating>) {
-    T Result = S.allocFloat(Value.getSemantics());
-
-    if (!T::neg(Value, &Result)) {
-      S.Stk.push<T>(Result);
-      return true;
-    }
-    return false;
-  } else {
-    T Result;
-    if constexpr (needsAlloc<T>())
-      Result = S.allocAP<T>(Value.bitWidth());
-
-    if (!T::neg(Value, &Result)) {
-      S.Stk.push<T>(Result);
-      return true;
-    }
-
-    assert(isIntegralType(Name) &&
-           "don't expect other types to fail at constexpr negation");
+  if (!T::neg(Value, &Result)) {
     S.Stk.push<T>(Result);
+    return true;
+  }
 
-    APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
-    if (S.checkingForUndefinedBehavior()) {
-      const Expr *E = S.Current->getExpr(OpPC);
-      QualType Type = E->getType();
-      SmallString<32> Trunc;
-      NegatedValue.trunc(Result.bitWidth())
-          .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
-                    /*UpperCase=*/true, /*InsertSeparators=*/true);
-      S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
-          << Trunc << Type << E->getSourceRange();
-      return true;
-    }
+  assert(isIntegralType(Name) &&
+         "don't expect other types to fail at constexpr negation");
+  S.Stk.push<T>(Result);
 
-    return handleOverflow(S, OpPC, NegatedValue);
+  APSInt NegatedValue = -Value.toAPSInt(Value.bitWidth() + 1);
+  if (S.checkingForUndefinedBehavior()) {
+    const Expr *E = S.Current->getExpr(OpPC);
+    QualType Type = E->getType();
+    SmallString<32> Trunc;
+    NegatedValue.trunc(Result.bitWidth())
+        .toString(Trunc, 10, Result.isSigned(), /*formatAsCLiteral=*/false,
+                  /*UpperCase=*/true, /*InsertSeparators=*/true);
+    S.report(E->getExprLoc(), diag::warn_integer_constant_overflow)
+        << Trunc << Type << E->getSourceRange();
+    return true;
   }
+
+  return handleOverflow(S, OpPC, NegatedValue);
 }
 
 enum class PushVal : bool {
@@ -837,8 +783,6 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 
   const T &Value = Ptr.deref<T>();
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Value.bitWidth());
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<T>(Value);
@@ -946,6 +890,7 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) {
   const Pointer &Ptr = S.Stk.peek<Pointer>();
   if (!CheckLoad(S, OpPC, Ptr, AK_Decrement))
     return false;
+
   return IncDecHelper<T, IncDecOp::Dec, PushVal::No>(S, OpPC, Ptr, CanOverflow);
 }
 
@@ -953,7 +898,7 @@ template <IncDecOp Op, PushVal DoPush>
 bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
                        uint32_t FPOI) {
   Floating Value = Ptr.deref<Floating>();
-  Floating Result = S.allocFloat(Value.getSemantics());
+  Floating Result;
 
   if constexpr (DoPush == PushVal::Yes)
     S.Stk.push<Floating>(Value);
@@ -1007,15 +952,12 @@ inline bool DecfPop(InterpState &S, CodePtr OpPC, uint32_t FPOI) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Comp(InterpState &S, CodePtr OpPC) {
   const T &Val = S.Stk.pop<T>();
-
   T Result;
-  if constexpr (needsAlloc<T>())
-    Result = S.allocAP<T>(Val.bitWidth());
-
   if (!T::comp(Val, &Result)) {
     S.Stk.push<T>(Result);
     return true;
   }
+
   return false;
 }
 
@@ -1383,23 +1325,10 @@ bool Flip(InterpState &S, CodePtr OpPC) {
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool Const(InterpState &S, CodePtr OpPC, const T &Arg) {
-  if constexpr (needsAlloc<T>()) {
-    T Result = S.allocAP<T>(Arg.bitWidth());
-    Result.copy(Arg.toAPSInt());
-    S.Stk.push<T>(Result);
-    return true;
-  }
   S.Stk.push<T>(Arg);
   return true;
 }
 
-inline bool ConstFloat(InterpState &S, CodePtr OpPC, const Floating &F) {
-  Floating Result = S.allocFloat(F.getSemantics());
-  Result.copy(F.getAPFloat());
-  S.Stk.push<Floating>(Result);
-  return true;
-}
-
 //===----------------------------------------------------------------------===//
 // Get/Set Local/Param/Global/This
 //===----------------------------------------------------------------------===//
@@ -1554,24 +1483,7 @@ bool SetGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool InitGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
   const Pointer &P = S.P.getGlobal(I);
-
   P.deref<T>() = S.Stk.pop<T>();
-
-  if constexpr (std::is_same_v<T, Floating>) {
-    auto &Val = P.deref<Floating>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-
-  } else if constexpr (needsAlloc<T>()) {
-    auto &Val = P.deref<T>();
-    if (!Val.singleWord()) {
-      uint64_t *NewMemory = new (S.P) uint64_t[Val.numWords()];
-      Val.take(NewMemory);
-    }
-  }
-
   P.initialize();
   return true;
 }
@@ -1673,22 +1585,7 @@ bool InitBitField(InterpState &S, CodePtr OpPC, const Record::Field *F) {
   assert(F->isBitField());
   const T &Value = S.Stk.pop<T>();
   const Pointer &Field = S.Stk.peek<Pointer>().atField(F->Offset);
-
-  if constexpr (needsAlloc<T>()) {
-    T Result = S.allocAP<T>(Value.bitWidth());
-    if (T::isSigned())
-      Result.copy(Value.toAPSInt()
-                      .trunc(F->Decl->getBitWidthValue())
-                      .sextOrTrunc(Value.bitWidth()));
-    else
-      Result.copy(Value.toAPSInt()
-                      .trunc(F->Decl->getBitWidthValue())
-                      .zextOrTrunc(Value.bitWidth()));
-
-    Field.deref<T>() = Result;
-  } else {
-    Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
-  }
+  Field.deref<T>() = Value.truncate(F->Decl->getBitWidthValue());
   Field.activate();
   Field.initialize();
   return true;
@@ -1868,8 +1765,6 @@ inline bool FinishInit(InterpState &S, CodePtr OpPC) {
   return true;
 }
 
-bool FinishInitGlobal(InterpState &S, CodePtr OpPC);
-
 inline bool Dump(InterpState &S, CodePtr OpPC) {
   S.Stk.dump();
   return true;
@@ -2376,8 +2271,7 @@ template <PrimType TIn, PrimType TOut> bool Cast(InterpState &S, CodePtr OpPC) {
 inline bool CastFP(InterpState &S, CodePtr OpPC, const llvm::fltSemantics *Sem,
                    llvm::RoundingMode RM) {
   Floating F = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(*Sem);
-  F.toSemantics(Sem, RM, &Result);
+  Floating Result = F.toSemantics(Sem, RM);
   S.Stk.push<Floating>(Result);
   return true;
 }
@@ -2401,25 +2295,15 @@ inline bool CastFixedPoint(InterpState &S, CodePtr OpPC, uint32_t FPS) {
 /// to know what bitwidth the result should be.
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  // Copy data.
-  {
-    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
-    Result.copy(Source);
-  }
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(
+      IntegralAP<false>::from(S.Stk.pop<T>(), BitWidth));
   return true;
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>
 bool CastAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  // Copy data.
-  {
-    APInt Source = S.Stk.pop<T>().toAPSInt().extOrTrunc(BitWidth);
-    Result.copy(Source);
-  }
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(
+      IntegralAP<true>::from(S.Stk.pop<T>(), BitWidth));
   return true;
 }
 
@@ -2428,11 +2312,11 @@ bool CastIntegralFloating(InterpState &S, CodePtr OpPC,
                           const llvm::fltSemantics *Sem, uint32_t FPOI) {
   const T &From = S.Stk.pop<T>();
   APSInt FromAP = From.toAPSInt();
+  Floating Result;
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-  Floating Result = S.allocFloat(*Sem);
   auto Status =
-      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), &Result);
+      Floating::fromIntegral(FromAP, *Sem, getRoundingMode(FPO), Result);
   S.Stk.push<Floating>(Result);
 
   return CheckFloatResult(S, OpPC, Result, Status, FPO);
@@ -2481,12 +2365,7 @@ static inline bool CastFloatingIntegralAP(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  auto ResultAP = S.allocAP<IntegralAP<false>>(BitWidth);
-  ResultAP.copy(Result);
-
-  S.Stk.push<IntegralAP<false>>(ResultAP);
-
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2502,12 +2381,7 @@ static inline bool CastFloatingIntegralAPS(InterpState &S, CodePtr OpPC,
     return handleOverflow(S, OpPC, F.getAPFloat());
 
   FPOptions FPO = FPOptions::getFromOpaqueInt(FPOI);
-
-  auto ResultAP = S.allocAP<IntegralAP<true>>(BitWidth);
-  ResultAP.copy(Result);
-
-  S.Stk.push<IntegralAP<true>>(ResultAP);
-
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>(Result));
   return CheckFloatResult(S, OpPC, F, Status, FPO);
 }
 
@@ -2567,9 +2441,8 @@ static inline bool CastFloatingFixedPoint(InterpState &S, CodePtr OpPC,
 static inline bool CastFixedPointFloating(InterpState &S, CodePtr OpPC,
                                           const llvm::fltSemantics *Sem) {
   const auto &Fixed = S.Stk.pop<FixedPoint>();
-  Floating Result = S.allocFloat(*Sem);
-  Result.copy(Fixed.toFloat(Sem));
-  S.Stk.push<Floating>(Result);
+
+  S.Stk.push<Floating>(Fixed.toFloat(Sem));
   return true;
 }
 
@@ -2633,18 +2506,12 @@ bool Zero(InterpState &S, CodePtr OpPC) {
 }
 
 static inline bool ZeroIntAP(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  if (!Result.singleWord())
-    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
-  S.Stk.push<IntegralAP<false>>(Result);
+  S.Stk.push<IntegralAP<false>>(IntegralAP<false>::zero(BitWidth));
   return true;
 }
 
 static inline bool ZeroIntAPS(InterpState &S, CodePtr OpPC, uint32_t BitWidth) {
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  if (!Result.singleWord())
-    std::memset(Result.Memory, 0, Result.numWords() * sizeof(uint64_t));
-  S.Stk.push<IntegralAP<true>>(Result);
+  S.Stk.push<IntegralAP<true>>(IntegralAP<true>::zero(BitWidth));
   return true;
 }
 
@@ -2711,9 +2578,7 @@ inline bool RVOPtr(InterpState &S, CodePtr OpPC) {
 //===----------------------------------------------------------------------===//
 
 template <class LT, class RT, ShiftDir Dir>
-inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
-                    LT *Result) {
-
+inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS) {
   const unsigned Bits = LHS.bitWidth();
 
   // OpenCL 6.3j: shift values are effectively % word size of LHS.
@@ -2731,7 +2596,7 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
     RHS = -RHS;
     return DoShift<LT, RT,
                    Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS, Result);
+        S, OpPC, LHS, RHS);
   }
 
   if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
@@ -2779,7 +2644,6 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
       // Do the shift on potentially signed LT, then convert to unsigned type.
       LT A;
       LT::shiftRight(LHS, LT::from(RHS, Bits), Bits, &A);
-      // LT::shiftRight(LHS, LT(RHSTemp), Bits, &A);
       R = LT::AsUnsigned::from(A);
     }
   }
@@ -2788,48 +2652,6 @@ inline bool DoShift(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
   return true;
 }
 
-/// A version of DoShift that works on IntegralAP.
-template <class LT, class RT, ShiftDir Dir>
-inline bool DoShiftAP(InterpState &S, CodePtr OpPC, LT &LHS, RT &RHS,
-                      LT *Result) {
-  const unsigned Bits = LHS.bitWidth();
-  const APSInt &LHSAP = LHS.toAPSInt();
-  APSInt RHSAP = RHS.toAPSInt();
-
-  // OpenCL 6.3j: shift values are effectively % word size of LHS.
-  if (S.getLangOpts().OpenCL)
-    RHSAP &= APSInt(llvm::APInt(RHSAP.getBitWidth(),
-                                static_cast<uint64_t>(LHSAP.getBitWidth() - 1)),
-                    RHSAP.isUnsigned());
-
-  if (RHS.isNegative()) {
-    // During constant-folding, a negative shift is an opposite shift. Such a
-    // shift is not a constant expression.
-    const SourceInfo &Loc = S.Current->getSource(OpPC);
-    S.CCEDiag(Loc, diag::note_constexpr_negative_shift) << RHS.toAPSInt();
-    if (!S.noteUndefinedBehavior())
-      return false;
-    RHS = -RHS;
-    return DoShiftAP<LT, RT,
-                     Dir == ShiftDir::Left ? ShiftDir::Right : ShiftDir::Left>(
-        S, OpPC, LHS, RHS, Result);
-  }
-
-  if (!CheckShift<Dir>(S, OpPC, LHS, RHS, Bits))
-    return false;
-
-  if constexpr (Dir == ShiftDir::Left) {
-    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
-    Result->copy(LHSAP << SA);
-  } else {
-    unsigned SA = (unsigned)RHSAP.getLimitedValue(LHS.bitWidth() - 1);
-    Result->copy(LHSAP >> SA);
-  }
-
-  S.Stk.push<LT>(*Result);
-  return true;
-}
-
 template <PrimType NameL, PrimType NameR>
 inline bool Shr(InterpState &S, CodePtr OpPC) {
   using LT = typename PrimConv<NameL>::T;
@@ -2837,13 +2659,7 @@ inline bool Shr(InterpState &S, CodePtr OpPC) {
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
 
-  if constexpr (needsAlloc<LT>()) {
-    LT Result = S.allocAP<LT>(LHS.bitWidth());
-    return DoShiftAP<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
-  } else {
-    LT Result;
-    return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS, &Result);
-  }
+  return DoShift<LT, RT, ShiftDir::Right>(S, OpPC, LHS, RHS);
 }
 
 template <PrimType NameL, PrimType NameR>
@@ -2852,13 +2668,8 @@ inline bool Shl(InterpState &S, CodePtr OpPC) {
   using RT = typename PrimConv<NameR>::T;
   auto RHS = S.Stk.pop<RT>();
   auto LHS = S.Stk.pop<LT>();
-  if constexpr (needsAlloc<LT>()) {
-    LT Result = S.allocAP<LT>(LHS.bitWidth());
-    return DoShiftAP<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
-  } else {
-    LT Result;
-    return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS, &Result);
-  }
+
+  return DoShift<LT, RT, ShiftDir::Left>(S, OpPC, LHS, RHS);
 }
 
 static inline bool ShiftFixedPoint(InterpState &S, CodePtr OpPC, bool Left) {
@@ -3441,15 +3252,7 @@ inline bool BitCastPrim(InterpState &S, CodePtr OpPC, bool TargetIsUCharOrByte,
 
     if constexpr (std::is_same_v<T, Floating>) {
       assert(Sem);
-      Floating Result = S.allocFloat(*Sem);
-      Floating::bitcastFromMemory(Buff.data(), *Sem, &Result);
-      S.Stk.push<Floating>(Result);
-
-      // S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
-    } else if constexpr (needsAlloc<T>()) {
-      T Result = S.allocAP<T>(ResultBitWidth);
-      T::bitcastFromMemory(Buff.data(), ResultBitWidth, &Result);
-      S.Stk.push<T>(Result);
+      S.Stk.push<Floating>(T::bitcastFromMemory(Buff.data(), *Sem));
     } else {
       assert(!Sem);
       S.Stk.push<T>(T::bitcastFromMemory(Buff.data(), ResultBitWidth));
@@ -3507,11 +3310,7 @@ template <typename T> inline T ReadArg(InterpState &S, CodePtr &OpPC) {
 }
 
 template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
-  auto &Semantics =
-      llvm::APFloatBase::EnumToSemantics(Floating::deserializeSemantics(*OpPC));
-
-  auto F = S.allocFloat(Semantics);
-  Floating::deserialize(*OpPC, &F);
+  Floating F = Floating::deserialize(*OpPC);
   OpPC += align(F.bytesToSerialize());
   return F;
 }
@@ -3519,25 +3318,17 @@ template <> inline Floating ReadArg<Floating>(InterpState &S, CodePtr &OpPC) {
 template <>
 inline IntegralAP<false> ReadArg<IntegralAP<false>>(InterpState &S,
                                                     CodePtr &OpPC) {
-  uint32_t BitWidth = IntegralAP<false>::deserializeSize(*OpPC);
-  auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-  assert(Result.bitWidth() == BitWidth);
-
-  IntegralAP<false>::deserialize(*OpPC, &Result);
-  OpPC += align(Result.bytesToSerialize());
-  return Result;
+  IntegralAP<false> I = IntegralAP<false>::deserialize(*OpPC);
+  OpPC += align(I.bytesToSerialize());
+  return I;
 }
 
 template <>
 inline IntegralAP<true> ReadArg<IntegralAP<true>>(InterpState &S,
                                                   CodePtr &OpPC) {
-  uint32_t BitWidth = IntegralAP<true>::deserializeSize(*OpPC);
-  auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-  assert(Result.bitWidth() == BitWidth);
-
-  IntegralAP<true>::deserialize(*OpPC, &Result);
-  OpPC += align(Result.bytesToSerialize());
-  return Result;
+  IntegralAP<true> I = IntegralAP<true>::deserialize(*OpPC);
+  OpPC += align(I.bytesToSerialize());
+  return I;
 }
 
 template <>
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5304bd77f2c06..d01e3d042a8bf 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -57,21 +57,6 @@ static void pushInteger(InterpState &S, const APSInt &Val, QualType QT) {
   assert(T);
 
   unsigned BitWidth = S.getASTContext().getTypeSize(QT);
-
-  if (T == PT_IntAPS) {
-    auto Result = S.allocAP<IntegralAP<true>>(BitWidth);
-    Result.copy(Val);
-    S.Stk.push<IntegralAP<true>>(Result);
-    return;
-  }
-
-  if (T == PT_IntAP) {
-    auto Result = S.allocAP<IntegralAP<false>>(BitWidth);
-    Result.copy(Val);
-    S.Stk.push<IntegralAP<false>>(Result);
-    return;
-  }
-
   if (QT->isSignedIntegerOrEnumerationType()) {
     int64_t V = Val.getSExtValue();
     INT_TYPE_SWITCH(*T, { S.Stk.push<T>(T::from(V, BitWidth)); });
@@ -342,13 +327,13 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result = S.allocFloat(TargetSemantics);
+  Floating Result;
   if (S.getASTContext().getTargetInfo().isNan2008()) {
     if (Signaling)
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
   } else {
     // Prior to IEEE 754-2008, architectures were allowed to choose whether
@@ -357,10 +342,10 @@ static bool interp__builtin_nan(InterpState &S, CodePtr OpPC,
     // 2008 revisions, MIPS interpreted sNaN-2008 as qNan and qNaN-2008 as
     // sNaN. This is now known as "legacy NaN" encoding.
     if (Signaling)
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getQNaN(TargetSemantics, /*Negative=*/false, &Fill));
     else
-      Result.copy(
+      Result = Floating(
           llvm::APFloat::getSNaN(TargetSemantics, /*Negative=*/false, &Fill));
   }
 
@@ -375,9 +360,7 @@ static bool interp__builtin_inf(InterpState &S, CodePtr OpPC,
       S.getASTContext().getFloatTypeSemantics(
           Call->getDirectCallee()->getReturnType());
 
-  Floating Result = S.allocFloat(TargetSemantics);
-  Result.copy(APFloat::getInf(TargetSemantics));
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating::getInf(TargetSemantics));
   return true;
 }
 
@@ -385,12 +368,10 @@ static bool interp__builtin_copysign(InterpState &S, CodePtr OpPC,
                                      const InterpFrame *Frame) {
   const Floating &Arg2 = S.Stk.pop<Floating>();
   const Floating &Arg1 = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(Arg1.getSemantics());
 
   APFloat Copy = Arg1.getAPFloat();
   Copy.copySign(Arg2.getAPFloat());
-  Result.copy(Copy);
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating(Copy));
 
   return true;
 }
@@ -399,13 +380,11 @@ static bool interp__builtin_fmin(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    Result.copy(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    S.Stk.push<Floating>(llvm::minimumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    Result.copy(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
-  S.Stk.push<Floating>(Result);
+    S.Stk.push<Floating>(minnum(LHS.getAPFloat(), RHS.getAPFloat()));
   return true;
 }
 
@@ -413,13 +392,11 @@ static bool interp__builtin_fmax(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame, bool IsNumBuiltin) {
   const Floating &RHS = S.Stk.pop<Floating>();
   const Floating &LHS = S.Stk.pop<Floating>();
-  Floating Result = S.allocFloat(LHS.getSemantics());
 
   if (IsNumBuiltin)
-    Result.copy(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
+    S.Stk.push<Floating>(llvm::maximumnum(LHS.getAPFloat(), RHS.getAPFloat()));
   else
-    Result.copy(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
-  S.Stk.push<Floating>(Result);
+    S.Stk.push<Floating>(maxnum(LHS.getAPFloat(), RHS.getAPFloat()));
   return true;
 }
 
@@ -594,16 +571,8 @@ static bool interp__builtin_fpclassify(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC,
                                  const InterpFrame *Frame) {
   const Floating &Val = S.Stk.pop<Floating>();
-  APFloat F = Val.getAPFloat();
-  if (!F.isNegative()) {
-    S.Stk.push<Floating>(Val);
-    return true;
-  }
 
-  Floating Result = S.allocFloat(Val.getSemantics());
-  F.changeSign();
-  Result.copy(F);
-  S.Stk.push<Floating>(Result);
+  S.Stk.push<Floating>(Floating::abs(Val));
   return true;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
index 2569cac018b31..239b3104e89f1 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltinBitCast.cpp
@@ -402,9 +402,7 @@ bool clang::interp::DoBitCastPtr(InterpState &S, CodePtr OpPC,
           if (llvm::sys::IsBigEndianHost)
             swapBytes(M.get(), NumBits.roundToBytes());
 
-          Floating R = S.allocFloat(Semantics);
-          Floating::bitcastFromMemory(M.get(), Semantics, &R);
-          P.deref<Floating>() = R;
+          P.deref<Floating>() = Floating::bitcastFromMemory(M.get(), Semantics);
           P.initialize();
           return true;
         }
diff --git a/clang/lib/AST/ByteCode/InterpState.h b/clang/lib/AST/ByteCode/InterpState.h
index 08765561985e2..e8dc6f0483d60 100644
--- a/clang/lib/AST/ByteCode/InterpState.h
+++ b/clang/lib/AST/ByteCode/InterpState.h
@@ -15,7 +15,6 @@
 
 #include "Context.h"
 #include "DynamicAllocator.h"
-#include "Floating.h"
 #include "Function.h"
 #include "InterpFrame.h"
 #include "InterpStack.h"
@@ -127,33 +126,6 @@ class InterpState final : public State, public SourceMapper {
 
   StdAllocatorCaller getStdAllocatorCaller(StringRef Name) const;
 
-  void *allocate(size_t Size, unsigned Align = 8) const {
-    return Allocator.Allocate(Size, Align);
-  }
-  template <typename T> T *allocate(size_t Num = 1) const {
-    return static_cast<T *>(allocate(Num * sizeof(T), alignof(T)));
-  }
-
-  template <typename T> T allocAP(unsigned BitWidth) {
-    unsigned NumWords = APInt::getNumWords(BitWidth);
-    if (NumWords == 1)
-      return T(BitWidth);
-    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
-    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
-    return T(Mem, BitWidth);
-  }
-
-  Floating allocFloat(const llvm::fltSemantics &Sem) {
-    if (Floating::singleWord(Sem))
-      return Floating(llvm::APFloatBase::SemanticsToEnum(Sem));
-
-    unsigned NumWords =
-        APInt::getNumWords(llvm::APFloatBase::getSizeInBits(Sem));
-    uint64_t *Mem = (uint64_t *)this->allocate(NumWords * sizeof(uint64_t));
-    // std::memset(Mem, 0, NumWords * sizeof(uint64_t)); // Debug
-    return Floating(Mem, llvm::APFloatBase::SemanticsToEnum(Sem));
-  }
-
 private:
   friend class EvaluationResult;
   friend class InterpStateCCOverride;
@@ -189,8 +161,6 @@ class InterpState final : public State, public SourceMapper {
   llvm::SmallVector<
       std::pair<const Expr *, const LifetimeExtendedTemporaryDecl *>>
       SeenGlobalTemporaries;
-
-  mutable llvm::BumpPtrAllocator Allocator;
 };
 
 class InterpStateCCOverride final {
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td
index 57e01f7bd9da0..c76ac5f8ae868 100644
--- a/clang/lib/AST/ByteCode/Opcodes.td
+++ b/clang/lib/AST/ByteCode/Opcodes.td
@@ -48,7 +48,6 @@ def ArgUint64 : ArgType { let Name = "uint64_t"; }
 def ArgIntAP : ArgType { let Name = "IntegralAP<false>"; let AsRef = true; }
 def ArgIntAPS : ArgType { let Name = "IntegralAP<true>"; let AsRef = true; }
 def ArgFloat : ArgType { let Name = "Floating"; let AsRef = true; }
-
 def ArgBool : ArgType { let Name = "bool"; }
 def ArgFixedPoint : ArgType { let Name = "FixedPoint"; let AsRef = true; }
 
@@ -89,9 +88,6 @@ def IntegerAndFixedTypeClass : TypeClass {
                Uint32, Sint64, Uint64, IntAP, IntAPS, FixedPoint];
 }
 
-def IntegralTypeClass : TypeClass {
-  let Types = !listconcat(IntegerTypeClass.Types, [Bool]);
-}
 def FixedSizeIntegralTypeClass : TypeClass {
   let Types = [Sint8, Uint8, Sint16, Uint16, Sint32,
                Uint32, Sint64, Uint64, Bool];
@@ -269,13 +265,12 @@ def ConstSint32 : ConstOpcode<Sint32, ArgSint32>;
 def ConstUint32 : ConstOpcode<Uint32, ArgUint32>;
 def ConstSint64 : ConstOpcode<Sint64, ArgSint64>;
 def ConstUint64 : ConstOpcode<Uint64, ArgUint64>;
-def ConstIntAP : ConstOpcode<IntAP, ArgIntAP>;
-def ConstIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
+def ConstFloat : ConstOpcode<Float, ArgFloat>;
+def constIntAP : ConstOpcode<IntAP, ArgIntAP>;
+def constIntAPS : ConstOpcode<IntAPS, ArgIntAPS>;
 def ConstBool : ConstOpcode<Bool, ArgBool>;
 def ConstFixedPoint : ConstOpcode<FixedPoint, ArgFixedPoint>;
 
-def ConstFloat : Opcode { let Args = [ArgFloat]; }
-
 // [] -> [Integer]
 def Zero : Opcode {
   let Types = [FixedSizeIntegralTypeClass];
@@ -333,7 +328,6 @@ def GetMemberPtrBasePop : Opcode {
 
 def FinishInitPop : Opcode;
 def FinishInit    : Opcode;
-def FinishInitGlobal : Opcode;
 
 def GetPtrDerivedPop : Opcode { let Args = [ArgUint32, ArgBool, ArgTypePtr]; }
 
@@ -395,7 +389,7 @@ class AccessOpcode : Opcode {
 }
 
 class BitFieldOpcode : Opcode {
-  let Types = [IntegralTypeClass];
+  let Types = [AluTypeClass];
   let Args = [ArgRecordField];
   let HasGroup = 1;
 }
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index a156cccbb3c1b..6152fbfbe3a74 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -76,13 +76,6 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
 }
 
 constexpr bool isIntegralType(PrimType T) { return T <= PT_FixedPoint; }
-template <typename T> constexpr bool needsAlloc() {
-  return std::is_same_v<T, IntegralAP<false>> ||
-         std::is_same_v<T, IntegralAP<true>> || std::is_same_v<T, Floating>;
-}
-constexpr bool needsAlloc(PrimType T) {
-  return T == PT_IntAP || T == PT_IntAPS || T == PT_Float;
-}
 
 /// Mapping from primitive types to their representation.
 template <PrimType T> struct PrimConv;
@@ -216,16 +209,6 @@ static inline bool aligned(const void *P) {
     }                                                                          \
   } while (0)
 
-#define TYPE_SWITCH_ALLOC(Expr, B)                                             \
-  do {                                                                         \
-    switch (Expr) {                                                            \
-      TYPE_SWITCH_CASE(PT_Float, B)                                            \
-      TYPE_SWITCH_CASE(PT_IntAP, B)                                            \
-      TYPE_SWITCH_CASE(PT_IntAPS, B)                                           \
-    default:;                                                                  \
-    }                                                                          \
-  } while (0)
-
 #define COMPOSITE_TYPE_SWITCH(Expr, B, D)                                      \
   do {                                                                         \
     switch (Expr) {                                                            \
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h
index 5d9c422447493..23ba1bbd193b1 100644
--- a/clang/lib/AST/ByteCode/Program.h
+++ b/clang/lib/AST/ByteCode/Program.h
@@ -132,14 +132,6 @@ class Program final {
                                bool IsMutable = false, bool IsVolatile = false,
                                const Expr *Init = nullptr);
 
-  void *Allocate(size_t Size, unsigned Align = 8) const {
-    return Allocator.Allocate(Size, Align);
-  }
-  template <typename T> T *Allocate(size_t Num = 1) const {
-    return static_cast<T *>(Allocate(Num * sizeof(T), alignof(T)));
-  }
-  void Deallocate(void *Ptr) const {}
-
   /// Context to manage declaration lifetimes.
   class DeclScope {
   public:
@@ -212,7 +204,7 @@ class Program final {
   };
 
   /// Allocator for globals.
-  mutable PoolAllocTy Allocator;
+  PoolAllocTy Allocator;
 
   /// Global objects.
   std::vector<Global *> Globals;
@@ -246,18 +238,4 @@ class Program final {
 } // namespace interp
 } // namespace clang
 
-inline void *operator new(size_t Bytes, const clang::interp::Program &C,
-                          size_t Alignment = 8) {
-  return C.Allocate(Bytes, Alignment);
-}
-
-inline void operator delete(void *Ptr, const clang::interp::Program &C,
-                            size_t) {
-  C.Deallocate(Ptr);
-}
-inline void *operator new[](size_t Bytes, const clang::interp::Program &C,
-                            size_t Alignment = 8) {
-  return C.Allocate(Bytes, Alignment);
-}
-
 #endif
diff --git a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
index 1013a771d13b4..710612bef8fd0 100644
--- a/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
+++ b/clang/test/AST/ByteCode/builtin-bit-cast-long-double.cpp
@@ -21,9 +21,6 @@ template <class To, class From>
 constexpr To bit_cast(const From &from) {
   static_assert(sizeof(To) == sizeof(From));
   return __builtin_bit_cast(To, from);
-#if __x86_64
-  // both-note@-2 {{indeterminate value can only initialize an object of type}}
-#endif
 }
 
 template <class Intermediate, class Init>
@@ -41,8 +38,11 @@ constexpr Init round_trip(const Init &init) {
 
 namespace test_long_double {
 #if __x86_64
-constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // both-error{{must be initialized by a constant expression}}\
-                                                                                 // both-note{{in call}}
+/// FIXME: We could enable this, but since it aborts, it causes the usual mempory leak.
+#if 0
+constexpr __int128_t test_cast_to_int128 = bit_cast<__int128_t>((long double)0); // expected-error{{must be initialized by a constant expression}}\
+                                                                                 // expected-note{{in call}}
+#endif
 constexpr long double ld = 3.1425926539;
 
 struct bytes {
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 174c1ffa79a43..21dca15a45775 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -208,7 +208,7 @@ namespace nan {
 
   constexpr double NaN3 = __builtin_nan("foo"); // both-error {{must be initialized by a constant expression}}
   constexpr float NaN4 = __builtin_nanf("");
-  constexpr long double NaN5 = __builtin_nanf128("");
+  //constexpr long double NaN5 = __builtin_nanf128("");
 
   /// FIXME: This should be accepted by the current interpreter as well.
   constexpr char f[] = {'0', 'x', 'A', 'E', '\0'};
@@ -655,6 +655,8 @@ void test_noexcept(int *i) {
 } // end namespace test_launder
 
 
+/// FIXME: The commented out tests here use a IntAP value and fail.
+/// This currently means we will leak the IntAP value since nothing cleans it up.
 namespace clz {
   char clz1[__builtin_clz(1) == BITSIZE(int) - 1 ? 1 : -1];
   char clz2[__builtin_clz(7) == BITSIZE(int) - 3 ? 1 : -1];
@@ -707,7 +709,7 @@ namespace clz {
   char clz48[__builtin_clzg(1ULL << (BITSIZE(long long) - 1)) == 0 ? 1 : -1];
   char clz49[__builtin_clzg(1ULL << (BITSIZE(long long) - 1), 42) == 0 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  int clz50 = __builtin_clzg((unsigned __int128)0);
+  // int clz50 = __builtin_clzg((unsigned __int128)0);
   char clz51[__builtin_clzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char clz52[__builtin_clzg((unsigned __int128)0x1) == BITSIZE(__int128) - 1 ? 1 : -1];
   char clz53[__builtin_clzg((unsigned __int128)0x1, 42) == BITSIZE(__int128) - 1 ? 1 : -1];
@@ -715,7 +717,7 @@ namespace clz {
   char clz55[__builtin_clzg((unsigned __int128)0xf, 42) == BITSIZE(__int128) - 4 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
+  // int clz58 = __builtin_clzg((unsigned _BitInt(128))0);
   char clz59[__builtin_clzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char clz60[__builtin_clzg((unsigned _BitInt(128))0x1) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
   char clz61[__builtin_clzg((unsigned _BitInt(128))0x1, 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
@@ -773,7 +775,7 @@ namespace ctz {
   char ctz46[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1)) == BITSIZE(long long) - 1 ? 1 : -1];
   char ctz47[__builtin_ctzg(1ULL << (BITSIZE(long long) - 1), 42) == BITSIZE(long long) - 1 ? 1 : -1];
 #ifdef __SIZEOF_INT128__
-  int ctz48 = __builtin_ctzg((unsigned __int128)0);
+  // int ctz48 = __builtin_ctzg((unsigned __int128)0);
   char ctz49[__builtin_ctzg((unsigned __int128)0, 42) == 42 ? 1 : -1];
   char ctz50[__builtin_ctzg((unsigned __int128)0x1) == 0 ? 1 : -1];
   char ctz51[__builtin_ctzg((unsigned __int128)0x1, 42) == 0 ? 1 : -1];
@@ -783,7 +785,7 @@ namespace ctz {
   char ctz55[__builtin_ctzg((unsigned __int128)1 << (BITSIZE(__int128) - 1), 42) == BITSIZE(__int128) - 1 ? 1 : -1];
 #endif
 #ifndef __AVR__
-  int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
+  // int ctz56 = __builtin_ctzg((unsigned _BitInt(128))0);
   char ctz57[__builtin_ctzg((unsigned _BitInt(128))0, 42) == 42 ? 1 : -1];
   char ctz58[__builtin_ctzg((unsigned _BitInt(128))0x1) == 0 ? 1 : -1];
   char ctz59[__builtin_ctzg((unsigned _BitInt(128))0x1, 42) == 0 ? 1 : -1];

From 667c7860ef5cc67a94c5233ff1be9c0e113ac514 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Tue, 17 Jun 2025 12:25:20 -0700
Subject: [PATCH 794/851] [CIR] Handle global string literals as char array
 initializer (#144384)

This change adds the line of code needed to handle a string literal as
an initializer for a character array.
---
 clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp |  4 ++--
 clang/test/CIR/CodeGen/string-literals.c     | 12 ++++++++++
 clang/test/CIR/CodeGen/string-literals.cpp   | 23 ++++++++++++++++++++
 3 files changed, 37 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/string-literals.cpp

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index 1976742d4039e..8b817f3f3d8d2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -254,8 +254,8 @@ class ConstExprEmitter
   }
 
   mlir::Attribute VisitStringLiteral(StringLiteral *e, QualType t) {
-    cgm.errorNYI(e->getBeginLoc(), "ConstExprEmitter::VisitStringLiteral");
-    return {};
+    // This is a string literal initializing an array in an initializer.
+    return cgm.getConstantArrayFromStringLiteral(e);
   }
 
   mlir::Attribute VisitObjCEncodeExpr(ObjCEncodeExpr *e, QualType t) {
diff --git a/clang/test/CIR/CodeGen/string-literals.c b/clang/test/CIR/CodeGen/string-literals.c
index 00f59b09400c8..90ea21906f363 100644
--- a/clang/test/CIR/CodeGen/string-literals.c
+++ b/clang/test/CIR/CodeGen/string-literals.c
@@ -5,6 +5,18 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll
 // RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
 
+char g_str[] = "1234";
+
+// CIR: cir.global external @g_str = #cir.const_array<"1234\00" : !cir.array<!s8i x 5>> : !cir.array<!s8i x 5>
+
+char g_oversized[100] = "123";
+
+// CIR: cir.global external @g_oversized = #cir.const_array<"123" : !cir.array<!s8i x 3>, trailing_zeros> : !cir.array<!s8i x 100>
+
+char g_exact[4] = "123";
+
+// CIR: cir.global external @g_exact = #cir.const_array<"123\00" : !cir.array<!s8i x 4>> : !cir.array<!s8i x 4>
+
 // CIR: cir.global "private" cir_private dsolocal @[[STR1_GLOBAL:.*]] = #cir.const_array<"1\00" : !cir.array<!s8i x 2>> : !cir.array<!s8i x 2>
 // CIR: cir.global "private" cir_private dsolocal @[[STR2_GLOBAL:.*]] = #cir.zero : !cir.array<!s8i x 1>
 // CIR: cir.global "private" cir_private dsolocal @[[STR3_GLOBAL:.*]] = #cir.zero : !cir.array<!s8i x 2>
diff --git a/clang/test/CIR/CodeGen/string-literals.cpp b/clang/test/CIR/CodeGen/string-literals.cpp
new file mode 100644
index 0000000000000..c56eb74387329
--- /dev/null
+++ b/clang/test/CIR/CodeGen/string-literals.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+// CIR: cir.global "private" cir_private dsolocal @[[STR1_GLOBAL:.*]] = #cir.const_array<"abcd\00" : !cir.array<!s8i x 5>> : !cir.array<!s8i x 5>
+
+// LLVM: @[[STR1_GLOBAL:.*]] = private global [5 x i8] c"abcd\00"
+
+// OGCG: @[[STR1_GLOBAL:.*]] = private unnamed_addr constant [5 x i8] c"abcd\00"
+
+decltype(auto) returns_literal() {
+    return "abcd";
+}
+
+// CIR: cir.func{{.*}} @_Z15returns_literalv() -> !cir.ptr<!cir.array<!s8i x 5>>
+// CIR:   %[[RET_ADDR:.*]] = cir.alloca !cir.ptr<!cir.array<!s8i x 5>>, !cir.ptr<!cir.ptr<!cir.array<!s8i x 5>>>, ["__retval"]
+// CIR:   %[[STR_ADDR:.*]] = cir.get_global @[[STR1_GLOBAL]] : !cir.ptr<!cir.array<!s8i x 5>>
+// CIR:   cir.store{{.*}} %[[STR_ADDR]], %[[RET_ADDR]]
+// CIR:   %[[RET:.*]] = cir.load %[[RET_ADDR]]
+// CIR:   cir.return %[[RET]]

From b1aa845595c4dc204dfbe0e48481572e936620fc Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 17 Jun 2025 15:48:58 -0400
Subject: [PATCH 795/851] [libc++][NFC] Consistently qualify calls to C
 functions in <fstream> (#144539)

---
 libcxx/include/fstream | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index 71c4957b691a6..00aa00ff7e9cd 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -696,7 +696,7 @@ basic_filebuf<_CharT, _Traits>* basic_filebuf<_CharT, _Traits>::open(const char*
   if (!__mdstr)
     return nullptr;
 
-  return __do_open(fopen(__s, __mdstr), __mode);
+  return __do_open(std::fopen(__s, __mdstr), __mode);
 }
 
 template <class _CharT, class _Traits>
@@ -761,7 +761,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>
     std::memmove(this->eback(), this->egptr() - __unget_sz, __unget_sz * sizeof(char_type));
     if (__always_noconv_) {
       size_t __nmemb = static_cast<size_t>(this->egptr() - this->eback() - __unget_sz);
-      __nmemb        = ::fread(this->eback() + __unget_sz, 1, __nmemb, __file_);
+      __nmemb        = std::fread(this->eback() + __unget_sz, 1, __nmemb, __file_);
       if (__nmemb != 0) {
         this->setg(this->eback(), this->eback() + __unget_sz, this->eback() + __unget_sz + __nmemb);
         __c = traits_type::to_int_type(*this->gptr());
@@ -778,7 +778,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>
           std::min(static_cast<size_t>(__ibs_ - __unget_sz), static_cast<size_t>(__extbufend_ - __extbufnext_));
       codecvt_base::result __r;
       __st_last_  = __st_;
-      size_t __nr = fread((void*)const_cast<char*>(__extbufnext_), 1, __nmemb, __file_);
+      size_t __nr = std::fread((void*)const_cast<char*>(__extbufnext_), 1, __nmemb, __file_);
       if (__nr != 0) {
         if (!__cv_)
           std::__throw_bad_cast();
@@ -855,7 +855,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>
             return traits_type::eof();
         } else if (__r == codecvt_base::ok || __r == codecvt_base::partial) {
           size_t __nmemb = static_cast<size_t>(__extbe - __extbuf_);
-          if (fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb)
+          if (std::fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb)
             return traits_type::eof();
           if (__r == codecvt_base::partial) {
             this->setp(const_cast<char_type*>(__e), this->pptr());
@@ -990,12 +990,12 @@ int basic_filebuf<_CharT, _Traits>::sync() {
       char* __extbe;
       __r            = __cv_->unshift(__st_, __extbuf_, __extbuf_ + __ebs_, __extbe);
       size_t __nmemb = static_cast<size_t>(__extbe - __extbuf_);
-      if (fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb)
+      if (std::fwrite(__extbuf_, 1, __nmemb, __file_) != __nmemb)
         return -1;
     } while (__r == codecvt_base::partial);
     if (__r == codecvt_base::error)
       return -1;
-    if (fflush(__file_))
+    if (std::fflush(__file_))
       return -1;
   } else if (__cm_ & ios_base::in) {
     off_type __c;

From 19658d14749876cf0b6633f210c923be3709323b Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Tue, 17 Jun 2025 13:28:45 -0700
Subject: [PATCH 796/851] [llvm] annotate interfaces in llvm/Target for DLL
 export (#143615)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Purpose

This patch is one in a series of code-mods that annotate LLVM’s public
interface for export. This patch annotates the `llvm/Target` library.
These annotations currently have no meaningful impact on the LLVM build;
however, they are a prerequisite to support an LLVM Windows DLL (shared
library) build.

## Background

This effort is tracked in #109483. Additional context is provided in
[this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

A sub-set of these changes were generated automatically using the
[Interface Definition Scanner (IDS)](https://github.com/compnerd/ids)
tool, followed formatting with `git clang-format`.

The bulk of this change is manual additions of `LLVM_ABI` to
`LLVMInitializeX` functions defined in .cpp files under llvm/lib/Target.
Adding `LLVM_ABI` to the function implementation is required here
because they do not `#include "llvm/Support/TargetSelect.h"`, which
contains the declarations for this functions and was already updated
with `LLVM_ABI` in a previous patch. I considered patching these files
with `#include "llvm/Support/TargetSelect.h"` instead, but since
TargetSelect.h is a large file with a bunch of preprocessor x-macro
stuff in it I was concerned it would unnecessarily impact compile times.

In addition, a number of unit tests under llvm/unittests/Target required
additional dependencies to make them build correctly against the LLVM
DLL on Windows using MSVC.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 llvm/include/llvm/Target/CGPassBuilderOption.h        |  3 ++-
 llvm/include/llvm/Target/TargetLoweringObjectFile.h   |  3 ++-
 llvm/include/llvm/Target/TargetMachine.h              |  5 +++--
 llvm/include/llvm/Target/TargetOptions.h              | 11 ++++++-----
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp         |  4 +++-
 llvm/lib/Target/AArch64/AArch64TargetMachine.cpp      |  4 +++-
 .../lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp |  3 ++-
 .../AArch64/Disassembler/AArch64Disassembler.cpp      |  3 ++-
 .../AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp      |  4 +++-
 .../Target/AArch64/TargetInfo/AArch64TargetInfo.cpp   |  4 +++-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp           |  4 +++-
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp        |  3 ++-
 llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp  |  4 +++-
 .../Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp |  4 +++-
 llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp  |  4 +++-
 .../Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp |  4 +++-
 .../lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp |  4 +++-
 llvm/lib/Target/ARM/ARMAsmPrinter.cpp                 |  4 +++-
 llvm/lib/Target/ARM/ARMTargetMachine.cpp              |  3 ++-
 llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp        |  2 +-
 llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp  |  3 ++-
 llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp  |  3 ++-
 llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp      |  5 ++++-
 llvm/lib/Target/AVR/AVRAsmPrinter.cpp                 |  4 +++-
 llvm/lib/Target/AVR/AVRTargetMachine.cpp              |  3 ++-
 llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp        |  3 ++-
 llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp  |  5 ++++-
 llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp  |  3 ++-
 llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp      |  4 +++-
 llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp        |  3 ++-
 llvm/lib/Target/BPF/BPFAsmPrinter.cpp                 |  4 +++-
 llvm/lib/Target/BPF/BPFTargetMachine.cpp              |  3 ++-
 llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp  |  5 +++--
 llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp  |  4 ++--
 llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp      |  4 +++-
 .../lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp |  4 +++-
 .../Hexagon/Disassembler/HexagonDisassembler.cpp      |  4 +++-
 llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp         |  4 +++-
 llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp      |  4 +++-
 .../Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp      |  4 +++-
 .../Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp   |  4 +++-
 llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp    |  4 +++-
 .../Target/Lanai/Disassembler/LanaiDisassembler.cpp   |  4 +++-
 llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/Lanai/LanaiTargetMachine.cpp          |  3 ++-
 .../Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp  |  4 +++-
 .../Target/LoongArch/AsmParser/LoongArchAsmParser.cpp |  4 +++-
 .../LoongArch/Disassembler/LoongArchDisassembler.cpp  |  4 +++-
 llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp     |  4 +++-
 llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp  |  4 +++-
 .../LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp  |  3 ++-
 .../LoongArch/TargetInfo/LoongArchTargetInfo.cpp      |  4 +++-
 llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp  |  4 +++-
 .../Target/MSP430/Disassembler/MSP430Disassembler.cpp |  4 +++-
 .../Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp |  4 +++-
 llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp           |  4 +++-
 llvm/lib/Target/MSP430/MSP430TargetMachine.cpp        |  3 ++-
 .../lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp |  4 +++-
 llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp      |  3 ++-
 .../lib/Target/Mips/Disassembler/MipsDisassembler.cpp |  3 ++-
 .../lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp |  3 ++-
 llvm/lib/Target/Mips/MipsAsmPrinter.cpp               |  4 +++-
 llvm/lib/Target/Mips/MipsTargetMachine.cpp            |  3 ++-
 llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp    |  4 +++-
 .../Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp          |  3 ++-
 llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp  |  4 +++-
 llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp    |  4 +++-
 .../Target/PowerPC/Disassembler/PPCDisassembler.cpp   |  4 +++-
 .../Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp          |  4 +++-
 .../Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp   |  4 +++-
 llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp    |  4 +++-
 .../Target/RISCV/Disassembler/RISCVDisassembler.cpp   |  4 +++-
 llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp    |  4 +++-
 .../Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/RISCV/RISCVTargetMachine.cpp          |  3 ++-
 llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp  |  4 +++-
 .../Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp          |  3 ++-
 llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp  |  4 +++-
 llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp    |  4 +++-
 .../Target/Sparc/Disassembler/SparcDisassembler.cpp   |  5 +++--
 .../Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/Sparc/SparcAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/Sparc/SparcTargetMachine.cpp          |  3 ++-
 llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp  |  4 +++-
 .../lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp |  4 +++-
 .../SystemZ/Disassembler/SystemZDisassembler.cpp      |  4 +++-
 .../SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp      |  4 +++-
 llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp         |  4 +++-
 llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp      |  4 +++-
 .../Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp   |  4 +++-
 llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp          |  3 ++-
 llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp    |  4 +++-
 llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp    |  3 ++-
 llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp        |  3 ++-
 llvm/lib/Target/VE/VEAsmPrinter.cpp                   |  3 ++-
 llvm/lib/Target/VE/VETargetMachine.cpp                |  3 ++-
 .../WebAssembly/AsmParser/WebAssemblyAsmParser.cpp    |  4 +++-
 .../Disassembler/WebAssemblyDisassembler.cpp          |  3 ++-
 .../MCTargetDesc/WebAssemblyMCTargetDesc.cpp          |  4 +++-
 .../WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp  |  4 +++-
 llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp |  4 +++-
 .../Target/WebAssembly/WebAssemblyTargetMachine.cpp   |  4 +++-
 .../Target/XCore/Disassembler/XCoreDisassembler.cpp   |  4 +++-
 .../Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp   |  4 +++-
 llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp  |  4 +++-
 llvm/lib/Target/XCore/XCoreAsmPrinter.cpp             |  4 +++-
 llvm/lib/Target/XCore/XCoreTargetMachine.cpp          |  3 ++-
 llvm/unittests/Target/AArch64/CMakeLists.txt          |  1 +
 llvm/unittests/Target/LoongArch/CMakeLists.txt        |  1 +
 llvm/unittests/Target/RISCV/CMakeLists.txt            |  1 +
 llvm/unittests/Target/SPIRV/CMakeLists.txt            |  1 +
 llvm/unittests/Target/VE/CMakeLists.txt               |  1 +
 llvm/unittests/Target/WebAssembly/CMakeLists.txt      |  1 +
 121 files changed, 322 insertions(+), 123 deletions(-)

diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h
index 51f25c1360b87..f29cbe78a1853 100644
--- a/llvm/include/llvm/Target/CGPassBuilderOption.h
+++ b/llvm/include/llvm/Target/CGPassBuilderOption.h
@@ -15,6 +15,7 @@
 #define LLVM_TARGET_CGPASSBUILDEROPTION_H
 
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include <optional>
 
@@ -82,7 +83,7 @@ struct CGPassBuilderOption {
   std::optional<bool> DebugifyCheckAndStripAll;
 };
 
-CGPassBuilderOption getCGPassBuilderOption();
+LLVM_ABI CGPassBuilderOption getCGPassBuilderOption();
 
 } // namespace llvm
 
diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h
index 47617424a9688..27a688bc12abf 100644
--- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h
@@ -16,6 +16,7 @@
 
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCRegister.h"
+#include "llvm/Support/Compiler.h"
 #include <cstdint>
 
 namespace llvm {
@@ -43,7 +44,7 @@ class StringRef;
 class TargetMachine;
 class DSOLocalEquivalent;
 
-class TargetLoweringObjectFile : public MCObjectFileInfo {
+class LLVM_ABI TargetLoweringObjectFile : public MCObjectFileInfo {
   /// Name-mangler for global names.
   Mangler *Mang = nullptr;
 
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 906926729ed74..04c97c1502a1b 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/PGOOptions.h"
 #include "llvm/Target/CGPassBuilderOption.h"
@@ -28,7 +29,7 @@
 #include <string>
 #include <utility>
 
-extern llvm::cl::opt<bool> NoKernelInfoEndLTO;
+extern LLVM_ABI llvm::cl::opt<bool> NoKernelInfoEndLTO;
 
 namespace llvm {
 
@@ -78,7 +79,7 @@ struct MachineFunctionInfo;
 /// machine.  All target-specific information should be accessible through this
 /// interface.
 ///
-class TargetMachine {
+class LLVM_ABI TargetMachine {
 protected: // Can only create subclasses.
   TargetMachine(const Target &T, StringRef DataLayoutString,
                 const Triple &TargetTriple, StringRef CPU, StringRef FS,
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index 10638a0ec902f..a7c46921255b8 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -16,6 +16,7 @@
 
 #include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/Compiler.h"
 
 #include <memory>
 
@@ -158,12 +159,12 @@ class TargetOptions {
 
   /// DisableFramePointerElim - This returns true if frame pointer elimination
   /// optimization should be disabled for the given machine function.
-  bool DisableFramePointerElim(const MachineFunction &MF) const;
+  LLVM_ABI bool DisableFramePointerElim(const MachineFunction &MF) const;
 
   /// FramePointerIsReserved - This returns true if the frame pointer must
   /// always either point to a new frame record or be un-modified in the given
   /// function.
-  bool FramePointerIsReserved(const MachineFunction &MF) const;
+  LLVM_ABI bool FramePointerIsReserved(const MachineFunction &MF) const;
 
   /// If greater than 0, override the default value of
   /// MCAsmInfo::BinutilsVersion.
@@ -219,7 +220,7 @@ class TargetOptions {
   /// truncations).  If this is enabled (set to true), the code generator must
   /// assume that the rounding mode may dynamically change.
   unsigned HonorSignDependentRoundingFPMathOption : 1;
-  bool HonorSignDependentRoundingFPMath() const;
+  LLVM_ABI bool HonorSignDependentRoundingFPMath() const;
 
   /// NoZerosInBSS - By default some codegens place zero-initialized data to
   /// .bss section. This flag disables such behaviour (necessary, e.g. for
@@ -346,7 +347,7 @@ class TargetOptions {
   unsigned EnableDebugEntryValues : 1;
   /// NOTE: There are targets that still do not support the debug entry values
   /// production.
-  bool ShouldEmitDebugEntryValues() const;
+  LLVM_ABI bool ShouldEmitDebugEntryValues() const;
 
   // When set to true, use experimental new debug variable location tracking,
   // which seeks to follow the values of variables rather than their location,
@@ -450,7 +451,7 @@ class TargetOptions {
 
   DenormalMode getRawFP32DenormalMode() const { return FP32DenormalMode; }
 
-  DenormalMode getDenormalMode(const fltSemantics &FPType) const;
+  LLVM_ABI DenormalMode getDenormalMode(const fltSemantics &FPType) const;
 
   /// What exception model to use
   ExceptionHandling ExceptionModel = ExceptionHandling::None;
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 3f92c1dbfbf49..4099f40ea07fd 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -58,6 +58,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
@@ -3534,7 +3535,8 @@ INITIALIZE_PASS(AArch64AsmPrinter, "aarch64-asm-printer",
                 "AArch64 Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64AsmPrinter() {
   RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
   RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
   RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 50f52cca6c8ac..8150e91c8ba52 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -44,6 +44,7 @@
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
@@ -223,7 +224,8 @@ static cl::opt<bool>
                            cl::desc("Enable Machine Pipeliner for AArch64"),
                            cl::init(false), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
   RegisterTargetMachine<AArch64beTargetMachine> Y(getTheAArch64beTarget());
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 2f67ff55f26b7..d8bdc01a3454f 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -8322,7 +8322,8 @@ bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
 }
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64AsmParser() {
   RegisterMCAsmParser<AArch64AsmParser> X(getTheAArch64leTarget());
   RegisterMCAsmParser<AArch64AsmParser> Y(getTheAArch64beTarget());
   RegisterMCAsmParser<AArch64AsmParser> Z(getTheARM64Target());
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index bab0cbe7788e9..ae984be670fc2 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -310,7 +310,8 @@ createAArch64ExternalSymbolizer(const Triple &TT, LLVMOpInfoCallback GetOpInfo,
                                        SymbolLookUp, DisInfo);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Disassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64Disassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheAArch64leTarget(),
                                          createAArch64Disassembler);
   TargetRegistry::RegisterMCDisassembler(getTheAArch64beTarget(),
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index b7959e02ec268..efc13589bab63 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/AArch64TargetParser.h"
@@ -503,7 +504,8 @@ static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64TargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64TargetMC() {
   for (Target *T : {&getTheAArch64leTarget(), &getTheAArch64beTarget(),
                     &getTheAArch64_32Target(), &getTheARM64Target(),
                     &getTheARM64_32Target()}) {
diff --git a/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 52c88fd0218d6..c9ebd3b4a6517 100644
--- a/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/AArch64TargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 Target &llvm::getTheAArch64leTarget() {
@@ -31,7 +32,8 @@ Target &llvm::getTheARM64_32Target() {
   return TheARM64_32Target;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64TargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAArch64TargetInfo() {
   // Now register the "arm64" name for use with "-march". We don't want it to
   // take possession of the Triple::aarch64 tags though.
   TargetRegistry::RegisterTarget(getTheARM64Target(), "arm64",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 491314daf2d81..84b0f98554097 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -43,6 +43,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/TargetParser/TargetParser.h"
@@ -83,7 +84,8 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm,
   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUAsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
                                      llvm::createR600AsmPrinterPass);
   TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d59087839b0e1..f390d39043ed5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -89,6 +89,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO.h"
@@ -481,7 +482,7 @@ static cl::opt<bool> HasClosedWorldAssumption(
     cl::desc("Whether has closed-world assumption at link time"),
     cl::init(false), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
   RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 0dc1d13773229..30dcd6d81f16d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -39,6 +39,7 @@
 #include "llvm/Support/AMDGPUMetadata.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/TargetParser/TargetParser.h"
 #include <optional>
@@ -9800,7 +9801,8 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
 }
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUAsmParser() {
   RegisterMCAsmParser<AMDGPUAsmParser> A(getTheR600Target());
   RegisterMCAsmParser<AMDGPUAsmParser> B(getTheGCNTarget());
 }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ca0093d1f049c..349e408b79658 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -34,6 +34,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -2648,7 +2649,8 @@ static MCDisassembler *createAMDGPUDisassembler(const Target &T,
   return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo());
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUDisassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheGCNTarget(),
                                          createAMDGPUDisassembler);
   TargetRegistry::RegisterMCSymbolizer(getTheGCNTarget(),
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index 2768e0c23cf01..b8f43c4550b7e 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -16,6 +16,7 @@
 #include "TargetInfo/AMDGPUTargetInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/WithColor.h"
 
 namespace llvm::mca {
@@ -353,7 +354,8 @@ createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
 
 /// Extern function to initialize the targets for the AMDGPU backend
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUTargetMCA() {
   TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                           createAMDGPUCustomBehaviour);
   TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index c692895d84c00..d66725d3a6c4b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -31,6 +31,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -156,7 +157,8 @@ static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) {
   return new AMDGPUMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUTargetMC() {
 
   TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo);
   TargetRegistry::RegisterMCInstrInfo(getTheR600Target(),
diff --git a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index 98fd16e59bf1f..ad547556cf150 100644
--- a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "TargetInfo/AMDGPUTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -28,7 +29,8 @@ Target &llvm::getTheGCNTarget() {
 }
 
 /// Extern function to initialize the targets for the AMDGPU backend
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAMDGPUTargetInfo() {
   RegisterTarget<Triple::r600, false> R600(getTheR600Target(), "r600",
                                            "AMD GPUs HD2XXX-HD6XXX", "AMDGPU");
   RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn",
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index fa14370025515..1443747709b7a 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -40,6 +40,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
@@ -2456,7 +2457,8 @@ INITIALIZE_PASS(ARMAsmPrinter, "arm-asm-printer", "ARM Assembly Printer", false,
 //===----------------------------------------------------------------------===//
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeARMAsmPrinter() {
   RegisterAsmPrinter<ARMAsmPrinter> X(getTheARMLETarget());
   RegisterAsmPrinter<ARMAsmPrinter> Y(getTheARMBETarget());
   RegisterAsmPrinter<ARMAsmPrinter> A(getTheThumbLETarget());
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 563e69a65ab3b..fee77a44e5e80 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -41,6 +41,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
@@ -82,7 +83,7 @@ namespace llvm {
   void initializeARMExecutionDomainFixPass(PassRegistry&);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
   // Register the target.
   RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
   RegisterTargetMachine<ARMLETargetMachine> A(getTheThumbLETarget());
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index f3bdcd64805d8..25f0273013373 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -12722,7 +12722,7 @@ bool ARMAsmParser::parseDirectiveSEHCustom(SMLoc L) {
 }
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMAsmParser() {
   RegisterMCAsmParser<ARMAsmParser> X(getTheARMLETarget());
   RegisterMCAsmParser<ARMAsmParser> Y(getTheARMBETarget());
   RegisterMCAsmParser<ARMAsmParser> A(getTheThumbLETarget());
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index ef30b1aafb28b..5f930fb0c8071 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -1269,7 +1269,8 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
   return MCDisassembler::Fail;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeARMDisassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
                                          createARMDisassembler);
   TargetRegistry::RegisterMCDisassembler(getTheARMBETarget(),
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index c756bff3b501a..2d22b27ceb131 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
 
@@ -770,7 +771,7 @@ bool ARM::isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() {
   for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
                     &getTheThumbLETarget(), &getTheThumbBETarget()}) {
     // Register the MC asm info.
diff --git a/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index 4d514f3ca4442..3e3670d4e0192 100644
--- a/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -8,6 +8,8 @@
 
 #include "TargetInfo/ARMTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
+
 using namespace llvm;
 
 Target &llvm::getTheARMLETarget() {
@@ -27,7 +29,8 @@ Target &llvm::getTheThumbBETarget() {
   return TheThumbBETarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeARMTargetInfo() {
   RegisterTarget<Triple::arm, /*HasJIT=*/true> X(getTheARMLETarget(), "arm",
                                                  "ARM", "ARM");
   RegisterTarget<Triple::armeb, /*HasJIT=*/true> Y(getTheARMBETarget(), "armeb",
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 1a1e5155979e6..ad8aa5717fb42 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -33,6 +33,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -335,6 +336,7 @@ char AVRAsmPrinter::ID = 0;
 INITIALIZE_PASS(AVRAsmPrinter, "avr-asm-printer", "AVR Assembly Printer", false,
                 false)
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAVRAsmPrinter() {
   llvm::RegisterAsmPrinter<AVRAsmPrinter> X(getTheAVRTarget());
 }
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index 72544b0afd8d2..b75417a0896a5 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -15,6 +15,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 #include "AVR.h"
 #include "AVRMachineFunctionInfo.h"
@@ -87,7 +88,7 @@ void AVRPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTarget() {
   // Register the target.
   RegisterTargetMachine<AVRTargetMachine> X(getTheAVRTarget());
 
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index e82bd761eeb39..012cf2c70e2e5 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -731,7 +732,7 @@ ParseStatus AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
   return (parseMany(parseOne));
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRAsmParser() {
   RegisterMCAsmParser<AVRAsmParser> X(getTheAVRTarget());
 }
 
diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index 70428673fcd8d..c7a584868f4e6 100644
--- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -23,6 +23,8 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
 
+#include "llvm/Support/Compiler.h"
+
 using namespace llvm;
 
 #define DEBUG_TYPE "avr-disassembler"
@@ -50,7 +52,8 @@ static MCDisassembler *createAVRDisassembler(const Target &T,
   return new AVRDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAVRDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheAVRTarget(),
                                          createAVRDisassembler);
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index f87fb70f97ff0..d29a7a56167c9 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 #define GET_INSTRINFO_MC_DESC
 #define ENABLE_INSTR_PREDICATE_VERIFIER
@@ -87,7 +88,7 @@ static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
   return new AVRTargetAsmStreamer(S);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfo<AVRMCAsmInfo> X(getTheAVRTarget());
 
diff --git a/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
index dd61add1526cf..d81db50650ba7 100644
--- a/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
+++ b/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/AVRTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 namespace llvm {
 Target &getTheAVRTarget() {
   static Target TheAVRTarget;
@@ -15,7 +16,8 @@ Target &getTheAVRTarget() {
 }
 } // namespace llvm
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeAVRTargetInfo() {
   llvm::RegisterTarget<llvm::Triple::avr> X(llvm::getTheAVRTarget(), "avr",
                                             "Atmel AVR Microcontroller", "AVR");
 }
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 7d1819134d162..b49e8fd96c66a 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -532,7 +533,7 @@ bool BPFAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name,
   return false;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmParser() {
   RegisterMCAsmParser<BPFAsmParser> X(getTheBPFTarget());
   RegisterMCAsmParser<BPFAsmParser> Y(getTheBPFleTarget());
   RegisterMCAsmParser<BPFAsmParser> Z(getTheBPFbeTarget());
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index 5dd71cc91427a..e3843e0e112e2 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -155,7 +156,8 @@ INITIALIZE_PASS(BPFAsmPrinter, "bpf-asm-printer", "BPF Assembly Printer", false,
                 false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeBPFAsmPrinter() {
   RegisterAsmPrinter<BPFAsmPrinter> X(getTheBPFleTarget());
   RegisterAsmPrinter<BPFAsmPrinter> Y(getTheBPFbeTarget());
   RegisterAsmPrinter<BPFAsmPrinter> Z(getTheBPFTarget());
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 0c3f61fdfedd6..527a480354571 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -26,6 +26,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
@@ -41,7 +42,7 @@ static cl::opt<bool>
     DisableCheckUnreachable("bpf-disable-trap-unreachable", cl::Hidden,
                             cl::desc("Disable Trap Unreachable for BPF"));
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
   // Register the target.
   RegisterTargetMachine<BPFTargetMachine> X(getTheBPFleTarget());
   RegisterTargetMachine<BPFTargetMachine> Y(getTheBPFbeTarget());
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 536bee5393843..4dfae81e90191 100644
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
 #include <cstdint>
@@ -82,8 +83,8 @@ static MCDisassembler *createBPFDisassembler(const Target &T,
   return new BPFDisassembler(STI, Ctx);
 }
 
-
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeBPFDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheBPFTarget(),
                                          createBPFDisassembler);
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index caf84701b999f..5f44dd9583aff 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Host.h"
 
 #define GET_INSTRINFO_MC_DESC
@@ -104,7 +105,7 @@ static MCInstrAnalysis *createBPFInstrAnalysis(const MCInstrInfo *Info) {
   return new BPFMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() {
   for (Target *T :
        {&getTheBPFleTarget(), &getTheBPFbeTarget(), &getTheBPFTarget()}) {
     // Register the MC asm info.
@@ -153,5 +154,4 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetMC() {
     TargetRegistry::RegisterMCAsmBackend(getTheBPFTarget(),
                                          createBPFbeAsmBackend);
   }
-
 }
diff --git a/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp b/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
index d7cdcae916aaf..6ea6cd56a6d05 100644
--- a/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
+++ b/llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/BPFTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -24,7 +25,8 @@ Target &llvm::getTheBPFTarget() {
   return TheBPFTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeBPFTargetInfo() {
   TargetRegistry::RegisterTarget(getTheBPFTarget(), "bpf", "BPF (host endian)",
                                  "BPF", [](Triple::ArchType) { return false; },
                                  true);
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index f9b4bc0d14fd9..c423dca90a4ab 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -41,6 +41,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
@@ -878,7 +879,8 @@ bool HexagonAsmParser::RegisterMatchesArch(MCRegister MatchNum) const {
 // extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmLexer();
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonAsmParser() {
   RegisterMCAsmParser<HexagonAsmParser> X(getTheHexagonTarget());
 }
 
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 98b711f6b014b..5bd31707acb6f 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
@@ -164,7 +165,8 @@ static MCDisassembler *createHexagonDisassembler(const Target &T,
   return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo());
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonDisassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheHexagonTarget(),
                                          createHexagonDisassembler);
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index c7580d28618ab..f22852d1ef557 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -43,6 +43,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
@@ -858,6 +859,7 @@ char HexagonAsmPrinter::ID = 0;
 INITIALIZE_PASS(HexagonAsmPrinter, "hexagon-asm-printer",
                 "Hexagon Assembly Printer", false, false)
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonAsmPrinter() {
   RegisterAsmPrinter<HexagonAsmPrinter> X(getTheHexagonTarget());
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 19b7c6a315f56..66508fd767793 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -26,6 +26,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Transforms/Scalar.h"
 #include <optional>
 
@@ -174,7 +175,8 @@ static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
   return RM.value_or(Reloc::Static);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonTarget() {
   // Register the target.
   RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
 
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 91051cd4e2d51..980df819b2c26 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -34,6 +34,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/HexagonAttributes.h"
 #include "llvm/Support/raw_ostream.h"
@@ -775,7 +776,8 @@ static MCInstrAnalysis *createHexagonMCInstrAnalysis(const MCInstrInfo *Info) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(getTheHexagonTarget(), createHexagonMCAsmInfo);
 
diff --git a/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp b/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
index ef9f9fd337fac..34a7b945ca516 100644
--- a/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
+++ b/llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/HexagonTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheHexagonTarget() {
@@ -15,7 +16,8 @@ Target &llvm::getTheHexagonTarget() {
   return TheHexagonTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeHexagonTargetInfo() {
   RegisterTarget<Triple::hexagon, /*HasJIT=*/true> X(
       getTheHexagonTarget(), "hexagon", "Hexagon", "Hexagon");
 }
diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 9cb7f71945d1d..6a74686a239d0 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
@@ -1223,6 +1224,7 @@ bool LanaiAsmParser::parseInstruction(ParseInstructionInfo & /*Info*/,
 #define GET_MATCHER_IMPLEMENTATION
 #include "LanaiGenAsmMatcher.inc"
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiAsmParser() {
   RegisterMCAsmParser<LanaiAsmParser> x(getTheLanaiTarget());
 }
diff --git a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
index 2720e1d9a6a64..5d87c3c4d72cf 100644
--- a/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
+++ b/llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -35,7 +36,8 @@ static MCDisassembler *createLanaiDisassembler(const Target & /*T*/,
   return new LanaiDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiDisassembler() {
   // Register the disassembler
   TargetRegistry::RegisterMCDisassembler(getTheLanaiTarget(),
                                          createLanaiDisassembler);
diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
index 1c4fc572243c5..24e4fc3f53e63 100644
--- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
+++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "asm-printer"
@@ -242,6 +243,7 @@ INITIALIZE_PASS(LanaiAsmPrinter, "lanai-asm-printer", "Lanai Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiAsmPrinter() {
   RegisterAsmPrinter<LanaiAsmPrinter> X(getTheLanaiTarget());
 }
diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
index 7f94e778e7545..3d6ba9ecc55e2 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -21,12 +21,13 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include <optional>
 
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() {
   // Register the target.
   RegisterTargetMachine<LanaiTargetMachine> registered_target(
       getTheLanaiTarget());
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index 4a381c033b384..687386c6962be 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -22,6 +22,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
 #include <cstdint>
@@ -126,7 +127,8 @@ static MCInstrAnalysis *createLanaiInstrAnalysis(const MCInstrInfo *Info) {
   return new LanaiMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfo<LanaiMCAsmInfo> X(getTheLanaiTarget());
 
diff --git a/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp b/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
index 5c63df670938f..f56591a45f8f8 100644
--- a/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
+++ b/llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/LanaiTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -16,7 +17,8 @@ Target &llvm::getTheLanaiTarget() {
   return TheLanaiTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLanaiTargetInfo() {
   RegisterTarget<Triple::lanai> X(getTheLanaiTarget(), "lanai", "Lanai",
                                   "Lanai");
 }
diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
index 7d58270089575..a8fed951b0cfa 100644
--- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
+++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -1953,7 +1954,8 @@ ParseStatus LoongArchAsmParser::parseDirective(AsmToken DirectiveID) {
   return ParseStatus::NoMatch;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchAsmParser() {
   RegisterMCAsmParser<LoongArchAsmParser> X(getTheLoongArch32Target());
   RegisterMCAsmParser<LoongArchAsmParser> Y(getTheLoongArch64Target());
 }
diff --git a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp
index 761682423fffe..8c4668ec70c7e 100644
--- a/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp
+++ b/llvm/lib/Target/LoongArch/Disassembler/LoongArchDisassembler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -45,7 +46,8 @@ static MCDisassembler *createLoongArchDisassembler(const Target &T,
   return new LoongArchDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchDisassembler() {
   // Register the disassembler for each target.
   TargetRegistry::RegisterMCDisassembler(getTheLoongArch32Target(),
                                          createLoongArchDisassembler);
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
index 64ac7c03c0419..b757d123fa0ff 100644
--- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -309,7 +310,8 @@ INITIALIZE_PASS(LoongArchAsmPrinter, "loongarch-asm-printer",
                 "LoongArch Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchAsmPrinter() {
   RegisterAsmPrinter<LoongArchAsmPrinter> X(getTheLoongArch32Target());
   RegisterAsmPrinter<LoongArchAsmPrinter> Y(getTheLoongArch64Target());
 }
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index d63e5a2b50e84..c36db9c75dd3a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Transforms/Scalar.h"
 #include <optional>
 
@@ -29,7 +30,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loongarch"
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchTarget() {
   // Register the target.
   RegisterTargetMachine<LoongArchTargetMachine> X(getTheLoongArch32Target());
   RegisterTargetMachine<LoongArchTargetMachine> Y(getTheLoongArch64Target());
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
index 3ec070e5cbdd3..35277ce094a7d 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp
@@ -204,7 +204,8 @@ MCStreamer *createLoongArchELFStreamer(const Triple &T, MCContext &Context,
 }
 } // end namespace
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchTargetMC() {
   for (Target *T : {&getTheLoongArch32Target(), &getTheLoongArch64Target()}) {
     TargetRegistry::RegisterMCRegInfo(*T, createLoongArchMCRegisterInfo);
     TargetRegistry::RegisterMCInstrInfo(*T, createLoongArchMCInstrInfo);
diff --git a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
index 1d6be4069b71e..a7a5c25de3233 100644
--- a/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
+++ b/llvm/lib/Target/LoongArch/TargetInfo/LoongArchTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/LoongArchTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheLoongArch32Target() {
@@ -20,7 +21,8 @@ Target &llvm::getTheLoongArch64Target() {
   return TheLoongArch64Target;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeLoongArchTargetInfo() {
   RegisterTarget<Triple::loongarch32, /*HasJIT=*/false> X(
       getTheLoongArch32Target(), "loongarch32", "32-bit LoongArch",
       "LoongArch");
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index c61b8adf89ab4..5a4121f7cafd7 100644
--- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "msp430-asm-parser"
@@ -534,7 +535,8 @@ bool MSP430AsmParser::ParseLiteralValues(unsigned Size, SMLoc L) {
   return (parseMany(parseOne));
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430AsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430AsmParser() {
   RegisterMCAsmParser<MSP430AsmParser> X(getTheMSP430Target());
 }
 
diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
index 519bba763204f..4c5b473982f77 100644
--- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
+++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -57,7 +58,8 @@ static MCDisassembler *createMSP430Disassembler(const Target &T,
   return new MSP430Disassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Disassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430Disassembler() {
   TargetRegistry::RegisterMCDisassembler(getTheMSP430Target(),
                                          createMSP430Disassembler);
 }
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index df182a5459ead..2cb515aef11e4 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -80,7 +81,8 @@ static MCInstPrinter *createMSP430MCInstPrinter(const Triple &T,
   return nullptr;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430TargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430TargetMC() {
   Target &T = getTheMSP430Target();
 
   TargetRegistry::RegisterMCAsmInfo(T, createMSP430MCAsmInfo);
diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 44e55b6a3c9b7..44eea8149c594 100644
--- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -189,6 +190,7 @@ INITIALIZE_PASS(MSP430AsmPrinter, "msp430-asm-printer",
                 "MSP430 Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430AsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430AsmPrinter() {
   RegisterAsmPrinter<MSP430AsmPrinter> X(getTheMSP430Target());
 }
diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
index 763a2db2baca7..e6024f4a62185 100644
--- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -18,10 +18,11 @@
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include <optional>
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Target() {
   // Register the target.
   RegisterTargetMachine<MSP430TargetMachine> X(getTheMSP430Target());
   PassRegistry &PR = *PassRegistry::getPassRegistry();
diff --git a/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
index fc2b38f41c141..a6170b82e1f49 100644
--- a/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
+++ b/llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/MSP430TargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheMSP430Target() {
@@ -15,7 +16,8 @@ Target &llvm::getTheMSP430Target() {
   return TheMSP430Target;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430TargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMSP430TargetInfo() {
   RegisterTarget<Triple::msp430> X(getTheMSP430Target(), "msp430",
                                    "MSP430 [experimental]", "MSP430");
 }
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 071c016b92e7f..b559a8b896e0f 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -8906,7 +8906,8 @@ bool MipsAsmParser::parseInternalDirectiveReallowModule() {
   return false;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsAsmParser() {
   RegisterMCAsmParser<MipsAsmParser> X(getTheMipsTarget());
   RegisterMCAsmParser<MipsAsmParser> Y(getTheMipselTarget());
   RegisterMCAsmParser<MipsAsmParser> A(getTheMips64Target());
diff --git a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 2a3a8eac2e9af..b3f6cd1609fbb 100644
--- a/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -503,7 +503,8 @@ static MCDisassembler *createMipselDisassembler(
   return new MipsDisassembler(STI, Ctx, false);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheMipsTarget(),
                                          createMipsDisassembler);
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index add36d87b9eff..29f61ed9b2b83 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -29,6 +29,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/TargetParser/Triple.h"
@@ -259,7 +260,7 @@ static MCInstrAnalysis *createMipsMCInstrAnalysis(const MCInstrInfo *Info) {
   return new MipsMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetMC() {
   for (Target *T : {&getTheMipsTarget(), &getTheMipselTarget(),
                     &getTheMips64Target(), &getTheMips64elTarget()}) {
     // Register the MC asm info.
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index a6300a9c11d49..87e06a6d3c08a 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -55,6 +55,7 @@
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -1299,7 +1300,8 @@ INITIALIZE_PASS(MipsAsmPrinter, "mips-asm-printer", "Mips Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsAsmPrinter() {
   RegisterAsmPrinter<MipsAsmPrinter> X(getTheMipsTarget());
   RegisterAsmPrinter<MipsAsmPrinter> Y(getTheMipselTarget());
   RegisterAsmPrinter<MipsAsmPrinter> A(getTheMips64Target());
diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index 30b4d506c5caa..8c519fa379dd8 100644
--- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -37,6 +37,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
@@ -51,7 +52,7 @@ static cl::opt<bool>
     EnableMulMulFix("mfix4300", cl::init(false),
                     cl::desc("Enable the VR4300 mulmul bug fix."), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() {
   // Register the target.
   RegisterTargetMachine<MipsebTargetMachine> X(getTheMipsTarget());
   RegisterTargetMachine<MipselTargetMachine> Y(getTheMipselTarget());
diff --git a/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
index db5f607bbb4f5..458032042e15f 100644
--- a/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
+++ b/llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/MipsTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheMipsTarget() {
@@ -27,7 +28,8 @@ Target &llvm::getTheMips64elTarget() {
   return TheMips64elTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeMipsTargetInfo() {
   RegisterTarget<Triple::mips,
                  /*HasJIT=*/true>
       X(getTheMipsTarget(), "mips", "MIPS (32-bit big endian)", "Mips");
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 1cafd236a2925..cb7132b5f3042 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -71,7 +72,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeNVPTXTargetMC() {
   for (Target *T : {&getTheNVPTXTarget32(), &getTheNVPTXTarget64()}) {
     // Register the MC asm info.
     RegisterMCAsmInfo<NVPTXMCAsmInfo> X(*T);
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index b4e2c46b94440..9af6fb2cb198e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -77,6 +77,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/NativeFormatting.h"
@@ -1947,7 +1948,8 @@ INITIALIZE_PASS(NVPTXAsmPrinter, "nvptx-asm-printer", "NVPTX Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeNVPTXAsmPrinter() {
   RegisterAsmPrinter<NVPTXAsmPrinter> X(getTheNVPTXTarget32());
   RegisterAsmPrinter<NVPTXAsmPrinter> Y(getTheNVPTXTarget64());
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 85d28a703a4cb..ef310e5828f22 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -30,6 +30,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
@@ -87,7 +88,7 @@ static cl::opt<bool> EarlyByValArgsCopy(
     cl::desc("Create a copy of byval function arguments early."),
     cl::init(false), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   // Register the target.
   RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
   RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
diff --git a/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp b/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
index e4f0a517599fc..24fea037b1c54 100644
--- a/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
+++ b/llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/NVPTXTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheNVPTXTarget32() {
@@ -19,7 +20,8 @@ Target &llvm::getTheNVPTXTarget64() {
   return TheNVPTXTarget64;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeNVPTXTargetInfo() {
   RegisterTarget<Triple::nvptx> X(getTheNVPTXTarget32(), "nvptx",
                                   "NVIDIA PTX 32-bit", "NVPTX");
   RegisterTarget<Triple::nvptx64> Y(getTheNVPTXTarget64(), "nvptx64",
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index bb4c2fd3e5cf8..2b3727be644da 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -24,6 +24,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -1784,7 +1785,8 @@ bool PPCAsmParser::parseGNUAttribute(SMLoc L) {
 }
 
 /// Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCAsmParser() {
   RegisterMCAsmParser<PPCAsmParser> A(getThePPC32Target());
   RegisterMCAsmParser<PPCAsmParser> B(getThePPC32LETarget());
   RegisterMCAsmParser<PPCAsmParser> C(getThePPC64Target());
diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 0c6c17d5a0b68..71a76142bb389 100644
--- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -13,6 +13,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -50,7 +51,8 @@ static MCDisassembler *createPPCLEDisassembler(const Target &T,
   return new PPCDisassembler(STI, Ctx, /*IsLittleEndian=*/true);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCDisassembler() {
   // Register the disassembler for each target.
   TargetRegistry::RegisterMCDisassembler(getThePPC32Target(),
                                          createPPCDisassembler);
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 44b5732be6e3e..dd2756a1a8238 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -41,6 +41,7 @@
 #include "llvm/MC/MCXCOFFObjectWriter.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
@@ -473,7 +474,8 @@ static MCInstrAnalysis *createPPCMCInstrAnalysis(const MCInstrInfo *Info) {
   return new PPCMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCTargetMC() {
   for (Target *T : {&getThePPC32Target(), &getThePPC32LETarget(),
                     &getThePPC64Target(), &getThePPC64LETarget()}) {
     // Register the MC asm info.
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index d5d51e3ca6386..9e42011c0c746 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -63,6 +63,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -3374,7 +3375,8 @@ INITIALIZE_PASS(PPCAIXAsmPrinter, "ppc-aix-asm-printer",
                 "AIX PPC Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCAsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(getThePPC32Target(),
                                      createPPCAsmPrinterPass);
   TargetRegistry::RegisterAsmPrinter(getThePPC32LETarget(),
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 359a43dd001d2..b5c6ac111dff0 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
@@ -113,7 +114,8 @@ static cl::opt<unsigned>
                          cl::init(0x7fff),
                          cl::desc("Maximum global merge offset"));
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
   RegisterTargetMachine<PPCTargetMachine> B(getThePPC32LETarget());
diff --git a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index 0bfa0bd5ec0e7..982be2746b47b 100644
--- a/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/PowerPCTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getThePPC32Target() {
@@ -27,7 +28,8 @@ Target &llvm::getThePPC64LETarget() {
   return ThePPC64LETarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializePowerPCTargetInfo() {
   RegisterTarget<Triple::ppc, /*HasJIT=*/true> W(getThePPC32Target(), "ppc32",
                                                  "PowerPC 32", "PPC");
 
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 510ca5f8c0d92..f1d6f99ba9815 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -36,6 +36,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/RISCVAttributes.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
@@ -4021,7 +4022,8 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
   return false;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVAsmParser() {
   RegisterMCAsmParser<RISCVAsmParser> X(getTheRISCV32Target());
   RegisterMCAsmParser<RISCVAsmParser> Y(getTheRISCV64Target());
 }
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 4363e5c5176c9..cbab081a6731e 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -64,7 +65,8 @@ static MCDisassembler *createRISCVDisassembler(const Target &T,
   return new RISCVDisassembler(STI, Ctx, T.createMCInstrInfo());
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVDisassembler() {
   // Register the disassembler for each target.
   TargetRegistry::RegisterMCDisassembler(getTheRISCV32Target(),
                                          createRISCVDisassembler);
diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
index e3b89d84a134b..ae44306170758 100644
--- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
+++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp
@@ -16,6 +16,7 @@
 #include "RISCV.h"
 #include "TargetInfo/RISCVTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "llvm-mca-riscv-custombehaviour"
@@ -344,7 +345,8 @@ createRISCVInstrumentManager(const MCSubtargetInfo &STI,
 }
 
 /// Extern function to initialize the targets for the RISC-V backend
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMCA() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVTargetMCA() {
   TargetRegistry::RegisterInstrumentManager(getTheRISCV32Target(),
                                             createRISCVInstrumentManager);
   TargetRegistry::RegisterInstrumentManager(getTheRISCV64Target(),
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index f3b93f032588c..f66c2d5f99cb3 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -28,6 +28,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <bitset>
 
@@ -331,7 +332,8 @@ static MCInstrAnalysis *createRISCVInstrAnalysis(const MCInstrInfo *Info) {
   return new RISCVMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVTargetMC() {
   for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
     TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
     TargetRegistry::RegisterMCObjectFileInfo(*T, createRISCVMCObjectFileInfo);
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index 83e9b4b4d7c5c..d4d7de289a107 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -38,6 +38,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetParser/RISCVISAInfo.h"
 #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
@@ -610,7 +611,8 @@ void RISCVAsmPrinter::emitFunctionEntryLabel() {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVAsmPrinter() {
   RegisterAsmPrinter<RISCVAsmPrinter> X(getTheRISCV32Target());
   RegisterAsmPrinter<RISCVAsmPrinter> Y(getTheRISCV64Target());
 }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 0bea3bc432b66..b43b915d0ad4f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -34,6 +34,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
@@ -118,7 +119,7 @@ static cl::opt<bool>
                            cl::desc("Enable Machine Pipeliner for RISC-V"),
                            cl::init(false), cl::Hidden);
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
   auto *PR = PassRegistry::getPassRegistry();
diff --git a/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp b/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
index 0a675d6849122..fc0965d263a8a 100644
--- a/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
+++ b/llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/RISCVTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheRISCV32Target() {
@@ -20,7 +21,8 @@ Target &llvm::getTheRISCV64Target() {
   return TheRISCV64Target;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeRISCVTargetInfo() {
   RegisterTarget<Triple::riscv32, /*HasJIT=*/true> X(
       getTheRISCV32Target(), "riscv32", "32-bit RISC-V", "RISCV");
   RegisterTarget<Triple::riscv64, /*HasJIT=*/true> Y(
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
index 21a952649ff51..cc77ddd748a94 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVMCTargetDesc.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 #define GET_INSTRINFO_MC_DESC
 #define ENABLE_INSTR_PREDICATE_VERIFIER
@@ -78,7 +79,8 @@ static MCInstrAnalysis *createSPIRVInstrAnalysis(const MCInstrInfo *Info) {
   return new SPIRVMCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSPIRVTargetMC() {
   for (Target *T : {&getTheSPIRV32Target(), &getTheSPIRV64Target(),
                     &getTheSPIRVLogicalTarget()}) {
     RegisterMCAsmInfo<SPIRVMCAsmInfo> X(*T);
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 26b94788b810e..1ebfde2a603b9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -35,6 +35,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -657,7 +658,8 @@ INITIALIZE_PASS(SPIRVAsmPrinter, "spirv-asm-printer", "SPIRV Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSPIRVAsmPrinter() {
   RegisterAsmPrinter<SPIRVAsmPrinter> X(getTheSPIRV32Target());
   RegisterAsmPrinter<SPIRVAsmPrinter> Y(getTheSPIRV64Target());
   RegisterAsmPrinter<SPIRVAsmPrinter> Z(getTheSPIRVLogicalTarget());
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index 82fe23a22b60f..d7cf211ba84dc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -28,6 +28,7 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Pass.h"
 #include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils.h"
@@ -35,7 +36,7 @@
 
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() {
   // Register the target.
   RegisterTargetMachine<SPIRVTargetMachine> X(getTheSPIRV32Target());
   RegisterTargetMachine<SPIRVTargetMachine> Y(getTheSPIRV64Target());
diff --git a/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp
index febefc0249204..c4d086d7da5c9 100644
--- a/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp
+++ b/llvm/lib/Target/SPIRV/TargetInfo/SPIRVTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/SPIRVTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -24,7 +25,8 @@ Target &llvm::getTheSPIRVLogicalTarget() {
   return TheSPIRVLogicalTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSPIRVTargetInfo() {
   RegisterTarget<Triple::spirv32> X(getTheSPIRV32Target(), "spirv32",
                                     "SPIR-V 32-bit", "SPIRV");
   RegisterTarget<Triple::spirv64> Y(getTheSPIRV64Target(), "spirv64",
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 28ae349031669..f1009999dc1b7 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -29,6 +29,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SMLoc.h"
@@ -1750,7 +1751,8 @@ bool SparcAsmParser::isPossibleExpression(const AsmToken &Token) {
   }
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcAsmParser() {
   RegisterMCAsmParser<SparcAsmParser> A(getTheSparcTarget());
   RegisterMCAsmParser<SparcAsmParser> B(getTheSparcV9Target());
   RegisterMCAsmParser<SparcAsmParser> C(getTheSparcelTarget());
diff --git a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 173fe3df0d95a..fab94fb4d40ca 100644
--- a/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -18,6 +18,7 @@
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -46,8 +47,8 @@ static MCDisassembler *createSparcDisassembler(const Target &T,
   return new SparcDisassembler(STI, Ctx);
 }
 
-
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheSparcTarget(),
                                          createSparcDisassembler);
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 72f9b3bcd9681..fa07578e512b5 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 
 namespace llvm {
@@ -107,7 +108,8 @@ static MCInstPrinter *createSparcMCInstPrinter(const Triple &T,
   return new SparcInstPrinter(MAI, MII, MRI);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(getTheSparcTarget(), createSparcMCAsmInfo);
   RegisterMCAsmInfoFn Y(getTheSparcV9Target(), createSparcV9MCAsmInfo);
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 5366e905d6df0..8e7e2e5f73709 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -32,6 +32,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -506,7 +507,8 @@ INITIALIZE_PASS(SparcAsmPrinter, "sparc-asm-printer", "Sparc Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcAsmPrinter() {
   RegisterAsmPrinter<SparcAsmPrinter> X(getTheSparcTarget());
   RegisterAsmPrinter<SparcAsmPrinter> Y(getTheSparcV9Target());
   RegisterAsmPrinter<SparcAsmPrinter> Z(getTheSparcelTarget());
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index d4d8cbb044dec..52076a6b4dd22 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -18,10 +18,11 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include <optional>
 using namespace llvm;
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTarget() {
   // Register the target.
   RegisterTargetMachine<SparcV8TargetMachine> X(getTheSparcTarget());
   RegisterTargetMachine<SparcV9TargetMachine> Y(getTheSparcV9Target());
diff --git a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index 1f8837eb01949..2bfcffbd4fd0b 100644
--- a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/SparcTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheSparcTarget() {
@@ -23,7 +24,8 @@ Target &llvm::getTheSparcelTarget() {
   return TheSparcelTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSparcTargetInfo() {
   RegisterTarget<Triple::sparc, /*HasJIT=*/false> X(getTheSparcTarget(),
                                                     "sparc", "Sparc", "Sparc");
   RegisterTarget<Triple::sparcv9, /*HasJIT=*/false> Y(
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 6ee2a87565baa..04a4c36109246 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -30,6 +30,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/TargetParser/SubtargetFeature.h"
@@ -1784,6 +1785,7 @@ bool SystemZAsmParser::isLabel(AsmToken &Token) {
 
 // Force static initialization.
 // NOLINTNEXTLINE(readability-identifier-naming)
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZAsmParser() {
   RegisterMCAsmParser<SystemZAsmParser> X(getTheSystemZTarget());
 }
diff --git a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index bd188f5b4b520..6ae529e974186 100644
--- a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -13,6 +13,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
 #include <cstdint>
@@ -45,7 +46,8 @@ static MCDisassembler *createSystemZDisassembler(const Target &T,
 }
 
 // NOLINTNEXTLINE(readability-identifier-naming)
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheSystemZTarget(),
                                          createSystemZDisassembler);
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index d2ed5cac5c576..86e340b7ff1bd 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -239,7 +240,8 @@ static MCInstrAnalysis *createSystemZMCInstrAnalysis(const MCInstrInfo *Info) {
   return new MCInstrAnalysis(Info);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZTargetMC() {
   // Register the MCAsmInfo.
   TargetRegistry::RegisterMCAsmInfo(getTheSystemZTarget(),
                                     createSystemZMCAsmInfo);
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index aaf12b88de132..6f9d25c050b71 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -31,6 +31,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Chrono.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ConvertEBCDIC.h"
 #include "llvm/Support/FormatVariadic.h"
 
@@ -1738,6 +1739,7 @@ INITIALIZE_PASS(SystemZAsmPrinter, "systemz-asm-printer",
                 "SystemZ Assembly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZAsmPrinter() {
   RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index ddb5a730a6fd3..ece8928accd0c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Transforms/Scalar.h"
 #include <memory>
@@ -36,7 +37,8 @@ static cl::opt<bool> EnableMachineCombinerPass(
     cl::init(true), cl::Hidden);
 
 // NOLINTNEXTLINE(readability-identifier-naming)
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZTarget() {
   // Register the target.
   RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
   auto &PR = *PassRegistry::getPassRegistry();
diff --git a/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
index 91e4c91b00b9d..703051f6f2d3c 100644
--- a/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
+++ b/llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/SystemZTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -17,7 +18,8 @@ Target &llvm::getTheSystemZTarget() {
 }
 
 // NOLINTNEXTLINE(readability-identifier-naming)
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeSystemZTargetInfo() {
   RegisterTarget<Triple::systemz, /*HasJIT=*/true> X(
       getTheSystemZTarget(), "systemz", "SystemZ", "SystemZ");
 }
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index c54ce40de45ff..7987950a2a0aa 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 
@@ -1510,7 +1511,7 @@ ParseStatus VEAsmParser::parseVEAsmOperand(std::unique_ptr<VEOperand> &Op) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmParser() {
   RegisterMCAsmParser<VEAsmParser> A(getTheVETarget());
 }
 
diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
index 00487a1f5bb38..88200c5fc97eb 100644
--- a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
+++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -47,7 +48,8 @@ static MCDisassembler *createVEDisassembler(const Target &T,
   return new VEDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeVEDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheVETarget(),
                                          createVEDisassembler);
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
index 019748413d32e..699ef9808eb88 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
@@ -19,6 +19,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
@@ -84,7 +85,7 @@ static MCInstPrinter *createVEMCInstPrinter(const Triple &T,
   return new VEInstPrinter(MAI, MII, MRI);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(getTheVETarget(), createVEMCAsmInfo);
 
diff --git a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
index 7c4bf1cfd672e..dcc54b4cec01a 100644
--- a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
+++ b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/VETargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -16,7 +17,7 @@ Target &llvm::getTheVETarget() {
   return TheVETarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETargetInfo() {
   RegisterTarget<Triple::ve, /*HasJIT=*/false> X(getTheVETarget(), "ve",
                                                  "VE", "VE");
 }
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index af0dc0404d3cc..f7d770c18f883 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -28,6 +28,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -419,6 +420,6 @@ INITIALIZE_PASS(VEAsmPrinter, "ve-asm-printer", "VE Assembly Printer", false,
                 false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmPrinter() {
   RegisterAsmPrinter<VEAsmPrinter> X(getTheVETarget());
 }
diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp
index 664a54cea7c52..14b8e330d87a4 100644
--- a/llvm/lib/Target/VE/VETargetMachine.cpp
+++ b/llvm/lib/Target/VE/VETargetMachine.cpp
@@ -19,13 +19,14 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include <optional>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "ve"
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVETarget() {
   // Register the target.
   RegisterTargetMachine<VETargetMachine> X(getTheVETarget());
 
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index 9649381f07b14..e4140755edf4e 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -32,6 +32,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/SourceMgr.h"
 
 using namespace llvm;
@@ -1282,7 +1283,8 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser {
 } // end anonymous namespace
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyAsmParser() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyAsmParser() {
   RegisterMCAsmParser<WebAssemblyAsmParser> X(getTheWebAssemblyTarget32());
   RegisterMCAsmParser<WebAssemblyAsmParser> Y(getTheWebAssemblyTarget64());
 }
diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index 0399f9d38e4eb..8a29a5902ce22 100644
--- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -27,6 +27,7 @@
 #include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/LEB128.h"
 
@@ -66,7 +67,7 @@ static MCDisassembler *createWebAssemblyDisassembler(const Target &T,
   return new WebAssemblyDisassembler(STI, Ctx, std::move(MCII));
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
 LLVMInitializeWebAssemblyDisassembler() {
   // Register the disassembler for each target.
   TargetRegistry::RegisterMCDisassembler(getTheWebAssemblyTarget32(),
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index a4162a07ee33f..6c0031f429c6d 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
@@ -124,7 +125,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyTargetMC() {
   for (Target *T :
        {&getTheWebAssemblyTarget32(), &getTheWebAssemblyTarget64()}) {
     // Register the MC asm info.
diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
index ef2c77ade8cc5..e65fa8e60aeb1 100644
--- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -13,6 +13,7 @@
 
 #include "TargetInfo/WebAssemblyTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "wasm-target-info"
@@ -26,7 +27,8 @@ Target &llvm::getTheWebAssemblyTarget64() {
   return TheWebAssemblyTarget64;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyTargetInfo() {
   RegisterTarget<Triple::wasm32> X(getTheWebAssemblyTarget32(), "wasm32",
                                    "WebAssembly 32-bit", "WebAssembly");
   RegisterTarget<Triple::wasm64> Y(getTheWebAssemblyTarget64(), "wasm64",
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index c61ed3c7d5d81..b43b7dbfc36be 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -46,6 +46,7 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -759,7 +760,8 @@ INITIALIZE_PASS(WebAssemblyAsmPrinter, "webassembly-asm-printer",
                 "WebAssembly Assmebly Printer", false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyAsmPrinter() {
   RegisterAsmPrinter<WebAssemblyAsmPrinter> X(getTheWebAssemblyTarget32());
   RegisterAsmPrinter<WebAssemblyAsmPrinter> Y(getTheWebAssemblyTarget64());
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index adb446b20ebf5..6e551e5c8ee4e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -28,6 +28,7 @@
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LowerAtomicPass.h"
@@ -53,7 +54,8 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass(
              " irreducible control flow optimization pass"),
     cl::init(false));
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeWebAssemblyTarget() {
   // Register the target.
   RegisterTargetMachine<WebAssemblyTargetMachine> X(
       getTheWebAssemblyTarget32());
diff --git a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 57801752f170b..d36f18238f7a3 100644
--- a/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 
 using namespace llvm;
 
@@ -750,7 +751,8 @@ static MCDisassembler *createXCoreDisassembler(const Target &T,
   return new XCoreDisassembler(STI, Ctx);
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreDisassembler() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeXCoreDisassembler() {
   // Register the disassembler.
   TargetRegistry::RegisterMCDisassembler(getTheXCoreTarget(),
                                          createXCoreDisassembler);
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 098d874f21490..0ef2da04171e2 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
@@ -125,7 +126,8 @@ static MCTargetStreamer *createNullTargetStreamer(MCStreamer &S) {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTargetMC() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeXCoreTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn X(getTheXCoreTarget(), createXCoreMCAsmInfo);
 
diff --git a/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
index 8916c6ca7be74..556b31eab8b7e 100644
--- a/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
+++ b/llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -8,6 +8,7 @@
 
 #include "TargetInfo/XCoreTargetInfo.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 using namespace llvm;
 
 Target &llvm::getTheXCoreTarget() {
@@ -15,7 +16,8 @@ Target &llvm::getTheXCoreTarget() {
   return TheXCoreTarget;
 }
 
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTargetInfo() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeXCoreTargetInfo() {
   RegisterTarget<Triple::xcore> X(getTheXCoreTarget(), "xcore", "XCore",
                                   "XCore");
 }
diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index b10b3056d82b2..0426088caf244 100644
--- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -33,6 +33,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -295,6 +296,7 @@ INITIALIZE_PASS(XCoreAsmPrinter, "xcore-asm-printer", "XCore Assembly Printer",
                 false, false)
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreAsmPrinter() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void
+LLVMInitializeXCoreAsmPrinter() {
   RegisterAsmPrinter<XCoreAsmPrinter> X(getTheXCoreTarget());
 }
diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
index 3627b81a48055..88f46c38b2f9a 100644
--- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
 #include <optional>
 
 using namespace llvm;
@@ -102,7 +103,7 @@ void XCorePassConfig::addPreEmitPass() {
 }
 
 // Force static initialization.
-extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() {
+extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() {
   RegisterTargetMachine<XCoreTargetMachine> X(getTheXCoreTarget());
   PassRegistry &PR = *PassRegistry::getPassRegistry();
   initializeXCoreAsmPrinterPass(PR);
diff --git a/llvm/unittests/Target/AArch64/CMakeLists.txt b/llvm/unittests/Target/AArch64/CMakeLists.txt
index 67eb508e9bab8..9387ca90dd31a 100644
--- a/llvm/unittests/Target/AArch64/CMakeLists.txt
+++ b/llvm/unittests/Target/AArch64/CMakeLists.txt
@@ -16,6 +16,7 @@ set(LLVM_LINK_COMPONENTS
   GlobalISel
   MC
   MIRParser
+  Passes
   SelectionDAG
   Support
   Target
diff --git a/llvm/unittests/Target/LoongArch/CMakeLists.txt b/llvm/unittests/Target/LoongArch/CMakeLists.txt
index 6e7e49b4cb4e0..c3d33418a03aa 100644
--- a/llvm/unittests/Target/LoongArch/CMakeLists.txt
+++ b/llvm/unittests/Target/LoongArch/CMakeLists.txt
@@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS
   LoongArchCodeGen
   LoongArchDesc
   LoongArchInfo
+  Instrumentation
   MC
   MIRParser
   SelectionDAG
diff --git a/llvm/unittests/Target/RISCV/CMakeLists.txt b/llvm/unittests/Target/RISCV/CMakeLists.txt
index 10d6412f9b358..8da8c3896faf1 100644
--- a/llvm/unittests/Target/RISCV/CMakeLists.txt
+++ b/llvm/unittests/Target/RISCV/CMakeLists.txt
@@ -11,6 +11,7 @@ set(LLVM_LINK_COMPONENTS
   CodeGen
   Core
   MC
+  Passes
   SelectionDAG
   TargetParser
   )
diff --git a/llvm/unittests/Target/SPIRV/CMakeLists.txt b/llvm/unittests/Target/SPIRV/CMakeLists.txt
index d7f0290089c4c..29b31b16094a0 100644
--- a/llvm/unittests/Target/SPIRV/CMakeLists.txt
+++ b/llvm/unittests/Target/SPIRV/CMakeLists.txt
@@ -8,6 +8,7 @@ set(LLVM_LINK_COMPONENTS
   AsmParser
   BinaryFormat
   Core
+  Passes
   SPIRVCodeGen
   SPIRVAnalysis
   Support
diff --git a/llvm/unittests/Target/VE/CMakeLists.txt b/llvm/unittests/Target/VE/CMakeLists.txt
index 271bf07f5b5d7..de823306a9aed 100644
--- a/llvm/unittests/Target/VE/CMakeLists.txt
+++ b/llvm/unittests/Target/VE/CMakeLists.txt
@@ -10,6 +10,7 @@ set(LLVM_LINK_COMPONENTS
   CodeGen
   Core
   GlobalISel
+  Instrumentation
   MC
   SelectionDAG
   Support
diff --git a/llvm/unittests/Target/WebAssembly/CMakeLists.txt b/llvm/unittests/Target/WebAssembly/CMakeLists.txt
index b1e01169e7a06..b1e180d218c1f 100644
--- a/llvm/unittests/Target/WebAssembly/CMakeLists.txt
+++ b/llvm/unittests/Target/WebAssembly/CMakeLists.txt
@@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS
   CodeGen
   CodeGenTypes
   Core
+  Instrumentation
   MC
   MIRParser
   TargetParser

From 7b7b5a397da1ecb9f767df5a3a3b6076cec109f9 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Tue, 17 Jun 2025 13:29:45 -0700
Subject: [PATCH 797/851] [AMDGPU] Remove AsmVOP3OpSel field completely. NFCI.
 (#144574)

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td      | 1 -
 llvm/lib/Target/AMDGPU/VOP1Instructions.td | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e74ccbee975ab..343482604ae55 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2652,7 +2652,6 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
    HasSrc2Mods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
   field string Asm64 = AsmVOP3Base;
   field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasNeg, HasClamp, HasOpSel>.ret;
-  field string AsmVOP3OpSel = AsmVOP3Base;
   field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3Base>.ret;
   field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3Base>.ret;
   field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3Base>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 7fdd951ecbd3c..926df955881e0 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -393,7 +393,6 @@ def VOP_PERMLANE_SWAP : VOPProfile<[i32, i32, untyped, untyped]> {
   let Ins64 = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
   let InsVOP3OpSel = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
   let Asm64 = "$vdst, $src0$bound_ctrl$fi";
-  let AsmVOP3OpSel = "$vdst, $src0$bound_ctrl$fi";
 }
 
 // Special case because there are no true output operands.  Hack vdst

From 8dcf4ba6359578c4d944b75b3f96a1fbd4fb9528 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <rampitec@users.noreply.github.com>
Date: Tue, 17 Jun 2025 13:30:50 -0700
Subject: [PATCH 798/851] [AMDGPU] Fix getAsmVOP3Base call agruments. (#144572)

https://github.com/llvm/llvm-project/pull/143465 has removed
getAsmVOP3OpSel and uses getAsmVOP3Base instead, but original
call to getAsmVOP3OpSel was using HasSrc*FloatMods and the
call to getAsmVOP3Base uses HasSrc*Mods. This does not play
well with opsel. An opsel instruction has modifiers in dag but
shall not have them in the asm string.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td      |  4 ++--
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 ++++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 343482604ae55..768f57c469d64 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2648,8 +2648,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   // the asm operand name via this HasModifiers flag
   field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
   field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
-   HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0Mods, HasSrc1Mods,
-   HasSrc2Mods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
+   HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0FloatMods, HasSrc1FloatMods,
+   HasSrc2FloatMods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
   field string Asm64 = AsmVOP3Base;
   field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasNeg, HasClamp, HasOpSel>.ret;
   field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3Base>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index f372101cb7b77..2dbc119f65cda 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1126,6 +1126,9 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
   let HasModifiers = 0;
   let HasSrc0IntMods = 0;
   let HasSrc1IntMods = 0;
+  let HasSrc0FloatMods = 0;
+  let HasSrc1FloatMods = 0;
+  let HasSrc2FloatMods = 0;
   let HasOMod = 0;
   let HasOpSel = 0;
   let HasClamp = 0;
@@ -1562,9 +1565,12 @@ let SubtargetPredicate = HasPseudoScalarTrans in {
   def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
 }
 
+let HasModifiers = 1 in
+def ASHR_PK_I8_Profile : VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>;
+
 let SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1 in {
-  defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_i8_i32>;
-  defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", VOP3_Profile<VOP_I16_I32_I32_I32, VOP3_OPSEL_ONLY>, int_amdgcn_ashr_pk_u8_i32>;
+  defm V_ASHR_PK_I8_I32 : VOP3Inst<"v_ashr_pk_i8_i32", ASHR_PK_I8_Profile, int_amdgcn_ashr_pk_i8_i32>;
+  defm V_ASHR_PK_U8_I32 : VOP3Inst<"v_ashr_pk_u8_i32", ASHR_PK_I8_Profile, int_amdgcn_ashr_pk_u8_i32>;
 } // End SubtargetPredicate = HasAshrPkInsts, isReMaterializable = 1
 
 class AshrPkI8Pat<VOP3_Pseudo inst, int lo, int hi>: GCNPat<

From 73f307a5ca308d356c557734765742c26bf7ed03 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Tue, 17 Jun 2025 13:32:11 -0700
Subject: [PATCH 799/851] [HLSL] Use ExtVector for firstbit intrinsics
 (#142679)

Fixes https://github.com/llvm/llvm-project/issues/142430

firstbit intrinsics were using the wrong vector type which causes some
conversions to fail. This PR switches them to ExtVector which resolves
the issue
---
 clang/lib/Sema/SemaHLSL.cpp                       | 8 ++++----
 clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl | 8 ++++++++
 clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl  | 8 ++++++++
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 4a8479a00e0e7..b55f4fd786b58 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2231,8 +2231,9 @@ static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall,
                                        QualType ReturnType) {
   auto *VecTyA = TheCall->getArg(0)->getType()->getAs<VectorType>();
   if (VecTyA)
-    ReturnType = S->Context.getVectorType(ReturnType, VecTyA->getNumElements(),
-                                          VectorKind::Generic);
+    ReturnType =
+        S->Context.getExtVectorType(ReturnType, VecTyA->getNumElements());
+
   TheCall->setType(ReturnType);
 }
 
@@ -2545,8 +2546,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
 
     if (auto *VecTy = EltTy->getAs<VectorType>()) {
       EltTy = VecTy->getElementType();
-      ResTy = SemaRef.Context.getVectorType(ResTy, VecTy->getNumElements(),
-                                            VecTy->getVectorKind());
+      ResTy = SemaRef.Context.getExtVectorType(ResTy, VecTy->getNumElements());
     }
 
     if (!EltTy->isIntegerType()) {
diff --git a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
index debf6b6d3e3f5..a71b1878f8b55 100644
--- a/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/firstbithigh.hlsl
@@ -151,3 +151,11 @@ uint3 test_firstbithigh_long3(int64_t3 p0) {
 uint4 test_firstbithigh_long4(int64_t4 p0) {
   return firstbithigh(p0);
 }
+
+// CHECK-LABEL: test_firstbithigh_upcast
+// CHECK: [[FBH:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbituhigh.v4i32(<4 x i32> %{{.*}})
+// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBH]] to <4 x i64>
+// CHECK: ret <4 x i64> [[CONV]]
+uint64_t4 test_firstbithigh_upcast(uint4 p0) {
+  return firstbithigh(p0);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
index 5d490fabc5bc8..007db0c9c2ad5 100644
--- a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
@@ -151,3 +151,11 @@ uint3 test_firstbitlow_long3(int64_t3 p0) {
 uint4 test_firstbitlow_long4(int64_t4 p0) {
   return firstbitlow(p0);
 }
+
+// CHECK-LABEL: test_firstbitlow_upcast
+// CHECK: [[FBL:%.*]] = call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i32(<4 x i32> %{{.*}})
+// CHECK: [[CONV:%.*]] = zext <4 x i32> [[FBL]] to <4 x i64>
+// CHECK: ret <4 x i64> [[CONV]]
+uint64_t4 test_firstbitlow_upcast(uint4 p0) {
+  return firstbitlow(p0);
+}

From a79186c1ea62bbe0579e0b1eed4ad507966cca41 Mon Sep 17 00:00:00 2001
From: Sam Elliott <quic_aelliott@quicinc.com>
Date: Tue, 17 Jun 2025 13:36:15 -0700
Subject: [PATCH 800/851] [Driver] Fix Arm/AArch64 Link Argument tests
 (#144582)

The openmp-offload-amdgpu-runtime-2 bot specifies default rtlib of
compiler-rt, but default unwindlib of libgcc. Change the tests to accept
that there may be `"--as-needed" "-lgcc_s" "--no-as-needed"` between
`libclang_rt.builtins.a` and `-lc`.

Relates to #121830
---
 clang/test/Driver/aarch64-toolchain.c | 3 ++-
 clang/test/Driver/arm-toolchain.c     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index e12107fa2c506..327161b81d9f6 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -135,7 +135,8 @@
 
 // AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
 // AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
-// AARCH64-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a" "-lc" "-lgloss" "--end-group"
+// AARCH64-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a"
+// AARCH64-BAREMETAL-COMPILER-RT: "-lc" "-lgloss" "--end-group"
 // AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index d4f9bf2aaf3d5..5368158cdeeda 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -136,7 +136,8 @@
 
 // ARM-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
 // ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
-// ARM-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a" "-lc" "-lgloss" "--end-group"
+// ARM-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a"
+// ARM-BAREMETAL-COMPILER-RT: "-lc" "-lgloss" "--end-group"
 // ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
 
 // RUN: %clang -### %s -fuse-ld= \

From 7c4b2be983e900663a8d766ea9dc6f03b713e5b0 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 17 Jun 2025 16:38:27 -0400
Subject: [PATCH 801/851] [libc++][NFC] Refactor basic_streambuf to use public
 API functions when possible (#144547)

The implementation of std::basic_streambuf used private member variables
to manipulate the get and the put areas. Using public API functions is
equivalent but leads to code that is easier to understand, since the
public API functions are known more widely than our internal member
variables. Using the public API functions removes the need to map the
internal member variables back to get/put area manipulation functions in
one's head.

Finally, it also makes it easier to find subtle issues by instrumenting
accessor functions, which is impossible if the class uses the member
variables directly.
---
 libcxx/include/streambuf | 53 ++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/libcxx/include/streambuf b/libcxx/include/streambuf
index e25647909378e..585ae7af65aa8 100644
--- a/libcxx/include/streambuf
+++ b/libcxx/include/streambuf
@@ -178,8 +178,8 @@ public:
   // Get and put areas:
   // 27.6.2.2.3 Get area:
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 streamsize in_avail() {
-    if (__ninp_ < __einp_)
-      return static_cast<streamsize>(__einp_ - __ninp_);
+    if (gptr() < egptr())
+      return static_cast<streamsize>(egptr() - gptr());
     return showmanyc();
   }
 
@@ -190,37 +190,42 @@ public:
   }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sbumpc() {
-    if (__ninp_ == __einp_)
+    if (gptr() == egptr())
       return uflow();
-    return traits_type::to_int_type(*__ninp_++);
+    int_type __c = traits_type::to_int_type(*gptr());
+    this->gbump(1);
+    return __c;
   }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sgetc() {
-    if (__ninp_ == __einp_)
+    if (gptr() == egptr())
       return underflow();
-    return traits_type::to_int_type(*__ninp_);
+    return traits_type::to_int_type(*gptr());
   }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 streamsize sgetn(char_type* __s, streamsize __n) { return xsgetn(__s, __n); }
 
   // 27.6.2.2.4 Putback:
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sputbackc(char_type __c) {
-    if (__binp_ == __ninp_ || !traits_type::eq(__c, __ninp_[-1]))
+    if (eback() == gptr() || !traits_type::eq(__c, *(gptr() - 1)))
       return pbackfail(traits_type::to_int_type(__c));
-    return traits_type::to_int_type(*--__ninp_);
+    this->gbump(-1);
+    return traits_type::to_int_type(*gptr());
   }
 
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sungetc() {
-    if (__binp_ == __ninp_)
+    if (eback() == gptr())
       return pbackfail();
-    return traits_type::to_int_type(*--__ninp_);
+    this->gbump(-1);
+    return traits_type::to_int_type(*gptr());
   }
 
   // 27.6.2.2.5 Put area:
   inline _LIBCPP_HIDE_FROM_ABI_AFTER_V1 int_type sputc(char_type __c) {
-    if (__nout_ == __eout_)
+    if (pptr() == epptr())
       return overflow(traits_type::to_int_type(__c));
-    *__nout_++ = __c;
+    *pptr() = __c;
+    this->pbump(1);
     return traits_type::to_int_type(__c);
   }
 
@@ -312,17 +317,16 @@ protected:
   virtual streamsize showmanyc() { return 0; }
 
   virtual streamsize xsgetn(char_type* __s, streamsize __n) {
-    const int_type __eof = traits_type::eof();
     int_type __c;
     streamsize __i = 0;
     while (__i < __n) {
-      if (__ninp_ < __einp_) {
-        const streamsize __len = std::min(static_cast<streamsize>(INT_MAX), std::min(__einp_ - __ninp_, __n - __i));
-        traits_type::copy(__s, __ninp_, __len);
+      if (gptr() < egptr()) {
+        const streamsize __len = std::min(static_cast<streamsize>(INT_MAX), std::min(egptr() - gptr(), __n - __i));
+        traits_type::copy(__s, gptr(), __len);
         __s += __len;
         __i += __len;
         this->gbump(__len);
-      } else if ((__c = uflow()) != __eof) {
+      } else if ((__c = uflow()) != traits_type::eof()) {
         *__s = traits_type::to_char_type(__c);
         ++__s;
         ++__i;
@@ -336,7 +340,9 @@ protected:
   virtual int_type uflow() {
     if (underflow() == traits_type::eof())
       return traits_type::eof();
-    return traits_type::to_int_type(*__ninp_++);
+    int_type __c = traits_type::to_int_type(*gptr());
+    this->gbump(1);
+    return __c;
   }
 
   // 27.6.2.4.4 Putback:
@@ -345,17 +351,16 @@ protected:
   // 27.6.2.4.5 Put area:
   virtual streamsize xsputn(const char_type* __s, streamsize __n) {
     streamsize __i = 0;
-    int_type __eof = traits_type::eof();
     while (__i < __n) {
-      if (__nout_ >= __eout_) {
-        if (overflow(traits_type::to_int_type(*__s)) == __eof)
+      if (pptr() >= epptr()) {
+        if (overflow(traits_type::to_int_type(*__s)) == traits_type::eof())
           break;
         ++__s;
         ++__i;
       } else {
-        streamsize __chunk_size = std::min(__eout_ - __nout_, __n - __i);
-        traits_type::copy(__nout_, __s, __chunk_size);
-        __nout_ += __chunk_size;
+        streamsize __chunk_size = std::min(epptr() - pptr(), __n - __i);
+        traits_type::copy(pptr(), __s, __chunk_size);
+        __pbump(__chunk_size);
         __s += __chunk_size;
         __i += __chunk_size;
       }

From 9ae4d2e01331ddeb2543f1940a09ef9c76ff5268 Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail@gmail.com>
Date: Tue, 17 Jun 2025 22:44:02 +0200
Subject: [PATCH 802/851] [LLVM] [Support] Disable `ioctl()` terminal size
 check on Solaris (#144600)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#143514 broke the `clang-solaris11-sparcv9` bot; from what I can tell
that’s Solaris and according to `SolarisTargetInfo::getOSDefines`, the
macro `__sun__` should be defined on Solaris, so check for that and
don’t try to query the terminal size if it is defined.

Not sure this is the best solution but hopefully it fixes the bot.
---
 llvm/lib/Support/Unix/Process.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index db735b7484ad8..c6e79af44b9b4 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -320,7 +320,7 @@ static unsigned getColumns(int FileID) {
   // instead if it isn't available.
   unsigned Columns = 0;
 
-#ifdef HAVE_SYS_IOCTL_H
+#if defined(HAVE_SYS_IOCTL_H) && !defined(__sun__)
   struct winsize ws;
   if (ioctl(FileID, TIOCGWINSZ, &ws) == 0)
     Columns = ws.ws_col;

From c677a11c8d3223480cfe772e63fa0e7c09c76e2e Mon Sep 17 00:00:00 2001
From: David Peixotto <peix@meta.com>
Date: Tue, 17 Jun 2025 13:47:20 -0700
Subject: [PATCH 803/851] [lldb] Add support to list/enable/disable remaining
 plugin types. (#143970)

In #134418 we added support to list/enable/disable `SystemRuntime` and
`InstrumentationRuntime` plugins. We limited it to those two plugin
types to flesh out the idea with a smaller change.

This PR adds support for the remaining plugin types. We now support all
the plugins that can be registered directly with the plugin manager.
Plugins that are added by loading shared objects are still not
supported.
---
 lldb/include/lldb/Core/PluginManager.h        | 108 ++++-
 lldb/source/Core/PluginManager.cpp            | 441 ++++++++++++++++--
 lldb/test/API/commands/plugin/TestPlugin.py   |  62 +++
 .../Shell/Commands/command-plugin-list.test   |   8 +-
 4 files changed, 566 insertions(+), 53 deletions(-)
 create mode 100644 lldb/test/API/commands/plugin/TestPlugin.py

diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h
index e7b1691031111..1d7c976f3c382 100644
--- a/lldb/include/lldb/Core/PluginManager.h
+++ b/lldb/include/lldb/Core/PluginManager.h
@@ -236,12 +236,6 @@ class PluginManager {
   static SystemRuntimeCreateInstance
   GetSystemRuntimeCreateCallbackAtIndex(uint32_t idx);
 
-  static std::vector<RegisteredPluginInfo> GetSystemRuntimePluginInfo();
-
-  // Modify the enabled state of a SystemRuntime plugin.
-  // Returns false if the plugin name is not found.
-  static bool SetSystemRuntimePluginEnabled(llvm::StringRef name, bool enabled);
-
   // ObjectFile
   static bool
   RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
@@ -549,12 +543,6 @@ class PluginManager {
   static InstrumentationRuntimeCreateInstance
   GetInstrumentationRuntimeCreateCallbackAtIndex(uint32_t idx);
 
-  static std::vector<RegisteredPluginInfo>
-  GetInstrumentationRuntimePluginInfo();
-
-  static bool SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
-                                                     bool enabled);
-
   // TypeSystem
   static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description,
                              TypeSystemCreateInstance create_callback,
@@ -690,6 +678,102 @@ class PluginManager {
   static bool CreateSettingForCPlusPlusLanguagePlugin(
       Debugger &debugger, const lldb::OptionValuePropertiesSP &properties_sp,
       llvm::StringRef description, bool is_global_property);
+
+  //
+  // Plugin Info+Enable Declarations
+  //
+  static std::vector<RegisteredPluginInfo> GetABIPluginInfo();
+  static bool SetABIPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetArchitecturePluginInfo();
+  static bool SetArchitecturePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetDisassemblerPluginInfo();
+  static bool SetDisassemblerPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetDynamicLoaderPluginInfo();
+  static bool SetDynamicLoaderPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetEmulateInstructionPluginInfo();
+  static bool SetEmulateInstructionPluginEnabled(llvm::StringRef name,
+                                                 bool enable);
+
+  static std::vector<RegisteredPluginInfo>
+  GetInstrumentationRuntimePluginInfo();
+  static bool SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
+                                                     bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetJITLoaderPluginInfo();
+  static bool SetJITLoaderPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetLanguagePluginInfo();
+  static bool SetLanguagePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetLanguageRuntimePluginInfo();
+  static bool SetLanguageRuntimePluginEnabled(llvm::StringRef name,
+                                              bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetMemoryHistoryPluginInfo();
+  static bool SetMemoryHistoryPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetObjectContainerPluginInfo();
+  static bool SetObjectContainerPluginEnabled(llvm::StringRef name,
+                                              bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetObjectFilePluginInfo();
+  static bool SetObjectFilePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetOperatingSystemPluginInfo();
+  static bool SetOperatingSystemPluginEnabled(llvm::StringRef name,
+                                              bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetPlatformPluginInfo();
+  static bool SetPlatformPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetProcessPluginInfo();
+  static bool SetProcessPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetREPLPluginInfo();
+  static bool SetREPLPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetRegisterTypeBuilderPluginInfo();
+  static bool SetRegisterTypeBuilderPluginEnabled(llvm::StringRef name,
+                                                  bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetScriptInterpreterPluginInfo();
+  static bool SetScriptInterpreterPluginEnabled(llvm::StringRef name,
+                                                bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetScriptedInterfacePluginInfo();
+  static bool SetScriptedInterfacePluginEnabled(llvm::StringRef name,
+                                                bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetStructuredDataPluginInfo();
+  static bool SetStructuredDataPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetSymbolFilePluginInfo();
+  static bool SetSymbolFilePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetSymbolLocatorPluginInfo();
+  static bool SetSymbolLocatorPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetSymbolVendorPluginInfo();
+  static bool SetSymbolVendorPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetSystemRuntimePluginInfo();
+  static bool SetSystemRuntimePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetTracePluginInfo();
+  static bool SetTracePluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetTraceExporterPluginInfo();
+  static bool SetTraceExporterPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetTypeSystemPluginInfo();
+  static bool SetTypeSystemPluginEnabled(llvm::StringRef name, bool enable);
+
+  static std::vector<RegisteredPluginInfo> GetUnwindAssemblyPluginInfo();
+  static bool SetUnwindAssemblyPluginEnabled(llvm::StringRef name, bool enable);
 };
 
 } // namespace lldb_private
diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index 5d44434033c55..dfa865929b64f 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -182,14 +182,176 @@ void PluginManager::Terminate() {
 }
 
 llvm::ArrayRef<PluginNamespace> PluginManager::GetPluginNamespaces() {
-  // Currently supported set of plugin namespaces. This will be expanded
-  // over time.
   static PluginNamespace PluginNamespaces[] = {
-      {"system-runtime", PluginManager::GetSystemRuntimePluginInfo,
-       PluginManager::SetSystemRuntimePluginEnabled},
-      {"instrumentation-runtime",
-       PluginManager::GetInstrumentationRuntimePluginInfo,
-       PluginManager::SetInstrumentationRuntimePluginEnabled}};
+
+      {
+          "abi",
+          PluginManager::GetABIPluginInfo,
+          PluginManager::SetABIPluginEnabled,
+      },
+
+      {
+          "architecture",
+          PluginManager::GetArchitecturePluginInfo,
+          PluginManager::SetArchitecturePluginEnabled,
+      },
+
+      {
+          "disassembler",
+          PluginManager::GetDisassemblerPluginInfo,
+          PluginManager::SetDisassemblerPluginEnabled,
+      },
+
+      {
+          "dynamic-loader",
+          PluginManager::GetDynamicLoaderPluginInfo,
+          PluginManager::SetDynamicLoaderPluginEnabled,
+      },
+
+      {
+          "emulate-instruction",
+          PluginManager::GetEmulateInstructionPluginInfo,
+          PluginManager::SetEmulateInstructionPluginEnabled,
+      },
+
+      {
+          "instrumentation-runtime",
+          PluginManager::GetInstrumentationRuntimePluginInfo,
+          PluginManager::SetInstrumentationRuntimePluginEnabled,
+      },
+
+      {
+          "jit-loader",
+          PluginManager::GetJITLoaderPluginInfo,
+          PluginManager::SetJITLoaderPluginEnabled,
+      },
+
+      {
+          "language",
+          PluginManager::GetLanguagePluginInfo,
+          PluginManager::SetLanguagePluginEnabled,
+      },
+
+      {
+          "language-runtime",
+          PluginManager::GetLanguageRuntimePluginInfo,
+          PluginManager::SetLanguageRuntimePluginEnabled,
+      },
+
+      {
+          "memory-history",
+          PluginManager::GetMemoryHistoryPluginInfo,
+          PluginManager::SetMemoryHistoryPluginEnabled,
+      },
+
+      {
+          "object-container",
+          PluginManager::GetObjectContainerPluginInfo,
+          PluginManager::SetObjectContainerPluginEnabled,
+      },
+
+      {
+          "object-file",
+          PluginManager::GetObjectFilePluginInfo,
+          PluginManager::SetObjectFilePluginEnabled,
+      },
+
+      {
+          "operating-system",
+          PluginManager::GetOperatingSystemPluginInfo,
+          PluginManager::SetOperatingSystemPluginEnabled,
+      },
+
+      {
+          "platform",
+          PluginManager::GetPlatformPluginInfo,
+          PluginManager::SetPlatformPluginEnabled,
+      },
+
+      {
+          "process",
+          PluginManager::GetProcessPluginInfo,
+          PluginManager::SetProcessPluginEnabled,
+      },
+
+      {
+          "repl",
+          PluginManager::GetREPLPluginInfo,
+          PluginManager::SetREPLPluginEnabled,
+      },
+
+      {
+          "register-type-builder",
+          PluginManager::GetRegisterTypeBuilderPluginInfo,
+          PluginManager::SetRegisterTypeBuilderPluginEnabled,
+      },
+
+      {
+          "script-interpreter",
+          PluginManager::GetScriptInterpreterPluginInfo,
+          PluginManager::SetScriptInterpreterPluginEnabled,
+      },
+
+      {
+          "scripted-interface",
+          PluginManager::GetScriptedInterfacePluginInfo,
+          PluginManager::SetScriptedInterfacePluginEnabled,
+      },
+
+      {
+          "structured-data",
+          PluginManager::GetStructuredDataPluginInfo,
+          PluginManager::SetStructuredDataPluginEnabled,
+      },
+
+      {
+          "symbol-file",
+          PluginManager::GetSymbolFilePluginInfo,
+          PluginManager::SetSymbolFilePluginEnabled,
+      },
+
+      {
+          "symbol-locator",
+          PluginManager::GetSymbolLocatorPluginInfo,
+          PluginManager::SetSymbolLocatorPluginEnabled,
+      },
+
+      {
+          "symbol-vendor",
+          PluginManager::GetSymbolVendorPluginInfo,
+          PluginManager::SetSymbolVendorPluginEnabled,
+      },
+
+      {
+          "system-runtime",
+          PluginManager::GetSystemRuntimePluginInfo,
+          PluginManager::SetSystemRuntimePluginEnabled,
+      },
+
+      {
+          "trace",
+          PluginManager::GetTracePluginInfo,
+          PluginManager::SetTracePluginEnabled,
+      },
+
+      {
+          "trace-exporter",
+          PluginManager::GetTraceExporterPluginInfo,
+          PluginManager::SetTraceExporterPluginEnabled,
+      },
+
+      {
+          "type-system",
+          PluginManager::GetTypeSystemPluginInfo,
+          PluginManager::SetTypeSystemPluginEnabled,
+      },
+
+      {
+          "unwind-assembly",
+          PluginManager::GetUnwindAssemblyPluginInfo,
+          PluginManager::SetUnwindAssemblyPluginEnabled,
+      },
+  };
 
   return PluginNamespaces;
 }
@@ -407,7 +569,7 @@ ABICreateInstance PluginManager::GetABICreateCallbackAtIndex(uint32_t idx) {
 #pragma mark Architecture
 
 typedef PluginInstance<ArchitectureCreateInstance> ArchitectureInstance;
-typedef std::vector<ArchitectureInstance> ArchitectureInstances;
+typedef PluginInstances<ArchitectureInstance> ArchitectureInstances;
 
 static ArchitectureInstances &GetArchitectureInstances() {
   static ArchitectureInstances g_instances;
@@ -417,25 +579,18 @@ static ArchitectureInstances &GetArchitectureInstances() {
 void PluginManager::RegisterPlugin(llvm::StringRef name,
                                    llvm::StringRef description,
                                    ArchitectureCreateInstance create_callback) {
-  GetArchitectureInstances().push_back({name, description, create_callback});
+  GetArchitectureInstances().RegisterPlugin(name, description, create_callback);
 }
 
 void PluginManager::UnregisterPlugin(
     ArchitectureCreateInstance create_callback) {
   auto &instances = GetArchitectureInstances();
-
-  for (auto pos = instances.begin(), end = instances.end(); pos != end; ++pos) {
-    if (pos->create_callback == create_callback) {
-      instances.erase(pos);
-      return;
-    }
-  }
-  llvm_unreachable("Plugin not found");
+  instances.UnregisterPlugin(create_callback);
 }
 
 std::unique_ptr<Architecture>
 PluginManager::CreateArchitectureInstance(const ArchSpec &arch) {
-  for (const auto &instances : GetArchitectureInstances()) {
+  for (const auto &instances : GetArchitectureInstances().GetSnapshot()) {
     if (auto plugin_up = instances.create_callback(arch))
       return plugin_up;
   }
@@ -718,15 +873,6 @@ PluginManager::GetSystemRuntimeCreateCallbackAtIndex(uint32_t idx) {
   return GetSystemRuntimeInstances().GetCallbackAtIndex(idx);
 }
 
-std::vector<RegisteredPluginInfo> PluginManager::GetSystemRuntimePluginInfo() {
-  return GetSystemRuntimeInstances().GetPluginInfoForAllInstances();
-}
-
-bool PluginManager::SetSystemRuntimePluginEnabled(llvm::StringRef name,
-                                                  bool enable) {
-  return GetSystemRuntimeInstances().SetInstanceEnabled(name, enable);
-}
-
 #pragma mark ObjectFile
 
 struct ObjectFileInstance : public PluginInstance<ObjectFileCreateInstance> {
@@ -1563,16 +1709,6 @@ PluginManager::GetInstrumentationRuntimeCreateCallbackAtIndex(uint32_t idx) {
   return GetInstrumentationRuntimeInstances().GetCallbackAtIndex(idx);
 }
 
-std::vector<RegisteredPluginInfo>
-PluginManager::GetInstrumentationRuntimePluginInfo() {
-  return GetInstrumentationRuntimeInstances().GetPluginInfoForAllInstances();
-}
-
-bool PluginManager::SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
-                                                           bool enable) {
-  return GetInstrumentationRuntimeInstances().SetInstanceEnabled(name, enable);
-}
-
 #pragma mark TypeSystem
 
 struct TypeSystemInstance : public PluginInstance<TypeSystemCreateInstance> {
@@ -2057,3 +2193,234 @@ bool PluginManager::CreateSettingForCPlusPlusLanguagePlugin(
                                 "Settings for CPlusPlus language plug-ins",
                                 properties_sp, description, is_global_property);
 }
+
+//
+// Plugin Info+Enable Implementations
+//
+std::vector<RegisteredPluginInfo> PluginManager::GetABIPluginInfo() {
+  return GetABIInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetABIPluginEnabled(llvm::StringRef name, bool enable) {
+  return GetABIInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetArchitecturePluginInfo() {
+  return GetArchitectureInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetArchitecturePluginEnabled(llvm::StringRef name,
+                                                 bool enable) {
+  return GetArchitectureInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetDisassemblerPluginInfo() {
+  return GetDisassemblerInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetDisassemblerPluginEnabled(llvm::StringRef name,
+                                                 bool enable) {
+  return GetDisassemblerInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetDynamicLoaderPluginInfo() {
+  return GetDynamicLoaderInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetDynamicLoaderPluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetDynamicLoaderInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetEmulateInstructionPluginInfo() {
+  return GetEmulateInstructionInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetEmulateInstructionPluginEnabled(llvm::StringRef name,
+                                                       bool enable) {
+  return GetEmulateInstructionInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetInstrumentationRuntimePluginInfo() {
+  return GetInstrumentationRuntimeInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetInstrumentationRuntimePluginEnabled(llvm::StringRef name,
+                                                           bool enable) {
+  return GetInstrumentationRuntimeInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetJITLoaderPluginInfo() {
+  return GetJITLoaderInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetJITLoaderPluginEnabled(llvm::StringRef name,
+                                              bool enable) {
+  return GetJITLoaderInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetLanguagePluginInfo() {
+  return GetLanguageInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetLanguagePluginEnabled(llvm::StringRef name,
+                                             bool enable) {
+  return GetLanguageInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetLanguageRuntimePluginInfo() {
+  return GetLanguageRuntimeInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetLanguageRuntimePluginEnabled(llvm::StringRef name,
+                                                    bool enable) {
+  return GetLanguageRuntimeInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetMemoryHistoryPluginInfo() {
+  return GetMemoryHistoryInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetMemoryHistoryPluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetMemoryHistoryInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetObjectContainerPluginInfo() {
+  return GetObjectContainerInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetObjectContainerPluginEnabled(llvm::StringRef name,
+                                                    bool enable) {
+  return GetObjectContainerInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetObjectFilePluginInfo() {
+  return GetObjectFileInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetObjectFilePluginEnabled(llvm::StringRef name,
+                                               bool enable) {
+  return GetObjectFileInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetOperatingSystemPluginInfo() {
+  return GetOperatingSystemInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetOperatingSystemPluginEnabled(llvm::StringRef name,
+                                                    bool enable) {
+  return GetOperatingSystemInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetPlatformPluginInfo() {
+  return GetPlatformInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetPlatformPluginEnabled(llvm::StringRef name,
+                                             bool enable) {
+  return GetPlatformInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetProcessPluginInfo() {
+  return GetProcessInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetProcessPluginEnabled(llvm::StringRef name, bool enable) {
+  return GetProcessInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetREPLPluginInfo() {
+  return GetREPLInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetREPLPluginEnabled(llvm::StringRef name, bool enable) {
+  return GetREPLInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetRegisterTypeBuilderPluginInfo() {
+  return GetRegisterTypeBuilderInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetRegisterTypeBuilderPluginEnabled(llvm::StringRef name,
+                                                        bool enable) {
+  return GetRegisterTypeBuilderInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetScriptInterpreterPluginInfo() {
+  return GetScriptInterpreterInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetScriptInterpreterPluginEnabled(llvm::StringRef name,
+                                                      bool enable) {
+  return GetScriptInterpreterInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo>
+PluginManager::GetScriptedInterfacePluginInfo() {
+  return GetScriptedInterfaceInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetScriptedInterfacePluginEnabled(llvm::StringRef name,
+                                                      bool enable) {
+  return GetScriptedInterfaceInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetStructuredDataPluginInfo() {
+  return GetStructuredDataPluginInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetStructuredDataPluginEnabled(llvm::StringRef name,
+                                                   bool enable) {
+  return GetStructuredDataPluginInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetSymbolFilePluginInfo() {
+  return GetSymbolFileInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetSymbolFilePluginEnabled(llvm::StringRef name,
+                                               bool enable) {
+  return GetSymbolFileInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetSymbolLocatorPluginInfo() {
+  return GetSymbolLocatorInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetSymbolLocatorPluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetSymbolLocatorInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetSymbolVendorPluginInfo() {
+  return GetSymbolVendorInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetSymbolVendorPluginEnabled(llvm::StringRef name,
+                                                 bool enable) {
+  return GetSymbolVendorInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetSystemRuntimePluginInfo() {
+  return GetSystemRuntimeInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetSystemRuntimePluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetSystemRuntimeInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetTracePluginInfo() {
+  return GetTracePluginInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetTracePluginEnabled(llvm::StringRef name, bool enable) {
+  return GetTracePluginInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetTraceExporterPluginInfo() {
+  return GetTraceExporterInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetTraceExporterPluginEnabled(llvm::StringRef name,
+                                                  bool enable) {
+  return GetTraceExporterInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetTypeSystemPluginInfo() {
+  return GetTypeSystemInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetTypeSystemPluginEnabled(llvm::StringRef name,
+                                               bool enable) {
+  return GetTypeSystemInstances().SetInstanceEnabled(name, enable);
+}
+
+std::vector<RegisteredPluginInfo> PluginManager::GetUnwindAssemblyPluginInfo() {
+  return GetUnwindAssemblyInstances().GetPluginInfoForAllInstances();
+}
+bool PluginManager::SetUnwindAssemblyPluginEnabled(llvm::StringRef name,
+                                                   bool enable) {
+  return GetUnwindAssemblyInstances().SetInstanceEnabled(name, enable);
+}
diff --git a/lldb/test/API/commands/plugin/TestPlugin.py b/lldb/test/API/commands/plugin/TestPlugin.py
new file mode 100644
index 0000000000000..fdfb14bfcc24e
--- /dev/null
+++ b/lldb/test/API/commands/plugin/TestPlugin.py
@@ -0,0 +1,62 @@
+"""
+Make sure the plugin list, enable, and disable commands work.
+"""
+
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+
+
+class TestFrameVar(TestBase):
+    # If your test case doesn't stress debug info, then
+    # set this to true.  That way it won't be run once for
+    # each debug info format.
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def test_plugin_list_enable_disable_commands(self):
+        for plugin_namespace in [
+            "abi",
+            "architecture",
+            "disassembler",
+            "dynamic-loader",
+            "emulate-instruction",
+            "instrumentation-runtime",
+            "jit-loader",
+            "language",
+            "language-runtime",
+            "memory-history",
+            "object-container",
+            "object-file",
+            "operating-system",
+            "platform",
+            "process",
+            "repl",
+            "register-type-builder",
+            "script-interpreter",
+            "scripted-interface",
+            "structured-data",
+            "symbol-file",
+            "symbol-locator",
+            "symbol-vendor",
+            "system-runtime",
+            # 'trace', # No trace plugin is registered by default.
+            "trace-exporter",
+            "type-system",
+            "unwind-assembly",
+        ]:
+            self.do_list_disable_enable_test(plugin_namespace)
+
+    def do_list_disable_enable_test(self, plugin_namespace):
+        # Plugins are enabled by default.
+        self.expect(
+            f"plugin list {plugin_namespace}", substrs=[plugin_namespace, "[+]"]
+        )
+
+        # Plugins can be disabled.
+        self.expect(
+            f"plugin disable {plugin_namespace}", substrs=[plugin_namespace, "[-]"]
+        )
+
+        # Plugins can be enabled.
+        self.expect(
+            f"plugin enable {plugin_namespace}", substrs=[plugin_namespace, "[+]"]
+        )
diff --git a/lldb/test/Shell/Commands/command-plugin-list.test b/lldb/test/Shell/Commands/command-plugin-list.test
index 9d3680d48cdd0..3f02157665bb2 100644
--- a/lldb/test/Shell/Commands/command-plugin-list.test
+++ b/lldb/test/Shell/Commands/command-plugin-list.test
@@ -10,10 +10,10 @@
 # Test plugin list without an argument will list all plugins.
 plugin list
 # CHECK-LABEL: plugin list
-# CHECK: system-runtime
-# CHECK:  [+] systemruntime-macosx           System runtime plugin for Mac OS X native libraries
-# CHECK: instrumentation-runtime
-# CHECK:  [+] AddressSanitizer               AddressSanitizer instrumentation runtime plugin.
+# CHECK-DAG: instrumentation-runtime
+# CHECK-DAG:  [+] AddressSanitizer               AddressSanitizer instrumentation runtime plugin.
+# CHECK-DAG: system-runtime
+# CHECK-DAG:  [+] systemruntime-macosx           System runtime plugin for Mac OS X native libraries
 
 # Test plugin list works with fully qualified name.
 plugin list system-runtime.systemruntime-macosx

From 908f74a25e01cc88d1dee1af5521d8fb1c21bc51 Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Tue, 17 Jun 2025 13:49:18 -0700
Subject: [PATCH 804/851] [llvm] re-order LLVM_ABI and extern on
 NoKernelInfoEndLTO decl (#144601)

## Overview
Fix compilation error introduced by #143615. Build failure logs
available
[here](https://lab.llvm.org/buildbot/#/builders/195/builds/10573)

## Background
On `extern` variable declarations, `LLVM_ABI` must appear before
`extern` because `LLVM_ABI` currently resolves to
`[[gnu::visibility("default")]]` when building with gcc.
---
 llvm/include/llvm/Target/TargetMachine.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
index 04c97c1502a1b..b286efdea3c19 100644
--- a/llvm/include/llvm/Target/TargetMachine.h
+++ b/llvm/include/llvm/Target/TargetMachine.h
@@ -29,7 +29,7 @@
 #include <string>
 #include <utility>
 
-extern LLVM_ABI llvm::cl::opt<bool> NoKernelInfoEndLTO;
+LLVM_ABI extern llvm::cl::opt<bool> NoKernelInfoEndLTO;
 
 namespace llvm {
 

From 49bf8d38d80ce43bd700f27833a7b8c8e7082af8 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 20:49:41 +0000
Subject: [PATCH 805/851] [gn build] Manually port b4e39e4f

---
 llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index c1d107eefdf9b..f4ee2599c01ce 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -223,6 +223,7 @@ write_cmake_config("config") {
       "HAVE_SIGALTSTACK=",
       "HAVE_STRERROR_R=",
       "HAVE_SYSCONF=",
+      "HAVE_SYS_IOCTL_H=",
       "HAVE_SYS_MMAN_H=",
       "HAVE_UNISTD_H=",
       "HAVE__CHSIZE_S=1",
@@ -250,6 +251,7 @@ write_cmake_config("config") {
       "HAVE_SIGALTSTACK=1",
       "HAVE_STRERROR_R=1",
       "HAVE_SYSCONF=1",
+      "HAVE_SYS_IOCTL_H=1",
       "HAVE_SYS_MMAN_H=1",
       "HAVE_UNISTD_H=1",
       "HAVE__CHSIZE_S=",

From 8d1610afd0db877460d1b3cd43cc4066478846a0 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 17 Jun 2025 16:50:31 -0400
Subject: [PATCH 806/851] [libc++] Mark two assertion tests as unsupported in
 C++03 mode

Our assertion checking facility requires at least C++11, so these
tests were failing when run in C++03 mode.
---
 .../streambuf.protected/streambuf.get.area/setg.assert.pass.cpp | 2 +-
 .../streambuf.protected/streambuf.put.area/setp.assert.pass.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp
index becf89b12fdd1..973d744a1da44 100644
--- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.get.area/setg.assert.pass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: has-unix-headers
-// UNSUPPORTED: libcpp-hardening-mode=none
+// UNSUPPORTED: c++03, libcpp-hardening-mode=none
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
 // <streambuf>
diff --git a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp
index abd42272de508..5aaad2738d325 100644
--- a/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp
+++ b/libcxx/test/std/input.output/stream.buffers/streambuf/streambuf.protected/streambuf.put.area/setp.assert.pass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: has-unix-headers
-// UNSUPPORTED: libcpp-hardening-mode=none
+// UNSUPPORTED: c++03, libcpp-hardening-mode=none
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
 // <streambuf>

From 3c7df98c7b2a203e49a74b229bbf535c2ef6274b Mon Sep 17 00:00:00 2001
From: Piotr Idzik <65706193+vil02@users.noreply.github.com>
Date: Tue, 17 Jun 2025 22:59:53 +0200
Subject: [PATCH 807/851] [clang-tidy] Add missing colon in the docs of
 performance-enum-size (#144525)

There is a syntax error in the provided code example - this PR fixes it.

I did a quick search - I could not find similar _typos_.
---
 .../docs/clang-tidy/checks/performance/enum-size.rst            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst b/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst
index f72b8c7eabc22..b7631139a0133 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/performance/enum-size.rst
@@ -34,7 +34,7 @@ dependent).
 .. code-block:: c++
 
     // AFTER
-    enum Color : std:int8_t {
+    enum Color : std::int8_t {
         RED = -1,
         GREEN = 0,
         BLUE = 1

From ecfb8fe5c1870091b095ae6ca1ad4cfc7158e619 Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh <dthorn@google.com>
Date: Tue, 17 Jun 2025 14:07:07 -0700
Subject: [PATCH 808/851] =?UTF-8?q?Revert=20stack=20"[Driver]=20Add=20supp?=
 =?UTF-8?q?ort=20for=20GCC=20installation=20detection=20in=20=E2=80=A6=20(?=
 =?UTF-8?q?#144603)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…Baremetal toolchain (#121829)"

This reverts the following stack of commits, due to them breaking the
Fuchsia toolchain and corresponding LLVM buildbot.

Revert "[Driver] Fix Arm/AArch64 Link Argument tests (#144582)" This
reverts commit a79186c1ea62bbe0579e0b1eed4ad507966cca41.

Revert "[Driver] Add option to force undefined symbols during linking in
BareMetal toolchain object. (#132807)" This reverts commit
9cb754509608b9d9143fa17f775631bbfcce0848.

Revert "[Driver] Fix link order of BareMetal toolchain object (#132806)"
This reverts commit 31523de4b000ca254259ae3167d28922e1302648.

Revert "[Driver] Add support for crtbegin.o, crtend.o and libgloss lib
to BareMetal toolchain object (#121830)" This reverts commit
ec230aa7a7d13c222c0b34b87c3c16937383b4a0.

Revert "[Driver] Add support for GCC installation detection in Baremetal
toolchain (#121829)" This reverts commit
eb31c422d0dc816bf285a81bf92690d4d16273ed.
---
 clang/docs/Toolchain.rst                      |   5 -
 .../clang/Basic/DiagnosticDriverKinds.td      |   3 -
 clang/lib/Driver/ToolChains/BareMetal.cpp     | 287 +++++-------------
 clang/lib/Driver/ToolChains/BareMetal.h       |  22 +-
 .../aarch64-none-elf/include/c++/8.2.1/.keep  |   0
 .../aarch64-none-elf/lib/.keep                |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../bin/aarch64-none-elf-ld                   |   1 -
 .../lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o |   0
 .../lib/gcc/aarch64-none-elf/8.2.1/crtend.o   |   0
 .../aarch64-none-elf/lib/crt0.o               |   0
 .../aarch64-none-elf/lib/crtbegin.o           |   0
 .../aarch64-none-elf/lib/crtend.o             |   0
 .../bin/aarch64-none-elf-ld                   |   1 -
 .../armv6m-none-eabi/include/c++/8.2.1/.keep  |   0
 .../armv6m-none-eabi/lib/.keep                |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../bin/armv6m-none-eabi-ld                   |   1 -
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o |   0
 .../lib/gcc/armv6m-none-eabi/8.2.1/crtend.o   |   0
 .../armv6m-none-eabi/lib/crt0.o               |   0
 .../armv6m-none-eabi/lib/crtbegin.o           |   0
 .../armv6m-none-eabi/lib/crtend.o             |   0
 .../bin/armv6m-none-eabi-ld                   |   1 -
 clang/test/Driver/aarch64-gnutools.c          |   4 -
 clang/test/Driver/aarch64-toolchain-extra.c   |  35 ---
 clang/test/Driver/aarch64-toolchain.c         | 157 ----------
 clang/test/Driver/arm-gnutools.c              |   6 -
 clang/test/Driver/arm-toolchain-extra.c       |  36 ---
 clang/test/Driver/arm-toolchain.c             | 158 ----------
 clang/test/Driver/baremetal-multilib.yaml     |   3 +-
 clang/test/Driver/baremetal-sysroot.cpp       |   8 +-
 .../test/Driver/baremetal-undefined-symbols.c |  14 -
 clang/test/Driver/baremetal.cpp               |  98 ++----
 clang/test/Driver/check-no-multlib-warning.c  |  10 -
 clang/test/Driver/riscv-args.c                |   6 +
 clang/test/Driver/sanitizer-ld.c              |   2 +-
 37 files changed, 119 insertions(+), 739 deletions(-)
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
 delete mode 100755 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
 delete mode 100755 clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
 delete mode 100755 clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
 delete mode 100644 clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
 delete mode 100755 clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
 delete mode 100644 clang/test/Driver/aarch64-gnutools.c
 delete mode 100644 clang/test/Driver/aarch64-toolchain-extra.c
 delete mode 100644 clang/test/Driver/aarch64-toolchain.c
 delete mode 100644 clang/test/Driver/arm-gnutools.c
 delete mode 100644 clang/test/Driver/arm-toolchain-extra.c
 delete mode 100644 clang/test/Driver/arm-toolchain.c
 delete mode 100644 clang/test/Driver/baremetal-undefined-symbols.c
 delete mode 100644 clang/test/Driver/check-no-multlib-warning.c
 create mode 100644 clang/test/Driver/riscv-args.c

diff --git a/clang/docs/Toolchain.rst b/clang/docs/Toolchain.rst
index d56b21d74c7e3..958199eb7a2e2 100644
--- a/clang/docs/Toolchain.rst
+++ b/clang/docs/Toolchain.rst
@@ -347,8 +347,3 @@ workarounds for issues discovered in libstdc++, and these are removed
 as fixed libstdc++ becomes sufficiently old.
 
 You can instruct Clang to use libstdc++ with the ``-stdlib=libstdc++`` flag.
-
-GCC Installation
-=================
-Users can point to their GCC installation by using the ``-gcc-toolchain`` or by
-using ``-gcc-install-dir`` flag.
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 94224e1038758..29f6480ba935c 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -847,9 +847,6 @@ def note_drv_available_multilibs : Note<
   "available multilibs are:%0">;
 def err_drv_multilib_custom_error : Error<
   "multilib configuration error: %0">;
-def warn_drv_multilib_not_available_for_target: Warning<
-  "no multilib structure encoded for Arm, Aarch64 and PPC targets">,
-  InGroup<DiagGroup<"multilib-not-found">>;
 
 def err_drv_experimental_crel : Error<
   "-Wa,--allow-experimental-crel must be specified to use -Wa,--crel. "
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index d4e4e6d04b417..d8168ed15febd 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -31,40 +31,6 @@ using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
 
-/// Is the triple {aarch64.aarch64_be}-none-elf?
-static bool isAArch64BareMetal(const llvm::Triple &Triple) {
-  if (Triple.getArch() != llvm::Triple::aarch64 &&
-      Triple.getArch() != llvm::Triple::aarch64_be)
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-static bool isRISCVBareMetal(const llvm::Triple &Triple) {
-  if (!Triple.isRISCV())
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
-/// Is the triple powerpc[64][le]-*-none-eabi?
-static bool isPPCBareMetal(const llvm::Triple &Triple) {
-  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
-         Triple.getEnvironment() == llvm::Triple::EABI;
-}
-
 static bool findRISCVMultilibs(const Driver &D,
                                const llvm::Triple &TargetTriple,
                                const ArgList &Args, DetectedMultilibs &Result) {
@@ -129,8 +95,7 @@ static bool findRISCVMultilibs(const Driver &D,
   return false;
 }
 
-static std::string computeClangRuntimesSysRoot(const Driver &D,
-                                               bool IncludeTriple) {
+static std::string computeBaseSysRoot(const Driver &D, bool IncludeTriple) {
   if (!D.SysRoot.empty())
     return D.SysRoot;
 
@@ -143,123 +108,56 @@ static std::string computeClangRuntimesSysRoot(const Driver &D,
   return std::string(SysRootDir);
 }
 
-// Only consider the GCC toolchain based on the values provided through the
-// `--gcc-toolchain` and `--gcc-install-dir` flags. The function below returns
-// whether the GCC toolchain was initialized successfully.
-bool BareMetal::initGCCInstallation(const llvm::Triple &Triple,
-                                    const llvm::opt::ArgList &Args) {
-  if (Args.getLastArg(options::OPT_gcc_toolchain) ||
-      Args.getLastArg(clang::driver::options::OPT_gcc_install_dir_EQ)) {
-    GCCInstallation.init(Triple, Args);
-    return GCCInstallation.isValid();
+BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple,
+                     const ArgList &Args)
+    : ToolChain(D, Triple, Args),
+      SysRoot(computeBaseSysRoot(D, /*IncludeTriple=*/true)) {
+  getProgramPaths().push_back(getDriver().Dir);
+
+  findMultilibs(D, Triple, Args);
+  SmallString<128> SysRoot(computeSysRoot());
+  if (!SysRoot.empty()) {
+    for (const Multilib &M : getOrderedMultilibs()) {
+      SmallString<128> Dir(SysRoot);
+      llvm::sys::path::append(Dir, M.osSuffix(), "lib");
+      getFilePaths().push_back(std::string(Dir));
+      getLibraryPaths().push_back(std::string(Dir));
+    }
   }
-  return false;
 }
 
-// This logic is adapted from RISCVToolChain.cpp as part of the ongoing effort
-// to merge RISCVToolChain into the Baremetal toolchain. It infers the presence
-// of a valid GCC toolchain by checking whether the `crt0.o` file exists in the
-// `bin/../<target-triple>/lib` directory.
-static bool detectGCCToolchainAdjacent(const Driver &D) {
-  SmallString<128> GCCDir;
-  llvm::sys::path::append(GCCDir, D.Dir, "..", D.getTargetTriple(),
-                          "lib/crt0.o");
-  return llvm::sys::fs::exists(GCCDir);
-}
+/// Is the triple {aarch64.aarch64_be}-none-elf?
+static bool isAArch64BareMetal(const llvm::Triple &Triple) {
+  if (Triple.getArch() != llvm::Triple::aarch64 &&
+      Triple.getArch() != llvm::Triple::aarch64_be)
+    return false;
 
-// If no sysroot is provided the driver will first attempt to infer it from the
-// values of `--gcc-install-dir` or `--gcc-toolchain`, which specify the
-// location of a GCC toolchain.
-// If neither flag is used, the sysroot defaults to either:
-//    - `bin/../<target-triple>`
-//    - `bin/../lib/clang-runtimes/<target-triple>`
-//
-// To use the `clang-runtimes` path, ensure that `../<target-triple>/lib/crt0.o`
-// does not exist relative to the driver.
-std::string BareMetal::computeSysRoot() const {
-  // Use Baremetal::sysroot if it has already been set.
-  if (!SysRoot.empty())
-    return SysRoot;
-
-  // Use the sysroot specified via the `--sysroot` command-line flag, if
-  // provided.
-  const Driver &D = getDriver();
-  if (!D.SysRoot.empty())
-    return D.SysRoot;
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
 
-  // Attempt to infer sysroot from a valid GCC installation.
-  // If no valid GCC installation, check for a GCC toolchain alongside Clang.
-  SmallString<128> inferredSysRoot;
-  if (IsGCCInstallationValid) {
-    llvm::sys::path::append(inferredSysRoot, GCCInstallation.getParentLibPath(),
-                            "..", GCCInstallation.getTriple().str());
-  } else if (detectGCCToolchainAdjacent(D)) {
-    // Use the triple as provided to the driver. Unlike the parsed triple
-    // this has not been normalized to always contain every field.
-    llvm::sys::path::append(inferredSysRoot, D.Dir, "..", D.getTargetTriple());
-  }
-  // If a valid sysroot was inferred and exists, use it
-  if (!inferredSysRoot.empty() && llvm::sys::fs::exists(inferredSysRoot))
-    return std::string(inferredSysRoot);
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
 
-  // Use the clang-runtimes path.
-  return computeClangRuntimesSysRoot(D, /*IncludeTriple*/ true);
+  return Triple.getEnvironmentName() == "elf";
 }
 
-static void addMultilibsFilePaths(const Driver &D, const MultilibSet &Multilibs,
-                                  const Multilib &Multilib,
-                                  StringRef InstallPath,
-                                  ToolChain::path_list &Paths) {
-  if (const auto &PathsCallback = Multilibs.filePathsCallback())
-    for (const auto &Path : PathsCallback(Multilib))
-      addPathIfExists(D, InstallPath + Path, Paths);
+static bool isRISCVBareMetal(const llvm::Triple &Triple) {
+  if (!Triple.isRISCV())
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
 }
 
-// GCC mutltilibs will only work for those targets that have their multlib
-// structure encoded into GCCInstallation. Baremetal toolchain supports ARM,
-// AArch64, RISCV and PPC and of these only RISCV have GCC multilibs hardcoded
-// in GCCInstallation.
-BareMetal::BareMetal(const Driver &D, const llvm::Triple &Triple,
-                     const ArgList &Args)
-    : Generic_ELF(D, Triple, Args) {
-  IsGCCInstallationValid = initGCCInstallation(Triple, Args);
-  std::string ComputedSysRoot = computeSysRoot();
-  if (IsGCCInstallationValid) {
-    if (!isRISCVBareMetal(Triple))
-      D.Diag(clang::diag::warn_drv_multilib_not_available_for_target);
-
-    Multilibs = GCCInstallation.getMultilibs();
-    SelectedMultilibs.assign({GCCInstallation.getMultilib()});
-
-    path_list &Paths = getFilePaths();
-    // Add toolchain/multilib specific file paths.
-    addMultilibsFilePaths(D, Multilibs, SelectedMultilibs.back(),
-                          GCCInstallation.getInstallPath(), Paths);
-    // Adding filepath for locating crt{begin,end}.o files.
-    Paths.push_back(GCCInstallation.getInstallPath().str());
-    // Adding filepath for locating crt0.o file.
-    Paths.push_back(ComputedSysRoot + "/lib");
-
-    ToolChain::path_list &PPaths = getProgramPaths();
-    // Multilib cross-compiler GCC installations put ld in a triple-prefixed
-    // directory off of the parent of the GCC installation.
-    PPaths.push_back(Twine(GCCInstallation.getParentLibPath() + "/../" +
-                           GCCInstallation.getTriple().str() + "/bin")
-                         .str());
-    PPaths.push_back((GCCInstallation.getParentLibPath() + "/../bin").str());
-  } else {
-    getProgramPaths().push_back(getDriver().Dir);
-    findMultilibs(D, Triple, Args);
-    const SmallString<128> SysRootDir(computeSysRoot());
-    if (!SysRootDir.empty()) {
-      for (const Multilib &M : getOrderedMultilibs()) {
-        SmallString<128> Dir(SysRootDir);
-        llvm::sys::path::append(Dir, M.osSuffix(), "lib");
-        getFilePaths().push_back(std::string(Dir));
-        getLibraryPaths().push_back(std::string(Dir));
-      }
-    }
-  }
+/// Is the triple powerpc[64][le]-*-none-eabi?
+static bool isPPCBareMetal(const llvm::Triple &Triple) {
+  return Triple.isPPC() && Triple.getOS() == llvm::Triple::UnknownOS &&
+         Triple.getEnvironment() == llvm::Triple::EABI;
 }
 
 static void
@@ -318,7 +216,7 @@ getMultilibConfigPath(const Driver &D, const llvm::Triple &Triple,
       return {};
     }
   } else {
-    MultilibPath = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
+    MultilibPath = computeBaseSysRoot(D, /*IncludeTriple=*/false);
     llvm::sys::path::append(MultilibPath, MultilibFilename);
   }
   return MultilibPath;
@@ -336,7 +234,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
   if (D.getVFS().exists(*MultilibPath)) {
     // If multilib.yaml is found, update sysroot so it doesn't use a target
     // specific suffix
-    SysRoot = computeClangRuntimesSysRoot(D, /*IncludeTriple=*/false);
+    SysRoot = computeBaseSysRoot(D, /*IncludeTriple=*/false);
     SmallVector<StringRef> CustomFlagMacroDefines;
     findMultilibsFromYAML(*this, D, *MultilibPath, Args, Result,
                           CustomFlagMacroDefines);
@@ -344,7 +242,7 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
     Multilibs = Result.Multilibs;
     MultilibMacroDefines.append(CustomFlagMacroDefines.begin(),
                                 CustomFlagMacroDefines.end());
-  } else if (isRISCVBareMetal(Triple) && !detectGCCToolchainAdjacent(D)) {
+  } else if (isRISCVBareMetal(Triple)) {
     if (findRISCVMultilibs(D, Triple, Args, Result)) {
       SelectedMultilibs = Result.SelectedMultilibs;
       Multilibs = Result.Multilibs;
@@ -365,6 +263,8 @@ Tool *BareMetal::buildStaticLibTool() const {
   return new tools::baremetal::StaticLibTool(*this);
 }
 
+std::string BareMetal::computeSysRoot() const { return SysRoot; }
+
 BareMetal::OrderedMultilibs BareMetal::getOrderedMultilibs() const {
   // Get multilibs in reverse order because they're ordered most-specific last.
   if (!SelectedMultilibs.empty())
@@ -392,10 +292,10 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   if (std::optional<std::string> Path = getStdlibIncludePath())
     addSystemInclude(DriverArgs, CC1Args, *Path);
 
-  const SmallString<128> SysRootDir(computeSysRoot());
-  if (!SysRootDir.empty()) {
+  const SmallString<128> SysRoot(computeSysRoot());
+  if (!SysRoot.empty()) {
     for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRootDir);
+      SmallString<128> Dir(SysRoot);
       llvm::sys::path::append(Dir, M.includeSuffix());
       llvm::sys::path::append(Dir, "include");
       addSystemInclude(DriverArgs, CC1Args, Dir.str());
@@ -409,19 +309,6 @@ void BareMetal::addClangTargetOptions(const ArgList &DriverArgs,
   CC1Args.push_back("-nostdsysteminc");
 }
 
-void BareMetal::addLibStdCxxIncludePaths(
-    const llvm::opt::ArgList &DriverArgs,
-    llvm::opt::ArgStringList &CC1Args) const {
-  if (!IsGCCInstallationValid)
-    return;
-  const GCCVersion &Version = GCCInstallation.getVersion();
-  StringRef TripleStr = GCCInstallation.getTriple().str();
-  const Multilib &Multilib = GCCInstallation.getMultilib();
-  addLibStdCXXIncludePaths(computeSysRoot() + "/include/c++/" + Version.Text,
-                           TripleStr, Multilib.includeSuffix(), DriverArgs,
-                           CC1Args);
-}
-
 void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
                                              ArgStringList &CC1Args) const {
   if (DriverArgs.hasArg(options::OPT_nostdinc, options::OPT_nostdlibinc,
@@ -452,23 +339,23 @@ void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
   };
 
   switch (GetCXXStdlibType(DriverArgs)) {
-  case ToolChain::CST_Libcxx: {
-    SmallString<128> P(D.Dir);
-    llvm::sys::path::append(P, "..", "include");
-    AddCXXIncludePath(P);
-    break;
-  }
-  case ToolChain::CST_Libstdcxx:
-    addLibStdCxxIncludePaths(DriverArgs, CC1Args);
-    break;
+    case ToolChain::CST_Libcxx: {
+      SmallString<128> P(D.Dir);
+      llvm::sys::path::append(P, "..", "include");
+      AddCXXIncludePath(P);
+      break;
+    }
+    case ToolChain::CST_Libstdcxx:
+      // We only support libc++ toolchain installation.
+      break;
   }
 
-  std::string SysRootDir(computeSysRoot());
-  if (SysRootDir.empty())
+  std::string SysRoot(computeSysRoot());
+  if (SysRoot.empty())
     return;
 
   for (const Multilib &M : getOrderedMultilibs()) {
-    SmallString<128> Dir(SysRootDir);
+    SmallString<128> Dir(SysRoot);
     llvm::sys::path::append(Dir, M.gccSuffix());
     switch (GetCXXStdlibType(DriverArgs)) {
     case ToolChain::CST_Libcxx: {
@@ -568,6 +455,8 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   const llvm::Triple::ArchType Arch = TC.getArch();
   const llvm::Triple &Triple = getToolChain().getEffectiveTriple();
 
+  AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA);
+
   CmdArgs.push_back("-Bstatic");
 
   if (TC.getTriple().isRISCV() && Args.hasArg(options::OPT_mno_relax))
@@ -582,48 +471,19 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Arch == llvm::Triple::aarch64_be ? "-EB" : "-EL");
   }
 
-  bool NeedCRTs =
-      !Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles);
-
-  const char *CRTBegin, *CRTEnd;
-  if (NeedCRTs) {
-    if (!Args.hasArg(options::OPT_r))
-      CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
-    if (TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) {
-      auto RuntimeLib = TC.GetRuntimeLibType(Args);
-      switch (RuntimeLib) {
-      case (ToolChain::RLT_Libgcc): {
-        CRTBegin = "crtbegin.o";
-        CRTEnd = "crtend.o";
-        break;
-      }
-      case (ToolChain::RLT_CompilerRT): {
-        CRTBegin =
-            TC.getCompilerRTArgString(Args, "crtbegin", ToolChain::FT_Object);
-        CRTEnd =
-            TC.getCompilerRTArgString(Args, "crtend", ToolChain::FT_Object);
-        break;
-      }
-      }
-      CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTBegin)));
-    }
+  if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles,
+                   options::OPT_r)) {
+    CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath("crt0.o")));
   }
 
-  Args.addAllArgs(CmdArgs,
-                  {options::OPT_L, options::OPT_u, options::OPT_T_Group,
-                   options::OPT_s, options::OPT_t, options::OPT_r});
+  Args.addAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
+                            options::OPT_s, options::OPT_t, options::OPT_r});
 
   TC.AddFilePathLibArgs(Args, CmdArgs);
 
   for (const auto &LibPath : TC.getLibraryPaths())
     CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-L", LibPath)));
 
-  if (D.isUsingLTO())
-    addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
-                  D.getLTOMode() == LTOK_Thin);
-
-  AddLinkerInputs(TC, Inputs, Args, CmdArgs, JA);
-
   if (TC.ShouldLinkCXXStdlib(Args)) {
     bool OnlyLibstdcxxStatic = Args.hasArg(options::OPT_static_libstdcxx) &&
                                !Args.hasArg(options::OPT_static);
@@ -636,17 +496,14 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   }
 
   if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
-    CmdArgs.push_back("--start-group");
     AddRunTimeLibs(TC, D, CmdArgs, Args);
+
     CmdArgs.push_back("-lc");
-    if (TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D))
-      CmdArgs.push_back("-lgloss");
-    CmdArgs.push_back("--end-group");
   }
 
-  if ((TC.hasValidGCCInstallation() || detectGCCToolchainAdjacent(D)) &&
-      NeedCRTs)
-    CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd)));
+  if (D.isUsingLTO())
+    addLTOOptions(TC, Args, CmdArgs, Output, Inputs,
+                  D.getLTOMode() == LTOK_Thin);
 
   if (TC.getTriple().isRISCV())
     CmdArgs.push_back("-X");
diff --git a/clang/lib/Driver/ToolChains/BareMetal.h b/clang/lib/Driver/ToolChains/BareMetal.h
index 54805530bae82..f6295bda0a6a2 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.h
+++ b/clang/lib/Driver/ToolChains/BareMetal.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_BAREMETAL_H
 
-#include "ToolChains/Gnu.h"
 #include "clang/Driver/Tool.h"
 #include "clang/Driver/ToolChain.h"
 
@@ -20,7 +19,7 @@ namespace driver {
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
+class LLVM_LIBRARY_VISIBILITY BareMetal : public ToolChain {
 public:
   BareMetal(const Driver &D, const llvm::Triple &Triple,
             const llvm::opt::ArgList &Args);
@@ -36,9 +35,7 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
   Tool *buildStaticLibTool() const override;
 
 public:
-  bool initGCCInstallation(const llvm::Triple &Triple,
-                           const llvm::opt::ArgList &Args);
-  bool hasValidGCCInstallation() const { return IsGCCInstallationValid; }
+  bool useIntegratedAs() const override { return true; }
   bool isBareMetal() const override { return true; }
   bool isCrossCompiling() const override { return true; }
   bool HasNativeLLVMSupport() const override { return true; }
@@ -51,19 +48,15 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
 
   StringRef getOSLibName() const override { return "baremetal"; }
 
-  UnwindTableLevel
-  getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override {
-    return UnwindTableLevel::None;
-  }
-
   RuntimeLibType GetDefaultRuntimeLibType() const override {
     return ToolChain::RLT_CompilerRT;
   }
-
   CXXStdlibType GetDefaultCXXStdlibType() const override {
     return ToolChain::CST_Libcxx;
   }
 
+  const char *getDefaultLinker() const override { return "ld.lld"; }
+
   void
   AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const override;
@@ -74,9 +67,6 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
   void AddClangCXXStdlibIncludeArgs(
       const llvm::opt::ArgList &DriverArgs,
       llvm::opt::ArgStringList &CC1Args) const override;
-  void
-  addLibStdCxxIncludePaths(const llvm::opt::ArgList &DriverArgs,
-                           llvm::opt::ArgStringList &CC1Args) const override;
   std::string computeSysRoot() const override;
   SanitizerMask getSupportedSanitizers() const override;
 
@@ -90,8 +80,6 @@ class LLVM_LIBRARY_VISIBILITY BareMetal : public Generic_ELF {
 
   std::string SysRoot;
 
-  bool IsGCCInstallationValid;
-
   SmallVector<std::string> MultilibMacroDefines;
 };
 
@@ -116,7 +104,7 @@ class LLVM_LIBRARY_VISIBILITY StaticLibTool : public Tool {
 
 class LLVM_LIBRARY_VISIBILITY Linker final : public Tool {
 public:
-  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "linker", TC) {}
+  Linker(const ToolChain &TC) : Tool("baremetal::Linker", "ld.lld", TC) {}
   bool isLinkJob() const override { return true; }
   bool hasIntegratedCPP() const override { return false; }
   void ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
deleted file mode 100755
index b23e55619b2ff..0000000000000
--- a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/bin/aarch64-none-elf-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtbegin.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf/lib/crtend.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld b/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
deleted file mode 100755
index b23e55619b2ff..0000000000000
--- a/clang/test/Driver/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/.keep
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
deleted file mode 100755
index b23e55619b2ff..0000000000000
--- a/clang/test/Driver/Inputs/basic_arm_gcc_tree/bin/armv6m-none-eabi-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o b/clang/test/Driver/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crt0.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtbegin.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi/lib/crtend.o
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld b/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
deleted file mode 100755
index b23e55619b2ff..0000000000000
--- a/clang/test/Driver/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld
+++ /dev/null
@@ -1 +0,0 @@
-#!/bin/true
diff --git a/clang/test/Driver/aarch64-gnutools.c b/clang/test/Driver/aarch64-gnutools.c
deleted file mode 100644
index 0214639ed3804..0000000000000
--- a/clang/test/Driver/aarch64-gnutools.c
+++ /dev/null
@@ -1,4 +0,0 @@
-// RUN: %clang --target=aarch64-none-elf  --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -fno-integrated-as %s -### -c \
-// RUN: 2>&1 | FileCheck %s
-
-// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/aarch64-toolchain-extra.c b/clang/test/Driver/aarch64-toolchain-extra.c
deleted file mode 100644
index a0b5f2902962f..0000000000000
--- a/clang/test/Driver/aarch64-toolchain-extra.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// A basic clang -cc1 command-line, and simple environment check.
-
-// The tests here are similar to those in aarch64-toolchain.c, however
-// these tests need to create symlinks to test directory trees in order to
-// set up the environment and therefore shell support is required.
-// REQUIRES: shell
-// UNSUPPORTED: system-windows
-
-// If there is no GCC install detected then the driver searches for executables
-// and runtime starting from the directory tree above the driver itself.
-// The test below checks that the driver correctly finds the linker and
-// runtime if and only if they exist.
-//
-// RUN: rm -rf %t
-// RUN: mkdir -p %t/aarch64-nogcc/bin
-// RUN: ln -s %clang %t/aarch64-nogcc/bin/clang
-// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/aarch64-none-elf %t/aarch64-nogcc/aarch64-none-elf
-// RUN: ln -s %S/Inputs/basic_aarch64_nogcc_tree/bin/aarch64-none-elf-ld %t/aarch64-nogcc/bin/aarch64-none-elf-ld
-// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --gcc-toolchain=%t/aarch64-nogcc/invalid \
-// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOGCC %s
-
-// RUN: %t/aarch64-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --sysroot=%t/aarch64-nogcc/bin/../aarch64-none-elf \
-// RUN:    --target=aarch64-none-elf --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOGCC %s
-
-// C-AARCH64-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/include"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib/crt0.o"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtbegin.o"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/bin/../aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-AARCH64-BAREMETAL-NOGCC: "{{.*}}/aarch64-nogcc/{{.*}}/aarch64-none-elf/lib/crtend.o"
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
deleted file mode 100644
index 327161b81d9f6..0000000000000
--- a/clang/test/Driver/aarch64-toolchain.c
+++ /dev/null
@@ -1,157 +0,0 @@
-// UNSUPPORTED: system-windows
-
-// Test interaction with -fuse-ld=lld
-// RUN: %clang -### %s -fuse-ld=lld -B%S/Inputs/lld \
-// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=LLD-AARCH64-BAREMETAL %s
-
-// LLD-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
-// LLD-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// LLD-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-AARCH64-BAREMETAL: "-Bstatic" "-EL"
-// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
-// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// LLD-AARCH64-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL %s
-
-// C-AARCH64-BAREMETAL: "-cc1" "-triple" "aarch64-unknown-none-elf"
-// C-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL: "-Bstatic" "-EL"
-// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
-// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=C-AARCH64-BAREMETAL-NOSYSROOT %s
-
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL %s
-
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1/backward"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/8.2.1"
-// CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL: "-Bstatic" "-EL"
-// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
-// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT %s
-
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1/backward"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-LIBCXX %s
-
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include/c++/v1"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-Bstatic" "-EL"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX %s
-
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-EL"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=compiler-rt \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-COMPILER-RT %s
-
-// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
-// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
-// AARCH64-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a"
-// AARCH64-BAREMETAL-COMPILER-RT: "-lc" "-lgloss" "--end-group"
-// AARCH64-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --unwindlib=libunwind \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-UNWINDLIB %s
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=aarch64-none-elf --rtlib=compiler-rt --unwindlib=libunwind \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf 2>&1 \
-// RUN:   | FileCheck -check-prefix=AARCH64-BAREMETAL-UNWINDLIB %s
-
-// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}crt0.o"
-// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtbegin.o"
-// AARCH64-BAREMETAL-UNWINDLIB: "--start-group" "{{.*}}libclang_rt.builtins.a" "--as-needed" "-lunwind" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// AARCH64-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtend.o"
diff --git a/clang/test/Driver/arm-gnutools.c b/clang/test/Driver/arm-gnutools.c
deleted file mode 100644
index 6e107f19dabc5..0000000000000
--- a/clang/test/Driver/arm-gnutools.c
+++ /dev/null
@@ -1,6 +0,0 @@
-// check that gnu assembler is invoked with arm baremetal as well
-
-// RUN: %clang --target=armv6m-none-eabi  --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -fno-integrated-as %s -### -c \
-// RUN: 2>&1 | FileCheck %s
-
-// CHECK: "{{.*}}as{{(.exe)?}}"
diff --git a/clang/test/Driver/arm-toolchain-extra.c b/clang/test/Driver/arm-toolchain-extra.c
deleted file mode 100644
index a04b41c13e95e..0000000000000
--- a/clang/test/Driver/arm-toolchain-extra.c
+++ /dev/null
@@ -1,36 +0,0 @@
-// A basic clang -cc1 command-line, and simple environment check.
-
-// The tests here are similar to those in arm-toolchain.c, however
-// these tests need to create symlinks to test directory trees in order to
-// set up the environment and therefore shell support is required.
-// REQUIRES: shell
-// UNSUPPORTED: system-windows
-
-// If there is no GCC install detected then the driver searches for executables
-// and runtime starting from the directory tree above the driver itself.
-// The test below checks that the driver correctly finds the linker and
-// runtime if and only if they exist.
-//
-// RUN: rm -rf %t
-// RUN: mkdir -p %t/arm-nogcc/bin
-// RUN: ln -s %clang %t/arm-nogcc/bin/clang
-// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/armv6m-none-eabi %t/arm-nogcc/armv6m-none-eabi
-// RUN: ln -s %S/Inputs/basic_arm_nogcc_tree/bin/armv6m-none-eabi-ld %t/arm-nogcc/bin/armv6m-none-eabi-ld
-// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --gcc-toolchain=%t/arm-nogcc/invalid \
-// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
-
-// RUN: %t/arm-nogcc/bin/clang %s -### -no-canonical-prefixes \
-// RUN:    --sysroot=%t/arm-nogcc/bin/../armv6m-none-eabi \
-// RUN:    --target=armv6m-none-eabi --rtlib=libgcc -fuse-ld=ld 2>&1 \
-// RUN:    | FileCheck -check-prefix=C-ARM-BAREMETAL-NOGCC %s
-
-// C-ARM-BAREMETAL-NOGCC: "-internal-isystem" "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/include"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib/crt0.o"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtbegin.o"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/bin/../armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-ARM-BAREMETAL-NOGCC: "{{.*}}/arm-nogcc/{{.*}}/armv6m-none-eabi/lib/crtend.o"
-
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
deleted file mode 100644
index 5368158cdeeda..0000000000000
--- a/clang/test/Driver/arm-toolchain.c
+++ /dev/null
@@ -1,158 +0,0 @@
-// UNSUPPORTED: system-windows
-
-// RUN: %clang -### %s -fuse-ld=lld -B%S/Inputs/lld \
-// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=LLD-ARM-BAREMETAL %s
-
-// LLD-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
-// LLD-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// LLD-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-ARM-BAREMETAL: "-Bstatic" "-EL"
-// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
-// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// LLD-ARM-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL %s
-
-// C-ARM-BAREMETAL: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
-// C-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL: "-Bstatic" "-EL"
-// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
-// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=C-ARM-BAREMETAL-NOSYSROOT %s
-
-// C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
-// C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "--start-group" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL %s
-
-// CXX-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1/backward"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/8.2.1"
-// CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL: "-Bstatic" "-EL"
-// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
-// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libstdc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT %s
-
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1/backward"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-EL"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}.o" "-lstdc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-LIBCXX %s
-
-// CXX-ARM-BAREMETAL-LIBCXX: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include/c++/v1"
-// CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-LIBCXX: "-Bstatic" "-EL"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clangxx -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi -stdlib=libc++ --rtlib=libgcc \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=  2>&1 \
-// RUN:   | FileCheck -check-prefix=CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX %s
-
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-EL"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}.o" "-lc++" "-lm" "--start-group" "-lgcc_s" "-lgcc" "-lc" "-lgloss" "--end-group"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=compiler-rt \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-COMPILER-RT %s
-
-// ARM-BAREMETAL-COMPILER-RT: "{{.*}}crt0.o"
-// ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtbegin.o"
-// ARM-BAREMETAL-COMPILER-RT: "--start-group" "{{.*}}libclang_rt.builtins.a"
-// ARM-BAREMETAL-COMPILER-RT: "-lc" "-lgloss" "--end-group"
-// ARM-BAREMETAL-COMPILER-RT: "{{.*}}clang_rt.crtend.o"
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --unwindlib=libunwind \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-UNWINDLIB %s
-
-// RUN: %clang -### %s -fuse-ld= \
-// RUN:   --target=armv6m-none-eabi --rtlib=compiler-rt --unwindlib=libunwind \
-// RUN:   --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree \
-// RUN:   --sysroot=%S/Inputs/basic_arm_gcc_tree/armv6m-none-eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=ARM-BAREMETAL-UNWINDLIB %s
-
-// ARM-BAREMETAL-UNWINDLIB: "{{.*}}crt0.o"
-// ARM-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtbegin.o"
-// ARM-BAREMETAL-UNWINDLIB: "--start-group" "{{.*}}libclang_rt.builtins.a" "--as-needed" "-lunwind" "--no-as-needed" "-lc" "-lgloss" "--end-group"
-// ARM-BAREMETAL-UNWINDLIB: "{{.*}}clang_rt.crtend.o"
diff --git a/clang/test/Driver/baremetal-multilib.yaml b/clang/test/Driver/baremetal-multilib.yaml
index 1a80c3b4ccfc8..853a4e9e36e43 100644
--- a/clang/test/Driver/baremetal-multilib.yaml
+++ b/clang/test/Driver/baremetal-multilib.yaml
@@ -8,9 +8,8 @@
 # CHECK-SAME: "-internal-isystem" "[[SYSROOT:[^"]*]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/include/c++/v1"
 # CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/include"
 # CHECK-SAME: "-x" "c++" "{{.*}}baremetal-multilib.yaml"
-# CHECK-NEXT: ld{{(.exe)?}}" "-Bstatic"
+# CHECK-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 # CHECK-SAME: "-L[[SYSROOT]]/bin/../lib/clang-runtimes/arm-none-eabi/thumb/v8-m.main/fp/lib"
-# CHECK-SAME: "{{.*}}.o"
 # CHECK-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 # CHECK-SAME: "-lc"
 # CHECK-SAME: "-o" "{{.*}}.tmp.out"
diff --git a/clang/test/Driver/baremetal-sysroot.cpp b/clang/test/Driver/baremetal-sysroot.cpp
index 47f0616df8501..5d5b336a01b0b 100644
--- a/clang/test/Driver/baremetal-sysroot.cpp
+++ b/clang/test/Driver/baremetal-sysroot.cpp
@@ -9,17 +9,15 @@
 // RUN: mkdir -p %T/baremetal_default_sysroot/lib/clang-runtimes/armv6m-none-eabi
 // RUN: ln -s %clang %T/baremetal_default_sysroot/bin/clang
 
-// RUN: %T/baremetal_default_sysroot/bin/clang -no-canonical-prefixes %s -### -o %t.out 2>&1 \
+// RUN: %T/baremetal_default_sysroot/bin/clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target armv6m-none-eabi --sysroot= \
 // RUN:   | FileCheck --check-prefix=CHECK-V6M-C %s
 // CHECK-V6M-C: "{{.*}}clang{{.*}}" "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // CHECK-V6M-C-SAME: "-internal-isystem" "{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-C-SAME: "-internal-isystem" "{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}include"
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal-sysroot.cpp"
-// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "-Bstatic"
-// CHECK-V6M-C-SAME: "crt0.o"
+// CHECK-V6M-C-NEXT: "{{[^"]*}}ld{{(\.(lld|bfd|gold))?}}{{(\.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-V6M-C-SAME: "-L{{.*}}/baremetal_default_sysroot{{[/\\]+}}bin{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+}}armv6m-none-eabi{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "{{.*}}.o"
 // CHECK-V6M-C-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-C-SAME: "-lc"
-// CHECK-V6M-C-SAME: "-o" "{{.*}}.tmp.out"
+// CHECK-V6M-C-SAME: "-o" "{{.*}}.o"
diff --git a/clang/test/Driver/baremetal-undefined-symbols.c b/clang/test/Driver/baremetal-undefined-symbols.c
deleted file mode 100644
index bff58c7c54c33..0000000000000
--- a/clang/test/Driver/baremetal-undefined-symbols.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// Check the arguments are correctly passed
-
-// Make sure -T is the last with gcc-toolchain option
-// RUN: %clang -### --target=riscv32 --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-LD %s
-// CHECK-LD: {{.*}} "--defsym=FOO=10" {{.*}} "-u" "foo" {{.*}} "-T" "a.lds"
-
-// TODO: Merge this test with the above in the last patch when finally integrating riscv
-// Make sure -T is the last with gcc-toolchain option
-// RUN: %clang -### --target=aarch64-none-elf --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-ARM-LD %s
-// RUN: %clang -### --target=armv6m-none-eabi --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=CHECK-ARM-LD %s
-// CHECK-ARM-LD: {{.*}} "-T" "a.lds" "-u" "foo" {{.*}} "--defsym=FOO=10"
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index b75f1a9280d12..a80aa9b437117 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -15,12 +15,11 @@
 // CHECK-V6M-C-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-C-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-C-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-C-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-V6M-C-SAME: "{{.*}}.o"
-// CHECK-V6M-C-SAME: {{[^"]*}}libclang_rt.builtins.a"
+// CHECK-V6M-C-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-C-SAME: "-lc"
 // CHECK-V6M-C-SAME: "--target2=rel" "-o" "{{.*}}.tmp.out"
 
@@ -40,10 +39,9 @@
 // CHECK-V6M-TREE-SAME: {{^}} "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-TREE-SAME: "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}armv6m-unknown-none-eabi"
 // CHECK-V6M-TREE-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-TREE-SAME: "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi{{[/\\]+}}crt0.o"
 // CHECK-V6M-TREE-SAME: "-L[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi"
-// CHECK-V6M-TREE-SAME "{{.*}}.o"
 // CHECK-V6M-TREE-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-TREE-SAME: "-lc"
 // CHECK-V6M-TREE-SAME: "--target2=rel" "-o" "{{.*}}.tmp.out"
@@ -55,21 +53,19 @@
 // CHECK-ARMV7M-PER-TARGET: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-ARMV7M-PER-TARGET: "-isysroot" "[[SYSROOT:[^"]*]]"
 // CHECK-ARMV7M-PER-TARGET: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-ARMV7M-PER_TARGET: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-ARMV7M-PER-TARGET: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
 // CHECK-ARMV7M-PER-TARGET: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}armv7m-vendor-none-eabi
-// CHECK-ARMV7M-PER-TARGET: "{{.*}}.o"
 // CHECK-ARMV7M-PER-TARGET: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-ARMV7M-PER-TARGET: "-lc"
 
 // RUN: %clangxx %s -### --target=armv6m-none-eabi 2>&1 \
 // RUN:     --sysroot=%S/Inputs/baremetal_arm | FileCheck --check-prefix=CHECK-V6M-DEFAULTCXX %s
 // CHECK-V6M-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-DEFAULTCXX-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-DEFAULTCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lc++"
 // CHECK-V6M-DEFAULTCXX-SAME: "-lm"
 // CHECK-V6M-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
@@ -81,9 +77,8 @@
 // CHECK-V6M-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-V6M-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-SAME: "-lc++"
 // CHECK-V6M-LIBCXX-SAME: "-lm"
 // CHECK-V6M-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
@@ -97,9 +92,8 @@
 // CHECK-V6M-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}6.0.0"
-// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
-// CHECK-V6M-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-V6M-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lc"
@@ -110,7 +104,7 @@
 // RUN:     -nodefaultlibs \
 // RUN:   | FileCheck --check-prefix=CHECK-V6M-NDL %s
 // CHECK-V6M-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-V6M-NDL: ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-V6M-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-V6M-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 
 // RUN: rm -rf %T/baremetal_cxx_sysroot
@@ -125,7 +119,6 @@
 // CHECK-V6M-LIBCXX-USR-SAME: "-internal-isystem" "{{[^"]+}}baremetal_cxx_sysroot{{[/\\]+}}usr{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBCXX-USR: "{{[^"]*}}-Bstatic"
 // CHECK-V6M-LIBCXX-USR-SAME: "-L{{[^"]*}}{{[/\\]+}}baremetal_cxx_sysroot{{[/\\]+}}lib"
-// CHECK-V6M-LIBCXX-USR: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc++" "-lm"
 // CHECK-V6M-LIBCXX-USR-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-V6M-LIBCXX-USR-SAME: "-lc"
@@ -156,7 +149,7 @@
 
 // RUN: %clang -### %s --target=armebv7-none-eabi --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
-// CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "--be8" "-EB"
+// CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "--be8" "-EB"
 
 // RUN: %clang -### %s --target=armv7-none-eabi -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
@@ -166,7 +159,7 @@
 
 // RUN: %clang -### %s --target=armv7-none-eabi --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EL %s
-// CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-ARMV7EL-NOT: "--be8"
 
 // RUN: %clang -### %s --target=armebv7-none-eabi -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -177,7 +170,7 @@
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64BE %s
-// CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EB"
+// CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EB"
 // CHECK-AARCH64BE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64-none-elf -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -188,7 +181,7 @@
 
 // RUN: %clang -### %s --target=aarch64-none-elf --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64LE %s
-// CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}" "-Bstatic" "-EL"
+// CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic" "-EL"
 // CHECK-AARCH64LE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -203,22 +196,6 @@
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-AARCH64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
 
-// RUN: %clang -no-canonical-prefixes %s -### --target=riscv32-unknown-elf 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-RISCV32-NO-HOST-INC %s
-// CHECK-RISCV32-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
-// CHECK-RISCV32-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
-// CHECK-RISCV32-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-
-// RUN: %clang -no-canonical-prefixes %s -### --target=riscv64-unknown-elf 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-RISCV64-NO-HOST-INC %s
-// CHECK-RISCV64-NO-HOST-INC: InstalledDir: [[INSTALLEDDIR:.+]]
-// CHECK-RISCV64-NO-HOST-INC: "-resource-dir" "[[RESOURCE:[^"]+]]"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
-// CHECK-RISCV64-NO-HOST-INC-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-
 // RUN: %clang %s -### --target=riscv64-unknown-elf -o %t.out -L some/directory/user/asked/for \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64 %s
@@ -228,10 +205,9 @@
 // CHECK-RV64-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-RV64-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV64-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV64-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-RV64-SAME:"{{.*}}.o"
 // CHECK-RV64-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-SAME: "-lc"
 // CHECK-RV64-SAME: "-X" "-o" "{{.*}}.tmp.out"
@@ -240,9 +216,8 @@
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-DEFAULTCXX %s
 // CHECK-RV64-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-DEFAULTCXX-SAME:"{{.*}}.o"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc"
@@ -255,9 +230,8 @@
 // CHECK-RV64-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-RV64-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-LIBCXX-SAME:"{{.*}}.o"
 // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBCXX-SAME: "-lc"
@@ -270,9 +244,8 @@
 // CHECK-RV64-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV64-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
-// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV64-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV64-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lc"
@@ -288,10 +261,9 @@
 // CHECK-RV32-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}include"
 // CHECK-RV32-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-SAME: "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-RV32-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
-// CHECK-RV32-SAME: "{{.*}}.o"
 // CHECK-RV32-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-SAME: "-lc"
 // CHECK-RV32-SAME: "-X" "-o" "a.out"
@@ -300,9 +272,8 @@
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
 // RUN:   | FileCheck --check-prefix=CHECK-RV32-DEFAULTCXX %s
 // CHECK-RV32-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32-DEFAULTCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-DEFAULTCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc"
@@ -315,9 +286,8 @@
 // CHECK-RV32-LIBCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-LIBCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}{{[^v].*}}"
 // CHECK-RV32-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
-// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32-LIBCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-LIBCXX-SAME: "-X" "-o" "a.out"
@@ -329,13 +299,11 @@
 // CHECK-RV32-LIBSTDCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-RV32-LIBSTDCXX-NOT: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}8.0.1"
-// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32-LIBSTDCXX: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv32_tree{{[/\\]+}}riscv32-unknown-elf{{[/\\]+}}lib"
-// CHECK-RV32-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
-// CHECK-RV32-LIBSTDCXX-SAME: "-lc"
-// CHECK-RV32-LIBSTDCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-LIBSTDCXX-SAME: "-lc" "-X" "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     -nostdlibinc -nobuiltininc \
@@ -352,7 +320,7 @@
 // RUN:     -nodefaultlibs \
 // RUN:   | FileCheck --check-prefix=CHECK-RV64-NDL %s
 // CHECK-RV64-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-RV64-NDL: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64-NDL: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}basic_riscv64_tree{{[/\\]+}}riscv64-unknown-elf{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
@@ -371,7 +339,7 @@
 // CHECK-RV64FD-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV64FD-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}include"
 // CHECK-RV64FD-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV64FD-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV64FD-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv64imafdc{{[/\\]+}}lp64d{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -390,7 +358,7 @@
 // CHECK-RV32I-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32I-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32I-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32I-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32I-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32I-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32i{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -409,7 +377,7 @@
 // CHECK-RV32IM-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IM-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32IM-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32IM-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IM-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32im{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
@@ -423,7 +391,7 @@
 // CHECK-RV32IAC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IAC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}include"
 // CHECK-RV32IAC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32IAC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IAC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32iac{{[/\\]+}}ilp32{{[/\\]+}}lib"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf -march=rv32imafc -mabi=ilp32f \
@@ -444,7 +412,7 @@
 // CHECK-RV32IMAFC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-RV32IMAFC-SAME: "-internal-isystem" "[[SYSROOT]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}include"
 // CHECK-RV32IMAFC-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
-// CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-RV32IMAFC-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-RV32IMAFC-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}rv32imafc{{[/\\]+}}ilp32f{{[/\\]+}}lib"
 
 // RUN: %clang -no-canonical-prefixes %s -### --target=powerpc-unknown-eabi 2>&1 \
@@ -455,9 +423,8 @@
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPCEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-PPCEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPCEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPCEABI-SAME:"{{.*}}.o"
 // CHECK-PPCEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPCEABI-SAME: "-lc"
 // CHECK-PPCEABI-SAME: "-o" "a.out"
@@ -470,9 +437,8 @@
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPC64EABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-PPC64EABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPC64EABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPC64EABI-SAME:"{{.*}}.o"
 // CHECK-PPC64EABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPC64EABI-SAME: "-lc"
 // CHECK-PPC64EABI-SAME: "-o" "a.out"
@@ -485,9 +451,8 @@
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPCLEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-PPCLEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPCLEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPCLEEABI-SAME:"{{.*}}.o"
 // CHECK-PPCLEEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPCLEEABI-SAME: "-lc"
 // CHECK-PPCLEEABI-SAME: "-o" "a.out"
@@ -500,9 +465,8 @@
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[RESOURCE]]{{[/\\]+}}include"
 // CHECK-PPC64LEEABI-SAME: "-internal-isystem" "[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}include"
-// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "-Bstatic"
+// CHECK-PPC64LEEABI-NEXT: ld{{(.exe)?}}" "{{.*}}.o" "-Bstatic"
 // CHECK-PPC64LEEABI-SAME: "-L[[INSTALLEDDIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}clang-runtimes{{[/\\]+[^"]*}}lib"
-// CHECK-PPC64LEEABI-SAME:"{{.*}}.o"
 // CHECK-PPC64LEEABI-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-PPC64LEEABI-SAME: "-lc"
 // CHECK-PPC64LEEABI-SAME: "-o" "a.out"
diff --git a/clang/test/Driver/check-no-multlib-warning.c b/clang/test/Driver/check-no-multlib-warning.c
deleted file mode 100644
index 9a0d7cee450a3..0000000000000
--- a/clang/test/Driver/check-no-multlib-warning.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// UNSUPPORTED: system-windows
-
-
-// RUN: %clang --target=armv6m-none-eabi --gcc-toolchain=%S/Inputs/basic_arm_gcc_tree -### 2>&1 | FileCheck %s
-// RUN: %clang --target=aarch64-none-elf --gcc-toolchain=%S/Inputs/basic_aarch64_gcc_tree -### 2>&1 | FileCheck %s
-// RUN: %clang --target=riscv32-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv32_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
-// RUN: %clang --target=riscv64-unknown-elf --gcc-toolchain=%S/Inputs/basic_riscv64_tree -### 2>&1 | FileCheck --check-prefix=NOCHECK %s
-
-// CHECK: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets
-// NOCHECK-NOT: warning: no multilib structure encoded for Arm, Aarch64 and PPC targets
diff --git a/clang/test/Driver/riscv-args.c b/clang/test/Driver/riscv-args.c
new file mode 100644
index 0000000000000..cab08e5b0f811
--- /dev/null
+++ b/clang/test/Driver/riscv-args.c
@@ -0,0 +1,6 @@
+// Check the arguments are correctly passed
+
+// Make sure -T is the last with gcc-toolchain option
+// RUN: %clang -### --target=riscv32 --gcc-toolchain= -Xlinker --defsym=FOO=10 -T a.lds -u foo %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-LD %s
+// CHECK-LD: {{.*}} "--defsym=FOO=10" {{.*}} "-u" "foo" {{.*}} "-T" "a.lds"
diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index d2e4877e89d78..befd322d027c9 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -1033,7 +1033,7 @@
 // RUN:     --target=riscv32-unknown-elf -fuse-ld=ld \
 // RUN:   | %{filecheck} --check-prefix=CHECK-SHADOWCALLSTACK-ELF-RISCV32
 // CHECK-SHADOWCALLSTACK-ELF-RISCV32-NOT: error:
-// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
+// CHECK-SHADOWCALLSTACK-ELF-RISCV32: "{{(.*[^-.0-9A-Z_a-z])?}}ld.lld{{(.exe)?}}"
 
 // RUN: %clang -fsanitize=shadow-call-stack -### %s 2>&1 \
 // RUN:     --target=riscv64-unknown-linux -fuse-ld=ld \

From a5a0d880736f5dc6a566374bc3b3ca0d86901510 Mon Sep 17 00:00:00 2001
From: Sam Clegg <sbc@chromium.org>
Date: Tue, 17 Jun 2025 14:07:16 -0700
Subject: [PATCH 809/851] [libc++] Remove trailing newline from
 _LIBCPP_ASSERTION_HANDLER calls (#143573)

This newline was originally added in https://reviews.llvm.org/D142184
but I think updating `__libcpp_verbose_abort` to add newline instead is
more consistent, and works for other callers of `_LIBCPP_VERBOSE_ABORT`.

The `_LIBCPP_ASSERTION_HANDLER` calls through to either
`_LIBCPP_VERBOSE_ABORT` macro or the `__builtin_verbose_trap`. From what
I can tell neither of these function expect a trailing newline (at least
none of the usage of `_LIBCPP_VERBOSE_ABORT` or `__builtin_verbose_trap`
that I can find include a trailing newline except `_LIBCPP_ASSERTION_HANDLER`).

I noticed this discrepancy when working on
https://github.com/emscripten-core/emscripten/pull/24543
---
 libcxx/include/__assert               | 4 ++--
 libcxx/src/verbose_abort.cpp          | 3 +++
 libcxx/test/support/check_assertion.h | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/libcxx/include/__assert b/libcxx/include/__assert
index 90eaa6023587b..1bfed2890b79f 100644
--- a/libcxx/include/__assert
+++ b/libcxx/include/__assert
@@ -20,8 +20,8 @@
 #define _LIBCPP_ASSERT(expression, message)                                                                            \
   (__builtin_expect(static_cast<bool>(expression), 1)                                                                  \
        ? (void)0                                                                                                       \
-       : _LIBCPP_ASSERTION_HANDLER(__FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(            \
-             expression) " failed: " message "\n"))
+       : _LIBCPP_ASSERTION_HANDLER(                                                                                    \
+             __FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(expression) " failed: " message))
 
 // WARNING: __builtin_assume can currently inhibit optimizations. Only add assumptions with a clear
 // optimization intent. See https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609 for a
diff --git a/libcxx/src/verbose_abort.cpp b/libcxx/src/verbose_abort.cpp
index 94bdb451dee7a..efb7b9be6f61c 100644
--- a/libcxx/src/verbose_abort.cpp
+++ b/libcxx/src/verbose_abort.cpp
@@ -30,6 +30,9 @@ _LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) noexcept {
     va_list list;
     va_start(list, format);
     std::vfprintf(stderr, format, list);
+    // Callers of `__libcpp_verbose_abort` do not include a newline but when
+    // writing the message to stderr we need to include one.
+    std::fputc('\n', stderr);
     va_end(list);
   }
 
diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index a279400d651b4..ea04944ea9326 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -340,7 +340,7 @@ void std::__libcpp_verbose_abort(char const* format, ...) noexcept {
 
   std::fprintf(stderr, "%s\n", Marker);
   std::vfprintf(stderr, format, args);
-  std::fprintf(stderr, "%s", Marker);
+  std::fprintf(stderr, "\n%s", Marker);
 
   va_end(args);
 

From 844e41c2acedd5219d9363e38838abd5146f63c0 Mon Sep 17 00:00:00 2001
From: sribee8 <sriya.pratipati@gmail.com>
Date: Tue, 17 Jun 2025 14:12:35 -0700
Subject: [PATCH 810/851] [libc] Moved shared constexpr to the top (#144569)

Some conversions shared constexpr so moved to the top.

---------

Co-authored-by: Sriya Pratipati <sriyap@google.com>
---
 .../src/__support/wchar/character_converter.cpp | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3b9046dfb9a76..5ab0447bb08b2 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -19,6 +19,13 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
+// This is for utf-8 bytes other than the first byte
+constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
+// The number of bits per utf-8 byte that actually encode character
+// Information not metadata (# of bits excluding the byte headers)
+constexpr uint32_t MASK_ENCODED_BITS =
+    mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+
 CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
 void CharacterConverter::clear() {
@@ -61,10 +68,8 @@ int CharacterConverter::push(char8_t utf8_byte) {
   }
   // Any subsequent push
   // Adding 6 more bits so need to left shift
-  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
   if (num_ones == 1 && !isComplete()) {
-    char32_t byte =
-        utf8_byte & mask_trailing_ones<uint32_t, ENCODED_BITS_PER_UTF8>();
+    char32_t byte = utf8_byte & MASK_ENCODED_BITS;
     state->partial = state->partial << ENCODED_BITS_PER_UTF8;
     state->partial |= byte;
     state->bytes_processed++;
@@ -117,12 +122,6 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
   constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
   constexpr char8_t CONTINUING_BYTE_HEADER = 0x80;
 
-  // the number of bits per utf-8 byte that actually encode character
-  // information not metadata (# of bits excluding the byte headers)
-  constexpr size_t ENCODED_BITS_PER_UTF8 = 6;
-  constexpr int MASK_ENCODED_BITS =
-      mask_trailing_ones<unsigned int, ENCODED_BITS_PER_UTF8>();
-
   char32_t output;
 
   // Shift to get the next 6 bits from the utf32 encoding

From 6fb36db4818abde56e5da47899dcdaacd8293903 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Tue, 17 Jun 2025 16:16:37 -0500
Subject: [PATCH 811/851] [LinkerWrapper] Fix 'save-temps' when targeting
 SPIR-V (#144605)

Summary:
The logic here is flawed, it was only intended to apply to the CPU case
where we use the linker passed in on the command line. This was falsely
applying to SPIR-V which caused issues.
---
 clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 7a1007d03737e..0f1fa8b329fd6 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -268,7 +268,8 @@ Expected<std::string> findProgram(StringRef Name, ArrayRef<StringRef> Paths) {
 bool linkerSupportsLTO(const ArgList &Args) {
   llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
   return Triple.isNVPTX() || Triple.isAMDGPU() ||
-         Args.getLastArgValue(OPT_linker_path_EQ).ends_with("lld");
+         (!Triple.isGPU() &&
+          Args.getLastArgValue(OPT_linker_path_EQ).ends_with("lld"));
 }
 
 /// Returns the hashed value for a constant string.

From 362b9d78b4ee9107da2b5e90b3764b0f0fa610fe Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Tue, 17 Jun 2025 14:42:06 -0700
Subject: [PATCH 812/851] [lldb-dap] Refactoring DebugCommunication to improve
 test consistency. (#143818)

In DebugCommunication, we currently are using 2 thread to drive
lldb-dap. At the moment, they make an attempt at only synchronizing the
`recv_packets` between the reader thread and the main test thread. Other
stateful properties of the debug session are not guarded by a
locks/mutex.

To mitigate this, I am moving any state updates to the main thread
inside the `_recv_packet` method to ensure that between calls to
`_recv_packet` the state does not change out from under us in a test.

This does mean the precise timing of events has changed slightly as a
result and I've updated the existing tests that fail for me locally with
this new behavior.

I think this should result in overall more predictable behavior, even if
the test is slow due to the host workload or architecture differences.

---------

Co-authored-by: Ebuka Ezike <yerimyah1@gmail.com>
---
 .../test/tools/lldb-dap/dap_server.py         | 875 +++++++++++-------
 .../test/tools/lldb-dap/lldbdap_testcase.py   |  79 +-
 .../breakpoint/TestDAP_setBreakpoints.py      |   5 +-
 .../tools/lldb-dap/cancel/TestDAP_cancel.py   |  10 +-
 .../tools/lldb-dap/launch/TestDAP_launch.py   |  12 +-
 .../tools/lldb-dap/module/TestDAP_module.py   |   2 +-
 .../tools/lldb-dap/output/TestDAP_output.py   |   4 +-
 7 files changed, 590 insertions(+), 397 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 6d32491eaa5e9..23178a215206e 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -10,17 +10,124 @@
 import subprocess
 import signal
 import sys
+from dataclasses import dataclass
 import threading
 import time
-from typing import Any, Optional, Union, BinaryIO, TextIO
+from typing import (
+    IO,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+    Generic,
+    TypedDict,
+    Union,
+    BinaryIO,
+    TextIO,
+    Literal,
+    cast,
+)
 
 ## DAP type references
-Event = dict[str, Any]
-Request = dict[str, Any]
-Response = dict[str, Any]
+
+T = TypeVar("T")
+Te = TypeVar("Te")  # Generic type for event body
+Ta = TypeVar("Ta")  # Generic type for request arguments
+Tb = TypeVar("Tb")  # Generic type for response body
+
+
+class Event(Generic[Te], TypedDict):
+    type: Literal["event"]
+    seq: int
+    event: str
+    body: Optional[Te]
+
+
+class Request(Generic[Ta], TypedDict, total=False):
+    type: Literal["request"]
+    seq: int
+    command: str
+    arguments: Ta
+
+
+class Response(Generic[Tb], TypedDict):
+    type: Literal["response"]
+    seq: int
+    request_seq: int
+    success: bool
+    command: str
+    message: Optional[str]
+    body: Optional[Tb]
+
+
 ProtocolMessage = Union[Event, Request, Response]
 
 
+class AttachOrLaunchArguments(TypedDict, total=False):
+    stopOnEntry: bool
+    disableASLR: bool
+    disableSTDIO: bool
+    enableAutoVariableSummaries: bool
+    displayExtendedBacktrace: bool
+    enableSyntheticChildDebugging: bool
+    initCommands: List[str]
+    preRunCommands: List[str]
+    postRunCommands: List[str]
+    stopCommands: List[str]
+    exitCommands: List[str]
+    terminateCommands: List[str]
+    sourceMap: Union[List[Tuple[str, str]], Dict[str, str]]
+    sourcePath: str
+    debuggerRoot: str
+    commandEscapePrefix: str
+    customFrameFormat: str
+    customThreadFormat: str
+
+
+class LaunchArguments(AttachOrLaunchArguments, total=False):
+    program: str
+    args: List[str]
+    cwd: str
+    env: Dict[str, str]
+    shellExpandArguments: bool
+    runInTerminal: bool
+    launchCommands: List[str]
+
+
+# Using the function form of TypedDict to allow for hyphenated keys.
+AttachGdbServer = TypedDict(
+    "AttachGdbServer", {"gdb-remote-port": int, "gdb-remote-hostname": str}, total=False
+)
+
+
+class AttachArguments(AttachGdbServer, AttachOrLaunchArguments, total=False):
+    program: str
+    pid: int
+    waitFor: bool
+    attachCommands: List[str]
+    coreFile: str
+
+
+class BreakpointData(TypedDict, total=False):
+    column: int
+    condition: str
+    hitCondition: str
+    logMessage: str
+    mode: str
+
+
+class SourceBreakpoint(BreakpointData):
+    line: int
+
+
+class Breakpoint(TypedDict, total=False):
+    id: int
+    verified: bool
+
+
 def dump_memory(base_addr, data, num_per_line, outfile):
     data_len = len(data)
     hex_string = binascii.hexlify(data)
@@ -58,7 +165,9 @@ def dump_memory(base_addr, data, num_per_line, outfile):
         outfile.write("\n")
 
 
-def read_packet(f, verbose=False, trace_file=None):
+def read_packet(
+    f: IO[bytes], trace_file: Optional[IO[str]] = None
+) -> Optional[ProtocolMessage]:
     """Decode a JSON packet that starts with the content length and is
     followed by the JSON bytes from a file 'f'. Returns None on EOF.
     """
@@ -70,32 +179,20 @@ def read_packet(f, verbose=False, trace_file=None):
     prefix = "Content-Length: "
     if line.startswith(prefix):
         # Decode length of JSON bytes
-        if verbose:
-            print('content: "%s"' % (line))
         length = int(line[len(prefix) :])
-        if verbose:
-            print('length: "%u"' % (length))
         # Skip empty line
-        line = f.readline()
-        if verbose:
-            print('empty: "%s"' % (line))
+        line = f.readline().decode()
         # Read JSON bytes
         json_str = f.read(length)
-        if verbose:
-            print('json: "%s"' % (json_str))
         if trace_file:
-            trace_file.write("from adapter:\n%s\n" % (json_str))
+            trace_file.write(f"from adapter:\n{json_str!r}\n")
         # Decode the JSON bytes into a python dictionary
         return json.loads(json_str)
 
     raise Exception("unexpected malformed message from lldb-dap: " + line)
 
 
-def packet_type_is(packet, packet_type):
-    return "type" in packet and packet["type"] == packet_type
-
-
-def dump_dap_log(log_file):
+def dump_dap_log(log_file: Optional[str]) -> None:
     print("========= DEBUG ADAPTER PROTOCOL LOGS =========", file=sys.stderr)
     if log_file is None:
         print("no log file available", file=sys.stderr)
@@ -105,34 +202,30 @@ def dump_dap_log(log_file):
     print("========= END =========", file=sys.stderr)
 
 
-class Source(object):
+@dataclass
+class Source:
+    path: Optional[str]
+    source_reference: Optional[int]
+
+    @property
+    def name(self) -> Optional[str]:
+        if not self.path:
+            return None
+        return os.path.basename(self.path)
+
     def __init__(
         self, path: Optional[str] = None, source_reference: Optional[int] = None
     ):
-        self._name = None
-        self._path = None
-        self._source_reference = None
-
-        if path is not None:
-            self._name = os.path.basename(path)
-            self._path = path
-        elif source_reference is not None:
-            self._source_reference = source_reference
-        else:
+        if path is None and source_reference is None:
             raise ValueError("Either path or source_reference must be provided")
 
-    def __str__(self):
-        return f"Source(name={self.name}, path={self.path}), source_reference={self.source_reference})"
+        self.path = path
+        self.source_reference = source_reference
 
-    def as_dict(self):
-        source_dict = {}
-        if self._name is not None:
-            source_dict["name"] = self._name
-        if self._path is not None:
-            source_dict["path"] = self._path
-        if self._source_reference is not None:
-            source_dict["sourceReference"] = self._source_reference
-        return source_dict
+    def to_DAP(self) -> dict:
+        if self.path:
+            return {"path": self.path, "name": self.name}
+        return {"sourceReference": self.source_reference}
 
 
 class NotSupportedError(KeyError):
@@ -144,7 +237,7 @@ def __init__(
         self,
         recv: BinaryIO,
         send: BinaryIO,
-        init_commands: list[str],
+        init_commands: List[str],
         log_file: Optional[TextIO] = None,
     ):
         # For debugging test failures, try setting `trace_file = sys.stderr`.
@@ -152,35 +245,50 @@ def __init__(
         self.log_file = log_file
         self.send = send
         self.recv = recv
-        self.recv_packets: list[Optional[ProtocolMessage]] = []
-        self.recv_condition = threading.Condition()
-        self.recv_thread = threading.Thread(target=self._read_packet_thread)
-        self.process_event_body = None
-        self.exit_status: Optional[int] = None
-        self.capabilities: dict[str, Any] = {}
-        self.progress_events: list[Event] = []
-        self.reverse_requests = []
-        self.sequence = 1
-        self.threads = None
-        self.thread_stop_reasons = {}
-        self.recv_thread.start()
-        self.output_condition = threading.Condition()
-        self.output: dict[str, list[str]] = {}
-        self.configuration_done_sent = False
-        self.initialized = False
-        self.frame_scopes = {}
+        # Packets that have been received and processed but have not yet been
+        # requested by a test case.
+        self._pending_packets: List[Optional[ProtocolMessage]] = []
+        # Received packets that have not yet been processed.
+        self._recv_packets: List[Optional[ProtocolMessage]] = []
+        # Used as a mutex for _recv_packets and for notify when _recv_packets
+        # changes.
+        self._recv_condition = threading.Condition()
+        self._recv_thread = threading.Thread(target=self._read_packet_thread)
+
+        # session state
         self.init_commands = init_commands
-        self.resolved_breakpoints = {}
+        self.exit_status: Optional[int] = None
+        self.capabilities: Optional[Dict] = None
+        self.initialized: bool = False
+        self.configuration_done_sent: bool = False
+        self.process_event_body: Optional[Dict] = None
+        self.terminated: bool = False
+        self.events: List[Event] = []
+        self.progress_events: List[Event] = []
+        self.reverse_requests: List[Request] = []
+        self.module_events: List[Dict] = []
+        self.sequence: int = 1
+        self.output: Dict[str, str] = {}
+
+        # debuggee state
+        self.threads: Optional[dict] = None
+        self.thread_stop_reasons: Dict[str, Any] = {}
+        self.frame_scopes: Dict[str, Any] = {}
+        # keyed by breakpoint id
+        self.resolved_breakpoints: Dict[str, bool] = {}
+
+        # trigger enqueue thread
+        self._recv_thread.start()
 
     @classmethod
     def encode_content(cls, s: str) -> bytes:
         return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8")
 
     @classmethod
-    def validate_response(cls, command, response):
-        if command["command"] != response["command"]:
+    def validate_response(cls, request: Request, response: Response) -> None:
+        if request["command"] != response["command"]:
             raise ValueError("command mismatch in response")
-        if command["seq"] != response["request_seq"]:
+        if request["seq"] != response["request_seq"]:
             raise ValueError("seq mismatch in response")
 
     def _read_packet_thread(self):
@@ -189,262 +297,323 @@ def _read_packet_thread(self):
             while not done:
                 packet = read_packet(self.recv, trace_file=self.trace_file)
                 # `packet` will be `None` on EOF. We want to pass it down to
-                # handle_recv_packet anyway so the main thread can handle unexpected
-                # termination of lldb-dap and stop waiting for new packets.
+                # handle_recv_packet anyway so the main thread can handle
+                # unexpected termination of lldb-dap and stop waiting for new
+                # packets.
                 done = not self._handle_recv_packet(packet)
         finally:
             dump_dap_log(self.log_file)
 
-    def get_modules(self):
-        module_list = self.request_modules()["body"]["modules"]
-        modules = {}
-        for module in module_list:
-            modules[module["name"]] = module
-        return modules
+    def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool:
+        """Handles an incoming packet.
 
-    def get_output(self, category, timeout=0.0, clear=True):
-        self.output_condition.acquire()
-        output = None
-        if category in self.output:
-            output = self.output[category]
-            if clear:
-                del self.output[category]
-        elif timeout != 0.0:
-            self.output_condition.wait(timeout)
-            if category in self.output:
-                output = self.output[category]
-                if clear:
-                    del self.output[category]
-        self.output_condition.release()
-        return output
+        Called by the read thread that is waiting for all incoming packets
+        to store the incoming packet in "self._recv_packets" in a thread safe
+        way. This function will then signal the "self._recv_condition" to
+        indicate a new packet is available.
 
-    def collect_output(self, category, timeout_secs, pattern, clear=True):
-        end_time = time.time() + timeout_secs
-        collected_output = ""
-        while end_time > time.time():
-            output = self.get_output(category, timeout=0.25, clear=clear)
-            if output:
-                collected_output += output
-                if pattern is not None and pattern in output:
-                    break
-        return collected_output if collected_output else None
-
-    def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]):
-        self.recv_condition.acquire()
-        self.recv_packets.append(packet)
-        self.recv_condition.notify()
-        self.recv_condition.release()
+        Args:
+            packet: A new packet to store.
 
-    def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool:
-        """Called by the read thread that is waiting for all incoming packets
-        to store the incoming packet in "self.recv_packets" in a thread safe
-        way. This function will then signal the "self.recv_condition" to
-        indicate a new packet is available. Returns True if the caller
-        should keep calling this function for more packets.
+        Returns:
+            True if the caller should keep calling this function for more
+            packets.
         """
-        # If EOF, notify the read thread by enqueuing a None.
-        if not packet:
-            self._enqueue_recv_packet(None)
-            return False
-
-        # Check the packet to see if is an event packet
-        keepGoing = True
-        packet_type = packet["type"]
-        if packet_type == "event":
-            event = packet["event"]
-            body = None
-            if "body" in packet:
-                body = packet["body"]
-            # Handle the event packet and cache information from these packets
-            # as they come in
-            if event == "output":
-                # Store any output we receive so clients can retrieve it later.
-                category = body["category"]
-                output = body["output"]
-                self.output_condition.acquire()
-                if category in self.output:
-                    self.output[category] += output
-                else:
-                    self.output[category] = output
-                self.output_condition.notify()
-                self.output_condition.release()
-                # no need to add 'output' event packets to our packets list
-                return keepGoing
-            elif event == "initialized":
-                self.initialized = True
-            elif event == "process":
-                # When a new process is attached or launched, remember the
-                # details that are available in the body of the event
-                self.process_event_body = body
-            elif event == "exited":
-                # Process exited, mark the status to indicate the process is not
-                # alive.
-                self.exit_status = body["exitCode"]
-            elif event == "continued":
-                # When the process continues, clear the known threads and
-                # thread_stop_reasons.
-                all_threads_continued = body.get("allThreadsContinued", True)
-                tid = body["threadId"]
-                if tid in self.thread_stop_reasons:
-                    del self.thread_stop_reasons[tid]
-                self._process_continued(all_threads_continued)
-            elif event == "stopped":
-                # Each thread that stops with a reason will send a
-                # 'stopped' event. We need to remember the thread stop
-                # reasons since the 'threads' command doesn't return
-                # that information.
-                self._process_stopped()
-                tid = body["threadId"]
-                self.thread_stop_reasons[tid] = body
-            elif event.startswith("progress"):
-                # Progress events come in as 'progressStart', 'progressUpdate',
-                # and 'progressEnd' events. Keep these around in case test
-                # cases want to verify them.
-                self.progress_events.append(packet)
-            elif event == "breakpoint":
-                # Breakpoint events are sent when a breakpoint is resolved
-                self._update_verified_breakpoints([body["breakpoint"]])
-            elif event == "capabilities":
-                # Update the capabilities with new ones from the event.
-                self.capabilities.update(body["capabilities"])
-
-        elif packet_type == "response":
-            if packet["command"] == "disconnect":
-                keepGoing = False
-        self._enqueue_recv_packet(packet)
-        return keepGoing
+        with self._recv_condition:
+            self._recv_packets.append(packet)
+            self._recv_condition.notify()
+            # packet is None on EOF
+            return packet is not None and not (
+                packet["type"] == "response" and packet["command"] == "disconnect"
+            )
+
+    def _recv_packet(
+        self,
+        *,
+        predicate: Optional[Callable[[ProtocolMessage], bool]] = None,
+        timeout: Optional[float] = None,
+    ) -> Optional[ProtocolMessage]:
+        """Processes received packets from the adapter.
+
+        Updates the DebugCommunication stateful properties based on the received
+        packets in the order they are received.
+
+        NOTE: The only time the session state properties should be updated is
+        during this call to ensure consistency during tests.
+
+        Args:
+            predicate:
+                Optional, if specified, returns the first packet that matches
+                the given predicate.
+            timeout:
+                Optional, if specified, processes packets until either the
+                timeout occurs or the predicate matches a packet, whichever
+                occurs first.
+
+        Returns:
+            The first matching packet for the given predicate, if specified,
+            otherwise None.
+        """
+        assert (
+            threading.current_thread != self._recv_thread
+        ), "Must not be called from the _recv_thread"
+
+        def process_until_match():
+            self._process_recv_packets()
+            for i, packet in enumerate(self._pending_packets):
+                if packet is None:
+                    # We need to return a truthy value to break out of the
+                    # wait_for, use `EOFError` as an indicator of EOF.
+                    return EOFError()
+                if predicate and predicate(packet):
+                    self._pending_packets.pop(i)
+                    return packet
+
+        with self._recv_condition:
+            packet = self._recv_condition.wait_for(process_until_match, timeout)
+            return None if isinstance(packet, EOFError) else packet
+
+    def _process_recv_packets(self) -> None:
+        """Process received packets, updating the session state."""
+        with self._recv_condition:
+            for packet in self._recv_packets:
+                # Handle events that may modify any stateful properties of
+                # the DAP session.
+                if packet and packet["type"] == "event":
+                    self._handle_event(packet)
+                elif packet and packet["type"] == "request":
+                    # Handle reverse requests and keep processing.
+                    self._handle_reverse_request(packet)
+                # Move the packet to the pending queue.
+                self._pending_packets.append(packet)
+            self._recv_packets.clear()
+
+    def _handle_event(self, packet: Event) -> None:
+        """Handle any events that modify debug session state we track."""
+        event = packet["event"]
+        body: Optional[Dict] = packet.get("body", None)
+
+        if event == "output" and body:
+            # Store any output we receive so clients can retrieve it later.
+            category = body["category"]
+            output = body["output"]
+            if category in self.output:
+                self.output[category] += output
+            else:
+                self.output[category] = output
+        elif event == "initialized":
+            self.initialized = True
+        elif event == "process":
+            # When a new process is attached or launched, remember the
+            # details that are available in the body of the event
+            self.process_event_body = body
+        elif event == "exited" and body:
+            # Process exited, mark the status to indicate the process is not
+            # alive.
+            self.exit_status = body["exitCode"]
+        elif event == "continued" and body:
+            # When the process continues, clear the known threads and
+            # thread_stop_reasons.
+            all_threads_continued = body.get("allThreadsContinued", True)
+            tid = body["threadId"]
+            if tid in self.thread_stop_reasons:
+                del self.thread_stop_reasons[tid]
+            self._process_continued(all_threads_continued)
+        elif event == "stopped" and body:
+            # Each thread that stops with a reason will send a
+            # 'stopped' event. We need to remember the thread stop
+            # reasons since the 'threads' command doesn't return
+            # that information.
+            self._process_stopped()
+            tid = body["threadId"]
+            self.thread_stop_reasons[tid] = body
+        elif event.startswith("progress"):
+            # Progress events come in as 'progressStart', 'progressUpdate',
+            # and 'progressEnd' events. Keep these around in case test
+            # cases want to verify them.
+            self.progress_events.append(packet)
+        elif event == "breakpoint" and body:
+            # Breakpoint events are sent when a breakpoint is resolved
+            self._update_verified_breakpoints([body["breakpoint"]])
+        elif event == "capabilities" and body:
+            if self.capabilities is None:
+                self.capabilities = {}
+            # Update the capabilities with new ones from the event.
+            self.capabilities.update(body["capabilities"])
+
+    def _handle_reverse_request(self, request: Request) -> None:
+        if request in self.reverse_requests:
+            return
+        self.reverse_requests.append(request)
+        arguments = request.get("arguments")
+        if request["command"] == "runInTerminal" and arguments is not None:
+            in_shell = arguments.get("argsCanBeInterpretedByShell", False)
+            proc = subprocess.Popen(
+                arguments["args"],
+                env=arguments.get("env", {}),
+                cwd=arguments["cwd"],
+                stdin=subprocess.DEVNULL,
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                shell=in_shell,
+            )
+            body = {}
+            if in_shell:
+                body["shellProcessId"] = proc.pid
+            else:
+                body["processId"] = proc.pid
+            self.send_packet(
+                {
+                    "type": "response",
+                    "seq": 0,
+                    "request_seq": request["seq"],
+                    "success": True,
+                    "command": "runInTerminal",
+                    "message": None,
+                    "body": body,
+                }
+            )
+        elif request["command"] == "startDebugging":
+            self.send_packet(
+                {
+                    "type": "response",
+                    "seq": 0,
+                    "request_seq": request["seq"],
+                    "success": True,
+                    "message": None,
+                    "command": "startDebugging",
+                    "body": {},
+                }
+            )
+        else:
+            desc = 'unknown reverse request "%s"' % (request["command"])
+            raise ValueError(desc)
 
     def _process_continued(self, all_threads_continued: bool):
         self.frame_scopes = {}
         if all_threads_continued:
             self.thread_stop_reasons = {}
 
-    def _update_verified_breakpoints(self, breakpoints: list[Event]):
-        for breakpoint in breakpoints:
-            if "id" in breakpoint:
-                self.resolved_breakpoints[str(breakpoint["id"])] = breakpoint.get(
-                    "verified", False
-                )
+    def _update_verified_breakpoints(self, breakpoints: list[Breakpoint]):
+        for bp in breakpoints:
+            # If no id is set, we cannot correlate the given breakpoint across
+            # requests, ignore it.
+            if "id" not in bp:
+                continue
+
+            self.resolved_breakpoints[str(bp["id"])] = bp.get("verified", False)
 
-    def send_packet(self, command_dict: Request, set_sequence=True):
+    def _send_recv(self, request: Request[Ta]) -> Optional[Response[Tb]]:
+        """Send a command python dictionary as JSON and receive the JSON
+        response. Validates that the response is the correct sequence and
+        command in the reply. Any events that are received are added to the
+        events list in this object"""
+        seq = self.send_packet(request)
+        response = self.receive_response(seq)
+        if response is None:
+            raise ValueError(f"no response for {request!r}")
+        self.validate_response(request, response)
+        return response
+
+    def send_packet(self, packet: ProtocolMessage) -> int:
         """Take the "command_dict" python dictionary and encode it as a JSON
         string and send the contents as a packet to the VSCode debug
-        adapter"""
-        # Set the sequence ID for this command automatically
-        if set_sequence:
-            command_dict["seq"] = self.sequence
+        adapter.
+
+        Returns the seq of the packet."""
+        # Set the seq for requests.
+        if packet["type"] == "request":
+            packet["seq"] = self.sequence
             self.sequence += 1
+        else:
+            packet["seq"] = 0
+
         # Encode our command dictionary as a JSON string
-        json_str = json.dumps(command_dict, separators=(",", ":"))
+        json_str = json.dumps(packet, separators=(",", ":"))
+
         if self.trace_file:
             self.trace_file.write("to adapter:\n%s\n" % (json_str))
+
         length = len(json_str)
         if length > 0:
             # Send the encoded JSON packet and flush the 'send' file
             self.send.write(self.encode_content(json_str))
             self.send.flush()
 
-    def recv_packet(
-        self,
-        filter_type: Optional[str] = None,
-        filter_event: Optional[Union[str, list[str]]] = None,
-        timeout: Optional[float] = None,
-    ) -> Optional[ProtocolMessage]:
-        """Get a JSON packet from the VSCode debug adapter. This function
-        assumes a thread that reads packets is running and will deliver
-        any received packets by calling handle_recv_packet(...). This
-        function will wait for the packet to arrive and return it when
-        it does."""
-        while True:
-            try:
-                self.recv_condition.acquire()
-                packet = None
-                while True:
-                    for i, curr_packet in enumerate(self.recv_packets):
-                        if not curr_packet:
-                            raise EOFError
-                        packet_type = curr_packet["type"]
-                        if filter_type is None or packet_type in filter_type:
-                            if filter_event is None or (
-                                packet_type == "event"
-                                and curr_packet["event"] in filter_event
-                            ):
-                                packet = self.recv_packets.pop(i)
-                                break
-                    if packet:
-                        break
-                    # Sleep until packet is received
-                    len_before = len(self.recv_packets)
-                    self.recv_condition.wait(timeout)
-                    len_after = len(self.recv_packets)
-                    if len_before == len_after:
-                        return None  # Timed out
-                return packet
-            except EOFError:
-                return None
-            finally:
-                self.recv_condition.release()
-
-    def send_recv(self, command):
-        """Send a command python dictionary as JSON and receive the JSON
-        response. Validates that the response is the correct sequence and
-        command in the reply. Any events that are received are added to the
-        events list in this object"""
-        self.send_packet(command)
-        done = False
-        while not done:
-            response_or_request = self.recv_packet(filter_type=["response", "request"])
-            if response_or_request is None:
-                desc = 'no response for "%s"' % (command["command"])
-                raise ValueError(desc)
-            if response_or_request["type"] == "response":
-                self.validate_response(command, response_or_request)
-                return response_or_request
-            else:
-                self.reverse_requests.append(response_or_request)
-                if response_or_request["command"] == "runInTerminal":
-                    subprocess.Popen(
-                        response_or_request["arguments"]["args"],
-                        env=response_or_request["arguments"]["env"],
-                    )
-                    self.send_packet(
-                        {
-                            "type": "response",
-                            "request_seq": response_or_request["seq"],
-                            "success": True,
-                            "command": "runInTerminal",
-                            "body": {},
-                        },
-                    )
-                elif response_or_request["command"] == "startDebugging":
-                    self.send_packet(
-                        {
-                            "type": "response",
-                            "request_seq": response_or_request["seq"],
-                            "success": True,
-                            "command": "startDebugging",
-                            "body": {},
-                        },
-                    )
-                else:
-                    desc = 'unknown reverse request "%s"' % (
-                        response_or_request["command"]
-                    )
-                    raise ValueError(desc)
+        return packet["seq"]
 
-        return None
+    def receive_response(self, seq: int) -> Optional[Response]:
+        """Waits for the a response with the associated request_sec."""
+
+        def predicate(p: ProtocolMessage):
+            return p["type"] == "response" and p["request_seq"] == seq
+
+        return cast(Optional[Response], self._recv_packet(predicate=predicate))
+
+    def get_modules(self):
+        modules = {}
+        resp = self.request_modules()
+        if resp["success"]:
+            module_list = resp["body"]["modules"]
+            for module in module_list:
+                modules[module["name"]] = module
+        else:
+            raise ValueError(f"request_modules failed: {resp!r}")
+        return modules
+
+    def get_output(self, category: str, clear=True) -> str:
+        output = ""
+        if category in self.output:
+            output = self.output.get(category, "")
+            if clear:
+                del self.output[category]
+        return output
+
+    def collect_output(
+        self,
+        category: str,
+        timeout_secs: float,
+        pattern: Optional[str] = None,
+        clear=True,
+    ) -> str:
+        """Collect output from 'output' events.
+
+        Args:
+            category: The category to collect.
+            timeout_secs: The max duration for collecting output.
+            pattern:
+                Optional, if set, return once this pattern is detected in the
+                collected output.
+
+        Returns:
+            The collected output.
+        """
+        deadline = time.monotonic() + timeout_secs
+        output = self.get_output(category, clear)
+        while deadline >= time.monotonic() and (
+            pattern is None or pattern not in output
+        ):
+            event = self.wait_for_event(["output"], timeout=deadline - time.monotonic())
+            if not event:  # Timeout or EOF
+                break
+            output += self.get_output(category, clear=clear)
+        return output
 
     def wait_for_event(
-        self, filter: Union[str, list[str]], timeout: Optional[float] = None
+        self, filter: List[str] = [], timeout: Optional[float] = None
     ) -> Optional[Event]:
         """Wait for the first event that matches the filter."""
-        return self.recv_packet(
-            filter_type="event", filter_event=filter, timeout=timeout
+
+        def predicate(p: ProtocolMessage):
+            return p["type"] == "event" and p["event"] in filter
+
+        return cast(
+            Optional[Event], self._recv_packet(predicate=predicate, timeout=timeout)
         )
 
     def wait_for_stopped(
         self, timeout: Optional[float] = None
-    ) -> Optional[list[Event]]:
+    ) -> Optional[List[Event]]:
         stopped_events = []
         stopped_event = self.wait_for_event(
             filter=["stopped", "exited"], timeout=timeout
@@ -463,9 +632,9 @@ def wait_for_stopped(
         return stopped_events
 
     def wait_for_breakpoint_events(self, timeout: Optional[float] = None):
-        breakpoint_events: list[Event] = []
+        breakpoint_events: List[Event] = []
         while True:
-            event = self.wait_for_event("breakpoint", timeout=timeout)
+            event = self.wait_for_event(["breakpoint"], timeout=timeout)
             if not event:
                 break
             breakpoint_events.append(event)
@@ -476,20 +645,26 @@ def wait_for_breakpoints_to_be_verified(
     ):
         """Wait for all breakpoints to be verified. Return all unverified breakpoints."""
         while any(id not in self.resolved_breakpoints for id in breakpoint_ids):
-            breakpoint_event = self.wait_for_event("breakpoint", timeout=timeout)
+            breakpoint_event = self.wait_for_event(["breakpoint"], timeout=timeout)
             if breakpoint_event is None:
                 break
 
-        return [id for id in breakpoint_ids if id not in self.resolved_breakpoints]
+        return [
+            id
+            for id in breakpoint_ids
+            if id not in self.resolved_breakpoints and not self.resolved_breakpoints[id]
+        ]
 
     def wait_for_exited(self, timeout: Optional[float] = None):
-        event_dict = self.wait_for_event("exited", timeout=timeout)
+        event_dict = self.wait_for_event(["exited"], timeout=timeout)
         if event_dict is None:
             raise ValueError("didn't get exited event")
         return event_dict
 
     def wait_for_terminated(self, timeout: Optional[float] = None):
-        event_dict = self.wait_for_event("terminated", timeout)
+        if self.terminated:
+            raise ValueError("already terminated")
+        event_dict = self.wait_for_event(["terminated"], timeout)
         if event_dict is None:
             raise ValueError("didn't get terminated event")
         return event_dict
@@ -524,12 +699,10 @@ def get_stackFrame(self, frameIndex=0, threadId=None):
         if threadId is None:
             threadId = self.get_thread_id()
         if threadId is None:
-            print("invalid threadId")
             return None
         response = self.request_stackTrace(threadId, startFrame=frameIndex, levels=1)
         if response:
             return response["body"]["stackFrames"][0]
-        print("invalid response")
         return None
 
     def get_completions(self, text, frameId=None):
@@ -667,7 +840,7 @@ def request_attach(
         gdbRemotePort: Optional[int] = None,
         gdbRemoteHostname: Optional[str] = None,
     ):
-        args_dict = {}
+        args_dict: AttachArguments = {}
         if pid is not None:
             args_dict["pid"] = pid
         if program is not None:
@@ -699,8 +872,12 @@ def request_attach(
             args_dict["gdb-remote-port"] = gdbRemotePort
         if gdbRemoteHostname is not None:
             args_dict["gdb-remote-hostname"] = gdbRemoteHostname
-        command_dict = {"command": "attach", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        command_dict: Request = {
+            "command": "attach",
+            "type": "request",
+            "arguments": args_dict,
+        }
+        return self._send_recv(command_dict)
 
     def request_breakpointLocations(
         self, file_path, line, end_line=None, column=None, end_column=None
@@ -722,7 +899,7 @@ def request_breakpointLocations(
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_configurationDone(self):
         command_dict = {
@@ -730,7 +907,7 @@ def request_configurationDone(self):
             "type": "request",
             "arguments": {},
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if response:
             self.configuration_done_sent = True
             self.request_threads()
@@ -759,7 +936,7 @@ def request_continue(self, threadId=None, singleThread=False):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if response["success"]:
             self._process_continued(response["body"]["allThreadsContinued"])
         # Caller must still call wait_for_stopped.
@@ -776,7 +953,7 @@ def request_restart(self, restartArguments=None):
         if restartArguments:
             command_dict["arguments"] = restartArguments
 
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         # Caller must still call wait_for_stopped.
         return response
 
@@ -792,7 +969,7 @@ def request_disconnect(self, terminateDebuggee=None):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_disassemble(
         self,
@@ -812,7 +989,7 @@ def request_disassemble(
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)["body"]["instructions"]
+        return self._send_recv(command_dict)["body"]["instructions"]
 
     def request_readMemory(self, memoryReference, offset, count):
         args_dict = {
@@ -825,7 +1002,7 @@ def request_readMemory(self, memoryReference, offset, count):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None):
         stackFrame = self.get_stackFrame(frameIndex=frameIndex, threadId=threadId)
@@ -841,7 +1018,7 @@ def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_exceptionInfo(self, threadId=None):
         if threadId is None:
@@ -852,7 +1029,7 @@ def request_exceptionInfo(self, threadId=None):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_initialize(self, sourceInitFile=False):
         command_dict = {
@@ -873,7 +1050,7 @@ def request_initialize(self, sourceInitFile=False):
                 "$__lldb_sourceInitFile": sourceInitFile,
             },
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if response:
             if "body" in response:
                 self.capabilities = response["body"]
@@ -908,7 +1085,7 @@ def request_launch(
         customFrameFormat: Optional[str] = None,
         customThreadFormat: Optional[str] = None,
     ):
-        args_dict = {"program": program}
+        args_dict: LaunchArguments = {"program": program}
         if args:
             args_dict["args"] = args
         if cwd:
@@ -955,15 +1132,19 @@ def request_launch(
         args_dict["displayExtendedBacktrace"] = displayExtendedBacktrace
         if commandEscapePrefix is not None:
             args_dict["commandEscapePrefix"] = commandEscapePrefix
-        command_dict = {"command": "launch", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        command_dict: Request = {
+            "command": "launch",
+            "type": "request",
+            "arguments": args_dict,
+        }
+        return self._send_recv(command_dict)
 
     def request_next(self, threadId, granularity="statement"):
         if self.exit_status is not None:
             raise ValueError("request_continue called after process exited")
         args_dict = {"threadId": threadId, "granularity": granularity}
         command_dict = {"command": "next", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_stepIn(self, threadId, targetId, granularity="statement"):
         if self.exit_status is not None:
@@ -976,7 +1157,7 @@ def request_stepIn(self, threadId, targetId, granularity="statement"):
             "granularity": granularity,
         }
         command_dict = {"command": "stepIn", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_stepInTargets(self, frameId):
         if self.exit_status is not None:
@@ -988,14 +1169,14 @@ def request_stepInTargets(self, frameId):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_stepOut(self, threadId):
         if self.exit_status is not None:
             raise ValueError("request_stepOut called after process exited")
         args_dict = {"threadId": threadId}
         command_dict = {"command": "stepOut", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_pause(self, threadId=None):
         if self.exit_status is not None:
@@ -1004,49 +1185,47 @@ def request_pause(self, threadId=None):
             threadId = self.get_thread_id()
         args_dict = {"threadId": threadId}
         command_dict = {"command": "pause", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_scopes(self, frameId):
         args_dict = {"frameId": frameId}
         command_dict = {"command": "scopes", "type": "request", "arguments": args_dict}
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
-    def request_setBreakpoints(self, source: Source, line_array, data=None):
+    def request_setBreakpoints(
+        self,
+        source: Union[Source, str],
+        line_array: Optional[List[int]],
+        data: Optional[List[BreakpointData]] = None,
+    ):
         """data is array of parameters for breakpoints in line_array.
         Each parameter object is 1:1 mapping with entries in line_entry.
         It contains optional location/hitCondition/logMessage parameters.
         """
+        if isinstance(source, str):
+            source = Source(path=source)
         args_dict = {
-            "source": source.as_dict(),
+            "source": source.to_DAP(),
             "sourceModified": False,
         }
-        if line_array is not None:
+        if line_array:
             args_dict["lines"] = line_array
             breakpoints = []
             for i, line in enumerate(line_array):
-                breakpoint_data = None
+                breakpoint_data: BreakpointData = {}
                 if data is not None and i < len(data):
                     breakpoint_data = data[i]
-                bp = {"line": line}
-                if breakpoint_data is not None:
-                    if breakpoint_data.get("condition"):
-                        bp["condition"] = breakpoint_data["condition"]
-                    if breakpoint_data.get("hitCondition"):
-                        bp["hitCondition"] = breakpoint_data["hitCondition"]
-                    if breakpoint_data.get("logMessage"):
-                        bp["logMessage"] = breakpoint_data["logMessage"]
-                    if breakpoint_data.get("column"):
-                        bp["column"] = breakpoint_data["column"]
+                bp: SourceBreakpoint = {"line": line, **breakpoint_data}
                 breakpoints.append(bp)
             args_dict["breakpoints"] = breakpoints
 
-        command_dict = {
+        command_dict: Request = {
             "command": "setBreakpoints",
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
-        if response["success"]:
+        response = self._send_recv(command_dict)
+        if response and response["success"] and response["body"]:
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
 
@@ -1061,7 +1240,7 @@ def request_setExceptionBreakpoints(
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_setFunctionBreakpoints(self, names, condition=None, hitCondition=None):
         breakpoints = []
@@ -1078,7 +1257,7 @@ def request_setFunctionBreakpoints(self, names, condition=None, hitCondition=Non
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if response["success"]:
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
@@ -1099,7 +1278,7 @@ def request_dataBreakpointInfo(
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_setDataBreakpoint(self, dataBreakpoints):
         """dataBreakpoints is a list of dictionary with following fields:
@@ -1116,7 +1295,7 @@ def request_setDataBreakpoint(self, dataBreakpoints):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_compileUnits(self, moduleId):
         args_dict = {"moduleId": moduleId}
@@ -1125,7 +1304,7 @@ def request_compileUnits(self, moduleId):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         return response
 
     def request_completions(self, text, frameId=None):
@@ -1137,10 +1316,10 @@ def request_completions(self, text, frameId=None):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_modules(self):
-        return self.send_recv({"command": "modules", "type": "request"})
+        return self._send_recv({"command": "modules", "type": "request"})
 
     def request_stackTrace(
         self, threadId=None, startFrame=None, levels=None, format=None, dump=False
@@ -1159,7 +1338,7 @@ def request_stackTrace(
             "type": "request",
             "arguments": args_dict,
         }
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if dump:
             for idx, frame in enumerate(response["body"]["stackFrames"]):
                 name = frame["name"]
@@ -1185,7 +1364,7 @@ def request_source(self, sourceReference):
                 "sourceReference": sourceReference,
             },
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_threads(self):
         """Request a list of all threads and combine any information from any
@@ -1193,7 +1372,7 @@ def request_threads(self):
         thread actually stopped. Returns an array of thread dictionaries
         with information about all threads"""
         command_dict = {"command": "threads", "type": "request", "arguments": {}}
-        response = self.send_recv(command_dict)
+        response = self._send_recv(command_dict)
         if not response["success"]:
             self.threads = None
             return response
@@ -1233,7 +1412,7 @@ def request_variables(
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_setVariable(self, containingVarRef, name, value, id=None):
         args_dict = {
@@ -1248,7 +1427,7 @@ def request_setVariable(self, containingVarRef, name, value, id=None):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_locations(self, locationReference):
         args_dict = {
@@ -1259,7 +1438,7 @@ def request_locations(self, locationReference):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def request_testGetTargetBreakpoints(self):
         """A request packet used in the LLDB test suite to get all currently
@@ -1271,12 +1450,12 @@ def request_testGetTargetBreakpoints(self):
             "type": "request",
             "arguments": {},
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
     def terminate(self):
         self.send.close()
-        if self.recv_thread.is_alive():
-            self.recv_thread.join()
+        if self._recv_thread.is_alive():
+            self._recv_thread.join()
 
     def request_setInstructionBreakpoints(self, memory_reference=[]):
         breakpoints = []
@@ -1291,7 +1470,7 @@ def request_setInstructionBreakpoints(self, memory_reference=[]):
             "type": "request",
             "arguments": args_dict,
         }
-        return self.send_recv(command_dict)
+        return self._send_recv(command_dict)
 
 
 class DebugAdapterServer(DebugCommunication):
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index 3b54d598c3509..8778b51e7c360 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -1,6 +1,6 @@
 import os
 import time
-from typing import Optional
+from typing import Optional, Callable
 import uuid
 
 import dap_server
@@ -121,11 +121,19 @@ def wait_for_breakpoints_to_resolve(
             f"Expected to resolve all breakpoints. Unresolved breakpoint ids: {unresolved_breakpoints}",
         )
 
-    def waitUntil(self, condition_callback):
-        for _ in range(20):
-            if condition_callback():
+    def wait_until(
+        self,
+        predicate: Callable[[], bool],
+        delay: float = 0.5,
+        timeout: float = DEFAULT_TIMEOUT,
+    ) -> bool:
+        """Repeatedly run the predicate until either the predicate returns True
+        or a timeout has occurred."""
+        deadline = time.monotonic() + timeout
+        while deadline > time.monotonic():
+            if predicate():
                 return True
-            time.sleep(0.5)
+            time.sleep(delay)
         return False
 
     def assertCapabilityIsSet(self, key: str, msg: Optional[str] = None) -> None:
@@ -144,6 +152,7 @@ def verify_breakpoint_hit(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
         "breakpoint_ids" should be a list of breakpoint ID strings
         (["1", "2"]). The return value from self.set_source_breakpoints()
         or self.set_function_breakpoints() can be passed to this function"""
+        breakpoint_ids = [str(i) for i in breakpoint_ids]
         stopped_events = self.dap_server.wait_for_stopped(timeout)
         for stopped_event in stopped_events:
             if "body" in stopped_event:
@@ -155,22 +164,16 @@ def verify_breakpoint_hit(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
                     and body["reason"] != "instruction breakpoint"
                 ):
                     continue
-                if "description" not in body:
+                if "hitBreakpointIds" not in body:
                     continue
-                # Descriptions for breakpoints will be in the form
-                # "breakpoint 1.1", so look for any description that matches
-                # ("breakpoint 1.") in the description field as verification
-                # that one of the breakpoint locations was hit. DAP doesn't
-                # allow breakpoints to have multiple locations, but LLDB does.
-                # So when looking at the description we just want to make sure
-                # the right breakpoint matches and not worry about the actual
-                # location.
-                description = body["description"]
-                for breakpoint_id in breakpoint_ids:
-                    match_desc = f"breakpoint {breakpoint_id}."
-                    if match_desc in description:
+                hit_breakpoint_ids = body["hitBreakpointIds"]
+                for bp in hit_breakpoint_ids:
+                    if str(bp) in breakpoint_ids:
                         return
-        self.assertTrue(False, f"breakpoint not hit, stopped_events={stopped_events}")
+        self.assertTrue(
+            False,
+            f"breakpoint not hit, wanted breakpoint_ids={breakpoint_ids} stopped_events={stopped_events}",
+        )
 
     def verify_stop_exception_info(self, expected_description, timeout=DEFAULT_TIMEOUT):
         """Wait for the process we are debugging to stop, and verify the stop
@@ -205,7 +208,9 @@ def verify_commands(self, flavor, output, commands):
                     found = True
                     break
             self.assertTrue(
-                found, "verify '%s' found in console output for '%s'" % (cmd, flavor)
+                found,
+                "verify '%s' found in console output for '%s' in %s"
+                % (cmd, flavor, output),
             )
 
     def get_dict_value(self, d, key_path):
@@ -277,26 +282,30 @@ def get_source_and_line(self, threadId=None, frameIndex=0):
                         return (source["path"], stackFrame["line"])
         return ("", 0)
 
-    def get_stdout(self, timeout=0.0):
-        return self.dap_server.get_output("stdout", timeout=timeout)
+    def get_stdout(self):
+        return self.dap_server.get_output("stdout")
 
-    def get_console(self, timeout=0.0):
-        return self.dap_server.get_output("console", timeout=timeout)
+    def get_console(self):
+        return self.dap_server.get_output("console")
 
-    def get_important(self, timeout=0.0):
-        return self.dap_server.get_output("important", timeout=timeout)
+    def get_important(self):
+        return self.dap_server.get_output("important")
 
-    def collect_stdout(self, timeout_secs, pattern=None):
+    def collect_stdout(self, timeout_secs: float, pattern: Optional[str] = None) -> str:
         return self.dap_server.collect_output(
             "stdout", timeout_secs=timeout_secs, pattern=pattern
         )
 
-    def collect_console(self, timeout_secs, pattern=None):
+    def collect_console(
+        self, timeout_secs: float, pattern: Optional[str] = None
+    ) -> str:
         return self.dap_server.collect_output(
             "console", timeout_secs=timeout_secs, pattern=pattern
         )
 
-    def collect_important(self, timeout_secs, pattern=None):
+    def collect_important(
+        self, timeout_secs: float, pattern: Optional[str] = None
+    ) -> str:
         return self.dap_server.collect_output(
             "important", timeout_secs=timeout_secs, pattern=pattern
         )
@@ -355,7 +364,7 @@ def stepOut(self, threadId=None, waitForStop=True, timeout=DEFAULT_TIMEOUT):
             return self.dap_server.wait_for_stopped(timeout)
         return None
 
-    def do_continue(self):  # `continue` is a keyword.
+    def do_continue(self) -> None:  # `continue` is a keyword.
         resp = self.dap_server.request_continue()
         self.assertTrue(resp["success"], f"continue request failed: {resp}")
 
@@ -363,10 +372,14 @@ def continue_to_next_stop(self, timeout=DEFAULT_TIMEOUT):
         self.do_continue()
         return self.dap_server.wait_for_stopped(timeout)
 
-    def continue_to_breakpoint(self, breakpoint_id: str, timeout=DEFAULT_TIMEOUT):
-        self.continue_to_breakpoints((breakpoint_id), timeout)
+    def continue_to_breakpoint(
+        self, breakpoint_id: int, timeout: Optional[float] = DEFAULT_TIMEOUT
+    ) -> None:
+        self.continue_to_breakpoints([breakpoint_id], timeout)
 
-    def continue_to_breakpoints(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
+    def continue_to_breakpoints(
+        self, breakpoint_ids: list[int], timeout: Optional[float] = DEFAULT_TIMEOUT
+    ) -> None:
         self.do_continue()
         self.verify_breakpoint_hit(breakpoint_ids, timeout)
 
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
index 831edd6494c1e..a6eeee3a02543 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
@@ -78,7 +78,7 @@ def test_source_map(self):
         self.assertFalse(breakpoint["verified"])
         self.assertEqual(other_basename, breakpoint["source"]["name"])
         self.assertEqual(new_other_path, breakpoint["source"]["path"])
-        other_breakpoint_id = breakpoint["id"]
+        other_breakpoint_id = str(breakpoint["id"])
 
         self.dap_server.request_continue()
         self.verify_breakpoint_hit([other_breakpoint_id])
@@ -379,7 +379,8 @@ def test_column_breakpoints(self):
             self.assertEqual(breakpoint["line"], loop_line)
             self.assertEqual(breakpoint["column"], columns[index])
             self.assertTrue(breakpoint["verified"], "expect breakpoint verified")
-            breakpoint_ids.append(breakpoint["id"])
+            self.assertIn("id", breakpoint, "expected breakpoint id")
+            breakpoint_ids.append(str(breakpoint["id"]))
 
         # Continue to the first breakpoint,
         self.continue_to_breakpoints([breakpoint_ids[0]])
diff --git a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
index 824ed8fe3bb97..c750cff071a80 100644
--- a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
+++ b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
@@ -54,18 +54,18 @@ def test_pending_request(self):
         pending_seq = self.async_blocking_request(duration=self.DEFAULT_TIMEOUT / 2)
         cancel_seq = self.async_cancel(requestId=pending_seq)
 
-        blocking_resp = self.dap_server.recv_packet(filter_type=["response"])
+        blocking_resp = self.dap_server.receive_response(blocking_seq)
         self.assertEqual(blocking_resp["request_seq"], blocking_seq)
         self.assertEqual(blocking_resp["command"], "evaluate")
         self.assertEqual(blocking_resp["success"], True)
 
-        pending_resp = self.dap_server.recv_packet(filter_type=["response"])
+        pending_resp = self.dap_server.receive_response(pending_seq)
         self.assertEqual(pending_resp["request_seq"], pending_seq)
         self.assertEqual(pending_resp["command"], "evaluate")
         self.assertEqual(pending_resp["success"], False)
         self.assertEqual(pending_resp["message"], "cancelled")
 
-        cancel_resp = self.dap_server.recv_packet(filter_type=["response"])
+        cancel_resp = self.dap_server.receive_response(cancel_seq)
         self.assertEqual(cancel_resp["request_seq"], cancel_seq)
         self.assertEqual(cancel_resp["command"], "cancel")
         self.assertEqual(cancel_resp["success"], True)
@@ -86,13 +86,13 @@ def test_inflight_request(self):
         )
         cancel_seq = self.async_cancel(requestId=blocking_seq)
 
-        blocking_resp = self.dap_server.recv_packet(filter_type=["response"])
+        blocking_resp = self.dap_server.receive_response(blocking_seq)
         self.assertEqual(blocking_resp["request_seq"], blocking_seq)
         self.assertEqual(blocking_resp["command"], "evaluate")
         self.assertEqual(blocking_resp["success"], False)
         self.assertEqual(blocking_resp["message"], "cancelled")
 
-        cancel_resp = self.dap_server.recv_packet(filter_type=["response"])
+        cancel_resp = self.dap_server.receive_response(cancel_seq)
         self.assertEqual(cancel_resp["request_seq"], cancel_seq)
         self.assertEqual(cancel_resp["command"], "cancel")
         self.assertEqual(cancel_resp["success"], True)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index ae8142ae4f484..c29e0d3fa7b81 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -191,7 +191,7 @@ def test_disableSTDIO(self):
         self.continue_to_exit()
         # Now get the STDOUT and verify our program argument is correct
         output = self.get_stdout()
-        self.assertEqual(output, None, "expect no program output")
+        self.assertEqual(output, "", "expect no program output")
 
     @skipIfWindows
     @skipIfLinux  # shell argument expansion doesn't seem to work on Linux
@@ -392,14 +392,14 @@ def test_commands(self):
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the first breakpoint was hit
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue again and hit the second breakpoint.
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the second breakpoint was hit
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
@@ -461,21 +461,21 @@ def test_extra_launch_commands(self):
         self.verify_commands("launchCommands", output, launchCommands)
         # Verify the "stopCommands" here
         self.continue_to_next_stop()
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue and hit the second breakpoint.
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the first breakpoint was hit
         self.continue_to_next_stop()
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
         self.continue_to_exit()
         # Get output from the console. This should contain both the
         # "exitCommands" that were run after the second breakpoint was hit
-        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        output = self.get_console()
         self.verify_commands("exitCommands", output, exitCommands)
 
     def test_failing_launch_commands(self):
diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
index 4fc221668a8ee..b1823e4c8b1c3 100644
--- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
+++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
@@ -54,7 +54,7 @@ def checkSymbolsLoadedWithSize():
             return symbol_regex.match(program_module["symbolStatus"])
 
         if expect_debug_info_size:
-            self.waitUntil(checkSymbolsLoadedWithSize)
+            self.wait_until(checkSymbolsLoadedWithSize)
         active_modules = self.dap_server.get_modules()
         program_module = active_modules[program_basename]
         self.assertEqual(program_basename, program_module["name"])
diff --git a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
index 0425b55a5e552..4fcde623e3829 100644
--- a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
+++ b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
@@ -37,14 +37,14 @@ def test_output(self):
         # Disconnecting from the server to ensure any pending IO is flushed.
         self.dap_server.request_disconnect()
 
-        output += self.get_stdout(timeout=self.DEFAULT_TIMEOUT)
+        output += self.get_stdout()
         self.assertTrue(output and len(output) > 0, "expect program stdout")
         self.assertIn(
             "abcdefghi\r\nhello world\r\nfinally\0\0",
             output,
             "full stdout not found in: " + repr(output),
         )
-        console = self.get_console(timeout=self.DEFAULT_TIMEOUT)
+        console = self.get_console()
         self.assertTrue(console and len(console) > 0, "expect dap messages")
         self.assertIn(
             "out\0\0\r\nerr\0\0\r\n", console, f"full console message not found"

From 3f33c8482fc0b8dd0d2596262ebd0ed73d41665d Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Tue, 17 Jun 2025 15:27:41 -0700
Subject: [PATCH 813/851] [clang] Add release note for int->enum conversion
 change. (#144407)

This seems to be having some practical impact, so we should let people
know.
---
 clang/docs/ReleaseNotes.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6f28dbd03ca2a..12816eed2e8b5 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -65,8 +65,10 @@ C++ Specific Potentially Breaking Changes
   standard library already have their own bespoke builtins.
 - A workaround for libstdc++4.7 has been removed. Note that 4.8.3 remains the oldest
   supported libstdc++ version.
-
 - Added ``!nonnull/!align`` metadata to load of references for better codegen.
+- Checking for int->enum conversions in constant expressions is more strict;
+  in particular, ``const E x = (E)-1;`` is not treated as a constant if it's
+  out of range. This impacts old versions of Boost.  (#GH143034)
 
 ABI Changes in This Version
 ---------------------------

From f25f2f7de4f8264d89ba3c4dc9daddb10a90c13f Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li@intel.com>
Date: Tue, 17 Jun 2025 15:46:35 -0700
Subject: [PATCH 814/851] [MLIR][XeGPU] Extend unrolling support for scatter
 ops with chunk_size (#144447)

Add support for load/store with chunk_size, which requires special
consideration for the operand blocking since offests and masks are
 n-D and tensor are n+1-D. Support operations including create_tdesc,
update_tdesc, load, store, and prefetch.

---------

Co-authored-by: Adam Siemieniuk <adam.siemieniuk@intel.com>
---
 .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp  | 176 +++++++++++----
 .../Dialect/XeGPU/xegpu-unroll-patterns.mlir  | 208 ++++++++++++------
 .../lib/Dialect/XeGPU/TestXeGPUTransforms.cpp |  49 +++--
 3 files changed, 312 insertions(+), 121 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
index 9c234c1e866b9..0457f8128b908 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp
@@ -402,30 +402,58 @@ struct UnrollCreateDescOp : public UnrollPattern<xegpu::CreateDescOp> {
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
     xegpu::TensorDescType tdescTy = op.getType();
+    TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
+    VectorType indiceVecTy = indiceVec.getType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (!tdescTy.isScattered())
       return failure();
 
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
     if (!targetShape)
       return failure();
 
-    auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
-
-    TypedValue<::mlir::VectorType> indiceVec = op.getOffsets();
-    VectorType indiceVecTy = indiceVec.getType();
+    SmallVector<int64_t> targetIndiceShape(*targetShape);
+    int64_t originalChunkSize = tdescTy.getChunkSize();
+    // IndiceVec is 1 dim lower than tdescTy when chunkSize is larger than 1.
+    if (originalChunkSize > 1)
+      targetIndiceShape.pop_back();
 
+    auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0];
     SmallVector<Type> convertedIndiceTypes =
-        getUnrolledTypes(indiceVecTy, *targetShape);
+        getUnrolledTypes(indiceVecTy, targetIndiceShape);
     SmallVector<Value> convertedIndiceVec =
-        pack(indiceVec, convertedIndiceTypes, *targetShape, loc, rewriter);
+        pack(indiceVec, convertedIndiceTypes, targetIndiceShape, loc, rewriter);
 
     SmallVector<Value> newOps;
-    for (auto indice : convertedIndiceVec) {
-      auto newOp = rewriter.create<xegpu::CreateDescOp>(loc, newTdescTy,
-                                                        op.getSource(), indice);
-      newOps.push_back(newOp);
+
+    // More indices is need when chunkSize > 1. Since a big load from one
+    // address could be break into multiple small loads.
+    if (originalChunkSize > 1) {
+      int64_t blockedChunkSize = targetShape->back();
+      int64_t numNewChunks = originalChunkSize / blockedChunkSize;
+
+      for (auto [indice, indiceType] :
+           llvm::zip(convertedIndiceVec, convertedIndiceTypes)) {
+        for (int64_t i = 0; i < numNewChunks; ++i) {
+          // Compute the offset
+          Value inc = rewriter.create<arith::ConstantIndexOp>(
+              loc, i * blockedChunkSize);
+          Value incVec = rewriter.create<vector::SplatOp>(loc, indiceType, inc);
+          Value offsetIndice =
+              rewriter.create<arith::AddIOp>(loc, indice, incVec);
+
+          auto newOp = rewriter.create<xegpu::CreateDescOp>(
+              loc, newTdescTy, op.getSource(), offsetIndice);
+
+          newOps.push_back(newOp);
+        }
+      }
+    } else {
+      for (auto indice : convertedIndiceVec) {
+        auto newOp = rewriter.create<xegpu::CreateDescOp>(
+            loc, newTdescTy, op.getSource(), indice);
+        newOps.push_back(newOp);
+      }
     }
 
     Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter);
@@ -444,16 +472,18 @@ struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
     VectorType valueTy = llvm::dyn_cast<VectorType>(op.getValue().getType());
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (!tdescTy.isScattered())
       return failure();
 
-    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
-
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
     if (!targetShape)
       return failure();
 
+    SmallVector<int64_t> targetMaskShape(*targetShape);
+    int64_t originalChunkSize = tdescTy.getChunkSize();
+
+    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
+
     Type elemTy = tdescTy.getElementType();
     VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy);
 
@@ -462,10 +492,29 @@ struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
     SmallVector<Value> convertedTdescs = pack(
         op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
 
-    SmallVector<Type> convertedMaskTypes =
-        getUnrolledTypes(maskTy, *targetShape);
-    SmallVector<Value> convertedMasks =
-        pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+    SmallVector<Type> convertedMaskTypes;
+    SmallVector<Value> convertedMasks;
+
+    if (originalChunkSize > 1) {
+      targetMaskShape.pop_back();
+      convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape);
+      SmallVector<Value> convertedMasks1D = pack(
+          op.getMask(), convertedMaskTypes, targetMaskShape, loc, rewriter);
+      int64_t blockedChunkSize = targetShape->back();
+      int64_t numNewChunks = originalChunkSize / blockedChunkSize;
+
+      for (auto mask : convertedMasks1D) {
+        for (int64_t i = 0; i < numNewChunks; ++i)
+          convertedMasks.push_back(mask);
+      }
+      // This is to handle the transpose effect when chunkSize > 1.
+      std::swap((*targetShape)[0], (*targetShape)[1]);
+      newValueTy = valueTy.cloneWith(*targetShape, elemTy);
+    } else {
+      convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape);
+      convertedMasks = pack(op.getMask(), convertedMaskTypes, targetMaskShape,
+                            loc, rewriter);
+    }
 
     SmallVector<Value> newOps;
     for (auto [t, m] : llvm::zip(convertedTdescs, convertedMasks)) {
@@ -476,7 +525,6 @@ struct UnrollLoadGatherOp : public UnrollPattern<xegpu::LoadGatherOp> {
     }
 
     Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter);
-
     rewriter.replaceOp(op, castOp);
     return success();
   }
@@ -489,8 +537,7 @@ struct UnrollPrefetchOp : public UnrollPattern<xegpu::PrefetchOp> {
     Location loc = op.getLoc();
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (!tdescTy.isScattered())
       return failure();
 
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
@@ -519,30 +566,51 @@ struct UnrollStoreScatterOp : public UnrollPattern<xegpu::StoreScatterOp> {
     VectorType valueTy = llvm::dyn_cast<VectorType>(op.getValue().getType());
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (!tdescTy.isScattered())
       return failure();
 
-    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
-
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
     if (!targetShape)
       return failure();
 
-    SmallVector<Type> convertedValTypes =
-        getUnrolledTypes(valueTy, *targetShape);
+    SmallVector<int64_t> targetIndiceShape(*targetShape);
+    int64_t originalChunkSize = tdescTy.getChunkSize();
+
+    VectorType maskTy = llvm::dyn_cast<VectorType>(op.getMask().getType());
+
     SmallVector<Type> convertedTdescTypes =
         getUnrolledTypes(tdescTy, *targetShape);
-
-    SmallVector<Value> convertedValues =
-        pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
     SmallVector<Value> convertedTdescs = pack(
         op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter);
 
-    SmallVector<Type> convertedMaskTypes =
-        getUnrolledTypes(maskTy, *targetShape);
-    SmallVector<Value> convertedMasks =
-        pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+    SmallVector<Type> convertedMaskTypes;
+    SmallVector<Value> convertedMasks;
+
+    if (originalChunkSize > 1) {
+      int64_t blockedChunkSize = targetShape->back();
+      int64_t numNewChunks = originalChunkSize / blockedChunkSize;
+      convertedMaskTypes = getUnrolledTypes(maskTy, (*targetShape)[0]);
+      SmallVector<Value> convertedMasks1D = pack(
+          op.getMask(), convertedMaskTypes, (*targetShape)[0], loc, rewriter);
+
+      for (auto mask : convertedMasks1D) {
+        for (int64_t i = 0; i < numNewChunks; ++i) {
+          convertedMasks.push_back(mask);
+        }
+      }
+      // This is to handle the transpose effect when chunkSize > 1.
+      std::swap((*targetShape)[0], (*targetShape)[1]);
+
+    } else {
+      convertedMaskTypes = getUnrolledTypes(maskTy, *targetShape);
+      convertedMasks =
+          pack(op.getMask(), convertedMaskTypes, *targetShape, loc, rewriter);
+    }
+
+    SmallVector<Type> convertedValTypes =
+        getUnrolledTypes(valueTy, *targetShape);
+    SmallVector<Value> convertedValues =
+        pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter);
 
     for (size_t i = 0; i < convertedValues.size(); ++i) {
       Value v = convertedValues[i];
@@ -565,8 +633,10 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
     Location loc = op.getLoc();
     xegpu::TensorDescType tdescTy = op.getTensorDescType();
 
-    // check if the tensor descriptor type is a 1d vector type
-    if (tdescTy.getRank() > 1)
+    if (tdescTy.getRank() > 2)
+      return failure();
+
+    if (!tdescTy.isScattered())
       return failure();
 
     std::optional<SmallVector<int64_t>> targetShape = getTargetShape(op);
@@ -580,12 +650,32 @@ struct UnrollUpdateOffsetOp : public UnrollPattern<xegpu::UpdateOffsetOp> {
 
     TypedValue<::mlir::VectorType> offsetVec = op.getOffsets();
     VectorType offsetVecTy = offsetVec.getType();
-    SmallVector<Type> convertedOffsetTypes =
-        getUnrolledTypes(offsetVecTy, *targetShape);
-    SmallVector<Value> convertedOffsetVec =
-        pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter);
-
+    SmallVector<Type> convertedOffsetTypes;
+    SmallVector<Value> convertedOffsetVec;
     SmallVector<Value> newOps;
+    int64_t originalChunkSize = tdescTy.getChunkSize();
+    if (originalChunkSize > 1) {
+      SmallVector<int64_t> shape1D(targetShape->begin(),
+                                   targetShape->end() - 1);
+      convertedOffsetTypes = getUnrolledTypes(offsetVecTy, shape1D);
+      SmallVector<Value> convertedOffsetVec1D =
+          pack(offsetVec, convertedOffsetTypes, shape1D, loc, rewriter);
+
+      int64_t blockedChunkSize = targetShape->back();
+      int64_t numNewChunks = originalChunkSize / blockedChunkSize;
+
+      for (auto offset : convertedOffsetVec1D) {
+        for (int64_t i = 0; i < numNewChunks; ++i) {
+          convertedOffsetVec.push_back(offset);
+        }
+      }
+
+    } else {
+      convertedOffsetTypes = getUnrolledTypes(offsetVecTy, *targetShape);
+      convertedOffsetVec =
+          pack(offsetVec, convertedOffsetTypes, *targetShape, loc, rewriter);
+    }
+
     for (auto [t, o] : llvm::zip(convertedTdesc, convertedOffsetVec)) {
       auto newOp =
           rewriter.create<xegpu::UpdateOffsetOp>(loc, t.getType(), t, o);
diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
index 52ec3b856da49..41414d802f212 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir
@@ -2,7 +2,7 @@
 
 gpu.module @test {
 
-  // CHECK-LABEL: test_create_nd_tdesc
+  // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast
@@ -10,31 +10,31 @@ gpu.module @test {
   // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>,
   // CHECK-SAME: !xegpu.tensor_desc<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
   // CHECK-SAME: to !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {__xegpu_blocking_tile_shape__ = array<i64: 8, 16>, __xegpu_blocking_unpack__}
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
+  gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     gpu.return %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
   }
 
   //-----
 
-  // CHECK-LABEL: test_create_nd_tdesc_1d
+  // CHECK-LABEL: create_nd_tdesc_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast
   // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>
   // CHECK-SAME: to !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {__xegpu_blocking_tile_shape__ = array<i64: 16>, __xegpu_blocking_unpack__}
-  gpu.func @test_create_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
+  gpu.func @create_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
     gpu.return %tdesc : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
   }
 
   //-----
 
-  // CHECK-LABEL: test_update_nd_tdesc
+  // CHECK-LABEL: update_nd_tdesc
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK-COUNT-6: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<8x16xf32>
-  gpu.func @test_update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
+  gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     gpu.return %update : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
@@ -42,11 +42,11 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_update_nd_tdesc_1d
+  // CHECK-LABEL: update_nd_tdesc_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-2: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK-COUNT-2: [[update:%.+]] = xegpu.update_nd_offset {{.*}} : !xegpu.tensor_desc<16xf32>
-  gpu.func @test_update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
+  gpu.func @update_nd_tdesc_1d(%src: memref<64xf32>) -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>> {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
     %update = xegpu.update_nd_offset %tdesc, [32] : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
     gpu.return %update : !xegpu.tensor_desc<32xf32, #xegpu.layout<inst_data = [16]>>
@@ -54,11 +54,11 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_prefetch_nd_tdesc
+  // CHECK-LABEL: prefetch_nd_tdesc
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK-COUNT-6: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<8x16xf32>
-  gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     gpu.return
@@ -66,23 +66,23 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_prefetch_nd_tdesc_1d
+  // CHECK-LABEL: prefetch_nd_tdesc_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK-COUNT-4: xegpu.prefetch_nd {{.*}} : !xegpu.tensor_desc<16xf32>
-  gpu.func @test_prefetch_nd_tdesc_1d(%src: memref<64xf32>) {
+  gpu.func @prefetch_nd_tdesc_1d(%src: memref<64xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
     xegpu.prefetch_nd %tdesc : !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
     gpu.return
   }
 
   //-----
-  // CHECK-LABEL: test_load_nd
+  // CHECK-LABEL: load_nd
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK-COUNT-6: [[ld:%.+]] = xegpu.load_nd {{.*}}  : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
   // CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32>
-  gpu.func @test_load_nd(%src: memref<24x32xf32>) -> vector<24x32xf32> {
+  gpu.func @load_nd(%src: memref<24x32xf32>) -> vector<24x32xf32> {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
     gpu.return %ld : vector<24x32xf32>
@@ -90,12 +90,12 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_load_nd_1d
+  // CHECK-LABEL: load_nd_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK-COUNT-4: [[ld:%.+]] = xegpu.load_nd {{.*}}  : !xegpu.tensor_desc<16xf32> -> vector<16xf32>
   // CHECK-COUNT-4: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<16xf32> into vector<64xf32>
-  gpu.func @test_load_nd_1d(%src: memref<64xf32>) -> vector<64xf32> {
+  gpu.func @load_nd_1d(%src: memref<64xf32>) -> vector<64xf32> {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
     %data = xegpu.load_nd %tdesc: !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>> -> vector<64xf32>
     gpu.return %data : vector<64xf32>
@@ -103,11 +103,11 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_store_nd
+  // CHECK-LABEL: store_nd
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   // CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK-COUNT-6: xegpu.store_nd {{.*}}  : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.func @test_store_nd(%src: memref<24x32xf32>) {
+  gpu.func @store_nd(%src: memref<24x32xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     %data = arith.constant dense<9.0> : vector<24x32xf32>
     xegpu.store_nd %data, %tdesc: vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
@@ -116,11 +116,11 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_store_nd_1d
+  // CHECK-LABEL: store_nd_1d
   // CHECK-SAME: [[arg0:%.+]]: memref<64xf32>
   // CHECK-COUNT-4: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<64xf32> -> !xegpu.tensor_desc<16xf32>
   // CHECK-COUNT-4: xegpu.store_nd {{.*}}  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
-  gpu.func @test_store_nd_1d(%src: memref<64xf32>) {
+  gpu.func @store_nd_1d(%src: memref<64xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0] : memref<64xf32> -> !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
     %data = arith.constant dense<9.0> : vector<64xf32>
     xegpu.store_nd %data, %tdesc: vector<64xf32>, !xegpu.tensor_desc<64xf32, #xegpu.layout<inst_data = [16]>>
@@ -129,7 +129,7 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_createNd_loadNd_storeNd
+  // CHECK-LABEL: createNd_loadNd_storeNd
   // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32>
   //CHECK-COUNT-6: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]][{{.*}}] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   //CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}}  : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
@@ -137,7 +137,7 @@ gpu.module @test {
   //CHECK: [[add:%.+]] = arith.addf {{.*}} : vector<24x32xf32>
   //CHECK-COUNT-6: [[extract:%.+]] = vector.extract_strided_slice {{.*}} : vector<24x32xf32> to vector<8x16xf32>
   //CHECK-COUNT-6: xegpu.store_nd {{.*}} : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32>
-  gpu.func @test_createNd_loadNd_storeNd(%src: memref<24x32xf32>) {
+  gpu.func @createNd_loadNd_storeNd(%src: memref<24x32xf32>) {
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>>
     %data = arith.constant dense<9.0> : vector<24x32xf32>
     %ld = xegpu.load_nd %tdesc: !xegpu.tensor_desc<24x32xf32, #xegpu.layout<inst_data = [8, 16]>> -> vector<24x32xf32>
@@ -148,23 +148,23 @@ gpu.module @test {
 
   //-----
 
-  // CHECK-LABEL: test_dpas
+  // CHECK-LABEL: dpas
   // CHECK-SAME: [[arg0:%.+]]: vector<32x32xf16>, [[arg1:%.+]]: vector<32x32xf16>
   //CHECK-COUNT-8: [[extract1:%.+]] = vector.extract_strided_slice [[arg0]] {{.*}} : vector<32x32xf16> to vector<8x16xf16>
   //CHECK-COUNT-4: [[extract2:%.+]] = vector.extract_strided_slice [[arg1]] {{.*}} : vector<32x32xf16> to vector<16x16xf16>
   //CHECK-COUNT-16: [[dpas:%.+]] = xegpu.dpas {{.*}} -> vector<8x16xf32>
   //CHECK-COUNT-8: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32>
-  gpu.func @test_dpas(%a: vector<32x32xf16>, %b: vector<32x32xf16>) -> vector<32x32xf32> {
+  gpu.func @dpas(%a: vector<32x32xf16>, %b: vector<32x32xf16>) -> vector<32x32xf32> {
     %c = xegpu.dpas %a, %b : vector<32x32xf16>, vector<32x32xf16> -> vector<32x32xf32>
     gpu.return %c : vector<32x32xf32>
   }
 
 //-----
 
-  // CHECK-LABEL: test_create_tdesc_vec
+  // CHECK-LABEL: create_tdesc_vec
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  gpu.func @test_create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
+  gpu.func @create_tdesc_vec(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
     %cst = arith.constant dense<[
     0,   8,  16,  24,  32,  40,  48,  56,
     64,  72,  80,  88,  96, 104, 112, 120,
@@ -177,10 +177,10 @@ gpu.module @test {
 
 //-----
 
-  // CHECK-LABEL: test_create_tdesc_step
+  // CHECK-LABEL: create_tdesc_step
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  gpu.func @test_create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
+  gpu.func @create_tdesc_step(%src: ui64) -> !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>> {
     %step = arith.constant dense<8> : vector<32xindex>
     %seq = vector.step  : vector<32xindex>
     %cst = arith.muli %seq, %step : vector<32xindex>
@@ -190,11 +190,11 @@ gpu.module @test {
 
 //-----
 
-  // CHECK-LABEL: test_load
+  // CHECK-LABEL: load
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
   // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-  gpu.func @test_load(%src: ui64) -> vector<32xf32> {
+  gpu.func @load(%src: ui64) -> vector<32xf32> {
     %cst = arith.constant dense<[
     0,   8,  16,  24,  32,  40,  48,  56,
     64,  72,  80,  88,  96, 104, 112, 120,
@@ -212,11 +212,11 @@ gpu.module @test {
 
 //-----
 
-  // CHECK-LABEL: test_prefetch
+  // CHECK-LABEL: prefetch
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
   // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  gpu.func @test_prefetch(%src: ui64)  {
+  gpu.func @prefetch(%src: ui64)  {
 
     %cst = arith.constant dense<[
     0,   8,  16,  24,  32,  40,  48,  56,
@@ -233,11 +233,11 @@ gpu.module @test {
 
 //-----
 
-  // CHECK-LABEL: test_store
+  // CHECK-LABEL: store
   // CHECK-SAME: [[arg0:%.+]]: ui64
   // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
   // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
-  gpu.func @test_store(%src: ui64) {
+  gpu.func @store(%src: ui64) {
     %cst = arith.constant dense<[
     0,   8,  16,  24,  32,  40,  48,  56,
     64,  72,  80,  88,  96, 104, 112, 120,
@@ -256,47 +256,129 @@ gpu.module @test {
   }
 
 //-----
+  // CHECK-LABEL: create_tdesc_step_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 4 : i64>>
+  gpu.func @create_tdesc_step_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>> {
+    %step = arith.constant dense<8> : vector<32xindex>
+    %seq = vector.step  : vector<32xindex>
+    %cst = arith.muli %seq, %step : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 4]>>
+  }
 
-  // CHECK-LABEL: test_prefetch_load_store_update
+//-----
+  // CHECK-LABEL: create_tdesc_step_chunk2
   // CHECK-SAME: [[arg0:%.+]]: ui64
-  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
-   // CHECK-COUNT-2: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex>
-   // CHECK-COUNT-2: xegpu.load  {{.*}} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
-  // CHECK-COUNT-2: xegpu.store  {{.*}} : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  gpu.func @create_tdesc_step_chunk2(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> {
+    %step = arith.constant dense<8> : vector<32xindex>
+    %seq = vector.step  : vector<32xindex>
+    %cst = arith.muli %seq, %step : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+  }
 
-  gpu.func @test_prefetch_load_store_update(%src: ui64)  {
+// CHECK-LABEL: create_tdesc_step_chunk3
+  // CHECK-SAME: [[arg0:%.+]]: ui64  
+  // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
+  // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
+  // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+ // CHECK: arith.addi %{{.*}}, %{{.*}} : vector<16xindex>
+  // CHECK: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+    gpu.func @create_tdesc_step_chunk3(%src: ui64) -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>> {
+    %step = arith.constant dense<8> : vector<16xindex>
+    %seq = vector.step  : vector<16xindex>
+    %cst = arith.muli %seq, %step : vector<16xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32,  #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>>
+    gpu.return %tdesc : !xegpu.tensor_desc<16x8xf32,  #xegpu.scatter_tdesc_attr<chunk_size=8>, #xegpu.layout<inst_data = [16, 2]>>
+  }
+
+//-----
+  // CHECK-LABEL: load_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-4: xegpu.load  {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1> -> vector<2x16xf32>
 
+  gpu.func @load_chunk(%src: ui64) -> vector<4x32xf32> {
     %cst = arith.constant dense<[
-    0,   8,  16,  24,  32,  40,  48,  56,
-    64,  72,  80,  88,  96, 104, 112, 120,
-    128, 136, 144, 152, 160, 168, 176, 184,
-    192, 200, 208, 216, 224, 232, 240, 248 
+        0,   8,  16,  24,  32,  40,  48,  56,
+        64,  72,  80,  88,  96, 104, 112, 120,
+        128, 136, 144, 152, 160, 168, 176, 184,
+        192, 200, 208, 216, 224, 232, 240, 248 
     ]> : vector<32xindex>
+    
+    %c17 = arith.constant 17: index
+    %mask = vector.create_mask %c17: vector<32xi1>
 
-    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
-    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32xf32,  #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>
-   
-    %delta = arith.constant dense<[
-    32,   32,  32,  32,  32,  32,  32,  32,
-    32,   32,  32,  32,  32,  32,  32,  64,
-    128, 128, 128, 128, 128, 128, 128, 128,
-    128, 128, 128, 128, 128, 128, 128, 256 
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> 
+    %ld = xegpu.load %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>: !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xi1> -> vector<4x32xf32>
+    
+    gpu.return %ld : vector<4x32xf32> 
+   }
+
+//-----
+  // CHECK-LABEL: store_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} :  ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-4: xegpu.store  {{.*}} <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}> : vector<2x16xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xi1>
+  gpu.func @store_chunk(%src: ui64) {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248 
     ]> : vector<32xindex>
-    %new_tdesc = xegpu.update_offset %tdesc, %delta
-              : !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xindex>     
- 
+    
     %c17 = arith.constant 17: index
     %mask = vector.create_mask %c17: vector<32xi1>
 
-    %ld_vec = xegpu.load %new_tdesc, %mask: !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, vector<32xi1> -> vector<32xf32>
+    %st_vec = arith.constant dense<1023.>: vector<4x32xf32>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    xegpu.store %st_vec, %tdesc, %mask <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose}>: vector<4x32xf32>, !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16,2]>>, vector<32xi1>
+    
+    gpu.return
+  }
 
-    %st_vec = arith.addf %ld_vec, %ld_vec : vector<32xf32>
-    xegpu.store %st_vec, %tdesc, %mask: 
-                 vector<32xf32>, 
-                 !xegpu.tensor_desc<32xf32, #xegpu.scatter_tdesc_attr<>, #xegpu.layout<inst_data = [16]>>, 
-                 vector<32xi1>
-  
+//-----
+  // CHECK-LABEL: prefetch_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-2: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-2: xegpu.prefetch {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  gpu.func @prefetch_chunk(%src: ui64)  {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248 
+      ]> : vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    xegpu.prefetch %tdesc: !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+    
     gpu.return
   }
+
+//-----
+  // CHECK-LABEL: update_chunk
+  // CHECK-SAME: [[arg0:%.+]]: ui64
+  // CHECK-COUNT-4: xegpu.create_tdesc [[arg0]], {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
+  // CHECK-COUNT-4: xegpu.update_offset {{.*}} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>, vector<16xindex>
+  gpu.func @update_chunk(%src: ui64) -> !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>> {
+    %cst = arith.constant dense<[
+      0,   8,  16,  24,  32,  40,  48,  56,
+      64,  72,  80,  88,  96, 104, 112, 120,
+      128, 136, 144, 152, 160, 168, 176, 184,
+      192, 200, 208, 216, 224, 232, 240, 248 
+    ]> : vector<32xindex>
+    %delta = arith.constant dense<32>: vector<32xindex>
+    %tdesc = xegpu.create_tdesc %src, %cst : ui64, vector<32xindex> -> !xegpu.tensor_desc<32x4xf32,  #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+
+    %new_tdesc = xegpu.update_offset %tdesc, %delta
+        : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>, vector<32xindex>
+
+    gpu.return %new_tdesc : !xegpu.tensor_desc<32x4xf32, #xegpu.scatter_tdesc_attr<chunk_size=4>, #xegpu.layout<inst_data = [16, 2]>>
+  }  
 }
+
diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
index 57aaecbd7962f..4400d6d9625f7 100644
--- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
+++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp
@@ -19,6 +19,10 @@ using namespace mlir::xegpu;
 
 namespace {
 
+#define DEBUG_TYPE "test-xegpu-unroll"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
 struct TestXeGPUUnrollingPatterns
     : public PassWrapper<TestXeGPUUnrollingPatterns,
                          OperationPass<gpu::GPUModuleOp>> {
@@ -48,7 +52,9 @@ struct TestXeGPUUnrollingPatterns
     options.setNativeShapeFn(
         [&](Operation *op) -> std::optional<SmallVector<int64_t>> {
           if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp,
-                  xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::StoreNdOp>(op)) {
+                  xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::StoreNdOp,
+                  xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp,
+                  xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) {
             xegpu::TensorDescType tdescTy;
             if (auto createNdOp = dyn_cast<xegpu::CreateNdDescOp>(op)) {
               tdescTy = createNdOp.getType();
@@ -61,20 +67,7 @@ struct TestXeGPUUnrollingPatterns
               tdescTy = loadNdOp.getTensorDescType();
             } else if (auto storeNdOp = dyn_cast<xegpu::StoreNdOp>(op)) {
               tdescTy = storeNdOp.getTensorDescType();
-            }
-
-            if (auto layout = tdescTy.getLayoutAttr()) {
-              auto inst_data = layout.getInstData();
-              if (inst_data && layout.isSgLayout())
-                return SmallVector<int64_t>(inst_data.asArrayRef().begin(),
-                                            inst_data.asArrayRef().end());
-            }
-          }
-
-          if (isa<xegpu::CreateDescOp, xegpu::UpdateOffsetOp, xegpu::PrefetchOp,
-                  xegpu::LoadGatherOp, xegpu::StoreScatterOp>(op)) {
-            xegpu::TensorDescType tdescTy;
-            if (auto createOp = dyn_cast<xegpu::CreateDescOp>(op)) {
+            } else if (auto createOp = dyn_cast<xegpu::CreateDescOp>(op)) {
               tdescTy = createOp.getType();
             } else if (auto updateOp = dyn_cast<xegpu::UpdateOffsetOp>(op)) {
               tdescTy = updateOp.getTensorDescType();
@@ -111,14 +104,40 @@ struct TestXeGPUUnrollingPatterns
             Attribute encoding = tdescTy.getEncoding();
             auto layout = llvm::dyn_cast_if_present<xegpu::LayoutAttr>(
                 tdescTy.getLayout());
+
+            // If the encoding is a ScatterTensorDescAttr, we need to
+            // potentially adjust the chunk size based on the inst_data.
+            if (encoding && mlir::isa<xegpu::ScatterTensorDescAttr>(encoding)) {
+              auto scatterAttr =
+                  mlir::dyn_cast<xegpu::ScatterTensorDescAttr>(encoding);
+              int64_t chunkSize = scatterAttr.getChunkSize().getInt();
+
+              if (chunkSize > 1) {
+                int64_t blockedChunkSize = chunkSize;
+                auto instData = layout.getInstData();
+                if (!instData.empty())
+                  blockedChunkSize = instData.asArrayRef().back();
+
+                auto chunkSizeAttr = mlir::IntegerAttr::get(
+                    mlir::IntegerType::get(ctx, 64), blockedChunkSize);
+
+                // To create a new attribute with a different chunk_size:
+                auto newEncoding = xegpu::ScatterTensorDescAttr::get(
+                    ctx, scatterAttr.getMemorySpace(), chunkSizeAttr);
+
+                encoding = newEncoding;
+              }
+            }
             if (layout) {
               if (layout.getLaneLayout() == nullptr)
                 layout = xegpu::LayoutAttr();
               else
                 layout = layout.dropInstData();
             }
+
             newTy = xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
                                                layout);
+
           } else {
             newTy = type.clone(tileShape, elemTy);
           }

From fd7e46b864229a270726bd1026387740b9113094 Mon Sep 17 00:00:00 2001
From: Daniel Thornburgh <dthorn@google.com>
Date: Tue, 17 Jun 2025 15:50:42 -0700
Subject: [PATCH 815/851] Revert "[libc++] Remove trailing newline from
 _LIBCPP_ASSERTION_HANDLER calls" (#144615)

Reverts llvm/llvm-project#143573
---
 libcxx/include/__assert               | 4 ++--
 libcxx/src/verbose_abort.cpp          | 3 ---
 libcxx/test/support/check_assertion.h | 2 +-
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/libcxx/include/__assert b/libcxx/include/__assert
index 1bfed2890b79f..90eaa6023587b 100644
--- a/libcxx/include/__assert
+++ b/libcxx/include/__assert
@@ -20,8 +20,8 @@
 #define _LIBCPP_ASSERT(expression, message)                                                                            \
   (__builtin_expect(static_cast<bool>(expression), 1)                                                                  \
        ? (void)0                                                                                                       \
-       : _LIBCPP_ASSERTION_HANDLER(                                                                                    \
-             __FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(expression) " failed: " message))
+       : _LIBCPP_ASSERTION_HANDLER(__FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(            \
+             expression) " failed: " message "\n"))
 
 // WARNING: __builtin_assume can currently inhibit optimizations. Only add assumptions with a clear
 // optimization intent. See https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609 for a
diff --git a/libcxx/src/verbose_abort.cpp b/libcxx/src/verbose_abort.cpp
index efb7b9be6f61c..94bdb451dee7a 100644
--- a/libcxx/src/verbose_abort.cpp
+++ b/libcxx/src/verbose_abort.cpp
@@ -30,9 +30,6 @@ _LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) noexcept {
     va_list list;
     va_start(list, format);
     std::vfprintf(stderr, format, list);
-    // Callers of `__libcpp_verbose_abort` do not include a newline but when
-    // writing the message to stderr we need to include one.
-    std::fputc('\n', stderr);
     va_end(list);
   }
 
diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index ea04944ea9326..a279400d651b4 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -340,7 +340,7 @@ void std::__libcpp_verbose_abort(char const* format, ...) noexcept {
 
   std::fprintf(stderr, "%s\n", Marker);
   std::vfprintf(stderr, format, args);
-  std::fprintf(stderr, "\n%s", Marker);
+  std::fprintf(stderr, "%s", Marker);
 
   va_end(args);
 

From 1cd18bc894b97b282677c1d140688a27ebbec924 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 07:59:05 +0900
Subject: [PATCH 816/851] AMDGPU: Add cost model tests for
 minimumnum/maximumnum (#141904)

The f16 cases in particular look broken since every vector size
has the same reported cost.
---
 .../Analysis/CostModel/AMDGPU/maximumnum.ll   | 452 ++++++++++++++++++
 .../Analysis/CostModel/AMDGPU/minimumnum.ll   | 452 ++++++++++++++++++
 2 files changed, 904 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
 create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
new file mode 100644
index 0000000000000..5b158e3d8d674
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
@@ -0,0 +1,452 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=ALL,GFX7 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=ALL,GFX12 %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=SIZE,GFX7-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=SIZE,GFX8-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=SIZE,GFX12-SIZE %s
+
+define void @maximumnum_f16() {
+; GFX7-LABEL: 'maximumnum_f16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'maximumnum_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_f16'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_f16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_f16'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+  %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+  ret void
+}
+
+define void @maximumnum_bf16() {
+; GFX7-LABEL: 'maximumnum_bf16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'maximumnum_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_bf16'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_bf16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_bf16'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+  %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+  ret void
+}
+
+define void @maximumnum_f32() {
+; ALL-LABEL: 'maximumnum_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximumnum_f32'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+  %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+  %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+  ret void
+}
+
+define void @maximumnum_f64() {
+; ALL-LABEL: 'maximumnum_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximumnum_f64'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+  %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+  %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+  ret void
+}
+
+define void @maximumnum_f16_no_ieee() #0 {
+; GFX7-LABEL: 'maximumnum_f16_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'maximumnum_f16_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_f16_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_f16_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_f16_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_f16_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_f16_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_f16_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+  %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+  ret void
+}
+
+define void @maximumnum_bf16_no_ieee() #0 {
+; GFX7-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+  %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+  ret void
+}
+
+define void @maximumnum_f32_no_ieee() #0 {
+; ALL-LABEL: 'maximumnum_f32_no_ieee'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+  %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+  %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+  ret void
+}
+
+define void @maximumnum_f64_no_ieee() #0 {
+; ALL-LABEL: 'maximumnum_f64_no_ieee'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+  %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+  %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+  ret void
+}
+
+attributes #0 = { "amdgpu-ieee"="false" }
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
new file mode 100644
index 0000000000000..97715cbab7d8a
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
@@ -0,0 +1,452 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=ALL,GFX7 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=ALL,GFX8 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,GFX9 %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=ALL,GFX12 %s
+
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=SIZE,GFX7-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=SIZE,GFX8-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=SIZE,GFX12-SIZE %s
+
+define void @minimumnum_f16() {
+; GFX7-LABEL: 'minimumnum_f16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'minimumnum_f16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_f16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_f16'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_f16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_f16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_f16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_f16'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+  %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+  ret void
+}
+
+define void @minimumnum_bf16() {
+; GFX7-LABEL: 'minimumnum_bf16'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'minimumnum_bf16'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_bf16'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_bf16'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_bf16'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_bf16'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_bf16'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_bf16'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+  %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+  ret void
+}
+
+define void @minimumnum_f32() {
+; ALL-LABEL: 'minimumnum_f32'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimumnum_f32'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+  %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+  %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+  ret void
+}
+
+define void @minimumnum_f64() {
+; ALL-LABEL: 'minimumnum_f64'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimumnum_f64'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+  %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+  %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+  ret void
+}
+
+define void @minimumnum_f16_no_ieee() #0 {
+; GFX7-LABEL: 'minimumnum_f16_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'minimumnum_f16_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_f16_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_f16_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_f16_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_f16_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_f16_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_f16_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+  %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+  %v3f16 = call <3x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+  %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+  %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+  %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+  ret void
+}
+
+define void @minimumnum_bf16_no_ieee() #0 {
+; GFX7-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX8-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+  %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+  %v3bf16 = call <3x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+  %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+  %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+  %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+  ret void
+}
+
+define void @minimumnum_f32_no_ieee() #0 {
+; ALL-LABEL: 'minimumnum_f32_no_ieee'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+  %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+  %v3f32 = call <3x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+  %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+  %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+  %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+  ret void
+}
+
+define void @minimumnum_f64_no_ieee() #0 {
+; ALL-LABEL: 'minimumnum_f64_no_ieee'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+  %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+  %v3f64 = call <3x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+  %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+  %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+  %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+  ret void
+}
+
+attributes #0 = { "amdgpu-ieee"="false" }

From 87b13ada109643bbf5495727b0bf59a46bd533aa Mon Sep 17 00:00:00 2001
From: Finn Plummer <finn.c.plum@gmail.com>
Date: Tue, 17 Jun 2025 15:59:38 -0700
Subject: [PATCH 817/851] [HLSL][RootSignature] Implement serialization of
 remaining Root Elements (#143198)

Implements serialization of the remaining `RootElement`s, namely
`RootDescriptor`s and `StaticSampler`s.

- Adds unit testing for the serialization methods

Resolves https://github.com/llvm/llvm-project/issues/138191
Resolves https://github.com/llvm/llvm-project/issues/138193
---
 .../Frontend/HLSL/HLSLRootSignatureUtils.h    |   6 +
 .../Frontend/HLSL/HLSLRootSignatureUtils.cpp  | 142 ++++++++++++++++++
 .../Frontend/HLSLRootSignatureDumpTest.cpp    | 122 ++++++++++++++-
 3 files changed, 269 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
index 4d2cd183ebcbc..25c2a9f0cc808 100644
--- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
+++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignatureUtils.h
@@ -38,6 +38,12 @@ LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
 
 LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const DescriptorTable &Table);
 
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
+                                 const RootDescriptor &Descriptor);
+
+LLVM_ABI raw_ostream &operator<<(raw_ostream &OS,
+                                 const StaticSampler &StaticSampler);
+
 LLVM_ABI void dumpRootElements(raw_ostream &OS, ArrayRef<RootElement> Elements);
 
 class MetadataBuilder {
diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
index 1e198b639cfdc..a1ddb318055be 100644
--- a/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
+++ b/llvm/lib/Frontend/HLSL/HLSLRootSignatureUtils.cpp
@@ -98,6 +98,109 @@ static raw_ostream &operator<<(raw_ostream &OS,
   return OS;
 }
 
+static const EnumEntry<SamplerFilter> SamplerFilterNames[] = {
+    {"MinMagMipPoint", SamplerFilter::MinMagMipPoint},
+    {"MinMagPointMipLinear", SamplerFilter::MinMagPointMipLinear},
+    {"MinPointMagLinearMipPoint", SamplerFilter::MinPointMagLinearMipPoint},
+    {"MinPointMagMipLinear", SamplerFilter::MinPointMagMipLinear},
+    {"MinLinearMagMipPoint", SamplerFilter::MinLinearMagMipPoint},
+    {"MinLinearMagPointMipLinear", SamplerFilter::MinLinearMagPointMipLinear},
+    {"MinMagLinearMipPoint", SamplerFilter::MinMagLinearMipPoint},
+    {"MinMagMipLinear", SamplerFilter::MinMagMipLinear},
+    {"Anisotropic", SamplerFilter::Anisotropic},
+    {"ComparisonMinMagMipPoint", SamplerFilter::ComparisonMinMagMipPoint},
+    {"ComparisonMinMagPointMipLinear",
+     SamplerFilter::ComparisonMinMagPointMipLinear},
+    {"ComparisonMinPointMagLinearMipPoint",
+     SamplerFilter::ComparisonMinPointMagLinearMipPoint},
+    {"ComparisonMinPointMagMipLinear",
+     SamplerFilter::ComparisonMinPointMagMipLinear},
+    {"ComparisonMinLinearMagMipPoint",
+     SamplerFilter::ComparisonMinLinearMagMipPoint},
+    {"ComparisonMinLinearMagPointMipLinear",
+     SamplerFilter::ComparisonMinLinearMagPointMipLinear},
+    {"ComparisonMinMagLinearMipPoint",
+     SamplerFilter::ComparisonMinMagLinearMipPoint},
+    {"ComparisonMinMagMipLinear", SamplerFilter::ComparisonMinMagMipLinear},
+    {"ComparisonAnisotropic", SamplerFilter::ComparisonAnisotropic},
+    {"MinimumMinMagMipPoint", SamplerFilter::MinimumMinMagMipPoint},
+    {"MinimumMinMagPointMipLinear", SamplerFilter::MinimumMinMagPointMipLinear},
+    {"MinimumMinPointMagLinearMipPoint",
+     SamplerFilter::MinimumMinPointMagLinearMipPoint},
+    {"MinimumMinPointMagMipLinear", SamplerFilter::MinimumMinPointMagMipLinear},
+    {"MinimumMinLinearMagMipPoint", SamplerFilter::MinimumMinLinearMagMipPoint},
+    {"MinimumMinLinearMagPointMipLinear",
+     SamplerFilter::MinimumMinLinearMagPointMipLinear},
+    {"MinimumMinMagLinearMipPoint", SamplerFilter::MinimumMinMagLinearMipPoint},
+    {"MinimumMinMagMipLinear", SamplerFilter::MinimumMinMagMipLinear},
+    {"MinimumAnisotropic", SamplerFilter::MinimumAnisotropic},
+    {"MaximumMinMagMipPoint", SamplerFilter::MaximumMinMagMipPoint},
+    {"MaximumMinMagPointMipLinear", SamplerFilter::MaximumMinMagPointMipLinear},
+    {"MaximumMinPointMagLinearMipPoint",
+     SamplerFilter::MaximumMinPointMagLinearMipPoint},
+    {"MaximumMinPointMagMipLinear", SamplerFilter::MaximumMinPointMagMipLinear},
+    {"MaximumMinLinearMagMipPoint", SamplerFilter::MaximumMinLinearMagMipPoint},
+    {"MaximumMinLinearMagPointMipLinear",
+     SamplerFilter::MaximumMinLinearMagPointMipLinear},
+    {"MaximumMinMagLinearMipPoint", SamplerFilter::MaximumMinMagLinearMipPoint},
+    {"MaximumMinMagMipLinear", SamplerFilter::MaximumMinMagMipLinear},
+    {"MaximumAnisotropic", SamplerFilter::MaximumAnisotropic},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const SamplerFilter &Filter) {
+  printEnum(OS, Filter, ArrayRef(SamplerFilterNames));
+
+  return OS;
+}
+
+static const EnumEntry<TextureAddressMode> TextureAddressModeNames[] = {
+    {"Wrap", TextureAddressMode::Wrap},
+    {"Mirror", TextureAddressMode::Mirror},
+    {"Clamp", TextureAddressMode::Clamp},
+    {"Border", TextureAddressMode::Border},
+    {"MirrorOnce", TextureAddressMode::MirrorOnce},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const TextureAddressMode &Address) {
+  printEnum(OS, Address, ArrayRef(TextureAddressModeNames));
+
+  return OS;
+}
+
+static const EnumEntry<ComparisonFunc> ComparisonFuncNames[] = {
+    {"Never", ComparisonFunc::Never},
+    {"Less", ComparisonFunc::Less},
+    {"Equal", ComparisonFunc::Equal},
+    {"LessEqual", ComparisonFunc::LessEqual},
+    {"Greater", ComparisonFunc::Greater},
+    {"NotEqual", ComparisonFunc::NotEqual},
+    {"GreaterEqual", ComparisonFunc::GreaterEqual},
+    {"Always", ComparisonFunc::Always},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const ComparisonFunc &CompFunc) {
+  printEnum(OS, CompFunc, ArrayRef(ComparisonFuncNames));
+
+  return OS;
+}
+
+static const EnumEntry<StaticBorderColor> StaticBorderColorNames[] = {
+    {"TransparentBlack", StaticBorderColor::TransparentBlack},
+    {"OpaqueBlack", StaticBorderColor::OpaqueBlack},
+    {"OpaqueWhite", StaticBorderColor::OpaqueWhite},
+    {"OpaqueBlackUint", StaticBorderColor::OpaqueBlackUint},
+    {"OpaqueWhiteUint", StaticBorderColor::OpaqueWhiteUint},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const StaticBorderColor &BorderColor) {
+  printEnum(OS, BorderColor, ArrayRef(StaticBorderColorNames));
+
+  return OS;
+}
+
 static const EnumEntry<dxil::ResourceClass> ResourceClassNames[] = {
     {"CBV", dxil::ResourceClass::CBuffer},
     {"SRV", dxil::ResourceClass::SRV},
@@ -112,6 +215,20 @@ static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) {
   return OS;
 }
 
+static const EnumEntry<RootDescriptorFlags> RootDescriptorFlagNames[] = {
+    {"DataVolatile", RootDescriptorFlags::DataVolatile},
+    {"DataStaticWhileSetAtExecute",
+     RootDescriptorFlags::DataStaticWhileSetAtExecute},
+    {"DataStatic", RootDescriptorFlags::DataStatic},
+};
+
+static raw_ostream &operator<<(raw_ostream &OS,
+                               const RootDescriptorFlags &Flags) {
+  printFlags(OS, Flags, ArrayRef(RootDescriptorFlagNames));
+
+  return OS;
+}
+
 static const EnumEntry<DescriptorRangeFlags> DescriptorRangeFlagNames[] = {
     {"DescriptorsVolatile", DescriptorRangeFlags::DescriptorsVolatile},
     {"DataVolatile", DescriptorRangeFlags::DataVolatile},
@@ -182,6 +299,31 @@ raw_ostream &operator<<(raw_ostream &OS, const DescriptorTableClause &Clause) {
   return OS;
 }
 
+raw_ostream &operator<<(raw_ostream &OS, const RootDescriptor &Descriptor) {
+  ClauseType Type = ClauseType(llvm::to_underlying(Descriptor.Type));
+  OS << "Root" << Type << "(" << Descriptor.Reg
+     << ", space = " << Descriptor.Space
+     << ", visibility = " << Descriptor.Visibility
+     << ", flags = " << Descriptor.Flags << ")";
+
+  return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const StaticSampler &Sampler) {
+  OS << "StaticSampler(" << Sampler.Reg << ", filter = " << Sampler.Filter
+     << ", addressU = " << Sampler.AddressU
+     << ", addressV = " << Sampler.AddressV
+     << ", addressW = " << Sampler.AddressW
+     << ", mipLODBias = " << Sampler.MipLODBias
+     << ", maxAnisotropy = " << Sampler.MaxAnisotropy
+     << ", comparisonFunc = " << Sampler.CompFunc
+     << ", borderColor = " << Sampler.BorderColor
+     << ", minLOD = " << Sampler.MinLOD << ", maxLOD = " << Sampler.MaxLOD
+     << ", space = " << Sampler.Space << ", visibility = " << Sampler.Visibility
+     << ")";
+  return OS;
+}
+
 void dumpRootElements(raw_ostream &OS, ArrayRef<RootElement> Elements) {
   OS << "RootElements{";
   bool First = true;
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
index 1a0c8e2a16396..1c37ee709e098 100644
--- a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
+++ b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp
@@ -108,6 +108,127 @@ TEST(HLSLRootSignatureTest, DescriptorTableDump) {
   EXPECT_EQ(Out, Expected);
 }
 
+TEST(HLSLRootSignatureTest, RootCBVDump) {
+  RootDescriptor Descriptor;
+  Descriptor.Type = DescriptorType::CBuffer;
+  Descriptor.Reg = {RegisterType::BReg, 0};
+  Descriptor.setDefaultFlags();
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Descriptor;
+  OS.flush();
+
+  std::string Expected = "RootCBV(b0, space = 0, "
+                         "visibility = All, "
+                         "flags = DataStaticWhileSetAtExecute)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, RootSRVDump) {
+  RootDescriptor Descriptor;
+  Descriptor.Type = DescriptorType::SRV;
+  Descriptor.Reg = {RegisterType::TReg, 0};
+  Descriptor.Space = 42;
+  Descriptor.Visibility = ShaderVisibility::Geometry;
+  Descriptor.Flags = RootDescriptorFlags::None;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Descriptor;
+  OS.flush();
+
+  std::string Expected =
+      "RootSRV(t0, space = 42, visibility = Geometry, flags = None)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, RootUAVDump) {
+  RootDescriptor Descriptor;
+  Descriptor.Type = DescriptorType::UAV;
+  Descriptor.Reg = {RegisterType::UReg, 92374};
+  Descriptor.Space = 932847;
+  Descriptor.Visibility = ShaderVisibility::Hull;
+  Descriptor.Flags = RootDescriptorFlags::ValidFlags;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Descriptor;
+  OS.flush();
+
+  std::string Expected =
+      "RootUAV(u92374, space = 932847, visibility = Hull, flags = "
+      "DataVolatile | "
+      "DataStaticWhileSetAtExecute | "
+      "DataStatic)";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, DefaultStaticSamplerDump) {
+  StaticSampler Sampler;
+  Sampler.Reg = {RegisterType::SReg, 0};
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Sampler;
+  OS.flush();
+
+  std::string Expected = "StaticSampler(s0, "
+                         "filter = Anisotropic, "
+                         "addressU = Wrap, "
+                         "addressV = Wrap, "
+                         "addressW = Wrap, "
+                         "mipLODBias = 0.000000e+00, "
+                         "maxAnisotropy = 16, "
+                         "comparisonFunc = LessEqual, "
+                         "borderColor = OpaqueWhite, "
+                         "minLOD = 0.000000e+00, "
+                         "maxLOD = 3.402823e+38, "
+                         "space = 0, "
+                         "visibility = All"
+                         ")";
+  EXPECT_EQ(Out, Expected);
+}
+
+TEST(HLSLRootSignatureTest, DefinedStaticSamplerDump) {
+  StaticSampler Sampler;
+  Sampler.Reg = {RegisterType::SReg, 0};
+
+  Sampler.Filter = SamplerFilter::ComparisonMinMagLinearMipPoint;
+  Sampler.AddressU = TextureAddressMode::Mirror;
+  Sampler.AddressV = TextureAddressMode::Border;
+  Sampler.AddressW = TextureAddressMode::Clamp;
+  Sampler.MipLODBias = 4.8f;
+  Sampler.MaxAnisotropy = 32;
+  Sampler.CompFunc = ComparisonFunc::NotEqual;
+  Sampler.BorderColor = StaticBorderColor::OpaqueBlack;
+  Sampler.MinLOD = 1.0f;
+  Sampler.MaxLOD = 32.0f;
+  Sampler.Space = 7;
+  Sampler.Visibility = ShaderVisibility::Domain;
+
+  std::string Out;
+  llvm::raw_string_ostream OS(Out);
+  OS << Sampler;
+  OS.flush();
+
+  std::string Expected = "StaticSampler(s0, "
+                         "filter = ComparisonMinMagLinearMipPoint, "
+                         "addressU = Mirror, "
+                         "addressV = Border, "
+                         "addressW = Clamp, "
+                         "mipLODBias = 4.800000e+00, "
+                         "maxAnisotropy = 32, "
+                         "comparisonFunc = NotEqual, "
+                         "borderColor = OpaqueBlack, "
+                         "minLOD = 1.000000e+00, "
+                         "maxLOD = 3.200000e+01, "
+                         "space = 7, "
+                         "visibility = Domain"
+                         ")";
+  EXPECT_EQ(Out, Expected);
+}
+
 TEST(HLSLRootSignatureTest, DefaultRootConstantsDump) {
   RootConstants Constants;
   Constants.Num32BitConstants = 1;
@@ -173,7 +294,6 @@ TEST(HLSLRootSignatureTest, AllRootFlagsDump) {
                          "DenyMeshShaderRootAccess | "
                          "CBVSRVUAVHeapDirectlyIndexed | "
                          "SamplerHeapDirectlyIndexed)";
-
   EXPECT_EQ(Out, Expected);
 }
 

From cb63b75e32a415c9bfc298ed7fdcd67e8d9de54c Mon Sep 17 00:00:00 2001
From: John Harrison <harjohn@google.com>
Date: Tue, 17 Jun 2025 16:01:40 -0700
Subject: [PATCH 818/851] Revert "[lldb-dap] Refactoring DebugCommunication to
 improve test consistency. (#143818)

This reverts commit 362b9d78b4ee9107da2b5e90b3764b0f0fa610fe.

Buildbots using python3.10 are running into errors from this change.
---
 .../test/tools/lldb-dap/dap_server.py         | 875 +++++++-----------
 .../test/tools/lldb-dap/lldbdap_testcase.py   |  79 +-
 .../breakpoint/TestDAP_setBreakpoints.py      |   5 +-
 .../tools/lldb-dap/cancel/TestDAP_cancel.py   |  10 +-
 .../tools/lldb-dap/launch/TestDAP_launch.py   |  12 +-
 .../tools/lldb-dap/module/TestDAP_module.py   |   2 +-
 .../tools/lldb-dap/output/TestDAP_output.py   |   4 +-
 7 files changed, 397 insertions(+), 590 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index 23178a215206e..6d32491eaa5e9 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -10,124 +10,17 @@
 import subprocess
 import signal
 import sys
-from dataclasses import dataclass
 import threading
 import time
-from typing import (
-    IO,
-    Any,
-    Callable,
-    Dict,
-    List,
-    Optional,
-    Tuple,
-    TypeVar,
-    Generic,
-    TypedDict,
-    Union,
-    BinaryIO,
-    TextIO,
-    Literal,
-    cast,
-)
+from typing import Any, Optional, Union, BinaryIO, TextIO
 
 ## DAP type references
-
-T = TypeVar("T")
-Te = TypeVar("Te")  # Generic type for event body
-Ta = TypeVar("Ta")  # Generic type for request arguments
-Tb = TypeVar("Tb")  # Generic type for response body
-
-
-class Event(Generic[Te], TypedDict):
-    type: Literal["event"]
-    seq: int
-    event: str
-    body: Optional[Te]
-
-
-class Request(Generic[Ta], TypedDict, total=False):
-    type: Literal["request"]
-    seq: int
-    command: str
-    arguments: Ta
-
-
-class Response(Generic[Tb], TypedDict):
-    type: Literal["response"]
-    seq: int
-    request_seq: int
-    success: bool
-    command: str
-    message: Optional[str]
-    body: Optional[Tb]
-
-
+Event = dict[str, Any]
+Request = dict[str, Any]
+Response = dict[str, Any]
 ProtocolMessage = Union[Event, Request, Response]
 
 
-class AttachOrLaunchArguments(TypedDict, total=False):
-    stopOnEntry: bool
-    disableASLR: bool
-    disableSTDIO: bool
-    enableAutoVariableSummaries: bool
-    displayExtendedBacktrace: bool
-    enableSyntheticChildDebugging: bool
-    initCommands: List[str]
-    preRunCommands: List[str]
-    postRunCommands: List[str]
-    stopCommands: List[str]
-    exitCommands: List[str]
-    terminateCommands: List[str]
-    sourceMap: Union[List[Tuple[str, str]], Dict[str, str]]
-    sourcePath: str
-    debuggerRoot: str
-    commandEscapePrefix: str
-    customFrameFormat: str
-    customThreadFormat: str
-
-
-class LaunchArguments(AttachOrLaunchArguments, total=False):
-    program: str
-    args: List[str]
-    cwd: str
-    env: Dict[str, str]
-    shellExpandArguments: bool
-    runInTerminal: bool
-    launchCommands: List[str]
-
-
-# Using the function form of TypedDict to allow for hyphenated keys.
-AttachGdbServer = TypedDict(
-    "AttachGdbServer", {"gdb-remote-port": int, "gdb-remote-hostname": str}, total=False
-)
-
-
-class AttachArguments(AttachGdbServer, AttachOrLaunchArguments, total=False):
-    program: str
-    pid: int
-    waitFor: bool
-    attachCommands: List[str]
-    coreFile: str
-
-
-class BreakpointData(TypedDict, total=False):
-    column: int
-    condition: str
-    hitCondition: str
-    logMessage: str
-    mode: str
-
-
-class SourceBreakpoint(BreakpointData):
-    line: int
-
-
-class Breakpoint(TypedDict, total=False):
-    id: int
-    verified: bool
-
-
 def dump_memory(base_addr, data, num_per_line, outfile):
     data_len = len(data)
     hex_string = binascii.hexlify(data)
@@ -165,9 +58,7 @@ def dump_memory(base_addr, data, num_per_line, outfile):
         outfile.write("\n")
 
 
-def read_packet(
-    f: IO[bytes], trace_file: Optional[IO[str]] = None
-) -> Optional[ProtocolMessage]:
+def read_packet(f, verbose=False, trace_file=None):
     """Decode a JSON packet that starts with the content length and is
     followed by the JSON bytes from a file 'f'. Returns None on EOF.
     """
@@ -179,20 +70,32 @@ def read_packet(
     prefix = "Content-Length: "
     if line.startswith(prefix):
         # Decode length of JSON bytes
+        if verbose:
+            print('content: "%s"' % (line))
         length = int(line[len(prefix) :])
+        if verbose:
+            print('length: "%u"' % (length))
         # Skip empty line
-        line = f.readline().decode()
+        line = f.readline()
+        if verbose:
+            print('empty: "%s"' % (line))
         # Read JSON bytes
         json_str = f.read(length)
+        if verbose:
+            print('json: "%s"' % (json_str))
         if trace_file:
-            trace_file.write(f"from adapter:\n{json_str!r}\n")
+            trace_file.write("from adapter:\n%s\n" % (json_str))
         # Decode the JSON bytes into a python dictionary
         return json.loads(json_str)
 
     raise Exception("unexpected malformed message from lldb-dap: " + line)
 
 
-def dump_dap_log(log_file: Optional[str]) -> None:
+def packet_type_is(packet, packet_type):
+    return "type" in packet and packet["type"] == packet_type
+
+
+def dump_dap_log(log_file):
     print("========= DEBUG ADAPTER PROTOCOL LOGS =========", file=sys.stderr)
     if log_file is None:
         print("no log file available", file=sys.stderr)
@@ -202,30 +105,34 @@ def dump_dap_log(log_file: Optional[str]) -> None:
     print("========= END =========", file=sys.stderr)
 
 
-@dataclass
-class Source:
-    path: Optional[str]
-    source_reference: Optional[int]
-
-    @property
-    def name(self) -> Optional[str]:
-        if not self.path:
-            return None
-        return os.path.basename(self.path)
-
+class Source(object):
     def __init__(
         self, path: Optional[str] = None, source_reference: Optional[int] = None
     ):
-        if path is None and source_reference is None:
+        self._name = None
+        self._path = None
+        self._source_reference = None
+
+        if path is not None:
+            self._name = os.path.basename(path)
+            self._path = path
+        elif source_reference is not None:
+            self._source_reference = source_reference
+        else:
             raise ValueError("Either path or source_reference must be provided")
 
-        self.path = path
-        self.source_reference = source_reference
+    def __str__(self):
+        return f"Source(name={self.name}, path={self.path}), source_reference={self.source_reference})"
 
-    def to_DAP(self) -> dict:
-        if self.path:
-            return {"path": self.path, "name": self.name}
-        return {"sourceReference": self.source_reference}
+    def as_dict(self):
+        source_dict = {}
+        if self._name is not None:
+            source_dict["name"] = self._name
+        if self._path is not None:
+            source_dict["path"] = self._path
+        if self._source_reference is not None:
+            source_dict["sourceReference"] = self._source_reference
+        return source_dict
 
 
 class NotSupportedError(KeyError):
@@ -237,7 +144,7 @@ def __init__(
         self,
         recv: BinaryIO,
         send: BinaryIO,
-        init_commands: List[str],
+        init_commands: list[str],
         log_file: Optional[TextIO] = None,
     ):
         # For debugging test failures, try setting `trace_file = sys.stderr`.
@@ -245,50 +152,35 @@ def __init__(
         self.log_file = log_file
         self.send = send
         self.recv = recv
-        # Packets that have been received and processed but have not yet been
-        # requested by a test case.
-        self._pending_packets: List[Optional[ProtocolMessage]] = []
-        # Received packets that have not yet been processed.
-        self._recv_packets: List[Optional[ProtocolMessage]] = []
-        # Used as a mutex for _recv_packets and for notify when _recv_packets
-        # changes.
-        self._recv_condition = threading.Condition()
-        self._recv_thread = threading.Thread(target=self._read_packet_thread)
-
-        # session state
-        self.init_commands = init_commands
+        self.recv_packets: list[Optional[ProtocolMessage]] = []
+        self.recv_condition = threading.Condition()
+        self.recv_thread = threading.Thread(target=self._read_packet_thread)
+        self.process_event_body = None
         self.exit_status: Optional[int] = None
-        self.capabilities: Optional[Dict] = None
-        self.initialized: bool = False
-        self.configuration_done_sent: bool = False
-        self.process_event_body: Optional[Dict] = None
-        self.terminated: bool = False
-        self.events: List[Event] = []
-        self.progress_events: List[Event] = []
-        self.reverse_requests: List[Request] = []
-        self.module_events: List[Dict] = []
-        self.sequence: int = 1
-        self.output: Dict[str, str] = {}
-
-        # debuggee state
-        self.threads: Optional[dict] = None
-        self.thread_stop_reasons: Dict[str, Any] = {}
-        self.frame_scopes: Dict[str, Any] = {}
-        # keyed by breakpoint id
-        self.resolved_breakpoints: Dict[str, bool] = {}
-
-        # trigger enqueue thread
-        self._recv_thread.start()
+        self.capabilities: dict[str, Any] = {}
+        self.progress_events: list[Event] = []
+        self.reverse_requests = []
+        self.sequence = 1
+        self.threads = None
+        self.thread_stop_reasons = {}
+        self.recv_thread.start()
+        self.output_condition = threading.Condition()
+        self.output: dict[str, list[str]] = {}
+        self.configuration_done_sent = False
+        self.initialized = False
+        self.frame_scopes = {}
+        self.init_commands = init_commands
+        self.resolved_breakpoints = {}
 
     @classmethod
     def encode_content(cls, s: str) -> bytes:
         return ("Content-Length: %u\r\n\r\n%s" % (len(s), s)).encode("utf-8")
 
     @classmethod
-    def validate_response(cls, request: Request, response: Response) -> None:
-        if request["command"] != response["command"]:
+    def validate_response(cls, command, response):
+        if command["command"] != response["command"]:
             raise ValueError("command mismatch in response")
-        if request["seq"] != response["request_seq"]:
+        if command["seq"] != response["request_seq"]:
             raise ValueError("seq mismatch in response")
 
     def _read_packet_thread(self):
@@ -297,323 +189,262 @@ def _read_packet_thread(self):
             while not done:
                 packet = read_packet(self.recv, trace_file=self.trace_file)
                 # `packet` will be `None` on EOF. We want to pass it down to
-                # handle_recv_packet anyway so the main thread can handle
-                # unexpected termination of lldb-dap and stop waiting for new
-                # packets.
+                # handle_recv_packet anyway so the main thread can handle unexpected
+                # termination of lldb-dap and stop waiting for new packets.
                 done = not self._handle_recv_packet(packet)
         finally:
             dump_dap_log(self.log_file)
 
-    def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool:
-        """Handles an incoming packet.
+    def get_modules(self):
+        module_list = self.request_modules()["body"]["modules"]
+        modules = {}
+        for module in module_list:
+            modules[module["name"]] = module
+        return modules
 
-        Called by the read thread that is waiting for all incoming packets
-        to store the incoming packet in "self._recv_packets" in a thread safe
-        way. This function will then signal the "self._recv_condition" to
-        indicate a new packet is available.
+    def get_output(self, category, timeout=0.0, clear=True):
+        self.output_condition.acquire()
+        output = None
+        if category in self.output:
+            output = self.output[category]
+            if clear:
+                del self.output[category]
+        elif timeout != 0.0:
+            self.output_condition.wait(timeout)
+            if category in self.output:
+                output = self.output[category]
+                if clear:
+                    del self.output[category]
+        self.output_condition.release()
+        return output
 
-        Args:
-            packet: A new packet to store.
+    def collect_output(self, category, timeout_secs, pattern, clear=True):
+        end_time = time.time() + timeout_secs
+        collected_output = ""
+        while end_time > time.time():
+            output = self.get_output(category, timeout=0.25, clear=clear)
+            if output:
+                collected_output += output
+                if pattern is not None and pattern in output:
+                    break
+        return collected_output if collected_output else None
+
+    def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]):
+        self.recv_condition.acquire()
+        self.recv_packets.append(packet)
+        self.recv_condition.notify()
+        self.recv_condition.release()
 
-        Returns:
-            True if the caller should keep calling this function for more
-            packets.
-        """
-        with self._recv_condition:
-            self._recv_packets.append(packet)
-            self._recv_condition.notify()
-            # packet is None on EOF
-            return packet is not None and not (
-                packet["type"] == "response" and packet["command"] == "disconnect"
-            )
-
-    def _recv_packet(
-        self,
-        *,
-        predicate: Optional[Callable[[ProtocolMessage], bool]] = None,
-        timeout: Optional[float] = None,
-    ) -> Optional[ProtocolMessage]:
-        """Processes received packets from the adapter.
-
-        Updates the DebugCommunication stateful properties based on the received
-        packets in the order they are received.
-
-        NOTE: The only time the session state properties should be updated is
-        during this call to ensure consistency during tests.
-
-        Args:
-            predicate:
-                Optional, if specified, returns the first packet that matches
-                the given predicate.
-            timeout:
-                Optional, if specified, processes packets until either the
-                timeout occurs or the predicate matches a packet, whichever
-                occurs first.
-
-        Returns:
-            The first matching packet for the given predicate, if specified,
-            otherwise None.
+    def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool:
+        """Called by the read thread that is waiting for all incoming packets
+        to store the incoming packet in "self.recv_packets" in a thread safe
+        way. This function will then signal the "self.recv_condition" to
+        indicate a new packet is available. Returns True if the caller
+        should keep calling this function for more packets.
         """
-        assert (
-            threading.current_thread != self._recv_thread
-        ), "Must not be called from the _recv_thread"
-
-        def process_until_match():
-            self._process_recv_packets()
-            for i, packet in enumerate(self._pending_packets):
-                if packet is None:
-                    # We need to return a truthy value to break out of the
-                    # wait_for, use `EOFError` as an indicator of EOF.
-                    return EOFError()
-                if predicate and predicate(packet):
-                    self._pending_packets.pop(i)
-                    return packet
-
-        with self._recv_condition:
-            packet = self._recv_condition.wait_for(process_until_match, timeout)
-            return None if isinstance(packet, EOFError) else packet
-
-    def _process_recv_packets(self) -> None:
-        """Process received packets, updating the session state."""
-        with self._recv_condition:
-            for packet in self._recv_packets:
-                # Handle events that may modify any stateful properties of
-                # the DAP session.
-                if packet and packet["type"] == "event":
-                    self._handle_event(packet)
-                elif packet and packet["type"] == "request":
-                    # Handle reverse requests and keep processing.
-                    self._handle_reverse_request(packet)
-                # Move the packet to the pending queue.
-                self._pending_packets.append(packet)
-            self._recv_packets.clear()
-
-    def _handle_event(self, packet: Event) -> None:
-        """Handle any events that modify debug session state we track."""
-        event = packet["event"]
-        body: Optional[Dict] = packet.get("body", None)
-
-        if event == "output" and body:
-            # Store any output we receive so clients can retrieve it later.
-            category = body["category"]
-            output = body["output"]
-            if category in self.output:
-                self.output[category] += output
-            else:
-                self.output[category] = output
-        elif event == "initialized":
-            self.initialized = True
-        elif event == "process":
-            # When a new process is attached or launched, remember the
-            # details that are available in the body of the event
-            self.process_event_body = body
-        elif event == "exited" and body:
-            # Process exited, mark the status to indicate the process is not
-            # alive.
-            self.exit_status = body["exitCode"]
-        elif event == "continued" and body:
-            # When the process continues, clear the known threads and
-            # thread_stop_reasons.
-            all_threads_continued = body.get("allThreadsContinued", True)
-            tid = body["threadId"]
-            if tid in self.thread_stop_reasons:
-                del self.thread_stop_reasons[tid]
-            self._process_continued(all_threads_continued)
-        elif event == "stopped" and body:
-            # Each thread that stops with a reason will send a
-            # 'stopped' event. We need to remember the thread stop
-            # reasons since the 'threads' command doesn't return
-            # that information.
-            self._process_stopped()
-            tid = body["threadId"]
-            self.thread_stop_reasons[tid] = body
-        elif event.startswith("progress"):
-            # Progress events come in as 'progressStart', 'progressUpdate',
-            # and 'progressEnd' events. Keep these around in case test
-            # cases want to verify them.
-            self.progress_events.append(packet)
-        elif event == "breakpoint" and body:
-            # Breakpoint events are sent when a breakpoint is resolved
-            self._update_verified_breakpoints([body["breakpoint"]])
-        elif event == "capabilities" and body:
-            if self.capabilities is None:
-                self.capabilities = {}
-            # Update the capabilities with new ones from the event.
-            self.capabilities.update(body["capabilities"])
-
-    def _handle_reverse_request(self, request: Request) -> None:
-        if request in self.reverse_requests:
-            return
-        self.reverse_requests.append(request)
-        arguments = request.get("arguments")
-        if request["command"] == "runInTerminal" and arguments is not None:
-            in_shell = arguments.get("argsCanBeInterpretedByShell", False)
-            proc = subprocess.Popen(
-                arguments["args"],
-                env=arguments.get("env", {}),
-                cwd=arguments["cwd"],
-                stdin=subprocess.DEVNULL,
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-                shell=in_shell,
-            )
-            body = {}
-            if in_shell:
-                body["shellProcessId"] = proc.pid
-            else:
-                body["processId"] = proc.pid
-            self.send_packet(
-                {
-                    "type": "response",
-                    "seq": 0,
-                    "request_seq": request["seq"],
-                    "success": True,
-                    "command": "runInTerminal",
-                    "message": None,
-                    "body": body,
-                }
-            )
-        elif request["command"] == "startDebugging":
-            self.send_packet(
-                {
-                    "type": "response",
-                    "seq": 0,
-                    "request_seq": request["seq"],
-                    "success": True,
-                    "message": None,
-                    "command": "startDebugging",
-                    "body": {},
-                }
-            )
-        else:
-            desc = 'unknown reverse request "%s"' % (request["command"])
-            raise ValueError(desc)
+        # If EOF, notify the read thread by enqueuing a None.
+        if not packet:
+            self._enqueue_recv_packet(None)
+            return False
+
+        # Check the packet to see if is an event packet
+        keepGoing = True
+        packet_type = packet["type"]
+        if packet_type == "event":
+            event = packet["event"]
+            body = None
+            if "body" in packet:
+                body = packet["body"]
+            # Handle the event packet and cache information from these packets
+            # as they come in
+            if event == "output":
+                # Store any output we receive so clients can retrieve it later.
+                category = body["category"]
+                output = body["output"]
+                self.output_condition.acquire()
+                if category in self.output:
+                    self.output[category] += output
+                else:
+                    self.output[category] = output
+                self.output_condition.notify()
+                self.output_condition.release()
+                # no need to add 'output' event packets to our packets list
+                return keepGoing
+            elif event == "initialized":
+                self.initialized = True
+            elif event == "process":
+                # When a new process is attached or launched, remember the
+                # details that are available in the body of the event
+                self.process_event_body = body
+            elif event == "exited":
+                # Process exited, mark the status to indicate the process is not
+                # alive.
+                self.exit_status = body["exitCode"]
+            elif event == "continued":
+                # When the process continues, clear the known threads and
+                # thread_stop_reasons.
+                all_threads_continued = body.get("allThreadsContinued", True)
+                tid = body["threadId"]
+                if tid in self.thread_stop_reasons:
+                    del self.thread_stop_reasons[tid]
+                self._process_continued(all_threads_continued)
+            elif event == "stopped":
+                # Each thread that stops with a reason will send a
+                # 'stopped' event. We need to remember the thread stop
+                # reasons since the 'threads' command doesn't return
+                # that information.
+                self._process_stopped()
+                tid = body["threadId"]
+                self.thread_stop_reasons[tid] = body
+            elif event.startswith("progress"):
+                # Progress events come in as 'progressStart', 'progressUpdate',
+                # and 'progressEnd' events. Keep these around in case test
+                # cases want to verify them.
+                self.progress_events.append(packet)
+            elif event == "breakpoint":
+                # Breakpoint events are sent when a breakpoint is resolved
+                self._update_verified_breakpoints([body["breakpoint"]])
+            elif event == "capabilities":
+                # Update the capabilities with new ones from the event.
+                self.capabilities.update(body["capabilities"])
+
+        elif packet_type == "response":
+            if packet["command"] == "disconnect":
+                keepGoing = False
+        self._enqueue_recv_packet(packet)
+        return keepGoing
 
     def _process_continued(self, all_threads_continued: bool):
         self.frame_scopes = {}
         if all_threads_continued:
             self.thread_stop_reasons = {}
 
-    def _update_verified_breakpoints(self, breakpoints: list[Breakpoint]):
-        for bp in breakpoints:
-            # If no id is set, we cannot correlate the given breakpoint across
-            # requests, ignore it.
-            if "id" not in bp:
-                continue
-
-            self.resolved_breakpoints[str(bp["id"])] = bp.get("verified", False)
-
-    def _send_recv(self, request: Request[Ta]) -> Optional[Response[Tb]]:
-        """Send a command python dictionary as JSON and receive the JSON
-        response. Validates that the response is the correct sequence and
-        command in the reply. Any events that are received are added to the
-        events list in this object"""
-        seq = self.send_packet(request)
-        response = self.receive_response(seq)
-        if response is None:
-            raise ValueError(f"no response for {request!r}")
-        self.validate_response(request, response)
-        return response
+    def _update_verified_breakpoints(self, breakpoints: list[Event]):
+        for breakpoint in breakpoints:
+            if "id" in breakpoint:
+                self.resolved_breakpoints[str(breakpoint["id"])] = breakpoint.get(
+                    "verified", False
+                )
 
-    def send_packet(self, packet: ProtocolMessage) -> int:
+    def send_packet(self, command_dict: Request, set_sequence=True):
         """Take the "command_dict" python dictionary and encode it as a JSON
         string and send the contents as a packet to the VSCode debug
-        adapter.
-
-        Returns the seq of the packet."""
-        # Set the seq for requests.
-        if packet["type"] == "request":
-            packet["seq"] = self.sequence
+        adapter"""
+        # Set the sequence ID for this command automatically
+        if set_sequence:
+            command_dict["seq"] = self.sequence
             self.sequence += 1
-        else:
-            packet["seq"] = 0
-
         # Encode our command dictionary as a JSON string
-        json_str = json.dumps(packet, separators=(",", ":"))
-
+        json_str = json.dumps(command_dict, separators=(",", ":"))
         if self.trace_file:
             self.trace_file.write("to adapter:\n%s\n" % (json_str))
-
         length = len(json_str)
         if length > 0:
             # Send the encoded JSON packet and flush the 'send' file
             self.send.write(self.encode_content(json_str))
             self.send.flush()
 
-        return packet["seq"]
-
-    def receive_response(self, seq: int) -> Optional[Response]:
-        """Waits for the a response with the associated request_sec."""
-
-        def predicate(p: ProtocolMessage):
-            return p["type"] == "response" and p["request_seq"] == seq
-
-        return cast(Optional[Response], self._recv_packet(predicate=predicate))
-
-    def get_modules(self):
-        modules = {}
-        resp = self.request_modules()
-        if resp["success"]:
-            module_list = resp["body"]["modules"]
-            for module in module_list:
-                modules[module["name"]] = module
-        else:
-            raise ValueError(f"request_modules failed: {resp!r}")
-        return modules
-
-    def get_output(self, category: str, clear=True) -> str:
-        output = ""
-        if category in self.output:
-            output = self.output.get(category, "")
-            if clear:
-                del self.output[category]
-        return output
-
-    def collect_output(
+    def recv_packet(
         self,
-        category: str,
-        timeout_secs: float,
-        pattern: Optional[str] = None,
-        clear=True,
-    ) -> str:
-        """Collect output from 'output' events.
-
-        Args:
-            category: The category to collect.
-            timeout_secs: The max duration for collecting output.
-            pattern:
-                Optional, if set, return once this pattern is detected in the
-                collected output.
-
-        Returns:
-            The collected output.
-        """
-        deadline = time.monotonic() + timeout_secs
-        output = self.get_output(category, clear)
-        while deadline >= time.monotonic() and (
-            pattern is None or pattern not in output
-        ):
-            event = self.wait_for_event(["output"], timeout=deadline - time.monotonic())
-            if not event:  # Timeout or EOF
-                break
-            output += self.get_output(category, clear=clear)
-        return output
+        filter_type: Optional[str] = None,
+        filter_event: Optional[Union[str, list[str]]] = None,
+        timeout: Optional[float] = None,
+    ) -> Optional[ProtocolMessage]:
+        """Get a JSON packet from the VSCode debug adapter. This function
+        assumes a thread that reads packets is running and will deliver
+        any received packets by calling handle_recv_packet(...). This
+        function will wait for the packet to arrive and return it when
+        it does."""
+        while True:
+            try:
+                self.recv_condition.acquire()
+                packet = None
+                while True:
+                    for i, curr_packet in enumerate(self.recv_packets):
+                        if not curr_packet:
+                            raise EOFError
+                        packet_type = curr_packet["type"]
+                        if filter_type is None or packet_type in filter_type:
+                            if filter_event is None or (
+                                packet_type == "event"
+                                and curr_packet["event"] in filter_event
+                            ):
+                                packet = self.recv_packets.pop(i)
+                                break
+                    if packet:
+                        break
+                    # Sleep until packet is received
+                    len_before = len(self.recv_packets)
+                    self.recv_condition.wait(timeout)
+                    len_after = len(self.recv_packets)
+                    if len_before == len_after:
+                        return None  # Timed out
+                return packet
+            except EOFError:
+                return None
+            finally:
+                self.recv_condition.release()
+
+    def send_recv(self, command):
+        """Send a command python dictionary as JSON and receive the JSON
+        response. Validates that the response is the correct sequence and
+        command in the reply. Any events that are received are added to the
+        events list in this object"""
+        self.send_packet(command)
+        done = False
+        while not done:
+            response_or_request = self.recv_packet(filter_type=["response", "request"])
+            if response_or_request is None:
+                desc = 'no response for "%s"' % (command["command"])
+                raise ValueError(desc)
+            if response_or_request["type"] == "response":
+                self.validate_response(command, response_or_request)
+                return response_or_request
+            else:
+                self.reverse_requests.append(response_or_request)
+                if response_or_request["command"] == "runInTerminal":
+                    subprocess.Popen(
+                        response_or_request["arguments"]["args"],
+                        env=response_or_request["arguments"]["env"],
+                    )
+                    self.send_packet(
+                        {
+                            "type": "response",
+                            "request_seq": response_or_request["seq"],
+                            "success": True,
+                            "command": "runInTerminal",
+                            "body": {},
+                        },
+                    )
+                elif response_or_request["command"] == "startDebugging":
+                    self.send_packet(
+                        {
+                            "type": "response",
+                            "request_seq": response_or_request["seq"],
+                            "success": True,
+                            "command": "startDebugging",
+                            "body": {},
+                        },
+                    )
+                else:
+                    desc = 'unknown reverse request "%s"' % (
+                        response_or_request["command"]
+                    )
+                    raise ValueError(desc)
+
+        return None
 
     def wait_for_event(
-        self, filter: List[str] = [], timeout: Optional[float] = None
+        self, filter: Union[str, list[str]], timeout: Optional[float] = None
     ) -> Optional[Event]:
         """Wait for the first event that matches the filter."""
-
-        def predicate(p: ProtocolMessage):
-            return p["type"] == "event" and p["event"] in filter
-
-        return cast(
-            Optional[Event], self._recv_packet(predicate=predicate, timeout=timeout)
+        return self.recv_packet(
+            filter_type="event", filter_event=filter, timeout=timeout
         )
 
     def wait_for_stopped(
         self, timeout: Optional[float] = None
-    ) -> Optional[List[Event]]:
+    ) -> Optional[list[Event]]:
         stopped_events = []
         stopped_event = self.wait_for_event(
             filter=["stopped", "exited"], timeout=timeout
@@ -632,9 +463,9 @@ def wait_for_stopped(
         return stopped_events
 
     def wait_for_breakpoint_events(self, timeout: Optional[float] = None):
-        breakpoint_events: List[Event] = []
+        breakpoint_events: list[Event] = []
         while True:
-            event = self.wait_for_event(["breakpoint"], timeout=timeout)
+            event = self.wait_for_event("breakpoint", timeout=timeout)
             if not event:
                 break
             breakpoint_events.append(event)
@@ -645,26 +476,20 @@ def wait_for_breakpoints_to_be_verified(
     ):
         """Wait for all breakpoints to be verified. Return all unverified breakpoints."""
         while any(id not in self.resolved_breakpoints for id in breakpoint_ids):
-            breakpoint_event = self.wait_for_event(["breakpoint"], timeout=timeout)
+            breakpoint_event = self.wait_for_event("breakpoint", timeout=timeout)
             if breakpoint_event is None:
                 break
 
-        return [
-            id
-            for id in breakpoint_ids
-            if id not in self.resolved_breakpoints and not self.resolved_breakpoints[id]
-        ]
+        return [id for id in breakpoint_ids if id not in self.resolved_breakpoints]
 
     def wait_for_exited(self, timeout: Optional[float] = None):
-        event_dict = self.wait_for_event(["exited"], timeout=timeout)
+        event_dict = self.wait_for_event("exited", timeout=timeout)
         if event_dict is None:
             raise ValueError("didn't get exited event")
         return event_dict
 
     def wait_for_terminated(self, timeout: Optional[float] = None):
-        if self.terminated:
-            raise ValueError("already terminated")
-        event_dict = self.wait_for_event(["terminated"], timeout)
+        event_dict = self.wait_for_event("terminated", timeout)
         if event_dict is None:
             raise ValueError("didn't get terminated event")
         return event_dict
@@ -699,10 +524,12 @@ def get_stackFrame(self, frameIndex=0, threadId=None):
         if threadId is None:
             threadId = self.get_thread_id()
         if threadId is None:
+            print("invalid threadId")
             return None
         response = self.request_stackTrace(threadId, startFrame=frameIndex, levels=1)
         if response:
             return response["body"]["stackFrames"][0]
+        print("invalid response")
         return None
 
     def get_completions(self, text, frameId=None):
@@ -840,7 +667,7 @@ def request_attach(
         gdbRemotePort: Optional[int] = None,
         gdbRemoteHostname: Optional[str] = None,
     ):
-        args_dict: AttachArguments = {}
+        args_dict = {}
         if pid is not None:
             args_dict["pid"] = pid
         if program is not None:
@@ -872,12 +699,8 @@ def request_attach(
             args_dict["gdb-remote-port"] = gdbRemotePort
         if gdbRemoteHostname is not None:
             args_dict["gdb-remote-hostname"] = gdbRemoteHostname
-        command_dict: Request = {
-            "command": "attach",
-            "type": "request",
-            "arguments": args_dict,
-        }
-        return self._send_recv(command_dict)
+        command_dict = {"command": "attach", "type": "request", "arguments": args_dict}
+        return self.send_recv(command_dict)
 
     def request_breakpointLocations(
         self, file_path, line, end_line=None, column=None, end_column=None
@@ -899,7 +722,7 @@ def request_breakpointLocations(
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_configurationDone(self):
         command_dict = {
@@ -907,7 +730,7 @@ def request_configurationDone(self):
             "type": "request",
             "arguments": {},
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if response:
             self.configuration_done_sent = True
             self.request_threads()
@@ -936,7 +759,7 @@ def request_continue(self, threadId=None, singleThread=False):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if response["success"]:
             self._process_continued(response["body"]["allThreadsContinued"])
         # Caller must still call wait_for_stopped.
@@ -953,7 +776,7 @@ def request_restart(self, restartArguments=None):
         if restartArguments:
             command_dict["arguments"] = restartArguments
 
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         # Caller must still call wait_for_stopped.
         return response
 
@@ -969,7 +792,7 @@ def request_disconnect(self, terminateDebuggee=None):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_disassemble(
         self,
@@ -989,7 +812,7 @@ def request_disassemble(
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)["body"]["instructions"]
+        return self.send_recv(command_dict)["body"]["instructions"]
 
     def request_readMemory(self, memoryReference, offset, count):
         args_dict = {
@@ -1002,7 +825,7 @@ def request_readMemory(self, memoryReference, offset, count):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None):
         stackFrame = self.get_stackFrame(frameIndex=frameIndex, threadId=threadId)
@@ -1018,7 +841,7 @@ def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_exceptionInfo(self, threadId=None):
         if threadId is None:
@@ -1029,7 +852,7 @@ def request_exceptionInfo(self, threadId=None):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_initialize(self, sourceInitFile=False):
         command_dict = {
@@ -1050,7 +873,7 @@ def request_initialize(self, sourceInitFile=False):
                 "$__lldb_sourceInitFile": sourceInitFile,
             },
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if response:
             if "body" in response:
                 self.capabilities = response["body"]
@@ -1085,7 +908,7 @@ def request_launch(
         customFrameFormat: Optional[str] = None,
         customThreadFormat: Optional[str] = None,
     ):
-        args_dict: LaunchArguments = {"program": program}
+        args_dict = {"program": program}
         if args:
             args_dict["args"] = args
         if cwd:
@@ -1132,19 +955,15 @@ def request_launch(
         args_dict["displayExtendedBacktrace"] = displayExtendedBacktrace
         if commandEscapePrefix is not None:
             args_dict["commandEscapePrefix"] = commandEscapePrefix
-        command_dict: Request = {
-            "command": "launch",
-            "type": "request",
-            "arguments": args_dict,
-        }
-        return self._send_recv(command_dict)
+        command_dict = {"command": "launch", "type": "request", "arguments": args_dict}
+        return self.send_recv(command_dict)
 
     def request_next(self, threadId, granularity="statement"):
         if self.exit_status is not None:
             raise ValueError("request_continue called after process exited")
         args_dict = {"threadId": threadId, "granularity": granularity}
         command_dict = {"command": "next", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_stepIn(self, threadId, targetId, granularity="statement"):
         if self.exit_status is not None:
@@ -1157,7 +976,7 @@ def request_stepIn(self, threadId, targetId, granularity="statement"):
             "granularity": granularity,
         }
         command_dict = {"command": "stepIn", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_stepInTargets(self, frameId):
         if self.exit_status is not None:
@@ -1169,14 +988,14 @@ def request_stepInTargets(self, frameId):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_stepOut(self, threadId):
         if self.exit_status is not None:
             raise ValueError("request_stepOut called after process exited")
         args_dict = {"threadId": threadId}
         command_dict = {"command": "stepOut", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_pause(self, threadId=None):
         if self.exit_status is not None:
@@ -1185,47 +1004,49 @@ def request_pause(self, threadId=None):
             threadId = self.get_thread_id()
         args_dict = {"threadId": threadId}
         command_dict = {"command": "pause", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_scopes(self, frameId):
         args_dict = {"frameId": frameId}
         command_dict = {"command": "scopes", "type": "request", "arguments": args_dict}
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
-    def request_setBreakpoints(
-        self,
-        source: Union[Source, str],
-        line_array: Optional[List[int]],
-        data: Optional[List[BreakpointData]] = None,
-    ):
+    def request_setBreakpoints(self, source: Source, line_array, data=None):
         """data is array of parameters for breakpoints in line_array.
         Each parameter object is 1:1 mapping with entries in line_entry.
         It contains optional location/hitCondition/logMessage parameters.
         """
-        if isinstance(source, str):
-            source = Source(path=source)
         args_dict = {
-            "source": source.to_DAP(),
+            "source": source.as_dict(),
             "sourceModified": False,
         }
-        if line_array:
+        if line_array is not None:
             args_dict["lines"] = line_array
             breakpoints = []
             for i, line in enumerate(line_array):
-                breakpoint_data: BreakpointData = {}
+                breakpoint_data = None
                 if data is not None and i < len(data):
                     breakpoint_data = data[i]
-                bp: SourceBreakpoint = {"line": line, **breakpoint_data}
+                bp = {"line": line}
+                if breakpoint_data is not None:
+                    if breakpoint_data.get("condition"):
+                        bp["condition"] = breakpoint_data["condition"]
+                    if breakpoint_data.get("hitCondition"):
+                        bp["hitCondition"] = breakpoint_data["hitCondition"]
+                    if breakpoint_data.get("logMessage"):
+                        bp["logMessage"] = breakpoint_data["logMessage"]
+                    if breakpoint_data.get("column"):
+                        bp["column"] = breakpoint_data["column"]
                 breakpoints.append(bp)
             args_dict["breakpoints"] = breakpoints
 
-        command_dict: Request = {
+        command_dict = {
             "command": "setBreakpoints",
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
-        if response and response["success"] and response["body"]:
+        response = self.send_recv(command_dict)
+        if response["success"]:
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
 
@@ -1240,7 +1061,7 @@ def request_setExceptionBreakpoints(
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_setFunctionBreakpoints(self, names, condition=None, hitCondition=None):
         breakpoints = []
@@ -1257,7 +1078,7 @@ def request_setFunctionBreakpoints(self, names, condition=None, hitCondition=Non
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if response["success"]:
             self._update_verified_breakpoints(response["body"]["breakpoints"])
         return response
@@ -1278,7 +1099,7 @@ def request_dataBreakpointInfo(
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_setDataBreakpoint(self, dataBreakpoints):
         """dataBreakpoints is a list of dictionary with following fields:
@@ -1295,7 +1116,7 @@ def request_setDataBreakpoint(self, dataBreakpoints):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_compileUnits(self, moduleId):
         args_dict = {"moduleId": moduleId}
@@ -1304,7 +1125,7 @@ def request_compileUnits(self, moduleId):
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         return response
 
     def request_completions(self, text, frameId=None):
@@ -1316,10 +1137,10 @@ def request_completions(self, text, frameId=None):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_modules(self):
-        return self._send_recv({"command": "modules", "type": "request"})
+        return self.send_recv({"command": "modules", "type": "request"})
 
     def request_stackTrace(
         self, threadId=None, startFrame=None, levels=None, format=None, dump=False
@@ -1338,7 +1159,7 @@ def request_stackTrace(
             "type": "request",
             "arguments": args_dict,
         }
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if dump:
             for idx, frame in enumerate(response["body"]["stackFrames"]):
                 name = frame["name"]
@@ -1364,7 +1185,7 @@ def request_source(self, sourceReference):
                 "sourceReference": sourceReference,
             },
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_threads(self):
         """Request a list of all threads and combine any information from any
@@ -1372,7 +1193,7 @@ def request_threads(self):
         thread actually stopped. Returns an array of thread dictionaries
         with information about all threads"""
         command_dict = {"command": "threads", "type": "request", "arguments": {}}
-        response = self._send_recv(command_dict)
+        response = self.send_recv(command_dict)
         if not response["success"]:
             self.threads = None
             return response
@@ -1412,7 +1233,7 @@ def request_variables(
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_setVariable(self, containingVarRef, name, value, id=None):
         args_dict = {
@@ -1427,7 +1248,7 @@ def request_setVariable(self, containingVarRef, name, value, id=None):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_locations(self, locationReference):
         args_dict = {
@@ -1438,7 +1259,7 @@ def request_locations(self, locationReference):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def request_testGetTargetBreakpoints(self):
         """A request packet used in the LLDB test suite to get all currently
@@ -1450,12 +1271,12 @@ def request_testGetTargetBreakpoints(self):
             "type": "request",
             "arguments": {},
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
     def terminate(self):
         self.send.close()
-        if self._recv_thread.is_alive():
-            self._recv_thread.join()
+        if self.recv_thread.is_alive():
+            self.recv_thread.join()
 
     def request_setInstructionBreakpoints(self, memory_reference=[]):
         breakpoints = []
@@ -1470,7 +1291,7 @@ def request_setInstructionBreakpoints(self, memory_reference=[]):
             "type": "request",
             "arguments": args_dict,
         }
-        return self._send_recv(command_dict)
+        return self.send_recv(command_dict)
 
 
 class DebugAdapterServer(DebugCommunication):
diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
index 8778b51e7c360..3b54d598c3509 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py
@@ -1,6 +1,6 @@
 import os
 import time
-from typing import Optional, Callable
+from typing import Optional
 import uuid
 
 import dap_server
@@ -121,19 +121,11 @@ def wait_for_breakpoints_to_resolve(
             f"Expected to resolve all breakpoints. Unresolved breakpoint ids: {unresolved_breakpoints}",
         )
 
-    def wait_until(
-        self,
-        predicate: Callable[[], bool],
-        delay: float = 0.5,
-        timeout: float = DEFAULT_TIMEOUT,
-    ) -> bool:
-        """Repeatedly run the predicate until either the predicate returns True
-        or a timeout has occurred."""
-        deadline = time.monotonic() + timeout
-        while deadline > time.monotonic():
-            if predicate():
+    def waitUntil(self, condition_callback):
+        for _ in range(20):
+            if condition_callback():
                 return True
-            time.sleep(delay)
+            time.sleep(0.5)
         return False
 
     def assertCapabilityIsSet(self, key: str, msg: Optional[str] = None) -> None:
@@ -152,7 +144,6 @@ def verify_breakpoint_hit(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
         "breakpoint_ids" should be a list of breakpoint ID strings
         (["1", "2"]). The return value from self.set_source_breakpoints()
         or self.set_function_breakpoints() can be passed to this function"""
-        breakpoint_ids = [str(i) for i in breakpoint_ids]
         stopped_events = self.dap_server.wait_for_stopped(timeout)
         for stopped_event in stopped_events:
             if "body" in stopped_event:
@@ -164,16 +155,22 @@ def verify_breakpoint_hit(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
                     and body["reason"] != "instruction breakpoint"
                 ):
                     continue
-                if "hitBreakpointIds" not in body:
+                if "description" not in body:
                     continue
-                hit_breakpoint_ids = body["hitBreakpointIds"]
-                for bp in hit_breakpoint_ids:
-                    if str(bp) in breakpoint_ids:
+                # Descriptions for breakpoints will be in the form
+                # "breakpoint 1.1", so look for any description that matches
+                # ("breakpoint 1.") in the description field as verification
+                # that one of the breakpoint locations was hit. DAP doesn't
+                # allow breakpoints to have multiple locations, but LLDB does.
+                # So when looking at the description we just want to make sure
+                # the right breakpoint matches and not worry about the actual
+                # location.
+                description = body["description"]
+                for breakpoint_id in breakpoint_ids:
+                    match_desc = f"breakpoint {breakpoint_id}."
+                    if match_desc in description:
                         return
-        self.assertTrue(
-            False,
-            f"breakpoint not hit, wanted breakpoint_ids={breakpoint_ids} stopped_events={stopped_events}",
-        )
+        self.assertTrue(False, f"breakpoint not hit, stopped_events={stopped_events}")
 
     def verify_stop_exception_info(self, expected_description, timeout=DEFAULT_TIMEOUT):
         """Wait for the process we are debugging to stop, and verify the stop
@@ -208,9 +205,7 @@ def verify_commands(self, flavor, output, commands):
                     found = True
                     break
             self.assertTrue(
-                found,
-                "verify '%s' found in console output for '%s' in %s"
-                % (cmd, flavor, output),
+                found, "verify '%s' found in console output for '%s'" % (cmd, flavor)
             )
 
     def get_dict_value(self, d, key_path):
@@ -282,30 +277,26 @@ def get_source_and_line(self, threadId=None, frameIndex=0):
                         return (source["path"], stackFrame["line"])
         return ("", 0)
 
-    def get_stdout(self):
-        return self.dap_server.get_output("stdout")
+    def get_stdout(self, timeout=0.0):
+        return self.dap_server.get_output("stdout", timeout=timeout)
 
-    def get_console(self):
-        return self.dap_server.get_output("console")
+    def get_console(self, timeout=0.0):
+        return self.dap_server.get_output("console", timeout=timeout)
 
-    def get_important(self):
-        return self.dap_server.get_output("important")
+    def get_important(self, timeout=0.0):
+        return self.dap_server.get_output("important", timeout=timeout)
 
-    def collect_stdout(self, timeout_secs: float, pattern: Optional[str] = None) -> str:
+    def collect_stdout(self, timeout_secs, pattern=None):
         return self.dap_server.collect_output(
             "stdout", timeout_secs=timeout_secs, pattern=pattern
         )
 
-    def collect_console(
-        self, timeout_secs: float, pattern: Optional[str] = None
-    ) -> str:
+    def collect_console(self, timeout_secs, pattern=None):
         return self.dap_server.collect_output(
             "console", timeout_secs=timeout_secs, pattern=pattern
         )
 
-    def collect_important(
-        self, timeout_secs: float, pattern: Optional[str] = None
-    ) -> str:
+    def collect_important(self, timeout_secs, pattern=None):
         return self.dap_server.collect_output(
             "important", timeout_secs=timeout_secs, pattern=pattern
         )
@@ -364,7 +355,7 @@ def stepOut(self, threadId=None, waitForStop=True, timeout=DEFAULT_TIMEOUT):
             return self.dap_server.wait_for_stopped(timeout)
         return None
 
-    def do_continue(self) -> None:  # `continue` is a keyword.
+    def do_continue(self):  # `continue` is a keyword.
         resp = self.dap_server.request_continue()
         self.assertTrue(resp["success"], f"continue request failed: {resp}")
 
@@ -372,14 +363,10 @@ def continue_to_next_stop(self, timeout=DEFAULT_TIMEOUT):
         self.do_continue()
         return self.dap_server.wait_for_stopped(timeout)
 
-    def continue_to_breakpoint(
-        self, breakpoint_id: int, timeout: Optional[float] = DEFAULT_TIMEOUT
-    ) -> None:
-        self.continue_to_breakpoints([breakpoint_id], timeout)
+    def continue_to_breakpoint(self, breakpoint_id: str, timeout=DEFAULT_TIMEOUT):
+        self.continue_to_breakpoints((breakpoint_id), timeout)
 
-    def continue_to_breakpoints(
-        self, breakpoint_ids: list[int], timeout: Optional[float] = DEFAULT_TIMEOUT
-    ) -> None:
+    def continue_to_breakpoints(self, breakpoint_ids, timeout=DEFAULT_TIMEOUT):
         self.do_continue()
         self.verify_breakpoint_hit(breakpoint_ids, timeout)
 
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
index a6eeee3a02543..831edd6494c1e 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
@@ -78,7 +78,7 @@ def test_source_map(self):
         self.assertFalse(breakpoint["verified"])
         self.assertEqual(other_basename, breakpoint["source"]["name"])
         self.assertEqual(new_other_path, breakpoint["source"]["path"])
-        other_breakpoint_id = str(breakpoint["id"])
+        other_breakpoint_id = breakpoint["id"]
 
         self.dap_server.request_continue()
         self.verify_breakpoint_hit([other_breakpoint_id])
@@ -379,8 +379,7 @@ def test_column_breakpoints(self):
             self.assertEqual(breakpoint["line"], loop_line)
             self.assertEqual(breakpoint["column"], columns[index])
             self.assertTrue(breakpoint["verified"], "expect breakpoint verified")
-            self.assertIn("id", breakpoint, "expected breakpoint id")
-            breakpoint_ids.append(str(breakpoint["id"]))
+            breakpoint_ids.append(breakpoint["id"])
 
         # Continue to the first breakpoint,
         self.continue_to_breakpoints([breakpoint_ids[0]])
diff --git a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
index c750cff071a80..824ed8fe3bb97 100644
--- a/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
+++ b/lldb/test/API/tools/lldb-dap/cancel/TestDAP_cancel.py
@@ -54,18 +54,18 @@ def test_pending_request(self):
         pending_seq = self.async_blocking_request(duration=self.DEFAULT_TIMEOUT / 2)
         cancel_seq = self.async_cancel(requestId=pending_seq)
 
-        blocking_resp = self.dap_server.receive_response(blocking_seq)
+        blocking_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(blocking_resp["request_seq"], blocking_seq)
         self.assertEqual(blocking_resp["command"], "evaluate")
         self.assertEqual(blocking_resp["success"], True)
 
-        pending_resp = self.dap_server.receive_response(pending_seq)
+        pending_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(pending_resp["request_seq"], pending_seq)
         self.assertEqual(pending_resp["command"], "evaluate")
         self.assertEqual(pending_resp["success"], False)
         self.assertEqual(pending_resp["message"], "cancelled")
 
-        cancel_resp = self.dap_server.receive_response(cancel_seq)
+        cancel_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(cancel_resp["request_seq"], cancel_seq)
         self.assertEqual(cancel_resp["command"], "cancel")
         self.assertEqual(cancel_resp["success"], True)
@@ -86,13 +86,13 @@ def test_inflight_request(self):
         )
         cancel_seq = self.async_cancel(requestId=blocking_seq)
 
-        blocking_resp = self.dap_server.receive_response(blocking_seq)
+        blocking_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(blocking_resp["request_seq"], blocking_seq)
         self.assertEqual(blocking_resp["command"], "evaluate")
         self.assertEqual(blocking_resp["success"], False)
         self.assertEqual(blocking_resp["message"], "cancelled")
 
-        cancel_resp = self.dap_server.receive_response(cancel_seq)
+        cancel_resp = self.dap_server.recv_packet(filter_type=["response"])
         self.assertEqual(cancel_resp["request_seq"], cancel_seq)
         self.assertEqual(cancel_resp["command"], "cancel")
         self.assertEqual(cancel_resp["success"], True)
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index c29e0d3fa7b81..ae8142ae4f484 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -191,7 +191,7 @@ def test_disableSTDIO(self):
         self.continue_to_exit()
         # Now get the STDOUT and verify our program argument is correct
         output = self.get_stdout()
-        self.assertEqual(output, "", "expect no program output")
+        self.assertEqual(output, None, "expect no program output")
 
     @skipIfWindows
     @skipIfLinux  # shell argument expansion doesn't seem to work on Linux
@@ -392,14 +392,14 @@ def test_commands(self):
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the first breakpoint was hit
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue again and hit the second breakpoint.
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the second breakpoint was hit
         self.continue_to_breakpoints(breakpoint_ids)
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
@@ -461,21 +461,21 @@ def test_extra_launch_commands(self):
         self.verify_commands("launchCommands", output, launchCommands)
         # Verify the "stopCommands" here
         self.continue_to_next_stop()
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue and hit the second breakpoint.
         # Get output from the console. This should contain both the
         # "stopCommands" that were run after the first breakpoint was hit
         self.continue_to_next_stop()
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("stopCommands", output, stopCommands)
 
         # Continue until the program exits
         self.continue_to_exit()
         # Get output from the console. This should contain both the
         # "exitCommands" that were run after the second breakpoint was hit
-        output = self.get_console()
+        output = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.verify_commands("exitCommands", output, exitCommands)
 
     def test_failing_launch_commands(self):
diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
index b1823e4c8b1c3..4fc221668a8ee 100644
--- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
+++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
@@ -54,7 +54,7 @@ def checkSymbolsLoadedWithSize():
             return symbol_regex.match(program_module["symbolStatus"])
 
         if expect_debug_info_size:
-            self.wait_until(checkSymbolsLoadedWithSize)
+            self.waitUntil(checkSymbolsLoadedWithSize)
         active_modules = self.dap_server.get_modules()
         program_module = active_modules[program_basename]
         self.assertEqual(program_basename, program_module["name"])
diff --git a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
index 4fcde623e3829..0425b55a5e552 100644
--- a/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
+++ b/lldb/test/API/tools/lldb-dap/output/TestDAP_output.py
@@ -37,14 +37,14 @@ def test_output(self):
         # Disconnecting from the server to ensure any pending IO is flushed.
         self.dap_server.request_disconnect()
 
-        output += self.get_stdout()
+        output += self.get_stdout(timeout=self.DEFAULT_TIMEOUT)
         self.assertTrue(output and len(output) > 0, "expect program stdout")
         self.assertIn(
             "abcdefghi\r\nhello world\r\nfinally\0\0",
             output,
             "full stdout not found in: " + repr(output),
         )
-        console = self.get_console()
+        console = self.get_console(timeout=self.DEFAULT_TIMEOUT)
         self.assertTrue(console and len(console) > 0, "expect dap messages")
         self.assertIn(
             "out\0\0\r\nerr\0\0\r\n", console, f"full console message not found"

From c9b28163888574bcfba0171372ae0dcfb40abbfa Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:07:03 +0900
Subject: [PATCH 819/851] AMDGPU: Fix cost model for 16-bit operations on gfx8
 (#141943)

We should only divide the number of pieces to fit the packed instructions
if we actually have pk instructions. This increases the cost of copysign,
but is closer to the current codegen output. It could be much cheaper
than it is now.
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  2 +-
 .../Analysis/CostModel/AMDGPU/canonicalize.ll | 24 ++++++++--------
 .../Analysis/CostModel/AMDGPU/copysign.ll     | 28 +++++++++----------
 .../SLPVectorizer/AMDGPU/slp-v2f16.ll         | 12 ++++----
 4 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 58bfc0b80b24f..b2b25ac66677e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -721,7 +721,7 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   if (SLT == MVT::f64)
     return LT.first * NElts * get64BitInstrCost(CostKind);
 
-  if ((ST->has16BitInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
+  if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
       (ST->hasPackedFP32Ops() && SLT == MVT::f32))
     NElts = (NElts + 1) / 2;
 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
index e162edbf611e2..7ac4db3119210 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
@@ -22,12 +22,12 @@ define void @canonicalize_f16() {
 ;
 ; GFX8-LABEL: 'canonicalize_f16'
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'canonicalize_f16'
@@ -62,12 +62,12 @@ define void @canonicalize_f16() {
 ;
 ; GFX8-SIZE-LABEL: 'canonicalize_f16'
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.canonicalize.f16(half undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.canonicalize.v5f16(<5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.canonicalize.v16f16(<16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17f16 = call <17 x half> @llvm.canonicalize.v17f16(<17 x half> undef)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'canonicalize_f16'
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
index 06a058ff2e7b1..334bb341a3c3e 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
@@ -23,13 +23,13 @@ define void @copysign_f16() {
 ;
 ; GFX8-LABEL: 'copysign_f16'
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'copysign_f16'
@@ -67,13 +67,13 @@ define void @copysign_f16() {
 ;
 ; GFX8-SIZE-LABEL: 'copysign_f16'
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.copysign.f16(half undef, half undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.copysign.v2f16(<2 x half> undef, <2 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.copysign.v3f16(<3 x half> undef, <3 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.copysign.v4f16(<4 x half> undef, <4 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = call <5 x half> @llvm.copysign.v5f16(<5 x half> undef, <5 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.copysign.v8f16(<8 x half> undef, <8 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9f16 = call <9 x half> @llvm.copysign.v9f16(<9 x half> undef, <9 x half> undef)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.copysign.v16f16(<16 x half> undef, <16 x half> undef)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'copysign_f16'
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
index c79fa9c84d1c3..0c26bcb343bfc 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -271,7 +271,9 @@ bb:
 }
 
 ; GCN-LABEL: @copysign_combine_v2f16
-; GCN: call <2 x half> @llvm.copysign.v2f16(
+; GFX8: call half @llvm.copysign.f16(
+; GFX8: call half @llvm.copysign.f16(
+; GFX9: call <2 x half> @llvm.copysign.v2f16(
 define void @copysign_combine_v2f16(ptr addrspace(1) %arg, half %sign) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -290,8 +292,6 @@ bb:
 
 ; FIXME: Should always vectorize
 ; GCN-LABEL: @copysign_combine_v4f16
-; GCN: call <2 x half> @llvm.copysign.v2f16(
-
 ; GFX8: call half @llvm.copysign.f16(
 ; GFX8: call half @llvm.copysign.f16(
 
@@ -327,8 +327,10 @@ bb:
 }
 
 ; GCN-LABEL: @canonicalize_combine_v4f16
-; GCN: call <2 x half> @llvm.canonicalize.v2f16(
-; GCN: call <2 x half> @llvm.canonicalize.v2f16(
+; GFX8: call half @llvm.canonicalize.f16(
+; GFX8: call half @llvm.canonicalize.f16(
+
+; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
 define void @canonicalize_combine_v4f16(ptr addrspace(1) %arg) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

From 3800a83160a42f32947b82700e454cc07c600734 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:10:53 +0900
Subject: [PATCH 820/851] AMDGPU: Reduce cost of f64 copysign (#141944)

The real implementation is 1 real instruction plus a constant
materialize. Call that a 1, it's not a real f64 operation.
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 12 ++++---
 .../Analysis/CostModel/AMDGPU/copysign.ll     | 32 +++++++++----------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index b2b25ac66677e..b79c9be3eac93 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -718,9 +718,6 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
 
-  if (SLT == MVT::f64)
-    return LT.first * NElts * get64BitInstrCost(CostKind);
-
   if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
       (ST->hasPackedFP32Ops() && SLT == MVT::f32))
     NElts = (NElts + 1) / 2;
@@ -731,6 +728,11 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   switch (ICA.getID()) {
   case Intrinsic::fma:
   case Intrinsic::fmuladd:
+    if (SLT == MVT::f64) {
+      InstRate = get64BitInstrCost(CostKind);
+      break;
+    }
+
     if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
       InstRate = getFullRateInstrCost();
     else {
@@ -741,8 +743,8 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   case Intrinsic::copysign:
     return NElts * getFullRateInstrCost();
   case Intrinsic::canonicalize: {
-    assert(SLT != MVT::f64);
-    InstRate = getFullRateInstrCost();
+    InstRate =
+        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
     break;
   }
   case Intrinsic::uadd_sat:
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
index 334bb341a3c3e..5b042a8a04603 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/copysign.ll
@@ -245,25 +245,25 @@ define void @copysign_bf16() {
 
 define void @copysign_f64() {
 ; ALL-LABEL: 'copysign_f64'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 256 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; ALL-SIZE-LABEL: 'copysign_f64'
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
-; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f64 = call double @llvm.copysign.f64(double undef, double undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = call <3 x double> @llvm.copysign.v3f64(<3 x double> undef, <3 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v5f64 = call <5 x double> @llvm.copysign.v5f64(<5 x double> undef, <5 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v9f64 = call <9 x double> @llvm.copysign.v9f64(<9 x double> undef, <9 x double> undef)
+; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64 = call <16 x double> @llvm.copysign.v16f64(<16 x double> undef, <16 x double> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.copysign.f64(double undef, double undef)

From bec9ac2dafe1c9fca975721e9951c5f7f6b1b559 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Tue, 17 Jun 2025 16:13:42 -0700
Subject: [PATCH 821/851] [llvm] Lower latency bonus threshold in function
 specialization. (#143954)

Related to #143219.

Function specialization does not kick in if flang sets `noalias`
attributes on the function arguments of `digits_2`, because PRE
optimizes several `srem` instructions and other memory accesses
from the inner loops causing the latency bonus to be lower than
the current 40% threshold.

While looking at this, I did not really get why we compute the latency
bonus as a ratio of the latency of the "eliminated" instructions
and the code-size of the whole function. It did not make much sense
to me.

I tried computing the total latency as a sum of latencies
of the instructions that belong to non-dead code (including
the instructions that would be executed had they not been
"eliminated" due to the constant propagation). This total
latency should identify the total cost of executing the function
with the given argument being dynamically equal to the tried
constant value. Then the latency bonus would be computed
as the ratio between the latency of the "eliminated" instructions
and the total latency. Unfortunately, this did not given me a good
heuristics either. The bonus was close to 0% on some targets,
and as big as 3-5% on other targets. This does match very well
with the performance gain achieved by function specialization
for exchange2, so it seemd like another artificial heuristic
not better than the current one.

It seems that GCC uses a set of different heuristics for function
specialization, but I am not an expert here and I cannot say
if we can match them in LLVM.

With all that said, I decided to try to lower the threshold
to avoid the regression and be able to re-enable the generally
good change for `noalias` attribute.

With this patch, I was able to reduce the effect of `noalias`,
so that `-force-no-alias=true` is only ~10% slower than
`-force-no-alias=false` code on neoverse-v1 and neoverse-v2.
On neoverse-n1, `-force-no-alias=true` is >2x faster than
`-force-no-alias=false` regardless of this patch.

This threshold has been changed before also due to improved
alias information:
https://github.com/llvm/llvm-project/commit/2fb51fba8ca904a6d3ddf30ae94228ecf9e6a231#diff-066363256b7b4164e66b28a3028b2cb9e405c9136241baa33db76ebd2edb87cd

Please let me know what testing I should run to make sure this change
is safe. As I understand, it may affect the compilation time
performance,
and I will appreciate it if someone points out which benchmarks
need to be checked before merging this.
---
 llvm/lib/Transforms/IPO/FunctionSpecialization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 1034ce9582152..45fa9d57e4862 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -71,7 +71,7 @@ static cl::opt<unsigned> MinCodeSizeSavings(
              "much percent of the original function size"));
 
 static cl::opt<unsigned> MinLatencySavings(
-    "funcspec-min-latency-savings", cl::init(40), cl::Hidden,
+    "funcspec-min-latency-savings", cl::init(20), cl::Hidden,
     cl::desc("Reject specializations whose latency savings are less than this "
              "much percent of the original function size"));
 

From af65cb68f553759eac307edda87ff7d8b5fdffa9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:13:57 +0900
Subject: [PATCH 822/851] AMDGPU: Move fpenvIEEEMode into TTI (#141945)

---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     | 28 ++-----------------
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 17 +++++++++++
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.h |  7 +++++
 3 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 9be8821d5bf96..d12170a60905b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -60,28 +60,6 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
   return maxnum(Src0, Src1);
 }
 
-enum class KnownIEEEMode { Unknown, On, Off };
-
-/// Return KnownIEEEMode::On if we know if the use context can assume
-/// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
-/// "amdgpu-ieee"="false".
-static KnownIEEEMode fpenvIEEEMode(const Instruction &I,
-                                   const GCNSubtarget &ST) {
-  if (!ST.hasIEEEMode()) // Only mode on gfx12
-    return KnownIEEEMode::On;
-
-  const Function *F = I.getFunction();
-  if (!F)
-    return KnownIEEEMode::Unknown;
-
-  Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
-  if (IEEEAttr.isValid())
-    return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
-
-  return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
-                                               : KnownIEEEMode::On;
-}
-
 // Check if a value can be converted to a 16-bit value without losing
 // precision.
 // The value is expected to be either a float (IsFloat = true) or an unsigned
@@ -1004,7 +982,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     // TODO: Also can fold to 2 operands with infinities.
     if ((match(Src0, m_APFloat(ConstSrc0)) && ConstSrc0->isNaN()) ||
         isa<UndefValue>(Src0)) {
-      switch (fpenvIEEEMode(II, *ST)) {
+      switch (fpenvIEEEMode(II)) {
       case KnownIEEEMode::On:
         // TODO: If Src2 is snan, does it need quieting?
         if (ConstSrc0 && ConstSrc0->isSignaling())
@@ -1019,7 +997,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       }
     } else if ((match(Src1, m_APFloat(ConstSrc1)) && ConstSrc1->isNaN()) ||
                isa<UndefValue>(Src1)) {
-      switch (fpenvIEEEMode(II, *ST)) {
+      switch (fpenvIEEEMode(II)) {
       case KnownIEEEMode::On:
         // TODO: If Src2 is snan, does it need quieting?
         if (ConstSrc1 && ConstSrc1->isSignaling())
@@ -1035,7 +1013,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       }
     } else if ((match(Src2, m_APFloat(ConstSrc2)) && ConstSrc2->isNaN()) ||
                isa<UndefValue>(Src2)) {
-      switch (fpenvIEEEMode(II, *ST)) {
+      switch (fpenvIEEEMode(II)) {
       case KnownIEEEMode::On:
         if (ConstSrc2 && ConstSrc2->isSignaling()) {
           auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index b79c9be3eac93..ce2098a3a19bb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1445,3 +1445,20 @@ void GCNTTIImpl::collectKernelLaunchBounds(
   LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
   LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
 }
+
+GCNTTIImpl::KnownIEEEMode
+GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
+  if (!ST->hasIEEEMode()) // Only mode on gfx12
+    return KnownIEEEMode::On;
+
+  const Function *F = I.getFunction();
+  if (!F)
+    return KnownIEEEMode::Unknown;
+
+  Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
+  if (IEEEAttr.isValid())
+    return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
+
+  return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
+                                               : KnownIEEEMode::On;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index ec298c7e9631a..0fae301abf532 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -281,6 +281,13 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   void collectKernelLaunchBounds(
       const Function &F,
       SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const override;
+
+  enum class KnownIEEEMode { Unknown, On, Off };
+
+  /// Return KnownIEEEMode::On if we know if the use context can assume
+  /// "amdgpu-ieee"="true" and KnownIEEEMode::Off if we can assume
+  /// "amdgpu-ieee"="false".
+  KnownIEEEMode fpenvIEEEMode(const Instruction &I) const;
 };
 
 } // end namespace llvm

From 70343c8d44273c187e3f7fa5e2037fbc41307077 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin@nvidia.com>
Date: Tue, 17 Jun 2025 16:14:13 -0700
Subject: [PATCH 823/851] [mlir][flang] Added
 Weighted[Region]BranchOpInterface's. (#142079)

The new interfaces provide getters and setters for the weight
information about the branches of BranchOpInterface and
RegionBranchOpInterface operations.

These interfaces are done the same way as LLVM dialect's
BranchWeightOpInterface.

The plan is to produce this information in Flang, e.g. mark
most probably "cold" code as such and allow LLVM to order
basic blocks accordingly. An example of such a code is
copy loops generated for arrays repacking - we can mark it
as "cold" assuming that the copy will not happen dynamically.
If the copy actually happens the overhead of the copy is probably high
enough so that we may not care about the little overhead
of jumping to the "cold" code and fetching it.
---
 .../include/flang/Optimizer/Dialect/FIROps.td |  18 ++-
 flang/lib/Optimizer/Dialect/FIROps.cpp        |  21 +++-
 .../Transforms/ControlFlowConverter.cpp       |   5 +-
 flang/test/Fir/cfg-conversion-if.fir          |  46 +++++++
 flang/test/Fir/fir-ops.fir                    |  16 +++
 flang/test/Fir/invalid.fir                    |  28 +++++
 .../Dialect/ControlFlow/IR/ControlFlowOps.td  |  34 +++---
 .../mlir/Dialect/LLVMIR/LLVMInterfaces.td     |  36 ------
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td   |  57 +++++----
 .../mlir/Interfaces/ControlFlowInterfaces.h   |  20 ++++
 .../mlir/Interfaces/ControlFlowInterfaces.td  | 112 ++++++++++++++++++
 .../mlir/Target/LLVMIR/ModuleTranslation.h    |   2 +-
 .../ControlFlowToLLVM/ControlFlowToLLVM.cpp   |   7 +-
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    |  11 +-
 mlir/lib/Interfaces/ControlFlowInterfaces.cpp |  46 +++++++
 .../LLVMIR/LLVMIRToLLVMTranslation.cpp        |  11 +-
 mlir/lib/Target/LLVMIR/ModuleTranslation.cpp  |  11 +-
 .../Conversion/ControlFlowToLLVM/branch.mlir  |  14 +++
 mlir/test/Dialect/ControlFlow/invalid.mlir    |  36 ++++++
 mlir/test/Dialect/ControlFlow/ops.mlir        |  10 ++
 .../LLVMIR/Import/metadata-profiling.ll       |  13 +-
 mlir/test/Target/LLVMIR/llvmir-invalid.mlir   |  16 +++
 mlir/test/Target/LLVMIR/llvmir.mlir           |  26 ----
 23 files changed, 461 insertions(+), 135 deletions(-)
 create mode 100644 flang/test/Fir/cfg-conversion-if.fir

diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 90e05ce3d5ca6..27a6ca4ebdb4e 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -2323,9 +2323,13 @@ def fir_DoLoopOp : region_Op<"do_loop", [AttrSizedOperandSegments,
   }];
 }
 
-def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods<RegionBranchOpInterface, [
-    "getRegionInvocationBounds", "getEntrySuccessorRegions"]>, RecursiveMemoryEffects,
-    NoRegionArguments]> {
+def fir_IfOp
+    : region_Op<
+          "if", [DeclareOpInterfaceMethods<
+                     RegionBranchOpInterface, ["getRegionInvocationBounds",
+                                               "getEntrySuccessorRegions"]>,
+                 RecursiveMemoryEffects, NoRegionArguments,
+                 WeightedRegionBranchOpInterface]> {
   let summary = "if-then-else conditional operation";
   let description = [{
     Used to conditionally execute operations. This operation is the FIR
@@ -2342,7 +2346,8 @@ def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods<RegionBranchOpInterfac
     ```
   }];
 
-  let arguments = (ins I1:$condition);
+  let arguments = (ins I1:$condition,
+      OptionalAttr<DenseI32ArrayAttr>:$region_weights);
   let results = (outs Variadic<AnyType>:$results);
 
   let regions = (region
@@ -2371,6 +2376,11 @@ def fir_IfOp : region_Op<"if", [DeclareOpInterfaceMethods<RegionBranchOpInterfac
 
     void resultToSourceOps(llvm::SmallVectorImpl<mlir::Value> &results,
                            unsigned resultNum);
+
+    /// Returns the display name string for the region_weights attribute.
+    static constexpr llvm::StringRef getWeightsAttrAssemblyName() {
+      return "weights";
+    }
   }];
 }
 
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 6181e1fad4240..ecfa2939e96a6 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -4418,6 +4418,19 @@ mlir::ParseResult fir::IfOp::parse(mlir::OpAsmParser &parser,
       parser.resolveOperand(cond, i1Type, result.operands))
     return mlir::failure();
 
+  if (mlir::succeeded(
+          parser.parseOptionalKeyword(getWeightsAttrAssemblyName()))) {
+    if (parser.parseLParen())
+      return mlir::failure();
+    mlir::DenseI32ArrayAttr weights;
+    if (parser.parseCustomAttributeWithFallback(weights, mlir::Type{}))
+      return mlir::failure();
+    if (weights)
+      result.addAttribute(getRegionWeightsAttrName(result.name), weights);
+    if (parser.parseRParen())
+      return mlir::failure();
+  }
+
   if (parser.parseOptionalArrowTypeList(result.types))
     return mlir::failure();
 
@@ -4449,6 +4462,11 @@ llvm::LogicalResult fir::IfOp::verify() {
 void fir::IfOp::print(mlir::OpAsmPrinter &p) {
   bool printBlockTerminators = false;
   p << ' ' << getCondition();
+  if (auto weights = getRegionWeightsAttr()) {
+    p << ' ' << getWeightsAttrAssemblyName() << '(';
+    p.printStrippedAttrOrType(weights);
+    p << ')';
+  }
   if (!getResults().empty()) {
     p << " -> (" << getResultTypes() << ')';
     printBlockTerminators = true;
@@ -4464,7 +4482,8 @@ void fir::IfOp::print(mlir::OpAsmPrinter &p) {
     p.printRegion(otherReg, /*printEntryBlockArgs=*/false,
                   printBlockTerminators);
   }
-  p.printOptionalAttrDict((*this)->getAttrs());
+  p.printOptionalAttrDict((*this)->getAttrs(),
+                          /*elideAttrs=*/{getRegionWeightsAttrName()});
 }
 
 void fir::IfOp::resultToSourceOps(llvm::SmallVectorImpl<mlir::Value> &results,
diff --git a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
index 8a9e9b80134b8..3d35803e6a2d3 100644
--- a/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
+++ b/flang/lib/Optimizer/Transforms/ControlFlowConverter.cpp
@@ -212,9 +212,12 @@ class CfgIfConv : public mlir::OpRewritePattern<fir::IfOp> {
     }
 
     rewriter.setInsertionPointToEnd(condBlock);
-    rewriter.create<mlir::cf::CondBranchOp>(
+    auto branchOp = rewriter.create<mlir::cf::CondBranchOp>(
         loc, ifOp.getCondition(), ifOpBlock, llvm::ArrayRef<mlir::Value>(),
         otherwiseBlock, llvm::ArrayRef<mlir::Value>());
+    llvm::ArrayRef<int32_t> weights = ifOp.getWeights();
+    if (!weights.empty())
+      branchOp.setWeights(weights);
     rewriter.replaceOp(ifOp, continueBlock->getArguments());
     return success();
   }
diff --git a/flang/test/Fir/cfg-conversion-if.fir b/flang/test/Fir/cfg-conversion-if.fir
new file mode 100644
index 0000000000000..1e30ee8e64f02
--- /dev/null
+++ b/flang/test/Fir/cfg-conversion-if.fir
@@ -0,0 +1,46 @@
+// RUN: fir-opt --split-input-file --cfg-conversion %s | FileCheck %s
+
+func.func private @callee() -> none
+
+// CHECK-LABEL:   func.func @if_then(
+// CHECK-SAME:      %[[ARG0:.*]]: i1) {
+// CHECK:           cf.cond_br %[[ARG0]] weights([10, 90]), ^bb1, ^bb2
+// CHECK:         ^bb1:
+// CHECK:           %[[VAL_0:.*]] = fir.call @callee() : () -> none
+// CHECK:           cf.br ^bb2
+// CHECK:         ^bb2:
+// CHECK:           return
+// CHECK:         }
+func.func @if_then(%cond: i1) {
+  fir.if %cond weights([10, 90]) {
+    fir.call @callee() : () -> none
+  }
+  return
+}
+
+// -----
+
+// CHECK-LABEL:   func.func @if_then_else(
+// CHECK-SAME:      %[[ARG0:.*]]: i1) -> i32 {
+// CHECK:           %[[VAL_0:.*]] = arith.constant 0 : i32
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+// CHECK:           cf.cond_br %[[ARG0]] weights([90, 10]), ^bb1, ^bb2
+// CHECK:         ^bb1:
+// CHECK:           cf.br ^bb3(%[[VAL_0]] : i32)
+// CHECK:         ^bb2:
+// CHECK:           cf.br ^bb3(%[[VAL_1]] : i32)
+// CHECK:         ^bb3(%[[VAL_2:.*]]: i32):
+// CHECK:           cf.br ^bb4
+// CHECK:         ^bb4:
+// CHECK:           return %[[VAL_2]] : i32
+// CHECK:         }
+func.func @if_then_else(%cond: i1) -> i32 {
+  %c0 = arith.constant 0 : i32
+  %c1 = arith.constant 1 : i32
+  %result = fir.if %cond weights([90, 10]) -> i32 {
+    fir.result %c0 : i32
+  } else {
+    fir.result %c1 : i32
+  }
+  return %result : i32
+}
diff --git a/flang/test/Fir/fir-ops.fir b/flang/test/Fir/fir-ops.fir
index 9c444d2f4e0bc..3585bf9efca3e 100644
--- a/flang/test/Fir/fir-ops.fir
+++ b/flang/test/Fir/fir-ops.fir
@@ -1015,3 +1015,19 @@ func.func @test_box_total_elements(%arg0: !fir.class<!fir.type<sometype{i:i32}>>
   %6 = arith.addi %2, %5 : index
   return %6 : index
 }
+
+// CHECK-LABEL:   func.func @test_if_weights(
+// CHECK-SAME:      %[[ARG0:.*]]: i1) {
+func.func @test_if_weights(%cond: i1) {
+// CHECK:           fir.if %[[ARG0]] weights([99, 1]) {
+// CHECK:           }
+  fir.if %cond weights([99, 1]) {
+  }
+// CHECK:           fir.if %[[ARG0]] weights([99, 1]) {
+// CHECK:           } else {
+// CHECK:           }
+  fir.if %cond weights ([99,1]) {
+  } else {
+  }
+  return
+}
diff --git a/flang/test/Fir/invalid.fir b/flang/test/Fir/invalid.fir
index 45cae1f82cb8e..aca0ecc1abdc1 100644
--- a/flang/test/Fir/invalid.fir
+++ b/flang/test/Fir/invalid.fir
@@ -1393,3 +1393,31 @@ fir.local {type = local_init} @x.localizer : f32 init {
 ^bb0(%arg0: f32, %arg1: f32):
   fir.yield(%arg0 : f32)
 }
+
+// -----
+
+func.func @wrong_weights_number_in_if_then(%cond: i1) {
+// expected-error @below {{expects number of region weights to match number of regions: 1 vs 2}}
+  fir.if %cond weights([50]) {
+  }
+  return
+}
+
+// -----
+
+func.func @wrong_weights_number_in_if_then_else(%cond: i1) {
+// expected-error @below {{expects number of region weights to match number of regions: 3 vs 2}}
+  fir.if %cond weights([50, 40, 10]) {
+  } else {
+  }
+  return
+}
+
+// -----
+
+func.func @negative_weight_in_if_then(%cond: i1) {
+// expected-error @below {{weight #0 must be non-negative}}
+  fir.if %cond weights([-1, 101]) {
+  }
+  return
+}
diff --git a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
index 48f12b46a57f1..79da81ba049dd 100644
--- a/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
+++ b/mlir/include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td
@@ -112,10 +112,11 @@ def BranchOp : CF_Op<"br", [
 // CondBranchOp
 //===----------------------------------------------------------------------===//
 
-def CondBranchOp : CF_Op<"cond_br",
-    [AttrSizedOperandSegments,
-     DeclareOpInterfaceMethods<BranchOpInterface, ["getSuccessorForOperands"]>,
-     Pure, Terminator]> {
+def CondBranchOp
+    : CF_Op<"cond_br", [AttrSizedOperandSegments,
+                        DeclareOpInterfaceMethods<
+                            BranchOpInterface, ["getSuccessorForOperands"]>,
+                        WeightedBranchOpInterface, Pure, Terminator]> {
   let summary = "Conditional branch operation";
   let description = [{
     The `cf.cond_br` terminator operation represents a conditional branch on a
@@ -144,20 +145,23 @@ def CondBranchOp : CF_Op<"cond_br",
     ```
   }];
 
-  let arguments = (ins I1:$condition,
-                       Variadic<AnyType>:$trueDestOperands,
-                       Variadic<AnyType>:$falseDestOperands);
+  let arguments = (ins I1:$condition, Variadic<AnyType>:$trueDestOperands,
+      Variadic<AnyType>:$falseDestOperands,
+      OptionalAttr<DenseI32ArrayAttr>:$branch_weights);
   let successors = (successor AnySuccessor:$trueDest, AnySuccessor:$falseDest);
 
-  let builders = [
-    OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
-      "ValueRange":$trueOperands, "Block *":$falseDest,
-      "ValueRange":$falseOperands), [{
-      build($_builder, $_state, condition, trueOperands, falseOperands, trueDest,
+  let builders = [OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
+                                "ValueRange":$trueOperands,
+                                "Block *":$falseDest,
+                                "ValueRange":$falseOperands),
+                            [{
+      build($_builder, $_state, condition, trueOperands, falseOperands, /*branch_weights=*/{}, trueDest,
             falseDest);
     }]>,
-    OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
-      "Block *":$falseDest, CArg<"ValueRange", "{}">:$falseOperands), [{
+                  OpBuilder<(ins "Value":$condition, "Block *":$trueDest,
+                                "Block *":$falseDest,
+                                CArg<"ValueRange", "{}">:$falseOperands),
+                            [{
       build($_builder, $_state, condition, trueDest, ValueRange(), falseDest,
             falseOperands);
     }]>];
@@ -216,7 +220,7 @@ def CondBranchOp : CF_Op<"cond_br",
 
   let hasCanonicalizer = 1;
   let assemblyFormat = [{
-    $condition `,`
+    $condition (`weights` `(` $branch_weights^ `)` )? `,`
     $trueDest (`(` $trueDestOperands^ `:` type($trueDestOperands) `)`)? `,`
     $falseDest (`(` $falseDestOperands^ `:` type($falseDestOperands) `)`)?
     attr-dict
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
index 2824f09dab6ce..138170f8c8762 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td
@@ -168,42 +168,6 @@ def NonNegFlagInterface : OpInterface<"NonNegFlagInterface"> {
   ];
 }
 
-def BranchWeightOpInterface : OpInterface<"BranchWeightOpInterface"> {
-  let description = [{
-    An interface for operations that can carry branch weights metadata. It
-    provides setters and getters for the operation's branch weights attribute.
-    The default implementation of the interface methods expect the operation to
-    have an attribute of type DenseI32ArrayAttr named branch_weights.
-  }];
-
-  let cppNamespace = "::mlir::LLVM";
-
-  let methods = [
-    InterfaceMethod<
-      /*desc=*/        "Returns the branch weights attribute or nullptr",
-      /*returnType=*/  "::mlir::DenseI32ArrayAttr",
-      /*methodName=*/  "getBranchWeightsOrNull",
-      /*args=*/        (ins),
-      /*methodBody=*/  [{}],
-      /*defaultImpl=*/ [{
-        auto op = cast<ConcreteOp>(this->getOperation());
-        return op.getBranchWeightsAttr();
-      }]
-      >,
-    InterfaceMethod<
-      /*desc=*/        "Sets the branch weights attribute",
-      /*returnType=*/  "void",
-      /*methodName=*/  "setBranchWeights",
-      /*args=*/        (ins "::mlir::DenseI32ArrayAttr":$attr),
-      /*methodBody=*/  [{}],
-      /*defaultImpl=*/ [{
-        auto op = cast<ConcreteOp>(this->getOperation());
-        op.setBranchWeightsAttr(attr);
-      }]
-      >
-  ];
-}
-
 def AccessGroupOpInterface : OpInterface<"AccessGroupOpInterface"> {
   let description = [{
     An interface for memory operations that can carry access groups metadata.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 68fa620d239b9..939e7a09a73ad 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -660,12 +660,12 @@ def LLVM_FPTruncOp : LLVM_CastOp<"fptrunc", "FPTrunc",
                                  LLVM_ScalarOrVectorOf<LLVM_AnyFloat>>;
 
 // Call-related operations.
-def LLVM_InvokeOp : LLVM_Op<"invoke", [
-                      AttrSizedOperandSegments,
-                      DeclareOpInterfaceMethods<BranchOpInterface>,
-                      DeclareOpInterfaceMethods<CallOpInterface>,
-                      DeclareOpInterfaceMethods<BranchWeightOpInterface>,
-                      Terminator]> {
+def LLVM_InvokeOp
+    : LLVM_Op<"invoke", [AttrSizedOperandSegments,
+                         DeclareOpInterfaceMethods<BranchOpInterface>,
+                         DeclareOpInterfaceMethods<CallOpInterface>,
+                         DeclareOpInterfaceMethods<WeightedBranchOpInterface>,
+                         Terminator]> {
   let arguments = (ins
                    OptionalAttr<TypeAttrOf<LLVM_FunctionType>>:$var_callee_type,
                    OptionalAttr<FlatSymbolRefAttr>:$callee,
@@ -734,12 +734,12 @@ def LLVM_VaArgOp : LLVM_Op<"va_arg"> {
 // CallOp
 //===----------------------------------------------------------------------===//
 
-def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
-                    [AttrSizedOperandSegments,
-                     DeclareOpInterfaceMethods<FastmathFlagsInterface>,
-                     DeclareOpInterfaceMethods<CallOpInterface>,
-                     DeclareOpInterfaceMethods<SymbolUserOpInterface>,
-                     DeclareOpInterfaceMethods<BranchWeightOpInterface>]> {
+def LLVM_CallOp
+    : LLVM_MemAccessOpBase<
+          "call", [AttrSizedOperandSegments,
+                   DeclareOpInterfaceMethods<FastmathFlagsInterface>,
+                   DeclareOpInterfaceMethods<CallOpInterface>,
+                   DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
   let summary = "Call to an LLVM function.";
   let description = [{
     In LLVM IR, functions may return either 0 or 1 value. LLVM IR dialect
@@ -788,21 +788,16 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call",
       OptionalAttr<FlatSymbolRefAttr>:$callee,
       Variadic<LLVM_Type>:$callee_operands,
       DefaultValuedAttr<LLVM_FastmathFlagsAttr, "{}">:$fastmathFlags,
-      OptionalAttr<DenseI32ArrayAttr>:$branch_weights,
       DefaultValuedAttr<CConv, "CConv::C">:$CConv,
       DefaultValuedAttr<TailCallKind, "TailCallKind::None">:$TailCallKind,
       OptionalAttr<LLVM_MemoryEffectsAttr>:$memory_effects,
-      UnitAttr:$convergent,
-      UnitAttr:$no_unwind,
-      UnitAttr:$will_return,
+      UnitAttr:$convergent, UnitAttr:$no_unwind, UnitAttr:$will_return,
       VariadicOfVariadic<LLVM_Type, "op_bundle_sizes">:$op_bundle_operands,
       DenseI32ArrayAttr:$op_bundle_sizes,
       OptionalAttr<ArrayAttr>:$op_bundle_tags,
       OptionalAttr<DictArrayAttr>:$arg_attrs,
-      OptionalAttr<DictArrayAttr>:$res_attrs,
-      UnitAttr:$no_inline,
-      UnitAttr:$always_inline,
-      UnitAttr:$inline_hint);
+      OptionalAttr<DictArrayAttr>:$res_attrs, UnitAttr:$no_inline,
+      UnitAttr:$always_inline, UnitAttr:$inline_hint);
   // Append the aliasing related attributes defined in LLVM_MemAccessOpBase.
   let arguments = !con(args, aliasAttrs);
   let results = (outs Optional<LLVM_Type>:$result);
@@ -1047,11 +1042,12 @@ def LLVM_BrOp : LLVM_TerminatorOp<"br",
     LLVM_TerminatorPassthroughOpBuilder
   ];
 }
-def LLVM_CondBrOp : LLVM_TerminatorOp<"cond_br",
-    [AttrSizedOperandSegments,
-     DeclareOpInterfaceMethods<BranchOpInterface>,
-     DeclareOpInterfaceMethods<BranchWeightOpInterface>,
-     Pure]> {
+def LLVM_CondBrOp
+    : LLVM_TerminatorOp<
+          "cond_br", [AttrSizedOperandSegments,
+                      DeclareOpInterfaceMethods<BranchOpInterface>,
+                      DeclareOpInterfaceMethods<WeightedBranchOpInterface>,
+                      Pure]> {
   let arguments = (ins I1:$condition,
                    Variadic<LLVM_Type>:$trueDestOperands,
                    Variadic<LLVM_Type>:$falseDestOperands,
@@ -1136,11 +1132,12 @@ def LLVM_UnreachableOp : LLVM_TerminatorOp<"unreachable"> {
   }];
 }
 
-def LLVM_SwitchOp : LLVM_TerminatorOp<"switch",
-    [AttrSizedOperandSegments,
-     DeclareOpInterfaceMethods<BranchOpInterface>,
-     DeclareOpInterfaceMethods<BranchWeightOpInterface>,
-     Pure]> {
+def LLVM_SwitchOp
+    : LLVM_TerminatorOp<
+          "switch", [AttrSizedOperandSegments,
+                     DeclareOpInterfaceMethods<BranchOpInterface>,
+                     DeclareOpInterfaceMethods<WeightedBranchOpInterface>,
+                     Pure]> {
   let arguments = (ins
     AnySignlessInteger:$value,
     Variadic<AnyType>:$defaultOperands,
diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
index 7f6967f11444f..d63800c12d132 100644
--- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
+++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.h
@@ -142,6 +142,26 @@ LogicalResult verifyBranchSuccessorOperands(Operation *op, unsigned succNo,
                                             const SuccessorOperands &operands);
 } // namespace detail
 
+//===----------------------------------------------------------------------===//
+// WeightedBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+namespace detail {
+/// Verify that the branch weights attached to an operation
+/// implementing WeightedBranchOpInterface are correct.
+LogicalResult verifyBranchWeights(Operation *op);
+} // namespace detail
+
+//===----------------------------------------------------------------------===//
+// WeightedRegiobBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+namespace detail {
+/// Verify that the region weights attached to an operation
+/// implementing WeightedRegiobBranchOpInterface are correct.
+LogicalResult verifyRegionBranchWeights(Operation *op);
+} // namespace detail
+
 //===----------------------------------------------------------------------===//
 // RegionBranchOpInterface
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
index 69bce78e946c8..46ab0b9ebbc6b 100644
--- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
+++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
@@ -375,6 +375,118 @@ def SelectLikeOpInterface : OpInterface<"SelectLikeOpInterface"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// WeightedBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+def WeightedBranchOpInterface : OpInterface<"WeightedBranchOpInterface"> {
+  let description = [{
+    This interface provides weight information for branching terminator
+    operations, i.e. terminator operations with successors.
+
+    This interface provides methods for getting/setting integer non-negative
+    weight of each branch. The probability of executing a branch
+    is computed as the ratio between the branch's weight and the total
+    sum of the weights (which cannot be zero).
+    The weights are optional. If they are provided, then their number
+    must match the number of successors of the operation.
+
+    The default implementations of the methods expect the operation
+    to have an attribute of type DenseI32ArrayAttr named branch_weights.
+  }];
+  let cppNamespace = "::mlir";
+
+  let methods = [InterfaceMethod<
+                     /*desc=*/"Returns the branch weights",
+                     /*returnType=*/"::llvm::ArrayRef<int32_t>",
+                     /*methodName=*/"getWeights",
+                     /*args=*/(ins),
+                     /*methodBody=*/[{}],
+                     /*defaultImpl=*/[{
+        auto op = cast<ConcreteOp>(this->getOperation());
+        if (auto attr = op.getBranchWeightsAttr())
+          return attr.asArrayRef();
+        return {};
+      }]>,
+                 InterfaceMethod<
+                     /*desc=*/"Sets the branch weights",
+                     /*returnType=*/"void",
+                     /*methodName=*/"setWeights",
+                     /*args=*/(ins "::llvm::ArrayRef<int32_t>":$weights),
+                     /*methodBody=*/[{}],
+                     /*defaultImpl=*/[{
+        auto op = cast<ConcreteOp>(this->getOperation());
+        op.setBranchWeightsAttr(::mlir::DenseI32ArrayAttr::get(op->getContext(), weights));
+      }]>,
+  ];
+
+  let verify = [{
+    return ::mlir::detail::verifyBranchWeights($_op);
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// WeightedRegionBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+// TODO: the probabilities of entering a particular region seem
+// to correlate with the values returned by
+// RegionBranchOpInterface::invocationBounds(), and we should probably
+// verify that the values are consistent. In that case, should
+// WeightedRegionBranchOpInterface extend RegionBranchOpInterface?
+def WeightedRegionBranchOpInterface
+    : OpInterface<"WeightedRegionBranchOpInterface"> {
+  let description = [{
+    This interface provides weight information for region operations
+    that exhibit branching behavior between held regions.
+
+    This interface provides methods for getting/setting integer non-negative
+    weight of each branch. The probability of executing a region is computed
+    as the ratio between the region branch's weight and the total sum
+    of the weights (which cannot be zero).
+    The weights are optional. If they are provided, then their number
+    must match the number of regions held by the operation
+    (including empty regions).
+
+    The weights specify the probability of branching to a particular
+    region when first executing the operation.
+    For example, for loop-like operations with a single region
+    the weight specifies the probability of entering the loop.
+
+    The default implementations of the methods expect the operation
+    to have an attribute of type DenseI32ArrayAttr named branch_weights.
+  }];
+  let cppNamespace = "::mlir";
+
+  let methods = [InterfaceMethod<
+                     /*desc=*/"Returns the region weights",
+                     /*returnType=*/"::llvm::ArrayRef<int32_t>",
+                     /*methodName=*/"getWeights",
+                     /*args=*/(ins),
+                     /*methodBody=*/[{}],
+                     /*defaultImpl=*/[{
+        auto op = cast<ConcreteOp>(this->getOperation());
+        if (auto attr = op.getRegionWeightsAttr())
+          return attr.asArrayRef();
+        return {};
+      }]>,
+                 InterfaceMethod<
+                     /*desc=*/"Sets the region weights",
+                     /*returnType=*/"void",
+                     /*methodName=*/"setWeights",
+                     /*args=*/(ins "::llvm::ArrayRef<int32_t>":$weights),
+                     /*methodBody=*/[{}],
+                     /*defaultImpl=*/[{
+        auto op = cast<ConcreteOp>(this->getOperation());
+        op.setRegionWeightsAttr(::mlir::DenseI32ArrayAttr::get(op->getContext(), weights));
+      }]>,
+  ];
+
+  let verify = [{
+    return ::mlir::detail::verifyRegionBranchWeights($_op);
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // ControlFlow Traits
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index 97ae14aa0d6af..0f136c5c46d79 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -189,7 +189,7 @@ class ModuleTranslation {
                                   llvm::Instruction *inst);
 
   /// Sets LLVM profiling metadata for operations that have branch weights.
-  void setBranchWeightsMetadata(BranchWeightOpInterface op);
+  void setBranchWeightsMetadata(WeightedBranchOpInterface op);
 
   /// Sets LLVM loop metadata for branch operations that have a loop annotation
   /// attribute.
diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
index debfd003bd5b5..d31d7d801e149 100644
--- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
+++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
@@ -166,10 +166,15 @@ struct CondBranchOpLowering : public ConvertOpToLLVMPattern<cf::CondBranchOp> {
                           TypeRange(adaptor.getFalseDestOperands()));
     if (failed(convertedFalseBlock))
       return failure();
-    Operation *newOp = rewriter.replaceOpWithNewOp<LLVM::CondBrOp>(
+    auto newOp = rewriter.replaceOpWithNewOp<LLVM::CondBrOp>(
         op, adaptor.getCondition(), *convertedTrueBlock,
         adaptor.getTrueDestOperands(), *convertedFalseBlock,
         adaptor.getFalseDestOperands());
+    ArrayRef<int32_t> weights = op.getWeights();
+    if (!weights.empty()) {
+      newOp.setWeights(weights);
+      op.removeBranchWeightsAttr();
+    }
     // TODO: We should not just forward all attributes like that. But there are
     // existing Flang tests that depend on this behavior.
     newOp->setAttrs(op->getAttrDictionary());
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index c7528c970a4ba..a12aef0dfad38 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -589,10 +589,6 @@ LogicalResult SwitchOp::verify() {
            static_cast<int64_t>(getCaseDestinations().size())))
     return emitOpError("expects number of case values to match number of "
                        "case destinations");
-  if (getBranchWeights() && getBranchWeights()->size() != getNumSuccessors())
-    return emitError("expects number of branch weights to match number of "
-                     "successors: ")
-           << getBranchWeights()->size() << " vs " << getNumSuccessors();
   if (getCaseValues() &&
       getValue().getType() != getCaseValues()->getElementType())
     return emitError("expects case value type to match condition value type");
@@ -962,7 +958,6 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results,
   assert(callee && "expected non-null callee in direct call builder");
   build(builder, state, results,
         /*var_callee_type=*/nullptr, callee, args, /*fastmathFlags=*/nullptr,
-        /*branch_weights=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr,
         /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
@@ -992,7 +987,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), callee, args,
         /*fastmathFlags=*/nullptr,
-        /*branch_weights=*/nullptr, /*CConv=*/nullptr,
+        /*CConv=*/nullptr,
         /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr,
         /*no_unwind=*/nullptr, /*will_return=*/nullptr,
@@ -1009,7 +1004,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state,
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType),
         /*callee=*/nullptr, args,
-        /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
+        /*fastmathFlags=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
         /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
@@ -1025,7 +1020,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func,
   auto calleeType = func.getFunctionType();
   build(builder, state, getCallOpResultTypes(calleeType),
         getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), args,
-        /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr,
+        /*fastmathFlags=*/nullptr,
         /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr,
         /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr,
         /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{},
diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
index 2ae334b517a31..3a63db35eec0f 100644
--- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
+++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
@@ -9,6 +9,7 @@
 #include <utility>
 
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Interfaces/CallInterfaces.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "llvm/ADT/SmallPtrSet.h"
 
@@ -80,6 +81,51 @@ detail::verifyBranchSuccessorOperands(Operation *op, unsigned succNo,
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// WeightedBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+static LogicalResult verifyWeights(Operation *op,
+                                   llvm::ArrayRef<int32_t> weights,
+                                   std::size_t expectedWeightsNum,
+                                   llvm::StringRef weightAnchorName,
+                                   llvm::StringRef weightRefName) {
+  if (weights.empty())
+    return success();
+
+  if (weights.size() != expectedWeightsNum)
+    return op->emitError() << "expects number of " << weightAnchorName
+                           << " weights to match number of " << weightRefName
+                           << ": " << weights.size() << " vs "
+                           << expectedWeightsNum;
+
+  for (auto [index, weight] : llvm::enumerate(weights))
+    if (weight < 0)
+      return op->emitError() << "weight #" << index << " must be non-negative";
+
+  if (llvm::all_of(weights, [](int32_t value) { return value == 0; }))
+    return op->emitError() << "branch weights cannot all be zero";
+
+  return success();
+}
+
+LogicalResult detail::verifyBranchWeights(Operation *op) {
+  llvm::ArrayRef<int32_t> weights =
+      cast<WeightedBranchOpInterface>(op).getWeights();
+  return verifyWeights(op, weights, op->getNumSuccessors(), "branch",
+                       "successors");
+}
+
+//===----------------------------------------------------------------------===//
+// WeightedRegionBranchOpInterface
+//===----------------------------------------------------------------------===//
+
+LogicalResult detail::verifyRegionBranchWeights(Operation *op) {
+  llvm::ArrayRef<int32_t> weights =
+      cast<WeightedRegionBranchOpInterface>(op).getWeights();
+  return verifyWeights(op, weights, op->getNumRegions(), "region", "regions");
+}
+
 //===----------------------------------------------------------------------===//
 // RegionBranchOpInterface
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
index 1b5ce868b5c77..e67aa892afe09 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp
@@ -146,8 +146,15 @@ static LogicalResult setProfilingAttr(OpBuilder &builder, llvm::MDNode *node,
     branchWeights.push_back(branchWeight->getZExtValue());
   }
 
-  if (auto iface = dyn_cast<BranchWeightOpInterface>(op)) {
-    iface.setBranchWeights(builder.getDenseI32ArrayAttr(branchWeights));
+  if (auto iface = dyn_cast<WeightedBranchOpInterface>(op)) {
+    // LLVM allows attaching a single weight to call instructions.
+    // This is used for carrying the execution count information
+    // in PGO modes. MLIR WeightedBranchOpInterface does not allow this,
+    // so we drop the metadata in this case.
+    // LLVM should probably use the VP form of MD_prof metadata
+    // for such cases.
+    if (op->getNumSuccessors() != 0)
+      iface.setWeights(branchWeights);
     return success();
   }
   return failure();
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index e5ca147ea98f8..3eaa24eb5c95b 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1055,7 +1055,7 @@ LogicalResult ModuleTranslation::convertBlockImpl(Block &bb,
       return failure();
 
     // Set the branch weight metadata on the translated instruction.
-    if (auto iface = dyn_cast<BranchWeightOpInterface>(op))
+    if (auto iface = dyn_cast<WeightedBranchOpInterface>(op))
       setBranchWeightsMetadata(iface);
   }
 
@@ -2026,14 +2026,15 @@ void ModuleTranslation::setDereferenceableMetadata(
   inst->setMetadata(kindId, derefSizeNode);
 }
 
-void ModuleTranslation::setBranchWeightsMetadata(BranchWeightOpInterface op) {
-  DenseI32ArrayAttr weightsAttr = op.getBranchWeightsOrNull();
-  if (!weightsAttr)
+void ModuleTranslation::setBranchWeightsMetadata(WeightedBranchOpInterface op) {
+  SmallVector<uint32_t> weights;
+  llvm::transform(op.getWeights(), std::back_inserter(weights),
+                  [](int32_t value) { return static_cast<uint32_t>(value); });
+  if (weights.empty())
     return;
 
   llvm::Instruction *inst = isa<CallOp>(op) ? lookupCall(op) : lookupBranch(op);
   assert(inst && "expected the operation to have a mapping to an instruction");
-  SmallVector<uint32_t> weights(weightsAttr.asArrayRef());
   inst->setMetadata(
       llvm::LLVMContext::MD_prof,
       llvm::MDBuilder(getLLVMContext()).createBranchWeights(weights));
diff --git a/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir b/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir
index 9a0f2b7714544..7c78211d59010 100644
--- a/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir
+++ b/mlir/test/Conversion/ControlFlowToLLVM/branch.mlir
@@ -67,3 +67,17 @@ func.func @unreachable_block() {
 ^bb1(%arg0: index):
   cf.br ^bb1(%arg0 : index)
 }
+
+// -----
+
+// Test case for cf.cond_br with weights.
+
+// CHECK-LABEL:   func.func @cf_cond_br_with_weights(
+func.func @cf_cond_br_with_weights(%cond: i1, %a: index, %b: index) -> index {
+// CHECK:           llvm.cond_br %{{.*}} weights([90, 10]), ^bb1(%{{.*}} : i64), ^bb2(%{{.*}} : i64)
+  cf.cond_br %cond, ^bb1(%a : index), ^bb2(%b : index) {branch_weights = array<i32: 90, 10>}
+^bb1(%arg1: index):
+  return %arg1 : index
+^bb2(%arg2: index):
+  return %arg2 : index
+}
diff --git a/mlir/test/Dialect/ControlFlow/invalid.mlir b/mlir/test/Dialect/ControlFlow/invalid.mlir
index b51d8095c9974..1b8de22a9ff9f 100644
--- a/mlir/test/Dialect/ControlFlow/invalid.mlir
+++ b/mlir/test/Dialect/ControlFlow/invalid.mlir
@@ -67,3 +67,39 @@ func.func @switch_missing_default(%flag : i32, %caseOperand : i32) {
   ^bb3(%bb3arg : i32):
     return
 }
+
+// -----
+
+// CHECK-LABEL: func @wrong_weights_number
+func.func @wrong_weights_number(%cond: i1) {
+  // expected-error@+1 {{expects number of branch weights to match number of successors: 1 vs 2}}
+  cf.cond_br %cond weights([100]), ^bb1, ^bb2
+  ^bb1:
+    return
+  ^bb2:
+    return
+}
+
+// -----
+
+// CHECK-LABEL: func @negative_weight
+func.func @wrong_total_weight(%cond: i1) {
+  // expected-error@+1 {{weight #0 must be non-negative}}
+  cf.cond_br %cond weights([-1, 101]), ^bb1, ^bb2
+  ^bb1:
+    return
+  ^bb2:
+    return
+}
+
+// -----
+
+// CHECK-LABEL: func @zero_weights
+func.func @wrong_total_weight(%cond: i1) {
+  // expected-error@+1 {{branch weights cannot all be zero}}
+  cf.cond_br %cond weights([0, 0]), ^bb1, ^bb2
+  ^bb1:
+    return
+  ^bb2:
+    return
+}
diff --git a/mlir/test/Dialect/ControlFlow/ops.mlir b/mlir/test/Dialect/ControlFlow/ops.mlir
index c9317c7613972..160534240e0fa 100644
--- a/mlir/test/Dialect/ControlFlow/ops.mlir
+++ b/mlir/test/Dialect/ControlFlow/ops.mlir
@@ -51,3 +51,13 @@ func.func @switch_result_number(%arg0: i32) {
   ^bb2:
     return
 }
+
+// CHECK-LABEL: func @cond_weights
+func.func @cond_weights(%cond: i1) {
+// CHECK: cf.cond_br %{{.*}} weights([60, 40]), ^{{.*}}, ^{{.*}}
+  cf.cond_br %cond weights([60, 40]), ^bb1, ^bb2
+  ^bb1:
+    return
+  ^bb2:
+    return
+}
diff --git a/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll b/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll
index cc3b47a54dfe9..c623df0b605b2 100644
--- a/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll
+++ b/mlir/test/Target/LLVMIR/Import/metadata-profiling.ll
@@ -36,14 +36,17 @@ bbd:
 
 ; // -----
 
+; Verify that a single weight attached to a call is not translated.
+; The MLIR WeightedBranchOpInterface does not support this case.
+
 ; CHECK: llvm.func @fn()
-declare void @fn()
+declare i32 @fn()
 
 ; CHECK-LABEL: @call_branch_weights
-define void @call_branch_weights() {
-  ; CHECK:  llvm.call @fn() {branch_weights = array<i32: 42>}
-  call void @fn(), !prof !0
-  ret void
+define i32 @call_branch_weights() {
+  ; CHECK:  llvm.call @fn() : () -> i32
+  %1 = call i32 @fn(), !prof !0
+  ret i32 %1
 }
 
 !0 = !{!"branch_weights", i32 42}
diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
index 24a7b42557278..a8ef401fff27e 100644
--- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
@@ -448,3 +448,19 @@ llvm.mlir.global external constant @const() {addr_space = 0 : i32, dso_local} :
 }
 
 llvm.func extern_weak @extern_func()
+
+// -----
+
+llvm.func @invoke_branch_weights_callee()
+llvm.func @__gxx_personality_v0(...) -> i32
+
+llvm.func @invoke_branch_weights() -> i32 attributes {personality = @__gxx_personality_v0} {
+  %0 = llvm.mlir.constant(1 : i32) : i32
+  // expected-error @below{{expects number of branch weights to match number of successors: 1 vs 2}}
+  llvm.invoke @invoke_branch_weights_callee() to ^bb2 unwind ^bb1 {branch_weights = array<i32 : 42>} : () -> ()
+^bb1:  // pred: ^bb0
+  %1 = llvm.landingpad cleanup : !llvm.struct<(ptr, i32)>
+  llvm.br ^bb2
+^bb2:  // 2 preds: ^bb0, ^bb1
+  llvm.return %0 : i32
+}
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index 7742259e7a478..fc1993b50ba2d 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -1906,32 +1906,6 @@ llvm.func @cond_br_weights(%cond : i1, %arg0 : i32,  %arg1 : i32) -> i32 {
 
 // -----
 
-llvm.func @fn()
-
-// CHECK-LABEL: @call_branch_weights
-llvm.func @call_branch_weights() {
-  // CHECK: !prof ![[NODE:[0-9]+]]
-  llvm.call @fn() {branch_weights = array<i32 : 42>} : () -> ()
-  llvm.return
-}
-
-// CHECK: ![[NODE]] = !{!"branch_weights", i32 42}
-
-// -----
-
-llvm.func @fn() -> i32
-
-// CHECK-LABEL: @call_branch_weights
-llvm.func @call_branch_weights() {
-  // CHECK: !prof ![[NODE:[0-9]+]]
-  %res = llvm.call @fn() {branch_weights = array<i32 : 42>} : () -> i32
-  llvm.return
-}
-
-// CHECK: ![[NODE]] = !{!"branch_weights", i32 42}
-
-// -----
-
 llvm.func @foo()
 llvm.func @__gxx_personality_v0(...) -> i32
 

From 54015f36c682aab9024a21a93957312a69c5bc9b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:19:06 +0900
Subject: [PATCH 824/851] AMDGPU: Cost model for minimumnum/maximumnum
 (#141946)

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  19 +
 .../Analysis/CostModel/AMDGPU/maximumnum.ll   | 582 ++++++++++--------
 .../Analysis/CostModel/AMDGPU/minimumnum.ll   | 582 ++++++++++--------
 .../SLPVectorizer/AMDGPU/slp-v2f16.ll         |  40 ++
 4 files changed, 695 insertions(+), 528 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index ce2098a3a19bb..f3474fcbbfb56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -685,6 +685,8 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
   case Intrinsic::fma:
   case Intrinsic::fmuladd:
   case Intrinsic::copysign:
+  case Intrinsic::minimumnum:
+  case Intrinsic::maximumnum:
   case Intrinsic::canonicalize:
   // There's a small benefit to using vector ops in the legalized code.
   case Intrinsic::round:
@@ -742,6 +744,23 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     break;
   case Intrinsic::copysign:
     return NElts * getFullRateInstrCost();
+  case Intrinsic::minimumnum:
+  case Intrinsic::maximumnum: {
+    // Instruction + 2 canonicalizes. For cases that need type promotion, we the
+    // promotion takes the place of the canonicalize.
+    unsigned NumOps = 3;
+    if (const IntrinsicInst *II = ICA.getInst()) {
+      // Directly legal with ieee=0
+      // TODO: Not directly legal with strictfp
+      if (fpenvIEEEMode(*II) == KnownIEEEMode::Off)
+        NumOps = 1;
+    }
+
+    unsigned BaseRate =
+        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
+    InstRate = BaseRate * NumOps;
+    break;
+  }
   case Intrinsic::canonicalize: {
     InstRate =
         SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
index 5b158e3d8d674..a81cb63f0c51f 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/maximumnum.ll
@@ -11,75 +11,75 @@
 
 define void @maximumnum_f16() {
 ; GFX7-LABEL: 'maximumnum_f16'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'maximumnum_f16'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'maximumnum_f16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'maximumnum_f16'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'maximumnum_f16'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'maximumnum_f16'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'maximumnum_f16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'maximumnum_f16'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
@@ -92,77 +92,23 @@ define void @maximumnum_f16() {
 }
 
 define void @maximumnum_bf16() {
-; GFX7-LABEL: 'maximumnum_bf16'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX8-LABEL: 'maximumnum_bf16'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX9-LABEL: 'maximumnum_bf16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX12-LABEL: 'maximumnum_bf16'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX7-SIZE-LABEL: 'maximumnum_bf16'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX8-SIZE-LABEL: 'maximumnum_bf16'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX9-SIZE-LABEL: 'maximumnum_bf16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; ALL-LABEL: 'maximumnum_bf16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; GFX12-SIZE-LABEL: 'maximumnum_bf16'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SIZE-LABEL: 'maximumnum_bf16'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
   %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
@@ -175,21 +121,21 @@ define void @maximumnum_bf16() {
 
 define void @maximumnum_f32() {
 ; ALL-LABEL: 'maximumnum_f32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'maximumnum_f32'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
@@ -203,21 +149,21 @@ define void @maximumnum_f32() {
 
 define void @maximumnum_f64() {
 ; ALL-LABEL: 'maximumnum_f64'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'maximumnum_f64'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
@@ -231,75 +177,75 @@ define void @maximumnum_f64() {
 
 define void @maximumnum_f16_no_ieee() #0 {
 ; GFX7-LABEL: 'maximumnum_f16_no_ieee'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'maximumnum_f16_no_ieee'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'maximumnum_f16_no_ieee'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'maximumnum_f16_no_ieee'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'maximumnum_f16_no_ieee'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'maximumnum_f16_no_ieee'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'maximumnum_f16_no_ieee'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'maximumnum_f16_no_ieee'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.maximumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.maximumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.maximumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call half @llvm.maximumnum.f16(half poison, half poison)
@@ -313,75 +259,75 @@ define void @maximumnum_f16_no_ieee() #0 {
 
 define void @maximumnum_bf16_no_ieee() #0 {
 ; GFX7-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'maximumnum_bf16_no_ieee'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.maximumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.maximumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.maximumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = call bfloat @llvm.maximumnum.bf16(bfloat poison, bfloat poison)
@@ -394,23 +340,77 @@ define void @maximumnum_bf16_no_ieee() #0 {
 }
 
 define void @maximumnum_f32_no_ieee() #0 {
-; ALL-LABEL: 'maximumnum_f32_no_ieee'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX7-LABEL: 'maximumnum_f32_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'maximumnum_f32_no_ieee'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; GFX8-LABEL: 'maximumnum_f32_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_f32_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_f32_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_f32_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.maximumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.maximumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.maximumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.maximumnum.f32(float poison, float poison)
   %v2f32 = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> poison, <2 x float> poison)
@@ -422,23 +422,77 @@ define void @maximumnum_f32_no_ieee() #0 {
 }
 
 define void @maximumnum_f64_no_ieee() #0 {
-; ALL-LABEL: 'maximumnum_f64_no_ieee'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX7-LABEL: 'maximumnum_f64_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'maximumnum_f64_no_ieee'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; GFX8-LABEL: 'maximumnum_f64_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'maximumnum_f64_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'maximumnum_f64_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'maximumnum_f64_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.maximumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.maximumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.maximumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.maximumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.maximumnum.f64(double poison, double poison)
   %v2f64 = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> poison, <2 x double> poison)
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
index 97715cbab7d8a..b027ccc61266f 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/minimumnum.ll
@@ -11,75 +11,75 @@
 
 define void @minimumnum_f16() {
 ; GFX7-LABEL: 'minimumnum_f16'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'minimumnum_f16'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'minimumnum_f16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'minimumnum_f16'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'minimumnum_f16'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'minimumnum_f16'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'minimumnum_f16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'minimumnum_f16'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
@@ -92,77 +92,23 @@ define void @minimumnum_f16() {
 }
 
 define void @minimumnum_bf16() {
-; GFX7-LABEL: 'minimumnum_bf16'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX8-LABEL: 'minimumnum_bf16'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX9-LABEL: 'minimumnum_bf16'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX12-LABEL: 'minimumnum_bf16'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
-;
-; GFX7-SIZE-LABEL: 'minimumnum_bf16'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX8-SIZE-LABEL: 'minimumnum_bf16'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
-;
-; GFX9-SIZE-LABEL: 'minimumnum_bf16'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; ALL-LABEL: 'minimumnum_bf16'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; GFX12-SIZE-LABEL: 'minimumnum_bf16'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; SIZE-LABEL: 'minimumnum_bf16'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
   %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
@@ -175,21 +121,21 @@ define void @minimumnum_bf16() {
 
 define void @minimumnum_f32() {
 ; ALL-LABEL: 'minimumnum_f32'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'minimumnum_f32'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
@@ -203,21 +149,21 @@ define void @minimumnum_f32() {
 
 define void @minimumnum_f64() {
 ; ALL-LABEL: 'minimumnum_f64'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; ALL-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'minimumnum_f64'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
@@ -231,75 +177,75 @@ define void @minimumnum_f64() {
 
 define void @minimumnum_f16_no_ieee() #0 {
 ; GFX7-LABEL: 'minimumnum_f16_no_ieee'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'minimumnum_f16_no_ieee'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'minimumnum_f16_no_ieee'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'minimumnum_f16_no_ieee'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'minimumnum_f16_no_ieee'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'minimumnum_f16_no_ieee'
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'minimumnum_f16_no_ieee'
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'minimumnum_f16_no_ieee'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f16 = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> poison, <2 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f16 = call <3 x half> @llvm.minimumnum.v3f16(<3 x half> poison, <3 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v4f16 = call <4 x half> @llvm.minimumnum.v4f16(<4 x half> poison, <4 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v8f16 = call <8 x half> @llvm.minimumnum.v8f16(<8 x half> poison, <8 x half> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v16f16 = call <16 x half> @llvm.minimumnum.v16f16(<16 x half> poison, <16 x half> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f16 = call half @llvm.minimumnum.f16(half poison, half poison)
@@ -313,75 +259,75 @@ define void @minimumnum_f16_no_ieee() #0 {
 
 define void @minimumnum_bf16_no_ieee() #0 {
 ; GFX7-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX8-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX9-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX12-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 87 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-NEXT:  Cost Model: Found an estimated cost of 175 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; GFX7-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX8-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX9-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; GFX12-SIZE-LABEL: 'minimumnum_bf16_no_ieee'
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
-; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2bf16 = call <2 x bfloat> @llvm.minimumnum.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3bf16 = call <3 x bfloat> @llvm.minimumnum.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4bf16 = call <4 x bfloat> @llvm.minimumnum.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8bf16 = call <8 x bfloat> @llvm.minimumnum.v8bf16(<8 x bfloat> poison, <8 x bfloat> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16bf16 = call <16 x bfloat> @llvm.minimumnum.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison)
 ; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %bf16 = call bfloat @llvm.minimumnum.bf16(bfloat poison, bfloat poison)
@@ -394,23 +340,77 @@ define void @minimumnum_bf16_no_ieee() #0 {
 }
 
 define void @minimumnum_f32_no_ieee() #0 {
-; ALL-LABEL: 'minimumnum_f32_no_ieee'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX7-LABEL: 'minimumnum_f32_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'minimumnum_f32_no_ieee'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; GFX8-LABEL: 'minimumnum_f32_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_f32_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_f32_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_f32_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v3f32 = call <3 x float> @llvm.minimumnum.v3f32(<3 x float> poison, <3 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> poison, <4 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v8f32 = call <8 x float> @llvm.minimumnum.v8f32(<8 x float> poison, <8 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v16f32 = call <16 x float> @llvm.minimumnum.v16f32(<16 x float> poison, <16 x float> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f32 = call float @llvm.minimumnum.f32(float poison, float poison)
   %v2f32 = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> poison, <2 x float> poison)
@@ -422,23 +422,77 @@ define void @minimumnum_f32_no_ieee() #0 {
 }
 
 define void @minimumnum_f64_no_ieee() #0 {
-; ALL-LABEL: 'minimumnum_f64_no_ieee'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
-; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+; GFX7-LABEL: 'minimumnum_f64_no_ieee'
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX7-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
-; SIZE-LABEL: 'minimumnum_f64_no_ieee'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+; GFX8-LABEL: 'minimumnum_f64_no_ieee'
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX8-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX9-LABEL: 'minimumnum_f64_no_ieee'
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX9-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX12-LABEL: 'minimumnum_f64_no_ieee'
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 288 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 960 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX12-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; GFX7-SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX7-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX8-SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX8-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX9-SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX9-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX12-SIZE-LABEL: 'minimumnum_f64_no_ieee'
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v3f64 = call <3 x double> @llvm.minimumnum.v3f64(<3 x double> poison, <3 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = call <4 x double> @llvm.minimumnum.v4f64(<4 x double> poison, <4 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = call <8 x double> @llvm.minimumnum.v8f64(<8 x double> poison, <8 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 480 for instruction: %v16f64 = call <16 x double> @llvm.minimumnum.v16f64(<16 x double> poison, <16 x double> poison)
+; GFX12-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.minimumnum.f64(double poison, double poison)
   %v2f64 = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> poison, <2 x double> poison)
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
index 0c26bcb343bfc..f71fdbdee527b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/slp-v2f16.ll
@@ -360,3 +360,43 @@ bb:
   store half %tmp16, ptr addrspace(1) %tmp14, align 2
   ret void
 }
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @minimumnum_combine_v2f16
+; GFX8: call <2 x half> @llvm.minimumnum.v2f16
+; GFX9: call <2 x half> @llvm.minimumnum.v2f16
+define void @minimumnum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.minimumnum.f16(half %tmp3, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.minimumnum.f16(half %tmp7, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}
+
+; FIXME: Should not vectorize on gfx8
+; GCN-LABEL: @maximumnum_combine_v2f16
+; GFX8: call <2 x half> @llvm.maximumnum.v2f16
+; GFX9: call <2 x half> @llvm.maximumnum.v2f16
+define void @maximumnum_combine_v2f16(ptr addrspace(1) %arg) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp1
+  %tmp3 = load half, ptr addrspace(1) %tmp2, align 2
+  %tmp4 = call half @llvm.maximumnum.f16(half %tmp3, half 1.000000e+00)
+  store half %tmp4, ptr addrspace(1) %tmp2, align 2
+  %tmp5 = add nuw nsw i64 %tmp1, 1
+  %tmp6 = getelementptr inbounds half, ptr addrspace(1) %arg, i64 %tmp5
+  %tmp7 = load half, ptr addrspace(1) %tmp6, align 2
+  %tmp8 = call half @llvm.maximumnum.f16(half %tmp7, half 1.000000e+00)
+  store half %tmp8, ptr addrspace(1) %tmp6, align 2
+  ret void
+}

From f08474ab1fa984560565e917453a42bc8562a6f9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:21:55 +0900
Subject: [PATCH 825/851] AMDGPU: Add baseline cost model tests for special
 argument intrinsics (#141947)

---
 .../AMDGPU/special-argument-intrinsics.ll     | 202 ++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
new file mode 100644
index 0000000000000..ea045e04310be
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
@@ -0,0 +1,202 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL,UNPACKEDID %s
+; RUN: opt -passes='print<cost-model>' 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=ALL,PACKEDID %s
+
+; RUN: opt -passes='print<cost-model>' -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZE,SIZE-UNPACKEDID %s
+; RUN: opt -passes='print<cost-model>' -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=SIZE,SIZE-PACKEDID %s
+
+define i32 @workitem_id_x() {
+; ALL-LABEL: 'workitem_id_x'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workitem_id_x'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.x()
+  ret i32 %result
+}
+
+define amdgpu_kernel void @kernel_workitem_id_x(ptr addrspace(1) %ptr) {
+; ALL-LABEL: 'kernel_workitem_id_x'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'kernel_workitem_id_x'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.x()
+  store i32 %result, ptr addrspace(1) %ptr
+  ret void
+}
+
+define i32 @workitem_id_y() {
+; ALL-LABEL: 'workitem_id_y'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workitem_id_y'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.y()
+  ret i32 %result
+}
+
+define amdgpu_kernel void @kernel_workitem_id_y(ptr addrspace(1) %ptr) {
+; ALL-LABEL: 'kernel_workitem_id_y'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'kernel_workitem_id_y'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.y()
+  store i32 %result, ptr addrspace(1) %ptr
+  ret void
+}
+
+define i32 @workitem_id_z() {
+; ALL-LABEL: 'workitem_id_z'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workitem_id_z'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.y()
+  ret i32 %result
+}
+
+define amdgpu_kernel void @kernel_workitem_id_z(ptr addrspace(1) %ptr) {
+; ALL-LABEL: 'kernel_workitem_id_z'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
+; SIZE-LABEL: 'kernel_workitem_id_z'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.z()
+  store i32 %result, ptr addrspace(1) %ptr
+  ret void
+}
+
+define i32 @workgroup_id_x() {
+; ALL-LABEL: 'workgroup_id_x'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workgroup_id_x'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workgroup.id.x()
+  ret i32 %result
+}
+
+define i32 @workgroup_id_y() {
+; ALL-LABEL: 'workgroup_id_y'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workgroup_id_y'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workgroup.id.y()
+  ret i32 %result
+}
+
+define i32 @workgroup_id_z() {
+; ALL-LABEL: 'workgroup_id_z'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'workgroup_id_z'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.workgroup.id.y()
+  ret i32 %result
+}
+
+define i32 @lds_kernel_id() {
+; ALL-LABEL: 'lds_kernel_id'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
+;
+; SIZE-LABEL: 'lds_kernel_id'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
+;
+  %result = call i32 @llvm.amdgcn.lds.kernel.id()
+  ret i32 %result
+}
+
+define ptr addrspace(4) @dispatch_ptr() {
+; ALL-LABEL: 'dispatch_ptr'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
+;
+; SIZE-LABEL: 'dispatch_ptr'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
+;
+  %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  ret ptr addrspace(4) %result
+}
+
+define i64 @dispatch_id_() {
+; ALL-LABEL: 'dispatch_id_'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i64 %result
+;
+; SIZE-LABEL: 'dispatch_id_'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %result
+;
+  %result = call i64 @llvm.amdgcn.dispatch.id()
+  ret i64 %result
+}
+
+define ptr addrspace(4) @implicitarg_ptr() {
+; ALL-LABEL: 'implicitarg_ptr'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
+;
+; SIZE-LABEL: 'implicitarg_ptr'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
+;
+  %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  ret ptr addrspace(4) %result
+}
+
+define ptr addrspace(4) @queue_ptr() {
+; ALL-LABEL: 'queue_ptr'
+; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
+;
+; SIZE-LABEL: 'queue_ptr'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
+;
+  %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+  ret ptr addrspace(4) %result
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; PACKEDID: {{.*}}
+; SIZE-PACKEDID: {{.*}}
+; SIZE-UNPACKEDID: {{.*}}
+; UNPACKEDID: {{.*}}

From f3af1cd08cd456214961af915c17f858c9eef1a5 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 17 Jun 2025 16:24:50 -0700
Subject: [PATCH 826/851] [RISCV] Set the exact flag on the SRL created for
 converting vscale to a read of vlenb. (#144571)

We know that vlenb is a multiple of RVVBytesPerBlock so we aren't
shifting out any non-zero bits.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  13 +-
 .../CodeGen/RISCV/rvv/extract-subvector.ll    |  12 +-
 .../CodeGen/RISCV/rvv/get_vector_length.ll    |  24 +--
 .../CodeGen/RISCV/rvv/insert-subvector.ll     |  24 +--
 .../CodeGen/RISCV/rvv/legalize-load-sdnode.ll |  12 +-
 .../RISCV/rvv/legalize-store-sdnode.ll        |   6 +-
 llvm/test/CodeGen/RISCV/rvv/stepvector.ll     |  20 +--
 llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll   |  48 +++---
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |  22 +--
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  | 155 +++++++++---------
 .../RISCV/rvv/vreductions-fp-sdnode.ll        |  42 ++---
 11 files changed, 179 insertions(+), 199 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 33aae7ab16cca..e670567bd1844 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7353,20 +7353,25 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     uint64_t Val = Op.getConstantOperandVal(0);
     if (isPowerOf2_64(Val)) {
       uint64_t Log2 = Log2_64(Val);
-      if (Log2 < 3)
+      if (Log2 < 3) {
+        SDNodeFlags Flags;
+        Flags.setExact(true);
         Res = DAG.getNode(ISD::SRL, DL, XLenVT, Res,
-                          DAG.getConstant(3 - Log2, DL, VT));
-      else if (Log2 > 3)
+                          DAG.getConstant(3 - Log2, DL, XLenVT), Flags);
+      } else if (Log2 > 3) {
         Res = DAG.getNode(ISD::SHL, DL, XLenVT, Res,
                           DAG.getConstant(Log2 - 3, DL, XLenVT));
+      }
     } else if ((Val % 8) == 0) {
       // If the multiplier is a multiple of 8, scale it down to avoid needing
       // to shift the VLENB value.
       Res = DAG.getNode(ISD::MUL, DL, XLenVT, Res,
                         DAG.getConstant(Val / 8, DL, XLenVT));
     } else {
+      SDNodeFlags Flags;
+      Flags.setExact(true);
       SDValue VScale = DAG.getNode(ISD::SRL, DL, XLenVT, Res,
-                                   DAG.getConstant(3, DL, XLenVT));
+                                   DAG.getConstant(3, DL, XLenVT), Flags);
       Res = DAG.getNode(ISD::MUL, DL, XLenVT, VScale,
                         DAG.getConstant(Val, DL, XLenVT));
     }
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index 83637e4a71d45..d42c42c7ce036 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -290,8 +290,7 @@ define <vscale x 2 x i8> @extract_nxv32i8_nxv2i8_6(<vscale x 32 x i8> %vec) {
 ; CHECK-LABEL: extract_nxv32i8_nxv2i8_6:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
@@ -314,8 +313,7 @@ define <vscale x 2 x i8> @extract_nxv32i8_nxv2i8_22(<vscale x 32 x i8> %vec) {
 ; CHECK-LABEL: extract_nxv32i8_nxv2i8_22:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v10, a0
@@ -341,9 +339,9 @@ define <vscale x 1 x i8> @extract_nxv4i8_nxv1i8_3(<vscale x 4 x i8> %vec) {
 ; CHECK-LABEL: extract_nxv4i8_nxv1i8_3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v8, a0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
index bd0fecd285515..aea688f03cf72 100644
--- a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
@@ -257,9 +257,9 @@ define i32 @vector_length_vf3_i32(i32 zeroext %tc) {
 ; RV32-LABEL: vector_length_vf3_i32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    srli a2, a1, 3
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    bltu a0, a1, .LBB22_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
@@ -270,9 +270,9 @@ define i32 @vector_length_vf3_i32(i32 zeroext %tc) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    slli a2, a1, 1
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    srli a2, a1, 3
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    bltu a0, a1, .LBB22_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a0, a1
@@ -286,9 +286,9 @@ define i32 @vector_length_vf3_XLen(iXLen zeroext %tc) {
 ; RV32-LABEL: vector_length_vf3_XLen:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    srli a1, a1, 3
-; RV32-NEXT:    slli a2, a1, 1
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    srli a2, a1, 3
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    bltu a0, a1, .LBB23_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
@@ -299,9 +299,9 @@ define i32 @vector_length_vf3_XLen(iXLen zeroext %tc) {
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    sext.w a0, a0
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    srli a1, a1, 3
-; RV64-NEXT:    slli a2, a1, 1
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    srli a2, a1, 3
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    add a1, a1, a2
 ; RV64-NEXT:    bltu a0, a1, .LBB23_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a0, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index ca9cec921b3cd..61cf1f56aee3c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -78,12 +78,12 @@ define <vscale x 4 x i8> @insert_nxv1i8_nxv4i8_3(<vscale x 4 x i8> %vec, <vscale
 ; CHECK-LABEL: insert_nxv1i8_nxv4i8_3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v9, a1
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
   ret <vscale x 4 x i8> %v
@@ -309,12 +309,12 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_3(<vscale x 16 x i8> %vec, <vsc
 ; CHECK-LABEL: insert_nxv16i8_nxv1i8_3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, tu, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, tu, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
   ret <vscale x 16 x i8> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
index e9e1303d10768..f847ccafefdaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
@@ -8,9 +8,9 @@ define <vscale x 3 x i8> @load_nxv3i8(ptr %ptr) {
 ; CHECK-LABEL: load_nxv3i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    srli a2, a1, 3
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -22,9 +22,9 @@ define <vscale x 5 x half> @load_nxv5f16(ptr %ptr) {
 ; CHECK-LABEL: load_nxv5f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    slli a2, a1, 2
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    srli a2, a1, 3
+; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
index 77438ee53b634..03b84ec177ee9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
@@ -8,9 +8,9 @@ define void @store_nxv3i8(<vscale x 3 x i8> %val, ptr %ptr) {
 ; CHECK-LABEL: store_nxv3i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    srli a2, a1, 3
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vse8.v v8, (a0)
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
index d4e2c08d70d3d..95c1292e41927 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stepvector.ll
@@ -637,21 +637,21 @@ define <vscale x 16 x i64> @mul_bigimm_stepvector_nxv16i64() {
 ; RV32-NEXT:    lui a1, 797989
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    lui a3, 11557
-; RV32-NEXT:    lui a4, 92455
 ; RV32-NEXT:    addi a1, a1, -683
-; RV32-NEXT:    addi a3, a3, -683
+; RV32-NEXT:    srli a4, a2, 2
 ; RV32-NEXT:    sw a1, 8(sp)
 ; RV32-NEXT:    sw a0, 12(sp)
-; RV32-NEXT:    srli a0, a2, 3
-; RV32-NEXT:    addi a1, a4, -1368
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    mulhu a1, a0, a1
-; RV32-NEXT:    slli a3, a0, 1
-; RV32-NEXT:    slli a0, a0, 6
-; RV32-NEXT:    sub a0, a0, a3
+; RV32-NEXT:    slli a0, a2, 3
+; RV32-NEXT:    sub a0, a0, a4
+; RV32-NEXT:    lui a1, 92455
+; RV32-NEXT:    addi a3, a3, -683
+; RV32-NEXT:    mul a3, a2, a3
+; RV32-NEXT:    srli a2, a2, 3
+; RV32-NEXT:    addi a1, a1, -1368
+; RV32-NEXT:    mulhu a1, a2, a1
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    addi a1, sp, 8
-; RV32-NEXT:    sw a2, 0(sp)
+; RV32-NEXT:    sw a3, 0(sp)
 ; RV32-NEXT:    sw a0, 4(sp)
 ; RV32-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v8, (a1), zero
diff --git a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
index aef46e1f5cf1b..66e114c938c06 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vandn-sdnode.ll
@@ -2240,20 +2240,19 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-RV32-LABEL: vand_vx_loop_hoisted_not:
 ; CHECK-RV32:       # %bb.0: # %entry
 ; CHECK-RV32-NEXT:    csrr a4, vlenb
-; CHECK-RV32-NEXT:    srli a3, a4, 3
-; CHECK-RV32-NEXT:    li a2, 64
+; CHECK-RV32-NEXT:    srli a2, a4, 3
+; CHECK-RV32-NEXT:    li a3, 64
 ; CHECK-RV32-NEXT:    not a1, a1
-; CHECK-RV32-NEXT:    bgeu a2, a3, .LBB98_2
+; CHECK-RV32-NEXT:    bgeu a3, a2, .LBB98_2
 ; CHECK-RV32-NEXT:  # %bb.1:
 ; CHECK-RV32-NEXT:    li a3, 0
 ; CHECK-RV32-NEXT:    li a2, 0
 ; CHECK-RV32-NEXT:    j .LBB98_5
 ; CHECK-RV32-NEXT:  .LBB98_2: # %vector.ph
 ; CHECK-RV32-NEXT:    li a2, 0
-; CHECK-RV32-NEXT:    slli a3, a3, 2
-; CHECK-RV32-NEXT:    neg a3, a3
-; CHECK-RV32-NEXT:    andi a3, a3, 256
 ; CHECK-RV32-NEXT:    srli a4, a4, 1
+; CHECK-RV32-NEXT:    neg a3, a4
+; CHECK-RV32-NEXT:    andi a3, a3, 256
 ; CHECK-RV32-NEXT:    li a6, 0
 ; CHECK-RV32-NEXT:    li a5, 0
 ; CHECK-RV32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
@@ -2300,10 +2299,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-RV64-NEXT:    li a2, 0
 ; CHECK-RV64-NEXT:    j .LBB98_5
 ; CHECK-RV64-NEXT:  .LBB98_2: # %vector.ph
-; CHECK-RV64-NEXT:    slli a2, a2, 2
-; CHECK-RV64-NEXT:    negw a2, a2
-; CHECK-RV64-NEXT:    andi a2, a2, 256
 ; CHECK-RV64-NEXT:    srli a3, a4, 1
+; CHECK-RV64-NEXT:    negw a2, a3
+; CHECK-RV64-NEXT:    andi a2, a2, 256
 ; CHECK-RV64-NEXT:    slli a4, a4, 1
 ; CHECK-RV64-NEXT:    mv a5, a0
 ; CHECK-RV64-NEXT:    mv a6, a2
@@ -2335,19 +2333,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-NOZBB32-LABEL: vand_vx_loop_hoisted_not:
 ; CHECK-ZVKB-NOZBB32:       # %bb.0: # %entry
 ; CHECK-ZVKB-NOZBB32-NEXT:    csrr a4, vlenb
-; CHECK-ZVKB-NOZBB32-NEXT:    srli a3, a4, 3
-; CHECK-ZVKB-NOZBB32-NEXT:    li a2, 64
-; CHECK-ZVKB-NOZBB32-NEXT:    bgeu a2, a3, .LBB98_2
+; CHECK-ZVKB-NOZBB32-NEXT:    srli a2, a4, 3
+; CHECK-ZVKB-NOZBB32-NEXT:    li a3, 64
+; CHECK-ZVKB-NOZBB32-NEXT:    bgeu a3, a2, .LBB98_2
 ; CHECK-ZVKB-NOZBB32-NEXT:  # %bb.1:
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a3, 0
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a2, 0
 ; CHECK-ZVKB-NOZBB32-NEXT:    j .LBB98_5
 ; CHECK-ZVKB-NOZBB32-NEXT:  .LBB98_2: # %vector.ph
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a2, 0
-; CHECK-ZVKB-NOZBB32-NEXT:    slli a3, a3, 2
-; CHECK-ZVKB-NOZBB32-NEXT:    neg a3, a3
-; CHECK-ZVKB-NOZBB32-NEXT:    andi a3, a3, 256
 ; CHECK-ZVKB-NOZBB32-NEXT:    srli a4, a4, 1
+; CHECK-ZVKB-NOZBB32-NEXT:    neg a3, a4
+; CHECK-ZVKB-NOZBB32-NEXT:    andi a3, a3, 256
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a6, 0
 ; CHECK-ZVKB-NOZBB32-NEXT:    li a5, 0
 ; CHECK-ZVKB-NOZBB32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
@@ -2395,10 +2392,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-NOZBB64-NEXT:    li a2, 0
 ; CHECK-ZVKB-NOZBB64-NEXT:    j .LBB98_5
 ; CHECK-ZVKB-NOZBB64-NEXT:  .LBB98_2: # %vector.ph
-; CHECK-ZVKB-NOZBB64-NEXT:    slli a2, a2, 2
-; CHECK-ZVKB-NOZBB64-NEXT:    negw a2, a2
-; CHECK-ZVKB-NOZBB64-NEXT:    andi a2, a2, 256
 ; CHECK-ZVKB-NOZBB64-NEXT:    srli a3, a4, 1
+; CHECK-ZVKB-NOZBB64-NEXT:    negw a2, a3
+; CHECK-ZVKB-NOZBB64-NEXT:    andi a2, a2, 256
 ; CHECK-ZVKB-NOZBB64-NEXT:    slli a4, a4, 1
 ; CHECK-ZVKB-NOZBB64-NEXT:    mv a5, a0
 ; CHECK-ZVKB-NOZBB64-NEXT:    mv a6, a2
@@ -2431,19 +2427,18 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-ZBB32-LABEL: vand_vx_loop_hoisted_not:
 ; CHECK-ZVKB-ZBB32:       # %bb.0: # %entry
 ; CHECK-ZVKB-ZBB32-NEXT:    csrr a4, vlenb
-; CHECK-ZVKB-ZBB32-NEXT:    srli a3, a4, 3
-; CHECK-ZVKB-ZBB32-NEXT:    li a2, 64
-; CHECK-ZVKB-ZBB32-NEXT:    bgeu a2, a3, .LBB98_2
+; CHECK-ZVKB-ZBB32-NEXT:    srli a2, a4, 3
+; CHECK-ZVKB-ZBB32-NEXT:    li a3, 64
+; CHECK-ZVKB-ZBB32-NEXT:    bgeu a3, a2, .LBB98_2
 ; CHECK-ZVKB-ZBB32-NEXT:  # %bb.1:
 ; CHECK-ZVKB-ZBB32-NEXT:    li a3, 0
 ; CHECK-ZVKB-ZBB32-NEXT:    li a2, 0
 ; CHECK-ZVKB-ZBB32-NEXT:    j .LBB98_5
 ; CHECK-ZVKB-ZBB32-NEXT:  .LBB98_2: # %vector.ph
 ; CHECK-ZVKB-ZBB32-NEXT:    li a2, 0
-; CHECK-ZVKB-ZBB32-NEXT:    slli a3, a3, 2
-; CHECK-ZVKB-ZBB32-NEXT:    neg a3, a3
-; CHECK-ZVKB-ZBB32-NEXT:    andi a3, a3, 256
 ; CHECK-ZVKB-ZBB32-NEXT:    srli a4, a4, 1
+; CHECK-ZVKB-ZBB32-NEXT:    neg a3, a4
+; CHECK-ZVKB-ZBB32-NEXT:    andi a3, a3, 256
 ; CHECK-ZVKB-ZBB32-NEXT:    li a6, 0
 ; CHECK-ZVKB-ZBB32-NEXT:    li a5, 0
 ; CHECK-ZVKB-ZBB32-NEXT:    vsetvli a7, zero, e32, m2, ta, ma
@@ -2489,10 +2484,9 @@ define void @vand_vx_loop_hoisted_not(ptr %a, i32 noundef signext %mask) {
 ; CHECK-ZVKB-ZBB64-NEXT:    li a2, 0
 ; CHECK-ZVKB-ZBB64-NEXT:    j .LBB98_5
 ; CHECK-ZVKB-ZBB64-NEXT:  .LBB98_2: # %vector.ph
-; CHECK-ZVKB-ZBB64-NEXT:    slli a2, a2, 2
-; CHECK-ZVKB-ZBB64-NEXT:    negw a2, a2
-; CHECK-ZVKB-ZBB64-NEXT:    andi a2, a2, 256
 ; CHECK-ZVKB-ZBB64-NEXT:    srli a3, a4, 1
+; CHECK-ZVKB-ZBB64-NEXT:    negw a2, a3
+; CHECK-ZVKB-ZBB64-NEXT:    andi a2, a2, 256
 ; CHECK-ZVKB-ZBB64-NEXT:    slli a4, a4, 1
 ; CHECK-ZVKB-ZBB64-NEXT:    mv a5, a0
 ; CHECK-ZVKB-ZBB64-NEXT:    mv a6, a2
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index ca7f2563e4fc9..baace6d26f144 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -338,16 +338,14 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
 ; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v10, v9, a1
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
-; CHECK-NEXT:    slli a3, a1, 1
+; CHECK-NEXT:    add a3, a0, a0
+; CHECK-NEXT:    add a1, a4, a1
 ; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v10, v11, a2
 ; CHECK-NEXT:    vslideup.vx v8, v13, a2
-; CHECK-NEXT:    add a2, a0, a0
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v14, a3
-; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v14, a4
+; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs1r.v v8, (a0)
@@ -381,20 +379,18 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
 ; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    add a3, a1, a1
 ; CHECK-NEXT:    add a4, a2, a1
-; CHECK-NEXT:    slli a5, a1, 1
-; CHECK-NEXT:    add a6, a0, a0
+; CHECK-NEXT:    add a5, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v10, v9, a1
-; CHECK-NEXT:    add a5, a5, a1
 ; CHECK-NEXT:    vslideup.vx v8, v13, a1
+; CHECK-NEXT:    add a1, a4, a1
 ; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
 ; CHECK-NEXT:    vslideup.vx v10, v11, a2
-; CHECK-NEXT:    add a1, a5, a1
 ; CHECK-NEXT:    vslideup.vx v8, v14, a2
 ; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v12, a5
-; CHECK-NEXT:    vslideup.vx v8, v15, a5
-; CHECK-NEXT:    vsetvli zero, a6, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v12, a4
+; CHECK-NEXT:    vslideup.vx v8, v15, a4
+; CHECK-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs1r.v v8, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 6a08f5a28a295..75f92c86ff09f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -477,27 +477,26 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v8, v0
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a1
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    sub a0, a0, a1
-; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
-; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a1
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
 ; CHECK-NEXT:    vs8r.v v16, (a0)
 ; CHECK-NEXT:    vlseg4e8.v v8, (a0)
 ; CHECK-NEXT:    vmsne.vi v0, v8, 0
@@ -606,11 +605,9 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    srli a2, a0, 1
-; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a2, a0, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
@@ -836,39 +833,37 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    vmv1r.v v9, v0
 ; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
-; CHECK-NEXT:    srli a2, a0, 2
-; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
-; CHECK-NEXT:    srli a3, a0, 1
-; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
+; CHECK-NEXT:    srli a2, a0, 1
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v9, a3
-; CHECK-NEXT:    srli a3, a0, 3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    sub a0, a0, a3
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a2
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v26, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v9, a0
-; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vs8r.v v16, (a2)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v24, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a2
+; CHECK-NEXT:    vslidedown.vx v0, v8, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v28, v10, 1, v0
 ; CHECK-NEXT:    vs8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg6e8.v v16, (a1)
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg6e8.v v16, (a2)
 ; CHECK-NEXT:    vlseg6e8.v v10, (a0)
 ; CHECK-NEXT:    vmv2r.v v8, v16
 ; CHECK-NEXT:    vmv2r.v v22, v18
@@ -1068,36 +1063,35 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT:    vmv.v.i v12, 0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    srli a2, a0, 1
-; CHECK-NEXT:    srli a3, a0, 3
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v9, a2
-; CHECK-NEXT:    sub a0, a0, a3
-; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
+; CHECK-NEXT:    sub a2, a0, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v9, a0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v22, v12, 1, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a1
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v0, v8
 ; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a1
 ; CHECK-NEXT:    vmv1r.v v10, v15
-; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmerge.vim v24, v12, 1, v0
 ; CHECK-NEXT:    vmv1r.v v11, v24
-; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a2
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a0
 ; CHECK-NEXT:    vmv1r.v v8, v23
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vmv1r.v v9, v14
@@ -1339,49 +1333,48 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
 ; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
 ; CHECK-NEXT:    srli a1, a0, 2
-; CHECK-NEXT:    srli a2, a0, 1
-; CHECK-NEXT:    srli a3, a0, 3
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
-; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v9, a2
-; CHECK-NEXT:    sub a0, a0, a3
+; CHECK-NEXT:    sub a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a0, a0, 1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v9, a0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v24, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v8, a1
-; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vslidedown.vx v0, v9, a1
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v26, v10, 1, v0
+; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a2
-; CHECK-NEXT:    vs8r.v v16, (a1)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v28, v10, 1, v0
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v30, v10, 1, v0
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v8, a0
+; CHECK-NEXT:    vs8r.v v16, (a2)
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v28, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v24, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmerge.vim v30, v10, 1, v0
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v26, v10, 1, v0
 ; CHECK-NEXT:    vs8r.v v24, (a0)
-; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vlseg8e8.v v18, (a1)
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg8e8.v v18, (a2)
 ; CHECK-NEXT:    vlseg8e8.v v10, (a0)
 ; CHECK-NEXT:    vmv2r.v v8, v18
 ; CHECK-NEXT:    vmv2r.v v26, v20
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index 3da04eb7e6abe..78aae96242fd3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -887,9 +887,9 @@ define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv3f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -906,8 +906,7 @@ define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv6f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
@@ -925,8 +924,7 @@ define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv10f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v12, fa0
@@ -944,9 +942,8 @@ define half @vreduce_ord_fadd_nxv12f16(<vscale x 12 x half> %v, half %s) {
 ; CHECK-LABEL: vreduce_ord_fadd_nxv12f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v12, fa0
@@ -965,9 +962,9 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    lui a1, 1048568
 ; CHECK-NEXT:    vmv.s.x v10, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
@@ -984,8 +981,7 @@ define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    lui a1, 1048568
 ; CHECK-NEXT:    vmv.s.x v11, a1
@@ -1002,13 +998,12 @@ declare half @llvm.vector.reduce.fmin.nxv10f16(<vscale x 10 x half>)
 define half @vreduce_fmin_nxv10f16(<vscale x 10 x half> %v) {
 ; CHECK-LABEL: vreduce_fmin_nxv10f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    lui a1, %hi(.LCPI73_0)
-; CHECK-NEXT:    addi a1, a1, %lo(.LCPI73_0)
+; CHECK-NEXT:    lui a0, %hi(.LCPI73_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI73_0)
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT:    vle16.v v12, (a1)
-; CHECK-NEXT:    srli a1, a0, 3
-; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a1, a0, 2
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
 ; CHECK-NEXT:    vfredmin.vs v12, v8, v12
@@ -1024,9 +1019,8 @@ define half @vreduce_fmax_nxv12f16(<vscale x 12 x half> %v) {
 ; CHECK-LABEL: vreduce_fmax_nxv12f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub a0, a0, a1
 ; CHECK-NEXT:    li a1, -512
 ; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma

From a9811340b75baae8e06fb9ab83015a90d61510ee Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 08:24:58 +0900
Subject: [PATCH 827/851] AMDGPU: Report special input intrinsics as free
 (#141948)

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      | 23 +++++++-
 .../AMDGPU/special-argument-intrinsics.ll     | 56 +++++++++----------
 2 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index f3474fcbbfb56..d5a1aaef4ad68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -704,8 +704,29 @@ static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
 InstructionCost
 GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                   TTI::TargetCostKind CostKind) const {
-  if (ICA.getID() == Intrinsic::fabs)
+  switch (ICA.getID()) {
+  case Intrinsic::fabs:
+    // Free source modifier in the common case.
+    return 0;
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::amdgcn_workitem_id_z:
+    // TODO: If hasPackedTID, or if the calling context is not an entry point
+    // there may be a bit instruction.
+    return 0;
+  case Intrinsic::amdgcn_workgroup_id_x:
+  case Intrinsic::amdgcn_workgroup_id_y:
+  case Intrinsic::amdgcn_workgroup_id_z:
+  case Intrinsic::amdgcn_lds_kernel_id:
+  case Intrinsic::amdgcn_dispatch_ptr:
+  case Intrinsic::amdgcn_dispatch_id:
+  case Intrinsic::amdgcn_implicitarg_ptr:
+  case Intrinsic::amdgcn_queue_ptr:
+    // Read from an argument register.
     return 0;
+  default:
+    break;
+  }
 
   if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
     return BaseT::getIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
index ea045e04310be..00dbcff0a021f 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/special-argument-intrinsics.ll
@@ -7,11 +7,11 @@
 
 define i32 @workitem_id_x() {
 ; ALL-LABEL: 'workitem_id_x'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workitem_id_x'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workitem.id.x()
@@ -20,12 +20,12 @@ define i32 @workitem_id_x() {
 
 define amdgpu_kernel void @kernel_workitem_id_x(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: 'kernel_workitem_id_x'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'kernel_workitem_id_x'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.x()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -36,11 +36,11 @@ define amdgpu_kernel void @kernel_workitem_id_x(ptr addrspace(1) %ptr) {
 
 define i32 @workitem_id_y() {
 ; ALL-LABEL: 'workitem_id_y'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workitem_id_y'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workitem.id.y()
@@ -49,12 +49,12 @@ define i32 @workitem_id_y() {
 
 define amdgpu_kernel void @kernel_workitem_id_y(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: 'kernel_workitem_id_y'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'kernel_workitem_id_y'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -65,11 +65,11 @@ define amdgpu_kernel void @kernel_workitem_id_y(ptr addrspace(1) %ptr) {
 
 define i32 @workitem_id_z() {
 ; ALL-LABEL: 'workitem_id_z'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workitem_id_z'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workitem.id.y()
@@ -78,12 +78,12 @@ define i32 @workitem_id_z() {
 
 define amdgpu_kernel void @kernel_workitem_id_z(ptr addrspace(1) %ptr) {
 ; ALL-LABEL: 'kernel_workitem_id_z'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
 ; SIZE-LABEL: 'kernel_workitem_id_z'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workitem.id.z()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: store i32 %result, ptr addrspace(1) %ptr, align 4
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
@@ -94,11 +94,11 @@ define amdgpu_kernel void @kernel_workitem_id_z(ptr addrspace(1) %ptr) {
 
 define i32 @workgroup_id_x() {
 ; ALL-LABEL: 'workgroup_id_x'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workgroup_id_x'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.x()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -107,11 +107,11 @@ define i32 @workgroup_id_x() {
 
 define i32 @workgroup_id_y() {
 ; ALL-LABEL: 'workgroup_id_y'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workgroup_id_y'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -120,11 +120,11 @@ define i32 @workgroup_id_y() {
 
 define i32 @workgroup_id_z() {
 ; ALL-LABEL: 'workgroup_id_z'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'workgroup_id_z'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.workgroup.id.y()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -133,11 +133,11 @@ define i32 @workgroup_id_z() {
 
 define i32 @lds_kernel_id() {
 ; ALL-LABEL: 'lds_kernel_id'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i32 %result
 ;
 ; SIZE-LABEL: 'lds_kernel_id'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i32 @llvm.amdgcn.lds.kernel.id()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %result
 ;
   %result = call i32 @llvm.amdgcn.lds.kernel.id()
@@ -146,11 +146,11 @@ define i32 @lds_kernel_id() {
 
 define ptr addrspace(4) @dispatch_ptr() {
 ; ALL-LABEL: 'dispatch_ptr'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
 ;
 ; SIZE-LABEL: 'dispatch_ptr'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
 ;
   %result = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -159,11 +159,11 @@ define ptr addrspace(4) @dispatch_ptr() {
 
 define i64 @dispatch_id_() {
 ; ALL-LABEL: 'dispatch_id_'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret i64 %result
 ;
 ; SIZE-LABEL: 'dispatch_id_'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call i64 @llvm.amdgcn.dispatch.id()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i64 %result
 ;
   %result = call i64 @llvm.amdgcn.dispatch.id()
@@ -172,11 +172,11 @@ define i64 @dispatch_id_() {
 
 define ptr addrspace(4) @implicitarg_ptr() {
 ; ALL-LABEL: 'implicitarg_ptr'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
 ;
 ; SIZE-LABEL: 'implicitarg_ptr'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
 ;
   %result = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -185,11 +185,11 @@ define ptr addrspace(4) @implicitarg_ptr() {
 
 define ptr addrspace(4) @queue_ptr() {
 ; ALL-LABEL: 'queue_ptr'
-; ALL-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; ALL-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret ptr addrspace(4) %result
 ;
 ; SIZE-LABEL: 'queue_ptr'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret ptr addrspace(4) %result
 ;
   %result = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()

From 628274dadf92995f4544d6134cba45d327d9eaaa Mon Sep 17 00:00:00 2001
From: Sterling-Augustine
 <56981066+Sterling-Augustine@users.noreply.github.com>
Date: Tue, 17 Jun 2025 16:35:47 -0700
Subject: [PATCH 828/851] [NFC] Extract Printing portions of DWARFCFIProgram to
 new files (#143762)

CFIPrograms' most common uses are within debug frames, but it is not
their only use. For example, some assembly writers encode them by hand
into .cfi_escape directives. This PR extracts printing code for them
into its own files, which avoids the need for the main class to depend
on DWARFUnit, sections, and similar.

One in a series of NFC DebugInfo/DWARF refactoring changes to layer it
more cleanly, so that binary CFI parsing can be used from low-level
code, (such as byte strings created via .cfi_escape) without circular
dependencies. The final goal is to make a more limited dwarf library
usable from lower-level code.

More information can be found at
https://discourse.llvm.org/t/rfc-debuginfo-dwarf-refactor-into-to-lower-and-higher-level-libraries/86665
---
 .../llvm/DebugInfo/DWARF/DWARFCFIPrinter.h    |  28 ++++
 .../llvm/DebugInfo/DWARF/DWARFCFIProgram.h    |  62 ++++-----
 llvm/lib/DebugInfo/DWARF/CMakeLists.txt       |   1 +
 llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp  | 121 ++++++++++++++++++
 llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp  |  94 --------------
 llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp  |   6 +-
 llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h   |   5 +-
 7 files changed, 184 insertions(+), 133 deletions(-)
 create mode 100644 llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h
 create mode 100644 llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp

diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h
new file mode 100644
index 0000000000000..32e8247ac4c22
--- /dev/null
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIPrinter.h
@@ -0,0 +1,28 @@
+//===- DWARFCFIPrinter.h ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H
+#define LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H
+
+#include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h"
+
+namespace llvm {
+
+struct DIDumpOptions;
+
+namespace dwarf {
+
+void printCFIProgram(const CFIProgram &P, raw_ostream &OS,
+                     const DIDumpOptions &DumpOpts, unsigned IndentLevel,
+                     std::optional<uint64_t> Address);
+
+} // end namespace dwarf
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFCFIPRINTER_H
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h
index 24a0f389470db..ad7358c28f16b 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFCFIProgram.h
@@ -24,6 +24,7 @@
 namespace llvm {
 
 namespace dwarf {
+
 /// Represent a sequence of Call Frame Information instructions that, when read
 /// in order, construct a table mapping PC to frame state. This can also be
 /// referred to as "CFI rules" in DWARF literature to avoid confusion with
@@ -80,15 +81,37 @@ class CFIProgram {
   LLVM_ABI Error parse(DWARFDataExtractor Data, uint64_t *Offset,
                        uint64_t EndOffset);
 
-  LLVM_ABI void dump(raw_ostream &OS, DIDumpOptions DumpOpts,
-                     unsigned IndentLevel,
-                     std::optional<uint64_t> InitialLocation) const;
-
   void addInstruction(const Instruction &I) { Instructions.push_back(I); }
 
   /// Get a DWARF CFI call frame string for the given DW_CFA opcode.
   LLVM_ABI StringRef callFrameString(unsigned Opcode) const;
 
+  /// Types of operands to CFI instructions
+  /// In DWARF, this type is implicitly tied to a CFI instruction opcode and
+  /// thus this type doesn't need to be explicitly written to the file (this is
+  /// not a DWARF encoding). The relationship of instrs to operand types can
+  /// be obtained from getOperandTypes() and is only used to simplify
+  /// instruction printing and error messages.
+  enum OperandType {
+    OT_Unset,
+    OT_None,
+    OT_Address,
+    OT_Offset,
+    OT_FactoredCodeOffset,
+    OT_SignedFactDataOffset,
+    OT_UnsignedFactDataOffset,
+    OT_Register,
+    OT_AddressSpace,
+    OT_Expression
+  };
+
+  /// Get the OperandType as a "const char *".
+  static const char *operandTypeString(OperandType OT);
+
+  /// Retrieve the array describing the types of operands according to the enum
+  /// above. This is indexed by opcode.
+  static ArrayRef<OperandType[MaxOperands]> getOperandTypes();
+
 private:
   std::vector<Instruction> Instructions;
   const uint64_t CodeAlignmentFactor;
@@ -121,37 +144,6 @@ class CFIProgram {
     Instructions.back().Ops.push_back(Operand2);
     Instructions.back().Ops.push_back(Operand3);
   }
-
-  /// Types of operands to CFI instructions
-  /// In DWARF, this type is implicitly tied to a CFI instruction opcode and
-  /// thus this type doesn't need to be explicitly written to the file (this is
-  /// not a DWARF encoding). The relationship of instrs to operand types can
-  /// be obtained from getOperandTypes() and is only used to simplify
-  /// instruction printing.
-  enum OperandType {
-    OT_Unset,
-    OT_None,
-    OT_Address,
-    OT_Offset,
-    OT_FactoredCodeOffset,
-    OT_SignedFactDataOffset,
-    OT_UnsignedFactDataOffset,
-    OT_Register,
-    OT_AddressSpace,
-    OT_Expression
-  };
-
-  /// Get the OperandType as a "const char *".
-  static const char *operandTypeString(OperandType OT);
-
-  /// Retrieve the array describing the types of operands according to the enum
-  /// above. This is indexed by opcode.
-  static ArrayRef<OperandType[MaxOperands]> getOperandTypes();
-
-  /// Print \p Opcode's operand number \p OperandIdx which has value \p Operand.
-  void printOperand(raw_ostream &OS, DIDumpOptions DumpOpts,
-                    const Instruction &Instr, unsigned OperandIdx,
-                    uint64_t Operand, std::optional<uint64_t> &Address) const;
 };
 
 } // end namespace dwarf
diff --git a/llvm/lib/DebugInfo/DWARF/CMakeLists.txt b/llvm/lib/DebugInfo/DWARF/CMakeLists.txt
index cc9734f9f22be..86e74110b15ea 100644
--- a/llvm/lib/DebugInfo/DWARF/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/DWARF/CMakeLists.txt
@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMDebugInfoDWARF
   DWARFAbbreviationDeclaration.cpp
   DWARFAddressRange.cpp
   DWARFAcceleratorTable.cpp
+  DWARFCFIPrinter.cpp
   DWARFCFIProgram.cpp
   DWARFCompileUnit.cpp
   DWARFContext.cpp
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
new file mode 100644
index 0000000000000..e52f671e4fa1c
--- /dev/null
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIPrinter.cpp
@@ -0,0 +1,121 @@
+//===- DWARFCFIPrinter.cpp - Print the cfi-portions of .debug_frame -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <optional>
+
+using namespace llvm;
+using namespace dwarf;
+
+static void printRegister(raw_ostream &OS, const DIDumpOptions &DumpOpts,
+                          unsigned RegNum) {
+  if (DumpOpts.GetNameForDWARFReg) {
+    auto RegName = DumpOpts.GetNameForDWARFReg(RegNum, DumpOpts.IsEH);
+    if (!RegName.empty()) {
+      OS << RegName;
+      return;
+    }
+  }
+  OS << "reg" << RegNum;
+}
+
+/// Print \p Opcode's operand number \p OperandIdx which has value \p Operand.
+static void printOperand(raw_ostream &OS, const DIDumpOptions &DumpOpts,
+                         const CFIProgram &P,
+                         const CFIProgram::Instruction &Instr,
+                         unsigned OperandIdx, uint64_t Operand,
+                         std::optional<uint64_t> &Address) {
+  assert(OperandIdx < CFIProgram::MaxOperands);
+  uint8_t Opcode = Instr.Opcode;
+  CFIProgram::OperandType Type = P.getOperandTypes()[Opcode][OperandIdx];
+
+  switch (Type) {
+  case CFIProgram::OT_Unset: {
+    OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to";
+    auto OpcodeName = P.callFrameString(Opcode);
+    if (!OpcodeName.empty())
+      OS << " " << OpcodeName;
+    else
+      OS << format(" Opcode %x", Opcode);
+    break;
+  }
+  case CFIProgram::OT_None:
+    break;
+  case CFIProgram::OT_Address:
+    OS << format(" %" PRIx64, Operand);
+    Address = Operand;
+    break;
+  case CFIProgram::OT_Offset:
+    // The offsets are all encoded in a unsigned form, but in practice
+    // consumers use them signed. It's most certainly legacy due to
+    // the lack of signed variants in the first Dwarf standards.
+    OS << format(" %+" PRId64, int64_t(Operand));
+    break;
+  case CFIProgram::OT_FactoredCodeOffset: // Always Unsigned
+    if (P.codeAlign())
+      OS << format(" %" PRId64, Operand * P.codeAlign());
+    else
+      OS << format(" %" PRId64 "*code_alignment_factor", Operand);
+    if (Address && P.codeAlign()) {
+      *Address += Operand * P.codeAlign();
+      OS << format(" to 0x%" PRIx64, *Address);
+    }
+    break;
+  case CFIProgram::OT_SignedFactDataOffset:
+    if (P.dataAlign())
+      OS << format(" %" PRId64, int64_t(Operand) * P.dataAlign());
+    else
+      OS << format(" %" PRId64 "*data_alignment_factor", int64_t(Operand));
+    break;
+  case CFIProgram::OT_UnsignedFactDataOffset:
+    if (P.dataAlign())
+      OS << format(" %" PRId64, Operand * P.dataAlign());
+    else
+      OS << format(" %" PRId64 "*data_alignment_factor", Operand);
+    break;
+  case CFIProgram::OT_Register:
+    OS << ' ';
+    printRegister(OS, DumpOpts, Operand);
+    break;
+  case CFIProgram::OT_AddressSpace:
+    OS << format(" in addrspace%" PRId64, Operand);
+    break;
+  case CFIProgram::OT_Expression:
+    assert(Instr.Expression && "missing DWARFExpression object");
+    OS << " ";
+    DWARFExpressionPrinter::print(&Instr.Expression.value(), OS, DumpOpts,
+                                  nullptr);
+    break;
+  }
+}
+
+void llvm::dwarf::printCFIProgram(const CFIProgram &P, raw_ostream &OS,
+                                  const DIDumpOptions &DumpOpts,
+                                  unsigned IndentLevel,
+                                  std::optional<uint64_t> Address) {
+  for (const auto &Instr : P) {
+    uint8_t Opcode = Instr.Opcode;
+    OS.indent(2 * IndentLevel);
+    OS << P.callFrameString(Opcode) << ":";
+    for (size_t i = 0; i < Instr.Ops.size(); ++i)
+      printOperand(OS, DumpOpts, P, Instr, i, Instr.Ops[i], Address);
+    OS << '\n';
+  }
+}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
index 8d25599627c4a..365b26b98a1e3 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCFIProgram.cpp
@@ -23,18 +23,6 @@
 using namespace llvm;
 using namespace dwarf;
 
-static void printRegister(raw_ostream &OS, DIDumpOptions DumpOpts,
-                          unsigned RegNum) {
-  if (DumpOpts.GetNameForDWARFReg) {
-    auto RegName = DumpOpts.GetNameForDWARFReg(RegNum, DumpOpts.IsEH);
-    if (!RegName.empty()) {
-      OS << RegName;
-      return;
-    }
-  }
-  OS << "reg" << RegNum;
-}
-
 // See DWARF standard v3, section 7.23
 const uint8_t DWARF_CFI_PRIMARY_OPCODE_MASK = 0xc0;
 const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f;
@@ -361,85 +349,3 @@ CFIProgram::getOperandTypes() {
 
   return ArrayRef<OperandType[MaxOperands]>(&OpTypes[0], DW_CFA_restore + 1);
 }
-
-/// Print \p Opcode's operand number \p OperandIdx which has value \p Operand.
-void CFIProgram::printOperand(raw_ostream &OS, DIDumpOptions DumpOpts,
-                              const Instruction &Instr, unsigned OperandIdx,
-                              uint64_t Operand,
-                              std::optional<uint64_t> &Address) const {
-  assert(OperandIdx < MaxOperands);
-  uint8_t Opcode = Instr.Opcode;
-  OperandType Type = getOperandTypes()[Opcode][OperandIdx];
-
-  switch (Type) {
-  case OT_Unset: {
-    OS << " Unsupported " << (OperandIdx ? "second" : "first") << " operand to";
-    auto OpcodeName = callFrameString(Opcode);
-    if (!OpcodeName.empty())
-      OS << " " << OpcodeName;
-    else
-      OS << format(" Opcode %x", Opcode);
-    break;
-  }
-  case OT_None:
-    break;
-  case OT_Address:
-    OS << format(" %" PRIx64, Operand);
-    Address = Operand;
-    break;
-  case OT_Offset:
-    // The offsets are all encoded in a unsigned form, but in practice
-    // consumers use them signed. It's most certainly legacy due to
-    // the lack of signed variants in the first Dwarf standards.
-    OS << format(" %+" PRId64, int64_t(Operand));
-    break;
-  case OT_FactoredCodeOffset: // Always Unsigned
-    if (CodeAlignmentFactor)
-      OS << format(" %" PRId64, Operand * CodeAlignmentFactor);
-    else
-      OS << format(" %" PRId64 "*code_alignment_factor", Operand);
-    if (Address && CodeAlignmentFactor) {
-      *Address += Operand * CodeAlignmentFactor;
-      OS << format(" to 0x%" PRIx64, *Address);
-    }
-    break;
-  case OT_SignedFactDataOffset:
-    if (DataAlignmentFactor)
-      OS << format(" %" PRId64, int64_t(Operand) * DataAlignmentFactor);
-    else
-      OS << format(" %" PRId64 "*data_alignment_factor", int64_t(Operand));
-    break;
-  case OT_UnsignedFactDataOffset:
-    if (DataAlignmentFactor)
-      OS << format(" %" PRId64, Operand * DataAlignmentFactor);
-    else
-      OS << format(" %" PRId64 "*data_alignment_factor", Operand);
-    break;
-  case OT_Register:
-    OS << ' ';
-    printRegister(OS, DumpOpts, Operand);
-    break;
-  case OT_AddressSpace:
-    OS << format(" in addrspace%" PRId64, Operand);
-    break;
-  case OT_Expression:
-    assert(Instr.Expression && "missing DWARFExpression object");
-    OS << " ";
-    DWARFExpressionPrinter::print(&Instr.Expression.value(), OS, DumpOpts,
-                                  nullptr);
-    break;
-  }
-}
-
-void CFIProgram::dump(raw_ostream &OS, DIDumpOptions DumpOpts,
-                      unsigned IndentLevel,
-                      std::optional<uint64_t> Address) const {
-  for (const auto &Instr : Instructions) {
-    uint8_t Opcode = Instr.Opcode;
-    OS.indent(2 * IndentLevel);
-    OS << callFrameString(Opcode) << ":";
-    for (unsigned i = 0; i < Instr.Ops.size(); ++i)
-      printOperand(OS, DumpOpts, Instr, i, Instr.Ops[i], Address);
-    OS << '\n';
-  }
-}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index c46b14b4446f7..9dff925073dbe 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h"
 #include "llvm/DebugInfo/DWARF/DWARFCFIProgram.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
 #include "llvm/DebugInfo/DWARF/DWARFExpression.h"
@@ -602,7 +603,8 @@ void CIE::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
     OS << "\n";
   }
   OS << "\n";
-  CFIs.dump(OS, DumpOpts, /*IndentLevel=*/1, /*InitialLocation=*/{});
+  printCFIProgram(CFIs, OS, DumpOpts, /*IndentLevel=*/1,
+                  /*InitialLocation=*/{});
   OS << "\n";
 
   if (Expected<UnwindTable> RowsOrErr = UnwindTable::create(this))
@@ -630,7 +632,7 @@ void FDE::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
   OS << "  Format:       " << FormatString(IsDWARF64) << "\n";
   if (LSDAAddress)
     OS << format("  LSDA Address: %016" PRIx64 "\n", *LSDAAddress);
-  CFIs.dump(OS, DumpOpts, /*IndentLevel=*/1, InitialLocation);
+  printCFIProgram(CFIs, OS, DumpOpts, /*IndentLevel=*/1, InitialLocation);
   OS << "\n";
 
   if (Expected<UnwindTable> RowsOrErr = UnwindTable::create(this))
diff --git a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
index 94a44e3afccb4..85c4165de4aa9 100644
--- a/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
+++ b/llvm/tools/llvm-readobj/DwarfCFIEHPrinter.h
@@ -12,6 +12,7 @@
 #include "llvm-readobj.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFCFIPrinter.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
@@ -228,8 +229,8 @@ void PrinterContext<ELFT>::printEHFrame(const Elf_Shdr *EHFrameShdr) const {
     W.indent();
     auto DumpOpts = DIDumpOptions();
     DumpOpts.IsEH = true;
-    Entry.cfis().dump(W.getOStream(), DumpOpts, W.getIndentLevel(),
-                      InitialLocation);
+    printCFIProgram(Entry.cfis(), W.getOStream(), DumpOpts, W.getIndentLevel(),
+                    InitialLocation);
     W.unindent();
     W.unindent();
     W.getOStream() << "\n";

From a871b919ed135b3b50db58ed816d6ddb488d9c5e Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 23:27:25 +0000
Subject: [PATCH 829/851] [gn build] Port 9e0186d925f0

---
 llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
index cd7d0671fbe71..7338fb159419a 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Frontend/BUILD.gn
@@ -14,6 +14,7 @@ unittest("LLVMFrontendTests") {
   ]
   sources = [
     "HLSLRootSignatureDumpTest.cpp",
+    "HLSLRootSignatureRangesTest.cpp",
     "OpenACCTest.cpp",
     "OpenMPCompositionTest.cpp",
     "OpenMPContextTest.cpp",

From 535291409cc7e4ae571318a38bd3617d7f608002 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 23:27:26 +0000
Subject: [PATCH 830/851] [gn build] Port 9ec75a50bc48

---
 .../gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn      | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn
index a10a0d5637e95..87c02122e0f63 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/Mips/MCTargetDesc/BUILD.gn
@@ -67,7 +67,6 @@ static_library("MCTargetDesc") {
     "MipsInstPrinter.cpp",
     "MipsMCAsmInfo.cpp",
     "MipsMCCodeEmitter.cpp",
-    "MipsMCExpr.cpp",
     "MipsMCTargetDesc.cpp",
     "MipsNaClELFStreamer.cpp",
     "MipsOptionRecord.cpp",

From 6652961ae5fee4d81871e4310a9e842c61136c10 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 23:34:15 +0000
Subject: [PATCH 831/851] [gn build] Manually port 556e69b7

---
 llvm/utils/gn/secondary/lldb/test/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
index 15b57f7d85fc7..6dcce2db37964 100644
--- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
@@ -92,6 +92,7 @@ write_lit_cfg("lit_api_site_cfg") {
     "LLDB_FRAMEWORK_DIR=XXX_framework_dir",
     "CMAKE_CXX_COMPILER=c++",  # XXX use bin/clang++ instead?
     "HOST_OS=$host_os",  # XXX
+    "Python3_ROOT_DIR=",  # FIXME
   ]
 
   if (is_debug) {

From b164d3613ad9b86a8b951cfc43fadc0edfc7644e Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 17 Jun 2025 23:42:47 +0000
Subject: [PATCH 832/851] [gn build] Port 628274dadf92

---
 llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn
index fc071e5471d01..cb46f7cf55fe0 100644
--- a/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/DebugInfo/DWARF/BUILD.gn
@@ -11,6 +11,7 @@ static_library("DWARF") {
     "DWARFAbbreviationDeclaration.cpp",
     "DWARFAcceleratorTable.cpp",
     "DWARFAddressRange.cpp",
+    "DWARFCFIPrinter.cpp",
     "DWARFCFIProgram.cpp",
     "DWARFCompileUnit.cpp",
     "DWARFContext.cpp",

From f2d2c99866dfd133e7b9c98b1d4983c6bce33d67 Mon Sep 17 00:00:00 2001
From: Eli Friedman <efriedma@quicinc.com>
Date: Tue, 17 Jun 2025 16:43:55 -0700
Subject: [PATCH 833/851] [clang] Remove separate evaluation step for static
 class member init. (#142713)

We already evaluate the initializers for all global variables, as
required by the standard. Leverage that evaluation instead of trying to
separately validate static class members.

This has a few benefits:

- Improved diagnostics; we now get notes explaining what failed to
evaluate.
- Improved correctness: is_constant_evaluated is handled correctly.

The behavior follows the proposed resolution for CWG1721.

Fixes #88462. Fixes #99680.
---
 clang/lib/Sema/SemaDecl.cpp                   | 39 +++++++++----------
 .../SemaCXX/builtin-is-constant-evaluated.cpp | 14 +++++++
 clang/test/SemaCXX/class.cpp                  | 28 ++++++++-----
 clang/test/SemaCXX/cxx0x-class.cpp            | 11 ++++--
 clang/test/SemaCXX/cxx2a-consteval.cpp        |  8 ++--
 .../SemaTemplate/instantiate-static-var.cpp   | 10 +++--
 6 files changed, 67 insertions(+), 43 deletions(-)

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 02ac898a2b702..1bf72e5bb7b9d 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -13963,31 +13963,10 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) {
 
     // We allow integer constant expressions in all cases.
     } else if (DclT->isIntegralOrEnumerationType()) {
-      // Check whether the expression is a constant expression.
-      SourceLocation Loc;
       if (getLangOpts().CPlusPlus11 && DclT.isVolatileQualified())
         // In C++11, a non-constexpr const static data member with an
         // in-class initializer cannot be volatile.
         Diag(VDecl->getLocation(), diag::err_in_class_initializer_volatile);
-      else if (Init->isValueDependent())
-        ; // Nothing to check.
-      else if (Init->isIntegerConstantExpr(Context, &Loc))
-        ; // Ok, it's an ICE!
-      else if (Init->getType()->isScopedEnumeralType() &&
-               Init->isCXX11ConstantExpr(Context))
-        ; // Ok, it is a scoped-enum constant expression.
-      else if (Init->isEvaluatable(Context)) {
-        // If we can constant fold the initializer through heroics, accept it,
-        // but report this as a use of an extension for -pedantic.
-        Diag(Loc, diag::ext_in_class_initializer_non_constant)
-          << Init->getSourceRange();
-      } else {
-        // Otherwise, this is some crazy unknown case.  Report the issue at the
-        // location provided by the isIntegerConstantExpr failed check.
-        Diag(Loc, diag::err_in_class_initializer_non_constant)
-          << Init->getSourceRange();
-        VDecl->setInvalidDecl();
-      }
 
     // We allow foldable floating-point constants as an extension.
     } else if (DclT->isFloatingType()) { // also permits complex, which is ok
@@ -14715,6 +14694,17 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
       // Compute and cache the constant value, and remember that we have a
       // constant initializer.
       if (HasConstInit) {
+        if (var->isStaticDataMember() && !var->isInline() &&
+            var->getLexicalDeclContext()->isRecord() &&
+            type->isIntegralOrEnumerationType()) {
+          // In C++98, in-class initialization for a static data member must
+          // be an integer constant expression.
+          SourceLocation Loc;
+          if (!Init->isIntegerConstantExpr(Context, &Loc)) {
+            Diag(Loc, diag::ext_in_class_initializer_non_constant)
+                << Init->getSourceRange();
+          }
+        }
         (void)var->checkForConstantInitialization(Notes);
         Notes.clear();
       } else if (CacheCulprit) {
@@ -14750,6 +14740,13 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
           << Attr->getRange() << Attr->isConstinit();
       for (auto &it : Notes)
         Diag(it.first, it.second);
+    } else if (var->isStaticDataMember() && !var->isInline() &&
+               var->getLexicalDeclContext()->isRecord()) {
+      Diag(var->getLocation(), diag::err_in_class_initializer_non_constant)
+          << Init->getSourceRange();
+      for (auto &it : Notes)
+        Diag(it.first, it.second);
+      var->setInvalidDecl();
     } else if (IsGlobal &&
                !getDiagnostics().isIgnored(diag::warn_global_constructor,
                                            var->getLocation())) {
diff --git a/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp b/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp
index c775fe71069df..66981acf87a8a 100644
--- a/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp
+++ b/clang/test/SemaCXX/builtin-is-constant-evaluated.cpp
@@ -154,3 +154,17 @@ namespace narrowing {
     // expected-note {{insert an explicit cast to silence this issue}}
   }
 }
+
+struct GH99680 {
+  static const int x1 = 1/(1-__builtin_is_constant_evaluated()); // expected-error {{in-class initializer for static data member is not a constant expression}} \
+    // expected-note {{division by zero}}
+  static const int x2 = __builtin_is_constant_evaluated();
+  static_assert(x2 == 1);
+  static const float x3 = 1/(1-__builtin_is_constant_evaluated());  // expected-error {{in-class initializer for static data member of type 'const float' requires 'constexpr' specifier}} \
+  // expected-note {{add 'constexpr'}} \
+  // expected-error {{in-class initializer for static data member is not a constant expression}} \
+  // expected-note {{division by zero}}
+  static const float x4 = __builtin_is_constant_evaluated(); // expected-error {{in-class initializer for static data member of type 'const float' requires 'constexpr' specifier}} \
+  // expected-note {{add 'constexpr'}}
+  static_assert(fold(x4 == 1));
+};
diff --git a/clang/test/SemaCXX/class.cpp b/clang/test/SemaCXX/class.cpp
index 2f59544e7f36c..f1e02d5158aac 100644
--- a/clang/test/SemaCXX/class.cpp
+++ b/clang/test/SemaCXX/class.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 -Wc++11-compat %s
-// RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s -std=c++98
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98 -Wc++11-compat %s -std=c++98
 class C {
 public:
   auto int errx; // expected-error {{storage class specified for a member declaration}}
@@ -32,7 +32,7 @@ class C {
   int : 1, : 2;
   typedef int E : 1; // expected-error {{typedef member 'E' cannot be a bit-field}}
   static int sb : 1; // expected-error {{static member 'sb' cannot be a bit-field}}
-  static int vs;
+  static int vs; // cxx11-note {{declared here}}
 
   typedef int func();
   func tm;
@@ -48,20 +48,28 @@ class C {
 #endif
   static int si = 0; // expected-error {{non-const static data member must be initialized out of line}}
   static const NestedC ci = 0; // expected-error {{static data member of type 'const NestedC' must be initialized out of line}}
-  static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}}
+  static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}} \
+  // cxx11-note {{read of non-const variable 'vs' is not allowed in a constant expression}} \
+  // cxx98-note {{subexpression not valid in a constant expression}}
   static const int vi = 0;
   static const volatile int cvi = 0; // ok, illegal in C++11
 #if __cplusplus >= 201103L
   // expected-error@-2 {{static const volatile data member must be initialized out of line}}
 #endif
   static const E evi = 0;
-  static const int overflow = 1000000*1000000; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-                                               // expected-warning@-1 {{overflow in expression}}
-  static const int overflow_shift = 1<<32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-  static const int overflow_shift2 = 1>>32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-  static const int overflow_shift3 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-  static const int overflow_shift4 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
-  static const int overflow_shift5 = -1<<1; // cxx11-error {{in-class initializer for static data member is not a constant expression}}
+  static const int overflow = 1000000*1000000; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                               // cxx11-note {{value 1000000000000 is outside the range of representable values of type 'int'}} \
+                                               // expected-warning {{overflow in expression}}
+  static const int overflow_shift = 1<<32; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                           // cxx11-note {{shift count 32 >= width of type 'int' (32 bits)}}
+  static const int overflow_shift2 = 1>>32; // cxx11-error {{in-class initializer for static data member is not a constant expression}}\
+                                            // cxx11-note {{shift count 32 >= width of type 'int' (32 bits)}}
+  static const int overflow_shift3 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                            // cxx11-note {{negative shift count -1}}
+  static const int overflow_shift4 = 1<<-1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                            // cxx11-note {{negative shift count -1}}
+  static const int overflow_shift5 = -1<<1; // cxx11-error {{in-class initializer for static data member is not a constant expression}} \
+                                            // cxx11-note {{left shift of negative value -1}}
 
   void m() {
     sx = 0;
diff --git a/clang/test/SemaCXX/cxx0x-class.cpp b/clang/test/SemaCXX/cxx0x-class.cpp
index a612a5c07e6ed..4b54221cceff2 100644
--- a/clang/test/SemaCXX/cxx0x-class.cpp
+++ b/clang/test/SemaCXX/cxx0x-class.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -Wno-uninitialized -fsyntax-only -verify -std=c++11 -Wno-error=static-float-init %s 
 
-int vs = 0;
+int vs = 0; // expected-note {{declared here}}
 
 class C {
 public:
@@ -11,17 +11,20 @@ class C {
   int i = 0;
   static int si = 0; // expected-error {{non-const static data member must be initialized out of line}}
   static const NestedC ci = 0; // expected-error {{static data member of type 'const NestedC' must be initialized out of line}}
-  static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}}
+  static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}} \
+                             // expected-note {{read of non-const variable 'vs' is not allowed in a constant expression}}
   static const int vi = 0;
   static const volatile int cvi = 0; // expected-error {{static const volatile data member must be initialized out of line}}
 };
 
 namespace rdar8367341 {
-  float foo(); // expected-note {{here}}
+  float foo(); // expected-note 2 {{here}}
 
   struct A {
     static const float x = 5.0f; // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}}
-    static const float y = foo(); // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}}
+    static const float y = foo(); // expected-warning {{requires 'constexpr'}} expected-note {{add 'constexpr'}} \
+                                  // expected-error {{in-class initializer for static data member is not a constant expression}} \
+                                  // expected-note {{non-constexpr function 'foo' cannot be used in a constant expression}}
     static constexpr float x2 = 5.0f;
     static constexpr float y2 = foo(); // expected-error {{must be initialized by a constant expression}} expected-note {{non-constexpr function 'foo'}}
   };
diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp
index d9932e4dd8241..1474c48cda3c1 100644
--- a/clang/test/SemaCXX/cxx2a-consteval.cpp
+++ b/clang/test/SemaCXX/cxx2a-consteval.cpp
@@ -1154,20 +1154,20 @@ namespace GH65985 {
 int consteval operator""_foo(unsigned long long V) {
     return 0;
 }
-int consteval operator""_bar(unsigned long long V); // expected-note 3{{here}}
+int consteval operator""_bar(unsigned long long V); // expected-note 4 {{here}}
 
 int consteval f() {
   return 0;
 }
 
-int consteval g();  // expected-note {{here}}
+int consteval g();  // expected-note 2 {{here}}
 
 
 struct C {
     static const int a = 1_foo;
     static constexpr int b = 1_foo;
     static const int c = 1_bar; // expected-error {{call to consteval function 'GH65985::operator""_bar' is not a constant expression}} \
-                                // expected-note {{undefined function 'operator""_bar' cannot be used in a constant expression}} \
+                                // expected-note 2 {{undefined function 'operator""_bar' cannot be used in a constant expression}} \
                                 // expected-error {{in-class initializer for static data member is not a constant expression}}
 
     // FIXME: remove duplicate diagnostics
@@ -1179,7 +1179,7 @@ struct C {
     static const int e = f();
     static const int f = g(); // expected-error {{call to consteval function 'GH65985::g' is not a constant expression}} \
                               // expected-error {{in-class initializer for static data member is not a constant expression}} \
-                              // expected-note  {{undefined function 'g' cannot be used in a constant expression}}
+                              // expected-note 2 {{undefined function 'g' cannot be used in a constant expression}}
 };
 
 }
diff --git a/clang/test/SemaTemplate/instantiate-static-var.cpp b/clang/test/SemaTemplate/instantiate-static-var.cpp
index 63d8366b617c1..6602670af901f 100644
--- a/clang/test/SemaTemplate/instantiate-static-var.cpp
+++ b/clang/test/SemaTemplate/instantiate-static-var.cpp
@@ -1,11 +1,13 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx98 -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx11 -std=c++11 %s
 
 template<typename T, T Divisor>
 class X {
 public:
-  static const T value = 10 / Divisor; // expected-error{{in-class initializer for static data member is not a constant expression}}
+  static const T value = 10 / Divisor; // expected-error{{in-class initializer for static data member is not a constant expression}} \
+  // cxx11-note {{division by zero}} \
+  // cxx98-note {{subexpression not valid}}
 };
 
 int array1[X<int, 2>::value == 5? 1 : -1];

From c21a4c6c43bb6d68dfe52e07a5a391a6167eedf9 Mon Sep 17 00:00:00 2001
From: Andrei Safronov <andrei.safronov@espressif.com>
Date: Wed, 18 Jun 2025 02:57:47 +0300
Subject: [PATCH 834/851] [Xtensa] Implement Xtensa Interrupt/Exception/Debug
 Options. (#143820)

Implement Xtensa Interrupt. HighInterrupts, Exception, Debug Options.
Also implement small Xtensa Options like PRID, Coprocessor and Timers.
---
 .../Xtensa/AsmParser/XtensaAsmParser.cpp      |  29 +-
 .../Disassembler/XtensaDisassembler.cpp       |  56 +++-
 .../MCTargetDesc/XtensaMCTargetDesc.cpp       |  89 +++++-
 .../Xtensa/MCTargetDesc/XtensaMCTargetDesc.h  |   9 +-
 llvm/lib/Target/Xtensa/XtensaFeatures.td      |  40 +++
 llvm/lib/Target/Xtensa/XtensaInstrInfo.td     | 115 +++++++
 llvm/lib/Target/Xtensa/XtensaRegisterInfo.td  | 111 ++++++-
 llvm/lib/Target/Xtensa/XtensaSubtarget.h      |   9 +
 .../MC/Disassembler/Xtensa/coprocessor.txt    |  10 +
 llvm/test/MC/Disassembler/Xtensa/debug.txt    |  62 ++++
 .../test/MC/Disassembler/Xtensa/exception.txt |  42 +++
 .../MC/Disassembler/Xtensa/highinterrupts.txt |  82 +++++
 .../test/MC/Disassembler/Xtensa/interrupt.txt |  26 ++
 llvm/test/MC/Disassembler/Xtensa/prid.txt     |  10 +
 llvm/test/MC/Disassembler/Xtensa/timer.txt    |  22 ++
 llvm/test/MC/Xtensa/Core/processor-control.s  |   5 +
 llvm/test/MC/Xtensa/coprocessor.s             |  20 ++
 llvm/test/MC/Xtensa/debug-invalid.s           |   9 +
 llvm/test/MC/Xtensa/debug.s                   | 190 ++++++++++++
 llvm/test/MC/Xtensa/exception.s               | 100 +++++++
 llvm/test/MC/Xtensa/highinterrupts.s          | 280 ++++++++++++++++++
 llvm/test/MC/Xtensa/interrupt.s               |  60 ++++
 llvm/test/MC/Xtensa/prid.s                    |  20 ++
 llvm/test/MC/Xtensa/timer.s                   |  65 ++++
 24 files changed, 1438 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/coprocessor.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/debug.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/exception.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/interrupt.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/prid.txt
 create mode 100644 llvm/test/MC/Disassembler/Xtensa/timer.txt
 create mode 100644 llvm/test/MC/Xtensa/coprocessor.s
 create mode 100644 llvm/test/MC/Xtensa/debug-invalid.s
 create mode 100644 llvm/test/MC/Xtensa/debug.s
 create mode 100644 llvm/test/MC/Xtensa/exception.s
 create mode 100644 llvm/test/MC/Xtensa/highinterrupts.s
 create mode 100644 llvm/test/MC/Xtensa/interrupt.s
 create mode 100644 llvm/test/MC/Xtensa/prid.s
 create mode 100644 llvm/test/MC/Xtensa/timer.s

diff --git a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
index 1f6cfec8edf4e..6c4e365451af0 100644
--- a/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
+++ b/llvm/lib/Target/Xtensa/AsmParser/XtensaAsmParser.cpp
@@ -62,11 +62,14 @@ class XtensaAsmParser : public MCTargetAsmParser {
 #include "XtensaGenAsmMatcher.inc"
 
   ParseStatus parseImmediate(OperandVector &Operands);
-  ParseStatus parseRegister(OperandVector &Operands, bool AllowParens = false,
-                            bool SR = false);
+  ParseStatus
+  parseRegister(OperandVector &Operands, bool AllowParens = false,
+                bool SR = false,
+                Xtensa::RegisterAccessType RAType = Xtensa::REGISTER_EXCHANGE);
   ParseStatus parseOperandWithModifier(OperandVector &Operands);
-  bool parseOperand(OperandVector &Operands, StringRef Mnemonic,
-                    bool SR = false);
+  bool
+  parseOperand(OperandVector &Operands, StringRef Mnemonic, bool SR = false,
+               Xtensa::RegisterAccessType RAType = Xtensa::REGISTER_EXCHANGE);
   bool ParseInstructionWithSR(ParseInstructionInfo &Info, StringRef Name,
                               SMLoc NameLoc, OperandVector &Operands);
   ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
@@ -580,7 +583,8 @@ bool XtensaAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc,
 }
 
 ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands,
-                                           bool AllowParens, bool SR) {
+                                           bool AllowParens, bool SR,
+                                           Xtensa::RegisterAccessType RAType) {
   SMLoc FirstS = getLoc();
   bool HadParens = false;
   AsmToken Buf[2];
@@ -624,7 +628,7 @@ ParseStatus XtensaAsmParser::parseRegister(OperandVector &Operands,
     return ParseStatus::NoMatch;
   }
 
-  if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits()))
+  if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits(), RAType))
     return ParseStatus::NoMatch;
 
   if (HadParens)
@@ -685,7 +689,7 @@ ParseStatus XtensaAsmParser::parseOperandWithModifier(OperandVector &Operands) {
 /// from this information, adding to Operands.
 /// If operand was parsed, returns false, else true.
 bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
-                                   bool SR) {
+                                   bool SR, Xtensa::RegisterAccessType RAType) {
   // Check if the current operand has a custom associated parser, if so, try to
   // custom parse the operand, or fallback to the general approach.
   ParseStatus Res = MatchOperandParserImpl(Operands, Mnemonic);
@@ -699,7 +703,7 @@ bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
     return true;
 
   // Attempt to parse token as register
-  if (parseRegister(Operands, true, SR).isSuccess())
+  if (parseRegister(Operands, true, SR, RAType).isSuccess())
     return false;
 
   // Attempt to parse token as an immediate
@@ -713,6 +717,11 @@ bool XtensaAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
 bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info,
                                              StringRef Name, SMLoc NameLoc,
                                              OperandVector &Operands) {
+  Xtensa::RegisterAccessType RAType =
+      Name[0] == 'w' ? Xtensa::REGISTER_WRITE
+                     : (Name[0] == 'r' ? Xtensa::REGISTER_READ
+                                       : Xtensa::REGISTER_EXCHANGE);
+
   if ((Name.starts_with("wsr.") || Name.starts_with("rsr.") ||
        Name.starts_with("xsr.")) &&
       (Name.size() > 4)) {
@@ -728,7 +737,7 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info,
     if (RegNo == 0)
       RegNo = MatchRegisterAltName(RegName);
 
-    if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits()))
+    if (!Xtensa::checkRegister(RegNo, getSTI().getFeatureBits(), RAType))
       return Error(NameLoc, "invalid register name");
 
     // Parse operand
@@ -753,7 +762,7 @@ bool XtensaAsmParser::ParseInstructionWithSR(ParseInstructionInfo &Info,
     }
 
     // Parse second operand
-    if (parseOperand(Operands, Name, true))
+    if (parseOperand(Operands, Name, true, RAType))
       return true;
   }
 
diff --git a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
index dbd34964db074..3b37ac88b9b17 100644
--- a/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
+++ b/llvm/lib/Target/Xtensa/Disassembler/XtensaDisassembler.cpp
@@ -119,13 +119,39 @@ struct DecodeRegister {
 };
 
 const DecodeRegister SRDecoderTable[] = {
-    {Xtensa::LBEG, 0},    {Xtensa::LEND, 1},        {Xtensa::LCOUNT, 2},
-    {Xtensa::SAR, 3},     {Xtensa::BREG, 4},        {Xtensa::SAR, 3},
-    {Xtensa::LITBASE, 5}, {Xtensa::ACCLO, 16},      {Xtensa::ACCHI, 17},
-    {Xtensa::M0, 32},     {Xtensa::M1, 33},         {Xtensa::M2, 34},
-    {Xtensa::M3, 35},     {Xtensa::WINDOWBASE, 72}, {Xtensa::WINDOWSTART, 73},
-    {Xtensa::MEMCTL, 97}, {Xtensa::VECBASE, 231},   {Xtensa::MISC0, 244},
-    {Xtensa::MISC1, 245}, {Xtensa::MISC2, 246},     {Xtensa::MISC3, 247}};
+    {Xtensa::LBEG, 0},          {Xtensa::LEND, 1},
+    {Xtensa::LCOUNT, 2},        {Xtensa::SAR, 3},
+    {Xtensa::BREG, 4},          {Xtensa::LITBASE, 5},
+    {Xtensa::ACCLO, 16},        {Xtensa::ACCHI, 17},
+    {Xtensa::M0, 32},           {Xtensa::M1, 33},
+    {Xtensa::M2, 34},           {Xtensa::M3, 35},
+    {Xtensa::WINDOWBASE, 72},   {Xtensa::WINDOWSTART, 73},
+    {Xtensa::IBREAKENABLE, 96}, {Xtensa::MEMCTL, 97},
+    {Xtensa::DDR, 104},         {Xtensa::IBREAKA0, 128},
+    {Xtensa::IBREAKA1, 129},    {Xtensa::DBREAKA0, 144},
+    {Xtensa::DBREAKA1, 145},    {Xtensa::DBREAKC0, 160},
+    {Xtensa::DBREAKC1, 161},    {Xtensa::CONFIGID0, 176},
+    {Xtensa::EPC1, 177},        {Xtensa::EPC2, 178},
+    {Xtensa::EPC3, 179},        {Xtensa::EPC4, 180},
+    {Xtensa::EPC5, 181},        {Xtensa::EPC6, 182},
+    {Xtensa::EPC7, 183},        {Xtensa::DEPC, 192},
+    {Xtensa::EPS2, 194},        {Xtensa::EPS3, 195},
+    {Xtensa::EPS4, 196},        {Xtensa::EPS5, 197},
+    {Xtensa::EPS6, 198},        {Xtensa::EPS7, 199},
+    {Xtensa::CONFIGID1, 208},   {Xtensa::EXCSAVE1, 209},
+    {Xtensa::EXCSAVE2, 210},    {Xtensa::EXCSAVE3, 211},
+    {Xtensa::EXCSAVE4, 212},    {Xtensa::EXCSAVE5, 213},
+    {Xtensa::EXCSAVE6, 214},    {Xtensa::EXCSAVE7, 215},
+    {Xtensa::CPENABLE, 224},    {Xtensa::INTERRUPT, 226},
+    {Xtensa::INTCLEAR, 227},    {Xtensa::INTENABLE, 228},
+    {Xtensa::PS, 230},          {Xtensa::VECBASE, 231},
+    {Xtensa::EXCCAUSE, 232},    {Xtensa::DEBUGCAUSE, 233},
+    {Xtensa::CCOUNT, 234},      {Xtensa::PRID, 235},
+    {Xtensa::ICOUNT, 236},      {Xtensa::ICOUNTLEVEL, 237},
+    {Xtensa::EXCVADDR, 238},    {Xtensa::CCOMPARE0, 240},
+    {Xtensa::CCOMPARE1, 241},   {Xtensa::CCOMPARE2, 242},
+    {Xtensa::MISC0, 244},       {Xtensa::MISC1, 245},
+    {Xtensa::MISC2, 246},       {Xtensa::MISC3, 247}};
 
 static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo,
                                           uint64_t Address,
@@ -133,12 +159,24 @@ static DecodeStatus DecodeSRRegisterClass(MCInst &Inst, uint64_t RegNo,
   if (RegNo > 255)
     return MCDisassembler::Fail;
 
+  Xtensa::RegisterAccessType RAType =
+      Inst.getOpcode() == Xtensa::WSR
+          ? Xtensa::REGISTER_WRITE
+          : (Inst.getOpcode() == Xtensa::RSR ? Xtensa::REGISTER_READ
+                                             : Xtensa::REGISTER_EXCHANGE);
+
   for (unsigned i = 0; i < std::size(SRDecoderTable); i++) {
     if (SRDecoderTable[i].RegNo == RegNo) {
       MCPhysReg Reg = SRDecoderTable[i].Reg;
 
-      if (!Xtensa::checkRegister(Reg,
-                                 Decoder->getSubtargetInfo().getFeatureBits()))
+      // Handle special case. The INTERRUPT/INTSET registers use the same
+      // encoding, but INTERRUPT used for read and INTSET for write.
+      if (Reg == Xtensa::INTERRUPT && RAType == Xtensa::REGISTER_WRITE) {
+        Reg = Xtensa::INTSET;
+      }
+
+      if (!Xtensa::checkRegister(
+              Reg, Decoder->getSubtargetInfo().getFeatureBits(), RAType))
         return MCDisassembler::Fail;
 
       Inst.addOperand(MCOperand::createReg(Reg));
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
index 63fed46ac411f..f48c6225827b0 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp
@@ -75,10 +75,95 @@ bool Xtensa::isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset) {
 }
 
 // Verify Special Register
-bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits) {
+bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
+                           RegisterAccessType RAType) {
   switch (RegNo) {
   case Xtensa::BREG:
     return FeatureBits[Xtensa::FeatureBoolean];
+  case Xtensa::CCOUNT:
+  case Xtensa::CCOMPARE0:
+    if (FeatureBits[Xtensa::FeatureTimers1])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::CCOMPARE1:
+    if (FeatureBits[Xtensa::FeatureTimers2])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::CCOMPARE2:
+    if (FeatureBits[Xtensa::FeatureTimers3])
+      return true;
+    return false;
+  case Xtensa::CONFIGID0:
+    return RAType != Xtensa::REGISTER_EXCHANGE;
+  case Xtensa::CONFIGID1:
+    return RAType == Xtensa::REGISTER_READ;
+  case Xtensa::CPENABLE:
+    return FeatureBits[Xtensa::FeatureCoprocessor];
+  case Xtensa::DEBUGCAUSE:
+    return RAType == Xtensa::REGISTER_READ && FeatureBits[Xtensa::FeatureDebug];
+  case Xtensa::DEPC:
+  case Xtensa::EPC1:
+  case Xtensa::EXCCAUSE:
+  case Xtensa::EXCSAVE1:
+  case Xtensa::EXCVADDR:
+    return FeatureBits[Xtensa::FeatureException];
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC2:
+  case Xtensa::EPS2:
+  case Xtensa::EXCSAVE2:
+    if (FeatureBits[Xtensa::FeatureHighPriInterrupts])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC3:
+  case Xtensa::EPS3:
+  case Xtensa::EXCSAVE3:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel3])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC4:
+  case Xtensa::EPS4:
+  case Xtensa::EXCSAVE4:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel4])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC5:
+  case Xtensa::EPS5:
+  case Xtensa::EXCSAVE5:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel5])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC6:
+  case Xtensa::EPS6:
+  case Xtensa::EXCSAVE6:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel6])
+      return true;
+    LLVM_FALLTHROUGH;
+  case Xtensa::EPC7:
+  case Xtensa::EPS7:
+  case Xtensa::EXCSAVE7:
+    if (FeatureBits[Xtensa::FeatureHighPriInterruptsLevel7])
+      return true;
+    return false;
+  case Xtensa::INTENABLE:
+    return FeatureBits[Xtensa::FeatureInterrupt];
+  case Xtensa::INTERRUPT:
+    return RAType == Xtensa::REGISTER_READ &&
+           FeatureBits[Xtensa::FeatureInterrupt];
+  case Xtensa::INTSET:
+  case Xtensa::INTCLEAR:
+    return RAType == Xtensa::REGISTER_WRITE &&
+           FeatureBits[Xtensa::FeatureInterrupt];
+  case Xtensa::ICOUNT:
+  case Xtensa::ICOUNTLEVEL:
+  case Xtensa::IBREAKENABLE:
+  case Xtensa::DDR:
+  case Xtensa::IBREAKA0:
+  case Xtensa::IBREAKA1:
+  case Xtensa::DBREAKA0:
+  case Xtensa::DBREAKA1:
+  case Xtensa::DBREAKC0:
+  case Xtensa::DBREAKC1:
+    return FeatureBits[Xtensa::FeatureDebug];
   case Xtensa::LBEG:
   case Xtensa::LEND:
   case Xtensa::LCOUNT:
@@ -99,6 +184,8 @@ bool Xtensa::checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits) {
   case Xtensa::MISC2:
   case Xtensa::MISC3:
     return FeatureBits[Xtensa::FeatureMiscSR];
+  case Xtensa::PRID:
+    return RAType == Xtensa::REGISTER_READ && FeatureBits[Xtensa::FeaturePRID];
   case Xtensa::VECBASE:
     return FeatureBits[Xtensa::FeatureRelocatableVector];
   case Xtensa::WINDOWBASE:
diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
index cedc57a14f142..ec91f656bdcbd 100644
--- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
+++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h
@@ -55,8 +55,15 @@ bool isValidAddrOffset(int Scale, int64_t OffsetVal);
 // Check address offset for load/store instructions.
 bool isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset);
 
+enum RegisterAccessType {
+  REGISTER_WRITE = 1,
+  REGISTER_READ = 2,
+  REGISTER_EXCHANGE = 3
+};
+
 // Verify if it's correct to use a special register.
-bool checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits);
+bool checkRegister(MCRegister RegNo, const FeatureBitset &FeatureBits,
+                   RegisterAccessType RA);
 } // namespace Xtensa
 } // end namespace llvm
 
diff --git a/llvm/lib/Target/Xtensa/XtensaFeatures.td b/llvm/lib/Target/Xtensa/XtensaFeatures.td
index 55977277daf8e..1dd03283e9313 100644
--- a/llvm/lib/Target/Xtensa/XtensaFeatures.td
+++ b/llvm/lib/Target/Xtensa/XtensaFeatures.td
@@ -92,3 +92,43 @@ def FeatureDataCache : SubtargetFeature<"dcache", "HasDataCache", "true",
                                         "Enable Xtensa Data Cache option">;
 def HasDataCache : Predicate<"Subtarget->hasDataCache()">,
                    AssemblerPredicate<(all_of FeatureDataCache)>;
+
+// Xtensa Interrupts Options.
+def FeatureHighPriInterrupts : SubtargetFeature<"highpriinterrupts",
+                                                "HasHighPriInterrupts", "true",
+                                                "Enable Xtensa HighPriInterrupts option">;
+def HasHighPriInterrupts : Predicate<"Subtarget->hasHighPriInterrupts()">,
+                                      AssemblerPredicate<(all_of FeatureHighPriInterrupts)>;
+
+foreach i = {3-7} in
+    def FeatureHighPriInterruptsLevel#i : SubtargetFeature<"highpriinterrupts-level"#i,
+         "HasHighPriInterruptsLevel"#i#"", "true", "Enable Xtensa HighPriInterrupts Level"#i, [FeatureHighPriInterrupts]>;
+
+def FeatureInterrupt : SubtargetFeature<"interrupt", "HasInterrupt", "true",
+                                        "Enable Xtensa Interrupt option">;
+def HasInterrupt : Predicate<"Subtarget->hasInterrupt()">,
+                              AssemblerPredicate<(all_of FeatureInterrupt)>;
+
+def FeatureException : SubtargetFeature<"exception", "HasException", "true",
+                                        "Enable Xtensa Exception option">;
+def HasException : Predicate<"Subtarget->hasException()">,
+                              AssemblerPredicate<(all_of FeatureException)>;
+
+def FeatureDebug : SubtargetFeature<"debug", "HasDebug", "true",
+                                    "Enable Xtensa Debug option">;
+def HasDebug : Predicate<"Subtarget->hasDebug()">,
+                          AssemblerPredicate<(all_of FeatureDebug)>;
+
+foreach i = {1-3} in
+    def FeatureTimers#i : SubtargetFeature<"timers"#i,
+         "HasTimers"#i#"", "true", "Enable Xtensa Timers "#i>;
+
+def FeaturePRID : SubtargetFeature<"prid", "HasPRID", "true",
+                                   "Enable Xtensa Processor ID option">;
+def HasPRID : Predicate<"Subtarget->hasPRID()">,
+                         AssemblerPredicate<(all_of FeaturePRID)>;
+
+def FeatureCoprocessor : SubtargetFeature<"coprocessor", "HasCoprocessor", "true",
+                                          "Enable Xtensa Coprocessor option">;
+def HasCoprocessor : Predicate<"Subtarget->hasCoprocessor()">,
+                                AssemblerPredicate<(all_of FeatureCoprocessor)>;
diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
index 9a9424f916996..7e9fcd7058c20 100644
--- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td
@@ -499,6 +499,18 @@ def EXTW : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
   let hasSideEffects = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// Illegal instructions
+//===----------------------------------------------------------------------===//
+
+def ILL : CALLX_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                    "ill", []> {
+  let m = 0x0;
+  let n = 0x0;
+  let r = 0;
+  let s = 0;
+}
+
 //===----------------------------------------------------------------------===//
 // Processor control instructions
 //===----------------------------------------------------------------------===//
@@ -1044,6 +1056,109 @@ let Predicates = [HasRegionProtection] in {
   }
 }
 
+//===----------------------------------------------------------------------===//
+// Debug instructions
+//===----------------------------------------------------------------------===//
+
+let isBarrier = 1, isTerminator = 1 in {
+  def BREAK : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$s, uimm4:$t),
+                      "break\t$s, $t", []>, Requires<[HasDebug]> {
+    let r = 0x04;
+  }
+
+  def BREAK_N : RRRN_Inst<0x0C, (outs), (ins uimm4:$imm),
+                         "break.n\t$imm", []>, Requires<[HasDensity, HasDebug]> {
+    bits<4> imm;
+
+    let r = 0xf;
+    let s = imm;
+    let t = 0x2;
+  }
+}
+
+def : InstAlias<"_break.n\t$imm", (BREAK_N uimm4:$imm)>;
+
+def : Pat<(trap), (BREAK (i32 1), (i32 15))>;
+
+// Load instruction
+def LDDR32P : RRR_Inst<0x00, 0x00, 0x00, (outs AR:$s), (ins),
+                       "lddr32.p\t$s", []>, Requires<[HasDebug]> {
+  let r = 0x7;
+  let t = 0xe;
+  let mayLoad = 1;
+}
+
+// Store instruction
+def SDDR32P : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins AR:$s),
+                       "sddr32.p\t$s", []>, Requires<[HasDebug]> {
+  let r = 0x7;
+  let t = 0xf;
+  let mayStore = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Exception feature instructions
+//===----------------------------------------------------------------------===//
+
+def EXCW : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                   "excw", []>, Requires<[HasException]> {
+  let r = 0x2;
+  let s = 0x0;
+  let t = 0x8;
+}
+
+def RFDE : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                   "rfde", []>, Requires<[HasException]> {
+  let r = 0x3;
+  let s = 0x2;
+  let t = 0x0;
+}
+
+
+def RFE : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                  "rfe", []>, Requires<[HasException]> {
+  let r = 0x3;
+  let s = 0x0;
+  let t = 0x0;
+}
+
+def SYSCALL : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins),
+                      "syscall", []>, Requires<[HasException]> {
+  let r = 0x5;
+  let s = 0x0;
+  let t = 0x0;
+}
+
+//===----------------------------------------------------------------------===//
+// Interrupt feature instructions
+//===----------------------------------------------------------------------===//
+
+def RSIL : RRR_Inst<0x00, 0x00, 0x00, (outs AR:$t), (ins uimm4:$imm),
+                   "rsil\t$t, $imm", []>, Requires<[HasInterrupt]> {
+  bits<4> imm;
+
+  let r = 0x6;
+  let s = imm{3-0};
+}
+
+def WAITI : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$imm),
+                   "waiti\t$imm", []>, Requires<[HasInterrupt]> {
+  bits<4> imm;
+
+  let r = 0x7;
+  let s = imm{3-0};
+  let t = 0;
+}
+
+def RFI : RRR_Inst<0x00, 0x00, 0x00, (outs), (ins uimm4:$imm),
+                  "rfi\t$imm", []>, Requires<[HasHighPriInterrupts]> {
+  bits<4> imm;
+
+  let r = 0x3;
+  let s = imm{3-0};
+  let t = 0x1;
+}
+
 //===----------------------------------------------------------------------===//
 // DSP Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
index c54e2556ba11f..7d44029124344 100644
--- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
+++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.td
@@ -91,9 +91,111 @@ def LITBASE : SRReg<5, "litbase", ["LITBASE", "5"]>;
 def WINDOWBASE : SRReg<72, "windowbase", ["WINDOWBASE", "72"]>;
 def WINDOWSTART : SRReg<73, "windowstart", ["WINDOWSTART", "73"]>;
 
+// Instuction breakpoint enable register
+def IBREAKENABLE : SRReg<96, "ibreakenable", ["IBREAKENABLE", "96"]>;
+
 // Memory Control Register
 def MEMCTL : SRReg<97, "memctl", ["MEMCTL", "97"]>;
 
+def DDR : SRReg<104, "ddr", ["DDR", "104"]>;
+
+// Instuction break address register 0
+def IBREAKA0 : SRReg<128, "ibreaka0", ["IBREAKA0", "128"]>;
+
+// Instuction break address register 1
+def IBREAKA1 : SRReg<129, "ibreaka1", ["IBREAKA1", "129"]>;
+
+// Data break address register 0
+def DBREAKA0 : SRReg<144, "dbreaka0", ["DBREAKA0", "144"]>;
+
+// Data break address register 1
+def DBREAKA1 : SRReg<145, "dbreaka1", ["DBREAKA1", "145"]>;
+
+// Data breakpoint control register 0
+def DBREAKC0 : SRReg<160, "dbreakc0", ["DBREAKC0", "160"]>;
+
+// Data breakpoint control register 1
+def DBREAKC1 : SRReg<161, "dbreakc1", ["DBREAKC1", "161"]>;
+
+def CONFIGID0 : SRReg<176, "configid0", ["CONFIGID0", "176"]>;
+
+// Exception PC1
+def EPC1 : SRReg<177, "epc1", ["EPC1", "177"]>;
+
+// Exception PC2
+def EPC2 : SRReg<178, "epc2", ["EPC2", "178"]>;
+
+// Exception PC3
+def EPC3 : SRReg<179, "epc3", ["EPC3", "179"]>;
+
+// Exception PC4
+def EPC4 : SRReg<180, "epc4", ["EPC4", "180"]>;
+
+// Exception PC5
+def EPC5 : SRReg<181, "epc5", ["EPC5", "181"]>;
+
+// Exception PC6
+def EPC6 : SRReg<182, "epc6", ["EPC6", "182"]>;
+
+// Exception PC7
+def EPC7 : SRReg<183, "epc7", ["EPC7", "183"]>;
+
+def DEPC : SRReg<192, "depc", ["DEPC", "192"]>;
+def EPS2 : SRReg<194, "eps2", ["EPS2", "194"]>;
+def EPS3 : SRReg<195, "eps3", ["EPS3", "195"]>;
+def EPS4 : SRReg<196, "eps4", ["EPS4", "196"]>;
+def EPS5 : SRReg<197, "eps5", ["EPS5", "197"]>;
+def EPS6 : SRReg<198, "eps6", ["EPS6", "198"]>;
+def EPS7 : SRReg<199, "eps7", ["EPS7", "199"]>;
+
+def CONFIGID1 : SRReg<208, "configid1", ["CONFIGID1", "208"]>;
+
+def EXCSAVE1 : SRReg<209, "excsave1", ["EXCSAVE1", "209"]>;
+def EXCSAVE2 : SRReg<210, "excsave2", ["EXCSAVE2", "210"]>;
+def EXCSAVE3 : SRReg<211, "excsave3", ["EXCSAVE3", "211"]>;
+def EXCSAVE4 : SRReg<212, "excsave4", ["EXCSAVE4", "212"]>;
+def EXCSAVE5 : SRReg<213, "excsave5", ["EXCSAVE5", "213"]>;
+def EXCSAVE6 : SRReg<214, "excsave6", ["EXCSAVE6", "214"]>;
+def EXCSAVE7 : SRReg<215, "excsave7", ["EXCSAVE7", "215"]>;
+
+def CPENABLE : SRReg<224, "cpenable", ["CPENABLE", "224"]>;
+
+// Interrupt enable mask register
+def INTERRUPT : SRReg<226, "interrupt", ["INTERRUPT", "226"]>;
+
+def INTSET : SRReg<226, "intset", ["INTSET"]>;
+
+def INTCLEAR : SRReg<227, "intclear", ["INTCLEAR", "227"]>;
+
+def INTENABLE : SRReg<228, "intenable", ["INTENABLE", "228"]>;
+
+// Processor State
+def PS : SRReg<230, "ps", ["PS", "230"]>;
+
+def EXCCAUSE : SRReg<232, "exccause", ["EXCCAUSE", "232"]>;
+
+// Cause of last debug exception register
+def DEBUGCAUSE : SRReg<233, "debugcause", ["DEBUGCAUSE", "233"]>;
+
+// Processor Clock Count Register
+def CCOUNT : SRReg<234, "ccount", ["CCOUNT", "234"]>;
+
+// Processor ID Register
+def PRID : SRReg<235, "prid", ["PRID", "235"]>;
+
+def ICOUNT : SRReg<236, "icount", ["ICOUNT", "236"]>;
+def ICOUNTLEVEL : SRReg<237, "icountlevel", ["ICOUNTLEVEL", "237"]>;
+def EXCVADDR : SRReg<238, "excvaddr", ["EXCVADDR", "238"]>;
+
+// Cycle number to interrupt register 0
+def CCOMPARE0 : SRReg<240, "ccompare0", ["CCOMPARE0", "240"]>;
+
+// Cycle number to interrupt register 1
+def CCOMPARE1 : SRReg<241, "ccompare1", ["CCOMPARE1", "241"]>;
+
+// Cycle number to interrupt register 2
+def CCOMPARE2 : SRReg<242, "ccompare2", ["CCOMPARE2", "242"]>;
+
 // Vector base register
 def VECBASE : SRReg<231, "vecbase", ["VECBASE", "231"]>;
 
@@ -116,8 +218,13 @@ def MR23 :  RegisterClass<"Xtensa", [i32], 32, (add M2, M3)>;
 def MR   :  RegisterClass<"Xtensa", [i32], 32, (add MR01, MR23)>;
 
 def SR :  RegisterClass<"Xtensa", [i32], 32, (add
-  LBEG, LEND, LCOUNT, SAR, BREG, LITBASE, ACCLO, ACCHI, MR, WINDOWBASE, WINDOWSTART,
-  MEMCTL, VECBASE, MISC0, MISC1, MISC2, MISC3)>;
+  LBEG, LEND, LCOUNT, SAR, BREG, LITBASE, ACCLO, ACCHI, MR,
+  WINDOWBASE, WINDOWSTART, IBREAKENABLE, MEMCTL, DDR, IBREAKA0, IBREAKA1,
+  DBREAKA0, DBREAKA1, DBREAKC0, DBREAKC1, CONFIGID0, EPC1, EPC2, EPC3, EPC4, EPC5,
+  EPC6, EPC7, DEPC, EPS2, EPS3, EPS4, EPS5, EPS6, EPS7, CONFIGID1, EXCSAVE1, EXCSAVE2,
+  EXCSAVE3, EXCSAVE4, EXCSAVE5, EXCSAVE6, EXCSAVE7, CPENABLE, INTERRUPT, INTSET, INTCLEAR, INTENABLE,
+  PS, VECBASE, EXCCAUSE, DEBUGCAUSE, CCOUNT, PRID, ICOUNT, ICOUNTLEVEL, EXCVADDR, CCOMPARE0,
+  CCOMPARE1, CCOMPARE2, MISC0, MISC1, MISC2, MISC3)>;
 
 //===----------------------------------------------------------------------===//
 // Boolean registers
diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.h b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
index 9909fb9ff4b37..da4e14a53eef3 100644
--- a/llvm/lib/Target/Xtensa/XtensaSubtarget.h
+++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.h
@@ -82,6 +82,15 @@ class XtensaSubtarget : public XtensaGenSubtargetInfo {
   bool hasMiscSR() const { return HasMiscSR; }
   bool hasExtendedL32R() const { return HasExtendedL32R; }
   bool hasDataCache() const { return HasDataCache; }
+  bool hasHighPriInterrupts() const { return HasHighPriInterrupts; }
+  bool hasHighPriInterruptsLevel3() const { return HasHighPriInterruptsLevel3; }
+  bool hasHighPriInterruptsLevel4() const { return HasHighPriInterruptsLevel4; }
+  bool hasHighPriInterruptsLevel5() const { return HasHighPriInterruptsLevel5; }
+  bool hasHighPriInterruptsLevel6() const { return HasHighPriInterruptsLevel6; }
+  bool hasHighPriInterruptsLevel7() const { return HasHighPriInterruptsLevel7; }
+  bool hasInterrupt() const { return HasInterrupt; }
+  bool hasException() const { return HasException; }
+
   bool isWindowedABI() const { return hasWindowed(); }
 
   // Automatically generated by tblgen.
diff --git a/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt b/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt
new file mode 100644
index 0000000000000..83904dcde938c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/coprocessor.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+coprocessor -disassemble %s | FileCheck -check-prefixes=CHECK-COPROCESSOR %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa coprocessor option enabled. Also verify that dissasembling without
+## Xtensa coprocessor option generates warnings.
+
+[0x20,0xe0,0x61]
+#CHECK-COPROCESSOR: xsr a2, cpenable
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/debug.txt b/llvm/test/MC/Disassembler/Xtensa/debug.txt
new file mode 100644
index 0000000000000..1321f09a973c3
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/debug.txt
@@ -0,0 +1,62 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+debug,+density -disassemble %s | FileCheck -check-prefixes=CHECK-DEBUG %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa debug option enabled. Also verify that dissasembling without
+## Xtensa debug option generates warnings.
+
+[0x10,0x41,0x00]
+# CHECK-DEBUG: break 1, 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x2c,0xf1]
+# CHECK-DEBUG: break.n 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xe0,0x73,0x00]
+# CHECK-DEBUG: lddr32.p a3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0xf0,0x73,0x00]
+# CHECK-DEBUG: sddr32.p a3
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xec, 0x61]
+#CHECK-DEBUG: xsr a2, icount
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xed, 0x61]
+#CHECK-DEBUG: xsr a2, icountlevel
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x60, 0x61]
+#CHECK-DEBUG: xsr a2, ibreakenable
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x68, 0x61]
+#CHECK-DEBUG: xsr a2, ddr
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x80, 0x61]
+#CHECK-DEBUG: xsr a2, ibreaka0
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x81, 0x61]
+#CHECK-DEBUG: xsr a2, ibreaka1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x90, 0x61]
+#CHECK-DEBUG: xsr a2, dbreaka0
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0x91, 0x61]
+#CHECK-DEBUG: xsr a2, dbreaka1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xa0, 0x61]
+#CHECK-DEBUG: xsr a2, dbreakc0
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xa1, 0x61]
+#CHECK-DEBUG: xsr a2, dbreakc1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/exception.txt b/llvm/test/MC/Disassembler/Xtensa/exception.txt
new file mode 100644
index 0000000000000..f40cc9e6549ba
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/exception.txt
@@ -0,0 +1,42 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+exception -disassemble %s | FileCheck -check-prefixes=CHECK-EXCEPTION %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa exception option enabled. Also verify that dissasembling without
+## Xtensa exception option generates warnings.
+
+[0x80,0x20,0x00]
+# CHECK-EXCEPTION: excw
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x50,0x00]
+# CHECK-EXCEPTION: syscall
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x30,0x00]
+# CHECK-EXCEPTION: rfe
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x32,0x00]
+# CHECK-EXCEPTION: rfde
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xb1, 0x61]
+#CHECK-INST: xsr a2, epc1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xd1, 0x61]
+#CHECK-INST: xsr a2, excsave1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xe8, 0x61]
+#CHECK-INST: xsr a2, exccause
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xee, 0x61]
+#CHECK-INST: xsr a2, excvaddr
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xc0, 0x61]
+#CHECK-INST: xsr a2, depc
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt b/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt
new file mode 100644
index 0000000000000..d5d87918c9d52
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/highinterrupts.txt
@@ -0,0 +1,82 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+highpriinterrupts,+highpriinterrupts-level7 -disassemble %s | FileCheck -check-prefixes=CHECK-HPINTERRUPTS %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa highpriinterrupts option enabled. Also verify that dissasembling without
+## Xtensa highpriinterrupts option generates warnings.
+
+[0x10,0x31,0x00]
+# CHECK-HPINTERRUPTS: rfi 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb2,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc2
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb3,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc3
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb4,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc4
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb5,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc5
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb6,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc6
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xb7,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, epc7
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc2,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps2
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc3,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps3
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc4,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps4
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc5,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps5
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc6,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps6
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xc7,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, eps7
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd2,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave2
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd3,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave3
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd4,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave4
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd5,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave5
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd6,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave6
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xd7,0x61]
+#CHECK-HPINTERRUPTS: xsr a2, excsave7
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/interrupt.txt b/llvm/test/MC/Disassembler/Xtensa/interrupt.txt
new file mode 100644
index 0000000000000..da8ea3aa5dc47
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/interrupt.txt
@@ -0,0 +1,26 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+interrupt -disassemble %s | FileCheck -check-prefixes=CHECK-EXCEPTION %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa interrupt option enabled. Also verify that dissasembling without
+## Xtensa interrupt option generates warnings.
+
+[0x20,0x61,0x00]
+# CHECK-EXCEPTION: rsil a2, 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x00,0x71,0x00]
+# CHECK-EXCEPTION: waiti 1
+# CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xe4, 0x61]
+#CHECK-INST: xsr a2, intenable
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xe2, 0x03]
+#CHECK-INST: rsr a2, interrupt
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20, 0xe3, 0x13]
+#CHECK-INST: wsr a2, intclear
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/prid.txt b/llvm/test/MC/Disassembler/Xtensa/prid.txt
new file mode 100644
index 0000000000000..104ad1c31185c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/prid.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+prid -disassemble %s | FileCheck -check-prefixes=CHECK-PRID %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa prid option enabled. Also verify that dissasembling without
+## Xtensa prid option generates warnings.
+
+[0x20,0xeb,0x03]
+#CHECK-PRID: rsr a2, prid
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Disassembler/Xtensa/timer.txt b/llvm/test/MC/Disassembler/Xtensa/timer.txt
new file mode 100644
index 0000000000000..daacf27872daa
--- /dev/null
+++ b/llvm/test/MC/Disassembler/Xtensa/timer.txt
@@ -0,0 +1,22 @@
+# RUN: llvm-mc -triple=xtensa -mattr=+timers3 -disassemble %s | FileCheck -check-prefixes=CHECK-TIMER %s
+# RUN: not llvm-mc -triple=xtensa -disassemble %s 2>&1 | FileCheck --implicit-check-not=warning: -check-prefixes=CHECK-CORE %s
+
+## Verify that binary code is correctly disassembled with
+## Xtensa timer option enabled. Also verify that dissasembling without
+## Xtensa timer option generates warnings.
+
+[0x20,0xea,0x61]
+#CHECK-INST: xsr a2, ccount
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xf0,0x61]
+#CHECK-TIMER: xsr a2, ccompare0
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xf1,0x61]
+#CHECK-TIMER: xsr a2, ccompare1
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
+
+[0x20,0xf2,0x61]
+#CHECK-TIMER: xsr a2, ccompare2
+#CHECK-CORE: [[#@LINE-2]]:2: warning: invalid instruction encoding
diff --git a/llvm/test/MC/Xtensa/Core/processor-control.s b/llvm/test/MC/Xtensa/Core/processor-control.s
index 5b648356fc68b..4a37d8346893e 100644
--- a/llvm/test/MC/Xtensa/Core/processor-control.s
+++ b/llvm/test/MC/Xtensa/Core/processor-control.s
@@ -20,6 +20,11 @@ esync
 # CHECK: encoding: [0x00,0x20,0x00]
 isync
 
+# Instruction format CALLX
+# CHECK-INST: ill
+# CHECK: encoding: [0x00,0x00,0x00]
+ill
+
 # Instruction format RRR
 # CHECK-INST: nop
 # CHECK: encoding: [0xf0,0x20,0x00]
diff --git a/llvm/test/MC/Xtensa/coprocessor.s b/llvm/test/MC/Xtensa/coprocessor.s
new file mode 100644
index 0000000000000..dca8c55fd72cb
--- /dev/null
+++ b/llvm/test/MC/Xtensa/coprocessor.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+coprocessor \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, cpenable
+#CHECK: encoding: [0x20,0xe0,0x61]
+xsr a2,cpenable
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, cpenable
+#CHECK: encoding: [0x20,0xe0,0x61]
+xsr.cpenable a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, cpenable
+#CHECK: encoding: [0x20,0xe0,0x61]
+xsr a2, 224
diff --git a/llvm/test/MC/Xtensa/debug-invalid.s b/llvm/test/MC/Xtensa/debug-invalid.s
new file mode 100644
index 0000000000000..74f0df9fe8148
--- /dev/null
+++ b/llvm/test/MC/Xtensa/debug-invalid.s
@@ -0,0 +1,9 @@
+# RUN: not llvm-mc -triple xtensa --mattr=+debug,+density %s 2>&1 | FileCheck %s
+
+LBL0:
+
+# Out of range immediates
+
+# uimm4
+break 16, 0
+# CHECK: :[[#@LINE-1]]:7: error: expected immediate in range [0, 15]
diff --git a/llvm/test/MC/Xtensa/debug.s b/llvm/test/MC/Xtensa/debug.s
new file mode 100644
index 0000000000000..36b1f110d120b
--- /dev/null
+++ b/llvm/test/MC/Xtensa/debug.s
@@ -0,0 +1,190 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+debug,+density \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# Instruction format RRR
+# CHECK-INST: break 1, 1
+# CHECK: encoding: [0x10,0x41,0x00]
+break 1, 1
+
+# Instruction format RRRN
+# CHECK-INST: break.n 1
+# CHECK: encoding: [0x2c,0xf1]
+break.n 1
+
+# Instruction format RRR
+# CHECK-INST: lddr32.p a3
+# CHECK: encoding: [0xe0,0x73,0x00]
+lddr32.p a3
+
+# Instruction format RRR
+# CHECK-INST: sddr32.p a3
+# CHECK: encoding: [0xf0,0x73,0x00]
+sddr32.p a3
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icount
+#CHECK: encoding: [0x20,0xec,0x61]
+xsr a2,icount
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icount
+#CHECK: encoding: [0x20,0xec,0x61]
+xsr.icount a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icount
+#CHECK: encoding: [0x20,0xec,0x61]
+xsr a2, 236
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icountlevel
+#CHECK: encoding: [0x20,0xed,0x61]
+xsr a2,icountlevel
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icountlevel
+#CHECK: encoding: [0x20,0xed,0x61]
+xsr.icountlevel a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, icountlevel
+#CHECK: encoding: [0x20,0xed,0x61]
+xsr a2, 237
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka0
+#CHECK: encoding: [0x20,0x80,0x61]
+xsr a2,ibreaka0
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka0
+#CHECK: encoding: [0x20,0x80,0x61]
+xsr.ibreaka0 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka0
+#CHECK: encoding: [0x20,0x80,0x61]
+xsr a2, 128
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka1
+#CHECK: encoding: [0x20,0x81,0x61]
+xsr a2,ibreaka1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka1
+#CHECK: encoding: [0x20,0x81,0x61]
+xsr.ibreaka1 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreaka1
+#CHECK: encoding: [0x20,0x81,0x61]
+xsr a2, 129
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka0
+#CHECK: encoding: [0x20,0x90,0x61]
+xsr a2,dbreaka0
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka0
+#CHECK: encoding: [0x20,0x90,0x61]
+xsr.dbreaka0 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka0
+#CHECK: encoding: [0x20,0x90,0x61]
+xsr a2, 144
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka1
+#CHECK: encoding: [0x20,0x91,0x61]
+xsr a2,dbreaka1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka1
+#CHECK: encoding: [0x20,0x91,0x61]
+xsr.dbreaka1 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreaka1
+#CHECK: encoding: [0x20,0x91,0x61]
+xsr a2, 145
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc0
+#CHECK: encoding: [0x20,0xa0,0x61]
+xsr a2,dbreakc0
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc0
+#CHECK: encoding: [0x20,0xa0,0x61]
+xsr.dbreakc0 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc0
+#CHECK: encoding: [0x20,0xa0,0x61]
+xsr a2, 160
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc1
+#CHECK: encoding: [0x20,0xa1,0x61]
+xsr a2,dbreakc1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc1
+#CHECK: encoding: [0x20,0xa1,0x61]
+xsr.dbreakc1 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, dbreakc1
+#CHECK: encoding: [0x20,0xa1,0x61]
+xsr a2, 161
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreakenable
+#CHECK: encoding: [0x20,0x60,0x61]
+xsr a2,ibreakenable
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreakenable
+#CHECK: encoding: [0x20,0x60,0x61]
+xsr.ibreakenable a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ibreakenable
+#CHECK: encoding: [0x20,0x60,0x61]
+xsr a2, 96
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, debugcause
+#CHECK: encoding: [0x20,0xe9,0x03]
+rsr a2,debugcause
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, debugcause
+#CHECK: encoding: [0x20,0xe9,0x03]
+rsr.debugcause a2
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, debugcause
+#CHECK: encoding: [0x20,0xe9,0x03]
+rsr a2, 233
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ddr
+#CHECK: encoding: [0x20,0x68,0x61]
+xsr a2,ddr
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ddr
+#CHECK: encoding: [0x20,0x68,0x61]
+xsr.ddr a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ddr
+#CHECK: encoding: [0x20,0x68,0x61]
+xsr a2, 104
diff --git a/llvm/test/MC/Xtensa/exception.s b/llvm/test/MC/Xtensa/exception.s
new file mode 100644
index 0000000000000..7084ddacf0136
--- /dev/null
+++ b/llvm/test/MC/Xtensa/exception.s
@@ -0,0 +1,100 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+exception \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# Instruction format RRR
+# CHECK-INST: excw
+# CHECK: encoding: [0x80,0x20,0x00]
+excw
+
+# Instruction format RRR
+# CHECK-INST: syscall
+# CHECK: encoding: [0x00,0x50,0x00]
+syscall
+
+# Instruction format RRR
+# CHECK-INST: rfe
+# CHECK: encoding: [0x00,0x30,0x00]
+rfe
+
+# Instruction format RRR
+# CHECK-INST: rfde
+# CHECK: encoding: [0x00,0x32,0x00]
+rfde
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, epc1
+# CHECK: encoding: [0x20,0xb1,0x61]
+xsr a2, epc1
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, epc1
+# CHECK: encoding: [0x20,0xb1,0x61]
+xsr.epc1 a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, epc1
+# CHECK: encoding: [0x20,0xb1,0x61]
+xsr a2, 177
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excsave1
+# CHECK: encoding: [0x20,0xd1,0x61]
+xsr a2, excsave1
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excsave1
+# CHECK: encoding: [0x20,0xd1,0x61]
+xsr.excsave1 a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excsave1
+# CHECK: encoding: [0x20,0xd1,0x61]
+xsr a2, 209
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, exccause
+# CHECK: encoding: [0x20,0xe8,0x61]
+xsr a2, exccause
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, exccause
+# CHECK: encoding: [0x20,0xe8,0x61]
+xsr.exccause a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, exccause
+# CHECK: encoding: [0x20,0xe8,0x61]
+xsr a2, 232
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excvaddr
+# CHECK: encoding: [0x20,0xee,0x61]
+xsr a2, excvaddr
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excvaddr
+# CHECK: encoding: [0x20,0xee,0x61]
+xsr.excvaddr a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, excvaddr
+# CHECK: encoding: [0x20,0xee,0x61]
+xsr a2, 238
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, depc
+# CHECK: encoding: [0x20,0xc0,0x61]
+xsr a2, depc
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, depc
+# CHECK: encoding: [0x20,0xc0,0x61]
+xsr.depc a2
+
+# Instruction format RRR
+# CHECK-INST: xsr a2, depc
+# CHECK: encoding: [0x20,0xc0,0x61]
+xsr a2, 192
diff --git a/llvm/test/MC/Xtensa/highinterrupts.s b/llvm/test/MC/Xtensa/highinterrupts.s
new file mode 100644
index 0000000000000..4908176b1b030
--- /dev/null
+++ b/llvm/test/MC/Xtensa/highinterrupts.s
@@ -0,0 +1,280 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+highpriinterrupts,+highpriinterrupts-level7 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# Instruction format RRR
+# CHECK-INST: rfi 1
+# CHECK: encoding: [0x10,0x31,0x00]
+rfi 1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc2
+#CHECK: encoding: [0x20,0xb2,0x61]
+xsr a2,epc2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc2
+#CHECK: encoding: [0x20,0xb2,0x61]
+xsr.epc2 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc2
+#CHECK: encoding: [0x20,0xb2,0x61]
+xsr a2, 178
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc3
+#CHECK: encoding: [0x20,0xb3,0x61]
+xsr a2,epc3
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc3
+#CHECK: encoding: [0x20,0xb3,0x61]
+xsr.epc3 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc3
+#CHECK: encoding: [0x20,0xb3,0x61]
+xsr a2, 179
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc4
+#CHECK: encoding: [0x20,0xb4,0x61]
+xsr a2,epc4
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc4
+#CHECK: encoding: [0x20,0xb4,0x61]
+xsr.epc4 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc4
+#CHECK: encoding: [0x20,0xb4,0x61]
+xsr a2, 180
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc5
+#CHECK: encoding: [0x20,0xb5,0x61]
+xsr a2,epc5
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc5
+#CHECK: encoding: [0x20,0xb5,0x61]
+xsr.epc5 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc5
+#CHECK: encoding: [0x20,0xb5,0x61]
+xsr a2, 181
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc6
+#CHECK: encoding: [0x20,0xb6,0x61]
+xsr a2,epc6
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc6
+#CHECK: encoding: [0x20,0xb6,0x61]
+xsr.epc6 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc6
+#CHECK: encoding: [0x20,0xb6,0x61]
+xsr a2, 182
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc7
+#CHECK: encoding: [0x20,0xb7,0x61]
+xsr a2,epc7
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc7
+#CHECK: encoding: [0x20,0xb7,0x61]
+xsr.epc7 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, epc7
+#CHECK: encoding: [0x20,0xb7,0x61]
+xsr a2, 183
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps2
+#CHECK: encoding: [0x20,0xc2,0x61]
+xsr a2,eps2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps2
+#CHECK: encoding: [0x20,0xc2,0x61]
+xsr.eps2 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps2
+#CHECK: encoding: [0x20,0xc2,0x61]
+xsr a2, 194
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps3
+#CHECK: encoding: [0x20,0xc3,0x61]
+xsr a2,eps3
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps3
+#CHECK: encoding: [0x20,0xc3,0x61]
+xsr.eps3 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps3
+#CHECK: encoding: [0x20,0xc3,0x61]
+xsr a2, 195
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps4
+#CHECK: encoding: [0x20,0xc4,0x61]
+xsr a2,eps4
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps4
+#CHECK: encoding: [0x20,0xc4,0x61]
+xsr.eps4 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps4
+#CHECK: encoding: [0x20,0xc4,0x61]
+xsr a2, 196
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps5
+#CHECK: encoding: [0x20,0xc5,0x61]
+xsr a2,eps5
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps5
+#CHECK: encoding: [0x20,0xc5,0x61]
+xsr.eps5 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps5
+#CHECK: encoding: [0x20,0xc5,0x61]
+xsr a2, 197
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps6
+#CHECK: encoding: [0x20,0xc6,0x61]
+xsr a2,eps6
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps6
+#CHECK: encoding: [0x20,0xc6,0x61]
+xsr.eps6 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps6
+#CHECK: encoding: [0x20,0xc6,0x61]
+xsr a2, 198
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps7
+#CHECK: encoding: [0x20,0xc7,0x61]
+xsr a2,eps7
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps7
+#CHECK: encoding: [0x20,0xc7,0x61]
+xsr.eps7 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, eps7
+#CHECK: encoding: [0x20,0xc7,0x61]
+xsr a2, 199
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave2
+#CHECK: encoding: [0x20,0xd2,0x61]
+xsr a2,excsave2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave2
+#CHECK: encoding: [0x20,0xd2,0x61]
+xsr.excsave2 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave2
+#CHECK: encoding: [0x20,0xd2,0x61]
+xsr a2, 210
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave3
+#CHECK: encoding: [0x20,0xd3,0x61]
+xsr a2,excsave3
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave3
+#CHECK: encoding: [0x20,0xd3,0x61]
+xsr.excsave3 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave3
+#CHECK: encoding: [0x20,0xd3,0x61]
+xsr a2, 211
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave4
+#CHECK: encoding: [0x20,0xd4,0x61]
+xsr a2,excsave4
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave4
+#CHECK: encoding: [0x20,0xd4,0x61]
+xsr.excsave4 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave4
+#CHECK: encoding: [0x20,0xd4,0x61]
+xsr a2, 212
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave5
+#CHECK: encoding: [0x20,0xd5,0x61]
+xsr a2,excsave5
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave5
+#CHECK: encoding: [0x20,0xd5,0x61]
+xsr.excsave5 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave5
+#CHECK: encoding: [0x20,0xd5,0x61]
+xsr a2, 213
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave6
+#CHECK: encoding: [0x20,0xd6,0x61]
+xsr a2,excsave6
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave6
+#CHECK: encoding: [0x20,0xd6,0x61]
+xsr.excsave6 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave6
+#CHECK: encoding: [0x20,0xd6,0x61]
+xsr a2, 214
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave7
+#CHECK: encoding: [0x20,0xd7,0x61]
+xsr a2,excsave7
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave7
+#CHECK: encoding: [0x20,0xd7,0x61]
+xsr.excsave7 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, excsave7
+#CHECK: encoding: [0x20,0xd7,0x61]
+xsr a2, 215
diff --git a/llvm/test/MC/Xtensa/interrupt.s b/llvm/test/MC/Xtensa/interrupt.s
new file mode 100644
index 0000000000000..cb1b82dbfe5aa
--- /dev/null
+++ b/llvm/test/MC/Xtensa/interrupt.s
@@ -0,0 +1,60 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+interrupt \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+# Instruction format RRR
+# CHECK-INST: rsil a2, 1
+# CHECK: encoding: [0x20,0x61,0x00]
+rsil a2, 1
+
+# Instruction format RRR
+# CHECK-INST: waiti 1
+# CHECK: encoding: [0x00,0x71,0x00]
+waiti 1
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, interrupt
+#CHECK: encoding: [0x20,0xe2,0x03]
+rsr a2, interrupt
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, interrupt
+#CHECK: encoding: [0x20,0xe2,0x03]
+rsr.interrupt a2
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, interrupt
+#CHECK: encoding: [0x20,0xe2,0x03]
+rsr a2, 226
+
+#Instruction format RRR
+#CHECK-INST: wsr a2, intclear
+#CHECK: encoding: [0x20,0xe3,0x13]
+wsr a2, intclear
+
+#Instruction format RRR
+#CHECK-INST: wsr a2, intclear
+#CHECK: encoding: [0x20,0xe3,0x13]
+wsr.intclear a2
+
+#Instruction format RRR
+#CHECK-INST: wsr a2, intclear
+#CHECK: encoding: [0x20,0xe3,0x13]
+wsr a2, 227
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, intenable
+#CHECK: encoding: [0x20,0xe4,0x61]
+xsr a2, intenable
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, intenable
+#CHECK: encoding: [0x20,0xe4,0x61]
+xsr.intenable a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, intenable
+#CHECK: encoding: [0x20,0xe4,0x61]
+xsr a2, 228
diff --git a/llvm/test/MC/Xtensa/prid.s b/llvm/test/MC/Xtensa/prid.s
new file mode 100644
index 0000000000000..75fcc151e8eff
--- /dev/null
+++ b/llvm/test/MC/Xtensa/prid.s
@@ -0,0 +1,20 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+prid \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, prid
+#CHECK: encoding: [0x20,0xeb,0x03]
+rsr a2,prid
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, prid
+#CHECK: encoding: [0x20,0xeb,0x03]
+rsr.prid a2
+
+#Instruction format RRR
+#CHECK-INST: rsr a2, prid
+#CHECK: encoding: [0x20,0xeb,0x03]
+rsr a2, 235
diff --git a/llvm/test/MC/Xtensa/timer.s b/llvm/test/MC/Xtensa/timer.s
new file mode 100644
index 0000000000000..f1fc9709cdec9
--- /dev/null
+++ b/llvm/test/MC/Xtensa/timer.s
@@ -0,0 +1,65 @@
+# RUN: llvm-mc %s -triple=xtensa -show-encoding --mattr=+timers3 \
+# RUN:     | FileCheck -check-prefixes=CHECK,CHECK-INST %s
+
+.align	4
+LBL0:
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccount
+#CHECK: encoding: [0x20,0xea,0x61]
+xsr a2,ccount
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccount
+#CHECK: encoding: [0x20,0xea,0x61]
+xsr.ccount a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccount
+#CHECK: encoding: [0x20,0xea,0x61]
+xsr a2, 234
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare0
+#CHECK: encoding: [0x20,0xf0,0x61]
+xsr a2,ccompare0
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare0
+#CHECK: encoding: [0x20,0xf0,0x61]
+xsr.ccompare0 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare0
+#CHECK: encoding: [0x20,0xf0,0x61]
+xsr a2, 240
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare1
+#CHECK: encoding: [0x20,0xf1,0x61]
+xsr a2,ccompare1
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare1
+#CHECK: encoding: [0x20,0xf1,0x61]
+xsr.ccompare1 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare1
+#CHECK: encoding: [0x20,0xf1,0x61]
+xsr a2, 241
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare2
+#CHECK: encoding: [0x20,0xf2,0x61]
+xsr a2,ccompare2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare2
+#CHECK: encoding: [0x20,0xf2,0x61]
+xsr.ccompare2 a2
+
+#Instruction format RRR
+#CHECK-INST: xsr a2, ccompare2
+#CHECK: encoding: [0x20,0xf2,0x61]
+xsr a2, 242

From 15482c83aa2b05779d7ad947c34835656ab9da1c Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Tue, 17 Jun 2025 19:58:24 -0400
Subject: [PATCH 835/851] [ElimAvailExtern] Add an option to allow to convert
 global variables in a specified address space to local (#144287)

Currently, the `EliminateAvailableExternallyPass` only converts certain
available externally functions to local if `avail-extern-to-local` is
set or in
contextual profiling mode. For global variables, it only drops their
initializers.

This PR adds an option to allow the pass to convert global variables in
a
specified address space to local. The motivation for this change is to
correctly
support lowering of LDS variables (`__shared__` variables, in more
generic
terminology) when ThinLTO is enabled for AMDGPU.

A `__shared__` variable is lowered to a hidden global variable in a
particular
address space by the frontend, which is roughly same as a `static` local
variable. To properly lower it in the backend, the compiler needs to
check all
its uses. Enabling ThinLTO currently breaks this when a function
containing a
`__shared__` variable is imported from another module. Even though the
global
variable is imported along with its associated function, and the
function is
privatized by the `EliminateAvailableExternallyPass`, the global
variable itself
is not.

It's safe to privatize such global variables, because they're _local_ to
their
associated functions. If the function itself is privatized, its
associated
global variables should also be privatized accordingly.
---
 llvm/lib/Transforms/IPO/ElimAvailExtern.cpp   | 36 ++++++++++++++++---
 .../convert-global-variables-to-local.ll      | 21 +++++++++++
 2 files changed, 53 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll

diff --git a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
index 718452fc02764..bc98f994f490c 100644
--- a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -35,8 +35,15 @@ static cl::opt<bool> ConvertToLocal(
     cl::desc("Convert available_externally into locals, renaming them "
              "to avoid link-time clashes."));
 
+static cl::opt<unsigned> ConvertGlobalVariableInAddrSpace(
+    "avail-extern-gv-in-addrspace-to-local", cl::Hidden,
+    cl::desc(
+        "Convert available_externally global variables into locals if they are "
+        "in specificed addrspace, renaming them to avoid link-time clashes."));
+
 STATISTIC(NumRemovals, "Number of functions removed");
-STATISTIC(NumConversions, "Number of functions converted");
+STATISTIC(NumFunctionsConverted, "Number of functions converted");
+STATISTIC(NumGlobalVariablesConverted, "Number of global variables converted");
 STATISTIC(NumVariables, "Number of global variables removed");
 
 void deleteFunction(Function &F) {
@@ -45,6 +52,10 @@ void deleteFunction(Function &F) {
   ++NumRemovals;
 }
 
+static std::string getNewName(Module &M, const GlobalValue &GV) {
+  return GV.getName().str() + ".__uniq" + getUniqueModuleId(&M);
+}
+
 /// Create a copy of the thinlto import, mark it local, and redirect direct
 /// calls to the copy. Only direct calls are replaced, so that e.g. indirect
 /// call function pointer tests would use the global identity of the function.
@@ -68,7 +79,7 @@ static void convertToLocalCopy(Module &M, Function &F) {
   // functions with the same name, but that just creates more trouble than
   // necessary e.g. distinguishing profiles or debugging. Instead, we append the
   // module identifier.
-  auto NewName = OrigName + ".__uniq" + getUniqueModuleId(&M);
+  std::string NewName = getNewName(M, F);
   F.setName(NewName);
   if (auto *SP = F.getSubprogram())
     SP->replaceLinkageName(MDString::get(F.getParent()->getContext(), NewName));
@@ -85,16 +96,33 @@ static void convertToLocalCopy(Module &M, Function &F) {
                        F.getAddressSpace(), OrigName, F.getParent());
   F.replaceUsesWithIf(Decl,
                       [&](Use &U) { return !isa<CallBase>(U.getUser()); });
-  ++NumConversions;
+  ++NumFunctionsConverted;
+}
+
+/// Similar to the function above, this is to convert an externally available
+/// global variable to local.
+static void convertToLocalCopy(Module &M, GlobalVariable &GV) {
+  assert(GV.hasAvailableExternallyLinkage());
+  GV.setName(getNewName(M, GV));
+  GV.setLinkage(GlobalValue::InternalLinkage);
+  ++NumGlobalVariablesConverted;
 }
 
 static bool eliminateAvailableExternally(Module &M, bool Convert) {
   bool Changed = false;
 
-  // Drop initializers of available externally global variables.
+  // If a global variable is available externally and in the specified address
+  // space, convert it to local linkage; otherwise, drop its initializer.
   for (GlobalVariable &GV : M.globals()) {
     if (!GV.hasAvailableExternallyLinkage())
       continue;
+    if (ConvertGlobalVariableInAddrSpace.getNumOccurrences() &&
+        GV.getAddressSpace() == ConvertGlobalVariableInAddrSpace &&
+        !GV.use_empty()) {
+      convertToLocalCopy(M, GV);
+      Changed = true;
+      continue;
+    }
     if (GV.hasInitializer()) {
       Constant *Init = GV.getInitializer();
       GV.setInitializer(nullptr);
diff --git a/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll b/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll
new file mode 100644
index 0000000000000..6995b97e79887
--- /dev/null
+++ b/llvm/test/Transforms/EliminateAvailableExternally/convert-global-variables-to-local.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -passes=elim-avail-extern -avail-extern-gv-in-addrspace-to-local=3 %s -o - | FileCheck %s
+
+@shared = internal addrspace(3) global i32 undef, align 4
+@shared.imported = available_externally hidden unnamed_addr addrspace(3) global i32 undef, align 4
+
+;.
+; CHECK: @shared = internal addrspace(3) global i32 undef, align 4
+; CHECK: @shared.imported.__uniq.[[UUID:.*]] = internal unnamed_addr addrspace(3) global i32 undef, align 4
+;.
+define void @foo(i32 %v) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i32 [[V:%.*]]) {
+; CHECK-NEXT:    store i32 [[V]], ptr addrspace(3) @shared, align 4
+; CHECK-NEXT:    store i32 [[V]], ptr addrspace(3) @shared.imported.__uniq.[[UUID]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 %v, ptr addrspace(3) @shared, align 4
+  store i32 %v, ptr addrspace(3) @shared.imported, align 4
+  ret void
+}

From 64155a32297f4884875783664ff13bec9ab376f5 Mon Sep 17 00:00:00 2001
From: Minding <77574923+Minding000@users.noreply.github.com>
Date: Wed, 18 Jun 2025 02:09:07 +0200
Subject: [PATCH 836/851] Added clarifying comment to 'LLVMLinkInMCJIT' and
 'LLVMLinkInInterpreter' (#92467)

Clarify that these functions are no-ops when linking to LLVM as a shared object.
---
 llvm/include/llvm-c/ExecutionEngine.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llvm/include/llvm-c/ExecutionEngine.h b/llvm/include/llvm-c/ExecutionEngine.h
index c5fc9bdb4d07f..2062cbf470d8b 100644
--- a/llvm/include/llvm-c/ExecutionEngine.h
+++ b/llvm/include/llvm-c/ExecutionEngine.h
@@ -33,7 +33,15 @@ LLVM_C_EXTERN_C_BEGIN
  * @{
  */
 
+/**
+ * Empty function used to force the linker to link MCJIT.
+ * Has no effect when called on a pre-built library (dylib interface).
+ */
 void LLVMLinkInMCJIT(void);
+/**
+ * Empty function used to force the linker to link the LLVM interpreter.
+ * Has no effect when called on a pre-built library (dylib interface).
+ */
 void LLVMLinkInInterpreter(void);
 
 typedef struct LLVMOpaqueGenericValue *LLVMGenericValueRef;

From abbdd1670d8b12dd72ec353b14e256619ff4694b Mon Sep 17 00:00:00 2001
From: Andrew Rogers <andrurogerz@gmail.com>
Date: Tue, 17 Jun 2025 17:21:40 -0700
Subject: [PATCH 837/851] [llvm] minor fixes for clang-cl Windows DLL build
 (#144386)

## Purpose

This patch makes a minor changes to LLVM and Clang so that LLVM can be
built as a Windows DLL with `clang-cl`. These changes were not required
for building a Windows DLL with MSVC.

## Background

The Windows DLL effort is tracked in #109483. Additional context is
provided in [this
discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307),
and documentation for `LLVM_ABI` and related annotations is found in the
LLVM repo
[here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst).

## Overview
Specific changes made in this patch:
- Remove `constexpr` fields that reference DLL exported symbols. These
symbols cannot be resolved at compile time when building a Windows DLL
using `clang-cl`, so they cannot be `constexpr`. Instead, they are made
`const` and initialized in the implementation file rather than at
declaration in the header.
- Annotate symbols now defined out-of-line with `LLVM_ABI` so they are
exported when building as a shared library.
- Explicitly add default copy assignment operator for `ELFFile` to
resolve a compiler warning.

## Validation

Local builds and tests to validate cross-platform compatibility. This
included llvm, clang, and lldb on the following configurations:

- Windows with MSVC
- Windows with Clang
- Linux with GCC
- Linux with Clang
- Darwin with Clang
---
 .../lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp  |  2 +-
 llvm/include/llvm/BinaryFormat/Dwarf.h               | 12 ++++++------
 llvm/include/llvm/Object/ELF.h                       |  4 +++-
 llvm/lib/BinaryFormat/Dwarf.cpp                      | 12 ++++++++++++
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp b/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
index 836fc375809ad..f965bfb590d80 100644
--- a/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
+++ b/clang/lib/StaticAnalyzer/Core/Z3CrosscheckVisitor.cpp
@@ -92,7 +92,7 @@ void Z3CrosscheckVisitor::finalizeVisitor(BugReporterContext &BRC,
   };
 
   auto AttemptOnce = [&](const llvm::SMTSolverRef &Solver) -> Z3Result {
-    constexpr auto getCurrentTime = llvm::TimeRecord::getCurrentTime;
+    auto getCurrentTime = llvm::TimeRecord::getCurrentTime;
     unsigned InitialRLimit = GetUsedRLimit(Solver);
     double Start = getCurrentTime(/*Start=*/true).getWallTime();
     std::optional<bool> IsSAT = Solver->check();
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.h b/llvm/include/llvm/BinaryFormat/Dwarf.h
index 2ead62025efa7..231b7ac17d75f 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.h
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.h
@@ -1191,32 +1191,32 @@ template <typename Enum> struct EnumTraits : public std::false_type {};
 
 template <> struct EnumTraits<Attribute> : public std::true_type {
   static constexpr char Type[3] = "AT";
-  static constexpr StringRef (*StringFn)(unsigned) = &AttributeString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<Form> : public std::true_type {
   static constexpr char Type[5] = "FORM";
-  static constexpr StringRef (*StringFn)(unsigned) = &FormEncodingString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<Index> : public std::true_type {
   static constexpr char Type[4] = "IDX";
-  static constexpr StringRef (*StringFn)(unsigned) = &IndexString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<Tag> : public std::true_type {
   static constexpr char Type[4] = "TAG";
-  static constexpr StringRef (*StringFn)(unsigned) = &TagString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<LineNumberOps> : public std::true_type {
   static constexpr char Type[4] = "LNS";
-  static constexpr StringRef (*StringFn)(unsigned) = &LNStandardString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 template <> struct EnumTraits<LocationAtom> : public std::true_type {
   static constexpr char Type[3] = "OP";
-  static constexpr StringRef (*StringFn)(unsigned) = &OperationEncodingString;
+  LLVM_ABI static StringRef (*const StringFn)(unsigned);
 };
 
 inline uint64_t computeTombstoneAddress(uint8_t AddressByteSize) {
diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h
index a0dc522e13cab..8d7545144dfd9 100644
--- a/llvm/include/llvm/Object/ELF.h
+++ b/llvm/include/llvm/Object/ELF.h
@@ -256,8 +256,10 @@ class ELFFile {
 public:
   LLVM_ELF_IMPORT_TYPES_ELFT(ELFT)
 
-  // Default ctor required to instantiate the template for DLL export.
+  // Default ctor and copy assignment operator required to instantiate the
+  // template for DLL export.
   ELFFile(const ELFFile &) = default;
+  ELFFile &operator=(const ELFFile &) = default;
 
   // This is a callback that can be passed to a number of functions.
   // It can be used to ignore non-critical errors (warnings), which is
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index b9b10a541b263..0d17dc175fed9 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -911,6 +911,18 @@ StringRef llvm::dwarf::RLEString(unsigned RLE) {
   }
 }
 
+StringRef (*const llvm::dwarf::EnumTraits<Tag>::StringFn)(unsigned) = TagString;
+StringRef (*const llvm::dwarf::EnumTraits<Attribute>::StringFn)(unsigned) =
+    AttributeString;
+StringRef (*const llvm::dwarf::EnumTraits<Form>::StringFn)(unsigned) =
+    FormEncodingString;
+StringRef (*const llvm::dwarf::EnumTraits<LocationAtom>::StringFn)(unsigned) =
+    OperationEncodingString;
+StringRef (*const llvm::dwarf::EnumTraits<LineNumberOps>::StringFn)(unsigned) =
+    LNStandardString;
+StringRef (*const llvm::dwarf::EnumTraits<Index>::StringFn)(unsigned) =
+    IndexString;
+
 constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[];
 constexpr char llvm::dwarf::EnumTraits<Form>::Type[];
 constexpr char llvm::dwarf::EnumTraits<Index>::Type[];

From 99e263228f4513c166f20469968b2b646edaaa33 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 09:28:24 +0900
Subject: [PATCH 838/851] github: Add mips backend to PR autolabeler (#140909)

---
 .github/new-prs-labeler.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml
index 162161ff13fb0..2f8d5745668d9 100644
--- a/.github/new-prs-labeler.yml
+++ b/.github/new-prs-labeler.yml
@@ -777,6 +777,10 @@ backend:NVPTX:
   - 'llvm/**/*nvptx*/**'
   - 'llvm/**/*NVPTX*/**'
 
+backend:MIPS:
+  - '**/*mips*'
+  - '**/*Mips*'
+
 backend:RISC-V:
   - clang/**/*riscv*
   - clang/**/*RISCV*

From 4e090b6e84e33e2a442e3951253ca570f8f842f8 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda@apple.com>
Date: Tue, 17 Jun 2025 17:34:09 -0700
Subject: [PATCH 839/851] [lldb] Re-insert code to search for a binary by
 filepath if provided

July 14 2024 I landed a change to update progress reporting when
loading kernel/firmware binaries
https://github.com/llvm/llvm-project/pull/98845
In DynamicLoader::LoadBinaryWithUUIDAndAddress I removed code that
was setting the ModuleSpec to the provided name, if the name provided
is that of a file on disk.  With this code missing, if a filepath
name is passed in, this code will fail to find that binary on the local
disk.  There's nothing in the PR / intention that would lead to this
change, it was unintentional.
---
 lldb/source/Core/DynamicLoader.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp
index 291e6b73a2c39..4be9f3eb9abc5 100644
--- a/lldb/source/Core/DynamicLoader.cpp
+++ b/lldb/source/Core/DynamicLoader.cpp
@@ -229,6 +229,8 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress(
   ModuleSpec module_spec;
   module_spec.GetUUID() = uuid;
   FileSpec name_filespec(name);
+  if (FileSystem::Instance().Exists(name_filespec))
+    module_spec.GetFileSpec() = name_filespec;
 
   if (uuid.IsValid()) {
     Progress progress("Locating binary", prog_str.GetString().str());

From 86a09f36154fbd264f61ea6462c8cf48b1ff2eb0 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li@intel.com>
Date: Tue, 17 Jun 2025 17:48:09 -0700
Subject: [PATCH 840/851] [MLIR][XeGPU] Clean up xegpu op tests  (#144592)

Test cleanup:
1) separate layout.mlir from ops.mlir for layout related test
2) remove lane layout for ops working at work item scope.
3) remove redundant test in create_tdesc/update_tdesc/prefetch.
4) remove "test_" from all test function name.
---
 mlir/test/Dialect/XeGPU/invalid.mlir          |  96 ++---
 mlir/test/Dialect/XeGPU/layout.mlir           |  49 +++
 mlir/test/Dialect/XeGPU/ops.mlir              | 355 +++++-------------
 .../XeGPU/subgroup-map-propagation.mlir       |  72 ++--
 mlir/test/Dialect/XeGPU/xegpu-blocking.mlir   |  12 +-
 .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir |  32 +-
 mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir   |  40 +-
 7 files changed, 275 insertions(+), 381 deletions(-)
 create mode 100644 mlir/test/Dialect/XeGPU/layout.mlir

diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index b05c317231ad9..0a37ae70b5d99 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s -split-input-file -verify-diagnostics
 
 // -----
-func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) {
+func.func @create_nd_tdesc_vc_1(%src: memref<24xf32>) {
   // expected-error@+1 {{Expecting the TensorDesc rank is up to 2 and not greater than the ranks of shape, strides, offsets or the memref source}}
   %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32>
   return
@@ -9,49 +9,49 @@ func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) {
 
 // -----
 
-func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) {
+func.func @create_nd_tdesc_vc_2(%src: memref<24x32xf32>) {
   // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}}
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) {
+func.func @create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) {
   // expected-error@+1 {{SLM is not supported for 2D block tensor}}
   %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) {
+func.func @create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) {
   // expected-error@+1 {{Memory space mismatch}}
   %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
+func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
   // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [24, 48]>}}
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [24, 48]>>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
+func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
   // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [24, 48]>}}
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [24, 48]>>
   return
 }
 
 // -----
-func.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
+func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
   // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [64, 32]>}}
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [64, 32]>>
   return
 }
 
 // -----
-func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) {
+func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
   xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<8x16xf16>
@@ -59,7 +59,7 @@ func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) {
+func.func @prefetch_nd_vc_2(%src: memref<24xf16>) {
   %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<24xf16>, vector<8xindex>
                 -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>>
@@ -70,7 +70,7 @@ func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) {
 }
 
 // -----
-func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) {
+func.func @load_nd_vc_1(%src: memref<8x16xf16>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint<write_back>}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<write_back>}>
@@ -79,7 +79,7 @@ func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) {
 }
 
 // -----
-func.func @test_load_nd_vc_2(%src: memref<16xf16>) {
+func.func @load_nd_vc_2(%src: memref<16xf16>) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<16xf16>, vector<8xindex>
           -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
@@ -90,7 +90,7 @@ func.func @test_load_nd_vc_2(%src: memref<16xf16>) {
 }
 
 // -----
-func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
+func.func @load_nd_vc_3(%src: memref<8x16xf16>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   // expected-warning@+1 {{Invalid Packed Attr.}}
   %2 = xegpu.load_nd %1 <{packed, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
@@ -99,7 +99,7 @@ func.func @test_load_nd_vc_3(%src: memref<8x16xf16>) {
 }
 
 // -----
-func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) {
+func.func @load_nd_vc_4(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
     !xegpu.tensor_desc<8x16xf32>
   // expected-error@+1 {{Result shape [8, 1] is not consistent with tensor descriptor}}
@@ -110,7 +110,7 @@ func.func @test_load_nd_vc_4(%src: memref<24x32xf32>) {
 }
 
 // -----
-func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
+func.func @load_nd_layout(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
   // expected-error@+1 {{Result shape [3] is not a valid distribution for tensor descriptor}}
   %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint<cached>,
@@ -119,7 +119,7 @@ func.func @test_load_nd_layout(%src: memref<24x32xf32>) {
 }
 
 // -----
-func.func @test_load_nd_simt(%src: memref<24x32xf32>) {
+func.func @load_nd_simt(%src: memref<24x32xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}}
   %2 = xegpu.load_nd %1 : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8xf32>
@@ -127,7 +127,7 @@ func.func @test_load_nd_simt(%src: memref<24x32xf32>) {
 }
 
 // -----
-func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) {
+func.func @store_nd_vc_1(%dst: memref<24x32xf16>) {
   %1 = arith.constant dense<1.0>: vector<24x32xf16>
   %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
   // expected-error@+1 {{invalid l1_hint: #xegpu.cache_hint<streaming>}}
@@ -136,7 +136,7 @@ func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
+func.func @store_nd_vc_2(%dst: memref<16xf16>) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = arith.constant dense<1.0>: vector<8x2xf16>
   %2 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
@@ -148,7 +148,7 @@ func.func @test_store_nd_vc_2(%dst: memref<16xf16>) {
 }
 
 // -----
-func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) {
+func.func @store_nd_vc_3(%dst: memref<24x32xf16>) {
   %1 = arith.constant dense<1.0>: vector<2x24x32xf16>
   %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // expected-error@+1 {{array length is not supported by store_nd}}
@@ -157,7 +157,7 @@ func.func @test_store_nd_vc_3(%dst: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
+func.func @store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16xf32>
   // expected-error@+1 {{Value shape [3] is not a valid distribution for tensor descriptor}}
   xegpu.store_nd %data, %1 : vector<3xf32>, !xegpu.tensor_desc<16xf32>
@@ -165,7 +165,7 @@ func.func @test_store_nd_simt(%dst: memref<24x32xf32>, %data: vector<3xf32>) {
 }
 
 // -----
-func.func @test_store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) {
+func.func @store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
   // expected-error@+1 {{TensorDesc doesn't need LayoutAttr for SIMT code}}
   xegpu.store_nd %data, %1 : vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
@@ -173,7 +173,7 @@ func.func @test_store_nd_simt(%src: memref<24x32xf32>, %data: vector<8xf32>) {
 }
 
 // -----
-func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
+func.func @store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
   %1 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf32> ->
     !xegpu.tensor_desc<8x16xf32>
   // expected-error@+1 {{Value shape [8, 1] is not consistent with tensor descriptor}}
@@ -182,7 +182,7 @@ func.func @test_store_nd_vc_5(%dst: memref<24x32xf32>, %data: vector<8x1xf32>) {
 }
 
 // -----
-func.func @test_update_nd_offset_1(%dst: memref<16xf16>) {
+func.func @update_nd_offset_1(%dst: memref<16xf16>) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = xegpu.create_tdesc %dst, %0 : memref<16xf16>, vector<8xindex>
             -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
@@ -192,7 +192,7 @@ func.func @test_update_nd_offset_1(%dst: memref<16xf16>) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_1(%src: ui64) {
+func.func @create_tdesc_vc_1(%src: ui64) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   // expected-error@+1 {{Expects a scattered TensorDesc}}
   %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex> -> !xegpu.tensor_desc<8xf16>
@@ -200,7 +200,7 @@ func.func @test_create_tdesc_vc_1(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_2(%src: ui64) {
+func.func @create_tdesc_vc_2(%src: ui64) {
   %0 = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14]> : vector<8xindex>
   %1 = xegpu.create_tdesc %src, %0 : ui64, vector<8xindex>
   // expected-error@+1 {{expected chunk blocks for 2D tensor}}
@@ -209,7 +209,7 @@ func.func @test_create_tdesc_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_3(%src: memref<?xf32>) {
+func.func @create_tdesc_vc_3(%src: memref<?xf32>) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{Memory space mismatch}}
   %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
@@ -218,7 +218,7 @@ func.func @test_create_tdesc_vc_3(%src: memref<?xf32>) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_4(%src: memref<?xf32>) {
+func.func @create_tdesc_vc_4(%src: memref<?xf32>) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
   // expected-error@+1 {{invalid chunk size}}
@@ -227,7 +227,7 @@ func.func @test_create_tdesc_vc_4(%src: memref<?xf32>) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_5(%src: memref<?xf32>) {
+func.func @create_tdesc_vc_5(%src: memref<?xf32>) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>
   // expected-error@+1 {{expected tensor shape[1] to match chunk size}}
@@ -236,7 +236,7 @@ func.func @test_create_tdesc_vc_5(%src: memref<?xf32>) {
 }
 
 // -----
-func.func @test_create_tdesc_vc_6(%src: memref<?xf16>) {
+func.func @create_tdesc_vc_6(%src: memref<?xf16>) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %0 : memref<?xf16>, vector<4xindex>
   // expected-error@+1 {{tensor shape[1] to be a multiple of packing factor 2}}
@@ -246,7 +246,7 @@ func.func @test_create_tdesc_vc_6(%src: memref<?xf16>) {
 
 
 // -----
-func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) {
+func.func @prefetch_vc_1(%src: memref<24x32xf16>) {
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
   // expected-error@+1 {{Expects a scattered TensorDesc}}
   xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<write_back>}>: !xegpu.tensor_desc<24x32xf16>
@@ -254,7 +254,7 @@ func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_prefetch_vc_2(%src: ui64) {
+func.func @prefetch_vc_2(%src: ui64) {
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex>
           -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
@@ -264,7 +264,7 @@ func.func @test_prefetch_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_layout_1(%src: ui64) {
+func.func @create_tdesc_layout_1(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{expected layout rank to match tensor rank}}
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>,   #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
@@ -272,7 +272,7 @@ func.func @test_create_tdesc_layout_1(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_layout_2(%src: ui64) {
+func.func @create_tdesc_layout_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{cannot map over non-contiguous scattered row elements}}
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>,   #xegpu.layout<lane_layout = [1, 4], lane_data = [2, 1]>>
@@ -280,7 +280,7 @@ func.func @test_create_tdesc_layout_2(%src: ui64) {
 }
 
 // -----
-func.func @test_create_tdesc_layout_3(%src: ui64) {
+func.func @create_tdesc_layout_3(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   // expected-error@+1 {{work item data mapping must match the number of contiguous elements}}
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x3xf32, #xegpu.scatter_tdesc_attr<chunk_size = 3>,   #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
@@ -288,7 +288,7 @@ func.func @test_create_tdesc_layout_3(%src: ui64) {
 }
 
 // -----
-func.func @test_load_gather_simt_1(%src: ui64) {
+func.func @load_gather_simt_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>>
@@ -298,7 +298,7 @@ func.func @test_load_gather_simt_1(%src: ui64) {
 }
 
 // -----
-func.func @test_store_scatter_simt_1(%src: ui64) {
+func.func @store_scatter_simt_1(%src: ui64) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %val = arith.constant dense<2.9>: vector<6xf32>
@@ -309,7 +309,7 @@ func.func @test_store_scatter_simt_1(%src: ui64) {
 }
 
 // -----
-func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) {
+func.func @load_gather_vc_1(%src: memref<24x32xf16>) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16>
   // expected-error@+1 {{Expects a scattered TensorDesc}}
@@ -319,7 +319,7 @@ func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) {
 }
 
 // -----
-func.func @test_load_gather_vc_2(%src: ui64) {
+func.func @load_gather_vc_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = xegpu.create_tdesc %src, %cst : ui64, vector<4xindex>
@@ -332,7 +332,7 @@ func.func @test_load_gather_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) {
+func.func @store_scatter_vc_1(%src: memref<24x32xf32>) {
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = arith.constant dense<2.9>: vector<4x2xf32>
   %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32>
@@ -343,7 +343,7 @@ func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) {
 }
 
 // -----
-func.func @test_store_scatter_vc_2(%src: ui64) {
+func.func @store_scatter_vc_2(%src: ui64) {
   %cst = arith.constant dense<[0, 8, 16, 24]>: vector<4xindex>
   %0 = arith.constant dense<1>: vector<4xi1>
   %1 = arith.constant dense<2.9>: vector<4x2xf32>
@@ -356,49 +356,49 @@ func.func @test_store_scatter_vc_2(%src: ui64) {
 }
 
 // -----
-func.func @test_dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
+func.func @dpas_vc_1(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
   // expected-error@+1 {{K-dimension mismatch}}
   %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {
+func.func @dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {
   // expected-error@+1 {{expecting lhs and result to be a 2D vector, and rhs to be either 2D or 3D (packed) vector}}
   %1 = xegpu.dpas %a, %b : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
+func.func @dpas_3(%a : vector<8x8xf16>, %b: vector<8x16x2xf16>) {
   // expected-error@+1 {{K-dimension mismatch}}
   %1 = xegpu.dpas %a, %b : vector<8x8xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) {
+func.func @dpas_4(%a : vector<16x16xf16>, %b: vector<8x16x2xf16>) {
   // expected-error@+1 {{M-dimension mismatch}}
   %1 = xegpu.dpas %a, %b : vector<16x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
+func.func @dpas_5(%a : vector<8x16xf16>, %b: vector<8x8x2xf16>) {
   // expected-error@+1 {{N-dimension mismatch}}
   %1 = xegpu.dpas %a, %b : vector<8x16xf16>, vector<8x8x2xf16> -> vector<8x16xf32>
   return
 }
 
 // -----
-func.func @test_dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) {
+func.func @dpas_simt_1(%a : vector<8xf16>, %b: vector<15xf16>) {
   // expected-error@+1 {{Expecting B operand to be a multiple of 32 bits}}
   %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<15xf16> -> vector<8xf32>
   return
 }
 
 // -----
-func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
+func.func @atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) {
   %0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
   %1 = xegpu.create_tdesc %src, %0 : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
   // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}}
@@ -512,7 +512,7 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
 }
 
 // -----
-func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
+func.func @convert_layout_same_map(%a: vector<32x64xf16>) {
   // expected-error@+1 {{expected different srcMap and resMap}}
   %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
                                 resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
@@ -520,7 +520,7 @@ func.func @test_convert_layout_same_map(%a: vector<32x64xf16>) {
 }
 
 // -----
-func.func @test_convert_layout_unmatch(%a: vector<32x64xf16>) {
+func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
   // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
   %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
                                 resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
new file mode 100644
index 0000000000000..7f3ebec225cdf
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -0,0 +1,49 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) {
+gpu.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
+  gpu.return
+}
+
+// CHECK: gpu.func @create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) {
+gpu.func @create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
+  gpu.return
+}
+
+// CHECK: gpu.func @create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) {
+gpu.func @create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
+  gpu.return
+}
+
+// CHECK: gpu.func @create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
+  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
+  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
+  gpu.return
+}
+
+gpu.func @convert_layout(%a: vector<32x64xf16>) {
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
+                                resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+  gpu.return
+}
+
+gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
+  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
+                                resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
+  gpu.return
+}
+
+}
diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir
index 76af59d6aedc7..054c4d12fdb28 100644
--- a/mlir/test/Dialect/XeGPU/ops.mlir
+++ b/mlir/test/Dialect/XeGPU/ops.mlir
@@ -6,23 +6,15 @@
 
 // CHECK-LABEL: gpu.module @test {
 gpu.module @test {
-// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+// CHECK: gpu.func @create_nd_tdesc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_1(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_simt_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> ->
-    !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
-gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+// CHECK: gpu.func @create_nd_tdesc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
   //CHECK: %[[C:.*]] = arith.constant 1 : index
   %c1 = arith.constant 1 : index
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
@@ -30,94 +22,41 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
-gpu.func @test_create_nd_tdesc_simt_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
-  //CHECK: %[[C:.*]] = arith.constant 1 : index
-  %c1 = arith.constant 1 : index
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+// CHECK: gpu.func @create_nd_tdesc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @create_nd_tdesc_3(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_simt_3(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_vc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) {
+// CHECK: gpu.func @create_nd_tdesc_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
+gpu.func @create_nd_tdesc_4(%src: memref<2x24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_4(%[[arg0:.*]]: memref<2x24x32xf32>) {
-gpu.func @test_create_nd_tdesc_simt_4(%src: memref<2x24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
-gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) {
+// CHECK: gpu.func @create_nd_tdesc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
+gpu.func @create_nd_tdesc_5(%src: memref<2x24x32xf32, 3>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
   %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) {
-gpu.func @test_create_nd_tdesc_simt_5(%src: memref<2x24x32xf32, 3>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr<memory_space = slm>, #xegpu.layout<lane_layout = [16], lane_data = [1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_vc_6(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_vc_6(%src: memref<24x32xf32>) {
+// CHECK: gpu.func @create_nd_tdesc_6(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @create_nd_tdesc_6(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>>
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_simt_6(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_simt_6(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2 : i64>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr<array_length = 2>, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_subgroup_1(%[[arg0:.*]]: memref<128x128xf32>) {
-gpu.func @test_create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_nd_tdesc_subgroup_2(%[[arg0:.*]]: memref<128x128xf32>) {
-gpu.func @test_create_nd_tdesc_subgroup_2(%src: memref<128x128xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_create_nd_tdesc_subgroup_3(%[[arg0:.*]]: memref<128x128xf32>) {
-gpu.func @test_create_nd_tdesc_subgroup_3(%src: memref<128x128xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout<sg_layout = [4, 2], sg_data = [32, 64], inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+// CHECK: gpu.func @prefetch_nd(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @prefetch_nd(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
   // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
@@ -125,17 +64,9 @@ gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_prefetch_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_prefetch_nd_simt(%src: memref<24x32xf16>) {
-  // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+// CHECK: func @subgroup_load_nd(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @subgroup_load_nd(%src: memref<8x16xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
@@ -144,8 +75,8 @@ gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
+// CHECK: func @simt_load_nd(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @simt_load_nd(%src: memref<8x16xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16>
@@ -154,8 +85,8 @@ gpu.func @test_load_nd_simt(%src: memref<8x16xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_2(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
+// CHECK: func @subgroup_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @subgroup_load_nd_2(%src: memref<8x16xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<16xf16>
@@ -163,8 +94,8 @@ gpu.func @test_load_nd_vc_2(%src: memref<8x16xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_2(%[[arg0:.*]]: memref<8x16xf16>) {
-gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
+// CHECK: func @simt_load_nd_2(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @simt_load_nd_2(%src: memref<8x16xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16xf16> -> vector<1xf16>
@@ -172,8 +103,8 @@ gpu.func @test_load_nd_simt_2(%src: memref<8x16xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
+// CHECK: func @subgroup_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @subgroup_load_nd_3(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32>
@@ -181,8 +112,8 @@ gpu.func @test_load_nd_vc_3(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_3(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
+// CHECK: func @simt_load_nd_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @simt_load_nd_3(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
@@ -190,8 +121,8 @@ gpu.func @test_load_nd_simt_3(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_4(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
+// CHECK: func @subgroup_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_load_nd_4(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
@@ -199,8 +130,8 @@ gpu.func @test_load_nd_vc_4(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_4(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
+// CHECK: func @simt_load_nd_4(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_load_nd_4(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16>
@@ -208,8 +139,8 @@ gpu.func @test_load_nd_simt_4(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_5(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
+// CHECK: func @subgroup_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @subgroup_load_nd_5(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<32xf32>
@@ -217,8 +148,8 @@ gpu.func @test_load_nd_vc_5(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_5(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
+// CHECK: func @simt_load_nd_5(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @simt_load_nd_5(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<32xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<32xf32> -> vector<2xf32>
@@ -226,8 +157,8 @@ gpu.func @test_load_nd_simt_5(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_6(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
+// CHECK: func @subgroup_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_load_nd_6(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x16x16xf16>
@@ -235,8 +166,8 @@ gpu.func @test_load_nd_vc_6(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_6(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
+// CHECK: func @simt_load_nd_6(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_load_nd_6(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
@@ -245,8 +176,8 @@ gpu.func @test_load_nd_simt_6(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_7(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
+// CHECK: func @subgroup_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_load_nd_7(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, packed}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x8x16x2xf16>
@@ -254,8 +185,8 @@ gpu.func @test_load_nd_vc_7(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_7(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
+// CHECK: func @simt_load_nd_7(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_load_nd_7(%src: memref<24x32xf16>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2>>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<32xf16>
@@ -264,8 +195,8 @@ gpu.func @test_load_nd_simt_7(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_vc_8(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
+// CHECK: func @subgroup_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @subgroup_load_nd_8(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32>
@@ -273,8 +204,8 @@ gpu.func @test_load_nd_vc_8(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_load_nd_simt_8(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
+// CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) {
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32>
   // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8xf32>
@@ -282,8 +213,8 @@ gpu.func @test_load_nd_simt_8(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+// CHECK: func @subgroup_store_nd(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_store_nd(%dst: memref<24x32xf16>) {
   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
   %1 = arith.constant dense<1.0>: vector<24x32xf16>
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
@@ -293,8 +224,8 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: func @test_store_nd_simt(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
+// CHECK: func @simt_store_nd(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_store_nd(%src: memref<24x32xf16>) {
   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<48xf16>
   %1 = arith.constant dense<1.0>: vector<48xf16>
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
@@ -306,8 +237,8 @@ gpu.func @test_store_nd_simt(%src: memref<24x32xf16>) {
 
 
-// CHECK: func @test_store_nd_vc_2(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
+// CHECK: func @subgroup_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @subgroup_store_nd_2(%dst: memref<24x32xf16>) {
   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<32xf16>
   %1 = arith.constant dense<1.0>: vector<32xf16>
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
@@ -318,8 +249,8 @@ gpu.func @test_store_nd_vc_2(%dst: memref<24x32xf16>) {
 }
 
 
-// CHECK: func @test_store_nd_simt_2(%[[arg0:.*]]: memref<24x32xf16>) {
-gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
+// CHECK: func @simt_store_nd_2(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @simt_store_nd_2(%src: memref<24x32xf16>) {
   // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<2xf16>
   %1 = arith.constant dense<1.0>: vector<2xf16>
   // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<32xf16>
@@ -329,8 +260,8 @@ gpu.func @test_store_nd_simt_2(%src: memref<24x32xf16>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
+// CHECK: gpu.func @update_nd_tdesc(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @update_nd_tdesc(%src: memref<24x32xf32>) {
   // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
   // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32>
@@ -338,17 +269,9 @@ gpu.func @test_update_nd_tdesc_vc(%src: memref<24x32xf32>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_update_nd_tdesc_simt(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_update_nd_tdesc_simt(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_tdesc_vc(%src: ui64) {
+// CHECK: gpu.func @create_tdesc(%[[arg0:.*]]: ui64) {
+gpu.func @create_tdesc(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
@@ -356,18 +279,9 @@ gpu.func @test_create_tdesc_vc(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_tdesc_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_tdesc_simt(%src: ui64) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  gpu.return
-}
-
 
-// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
-gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
+// CHECK: gpu.func @create_tdesc_1(%[[arg0:.*]]: memref<?xf32, 3>) {
+gpu.func @create_tdesc_1(%src: memref<?xf32, 3>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>>
@@ -375,18 +289,9 @@ gpu.func @test_create_tdesc_vc_1(%src: memref<?xf32, 3>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_tdesc_simt_1(%[[arg0:.*]]: memref<?xf32, 3>) {
-gpu.func @test_create_tdesc_simt_1(%src: memref<?xf32, 3>) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32, 3>, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space =  slm, chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32, 3>, vector<4xindex>  -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  gpu.return
-}
-
 
-// CHECK: gpu.func @test_create_tdesc_vc_2(%[[arg0:.*]]: memref<?xf32>) {
-gpu.func @test_create_tdesc_vc_2(%src: memref<?xf32>) {
+// CHECK: gpu.func @create_tdesc_2(%[[arg0:.*]]: memref<?xf32>) {
+gpu.func @create_tdesc_2(%src: memref<?xf32>) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
@@ -394,17 +299,9 @@ gpu.func @test_create_tdesc_vc_2(%src: memref<?xf32>) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_tdesc_simt_2(%[[arg0:.*]]: memref<?xf32>) {
-gpu.func @test_create_tdesc_simt_2(%src: memref<?xf32>) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : memref<?xf32>, vector<4xindex> -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>
-  %1 = xegpu.create_tdesc %src, %0 : memref<?xf32>, vector<4xindex>  -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<chunk_size = 1>, #xegpu.layout<lane_layout = [4], lane_data = [1]>>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_tdesc_vc_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_tdesc_vc_3(%src: ui64) {
+// CHECK: gpu.func @create_tdesc_3(%[[arg0:.*]]: ui64) {
+gpu.func @create_tdesc_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
@@ -413,17 +310,8 @@ gpu.func @test_create_tdesc_vc_3(%src: ui64) {
 }
 
 
-// CHECK: gpu.func @test_create_tdesc_simt_3(%arg0: ui64) {
-gpu.func @test_create_tdesc_simt_3(%src: ui64) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf16, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 2]>>
-  gpu.return
-}
-
-// CHECK: gpu.func @test_load_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_vc(%src: ui64) {
+// CHECK: gpu.func @subgroup_load(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_load(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -435,8 +323,8 @@ gpu.func @test_load_vc(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_simt(%src: ui64) {
+// CHECK: gpu.func @simt_load(%[[arg0:.*]]: ui64) {
+gpu.func @simt_load(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -448,8 +336,8 @@ gpu.func @test_load_simt(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_vc_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_vc_2(%src: ui64) {
+// CHECK: gpu.func @subgroup_load_2(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_load_2(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -461,8 +349,8 @@ gpu.func @test_load_vc_2(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_simt_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_simt_2(%src: ui64) {
+// CHECK: gpu.func @simt_load_2(%[[arg0:.*]]: ui64) {
+gpu.func @simt_load_2(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -474,8 +362,8 @@ gpu.func @test_load_simt_2(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_vc_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_vc_3(%src: ui64) {
+// CHECK: gpu.func @subgroup_load_3(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_load_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -487,8 +375,8 @@ gpu.func @test_load_vc_3(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_load_simt_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_load_simt_3(%src: ui64) {
+// CHECK: gpu.func @simt_load_3(%[[arg0:.*]]: ui64) {
+gpu.func @simt_load_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -500,8 +388,8 @@ gpu.func @test_load_simt_3(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_store_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_vc(%src: ui64) {
+// CHECK: gpu.func @subgroup_store(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_store(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -517,8 +405,8 @@ gpu.func @test_store_vc(%src: ui64) {
 
 
-// CHECK: gpu.func @test_store_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt(%src: ui64) {
+// CHECK: gpu.func @simt_store(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -532,8 +420,8 @@ gpu.func @test_store_simt(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_store_vc_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_vc_2(%src: ui64) {
+// CHECK: gpu.func @subgroup_store_2(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_store_2(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -549,8 +437,8 @@ gpu.func @test_store_vc_2(%src: ui64) {
 
 
-// CHECK: gpu.func @test_store_simt_2(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt_2(%src: ui64) {
+// CHECK: gpu.func @simt_store_2(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store_2(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -564,8 +452,8 @@ gpu.func @test_store_simt_2(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_store_vc_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_vc_3(%src: ui64) {
+// CHECK: gpu.func @subgroup_store_3(%[[arg0:.*]]: ui64) {
+gpu.func @subgroup_store_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -580,8 +468,8 @@ gpu.func @test_store_vc_3(%src: ui64) {
 }
 
 
-// CHECK: gpu.func @test_store_simt_3(%[[arg0:.*]]: ui64) {
-gpu.func @test_store_simt_3(%src: ui64) {
+// CHECK: gpu.func @simt_store_3(%[[arg0:.*]]: ui64) {
+gpu.func @simt_store_3(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[cst1:.*]] = arith.constant dense<true> : vector<4xi1>
@@ -595,20 +483,8 @@ gpu.func @test_store_simt_3(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_prefetch_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_prefetch_simt(%src: ui64) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  gpu.return
-}
-
-
-// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_prefetch_vc(%src: ui64) {
+// CHECK: gpu.func @prefetch(%[[arg0:.*]]: ui64) {
+gpu.func @prefetch(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
@@ -618,21 +494,9 @@ gpu.func @test_prefetch_vc(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_update_tdesc_simt(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_update_tdesc_simt(%src: ui64) {
-  //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
-  //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], %[[st]] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
-  %0 = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
-  %1 = xegpu.create_tdesc %src, %0 : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>
-  %s = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
-  %2 = xegpu.update_offset %1, %s : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2>, #xegpu.layout<lane_layout = [4, 1], lane_data = [1, 1]>>, vector<4xindex>
-  gpu.return
-}
 
-// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) {
-gpu.func @test_create_update_tdesc_vc(%src: ui64) {
+// CHECK: gpu.func @create_update_tdesc(%[[arg0:.*]]: ui64) {
+gpu.func @create_update_tdesc(%src: ui64) {
   //CHECK: %[[cst:.*]] = arith.constant dense<[0, 8, 16, 24]> : vector<4xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[cst]] : ui64, vector<4xindex> -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<chunk_size = 2 : i64>>
   //CHECK: %[[st:.*]] = arith.constant dense<32> : vector<4xindex>
@@ -644,29 +508,29 @@ gpu.func @test_create_update_tdesc_vc(%src: ui64) {
   gpu.return
 }
 
-// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
-gpu.func @test_dpas_vc(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
+// CHECK: gpu.func @subgroup_dpas(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<16x16xf16>)
+gpu.func @subgroup_dpas(%a : vector<8x16xf16>, %b: vector<16x16xf16>) {
   // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
   %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_dpas_simt(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>)
-gpu.func @test_dpas_simt(%a : vector<8xf16>, %b: vector<16xf16>) {
+// CHECK: gpu.func @simt_dpas(%[[arg0:.*]]: vector<8xf16>, %[[arg1:.*]]: vector<16xf16>)
+gpu.func @simt_dpas(%a : vector<8xf16>, %b: vector<16xf16>) {
   // CHECK: xegpu.dpas %[[arg0]], %[[arg1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32>
   %1 = xegpu.dpas %a, %b : vector<8xf16>, vector<16xf16> -> vector<8xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_dpas_vc_with_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
-gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
+// CHECK: gpu.func @subgroup_dpas_packed_b(%[[arg0:.*]]: vector<8x16xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
+gpu.func @subgroup_dpas_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf16>) {
   // CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   %1 = xegpu.dpas %a, %b: vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
   gpu.return
 }
 
-// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
-gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
+// CHECK: gpu.func @subgroup_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
+gpu.func @subgroup_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
   //CHECK: %[[c:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
   %c = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
   //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]], %[[c]] : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -715,23 +579,4 @@ gpu.func @fence() {
   gpu.return
 }
 
-// CHECK: gpu.func @test_create_nd_tdesc_wg_1(%[[arg0:.*]]: memref<24x32xf32>) {
-gpu.func @test_create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
-  // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
-  %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [3, 2], sg_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>>
-  gpu.return
-}
-
-gpu.func @test_convert_layout(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
-                                resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
-  gpu.return
-}
-
-gpu.func @test_convert_layout_wg(%a: vector<32x64xf16>) {
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
-                                resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
-  gpu.return
-}
-
 }
diff --git a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
index c7c82fc8dbb3c..35ac39d074c70 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-map-propagation.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt -xegpu-subgroup-distribute='print-analysis-only=true' -split-input-file %s | FileCheck %s
 
-// CHECK: function: test_dpas_f16:
+// CHECK: function: dpas_f16:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
@@ -23,7 +23,7 @@
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -38,7 +38,7 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
 
 
 // -----
-// CHECK: function: test_dpas_i8:
+// CHECK: function: dpas_i8:
 // CHECK-NEXT: argument: <block argument> of type 'vector<8x32xi8>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 2]
 // CHECK-NEXT: argument: <block argument> of type 'vector<32x16xi8>' at index: 1
@@ -51,7 +51,7 @@ func.func @test_dpas_f16(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T1:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
+func.func @dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2: memref<8x16xi32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.dpas %arg0, %arg1 : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32>
   %1 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
@@ -60,7 +60,7 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
 }
 
 // -----
-// CHECK: function: test_load_with_transpose_effect:
+// CHECK: function: load_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
@@ -83,7 +83,7 @@ func.func @test_dpas_i8(%arg0: vector<8x32xi8>, %arg1: vector<32x16xi8>, %arg2:
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -97,7 +97,7 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
 }
 
 // -----
-// CHECK: function: test_vector_transpose:
+// CHECK: function: vector_transpose:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x16xf16>' at index: 1
@@ -122,7 +122,7 @@ func.func @test_load_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memre
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %cst = arith.constant dense<0.000000e+00> : vector<8x16xf32>
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
@@ -137,7 +137,7 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
 }
 
 // -----
-// CHECK: function: test_extf_truncf:
+// CHECK: function: extf_truncf:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -152,7 +152,7 @@ func.func @test_vector_transpose(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf1
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: Not assigned.
-func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
+func.func @extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) -> vector<8x16xf32> {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %2 = arith.extf %1 : vector<16x16xf16> to vector<16x16xf32>
@@ -162,7 +162,7 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
 }
 
 // -----
-// CHECK: function: test_load_gather_with_transpose_effect:
+// CHECK: function: load_gather_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<256xf16>' at index: 1
@@ -187,7 +187,7 @@ func.func @test_extf_truncf(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.t
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T5:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
+func.func @load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1: memref<256xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
   %1 = xegpu.load_nd %0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
@@ -202,7 +202,7 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
 }
 
 // -----
-// CHECK: function: test_load_gather_1d:
+// CHECK: function: load_gather_1d:
 // CHECK: argument: <block argument> of type 'memref<256xf32>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -215,7 +215,7 @@ func.func @test_load_gather_with_transpose_effect(%arg0: memref<8x16xf16>, %arg1
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T1]] = xegpu.load %[[T0]], %[[CST0]]  : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
   %0 = xegpu.create_tdesc %arg0, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -225,7 +225,7 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
 }
 
 // -----
-// CHECK: function: test_store_scatter_with_transpose_effect:
+// CHECK: function: store_scatter_with_transpose_effect:
 // CHECK-NEXT: argument: <block argument> of type 'memref<128xf32>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: op    : %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<8x16xf32>
@@ -236,7 +236,7 @@ func.func @test_load_gather_1d(%arg0: memref<256xf32>, %arg1: !xegpu.tensor_desc
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST1]] : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8 : i64>>
 // CHECK-NEXT: layout for result #0: lane_layout: [16, 1], lane_data: [1, 1]
-func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
+func.func @store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
   %cst = arith.constant dense<1.000000e+00> : vector<8x16xf32>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
   %cst_1 = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
@@ -246,7 +246,7 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
 }
 
 // -----
-// CHECK: function: test_store_scatter_1d:
+// CHECK: function: store_scatter_1d:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16xf32>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: argument: <block argument> of type 'memref<256xf32>' at index: 1
@@ -257,7 +257,7 @@ func.func @test_store_scatter_with_transpose_effect(%arg0: memref<128xf32>) {
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = xegpu.create_tdesc %{{.*}}, %[[CST]] : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
+func.func @store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>) {
   %cst = arith.constant dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]> : vector<16xindex>
   %cst_0 = arith.constant dense<true> : vector<16xi1>
   %0 = xegpu.create_tdesc %arg1, %cst : memref<256xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
@@ -266,7 +266,7 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
 }
 
 // -----
-// CHECK: function: test_vector_bitcast_i16_to_i8:
+// CHECK: function: vector_bitcast_i16_to_i8:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x16xi16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<32x16xi8>' at index: 1
@@ -289,7 +289,7 @@ func.func @test_store_scatter_1d(%arg0: vector<16xf32>, %arg1: memref<256xf32>)
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T6:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xi32> -> !xegpu.tensor_desc<8x16xi32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
+func.func @vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<32x16xi8>, %arg2: memref<8x16xi32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xi16> -> !xegpu.tensor_desc<8x16xi16>
   %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8>
@@ -303,7 +303,7 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
 }
 
 // -----
-// CHECK: function: test_vector_bitcast_i8_to_f16:
+// CHECK: function: vector_bitcast_i8_to_f16:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x32xi8>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<16x32xi8>' at index: 1
@@ -328,7 +328,7 @@ func.func @test_vector_bitcast_i16_to_i8(%arg0: memref<8x16xi16>, %arg1: memref<
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T7:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
+func.func @vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<16x32xi8>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x32xi8> -> !xegpu.tensor_desc<8x32xi8>
   %1 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x32xi8> -> !xegpu.tensor_desc<16x32xi8>
@@ -343,7 +343,7 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
 }
 
 // -----
-// CHECK: function: test_binary_op_one_use:
+// CHECK: function: binary_op_one_use:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -360,7 +360,7 @@ func.func @test_vector_bitcast_i8_to_f16(%arg0: memref<8x32xi8>, %arg1: memref<1
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T4:.*]] = xegpu.dpas %[[T0]], %[[T3]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
+func.func @binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %2 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -371,7 +371,7 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
 }
 
 // -----
-// CHECK: function: test_binary_op_multiple_uses:
+// CHECK: function: binary_op_multiple_uses:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -390,7 +390,7 @@ func.func @test_binary_op_one_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !x
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.dpas %[[T0]], %[[T2]] : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
+func.func @binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: !xegpu.tensor_desc<8x16xf32>, %arg3: !xegpu.tensor_desc<16x16xf16>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
   %cst = arith.constant dense<1.000000e+00> : vector<16x16xf16>
@@ -402,7 +402,7 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
 }
 
 // -----
-// CHECK: function: test_for_op:
+// CHECK: function: for_op:
 // CHECK-NEXT: argument: <block argument> of type 'memref<8x128xf16>' at index: 0
 // CHECK-NEXT: layout  : Not assigned.
 // CHECK-NEXT: argument: <block argument> of type 'memref<128x16xf16>' at index: 1
@@ -437,7 +437,7 @@ func.func @test_binary_op_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %ar
 // CHECK-NEXT: layout for result #2: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T3:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
+func.func @for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg2: memref<8x16xf32>) {
   %c0 = arith.constant 0 : index
   %c128 = arith.constant 128 : index
   %c16 = arith.constant 16 : index
@@ -458,7 +458,7 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
 }
 
 // -----
-// CHECK: function: test_if_single_use:
+// CHECK: function: if_single_use:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -477,7 +477,7 @@ func.func @test_for_op(%arg0: memref<8x128xf16>, %arg1: memref<128x16xf16>, %arg
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [2, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
+func.func @if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = scf.if %arg2 -> (vector<16x16xf16>) {
     %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -492,7 +492,7 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
 }
 
 // -----
-// CHECK: function: test_if_multiple_uses:
+// CHECK: function: if_multiple_uses:
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<8x16xf16>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16x16xf16>' at index: 1
@@ -513,7 +513,7 @@ func.func @test_if_single_use(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: op    : %[[T2:.*]] = xegpu.dpas %[[T0]], %{{.*}} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [1, 16], lane_data: [1, 1]
-func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
+func.func @if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>, %arg2: i1, %arg3: !xegpu.tensor_desc<8x16xf32>, %arg4: !xegpu.tensor_desc<16x16xf16>) {
   %0 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
   %1 = scf.if %arg2 -> (vector<16x16xf16>) {
     %3 = xegpu.load_nd %arg1  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
@@ -529,7 +529,7 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
 }
 
 // -----
-// CHECK: function: test_vector_outer_reduction:
+// CHECK: function: vector_outer_reduction:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -538,7 +538,7 @@ func.func @test_if_multiple_uses(%arg0: !xegpu.tensor_desc<8x16xf16>, %arg1: !xe
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [0] : vector<16x16xf32> to vector<16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
   %0 = vector.multi_reduction <add>, %arg0, %cst [0] : vector<16x16xf32> to vector<16xf32>
   xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
@@ -546,7 +546,7 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
 }
 
 // -----
-// CHECK: function: test_vector_inner_reduction:
+// CHECK: function: vector_inner_reduction:
 // CHECK-NEXT: argument: <block argument> of type 'vector<16x16xf32>' at index: 0
 // CHECK-NEXT: layout  : lane_layout: [1, 16], lane_data: [1, 1]
 // CHECK-NEXT: argument: <block argument> of type '!xegpu.tensor_desc<16xf32>' at index: 1
@@ -555,7 +555,7 @@ func.func @test_vector_outer_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.t
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
 // CHECK-NEXT: op    : %[[T0:.*]] = vector.multi_reduction <add>, %{{.*}}, %[[CST]] [1] : vector<16x16xf32> to vector<16xf32>
 // CHECK-NEXT: layout for result #0: lane_layout: [16], lane_data: [1]
-func.func @test_vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
+func.func @vector_inner_reduction(%arg0: vector<16x16xf32>, %arg1: !xegpu.tensor_desc<16xf32>) {
   %cst = arith.constant dense<0.000000e+00> : vector<16xf32>
   %0 = vector.multi_reduction <add>, %arg0, %cst [1] : vector<16x16xf32> to vector<16xf32>
   xegpu.store_nd %0, %arg1  : vector<16xf32>, !xegpu.tensor_desc<16xf32>
diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
index 8e3673d04eacb..67d3bd9b393c0 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir
@@ -4,7 +4,7 @@
 #b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
 #c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
 gpu.module @test_kernel {
-  gpu.func @test_gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+  gpu.func @gemm_with_one_to_n_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
@@ -45,7 +45,7 @@ gpu.module @test_kernel {
 #l1 = #xegpu.layout<inst_data = [8, 16]>
 #l2 = #xegpu.layout<inst_data = [16, 16]>
 gpu.module @test_kernel {
-  gpu.func @test_gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+  gpu.func @gemm_with_inst_data_only_attribute(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
@@ -86,7 +86,7 @@ gpu.module @test_kernel {
 #l1 = #xegpu.layout<inst_data = [8, 16]>
 #l2 = #xegpu.layout<inst_data = [16, 16]>
 gpu.module @test_kernel {
-  gpu.func @test_gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+  gpu.func @gemm_with_one_to_one_lowering(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c8 = arith.constant 8 : index
     %c16 = arith.constant 16 : index
@@ -130,7 +130,7 @@ gpu.module @test_kernel {
 #b = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [16, 1]>
 #c = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [8, 1]>
 gpu.module @test_kernel {
-  gpu.func @test_gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
+  gpu.func @gemm_with_elemwise_preop(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf32>) {
     %c0 = arith.constant 0 : index
     %c16 = arith.constant 16 : index
     %c32 = arith.constant 32 : index
@@ -172,7 +172,7 @@ gpu.module @test_kernel {
 // -----
 #l = #xegpu.layout<inst_data = [8, 16]>
 gpu.module @test_kernel {
-  gpu.func @test_elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+  gpu.func @elementwise_with_inst_data_only(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
     %c0 = arith.constant 0 : index
     %c32 = arith.constant 32 : index
     %c1024 = arith.constant 1024 : index
@@ -211,7 +211,7 @@ gpu.module @test_kernel {
 // -----
 #l = #xegpu.layout<inst_data = [8]>
 gpu.module @test_kernel {
-  gpu.func @test_elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
+  gpu.func @elementwise_1D(%A: memref<1024x1024xf16>, %B: memref<1024x1024xf16>, %C: memref<1024x1024xf16>) {
     %c0 = arith.constant 0 : index
     %c32 = arith.constant 32 : index
     %c1024 = arith.constant 1024 : index
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index 35ad16d8cd9a9..c6124f90e0f48 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -1,9 +1,9 @@
 // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s
 
 gpu.module @test_round_robin_assignment {
-  // CHECK-LABEL: test_create_nd_tdesc
+  // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
       // CHECK-COUNT-12: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<24x32xf32>
       // CHECK-SAME: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
       // CHECK-NOT: xegpu.create_nd_tdesc
@@ -12,9 +12,9 @@ gpu.module @test_round_robin_assignment {
       gpu.return
     }
 
-  // CHECK-LABEL: test_load_nd_tdesc
+  // CHECK-LABEL: load_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) {
       %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
         -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       // CHECK-COUNT-12: xegpu.load_nd %{{.*}}
@@ -27,9 +27,9 @@ gpu.module @test_round_robin_assignment {
       gpu.return
     }
 
-  // CHECK-LABEL: test_store_nd
+  // CHECK-LABEL: store_nd
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_store_nd(%src: memref<24x32xf32>) {
+  gpu.func @store_nd(%src: memref<24x32xf32>) {
       %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
         -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
       // CHECK-COUNT-12: xegpu.store_nd %{{.*}}, %{{.*}}
@@ -43,9 +43,9 @@ gpu.module @test_round_robin_assignment {
       gpu.return
   }
 
-  // CHECK-LABEL: test_update_nd
+  // CHECK-LABEL: update_nd
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_update_nd(%src: memref<24x32xf32>){
+  gpu.func @update_nd(%src: memref<24x32xf32>){
     %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32>
       ->  !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>>
     // CHECK-COUNT-12: xegpu.update_nd_offset %{{.*}}, [0, 16]
@@ -56,9 +56,9 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  // CHECK-LABEL: test_dpas
+  // CHECK-LABEL: dpas
   // CHECK-SAME: (%[[ARG_0:.*]]: memref<8x8xf32>, %[[ARG_1:.*]]: memref<8x8xf32>, %[[ARG_2:.*]]: memref<8x8xf32>)
-  gpu.func @test_dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
+  gpu.func @dpas(%a: memref<8x8xf32>, %b: memref<8x8xf32>, %c: memref<8x8xf32>) {
     // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<8x8xf32>
     // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
     // CHECK-NOT: xegpu.create_nd_tdesc
@@ -90,9 +90,9 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  // CHECK-LABEL: test_prefetch_nd_tdesc
+  // CHECK-LABEL: prefetch_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
     // CHECK-COUNT-12: xegpu.prefetch_nd %{{.*}}
     // CHECK-SAME-COUNT-12 : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>>
     // CHECK-NOT: xegpu.prefetch_nd
@@ -103,7 +103,7 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  gpu.func @test_scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_for(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c1 = arith.constant 1 : index
     %c10 = arith.constant 10 : index
     %c0 = arith.constant 0 : index
@@ -126,7 +126,7 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c1_i32 = arith.constant 1 : i32
     %c10_i32 = arith.constant 10 : i32
     %c0_i32 = arith.constant 0 : i32
@@ -150,7 +150,7 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %0 = gpu.subgroup_id : index
     %1 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>
@@ -173,7 +173,7 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
-  gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %id = gpu.subgroup_id : index
 
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
index 466842c968448..44b11c304cc80 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir
@@ -3,9 +3,9 @@
 //CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)>
 //CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)>
 gpu.module @test_1_1_assignment {
-  // CHECK-LABEL: test_create_nd_tdesc
+  // CHECK-LABEL: create_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_create_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) {
   // CHECK: %[[SGID:.*]] = gpu.subgroup_id
   // CHECK: %[[C12:.*]] = arith.constant 12 : index
   // CHECK: %[[C4:.*]] = arith.constant 4 : index
@@ -30,9 +30,9 @@ gpu.module @test_1_1_assignment {
   gpu.return
   }
 
-  // CHECK-LABEL: test_load_nd_tdesc
+  // CHECK-LABEL: load_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_load_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @load_nd_tdesc(%src: memref<24x32xf32>) {
     // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
@@ -46,9 +46,9 @@ gpu.module @test_1_1_assignment {
     gpu.return
   }
 
-  // CHECK-LABEL: test_store_nd
+  // CHECK-LABEL: store_nd
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_store_nd(%src: memref<24x32xf32>) {
+  gpu.func @store_nd(%src: memref<24x32xf32>) {
     // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]]
@@ -66,9 +66,9 @@ gpu.module @test_1_1_assignment {
     gpu.return
 }
 
-// CHECK-LABEL: test_update_nd
+// CHECK-LABEL: update_nd
 // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-gpu.func @test_update_nd(%src: memref<24x32xf32>){
+gpu.func @update_nd(%src: memref<24x32xf32>){
   // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
   // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
   // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16]
@@ -80,10 +80,10 @@ gpu.func @test_update_nd(%src: memref<24x32xf32>){
   gpu.return
 }
 
-// CHECK-LABEL: test_dpas
+// CHECK-LABEL: dpas
 // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
 // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
-gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
+gpu.func @dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
@@ -114,10 +114,10 @@ gpu.func @test_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
   }
 
 
-// CHECK-LABEL: test_dpas_no_sg_data
+// CHECK-LABEL: dpas_no_sg_data
 // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
 // CHECK-SAME: %[[ARG_1:.*]]: memref<32x24xf32>
-gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
+gpu.func @dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECk-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]]
@@ -147,9 +147,9 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  // CHECK-LABEL: test_prefetch_nd_tdesc
+  // CHECK-LABEL: prefetch_nd_tdesc
   // CHECK-SAME: %[[ARG_0:.*]]: memref<24x32xf32>
-  gpu.func @test_prefetch_nd_tdesc(%src: memref<24x32xf32>) {
+  gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) {
     // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<24x32xf32>
     // CHECK-SAME: -> !xegpu.tensor_desc<12x8xf32, #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>>
     // CHECK: xegpu.prefetch_nd %[[TDESC]]
@@ -161,8 +161,8 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  // CHECK-LABEL: test_dpas_with_no_create_nd_desc
-  gpu.func @test_dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) {
+  // CHECK-LABEL: dpas_with_no_create_nd_desc
+  gpu.func @dpas_with_no_create_nd_desc(%a: vector<24x32xf32>, %b: vector<32x24xf32>) {
     // CHECK-NOT: vector<12x12xf32>
     %dpas = xegpu.dpas %a, %b
       {layout =  #xegpu.layout<sg_layout = [2, 2], sg_data = [12, 12], lane_layout = [2, 2], lane_data = [1, 1]>}
@@ -170,7 +170,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  gpu.func @test_scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
+  gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) {
     //CHECK: [[c0:%.+]] = arith.constant 0 : index
     //CHECK: [[c128:%.+]] = arith.constant 128 : index
     //CHECK: [[c1024:%.+]] = arith.constant 1024 : index
@@ -213,7 +213,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  gpu.func @test_scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_while_and_condition(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c1_i32 = arith.constant 1 : i32
     %c10_i32 = arith.constant 10 : i32
     %c0_i32 = arith.constant 0 : i32
@@ -238,7 +238,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  gpu.func @test_scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_if(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %id = gpu.subgroup_id : index
 
@@ -267,7 +267,7 @@ gpu.func @test_dpas_no_sg_data(%a: memref<24x32xf32>, %b: memref<32x24xf32>) {
     gpu.return
   }
 
-  gpu.func @test_scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
+  gpu.func @scf_if_tensor_desc(%arg0: memref<1024xf32>, %arg1: memref<1024xf32>) {
     %c10 = arith.constant 10 : index
     %id = gpu.subgroup_id : index
 

From 0defde8e06338cbe968d55d1d9e8581d55f3ae2b Mon Sep 17 00:00:00 2001
From: Harrison Hao <57025411+harrisonGPU@users.noreply.github.com>
Date: Wed, 18 Jun 2025 09:00:07 +0800
Subject: [PATCH 841/851] [AMDGPU] Support D16 folding for image.sample with
 multiple extractelement and fptrunc users (#141758)

Now we only support D16 folding for `image sample` instructions with a
single user: a `fptrunc` to half.
However, we can actually support D16 folding for image.sample
instructions with multiple users,
as long as each user follows the pattern of extractelement followed by
fptrunc to half.
For example:
```
  %sample = call <4 x float> @llvm.amdgcn.image.sample
  %e0 = extractelement <4 x float> %sample, i32 0
  %h0 = fptrunc float %e0 to half
  %e1 = extractelement <4 x float> %sample, i32 1
  %h1 = fptrunc float %e1 to half
  %e2 = extractelement <4 x float> %sample, i32 2
  %h2 = fptrunc float %e2 to half
```
This change enables D16 folding for such cases and avoids generating
`v_cvt_f16_f32_e32` instructions.
---
 .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp     |  60 +++++++++
 .../InstCombine/AMDGPU/image-d16.ll           | 118 ++++++++++++++++++
 2 files changed, 178 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index d12170a60905b..5477c5eae9392 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -248,6 +248,66 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                                      });
         }
       }
+
+      // Only perform D16 folding if every user of the image sample is
+      // an ExtractElementInst immediately followed by an FPTrunc to half.
+      SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
+          ExtractTruncPairs;
+      bool AllHalfExtracts = true;
+
+      for (User *U : II.users()) {
+        auto *Ext = dyn_cast<ExtractElementInst>(U);
+        if (!Ext || !Ext->hasOneUse()) {
+          AllHalfExtracts = false;
+          break;
+        }
+
+        auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
+        if (!Tr || !Tr->getType()->isHalfTy()) {
+          AllHalfExtracts = false;
+          break;
+        }
+
+        ExtractTruncPairs.emplace_back(Ext, Tr);
+      }
+
+      if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
+        auto *VecTy = cast<VectorType>(II.getType());
+        Type *HalfVecTy =
+            VecTy->getWithNewType(Type::getHalfTy(II.getContext()));
+
+        // Obtain the original image sample intrinsic's signature
+        // and replace its return type with the half-vector for D16 folding
+        SmallVector<Type *, 8> SigTys;
+        Intrinsic::getIntrinsicSignature(II.getCalledFunction(), SigTys);
+        SigTys[0] = HalfVecTy;
+
+        Module *M = II.getModule();
+        Function *HalfDecl =
+            Intrinsic::getOrInsertDeclaration(M, ImageDimIntr->Intr, SigTys);
+
+        II.mutateType(HalfVecTy);
+        II.setCalledFunction(HalfDecl);
+
+        IRBuilder<> Builder(II.getContext());
+        for (auto &[Ext, Tr] : ExtractTruncPairs) {
+          Value *Idx = Ext->getIndexOperand();
+
+          Builder.SetInsertPoint(Tr);
+
+          Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
+          HalfExtract->takeName(Tr);
+
+          Tr->replaceAllUsesWith(HalfExtract);
+        }
+
+        for (auto &[Ext, Tr] : ExtractTruncPairs) {
+          IC.eraseInstFromFunction(*Tr);
+          IC.eraseInstFromFunction(*Ext);
+        }
+
+        return &II;
+      }
     }
   }
 
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
index 30431ad724843..ee5ccf5af987d 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/image-d16.ll
@@ -3,6 +3,7 @@
 ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx810 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
 ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx900 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
 ; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1010 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
+; RUN: opt -mtriple=amdgcn--amdpal -mcpu=gfx1100 -S -passes=instcombine %s | FileCheck --check-prefixes=GFX81PLUS %s
 
 define amdgpu_ps half @image_sample_2d_fptrunc_to_d16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX7-LABEL: @image_sample_2d_fptrunc_to_d16(
@@ -121,6 +122,123 @@ main_body:
   ret half %addf_sum.2
 }
 
+define amdgpu_ps half @image_sample_2d_multi_fptrunc_to_d16(<8 x i32> %surf_desc, <4 x i32> %samp, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
+; GFX7-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to half
+; GFX7-NEXT:    [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
+; GFX7-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to half
+; GFX7-NEXT:    [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
+; GFX7-NEXT:    [[H2:%.*]] = fptrunc float [[E2]] to half
+; GFX7-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT:    [[RES:%.*]] = fadd half [[MUL]], [[H2]]
+; GFX7-NEXT:    ret half [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_to_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call <3 x half> @llvm.amdgcn.image.sample.lz.2d.v3f16.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[H0:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 0
+; GFX81PLUS-NEXT:    [[H1:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 1
+; GFX81PLUS-NEXT:    [[H2:%.*]] = extractelement <3 x half> [[SAMPLE]], i64 2
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fadd half [[MUL]], [[H2]]
+; GFX81PLUS-NEXT:    ret half [[RES]]
+;
+main_body:
+  %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %samp, i1 false, i32 0, i32 0)
+  %e0 = extractelement <4 x float> %sample, i32 0
+  %h0 = fptrunc float %e0 to half
+  %e1 = extractelement <4 x float> %sample, i32 1
+  %h1 = fptrunc float %e1 to half
+  %e2 = extractelement <4 x float> %sample, i32 2
+  %h2 = fptrunc float %e2 to half
+  %mul = fmul half %h0, %h1
+  %res = fadd half %mul, %h2
+  ret half %res
+}
+
+define amdgpu_ps half @image_sample_2d_extractelement_multi_use_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_extractelement_multi_use_no_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0
+; GFX7-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to half
+; GFX7-NEXT:    [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00
+; GFX7-NEXT:    [[HALF:%.*]] = fptrunc float [[USER2]] to half
+; GFX7-NEXT:    [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1
+; GFX7-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to half
+; GFX7-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX7-NEXT:    [[RES:%.*]] = fadd half [[MUL]], [[HALF]]
+; GFX7-NEXT:    ret half [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_extractelement_multi_use_no_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.lz.2d.v2f32.f32.v8i32.v4i32(i32 3, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[E0:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 0
+; GFX81PLUS-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to half
+; GFX81PLUS-NEXT:    [[USER2:%.*]] = fadd float [[E0]], 1.000000e+00
+; GFX81PLUS-NEXT:    [[HALF:%.*]] = fptrunc float [[USER2]] to half
+; GFX81PLUS-NEXT:    [[E1:%.*]] = extractelement <2 x float> [[SAMPLE]], i64 1
+; GFX81PLUS-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to half
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul half [[H0]], [[H1]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fadd half [[MUL]], [[HALF]]
+; GFX81PLUS-NEXT:    ret half [[RES]]
+;
+main_body:
+  %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0)
+  %e0 = extractelement <4 x float> %sample, i32 0
+  %h0 = fptrunc float %e0 to half
+  %user2 = fadd float %e0, 1.0
+  %half = fptrunc float %user2 to half
+  %e1 = extractelement <4 x float> %sample, i32 1
+  %h1 = fptrunc float %e1 to half
+  %mul = fmul half %h0, %h1
+  %res = fadd half %mul, %half
+  ret half %res
+}
+
+define amdgpu_ps bfloat @image_sample_2d_multi_fptrunc_non_half_no_d16(<8 x i32> %surf_desc, <4 x i32> %sampler_desc, float %u, float %v) {
+; GFX7-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16(
+; GFX7-NEXT:  main_body:
+; GFX7-NEXT:    [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX7-NEXT:    [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
+; GFX7-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to bfloat
+; GFX7-NEXT:    [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
+; GFX7-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to bfloat
+; GFX7-NEXT:    [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
+; GFX7-NEXT:    [[H2:%.*]] = fptrunc float [[E2]] to bfloat
+; GFX7-NEXT:    [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]]
+; GFX7-NEXT:    [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]]
+; GFX7-NEXT:    ret bfloat [[RES]]
+;
+; GFX81PLUS-LABEL: @image_sample_2d_multi_fptrunc_non_half_no_d16(
+; GFX81PLUS-NEXT:  main_body:
+; GFX81PLUS-NEXT:    [[SAMPLE:%.*]] = call <3 x float> @llvm.amdgcn.image.sample.lz.2d.v3f32.f32.v8i32.v4i32(i32 7, float [[U:%.*]], float [[V:%.*]], <8 x i32> [[SURF_DESC:%.*]], <4 x i32> [[SAMPLER_DESC:%.*]], i1 false, i32 0, i32 0)
+; GFX81PLUS-NEXT:    [[E0:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 0
+; GFX81PLUS-NEXT:    [[H0:%.*]] = fptrunc float [[E0]] to bfloat
+; GFX81PLUS-NEXT:    [[E1:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 1
+; GFX81PLUS-NEXT:    [[H1:%.*]] = fptrunc float [[E1]] to bfloat
+; GFX81PLUS-NEXT:    [[E2:%.*]] = extractelement <3 x float> [[SAMPLE]], i64 2
+; GFX81PLUS-NEXT:    [[H2:%.*]] = fptrunc float [[E2]] to bfloat
+; GFX81PLUS-NEXT:    [[MUL:%.*]] = fmul bfloat [[H0]], [[H1]]
+; GFX81PLUS-NEXT:    [[RES:%.*]] = fadd bfloat [[MUL]], [[H2]]
+; GFX81PLUS-NEXT:    ret bfloat [[RES]]
+;
+main_body:
+  %sample = call <4 x float> @llvm.amdgcn.image.sample.lz.2d.v4f32.f32.v8i32.v4i32(i32 15, float %u, float %v, <8 x i32> %surf_desc, <4 x i32> %sampler_desc, i1 false, i32 0, i32 0)
+  %e0 = extractelement <4 x float> %sample, i32 0
+  %h0 = fptrunc float %e0 to bfloat
+  %e1 = extractelement <4 x float> %sample, i32 1
+  %h1 = fptrunc float %e1 to bfloat
+  %e2 = extractelement <4 x float> %sample, i32 2
+  %h2 = fptrunc float %e2 to bfloat
+  %mul = fmul bfloat %h0, %h1
+  %res = fadd bfloat %mul, %h2
+  ret bfloat %res
+}
+
 define amdgpu_ps half @image_gather4_2d_v4f32(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s, half %t) {
 ; GFX7-LABEL: @image_gather4_2d_v4f32(
 ; GFX7-NEXT:  main_body:

From 9265b1f0cff74c929214efb64f41183299f31772 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne <peter@pcc.me.uk>
Date: Tue, 17 Jun 2025 18:15:06 -0700
Subject: [PATCH 842/851] LowerTypeTests: Use jump table entry type as value
 type of jump table alias.

The motivation for this is that it causes the jump table entry's symbol
to have an st_size equal to the jump table entry size, instead of being
equal to the size of the entire jump table, which is incorrect and can
lead to unexpected behavior in binary analysis tools that rely on the
size field such as Bloaty.

Reviewers: fmayer

Reviewed By: fmayer

Pull Request: https://github.com/llvm/llvm-project/pull/144462
---
 llvm/lib/Transforms/IPO/LowerTypeTests.cpp        | 10 ++++++----
 .../Transforms/LowerTypeTests/blockaddress-2.ll   |  2 +-
 .../Transforms/LowerTypeTests/cfi-icall-alias.ll  |  2 +-
 .../Transforms/LowerTypeTests/export-alias.ll     |  4 ++--
 .../Transforms/LowerTypeTests/export-icall.ll     | 12 ++++++------
 .../LowerTypeTests/function-disjoint.ll           |  4 ++--
 llvm/test/Transforms/LowerTypeTests/function.ll   | 15 +++++++--------
 .../LowerTypeTests/icall-branch-funnel.ll         |  4 ++--
 llvm/test/Transforms/LowerTypeTests/pr37625.ll    |  2 +-
 llvm/test/Transforms/LowerTypeTests/section.ll    |  2 +-
 10 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 20b54c056cc2d..86e1ebf937dbe 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1701,8 +1701,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
                        GlobalValue::PrivateLinkage,
                        M.getDataLayout().getProgramAddressSpace(),
                        ".cfi.jumptable", &M);
+  ArrayType *JumpTableEntryType = ArrayType::get(Int8Ty, EntrySize);
   ArrayType *JumpTableType =
-      ArrayType::get(ArrayType::get(Int8Ty, EntrySize), Functions.size());
+      ArrayType::get(JumpTableEntryType, Functions.size());
   auto JumpTable = ConstantExpr::getPointerCast(
       JumpTableFn, PointerType::getUnqual(M.getContext()));
 
@@ -1723,7 +1724,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
     if (!IsJumpTableCanonical) {
       GlobalValue::LinkageTypes LT = IsExported ? GlobalValue::ExternalLinkage
                                                 : GlobalValue::InternalLinkage;
-      GlobalAlias *JtAlias = GlobalAlias::create(F->getValueType(), 0, LT,
+      GlobalAlias *JtAlias = GlobalAlias::create(JumpTableEntryType, 0, LT,
                                                  F->getName() + ".cfi_jt",
                                                  CombinedGlobalElemPtr, &M);
       if (IsExported)
@@ -1748,8 +1749,9 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
     } else {
       assert(F->getType()->getAddressSpace() == 0);
 
-      GlobalAlias *FAlias = GlobalAlias::create(
-          F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M);
+      GlobalAlias *FAlias =
+          GlobalAlias::create(JumpTableEntryType, 0, F->getLinkage(), "",
+                              CombinedGlobalElemPtr, &M);
       FAlias->setVisibility(F->getVisibility());
       FAlias->takeName(F);
       if (FAlias->hasName()) {
diff --git a/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll b/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll
index 51a2a59365434..34e740771fe2b 100644
--- a/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll
+++ b/llvm/test/Transforms/LowerTypeTests/blockaddress-2.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S %s -passes=lowertypetests | FileCheck %s
 
 ; CHECK: @badfileops = internal global %struct.f { ptr @bad_f, ptr @bad_f }
-; CHECK: @bad_f = internal alias void (), ptr @.cfi.jumptable
+; CHECK: @bad_f = internal alias [8 x i8], ptr @.cfi.jumptable
 ; CHECK: define internal void @bad_f.cfi() !type !0 {
 ; CHECK-NEXT:  ret void
 
diff --git a/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll
index 0c5324ee96c93..6b821186b0ad7 100644
--- a/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll
+++ b/llvm/test/Transforms/LowerTypeTests/cfi-icall-alias.ll
@@ -12,7 +12,7 @@ RUN: opt test1.bc -passes=lowertypetests -lowertypetests-read-summary=in.yaml \
 RUN:   -lowertypetests-summary-action=export -lowertypetests-write-summary=exported.yaml \
 RUN:   -S -o - | FileCheck %s --check-prefix=REGULAR
 REGULAR: @__typeid__ZTSFvvE_global_addr = hidden alias i8, ptr @.cfi.jumptable
-REGULAR: @f = alias void (), ptr @.cfi.jumptable
+REGULAR: @f = alias [8 x i8], ptr @.cfi.jumptable
 REGULAR: define private void @.cfi.jumptable()
 
 ;; CHECK that @llvm.type.test() is lowered to an actual check.
diff --git a/llvm/test/Transforms/LowerTypeTests/export-alias.ll b/llvm/test/Transforms/LowerTypeTests/export-alias.ll
index 255e6b6ca4d17..45b4db63def18 100644
--- a/llvm/test/Transforms/LowerTypeTests/export-alias.ll
+++ b/llvm/test/Transforms/LowerTypeTests/export-alias.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S %s -passes=lowertypetests -lowertypetests-summary-action=export -lowertypetests-read-summary=%S/Inputs/exported-funcs.yaml | FileCheck %s
 ;
-; CHECK: @alias1 = weak alias void (), ptr @external_addrtaken
-; CHECK: @alias2 = hidden alias void (), ptr @external_addrtaken
+; CHECK: @alias1 = weak alias [8 x i8], ptr @external_addrtaken
+; CHECK: @alias2 = hidden alias [8 x i8], ptr @external_addrtaken
 ; CHECK-NOT: @alias3 = alias
 ; CHECK-NOT: @not_present
 
diff --git a/llvm/test/Transforms/LowerTypeTests/export-icall.ll b/llvm/test/Transforms/LowerTypeTests/export-icall.ll
index 47156deb57de7..f8adb2d69910f 100644
--- a/llvm/test/Transforms/LowerTypeTests/export-icall.ll
+++ b/llvm/test/Transforms/LowerTypeTests/export-icall.ll
@@ -40,15 +40,15 @@ define void @f3(i32 %x) !type !8 {
 ; CHECK-DAG: @__typeid_typeid1_align = hidden alias i8, inttoptr (i64 3 to ptr)
 ; CHECK-DAG: @__typeid_typeid1_size_m1 = hidden alias i8, inttoptr (i64 4 to ptr)
 
-; CHECK-DAG: @h                    = alias void (i8), ptr [[JT1]]
-; CHECK-DAG: @f                    = alias void (i32), {{.*}}getelementptr {{.*}}ptr [[JT1]]
-; CHECK-DAG: @f2                   = alias void (i32), {{.*}}getelementptr {{.*}}ptr [[JT1]]
-; CHECK-DAG: @external.cfi_jt      = hidden alias void (), {{.*}}getelementptr {{.*}}ptr [[JT1]]
-; CHECK-DAG: @external_weak.cfi_jt = hidden alias void (), {{.*}}getelementptr {{.*}}ptr [[JT1]]
+; CHECK-DAG: @h                    = alias [8 x i8], ptr [[JT1]]
+; CHECK-DAG: @f                    = alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]]
+; CHECK-DAG: @f2                   = alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]]
+; CHECK-DAG: @external.cfi_jt      = hidden alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]]
+; CHECK-DAG: @external_weak.cfi_jt = hidden alias [8 x i8], {{.*}}getelementptr {{.*}}ptr [[JT1]]
 
 ; CHECK-DAG: @__typeid_typeid2_global_addr = hidden alias i8, ptr [[JT2:.*]]
 
-; CHECK-DAG: @g                    = alias void (), ptr [[JT2]]
+; CHECK-DAG: @g                    = alias [8 x i8], ptr [[JT2]]
 
 ; CHECK-DAG: define hidden void @h.cfi(i8 {{.*}}) !type !{{.*}}
 ; CHECK-DAG: declare !type !{{.*}} void @external()
diff --git a/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll b/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
index d7ba3a6814194..ae676df6e9f31 100644
--- a/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
+++ b/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
@@ -5,8 +5,8 @@
 
 target datalayout = "e-p:64:64"
 
-; X64: @g = alias void (), ptr @[[JT1:.*]]
-; X64: @f = alias void (), ptr @[[JT0:.*]]
+; X64: @g = alias [8 x i8], ptr @[[JT1:.*]]
+; X64: @f = alias [8 x i8], ptr @[[JT0:.*]]
 
 ; WASM32: private constant [0 x i8] zeroinitializer
 @0 = private unnamed_addr constant [2 x ptr] [ptr @f, ptr @g], align 16
diff --git a/llvm/test/Transforms/LowerTypeTests/function.ll b/llvm/test/Transforms/LowerTypeTests/function.ll
index 5b0852c82ea68..ab3cfb6acccf8 100644
--- a/llvm/test/Transforms/LowerTypeTests/function.ll
+++ b/llvm/test/Transforms/LowerTypeTests/function.ll
@@ -28,14 +28,13 @@ target datalayout = "e-p:64:64"
 ; NATIVE: private constant [0 x i8] zeroinitializer
 ; WASM32: private constant [0 x i8] zeroinitializer
 
-; NATIVE: @f = alias void (), ptr @[[JT:.*]]
-
-; X86: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1)
-; ARM: @g = internal alias void (), getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1)
-; THUMB: @g = internal alias void (), getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1)
-; THUMBV6M: @g = internal alias void (), getelementptr inbounds ([2 x [16 x i8]], ptr @[[JT]], i64 0, i64 1)
-; RISCV: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1)
-; LOONGARCH64: @g = internal alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1)
+; JT4: @f = alias [4 x i8], ptr @[[JT:.*]]
+; JT8: @f = alias [8 x i8], ptr @[[JT:.*]]
+; JT16: @f = alias [16 x i8], ptr @[[JT:.*]]
+
+; JT4: @g = internal alias [4 x i8], getelementptr inbounds ([2 x [4 x i8]], ptr @[[JT]], i64 0, i64 1)
+; JT8: @g = internal alias [8 x i8], getelementptr inbounds ([2 x [8 x i8]], ptr @[[JT]], i64 0, i64 1)
+; JT16: @g = internal alias [16 x i8], getelementptr inbounds ([2 x [16 x i8]], ptr @[[JT]], i64 0, i64 1)
 
 ; NATIVE: define hidden void @f.cfi()
 ; WASM32: define void @f() !type !{{[0-9]+}} !wasm.index ![[I0:[0-9]+]]
diff --git a/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll b/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll
index f67e0b1711652..8cb41398e8f53 100644
--- a/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll
+++ b/llvm/test/Transforms/LowerTypeTests/icall-branch-funnel.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux"
 ; CHECK: @0 = private constant { i32, [0 x i8], i32 } { i32 1, [0 x i8] zeroinitializer, i32 2 }
 ; CHECK: @g1 = alias i32, ptr @0
 ; CHECK: @g2 = alias i32, getelementptr inbounds ({ i32, [0 x i8], i32 }, ptr @0, i32 0, i32 2)
-; CHECK: @f1 = alias void (), ptr @.cfi.jumptable
-; CHECK: @f2 = alias void (), getelementptr inbounds ([2 x [8 x i8]], ptr @.cfi.jumptable, i64 0, i64 1)
+; CHECK: @f1 = alias [8 x i8], ptr @.cfi.jumptable
+; CHECK: @f2 = alias [8 x i8], getelementptr inbounds ([2 x [8 x i8]], ptr @.cfi.jumptable, i64 0, i64 1)
 
 @g1 = constant i32 1
 @g2 = constant i32 2
diff --git a/llvm/test/Transforms/LowerTypeTests/pr37625.ll b/llvm/test/Transforms/LowerTypeTests/pr37625.ll
index 639cc3fa32bcb..cf52cdf0759a3 100644
--- a/llvm/test/Transforms/LowerTypeTests/pr37625.ll
+++ b/llvm/test/Transforms/LowerTypeTests/pr37625.ll
@@ -11,4 +11,4 @@ declare !type !2 extern_weak void @external_addrtaken(i8)
 !1 = !{!"external_addrtaken", i8 0, !2}
 !2 = !{i64 0, !"typeid1"}
 
-; CHECK-DAG: @external_addrtaken = alias void (i8), ptr @.cfi.jumptable
+; CHECK-DAG: @external_addrtaken = alias [8 x i8], ptr @.cfi.jumptable
diff --git a/llvm/test/Transforms/LowerTypeTests/section.ll b/llvm/test/Transforms/LowerTypeTests/section.ll
index d0d3c212c826e..bd91389c60ef0 100644
--- a/llvm/test/Transforms/LowerTypeTests/section.ll
+++ b/llvm/test/Transforms/LowerTypeTests/section.ll
@@ -5,7 +5,7 @@
 
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK: @f = alias void (), ptr @[[JT:.*]]
+; CHECK: @f = alias [8 x i8], ptr @[[JT:.*]]
 ; CHECK: define hidden void @f.cfi() section "xxx"
 
 define void @f() section "xxx" !type !0 {

From 8ddada41df0488358373cff1d31a47e5ef4961e0 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim@andestech.com>
Date: Wed, 18 Jun 2025 09:17:46 +0800
Subject: [PATCH 843/851] [RISCV] Add Andes XAndesVBFHCvt (Andes Vector
 BFLOAT16 Conversion) extension (#144320)

The spec can be found at:
https://github.com/andestech/andes-v5-isa/releases/tag/ast-v5_4_0-release.

This patch only supports assembler. The instructions are similar to
`Zvfbfmin` and the only difference with `Zvfbfmin` is that
`XAndesVBFHCvt` doesn't have mask variant.
---
 .../Driver/print-supported-extensions-riscv.c |  1 +
 .../riscv-target-features-andes.c             |  8 +++++
 llvm/docs/RISCVUsage.rst                      |  3 ++
 llvm/docs/ReleaseNotes.md                     |  1 +
 .../RISCV/Disassembler/RISCVDisassembler.cpp  |  4 +--
 llvm/lib/Target/RISCV/RISCVFeatures.td        |  9 ++++++
 llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td | 31 +++++++++++++++++++
 llvm/test/CodeGen/RISCV/attributes.ll         |  4 +++
 llvm/test/CodeGen/RISCV/features-info.ll      |  1 +
 llvm/test/MC/RISCV/xandesvbfhcvt-valid.s      | 27 ++++++++++++++++
 .../TargetParser/RISCVISAInfoTest.cpp         |  1 +
 11 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/MC/RISCV/xandesvbfhcvt-valid.s

diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c
index e1f5a7a0105d7..5008c2b7f789d 100644
--- a/clang/test/Driver/print-supported-extensions-riscv.c
+++ b/clang/test/Driver/print-supported-extensions-riscv.c
@@ -159,6 +159,7 @@
 // CHECK-NEXT:     svpbmt               1.0       'Svpbmt' (Page-Based Memory Types)
 // CHECK-NEXT:     svvptc               1.0       'Svvptc' (Obviating Memory-Management Instructions after Marking PTEs Valid)
 // CHECK-NEXT:     xandesperf           5.0       'XAndesPerf' (Andes Performance Extension)
+// CHECK-NEXT:     xandesvbfhcvt        5.0       'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension)
 // CHECK-NEXT:     xandesvdot           5.0       'XAndesVDot' (Andes Vector Dot Product Extension)
 // CHECK-NEXT:     xandesvpackfph       5.0       'XAndesVPackFPH' (Andes Vector Packed FP16 Extension)
 // CHECK-NEXT:     xcvalu               1.0       'XCValu' (CORE-V ALU Operations)
diff --git a/clang/test/Preprocessor/riscv-target-features-andes.c b/clang/test/Preprocessor/riscv-target-features-andes.c
index 3cd9b04354132..c66d4427b5cf2 100644
--- a/clang/test/Preprocessor/riscv-target-features-andes.c
+++ b/clang/test/Preprocessor/riscv-target-features-andes.c
@@ -15,6 +15,14 @@
 // RUN:   -o - | FileCheck --check-prefix=CHECK-XANDESPERF %s
 // CHECK-XANDESPERF: __riscv_xandesperf  5000000{{$}}
 
+// RUN: %clang --target=riscv32 \
+// RUN:   -march=rv32i_xandesvbfhcvt -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XANDESVBFHCVT %s
+// RUN: %clang --target=riscv64 \
+// RUN:   -march=rv64i_xandesvbfhcvt -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-XANDESVBFHCVT %s
+// CHECK-XANDESVBFHCVT: __riscv_xandesvbfhcvt  5000000{{$}}
+
 // RUN: %clang --target=riscv32 \
 // RUN:   -march=rv32i_xandesvpackfph -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-XANDESVPACKFPH %s
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index aadda309feab0..81684ba30f12c 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -513,6 +513,9 @@ The current vendor extensions supported are:
 ``XAndesPerf``
   LLVM implements `version 5.0.0 of the Andes Performance Extension specification <https://github.com/andestech/andes-v5-isa/releases/download/ast-v5_4_0-release/AndeStar_V5_ISA_Spec_UM165-v1.5.08-20250317.pdf>`__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
 
+``XAndesVBFHCvt``
+  LLVM implements `version 5.0.0 of the Andes Vector BFLOAT16 Conversion Extension specification <https://github.com/andestech/andes-v5-isa/releases/download/ast-v5_4_0-release/AndeStar_V5_ISA_Spec_UM165-v1.5.08-20250317.pdf>`__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
+
 ``XAndesVPackFPH``
   LLVM implements `version 5.0.0 of the Andes Vector Packed FP16 Extension specification <https://github.com/andestech/andes-v5-isa/releases/download/ast-v5_4_0-release/AndeStar_V5_ISA_Spec_UM165-v1.5.08-20250317.pdf>`__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification.
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 5c9ed181af59e..0395f43c61953 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -210,6 +210,7 @@ Changes to the RISC-V Backend
 * The `Shlcofideleg` extension was added.
 * `-mcpu=sifive-x390` was added.
 * `-mtune=andes-45-series` was added.
+* Adds assembler support for the Andes `XAndesvbfhcvt` (Andes Vector BFLOAT16 Conversion extension).
 
 Changes to the WebAssembly Backend
 ----------------------------------
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index cbab081a6731e..27e04c0cb1f8b 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -774,8 +774,8 @@ static constexpr FeatureBitset XTHeadGroup = {
     RISCV::FeatureVendorXTHeadVdot};
 
 static constexpr FeatureBitset XAndesGroup = {
-    RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVPackFPH,
-    RISCV::FeatureVendorXAndesVDot};
+    RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVBFHCvt,
+    RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot};
 
 static constexpr DecoderListEntry DecoderList32[]{
     // Vendor Extensions
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 0b35084267324..6df6368929dac 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1599,6 +1599,15 @@ def HasVendorXAndesPerf
       AssemblerPredicate<(all_of FeatureVendorXAndesPerf),
                          "'XAndesPerf' (Andes Performance Extension)">;
 
+def FeatureVendorXAndesVBFHCvt
+    : RISCVExtension<5, 0, "Andes Vector BFLOAT16 Conversion Extension",
+                     [FeatureStdExtZve32f]>;
+def HasVendorXAndesVBFHCvt
+    : Predicate<"Subtarget->hasVendorXAndesVBFHCvt()">,
+      AssemblerPredicate<(all_of FeatureVendorXAndesVBFHCvt),
+                         "'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension)">;
+
+
 def FeatureVendorXAndesVPackFPH
     : RISCVExtension<5, 0, "Andes Vector Packed FP16 Extension",
                      [FeatureStdExtZvfhmin]>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
index 4cf8309ea17f4..3ba21e51e7c66 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td
@@ -361,6 +361,25 @@ class NDSRVInstVD4DOT<bits<6> funct6, string opcodestr>
   let RVVConstraint = VMConstraint;
 }
 
+class NDSRVInstVBFHCvt<bits<7> funct7, bits<5> vs1, string opcodestr>
+    : RVInst<(outs VR:$vd), (ins VR:$vs2, VMaskOp:$vm),
+             opcodestr, "$vd, $vs2", [], InstFormatR> {
+  bits<5> vs2;
+  bits<5> vd;
+
+  let Inst{31-25} = funct7;
+  let Inst{24-20} = vs2;
+  let Inst{19-15} = vs1;
+  let Inst{14-12} = 0b100;
+  let Inst{11-7} = vd;
+  let Inst{6-0} = OPC_CUSTOM_2.Value;
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+
+  let Uses = [VL, VTYPE];
+}
+
 //===----------------------------------------------------------------------===//
 // Multiclass
 //===----------------------------------------------------------------------===//
@@ -460,6 +479,18 @@ def NDS_LDGP  : NDSRVInstLDGP<0b011, "nds.ldgp">;
 def NDS_SDGP  : NDSRVInstSDGP<0b111, "nds.sdgp">;
 } // Predicates = [HasVendorXAndesPerf, IsRV64]
 
+//===----------------------------------------------------------------------===//
+// XAndesVBFHCvt
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasVendorXAndesVBFHCvt], Constraints = "@earlyclobber $vd",
+    mayRaiseFPException = true in {
+let RVVConstraint = VS2Constraint, DestEEW = EEWSEWx2 in
+def NDS_VFWCVT_S_BF16 : NDSRVInstVBFHCvt<0b0000000, 0b00000, "nds.vfwcvt.s.bf16">;
+let Uses = [FRM, VL, VTYPE] in
+def NDS_VFNCVT_BF16_S : NDSRVInstVBFHCvt<0b0000000, 0b00001, "nds.vfncvt.bf16.s">;
+}
+
 //===----------------------------------------------------------------------===//
 // XAndesVPackFPH
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index cdbf1caff5d80..c5188aa1918bf 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -106,6 +106,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisls %s -o - | FileCheck --check-prefix=RV32XQCISLS %s
 ; RUN: llc -mtriple=riscv32 -mattr=+experimental-xqcisync %s -o - | FileCheck --check-prefix=RV32XQCISYNC %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV32XANDESPERF %s
+; RUN: llc -mtriple=riscv32 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV32XANDESVBFHCVT %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV32XANDESVDOT %s
 ; RUN: llc -mtriple=riscv32 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV32XANDESVPACKFPH %s
 ; RUN: llc -mtriple=riscv32 -mattr=+zaamo %s -o - | FileCheck --check-prefix=RV32ZAAMO %s
@@ -260,6 +261,7 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV64XTHEADSYNC %s
 ; RUN: llc -mtriple=riscv64 -mattr=+xtheadvdot %s -o - | FileCheck --check-prefixes=CHECK,RV64XTHEADVDOT %s
 ; RUN: llc -mtriple=riscv64 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV64XANDESPERF %s
+; RUN: llc -mtriple=riscv64 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV64XANDESVBFHCVT %s
 ; RUN: llc -mtriple=riscv64 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV64XANDESVDOT %s
 ; RUN: llc -mtriple=riscv64 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV64XANDESVPACKFPH %s
 ; RUN: llc -mtriple=riscv64 -mattr=+za64rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA64RS %s
@@ -457,6 +459,7 @@
 ; RV32XQCISLS: .attribute 5, "rv32i2p1_xqcisls0p2"
 ; RV32XQCISYNC: attribute 5, "rv32i2p1_zca1p0_xqcisync0p3"
 ; RV32XANDESPERF: .attribute 5, "rv32i2p1_xandesperf5p0"
+; RV32XANDESVBFHCVT: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0"
 ; RV32XANDESVDOT: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0"
 ; RV32XANDESVPACKFPH: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfhmin1p0_zvl32b1p0_xandesvpackfph5p0"
 ; RV32ZAAMO: .attribute 5, "rv32i2p1_zaamo1p0"
@@ -612,6 +615,7 @@
 ; RV64XTHEADSYNC: .attribute 5, "rv64i2p1_xtheadsync1p0"
 ; RV64XTHEADVDOT: .attribute 5, "rv64i2p1_f2p2_d2p2_v1p0_zicsr2p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xtheadvdot1p0"
 ; RV64XANDESPERF: .attribute 5, "rv64i2p1_xandesperf5p0"
+; RV64XANDESVBFHCVT: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0"
 ; RV64XANDESVDOT: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0"
 ; RV64XANDESVPACKFPH: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfhmin1p0_zvl32b1p0_xandesvpackfph5p0"
 ; RV64ZTSO: .attribute 5, "rv64i2p1_ztso1p0"
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index fab2e94959301..8b931f70aa5cc 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -174,6 +174,7 @@
 ; CHECK-NEXT:   ventana-veyron                   - Ventana Veyron-Series processors.
 ; CHECK-NEXT:   vxrm-pipeline-flush              - VXRM writes causes pipeline flush.
 ; CHECK-NEXT:   xandesperf                       - 'XAndesPerf' (Andes Performance Extension).
+; CHECK-NEXT:   xandesvbfhcvt                    - 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension).
 ; CHECK-NEXT:   xandesvdot                       - 'XAndesVDot' (Andes Vector Dot Product Extension).
 ; CHECK-NEXT:   xandesvpackfph                   - 'XAndesVPackFPH' (Andes Vector Packed FP16 Extension).
 ; CHECK-NEXT:   xcvalu                           - 'XCValu' (CORE-V ALU Operations).
diff --git a/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s b/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s
new file mode 100644
index 0000000000000..355846719e46f
--- /dev/null
+++ b/llvm/test/MC/RISCV/xandesvbfhcvt-valid.s
@@ -0,0 +1,27 @@
+# XAndesVBFHCvt - Andes Vector BFLOAT16 Conversion Extension
+# RUN: llvm-mc %s -triple=riscv32 -mattr=+xandesvbfhcvt -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
+# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xandesvbfhcvt < %s \
+# RUN:     | llvm-objdump --mattr=+xandesvbfhcvt -M no-aliases -d -r - \
+# RUN:     | FileCheck -check-prefixes=CHECK-OBJ %s
+# RUN: not llvm-mc -triple=riscv32 -show-encoding %s 2>&1 \
+# RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+# RUN: llvm-mc %s -triple=riscv64 -mattr=+xandesvbfhcvt -show-encoding \
+# RUN:     | FileCheck -check-prefixes=CHECK-ASM %s
+# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+xandesvbfhcvt < %s \
+# RUN:     | llvm-objdump --mattr=+xandesvbfhcvt -M no-aliases -d -r - \
+# RUN:     | FileCheck -check-prefixes=CHECK-OBJ %s
+# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
+# RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+
+# CHECK-OBJ: nds.vfwcvt.s.bf16 v8, v10
+# CHECK-ASM: nds.vfwcvt.s.bf16 v8, v10
+# CHECK-ASM: encoding: [0x5b,0x44,0xa0,0x00]
+# CHECK-ERROR: instruction requires the following: 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension){{$}}
+nds.vfwcvt.s.bf16 v8, v10
+
+# CHECK-OBJ: nds.vfncvt.bf16.s v8, v10
+# CHECK-ASM: nds.vfncvt.bf16.s v8, v10
+# CHECK-ASM: encoding: [0x5b,0xc4,0xa0,0x00]
+# CHECK-ERROR: instruction requires the following: 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension){{$}}
+nds.vfncvt.bf16.s v8, v10
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index a0910a164ea08..66e335a33a3f7 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -1130,6 +1130,7 @@ R"(All available -march extensions for RISC-V
     svpbmt               1.0
     svvptc               1.0
     xandesperf           5.0
+    xandesvbfhcvt        5.0
     xandesvdot           5.0
     xandesvpackfph       5.0
     xcvalu               1.0

From a96a3f1b26baa8e5ee0abbac629f02566b7e9d1c Mon Sep 17 00:00:00 2001
From: Jacob Lalonde <jalalonde@fb.com>
Date: Tue, 17 Jun 2025 18:37:15 -0700
Subject: [PATCH 844/851] [lldb][Minidump Parser] Implement a range data vector
 for minidump memory ranges (#136040)

Recently I was debugging a Minidump with a few thousand ranges, and came
across the (now deleted) comment:

```
  // I don't have a sense of how frequently this is called or how many memory
  // ranges a Minidump typically has, so I'm not sure if searching for the
  // appropriate range linearly each time is stupid.  Perhaps we should build
  // an index for faster lookups.
```

blaming this comment, it's 9 years old! Much overdue for this simple fix
with a range data vector.

I had to add a default constructor to Range in order to implement the
RangeDataVector, but otherwise this just a replacement of look up logic.
---
 .../Process/minidump/MinidumpParser.cpp       | 72 ++++++++++---------
 .../Plugins/Process/minidump/MinidumpParser.h | 21 +++++-
 2 files changed, 55 insertions(+), 38 deletions(-)

diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
index 94c0a5f11e435..ef691b77193ce 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp
@@ -20,8 +20,8 @@
 #include <algorithm>
 #include <map>
 #include <optional>
-#include <vector>
 #include <utility>
+#include <vector>
 
 using namespace lldb_private;
 using namespace minidump;
@@ -75,8 +75,7 @@ UUID MinidumpParser::GetModuleUUID(const minidump::Module *module) {
     if (GetArchitecture().GetTriple().isOSBinFormatELF()) {
       if (pdb70_uuid->Age != 0)
         return UUID(pdb70_uuid, sizeof(*pdb70_uuid));
-      return UUID(&pdb70_uuid->Uuid,
-                                    sizeof(pdb70_uuid->Uuid));
+      return UUID(&pdb70_uuid->Uuid, sizeof(pdb70_uuid->Uuid));
     }
     return UUID(*pdb70_uuid);
   } else if (cv_signature == CvSignature::ElfBuildId)
@@ -429,62 +428,65 @@ MinidumpParser::GetExceptionStreams() {
 
 std::optional<minidump::Range>
 MinidumpParser::FindMemoryRange(lldb::addr_t addr) {
-  Log *log = GetLog(LLDBLog::Modules);
+  if (m_memory_ranges.IsEmpty())
+    PopulateMemoryRanges();
+
+  const MemoryRangeVector::Entry *entry =
+      m_memory_ranges.FindEntryThatContains(addr);
+  if (!entry)
+    return std::nullopt;
 
+  return entry->data;
+}
+
+void MinidumpParser::PopulateMemoryRanges() {
+  Log *log = GetLog(LLDBLog::Modules);
   auto ExpectedMemory = GetMinidumpFile().getMemoryList();
-  if (!ExpectedMemory) {
-    LLDB_LOG_ERROR(log, ExpectedMemory.takeError(),
-                   "Failed to read memory list: {0}");
-  } else {
+  if (ExpectedMemory) {
     for (const auto &memory_desc : *ExpectedMemory) {
       const LocationDescriptor &loc_desc = memory_desc.Memory;
       const lldb::addr_t range_start = memory_desc.StartOfMemoryRange;
       const size_t range_size = loc_desc.DataSize;
-
-      if (loc_desc.RVA + loc_desc.DataSize > GetData().size())
-        return std::nullopt;
-
-      if (range_start <= addr && addr < range_start + range_size) {
-        auto ExpectedSlice = GetMinidumpFile().getRawData(loc_desc);
-        if (!ExpectedSlice) {
-          LLDB_LOG_ERROR(log, ExpectedSlice.takeError(),
-                         "Failed to get memory slice: {0}");
-          return std::nullopt;
-        }
-        return minidump::Range(range_start, *ExpectedSlice);
+      auto ExpectedSlice = GetMinidumpFile().getRawData(loc_desc);
+      if (!ExpectedSlice) {
+        LLDB_LOG_ERROR(log, ExpectedSlice.takeError(),
+                       "Failed to get memory slice: {0}");
+        continue;
       }
+      m_memory_ranges.Append(MemoryRangeVector::Entry(
+          range_start, range_size,
+          minidump::Range(range_start, *ExpectedSlice)));
     }
+  } else {
+    LLDB_LOG_ERROR(log, ExpectedMemory.takeError(),
+                   "Failed to read memory list: {0}");
   }
 
   if (!GetStream(StreamType::Memory64List).empty()) {
     llvm::Error err = llvm::Error::success();
-    for (const auto &memory_desc :  GetMinidumpFile().getMemory64List(err)) {
-      if (memory_desc.first.StartOfMemoryRange <= addr 
-          && addr < memory_desc.first.StartOfMemoryRange + memory_desc.first.DataSize) {
-        return minidump::Range(memory_desc.first.StartOfMemoryRange, memory_desc.second);
-      }
+    for (const auto &memory_desc : GetMinidumpFile().getMemory64List(err)) {
+      m_memory_ranges.Append(MemoryRangeVector::Entry(
+          memory_desc.first.StartOfMemoryRange, memory_desc.first.DataSize,
+          minidump::Range(memory_desc.first.StartOfMemoryRange,
+                          memory_desc.second)));
     }
 
     if (err)
       LLDB_LOG_ERROR(log, std::move(err), "Failed to read memory64 list: {0}");
   }
 
-  return std::nullopt;
+  m_memory_ranges.Sort();
 }
 
 llvm::ArrayRef<uint8_t> MinidumpParser::GetMemory(lldb::addr_t addr,
                                                   size_t size) {
-  // I don't have a sense of how frequently this is called or how many memory
-  // ranges a Minidump typically has, so I'm not sure if searching for the
-  // appropriate range linearly each time is stupid.  Perhaps we should build
-  // an index for faster lookups.
   std::optional<minidump::Range> range = FindMemoryRange(addr);
   if (!range)
     return {};
 
   // There's at least some overlap between the beginning of the desired range
-  // (addr) and the current range.  Figure out where the overlap begins and how
-  // much overlap there is.
+  // (addr) and the current range.  Figure out where the overlap begins and
+  // how much overlap there is.
 
   const size_t offset = addr - range->start;
 
@@ -495,7 +497,8 @@ llvm::ArrayRef<uint8_t> MinidumpParser::GetMemory(lldb::addr_t addr,
   return range->range_ref.slice(offset, overlap);
 }
 
-llvm::iterator_range<FallibleMemory64Iterator> MinidumpParser::GetMemory64Iterator(llvm::Error &err) {
+llvm::iterator_range<FallibleMemory64Iterator>
+MinidumpParser::GetMemory64Iterator(llvm::Error &err) {
   llvm::ErrorAsOutParameter ErrAsOutParam(&err);
   return m_file->getMemory64List(err);
 }
@@ -607,8 +610,7 @@ std::pair<MemoryRegionInfos, bool> MinidumpParser::BuildMemoryRegions() {
   case StreamType::ST:                                                         \
     return #ST
 
-llvm::StringRef
-MinidumpParser::GetStreamTypeAsString(StreamType stream_type) {
+llvm::StringRef MinidumpParser::GetStreamTypeAsString(StreamType stream_type) {
   switch (stream_type) {
     ENUM_TO_CSTR(Unused);
     ENUM_TO_CSTR(ThreadList);
diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
index 2c5e6f19ff9a1..14599f8d572aa 100644
--- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h
+++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h
@@ -17,6 +17,7 @@
 #include "lldb/Utility/Status.h"
 #include "lldb/Utility/UUID.h"
 
+#include "lldb/Utility/RangeMap.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringRef.h"
@@ -35,6 +36,9 @@ namespace minidump {
 
 // Describes a range of memory captured in the Minidump
 struct Range {
+  // Default constructor required for range data vector
+  // but unusued.
+  Range() = default;
   lldb::addr_t start; // virtual address of the beginning of the range
   // range_ref - absolute pointer to the first byte of the range and size
   llvm::ArrayRef<uint8_t> range_ref;
@@ -45,9 +49,18 @@ struct Range {
   friend bool operator==(const Range &lhs, const Range &rhs) {
     return lhs.start == rhs.start && lhs.range_ref == rhs.range_ref;
   }
+
+  friend bool operator<(const Range &lhs, const Range &rhs) {
+    if (lhs.start == rhs.start)
+      return lhs.range_ref.size() < rhs.range_ref.size();
+    return lhs.start < rhs.start;
+  }
 };
 
-using FallibleMemory64Iterator = llvm::object::MinidumpFile::FallibleMemory64Iterator;
+using MemoryRangeVector =
+    lldb_private::RangeDataVector<lldb::addr_t, lldb::addr_t, minidump::Range>;
+using FallibleMemory64Iterator =
+    llvm::object::MinidumpFile::FallibleMemory64Iterator;
 using ExceptionStreamsIterator =
     llvm::object::MinidumpFile::ExceptionStreamsIterator;
 
@@ -97,7 +110,8 @@ class MinidumpParser {
   /// complete (includes all regions mapped into the process memory).
   std::pair<MemoryRegionInfos, bool> BuildMemoryRegions();
 
-  llvm::iterator_range<FallibleMemory64Iterator> GetMemory64Iterator(llvm::Error &err);
+  llvm::iterator_range<FallibleMemory64Iterator>
+  GetMemory64Iterator(llvm::Error &err);
 
   static llvm::StringRef GetStreamTypeAsString(StreamType stream_type);
 
@@ -109,10 +123,11 @@ class MinidumpParser {
 private:
   MinidumpParser(lldb::DataBufferSP data_sp,
                  std::unique_ptr<llvm::object::MinidumpFile> file);
-
+  void PopulateMemoryRanges();
   lldb::DataBufferSP m_data_sp;
   std::unique_ptr<llvm::object::MinidumpFile> m_file;
   ArchSpec m_arch;
+  MemoryRangeVector m_memory_ranges;
 };
 
 } // end namespace minidump

From a2ad65661ad560b04952d4d992248d2db3be36c8 Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli <quic_svs@quicinc.com>
Date: Wed, 18 Jun 2025 07:54:08 +0530
Subject: [PATCH 845/851] [RISCV] Add patterns for generating QC_CTO and QC_CLO
 (#144532)

These instructions count leading/trailing ones in the register.

Currently these are only generated when we have `Zbb` enabled (along
with `Xqcibm`) since it contains the `CTTZ/CTLZ` instructions.
---
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td |   5 +
 llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll   | 958 ++++++++++++++++++++
 2 files changed, 963 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index b94fee3c6e575..09852c6fd5969 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1452,6 +1452,11 @@ def : Pat<(i32 (and GPRNoX0:$rs, 1023)), (QC_EXTU GPRNoX0:$rs, 10, 0)>;
 def : Pat<(i32 (and GPRNoX0:$rs, 2047)), (QC_EXTU GPRNoX0:$rs, 11, 0)>;
 } // Predicates = [HasVendorXqcibm, IsRV32]
 
+let Predicates = [HasVendorXqcibm, HasStdExtZbb, IsRV32] in {
+def: Pat<(i32 (cttz (not (i32 GPR:$rs1)))), (QC_CTO GPR:$rs1)>;
+def: Pat<(i32 (ctlz (not (i32 GPR:$rs1)))), (QC_CLO GPR:$rs1)>;
+} // Predicates = [HasVendorXqcibm, HasStdExtZbb, IsRV32]
+
 let Predicates = [HasVendorXqciint, IsRV32] in
 def : Pat<(riscv_mileaveret_glue), (QC_C_MILEAVERET)>;
 
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll
new file mode 100644
index 0000000000000..fe2bcf00ba7d4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo.ll
@@ -0,0 +1,958 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV32I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefixes=RV32ZBB
+; RUN: llc -mtriple=riscv32 -mattr=+zbb,experimental-xqcibm -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32ZBBXQCIBM
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @test_cttz_i8(i8 %a) nounwind {
+; RV32I-LABEL: test_cttz_i8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    zext.b a1, a0
+; RV32I-NEXT:    beqz a1, .LBB0_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB0_2:
+; RV32I-NEXT:    li a0, 8
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i8:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    li a1, 256
+; RV32ZBB-NEXT:    orn a0, a1, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i8:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    li a1, 256
+; RV32ZBBXQCIBM-NEXT:    orn a0, a1, a0
+; RV32ZBBXQCIBM-NEXT:    ctz a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i8 %a, -1
+  %tmp = call i8 @llvm.cttz.i8(i8 %1, i1 false)
+  ret i8 %tmp
+}
+
+define i16 @test_cttz_i16(i16 %a) nounwind {
+; RV32I-LABEL: test_cttz_i16:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    beqz a1, .LBB1_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    addi a1, a0, -1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB1_2:
+; RV32I-NEXT:    li a0, 16
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i16:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    lui a1, 16
+; RV32ZBB-NEXT:    orn a0, a1, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i16:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    lui a1, 16
+; RV32ZBBXQCIBM-NEXT:    orn a0, a1, a0
+; RV32ZBBXQCIBM-NEXT:    ctz a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i16 %a, -1
+  %tmp = call i16 @llvm.cttz.i16(i16 %1, i1 false)
+  ret i16 %tmp
+}
+
+define i32 @test_cttz_i32(i32 %a) nounwind {
+; RV32I-LABEL: test_cttz_i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    beqz a0, .LBB2_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi a1, a1, 1329
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    lui a1, %hi(.LCPI2_0)
+; RV32I-NEXT:    addi a1, a1, %lo(.LCPI2_0)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB2_2:
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i32:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i32 %a, -1
+  %tmp = call i32 @llvm.cttz.i32(i32 %1, i1 false)
+  ret i32 %tmp
+}
+
+define i64 @test_cttz_i64(i64 %a) nounwind {
+; RV32I-LABEL: test_cttz_i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    not s3, a1
+; RV32I-NEXT:    not s2, a0
+; RV32I-NEXT:    or a0, s2, s3
+; RV32I-NEXT:    beqz a0, .LBB3_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    neg a0, s2
+; RV32I-NEXT:    and a0, s2, a0
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi s1, a1, 1329
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lui s4, %hi(.LCPI3_0)
+; RV32I-NEXT:    addi s4, s4, %lo(.LCPI3_0)
+; RV32I-NEXT:    neg a0, s3
+; RV32I-NEXT:    and a0, s3, a0
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    bnez s2, .LBB3_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    add a0, s4, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    j .LBB3_5
+; RV32I-NEXT:  .LBB3_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    j .LBB3_6
+; RV32I-NEXT:  .LBB3_4:
+; RV32I-NEXT:    srli s0, s0, 27
+; RV32I-NEXT:    add s0, s4, s0
+; RV32I-NEXT:    lbu a0, 0(s0)
+; RV32I-NEXT:  .LBB3_5: # %cond.false
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:  .LBB3_6: # %cond.end
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    bnez a0, .LBB3_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    not a0, a1
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    addi a0, a0, 32
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+; RV32ZBB-NEXT:  .LBB3_2:
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i64:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a2, a0
+; RV32ZBBXQCIBM-NEXT:    bnez a2, .LBB3_2
+; RV32ZBBXQCIBM-NEXT:  # %bb.1:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a1
+; RV32ZBBXQCIBM-NEXT:    addi a0, a0, 32
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+; RV32ZBBXQCIBM-NEXT:  .LBB3_2:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i64 %a, -1
+  %tmp = call i64 @llvm.cttz.i64(i64 %1, i1 false)
+  ret i64 %tmp
+}
+
+define i8 @test_cttz_i8_zero_undef(i8 %a) nounwind {
+; RV32I-LABEL: test_cttz_i8_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a1, a0
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i8_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i8_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i8 %a, -1
+  %tmp = call i8 @llvm.cttz.i8(i8 %1, i1 true)
+  ret i8 %tmp
+}
+
+define i16 @test_cttz_i16_zero_undef(i16 %a) nounwind {
+; RV32I-LABEL: test_cttz_i16_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a1, a0
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    addi a1, a1, -1
+; RV32I-NEXT:    addi a2, a2, 1365
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    and a1, a1, a2
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i16_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i16_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i16 %a, -1
+  %tmp = call i16 @llvm.cttz.i16(i16 %1, i1 true)
+  ret i16 %tmp
+}
+
+define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
+; RV32I-LABEL: test_cttz_i32_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    neg a1, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi a1, a1, 1329
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    lui a1, %hi(.LCPI6_0)
+; RV32I-NEXT:    addi a1, a1, %lo(.LCPI6_0)
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i32_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i32_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i32 %a, -1
+  %tmp = call i32 @llvm.cttz.i32(i32 %1, i1 true)
+  ret i32 %tmp
+}
+
+define i64 @test_cttz_i64_zero_undef(i64 %a) nounwind {
+; RV32I-LABEL: test_cttz_i64_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -32
+; RV32I-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    not s3, a1
+; RV32I-NEXT:    not s4, a0
+; RV32I-NEXT:    neg a0, s4
+; RV32I-NEXT:    and a0, s4, a0
+; RV32I-NEXT:    lui a1, 30667
+; RV32I-NEXT:    addi s1, a1, 1329
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    lui s2, %hi(.LCPI7_0)
+; RV32I-NEXT:    addi s2, s2, %lo(.LCPI7_0)
+; RV32I-NEXT:    neg a0, s3
+; RV32I-NEXT:    and a0, s3, a0
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __mulsi3
+; RV32I-NEXT:    bnez s4, .LBB7_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    srli a0, a0, 27
+; RV32I-NEXT:    add a0, s2, a0
+; RV32I-NEXT:    lbu a0, 0(a0)
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    j .LBB7_3
+; RV32I-NEXT:  .LBB7_2:
+; RV32I-NEXT:    srli s0, s0, 27
+; RV32I-NEXT:    add s0, s2, s0
+; RV32I-NEXT:    lbu a0, 0(s0)
+; RV32I-NEXT:  .LBB7_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 32
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_cttz_i64_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    bnez a0, .LBB7_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    not a0, a1
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    addi a0, a0, 32
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+; RV32ZBB-NEXT:  .LBB7_2:
+; RV32ZBB-NEXT:    ctz a0, a0
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_cttz_i64_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a2, a0
+; RV32ZBBXQCIBM-NEXT:    bnez a2, .LBB7_2
+; RV32ZBBXQCIBM-NEXT:  # %bb.1:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a1
+; RV32ZBBXQCIBM-NEXT:    addi a0, a0, 32
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+; RV32ZBBXQCIBM-NEXT:  .LBB7_2:
+; RV32ZBBXQCIBM-NEXT:    qc.cto a0, a0
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i64 %a, -1
+  %tmp = call i64 @llvm.cttz.i64(i64 %1, i1 true)
+  ret i64 %tmp
+}
+
+define i8 @test_ctlz_i8(i8 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i8:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    zext.b a1, a0
+; RV32I-NEXT:    beqz a1, .LBB8_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 25
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 26
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB8_2:
+; RV32I-NEXT:    li a0, 8
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i8:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    slli a0, a0, 24
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i8:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    slli a0, a0, 24
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i8 %a, -1
+  %tmp = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+  ret i8 %tmp
+}
+
+define i16 @test_ctlz_i16(i16 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i16:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    beqz a1, .LBB9_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    srli a1, a1, 17
+; RV32I-NEXT:    lui a2, 5
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 18
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 20
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 24
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB9_2:
+; RV32I-NEXT:    li a0, 16
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i16:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    slli a0, a0, 16
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i16:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    slli a0, a0, 16
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i16 %a, -1
+  %tmp = call i16 @llvm.ctlz.i16(i16 %1, i1 false)
+  ret i16 %tmp
+}
+
+define i32 @test_ctlz_i32(i32 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    beqz a0, .LBB10_2
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    lui a2, 349525
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, 1365
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, -241
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB10_2:
+; RV32I-NEXT:    li a0, 32
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i32:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i32:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i32 %a, -1
+  %tmp = call i32 @llvm.ctlz.i32(i32 %1, i1 false)
+  ret i32 %tmp
+}
+
+define i64 @test_ctlz_i64(i64 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a3, a1
+; RV32I-NEXT:    not a4, a0
+; RV32I-NEXT:    or a0, a4, a3
+; RV32I-NEXT:    beqz a0, .LBB11_3
+; RV32I-NEXT:  # %bb.1: # %cond.false
+; RV32I-NEXT:    lui a0, 349525
+; RV32I-NEXT:    lui a1, 209715
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a2, a0, 1365
+; RV32I-NEXT:    addi a1, a1, 819
+; RV32I-NEXT:    addi a0, a5, -241
+; RV32I-NEXT:    bnez a3, .LBB11_4
+; RV32I-NEXT:  # %bb.2: # %cond.false
+; RV32I-NEXT:    srli a3, a4, 1
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    srli a4, a3, 2
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 4
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    not a3, a3
+; RV32I-NEXT:    srli a4, a3, 1
+; RV32I-NEXT:    and a2, a4, a2
+; RV32I-NEXT:    sub a3, a3, a2
+; RV32I-NEXT:    and a2, a3, a1
+; RV32I-NEXT:    srli a3, a3, 2
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB11_3:
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    li a0, 64
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB11_4:
+; RV32I-NEXT:    srli a4, a3, 1
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 2
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 4
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 8
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    srli a4, a3, 16
+; RV32I-NEXT:    or a3, a3, a4
+; RV32I-NEXT:    not a3, a3
+; RV32I-NEXT:    srli a4, a3, 1
+; RV32I-NEXT:    and a2, a4, a2
+; RV32I-NEXT:    sub a3, a3, a2
+; RV32I-NEXT:    and a2, a3, a1
+; RV32I-NEXT:    srli a3, a3, 2
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    srli a2, a1, 4
+; RV32I-NEXT:    add a1, a1, a2
+; RV32I-NEXT:    and a0, a1, a0
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i64:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a1, a1
+; RV32ZBB-NEXT:    bnez a1, .LBB11_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    addi a0, a0, 32
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+; RV32ZBB-NEXT:  .LBB11_2:
+; RV32ZBB-NEXT:    clz a0, a1
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i64:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a2, a1
+; RV32ZBBXQCIBM-NEXT:    bnez a2, .LBB11_2
+; RV32ZBBXQCIBM-NEXT:  # %bb.1:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    addi a0, a0, 32
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+; RV32ZBBXQCIBM-NEXT:  .LBB11_2:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a1
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i64 %a, -1
+  %tmp = call i64 @llvm.ctlz.i64(i64 %1, i1 false)
+  ret i64 %tmp
+}
+
+define i8 @test_ctlz_i8_zero_undef(i8 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i8_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 25
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 26
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 24
+; RV32I-NEXT:    srli a1, a1, 28
+; RV32I-NEXT:    or a0, a0, a1
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a1, a0, 1
+; RV32I-NEXT:    andi a1, a1, 85
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 51
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    andi a0, a0, 51
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a0, a0, 15
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i8_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    slli a0, a0, 24
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i8_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a0, a0
+; RV32ZBBXQCIBM-NEXT:    slli a0, a0, 24
+; RV32ZBBXQCIBM-NEXT:    clz a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i8 %a, -1
+  %tmp = call i8 @llvm.ctlz.i8(i8 %1, i1 true)
+  ret i8 %tmp
+}
+
+define i16 @test_ctlz_i16_zero_undef(i16 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i16_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    lui a1, 5
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    addi a1, a1, 1365
+; RV32I-NEXT:    srli a2, a2, 17
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 18
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 20
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    slli a2, a0, 16
+; RV32I-NEXT:    srli a2, a2, 24
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 3
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    andi a1, a0, 15
+; RV32I-NEXT:    slli a0, a0, 20
+; RV32I-NEXT:    srli a0, a0, 28
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i16_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    slli a0, a0, 16
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i16_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a0, a0
+; RV32ZBBXQCIBM-NEXT:    slli a0, a0, 16
+; RV32ZBBXQCIBM-NEXT:    clz a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i16 %a, -1
+  %tmp = call i16 @llvm.ctlz.i16(i16 %1, i1 true)
+  ret i16 %tmp
+}
+
+define i32 @test_ctlz_i32_zero_undef(i32 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i32_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    lui a1, 349525
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    addi a1, a1, 1365
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 2
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 8
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    srli a2, a0, 16
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a2, a0, 1
+; RV32I-NEXT:    and a1, a2, a1
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    sub a0, a0, a1
+; RV32I-NEXT:    and a1, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    lui a2, 61681
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    srli a1, a0, 4
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    addi a1, a2, -241
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i32_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i32_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i32 %a, -1
+  %tmp = call i32 @llvm.ctlz.i32(i32 %1, i1 true)
+  ret i32 %tmp
+}
+
+define i64 @test_ctlz_i64_zero_undef(i64 %a) nounwind {
+; RV32I-LABEL: test_ctlz_i64_zero_undef:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a4, a1
+; RV32I-NEXT:    lui a1, 349525
+; RV32I-NEXT:    lui a2, 209715
+; RV32I-NEXT:    lui a5, 61681
+; RV32I-NEXT:    addi a3, a1, 1365
+; RV32I-NEXT:    addi a2, a2, 819
+; RV32I-NEXT:    addi a1, a5, -241
+; RV32I-NEXT:    bnez a4, .LBB15_2
+; RV32I-NEXT:  # %bb.1:
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a4, a0, 1
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 2
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 4
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a4, a0, 1
+; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    sub a0, a0, a3
+; RV32I-NEXT:    and a3, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    addi a0, a0, 32
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+; RV32I-NEXT:  .LBB15_2:
+; RV32I-NEXT:    srli a0, a4, 1
+; RV32I-NEXT:    or a0, a4, a0
+; RV32I-NEXT:    srli a4, a0, 2
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 4
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 8
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    srli a4, a0, 16
+; RV32I-NEXT:    or a0, a0, a4
+; RV32I-NEXT:    not a0, a0
+; RV32I-NEXT:    srli a4, a0, 1
+; RV32I-NEXT:    and a3, a4, a3
+; RV32I-NEXT:    sub a0, a0, a3
+; RV32I-NEXT:    and a3, a0, a2
+; RV32I-NEXT:    srli a0, a0, 2
+; RV32I-NEXT:    and a0, a0, a2
+; RV32I-NEXT:    add a0, a3, a0
+; RV32I-NEXT:    srli a2, a0, 4
+; RV32I-NEXT:    add a0, a0, a2
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 8
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    slli a1, a0, 16
+; RV32I-NEXT:    add a0, a0, a1
+; RV32I-NEXT:    srli a0, a0, 24
+; RV32I-NEXT:    li a1, 0
+; RV32I-NEXT:    ret
+;
+; RV32ZBB-LABEL: test_ctlz_i64_zero_undef:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a1, a1
+; RV32ZBB-NEXT:    bnez a1, .LBB15_2
+; RV32ZBB-NEXT:  # %bb.1:
+; RV32ZBB-NEXT:    not a0, a0
+; RV32ZBB-NEXT:    clz a0, a0
+; RV32ZBB-NEXT:    addi a0, a0, 32
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+; RV32ZBB-NEXT:  .LBB15_2:
+; RV32ZBB-NEXT:    clz a0, a1
+; RV32ZBB-NEXT:    li a1, 0
+; RV32ZBB-NEXT:    ret
+;
+; RV32ZBBXQCIBM-LABEL: test_ctlz_i64_zero_undef:
+; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a2, a1
+; RV32ZBBXQCIBM-NEXT:    bnez a2, .LBB15_2
+; RV32ZBBXQCIBM-NEXT:  # %bb.1:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a0
+; RV32ZBBXQCIBM-NEXT:    addi a0, a0, 32
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+; RV32ZBBXQCIBM-NEXT:  .LBB15_2:
+; RV32ZBBXQCIBM-NEXT:    qc.clo a0, a1
+; RV32ZBBXQCIBM-NEXT:    li a1, 0
+; RV32ZBBXQCIBM-NEXT:    ret
+  %1 = xor i64 %a, -1
+  %tmp = call i64 @llvm.ctlz.i64(i64 %1, i1 true)
+  ret i64 %tmp
+}

From e14f327d8094e02134efa98625acaf6fd43fee08 Mon Sep 17 00:00:00 2001
From: Liao Chunyu <chunyu@iscas.ac.cn>
Date: Tue, 17 Jun 2025 23:32:01 -0400
Subject: [PATCH 846/851] [RISCV] Pre-test for #144461

---
 llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll | 26 +++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 371ec7c790dda..522c83fd9fa99 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -470,6 +470,28 @@ define <vscale x 2 x i64> @select_nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i6
   ret <vscale x 2 x i64> %v
 }
 
+define <vscale x 2 x i64> @select_nxv2i64_constant_true(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_true:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v8, v8, -1, v0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> splat (i64 -1), <vscale x 2 x i64> %b, i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @select_nxv2i64_constant_false(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_false:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 100
+; CHECK-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT:    vmv.v.x v10, a1
+; CHECK-NEXT:    vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> splat (i64 100), i32 %evl)
+  ret <vscale x 2 x i64> %v
+}
+
 declare <vscale x 4 x i64> @llvm.vp.select.nxv4i64(<vscale x 4 x i1>, <vscale x 4 x i64>, <vscale x 4 x i64>, i32)
 
 define <vscale x 4 x i64> @select_nxv4i64(<vscale x 4 x i1> %a, <vscale x 4 x i64> %b, <vscale x 4 x i64> %c, i32 zeroext %evl) {
@@ -702,10 +724,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-NEXT:    and a4, a5, a4
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT:    bltu a2, a1, .LBB48_2
+; CHECK-NEXT:    bltu a2, a1, .LBB50_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
-; CHECK-NEXT:  .LBB48_2:
+; CHECK-NEXT:  .LBB50_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # vscale x 64-byte Folded Reload

From af49a650e172d56d684581b66afa9ab0368ec8f9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 13:23:17 +0900
Subject: [PATCH 847/851] PowerPC: Add baseline tests for more f128 libcall
 handling (#144381)

Some of these incorrectly call the l suffixed version of libm
functions and others assert.
---
 llvm/test/CodeGen/PowerPC/f128-arith.ll | 445 ++++++++++++++++++++++++
 1 file changed, 445 insertions(+)

diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index decc4a38f7ccd..ffa7ac6cb0078 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1403,3 +1403,448 @@ entry:
   ret fp128 %3
 }
 declare { fp128, i32 } @llvm.frexp.f128.i32(fp128)
+
+
+define dso_local fp128 @acos_f128(fp128 %x) {
+; CHECK-LABEL: acos_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl acosl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: acos_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl acosl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.acos.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @asin_f128(fp128 %x) {
+; CHECK-LABEL: asin_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl asinl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: asin_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl asinl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.asin.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @atan_f128(fp128 %x) {
+; CHECK-LABEL: atan_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl atanl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: atan_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl atanl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.atan.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @atan2_f128(fp128 %x, fp128 %y) {
+; CHECK-LABEL: atan2_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl atan2l
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: atan2_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl atan2l
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.atan2.f128(fp128 %x, fp128 %y)
+  ret fp128 %result
+}
+
+define dso_local fp128 @copysign_f128(fp128 %x, fp128 %y) {
+; CHECK-LABEL: copysign_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xscpsgnqp v2, v3, v2
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: copysign_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    xxswapd vs0, v3
+; CHECK-P8-NEXT:    addi r3, r1, -16
+; CHECK-P8-NEXT:    xxswapd vs1, v2
+; CHECK-P8-NEXT:    stxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    addi r3, r1, -32
+; CHECK-P8-NEXT:    stxvd2x vs1, 0, r3
+; CHECK-P8-NEXT:    lbz r4, -1(r1)
+; CHECK-P8-NEXT:    lbz r5, -17(r1)
+; CHECK-P8-NEXT:    rlwimi r5, r4, 0, 0, 24
+; CHECK-P8-NEXT:    stb r5, -17(r1)
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    xxswapd v2, vs0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.copysign.f128(fp128 %x, fp128 %y)
+  ret fp128 %result
+}
+
+define dso_local fp128 @cosh_f128(fp128 %x) {
+; CHECK-LABEL: cosh_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl coshl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: cosh_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl coshl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.cosh.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @exp10_f128(fp128 %x) {
+; CHECK-LABEL: exp10_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl exp10l
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: exp10_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl exp10l
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.exp10.f128(fp128 %x)
+  ret fp128 %result
+}
+
+; FIXME: Asserts
+; define dso_local fp128 @maximum_f128(fp128 %x, fp128 %y) {
+;   %result = call fp128 @llvm.maximum.f128(fp128 %x, fp128 %y)
+;   ret fp128 %result
+; }
+
+; FIXME: Asserts
+; define dso_local fp128 @minimum_f128(fp128 %x, fp128 %y) {
+;   %result = call fp128 @llvm.minimum.f128(fp128 %x, fp128 %y)
+;   ret fp128 %result
+; }
+
+; FIXME: Asserts
+; define dso_local fp128 @maximumnum_f128(fp128 %x, fp128 %y) {
+;   %result = call fp128 @llvm.maximumnum.f128(fp128 %x, fp128 %y)
+;   ret fp128 %result
+; }
+
+; FIXME: Asserts
+; define dso_local fp128 @minimumnum_f128(fp128 %x, fp128 %y) {
+;   %result = call fp128 @llvm.minimumnum.f128(fp128 %x, fp128 %y)
+;   ret fp128 %result
+; }
+
+define dso_local fp128 @ldexp_f128(fp128 %x, i32 %y) {
+; CHECK-LABEL: ldexp_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    extsw r5, r5
+; CHECK-NEXT:    bl ldexpl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: ldexp_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    extsw r5, r5
+; CHECK-P8-NEXT:    bl ldexpl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.ldexp.f128.i32(fp128 %x, i32 %y)
+  ret fp128 %result
+}
+
+define dso_local { fp128, fp128 } @modf_f128(fp128 %x) {
+; CHECK-LABEL: modf_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -48(r1)
+; CHECK-NEXT:    std r0, 64(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    addi r5, r1, 32
+; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    lxv v3, 32(r1)
+; CHECK-NEXT:    addi r1, r1, 48
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: modf_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    .cfi_offset r30, -16
+; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
+; CHECK-P8-NEXT:    stdu r1, -64(r1)
+; CHECK-P8-NEXT:    addi r30, r1, 32
+; CHECK-P8-NEXT:    std r0, 80(r1)
+; CHECK-P8-NEXT:    mr r5, r30
+; CHECK-P8-NEXT:    bl modfl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    lxvd2x vs0, 0, r30
+; CHECK-P8-NEXT:    xxswapd v3, vs0
+; CHECK-P8-NEXT:    addi r1, r1, 64
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call { fp128, fp128 } @llvm.modf.f128(fp128 %x)
+  ret { fp128, fp128 } %result
+}
+
+define dso_local fp128 @roundeven_f128(fp128 %x) {
+; CHECK-LABEL: roundeven_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl roundevenl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: roundeven_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl roundevenl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.roundeven.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @sinh_f128(fp128 %x) {
+; CHECK-LABEL: sinh_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl sinhl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: sinh_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl sinhl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.sinh.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @tanh_f128(fp128 %x) {
+; CHECK-LABEL: tanh_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl tanhl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: tanh_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl tanhl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.tanh.f128(fp128 %x)
+  ret fp128 %result
+}
+
+define dso_local fp128 @tan_f128(fp128 %x) {
+; CHECK-LABEL: tan_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr r0
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    bl tanl
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi r1, r1, 32
+; CHECK-NEXT:    ld r0, 16(r1)
+; CHECK-NEXT:    mtlr r0
+; CHECK-NEXT:    blr
+;
+; CHECK-P8-LABEL: tan_f128:
+; CHECK-P8:       # %bb.0:
+; CHECK-P8-NEXT:    mflr r0
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    bl tanl
+; CHECK-P8-NEXT:    nop
+; CHECK-P8-NEXT:    addi r1, r1, 32
+; CHECK-P8-NEXT:    ld r0, 16(r1)
+; CHECK-P8-NEXT:    mtlr r0
+; CHECK-P8-NEXT:    blr
+  %result = call fp128 @llvm.tan.f128(fp128 %x)
+  ret fp128 %result
+}

From 7b9d10d2e6410029fd0750b2e0566432dbf03dc7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Wed, 18 Jun 2025 13:26:15 +0900
Subject: [PATCH 848/851] PowerPC: Fix using long double libm functions for
 f128 intrinsics (#144382)

This wasn't setting the correct libcall names, which default to the
l suffixed libm names.
---
 llvm/lib/IR/RuntimeLibcalls.cpp         | 143 +++++++++++-------------
 llvm/test/CodeGen/PowerPC/f128-arith.ll |  48 ++++----
 2 files changed, 91 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp
index 7396626a03d41..a57b089193462 100644
--- a/llvm/lib/IR/RuntimeLibcalls.cpp
+++ b/llvm/lib/IR/RuntimeLibcalls.cpp
@@ -281,6 +281,69 @@ void RuntimeLibcallsInfo::initSoftFloatCmpLibcallPredicates() {
   SoftFloatCompareLibcallPredicates[RTLIB::UO_PPCF128] = CmpInst::ICMP_NE;
 }
 
+static void setLongDoubleIsF128Libm(RuntimeLibcallsInfo &Info,
+                                    bool FiniteOnlyFuncs = false) {
+  Info.setLibcallName(RTLIB::REM_F128, "fmodf128");
+  Info.setLibcallName(RTLIB::FMA_F128, "fmaf128");
+  Info.setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
+  Info.setLibcallName(RTLIB::CBRT_F128, "cbrtf128");
+  Info.setLibcallName(RTLIB::LOG_F128, "logf128");
+  Info.setLibcallName(RTLIB::LOG2_F128, "log2f128");
+  Info.setLibcallName(RTLIB::LOG10_F128, "log10f128");
+  Info.setLibcallName(RTLIB::EXP_F128, "expf128");
+  Info.setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+  Info.setLibcallName(RTLIB::EXP10_F128, "exp10f128");
+  Info.setLibcallName(RTLIB::SIN_F128, "sinf128");
+  Info.setLibcallName(RTLIB::COS_F128, "cosf128");
+  Info.setLibcallName(RTLIB::TAN_F128, "tanf128");
+  Info.setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
+  Info.setLibcallName(RTLIB::ASIN_F128, "asinf128");
+  Info.setLibcallName(RTLIB::ACOS_F128, "acosf128");
+  Info.setLibcallName(RTLIB::ATAN_F128, "atanf128");
+  Info.setLibcallName(RTLIB::ATAN2_F128, "atan2f128");
+  Info.setLibcallName(RTLIB::SINH_F128, "sinhf128");
+  Info.setLibcallName(RTLIB::COSH_F128, "coshf128");
+  Info.setLibcallName(RTLIB::TANH_F128, "tanhf128");
+  Info.setLibcallName(RTLIB::POW_F128, "powf128");
+  Info.setLibcallName(RTLIB::CEIL_F128, "ceilf128");
+  Info.setLibcallName(RTLIB::TRUNC_F128, "truncf128");
+  Info.setLibcallName(RTLIB::RINT_F128, "rintf128");
+  Info.setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
+  Info.setLibcallName(RTLIB::ROUND_F128, "roundf128");
+  Info.setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128");
+  Info.setLibcallName(RTLIB::FLOOR_F128, "floorf128");
+  Info.setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128");
+  Info.setLibcallName(RTLIB::FMIN_F128, "fminf128");
+  Info.setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+  Info.setLibcallName(RTLIB::FMINIMUM_F128, "fminimumf128");
+  Info.setLibcallName(RTLIB::FMAXIMUM_F128, "fmaximumf128");
+  Info.setLibcallName(RTLIB::FMINIMUM_NUM_F128, "fminimum_numf128");
+  Info.setLibcallName(RTLIB::FMAXIMUM_NUM_F128, "fmaximum_numf128");
+  Info.setLibcallName(RTLIB::LROUND_F128, "lroundf128");
+  Info.setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
+  Info.setLibcallName(RTLIB::LRINT_F128, "lrintf128");
+  Info.setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
+  Info.setLibcallName(RTLIB::LDEXP_F128, "ldexpf128");
+  Info.setLibcallName(RTLIB::FREXP_F128, "frexpf128");
+  Info.setLibcallName(RTLIB::MODF_F128, "modff128");
+
+  if (FiniteOnlyFuncs) {
+    Info.setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite");
+    Info.setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite");
+    Info.setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite");
+    Info.setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite");
+    Info.setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite");
+    Info.setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
+  } else {
+    Info.setLibcallName(RTLIB::LOG_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::LOG2_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::LOG10_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::EXP_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::EXP2_FINITE_F128, nullptr);
+    Info.setLibcallName(RTLIB::POW_FINITE_F128, nullptr);
+  }
+}
+
 /// Set default libcall names. If a target wants to opt-out of a libcall it
 /// should be placed here.
 void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
@@ -295,57 +358,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
 #undef LIBCALL_NO_NAME
 
   // Use the f128 variants of math functions on x86
-  if (TT.isX86() && TT.isGNUEnvironment()) {
-    setLibcallName(RTLIB::REM_F128, "fmodf128");
-    setLibcallName(RTLIB::FMA_F128, "fmaf128");
-    setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
-    setLibcallName(RTLIB::CBRT_F128, "cbrtf128");
-    setLibcallName(RTLIB::LOG_F128, "logf128");
-    setLibcallName(RTLIB::LOG_FINITE_F128, "__logf128_finite");
-    setLibcallName(RTLIB::LOG2_F128, "log2f128");
-    setLibcallName(RTLIB::LOG2_FINITE_F128, "__log2f128_finite");
-    setLibcallName(RTLIB::LOG10_F128, "log10f128");
-    setLibcallName(RTLIB::LOG10_FINITE_F128, "__log10f128_finite");
-    setLibcallName(RTLIB::EXP_F128, "expf128");
-    setLibcallName(RTLIB::EXP_FINITE_F128, "__expf128_finite");
-    setLibcallName(RTLIB::EXP2_F128, "exp2f128");
-    setLibcallName(RTLIB::EXP2_FINITE_F128, "__exp2f128_finite");
-    setLibcallName(RTLIB::EXP10_F128, "exp10f128");
-    setLibcallName(RTLIB::SIN_F128, "sinf128");
-    setLibcallName(RTLIB::COS_F128, "cosf128");
-    setLibcallName(RTLIB::TAN_F128, "tanf128");
-    setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
-    setLibcallName(RTLIB::ASIN_F128, "asinf128");
-    setLibcallName(RTLIB::ACOS_F128, "acosf128");
-    setLibcallName(RTLIB::ATAN_F128, "atanf128");
-    setLibcallName(RTLIB::ATAN2_F128, "atan2f128");
-    setLibcallName(RTLIB::SINH_F128, "sinhf128");
-    setLibcallName(RTLIB::COSH_F128, "coshf128");
-    setLibcallName(RTLIB::TANH_F128, "tanhf128");
-    setLibcallName(RTLIB::POW_F128, "powf128");
-    setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite");
-    setLibcallName(RTLIB::CEIL_F128, "ceilf128");
-    setLibcallName(RTLIB::TRUNC_F128, "truncf128");
-    setLibcallName(RTLIB::RINT_F128, "rintf128");
-    setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
-    setLibcallName(RTLIB::ROUND_F128, "roundf128");
-    setLibcallName(RTLIB::ROUNDEVEN_F128, "roundevenf128");
-    setLibcallName(RTLIB::FLOOR_F128, "floorf128");
-    setLibcallName(RTLIB::COPYSIGN_F128, "copysignf128");
-    setLibcallName(RTLIB::FMIN_F128, "fminf128");
-    setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
-    setLibcallName(RTLIB::FMINIMUM_F128, "fminimumf128");
-    setLibcallName(RTLIB::FMAXIMUM_F128, "fmaximumf128");
-    setLibcallName(RTLIB::FMINIMUM_NUM_F128, "fminimum_numf128");
-    setLibcallName(RTLIB::FMAXIMUM_NUM_F128, "fmaximum_numf128");
-    setLibcallName(RTLIB::LROUND_F128, "lroundf128");
-    setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
-    setLibcallName(RTLIB::LRINT_F128, "lrintf128");
-    setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
-    setLibcallName(RTLIB::LDEXP_F128, "ldexpf128");
-    setLibcallName(RTLIB::FREXP_F128, "frexpf128");
-    setLibcallName(RTLIB::MODF_F128, "modff128");
-  }
+  if (TT.isX86() && TT.isGNUEnvironment())
+    setLongDoubleIsF128Libm(*this, /*FiniteOnlyFuncs=*/true);
 
   // For IEEE quad-precision libcall names, PPC uses "kf" instead of "tf".
   if (TT.isPPC()) {
@@ -379,31 +393,8 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) {
     setLibcallName(RTLIB::OGT_F128, "__gtkf2");
     setLibcallName(RTLIB::UO_F128, "__unordkf2");
 
-    setLibcallName(RTLIB::LOG_F128, "logf128");
-    setLibcallName(RTLIB::LOG2_F128, "log2f128");
-    setLibcallName(RTLIB::LOG10_F128, "log10f128");
-    setLibcallName(RTLIB::EXP_F128, "expf128");
-    setLibcallName(RTLIB::EXP2_F128, "exp2f128");
-    setLibcallName(RTLIB::SIN_F128, "sinf128");
-    setLibcallName(RTLIB::COS_F128, "cosf128");
-    setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
-    setLibcallName(RTLIB::POW_F128, "powf128");
-    setLibcallName(RTLIB::FMIN_F128, "fminf128");
-    setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
-    setLibcallName(RTLIB::REM_F128, "fmodf128");
-    setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
-    setLibcallName(RTLIB::CEIL_F128, "ceilf128");
-    setLibcallName(RTLIB::FLOOR_F128, "floorf128");
-    setLibcallName(RTLIB::TRUNC_F128, "truncf128");
-    setLibcallName(RTLIB::ROUND_F128, "roundf128");
-    setLibcallName(RTLIB::LROUND_F128, "lroundf128");
-    setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
-    setLibcallName(RTLIB::RINT_F128, "rintf128");
-    setLibcallName(RTLIB::LRINT_F128, "lrintf128");
-    setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
-    setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
-    setLibcallName(RTLIB::FMA_F128, "fmaf128");
-    setLibcallName(RTLIB::FREXP_F128, "frexpf128");
+    // TODO: Do the finite only functions exist?
+    setLongDoubleIsF128Libm(*this, /*FiniteOnlyFuncs=*/false);
 
     if (TT.isOSAIX()) {
       bool isPPC64 = TT.isPPC64();
diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index ffa7ac6cb0078..f9c953d483ff2 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1413,7 +1413,7 @@ define dso_local fp128 @acos_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl acosl
+; CHECK-NEXT:    bl acosf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1427,7 +1427,7 @@ define dso_local fp128 @acos_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl acosl
+; CHECK-P8-NEXT:    bl acosf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1445,7 +1445,7 @@ define dso_local fp128 @asin_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl asinl
+; CHECK-NEXT:    bl asinf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1459,7 +1459,7 @@ define dso_local fp128 @asin_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl asinl
+; CHECK-P8-NEXT:    bl asinf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1477,7 +1477,7 @@ define dso_local fp128 @atan_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl atanl
+; CHECK-NEXT:    bl atanf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1491,7 +1491,7 @@ define dso_local fp128 @atan_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl atanl
+; CHECK-P8-NEXT:    bl atanf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1509,7 +1509,7 @@ define dso_local fp128 @atan2_f128(fp128 %x, fp128 %y) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl atan2l
+; CHECK-NEXT:    bl atan2f128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1523,7 +1523,7 @@ define dso_local fp128 @atan2_f128(fp128 %x, fp128 %y) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl atan2l
+; CHECK-P8-NEXT:    bl atan2f128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1566,7 +1566,7 @@ define dso_local fp128 @cosh_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl coshl
+; CHECK-NEXT:    bl coshf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1580,7 +1580,7 @@ define dso_local fp128 @cosh_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl coshl
+; CHECK-P8-NEXT:    bl coshf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1598,7 +1598,7 @@ define dso_local fp128 @exp10_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl exp10l
+; CHECK-NEXT:    bl exp10f128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1612,7 +1612,7 @@ define dso_local fp128 @exp10_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl exp10l
+; CHECK-P8-NEXT:    bl exp10f128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1655,7 +1655,7 @@ define dso_local fp128 @ldexp_f128(fp128 %x, i32 %y) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    extsw r5, r5
-; CHECK-NEXT:    bl ldexpl
+; CHECK-NEXT:    bl ldexpf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1670,7 +1670,7 @@ define dso_local fp128 @ldexp_f128(fp128 %x, i32 %y) {
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
 ; CHECK-P8-NEXT:    extsw r5, r5
-; CHECK-P8-NEXT:    bl ldexpl
+; CHECK-P8-NEXT:    bl ldexpf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1689,7 +1689,7 @@ define dso_local { fp128, fp128 } @modf_f128(fp128 %x) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    .cfi_offset lr, 16
 ; CHECK-NEXT:    addi r5, r1, 32
-; CHECK-NEXT:    bl modfl
+; CHECK-NEXT:    bl modff128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    lxv v3, 32(r1)
 ; CHECK-NEXT:    addi r1, r1, 48
@@ -1708,7 +1708,7 @@ define dso_local { fp128, fp128 } @modf_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    addi r30, r1, 32
 ; CHECK-P8-NEXT:    std r0, 80(r1)
 ; CHECK-P8-NEXT:    mr r5, r30
-; CHECK-P8-NEXT:    bl modfl
+; CHECK-P8-NEXT:    bl modff128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r30
 ; CHECK-P8-NEXT:    xxswapd v3, vs0
@@ -1729,7 +1729,7 @@ define dso_local fp128 @roundeven_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl roundevenl
+; CHECK-NEXT:    bl roundevenf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1743,7 +1743,7 @@ define dso_local fp128 @roundeven_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl roundevenl
+; CHECK-P8-NEXT:    bl roundevenf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1761,7 +1761,7 @@ define dso_local fp128 @sinh_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl sinhl
+; CHECK-NEXT:    bl sinhf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1775,7 +1775,7 @@ define dso_local fp128 @sinh_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl sinhl
+; CHECK-P8-NEXT:    bl sinhf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1793,7 +1793,7 @@ define dso_local fp128 @tanh_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl tanhl
+; CHECK-NEXT:    bl tanhf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1807,7 +1807,7 @@ define dso_local fp128 @tanh_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl tanhl
+; CHECK-P8-NEXT:    bl tanhf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -1825,7 +1825,7 @@ define dso_local fp128 @tan_f128(fp128 %x) {
 ; CHECK-NEXT:    std r0, 48(r1)
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    bl tanl
+; CHECK-NEXT:    bl tanf128
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
@@ -1839,7 +1839,7 @@ define dso_local fp128 @tan_f128(fp128 %x) {
 ; CHECK-P8-NEXT:    std r0, 48(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    bl tanl
+; CHECK-P8-NEXT:    bl tanf128
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)

From 894d8d1ac82b2502af26062a9ff10329110c1314 Mon Sep 17 00:00:00 2001
From: Arjun Patel <arjunpatel151002@gmail.com>
Date: Wed, 18 Jun 2025 03:33:47 -0400
Subject: [PATCH 849/851] Change how labels are searched for by llvm-objdump

Previously, the loop would only add labels in the set of sections that were closest to Target. Set of section here because multiple sections can have the same address, so all of their symbols would be added to the set of candidate symbols. The following changes make it such that we loop down from sections closest to the Target and populate the set of candidate symbols with symbols from the first set of sections that do contain symbols
---
 llvm/tools/llvm-objdump/llvm-objdump.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index 5ecb33375943f..dabb200d526ba 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -2392,14 +2392,19 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
                     [=](const std::pair<uint64_t, SectionRef> &O) {
                       return O.first <= Target;
                     });
-                uint64_t TargetSecAddr = 0;
+                uint64_t TargetSecAddr = It == SectionAddresses.end() ? 0 : It->first;
+                bool FoundSymbols = false;
                 while (It != SectionAddresses.begin()) {
                   --It;
-                  if (TargetSecAddr == 0)
+                  if (It->first != TargetSecAddr) {
+                    if (FoundSymbols)
+                      break;
                     TargetSecAddr = It->first;
-                  if (It->first != TargetSecAddr)
-                    break;
-                  TargetSectionSymbols.push_back(&AllSymbols[It->second]);
+                  }
+                  auto *SectionSymbols = &AllSymbols[It->second];
+                  TargetSectionSymbols.push_back(SectionSymbols);
+                  if (!SectionSymbols->empty())
+                    FoundSymbols = true;
                 }
               } else {
                 TargetSectionSymbols.push_back(&Symbols);

From ec782afc54b23d96b54e1dfd90d622671ca129a0 Mon Sep 17 00:00:00 2001
From: Arjun Patel <arjunpatel151002@gmail.com>
Date: Wed, 18 Jun 2025 16:55:33 -0400
Subject: [PATCH 850/851] Update tests affected by cchange in search pattern

---
 lld/test/ELF/aarch64-feature-pauth.s          |  4 +--
 lld/test/ELF/aarch64-gnu-ifunc-plt.s          |  4 +--
 lld/test/ELF/i386-feature-cet.s               |  2 +-
 lld/test/ELF/loongarch-relax-call36.s         |  4 +--
 lld/test/ELF/x86-64-feature-cet.s             |  4 +--
 .../X86/disassemble-same-section-addr.test    | 27 ++++++++-----------
 6 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/lld/test/ELF/aarch64-feature-pauth.s b/lld/test/ELF/aarch64-feature-pauth.s
index e8c900b9cb134..765a030736796 100644
--- a/lld/test/ELF/aarch64-feature-pauth.s
+++ b/lld/test/ELF/aarch64-feature-pauth.s
@@ -70,7 +70,7 @@
 # PACPLT: Disassembly of section .plt:
 # PACPLT:      <.plt>:
 # PACPLT-NEXT:     stp     x16, x30, [sp, #-0x10]!
-# PACPLT-NEXT:     adrp    x16, 0x30000 <func3+0x30000>
+# PACPLT-NEXT:     adrp    x16, 0x30000 <_DYNAMIC+0x{{[0-9a-fA-F]+}}>
 # PACPLT-NEXT:     ldr     x17, [x16, #0x[[B]]]
 # PACPLT-NEXT:     add     x16, x16, #0x[[B]]
 # PACPLT-NEXT:     br      x17
@@ -78,7 +78,7 @@
 # PACPLT-NEXT:     nop
 # PACPLT-NEXT:     nop
 # PACPLT:      <func3@plt>:
-# PACPLT-NEXT:     adrp    x16, 0x30000 <func3+0x30000>
+# PACPLT-NEXT:     adrp    x16, 0x30000 <_DYNAMIC+0x{{[0-9a-fA-F]+}}>
 # PACPLT-NEXT:     ldr     x17, [x16, #0x[[C]]]
 # PACPLT-NEXT:     add     x16, x16, #0x[[C]]
 # NOHINT-NEXT:     braa    x17, x16
diff --git a/lld/test/ELF/aarch64-gnu-ifunc-plt.s b/lld/test/ELF/aarch64-gnu-ifunc-plt.s
index 73ecf58ee76bf..09fd792c1b94b 100644
--- a/lld/test/ELF/aarch64-gnu-ifunc-plt.s
+++ b/lld/test/ELF/aarch64-gnu-ifunc-plt.s
@@ -55,8 +55,8 @@
 // DISASM:      <bar>:
 // DISASM-NEXT:    2102dc: ret
 // DISASM:      <_start>:
-// DISASM-NEXT:    2102e0: bl      0x210330 <zed2+0x210330>
-// DISASM-NEXT:    2102e4: bl      0x210340 <zed2+0x210340>
+// DISASM-NEXT:    2102e0: bl      0x210330 <zed2@plt+0x10>
+// DISASM-NEXT:    2102e4: bl      0x210340 <zed2@plt+0x20>
 // DISASM-NEXT:    2102e8: bl      0x210310 <bar2@plt>
 // DISASM-NEXT:    2102ec: bl      0x210320 <zed2@plt>
 // DISASM-EMPTY:
diff --git a/lld/test/ELF/i386-feature-cet.s b/lld/test/ELF/i386-feature-cet.s
index a7de05a1870dc..606c88d60894b 100644
--- a/lld/test/ELF/i386-feature-cet.s
+++ b/lld/test/ELF/i386-feature-cet.s
@@ -58,7 +58,7 @@
 
 # DISASM:      Disassembly of section .text:
 # DISASM:      00401200 <func1>:
-# DISASM-NEXT: 401200:       calll   0x401230 <func2+0x401230>
+# DISASM-NEXT: 401200:       calll   0x401230 <func1+0x30>
 # DISASM-NEXT: 401205:       calll   0x401240 <ifunc>
 # DISASM-NEXT:               retl
 
diff --git a/lld/test/ELF/loongarch-relax-call36.s b/lld/test/ELF/loongarch-relax-call36.s
index fa6e79dfa5803..e2c81460e162f 100644
--- a/lld/test/ELF/loongarch-relax-call36.s
+++ b/lld/test/ELF/loongarch-relax-call36.s
@@ -25,8 +25,8 @@
 # RELAX-NEXT:              nop
 # RELAX-NEXT:              nop
 ## offset = .plt(0x10400)+32 - 0x10010 = 1040
-# RELAX-NEXT:      10010:  bl     1040 <bar+0x10420>
-# RELAX-NEXT:              b      1036 <bar+0x10420>
+# RELAX-NEXT:      10010:  bl     1040 <_start_end+0x404>
+# RELAX-NEXT:              b      1036 <_start_end+0x404>
 # RELAX-EMPTY:
 # RELAX-NEXT: <a>:
 # RELAX-NEXT:      10018:  ret
diff --git a/lld/test/ELF/x86-64-feature-cet.s b/lld/test/ELF/x86-64-feature-cet.s
index 6a88463ff8bfd..3f7b3180d6a44 100644
--- a/lld/test/ELF/x86-64-feature-cet.s
+++ b/lld/test/ELF/x86-64-feature-cet.s
@@ -65,8 +65,8 @@
 
 # DISASM:      Disassembly of section .text:
 # DISASM:      0000000000201330 <func1>:
-# DISASM-NEXT: 201330:       callq   0x201360 <func2+0x201360>
-# DISASM-NEXT: 201335:       callq   0x201370 <func2+0x201370>
+# DISASM-NEXT: 201330:       callq   0x201360 <ifunc+0x25>
+# DISASM-NEXT: 201335:       callq   0x201370 <ifunc+0x35>
 # DISASM-NEXT:               retq
 
 # DISASM:      Disassembly of section .plt:
diff --git a/llvm/test/tools/llvm-objdump/X86/disassemble-same-section-addr.test b/llvm/test/tools/llvm-objdump/X86/disassemble-same-section-addr.test
index 4b0da011c9d46..89f699c9fe4b1 100644
--- a/llvm/test/tools/llvm-objdump/X86/disassemble-same-section-addr.test
+++ b/llvm/test/tools/llvm-objdump/X86/disassemble-same-section-addr.test
@@ -5,33 +5,28 @@
 ## to reproduce the original failure.
 
 ## Two empty sections, one with symbol in, one without.
-# RUN: yaml2obj %s --docnum=1 -o %t1 -D SIZE1=0 -D SIZE2=0 -D SECTION=.second -D INDEX=SHN_ABS
+# RUN: yaml2obj %s --docnum=1 -o %t1 -D SIZE1=0 -D SIZE2=0 -D SECTION=.second -D INDEX=SHN_ABS -D VALUE=0x5
 # RUN: llvm-objdump -d %t1 | FileCheck %s --check-prefix=TARGET
-# RUN: yaml2obj %s --docnum=1 -o %t2 -D SIZE1=0 -D SIZE2=0 -D SECTION=.first -D INDEX=SHN_ABS
+# RUN: yaml2obj %s --docnum=1 -o %t2 -D SIZE1=0 -D SIZE2=0 -D SECTION=.first -D INDEX=SHN_ABS -D VALUE=0x5
 # RUN: llvm-objdump -d %t2 | FileCheck %s --check-prefix=TARGET
 
 ## Two sections, one empty with symbol, other non-empty, without symbol.
-# RUN: yaml2obj %s --docnum=1 -o %t3 -D SIZE1=1 -D SIZE2=0 -D SECTION=.second -D INDEX=SHN_ABS
+# RUN: yaml2obj %s --docnum=1 -o %t3 -D SIZE1=1 -D SIZE2=0 -D SECTION=.second -D INDEX=SHN_ABS -D VALUE=0x5
 # RUN: llvm-objdump -d %t3 | FileCheck %s --check-prefix=TARGET
-# RUN: yaml2obj %s --docnum=1 -o %t4 -D SIZE1=0 -D SIZE2=1 -D SECTION=.first -D INDEX=SHN_ABS
+# RUN: yaml2obj %s --docnum=1 -o %t4 -D SIZE1=0 -D SIZE2=1 -D SECTION=.first -D INDEX=SHN_ABS -D VALUE=0x5
 # RUN: llvm-objdump -d %t4 | FileCheck %s --check-prefix=TARGET
 
-## Fall back to absolute symbol if no symbol found in candidate sections.
-# RUN: yaml2obj %s --docnum=1 -o %t5 -D SIZE1=1 -D SIZE2=0 -D SECTION=.caller -D INDEX=SHN_ABS
+## Fall back to absolute symbols if no symbol found in candidate sections.
+# RUN: llvm-objcopy -N foo --add-symbol absol=0 %p/../ELF/Inputs/call-absolute-symbol.elf-x86_64 %t5
 # RUN: llvm-objdump -d %t5 | FileCheck %s --check-prefix=ABSOLUTE
 
-## Show that other symbols with reserved st_shndx values are treated as absolute
-## symbols.
-# RUN: yaml2obj %s --docnum=1 -o %t6 -D SIZE1=1 -D SIZE2=0 -D SECTION=.caller -D INDEX=SHN_LOPROC
-# RUN: llvm-objdump -d %t6 | FileCheck %s --check-prefix=ABSOLUTE
-
 ## Print no target if no symbol in section/absolute symbol found.
-# RUN: llvm-objcopy %t5 %t7 -N other
-# RUN: llvm-objdump -d %t7 | FileCheck %s --check-prefix=FAIL
+# RUN: llvm-objcopy %p/../ELF/Inputs/call-absolute-symbol.elf-x86_64 %t6 -N foo
+# RUN: llvm-objdump -d %t6 | FileCheck %s --check-prefix=FAIL
 
 # TARGET:   callq 0x5 <target>
-# ABSOLUTE: callq 0x5 <other+0x5>
-# FAIL:     callq 0x5{{$}}
+# ABSOLUTE: callq 0x100 <absol+0x100>
+# FAIL:     callq 0x100{{$}}
 
 --- !ELF
 FileHeader:
@@ -58,7 +53,7 @@ Sections:
 Symbols:
   - Name:    target
     Section: [[SECTION]]
-    Value:   0x5
+    Value:   [[VALUE]]
   - Name:    other
     Index:   [[INDEX]]
     Value:   0x0

From bf0a48615f43f1d41d2780e7c9c1b4b80fb9c1cd Mon Sep 17 00:00:00 2001
From: Arjun Patel <arjunpatel151002@gmail.com>
Date: Sun, 22 Jun 2025 20:42:23 -0400
Subject: [PATCH 851/851] Add tests

---
 .../tools/llvm-objdump/RISCV/riscv-sym-search.s | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 cross-project-tests/tools/llvm-objdump/RISCV/riscv-sym-search.s

diff --git a/cross-project-tests/tools/llvm-objdump/RISCV/riscv-sym-search.s b/cross-project-tests/tools/llvm-objdump/RISCV/riscv-sym-search.s
new file mode 100644
index 0000000000000..01fc4806116a4
--- /dev/null
+++ b/cross-project-tests/tools/llvm-objdump/RISCV/riscv-sym-search.s
@@ -0,0 +1,17 @@
+# RUN: %clang --target=fuchsia-elf-riscv64 -march=rv64g %s -nostdlib -o %t
+# RUN: llvm-objdump -d %t | FileCheck %s
+
+# CHECK:   auipc a0, 0x101
+# CHECK:   ld a0, 0x8(a0) <ldata+0x1000>
+.global _start
+.text
+_start:
+  la a0, gdata
+
+.skip 0x100000
+ldata:
+  .int 0
+
+.data
+gdata:
+  .int 0